1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  * Copyright 2019 Joyent, Inc.
  25  */
  26 
  27 /*
  28  * Copyright (c) 2016 by Delphix. All rights reserved.
  29  */
  30 
  31 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
  32 /*        All Rights Reserved   */
  33 
  34 /*
  35  * University Copyright- Copyright (c) 1982, 1986, 1988
  36  * The Regents of the University of California
  37  * All Rights Reserved
  38  *
  39  * University Acknowledgment- Portions of this document are derived from
  40  * software developed by the University of California, Berkeley, and its
  41  * contributors.
  42  */
  43 
  44 #include <sys/types.h>
  45 #include <sys/t_lock.h>
  46 #include <sys/sysmacros.h>
  47 #include <sys/conf.h>
  48 #include <sys/cpuvar.h>
  49 #include <sys/errno.h>
  50 #include <sys/debug.h>
  51 #include <sys/buf.h>
  52 #include <sys/var.h>
  53 #include <sys/vnode.h>
  54 #include <sys/bitmap.h>
  55 #include <sys/cmn_err.h>
  56 #include <sys/kmem.h>
  57 #include <sys/vmem.h>
  58 #include <sys/atomic.h>
  59 #include <vm/seg_kmem.h>
  60 #include <vm/page.h>
  61 #include <vm/pvn.h>
  62 #include <sys/vtrace.h>
  63 #include <sys/tnf_probe.h>
  64 #include <sys/fs/ufs_inode.h>
  65 #include <sys/fs/ufs_bio.h>
  66 #include <sys/fs/ufs_log.h>
  67 #include <sys/systm.h>
  68 #include <sys/vfs.h>
  69 #include <sys/sdt.h>
  70 
  71 /* Locks */
  72 static  kmutex_t        blist_lock;     /* protects b_list */
  73 static  kmutex_t        bhdr_lock;      /* protects the bhdrlist */
  74 static  kmutex_t        bfree_lock;     /* protects the bfreelist structure */
  75 
  76 struct hbuf     *hbuf;                  /* Hash buckets */
  77 struct dwbuf    *dwbuf;                 /* Delayed write buckets */
  78 static struct buf *bhdrlist;            /* buf header free list */
  79 static int      nbuf;                   /* number of buffer headers allocated */
  80 
  81 static int      lastindex;              /* Reference point on where to start */
  82                                         /* when looking for free buffers */
  83 
  84 #define bio_bhash(dev, bn)      (hash2ints((dev), (int)(bn)) & v.v_hmask)
  85 #define EMPTY_LIST      ((struct buf *)-1)
  86 
  87 static kcondvar_t       bio_mem_cv;     /* Condition variables */
  88 static kcondvar_t       bio_flushinval_cv;
  89 static int      bio_doingflush;         /* flush in progress */
  90 static int      bio_doinginval;         /* inval in progress */
  91 static int      bio_flinv_cv_wanted;    /* someone waiting for cv */
  92 
  93 /*
  94  * Statistics on the buffer cache
  95  */
  96 struct biostats biostats = {
  97         { "buffer_cache_lookups",               KSTAT_DATA_UINT32 },
  98         { "buffer_cache_hits",                  KSTAT_DATA_UINT32 },
  99         { "new_buffer_requests",                KSTAT_DATA_UINT32 },
 100         { "waits_for_buffer_allocs",            KSTAT_DATA_UINT32 },
 101         { "buffers_locked_by_someone",          KSTAT_DATA_UINT32 },
 102         { "duplicate_buffers_found",            KSTAT_DATA_UINT32 }
 103 };
 104 
 105 /*
 106  * kstat data
 107  */
 108 kstat_named_t   *biostats_ptr = (kstat_named_t *)&biostats;
 109 uint_t          biostats_ndata = (uint_t)(sizeof (biostats) /
 110                                         sizeof (kstat_named_t));
 111 
 112 /*
 113  * Statistics on ufs buffer cache
 114  * Not protected by locks
 115  */
 116 struct ufsbiostats ub = {
 117         { "breads",                     KSTAT_DATA_UINT32 },
 118         { "bwrites",                    KSTAT_DATA_UINT32 },
 119         { "fbiwrites",                  KSTAT_DATA_UINT32 },
 120         { "getpages",                   KSTAT_DATA_UINT32 },
 121         { "getras",                     KSTAT_DATA_UINT32 },
 122         { "putsyncs",                   KSTAT_DATA_UINT32 },
 123         { "putasyncs",                  KSTAT_DATA_UINT32 },
 124         { "putpageios",                 KSTAT_DATA_UINT32 },
 125 };
 126 
 127 /*
 128  * more UFS Logging eccentricities...
 129  *
 130  * These are required because "#pragma weak ..." doesn't work in reverse
 131  * order:  genunix (bio.c) is loaded before the ufs modules, so pointers
 132  *         to ufs routines never get plugged into bio.c's calls.  Instead,
 133  *         they are initialized when setting up the "lufsops" table
 134  *         in "lufs.c:_init()" (see the sketch below).
 135  */
 136 void (*bio_lufs_strategy)(void *, buf_t *);
 137 void (*bio_snapshot_strategy)(void *, buf_t *);
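
/*
 * Minimal sketch of the registration described above, as a logging module
 * might perform it from its _init() routine.  The strategy routine name
 * used here is a placeholder, not necessarily the identifier used by
 * lufs.c:
 *
 *	static void my_lufs_strategy(void *log, buf_t *bp);
 *
 *	int
 *	_init(void)
 *	{
 *		...
 *		bio_lufs_strategy = my_lufs_strategy;
 *		...
 *	}
 */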
 138 
 139 
 140 /* Private routines */
 141 static struct buf       *bio_getfreeblk(long);
 142 static void             bio_mem_get(long);
 143 static void             bio_bhdr_free(struct buf *);
 144 static struct buf       *bio_bhdr_alloc(void);
 145 static void             bio_recycle(int, long);
 146 static void             bio_pageio_done(struct buf *);
 147 static int              bio_incore(dev_t, daddr_t);
 148 
 149 /*
 150  * Buffer cache constants
 151  */
 152 #define BIO_BUF_PERCENT (100/2)         /* default: 2% of memory */
 153 #define BIO_MAX_PERCENT (100/20)        /* max is 20% of real memory */
 154 #define BIO_BHDR_POOL   100             /* Default bhdr pool size */
 155 #define BIO_MIN_HDR     10              /* Minimum number of buffer headers */
 156 #define BIO_MIN_HWM     (BIO_MIN_HDR * MAXBSIZE / 1024)
 157 #define BIO_HASHLEN     4               /* Target length of hash chains */
 158 
 159 
 160 /* Flags for bio_recycle() */
 161 #define BIO_HEADER      0x01
 162 #define BIO_MEM         0x02
 163 
 164 extern  int bufhwm;             /* User tunable - high water mark for mem  */
 165 extern  int bufhwm_pct;         /* ditto - given in % of physmem  */
 166 
 167 /*
 168  * The following routines allocate and free
 169  * buffers with various side effects.  In general the
 170  * arguments to an allocate routine are a device and
 171  * a block number, and the value is a pointer to
 172  * the buffer header; the buffer returned is locked with a
 173  * binary semaphore so that no one else can touch it. If the block was
 174  * already in core, no I/O need be done; if it is
 175  * already locked, the process waits until it becomes free.
 176  * The following routines allocate a buffer:
 177  *      getblk
 178  *      bread/BREAD
 179  *      breada
 180  * Eventually the buffer must be released, possibly with the
 181  * side effect of writing it out, by using one of
 182  *      bwrite/BWRITE/brwrite
 183  *      bdwrite/bdrwrite
 184  *      bawrite
 185  *      brelse
 186  *
 187  * The B_WANTED/B_BUSY bits are NOT used by these routines for synchronization.
 188  * Instead, a binary semaphore, b_sem, is used to gain exclusive access to
 189  * a buffer, and a binary semaphore, b_io, is used for I/O synchronization.
 190  * B_DONE is still used to denote a buffer with I/O complete on it.
 191  *
 192  * The bfreelist.b_bcount field is recomputed every time fsflush runs.
 193  * It should not be used where a very accurate count of the free buffers
 194  * is needed.
 195  */
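
/*
 * A minimal usage sketch of the interfaces described above (illustrative
 * only; error handling and the caller's locking context are elided):
 *
 *	struct buf *bp;
 *
 *	bp = bread(dev, blkno, bsize);		read and wait for completion
 *	if (geterror(bp) == 0) {
 *		... examine/modify bp->b_un.b_addr ...
 *		bdwrite(bp);			mark dirty, delayed write
 *	} else {
 *		brelse(bp);			just give the buffer back
 *	}
 */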
 196 
 197 /*
 198  * Read in (if necessary) the block and return a buffer pointer.
 199  *
 200  * This interface is provided for binary compatibility.  Using
 201  * BREAD() directly avoids the extra function call overhead invoked
 202  * by calling this routine.
 203  */
 204 struct buf *
 205 bread(dev_t dev, daddr_t blkno, long bsize)
 206 {
 207         return (BREAD(dev, blkno, bsize));
 208 }
 209 
 210 /*
 211  * Common code for reading a buffer with various options
 212  *
 213  * Read in (if necessary) the block and return a buffer pointer.
 214  */
 215 struct buf *
 216 bread_common(void *arg, dev_t dev, daddr_t blkno, long bsize)
 217 {
 218         struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
 219         struct buf *bp;
 220         klwp_t *lwp = ttolwp(curthread);
 221 
 222         CPU_STATS_ADD_K(sys, lread, 1);
 223         bp = getblk_common(ufsvfsp, dev, blkno, bsize, /* errflg */ 1);
 224         if (bp->b_flags & B_DONE)
 225                 return (bp);
 226         bp->b_flags |= B_READ;
 227         ASSERT(bp->b_bcount == bsize);
 228         if (ufsvfsp == NULL) {                                  /* !ufs */
 229                 (void) bdev_strategy(bp);
 230         } else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
 231                                                         /* ufs && logging */
 232                 (*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
 233         } else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
 234                                                         /* ufs && snapshots */
 235                 (*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
 236         } else {
 237                 ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
 238                 ub.ub_breads.value.ul++;                /* ufs && !logging */
 239                 (void) bdev_strategy(bp);
 240         }
 241         if (lwp != NULL)
 242                 lwp->lwp_ru.inblock++;
 243         CPU_STATS_ADD_K(sys, bread, 1);
 244         (void) biowait(bp);
 245         return (bp);
 246 }
 247 
 248 /*
 249  * Read in the block, like bread, but also start I/O on the
 250  * read-ahead block (which is not allocated to the caller).
 251  */
 252 struct buf *
 253 breada(dev_t dev, daddr_t blkno, daddr_t rablkno, long bsize)
 254 {
 255         struct buf *bp, *rabp;
 256         klwp_t *lwp = ttolwp(curthread);
 257 
 258         bp = NULL;
 259         if (!bio_incore(dev, blkno)) {
 260                 CPU_STATS_ADD_K(sys, lread, 1);
 261                 bp = GETBLK(dev, blkno, bsize);
 262                 if ((bp->b_flags & B_DONE) == 0) {
 263                         bp->b_flags |= B_READ;
 264                         bp->b_bcount = bsize;
 265                         (void) bdev_strategy(bp);
 266                         if (lwp != NULL)
 267                                 lwp->lwp_ru.inblock++;
 268                         CPU_STATS_ADD_K(sys, bread, 1);
 269                 }
 270         }
 271         if (rablkno && bfreelist.b_bcount > 1 &&
 272             !bio_incore(dev, rablkno)) {
 273                 rabp = GETBLK(dev, rablkno, bsize);
 274                 if (rabp->b_flags & B_DONE)
 275                         brelse(rabp);
 276                 else {
 277                         rabp->b_flags |= B_READ|B_ASYNC;
 278                         rabp->b_bcount = bsize;
 279                         (void) bdev_strategy(rabp);
 280                         if (lwp != NULL)
 281                                 lwp->lwp_ru.inblock++;
 282                         CPU_STATS_ADD_K(sys, bread, 1);
 283                 }
 284         }
 285         if (bp == NULL)
 286                 return (BREAD(dev, blkno, bsize));
 287         (void) biowait(bp);
 288         return (bp);
 289 }
 290 
 291 /*
 292  * Common code for writing a buffer with various options.
 293  *
 294  * force_wait  - wait for write completion regardless of B_ASYNC flag
 295  * do_relse    - release the buffer when we are done
 296  * clear_flags - flags to clear from the buffer
 297  */
 298 void
 299 bwrite_common(void *arg, struct buf *bp, int force_wait,
 300     int do_relse, int clear_flags)
 301 {
 302         register int do_wait;
 303         struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
 304         int flag;
 305         klwp_t *lwp = ttolwp(curthread);
 306         struct cpu *cpup;
 307 
 308         ASSERT(SEMA_HELD(&bp->b_sem));
 309         flag = bp->b_flags;
 310         bp->b_flags &= ~clear_flags;
 311         if (lwp != NULL)
 312                 lwp->lwp_ru.oublock++;
 313         CPU_STATS_ENTER_K();
 314         cpup = CPU;             /* get pointer AFTER preemption is disabled */
 315         CPU_STATS_ADDQ(cpup, sys, lwrite, 1);
 316         CPU_STATS_ADDQ(cpup, sys, bwrite, 1);
 317         do_wait = ((flag & B_ASYNC) == 0 || force_wait);
 318         if (do_wait == 0)
 319                 CPU_STATS_ADDQ(cpup, sys, bawrite, 1);
 320         CPU_STATS_EXIT_K();
 321         if (ufsvfsp == NULL) {
 322                 (void) bdev_strategy(bp);
 323         } else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
 324                                                         /* ufs && logging */
 325                 (*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
 326         } else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
 327                                                         /* ufs && snapshots */
 328                 (*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
 329         } else {
 330                 ub.ub_bwrites.value.ul++;               /* ufs && !logging */
 331                 (void) bdev_strategy(bp);
 332         }
 333         if (do_wait) {
 334                 (void) biowait(bp);
 335                 if (do_relse) {
 336                         brelse(bp);
 337                 }
 338         }
 339 }
 340 
 341 /*
 342  * Write the buffer, waiting for completion (unless B_ASYNC is set).
 343  * Then release the buffer.
 344  * This interface is provided for binary compatibility.  Using
 345  * BWRITE() directly avoids the extra function call overhead invoked
 346  * by calling this routine.
 347  */
 348 void
 349 bwrite(struct buf *bp)
 350 {
 351         BWRITE(bp);
 352 }
 353 
 354 /*
 355  * Write the buffer, waiting for completion.
 356  * But don't release the buffer afterwards.
 357  * This interface is provided for binary compatibility.  Using
 358  * BWRITE2() directly avoids the extra function call overhead.
 359  */
 360 void
 361 bwrite2(struct buf *bp)
 362 {
 363         BWRITE2(bp);
 364 }
 365 
 366 /*
 367  * Release the buffer, marking it so that if it is grabbed
 368  * for another purpose it will be written out before being
 369  * given up (e.g. when writing a partial block where it is
 370  * assumed that another write for the same block will soon follow).
 371  * Also save the time that the block is first marked as delayed
 372  * so that it will be written in a reasonable time.
 373  */
 374 void
 375 bdwrite(struct buf *bp)
 376 {
 377         ASSERT(SEMA_HELD(&bp->b_sem));
 378         CPU_STATS_ADD_K(sys, lwrite, 1);
 379         if ((bp->b_flags & B_DELWRI) == 0)
 380                 bp->b_start = ddi_get_lbolt();
 381         /*
 382          * B_DONE allows others to use the buffer, B_DELWRI causes the
 383          * buffer to be written before being reused, and setting b_resid
 384          * to zero says the buffer is complete.
 385          */
 386         bp->b_flags |= B_DELWRI | B_DONE;
 387         bp->b_resid = 0;
 388         brelse(bp);
 389 }
 390 
 391 /*
 392  * Release the buffer, start I/O on it, but don't wait for completion.
 393  */
 394 void
 395 bawrite(struct buf *bp)
 396 {
 397         ASSERT(SEMA_HELD(&bp->b_sem));
 398 
 399         /* Use bfreelist.b_bcount as a weird-ass heuristic */
 400         if (bfreelist.b_bcount > 4)
 401                 bp->b_flags |= B_ASYNC;
 402         BWRITE(bp);
 403 }
 404 
 405 /*
 406  * Release the buffer, with no I/O implied.
 407  */
 408 void
 409 brelse(struct buf *bp)
 410 {
 411         struct buf      **backp;
 412         uint_t          index;
 413         kmutex_t        *hmp;
 414         struct  buf     *dp;
 415         struct  hbuf    *hp;
 416 
 417 
 418         ASSERT(SEMA_HELD(&bp->b_sem));
 419 
 420         /*
 421          * Clear the retry write flag if the buffer was written without
 422          * error.  The presence of B_DELWRI means the buffer has not yet
 423          * been written and the presence of B_ERROR means that an error
 424          * is still occurring.
 425          */
 426         if ((bp->b_flags & (B_ERROR | B_DELWRI | B_RETRYWRI)) == B_RETRYWRI) {
 427                 bp->b_flags &= ~B_RETRYWRI;
 428         }
 429 
 430         /* Check for anomalous conditions */
 431         if (bp->b_flags & (B_ERROR|B_NOCACHE)) {
 432                 if (bp->b_flags & B_NOCACHE) {
 433                         /* Don't add to the freelist. Destroy it now */
 434                         kmem_free(bp->b_un.b_addr, bp->b_bufsize);
 435                         sema_destroy(&bp->b_sem);
 436                         sema_destroy(&bp->b_io);
 437                         kmem_free(bp, sizeof (struct buf));
 438                         return;
 439                 }
 440                 /*
 441                  * If a write failed and we are supposed to retry write,
 442                  * don't toss the buffer.  Keep it around and mark it
 443                  * delayed write in the hopes that it will eventually
 444                  * get flushed (and still keep the system running.)
 445                  */
 446                 if ((bp->b_flags & (B_READ | B_RETRYWRI)) == B_RETRYWRI) {
 447                         bp->b_flags |= B_DELWRI;
 448                         /* keep fsflush from trying continuously to flush */
 449                         bp->b_start = ddi_get_lbolt();
 450                 } else
 451                         bp->b_flags |= B_AGE|B_STALE;
 452                 bp->b_flags &= ~B_ERROR;
 453                 bp->b_error = 0;
 454         }
 455 
 456         /*
 457          * If delayed write is set then put it on the delayed
 458          * write list instead of the free buffer list.
 459          */
 460         index = bio_bhash(bp->b_edev, bp->b_blkno);
 461         hmp   = &hbuf[index].b_lock;
 462 
 463         mutex_enter(hmp);
 464         hp = &hbuf[index];
 465         dp = (struct buf *)hp;
 466 
 467         /*
 468          * Make sure that the number of entries on this list is
 469          * within 0 <= count <= total # buffers
 470          */
 471         ASSERT(hp->b_length >= 0);
 472         ASSERT(hp->b_length < nbuf);
 473 
 474         hp->b_length++;              /* We are adding this buffer */
 475 
 476         if (bp->b_flags & B_DELWRI) {
 477                 /*
 478                  * This buffer goes on the delayed write buffer list
 479                  */
 480                 dp = (struct buf *)&dwbuf[index];
 481         }
 482         ASSERT(bp->b_bufsize > 0);
 483         ASSERT(bp->b_bcount > 0);
 484         ASSERT(bp->b_un.b_addr != NULL);
 485 
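        /*
         * B_AGE buffers are inserted at the head of the chosen free list,
         * where the allocation scans begin, so they are reclaimed first;
         * all other buffers go at the tail and so stay cached longer.
         */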
 486         if (bp->b_flags & B_AGE) {
 487                 backp = &dp->av_forw;
 488                 (*backp)->av_back = bp;
 489                 bp->av_forw = *backp;
 490                 *backp = bp;
 491                 bp->av_back = dp;
 492         } else {
 493                 backp = &dp->av_back;
 494                 (*backp)->av_forw = bp;
 495                 bp->av_back = *backp;
 496                 *backp = bp;
 497                 bp->av_forw = dp;
 498         }
 499         mutex_exit(hmp);
 500 
 501         if (bfreelist.b_flags & B_WANTED) {
 502                 /*
 503                  * We should get here only very rarely.
 504                  */
 505                 mutex_enter(&bfree_lock);
 506                 if (bfreelist.b_flags & B_WANTED) {
 507                         bfreelist.b_flags &= ~B_WANTED;
 508                         cv_broadcast(&bio_mem_cv);
 509                 }
 510                 mutex_exit(&bfree_lock);
 511         }
 512 
 513         bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC);
 514         /*
 515          * Don't let anyone get the buffer off the freelist before we
 516          * release our hold on it.
 517          */
 518         sema_v(&bp->b_sem);
 519 }
 520 
 521 /*
 522  * Return a count of the number of B_BUSY buffers in the system
 523  * Can only be used as a good estimate.  If 'cleanit' is set,
 524  * try to flush all bufs.
 525  */
 526 int
 527 bio_busy(int cleanit)
 528 {
 529         struct buf *bp, *dp;
 530         int busy = 0;
 531         int i;
 532         kmutex_t *hmp;
 533 
 534         for (i = 0; i < v.v_hbuf; i++) {
 535                 dp = (struct buf *)&hbuf[i];
 536                 hmp = &hbuf[i].b_lock;
 537 
 538                 mutex_enter(hmp);
 539                 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
 540                         if (bp->b_flags & B_BUSY)
 541                                 busy++;
 542                 }
 543                 mutex_exit(hmp);
 544         }
 545 
 546         if (cleanit && busy != 0) {
 547                 bflush(NODEV);
 548         }
 549 
 550         return (busy);
 551 }
 552 
 553 /*
 554  * This interface is provided for binary compatibility.
 555  *
 556  * Assign a buffer for the given block.  If the appropriate
 557  * block is already associated, return it; otherwise search
 558  * for the oldest non-busy buffer and reassign it.
 559  */
 560 struct buf *
 561 getblk(dev_t dev, daddr_t blkno, long bsize)
 562 {
 563         return (getblk_common(/* ufsvfsp */ NULL, dev,
 564             blkno, bsize, /* errflg */ 0));
 565 }
 566 
 567 /*
 568  * Assign a buffer for the given block.  If the appropriate
 569  * block is already associated, return it; otherwise search
 570  * for the oldest non-busy buffer and reassign it.
 571  */
 572 struct buf *
 573 getblk_common(void * arg, dev_t dev, daddr_t blkno, long bsize, int errflg)
 574 {
 575         ufsvfs_t *ufsvfsp = (struct ufsvfs *)arg;
 576         struct buf *bp;
 577         struct buf *dp;
 578         struct buf *nbp = NULL;
 579         struct buf *errbp;
 580         uint_t          index;
 581         kmutex_t        *hmp;
 582         struct  hbuf    *hp;
 583 
 584         if (getmajor(dev) >= devcnt)
 585                 cmn_err(CE_PANIC, "blkdev");
 586 
 587         biostats.bio_lookup.value.ui32++;
 588 
 589         index = bio_bhash(dev, blkno);
 590         hp    = &hbuf[index];
 591         dp    = (struct buf *)hp;
 592         hmp   = &hp->b_lock;
 593 
 594         mutex_enter(hmp);
 595 loop:
 596         for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
 597                 if (bp->b_blkno != blkno || bp->b_edev != dev ||
 598                     (bp->b_flags & B_STALE))
 599                         continue;
 600                 /*
 601                  * Avoid holding the hash lock in the event that
 602                  * the buffer is locked by someone. Since the hash chain
 603                  * may change when we drop the hash lock
 604                  * we have to start at the beginning of the chain if the
 605                  * buffer identity/contents aren't valid.
 606                  */
 607                 if (!sema_tryp(&bp->b_sem)) {
 608                         biostats.bio_bufbusy.value.ui32++;
 609                         mutex_exit(hmp);
 610                         /*
 611                          * OK, we are dealing with a busy buffer.
 612                          * In the case that we are panicking and we
 613                          * got called from bread(), we have some chance
 614                          * for error recovery. So better bail out from
 615                          * here since sema_p() won't block. If we got
 616                          * called directly from ufs routines, there is
 617                          * no way to report an error yet.
 618                          */
 619                         if (panicstr && errflg)
 620                                 goto errout;
 621                         /*
 622                          * For the following line of code to work
 623                          * correctly never kmem_free the buffer "header".
 624                          */
 625                         sema_p(&bp->b_sem);
 626                         if (bp->b_blkno != blkno || bp->b_edev != dev ||
 627                             (bp->b_flags & B_STALE)) {
 628                                 sema_v(&bp->b_sem);
 629                                 mutex_enter(hmp);
 630                                 goto loop;      /* start over */
 631                         }
 632                         mutex_enter(hmp);
 633                 }
 634                 /* Found */
 635                 biostats.bio_hit.value.ui32++;
 636                 bp->b_flags &= ~B_AGE;
 637 
 638                 /*
 639                  * Yank it off the free/delayed write lists
 640                  */
 641                 hp->b_length--;
 642                 notavail(bp);
 643                 mutex_exit(hmp);
 644 
 645                 ASSERT((bp->b_flags & B_NOCACHE) == 0);
 646 
 647                 if (nbp == NULL) {
 648                         /*
 649                          * Make the common path short.
 650                          */
 651                         ASSERT(SEMA_HELD(&bp->b_sem));
 652                         return (bp);
 653                 }
 654 
 655                 biostats.bio_bufdup.value.ui32++;
 656 
 657                 /*
 658                  * The buffer must have entered during the lock upgrade
 659                  * so free the new buffer we allocated and return the
 660                  * found buffer.
 661                  */
 662                 kmem_free(nbp->b_un.b_addr, nbp->b_bufsize);
 663                 nbp->b_un.b_addr = NULL;
 664 
 665                 /*
 666                  * Account for the memory
 667                  */
 668                 mutex_enter(&bfree_lock);
 669                 bfreelist.b_bufsize += nbp->b_bufsize;
 670                 mutex_exit(&bfree_lock);
 671 
 672                 /*
 673                  * Destroy buf identity, and place on avail list
 674                  */
 675                 nbp->b_dev = (o_dev_t)NODEV;
 676                 nbp->b_edev = NODEV;
 677                 nbp->b_flags = 0;
 678                 nbp->b_file = NULL;
 679                 nbp->b_offset = -1;
 680 
 681                 sema_v(&nbp->b_sem);
 682                 bio_bhdr_free(nbp);
 683 
 684                 ASSERT(SEMA_HELD(&bp->b_sem));
 685                 return (bp);
 686         }
 687 
 688         /*
 689          * bio_getfreeblk may block so check the hash chain again.
 690          */
 691         if (nbp == NULL) {
 692                 mutex_exit(hmp);
 693                 nbp = bio_getfreeblk(bsize);
 694                 mutex_enter(hmp);
 695                 goto loop;
 696         }
 697 
 698         /*
 699          * New buffer. Assign nbp and stick it on the hash.
 700          */
 701         nbp->b_flags = B_BUSY;
 702         nbp->b_edev = dev;
 703         nbp->b_dev = (o_dev_t)cmpdev(dev);
 704         nbp->b_blkno = blkno;
 705         nbp->b_iodone = NULL;
 706         nbp->b_bcount = bsize;
 707         /*
 708          * If we are given a ufsvfsp and the vfs_root field is NULL
 709          * then this must be I/O for a superblock.  A superblock's
 710          * buffer is set up in mountfs() and there is no root vnode
 711          * at that point.
 712          */
 713         if (ufsvfsp && ufsvfsp->vfs_root) {
 714                 nbp->b_vp = ufsvfsp->vfs_root;
 715         } else {
 716                 nbp->b_vp = NULL;
 717         }
 718 
 719         ASSERT((nbp->b_flags & B_NOCACHE) == 0);
 720 
 721         binshash(nbp, dp);
 722         mutex_exit(hmp);
 723 
 724         ASSERT(SEMA_HELD(&nbp->b_sem));
 725 
 726         return (nbp);
 727 
 728 
 729         /*
 730          * Come here in case of an internal error. At this point we couldn't
 731          * get a buffer, but we have to return one. Hence we allocate some
 732          * kind of error reply buffer on the fly. This buffer is marked as
 733          * B_NOCACHE | B_AGE | B_ERROR | B_DONE to assure the following:
 734          *      - B_ERROR will indicate error to the caller.
 735          *      - B_DONE will prevent us from reading the buffer from
 736          *        the device.
 737          *      - B_NOCACHE will cause this buffer to be freed in
 738          *        brelse().
 739          */
 740 
 741 errout:
 742         errbp = geteblk();
 743         sema_p(&errbp->b_sem);
 744         errbp->b_flags &= ~B_BUSY;
 745         errbp->b_flags |= (B_ERROR | B_DONE);
 746         return (errbp);
 747 }
 748 
 749 /*
 750  * Get an empty block, not assigned to any particular device.
 751  * Returns a locked buffer that is not on any hash or free list.
 752  */
 753 struct buf *
 754 ngeteblk(long bsize)
 755 {
 756         struct buf *bp;
 757 
 758         bp = kmem_alloc(sizeof (struct buf), KM_SLEEP);
 759         bioinit(bp);
 760         bp->av_forw = bp->av_back = NULL;
 761         bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
 762         bp->b_bufsize = bsize;
 763         bp->b_flags = B_BUSY | B_NOCACHE | B_AGE;
 764         bp->b_dev = (o_dev_t)NODEV;
 765         bp->b_edev = NODEV;
 766         bp->b_lblkno = 0;
 767         bp->b_bcount = bsize;
 768         bp->b_iodone = NULL;
 769         return (bp);
 770 }
 771 
 772 /*
 773  * The interface of geteblk() is kept intact to maintain driver
 774  * compatibility.  Use ngeteblk() to allocate a block size other than 1 KB.
 775  */
 776 struct buf *
 777 geteblk(void)
 778 {
 779         return (ngeteblk((long)1024));
 780 }
 781 
 782 /*
 783  * Return a buffer w/o sleeping
 784  */
 785 struct buf *
 786 trygetblk(dev_t dev, daddr_t blkno)
 787 {
 788         struct buf      *bp;
 789         struct buf      *dp;
 790         struct hbuf     *hp;
 791         kmutex_t        *hmp;
 792         uint_t          index;
 793 
 794         index = bio_bhash(dev, blkno);
 795         hp = &hbuf[index];
 796         hmp = &hp->b_lock;
 797 
 798         if (!mutex_tryenter(hmp))
 799                 return (NULL);
 800 
 801         dp = (struct buf *)hp;
 802         for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
 803                 if (bp->b_blkno != blkno || bp->b_edev != dev ||
 804                     (bp->b_flags & B_STALE))
 805                         continue;
 806                 /*
 807                  * Get access to a valid buffer without sleeping
 808                  */
 809                 if (sema_tryp(&bp->b_sem)) {
 810                         if (bp->b_flags & B_DONE) {
 811                                 hp->b_length--;
 812                                 notavail(bp);
 813                                 mutex_exit(hmp);
 814                                 return (bp);
 815                         } else {
 816                                 sema_v(&bp->b_sem);
 817                                 break;
 818                         }
 819                 }
 820                 break;
 821         }
 822         mutex_exit(hmp);
 823         return (NULL);
 824 }
 825 
 826 /*
 827  * Wait for I/O completion on the buffer; return errors
 828  * to the user.
 829  */
 830 int
 831 iowait(struct buf *bp)
 832 {
 833         ASSERT(SEMA_HELD(&bp->b_sem));
 834         return (biowait(bp));
 835 }
 836 
 837 /*
 838  * Mark I/O complete on a buffer, release it if I/O is asynchronous,
 839  * and wake up anyone waiting for it.
 840  */
 841 void
 842 iodone(struct buf *bp)
 843 {
 844         ASSERT(SEMA_HELD(&bp->b_sem));
 845         (void) biodone(bp);
 846 }
 847 
 848 /*
 849  * Zero the core associated with a buffer.
 850  */
 851 void
 852 clrbuf(struct buf *bp)
 853 {
 854         ASSERT(SEMA_HELD(&bp->b_sem));
 855         bzero(bp->b_un.b_addr, bp->b_bcount);
 856         bp->b_resid = 0;
 857 }
 858 
 859 
 860 /*
 861  * Make sure all write-behind blocks on dev (or NODEV for all)
 862  * are flushed out.
 863  */
 864 void
 865 bflush(dev_t dev)
 866 {
 867         struct buf *bp, *dp;
 868         struct hbuf *hp;
 869         struct buf *delwri_list = EMPTY_LIST;
 870         int i, index;
 871         kmutex_t *hmp;
 872 
 873         mutex_enter(&blist_lock);
 874         /*
 875          * Wait for any invalidates or flushes ahead of us to finish.
 876          * We really could split blist_lock up per device for better
 877          * parallelism here.
 878          */
 879         while (bio_doinginval || bio_doingflush) {
 880                 bio_flinv_cv_wanted = 1;
 881                 cv_wait(&bio_flushinval_cv, &blist_lock);
 882         }
 883         bio_doingflush++;
 884         /*
 885          * Gather all B_DELWRI buffers for the device.
 886          * Lock ordering is b_sem > hash lock (brelse).
 887          * Since we are finding the buffers via the delayed write list,
 888          * they may be busy and we would block trying to get the
 889          * b_sem lock while holding the hash lock. So transfer all the
 890          * candidates onto delwri_list and then drop the hash locks.
 891          */
 892         for (i = 0; i < v.v_hbuf; i++) {
 893                 hmp = &hbuf[i].b_lock;
 894                 dp = (struct buf *)&dwbuf[i];
 895                 mutex_enter(hmp);
 896                 for (bp = dp->av_forw; bp != dp; bp = bp->av_forw) {
 897                         if (dev == NODEV || bp->b_edev == dev) {
 898                                 if (bp->b_list == NULL) {
 899                                         bp->b_list = delwri_list;
 900                                         delwri_list = bp;
 901                                 }
 902                         }
 903                 }
 904                 mutex_exit(hmp);
 905         }
 906         mutex_exit(&blist_lock);
 907 
 908         /*
 909          * Now that the hash locks have been dropped grab the semaphores
 910          * and write back all the buffers that have B_DELWRI set.
 911          */
 912         while (delwri_list != EMPTY_LIST) {
 913                 bp = delwri_list;
 914 
 915                 sema_p(&bp->b_sem);      /* may block */
 916                 if ((dev != bp->b_edev && dev != NODEV) ||
 917                     (panicstr && bp->b_flags & B_BUSY)) {
 918                         sema_v(&bp->b_sem);
 919                         delwri_list = bp->b_list;
 920                         bp->b_list = NULL;
 921                         continue;       /* No longer a candidate */
 922                 }
 923                 if (bp->b_flags & B_DELWRI) {
 924                         index = bio_bhash(bp->b_edev, bp->b_blkno);
 925                         hp = &hbuf[index];
 926                         hmp = &hp->b_lock;
 927                         dp = (struct buf *)hp;
 928 
 929                         bp->b_flags |= B_ASYNC;
 930                         mutex_enter(hmp);
 931                         hp->b_length--;
 932                         notavail(bp);
 933                         mutex_exit(hmp);
 934                         if (bp->b_vp == NULL) {              /* !ufs */
 935                                 BWRITE(bp);
 936                         } else {                        /* ufs */
 937                                 UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
 938                         }
 939                 } else {
 940                         sema_v(&bp->b_sem);
 941                 }
 942                 delwri_list = bp->b_list;
 943                 bp->b_list = NULL;
 944         }
 945         mutex_enter(&blist_lock);
 946         bio_doingflush--;
 947         if (bio_flinv_cv_wanted) {
 948                 bio_flinv_cv_wanted = 0;
 949                 cv_broadcast(&bio_flushinval_cv);
 950         }
 951         mutex_exit(&blist_lock);
 952 }
 953 
 954 /*
 955  * Ensure that a specified block is up-to-date on disk.
 956  */
 957 void
 958 blkflush(dev_t dev, daddr_t blkno)
 959 {
 960         struct buf *bp, *dp;
 961         struct hbuf *hp;
 962         struct buf *sbp = NULL;
 963         uint_t index;
 964         kmutex_t *hmp;
 965 
 966         index = bio_bhash(dev, blkno);
 967         hp    = &hbuf[index];
 968         dp    = (struct buf *)hp;
 969         hmp   = &hp->b_lock;
 970 
 971         /*
 972          * Identify the buffer in the cache belonging to
 973          * this device and blkno (if any).
 974          */
 975         mutex_enter(hmp);
 976         for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
 977                 if (bp->b_blkno != blkno || bp->b_edev != dev ||
 978                     (bp->b_flags & B_STALE))
 979                         continue;
 980                 sbp = bp;
 981                 break;
 982         }
 983         mutex_exit(hmp);
 984         if (sbp == NULL)
 985                 return;
 986         /*
 987          * Now check the buffer we have identified and
 988          * make sure it still belongs to the device and is B_DELWRI
 989          */
 990         sema_p(&sbp->b_sem);
 991         if (sbp->b_blkno == blkno && sbp->b_edev == dev &&
 992             (sbp->b_flags & (B_DELWRI|B_STALE)) == B_DELWRI) {
 993                 mutex_enter(hmp);
 994                 hp->b_length--;
 995                 notavail(sbp);
 996                 mutex_exit(hmp);
 997                 /*
 998                  * XXX - There is nothing to guarantee a synchronous
 999                  * write here if the B_ASYNC flag is set.  This needs
1000                  * some investigation.
1001                  */
1002                 if (sbp->b_vp == NULL) {             /* !ufs */
1003                         BWRITE(sbp);    /* synchronous write */
1004                 } else {                                /* ufs */
1005                         UFS_BWRITE(VTOI(sbp->b_vp)->i_ufsvfs, sbp);
1006                 }
1007         } else {
1008                 sema_v(&sbp->b_sem);
1009         }
1010 }
1011 
1012 /*
1013  * Same as binval, except it can force-invalidate delayed-write buffers
1014  * (which may not have been flushed already because of device errors).
1015  * Also makes sure that the retry write flag is cleared.
1016  */
1017 int
1018 bfinval(dev_t dev, int force)
1019 {
1020         struct buf *dp;
1021         struct buf *bp;
1022         struct buf *binval_list = EMPTY_LIST;
1023         int i, error = 0;
1024         kmutex_t *hmp;
1025         uint_t index;
1026         struct buf **backp;
1027 
1028         mutex_enter(&blist_lock);
1029         /*
1030          * Wait for any flushes ahead of us to finish; it's OK to
1031          * do invalidates in parallel.
1032          */
1033         while (bio_doingflush) {
1034                 bio_flinv_cv_wanted = 1;
1035                 cv_wait(&bio_flushinval_cv, &blist_lock);
1036         }
1037         bio_doinginval++;
1038 
1039         /* Gather bp's */
1040         for (i = 0; i < v.v_hbuf; i++) {
1041                 dp = (struct buf *)&hbuf[i];
1042                 hmp = &hbuf[i].b_lock;
1043 
1044                 mutex_enter(hmp);
1045                 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1046                         if (bp->b_edev == dev) {
1047                                 if (bp->b_list == NULL) {
1048                                         bp->b_list = binval_list;
1049                                         binval_list = bp;
1050                                 }
1051                         }
1052                 }
1053                 mutex_exit(hmp);
1054         }
1055         mutex_exit(&blist_lock);
1056 
1057         /* Invalidate all bp's found */
1058         while (binval_list != EMPTY_LIST) {
1059                 bp = binval_list;
1060 
1061                 sema_p(&bp->b_sem);
1062                 if (bp->b_edev == dev) {
1063                         if (force && (bp->b_flags & B_DELWRI)) {
1064                                 /* clear B_DELWRI, move to non-dw freelist */
1065                                 index = bio_bhash(bp->b_edev, bp->b_blkno);
1066                                 hmp = &hbuf[index].b_lock;
1067                                 dp = (struct buf *)&hbuf[index];
1068                                 mutex_enter(hmp);
1069 
1070                                 /* remove from delayed write freelist */
1071                                 notavail(bp);
1072 
1073                                 /* add to B_AGE side of non-dw freelist */
1074                                 backp = &dp->av_forw;
1075                                 (*backp)->av_back = bp;
1076                                 bp->av_forw = *backp;
1077                                 *backp = bp;
1078                                 bp->av_back = dp;
1079 
1080                                 /*
1081                                  * make sure write retries and busy are cleared
1082                                  */
1083                                 bp->b_flags &=
1084                                     ~(B_BUSY | B_DELWRI | B_RETRYWRI);
1085                                 mutex_exit(hmp);
1086                         }
1087                         if ((bp->b_flags & B_DELWRI) == 0)
1088                                 bp->b_flags |= B_STALE|B_AGE;
1089                         else
1090                                 error = EIO;
1091                 }
1092                 sema_v(&bp->b_sem);
1093                 binval_list = bp->b_list;
1094                 bp->b_list = NULL;
1095         }
1096         mutex_enter(&blist_lock);
1097         bio_doinginval--;
1098         if (bio_flinv_cv_wanted) {
1099                 cv_broadcast(&bio_flushinval_cv);
1100                 bio_flinv_cv_wanted = 0;
1101         }
1102         mutex_exit(&blist_lock);
1103         return (error);
1104 }
1105 
1106 /*
1107  * If possible, invalidate blocks for a dev on demand
1108  */
1109 void
1110 binval(dev_t dev)
1111 {
1112         (void) bfinval(dev, 0);
1113 }
1114 
1115 /*
1116  * Initialize the buffer I/O system by freeing
1117  * all buffers and setting all device hash buffer lists to empty.
1118  */
1119 void
1120 binit(void)
1121 {
1122         struct buf *bp;
1123         unsigned int i, pct;
1124         ulong_t bio_max_hwm, bio_default_hwm;
1125 
1126         /*
1127          * Maximum/Default values for bufhwm are set to the smallest of:
1128          *      - BIO_MAX_PERCENT resp. BIO_BUF_PERCENT of real memory
1129          *      - 1/4 of kernel virtual memory
1130          *      - INT32_MAX to prevent overflows of v.v_bufhwm (which is int).
1131          * Additionally, in order to allow simple tuning by percentage of
1132          * physical memory, bufhwm_pct is used to calculate the default if
1133          * the value of this tunable is between 0 and BIO_MAX_PERCENT.
1134          *
1135          * Since the unit for v.v_bufhwm is kilobytes, this allows for
1136          * a maximum of 1024 * 2GB == 2TB memory usage by buffer headers.
1137          */
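        /*
         * Worked example (assuming 8 GB of physical memory and 4 KB pages,
         * and ignoring the kernel-heap and INT32_MAX caps): physmem is
         * 2097152 pages, so the BIO_BUF_PERCENT default works out to
         * (2097152 / 50) * (4096 / 1024) ~= 167772 KB for v.v_bufhwm,
         * i.e. roughly 2% of physical memory.
         */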
1138         bio_max_hwm = MIN(physmem / BIO_MAX_PERCENT,
1139             btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
1140         bio_max_hwm = MIN(INT32_MAX, bio_max_hwm);
1141 
1142         pct = BIO_BUF_PERCENT;
1143         if (bufhwm_pct != 0 &&
1144             ((pct = 100 / bufhwm_pct) < BIO_MAX_PERCENT)) {
1145                 pct = BIO_BUF_PERCENT;
1146                 /*
1147                  * Invalid user specified value, emit a warning.
1148                  */
1149                 cmn_err(CE_WARN, "binit: bufhwm_pct(%d) out of "
1150                     "range(1..%d). Using %d as default.",
1151                     bufhwm_pct,
1152                     100 / BIO_MAX_PERCENT, 100 / BIO_BUF_PERCENT);
1153         }
1154 
1155         bio_default_hwm = MIN(physmem / pct,
1156             btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
1157         bio_default_hwm = MIN(INT32_MAX, bio_default_hwm);
1158 
1159         if ((v.v_bufhwm = bufhwm) == 0)
1160                 v.v_bufhwm = bio_default_hwm;
1161 
1162         if (v.v_bufhwm < BIO_MIN_HWM || v.v_bufhwm > bio_max_hwm) {
1163                 v.v_bufhwm = (int)bio_max_hwm;
1164                 /*
1165                  * Invalid user specified value, emit a warning.
1166                  */
1167                 cmn_err(CE_WARN,
1168                     "binit: bufhwm(%d) out "
1169                     "of range(%d..%lu). Using %lu as default",
1170                     bufhwm,
1171                     BIO_MIN_HWM, bio_max_hwm, bio_max_hwm);
1172         }
1173 
1174         /*
1175          * Determine the number of hash buckets. Default is to
1176          * create ~BIO_HASHLEN entries per chain based on MAXBSIZE buffers.
1177          * Round up number to the next power of 2.
1178          */
1179         v.v_hbuf = 1 << highbit((((ulong_t)v.v_bufhwm * 1024) / MAXBSIZE) /
1180             BIO_HASHLEN);
1181         v.v_hmask = v.v_hbuf - 1;
1182         v.v_buf = BIO_BHDR_POOL;
1183 
1184         hbuf = kmem_zalloc(v.v_hbuf * sizeof (struct hbuf), KM_SLEEP);
1185 
1186         dwbuf = kmem_zalloc(v.v_hbuf * sizeof (struct dwbuf), KM_SLEEP);
1187 
1188         bfreelist.b_bufsize = (size_t)v.v_bufhwm * 1024;
1189         bp = &bfreelist;
1190         bp->b_forw = bp->b_back = bp->av_forw = bp->av_back = bp;
1191 
1192         for (i = 0; i < v.v_hbuf; i++) {
1193                 hbuf[i].b_forw = hbuf[i].b_back = (struct buf *)&hbuf[i];
1194                 hbuf[i].av_forw = hbuf[i].av_back = (struct buf *)&hbuf[i];
1195 
1196                 /*
1197                  * Initialize the delayed write buffer list.
1198                  */
1199                 dwbuf[i].b_forw = dwbuf[i].b_back = (struct buf *)&dwbuf[i];
1200                 dwbuf[i].av_forw = dwbuf[i].av_back = (struct buf *)&dwbuf[i];
1201         }
1202 }
1203 
1204 /*
1205  * Wait for I/O completion on the buffer; return error code.
1206  * If bp was for synchronous I/O, bp is invalid and associated
1207  * resources are freed on return.
1208  */
1209 int
1210 biowait(struct buf *bp)
1211 {
1212         int error = 0;
1213         struct cpu *cpup;
1214 
1215         ASSERT(SEMA_HELD(&bp->b_sem));
1216 
1217         cpup = CPU;
1218         atomic_inc_64(&cpup->cpu_stats.sys.iowait);
1219         DTRACE_IO1(wait__start, struct buf *, bp);
1220 
1221         /*
1222          * In case of panic, busy wait for completion
1223          */
1224         if (panicstr) {
1225                 while ((bp->b_flags & B_DONE) == 0)
1226                         drv_usecwait(10);
1227         } else
1228                 sema_p(&bp->b_io);
1229 
1230         DTRACE_IO1(wait__done, struct buf *, bp);
1231         atomic_dec_64(&cpup->cpu_stats.sys.iowait);
1232 
1233         error = geterror(bp);
1234         if ((bp->b_flags & B_ASYNC) == 0) {
1235                 if (bp->b_flags & B_REMAPPED)
1236                         bp_mapout(bp);
1237         }
1238         return (error);
1239 }
1240 
1241 static void
1242 biodone_tnf_probe(struct buf *bp)
1243 {
1244         /* Kernel probe */
1245         TNF_PROBE_3(biodone, "io blockio", /* CSTYLED */,
1246             tnf_device,         device,         bp->b_edev,
1247             tnf_diskaddr,       block,          bp->b_lblkno,
1248             tnf_opaque,         buf,            bp);
1249 }
1250 
1251 /*
1252  * Mark I/O complete on a buffer, release it if I/O is asynchronous,
1253  * and wake up anyone waiting for it.
1254  */
1255 void
1256 biodone(struct buf *bp)
1257 {
1258         if (bp->b_flags & B_STARTED) {
1259                 DTRACE_IO1(done, struct buf *, bp);
1260                 bp->b_flags &= ~B_STARTED;
1261         }
1262 
1263         /*
1264          * Call the TNF probe here instead of the inline code
1265          * to force our compiler to use the tail call optimization.
1266          */
1267         biodone_tnf_probe(bp);
1268 
1269         if (bp->b_iodone != NULL) {
1270                 (*(bp->b_iodone))(bp);
1271                 return;
1272         }
1273         ASSERT((bp->b_flags & B_DONE) == 0);
1274         ASSERT(SEMA_HELD(&bp->b_sem));
1275         bp->b_flags |= B_DONE;
1276         if (bp->b_flags & B_ASYNC) {
1277                 if (bp->b_flags & (B_PAGEIO|B_REMAPPED))
1278                         bio_pageio_done(bp);
1279                 else
1280                         brelse(bp);     /* release bp to freelist */
1281         } else {
1282                 sema_v(&bp->b_io);
1283         }
1284 }
1285 
1286 /*
1287  * Pick up the device's error number and pass it to the user;
1288  * if there is an error but the number is 0 set a generalized code.
1289  */
1290 int
1291 geterror(struct buf *bp)
1292 {
1293         int error = 0;
1294 
1295         ASSERT(SEMA_HELD(&bp->b_sem));
1296         if (bp->b_flags & B_ERROR) {
1297                 error = bp->b_error;
1298                 if (!error)
1299                         error = EIO;
1300         }
1301         return (error);
1302 }
1303 
1304 /*
1305  * Support for pageio buffers.
1306  *
1307  * This stuff should be generalized to provide a common bp
1308  * header facility that can be used for things other than pageio.
1309  */
1310 
1311 /*
1312  * Allocate and initialize a buf struct for use with pageio.
1313  */
1314 struct buf *
1315 pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags)
1316 {
1317         struct buf *bp;
1318         struct cpu *cpup;
1319 
1320         if (flags & B_READ) {
1321                 CPU_STATS_ENTER_K();
1322                 cpup = CPU;     /* get pointer AFTER preemption is disabled */
1323                 CPU_STATS_ADDQ(cpup, vm, pgin, 1);
1324                 CPU_STATS_ADDQ(cpup, vm, pgpgin, btopr(len));
1325 
1326                 atomic_add_64(&curzone->zone_pgpgin, btopr(len));
1327 
1328                 if ((flags & B_ASYNC) == 0) {
1329                         klwp_t *lwp = ttolwp(curthread);
1330                         if (lwp != NULL)
1331                                 lwp->lwp_ru.majflt++;
1332                         CPU_STATS_ADDQ(cpup, vm, maj_fault, 1);
1333                         /* Kernel probe */
1334                         TNF_PROBE_2(major_fault, "vm pagefault", /* CSTYLED */,
1335                             tnf_opaque,         vnode,          pp->p_vnode,
1336                             tnf_offset,         offset,         pp->p_offset);
1337                 }
1338                 /*
1339                  * Update statistics for pages being paged in
1340                  */
1341                 if (pp != NULL && pp->p_vnode != NULL) {
1342                         if (IS_SWAPFSVP(pp->p_vnode)) {
1343                                 CPU_STATS_ADDQ(cpup, vm, anonpgin, btopr(len));
1344                                 atomic_add_64(&curzone->zone_anonpgin,
1345                                     btopr(len));
1346                         } else {
1347                                 if (pp->p_vnode->v_flag & VVMEXEC) {
1348                                         CPU_STATS_ADDQ(cpup, vm, execpgin,
1349                                             btopr(len));
1350                                         atomic_add_64(&curzone->zone_execpgin,
1351                                             btopr(len));
1352                                 } else {
1353                                         CPU_STATS_ADDQ(cpup, vm, fspgin,
1354                                             btopr(len));
1355                                         atomic_add_64(&curzone->zone_fspgin,
1356                                             btopr(len));
1357                                 }
1358                         }
1359                 }
1360                 CPU_STATS_EXIT_K();
1361                 TRACE_1(TR_FAC_VM, TR_PAGE_WS_IN,
1362                     "page_ws_in:pp %p", pp);
1363                 /* Kernel probe */
1364                 TNF_PROBE_3(pagein, "vm pageio io", /* CSTYLED */,
1365                     tnf_opaque, vnode,  pp->p_vnode,
1366                     tnf_offset, offset, pp->p_offset,
1367                     tnf_size,   size,   len);
1368         }
1369 
1370         bp = kmem_zalloc(sizeof (struct buf), KM_SLEEP);
1371         bp->b_bcount = len;
1372         bp->b_bufsize = len;
1373         bp->b_pages = pp;
1374         bp->b_flags = B_PAGEIO | B_NOCACHE | B_BUSY | flags;
1375         bp->b_offset = -1;
1376         sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
1377 
1378         /* Initialize bp->b_sem in "locked" state */
1379         sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
1380 
1381         VN_HOLD(vp);
1382         bp->b_vp = vp;
1383 
1384         /*
1385          * Caller sets dev & blkno and can adjust
1386          * b_addr for page offset and can use bp_mapin
1387          * to make pages kernel addressable.
1388          */
1389         return (bp);
1390 }
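
/*
 * Hedged sketch of how a caller (e.g. a filesystem getpage/putpage path)
 * typically drives a pageio buffer; the device, block number and page
 * offset shown are the caller's responsibility and are only outlined here:
 *
 *	bp = pageio_setup(pp, len, vp, B_READ | flags);
 *	bp->b_edev = dev;
 *	bp->b_dev = cmpdev(dev);
 *	bp->b_blkno = blkno;
 *	bp->b_un.b_addr = (caddr_t)pg_off;	offset within the page, if any
 *	(void) bdev_strategy(bp);
 *	error = biowait(bp);			synchronous case
 *	pageio_done(bp);
 */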
1391 
1392 void
1393 pageio_done(struct buf *bp)
1394 {
1395         ASSERT(SEMA_HELD(&bp->b_sem));
1396         if (bp->b_flags & B_REMAPPED)
1397                 bp_mapout(bp);
1398         VN_RELE(bp->b_vp);
1399         bp->b_vp = NULL;
1400         ASSERT((bp->b_flags & B_NOCACHE) != 0);
1401 
1402         /* A sema_v(bp->b_sem) is implied if we are destroying it */
1403         sema_destroy(&bp->b_sem);
1404         sema_destroy(&bp->b_io);
1405         kmem_free(bp, sizeof (struct buf));
1406 }
1407 
1408 /*
1409  * Check to see whether the buffers, except the one pointed by sbp,
1410  * associated with the device are busy.
1411  * NOTE: This expensive operation should be improved together with ufs_icheck().
1412  */
1413 int
1414 bcheck(dev_t dev, struct buf *sbp)
1415 {
1416         struct buf      *bp;
1417         struct buf      *dp;
1418         int i;
1419         kmutex_t *hmp;
1420 
1421         /*
1422          * check for busy bufs for this filesystem
1423          */
1424         for (i = 0; i < v.v_hbuf; i++) {
1425                 dp = (struct buf *)&hbuf[i];
1426                 hmp = &hbuf[i].b_lock;
1427 
1428                 mutex_enter(hmp);
1429                 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1430                         /*
1431                          * if buf is busy or dirty, then filesystem is busy
1432                          */
1433                         if ((bp->b_edev == dev) &&
1434                             ((bp->b_flags & B_STALE) == 0) &&
1435                             (bp->b_flags & (B_DELWRI|B_BUSY)) &&
1436                             (bp != sbp)) {
1437                                 mutex_exit(hmp);
1438                                 return (1);
1439                         }
1440                 }
1441                 mutex_exit(hmp);
1442         }
1443         return (0);
1444 }
1445 
1446 /*
1447  * Hash two 32 bit entities.
1448  */
1449 int
1450 hash2ints(int x, int y)
1451 {
1452         int hash = 0;
1453 
1454         hash = x - 1;
1455         hash = ((hash * 7) + (x >> 8)) - 1;
1456         hash = ((hash * 7) + (x >> 16)) - 1;
1457         hash = ((hash * 7) + (x >> 24)) - 1;
1458         hash = ((hash * 7) + y) - 1;
1459         hash = ((hash * 7) + (y >> 8)) - 1;
1460         hash = ((hash * 7) + (y >> 16)) - 1;
1461         hash = ((hash * 7) + (y >> 24)) - 1;
1462 
1463         return (hash);
1464 }
1465 
1466 
1467 /*
1468  * Return a new buffer struct.
1469  *      Create a new buffer if we haven't gone over our high water
1470  *      mark for memory, otherwise try to get one off the freelist.
1471  *
1472  * Returns a locked buf that has no id and is not on any hash or free
1473  * list.
1474  */
1475 static struct buf *
1476 bio_getfreeblk(long bsize)
1477 {
1478         struct buf *bp, *dp;
1479         struct hbuf *hp;
1480         kmutex_t        *hmp;
1481         uint_t          start, end;
1482 
1483         /*
1484          * bfreelist.b_bufsize represents the amount of memory we are
1485          * allowed to allocate in the cache before we hit our hwm.
1486          * Any reference to bfreelist must be protected by bfree_lock
1487          * (mutex_enter(&bfree_lock) / mutex_exit(&bfree_lock)).
1488          */
1489         bio_mem_get(bsize);     /* Account for our memory request */
1490 
1491 again:
1492         bp = bio_bhdr_alloc();  /* Get a buf hdr */
1493         sema_p(&bp->b_sem);      /* Should never fail */
1494 
1495         ASSERT(bp->b_un.b_addr == NULL);
1496         bp->b_un.b_addr = kmem_alloc(bsize, KM_NOSLEEP);
1497         if (bp->b_un.b_addr != NULL) {
1498                 /*
1499                  * Make the common path short
1500                  */
1501                 bp->b_bufsize = bsize;
1502                 ASSERT(SEMA_HELD(&bp->b_sem));
1503                 return (bp);
1504         } else {
1505                 struct buf *save;
1506 
1507                 save = bp;      /* Save bp we allocated */
1508                 start = end = lastindex;
1509 
1510                 biostats.bio_bufwant.value.ui32++;
1511 
1512                 /*
1513                  * Memory isn't available from the system now. Scan
1514                  * the hash buckets till enough space is found.
1515                  */
1516                 do {
1517                         hp = &hbuf[start];
1518                         hmp = &hp->b_lock;
1519                         dp = (struct buf *)hp;
1520 
1521                         mutex_enter(hmp);
1522                         bp = dp->av_forw;
1523 
1524                         while (bp != dp) {
1525 
1526                                 ASSERT(bp != NULL);
1527 
1528                                 if (!sema_tryp(&bp->b_sem)) {
1529                                         bp = bp->av_forw;
1530                                         continue;
1531                                 }
1532 
1533                                 /*
1534                                  * Since we are going down the freelist
1535                                  * associated with this hash bucket, the
1536                                  * B_DELWRI flag should not be set.
1537                                  */
1538                                 ASSERT(!(bp->b_flags & B_DELWRI));
1539 
1540                                 if (bp->b_bufsize == bsize) {
1541                                         hp->b_length--;
1542                                         notavail(bp);
1543                                         bremhash(bp);
1544                                         mutex_exit(hmp);
1545 
1546                                         /*
1547                                          * Didn't kmem_alloc any more, so don't
1548                                          * count it twice.
1549                                          */
1550                                         mutex_enter(&bfree_lock);
1551                                         bfreelist.b_bufsize += bsize;
1552                                         mutex_exit(&bfree_lock);
1553 
1554                                         /*
1555                                          * Update the lastindex value.
1556                                          */
1557                                         lastindex = start;
1558 
1559                                         /*
1560                                          * Put our saved bp back on the list
1561                                          */
1562                                         sema_v(&save->b_sem);
1563                                         bio_bhdr_free(save);
1564                                         ASSERT(SEMA_HELD(&bp->b_sem));
1565                                         return (bp);
1566                                 }
1567                                 sema_v(&bp->b_sem);
1568                                 bp = bp->av_forw;
1569                         }
1570                         mutex_exit(hmp);
1571                         start = ((start + 1) % v.v_hbuf);
1572                 } while (start != end);
1573 
1574                 biostats.bio_bufwait.value.ui32++;
1575                 bp = save;              /* Use original bp */
1576                 bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
1577         }
1578 
1579         bp->b_bufsize = bsize;
1580         ASSERT(SEMA_HELD(&bp->b_sem));
1581         return (bp);
1582 }
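
/*
 * Illustrative sketch only: the allocation policy above is a variant of the
 * common "try KM_NOSLEEP, reclaim, then fall back to KM_SLEEP" pattern.
 * Stripped of the buffer-cache specifics it looks roughly like the code
 * below; example_alloc() and example_reclaim() are assumed names, and
 * example_reclaim() stands in for whatever cache trimming the caller can do.
 *
 *	static void *
 *	example_alloc(size_t size)
 *	{
 *		void *p;
 *
 *		p = kmem_alloc(size, KM_NOSLEEP);
 *		if (p != NULL)
 *			return (p);
 *		example_reclaim(size);
 *		return (kmem_alloc(size, KM_SLEEP));
 *	}
 */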
1583 
1584 /*
1585  * Allocate a buffer header. If none currently available, allocate
1586  * a new pool.
1587  */
1588 static struct buf *
1589 bio_bhdr_alloc(void)
1590 {
1591         struct buf *dp, *sdp;
1592         struct buf *bp;
1593         int i;
1594 
1595         for (;;) {
1596                 mutex_enter(&bhdr_lock);
1597                 if (bhdrlist != NULL) {
1598                         bp = bhdrlist;
1599                         bhdrlist = bp->av_forw;
1600                         mutex_exit(&bhdr_lock);
1601                         bp->av_forw = NULL;
1602                         return (bp);
1603                 }
1604                 mutex_exit(&bhdr_lock);
1605 
1606                 /*
1607                  * Need to allocate a new pool. If the system is currently
1608                  * out of memory, then try freeing things on the freelist.
1609                  */
1610                 dp = kmem_zalloc(sizeof (struct buf) * v.v_buf, KM_NOSLEEP);
1611                 if (dp == NULL) {
1612                         /*
1613                          * System can't give us a pool of headers, try
1614                          * recycling from the free lists.
1615                          */
1616                         bio_recycle(BIO_HEADER, 0);
1617                 } else {
1618                         sdp = dp;
1619                         for (i = 0; i < v.v_buf; i++, dp++) {
1620                                 /*
1621                                  * The next two lines are needed since NODEV
1622                                  * is -1 and not NULL
1623                                  */
1624                                 dp->b_dev = (o_dev_t)NODEV;
1625                                 dp->b_edev = NODEV;
1626                                 dp->av_forw = dp + 1;
1627                                 sema_init(&dp->b_sem, 1, NULL, SEMA_DEFAULT,
1628                                     NULL);
1629                                 sema_init(&dp->b_io, 0, NULL, SEMA_DEFAULT,
1630                                     NULL);
1631                                 dp->b_offset = -1;
1632                         }
1633                         mutex_enter(&bhdr_lock);
1634                         (--dp)->av_forw = bhdrlist;  /* Fix last pointer */
1635                         bhdrlist = sdp;
1636                         nbuf += v.v_buf;
1637                         bp = bhdrlist;
1638                         bhdrlist = bp->av_forw;
1639                         mutex_exit(&bhdr_lock);
1640 
1641                         bp->av_forw = NULL;
1642                         return (bp);
1643                 }
1644         }
1645 }
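
/*
 * Illustrative sketch only: the pool growth above carves a single zalloc'd
 * array into a singly linked free list.  The same technique, with assumed
 * example_ names and with locking omitted for brevity, looks like:
 *
 *	struct example_hdr {
 *		struct example_hdr *next;
 *	};
 *
 *	static struct example_hdr *example_freelist;
 *
 *	static void
 *	example_grow(int count)
 *	{
 *		struct example_hdr *h, *first;
 *		int i;
 *
 *		first = h = kmem_zalloc(sizeof (*h) * count, KM_SLEEP);
 *		for (i = 0; i < count - 1; i++, h++)
 *			h->next = h + 1;
 *		h->next = example_freelist;
 *		example_freelist = first;
 *	}
 */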
1646 
1647 static  void
1648 bio_bhdr_free(struct buf *bp)
1649 {
1650         ASSERT(bp->b_back == NULL);
1651         ASSERT(bp->b_forw == NULL);
1652         ASSERT(bp->av_back == NULL);
1653         ASSERT(bp->av_forw == NULL);
1654         ASSERT(bp->b_un.b_addr == NULL);
1655         ASSERT(bp->b_dev == (o_dev_t)NODEV);
1656         ASSERT(bp->b_edev == NODEV);
1657         ASSERT(bp->b_flags == 0);
1658 
1659         mutex_enter(&bhdr_lock);
1660         bp->av_forw = bhdrlist;
1661         bhdrlist = bp;
1662         mutex_exit(&bhdr_lock);
1663 }
1664 
1665 /*
1666  * If we haven't gone over the high water mark, it's o.k. to
1667  * allocate more buffer space, otherwise recycle buffers
1668  * from the freelist until enough memory is free for a bsize request.
1669  *
1670  * We account for this memory, even though
1671  * we don't allocate it here.
1672  */
1673 static void
1674 bio_mem_get(long bsize)
1675 {
1676         mutex_enter(&bfree_lock);
1677         if (bfreelist.b_bufsize > bsize) {
1678                 bfreelist.b_bufsize -= bsize;
1679                 mutex_exit(&bfree_lock);
1680                 return;
1681         }
1682         mutex_exit(&bfree_lock);
1683         bio_recycle(BIO_MEM, bsize);
1684 }
1685 
1686 /*
1687  * Flush a list of delayed write buffers.
1688  * (Currently used only by bio_recycle() below.)
1689  */
1690 static void
1691 bio_flushlist(struct buf *delwri_list)
1692 {
1693         struct buf *bp;
1694 
1695         while (delwri_list != EMPTY_LIST) {
1696                 bp = delwri_list;
1697                 bp->b_flags |= B_AGE | B_ASYNC;
1698                 if (bp->b_vp == NULL) {              /* !ufs */
1699                         BWRITE(bp);
1700                 } else {                        /* ufs */
1701                         UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
1702                 }
1703                 delwri_list = bp->b_list;
1704                 bp->b_list = NULL;
1705         }
1706 }
1707 
1708 /*
1709  * Start recycling buffers on the freelist for one of two reasons:
1710  *      - we need a buffer header
1711  *      - we need to free up memory
1712  * Once started, we continue to recycle buffers until the B_AGE
1713  * buffers are gone.
1714  */
1715 static void
1716 bio_recycle(int want, long bsize)
1717 {
1718         struct buf *bp, *dp, *dwp, *nbp;
1719         struct hbuf *hp;
1720         int     found = 0;
1721         kmutex_t        *hmp;
1722         int             start, end;
1723         struct buf *delwri_list = EMPTY_LIST;
1724 
1725         /*
1726          * Recycle buffers.
1727          */
1728 top:
1729         start = end = lastindex;
1730         do {
1731                 hp = &hbuf[start];
1732                 hmp = &hp->b_lock;
1733                 dp = (struct buf *)hp;
1734 
1735                 mutex_enter(hmp);
1736                 bp = dp->av_forw;
1737 
1738                 while (bp != dp) {
1739 
1740                         ASSERT(bp != NULL);
1741 
1742                         if (!sema_tryp(&bp->b_sem)) {
1743                                 bp = bp->av_forw;
1744                                 continue;
1745                         }
1746                         /*
1747                          * Do we really want to nuke all of the B_AGE stuff??
1748                          */
1749                         if ((bp->b_flags & B_AGE) == 0 && found) {
1750                                 sema_v(&bp->b_sem);
1751                                 mutex_exit(hmp);
1752                                 lastindex = start;
1753                                 return; /* All done */
1754                         }
1755 
1756                         ASSERT(MUTEX_HELD(&hp->b_lock));
1757                         ASSERT(!(bp->b_flags & B_DELWRI));
1758                         hp->b_length--;
1759                         notavail(bp);
1760 
1761                         /*
1762                          * Remove bhdr from cache, free up memory,
1763                          * and add the hdr to the freelist.
1764                          */
1765                         bremhash(bp);
1766                         mutex_exit(hmp);
1767 
1768                         if (bp->b_bufsize) {
1769                                 kmem_free(bp->b_un.b_addr, bp->b_bufsize);
1770                                 bp->b_un.b_addr = NULL;
1771                                 mutex_enter(&bfree_lock);
1772                                 bfreelist.b_bufsize += bp->b_bufsize;
1773                                 mutex_exit(&bfree_lock);
1774                         }
1775 
1776                         bp->b_dev = (o_dev_t)NODEV;
1777                         bp->b_edev = NODEV;
1778                         bp->b_flags = 0;
1779                         sema_v(&bp->b_sem);
1780                         bio_bhdr_free(bp);
1781                         if (want == BIO_HEADER) {
1782                                 found = 1;
1783                         } else {
1784                                 ASSERT(want == BIO_MEM);
1785                                 if (!found && bfreelist.b_bufsize >= bsize) {
1786                                         /* Account for the memory we want */
1787                                         mutex_enter(&bfree_lock);
1788                                         if (bfreelist.b_bufsize >= bsize) {
1789                                                 bfreelist.b_bufsize -= bsize;
1790                                                 found = 1;
1791                                         }
1792                                         mutex_exit(&bfree_lock);
1793                                 }
1794                         }
1795 
1796                         /*
1797                          * Since we dropped hmp, start from the
1798                          * beginning.
1799                          */
1800                         mutex_enter(hmp);
1801                         bp = dp->av_forw;
1802                 }
1803                 mutex_exit(hmp);
1804 
1805                 /*
1806                  * Look at the delayed write list.
1807                  * First gather into a private list, then write them.
1808                  */
1809                 dwp = (struct buf *)&dwbuf[start];
1810                 mutex_enter(&blist_lock);
1811                 bio_doingflush++;
1812                 mutex_enter(hmp);
1813                 for (bp = dwp->av_forw; bp != dwp; bp = nbp) {
1814 
1815                         ASSERT(bp != NULL);
1816                         nbp = bp->av_forw;
1817 
1818                         if (!sema_tryp(&bp->b_sem))
1819                                 continue;
1820                         ASSERT(bp->b_flags & B_DELWRI);
1821                         /*
1822                          * Do we really want to nuke all of the B_AGE stuff??
1823                          */
1824 
1825                         if ((bp->b_flags & B_AGE) == 0 && found) {
1826                                 sema_v(&bp->b_sem);
1827                                 mutex_exit(hmp);
1828                                 lastindex = start;
1829                                 mutex_exit(&blist_lock);
1830                                 bio_flushlist(delwri_list);
1831                                 mutex_enter(&blist_lock);
1832                                 bio_doingflush--;
1833                                 if (bio_flinv_cv_wanted) {
1834                                         bio_flinv_cv_wanted = 0;
1835                                         cv_broadcast(&bio_flushinval_cv);
1836                                 }
1837                                 mutex_exit(&blist_lock);
1838                                 return; /* All done */
1839                         }
1840 
1841                         /*
1842                          * If the buffer is already on a flush or
1843                          * invalidate list then just skip it.
1844                          */
1845                         if (bp->b_list != NULL) {
1846                                 sema_v(&bp->b_sem);
1847                                 continue;
1848                         }
1849                         /*
1850                          * We are still on the same bucket.
1851                          */
1852                         hp->b_length--;
1853                         notavail(bp);
1854                         bp->b_list = delwri_list;
1855                         delwri_list = bp;
1856                 }
1857                 mutex_exit(hmp);
1858                 mutex_exit(&blist_lock);
1859                 bio_flushlist(delwri_list);
1860                 delwri_list = EMPTY_LIST;
1861                 mutex_enter(&blist_lock);
1862                 bio_doingflush--;
1863                 if (bio_flinv_cv_wanted) {
1864                         bio_flinv_cv_wanted = 0;
1865                         cv_broadcast(&bio_flushinval_cv);
1866                 }
1867                 mutex_exit(&blist_lock);
1868                 start = (start + 1) % v.v_hbuf;
1869 
1870         } while (start != end);
1871 
1872         if (found)
1873                 return;
1874 
1875         /*
1876          * Free lists are exhausted and we haven't satisfied the request.
1877          * Wait here for more entries to be added to the freelist.
1878          * Because that might have just happened, make the wait timed.
1879          */
1880         mutex_enter(&bfree_lock);
1881         bfreelist.b_flags |= B_WANTED;
1882         (void) cv_reltimedwait(&bio_mem_cv, &bfree_lock, hz, TR_CLOCK_TICK);
1883         mutex_exit(&bfree_lock);
1884         goto top;
1885 }
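
/*
 * Illustrative sketch only: the wait at the bottom of bio_recycle() is an
 * instance of the usual timed condition-variable retry pattern.  With an
 * assumed example_lock/example_cv pair (a kmutex_t and kcondvar_t initialized
 * elsewhere) and an assumed predicate, the pattern is:
 *
 *	mutex_enter(&example_lock);
 *	while (example_resource_short()) {
 *		example_wanted = 1;
 *		(void) cv_reltimedwait(&example_cv, &example_lock,
 *		    hz, TR_CLOCK_TICK);
 *	}
 *	mutex_exit(&example_lock);
 */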
1886 
1887 /*
1888  * See if the block is associated with some buffer
1889  * (mainly to avoid getting hung up on a wait in breada).
1890  */
1891 static int
1892 bio_incore(dev_t dev, daddr_t blkno)
1893 {
1894         struct buf *bp;
1895         struct buf *dp;
1896         uint_t index;
1897         kmutex_t *hmp;
1898 
1899         index = bio_bhash(dev, blkno);
1900         dp = (struct buf *)&hbuf[index];
1901         hmp = &hbuf[index].b_lock;
1902 
1903         mutex_enter(hmp);
1904         for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1905                 if (bp->b_blkno == blkno && bp->b_edev == dev &&
1906                     (bp->b_flags & B_STALE) == 0) {
1907                         mutex_exit(hmp);
1908                         return (1);
1909                 }
1910         }
1911         mutex_exit(hmp);
1912         return (0);
1913 }
1914 
1915 static void
1916 bio_pageio_done(struct buf *bp)
1917 {
1918         if (bp->b_flags & B_PAGEIO) {
1919 
1920                 if (bp->b_flags & B_REMAPPED)
1921                         bp_mapout(bp);
1922 
1923                 if (bp->b_flags & B_READ)
1924                         pvn_read_done(bp->b_pages, bp->b_flags);
1925                 else
1926                         pvn_write_done(bp->b_pages, B_WRITE | bp->b_flags);
1927                 pageio_done(bp);
1928         } else {
1929                 ASSERT(bp->b_flags & B_REMAPPED);
1930                 bp_mapout(bp);
1931                 brelse(bp);
1932         }
1933 }
1934 
1935 /*
1936  * bioerror(9F) - indicate error in buffer header
1937  * If 'error' is zero, remove the error indication.
1938  */
1939 void
1940 bioerror(struct buf *bp, int error)
1941 {
1942         ASSERT(bp != NULL);
1943         ASSERT(error >= 0);
1944         ASSERT(SEMA_HELD(&bp->b_sem));
1945 
1946         if (error != 0) {
1947                 bp->b_flags |= B_ERROR;
1948         } else {
1949                 bp->b_flags &= ~B_ERROR;
1950         }
1951         bp->b_error = error;
1952 }
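
/*
 * Illustrative sketch only: a driver strategy(9E) routine commonly pairs
 * bioerror() with biodone() to fail a request it cannot start.  The bounds
 * check, example_nblocks, and example_start_io() below are assumptions.
 *
 *	static int
 *	example_strategy(struct buf *bp)
 *	{
 *		if (bp->b_blkno >= example_nblocks) {
 *			bioerror(bp, ENXIO);
 *			biodone(bp);
 *			return (0);
 *		}
 *		example_start_io(bp);
 *		return (0);
 *	}
 */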
1953 
1954 /*
1955  * bioreset(9F) - reuse a private buffer header after I/O is complete
1956  */
1957 void
1958 bioreset(struct buf *bp)
1959 {
1960         ASSERT(bp != NULL);
1961 
1962         biofini(bp);
1963         bioinit(bp);
1964 }
1965 
1966 /*
1967  * biosize(9F) - return size of a buffer header
1968  */
1969 size_t
1970 biosize(void)
1971 {
1972         return (sizeof (struct buf));
1973 }
1974 
1975 /*
1976  * biomodified(9F) - check if buffer is modified
1977  */
1978 int
1979 biomodified(struct buf *bp)
1980 {
1981         int npf;
1982         int ppattr;
1983         struct page *pp;
1984 
1985         ASSERT(bp != NULL);
1986 
1987         if ((bp->b_flags & B_PAGEIO) == 0) {
1988                 return (-1);
1989         }
1990         pp = bp->b_pages;
1991         npf = btopr(bp->b_bcount + ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET));
1992 
1993         while (npf > 0) {
1994                 ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
1995                     HAT_SYNC_STOPON_MOD);
1996                 if (ppattr & P_MOD)
1997                         return (1);
1998                 pp = pp->p_next;
1999                 npf--;
2000         }
2001 
2002         return (0);
2003 }
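
/*
 * Illustrative sketch only: a caller deciding whether a delayed write can be
 * skipped might consult biomodified().  A return of -1 means the buffer is
 * not B_PAGEIO, so the hypothetical helper below conservatively treats it as
 * still needing the write.
 *
 *	static int
 *	example_write_needed(struct buf *bp)
 *	{
 *		int mod;
 *
 *		mod = biomodified(bp);
 *		if (mod == -1)
 *			return (1);
 *		return (mod);
 *	}
 */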
2004 
2005 /*
2006  * bioinit(9F) - initialize a buffer structure
2007  */
2008 void
2009 bioinit(struct buf *bp)
2010 {
2011         bzero(bp, sizeof (struct buf));
2012         sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
2013         sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
2014         bp->b_offset = -1;
2015 }
2016 
2017 /*
2018  * biofini(9F) - uninitialize a buffer structure
2019  */
2020 void
2021 biofini(struct buf *bp)
2022 {
2023         sema_destroy(&bp->b_io);
2024         sema_destroy(&bp->b_sem);
2025 }
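
/*
 * Illustrative sketch only: a driver that manages its own buf headers sizes
 * them with biosize(), constructs them with bioinit(), recycles them between
 * transfers with bioreset(), and runs biofini() before freeing the memory.
 * The sequence below is the assumed lifecycle, with the I/O itself elided.
 *
 *	struct buf *bp;
 *
 *	bp = kmem_alloc(biosize(), KM_SLEEP);
 *	bioinit(bp);
 *
 *	bioreset(bp);
 *
 *	biofini(bp);
 *	kmem_free(bp, biosize());
 */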
2026 
2027 /*
2028  * bioclone(9F) - clone a buffer
2029  */
2030 struct buf *
2031 bioclone(struct buf *bp, off_t off, size_t len, dev_t dev, daddr_t blkno,
2032     int (*iodone)(struct buf *), struct buf *bp_mem, int sleep)
2033 {
2034         struct buf *bufp;
2035 
2036         ASSERT(bp);
2037         if (bp_mem == NULL) {
2038                 bufp = kmem_alloc(sizeof (struct buf), sleep);
2039                 if (bufp == NULL) {
2040                         return (NULL);
2041                 }
2042                 bioinit(bufp);
2043         } else {
2044                 bufp = bp_mem;
2045                 bioreset(bufp);
2046         }
2047 
2048 #define BUF_CLONE_FLAGS (B_READ|B_WRITE|B_SHADOW|B_PHYS|B_PAGEIO|B_FAILFAST|\
2049         B_ABRWRITE)
2050 
2051         /*
2052          * The cloned buffer does not inherit the B_REMAPPED flag.
2053          */
2054         bufp->b_flags = (bp->b_flags & BUF_CLONE_FLAGS) | B_BUSY;
2055         bufp->b_bcount = len;
2056         bufp->b_blkno = blkno;
2057         bufp->b_iodone = iodone;
2058         bufp->b_proc = bp->b_proc;
2059         bufp->b_edev = dev;
2060         bufp->b_file = bp->b_file;
2061         bufp->b_offset = bp->b_offset;
2062 
2063         if (bp->b_flags & B_SHADOW) {
2064                 ASSERT(bp->b_shadow);
2065                 ASSERT(bp->b_flags & B_PHYS);
2066 
2067                 bufp->b_shadow = bp->b_shadow +
2068                     btop(((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off);
2069                 bufp->b_un.b_addr = (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
2070                 if (bp->b_flags & B_REMAPPED)
2071                         bufp->b_proc = NULL;
2072         } else {
2073                 if (bp->b_flags & B_PAGEIO) {
2074                         struct page *pp;
2075                         off_t o;
2076                         int i;
2077 
2078                         pp = bp->b_pages;
2079                         o = ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off;
2080                         for (i = btop(o); i > 0; i--) {
2081                                 pp = pp->p_next;
2082                         }
2083                         bufp->b_pages = pp;
2084                         bufp->b_un.b_addr = (caddr_t)(o & PAGEOFFSET);
2085                 } else {
2086                         bufp->b_un.b_addr =
2087                             (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
2088                         if (bp->b_flags & B_REMAPPED)
2089                                 bufp->b_proc = NULL;
2090                 }
2091         }
2092         return (bufp);
2093 }
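
/*
 * Illustrative sketch only: a layered driver can use bioclone() to carve a
 * child request out of a parent buffer, for example to direct the first half
 * of a transfer at a different device.  The parent buffer pbp, child_dev, and
 * child_blkno below are assumptions; the child shares the parent's data
 * pages, so the parent must remain valid until the child completes.  With a
 * NULL iodone routine the child can simply be waited on with biowait().
 *
 *	struct buf *cbp;
 *	int err;
 *
 *	cbp = bioclone(pbp, 0, pbp->b_bcount / 2, child_dev, child_blkno,
 *	    NULL, NULL, KM_SLEEP);
 *	(void) bdev_strategy(cbp);
 *	err = biowait(cbp);
 *	biofini(cbp);
 *	kmem_free(cbp, biosize());
 */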