1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
  27 /*        All Rights Reserved   */
  28 
  29 /*
  30  * University Copyright- Copyright (c) 1982, 1986, 1988
  31  * The Regents of the University of California
  32  * All Rights Reserved
  33  *
  34  * University Acknowledgment- Portions of this document are derived from
  35  * software developed by the University of California, Berkeley, and its
  36  * contributors.
  37  */
  38 
  39 #ifndef _SYS_BUF_H
  40 #define _SYS_BUF_H
  41 
  42 #include <sys/types32.h>
  43 #include <sys/t_lock.h>
  44 #include <sys/kstat.h>
  45 
  46 #ifdef  __cplusplus
  47 extern "C" {
  48 #endif
  49 
  50 /*
  51  *      Each buffer in the pool is usually doubly linked into 2 lists:
  52  *      the device with which it is currently associated (always)
  53  *      and also on a list of blocks available for allocation
  54  *      for other use (usually).
  55  *      The latter list is kept in last-used order, and the two
  56  *      lists are doubly linked to make it easy to remove
  57  *      a buffer from one list when it was found by
  58  *      looking through the other.
  59  *      A buffer is on the available list, and is liable
  60  *      to be reassigned to another disk block, if and only
  61  *      if it is not marked BUSY.  When a buffer is busy, the
  62  *      available-list pointers can be used for other purposes.
  63  *      Most drivers use the forward ptr as a link in their I/O active queue.
  64  *      A buffer header contains all the information required to perform I/O.
  65  *      Most of the routines which manipulate these things are in bio.c.
  66  *
  67  *      There are a number of locks associated with the buffer management
  68  *      system.
  69  *      hbuf.b_lock:    protects hash chains, buffer hdr freelists
  70  *                      and delayed write freelist
 *      bfree_lock:     protects the bfreelist structure
  72  *      bhdr_lock:      protects the free header list
  73  *      blist_lock:     protects b_list fields
  74  *      buf.b_sem:      protects all remaining members in the buf struct
  75  *      buf.b_io:       I/O synchronization variable
  76  *
  77  *      A buffer header is never "locked" (b_sem) when it is on
  78  *      a "freelist" (bhdrlist or bfreelist avail lists).
  79  */
  80 typedef struct  buf {
  81         int     b_flags;                /* see defines below */
  82         struct buf *b_forw;             /* headed by d_tab of conf.c */
  83         struct buf *b_back;             /*  "  */
  84         struct buf *av_forw;            /* position on free list, */
  85         struct buf *av_back;            /* if not BUSY */
  86         o_dev_t b_dev;                  /* OLD major+minor device name */
  87         size_t b_bcount;                /* transfer count */
  88         union {
  89                 caddr_t b_addr;         /* low order core address */
  90                 struct fs *b_fs;        /* superblocks */
  91                 struct cg *b_cg;        /* UFS cylinder group block */
  92                 struct dinode *b_dino;  /* UFS ilist */
  93                 daddr32_t *b_daddr;     /* disk blocks */
  94         } b_un;
  95 
  96         lldaddr_t       _b_blkno;       /* block # on device (union) */
  97 #define b_lblkno        _b_blkno._f
  98 #ifdef _LP64
  99 #define b_blkno         _b_blkno._f
 100 #else
 101 #define b_blkno         _b_blkno._p._l
 102 #endif /* _LP64 */
 103 
 104         char    b_obs1;                 /* obsolete */
 105         size_t  b_resid;                /* words not transferred after error */
 106         clock_t b_start;                /* request start time */
 107         struct  proc  *b_proc;          /* process doing physical or swap I/O */
 108         struct  page  *b_pages;         /* page list for PAGEIO */
 109         clock_t b_obs2;                 /* obsolete */
 110         /* Begin new stuff */
 111 #define b_actf  av_forw
 112 #define b_actl  av_back
 113 #define b_active b_bcount
 114 #define b_errcnt b_resid
 115         size_t  b_bufsize;              /* size of allocated buffer */
 116         int     (*b_iodone)(struct buf *);      /* function called by iodone */
 117         struct  vnode *b_vp;            /* vnode associated with block */
 118         struct  buf *b_chain;           /* chain together all buffers here */
 119         int     b_obs3;                 /* obsolete */
 120         int     b_error;                /* expanded error field */
 121         void    *b_private;             /* "opaque" driver private area */
 122         dev_t   b_edev;                 /* expanded dev field */
 123         ksema_t b_sem;                  /* Exclusive access to buf */
 124         ksema_t b_io;                   /* I/O Synchronization */
 125         struct buf *b_list;             /* List of potential B_DELWRI bufs */
 126         struct page **b_shadow;         /* shadow page list */
 127         void    *b_dip;                 /* device info pointer */
 128         struct vnode *b_file;           /* file associated with this buffer */
 129         offset_t b_offset;              /* offset in file assoc. with buffer */
 130 } buf_t;
 131 
 132 /*
 133  * Bufhd structures used at the head of the hashed buffer queues.
 134  * We only need seven words for this, so this abbreviated
 135  * definition saves some space.
 136  */
 137 struct diskhd {
 138         int     b_flags;                /* not used, needed for consistency */
 139         struct buf *b_forw, *b_back;    /* queue of unit queues */
 140         struct buf *av_forw, *av_back;  /* queue of bufs for this unit */
 141         o_dev_t b_dev;                  /* OLD major+minor device name */
 142         size_t b_bcount;                /* transfer count */
 143 };
 144 
 145 
 146 /*
 147  * Statistics on the buffer cache
 148  */
 149 struct biostats {
 150         kstat_named_t   bio_lookup;     /* requests to assign buffer */
 151         kstat_named_t   bio_hit;        /* buffer already associated with blk */
 152         kstat_named_t   bio_bufwant;    /* kmem_allocs NOSLEEP failed new buf */
 153         kstat_named_t   bio_bufwait;    /* kmem_allocs with KM_SLEEP for buf */
 154         kstat_named_t   bio_bufbusy;    /* buffer locked by someone else */
 155         kstat_named_t   bio_bufdup;     /* duplicate buffer found for block */
 156 };
 157 
 158 /*
 159  * These flags are kept in b_flags.
 160  * The first group is part of the DDI
 161  */
 162 #define B_BUSY          0x0001  /* not on av_forw/back list */
 163 #define B_DONE          0x0002  /* transaction finished */
 164 #define B_ERROR         0x0004  /* transaction aborted */
 165 #define B_PAGEIO        0x0010  /* do I/O to pages on bp->p_pages */
 166 #define B_PHYS          0x0020  /* Physical IO potentially using UNIBUS map */
 167 #define B_READ          0x0040  /* read when I/O occurs */
 168 #define B_WRITE         0x0100  /* non-read pseudo-flag */
 169 
 170 /* Not part of the DDI */
 171 #define B_WANTED        0x0080          /* issue wakeup when BUSY goes off */
 172 #define B_AGE           0x000200        /* delayed write for correct aging */
 173 #define B_ASYNC         0x000400        /* don't wait for I/O completion */
 174 #define B_DELWRI        0x000800        /* delayed write-wait til buf needed */
 175 #define B_STALE         0x001000        /* on av_* list; invalid contents */
 176 #define B_DONTNEED      0x002000        /* after write, need not be cached */
 177 #define B_REMAPPED      0x004000        /* buffer is kernel addressable */
 178 #define B_FREE          0x008000        /* free page when done */
 179 #define B_INVAL         0x010000        /* destroy page when done */
 180 #define B_FORCE         0x020000        /* semi-permanent removal from cache */
 181 #define B_NOCACHE       0x080000        /* don't cache block when released */
 182 #define B_TRUNC         0x100000        /* truncate page without I/O */
 183 #define B_SHADOW        0x200000        /* is b_shadow field valid? */
 184 #define B_RETRYWRI      0x400000        /* retry write til works or bfinval */
 185 #define B_FAILFAST      0x1000000       /* Fail promptly if device goes away */
 186 #define B_STARTED       0x2000000       /* io:::start probe called for buf */
 187 #define B_ABRWRITE      0x4000000       /* Application based recovery active */
 188 #define B_PAGE_NOWAIT   0x8000000       /* Skip the page if it is locked */
 189 
 190 /*
 191  * There is some confusion over the meaning of B_FREE and B_INVAL and what
 192  * the use of one over the other implies.
 193  *
 194  * In both cases, when we are done with the page (buffer) we want to free
 195  * up the page.  In the case of B_FREE, the page will go to the cachelist.
 * In the case of B_INVAL, the page will be destroyed (hashed out of its
 197  * vnode) and placed on the freelist.  Beyond this, there is no difference
 198  * between the sole use of these two flags.  In both cases, IO will be done
 199  * if the page is not yet committed to storage.
 200  *
 201  * In order to discard pages without writing them back, (B_INVAL | B_TRUNC)
 202  * should be used.
 203  *
 204  * Use (B_INVAL | B_FORCE) to force the page to be destroyed even if we
 * could not successfully write out the page.
 206  */
 207 
 208 /*
 209  * Insq/Remq for the buffer hash lists.
 210  */
 211 #define bremhash(bp) { \
 212         ASSERT((bp)->b_forw != NULL); \
 213         ASSERT((bp)->b_back != NULL); \
 214         (bp)->b_back->b_forw = (bp)->b_forw; \
 215         (bp)->b_forw->b_back = (bp)->b_back; \
 216         (bp)->b_forw = (bp)->b_back = NULL; \
 217 }
 218 #define binshash(bp, dp) { \
 219         ASSERT((bp)->b_forw == NULL); \
 220         ASSERT((bp)->b_back == NULL); \
 221         ASSERT((dp)->b_forw != NULL); \
 222         ASSERT((dp)->b_back != NULL); \
 223         (bp)->b_forw = (dp)->b_forw; \
 224         (bp)->b_back = (dp); \
 225         (dp)->b_forw->b_back = (bp); \
 226         (dp)->b_forw = (bp); \
 227 }
 228 
 229 
 230 /*
 231  * The hash structure maintains two lists:
 232  *
 233  *      1) The hash list of buffers (b_forw & b_back)
 234  *      2) The LRU free list of buffers on this hash bucket (av_forw & av_back)
 235  *
 236  * The dwbuf structure keeps a list of delayed write buffers per hash bucket
 237  * hence there are exactly the same number of dwbuf structures as there are
 238  * the hash buckets (hbuf structures) in the system.
 239  *
 240  * The number of buffers on the freelist may not be equal to the number of
 241  * buffers on the hash list. That is because when buffers are busy they are
 242  * taken off the freelist but not off the hash list. "b_length" field keeps
 243  * track of the number of free buffers (including delayed writes ones) on
 244  * the hash bucket. The "b_lock" mutex protects the free list as well as
 245  * the hash list. It also protects the counter "b_length".
 246  *
 247  * Enties b_forw, b_back, av_forw & av_back must be at the same offset
 248  * as the ones in buf structure.
 249  */
 250 struct  hbuf {
 251         int     b_flags;
 252 
 253         struct  buf     *b_forw;        /* hash list forw pointer */
 254         struct  buf     *b_back;        /* hash list back pointer */
 255 
 256         struct  buf     *av_forw;       /* free list forw pointer */
 257         struct  buf     *av_back;       /* free list back pointer */
 258 
 259         int             b_length;       /* # of entries on free list */
 260         kmutex_t        b_lock;         /* lock to protect this structure */
 261 };
 262 
 263 
 264 /*
 265  * The delayed list pointer entries should match with the buf strcuture.
 266  */
 267 struct  dwbuf {
 268         int     b_flags;                /* not used */
 269 
 270         struct  buf     *b_forw;        /* not used */
 271         struct  buf     *b_back;        /* not used */
 272 
 273         struct  buf     *av_forw;       /* delayed write forw pointer */
 274         struct  buf     *av_back;       /* delayed write back pointer */
 275 };
 276 
 277 
 278 /*
 279  * Unlink a buffer from the available (free or delayed write) list and mark
 280  * it busy (internal interface).
 281  */
 282 #define notavail(bp) \
 283 {\
 284         ASSERT(SEMA_HELD(&bp->b_sem)); \
 285         ASSERT((bp)->av_forw != NULL); \
 286         ASSERT((bp)->av_back != NULL); \
 287         ASSERT((bp)->av_forw != (bp)); \
 288         ASSERT((bp)->av_back != (bp)); \
 289         (bp)->av_back->av_forw = (bp)->av_forw; \
 290         (bp)->av_forw->av_back = (bp)->av_back; \
 291         (bp)->b_flags |= B_BUSY; \
 292         (bp)->av_forw = (bp)->av_back = NULL; \
 293 }
 294 
 295 #if defined(_KERNEL)
 296 /*
 297  * Macros to avoid the extra function call needed for binary compat.
 298  *
 299  * B_RETRYWRI is not included in clear_flags for BWRITE(), BWRITE2(),
 300  * or brwrite() so that the retry operation is persistent until the
 301  * write either succeeds or the buffer is bfinval()'d.
 302  *
 303  */
 304 #define BREAD(dev, blkno, bsize) \
 305         bread_common(/* ufsvfsp */ NULL, dev, blkno, bsize)
 306 
 307 #define BWRITE(bp) \
 308         bwrite_common(/* ufsvfsp */ NULL, bp, /* force_wait */ 0, \
 309                 /* do_relse */ 1, \
 310                 /* clear_flags */ (B_READ | B_DONE | B_ERROR | B_DELWRI))
 311 
 312 #define BWRITE2(bp) \
 313         bwrite_common(/* ufsvfsp */ NULL, bp, /* force_wait */ 1, \
 314                 /* do_relse */ 0, \
 315                 /* clear_flags */ (B_READ | B_DONE | B_ERROR | B_DELWRI))
 316 
 317 #define GETBLK(dev, blkno, bsize) \
 318         getblk_common(/* ufsvfsp */ NULL, dev, blkno, bsize, /* errflg */ 0)
 319 
 320 
 321 /*
 322  * Macros for new retry write interfaces.
 323  */
 324 
 325 /*
 326  * Same as bdwrite() except write failures are retried.
 327  */
 328 #define bdrwrite(bp) { \
 329         (bp)->b_flags |= B_RETRYWRI; \
 330         bdwrite((bp)); \
 331 }
 332 
 333 /*
 334  * Same as bwrite() except write failures are retried.
 335  */
 336 #define brwrite(bp) { \
 337         (bp)->b_flags |= B_RETRYWRI; \
 338         bwrite_common((bp), /* force_wait */ 0, /* do_relse */ 1, \
 339                 /* clear_flags */ (B_READ | B_DONE | B_ERROR | B_DELWRI)); \
 340 }
 341 
 342 extern struct hbuf      *hbuf;          /* Hash table */
 343 extern struct dwbuf     *dwbuf;         /* delayed write hash table */
 344 extern struct buf       *buf;           /* The buffer pool itself */
 345 extern struct buf       bfreelist;      /* head of available list */
 346 
 347 extern void (*bio_lufs_strategy)(void *, buf_t *);      /* UFS Logging */
 348 extern void (*bio_snapshot_strategy)(void *, buf_t *);  /* UFS snapshots */
 349 
 350 int     bcheck(dev_t, struct buf *);
 351 int     iowait(struct buf *);
 352 int     hash2ints(int x, int y);
 353 int     bio_busy(int);
 354 int     biowait(struct buf *);
 355 int     biomodified(struct buf *);
 356 int     geterror(struct buf *);
 357 void    minphys(struct buf *);
 358 /*
 359  * ufsvfsp is declared as a void * to avoid having everyone that uses
 360  * this header file include sys/fs/ufs_inode.h.
 361  */
 362 void    bwrite_common(void *ufsvfsp, struct buf *, int force_wait,
 363         int do_relse, int clear_flags);
 364 void    bwrite(struct buf *);
 365 void    bwrite2(struct buf *);
 366 void    bdwrite(struct buf *);
 367 void    bawrite(struct buf *);
 368 void    brelse(struct buf *);
 369 void    iodone(struct buf *);
 370 void    clrbuf(struct buf *);
 371 void    bflush(dev_t);
 372 void    blkflush(dev_t, daddr_t);
 373 void    binval(dev_t);
 374 int     bfinval(dev_t, int);
 375 void    binit(void);
 376 void    biodone(struct buf *);
 377 void    bioinit(struct buf *);
 378 void    biofini(struct buf *);
 379 void    bp_mapin(struct buf *);
 380 void    *bp_mapin_common(struct buf *, int);
 381 void    bp_mapout(struct buf *);
 382 int     bp_copyin(struct buf *, void *, offset_t, size_t);
 383 int     bp_copyout(void *, struct buf *, offset_t, size_t);
 384 void    bp_init(size_t, uint_t);
 385 int     bp_color(struct buf *);
 386 void    pageio_done(struct buf *);
 387 struct buf *bread(dev_t, daddr_t, long);
 388 struct buf *bread_common(void *, dev_t, daddr_t, long);
 389 struct buf *breada(dev_t, daddr_t, daddr_t, long);
 390 struct buf *getblk(dev_t, daddr_t, long);
 391 struct buf *getblk_common(void *, dev_t, daddr_t, long, int);
 392 struct buf *ngeteblk(long);
 393 struct buf *geteblk(void);
 394 struct buf *pageio_setup(struct page *, size_t, struct vnode *, int);
 395 void bioerror(struct buf *bp, int error);
 396 void bioreset(struct buf *bp);
 397 struct buf *bioclone(struct buf *, off_t, size_t, dev_t, daddr_t,
 398         int (*)(struct buf *), struct buf *, int);
 399 size_t  biosize(void);
 400 #endif  /* defined(_KERNEL) */
 401 
 402 #ifdef  __cplusplus
 403 }
 404 #endif
 405 
 406 #endif  /* _SYS_BUF_H */