1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  *
  25  * Copyright 2017 RackTop Systems.
  26  */
  27 
  28 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
  29 /*        All Rights Reserved   */
  30 
  31 /*
  32  * University Copyright- Copyright (c) 1982, 1986, 1988
  33  * The Regents of the University of California
  34  * All Rights Reserved
  35  *
  36  * University Acknowledgment- Portions of this document are derived from
  37  * software developed by the University of California, Berkeley, and its
  38  * contributors.
  39  */
  40 
  41 #ifndef _SYS_BUF_H
  42 #define _SYS_BUF_H
  43 
  44 #include <sys/types32.h>
  45 #include <sys/t_lock.h>
  46 #include <sys/kstat.h>
  47 
  48 #ifdef  __cplusplus
  49 extern "C" {
  50 #endif
  51 
  52 /*
  53  *      Each buffer in the pool is usually doubly linked into 2 lists:
  54  *      the device with which it is currently associated (always)
  55  *      and also on a list of blocks available for allocation
  56  *      for other use (usually).
  57  *      The latter list is kept in last-used order, and the two
  58  *      lists are doubly linked to make it easy to remove
  59  *      a buffer from one list when it was found by
  60  *      looking through the other.
  61  *      A buffer is on the available list, and is liable
  62  *      to be reassigned to another disk block, if and only
  63  *      if it is not marked BUSY.  When a buffer is busy, the
  64  *      available-list pointers can be used for other purposes.
  65  *      Most drivers use the forward ptr as a link in their I/O active queue.
  66  *      A buffer header contains all the information required to perform I/O.
  67  *      Most of the routines which manipulate these things are in bio.c.
  68  *
  69  *      There are a number of locks associated with the buffer management
  70  *      system.
  71  *      hbuf.b_lock:    protects hash chains, buffer hdr freelists
  72  *                      and delayed write freelist
 *      bfree_lock:     protects the bfreelist structure
  74  *      bhdr_lock:      protects the free header list
  75  *      blist_lock:     protects b_list fields
  76  *      buf.b_sem:      protects all remaining members in the buf struct
  77  *      buf.b_io:       I/O synchronization variable
  78  *
  79  *      A buffer header is never "locked" (b_sem) when it is on
  80  *      a "freelist" (bhdrlist or bfreelist avail lists).
  81  */
  82 typedef struct  buf {
  83         int     b_flags;                /* see defines below */
  84         struct buf *b_forw;             /* headed by d_tab of conf.c */
  85         struct buf *b_back;             /*  "  */
  86         struct buf *av_forw;            /* position on free list, */
  87         struct buf *av_back;            /* if not BUSY */
  88         o_dev_t b_dev;                  /* OLD major+minor device name */
  89         size_t b_bcount;                /* transfer count */
  90         union {
  91                 caddr_t b_addr;         /* low order core address */
  92                 struct fs *b_fs;        /* superblocks */
  93                 struct cg *b_cg;        /* UFS cylinder group block */
  94                 struct dinode *b_dino;  /* UFS ilist */
  95                 daddr32_t *b_daddr;     /* disk blocks */
  96         } b_un;
  97 
  98         lldaddr_t       _b_blkno;       /* block # on device (union) */
  99 #define b_lblkno        _b_blkno._f
 100 #ifdef _LP64
 101 #define b_blkno         _b_blkno._f
 102 #else
 103 #define b_blkno         _b_blkno._p._l
 104 #endif /* _LP64 */
 105 
 106         char    b_obs1;                 /* obsolete */
 107         size_t  b_resid;                /* words not transferred after error */
 108         clock_t b_start;                /* request start time */
 109         struct  proc  *b_proc;          /* process doing physical or swap I/O */
 110         struct  page  *b_pages;         /* page list for PAGEIO */
 111         clock_t b_obs2;                 /* obsolete */
 112         /* Begin new stuff */
 113 #define b_actf  av_forw
 114 #define b_actl  av_back
 115 #define b_active b_bcount
 116 #define b_errcnt b_resid
 117         size_t  b_bufsize;              /* size of allocated buffer */
 118         int     (*b_iodone)(struct buf *);      /* function called by iodone */
 119         struct  vnode *b_vp;            /* vnode associated with block */
 120         struct  buf *b_chain;           /* chain together all buffers here */
 121         int     b_obs3;                 /* obsolete */
 122         int     b_error;                /* expanded error field */
 123         void    *b_private;             /* "opaque" driver private area */
 124         dev_t   b_edev;                 /* expanded dev field */
 125         ksema_t b_sem;                  /* Exclusive access to buf */
 126         ksema_t b_io;                   /* I/O Synchronization */
 127         struct buf *b_list;             /* List of potential B_DELWRI bufs */
 128         struct page **b_shadow;         /* shadow page list */
 129         void    *b_dip;                 /* device info pointer */
 130         struct vnode *b_file;           /* file associated with this buffer */
 131         offset_t b_offset;              /* offset in file assoc. with buffer */
 132 } buf_t;
 133 
 134 /*
 135  * Bufhd structures used at the head of the hashed buffer queues.
 136  * We only need seven words for this, so this abbreviated
 137  * definition saves some space.
 138  */
 139 struct diskhd {
 140         int     b_flags;                /* not used, needed for consistency */
 141         struct buf *b_forw, *b_back;    /* queue of unit queues */
 142         struct buf *av_forw, *av_back;  /* queue of bufs for this unit */
 143         o_dev_t b_dev;                  /* OLD major+minor device name */
 144         size_t b_bcount;                /* transfer count */
 145 };
 146 
 147 
 148 /*
 149  * Statistics on the buffer cache
 150  */
 151 struct biostats {
 152         kstat_named_t   bio_lookup;     /* requests to assign buffer */
 153         kstat_named_t   bio_hit;        /* buffer already associated with blk */
 154         kstat_named_t   bio_bufwant;    /* kmem_allocs NOSLEEP failed new buf */
 155         kstat_named_t   bio_bufwait;    /* kmem_allocs with KM_SLEEP for buf */
 156         kstat_named_t   bio_bufbusy;    /* buffer locked by someone else */
 157         kstat_named_t   bio_bufdup;     /* duplicate buffer found for block */
 158 };
 159 
 160 /*
 161  * These flags are kept in b_flags.
 162  * The first group is part of the DDI
 163  */
 164 #define B_BUSY          0x0001  /* not on av_forw/back list */
 165 #define B_DONE          0x0002  /* transaction finished */
 166 #define B_ERROR         0x0004  /* transaction aborted */
 167 #define B_PAGEIO        0x0010  /* do I/O to pages on bp->p_pages */
 168 #define B_PHYS          0x0020  /* Physical IO potentially using UNIBUS map */
 169 #define B_READ          0x0040  /* read when I/O occurs */
 170 #define B_WRITE         0x0100  /* non-read pseudo-flag */
 171 
 172 /* Not part of the DDI */
 173 #define B_WANTED        0x0080          /* issue wakeup when BUSY goes off */
 174 #define B_AGE           0x000200        /* delayed write for correct aging */
 175 #define B_ASYNC         0x000400        /* don't wait for I/O completion */
 176 #define B_DELWRI        0x000800        /* delayed write-wait til buf needed */
 177 #define B_STALE         0x001000        /* on av_* list; invalid contents */
 178 #define B_DONTNEED      0x002000        /* after write, need not be cached */
 179 #define B_REMAPPED      0x004000        /* buffer is kernel addressable */
 180 #define B_FREE          0x008000        /* free page when done */
 181 #define B_INVAL         0x010000        /* destroy page when done */
 182 #define B_FORCE         0x020000        /* semi-permanent removal from cache */
 183 #define B_NOCACHE       0x080000        /* don't cache block when released */
 184 #define B_TRUNC         0x100000        /* truncate page without I/O */
 185 #define B_SHADOW        0x200000        /* is b_shadow field valid? */
 186 #define B_RETRYWRI      0x400000        /* retry write til works or bfinval */
 187 #define B_FAILFAST      0x1000000       /* Fail promptly if device goes away */
 188 #define B_STARTED       0x2000000       /* io:::start probe called for buf */
 189 #define B_ABRWRITE      0x4000000       /* Application based recovery active */
 190 #define B_PAGE_NOWAIT   0x8000000       /* Skip the page if it is locked */
 191 
 192 /*
 193  * There is some confusion over the meaning of B_FREE and B_INVAL and what
 194  * the use of one over the other implies.
 195  *
 196  * In both cases, when we are done with the page (buffer) we want to free
 197  * up the page.  In the case of B_FREE, the page will go to the cachelist.
 * In the case of B_INVAL, the page will be destroyed (hashed out of its
 199  * vnode) and placed on the freelist.  Beyond this, there is no difference
 200  * between the sole use of these two flags.  In both cases, IO will be done
 201  * if the page is not yet committed to storage.
 202  *
 203  * In order to discard pages without writing them back, (B_INVAL | B_TRUNC)
 204  * should be used.
 205  *
 206  * Use (B_INVAL | B_FORCE) to force the page to be destroyed even if we
 * could not successfully write out the page.
 208  */
 209 
 210 /*
 211  * Insq/Remq for the buffer hash lists.
 212  */
 213 #define bremhash(bp) { \
 214         ASSERT((bp)->b_forw != NULL); \
 215         ASSERT((bp)->b_back != NULL); \
 216         (bp)->b_back->b_forw = (bp)->b_forw; \
 217         (bp)->b_forw->b_back = (bp)->b_back; \
 218         (bp)->b_forw = (bp)->b_back = NULL; \
 219 }
 220 #define binshash(bp, dp) { \
 221         ASSERT((bp)->b_forw == NULL); \
 222         ASSERT((bp)->b_back == NULL); \
 223         ASSERT((dp)->b_forw != NULL); \
 224         ASSERT((dp)->b_back != NULL); \
 225         (bp)->b_forw = (dp)->b_forw; \
 226         (bp)->b_back = (dp); \
 227         (dp)->b_forw->b_back = (bp); \
 228         (dp)->b_forw = (bp); \
 229 }
 230 
 231 
 232 /*
 233  * The hash structure maintains two lists:
 234  *
 235  *      1) The hash list of buffers (b_forw & b_back)
 236  *      2) The LRU free list of buffers on this hash bucket (av_forw & av_back)
 237  *
 238  * The dwbuf structure keeps a list of delayed write buffers per hash bucket
 239  * hence there are exactly the same number of dwbuf structures as there are
 240  * the hash buckets (hbuf structures) in the system.
 241  *
 242  * The number of buffers on the freelist may not be equal to the number of
 243  * buffers on the hash list. That is because when buffers are busy they are
 244  * taken off the freelist but not off the hash list. "b_length" field keeps
 245  * track of the number of free buffers (including delayed writes ones) on
 246  * the hash bucket. The "b_lock" mutex protects the free list as well as
 247  * the hash list. It also protects the counter "b_length".
 248  *
 * Entries b_forw, b_back, av_forw & av_back must be at the same offset
 250  * as the ones in buf structure.
 251  */
 252 struct  hbuf {
 253         int     b_flags;
 254 
 255         struct  buf     *b_forw;        /* hash list forw pointer */
 256         struct  buf     *b_back;        /* hash list back pointer */
 257 
 258         struct  buf     *av_forw;       /* free list forw pointer */
 259         struct  buf     *av_back;       /* free list back pointer */
 260 
 261         int             b_length;       /* # of entries on free list */
 262         kmutex_t        b_lock;         /* lock to protect this structure */
 263 };
 264 
 265 
 266 /*
 * The delayed list pointer entries should match with the buf structure.
 268  */
 269 struct  dwbuf {
 270         int     b_flags;                /* not used */
 271 
 272         struct  buf     *b_forw;        /* not used */
 273         struct  buf     *b_back;        /* not used */
 274 
 275         struct  buf     *av_forw;       /* delayed write forw pointer */
 276         struct  buf     *av_back;       /* delayed write back pointer */
 277 };
 278 
 279 
 280 /*
 281  * Unlink a buffer from the available (free or delayed write) list and mark
 282  * it busy (internal interface).
 283  */
 284 #define notavail(bp) \
 285 {\
 286         ASSERT(SEMA_HELD(&bp->b_sem)); \
 287         ASSERT((bp)->av_forw != NULL); \
 288         ASSERT((bp)->av_back != NULL); \
 289         ASSERT((bp)->av_forw != (bp)); \
 290         ASSERT((bp)->av_back != (bp)); \
 291         (bp)->av_back->av_forw = (bp)->av_forw; \
 292         (bp)->av_forw->av_back = (bp)->av_back; \
 293         (bp)->b_flags |= B_BUSY; \
 294         (bp)->av_forw = (bp)->av_back = NULL; \
 295 }
 296 
 297 #if defined(_KERNEL) || defined(_FAKE_KERNEL)
 298 /*
 299  * Macros to avoid the extra function call needed for binary compat.
 300  *
 301  * B_RETRYWRI is not included in clear_flags for BWRITE(), BWRITE2(),
 302  * or brwrite() so that the retry operation is persistent until the
 303  * write either succeeds or the buffer is bfinval()'d.
 304  *
 305  */
 306 #define BREAD(dev, blkno, bsize) \
 307         bread_common(/* ufsvfsp */ NULL, dev, blkno, bsize)
 308 
 309 #define BWRITE(bp) \
 310         bwrite_common(/* ufsvfsp */ NULL, bp, /* force_wait */ 0, \
 311                 /* do_relse */ 1, \
 312                 /* clear_flags */ (B_READ | B_DONE | B_ERROR | B_DELWRI))
 313 
 314 #define BWRITE2(bp) \
 315         bwrite_common(/* ufsvfsp */ NULL, bp, /* force_wait */ 1, \
 316                 /* do_relse */ 0, \
 317                 /* clear_flags */ (B_READ | B_DONE | B_ERROR | B_DELWRI))
 318 
 319 #define GETBLK(dev, blkno, bsize) \
 320         getblk_common(/* ufsvfsp */ NULL, dev, blkno, bsize, /* errflg */ 0)
 321 
 322 
 323 /*
 324  * Macros for new retry write interfaces.
 325  */
 326 
 327 /*
 328  * Same as bdwrite() except write failures are retried.
 329  */
 330 #define bdrwrite(bp) { \
 331         (bp)->b_flags |= B_RETRYWRI; \
 332         bdwrite((bp)); \
 333 }
 334 
 335 /*
 336  * Same as bwrite() except write failures are retried.
 337  */
 338 #define brwrite(bp) { \
 339         (bp)->b_flags |= B_RETRYWRI; \
 340         bwrite_common((bp), /* force_wait */ 0, /* do_relse */ 1, \
 341                 /* clear_flags */ (B_READ | B_DONE | B_ERROR | B_DELWRI)); \
 342 }
 343 
 344 extern struct hbuf      *hbuf;          /* Hash table */
 345 extern struct dwbuf     *dwbuf;         /* delayed write hash table */
 346 extern struct buf       *buf;           /* The buffer pool itself */
 347 extern struct buf       bfreelist;      /* head of available list */
 348 
 349 extern void (*bio_lufs_strategy)(void *, buf_t *);      /* UFS Logging */
 350 extern void (*bio_snapshot_strategy)(void *, buf_t *);  /* UFS snapshots */
 351 
 352 int     bcheck(dev_t, struct buf *);
 353 int     iowait(struct buf *);
 354 int     hash2ints(int x, int y);
 355 int     bio_busy(int);
 356 int     biowait(struct buf *);
 357 int     biomodified(struct buf *);
 358 int     geterror(struct buf *);
 359 void    minphys(struct buf *);
 360 /*
 361  * ufsvfsp is declared as a void * to avoid having everyone that uses
 362  * this header file include sys/fs/ufs_inode.h.
 363  */
 364 void    bwrite_common(void *ufsvfsp, struct buf *, int force_wait,
 365         int do_relse, int clear_flags);
 366 void    bwrite(struct buf *);
 367 void    bwrite2(struct buf *);
 368 void    bdwrite(struct buf *);
 369 void    bawrite(struct buf *);
 370 void    brelse(struct buf *);
 371 void    iodone(struct buf *);
 372 void    clrbuf(struct buf *);
 373 void    bflush(dev_t);
 374 void    blkflush(dev_t, daddr_t);
 375 void    binval(dev_t);
 376 int     bfinval(dev_t, int);
 377 void    binit(void);
 378 void    biodone(struct buf *);
 379 void    bioinit(struct buf *);
 380 void    biofini(struct buf *);
 381 void    bp_mapin(struct buf *);
 382 void    *bp_mapin_common(struct buf *, int);
 383 void    bp_mapout(struct buf *);
 384 int     bp_copyin(struct buf *, void *, offset_t, size_t);
 385 int     bp_copyout(void *, struct buf *, offset_t, size_t);
 386 void    bp_init(size_t, uint_t);
 387 int     bp_color(struct buf *);
 388 void    pageio_done(struct buf *);
 389 struct buf *bread(dev_t, daddr_t, long);
 390 struct buf *bread_common(void *, dev_t, daddr_t, long);
 391 struct buf *breada(dev_t, daddr_t, daddr_t, long);
 392 struct buf *getblk(dev_t, daddr_t, long);
 393 struct buf *getblk_common(void *, dev_t, daddr_t, long, int);
 394 struct buf *ngeteblk(long);
 395 struct buf *geteblk(void);
 396 struct buf *pageio_setup(struct page *, size_t, struct vnode *, int);
 397 void bioerror(struct buf *bp, int error);
 398 void bioreset(struct buf *bp);
 399 struct buf *bioclone(struct buf *, off_t, size_t, dev_t, daddr_t,
 400         int (*)(struct buf *), struct buf *, int);
 401 size_t  biosize(void);
 402 #endif  /* defined(_KERNEL) || defined(_FAKE_KERNEL) */
 403 
 404 #ifdef  __cplusplus
 405 }
 406 #endif
 407 
 408 #endif  /* _SYS_BUF_H */