/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2017 by Delphix. All rights reserved.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/ksynch.h>
#include <sys/kmem.h>
#include <sys/stat.h>
#include <sys/buf.h>
#include <sys/open.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/cmn_err.h>
#include <sys/errno.h>
#include <sys/ddi.h>

#include <sys/nsc_thread.h>
#include <sys/nsctl/nsctl.h>

#include <sys/sdt.h>              /* dtrace is S10 or later */

#include <vm/seg_kmem.h>
#include "sd_bcache.h"
#include "sd_trace.h"
#include "sd_io.h"
#include "sd_iob.h"
#include "sd_misc.h"
#if defined(_SD_DEBUG)                  /* simulate disk errors */
#include "sd_tdaemon.h"
#endif

#ifndef DS_DDICT
extern uintptr_t kobj_getsymvalue(char *, int); /* DDI violation */
#endif

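/*
 * sdbc_do_page gates the pagelist i/o code paths below; it defaults to
 * off, so bufs are normally built with kernel virtual addresses via
 * getrbuf() (see _sd_get_iobuf(), _sd_pack_pages() and sd_add_fba()).
 */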
#define DO_PAGE_LIST    sdbc_do_page    /* enable pagelist code */

int sdbc_do_page = 0;

#define SGIO_MAX 254

static kmutex_t sdbc_bio_mutex;
static int sdbc_bio_count;

static unsigned long page_size, page_offset_mask;

#ifdef _SD_BIO_STATS
static int __start_io_count = 0;
#endif /* _SD_BIO_STATS */

/*
 * Forward declare all statics that are used before being defined to
 * enforce parameter checking.  Also forward-declare all functions that
 * have 64-bit argument types to enforce correct parameter checking.
 *
 * Some (if not all) of these could be removed if the code were reordered.
 */

static int _sd_sync_ea(struct buf *, iob_hook_t *);
static int _sd_async_ea(struct buf *, iob_hook_t *);
static void _sd_pack_pages(struct buf *bp, struct buf *list, sd_addr_t *addr,
    nsc_off_t offset, nsc_size_t size);
static void _sd_pack_pages_nopageio(struct buf *bp, struct buf *list,
    sd_addr_t *addr, nsc_off_t offset, nsc_size_t size);
static void _sd_setup_iob(struct buf *bp, dev_t dev, nsc_off_t pos, int flag);

#ifdef  DEBUG
static int _sdbc_ioj_lookup(dev_t);
static void _sdbc_ioj_clear_err(int);
#endif

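/*
 * Crude write statistics: SD_WRITES_LEN is a histogram of write
 * lengths in 32KB buckets (see sd_start_io()).
 */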
static int SD_WRITES_TOT = 0;
static int SD_WRITES_LEN[100];

_sd_buf_list_t _sd_buflist;

/*
 * _sd_add_vm_to_bp_plist - add the page corresponding to the
 * virtual address "v" (kernel virtual address) to the pagelist linked
 * to buffer "bp".
 *
 * The virtual address "v" is "known" to be allocated by segkmem
 * and we can look up the page by using the segkmem vnode kvp.
 * This violates the DDI/DKI but is workable for now anyway.
 */
static void
_sd_add_vm_to_bp_plist(struct buf *bp, unsigned char *v)
{
        page_t   *pp;
        page_t   *one_pg = NULL;

        pp = page_find(&kvp, (u_offset_t)((uintptr_t)v & ~page_offset_mask));
        if (!pp) {
                cmn_err(CE_PANIC,
                    "_sd_add_vm_to_bp_plist: couldn't find page for 0x%p",
                    (void *)v);
        }

        page_add(&one_pg, pp);
        page_list_concat(&(bp->b_pages), &one_pg);
}

#ifdef _SD_BIO_STATS
static int
_sd_count_pages(page_t *pp)
{
        int cnt = 0;
        page_t *pp1;

        if (pp == NULL)
                return (cnt);

        for (cnt = 1, pp1 = pp->p_next; pp != pp1; cnt++, pp1 = pp1->p_next)
                ;

        return (cnt);
}
#endif /* _SD_BIO_STATS */

/*
 * _sdbc_iobuf_load - load time initialization of io bufs structures.
 *
 * RETURNS:
 *      0  - success.
 *      -1 - failure.
 *
 * USAGE:
 *      This routine initializes load time buf structures.
 *      Should be called when the cache is loaded.
 */

int
_sdbc_iobuf_load(void)
{
        mutex_init(&sdbc_bio_mutex, NULL, MUTEX_DRIVER, NULL);

        /*
         * HACK add a ref to kvp, to prevent VN_RELE on it from panicking
         * the system
         */
        VN_HOLD(&kvp);

        return (0);
}

/*
 * _sdbc_iobuf_unload - unload time cleanup of io buf structures.
 *
 * USAGE:
 *      This routine removes load time buf structures.
 *      Should be called when the cache is unloaded.
 */
void
_sdbc_iobuf_unload(void)
{
        mutex_enter(&kvp.v_lock);
        ASSERT(kvp.v_count == 1);
        VN_RELE_LOCKED(&kvp);
        mutex_exit(&kvp.v_lock);

        mutex_destroy(&sdbc_bio_mutex);
        bzero(&_sd_buflist, sizeof (_sd_buf_list_t));
}

/*
 * _sdbc_iobuf_configure - configure a list of io bufs for later use.
 *
 * ARGUMENTS:
 *      num - number of buffers (from the configuration file).
 *
 * RETURNS:
 *      0  - success.
 *      <0 - failure.
 *
 * USAGE:
 *      This routine configures the buf structures for io.
 *      Should be called when the cache is configured.
 */

int
_sdbc_iobuf_configure(int num)
{
        int i;
        _sd_buf_list_t *buflist;
        iob_hook_t *hook;
        char symbol_name[32];

        if (!num || (num > _SD_DEFAULT_IOBUFS))
                num = _SD_DEFAULT_IOBUFS;

        if ((_sd_buflist.hooks = (iob_hook_t *)nsc_kmem_zalloc(
            num * sizeof (iob_hook_t), KM_SLEEP, sdbc_iobuf_mem)) == NULL) {
                return (-1);
        }

        buflist = &_sd_buflist;
        buflist->bl_init_count = num;
        buflist->bl_hooks_avail = num;
        buflist->bl_hook_lowmark = num;
        hook = buflist->hooks;
        buflist->hook_head = hook;
        for (i = 0; i < num; i++, hook++) {
                cv_init(&hook->wait, NULL, CV_DRIVER, NULL);
                (void) sprintf(symbol_name, "sd_iob_dcb%d", i);
                hook->iob_drv_iodone = (dcb_t)kobj_getsymvalue(symbol_name, 0);
                if (!hook->iob_drv_iodone) {
                        return (-2);
                }
                hook->next_hook = hook + 1;
        }
        (hook - 1)->next_hook = NULL;

        for (i = 0; i < MAX_HOOK_LOCKS; i++)
                mutex_init(&_sd_buflist.hook_locks[i], NULL, MUTEX_DRIVER,
                    NULL);

        cv_init(&_sd_buflist.hook_wait, NULL, CV_DRIVER, NULL);
        _sd_buflist.hook_waiters = 0;

        sdbc_bio_count = 0;
        SD_WRITES_TOT = 0;
        bzero(SD_WRITES_LEN, sizeof (SD_WRITES_LEN));

        /* pagelist i/o pages must be done in cache_init */

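        /* ptob(1) converts one page to bytes, i.e. the system page size */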
        page_size = ptob(1);
        page_offset_mask = page_size - 1;

        return (0);
}

/*
 * _sdbc_iobuf_deconfigure - release all memory allocated for buf list.
 *
 * ARGUMENTS:
 *      None.
 *
 * RETURNS:
 *      Nothing.
 */
void
_sdbc_iobuf_deconfigure(void)
{
        ushort_t i;

        if (_sd_buflist.hooks) {
                for (i = 0; i < _sd_buflist.bl_init_count; i++) {
                        cv_destroy(&_sd_buflist.hooks[i].wait);
                }
                cv_destroy(&_sd_buflist.hook_wait);
                nsc_kmem_free(_sd_buflist.hooks,
                    _sd_buflist.bl_init_count * sizeof (iob_hook_t));
                for (i = 0; i < MAX_HOOK_LOCKS; i++) {
                        mutex_destroy(&_sd_buflist.hook_locks[i]);
                }
        }

        _sd_buflist.hooks = NULL;

#ifdef DEBUG
        _sdbc_ioj_clear_err(-1); /* clear any injected i/o errors */
        _sdbc_ioj_set_dev(-1, 0); /* clear dev entries */
#endif
}

/*
 * _sd_pending_iobuf()
 *
 * Return the number of I/O bufs outstanding.
 */
int
_sd_pending_iobuf(void)
{
        return (sdbc_bio_count);
}

/*
 * _sd_get_iobuf - allocate a buf.
 *
 * ARGUMENTS:
 *      None.
 *
 * RETURNS:
 *      NULL - failure.
 *      buf ptr otherwise.
 *
 * ASSUMPTIONS - process could block if we run out.
 */
/*ARGSUSED*/
static struct buf *
_sd_get_iobuf(int num_bdl)
{
        struct buf *bp;

        /* Get a buffer, ready for page list i/o */

        if (DO_PAGE_LIST)
                bp = pageio_setup(NULL, 0, &kvp, 0);
        else
                bp = getrbuf(KM_SLEEP);

        if (bp == NULL)
                return (NULL);
        mutex_enter(&sdbc_bio_mutex);
        sdbc_bio_count++;
        mutex_exit(&sdbc_bio_mutex);
        return (bp);
}

/*
 * _sd_put_iobuf - put a buf back in the freelist.
 *
 * ARGUMENTS:
 *      bp - buf pointer.
 *
 * RETURNS:
 *      Nothing.
 */
static void
_sd_put_iobuf(struct buf *bp)
{
        mutex_enter(&sdbc_bio_mutex);
        sdbc_bio_count--;
        mutex_exit(&sdbc_bio_mutex);
        if (DO_PAGE_LIST)
                pageio_done(bp);
        else
                freerbuf(bp);
}

/* use for ORing only */
#define B_KERNBUF 0
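/*
 * NOTE: B_KERNBUF appears to exist only on older releases (see the
 * b_flags notes in _sd_setup_iob() below), so it is defined as 0 here
 * purely so the OR expressions that follow remain harmless.
 */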

static void
_sd_setup_iob(struct buf *bp, dev_t dev, nsc_off_t pos, int flag)
{
        bp->b_pages = NULL;
        bp->b_un.b_addr = 0;

        flag &= (B_READ | B_WRITE);

        /*
         * if pagelist i/o, _sd_get_iobuf()/pageio_setup() has already
         * set b_flags to
         * B_KERNBUF | B_PAGEIO | B_NOCACHE | B_BUSY (sol 6,7,8)
         * or
         * B_PAGEIO | B_NOCACHE | B_BUSY (sol 9)
         */

        bp->b_flags |= B_KERNBUF | B_BUSY | flag;

        bp->b_error = 0;

        bp->b_forw = NULL;
        bp->b_back = NULL;

        bp->b_lblkno = (diskaddr_t)pos;
        bp->b_bufsize = 0;
        bp->b_resid = 0;
        bp->b_proc = NULL;
        bp->b_edev = dev;
}

/*
 * _sd_get_hook - get an iob hook from the free list.
 *
 * ARGUMENTS:
 *      none
 *
 * RETURNS:
 *      the newly allocated iob_hook.
 */
static iob_hook_t *
_sd_get_hook(void)
{
        iob_hook_t *ret;

        mutex_enter(&sdbc_bio_mutex);

retry:
        ret = _sd_buflist.hook_head;
        if (ret)
                _sd_buflist.hook_head = ret->next_hook;
        else {
                ++_sd_buflist.hook_waiters;
                if (_sd_buflist.max_hook_waiters < _sd_buflist.hook_waiters)
                        _sd_buflist.max_hook_waiters = _sd_buflist.hook_waiters;
                cv_wait(&_sd_buflist.hook_wait, &sdbc_bio_mutex);
                --_sd_buflist.hook_waiters;
                goto retry;
        }

        if (_sd_buflist.bl_hook_lowmark > --_sd_buflist.bl_hooks_avail)
                _sd_buflist.bl_hook_lowmark = _sd_buflist.bl_hooks_avail;

        mutex_exit(&sdbc_bio_mutex);

        ret->skipped = 0;
        ret->count = 0;

#ifdef _SD_BIO_STATS
        ret->PAGE_IO = 0;
        ret->NORM_IO = 0;
        ret->NORM_IO_SIZE = 0;
        ret->SKIP_IO = 0;
        ret->PAGE_COMBINED = 0;
#endif /* _SD_BIO_STATS */

        return (ret);
}

/*
 * _sd_put_hook - put an iob hook back on the free list.
 *
 * ARGUMENTS:
 *      hook - an iob_hook to be returned to the freelist.
 */
static void
_sd_put_hook(iob_hook_t *hook)
{
        mutex_enter(&sdbc_bio_mutex);

        if (_sd_buflist.hook_waiters) {
                cv_signal(&_sd_buflist.hook_wait);
        }
        hook->next_hook = _sd_buflist.hook_head;
        _sd_buflist.hook_head = hook;

        ++_sd_buflist.bl_hooks_avail;

        mutex_exit(&sdbc_bio_mutex);
}

/*
 * _sd_extend_iob - the i/o block we are handling needs a new struct buf to
 *    describe the next hunk of i/o. Get a new struct buf and initialize it
 *    based on the state in the struct buf we are passed as an arg.
 *
 * ARGUMENTS:
 *    head_bp - a buffer header in the current i/o block we are handling.
 *              (generally the initial header but in fact could be any
 *               of the ones [if any] that were chained to the initial
 *               one).
 */
static struct buf *
_sd_extend_iob(struct buf *head_bp)
{
        struct buf *bp;
        iob_hook_t *hook = (iob_hook_t *)head_bp->b_private;

        if (!(bp = _sd_get_iobuf(0)))
                return (NULL);

        bp->b_pages = NULL;
        bp->b_un.b_addr = 0;

        bp->b_flags |= (head_bp->b_flags & (B_READ | B_WRITE));

        if (!DO_PAGE_LIST)
                bp->b_flags |= B_KERNBUF | B_BUSY;

        bp->b_error = 0;

        /*
         * b_forw/b_back will form a doubly linked list of all the buffers
         * associated with this block of i/o.
         * hook->tail points to the last buffer in the chain.
         */
        bp->b_forw = NULL;
        bp->b_back = hook->tail;
        hook->tail->b_forw = bp;
        hook->tail = bp;
        hook->count++;

        ASSERT(BLK_FBA_OFF(hook->size) == 0);

        bp->b_lblkno = (diskaddr_t)hook->start_fba +
            (diskaddr_t)FBA_NUM(hook->size);

        bp->b_bufsize = 0;
        bp->b_resid = 0;
        bp->b_proc = NULL;
        bp->b_edev = head_bp->b_edev;

        bp->b_iodone = NULL; /* for now */
        bp->b_private = hook;

        return (bp);
}

/*
 * sd_alloc_iob - start processing a block of i/o. This allocates an initial
 *      buffer header for describing the i/o and an iob_hook for collecting
 *      information about all the i/o requests added to this buffer.
 *
 * ARGUMENTS:
 *      dev - the device all the i/o is destined for.
 *      fba_pos - the initial disk block to read.
 *      blks - ignored.
 *      flag - signal whether this is a read or write request.
 *
 * RETURNS:
 *      pointer to free struct buf which will be used to describe i/o request.
 */
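/*
 * Illustrative sketch (not compiled in) of how these interfaces fit
 * together for a synchronous read; "dev", "addr" (an sd_addr_t) and
 * "len" are placeholders:
 *
 *      bp = sd_alloc_iob(dev, fba_pos, 0, B_READ);
 *      sd_add_fba(bp, &addr, 0, len);
 *      err = sd_start_io(bp, NULL, NULL, NULL);
 *
 * With a NULL callback, sd_start_io() sleeps until the whole chain
 * completes and returns NSC_DONE or the first error; an async caller
 * passes a callback fn and arg instead and gets NSC_PENDING back
 * immediately (see _sd_async_ea()).
 */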
/* ARGSUSED */
struct buf *
sd_alloc_iob(dev_t dev, nsc_off_t fba_pos, int blks, int flag)
{
        struct buf *bp;
        iob_hook_t *hook;

        if (!(bp = _sd_get_iobuf(0)))
                return (NULL);

        _sd_setup_iob(bp, dev, fba_pos, flag);

        bp->b_iodone = NULL; /* for now */
        hook = _sd_get_hook();
        if (!hook) {
                /* can't see how this could happen */
                _sd_put_iobuf(bp);
                return (NULL);
        }

        /*
         * pick an arbitrary lock: hash the hook's address (discarding
         * the low-order bits) into the lock array.
         */
        hook->lockp = &_sd_buflist.hook_locks[((long)hook >> 9) &
            (MAX_HOOK_LOCKS - 1)];
        hook->start_fba = fba_pos;
        hook->last_fba = fba_pos;
        hook->size = 0;
        hook->tail = bp;
        hook->chain = bp;
        hook->count = 1;
        hook->error = 0;
        bp->b_private = hook;

        return (bp);
}

/*
 * _sd_pack_pages - produce i/o requests that will perform the type of i/o
 *      described by bp (READ/WRITE). It attempts to tack the i/o onto the
 *      buf pointed to by list to minimize the number of bufs required.
 *
 * ARGUMENTS:
 *  bp - is the i/o description i.e. head
 *  list - is where to start adding this i/o request (null if we should extend)
 *  addr - address describing where the data is.
 *  offset - offset from addr where data begins
 *  size - size of the i/o request.
 */
static void
_sd_pack_pages(struct buf *bp, struct buf *list, sd_addr_t *addr,
    nsc_off_t offset, nsc_size_t size)
{
        uintptr_t start_addr, end_addr;
        int page_end_aligned;
#ifdef _SD_BIO_STATS
        iob_hook_t *hook = (iob_hook_t *)bp->b_private;
        struct buf *orig_list = list;
#endif /* _SD_BIO_STATS */

        start_addr = (uintptr_t)addr->sa_virt + offset;
        end_addr = start_addr + size;

        page_end_aligned = !(end_addr & page_offset_mask);

        if (!list && !(list = _sd_extend_iob(bp))) {
                /*
                 * we're hosed since we have no error return...
                 * though we could ignore stuff from here on out
                 * and return ENOMEM when we get to sd_start_io.
                 * This will do for now.
                 */
                cmn_err(CE_PANIC, "_sd_pack_pages: couldn't extend iob");
        }

        /*
         * We only want to do pagelist i/o if we end on a page boundary.
         * If we don't end on a page boundary we won't combine with the
         * next request and so we may as well do it as normal as it
         * will only use one buffer.
         */

        if (DO_PAGE_LIST && page_end_aligned) {
                if (start_addr & page_offset_mask) {
                        /*
                         * handle the partial page
                         */
                        if (list->b_bufsize) {
                                if (!(list = _sd_extend_iob(bp))) {
                                        /*
                                         * we're hosed since we have no error
                                         * return though we could ignore stuff
                                         * from here on out and return ENOMEM
                                         * when we get to sd_start_io.
                                         * This will do for now.
                                         */
                                        cmn_err(CE_PANIC,
                                            "_sd_pack_pages: couldn't "
                                            "extend iob");
                                }
                        }
#ifdef _SD_BIO_STATS
                        hook->PAGE_IO++;
#endif /* _SD_BIO_STATS */
                        _sd_add_vm_to_bp_plist(list,
                            (unsigned char *)start_addr);
                        list->b_bufsize = page_size -
                            (start_addr & page_offset_mask);
                        list->b_un.b_addr = (caddr_t)
                            (start_addr & page_offset_mask);
                        size -= list->b_bufsize;
                        start_addr += list->b_bufsize;
                }
                /*
                 * Now fill with all the full pages remaining.
                 */
                for (; size > 0; size -= page_size) {
#ifdef _SD_BIO_STATS
                        hook->PAGE_IO++;
#endif /* _SD_BIO_STATS */

                        _sd_add_vm_to_bp_plist(list,
                            (unsigned char *)start_addr);
                        start_addr += page_size;
                        list->b_bufsize += page_size;
#ifdef _SD_BIO_STATS
                        if (list == orig_list)
                                hook->PAGE_COMBINED++;
#endif /* _SD_BIO_STATS */
                }
                if (size)
                        cmn_err(CE_PANIC, "_sd_pack_pages: bad size: %"
                            NSC_SZFMT, size);
        } else {
                /*
                 * Wasn't worth it as pagelist i/o, do as normal
                 */
                if (list->b_bufsize && !(list = _sd_extend_iob(bp))) {
                        /*
                         * we're hosed since we have no error return...
                         * though we could ignore stuff from here on out
                         * and return ENOMEM when we get to sd_start_io.
                         * This will do for now.
                         */
                        cmn_err(CE_PANIC,
                            "_sd_pack_pages: couldn't extend iob");
                }

                /* kernel virtual */
                list->b_flags &= ~(B_PHYS | B_PAGEIO);
                list->b_un.b_addr = (caddr_t)start_addr;
#ifdef _SD_BIO_STATS
                hook->NORM_IO++;
                hook->NORM_IO_SIZE += size;
#endif /* _SD_BIO_STATS */
                list->b_bufsize = (size_t)size;
        }
}

/*
 * _sd_pack_pages_nopageio - performs the same function as _sd_pack_pages()
 * when not doing pageio.
 */
static void
_sd_pack_pages_nopageio(struct buf *bp, struct buf *list, sd_addr_t *addr,
    nsc_off_t offset, nsc_size_t size)
{
        uintptr_t start_addr;
#ifdef _SD_BIO_STATS
        iob_hook_t *hook = (iob_hook_t *)bp->b_private;
        struct buf *orig_list = list;
#endif /* _SD_BIO_STATS */

        start_addr = (uintptr_t)addr->sa_virt + offset;

        if (!list && !(list = _sd_extend_iob(bp))) {
                /*
                 * we're hosed since we have no error return...
                 * though we could ignore stuff from here on out
                 * and return ENOMEM when we get to sd_start_io.
                 * This will do for now.
                 */
                cmn_err(CE_PANIC, "_sd_pack_pages_nopageio: couldn't "
                    "extend iob");
        }

        if (list->b_bufsize &&
            (start_addr == (uintptr_t)(list->b_un.b_addr + list->b_bufsize))) {
                /* contiguous */
                list->b_bufsize += (size_t)size;
        } else {
                /*
                 * not contiguous mem (extend) or first buffer (bufsize == 0).
                 */
                if (list->b_bufsize && !(list = _sd_extend_iob(bp))) {
                        /*
                         * we're hosed since we have no error return...
                         * though we could ignore stuff from here on out
                         * and return ENOMEM when we get to sd_start_io.
                         * This will do for now.
                         */
                        cmn_err(CE_PANIC, "_sd_pack_pages_nopageio: couldn't "
                            "extend iob");
                }
                list->b_un.b_addr = (caddr_t)start_addr;
                list->b_bufsize = (size_t)size;
        }

#ifdef _SD_BIO_STATS
        hook->NORM_IO++;
        hook->NORM_IO_SIZE += size;
#endif /* _SD_BIO_STATS */
}

/*
 * sd_add_fba - add an i/o request to the block of i/o described by bp.
 *      We try to combine this request with the previous request. In
 *      addition we try to do the i/o as pagelist i/o if it satisfies
 *      the restrictions for it. If the i/o request can't be combined
 *      we extend the i/o description with a new buffer header and add
 *      it to the chain headed by bp.
 *
 * ARGUMENTS:
 *      bp - the struct buf describing the block i/o we are collecting.
 *      addr - description of the address the data will be read from or
 *             written to.
 *             A NULL indicates that this i/o request doesn't need to actually
 *             happen. Used to mark reads when the fba is already in cache and
 *             dirty.
 *
 *      fba_pos - offset from address in addr where the i/o is to start.
 *
 *      fba_len - number of consecutive fbas to transfer.
 *
 *  NOTE: It is assumed that the memory is physically contiguous but may span
 *  multiple pages (should a cache block be larger than a page).
 */
void
sd_add_fba(struct buf *bp, sd_addr_t *addr, nsc_off_t fba_pos,
    nsc_size_t fba_len)
{
        nsc_off_t offset;
        nsc_size_t size;
        iob_hook_t *hook = (iob_hook_t *)bp->b_private;

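        /* FBA_SIZE() converts FBA (512-byte block) units into bytes */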
        size = FBA_SIZE(fba_len);
        offset = FBA_SIZE(fba_pos);

        if (addr) {
                /*
                 * See if this can be combined with previous request(s)
                 */
                if (!bp->b_bufsize) {
                        if (DO_PAGE_LIST)
                                _sd_pack_pages(bp, bp, addr, offset, size);
                        else
                                _sd_pack_pages_nopageio(bp, bp, addr, offset,
                                    size);
                } else {
                        if (DO_PAGE_LIST) {
                                if (hook->tail->b_flags & B_PAGEIO) {
                                        /*
                                         * Last buffer was a pagelist. Unless a
                                         * skip was detected the last request
                                         * ended on a page boundary. If this
                                         * one starts on one we combine the
                                         * best we can.
                                         */
                                        if (hook->skipped)
                                                _sd_pack_pages(bp, NULL, addr,
                                                    offset, size);
                                        else
                                                _sd_pack_pages(bp, hook->tail,
                                                    addr, offset, size);
                                } else {
                                        /*
                                         * Last buffer was vanilla i/o or worse
                                         * (sd_add_mem)
                                         */
                                        _sd_pack_pages(bp, NULL, addr, offset,
                                            size);
                                }
                        } else {
                                if (hook->skipped)
                                        _sd_pack_pages_nopageio(bp, NULL,
                                            addr, offset, size);
                                else
                                        _sd_pack_pages_nopageio(bp,
                                            hook->tail, addr, offset, size);
                        }
                }
                hook->skipped = 0;
        } else {
                /* Must be a read of a dirty block we want to discard */

                ASSERT(bp->b_flags & B_READ);
#ifdef _SD_BIO_STATS
                hook->SKIP_IO++;
#endif /* _SD_BIO_STATS */
                hook->skipped = 1;
                if (!bp->b_bufsize)
                        bp->b_lblkno += fba_len;
        }
        hook->size += size;
}

/*
 * sd_add_mem - add an i/o request to the block of i/o described by bp.
 *      The memory target for this i/o may span multiple pages and may
 *      not be physically contiguous.
 *      Also, the len might not be a multiple of an fba.
 *
 * ARGUMENTS:
 *      bp - the struct buf describing the block i/o we are collecting.
 *
 *      buf - target of this i/o request.
 *
 *      len - number of bytes to transfer.
 */
void
sd_add_mem(struct buf *bp, char *buf, nsc_size_t len)
{
        nsc_size_t n;
        uintptr_t start;
        iob_hook_t *hook = (iob_hook_t *)bp->b_private;

        start = (uintptr_t)buf & page_offset_mask;

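        /*
         * Walk the buffer one page at a time; "start" is the byte offset
         * into the first page and zero for every subsequent page.
         */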
        for (; len > 0; buf += n, len -= n, start = 0) {
                n = min((nsc_size_t)len, (nsc_size_t)(page_size - start));
                /*
                 * i/o size must be multiple of an FBA since we can't
                 * count on lower level drivers to understand b_offset
                 */
                if (BLK_FBA_OFF(n) != 0) {
                        cmn_err(CE_WARN,
                            "!sdbc(sd_add_mem) i/o request not FBA sized (%"
                            NSC_SZFMT ")", n);
                }

                if (!bp->b_bufsize) {
                        /* first request */
                        bp->b_flags &= ~(B_PHYS | B_PAGEIO);
                        bp->b_un.b_addr = buf;
                        bp->b_bufsize = (size_t)n;
                } else {
                        struct buf *new_bp;

                        if (!(new_bp = _sd_extend_iob(bp))) {
                                /* we're hosed */
                                cmn_err(CE_PANIC,
                                    "sd_add_mem: couldn't extend iob");
                        }
                        new_bp->b_flags &= ~(B_PHYS | B_PAGEIO);
                        new_bp->b_un.b_addr = buf;
                        new_bp->b_bufsize = (size_t)n;
                }
                hook->size += n;
        }
}

/*
 * sd_start_io - start all the i/o needed to satisfy the i/o request described
 *      by bp. If supplied a non-NULL fn, this is an async request: we
 *      return NSC_PENDING and call fn when all the i/o completes.
 *      Otherwise this is a synchronous request and we sleep until all the
 *      i/o is complete. If any buffer in the chain gets an error we return
 *      the first error we see (once all the i/o is complete).
 *
 * ARGUMENTS:
 *      bp - the struct buf describing the block i/o we are collecting.
 *
 *      strategy - strategy function to call if known by the user, or NULL.
 *
 *      fn - user's callback function. NULL implies synchronous request.
 *
 *      arg - an argument passed to user's callback function.
 */
int
sd_start_io(struct buf *bp, strategy_fn_t strategy, sdbc_ea_fn_t fn,
    blind_t arg)
{
        int err;
        iob_hook_t *hook = (iob_hook_t *)bp->b_private;
        struct buf *bp_next;
        int (*ea_fn)(struct buf *, iob_hook_t *);
#ifdef _SD_BIO_STATS
        static int total_pages, total_pages_combined, total_norm;
        static int total_norm_combined, total_skipped;
        static nsc_size_t total_norm_size;

        static int total_bufs;
        static int total_xpages_w, total_ypages_w;
        static int total_xpages_r, total_ypages_r;
        static int max_run_r, max_run_w;
#endif /* _SD_BIO_STATS */

        hook->func = fn;
        hook->param = arg;
        if (fn != NULL)
                ea_fn = _sd_async_ea;
        else
                ea_fn = _sd_sync_ea;

        hook->iob_hook_iodone = ea_fn;

#ifdef _SD_BIO_STATS
        __start_io_count++;
        total_pages += hook->PAGE_IO;
        total_pages_combined += hook->PAGE_COMBINED;
        total_norm += hook->NORM_IO;
        total_norm_size += hook->NORM_IO_SIZE;
        total_skipped += hook->SKIP_IO;
#endif /* _SD_BIO_STATS */

        for (; bp; bp = bp_next) {
                DTRACE_PROBE4(sd_start_io_bufs, struct buf *, bp,
                    long, bp->b_bufsize, int, bp->b_flags,
                    iob_hook_t *, hook);

                bp_next = bp->b_forw;
                if (!(bp->b_flags & B_READ)) {
                        SD_WRITES_TOT++;
                        SD_WRITES_LEN[(bp->b_bufsize / 32768) %
                            (sizeof (SD_WRITES_LEN) / sizeof (int))]++;
                }
                bp->b_iodone = hook->iob_drv_iodone;
                bp->b_bcount = bp->b_bufsize;
                bp->b_forw = NULL;
                bp->b_back = NULL;
                bp->b_private = NULL;

#ifdef _SD_BIO_STATS
                total_bufs++;
                if (bp->b_flags & B_PAGEIO) {
                        int i;

                        i = _sd_count_pages(bp->b_pages);
                        if (bp->b_flags & B_READ) {
                                if (i > max_run_r)
                                        max_run_r = i;
                                total_xpages_r += i;
                                total_ypages_r++;
                        } else {
                                if (i > max_run_w)
                                        max_run_w = i;
                                total_xpages_w += i;
                                total_ypages_w++;
                        }
                }
#endif /* _SD_BIO_STATS */

                /*
                 * It's possible for us to be told to read a dirty block
                 * where all the i/o can go away (e.g. read one fba, it's
                 * in cache and dirty) so we really have nothing to do but
                 * say we're done.
                 */
                if (bp->b_bcount) {
                        if (!strategy) {
                                strategy =
                                    nsc_get_strategy(getmajor(bp->b_edev));
                        }

                        if (!strategy) {
                                bp->b_flags |= B_ERROR;
                                bp->b_error = ENXIO;
                                (*bp->b_iodone)(bp);
                        } else
#ifdef DEBUG
                        /* inject i/o error for testing */
                        if ((bp->b_error =
                            _sdbc_ioj_lookup(bp->b_edev)) != 0) {
                                bp->b_flags |= B_ERROR;
                                (*bp->b_iodone)(bp);
                        } else
#endif
                        {
                                (*strategy)(bp);
                        }
                } else {
                        (*bp->b_iodone)(bp);
                }
        }

#ifdef _SD_BIO_STATS
        if (__start_io_count == 2000) {
                __start_io_count = 0;
                cmn_err(CE_WARN,
                    "!sdbc(sd_start_io) t_bufs %d pages %d "
                    "combined %d norm %d norm_size %" NSC_SZFMT " skipped %d",
                    total_bufs,
                    total_pages, total_pages_combined, total_norm,
                    total_norm_size, total_skipped);

                total_bufs = 0;
                total_pages = 0;
                total_pages_combined = 0;
                total_norm = 0;
                total_norm_combined = 0;
                total_skipped = 0;
                total_norm_size = 0;

                cmn_err(CE_WARN,
                    "!sdbc(sd_start_io)(r) max_run %d, total_xp %d total yp %d",
                    max_run_r, total_xpages_r, total_ypages_r);

                total_xpages_r = 0;
                total_ypages_r = 0;
                max_run_r = 0;

                cmn_err(CE_WARN,
                    "!sdbc(sd_start_io)(w) max_run %d, total_xp %d total yp %d",
                    max_run_w, total_xpages_w, total_ypages_w);

                total_xpages_w = 0;
                total_ypages_w = 0;
                max_run_w = 0;
        }
#endif /* _SD_BIO_STATS */

        if (ea_fn == _sd_async_ea) {
                DTRACE_PROBE(sd_start_io_end);

                return (NSC_PENDING);
        }

        mutex_enter(hook->lockp);

        while (hook->count) {
                cv_wait(&hook->wait, hook->lockp);
        }
        mutex_exit(hook->lockp);

        err = hook->error ? hook->error : NSC_DONE;
        bp = hook->tail;
        _sd_put_hook(hook);
        _sd_put_iobuf(bp);

        return (err);
}

/*
 * _sd_sync_ea - called when a single i/o operation is complete. If this
 *      is the last outstanding i/o we wake up the sleeper.
 *      If this i/o had an error and it was the first error seen, we store
 *      the error result in the iob_hook.
 *
 * ARGUMENTS:
 *      bp - the struct buf describing the block i/o that just completed.
 *
 * Comments:
 *      This routine is called at interrupt level when the io is done.
 */

static int
_sd_sync_ea(struct buf *bp, iob_hook_t *hook)
{
        int error;
        int done;

        /*
         * We get called for each buf that completes. When they are all
         * done, we wake up the waiter.
         */
        error = (bp->b_flags & B_ERROR) ? bp->b_error : 0;

        mutex_enter(hook->lockp);

        if (!hook->error)
                hook->error = error;

        done = !(--hook->count);
        if (done) {
                /* remember the last buffer so we can free it later */
                hook->tail = bp;
                cv_signal(&hook->wait);
        }
        mutex_exit(hook->lockp);

        /*
         * let sd_start_io free the final buffer so the hook can be returned
         * first.
         */
        if (!done)
                _sd_put_iobuf(bp);

        return (0);
}

/*
 * _sd_async_ea - End action for async read/write.
 *
 * ARGUMENTS:
 *      bp      - io buf pointer.
 *
 * RETURNS:
 *      0.
 *
 * Comments:
 *      This routine is called at interrupt level when the io is done.
 *      This is only called when the operation is asynchronous.
 */
static int
_sd_async_ea(struct buf *bp, iob_hook_t *hook)
{
        int done, error;

        /*
         * We get called for each buf that completes. When they are all
         * done, we call the requestor's callback function.
         */
        error = (bp->b_flags & B_ERROR) ? bp->b_error : 0;

        mutex_enter(hook->lockp);
        done = !(--hook->count);

        if (!hook->error)
                hook->error = error;

        mutex_exit(hook->lockp);

        bp->b_forw = NULL;
        bp->b_back = NULL;

        if (done) {
                nsc_off_t fba_pos;
                nsc_size_t fba_len;
                sdbc_ea_fn_t fn;
                blind_t arg;

                arg = hook->param;
                fn = hook->func;
                error = hook->error;
#if defined(_SD_DEBUG)                  /* simulate disk errors */
                if (_test_async_fail == bp->b_edev)
                        error = EIO;
#endif

                /* MAKE SURE b_lblkno, b_count never changes!! */
                fba_pos = hook->start_fba;
                fba_len = FBA_LEN(hook->size);

                _sd_put_hook(hook);
                _sd_put_iobuf(bp);
                (*fn)(arg, fba_pos, fba_len, error);
        } else
                _sd_put_iobuf(bp);

        return (0);
}

#ifdef DEBUG
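/*
 * I/o error injection: each entry maps a device to an error code.
 * ioj_cnt i/o requests are allowed through before ioj_err is returned
 * for every subsequent request on that device (see _sdbc_ioj_lookup()).
 */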
typedef struct ioerr_inject_s {
        dev_t ioj_dev;
        int   ioj_err;
        int   ioj_cnt;
} ioerr_inject_t;

static ioerr_inject_t *ioerr_inject_table = NULL;

void
_sdbc_ioj_load(void)
{
        ioerr_inject_table =
            kmem_zalloc(sdbc_max_devs * sizeof (ioerr_inject_t), KM_SLEEP);
}

void
_sdbc_ioj_unload(void)
{
        if (ioerr_inject_table != NULL) {
                kmem_free(ioerr_inject_table,
                    sdbc_max_devs * sizeof (ioerr_inject_t));
                ioerr_inject_table = NULL;
        }
}

static int
_sdbc_ioj_lookup(dev_t dev)
{
        int cd;

        for (cd = 0; cd < sdbc_max_devs; ++cd)
                if (ioerr_inject_table[cd].ioj_dev == dev) {
                        if (ioerr_inject_table[cd].ioj_cnt > 0) {
                                --ioerr_inject_table[cd].ioj_cnt;
                                return (0);
                        } else {
                                return (ioerr_inject_table[cd].ioj_err);
                        }
                }
        return (0);
}

void
_sdbc_ioj_set_dev(int cd, dev_t crdev)
{
        int i;

        if (cd == -1) {  /* all -- used for clearing table on shutdown */
                for (i = 0; i < sdbc_max_devs; ++i) {
                        ioerr_inject_table[i].ioj_dev = crdev;
                }
        } else
                ioerr_inject_table[cd].ioj_dev = crdev; /* assume valid cd */
}

static void
_sdbc_ioj_set_err(int cd, int err, int count)
{
        int i;

        if (cd == -1) {  /* all */
                for (i = 0; i < sdbc_max_devs; ++i) {
                        ioerr_inject_table[i].ioj_err = err;
                        ioerr_inject_table[i].ioj_cnt = count;
                }
        } else {
                ioerr_inject_table[cd].ioj_err = err;
                ioerr_inject_table[cd].ioj_cnt = count;
        }
}

static void
_sdbc_ioj_clear_err(int cd)
{
        _sdbc_ioj_set_err(cd, 0, 0);
}

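/*
 * External entry points (for DEBUG testing) to inject or clear an i/o
 * error on a cache descriptor; cd == -1 applies to all devices.
 */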
int
_sdbc_inject_ioerr(int cd, int ioj_err, int count)
{
        if ((cd < -1) || (cd >= sdbc_max_devs))
                return (EINVAL);

        _sdbc_ioj_set_err(cd, ioj_err, count);

        return (0);
}

int
_sdbc_clear_ioerr(int cd)
{
        if ((cd < -1) || (cd >= sdbc_max_devs))
                return (EINVAL);

        _sdbc_ioj_clear_err(cd);

        return (0);
}
#endif