/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2017 by Delphix. All rights reserved.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/ksynch.h>
#include <sys/kmem.h>
#include <sys/stat.h>
#include <sys/buf.h>
#include <sys/open.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/cmn_err.h>
#include <sys/errno.h>
#include <sys/ddi.h>

#include <sys/nsc_thread.h>
#include <sys/nsctl/nsctl.h>

#include <sys/sdt.h>		/* dtrace is S10 or later */

#include <vm/seg_kmem.h>
#include "sd_bcache.h"
#include "sd_trace.h"
#include "sd_io.h"
#include "sd_iob.h"
#include "sd_misc.h"
#if defined(_SD_DEBUG)		/* simulate disk errors */
#include "sd_tdaemon.h"
#endif

#ifndef DS_DDICT
extern uintptr_t kobj_getsymvalue(char *, int);	/* DDI violation */
#endif

#define	DO_PAGE_LIST	sdbc_do_page	/* enable pagelist code */

int sdbc_do_page = 0;

#define	SGIO_MAX 254

static kmutex_t sdbc_bio_mutex;
static int sdbc_bio_count;

static unsigned long page_size, page_offset_mask;

#ifdef _SD_BIO_STATS
static int __start_io_count = 0;
#endif /* _SD_BIO_STATS */

/*
 * Forward declare all statics that are used before defined to enforce
 * parameter checking. Also forward-declare all functions that have 64-bit
 * argument types to enforce correct parameter checking.
 *
 * Some (if not all) of these could be removed if the code were reordered.
 */

static int _sd_sync_ea(struct buf *, iob_hook_t *);
static int _sd_async_ea(struct buf *, iob_hook_t *);
static void _sd_pack_pages(struct buf *bp, struct buf *list, sd_addr_t *addr,
    nsc_off_t offset, nsc_size_t size);
static void _sd_pack_pages_nopageio(struct buf *bp, struct buf *list,
    sd_addr_t *addr, nsc_off_t offset, nsc_size_t size);
static void _sd_setup_iob(struct buf *bp, dev_t dev, nsc_off_t pos, int flag);

#ifdef DEBUG
static int _sdbc_ioj_lookup(dev_t);
static void _sdbc_ioj_clear_err(int);
#endif

static int SD_WRITES_TOT = 0;
static int SD_WRITES_LEN[100];

_sd_buf_list_t _sd_buflist;

/*
 * _sd_add_vm_to_bp_plist - add the page corresponding to the
 *	virtual address "v" (kernel virtaddr) to the pagelist linked
 *	to buffer "bp".
 *
 *	The virtual address "v" is "known" to be allocated by segkmem
 *	and we can look up the page by using the segkmem vnode kvp.
 *	This violates the DDI/DKI but is workable for now anyway.
 */
static void
_sd_add_vm_to_bp_plist(struct buf *bp, unsigned char *v)
{
	page_t *pp;
	page_t *one_pg = NULL;

	pp = page_find(&kvp, (u_offset_t)((uintptr_t)v & ~page_offset_mask));
	if (!pp) {
		cmn_err(CE_PANIC,
		    "_sd_add_vm_to_bp_plist: couldn't find page for 0x%p",
		    (void *)v);
	}

	page_add(&one_pg, pp);
	page_list_concat(&(bp->b_pages), &one_pg);
}

#ifdef _SD_BIO_STATS
static int
_sd_count_pages(page_t *pp)
{
	int cnt = 0;
	page_t *pp1;

	if (pp == NULL)
		return (cnt);

	for (cnt = 1, pp1 = pp->p_next; pp != pp1; cnt++, pp1 = pp1->p_next)
		;

	return (cnt);
}
#endif /* _SD_BIO_STATS */


/*
 * _sdbc_iobuf_load - load time initialization of io buf structures.
 *
 * RETURNS:
 *	0  - success.
 *	-1 - failure.
 *
 * USAGE:
 *	This routine initializes load time buf structures.
 *	Should be called when the cache is loaded.
 */
int
_sdbc_iobuf_load(void)
{
	mutex_init(&sdbc_bio_mutex, NULL, MUTEX_DRIVER, NULL);

	/*
	 * HACK add a ref to kvp, to prevent VN_RELE on it from panicking
	 * the system
	 */
	VN_HOLD(&kvp);

	return (0);
}

/*
 * _sdbc_iobuf_unload - unload time cleanup of io buf structures.
 *
 * USAGE:
 *	This routine removes load time buf structures.
 *	Should be called when the cache is unloaded.
 */
void
_sdbc_iobuf_unload(void)
{
	mutex_enter(&kvp.v_lock);
	ASSERT(kvp.v_count == 1);
	VN_RELE_LOCKED(&kvp);
	mutex_exit(&kvp.v_lock);

	mutex_destroy(&sdbc_bio_mutex);
	bzero(&_sd_buflist, sizeof (_sd_buf_list_t));
}

/*
 * _sdbc_iobuf_configure - configure a list of io bufs for later use.
 *
 * ARGUMENTS:
 *	num - number of buffers (from the configuration file).
 *
 * RETURNS:
 *	0  - success.
 *	<0 - failure.
 *
 * USAGE:
 *	This routine configures the buf structures for io.
 *	Should be called when the cache is configured.
 */
int
_sdbc_iobuf_configure(int num)
{
	int i;
	_sd_buf_list_t *buflist;
	iob_hook_t *hook;
	char symbol_name[32];

	if (!num || (num > _SD_DEFAULT_IOBUFS))
		num = _SD_DEFAULT_IOBUFS;

	if ((_sd_buflist.hooks = (iob_hook_t *)nsc_kmem_zalloc(
	    num * sizeof (iob_hook_t), KM_SLEEP, sdbc_iobuf_mem)) == NULL) {
		return (-1);
	}

	buflist = &_sd_buflist;
	buflist->bl_init_count = num;
	buflist->bl_hooks_avail = num;
	buflist->bl_hook_lowmark = num;
	hook = buflist->hooks;
	buflist->hook_head = hook;
	for (i = 0; i < num; i++, hook++) {
		cv_init(&hook->wait, NULL, CV_DRIVER, NULL);
		(void) sprintf(symbol_name, "sd_iob_dcb%d", i);
		hook->iob_drv_iodone = (dcb_t)kobj_getsymvalue(symbol_name, 0);
		if (!hook->iob_drv_iodone) {
			return (-2);
		}
		hook->next_hook = hook + 1;
	}
	(hook - 1)->next_hook = NULL;

	for (i = 0; i < MAX_HOOK_LOCKS; i++)
		mutex_init(&_sd_buflist.hook_locks[i], NULL, MUTEX_DRIVER,
		    NULL);

	cv_init(&_sd_buflist.hook_wait, NULL, CV_DRIVER, NULL);
	_sd_buflist.hook_waiters = 0;

	sdbc_bio_count = 0;
	SD_WRITES_TOT = 0;
	bzero(SD_WRITES_LEN, sizeof (SD_WRITES_LEN));

	/* pagelist i/o pages must be done in cache_init */

	page_size = ptob(1);
	page_offset_mask = page_size - 1;

	return (0);
}

/*
 * _sdbc_iobuf_deconfigure - release all memory allocated for buf list.
 *
 * ARGUMENTS:
 *	None.
 *
 * RETURNS:
 *	None.
 */
void
_sdbc_iobuf_deconfigure(void)
{
	ushort_t i;

	if (_sd_buflist.hooks) {
		for (i = 0; i < _sd_buflist.bl_init_count; i++) {
			cv_destroy(&_sd_buflist.hooks[i].wait);
		}
		cv_destroy(&_sd_buflist.hook_wait);
		nsc_kmem_free(_sd_buflist.hooks,
		    _sd_buflist.bl_init_count * sizeof (iob_hook_t));
		for (i = 0; i < MAX_HOOK_LOCKS; i++) {
			mutex_destroy(&_sd_buflist.hook_locks[i]);
		}
	}

	_sd_buflist.hooks = NULL;

#ifdef DEBUG
	{
		void _sdbc_ioj_clear_err(int);
		_sdbc_ioj_clear_err(-1);	/* clear any injected i/o errors */
		_sdbc_ioj_set_dev(-1, 0);	/* clear dev entries */
	}
#endif
}

/*
 * _sd_pending_iobuf()
 *
 * Return the number of I/O bufs outstanding.
 */
int
_sd_pending_iobuf(void)
{
	return (sdbc_bio_count);
}

/*
 * _sd_get_iobuf - allocate a buf.
 *
 * ARGUMENTS:
 *	None.
 *
 * RETURNS:
 *	NULL - failure.
 *	buf ptr otherwise.
 *
 * ASSUMPTIONS - process could block if we run out.
 */
/*ARGSUSED*/
static struct buf *
_sd_get_iobuf(int num_bdl)
{
	struct buf *bp;

	/* Get a buffer, ready for page list i/o */

	if (DO_PAGE_LIST)
		bp = pageio_setup(NULL, 0, &kvp, 0);
	else
		bp = getrbuf(KM_SLEEP);

	if (bp == NULL)
		return (NULL);
	mutex_enter(&sdbc_bio_mutex);
	sdbc_bio_count++;
	mutex_exit(&sdbc_bio_mutex);
	return (bp);
}

/*
 * _sd_put_iobuf - put a buf back in the freelist.
 *
 * ARGUMENTS:
 *	bp - buf pointer.
 *
 * RETURNS:
 *	None.
 */
static void
_sd_put_iobuf(struct buf *bp)
{
	mutex_enter(&sdbc_bio_mutex);
	sdbc_bio_count--;
	mutex_exit(&sdbc_bio_mutex);
	if (DO_PAGE_LIST)
		pageio_done(bp);
	else
		freerbuf(bp);
}


/* use for ORing only */
#define	B_KERNBUF 0

static void
_sd_setup_iob(struct buf *bp, dev_t dev, nsc_off_t pos, int flag)
{
	bp->b_pages = NULL;
	bp->b_un.b_addr = 0;

	flag &= (B_READ | B_WRITE);

	/*
	 * if pagelist i/o, _sd_get_iobuf()/pageio_setup() has already
	 * set b_flags to
	 *	B_KERNBUF | B_PAGEIO | B_NOCACHE | B_BUSY	(sol 6,7,8)
	 * or
	 *	B_PAGEIO | B_NOCACHE | B_BUSY			(sol 9)
	 */

	bp->b_flags |= B_KERNBUF | B_BUSY | flag;

	bp->b_error = 0;

	bp->b_forw = NULL;
	bp->b_back = NULL;

	bp->b_lblkno = (diskaddr_t)pos;
	bp->b_bufsize = 0;
	bp->b_resid = 0;
	bp->b_proc = NULL;
	bp->b_edev = dev;
}


/*
 * _sd_get_hook - get an iob hook from the free list.
 *
 * ARGUMENTS:
 *	none
 *
 * RETURNS:
 *	the newly allocated iob_hook.
 */
static iob_hook_t *
_sd_get_hook(void)
{
	iob_hook_t *ret;

	mutex_enter(&sdbc_bio_mutex);

retry:
	ret = _sd_buflist.hook_head;
	if (ret)
		_sd_buflist.hook_head = ret->next_hook;
	else {
		++_sd_buflist.hook_waiters;
		if (_sd_buflist.max_hook_waiters < _sd_buflist.hook_waiters)
			_sd_buflist.max_hook_waiters =
			    _sd_buflist.hook_waiters;
		cv_wait(&_sd_buflist.hook_wait, &sdbc_bio_mutex);
		--_sd_buflist.hook_waiters;
		goto retry;
	}

	if (_sd_buflist.bl_hook_lowmark > --_sd_buflist.bl_hooks_avail)
		_sd_buflist.bl_hook_lowmark = _sd_buflist.bl_hooks_avail;

	mutex_exit(&sdbc_bio_mutex);
	ret->skipped = 0;

	ret->count = 0;

#ifdef _SD_BIO_STATS
	ret->PAGE_IO = 0;
	ret->NORM_IO = 0;
	ret->NORM_IO_SIZE = 0;
	ret->SKIP_IO = 0;
	ret->PAGE_COMBINED = 0;
#endif /* _SD_BIO_STATS */

	return (ret);
}

/*
 * _sd_put_hook - put an iob hook back on the free list.
 *
 * ARGUMENTS:
 *	hook - an iob_hook to be returned to the freelist.
 */
static void
_sd_put_hook(iob_hook_t *hook)
{
	mutex_enter(&sdbc_bio_mutex);

	if (_sd_buflist.hook_waiters) {
		cv_signal(&_sd_buflist.hook_wait);
	}
	hook->next_hook = _sd_buflist.hook_head;
	_sd_buflist.hook_head = hook;

	++_sd_buflist.bl_hooks_avail;

	mutex_exit(&sdbc_bio_mutex);
}

/*
 * _sd_extend_iob - the i/o block we are handling needs a new struct buf to
 *	describe the next hunk of i/o. Get a new struct buf and initialize it
 *	based on the state in the struct buf we are passed as an arg.
 *
 * ARGUMENTS:
 *	head_bp - a buffer header in the current i/o block we are handling
 *	    (generally the initial header but in fact could be any of the
 *	    ones [if any] that were chained to the initial one).
 */
static struct buf *
_sd_extend_iob(struct buf *head_bp)
{
	struct buf *bp;
	iob_hook_t *hook = (iob_hook_t *)head_bp->b_private;


	if (!(bp = _sd_get_iobuf(0)))
		return (0);

	bp->b_pages = NULL;
	bp->b_un.b_addr = 0;

	bp->b_flags |= (head_bp->b_flags & (B_READ | B_WRITE));

	if (!DO_PAGE_LIST)
		bp->b_flags |= B_KERNBUF | B_BUSY;

	bp->b_error = 0;

	/*
	 * b_forw/b_back will form a doubly linked list of all the buffers
	 * associated with this block of i/o.
	 * hook->tail points to the last buffer in the chain.
	 */
	bp->b_forw = NULL;
	bp->b_back = hook->tail;
	hook->tail->b_forw = bp;
	hook->tail = bp;
	hook->count++;

	ASSERT(BLK_FBA_OFF(hook->size) == 0);

	bp->b_lblkno = (diskaddr_t)hook->start_fba +
	    (diskaddr_t)FBA_NUM(hook->size);

	bp->b_bufsize = 0;
	bp->b_resid = 0;
	bp->b_proc = NULL;
	bp->b_edev = head_bp->b_edev;

	bp->b_iodone = NULL;	/* for now */
	bp->b_private = hook;

	return (bp);
}

/*
 * sd_alloc_iob - start processing a block of i/o. This allocates an initial
 *	buffer header for describing the i/o and an iob_hook for collecting
 *	information about all the i/o requests added to this buffer.
 *
 * ARGUMENTS:
 *	dev - the device all the i/o is destined for.
 *	fba_pos - the initial disk block to read.
 *	blks - ignored.
 *	flag - signal whether this is a read or write request.
 *
 * RETURNS:
 *	pointer to free struct buf which will be used to describe i/o request.
 */
/* ARGSUSED */
struct buf *
sd_alloc_iob(dev_t dev, nsc_off_t fba_pos, int blks, int flag)
{
	struct buf *bp;
	iob_hook_t *hook;

	if (!(bp = _sd_get_iobuf(0)))
		return (0);

	_sd_setup_iob(bp, dev, fba_pos, flag);

	bp->b_iodone = NULL;	/* for now */
	hook = _sd_get_hook();
	if (!hook) {
		/* can't see how this could happen */
		_sd_put_iobuf(bp);
		return (0);
	}

	/*
	 * pick an arbitrary lock
	 */
	hook->lockp = &_sd_buflist.hook_locks[((long)hook >> 9) &
	    (MAX_HOOK_LOCKS - 1)];
	hook->start_fba = fba_pos;
	hook->last_fba = fba_pos;
	hook->size = 0;
	hook->tail = bp;
	hook->chain = bp;
	hook->count = 1;
	hook->error = 0;
	bp->b_private = hook;

	return (bp);
}

/*
 * _sd_pack_pages - produce i/o requests that will perform the type of i/o
 *	described by bp (READ/WRITE). It attempts to tack the i/o onto the
 *	buf pointed to by list to minimize the number of bufs required.
 *
 * ARGUMENTS:
 *	bp - is the i/o description, i.e. head.
 *	list - is where to start adding this i/o request (null if we should
 *	    extend).
 *	addr - address describing where the data is.
 *	offset - offset from addr where data begins.
 *	size - size of the i/o request.
 */
static void
_sd_pack_pages(struct buf *bp, struct buf *list, sd_addr_t *addr,
    nsc_off_t offset, nsc_size_t size)
{
	uintptr_t start_addr, end_addr;
	int page_end_aligned;
#ifdef _SD_BIO_STATS
	iob_hook_t *hook = (iob_hook_t *)bp->b_private;
	struct buf *orig_list = list;
#endif /* _SD_BIO_STATS */

	start_addr = (uintptr_t)addr->sa_virt + offset;
	end_addr = start_addr + size;

	page_end_aligned = !(end_addr & page_offset_mask);

	if (!list && !(list = _sd_extend_iob(bp))) {
		/*
		 * we're hosed since we have no error return...
		 * though we could ignore stuff from here on out
		 * and return ENOMEM when we get to sd_start_io.
		 * This will do for now.
		 */
		cmn_err(CE_PANIC, "_sd_pack_pages: couldn't extend iob");
	}

	/*
	 * We only want to do pagelist i/o if we end on a page boundary.
	 * If we don't end on a page boundary we won't combine with the
	 * next request and so we may as well do it as normal as it
	 * will only use one buffer.
	 */

	if (DO_PAGE_LIST && page_end_aligned) {
		if (start_addr & page_offset_mask) {
			/*
			 * handle the partial page
			 */
			if (list->b_bufsize) {
				if (!(list = _sd_extend_iob(bp))) {
					/*
					 * we're hosed since we have no error
					 * return though we could ignore stuff
					 * from here on out and return ENOMEM
					 * when we get to sd_start_io.
					 * This will do for now.
					 */
					cmn_err(CE_PANIC,
					    "_sd_pack_pages: couldn't extend iob");
				}
			}
#ifdef _SD_BIO_STATS
			hook->PAGE_IO++;
#endif /* _SD_BIO_STATS */
			_sd_add_vm_to_bp_plist(list,
			    (unsigned char *)start_addr);
			list->b_bufsize = page_size -
			    (start_addr & page_offset_mask);
			list->b_un.b_addr = (caddr_t)
			    (start_addr & page_offset_mask);
			size -= list->b_bufsize;
			start_addr += list->b_bufsize;
		}
		/*
		 * Now fill with all the full pages remaining.
		 */
		for (; size > 0; size -= page_size) {
#ifdef _SD_BIO_STATS
			hook->PAGE_IO++;
#endif /* _SD_BIO_STATS */

			_sd_add_vm_to_bp_plist(list,
			    (unsigned char *)start_addr);
			start_addr += page_size;
			list->b_bufsize += page_size;
#ifdef _SD_BIO_STATS
			if (list == orig_list)
				hook->PAGE_COMBINED++;
#endif /* _SD_BIO_STATS */
		}
		if (size)
			cmn_err(CE_PANIC, "_sd_pack_pages: bad size: %"
			    NSC_SZFMT, size);
	} else {
		/*
		 * Wasn't worth it as pagelist i/o, do as normal
		 */
		if (list->b_bufsize && !(list = _sd_extend_iob(bp))) {
			/*
			 * we're hosed since we have no error return...
			 * though we could ignore stuff from here on out
			 * and return ENOMEM when we get to sd_start_io.
			 * This will do for now.
			 */
			cmn_err(CE_PANIC,
			    "_sd_pack_pages: couldn't extend iob");
		}

		/* kernel virtual */
		list->b_flags &= ~(B_PHYS | B_PAGEIO);
		list->b_un.b_addr = (caddr_t)start_addr;
#ifdef _SD_BIO_STATS
		hook->NORM_IO++;
		hook->NORM_IO_SIZE += size;
#endif /* _SD_BIO_STATS */
		list->b_bufsize = (size_t)size;
	}
}

/*
 * perform same function as _sd_pack_pages() when not doing pageio
 */
static void
_sd_pack_pages_nopageio(struct buf *bp, struct buf *list, sd_addr_t *addr,
    nsc_off_t offset, nsc_size_t size)
{
	uintptr_t start_addr;
#ifdef _SD_BIO_STATS
	iob_hook_t *hook = (iob_hook_t *)bp->b_private;
	struct buf *orig_list = list;
#endif /* _SD_BIO_STATS */

	start_addr = (uintptr_t)addr->sa_virt + offset;

	if (!list && !(list = _sd_extend_iob(bp))) {
		/*
		 * we're hosed since we have no error return...
		 * though we could ignore stuff from here on out
		 * and return ENOMEM when we get to sd_start_io.
		 * This will do for now.
		 */
		cmn_err(CE_PANIC, "_sd_pack_pages_nopageio: couldn't "
		    "extend iob");
	}

	if (list->b_bufsize &&
	    (start_addr == (uintptr_t)(list->b_un.b_addr + list->b_bufsize))) {
		/* contiguous */
		list->b_bufsize += (size_t)size;
	} else {
		/*
		 * not contiguous mem (extend) or first buffer (bufsize == 0).
		 */
		if (list->b_bufsize && !(list = _sd_extend_iob(bp))) {
			/*
			 * we're hosed since we have no error return...
			 * though we could ignore stuff from here on out
			 * and return ENOMEM when we get to sd_start_io.
			 * This will do for now.
			 */
			cmn_err(CE_PANIC, "_sd_pack_pages_nopageio: couldn't "
			    "extend iob");
		}
		list->b_un.b_addr = (caddr_t)start_addr;
		list->b_bufsize = (size_t)size;
	}

#ifdef _SD_BIO_STATS
	hook->NORM_IO++;
	hook->NORM_IO_SIZE += size;
#endif /* _SD_BIO_STATS */
}

/*
 * sd_add_fba - add an i/o request to the block of i/o described by bp.
 *	We try and combine this request with the previous request. In
 *	addition we try and do the i/o as PAGELIST_IO if it satisfies
 *	the restrictions for it. If the i/o request can't be combined
 *	we extend the i/o description with a new buffer header and add
 *	it to the chain headed by bp.
 *
 * ARGUMENTS:
 *	bp - the struct buf describing the block i/o we are collecting.
 *	addr - description of the address the data will be read from or
 *	    written to. A NULL indicates that this i/o request doesn't need
 *	    to actually happen. Used to mark reads when the fba is already
 *	    in cache and dirty.
 *	fba_pos - offset from address in addr where the i/o is to start.
 *	fba_len - number of consecutive fbas to transfer.
 *
 * NOTE: It is assumed that the memory is physically contiguous but may span
 *	multiple pages (should a cache block be larger than a page).
 */
void
sd_add_fba(struct buf *bp, sd_addr_t *addr, nsc_off_t fba_pos,
    nsc_size_t fba_len)
{
	nsc_off_t offset;
	nsc_size_t size;
	iob_hook_t *hook = (iob_hook_t *)bp->b_private;

	size = FBA_SIZE(fba_len);
	offset = FBA_SIZE(fba_pos);

	if (addr) {
		/*
		 * See if this can be combined with previous request(s)
		 */
		if (!bp->b_bufsize) {
			if (DO_PAGE_LIST)
				_sd_pack_pages(bp, bp, addr, offset, size);
			else
				_sd_pack_pages_nopageio(bp, bp, addr, offset,
				    size);
		} else {
			if (DO_PAGE_LIST) {
				if (hook->tail->b_flags & B_PAGEIO) {
					/*
					 * Last buffer was a pagelist. Unless a
					 * skip was detected the last request
					 * ended on a page boundary. If this
					 * one starts on one we combine the
					 * best we can.
					 */
					if (hook->skipped)
						_sd_pack_pages(bp, NULL, addr,
						    offset, size);
					else
						_sd_pack_pages(bp, hook->tail,
						    addr, offset, size);
				} else {
					/*
					 * Last buffer was vanilla i/o or worse
					 * (sd_add_mem)
					 */
					_sd_pack_pages(bp, NULL, addr, offset,
					    size);
				}
			} else {
				if (hook->skipped)
					_sd_pack_pages_nopageio(bp, NULL,
					    addr, offset, size);
				else
					_sd_pack_pages_nopageio(bp,
					    hook->tail, addr, offset, size);
			}
		}
		hook->skipped = 0;
	} else {
		/* Must be a read of a dirty block we want to discard */

		ASSERT(bp->b_flags & B_READ);
#ifdef _SD_BIO_STATS
		hook->SKIP_IO++;
#endif /* _SD_BIO_STATS */
		hook->skipped = 1;
		if (!bp->b_bufsize)
			bp->b_lblkno += fba_len;
	}
	hook->size += size;
}

/*
 * sd_add_mem - add an i/o request to the block of i/o described by bp.
 *	The memory target for this i/o may span multiple pages and may
 *	not be physically contiguous. Also, the len might not be a
 *	multiple of an fba.
 *
 * ARGUMENTS:
 *	bp - the struct buf describing the block i/o we are collecting.
 *	buf - target of this i/o request.
 *	len - number of bytes to transfer.
 */
void
sd_add_mem(struct buf *bp, char *buf, nsc_size_t len)
{
	nsc_size_t n;
	uintptr_t start;
	iob_hook_t *hook = (iob_hook_t *)bp->b_private;

	start = (uintptr_t)buf & page_offset_mask;

	for (; len > 0; buf += n, len -= n, start = 0) {
		n = min((nsc_size_t)len, (nsc_size_t)(page_size - start));
		/*
		 * i/o size must be multiple of an FBA since we can't
		 * count on lower level drivers to understand b_offset
		 */
		if (BLK_FBA_OFF(n) != 0) {
			cmn_err(CE_WARN,
			    "!sdbc(sd_add_mem) i/o request not FBA sized (%"
			    NSC_SZFMT ")", n);
		}

		if (!bp->b_bufsize) {
			/* first request */
			bp->b_flags &= ~(B_PHYS | B_PAGEIO);
			bp->b_un.b_addr = buf;
			bp->b_bufsize = (size_t)n;
		} else {
			struct buf *new_bp;

			if (!(new_bp = _sd_extend_iob(bp))) {
				/* we're hosed */
				cmn_err(CE_PANIC,
				    "sd_add_mem: couldn't extend iob");
			}
			new_bp->b_flags &= ~(B_PHYS | B_PAGEIO);
			new_bp->b_un.b_addr = buf;
			new_bp->b_bufsize = (size_t)n;
		}
		hook->size += n;
	}
}


/*
 * sd_start_io - start all the i/o needed to satisfy the i/o request described
 *	by bp. If supplied a non-NULL fn then this is an async request and
 *	we will return NSC_PENDING and call fn when all the i/o completes.
 *	Otherwise this is a synchronous request and we sleep until all the
 *	i/o is complete. If any buffer in the chain gets an error we return
 *	the first error we see (once all the i/o is complete).
 *
 * ARGUMENTS:
 *	bp - the struct buf describing the block i/o we are collecting.
 *	strategy - strategy function to call if known by the user, or NULL.
 *	fn - user's callback function. NULL implies synchronous request.
 *	arg - an argument passed to user's callback function.
 */
int
sd_start_io(struct buf *bp, strategy_fn_t strategy, sdbc_ea_fn_t fn,
    blind_t arg)
{
	int err;
	iob_hook_t *hook = (iob_hook_t *)bp->b_private;
	struct buf *bp_next;
	int (*ea_fn)(struct buf *, iob_hook_t *);
#ifdef _SD_BIO_STATS
	static int total_pages, total_pages_combined, total_norm;
	static int total_norm_combined, total_skipped;
	static nsc_size_t total_norm_size;

	static int total_bufs;
	static int total_xpages_w, total_ypages_w;
	static int total_xpages_r, total_ypages_r;
	static int max_run_r, max_run_w;
#endif /* _SD_BIO_STATS */

	hook->func = fn;
	hook->param = arg;
	if (fn != NULL)
		ea_fn = _sd_async_ea;
	else
		ea_fn = _sd_sync_ea;

	hook->iob_hook_iodone = ea_fn;

#ifdef _SD_BIO_STATS
	__start_io_count++;
	total_pages += hook->PAGE_IO;
	total_pages_combined += hook->PAGE_COMBINED;
	total_norm += hook->NORM_IO;
	total_norm_size += hook->NORM_IO_SIZE;
	total_skipped += hook->SKIP_IO;
#endif /* _SD_BIO_STATS */

	for (; bp; bp = bp_next) {

		DTRACE_PROBE4(sd_start_io_bufs, struct buf *, bp,
		    long, bp->b_bufsize, int, bp->b_flags,
		    iob_hook_t *, hook);

		bp_next = bp->b_forw;
		if (!(bp->b_flags & B_READ)) {
			SD_WRITES_TOT++;
			SD_WRITES_LEN[(bp->b_bufsize / 32768) %
			    (sizeof (SD_WRITES_LEN) / sizeof (int))]++;
		}
		bp->b_iodone = hook->iob_drv_iodone;
		bp->b_bcount = bp->b_bufsize;
		bp->b_forw = NULL;
		bp->b_back = NULL;
		bp->b_private = NULL;

#ifdef _SD_BIO_STATS
		total_bufs++;
		if (bp->b_flags & B_PAGEIO) {
			int i;

			i = _sd_count_pages(bp->b_pages);
			if (bp->b_flags & B_READ) {
				if (i > max_run_r)
					max_run_r = i;
				total_xpages_r += i;
				total_ypages_r++;
			} else {
				if (i > max_run_w)
					max_run_w = i;
				total_xpages_w += i;
				total_ypages_w++;
			}
		}
#endif /* _SD_BIO_STATS */


		/*
		 * It's possible for us to be told to read a dirty block
		 * where all the i/o can go away (e.g. read one fba, it's
		 * in cache and dirty) so we really have nothing to do but
		 * say we're done.
		 */
		if (bp->b_bcount) {
			if (!strategy) {
				strategy =
				    nsc_get_strategy(getmajor(bp->b_edev));
			}

			if (!strategy) {
				bp->b_flags |= B_ERROR;
				bp->b_error = ENXIO;
				(*bp->b_iodone)(bp);
			} else
#ifdef DEBUG
			/* inject i/o error for testing */
			if (bp->b_error = _sdbc_ioj_lookup(bp->b_edev)) {
				bp->b_flags |= B_ERROR;
				(*bp->b_iodone)(bp);
			} else
#endif
			{
				(*strategy)(bp);
			}
		} else {
			(*bp->b_iodone)(bp);
		}

	}

#ifdef _SD_BIO_STATS
	if (__start_io_count == 2000) {
		__start_io_count = 0;
		cmn_err(CE_WARN,
		    "!sdbc(sd_start_io) t_bufs %d pages %d "
		    "combined %d norm %d norm_size %" NSC_SZFMT " skipped %d",
		    total_bufs,
		    total_pages, total_pages_combined, total_norm,
		    total_norm_size, total_skipped);

		total_bufs = 0;
		total_pages = 0;
		total_pages_combined = 0;
		total_norm = 0;
		total_norm_combined = 0;
		total_skipped = 0;
		total_norm_size = 0;

		cmn_err(CE_WARN,
		    "!sdbc(sd_start_io)(r) max_run %d, total_xp %d total yp %d",
		    max_run_r, total_xpages_r, total_ypages_r);

		total_xpages_r = 0;
		total_ypages_r = 0;
		max_run_r = 0;

		cmn_err(CE_WARN,
		    "!sdbc(sd_start_io)(w) max_run %d, total_xp %d total yp %d",
		    max_run_w, total_xpages_w, total_ypages_w);

		total_xpages_w = 0;
		total_ypages_w = 0;
		max_run_w = 0;
	}
#endif /* _SD_BIO_STATS */

	if (ea_fn == _sd_async_ea) {
		DTRACE_PROBE(sd_start_io_end);

		return (NSC_PENDING);
	}

	mutex_enter(hook->lockp);

	while (hook->count) {
		cv_wait(&hook->wait, hook->lockp);
	}
	mutex_exit(hook->lockp);

	err = hook->error ? hook->error : NSC_DONE;
	bp = hook->tail;
	_sd_put_hook(hook);
	_sd_put_iobuf(bp);

	return (err);
}
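
/*
 * Usage sketch (illustrative comment only, not compiled code): how a caller
 * is expected to drive the interfaces above, pieced together from the block
 * comments for sd_alloc_iob(), sd_add_fba() and sd_start_io(). The device,
 * position, length, cache address and callback names are hypothetical, and
 * the callback signature is inferred from the call made in _sd_async_ea()
 * below.
 *
 *	void
 *	my_ea(blind_t arg, nsc_off_t fba_pos, nsc_size_t fba_len, int error);
 *
 *	bp = sd_alloc_iob(dev, fba_pos, fba_len, B_READ);
 *	sd_add_fba(bp, &addr, 0, fba_len);	(a NULL addr marks FBAs that
 *						need no actual i/o)
 *	rc = sd_start_io(bp, NULL, my_ea, arg);
 *
 * With a non-NULL callback, sd_start_io() returns NSC_PENDING and calls
 * my_ea() once every chained buf has completed; with fn == NULL it sleeps
 * until completion and returns NSC_DONE or the first error seen.
 */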

/*
 * _sd_sync_ea - called when a single i/o operation is complete. If this
 *	is the last outstanding i/o we wake up the sleeper.
 *	If this i/o had an error then we store the error result in the
 *	iob_hook if this was the first error.
 *
 * ARGUMENTS:
 *	bp - the struct buf describing the block i/o that just completed.
 *
 * Comments:
 *	This routine is called at interrupt level when the io is done.
 */
static int
_sd_sync_ea(struct buf *bp, iob_hook_t *hook)
{
	int error;
	int done;

	/*
	 * We get called for each buf that completes. When they are all done,
	 * we wake up the waiter.
	 */
	error = (bp->b_flags & B_ERROR) ? bp->b_error : 0;

	mutex_enter(hook->lockp);

	if (!hook->error)
		hook->error = error;

	done = !(--hook->count);
	if (done) {
		/* remember the last buffer so we can free it later */
		hook->tail = bp;
		cv_signal(&hook->wait);
	}
	mutex_exit(hook->lockp);

	/*
	 * let sd_start_io free the final buffer so the hook can be returned
	 * first.
	 */
	if (!done)
		_sd_put_iobuf(bp);

	return (0);
}

/*
 * _sd_async_ea - end action for async read/write.
 *
 * ARGUMENTS:
 *	bp - io buf pointer.
 *
 * RETURNS:
 *	0.
 *
 * Comments:
 *	This routine is called at interrupt level when the io is done.
 *	This is only called when the operation is asynchronous.
 */
static int
_sd_async_ea(struct buf *bp, iob_hook_t *hook)
{
	int done, error;

	/*
	 * We get called for each buf that completes. When they are all done,
	 * we call the requestor's callback function.
	 */
	error = (bp->b_flags & B_ERROR) ? bp->b_error : 0;

	mutex_enter(hook->lockp);
	done = !(--hook->count);

	if (!hook->error)
		hook->error = error;

	mutex_exit(hook->lockp);

	bp->b_forw = NULL;
	bp->b_back = NULL;

	if (done) {
		nsc_off_t fba_pos;
		nsc_size_t fba_len;
		int error;
		sdbc_ea_fn_t fn;
		blind_t arg;

		arg = hook->param;
		fn = hook->func;
		error = hook->error;
#if defined(_SD_DEBUG)	/* simulate disk errors */
		if (_test_async_fail == bp->b_edev)
			error = EIO;
#endif

		/* MAKE SURE b_lblkno, b_count never changes!! */
		fba_pos = hook->start_fba;
		fba_len = FBA_LEN(hook->size);

		_sd_put_hook(hook);
		_sd_put_iobuf(bp);
		(*fn)(arg, fba_pos, fba_len, error);
	} else
		_sd_put_iobuf(bp);

	return (0);
}

#ifdef DEBUG
typedef struct ioerr_inject_s {
	dev_t ioj_dev;
	int ioj_err;
	int ioj_cnt;
} ioerr_inject_t;

static ioerr_inject_t *ioerr_inject_table = NULL;

void
_sdbc_ioj_load()
{
	ioerr_inject_table =
	    kmem_zalloc(sdbc_max_devs * sizeof (ioerr_inject_t), KM_SLEEP);
}

void
_sdbc_ioj_unload()
{
	if (ioerr_inject_table != NULL) {
		kmem_free(ioerr_inject_table,
		    sdbc_max_devs * sizeof (ioerr_inject_t));
		ioerr_inject_table = NULL;
	}
}

static int
_sdbc_ioj_lookup(dev_t dev)
{
	int cd;

	for (cd = 0; cd < sdbc_max_devs; ++cd)
		if (ioerr_inject_table[cd].ioj_dev == dev) {
			if (ioerr_inject_table[cd].ioj_cnt > 0) {
				--ioerr_inject_table[cd].ioj_cnt;
				return (0);
			} else {
				return (ioerr_inject_table[cd].ioj_err);
			}
		}
	return (0);
}

void
_sdbc_ioj_set_dev(int cd, dev_t crdev)
{
	int i;

	if (cd == -1) {	/* all -- used for clearing table on shutdown */
		for (i = 0; i < sdbc_max_devs; ++i) {
			ioerr_inject_table[i].ioj_dev = crdev;
		}
	} else
		ioerr_inject_table[cd].ioj_dev = crdev;	/* assume valid cd */
}

static void
_sdbc_ioj_set_err(int cd, int err, int count)
{
	int i;

	if (cd == -1) {	/* all */
		for (i = 0; i < sdbc_max_devs; ++i) {
			ioerr_inject_table[i].ioj_err = err;
			ioerr_inject_table[i].ioj_cnt = count;
		}
	} else {
		ioerr_inject_table[cd].ioj_err = err;
		ioerr_inject_table[cd].ioj_cnt = count;
	}
}

static void
_sdbc_ioj_clear_err(int cd)
{
	_sdbc_ioj_set_err(cd, 0, 0);
}

int
_sdbc_inject_ioerr(int cd, int ioj_err, int count)
{
	if ((cd < -1) || (cd >= sdbc_max_devs))
		return (EINVAL);

	_sdbc_ioj_set_err(cd, ioj_err, count);

	return (0);
}

int
_sdbc_clear_ioerr(int cd)
{
	if ((cd < -1) || (cd >= sdbc_max_devs))
		return (EINVAL);

	_sdbc_ioj_clear_err(cd);

	return (0);
}
#endif
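
/*
 * Error-injection usage sketch (DEBUG builds only, illustrative): how the
 * hooks above appear to fit together. The cache descriptor "cd" and device
 * below are hypothetical.
 *
 *	_sdbc_ioj_set_dev(cd, dev);		associate cd with its dev_t
 *	(void) _sdbc_inject_ioerr(cd, EIO, 5);	let 5 i/os through, then
 *						fail the rest with EIO
 *	(void) _sdbc_clear_ioerr(cd);		stop injecting
 *
 * sd_start_io() consults _sdbc_ioj_lookup() before calling the strategy
 * routine and, when an error is pending for the device, completes the buf
 * with B_ERROR and that errno instead of issuing the i/o.
 */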