1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25 /*
26 * Copyright (c) 2017 by Delphix. All rights reserved.
27 */
28
29 #include <sys/types.h>
30 #include <sys/param.h>
31 #include <sys/ksynch.h>
32 #include <sys/kmem.h>
33 #include <sys/stat.h>
34 #include <sys/buf.h>
35 #include <sys/open.h>
36 #include <sys/conf.h>
37 #include <sys/file.h>
38 #include <sys/cmn_err.h>
39 #include <sys/errno.h>
40 #include <sys/ddi.h>
41
42 #include <sys/nsc_thread.h>
43 #include <sys/nsctl/nsctl.h>
44
45 #include <sys/sdt.h> /* dtrace is S10 or later */
46
47 #include <vm/seg_kmem.h>
48 #include "sd_bcache.h"
49 #include "sd_trace.h"
50 #include "sd_io.h"
51 #include "sd_iob.h"
52 #include "sd_misc.h"
53 #if defined(_SD_DEBUG) /* simulate disk errors */
54 #include "sd_tdaemon.h"
55 #endif
56
57 #ifndef DS_DDICT
58 extern uintptr_t kobj_getsymvalue(char *, int); /* DDI violation */
59 #endif
60
61 #define DO_PAGE_LIST sdbc_do_page /* enable pagelist code */
62
63 int sdbc_do_page = 0;
64
65 #define SGIO_MAX 254
66
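/* protects sdbc_bio_count and the iob hook freelist */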
67 static kmutex_t sdbc_bio_mutex;
68 static int sdbc_bio_count;
69
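/* system page size and page-offset mask, set in _sdbc_iobuf_configure() */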
70 static unsigned long page_size, page_offset_mask;
71
72 #ifdef _SD_BIO_STATS
static int __start_io_count = 0;
74 #endif /* _SD_BIO_STATS */
75
/*
 * Forward declare all statics that are used before they are defined, to
 * enforce parameter checking. Also forward-declare all functions that have
 * 64-bit argument types to enforce correct parameter checking.
 *
 * Some (if not all) of these could be removed if the code were reordered.
 */
83
84 static int _sd_sync_ea(struct buf *, iob_hook_t *);
85 static int _sd_async_ea(struct buf *, iob_hook_t *);
86 static void _sd_pack_pages(struct buf *bp, struct buf *list, sd_addr_t *addr,
87 nsc_off_t offset, nsc_size_t size);
88 static void _sd_pack_pages_nopageio(struct buf *bp, struct buf *list,
89 sd_addr_t *addr, nsc_off_t offset, nsc_size_t size);
90 static void _sd_setup_iob(struct buf *bp, dev_t dev, nsc_off_t pos, int flag);
91
92 #ifdef DEBUG
93 static int _sdbc_ioj_lookup(dev_t);
94 static void _sdbc_ioj_clear_err(int);
95 #endif
96
97 static int SD_WRITES_TOT = 0;
98 static int SD_WRITES_LEN[100];
99
100 _sd_buf_list_t _sd_buflist;
101
/*
 * _sd_add_vm_to_bp_plist - add the page corresponding to the
 * virtual address "v" (kernel virtaddr) to the pagelist linked
 * to buffer "bp".
 *
 * The virtual address "v" is "known" to be allocated by segkmem,
 * so we can look up its page via the segkmem vnode "kvp".
 * This violates the DDI/DKI but is workable for now.
 */
113 static void
114 _sd_add_vm_to_bp_plist(struct buf *bp, unsigned char *v)
115 {
116 page_t *pp;
117 page_t *one_pg = NULL;
118
119 pp = page_find(&kvp, (u_offset_t)((uintptr_t)v & ~page_offset_mask));
120 if (!pp) {
121 cmn_err(CE_PANIC,
122 "_sd_add_vm_to_bp_plist: couldn't find page for 0x%p",
123 (void *)v);
124 }
125
126 page_add(&one_pg, pp);
127 page_list_concat(&(bp->b_pages), &one_pg);
128
129 }
130
131 #ifdef _SD_BIO_STATS
132 static int
133 _sd_count_pages(page_t *pp)
134 {
135 int cnt = 0;
136 page_t *pp1;
137 if (pp == NULL)
138 return (cnt);
139
140 for (cnt = 1, pp1 = pp->p_next; pp != pp1; cnt++, pp1 = pp1->p_next)
141 ;
142
143 return (cnt);
144 }
145 #endif /* _SD_BIO_STATS */
146
147
/*
 * _sdbc_iobuf_load - load-time initialization of io buf structures.
 *
 * RETURNS:
 *	0 - success.
 *	-1 - failure.
 *
 * USAGE:
 *	This routine initializes the load-time buf structures.
 *	Should be called when the cache is loaded.
 */
160
161 int
162 _sdbc_iobuf_load(void)
163 {
164 mutex_init(&sdbc_bio_mutex, NULL, MUTEX_DRIVER, NULL);
165
	/*
	 * HACK: add a ref to kvp to prevent a VN_RELE on it from panicking
	 * the system
	 */
170 VN_HOLD(&kvp);
171
172 return (0);
173 }
174
175 /*
176 * _sdbc_iobuf_unload - unload time cleanup of io buf structures.
177 *
178 *
179 * USAGE:
180 * This routine removes load time buf structures.
181 * Should be called when the cache is unloaded.
182 */
183 void
184 _sdbc_iobuf_unload(void)
185 {
186 mutex_enter(&kvp.v_lock);
187 ASSERT(kvp.v_count == 1);
188 VN_RELE_LOCKED(&kvp);
189 mutex_exit(&kvp.v_lock);
190
191 mutex_destroy(&sdbc_bio_mutex);
192 bzero(&_sd_buflist, sizeof (_sd_buf_list_t));
193 }
194
/*
 * _sdbc_iobuf_configure - configure a list of io bufs for later use.
 *
 * ARGUMENTS:
 *	num - number of buffers (from the configuration file).
 *
 * RETURNS:
 *	0 - success.
 *	<0 - failure.
 *
 * USAGE:
 *	This routine configures the buf structures for io.
 *	Should be called when the cache is configured.
 */
209
210 int
211 _sdbc_iobuf_configure(int num)
212 {
213 int i;
214 _sd_buf_list_t *buflist;
215 iob_hook_t *hook;
216 char symbol_name[32];
217
218 if (!num || (num > _SD_DEFAULT_IOBUFS))
219 num = _SD_DEFAULT_IOBUFS;
220
221 if ((_sd_buflist.hooks = (iob_hook_t *)nsc_kmem_zalloc(
222 num * sizeof (iob_hook_t), KM_SLEEP, sdbc_iobuf_mem)) == NULL) {
223 return (-1);
224 }
225
226 buflist = &_sd_buflist;
227 buflist->bl_init_count = num;
228 buflist->bl_hooks_avail = num;
229 buflist->bl_hook_lowmark = num;
230 hook = buflist->hooks;
231 buflist->hook_head = hook;
232 for (i = 0; i < num; i++, hook++) {
233 cv_init(&hook->wait, NULL, CV_DRIVER, NULL);
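		/*
		 * Each hook gets its own pre-generated "sd_iob_dcb<i>"
		 * driver iodone callback, looked up here by symbol name.
		 */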
234 (void) sprintf(symbol_name, "sd_iob_dcb%d", i);
235 hook->iob_drv_iodone = (dcb_t)kobj_getsymvalue(symbol_name, 0);
236 if (!hook->iob_drv_iodone) {
237 return (-2);
238 }
239 hook->next_hook = hook+1;
240 }
241 (hook-1)->next_hook = NULL;
242
243 for (i = 0; i < MAX_HOOK_LOCKS; i++)
244 mutex_init(&_sd_buflist.hook_locks[i], NULL, MUTEX_DRIVER,
245 NULL);
246
247 cv_init(&_sd_buflist.hook_wait, NULL, CV_DRIVER, NULL);
248 _sd_buflist.hook_waiters = 0;
249
250 sdbc_bio_count = 0;
251 SD_WRITES_TOT = 0;
252 bzero(SD_WRITES_LEN, sizeof (SD_WRITES_LEN));
253
254 /* pagelist i/o pages must be done in cache_init */
255
256 page_size = ptob(1);
257 page_offset_mask = page_size - 1;
258
259 return (0);
260 }
261
/*
 * _sdbc_iobuf_deconfigure - release all memory allocated for the buf list.
 *
 * ARGUMENTS:
 *	None.
 *
 * RETURNS:
 *	Nothing.
 */
271 void
272 _sdbc_iobuf_deconfigure(void)
273 {
274 ushort_t i;
275
276 if (_sd_buflist.hooks) {
		for (i = 0; i < _sd_buflist.bl_init_count; i++) {
278 cv_destroy(&_sd_buflist.hooks[i].wait);
279 }
280 cv_destroy(&_sd_buflist.hook_wait);
281 nsc_kmem_free(_sd_buflist.hooks,
282 _sd_buflist.bl_init_count * sizeof (iob_hook_t));
		for (i = 0; i < MAX_HOOK_LOCKS; i++) {
284 mutex_destroy(&_sd_buflist.hook_locks[i]);
285 }
286 }
287
288 _sd_buflist.hooks = NULL;
289
290 #ifdef DEBUG
291 {
292 void _sdbc_ioj_clear_err(int);
293 _sdbc_ioj_clear_err(-1); /* clear any injected i/o errors */
294 _sdbc_ioj_set_dev(-1, 0); /* clear dev entries */
295 }
296 #endif
297
298 }
299
300 /*
301 * _sd_pending_iobuf()
302 *
303 * Return the number of I/O bufs outstanding
304 */
305 int
306 _sd_pending_iobuf(void)
307 {
308 return (sdbc_bio_count);
309 }
310
311 /*
312 * _sd_get_iobuf - allocate a buf.
313 *
314 * ARGUMENTS:
315 * None.
316 *
317 * RETURNS:
318 * NULL - failure.
319 * buf ptr otherwise.
320 *
321 * ASSUMPTIONS - process could block if we run out.
322 *
323 */
324 /*ARGSUSED*/
325 static struct buf *
326 _sd_get_iobuf(int num_bdl)
327 {
328 struct buf *bp;
329
330 /* Get a buffer, ready for page list i/o */
331
332 if (DO_PAGE_LIST)
333 bp = pageio_setup(NULL, 0, &kvp, 0);
334 else
335 bp = getrbuf(KM_SLEEP);
336
337 if (bp == NULL)
338 return (NULL);
339 mutex_enter(&sdbc_bio_mutex);
340 sdbc_bio_count++;
341 mutex_exit(&sdbc_bio_mutex);
342 return (bp);
343 }
344
/*
 * _sd_put_iobuf - put a buf back in the freelist.
 *
 * ARGUMENTS:
 *	bp - buf pointer.
 *
 * RETURNS:
 *	Nothing.
 */
355 static void
356 _sd_put_iobuf(struct buf *bp)
357 {
358 mutex_enter(&sdbc_bio_mutex);
359 sdbc_bio_count--;
360 mutex_exit(&sdbc_bio_mutex);
361 if (DO_PAGE_LIST)
362 pageio_done(bp);
363 else
364 freerbuf(bp);
365 }
366
367
/*
 * B_KERNBUF is defined as 0 here so that OR'ing it into b_flags below is a
 * harmless no-op on releases where the flag no longer exists (see the
 * comment in _sd_setup_iob()).
 */
#define B_KERNBUF 0
370
371 static void
372 _sd_setup_iob(struct buf *bp, dev_t dev, nsc_off_t pos, int flag)
373 {
374 bp->b_pages = NULL;
375 bp->b_un.b_addr = 0;
376
377 flag &= (B_READ | B_WRITE);
378
379 /*
380 * if pagelist i/o, _sd_get_iobuf()/pageio_setup() has already
381 * set b_flags to
382 * B_KERNBUF | B_PAGEIO | B_NOCACHE | B_BUSY (sol 6,7,8)
383 * or
384 * B_PAGEIO | B_NOCACHE | B_BUSY (sol 9)
385 */
386
387 bp->b_flags |= B_KERNBUF | B_BUSY | flag;
388
389 bp->b_error = 0;
390
391 bp->b_forw = NULL;
392 bp->b_back = NULL;
393
394 bp->b_lblkno = (diskaddr_t)pos;
395 bp->b_bufsize = 0;
396 bp->b_resid = 0;
397 bp->b_proc = NULL;
398 bp->b_edev = dev;
399 }
400
401
402 /*
403 * _sd_get_hook - get an iob hook from the free list.
404 *
405 * ARGUMENTS:
406 * none
407 *
408 * RETURNS:
409 * the newly allocated iob_hook.
410 *
411 */
412 static iob_hook_t *
413 _sd_get_hook(void)
414 {
415
416 iob_hook_t *ret;
417
418 mutex_enter(&sdbc_bio_mutex);
419
420 retry:
421 ret = _sd_buflist.hook_head;
422 if (ret)
423 _sd_buflist.hook_head = ret->next_hook;
424 else {
425 ++_sd_buflist.hook_waiters;
426 if (_sd_buflist.max_hook_waiters < _sd_buflist.hook_waiters)
427 _sd_buflist.max_hook_waiters = _sd_buflist.hook_waiters;
428 cv_wait(&_sd_buflist.hook_wait, &sdbc_bio_mutex);
429 --_sd_buflist.hook_waiters;
430 goto retry;
431 }
432
433 if (_sd_buflist.bl_hook_lowmark > --_sd_buflist.bl_hooks_avail)
434 _sd_buflist.bl_hook_lowmark = _sd_buflist.bl_hooks_avail;
435
436 mutex_exit(&sdbc_bio_mutex);
437 ret->skipped = 0;
438
439 ret->count = 0;
440
441 #ifdef _SD_BIO_STATS
442 ret->PAGE_IO = 0;
443 ret->NORM_IO = 0;
444 ret->NORM_IO_SIZE = 0;
445 ret->SKIP_IO = 0;
446 ret->PAGE_COMBINED = 0;
447 #endif /* _SD_BIO_STATS */
448
449 return (ret);
450 }
451
452 /*
453 * _sd_put_hook - put an iob hook back on the free list.
454 *
455 * ARGUMENTS:
456 * hook - an iob_hook to be returned to the freelist.
457 *
458 *
459 */
460 static void
461 _sd_put_hook(iob_hook_t *hook)
462 {
463
464 mutex_enter(&sdbc_bio_mutex);
465
466 if (_sd_buflist.hook_waiters) {
467 cv_signal(&_sd_buflist.hook_wait);
468 }
469 hook->next_hook = _sd_buflist.hook_head;
470 _sd_buflist.hook_head = hook;
471
472 ++_sd_buflist.bl_hooks_avail;
473
474 mutex_exit(&sdbc_bio_mutex);
475 }
476
/*
 * _sd_extend_iob - the i/o block we are handling needs a new struct buf to
 *	describe the next hunk of i/o. Get a new struct buf and initialize it
 *	based on the state in the struct buf we are passed as an arg.
 * ARGUMENTS:
 *	head_bp - a buffer header in the current i/o block we are handling.
 *		(generally the initial header but in fact could be any
 *		of the ones [if any] that were chained to the initial
 *		one).
 */
487 static struct buf *
488 _sd_extend_iob(struct buf *head_bp)
489 {
490 struct buf *bp;
491 iob_hook_t *hook = (iob_hook_t *)head_bp->b_private;
492
493
494 if (!(bp = _sd_get_iobuf(0)))
495 return (0);
496
497 bp->b_pages = NULL;
498 bp->b_un.b_addr = 0;
499
500 bp->b_flags |= (head_bp->b_flags & (B_READ | B_WRITE));
501
502 if (!DO_PAGE_LIST)
503 bp->b_flags |= B_KERNBUF | B_BUSY;
504
505 bp->b_error = 0;
506
507 /*
508 * b_forw/b_back will form a doubly linked list of all the buffers
509 * associated with this block of i/o.
510 * hook->tail points to the last buffer in the chain.
511 */
512 bp->b_forw = NULL;
513 bp->b_back = hook->tail;
514 hook->tail->b_forw = bp;
515 hook->tail = bp;
516 hook->count++;
517
518 ASSERT(BLK_FBA_OFF(hook->size) == 0);
519
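	/* the new buf starts at the disk block just past the i/o so far */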
520 bp->b_lblkno = (diskaddr_t)hook->start_fba +
521 (diskaddr_t)FBA_NUM(hook->size);
522
523 bp->b_bufsize = 0;
524 bp->b_resid = 0;
525 bp->b_proc = NULL;
526 bp->b_edev = head_bp->b_edev;
527
528 bp->b_iodone = NULL; /* for now */
529 bp->b_private = hook;
530
531 return (bp);
532 }
533
/*
 * sd_alloc_iob - start processing a block of i/o. This allocates an initial
 *	buffer header for describing the i/o and an iob_hook for collecting
 *	information about all the i/o requests added to this buffer.
 *
 * ARGUMENTS:
 *	dev     - the device all the i/o is destined for.
 *	fba_pos - the initial disk block of the transfer.
 *	blks    - ignored.
 *	flag    - signals whether this is a read or write request.
 *
 * RETURNS:
 *	pointer to a free struct buf which will be used to describe the
 *	i/o request.
 */
548 /* ARGSUSED */
549 struct buf *
550 sd_alloc_iob(dev_t dev, nsc_off_t fba_pos, int blks, int flag)
551 {
552 struct buf *bp;
553 iob_hook_t *hook;
554
555 if (!(bp = _sd_get_iobuf(0)))
556 return (0);
557
558 _sd_setup_iob(bp, dev, fba_pos, flag);
559
560 bp->b_iodone = NULL; /* for now */
561 hook = _sd_get_hook();
562 if (!hook) {
563 /* can't see how this could happen */
564 _sd_put_iobuf(bp);
565 return (0);
566 }
567
	/*
	 * pick an arbitrary lock from the pool, hashed on the hook address
	 */
571 hook->lockp = &_sd_buflist.hook_locks[((long)hook >> 9) &
572 (MAX_HOOK_LOCKS - 1)];
573 hook->start_fba = fba_pos;
574 hook->last_fba = fba_pos;
575 hook->size = 0;
576 hook->tail = bp;
577 hook->chain = bp;
578 hook->count = 1;
579 hook->error = 0;
580 bp->b_private = hook;
581
582 return (bp);
583 }
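
/*
 * Typical use of this interface (an illustrative sketch only; the real
 * callers live elsewhere in sdbc and the names below are invented):
 *
 *	bp = sd_alloc_iob(dev, fba_pos, fba_len, B_READ);
 *	for (each fragment of the request)
 *		sd_add_fba(bp, &addr, frag_fba_off, frag_fba_len);
 *	rc = sd_start_io(bp, strategy, done_fn, done_arg);
 *
 * With a NULL done_fn, sd_start_io() blocks and returns NSC_DONE or the
 * first error seen; with a callback it returns NSC_PENDING and calls
 * done_fn(done_arg, fba_pos, fba_len, error) once all chained bufs have
 * completed.
 */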
584
/*
 * _sd_pack_pages - produce i/o requests that will perform the type of i/o
 *	described by bp (READ/WRITE). It attempts to tack the i/o onto the
 *	buf pointed to by "list" to minimize the number of bufs required.
 *
 * ARGUMENTS:
 *	bp - the i/o description, i.e. the head of the chain.
 *	list - where to start adding this i/o request (NULL if we should
 *		extend).
 *	addr - address describing where the data is.
 *	offset - offset from addr where the data begins.
 *	size - size of the i/o request.
 */
597 static void
598 _sd_pack_pages(struct buf *bp, struct buf *list, sd_addr_t *addr,
599 nsc_off_t offset, nsc_size_t size)
600 {
601 uintptr_t start_addr, end_addr;
602 int page_end_aligned;
603 #ifdef _SD_BIO_STATS
604 iob_hook_t *hook = (iob_hook_t *)bp->b_private;
605 struct buf *orig_list = list;
606 #endif /* _SD_BIO_STATS */
607
608 start_addr = (uintptr_t)addr->sa_virt + offset;
609 end_addr = start_addr + size;
610
611 page_end_aligned = !(end_addr & page_offset_mask);
612
613 if (!list && !(list = _sd_extend_iob(bp))) {
614 /*
615 * we're hosed since we have no error return...
616 * though we could ignore stuff from here on out
617 * and return ENOMEM when we get to sd_start_io.
618 * This will do for now.
619 */
620 cmn_err(CE_PANIC, "_sd_pack_pages: couldn't extend iob");
621 }
622
623 /*
624 * We only want to do pagelist i/o if we end on a page boundary.
625 * If we don't end on a page boundary we won't combine with the
626 * next request and so we may as well do it as normal as it
627 * will only use one buffer.
628 */
629
630 if (DO_PAGE_LIST && page_end_aligned) {
631 if (start_addr & page_offset_mask) {
632 /*
633 * handle the partial page
634 */
635 if (list->b_bufsize) {
636 if (!(list = _sd_extend_iob(bp))) {
637 /*
638 * we're hosed since we have no error
639 * return though we could ignore stuff
640 * from here on out and return ENOMEM
641 * when we get to sd_start_io.
642 * This will do for now.
643 */
644 cmn_err(CE_PANIC,
645 "_sd_pack_pages: couldn't extend iob");
646 }
647 }
648 #ifdef _SD_BIO_STATS
649 hook->PAGE_IO++;
650 #endif /* _SD_BIO_STATS */
651 _sd_add_vm_to_bp_plist(list,
652 (unsigned char *) start_addr);
653 list->b_bufsize = page_size -
654 (start_addr & page_offset_mask);
655 list->b_un.b_addr = (caddr_t)
656 (start_addr & page_offset_mask);
657 size -= list->b_bufsize;
658 start_addr += list->b_bufsize;
659 }
660 /*
661 * Now fill with all the full pages remaining.
662 */
663 for (; size > 0; size -= page_size) {
664 #ifdef _SD_BIO_STATS
665 hook->PAGE_IO++;
666 #endif /* _SD_BIO_STATS */
667
668 _sd_add_vm_to_bp_plist(list,
669 (unsigned char *) start_addr);
670 start_addr += page_size;
671 list->b_bufsize += page_size;
672 #ifdef _SD_BIO_STATS
673 if (list == orig_list)
674 hook->PAGE_COMBINED++;
675 #endif /* _SD_BIO_STATS */
676 }
677 if (size)
678 cmn_err(CE_PANIC, "_sd_pack_pages: bad size: %"
679 NSC_SZFMT, size);
680 } else {
681 /*
682 * Wasn't worth it as pagelist i/o, do as normal
683 */
684 if (list->b_bufsize && !(list = _sd_extend_iob(bp))) {
685 /*
686 * we're hosed since we have no error return...
687 * though we could ignore stuff from here on out
688 * and return ENOMEM when we get to sd_start_io.
689 * This will do for now.
690 */
691 cmn_err(CE_PANIC,
692 "_sd_pack_pages: couldn't extend iob");
693 }
694
695 /* kernel virtual */
696 list->b_flags &= ~(B_PHYS | B_PAGEIO);
697 list->b_un.b_addr = (caddr_t)start_addr;
698 #ifdef _SD_BIO_STATS
699 hook->NORM_IO++;
700 hook->NORM_IO_SIZE += size;
701 #endif /* _SD_BIO_STATS */
702 list->b_bufsize = (size_t)size;
703 }
704
705 }
706
707 /*
708 * perform same function as _sd_pack_pages() when not doing pageio
709 */
710 static void
711 _sd_pack_pages_nopageio(struct buf *bp, struct buf *list, sd_addr_t *addr,
712 nsc_off_t offset, nsc_size_t size)
713 {
714 uintptr_t start_addr;
715 #ifdef _SD_BIO_STATS
716 iob_hook_t *hook = (iob_hook_t *)bp->b_private;
717 struct buf *orig_list = list;
718 #endif /* _SD_BIO_STATS */
719
720 start_addr = (uintptr_t)addr->sa_virt + offset;
721
722 if (!list && !(list = _sd_extend_iob(bp))) {
723 /*
724 * we're hosed since we have no error return...
725 * though we could ignore stuff from here on out
726 * and return ENOMEM when we get to sd_start_io.
727 * This will do for now.
728 */
729 cmn_err(CE_PANIC, "_sd_pack_pages_nopageio: couldn't "
730 "extend iob");
731 }
732
733 if (list->b_bufsize &&
734 (start_addr == (uintptr_t)(list->b_un.b_addr + list->b_bufsize))) {
735 /* contiguous */
736 list->b_bufsize += (size_t)size;
737 } else {
738 /*
739 * not contiguous mem (extend) or first buffer (bufsize == 0).
740 */
741 if (list->b_bufsize && !(list = _sd_extend_iob(bp))) {
742 /*
743 * we're hosed since we have no error return...
744 * though we could ignore stuff from here on out
745 * and return ENOMEM when we get to sd_start_io.
746 * This will do for now.
747 */
748 cmn_err(CE_PANIC, "_sd_pack_pages_nopageio: couldn't "
749 "extend iob");
750 }
751 list->b_un.b_addr = (caddr_t)start_addr;
752 list->b_bufsize = (size_t)size;
753 }
754
755 #ifdef _SD_BIO_STATS
756 hook->NORM_IO++;
757 hook->NORM_IO_SIZE += size;
758 #endif /* _SD_BIO_STATS */
759 }
760
/*
 * sd_add_fba - add an i/o request to the block of i/o described by bp.
 *	We try to combine this request with the previous request. In
 *	addition, we try to do the i/o as PAGELIST_IO if it satisfies
 *	the restrictions for it. If the i/o request can't be combined
 *	we extend the i/o description with a new buffer header and add
 *	it to the chain headed by bp.
 *
 * ARGUMENTS:
 *	bp - the struct buf describing the block i/o we are collecting.
 *	addr - description of the address the data will be read from or
 *		written to. A NULL indicates that this i/o request doesn't
 *		need to actually happen. Used to mark reads when the fba is
 *		already in cache and dirty.
 *
 *	fba_pos - offset from the address in addr where the i/o is to start.
 *
 *	fba_len - number of consecutive fbas to transfer.
 *
 * NOTE: It is assumed that the memory is physically contiguous but may span
 *	multiple pages (should a cache block be larger than a page).
 */
784 void
785 sd_add_fba(struct buf *bp, sd_addr_t *addr, nsc_off_t fba_pos,
786 nsc_size_t fba_len)
787 {
788 nsc_off_t offset;
789 nsc_size_t size;
790 iob_hook_t *hook = (iob_hook_t *)bp->b_private;
791
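	/* convert the FBA offset and length into bytes */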
792 size = FBA_SIZE(fba_len);
793 offset = FBA_SIZE(fba_pos);
794
795 if (addr) {
796 /*
797 * See if this can be combined with previous request(s)
798 */
799 if (!bp->b_bufsize) {
800 if (DO_PAGE_LIST)
801 _sd_pack_pages(bp, bp, addr, offset, size);
802 else
803 _sd_pack_pages_nopageio(bp, bp, addr, offset,
804 size);
805 } else {
806 if (DO_PAGE_LIST) {
807 if (hook->tail->b_flags & B_PAGEIO) {
808 /*
809 * Last buffer was a pagelist. Unless a
810 * skip was detected the last request
811 * ended on a page boundary. If this
812 * one starts on one we combine the
813 * best we can.
814 */
815 if (hook->skipped)
816 _sd_pack_pages(bp, NULL, addr,
817 offset, size);
818 else
819 _sd_pack_pages(bp, hook->tail,
820 addr, offset, size);
821 } else {
822 /*
823 * Last buffer was vanilla i/o or worse
824 * (sd_add_mem)
825 */
826 _sd_pack_pages(bp, NULL, addr, offset,
827 size);
828 }
829 } else {
830 if (hook->skipped)
831 _sd_pack_pages_nopageio(bp, NULL,
832 addr, offset, size);
833 else
834 _sd_pack_pages_nopageio(bp,
835 hook->tail, addr, offset, size);
836 }
837 }
838 hook->skipped = 0;
839 } else {
840 /* Must be a read of dirty block we want to discard */
841
842 ASSERT(bp->b_flags & B_READ);
843 #ifdef _SD_BIO_STATS
844 hook->SKIP_IO++;
845 #endif /* _SD_BIO_STATS */
846 hook->skipped = 1;
847 if (!bp->b_bufsize)
848 bp->b_lblkno += fba_len;
849 }
850 hook->size += size;
851
852 }
853
/*
 * sd_add_mem - add an i/o request to the block of i/o described by bp.
 *	The memory target for this i/o may span multiple pages and may
 *	not be physically contiguous. Also, the length might not be a
 *	multiple of an fba.
 *
 * ARGUMENTS:
 *	bp - the struct buf describing the block i/o we are collecting.
 *
 *	buf - target of this i/o request.
 *
 *	len - number of bytes to transfer.
 */
868 void
869 sd_add_mem(struct buf *bp, char *buf, nsc_size_t len)
870 {
871 nsc_size_t n;
872 uintptr_t start;
873 iob_hook_t *hook = (iob_hook_t *)bp->b_private;
874
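	/* byte offset of buf within its page */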
875 start = (uintptr_t)buf & page_offset_mask;
876
877 for (; len > 0; buf += n, len -= n, start = 0) {
878 n = min((nsc_size_t)len, (nsc_size_t)(page_size - start));
879 /*
880 * i/o size must be multiple of an FBA since we can't
881 * count on lower level drivers to understand b_offset
882 */
883 if (BLK_FBA_OFF(n) != 0) {
884 cmn_err(CE_WARN,
885 "!sdbc(sd_add_mem) i/o request not FBA sized (%"
886 NSC_SZFMT ")", n);
887 }
888
889 if (!bp->b_bufsize) {
890 /* first request */
891 bp->b_flags &= ~(B_PHYS | B_PAGEIO);
892 bp->b_un.b_addr = buf;
893 bp->b_bufsize = (size_t)n;
894 } else {
895 struct buf *new_bp;
896 if (!(new_bp = _sd_extend_iob(bp))) {
897 /* we're hosed */
898 cmn_err(CE_PANIC,
899 "sd_add_mem: couldn't extend iob");
900 }
901 new_bp->b_flags &= ~(B_PHYS | B_PAGEIO);
902 new_bp->b_un.b_addr = buf;
903 new_bp->b_bufsize = (size_t)n;
904 }
905 hook->size += n;
906 }
907 }
908
909
/*
 * sd_start_io - start all the i/o needed to satisfy the i/o request described
 *	by bp. If supplied a non-NULL fn, then this is an async request and
 *	we will return NSC_PENDING and call fn when all the i/o is complete.
 *	Otherwise this is a synchronous request and we sleep until all the
 *	i/o is complete. If any buffer in the chain gets an error we return
 *	the first error we see (once all the i/o is complete).
 *
 * ARGUMENTS:
 *	bp - the struct buf describing the block i/o we are collecting.
 *
 *	strategy - strategy function to call if known by the user, or NULL.
 *
 *	fn - user's callback function. NULL implies a synchronous request.
 *
 *	arg - an argument passed to the user's callback function.
 */
928 int
929 sd_start_io(struct buf *bp, strategy_fn_t strategy, sdbc_ea_fn_t fn,
930 blind_t arg)
931 {
932 int err;
933 iob_hook_t *hook = (iob_hook_t *)bp->b_private;
934 struct buf *bp_next;
935 int (*ea_fn)(struct buf *, iob_hook_t *);
936 #ifdef _SD_BIO_STATS
937 static int total_pages, total_pages_combined, total_norm;
938 static int total_norm_combined, total_skipped;
939 static nsc_size_t total_norm_size;
940
941 static int total_bufs;
942 static int total_xpages_w, total_ypages_w;
943 static int total_xpages_r, total_ypages_r;
944 static int max_run_r, max_run_w;
945
946 #endif /* _SD_BIO_STATS */
947
948 hook->func = fn;
949 hook->param = arg;
950 if (fn != NULL)
951 ea_fn = _sd_async_ea;
952 else
953 ea_fn = _sd_sync_ea;
954
955 hook->iob_hook_iodone = ea_fn;
956
957 #ifdef _SD_BIO_STATS
958 __start_io_count++;
959 total_pages += hook->PAGE_IO;
960 total_pages_combined += hook->PAGE_COMBINED;
961 total_norm += hook->NORM_IO;
962 total_norm_size += hook->NORM_IO_SIZE;
963 total_skipped += hook->SKIP_IO;
964 #endif /* _SD_BIO_STATS */
965
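	/* dispatch each buf chained to this block of i/o */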
966 for (; bp; bp = bp_next) {
967
		DTRACE_PROBE4(sd_start_io_bufs, struct buf *, bp,
		    long, bp->b_bufsize, int, bp->b_flags,
		    iob_hook_t *, hook);
970
971 bp_next = bp->b_forw;
972 if (!(bp->b_flags & B_READ)) {
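			/* crude histogram of write sizes in 32K buckets */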
973 SD_WRITES_TOT++;
974 SD_WRITES_LEN[(bp->b_bufsize/32768) %
975 (sizeof (SD_WRITES_LEN)/sizeof (int))]++;
976 }
977 bp->b_iodone = hook->iob_drv_iodone;
978 bp->b_bcount = bp->b_bufsize;
979 bp->b_forw = NULL;
980 bp->b_back = NULL;
981 bp->b_private = NULL;
982
983 #ifdef _SD_BIO_STATS
984 total_bufs ++;
985 if (bp->b_flags & B_PAGEIO) {
986 int i;
987 i = _sd_count_pages(bp->b_pages);
988 if (bp->b_flags & B_READ) {
989 if (i > max_run_r)
990 max_run_r = i;
991 total_xpages_r += i;
992 total_ypages_r++;
993 } else {
994 if (i > max_run_w)
995 max_run_w = i;
996 total_xpages_w += i;
997 total_ypages_w++;
998 }
999 }
1000 #endif /* _SD_BIO_STATS */
1001
1002
1003 /*
1004 * It's possible for us to be told to read a dirty block
1005 * where all the i/o can go away (e.g. read one fba, it's
1006 * in cache and dirty) so we really have nothing to do but
1007 * say we're done.
1008 */
1009 if (bp->b_bcount) {
1010 if (!strategy) {
1011 strategy =
1012 nsc_get_strategy(getmajor(bp->b_edev));
1013 }
1014
1015 if (!strategy) {
1016 bp->b_flags |= B_ERROR;
1017 bp->b_error = ENXIO;
1018 (*bp->b_iodone)(bp);
1019 } else
1020 #ifdef DEBUG
1021 /* inject i/o error for testing */
			if ((bp->b_error = _sdbc_ioj_lookup(bp->b_edev)) != 0) {
1023 bp->b_flags |= B_ERROR;
1024 (*bp->b_iodone)(bp);
1025 } else
1026 #endif
1027 {
1028 (*strategy)(bp);
1029 }
1030 } else {
1031 (*bp->b_iodone)(bp);
1032 }
1033
1034 }
1035
1036 #ifdef _SD_BIO_STATS
1037 if (__start_io_count == 2000) {
1038 __start_io_count = 0;
1039 cmn_err(CE_WARN,
1040 "!sdbc(sd_start_io) t_bufs %d pages %d "
1041 "combined %d norm %d norm_size %" NSC_SZFMT " skipped %d",
1042 total_bufs,
1043 total_pages, total_pages_combined, total_norm,
1044 total_norm_size, total_skipped);
1045
1046 total_bufs = 0;
1047 total_pages = 0;
1048 total_pages_combined = 0;
1049 total_norm = 0;
1050 total_norm_combined = 0;
1051 total_skipped = 0;
1052 total_norm_size = 0;
1053
1054 cmn_err(CE_WARN,
1055 "!sdbc(sd_start_io)(r) max_run %d, total_xp %d total yp %d",
1056 max_run_r, total_xpages_r, total_ypages_r);
1057
1058 total_xpages_r = 0;
1059 total_ypages_r = 0;
1060 max_run_r = 0;
1061
1062 cmn_err(CE_WARN,
1063 "!sdbc(sd_start_io)(w) max_run %d, total_xp %d total yp %d",
1064 max_run_w, total_xpages_w, total_ypages_w);
1065
1066 total_xpages_w = 0;
1067 total_ypages_w = 0;
1068 max_run_w = 0;
1069 }
1070 #endif /* _SD_BIO_STATS */
1071
1072 if (ea_fn == _sd_async_ea) {
1073 DTRACE_PROBE(sd_start_io_end);
1074
1075 return (NSC_PENDING);
1076 }
1077
1078 mutex_enter(hook->lockp);
1079
1080 while (hook->count) {
1081 cv_wait(&hook->wait, hook->lockp);
1082 }
1083 mutex_exit(hook->lockp);
1084
1085 err = hook->error ? hook->error : NSC_DONE;
1086 bp = hook->tail;
1087 _sd_put_hook(hook);
1088 _sd_put_iobuf(bp);
1089
1090 return (err);
1091 }
1092
/*
 * _sd_sync_ea - called when a single i/o operation is complete. If this
 *	is the last outstanding i/o we wake up the sleeper.
 *	If this i/o had an error then we store the error result in the
 *	iob_hook if this was the first error.
 *
 * ARGUMENTS:
 *	bp - the struct buf describing the block i/o that just completed.
 *	hook - the iob_hook for the block of i/o that bp belongs to.
 *
 * Comments:
 *	This routine is called at interrupt level when the io is done.
 */
1105
1106 static int
1107 _sd_sync_ea(struct buf *bp, iob_hook_t *hook)
1108 {
1109
1110 int error;
1111 int done;
1112
	/*
	 * We get called for each buf that completes. When they are all done,
	 * we wake up the waiter.
	 */
1117 error = (bp->b_flags & B_ERROR) ? bp->b_error : 0;
1118
1119 mutex_enter(hook->lockp);
1120
1121 if (!hook->error)
1122 hook->error = error;
1123
1124 done = !(--hook->count);
1125 if (done) {
1126 /* remember the last buffer so we can free it later */
1127 hook->tail = bp;
1128 cv_signal(&hook->wait);
1129 }
1130 mutex_exit(hook->lockp);
1131
1132 /*
1133 * let sd_start_io free the final buffer so the hook can be returned
1134 * first.
1135 */
1136 if (!done)
1137 _sd_put_iobuf(bp);
1138
1139 return (0);
1140 }
1141
/*
 * _sd_async_ea - end action for async read/write.
 *
 * ARGUMENTS:
 *	bp - io buf pointer.
 *	hook - the iob_hook for the block of i/o that bp belongs to.
 *
 * RETURNS:
 *	0.
 *
 * Comments:
 *	This routine is called at interrupt level when the io is done.
 *	This is only called when the operation is asynchronous.
 */
1156 static int
1157 _sd_async_ea(struct buf *bp, iob_hook_t *hook)
1158 {
1159 int done, error;
1160
	/*
	 * We get called for each buf that completes. When they are all done,
	 * we call the requestor's callback function.
	 */
1165 error = (bp->b_flags & B_ERROR) ? bp->b_error : 0;
1166
1167 mutex_enter(hook->lockp);
1168 done = !(--hook->count);
1169
1170 if (!hook->error)
1171 hook->error = error;
1172
1173 mutex_exit(hook->lockp);
1174
1175 bp->b_forw = NULL;
1176 bp->b_back = NULL;
1177
1178 if (done) {
1179 nsc_off_t fba_pos;
1180 nsc_size_t fba_len;
1181 int error;
1182 sdbc_ea_fn_t fn;
1183 blind_t arg;
1184
1185 arg = hook->param;
1186 fn = hook->func;
1187 error = hook->error;
1188 #if defined(_SD_DEBUG) /* simulate disk errors */
		if (_test_async_fail == bp->b_edev)
			error = EIO;
1190 #endif
1191
1192 /* MAKE SURE b_lblkno, b_count never changes!! */
1193 fba_pos = hook->start_fba;
1194 fba_len = FBA_LEN(hook->size);
1195
1196 _sd_put_hook(hook);
1197 _sd_put_iobuf(bp);
1198 (*fn)(arg, fba_pos, fba_len, error);
1199 } else
1200 _sd_put_iobuf(bp);
1201
1202 return (0);
1203 }
1204
1205 #ifdef DEBUG
1206 typedef struct ioerr_inject_s {
1207 dev_t ioj_dev;
1208 int ioj_err;
1209 int ioj_cnt;
1210 } ioerr_inject_t;
1211
1212 static ioerr_inject_t *ioerr_inject_table = NULL;
1213
1214 void
1215 _sdbc_ioj_load()
1216 {
1217 ioerr_inject_table =
1218 kmem_zalloc(sdbc_max_devs * sizeof (ioerr_inject_t), KM_SLEEP);
1219 }
1220
1221 void
1222 _sdbc_ioj_unload()
1223 {
1224 if (ioerr_inject_table != NULL) {
1225 kmem_free(ioerr_inject_table,
1226 sdbc_max_devs * sizeof (ioerr_inject_t));
1227 ioerr_inject_table = NULL;
1228 }
1229 }
1230
1231 static int
1232 _sdbc_ioj_lookup(dev_t dev)
1233 {
1234 int cd;
1235
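	/*
	 * If an error is armed for this device, allow ioj_cnt more
	 * successful i/os before returning the injected error.
	 */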
1236 for (cd = 0; cd < sdbc_max_devs; ++cd)
1237 if (ioerr_inject_table[cd].ioj_dev == dev) {
1238 if (ioerr_inject_table[cd].ioj_cnt > 0) {
1239 --ioerr_inject_table[cd].ioj_cnt;
1240 return (0);
1241 } else {
1242 return (ioerr_inject_table[cd].ioj_err);
1243 }
1244 }
1245 return (0);
1246 }
1247
1248 void
1249 _sdbc_ioj_set_dev(int cd, dev_t crdev)
1250 {
1251 int i;
1252
1253 if (cd == -1) { /* all -- used for clearing table on shutdown */
1254 for (i = 0; i < sdbc_max_devs; ++i) {
1255 ioerr_inject_table[i].ioj_dev = crdev;
1256 }
1257 } else
1258 ioerr_inject_table[cd].ioj_dev = crdev; /* assume valid cd */
1259 }
1260
1261 static
1262 void
1263 _sdbc_ioj_set_err(int cd, int err, int count)
1264 {
1265 int i;
1266
1267 if (cd == -1) { /* all */
1268 for (i = 0; i < sdbc_max_devs; ++i) {
1269 ioerr_inject_table[i].ioj_err = err;
1270 ioerr_inject_table[i].ioj_cnt = count;
1271 }
1272 } else {
1273 ioerr_inject_table[cd].ioj_err = err;
1274 ioerr_inject_table[cd].ioj_cnt = count;
1275 }
1276 }
1277
1278 static void
1279 _sdbc_ioj_clear_err(int cd)
1280 {
1281 _sdbc_ioj_set_err(cd, 0, 0);
1282 }
1283
1284 int
1285 _sdbc_inject_ioerr(int cd, int ioj_err, int count)
1286 {
1287 if ((cd < -1) || (cd >= sdbc_max_devs))
1288 return (EINVAL);
1289
1290 _sdbc_ioj_set_err(cd, ioj_err, count);
1291
1292 return (0);
1293 }
1294
1295 int
1296 _sdbc_clear_ioerr(int cd)
1297 {
1298 if ((cd < -1) || (cd >= sdbc_max_devs))
1299 return (EINVAL);
1300
1301 _sdbc_ioj_clear_err(cd);
1302
1303 return (0);
1304 }
1305 #endif