1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2011 Bayard G. Bell. All rights reserved. 25 */ 26 27 #include <sys/param.h> 28 #include <sys/systm.h> 29 #include <sys/conf.h> 30 #include <sys/file.h> 31 #include <sys/user.h> 32 #include <sys/uio.h> 33 #include <sys/t_lock.h> 34 #include <sys/buf.h> 35 #include <sys/dkio.h> 36 #include <sys/vtoc.h> 37 #include <sys/kmem.h> 38 #include <vm/page.h> 39 #include <sys/cmn_err.h> 40 #include <sys/sysmacros.h> 41 #include <sys/types.h> 42 #include <sys/mkdev.h> 43 #include <sys/stat.h> 44 #include <sys/open.h> 45 #include <sys/modctl.h> 46 #include <sys/ddi.h> 47 #include <sys/sunddi.h> 48 #include <sys/debug.h> 49 #include <sys/dklabel.h> 50 #include <vm/hat.h> 51 #include <sys/lvm/mdvar.h> 52 #include <sys/lvm/md_mirror.h> 53 #include <sys/lvm/md_convert.h> 54 #include <sys/lvm/md_mddb.h> 55 #include <sys/esunddi.h> 56 57 #include <sys/sysevent/eventdefs.h> 58 #include <sys/sysevent/svm.h> 59 #include <sys/lvm/mdmn_commd.h> 60 #include <sys/avl.h> 61 62 md_ops_t mirror_md_ops; 63 #ifndef lint 64 md_ops_t *md_interface_ops = &mirror_md_ops; 65 #endif 66 67 extern mdq_anchor_t md_done_daemon; 68 extern mdq_anchor_t md_mstr_daemon; 69 extern mdq_anchor_t md_mirror_daemon; 70 extern mdq_anchor_t md_mirror_io_daemon; 71 extern mdq_anchor_t md_mirror_rs_daemon; 72 extern mdq_anchor_t md_mhs_daemon; 73 74 extern unit_t md_nunits; 75 extern set_t md_nsets; 76 extern md_set_t md_set[]; 77 78 extern int md_status; 79 extern clock_t md_hz; 80 81 extern md_krwlock_t md_unit_array_rw; 82 extern kmutex_t md_mx; 83 extern kcondvar_t md_cv; 84 extern int md_mtioctl_cnt; 85 86 daemon_request_t mirror_timeout; 87 static daemon_request_t hotspare_request; 88 static daemon_request_t mn_hs_request[MD_MAXSETS]; /* Multinode hs req */ 89 90 int md_mirror_mcs_buf_off; 91 92 /* Flags for mdmn_ksend_message to allow debugging */ 93 int md_mirror_msg_flags; 94 95 #ifdef DEBUG 96 /* Flag to switch on debug messages */ 97 int mirror_debug_flag = 0; 98 #endif 99 100 /* 101 * Struct used to hold count of DMR reads and the timestamp of last DMR read 102 * It is used to verify, using a debugger, that the DMR read ioctl has been 103 * executed. 104 */ 105 dmr_stats_t mirror_dmr_stats = {0}; 106 107 /* 108 * Mutex protecting list of non-failfast drivers. 109 */ 110 static kmutex_t non_ff_drv_mutex; 111 extern char **non_ff_drivers; 112 113 extern major_t md_major; 114 115 /* 116 * Write-On-Write memory pool. 
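 * Write-on-write buffers are used to take a stable snapshot of the caller's
 * data for a mirrored write when the originating buffer may be modified
 * while the submirror writes are still in flight, so that all submirrors
 * end up with identical contents.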
 */
static void		copy_write_cont(wowhdr_t *wowhdr);
static kmem_cache_t	*mirror_wowblk_cache = NULL;
static int		md_wowbuf_size = 16384;
static size_t		md_wowblk_size;

/*
 * This is a flag that allows:
 *	- disabling the write-on-write mechanism.
 *	- logging occurrences of write-on-write
 *	- switching wow handling procedure processing
 * Counter for occurrences of WOW.
 */
static uint_t	md_mirror_wow_flg = 0;
static int	md_mirror_wow_cnt = 0;

/*
 * Tunable to enable/disable dirty region
 * processing when closing down a mirror.
 */
static int	new_resync = 1;
kmem_cache_t	*mirror_parent_cache = NULL;
kmem_cache_t	*mirror_child_cache = NULL;

extern int	md_ff_disable;		/* disable failfast */

static int	mirror_map_write(mm_unit_t *, md_mcs_t *, md_mps_t *, int);
static void	mirror_read_strategy(buf_t *, int, void *);
static void	mirror_write_strategy(buf_t *, int, void *);
static void	become_owner(daemon_queue_t *);
static int	mirror_done(struct buf *cb);
static int	mirror_done_common(struct buf *cb);
static void	clear_retry_error(struct buf *cb);

/*
 * patchables
 */
int	md_min_rr_size = 200;	/* 2000 blocks, or 100k */
int	md_def_num_rr = 1000;	/* Default number of dirty regions */

/*
 * patchable to change delay before rescheduling mirror ownership request.
 * Value is clock ticks, default 0.5 seconds
 */
clock_t	md_mirror_owner_to = 500000;

/*ARGSUSED1*/
static int
mirror_parent_constructor(void *p, void *d1, int d2)
{
	mutex_init(&((md_mps_t *)p)->ps_mx, NULL, MUTEX_DEFAULT, NULL);
	return (0);
}

static void
mirror_parent_init(md_mps_t *ps)
{
	bzero(ps, offsetof(md_mps_t, ps_mx));
	bzero(&ps->ps_overlap_node, sizeof (avl_node_t));
}

/*ARGSUSED1*/
static void
mirror_parent_destructor(void *p, void *d)
{
	mutex_destroy(&((md_mps_t *)p)->ps_mx);
}

/*ARGSUSED1*/
static int
mirror_child_constructor(void *p, void *d1, int d2)
{
	bioinit(&((md_mcs_t *)p)->cs_buf);
	return (0);
}

void
mirror_child_init(md_mcs_t *cs)
{
	cs->cs_ps = NULL;
	cs->cs_mdunit = 0;
	md_bioreset(&cs->cs_buf);
}

/*ARGSUSED1*/
static void
mirror_child_destructor(void *p, void *d)
{
	biofini(&((md_mcs_t *)p)->cs_buf);
}

static void
mirror_wowblk_init(wowhdr_t *p)
{
	bzero(p, md_wowblk_size);
}

static void
send_poke_hotspares_msg(daemon_request_t *drq)
{
	int			rval;
	int			nretries = 0;
	md_mn_msg_pokehsp_t	pokehsp;
	md_mn_kresult_t		*kresult;
	set_t			setno = (set_t)drq->dq.qlen;

	pokehsp.pokehsp_setno = setno;

	kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);

retry_sphmsg:
	rval = mdmn_ksend_message(setno, MD_MN_MSG_POKE_HOTSPARES,
	    MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST, 0, (char *)&pokehsp,
	    sizeof (pokehsp), kresult);

	if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
		mdmn_ksend_show_error(rval, kresult, "POKE_HOTSPARES");
		/* If we're shutting down already, pause things here. */
		if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
			while (!md_mn_is_commd_present()) {
				delay(md_hz);
			}
			/*
			 * commd has become reachable again, so retry once.
			 * If this fails we'll panic as the system is in an
			 * unexpected state.
			 */
			if (nretries++ == 0)
				goto retry_sphmsg;
		}
		cmn_err(CE_PANIC,
		    "ksend_message failure: POKE_HOTSPARES");
	}
	kmem_free(kresult, sizeof (md_mn_kresult_t));

	/* Allow further requests to use this set's queue structure */
	mutex_enter(&drq->dr_mx);
	drq->dr_pending = 0;
	mutex_exit(&drq->dr_mx);
}
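/*
 * Note: the set number for the poke_hotspares message is carried to
 * send_poke_hotspares_msg() in the daemon request's dq.qlen field (set in
 * send_poke_hotspares() below), since daemon_request() only passes the
 * queue element itself to the handler.
 */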
/*
 * Send a poke_hotspares message to the master node. To avoid swamping the
 * commd handler with requests we only send a message if there is not one
 * already outstanding. We punt the request to a separate thread context as
 * we cannot afford to block waiting on the request to be serviced. This is
 * essential when a reconfig cycle is in progress as any open() of a multinode
 * metadevice may result in a livelock.
 */
static void
send_poke_hotspares(set_t setno)
{
	daemon_request_t	*drq = &mn_hs_request[setno];

	mutex_enter(&drq->dr_mx);
	if (drq->dr_pending == 0) {
		drq->dr_pending = 1;
		drq->dq.qlen = (int)setno;
		daemon_request(&md_mhs_daemon,
		    send_poke_hotspares_msg, (daemon_queue_t *)drq, REQ_OLD);
	}
	mutex_exit(&drq->dr_mx);
}

void
mirror_set_sm_state(
	mm_submirror_t		*sm,
	mm_submirror_ic_t	*smic,
	sm_state_t		newstate,
	int			force)
{
	int			compcnt;
	int			i;
	int			errcnt;
	sm_state_t		origstate;
	md_m_shared_t		*shared;

	if (force) {
		sm->sm_state = newstate;
		uniqtime32(&sm->sm_timestamp);
		return;
	}

	origstate = newstate;

	compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
	for (i = 0, errcnt = 0; i < compcnt; i++) {
		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
		    (sm->sm_dev, sm, i);
		if (shared->ms_state & (CS_ERRED | CS_LAST_ERRED))
			newstate |= SMS_COMP_ERRED;
		if (shared->ms_state & (CS_RESYNC))
			newstate |= SMS_COMP_RESYNC;
		if (shared->ms_state & CS_ERRED)
			errcnt++;
	}

	if ((newstate & (SMS_COMP_ERRED | SMS_COMP_RESYNC)) != 0)
		newstate &= ~origstate;

	if (errcnt == compcnt)
		newstate |= SMS_ALL_ERRED;
	else
		newstate &= ~SMS_ALL_ERRED;

	sm->sm_state = newstate;
	uniqtime32(&sm->sm_timestamp);
}

static int
mirror_geterror(mm_unit_t *un, int *smi, int *cip, int clr_error,
	int frm_probe)
{
	mm_submirror_t		*sm;
	mm_submirror_ic_t	*smic;
	md_m_shared_t		*shared;
	int			ci;
	int			i;
	int			compcnt;
	int			open_comp; /* flag for open component */

	for (i = *smi; i < NMIRROR; i++) {
		sm = &un->un_sm[i];
		smic = &un->un_smic[i];

		if (!SMS_IS(sm, SMS_INUSE))
			continue;

		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
		for (ci = *cip; ci < compcnt; ci++) {
			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
			    (sm->sm_dev, sm, ci);
			/*
			 * if called from any routine but probe, we check for
			 * MDM_S_ISOPEN flag. Since probe does a pseudo open,
			 * it sets MDM_S_PROBEOPEN flag and we test for this
			 * flag. The two tests are mutually exclusive.
			 */
			open_comp = (frm_probe) ?
356 (shared->ms_flags & MDM_S_PROBEOPEN): 357 (shared->ms_flags & MDM_S_ISOPEN); 358 if (((shared->ms_flags & MDM_S_IOERR || !open_comp) && 359 ((shared->ms_state == CS_OKAY) || 360 (shared->ms_state == CS_RESYNC))) || 361 (!open_comp && 362 (shared->ms_state == CS_LAST_ERRED))) { 363 if (clr_error) { 364 shared->ms_flags &= ~MDM_S_IOERR; 365 } 366 *cip = ci; 367 *smi = i; 368 return (1); 369 } 370 371 if (clr_error && (shared->ms_flags & MDM_S_IOERR)) { 372 shared->ms_flags &= ~MDM_S_IOERR; 373 } 374 } 375 376 *cip = 0; 377 } 378 return (0); 379 } 380 381 /*ARGSUSED*/ 382 static void 383 mirror_run_queue(void *d) 384 { 385 if (!(md_status & MD_GBL_DAEMONS_LIVE)) 386 md_daemon(1, &md_done_daemon); 387 } 388 /* 389 * check_comp_4_hotspares 390 * 391 * This function attempts to allocate a hotspare for this component if the 392 * component is in error. In a MN set, the function can be called in 2 modes. 393 * It can be called either when a component error has been detected or when a 394 * new hotspare has been allocated. In this case, MD_HOTSPARE_XMIT is set 395 * in flags and the request is sent to all nodes. 396 * The handler on each of the nodes then calls this function with 397 * MD_HOTSPARE_XMIT unset and the hotspare allocation is then performed. 398 * 399 * For non-MN sets the function simply attempts to allocate a hotspare. 400 * 401 * On entry, the following locks are held 402 * mirror_md_ops.md_link_rw (if flags has MD_HOTSPARE_LINKHELD set) 403 * md_unit_writerlock 404 * 405 * Returns 0 if ok 406 * 1 if the unit containing the component has been cleared while 407 * the mdmn_ksend_message() was being executed 408 */ 409 extern int 410 check_comp_4_hotspares( 411 mm_unit_t *un, 412 int smi, 413 int ci, 414 uint_t flags, 415 mddb_recid_t hs_id, /* Only used by MN disksets */ 416 IOLOCK *lockp /* can be NULL */ 417 ) 418 { 419 mm_submirror_t *sm; 420 mm_submirror_ic_t *smic; 421 md_m_shared_t *shared; 422 mddb_recid_t recids[6]; 423 minor_t mnum; 424 intptr_t (*hs_dev)(); 425 void (*hs_done)(); 426 void *hs_data; 427 md_error_t mde = mdnullerror; 428 set_t setno; 429 md_mn_msg_allochsp_t allochspmsg; 430 md_mn_kresult_t *kresult; 431 mm_unit_t *new_un; 432 int rval; 433 int nretries = 0; 434 435 mnum = MD_SID(un); 436 setno = MD_UN2SET(un); 437 sm = &un->un_sm[smi]; 438 smic = &un->un_smic[smi]; 439 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) 440 (sm->sm_dev, sm, ci); 441 442 if (shared->ms_state != CS_ERRED) 443 return (0); 444 445 /* Don't start a new component resync if a resync is already running. */ 446 if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) 447 return (0); 448 449 if (MD_MNSET_SETNO(setno) && (flags & MD_HOTSPARE_XMIT)) { 450 uint_t msgflags; 451 md_mn_msgtype_t msgtype; 452 453 /* Send allocate hotspare message to all nodes */ 454 455 allochspmsg.msg_allochsp_mnum = un->c.un_self_id; 456 allochspmsg.msg_allochsp_sm = smi; 457 allochspmsg.msg_allochsp_comp = ci; 458 allochspmsg.msg_allochsp_hs_id = shared->ms_hs_id; 459 460 /* 461 * Before calling mdmn_ksend_message(), release locks 462 * Can never be in the context of an ioctl. 
463 */ 464 md_unit_writerexit(MDI_UNIT(mnum)); 465 if (flags & MD_HOTSPARE_LINKHELD) 466 rw_exit(&mirror_md_ops.md_link_rw.lock); 467 #ifdef DEBUG 468 if (mirror_debug_flag) 469 printf("send alloc hotspare, flags=" 470 "0x%x %x, %x, %x, %x\n", flags, 471 allochspmsg.msg_allochsp_mnum, 472 allochspmsg.msg_allochsp_sm, 473 allochspmsg.msg_allochsp_comp, 474 allochspmsg.msg_allochsp_hs_id); 475 #endif 476 if (flags & MD_HOTSPARE_WMUPDATE) { 477 msgtype = MD_MN_MSG_ALLOCATE_HOTSPARE2; 478 /* 479 * When coming from an update of watermarks, there 480 * must already be a message logged that triggered 481 * this action. So, no need to log this message, too. 482 */ 483 msgflags = MD_MSGF_NO_LOG; 484 } else { 485 msgtype = MD_MN_MSG_ALLOCATE_HOTSPARE; 486 msgflags = MD_MSGF_DEFAULT_FLAGS; 487 } 488 489 kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); 490 491 cc4hs_msg: 492 rval = mdmn_ksend_message(setno, msgtype, msgflags, 0, 493 (char *)&allochspmsg, sizeof (allochspmsg), 494 kresult); 495 496 if (!MDMN_KSEND_MSG_OK(rval, kresult)) { 497 #ifdef DEBUG 498 if (mirror_debug_flag) 499 mdmn_ksend_show_error(rval, kresult, 500 "ALLOCATE HOTSPARE"); 501 #endif 502 /* 503 * If message is sent ok but exitval indicates an error 504 * it must be because the mirror has been cleared. In 505 * this case re-obtain lock and return an error 506 */ 507 if ((rval == 0) && (kresult->kmmr_exitval != 0)) { 508 if (flags & MD_HOTSPARE_LINKHELD) { 509 rw_enter(&mirror_md_ops.md_link_rw.lock, 510 RW_READER); 511 } 512 kmem_free(kresult, sizeof (md_mn_kresult_t)); 513 return (1); 514 } 515 /* If we're shutting down already, pause things here. */ 516 if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) { 517 while (!md_mn_is_commd_present()) { 518 delay(md_hz); 519 } 520 /* 521 * commd has become reachable again, so retry 522 * once. If this fails we'll panic as the 523 * system is in an unexpected state. 524 */ 525 if (nretries++ == 0) 526 goto cc4hs_msg; 527 } 528 cmn_err(CE_PANIC, 529 "ksend_message failure: ALLOCATE_HOTSPARE"); 530 } 531 kmem_free(kresult, sizeof (md_mn_kresult_t)); 532 533 /* 534 * re-obtain the locks 535 */ 536 if (flags & MD_HOTSPARE_LINKHELD) 537 rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER); 538 new_un = md_unit_writerlock(MDI_UNIT(mnum)); 539 540 /* 541 * As we had to release the locks in order to send the 542 * message to all nodes, we need to check to see if the 543 * unit has changed. If it has we release the writerlock 544 * and return fail. 545 */ 546 if ((new_un != un) || (un->c.un_type != MD_METAMIRROR)) { 547 md_unit_writerexit(MDI_UNIT(mnum)); 548 return (1); 549 } 550 } else { 551 if (MD_MNSET_SETNO(setno)) { 552 /* 553 * If 2 or more nodes simultaneously see a 554 * component failure, these nodes will each 555 * send an ALLOCATE_HOTSPARE[2] message. 556 * The first message will allocate the hotspare 557 * and the subsequent messages should do nothing. 558 * 559 * If a slave node doesn't have a hotspare allocated 560 * at the time the message is initiated, then the 561 * passed in hs_id will be 0. If the node 562 * executing this routine has a component shared 563 * ms_hs_id of non-zero, but the message shows a 564 * hs_id of 0, then just return since a hotspare 565 * has already been allocated for this failing 566 * component. When the slave node returns from 567 * the ksend_message the hotspare will have 568 * already been allocated. 
569 * 570 * If the slave node does send an hs_id of non-zero, 571 * and the slave node's hs_id matches this node's 572 * ms_hs_id, then the hotspare has error'd and 573 * should be replaced. 574 * 575 * If the slave node sends an hs_id of non-zero and 576 * this node has a different shared ms_hs_id, then 577 * just return since this hotspare has already 578 * been hotspared. 579 */ 580 if (shared->ms_hs_id != 0) { 581 if (hs_id == 0) { 582 #ifdef DEBUG 583 if (mirror_debug_flag) { 584 printf("check_comp_4_hotspares" 585 "(NOXMIT), short circuit " 586 "hs_id=0x%x, " 587 "ms_hs_id=0x%x\n", 588 hs_id, shared->ms_hs_id); 589 } 590 #endif 591 return (0); 592 } 593 if (hs_id != shared->ms_hs_id) { 594 #ifdef DEBUG 595 if (mirror_debug_flag) { 596 printf("check_comp_4_hotspares" 597 "(NOXMIT), short circuit2 " 598 "hs_id=0x%x, " 599 "ms_hs_id=0x%x\n", 600 hs_id, shared->ms_hs_id); 601 } 602 #endif 603 return (0); 604 } 605 } 606 } 607 608 sm = &un->un_sm[smi]; 609 hs_dev = md_get_named_service(sm->sm_dev, 0, 610 "hotspare device", 0); 611 if ((*hs_dev)(sm->sm_dev, 0, ci, recids, 6, &hs_done, 612 &hs_data) != 0) 613 return (0); 614 615 /* 616 * set_sm_comp_state() commits the modified records. 617 * As we don't transmit the changes, no need to drop the lock. 618 */ 619 set_sm_comp_state(un, smi, ci, CS_RESYNC, recids, 620 MD_STATE_NO_XMIT, (IOLOCK *)NULL); 621 622 (*hs_done)(sm->sm_dev, hs_data); 623 624 mirror_check_failfast(mnum); 625 626 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_HOTSPARED, SVM_TAG_METADEVICE, 627 setno, MD_SID(un)); 628 629 /* 630 * For a multi-node set we need to reset the un_rs_type, 631 * un_rs_resync_done and un_rs_resync_2_do fields as the 632 * hot-spare resync must copy all applicable data. 633 */ 634 if (MD_MNSET_SETNO(setno)) { 635 un->un_rs_type = MD_RS_NONE; 636 un->un_rs_resync_done = 0; 637 un->un_rs_resync_2_do = 0; 638 } 639 640 /* 641 * Must drop writer lock since mirror_resync_unit will 642 * open devices and must be able to grab readerlock. 643 * Don't need to drop IOLOCK since any descendent routines 644 * calling ksend_messages will drop the IOLOCK as needed. 645 * 646 */ 647 if (lockp) { 648 md_ioctl_writerexit(lockp); 649 } else { 650 md_unit_writerexit(MDI_UNIT(mnum)); 651 } 652 653 /* start resync */ 654 (void) mirror_resync_unit(mnum, NULL, &mde, lockp); 655 656 if (lockp) { 657 new_un = md_ioctl_writerlock(lockp, MDI_UNIT(mnum)); 658 } else { 659 new_un = md_unit_writerlock(MDI_UNIT(mnum)); 660 } 661 } 662 return (0); 663 } 664 665 /* 666 * check_unit_4_hotspares 667 * 668 * For a given mirror, allocate hotspares, if available for any components 669 * that are in error 670 * 671 * Returns 0 if ok 672 * 1 if check_comp_4_hotspares returns non-zero. This will only 673 * happen for a MN unit where the unit has been cleared while 674 * the allocate hotspare message is sent to all nodes. 675 */ 676 static int 677 check_unit_4_hotspares(mm_unit_t *un, int flags) 678 { 679 mm_submirror_t *sm; 680 mm_submirror_ic_t *smic; 681 int ci; 682 int i; 683 int compcnt; 684 685 if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) 686 return (0); 687 688 for (i = 0; i < NMIRROR; i++) { 689 sm = &un->un_sm[i]; 690 smic = &un->un_smic[i]; 691 if (!SMS_IS(sm, SMS_INUSE)) 692 continue; 693 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, sm); 694 for (ci = 0; ci < compcnt; ci++) { 695 md_m_shared_t *shared; 696 697 shared = (md_m_shared_t *) 698 (*(smic->sm_shared_by_indx))(sm->sm_dev, sm, ci); 699 /* 700 * Never called from ioctl context, so pass in 701 * (IOLOCK *)NULL. 
Pass through flags from calling
			 * routine, also setting XMIT flag.
			 */
			if (check_comp_4_hotspares(un, i, ci,
			    (MD_HOTSPARE_XMIT | flags),
			    shared->ms_hs_id, (IOLOCK *)NULL) != 0)
				return (1);
		}
	}
	return (0);
}

static void
check_4_hotspares(daemon_request_t *drq)
{
	mdi_unit_t	*ui;
	mm_unit_t	*un;
	md_link_t	*next;
	int		x;

	mutex_enter(&drq->dr_mx);	/* clear up front so can poke */
	drq->dr_pending = 0;		/* again in low level routine if */
	mutex_exit(&drq->dr_mx);	/* something found to do */

	/*
	 * Used to have a problem here. The disksets weren't marked as being
	 * MNHOLD. This opened a window where we could be searching for
	 * hotspares and have the disk set unloaded (released) from under
	 * us causing a panic in stripe_component_count().
	 * The way to prevent that is to mark the set MNHOLD which prevents
	 * any diskset from being released while we are scanning the mirrors,
	 * submirrors and components.
	 */

	for (x = 0; x < md_nsets; x++)
		md_holdset_enter(x);

	rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
	for (next = mirror_md_ops.md_head; next != NULL; next = next->ln_next) {
		ui = MDI_UNIT(next->ln_id);

		un = (mm_unit_t *)md_unit_readerlock(ui);

		/*
		 * Only check the unit if we are the master for this set
		 * For an MN set, poke_hotspares() is only effective on the
		 * master
		 */
		if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
		    md_set[MD_UN2SET(un)].s_am_i_master == 0) {
			md_unit_readerexit(ui);
			continue;
		}
		if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) {
			md_unit_readerexit(ui);
			continue;
		}
		md_unit_readerexit(ui);

		un = (mm_unit_t *)md_unit_writerlock(ui);
		/*
		 * check_unit_4_hotspares will exit 1 if the unit has been
		 * removed during the process of allocating the hotspare.
		 * This can only happen for a MN metadevice. If unit no longer
		 * exists, no need to release writerlock
		 */
		if (check_unit_4_hotspares(un, MD_HOTSPARE_LINKHELD) == 0)
			md_unit_writerexit(ui);
		else {
			/*
			 * If check_unit_4_hotspares failed, queue another
			 * request and break out of this one
			 */
			(void) poke_hotspares();
			break;
		}
	}
	rw_exit(&mirror_md_ops.md_link_rw.lock);

	for (x = 0; x < md_nsets; x++)
		md_holdset_exit(x);
}

/*
 * poke_hotspares
 *
 * If there is not a poke_hotspares request already pending, queue a request
 * to call check_4_hotspares(). This will scan all mirrors and attempt to
 * allocate hotspares for all components in error.
 */
int
poke_hotspares()
{
	mutex_enter(&hotspare_request.dr_mx);
	if (hotspare_request.dr_pending == 0) {
		hotspare_request.dr_pending = 1;
		daemon_request(&md_mhs_daemon,
		    check_4_hotspares, (daemon_queue_t *)&hotspare_request,
		    REQ_OLD);
	}
	mutex_exit(&hotspare_request.dr_mx);
	return (0);
}

static void
free_all_ecomps(err_comp_t *ecomp)
{
	err_comp_t	*d;

	while (ecomp != NULL) {
		d = ecomp;
		ecomp = ecomp->ec_next;
		kmem_free(d, sizeof (err_comp_t));
	}
}

/*
 * NAME: mirror_openfail_console_info
 *
 * DESCRIPTION: Prints an informative message to the console when mirror
 *		cannot be opened.
822 * 823 * PARAMETERS: mm_unit_t un - pointer to mirror unit structure 824 * int smi - submirror index 825 * int ci - component index 826 */ 827 828 void 829 mirror_openfail_console_info(mm_unit_t *un, int smi, int ci) 830 { 831 void (*get_dev)(); 832 ms_cd_info_t cd; 833 md_dev64_t tmpdev; 834 835 tmpdev = un->un_sm[smi].sm_dev; 836 get_dev = (void (*)())md_get_named_service(tmpdev, 0, "get device", 0); 837 if (get_dev != NULL) { 838 (void) (*get_dev)(tmpdev, smi, ci, &cd); 839 cmn_err(CE_WARN, "md %s: open error on %s", 840 md_shortname(MD_SID(un)), md_devname(MD_UN2SET(un), 841 cd.cd_dev, NULL, 0)); 842 } else { 843 cmn_err(CE_WARN, "md %s: open error", 844 md_shortname(MD_SID(un))); 845 } 846 } 847 848 static int 849 mirror_close_all_devs(mm_unit_t *un, int md_cflags) 850 { 851 int i; 852 md_dev64_t dev; 853 854 for (i = 0; i < NMIRROR; i++) { 855 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) 856 continue; 857 dev = un->un_sm[i].sm_dev; 858 md_layered_close(dev, md_cflags); 859 } 860 return (0); 861 } 862 863 /* 864 * Keep track of drivers that don't support failfast. We use this so that 865 * we only log one diagnostic message for each of these drivers, no matter 866 * how many times we run the mirror_check_failfast function. 867 * Return 1 if this is a new driver that does not support failfast, 868 * return 0 if we have already seen this non-failfast driver. 869 */ 870 static int 871 new_non_ff_driver(const char *s) 872 { 873 mutex_enter(&non_ff_drv_mutex); 874 if (non_ff_drivers == NULL) { 875 non_ff_drivers = (char **)kmem_alloc(2 * sizeof (char *), 876 KM_NOSLEEP); 877 if (non_ff_drivers == NULL) { 878 mutex_exit(&non_ff_drv_mutex); 879 return (1); 880 } 881 882 non_ff_drivers[0] = (char *)kmem_alloc(strlen(s) + 1, 883 KM_NOSLEEP); 884 if (non_ff_drivers[0] == NULL) { 885 kmem_free(non_ff_drivers, 2 * sizeof (char *)); 886 non_ff_drivers = NULL; 887 mutex_exit(&non_ff_drv_mutex); 888 return (1); 889 } 890 891 (void) strcpy(non_ff_drivers[0], s); 892 non_ff_drivers[1] = NULL; 893 894 } else { 895 int i; 896 char **tnames; 897 char **tmp; 898 899 for (i = 0; non_ff_drivers[i] != NULL; i++) { 900 if (strcmp(s, non_ff_drivers[i]) == 0) { 901 mutex_exit(&non_ff_drv_mutex); 902 return (0); 903 } 904 } 905 906 /* allow for new element and null */ 907 i += 2; 908 tnames = (char **)kmem_alloc(i * sizeof (char *), KM_NOSLEEP); 909 if (tnames == NULL) { 910 mutex_exit(&non_ff_drv_mutex); 911 return (1); 912 } 913 914 for (i = 0; non_ff_drivers[i] != NULL; i++) 915 tnames[i] = non_ff_drivers[i]; 916 917 tnames[i] = (char *)kmem_alloc(strlen(s) + 1, KM_NOSLEEP); 918 if (tnames[i] == NULL) { 919 /* adjust i so that it is the right count to free */ 920 kmem_free(tnames, (i + 2) * sizeof (char *)); 921 mutex_exit(&non_ff_drv_mutex); 922 return (1); 923 } 924 925 (void) strcpy(tnames[i++], s); 926 tnames[i] = NULL; 927 928 tmp = non_ff_drivers; 929 non_ff_drivers = tnames; 930 /* i now represents the count we previously alloced */ 931 kmem_free(tmp, i * sizeof (char *)); 932 } 933 mutex_exit(&non_ff_drv_mutex); 934 935 return (1); 936 } 937 938 /* 939 * Check for the "ddi-failfast-supported" devtree property on each submirror 940 * component to indicate if we should do I/O to that submirror with the 941 * B_FAILFAST flag set or not. This check is made at various state transitions 942 * in the mirror code (e.g. open, enable, hotspare, etc.). Sometimes we 943 * only need to check one drive (e.g. 
hotspare) but since the check is
 * fast and infrequent and sometimes needs to be done on all components we
 * just check all components on each call.
 */
void
mirror_check_failfast(minor_t mnum)
{
	int		i;
	mm_unit_t	*un;

	if (md_ff_disable)
		return;

	un = MD_UNIT(mnum);

	for (i = 0; i < NMIRROR; i++) {
		int			ci;
		int			cnt;
		int			ff = 1;
		mm_submirror_t		*sm;
		mm_submirror_ic_t	*smic;
		void			(*get_dev)();

		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
			continue;

		sm = &un->un_sm[i];
		smic = &un->un_smic[i];

		get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
		    "get device", 0);

		cnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
		for (ci = 0; ci < cnt; ci++) {
			int		found = 0;
			dev_t		ci_dev;
			major_t		major;
			dev_info_t	*devi;
			ms_cd_info_t	cd;

			/*
			 * this already returns the hs
			 * dev if the device is spared
			 */
			(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);

			ci_dev = md_dev64_to_dev(cd.cd_dev);
			major = getmajor(ci_dev);

			if (major == md_major) {
				/*
				 * this component must be a soft
				 * partition; get the real dev
				 */
				minor_t		dev_mnum;
				mdi_unit_t	*ui;
				mp_unit_t	*un;
				set_t		setno;
				side_t		side;
				md_dev64_t	tmpdev;

				ui = MDI_UNIT(getminor(ci_dev));

				/* grab necessary lock */
				un = (mp_unit_t *)md_unit_readerlock(ui);

				dev_mnum = MD_SID(un);
				setno = MD_MIN2SET(dev_mnum);
				side = mddb_getsidenum(setno);

				tmpdev = un->un_dev;

				/* Get dev by device id */
				if (md_devid_found(setno, side,
				    un->un_key) == 1) {
					tmpdev = md_resolve_bydevid(dev_mnum,
					    tmpdev, un->un_key);
				}

				md_unit_readerexit(ui);

				ci_dev = md_dev64_to_dev(tmpdev);
				major = getmajor(ci_dev);
			}

			if (ci_dev != NODEV32 &&
			    (devi = e_ddi_hold_devi_by_dev(ci_dev, 0))
			    != NULL) {
				ddi_prop_op_t	prop_op = PROP_LEN_AND_VAL_BUF;
				int		propvalue = 0;
				int		proplength = sizeof (int);
				int		error;
				struct cb_ops	*cb;

				if ((cb = devopsp[major]->devo_cb_ops) !=
				    NULL) {
					error = (*cb->cb_prop_op)
					    (DDI_DEV_T_ANY, devi, prop_op,
					    DDI_PROP_NOTPROM|DDI_PROP_DONTPASS,
					    "ddi-failfast-supported",
					    (caddr_t)&propvalue, &proplength);

					if (error == DDI_PROP_SUCCESS)
						found = 1;
				}

				if (!found && new_non_ff_driver(
				    ddi_driver_name(devi))) {
					cmn_err(CE_NOTE, "!md: B_FAILFAST I/O "
					    "disabled on %s",
					    ddi_driver_name(devi));
				}

				ddi_release_devi(devi);
			}

			/*
			 * All components must support
			 * failfast in the submirror.
			 */
			if (!found) {
				ff = 0;
				break;
			}
		}

		if (ff) {
			sm->sm_flags |= MD_SM_FAILFAST;
		} else {
			sm->sm_flags &= ~MD_SM_FAILFAST;
		}
	}
}

/*
 * Return true if the submirror is unavailable.
 * If any of the submirror components are opened then the submirror cannot
 * be unavailable (MD_INACCESSIBLE).
 * If any of the components are already in the errored state, then the
 * submirror cannot be unavailable (MD_INACCESSIBLE).
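 * In other words, a submirror is only considered unavailable when none of
 * its components could be opened and none of them already have an error
 * recorded against them.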
1083 */ 1084 static bool_t 1085 submirror_unavailable(mm_unit_t *un, int smi, int from_probe) 1086 { 1087 mm_submirror_t *sm; 1088 mm_submirror_ic_t *smic; 1089 md_m_shared_t *shared; 1090 int ci; 1091 int compcnt; 1092 1093 sm = &un->un_sm[smi]; 1094 smic = &un->un_smic[smi]; 1095 1096 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un); 1097 for (ci = 0; ci < compcnt; ci++) { 1098 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) 1099 (sm->sm_dev, sm, ci); 1100 if (from_probe) { 1101 if (shared->ms_flags & MDM_S_PROBEOPEN) 1102 return (B_FALSE); 1103 } else { 1104 if (shared->ms_flags & MDM_S_ISOPEN) 1105 return (B_FALSE); 1106 } 1107 if (shared->ms_state == CS_ERRED || 1108 shared->ms_state == CS_LAST_ERRED) 1109 return (B_FALSE); 1110 } 1111 1112 return (B_TRUE); 1113 } 1114 1115 static int 1116 mirror_open_all_devs(minor_t mnum, int md_oflags, IOLOCK *lockp) 1117 { 1118 int i; 1119 mm_unit_t *un; 1120 mdi_unit_t *ui; 1121 int err; 1122 int smi; 1123 int ci; 1124 err_comp_t *c; 1125 err_comp_t *ecomps = NULL; 1126 int smmask = 0; 1127 set_t setno; 1128 int sm_cnt; 1129 int sm_unavail_cnt; 1130 1131 mirror_check_failfast(mnum); 1132 1133 un = MD_UNIT(mnum); 1134 ui = MDI_UNIT(mnum); 1135 setno = MD_UN2SET(un); 1136 1137 for (i = 0; i < NMIRROR; i++) { 1138 md_dev64_t tmpdev = un->un_sm[i].sm_dev; 1139 1140 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) 1141 continue; 1142 if (md_layered_open(mnum, &tmpdev, md_oflags)) 1143 smmask |= SMI2BIT(i); 1144 un->un_sm[i].sm_dev = tmpdev; 1145 } 1146 1147 /* 1148 * If smmask is clear, all submirrors are accessible. Clear the 1149 * MD_INACCESSIBLE bit in this case. This bit is also cleared for the 1150 * mirror device. If smmask is set, we have to determine which of the 1151 * submirrors are in error. If no submirror is accessible we mark the 1152 * whole mirror as MD_INACCESSIBLE. 1153 */ 1154 if (smmask == 0) { 1155 if (lockp) { 1156 md_ioctl_readerexit(lockp); 1157 (void) md_ioctl_writerlock(lockp, ui); 1158 } else { 1159 md_unit_readerexit(ui); 1160 (void) md_unit_writerlock(ui); 1161 } 1162 ui->ui_tstate &= ~MD_INACCESSIBLE; 1163 if (lockp) { 1164 md_ioctl_writerexit(lockp); 1165 (void) md_ioctl_readerlock(lockp, ui); 1166 } else { 1167 md_unit_writerexit(ui); 1168 (void) md_unit_readerlock(ui); 1169 } 1170 1171 for (i = 0; i < NMIRROR; i++) { 1172 md_dev64_t tmpdev; 1173 mdi_unit_t *sm_ui; 1174 1175 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) 1176 continue; 1177 1178 tmpdev = un->un_sm[i].sm_dev; 1179 sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev))); 1180 (void) md_unit_writerlock(sm_ui); 1181 sm_ui->ui_tstate &= ~MD_INACCESSIBLE; 1182 md_unit_writerexit(sm_ui); 1183 } 1184 1185 return (0); 1186 } 1187 1188 for (i = 0; i < NMIRROR; i++) { 1189 md_dev64_t tmpdev; 1190 1191 if (!(smmask & SMI2BIT(i))) 1192 continue; 1193 1194 tmpdev = un->un_sm[i].sm_dev; 1195 err = md_layered_open(mnum, &tmpdev, MD_OFLG_CONT_ERRS); 1196 un->un_sm[i].sm_dev = tmpdev; 1197 ASSERT(err == 0); 1198 } 1199 1200 if (lockp) { 1201 md_ioctl_readerexit(lockp); 1202 un = (mm_unit_t *)md_ioctl_writerlock(lockp, ui); 1203 } else { 1204 md_unit_readerexit(ui); 1205 un = (mm_unit_t *)md_unit_writerlock(ui); 1206 } 1207 1208 /* 1209 * We want to make sure the unavailable flag is not masking a real 1210 * error on the submirror. 1211 * For each submirror, 1212 * if all of the submirror components couldn't be opened and there 1213 * are no errors on the submirror, then set the unavailable flag 1214 * otherwise, clear unavailable. 
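 * sm_cnt below counts the in-use submirrors and sm_unavail_cnt those marked
 * unavailable; if the two counts match, the mirror itself is flagged
 * MD_INACCESSIBLE.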
1215 */ 1216 sm_cnt = 0; 1217 sm_unavail_cnt = 0; 1218 for (i = 0; i < NMIRROR; i++) { 1219 md_dev64_t tmpdev; 1220 mdi_unit_t *sm_ui; 1221 1222 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) 1223 continue; 1224 1225 sm_cnt++; 1226 tmpdev = un->un_sm[i].sm_dev; 1227 sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev))); 1228 1229 (void) md_unit_writerlock(sm_ui); 1230 if (submirror_unavailable(un, i, 0)) { 1231 sm_ui->ui_tstate |= MD_INACCESSIBLE; 1232 sm_unavail_cnt++; 1233 } else { 1234 sm_ui->ui_tstate &= ~MD_INACCESSIBLE; 1235 } 1236 md_unit_writerexit(sm_ui); 1237 } 1238 1239 /* 1240 * If all of the submirrors are unavailable, the mirror is also 1241 * unavailable. 1242 */ 1243 if (sm_cnt == sm_unavail_cnt) { 1244 ui->ui_tstate |= MD_INACCESSIBLE; 1245 } else { 1246 ui->ui_tstate &= ~MD_INACCESSIBLE; 1247 } 1248 1249 smi = 0; 1250 ci = 0; 1251 while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) { 1252 if (mirror_other_sources(un, smi, ci, 1) == 1) { 1253 1254 free_all_ecomps(ecomps); 1255 (void) mirror_close_all_devs(un, md_oflags); 1256 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, 1257 SVM_TAG_METADEVICE, setno, MD_SID(un)); 1258 mirror_openfail_console_info(un, smi, ci); 1259 if (lockp) { 1260 md_ioctl_writerexit(lockp); 1261 (void) md_ioctl_readerlock(lockp, ui); 1262 } else { 1263 md_unit_writerexit(ui); 1264 (void) md_unit_readerlock(ui); 1265 } 1266 return (ENXIO); 1267 } 1268 1269 /* track all component states that need changing */ 1270 c = (err_comp_t *)kmem_alloc(sizeof (err_comp_t), KM_SLEEP); 1271 c->ec_next = ecomps; 1272 c->ec_smi = smi; 1273 c->ec_ci = ci; 1274 ecomps = c; 1275 ci++; 1276 } 1277 1278 /* Make all state changes and commit them */ 1279 for (c = ecomps; c != NULL; c = c->ec_next) { 1280 /* 1281 * If lockp is set, then entering kernel through ioctl. 1282 * For a MN set, the only ioctl path is via a commd message 1283 * (ALLOCATE_HOTSPARE or *RESYNC* messages) that is already 1284 * being sent to each node. 1285 * In this case, set NO_XMIT so that set_sm_comp_state 1286 * won't attempt to send a message on a message. 1287 * 1288 * In !MN sets, the xmit flag is ignored, so it doesn't matter 1289 * which flag is passed. 1290 */ 1291 if (lockp) { 1292 set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0, 1293 MD_STATE_NO_XMIT, lockp); 1294 } else { 1295 set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0, 1296 (MD_STATE_XMIT | MD_STATE_OCHELD), lockp); 1297 } 1298 /* 1299 * For a MN set, the NOTIFY is done when the state change is 1300 * processed on each node 1301 */ 1302 if (!MD_MNSET_SETNO(setno)) { 1303 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, 1304 SVM_TAG_METADEVICE, setno, MD_SID(un)); 1305 } 1306 } 1307 1308 if (lockp) { 1309 md_ioctl_writerexit(lockp); 1310 (void) md_ioctl_readerlock(lockp, ui); 1311 } else { 1312 md_unit_writerexit(ui); 1313 (void) md_unit_readerlock(ui); 1314 } 1315 1316 free_all_ecomps(ecomps); 1317 1318 /* allocate hotspares for all errored components */ 1319 if (MD_MNSET_SETNO(setno)) { 1320 /* 1321 * If we're called from an ioctl (lockp set) then we cannot 1322 * directly call send_poke_hotspares as this will block until 1323 * the message gets despatched to all nodes. If the cluster is 1324 * going through a reconfig cycle then the message will block 1325 * until the cycle is complete, and as we originate from a 1326 * service call from commd we will livelock. 
1327 */ 1328 if (lockp == NULL) { 1329 md_unit_readerexit(ui); 1330 send_poke_hotspares(setno); 1331 (void) md_unit_readerlock(ui); 1332 } 1333 } else { 1334 (void) poke_hotspares(); 1335 } 1336 return (0); 1337 } 1338 1339 void 1340 mirror_overlap_tree_remove(md_mps_t *ps) 1341 { 1342 mm_unit_t *un; 1343 1344 if (panicstr) 1345 return; 1346 1347 VERIFY(ps->ps_flags & MD_MPS_ON_OVERLAP); 1348 un = ps->ps_un; 1349 1350 mutex_enter(&un->un_overlap_tree_mx); 1351 avl_remove(&un->un_overlap_root, ps); 1352 ps->ps_flags &= ~MD_MPS_ON_OVERLAP; 1353 if (un->un_overlap_tree_flag != 0) { 1354 un->un_overlap_tree_flag = 0; 1355 cv_broadcast(&un->un_overlap_tree_cv); 1356 } 1357 mutex_exit(&un->un_overlap_tree_mx); 1358 } 1359 1360 1361 /* 1362 * wait_for_overlaps: 1363 * ----------------- 1364 * Check that given i/o request does not cause an overlap with already pending 1365 * i/o. If it does, block until the overlapped i/o completes. 1366 * 1367 * The flag argument has MD_OVERLAP_ALLOW_REPEAT set if it is ok for the parent 1368 * structure to be already in the overlap tree and MD_OVERLAP_NO_REPEAT if 1369 * it must not already be in the tree. 1370 */ 1371 static void 1372 wait_for_overlaps(md_mps_t *ps, int flags) 1373 { 1374 mm_unit_t *un; 1375 avl_index_t where; 1376 md_mps_t *ps1; 1377 1378 if (panicstr) 1379 return; 1380 1381 un = ps->ps_un; 1382 mutex_enter(&un->un_overlap_tree_mx); 1383 if ((flags & MD_OVERLAP_ALLOW_REPEAT) && 1384 (ps->ps_flags & MD_MPS_ON_OVERLAP)) { 1385 mutex_exit(&un->un_overlap_tree_mx); 1386 return; 1387 } 1388 1389 VERIFY(!(ps->ps_flags & MD_MPS_ON_OVERLAP)); 1390 1391 do { 1392 ps1 = avl_find(&un->un_overlap_root, ps, &where); 1393 if (ps1 == NULL) { 1394 /* 1395 * The candidate range does not overlap with any 1396 * range in the tree. Insert it and be done. 1397 */ 1398 avl_insert(&un->un_overlap_root, ps, where); 1399 ps->ps_flags |= MD_MPS_ON_OVERLAP; 1400 } else { 1401 /* 1402 * The candidate range would overlap. Set the flag 1403 * indicating we need to be woken up, and sleep 1404 * until another thread removes a range. If upon 1405 * waking up we find this mps was put on the tree 1406 * by another thread, the loop terminates. 1407 */ 1408 un->un_overlap_tree_flag = 1; 1409 cv_wait(&un->un_overlap_tree_cv, 1410 &un->un_overlap_tree_mx); 1411 } 1412 } while (!(ps->ps_flags & MD_MPS_ON_OVERLAP)); 1413 mutex_exit(&un->un_overlap_tree_mx); 1414 } 1415 1416 /* 1417 * This function is called from mirror_done to check whether any pages have 1418 * been modified while a mirrored write was in progress. Returns 0 if 1419 * all pages associated with bp are clean, 1 otherwise. 
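 * (biomodified() returns -1 when the buffer is not mapped in, so its pages
 * cannot be inspected; that case is treated as clean here.)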
 */
static int
any_pages_dirty(struct buf *bp)
{
	int	rval;

	rval = biomodified(bp);
	if (rval == -1)
		rval = 0;

	return (rval);
}

#define	MAX_EXTRAS	10

void
mirror_commit(
	mm_unit_t	*un,
	int		smmask,
	mddb_recid_t	*extras
)
{
	mm_submirror_t		*sm;
	md_unit_t		*su;
	int			i;

	/* 2=mirror,null id */
	mddb_recid_t		recids[NMIRROR+2+MAX_EXTRAS];

	int			ri = 0;

	if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)
		return;

	/* Add two, this includes the mirror unit and the null recid */
	if (extras != NULL) {
		int	nrecids = 0;
		while (extras[nrecids] != 0) {
			nrecids++;
		}
		ASSERT(nrecids <= MAX_EXTRAS);
	}

	if (un != NULL)
		recids[ri++] = un->c.un_record_id;
	for (i = 0; i < NMIRROR; i++) {
		if (!(smmask & SMI2BIT(i)))
			continue;
		sm = &un->un_sm[i];
		if (!SMS_IS(sm, SMS_INUSE))
			continue;
		if (md_getmajor(sm->sm_dev) != md_major)
			continue;
		su = MD_UNIT(md_getminor(sm->sm_dev));
		recids[ri++] = su->c.un_record_id;
	}

	if (extras != NULL)
		while (*extras != 0) {
			recids[ri++] = *extras;
			extras++;
		}

	if (ri == 0)
		return;
	recids[ri] = 0;

	/*
	 * Ok to hold ioctl lock across record commit to mddb as
	 * long as the record(s) being committed aren't resync records.
	 */
	mddb_commitrecs_wrapper(recids);
}


/*
 * This routine is used to set a bit in the writable_bm bitmap for each
 * submirror of the metamirror that is writable. The number of writable
 * submirrors is recorded in ps->ps_active_cnt and ps->ps_current_sm is
 * reset to zero.
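 * select_write_after_read_units() below makes the same selection but limits
 * it to submirrors that are resync targets, excluding the submirror the data
 * was read from (ps_allfrom_sm).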
 */

static void
select_write_units(struct mm_unit *un, md_mps_t *ps)
{

	int		i;
	unsigned	writable_bm = 0;
	unsigned	nunits = 0;

	for (i = 0; i < NMIRROR; i++) {
		if (SUBMIRROR_IS_WRITEABLE(un, i)) {
			/* set bit of all writable units */
			writable_bm |= SMI2BIT(i);
			nunits++;
		}
	}
	ps->ps_writable_sm = writable_bm;
	ps->ps_active_cnt = nunits;
	ps->ps_current_sm = 0;
}

static
unsigned
select_write_after_read_units(struct mm_unit *un, md_mps_t *ps)
{

	int		i;
	unsigned	writable_bm = 0;
	unsigned	nunits = 0;

	for (i = 0; i < NMIRROR; i++) {
		if (SUBMIRROR_IS_WRITEABLE(un, i) &&
		    un->un_sm[i].sm_flags & MD_SM_RESYNC_TARGET) {
			writable_bm |= SMI2BIT(i);
			nunits++;
		}
	}
	if ((writable_bm & ps->ps_allfrom_sm) != 0) {
		writable_bm &= ~ps->ps_allfrom_sm;
		nunits--;
	}
	ps->ps_writable_sm = writable_bm;
	ps->ps_active_cnt = nunits;
	ps->ps_current_sm = 0;
	return (nunits);
}

static md_dev64_t
select_read_unit(
	mm_unit_t	*un,
	diskaddr_t	blkno,
	u_longlong_t	reqcount,
	u_longlong_t	*cando,
	int		must_be_opened,
	md_m_shared_t	**shared,
	md_mcs_t	*cs)
{
	int			i;
	md_m_shared_t		*s;
	uint_t			lasterrcnt = 0;
	md_dev64_t		dev = 0;
	u_longlong_t		cnt;
	u_longlong_t		mincnt;
	mm_submirror_t		*sm;
	mm_submirror_ic_t	*smic;
	mdi_unit_t		*ui;

	mincnt = reqcount;
	for (i = 0; i < NMIRROR; i++) {
		if (!SUBMIRROR_IS_READABLE(un, i))
			continue;
		sm = &un->un_sm[i];
		smic = &un->un_smic[i];
		cnt = reqcount;

		/*
		 * If the current submirror is marked as inaccessible, do not
		 * try to access it.
		 */
		ui = MDI_UNIT(getminor(expldev(sm->sm_dev)));
		(void) md_unit_readerlock(ui);
		if (ui->ui_tstate & MD_INACCESSIBLE) {
			md_unit_readerexit(ui);
			continue;
		}
		md_unit_readerexit(ui);

		s = (md_m_shared_t *)(*(smic->sm_shared_by_blk))
		    (sm->sm_dev, sm, blkno, &cnt);

		if (must_be_opened && !(s->ms_flags & MDM_S_ISOPEN))
			continue;
		if (s->ms_state == CS_OKAY) {
			*cando = cnt;
			if (shared != NULL)
				*shared = s;

			if (un->un_sm[i].sm_flags & MD_SM_FAILFAST &&
			    cs != NULL) {
				cs->cs_buf.b_flags |= B_FAILFAST;
			}

			return (un->un_sm[i].sm_dev);
		}
		if (s->ms_state != CS_LAST_ERRED)
			continue;

		/* don't use B_FAILFAST since we're Last Erred */

		if (mincnt > cnt)
			mincnt = cnt;
		if (s->ms_lasterrcnt > lasterrcnt) {
			lasterrcnt = s->ms_lasterrcnt;
			if (shared != NULL)
				*shared = s;
			dev = un->un_sm[i].sm_dev;
		}
	}
	*cando = mincnt;
	return (dev);
}

/*
 * Given a 32-bit bitmap, this routine will return the bit number
 * of the nth bit set. The nth bit set is passed via the index integer.
 *
 * This routine is used to run through the writable submirror bitmap
 * and start all of the writes. The value returned is the index of the
 * appropriate submirror structure in the un_sm[] array of the metamirror.
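 * For example, with mask 0xA (binary 1010), index 0 returns bit 1 and
 * index 1 returns bit 3.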
1633 */ 1634 static int 1635 md_find_nth_unit(uint_t mask, int index) 1636 { 1637 int bit, nfound; 1638 1639 for (bit = -1, nfound = -1; nfound != index; bit++) { 1640 ASSERT(mask != 0); 1641 nfound += (mask & 1); 1642 mask >>= 1; 1643 } 1644 return (bit); 1645 } 1646 1647 static int 1648 fast_select_read_unit(md_mps_t *ps, md_mcs_t *cs) 1649 { 1650 mm_unit_t *un; 1651 buf_t *bp; 1652 int i; 1653 unsigned nunits = 0; 1654 int iunit; 1655 uint_t running_bm = 0; 1656 uint_t sm_index; 1657 1658 bp = &cs->cs_buf; 1659 un = ps->ps_un; 1660 1661 for (i = 0; i < NMIRROR; i++) { 1662 if (!SMS_BY_INDEX_IS(un, i, SMS_RUNNING)) 1663 continue; 1664 running_bm |= SMI2BIT(i); 1665 nunits++; 1666 } 1667 if (nunits == 0) 1668 return (1); 1669 1670 /* 1671 * For directed mirror read (DMR) we only use the specified side and 1672 * do not compute the source of the read. 1673 * If we're running with MD_MPS_DIRTY_RD set we always return the 1674 * first mirror side (this prevents unnecessary ownership switching). 1675 * Otherwise we return the submirror according to the mirror read option 1676 */ 1677 if (ps->ps_flags & MD_MPS_DMR) { 1678 sm_index = un->un_dmr_last_read; 1679 } else if (ps->ps_flags & MD_MPS_DIRTY_RD) { 1680 sm_index = md_find_nth_unit(running_bm, 0); 1681 } else { 1682 /* Normal (non-DMR) operation */ 1683 switch (un->un_read_option) { 1684 case RD_GEOMETRY: 1685 iunit = (int)(bp->b_lblkno / 1686 howmany(un->c.un_total_blocks, nunits)); 1687 sm_index = md_find_nth_unit(running_bm, iunit); 1688 break; 1689 case RD_FIRST: 1690 sm_index = md_find_nth_unit(running_bm, 0); 1691 break; 1692 case RD_LOAD_BAL: 1693 /* this is intentional to fall into the default */ 1694 default: 1695 un->un_last_read = (un->un_last_read + 1) % nunits; 1696 sm_index = md_find_nth_unit(running_bm, 1697 un->un_last_read); 1698 break; 1699 } 1700 } 1701 bp->b_edev = md_dev64_to_dev(un->un_sm[sm_index].sm_dev); 1702 ps->ps_allfrom_sm = SMI2BIT(sm_index); 1703 1704 if (un->un_sm[sm_index].sm_flags & MD_SM_FAILFAST) { 1705 bp->b_flags |= B_FAILFAST; 1706 } 1707 1708 return (0); 1709 } 1710 1711 static 1712 int 1713 mirror_are_submirrors_available(mm_unit_t *un) 1714 { 1715 int i; 1716 for (i = 0; i < NMIRROR; i++) { 1717 md_dev64_t tmpdev = un->un_sm[i].sm_dev; 1718 1719 if ((!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) || 1720 md_getmajor(tmpdev) != md_major) 1721 continue; 1722 1723 if ((MD_MIN2SET(md_getminor(tmpdev)) >= md_nsets) || 1724 (MD_MIN2UNIT(md_getminor(tmpdev)) >= md_nunits)) 1725 return (0); 1726 1727 if (MDI_UNIT(md_getminor(tmpdev)) == NULL) 1728 return (0); 1729 } 1730 return (1); 1731 } 1732 1733 void 1734 build_submirror(mm_unit_t *un, int i, int snarfing) 1735 { 1736 struct mm_submirror *sm; 1737 struct mm_submirror_ic *smic; 1738 md_unit_t *su; 1739 set_t setno; 1740 1741 sm = &un->un_sm[i]; 1742 smic = &un->un_smic[i]; 1743 1744 sm->sm_flags = 0; /* sometime we may need to do more here */ 1745 1746 setno = MD_UN2SET(un); 1747 1748 if (!SMS_IS(sm, SMS_INUSE)) 1749 return; 1750 if (snarfing) { 1751 sm->sm_dev = md_getdevnum(setno, mddb_getsidenum(setno), 1752 sm->sm_key, MD_NOTRUST_DEVT); 1753 } else { 1754 if (md_getmajor(sm->sm_dev) == md_major) { 1755 su = MD_UNIT(md_getminor(sm->sm_dev)); 1756 un->c.un_flag |= (su->c.un_flag & MD_LABELED); 1757 /* submirror can no longer be soft partitioned */ 1758 MD_CAPAB(su) &= (~MD_CAN_SP); 1759 } 1760 } 1761 smic->sm_shared_by_blk = md_get_named_service(sm->sm_dev, 1762 0, "shared by blk", 0); 1763 smic->sm_shared_by_indx = md_get_named_service(sm->sm_dev, 1764 0, 
"shared by indx", 0); 1765 smic->sm_get_component_count = (int (*)())md_get_named_service( 1766 sm->sm_dev, 0, "get component count", 0); 1767 smic->sm_get_bcss = (int (*)())md_get_named_service(sm->sm_dev, 0, 1768 "get block count skip size", 0); 1769 sm->sm_state &= ~SMS_IGNORE; 1770 if (SMS_IS(sm, SMS_OFFLINE)) 1771 MD_STATUS(un) |= MD_UN_OFFLINE_SM; 1772 md_set_parent(sm->sm_dev, MD_SID(un)); 1773 } 1774 1775 static void 1776 mirror_cleanup(mm_unit_t *un) 1777 { 1778 mddb_recid_t recid; 1779 int smi; 1780 sv_dev_t sv[NMIRROR]; 1781 int nsv = 0; 1782 1783 /* 1784 * If a MN diskset and this node is not the master, do 1785 * not delete any records on snarf of the mirror records. 1786 */ 1787 if (MD_MNSET_SETNO(MD_UN2SET(un)) && 1788 md_set[MD_UN2SET(un)].s_am_i_master == 0) { 1789 return; 1790 } 1791 1792 for (smi = 0; smi < NMIRROR; smi++) { 1793 if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) 1794 continue; 1795 sv[nsv].setno = MD_UN2SET(un); 1796 sv[nsv++].key = un->un_sm[smi].sm_key; 1797 } 1798 1799 recid = un->un_rr_dirty_recid; 1800 mddb_deleterec_wrapper(un->c.un_record_id); 1801 if (recid > 0) 1802 mddb_deleterec_wrapper(recid); 1803 1804 md_rem_names(sv, nsv); 1805 } 1806 1807 /* 1808 * Comparison function for the avl tree which tracks 1809 * outstanding writes on submirrors. 1810 * 1811 * Returns: 1812 * -1: ps1 < ps2 1813 * 0: ps1 and ps2 overlap 1814 * 1: ps1 > ps2 1815 */ 1816 static int 1817 mirror_overlap_compare(const void *p1, const void *p2) 1818 { 1819 const md_mps_t *ps1 = (md_mps_t *)p1; 1820 const md_mps_t *ps2 = (md_mps_t *)p2; 1821 1822 if (ps1->ps_firstblk < ps2->ps_firstblk) { 1823 if (ps1->ps_lastblk >= ps2->ps_firstblk) 1824 return (0); 1825 return (-1); 1826 } 1827 1828 if (ps1->ps_firstblk > ps2->ps_firstblk) { 1829 if (ps1->ps_firstblk <= ps2->ps_lastblk) 1830 return (0); 1831 return (1); 1832 } 1833 1834 return (0); 1835 } 1836 1837 /* 1838 * Collapse any sparse submirror entries snarfed from the on-disk replica. 1839 * Only the in-core entries are updated. The replica will be updated on-disk 1840 * when the in-core replica is committed on shutdown of the SVM subsystem. 1841 */ 1842 static void 1843 collapse_submirrors(mm_unit_t *un) 1844 { 1845 int smi, nremovals, smiremove; 1846 mm_submirror_t *sm, *new_sm, *old_sm; 1847 mm_submirror_ic_t *smic; 1848 int nsmidx = un->un_nsm - 1; 1849 1850 rescan: 1851 nremovals = 0; 1852 smiremove = -1; 1853 1854 for (smi = 0; smi <= nsmidx; smi++) { 1855 sm = &un->un_sm[smi]; 1856 1857 /* 1858 * Check to see if this submirror is marked as in-use. 1859 * If it isn't then it is a potential sparse entry and 1860 * may need to be cleared from the configuration. 1861 * The records should _already_ have been cleared by the 1862 * original mirror_detach() code, but we need to shuffle 1863 * any NULL entries in un_sm[] to the end of the array. 1864 * Any NULL un_smic[] entries need to be reset to the underlying 1865 * submirror/slice accessor functions. 
1866 */ 1867 if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) { 1868 nremovals++; 1869 smiremove = smi; 1870 break; 1871 } 1872 } 1873 1874 if (nremovals == 0) { 1875 /* 1876 * Ensure that we have a matching contiguous set of un_smic[] 1877 * entries for the corresponding un_sm[] entries 1878 */ 1879 for (smi = 0; smi <= nsmidx; smi++) { 1880 smic = &un->un_smic[smi]; 1881 sm = &un->un_sm[smi]; 1882 1883 smic->sm_shared_by_blk = 1884 md_get_named_service(sm->sm_dev, 0, 1885 "shared by_blk", 0); 1886 smic->sm_shared_by_indx = 1887 md_get_named_service(sm->sm_dev, 0, 1888 "shared by indx", 0); 1889 smic->sm_get_component_count = 1890 (int (*)())md_get_named_service(sm->sm_dev, 0, 1891 "get component count", 0); 1892 smic->sm_get_bcss = 1893 (int (*)())md_get_named_service(sm->sm_dev, 0, 1894 "get block count skip size", 0); 1895 } 1896 return; 1897 } 1898 1899 /* 1900 * Reshuffle the submirror devices so that we do not have a dead record 1901 * in the middle of the array. Once we've done this we need to rescan 1902 * the mirror to check for any other holes. 1903 */ 1904 for (smi = 0; smi < NMIRROR; smi++) { 1905 if (smi < smiremove) 1906 continue; 1907 if (smi > smiremove) { 1908 old_sm = &un->un_sm[smi]; 1909 new_sm = &un->un_sm[smi - 1]; 1910 bcopy(old_sm, new_sm, sizeof (mm_submirror_t)); 1911 bzero(old_sm, sizeof (mm_submirror_t)); 1912 } 1913 } 1914 1915 /* 1916 * Now we need to rescan the array to find the next potential dead 1917 * entry. 1918 */ 1919 goto rescan; 1920 } 1921 1922 /* Return a -1 if optimized record unavailable and set should be released */ 1923 int 1924 mirror_build_incore(mm_unit_t *un, int snarfing) 1925 { 1926 int i; 1927 1928 if (MD_STATUS(un) & MD_UN_BEING_RESET) { 1929 mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCLEAN); 1930 return (1); 1931 } 1932 1933 if (mirror_are_submirrors_available(un) == 0) 1934 return (1); 1935 1936 if (MD_UNIT(MD_SID(un)) != NULL) 1937 return (0); 1938 1939 MD_STATUS(un) = 0; 1940 1941 /* pre-4.1 didn't define CAN_META_CHILD capability */ 1942 MD_CAPAB(un) = MD_CAN_META_CHILD | MD_CAN_PARENT | MD_CAN_SP; 1943 1944 un->un_overlap_tree_flag = 0; 1945 avl_create(&un->un_overlap_root, mirror_overlap_compare, 1946 sizeof (md_mps_t), offsetof(md_mps_t, ps_overlap_node)); 1947 1948 /* 1949 * We need to collapse any sparse submirror entries into a non-sparse 1950 * array. This is to cover the case where we have an old replica image 1951 * which has not been updated (i.e. snarfed) since being modified. 1952 * The new code expects all submirror access to be sequential (i.e. 1953 * both the un_sm[] and un_smic[] entries correspond to non-empty 1954 * submirrors. 1955 */ 1956 1957 collapse_submirrors(un); 1958 1959 for (i = 0; i < NMIRROR; i++) 1960 build_submirror(un, i, snarfing); 1961 1962 if (unit_setup_resync(un, snarfing) != 0) { 1963 if (snarfing) { 1964 mddb_setrecprivate(un->c.un_record_id, MD_PRV_GOTIT); 1965 /* 1966 * If a MN set and set is not stale, then return -1 1967 * which will force the caller to unload the set. 1968 * The MN diskset nodes will return failure if 1969 * unit_setup_resync fails so that nodes won't 1970 * get out of sync. 1971 * 1972 * If set is STALE, the master node can't allocate 1973 * a resync record (if needed), but node needs to 1974 * join the set so that user can delete broken mddbs. 1975 * So, if set is STALE, just continue on. 
1976 */ 1977 if (MD_MNSET_SETNO(MD_UN2SET(un)) && 1978 !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) { 1979 return (-1); 1980 } 1981 } else 1982 return (1); 1983 } 1984 1985 mutex_init(&un->un_overlap_tree_mx, NULL, MUTEX_DEFAULT, NULL); 1986 cv_init(&un->un_overlap_tree_cv, NULL, CV_DEFAULT, NULL); 1987 1988 un->un_suspend_wr_flag = 0; 1989 mutex_init(&un->un_suspend_wr_mx, NULL, MUTEX_DEFAULT, NULL); 1990 cv_init(&un->un_suspend_wr_cv, NULL, CV_DEFAULT, NULL); 1991 1992 /* 1993 * Allocate mutexes for mirror-owner and resync-owner changes. 1994 * All references to the owner message state field must be guarded 1995 * by this mutex. 1996 */ 1997 mutex_init(&un->un_owner_mx, NULL, MUTEX_DEFAULT, NULL); 1998 1999 /* 2000 * Allocate mutex and condvar for resync thread manipulation. These 2001 * will be used by mirror_resync_unit/mirror_ioctl_resync 2002 */ 2003 mutex_init(&un->un_rs_thread_mx, NULL, MUTEX_DEFAULT, NULL); 2004 cv_init(&un->un_rs_thread_cv, NULL, CV_DEFAULT, NULL); 2005 2006 /* 2007 * Allocate mutex and condvar for resync progress thread manipulation. 2008 * This allows resyncs to be continued across an intervening reboot. 2009 */ 2010 mutex_init(&un->un_rs_progress_mx, NULL, MUTEX_DEFAULT, NULL); 2011 cv_init(&un->un_rs_progress_cv, NULL, CV_DEFAULT, NULL); 2012 2013 /* 2014 * Allocate mutex and condvar for Directed Mirror Reads (DMR). This 2015 * provides synchronization between a user-ioctl and the resulting 2016 * strategy() call that performs the read(). 2017 */ 2018 mutex_init(&un->un_dmr_mx, NULL, MUTEX_DEFAULT, NULL); 2019 cv_init(&un->un_dmr_cv, NULL, CV_DEFAULT, NULL); 2020 2021 /* 2022 * Allocate rwlocks for un_pernode_dirty_bm accessing. 2023 */ 2024 for (i = 0; i < MD_MNMAXSIDES; i++) { 2025 rw_init(&un->un_pernode_dirty_mx[i], NULL, RW_DEFAULT, NULL); 2026 } 2027 2028 /* place various information in the in-core data structures */ 2029 md_nblocks_set(MD_SID(un), un->c.un_total_blocks); 2030 MD_UNIT(MD_SID(un)) = un; 2031 2032 return (0); 2033 } 2034 2035 2036 void 2037 reset_mirror(struct mm_unit *un, minor_t mnum, int removing) 2038 { 2039 mddb_recid_t recid, vtoc_id; 2040 size_t bitcnt; 2041 size_t shortcnt; 2042 int smi; 2043 sv_dev_t sv[NMIRROR]; 2044 int nsv = 0; 2045 uint_t bits = 0; 2046 minor_t selfid; 2047 md_unit_t *su; 2048 int i; 2049 2050 md_destroy_unit_incore(mnum, &mirror_md_ops); 2051 2052 shortcnt = un->un_rrd_num * sizeof (short); 2053 bitcnt = howmany(un->un_rrd_num, NBBY); 2054 2055 if (un->un_outstanding_writes) 2056 kmem_free((caddr_t)un->un_outstanding_writes, shortcnt); 2057 if (un->un_goingclean_bm) 2058 kmem_free((caddr_t)un->un_goingclean_bm, bitcnt); 2059 if (un->un_goingdirty_bm) 2060 kmem_free((caddr_t)un->un_goingdirty_bm, bitcnt); 2061 if (un->un_resync_bm) 2062 kmem_free((caddr_t)un->un_resync_bm, bitcnt); 2063 if (un->un_pernode_dirty_sum) 2064 kmem_free((caddr_t)un->un_pernode_dirty_sum, un->un_rrd_num); 2065 2066 /* 2067 * Destroy the taskq for deferred processing of DRL clean requests. 2068 * This taskq will only be present for Multi Owner mirrors. 
2069 */ 2070 if (un->un_drl_task != NULL) 2071 ddi_taskq_destroy(un->un_drl_task); 2072 2073 md_nblocks_set(mnum, -1ULL); 2074 MD_UNIT(mnum) = NULL; 2075 2076 /* 2077 * Attempt release of its minor node 2078 */ 2079 md_remove_minor_node(mnum); 2080 2081 if (!removing) 2082 return; 2083 2084 for (smi = 0; smi < NMIRROR; smi++) { 2085 if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) 2086 continue; 2087 /* reallow soft partitioning of submirror and reset parent */ 2088 su = MD_UNIT(md_getminor(un->un_sm[smi].sm_dev)); 2089 MD_CAPAB(su) |= MD_CAN_SP; 2090 md_reset_parent(un->un_sm[smi].sm_dev); 2091 reset_comp_states(&un->un_sm[smi], &un->un_smic[smi]); 2092 2093 sv[nsv].setno = MD_MIN2SET(mnum); 2094 sv[nsv++].key = un->un_sm[smi].sm_key; 2095 bits |= SMI2BIT(smi); 2096 } 2097 2098 MD_STATUS(un) |= MD_UN_BEING_RESET; 2099 recid = un->un_rr_dirty_recid; 2100 vtoc_id = un->c.un_vtoc_id; 2101 selfid = MD_SID(un); 2102 2103 mirror_commit(un, bits, 0); 2104 2105 avl_destroy(&un->un_overlap_root); 2106 2107 /* Destroy all mutexes and condvars before returning. */ 2108 mutex_destroy(&un->un_suspend_wr_mx); 2109 cv_destroy(&un->un_suspend_wr_cv); 2110 mutex_destroy(&un->un_overlap_tree_mx); 2111 cv_destroy(&un->un_overlap_tree_cv); 2112 mutex_destroy(&un->un_owner_mx); 2113 mutex_destroy(&un->un_rs_thread_mx); 2114 cv_destroy(&un->un_rs_thread_cv); 2115 mutex_destroy(&un->un_rs_progress_mx); 2116 cv_destroy(&un->un_rs_progress_cv); 2117 mutex_destroy(&un->un_dmr_mx); 2118 cv_destroy(&un->un_dmr_cv); 2119 2120 for (i = 0; i < MD_MNMAXSIDES; i++) { 2121 rw_destroy(&un->un_pernode_dirty_mx[i]); 2122 if (un->un_pernode_dirty_bm[i]) 2123 kmem_free((caddr_t)un->un_pernode_dirty_bm[i], bitcnt); 2124 } 2125 2126 /* 2127 * Remove self from the namespace 2128 */ 2129 if (un->c.un_revision & MD_FN_META_DEV) { 2130 (void) md_rem_selfname(un->c.un_self_id); 2131 } 2132 2133 /* This frees the unit structure. */ 2134 mddb_deleterec_wrapper(un->c.un_record_id); 2135 2136 if (recid != 0) 2137 mddb_deleterec_wrapper(recid); 2138 2139 /* Remove the vtoc, if present */ 2140 if (vtoc_id) 2141 mddb_deleterec_wrapper(vtoc_id); 2142 2143 md_rem_names(sv, nsv); 2144 2145 SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE, 2146 MD_MIN2SET(selfid), selfid); 2147 } 2148 2149 int 2150 mirror_internal_open( 2151 minor_t mnum, 2152 int flag, 2153 int otyp, 2154 int md_oflags, 2155 IOLOCK *lockp /* can be NULL */ 2156 ) 2157 { 2158 mdi_unit_t *ui = MDI_UNIT(mnum); 2159 int err = 0; 2160 2161 tryagain: 2162 /* single thread */ 2163 if (lockp) { 2164 /* 2165 * If ioctl lock is held, use openclose_enter 2166 * routine that will set the ioctl flag when 2167 * grabbing the readerlock. 2168 */ 2169 (void) md_ioctl_openclose_enter(lockp, ui); 2170 } else { 2171 (void) md_unit_openclose_enter(ui); 2172 } 2173 2174 /* 2175 * The mirror_open_all_devs routine may end up sending a STATE_UPDATE 2176 * message in a MN diskset and this requires that the openclose 2177 * lock is dropped in order to send this message. So, another 2178 * flag (MD_UL_OPENINPROGRESS) is used to keep another thread from 2179 * attempting an open while this thread has an open in progress. 2180 * Call the *_lh version of the lock exit routines since the ui_mx 2181 * mutex must be held from checking for OPENINPROGRESS until 2182 * after the cv_wait call. 
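 *
 * In outline (an illustrative summary of the code that follows, not
 * an additional locking requirement):
 *
 *	mutex_enter(&ui->ui_mx);
 *	if (ui->ui_lock & MD_UL_OPENINPROGRESS) {
 *		drop openclose lock (the *_lh variant);
 *		cv_wait(&ui->ui_cv, &ui->ui_mx);
 *		mutex_exit(&ui->ui_mx);
 *		goto tryagain;
 *	}
 *	ui->ui_lock |= MD_UL_OPENINPROGRESS;
 *	mutex_exit(&ui->ui_mx);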
2183 */ 2184 mutex_enter(&ui->ui_mx); 2185 if (ui->ui_lock & MD_UL_OPENINPROGRESS) { 2186 if (lockp) { 2187 (void) md_ioctl_openclose_exit_lh(lockp); 2188 } else { 2189 md_unit_openclose_exit_lh(ui); 2190 } 2191 cv_wait(&ui->ui_cv, &ui->ui_mx); 2192 mutex_exit(&ui->ui_mx); 2193 goto tryagain; 2194 } 2195 2196 ui->ui_lock |= MD_UL_OPENINPROGRESS; 2197 mutex_exit(&ui->ui_mx); 2198 2199 /* open devices, if necessary */ 2200 if (! md_unit_isopen(ui) || (ui->ui_tstate & MD_INACCESSIBLE)) { 2201 if ((err = mirror_open_all_devs(mnum, md_oflags, lockp)) != 0) 2202 goto out; 2203 } 2204 2205 /* count open */ 2206 if ((err = md_unit_incopen(mnum, flag, otyp)) != 0) 2207 goto out; 2208 2209 /* unlock, return success */ 2210 out: 2211 mutex_enter(&ui->ui_mx); 2212 ui->ui_lock &= ~MD_UL_OPENINPROGRESS; 2213 mutex_exit(&ui->ui_mx); 2214 2215 if (lockp) { 2216 /* 2217 * If ioctl lock is held, use openclose_exit 2218 * routine that will clear the lockp reader flag. 2219 */ 2220 (void) md_ioctl_openclose_exit(lockp); 2221 } else { 2222 md_unit_openclose_exit(ui); 2223 } 2224 return (err); 2225 } 2226 2227 int 2228 mirror_internal_close( 2229 minor_t mnum, 2230 int otyp, 2231 int md_cflags, 2232 IOLOCK *lockp /* can be NULL */ 2233 ) 2234 { 2235 mdi_unit_t *ui = MDI_UNIT(mnum); 2236 mm_unit_t *un; 2237 int err = 0; 2238 2239 /* single thread */ 2240 if (lockp) { 2241 /* 2242 * If ioctl lock is held, use openclose_enter 2243 * routine that will set the ioctl flag when 2244 * grabbing the readerlock. 2245 */ 2246 un = (mm_unit_t *)md_ioctl_openclose_enter(lockp, ui); 2247 } else { 2248 un = (mm_unit_t *)md_unit_openclose_enter(ui); 2249 } 2250 2251 /* count closed */ 2252 if ((err = md_unit_decopen(mnum, otyp)) != 0) 2253 goto out; 2254 2255 /* close devices, if necessary */ 2256 if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) { 2257 /* 2258 * Clean up dirty bitmap for this unit. Do this 2259 * before closing the underlying devices to avoid 2260 * race conditions with reset_mirror() as a 2261 * result of a 'metaset -r' command running in 2262 * parallel. This might cause deallocation of 2263 * dirty region bitmaps; with underlying metadevices 2264 * in place this can't happen. 2265 * Don't do this if a MN set and ABR not set 2266 */ 2267 if (new_resync && !(MD_STATUS(un) & MD_UN_KEEP_DIRTY)) { 2268 if (!MD_MNSET_SETNO(MD_UN2SET(un)) || 2269 !(ui->ui_tstate & MD_ABR_CAP)) 2270 mirror_process_unit_resync(un); 2271 } 2272 (void) mirror_close_all_devs(un, md_cflags); 2273 2274 /* 2275 * For a MN set with transient capabilities (eg ABR/DMR) set, 2276 * clear these capabilities on the last open in the cluster. 2277 * To do this we send a message to all nodes to see of the 2278 * device is open. 2279 */ 2280 if (MD_MNSET_SETNO(MD_UN2SET(un)) && 2281 (ui->ui_tstate & (MD_ABR_CAP|MD_DMR_CAP))) { 2282 if (lockp) { 2283 (void) md_ioctl_openclose_exit(lockp); 2284 } else { 2285 md_unit_openclose_exit(ui); 2286 } 2287 2288 /* 2289 * if we are in the context of an ioctl, drop the 2290 * ioctl lock. 2291 * Otherwise, no other locks should be held. 2292 */ 2293 if (lockp) { 2294 IOLOCK_RETURN_RELEASE(0, lockp); 2295 } 2296 2297 mdmn_clear_all_capabilities(mnum); 2298 2299 /* if dropped the lock previously, regain it */ 2300 if (lockp) { 2301 IOLOCK_RETURN_REACQUIRE(lockp); 2302 } 2303 return (0); 2304 } 2305 /* unlock and return success */ 2306 } 2307 out: 2308 /* Call whether lockp is NULL or not. 
*/ 2309 if (lockp) { 2310 md_ioctl_openclose_exit(lockp); 2311 } else { 2312 md_unit_openclose_exit(ui); 2313 } 2314 return (err); 2315 } 2316 2317 /* 2318 * When a component has completed resyncing and is now ok, check if the 2319 * corresponding component in the other submirrors is in the Last Erred 2320 * state. If it is, we want to change that to the Erred state so we stop 2321 * using that component and start using this good component instead. 2322 * 2323 * This is called from set_sm_comp_state and recursively calls 2324 * set_sm_comp_state if it needs to change the Last Erred state. 2325 */ 2326 static void 2327 reset_lasterred(mm_unit_t *un, int smi, mddb_recid_t *extras, uint_t flags, 2328 IOLOCK *lockp) 2329 { 2330 mm_submirror_t *sm; 2331 mm_submirror_ic_t *smic; 2332 int ci; 2333 int i; 2334 int compcnt; 2335 int changed = 0; 2336 2337 for (i = 0; i < NMIRROR; i++) { 2338 sm = &un->un_sm[i]; 2339 smic = &un->un_smic[i]; 2340 2341 if (!SMS_IS(sm, SMS_INUSE)) 2342 continue; 2343 2344 /* ignore the submirror that we just made ok */ 2345 if (i == smi) 2346 continue; 2347 2348 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un); 2349 for (ci = 0; ci < compcnt; ci++) { 2350 md_m_shared_t *shared; 2351 2352 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) 2353 (sm->sm_dev, sm, ci); 2354 2355 if ((shared->ms_state & CS_LAST_ERRED) && 2356 !mirror_other_sources(un, i, ci, 1)) { 2357 2358 set_sm_comp_state(un, i, ci, CS_ERRED, extras, 2359 flags, lockp); 2360 changed = 1; 2361 } 2362 } 2363 } 2364 2365 /* maybe there is a hotspare for this newly erred component */ 2366 if (changed) { 2367 set_t setno; 2368 2369 setno = MD_UN2SET(un); 2370 if (MD_MNSET_SETNO(setno)) { 2371 send_poke_hotspares(setno); 2372 } else { 2373 (void) poke_hotspares(); 2374 } 2375 } 2376 } 2377 2378 /* 2379 * set_sm_comp_state 2380 * 2381 * Set the state of a submirror component to the specified new state. 2382 * If the mirror is in a multi-node set, send messages to all nodes to 2383 * block all writes to the mirror and then update the state and release the 2384 * writes. These messages are only sent if MD_STATE_XMIT is set in flags. 2385 * MD_STATE_XMIT will be unset in 2 cases: 2386 * 1. When the state is changed to CS_RESYNC as this state change 2387 * will already have been updated on each node by the processing of the 2388 * distributed metasync command, hence no need to xmit. 2389 * 2. When the state is change to CS_OKAY after a resync has completed. Again 2390 * the resync completion will already have been processed on each node by 2391 * the processing of the MD_MN_MSG_RESYNC_PHASE_DONE message for a component 2392 * resync, hence no need to xmit. 2393 * 2394 * In case we are called from the updates of a watermark, 2395 * (then MD_STATE_WMUPDATE will be set in the ps->flags) this is due to 2396 * a metainit or similar. In this case the message that we sent to propagate 2397 * the state change must not be a class1 message as that would deadlock with 2398 * the metainit command that is still being processed. 2399 * This we achieve by creating a class2 message MD_MN_MSG_STATE_UPDATE2 2400 * instead. This also makes the submessage generator to create a class2 2401 * submessage rather than a class1 (which would also block) 2402 * 2403 * On entry, unit_writerlock is held 2404 * If MD_STATE_OCHELD is set in flags, then unit_openclose lock is 2405 * also held. 
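 *
 * A typical call from non-ioctl context (as made by
 * error_update_unit() below) therefore looks like:
 *
 *	set_sm_comp_state(un, smi, ci, CS_ERRED, 0, MD_STATE_XMIT,
 *	    (IOLOCK *)NULL);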
2406 */ 2407 void 2408 set_sm_comp_state( 2409 mm_unit_t *un, 2410 int smi, 2411 int ci, 2412 int newstate, 2413 mddb_recid_t *extras, 2414 uint_t flags, 2415 IOLOCK *lockp 2416 ) 2417 { 2418 mm_submirror_t *sm; 2419 mm_submirror_ic_t *smic; 2420 md_m_shared_t *shared; 2421 int origstate; 2422 void (*get_dev)(); 2423 ms_cd_info_t cd; 2424 char devname[MD_MAX_CTDLEN]; 2425 int err; 2426 set_t setno = MD_UN2SET(un); 2427 md_mn_msg_stch_t stchmsg; 2428 mdi_unit_t *ui = MDI_UNIT(MD_SID(un)); 2429 md_mn_kresult_t *kresult; 2430 int rval; 2431 uint_t msgflags; 2432 md_mn_msgtype_t msgtype; 2433 int save_lock = 0; 2434 mdi_unit_t *ui_sm; 2435 int nretries = 0; 2436 2437 sm = &un->un_sm[smi]; 2438 smic = &un->un_smic[smi]; 2439 2440 /* If we have a real error status then turn off MD_INACCESSIBLE. */ 2441 ui_sm = MDI_UNIT(getminor(md_dev64_to_dev(sm->sm_dev))); 2442 if (newstate & (CS_ERRED | CS_RESYNC | CS_LAST_ERRED) && 2443 ui_sm->ui_tstate & MD_INACCESSIBLE) { 2444 ui_sm->ui_tstate &= ~MD_INACCESSIBLE; 2445 } 2446 2447 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) 2448 (sm->sm_dev, sm, ci); 2449 origstate = shared->ms_state; 2450 2451 /* 2452 * If the new state is an error and the old one wasn't, generate 2453 * a console message. We do this before we send the state to other 2454 * nodes in a MN set because the state change may change the component 2455 * name if a hotspare is allocated. 2456 */ 2457 if ((! (origstate & (CS_ERRED|CS_LAST_ERRED))) && 2458 (newstate & (CS_ERRED|CS_LAST_ERRED))) { 2459 2460 get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0, 2461 "get device", 0); 2462 (void) (*get_dev)(sm->sm_dev, sm, ci, &cd); 2463 2464 err = md_getdevname(setno, mddb_getsidenum(setno), 0, 2465 cd.cd_dev, devname, sizeof (devname)); 2466 2467 if (err == ENOENT) { 2468 (void) md_devname(setno, cd.cd_dev, devname, 2469 sizeof (devname)); 2470 } 2471 2472 cmn_err(CE_WARN, "md: %s: %s needs maintenance", 2473 md_shortname(md_getminor(sm->sm_dev)), devname); 2474 2475 if (newstate & CS_LAST_ERRED) { 2476 cmn_err(CE_WARN, "md: %s: %s last erred", 2477 md_shortname(md_getminor(sm->sm_dev)), 2478 devname); 2479 2480 } else if (shared->ms_flags & MDM_S_ISOPEN) { 2481 /* 2482 * Close the broken device and clear the open flag on 2483 * it. Closing the device means the RCM framework will 2484 * be able to unconfigure the device if required. 2485 * 2486 * We have to check that the device is open, otherwise 2487 * the first open on it has resulted in the error that 2488 * is being processed and the actual cd.cd_dev will be 2489 * NODEV64. 2490 * 2491 * If this is a multi-node mirror, then the multinode 2492 * state checks following this code will cause the 2493 * slave nodes to close the mirror in the function 2494 * mirror_set_state(). 2495 */ 2496 md_layered_close(cd.cd_dev, MD_OFLG_NULL); 2497 shared->ms_flags &= ~MDM_S_ISOPEN; 2498 } 2499 2500 } else if ((origstate & CS_LAST_ERRED) && (newstate & CS_ERRED) && 2501 (shared->ms_flags & MDM_S_ISOPEN)) { 2502 /* 2503 * Similar to logic above except no log messages since we 2504 * are just transitioning from Last Erred to Erred. 
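 * The device is simply closed and MDM_S_ISOPEN cleared, as in the
 * error path above, but without the "needs maintenance" console
 * warnings.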
2505 */ 2506 get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0, 2507 "get device", 0); 2508 (void) (*get_dev)(sm->sm_dev, sm, ci, &cd); 2509 2510 md_layered_close(cd.cd_dev, MD_OFLG_NULL); 2511 shared->ms_flags &= ~MDM_S_ISOPEN; 2512 } 2513 2514 if ((MD_MNSET_SETNO(setno)) && (origstate != newstate) && 2515 (flags & MD_STATE_XMIT) && !(ui->ui_tstate & MD_ERR_PENDING)) { 2516 /* 2517 * For a multi-node mirror, send the state change to the 2518 * master, which broadcasts to all nodes, including this 2519 * one. Once the message is received, the state is set 2520 * in-core and the master commits the change to disk. 2521 * There is a case, comp_replace, where this function 2522 * can be called from within an ioctl and therefore in this 2523 * case, as the ioctl will already be called on each node, 2524 * there is no need to xmit the state change to the master for 2525 * distribution to the other nodes. MD_STATE_XMIT flag is used 2526 * to indicate whether a xmit is required. The mirror's 2527 * transient state is set to MD_ERR_PENDING to avoid sending 2528 * multiple messages. 2529 */ 2530 if (newstate & (CS_ERRED|CS_LAST_ERRED)) 2531 ui->ui_tstate |= MD_ERR_PENDING; 2532 2533 /* 2534 * Send a state update message to all nodes. This message 2535 * will generate 2 submessages, the first one to suspend 2536 * all writes to the mirror and the second to update the 2537 * state and resume writes. 2538 */ 2539 stchmsg.msg_stch_mnum = un->c.un_self_id; 2540 stchmsg.msg_stch_sm = smi; 2541 stchmsg.msg_stch_comp = ci; 2542 stchmsg.msg_stch_new_state = newstate; 2543 stchmsg.msg_stch_hs_id = shared->ms_hs_id; 2544 #ifdef DEBUG 2545 if (mirror_debug_flag) 2546 printf("send set state, %x, %x, %x, %x, %x\n", 2547 stchmsg.msg_stch_mnum, stchmsg.msg_stch_sm, 2548 stchmsg.msg_stch_comp, stchmsg.msg_stch_new_state, 2549 stchmsg.msg_stch_hs_id); 2550 #endif 2551 if (flags & MD_STATE_WMUPDATE) { 2552 msgtype = MD_MN_MSG_STATE_UPDATE2; 2553 /* 2554 * When coming from an update of watermarks, there 2555 * must already be a message logged that triggered 2556 * this action. So, no need to log this message, too. 2557 */ 2558 msgflags = MD_MSGF_NO_LOG; 2559 } else { 2560 msgtype = MD_MN_MSG_STATE_UPDATE; 2561 msgflags = MD_MSGF_DEFAULT_FLAGS; 2562 } 2563 2564 /* 2565 * If we are in the context of an ioctl, drop the ioctl lock. 2566 * lockp holds the list of locks held. 2567 * 2568 * Otherwise, increment the appropriate reacquire counters. 2569 * If openclose lock is *held, then must reacquire reader 2570 * lock before releasing the openclose lock. 2571 * Do not drop the ARRAY_WRITER lock as we may not be able 2572 * to reacquire it. 2573 */ 2574 if (lockp) { 2575 if (lockp->l_flags & MD_ARRAY_WRITER) { 2576 save_lock = MD_ARRAY_WRITER; 2577 lockp->l_flags &= ~MD_ARRAY_WRITER; 2578 } else if (lockp->l_flags & MD_ARRAY_READER) { 2579 save_lock = MD_ARRAY_READER; 2580 lockp->l_flags &= ~MD_ARRAY_READER; 2581 } 2582 IOLOCK_RETURN_RELEASE(0, lockp); 2583 } else { 2584 if (flags & MD_STATE_OCHELD) { 2585 md_unit_writerexit(ui); 2586 (void) md_unit_readerlock(ui); 2587 md_unit_openclose_exit(ui); 2588 } else { 2589 md_unit_writerexit(ui); 2590 } 2591 } 2592 2593 kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); 2594 sscs_msg: 2595 rval = mdmn_ksend_message(setno, msgtype, msgflags, 0, 2596 (char *)&stchmsg, sizeof (stchmsg), kresult); 2597 2598 if (!MDMN_KSEND_MSG_OK(rval, kresult)) { 2599 mdmn_ksend_show_error(rval, kresult, "STATE UPDATE"); 2600 /* If we're shutting down already, pause things here. 
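 * We poll for the return of commd at one second intervals
 * (delay(md_hz)) and then retry the message exactly once before
 * giving up and panicking.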
*/ 2601 if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) { 2602 while (!md_mn_is_commd_present()) { 2603 delay(md_hz); 2604 } 2605 /* 2606 * commd is now available; retry the message 2607 * one time. If that fails we fall through and 2608 * panic as the system is in an unexpected state 2609 */ 2610 if (nretries++ == 0) 2611 goto sscs_msg; 2612 } 2613 cmn_err(CE_PANIC, 2614 "ksend_message failure: STATE_UPDATE"); 2615 } 2616 kmem_free(kresult, sizeof (md_mn_kresult_t)); 2617 2618 /* if dropped the lock previously, regain it */ 2619 if (lockp) { 2620 IOLOCK_RETURN_REACQUIRE(lockp); 2621 lockp->l_flags |= save_lock; 2622 } else { 2623 /* 2624 * Reacquire dropped locks and update acquirecnts 2625 * appropriately. 2626 */ 2627 if (flags & MD_STATE_OCHELD) { 2628 /* 2629 * openclose also grabs readerlock. 2630 */ 2631 (void) md_unit_openclose_enter(ui); 2632 md_unit_readerexit(ui); 2633 (void) md_unit_writerlock(ui); 2634 } else { 2635 (void) md_unit_writerlock(ui); 2636 } 2637 } 2638 2639 ui->ui_tstate &= ~MD_ERR_PENDING; 2640 } else { 2641 shared->ms_state = newstate; 2642 uniqtime32(&shared->ms_timestamp); 2643 2644 if (newstate == CS_ERRED) 2645 shared->ms_flags |= MDM_S_NOWRITE; 2646 else 2647 shared->ms_flags &= ~MDM_S_NOWRITE; 2648 2649 shared->ms_flags &= ~MDM_S_IOERR; 2650 un->un_changecnt++; 2651 shared->ms_lasterrcnt = un->un_changecnt; 2652 2653 mirror_set_sm_state(sm, smic, SMS_RUNNING, 0); 2654 mirror_commit(un, SMI2BIT(smi), extras); 2655 } 2656 2657 if ((origstate & CS_RESYNC) && (newstate & CS_OKAY)) { 2658 /* 2659 * Resetting the Last Erred state will recursively call back 2660 * into this function (set_sm_comp_state) to update the state. 2661 */ 2662 reset_lasterred(un, smi, extras, flags, lockp); 2663 } 2664 } 2665 2666 static int 2667 find_another_logical( 2668 mm_unit_t *un, 2669 mm_submirror_t *esm, 2670 diskaddr_t blk, 2671 u_longlong_t cnt, 2672 int must_be_open, 2673 int state, 2674 int err_cnt) 2675 { 2676 u_longlong_t cando; 2677 md_dev64_t dev; 2678 md_m_shared_t *s; 2679 2680 esm->sm_state |= SMS_IGNORE; 2681 while (cnt != 0) { 2682 u_longlong_t mcnt; 2683 2684 mcnt = MIN(cnt, lbtodb(1024 * 1024 * 1024)); /* 1 Gig Blks */ 2685 2686 dev = select_read_unit(un, blk, mcnt, &cando, 2687 must_be_open, &s, NULL); 2688 if (dev == (md_dev64_t)0) 2689 break; 2690 2691 if ((state == CS_LAST_ERRED) && 2692 (s->ms_state == CS_LAST_ERRED) && 2693 (err_cnt > s->ms_lasterrcnt)) 2694 break; 2695 2696 cnt -= cando; 2697 blk += cando; 2698 } 2699 esm->sm_state &= ~SMS_IGNORE; 2700 return (cnt != 0); 2701 } 2702 2703 int 2704 mirror_other_sources(mm_unit_t *un, int smi, int ci, int must_be_open) 2705 { 2706 mm_submirror_t *sm; 2707 mm_submirror_ic_t *smic; 2708 size_t count; 2709 diskaddr_t block; 2710 u_longlong_t skip; 2711 u_longlong_t size; 2712 md_dev64_t dev; 2713 int cnt; 2714 md_m_shared_t *s; 2715 int not_found; 2716 2717 sm = &un->un_sm[smi]; 2718 smic = &un->un_smic[smi]; 2719 dev = sm->sm_dev; 2720 2721 /* 2722 * Make sure every component of the submirror 2723 * has other sources. 
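 * (A negative ci means "check every component": the branch below
 * recurses on each component index in turn and reports failure if
 * any one of them has no alternative source.)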
2724 */ 2725 if (ci < 0) { 2726 /* Find the highest lasterrcnt */ 2727 cnt = (*(smic->sm_get_component_count))(dev, sm); 2728 for (ci = 0; ci < cnt; ci++) { 2729 not_found = mirror_other_sources(un, smi, ci, 2730 must_be_open); 2731 if (not_found) 2732 return (1); 2733 } 2734 return (0); 2735 } 2736 2737 /* 2738 * Make sure this component has other sources 2739 */ 2740 (void) (*(smic->sm_get_bcss)) 2741 (dev, sm, ci, &block, &count, &skip, &size); 2742 2743 if (count == 0) 2744 return (1); 2745 2746 s = (md_m_shared_t *)(*(smic->sm_shared_by_indx))(dev, sm, ci); 2747 2748 while (count--) { 2749 if (block >= un->c.un_total_blocks) 2750 return (0); 2751 2752 if ((block + size) > un->c.un_total_blocks) 2753 size = un->c.un_total_blocks - block; 2754 2755 not_found = find_another_logical(un, sm, block, size, 2756 must_be_open, s->ms_state, s->ms_lasterrcnt); 2757 if (not_found) 2758 return (1); 2759 2760 block += size + skip; 2761 } 2762 return (0); 2763 } 2764 2765 static void 2766 finish_error(md_mps_t *ps) 2767 { 2768 struct buf *pb; 2769 mm_unit_t *un; 2770 mdi_unit_t *ui; 2771 uint_t new_str_flags; 2772 2773 pb = ps->ps_bp; 2774 un = ps->ps_un; 2775 ui = ps->ps_ui; 2776 2777 /* 2778 * Must flag any error to the resync originator if we're performing 2779 * a Write-after-Read. This corresponds to an i/o error on a resync 2780 * target device and in this case we ought to abort the resync as there 2781 * is nothing that can be done to recover from this without operator 2782 * intervention. If we don't set the B_ERROR flag we will continue 2783 * reading from the mirror but won't write to the target (as it will 2784 * have been placed into an errored state). 2785 * To handle the case of multiple components within a submirror we only 2786 * set the B_ERROR bit if explicitly requested to via MD_MPS_FLAG_ERROR. 2787 * The originator of the resync read will cause this bit to be set if 2788 * the underlying component count is one for a submirror resync. All 2789 * other resync types will have the flag set as there is no underlying 2790 * resync which can be performed on a contained metadevice for these 2791 * resync types (optimized or component). 2792 */ 2793 2794 if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ) { 2795 if (ps->ps_flags & MD_MPS_FLAG_ERROR) 2796 pb->b_flags |= B_ERROR; 2797 md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 2798 MPS_FREE(mirror_parent_cache, ps); 2799 md_unit_readerexit(ui); 2800 md_biodone(pb); 2801 return; 2802 } 2803 /* 2804 * Set the MD_IO_COUNTED flag as we are retrying the same I/O 2805 * operation therefore this I/O request has already been counted, 2806 * the I/O count variable will be decremented by mirror_done()'s 2807 * call to md_biodone(). 2808 */ 2809 if (ps->ps_changecnt != un->un_changecnt) { 2810 new_str_flags = MD_STR_NOTTOP | MD_IO_COUNTED; 2811 if (ps->ps_flags & MD_MPS_WOW) 2812 new_str_flags |= MD_STR_WOW; 2813 if (ps->ps_flags & MD_MPS_MAPPED) 2814 new_str_flags |= MD_STR_MAPPED; 2815 /* 2816 * If this I/O request was a read that was part of a resync, 2817 * set MD_STR_WAR for the retried read to ensure that the 2818 * resync write (i.e. 
write-after-read) will be performed 2819 */ 2820 if (ps->ps_flags & MD_MPS_RESYNC_READ) 2821 new_str_flags |= MD_STR_WAR; 2822 md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 2823 MPS_FREE(mirror_parent_cache, ps); 2824 md_unit_readerexit(ui); 2825 (void) md_mirror_strategy(pb, new_str_flags, NULL); 2826 return; 2827 } 2828 2829 pb->b_flags |= B_ERROR; 2830 md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 2831 MPS_FREE(mirror_parent_cache, ps); 2832 md_unit_readerexit(ui); 2833 md_biodone(pb); 2834 } 2835 2836 static void 2837 error_update_unit(md_mps_t *ps) 2838 { 2839 mm_unit_t *un; 2840 mdi_unit_t *ui; 2841 int smi; /* sub mirror index */ 2842 int ci; /* errored component */ 2843 set_t setno; 2844 uint_t flags; /* for set_sm_comp_state() */ 2845 uint_t hspflags; /* for check_comp_4_hotspares() */ 2846 2847 ui = ps->ps_ui; 2848 un = (mm_unit_t *)md_unit_writerlock(ui); 2849 setno = MD_UN2SET(un); 2850 2851 /* All of these updates have to propagated in case of MN set */ 2852 flags = MD_STATE_XMIT; 2853 hspflags = MD_HOTSPARE_XMIT; 2854 2855 /* special treatment if we are called during updating watermarks */ 2856 if (ps->ps_flags & MD_MPS_WMUPDATE) { 2857 flags |= MD_STATE_WMUPDATE; 2858 hspflags |= MD_HOTSPARE_WMUPDATE; 2859 } 2860 smi = 0; 2861 ci = 0; 2862 while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) { 2863 if (mirror_other_sources(un, smi, ci, 0) == 1) { 2864 2865 /* Never called from ioctl context, so (IOLOCK *)NULL */ 2866 set_sm_comp_state(un, smi, ci, CS_LAST_ERRED, 0, flags, 2867 (IOLOCK *)NULL); 2868 /* 2869 * For a MN set, the NOTIFY is done when the state 2870 * change is processed on each node 2871 */ 2872 if (!MD_MNSET_SETNO(MD_UN2SET(un))) { 2873 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED, 2874 SVM_TAG_METADEVICE, setno, MD_SID(un)); 2875 } 2876 continue; 2877 } 2878 /* Never called from ioctl context, so (IOLOCK *)NULL */ 2879 set_sm_comp_state(un, smi, ci, CS_ERRED, 0, flags, 2880 (IOLOCK *)NULL); 2881 /* 2882 * For a MN set, the NOTIFY is done when the state 2883 * change is processed on each node 2884 */ 2885 if (!MD_MNSET_SETNO(MD_UN2SET(un))) { 2886 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, 2887 SVM_TAG_METADEVICE, setno, MD_SID(un)); 2888 } 2889 smi = 0; 2890 ci = 0; 2891 } 2892 2893 md_unit_writerexit(ui); 2894 if (MD_MNSET_SETNO(setno)) { 2895 send_poke_hotspares(setno); 2896 } else { 2897 (void) poke_hotspares(); 2898 } 2899 (void) md_unit_readerlock(ui); 2900 2901 finish_error(ps); 2902 } 2903 2904 /* 2905 * When we have a B_FAILFAST IO error on a Last Erred component we need to 2906 * retry the IO without B_FAILFAST set so that we try to ensure that the 2907 * component "sees" each IO. 
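 *
 * The retry below clears B_FAILFAST, resets the buf error state
 * (bioerror(cb, 0)) and the component MDM_S_IOERR flag via
 * clear_retry_error(), and then reissues the child buf with
 * md_call_strategy(). If the system is panicking the error is
 * simply propagated through mirror_done() instead.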
2908 */ 2909 static void 2910 last_err_retry(md_mcs_t *cs) 2911 { 2912 struct buf *cb; 2913 md_mps_t *ps; 2914 uint_t flags; 2915 2916 cb = &cs->cs_buf; 2917 cb->b_flags &= ~B_FAILFAST; 2918 2919 /* if we're panicing just let this I/O error out */ 2920 if (panicstr) { 2921 (void) mirror_done(cb); 2922 return; 2923 } 2924 2925 /* reissue the I/O */ 2926 2927 ps = cs->cs_ps; 2928 2929 bioerror(cb, 0); 2930 2931 mutex_enter(&ps->ps_mx); 2932 2933 flags = MD_STR_NOTTOP; 2934 if (ps->ps_flags & MD_MPS_MAPPED) 2935 flags |= MD_STR_MAPPED; 2936 if (ps->ps_flags & MD_MPS_NOBLOCK) 2937 flags |= MD_NOBLOCK; 2938 2939 mutex_exit(&ps->ps_mx); 2940 2941 clear_retry_error(cb); 2942 2943 cmn_err(CE_NOTE, "!md: %s: Last Erred, retry I/O without B_FAILFAST", 2944 md_shortname(getminor(cb->b_edev))); 2945 2946 md_call_strategy(cb, flags, NULL); 2947 } 2948 2949 static void 2950 mirror_error(md_mps_t *ps) 2951 { 2952 int smi; /* sub mirror index */ 2953 int ci; /* errored component */ 2954 2955 if (panicstr) { 2956 finish_error(ps); 2957 return; 2958 } 2959 2960 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 2961 mirror_overlap_tree_remove(ps); 2962 2963 smi = 0; 2964 ci = 0; 2965 if (mirror_geterror(ps->ps_un, &smi, &ci, 0, 0) != 0) { 2966 md_unit_readerexit(ps->ps_ui); 2967 daemon_request(&md_mstr_daemon, error_update_unit, 2968 (daemon_queue_t *)ps, REQ_OLD); 2969 return; 2970 } 2971 2972 finish_error(ps); 2973 } 2974 2975 static int 2976 copy_write_done(struct buf *cb) 2977 { 2978 md_mps_t *ps; 2979 buf_t *pb; 2980 char *wowbuf; 2981 wowhdr_t *wowhdr; 2982 ssize_t wow_resid; 2983 2984 /* get wowbuf ans save structure */ 2985 wowbuf = cb->b_un.b_addr; 2986 wowhdr = WOWBUF_HDR(wowbuf); 2987 ps = wowhdr->wow_ps; 2988 pb = ps->ps_bp; 2989 2990 /* Save error information, then free cb */ 2991 if (cb->b_flags & B_ERROR) 2992 pb->b_flags |= B_ERROR; 2993 2994 if (cb->b_flags & B_REMAPPED) 2995 bp_mapout(cb); 2996 2997 freerbuf(cb); 2998 2999 /* update residual and continue if needed */ 3000 if ((pb->b_flags & B_ERROR) == 0) { 3001 wow_resid = pb->b_bcount - wowhdr->wow_offset; 3002 pb->b_resid = wow_resid; 3003 if (wow_resid > 0) { 3004 daemon_request(&md_mstr_daemon, copy_write_cont, 3005 (daemon_queue_t *)wowhdr, REQ_OLD); 3006 return (1); 3007 } 3008 } 3009 3010 /* Write is complete, release resources. 
*/ 3011 kmem_cache_free(mirror_wowblk_cache, wowhdr); 3012 ASSERT(!(ps->ps_flags & MD_MPS_ON_OVERLAP)); 3013 md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 3014 MPS_FREE(mirror_parent_cache, ps); 3015 md_biodone(pb); 3016 return (0); 3017 } 3018 3019 static void 3020 copy_write_cont(wowhdr_t *wowhdr) 3021 { 3022 buf_t *pb; 3023 buf_t *cb; 3024 char *wowbuf; 3025 int wow_offset; 3026 size_t wow_resid; 3027 diskaddr_t wow_blkno; 3028 3029 wowbuf = WOWHDR_BUF(wowhdr); 3030 pb = wowhdr->wow_ps->ps_bp; 3031 3032 /* get data on current location */ 3033 wow_offset = wowhdr->wow_offset; 3034 wow_resid = pb->b_bcount - wow_offset; 3035 wow_blkno = pb->b_lblkno + lbtodb(wow_offset); 3036 3037 /* setup child buffer */ 3038 cb = getrbuf(KM_SLEEP); 3039 cb->b_flags = B_WRITE; 3040 cb->b_edev = pb->b_edev; 3041 cb->b_un.b_addr = wowbuf; /* change to point at WOWBUF */ 3042 cb->b_bufsize = md_wowbuf_size; /* change to wowbuf_size */ 3043 cb->b_iodone = copy_write_done; 3044 cb->b_bcount = MIN(md_wowbuf_size, wow_resid); 3045 cb->b_lblkno = wow_blkno; 3046 3047 /* move offset to next section */ 3048 wowhdr->wow_offset += cb->b_bcount; 3049 3050 /* copy and setup write for current section */ 3051 bcopy(&pb->b_un.b_addr[wow_offset], wowbuf, cb->b_bcount); 3052 3053 /* do it */ 3054 /* 3055 * Do not set the MD_IO_COUNTED flag as this is a new I/O request 3056 * that handles the WOW condition. The resultant increment on the 3057 * I/O count variable is cleared by copy_write_done()'s call to 3058 * md_biodone(). 3059 */ 3060 (void) md_mirror_strategy(cb, MD_STR_NOTTOP | MD_STR_WOW 3061 | MD_STR_MAPPED, NULL); 3062 } 3063 3064 static void 3065 md_mirror_copy_write(md_mps_t *ps) 3066 { 3067 wowhdr_t *wowhdr; 3068 3069 wowhdr = kmem_cache_alloc(mirror_wowblk_cache, MD_ALLOCFLAGS); 3070 mirror_wowblk_init(wowhdr); 3071 wowhdr->wow_ps = ps; 3072 wowhdr->wow_offset = 0; 3073 copy_write_cont(wowhdr); 3074 } 3075 3076 static void 3077 handle_wow(md_mps_t *ps) 3078 { 3079 buf_t *pb; 3080 3081 pb = ps->ps_bp; 3082 3083 bp_mapin(pb); 3084 3085 md_mirror_wow_cnt++; 3086 if (!(pb->b_flags & B_PHYS) && (md_mirror_wow_flg & WOW_LOGIT)) { 3087 cmn_err(CE_NOTE, 3088 "md: %s, blk %lld, cnt %ld: Write on write %d occurred", 3089 md_shortname(getminor(pb->b_edev)), 3090 (longlong_t)pb->b_lblkno, pb->b_bcount, md_mirror_wow_cnt); 3091 } 3092 3093 /* 3094 * Set the MD_IO_COUNTED flag as we are retrying the same I/O 3095 * operation therefore this I/O request has already been counted, 3096 * the I/O count variable will be decremented by mirror_done()'s 3097 * call to md_biodone(). 3098 */ 3099 if (md_mirror_wow_flg & WOW_NOCOPY) 3100 (void) md_mirror_strategy(pb, MD_STR_NOTTOP | MD_STR_WOW | 3101 MD_STR_MAPPED | MD_IO_COUNTED, ps); 3102 else 3103 md_mirror_copy_write(ps); 3104 } 3105 3106 /* 3107 * Return true if the specified submirror is either in the Last Erred 3108 * state or is transitioning into the Last Erred state. 
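 *
 * "Transitioning" here means a component that has taken an i/o error
 * (MDM_S_IOERR set) while still marked Okay or Resync, and for which
 * mirror_other_sources() reports no alternative source, i.e. it is
 * about to become Last Erred rather than Erred.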
3109 */ 3110 static bool_t 3111 submirror_is_lasterred(mm_unit_t *un, int smi) 3112 { 3113 mm_submirror_t *sm; 3114 mm_submirror_ic_t *smic; 3115 md_m_shared_t *shared; 3116 int ci; 3117 int compcnt; 3118 3119 sm = &un->un_sm[smi]; 3120 smic = &un->un_smic[smi]; 3121 3122 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un); 3123 for (ci = 0; ci < compcnt; ci++) { 3124 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) 3125 (sm->sm_dev, sm, ci); 3126 3127 if (shared->ms_state == CS_LAST_ERRED) 3128 return (B_TRUE); 3129 3130 /* 3131 * It is not currently Last Erred, check if entering Last Erred. 3132 */ 3133 if ((shared->ms_flags & MDM_S_IOERR) && 3134 ((shared->ms_state == CS_OKAY) || 3135 (shared->ms_state == CS_RESYNC))) { 3136 if (mirror_other_sources(un, smi, ci, 0) == 1) 3137 return (B_TRUE); 3138 } 3139 } 3140 3141 return (B_FALSE); 3142 } 3143 3144 3145 static int 3146 mirror_done(struct buf *cb) 3147 { 3148 md_mps_t *ps; 3149 md_mcs_t *cs; 3150 3151 /*LINTED*/ 3152 cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off); 3153 ps = cs->cs_ps; 3154 3155 mutex_enter(&ps->ps_mx); 3156 3157 /* check if we need to retry an errored failfast I/O */ 3158 if (cb->b_flags & B_ERROR) { 3159 struct buf *pb = ps->ps_bp; 3160 3161 if (cb->b_flags & B_FAILFAST) { 3162 int i; 3163 mm_unit_t *un = ps->ps_un; 3164 3165 for (i = 0; i < NMIRROR; i++) { 3166 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) 3167 continue; 3168 3169 if (cb->b_edev == 3170 md_dev64_to_dev(un->un_sm[i].sm_dev)) { 3171 3172 /* 3173 * This is the submirror that had the 3174 * error. Check if it is Last Erred. 3175 */ 3176 if (submirror_is_lasterred(un, i)) { 3177 daemon_queue_t *dqp; 3178 3179 mutex_exit(&ps->ps_mx); 3180 dqp = (daemon_queue_t *)cs; 3181 dqp->dq_prev = NULL; 3182 dqp->dq_next = NULL; 3183 daemon_request(&md_done_daemon, 3184 last_err_retry, dqp, 3185 REQ_OLD); 3186 return (1); 3187 } 3188 break; 3189 } 3190 } 3191 } 3192 3193 /* continue to process the buf without doing a retry */ 3194 ps->ps_flags |= MD_MPS_ERROR; 3195 pb->b_error = cb->b_error; 3196 } 3197 3198 return (mirror_done_common(cb)); 3199 } 3200 3201 /* 3202 * Split from the original mirror_done function so we can handle bufs after a 3203 * retry. 3204 * ps->ps_mx is already held in the caller of this function and the cb error 3205 * has already been checked and handled in the caller. 3206 */ 3207 static int 3208 mirror_done_common(struct buf *cb) 3209 { 3210 struct buf *pb; 3211 mm_unit_t *un; 3212 mdi_unit_t *ui; 3213 md_mps_t *ps; 3214 md_mcs_t *cs; 3215 size_t end_rr, start_rr, current_rr; 3216 3217 /*LINTED*/ 3218 cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off); 3219 ps = cs->cs_ps; 3220 pb = ps->ps_bp; 3221 3222 if (cb->b_flags & B_REMAPPED) 3223 bp_mapout(cb); 3224 3225 ps->ps_frags--; 3226 if (ps->ps_frags != 0) { 3227 mutex_exit(&ps->ps_mx); 3228 kmem_cache_free(mirror_child_cache, cs); 3229 return (1); 3230 } 3231 un = ps->ps_un; 3232 ui = ps->ps_ui; 3233 3234 /* 3235 * Do not update outstanding_writes if we're running with ABR 3236 * set for this mirror or the write() was issued with MD_STR_ABR set. 3237 * Also a resync initiated write() has no outstanding_writes update 3238 * either. 
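 *
 * When the decrement does apply, the affected dirty regions are
 * derived from the parent request, in outline:
 *
 *	BLK_TO_RR(end_rr, ps->ps_lastblk, un);
 *	BLK_TO_RR(start_rr, ps->ps_firstblk, un);
 *	for (rr = start_rr; rr <= end_rr; rr++)
 *		un->un_outstanding_writes[rr]--;
 *
 * all performed under un_resync_mx.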
3239 */ 3240 if (((cb->b_flags & B_READ) == 0) && 3241 (un->un_nsm >= 2) && 3242 (ps->ps_call == NULL) && 3243 !((ui->ui_tstate & MD_ABR_CAP) || (ps->ps_flags & MD_MPS_ABR)) && 3244 !(ps->ps_flags & MD_MPS_WRITE_AFTER_READ)) { 3245 BLK_TO_RR(end_rr, ps->ps_lastblk, un); 3246 BLK_TO_RR(start_rr, ps->ps_firstblk, un); 3247 mutex_enter(&un->un_resync_mx); 3248 for (current_rr = start_rr; current_rr <= end_rr; current_rr++) 3249 un->un_outstanding_writes[current_rr]--; 3250 mutex_exit(&un->un_resync_mx); 3251 } 3252 kmem_cache_free(mirror_child_cache, cs); 3253 mutex_exit(&ps->ps_mx); 3254 3255 if (ps->ps_call != NULL) { 3256 daemon_request(&md_done_daemon, ps->ps_call, 3257 (daemon_queue_t *)ps, REQ_OLD); 3258 return (1); 3259 } 3260 3261 if ((ps->ps_flags & MD_MPS_ERROR)) { 3262 daemon_request(&md_done_daemon, mirror_error, 3263 (daemon_queue_t *)ps, REQ_OLD); 3264 return (1); 3265 } 3266 3267 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 3268 mirror_overlap_tree_remove(ps); 3269 3270 /* 3271 * Handle Write-on-Write problem. 3272 * Skip In case of Raw and Direct I/O as they are 3273 * handled earlier. 3274 * 3275 */ 3276 if (!(md_mirror_wow_flg & WOW_DISABLE) && 3277 !(pb->b_flags & B_READ) && 3278 !(ps->ps_flags & MD_MPS_WOW) && 3279 !(pb->b_flags & B_PHYS) && 3280 any_pages_dirty(pb)) { 3281 md_unit_readerexit(ps->ps_ui); 3282 daemon_request(&md_mstr_daemon, handle_wow, 3283 (daemon_queue_t *)ps, REQ_OLD); 3284 return (1); 3285 } 3286 3287 md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 3288 MPS_FREE(mirror_parent_cache, ps); 3289 md_unit_readerexit(ui); 3290 md_biodone(pb); 3291 return (0); 3292 } 3293 3294 /* 3295 * Clear error state in submirror component if the retry worked after 3296 * a failfast error. 3297 */ 3298 static void 3299 clear_retry_error(struct buf *cb) 3300 { 3301 int smi; 3302 md_mcs_t *cs; 3303 mm_unit_t *un; 3304 mdi_unit_t *ui_sm; 3305 mm_submirror_t *sm; 3306 mm_submirror_ic_t *smic; 3307 u_longlong_t cnt; 3308 md_m_shared_t *shared; 3309 3310 /*LINTED*/ 3311 cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off); 3312 un = cs->cs_ps->ps_un; 3313 3314 for (smi = 0; smi < NMIRROR; smi++) { 3315 if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) 3316 continue; 3317 3318 if (cb->b_edev == md_dev64_to_dev(un->un_sm[smi].sm_dev)) 3319 break; 3320 } 3321 3322 if (smi >= NMIRROR) 3323 return; 3324 3325 sm = &un->un_sm[smi]; 3326 smic = &un->un_smic[smi]; 3327 cnt = cb->b_bcount; 3328 3329 ui_sm = MDI_UNIT(getminor(cb->b_edev)); 3330 (void) md_unit_writerlock(ui_sm); 3331 3332 shared = (md_m_shared_t *)(*(smic->sm_shared_by_blk))(sm->sm_dev, sm, 3333 cb->b_blkno, &cnt); 3334 3335 if (shared->ms_flags & MDM_S_IOERR) { 3336 shared->ms_flags &= ~MDM_S_IOERR; 3337 3338 } else { 3339 /* the buf spans components and the first one is not erred */ 3340 int cnt; 3341 int i; 3342 3343 cnt = (*(smic->sm_get_component_count))(sm->sm_dev, un); 3344 for (i = 0; i < cnt; i++) { 3345 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) 3346 (sm->sm_dev, sm, i); 3347 3348 if (shared->ms_flags & MDM_S_IOERR && 3349 shared->ms_state == CS_OKAY) { 3350 3351 shared->ms_flags &= ~MDM_S_IOERR; 3352 break; 3353 } 3354 } 3355 } 3356 3357 md_unit_writerexit(ui_sm); 3358 } 3359 3360 static size_t 3361 mirror_map_read( 3362 md_mps_t *ps, 3363 md_mcs_t *cs, 3364 diskaddr_t blkno, 3365 u_longlong_t count 3366 ) 3367 { 3368 mm_unit_t *un; 3369 buf_t *bp; 3370 u_longlong_t cando; 3371 3372 bp = &cs->cs_buf; 3373 un = ps->ps_un; 3374 3375 bp->b_lblkno = blkno; 3376 if (fast_select_read_unit(ps, cs) == 0) 
{ 3377 bp->b_bcount = ldbtob(count); 3378 return (0); 3379 } 3380 bp->b_edev = md_dev64_to_dev(select_read_unit(un, blkno, 3381 count, &cando, 0, NULL, cs)); 3382 bp->b_bcount = ldbtob(cando); 3383 if (count != cando) 3384 return (cando); 3385 return (0); 3386 } 3387 3388 static void 3389 write_after_read(md_mps_t *ps) 3390 { 3391 struct buf *pb; 3392 int flags; 3393 3394 if (ps->ps_flags & MD_MPS_ERROR) { 3395 mirror_error(ps); 3396 return; 3397 } 3398 3399 pb = ps->ps_bp; 3400 md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 3401 ps->ps_call = NULL; 3402 ps->ps_flags |= MD_MPS_WRITE_AFTER_READ; 3403 flags = MD_STR_NOTTOP | MD_STR_WAR; 3404 if (ps->ps_flags & MD_MPS_MAPPED) 3405 flags |= MD_STR_MAPPED; 3406 if (ps->ps_flags & MD_MPS_NOBLOCK) 3407 flags |= MD_NOBLOCK; 3408 if (ps->ps_flags & MD_MPS_DIRTY_RD) 3409 flags |= MD_STR_DIRTY_RD; 3410 (void) mirror_write_strategy(pb, flags, ps); 3411 } 3412 3413 static void 3414 continue_serial(md_mps_t *ps) 3415 { 3416 md_mcs_t *cs; 3417 buf_t *cb; 3418 mm_unit_t *un; 3419 int flags; 3420 3421 un = ps->ps_un; 3422 cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS); 3423 mirror_child_init(cs); 3424 cb = &cs->cs_buf; 3425 ps->ps_call = NULL; 3426 ps->ps_frags = 1; 3427 (void) mirror_map_write(un, cs, ps, 0); 3428 flags = MD_STR_NOTTOP; 3429 if (ps->ps_flags & MD_MPS_MAPPED) 3430 flags |= MD_STR_MAPPED; 3431 md_call_strategy(cb, flags, NULL); 3432 } 3433 3434 static int 3435 mirror_map_write(mm_unit_t *un, md_mcs_t *cs, md_mps_t *ps, int war) 3436 { 3437 int i; 3438 dev_t dev; /* needed for bioclone, so not md_dev64_t */ 3439 buf_t *cb; 3440 buf_t *pb; 3441 diskaddr_t blkno; 3442 size_t bcount; 3443 off_t offset; 3444 3445 pb = ps->ps_bp; 3446 cb = &cs->cs_buf; 3447 cs->cs_ps = ps; 3448 3449 i = md_find_nth_unit(ps->ps_writable_sm, ps->ps_current_sm); 3450 3451 dev = md_dev64_to_dev(un->un_sm[i].sm_dev); 3452 3453 blkno = pb->b_lblkno; 3454 bcount = pb->b_bcount; 3455 offset = 0; 3456 if (war && (blkno == 0) && (un->c.un_flag & MD_LABELED)) { 3457 blkno = DK_LABEL_LOC + 1; 3458 /* 3459 * This handles the case where we're requesting 3460 * a write to block 0 on a label partition 3461 * and the request size was smaller than the 3462 * size of the label. If this is the case 3463 * then we'll return -1. Failure to do so will 3464 * either cause the calling thread to hang due to 3465 * an ssd bug, or worse if the bcount were allowed 3466 * to go negative (ie large). 3467 */ 3468 if (bcount <= DEV_BSIZE*(DK_LABEL_LOC + 1)) 3469 return (-1); 3470 bcount -= (DEV_BSIZE*(DK_LABEL_LOC + 1)); 3471 offset = (DEV_BSIZE*(DK_LABEL_LOC + 1)); 3472 } 3473 3474 cb = md_bioclone(pb, offset, bcount, dev, blkno, mirror_done, 3475 cb, KM_NOSLEEP); 3476 if (war) 3477 cb->b_flags = (cb->b_flags & ~B_READ) | B_WRITE; 3478 3479 /* 3480 * If the submirror is in the erred stated, check if any component is 3481 * in the Last Erred state. If so, we don't want to use the B_FAILFAST 3482 * flag on the IO. 3483 * 3484 * Provide a fast path for the non-erred case (which should be the 3485 * normal case). 
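 *
 * Concretely (for submirrors with MD_SM_FAILFAST set): only when the
 * submirror is flagged SMS_COMP_ERRED do we walk its components, and
 * B_FAILFAST is added only if none of them is Last Erred; otherwise
 * (the fast path) B_FAILFAST is set unconditionally.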
3486 */ 3487 if (un->un_sm[i].sm_flags & MD_SM_FAILFAST) { 3488 if (un->un_sm[i].sm_state & SMS_COMP_ERRED) { 3489 mm_submirror_t *sm; 3490 mm_submirror_ic_t *smic; 3491 int ci; 3492 int compcnt; 3493 3494 sm = &un->un_sm[i]; 3495 smic = &un->un_smic[i]; 3496 3497 compcnt = (*(smic->sm_get_component_count)) 3498 (sm->sm_dev, un); 3499 for (ci = 0; ci < compcnt; ci++) { 3500 md_m_shared_t *shared; 3501 3502 shared = (md_m_shared_t *) 3503 (*(smic->sm_shared_by_indx))(sm->sm_dev, 3504 sm, ci); 3505 3506 if (shared->ms_state == CS_LAST_ERRED) 3507 break; 3508 } 3509 if (ci >= compcnt) 3510 cb->b_flags |= B_FAILFAST; 3511 3512 } else { 3513 cb->b_flags |= B_FAILFAST; 3514 } 3515 } 3516 3517 ps->ps_current_sm++; 3518 if (ps->ps_current_sm != ps->ps_active_cnt) { 3519 if (un->un_write_option == WR_SERIAL) { 3520 ps->ps_call = continue_serial; 3521 return (0); 3522 } 3523 return (1); 3524 } 3525 return (0); 3526 } 3527 3528 /* 3529 * directed_read_done: 3530 * ------------------ 3531 * Completion routine called when a DMR request has been returned from the 3532 * underlying driver. Wake-up the original ioctl() and return the data to 3533 * the user. 3534 */ 3535 static void 3536 directed_read_done(md_mps_t *ps) 3537 { 3538 mm_unit_t *un; 3539 mdi_unit_t *ui; 3540 3541 un = ps->ps_un; 3542 ui = ps->ps_ui; 3543 3544 md_unit_readerexit(ui); 3545 md_kstat_done(ui, ps->ps_bp, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 3546 ps->ps_call = NULL; 3547 3548 mutex_enter(&un->un_dmr_mx); 3549 cv_signal(&un->un_dmr_cv); 3550 mutex_exit(&un->un_dmr_mx); 3551 3552 /* release the parent structure */ 3553 kmem_cache_free(mirror_parent_cache, ps); 3554 } 3555 3556 /* 3557 * daemon_io: 3558 * ------------ 3559 * Called to issue a mirror_write_strategy() or mirror_read_strategy 3560 * call from a blockable context. NOTE: no mutex can be held on entry to this 3561 * routine 3562 */ 3563 static void 3564 daemon_io(daemon_queue_t *dq) 3565 { 3566 md_mps_t *ps = (md_mps_t *)dq; 3567 int flag = MD_STR_NOTTOP; 3568 buf_t *pb = ps->ps_bp; 3569 3570 if (ps->ps_flags & MD_MPS_MAPPED) 3571 flag |= MD_STR_MAPPED; 3572 if (ps->ps_flags & MD_MPS_WOW) 3573 flag |= MD_STR_WOW; 3574 if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ) 3575 flag |= MD_STR_WAR; 3576 if (ps->ps_flags & MD_MPS_ABR) 3577 flag |= MD_STR_ABR; 3578 if (ps->ps_flags & MD_MPS_BLOCKABLE_IO) 3579 flag |= MD_STR_BLOCK_OK; 3580 3581 /* 3582 * If this is a resync read, ie MD_STR_DIRTY_RD not set, set 3583 * MD_STR_WAR before calling mirror_read_strategy 3584 */ 3585 if (pb->b_flags & B_READ) { 3586 if (!(ps->ps_flags & MD_MPS_DIRTY_RD)) 3587 flag |= MD_STR_WAR; 3588 mirror_read_strategy(pb, flag, ps); 3589 } else 3590 mirror_write_strategy(pb, flag, ps); 3591 } 3592 3593 /* 3594 * update_resync: 3595 * ------------- 3596 * Called to update the in-core version of the resync record with the latest 3597 * version that was committed to disk when the previous mirror owner 3598 * relinquished ownership. This call is likely to block as we must hold-off 3599 * any current resync processing that may be occurring. 3600 * On completion of the resync record update we issue the mirror_write_strategy 3601 * call to complete the i/o that first started this sequence. To remove a race 3602 * condition between a new write() request which is submitted and the resync 3603 * record update we acquire the writerlock. This will hold off all i/o to the 3604 * mirror until the resync update has completed. 
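 * Once mirror_copy_rr() has refreshed the in-core resync/dirty
 * bitmaps the writerlock is dropped and, if the resync thread was
 * blocked waiting for the owner change (MD_RI_BLOCK_OWNER), it is
 * woken before the deferred i/o is re-driven through daemon_io().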
3605 * NOTE: no mutex can be held on entry to this routine 3606 */ 3607 static void 3608 update_resync(daemon_queue_t *dq) 3609 { 3610 md_mps_t *ps = (md_mps_t *)dq; 3611 buf_t *pb = ps->ps_bp; 3612 mdi_unit_t *ui = ps->ps_ui; 3613 mm_unit_t *un = MD_UNIT(ui->ui_link.ln_id); 3614 set_t setno; 3615 int restart_resync; 3616 3617 mutex_enter(&un->un_rrp_inflight_mx); 3618 (void) md_unit_writerlock(ui); 3619 ps->ps_un = un; 3620 setno = MD_MIN2SET(getminor(pb->b_edev)); 3621 if (mddb_reread_rr(setno, un->un_rr_dirty_recid) == 0) { 3622 /* 3623 * Synchronize our in-core view of what regions need to be 3624 * resync'd with the on-disk version. 3625 */ 3626 mirror_copy_rr(howmany(un->un_rrd_num, NBBY), un->un_resync_bm, 3627 un->un_dirty_bm); 3628 3629 /* Region dirty map is now up to date */ 3630 } 3631 restart_resync = (un->un_rs_thread_flags & MD_RI_BLOCK_OWNER) ? 1 : 0; 3632 md_unit_writerexit(ui); 3633 mutex_exit(&un->un_rrp_inflight_mx); 3634 3635 /* Restart the resync thread if it was previously blocked */ 3636 if (restart_resync) { 3637 mutex_enter(&un->un_rs_thread_mx); 3638 un->un_rs_thread_flags &= ~MD_RI_BLOCK_OWNER; 3639 cv_signal(&un->un_rs_thread_cv); 3640 mutex_exit(&un->un_rs_thread_mx); 3641 } 3642 /* Continue with original deferred i/o */ 3643 daemon_io(dq); 3644 } 3645 3646 /* 3647 * owner_timeout: 3648 * ------------- 3649 * Called if the original mdmn_ksend_message() failed and the request is to be 3650 * retried. Reattempt the original ownership change. 3651 * 3652 * NOTE: called at interrupt context (see timeout(9f)). 3653 */ 3654 static void 3655 owner_timeout(void *arg) 3656 { 3657 daemon_queue_t *dq = (daemon_queue_t *)arg; 3658 3659 daemon_request(&md_mirror_daemon, become_owner, dq, REQ_OLD); 3660 } 3661 3662 /* 3663 * become_owner: 3664 * ------------ 3665 * Called to issue RPC request to become the owner of the mirror 3666 * associated with this i/o request. We assume that the ownership request 3667 * is synchronous, so if it succeeds we will issue the request via 3668 * mirror_write_strategy(). 3669 * If multiple i/o's are outstanding we will be called from the mirror_daemon 3670 * service thread. 3671 * NOTE: no mutex should be held on entry to this routine. 3672 */ 3673 static void 3674 become_owner(daemon_queue_t *dq) 3675 { 3676 md_mps_t *ps = (md_mps_t *)dq; 3677 mm_unit_t *un = ps->ps_un; 3678 buf_t *pb = ps->ps_bp; 3679 set_t setno; 3680 md_mn_kresult_t *kres; 3681 int msg_flags = md_mirror_msg_flags; 3682 md_mps_t *ps1; 3683 3684 ASSERT(dq->dq_next == NULL && dq->dq_prev == NULL); 3685 3686 /* 3687 * If we're already the mirror owner we do not need to send a message 3688 * but can simply process the i/o request immediately. 3689 * If we've already sent the request to become owner we requeue the 3690 * request as we're waiting for the synchronous ownership message to 3691 * be processed. 3692 */ 3693 if (MD_MN_MIRROR_OWNER(un)) { 3694 /* 3695 * As the strategy() call will potentially block we need to 3696 * punt this to a separate thread and complete this request 3697 * as quickly as possible. Note: if we're a read request 3698 * this must be a resync, we cannot afford to be queued 3699 * behind any intervening i/o requests. In this case we put the 3700 * request on the md_mirror_rs_daemon queue. 
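 * Writes are queued on the md_mirror_io_daemon queue instead.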
3701 */ 3702 if (pb->b_flags & B_READ) { 3703 daemon_request(&md_mirror_rs_daemon, daemon_io, dq, 3704 REQ_OLD); 3705 } else { 3706 daemon_request(&md_mirror_io_daemon, daemon_io, dq, 3707 REQ_OLD); 3708 } 3709 } else { 3710 mutex_enter(&un->un_owner_mx); 3711 if ((un->un_owner_state & MM_MN_OWNER_SENT) == 0) { 3712 md_mn_req_owner_t *msg; 3713 int rval = 0; 3714 3715 /* 3716 * Check to see that we haven't exceeded the maximum 3717 * retry count. If we have we fail the i/o as the 3718 * comms mechanism has become wedged beyond recovery. 3719 */ 3720 if (dq->qlen++ >= MD_OWNER_RETRIES) { 3721 mutex_exit(&un->un_owner_mx); 3722 cmn_err(CE_WARN, 3723 "md_mirror: Request exhausted ownership " 3724 "retry limit of %d attempts", dq->qlen); 3725 pb->b_error = EIO; 3726 pb->b_flags |= B_ERROR; 3727 pb->b_resid = pb->b_bcount; 3728 kmem_cache_free(mirror_parent_cache, ps); 3729 md_biodone(pb); 3730 return; 3731 } 3732 3733 /* 3734 * Issue request to change ownership. The call is 3735 * synchronous so when it returns we can complete the 3736 * i/o (if successful), or enqueue it again so that 3737 * the operation will be retried. 3738 */ 3739 un->un_owner_state |= MM_MN_OWNER_SENT; 3740 mutex_exit(&un->un_owner_mx); 3741 3742 msg = kmem_zalloc(sizeof (md_mn_req_owner_t), KM_SLEEP); 3743 setno = MD_MIN2SET(getminor(pb->b_edev)); 3744 msg->mnum = MD_SID(un); 3745 msg->owner = md_mn_mynode_id; 3746 msg_flags |= MD_MSGF_NO_LOG; 3747 /* 3748 * If this IO is triggered by updating a watermark, 3749 * it might be issued by the creation of a softpartition 3750 * while the commd subsystem is suspended. 3751 * We don't want this message to block. 3752 */ 3753 if (ps->ps_flags & MD_MPS_WMUPDATE) { 3754 msg_flags |= MD_MSGF_OVERRIDE_SUSPEND; 3755 } 3756 3757 kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); 3758 rval = mdmn_ksend_message(setno, 3759 MD_MN_MSG_REQUIRE_OWNER, msg_flags, 0, 3760 (char *)msg, sizeof (md_mn_req_owner_t), kres); 3761 3762 kmem_free(msg, sizeof (md_mn_req_owner_t)); 3763 3764 if (MDMN_KSEND_MSG_OK(rval, kres)) { 3765 dq->qlen = 0; 3766 /* 3767 * Successfully changed owner, reread the 3768 * resync record so that we have a valid idea of 3769 * any previously committed incomplete write()s. 3770 * NOTE: As we need to acquire the resync mutex 3771 * this may block, so we defer it to a separate 3772 * thread handler. This makes us (effectively) 3773 * non-blocking once the ownership message 3774 * handling has completed. 3775 */ 3776 mutex_enter(&un->un_owner_mx); 3777 if (un->un_owner_state & MM_MN_BECOME_OWNER) { 3778 un->un_mirror_owner = md_mn_mynode_id; 3779 /* Sets owner of un_rr_dirty record */ 3780 if (un->un_rr_dirty_recid) 3781 (void) mddb_setowner( 3782 un->un_rr_dirty_recid, 3783 md_mn_mynode_id); 3784 un->un_owner_state &= 3785 ~MM_MN_BECOME_OWNER; 3786 /* 3787 * Release the block on the current 3788 * resync region if it is blocked 3789 */ 3790 ps1 = un->un_rs_prev_overlap; 3791 if ((ps1 != NULL) && 3792 (ps1->ps_flags & MD_MPS_ON_OVERLAP)) 3793 mirror_overlap_tree_remove(ps1); 3794 mutex_exit(&un->un_owner_mx); 3795 3796 /* 3797 * If we're a read, this must be a 3798 * resync request, issue 3799 * the i/o request on the 3800 * md_mirror_rs_daemon queue. This is 3801 * to avoid a deadlock between the 3802 * resync_unit thread and 3803 * subsequent i/o requests that may 3804 * block on the resync region. 
3805 */ 3806 if (pb->b_flags & B_READ) { 3807 daemon_request( 3808 &md_mirror_rs_daemon, 3809 update_resync, dq, REQ_OLD); 3810 } else { 3811 daemon_request( 3812 &md_mirror_io_daemon, 3813 update_resync, dq, REQ_OLD); 3814 } 3815 kmem_free(kres, 3816 sizeof (md_mn_kresult_t)); 3817 return; 3818 } else { 3819 /* 3820 * Some other node has beaten us to 3821 * obtain ownership. We need to 3822 * reschedule our ownership request 3823 */ 3824 mutex_exit(&un->un_owner_mx); 3825 } 3826 } else { 3827 mdmn_ksend_show_error(rval, kres, 3828 "MD_MN_MSG_REQUIRE_OWNER"); 3829 /* 3830 * Message transport failure is handled by the 3831 * comms layer. If the ownership change request 3832 * does not succeed we need to flag the error to 3833 * the initiator of the i/o. This is handled by 3834 * the retry logic above. As the request failed 3835 * we do not know _who_ the owner of the mirror 3836 * currently is. We reset our idea of the owner 3837 * to None so that any further write()s will 3838 * attempt to become the owner again. This stops 3839 * multiple nodes writing to the same mirror 3840 * simultaneously. 3841 */ 3842 mutex_enter(&un->un_owner_mx); 3843 un->un_owner_state &= 3844 ~(MM_MN_OWNER_SENT|MM_MN_BECOME_OWNER); 3845 un->un_mirror_owner = MD_MN_MIRROR_UNOWNED; 3846 mutex_exit(&un->un_owner_mx); 3847 } 3848 kmem_free(kres, sizeof (md_mn_kresult_t)); 3849 } else 3850 mutex_exit(&un->un_owner_mx); 3851 3852 /* 3853 * Re-enqueue this request on the deferred i/o list. Delay the 3854 * request for md_mirror_owner_to usecs to stop thrashing. 3855 */ 3856 (void) timeout(owner_timeout, dq, 3857 drv_usectohz(md_mirror_owner_to)); 3858 } 3859 } 3860 3861 static void 3862 mirror_write_strategy(buf_t *pb, int flag, void *private) 3863 { 3864 md_mps_t *ps; 3865 md_mcs_t *cs; 3866 int more; 3867 mm_unit_t *un; 3868 mdi_unit_t *ui; 3869 buf_t *cb; /* child buf pointer */ 3870 set_t setno; 3871 int rs_on_overlap = 0; 3872 3873 ui = MDI_UNIT(getminor(pb->b_edev)); 3874 un = (mm_unit_t *)MD_UNIT(getminor(pb->b_edev)); 3875 3876 3877 md_kstat_waitq_enter(ui); 3878 3879 /* 3880 * If a state change is in progress for this mirror in a MN set, 3881 * suspend all non-resync writes until the state change is complete. 3882 * The objective of this suspend is to ensure that it is not 3883 * possible for one node to read data from a submirror that another node 3884 * has not written to because of the state change. Therefore we 3885 * suspend all writes until the state change has been made. As it is 3886 * not possible to read from the target of a resync, there is no need 3887 * to suspend resync writes. 3888 * Note that we only block here if the caller can handle a busy-wait. 3889 * The MD_STR_BLOCK_OK flag is set for daemon_io originated i/o only. 
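 * Callers that cannot busy-wait are instead re-dispatched through
 * daemon_io() (see the suspended-write check further down), which
 * re-enters this routine with MD_STR_BLOCK_OK set so that the wait
 * just below is safe.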
3890 */ 3891 3892 if (!(flag & MD_STR_WAR)) { 3893 if (flag & MD_STR_BLOCK_OK) { 3894 mutex_enter(&un->un_suspend_wr_mx); 3895 while (un->un_suspend_wr_flag) { 3896 cv_wait(&un->un_suspend_wr_cv, 3897 &un->un_suspend_wr_mx); 3898 } 3899 mutex_exit(&un->un_suspend_wr_mx); 3900 } 3901 (void) md_unit_readerlock(ui); 3902 } 3903 3904 if (!(flag & MD_STR_NOTTOP)) { 3905 if (md_checkbuf(ui, (md_unit_t *)un, pb)) { 3906 md_kstat_waitq_exit(ui); 3907 return; 3908 } 3909 } 3910 3911 setno = MD_MIN2SET(getminor(pb->b_edev)); 3912 3913 /* If an ABR write has been requested, set MD_STR_ABR flag */ 3914 if (MD_MNSET_SETNO(setno) && (pb->b_flags & B_ABRWRITE)) 3915 flag |= MD_STR_ABR; 3916 3917 if (private == NULL) { 3918 ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS); 3919 mirror_parent_init(ps); 3920 } else { 3921 ps = private; 3922 private = NULL; 3923 } 3924 if (flag & MD_STR_MAPPED) 3925 ps->ps_flags |= MD_MPS_MAPPED; 3926 3927 if (flag & MD_STR_WOW) 3928 ps->ps_flags |= MD_MPS_WOW; 3929 3930 if (flag & MD_STR_ABR) 3931 ps->ps_flags |= MD_MPS_ABR; 3932 3933 if (flag & MD_STR_WMUPDATE) 3934 ps->ps_flags |= MD_MPS_WMUPDATE; 3935 3936 /* 3937 * Save essential information from the original buffhdr 3938 * in the md_save structure. 3939 */ 3940 ps->ps_un = un; 3941 ps->ps_ui = ui; 3942 ps->ps_bp = pb; 3943 ps->ps_addr = pb->b_un.b_addr; 3944 ps->ps_firstblk = pb->b_lblkno; 3945 ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1; 3946 ps->ps_changecnt = un->un_changecnt; 3947 3948 /* 3949 * Check for suspended writes here. This is where we can defer the 3950 * write request to the daemon_io queue which will then call us with 3951 * the MD_STR_BLOCK_OK flag set and we'll busy-wait (if necessary) at 3952 * the top of this routine. 3953 */ 3954 if (!(flag & MD_STR_WAR) && !(flag & MD_STR_BLOCK_OK)) { 3955 mutex_enter(&un->un_suspend_wr_mx); 3956 if (un->un_suspend_wr_flag) { 3957 ps->ps_flags |= MD_MPS_BLOCKABLE_IO; 3958 mutex_exit(&un->un_suspend_wr_mx); 3959 md_unit_readerexit(ui); 3960 daemon_request(&md_mirror_daemon, daemon_io, 3961 (daemon_queue_t *)ps, REQ_OLD); 3962 return; 3963 } 3964 mutex_exit(&un->un_suspend_wr_mx); 3965 } 3966 3967 /* 3968 * If not MN owner and this is an ABR write, make sure the current 3969 * resync region is in the overlaps tree 3970 */ 3971 mutex_enter(&un->un_owner_mx); 3972 if (MD_MNSET_SETNO(setno) && (!(MD_MN_MIRROR_OWNER(un))) && 3973 ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) { 3974 md_mps_t *ps1; 3975 /* Block the current resync region, if not already blocked */ 3976 ps1 = un->un_rs_prev_overlap; 3977 3978 if ((ps1 != NULL) && ((ps1->ps_firstblk != 0) || 3979 (ps1->ps_lastblk != 0))) { 3980 /* Drop locks to avoid deadlock */ 3981 mutex_exit(&un->un_owner_mx); 3982 md_unit_readerexit(ui); 3983 wait_for_overlaps(ps1, MD_OVERLAP_ALLOW_REPEAT); 3984 rs_on_overlap = 1; 3985 (void) md_unit_readerlock(ui); 3986 mutex_enter(&un->un_owner_mx); 3987 /* 3988 * Check to see if we have obtained ownership 3989 * while waiting for overlaps. 
If we have, remove
3990 * the resync_region entry from the overlap tree
3991 */
3992 if (MD_MN_MIRROR_OWNER(un) &&
3993 (ps1->ps_flags & MD_MPS_ON_OVERLAP)) {
3994 mirror_overlap_tree_remove(ps1);
3995 rs_on_overlap = 0;
3996 }
3997 }
3998 }
3999 mutex_exit(&un->un_owner_mx);
4000
4001
4002 /*
4003 * The following keeps a write-after-read from writing back to the
4004 * source in the case where it all came from one place.
4005 */
4006 if (flag & MD_STR_WAR) {
4007 int abort_write = 0;
4008 /*
4009 * We are performing a write-after-read. This is either as a
4010 * result of a resync read or as a result of a read in a
4011 * dirty resync region when the optimized resync is not
4012 * complete. If this is a MN set and a resync-generated i/o,
4013 * and the current block is not in the current
4014 * resync region, terminate the write as another node must have
4015 * completed this resync region.
4016 */
4017 if ((MD_MNSET_SETNO(MD_UN2SET(un))) &&
4018 (!(flag & MD_STR_DIRTY_RD))) {
4019 if (!IN_RESYNC_REGION(un, ps))
4020 abort_write = 1;
4021 }
4022 if ((select_write_after_read_units(un, ps) == 0) ||
4023 (abort_write)) {
4024 #ifdef DEBUG
4025 if (mirror_debug_flag)
4026 printf("Abort resync write on %x, block %lld\n",
4027 MD_SID(un), ps->ps_firstblk);
4028 #endif
4029 if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4030 mirror_overlap_tree_remove(ps);
4031 kmem_cache_free(mirror_parent_cache, ps);
4032 md_kstat_waitq_exit(ui);
4033 md_unit_readerexit(ui);
4034 md_biodone(pb);
4035 return;
4036 }
4037 } else {
4038 select_write_units(un, ps);
4039
4040 /* Drop readerlock to avoid deadlock */
4041 md_unit_readerexit(ui);
4042 wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT);
4043 un = md_unit_readerlock(ui);
4044 /*
4045 * For a MN set with an ABR write, if we are now the
4046 * owner and we have a resync region in the overlap
4047 * tree, remove the entry from overlaps and retry the write.
4048 */
4049
4050 if (MD_MNSET_SETNO(setno) &&
4051 ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) {
4052 mutex_enter(&un->un_owner_mx);
4053 if (((MD_MN_MIRROR_OWNER(un))) && rs_on_overlap) {
4054 mirror_overlap_tree_remove(ps);
4055 md_kstat_waitq_exit(ui);
4056 mutex_exit(&un->un_owner_mx);
4057 md_unit_readerexit(ui);
4058 daemon_request(&md_mirror_daemon, daemon_io,
4059 (daemon_queue_t *)ps, REQ_OLD);
4060 return;
4061 }
4062 mutex_exit(&un->un_owner_mx);
4063 }
4064 }
4065
4066 /*
4067 * For Multinode mirrors with no owner and a Resync Region (not ABR)
4068 * we need to become the mirror owner before continuing with the
4069 * write(). For ABR mirrors we check that we 'own' the resync if
4070 * we're in write-after-read mode. We do this _after_ ensuring that
4071 * there are no overlaps to ensure that once we know that we are
4072 * the owner, the readerlock will not be released until the write is
4073 * complete. As a change of ownership in a MN set requires the
4074 * writerlock, this ensures that ownership cannot be changed until
4075 * the write is complete.
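 * If we are not currently the owner, the request is handed to
 * become_owner() on the md_mirror_daemon queue and this routine
 * returns; the write is re-driven once ownership has been obtained.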
4076 */ 4077 if (MD_MNSET_SETNO(setno) && (!((ui->ui_tstate & MD_ABR_CAP) || 4078 (flag & MD_STR_ABR)) || (flag & MD_STR_WAR))) { 4079 if (MD_MN_NO_MIRROR_OWNER(un)) { 4080 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 4081 mirror_overlap_tree_remove(ps); 4082 md_kstat_waitq_exit(ui); 4083 ASSERT(!(flag & MD_STR_WAR)); 4084 md_unit_readerexit(ui); 4085 daemon_request(&md_mirror_daemon, become_owner, 4086 (daemon_queue_t *)ps, REQ_OLD); 4087 return; 4088 } 4089 } 4090 4091 /* 4092 * Mark resync region if mirror has a Resync Region _and_ we are not 4093 * a resync initiated write(). Don't mark region if we're flagged as 4094 * an ABR write. 4095 */ 4096 if (!((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR)) && 4097 !(flag & MD_STR_WAR)) { 4098 if (mirror_mark_resync_region(un, ps->ps_firstblk, 4099 ps->ps_lastblk, md_mn_mynode_id)) { 4100 pb->b_flags |= B_ERROR; 4101 pb->b_resid = pb->b_bcount; 4102 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 4103 mirror_overlap_tree_remove(ps); 4104 kmem_cache_free(mirror_parent_cache, ps); 4105 md_kstat_waitq_exit(ui); 4106 md_unit_readerexit(ui); 4107 md_biodone(pb); 4108 return; 4109 } 4110 } 4111 4112 ps->ps_childbflags = pb->b_flags | B_WRITE; 4113 ps->ps_childbflags &= ~B_READ; 4114 if (flag & MD_STR_MAPPED) 4115 ps->ps_childbflags &= ~B_PAGEIO; 4116 4117 if (!(flag & MD_STR_NOTTOP) && panicstr) 4118 /* Disable WOW and don't free ps */ 4119 ps->ps_flags |= (MD_MPS_WOW|MD_MPS_DONTFREE); 4120 4121 md_kstat_waitq_to_runq(ui); 4122 4123 /* 4124 * Treat Raw and Direct I/O as Write-on-Write always 4125 */ 4126 4127 if (!(md_mirror_wow_flg & WOW_DISABLE) && 4128 (md_mirror_wow_flg & WOW_PHYS_ENABLE) && 4129 (pb->b_flags & B_PHYS) && 4130 !(ps->ps_flags & MD_MPS_WOW)) { 4131 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 4132 mirror_overlap_tree_remove(ps); 4133 md_unit_readerexit(ui); 4134 daemon_request(&md_mstr_daemon, handle_wow, 4135 (daemon_queue_t *)ps, REQ_OLD); 4136 return; 4137 } 4138 4139 ps->ps_frags = 1; 4140 do { 4141 cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS); 4142 mirror_child_init(cs); 4143 cb = &cs->cs_buf; 4144 more = mirror_map_write(un, cs, ps, (flag & MD_STR_WAR)); 4145 4146 /* 4147 * This handles the case where we're requesting 4148 * a write to block 0 on a label partition. (more < 0) 4149 * means that the request size was smaller than the 4150 * size of the label. If so this request is done. 
4151 */ 4152 if (more < 0) { 4153 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 4154 mirror_overlap_tree_remove(ps); 4155 md_kstat_runq_exit(ui); 4156 kmem_cache_free(mirror_child_cache, cs); 4157 kmem_cache_free(mirror_parent_cache, ps); 4158 md_unit_readerexit(ui); 4159 md_biodone(pb); 4160 return; 4161 } 4162 if (more) { 4163 mutex_enter(&ps->ps_mx); 4164 ps->ps_frags++; 4165 mutex_exit(&ps->ps_mx); 4166 } 4167 md_call_strategy(cb, flag, private); 4168 } while (more); 4169 4170 if (!(flag & MD_STR_NOTTOP) && panicstr) { 4171 while (!(ps->ps_flags & MD_MPS_DONE)) { 4172 md_daemon(1, &md_done_daemon); 4173 drv_usecwait(10); 4174 } 4175 kmem_cache_free(mirror_parent_cache, ps); 4176 } 4177 } 4178 4179 static void 4180 mirror_read_strategy(buf_t *pb, int flag, void *private) 4181 { 4182 md_mps_t *ps; 4183 md_mcs_t *cs; 4184 size_t more; 4185 mm_unit_t *un; 4186 mdi_unit_t *ui; 4187 size_t current_count; 4188 diskaddr_t current_blkno; 4189 off_t current_offset; 4190 buf_t *cb; /* child buf pointer */ 4191 set_t setno; 4192 4193 ui = MDI_UNIT(getminor(pb->b_edev)); 4194 4195 md_kstat_waitq_enter(ui); 4196 4197 un = (mm_unit_t *)md_unit_readerlock(ui); 4198 4199 if (!(flag & MD_STR_NOTTOP)) { 4200 if (md_checkbuf(ui, (md_unit_t *)un, pb)) { 4201 md_kstat_waitq_exit(ui); 4202 return; 4203 } 4204 } 4205 4206 if (private == NULL) { 4207 ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS); 4208 mirror_parent_init(ps); 4209 } else { 4210 ps = private; 4211 private = NULL; 4212 } 4213 4214 if (flag & MD_STR_MAPPED) 4215 ps->ps_flags |= MD_MPS_MAPPED; 4216 if (flag & MD_NOBLOCK) 4217 ps->ps_flags |= MD_MPS_NOBLOCK; 4218 if (flag & MD_STR_WMUPDATE) 4219 ps->ps_flags |= MD_MPS_WMUPDATE; 4220 4221 /* 4222 * Check to see if this is a DMR driven read. If so we need to use the 4223 * specified side (in un->un_dmr_last_read) for the source of the data. 4224 */ 4225 if (flag & MD_STR_DMR) 4226 ps->ps_flags |= MD_MPS_DMR; 4227 4228 /* 4229 * Save essential information from the original buffhdr 4230 * in the md_save structure. 4231 */ 4232 ps->ps_un = un; 4233 ps->ps_ui = ui; 4234 ps->ps_bp = pb; 4235 ps->ps_addr = pb->b_un.b_addr; 4236 ps->ps_firstblk = pb->b_lblkno; 4237 ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1; 4238 ps->ps_changecnt = un->un_changecnt; 4239 4240 current_count = btodb(pb->b_bcount); 4241 current_blkno = pb->b_lblkno; 4242 current_offset = 0; 4243 4244 /* 4245 * If flag has MD_STR_WAR set this means that the read is issued by a 4246 * resync thread which may or may not be an optimised resync. 4247 * 4248 * If MD_UN_OPT_NOT_DONE is set this means that the optimized resync 4249 * code has not completed; either a resync has not started since snarf, 4250 * or there is an optimized resync in progress. 4251 * 4252 * We need to generate a write after this read in the following two 4253 * cases, 4254 * 4255 * 1. Any Resync-Generated read 4256 * 4257 * 2. Any read to a DIRTY REGION if there is an optimized resync 4258 * pending or in progress. 4259 * 4260 * The write after read is done in these cases to ensure that all sides 4261 * of the mirror are in sync with the read data and that it is not 4262 * possible for an application to read the same block multiple times 4263 * and get different data. 4264 * 4265 * This would be possible if the block was in a dirty region. 4266 * 4267 * If we're performing a directed read we don't write the data out as 4268 * the application is responsible for restoring the mirror to a known 4269 * state. 
4270 */ 4271 if (((MD_STATUS(un) & MD_UN_OPT_NOT_DONE) || (flag & MD_STR_WAR)) && 4272 !(flag & MD_STR_DMR)) { 4273 size_t start_rr, i, end_rr; 4274 int region_dirty = 1; 4275 4276 /* 4277 * We enter here under three circumstances, 4278 * 4279 * MD_UN_OPT_NOT_DONE MD_STR_WAR 4280 * 0 1 4281 * 1 0 4282 * 1 1 4283 * 4284 * To be optimal we only care to explicitly check for dirty 4285 * regions in the second case since if MD_STR_WAR is set we 4286 * always do the write after read. 4287 */ 4288 if (!(flag & MD_STR_WAR)) { 4289 BLK_TO_RR(end_rr, ps->ps_lastblk, un); 4290 BLK_TO_RR(start_rr, ps->ps_firstblk, un); 4291 4292 for (i = start_rr; i <= end_rr; i++) 4293 if ((region_dirty = IS_KEEPDIRTY(i, un)) != 0) 4294 break; 4295 } 4296 4297 if ((region_dirty) && 4298 !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) { 4299 ps->ps_call = write_after_read; 4300 /* 4301 * Mark this as a RESYNC_READ in ps_flags. 4302 * This is used if the read fails during a 4303 * resync of a 3-way mirror to ensure that 4304 * the retried read to the remaining 4305 * good submirror has MD_STR_WAR set. This 4306 * is needed to ensure that the resync write 4307 * (write-after-read) takes place. 4308 */ 4309 ps->ps_flags |= MD_MPS_RESYNC_READ; 4310 4311 /* 4312 * If MD_STR_FLAG_ERR is set in the flags we 4313 * set MD_MPS_FLAG_ERROR so that an error on the resync 4314 * write (issued by write_after_read) will be flagged 4315 * to the biowait'ing resync thread. This allows us to 4316 * avoid issuing further resync requests to a device 4317 * that has had a write failure. 4318 */ 4319 if (flag & MD_STR_FLAG_ERR) 4320 ps->ps_flags |= MD_MPS_FLAG_ERROR; 4321 4322 setno = MD_UN2SET(un); 4323 /* 4324 * Drop the readerlock to avoid 4325 * deadlock 4326 */ 4327 md_unit_readerexit(ui); 4328 wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT); 4329 un = md_unit_readerlock(ui); 4330 /* 4331 * Ensure that we are owner 4332 */ 4333 if (MD_MNSET_SETNO(setno)) { 4334 /* 4335 * For a non-resync read that requires a 4336 * write-after-read to be done, set a flag 4337 * in the parent structure, so that the 4338 * write_strategy routine can omit the 4339 * test that the write is still within the 4340 * resync region 4341 */ 4342 if (!(flag & MD_STR_WAR)) 4343 ps->ps_flags |= MD_MPS_DIRTY_RD; 4344 4345 /* 4346 * Before reading the buffer, see if 4347 * there is an owner. 4348 */ 4349 if (MD_MN_NO_MIRROR_OWNER(un)) { 4350 ps->ps_call = NULL; 4351 mirror_overlap_tree_remove(ps); 4352 md_kstat_waitq_exit(ui); 4353 md_unit_readerexit(ui); 4354 daemon_request( 4355 &md_mirror_daemon, 4356 become_owner, 4357 (daemon_queue_t *)ps, 4358 REQ_OLD); 4359 return; 4360 } 4361 /* 4362 * For a resync read, check to see if I/O is 4363 * outside of the current resync region, or 4364 * the resync has finished. 
If so 4365 * just terminate the I/O 4366 */ 4367 if ((flag & MD_STR_WAR) && 4368 (!(un->c.un_status & MD_UN_WAR) || 4369 (!IN_RESYNC_REGION(un, ps)))) { 4370 #ifdef DEBUG 4371 if (mirror_debug_flag) 4372 printf("Abort resync read " 4373 "%x: %lld\n", 4374 MD_SID(un), 4375 ps->ps_firstblk); 4376 #endif 4377 mirror_overlap_tree_remove(ps); 4378 kmem_cache_free(mirror_parent_cache, 4379 ps); 4380 md_kstat_waitq_exit(ui); 4381 md_unit_readerexit(ui); 4382 md_biodone(pb); 4383 return; 4384 } 4385 } 4386 } 4387 } 4388 4389 if (flag & MD_STR_DMR) { 4390 ps->ps_call = directed_read_done; 4391 } 4392 4393 if (!(flag & MD_STR_NOTTOP) && panicstr) 4394 ps->ps_flags |= MD_MPS_DONTFREE; 4395 4396 md_kstat_waitq_to_runq(ui); 4397 4398 ps->ps_frags++; 4399 do { 4400 cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS); 4401 mirror_child_init(cs); 4402 cb = &cs->cs_buf; 4403 cs->cs_ps = ps; 4404 4405 cb = md_bioclone(pb, current_offset, current_count, NODEV, 4406 current_blkno, mirror_done, cb, KM_NOSLEEP); 4407 4408 more = mirror_map_read(ps, cs, current_blkno, 4409 (u_longlong_t)current_count); 4410 if (more) { 4411 mutex_enter(&ps->ps_mx); 4412 ps->ps_frags++; 4413 mutex_exit(&ps->ps_mx); 4414 } 4415 4416 /* 4417 * Do these calculations now, 4418 * so that we pickup a valid b_bcount from the chld_bp. 4419 */ 4420 current_count -= more; 4421 current_offset += cb->b_bcount; 4422 current_blkno += more; 4423 md_call_strategy(cb, flag, private); 4424 } while (more); 4425 4426 if (!(flag & MD_STR_NOTTOP) && panicstr) { 4427 while (!(ps->ps_flags & MD_MPS_DONE)) { 4428 md_daemon(1, &md_done_daemon); 4429 drv_usecwait(10); 4430 } 4431 kmem_cache_free(mirror_parent_cache, ps); 4432 } 4433 } 4434 4435 void 4436 md_mirror_strategy(buf_t *bp, int flag, void *private) 4437 { 4438 set_t setno = MD_MIN2SET(getminor(bp->b_edev)); 4439 4440 /* 4441 * When doing IO to a multi owner meta device, check if set is halted. 4442 * We do this check without the needed lock held, for performance 4443 * reasons. 4444 * If an IO just slips through while the set is locked via an 4445 * MD_MN_SUSPEND_SET, we don't care about it. 4446 * Only check for suspension if we are a top-level i/o request 4447 * (MD_STR_NOTTOP is cleared in 'flag'). 4448 */ 4449 if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) == 4450 (MD_SET_HALTED | MD_SET_MNSET)) { 4451 if ((flag & MD_STR_NOTTOP) == 0) { 4452 mutex_enter(&md_mx); 4453 /* Here we loop until the set is no longer halted */ 4454 while (md_set[setno].s_status & MD_SET_HALTED) { 4455 cv_wait(&md_cv, &md_mx); 4456 } 4457 mutex_exit(&md_mx); 4458 } 4459 } 4460 4461 if ((flag & MD_IO_COUNTED) == 0) { 4462 if ((flag & MD_NOBLOCK) == 0) { 4463 if (md_inc_iocount(setno) != 0) { 4464 bp->b_flags |= B_ERROR; 4465 bp->b_error = ENXIO; 4466 bp->b_resid = bp->b_bcount; 4467 biodone(bp); 4468 return; 4469 } 4470 } else { 4471 md_inc_iocount_noblock(setno); 4472 } 4473 } 4474 4475 if (bp->b_flags & B_READ) 4476 mirror_read_strategy(bp, flag, private); 4477 else 4478 mirror_write_strategy(bp, flag, private); 4479 } 4480 4481 /* 4482 * mirror_directed_read: 4483 * -------------------- 4484 * Entry-point for the DKIOCDMR ioctl. We issue a read to a specified sub-mirror 4485 * so that the application can determine what (if any) resync needs to be 4486 * performed. The data is copied out to the user-supplied buffer. 
4487 * 4488 * Parameters: 4489 * mdev - dev_t for the mirror device 4490 * vdr - directed read parameters specifying location and submirror 4491 * to perform the read from 4492 * mode - used to ddi_copyout() any resulting data from the read 4493 * 4494 * Returns: 4495 * 0 success 4496 * !0 error code 4497 * EINVAL - invalid request format 4498 */ 4499 int 4500 mirror_directed_read(dev_t mdev, vol_directed_rd_t *vdr, int mode) 4501 { 4502 buf_t *bp; 4503 minor_t mnum = getminor(mdev); 4504 mdi_unit_t *ui = MDI_UNIT(mnum); 4505 mm_unit_t *un; 4506 mm_submirror_t *sm; 4507 char *sm_nm; 4508 uint_t next_side; 4509 void *kbuffer; 4510 4511 if (ui == NULL) 4512 return (ENXIO); 4513 4514 if (!(vdr->vdr_flags & DKV_DMR_NEXT_SIDE)) { 4515 return (EINVAL); 4516 } 4517 4518 /* Check for aligned block access. We disallow non-aligned requests. */ 4519 if (vdr->vdr_offset % DEV_BSIZE) { 4520 return (EINVAL); 4521 } 4522 4523 /* 4524 * Allocate kernel buffer for target of read(). If we had a reliable 4525 * (sorry functional) DDI this wouldn't be needed. 4526 */ 4527 kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP); 4528 if (kbuffer == NULL) { 4529 cmn_err(CE_WARN, "mirror_directed_read: couldn't allocate %lx" 4530 " bytes\n", vdr->vdr_nbytes); 4531 return (ENOMEM); 4532 } 4533 4534 bp = getrbuf(KM_SLEEP); 4535 4536 bp->b_un.b_addr = kbuffer; 4537 bp->b_flags = B_READ; 4538 bp->b_bcount = vdr->vdr_nbytes; 4539 bp->b_lblkno = lbtodb(vdr->vdr_offset); 4540 bp->b_edev = mdev; 4541 4542 un = md_unit_readerlock(ui); 4543 4544 /* 4545 * If DKV_SIDE_INIT is set we need to determine the first available 4546 * side to start reading from. If it isn't set we increment to the 4547 * next readable submirror. 4548 * If there are no readable submirrors we error out with DKV_DMR_ERROR. 4549 * Note: we check for a readable submirror on completion of the i/o so 4550 * we should _always_ have one available. If this becomes unavailable 4551 * we have missed the 'DKV_DMR_DONE' opportunity. This could happen if 4552 * a metadetach is made between the completion of one DKIOCDMR ioctl 4553 * and the start of the next (i.e. a sys-admin 'accident' occurred). 4554 * The chance of this is small, but not non-existent. 4555 */ 4556 if (vdr->vdr_side == DKV_SIDE_INIT) { 4557 next_side = 0; 4558 } else { 4559 next_side = vdr->vdr_side + 1; 4560 } 4561 while ((next_side < NMIRROR) && 4562 !SUBMIRROR_IS_READABLE(un, next_side)) 4563 next_side++; 4564 if (next_side >= NMIRROR) { 4565 vdr->vdr_flags |= DKV_DMR_ERROR; 4566 freerbuf(bp); 4567 vdr->vdr_bytesread = 0; 4568 md_unit_readerexit(ui); 4569 return (0); 4570 } 4571 4572 /* Set the side to read from */ 4573 un->un_dmr_last_read = next_side; 4574 4575 md_unit_readerexit(ui); 4576 4577 /* 4578 * Save timestamp for verification purposes. Can be read by debugger 4579 * to verify that this ioctl has been executed and to find the number 4580 * of DMR reads and the time of the last DMR read. 4581 */ 4582 uniqtime(&mirror_dmr_stats.dmr_timestamp); 4583 mirror_dmr_stats.dmr_count++; 4584 4585 /* Issue READ request and wait for completion */ 4586 mirror_read_strategy(bp, MD_STR_DMR|MD_NOBLOCK|MD_STR_NOTTOP, NULL); 4587 4588 mutex_enter(&un->un_dmr_mx); 4589 cv_wait(&un->un_dmr_cv, &un->un_dmr_mx); 4590 mutex_exit(&un->un_dmr_mx); 4591 4592 /* 4593 * Check to see if we encountered an error during the read. If so we 4594 * can make no guarantee about any possibly returned data. 
4595 */ 4596 if ((bp->b_flags & B_ERROR) == 0) { 4597 vdr->vdr_flags &= ~DKV_DMR_ERROR; 4598 if (bp->b_resid) { 4599 vdr->vdr_flags |= DKV_DMR_SHORT; 4600 vdr->vdr_bytesread = vdr->vdr_nbytes - bp->b_resid; 4601 } else { 4602 vdr->vdr_flags |= DKV_DMR_SUCCESS; 4603 vdr->vdr_bytesread = vdr->vdr_nbytes; 4604 } 4605 /* Copy the data read back out to the user supplied buffer */ 4606 if (ddi_copyout(kbuffer, vdr->vdr_data, vdr->vdr_bytesread, 4607 mode)) { 4608 kmem_free(kbuffer, vdr->vdr_nbytes); 4609 return (EFAULT); 4610 } 4611 4612 } else { 4613 /* Error out with DKV_DMR_ERROR */ 4614 vdr->vdr_flags |= DKV_DMR_ERROR; 4615 vdr->vdr_flags &= ~(DKV_DMR_SUCCESS|DKV_DMR_SHORT|DKV_DMR_DONE); 4616 } 4617 /* 4618 * Update the DMR parameters with the side and name of submirror that 4619 * we have just read from (un->un_dmr_last_read) 4620 */ 4621 un = md_unit_readerlock(ui); 4622 4623 vdr->vdr_side = un->un_dmr_last_read; 4624 sm = &un->un_sm[un->un_dmr_last_read]; 4625 sm_nm = md_shortname(md_getminor(sm->sm_dev)); 4626 4627 (void) strncpy(vdr->vdr_side_name, sm_nm, sizeof (vdr->vdr_side_name)); 4628 4629 /* 4630 * Determine if we've completed the read cycle. This is true iff the 4631 * next computed submirror (side) equals or exceeds NMIRROR. We cannot 4632 * use un_nsm as we need to handle a sparse array of submirrors (which 4633 * can occur if a submirror is metadetached). 4634 */ 4635 next_side = un->un_dmr_last_read + 1; 4636 while ((next_side < NMIRROR) && 4637 !SUBMIRROR_IS_READABLE(un, next_side)) 4638 next_side++; 4639 if (next_side >= NMIRROR) { 4640 /* We've finished */ 4641 vdr->vdr_flags |= DKV_DMR_DONE; 4642 } 4643 4644 md_unit_readerexit(ui); 4645 freerbuf(bp); 4646 kmem_free(kbuffer, vdr->vdr_nbytes); 4647 4648 return (0); 4649 } 4650 4651 /* 4652 * mirror_resync_message: 4653 * --------------------- 4654 * Handle the multi-node resync messages that keep all nodes within a given 4655 * disk-set in sync with their view of a mirror's resync status. 4656 * 4657 * The message types dealt with are: 4658 * MD_MN_MSG_RESYNC_STARTING - start a resync thread for a unit 4659 * MD_MN_MSG_RESYNC_NEXT - specified next region to be resynced 4660 * MD_MN_MSG_RESYNC_FINISH - stop the resync thread for a unit 4661 * MD_MN_MSG_RESYNC_PHASE_DONE - end of a resync phase, opt, submirror or comp 4662 * 4663 * Returns: 4664 * 0 Success 4665 * >0 Failure error number 4666 */ 4667 int 4668 mirror_resync_message(md_mn_rs_params_t *p, IOLOCK *lockp) 4669 { 4670 mdi_unit_t *ui; 4671 mm_unit_t *un; 4672 set_t setno; 4673 int is_ABR; 4674 int smi; 4675 int ci; 4676 sm_state_t state; 4677 int broke_out; 4678 mm_submirror_t *sm; 4679 mm_submirror_ic_t *smic; 4680 md_m_shared_t *shared; 4681 md_error_t mde = mdnullerror; 4682 md_mps_t *ps; 4683 int rs_active; 4684 int rr, rr_start, rr_end; 4685 4686 /* Check that the given device is part of a multi-node set */ 4687 setno = MD_MIN2SET(p->mnum); 4688 if (setno >= md_nsets) { 4689 return (ENXIO); 4690 } 4691 if (!MD_MNSET_SETNO(setno)) { 4692 return (EINVAL); 4693 } 4694 4695 if ((un = mirror_getun(p->mnum, &p->mde, NO_LOCK, NULL)) == NULL) 4696 return (EINVAL); 4697 if ((ui = MDI_UNIT(p->mnum)) == NULL) 4698 return (EINVAL); 4699 is_ABR = (ui->ui_tstate & MD_ABR_CAP); 4700 4701 /* Obtain the current resync status */ 4702 (void) md_ioctl_readerlock(lockp, ui); 4703 rs_active = (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ? 
1 : 0; 4704 md_ioctl_readerexit(lockp); 4705 4706 switch ((md_mn_msgtype_t)p->msg_type) { 4707 case MD_MN_MSG_RESYNC_STARTING: 4708 /* Start the resync thread for the mirror */ 4709 (void) mirror_resync_unit(p->mnum, NULL, &p->mde, lockp); 4710 break; 4711 4712 case MD_MN_MSG_RESYNC_NEXT: 4713 /* 4714 * We have to release any previously marked overlap regions 4715 * so that i/o can resume. Then we need to block the region 4716 * from [rs_start..rs_start+rs_size) * so that no i/o is issued. 4717 * Update un_rs_resync_done and un_rs_resync_2_do. 4718 */ 4719 (void) md_ioctl_readerlock(lockp, ui); 4720 /* 4721 * Ignore the message if there is no active resync thread or 4722 * if it is for a resync type that we have already completed. 4723 * un_resync_completed is set to the last resync completed 4724 * when processing a PHASE_DONE message. 4725 */ 4726 if (!rs_active || (p->rs_type == un->un_resync_completed)) 4727 break; 4728 /* 4729 * If this message is for the same resync and is for an earlier 4730 * resync region, just ignore it. This can only occur if this 4731 * node has progressed on to the next resync region before 4732 * we receive this message. This can occur if the class for 4733 * this message is busy and the originator has to retry thus 4734 * allowing this node to move onto the next resync_region. 4735 */ 4736 if ((p->rs_type == un->un_rs_type) && 4737 (p->rs_start < un->un_resync_startbl)) 4738 break; 4739 ps = un->un_rs_prev_overlap; 4740 4741 /* Allocate previous overlap reference if needed */ 4742 if (ps == NULL) { 4743 ps = kmem_cache_alloc(mirror_parent_cache, 4744 MD_ALLOCFLAGS); 4745 ps->ps_un = un; 4746 ps->ps_ui = ui; 4747 ps->ps_firstblk = 0; 4748 ps->ps_lastblk = 0; 4749 ps->ps_flags = 0; 4750 md_ioctl_readerexit(lockp); 4751 (void) md_ioctl_writerlock(lockp, ui); 4752 un->un_rs_prev_overlap = ps; 4753 md_ioctl_writerexit(lockp); 4754 } else 4755 md_ioctl_readerexit(lockp); 4756 4757 if (p->rs_originator != md_mn_mynode_id) { 4758 /* 4759 * Clear our un_resync_bm for the regions completed. 4760 * The owner (originator) will take care of itself. 4761 */ 4762 BLK_TO_RR(rr_end, ps->ps_lastblk, un); 4763 BLK_TO_RR(rr_start, p->rs_start, un); 4764 if (ps->ps_lastblk && rr_end < rr_start) { 4765 BLK_TO_RR(rr_start, ps->ps_firstblk, un); 4766 mutex_enter(&un->un_resync_mx); 4767 /* 4768 * Update our resync bitmap to reflect that 4769 * another node has synchronized this range. 4770 */ 4771 for (rr = rr_start; rr <= rr_end; rr++) { 4772 CLR_KEEPDIRTY(rr, un); 4773 } 4774 mutex_exit(&un->un_resync_mx); 4775 } 4776 4777 /* 4778 * On all but the originating node, first update 4779 * the resync state, then unblock the previous 4780 * region and block the next one. No need 4781 * to do this if the region is already blocked. 4782 * Update the submirror state and flags from the 4783 * originator. This keeps the cluster in sync with 4784 * regards to the resync status. 
4785 */ 4786 4787 (void) md_ioctl_writerlock(lockp, ui); 4788 un->un_rs_resync_done = p->rs_done; 4789 un->un_rs_resync_2_do = p->rs_2_do; 4790 un->un_rs_type = p->rs_type; 4791 un->un_resync_startbl = p->rs_start; 4792 md_ioctl_writerexit(lockp); 4793 /* 4794 * Use un_owner_mx to ensure that an ownership change 4795 * cannot happen at the same time as this message 4796 */ 4797 mutex_enter(&un->un_owner_mx); 4798 if (MD_MN_MIRROR_OWNER(un)) { 4799 ps->ps_firstblk = p->rs_start; 4800 ps->ps_lastblk = ps->ps_firstblk + 4801 p->rs_size - 1; 4802 } else { 4803 if ((ps->ps_firstblk != p->rs_start) || 4804 (ps->ps_lastblk != p->rs_start + 4805 p->rs_size - 1)) { 4806 /* Remove previous overlap range */ 4807 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 4808 mirror_overlap_tree_remove(ps); 4809 4810 ps->ps_firstblk = p->rs_start; 4811 ps->ps_lastblk = ps->ps_firstblk + 4812 p->rs_size - 1; 4813 4814 mutex_exit(&un->un_owner_mx); 4815 /* Block this range from all i/o. */ 4816 if (ps->ps_firstblk != 0 || 4817 ps->ps_lastblk != 0) 4818 wait_for_overlaps(ps, 4819 MD_OVERLAP_ALLOW_REPEAT); 4820 mutex_enter(&un->un_owner_mx); 4821 /* 4822 * Check to see if we have obtained 4823 * ownership while waiting for 4824 * overlaps. If we have, remove 4825 * the resync_region entry from the 4826 * overlap tree 4827 */ 4828 if (MD_MN_MIRROR_OWNER(un) && 4829 (ps->ps_flags & MD_MPS_ON_OVERLAP)) 4830 mirror_overlap_tree_remove(ps); 4831 } 4832 } 4833 mutex_exit(&un->un_owner_mx); 4834 4835 /* 4836 * If this is the first RESYNC_NEXT message (i.e. 4837 * MD_MN_RS_FIRST_RESYNC_NEXT set in p->rs_flags), 4838 * issue RESYNC_START NOTIFY event 4839 */ 4840 if (p->rs_flags & MD_MN_RS_FIRST_RESYNC_NEXT) { 4841 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_START, 4842 SVM_TAG_METADEVICE, MD_UN2SET(un), 4843 MD_SID(un)); 4844 } 4845 4846 /* Ensure that our local resync thread is running */ 4847 if (un->un_rs_thread == NULL) { 4848 (void) mirror_resync_unit(p->mnum, NULL, 4849 &p->mde, lockp); 4850 } 4851 } 4852 4853 break; 4854 case MD_MN_MSG_RESYNC_FINISH: 4855 /* 4856 * Complete the resync by stopping the resync thread. 4857 * Also release the previous overlap region field. 4858 * Update the resync_progress_thread by cv_signal'ing it so 4859 * that we mark the end of the resync as soon as possible. This 4860 * stops an unnecessary delay should be panic after resync 4861 * completion. 
4862 */ 4863 #ifdef DEBUG 4864 if (!rs_active) { 4865 if (mirror_debug_flag) 4866 printf("RESYNC_FINISH (mnum = %x), " 4867 "Resync *NOT* active", 4868 p->mnum); 4869 } 4870 #endif 4871 4872 if ((un->c.un_status & MD_UN_RESYNC_ACTIVE) && 4873 (p->rs_originator != md_mn_mynode_id)) { 4874 mutex_enter(&un->un_rs_thread_mx); 4875 un->c.un_status &= ~MD_UN_RESYNC_CANCEL; 4876 un->un_rs_thread_flags |= MD_RI_SHUTDOWN; 4877 un->un_rs_thread_flags &= 4878 ~(MD_RI_BLOCK|MD_RI_BLOCK_OWNER); 4879 cv_signal(&un->un_rs_thread_cv); 4880 mutex_exit(&un->un_rs_thread_mx); 4881 } 4882 if (is_ABR) { 4883 /* Resync finished, if ABR set owner to NULL */ 4884 mutex_enter(&un->un_owner_mx); 4885 un->un_mirror_owner = 0; 4886 mutex_exit(&un->un_owner_mx); 4887 } 4888 (void) md_ioctl_writerlock(lockp, ui); 4889 ps = un->un_rs_prev_overlap; 4890 if (ps != NULL) { 4891 /* Remove previous overlap range */ 4892 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 4893 mirror_overlap_tree_remove(ps); 4894 /* 4895 * Release the overlap range reference 4896 */ 4897 un->un_rs_prev_overlap = NULL; 4898 kmem_cache_free(mirror_parent_cache, 4899 ps); 4900 } 4901 md_ioctl_writerexit(lockp); 4902 4903 /* Mark the resync as complete in the metadb */ 4904 un->un_rs_resync_done = p->rs_done; 4905 un->un_rs_resync_2_do = p->rs_2_do; 4906 un->un_rs_type = p->rs_type; 4907 mutex_enter(&un->un_rs_progress_mx); 4908 cv_signal(&un->un_rs_progress_cv); 4909 mutex_exit(&un->un_rs_progress_mx); 4910 4911 un = md_ioctl_writerlock(lockp, ui); 4912 un->c.un_status &= ~MD_UN_RESYNC_ACTIVE; 4913 /* Deal with any pending grow_unit */ 4914 if (un->c.un_status & MD_UN_GROW_PENDING) { 4915 if ((mirror_grow_unit(un, &mde) != 0) || 4916 (! mdismderror(&mde, MDE_GROW_DELAYED))) { 4917 un->c.un_status &= ~MD_UN_GROW_PENDING; 4918 } 4919 } 4920 md_ioctl_writerexit(lockp); 4921 break; 4922 4923 case MD_MN_MSG_RESYNC_PHASE_DONE: 4924 /* 4925 * A phase of the resync, optimized. component or 4926 * submirror is complete. Update mirror status. 4927 * If the flag CLEAR_OPT_NOT_DONE is set, it means that the 4928 * mirror owner is peforming a resync. If we have just snarfed 4929 * this set, then we must clear any of the flags set at snarf 4930 * time by unit_setup_resync(). 4931 * Note that unit_setup_resync() sets up these flags to 4932 * indicate that an optimized resync is required. These flags 4933 * need to be reset because if we get here, the mirror owner 4934 * will have handled the optimized resync. 4935 * The flags that must be cleared are MD_UN_OPT_NOT_DONE and 4936 * MD_UN_WAR. In addition, for each submirror, 4937 * MD_SM_RESYNC_TARGET must be cleared and SMS_OFFLINE_RESYNC 4938 * set to SMS_OFFLINE. 4939 */ 4940 #ifdef DEBUG 4941 if (mirror_debug_flag) 4942 printf("phase done mess received from %d, mnum=%x," 4943 "type=%x, flags=%x\n", p->rs_originator, p->mnum, 4944 p->rs_type, p->rs_flags); 4945 #endif 4946 /* 4947 * Ignore the message if there is no active resync thread. 4948 */ 4949 if (!rs_active) 4950 break; 4951 4952 broke_out = p->rs_flags & MD_MN_RS_ERR; 4953 switch (RS_TYPE(p->rs_type)) { 4954 case MD_RS_OPTIMIZED: 4955 un = md_ioctl_writerlock(lockp, ui); 4956 if (p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE) { 4957 /* If we are originator, just clear rs_type */ 4958 if (p->rs_originator == md_mn_mynode_id) { 4959 SET_RS_TYPE_NONE(un->un_rs_type); 4960 md_ioctl_writerexit(lockp); 4961 break; 4962 } 4963 /* 4964 * If CLEAR_OPT_NOT_DONE is set, only clear the 4965 * flags if OPT_NOT_DONE is set *and* rs_type 4966 * is MD_RS_NONE. 
4967 */ 4968 if ((un->c.un_status & MD_UN_OPT_NOT_DONE) && 4969 (RS_TYPE(un->un_rs_type) == MD_RS_NONE)) { 4970 /* No resync in progress */ 4971 un->c.un_status &= ~MD_UN_OPT_NOT_DONE; 4972 un->c.un_status &= ~MD_UN_WAR; 4973 } else { 4974 /* 4975 * We are in the middle of an 4976 * optimized resync and this message 4977 * should be ignored. 4978 */ 4979 md_ioctl_writerexit(lockp); 4980 break; 4981 } 4982 } else { 4983 /* 4984 * This is the end of an optimized resync, 4985 * clear the OPT_NOT_DONE and OFFLINE_SM flags 4986 */ 4987 4988 un->c.un_status &= ~MD_UN_KEEP_DIRTY; 4989 if (!broke_out) 4990 un->c.un_status &= ~MD_UN_WAR; 4991 4992 /* 4993 * Clear our un_resync_bm for the regions 4994 * completed. The owner (originator) will 4995 * take care of itself. 4996 */ 4997 if (p->rs_originator != md_mn_mynode_id && 4998 (ps = un->un_rs_prev_overlap) != NULL) { 4999 BLK_TO_RR(rr_start, ps->ps_firstblk, 5000 un); 5001 BLK_TO_RR(rr_end, ps->ps_lastblk, un); 5002 mutex_enter(&un->un_resync_mx); 5003 for (rr = rr_start; rr <= rr_end; 5004 rr++) { 5005 CLR_KEEPDIRTY(rr, un); 5006 } 5007 mutex_exit(&un->un_resync_mx); 5008 } 5009 } 5010 5011 /* 5012 * Set resync_completed to last resync type and then 5013 * clear resync_type to indicate no resync in progress 5014 */ 5015 un->un_resync_completed = un->un_rs_type; 5016 SET_RS_TYPE_NONE(un->un_rs_type); 5017 5018 /* 5019 * If resync is as a result of a submirror ONLINE, 5020 * reset the submirror state to SMS_RUNNING if the 5021 * resync was ok else set back to SMS_OFFLINE. 5022 */ 5023 for (smi = 0; smi < NMIRROR; smi++) { 5024 un->un_sm[smi].sm_flags &= 5025 ~MD_SM_RESYNC_TARGET; 5026 if (SMS_BY_INDEX_IS(un, smi, 5027 SMS_OFFLINE_RESYNC)) { 5028 if (p->rs_flags & 5029 MD_MN_RS_CLEAR_OPT_NOT_DONE) { 5030 state = SMS_OFFLINE; 5031 } else { 5032 state = (broke_out ? 5033 SMS_OFFLINE : SMS_RUNNING); 5034 } 5035 mirror_set_sm_state( 5036 &un->un_sm[smi], 5037 &un->un_smic[smi], state, 5038 broke_out); 5039 mirror_commit(un, NO_SUBMIRRORS, 5040 0); 5041 } 5042 /* 5043 * If we still have an offline submirror, reset 5044 * the OFFLINE_SM flag in the mirror status 5045 */ 5046 if (SMS_BY_INDEX_IS(un, smi, 5047 SMS_OFFLINE)) 5048 un->c.un_status |= 5049 MD_UN_OFFLINE_SM; 5050 } 5051 md_ioctl_writerexit(lockp); 5052 break; 5053 case MD_RS_SUBMIRROR: 5054 un = md_ioctl_writerlock(lockp, ui); 5055 smi = RS_SMI(p->rs_type); 5056 sm = &un->un_sm[smi]; 5057 smic = &un->un_smic[smi]; 5058 /* Clear RESYNC target */ 5059 un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET; 5060 /* 5061 * Set resync_completed to last resync type and then 5062 * clear resync_type to indicate no resync in progress 5063 */ 5064 un->un_resync_completed = un->un_rs_type; 5065 SET_RS_TYPE_NONE(un->un_rs_type); 5066 /* 5067 * If the resync completed ok reset the submirror 5068 * state to SMS_RUNNING else reset it to SMS_ATTACHED 5069 */ 5070 state = (broke_out ? 
5071 SMS_ATTACHED : SMS_RUNNING); 5072 mirror_set_sm_state(sm, smic, state, broke_out); 5073 un->c.un_status &= ~MD_UN_WAR; 5074 mirror_commit(un, SMI2BIT(smi), 0); 5075 md_ioctl_writerexit(lockp); 5076 break; 5077 case MD_RS_COMPONENT: 5078 un = md_ioctl_writerlock(lockp, ui); 5079 smi = RS_SMI(p->rs_type); 5080 ci = RS_CI(p->rs_type); 5081 sm = &un->un_sm[smi]; 5082 smic = &un->un_smic[smi]; 5083 shared = (md_m_shared_t *) 5084 (*(smic->sm_shared_by_indx)) 5085 (sm->sm_dev, sm, ci); 5086 un->c.un_status &= ~MD_UN_WAR; 5087 /* Clear RESYNC target */ 5088 un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET; 5089 /* 5090 * Set resync_completed to last resync type and then 5091 * clear resync_type to indicate no resync in progress 5092 */ 5093 un->un_resync_completed = un->un_rs_type; 5094 SET_RS_TYPE_NONE(un->un_rs_type); 5095 5096 /* 5097 * If the resync completed ok, set the component state 5098 * to CS_OKAY. 5099 */ 5100 if (broke_out) 5101 shared->ms_flags |= MDM_S_RS_TRIED; 5102 else { 5103 /* 5104 * As we don't transmit the changes, 5105 * no need to drop the lock. 5106 */ 5107 set_sm_comp_state(un, smi, ci, CS_OKAY, 0, 5108 MD_STATE_NO_XMIT, (IOLOCK *)NULL); 5109 } 5110 md_ioctl_writerexit(lockp); 5111 default: 5112 break; 5113 } 5114 /* 5115 * If the purpose of this PHASE_DONE message is just to 5116 * indicate to all other nodes that the optimized resync 5117 * required (OPT_NOT_DONE) flag is to be cleared, there is 5118 * no need to generate a notify event as there has not 5119 * actually been a resync. 5120 */ 5121 if (!(p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE)) { 5122 if (broke_out) { 5123 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_FAILED, 5124 SVM_TAG_METADEVICE, MD_UN2SET(un), 5125 MD_SID(un)); 5126 } else { 5127 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_DONE, 5128 SVM_TAG_METADEVICE, MD_UN2SET(un), 5129 MD_SID(un)); 5130 } 5131 } 5132 break; 5133 5134 default: 5135 #ifdef DEBUG 5136 cmn_err(CE_PANIC, "mirror_resync_message: Unknown message type" 5137 " %x\n", p->msg_type); 5138 #endif 5139 return (EINVAL); 5140 } 5141 return (0); 5142 } 5143 5144 /* Return a -1 if snarf of optimized record failed and set should be released */ 5145 static int 5146 mirror_snarf(md_snarfcmd_t cmd, set_t setno) 5147 { 5148 mddb_recid_t recid; 5149 int gotsomething; 5150 int all_mirrors_gotten; 5151 mm_unit_t *un; 5152 mddb_type_t typ1; 5153 mddb_de_ic_t *dep; 5154 mddb_rb32_t *rbp; 5155 size_t newreqsize; 5156 mm_unit_t *big_un; 5157 mm_unit32_od_t *small_un; 5158 int retval; 5159 mdi_unit_t *ui; 5160 5161 if (cmd == MD_SNARF_CLEANUP) { 5162 if (md_get_setstatus(setno) & MD_SET_STALE) 5163 return (0); 5164 5165 recid = mddb_makerecid(setno, 0); 5166 typ1 = (mddb_type_t)md_getshared_key(setno, 5167 mirror_md_ops.md_driver.md_drivername); 5168 while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) { 5169 if (mddb_getrecprivate(recid) & MD_PRV_CLEANUP) { 5170 un = (mm_unit_t *)mddb_getrecaddr(recid); 5171 mirror_cleanup(un); 5172 recid = mddb_makerecid(setno, 0); 5173 } 5174 } 5175 return (0); 5176 } 5177 5178 all_mirrors_gotten = 1; 5179 gotsomething = 0; 5180 5181 recid = mddb_makerecid(setno, 0); 5182 typ1 = (mddb_type_t)md_getshared_key(setno, 5183 mirror_md_ops.md_driver.md_drivername); 5184 5185 while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) { 5186 if (mddb_getrecprivate(recid) & MD_PRV_GOTIT) 5187 continue; 5188 5189 dep = mddb_getrecdep(recid); 5190 dep->de_flags = MDDB_F_MIRROR; 5191 rbp = dep->de_rb; 5192 5193 switch (rbp->rb_revision) { 5194 case MDDB_REV_RB: 5195 case 
MDDB_REV_RBFN: 5196 if ((rbp->rb_private & MD_PRV_CONVD) == 0) { 5197 /* 5198 * This means, we have an old and small 5199 * record and this record hasn't already 5200 * been converted. Before we create an 5201 * incore metadevice from this we have to 5202 * convert it to a big record. 5203 */ 5204 small_un = 5205 (mm_unit32_od_t *)mddb_getrecaddr(recid); 5206 newreqsize = sizeof (mm_unit_t); 5207 big_un = (mm_unit_t *)kmem_zalloc(newreqsize, 5208 KM_SLEEP); 5209 mirror_convert((caddr_t)small_un, 5210 (caddr_t)big_un, SMALL_2_BIG); 5211 kmem_free(small_un, dep->de_reqsize); 5212 5213 /* 5214 * Update userdata and incore userdata 5215 * incores are at the end of un 5216 */ 5217 dep->de_rb_userdata_ic = big_un; 5218 dep->de_rb_userdata = big_un; 5219 dep->de_icreqsize = newreqsize; 5220 un = big_un; 5221 rbp->rb_private |= MD_PRV_CONVD; 5222 } else { 5223 /* 5224 * Unit already converted, just get the 5225 * record address. 5226 */ 5227 un = (mm_unit_t *)mddb_getrecaddr_resize(recid, 5228 sizeof (*un), 0); 5229 } 5230 un->c.un_revision &= ~MD_64BIT_META_DEV; 5231 break; 5232 case MDDB_REV_RB64: 5233 case MDDB_REV_RB64FN: 5234 /* Big device */ 5235 un = (mm_unit_t *)mddb_getrecaddr_resize(recid, 5236 sizeof (*un), 0); 5237 un->c.un_revision |= MD_64BIT_META_DEV; 5238 un->c.un_flag |= MD_EFILABEL; 5239 break; 5240 } 5241 MDDB_NOTE_FN(rbp->rb_revision, un->c.un_revision); 5242 5243 /* 5244 * Create minor device node for snarfed entry. 5245 */ 5246 (void) md_create_minor_node(setno, MD_SID(un)); 5247 5248 if (MD_UNIT(MD_SID(un)) != NULL) { 5249 mddb_setrecprivate(recid, MD_PRV_PENDDEL); 5250 continue; 5251 } 5252 all_mirrors_gotten = 0; 5253 retval = mirror_build_incore(un, 1); 5254 if (retval == 0) { 5255 mddb_setrecprivate(recid, MD_PRV_GOTIT); 5256 md_create_unit_incore(MD_SID(un), &mirror_md_ops, 0); 5257 resync_start_timeout(setno); 5258 gotsomething = 1; 5259 } else { 5260 return (retval); 5261 } 5262 /* 5263 * Set flag to indicate that the mirror has not yet 5264 * been through a reconfig. This flag is used for MN sets 5265 * when determining whether to update the mirror state from 5266 * the Master node. 
5267 */ 5268 if (MD_MNSET_SETNO(setno)) { 5269 ui = MDI_UNIT(MD_SID(un)); 5270 ui->ui_tstate |= MD_RESYNC_NOT_DONE; 5271 } 5272 } 5273 5274 if (!all_mirrors_gotten) 5275 return (gotsomething); 5276 5277 recid = mddb_makerecid(setno, 0); 5278 while ((recid = mddb_getnextrec(recid, typ1, RESYNC_REC)) > 0) 5279 if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT)) 5280 mddb_setrecprivate(recid, MD_PRV_PENDDEL); 5281 5282 return (0); 5283 } 5284 5285 static int 5286 mirror_halt(md_haltcmd_t cmd, set_t setno) 5287 { 5288 unit_t i; 5289 mdi_unit_t *ui; 5290 minor_t mnum; 5291 int reset_mirror_flag = 0; 5292 5293 if (cmd == MD_HALT_CLOSE) 5294 return (0); 5295 5296 if (cmd == MD_HALT_OPEN) 5297 return (0); 5298 5299 if (cmd == MD_HALT_UNLOAD) 5300 return (0); 5301 5302 if (cmd == MD_HALT_CHECK) { 5303 for (i = 0; i < md_nunits; i++) { 5304 mnum = MD_MKMIN(setno, i); 5305 if ((ui = MDI_UNIT(mnum)) == NULL) 5306 continue; 5307 if (ui->ui_opsindex != mirror_md_ops.md_selfindex) 5308 continue; 5309 if (md_unit_isopen(ui)) 5310 return (1); 5311 } 5312 return (0); 5313 } 5314 5315 if (cmd != MD_HALT_DOIT) 5316 return (1); 5317 5318 for (i = 0; i < md_nunits; i++) { 5319 mnum = MD_MKMIN(setno, i); 5320 if ((ui = MDI_UNIT(mnum)) == NULL) 5321 continue; 5322 if (ui->ui_opsindex != mirror_md_ops.md_selfindex) 5323 continue; 5324 reset_mirror((mm_unit_t *)MD_UNIT(mnum), mnum, 0); 5325 5326 /* Set a flag if there is at least one mirror metadevice. */ 5327 reset_mirror_flag = 1; 5328 } 5329 5330 /* 5331 * Only wait for the global dr_timeout to finish 5332 * - if there are mirror metadevices in this diskset or 5333 * - if this is the local set since an unload of the md_mirror 5334 * driver could follow a successful mirror halt in the local set. 5335 */ 5336 if ((reset_mirror_flag != 0) || (setno == MD_LOCAL_SET)) { 5337 while ((mirror_md_ops.md_head == NULL) && 5338 (mirror_timeout.dr_timeout_id != 0)) 5339 delay(md_hz); 5340 } 5341 5342 return (0); 5343 } 5344 5345 /*ARGSUSED3*/ 5346 static int 5347 mirror_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags) 5348 { 5349 IOLOCK lock; 5350 minor_t mnum = getminor(*dev); 5351 set_t setno; 5352 5353 /* 5354 * When doing an open of a multi owner metadevice, check to see if this 5355 * node is a starting node and if a reconfig cycle is underway. 5356 * If so, the system isn't sufficiently set up enough to handle the 5357 * open (which involves I/O during sp_validate), so fail with ENXIO. 5358 */ 5359 setno = MD_MIN2SET(mnum); 5360 if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) == 5361 (MD_SET_MNSET | MD_SET_MN_START_RC)) { 5362 return (ENXIO); 5363 } 5364 5365 if (md_oflags & MD_OFLG_FROMIOCTL) { 5366 /* 5367 * This indicates that the caller is an ioctl service routine. 5368 * In this case we initialise our stack-based IOLOCK and pass 5369 * this into the internal open routine. This allows multi-owner 5370 * metadevices to avoid deadlocking if an error is encountered 5371 * during the open() attempt. The failure case is: 5372 * s-p -> mirror -> s-p (with error). Attempting to metaclear 5373 * this configuration would deadlock as the mirror code has to 5374 * send a state-update to the other nodes when it detects the 5375 * failure of the underlying submirror with an errored soft-part 5376 * on it. 
As there is a class1 message in progress (metaclear) 5377 * set_sm_comp_state() cannot send another class1 message; 5378 * instead we do not send a state_update message as the 5379 * metaclear is distributed and the failed submirror will be 5380 * cleared from the configuration by the metaclear. 5381 */ 5382 IOLOCK_INIT(&lock); 5383 return (mirror_internal_open(getminor(*dev), flag, otyp, 5384 md_oflags, &lock)); 5385 } else { 5386 return (mirror_internal_open(getminor(*dev), flag, otyp, 5387 md_oflags, (IOLOCK *)NULL)); 5388 } 5389 } 5390 5391 5392 /*ARGSUSED1*/ 5393 static int 5394 mirror_close(dev_t dev, int flag, int otyp, cred_t *cred_p, int md_cflags) 5395 { 5396 return (mirror_internal_close(getminor(dev), otyp, md_cflags, 5397 (IOLOCK *)NULL)); 5398 } 5399 5400 5401 /* 5402 * This routine dumps memory to the disk. It assumes that the memory has 5403 * already been mapped into mainbus space. It is called at disk interrupt 5404 * priority when the system is in trouble. 5405 * 5406 */ 5407 static int 5408 mirror_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk) 5409 { 5410 mm_unit_t *un; 5411 dev_t mapdev; 5412 int result; 5413 int smi; 5414 int any_succeed = 0; 5415 int save_result = 0; 5416 5417 /* 5418 * We don't need to grab the unit lock 5419 * because nothing else is supposed to be happening. 5420 * Also, dump is not supposed to sleep. 5421 */ 5422 un = (mm_unit_t *)MD_UNIT(getminor(dev)); 5423 5424 if ((diskaddr_t)blkno >= un->c.un_total_blocks) 5425 return (EINVAL); 5426 5427 if ((diskaddr_t)blkno + nblk > un->c.un_total_blocks) 5428 return (EINVAL); 5429 5430 for (smi = 0; smi < NMIRROR; smi++) { 5431 if (!SUBMIRROR_IS_WRITEABLE(un, smi)) 5432 continue; 5433 mapdev = md_dev64_to_dev(un->un_sm[smi].sm_dev); 5434 result = bdev_dump(mapdev, addr, blkno, nblk); 5435 if (result) 5436 save_result = result; 5437 5438 if (result == 0) 5439 any_succeed++; 5440 } 5441 5442 if (any_succeed) 5443 return (0); 5444 5445 return (save_result); 5446 } 5447 5448 /* 5449 * NAME: mirror_probe_dev 5450 * 5451 * DESCRIPTION: force-opens every component of a mirror. 5452 * 5453 * On entry the unit writerlock is held. 5454 */ 5455 static int 5456 mirror_probe_dev(mdi_unit_t *ui, minor_t mnum) 5457 { 5458 int i; 5459 int smi; 5460 int ci; 5461 mm_unit_t *un; 5462 int md_devopen = 0; 5463 set_t setno; 5464 int sm_cnt; 5465 int sm_unavail_cnt; 5466 5467 if (md_unit_isopen(ui)) 5468 md_devopen++; 5469 5470 un = MD_UNIT(mnum); 5471 setno = MD_UN2SET(un); 5472 5473 sm_cnt = 0; 5474 sm_unavail_cnt = 0; 5475 for (i = 0; i < NMIRROR; i++) { 5476 md_dev64_t tmpdev; 5477 mdi_unit_t *sm_ui; 5478 5479 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) { 5480 continue; 5481 } 5482 5483 sm_cnt++; 5484 tmpdev = un->un_sm[i].sm_dev; 5485 (void) md_layered_open(mnum, &tmpdev, 5486 MD_OFLG_CONT_ERRS | MD_OFLG_PROBEDEV); 5487 un->un_sm[i].sm_dev = tmpdev; 5488 5489 sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev))); 5490 5491 /* 5492 * Logic similar to that in mirror_open_all_devs. We set or 5493 * clear the submirror Unavailable bit. 5494 */ 5495 (void) md_unit_writerlock(sm_ui); 5496 if (submirror_unavailable(un, i, 1)) { 5497 sm_ui->ui_tstate |= MD_INACCESSIBLE; 5498 sm_unavail_cnt++; 5499 } else { 5500 sm_ui->ui_tstate &= ~MD_INACCESSIBLE; 5501 } 5502 md_unit_writerexit(sm_ui); 5503 } 5504 5505 /* 5506 * If all of the submirrors are unavailable, the mirror is also 5507 * unavailable. 
5508 */ 5509 if (sm_cnt == sm_unavail_cnt) { 5510 ui->ui_tstate |= MD_INACCESSIBLE; 5511 } else { 5512 ui->ui_tstate &= ~MD_INACCESSIBLE; 5513 } 5514 5515 /* 5516 * Start checking from probe failures. If failures occur we 5517 * set the appropriate erred state only if the metadevice is in 5518 * use. This is specifically to prevent unnecessary resyncs. 5519 * For instance if the disks were accidentally disconnected when 5520 * the system booted up then until the metadevice is accessed 5521 * (like file system mount) the user can shutdown, recable and 5522 * reboot w/o incurring a potentially huge resync. 5523 */ 5524 5525 smi = 0; 5526 ci = 0; 5527 while (mirror_geterror(un, &smi, &ci, 1, 1) != 0) { 5528 5529 if (mirror_other_sources(un, smi, ci, 0) == 1) { 5530 /* 5531 * Note that for a MN set, there is no need to call 5532 * SE_NOTIFY as that is done when processing the 5533 * state change 5534 */ 5535 if (md_devopen) { 5536 /* 5537 * Never called from ioctl context, 5538 * so (IOLOCK *)NULL 5539 */ 5540 set_sm_comp_state(un, smi, ci, CS_LAST_ERRED, 5541 0, MD_STATE_XMIT, (IOLOCK *)NULL); 5542 if (!MD_MNSET_SETNO(setno)) { 5543 SE_NOTIFY(EC_SVM_STATE, 5544 ESC_SVM_LASTERRED, 5545 SVM_TAG_METADEVICE, setno, 5546 MD_SID(un)); 5547 } 5548 continue; 5549 } else { 5550 (void) mirror_close_all_devs(un, 5551 MD_OFLG_PROBEDEV); 5552 if (!MD_MNSET_SETNO(setno)) { 5553 SE_NOTIFY(EC_SVM_STATE, 5554 ESC_SVM_OPEN_FAIL, 5555 SVM_TAG_METADEVICE, setno, 5556 MD_SID(un)); 5557 } 5558 mirror_openfail_console_info(un, smi, ci); 5559 return (ENXIO); 5560 } 5561 } 5562 5563 /* 5564 * Note that for a MN set, there is no need to call 5565 * SE_NOTIFY as that is done when processing the 5566 * state change 5567 */ 5568 if (md_devopen) { 5569 /* Never called from ioctl context, so (IOLOCK *)NULL */ 5570 set_sm_comp_state(un, smi, ci, CS_ERRED, 0, 5571 MD_STATE_XMIT, (IOLOCK *)NULL); 5572 if (!MD_MNSET_SETNO(setno)) { 5573 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, 5574 SVM_TAG_METADEVICE, setno, 5575 MD_SID(un)); 5576 } 5577 } 5578 mirror_openfail_console_info(un, smi, ci); 5579 ci++; 5580 } 5581 5582 if (MD_MNSET_SETNO(setno)) { 5583 send_poke_hotspares(setno); 5584 } else { 5585 (void) poke_hotspares(); 5586 } 5587 (void) mirror_close_all_devs(un, MD_OFLG_PROBEDEV); 5588 5589 return (0); 5590 } 5591 5592 5593 static int 5594 mirror_imp_set( 5595 set_t setno 5596 ) 5597 { 5598 5599 mddb_recid_t recid; 5600 int gotsomething, i; 5601 mddb_type_t typ1; 5602 mddb_de_ic_t *dep; 5603 mddb_rb32_t *rbp; 5604 mm_unit32_od_t *un32; 5605 mm_unit_t *un64; 5606 md_dev64_t self_devt; 5607 minor_t *self_id; /* minor needs to be updated */ 5608 md_parent_t *parent_id; /* parent needs to be updated */ 5609 mddb_recid_t *record_id; /* record id needs to be updated */ 5610 mddb_recid_t *optrec_id; 5611 md_dev64_t tmpdev; 5612 5613 5614 gotsomething = 0; 5615 5616 typ1 = (mddb_type_t)md_getshared_key(setno, 5617 mirror_md_ops.md_driver.md_drivername); 5618 recid = mddb_makerecid(setno, 0); 5619 5620 while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) { 5621 if (mddb_getrecprivate(recid) & MD_PRV_GOTIT) 5622 continue; 5623 5624 dep = mddb_getrecdep(recid); 5625 rbp = dep->de_rb; 5626 5627 switch (rbp->rb_revision) { 5628 case MDDB_REV_RB: 5629 case MDDB_REV_RBFN: 5630 /* 5631 * Small device 5632 */ 5633 un32 = (mm_unit32_od_t *)mddb_getrecaddr(recid); 5634 self_id = &(un32->c.un_self_id); 5635 parent_id = &(un32->c.un_parent); 5636 record_id = &(un32->c.un_record_id); 5637 optrec_id = &(un32->un_rr_dirty_recid); 5638 5639 
for (i = 0; i < un32->un_nsm; i++) { 5640 tmpdev = md_expldev(un32->un_sm[i].sm_dev); 5641 un32->un_sm[i].sm_dev = md_cmpldev 5642 (md_makedevice(md_major, MD_MKMIN(setno, 5643 MD_MIN2UNIT(md_getminor(tmpdev))))); 5644 5645 if (!md_update_minor(setno, mddb_getsidenum 5646 (setno), un32->un_sm[i].sm_key)) 5647 goto out; 5648 } 5649 break; 5650 case MDDB_REV_RB64: 5651 case MDDB_REV_RB64FN: 5652 un64 = (mm_unit_t *)mddb_getrecaddr(recid); 5653 self_id = &(un64->c.un_self_id); 5654 parent_id = &(un64->c.un_parent); 5655 record_id = &(un64->c.un_record_id); 5656 optrec_id = &(un64->un_rr_dirty_recid); 5657 5658 for (i = 0; i < un64->un_nsm; i++) { 5659 tmpdev = un64->un_sm[i].sm_dev; 5660 un64->un_sm[i].sm_dev = md_makedevice 5661 (md_major, MD_MKMIN(setno, MD_MIN2UNIT 5662 (md_getminor(tmpdev)))); 5663 5664 if (!md_update_minor(setno, mddb_getsidenum 5665 (setno), un64->un_sm[i].sm_key)) 5666 goto out; 5667 } 5668 break; 5669 } 5670 5671 /* 5672 * If this is a top level and a friendly name metadevice, 5673 * update its minor in the namespace. 5674 */ 5675 if ((*parent_id == MD_NO_PARENT) && 5676 ((rbp->rb_revision == MDDB_REV_RBFN) || 5677 (rbp->rb_revision == MDDB_REV_RB64FN))) { 5678 5679 self_devt = md_makedevice(md_major, *self_id); 5680 if (!md_update_top_device_minor(setno, 5681 mddb_getsidenum(setno), self_devt)) 5682 goto out; 5683 } 5684 5685 /* 5686 * Update unit with the imported setno 5687 * 5688 */ 5689 mddb_setrecprivate(recid, MD_PRV_GOTIT); 5690 5691 *self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id)); 5692 if (*parent_id != MD_NO_PARENT) 5693 *parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id)); 5694 *record_id = MAKERECID(setno, DBID(*record_id)); 5695 *optrec_id = MAKERECID(setno, DBID(*optrec_id)); 5696 5697 gotsomething = 1; 5698 } 5699 5700 out: 5701 return (gotsomething); 5702 } 5703 5704 /* 5705 * NAME: mirror_check_offline 5706 * 5707 * DESCRIPTION: return offline_status = 1 if any submirrors are offline 5708 * 5709 * Called from ioctl, so access to MD_UN_OFFLINE_SM in un_status is 5710 * protected by the global ioctl lock as it is only set by the MD_IOCOFFLINE 5711 * ioctl. 5712 */ 5713 int 5714 mirror_check_offline(md_dev64_t dev, int *offline_status) 5715 { 5716 mm_unit_t *un; 5717 md_error_t mde = mdnullerror; 5718 5719 if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL) 5720 return (EINVAL); 5721 *offline_status = 0; 5722 if (un->c.un_status & MD_UN_OFFLINE_SM) 5723 *offline_status = 1; 5724 return (0); 5725 } 5726 5727 /* 5728 * NAME: mirror_inc_abr_count 5729 * 5730 * DESCRIPTION: increment the count of layered soft parts with ABR set 5731 * 5732 * Called from ioctl, so access to un_abr_count is protected by the global 5733 * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl. 5734 */ 5735 int 5736 mirror_inc_abr_count(md_dev64_t dev) 5737 { 5738 mm_unit_t *un; 5739 md_error_t mde = mdnullerror; 5740 5741 if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL) 5742 return (EINVAL); 5743 un->un_abr_count++; 5744 return (0); 5745 } 5746 5747 /* 5748 * NAME: mirror_dec_abr_count 5749 * 5750 * DESCRIPTION: decrement the count of layered soft parts with ABR set 5751 * 5752 * Called from ioctl, so access to un_abr_count is protected by the global 5753 * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl. 
5754 */ 5755 int 5756 mirror_dec_abr_count(md_dev64_t dev) 5757 { 5758 mm_unit_t *un; 5759 md_error_t mde = mdnullerror; 5760 5761 if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL) 5762 return (EINVAL); 5763 un->un_abr_count--; 5764 return (0); 5765 } 5766 5767 static md_named_services_t mirror_named_services[] = { 5768 {(intptr_t (*)()) poke_hotspares, "poke hotspares" }, 5769 {(intptr_t (*)()) mirror_rename_listkids, MDRNM_LIST_URKIDS }, 5770 {mirror_rename_check, MDRNM_CHECK }, 5771 {(intptr_t (*)()) mirror_renexch_update_kids, MDRNM_UPDATE_KIDS }, 5772 {(intptr_t (*)()) mirror_exchange_parent_update_to, 5773 MDRNM_PARENT_UPDATE_TO}, 5774 {(intptr_t (*)()) mirror_exchange_self_update_from_down, 5775 MDRNM_SELF_UPDATE_FROM_DOWN }, 5776 {(intptr_t (*)())mirror_probe_dev, "probe open test" }, 5777 {(intptr_t (*)())mirror_check_offline, MD_CHECK_OFFLINE }, 5778 {(intptr_t (*)())mirror_inc_abr_count, MD_INC_ABR_COUNT }, 5779 {(intptr_t (*)())mirror_dec_abr_count, MD_DEC_ABR_COUNT }, 5780 { NULL, 0 } 5781 }; 5782 5783 md_ops_t mirror_md_ops = { 5784 mirror_open, /* open */ 5785 mirror_close, /* close */ 5786 md_mirror_strategy, /* strategy */ 5787 NULL, /* print */ 5788 mirror_dump, /* dump */ 5789 NULL, /* read */ 5790 NULL, /* write */ 5791 md_mirror_ioctl, /* mirror_ioctl, */ 5792 mirror_snarf, /* mirror_snarf */ 5793 mirror_halt, /* mirror_halt */ 5794 NULL, /* aread */ 5795 NULL, /* awrite */ 5796 mirror_imp_set, /* import set */ 5797 mirror_named_services 5798 }; 5799 5800 /* module specific initilization */ 5801 static void 5802 init_init() 5803 { 5804 md_mirror_mcs_buf_off = sizeof (md_mcs_t) - sizeof (buf_t); 5805 5806 /* Initialize the parent and child save memory pools */ 5807 mirror_parent_cache = kmem_cache_create("md_mirror_parent", 5808 sizeof (md_mps_t), 0, mirror_parent_constructor, 5809 mirror_parent_destructor, mirror_run_queue, NULL, NULL, 5810 0); 5811 5812 mirror_child_cache = kmem_cache_create("md_mirror_child", 5813 sizeof (md_mcs_t) - sizeof (buf_t) + biosize(), 0, 5814 mirror_child_constructor, mirror_child_destructor, 5815 mirror_run_queue, NULL, NULL, 0); 5816 5817 /* 5818 * Insure wowbuf_size is a multiple of DEV_BSIZE, 5819 * then initialize wowbuf memory pool. 5820 */ 5821 md_wowbuf_size = roundup(md_wowbuf_size, DEV_BSIZE); 5822 if (md_wowbuf_size <= 0) 5823 md_wowbuf_size = 2 * DEV_BSIZE; 5824 if (md_wowbuf_size > (32 * DEV_BSIZE)) 5825 md_wowbuf_size = (32 * DEV_BSIZE); 5826 5827 md_wowblk_size = md_wowbuf_size + sizeof (wowhdr_t); 5828 mirror_wowblk_cache = kmem_cache_create("md_mirror_wow", 5829 md_wowblk_size, 0, NULL, NULL, NULL, NULL, NULL, 0); 5830 5831 mutex_init(&mirror_timeout.dr_mx, NULL, MUTEX_DEFAULT, NULL); 5832 mutex_init(&hotspare_request.dr_mx, NULL, MUTEX_DEFAULT, NULL); 5833 5834 mutex_init(&non_ff_drv_mutex, NULL, MUTEX_DEFAULT, NULL); 5835 } 5836 5837 /* module specific uninitilization (undo init_init()) */ 5838 static void 5839 fini_uninit() 5840 { 5841 kmem_cache_destroy(mirror_parent_cache); 5842 kmem_cache_destroy(mirror_child_cache); 5843 kmem_cache_destroy(mirror_wowblk_cache); 5844 mirror_parent_cache = mirror_child_cache = 5845 mirror_wowblk_cache = NULL; 5846 5847 mutex_destroy(&mirror_timeout.dr_mx); 5848 mutex_destroy(&hotspare_request.dr_mx); 5849 mutex_destroy(&non_ff_drv_mutex); 5850 } 5851 5852 /* define the module linkage */ 5853 MD_PLUGIN_MISC_MODULE("mirrors module", init_init(), fini_uninit())
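/*
 * The block below is an illustrative, userland-side sketch of how the
 * DKIOCDMR ioctl serviced by mirror_directed_read() above might be driven.
 * It is an example only, not part of the driver: it assumes the
 * vol_directed_rd_t structure and the DKV_* flags are available from
 * <sys/dkio.h>, and /dev/md/rdsk/d10 is a hypothetical mirror metadevice.
 * Starting with vdr_side set to DKV_SIDE_INIT, each successful call returns
 * one readable submirror's copy of the requested range (and that submirror's
 * name in vdr_side_name); the caller re-submits until DKV_DMR_DONE is set,
 * typically comparing the returned buffers across sides to decide whether a
 * resync is needed.
 *
 *	#include <sys/types.h>
 *	#include <sys/param.h>
 *	#include <sys/dkio.h>
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <stdlib.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		vol_directed_rd_t vdr;
 *		char *buf;
 *		int fd;
 *
 *		if ((fd = open("/dev/md/rdsk/d10", O_RDONLY)) < 0)
 *			return (1);
 *		if ((buf = malloc(DEV_BSIZE)) == NULL)
 *			return (1);
 *
 *		(void) memset(&vdr, 0, sizeof (vdr));
 *		vdr.vdr_flags = DKV_DMR_NEXT_SIDE;
 *		vdr.vdr_side = DKV_SIDE_INIT;
 *		vdr.vdr_offset = 0;
 *		vdr.vdr_nbytes = DEV_BSIZE;
 *		vdr.vdr_data = buf;
 *
 *		do {
 *			if (ioctl(fd, DKIOCDMR, &vdr) != 0 ||
 *			    (vdr.vdr_flags & DKV_DMR_ERROR))
 *				break;
 *			(void) printf("side %u (%s): %lu bytes\n",
 *			    vdr.vdr_side, vdr.vdr_side_name,
 *			    (unsigned long)vdr.vdr_bytesread);
 *			vdr.vdr_flags &= ~(DKV_DMR_SUCCESS | DKV_DMR_SHORT);
 *			vdr.vdr_flags |= DKV_DMR_NEXT_SIDE;
 *		} while (!(vdr.vdr_flags & DKV_DMR_DONE));
 *
 *		free(buf);
 *		(void) close(fd);
 *		return (0);
 *	}
 */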