1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2011 Bayard G. Bell. All rights reserved.
  25  */
  26 
  27 #include <sys/param.h>
  28 #include <sys/systm.h>
  29 #include <sys/conf.h>
  30 #include <sys/file.h>
  31 #include <sys/user.h>
  32 #include <sys/uio.h>
  33 #include <sys/t_lock.h>
  34 #include <sys/buf.h>
  35 #include <sys/dkio.h>
  36 #include <sys/vtoc.h>
  37 #include <sys/kmem.h>
  38 #include <vm/page.h>
  39 #include <sys/cmn_err.h>
  40 #include <sys/sysmacros.h>
  41 #include <sys/types.h>
  42 #include <sys/mkdev.h>
  43 #include <sys/stat.h>
  44 #include <sys/open.h>
  45 #include <sys/modctl.h>
  46 #include <sys/ddi.h>
  47 #include <sys/sunddi.h>
  48 #include <sys/debug.h>
  49 #include <sys/dklabel.h>
  50 #include <vm/hat.h>
  51 #include <sys/lvm/mdvar.h>
  52 #include <sys/lvm/md_mirror.h>
  53 #include <sys/lvm/md_convert.h>
  54 #include <sys/lvm/md_mddb.h>
  55 #include <sys/esunddi.h>
  56 
  57 #include <sys/sysevent/eventdefs.h>
  58 #include <sys/sysevent/svm.h>
  59 #include <sys/lvm/mdmn_commd.h>
  60 #include <sys/avl.h>
  61 
  62 md_ops_t                mirror_md_ops;
  63 #ifndef lint
  64 md_ops_t                *md_interface_ops = &mirror_md_ops;
  65 #endif
  66 
  67 extern mdq_anchor_t     md_done_daemon;
  68 extern mdq_anchor_t     md_mstr_daemon;
  69 extern mdq_anchor_t     md_mirror_daemon;
  70 extern mdq_anchor_t     md_mirror_io_daemon;
  71 extern mdq_anchor_t     md_mirror_rs_daemon;
  72 extern mdq_anchor_t     md_mhs_daemon;
  73 
  74 extern unit_t           md_nunits;
  75 extern set_t            md_nsets;
  76 extern md_set_t         md_set[];
  77 
  78 extern int              md_status;
  79 extern clock_t          md_hz;
  80 
  81 extern md_krwlock_t     md_unit_array_rw;
  82 extern kmutex_t         md_mx;
  83 extern kcondvar_t       md_cv;
  84 extern int              md_mtioctl_cnt;
  85 
  86 daemon_request_t        mirror_timeout;
  87 static daemon_request_t hotspare_request;
  88 static daemon_request_t mn_hs_request[MD_MAXSETS];      /* Multinode hs req */
  89 
  90 int     md_mirror_mcs_buf_off;
  91 
  92 /* Flags for mdmn_ksend_message to allow debugging */
  93 int     md_mirror_msg_flags;
  94 
  95 #ifdef DEBUG
  96 /* Flag to switch on debug messages */
  97 int     mirror_debug_flag = 0;
  98 #endif
  99 
 100 /*
 101  * Struct used to hold count of DMR reads and the timestamp of last DMR read
 102  * It is used to verify, using a debugger, that the DMR read ioctl has been
 103  * executed.
 104  */
 105 dmr_stats_t     mirror_dmr_stats = {0, 0};
 106 
 107 /*
 108  * Mutex protecting list of non-failfast drivers.
 109  */
 110 static kmutex_t non_ff_drv_mutex;
 111 extern char     **non_ff_drivers;
 112 
 113 extern major_t  md_major;
 114 
 115 /*
 116  * Write-On-Write memory pool.
 117  */
 118 static void             copy_write_cont(wowhdr_t *wowhdr);
 119 static kmem_cache_t     *mirror_wowblk_cache = NULL;
 120 static int              md_wowbuf_size = 16384;
 121 static size_t           md_wowblk_size;
 122 
 123 /*
 124  * This is a flag that allows:
 125  *      - disabling the write-on-write mechanism.
 126  *      - logging occurrences of write-on-write
 127  *      - switching wow handling procedure processing
 128  * Counter for occurences of WOW.
 129  */
 130 static uint_t   md_mirror_wow_flg = 0;
 131 static int      md_mirror_wow_cnt = 0;
 132 
 133 /*
 134  * Tunable to enable/disable dirty region
 135  * processing when closing down a mirror.
 136  */
 137 static int      new_resync = 1;
 138 kmem_cache_t    *mirror_parent_cache = NULL;
 139 kmem_cache_t    *mirror_child_cache = NULL;
 140 
 141 extern int      md_ff_disable;          /* disable failfast */
 142 
 143 static int      mirror_map_write(mm_unit_t *, md_mcs_t *, md_mps_t *, int);
 144 static void     mirror_read_strategy(buf_t *, int, void *);
 145 static void     mirror_write_strategy(buf_t *, int, void *);
 146 static void     become_owner(daemon_queue_t *);
 147 static int      mirror_done(struct buf *cb);
 148 static int      mirror_done_common(struct buf *cb);
 149 static void     clear_retry_error(struct buf *cb);
 150 
 151 /*
 152  * patchables
 153  */
 154 int     md_min_rr_size  = 200;  /* 2000 blocks, or 100k */
 155 int     md_def_num_rr   = 1000; /* Default number of dirty regions */
 156 
 157 /*
 158  * patchable to change delay before rescheduling mirror ownership request.
 159  * Value is clock ticks, default 0.5 seconds
 160  */
 161 clock_t md_mirror_owner_to = 500000;
 162 
 163 /*ARGSUSED1*/
 164 static int
 165 mirror_parent_constructor(void *p, void *d1, int d2)
 166 {
 167         mutex_init(&((md_mps_t *)p)->ps_mx, NULL, MUTEX_DEFAULT, NULL);
 168         return (0);
 169 }
 170 
 171 static void
 172 mirror_parent_init(md_mps_t *ps)
 173 {
 174         bzero(ps, offsetof(md_mps_t, ps_mx));
 175         bzero(&ps->ps_overlap_node, sizeof (avl_node_t));
 176 }
 177 
 178 /*ARGSUSED1*/
 179 static void
 180 mirror_parent_destructor(void *p, void *d)
 181 {
 182         mutex_destroy(&((md_mps_t *)p)->ps_mx);
 183 }
 184 
 185 /*ARGSUSED1*/
 186 static int
 187 mirror_child_constructor(void *p, void *d1, int d2)
 188 {
 189         bioinit(&((md_mcs_t *)p)->cs_buf);
 190         return (0);
 191 }
 192 
 193 void
 194 mirror_child_init(md_mcs_t *cs)
 195 {
 196         cs->cs_ps = NULL;
 197         cs->cs_mdunit = 0;
 198         md_bioreset(&cs->cs_buf);
 199 }
 200 
 201 /*ARGSUSED1*/
 202 static void
 203 mirror_child_destructor(void *p, void *d)
 204 {
 205         biofini(&((md_mcs_t *)p)->cs_buf);
 206 }
 207 
 208 static void
 209 mirror_wowblk_init(wowhdr_t *p)
 210 {
 211         bzero(p, md_wowblk_size);
 212 }
 213 
 214 static void
 215 send_poke_hotspares_msg(daemon_request_t *drq)
 216 {
 217         int                     rval;
 218         int                     nretries = 0;
 219         md_mn_msg_pokehsp_t     pokehsp;
 220         md_mn_kresult_t         *kresult;
 221         set_t                   setno = (set_t)drq->dq.qlen;
 222 
 223         pokehsp.pokehsp_setno = setno;
 224 
 225         kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
 226 
 227 retry_sphmsg:
 228         rval = mdmn_ksend_message(setno, MD_MN_MSG_POKE_HOTSPARES,
 229             MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST, 0, (char *)&pokehsp,
 230             sizeof (pokehsp), kresult);
 231 
 232         if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
 233                 mdmn_ksend_show_error(rval, kresult, "POKE_HOTSPARES");
 234                 /* If we're shutting down already, pause things here. */
 235                 if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
 236                         while (!md_mn_is_commd_present()) {
 237                                 delay(md_hz);
 238                         }
 239                         /*
 240                          * commd has become reachable again, so retry once.
 241                          * If this fails we'll panic as the system is in an
 242                          * unexpected state.
 243                          */
 244                         if (nretries++ == 0)
 245                                 goto retry_sphmsg;
 246                 }
 247                 cmn_err(CE_PANIC,
 248                     "ksend_message failure: POKE_HOTSPARES");
 249         }
 250         kmem_free(kresult, sizeof (md_mn_kresult_t));
 251 
 252         /* Allow further requests to use this set's queue structure */
 253         mutex_enter(&drq->dr_mx);
 254         drq->dr_pending = 0;
 255         mutex_exit(&drq->dr_mx);
 256 }
 257 
 258 /*
 259  * Send a poke_hotspares message to the master node. To avoid swamping the
 260  * commd handler with requests we only send a message if there is not one
 261  * already outstanding. We punt the request to a separate thread context as
 262  * cannot afford to block waiting on the request to be serviced. This is
 263  * essential when a reconfig cycle is in progress as any open() of a multinode
 264  * metadevice may result in a livelock.
 265  */
 266 static void
 267 send_poke_hotspares(set_t setno)
 268 {
 269         daemon_request_t        *drq = &mn_hs_request[setno];
 270 
 271         mutex_enter(&drq->dr_mx);
 272         if (drq->dr_pending == 0) {
 273                 drq->dr_pending = 1;
 274                 drq->dq.qlen = (int)setno;
 275                 daemon_request(&md_mhs_daemon,
 276                     send_poke_hotspares_msg, (daemon_queue_t *)drq, REQ_OLD);
 277         }
 278         mutex_exit(&drq->dr_mx);
 279 }
 280 
 281 void
 282 mirror_set_sm_state(
 283         mm_submirror_t          *sm,
 284         mm_submirror_ic_t       *smic,
 285         sm_state_t              newstate,
 286         int                     force)
 287 {
 288         int                     compcnt;
 289         int                     i;
 290         int                     errcnt;
 291         sm_state_t              origstate;
 292         md_m_shared_t           *shared;
 293 
 294         if (force) {
 295                 sm->sm_state = newstate;
 296                 uniqtime32(&sm->sm_timestamp);
 297                 return;
 298         }
 299 
 300         origstate = newstate;
 301 
 302         compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
 303         for (i = 0, errcnt = 0; i < compcnt; i++) {
 304                 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
 305                     (sm->sm_dev, sm, i);
 306                 if (shared->ms_state & (CS_ERRED | CS_LAST_ERRED))
 307                         newstate |= SMS_COMP_ERRED;
 308                 if (shared->ms_state & (CS_RESYNC))
 309                         newstate |= SMS_COMP_RESYNC;
 310                 if (shared->ms_state & CS_ERRED)
 311                         errcnt++;
 312         }
 313 
 314         if ((newstate & (SMS_COMP_ERRED | SMS_COMP_RESYNC)) != 0)
 315                 newstate &= ~origstate;
 316 
 317         if (errcnt == compcnt)
 318                 newstate |= SMS_ALL_ERRED;
 319         else
 320                 newstate &= ~SMS_ALL_ERRED;
 321 
 322         sm->sm_state = newstate;
 323         uniqtime32(&sm->sm_timestamp);
 324 }
 325 
 326 static int
 327 mirror_geterror(mm_unit_t *un, int *smi, int *cip, int clr_error,
 328                                                         int frm_probe)
 329 {
 330         mm_submirror_t          *sm;
 331         mm_submirror_ic_t       *smic;
 332         md_m_shared_t           *shared;
 333         int                     ci;
 334         int                     i;
 335         int                     compcnt;
 336         int                     open_comp; /* flag for open component */
 337 
 338         for (i = *smi; i < NMIRROR; i++) {
 339                 sm = &un->un_sm[i];
 340                 smic = &un->un_smic[i];
 341 
 342                 if (!SMS_IS(sm, SMS_INUSE))
 343                         continue;
 344 
 345                 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
 346                 for (ci = *cip; ci < compcnt; ci++) {
 347                         shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
 348                             (sm->sm_dev, sm, ci);
 349                         /*
 350                          * if called from any routine but probe, we check for
 351                          * MDM_S_ISOPEN flag. Since probe does a pseduo open,
 352                          * it sets MDM_S_PROBEOPEN flag and we test for this
 353                          * flag. They are both exclusive tests.
 354                          */
 355                         open_comp = (frm_probe) ?
 356                             (shared->ms_flags & MDM_S_PROBEOPEN):
 357                             (shared->ms_flags & MDM_S_ISOPEN);
 358                         if (((shared->ms_flags & MDM_S_IOERR || !open_comp) &&
 359                             ((shared->ms_state == CS_OKAY) ||
 360                             (shared->ms_state == CS_RESYNC))) ||
 361                             (!open_comp &&
 362                             (shared->ms_state == CS_LAST_ERRED))) {
 363                                 if (clr_error) {
 364                                         shared->ms_flags &= ~MDM_S_IOERR;
 365                                 }
 366                                 *cip = ci;
 367                                 *smi = i;
 368                                 return (1);
 369                         }
 370 
 371                         if (clr_error && (shared->ms_flags & MDM_S_IOERR)) {
 372                                 shared->ms_flags &= ~MDM_S_IOERR;
 373                         }
 374                 }
 375 
 376                 *cip = 0;
 377         }
 378         return (0);
 379 }
 380 
 381 /*ARGSUSED*/
 382 static void
 383 mirror_run_queue(void *d)
 384 {
 385         if (!(md_status & MD_GBL_DAEMONS_LIVE))
 386                 md_daemon(1, &md_done_daemon);
 387 }
 388 /*
 389  * check_comp_4_hotspares
 390  *
 391  * This function attempts to allocate a hotspare for this component if the
 392  * component is in error. In a MN set, the function can be called in 2 modes.
 393  * It can be called either when a component error has been detected or when a
 394  * new hotspare has been allocated. In this case, MD_HOTSPARE_XMIT is set
 395  * in flags and the request is sent to all nodes.
 396  * The handler on each of the nodes then calls this function with
 397  * MD_HOTSPARE_XMIT unset and the hotspare allocation is then performed.
 398  *
 399  * For non-MN sets the function simply attempts to allocate a hotspare.
 400  *
 401  * On entry, the following locks are held
 402  *      mirror_md_ops.md_link_rw (if flags has MD_HOTSPARE_LINKHELD set)
 403  *      md_unit_writerlock
 404  *
 405  * Returns      0 if ok
 406  *              1 if the unit containing the component has been cleared while
 407  *                the mdmn_ksend_message() was being executed
 408  */
 409 extern int
 410 check_comp_4_hotspares(
 411         mm_unit_t       *un,
 412         int             smi,
 413         int             ci,
 414         uint_t          flags,
 415         mddb_recid_t    hs_id,  /* Only used by MN disksets */
 416         IOLOCK          *lockp  /* can be NULL */
 417 )
 418 {
 419         mm_submirror_t          *sm;
 420         mm_submirror_ic_t       *smic;
 421         md_m_shared_t           *shared;
 422         mddb_recid_t            recids[6];
 423         minor_t                 mnum;
 424         intptr_t                (*hs_dev)();
 425         void                    (*hs_done)();
 426         void                    *hs_data;
 427         md_error_t              mde = mdnullerror;
 428         set_t                   setno;
 429         md_mn_msg_allochsp_t    allochspmsg;
 430         md_mn_kresult_t         *kresult;
 431         mm_unit_t               *new_un;
 432         int                     rval;
 433         int                     nretries = 0;
 434 
 435         mnum = MD_SID(un);
 436         setno = MD_UN2SET(un);
 437         sm = &un->un_sm[smi];
 438         smic = &un->un_smic[smi];
 439         shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
 440             (sm->sm_dev, sm, ci);
 441 
 442         if (shared->ms_state != CS_ERRED)
 443                 return (0);
 444 
 445         /* Don't start a new component resync if a resync is already running. */
 446         if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
 447                 return (0);
 448 
 449         if (MD_MNSET_SETNO(setno) && (flags & MD_HOTSPARE_XMIT)) {
 450                 uint_t          msgflags;
 451                 md_mn_msgtype_t msgtype;
 452 
 453                 /* Send allocate hotspare message to all nodes */
 454 
 455                 allochspmsg.msg_allochsp_mnum = un->c.un_self_id;
 456                 allochspmsg.msg_allochsp_sm = smi;
 457                 allochspmsg.msg_allochsp_comp = ci;
 458                 allochspmsg.msg_allochsp_hs_id = shared->ms_hs_id;
 459 
 460                 /*
 461                  * Before calling mdmn_ksend_message(), release locks
 462                  * Can never be in the context of an ioctl.
 463                  */
 464                 md_unit_writerexit(MDI_UNIT(mnum));
 465                 if (flags & MD_HOTSPARE_LINKHELD)
 466                         rw_exit(&mirror_md_ops.md_link_rw.lock);
 467 #ifdef DEBUG
 468                 if (mirror_debug_flag)
 469                         printf("send alloc hotspare, flags="
 470                             "0x%x %x, %x, %x, %x\n", flags,
 471                             allochspmsg.msg_allochsp_mnum,
 472                             allochspmsg.msg_allochsp_sm,
 473                             allochspmsg.msg_allochsp_comp,
 474                             allochspmsg.msg_allochsp_hs_id);
 475 #endif
 476                 if (flags & MD_HOTSPARE_WMUPDATE) {
 477                         msgtype  = MD_MN_MSG_ALLOCATE_HOTSPARE2;
 478                         /*
 479                          * When coming from an update of watermarks, there
 480                          * must already be a message logged that triggered
 481                          * this action. So, no need to log this message, too.
 482                          */
 483                         msgflags = MD_MSGF_NO_LOG;
 484                 } else {
 485                         msgtype  = MD_MN_MSG_ALLOCATE_HOTSPARE;
 486                         msgflags = MD_MSGF_DEFAULT_FLAGS;
 487                 }
 488 
 489                 kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
 490 
 491 cc4hs_msg:
 492                 rval = mdmn_ksend_message(setno, msgtype, msgflags, 0,
 493                     (char *)&allochspmsg, sizeof (allochspmsg),
 494                     kresult);
 495 
 496                 if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
 497 #ifdef DEBUG
 498                         if (mirror_debug_flag)
 499                                 mdmn_ksend_show_error(rval, kresult,
 500                                     "ALLOCATE HOTSPARE");
 501 #endif
 502                         /*
 503                          * If message is sent ok but exitval indicates an error
 504                          * it must be because the mirror has been cleared. In
 505                          * this case re-obtain lock and return an error
 506                          */
 507                         if ((rval == 0) && (kresult->kmmr_exitval != 0)) {
 508                                 if (flags & MD_HOTSPARE_LINKHELD) {
 509                                         rw_enter(&mirror_md_ops.md_link_rw.lock,
 510                                             RW_READER);
 511                                 }
 512                                 kmem_free(kresult, sizeof (md_mn_kresult_t));
 513                                 return (1);
 514                         }
 515                         /* If we're shutting down already, pause things here. */
 516                         if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
 517                                 while (!md_mn_is_commd_present()) {
 518                                         delay(md_hz);
 519                                 }
 520                                 /*
 521                                  * commd has become reachable again, so retry
 522                                  * once. If this fails we'll panic as the
 523                                  * system is in an unexpected state.
 524                                  */
 525                                 if (nretries++ == 0)
 526                                         goto cc4hs_msg;
 527                         }
 528                         cmn_err(CE_PANIC,
 529                             "ksend_message failure: ALLOCATE_HOTSPARE");
 530                 }
 531                 kmem_free(kresult, sizeof (md_mn_kresult_t));
 532 
 533                 /*
 534                  * re-obtain the locks
 535                  */
 536                 if (flags & MD_HOTSPARE_LINKHELD)
 537                         rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
 538                 new_un = md_unit_writerlock(MDI_UNIT(mnum));
 539 
 540                 /*
 541                  * As we had to release the locks in order to send the
 542                  * message to all nodes, we need to check to see if the
 543                  * unit has changed. If it has we release the writerlock
 544                  * and return fail.
 545                  */
 546                 if ((new_un != un) || (un->c.un_type != MD_METAMIRROR)) {
 547                         md_unit_writerexit(MDI_UNIT(mnum));
 548                         return (1);
 549                 }
 550         } else {
 551                 if (MD_MNSET_SETNO(setno)) {
 552                         /*
 553                          * If 2 or more nodes simultaneously see a
 554                          * component failure, these nodes will each
 555                          * send an ALLOCATE_HOTSPARE[2] message.
 556                          * The first message will allocate the hotspare
 557                          * and the subsequent messages should do nothing.
 558                          *
 559                          * If a slave node doesn't have a hotspare allocated
 560                          * at the time the message is initiated, then the
 561                          * passed in hs_id will be 0.  If the node
 562                          * executing this routine has a component shared
 563                          * ms_hs_id of non-zero, but the message shows a
 564                          * hs_id of 0, then just return since a hotspare
 565                          * has already been allocated for this failing
 566                          * component.  When the slave node returns from
 567                          * the ksend_message the hotspare will have
 568                          * already been allocated.
 569                          *
 570                          * If the slave node does send an hs_id of non-zero,
 571                          * and the slave node's hs_id matches this node's
 572                          * ms_hs_id, then the hotspare has error'd and
 573                          * should be replaced.
 574                          *
 575                          * If the slave node sends an hs_id of non-zero and
 576                          * this node has a different shared ms_hs_id, then
 577                          * just return since this hotspare has already
 578                          * been hotspared.
 579                          */
 580                         if (shared->ms_hs_id != 0) {
 581                                 if (hs_id == 0) {
 582 #ifdef DEBUG
 583                                         if (mirror_debug_flag) {
 584                                                 printf("check_comp_4_hotspares"
 585                                                     "(NOXMIT), short circuit "
 586                                                     "hs_id=0x%x, "
 587                                                     "ms_hs_id=0x%x\n",
 588                                                     hs_id, shared->ms_hs_id);
 589                                         }
 590 #endif
 591                                         return (0);
 592                                 }
 593                                 if (hs_id != shared->ms_hs_id) {
 594 #ifdef DEBUG
 595                                         if (mirror_debug_flag) {
 596                                                 printf("check_comp_4_hotspares"
 597                                                     "(NOXMIT), short circuit2 "
 598                                                     "hs_id=0x%x, "
 599                                                     "ms_hs_id=0x%x\n",
 600                                                     hs_id, shared->ms_hs_id);
 601                                         }
 602 #endif
 603                                         return (0);
 604                                 }
 605                         }
 606                 }
 607 
 608                 sm = &un->un_sm[smi];
 609                 hs_dev = md_get_named_service(sm->sm_dev, 0,
 610                     "hotspare device", 0);
 611                 if ((*hs_dev)(sm->sm_dev, 0, ci, recids, 6, &hs_done,
 612                     &hs_data) != 0)
 613                         return (0);
 614 
 615                 /*
 616                  * set_sm_comp_state() commits the modified records.
 617                  * As we don't transmit the changes, no need to drop the lock.
 618                  */
 619                 set_sm_comp_state(un, smi, ci, CS_RESYNC, recids,
 620                     MD_STATE_NO_XMIT, (IOLOCK *)NULL);
 621 
 622                 (*hs_done)(sm->sm_dev, hs_data);
 623 
 624                 mirror_check_failfast(mnum);
 625 
 626                 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_HOTSPARED, SVM_TAG_METADEVICE,
 627                     setno, MD_SID(un));
 628 
 629                 /*
 630                  * For a multi-node set we need to reset the un_rs_type,
 631                  * un_rs_resync_done and un_rs_resync_2_do fields as the
 632                  * hot-spare resync must copy all applicable data.
 633                  */
 634                 if (MD_MNSET_SETNO(setno)) {
 635                         un->un_rs_type = MD_RS_NONE;
 636                         un->un_rs_resync_done = 0;
 637                         un->un_rs_resync_2_do = 0;
 638                 }
 639 
 640                 /*
 641                  * Must drop writer lock since mirror_resync_unit will
 642                  * open devices and must be able to grab readerlock.
 643                  * Don't need to drop IOLOCK since any descendent routines
 644                  * calling ksend_messages will drop the IOLOCK as needed.
 645                  *
 646                  */
 647                 if (lockp) {
 648                         md_ioctl_writerexit(lockp);
 649                 } else {
 650                         md_unit_writerexit(MDI_UNIT(mnum));
 651                 }
 652 
 653                 /* start resync */
 654                 (void) mirror_resync_unit(mnum, NULL, &mde, lockp);
 655 
 656                 if (lockp) {
 657                         new_un = md_ioctl_writerlock(lockp, MDI_UNIT(mnum));
 658                 } else {
 659                         new_un = md_unit_writerlock(MDI_UNIT(mnum));
 660                 }
 661         }
 662         return (0);
 663 }
 664 
 665 /*
 666  * check_unit_4_hotspares
 667  *
 668  * For a given mirror, allocate hotspares, if available for any components
 669  * that are in error
 670  *
 671  * Returns      0 if ok
 672  *              1 if check_comp_4_hotspares returns non-zero. This will only
 673  *                happen for a MN unit where the unit has been cleared while
 674  *                the allocate hotspare message is sent to all nodes.
 675  */
 676 static int
 677 check_unit_4_hotspares(mm_unit_t *un, int flags)
 678 {
 679         mm_submirror_t          *sm;
 680         mm_submirror_ic_t       *smic;
 681         int                     ci;
 682         int                     i;
 683         int                     compcnt;
 684 
 685         if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
 686                 return (0);
 687 
 688         for (i = 0; i < NMIRROR; i++) {
 689                 sm = &un->un_sm[i];
 690                 smic = &un->un_smic[i];
 691                 if (!SMS_IS(sm, SMS_INUSE))
 692                         continue;
 693                 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, sm);
 694                 for (ci = 0; ci < compcnt; ci++) {
 695                         md_m_shared_t           *shared;
 696 
 697                         shared = (md_m_shared_t *)
 698                             (*(smic->sm_shared_by_indx))(sm->sm_dev, sm, ci);
 699                         /*
 700                          * Never called from ioctl context, so pass in
 701                          * (IOLOCK *)NULL.  Pass through flags from calling
 702                          * routine, also setting XMIT flag.
 703                          */
 704                         if (check_comp_4_hotspares(un, i, ci,
 705                             (MD_HOTSPARE_XMIT | flags),
 706                             shared->ms_hs_id, (IOLOCK *)NULL) != 0)
 707                                 return (1);
 708                 }
 709         }
 710         return (0);
 711 }
 712 
 713 static void
 714 check_4_hotspares(daemon_request_t *drq)
 715 {
 716         mdi_unit_t      *ui;
 717         mm_unit_t       *un;
 718         md_link_t       *next;
 719         int             x;
 720 
 721         mutex_enter(&drq->dr_mx);        /* clear up front so can poke */
 722         drq->dr_pending = 0;         /* again in low level routine if */
 723         mutex_exit(&drq->dr_mx); /* something found to do        */
 724 
 725         /*
 726          * Used to have a problem here. The disksets weren't marked as being
 727          * MNHOLD. This opened a window where we could be searching for
 728          * hotspares and have the disk set unloaded (released) from under
 729          * us causing a panic in stripe_component_count().
 730          * The way to prevent that is to mark the set MNHOLD which prevents
 731          * any diskset from being released while we are scanning the mirrors,
 732          * submirrors and components.
 733          */
 734 
 735         for (x = 0; x < md_nsets; x++)
 736                 md_holdset_enter(x);
 737 
 738         rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
 739         for (next = mirror_md_ops.md_head; next != NULL; next = next->ln_next) {
 740                 ui = MDI_UNIT(next->ln_id);
 741 
 742                 un = (mm_unit_t *)md_unit_readerlock(ui);
 743 
 744                 /*
 745                  * Only check the unit if we are the master for this set
 746                  * For an MN set, poke_hotspares() is only effective on the
 747                  * master
 748                  */
 749                 if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
 750                     md_set[MD_UN2SET(un)].s_am_i_master == 0) {
 751                         md_unit_readerexit(ui);
 752                         continue;
 753                 }
 754                 if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) {
 755                         md_unit_readerexit(ui);
 756                         continue;
 757                 }
 758                 md_unit_readerexit(ui);
 759 
 760                 un = (mm_unit_t *)md_unit_writerlock(ui);
 761                 /*
 762                  * check_unit_4_hotspares will exit 1 if the unit has been
 763                  * removed during the process of allocating the hotspare.
 764                  * This can only happen for a MN metadevice. If unit no longer
 765                  * exists, no need to release writerlock
 766                  */
 767                 if (check_unit_4_hotspares(un, MD_HOTSPARE_LINKHELD) == 0)
 768                         md_unit_writerexit(ui);
 769                 else {
 770                         /*
 771                          * If check_unit_4_hotspares failed, queue another
 772                          * request and break out of this one
 773                          */
 774                         (void) poke_hotspares();
 775                         break;
 776                 }
 777         }
 778         rw_exit(&mirror_md_ops.md_link_rw.lock);
 779 
 780         for (x = 0; x < md_nsets; x++)
 781                 md_holdset_exit(x);
 782 }
 783 
 784 /*
 785  * poke_hotspares
 786  *
 787  * If there is not a pending poke_hotspares request pending, queue a requent
 788  * to call check_4_hotspares(). This will scan all mirrors and attempt to
 789  * allocate hotspares for all components in error.
 790  */
 791 int
 792 poke_hotspares()
 793 {
 794         mutex_enter(&hotspare_request.dr_mx);
 795         if (hotspare_request.dr_pending == 0) {
 796                 hotspare_request.dr_pending = 1;
 797                 daemon_request(&md_mhs_daemon,
 798                     check_4_hotspares, (daemon_queue_t *)&hotspare_request,
 799                     REQ_OLD);
 800         }
 801         mutex_exit(&hotspare_request.dr_mx);
 802         return (0);
 803 }
 804 
 805 static void
 806 free_all_ecomps(err_comp_t *ecomp)
 807 {
 808         err_comp_t      *d;
 809 
 810         while (ecomp != NULL) {
 811                 d = ecomp;
 812                 ecomp = ecomp->ec_next;
 813                 kmem_free(d, sizeof (err_comp_t));
 814         }
 815 }
 816 
 817 /*
 818  * NAME: mirror_openfail_console_info
 819  *
 820  * DESCRIPTION: Prints a informative message to the console when mirror
 821  *              cannot be opened.
 822  *
 823  * PARAMETERS: mm_unit_t        un - pointer to mirror unit structure
 824  *             int              smi - submirror index
 825  *             int              ci - component index
 826  */
 827 
 828 void
 829 mirror_openfail_console_info(mm_unit_t *un, int smi, int ci)
 830 {
 831         void (*get_dev)();
 832         ms_cd_info_t cd;
 833         md_dev64_t tmpdev;
 834 
 835         tmpdev = un->un_sm[smi].sm_dev;
 836         get_dev = (void (*)())md_get_named_service(tmpdev, 0, "get device", 0);
 837         if (get_dev != NULL) {
 838                 (void) (*get_dev)(tmpdev, smi, ci, &cd);
 839                 cmn_err(CE_WARN, "md %s: open error on %s",
 840                     md_shortname(MD_SID(un)), md_devname(MD_UN2SET(un),
 841                     cd.cd_dev, NULL, 0));
 842         } else {
 843                 cmn_err(CE_WARN, "md %s: open error",
 844                     md_shortname(MD_SID(un)));
 845         }
 846 }
 847 
 848 static int
 849 mirror_close_all_devs(mm_unit_t *un, int md_cflags)
 850 {
 851         int i;
 852         md_dev64_t dev;
 853 
 854         for (i = 0; i < NMIRROR; i++) {
 855                 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
 856                         continue;
 857                 dev = un->un_sm[i].sm_dev;
 858                 md_layered_close(dev, md_cflags);
 859         }
 860         return (0);
 861 }
 862 
 863 /*
 864  * Keep track of drivers that don't support failfast.  We use this so that
 865  * we only log one diagnostic message for each of these drivers, no matter
 866  * how many times we run the mirror_check_failfast function.
 867  * Return 1 if this is a new driver that does not support failfast,
 868  * return 0 if we have already seen this non-failfast driver.
 869  */
 870 static int
 871 new_non_ff_driver(const char *s)
 872 {
 873         mutex_enter(&non_ff_drv_mutex);
 874         if (non_ff_drivers == NULL) {
 875                 non_ff_drivers = (char **)kmem_alloc(2 * sizeof (char *),
 876                     KM_NOSLEEP);
 877                 if (non_ff_drivers == NULL) {
 878                         mutex_exit(&non_ff_drv_mutex);
 879                         return (1);
 880                 }
 881 
 882                 non_ff_drivers[0] = (char *)kmem_alloc(strlen(s) + 1,
 883                     KM_NOSLEEP);
 884                 if (non_ff_drivers[0] == NULL) {
 885                         kmem_free(non_ff_drivers, 2 * sizeof (char *));
 886                         non_ff_drivers = NULL;
 887                         mutex_exit(&non_ff_drv_mutex);
 888                         return (1);
 889                 }
 890 
 891                 (void) strcpy(non_ff_drivers[0], s);
 892                 non_ff_drivers[1] = NULL;
 893 
 894         } else {
 895                 int i;
 896                 char **tnames;
 897                 char **tmp;
 898 
 899                 for (i = 0; non_ff_drivers[i] != NULL; i++) {
 900                         if (strcmp(s, non_ff_drivers[i]) == 0) {
 901                                 mutex_exit(&non_ff_drv_mutex);
 902                                 return (0);
 903                         }
 904                 }
 905 
 906                 /* allow for new element and null */
 907                 i += 2;
 908                 tnames = (char **)kmem_alloc(i * sizeof (char *), KM_NOSLEEP);
 909                 if (tnames == NULL) {
 910                         mutex_exit(&non_ff_drv_mutex);
 911                         return (1);
 912                 }
 913 
 914                 for (i = 0; non_ff_drivers[i] != NULL; i++)
 915                         tnames[i] = non_ff_drivers[i];
 916 
 917                 tnames[i] = (char *)kmem_alloc(strlen(s) + 1, KM_NOSLEEP);
 918                 if (tnames[i] == NULL) {
 919                         /* adjust i so that it is the right count to free */
 920                         kmem_free(tnames, (i + 2) * sizeof (char *));
 921                         mutex_exit(&non_ff_drv_mutex);
 922                         return (1);
 923                 }
 924 
 925                 (void) strcpy(tnames[i++], s);
 926                 tnames[i] = NULL;
 927 
 928                 tmp = non_ff_drivers;
 929                 non_ff_drivers = tnames;
 930                 /* i now represents the count we previously alloced */
 931                 kmem_free(tmp, i * sizeof (char *));
 932         }
 933         mutex_exit(&non_ff_drv_mutex);
 934 
 935         return (1);
 936 }
 937 
 938 /*
 939  * Check for the "ddi-failfast-supported" devtree property on each submirror
 940  * component to indicate if we should do I/O to that submirror with the
 941  * B_FAILFAST flag set or not.  This check is made at various state transitions
 942  * in the mirror code (e.g. open, enable, hotspare, etc.).  Sometimes we
 943  * only need to check one drive (e.g. hotspare) but since the check is
 944  * fast and infrequent and sometimes needs to be done on all components we
 945  * just check all components on each call.
 946  */
 947 void
 948 mirror_check_failfast(minor_t mnum)
 949 {
 950         int             i;
 951         mm_unit_t       *un;
 952 
 953         if (md_ff_disable)
 954                 return;
 955 
 956         un = MD_UNIT(mnum);
 957 
 958         for (i = 0; i < NMIRROR; i++) {
 959                 int                     ci;
 960                 int                     cnt;
 961                 int                     ff = 1;
 962                 mm_submirror_t          *sm;
 963                 mm_submirror_ic_t       *smic;
 964                 void                    (*get_dev)();
 965 
 966                 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
 967                         continue;
 968 
 969                 sm = &un->un_sm[i];
 970                 smic = &un->un_smic[i];
 971 
 972                 get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
 973                     "get device", 0);
 974 
 975                 cnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
 976                 for (ci = 0; ci < cnt; ci++) {
 977                         int             found = 0;
 978                         dev_t           ci_dev;
 979                         major_t         major;
 980                         dev_info_t      *devi;
 981                         ms_cd_info_t    cd;
 982 
 983                         /*
 984                          * this already returns the hs
 985                          * dev if the device is spared
 986                          */
 987                         (void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
 988 
 989                         ci_dev = md_dev64_to_dev(cd.cd_dev);
 990                         major = getmajor(ci_dev);
 991 
 992                         if (major == md_major) {
 993                                 /*
 994                                  * this component must be a soft
 995                                  * partition; get the real dev
 996                                  */
 997                                 minor_t dev_mnum;
 998                                 mdi_unit_t      *ui;
 999                                 mp_unit_t       *un;
1000                                 set_t   setno;
1001                                 side_t  side;
1002                                 md_dev64_t      tmpdev;
1003 
1004                                 ui = MDI_UNIT(getminor(ci_dev));
1005 
1006                                 /* grab necessary lock */
1007                                 un = (mp_unit_t *)md_unit_readerlock(ui);
1008 
1009                                 dev_mnum = MD_SID(un);
1010                                 setno = MD_MIN2SET(dev_mnum);
1011                                 side = mddb_getsidenum(setno);
1012 
1013                                 tmpdev = un->un_dev;
1014 
1015                                 /* Get dev by device id */
1016                                 if (md_devid_found(setno, side,
1017                                     un->un_key) == 1) {
1018                                         tmpdev = md_resolve_bydevid(dev_mnum,
1019                                             tmpdev, un->un_key);
1020                                 }
1021 
1022                                 md_unit_readerexit(ui);
1023 
1024                                 ci_dev = md_dev64_to_dev(tmpdev);
1025                                 major = getmajor(ci_dev);
1026                         }
1027 
1028                         if (ci_dev != NODEV32 &&
1029                             (devi = e_ddi_hold_devi_by_dev(ci_dev, 0))
1030                             != NULL) {
1031                                 ddi_prop_op_t   prop_op = PROP_LEN_AND_VAL_BUF;
1032                                 int             propvalue = 0;
1033                                 int             proplength = sizeof (int);
1034                                 int             error;
1035                                 struct cb_ops   *cb;
1036 
1037                                 if ((cb = devopsp[major]->devo_cb_ops) !=
1038                                     NULL) {
1039                                         error = (*cb->cb_prop_op)
1040                                             (DDI_DEV_T_ANY, devi, prop_op,
1041                                             DDI_PROP_NOTPROM|DDI_PROP_DONTPASS,
1042                                             "ddi-failfast-supported",
1043                                             (caddr_t)&propvalue, &proplength);
1044 
1045                                         if (error == DDI_PROP_SUCCESS)
1046                                                 found = 1;
1047                                 }
1048 
1049                                 if (!found && new_non_ff_driver(
1050                                     ddi_driver_name(devi))) {
1051                                         cmn_err(CE_NOTE, "!md: B_FAILFAST I/O"
1052                                             "disabled on %s",
1053                                             ddi_driver_name(devi));
1054                                 }
1055 
1056                                 ddi_release_devi(devi);
1057                         }
1058 
1059                         /*
1060                          * All components must support
1061                          * failfast in the submirror.
1062                          */
1063                         if (!found) {
1064                                 ff = 0;
1065                                 break;
1066                         }
1067                 }
1068 
1069                 if (ff) {
1070                         sm->sm_flags |= MD_SM_FAILFAST;
1071                 } else {
1072                         sm->sm_flags &= ~MD_SM_FAILFAST;
1073                 }
1074         }
1075 }
1076 
1077 /*
1078  * Return true if the submirror is unavailable.
1079  * If any of the submirror components are opened then the submirror cannot
1080  * be unavailable (MD_INACCESSIBLE).
1081  * If any of the components are already in the errored state, then the submirror
1082  * cannot be unavailable (MD_INACCESSIBLE).
1083  */
1084 static bool_t
1085 submirror_unavailable(mm_unit_t *un, int smi, int from_probe)
1086 {
1087         mm_submirror_t          *sm;
1088         mm_submirror_ic_t       *smic;
1089         md_m_shared_t           *shared;
1090         int                     ci;
1091         int                     compcnt;
1092 
1093         sm = &un->un_sm[smi];
1094         smic = &un->un_smic[smi];
1095 
1096         compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
1097         for (ci = 0; ci < compcnt; ci++) {
1098                 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
1099                     (sm->sm_dev, sm, ci);
1100                 if (from_probe) {
1101                         if (shared->ms_flags & MDM_S_PROBEOPEN)
1102                                 return (B_FALSE);
1103                 } else {
1104                         if (shared->ms_flags & MDM_S_ISOPEN)
1105                                 return (B_FALSE);
1106                 }
1107                 if (shared->ms_state == CS_ERRED ||
1108                     shared->ms_state == CS_LAST_ERRED)
1109                         return (B_FALSE);
1110         }
1111 
1112         return (B_TRUE);
1113 }
1114 
1115 static int
1116 mirror_open_all_devs(minor_t mnum, int md_oflags, IOLOCK *lockp)
1117 {
1118         int             i;
1119         mm_unit_t       *un;
1120         mdi_unit_t      *ui;
1121         int             err;
1122         int             smi;
1123         int             ci;
1124         err_comp_t      *c;
1125         err_comp_t      *ecomps = NULL;
1126         int             smmask = 0;
1127         set_t           setno;
1128         int             sm_cnt;
1129         int             sm_unavail_cnt;
1130 
1131         mirror_check_failfast(mnum);
1132 
1133         un = MD_UNIT(mnum);
1134         ui = MDI_UNIT(mnum);
1135         setno = MD_UN2SET(un);
1136 
1137         for (i = 0; i < NMIRROR; i++) {
1138                 md_dev64_t tmpdev = un->un_sm[i].sm_dev;
1139 
1140                 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
1141                         continue;
1142                 if (md_layered_open(mnum, &tmpdev, md_oflags))
1143                         smmask |= SMI2BIT(i);
1144                 un->un_sm[i].sm_dev = tmpdev;
1145         }
1146 
1147         /*
1148          * If smmask is clear, all submirrors are accessible. Clear the
1149          * MD_INACCESSIBLE bit in this case.  This bit is also cleared for the
1150          * mirror device.   If smmask is set, we have to determine which of the
1151          * submirrors are in error. If no submirror is accessible we mark the
1152          * whole mirror as MD_INACCESSIBLE.
1153          */
1154         if (smmask == 0) {
1155                 if (lockp) {
1156                         md_ioctl_readerexit(lockp);
1157                         (void) md_ioctl_writerlock(lockp, ui);
1158                 } else {
1159                         md_unit_readerexit(ui);
1160                         (void) md_unit_writerlock(ui);
1161                 }
1162                 ui->ui_tstate &= ~MD_INACCESSIBLE;
1163                 if (lockp) {
1164                         md_ioctl_writerexit(lockp);
1165                         (void) md_ioctl_readerlock(lockp, ui);
1166                 } else {
1167                         md_unit_writerexit(ui);
1168                         (void) md_unit_readerlock(ui);
1169                 }
1170 
1171                 for (i = 0; i < NMIRROR; i++) {
1172                         md_dev64_t      tmpdev;
1173                         mdi_unit_t      *sm_ui;
1174 
1175                         if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
1176                                 continue;
1177 
1178                         tmpdev = un->un_sm[i].sm_dev;
1179                         sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
1180                         (void) md_unit_writerlock(sm_ui);
1181                         sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
1182                         md_unit_writerexit(sm_ui);
1183                 }
1184 
1185                 return (0);
1186         }
1187 
1188         for (i = 0; i < NMIRROR; i++) {
1189                 md_dev64_t tmpdev;
1190 
1191                 if (!(smmask & SMI2BIT(i)))
1192                         continue;
1193 
1194                 tmpdev = un->un_sm[i].sm_dev;
1195                 err = md_layered_open(mnum, &tmpdev, MD_OFLG_CONT_ERRS);
1196                 un->un_sm[i].sm_dev = tmpdev;
1197                 ASSERT(err == 0);
1198         }
1199 
1200         if (lockp) {
1201                 md_ioctl_readerexit(lockp);
1202                 un = (mm_unit_t *)md_ioctl_writerlock(lockp, ui);
1203         } else {
1204                 md_unit_readerexit(ui);
1205                 un = (mm_unit_t *)md_unit_writerlock(ui);
1206         }
1207 
1208         /*
1209          * We want to make sure the unavailable flag is not masking a real
1210          * error on the submirror.
1211          * For each submirror,
1212          *    if all of the submirror components couldn't be opened and there
1213          *    are no errors on the submirror, then set the unavailable flag
1214          *    otherwise, clear unavailable.
1215          */
1216         sm_cnt = 0;
1217         sm_unavail_cnt = 0;
1218         for (i = 0; i < NMIRROR; i++) {
1219                 md_dev64_t      tmpdev;
1220                 mdi_unit_t      *sm_ui;
1221 
1222                 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
1223                         continue;
1224 
1225                 sm_cnt++;
1226                 tmpdev = un->un_sm[i].sm_dev;
1227                 sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
1228 
1229                 (void) md_unit_writerlock(sm_ui);
1230                 if (submirror_unavailable(un, i, 0)) {
1231                         sm_ui->ui_tstate |= MD_INACCESSIBLE;
1232                         sm_unavail_cnt++;
1233                 } else {
1234                         sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
1235                 }
1236                 md_unit_writerexit(sm_ui);
1237         }
1238 
1239         /*
1240          * If all of the submirrors are unavailable, the mirror is also
1241          * unavailable.
1242          */
1243         if (sm_cnt == sm_unavail_cnt) {
1244                 ui->ui_tstate |= MD_INACCESSIBLE;
1245         } else {
1246                 ui->ui_tstate &= ~MD_INACCESSIBLE;
1247         }
1248 
1249         smi = 0;
1250         ci = 0;
1251         while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) {
1252                 if (mirror_other_sources(un, smi, ci, 1) == 1) {
1253 
1254                         free_all_ecomps(ecomps);
1255                         (void) mirror_close_all_devs(un, md_oflags);
1256                         SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL,
1257                             SVM_TAG_METADEVICE, setno, MD_SID(un));
1258                         mirror_openfail_console_info(un, smi, ci);
1259                         if (lockp) {
1260                                 md_ioctl_writerexit(lockp);
1261                                 (void) md_ioctl_readerlock(lockp, ui);
1262                         } else {
1263                                 md_unit_writerexit(ui);
1264                                 (void) md_unit_readerlock(ui);
1265                         }
1266                         return (ENXIO);
1267                 }
1268 
1269                 /* track all component states that need changing */
1270                 c = (err_comp_t *)kmem_alloc(sizeof (err_comp_t), KM_SLEEP);
1271                 c->ec_next = ecomps;
1272                 c->ec_smi = smi;
1273                 c->ec_ci = ci;
1274                 ecomps = c;
1275                 ci++;
1276         }
1277 
1278         /* Make all state changes and commit them */
1279         for (c = ecomps; c != NULL; c = c->ec_next) {
1280                 /*
1281                  * If lockp is set, then entering kernel through ioctl.
1282                  * For a MN set, the only ioctl path is via a commd message
1283                  * (ALLOCATE_HOTSPARE or *RESYNC* messages) that is already
1284                  * being sent to each node.
1285                  * In this case, set NO_XMIT so that set_sm_comp_state
1286                  * won't attempt to send a message on a message.
1287                  *
1288                  * In !MN sets, the xmit flag is ignored, so it doesn't matter
1289                  * which flag is passed.
1290                  */
1291                 if (lockp) {
1292                         set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0,
1293                             MD_STATE_NO_XMIT, lockp);
1294                 } else {
1295                         set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0,
1296                             (MD_STATE_XMIT | MD_STATE_OCHELD), lockp);
1297                 }
1298                 /*
1299                  * For a MN set, the NOTIFY is done when the state change is
1300                  * processed on each node
1301                  */
1302                 if (!MD_MNSET_SETNO(setno)) {
1303                         SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
1304                             SVM_TAG_METADEVICE, setno, MD_SID(un));
1305                 }
1306         }
1307 
1308         if (lockp) {
1309                 md_ioctl_writerexit(lockp);
1310                 (void) md_ioctl_readerlock(lockp, ui);
1311         } else {
1312                 md_unit_writerexit(ui);
1313                 (void) md_unit_readerlock(ui);
1314         }
1315 
1316         free_all_ecomps(ecomps);
1317 
1318         /* allocate hotspares for all errored components */
1319         if (MD_MNSET_SETNO(setno)) {
1320                 /*
1321                  * If we're called from an ioctl (lockp set) then we cannot
1322                  * directly call send_poke_hotspares as this will block until
1323                  * the message gets despatched to all nodes. If the cluster is
1324                  * going through a reconfig cycle then the message will block
1325                  * until the cycle is complete, and as we originate from a
1326                  * service call from commd we will livelock.
1327                  */
1328                 if (lockp == NULL) {
1329                         md_unit_readerexit(ui);
1330                         send_poke_hotspares(setno);
1331                         (void) md_unit_readerlock(ui);
1332                 }
1333         } else {
1334                 (void) poke_hotspares();
1335         }
1336         return (0);
1337 }
1338 
1339 void
1340 mirror_overlap_tree_remove(md_mps_t *ps)
1341 {
1342         mm_unit_t       *un;
1343 
1344         if (panicstr)
1345                 return;
1346 
1347         VERIFY(ps->ps_flags & MD_MPS_ON_OVERLAP);
1348         un = ps->ps_un;
1349 
1350         mutex_enter(&un->un_overlap_tree_mx);
1351         avl_remove(&un->un_overlap_root, ps);
1352         ps->ps_flags &= ~MD_MPS_ON_OVERLAP;
1353         if (un->un_overlap_tree_flag != 0) {
1354                 un->un_overlap_tree_flag = 0;
1355                 cv_broadcast(&un->un_overlap_tree_cv);
1356         }
1357         mutex_exit(&un->un_overlap_tree_mx);
1358 }
1359 
1360 
1361 /*
1362  * wait_for_overlaps:
1363  * -----------------
1364  * Check that given i/o request does not cause an overlap with already pending
1365  * i/o. If it does, block until the overlapped i/o completes.
1366  *
1367  * The flag argument has MD_OVERLAP_ALLOW_REPEAT set if it is ok for the parent
1368  * structure to be already in the overlap tree and MD_OVERLAP_NO_REPEAT if
1369  * it must not already be in the tree.
1370  */
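/*
 * A typical calling sequence looks like the following sketch (illustrative
 * only; the real callers in the write and resync paths fill in the whole
 * md_mps_t before calling):
 *
 *	ps->ps_firstblk = pb->b_lblkno;
 *	ps->ps_lastblk = ps->ps_firstblk + lbtodb(pb->b_bcount) - 1;
 *	wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT);
 *	... issue the mirrored i/o ...
 *	mirror_overlap_tree_remove(ps);
 */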
1371 static void
1372 wait_for_overlaps(md_mps_t *ps, int flags)
1373 {
1374         mm_unit_t       *un;
1375         avl_index_t     where;
1376         md_mps_t        *ps1;
1377 
1378         if (panicstr)
1379                 return;
1380 
1381         un = ps->ps_un;
1382         mutex_enter(&un->un_overlap_tree_mx);
1383         if ((flags & MD_OVERLAP_ALLOW_REPEAT) &&
1384             (ps->ps_flags & MD_MPS_ON_OVERLAP)) {
1385                 mutex_exit(&un->un_overlap_tree_mx);
1386                 return;
1387         }
1388 
1389         VERIFY(!(ps->ps_flags & MD_MPS_ON_OVERLAP));
1390 
1391         do {
1392                 ps1 = avl_find(&un->un_overlap_root, ps, &where);
1393                 if (ps1 == NULL) {
1394                         /*
1395                          * The candidate range does not overlap with any
1396                          * range in the tree.  Insert it and be done.
1397                          */
1398                         avl_insert(&un->un_overlap_root, ps, where);
1399                         ps->ps_flags |= MD_MPS_ON_OVERLAP;
1400                 } else {
1401                         /*
1402                          * The candidate range would overlap.  Set the flag
1403                          * indicating we need to be woken up, and sleep
1404                          * until another thread removes a range.  If upon
1405                          * waking up we find this mps was put on the tree
1406                          * by another thread, the loop terminates.
1407                          */
1408                         un->un_overlap_tree_flag = 1;
1409                         cv_wait(&un->un_overlap_tree_cv,
1410                             &un->un_overlap_tree_mx);
1411                 }
1412         } while (!(ps->ps_flags & MD_MPS_ON_OVERLAP));
1413         mutex_exit(&un->un_overlap_tree_mx);
1414 }
1415 
1416 /*
1417  * This function is called from mirror_done to check whether any pages have
1418  * been modified while a mirrored write was in progress.  Returns 0 if
1419  * all pages associated with bp are clean, 1 otherwise.
1420  */
1421 static int
1422 any_pages_dirty(struct buf *bp)
1423 {
1424         int     rval;
1425 
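        /*
         * biomodified(9F) reports whether any page mapped by this buffer has
         * been modified: it returns 1 if so, 0 if not, and -1 if the buffer
         * is not mapped in, which we treat as "no dirty pages" below.
         */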
1426         rval = biomodified(bp);
1427         if (rval == -1)
1428                 rval = 0;
1429 
1430         return (rval);
1431 }
1432 
1433 #define MAX_EXTRAS 10
1434 
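/*
 * mirror_commit:
 * --------------
 * Commit the mirror unit record, the unit records of the submirrors selected
 * by smmask, and any caller-supplied extra record ids to the metadb in a
 * single mddb_commitrecs_wrapper() call.  The extras array, when supplied,
 * must be terminated by a zero recid and may hold at most MAX_EXTRAS entries.
 *
 * Illustrative sketch only (hs_recid is a hypothetical record id):
 *
 *	mddb_recid_t	extras[2];
 *
 *	extras[0] = hs_recid;
 *	extras[1] = 0;
 *	mirror_commit(un, SMI2BIT(smi), extras);
 */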
1435 void
1436 mirror_commit(
1437         mm_unit_t       *un,
1438         int             smmask,
1439         mddb_recid_t    *extras
1440 )
1441 {
1442         mm_submirror_t          *sm;
1443         md_unit_t               *su;
1444         int                     i;
1445 
1446         /* 2=mirror,null id */
1447         mddb_recid_t            recids[NMIRROR+2+MAX_EXTRAS];
1448 
1449         int                     ri = 0;
1450 
1451         if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)
1452                 return;
1453 
1454         /* The two extra recids[] slots are the mirror unit and null recid */
1455         if (extras != NULL) {
1456                 int     nrecids = 0;
1457                 while (extras[nrecids] != 0) {
1458                         nrecids++;
1459                 }
1460                 ASSERT(nrecids <= MAX_EXTRAS);
1461         }
1462 
1463         if (un != NULL)
1464                 recids[ri++] = un->c.un_record_id;
1465         for (i = 0;  i < NMIRROR; i++) {
1466                 if (!(smmask & SMI2BIT(i)))
1467                         continue;
1468                 sm = &un->un_sm[i];
1469                 if (!SMS_IS(sm, SMS_INUSE))
1470                         continue;
1471                 if (md_getmajor(sm->sm_dev) != md_major)
1472                         continue;
1473                 su =  MD_UNIT(md_getminor(sm->sm_dev));
1474                 recids[ri++] = su->c.un_record_id;
1475         }
1476 
1477         if (extras != NULL)
1478                 while (*extras != 0) {
1479                         recids[ri++] = *extras;
1480                         extras++;
1481                 }
1482 
1483         if (ri == 0)
1484                 return;
1485         recids[ri] = 0;
1486 
1487         /*
1488          * Ok to hold ioctl lock across record commit to mddb as
1489          * long as the record(s) being committed aren't resync records.
1490          */
1491         mddb_commitrecs_wrapper(recids);
1492 }
1493 
1494 
1495 /*
1496  * select_write_units:
1497  * -------------------
1498  * Set a bit in ps->ps_writable_sm for each writable submirror of the
1499  * mirror, record the number of writable submirrors in ps->ps_active_cnt
1500  * and reset ps->ps_current_sm to zero before the writes are issued to
1501  * the selected submirrors.
1502  */
1503 
1504 static void
1505 select_write_units(struct mm_unit *un, md_mps_t *ps)
1506 {
1507 
1508         int             i;
1509         unsigned        writable_bm = 0;
1510         unsigned        nunits = 0;
1511 
1512         for (i = 0; i < NMIRROR; i++) {
1513                 if (SUBMIRROR_IS_WRITEABLE(un, i)) {
1514                         /* set bit of all writable units */
1515                         writable_bm |= SMI2BIT(i);
1516                         nunits++;
1517                 }
1518         }
1519         ps->ps_writable_sm = writable_bm;
1520         ps->ps_active_cnt = nunits;
1521         ps->ps_current_sm = 0;
1522 }
1523 
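/*
 * select_write_after_read_units:
 * ------------------------------
 * Build the bitmap of submirrors that must receive the write-after-read
 * part of a resync i/o: writable submirrors flagged as resync targets,
 * excluding the submirror the data was read from (ps_allfrom_sm).  The
 * bitmap and count are recorded in ps_writable_sm and ps_active_cnt, and
 * the number of selected submirrors is returned.
 */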
1524 static
1525 unsigned
1526 select_write_after_read_units(struct mm_unit *un, md_mps_t *ps)
1527 {
1528 
1529         int             i;
1530         unsigned        writable_bm = 0;
1531         unsigned        nunits = 0;
1532 
1533         for (i = 0; i < NMIRROR; i++) {
1534                 if (SUBMIRROR_IS_WRITEABLE(un, i) &&
1535                     un->un_sm[i].sm_flags & MD_SM_RESYNC_TARGET) {
1536                         writable_bm |= SMI2BIT(i);
1537                         nunits++;
1538                 }
1539         }
1540         if ((writable_bm & ps->ps_allfrom_sm) != 0) {
1541                 writable_bm &= ~ps->ps_allfrom_sm;
1542                 nunits--;
1543         }
1544         ps->ps_writable_sm = writable_bm;
1545         ps->ps_active_cnt = nunits;
1546         ps->ps_current_sm = 0;
1547         return (nunits);
1548 }
1549 
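/*
 * select_read_unit:
 * -----------------
 * Choose the submirror from which a read of reqcount blocks starting at
 * blkno should be serviced.  A readable submirror whose component is in the
 * Okay state is preferred; failing that, the Last Erred component with the
 * highest error count is used.  *cando is set to the number of blocks the
 * chosen submirror can satisfy and, if a child structure is supplied,
 * B_FAILFAST is set on its buf when the chosen submirror is configured
 * for it.
 */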
1550 static md_dev64_t
1551 select_read_unit(
1552         mm_unit_t       *un,
1553         diskaddr_t      blkno,
1554         u_longlong_t    reqcount,
1555         u_longlong_t    *cando,
1556         int             must_be_opened,
1557         md_m_shared_t   **shared,
1558         md_mcs_t        *cs)
1559 {
1560         int                     i;
1561         md_m_shared_t           *s;
1562         uint_t                  lasterrcnt = 0;
1563         md_dev64_t              dev = 0;
1564         u_longlong_t            cnt;
1565         u_longlong_t            mincnt;
1566         mm_submirror_t          *sm;
1567         mm_submirror_ic_t       *smic;
1568         mdi_unit_t              *ui;
1569 
1570         mincnt = reqcount;
1571         for (i = 0; i < NMIRROR; i++) {
1572                 if (!SUBMIRROR_IS_READABLE(un, i))
1573                         continue;
1574                 sm = &un->un_sm[i];
1575                 smic = &un->un_smic[i];
1576                 cnt = reqcount;
1577 
1578                 /*
1579                  * If the current submirror is marked as inaccessible, do not
1580                  * try to access it.
1581                  */
1582                 ui = MDI_UNIT(getminor(expldev(sm->sm_dev)));
1583                 (void) md_unit_readerlock(ui);
1584                 if (ui->ui_tstate & MD_INACCESSIBLE) {
1585                         md_unit_readerexit(ui);
1586                         continue;
1587                 }
1588                 md_unit_readerexit(ui);
1589 
1590                 s = (md_m_shared_t *)(*(smic->sm_shared_by_blk))
1591                     (sm->sm_dev, sm, blkno, &cnt);
1592 
1593                 if (must_be_opened && !(s->ms_flags & MDM_S_ISOPEN))
1594                         continue;
1595                 if (s->ms_state == CS_OKAY) {
1596                         *cando = cnt;
1597                         if (shared != NULL)
1598                                 *shared = s;
1599 
1600                         if (un->un_sm[i].sm_flags & MD_SM_FAILFAST &&
1601                             cs != NULL) {
1602                                 cs->cs_buf.b_flags |= B_FAILFAST;
1603                         }
1604 
1605                         return (un->un_sm[i].sm_dev);
1606                 }
1607                 if (s->ms_state != CS_LAST_ERRED)
1608                         continue;
1609 
1610                 /* don't use B_FAILFAST since we're Last Erred */
1611 
1612                 if (mincnt > cnt)
1613                         mincnt = cnt;
1614                 if (s->ms_lasterrcnt > lasterrcnt) {
1615                         lasterrcnt = s->ms_lasterrcnt;
1616                         if (shared != NULL)
1617                                 *shared = s;
1618                         dev = un->un_sm[i].sm_dev;
1619                 }
1620         }
1621         *cando = mincnt;
1622         return (dev);
1623 }
1624 
1625 /*
1626  * Given a 32-bit bitmap, this routine will return the bit number
1627  * of the nth bit set.  The nth bit set is passed via the index integer.
1628  *
1629  * This routine is used to run through the writable submirror bitmap
1630  * when starting all of the writes.  The value returned is the index
1631  * of the appropriate submirror structure in the mirror's un_sm[]
1632  * array.
1633  */
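/*
 * For example, with mask = 0x0D (bits 0, 2 and 3 set): index 0 returns 0,
 * index 1 returns 2 and index 2 returns 3.
 */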
1634 static int
1635 md_find_nth_unit(uint_t mask, int index)
1636 {
1637         int     bit, nfound;
1638 
1639         for (bit = -1, nfound = -1; nfound != index; bit++) {
1640                 ASSERT(mask != 0);
1641                 nfound += (mask & 1);
1642                 mask >>= 1;
1643         }
1644         return (bit);
1645 }
1646 
1647 static int
1648 fast_select_read_unit(md_mps_t *ps, md_mcs_t *cs)
1649 {
1650         mm_unit_t       *un;
1651         buf_t           *bp;
1652         int             i;
1653         unsigned        nunits = 0;
1654         int             iunit;
1655         uint_t          running_bm = 0;
1656         uint_t          sm_index;
1657 
1658         bp = &cs->cs_buf;
1659         un = ps->ps_un;
1660 
1661         for (i = 0; i < NMIRROR; i++) {
1662                 if (!SMS_BY_INDEX_IS(un, i, SMS_RUNNING))
1663                         continue;
1664                 running_bm |= SMI2BIT(i);
1665                 nunits++;
1666         }
1667         if (nunits == 0)
1668                 return (1);
1669 
1670         /*
1671          * For directed mirror read (DMR) we only use the specified side and
1672          * do not compute the source of the read.
1673          * If we're running with MD_MPS_DIRTY_RD set we always return the
1674          * first mirror side (this prevents unnecessary ownership switching).
1675          * Otherwise we return the submirror according to the mirror read option
1676          */
1677         if (ps->ps_flags & MD_MPS_DMR) {
1678                 sm_index = un->un_dmr_last_read;
1679         } else if (ps->ps_flags & MD_MPS_DIRTY_RD) {
1680                 sm_index = md_find_nth_unit(running_bm, 0);
1681         } else {
1682                 /* Normal (non-DMR) operation */
1683                 switch (un->un_read_option) {
1684                 case RD_GEOMETRY:
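                        /*
                         * Split the mirror's block range evenly across the
                         * running submirrors.  For example (hypothetical
                         * numbers), with 2 running submirrors and
                         * un_total_blocks = 1000, blocks 0-499 are read from
                         * the first running submirror and blocks 500-999
                         * from the second.
                         */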
1685                         iunit = (int)(bp->b_lblkno /
1686                             howmany(un->c.un_total_blocks, nunits));
1687                         sm_index = md_find_nth_unit(running_bm, iunit);
1688                         break;
1689                 case RD_FIRST:
1690                         sm_index = md_find_nth_unit(running_bm, 0);
1691                         break;
1692                 case RD_LOAD_BAL:
1693                         /* intentionally falls through to the default case */
1694                 default:
1695                         un->un_last_read = (un->un_last_read + 1) % nunits;
1696                         sm_index = md_find_nth_unit(running_bm,
1697                             un->un_last_read);
1698                         break;
1699                 }
1700         }
1701         bp->b_edev = md_dev64_to_dev(un->un_sm[sm_index].sm_dev);
1702         ps->ps_allfrom_sm = SMI2BIT(sm_index);
1703 
1704         if (un->un_sm[sm_index].sm_flags & MD_SM_FAILFAST) {
1705                 bp->b_flags |= B_FAILFAST;
1706         }
1707 
1708         return (0);
1709 }
1710 
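/*
 * mirror_are_submirrors_available:
 * --------------------------------
 * Check that every in-use submirror that is itself a metadevice refers to a
 * valid set/unit number and has an in-core unit structure.  Returns 1 if all
 * submirrors are available, 0 otherwise.
 */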
1711 static
1712 int
1713 mirror_are_submirrors_available(mm_unit_t *un)
1714 {
1715         int i;
1716         for (i = 0; i < NMIRROR; i++) {
1717                 md_dev64_t tmpdev = un->un_sm[i].sm_dev;
1718 
1719                 if ((!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) ||
1720                     md_getmajor(tmpdev) != md_major)
1721                         continue;
1722 
1723                 if ((MD_MIN2SET(md_getminor(tmpdev)) >= md_nsets) ||
1724                     (MD_MIN2UNIT(md_getminor(tmpdev)) >= md_nunits))
1725                         return (0);
1726 
1727                 if (MDI_UNIT(md_getminor(tmpdev)) == NULL)
1728                         return (0);
1729         }
1730         return (1);
1731 }
1732 
1733 void
1734 build_submirror(mm_unit_t *un, int i, int snarfing)
1735 {
1736         struct mm_submirror     *sm;
1737         struct mm_submirror_ic  *smic;
1738         md_unit_t               *su;
1739         set_t                   setno;
1740 
1741         sm = &un->un_sm[i];
1742         smic = &un->un_smic[i];
1743 
1744         sm->sm_flags = 0; /* at some point we may need to do more here */
1745 
1746         setno = MD_UN2SET(un);
1747 
1748         if (!SMS_IS(sm, SMS_INUSE))
1749                 return;
1750         if (snarfing) {
1751                 sm->sm_dev = md_getdevnum(setno, mddb_getsidenum(setno),
1752                     sm->sm_key, MD_NOTRUST_DEVT);
1753         } else {
1754                 if (md_getmajor(sm->sm_dev) == md_major) {
1755                         su = MD_UNIT(md_getminor(sm->sm_dev));
1756                         un->c.un_flag |= (su->c.un_flag & MD_LABELED);
1757                         /* submirror can no longer be soft partitioned */
1758                         MD_CAPAB(su) &= (~MD_CAN_SP);
1759                 }
1760         }
1761         smic->sm_shared_by_blk = md_get_named_service(sm->sm_dev,
1762             0, "shared by blk", 0);
1763         smic->sm_shared_by_indx = md_get_named_service(sm->sm_dev,
1764             0, "shared by indx", 0);
1765         smic->sm_get_component_count = (int (*)())md_get_named_service(
1766             sm->sm_dev, 0, "get component count", 0);
1767         smic->sm_get_bcss = (int (*)())md_get_named_service(sm->sm_dev, 0,
1768             "get block count skip size", 0);
1769         sm->sm_state &= ~SMS_IGNORE;
1770         if (SMS_IS(sm, SMS_OFFLINE))
1771                 MD_STATUS(un) |= MD_UN_OFFLINE_SM;
1772         md_set_parent(sm->sm_dev, MD_SID(un));
1773 }
1774 
1775 static void
1776 mirror_cleanup(mm_unit_t *un)
1777 {
1778         mddb_recid_t    recid;
1779         int             smi;
1780         sv_dev_t        sv[NMIRROR];
1781         int             nsv = 0;
1782 
1783         /*
1784          * If a MN diskset and this node is not the master, do
1785          * not delete any records on snarf of the mirror records.
1786          */
1787         if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
1788             md_set[MD_UN2SET(un)].s_am_i_master == 0) {
1789                 return;
1790         }
1791 
1792         for (smi = 0; smi < NMIRROR; smi++) {
1793                 if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
1794                         continue;
1795                 sv[nsv].setno = MD_UN2SET(un);
1796                 sv[nsv++].key = un->un_sm[smi].sm_key;
1797         }
1798 
1799         recid = un->un_rr_dirty_recid;
1800         mddb_deleterec_wrapper(un->c.un_record_id);
1801         if (recid > 0)
1802                 mddb_deleterec_wrapper(recid);
1803 
1804         md_rem_names(sv, nsv);
1805 }
1806 
1807 /*
1808  * Comparison function for the avl tree which tracks
1809  * outstanding writes on submirrors.
1810  *
1811  * Returns:
1812  *      -1: ps1 < ps2
1813  *       0: ps1 and ps2 overlap
1814  *       1: ps1 > ps2
1815  */
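/*
 * For example, the ranges [100, 199] and [150, 249] compare as equal (0),
 * which is how avl_find() in wait_for_overlaps() detects an overlap with an
 * outstanding write, while disjoint ranges such as [100, 149] and [150, 249]
 * compare as -1/1 and simply order the tree.
 */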
1816 static int
1817 mirror_overlap_compare(const void *p1, const void *p2)
1818 {
1819         const md_mps_t *ps1 = (md_mps_t *)p1;
1820         const md_mps_t *ps2 = (md_mps_t *)p2;
1821 
1822         if (ps1->ps_firstblk < ps2->ps_firstblk) {
1823                 if (ps1->ps_lastblk >= ps2->ps_firstblk)
1824                         return (0);
1825                 return (-1);
1826         }
1827 
1828         if (ps1->ps_firstblk > ps2->ps_firstblk) {
1829                 if (ps1->ps_firstblk <= ps2->ps_lastblk)
1830                         return (0);
1831                 return (1);
1832         }
1833 
1834         return (0);
1835 }
1836 
1837 /*
1838  * Collapse any sparse submirror entries snarfed from the on-disk replica.
1839  * Only the in-core entries are updated. The replica will be updated on-disk
1840  * when the in-core replica is committed on shutdown of the SVM subsystem.
1841  */
1842 static void
1843 collapse_submirrors(mm_unit_t *un)
1844 {
1845         int                     smi, nremovals, smiremove;
1846         mm_submirror_t          *sm, *new_sm, *old_sm;
1847         mm_submirror_ic_t       *smic;
1848         int                     nsmidx = un->un_nsm - 1;
1849 
1850 rescan:
1851         nremovals = 0;
1852         smiremove = -1;
1853 
1854         for (smi = 0; smi <= nsmidx; smi++) {
1855                 sm = &un->un_sm[smi];
1856 
1857                 /*
1858                  * Check to see if this submirror is marked as in-use.
1859                  * If it isn't then it is a potential sparse entry and
1860                  * may need to be cleared from the configuration.
1861                  * The records should _already_ have been cleared by the
1862                  * original mirror_detach() code, but we need to shuffle
1863                  * any NULL entries in un_sm[] to the end of the array.
1864                  * Any NULL un_smic[] entries need to be reset to the underlying
1865                  * submirror/slice accessor functions.
1866                  */
1867                 if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) {
1868                         nremovals++;
1869                         smiremove = smi;
1870                         break;
1871                 }
1872         }
1873 
1874         if (nremovals == 0) {
1875                 /*
1876                  * Ensure that we have a matching contiguous set of un_smic[]
1877                  * entries for the corresponding un_sm[] entries
1878                  */
1879                 for (smi = 0; smi <= nsmidx; smi++) {
1880                         smic = &un->un_smic[smi];
1881                         sm = &un->un_sm[smi];
1882 
1883                         smic->sm_shared_by_blk =
1884                             md_get_named_service(sm->sm_dev, 0,
1885                             "shared by blk", 0);
1886                         smic->sm_shared_by_indx =
1887                             md_get_named_service(sm->sm_dev, 0,
1888                             "shared by indx", 0);
1889                         smic->sm_get_component_count =
1890                             (int (*)())md_get_named_service(sm->sm_dev, 0,
1891                             "get component count", 0);
1892                         smic->sm_get_bcss =
1893                             (int (*)())md_get_named_service(sm->sm_dev, 0,
1894                             "get block count skip size", 0);
1895                 }
1896                 return;
1897         }
1898 
1899         /*
1900          * Reshuffle the submirror devices so that we do not have a dead record
1901          * in the middle of the array. Once we've done this we need to rescan
1902          * the mirror to check for any other holes.
1903          */
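        /*
         * For example, if un_sm[] currently holds { A, <empty>, C, D } and
         * smiremove == 1, one pass of this loop leaves { A, C, D, <empty> };
         * the subsequent rescan then finds no further holes.
         */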
1904         for (smi = 0; smi < NMIRROR; smi++) {
1905                 if (smi < smiremove)
1906                         continue;
1907                 if (smi > smiremove) {
1908                         old_sm = &un->un_sm[smi];
1909                         new_sm = &un->un_sm[smi - 1];
1910                         bcopy(old_sm, new_sm, sizeof (mm_submirror_t));
1911                         bzero(old_sm, sizeof (mm_submirror_t));
1912                 }
1913         }
1914 
1915         /*
1916          * Now we need to rescan the array to find the next potential dead
1917          * entry.
1918          */
1919         goto rescan;
1920 }
1921 
1922 /* Return a -1 if optimized record unavailable and set should be released */
1923 int
1924 mirror_build_incore(mm_unit_t *un, int snarfing)
1925 {
1926         int             i;
1927 
1928         if (MD_STATUS(un) & MD_UN_BEING_RESET) {
1929                 mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCLEAN);
1930                 return (1);
1931         }
1932 
1933         if (mirror_are_submirrors_available(un) == 0)
1934                 return (1);
1935 
1936         if (MD_UNIT(MD_SID(un)) != NULL)
1937                 return (0);
1938 
1939         MD_STATUS(un) = 0;
1940 
1941         /* pre-4.1 didn't define CAN_META_CHILD capability */
1942         MD_CAPAB(un) = MD_CAN_META_CHILD | MD_CAN_PARENT | MD_CAN_SP;
1943 
1944         un->un_overlap_tree_flag = 0;
1945         avl_create(&un->un_overlap_root, mirror_overlap_compare,
1946             sizeof (md_mps_t), offsetof(md_mps_t, ps_overlap_node));
1947 
1948         /*
1949          * We need to collapse any sparse submirror entries into a non-sparse
1950          * array. This is to cover the case where we have an old replica image
1951          * which has not been updated (i.e. snarfed) since being modified.
1952          * The new code expects all submirror access to be sequential (i.e.
1953          * both the un_sm[] and un_smic[] entries correspond to non-empty
1954          * submirrors).
1955          */
1956 
1957         collapse_submirrors(un);
1958 
1959         for (i = 0; i < NMIRROR; i++)
1960                 build_submirror(un, i, snarfing);
1961 
1962         if (unit_setup_resync(un, snarfing) != 0) {
1963                 if (snarfing) {
1964                         mddb_setrecprivate(un->c.un_record_id, MD_PRV_GOTIT);
1965                         /*
1966                          * If a MN set and set is not stale, then return -1
1967                          * which will force the caller to unload the set.
1968                          * The MN diskset nodes will return failure if
1969                          * unit_setup_resync fails so that nodes won't
1970                          * get out of sync.
1971                          *
1972                          * If set is STALE, the master node can't allocate
1973                          * a resync record (if needed), but node needs to
1974                          * join the set so that user can delete broken mddbs.
1975                          * So, if set is STALE, just continue on.
1976                          */
1977                         if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
1978                             !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) {
1979                                 return (-1);
1980                         }
1981                 } else
1982                         return (1);
1983         }
1984 
1985         mutex_init(&un->un_overlap_tree_mx, NULL, MUTEX_DEFAULT, NULL);
1986         cv_init(&un->un_overlap_tree_cv, NULL, CV_DEFAULT, NULL);
1987 
1988         un->un_suspend_wr_flag = 0;
1989         mutex_init(&un->un_suspend_wr_mx, NULL, MUTEX_DEFAULT, NULL);
1990         cv_init(&un->un_suspend_wr_cv, NULL, CV_DEFAULT, NULL);
1991 
1992         /*
1993          * Allocate mutexes for mirror-owner and resync-owner changes.
1994          * All references to the owner message state field must be guarded
1995          * by this mutex.
1996          */
1997         mutex_init(&un->un_owner_mx, NULL, MUTEX_DEFAULT, NULL);
1998 
1999         /*
2000          * Allocate mutex and condvar for resync thread manipulation. These
2001          * will be used by mirror_resync_unit/mirror_ioctl_resync
2002          */
2003         mutex_init(&un->un_rs_thread_mx, NULL, MUTEX_DEFAULT, NULL);
2004         cv_init(&un->un_rs_thread_cv, NULL, CV_DEFAULT, NULL);
2005 
2006         /*
2007          * Allocate mutex and condvar for resync progress thread manipulation.
2008          * This allows resyncs to be continued across an intervening reboot.
2009          */
2010         mutex_init(&un->un_rs_progress_mx, NULL, MUTEX_DEFAULT, NULL);
2011         cv_init(&un->un_rs_progress_cv, NULL, CV_DEFAULT, NULL);
2012 
2013         /*
2014          * Allocate mutex and condvar for Directed Mirror Reads (DMR). This
2015          * provides synchronization between a user-ioctl and the resulting
2016          * strategy() call that performs the read().
2017          */
2018         mutex_init(&un->un_dmr_mx, NULL, MUTEX_DEFAULT, NULL);
2019         cv_init(&un->un_dmr_cv, NULL, CV_DEFAULT, NULL);
2020 
2021         /*
2022          * Allocate rwlocks for un_pernode_dirty_bm accessing.
2023          */
2024         for (i = 0; i < MD_MNMAXSIDES; i++) {
2025                 rw_init(&un->un_pernode_dirty_mx[i], NULL, RW_DEFAULT, NULL);
2026         }
2027 
2028         /* place various information in the in-core data structures */
2029         md_nblocks_set(MD_SID(un), un->c.un_total_blocks);
2030         MD_UNIT(MD_SID(un)) = un;
2031 
2032         return (0);
2033 }
2034 
2035 
2036 void
2037 reset_mirror(struct mm_unit *un, minor_t mnum, int removing)
2038 {
2039         mddb_recid_t    recid, vtoc_id;
2040         size_t          bitcnt;
2041         size_t          shortcnt;
2042         int             smi;
2043         sv_dev_t        sv[NMIRROR];
2044         int             nsv = 0;
2045         uint_t          bits = 0;
2046         minor_t         selfid;
2047         md_unit_t       *su;
2048         int             i;
2049 
2050         md_destroy_unit_incore(mnum, &mirror_md_ops);
2051 
2052         shortcnt = un->un_rrd_num * sizeof (short);
2053         bitcnt = howmany(un->un_rrd_num, NBBY);
2054 
2055         if (un->un_outstanding_writes)
2056                 kmem_free((caddr_t)un->un_outstanding_writes, shortcnt);
2057         if (un->un_goingclean_bm)
2058                 kmem_free((caddr_t)un->un_goingclean_bm, bitcnt);
2059         if (un->un_goingdirty_bm)
2060                 kmem_free((caddr_t)un->un_goingdirty_bm, bitcnt);
2061         if (un->un_resync_bm)
2062                 kmem_free((caddr_t)un->un_resync_bm, bitcnt);
2063         if (un->un_pernode_dirty_sum)
2064                 kmem_free((caddr_t)un->un_pernode_dirty_sum, un->un_rrd_num);
2065 
2066         /*
2067          * Destroy the taskq for deferred processing of DRL clean requests.
2068          * This taskq will only be present for Multi Owner mirrors.
2069          */
2070         if (un->un_drl_task != NULL)
2071                 ddi_taskq_destroy(un->un_drl_task);
2072 
2073         md_nblocks_set(mnum, -1ULL);
2074         MD_UNIT(mnum) = NULL;
2075 
2076         /*
2077          * Attempt release of its minor node
2078          */
2079         md_remove_minor_node(mnum);
2080 
2081         if (!removing)
2082                 return;
2083 
2084         for (smi = 0; smi < NMIRROR; smi++) {
2085                 if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
2086                         continue;
2087                 /* reallow soft partitioning of submirror and reset parent */
2088                 su = MD_UNIT(md_getminor(un->un_sm[smi].sm_dev));
2089                 MD_CAPAB(su) |= MD_CAN_SP;
2090                 md_reset_parent(un->un_sm[smi].sm_dev);
2091                 reset_comp_states(&un->un_sm[smi], &un->un_smic[smi]);
2092 
2093                 sv[nsv].setno = MD_MIN2SET(mnum);
2094                 sv[nsv++].key = un->un_sm[smi].sm_key;
2095                 bits |= SMI2BIT(smi);
2096         }
2097 
2098         MD_STATUS(un) |= MD_UN_BEING_RESET;
2099         recid = un->un_rr_dirty_recid;
2100         vtoc_id = un->c.un_vtoc_id;
2101         selfid = MD_SID(un);
2102 
2103         mirror_commit(un, bits, 0);
2104 
2105         avl_destroy(&un->un_overlap_root);
2106 
2107         /* Destroy all mutexes and condvars before returning. */
2108         mutex_destroy(&un->un_suspend_wr_mx);
2109         cv_destroy(&un->un_suspend_wr_cv);
2110         mutex_destroy(&un->un_overlap_tree_mx);
2111         cv_destroy(&un->un_overlap_tree_cv);
2112         mutex_destroy(&un->un_owner_mx);
2113         mutex_destroy(&un->un_rs_thread_mx);
2114         cv_destroy(&un->un_rs_thread_cv);
2115         mutex_destroy(&un->un_rs_progress_mx);
2116         cv_destroy(&un->un_rs_progress_cv);
2117         mutex_destroy(&un->un_dmr_mx);
2118         cv_destroy(&un->un_dmr_cv);
2119 
2120         for (i = 0; i < MD_MNMAXSIDES; i++) {
2121                 rw_destroy(&un->un_pernode_dirty_mx[i]);
2122                 if (un->un_pernode_dirty_bm[i])
2123                         kmem_free((caddr_t)un->un_pernode_dirty_bm[i], bitcnt);
2124         }
2125 
2126         /*
2127          * Remove self from the namespace
2128          */
2129         if (un->c.un_revision & MD_FN_META_DEV) {
2130                 (void) md_rem_selfname(un->c.un_self_id);
2131         }
2132 
2133         /* This frees the unit structure. */
2134         mddb_deleterec_wrapper(un->c.un_record_id);
2135 
2136         if (recid != 0)
2137                 mddb_deleterec_wrapper(recid);
2138 
2139         /* Remove the vtoc, if present */
2140         if (vtoc_id)
2141                 mddb_deleterec_wrapper(vtoc_id);
2142 
2143         md_rem_names(sv, nsv);
2144 
2145         SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE,
2146             MD_MIN2SET(selfid), selfid);
2147 }
2148 
2149 int
2150 mirror_internal_open(
2151         minor_t         mnum,
2152         int             flag,
2153         int             otyp,
2154         int             md_oflags,
2155         IOLOCK          *lockp          /* can be NULL */
2156 )
2157 {
2158         mdi_unit_t      *ui = MDI_UNIT(mnum);
2159         int             err = 0;
2160 
2161 tryagain:
2162         /* single thread */
2163         if (lockp) {
2164                 /*
2165                  * If ioctl lock is held, use openclose_enter
2166                  * routine that will set the ioctl flag when
2167                  * grabbing the readerlock.
2168                  */
2169                 (void) md_ioctl_openclose_enter(lockp, ui);
2170         } else {
2171                 (void) md_unit_openclose_enter(ui);
2172         }
2173 
2174         /*
2175          * The mirror_open_all_devs routine may end up sending a STATE_UPDATE
2176          * message in a MN diskset and this requires that the openclose
2177          * lock is dropped in order to send this message.  So, another
2178          * flag (MD_UL_OPENINPROGRESS) is used to keep another thread from
2179          * attempting an open while this thread has an open in progress.
2180          * Call the *_lh version of the lock exit routines since the ui_mx
2181          * mutex must be held from checking for OPENINPROGRESS until
2182          * after the cv_wait call.
2183          */
2184         mutex_enter(&ui->ui_mx);
2185         if (ui->ui_lock & MD_UL_OPENINPROGRESS) {
2186                 if (lockp) {
2187                         (void) md_ioctl_openclose_exit_lh(lockp);
2188                 } else {
2189                         md_unit_openclose_exit_lh(ui);
2190                 }
2191                 cv_wait(&ui->ui_cv, &ui->ui_mx);
2192                 mutex_exit(&ui->ui_mx);
2193                 goto tryagain;
2194         }
2195 
2196         ui->ui_lock |= MD_UL_OPENINPROGRESS;
2197         mutex_exit(&ui->ui_mx);
2198 
2199         /* open devices, if necessary */
2200         if (! md_unit_isopen(ui) || (ui->ui_tstate & MD_INACCESSIBLE)) {
2201                 if ((err = mirror_open_all_devs(mnum, md_oflags, lockp)) != 0)
2202                         goto out;
2203         }
2204 
2205         /* count open */
2206         if ((err = md_unit_incopen(mnum, flag, otyp)) != 0)
2207                 goto out;
2208 
2209         /* unlock, return success */
2210 out:
2211         mutex_enter(&ui->ui_mx);
2212         ui->ui_lock &= ~MD_UL_OPENINPROGRESS;
2213         mutex_exit(&ui->ui_mx);
2214 
2215         if (lockp) {
2216                 /*
2217                  * If ioctl lock is held, use openclose_exit
2218                  * routine that will clear the lockp reader flag.
2219                  */
2220                 (void) md_ioctl_openclose_exit(lockp);
2221         } else {
2222                 md_unit_openclose_exit(ui);
2223         }
2224         return (err);
2225 }
2226 
2227 int
2228 mirror_internal_close(
2229         minor_t         mnum,
2230         int             otyp,
2231         int             md_cflags,
2232         IOLOCK          *lockp          /* can be NULL */
2233 )
2234 {
2235         mdi_unit_t      *ui = MDI_UNIT(mnum);
2236         mm_unit_t       *un;
2237         int             err = 0;
2238 
2239         /* single thread */
2240         if (lockp) {
2241                 /*
2242                  * If ioctl lock is held, use openclose_enter
2243                  * routine that will set the ioctl flag when
2244                  * grabbing the readerlock.
2245                  */
2246                 un = (mm_unit_t *)md_ioctl_openclose_enter(lockp, ui);
2247         } else {
2248                 un = (mm_unit_t *)md_unit_openclose_enter(ui);
2249         }
2250 
2251         /* count closed */
2252         if ((err = md_unit_decopen(mnum, otyp)) != 0)
2253                 goto out;
2254 
2255         /* close devices, if necessary */
2256         if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
2257                 /*
2258                  * Clean up dirty bitmap for this unit. Do this
2259                  * before closing the underlying devices to avoid
2260                  * race conditions with reset_mirror() as a
2261                  * result of a 'metaset -r' command running in
2262                  * parallel. This might cause deallocation of
2263                  * dirty region bitmaps; with underlying metadevices
2264                  * in place this can't happen.
2265                  * Don't do this if a MN set and ABR not set
2266                  */
2267                 if (new_resync && !(MD_STATUS(un) & MD_UN_KEEP_DIRTY)) {
2268                         if (!MD_MNSET_SETNO(MD_UN2SET(un)) ||
2269                             !(ui->ui_tstate & MD_ABR_CAP))
2270                                 mirror_process_unit_resync(un);
2271                 }
2272                 (void) mirror_close_all_devs(un, md_cflags);
2273 
2274                 /*
2275                  * For a MN set with transient capabilities (eg ABR/DMR) set,
2276                  * clear these capabilities on the last open in the cluster.
2277                  * To do this we send a message to all nodes to see if the
2278                  * device is open.
2279                  */
2280                 if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
2281                     (ui->ui_tstate & (MD_ABR_CAP|MD_DMR_CAP))) {
2282                         if (lockp) {
2283                                 (void) md_ioctl_openclose_exit(lockp);
2284                         } else {
2285                                 md_unit_openclose_exit(ui);
2286                         }
2287 
2288                         /*
2289                          * if we are in the context of an ioctl, drop the
2290                          * ioctl lock.
2291                          * Otherwise, no other locks should be held.
2292                          */
2293                         if (lockp) {
2294                                 IOLOCK_RETURN_RELEASE(0, lockp);
2295                         }
2296 
2297                         mdmn_clear_all_capabilities(mnum);
2298 
2299                         /* if dropped the lock previously, regain it */
2300                         if (lockp) {
2301                                 IOLOCK_RETURN_REACQUIRE(lockp);
2302                         }
2303                         return (0);
2304                 }
2305                 /* unlock and return success */
2306         }
2307 out:
2308         /* Call whether lockp is NULL or not. */
2309         if (lockp) {
2310                 md_ioctl_openclose_exit(lockp);
2311         } else {
2312                 md_unit_openclose_exit(ui);
2313         }
2314         return (err);
2315 }
2316 
2317 /*
2318  * When a component has completed resyncing and is now ok, check if the
2319  * corresponding component in the other submirrors is in the Last Erred
2320  * state.  If it is, we want to change that to the Erred state so we stop
2321  * using that component and start using this good component instead.
2322  *
2323  * This is called from set_sm_comp_state and recursively calls
2324  * set_sm_comp_state if it needs to change the Last Erred state.
2325  */
2326 static void
2327 reset_lasterred(mm_unit_t *un, int smi, mddb_recid_t *extras, uint_t flags,
2328         IOLOCK *lockp)
2329 {
2330         mm_submirror_t          *sm;
2331         mm_submirror_ic_t       *smic;
2332         int                     ci;
2333         int                     i;
2334         int                     compcnt;
2335         int                     changed = 0;
2336 
2337         for (i = 0; i < NMIRROR; i++) {
2338                 sm = &un->un_sm[i];
2339                 smic = &un->un_smic[i];
2340 
2341                 if (!SMS_IS(sm, SMS_INUSE))
2342                         continue;
2343 
2344                 /* ignore the submirror that we just made ok */
2345                 if (i == smi)
2346                         continue;
2347 
2348                 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
2349                 for (ci = 0; ci < compcnt; ci++) {
2350                         md_m_shared_t   *shared;
2351 
2352                         shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
2353                             (sm->sm_dev, sm, ci);
2354 
2355                         if ((shared->ms_state & CS_LAST_ERRED) &&
2356                             !mirror_other_sources(un, i, ci, 1)) {
2357 
2358                                 set_sm_comp_state(un, i, ci, CS_ERRED, extras,
2359                                     flags, lockp);
2360                                 changed = 1;
2361                         }
2362                 }
2363         }
2364 
2365         /* maybe there is a hotspare for this newly erred component */
2366         if (changed) {
2367                 set_t   setno;
2368 
2369                 setno = MD_UN2SET(un);
2370                 if (MD_MNSET_SETNO(setno)) {
2371                         send_poke_hotspares(setno);
2372                 } else {
2373                         (void) poke_hotspares();
2374                 }
2375         }
2376 }
2377 
2378 /*
2379  * set_sm_comp_state
2380  *
2381  * Set the state of a submirror component to the specified new state.
2382  * If the mirror is in a multi-node set, send messages to all nodes to
2383  * block all writes to the mirror and then update the state and release the
2384  * writes. These messages are only sent if MD_STATE_XMIT is set in flags.
2385  * MD_STATE_XMIT will be unset in 2 cases:
2386  * 1. When the state is changed to CS_RESYNC, as the state will already
2387  * have been updated on each node by the processing of the
2388  * distributed metasync command, hence no need to xmit.
2389  * 2. When the state is changed to CS_OKAY after a resync has completed. Again
2390  * the resync completion will already have been processed on each node by
2391  * the processing of the MD_MN_MSG_RESYNC_PHASE_DONE message for a component
2392  * resync, hence no need to xmit.
2393  *
2394  * If we are called from the update of a watermark (in which case
2395  * MD_STATE_WMUPDATE will be set in ps->flags), the call is due to
2396  * a metainit or similar. In this case the message that we send to propagate
2397  * the state change must not be a class1 message, as that would deadlock with
2398  * the metainit command that is still being processed.
2399  * We achieve this by creating a class2 message, MD_MN_MSG_STATE_UPDATE2,
2400  * instead. This also makes the submessage generator create a class2
2401  * submessage rather than a class1 (which would also block).
2402  *
2403  * On entry, unit_writerlock is held
2404  * If MD_STATE_OCHELD is set in flags, then unit_openclose lock is
2405  * also held.
2406  */
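/*
 * Typical calls (sketch only, mirroring the hotspare error path earlier in
 * this file): from message/ioctl context
 *
 *	set_sm_comp_state(un, smi, ci, CS_ERRED, 0, MD_STATE_NO_XMIT, lockp);
 *
 * and from direct kernel context with the openclose lock held
 *
 *	set_sm_comp_state(un, smi, ci, CS_ERRED, 0,
 *	    (MD_STATE_XMIT | MD_STATE_OCHELD), NULL);
 */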
2407 void
2408 set_sm_comp_state(
2409         mm_unit_t       *un,
2410         int             smi,
2411         int             ci,
2412         int             newstate,
2413         mddb_recid_t    *extras,
2414         uint_t          flags,
2415         IOLOCK          *lockp
2416 )
2417 {
2418         mm_submirror_t          *sm;
2419         mm_submirror_ic_t       *smic;
2420         md_m_shared_t           *shared;
2421         int                     origstate;
2422         void                    (*get_dev)();
2423         ms_cd_info_t            cd;
2424         char                    devname[MD_MAX_CTDLEN];
2425         int                     err;
2426         set_t                   setno = MD_UN2SET(un);
2427         md_mn_msg_stch_t        stchmsg;
2428         mdi_unit_t              *ui = MDI_UNIT(MD_SID(un));
2429         md_mn_kresult_t         *kresult;
2430         int                     rval;
2431         uint_t                  msgflags;
2432         md_mn_msgtype_t         msgtype;
2433         int                     save_lock = 0;
2434         mdi_unit_t              *ui_sm;
2435         int                     nretries = 0;
2436 
2437         sm = &un->un_sm[smi];
2438         smic = &un->un_smic[smi];
2439 
2440         /* If we have a real error status then turn off MD_INACCESSIBLE. */
2441         ui_sm = MDI_UNIT(getminor(md_dev64_to_dev(sm->sm_dev)));
2442         if (newstate & (CS_ERRED | CS_RESYNC | CS_LAST_ERRED) &&
2443             ui_sm->ui_tstate & MD_INACCESSIBLE) {
2444                 ui_sm->ui_tstate &= ~MD_INACCESSIBLE;
2445         }
2446 
2447         shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
2448             (sm->sm_dev, sm, ci);
2449         origstate = shared->ms_state;
2450 
2451         /*
2452          * If the new state is an error and the old one wasn't, generate
2453          * a console message. We do this before we send the state to other
2454          * nodes in a MN set because the state change may change the component
2455          * name  if a hotspare is allocated.
2456          */
2457         if ((! (origstate & (CS_ERRED|CS_LAST_ERRED))) &&
2458             (newstate & (CS_ERRED|CS_LAST_ERRED))) {
2459 
2460                 get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
2461                     "get device", 0);
2462                 (void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
2463 
2464                 err = md_getdevname(setno, mddb_getsidenum(setno), 0,
2465                     cd.cd_dev, devname, sizeof (devname));
2466 
2467                 if (err == ENOENT) {
2468                         (void) md_devname(setno, cd.cd_dev, devname,
2469                             sizeof (devname));
2470                 }
2471 
2472                 cmn_err(CE_WARN, "md: %s: %s needs maintenance",
2473                     md_shortname(md_getminor(sm->sm_dev)), devname);
2474 
2475                 if (newstate & CS_LAST_ERRED) {
2476                         cmn_err(CE_WARN, "md: %s: %s last erred",
2477                             md_shortname(md_getminor(sm->sm_dev)),
2478                             devname);
2479 
2480                 } else if (shared->ms_flags & MDM_S_ISOPEN) {
2481                         /*
2482                          * Close the broken device and clear the open flag on
2483                          * it.  Closing the device means the RCM framework will
2484                          * be able to unconfigure the device if required.
2485                          *
2486                          * We have to check that the device is open, otherwise
2487                          * the first open on it has resulted in the error that
2488                          * is being processed and the actual cd.cd_dev will be
2489                          * NODEV64.
2490                          *
2491                          * If this is a multi-node mirror, then the multinode
2492                          * state checks following this code will cause the
2493                          * slave nodes to close the mirror in the function
2494                          * mirror_set_state().
2495                          */
2496                         md_layered_close(cd.cd_dev, MD_OFLG_NULL);
2497                         shared->ms_flags &= ~MDM_S_ISOPEN;
2498                 }
2499 
2500         } else if ((origstate & CS_LAST_ERRED) && (newstate & CS_ERRED) &&
2501             (shared->ms_flags & MDM_S_ISOPEN)) {
2502                 /*
2503                  * Similar to logic above except no log messages since we
2504                  * are just transitioning from Last Erred to Erred.
2505                  */
2506                 get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
2507                     "get device", 0);
2508                 (void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
2509 
2510                 md_layered_close(cd.cd_dev, MD_OFLG_NULL);
2511                 shared->ms_flags &= ~MDM_S_ISOPEN;
2512         }
2513 
2514         if ((MD_MNSET_SETNO(setno)) && (origstate != newstate) &&
2515             (flags & MD_STATE_XMIT) && !(ui->ui_tstate & MD_ERR_PENDING)) {
2516                 /*
2517                  * For a multi-node mirror, send the state change to the
2518                  * master, which broadcasts to all nodes, including this
2519                  * one. Once the message is received, the state is set
2520                  * in-core and the master commits the change to disk.
2521                  * There is a case, comp_replace,  where this function
2522                  * can be called from within an ioctl and therefore in this
2523                  * case, as the ioctl will already be called on each node,
2524                  * there is no need to xmit the state change to the master for
2525                  * distribution to the other nodes. MD_STATE_XMIT flag is used
2526                  * to indicate whether a xmit is required. The mirror's
2527                  * transient state is set to MD_ERR_PENDING to avoid sending
2528                  * multiple messages.
2529                  */
2530                 if (newstate & (CS_ERRED|CS_LAST_ERRED))
2531                         ui->ui_tstate |= MD_ERR_PENDING;
2532 
2533                 /*
2534                  * Send a state update message to all nodes. This message
2535                  * will generate 2 submessages, the first one to suspend
2536                  * all writes to the mirror and the second to update the
2537                  * state and resume writes.
2538                  */
2539                 stchmsg.msg_stch_mnum = un->c.un_self_id;
2540                 stchmsg.msg_stch_sm = smi;
2541                 stchmsg.msg_stch_comp = ci;
2542                 stchmsg.msg_stch_new_state = newstate;
2543                 stchmsg.msg_stch_hs_id = shared->ms_hs_id;
2544 #ifdef DEBUG
2545                 if (mirror_debug_flag)
2546                         printf("send set state, %x, %x, %x, %x, %x\n",
2547                             stchmsg.msg_stch_mnum, stchmsg.msg_stch_sm,
2548                             stchmsg.msg_stch_comp, stchmsg.msg_stch_new_state,
2549                             stchmsg.msg_stch_hs_id);
2550 #endif
2551                 if (flags & MD_STATE_WMUPDATE) {
2552                         msgtype  = MD_MN_MSG_STATE_UPDATE2;
2553                         /*
2554                          * When coming from an update of watermarks, there
2555                          * must already be a message logged that triggered
2556                          * this action. So, no need to log this message, too.
2557                          */
2558                         msgflags = MD_MSGF_NO_LOG;
2559                 } else {
2560                         msgtype  = MD_MN_MSG_STATE_UPDATE;
2561                         msgflags = MD_MSGF_DEFAULT_FLAGS;
2562                 }
2563 
2564                 /*
2565                  * If we are in the context of an ioctl, drop the ioctl lock.
2566                  * lockp holds the list of locks held.
2567                  *
2568                  * Otherwise, increment the appropriate reacquire counters.
2569                  * If openclose lock is *held, then must reacquire reader
2570                  * lock before releasing the openclose lock.
2571                  * Do not drop the ARRAY_WRITER lock as we may not be able
2572                  * to reacquire it.
2573                  */
2574                 if (lockp) {
2575                         if (lockp->l_flags & MD_ARRAY_WRITER) {
2576                                 save_lock = MD_ARRAY_WRITER;
2577                                 lockp->l_flags &= ~MD_ARRAY_WRITER;
2578                         } else if (lockp->l_flags & MD_ARRAY_READER) {
2579                                 save_lock = MD_ARRAY_READER;
2580                                 lockp->l_flags &= ~MD_ARRAY_READER;
2581                         }
2582                         IOLOCK_RETURN_RELEASE(0, lockp);
2583                 } else {
2584                         if (flags & MD_STATE_OCHELD) {
2585                                 md_unit_writerexit(ui);
2586                                 (void) md_unit_readerlock(ui);
2587                                 md_unit_openclose_exit(ui);
2588                         } else {
2589                                 md_unit_writerexit(ui);
2590                         }
2591                 }
2592 
2593                 kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
2594 sscs_msg:
2595                 rval = mdmn_ksend_message(setno, msgtype, msgflags, 0,
2596                     (char *)&stchmsg, sizeof (stchmsg), kresult);
2597 
2598                 if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
2599                         mdmn_ksend_show_error(rval, kresult, "STATE UPDATE");
2600                         /* If we're shutting down already, pause things here. */
2601                         if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
2602                                 while (!md_mn_is_commd_present()) {
2603                                         delay(md_hz);
2604                                 }
2605                                 /*
2606                                  * commd is now available; retry the message
2607                                  * one time. If that fails we fall through and
2608                                  * panic as the system is in an unexpected state
2609                                  */
2610                                 if (nretries++ == 0)
2611                                         goto sscs_msg;
2612                         }
2613                         cmn_err(CE_PANIC,
2614                             "ksend_message failure: STATE_UPDATE");
2615                 }
2616                 kmem_free(kresult, sizeof (md_mn_kresult_t));
2617 
2618                 /* if dropped the lock previously, regain it */
2619                 if (lockp) {
2620                         IOLOCK_RETURN_REACQUIRE(lockp);
2621                         lockp->l_flags |= save_lock;
2622                 } else {
2623                         /*
2624                          * Reacquire dropped locks and update acquirecnts
2625                          * appropriately.
2626                          */
2627                         if (flags & MD_STATE_OCHELD) {
2628                                 /*
2629                                  * openclose also grabs readerlock.
2630                                  */
2631                                 (void) md_unit_openclose_enter(ui);
2632                                 md_unit_readerexit(ui);
2633                                 (void) md_unit_writerlock(ui);
2634                         } else {
2635                                 (void) md_unit_writerlock(ui);
2636                         }
2637                 }
2638 
2639                 ui->ui_tstate &= ~MD_ERR_PENDING;
2640         } else {
2641                 shared->ms_state = newstate;
2642                 uniqtime32(&shared->ms_timestamp);
2643 
2644                 if (newstate == CS_ERRED)
2645                         shared->ms_flags |= MDM_S_NOWRITE;
2646                 else
2647                         shared->ms_flags &= ~MDM_S_NOWRITE;
2648 
2649                 shared->ms_flags &= ~MDM_S_IOERR;
2650                 un->un_changecnt++;
2651                 shared->ms_lasterrcnt = un->un_changecnt;
2652 
2653                 mirror_set_sm_state(sm, smic, SMS_RUNNING, 0);
2654                 mirror_commit(un, SMI2BIT(smi), extras);
2655         }
2656 
2657         if ((origstate & CS_RESYNC) && (newstate & CS_OKAY)) {
2658                 /*
2659                  * Resetting the Last Erred state will recursively call back
2660                  * into this function (set_sm_comp_state) to update the state.
2661                  */
2662                 reset_lasterred(un, smi, extras, flags, lockp);
2663         }
2664 }
2665 
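/*
 * find_another_logical:
 * ---------------------
 * Check that every block in the range [blk, blk + cnt) can be read from a
 * submirror other than esm, which is temporarily flagged SMS_IGNORE for the
 * duration of the scan.  Returns 0 if an alternate source covers the whole
 * range, non-zero otherwise.
 */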
2666 static int
2667 find_another_logical(
2668         mm_unit_t               *un,
2669         mm_submirror_t          *esm,
2670         diskaddr_t              blk,
2671         u_longlong_t            cnt,
2672         int                     must_be_open,
2673         int                     state,
2674         int                     err_cnt)
2675 {
2676         u_longlong_t    cando;
2677         md_dev64_t      dev;
2678         md_m_shared_t   *s;
2679 
2680         esm->sm_state |= SMS_IGNORE;
2681         while (cnt != 0) {
2682                 u_longlong_t     mcnt;
2683 
2684                 mcnt = MIN(cnt, lbtodb(1024 * 1024 * 1024));    /* 1 Gig Blks */
2685 
2686                 dev = select_read_unit(un, blk, mcnt, &cando,
2687                     must_be_open, &s, NULL);
2688                 if (dev == (md_dev64_t)0)
2689                         break;
2690 
2691                 if ((state == CS_LAST_ERRED) &&
2692                     (s->ms_state == CS_LAST_ERRED) &&
2693                     (err_cnt > s->ms_lasterrcnt))
2694                         break;
2695 
2696                 cnt -= cando;
2697                 blk += cando;
2698         }
2699         esm->sm_state &= ~SMS_IGNORE;
2700         return (cnt != 0);
2701 }
2702 
2703 int
2704 mirror_other_sources(mm_unit_t *un, int smi, int ci, int must_be_open)
2705 {
2706         mm_submirror_t          *sm;
2707         mm_submirror_ic_t       *smic;
2708         size_t                  count;
2709         diskaddr_t              block;
2710         u_longlong_t            skip;
2711         u_longlong_t            size;
2712         md_dev64_t              dev;
2713         int                     cnt;
2714         md_m_shared_t           *s;
2715         int                     not_found;
2716 
2717         sm = &un->un_sm[smi];
2718         smic = &un->un_smic[smi];
2719         dev = sm->sm_dev;
2720 
2721         /*
2722          * Make sure every component of the submirror
2723          * has other sources.
2724          */
2725         if (ci < 0) {
2726                 /* Find the highest lasterrcnt */
2727                 cnt = (*(smic->sm_get_component_count))(dev, sm);
2728                 for (ci = 0; ci < cnt; ci++) {
2729                         not_found = mirror_other_sources(un, smi, ci,
2730                             must_be_open);
2731                         if (not_found)
2732                                 return (1);
2733                 }
2734                 return (0);
2735         }
2736 
2737         /*
2738          * Make sure this component has other sources
2739          */
2740         (void) (*(smic->sm_get_bcss))
2741             (dev, sm, ci, &block, &count, &skip, &size);
2742 
2743         if (count == 0)
2744                 return (1);
2745 
2746         s = (md_m_shared_t *)(*(smic->sm_shared_by_indx))(dev, sm, ci);
2747 
2748         while (count--) {
2749                 if (block >= un->c.un_total_blocks)
2750                         return (0);
2751 
2752                 if ((block + size) > un->c.un_total_blocks)
2753                         size = un->c.un_total_blocks - block;
2754 
2755                 not_found = find_another_logical(un, sm, block, size,
2756                     must_be_open, s->ms_state, s->ms_lasterrcnt);
2757                 if (not_found)
2758                         return (1);
2759 
2760                 block += size + skip;
2761         }
2762         return (0);
2763 }
2764 
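     /*
      * finish_error:
      * ------------
      * Final completion handling for a parent request that encountered an
      * error. A write-after-read (resync) request is completed directly,
      * flagging B_ERROR only if MD_MPS_FLAG_ERROR is set. If the mirror
      * changed while the request was in flight (changecnt mismatch) the
      * request is re-issued through md_mirror_strategy(); otherwise the
      * error is returned to the caller.
      */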
2765 static void
2766 finish_error(md_mps_t *ps)
2767 {
2768         struct buf      *pb;
2769         mm_unit_t       *un;
2770         mdi_unit_t      *ui;
2771         uint_t          new_str_flags;
2772 
2773         pb = ps->ps_bp;
2774         un = ps->ps_un;
2775         ui = ps->ps_ui;
2776 
2777         /*
2778          * Must flag any error to the resync originator if we're performing
2779          * a Write-after-Read. This corresponds to an i/o error on a resync
2780          * target device and in this case we ought to abort the resync as there
2781          * is nothing that can be done to recover from this without operator
2782          * intervention. If we don't set the B_ERROR flag we will continue
2783          * reading from the mirror but won't write to the target (as it will
2784          * have been placed into an errored state).
2785          * To handle the case of multiple components within a submirror we only
2786          * set the B_ERROR bit if explicitly requested to via MD_MPS_FLAG_ERROR.
2787          * The originator of the resync read will cause this bit to be set if
2788          * the underlying component count is one for a submirror resync. All
2789          * other resync types will have the flag set as there is no underlying
2790          * resync which can be performed on a contained metadevice for these
2791          * resync types (optimized or component).
2792          */
2793 
2794         if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ) {
2795                 if (ps->ps_flags & MD_MPS_FLAG_ERROR)
2796                         pb->b_flags |= B_ERROR;
2797                 md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
2798                 MPS_FREE(mirror_parent_cache, ps);
2799                 md_unit_readerexit(ui);
2800                 md_biodone(pb);
2801                 return;
2802         }
2803         /*
2804          * Set the MD_IO_COUNTED flag as we are retrying the same I/O
2805          * operation; this I/O request has therefore already been counted,
2806          * and the I/O count variable will be decremented by mirror_done()'s
2807          * call to md_biodone().
2808          */
2809         if (ps->ps_changecnt != un->un_changecnt) {
2810                 new_str_flags = MD_STR_NOTTOP | MD_IO_COUNTED;
2811                 if (ps->ps_flags & MD_MPS_WOW)
2812                         new_str_flags |= MD_STR_WOW;
2813                 if (ps->ps_flags & MD_MPS_MAPPED)
2814                         new_str_flags |= MD_STR_MAPPED;
2815                 /*
2816                  * If this I/O request was a read that was part of a resync,
2817                  * set MD_STR_WAR for the retried read to ensure that the
2818                  * resync write (i.e. write-after-read) will be performed
2819                  */
2820                 if (ps->ps_flags & MD_MPS_RESYNC_READ)
2821                         new_str_flags |= MD_STR_WAR;
2822                 md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
2823                 MPS_FREE(mirror_parent_cache, ps);
2824                 md_unit_readerexit(ui);
2825                 (void) md_mirror_strategy(pb, new_str_flags, NULL);
2826                 return;
2827         }
2828 
2829         pb->b_flags |= B_ERROR;
2830         md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
2831         MPS_FREE(mirror_parent_cache, ps);
2832         md_unit_readerexit(ui);
2833         md_biodone(pb);
2834 }
2835 
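     /*
      * error_update_unit:
      * -----------------
      * Daemon routine (md_mstr_daemon) that processes component errors for
      * the unit. Each errored component is moved to the Erred state, or to
      * Last Erred if mirror_other_sources() shows no alternative copy of
      * its data. Hotspare allocation is then requested and the original
      * request is completed via finish_error().
      */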
2836 static void
2837 error_update_unit(md_mps_t *ps)
2838 {
2839         mm_unit_t               *un;
2840         mdi_unit_t              *ui;
2841         int                     smi;    /* sub mirror index */
2842         int                     ci;     /* errored component */
2843         set_t                   setno;
2844         uint_t                  flags;  /* for set_sm_comp_state() */
2845         uint_t                  hspflags; /* for check_comp_4_hotspares() */
2846 
2847         ui = ps->ps_ui;
2848         un = (mm_unit_t *)md_unit_writerlock(ui);
2849         setno = MD_UN2SET(un);
2850 
2851         /* All of these updates have to be propagated in case of an MN set */
2852         flags = MD_STATE_XMIT;
2853         hspflags = MD_HOTSPARE_XMIT;
2854 
2855         /* special treatment if we are called while updating watermarks */
2856         if (ps->ps_flags & MD_MPS_WMUPDATE) {
2857                 flags |= MD_STATE_WMUPDATE;
2858                 hspflags |= MD_HOTSPARE_WMUPDATE;
2859         }
2860         smi = 0;
2861         ci = 0;
2862         while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) {
2863                 if (mirror_other_sources(un, smi, ci, 0) == 1) {
2864 
2865                         /* Never called from ioctl context, so (IOLOCK *)NULL */
2866                         set_sm_comp_state(un, smi, ci, CS_LAST_ERRED, 0, flags,
2867                             (IOLOCK *)NULL);
2868                         /*
2869                          * For a MN set, the NOTIFY is done when the state
2870                          * change is processed on each node
2871                          */
2872                         if (!MD_MNSET_SETNO(MD_UN2SET(un))) {
2873                                 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED,
2874                                     SVM_TAG_METADEVICE, setno, MD_SID(un));
2875                         }
2876                         continue;
2877                 }
2878                 /* Never called from ioctl context, so (IOLOCK *)NULL */
2879                 set_sm_comp_state(un, smi, ci, CS_ERRED, 0, flags,
2880                     (IOLOCK *)NULL);
2881                 /*
2882                  * For a MN set, the NOTIFY is done when the state
2883                  * change is processed on each node
2884                  */
2885                 if (!MD_MNSET_SETNO(MD_UN2SET(un))) {
2886                         SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
2887                             SVM_TAG_METADEVICE, setno, MD_SID(un));
2888                 }
2889                 smi = 0;
2890                 ci = 0;
2891         }
2892 
2893         md_unit_writerexit(ui);
2894         if (MD_MNSET_SETNO(setno)) {
2895                 send_poke_hotspares(setno);
2896         } else {
2897                 (void) poke_hotspares();
2898         }
2899         (void) md_unit_readerlock(ui);
2900 
2901         finish_error(ps);
2902 }
2903 
2904 /*
2905  * When we have a B_FAILFAST IO error on a Last Erred component we need to
2906  * retry the IO without B_FAILFAST set so that we try to ensure that the
2907  * component "sees" each IO.
2908  */
2909 static void
2910 last_err_retry(md_mcs_t *cs)
2911 {
2912         struct buf      *cb;
2913         md_mps_t        *ps;
2914         uint_t          flags;
2915 
2916         cb = &cs->cs_buf;
2917         cb->b_flags &= ~B_FAILFAST;
2918 
2919         /* if we're panicking just let this I/O error out */
2920         if (panicstr) {
2921                 (void) mirror_done(cb);
2922                 return;
2923         }
2924 
2925         /* reissue the I/O */
2926 
2927         ps = cs->cs_ps;
2928 
2929         bioerror(cb, 0);
2930 
2931         mutex_enter(&ps->ps_mx);
2932 
2933         flags = MD_STR_NOTTOP;
2934         if (ps->ps_flags & MD_MPS_MAPPED)
2935                 flags |= MD_STR_MAPPED;
2936         if (ps->ps_flags & MD_MPS_NOBLOCK)
2937                 flags |= MD_NOBLOCK;
2938 
2939         mutex_exit(&ps->ps_mx);
2940 
2941         clear_retry_error(cb);
2942 
2943         cmn_err(CE_NOTE, "!md: %s: Last Erred, retry I/O without B_FAILFAST",
2944             md_shortname(getminor(cb->b_edev)));
2945 
2946         md_call_strategy(cb, flags, NULL);
2947 }
2948 
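     /*
      * mirror_error:
      * ------------
      * Handle a parent request that completed with MD_MPS_ERROR set. If any
      * component needs a state change the work is deferred to
      * error_update_unit() on the md_mstr_daemon queue, otherwise the
      * request is completed via finish_error().
      */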
2949 static void
2950 mirror_error(md_mps_t *ps)
2951 {
2952         int             smi;    /* sub mirror index */
2953         int             ci;     /* errored component */
2954 
2955         if (panicstr) {
2956                 finish_error(ps);
2957                 return;
2958         }
2959 
2960         if (ps->ps_flags & MD_MPS_ON_OVERLAP)
2961                 mirror_overlap_tree_remove(ps);
2962 
2963         smi = 0;
2964         ci = 0;
2965         if (mirror_geterror(ps->ps_un, &smi, &ci, 0, 0) != 0) {
2966                 md_unit_readerexit(ps->ps_ui);
2967                 daemon_request(&md_mstr_daemon, error_update_unit,
2968                     (daemon_queue_t *)ps, REQ_OLD);
2969                 return;
2970         }
2971 
2972         finish_error(ps);
2973 }
2974 
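     /*
      * copy_write_done:
      * ---------------
      * iodone routine for a write-on-write (WOW) copy buffer. Any error is
      * propagated to the parent buf. If more of the parent write remains
      * and no error has occurred, the next section is scheduled through
      * copy_write_cont(); otherwise the WOW resources are released and the
      * parent request is completed.
      */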
2975 static int
2976 copy_write_done(struct buf *cb)
2977 {
2978         md_mps_t        *ps;
2979         buf_t           *pb;
2980         char            *wowbuf;
2981         wowhdr_t        *wowhdr;
2982         ssize_t         wow_resid;
2983 
2984         /* get wowbuf and save structure */
2985         wowbuf = cb->b_un.b_addr;
2986         wowhdr = WOWBUF_HDR(wowbuf);
2987         ps = wowhdr->wow_ps;
2988         pb = ps->ps_bp;
2989 
2990         /* Save error information, then free cb */
2991         if (cb->b_flags & B_ERROR)
2992                 pb->b_flags |= B_ERROR;
2993 
2994         if (cb->b_flags & B_REMAPPED)
2995                 bp_mapout(cb);
2996 
2997         freerbuf(cb);
2998 
2999         /* update residual and continue if needed */
3000         if ((pb->b_flags & B_ERROR) == 0) {
3001                 wow_resid = pb->b_bcount - wowhdr->wow_offset;
3002                 pb->b_resid = wow_resid;
3003                 if (wow_resid > 0)  {
3004                         daemon_request(&md_mstr_daemon, copy_write_cont,
3005                             (daemon_queue_t *)wowhdr, REQ_OLD);
3006                         return (1);
3007                 }
3008         }
3009 
3010         /* Write is complete, release resources. */
3011         kmem_cache_free(mirror_wowblk_cache, wowhdr);
3012         ASSERT(!(ps->ps_flags & MD_MPS_ON_OVERLAP));
3013         md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
3014         MPS_FREE(mirror_parent_cache, ps);
3015         md_biodone(pb);
3016         return (0);
3017 }
3018 
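     /*
      * copy_write_cont:
      * ---------------
      * Copy the next section (at most md_wowbuf_size bytes) of the parent
      * write into the private WOW buffer and issue it as a new write via
      * md_mirror_strategy(). Completion is handled by copy_write_done().
      */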
3019 static void
3020 copy_write_cont(wowhdr_t *wowhdr)
3021 {
3022         buf_t           *pb;
3023         buf_t           *cb;
3024         char            *wowbuf;
3025         int             wow_offset;
3026         size_t          wow_resid;
3027         diskaddr_t      wow_blkno;
3028 
3029         wowbuf = WOWHDR_BUF(wowhdr);
3030         pb = wowhdr->wow_ps->ps_bp;
3031 
3032         /* get data on current location */
3033         wow_offset = wowhdr->wow_offset;
3034         wow_resid = pb->b_bcount - wow_offset;
3035         wow_blkno = pb->b_lblkno + lbtodb(wow_offset);
3036 
3037         /* setup child buffer */
3038         cb = getrbuf(KM_SLEEP);
3039         cb->b_flags = B_WRITE;
3040         cb->b_edev = pb->b_edev;
3041         cb->b_un.b_addr = wowbuf;    /* change to point at WOWBUF */
3042         cb->b_bufsize = md_wowbuf_size; /* change to wowbuf_size */
3043         cb->b_iodone = copy_write_done;
3044         cb->b_bcount = MIN(md_wowbuf_size, wow_resid);
3045         cb->b_lblkno = wow_blkno;
3046 
3047         /* move offset to next section */
3048         wowhdr->wow_offset += cb->b_bcount;
3049 
3050         /* copy and setup write for current section */
3051         bcopy(&pb->b_un.b_addr[wow_offset], wowbuf, cb->b_bcount);
3052 
3053         /* do it */
3054         /*
3055          * Do not set the MD_IO_COUNTED flag as this is a new I/O request
3056          * that handles the WOW condition. The resultant increment on the
3057          * I/O count variable is cleared by copy_write_done()'s call to
3058          * md_biodone().
3059          */
3060         (void) md_mirror_strategy(cb, MD_STR_NOTTOP | MD_STR_WOW
3061             | MD_STR_MAPPED, NULL);
3062 }
3063 
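     /*
      * md_mirror_copy_write:
      * --------------------
      * Start the copy-based handling of a write-on-write condition:
      * allocate and initialise a wowhdr for the parent request and issue
      * the first section through copy_write_cont().
      */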
3064 static void
3065 md_mirror_copy_write(md_mps_t *ps)
3066 {
3067         wowhdr_t        *wowhdr;
3068 
3069         wowhdr = kmem_cache_alloc(mirror_wowblk_cache, MD_ALLOCFLAGS);
3070         mirror_wowblk_init(wowhdr);
3071         wowhdr->wow_ps = ps;
3072         wowhdr->wow_offset = 0;
3073         copy_write_cont(wowhdr);
3074 }
3075 
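     /*
      * handle_wow:
      * ----------
      * Daemon routine that deals with a detected write-on-write condition.
      * Depending on md_mirror_wow_flg the write is either retried in place
      * (WOW_NOCOPY) or redirected through the private copy buffer path in
      * md_mirror_copy_write().
      */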
3076 static void
3077 handle_wow(md_mps_t *ps)
3078 {
3079         buf_t           *pb;
3080 
3081         pb = ps->ps_bp;
3082 
3083         bp_mapin(pb);
3084 
3085         md_mirror_wow_cnt++;
3086         if (!(pb->b_flags & B_PHYS) && (md_mirror_wow_flg & WOW_LOGIT)) {
3087                 cmn_err(CE_NOTE,
3088                     "md: %s, blk %lld, cnt %ld: Write on write %d occurred",
3089                     md_shortname(getminor(pb->b_edev)),
3090                     (longlong_t)pb->b_lblkno, pb->b_bcount, md_mirror_wow_cnt);
3091         }
3092 
3093         /*
3094          * Set the MD_IO_COUNTED flag as we are retrying the same I/O
3095          * operation; this I/O request has therefore already been counted,
3096          * and the I/O count variable will be decremented by mirror_done()'s
3097          * call to md_biodone().
3098          */
3099         if (md_mirror_wow_flg & WOW_NOCOPY)
3100                 (void) md_mirror_strategy(pb, MD_STR_NOTTOP | MD_STR_WOW |
3101                     MD_STR_MAPPED | MD_IO_COUNTED, ps);
3102         else
3103                 md_mirror_copy_write(ps);
3104 }
3105 
3106 /*
3107  * Return true if the specified submirror is either in the Last Erred
3108  * state or is transitioning into the Last Erred state.
3109  */
3110 static bool_t
3111 submirror_is_lasterred(mm_unit_t *un, int smi)
3112 {
3113         mm_submirror_t          *sm;
3114         mm_submirror_ic_t       *smic;
3115         md_m_shared_t           *shared;
3116         int                     ci;
3117         int                     compcnt;
3118 
3119         sm = &un->un_sm[smi];
3120         smic = &un->un_smic[smi];
3121 
3122         compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
3123         for (ci = 0; ci < compcnt; ci++) {
3124                 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
3125                     (sm->sm_dev, sm, ci);
3126 
3127                 if (shared->ms_state == CS_LAST_ERRED)
3128                         return (B_TRUE);
3129 
3130                 /*
3131                  * It is not currently Last Erred, check if entering Last Erred.
3132                  */
3133                 if ((shared->ms_flags & MDM_S_IOERR) &&
3134                     ((shared->ms_state == CS_OKAY) ||
3135                     (shared->ms_state == CS_RESYNC))) {
3136                         if (mirror_other_sources(un, smi, ci, 0) == 1)
3137                                 return (B_TRUE);
3138                 }
3139         }
3140 
3141         return (B_FALSE);
3142 }
3143 
3144 
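     /*
      * mirror_done:
      * -----------
      * iodone routine for a mirror child buffer. If a B_FAILFAST request
      * failed on a submirror that is (or is becoming) Last Erred, the I/O
      * is re-issued without B_FAILFAST via last_err_retry(). Otherwise the
      * error, if any, is recorded against the parent and completion
      * continues in mirror_done_common().
      */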
3145 static int
3146 mirror_done(struct buf *cb)
3147 {
3148         md_mps_t        *ps;
3149         md_mcs_t        *cs;
3150 
3151         /*LINTED*/
3152         cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
3153         ps = cs->cs_ps;
3154 
3155         mutex_enter(&ps->ps_mx);
3156 
3157         /* check if we need to retry an errored failfast I/O */
3158         if (cb->b_flags & B_ERROR) {
3159                 struct buf *pb = ps->ps_bp;
3160 
3161                 if (cb->b_flags & B_FAILFAST) {
3162                         int             i;
3163                         mm_unit_t       *un = ps->ps_un;
3164 
3165                         for (i = 0; i < NMIRROR; i++) {
3166                                 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
3167                                         continue;
3168 
3169                                 if (cb->b_edev ==
3170                                     md_dev64_to_dev(un->un_sm[i].sm_dev)) {
3171 
3172                                         /*
3173                                          * This is the submirror that had the
3174                                          * error.  Check if it is Last Erred.
3175                                          */
3176                                         if (submirror_is_lasterred(un, i)) {
3177                                                 daemon_queue_t *dqp;
3178 
3179                                                 mutex_exit(&ps->ps_mx);
3180                                                 dqp = (daemon_queue_t *)cs;
3181                                                 dqp->dq_prev = NULL;
3182                                                 dqp->dq_next = NULL;
3183                                                 daemon_request(&md_done_daemon,
3184                                                     last_err_retry, dqp,
3185                                                     REQ_OLD);
3186                                                 return (1);
3187                                         }
3188                                         break;
3189                                 }
3190                         }
3191                 }
3192 
3193                 /* continue to process the buf without doing a retry */
3194                 ps->ps_flags |= MD_MPS_ERROR;
3195                 pb->b_error = cb->b_error;
3196         }
3197 
3198         return (mirror_done_common(cb));
3199 }
3200 
3201 /*
3202  * Split from the original mirror_done function so we can handle bufs after a
3203  * retry.
3204  * ps->ps_mx is already held in the caller of this function and the cb error
3205  * has already been checked and handled in the caller.
3206  */
3207 static int
3208 mirror_done_common(struct buf *cb)
3209 {
3210         struct buf      *pb;
3211         mm_unit_t       *un;
3212         mdi_unit_t      *ui;
3213         md_mps_t        *ps;
3214         md_mcs_t        *cs;
3215         size_t          end_rr, start_rr, current_rr;
3216 
3217         /*LINTED*/
3218         cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
3219         ps = cs->cs_ps;
3220         pb = ps->ps_bp;
3221 
3222         if (cb->b_flags & B_REMAPPED)
3223                 bp_mapout(cb);
3224 
3225         ps->ps_frags--;
3226         if (ps->ps_frags != 0) {
3227                 mutex_exit(&ps->ps_mx);
3228                 kmem_cache_free(mirror_child_cache, cs);
3229                 return (1);
3230         }
3231         un = ps->ps_un;
3232         ui = ps->ps_ui;
3233 
3234         /*
3235          * Do not update outstanding_writes if we're running with ABR
3236          * set for this mirror or the write() was issued with MD_STR_ABR set.
3237          * A resync initiated write() also has no outstanding_writes
3238          * update.
3239          */
3240         if (((cb->b_flags & B_READ) == 0) &&
3241             (un->un_nsm >= 2) &&
3242             (ps->ps_call == NULL) &&
3243             !((ui->ui_tstate & MD_ABR_CAP) || (ps->ps_flags & MD_MPS_ABR)) &&
3244             !(ps->ps_flags & MD_MPS_WRITE_AFTER_READ)) {
3245                 BLK_TO_RR(end_rr, ps->ps_lastblk, un);
3246                 BLK_TO_RR(start_rr, ps->ps_firstblk, un);
3247                 mutex_enter(&un->un_resync_mx);
3248                 for (current_rr = start_rr; current_rr <= end_rr; current_rr++)
3249                         un->un_outstanding_writes[current_rr]--;
3250                 mutex_exit(&un->un_resync_mx);
3251         }
3252         kmem_cache_free(mirror_child_cache, cs);
3253         mutex_exit(&ps->ps_mx);
3254 
3255         if (ps->ps_call != NULL) {
3256                 daemon_request(&md_done_daemon, ps->ps_call,
3257                     (daemon_queue_t *)ps, REQ_OLD);
3258                 return (1);
3259         }
3260 
3261         if ((ps->ps_flags & MD_MPS_ERROR)) {
3262                 daemon_request(&md_done_daemon, mirror_error,
3263                     (daemon_queue_t *)ps, REQ_OLD);
3264                 return (1);
3265         }
3266 
3267         if (ps->ps_flags & MD_MPS_ON_OVERLAP)
3268                 mirror_overlap_tree_remove(ps);
3269 
3270         /*
3271          * Handle Write-on-Write problem.
3272          * Skip in the case of Raw and Direct I/O as they are
3273          * handled earlier.
3274          *
3275          */
3276         if (!(md_mirror_wow_flg & WOW_DISABLE) &&
3277             !(pb->b_flags & B_READ) &&
3278             !(ps->ps_flags & MD_MPS_WOW) &&
3279             !(pb->b_flags & B_PHYS) &&
3280             any_pages_dirty(pb)) {
3281                 md_unit_readerexit(ps->ps_ui);
3282                 daemon_request(&md_mstr_daemon, handle_wow,
3283                     (daemon_queue_t *)ps, REQ_OLD);
3284                 return (1);
3285         }
3286 
3287         md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
3288         MPS_FREE(mirror_parent_cache, ps);
3289         md_unit_readerexit(ui);
3290         md_biodone(pb);
3291         return (0);
3292 }
3293 
3294 /*
3295  * Clear error state in submirror component if the retry worked after
3296  * a failfast error.
3297  */
3298 static void
3299 clear_retry_error(struct buf *cb)
3300 {
3301         int                     smi;
3302         md_mcs_t                *cs;
3303         mm_unit_t               *un;
3304         mdi_unit_t              *ui_sm;
3305         mm_submirror_t          *sm;
3306         mm_submirror_ic_t       *smic;
3307         u_longlong_t            cnt;
3308         md_m_shared_t           *shared;
3309 
3310         /*LINTED*/
3311         cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
3312         un = cs->cs_ps->ps_un;
3313 
3314         for (smi = 0; smi < NMIRROR; smi++) {
3315                 if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
3316                         continue;
3317 
3318                 if (cb->b_edev == md_dev64_to_dev(un->un_sm[smi].sm_dev))
3319                         break;
3320         }
3321 
3322         if (smi >= NMIRROR)
3323                 return;
3324 
3325         sm = &un->un_sm[smi];
3326         smic = &un->un_smic[smi];
3327         cnt = cb->b_bcount;
3328 
3329         ui_sm = MDI_UNIT(getminor(cb->b_edev));
3330         (void) md_unit_writerlock(ui_sm);
3331 
3332         shared = (md_m_shared_t *)(*(smic->sm_shared_by_blk))(sm->sm_dev, sm,
3333             cb->b_blkno, &cnt);
3334 
3335         if (shared->ms_flags & MDM_S_IOERR) {
3336                 shared->ms_flags &= ~MDM_S_IOERR;
3337 
3338         } else {
3339                 /* the buf spans components and the first one is not erred */
3340                 int     cnt;
3341                 int     i;
3342 
3343                 cnt = (*(smic->sm_get_component_count))(sm->sm_dev, un);
3344                 for (i = 0; i < cnt; i++) {
3345                         shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
3346                             (sm->sm_dev, sm, i);
3347 
3348                         if (shared->ms_flags & MDM_S_IOERR &&
3349                             shared->ms_state == CS_OKAY) {
3350 
3351                                 shared->ms_flags &= ~MDM_S_IOERR;
3352                                 break;
3353                         }
3354                 }
3355         }
3356 
3357         md_unit_writerexit(ui_sm);
3358 }
3359 
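     /*
      * mirror_map_read:
      * ---------------
      * Select a readable submirror for the child buffer covering blkno and
      * set b_edev/b_bcount accordingly. Returns 0 if the whole count could
      * be mapped to one submirror, otherwise the number of blocks that were
      * mapped so that the caller can issue the remainder as a further
      * fragment.
      */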
3360 static size_t
3361 mirror_map_read(
3362         md_mps_t *ps,
3363         md_mcs_t *cs,
3364         diskaddr_t blkno,
3365         u_longlong_t    count
3366 )
3367 {
3368         mm_unit_t       *un;
3369         buf_t           *bp;
3370         u_longlong_t    cando;
3371 
3372         bp = &cs->cs_buf;
3373         un = ps->ps_un;
3374 
3375         bp->b_lblkno = blkno;
3376         if (fast_select_read_unit(ps, cs) == 0) {
3377                 bp->b_bcount = ldbtob(count);
3378                 return (0);
3379         }
3380         bp->b_edev = md_dev64_to_dev(select_read_unit(un, blkno,
3381             count, &cando, 0, NULL, cs));
3382         bp->b_bcount = ldbtob(cando);
3383         if (count != cando)
3384                 return (cando);
3385         return (0);
3386 }
3387 
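     /*
      * write_after_read:
      * ----------------
      * ps_call handler invoked once a resync (or dirty region) read has
      * completed. The parent request is re-issued as the corresponding
      * write (MD_STR_WAR) through mirror_write_strategy(); errors divert
      * to mirror_error().
      */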
3388 static void
3389 write_after_read(md_mps_t *ps)
3390 {
3391         struct buf      *pb;
3392         int             flags;
3393 
3394         if (ps->ps_flags & MD_MPS_ERROR) {
3395                 mirror_error(ps);
3396                 return;
3397         }
3398 
3399         pb = ps->ps_bp;
3400         md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
3401         ps->ps_call = NULL;
3402         ps->ps_flags |= MD_MPS_WRITE_AFTER_READ;
3403         flags = MD_STR_NOTTOP | MD_STR_WAR;
3404         if (ps->ps_flags & MD_MPS_MAPPED)
3405                 flags |= MD_STR_MAPPED;
3406         if (ps->ps_flags & MD_MPS_NOBLOCK)
3407                 flags |= MD_NOBLOCK;
3408         if (ps->ps_flags & MD_MPS_DIRTY_RD)
3409                 flags |= MD_STR_DIRTY_RD;
3410         (void) mirror_write_strategy(pb, flags, ps);
3411 }
3412 
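     /*
      * continue_serial:
      * ---------------
      * ps_call handler used when the mirror write option is WR_SERIAL.
      * Allocates a fresh child buffer and issues the write to the next
      * submirror in sequence.
      */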
3413 static void
3414 continue_serial(md_mps_t *ps)
3415 {
3416         md_mcs_t        *cs;
3417         buf_t           *cb;
3418         mm_unit_t       *un;
3419         int             flags;
3420 
3421         un = ps->ps_un;
3422         cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
3423         mirror_child_init(cs);
3424         cb = &cs->cs_buf;
3425         ps->ps_call = NULL;
3426         ps->ps_frags = 1;
3427         (void) mirror_map_write(un, cs, ps, 0);
3428         flags = MD_STR_NOTTOP;
3429         if (ps->ps_flags & MD_MPS_MAPPED)
3430                 flags |= MD_STR_MAPPED;
3431         md_call_strategy(cb, flags, NULL);
3432 }
3433 
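     /*
      * mirror_map_write:
      * ----------------
      * Clone the parent buffer onto the next writable submirror, skipping
      * the label area for a write-after-read on a labeled metadevice and
      * setting B_FAILFAST where the submirror configuration allows it.
      * Returns 1 if further submirrors remain to be issued in parallel,
      * 0 when no more child writes are needed from the caller (the last
      * submirror, or serial writes continued via continue_serial), and -1
      * if the request must be aborted because it lies wholly within the
      * label.
      */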
3434 static int
3435 mirror_map_write(mm_unit_t *un, md_mcs_t *cs, md_mps_t *ps, int war)
3436 {
3437         int i;
3438         dev_t           dev;    /* needed for bioclone, so not md_dev64_t */
3439         buf_t           *cb;
3440         buf_t           *pb;
3441         diskaddr_t      blkno;
3442         size_t          bcount;
3443         off_t           offset;
3444 
3445         pb = ps->ps_bp;
3446         cb = &cs->cs_buf;
3447         cs->cs_ps = ps;
3448 
3449         i = md_find_nth_unit(ps->ps_writable_sm, ps->ps_current_sm);
3450 
3451         dev = md_dev64_to_dev(un->un_sm[i].sm_dev);
3452 
3453         blkno = pb->b_lblkno;
3454         bcount = pb->b_bcount;
3455         offset = 0;
3456         if (war && (blkno == 0) && (un->c.un_flag & MD_LABELED)) {
3457                 blkno = DK_LABEL_LOC + 1;
3458                 /*
3459                  * This handles the case where we're requesting
3460                  * a write to block 0 on a label partition
3461                  * and the request size was smaller than the
3462                  * size of the label.  If this is the case
3463                  * then we'll return -1.  Failure to do so will
3464                  * either cause the calling thread to hang due to
3465                  * an ssd bug, or worse if the bcount were allowed
3466                  * to go negative (ie large).
3467                  */
3468                 if (bcount <= DEV_BSIZE*(DK_LABEL_LOC + 1))
3469                         return (-1);
3470                 bcount -= (DEV_BSIZE*(DK_LABEL_LOC + 1));
3471                 offset = (DEV_BSIZE*(DK_LABEL_LOC + 1));
3472         }
3473 
3474         cb = md_bioclone(pb, offset, bcount, dev, blkno, mirror_done,
3475             cb, KM_NOSLEEP);
3476         if (war)
3477                 cb->b_flags = (cb->b_flags & ~B_READ) | B_WRITE;
3478 
3479         /*
3480          * If the submirror is in the erred state, check if any component is
3481          * in the Last Erred state.  If so, we don't want to use the B_FAILFAST
3482          * flag on the IO.
3483          *
3484          * Provide a fast path for the non-erred case (which should be the
3485          * normal case).
3486          */
3487         if (un->un_sm[i].sm_flags & MD_SM_FAILFAST) {
3488                 if (un->un_sm[i].sm_state & SMS_COMP_ERRED) {
3489                         mm_submirror_t          *sm;
3490                         mm_submirror_ic_t       *smic;
3491                         int                     ci;
3492                         int                     compcnt;
3493 
3494                         sm = &un->un_sm[i];
3495                         smic = &un->un_smic[i];
3496 
3497                         compcnt = (*(smic->sm_get_component_count))
3498                             (sm->sm_dev, un);
3499                         for (ci = 0; ci < compcnt; ci++) {
3500                                 md_m_shared_t   *shared;
3501 
3502                                 shared = (md_m_shared_t *)
3503                                     (*(smic->sm_shared_by_indx))(sm->sm_dev,
3504                                     sm, ci);
3505 
3506                                 if (shared->ms_state == CS_LAST_ERRED)
3507                                         break;
3508                         }
3509                         if (ci >= compcnt)
3510                                 cb->b_flags |= B_FAILFAST;
3511 
3512                 } else {
3513                         cb->b_flags |= B_FAILFAST;
3514                 }
3515         }
3516 
3517         ps->ps_current_sm++;
3518         if (ps->ps_current_sm != ps->ps_active_cnt) {
3519                 if (un->un_write_option == WR_SERIAL) {
3520                         ps->ps_call = continue_serial;
3521                         return (0);
3522                 }
3523                 return (1);
3524         }
3525         return (0);
3526 }
3527 
3528 /*
3529  * directed_read_done:
3530  * ------------------
3531  * Completion routine called when a DMR request has been returned from the
3532  * underlying driver. Wake-up the original ioctl() and return the data to
3533  * the user.
3534  */
3535 static void
3536 directed_read_done(md_mps_t *ps)
3537 {
3538         mm_unit_t       *un;
3539         mdi_unit_t      *ui;
3540 
3541         un = ps->ps_un;
3542         ui = ps->ps_ui;
3543 
3544         md_unit_readerexit(ui);
3545         md_kstat_done(ui, ps->ps_bp, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
3546         ps->ps_call = NULL;
3547 
3548         mutex_enter(&un->un_dmr_mx);
3549         cv_signal(&un->un_dmr_cv);
3550         mutex_exit(&un->un_dmr_mx);
3551 
3552         /* release the parent structure */
3553         kmem_cache_free(mirror_parent_cache, ps);
3554 }
3555 
3556 /*
3557  * daemon_io:
3558  * ------------
3559  * Called to issue a mirror_write_strategy() or mirror_read_strategy()
3560  * call from a blockable context. NOTE: no mutex can be held on entry to
3561  * this routine.
3562  */
3563 static void
3564 daemon_io(daemon_queue_t *dq)
3565 {
3566         md_mps_t        *ps = (md_mps_t *)dq;
3567         int             flag = MD_STR_NOTTOP;
3568         buf_t           *pb = ps->ps_bp;
3569 
3570         if (ps->ps_flags & MD_MPS_MAPPED)
3571                 flag |= MD_STR_MAPPED;
3572         if (ps->ps_flags & MD_MPS_WOW)
3573                 flag |= MD_STR_WOW;
3574         if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)
3575                 flag |= MD_STR_WAR;
3576         if (ps->ps_flags & MD_MPS_ABR)
3577                 flag |= MD_STR_ABR;
3578         if (ps->ps_flags & MD_MPS_BLOCKABLE_IO)
3579                 flag |= MD_STR_BLOCK_OK;
3580 
3581         /*
3582          * If this is a resync read, ie MD_STR_DIRTY_RD not set, set
3583          * MD_STR_WAR before calling mirror_read_strategy
3584          */
3585         if (pb->b_flags & B_READ) {
3586                 if (!(ps->ps_flags & MD_MPS_DIRTY_RD))
3587                         flag |= MD_STR_WAR;
3588                 mirror_read_strategy(pb, flag, ps);
3589         } else
3590                 mirror_write_strategy(pb, flag, ps);
3591 }
3592 
3593 /*
3594  * update_resync:
3595  * -------------
3596  * Called to update the in-core version of the resync record with the latest
3597  * version that was committed to disk when the previous mirror owner
3598  * relinquished ownership. This call is likely to block as we must hold-off
3599  * any current resync processing that may be occurring.
3600  * On completion of the resync record update we issue the mirror_write_strategy
3601  * call to complete the i/o that first started this sequence. To remove a race
3602  * condition between a new write() request which is submitted and the resync
3603  * record update we acquire the writerlock. This will hold off all i/o to the
3604  * mirror until the resync update has completed.
3605  * NOTE: no mutex can be held on entry to this routine
3606  */
3607 static void
3608 update_resync(daemon_queue_t *dq)
3609 {
3610         md_mps_t        *ps = (md_mps_t *)dq;
3611         buf_t           *pb = ps->ps_bp;
3612         mdi_unit_t      *ui = ps->ps_ui;
3613         mm_unit_t       *un = MD_UNIT(ui->ui_link.ln_id);
3614         set_t           setno;
3615         int             restart_resync;
3616 
3617         mutex_enter(&un->un_rrp_inflight_mx);
3618         (void) md_unit_writerlock(ui);
3619         ps->ps_un = un;
3620         setno = MD_MIN2SET(getminor(pb->b_edev));
3621         if (mddb_reread_rr(setno, un->un_rr_dirty_recid) == 0) {
3622                 /*
3623                  * Synchronize our in-core view of what regions need to be
3624                  * resync'd with the on-disk version.
3625                  */
3626                 mirror_copy_rr(howmany(un->un_rrd_num, NBBY), un->un_resync_bm,
3627                     un->un_dirty_bm);
3628 
3629                 /* Region dirty map is now up to date */
3630         }
3631         restart_resync = (un->un_rs_thread_flags & MD_RI_BLOCK_OWNER) ? 1 : 0;
3632         md_unit_writerexit(ui);
3633         mutex_exit(&un->un_rrp_inflight_mx);
3634 
3635         /* Restart the resync thread if it was previously blocked */
3636         if (restart_resync) {
3637                 mutex_enter(&un->un_rs_thread_mx);
3638                 un->un_rs_thread_flags &= ~MD_RI_BLOCK_OWNER;
3639                 cv_signal(&un->un_rs_thread_cv);
3640                 mutex_exit(&un->un_rs_thread_mx);
3641         }
3642         /* Continue with original deferred i/o */
3643         daemon_io(dq);
3644 }
3645 
3646 /*
3647  * owner_timeout:
3648  * -------------
3649  * Called if the original mdmn_ksend_message() failed and the request is to be
3650  * retried. Reattempt the original ownership change.
3651  *
3652  * NOTE: called at interrupt context (see timeout(9f)).
3653  */
3654 static void
3655 owner_timeout(void *arg)
3656 {
3657         daemon_queue_t  *dq = (daemon_queue_t *)arg;
3658 
3659         daemon_request(&md_mirror_daemon, become_owner, dq, REQ_OLD);
3660 }
3661 
3662 /*
3663  * become_owner:
3664  * ------------
3665  * Called to issue RPC request to become the owner of the mirror
3666  * associated with this i/o request. We assume that the ownership request
3667  * is synchronous, so if it succeeds we will issue the request via
3668  * mirror_write_strategy().
3669  * If multiple i/o's are outstanding we will be called from the mirror_daemon
3670  * service thread.
3671  * NOTE: no mutex should be held on entry to this routine.
3672  */
3673 static void
3674 become_owner(daemon_queue_t *dq)
3675 {
3676         md_mps_t        *ps = (md_mps_t *)dq;
3677         mm_unit_t       *un = ps->ps_un;
3678         buf_t           *pb = ps->ps_bp;
3679         set_t           setno;
3680         md_mn_kresult_t *kres;
3681         int             msg_flags = md_mirror_msg_flags;
3682         md_mps_t        *ps1;
3683 
3684         ASSERT(dq->dq_next == NULL && dq->dq_prev == NULL);
3685 
3686         /*
3687          * If we're already the mirror owner we do not need to send a message
3688          * but can simply process the i/o request immediately.
3689          * If we've already sent the request to become owner we requeue the
3690          * request as we're waiting for the synchronous ownership message to
3691          * be processed.
3692          */
3693         if (MD_MN_MIRROR_OWNER(un)) {
3694                 /*
3695                  * As the strategy() call will potentially block we need to
3696                  * punt this to a separate thread and complete this request
3697                  * as quickly as possible. Note: if we're a read request
3698                  * this must be a resync, we cannot afford to be queued
3699                  * behind any intervening i/o requests. In this case we put the
3700                  * request on the md_mirror_rs_daemon queue.
3701                  */
3702                 if (pb->b_flags & B_READ) {
3703                         daemon_request(&md_mirror_rs_daemon, daemon_io, dq,
3704                             REQ_OLD);
3705                 } else {
3706                         daemon_request(&md_mirror_io_daemon, daemon_io, dq,
3707                             REQ_OLD);
3708                 }
3709         } else {
3710                 mutex_enter(&un->un_owner_mx);
3711                 if ((un->un_owner_state & MM_MN_OWNER_SENT) == 0) {
3712                         md_mn_req_owner_t       *msg;
3713                         int                     rval = 0;
3714 
3715                         /*
3716                          * Check to see that we haven't exceeded the maximum
3717                          * retry count. If we have we fail the i/o as the
3718                          * comms mechanism has become wedged beyond recovery.
3719                          */
3720                         if (dq->qlen++ >= MD_OWNER_RETRIES) {
3721                                 mutex_exit(&un->un_owner_mx);
3722                                 cmn_err(CE_WARN,
3723                                     "md_mirror: Request exhausted ownership "
3724                                     "retry limit of %d attempts", dq->qlen);
3725                                 pb->b_error = EIO;
3726                                 pb->b_flags |= B_ERROR;
3727                                 pb->b_resid = pb->b_bcount;
3728                                 kmem_cache_free(mirror_parent_cache, ps);
3729                                 md_biodone(pb);
3730                                 return;
3731                         }
3732 
3733                         /*
3734                          * Issue request to change ownership. The call is
3735                          * synchronous so when it returns we can complete the
3736                          * i/o (if successful), or enqueue it again so that
3737                          * the operation will be retried.
3738                          */
3739                         un->un_owner_state |= MM_MN_OWNER_SENT;
3740                         mutex_exit(&un->un_owner_mx);
3741 
3742                         msg = kmem_zalloc(sizeof (md_mn_req_owner_t), KM_SLEEP);
3743                         setno = MD_MIN2SET(getminor(pb->b_edev));
3744                         msg->mnum = MD_SID(un);
3745                         msg->owner = md_mn_mynode_id;
3746                         msg_flags |= MD_MSGF_NO_LOG;
3747                         /*
3748                          * If this IO is triggered by updating a watermark,
3749                          * it might be issued by the creation of a softpartition
3750                          * while the commd subsystem is suspended.
3751                          * We don't want this message to block.
3752                          */
3753                         if (ps->ps_flags & MD_MPS_WMUPDATE) {
3754                                 msg_flags |= MD_MSGF_OVERRIDE_SUSPEND;
3755                         }
3756 
3757                         kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
3758                         rval = mdmn_ksend_message(setno,
3759                             MD_MN_MSG_REQUIRE_OWNER, msg_flags, 0,
3760                             (char *)msg, sizeof (md_mn_req_owner_t), kres);
3761 
3762                         kmem_free(msg, sizeof (md_mn_req_owner_t));
3763 
3764                         if (MDMN_KSEND_MSG_OK(rval, kres)) {
3765                                 dq->qlen = 0;
3766                                 /*
3767                                  * Successfully changed owner, reread the
3768                                  * resync record so that we have a valid idea of
3769                                  * any previously committed incomplete write()s.
3770                                  * NOTE: As we need to acquire the resync mutex
3771                                  * this may block, so we defer it to a separate
3772                                  * thread handler. This makes us (effectively)
3773                                  * non-blocking once the ownership message
3774                                  * handling has completed.
3775                                  */
3776                                 mutex_enter(&un->un_owner_mx);
3777                                 if (un->un_owner_state & MM_MN_BECOME_OWNER) {
3778                                         un->un_mirror_owner = md_mn_mynode_id;
3779                                         /* Sets owner of un_rr_dirty record */
3780                                         if (un->un_rr_dirty_recid)
3781                                                 (void) mddb_setowner(
3782                                                     un->un_rr_dirty_recid,
3783                                                     md_mn_mynode_id);
3784                                         un->un_owner_state &=
3785                                             ~MM_MN_BECOME_OWNER;
3786                                         /*
3787                                          * Release the block on the current
3788                                          * resync region if it is blocked
3789                                          */
3790                                         ps1 = un->un_rs_prev_overlap;
3791                                         if ((ps1 != NULL) &&
3792                                             (ps1->ps_flags & MD_MPS_ON_OVERLAP))
3793                                                 mirror_overlap_tree_remove(ps1);
3794                                         mutex_exit(&un->un_owner_mx);
3795 
3796                                         /*
3797                                          * If we're a read, this must be a
3798                                          * resync request, issue
3799                                          * the i/o request on the
3800                                          * md_mirror_rs_daemon queue. This is
3801                                          * to avoid a deadlock between the
3802                                          * resync_unit thread and
3803                                          * subsequent i/o requests that may
3804                                          * block on the resync region.
3805                                          */
3806                                         if (pb->b_flags & B_READ) {
3807                                                 daemon_request(
3808                                                     &md_mirror_rs_daemon,
3809                                                     update_resync, dq, REQ_OLD);
3810                                         } else {
3811                                                 daemon_request(
3812                                                     &md_mirror_io_daemon,
3813                                                     update_resync, dq, REQ_OLD);
3814                                         }
3815                                         kmem_free(kres,
3816                                             sizeof (md_mn_kresult_t));
3817                                         return;
3818                                 } else {
3819                                         /*
3820                                          * Some other node has beaten us to
3821                                          * obtain ownership. We need to
3822                                          * reschedule our ownership request
3823                                          */
3824                                         mutex_exit(&un->un_owner_mx);
3825                                 }
3826                         } else {
3827                                 mdmn_ksend_show_error(rval, kres,
3828                                     "MD_MN_MSG_REQUIRE_OWNER");
3829                                 /*
3830                                  * Message transport failure is handled by the
3831                                  * comms layer. If the ownership change request
3832                                  * does not succeed we need to flag the error to
3833                                  * the initiator of the i/o. This is handled by
3834                                  * the retry logic above. As the request failed
3835                                  * we do not know _who_ the owner of the mirror
3836                                  * currently is. We reset our idea of the owner
3837                                  * to None so that any further write()s will
3838                                  * attempt to become the owner again. This stops
3839                                  * multiple nodes writing to the same mirror
3840                                  * simultaneously.
3841                                  */
3842                                 mutex_enter(&un->un_owner_mx);
3843                                 un->un_owner_state &=
3844                                     ~(MM_MN_OWNER_SENT|MM_MN_BECOME_OWNER);
3845                                 un->un_mirror_owner = MD_MN_MIRROR_UNOWNED;
3846                                 mutex_exit(&un->un_owner_mx);
3847                         }
3848                         kmem_free(kres, sizeof (md_mn_kresult_t));
3849                 } else
3850                         mutex_exit(&un->un_owner_mx);
3851 
3852                 /*
3853                  * Re-enqueue this request on the deferred i/o list. Delay the
3854                  * request for md_mirror_owner_to usecs to stop thrashing.
3855                  */
3856                 (void) timeout(owner_timeout, dq,
3857                     drv_usectohz(md_mirror_owner_to));
3858         }
3859 }
3860 
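     /*
      * mirror_write_strategy:
      * ---------------------
      * Main entry point for mirror writes. Handles write suspension during
      * MN state changes, allocation of the parent save structure, resync
      * region marking, overlap handling, multi-node mirror ownership and
      * write-on-write detection before fanning the request out to a child
      * buffer per submirror.
      */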
3861 static void
3862 mirror_write_strategy(buf_t *pb, int flag, void *private)
3863 {
3864         md_mps_t        *ps;
3865         md_mcs_t        *cs;
3866         int             more;
3867         mm_unit_t       *un;
3868         mdi_unit_t      *ui;
3869         buf_t           *cb;            /* child buf pointer */
3870         set_t           setno;
3871         int             rs_on_overlap = 0;
3872 
3873         ui = MDI_UNIT(getminor(pb->b_edev));
3874         un = (mm_unit_t *)MD_UNIT(getminor(pb->b_edev));
3875 
3876 
3877         md_kstat_waitq_enter(ui);
3878 
3879         /*
3880          * If a state change is in progress for this mirror in a MN set,
3881          * suspend all non-resync writes until the state change is complete.
3882          * The objective of this suspend is to ensure that it is not
3883          * possible for one node to read data from a submirror that another node
3884          * has not written to because of the state change. Therefore we
3885          * suspend all writes until the state change has been made. As it is
3886          * not possible to read from the target of a resync, there is no need
3887          * to suspend resync writes.
3888          * Note that we only block here if the caller can handle a busy-wait.
3889          * The MD_STR_BLOCK_OK flag is set for daemon_io originated i/o only.
3890          */
3891 
3892         if (!(flag & MD_STR_WAR)) {
3893                 if (flag & MD_STR_BLOCK_OK) {
3894                         mutex_enter(&un->un_suspend_wr_mx);
3895                         while (un->un_suspend_wr_flag) {
3896                                 cv_wait(&un->un_suspend_wr_cv,
3897                                     &un->un_suspend_wr_mx);
3898                         }
3899                         mutex_exit(&un->un_suspend_wr_mx);
3900                 }
3901                 (void) md_unit_readerlock(ui);
3902         }
3903 
3904         if (!(flag & MD_STR_NOTTOP)) {
3905                 if (md_checkbuf(ui, (md_unit_t *)un, pb)) {
3906                         md_kstat_waitq_exit(ui);
3907                         return;
3908                 }
3909         }
3910 
3911         setno = MD_MIN2SET(getminor(pb->b_edev));
3912 
3913         /* If an ABR write has been requested, set MD_STR_ABR flag */
3914         if (MD_MNSET_SETNO(setno) && (pb->b_flags & B_ABRWRITE))
3915                 flag |= MD_STR_ABR;
3916 
3917         if (private == NULL) {
3918                 ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS);
3919                 mirror_parent_init(ps);
3920         } else {
3921                 ps = private;
3922                 private = NULL;
3923         }
3924         if (flag & MD_STR_MAPPED)
3925                 ps->ps_flags |= MD_MPS_MAPPED;
3926 
3927         if (flag & MD_STR_WOW)
3928                 ps->ps_flags |= MD_MPS_WOW;
3929 
3930         if (flag & MD_STR_ABR)
3931                 ps->ps_flags |= MD_MPS_ABR;
3932 
3933         if (flag & MD_STR_WMUPDATE)
3934                 ps->ps_flags |= MD_MPS_WMUPDATE;
3935 
3936         /*
3937          * Save essential information from the original buf header
3938          * in the md_save structure.
3939          */
3940         ps->ps_un = un;
3941         ps->ps_ui = ui;
3942         ps->ps_bp = pb;
3943         ps->ps_addr = pb->b_un.b_addr;
3944         ps->ps_firstblk = pb->b_lblkno;
3945         ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1;
3946         ps->ps_changecnt = un->un_changecnt;
3947 
3948         /*
3949          * Check for suspended writes here. This is where we can defer the
3950          * write request to the daemon_io queue which will then call us with
3951          * the MD_STR_BLOCK_OK flag set and we'll busy-wait (if necessary) at
3952          * the top of this routine.
3953          */
3954         if (!(flag & MD_STR_WAR) && !(flag & MD_STR_BLOCK_OK)) {
3955                 mutex_enter(&un->un_suspend_wr_mx);
3956                 if (un->un_suspend_wr_flag) {
3957                         ps->ps_flags |= MD_MPS_BLOCKABLE_IO;
3958                         mutex_exit(&un->un_suspend_wr_mx);
3959                         md_unit_readerexit(ui);
3960                         daemon_request(&md_mirror_daemon, daemon_io,
3961                             (daemon_queue_t *)ps, REQ_OLD);
3962                         return;
3963                 }
3964                 mutex_exit(&un->un_suspend_wr_mx);
3965         }
3966 
3967         /*
3968          * If not MN owner and this is an ABR write, make sure the current
3969          * resync region is in the overlaps tree
3970          */
3971         mutex_enter(&un->un_owner_mx);
3972         if (MD_MNSET_SETNO(setno) && (!(MD_MN_MIRROR_OWNER(un))) &&
3973             ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) {
3974                 md_mps_t        *ps1;
3975                 /* Block the current resync region, if not already blocked */
3976                 ps1 = un->un_rs_prev_overlap;
3977 
3978                 if ((ps1 != NULL) && ((ps1->ps_firstblk != 0) ||
3979                     (ps1->ps_lastblk != 0))) {
3980                         /* Drop locks to avoid deadlock */
3981                         mutex_exit(&un->un_owner_mx);
3982                         md_unit_readerexit(ui);
3983                         wait_for_overlaps(ps1, MD_OVERLAP_ALLOW_REPEAT);
3984                         rs_on_overlap = 1;
3985                         (void) md_unit_readerlock(ui);
3986                         mutex_enter(&un->un_owner_mx);
3987                         /*
3988                          * Check to see if we have obtained ownership
3989                          * while waiting for overlaps. If we have, remove
3990                          * the resync_region entry from the overlap tree
3991                          */
3992                         if (MD_MN_MIRROR_OWNER(un) &&
3993                             (ps1->ps_flags & MD_MPS_ON_OVERLAP)) {
3994                                 mirror_overlap_tree_remove(ps1);
3995                                 rs_on_overlap = 0;
3996                         }
3997                 }
3998         }
3999         mutex_exit(&un->un_owner_mx);
4000 
4001 
4002         /*
4003          * The following keeps a write-after-read from writing to the
4004          * source in the case where it all came from one place.
4005          */
4006         if (flag & MD_STR_WAR) {
4007                 int     abort_write = 0;
4008                 /*
4009                  * We are performing a write-after-read. This is either the
4010                  * result of a resync read or of a read in a dirty resync
4011                  * region when the optimized resync is not complete. If this
4012                  * is a resync-generated i/o in an MN set and the current
4013                  * block is not in the current resync region, terminate the
4014                  * write as another node must have completed this resync
4015                  * region.
4016                  */
4017                 if ((MD_MNSET_SETNO(MD_UN2SET(un))) &&
4018                     (!(flag & MD_STR_DIRTY_RD))) {
4019                         if (!IN_RESYNC_REGION(un, ps))
4020                                 abort_write = 1;
4021                 }
4022                 if ((select_write_after_read_units(un, ps) == 0) ||
4023                     (abort_write)) {
4024 #ifdef DEBUG
4025                         if (mirror_debug_flag)
4026                                 printf("Abort resync write on %x, block %lld\n",
4027                                     MD_SID(un), ps->ps_firstblk);
4028 #endif
4029                         if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4030                                 mirror_overlap_tree_remove(ps);
4031                         kmem_cache_free(mirror_parent_cache, ps);
4032                         md_kstat_waitq_exit(ui);
4033                         md_unit_readerexit(ui);
4034                         md_biodone(pb);
4035                         return;
4036                 }
4037         } else {
4038                 select_write_units(un, ps);
4039 
4040                 /* Drop readerlock to avoid deadlock */
4041                 md_unit_readerexit(ui);
4042                 wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT);
4043                 un = md_unit_readerlock(ui);
4044                 /*
4045                  * For a MN set with an ABR write, if we are now the
4046                  * owner and we have a resync region in the overlap
4047                  * tree, remove the entry from overlaps and retry the write.
4048                  */
4049 
4050                 if (MD_MNSET_SETNO(setno) &&
4051                     ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) {
4052                         mutex_enter(&un->un_owner_mx);
4053                         if (((MD_MN_MIRROR_OWNER(un))) && rs_on_overlap) {
4054                                 mirror_overlap_tree_remove(ps);
4055                                 md_kstat_waitq_exit(ui);
4056                                 mutex_exit(&un->un_owner_mx);
4057                                 md_unit_readerexit(ui);
4058                                 daemon_request(&md_mirror_daemon, daemon_io,
4059                                     (daemon_queue_t *)ps, REQ_OLD);
4060                                 return;
4061                         }
4062                         mutex_exit(&un->un_owner_mx);
4063                 }
4064         }
4065 
4066         /*
4067          * For Multinode mirrors with no owner and a Resync Region (not ABR)
4068          * we need to become the mirror owner before continuing with the
4069          * write(). For ABR mirrors we check that we 'own' the resync if
4070          * we're in write-after-read mode. We do this _after_ ensuring that
4071          * there are no overlaps to ensure that once we know that we are
4072          * the owner, the readerlock will not be released until the write is
4073          * complete. As a change of ownership in a MN set requires the
4074          * writerlock, this ensures that ownership cannot be changed until
4075          * the write is complete.
4076          */
4077         if (MD_MNSET_SETNO(setno) && (!((ui->ui_tstate & MD_ABR_CAP) ||
4078             (flag & MD_STR_ABR)) || (flag & MD_STR_WAR))) {
4079                 if (MD_MN_NO_MIRROR_OWNER(un))  {
4080                         if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4081                                 mirror_overlap_tree_remove(ps);
4082                         md_kstat_waitq_exit(ui);
4083                         ASSERT(!(flag & MD_STR_WAR));
4084                         md_unit_readerexit(ui);
4085                         daemon_request(&md_mirror_daemon, become_owner,
4086                             (daemon_queue_t *)ps, REQ_OLD);
4087                         return;
4088                 }
4089         }
4090 
4091         /*
4092          * Mark resync region if mirror has a Resync Region _and_ we are not
4093          * a resync initiated write(). Don't mark region if we're flagged as
4094          * an ABR write.
4095          */
4096         if (!((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR)) &&
4097             !(flag & MD_STR_WAR)) {
4098                 if (mirror_mark_resync_region(un, ps->ps_firstblk,
4099                     ps->ps_lastblk, md_mn_mynode_id)) {
4100                         pb->b_flags |= B_ERROR;
4101                         pb->b_resid = pb->b_bcount;
4102                         if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4103                                 mirror_overlap_tree_remove(ps);
4104                         kmem_cache_free(mirror_parent_cache, ps);
4105                         md_kstat_waitq_exit(ui);
4106                         md_unit_readerexit(ui);
4107                         md_biodone(pb);
4108                         return;
4109                 }
4110         }
4111 
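             /*
              * Set up the flags that the child buffers will inherit: force
              * B_WRITE, clear B_READ, and drop B_PAGEIO for mapped requests.
              */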
4112         ps->ps_childbflags = pb->b_flags | B_WRITE;
4113         ps->ps_childbflags &= ~B_READ;
4114         if (flag & MD_STR_MAPPED)
4115                 ps->ps_childbflags &= ~B_PAGEIO;
4116 
4117         if (!(flag & MD_STR_NOTTOP) && panicstr)
4118                 /* Disable WOW and don't free ps */
4119                 ps->ps_flags |= (MD_MPS_WOW|MD_MPS_DONTFREE);
4120 
4121         md_kstat_waitq_to_runq(ui);
4122 
4123         /*
4124          * Treat Raw and Direct I/O as Write-on-Write always
4125          */
4126 
4127         if (!(md_mirror_wow_flg & WOW_DISABLE) &&
4128             (md_mirror_wow_flg & WOW_PHYS_ENABLE) &&
4129             (pb->b_flags & B_PHYS) &&
4130             !(ps->ps_flags & MD_MPS_WOW)) {
4131                 if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4132                         mirror_overlap_tree_remove(ps);
4133                 md_unit_readerexit(ui);
4134                 daemon_request(&md_mstr_daemon, handle_wow,
4135                     (daemon_queue_t *)ps, REQ_OLD);
4136                 return;
4137         }
4138 
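             /*
              * Break the parent request into child fragments: allocate a
              * child save structure per fragment and dispatch it with
              * md_call_strategy(). A non-zero 'more' means part of the
              * request is still to be mapped.
              */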
4139         ps->ps_frags = 1;
4140         do {
4141                 cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
4142                 mirror_child_init(cs);
4143                 cb = &cs->cs_buf;
4144                 more = mirror_map_write(un, cs, ps, (flag & MD_STR_WAR));
4145 
4146                 /*
4147                  * This handles the case where we're requesting
4148                  * a write to block 0 on a label partition.  (more < 0)
4149                  * means that the request size was smaller than the
4150                  * size of the label.  If so this request is done.
4151                  */
4152                 if (more < 0) {
4153                         if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4154                                 mirror_overlap_tree_remove(ps);
4155                         md_kstat_runq_exit(ui);
4156                         kmem_cache_free(mirror_child_cache, cs);
4157                         kmem_cache_free(mirror_parent_cache, ps);
4158                         md_unit_readerexit(ui);
4159                         md_biodone(pb);
4160                         return;
4161                 }
4162                 if (more) {
4163                         mutex_enter(&ps->ps_mx);
4164                         ps->ps_frags++;
4165                         mutex_exit(&ps->ps_mx);
4166                 }
4167                 md_call_strategy(cb, flag, private);
4168         } while (more);
4169 
4170         if (!(flag & MD_STR_NOTTOP) && panicstr) {
4171                 while (!(ps->ps_flags & MD_MPS_DONE)) {
4172                         md_daemon(1, &md_done_daemon);
4173                         drv_usecwait(10);
4174                 }
4175                 kmem_cache_free(mirror_parent_cache, ps);
4176         }
4177 }
4178 
4179 static void
4180 mirror_read_strategy(buf_t *pb, int flag, void *private)
4181 {
4182         md_mps_t        *ps;
4183         md_mcs_t        *cs;
4184         size_t          more;
4185         mm_unit_t       *un;
4186         mdi_unit_t      *ui;
4187         size_t          current_count;
4188         diskaddr_t      current_blkno;
4189         off_t           current_offset;
4190         buf_t           *cb;            /* child buf pointer */
4191         set_t           setno;
4192 
4193         ui = MDI_UNIT(getminor(pb->b_edev));
4194 
4195         md_kstat_waitq_enter(ui);
4196 
4197         un = (mm_unit_t *)md_unit_readerlock(ui);
4198 
4199         if (!(flag & MD_STR_NOTTOP)) {
4200                 if (md_checkbuf(ui, (md_unit_t *)un, pb)) {
4201                         md_kstat_waitq_exit(ui);
4202                         return;
4203                 }
4204         }
4205 
4206         if (private == NULL) {
4207                 ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS);
4208                 mirror_parent_init(ps);
4209         } else {
4210                 ps = private;
4211                 private = NULL;
4212         }
4213 
4214         if (flag & MD_STR_MAPPED)
4215                 ps->ps_flags |= MD_MPS_MAPPED;
4216         if (flag & MD_NOBLOCK)
4217                 ps->ps_flags |= MD_MPS_NOBLOCK;
4218         if (flag & MD_STR_WMUPDATE)
4219                 ps->ps_flags |= MD_MPS_WMUPDATE;
4220 
4221         /*
4222          * Check to see if this is a DMR driven read. If so we need to use the
4223          * specified side (in un->un_dmr_last_read) for the source of the data.
4224          */
4225         if (flag & MD_STR_DMR)
4226                 ps->ps_flags |= MD_MPS_DMR;
4227 
4228         /*
4229          * Save essential information from the original buffhdr
4230          * in the md_save structure.
4231          */
4232         ps->ps_un = un;
4233         ps->ps_ui = ui;
4234         ps->ps_bp = pb;
4235         ps->ps_addr = pb->b_un.b_addr;
4236         ps->ps_firstblk = pb->b_lblkno;
4237         ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1;
4238         ps->ps_changecnt = un->un_changecnt;
4239 
4240         current_count = btodb(pb->b_bcount);
4241         current_blkno = pb->b_lblkno;
4242         current_offset = 0;
4243 
4244         /*
4245          * If flag has MD_STR_WAR set, the read was issued by a resync
4246          * thread, which may or may not be performing an optimized resync.
4247          *
4248          * If MD_UN_OPT_NOT_DONE is set this means that the optimized resync
4249          * code has not completed; either a resync has not started since snarf,
4250          * or there is an optimized resync in progress.
4251          *
4252          * We need to generate a write after this read in the following two
4253          * cases,
4254          *
4255          * 1. Any Resync-Generated read
4256          *
4257          * 2. Any read to a DIRTY REGION if there is an optimized resync
4258          *    pending or in progress.
4259          *
4260          * The write after read is done in these cases to ensure that all sides
4261          * of the mirror are in sync with the read data and that it is not
4262          * possible for an application to read the same block multiple times
4263          * and get different data.
4264          *
4265          * This would be possible if the block was in a dirty region.
4266          *
4267          * If we're performing a directed read we don't write the data out as
4268          * the application is responsible for restoring the mirror to a known
4269          * state.
4270          */
4271         if (((MD_STATUS(un) & MD_UN_OPT_NOT_DONE) || (flag & MD_STR_WAR)) &&
4272             !(flag & MD_STR_DMR)) {
4273                 size_t  start_rr, i, end_rr;
4274                 int     region_dirty = 1;
4275 
4276                 /*
4277                  * We enter here under three circumstances,
4278                  *
4279                  * MD_UN_OPT_NOT_DONE   MD_STR_WAR
4280                  * 0                    1
4281                  * 1                    0
4282                  * 1                    1
4283                  *
4284                  * We only need to explicitly check for dirty regions in
4285                  * the second case, since whenever MD_STR_WAR is set we
4286                  * always do the write-after-read.
4287                  */
4288                 if (!(flag & MD_STR_WAR)) {
4289                         BLK_TO_RR(end_rr, ps->ps_lastblk, un);
4290                         BLK_TO_RR(start_rr, ps->ps_firstblk, un);
4291 
4292                         for (i = start_rr; i <= end_rr; i++)
4293                                 if ((region_dirty = IS_KEEPDIRTY(i, un)) != 0)
4294                                         break;
4295                 }
4296 
4297                 if ((region_dirty) &&
4298                     !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) {
4299                         ps->ps_call = write_after_read;
4300                         /*
4301                          * Mark this as a RESYNC_READ in ps_flags.
4302                          * This is used if the read fails during a
4303                          * resync of a 3-way mirror to ensure that
4304                          * the retried read to the remaining
4305                          * good submirror has MD_STR_WAR set. This
4306                          * is needed to ensure that the resync write
4307                          * (write-after-read) takes place.
4308                          */
4309                         ps->ps_flags |= MD_MPS_RESYNC_READ;
4310 
4311                         /*
4312                          * If MD_STR_FLAG_ERR is set in the flags we
4313                          * set MD_MPS_FLAG_ERROR so that an error on the resync
4314                          * write (issued by write_after_read) will be flagged
4315                          * to the biowait'ing resync thread. This allows us to
4316                          * avoid issuing further resync requests to a device
4317                          * that has had a write failure.
4318                          */
4319                         if (flag & MD_STR_FLAG_ERR)
4320                                 ps->ps_flags |= MD_MPS_FLAG_ERROR;
4321 
4322                         setno = MD_UN2SET(un);
4323                         /*
4324                          * Drop the readerlock to avoid
4325                          * deadlock
4326                          */
4327                         md_unit_readerexit(ui);
4328                         wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT);
4329                         un = md_unit_readerlock(ui);
4330                         /*
4331                          * Ensure that we are owner
4332                          */
4333                         if (MD_MNSET_SETNO(setno)) {
4334                                 /*
4335                                  * For a non-resync read that requires a
4336                                  * write-after-read to be done, set a flag
4337                                  * in the parent structure, so that the
4338                                  * write_strategy routine can omit the
4339                                  * test that the write is still within the
4340                                  * resync region
4341                                  */
4342                                 if (!(flag & MD_STR_WAR))
4343                                         ps->ps_flags |= MD_MPS_DIRTY_RD;
4344 
4345                                 /*
4346                                  * Before reading the buffer, see if
4347                                  * there is an owner.
4348                                  */
4349                                 if (MD_MN_NO_MIRROR_OWNER(un))  {
4350                                         ps->ps_call = NULL;
4351                                         mirror_overlap_tree_remove(ps);
4352                                         md_kstat_waitq_exit(ui);
4353                                         md_unit_readerexit(ui);
4354                                         daemon_request(
4355                                             &md_mirror_daemon,
4356                                             become_owner,
4357                                             (daemon_queue_t *)ps,
4358                                             REQ_OLD);
4359                                         return;
4360                                 }
4361                                 /*
4362                                  * For a resync read, check to see if I/O is
4363                                  * outside of the current resync region, or
4364                                  * the resync has finished. If so
4365                                  * just terminate the I/O
4366                                  */
4367                                 if ((flag & MD_STR_WAR) &&
4368                                     (!(un->c.un_status & MD_UN_WAR) ||
4369                                     (!IN_RESYNC_REGION(un, ps)))) {
4370 #ifdef DEBUG
4371                                         if (mirror_debug_flag)
4372                                                 printf("Abort resync read "
4373                                                     "%x: %lld\n",
4374                                                     MD_SID(un),
4375                                                     ps->ps_firstblk);
4376 #endif
4377                                         mirror_overlap_tree_remove(ps);
4378                                         kmem_cache_free(mirror_parent_cache,
4379                                             ps);
4380                                         md_kstat_waitq_exit(ui);
4381                                         md_unit_readerexit(ui);
4382                                         md_biodone(pb);
4383                                         return;
4384                                 }
4385                         }
4386                 }
4387         }
4388 
4389         if (flag & MD_STR_DMR) {
4390                 ps->ps_call = directed_read_done;
4391         }
4392 
4393         if (!(flag & MD_STR_NOTTOP) && panicstr)
4394                 ps->ps_flags |= MD_MPS_DONTFREE;
4395 
4396         md_kstat_waitq_to_runq(ui);
4397 
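             /*
              * Clone the parent buffer into child buffers, one per fragment
              * returned by mirror_map_read(), and dispatch each with
              * md_call_strategy(). A non-zero 'more' means part of the
              * request remains to be mapped.
              */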
4398         ps->ps_frags++;
4399         do {
4400                 cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
4401                 mirror_child_init(cs);
4402                 cb = &cs->cs_buf;
4403                 cs->cs_ps = ps;
4404 
4405                 cb = md_bioclone(pb, current_offset, current_count, NODEV,
4406                     current_blkno, mirror_done, cb, KM_NOSLEEP);
4407 
4408                 more = mirror_map_read(ps, cs, current_blkno,
4409                     (u_longlong_t)current_count);
4410                 if (more) {
4411                         mutex_enter(&ps->ps_mx);
4412                         ps->ps_frags++;
4413                         mutex_exit(&ps->ps_mx);
4414                 }
4415 
4416                 /*
4417                  * Do these calculations now, so that we pick up a valid
4418                  * b_bcount from the child bp.
4419                  */
4420                 current_count -= more;
4421                 current_offset += cb->b_bcount;
4422                 current_blkno +=  more;
4423                 md_call_strategy(cb, flag, private);
4424         } while (more);
4425 
4426         if (!(flag & MD_STR_NOTTOP) && panicstr) {
4427                 while (!(ps->ps_flags & MD_MPS_DONE)) {
4428                         md_daemon(1, &md_done_daemon);
4429                         drv_usecwait(10);
4430                 }
4431                 kmem_cache_free(mirror_parent_cache, ps);
4432         }
4433 }
4434 
4435 void
4436 md_mirror_strategy(buf_t *bp, int flag, void *private)
4437 {
4438         set_t   setno = MD_MIN2SET(getminor(bp->b_edev));
4439 
4440         /*
4441          * When doing i/o to a multi-owner metadevice, check if the set is halted.
4442          * We do this check without the needed lock held, for performance
4443          * reasons.
4444          * If an IO just slips through while the set is locked via an
4445          * MD_MN_SUSPEND_SET, we don't care about it.
4446          * Only check for suspension if we are a top-level i/o request
4447          * (MD_STR_NOTTOP is cleared in 'flag').
4448          */
4449         if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) ==
4450             (MD_SET_HALTED | MD_SET_MNSET)) {
4451                 if ((flag & MD_STR_NOTTOP) == 0) {
4452                         mutex_enter(&md_mx);
4453                         /* Here we loop until the set is no longer halted */
4454                         while (md_set[setno].s_status & MD_SET_HALTED) {
4455                                 cv_wait(&md_cv, &md_mx);
4456                         }
4457                         mutex_exit(&md_mx);
4458                 }
4459         }
4460 
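             /*
              * Account this i/o against the set unless the caller has
              * already done so (MD_IO_COUNTED). A blocking request that
              * cannot be counted is failed with ENXIO.
              */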
4461         if ((flag & MD_IO_COUNTED) == 0) {
4462                 if ((flag & MD_NOBLOCK) == 0) {
4463                         if (md_inc_iocount(setno) != 0) {
4464                                 bp->b_flags |= B_ERROR;
4465                                 bp->b_error = ENXIO;
4466                                 bp->b_resid = bp->b_bcount;
4467                                 biodone(bp);
4468                                 return;
4469                         }
4470                 } else {
4471                         md_inc_iocount_noblock(setno);
4472                 }
4473         }
4474 
4475         if (bp->b_flags & B_READ)
4476                 mirror_read_strategy(bp, flag, private);
4477         else
4478                 mirror_write_strategy(bp, flag, private);
4479 }
4480 
4481 /*
4482  * mirror_directed_read:
4483  * --------------------
4484  * Entry-point for the DKIOCDMR ioctl. We issue a read to a specified sub-mirror
4485  * so that the application can determine what (if any) resync needs to be
4486  * performed. The data is copied out to the user-supplied buffer.
4487  *
4488  * Parameters:
4489  *      mdev    - dev_t for the mirror device
4490  *      vdr     - directed read parameters specifying location and submirror
4491  *                to perform the read from
4492  *      mode    - used to ddi_copyout() any resulting data from the read
4493  *
4494  * Returns:
4495  *      0       success
4496  *      !0      error code
4497  *              EINVAL - invalid request format
4498  */
4499 int
4500 mirror_directed_read(dev_t mdev, vol_directed_rd_t *vdr, int mode)
4501 {
4502         buf_t           *bp;
4503         minor_t         mnum = getminor(mdev);
4504         mdi_unit_t      *ui = MDI_UNIT(mnum);
4505         mm_unit_t       *un;
4506         mm_submirror_t  *sm;
4507         char            *sm_nm;
4508         uint_t          next_side;
4509         void            *kbuffer;
4510 
4511         if (ui == NULL)
4512                 return (ENXIO);
4513 
4514         if (!(vdr->vdr_flags & DKV_DMR_NEXT_SIDE)) {
4515                 return (EINVAL);
4516         }
4517 
4518         /* Check for aligned block access. We disallow non-aligned requests. */
4519         if (vdr->vdr_offset % DEV_BSIZE) {
4520                 return (EINVAL);
4521         }
4522 
4523         /*
4524          * Allocate kernel buffer for target of read(). If we had a reliable
4525          * (sorry functional) DDI this wouldn't be needed.
4526          */
4527         kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP);
4528         if (kbuffer == NULL) {
4529                 cmn_err(CE_WARN, "mirror_directed_read: couldn't allocate %lx"
4530                     " bytes\n", vdr->vdr_nbytes);
4531                 return (ENOMEM);
4532         }
4533 
4534         bp = getrbuf(KM_SLEEP);
4535 
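             /* Describe the requested read in the raw buf(9S) structure. */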
4536         bp->b_un.b_addr = kbuffer;
4537         bp->b_flags = B_READ;
4538         bp->b_bcount = vdr->vdr_nbytes;
4539         bp->b_lblkno = lbtodb(vdr->vdr_offset);
4540         bp->b_edev = mdev;
4541 
4542         un = md_unit_readerlock(ui);
4543 
4544         /*
4545          * If DKV_SIDE_INIT is set we need to determine the first available
4546          * side to start reading from. If it isn't set we increment to the
4547          * next readable submirror.
4548          * If there are no readable submirrors we error out with DKV_DMR_ERROR.
4549          * Note: we check for a readable submirror on completion of the i/o so
4550          * we should _always_ have one available. If this becomes unavailable
4551          * we have missed the 'DKV_DMR_DONE' opportunity. This could happen if
4552          * a metadetach is made between the completion of one DKIOCDMR ioctl
4553          * and the start of the next (i.e. a sys-admin 'accident' occurred).
4554          * The chance of this is small, but not non-existent.
4555          */
4556         if (vdr->vdr_side == DKV_SIDE_INIT) {
4557                 next_side = 0;
4558         } else {
4559                 next_side = vdr->vdr_side + 1;
4560         }
4561         while ((next_side < NMIRROR) &&
4562             !SUBMIRROR_IS_READABLE(un, next_side))
4563                 next_side++;
4564         if (next_side >= NMIRROR) {
4565                 vdr->vdr_flags |= DKV_DMR_ERROR;
4566                 freerbuf(bp);
                     kmem_free(kbuffer, vdr->vdr_nbytes);
4567                 vdr->vdr_bytesread = 0;
4568                 md_unit_readerexit(ui);
4569                 return (0);
4570         }
4571 
4572         /* Set the side to read from */
4573         un->un_dmr_last_read = next_side;
4574 
4575         md_unit_readerexit(ui);
4576 
4577         /*
4578          * Save timestamp for verification purposes. Can be read by debugger
4579          * to verify that this ioctl has been executed and to find the number
4580          * of DMR reads and the time of the last DMR read.
4581          */
4582         uniqtime(&mirror_dmr_stats.dmr_timestamp);
4583         mirror_dmr_stats.dmr_count++;
4584 
4585         /* Issue READ request and wait for completion */
4586         mirror_read_strategy(bp, MD_STR_DMR|MD_NOBLOCK|MD_STR_NOTTOP, NULL);
4587 
4588         mutex_enter(&un->un_dmr_mx);
4589         cv_wait(&un->un_dmr_cv, &un->un_dmr_mx);
4590         mutex_exit(&un->un_dmr_mx);
4591 
4592         /*
4593          * Check to see if we encountered an error during the read. If so we
4594          * can make no guarantee about any possibly returned data.
4595          */
4596         if ((bp->b_flags & B_ERROR) == 0) {
4597                 vdr->vdr_flags &= ~DKV_DMR_ERROR;
4598                 if (bp->b_resid) {
4599                         vdr->vdr_flags |= DKV_DMR_SHORT;
4600                         vdr->vdr_bytesread = vdr->vdr_nbytes - bp->b_resid;
4601                 } else {
4602                         vdr->vdr_flags |= DKV_DMR_SUCCESS;
4603                         vdr->vdr_bytesread = vdr->vdr_nbytes;
4604                 }
4605                 /* Copy the data read back out to the user supplied buffer */
4606                 if (ddi_copyout(kbuffer, vdr->vdr_data, vdr->vdr_bytesread,
4607                     mode)) {
4608                         kmem_free(kbuffer, vdr->vdr_nbytes);
                             freerbuf(bp);
4609                         return (EFAULT);
4610                 }
4611 
4612         } else {
4613                 /* Error out with DKV_DMR_ERROR */
4614                 vdr->vdr_flags |= DKV_DMR_ERROR;
4615                 vdr->vdr_flags &= ~(DKV_DMR_SUCCESS|DKV_DMR_SHORT|DKV_DMR_DONE);
4616         }
4617         /*
4618          * Update the DMR parameters with the side and name of submirror that
4619          * we have just read from (un->un_dmr_last_read)
4620          */
4621         un = md_unit_readerlock(ui);
4622 
4623         vdr->vdr_side = un->un_dmr_last_read;
4624         sm = &un->un_sm[un->un_dmr_last_read];
4625         sm_nm = md_shortname(md_getminor(sm->sm_dev));
4626 
4627         (void) strncpy(vdr->vdr_side_name, sm_nm, sizeof (vdr->vdr_side_name));
4628 
4629         /*
4630          * Determine if we've completed the read cycle. This is true iff the
4631          * next computed submirror (side) equals or exceeds NMIRROR. We cannot
4632          * use un_nsm as we need to handle a sparse array of submirrors (which
4633          * can occur if a submirror is metadetached).
4634          */
4635         next_side = un->un_dmr_last_read + 1;
4636         while ((next_side < NMIRROR) &&
4637             !SUBMIRROR_IS_READABLE(un, next_side))
4638                 next_side++;
4639         if (next_side >= NMIRROR) {
4640                 /* We've finished */
4641                 vdr->vdr_flags |= DKV_DMR_DONE;
4642         }
4643 
4644         md_unit_readerexit(ui);
4645         freerbuf(bp);
4646         kmem_free(kbuffer, vdr->vdr_nbytes);
4647 
4648         return (0);
4649 }
4650 
4651 /*
4652  * mirror_resync_message:
4653  * ---------------------
4654  * Handle the multi-node resync messages that keep all nodes within a given
4655  * disk-set in sync with their view of a mirror's resync status.
4656  *
4657  * The message types dealt with are:
4658  * MD_MN_MSG_RESYNC_STARTING    - start a resync thread for a unit
4659  * MD_MN_MSG_RESYNC_NEXT        - specifies the next region to be resynced
4660  * MD_MN_MSG_RESYNC_FINISH      - stop the resync thread for a unit
4661  * MD_MN_MSG_RESYNC_PHASE_DONE  - end of a resync phase, opt, submirror or comp
4662  *
4663  * Returns:
4664  *      0       Success
4665  *      >0      Failure error number
4666  */
4667 int
4668 mirror_resync_message(md_mn_rs_params_t *p, IOLOCK *lockp)
4669 {
4670         mdi_unit_t              *ui;
4671         mm_unit_t               *un;
4672         set_t                   setno;
4673         int                     is_ABR;
4674         int                     smi;
4675         int                     ci;
4676         sm_state_t              state;
4677         int                     broke_out;
4678         mm_submirror_t          *sm;
4679         mm_submirror_ic_t       *smic;
4680         md_m_shared_t           *shared;
4681         md_error_t              mde = mdnullerror;
4682         md_mps_t                *ps;
4683         int                     rs_active;
4684         int                     rr, rr_start, rr_end;
4685 
4686         /* Check that the given device is part of a multi-node set */
4687         setno = MD_MIN2SET(p->mnum);
4688         if (setno >= md_nsets) {
4689                 return (ENXIO);
4690         }
4691         if (!MD_MNSET_SETNO(setno)) {
4692                 return (EINVAL);
4693         }
4694 
4695         if ((un = mirror_getun(p->mnum, &p->mde, NO_LOCK, NULL)) == NULL)
4696                 return (EINVAL);
4697         if ((ui = MDI_UNIT(p->mnum)) == NULL)
4698                 return (EINVAL);
4699         is_ABR = (ui->ui_tstate & MD_ABR_CAP);
4700 
4701         /* Obtain the current resync status */
4702         (void) md_ioctl_readerlock(lockp, ui);
4703         rs_active = (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ? 1 : 0;
4704         md_ioctl_readerexit(lockp);
4705 
4706         switch ((md_mn_msgtype_t)p->msg_type) {
4707         case MD_MN_MSG_RESYNC_STARTING:
4708                 /* Start the resync thread for the mirror */
4709                 (void) mirror_resync_unit(p->mnum, NULL, &p->mde, lockp);
4710                 break;
4711 
4712         case MD_MN_MSG_RESYNC_NEXT:
4713                 /*
4714                  * We have to release any previously marked overlap regions
4715                  * so that i/o can resume. Then we need to block the region
4716                  * from [rs_start..rs_start+rs_size) so that no i/o is issued.
4717                  * Update un_rs_resync_done and un_rs_resync_2_do.
4718                  */
4719                 (void) md_ioctl_readerlock(lockp, ui);
4720                 /*
4721                  * Ignore the message if there is no active resync thread or
4722                  * if it is for a resync type that we have already completed.
4723                  * un_resync_completed is set to the last resync completed
4724                  * when processing a PHASE_DONE message.
4725                  */
4726                 if (!rs_active || (p->rs_type == un->un_resync_completed))
4727                         break;
4728                 /*
4729                  * If this message is for the same resync and is for an earlier
4730                  * resync region, just ignore it. This can only occur if this
4731                  * node has progressed on to the next resync region before
4732                  * we receive this message. This can occur if the class for
4733                  * this message is busy and the originator has to retry thus
4734                  * allowing this node to move onto the next resync_region.
4735                  */
4736                 if ((p->rs_type == un->un_rs_type) &&
4737                     (p->rs_start < un->un_resync_startbl))
4738                         break;
4739                 ps = un->un_rs_prev_overlap;
4740 
4741                 /* Allocate previous overlap reference if needed */
4742                 if (ps == NULL) {
4743                         ps = kmem_cache_alloc(mirror_parent_cache,
4744                             MD_ALLOCFLAGS);
4745                         ps->ps_un = un;
4746                         ps->ps_ui = ui;
4747                         ps->ps_firstblk = 0;
4748                         ps->ps_lastblk = 0;
4749                         ps->ps_flags = 0;
4750                         md_ioctl_readerexit(lockp);
4751                         (void) md_ioctl_writerlock(lockp, ui);
4752                         un->un_rs_prev_overlap = ps;
4753                         md_ioctl_writerexit(lockp);
4754                 } else
4755                         md_ioctl_readerexit(lockp);
4756 
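                     /*
                      * Only nodes other than the originator need to track the
                      * originator's progress here; the originator updates its
                      * own resync state as it drives the resync.
                      */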
4757                 if (p->rs_originator != md_mn_mynode_id) {
4758                         /*
4759                          * Clear our un_resync_bm for the regions completed.
4760                          * The owner (originator) will take care of itself.
4761                          */
4762                         BLK_TO_RR(rr_end, ps->ps_lastblk, un);
4763                         BLK_TO_RR(rr_start, p->rs_start, un);
4764                         if (ps->ps_lastblk && rr_end < rr_start) {
4765                                 BLK_TO_RR(rr_start, ps->ps_firstblk, un);
4766                                 mutex_enter(&un->un_resync_mx);
4767                                 /*
4768                                  * Update our resync bitmap to reflect that
4769                                  * another node has synchronized this range.
4770                                  */
4771                                 for (rr = rr_start; rr <= rr_end; rr++) {
4772                                         CLR_KEEPDIRTY(rr, un);
4773                                 }
4774                                 mutex_exit(&un->un_resync_mx);
4775                         }
4776 
4777                         /*
4778                          * On all but the originating node, first update
4779                          * the resync state, then unblock the previous
4780                          * region and block the next one. No need
4781                          * to do this if the region is already blocked.
4782                          * Update the submirror state and flags from the
4783                          * originator. This keeps the cluster in sync with
4784                          * regards to the resync status.
4785                          */
4786 
4787                         (void) md_ioctl_writerlock(lockp, ui);
4788                         un->un_rs_resync_done = p->rs_done;
4789                         un->un_rs_resync_2_do = p->rs_2_do;
4790                         un->un_rs_type = p->rs_type;
4791                         un->un_resync_startbl = p->rs_start;
4792                         md_ioctl_writerexit(lockp);
4793                         /*
4794                          * Use un_owner_mx to ensure that an ownership change
4795                          * cannot happen at the same time as this message
4796                          */
4797                         mutex_enter(&un->un_owner_mx);
4798                         if (MD_MN_MIRROR_OWNER(un)) {
4799                                 ps->ps_firstblk = p->rs_start;
4800                                 ps->ps_lastblk = ps->ps_firstblk +
4801                                     p->rs_size - 1;
4802                         } else {
4803                                 if ((ps->ps_firstblk != p->rs_start) ||
4804                                     (ps->ps_lastblk != p->rs_start +
4805                                     p->rs_size - 1)) {
4806                                         /* Remove previous overlap range */
4807                                         if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4808                                                 mirror_overlap_tree_remove(ps);
4809 
4810                                         ps->ps_firstblk = p->rs_start;
4811                                         ps->ps_lastblk = ps->ps_firstblk +
4812                                             p->rs_size - 1;
4813 
4814                                         mutex_exit(&un->un_owner_mx);
4815                                         /* Block this range from all i/o. */
4816                                         if (ps->ps_firstblk != 0 ||
4817                                             ps->ps_lastblk != 0)
4818                                                 wait_for_overlaps(ps,
4819                                                     MD_OVERLAP_ALLOW_REPEAT);
4820                                         mutex_enter(&un->un_owner_mx);
4821                                         /*
4822                                          * Check to see if we have obtained
4823                                          * ownership while waiting for
4824                                          * overlaps. If we have, remove
4825                                          * the resync_region entry from the
4826                                          * overlap tree
4827                                          */
4828                                         if (MD_MN_MIRROR_OWNER(un) &&
4829                                             (ps->ps_flags & MD_MPS_ON_OVERLAP))
4830                                                 mirror_overlap_tree_remove(ps);
4831                                 }
4832                         }
4833                         mutex_exit(&un->un_owner_mx);
4834 
4835                         /*
4836                          * If this is the first RESYNC_NEXT message (i.e.
4837                          * MD_MN_RS_FIRST_RESYNC_NEXT set in p->rs_flags),
4838                          * issue RESYNC_START NOTIFY event
4839                          */
4840                         if (p->rs_flags & MD_MN_RS_FIRST_RESYNC_NEXT) {
4841                                 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_START,
4842                                     SVM_TAG_METADEVICE, MD_UN2SET(un),
4843                                     MD_SID(un));
4844                         }
4845 
4846                         /* Ensure that our local resync thread is running */
4847                         if (un->un_rs_thread == NULL) {
4848                                 (void) mirror_resync_unit(p->mnum, NULL,
4849                                     &p->mde, lockp);
4850                         }
4851                 }
4852 
4853                 break;
4854         case MD_MN_MSG_RESYNC_FINISH:
4855                 /*
4856                  * Complete the resync by stopping the resync thread.
4857                  * Also release the previous overlap region field.
4858                  * Update the resync_progress_thread by cv_signal'ing it so
4859                  * that we mark the end of the resync as soon as possible. This
4860                  * stops an unnecessary delay should we panic after resync
4861                  * completion.
4862                  */
4863 #ifdef DEBUG
4864                 if (!rs_active) {
4865                         if (mirror_debug_flag)
4866                                 printf("RESYNC_FINISH (mnum = %x), "
4867                                     "Resync *NOT* active",
4868                                     p->mnum);
4869                 }
4870 #endif
4871 
4872                 if ((un->c.un_status & MD_UN_RESYNC_ACTIVE) &&
4873                     (p->rs_originator != md_mn_mynode_id)) {
4874                         mutex_enter(&un->un_rs_thread_mx);
4875                         un->c.un_status &= ~MD_UN_RESYNC_CANCEL;
4876                         un->un_rs_thread_flags |= MD_RI_SHUTDOWN;
4877                         un->un_rs_thread_flags &=
4878                             ~(MD_RI_BLOCK|MD_RI_BLOCK_OWNER);
4879                         cv_signal(&un->un_rs_thread_cv);
4880                         mutex_exit(&un->un_rs_thread_mx);
4881                 }
4882                 if (is_ABR) {
4883                         /* Resync finished, if ABR set owner to NULL */
4884                         mutex_enter(&un->un_owner_mx);
4885                         un->un_mirror_owner = 0;
4886                         mutex_exit(&un->un_owner_mx);
4887                 }
4888                 (void) md_ioctl_writerlock(lockp, ui);
4889                 ps = un->un_rs_prev_overlap;
4890                 if (ps != NULL) {
4891                         /* Remove previous overlap range */
4892                         if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4893                                 mirror_overlap_tree_remove(ps);
4894                         /*
4895                          * Release the overlap range reference
4896                          */
4897                         un->un_rs_prev_overlap = NULL;
4898                         kmem_cache_free(mirror_parent_cache,
4899                             ps);
4900                 }
4901                 md_ioctl_writerexit(lockp);
4902 
4903                 /* Mark the resync as complete in the metadb */
4904                 un->un_rs_resync_done = p->rs_done;
4905                 un->un_rs_resync_2_do = p->rs_2_do;
4906                 un->un_rs_type = p->rs_type;
4907                 mutex_enter(&un->un_rs_progress_mx);
4908                 cv_signal(&un->un_rs_progress_cv);
4909                 mutex_exit(&un->un_rs_progress_mx);
4910 
4911                 un = md_ioctl_writerlock(lockp, ui);
4912                 un->c.un_status &= ~MD_UN_RESYNC_ACTIVE;
4913                 /* Deal with any pending grow_unit */
4914                 if (un->c.un_status & MD_UN_GROW_PENDING) {
4915                         if ((mirror_grow_unit(un, &mde) != 0) ||
4916                             (! mdismderror(&mde, MDE_GROW_DELAYED))) {
4917                                 un->c.un_status &= ~MD_UN_GROW_PENDING;
4918                         }
4919                 }
4920                 md_ioctl_writerexit(lockp);
4921                 break;
4922 
4923         case MD_MN_MSG_RESYNC_PHASE_DONE:
4924                 /*
4925                  * A phase of the resync (optimized, component or
4926                  * submirror) is complete. Update the mirror status.
4927                  * If the flag CLEAR_OPT_NOT_DONE is set, it means that the
4928                  * mirror owner is performing a resync. If we have just
4929                  * snarfed this set, then we must clear any of the flags set
4930                  * at snarf time by unit_setup_resync().
4931                  * Note that unit_setup_resync() sets up these flags to
4932                  * indicate that an optimized resync is required. These flags
4933                  * need to be reset because if we get here, the mirror owner
4934                  * will have handled the optimized resync.
4935                  * The flags that must be cleared are MD_UN_OPT_NOT_DONE and
4936                  * MD_UN_WAR. In addition, for each submirror,
4937                  * MD_SM_RESYNC_TARGET must be cleared and SMS_OFFLINE_RESYNC
4938                  * set to SMS_OFFLINE.
4939                  */
4940 #ifdef DEBUG
4941                 if (mirror_debug_flag)
4942                         printf("phase done message received from %d, "
4943                             "mnum=%x, type=%x, flags=%x\n", p->rs_originator, p->mnum,
4944                             p->rs_type, p->rs_flags);
4945 #endif
4946                 /*
4947                  * Ignore the message if there is no active resync thread.
4948                  */
4949                 if (!rs_active)
4950                         break;
4951 
4952                 broke_out = p->rs_flags & MD_MN_RS_ERR;
4953                 switch (RS_TYPE(p->rs_type)) {
4954                 case MD_RS_OPTIMIZED:
4955                         un = md_ioctl_writerlock(lockp, ui);
4956                         if (p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE) {
4957                                 /* If we are originator, just clear rs_type */
4958                                 if (p->rs_originator == md_mn_mynode_id) {
4959                                         SET_RS_TYPE_NONE(un->un_rs_type);
4960                                         md_ioctl_writerexit(lockp);
4961                                         break;
4962                                 }
4963                                 /*
4964                                  * If CLEAR_OPT_NOT_DONE is set, only clear the
4965                                  * flags if OPT_NOT_DONE is set *and* rs_type
4966                                  * is MD_RS_NONE.
4967                                  */
4968                                 if ((un->c.un_status & MD_UN_OPT_NOT_DONE) &&
4969                                     (RS_TYPE(un->un_rs_type) == MD_RS_NONE)) {
4970                                         /* No resync in progress */
4971                                         un->c.un_status &= ~MD_UN_OPT_NOT_DONE;
4972                                         un->c.un_status &= ~MD_UN_WAR;
4973                                 } else {
4974                                         /*
4975                                          * We are in the middle of an
4976                                          * optimized resync and this message
4977                                          * should be ignored.
4978                                          */
4979                                         md_ioctl_writerexit(lockp);
4980                                         break;
4981                                 }
4982                         } else {
4983                                 /*
4984                                  * This is the end of an optimized resync,
4985                                  * clear the OPT_NOT_DONE and OFFLINE_SM flags
4986                                  */
4987 
4988                                 un->c.un_status &= ~MD_UN_KEEP_DIRTY;
4989                                 if (!broke_out)
4990                                         un->c.un_status &= ~MD_UN_WAR;
4991 
4992                                 /*
4993                                  * Clear our un_resync_bm for the regions
4994                                  * completed.  The owner (originator) will
4995                                  * take care of itself.
4996                                  */
4997                                 if (p->rs_originator != md_mn_mynode_id &&
4998                                     (ps = un->un_rs_prev_overlap) != NULL) {
4999                                         BLK_TO_RR(rr_start, ps->ps_firstblk,
5000                                             un);
5001                                         BLK_TO_RR(rr_end, ps->ps_lastblk, un);
5002                                         mutex_enter(&un->un_resync_mx);
5003                                         for (rr = rr_start; rr <= rr_end;
5004                                             rr++) {
5005                                                 CLR_KEEPDIRTY(rr, un);
5006                                         }
5007                                         mutex_exit(&un->un_resync_mx);
5008                                 }
5009                         }
5010 
5011                         /*
5012                          * Set resync_completed to last resync type and then
5013                          * clear resync_type to indicate no resync in progress
5014                          */
5015                         un->un_resync_completed = un->un_rs_type;
5016                         SET_RS_TYPE_NONE(un->un_rs_type);
5017 
5018                         /*
5019                          * If resync is as a result of a submirror ONLINE,
5020                          * reset the submirror state to SMS_RUNNING if the
5021                          * resync was ok else set back to SMS_OFFLINE.
5022                          */
5023                         for (smi = 0; smi < NMIRROR; smi++) {
5024                                 un->un_sm[smi].sm_flags &=
5025                                     ~MD_SM_RESYNC_TARGET;
5026                                 if (SMS_BY_INDEX_IS(un, smi,
5027                                     SMS_OFFLINE_RESYNC)) {
5028                                         if (p->rs_flags &
5029                                             MD_MN_RS_CLEAR_OPT_NOT_DONE) {
5030                                                 state = SMS_OFFLINE;
5031                                         } else {
5032                                                 state = (broke_out ?
5033                                                     SMS_OFFLINE : SMS_RUNNING);
5034                                         }
5035                                         mirror_set_sm_state(
5036                                             &un->un_sm[smi],
5037                                             &un->un_smic[smi], state,
5038                                             broke_out);
5039                                         mirror_commit(un, NO_SUBMIRRORS,
5040                                             0);
5041                                 }
5042                                 /*
5043                                  * If we still have an offline submirror, set
5044                                  * the OFFLINE_SM flag in the mirror status.
5045                                  */
5046                                 if (SMS_BY_INDEX_IS(un, smi,
5047                                     SMS_OFFLINE))
5048                                         un->c.un_status |=
5049                                             MD_UN_OFFLINE_SM;
5050                         }
5051                         md_ioctl_writerexit(lockp);
5052                         break;
5053                 case MD_RS_SUBMIRROR:
5054                         un = md_ioctl_writerlock(lockp, ui);
5055                         smi = RS_SMI(p->rs_type);
5056                         sm = &un->un_sm[smi];
5057                         smic = &un->un_smic[smi];
5058                         /* Clear RESYNC target */
5059                         un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET;
5060                         /*
5061                          * Set resync_completed to last resync type and then
5062                          * clear resync_type to indicate no resync in progress
5063                          */
5064                         un->un_resync_completed = un->un_rs_type;
5065                         SET_RS_TYPE_NONE(un->un_rs_type);
5066                         /*
5067                          * If the resync completed ok reset the submirror
5068                          * state to SMS_RUNNING else reset it to SMS_ATTACHED
5069                          */
5070                         state = (broke_out ?
5071                             SMS_ATTACHED : SMS_RUNNING);
5072                         mirror_set_sm_state(sm, smic, state, broke_out);
5073                         un->c.un_status &= ~MD_UN_WAR;
5074                         mirror_commit(un, SMI2BIT(smi), 0);
5075                         md_ioctl_writerexit(lockp);
5076                         break;
5077                 case MD_RS_COMPONENT:
5078                         un = md_ioctl_writerlock(lockp, ui);
5079                         smi = RS_SMI(p->rs_type);
5080                         ci = RS_CI(p->rs_type);
5081                         sm = &un->un_sm[smi];
5082                         smic = &un->un_smic[smi];
5083                         shared = (md_m_shared_t *)
5084                             (*(smic->sm_shared_by_indx))
5085                             (sm->sm_dev, sm, ci);
5086                         un->c.un_status &= ~MD_UN_WAR;
5087                         /* Clear RESYNC target */
5088                         un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET;
5089                         /*
5090                          * Set resync_completed to last resync type and then
5091                          * clear resync_type to indicate no resync in progress
5092                          */
5093                         un->un_resync_completed = un->un_rs_type;
5094                         SET_RS_TYPE_NONE(un->un_rs_type);
5095 
5096                         /*
5097                          * If the resync completed ok, set the component state
5098                          * to CS_OKAY.
5099                          */
5100                         if (broke_out)
5101                                 shared->ms_flags |= MDM_S_RS_TRIED;
5102                         else {
5103                                 /*
5104                                  * As we don't transmit the changes,
5105                                  * no need to drop the lock.
5106                                  */
5107                                 set_sm_comp_state(un, smi, ci, CS_OKAY, 0,
5108                                     MD_STATE_NO_XMIT, (IOLOCK *)NULL);
5109                         }
5110                         md_ioctl_writerexit(lockp);
                             break;
5111                 default:
5112                         break;
5113                 }
5114                 /*
5115                  * If the purpose of this PHASE_DONE message is just to
5116                  * indicate to all other nodes that the optimized resync
5117                  * required (OPT_NOT_DONE) flag is to be cleared, there is
5118                  * no need to generate a notify event as there has not
5119                  * actually been a resync.
5120                  */
5121                 if (!(p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE)) {
5122                         if (broke_out) {
5123                                 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_FAILED,
5124                                     SVM_TAG_METADEVICE, MD_UN2SET(un),
5125                                     MD_SID(un));
5126                         } else {
5127                                 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_DONE,
5128                                     SVM_TAG_METADEVICE, MD_UN2SET(un),
5129                                     MD_SID(un));
5130                         }
5131                 }
5132                 break;
5133 
5134         default:
5135 #ifdef DEBUG
5136                 cmn_err(CE_PANIC, "mirror_resync_message: Unknown message type"
5137                     " %x\n", p->msg_type);
5138 #endif
5139                 return (EINVAL);
5140         }
5141         return (0);
5142 }
5143 
5144 /* Return -1 if snarf of an optimized record fails and the set should be released */
5145 static int
5146 mirror_snarf(md_snarfcmd_t cmd, set_t setno)
5147 {
5148         mddb_recid_t    recid;
5149         int             gotsomething;
5150         int             all_mirrors_gotten;
5151         mm_unit_t       *un;
5152         mddb_type_t     typ1;
5153         mddb_de_ic_t    *dep;
5154         mddb_rb32_t     *rbp;
5155         size_t          newreqsize;
5156         mm_unit_t       *big_un;
5157         mm_unit32_od_t  *small_un;
5158         int             retval;
5159         mdi_unit_t      *ui;
5160 
5161         if (cmd == MD_SNARF_CLEANUP) {
5162                 if (md_get_setstatus(setno) & MD_SET_STALE)
5163                         return (0);
5164 
5165                 recid = mddb_makerecid(setno, 0);
5166                 typ1 = (mddb_type_t)md_getshared_key(setno,
5167                     mirror_md_ops.md_driver.md_drivername);
5168                 while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
5169                         if (mddb_getrecprivate(recid) & MD_PRV_CLEANUP) {
5170                                 un = (mm_unit_t *)mddb_getrecaddr(recid);
5171                                 mirror_cleanup(un);
5172                                 recid = mddb_makerecid(setno, 0);
5173                         }
5174                 }
5175                 return (0);
5176         }
5177 
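             /*
              * Normal snarf: walk every mirror record in this set, converting
              * any old small (32-bit) records to the large format before
              * building the incore unit structures.
              */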
5178         all_mirrors_gotten = 1;
5179         gotsomething = 0;
5180 
5181         recid = mddb_makerecid(setno, 0);
5182         typ1 = (mddb_type_t)md_getshared_key(setno,
5183             mirror_md_ops.md_driver.md_drivername);
5184 
5185         while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
5186                 if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
5187                         continue;
5188 
5189                 dep = mddb_getrecdep(recid);
5190                 dep->de_flags = MDDB_F_MIRROR;
5191                 rbp = dep->de_rb;
5192 
5193                 switch (rbp->rb_revision) {
5194                 case MDDB_REV_RB:
5195                 case MDDB_REV_RBFN:
5196                         if ((rbp->rb_private & MD_PRV_CONVD) == 0) {
5197                                 /*
5198                                  * This means, we have an old and small
5199                                  * record and this record hasn't already
5200                                  * been converted.  Before we create an
5201                                  * incore metadevice from this we have to
5202                                  * convert it to a big record.
5203                                  */
5204                                 small_un =
5205                                     (mm_unit32_od_t *)mddb_getrecaddr(recid);
5206                                 newreqsize = sizeof (mm_unit_t);
5207                                 big_un = (mm_unit_t *)kmem_zalloc(newreqsize,
5208                                     KM_SLEEP);
5209                                 mirror_convert((caddr_t)small_un,
5210                                     (caddr_t)big_un, SMALL_2_BIG);
5211                                 kmem_free(small_un, dep->de_reqsize);
5212 
5213                                 /*
5214                                  * Update userdata and incore userdata;
5215                                  * incores are at the end of un.
5216                                  */
5217                                 dep->de_rb_userdata_ic = big_un;
5218                                 dep->de_rb_userdata = big_un;
5219                                 dep->de_icreqsize = newreqsize;
5220                                 un = big_un;
5221                                 rbp->rb_private |= MD_PRV_CONVD;
5222                         } else {
5223                                 /*
5224                                  * Unit already converted, just get the
5225                                  * record address.
5226                                  */
5227                                 un = (mm_unit_t *)mddb_getrecaddr_resize(recid,
5228                                     sizeof (*un), 0);
5229                         }
5230                         un->c.un_revision &= ~MD_64BIT_META_DEV;
5231                         break;
5232                 case MDDB_REV_RB64:
5233                 case MDDB_REV_RB64FN:
5234                         /* Big device */
5235                         un = (mm_unit_t *)mddb_getrecaddr_resize(recid,
5236                             sizeof (*un), 0);
5237                         un->c.un_revision |= MD_64BIT_META_DEV;
5238                         un->c.un_flag |= MD_EFILABEL;
5239                         break;
5240                 }
5241                 MDDB_NOTE_FN(rbp->rb_revision, un->c.un_revision);
5242 
5243                 /*
5244                  * Create minor device node for snarfed entry.
5245                  */
5246                 (void) md_create_minor_node(setno, MD_SID(un));
5247 
5248                 if (MD_UNIT(MD_SID(un)) != NULL) {
5249                         mddb_setrecprivate(recid, MD_PRV_PENDDEL);
5250                         continue;
5251                 }
5252                 all_mirrors_gotten = 0;
5253                 retval = mirror_build_incore(un, 1);
5254                 if (retval == 0) {
5255                         mddb_setrecprivate(recid, MD_PRV_GOTIT);
5256                         md_create_unit_incore(MD_SID(un), &mirror_md_ops, 0);
5257                         resync_start_timeout(setno);
5258                         gotsomething = 1;
5259                 } else {
5260                         return (retval);
5261                 }
5262                 /*
5263                  * Set flag to indicate that the mirror has not yet
5264                  * been through a reconfig. This flag is used for MN sets
5265                  * when determining whether to update the mirror state from
5266                  * the Master node.
5267                  */
5268                 if (MD_MNSET_SETNO(setno)) {
5269                         ui = MDI_UNIT(MD_SID(un));
5270                         ui->ui_tstate |= MD_RESYNC_NOT_DONE;
5271                 }
5272         }
5273 
5274         if (!all_mirrors_gotten)
5275                 return (gotsomething);
5276 
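             /*
              * All mirrors have been snarfed; mark any resync records that
              * were not claimed above for deletion.
              */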
5277         recid = mddb_makerecid(setno, 0);
5278         while ((recid = mddb_getnextrec(recid, typ1, RESYNC_REC)) > 0)
5279                 if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
5280                         mddb_setrecprivate(recid, MD_PRV_PENDDEL);
5281 
5282         return (0);
5283 }
5284 
5285 static int
5286 mirror_halt(md_haltcmd_t cmd, set_t setno)
5287 {
5288         unit_t          i;
5289         mdi_unit_t      *ui;
5290         minor_t         mnum;
5291         int             reset_mirror_flag = 0;
5292 
5293         if (cmd == MD_HALT_CLOSE)
5294                 return (0);
5295 
5296         if (cmd == MD_HALT_OPEN)
5297                 return (0);
5298 
5299         if (cmd == MD_HALT_UNLOAD)
5300                 return (0);
5301 
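             /*
              * MD_HALT_CHECK: fail the halt (return 1) if any mirror unit
              * in this set is still open.
              */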
5302         if (cmd == MD_HALT_CHECK) {
5303                 for (i = 0; i < md_nunits; i++) {
5304                         mnum = MD_MKMIN(setno, i);
5305                         if ((ui = MDI_UNIT(mnum)) == NULL)
5306                                 continue;
5307                         if (ui->ui_opsindex != mirror_md_ops.md_selfindex)
5308                                 continue;
5309                         if (md_unit_isopen(ui))
5310                                 return (1);
5311                 }
5312                 return (0);
5313         }
5314 
5315         if (cmd != MD_HALT_DOIT)
5316                 return (1);
5317 
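             /*
              * MD_HALT_DOIT: reset the incore state of every mirror unit
              * in this set.
              */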
5318         for (i = 0; i < md_nunits; i++) {
5319                 mnum = MD_MKMIN(setno, i);
5320                 if ((ui = MDI_UNIT(mnum)) == NULL)
5321                         continue;
5322                 if (ui->ui_opsindex != mirror_md_ops.md_selfindex)
5323                         continue;
5324                 reset_mirror((mm_unit_t *)MD_UNIT(mnum), mnum, 0);
5325 
5326                 /* Set a flag if there is at least one mirror metadevice. */
5327                 reset_mirror_flag = 1;
5328         }
5329 
5330         /*
5331          * Only wait for the global dr_timeout to finish
5332          *  - if there are mirror metadevices in this diskset or
5333          *  - if this is the local set since an unload of the md_mirror
5334          *    driver could follow a successful mirror halt in the local set.
5335          */
5336         if ((reset_mirror_flag != 0) || (setno == MD_LOCAL_SET)) {
5337                 while ((mirror_md_ops.md_head == NULL) &&
5338                     (mirror_timeout.dr_timeout_id != 0))
5339                         delay(md_hz);
5340         }
5341 
5342         return (0);
5343 }
5344 
5345 /*ARGSUSED3*/
5346 static int
5347 mirror_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags)
5348 {
5349         IOLOCK  lock;
5350         minor_t         mnum = getminor(*dev);
5351         set_t           setno;
5352 
5353         /*
5354          * When doing an open of a multi owner metadevice, check to see if this
5355          * node is a starting node and if a reconfig cycle is underway.
5356          * If so, the system isn't sufficiently set up to handle the
5357          * open (which involves I/O during sp_validate), so fail with ENXIO.
5358          */
5359         setno = MD_MIN2SET(mnum);
5360         if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) ==
5361             (MD_SET_MNSET | MD_SET_MN_START_RC)) {
5362                 return (ENXIO);
5363         }
5364 
5365         if (md_oflags & MD_OFLG_FROMIOCTL) {
5366                 /*
5367                  * This indicates that the caller is an ioctl service routine.
5368                  * In this case we initialise our stack-based IOLOCK and pass
5369                  * this into the internal open routine. This allows multi-owner
5370                  * metadevices to avoid deadlocking if an error is encountered
5371                  * during the open() attempt. The failure case is:
5372                  * s-p -> mirror -> s-p (with error). Attempting to metaclear
5373                  * this configuration would deadlock as the mirror code has to
5374                  * send a state-update to the other nodes when it detects the
5375                  * failure of the underlying submirror with an errored soft-part
5376                  * on it. As there is a class1 message in progress (metaclear)
5377                  * set_sm_comp_state() cannot send another class1 message;
5378                  * instead we do not send a state_update message as the
5379                  * metaclear is distributed and the failed submirror will be
5380                  * cleared from the configuration by the metaclear.
5381                  */
5382                 IOLOCK_INIT(&lock);
5383                 return (mirror_internal_open(getminor(*dev), flag, otyp,
5384                     md_oflags, &lock));
5385         } else {
5386                 return (mirror_internal_open(getminor(*dev), flag, otyp,
5387                     md_oflags, (IOLOCK *)NULL));
5388         }
5389 }
5390 
5391 
5392 /*ARGSUSED1*/
5393 static int
5394 mirror_close(dev_t dev, int flag, int otyp, cred_t *cred_p, int md_cflags)
5395 {
5396         return (mirror_internal_close(getminor(dev), otyp, md_cflags,
5397             (IOLOCK *)NULL));
5398 }
5399 
5400 
5401 /*
5402  * This routine dumps memory to the disk.  It assumes that the memory has
5403  * already been mapped into mainbus space.  It is called at disk interrupt
5404  * priority when the system is in trouble.
5405  *
5406  */
5407 static int
5408 mirror_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
5409 {
5410         mm_unit_t       *un;
5411         dev_t           mapdev;
5412         int             result;
5413         int             smi;
5414         int             any_succeed = 0;
5415         int             save_result = 0;
5416 
5417         /*
5418          * No need to grab the unit lock, because nothing else
5419          * is supposed to be happening.
5420          * Also, dump is not supposed to sleep.
5421          */
5422         un = (mm_unit_t *)MD_UNIT(getminor(dev));
5423 
5424         if ((diskaddr_t)blkno >= un->c.un_total_blocks)
5425                 return (EINVAL);
5426 
5427         if ((diskaddr_t)blkno + nblk > un->c.un_total_blocks)
5428                 return (EINVAL);
5429 
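             /*
              * Attempt the dump on every writeable submirror; the dump
              * succeeds if at least one submirror write succeeds.
              */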
5430         for (smi = 0; smi < NMIRROR; smi++) {
5431                 if (!SUBMIRROR_IS_WRITEABLE(un, smi))
5432                         continue;
5433                 mapdev = md_dev64_to_dev(un->un_sm[smi].sm_dev);
5434                 result = bdev_dump(mapdev, addr, blkno, nblk);
5435                 if (result)
5436                         save_result = result;
5437 
5438                 if (result == 0)
5439                         any_succeed++;
5440         }
5441 
5442         if (any_succeed)
5443                 return (0);
5444 
5445         return (save_result);
5446 }
5447 
5448 /*
5449  * NAME: mirror_probe_dev
5450  *
5451  * DESCRIPTION: forces open every component of a mirror.
5452  *
5453  * On entry the unit writerlock is held
5454  */
5455 static int
5456 mirror_probe_dev(mdi_unit_t *ui, minor_t mnum)
5457 {
5458         int             i;
5459         int             smi;
5460         int             ci;
5461         mm_unit_t       *un;
5462         int             md_devopen = 0;
5463         set_t           setno;
5464         int             sm_cnt;
5465         int             sm_unavail_cnt;
5466 
5467         if (md_unit_isopen(ui))
5468                 md_devopen++;
5469 
5470         un = MD_UNIT(mnum);
5471         setno = MD_UN2SET(un);
5472 
5473         sm_cnt = 0;
5474         sm_unavail_cnt = 0;
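             /*
              * Force open each in-use submirror, counting how many are
              * configured and how many of those are unavailable.
              */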
5475         for (i = 0; i < NMIRROR; i++) {
5476                 md_dev64_t tmpdev;
5477                 mdi_unit_t      *sm_ui;
5478 
5479                 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) {
5480                         continue;
5481                 }
5482 
5483                 sm_cnt++;
5484                 tmpdev = un->un_sm[i].sm_dev;
5485                 (void) md_layered_open(mnum, &tmpdev,
5486                     MD_OFLG_CONT_ERRS | MD_OFLG_PROBEDEV);
5487                 un->un_sm[i].sm_dev = tmpdev;
5488 
5489                 sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
5490 
5491                 /*
5492                  * Logic similar to that in mirror_open_all_devs.  We set or
5493                  * clear the submirror Unavailable bit.
5494                  */
5495                 (void) md_unit_writerlock(sm_ui);
5496                 if (submirror_unavailable(un, i, 1)) {
5497                         sm_ui->ui_tstate |= MD_INACCESSIBLE;
5498                         sm_unavail_cnt++;
5499                 } else {
5500                         sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
5501                 }
5502                 md_unit_writerexit(sm_ui);
5503         }
5504 
5505         /*
5506          * If all of the submirrors are unavailable, the mirror is also
5507          * unavailable.
5508          */
5509         if (sm_cnt == sm_unavail_cnt) {
5510                 ui->ui_tstate |= MD_INACCESSIBLE;
5511         } else {
5512                 ui->ui_tstate &= ~MD_INACCESSIBLE;
5513         }
5514 
5515         /*
5516          * Start checking for probe failures. If failures occur we
5517          * set the appropriate erred state only if the metadevice is in
5518          * use. This is specifically to prevent unnecessary resyncs.
5519          * For instance if the disks were accidentally disconnected when
5520          * the system booted up then until the metadevice is accessed
5521          * (like file system mount) the user can shutdown, recable and
5522          * reboot w/o incurring a potentially huge resync.
5523          */
5524 
5525         smi = 0;
5526         ci = 0;
5527         while (mirror_geterror(un, &smi, &ci, 1, 1) != 0) {
5528 
5529                 if (mirror_other_sources(un, smi, ci, 0) == 1) {
5530                         /*
5531                          * Note that for a MN set, there is no need to call
5532                          * SE_NOTIFY as that is done when processing the
5533                          * state change
5534                          */
5535                         if (md_devopen) {
5536                                 /*
5537                                  * Never called from ioctl context,
5538                                  * so (IOLOCK *)NULL
5539                                  */
5540                                 set_sm_comp_state(un, smi, ci, CS_LAST_ERRED,
5541                                     0, MD_STATE_XMIT, (IOLOCK *)NULL);
5542                                 if (!MD_MNSET_SETNO(setno)) {
5543                                         SE_NOTIFY(EC_SVM_STATE,
5544                                             ESC_SVM_LASTERRED,
5545                                             SVM_TAG_METADEVICE, setno,
5546                                             MD_SID(un));
5547                                 }
5548                                 continue;
5549                         } else {
5550                                 (void) mirror_close_all_devs(un,
5551                                     MD_OFLG_PROBEDEV);
5552                                 if (!MD_MNSET_SETNO(setno)) {
5553                                         SE_NOTIFY(EC_SVM_STATE,
5554                                             ESC_SVM_OPEN_FAIL,
5555                                             SVM_TAG_METADEVICE, setno,
5556                                             MD_SID(un));
5557                                 }
5558                                 mirror_openfail_console_info(un, smi, ci);
5559                                 return (ENXIO);
5560                         }
5561                 }
5562 
5563                 /*
5564                  * Note that for a MN set, there is no need to call
5565                  * SE_NOTIFY as that is done when processing the
5566                  * state change
5567                  */
5568                 if (md_devopen) {
5569                         /* Never called from ioctl context, so (IOLOCK *)NULL */
5570                         set_sm_comp_state(un, smi, ci, CS_ERRED, 0,
5571                             MD_STATE_XMIT, (IOLOCK *)NULL);
5572                         if (!MD_MNSET_SETNO(setno)) {
5573                                 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
5574                                     SVM_TAG_METADEVICE, setno,
5575                                     MD_SID(un));
5576                         }
5577                 }
5578                 mirror_openfail_console_info(un, smi, ci);
5579                 ci++;
5580         }
5581 
5582         if (MD_MNSET_SETNO(setno)) {
5583                 send_poke_hotspares(setno);
5584         } else {
5585                 (void) poke_hotspares();
5586         }
5587         (void) mirror_close_all_devs(un, MD_OFLG_PROBEDEV);
5588 
5589         return (0);
5590 }
5591 
5592 
5593 static int
5594 mirror_imp_set(
5595         set_t   setno
5596 )
5597 {
5598 
5599         mddb_recid_t    recid;
5600         int             gotsomething, i;
5601         mddb_type_t     typ1;
5602         mddb_de_ic_t    *dep;
5603         mddb_rb32_t     *rbp;
5604         mm_unit32_od_t  *un32;
5605         mm_unit_t       *un64;
5606         md_dev64_t      self_devt;
5607         minor_t         *self_id;       /* minor needs to be updated */
5608         md_parent_t     *parent_id;     /* parent needs to be updated */
5609         mddb_recid_t    *record_id;     /* record id needs to be updated */
5610         mddb_recid_t    *optrec_id;
5611         md_dev64_t      tmpdev;
5612 
5613 
5614         gotsomething = 0;
5615 
5616         typ1 = (mddb_type_t)md_getshared_key(setno,
5617             mirror_md_ops.md_driver.md_drivername);
5618         recid = mddb_makerecid(setno, 0);
5619 
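             /*
              * For each mirror record being imported, remap the submirror
              * devices and the unit's self, parent, record and optimized
              * resync record ids to reflect the new set number.
              */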
5620         while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
5621                 if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
5622                         continue;
5623 
5624                 dep = mddb_getrecdep(recid);
5625                 rbp = dep->de_rb;
5626 
5627                 switch (rbp->rb_revision) {
5628                 case MDDB_REV_RB:
5629                 case MDDB_REV_RBFN:
5630                         /*
5631                          * Small device
5632                          */
5633                         un32 = (mm_unit32_od_t *)mddb_getrecaddr(recid);
5634                         self_id = &(un32->c.un_self_id);
5635                         parent_id = &(un32->c.un_parent);
5636                         record_id = &(un32->c.un_record_id);
5637                         optrec_id = &(un32->un_rr_dirty_recid);
5638 
5639                         for (i = 0; i < un32->un_nsm; i++) {
5640                                 tmpdev = md_expldev(un32->un_sm[i].sm_dev);
5641                                 un32->un_sm[i].sm_dev = md_cmpldev
5642                                     (md_makedevice(md_major, MD_MKMIN(setno,
5643                                     MD_MIN2UNIT(md_getminor(tmpdev)))));
5644 
5645                                 if (!md_update_minor(setno, mddb_getsidenum
5646                                     (setno), un32->un_sm[i].sm_key))
5647                                         goto out;
5648                         }
5649                         break;
5650                 case MDDB_REV_RB64:
5651                 case MDDB_REV_RB64FN:
5652                         un64 = (mm_unit_t *)mddb_getrecaddr(recid);
5653                         self_id = &(un64->c.un_self_id);
5654                         parent_id = &(un64->c.un_parent);
5655                         record_id = &(un64->c.un_record_id);
5656                         optrec_id = &(un64->un_rr_dirty_recid);
5657 
5658                         for (i = 0; i < un64->un_nsm; i++) {
5659                                 tmpdev = un64->un_sm[i].sm_dev;
5660                                 un64->un_sm[i].sm_dev = md_makedevice
5661                                     (md_major, MD_MKMIN(setno, MD_MIN2UNIT
5662                                     (md_getminor(tmpdev))));
5663 
5664                                 if (!md_update_minor(setno, mddb_getsidenum
5665                                     (setno), un64->un_sm[i].sm_key))
5666                                         goto out;
5667                         }
5668                         break;
5669                 }
5670 
5671                 /*
5672                  * If this is a top level and a friendly name metadevice,
5673                  * update its minor in the namespace.
5674                  */
5675                 if ((*parent_id == MD_NO_PARENT) &&
5676                     ((rbp->rb_revision == MDDB_REV_RBFN) ||
5677                     (rbp->rb_revision == MDDB_REV_RB64FN))) {
5678 
5679                         self_devt = md_makedevice(md_major, *self_id);
5680                         if (!md_update_top_device_minor(setno,
5681                             mddb_getsidenum(setno), self_devt))
5682                                 goto out;
5683                 }
5684 
5685                 /*
5686                  * Update unit with the imported setno
5687                  *
5688                  */
5689                 mddb_setrecprivate(recid, MD_PRV_GOTIT);
5690 
5691                 *self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));
5692                 if (*parent_id != MD_NO_PARENT)
5693                         *parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
5694                 *record_id = MAKERECID(setno, DBID(*record_id));
5695                 *optrec_id = MAKERECID(setno, DBID(*optrec_id));
5696 
5697                 gotsomething = 1;
5698         }
5699 
5700 out:
5701         return (gotsomething);
5702 }
5703 
5704 /*
5705  * NAME: mirror_check_offline
5706  *
5707  * DESCRIPTION: return offline_status = 1 if any submirrors are offline
5708  *
5709  * Called from ioctl, so access to MD_UN_OFFLINE_SM in un_status is
5710  * protected by the global ioctl lock as it is only set by the MD_IOCOFFLINE
5711  * ioctl.
5712  */
5713 int
5714 mirror_check_offline(md_dev64_t dev, int *offline_status)
5715 {
5716         mm_unit_t               *un;
5717         md_error_t              mde = mdnullerror;
5718 
5719         if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
5720                 return (EINVAL);
5721         *offline_status = 0;
5722         if (un->c.un_status & MD_UN_OFFLINE_SM)
5723                 *offline_status = 1;
5724         return (0);
5725 }
5726 
5727 /*
5728  * NAME: mirror_inc_abr_count
5729  *
5730  * DESCRIPTION: increment the count of layered soft parts with ABR set
5731  *
5732  * Called from ioctl, so access to un_abr_count is protected by the global
5733  * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl.
5734  */
5735 int
5736 mirror_inc_abr_count(md_dev64_t dev)
5737 {
5738         mm_unit_t               *un;
5739         md_error_t              mde = mdnullerror;
5740 
5741         if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
5742                 return (EINVAL);
5743         un->un_abr_count++;
5744         return (0);
5745 }
5746 
5747 /*
5748  * NAME: mirror_dec_abr_count
5749  *
5750  * DESCRIPTION: decrement the count of layered soft parts with ABR set
5751  *
5752  * Called from ioctl, so access to un_abr_count is protected by the global
5753  * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl.
5754  */
5755 int
5756 mirror_dec_abr_count(md_dev64_t dev)
5757 {
5758         mm_unit_t               *un;
5759         md_error_t              mde = mdnullerror;
5760 
5761         if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
5762                 return (EINVAL);
5763         un->un_abr_count--;
5764         return (0);
5765 }
5766 
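     /*
      * Named services exported by the mirror driver.
      */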
5767 static md_named_services_t mirror_named_services[] = {
5768         {(intptr_t (*)()) poke_hotspares,               "poke hotspares"    },
5769         {(intptr_t (*)()) mirror_rename_listkids,       MDRNM_LIST_URKIDS   },
5770         {mirror_rename_check,                           MDRNM_CHECK         },
5771         {(intptr_t (*)()) mirror_renexch_update_kids,   MDRNM_UPDATE_KIDS   },
5772         {(intptr_t (*)()) mirror_exchange_parent_update_to,
5773                         MDRNM_PARENT_UPDATE_TO},
5774         {(intptr_t (*)()) mirror_exchange_self_update_from_down,
5775                         MDRNM_SELF_UPDATE_FROM_DOWN },
5776         {(intptr_t (*)())mirror_probe_dev,              "probe open test" },
5777         {(intptr_t (*)())mirror_check_offline,          MD_CHECK_OFFLINE },
5778         {(intptr_t (*)())mirror_inc_abr_count,          MD_INC_ABR_COUNT },
5779         {(intptr_t (*)())mirror_dec_abr_count,          MD_DEC_ABR_COUNT },
5780         { NULL,                                         0                   }
5781 };
5782 
5783 md_ops_t mirror_md_ops = {
5784         mirror_open,            /* open */
5785         mirror_close,           /* close */
5786         md_mirror_strategy,     /* strategy */
5787         NULL,                   /* print */
5788         mirror_dump,            /* dump */
5789         NULL,                   /* read */
5790         NULL,                   /* write */
5791         md_mirror_ioctl,        /* mirror_ioctl, */
5792         mirror_snarf,           /* mirror_snarf */
5793         mirror_halt,            /* mirror_halt */
5794         NULL,                   /* aread */
5795         NULL,                   /* awrite */
5796         mirror_imp_set,         /* import set */
5797         mirror_named_services
5798 };
5799 
5800 /* module specific initialization */
5801 static void
5802 init_init()
5803 {
5804         md_mirror_mcs_buf_off = sizeof (md_mcs_t) - sizeof (buf_t);
5805 
5806         /* Initialize the parent and child save memory pools */
5807         mirror_parent_cache = kmem_cache_create("md_mirror_parent",
5808             sizeof (md_mps_t), 0, mirror_parent_constructor,
5809             mirror_parent_destructor, mirror_run_queue, NULL, NULL,
5810             0);
5811 
5812         mirror_child_cache = kmem_cache_create("md_mirror_child",
5813             sizeof (md_mcs_t) - sizeof (buf_t) + biosize(), 0,
5814             mirror_child_constructor, mirror_child_destructor,
5815             mirror_run_queue, NULL, NULL, 0);
5816 
5817         /*
5818          * Ensure wowbuf_size is a multiple of DEV_BSIZE,
5819          * then initialize wowbuf memory pool.
5820          */
5821         md_wowbuf_size = roundup(md_wowbuf_size, DEV_BSIZE);
5822         if (md_wowbuf_size <= 0)
5823                 md_wowbuf_size = 2 * DEV_BSIZE;
5824         if (md_wowbuf_size > (32 * DEV_BSIZE))
5825                 md_wowbuf_size = (32 * DEV_BSIZE);
5826 
5827         md_wowblk_size = md_wowbuf_size + sizeof (wowhdr_t);
5828         mirror_wowblk_cache = kmem_cache_create("md_mirror_wow",
5829             md_wowblk_size, 0, NULL, NULL, NULL, NULL, NULL, 0);
5830 
5831         mutex_init(&mirror_timeout.dr_mx, NULL, MUTEX_DEFAULT, NULL);
5832         mutex_init(&hotspare_request.dr_mx, NULL, MUTEX_DEFAULT, NULL);
5833 
5834         mutex_init(&non_ff_drv_mutex, NULL, MUTEX_DEFAULT, NULL);
5835 }
5836 
5837 /* module specific uninitialization (undo init_init()) */
5838 static void
5839 fini_uninit()
5840 {
5841         kmem_cache_destroy(mirror_parent_cache);
5842         kmem_cache_destroy(mirror_child_cache);
5843         kmem_cache_destroy(mirror_wowblk_cache);
5844         mirror_parent_cache = mirror_child_cache =
5845             mirror_wowblk_cache = NULL;
5846 
5847         mutex_destroy(&mirror_timeout.dr_mx);
5848         mutex_destroy(&hotspare_request.dr_mx);
5849         mutex_destroy(&non_ff_drv_mutex);
5850 }
5851 
5852 /* define the module linkage */
5853 MD_PLUGIN_MISC_MODULE("mirrors module", init_init(), fini_uninit())