1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2011 Bayard G. Bell. All rights reserved.
  25  */
  26 
  27 /*
  28  * Md - is the meta-disk driver.   It sits below the UFS file system
  29  * but above the 'real' disk drivers, xy, id, sd etc.
  30  *
  31  * To the UFS software, md looks like a normal driver, since it has
  32  * the normal kinds of entries in the bdevsw and cdevsw arrays. So
  33  * UFS accesses md in the usual ways.  In particular, the strategy
  34  * routine, mdstrategy(), gets called by fbiwrite(), ufs_getapage(),
  35  * and ufs_writelbn().
  36  *
  37  * Md maintains an array of minor devices (meta-partitions).   Each
  38  * meta partition stands for a matrix of real partitions, in rows
  39  * which are not necessarily of equal length.   Md maintains a table,
  40  * with one entry for each meta-partition,  which lists the rows and
  41  * columns of actual partitions, and the job of the strategy routine
  42  * is to translate from the meta-partition device and block numbers
  43  * known to UFS into the actual partitions' device and block numbers.
  44  *
  45  * See below, in mdstrategy(), mdreal(), and mddone() for details of
  46  * this translation.
  47  */
  48 
  49 /*
  50  * Driver for Virtual Disk.
  51  */
  52 
  53 #include <sys/user.h>
  54 #include <sys/sysmacros.h>
  55 #include <sys/conf.h>
  56 #include <sys/stat.h>
  57 #include <sys/errno.h>
  58 #include <sys/param.h>
  59 #include <sys/systm.h>
  60 #include <sys/file.h>
  61 #include <sys/open.h>
  62 #include <sys/dkio.h>
  63 #include <sys/vtoc.h>
  64 #include <sys/cmn_err.h>
  65 #include <sys/ddi.h>
  66 #include <sys/sunddi.h>
  67 #include <sys/debug.h>
  68 #include <sys/utsname.h>
  69 #include <sys/lvm/mdvar.h>
  70 #include <sys/lvm/md_names.h>
  71 #include <sys/lvm/md_mddb.h>
  72 #include <sys/lvm/md_sp.h>
  73 #include <sys/types.h>
  74 #include <sys/kmem.h>
  75 #include <sys/cladm.h>
  76 #include <sys/priv_names.h>
  77 #include <sys/modhash.h>
  78 
  79 int             md_init_debug   = 0;    /* module binding debug */
  80 
  81 /*
  82  * Tunable to turn off the failfast behavior.
  83  */
  84 int             md_ff_disable = 0;
  85 
  86 /*
  87  * dynamically allocated list of non FF driver names - needs to
  88  * be freed when md is detached.
  89  */
  90 char    **non_ff_drivers = NULL;
  91 
  92 md_krwlock_t    md_unit_array_rw;       /* protects all unit arrays */
  93 md_krwlock_t    nm_lock;                /* protects all the name spaces */
  94 
  95 md_resync_t     md_cpr_resync;
  96 
  97 extern char     svm_bootpath[];
  98 #define SVM_PSEUDO_STR  "/pseudo/md@0:"
  99 
 100 #define         VERSION_LENGTH  6
 101 #define         VERSION         "1.0"
 102 
 103 /*
 104  * Keep track of possible 'orphan' entries in the name space
 105  */
 106 int             *md_nm_snarfed = NULL;
 107 
 108 /*
 109  * Global tunable giving the percentage of free space left in replica during
 110  * conversion of non-devid style replica to devid style replica.
 111  */
 112 int             md_conv_perc = MDDB_DEVID_CONV_PERC;
 113 
 114 #ifdef  DEBUG
 115 /* debug code to verify framework exclusion guarantees */
 116 int             md_in;
kmutex_t	md_in_mx;			/* protects md_in */
 118 #define IN_INIT         0x01
 119 #define IN_FINI         0x02
 120 #define IN_ATTACH       0x04
 121 #define IN_DETACH       0x08
 122 #define IN_OPEN         0x10
 123 #define MD_SET_IN(x) {                                          \
 124         mutex_enter(&md_in_mx);                                     \
 125         if (md_in)                                              \
 126                 debug_enter("MD_SET_IN exclusion lost");        \
 127         if (md_in & x)                                              \
 128                 debug_enter("MD_SET_IN already set");           \
 129         md_in |= x;                                             \
 130         mutex_exit(&md_in_mx);                                      \
 131 }
 132 
 133 #define MD_CLR_IN(x) {                                          \
 134         mutex_enter(&md_in_mx);                                     \
 135         if (md_in & ~(x))                                   \
 136                 debug_enter("MD_CLR_IN exclusion lost");        \
 137         if (!(md_in & x))                                   \
 138                 debug_enter("MD_CLR_IN already clr");           \
 139         md_in &= ~x;                                                \
 140         mutex_exit(&md_in_mx);                                      \
 141 }
 142 #else   /* DEBUG */
 143 #define MD_SET_IN(x)
 144 #define MD_CLR_IN(x)
 145 #endif  /* DEBUG */
 146 hrtime_t savetime1, savetime2;
 147 
 148 
 149 /*
 150  * list things protected by md_mx even if they aren't
 151  * used in this file.
 152  */
kmutex_t	md_mx;			/* protects md driver global state */
 154 kcondvar_t      md_cv;                  /* md_status events */
 155 int             md_status = 0;          /* global status for the meta-driver */
 156 int             md_num_daemons = 0;
 157 int             md_ioctl_cnt = 0;
 158 int             md_mtioctl_cnt = 0;     /* multithreaded ioctl cnt */
 159 uint_t          md_mdelay = 10;         /* variable so can be patched */
 160 
 161 int             (*mdv_strategy_tstpnt)(buf_t *, int, void*);
 162 
 163 major_t         md_major, md_major_targ;
 164 
 165 unit_t          md_nunits = MD_MAXUNITS;
 166 set_t           md_nsets = MD_MAXSETS;
 167 int             md_nmedh = 0;
 168 char            *md_med_trans_lst = NULL;
 169 md_set_t        md_set[MD_MAXSETS];
 170 md_set_io_t     md_set_io[MD_MAXSETS];
 171 
 172 md_krwlock_t    hsp_rwlp;               /* protects hot_spare_interface */
 173 md_krwlock_t    ni_rwlp;                /* protects notify_interface */
 174 md_ops_t        **md_ops = NULL;
 175 ddi_modhandle_t *md_mods = NULL;
 176 md_ops_t        *md_opslist;
 177 clock_t         md_hz;
 178 md_event_queue_t        *md_event_queue = NULL;
 179 
 180 int             md_in_upgrade;
 181 int             md_keep_repl_state;
 182 int             md_devid_destroy;
 183 
 184 /* for sending messages thru a door to userland */
 185 door_handle_t   mdmn_door_handle = NULL;
 186 int             mdmn_door_did = -1;
 187 
 188 dev_info_t              *md_devinfo = NULL;
 189 
 190 md_mn_nodeid_t  md_mn_mynode_id = ~0u;  /* My node id (for multi-node sets) */
 191 
 192 static  uint_t          md_ocnt[OTYPCNT];
 193 
 194 static int              mdinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
 195 static int              mdattach(dev_info_t *, ddi_attach_cmd_t);
 196 static int              mddetach(dev_info_t *, ddi_detach_cmd_t);
 197 static int              mdopen(dev_t *, int, int, cred_t *);
 198 static int              mdclose(dev_t, int, int, cred_t *);
 199 static int              mddump(dev_t, caddr_t, daddr_t, int);
 200 static int              mdread(dev_t, struct uio *, cred_t *);
 201 static int              mdwrite(dev_t, struct uio *, cred_t *);
 202 static int              mdaread(dev_t, struct aio_req *, cred_t *);
 203 static int              mdawrite(dev_t, struct aio_req *, cred_t *);
 204 static int              mdioctl(dev_t, int, intptr_t, int, cred_t *, int *);
 205 static int              mdprop_op(dev_t, dev_info_t *,
 206                                 ddi_prop_op_t, int, char *, caddr_t, int *);
 207 
 208 static struct cb_ops md_cb_ops = {
 209         mdopen,                 /* open */
 210         mdclose,                /* close */
 211         mdstrategy,             /* strategy */
 212                                 /* print routine -- none yet */
 213         (int(*)(dev_t, char *))nulldev,
 214         mddump,                 /* dump */
 215         mdread,                 /* read */
 216         mdwrite,                /* write */
 217         mdioctl,                /* ioctl */
 218                                 /* devmap */
 219         (int(*)(dev_t, devmap_cookie_t, offset_t, size_t, size_t *,
 220                         uint_t))nodev,
 221                                 /* mmap */
 222         (int(*)(dev_t, off_t, int))nodev,
 223                                 /* segmap */
 224         (int(*)(dev_t, off_t, struct as *, caddr_t *, off_t, unsigned,
 225                 unsigned, unsigned, cred_t *))nodev,
 226         nochpoll,               /* poll */
 227         mdprop_op,              /* prop_op */
 228         0,                      /* streamtab */
 229         (D_64BIT|D_MP|D_NEW),   /* driver compatibility flag */
 230         CB_REV,                 /* cb_ops version */
 231         mdaread,                /* aread */
 232         mdawrite,               /* awrite */
 233 };
 234 
 235 static struct dev_ops md_devops = {
 236         DEVO_REV,               /* dev_ops version */
 237         0,                      /* device reference count */
 238         mdinfo,                 /* info routine */
 239         nulldev,                /* identify routine */
 240         nulldev,                /* probe - not defined */
 241         mdattach,               /* attach routine */
 242         mddetach,               /* detach routine */
 243         nodev,                  /* reset - not defined */
 244         &md_cb_ops,         /* driver operations */
 245         NULL,                   /* bus operations */
 246         nodev,                  /* power management */
 247         ddi_quiesce_not_needed,         /* quiesce */
 248 };
 249 
 250 /*
 251  * loadable module wrapper
 252  */
 253 #include <sys/modctl.h>
 254 
 255 static struct modldrv modldrv = {
 256         &mod_driverops,                     /* type of module -- a pseudodriver */
 257         "Solaris Volume Manager base module", /* name of the module */
 258         &md_devops,                 /* driver ops */
 259 };
 260 
 261 static struct modlinkage modlinkage = {
 262         MODREV_1,
 263         { (void *)&modldrv, NULL }
 264 };
 265 
 266 
 267 /* md_medd.c */
 268 extern  void    med_init(void);
 269 extern  void    med_fini(void);
 270 extern  void    md_devid_cleanup(set_t, uint_t);
 271 
 272 /* md_names.c */
 273 extern struct nm_next_hdr       *get_first_record(set_t, int, int);
 274 
 275 int             md_maxphys      = 0;    /* maximum io size in bytes */
 276 #define         MD_MAXBCOUNT    (1024 * 1024)
 277 unsigned        md_maxbcount    = 0;    /* maximum physio size in bytes */
 278 
 279 /*
 280  * Some md ioctls trigger io framework device tree operations.  An
 281  * example is md ioctls that call md_resolve_bydevid(): which uses the
 282  * io framework to resolve a devid. Such operations result in acquiring
 283  * io framework locks (like ndi_devi_enter() of "/") while holding
 284  * driver locks (like md_unit_writerlock()).
 285  *
 286  * The prop_op(9E) entry point is called from the devinfo driver with
 287  * an active ndi_devi_enter of "/". To avoid deadlock, md's prop_op
 288  * implementation must avoid taking a lock that is held per above md
 289  * ioctl description: i.e. mdprop_op(9E) can't call md_unit_readerlock()
 290  * without risking deadlock.
 291  *
 292  * To service "size" requests without risking deadlock, we maintain a
 293  * "mnum->nblocks" sizemap (protected by a short-term global mutex).
 294  */
 295 static kmutex_t         md_nblocks_mutex;
 296 static mod_hash_t       *md_nblocksmap;         /* mnum -> nblocks */
 297 int                     md_nblocksmap_size = 512;
 298 
 299 /*
 300  * Maintain "mnum->nblocks" sizemap for mdprop_op use:
 301  *
 302  * Create: any code that establishes a unit's un_total_blocks needs the
 303  * following type of call to establish nblocks for mdprop_op():
 304  *      md_nblocks_set(mnum, un->c.un_total_blocks);"
 305  *      NOTE: locate via cscope md_create_minor_node/md_create_unit_incore
 306  *              ...or  "MD_UNIT..*="
 307  *
 308  * Change: any code that changes a unit's un_total_blocks needs the
 309  * following type of call to sync nblocks for mdprop_op():
 310  *      md_nblocks_set(mnum, un->c.un_total_blocks);"
 311  *      NOTE: locate via cscope for "un_total_blocks[ \t]*="
 312  *
 313  * Destroy: any code that deletes a unit needs the following type of call
 314  * to sync nblocks for mdprop_op():
 315  *      md_nblocks_set(mnum, -1ULL);
 316  *      NOTE: locate via cscope md_remove_minor_node/md_destroy_unit_incore
 317  *              ...or  "MD_UNIT..*="
 318  */
 319 void
 320 md_nblocks_set(minor_t mnum, uint64_t nblocks)
 321 {
 322         mutex_enter(&md_nblocks_mutex);
 323         if (nblocks == -1ULL)
 324                 (void) mod_hash_destroy(md_nblocksmap,
 325                     (mod_hash_key_t)(intptr_t)mnum);
 326         else
 327                 (void) mod_hash_replace(md_nblocksmap,
 328                     (mod_hash_key_t)(intptr_t)mnum,
 329                     (mod_hash_val_t)(intptr_t)nblocks);
 330         mutex_exit(&md_nblocks_mutex);
 331 }
 332 
 333 /* get the size of a mnum from "mnum->nblocks" sizemap */
 334 uint64_t
 335 md_nblocks_get(minor_t mnum)
 336 {
 337         mod_hash_val_t  hv;
 338 
 339         mutex_enter(&md_nblocks_mutex);
 340         if (mod_hash_find(md_nblocksmap,
 341             (mod_hash_key_t)(intptr_t)mnum, &hv) == 0) {
 342                 mutex_exit(&md_nblocks_mutex);
 343                 return ((uint64_t)(intptr_t)hv);
 344         }
 345         mutex_exit(&md_nblocks_mutex);
 346         return (0);
 347 }
 348 
 349 /* allocate/free dynamic space associated with driver globals */
 350 void
 351 md_global_alloc_free(int alloc)
 352 {
 353         set_t   s;
 354 
 355         if (alloc) {
 356                 /* initialize driver global locks */
 357                 cv_init(&md_cv, NULL, CV_DEFAULT, NULL);
 358                 mutex_init(&md_mx, NULL, MUTEX_DEFAULT, NULL);
 359                 rw_init(&md_unit_array_rw.lock, NULL, RW_DEFAULT, NULL);
 360                 rw_init(&nm_lock.lock, NULL, RW_DEFAULT, NULL);
 361                 rw_init(&ni_rwlp.lock, NULL, RW_DRIVER, NULL);
 362                 rw_init(&hsp_rwlp.lock, NULL, RW_DRIVER, NULL);
 363                 mutex_init(&md_cpr_resync.md_resync_mutex, NULL,
 364                     MUTEX_DEFAULT, NULL);
 365                 mutex_init(&md_nblocks_mutex, NULL, MUTEX_DEFAULT, NULL);
 366 
 367                 /* initialize per set driver global locks */
 368                 for (s = 0; s < MD_MAXSETS; s++) {
 369                         /* initialize per set driver globals locks */
 370                         mutex_init(&md_set[s].s_dbmx,
 371                             NULL, MUTEX_DEFAULT, NULL);
 372                         mutex_init(&md_set_io[s].md_io_mx,
 373                             NULL, MUTEX_DEFAULT, NULL);
 374                         cv_init(&md_set_io[s].md_io_cv,
 375                             NULL, CV_DEFAULT, NULL);
 376                 }
 377         } else {
 378                 /* destroy per set driver global locks */
 379                 for (s = 0; s < MD_MAXSETS; s++) {
 380                         cv_destroy(&md_set_io[s].md_io_cv);
 381                         mutex_destroy(&md_set_io[s].md_io_mx);
 382                         mutex_destroy(&md_set[s].s_dbmx);
 383                 }
 384 
 385                 /* destroy driver global locks */
 386                 mutex_destroy(&md_nblocks_mutex);
 387                 mutex_destroy(&md_cpr_resync.md_resync_mutex);
 388                 rw_destroy(&hsp_rwlp.lock);
 389                 rw_destroy(&ni_rwlp.lock);
 390                 rw_destroy(&nm_lock.lock);
 391                 rw_destroy(&md_unit_array_rw.lock);
 392                 mutex_destroy(&md_mx);
 393                 cv_destroy(&md_cv);
 394         }
 395 }
 396 
 397 int
 398 _init(void)
 399 {
 400         set_t   s;
 401         int     err;
 402 
 403         MD_SET_IN(IN_INIT);
 404 
 405         /* allocate dynamic space associated with driver globals */
 406         md_global_alloc_free(1);
 407 
 408         /* initialize driver globals */
 409         md_major = ddi_name_to_major("md");
 410         md_hz = drv_usectohz(NUM_USEC_IN_SEC);
 411 
 412         /* initialize tunable globals */
 413         if (md_maxphys == 0)            /* maximum io size in bytes */
 414                 md_maxphys = maxphys;
 415         if (md_maxbcount == 0)          /* maximum physio size in bytes */
 416                 md_maxbcount = MD_MAXBCOUNT;
 417 
 418         /* initialize per set driver globals */
 419         for (s = 0; s < MD_MAXSETS; s++)
 420                 md_set_io[s].io_state = MD_SET_ACTIVE;
 421 
 422         /*
 423          * NOTE: the framework does not currently guarantee exclusion
 424          * between _init and attach after calling mod_install.
 425          */
 426         MD_CLR_IN(IN_INIT);
 427         if ((err = mod_install(&modlinkage))) {
 428                 MD_SET_IN(IN_INIT);
 429                 md_global_alloc_free(0);        /* free dynamic space */
 430                 MD_CLR_IN(IN_INIT);
 431         }
 432         return (err);
 433 }
 434 
 435 int
 436 _fini(void)
 437 {
 438         int     err;
 439 
 440         /*
 441          * NOTE: the framework currently does not guarantee exclusion
 442          * with attach until after mod_remove returns 0.
 443          */
 444         if ((err = mod_remove(&modlinkage)))
 445                 return (err);
 446 
 447         MD_SET_IN(IN_FINI);
 448         md_global_alloc_free(0);        /* free dynamic space */
 449         MD_CLR_IN(IN_FINI);
 450         return (err);
 451 }
 452 
/* report module information through the loadable module framework */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}
 458 
/*
 * mdattach:
 *	attach(9E) entry point for the md pseudo driver.  Only DDI_ATTACH
 *	is supported, and only a single instance may attach (md_devinfo
 *	records the attached devinfo node).  On success the "admin" minor
 *	node exists, the driver daemons are running and the nblocks
 *	sizemap used by mdprop_op() has been created.  On failure the
 *	function jumps to attach_failure, which uses mddetach() to tear
 *	down whatever was set up so far.
 */
/* ARGSUSED */
static int
mdattach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int	len;
	unit_t	i;
	size_t	sz;
	char	ver[VERSION_LENGTH];
	char	**maj_str_array;
	char	*str, *str2;

	MD_SET_IN(IN_ATTACH);
	md_in_upgrade = 0;
	md_keep_repl_state = 0;
	md_devid_destroy = 0;

	if (cmd != DDI_ATTACH) {
		MD_CLR_IN(IN_ATTACH);
		return (DDI_FAILURE);
	}

	/* single instance driver: fail any second attach */
	if (md_devinfo != NULL) {
		MD_CLR_IN(IN_ATTACH);
		return (DDI_FAILURE);
	}

	mddb_init();

	if (md_start_daemons(TRUE)) {
		MD_CLR_IN(IN_ATTACH);
		mddb_unload();		/* undo mddb_init() allocations */
		return (DDI_FAILURE);
	}

	/* clear the halted state */
	md_clr_status(MD_GBL_HALTED);

	/* see if the diagnostic switch is on */
	if (ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "md_init_debug", 0))
		md_init_debug++;

	/* see if the failfast disable switch is on */
	if (ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "md_ff_disable", 0))
		md_ff_disable++;

	/* try and get the md_nmedh property; clamp to a sane host count */
	md_nmedh = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "md_nmedh", MED_DEF_HOSTS);
	if ((md_nmedh <= 0) || (md_nmedh > MED_MAX_HOSTS))
		md_nmedh = MED_DEF_HOSTS;

	/* try and get the md_med_trans_lst property; default to "tcp" */
	len = 0;
	if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN,
	    0, "md_med_trans_lst", NULL, &len) != DDI_PROP_SUCCESS ||
	    len == 0) {
		md_med_trans_lst = md_strdup("tcp");
	} else {
		md_med_trans_lst = kmem_zalloc((size_t)len, KM_SLEEP);
		if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN_AND_VAL_BUF,
		    0, "md_med_trans_lst", md_med_trans_lst, &len) !=
		    DDI_PROP_SUCCESS) {
			kmem_free(md_med_trans_lst, (size_t)len);
			md_med_trans_lst = md_strdup("tcp");
		}
	}

	/*
	 * Must initialize the internal data structures before any
	 * possible calls to 'goto attach_failure', as the _fini
	 * routine references them.
	 */
	med_init();

	md_ops = (md_ops_t **)kmem_zalloc(
	    sizeof (md_ops_t *) * MD_NOPS, KM_SLEEP);
	md_mods = (ddi_modhandle_t *)kmem_zalloc(
	    sizeof (ddi_modhandle_t) * MD_NOPS, KM_SLEEP);

	/* try and get the md_xlate property */
	/* Should we only do this if upgrade? */
	/*
	 * NOTE(review): ver[] is VERSION_LENGTH (6) bytes but only 5 are
	 * offered to ddi_prop_op(); enough for VERSION "1.0" plus NUL,
	 * though the two lengths look inconsistent — confirm intent.
	 */
	len = sizeof (char) * 5;
	if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN_AND_VAL_BUF,
	    0, "md_xlate_ver", ver, &len) == DDI_PROP_SUCCESS) {
		if (strcmp(ver, VERSION) == 0) {
			len = 0;
			if (ddi_prop_op(DDI_DEV_T_ANY, dip,
			    PROP_LEN_AND_VAL_ALLOC, 0, "md_xlate",
			    (caddr_t)&md_tuple_table, &len) !=
			    DDI_PROP_SUCCESS) {
				if (md_init_debug)
					cmn_err(CE_WARN,
					    "md_xlate ddi_prop_op failed");
				goto attach_failure;
			} else {
				/* each tuple is a pair of dev32_t values */
				md_tuple_length =
				    len/(2 * ((int)sizeof (dev32_t)));
				md_in_upgrade = 1;
			}

			/* Get target's name to major table */
			if (ddi_prop_lookup_string_array(DDI_DEV_T_ANY,
			    dip, DDI_PROP_DONTPASS,
			    "md_targ_nm_table", &maj_str_array,
			    &md_majortab_len) != DDI_PROP_SUCCESS) {
				md_majortab_len = 0;
				if (md_init_debug)
					cmn_err(CE_WARN, "md_targ_nm_table "
					    "ddi_prop_lookup_string_array "
					    "failed");
				goto attach_failure;
			}

			md_major_tuple_table =
			    (struct md_xlate_major_table *)
			    kmem_zalloc(md_majortab_len *
			    sizeof (struct md_xlate_major_table), KM_SLEEP);

			/* each array entry looks like "<drvname> <major>" */
			for (i = 0; i < md_majortab_len; i++) {
				/* Getting major name */
				str = strchr(maj_str_array[i], ' ');
				if (str == NULL)
					continue;
				*str = '\0';
				md_major_tuple_table[i].drv_name =
				    md_strdup(maj_str_array[i]);

				/* Simplified atoi to get major number */
				str2 = str + 1;
				md_major_tuple_table[i].targ_maj = 0;
				while ((*str2 >= '0') && (*str2 <= '9')) {
					md_major_tuple_table[i].targ_maj *= 10;
					md_major_tuple_table[i].targ_maj +=
					    *str2++ - '0';
				}
				/* restore separator before ddi_prop_free */
				*str = ' ';
			}
			ddi_prop_free((void *)maj_str_array);
		} else {
			if (md_init_debug)
				cmn_err(CE_WARN, "md_xlate_ver is incorrect");
			goto attach_failure;
		}
	}

	/*
	 * Check for properties:
	 *	md_keep_repl_state and md_devid_destroy
	 * and set globals if these exist.
	 */
	md_keep_repl_state = ddi_getprop(DDI_DEV_T_ANY, dip,
	    0, "md_keep_repl_state", 0);

	md_devid_destroy = ddi_getprop(DDI_DEV_T_ANY, dip,
	    0, "md_devid_destroy", 0);

	if (MD_UPGRADE)
		md_major_targ = md_targ_name_to_major("md");
	else
		md_major_targ = 0;

	/* allocate admin device node */
	if (ddi_create_priv_minor_node(dip, "admin", S_IFCHR,
	    MD_ADM_MINOR, DDI_PSEUDO, 0, NULL, PRIV_SYS_CONFIG, 0640))
		goto attach_failure;

	if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
	    DDI_KERNEL_IOCTL, NULL, 0) != DDI_SUCCESS)
		goto attach_failure;

	if (ddi_prop_update_int(DDI_DEV_T_NONE, dip,
	    "ddi-abrwrite-supported", 1) != DDI_SUCCESS)
		goto attach_failure;

	/* these could have been cleared by a detach */
	md_nunits = MD_MAXUNITS;
	md_nsets = MD_MAXSETS;

	/* per-unit arrays for the local set; may survive a prior attach */
	sz = sizeof (void *) * MD_MAXUNITS;
	if (md_set[0].s_un == NULL)
		md_set[0].s_un = kmem_zalloc(sz, KM_SLEEP);
	if (md_set[0].s_ui == NULL)
		md_set[0].s_ui = kmem_zalloc(sz, KM_SLEEP);

	md_devinfo = dip;

	/*
	 * Only allocate device node for root mirror metadevice.
	 * Don't pre-allocate unnecessary device nodes (thus slowing down a
	 * boot when we attach).
	 * We can't read the mddbs in attach.  The mddbs will be read
	 * by metainit during the boot process when it is doing the
	 * auto-take processing and any other minor nodes will be
	 * allocated at that point.
	 *
	 * There are two scenarios to be aware of here:
	 * 1) when we are booting from a mirrored root we need the root
	 *    metadevice to exist very early (during vfs_mountroot processing)
	 * 2) we need all of the nodes to be created so that any mnttab entries
	 *    will succeed (handled by metainit reading the mddb during boot).
	 */
	if (strncmp(SVM_PSEUDO_STR, svm_bootpath, sizeof (SVM_PSEUDO_STR) - 1)
	    == 0) {
		char *p;
		int mnum = 0;

		/*
		 * The svm_bootpath string looks something like
		 * /pseudo/md@0:0,150,blk where 150 is the minor number
		 * in this example so we need to set the pointer p onto
		 * the first digit of the minor number and convert it
		 * from ascii.
		 */
		for (p = svm_bootpath + sizeof (SVM_PSEUDO_STR) + 1;
		    *p >= '0' && *p <= '9'; p++) {
			mnum *= 10;
			mnum += *p - '0';
		}

		if (md_create_minor_node(0, mnum)) {
			kmem_free(md_set[0].s_un, sz);
			kmem_free(md_set[0].s_ui, sz);
			goto attach_failure;
		}
	}

	/* create the hash to store the meta device sizes */
	md_nblocksmap = mod_hash_create_idhash("md_nblocksmap",
	    md_nblocksmap_size, mod_hash_null_valdtor);

	MD_CLR_IN(IN_ATTACH);
	return (DDI_SUCCESS);

attach_failure:
	/*
	 * Use our own detach routine to toss any stuff we allocated above.
	 * NOTE: detach will call md_halt to free the mddb_init allocations.
	 */
	MD_CLR_IN(IN_ATTACH);
	if (mddetach(dip, DDI_DETACH) != DDI_SUCCESS)
		cmn_err(CE_WARN, "detach from attach failed");
	return (DDI_FAILURE);
}
 704 
 705 /* ARGSUSED */
 706 static int
 707 mddetach(dev_info_t *dip, ddi_detach_cmd_t cmd)
 708 {
 709         extern int      check_active_locators();
 710         set_t           s;
 711         size_t          sz;
 712         int             len;
 713 
 714         MD_SET_IN(IN_DETACH);
 715 
 716         /* check command */
 717         if (cmd != DDI_DETACH) {
 718                 MD_CLR_IN(IN_DETACH);
 719                 return (DDI_FAILURE);
 720         }
 721 
 722         /*
 723          * if we have not already halted yet we have no active config
 724          * then automatically initiate a halt so we can detach.
 725          */
 726         if (!(md_get_status() & MD_GBL_HALTED)) {
 727                 if (check_active_locators() == 0) {
 728                         /*
 729                          * NOTE: a successful md_halt will have done the
 730                          * mddb_unload to free allocations done in mddb_init
 731                          */
 732                         if (md_halt(MD_NO_GBL_LOCKS_HELD)) {
 733                                 cmn_err(CE_NOTE, "md:detach: "
 734                                     "Could not halt Solaris Volume Manager");
 735                                 MD_CLR_IN(IN_DETACH);
 736                                 return (DDI_FAILURE);
 737                         }
 738                 }
 739 
 740                 /* fail detach if we have not halted */
 741                 if (!(md_get_status() & MD_GBL_HALTED)) {
 742                         MD_CLR_IN(IN_DETACH);
 743                         return (DDI_FAILURE);
 744                 }
 745         }
 746 
 747         /* must be in halted state, this will be cleared on next attach */
 748         ASSERT(md_get_status() & MD_GBL_HALTED);
 749 
 750         /* cleanup attach allocations and initializations */
 751         md_major_targ = 0;
 752 
 753         sz = sizeof (void *) * md_nunits;
 754         for (s = 0; s < md_nsets; s++) {
 755                 if (md_set[s].s_un != NULL) {
 756                         kmem_free(md_set[s].s_un, sz);
 757                         md_set[s].s_un = NULL;
 758                 }
 759 
 760                 if (md_set[s].s_ui != NULL) {
 761                         kmem_free(md_set[s].s_ui, sz);
 762                         md_set[s].s_ui = NULL;
 763                 }
 764         }
 765         md_nunits = 0;
 766         md_nsets = 0;
 767         md_nmedh = 0;
 768 
 769         if (non_ff_drivers != NULL) {
 770                 int     i;
 771 
 772                 for (i = 0; non_ff_drivers[i] != NULL; i++)
 773                         kmem_free(non_ff_drivers[i],
 774                             strlen(non_ff_drivers[i]) + 1);
 775 
 776                 /* free i+1 entries because there is a null entry at list end */
 777                 kmem_free(non_ff_drivers, (i + 1) * sizeof (char *));
 778                 non_ff_drivers = NULL;
 779         }
 780 
 781         if (md_med_trans_lst != NULL) {
 782                 kmem_free(md_med_trans_lst, strlen(md_med_trans_lst) + 1);
 783                 md_med_trans_lst = NULL;
 784         }
 785 
 786         if (md_mods != NULL) {
 787                 kmem_free(md_mods, sizeof (ddi_modhandle_t) * MD_NOPS);
 788                 md_mods = NULL;
 789         }
 790 
 791         if (md_ops != NULL) {
 792                 kmem_free(md_ops, sizeof (md_ops_t *) * MD_NOPS);
 793                 md_ops = NULL;
 794         }
 795 
 796         if (MD_UPGRADE) {
 797                 len = md_tuple_length * (2 * ((int)sizeof (dev32_t)));
 798                 md_in_upgrade = 0;
 799                 md_xlate_free(len);
 800                 md_majortab_free();
 801         }
 802 
 803         /*
 804          * Undo what we did in mdattach, freeing resources
 805          * and removing things we installed.  The system
 806          * framework guarantees we are not active with this devinfo
 807          * node in any other entry points at this time.
 808          */
 809         ddi_prop_remove_all(dip);
 810         ddi_remove_minor_node(dip, NULL);
 811 
 812         med_fini();
 813 
 814         mod_hash_destroy_idhash(md_nblocksmap);
 815 
 816         md_devinfo = NULL;
 817 
 818         MD_CLR_IN(IN_DETACH);
 819         return (DDI_SUCCESS);
 820 }
 821 
 822 
 823 /*
 824  * Given the device number return the devinfo pointer
 825  * given to md via md_attach
 826  */
 827 /*ARGSUSED*/
 828 static int
 829 mdinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
 830 {
 831         int             error = DDI_FAILURE;
 832 
 833         switch (infocmd) {
 834         case DDI_INFO_DEVT2DEVINFO:
 835                 if (md_devinfo) {
 836                         *result = (void *)md_devinfo;
 837                         error = DDI_SUCCESS;
 838                 }
 839                 break;
 840 
 841         case DDI_INFO_DEVT2INSTANCE:
 842                 *result = (void *)0;
 843                 error = DDI_SUCCESS;
 844                 break;
 845         }
 846         return (error);
 847 }
 848 
 849 /*
 850  * property operation routine.  return the number of blocks for the partition
 851  * in question or forward the request to the property facilities.
 852  */
 853 static int
 854 mdprop_op(
 855         dev_t dev,              /* device number associated with device */
 856         dev_info_t *dip,        /* device info struct for this device */
 857         ddi_prop_op_t prop_op,  /* property operator */
 858         int mod_flags,          /* property flags */
 859         char *name,             /* name of property */
 860         caddr_t valuep,         /* where to put property value */
 861         int *lengthp)           /* put length of property here */
 862 {
 863         return (ddi_prop_op_nblocks(dev, dip, prop_op, mod_flags,
 864             name, valuep, lengthp, md_nblocks_get(getminor(dev))));
 865 }
 866 
 867 static void
 868 snarf_user_data(set_t setno)
 869 {
 870         mddb_recid_t            recid;
 871         mddb_recstatus_t        status;
 872 
 873         recid = mddb_makerecid(setno, 0);
 874         while ((recid = mddb_getnextrec(recid, MDDB_USER, 0)) > 0) {
 875                 if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
 876                         continue;
 877 
 878                 status = mddb_getrecstatus(recid);
 879                 if (status == MDDB_STALE)
 880                         continue;
 881 
 882                 if (status == MDDB_NODATA) {
 883                         mddb_setrecprivate(recid, MD_PRV_PENDDEL);
 884                         continue;
 885                 }
 886 
 887                 ASSERT(status == MDDB_OK);
 888 
 889                 mddb_setrecprivate(recid, MD_PRV_GOTIT);
 890         }
 891 }
 892 
/*
 * md_print_block_usage:
 *	Warn that the state database replicas lack the free space for
 *	'blks' additional blocks, and list every in-use replica that is
 *	too small, together with the number of blocks it would need to
 *	grow by to hold the whole database.
 */
static void
md_print_block_usage(mddb_set_t *s, uint_t blks)
{
	uint_t		ib;
	int		li;
	mddb_mb_ic_t	*mbip;
	uint_t		max_blk_needed;
	mddb_lb_t	*lbp;
	mddb_sidelocator_t	*slp;
	int		drv_index;
	md_splitname	sn;
	char		*name;
	char		*suffix;
	size_t		prefixlen;
	size_t		suffixlen;
	int		alloc_sz;


	/* blocks a single replica must hold: current usage plus request */
	max_blk_needed = s->s_totalblkcnt - s->s_freeblkcnt + blks;

	cmn_err(CE_WARN, "Blocks in Metadevice State Database: %d\n"
	    "            Additional Blocks Needed:            %d\n\n"
	    "            Increase size of following replicas for\n"
	    "            device relocatability by deleting listed\n"
	    "            replica and re-adding replica with\n"
	    "            increased size (see metadb(1M)):\n"
	    "                Replica                   Increase By",
	    s->s_totalblkcnt, (blks - s->s_freeblkcnt));

	lbp = s->s_lbp;

	for (li = 0; li < lbp->lb_loccnt; li++) {
		/* skip locators that no longer name a replica */
		if (lbp->lb_locators[li].l_flags & MDDB_F_DELETED)
			continue;
		/* total this replica's size over its master-block chain */
		ib = 0;
		for (mbip = s->s_mbiarray[li]; mbip != NULL;
		    mbip = mbip->mbi_next) {
			ib += (uint_t)mbip->mbi_mddb_mb.mb_blkcnt;
		}
		if (ib == 0)
			continue;
		if (ib < max_blk_needed) {
			slp = &lbp->lb_sidelocators[s->s_sideno][li];
			drv_index = slp->l_drvnm_index;
			mddb_locatorblock2splitname(s->s_lnp, li, s->s_sideno,
			    &sn);
			prefixlen = SPN_PREFIX(&sn).pre_len;
			suffixlen = SPN_SUFFIX(&sn).suf_len;
			/* build "<prefix>/<suffix>" (+2: slash and NUL) */
			alloc_sz = (int)(prefixlen + suffixlen + 2);
			name = (char *)kmem_alloc(alloc_sz, KM_SLEEP);
			(void) strncpy(name, SPN_PREFIX(&sn).pre_data,
			    prefixlen);
			name[prefixlen] = '/';
			suffix = name + (prefixlen + 1);
			(void) strncpy(suffix, SPN_SUFFIX(&sn).suf_data,
			    suffixlen);
			name[prefixlen + suffixlen + 1] = '\0';
			cmn_err(CE_WARN,
			    "  %s (%s:%d:%d)   %d blocks",
			    name, lbp->lb_drvnm[drv_index].dn_data,
			    slp->l_mnum, lbp->lb_locators[li].l_blkno,
			    (max_blk_needed - ib));
			kmem_free(name, alloc_sz);
		}
	}
}
 959 
 960 /*
 961  * md_create_minor_node:
 962  *      Create the minor device for the given set and un_self_id.
 963  *
 964  * Input:
 965  *      setno   - set number
 966  *      mnum    - selfID of unit
 967  *
 968  * Output:
 969  *      None.
 970  *
 971  * Returns 0 for success, 1 for failure.
 972  *
 973  * Side-effects:
 974  *      None.
 975  */
 976 int
 977 md_create_minor_node(set_t setno, minor_t mnum)
 978 {
 979         char            name[20];
 980 
 981         /* Check for valid arguments */
 982         if (setno >= MD_MAXSETS || MD_MIN2UNIT(mnum) >= MD_MAXUNITS)
 983                 return (1);
 984 
 985         (void) snprintf(name, 20, "%u,%u,blk",
 986             (unsigned)setno, (unsigned)MD_MIN2UNIT(mnum));
 987 
 988         if (ddi_create_minor_node(md_devinfo, name, S_IFBLK,
 989             MD_MKMIN(setno, mnum), DDI_PSEUDO, 0))
 990                 return (1);
 991 
 992         (void) snprintf(name, 20, "%u,%u,raw",
 993             (unsigned)setno, (unsigned)MD_MIN2UNIT(mnum));
 994 
 995         if (ddi_create_minor_node(md_devinfo, name, S_IFCHR,
 996             MD_MKMIN(setno, mnum), DDI_PSEUDO, 0))
 997                 return (1);
 998 
 999         return (0);
1000 }
1001 
1002 /*
1003  * For a given key check if it is an orphaned record.
1004  * The following conditions are used to determine an orphan.
1005  * 1. The device associated with that key is not a metadevice.
1006  * 2. If DEVID_STYLE then the physical device does not have a device Id
1007  * associated with it.
1008  *
1009  * If a key does not have an entry in the devid namespace it could be
1010  * a device that does not support device ids. Hence the record is not
1011  * deleted.
1012  */
1013 
1014 static int
1015 md_verify_orphaned_record(set_t setno, mdkey_t key)
1016 {
1017         md_dev64_t      odev; /* orphaned dev */
1018         mddb_set_t      *s;
1019         side_t          side = 0;
1020         struct nm_next_hdr      *did_nh = NULL;
1021 
1022         s = (mddb_set_t *)md_set[setno].s_db;
1023         if ((did_nh = get_first_record(setno, 1,  (NM_DEVID | NM_NOTSHARED)))
1024             == NULL)
1025                 return (0);
1026         /*
1027          * If devid style is set then get the dev_t using MD_NOTRUST_DEVT
1028          */
1029         if (s->s_lbp->lb_flags & MDDB_DEVID_STYLE) {
1030                 odev = md_getdevnum(setno, side, key, MD_NOTRUST_DEVT);
1031                 if ((odev == NODEV64) || (md_getmajor(odev) == md_major))
1032                         return (0);
1033                 if (lookup_entry(did_nh, setno, side, key, odev, NM_DEVID) ==
1034                     NULL)
1035                         return (1);
1036         }
1037         return (0);
1038 }
1039 
/*
 * md_snarf_db_set:
 *	Incorporate ("snarf") the state database of the given set into
 *	the kernel: start the md daemons if needed, load the primary and
 *	device-id namespaces, optionally convert the replica to device-id
 *	format, load any modules the database references, let each loaded
 *	driver snarf its own records, then commit, delete and clean up
 *	pending records and orphaned namespace entries.
 *
 * Returns 0 on success (including when the set was already snarfed),
 * -1 on failure (with *ep set when ep is non-NULL).
 */
int
md_snarf_db_set(set_t setno, md_error_t *ep)
{
	int			err = 0;
	int			i;
	mddb_recid_t		recid;
	mddb_type_t		drvrid;
	mddb_recstatus_t	status;
	md_ops_t		*ops;
	uint_t			privat;
	mddb_set_t		*s;
	uint_t			cvt_blks;
	struct nm_next_hdr	*nh;
	mdkey_t			key = MD_KEYWILD;
	side_t			side = 0;
	int			size;
	int			devid_flag;
	int			retval;
	uint_t			un;
	int			un_next_set = 0;

	/* single-thread snarf/halt for this set */
	md_haltsnarf_enter(setno);

	mutex_enter(&md_mx);
	if (md_set[setno].s_status & MD_SET_SNARFED) {
		/* nothing to do - already snarfed */
		mutex_exit(&md_mx);
		md_haltsnarf_exit(setno);
		return (0);
	}
	mutex_exit(&md_mx);

	if (! (md_get_status() & MD_GBL_DAEMONS_LIVE)) {
		if (md_start_daemons(TRUE)) {
			if (ep != NULL)
				(void) mdsyserror(ep, ENXIO);
			err = -1;
			goto out;
		}
	}


	/*
	 * Load the devid name space if it exists
	 */
	(void) md_load_namespace(setno, NULL, NM_DEVID);
	if (!md_load_namespace(setno, ep, 0L)) {
		/*
		 * Unload the devid namespace
		 */
		(void) md_unload_namespace(setno, NM_DEVID);
		err = -1;
		goto out;
	}

	/*
	 * If replica is in non-devid state, convert if:
	 *	- not in probe during upgrade (md_keep_repl_state = 0)
	 *	- enough space available in replica
	 *	- local set
	 *	- not a multi-node diskset
	 *	- clustering is not present (for non-local set)
	 */
	s = (mddb_set_t *)md_set[setno].s_db;
	devid_flag = 0;
	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE) && !md_keep_repl_state)
		devid_flag = 1;
	if (cluster_bootflags & CLUSTER_CONFIGURED)
		if (setno != MD_LOCAL_SET)
			devid_flag = 0;
	if (MD_MNSET_SETNO(setno))
		devid_flag = 0;
	if ((md_devid_destroy == 1) && (md_keep_repl_state == 1))
		devid_flag = 0;

	/*
	 * if we weren't devid style before and md_keep_repl_state=1
	 * we need to stay non-devid
	 */
	if ((md_keep_repl_state == 1) &&
	    ((s->s_lbp->lb_flags & MDDB_DEVID_STYLE) == 0))
		devid_flag = 0;
	if (devid_flag) {
		/*
		 * Determine number of free blocks needed to convert
		 * entire replica to device id format - locator blocks
		 * and namespace.
		 */
		cvt_blks = 0;
		if (mddb_lb_did_convert(s, 0, &cvt_blks) != 0) {
			if (ep != NULL)
				(void) mdsyserror(ep, EIO);
			err = -1;
			goto out;

		}
		cvt_blks += md_nm_did_chkspace(setno);

		/* add MDDB_DEVID_CONV_PERC% safety margin */
		if ((md_conv_perc > 0) && (md_conv_perc <= 100)) {
			cvt_blks = cvt_blks * (100 + md_conv_perc) / 100;
		}

		if (cvt_blks <= s->s_freeblkcnt) {
			/* enough room: perform the conversion for real */
			if (mddb_lb_did_convert(s, 1, &cvt_blks) != 0) {
				if (ep != NULL)
					(void) mdsyserror(ep, EIO);
				err = -1;
				goto out;
			}

		} else {
			/*
			 * Print message that replica can't be converted for
			 * lack of space.   No failure - just continue to
			 * run without device ids.
			 */
			cmn_err(CE_WARN,
			    "Unable to add Solaris Volume Manager device "
			    "relocation data.\n"
			    "          To use device relocation feature:\n"
			    "          - Increase size of listed replicas\n"
			    "          - Reboot");
			md_print_block_usage(s, cvt_blks);
			cmn_err(CE_WARN,
			    "Loading set without device relocation data.\n"
			    "          Solaris Volume Manager disk movement "
			    "not tracked in local set.");
		}
	}

	/*
	 * go through and load any modules referenced in
	 * data base
	 */
	recid = mddb_makerecid(setno, 0);
	while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) {
		status = mddb_getrecstatus(recid);
		if (status == MDDB_STALE) {
			if (! (md_get_setstatus(setno) & MD_SET_STALE)) {
				md_set_setstatus(setno, MD_SET_STALE);
				cmn_err(CE_WARN,
				    "md: state database is stale");
			}
		} else if (status == MDDB_NODATA) {
			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
			continue;
		}
		drvrid = mddb_getrectype1(recid);
		if (drvrid < MDDB_FIRST_MODID)
			continue;
		if (md_loadsubmod(setno, md_getshared_name(setno, drvrid),
		    drvrid) < 0) {
			cmn_err(CE_NOTE, "md: could not load misc/%s",
			    md_getshared_name(setno, drvrid));
		}
	}

	if (recid < 0)
		goto out;

	/* claim all user (MDDB_USER) records */
	snarf_user_data(setno);

	/*
	 * Initialize the md_nm_snarfed array
	 * this array is indexed by the key and
	 * is set by md_getdevnum during the snarf time
	 */
	if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) != NULL) {
		size = (int)((((struct nm_rec_hdr *)nh->nmn_record)->
		    r_next_key) * (sizeof (int)));
		md_nm_snarfed = (int *)kmem_zalloc(size, KM_SLEEP);
	}

	/*
	 * go through and snarf until nothing gets added
	 */
	do {
		i = 0;
		for (ops = md_opslist; ops != NULL; ops = ops->md_next) {
			if (ops->md_snarf != NULL) {
				retval = ops->md_snarf(MD_SNARF_DOIT, setno);
				if (retval == -1) {
					err = -1;
					/* Don't know the failed unit */
					(void) mdmderror(ep, MDE_RR_ALLOC_ERROR,
					    0);
					(void) md_halt_set(setno, MD_HALT_ALL);
					(void) mddb_unload_set(setno);
					md_haltsnarf_exit(setno);
					return (err);
				} else {
					i += retval;
				}
			}
		}
	} while (i);

	/*
	 * Set the first available slot and availability
	 */
	md_set[setno].s_un_avail = 0;
	for (un = 0; un < MD_MAXUNITS; un++) {
		if (md_set[setno].s_un[un] != NULL) {
			continue;
		} else {
			if (!un_next_set) {
				md_set[setno].s_un_next = un;
				un_next_set = 1;
			}
			md_set[setno].s_un_avail++;
		}
	}

	md_set_setstatus(setno, MD_SET_SNARFED);

	/* commit every record flagged for commit during snarf */
	recid = mddb_makerecid(setno, 0);
	while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) {
		privat = mddb_getrecprivate(recid);
		if (privat & MD_PRV_COMMIT) {
			if (mddb_commitrec(recid)) {
				if (!(md_get_setstatus(setno) & MD_SET_STALE)) {
					md_set_setstatus(setno, MD_SET_STALE);
					cmn_err(CE_WARN,
					    "md: state database is stale");
				}
			}
			mddb_setrecprivate(recid, MD_PRV_GOTIT);
		}
	}

	/* Deletes must happen after all the commits */
	recid = mddb_makerecid(setno, 0);
	while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) {
		privat = mddb_getrecprivate(recid);
		if (privat & MD_PRV_DELETE) {
			if (mddb_deleterec(recid)) {
				if (!(md_get_setstatus(setno) & MD_SET_STALE)) {
					md_set_setstatus(setno, MD_SET_STALE);
					cmn_err(CE_WARN,
					    "md: state database is stale");
				}
				mddb_setrecprivate(recid, MD_PRV_GOTIT);
			}
			/* deletion invalidates the walk; restart it */
			recid = mddb_makerecid(setno, 0);
		}
	}

	/*
	 * go through and clean up records until nothing gets cleaned up.
	 */
	do {
		i = 0;
		for (ops = md_opslist; ops != NULL; ops = ops->md_next)
			if (ops->md_snarf != NULL)
				i += ops->md_snarf(MD_SNARF_CLEANUP, setno);
	} while (i);

	if (md_nm_snarfed != NULL &&
	    !(md_get_setstatus(setno) & MD_SET_STALE)) {
		/*
		 * go thru and cleanup the namespace and the device id
		 * name space
		 */
		for (key = 1;
		    key < ((struct nm_rec_hdr *)nh->nmn_record)->r_next_key;
		    key++) {
			/*
			 * Is the entry an 'orphan'?
			 */
			if (lookup_entry(nh, setno, side, key, NODEV64, 0L) !=
			    NULL) {
				/*
				 * If the key was never seen during snarf then
				 * it is apparently not part of the current
				 * configuration; remove it.  This can happen
				 * when the system panics between the primary
				 * name space update and the device id name
				 * space update.
				 */
				if (md_nm_snarfed[key] == 0) {
					if (md_verify_orphaned_record(setno,
					    key) == 1)
						(void) remove_entry(nh,
						    side, key, 0L);
				}
			}
		}
	}

	if (md_nm_snarfed != NULL) {
		/*
		 * Done and free the memory
		 */
		kmem_free(md_nm_snarfed, size);
		md_nm_snarfed = NULL;
	}

	if (s->s_lbp->lb_flags & MDDB_DEVID_STYLE &&
	    !(md_get_setstatus(setno) & MD_SET_STALE)) {
		/*
		 * if the destroy flag has been set and
		 * the MD_SET_DIDCLUP bit is not set in
		 * the set's status field, cleanup the
		 * entire device id namespace
		 */
		if (md_devid_destroy &&
		    !(md_get_setstatus(setno) & MD_SET_DIDCLUP)) {
			(void) md_devid_cleanup(setno, 1);
			md_set_setstatus(setno, MD_SET_DIDCLUP);
		} else
			(void) md_devid_cleanup(setno, 0);
	}

	/*
	 * clear single threading on snarf, return success or error
	 */
out:
	md_haltsnarf_exit(setno);
	return (err);
}
1359 
1360 void
1361 get_minfo(struct dk_minfo *info, minor_t mnum)
1362 {
1363         md_unit_t       *un;
1364         mdi_unit_t      *ui;
1365 
1366         info->dki_capacity = 0;
1367         info->dki_lbsize = 0;
1368         info->dki_media_type = 0;
1369 
1370         if ((ui = MDI_UNIT(mnum)) == NULL) {
1371                 return;
1372         }
1373         un = (md_unit_t *)md_unit_readerlock(ui);
1374         info->dki_capacity = un->c.un_total_blocks;
1375         md_unit_readerexit(ui);
1376         info->dki_lbsize = DEV_BSIZE;
1377         info->dki_media_type = DK_UNKNOWN;
1378 }
1379 
1380 
1381 void
1382 get_info(struct dk_cinfo *info, minor_t mnum)
1383 {
1384         /*
1385          * Controller Information
1386          */
1387         info->dki_ctype = DKC_MD;
1388         info->dki_cnum = ddi_get_instance(ddi_get_parent(md_devinfo));
1389         (void) strcpy(info->dki_cname,
1390             ddi_get_name(ddi_get_parent(md_devinfo)));
1391         /*
1392          * Unit Information
1393          */
1394         info->dki_unit = mnum;
1395         info->dki_slave = 0;
1396         (void) strcpy(info->dki_dname, ddi_driver_name(md_devinfo));
1397         info->dki_flags = 0;
1398         info->dki_partition = 0;
1399         info->dki_maxtransfer = (ushort_t)(md_maxphys / DEV_BSIZE);
1400 
1401         /*
1402          * We can't get from here to there yet
1403          */
1404         info->dki_addr = 0;
1405         info->dki_space = 0;
1406         info->dki_prio = 0;
1407         info->dki_vec = 0;
1408 }
1409 
1410 /*
1411  * open admin device
1412  */
1413 static int
1414 mdadminopen(
1415         int     flag,
1416         int     otyp)
1417 {
1418         int     err = 0;
1419 
1420         /* single thread */
1421         mutex_enter(&md_mx);
1422 
1423         /* check type and flags */
1424         if ((otyp != OTYP_CHR) && (otyp != OTYP_LYR)) {
1425                 err = EINVAL;
1426                 goto out;
1427         }
1428         if (((flag & FEXCL) && (md_status & MD_GBL_OPEN)) ||
1429             (md_status & MD_GBL_EXCL)) {
1430                 err = EBUSY;
1431                 goto out;
1432         }
1433 
1434         /* count and flag open */
1435         md_ocnt[otyp]++;
1436         md_status |= MD_GBL_OPEN;
1437         if (flag & FEXCL)
1438                 md_status |= MD_GBL_EXCL;
1439 
1440         /* unlock return success */
1441 out:
1442         mutex_exit(&md_mx);
1443         return (err);
1444 }
1445 
1446 /*
1447  * open entry point
1448  */
1449 static int
1450 mdopen(
1451         dev_t           *dev,
1452         int             flag,
1453         int             otyp,
1454         cred_t          *cred_p)
1455 {
1456         minor_t         mnum = getminor(*dev);
1457         unit_t          unit = MD_MIN2UNIT(mnum);
1458         set_t           setno = MD_MIN2SET(mnum);
1459         mdi_unit_t      *ui = NULL;
1460         int             err = 0;
1461         md_parent_t     parent;
1462 
1463         /* dispatch admin device opens */
1464         if (mnum == MD_ADM_MINOR)
1465                 return (mdadminopen(flag, otyp));
1466 
1467         /* lock, check status */
1468         rw_enter(&md_unit_array_rw.lock, RW_READER);
1469 
1470 tryagain:
1471         if (md_get_status() & MD_GBL_HALTED)  {
1472                 err = ENODEV;
1473                 goto out;
1474         }
1475 
1476         /* check minor */
1477         if ((setno >= md_nsets) || (unit >= md_nunits)) {
1478                 err = ENXIO;
1479                 goto out;
1480         }
1481 
1482         /* make sure we're snarfed */
1483         if ((md_get_setstatus(MD_LOCAL_SET) & MD_SET_SNARFED) == 0) {
1484                 if (md_snarf_db_set(MD_LOCAL_SET, NULL) != 0) {
1485                         err = ENODEV;
1486                         goto out;
1487                 }
1488         }
1489         if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0) {
1490                 err = ENODEV;
1491                 goto out;
1492         }
1493 
1494         /* check unit */
1495         if ((ui = MDI_UNIT(mnum)) == NULL) {
1496                 err = ENXIO;
1497                 goto out;
1498         }
1499 
1500         /*
1501          * The softpart open routine may do an I/O during the open, in
1502          * which case the open routine will set the OPENINPROGRESS flag
1503          * and drop all locks during the I/O.  If this thread sees
1504          * the OPENINPROGRESS flag set, if should wait until the flag
1505          * is reset before calling the driver's open routine.  It must
1506          * also revalidate the world after it grabs the unit_array lock
1507          * since the set may have been released or the metadevice cleared
1508          * during the sleep.
1509          */
1510         if (MD_MNSET_SETNO(setno)) {
1511                 mutex_enter(&ui->ui_mx);
1512                 if (ui->ui_lock & MD_UL_OPENINPROGRESS) {
1513                         rw_exit(&md_unit_array_rw.lock);
1514                         cv_wait(&ui->ui_cv, &ui->ui_mx);
1515                         rw_enter(&md_unit_array_rw.lock, RW_READER);
1516                         mutex_exit(&ui->ui_mx);
1517                         goto tryagain;
1518                 }
1519                 mutex_exit(&ui->ui_mx);
1520         }
1521 
1522         /* Test if device is openable */
1523         if ((ui->ui_tstate & MD_NOTOPENABLE) != 0) {
1524                 err = ENXIO;
1525                 goto out;
1526         }
1527 
1528         /* don't allow opens w/WRITE flag if stale */
1529         if ((flag & FWRITE) && (md_get_setstatus(setno) & MD_SET_STALE)) {
1530                 err = EROFS;
1531                 goto out;
1532         }
1533 
1534         /* don't allow writes to subdevices */
1535         parent = md_get_parent(md_expldev(*dev));
1536         if ((flag & FWRITE) && MD_HAS_PARENT(parent)) {
1537                 err = EROFS;
1538                 goto out;
1539         }
1540 
1541         /* open underlying driver */
1542         if (md_ops[ui->ui_opsindex]->md_open != NULL) {
1543                 if ((err = (*md_ops[ui->ui_opsindex]->md_open)
1544                     (dev, flag, otyp, cred_p, 0)) != 0)
1545                         goto out;
1546         }
1547 
1548         /* or do it ourselves */
1549         else {
1550                 /* single thread */
1551                 (void) md_unit_openclose_enter(ui);
1552                 err = md_unit_incopen(mnum, flag, otyp);
1553                 md_unit_openclose_exit(ui);
1554                 if (err != 0)
1555                         goto out;
1556         }
1557 
1558         /* unlock, return status */
1559 out:
1560         rw_exit(&md_unit_array_rw.lock);
1561         return (err);
1562 }
1563 
1564 /*
1565  * close admin device
1566  */
1567 static int
1568 mdadminclose(
1569         int     otyp)
1570 {
1571         int     i;
1572         int     err = 0;
1573 
1574         /* single thread */
1575         mutex_enter(&md_mx);
1576 
1577         /* check type and flags */
1578         if ((otyp < 0) || (otyp >= OTYPCNT)) {
1579                 err = EINVAL;
1580                 goto out;
1581         } else if (md_ocnt[otyp] == 0) {
1582                 err = ENXIO;
1583                 goto out;
1584         }
1585 
1586         /* count and flag closed */
1587         if (otyp == OTYP_LYR)
1588                 md_ocnt[otyp]--;
1589         else
1590                 md_ocnt[otyp] = 0;
1591         md_status &= ~MD_GBL_OPEN;
1592         for (i = 0; (i < OTYPCNT); ++i)
1593                 if (md_ocnt[i] != 0)
1594                         md_status |= MD_GBL_OPEN;
1595         if (! (md_status & MD_GBL_OPEN))
1596                 md_status &= ~MD_GBL_EXCL;
1597 
1598         /* unlock return success */
1599 out:
1600         mutex_exit(&md_mx);
1601         return (err);
1602 }
1603 
1604 /*
1605  * close entry point
1606  */
1607 static int
1608 mdclose(
1609         dev_t           dev,
1610         int             flag,
1611         int             otyp,
1612         cred_t          *cred_p)
1613 {
1614         minor_t         mnum = getminor(dev);
1615         set_t           setno = MD_MIN2SET(mnum);
1616         unit_t          unit = MD_MIN2UNIT(mnum);
1617         mdi_unit_t      *ui = NULL;
1618         int             err = 0;
1619 
1620         /* dispatch admin device closes */
1621         if (mnum == MD_ADM_MINOR)
1622                 return (mdadminclose(otyp));
1623 
1624         /* check minor */
1625         if ((setno >= md_nsets) || (unit >= md_nunits) ||
1626             ((ui = MDI_UNIT(mnum)) == NULL)) {
1627                 err = ENXIO;
1628                 goto out;
1629         }
1630 
1631         /* close underlying driver */
1632         if (md_ops[ui->ui_opsindex]->md_close != NULL) {
1633                 if ((err = (*md_ops[ui->ui_opsindex]->md_close)
1634                     (dev, flag, otyp, cred_p, 0)) != 0)
1635                         goto out;
1636         }
1637 
1638         /* or do it ourselves */
1639         else {
1640                 /* single thread */
1641                 (void) md_unit_openclose_enter(ui);
1642                 err = md_unit_decopen(mnum, otyp);
1643                 md_unit_openclose_exit(ui);
1644                 if (err != 0)
1645                         goto out;
1646         }
1647 
1648         /* return success */
1649 out:
1650         return (err);
1651 }
1652 
1653 
1654 /*
1655  * This routine performs raw read operations.  It is called from the
1656  * device switch at normal priority.
1657  *
1658  * The main catch is that the *uio struct which is passed to us may
1659  * specify a read which spans two buffers, which would be contiguous
1660  * on a single partition,  but not on a striped partition. This will
1661  * be handled by mdstrategy.
1662  */
1663 /*ARGSUSED*/
1664 static int
1665 mdread(dev_t dev, struct uio *uio, cred_t *credp)
1666 {
1667         minor_t         mnum;
1668         mdi_unit_t      *ui;
1669         int             error;
1670 
1671         if (((mnum = getminor(dev)) == MD_ADM_MINOR) ||
1672             (MD_MIN2SET(mnum) >= md_nsets) ||
1673             (MD_MIN2UNIT(mnum) >= md_nunits) ||
1674             ((ui = MDI_UNIT(mnum)) == NULL))
1675                 return (ENXIO);
1676 
1677         if (md_ops[ui->ui_opsindex]->md_read  != NULL)
1678                 return ((*md_ops[ui->ui_opsindex]->md_read)
1679                     (dev, uio, credp));
1680 
1681         if ((error = md_chk_uio(uio)) != 0)
1682                 return (error);
1683 
1684         return (physio(mdstrategy, NULL, dev, B_READ, md_minphys, uio));
1685 }
1686 
1687 /*
1688  * This routine performs async raw read operations.  It is called from the
1689  * device switch at normal priority.
1690  *
1691  * The main catch is that the *aio struct which is passed to us may
1692  * specify a read which spans two buffers, which would be contiguous
1693  * on a single partition,  but not on a striped partition. This will
1694  * be handled by mdstrategy.
1695  */
1696 /*ARGSUSED*/
1697 static int
1698 mdaread(dev_t dev, struct aio_req *aio, cred_t *credp)
1699 {
1700         minor_t         mnum;
1701         mdi_unit_t      *ui;
1702         int             error;
1703 
1704 
1705         if (((mnum = getminor(dev)) == MD_ADM_MINOR) ||
1706             (MD_MIN2SET(mnum) >= md_nsets) ||
1707             (MD_MIN2UNIT(mnum) >= md_nunits) ||
1708             ((ui = MDI_UNIT(mnum)) == NULL))
1709                 return (ENXIO);
1710 
1711         if (md_ops[ui->ui_opsindex]->md_aread  != NULL)
1712                 return ((*md_ops[ui->ui_opsindex]->md_aread)
1713                     (dev, aio, credp));
1714 
1715         if ((error = md_chk_uio(aio->aio_uio)) != 0)
1716                 return (error);
1717 
1718         return (aphysio(mdstrategy, anocancel, dev, B_READ, md_minphys, aio));
1719 }
1720 
1721 /*
1722  * This routine performs raw write operations.  It is called from the
1723  * device switch at normal priority.
1724  *
1725  * The main catch is that the *uio struct which is passed to us may
1726  * specify a write which spans two buffers, which would be contiguous
1727  * on a single partition,  but not on a striped partition. This is
1728  * handled by mdstrategy.
1729  *
1730  */
1731 /*ARGSUSED*/
1732 static int
1733 mdwrite(dev_t dev, struct uio *uio, cred_t *credp)
1734 {
1735         minor_t         mnum;
1736         mdi_unit_t      *ui;
1737         int             error;
1738 
1739         if (((mnum = getminor(dev)) == MD_ADM_MINOR) ||
1740             (MD_MIN2SET(mnum) >= md_nsets) ||
1741             (MD_MIN2UNIT(mnum) >= md_nunits) ||
1742             ((ui = MDI_UNIT(mnum)) == NULL))
1743                 return (ENXIO);
1744 
1745         if (md_ops[ui->ui_opsindex]->md_write  != NULL)
1746                 return ((*md_ops[ui->ui_opsindex]->md_write)
1747                     (dev, uio, credp));
1748 
1749         if ((error = md_chk_uio(uio)) != 0)
1750                 return (error);
1751 
1752         return (physio(mdstrategy, NULL, dev, B_WRITE, md_minphys, uio));
1753 }
1754 
1755 /*
1756  * This routine performs async raw write operations.  It is called from the
1757  * device switch at normal priority.
1758  *
1759  * The main catch is that the *aio struct which is passed to us may
1760  * specify a write which spans two buffers, which would be contiguous
1761  * on a single partition,  but not on a striped partition. This is
1762  * handled by mdstrategy.
1763  *
1764  */
1765 /*ARGSUSED*/
1766 static int
1767 mdawrite(dev_t dev, struct aio_req *aio, cred_t *credp)
1768 {
1769         minor_t         mnum;
1770         mdi_unit_t      *ui;
1771         int             error;
1772 
1773 
1774         if (((mnum = getminor(dev)) == MD_ADM_MINOR) ||
1775             (MD_MIN2SET(mnum) >= md_nsets) ||
1776             (MD_MIN2UNIT(mnum) >= md_nunits) ||
1777             ((ui = MDI_UNIT(mnum)) == NULL))
1778                 return (ENXIO);
1779 
1780         if (md_ops[ui->ui_opsindex]->md_awrite  != NULL)
1781                 return ((*md_ops[ui->ui_opsindex]->md_awrite)
1782                     (dev, aio, credp));
1783 
1784         if ((error = md_chk_uio(aio->aio_uio)) != 0)
1785                 return (error);
1786 
1787         return (aphysio(mdstrategy, anocancel, dev, B_WRITE, md_minphys, aio));
1788 }
1789 
1790 int
1791 mdstrategy(struct buf *bp)
1792 {
1793         minor_t         mnum;
1794         mdi_unit_t      *ui;
1795 
1796         ASSERT((bp->b_flags & B_DONE) == 0);
1797 
1798         if (panicstr)
1799                 md_clr_status(MD_GBL_DAEMONS_LIVE);
1800 
1801         if (((mnum = getminor(bp->b_edev)) == MD_ADM_MINOR) ||
1802             (MD_MIN2SET(mnum) >= md_nsets) ||
1803             (MD_MIN2UNIT(mnum) >= md_nunits) ||
1804             ((ui = MDI_UNIT(mnum)) == NULL)) {
1805                 bp->b_flags |= B_ERROR;
1806                 bp->b_error = ENXIO;
1807                 bp->b_resid = bp->b_bcount;
1808                 biodone(bp);
1809                 return (0);
1810         }
1811 
1812         bp->b_flags &= ~(B_ERROR | B_DONE);
1813         if (md_ops[ui->ui_opsindex]->md_strategy  != NULL) {
1814                 (*md_ops[ui->ui_opsindex]->md_strategy) (bp, 0, NULL);
1815         } else {
1816                 (void) errdone(ui, bp, ENXIO);
1817         }
1818         return (0);
1819 }
1820 
1821 /*
1822  * Return true if the ioctl is allowed to be multithreaded.
1823  * All the ioctls with MN are sent only from the message handlers through
 * rpc.mdcommd, which (via its own locking mechanism) ensures that no two
 * ioctls for the same metadevice are issued at the same time.
1826  * So we are safe here.
 * The other ioctls do not touch any metadevice structures and are therefore
 * also harmless when called multiple times concurrently.
1829  */
1830 static boolean_t
1831 is_mt_ioctl(int cmd) {
1832 
1833         switch (cmd) {
1834         case MD_IOCGUNIQMSGID:
1835         case MD_IOCGVERSION:
1836         case MD_IOCISOPEN:
1837         case MD_MN_SET_MM_OWNER:
1838         case MD_MN_SET_STATE:
1839         case MD_MN_SUSPEND_WRITES:
1840         case MD_MN_ALLOCATE_HOTSPARE:
1841         case MD_MN_SET_SETFLAGS:
1842         case MD_MN_GET_SETFLAGS:
1843         case MD_MN_MDDB_OPTRECFIX:
1844         case MD_MN_MDDB_PARSE:
1845         case MD_MN_MDDB_BLOCK:
1846         case MD_MN_DB_USERREQ:
1847         case MD_IOC_SPSTATUS:
1848         case MD_MN_COMMD_ERR:
1849         case MD_MN_SET_COMMD_RUNNING:
1850         case MD_MN_RESYNC:
1851         case MD_MN_SETSYNC:
1852         case MD_MN_POKE_HOTSPARES:
1853         case MD_MN_RR_DIRTY:
1854         case MD_MN_RR_CLEAN:
1855         case MD_MN_IOC_SPUPDATEWM:
1856                 return (1);
1857         default:
1858                 return (0);
1859         }
1860 }
1861 
1862 /*
1863  * This routine implements the ioctl calls for the Virtual Disk System.
1864  * It is called from the device switch at normal priority.
1865  */
/* ARGSUSED */
static int
mdioctl(dev_t dev, int cmd, intptr_t data, int mode, cred_t *cred_p,
	int *rval_p)
{
	minor_t		mnum = getminor(dev);
	mdi_unit_t	*ui;
	IOLOCK		lock;
	int		err;

	/*
	 * For multinode disksets  number of ioctls are allowed to be
	 * multithreaded.
	 * A fundamental assumption made in this implementation is that
	 * ioctls either do not interact with other md structures  or the
	 * ioctl to the admin device can only occur if the metadevice
	 * device is open. i.e. avoid a race between metaclear and the
	 * progress of a multithreaded ioctl.
	 */

	/* non-MT ioctls serialize on the global ioctl lock; EINTR aborts */
	if (!is_mt_ioctl(cmd) && md_ioctl_lock_enter() == EINTR) {
		return (EINTR);
	}

	/*
	 * initialize lock tracker
	 */
	IOLOCK_INIT(&lock);

	/* Flag to indicate that MD_GBL_IOCTL_LOCK is not acquired */

	if (is_mt_ioctl(cmd)) {
		/* increment the md_mtioctl_cnt */
		mutex_enter(&md_mx);
		md_mtioctl_cnt++;
		mutex_exit(&md_mx);
		/*
		 * MD_MT_IOCTL in the tracker presumably tells the
		 * IOLOCK_RETURN* macros below to drop md_mtioctl_cnt
		 * rather than the global ioctl lock — confirm against
		 * the macro definitions.
		 */
		lock.l_flags |= MD_MT_IOCTL;
	}

	/*
	 * this has been added to prevent notification from re-snarfing
	 * so metaunload will work.  It may interfere with other modules
	 * halt process.
	 */
	if (md_get_status() & (MD_GBL_HALTED | MD_GBL_DAEMONS_DIE))
		return (IOLOCK_RETURN(ENXIO, &lock));

	/*
	 * admin device ioctls
	 */
	if (mnum == MD_ADM_MINOR) {
		err = md_admin_ioctl(md_expldev(dev), cmd, (void *) data,
		    mode, &lock);
	}

	/*
	 * metadevice ioctls
	 */
	else if ((MD_MIN2SET(mnum) >= md_nsets) ||
	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
	    (md_set[MD_MIN2SET(mnum)].s_ui == NULL) ||
	    ((ui = MDI_UNIT(mnum)) == NULL)) {
		/* minor does not map to a configured set/unit */
		err = ENXIO;
	} else if (md_ops[ui->ui_opsindex]->md_ioctl == NULL) {
		/* unit type has no ioctl handler */
		err = ENOTTY;
	} else {
		err = (*md_ops[ui->ui_opsindex]->md_ioctl)
		    (dev, cmd, (void *) data, mode, &lock);
	}

	/*
	 * drop any locks we grabbed
	 */
	return (IOLOCK_RETURN_IOCTLEND(err, &lock));
}
1941 
1942 static int
1943 mddump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
1944 {
1945         minor_t         mnum;
1946         set_t           setno;
1947         mdi_unit_t      *ui;
1948 
1949         if ((mnum = getminor(dev)) == MD_ADM_MINOR)
1950                 return (ENXIO);
1951 
1952         setno = MD_MIN2SET(mnum);
1953 
1954         if ((setno >= md_nsets) || (MD_MIN2UNIT(mnum) >= md_nunits) ||
1955             ((ui = MDI_UNIT(mnum)) == NULL))
1956                 return (ENXIO);
1957 
1958 
1959         if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0)
1960                 return (ENXIO);
1961 
1962         if (md_ops[ui->ui_opsindex]->md_dump  != NULL)
1963                 return ((*md_ops[ui->ui_opsindex]->md_dump)
1964                     (dev, addr, blkno, nblk));
1965 
1966         return (ENXIO);
1967 }
1968 
1969 /*
1970  * Metadevice unit number dispatcher
1971  * When this routine is called it will scan the
1972  * incore unit array and return the avail slot
1973  * hence the unit number to the caller
1974  *
1975  * Return -1 if there is nothing available
1976  */
1977 unit_t
1978 md_get_nextunit(set_t setno)
1979 {
1980         unit_t  un, start;
1981 
1982         /*
1983          * If nothing available
1984          */
1985         if (md_set[setno].s_un_avail == 0) {
1986                 return (MD_UNITBAD);
1987         }
1988 
1989         mutex_enter(&md_mx);
1990         start = un = md_set[setno].s_un_next;
1991 
1992         /* LINTED: E_CONSTANT_CONDITION */
1993         while (1) {
1994                 if (md_set[setno].s_un[un] == NULL) {
1995                         /*
1996                          * Advance the starting index for the next
1997                          * md_get_nextunit call
1998                          */
1999                         if (un == MD_MAXUNITS - 1) {
2000                                 md_set[setno].s_un_next = 0;
2001                         } else {
2002                                 md_set[setno].s_un_next = un + 1;
2003                         }
2004                         break;
2005                 }
2006 
2007                 un = ((un == MD_MAXUNITS - 1) ? 0 : un + 1);
2008 
2009                 if (un == start) {
2010                         un = MD_UNITBAD;
2011                         break;
2012                 }
2013 
2014         }
2015 
2016         mutex_exit(&md_mx);
2017         return (un);
2018 }