illumos-gate Old usr/src/uts/common/io/lvm/md/md.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2011 Bayard G. Bell. All rights reserved.
  25  */
  26 
  27 /*
  28  * Md - is the meta-disk driver.   It sits below the UFS file system
  29  * but above the 'real' disk drivers, xy, id, sd etc.
  30  *
  31  * To the UFS software, md looks like a normal driver, since it has
  32  * the normal kinds of entries in the bdevsw and cdevsw arrays. So
  33  * UFS accesses md in the usual ways.  In particular, the strategy
  34  * routine, mdstrategy(), gets called by fbiwrite(), ufs_getapage(),
  35  * and ufs_writelbn().
  36  *
  37  * Md maintains an array of minor devices (meta-partitions).   Each
  38  * meta partition stands for a matrix of real partitions, in rows
  39  * which are not necessarily of equal length.   Md maintains a table,
  40  * with one entry for each meta-partition,  which lists the rows and
  41  * columns of actual partitions, and the job of the strategy routine
  42  * is to translate from the meta-partition device and block numbers
  43  * known to UFS into the actual partitions' device and block numbers.
  44  *
  45  * See below, in mdstrategy(), mdreal(), and mddone() for details of
  46  * this translation.
  47  */
  48 
  49 /*
  50  * Driver for Virtual Disk.
  51  */
  52 
  53 #include <sys/user.h>
  54 #include <sys/sysmacros.h>
  55 #include <sys/conf.h>
  56 #include <sys/stat.h>
  57 #include <sys/errno.h>
  58 #include <sys/param.h>
  59 #include <sys/systm.h>
  60 #include <sys/file.h>
  61 #include <sys/open.h>
  62 #include <sys/dkio.h>
  63 #include <sys/vtoc.h>
  64 #include <sys/cmn_err.h>
  65 #include <sys/ddi.h>
  66 #include <sys/sunddi.h>
  67 #include <sys/debug.h>
  68 #include <sys/utsname.h>
  69 #include <sys/lvm/mdvar.h>
  70 #include <sys/lvm/md_names.h>
  71 #include <sys/lvm/md_mddb.h>
  72 #include <sys/lvm/md_sp.h>
  73 #include <sys/types.h>
  74 #include <sys/kmem.h>
  75 #include <sys/cladm.h>
  76 #include <sys/priv_names.h>
  77 #include <sys/modhash.h>
  78 
  79 int             md_init_debug   = 0;    /* module binding debug */
  80 
  81 /*
  82  * Tunable to turn off the failfast behavior.
  83  */
  84 int             md_ff_disable = 0;
  85 
  86 /*
  87  * dynamically allocated list of non FF driver names - needs to
  88  * be freed when md is detached.
  89  */
  90 char    **non_ff_drivers = NULL;
  91 
  92 md_krwlock_t    md_unit_array_rw;       /* protects all unit arrays */
  93 md_krwlock_t    nm_lock;                /* protects all the name spaces */
  94 
  95 md_resync_t     md_cpr_resync;
  96 
  97 extern char     svm_bootpath[];
  98 #define SVM_PSEUDO_STR  "/pseudo/md@0:"
  99 
 100 #define         VERSION_LENGTH  6
 101 #define         VERSION         "1.0"
 102 
 103 /*
 104  * Keep track of possible 'orphan' entries in the name space
 105  */
 106 int             *md_nm_snarfed = NULL;
 107 
 108 /*
 109  * Global tunable giving the percentage of free space left in replica during
 110  * conversion of non-devid style replica to devid style replica.
 111  */
 112 int             md_conv_perc = MDDB_DEVID_CONV_PERC;
 113 
 114 #ifdef  DEBUG
 115 /* debug code to verify framework exclusion guarantees */
 116 int             md_in;
 117 kmutex_t        md_in_mx;                       /* used to md global stuff */
 118 #define IN_INIT         0x01
 119 #define IN_FINI         0x02
 120 #define IN_ATTACH       0x04
 121 #define IN_DETACH       0x08
 122 #define IN_OPEN         0x10
 123 #define MD_SET_IN(x) {                                          \
 124         mutex_enter(&md_in_mx);                                     \
 125         if (md_in)                                              \
 126                 debug_enter("MD_SET_IN exclusion lost");        \
 127         if (md_in & x)                                              \
 128                 debug_enter("MD_SET_IN already set");           \
 129         md_in |= x;                                             \
 130         mutex_exit(&md_in_mx);                                      \
 131 }
 132 
 133 #define MD_CLR_IN(x) {                                          \
 134         mutex_enter(&md_in_mx);                                     \
 135         if (md_in & ~(x))                                   \
 136                 debug_enter("MD_CLR_IN exclusion lost");        \
 137         if (!(md_in & x))                                   \
 138                 debug_enter("MD_CLR_IN already clr");           \
 139         md_in &= ~x;                                                \
 140         mutex_exit(&md_in_mx);                                      \
 141 }
 142 #else   /* DEBUG */
 143 #define MD_SET_IN(x)
 144 #define MD_CLR_IN(x)
 145 #endif  /* DEBUG */
 146 hrtime_t savetime1, savetime2;
 147 
 148 
 149 /*
 150  * list things protected by md_mx even if they aren't
 151  * used in this file.
 152  */
 153 kmutex_t        md_mx;                  /* used to md global stuff */
 154 kcondvar_t      md_cv;                  /* md_status events */
 155 int             md_status = 0;          /* global status for the meta-driver */
 156 int             md_num_daemons = 0;
 157 int             md_ioctl_cnt = 0;
 158 int             md_mtioctl_cnt = 0;     /* multithreaded ioctl cnt */
 159 uint_t          md_mdelay = 10;         /* variable so can be patched */
 160 
 161 int             (*mdv_strategy_tstpnt)(buf_t *, int, void*);
 162 
 163 major_t         md_major, md_major_targ;
 164 
 165 unit_t          md_nunits = MD_MAXUNITS;
 166 set_t           md_nsets = MD_MAXSETS;
 167 int             md_nmedh = 0;
 168 char            *md_med_trans_lst = NULL;
 169 md_set_t        md_set[MD_MAXSETS];
 170 md_set_io_t     md_set_io[MD_MAXSETS];
 171 
 172 md_krwlock_t    hsp_rwlp;               /* protects hot_spare_interface */
 173 md_krwlock_t    ni_rwlp;                /* protects notify_interface */
 174 md_ops_t        **md_ops = NULL;
 175 ddi_modhandle_t *md_mods = NULL;
 176 md_ops_t        *md_opslist;
 177 clock_t         md_hz;
 178 md_event_queue_t        *md_event_queue = NULL;
 179 
 180 int             md_in_upgrade;
 181 int             md_keep_repl_state;
 182 int             md_devid_destroy;
 183 
 184 /* for sending messages thru a door to userland */
 185 door_handle_t   mdmn_door_handle = NULL;
 186 int             mdmn_door_did = -1;
 187 
 188 dev_info_t              *md_devinfo = NULL;
 189 
 190 md_mn_nodeid_t  md_mn_mynode_id = ~0u;  /* My node id (for multi-node sets) */
 191 
 192 static  uint_t          md_ocnt[OTYPCNT];
 193 
 194 static int              mdinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
 195 static int              mdattach(dev_info_t *, ddi_attach_cmd_t);
 196 static int              mddetach(dev_info_t *, ddi_detach_cmd_t);
 197 static int              mdopen(dev_t *, int, int, cred_t *);
 198 static int              mdclose(dev_t, int, int, cred_t *);
 199 static int              mddump(dev_t, caddr_t, daddr_t, int);
 200 static int              mdread(dev_t, struct uio *, cred_t *);
 201 static int              mdwrite(dev_t, struct uio *, cred_t *);
 202 static int              mdaread(dev_t, struct aio_req *, cred_t *);
 203 static int              mdawrite(dev_t, struct aio_req *, cred_t *);
 204 static int              mdioctl(dev_t, int, intptr_t, int, cred_t *, int *);
 205 static int              mdprop_op(dev_t, dev_info_t *,
 206                                 ddi_prop_op_t, int, char *, caddr_t, int *);
 207 
 208 static struct cb_ops md_cb_ops = {
 209         mdopen,                 /* open */
 210         mdclose,                /* close */
 211         mdstrategy,             /* strategy */
 212                                 /* print routine -- none yet */
 213         (int(*)(dev_t, char *))nulldev,
 214         mddump,                 /* dump */
 215         mdread,                 /* read */
 216         mdwrite,                /* write */
 217         mdioctl,                /* ioctl */
 218                                 /* devmap */
 219         (int(*)(dev_t, devmap_cookie_t, offset_t, size_t, size_t *,
 220                         uint_t))nodev,
 221                                 /* mmap */
 222         (int(*)(dev_t, off_t, int))nodev,
 223                                 /* segmap */
 224         (int(*)(dev_t, off_t, struct as *, caddr_t *, off_t, unsigned,
 225                 unsigned, unsigned, cred_t *))nodev,
 226         nochpoll,               /* poll */
 227         mdprop_op,              /* prop_op */
 228         0,                      /* streamtab */
 229         (D_64BIT|D_MP|D_NEW),   /* driver compatibility flag */
 230         CB_REV,                 /* cb_ops version */
 231         mdaread,                /* aread */
 232         mdawrite,               /* awrite */
 233 };
 234 
 235 static struct dev_ops md_devops = {
 236         DEVO_REV,               /* dev_ops version */
 237         0,                      /* device reference count */
 238         mdinfo,                 /* info routine */
 239         nulldev,                /* identify routine */
 240         nulldev,                /* probe - not defined */
 241         mdattach,               /* attach routine */
 242         mddetach,               /* detach routine */
 243         nodev,                  /* reset - not defined */
 244         &md_cb_ops,         /* driver operations */
 245         NULL,                   /* bus operations */
 246         nodev,                  /* power management */
 247         ddi_quiesce_not_needed,         /* quiesce */
 248 };
 249 
 250 /*
 251  * loadable module wrapper
 252  */
 253 #include <sys/modctl.h>
 254 
 255 static struct modldrv modldrv = {
 256         &mod_driverops,                     /* type of module -- a pseudodriver */
 257         "Solaris Volume Manager base module", /* name of the module */
 258         &md_devops,                 /* driver ops */
 259 };
 260 
 261 static struct modlinkage modlinkage = {
 262         MODREV_1,
 263         (void *)&modldrv,
 264         NULL
 265 };
 266 
 267 
 268 /* md_medd.c */
 269 extern  void    med_init(void);
 270 extern  void    med_fini(void);
 271 extern  void    md_devid_cleanup(set_t, uint_t);
 272 
 273 /* md_names.c */
 274 extern struct nm_next_hdr       *get_first_record(set_t, int, int);
 275 
 276 int             md_maxphys      = 0;    /* maximum io size in bytes */
 277 #define         MD_MAXBCOUNT    (1024 * 1024)
 278 unsigned        md_maxbcount    = 0;    /* maximum physio size in bytes */
 279 
 280 /*
 281  * Some md ioctls trigger io framework device tree operations.  An
 282  * example is md ioctls that call md_resolve_bydevid(): which uses the
 283  * io framework to resolve a devid. Such operations result in acquiring
 284  * io framework locks (like ndi_devi_enter() of "/") while holding
 285  * driver locks (like md_unit_writerlock()).
 286  *
 287  * The prop_op(9E) entry point is called from the devinfo driver with
 288  * an active ndi_devi_enter of "/". To avoid deadlock, md's prop_op
 289  * implementation must avoid taking a lock that is held per above md
 290  * ioctl description: i.e. mdprop_op(9E) can't call md_unit_readerlock()
 291  * without risking deadlock.
 292  *
 293  * To service "size" requests without risking deadlock, we maintain a
 294  * "mnum->nblocks" sizemap (protected by a short-term global mutex).
 295  */
 296 static kmutex_t         md_nblocks_mutex;
 297 static mod_hash_t       *md_nblocksmap;         /* mnum -> nblocks */
 298 int                     md_nblocksmap_size = 512;
 299 
 300 /*
 301  * Maintain "mnum->nblocks" sizemap for mdprop_op use:
 302  *
 303  * Create: any code that establishes a unit's un_total_blocks needs the
 304  * following type of call to establish nblocks for mdprop_op():
 305  *      md_nblocks_set(mnum, un->c.un_total_blocks);"
 306  *      NOTE: locate via cscope md_create_minor_node/md_create_unit_incore
 307  *              ...or  "MD_UNIT..*="
 308  *
 309  * Change: any code that changes a unit's un_total_blocks needs the
 310  * following type of call to sync nblocks for mdprop_op():
 311  *      md_nblocks_set(mnum, un->c.un_total_blocks);"
 312  *      NOTE: locate via cscope for "un_total_blocks[ \t]*="
 313  *
 314  * Destroy: any code that deletes a unit needs the following type of call
 315  * to sync nblocks for mdprop_op():
 316  *      md_nblocks_set(mnum, -1ULL);
 317  *      NOTE: locate via cscope md_remove_minor_node/md_destroy_unit_incore
 318  *              ...or  "MD_UNIT..*="
 319  */
 320 void
 321 md_nblocks_set(minor_t mnum, uint64_t nblocks)
 322 {
 323         mutex_enter(&md_nblocks_mutex);
 324         if (nblocks == -1ULL)
 325                 (void) mod_hash_destroy(md_nblocksmap,
 326                     (mod_hash_key_t)(intptr_t)mnum);
 327         else
 328                 (void) mod_hash_replace(md_nblocksmap,
 329                     (mod_hash_key_t)(intptr_t)mnum,
 330                     (mod_hash_val_t)(intptr_t)nblocks);
 331         mutex_exit(&md_nblocks_mutex);
 332 }
 333 
 334 /* get the size of a mnum from "mnum->nblocks" sizemap */
 335 uint64_t
 336 md_nblocks_get(minor_t mnum)
 337 {
 338         mod_hash_val_t  hv;
 339 
 340         mutex_enter(&md_nblocks_mutex);
 341         if (mod_hash_find(md_nblocksmap,
 342             (mod_hash_key_t)(intptr_t)mnum, &hv) == 0) {
 343                 mutex_exit(&md_nblocks_mutex);
 344                 return ((uint64_t)(intptr_t)hv);
 345         }
 346         mutex_exit(&md_nblocks_mutex);
 347         return (0);
 348 }
 349 
 350 /* allocate/free dynamic space associated with driver globals */
 351 void
 352 md_global_alloc_free(int alloc)
 353 {
 354         set_t   s;
 355 
 356         if (alloc) {
 357                 /* initialize driver global locks */
 358                 cv_init(&md_cv, NULL, CV_DEFAULT, NULL);
 359                 mutex_init(&md_mx, NULL, MUTEX_DEFAULT, NULL);
 360                 rw_init(&md_unit_array_rw.lock, NULL, RW_DEFAULT, NULL);
 361                 rw_init(&nm_lock.lock, NULL, RW_DEFAULT, NULL);
 362                 rw_init(&ni_rwlp.lock, NULL, RW_DRIVER, NULL);
 363                 rw_init(&hsp_rwlp.lock, NULL, RW_DRIVER, NULL);
 364                 mutex_init(&md_cpr_resync.md_resync_mutex, NULL,
 365                     MUTEX_DEFAULT, NULL);
 366                 mutex_init(&md_nblocks_mutex, NULL, MUTEX_DEFAULT, NULL);
 367 
 368                 /* initialize per set driver global locks */
 369                 for (s = 0; s < MD_MAXSETS; s++) {
 370                         /* initialize per set driver globals locks */
 371                         mutex_init(&md_set[s].s_dbmx,
 372                             NULL, MUTEX_DEFAULT, NULL);
 373                         mutex_init(&md_set_io[s].md_io_mx,
 374                             NULL, MUTEX_DEFAULT, NULL);
 375                         cv_init(&md_set_io[s].md_io_cv,
 376                             NULL, CV_DEFAULT, NULL);
 377                 }
 378         } else {
 379                 /* destroy per set driver global locks */
 380                 for (s = 0; s < MD_MAXSETS; s++) {
 381                         cv_destroy(&md_set_io[s].md_io_cv);
 382                         mutex_destroy(&md_set_io[s].md_io_mx);
 383                         mutex_destroy(&md_set[s].s_dbmx);
 384                 }
 385 
 386                 /* destroy driver global locks */
 387                 mutex_destroy(&md_nblocks_mutex);
 388                 mutex_destroy(&md_cpr_resync.md_resync_mutex);
 389                 rw_destroy(&hsp_rwlp.lock);
 390                 rw_destroy(&ni_rwlp.lock);
 391                 rw_destroy(&nm_lock.lock);
 392                 rw_destroy(&md_unit_array_rw.lock);
 393                 mutex_destroy(&md_mx);
 394                 cv_destroy(&md_cv);
 395         }
 396 }
 397 
 398 int
 399 _init(void)
 400 {
 401         set_t   s;
 402         int     err;
 403 
 404         MD_SET_IN(IN_INIT);
 405 
 406         /* allocate dynamic space associated with driver globals */
 407         md_global_alloc_free(1);
 408 
 409         /* initialize driver globals */
 410         md_major = ddi_name_to_major("md");
 411         md_hz = drv_usectohz(NUM_USEC_IN_SEC);
 412 
 413         /* initialize tunable globals */
 414         if (md_maxphys == 0)            /* maximum io size in bytes */
 415                 md_maxphys = maxphys;
 416         if (md_maxbcount == 0)          /* maximum physio size in bytes */
 417                 md_maxbcount = MD_MAXBCOUNT;
 418 
 419         /* initialize per set driver globals */
 420         for (s = 0; s < MD_MAXSETS; s++)
 421                 md_set_io[s].io_state = MD_SET_ACTIVE;
 422 
 423         /*
 424          * NOTE: the framework does not currently guarantee exclusion
 425          * between _init and attach after calling mod_install.
 426          */
 427         MD_CLR_IN(IN_INIT);
 428         if ((err = mod_install(&modlinkage))) {
 429                 MD_SET_IN(IN_INIT);
 430                 md_global_alloc_free(0);        /* free dynamic space */
 431                 MD_CLR_IN(IN_INIT);
 432         }
 433         return (err);
 434 }
 435 
 436 int
 437 _fini(void)
 438 {
 439         int     err;
 440 
 441         /*
 442          * NOTE: the framework currently does not guarantee exclusion
 443          * with attach until after mod_remove returns 0.
 444          */
 445         if ((err = mod_remove(&modlinkage)))
 446                 return (err);
 447 
 448         MD_SET_IN(IN_FINI);
 449         md_global_alloc_free(0);        /* free dynamic space */
 450         MD_CLR_IN(IN_FINI);
 451         return (err);
 452 }
 453 
 454 int
 455 _info(struct modinfo *modinfop)
 456 {
 457         return (mod_info(&modlinkage, modinfop));
 458 }
 459 
 460 /* ARGSUSED */
 461 static int
 462 mdattach(dev_info_t *dip, ddi_attach_cmd_t cmd)
 463 {
 464         int     len;
 465         unit_t  i;
 466         size_t  sz;
 467         char    ver[VERSION_LENGTH];
 468         char    **maj_str_array;
 469         char    *str, *str2;
 470 
 471         MD_SET_IN(IN_ATTACH);
 472         md_in_upgrade = 0;
 473         md_keep_repl_state = 0;
 474         md_devid_destroy = 0;
 475 
 476         if (cmd != DDI_ATTACH) {
 477                 MD_CLR_IN(IN_ATTACH);
 478                 return (DDI_FAILURE);
 479         }
 480 
 481         if (md_devinfo != NULL) {
 482                 MD_CLR_IN(IN_ATTACH);
 483                 return (DDI_FAILURE);
 484         }
 485 
 486         mddb_init();
 487 
 488         if (md_start_daemons(TRUE)) {
 489                 MD_CLR_IN(IN_ATTACH);
 490                 mddb_unload();          /* undo mddb_init() allocations */
 491                 return (DDI_FAILURE);
 492         }
 493 
 494         /* clear the halted state */
 495         md_clr_status(MD_GBL_HALTED);
 496 
 497         /* see if the diagnostic switch is on */
 498         if (ddi_prop_get_int(DDI_DEV_T_ANY, dip,
 499             DDI_PROP_DONTPASS, "md_init_debug", 0))
 500                 md_init_debug++;
 501 
 502         /* see if the failfast disable switch is on */
 503         if (ddi_prop_get_int(DDI_DEV_T_ANY, dip,
 504             DDI_PROP_DONTPASS, "md_ff_disable", 0))
 505                 md_ff_disable++;
 506 
 507         /* try and get the md_nmedh property */
 508         md_nmedh = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
 509             DDI_PROP_DONTPASS, "md_nmedh", MED_DEF_HOSTS);
 510         if ((md_nmedh <= 0) || (md_nmedh > MED_MAX_HOSTS))
 511                 md_nmedh = MED_DEF_HOSTS;
 512 
 513         /* try and get the md_med_trans_lst property */
 514         len = 0;
 515         if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN,
 516             0, "md_med_trans_lst", NULL, &len) != DDI_PROP_SUCCESS ||
 517             len == 0) {
 518                 md_med_trans_lst = md_strdup("tcp");
 519         } else {
 520                 md_med_trans_lst = kmem_zalloc((size_t)len, KM_SLEEP);
 521                 if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN_AND_VAL_BUF,
 522                     0, "md_med_trans_lst", md_med_trans_lst, &len) !=
 523                     DDI_PROP_SUCCESS) {
 524                         kmem_free(md_med_trans_lst, (size_t)len);
 525                         md_med_trans_lst = md_strdup("tcp");
 526                 }
 527         }
 528 
 529         /*
 530          * Must initialize the internal data structures before the
 531          * any possible calls to 'goto attach_failure' as _fini
 532          * routine references them.
 533          */
 534         med_init();
 535 
 536         md_ops = (md_ops_t **)kmem_zalloc(
 537             sizeof (md_ops_t *) * MD_NOPS, KM_SLEEP);
 538         md_mods = (ddi_modhandle_t *)kmem_zalloc(
 539             sizeof (ddi_modhandle_t) * MD_NOPS, KM_SLEEP);
 540 
 541         /* try and get the md_xlate property */
 542         /* Should we only do this if upgrade? */
 543         len = sizeof (char) * 5;
 544         if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN_AND_VAL_BUF,
 545             0, "md_xlate_ver", ver, &len) == DDI_PROP_SUCCESS) {
 546                 if (strcmp(ver, VERSION) == 0) {
 547                         len = 0;
 548                         if (ddi_prop_op(DDI_DEV_T_ANY, dip,
 549                             PROP_LEN_AND_VAL_ALLOC, 0, "md_xlate",
 550                             (caddr_t)&md_tuple_table, &len) !=
 551                             DDI_PROP_SUCCESS) {
 552                                 if (md_init_debug)
 553                                         cmn_err(CE_WARN,
 554                                             "md_xlate ddi_prop_op failed");
 555                                 goto attach_failure;
 556                         } else {
 557                                 md_tuple_length =
 558                                     len/(2 * ((int)sizeof (dev32_t)));
 559                                 md_in_upgrade = 1;
 560                         }
 561 
 562                         /* Get target's name to major table */
 563                         if (ddi_prop_lookup_string_array(DDI_DEV_T_ANY,
 564                             dip, DDI_PROP_DONTPASS,
 565                             "md_targ_nm_table", &maj_str_array,
 566                             &md_majortab_len) != DDI_PROP_SUCCESS) {
 567                                 md_majortab_len = 0;
 568                                 if (md_init_debug)
 569                                         cmn_err(CE_WARN, "md_targ_nm_table "
 570                                             "ddi_prop_lookup_string_array "
 571                                             "failed");
 572                                 goto attach_failure;
 573                         }
 574 
 575                         md_major_tuple_table =
 576                             (struct md_xlate_major_table *)
 577                             kmem_zalloc(md_majortab_len *
 578                             sizeof (struct md_xlate_major_table), KM_SLEEP);
 579 
 580                         for (i = 0; i < md_majortab_len; i++) {
 581                                 /* Getting major name */
 582                                 str = strchr(maj_str_array[i], ' ');
 583                                 if (str == NULL)
 584                                         continue;
 585                                 *str = '\0';
 586                                 md_major_tuple_table[i].drv_name =
 587                                     md_strdup(maj_str_array[i]);
 588 
 589                                 /* Simplified atoi to get major number */
 590                                 str2 = str + 1;
 591                                 md_major_tuple_table[i].targ_maj = 0;
 592                                 while ((*str2 >= '0') && (*str2 <= '9')) {
 593                                         md_major_tuple_table[i].targ_maj *= 10;
 594                                         md_major_tuple_table[i].targ_maj +=
 595                                             *str2++ - '0';
 596                                 }
 597                                 *str = ' ';
 598                         }
 599                         ddi_prop_free((void *)maj_str_array);
 600                 } else {
 601                         if (md_init_debug)
 602                                 cmn_err(CE_WARN, "md_xlate_ver is incorrect");
 603                         goto attach_failure;
 604                 }
 605         }
 606 
 607         /*
 608          * Check for properties:
 609          *      md_keep_repl_state and md_devid_destroy
 610          * and set globals if these exist.
 611          */
 612         md_keep_repl_state = ddi_getprop(DDI_DEV_T_ANY, dip,
 613             0, "md_keep_repl_state", 0);
 614 
 615         md_devid_destroy = ddi_getprop(DDI_DEV_T_ANY, dip,
 616             0, "md_devid_destroy", 0);
 617 
 618         if (MD_UPGRADE)
 619                 md_major_targ = md_targ_name_to_major("md");
 620         else
 621                 md_major_targ = 0;
 622 
 623         /* allocate admin device node */
 624         if (ddi_create_priv_minor_node(dip, "admin", S_IFCHR,
 625             MD_ADM_MINOR, DDI_PSEUDO, 0, NULL, PRIV_SYS_CONFIG, 0640))
 626                 goto attach_failure;
 627 
 628         if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
 629             DDI_KERNEL_IOCTL, NULL, 0) != DDI_SUCCESS)
 630                 goto attach_failure;
 631 
 632         if (ddi_prop_update_int(DDI_DEV_T_NONE, dip,
 633             "ddi-abrwrite-supported", 1) != DDI_SUCCESS)
 634                 goto attach_failure;
 635 
 636         /* these could have been cleared by a detach */
 637         md_nunits = MD_MAXUNITS;
 638         md_nsets = MD_MAXSETS;
 639 
 640         sz = sizeof (void *) * MD_MAXUNITS;
 641         if (md_set[0].s_un == NULL)
 642                 md_set[0].s_un = kmem_zalloc(sz, KM_SLEEP);
 643         if (md_set[0].s_ui == NULL)
 644                 md_set[0].s_ui = kmem_zalloc(sz, KM_SLEEP);
 645 
 646         md_devinfo = dip;
 647 
 648         /*
 649          * Only allocate device node for root mirror metadevice.
 650          * Don't pre-allocate unnecessary device nodes (thus slowing down a
 651          * boot when we attach).
 652          * We can't read the mddbs in attach.  The mddbs will be read
 653          * by metainit during the boot process when it is doing the
 654          * auto-take processing and any other minor nodes will be
 655          * allocated at that point.
 656          *
 657          * There are two scenarios to be aware of here:
 658          * 1) when we are booting from a mirrored root we need the root
 659          *    metadevice to exist very early (during vfs_mountroot processing)
 660          * 2) we need all of the nodes to be created so that any mnttab entries
 661          *    will succeed (handled by metainit reading the mddb during boot).
 662          */
 663         if (strncmp(SVM_PSEUDO_STR, svm_bootpath, sizeof (SVM_PSEUDO_STR) - 1)
 664             == 0) {
 665                 char *p;
 666                 int mnum = 0;
 667 
 668                 /*
 669                  * The svm_bootpath string looks something like
 670                  * /pseudo/md@0:0,150,blk where 150 is the minor number
 671                  * in this example so we need to set the pointer p onto
 672                  * the first digit of the minor number and convert it
 673                  * from ascii.
 674                  */
 675                 for (p = svm_bootpath + sizeof (SVM_PSEUDO_STR) + 1;
 676                     *p >= '0' && *p <= '9'; p++) {
 677                         mnum *= 10;
 678                         mnum += *p - '0';
 679                 }
 680 
 681                 if (md_create_minor_node(0, mnum)) {
 682                         kmem_free(md_set[0].s_un, sz);
 683                         kmem_free(md_set[0].s_ui, sz);
 684                         goto attach_failure;
 685                 }
 686         }
 687 
 688         /* create the hash to store the meta device sizes */
 689         md_nblocksmap = mod_hash_create_idhash("md_nblocksmap",
 690             md_nblocksmap_size, mod_hash_null_valdtor);
 691 
 692         MD_CLR_IN(IN_ATTACH);
 693         return (DDI_SUCCESS);
 694 
 695 attach_failure:
 696         /*
 697          * Use our own detach routine to toss any stuff we allocated above.
 698          * NOTE: detach will call md_halt to free the mddb_init allocations.
 699          */
 700         MD_CLR_IN(IN_ATTACH);
 701         if (mddetach(dip, DDI_DETACH) != DDI_SUCCESS)
 702                 cmn_err(CE_WARN, "detach from attach failed");
 703         return (DDI_FAILURE);
 704 }
 705 
 706 /* ARGSUSED */
 707 static int
 708 mddetach(dev_info_t *dip, ddi_detach_cmd_t cmd)
 709 {
 710         extern int      check_active_locators();
 711         set_t           s;
 712         size_t          sz;
 713         int             len;
 714 
 715         MD_SET_IN(IN_DETACH);
 716 
 717         /* check command */
 718         if (cmd != DDI_DETACH) {
 719                 MD_CLR_IN(IN_DETACH);
 720                 return (DDI_FAILURE);
 721         }
 722 
 723         /*
 724          * if we have not already halted yet we have no active config
 725          * then automatically initiate a halt so we can detach.
 726          */
 727         if (!(md_get_status() & MD_GBL_HALTED)) {
 728                 if (check_active_locators() == 0) {
 729                         /*
 730                          * NOTE: a successful md_halt will have done the
 731                          * mddb_unload to free allocations done in mddb_init
 732                          */
 733                         if (md_halt(MD_NO_GBL_LOCKS_HELD)) {
 734                                 cmn_err(CE_NOTE, "md:detach: "
 735                                     "Could not halt Solaris Volume Manager");
 736                                 MD_CLR_IN(IN_DETACH);
 737                                 return (DDI_FAILURE);
 738                         }
 739                 }
 740 
 741                 /* fail detach if we have not halted */
 742                 if (!(md_get_status() & MD_GBL_HALTED)) {
 743                         MD_CLR_IN(IN_DETACH);
 744                         return (DDI_FAILURE);
 745                 }
 746         }
 747 
 748         /* must be in halted state, this will be cleared on next attach */
 749         ASSERT(md_get_status() & MD_GBL_HALTED);
 750 
 751         /* cleanup attach allocations and initializations */
 752         md_major_targ = 0;
 753 
 754         sz = sizeof (void *) * md_nunits;
 755         for (s = 0; s < md_nsets; s++) {
 756                 if (md_set[s].s_un != NULL) {
 757                         kmem_free(md_set[s].s_un, sz);
 758                         md_set[s].s_un = NULL;
 759                 }
 760 
 761                 if (md_set[s].s_ui != NULL) {
 762                         kmem_free(md_set[s].s_ui, sz);
 763                         md_set[s].s_ui = NULL;
 764                 }
 765         }
 766         md_nunits = 0;
 767         md_nsets = 0;
 768         md_nmedh = 0;
 769 
 770         if (non_ff_drivers != NULL) {
 771                 int     i;
 772 
 773                 for (i = 0; non_ff_drivers[i] != NULL; i++)
 774                         kmem_free(non_ff_drivers[i],
 775                             strlen(non_ff_drivers[i]) + 1);
 776 
 777                 /* free i+1 entries because there is a null entry at list end */
 778                 kmem_free(non_ff_drivers, (i + 1) * sizeof (char *));
 779                 non_ff_drivers = NULL;
 780         }
 781 
 782         if (md_med_trans_lst != NULL) {
 783                 kmem_free(md_med_trans_lst, strlen(md_med_trans_lst) + 1);
 784                 md_med_trans_lst = NULL;
 785         }
 786 
 787         if (md_mods != NULL) {
 788                 kmem_free(md_mods, sizeof (ddi_modhandle_t) * MD_NOPS);
 789                 md_mods = NULL;
 790         }
 791 
 792         if (md_ops != NULL) {
 793                 kmem_free(md_ops, sizeof (md_ops_t *) * MD_NOPS);
 794                 md_ops = NULL;
 795         }
 796 
 797         if (MD_UPGRADE) {
 798                 len = md_tuple_length * (2 * ((int)sizeof (dev32_t)));
 799                 md_in_upgrade = 0;
 800                 md_xlate_free(len);
 801                 md_majortab_free();
 802         }
 803 
 804         /*
 805          * Undo what we did in mdattach, freeing resources
 806          * and removing things we installed.  The system
 807          * framework guarantees we are not active with this devinfo
 808          * node in any other entry points at this time.
 809          */
 810         ddi_prop_remove_all(dip);
 811         ddi_remove_minor_node(dip, NULL);
 812 
 813         med_fini();
 814 
 815         mod_hash_destroy_idhash(md_nblocksmap);
 816 
 817         md_devinfo = NULL;
 818 
 819         MD_CLR_IN(IN_DETACH);
 820         return (DDI_SUCCESS);
 821 }
 822 
 823 
 824 /*
 825  * Given the device number return the devinfo pointer
 826  * given to md via md_attach
 827  */
 828 /*ARGSUSED*/
 829 static int
 830 mdinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
 831 {
 832         int             error = DDI_FAILURE;
 833 
 834         switch (infocmd) {
 835         case DDI_INFO_DEVT2DEVINFO:
 836                 if (md_devinfo) {
 837                         *result = (void *)md_devinfo;
 838                         error = DDI_SUCCESS;
 839                 }
 840                 break;
 841 
 842         case DDI_INFO_DEVT2INSTANCE:
 843                 *result = (void *)0;
 844                 error = DDI_SUCCESS;
 845                 break;
 846         }
 847         return (error);
 848 }
 849 
 850 /*
 851  * property operation routine.  return the number of blocks for the partition
 852  * in question or forward the request to the property facilities.
 853  */
 854 static int
 855 mdprop_op(
 856         dev_t dev,              /* device number associated with device */
 857         dev_info_t *dip,        /* device info struct for this device */
 858         ddi_prop_op_t prop_op,  /* property operator */
 859         int mod_flags,          /* property flags */
 860         char *name,             /* name of property */
 861         caddr_t valuep,         /* where to put property value */
 862         int *lengthp)           /* put length of property here */
 863 {
 864         return (ddi_prop_op_nblocks(dev, dip, prop_op, mod_flags,
 865             name, valuep, lengthp, md_nblocks_get(getminor(dev))));
 866 }
 867 
 868 static void
 869 snarf_user_data(set_t setno)
 870 {
 871         mddb_recid_t            recid;
 872         mddb_recstatus_t        status;
 873 
 874         recid = mddb_makerecid(setno, 0);
 875         while ((recid = mddb_getnextrec(recid, MDDB_USER, 0)) > 0) {
 876                 if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
 877                         continue;
 878 
 879                 status = mddb_getrecstatus(recid);
 880                 if (status == MDDB_STALE)
 881                         continue;
 882 
 883                 if (status == MDDB_NODATA) {
 884                         mddb_setrecprivate(recid, MD_PRV_PENDDEL);
 885                         continue;
 886                 }
 887 
 888                 ASSERT(status == MDDB_OK);
 889 
 890                 mddb_setrecprivate(recid, MD_PRV_GOTIT);
 891         }
 892 }
 893 
 894 static void
 895 md_print_block_usage(mddb_set_t *s, uint_t blks)
 896 {
 897         uint_t          ib;
 898         int             li;
 899         mddb_mb_ic_t    *mbip;
 900         uint_t          max_blk_needed;
 901         mddb_lb_t       *lbp;
 902         mddb_sidelocator_t      *slp;
 903         int             drv_index;
 904         md_splitname    sn;
 905         char            *name;
 906         char            *suffix;
 907         size_t          prefixlen;
 908         size_t          suffixlen;
 909         int             alloc_sz;
 910 
 911 
 912         max_blk_needed = s->s_totalblkcnt - s->s_freeblkcnt + blks;
 913 
 914         cmn_err(CE_WARN, "Blocks in Metadevice State Database: %d\n"
 915             "            Additional Blocks Needed:            %d\n\n"
 916             "            Increase size of following replicas for\n"
 917             "            device relocatability by deleting listed\n"
 918             "            replica and re-adding replica with\n"
 919             "            increased size (see metadb(1M)):\n"
 920             "                Replica                   Increase By",
 921             s->s_totalblkcnt, (blks - s->s_freeblkcnt));
 922 
 923         lbp = s->s_lbp;
 924 
 925         for (li = 0; li < lbp->lb_loccnt; li++) {
 926                 if (lbp->lb_locators[li].l_flags & MDDB_F_DELETED)
 927                         continue;
 928                 ib = 0;
 929                 for (mbip = s->s_mbiarray[li]; mbip != NULL;
 930                     mbip = mbip->mbi_next) {
 931                         ib += (uint_t)mbip->mbi_mddb_mb.mb_blkcnt;
 932                 }
 933                 if (ib == 0)
 934                         continue;
 935                 if (ib < max_blk_needed) {
 936                         slp = &lbp->lb_sidelocators[s->s_sideno][li];
 937                         drv_index = slp->l_drvnm_index;
 938                         mddb_locatorblock2splitname(s->s_lnp, li, s->s_sideno,
 939                             &sn);
 940                         prefixlen = SPN_PREFIX(&sn).pre_len;
 941                         suffixlen = SPN_SUFFIX(&sn).suf_len;
 942                         alloc_sz = (int)(prefixlen + suffixlen + 2);
 943                         name = (char *)kmem_alloc(alloc_sz, KM_SLEEP);
 944                         (void) strncpy(name, SPN_PREFIX(&sn).pre_data,
 945                             prefixlen);
 946                         name[prefixlen] = '/';
 947                         suffix = name + (prefixlen + 1);
 948                         (void) strncpy(suffix, SPN_SUFFIX(&sn).suf_data,
 949                             suffixlen);
 950                         name[prefixlen + suffixlen + 1] = '\0';
 951                         cmn_err(CE_WARN,
 952                             "  %s (%s:%d:%d)   %d blocks",
 953                             name, lbp->lb_drvnm[drv_index].dn_data,
 954                             slp->l_mnum, lbp->lb_locators[li].l_blkno,
 955                             (max_blk_needed - ib));
 956                         kmem_free(name, alloc_sz);
 957                 }
 958         }
 959 }
 960 
 961 /*
 962  * md_create_minor_node:
 963  *      Create the minor device for the given set and un_self_id.
 964  *
 965  * Input:
 966  *      setno   - set number
 967  *      mnum    - selfID of unit
 968  *
 969  * Output:
 970  *      None.
 971  *
 972  * Returns 0 for success, 1 for failure.
 973  *
 974  * Side-effects:
 975  *      None.
 976  */
 977 int
 978 md_create_minor_node(set_t setno, minor_t mnum)
 979 {
 980         char            name[20];
 981 
 982         /* Check for valid arguments */
 983         if (setno >= MD_MAXSETS || MD_MIN2UNIT(mnum) >= MD_MAXUNITS)
 984                 return (1);
 985 
 986         (void) snprintf(name, 20, "%u,%u,blk",
 987             (unsigned)setno, (unsigned)MD_MIN2UNIT(mnum));
 988 
 989         if (ddi_create_minor_node(md_devinfo, name, S_IFBLK,
 990             MD_MKMIN(setno, mnum), DDI_PSEUDO, 0))
 991                 return (1);
 992 
 993         (void) snprintf(name, 20, "%u,%u,raw",
 994             (unsigned)setno, (unsigned)MD_MIN2UNIT(mnum));
 995 
 996         if (ddi_create_minor_node(md_devinfo, name, S_IFCHR,
 997             MD_MKMIN(setno, mnum), DDI_PSEUDO, 0))
 998                 return (1);
 999 
1000         return (0);
1001 }
1002 
1003 /*
1004  * For a given key check if it is an orphaned record.
1005  * The following conditions are used to determine an orphan.
1006  * 1. The device associated with that key is not a metadevice.
1007  * 2. If DEVID_STYLE then the physical device does not have a device Id
1008  * associated with it.
1009  *
1010  * If a key does not have an entry in the devid namespace it could be
1011  * a device that does not support device ids. Hence the record is not
1012  * deleted.
1013  */
1014 
1015 static int
1016 md_verify_orphaned_record(set_t setno, mdkey_t key)
1017 {
1018         md_dev64_t      odev; /* orphaned dev */
1019         mddb_set_t      *s;
1020         side_t          side = 0;
1021         struct nm_next_hdr      *did_nh = NULL;
1022 
1023         s = (mddb_set_t *)md_set[setno].s_db;
1024         if ((did_nh = get_first_record(setno, 1,  (NM_DEVID | NM_NOTSHARED)))
1025             == NULL)
1026                 return (0);
1027         /*
1028          * If devid style is set then get the dev_t using MD_NOTRUST_DEVT
1029          */
1030         if (s->s_lbp->lb_flags & MDDB_DEVID_STYLE) {
1031                 odev = md_getdevnum(setno, side, key, MD_NOTRUST_DEVT);
1032                 if ((odev == NODEV64) || (md_getmajor(odev) == md_major))
1033                         return (0);
1034                 if (lookup_entry(did_nh, setno, side, key, odev, NM_DEVID) ==
1035                     NULL)
1036                         return (1);
1037         }
1038         return (0);
1039 }
1040 
1041 int
1042 md_snarf_db_set(set_t setno, md_error_t *ep)
1043 {
1044         int                     err = 0;
1045         int                     i;
1046         mddb_recid_t            recid;
1047         mddb_type_t             drvrid;
1048         mddb_recstatus_t        status;
1049         md_ops_t                *ops;
1050         uint_t                  privat;
1051         mddb_set_t              *s;
1052         uint_t                  cvt_blks;
1053         struct nm_next_hdr      *nh;
1054         mdkey_t                 key = MD_KEYWILD;
1055         side_t                  side = 0;
1056         int                     size;
1057         int                     devid_flag;
1058         int                     retval;
1059         uint_t                  un;
1060         int                     un_next_set = 0;
1061 
1062         md_haltsnarf_enter(setno);
1063 
1064         mutex_enter(&md_mx);
1065         if (md_set[setno].s_status & MD_SET_SNARFED) {
1066                 mutex_exit(&md_mx);
1067                 md_haltsnarf_exit(setno);
1068                 return (0);
1069         }
1070         mutex_exit(&md_mx);
1071 
1072         if (! (md_get_status() & MD_GBL_DAEMONS_LIVE)) {
1073                 if (md_start_daemons(TRUE)) {
1074                         if (ep != NULL)
1075                                 (void) mdsyserror(ep, ENXIO);
1076                         err = -1;
1077                         goto out;
1078                 }
1079         }
1080 
1081 
1082         /*
1083          * Load the devid name space if it exists
1084          */
1085         (void) md_load_namespace(setno, NULL, NM_DEVID);
1086         if (!md_load_namespace(setno, ep, 0L)) {
1087                 /*
1088                  * Unload the devid namespace
1089                  */
1090                 (void) md_unload_namespace(setno, NM_DEVID);
1091                 err = -1;
1092                 goto out;
1093         }
1094 
1095         /*
1096          * If replica is in non-devid state, convert if:
1097          *      - not in probe during upgrade (md_keep_repl_state = 0)
1098          *      - enough space available in replica
1099          *      - local set
1100          *      - not a multi-node diskset
1101          *      - clustering is not present (for non-local set)
1102          */
1103         s = (mddb_set_t *)md_set[setno].s_db;
1104         devid_flag = 0;
1105         if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE) && !md_keep_repl_state)
1106                 devid_flag = 1;
1107         if (cluster_bootflags & CLUSTER_CONFIGURED)
1108                 if (setno != MD_LOCAL_SET)
1109                         devid_flag = 0;
1110         if (MD_MNSET_SETNO(setno))
1111                 devid_flag = 0;
1112         if ((md_devid_destroy == 1) && (md_keep_repl_state == 1))
1113                 devid_flag = 0;
1114 
1115         /*
1116          * if we weren't devid style before and md_keep_repl_state=1
1117          * we need to stay non-devid
1118          */
1119         if ((md_keep_repl_state == 1) &&
1120             ((s->s_lbp->lb_flags & MDDB_DEVID_STYLE) == 0))
1121                 devid_flag = 0;
1122         if (devid_flag) {
1123                 /*
1124                  * Determine number of free blocks needed to convert
1125                  * entire replica to device id format - locator blocks
1126                  * and namespace.
1127                  */
1128                 cvt_blks = 0;
1129                 if (mddb_lb_did_convert(s, 0, &cvt_blks) != 0) {
1130                         if (ep != NULL)
1131                                 (void) mdsyserror(ep, EIO);
1132                         err = -1;
1133                         goto out;
1134 
1135                 }
1136                 cvt_blks += md_nm_did_chkspace(setno);
1137 
1138                 /* add MDDB_DEVID_CONV_PERC% */
1139                 if ((md_conv_perc > 0) && (md_conv_perc <= 100)) {
1140                         cvt_blks = cvt_blks * (100 + md_conv_perc) / 100;
1141                 }
1142 
1143                 if (cvt_blks <= s->s_freeblkcnt) {
1144                         if (mddb_lb_did_convert(s, 1, &cvt_blks) != 0) {
1145                                 if (ep != NULL)
1146                                         (void) mdsyserror(ep, EIO);
1147                                 err = -1;
1148                                 goto out;
1149                         }
1150 
1151                 } else {
1152                         /*
1153                          * Print message that replica can't be converted for
1154                          * lack of space.   No failure - just continue to
1155                          * run without device ids.
1156                          */
1157                         cmn_err(CE_WARN,
1158                             "Unable to add Solaris Volume Manager device "
1159                             "relocation data.\n"
1160                             "          To use device relocation feature:\n"
1161                             "          - Increase size of listed replicas\n"
1162                             "          - Reboot");
1163                         md_print_block_usage(s, cvt_blks);
1164                         cmn_err(CE_WARN,
1165                             "Loading set without device relocation data.\n"
1166                             "          Solaris Volume Manager disk movement "
1167                             "not tracked in local set.");
1168                 }
1169         }
1170 
1171         /*
1172          * go through and load any modules referenced in
1173          * data base
1174          */
1175         recid = mddb_makerecid(setno, 0);
1176         while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) {
1177                 status = mddb_getrecstatus(recid);
1178                 if (status == MDDB_STALE) {
1179                         if (! (md_get_setstatus(setno) & MD_SET_STALE)) {
1180                                 md_set_setstatus(setno, MD_SET_STALE);
1181                                 cmn_err(CE_WARN,
1182                                     "md: state database is stale");
1183                         }
1184                 } else if (status == MDDB_NODATA) {
1185                         mddb_setrecprivate(recid, MD_PRV_PENDDEL);
1186                         continue;
1187                 }
1188                 drvrid = mddb_getrectype1(recid);
1189                 if (drvrid < MDDB_FIRST_MODID)
1190                         continue;
1191                 if (md_loadsubmod(setno, md_getshared_name(setno, drvrid),
1192                     drvrid) < 0) {
1193                         cmn_err(CE_NOTE, "md: could not load misc/%s",
1194                             md_getshared_name(setno, drvrid));
1195                 }
1196         }
1197 
1198         if (recid < 0)
1199                 goto out;
1200 
1201         snarf_user_data(setno);
1202 
1203         /*
1204          * Initialize the md_nm_snarfed array
1205          * this array is indexed by the key and
1206          * is set by md_getdevnum during the snarf time
1207          */
1208         if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) != NULL) {
1209                 size = (int)((((struct nm_rec_hdr *)nh->nmn_record)->
1210                     r_next_key) * (sizeof (int)));
1211                 md_nm_snarfed = (int *)kmem_zalloc(size, KM_SLEEP);
1212         }
1213 
1214         /*
1215          * go through and snarf until nothing gets added
1216          */
1217         do {
1218                 i = 0;
1219                 for (ops = md_opslist; ops != NULL; ops = ops->md_next) {
1220                         if (ops->md_snarf != NULL) {
1221                                 retval = ops->md_snarf(MD_SNARF_DOIT, setno);
1222                                 if (retval == -1) {
1223                                         err = -1;
1224                                         /* Don't know the failed unit */
1225                                         (void) mdmderror(ep, MDE_RR_ALLOC_ERROR,
1226                                             0);
1227                                         (void) md_halt_set(setno, MD_HALT_ALL);
1228                                         (void) mddb_unload_set(setno);
1229                                         md_haltsnarf_exit(setno);
1230                                         return (err);
1231                                 } else {
1232                                         i += retval;
1233                                 }
1234                         }
1235                 }
1236         } while (i);
1237 
1238         /*
1239          * Set the first available slot and availability
1240          */
1241         md_set[setno].s_un_avail = 0;
1242         for (un = 0; un < MD_MAXUNITS; un++) {
1243                 if (md_set[setno].s_un[un] != NULL) {
1244                         continue;
1245                 } else {
1246                         if (!un_next_set) {
1247                                 md_set[setno].s_un_next = un;
1248                                 un_next_set = 1;
1249                         }
1250                         md_set[setno].s_un_avail++;
1251                 }
1252         }
1253 
1254         md_set_setstatus(setno, MD_SET_SNARFED);
1255 
1256         recid = mddb_makerecid(setno, 0);
1257         while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) {
1258                 privat = mddb_getrecprivate(recid);
1259                 if (privat & MD_PRV_COMMIT) {
1260                         if (mddb_commitrec(recid)) {
1261                                 if (!(md_get_setstatus(setno) & MD_SET_STALE)) {
1262                                         md_set_setstatus(setno, MD_SET_STALE);
1263                                         cmn_err(CE_WARN,
1264                                             "md: state database is stale");
1265                                 }
1266                         }
1267                         mddb_setrecprivate(recid, MD_PRV_GOTIT);
1268                 }
1269         }
1270 
1271         /* Deletes must happen after all the commits */
1272         recid = mddb_makerecid(setno, 0);
1273         while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) {
1274                 privat = mddb_getrecprivate(recid);
1275                 if (privat & MD_PRV_DELETE) {
1276                         if (mddb_deleterec(recid)) {
1277                                 if (!(md_get_setstatus(setno) & MD_SET_STALE)) {
1278                                         md_set_setstatus(setno, MD_SET_STALE);
1279                                         cmn_err(CE_WARN,
1280                                             "md: state database is stale");
1281                                 }
1282                                 mddb_setrecprivate(recid, MD_PRV_GOTIT);
1283                         }
1284                         recid = mddb_makerecid(setno, 0);
1285                 }
1286         }
1287 
1288         /*
1289          * go through and clean up records until nothing gets cleaned up.
1290          */
1291         do {
1292                 i = 0;
1293                 for (ops = md_opslist; ops != NULL; ops = ops->md_next)
1294                         if (ops->md_snarf != NULL)
1295                                 i += ops->md_snarf(MD_SNARF_CLEANUP, setno);
1296         } while (i);
1297 
1298         if (md_nm_snarfed != NULL &&
1299             !(md_get_setstatus(setno) & MD_SET_STALE)) {
1300                 /*
1301                  * go thru and cleanup the namespace and the device id
1302                  * name space
1303                  */
1304                 for (key = 1;
1305                     key < ((struct nm_rec_hdr *)nh->nmn_record)->r_next_key;
1306                     key++) {
1307                         /*
1308                          * Is the entry an 'orphan'?
1309                          */
1310                         if (lookup_entry(nh, setno, side, key, NODEV64, 0L) !=
1311                             NULL) {
1312                                 /*
1313                                  * If the value is not set then apparently
1314                                  * it is not part of the current configuration,
1315                                  * remove it this can happen when system panic
1316                                  * between the primary name space update and
1317                                  * the device id name space update
1318                                  */
1319                                 if (md_nm_snarfed[key] == 0) {
1320                                         if (md_verify_orphaned_record(setno,
1321                                             key) == 1)
1322                                                 (void) remove_entry(nh,
1323                                                     side, key, 0L);
1324                                 }
1325                         }
1326                 }
1327         }
1328 
1329         if (md_nm_snarfed != NULL) {
1330                 /*
1331                  * Done and free the memory
1332                  */
1333                 kmem_free(md_nm_snarfed, size);
1334                 md_nm_snarfed = NULL;
1335         }
1336 
1337         if (s->s_lbp->lb_flags & MDDB_DEVID_STYLE &&
1338             !(md_get_setstatus(setno) & MD_SET_STALE)) {
1339                 /*
1340                  * if the destroy flag has been set and
1341                  * the MD_SET_DIDCLUP bit is not set in
1342                  * the set's status field, cleanup the
1343                  * entire device id namespace
1344                  */
1345                 if (md_devid_destroy &&
1346                     !(md_get_setstatus(setno) & MD_SET_DIDCLUP)) {
1347                         (void) md_devid_cleanup(setno, 1);
1348                         md_set_setstatus(setno, MD_SET_DIDCLUP);
1349                 } else
1350                         (void) md_devid_cleanup(setno, 0);
1351         }
1352 
1353         /*
1354          * clear single threading on snarf, return success or error
1355          */
1356 out:
1357         md_haltsnarf_exit(setno);
1358         return (err);
1359 }
1360 
1361 void
1362 get_minfo(struct dk_minfo *info, minor_t mnum)
1363 {
1364         md_unit_t       *un;
1365         mdi_unit_t      *ui;
1366 
1367         info->dki_capacity = 0;
1368         info->dki_lbsize = 0;
1369         info->dki_media_type = 0;
1370 
1371         if ((ui = MDI_UNIT(mnum)) == NULL) {
1372                 return;
1373         }
1374         un = (md_unit_t *)md_unit_readerlock(ui);
1375         info->dki_capacity = un->c.un_total_blocks;
1376         md_unit_readerexit(ui);
1377         info->dki_lbsize = DEV_BSIZE;
1378         info->dki_media_type = DK_UNKNOWN;
1379 }
1380 
1381 
1382 void
1383 get_info(struct dk_cinfo *info, minor_t mnum)
1384 {
1385         /*
1386          * Controller Information
1387          */
1388         info->dki_ctype = DKC_MD;
1389         info->dki_cnum = ddi_get_instance(ddi_get_parent(md_devinfo));
1390         (void) strcpy(info->dki_cname,
1391             ddi_get_name(ddi_get_parent(md_devinfo)));
1392         /*
1393          * Unit Information
1394          */
1395         info->dki_unit = mnum;
1396         info->dki_slave = 0;
1397         (void) strcpy(info->dki_dname, ddi_driver_name(md_devinfo));
1398         info->dki_flags = 0;
1399         info->dki_partition = 0;
1400         info->dki_maxtransfer = (ushort_t)(md_maxphys / DEV_BSIZE);
1401 
1402         /*
1403          * We can't get from here to there yet
1404          */
1405         info->dki_addr = 0;
1406         info->dki_space = 0;
1407         info->dki_prio = 0;
1408         info->dki_vec = 0;
1409 }
1410 
1411 /*
1412  * open admin device
1413  */
1414 static int
1415 mdadminopen(
1416         int     flag,
1417         int     otyp)
1418 {
1419         int     err = 0;
1420 
1421         /* single thread */
1422         mutex_enter(&md_mx);
1423 
1424         /* check type and flags */
1425         if ((otyp != OTYP_CHR) && (otyp != OTYP_LYR)) {
1426                 err = EINVAL;
1427                 goto out;
1428         }
1429         if (((flag & FEXCL) && (md_status & MD_GBL_OPEN)) ||
1430             (md_status & MD_GBL_EXCL)) {
1431                 err = EBUSY;
1432                 goto out;
1433         }
1434 
1435         /* count and flag open */
1436         md_ocnt[otyp]++;
1437         md_status |= MD_GBL_OPEN;
1438         if (flag & FEXCL)
1439                 md_status |= MD_GBL_EXCL;
1440 
1441         /* unlock return success */
1442 out:
1443         mutex_exit(&md_mx);
1444         return (err);
1445 }
1446 
1447 /*
1448  * open entry point
1449  */
1450 static int
1451 mdopen(
1452         dev_t           *dev,
1453         int             flag,
1454         int             otyp,
1455         cred_t          *cred_p)
1456 {
1457         minor_t         mnum = getminor(*dev);
1458         unit_t          unit = MD_MIN2UNIT(mnum);
1459         set_t           setno = MD_MIN2SET(mnum);
1460         mdi_unit_t      *ui = NULL;
1461         int             err = 0;
1462         md_parent_t     parent;
1463 
1464         /* dispatch admin device opens */
1465         if (mnum == MD_ADM_MINOR)
1466                 return (mdadminopen(flag, otyp));
1467 
1468         /* lock, check status */
1469         rw_enter(&md_unit_array_rw.lock, RW_READER);
1470 
1471 tryagain:
1472         if (md_get_status() & MD_GBL_HALTED)  {
1473                 err = ENODEV;
1474                 goto out;
1475         }
1476 
1477         /* check minor */
1478         if ((setno >= md_nsets) || (unit >= md_nunits)) {
1479                 err = ENXIO;
1480                 goto out;
1481         }
1482 
1483         /* make sure we're snarfed */
1484         if ((md_get_setstatus(MD_LOCAL_SET) & MD_SET_SNARFED) == 0) {
1485                 if (md_snarf_db_set(MD_LOCAL_SET, NULL) != 0) {
1486                         err = ENODEV;
1487                         goto out;
1488                 }
1489         }
1490         if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0) {
1491                 err = ENODEV;
1492                 goto out;
1493         }
1494 
1495         /* check unit */
1496         if ((ui = MDI_UNIT(mnum)) == NULL) {
1497                 err = ENXIO;
1498                 goto out;
1499         }
1500 
1501         /*
1502          * The softpart open routine may do an I/O during the open, in
1503          * which case the open routine will set the OPENINPROGRESS flag
1504          * and drop all locks during the I/O.  If this thread sees
1505          * the OPENINPROGRESS flag set, if should wait until the flag
1506          * is reset before calling the driver's open routine.  It must
1507          * also revalidate the world after it grabs the unit_array lock
1508          * since the set may have been released or the metadevice cleared
1509          * during the sleep.
1510          */
1511         if (MD_MNSET_SETNO(setno)) {
1512                 mutex_enter(&ui->ui_mx);
1513                 if (ui->ui_lock & MD_UL_OPENINPROGRESS) {
1514                         rw_exit(&md_unit_array_rw.lock);
1515                         cv_wait(&ui->ui_cv, &ui->ui_mx);
1516                         rw_enter(&md_unit_array_rw.lock, RW_READER);
1517                         mutex_exit(&ui->ui_mx);
1518                         goto tryagain;
1519                 }
1520                 mutex_exit(&ui->ui_mx);
1521         }
1522 
1523         /* Test if device is openable */
1524         if ((ui->ui_tstate & MD_NOTOPENABLE) != 0) {
1525                 err = ENXIO;
1526                 goto out;
1527         }
1528 
1529         /* don't allow opens w/WRITE flag if stale */
1530         if ((flag & FWRITE) && (md_get_setstatus(setno) & MD_SET_STALE)) {
1531                 err = EROFS;
1532                 goto out;
1533         }
1534 
1535         /* don't allow writes to subdevices */
1536         parent = md_get_parent(md_expldev(*dev));
1537         if ((flag & FWRITE) && MD_HAS_PARENT(parent)) {
1538                 err = EROFS;
1539                 goto out;
1540         }
1541 
1542         /* open underlying driver */
1543         if (md_ops[ui->ui_opsindex]->md_open != NULL) {
1544                 if ((err = (*md_ops[ui->ui_opsindex]->md_open)
1545                     (dev, flag, otyp, cred_p, 0)) != 0)
1546                         goto out;
1547         }
1548 
1549         /* or do it ourselves */
1550         else {
1551                 /* single thread */
1552                 (void) md_unit_openclose_enter(ui);
1553                 err = md_unit_incopen(mnum, flag, otyp);
1554                 md_unit_openclose_exit(ui);
1555                 if (err != 0)
1556                         goto out;
1557         }
1558 
1559         /* unlock, return status */
1560 out:
1561         rw_exit(&md_unit_array_rw.lock);
1562         return (err);
1563 }
1564 
1565 /*
1566  * close admin device
1567  */
1568 static int
1569 mdadminclose(
1570         int     otyp)
1571 {
1572         int     i;
1573         int     err = 0;
1574 
1575         /* single thread */
1576         mutex_enter(&md_mx);
1577 
1578         /* check type and flags */
1579         if ((otyp < 0) || (otyp >= OTYPCNT)) {
1580                 err = EINVAL;
1581                 goto out;
1582         } else if (md_ocnt[otyp] == 0) {
1583                 err = ENXIO;
1584                 goto out;
1585         }
1586 
1587         /* count and flag closed */
1588         if (otyp == OTYP_LYR)
1589                 md_ocnt[otyp]--;
1590         else
1591                 md_ocnt[otyp] = 0;
1592         md_status &= ~MD_GBL_OPEN;
1593         for (i = 0; (i < OTYPCNT); ++i)
1594                 if (md_ocnt[i] != 0)
1595                         md_status |= MD_GBL_OPEN;
1596         if (! (md_status & MD_GBL_OPEN))
1597                 md_status &= ~MD_GBL_EXCL;
1598 
1599         /* unlock return success */
1600 out:
1601         mutex_exit(&md_mx);
1602         return (err);
1603 }
1604 
1605 /*
1606  * close entry point
1607  */
1608 static int
1609 mdclose(
1610         dev_t           dev,
1611         int             flag,
1612         int             otyp,
1613         cred_t          *cred_p)
1614 {
1615         minor_t         mnum = getminor(dev);
1616         set_t           setno = MD_MIN2SET(mnum);
1617         unit_t          unit = MD_MIN2UNIT(mnum);
1618         mdi_unit_t      *ui = NULL;
1619         int             err = 0;
1620 
1621         /* dispatch admin device closes */
1622         if (mnum == MD_ADM_MINOR)
1623                 return (mdadminclose(otyp));
1624 
1625         /* check minor */
1626         if ((setno >= md_nsets) || (unit >= md_nunits) ||
1627             ((ui = MDI_UNIT(mnum)) == NULL)) {
1628                 err = ENXIO;
1629                 goto out;
1630         }
1631 
1632         /* close underlying driver */
1633         if (md_ops[ui->ui_opsindex]->md_close != NULL) {
1634                 if ((err = (*md_ops[ui->ui_opsindex]->md_close)
1635                     (dev, flag, otyp, cred_p, 0)) != 0)
1636                         goto out;
1637         }
1638 
1639         /* or do it ourselves */
1640         else {
1641                 /* single thread */
1642                 (void) md_unit_openclose_enter(ui);
1643                 err = md_unit_decopen(mnum, otyp);
1644                 md_unit_openclose_exit(ui);
1645                 if (err != 0)
1646                         goto out;
1647         }
1648 
1649         /* return success */
1650 out:
1651         return (err);
1652 }
1653 
1654 
1655 /*
1656  * This routine performs raw read operations.  It is called from the
1657  * device switch at normal priority.
1658  *
1659  * The main catch is that the *uio struct which is passed to us may
1660  * specify a read which spans two buffers, which would be contiguous
1661  * on a single partition,  but not on a striped partition. This will
1662  * be handled by mdstrategy.
1663  */
1664 /*ARGSUSED*/
1665 static int
1666 mdread(dev_t dev, struct uio *uio, cred_t *credp)
1667 {
1668         minor_t         mnum;
1669         mdi_unit_t      *ui;
1670         int             error;
1671 
1672         if (((mnum = getminor(dev)) == MD_ADM_MINOR) ||
1673             (MD_MIN2SET(mnum) >= md_nsets) ||
1674             (MD_MIN2UNIT(mnum) >= md_nunits) ||
1675             ((ui = MDI_UNIT(mnum)) == NULL))
1676                 return (ENXIO);
1677 
1678         if (md_ops[ui->ui_opsindex]->md_read  != NULL)
1679                 return ((*md_ops[ui->ui_opsindex]->md_read)
1680                     (dev, uio, credp));
1681 
1682         if ((error = md_chk_uio(uio)) != 0)
1683                 return (error);
1684 
1685         return (physio(mdstrategy, NULL, dev, B_READ, md_minphys, uio));
1686 }
1687 
1688 /*
1689  * This routine performs async raw read operations.  It is called from the
1690  * device switch at normal priority.
1691  *
1692  * The main catch is that the *aio struct which is passed to us may
1693  * specify a read which spans two buffers, which would be contiguous
1694  * on a single partition,  but not on a striped partition. This will
1695  * be handled by mdstrategy.
1696  */
1697 /*ARGSUSED*/
1698 static int
1699 mdaread(dev_t dev, struct aio_req *aio, cred_t *credp)
1700 {
1701         minor_t         mnum;
1702         mdi_unit_t      *ui;
1703         int             error;
1704 
1705 
1706         if (((mnum = getminor(dev)) == MD_ADM_MINOR) ||
1707             (MD_MIN2SET(mnum) >= md_nsets) ||
1708             (MD_MIN2UNIT(mnum) >= md_nunits) ||
1709             ((ui = MDI_UNIT(mnum)) == NULL))
1710                 return (ENXIO);
1711 
1712         if (md_ops[ui->ui_opsindex]->md_aread  != NULL)
1713                 return ((*md_ops[ui->ui_opsindex]->md_aread)
1714                     (dev, aio, credp));
1715 
1716         if ((error = md_chk_uio(aio->aio_uio)) != 0)
1717                 return (error);
1718 
1719         return (aphysio(mdstrategy, anocancel, dev, B_READ, md_minphys, aio));
1720 }
1721 
1722 /*
1723  * This routine performs raw write operations.  It is called from the
1724  * device switch at normal priority.
1725  *
1726  * The main catch is that the *uio struct which is passed to us may
1727  * specify a write which spans two buffers, which would be contiguous
1728  * on a single partition,  but not on a striped partition. This is
1729  * handled by mdstrategy.
1730  *
1731  */
1732 /*ARGSUSED*/
1733 static int
1734 mdwrite(dev_t dev, struct uio *uio, cred_t *credp)
1735 {
1736         minor_t         mnum;
1737         mdi_unit_t      *ui;
1738         int             error;
1739 
1740         if (((mnum = getminor(dev)) == MD_ADM_MINOR) ||
1741             (MD_MIN2SET(mnum) >= md_nsets) ||
1742             (MD_MIN2UNIT(mnum) >= md_nunits) ||
1743             ((ui = MDI_UNIT(mnum)) == NULL))
1744                 return (ENXIO);
1745 
1746         if (md_ops[ui->ui_opsindex]->md_write  != NULL)
1747                 return ((*md_ops[ui->ui_opsindex]->md_write)
1748                     (dev, uio, credp));
1749 
1750         if ((error = md_chk_uio(uio)) != 0)
1751                 return (error);
1752 
1753         return (physio(mdstrategy, NULL, dev, B_WRITE, md_minphys, uio));
1754 }
1755 
1756 /*
1757  * This routine performs async raw write operations.  It is called from the
1758  * device switch at normal priority.
1759  *
1760  * The main catch is that the *aio struct which is passed to us may
1761  * specify a write which spans two buffers, which would be contiguous
1762  * on a single partition,  but not on a striped partition. This is
1763  * handled by mdstrategy.
1764  *
1765  */
1766 /*ARGSUSED*/
1767 static int
1768 mdawrite(dev_t dev, struct aio_req *aio, cred_t *credp)
1769 {
1770         minor_t         mnum;
1771         mdi_unit_t      *ui;
1772         int             error;
1773 
1774 
1775         if (((mnum = getminor(dev)) == MD_ADM_MINOR) ||
1776             (MD_MIN2SET(mnum) >= md_nsets) ||
1777             (MD_MIN2UNIT(mnum) >= md_nunits) ||
1778             ((ui = MDI_UNIT(mnum)) == NULL))
1779                 return (ENXIO);
1780 
1781         if (md_ops[ui->ui_opsindex]->md_awrite  != NULL)
1782                 return ((*md_ops[ui->ui_opsindex]->md_awrite)
1783                     (dev, aio, credp));
1784 
1785         if ((error = md_chk_uio(aio->aio_uio)) != 0)
1786                 return (error);
1787 
1788         return (aphysio(mdstrategy, anocancel, dev, B_WRITE, md_minphys, aio));
1789 }
1790 
1791 int
1792 mdstrategy(struct buf *bp)
1793 {
1794         minor_t         mnum;
1795         mdi_unit_t      *ui;
1796 
1797         ASSERT((bp->b_flags & B_DONE) == 0);
1798 
1799         if (panicstr)
1800                 md_clr_status(MD_GBL_DAEMONS_LIVE);
1801 
1802         if (((mnum = getminor(bp->b_edev)) == MD_ADM_MINOR) ||
1803             (MD_MIN2SET(mnum) >= md_nsets) ||
1804             (MD_MIN2UNIT(mnum) >= md_nunits) ||
1805             ((ui = MDI_UNIT(mnum)) == NULL)) {
1806                 bp->b_flags |= B_ERROR;
1807                 bp->b_error = ENXIO;
1808                 bp->b_resid = bp->b_bcount;
1809                 biodone(bp);
1810                 return (0);
1811         }
1812 
1813         bp->b_flags &= ~(B_ERROR | B_DONE);
1814         if (md_ops[ui->ui_opsindex]->md_strategy  != NULL) {
1815                 (*md_ops[ui->ui_opsindex]->md_strategy) (bp, 0, NULL);
1816         } else {
1817                 (void) errdone(ui, bp, ENXIO);
1818         }
1819         return (0);
1820 }
1821 
1822 /*
1823  * Return true if the ioctl is allowed to be multithreaded.
1824  * All the ioctls with MN are sent only from the message handlers through
1825  * rpc.mdcommd, which (via it's own locking mechanism) takes care that not two
1826  * ioctl for the same metadevice are issued at the same time.
1827  * So we are safe here.
1828  * The other ioctls do not mess with any metadevice structures and therefor
1829  * are harmless too, if called multiple times at the same time.
1830  */
1831 static boolean_t
1832 is_mt_ioctl(int cmd) {
1833 
1834         switch (cmd) {
1835         case MD_IOCGUNIQMSGID:
1836         case MD_IOCGVERSION:
1837         case MD_IOCISOPEN:
1838         case MD_MN_SET_MM_OWNER:
1839         case MD_MN_SET_STATE:
1840         case MD_MN_SUSPEND_WRITES:
1841         case MD_MN_ALLOCATE_HOTSPARE:
1842         case MD_MN_SET_SETFLAGS:
1843         case MD_MN_GET_SETFLAGS:
1844         case MD_MN_MDDB_OPTRECFIX:
1845         case MD_MN_MDDB_PARSE:
1846         case MD_MN_MDDB_BLOCK:
1847         case MD_MN_DB_USERREQ:
1848         case MD_IOC_SPSTATUS:
1849         case MD_MN_COMMD_ERR:
1850         case MD_MN_SET_COMMD_RUNNING:
1851         case MD_MN_RESYNC:
1852         case MD_MN_SETSYNC:
1853         case MD_MN_POKE_HOTSPARES:
1854         case MD_MN_RR_DIRTY:
1855         case MD_MN_RR_CLEAN:
1856         case MD_MN_IOC_SPUPDATEWM:
1857                 return (1);
1858         default:
1859                 return (0);
1860         }
1861 }
1862 
1863 /*
1864  * This routine implements the ioctl calls for the Virtual Disk System.
1865  * It is called from the device switch at normal priority.
1866  */
1867 /* ARGSUSED */
1868 static int
1869 mdioctl(dev_t dev, int cmd, intptr_t data, int mode, cred_t *cred_p,
1870         int *rval_p)
1871 {
1872         minor_t         mnum = getminor(dev);
1873         mdi_unit_t      *ui;
1874         IOLOCK          lock;
1875         int             err;
1876 
1877         /*
1878          * For multinode disksets  number of ioctls are allowed to be
1879          * multithreaded.
1880          * A fundamental assumption made in this implementation is that
1881          * ioctls either do not interact with other md structures  or the
1882          * ioctl to the admin device can only occur if the metadevice
1883          * device is open. i.e. avoid a race between metaclear and the
1884          * progress of a multithreaded ioctl.
1885          */
1886 
1887         if (!is_mt_ioctl(cmd) && md_ioctl_lock_enter() == EINTR) {
1888                 return (EINTR);
1889         }
1890 
1891         /*
1892          * initialize lock tracker
1893          */
1894         IOLOCK_INIT(&lock);
1895 
1896         /* Flag to indicate that MD_GBL_IOCTL_LOCK is not acquired */
1897 
1898         if (is_mt_ioctl(cmd)) {
1899                 /* increment the md_mtioctl_cnt */
1900                 mutex_enter(&md_mx);
1901                 md_mtioctl_cnt++;
1902                 mutex_exit(&md_mx);
1903                 lock.l_flags |= MD_MT_IOCTL;
1904         }
1905 
1906         /*
1907          * this has been added to prevent notification from re-snarfing
1908          * so metaunload will work.  It may interfere with other modules
1909          * halt process.
1910          */
1911         if (md_get_status() & (MD_GBL_HALTED | MD_GBL_DAEMONS_DIE))
1912                 return (IOLOCK_RETURN(ENXIO, &lock));
1913 
1914         /*
1915          * admin device ioctls
1916          */
1917         if (mnum == MD_ADM_MINOR) {
1918                 err = md_admin_ioctl(md_expldev(dev), cmd, (void *) data,
1919                     mode, &lock);
1920         }
1921 
1922         /*
1923          * metadevice ioctls
1924          */
1925         else if ((MD_MIN2SET(mnum) >= md_nsets) ||
1926             (MD_MIN2UNIT(mnum) >= md_nunits) ||
1927             (md_set[MD_MIN2SET(mnum)].s_ui == NULL) ||
1928             ((ui = MDI_UNIT(mnum)) == NULL)) {
1929                 err = ENXIO;
1930         } else if (md_ops[ui->ui_opsindex]->md_ioctl == NULL) {
1931                 err = ENOTTY;
1932         } else {
1933                 err = (*md_ops[ui->ui_opsindex]->md_ioctl)
1934                     (dev, cmd, (void *) data, mode, &lock);
1935         }
1936 
1937         /*
1938          * drop any locks we grabbed
1939          */
1940         return (IOLOCK_RETURN_IOCTLEND(err, &lock));
1941 }
1942 
1943 static int
1944 mddump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
1945 {
1946         minor_t         mnum;
1947         set_t           setno;
1948         mdi_unit_t      *ui;
1949 
1950         if ((mnum = getminor(dev)) == MD_ADM_MINOR)
1951                 return (ENXIO);
1952 
1953         setno = MD_MIN2SET(mnum);
1954 
1955         if ((setno >= md_nsets) || (MD_MIN2UNIT(mnum) >= md_nunits) ||
1956             ((ui = MDI_UNIT(mnum)) == NULL))
1957                 return (ENXIO);
1958 
1959 
1960         if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0)
1961                 return (ENXIO);
1962 
1963         if (md_ops[ui->ui_opsindex]->md_dump  != NULL)
1964                 return ((*md_ops[ui->ui_opsindex]->md_dump)
1965                     (dev, addr, blkno, nblk));
1966 
1967         return (ENXIO);
1968 }
1969 
1970 /*
1971  * Metadevice unit number dispatcher
1972  * When this routine is called it will scan the
1973  * incore unit array and return the avail slot
1974  * hence the unit number to the caller
1975  *
1976  * Return -1 if there is nothing available
1977  */
1978 unit_t
1979 md_get_nextunit(set_t setno)
1980 {
1981         unit_t  un, start;
1982 
1983         /*
1984          * If nothing available
1985          */
1986         if (md_set[setno].s_un_avail == 0) {
1987                 return (MD_UNITBAD);
1988         }
1989 
1990         mutex_enter(&md_mx);
1991         start = un = md_set[setno].s_un_next;
1992 
1993         /* LINTED: E_CONSTANT_CONDITION */
1994         while (1) {
1995                 if (md_set[setno].s_un[un] == NULL) {
1996                         /*
1997                          * Advance the starting index for the next
1998                          * md_get_nextunit call
1999                          */
2000                         if (un == MD_MAXUNITS - 1) {
2001                                 md_set[setno].s_un_next = 0;
2002                         } else {
2003                                 md_set[setno].s_un_next = un + 1;
2004                         }
2005                         break;
2006                 }
2007 
2008                 un = ((un == MD_MAXUNITS - 1) ? 0 : un + 1);
2009 
2010                 if (un == start) {
2011                         un = MD_UNITBAD;
2012                         break;
2013                 }
2014 
2015         }
2016 
2017         mutex_exit(&md_mx);
2018         return (un);
2019 }