Print this page
7127 remove -Wno-missing-braces from Makefile.uts
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/common/io/lvm/md/md.c
+++ new/usr/src/uts/common/io/lvm/md/md.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 * Copyright (c) 2011 Bayard G. Bell. All rights reserved.
25 25 */
26 26
27 27 /*
28 28 * Md - is the meta-disk driver. It sits below the UFS file system
29 29 * but above the 'real' disk drivers, xy, id, sd etc.
30 30 *
31 31 * To the UFS software, md looks like a normal driver, since it has
32 32 * the normal kinds of entries in the bdevsw and cdevsw arrays. So
33 33 * UFS accesses md in the usual ways. In particular, the strategy
34 34 * routine, mdstrategy(), gets called by fbiwrite(), ufs_getapage(),
35 35 * and ufs_writelbn().
36 36 *
37 37 * Md maintains an array of minor devices (meta-partitions). Each
38 38 * meta partition stands for a matrix of real partitions, in rows
39 39 * which are not necessarily of equal length. Md maintains a table,
40 40 * with one entry for each meta-partition, which lists the rows and
41 41 * columns of actual partitions, and the job of the strategy routine
42 42 * is to translate from the meta-partition device and block numbers
43 43 * known to UFS into the actual partitions' device and block numbers.
44 44 *
45 45 * See below, in mdstrategy(), mdreal(), and mddone() for details of
46 46 * this translation.
47 47 */
48 48
49 49 /*
50 50 * Driver for Virtual Disk.
51 51 */
52 52
53 53 #include <sys/user.h>
54 54 #include <sys/sysmacros.h>
55 55 #include <sys/conf.h>
56 56 #include <sys/stat.h>
57 57 #include <sys/errno.h>
58 58 #include <sys/param.h>
59 59 #include <sys/systm.h>
60 60 #include <sys/file.h>
61 61 #include <sys/open.h>
62 62 #include <sys/dkio.h>
63 63 #include <sys/vtoc.h>
64 64 #include <sys/cmn_err.h>
65 65 #include <sys/ddi.h>
66 66 #include <sys/sunddi.h>
67 67 #include <sys/debug.h>
68 68 #include <sys/utsname.h>
69 69 #include <sys/lvm/mdvar.h>
70 70 #include <sys/lvm/md_names.h>
71 71 #include <sys/lvm/md_mddb.h>
72 72 #include <sys/lvm/md_sp.h>
73 73 #include <sys/types.h>
74 74 #include <sys/kmem.h>
75 75 #include <sys/cladm.h>
76 76 #include <sys/priv_names.h>
77 77 #include <sys/modhash.h>
78 78
79 79 int md_init_debug = 0; /* module binding debug */
80 80
81 81 /*
82 82 * Tunable to turn off the failfast behavior.
83 83 */
84 84 int md_ff_disable = 0;
85 85
86 86 /*
87 87 * dynamically allocated list of non FF driver names - needs to
88 88 * be freed when md is detached.
89 89 */
90 90 char **non_ff_drivers = NULL;
91 91
92 92 md_krwlock_t md_unit_array_rw; /* protects all unit arrays */
93 93 md_krwlock_t nm_lock; /* protects all the name spaces */
94 94
95 95 md_resync_t md_cpr_resync;
96 96
97 97 extern char svm_bootpath[];
98 98 #define SVM_PSEUDO_STR "/pseudo/md@0:"
99 99
100 100 #define VERSION_LENGTH 6
101 101 #define VERSION "1.0"
102 102
103 103 /*
104 104 * Keep track of possible 'orphan' entries in the name space
105 105 */
106 106 int *md_nm_snarfed = NULL;
107 107
108 108 /*
109 109 * Global tunable giving the percentage of free space left in replica during
110 110 * conversion of non-devid style replica to devid style replica.
111 111 */
112 112 int md_conv_perc = MDDB_DEVID_CONV_PERC;
113 113
114 114 #ifdef DEBUG
115 115 /* debug code to verify framework exclusion guarantees */
116 116 int md_in;
117 117 kmutex_t md_in_mx; /* used to md global stuff */
118 118 #define IN_INIT 0x01
119 119 #define IN_FINI 0x02
120 120 #define IN_ATTACH 0x04
121 121 #define IN_DETACH 0x08
122 122 #define IN_OPEN 0x10
123 123 #define MD_SET_IN(x) { \
124 124 mutex_enter(&md_in_mx); \
125 125 if (md_in) \
126 126 debug_enter("MD_SET_IN exclusion lost"); \
127 127 if (md_in & x) \
128 128 debug_enter("MD_SET_IN already set"); \
129 129 md_in |= x; \
130 130 mutex_exit(&md_in_mx); \
131 131 }
132 132
133 133 #define MD_CLR_IN(x) { \
134 134 mutex_enter(&md_in_mx); \
135 135 if (md_in & ~(x)) \
136 136 debug_enter("MD_CLR_IN exclusion lost"); \
137 137 if (!(md_in & x)) \
138 138 debug_enter("MD_CLR_IN already clr"); \
139 139 md_in &= ~x; \
140 140 mutex_exit(&md_in_mx); \
141 141 }
142 142 #else /* DEBUG */
143 143 #define MD_SET_IN(x)
144 144 #define MD_CLR_IN(x)
145 145 #endif /* DEBUG */
146 146 hrtime_t savetime1, savetime2;
147 147
148 148
149 149 /*
150 150 * list things protected by md_mx even if they aren't
151 151 * used in this file.
152 152 */
153 153 kmutex_t md_mx; /* used to md global stuff */
154 154 kcondvar_t md_cv; /* md_status events */
155 155 int md_status = 0; /* global status for the meta-driver */
156 156 int md_num_daemons = 0;
157 157 int md_ioctl_cnt = 0;
158 158 int md_mtioctl_cnt = 0; /* multithreaded ioctl cnt */
159 159 uint_t md_mdelay = 10; /* variable so can be patched */
160 160
161 161 int (*mdv_strategy_tstpnt)(buf_t *, int, void*);
162 162
163 163 major_t md_major, md_major_targ;
164 164
165 165 unit_t md_nunits = MD_MAXUNITS;
166 166 set_t md_nsets = MD_MAXSETS;
167 167 int md_nmedh = 0;
168 168 char *md_med_trans_lst = NULL;
169 169 md_set_t md_set[MD_MAXSETS];
170 170 md_set_io_t md_set_io[MD_MAXSETS];
171 171
172 172 md_krwlock_t hsp_rwlp; /* protects hot_spare_interface */
173 173 md_krwlock_t ni_rwlp; /* protects notify_interface */
174 174 md_ops_t **md_ops = NULL;
175 175 ddi_modhandle_t *md_mods = NULL;
176 176 md_ops_t *md_opslist;
177 177 clock_t md_hz;
178 178 md_event_queue_t *md_event_queue = NULL;
179 179
180 180 int md_in_upgrade;
181 181 int md_keep_repl_state;
182 182 int md_devid_destroy;
183 183
184 184 /* for sending messages thru a door to userland */
185 185 door_handle_t mdmn_door_handle = NULL;
186 186 int mdmn_door_did = -1;
187 187
188 188 dev_info_t *md_devinfo = NULL;
189 189
190 190 md_mn_nodeid_t md_mn_mynode_id = ~0u; /* My node id (for multi-node sets) */
191 191
192 192 static uint_t md_ocnt[OTYPCNT];
193 193
194 194 static int mdinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
195 195 static int mdattach(dev_info_t *, ddi_attach_cmd_t);
196 196 static int mddetach(dev_info_t *, ddi_detach_cmd_t);
197 197 static int mdopen(dev_t *, int, int, cred_t *);
198 198 static int mdclose(dev_t, int, int, cred_t *);
199 199 static int mddump(dev_t, caddr_t, daddr_t, int);
200 200 static int mdread(dev_t, struct uio *, cred_t *);
201 201 static int mdwrite(dev_t, struct uio *, cred_t *);
202 202 static int mdaread(dev_t, struct aio_req *, cred_t *);
203 203 static int mdawrite(dev_t, struct aio_req *, cred_t *);
204 204 static int mdioctl(dev_t, int, intptr_t, int, cred_t *, int *);
205 205 static int mdprop_op(dev_t, dev_info_t *,
206 206 ddi_prop_op_t, int, char *, caddr_t, int *);
207 207
/*
 * Character/block entry points for the md pseudo driver (cb_ops(9S)).
 * Entry points md does not support are stubbed with nulldev/nodev;
 * D_MP marks the driver as fully MP-safe.
 */
static struct cb_ops md_cb_ops = {
	mdopen,			/* open */
	mdclose,		/* close */
	mdstrategy,		/* strategy */
	/* print routine -- none yet */
	(int(*)(dev_t, char *))nulldev,
	mddump,			/* dump */
	mdread,			/* read */
	mdwrite,		/* write */
	mdioctl,		/* ioctl */
	/* devmap */
	(int(*)(dev_t, devmap_cookie_t, offset_t, size_t, size_t *,
	    uint_t))nodev,
	/* mmap */
	(int(*)(dev_t, off_t, int))nodev,
	/* segmap */
	(int(*)(dev_t, off_t, struct as *, caddr_t *, off_t, unsigned,
	    unsigned, unsigned, cred_t *))nodev,
	nochpoll,		/* poll */
	mdprop_op,		/* prop_op */
	0,			/* streamtab */
	(D_64BIT|D_MP|D_NEW),	/* driver compatibility flag */
	CB_REV,			/* cb_ops version */
	mdaread,		/* aread */
	mdawrite,		/* awrite */
};
234 234
/*
 * Autoconfiguration entry points (dev_ops(9S)); wired into the module
 * linkage below so the framework can attach/detach the driver.
 */
static struct dev_ops md_devops = {
	DEVO_REV,		/* dev_ops version */
	0,			/* device reference count */
	mdinfo,			/* info routine */
	nulldev,		/* identify routine */
	nulldev,		/* probe - not defined */
	mdattach,		/* attach routine */
	mddetach,		/* detach routine */
	nodev,			/* reset - not defined */
	&md_cb_ops,		/* driver operations */
	NULL,			/* bus operations */
	nodev,			/* power management */
	ddi_quiesce_not_needed,	/* quiesce */
};
249 249
250 250 /*
251 251 * loadable module wrapper
252 252 */
↓ open down ↓ |
252 lines elided |
↑ open up ↑ |
253 253 #include <sys/modctl.h>
254 254
255 255 static struct modldrv modldrv = {
256 256 &mod_driverops, /* type of module -- a pseudodriver */
257 257 "Solaris Volume Manager base module", /* name of the module */
258 258 &md_devops, /* driver ops */
259 259 };
260 260
261 261 static struct modlinkage modlinkage = {
262 262 MODREV_1,
263 - (void *)&modldrv,
264 - NULL
263 + { (void *)&modldrv, NULL }
265 264 };
266 265
267 266
268 267 /* md_medd.c */
269 268 extern void med_init(void);
270 269 extern void med_fini(void);
271 270 extern void md_devid_cleanup(set_t, uint_t);
272 271
273 272 /* md_names.c */
274 273 extern struct nm_next_hdr *get_first_record(set_t, int, int);
275 274
276 275 int md_maxphys = 0; /* maximum io size in bytes */
277 276 #define MD_MAXBCOUNT (1024 * 1024)
278 277 unsigned md_maxbcount = 0; /* maximum physio size in bytes */
279 278
280 279 /*
281 280 * Some md ioctls trigger io framework device tree operations. An
282 281 * example is md ioctls that call md_resolve_bydevid(): which uses the
283 282 * io framework to resolve a devid. Such operations result in acquiring
284 283 * io framework locks (like ndi_devi_enter() of "/") while holding
285 284 * driver locks (like md_unit_writerlock()).
286 285 *
287 286 * The prop_op(9E) entry point is called from the devinfo driver with
288 287 * an active ndi_devi_enter of "/". To avoid deadlock, md's prop_op
289 288 * implementation must avoid taking a lock that is held per above md
290 289 * ioctl description: i.e. mdprop_op(9E) can't call md_unit_readerlock()
291 290 * without risking deadlock.
292 291 *
293 292 * To service "size" requests without risking deadlock, we maintain a
294 293 * "mnum->nblocks" sizemap (protected by a short-term global mutex).
295 294 */
296 295 static kmutex_t md_nblocks_mutex;
297 296 static mod_hash_t *md_nblocksmap; /* mnum -> nblocks */
298 297 int md_nblocksmap_size = 512;
299 298
300 299 /*
301 300 * Maintain "mnum->nblocks" sizemap for mdprop_op use:
302 301 *
303 302 * Create: any code that establishes a unit's un_total_blocks needs the
304 303 * following type of call to establish nblocks for mdprop_op():
305 304 * md_nblocks_set(mnum, un->c.un_total_blocks);"
306 305 * NOTE: locate via cscope md_create_minor_node/md_create_unit_incore
307 306 * ...or "MD_UNIT..*="
308 307 *
309 308 * Change: any code that changes a unit's un_total_blocks needs the
310 309 * following type of call to sync nblocks for mdprop_op():
311 310 * md_nblocks_set(mnum, un->c.un_total_blocks);"
312 311 * NOTE: locate via cscope for "un_total_blocks[ \t]*="
313 312 *
314 313 * Destroy: any code that deletes a unit needs the following type of call
315 314 * to sync nblocks for mdprop_op():
316 315 * md_nblocks_set(mnum, -1ULL);
317 316 * NOTE: locate via cscope md_remove_minor_node/md_destroy_unit_incore
318 317 * ...or "MD_UNIT..*="
319 318 */
320 319 void
321 320 md_nblocks_set(minor_t mnum, uint64_t nblocks)
322 321 {
323 322 mutex_enter(&md_nblocks_mutex);
324 323 if (nblocks == -1ULL)
325 324 (void) mod_hash_destroy(md_nblocksmap,
326 325 (mod_hash_key_t)(intptr_t)mnum);
327 326 else
328 327 (void) mod_hash_replace(md_nblocksmap,
329 328 (mod_hash_key_t)(intptr_t)mnum,
330 329 (mod_hash_val_t)(intptr_t)nblocks);
331 330 mutex_exit(&md_nblocks_mutex);
332 331 }
333 332
334 333 /* get the size of a mnum from "mnum->nblocks" sizemap */
335 334 uint64_t
336 335 md_nblocks_get(minor_t mnum)
337 336 {
338 337 mod_hash_val_t hv;
339 338
340 339 mutex_enter(&md_nblocks_mutex);
341 340 if (mod_hash_find(md_nblocksmap,
342 341 (mod_hash_key_t)(intptr_t)mnum, &hv) == 0) {
343 342 mutex_exit(&md_nblocks_mutex);
344 343 return ((uint64_t)(intptr_t)hv);
345 344 }
346 345 mutex_exit(&md_nblocks_mutex);
347 346 return (0);
348 347 }
349 348
/*
 * allocate/free dynamic space associated with driver globals
 *
 * alloc != 0: initialize every driver-global lock/cv plus the per-set
 *	locks; alloc == 0: destroy them in the reverse order of
 *	initialization.  Called from _init()/_fini() (and on _init
 *	failure to unwind).
 */
void
md_global_alloc_free(int alloc)
{
	set_t	s;

	if (alloc) {
		/* initialize driver global locks */
		cv_init(&md_cv, NULL, CV_DEFAULT, NULL);
		mutex_init(&md_mx, NULL, MUTEX_DEFAULT, NULL);
		rw_init(&md_unit_array_rw.lock, NULL, RW_DEFAULT, NULL);
		rw_init(&nm_lock.lock, NULL, RW_DEFAULT, NULL);
		rw_init(&ni_rwlp.lock, NULL, RW_DRIVER, NULL);
		rw_init(&hsp_rwlp.lock, NULL, RW_DRIVER, NULL);
		mutex_init(&md_cpr_resync.md_resync_mutex, NULL,
		    MUTEX_DEFAULT, NULL);
		mutex_init(&md_nblocks_mutex, NULL, MUTEX_DEFAULT, NULL);

		/* initialize per set driver global locks */
		for (s = 0; s < MD_MAXSETS; s++) {
			/* initialize per set driver globals locks */
			mutex_init(&md_set[s].s_dbmx,
			    NULL, MUTEX_DEFAULT, NULL);
			mutex_init(&md_set_io[s].md_io_mx,
			    NULL, MUTEX_DEFAULT, NULL);
			cv_init(&md_set_io[s].md_io_cv,
			    NULL, CV_DEFAULT, NULL);
		}
	} else {
		/* destroy per set driver global locks */
		for (s = 0; s < MD_MAXSETS; s++) {
			cv_destroy(&md_set_io[s].md_io_cv);
			mutex_destroy(&md_set_io[s].md_io_mx);
			mutex_destroy(&md_set[s].s_dbmx);
		}

		/* destroy driver global locks (reverse of init order) */
		mutex_destroy(&md_nblocks_mutex);
		mutex_destroy(&md_cpr_resync.md_resync_mutex);
		rw_destroy(&hsp_rwlp.lock);
		rw_destroy(&ni_rwlp.lock);
		rw_destroy(&nm_lock.lock);
		rw_destroy(&md_unit_array_rw.lock);
		mutex_destroy(&md_mx);
		cv_destroy(&md_cv);
	}
}
397 396
/*
 * Loadable-module entry point: set up driver globals and register the
 * module with the framework.  On mod_install failure the global
 * allocations are torn down again before returning the error.
 */
int
_init(void)
{
	set_t	s;
	int	err;

	MD_SET_IN(IN_INIT);

	/* allocate dynamic space associated with driver globals */
	md_global_alloc_free(1);

	/* initialize driver globals */
	md_major = ddi_name_to_major("md");
	md_hz = drv_usectohz(NUM_USEC_IN_SEC);

	/* initialize tunable globals (only if not patched/preset) */
	if (md_maxphys == 0)		/* maximum io size in bytes */
		md_maxphys = maxphys;
	if (md_maxbcount == 0)		/* maximum physio size in bytes */
		md_maxbcount = MD_MAXBCOUNT;

	/* initialize per set driver globals */
	for (s = 0; s < MD_MAXSETS; s++)
		md_set_io[s].io_state = MD_SET_ACTIVE;

	/*
	 * NOTE: the framework does not currently guarantee exclusion
	 * between _init and attach after calling mod_install.
	 */
	MD_CLR_IN(IN_INIT);
	if ((err = mod_install(&modlinkage))) {
		/* unwind: mod_install failed, release global allocations */
		MD_SET_IN(IN_INIT);
		md_global_alloc_free(0);	/* free dynamic space */
		MD_CLR_IN(IN_INIT);
	}
	return (err);
}
435 434
/*
 * Loadable-module exit point: unregister the module and, only if that
 * succeeds, free the driver-global allocations made in _init().
 */
int
_fini(void)
{
	int	err;

	/*
	 * NOTE: the framework currently does not guarantee exclusion
	 * with attach until after mod_remove returns 0.
	 */
	if ((err = mod_remove(&modlinkage)))
		return (err);

	MD_SET_IN(IN_FINI);
	md_global_alloc_free(0);	/* free dynamic space */
	MD_CLR_IN(IN_FINI);
	return (err);
}
453 452
/* report module information (modinfo(1M)) via the module linkage */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}
459 458
/*
 * attach(9E) entry point.  Single-instance pseudo driver: fails any
 * command other than DDI_ATTACH and any second attach (md_devinfo set).
 * Initializes the metadevice database, daemons, mediator support and
 * tunables from driver.conf properties, creates the "admin" minor node,
 * and (when booting from a mirrored root) pre-creates the root
 * metadevice's minor node.  On any failure after mddb_init() it unwinds
 * by calling its own detach routine.
 */
/* ARGSUSED */
static int
mdattach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int	len;
	unit_t	i;
	size_t	sz;
	char	ver[VERSION_LENGTH];
	char	**maj_str_array;
	char	*str, *str2;

	MD_SET_IN(IN_ATTACH);
	md_in_upgrade = 0;
	md_keep_repl_state = 0;
	md_devid_destroy = 0;

	if (cmd != DDI_ATTACH) {
		MD_CLR_IN(IN_ATTACH);
		return (DDI_FAILURE);
	}

	/* only one instance of md is ever attached */
	if (md_devinfo != NULL) {
		MD_CLR_IN(IN_ATTACH);
		return (DDI_FAILURE);
	}

	mddb_init();

	if (md_start_daemons(TRUE)) {
		MD_CLR_IN(IN_ATTACH);
		mddb_unload();		/* undo mddb_init() allocations */
		return (DDI_FAILURE);
	}

	/* clear the halted state */
	md_clr_status(MD_GBL_HALTED);

	/* see if the diagnostic switch is on */
	if (ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "md_init_debug", 0))
		md_init_debug++;

	/* see if the failfast disable switch is on */
	if (ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "md_ff_disable", 0))
		md_ff_disable++;

	/* try and get the md_nmedh property; clamp to a sane host count */
	md_nmedh = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "md_nmedh", MED_DEF_HOSTS);
	if ((md_nmedh <= 0) || (md_nmedh > MED_MAX_HOSTS))
		md_nmedh = MED_DEF_HOSTS;

	/* try and get the md_med_trans_lst property; default is "tcp" */
	len = 0;
	if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN,
	    0, "md_med_trans_lst", NULL, &len) != DDI_PROP_SUCCESS ||
	    len == 0) {
		md_med_trans_lst = md_strdup("tcp");
	} else {
		md_med_trans_lst = kmem_zalloc((size_t)len, KM_SLEEP);
		if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN_AND_VAL_BUF,
		    0, "md_med_trans_lst", md_med_trans_lst, &len) !=
		    DDI_PROP_SUCCESS) {
			kmem_free(md_med_trans_lst, (size_t)len);
			md_med_trans_lst = md_strdup("tcp");
		}
	}

	/*
	 * Must initialize the internal data structures before the
	 * any possible calls to 'goto attach_failure' as _fini
	 * routine references them.
	 */
	med_init();

	md_ops = (md_ops_t **)kmem_zalloc(
	    sizeof (md_ops_t *) * MD_NOPS, KM_SLEEP);
	md_mods = (ddi_modhandle_t *)kmem_zalloc(
	    sizeof (ddi_modhandle_t) * MD_NOPS, KM_SLEEP);

	/* try and get the md_xlate property */
	/* Should we only do this if upgrade? */
	len = sizeof (char) * 5;
	if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN_AND_VAL_BUF,
	    0, "md_xlate_ver", ver, &len) == DDI_PROP_SUCCESS) {
		if (strcmp(ver, VERSION) == 0) {
			len = 0;
			if (ddi_prop_op(DDI_DEV_T_ANY, dip,
			    PROP_LEN_AND_VAL_ALLOC, 0, "md_xlate",
			    (caddr_t)&md_tuple_table, &len) !=
			    DDI_PROP_SUCCESS) {
				if (md_init_debug)
					cmn_err(CE_WARN,
					    "md_xlate ddi_prop_op failed");
				goto attach_failure;
			} else {
				/* table holds (old, new) dev32_t pairs */
				md_tuple_length =
				    len/(2 * ((int)sizeof (dev32_t)));
				md_in_upgrade = 1;
			}

			/* Get target's name to major table */
			if (ddi_prop_lookup_string_array(DDI_DEV_T_ANY,
			    dip, DDI_PROP_DONTPASS,
			    "md_targ_nm_table", &maj_str_array,
			    &md_majortab_len) != DDI_PROP_SUCCESS) {
				md_majortab_len = 0;
				if (md_init_debug)
					cmn_err(CE_WARN, "md_targ_nm_table "
					    "ddi_prop_lookup_string_array "
					    "failed");
				goto attach_failure;
			}

			md_major_tuple_table =
			    (struct md_xlate_major_table *)
			    kmem_zalloc(md_majortab_len *
			    sizeof (struct md_xlate_major_table), KM_SLEEP);

			/* each entry is "<drvname> <major>"; split in place */
			for (i = 0; i < md_majortab_len; i++) {
				/* Getting major name */
				str = strchr(maj_str_array[i], ' ');
				if (str == NULL)
					continue;
				*str = '\0';
				md_major_tuple_table[i].drv_name =
				    md_strdup(maj_str_array[i]);

				/* Simplified atoi to get major number */
				str2 = str + 1;
				md_major_tuple_table[i].targ_maj = 0;
				while ((*str2 >= '0') && (*str2 <= '9')) {
					md_major_tuple_table[i].targ_maj *= 10;
					md_major_tuple_table[i].targ_maj +=
					    *str2++ - '0';
				}
				*str = ' ';	/* restore the separator */
			}
			ddi_prop_free((void *)maj_str_array);
		} else {
			if (md_init_debug)
				cmn_err(CE_WARN, "md_xlate_ver is incorrect");
			goto attach_failure;
		}
	}

	/*
	 * Check for properties:
	 * md_keep_repl_state and md_devid_destroy
	 * and set globals if these exist.
	 */
	md_keep_repl_state = ddi_getprop(DDI_DEV_T_ANY, dip,
	    0, "md_keep_repl_state", 0);

	md_devid_destroy = ddi_getprop(DDI_DEV_T_ANY, dip,
	    0, "md_devid_destroy", 0);

	if (MD_UPGRADE)
		md_major_targ = md_targ_name_to_major("md");
	else
		md_major_targ = 0;

	/* allocate admin device node */
	if (ddi_create_priv_minor_node(dip, "admin", S_IFCHR,
	    MD_ADM_MINOR, DDI_PSEUDO, 0, NULL, PRIV_SYS_CONFIG, 0640))
		goto attach_failure;

	if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
	    DDI_KERNEL_IOCTL, NULL, 0) != DDI_SUCCESS)
		goto attach_failure;

	if (ddi_prop_update_int(DDI_DEV_T_NONE, dip,
	    "ddi-abrwrite-supported", 1) != DDI_SUCCESS)
		goto attach_failure;

	/* these could have been cleared by a detach */
	md_nunits = MD_MAXUNITS;
	md_nsets = MD_MAXSETS;

	sz = sizeof (void *) * MD_MAXUNITS;
	if (md_set[0].s_un == NULL)
		md_set[0].s_un = kmem_zalloc(sz, KM_SLEEP);
	if (md_set[0].s_ui == NULL)
		md_set[0].s_ui = kmem_zalloc(sz, KM_SLEEP);

	md_devinfo = dip;

	/*
	 * Only allocate device node for root mirror metadevice.
	 * Don't pre-allocate unnecessary device nodes (thus slowing down a
	 * boot when we attach).
	 * We can't read the mddbs in attach.  The mddbs will be read
	 * by metainit during the boot process when it is doing the
	 * auto-take processing and any other minor nodes will be
	 * allocated at that point.
	 *
	 * There are two scenarios to be aware of here:
	 * 1) when we are booting from a mirrored root we need the root
	 *    metadevice to exist very early (during vfs_mountroot processing)
	 * 2) we need all of the nodes to be created so that any mnttab entries
	 *    will succeed (handled by metainit reading the mddb during boot).
	 */
	if (strncmp(SVM_PSEUDO_STR, svm_bootpath, sizeof (SVM_PSEUDO_STR) - 1)
	    == 0) {
		char *p;
		int mnum = 0;

		/*
		 * The svm_bootpath string looks something like
		 * /pseudo/md@0:0,150,blk where 150 is the minor number
		 * in this example so we need to set the pointer p onto
		 * the first digit of the minor number and convert it
		 * from ascii.
		 */
		for (p = svm_bootpath + sizeof (SVM_PSEUDO_STR) + 1;
		    *p >= '0' && *p <= '9'; p++) {
			mnum *= 10;
			mnum += *p - '0';
		}

		if (md_create_minor_node(0, mnum)) {
			kmem_free(md_set[0].s_un, sz);
			kmem_free(md_set[0].s_ui, sz);
			goto attach_failure;
		}
	}

	/* create the hash to store the meta device sizes */
	md_nblocksmap = mod_hash_create_idhash("md_nblocksmap",
	    md_nblocksmap_size, mod_hash_null_valdtor);

	MD_CLR_IN(IN_ATTACH);
	return (DDI_SUCCESS);

attach_failure:
	/*
	 * Use our own detach routine to toss any stuff we allocated above.
	 * NOTE: detach will call md_halt to free the mddb_init allocations.
	 */
	MD_CLR_IN(IN_ATTACH);
	if (mddetach(dip, DDI_DETACH) != DDI_SUCCESS)
		cmn_err(CE_WARN, "detach from attach failed");
	return (DDI_FAILURE);
}
705 704
/*
 * detach(9E) entry point.  Refuses to detach unless the driver is (or
 * can be) halted, then frees everything mdattach() allocated: per-set
 * unit arrays, tunables, module tables, minor nodes, properties, the
 * mediator state and the nblocks sizemap.  Also used by mdattach() to
 * unwind a failed attach.
 */
/* ARGSUSED */
static int
mddetach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	extern int	check_active_locators();
	set_t		s;
	size_t		sz;
	int		len;

	MD_SET_IN(IN_DETACH);

	/* check command */
	if (cmd != DDI_DETACH) {
		MD_CLR_IN(IN_DETACH);
		return (DDI_FAILURE);
	}

	/*
	 * if we have not already halted yet we have no active config
	 * then automatically initiate a halt so we can detach.
	 */
	if (!(md_get_status() & MD_GBL_HALTED)) {
		if (check_active_locators() == 0) {
			/*
			 * NOTE: a successful md_halt will have done the
			 * mddb_unload to free allocations done in mddb_init
			 */
			if (md_halt(MD_NO_GBL_LOCKS_HELD)) {
				cmn_err(CE_NOTE, "md:detach: "
				    "Could not halt Solaris Volume Manager");
				MD_CLR_IN(IN_DETACH);
				return (DDI_FAILURE);
			}
		}

		/* fail detach if we have not halted */
		if (!(md_get_status() & MD_GBL_HALTED)) {
			MD_CLR_IN(IN_DETACH);
			return (DDI_FAILURE);
		}
	}

	/* must be in halted state, this will be cleared on next attach */
	ASSERT(md_get_status() & MD_GBL_HALTED);

	/* cleanup attach allocations and initializations */
	md_major_targ = 0;

	/* NOTE: sz assumes md_nunits still holds the value set in attach */
	sz = sizeof (void *) * md_nunits;
	for (s = 0; s < md_nsets; s++) {
		if (md_set[s].s_un != NULL) {
			kmem_free(md_set[s].s_un, sz);
			md_set[s].s_un = NULL;
		}

		if (md_set[s].s_ui != NULL) {
			kmem_free(md_set[s].s_ui, sz);
			md_set[s].s_ui = NULL;
		}
	}
	md_nunits = 0;
	md_nsets = 0;
	md_nmedh = 0;

	if (non_ff_drivers != NULL) {
		int	i;

		for (i = 0; non_ff_drivers[i] != NULL; i++)
			kmem_free(non_ff_drivers[i],
			    strlen(non_ff_drivers[i]) + 1);

		/* free i+1 entries because there is a null entry at list end */
		kmem_free(non_ff_drivers, (i + 1) * sizeof (char *));
		non_ff_drivers = NULL;
	}

	if (md_med_trans_lst != NULL) {
		kmem_free(md_med_trans_lst, strlen(md_med_trans_lst) + 1);
		md_med_trans_lst = NULL;
	}

	if (md_mods != NULL) {
		kmem_free(md_mods, sizeof (ddi_modhandle_t) * MD_NOPS);
		md_mods = NULL;
	}

	if (md_ops != NULL) {
		kmem_free(md_ops, sizeof (md_ops_t *) * MD_NOPS);
		md_ops = NULL;
	}

	if (MD_UPGRADE) {
		/* free the dev32_t pair table sized in mdattach() */
		len = md_tuple_length * (2 * ((int)sizeof (dev32_t)));
		md_in_upgrade = 0;
		md_xlate_free(len);
		md_majortab_free();
	}

	/*
	 * Undo what we did in mdattach, freeing resources
	 * and removing things we installed.  The system
	 * framework guarantees we are not active with this devinfo
	 * node in any other entry points at this time.
	 */
	ddi_prop_remove_all(dip);
	ddi_remove_minor_node(dip, NULL);

	med_fini();

	mod_hash_destroy_idhash(md_nblocksmap);

	md_devinfo = NULL;

	MD_CLR_IN(IN_DETACH);
	return (DDI_SUCCESS);
}
822 821
823 822
824 823 /*
825 824 * Given the device number return the devinfo pointer
826 825 * given to md via md_attach
827 826 */
828 827 /*ARGSUSED*/
829 828 static int
830 829 mdinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
831 830 {
832 831 int error = DDI_FAILURE;
833 832
834 833 switch (infocmd) {
835 834 case DDI_INFO_DEVT2DEVINFO:
836 835 if (md_devinfo) {
837 836 *result = (void *)md_devinfo;
838 837 error = DDI_SUCCESS;
839 838 }
840 839 break;
841 840
842 841 case DDI_INFO_DEVT2INSTANCE:
843 842 *result = (void *)0;
844 843 error = DDI_SUCCESS;
845 844 break;
846 845 }
847 846 return (error);
848 847 }
849 848
/*
 * property operation routine. return the number of blocks for the partition
 * in question or forward the request to the property facilities.
 *
 * Deliberately answers "nblocks" from the md_nblocksmap cache rather
 * than taking md_unit_readerlock(): prop_op is called with framework
 * locks held and taking unit locks here could deadlock (see the
 * comment above md_nblocks_mutex).
 */
static int
mdprop_op(
	dev_t dev,		/* device number associated with device */
	dev_info_t *dip,	/* device info struct for this device */
	ddi_prop_op_t prop_op,	/* property operator */
	int mod_flags,		/* property flags */
	char *name,		/* name of property */
	caddr_t valuep,		/* where to put property value */
	int *lengthp)		/* put length of property here */
{
	return (ddi_prop_op_nblocks(dev, dip, prop_op, mod_flags,
	    name, valuep, lengthp, md_nblocks_get(getminor(dev))));
}
867 866
868 867 static void
869 868 snarf_user_data(set_t setno)
870 869 {
871 870 mddb_recid_t recid;
872 871 mddb_recstatus_t status;
873 872
874 873 recid = mddb_makerecid(setno, 0);
875 874 while ((recid = mddb_getnextrec(recid, MDDB_USER, 0)) > 0) {
876 875 if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
877 876 continue;
878 877
879 878 status = mddb_getrecstatus(recid);
880 879 if (status == MDDB_STALE)
881 880 continue;
882 881
883 882 if (status == MDDB_NODATA) {
884 883 mddb_setrecprivate(recid, MD_PRV_PENDDEL);
885 884 continue;
886 885 }
887 886
888 887 ASSERT(status == MDDB_OK);
889 888
890 889 mddb_setrecprivate(recid, MD_PRV_GOTIT);
891 890 }
892 891 }
893 892
/*
 * Warn that the metadevice state database set "s" is "blks" blocks
 * short, and list each undersized (non-deleted) replica together with
 * how many more blocks it would need.  Called on allocation failure;
 * output goes to the console via cmn_err.
 */
static void
md_print_block_usage(mddb_set_t *s, uint_t blks)
{
	uint_t		ib;
	int		li;
	mddb_mb_ic_t	*mbip;
	uint_t		max_blk_needed;
	mddb_lb_t	*lbp;
	mddb_sidelocator_t	*slp;
	int		drv_index;
	md_splitname	sn;
	char		*name;
	char		*suffix;
	size_t		prefixlen;
	size_t		suffixlen;
	int		alloc_sz;


	/* total blocks a replica must hold to satisfy this request */
	max_blk_needed = s->s_totalblkcnt - s->s_freeblkcnt + blks;

	cmn_err(CE_WARN, "Blocks in Metadevice State Database: %d\n"
	    "         Additional Blocks Needed:            %d\n\n"
	    "         Increase size of following replicas for\n"
	    "         device relocatability by deleting listed\n"
	    "         replica and re-adding replica with\n"
	    "         increased size (see metadb(1M)):\n"
	    "             Replica                   Increase By",
	    s->s_totalblkcnt, (blks - s->s_freeblkcnt));

	lbp = s->s_lbp;

	for (li = 0; li < lbp->lb_loccnt; li++) {
		if (lbp->lb_locators[li].l_flags & MDDB_F_DELETED)
			continue;	/* skip deleted replicas */
		/* sum this replica's capacity over its master blocks */
		ib = 0;
		for (mbip = s->s_mbiarray[li]; mbip != NULL;
		    mbip = mbip->mbi_next) {
			ib += (uint_t)mbip->mbi_mddb_mb.mb_blkcnt;
		}
		if (ib == 0)
			continue;
		if (ib < max_blk_needed) {
			slp = &lbp->lb_sidelocators[s->s_sideno][li];
			drv_index = slp->l_drvnm_index;
			mddb_locatorblock2splitname(s->s_lnp, li, s->s_sideno,
			    &sn);
			prefixlen = SPN_PREFIX(&sn).pre_len;
			suffixlen = SPN_SUFFIX(&sn).suf_len;
			/* prefix + '/' + suffix + '\0' */
			alloc_sz = (int)(prefixlen + suffixlen + 2);
			name = (char *)kmem_alloc(alloc_sz, KM_SLEEP);
			/*
			 * Assemble "<prefix>/<suffix>" by hand: pre_data
			 * and suf_data are length-counted, not necessarily
			 * NUL-terminated, so strncpy + explicit terminator
			 * is deliberate here.
			 */
			(void) strncpy(name, SPN_PREFIX(&sn).pre_data,
			    prefixlen);
			name[prefixlen] = '/';
			suffix = name + (prefixlen + 1);
			(void) strncpy(suffix, SPN_SUFFIX(&sn).suf_data,
			    suffixlen);
			name[prefixlen + suffixlen + 1] = '\0';
			cmn_err(CE_WARN,
			    "  %s (%s:%d:%d)   %d blocks",
			    name, lbp->lb_drvnm[drv_index].dn_data,
			    slp->l_mnum, lbp->lb_locators[li].l_blkno,
			    (max_blk_needed - ib));
			kmem_free(name, alloc_sz);
		}
	}
}
960 959
961 960 /*
962 961 * md_create_minor_node:
963 962 * Create the minor device for the given set and un_self_id.
964 963 *
965 964 * Input:
966 965 * setno - set number
967 966 * mnum - selfID of unit
968 967 *
969 968 * Output:
970 969 * None.
971 970 *
972 971 * Returns 0 for success, 1 for failure.
973 972 *
974 973 * Side-effects:
975 974 * None.
976 975 */
977 976 int
978 977 md_create_minor_node(set_t setno, minor_t mnum)
979 978 {
980 979 char name[20];
981 980
982 981 /* Check for valid arguments */
983 982 if (setno >= MD_MAXSETS || MD_MIN2UNIT(mnum) >= MD_MAXUNITS)
984 983 return (1);
985 984
986 985 (void) snprintf(name, 20, "%u,%u,blk",
987 986 (unsigned)setno, (unsigned)MD_MIN2UNIT(mnum));
988 987
989 988 if (ddi_create_minor_node(md_devinfo, name, S_IFBLK,
990 989 MD_MKMIN(setno, mnum), DDI_PSEUDO, 0))
991 990 return (1);
992 991
993 992 (void) snprintf(name, 20, "%u,%u,raw",
994 993 (unsigned)setno, (unsigned)MD_MIN2UNIT(mnum));
995 994
996 995 if (ddi_create_minor_node(md_devinfo, name, S_IFCHR,
997 996 MD_MKMIN(setno, mnum), DDI_PSEUDO, 0))
998 997 return (1);
999 998
1000 999 return (0);
1001 1000 }
1002 1001
1003 1002 /*
1004 1003 * For a given key check if it is an orphaned record.
1005 1004 * The following conditions are used to determine an orphan.
1006 1005 * 1. The device associated with that key is not a metadevice.
1007 1006 * 2. If DEVID_STYLE then the physical device does not have a device Id
1008 1007 * associated with it.
1009 1008 *
1010 1009 * If a key does not have an entry in the devid namespace it could be
1011 1010 * a device that does not support device ids. Hence the record is not
1012 1011 * deleted.
1013 1012 */
1014 1013
1015 1014 static int
1016 1015 md_verify_orphaned_record(set_t setno, mdkey_t key)
1017 1016 {
1018 1017 md_dev64_t odev; /* orphaned dev */
1019 1018 mddb_set_t *s;
1020 1019 side_t side = 0;
1021 1020 struct nm_next_hdr *did_nh = NULL;
1022 1021
1023 1022 s = (mddb_set_t *)md_set[setno].s_db;
1024 1023 if ((did_nh = get_first_record(setno, 1, (NM_DEVID | NM_NOTSHARED)))
1025 1024 == NULL)
1026 1025 return (0);
1027 1026 /*
1028 1027 * If devid style is set then get the dev_t using MD_NOTRUST_DEVT
1029 1028 */
1030 1029 if (s->s_lbp->lb_flags & MDDB_DEVID_STYLE) {
1031 1030 odev = md_getdevnum(setno, side, key, MD_NOTRUST_DEVT);
1032 1031 if ((odev == NODEV64) || (md_getmajor(odev) == md_major))
1033 1032 return (0);
1034 1033 if (lookup_entry(did_nh, setno, side, key, odev, NM_DEVID) ==
1035 1034 NULL)
1036 1035 return (1);
1037 1036 }
1038 1037 return (0);
1039 1038 }
1040 1039
/*
 * md_snarf_db_set:
 * Bring the metadevice state database of `setno' fully in-core
 * ("snarf" it): optionally convert the replica to device-id style,
 * load the driver modules referenced by the database, let each loaded
 * module snarf its own records, then commit, delete and clean up
 * records and reconcile the primary and device-id namespaces.
 *
 * Input:
 *	setno	- set number to snarf
 *	ep	- optional error struct, filled in on failure (may be NULL)
 *
 * Returns 0 on success (including the already-snarfed case), -1 on
 * failure.  Single-threaded per set via md_haltsnarf_enter/exit.
 */
int
md_snarf_db_set(set_t setno, md_error_t *ep)
{
	int		err = 0;
	int		i;
	mddb_recid_t	recid;
	mddb_type_t	drvrid;
	mddb_recstatus_t	status;
	md_ops_t	*ops;
	uint_t		privat;
	mddb_set_t	*s;
	uint_t		cvt_blks;
	struct nm_next_hdr	*nh;
	mdkey_t	key = MD_KEYWILD;
	side_t	side = 0;
	int		size;
	int		devid_flag;
	int		retval;
	uint_t		un;
	int		un_next_set = 0;

	md_haltsnarf_enter(setno);

	/* nothing to do if some other thread already snarfed this set */
	mutex_enter(&md_mx);
	if (md_set[setno].s_status & MD_SET_SNARFED) {
		mutex_exit(&md_mx);
		md_haltsnarf_exit(setno);
		return (0);
	}
	mutex_exit(&md_mx);

	/* the md daemons must be running before records can be snarfed */
	if (! (md_get_status() & MD_GBL_DAEMONS_LIVE)) {
		if (md_start_daemons(TRUE)) {
			if (ep != NULL)
				(void) mdsyserror(ep, ENXIO);
			err = -1;
			goto out;
		}
	}


	/*
	 * Load the devid name space if it exists
	 */
	(void) md_load_namespace(setno, NULL, NM_DEVID);
	if (!md_load_namespace(setno, ep, 0L)) {
		/*
		 * Unload the devid namespace
		 */
		(void) md_unload_namespace(setno, NM_DEVID);
		err = -1;
		goto out;
	}

	/*
	 * If replica is in non-devid state, convert if:
	 *	- not in probe during upgrade (md_keep_repl_state = 0)
	 *	- enough space available in replica
	 *	- local set
	 *	- not a multi-node diskset
	 *	- clustering is not present (for non-local set)
	 */
	s = (mddb_set_t *)md_set[setno].s_db;
	devid_flag = 0;
	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE) && !md_keep_repl_state)
		devid_flag = 1;
	if (cluster_bootflags & CLUSTER_CONFIGURED)
		if (setno != MD_LOCAL_SET)
			devid_flag = 0;
	if (MD_MNSET_SETNO(setno))
		devid_flag = 0;
	if ((md_devid_destroy == 1) && (md_keep_repl_state == 1))
		devid_flag = 0;

	/*
	 * if we weren't devid style before and md_keep_repl_state=1
	 * we need to stay non-devid
	 */
	if ((md_keep_repl_state == 1) &&
	    ((s->s_lbp->lb_flags & MDDB_DEVID_STYLE) == 0))
		devid_flag = 0;
	if (devid_flag) {
		/*
		 * Determine number of free blocks needed to convert
		 * entire replica to device id format - locator blocks
		 * and namespace.
		 */
		cvt_blks = 0;
		/* first pass (arg 0): size the conversion only */
		if (mddb_lb_did_convert(s, 0, &cvt_blks) != 0) {
			if (ep != NULL)
				(void) mdsyserror(ep, EIO);
			err = -1;
			goto out;

		}
		cvt_blks += md_nm_did_chkspace(setno);

		/* add MDDB_DEVID_CONV_PERC% */
		if ((md_conv_perc > 0) && (md_conv_perc <= 100)) {
			cvt_blks = cvt_blks * (100 + md_conv_perc) / 100;
		}

		if (cvt_blks <= s->s_freeblkcnt) {
			/* second pass (arg 1): actually convert */
			if (mddb_lb_did_convert(s, 1, &cvt_blks) != 0) {
				if (ep != NULL)
					(void) mdsyserror(ep, EIO);
				err = -1;
				goto out;
			}

		} else {
			/*
			 * Print message that replica can't be converted for
			 * lack of space. No failure - just continue to
			 * run without device ids.
			 */
			cmn_err(CE_WARN,
			    "Unable to add Solaris Volume Manager device "
			    "relocation data.\n"
			    " To use device relocation feature:\n"
			    " - Increase size of listed replicas\n"
			    " - Reboot");
			md_print_block_usage(s, cvt_blks);
			cmn_err(CE_WARN,
			    "Loading set without device relocation data.\n"
			    " Solaris Volume Manager disk movement "
			    "not tracked in local set.");
		}
	}

	/*
	 * go through and load any modules referenced in
	 * data base
	 */
	recid = mddb_makerecid(setno, 0);
	while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) {
		status = mddb_getrecstatus(recid);
		if (status == MDDB_STALE) {
			/* warn once per set about staleness */
			if (! (md_get_setstatus(setno) & MD_SET_STALE)) {
				md_set_setstatus(setno, MD_SET_STALE);
				cmn_err(CE_WARN,
				    "md: state database is stale");
			}
		} else if (status == MDDB_NODATA) {
			/* empty record: mark for deletion later */
			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
			continue;
		}
		drvrid = mddb_getrectype1(recid);
		/* records below MDDB_FIRST_MODID need no module */
		if (drvrid < MDDB_FIRST_MODID)
			continue;
		if (md_loadsubmod(setno, md_getshared_name(setno, drvrid),
		    drvrid) < 0) {
			cmn_err(CE_NOTE, "md: could not load misc/%s",
			    md_getshared_name(setno, drvrid));
		}
	}

	/* mddb_getnextrec returned an error */
	if (recid < 0)
		goto out;

	snarf_user_data(setno);

	/*
	 * Initialize the md_nm_snarfed array
	 * this array is indexed by the key and
	 * is set by md_getdevnum during the snarf time
	 */
	if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) != NULL) {
		size = (int)((((struct nm_rec_hdr *)nh->nmn_record)->
		    r_next_key) * (sizeof (int)));
		md_nm_snarfed = (int *)kmem_zalloc(size, KM_SLEEP);
	}

	/*
	 * go through and snarf until nothing gets added
	 */
	do {
		i = 0;
		for (ops = md_opslist; ops != NULL; ops = ops->md_next) {
			if (ops->md_snarf != NULL) {
				retval = ops->md_snarf(MD_SNARF_DOIT, setno);
				if (retval == -1) {
					/* fatal: tear the set back down */
					err = -1;
					/* Don't know the failed unit */
					(void) mdmderror(ep, MDE_RR_ALLOC_ERROR,
					    0);
					(void) md_halt_set(setno, MD_HALT_ALL);
					(void) mddb_unload_set(setno);
					md_haltsnarf_exit(setno);
					return (err);
				} else {
					i += retval;
				}
			}
		}
	} while (i);

	/*
	 * Set the first available slot and availability
	 */
	md_set[setno].s_un_avail = 0;
	for (un = 0; un < MD_MAXUNITS; un++) {
		if (md_set[setno].s_un[un] != NULL) {
			continue;
		} else {
			/* remember the first free slot for md_get_nextunit */
			if (!un_next_set) {
				md_set[setno].s_un_next = un;
				un_next_set = 1;
			}
			md_set[setno].s_un_avail++;
		}
	}

	md_set_setstatus(setno, MD_SET_SNARFED);

	/* commit all records flagged MD_PRV_COMMIT during snarf */
	recid = mddb_makerecid(setno, 0);
	while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) {
		privat = mddb_getrecprivate(recid);
		if (privat & MD_PRV_COMMIT) {
			if (mddb_commitrec(recid)) {
				if (!(md_get_setstatus(setno) & MD_SET_STALE)) {
					md_set_setstatus(setno, MD_SET_STALE);
					cmn_err(CE_WARN,
					    "md: state database is stale");
				}
			}
			mddb_setrecprivate(recid, MD_PRV_GOTIT);
		}
	}

	/* Deletes must happen after all the commits */
	recid = mddb_makerecid(setno, 0);
	while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) {
		privat = mddb_getrecprivate(recid);
		if (privat & MD_PRV_DELETE) {
			if (mddb_deleterec(recid)) {
				if (!(md_get_setstatus(setno) & MD_SET_STALE)) {
					md_set_setstatus(setno, MD_SET_STALE);
					cmn_err(CE_WARN,
					    "md: state database is stale");
				}
				mddb_setrecprivate(recid, MD_PRV_GOTIT);
			}
			/* restart scan: deletion invalidates the cursor */
			recid = mddb_makerecid(setno, 0);
		}
	}

	/*
	 * go through and clean up records until nothing gets cleaned up.
	 */
	do {
		i = 0;
		for (ops = md_opslist; ops != NULL; ops = ops->md_next)
			if (ops->md_snarf != NULL)
				i += ops->md_snarf(MD_SNARF_CLEANUP, setno);
	} while (i);

	if (md_nm_snarfed != NULL &&
	    !(md_get_setstatus(setno) & MD_SET_STALE)) {
		/*
		 * go thru and cleanup the namespace and the device id
		 * name space
		 */
		for (key = 1;
		    key < ((struct nm_rec_hdr *)nh->nmn_record)->r_next_key;
		    key++) {
			/*
			 * Is the entry an 'orphan'?
			 */
			if (lookup_entry(nh, setno, side, key, NODEV64, 0L) !=
			    NULL) {
				/*
				 * If the value is not set then apparently
				 * it is not part of the current configuration,
				 * remove it this can happen when system panic
				 * between the primary name space update and
				 * the device id name space update
				 */
				if (md_nm_snarfed[key] == 0) {
					if (md_verify_orphaned_record(setno,
					    key) == 1)
						(void) remove_entry(nh,
						    side, key, 0L);
				}
			}
		}
	}

	if (md_nm_snarfed != NULL) {
		/*
		 * Done and free the memory
		 */
		kmem_free(md_nm_snarfed, size);
		md_nm_snarfed = NULL;
	}

	if (s->s_lbp->lb_flags & MDDB_DEVID_STYLE &&
	    !(md_get_setstatus(setno) & MD_SET_STALE)) {
		/*
		 * if the destroy flag has been set and
		 * the MD_SET_DIDCLUP bit is not set in
		 * the set's status field, cleanup the
		 * entire device id namespace
		 */
		if (md_devid_destroy &&
		    !(md_get_setstatus(setno) & MD_SET_DIDCLUP)) {
			(void) md_devid_cleanup(setno, 1);
			md_set_setstatus(setno, MD_SET_DIDCLUP);
		} else
			(void) md_devid_cleanup(setno, 0);
	}

	/*
	 * clear single threading on snarf, return success or error
	 */
out:
	md_haltsnarf_exit(setno);
	return (err);
}
1360 1359
1361 1360 void
1362 1361 get_minfo(struct dk_minfo *info, minor_t mnum)
1363 1362 {
1364 1363 md_unit_t *un;
1365 1364 mdi_unit_t *ui;
1366 1365
1367 1366 info->dki_capacity = 0;
1368 1367 info->dki_lbsize = 0;
1369 1368 info->dki_media_type = 0;
1370 1369
1371 1370 if ((ui = MDI_UNIT(mnum)) == NULL) {
1372 1371 return;
1373 1372 }
1374 1373 un = (md_unit_t *)md_unit_readerlock(ui);
1375 1374 info->dki_capacity = un->c.un_total_blocks;
1376 1375 md_unit_readerexit(ui);
1377 1376 info->dki_lbsize = DEV_BSIZE;
1378 1377 info->dki_media_type = DK_UNKNOWN;
1379 1378 }
1380 1379
1381 1380
/*
 * Fill in a dk_cinfo (controller/unit info) reply for unit `mnum'.
 * Controller fields come from md_devinfo's parent; unit fields are
 * mostly fixed, with max transfer derived from md_maxphys.
 */
void
get_info(struct dk_cinfo *info, minor_t mnum)
{
	/*
	 * Controller Information
	 */
	info->dki_ctype = DKC_MD;
	info->dki_cnum = ddi_get_instance(ddi_get_parent(md_devinfo));
	(void) strcpy(info->dki_cname,
	    ddi_get_name(ddi_get_parent(md_devinfo)));
	/*
	 * Unit Information
	 */
	info->dki_unit = mnum;
	info->dki_slave = 0;
	(void) strcpy(info->dki_dname, ddi_driver_name(md_devinfo));
	info->dki_flags = 0;
	info->dki_partition = 0;
	/* max transfer in DEV_BSIZE sectors */
	info->dki_maxtransfer = (ushort_t)(md_maxphys / DEV_BSIZE);

	/*
	 * We can't get from here to there yet
	 */
	info->dki_addr = 0;
	info->dki_space = 0;
	info->dki_prio = 0;
	info->dki_vec = 0;
}
1410 1409
1411 1410 /*
1412 1411 * open admin device
1413 1412 */
1414 1413 static int
1415 1414 mdadminopen(
1416 1415 int flag,
1417 1416 int otyp)
1418 1417 {
1419 1418 int err = 0;
1420 1419
1421 1420 /* single thread */
1422 1421 mutex_enter(&md_mx);
1423 1422
1424 1423 /* check type and flags */
1425 1424 if ((otyp != OTYP_CHR) && (otyp != OTYP_LYR)) {
1426 1425 err = EINVAL;
1427 1426 goto out;
1428 1427 }
1429 1428 if (((flag & FEXCL) && (md_status & MD_GBL_OPEN)) ||
1430 1429 (md_status & MD_GBL_EXCL)) {
1431 1430 err = EBUSY;
1432 1431 goto out;
1433 1432 }
1434 1433
1435 1434 /* count and flag open */
1436 1435 md_ocnt[otyp]++;
1437 1436 md_status |= MD_GBL_OPEN;
1438 1437 if (flag & FEXCL)
1439 1438 md_status |= MD_GBL_EXCL;
1440 1439
1441 1440 /* unlock return success */
1442 1441 out:
1443 1442 mutex_exit(&md_mx);
1444 1443 return (err);
1445 1444 }
1446 1445
1447 1446 /*
1448 1447 * open entry point
1449 1448 */
/*
 * open entry point
 *
 * Admin-device opens are dispatched to mdadminopen().  For metadevice
 * opens: validate the minor, make sure both the local set and the
 * target set are snarfed, then either hand the open to the unit's
 * driver md_open routine or account for it ourselves via
 * md_unit_incopen().  Runs under md_unit_array_rw as reader; for
 * multi-node sets it may drop that lock and retry (tryagain) while a
 * softpart open is in progress on the unit.
 *
 * Returns 0 on success or an errno (ENODEV/ENXIO/EROFS/driver error).
 */
static int
mdopen(
	dev_t		*dev,
	int		flag,
	int		otyp,
	cred_t		*cred_p)
{
	minor_t		mnum = getminor(*dev);
	unit_t		unit = MD_MIN2UNIT(mnum);
	set_t		setno = MD_MIN2SET(mnum);
	mdi_unit_t	*ui = NULL;
	int		err = 0;
	md_parent_t	parent;

	/* dispatch admin device opens */
	if (mnum == MD_ADM_MINOR)
		return (mdadminopen(flag, otyp));

	/* lock, check status */
	rw_enter(&md_unit_array_rw.lock, RW_READER);

tryagain:
	if (md_get_status() & MD_GBL_HALTED) {
		err = ENODEV;
		goto out;
	}

	/* check minor */
	if ((setno >= md_nsets) || (unit >= md_nunits)) {
		err = ENXIO;
		goto out;
	}

	/* make sure we're snarfed */
	if ((md_get_setstatus(MD_LOCAL_SET) & MD_SET_SNARFED) == 0) {
		if (md_snarf_db_set(MD_LOCAL_SET, NULL) != 0) {
			err = ENODEV;
			goto out;
		}
	}
	if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0) {
		err = ENODEV;
		goto out;
	}

	/* check unit */
	if ((ui = MDI_UNIT(mnum)) == NULL) {
		err = ENXIO;
		goto out;
	}

	/*
	 * The softpart open routine may do an I/O during the open, in
	 * which case the open routine will set the OPENINPROGRESS flag
	 * and drop all locks during the I/O. If this thread sees
	 * the OPENINPROGRESS flag set, if should wait until the flag
	 * is reset before calling the driver's open routine. It must
	 * also revalidate the world after it grabs the unit_array lock
	 * since the set may have been released or the metadevice cleared
	 * during the sleep.
	 */
	if (MD_MNSET_SETNO(setno)) {
		mutex_enter(&ui->ui_mx);
		if (ui->ui_lock & MD_UL_OPENINPROGRESS) {
			/* drop the array lock while sleeping on ui_cv */
			rw_exit(&md_unit_array_rw.lock);
			cv_wait(&ui->ui_cv, &ui->ui_mx);
			rw_enter(&md_unit_array_rw.lock, RW_READER);
			mutex_exit(&ui->ui_mx);
			/* world may have changed: revalidate everything */
			goto tryagain;
		}
		mutex_exit(&ui->ui_mx);
	}

	/* Test if device is openable */
	if ((ui->ui_tstate & MD_NOTOPENABLE) != 0) {
		err = ENXIO;
		goto out;
	}

	/* don't allow opens w/WRITE flag if stale */
	if ((flag & FWRITE) && (md_get_setstatus(setno) & MD_SET_STALE)) {
		err = EROFS;
		goto out;
	}

	/* don't allow writes to subdevices */
	parent = md_get_parent(md_expldev(*dev));
	if ((flag & FWRITE) && MD_HAS_PARENT(parent)) {
		err = EROFS;
		goto out;
	}

	/* open underlying driver */
	if (md_ops[ui->ui_opsindex]->md_open != NULL) {
		if ((err = (*md_ops[ui->ui_opsindex]->md_open)
		    (dev, flag, otyp, cred_p, 0)) != 0)
			goto out;
	}

	/* or do it ourselves */
	else {
		/* single thread */
		(void) md_unit_openclose_enter(ui);
		err = md_unit_incopen(mnum, flag, otyp);
		md_unit_openclose_exit(ui);
		if (err != 0)
			goto out;
	}

	/* unlock, return status */
out:
	rw_exit(&md_unit_array_rw.lock);
	return (err);
}
1564 1563
1565 1564 /*
1566 1565 * close admin device
1567 1566 */
1568 1567 static int
1569 1568 mdadminclose(
1570 1569 int otyp)
1571 1570 {
1572 1571 int i;
1573 1572 int err = 0;
1574 1573
1575 1574 /* single thread */
1576 1575 mutex_enter(&md_mx);
1577 1576
1578 1577 /* check type and flags */
1579 1578 if ((otyp < 0) || (otyp >= OTYPCNT)) {
1580 1579 err = EINVAL;
1581 1580 goto out;
1582 1581 } else if (md_ocnt[otyp] == 0) {
1583 1582 err = ENXIO;
1584 1583 goto out;
1585 1584 }
1586 1585
1587 1586 /* count and flag closed */
1588 1587 if (otyp == OTYP_LYR)
1589 1588 md_ocnt[otyp]--;
1590 1589 else
1591 1590 md_ocnt[otyp] = 0;
1592 1591 md_status &= ~MD_GBL_OPEN;
1593 1592 for (i = 0; (i < OTYPCNT); ++i)
1594 1593 if (md_ocnt[i] != 0)
1595 1594 md_status |= MD_GBL_OPEN;
1596 1595 if (! (md_status & MD_GBL_OPEN))
1597 1596 md_status &= ~MD_GBL_EXCL;
1598 1597
1599 1598 /* unlock return success */
1600 1599 out:
1601 1600 mutex_exit(&md_mx);
1602 1601 return (err);
1603 1602 }
1604 1603
1605 1604 /*
1606 1605 * close entry point
1607 1606 */
1608 1607 static int
1609 1608 mdclose(
1610 1609 dev_t dev,
1611 1610 int flag,
1612 1611 int otyp,
1613 1612 cred_t *cred_p)
1614 1613 {
1615 1614 minor_t mnum = getminor(dev);
1616 1615 set_t setno = MD_MIN2SET(mnum);
1617 1616 unit_t unit = MD_MIN2UNIT(mnum);
1618 1617 mdi_unit_t *ui = NULL;
1619 1618 int err = 0;
1620 1619
1621 1620 /* dispatch admin device closes */
1622 1621 if (mnum == MD_ADM_MINOR)
1623 1622 return (mdadminclose(otyp));
1624 1623
1625 1624 /* check minor */
1626 1625 if ((setno >= md_nsets) || (unit >= md_nunits) ||
1627 1626 ((ui = MDI_UNIT(mnum)) == NULL)) {
1628 1627 err = ENXIO;
1629 1628 goto out;
1630 1629 }
1631 1630
1632 1631 /* close underlying driver */
1633 1632 if (md_ops[ui->ui_opsindex]->md_close != NULL) {
1634 1633 if ((err = (*md_ops[ui->ui_opsindex]->md_close)
1635 1634 (dev, flag, otyp, cred_p, 0)) != 0)
1636 1635 goto out;
1637 1636 }
1638 1637
1639 1638 /* or do it ourselves */
1640 1639 else {
1641 1640 /* single thread */
1642 1641 (void) md_unit_openclose_enter(ui);
1643 1642 err = md_unit_decopen(mnum, otyp);
1644 1643 md_unit_openclose_exit(ui);
1645 1644 if (err != 0)
1646 1645 goto out;
1647 1646 }
1648 1647
1649 1648 /* return success */
1650 1649 out:
1651 1650 return (err);
1652 1651 }
1653 1652
1654 1653
1655 1654 /*
1656 1655 * This routine performs raw read operations. It is called from the
1657 1656 * device switch at normal priority.
1658 1657 *
1659 1658 * The main catch is that the *uio struct which is passed to us may
1660 1659 * specify a read which spans two buffers, which would be contiguous
1661 1660 * on a single partition, but not on a striped partition. This will
1662 1661 * be handled by mdstrategy.
1663 1662 */
1664 1663 /*ARGSUSED*/
1665 1664 static int
1666 1665 mdread(dev_t dev, struct uio *uio, cred_t *credp)
1667 1666 {
1668 1667 minor_t mnum;
1669 1668 mdi_unit_t *ui;
1670 1669 int error;
1671 1670
1672 1671 if (((mnum = getminor(dev)) == MD_ADM_MINOR) ||
1673 1672 (MD_MIN2SET(mnum) >= md_nsets) ||
1674 1673 (MD_MIN2UNIT(mnum) >= md_nunits) ||
1675 1674 ((ui = MDI_UNIT(mnum)) == NULL))
1676 1675 return (ENXIO);
1677 1676
1678 1677 if (md_ops[ui->ui_opsindex]->md_read != NULL)
1679 1678 return ((*md_ops[ui->ui_opsindex]->md_read)
1680 1679 (dev, uio, credp));
1681 1680
1682 1681 if ((error = md_chk_uio(uio)) != 0)
1683 1682 return (error);
1684 1683
1685 1684 return (physio(mdstrategy, NULL, dev, B_READ, md_minphys, uio));
1686 1685 }
1687 1686
1688 1687 /*
1689 1688 * This routine performs async raw read operations. It is called from the
1690 1689 * device switch at normal priority.
1691 1690 *
1692 1691 * The main catch is that the *aio struct which is passed to us may
1693 1692 * specify a read which spans two buffers, which would be contiguous
1694 1693 * on a single partition, but not on a striped partition. This will
1695 1694 * be handled by mdstrategy.
1696 1695 */
1697 1696 /*ARGSUSED*/
1698 1697 static int
1699 1698 mdaread(dev_t dev, struct aio_req *aio, cred_t *credp)
1700 1699 {
1701 1700 minor_t mnum;
1702 1701 mdi_unit_t *ui;
1703 1702 int error;
1704 1703
1705 1704
1706 1705 if (((mnum = getminor(dev)) == MD_ADM_MINOR) ||
1707 1706 (MD_MIN2SET(mnum) >= md_nsets) ||
1708 1707 (MD_MIN2UNIT(mnum) >= md_nunits) ||
1709 1708 ((ui = MDI_UNIT(mnum)) == NULL))
1710 1709 return (ENXIO);
1711 1710
1712 1711 if (md_ops[ui->ui_opsindex]->md_aread != NULL)
1713 1712 return ((*md_ops[ui->ui_opsindex]->md_aread)
1714 1713 (dev, aio, credp));
1715 1714
1716 1715 if ((error = md_chk_uio(aio->aio_uio)) != 0)
1717 1716 return (error);
1718 1717
1719 1718 return (aphysio(mdstrategy, anocancel, dev, B_READ, md_minphys, aio));
1720 1719 }
1721 1720
1722 1721 /*
1723 1722 * This routine performs raw write operations. It is called from the
1724 1723 * device switch at normal priority.
1725 1724 *
1726 1725 * The main catch is that the *uio struct which is passed to us may
1727 1726 * specify a write which spans two buffers, which would be contiguous
1728 1727 * on a single partition, but not on a striped partition. This is
1729 1728 * handled by mdstrategy.
1730 1729 *
1731 1730 */
1732 1731 /*ARGSUSED*/
1733 1732 static int
1734 1733 mdwrite(dev_t dev, struct uio *uio, cred_t *credp)
1735 1734 {
1736 1735 minor_t mnum;
1737 1736 mdi_unit_t *ui;
1738 1737 int error;
1739 1738
1740 1739 if (((mnum = getminor(dev)) == MD_ADM_MINOR) ||
1741 1740 (MD_MIN2SET(mnum) >= md_nsets) ||
1742 1741 (MD_MIN2UNIT(mnum) >= md_nunits) ||
1743 1742 ((ui = MDI_UNIT(mnum)) == NULL))
1744 1743 return (ENXIO);
1745 1744
1746 1745 if (md_ops[ui->ui_opsindex]->md_write != NULL)
1747 1746 return ((*md_ops[ui->ui_opsindex]->md_write)
1748 1747 (dev, uio, credp));
1749 1748
1750 1749 if ((error = md_chk_uio(uio)) != 0)
1751 1750 return (error);
1752 1751
1753 1752 return (physio(mdstrategy, NULL, dev, B_WRITE, md_minphys, uio));
1754 1753 }
1755 1754
1756 1755 /*
1757 1756 * This routine performs async raw write operations. It is called from the
1758 1757 * device switch at normal priority.
1759 1758 *
1760 1759 * The main catch is that the *aio struct which is passed to us may
1761 1760 * specify a write which spans two buffers, which would be contiguous
1762 1761 * on a single partition, but not on a striped partition. This is
1763 1762 * handled by mdstrategy.
1764 1763 *
1765 1764 */
1766 1765 /*ARGSUSED*/
1767 1766 static int
1768 1767 mdawrite(dev_t dev, struct aio_req *aio, cred_t *credp)
1769 1768 {
1770 1769 minor_t mnum;
1771 1770 mdi_unit_t *ui;
1772 1771 int error;
1773 1772
1774 1773
1775 1774 if (((mnum = getminor(dev)) == MD_ADM_MINOR) ||
1776 1775 (MD_MIN2SET(mnum) >= md_nsets) ||
1777 1776 (MD_MIN2UNIT(mnum) >= md_nunits) ||
1778 1777 ((ui = MDI_UNIT(mnum)) == NULL))
1779 1778 return (ENXIO);
1780 1779
1781 1780 if (md_ops[ui->ui_opsindex]->md_awrite != NULL)
1782 1781 return ((*md_ops[ui->ui_opsindex]->md_awrite)
1783 1782 (dev, aio, credp));
1784 1783
1785 1784 if ((error = md_chk_uio(aio->aio_uio)) != 0)
1786 1785 return (error);
1787 1786
1788 1787 return (aphysio(mdstrategy, anocancel, dev, B_WRITE, md_minphys, aio));
1789 1788 }
1790 1789
/*
 * Strategy entry point: route `bp' to the owning unit driver's
 * md_strategy routine, or fail the buf with ENXIO.  Invalid minors
 * (admin minor, out-of-range set/unit, missing unit) complete the buf
 * immediately with B_ERROR/ENXIO.  Always returns 0, per the
 * strategy(9E) convention; errors travel in the buf.
 */
int
mdstrategy(struct buf *bp)
{
	minor_t		mnum;
	mdi_unit_t	*ui;

	ASSERT((bp->b_flags & B_DONE) == 0);

	/* during panic, stop routing work through the daemons */
	if (panicstr)
		md_clr_status(MD_GBL_DAEMONS_LIVE);

	if (((mnum = getminor(bp->b_edev)) == MD_ADM_MINOR) ||
	    (MD_MIN2SET(mnum) >= md_nsets) ||
	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
	    ((ui = MDI_UNIT(mnum)) == NULL)) {
		/* fail the buf: nothing was transferred */
		bp->b_flags |= B_ERROR;
		bp->b_error = ENXIO;
		bp->b_resid = bp->b_bcount;
		biodone(bp);
		return (0);
	}

	bp->b_flags &= ~(B_ERROR | B_DONE);
	if (md_ops[ui->ui_opsindex]->md_strategy != NULL) {
		(*md_ops[ui->ui_opsindex]->md_strategy) (bp, 0, NULL);
	} else {
		(void) errdone(ui, bp, ENXIO);
	}
	return (0);
}
1821 1820
1822 1821 /*
1823 1822 * Return true if the ioctl is allowed to be multithreaded.
1824 1823 * All the ioctls with MN are sent only from the message handlers through
1825 1824 * rpc.mdcommd, which (via it's own locking mechanism) takes care that not two
1826 1825 * ioctl for the same metadevice are issued at the same time.
1827 1826 * So we are safe here.
1828 1827 * The other ioctls do not mess with any metadevice structures and therefor
1829 1828 * are harmless too, if called multiple times at the same time.
1830 1829 */
/*
 * Return non-zero when `cmd' is one of the ioctls that may run
 * multithreaded (see the block comment above for why these are safe).
 */
static boolean_t
is_mt_ioctl(int cmd) {

	switch (cmd) {
	case MD_IOCGUNIQMSGID:
	case MD_IOCGVERSION:
	case MD_IOCISOPEN:
	case MD_MN_SET_MM_OWNER:
	case MD_MN_SET_STATE:
	case MD_MN_SUSPEND_WRITES:
	case MD_MN_ALLOCATE_HOTSPARE:
	case MD_MN_SET_SETFLAGS:
	case MD_MN_GET_SETFLAGS:
	case MD_MN_MDDB_OPTRECFIX:
	case MD_MN_MDDB_PARSE:
	case MD_MN_MDDB_BLOCK:
	case MD_MN_DB_USERREQ:
	case MD_IOC_SPSTATUS:
	case MD_MN_COMMD_ERR:
	case MD_MN_SET_COMMD_RUNNING:
	case MD_MN_RESYNC:
	case MD_MN_SETSYNC:
	case MD_MN_POKE_HOTSPARES:
	case MD_MN_RR_DIRTY:
	case MD_MN_RR_CLEAN:
	case MD_MN_IOC_SPUPDATEWM:
		return (1);
	default:
		/* anything else must take the global ioctl lock */
		return (0);
	}
}
1862 1861
1863 1862 /*
1864 1863 * This routine implements the ioctl calls for the Virtual Disk System.
1865 1864 * It is called from the device switch at normal priority.
1866 1865 */
/* ARGSUSED */
/*
 * ioctl entry point: dispatch to the admin-device handler or to the
 * owning unit driver's md_ioctl routine.  Non-MT ioctls serialize on
 * the global ioctl lock; MT-safe ones (see is_mt_ioctl) instead bump
 * md_mtioctl_cnt and tag the IOLOCK so IOLOCK_RETURN* can undo it.
 */
static int
mdioctl(dev_t dev, int cmd, intptr_t data, int mode, cred_t *cred_p,
	int *rval_p)
{
	minor_t		mnum = getminor(dev);
	mdi_unit_t	*ui;
	IOLOCK		lock;
	int		err;

	/*
	 * For multinode disksets number of ioctls are allowed to be
	 * multithreaded.
	 * A fundamental assumption made in this implementation is that
	 * ioctls either do not interact with other md structures or the
	 * ioctl to the admin device can only occur if the metadevice
	 * device is open. i.e. avoid a race between metaclear and the
	 * progress of a multithreaded ioctl.
	 */

	if (!is_mt_ioctl(cmd) && md_ioctl_lock_enter() == EINTR) {
		return (EINTR);
	}

	/*
	 * initialize lock tracker
	 */
	IOLOCK_INIT(&lock);

	/* Flag to indicate that MD_GBL_IOCTL_LOCK is not acquired */

	if (is_mt_ioctl(cmd)) {
		/* increment the md_mtioctl_cnt */
		mutex_enter(&md_mx);
		md_mtioctl_cnt++;
		mutex_exit(&md_mx);
		lock.l_flags |= MD_MT_IOCTL;
	}

	/*
	 * this has been added to prevent notification from re-snarfing
	 * so metaunload will work. It may interfere with other modules
	 * halt process.
	 */
	if (md_get_status() & (MD_GBL_HALTED | MD_GBL_DAEMONS_DIE))
		return (IOLOCK_RETURN(ENXIO, &lock));

	/*
	 * admin device ioctls
	 */
	if (mnum == MD_ADM_MINOR) {
		err = md_admin_ioctl(md_expldev(dev), cmd, (void *) data,
		    mode, &lock);
	}

	/*
	 * metadevice ioctls
	 */
	else if ((MD_MIN2SET(mnum) >= md_nsets) ||
	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
	    (md_set[MD_MIN2SET(mnum)].s_ui == NULL) ||
	    ((ui = MDI_UNIT(mnum)) == NULL)) {
		err = ENXIO;
	} else if (md_ops[ui->ui_opsindex]->md_ioctl == NULL) {
		err = ENOTTY;
	} else {
		err = (*md_ops[ui->ui_opsindex]->md_ioctl)
		    (dev, cmd, (void *) data, mode, &lock);
	}

	/*
	 * drop any locks we grabbed
	 */
	return (IOLOCK_RETURN_IOCTLEND(err, &lock));
}
1942 1941
1943 1942 static int
1944 1943 mddump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
1945 1944 {
1946 1945 minor_t mnum;
1947 1946 set_t setno;
1948 1947 mdi_unit_t *ui;
1949 1948
1950 1949 if ((mnum = getminor(dev)) == MD_ADM_MINOR)
1951 1950 return (ENXIO);
1952 1951
1953 1952 setno = MD_MIN2SET(mnum);
1954 1953
1955 1954 if ((setno >= md_nsets) || (MD_MIN2UNIT(mnum) >= md_nunits) ||
1956 1955 ((ui = MDI_UNIT(mnum)) == NULL))
1957 1956 return (ENXIO);
1958 1957
1959 1958
1960 1959 if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0)
1961 1960 return (ENXIO);
1962 1961
1963 1962 if (md_ops[ui->ui_opsindex]->md_dump != NULL)
1964 1963 return ((*md_ops[ui->ui_opsindex]->md_dump)
1965 1964 (dev, addr, blkno, nblk));
1966 1965
1967 1966 return (ENXIO);
1968 1967 }
1969 1968
1970 1969 /*
1971 1970 * Metadevice unit number dispatcher
1972 1971 * When this routine is called it will scan the
1973 1972 * incore unit array and return the avail slot
1974 1973 * hence the unit number to the caller
1975 1974 *
1976 1975 * Return -1 if there is nothing available
1977 1976 */
/*
 * Metadevice unit number dispatcher
 * When this routine is called it will scan the
 * incore unit array and return the avail slot
 * hence the unit number to the caller
 *
 * Return -1 if there is nothing available
 */
unit_t
md_get_nextunit(set_t setno)
{
	unit_t	un, start;

	/*
	 * If nothing available
	 */
	if (md_set[setno].s_un_avail == 0) {
		return (MD_UNITBAD);
	}

	mutex_enter(&md_mx);
	/* circular scan beginning at the cached next-free hint */
	start = un = md_set[setno].s_un_next;

	/* LINTED: E_CONSTANT_CONDITION */
	while (1) {
		if (md_set[setno].s_un[un] == NULL) {
			/*
			 * Advance the starting index for the next
			 * md_get_nextunit call
			 */
			if (un == MD_MAXUNITS - 1) {
				md_set[setno].s_un_next = 0;
			} else {
				md_set[setno].s_un_next = un + 1;
			}
			break;
		}

		/* advance with wrap-around at MD_MAXUNITS */
		un = ((un == MD_MAXUNITS - 1) ? 0 : un + 1);

		/* full circle without a free slot */
		if (un == start) {
			un = MD_UNITBAD;
			break;
		}

	}

	mutex_exit(&md_mx);
	return (un);
}
↓ open down ↓ |
1745 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX