1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2011 Bayard G. Bell. All rights reserved. 25 */ 26 27 /* 28 * Md - is the meta-disk driver. It sits below the UFS file system 29 * but above the 'real' disk drivers, xy, id, sd etc. 30 * 31 * To the UFS software, md looks like a normal driver, since it has 32 * the normal kinds of entries in the bdevsw and cdevsw arrays. So 33 * UFS accesses md in the usual ways. In particular, the strategy 34 * routine, mdstrategy(), gets called by fbiwrite(), ufs_getapage(), 35 * and ufs_writelbn(). 36 * 37 * Md maintains an array of minor devices (meta-partitions). Each 38 * meta partition stands for a matrix of real partitions, in rows 39 * which are not necessarily of equal length. Md maintains a table, 40 * with one entry for each meta-partition, which lists the rows and 41 * columns of actual partitions, and the job of the strategy routine 42 * is to translate from the meta-partition device and block numbers 43 * known to UFS into the actual partitions' device and block numbers. 44 * 45 * See below, in mdstrategy(), mdreal(), and mddone() for details of 46 * this translation. 47 */ 48 49 /* 50 * Driver for Virtual Disk. 51 */ 52 53 #include <sys/user.h> 54 #include <sys/sysmacros.h> 55 #include <sys/conf.h> 56 #include <sys/stat.h> 57 #include <sys/errno.h> 58 #include <sys/param.h> 59 #include <sys/systm.h> 60 #include <sys/file.h> 61 #include <sys/open.h> 62 #include <sys/dkio.h> 63 #include <sys/vtoc.h> 64 #include <sys/cmn_err.h> 65 #include <sys/ddi.h> 66 #include <sys/sunddi.h> 67 #include <sys/debug.h> 68 #include <sys/utsname.h> 69 #include <sys/lvm/mdvar.h> 70 #include <sys/lvm/md_names.h> 71 #include <sys/lvm/md_mddb.h> 72 #include <sys/lvm/md_sp.h> 73 #include <sys/types.h> 74 #include <sys/kmem.h> 75 #include <sys/cladm.h> 76 #include <sys/priv_names.h> 77 #include <sys/modhash.h> 78 79 int md_init_debug = 0; /* module binding debug */ 80 81 /* 82 * Tunable to turn off the failfast behavior. 83 */ 84 int md_ff_disable = 0; 85 86 /* 87 * dynamically allocated list of non FF driver names - needs to 88 * be freed when md is detached. 89 */ 90 char **non_ff_drivers = NULL; 91 92 md_krwlock_t md_unit_array_rw; /* protects all unit arrays */ 93 md_krwlock_t nm_lock; /* protects all the name spaces */ 94 95 md_resync_t md_cpr_resync; 96 97 extern char svm_bootpath[]; 98 #define SVM_PSEUDO_STR "/pseudo/md@0:" 99 100 #define VERSION_LENGTH 6 101 #define VERSION "1.0" 102 103 /* 104 * Keep track of possible 'orphan' entries in the name space 105 */ 106 int *md_nm_snarfed = NULL; 107 108 /* 109 * Global tunable giving the percentage of free space left in replica during 110 * conversion of non-devid style replica to devid style replica. 111 */ 112 int md_conv_perc = MDDB_DEVID_CONV_PERC; 113 114 #ifdef DEBUG 115 /* debug code to verify framework exclusion guarantees */ 116 int md_in; 117 kmutex_t md_in_mx; /* used to md global stuff */ 118 #define IN_INIT 0x01 119 #define IN_FINI 0x02 120 #define IN_ATTACH 0x04 121 #define IN_DETACH 0x08 122 #define IN_OPEN 0x10 123 #define MD_SET_IN(x) { \ 124 mutex_enter(&md_in_mx); \ 125 if (md_in) \ 126 debug_enter("MD_SET_IN exclusion lost"); \ 127 if (md_in & x) \ 128 debug_enter("MD_SET_IN already set"); \ 129 md_in |= x; \ 130 mutex_exit(&md_in_mx); \ 131 } 132 133 #define MD_CLR_IN(x) { \ 134 mutex_enter(&md_in_mx); \ 135 if (md_in & ~(x)) \ 136 debug_enter("MD_CLR_IN exclusion lost"); \ 137 if (!(md_in & x)) \ 138 debug_enter("MD_CLR_IN already clr"); \ 139 md_in &= ~x; \ 140 mutex_exit(&md_in_mx); \ 141 } 142 #else /* DEBUG */ 143 #define MD_SET_IN(x) 144 #define MD_CLR_IN(x) 145 #endif /* DEBUG */ 146 hrtime_t savetime1, savetime2; 147 148 149 /* 150 * list things protected by md_mx even if they aren't 151 * used in this file. 152 */ 153 kmutex_t md_mx; /* used to md global stuff */ 154 kcondvar_t md_cv; /* md_status events */ 155 int md_status = 0; /* global status for the meta-driver */ 156 int md_num_daemons = 0; 157 int md_ioctl_cnt = 0; 158 int md_mtioctl_cnt = 0; /* multithreaded ioctl cnt */ 159 uint_t md_mdelay = 10; /* variable so can be patched */ 160 161 int (*mdv_strategy_tstpnt)(buf_t *, int, void*); 162 163 major_t md_major, md_major_targ; 164 165 unit_t md_nunits = MD_MAXUNITS; 166 set_t md_nsets = MD_MAXSETS; 167 int md_nmedh = 0; 168 char *md_med_trans_lst = NULL; 169 md_set_t md_set[MD_MAXSETS]; 170 md_set_io_t md_set_io[MD_MAXSETS]; 171 172 md_krwlock_t hsp_rwlp; /* protects hot_spare_interface */ 173 md_krwlock_t ni_rwlp; /* protects notify_interface */ 174 md_ops_t **md_ops = NULL; 175 ddi_modhandle_t *md_mods = NULL; 176 md_ops_t *md_opslist; 177 clock_t md_hz; 178 md_event_queue_t *md_event_queue = NULL; 179 180 int md_in_upgrade; 181 int md_keep_repl_state; 182 int md_devid_destroy; 183 184 /* for sending messages thru a door to userland */ 185 door_handle_t mdmn_door_handle = NULL; 186 int mdmn_door_did = -1; 187 188 dev_info_t *md_devinfo = NULL; 189 190 md_mn_nodeid_t md_mn_mynode_id = ~0u; /* My node id (for multi-node sets) */ 191 192 static uint_t md_ocnt[OTYPCNT]; 193 194 static int mdinfo(dev_info_t *, ddi_info_cmd_t, void *, void **); 195 static int mdattach(dev_info_t *, ddi_attach_cmd_t); 196 static int mddetach(dev_info_t *, ddi_detach_cmd_t); 197 static int mdopen(dev_t *, int, int, cred_t *); 198 static int mdclose(dev_t, int, int, cred_t *); 199 static int mddump(dev_t, caddr_t, daddr_t, int); 200 static int mdread(dev_t, struct uio *, cred_t *); 201 static int mdwrite(dev_t, struct uio *, cred_t *); 202 static int mdaread(dev_t, struct aio_req *, cred_t *); 203 static int mdawrite(dev_t, struct aio_req *, cred_t *); 204 static int mdioctl(dev_t, int, intptr_t, int, cred_t *, int *); 205 static int mdprop_op(dev_t, dev_info_t *, 206 ddi_prop_op_t, int, char *, caddr_t, int *); 207 208 static struct cb_ops md_cb_ops = { 209 mdopen, /* open */ 210 mdclose, /* close */ 211 mdstrategy, /* strategy */ 212 /* print routine -- none yet */ 213 (int(*)(dev_t, char *))nulldev, 214 mddump, /* dump */ 215 mdread, /* read */ 216 mdwrite, /* write */ 217 mdioctl, /* ioctl */ 218 /* devmap */ 219 (int(*)(dev_t, devmap_cookie_t, offset_t, size_t, size_t *, 220 uint_t))nodev, 221 /* mmap */ 222 (int(*)(dev_t, off_t, int))nodev, 223 /* segmap */ 224 (int(*)(dev_t, off_t, struct as *, caddr_t *, off_t, unsigned, 225 unsigned, unsigned, cred_t *))nodev, 226 nochpoll, /* poll */ 227 mdprop_op, /* prop_op */ 228 0, /* streamtab */ 229 (D_64BIT|D_MP|D_NEW), /* driver compatibility flag */ 230 CB_REV, /* cb_ops version */ 231 mdaread, /* aread */ 232 mdawrite, /* awrite */ 233 }; 234 235 static struct dev_ops md_devops = { 236 DEVO_REV, /* dev_ops version */ 237 0, /* device reference count */ 238 mdinfo, /* info routine */ 239 nulldev, /* identify routine */ 240 nulldev, /* probe - not defined */ 241 mdattach, /* attach routine */ 242 mddetach, /* detach routine */ 243 nodev, /* reset - not defined */ 244 &md_cb_ops, /* driver operations */ 245 NULL, /* bus operations */ 246 nodev, /* power management */ 247 ddi_quiesce_not_needed, /* quiesce */ 248 }; 249 250 /* 251 * loadable module wrapper 252 */ 253 #include <sys/modctl.h> 254 255 static struct modldrv modldrv = { 256 &mod_driverops, /* type of module -- a pseudodriver */ 257 "Solaris Volume Manager base module", /* name of the module */ 258 &md_devops, /* driver ops */ 259 }; 260 261 static struct modlinkage modlinkage = { 262 MODREV_1, 263 { (void *)&modldrv, NULL } 264 }; 265 266 267 /* md_medd.c */ 268 extern void med_init(void); 269 extern void med_fini(void); 270 extern void md_devid_cleanup(set_t, uint_t); 271 272 /* md_names.c */ 273 extern struct nm_next_hdr *get_first_record(set_t, int, int); 274 275 int md_maxphys = 0; /* maximum io size in bytes */ 276 #define MD_MAXBCOUNT (1024 * 1024) 277 unsigned md_maxbcount = 0; /* maximum physio size in bytes */ 278 279 /* 280 * Some md ioctls trigger io framework device tree operations. An 281 * example is md ioctls that call md_resolve_bydevid(): which uses the 282 * io framework to resolve a devid. Such operations result in acquiring 283 * io framework locks (like ndi_devi_enter() of "/") while holding 284 * driver locks (like md_unit_writerlock()). 285 * 286 * The prop_op(9E) entry point is called from the devinfo driver with 287 * an active ndi_devi_enter of "/". To avoid deadlock, md's prop_op 288 * implementation must avoid taking a lock that is held per above md 289 * ioctl description: i.e. mdprop_op(9E) can't call md_unit_readerlock() 290 * without risking deadlock. 291 * 292 * To service "size" requests without risking deadlock, we maintain a 293 * "mnum->nblocks" sizemap (protected by a short-term global mutex). 294 */ 295 static kmutex_t md_nblocks_mutex; 296 static mod_hash_t *md_nblocksmap; /* mnum -> nblocks */ 297 int md_nblocksmap_size = 512; 298 299 /* 300 * Maintain "mnum->nblocks" sizemap for mdprop_op use: 301 * 302 * Create: any code that establishes a unit's un_total_blocks needs the 303 * following type of call to establish nblocks for mdprop_op(): 304 * md_nblocks_set(mnum, un->c.un_total_blocks);" 305 * NOTE: locate via cscope md_create_minor_node/md_create_unit_incore 306 * ...or "MD_UNIT..*=" 307 * 308 * Change: any code that changes a unit's un_total_blocks needs the 309 * following type of call to sync nblocks for mdprop_op(): 310 * md_nblocks_set(mnum, un->c.un_total_blocks);" 311 * NOTE: locate via cscope for "un_total_blocks[ \t]*=" 312 * 313 * Destroy: any code that deletes a unit needs the following type of call 314 * to sync nblocks for mdprop_op(): 315 * md_nblocks_set(mnum, -1ULL); 316 * NOTE: locate via cscope md_remove_minor_node/md_destroy_unit_incore 317 * ...or "MD_UNIT..*=" 318 */ 319 void 320 md_nblocks_set(minor_t mnum, uint64_t nblocks) 321 { 322 mutex_enter(&md_nblocks_mutex); 323 if (nblocks == -1ULL) 324 (void) mod_hash_destroy(md_nblocksmap, 325 (mod_hash_key_t)(intptr_t)mnum); 326 else 327 (void) mod_hash_replace(md_nblocksmap, 328 (mod_hash_key_t)(intptr_t)mnum, 329 (mod_hash_val_t)(intptr_t)nblocks); 330 mutex_exit(&md_nblocks_mutex); 331 } 332 333 /* get the size of a mnum from "mnum->nblocks" sizemap */ 334 uint64_t 335 md_nblocks_get(minor_t mnum) 336 { 337 mod_hash_val_t hv; 338 339 mutex_enter(&md_nblocks_mutex); 340 if (mod_hash_find(md_nblocksmap, 341 (mod_hash_key_t)(intptr_t)mnum, &hv) == 0) { 342 mutex_exit(&md_nblocks_mutex); 343 return ((uint64_t)(intptr_t)hv); 344 } 345 mutex_exit(&md_nblocks_mutex); 346 return (0); 347 } 348 349 /* allocate/free dynamic space associated with driver globals */ 350 void 351 md_global_alloc_free(int alloc) 352 { 353 set_t s; 354 355 if (alloc) { 356 /* initialize driver global locks */ 357 cv_init(&md_cv, NULL, CV_DEFAULT, NULL); 358 mutex_init(&md_mx, NULL, MUTEX_DEFAULT, NULL); 359 rw_init(&md_unit_array_rw.lock, NULL, RW_DEFAULT, NULL); 360 rw_init(&nm_lock.lock, NULL, RW_DEFAULT, NULL); 361 rw_init(&ni_rwlp.lock, NULL, RW_DRIVER, NULL); 362 rw_init(&hsp_rwlp.lock, NULL, RW_DRIVER, NULL); 363 mutex_init(&md_cpr_resync.md_resync_mutex, NULL, 364 MUTEX_DEFAULT, NULL); 365 mutex_init(&md_nblocks_mutex, NULL, MUTEX_DEFAULT, NULL); 366 367 /* initialize per set driver global locks */ 368 for (s = 0; s < MD_MAXSETS; s++) { 369 /* initialize per set driver globals locks */ 370 mutex_init(&md_set[s].s_dbmx, 371 NULL, MUTEX_DEFAULT, NULL); 372 mutex_init(&md_set_io[s].md_io_mx, 373 NULL, MUTEX_DEFAULT, NULL); 374 cv_init(&md_set_io[s].md_io_cv, 375 NULL, CV_DEFAULT, NULL); 376 } 377 } else { 378 /* destroy per set driver global locks */ 379 for (s = 0; s < MD_MAXSETS; s++) { 380 cv_destroy(&md_set_io[s].md_io_cv); 381 mutex_destroy(&md_set_io[s].md_io_mx); 382 mutex_destroy(&md_set[s].s_dbmx); 383 } 384 385 /* destroy driver global locks */ 386 mutex_destroy(&md_nblocks_mutex); 387 mutex_destroy(&md_cpr_resync.md_resync_mutex); 388 rw_destroy(&hsp_rwlp.lock); 389 rw_destroy(&ni_rwlp.lock); 390 rw_destroy(&nm_lock.lock); 391 rw_destroy(&md_unit_array_rw.lock); 392 mutex_destroy(&md_mx); 393 cv_destroy(&md_cv); 394 } 395 } 396 397 int 398 _init(void) 399 { 400 set_t s; 401 int err; 402 403 MD_SET_IN(IN_INIT); 404 405 /* allocate dynamic space associated with driver globals */ 406 md_global_alloc_free(1); 407 408 /* initialize driver globals */ 409 md_major = ddi_name_to_major("md"); 410 md_hz = drv_usectohz(NUM_USEC_IN_SEC); 411 412 /* initialize tunable globals */ 413 if (md_maxphys == 0) /* maximum io size in bytes */ 414 md_maxphys = maxphys; 415 if (md_maxbcount == 0) /* maximum physio size in bytes */ 416 md_maxbcount = MD_MAXBCOUNT; 417 418 /* initialize per set driver globals */ 419 for (s = 0; s < MD_MAXSETS; s++) 420 md_set_io[s].io_state = MD_SET_ACTIVE; 421 422 /* 423 * NOTE: the framework does not currently guarantee exclusion 424 * between _init and attach after calling mod_install. 425 */ 426 MD_CLR_IN(IN_INIT); 427 if ((err = mod_install(&modlinkage))) { 428 MD_SET_IN(IN_INIT); 429 md_global_alloc_free(0); /* free dynamic space */ 430 MD_CLR_IN(IN_INIT); 431 } 432 return (err); 433 } 434 435 int 436 _fini(void) 437 { 438 int err; 439 440 /* 441 * NOTE: the framework currently does not guarantee exclusion 442 * with attach until after mod_remove returns 0. 443 */ 444 if ((err = mod_remove(&modlinkage))) 445 return (err); 446 447 MD_SET_IN(IN_FINI); 448 md_global_alloc_free(0); /* free dynamic space */ 449 MD_CLR_IN(IN_FINI); 450 return (err); 451 } 452 453 int 454 _info(struct modinfo *modinfop) 455 { 456 return (mod_info(&modlinkage, modinfop)); 457 } 458 459 /* ARGSUSED */ 460 static int 461 mdattach(dev_info_t *dip, ddi_attach_cmd_t cmd) 462 { 463 int len; 464 unit_t i; 465 size_t sz; 466 char ver[VERSION_LENGTH]; 467 char **maj_str_array; 468 char *str, *str2; 469 470 MD_SET_IN(IN_ATTACH); 471 md_in_upgrade = 0; 472 md_keep_repl_state = 0; 473 md_devid_destroy = 0; 474 475 if (cmd != DDI_ATTACH) { 476 MD_CLR_IN(IN_ATTACH); 477 return (DDI_FAILURE); 478 } 479 480 if (md_devinfo != NULL) { 481 MD_CLR_IN(IN_ATTACH); 482 return (DDI_FAILURE); 483 } 484 485 mddb_init(); 486 487 if (md_start_daemons(TRUE)) { 488 MD_CLR_IN(IN_ATTACH); 489 mddb_unload(); /* undo mddb_init() allocations */ 490 return (DDI_FAILURE); 491 } 492 493 /* clear the halted state */ 494 md_clr_status(MD_GBL_HALTED); 495 496 /* see if the diagnostic switch is on */ 497 if (ddi_prop_get_int(DDI_DEV_T_ANY, dip, 498 DDI_PROP_DONTPASS, "md_init_debug", 0)) 499 md_init_debug++; 500 501 /* see if the failfast disable switch is on */ 502 if (ddi_prop_get_int(DDI_DEV_T_ANY, dip, 503 DDI_PROP_DONTPASS, "md_ff_disable", 0)) 504 md_ff_disable++; 505 506 /* try and get the md_nmedh property */ 507 md_nmedh = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 508 DDI_PROP_DONTPASS, "md_nmedh", MED_DEF_HOSTS); 509 if ((md_nmedh <= 0) || (md_nmedh > MED_MAX_HOSTS)) 510 md_nmedh = MED_DEF_HOSTS; 511 512 /* try and get the md_med_trans_lst property */ 513 len = 0; 514 if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN, 515 0, "md_med_trans_lst", NULL, &len) != DDI_PROP_SUCCESS || 516 len == 0) { 517 md_med_trans_lst = md_strdup("tcp"); 518 } else { 519 md_med_trans_lst = kmem_zalloc((size_t)len, KM_SLEEP); 520 if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN_AND_VAL_BUF, 521 0, "md_med_trans_lst", md_med_trans_lst, &len) != 522 DDI_PROP_SUCCESS) { 523 kmem_free(md_med_trans_lst, (size_t)len); 524 md_med_trans_lst = md_strdup("tcp"); 525 } 526 } 527 528 /* 529 * Must initialize the internal data structures before the 530 * any possible calls to 'goto attach_failure' as _fini 531 * routine references them. 532 */ 533 med_init(); 534 535 md_ops = (md_ops_t **)kmem_zalloc( 536 sizeof (md_ops_t *) * MD_NOPS, KM_SLEEP); 537 md_mods = (ddi_modhandle_t *)kmem_zalloc( 538 sizeof (ddi_modhandle_t) * MD_NOPS, KM_SLEEP); 539 540 /* try and get the md_xlate property */ 541 /* Should we only do this if upgrade? */ 542 len = sizeof (char) * 5; 543 if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN_AND_VAL_BUF, 544 0, "md_xlate_ver", ver, &len) == DDI_PROP_SUCCESS) { 545 if (strcmp(ver, VERSION) == 0) { 546 len = 0; 547 if (ddi_prop_op(DDI_DEV_T_ANY, dip, 548 PROP_LEN_AND_VAL_ALLOC, 0, "md_xlate", 549 (caddr_t)&md_tuple_table, &len) != 550 DDI_PROP_SUCCESS) { 551 if (md_init_debug) 552 cmn_err(CE_WARN, 553 "md_xlate ddi_prop_op failed"); 554 goto attach_failure; 555 } else { 556 md_tuple_length = 557 len/(2 * ((int)sizeof (dev32_t))); 558 md_in_upgrade = 1; 559 } 560 561 /* Get target's name to major table */ 562 if (ddi_prop_lookup_string_array(DDI_DEV_T_ANY, 563 dip, DDI_PROP_DONTPASS, 564 "md_targ_nm_table", &maj_str_array, 565 &md_majortab_len) != DDI_PROP_SUCCESS) { 566 md_majortab_len = 0; 567 if (md_init_debug) 568 cmn_err(CE_WARN, "md_targ_nm_table " 569 "ddi_prop_lookup_string_array " 570 "failed"); 571 goto attach_failure; 572 } 573 574 md_major_tuple_table = 575 (struct md_xlate_major_table *) 576 kmem_zalloc(md_majortab_len * 577 sizeof (struct md_xlate_major_table), KM_SLEEP); 578 579 for (i = 0; i < md_majortab_len; i++) { 580 /* Getting major name */ 581 str = strchr(maj_str_array[i], ' '); 582 if (str == NULL) 583 continue; 584 *str = '\0'; 585 md_major_tuple_table[i].drv_name = 586 md_strdup(maj_str_array[i]); 587 588 /* Simplified atoi to get major number */ 589 str2 = str + 1; 590 md_major_tuple_table[i].targ_maj = 0; 591 while ((*str2 >= '0') && (*str2 <= '9')) { 592 md_major_tuple_table[i].targ_maj *= 10; 593 md_major_tuple_table[i].targ_maj += 594 *str2++ - '0'; 595 } 596 *str = ' '; 597 } 598 ddi_prop_free((void *)maj_str_array); 599 } else { 600 if (md_init_debug) 601 cmn_err(CE_WARN, "md_xlate_ver is incorrect"); 602 goto attach_failure; 603 } 604 } 605 606 /* 607 * Check for properties: 608 * md_keep_repl_state and md_devid_destroy 609 * and set globals if these exist. 610 */ 611 md_keep_repl_state = ddi_getprop(DDI_DEV_T_ANY, dip, 612 0, "md_keep_repl_state", 0); 613 614 md_devid_destroy = ddi_getprop(DDI_DEV_T_ANY, dip, 615 0, "md_devid_destroy", 0); 616 617 if (MD_UPGRADE) 618 md_major_targ = md_targ_name_to_major("md"); 619 else 620 md_major_targ = 0; 621 622 /* allocate admin device node */ 623 if (ddi_create_priv_minor_node(dip, "admin", S_IFCHR, 624 MD_ADM_MINOR, DDI_PSEUDO, 0, NULL, PRIV_SYS_CONFIG, 0640)) 625 goto attach_failure; 626 627 if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP, 628 DDI_KERNEL_IOCTL, NULL, 0) != DDI_SUCCESS) 629 goto attach_failure; 630 631 if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, 632 "ddi-abrwrite-supported", 1) != DDI_SUCCESS) 633 goto attach_failure; 634 635 /* these could have been cleared by a detach */ 636 md_nunits = MD_MAXUNITS; 637 md_nsets = MD_MAXSETS; 638 639 sz = sizeof (void *) * MD_MAXUNITS; 640 if (md_set[0].s_un == NULL) 641 md_set[0].s_un = kmem_zalloc(sz, KM_SLEEP); 642 if (md_set[0].s_ui == NULL) 643 md_set[0].s_ui = kmem_zalloc(sz, KM_SLEEP); 644 645 md_devinfo = dip; 646 647 /* 648 * Only allocate device node for root mirror metadevice. 649 * Don't pre-allocate unnecessary device nodes (thus slowing down a 650 * boot when we attach). 651 * We can't read the mddbs in attach. The mddbs will be read 652 * by metainit during the boot process when it is doing the 653 * auto-take processing and any other minor nodes will be 654 * allocated at that point. 655 * 656 * There are two scenarios to be aware of here: 657 * 1) when we are booting from a mirrored root we need the root 658 * metadevice to exist very early (during vfs_mountroot processing) 659 * 2) we need all of the nodes to be created so that any mnttab entries 660 * will succeed (handled by metainit reading the mddb during boot). 661 */ 662 if (strncmp(SVM_PSEUDO_STR, svm_bootpath, sizeof (SVM_PSEUDO_STR) - 1) 663 == 0) { 664 char *p; 665 int mnum = 0; 666 667 /* 668 * The svm_bootpath string looks something like 669 * /pseudo/md@0:0,150,blk where 150 is the minor number 670 * in this example so we need to set the pointer p onto 671 * the first digit of the minor number and convert it 672 * from ascii. 673 */ 674 for (p = svm_bootpath + sizeof (SVM_PSEUDO_STR) + 1; 675 *p >= '0' && *p <= '9'; p++) { 676 mnum *= 10; 677 mnum += *p - '0'; 678 } 679 680 if (md_create_minor_node(0, mnum)) { 681 kmem_free(md_set[0].s_un, sz); 682 kmem_free(md_set[0].s_ui, sz); 683 goto attach_failure; 684 } 685 } 686 687 /* create the hash to store the meta device sizes */ 688 md_nblocksmap = mod_hash_create_idhash("md_nblocksmap", 689 md_nblocksmap_size, mod_hash_null_valdtor); 690 691 MD_CLR_IN(IN_ATTACH); 692 return (DDI_SUCCESS); 693 694 attach_failure: 695 /* 696 * Use our own detach routine to toss any stuff we allocated above. 697 * NOTE: detach will call md_halt to free the mddb_init allocations. 698 */ 699 MD_CLR_IN(IN_ATTACH); 700 if (mddetach(dip, DDI_DETACH) != DDI_SUCCESS) 701 cmn_err(CE_WARN, "detach from attach failed"); 702 return (DDI_FAILURE); 703 } 704 705 /* ARGSUSED */ 706 static int 707 mddetach(dev_info_t *dip, ddi_detach_cmd_t cmd) 708 { 709 extern int check_active_locators(); 710 set_t s; 711 size_t sz; 712 int len; 713 714 MD_SET_IN(IN_DETACH); 715 716 /* check command */ 717 if (cmd != DDI_DETACH) { 718 MD_CLR_IN(IN_DETACH); 719 return (DDI_FAILURE); 720 } 721 722 /* 723 * if we have not already halted yet we have no active config 724 * then automatically initiate a halt so we can detach. 725 */ 726 if (!(md_get_status() & MD_GBL_HALTED)) { 727 if (check_active_locators() == 0) { 728 /* 729 * NOTE: a successful md_halt will have done the 730 * mddb_unload to free allocations done in mddb_init 731 */ 732 if (md_halt(MD_NO_GBL_LOCKS_HELD)) { 733 cmn_err(CE_NOTE, "md:detach: " 734 "Could not halt Solaris Volume Manager"); 735 MD_CLR_IN(IN_DETACH); 736 return (DDI_FAILURE); 737 } 738 } 739 740 /* fail detach if we have not halted */ 741 if (!(md_get_status() & MD_GBL_HALTED)) { 742 MD_CLR_IN(IN_DETACH); 743 return (DDI_FAILURE); 744 } 745 } 746 747 /* must be in halted state, this will be cleared on next attach */ 748 ASSERT(md_get_status() & MD_GBL_HALTED); 749 750 /* cleanup attach allocations and initializations */ 751 md_major_targ = 0; 752 753 sz = sizeof (void *) * md_nunits; 754 for (s = 0; s < md_nsets; s++) { 755 if (md_set[s].s_un != NULL) { 756 kmem_free(md_set[s].s_un, sz); 757 md_set[s].s_un = NULL; 758 } 759 760 if (md_set[s].s_ui != NULL) { 761 kmem_free(md_set[s].s_ui, sz); 762 md_set[s].s_ui = NULL; 763 } 764 } 765 md_nunits = 0; 766 md_nsets = 0; 767 md_nmedh = 0; 768 769 if (non_ff_drivers != NULL) { 770 int i; 771 772 for (i = 0; non_ff_drivers[i] != NULL; i++) 773 kmem_free(non_ff_drivers[i], 774 strlen(non_ff_drivers[i]) + 1); 775 776 /* free i+1 entries because there is a null entry at list end */ 777 kmem_free(non_ff_drivers, (i + 1) * sizeof (char *)); 778 non_ff_drivers = NULL; 779 } 780 781 if (md_med_trans_lst != NULL) { 782 kmem_free(md_med_trans_lst, strlen(md_med_trans_lst) + 1); 783 md_med_trans_lst = NULL; 784 } 785 786 if (md_mods != NULL) { 787 kmem_free(md_mods, sizeof (ddi_modhandle_t) * MD_NOPS); 788 md_mods = NULL; 789 } 790 791 if (md_ops != NULL) { 792 kmem_free(md_ops, sizeof (md_ops_t *) * MD_NOPS); 793 md_ops = NULL; 794 } 795 796 if (MD_UPGRADE) { 797 len = md_tuple_length * (2 * ((int)sizeof (dev32_t))); 798 md_in_upgrade = 0; 799 md_xlate_free(len); 800 md_majortab_free(); 801 } 802 803 /* 804 * Undo what we did in mdattach, freeing resources 805 * and removing things we installed. The system 806 * framework guarantees we are not active with this devinfo 807 * node in any other entry points at this time. 808 */ 809 ddi_prop_remove_all(dip); 810 ddi_remove_minor_node(dip, NULL); 811 812 med_fini(); 813 814 mod_hash_destroy_idhash(md_nblocksmap); 815 816 md_devinfo = NULL; 817 818 MD_CLR_IN(IN_DETACH); 819 return (DDI_SUCCESS); 820 } 821 822 823 /* 824 * Given the device number return the devinfo pointer 825 * given to md via md_attach 826 */ 827 /*ARGSUSED*/ 828 static int 829 mdinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) 830 { 831 int error = DDI_FAILURE; 832 833 switch (infocmd) { 834 case DDI_INFO_DEVT2DEVINFO: 835 if (md_devinfo) { 836 *result = (void *)md_devinfo; 837 error = DDI_SUCCESS; 838 } 839 break; 840 841 case DDI_INFO_DEVT2INSTANCE: 842 *result = (void *)0; 843 error = DDI_SUCCESS; 844 break; 845 } 846 return (error); 847 } 848 849 /* 850 * property operation routine. return the number of blocks for the partition 851 * in question or forward the request to the property facilities. 852 */ 853 static int 854 mdprop_op( 855 dev_t dev, /* device number associated with device */ 856 dev_info_t *dip, /* device info struct for this device */ 857 ddi_prop_op_t prop_op, /* property operator */ 858 int mod_flags, /* property flags */ 859 char *name, /* name of property */ 860 caddr_t valuep, /* where to put property value */ 861 int *lengthp) /* put length of property here */ 862 { 863 return (ddi_prop_op_nblocks(dev, dip, prop_op, mod_flags, 864 name, valuep, lengthp, md_nblocks_get(getminor(dev)))); 865 } 866 867 static void 868 snarf_user_data(set_t setno) 869 { 870 mddb_recid_t recid; 871 mddb_recstatus_t status; 872 873 recid = mddb_makerecid(setno, 0); 874 while ((recid = mddb_getnextrec(recid, MDDB_USER, 0)) > 0) { 875 if (mddb_getrecprivate(recid) & MD_PRV_GOTIT) 876 continue; 877 878 status = mddb_getrecstatus(recid); 879 if (status == MDDB_STALE) 880 continue; 881 882 if (status == MDDB_NODATA) { 883 mddb_setrecprivate(recid, MD_PRV_PENDDEL); 884 continue; 885 } 886 887 ASSERT(status == MDDB_OK); 888 889 mddb_setrecprivate(recid, MD_PRV_GOTIT); 890 } 891 } 892 893 static void 894 md_print_block_usage(mddb_set_t *s, uint_t blks) 895 { 896 uint_t ib; 897 int li; 898 mddb_mb_ic_t *mbip; 899 uint_t max_blk_needed; 900 mddb_lb_t *lbp; 901 mddb_sidelocator_t *slp; 902 int drv_index; 903 md_splitname sn; 904 char *name; 905 char *suffix; 906 size_t prefixlen; 907 size_t suffixlen; 908 int alloc_sz; 909 910 911 max_blk_needed = s->s_totalblkcnt - s->s_freeblkcnt + blks; 912 913 cmn_err(CE_WARN, "Blocks in Metadevice State Database: %d\n" 914 " Additional Blocks Needed: %d\n\n" 915 " Increase size of following replicas for\n" 916 " device relocatability by deleting listed\n" 917 " replica and re-adding replica with\n" 918 " increased size (see metadb(1M)):\n" 919 " Replica Increase By", 920 s->s_totalblkcnt, (blks - s->s_freeblkcnt)); 921 922 lbp = s->s_lbp; 923 924 for (li = 0; li < lbp->lb_loccnt; li++) { 925 if (lbp->lb_locators[li].l_flags & MDDB_F_DELETED) 926 continue; 927 ib = 0; 928 for (mbip = s->s_mbiarray[li]; mbip != NULL; 929 mbip = mbip->mbi_next) { 930 ib += (uint_t)mbip->mbi_mddb_mb.mb_blkcnt; 931 } 932 if (ib == 0) 933 continue; 934 if (ib < max_blk_needed) { 935 slp = &lbp->lb_sidelocators[s->s_sideno][li]; 936 drv_index = slp->l_drvnm_index; 937 mddb_locatorblock2splitname(s->s_lnp, li, s->s_sideno, 938 &sn); 939 prefixlen = SPN_PREFIX(&sn).pre_len; 940 suffixlen = SPN_SUFFIX(&sn).suf_len; 941 alloc_sz = (int)(prefixlen + suffixlen + 2); 942 name = (char *)kmem_alloc(alloc_sz, KM_SLEEP); 943 (void) strncpy(name, SPN_PREFIX(&sn).pre_data, 944 prefixlen); 945 name[prefixlen] = '/'; 946 suffix = name + (prefixlen + 1); 947 (void) strncpy(suffix, SPN_SUFFIX(&sn).suf_data, 948 suffixlen); 949 name[prefixlen + suffixlen + 1] = '\0'; 950 cmn_err(CE_WARN, 951 " %s (%s:%d:%d) %d blocks", 952 name, lbp->lb_drvnm[drv_index].dn_data, 953 slp->l_mnum, lbp->lb_locators[li].l_blkno, 954 (max_blk_needed - ib)); 955 kmem_free(name, alloc_sz); 956 } 957 } 958 } 959 960 /* 961 * md_create_minor_node: 962 * Create the minor device for the given set and un_self_id. 963 * 964 * Input: 965 * setno - set number 966 * mnum - selfID of unit 967 * 968 * Output: 969 * None. 970 * 971 * Returns 0 for success, 1 for failure. 972 * 973 * Side-effects: 974 * None. 975 */ 976 int 977 md_create_minor_node(set_t setno, minor_t mnum) 978 { 979 char name[20]; 980 981 /* Check for valid arguments */ 982 if (setno >= MD_MAXSETS || MD_MIN2UNIT(mnum) >= MD_MAXUNITS) 983 return (1); 984 985 (void) snprintf(name, 20, "%u,%u,blk", 986 (unsigned)setno, (unsigned)MD_MIN2UNIT(mnum)); 987 988 if (ddi_create_minor_node(md_devinfo, name, S_IFBLK, 989 MD_MKMIN(setno, mnum), DDI_PSEUDO, 0)) 990 return (1); 991 992 (void) snprintf(name, 20, "%u,%u,raw", 993 (unsigned)setno, (unsigned)MD_MIN2UNIT(mnum)); 994 995 if (ddi_create_minor_node(md_devinfo, name, S_IFCHR, 996 MD_MKMIN(setno, mnum), DDI_PSEUDO, 0)) 997 return (1); 998 999 return (0); 1000 } 1001 1002 /* 1003 * For a given key check if it is an orphaned record. 1004 * The following conditions are used to determine an orphan. 1005 * 1. The device associated with that key is not a metadevice. 1006 * 2. If DEVID_STYLE then the physical device does not have a device Id 1007 * associated with it. 1008 * 1009 * If a key does not have an entry in the devid namespace it could be 1010 * a device that does not support device ids. Hence the record is not 1011 * deleted. 1012 */ 1013 1014 static int 1015 md_verify_orphaned_record(set_t setno, mdkey_t key) 1016 { 1017 md_dev64_t odev; /* orphaned dev */ 1018 mddb_set_t *s; 1019 side_t side = 0; 1020 struct nm_next_hdr *did_nh = NULL; 1021 1022 s = (mddb_set_t *)md_set[setno].s_db; 1023 if ((did_nh = get_first_record(setno, 1, (NM_DEVID | NM_NOTSHARED))) 1024 == NULL) 1025 return (0); 1026 /* 1027 * If devid style is set then get the dev_t using MD_NOTRUST_DEVT 1028 */ 1029 if (s->s_lbp->lb_flags & MDDB_DEVID_STYLE) { 1030 odev = md_getdevnum(setno, side, key, MD_NOTRUST_DEVT); 1031 if ((odev == NODEV64) || (md_getmajor(odev) == md_major)) 1032 return (0); 1033 if (lookup_entry(did_nh, setno, side, key, odev, NM_DEVID) == 1034 NULL) 1035 return (1); 1036 } 1037 return (0); 1038 } 1039 1040 int 1041 md_snarf_db_set(set_t setno, md_error_t *ep) 1042 { 1043 int err = 0; 1044 int i; 1045 mddb_recid_t recid; 1046 mddb_type_t drvrid; 1047 mddb_recstatus_t status; 1048 md_ops_t *ops; 1049 uint_t privat; 1050 mddb_set_t *s; 1051 uint_t cvt_blks; 1052 struct nm_next_hdr *nh; 1053 mdkey_t key = MD_KEYWILD; 1054 side_t side = 0; 1055 int size; 1056 int devid_flag; 1057 int retval; 1058 uint_t un; 1059 int un_next_set = 0; 1060 1061 md_haltsnarf_enter(setno); 1062 1063 mutex_enter(&md_mx); 1064 if (md_set[setno].s_status & MD_SET_SNARFED) { 1065 mutex_exit(&md_mx); 1066 md_haltsnarf_exit(setno); 1067 return (0); 1068 } 1069 mutex_exit(&md_mx); 1070 1071 if (! (md_get_status() & MD_GBL_DAEMONS_LIVE)) { 1072 if (md_start_daemons(TRUE)) { 1073 if (ep != NULL) 1074 (void) mdsyserror(ep, ENXIO); 1075 err = -1; 1076 goto out; 1077 } 1078 } 1079 1080 1081 /* 1082 * Load the devid name space if it exists 1083 */ 1084 (void) md_load_namespace(setno, NULL, NM_DEVID); 1085 if (!md_load_namespace(setno, ep, 0L)) { 1086 /* 1087 * Unload the devid namespace 1088 */ 1089 (void) md_unload_namespace(setno, NM_DEVID); 1090 err = -1; 1091 goto out; 1092 } 1093 1094 /* 1095 * If replica is in non-devid state, convert if: 1096 * - not in probe during upgrade (md_keep_repl_state = 0) 1097 * - enough space available in replica 1098 * - local set 1099 * - not a multi-node diskset 1100 * - clustering is not present (for non-local set) 1101 */ 1102 s = (mddb_set_t *)md_set[setno].s_db; 1103 devid_flag = 0; 1104 if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE) && !md_keep_repl_state) 1105 devid_flag = 1; 1106 if (cluster_bootflags & CLUSTER_CONFIGURED) 1107 if (setno != MD_LOCAL_SET) 1108 devid_flag = 0; 1109 if (MD_MNSET_SETNO(setno)) 1110 devid_flag = 0; 1111 if ((md_devid_destroy == 1) && (md_keep_repl_state == 1)) 1112 devid_flag = 0; 1113 1114 /* 1115 * if we weren't devid style before and md_keep_repl_state=1 1116 * we need to stay non-devid 1117 */ 1118 if ((md_keep_repl_state == 1) && 1119 ((s->s_lbp->lb_flags & MDDB_DEVID_STYLE) == 0)) 1120 devid_flag = 0; 1121 if (devid_flag) { 1122 /* 1123 * Determine number of free blocks needed to convert 1124 * entire replica to device id format - locator blocks 1125 * and namespace. 1126 */ 1127 cvt_blks = 0; 1128 if (mddb_lb_did_convert(s, 0, &cvt_blks) != 0) { 1129 if (ep != NULL) 1130 (void) mdsyserror(ep, EIO); 1131 err = -1; 1132 goto out; 1133 1134 } 1135 cvt_blks += md_nm_did_chkspace(setno); 1136 1137 /* add MDDB_DEVID_CONV_PERC% */ 1138 if ((md_conv_perc > 0) && (md_conv_perc <= 100)) { 1139 cvt_blks = cvt_blks * (100 + md_conv_perc) / 100; 1140 } 1141 1142 if (cvt_blks <= s->s_freeblkcnt) { 1143 if (mddb_lb_did_convert(s, 1, &cvt_blks) != 0) { 1144 if (ep != NULL) 1145 (void) mdsyserror(ep, EIO); 1146 err = -1; 1147 goto out; 1148 } 1149 1150 } else { 1151 /* 1152 * Print message that replica can't be converted for 1153 * lack of space. No failure - just continue to 1154 * run without device ids. 1155 */ 1156 cmn_err(CE_WARN, 1157 "Unable to add Solaris Volume Manager device " 1158 "relocation data.\n" 1159 " To use device relocation feature:\n" 1160 " - Increase size of listed replicas\n" 1161 " - Reboot"); 1162 md_print_block_usage(s, cvt_blks); 1163 cmn_err(CE_WARN, 1164 "Loading set without device relocation data.\n" 1165 " Solaris Volume Manager disk movement " 1166 "not tracked in local set."); 1167 } 1168 } 1169 1170 /* 1171 * go through and load any modules referenced in 1172 * data base 1173 */ 1174 recid = mddb_makerecid(setno, 0); 1175 while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) { 1176 status = mddb_getrecstatus(recid); 1177 if (status == MDDB_STALE) { 1178 if (! (md_get_setstatus(setno) & MD_SET_STALE)) { 1179 md_set_setstatus(setno, MD_SET_STALE); 1180 cmn_err(CE_WARN, 1181 "md: state database is stale"); 1182 } 1183 } else if (status == MDDB_NODATA) { 1184 mddb_setrecprivate(recid, MD_PRV_PENDDEL); 1185 continue; 1186 } 1187 drvrid = mddb_getrectype1(recid); 1188 if (drvrid < MDDB_FIRST_MODID) 1189 continue; 1190 if (md_loadsubmod(setno, md_getshared_name(setno, drvrid), 1191 drvrid) < 0) { 1192 cmn_err(CE_NOTE, "md: could not load misc/%s", 1193 md_getshared_name(setno, drvrid)); 1194 } 1195 } 1196 1197 if (recid < 0) 1198 goto out; 1199 1200 snarf_user_data(setno); 1201 1202 /* 1203 * Initialize the md_nm_snarfed array 1204 * this array is indexed by the key and 1205 * is set by md_getdevnum during the snarf time 1206 */ 1207 if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) != NULL) { 1208 size = (int)((((struct nm_rec_hdr *)nh->nmn_record)-> 1209 r_next_key) * (sizeof (int))); 1210 md_nm_snarfed = (int *)kmem_zalloc(size, KM_SLEEP); 1211 } 1212 1213 /* 1214 * go through and snarf until nothing gets added 1215 */ 1216 do { 1217 i = 0; 1218 for (ops = md_opslist; ops != NULL; ops = ops->md_next) { 1219 if (ops->md_snarf != NULL) { 1220 retval = ops->md_snarf(MD_SNARF_DOIT, setno); 1221 if (retval == -1) { 1222 err = -1; 1223 /* Don't know the failed unit */ 1224 (void) mdmderror(ep, MDE_RR_ALLOC_ERROR, 1225 0); 1226 (void) md_halt_set(setno, MD_HALT_ALL); 1227 (void) mddb_unload_set(setno); 1228 md_haltsnarf_exit(setno); 1229 return (err); 1230 } else { 1231 i += retval; 1232 } 1233 } 1234 } 1235 } while (i); 1236 1237 /* 1238 * Set the first available slot and availability 1239 */ 1240 md_set[setno].s_un_avail = 0; 1241 for (un = 0; un < MD_MAXUNITS; un++) { 1242 if (md_set[setno].s_un[un] != NULL) { 1243 continue; 1244 } else { 1245 if (!un_next_set) { 1246 md_set[setno].s_un_next = un; 1247 un_next_set = 1; 1248 } 1249 md_set[setno].s_un_avail++; 1250 } 1251 } 1252 1253 md_set_setstatus(setno, MD_SET_SNARFED); 1254 1255 recid = mddb_makerecid(setno, 0); 1256 while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) { 1257 privat = mddb_getrecprivate(recid); 1258 if (privat & MD_PRV_COMMIT) { 1259 if (mddb_commitrec(recid)) { 1260 if (!(md_get_setstatus(setno) & MD_SET_STALE)) { 1261 md_set_setstatus(setno, MD_SET_STALE); 1262 cmn_err(CE_WARN, 1263 "md: state database is stale"); 1264 } 1265 } 1266 mddb_setrecprivate(recid, MD_PRV_GOTIT); 1267 } 1268 } 1269 1270 /* Deletes must happen after all the commits */ 1271 recid = mddb_makerecid(setno, 0); 1272 while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) { 1273 privat = mddb_getrecprivate(recid); 1274 if (privat & MD_PRV_DELETE) { 1275 if (mddb_deleterec(recid)) { 1276 if (!(md_get_setstatus(setno) & MD_SET_STALE)) { 1277 md_set_setstatus(setno, MD_SET_STALE); 1278 cmn_err(CE_WARN, 1279 "md: state database is stale"); 1280 } 1281 mddb_setrecprivate(recid, MD_PRV_GOTIT); 1282 } 1283 recid = mddb_makerecid(setno, 0); 1284 } 1285 } 1286 1287 /* 1288 * go through and clean up records until nothing gets cleaned up. 1289 */ 1290 do { 1291 i = 0; 1292 for (ops = md_opslist; ops != NULL; ops = ops->md_next) 1293 if (ops->md_snarf != NULL) 1294 i += ops->md_snarf(MD_SNARF_CLEANUP, setno); 1295 } while (i); 1296 1297 if (md_nm_snarfed != NULL && 1298 !(md_get_setstatus(setno) & MD_SET_STALE)) { 1299 /* 1300 * go thru and cleanup the namespace and the device id 1301 * name space 1302 */ 1303 for (key = 1; 1304 key < ((struct nm_rec_hdr *)nh->nmn_record)->r_next_key; 1305 key++) { 1306 /* 1307 * Is the entry an 'orphan'? 1308 */ 1309 if (lookup_entry(nh, setno, side, key, NODEV64, 0L) != 1310 NULL) { 1311 /* 1312 * If the value is not set then apparently 1313 * it is not part of the current configuration, 1314 * remove it this can happen when system panic 1315 * between the primary name space update and 1316 * the device id name space update 1317 */ 1318 if (md_nm_snarfed[key] == 0) { 1319 if (md_verify_orphaned_record(setno, 1320 key) == 1) 1321 (void) remove_entry(nh, 1322 side, key, 0L); 1323 } 1324 } 1325 } 1326 } 1327 1328 if (md_nm_snarfed != NULL) { 1329 /* 1330 * Done and free the memory 1331 */ 1332 kmem_free(md_nm_snarfed, size); 1333 md_nm_snarfed = NULL; 1334 } 1335 1336 if (s->s_lbp->lb_flags & MDDB_DEVID_STYLE && 1337 !(md_get_setstatus(setno) & MD_SET_STALE)) { 1338 /* 1339 * if the destroy flag has been set and 1340 * the MD_SET_DIDCLUP bit is not set in 1341 * the set's status field, cleanup the 1342 * entire device id namespace 1343 */ 1344 if (md_devid_destroy && 1345 !(md_get_setstatus(setno) & MD_SET_DIDCLUP)) { 1346 (void) md_devid_cleanup(setno, 1); 1347 md_set_setstatus(setno, MD_SET_DIDCLUP); 1348 } else 1349 (void) md_devid_cleanup(setno, 0); 1350 } 1351 1352 /* 1353 * clear single threading on snarf, return success or error 1354 */ 1355 out: 1356 md_haltsnarf_exit(setno); 1357 return (err); 1358 } 1359 1360 void 1361 get_minfo(struct dk_minfo *info, minor_t mnum) 1362 { 1363 md_unit_t *un; 1364 mdi_unit_t *ui; 1365 1366 info->dki_capacity = 0; 1367 info->dki_lbsize = 0; 1368 info->dki_media_type = 0; 1369 1370 if ((ui = MDI_UNIT(mnum)) == NULL) { 1371 return; 1372 } 1373 un = (md_unit_t *)md_unit_readerlock(ui); 1374 info->dki_capacity = un->c.un_total_blocks; 1375 md_unit_readerexit(ui); 1376 info->dki_lbsize = DEV_BSIZE; 1377 info->dki_media_type = DK_UNKNOWN; 1378 } 1379 1380 1381 void 1382 get_info(struct dk_cinfo *info, minor_t mnum) 1383 { 1384 /* 1385 * Controller Information 1386 */ 1387 info->dki_ctype = DKC_MD; 1388 info->dki_cnum = ddi_get_instance(ddi_get_parent(md_devinfo)); 1389 (void) strcpy(info->dki_cname, 1390 ddi_get_name(ddi_get_parent(md_devinfo))); 1391 /* 1392 * Unit Information 1393 */ 1394 info->dki_unit = mnum; 1395 info->dki_slave = 0; 1396 (void) strcpy(info->dki_dname, ddi_driver_name(md_devinfo)); 1397 info->dki_flags = 0; 1398 info->dki_partition = 0; 1399 info->dki_maxtransfer = (ushort_t)(md_maxphys / DEV_BSIZE); 1400 1401 /* 1402 * We can't get from here to there yet 1403 */ 1404 info->dki_addr = 0; 1405 info->dki_space = 0; 1406 info->dki_prio = 0; 1407 info->dki_vec = 0; 1408 } 1409 1410 /* 1411 * open admin device 1412 */ 1413 static int 1414 mdadminopen( 1415 int flag, 1416 int otyp) 1417 { 1418 int err = 0; 1419 1420 /* single thread */ 1421 mutex_enter(&md_mx); 1422 1423 /* check type and flags */ 1424 if ((otyp != OTYP_CHR) && (otyp != OTYP_LYR)) { 1425 err = EINVAL; 1426 goto out; 1427 } 1428 if (((flag & FEXCL) && (md_status & MD_GBL_OPEN)) || 1429 (md_status & MD_GBL_EXCL)) { 1430 err = EBUSY; 1431 goto out; 1432 } 1433 1434 /* count and flag open */ 1435 md_ocnt[otyp]++; 1436 md_status |= MD_GBL_OPEN; 1437 if (flag & FEXCL) 1438 md_status |= MD_GBL_EXCL; 1439 1440 /* unlock return success */ 1441 out: 1442 mutex_exit(&md_mx); 1443 return (err); 1444 } 1445 1446 /* 1447 * open entry point 1448 */ 1449 static int 1450 mdopen( 1451 dev_t *dev, 1452 int flag, 1453 int otyp, 1454 cred_t *cred_p) 1455 { 1456 minor_t mnum = getminor(*dev); 1457 unit_t unit = MD_MIN2UNIT(mnum); 1458 set_t setno = MD_MIN2SET(mnum); 1459 mdi_unit_t *ui = NULL; 1460 int err = 0; 1461 md_parent_t parent; 1462 1463 /* dispatch admin device opens */ 1464 if (mnum == MD_ADM_MINOR) 1465 return (mdadminopen(flag, otyp)); 1466 1467 /* lock, check status */ 1468 rw_enter(&md_unit_array_rw.lock, RW_READER); 1469 1470 tryagain: 1471 if (md_get_status() & MD_GBL_HALTED) { 1472 err = ENODEV; 1473 goto out; 1474 } 1475 1476 /* check minor */ 1477 if ((setno >= md_nsets) || (unit >= md_nunits)) { 1478 err = ENXIO; 1479 goto out; 1480 } 1481 1482 /* make sure we're snarfed */ 1483 if ((md_get_setstatus(MD_LOCAL_SET) & MD_SET_SNARFED) == 0) { 1484 if (md_snarf_db_set(MD_LOCAL_SET, NULL) != 0) { 1485 err = ENODEV; 1486 goto out; 1487 } 1488 } 1489 if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0) { 1490 err = ENODEV; 1491 goto out; 1492 } 1493 1494 /* check unit */ 1495 if ((ui = MDI_UNIT(mnum)) == NULL) { 1496 err = ENXIO; 1497 goto out; 1498 } 1499 1500 /* 1501 * The softpart open routine may do an I/O during the open, in 1502 * which case the open routine will set the OPENINPROGRESS flag 1503 * and drop all locks during the I/O. If this thread sees 1504 * the OPENINPROGRESS flag set, if should wait until the flag 1505 * is reset before calling the driver's open routine. It must 1506 * also revalidate the world after it grabs the unit_array lock 1507 * since the set may have been released or the metadevice cleared 1508 * during the sleep. 1509 */ 1510 if (MD_MNSET_SETNO(setno)) { 1511 mutex_enter(&ui->ui_mx); 1512 if (ui->ui_lock & MD_UL_OPENINPROGRESS) { 1513 rw_exit(&md_unit_array_rw.lock); 1514 cv_wait(&ui->ui_cv, &ui->ui_mx); 1515 rw_enter(&md_unit_array_rw.lock, RW_READER); 1516 mutex_exit(&ui->ui_mx); 1517 goto tryagain; 1518 } 1519 mutex_exit(&ui->ui_mx); 1520 } 1521 1522 /* Test if device is openable */ 1523 if ((ui->ui_tstate & MD_NOTOPENABLE) != 0) { 1524 err = ENXIO; 1525 goto out; 1526 } 1527 1528 /* don't allow opens w/WRITE flag if stale */ 1529 if ((flag & FWRITE) && (md_get_setstatus(setno) & MD_SET_STALE)) { 1530 err = EROFS; 1531 goto out; 1532 } 1533 1534 /* don't allow writes to subdevices */ 1535 parent = md_get_parent(md_expldev(*dev)); 1536 if ((flag & FWRITE) && MD_HAS_PARENT(parent)) { 1537 err = EROFS; 1538 goto out; 1539 } 1540 1541 /* open underlying driver */ 1542 if (md_ops[ui->ui_opsindex]->md_open != NULL) { 1543 if ((err = (*md_ops[ui->ui_opsindex]->md_open) 1544 (dev, flag, otyp, cred_p, 0)) != 0) 1545 goto out; 1546 } 1547 1548 /* or do it ourselves */ 1549 else { 1550 /* single thread */ 1551 (void) md_unit_openclose_enter(ui); 1552 err = md_unit_incopen(mnum, flag, otyp); 1553 md_unit_openclose_exit(ui); 1554 if (err != 0) 1555 goto out; 1556 } 1557 1558 /* unlock, return status */ 1559 out: 1560 rw_exit(&md_unit_array_rw.lock); 1561 return (err); 1562 } 1563 1564 /* 1565 * close admin device 1566 */ 1567 static int 1568 mdadminclose( 1569 int otyp) 1570 { 1571 int i; 1572 int err = 0; 1573 1574 /* single thread */ 1575 mutex_enter(&md_mx); 1576 1577 /* check type and flags */ 1578 if ((otyp < 0) || (otyp >= OTYPCNT)) { 1579 err = EINVAL; 1580 goto out; 1581 } else if (md_ocnt[otyp] == 0) { 1582 err = ENXIO; 1583 goto out; 1584 } 1585 1586 /* count and flag closed */ 1587 if (otyp == OTYP_LYR) 1588 md_ocnt[otyp]--; 1589 else 1590 md_ocnt[otyp] = 0; 1591 md_status &= ~MD_GBL_OPEN; 1592 for (i = 0; (i < OTYPCNT); ++i) 1593 if (md_ocnt[i] != 0) 1594 md_status |= MD_GBL_OPEN; 1595 if (! (md_status & MD_GBL_OPEN)) 1596 md_status &= ~MD_GBL_EXCL; 1597 1598 /* unlock return success */ 1599 out: 1600 mutex_exit(&md_mx); 1601 return (err); 1602 } 1603 1604 /* 1605 * close entry point 1606 */ 1607 static int 1608 mdclose( 1609 dev_t dev, 1610 int flag, 1611 int otyp, 1612 cred_t *cred_p) 1613 { 1614 minor_t mnum = getminor(dev); 1615 set_t setno = MD_MIN2SET(mnum); 1616 unit_t unit = MD_MIN2UNIT(mnum); 1617 mdi_unit_t *ui = NULL; 1618 int err = 0; 1619 1620 /* dispatch admin device closes */ 1621 if (mnum == MD_ADM_MINOR) 1622 return (mdadminclose(otyp)); 1623 1624 /* check minor */ 1625 if ((setno >= md_nsets) || (unit >= md_nunits) || 1626 ((ui = MDI_UNIT(mnum)) == NULL)) { 1627 err = ENXIO; 1628 goto out; 1629 } 1630 1631 /* close underlying driver */ 1632 if (md_ops[ui->ui_opsindex]->md_close != NULL) { 1633 if ((err = (*md_ops[ui->ui_opsindex]->md_close) 1634 (dev, flag, otyp, cred_p, 0)) != 0) 1635 goto out; 1636 } 1637 1638 /* or do it ourselves */ 1639 else { 1640 /* single thread */ 1641 (void) md_unit_openclose_enter(ui); 1642 err = md_unit_decopen(mnum, otyp); 1643 md_unit_openclose_exit(ui); 1644 if (err != 0) 1645 goto out; 1646 } 1647 1648 /* return success */ 1649 out: 1650 return (err); 1651 } 1652 1653 1654 /* 1655 * This routine performs raw read operations. It is called from the 1656 * device switch at normal priority. 1657 * 1658 * The main catch is that the *uio struct which is passed to us may 1659 * specify a read which spans two buffers, which would be contiguous 1660 * on a single partition, but not on a striped partition. This will 1661 * be handled by mdstrategy. 1662 */ 1663 /*ARGSUSED*/ 1664 static int 1665 mdread(dev_t dev, struct uio *uio, cred_t *credp) 1666 { 1667 minor_t mnum; 1668 mdi_unit_t *ui; 1669 int error; 1670 1671 if (((mnum = getminor(dev)) == MD_ADM_MINOR) || 1672 (MD_MIN2SET(mnum) >= md_nsets) || 1673 (MD_MIN2UNIT(mnum) >= md_nunits) || 1674 ((ui = MDI_UNIT(mnum)) == NULL)) 1675 return (ENXIO); 1676 1677 if (md_ops[ui->ui_opsindex]->md_read != NULL) 1678 return ((*md_ops[ui->ui_opsindex]->md_read) 1679 (dev, uio, credp)); 1680 1681 if ((error = md_chk_uio(uio)) != 0) 1682 return (error); 1683 1684 return (physio(mdstrategy, NULL, dev, B_READ, md_minphys, uio)); 1685 } 1686 1687 /* 1688 * This routine performs async raw read operations. It is called from the 1689 * device switch at normal priority. 1690 * 1691 * The main catch is that the *aio struct which is passed to us may 1692 * specify a read which spans two buffers, which would be contiguous 1693 * on a single partition, but not on a striped partition. This will 1694 * be handled by mdstrategy. 1695 */ 1696 /*ARGSUSED*/ 1697 static int 1698 mdaread(dev_t dev, struct aio_req *aio, cred_t *credp) 1699 { 1700 minor_t mnum; 1701 mdi_unit_t *ui; 1702 int error; 1703 1704 1705 if (((mnum = getminor(dev)) == MD_ADM_MINOR) || 1706 (MD_MIN2SET(mnum) >= md_nsets) || 1707 (MD_MIN2UNIT(mnum) >= md_nunits) || 1708 ((ui = MDI_UNIT(mnum)) == NULL)) 1709 return (ENXIO); 1710 1711 if (md_ops[ui->ui_opsindex]->md_aread != NULL) 1712 return ((*md_ops[ui->ui_opsindex]->md_aread) 1713 (dev, aio, credp)); 1714 1715 if ((error = md_chk_uio(aio->aio_uio)) != 0) 1716 return (error); 1717 1718 return (aphysio(mdstrategy, anocancel, dev, B_READ, md_minphys, aio)); 1719 } 1720 1721 /* 1722 * This routine performs raw write operations. It is called from the 1723 * device switch at normal priority. 1724 * 1725 * The main catch is that the *uio struct which is passed to us may 1726 * specify a write which spans two buffers, which would be contiguous 1727 * on a single partition, but not on a striped partition. This is 1728 * handled by mdstrategy. 1729 * 1730 */ 1731 /*ARGSUSED*/ 1732 static int 1733 mdwrite(dev_t dev, struct uio *uio, cred_t *credp) 1734 { 1735 minor_t mnum; 1736 mdi_unit_t *ui; 1737 int error; 1738 1739 if (((mnum = getminor(dev)) == MD_ADM_MINOR) || 1740 (MD_MIN2SET(mnum) >= md_nsets) || 1741 (MD_MIN2UNIT(mnum) >= md_nunits) || 1742 ((ui = MDI_UNIT(mnum)) == NULL)) 1743 return (ENXIO); 1744 1745 if (md_ops[ui->ui_opsindex]->md_write != NULL) 1746 return ((*md_ops[ui->ui_opsindex]->md_write) 1747 (dev, uio, credp)); 1748 1749 if ((error = md_chk_uio(uio)) != 0) 1750 return (error); 1751 1752 return (physio(mdstrategy, NULL, dev, B_WRITE, md_minphys, uio)); 1753 } 1754 1755 /* 1756 * This routine performs async raw write operations. It is called from the 1757 * device switch at normal priority. 1758 * 1759 * The main catch is that the *aio struct which is passed to us may 1760 * specify a write which spans two buffers, which would be contiguous 1761 * on a single partition, but not on a striped partition. This is 1762 * handled by mdstrategy. 1763 * 1764 */ 1765 /*ARGSUSED*/ 1766 static int 1767 mdawrite(dev_t dev, struct aio_req *aio, cred_t *credp) 1768 { 1769 minor_t mnum; 1770 mdi_unit_t *ui; 1771 int error; 1772 1773 1774 if (((mnum = getminor(dev)) == MD_ADM_MINOR) || 1775 (MD_MIN2SET(mnum) >= md_nsets) || 1776 (MD_MIN2UNIT(mnum) >= md_nunits) || 1777 ((ui = MDI_UNIT(mnum)) == NULL)) 1778 return (ENXIO); 1779 1780 if (md_ops[ui->ui_opsindex]->md_awrite != NULL) 1781 return ((*md_ops[ui->ui_opsindex]->md_awrite) 1782 (dev, aio, credp)); 1783 1784 if ((error = md_chk_uio(aio->aio_uio)) != 0) 1785 return (error); 1786 1787 return (aphysio(mdstrategy, anocancel, dev, B_WRITE, md_minphys, aio)); 1788 } 1789 1790 int 1791 mdstrategy(struct buf *bp) 1792 { 1793 minor_t mnum; 1794 mdi_unit_t *ui; 1795 1796 ASSERT((bp->b_flags & B_DONE) == 0); 1797 1798 if (panicstr) 1799 md_clr_status(MD_GBL_DAEMONS_LIVE); 1800 1801 if (((mnum = getminor(bp->b_edev)) == MD_ADM_MINOR) || 1802 (MD_MIN2SET(mnum) >= md_nsets) || 1803 (MD_MIN2UNIT(mnum) >= md_nunits) || 1804 ((ui = MDI_UNIT(mnum)) == NULL)) { 1805 bp->b_flags |= B_ERROR; 1806 bp->b_error = ENXIO; 1807 bp->b_resid = bp->b_bcount; 1808 biodone(bp); 1809 return (0); 1810 } 1811 1812 bp->b_flags &= ~(B_ERROR | B_DONE); 1813 if (md_ops[ui->ui_opsindex]->md_strategy != NULL) { 1814 (*md_ops[ui->ui_opsindex]->md_strategy) (bp, 0, NULL); 1815 } else { 1816 (void) errdone(ui, bp, ENXIO); 1817 } 1818 return (0); 1819 } 1820 1821 /* 1822 * Return true if the ioctl is allowed to be multithreaded. 1823 * All the ioctls with MN are sent only from the message handlers through 1824 * rpc.mdcommd, which (via it's own locking mechanism) takes care that not two 1825 * ioctl for the same metadevice are issued at the same time. 1826 * So we are safe here. 1827 * The other ioctls do not mess with any metadevice structures and therefor 1828 * are harmless too, if called multiple times at the same time. 1829 */ 1830 static boolean_t 1831 is_mt_ioctl(int cmd) { 1832 1833 switch (cmd) { 1834 case MD_IOCGUNIQMSGID: 1835 case MD_IOCGVERSION: 1836 case MD_IOCISOPEN: 1837 case MD_MN_SET_MM_OWNER: 1838 case MD_MN_SET_STATE: 1839 case MD_MN_SUSPEND_WRITES: 1840 case MD_MN_ALLOCATE_HOTSPARE: 1841 case MD_MN_SET_SETFLAGS: 1842 case MD_MN_GET_SETFLAGS: 1843 case MD_MN_MDDB_OPTRECFIX: 1844 case MD_MN_MDDB_PARSE: 1845 case MD_MN_MDDB_BLOCK: 1846 case MD_MN_DB_USERREQ: 1847 case MD_IOC_SPSTATUS: 1848 case MD_MN_COMMD_ERR: 1849 case MD_MN_SET_COMMD_RUNNING: 1850 case MD_MN_RESYNC: 1851 case MD_MN_SETSYNC: 1852 case MD_MN_POKE_HOTSPARES: 1853 case MD_MN_RR_DIRTY: 1854 case MD_MN_RR_CLEAN: 1855 case MD_MN_IOC_SPUPDATEWM: 1856 return (1); 1857 default: 1858 return (0); 1859 } 1860 } 1861 1862 /* 1863 * This routine implements the ioctl calls for the Virtual Disk System. 1864 * It is called from the device switch at normal priority. 1865 */ 1866 /* ARGSUSED */ 1867 static int 1868 mdioctl(dev_t dev, int cmd, intptr_t data, int mode, cred_t *cred_p, 1869 int *rval_p) 1870 { 1871 minor_t mnum = getminor(dev); 1872 mdi_unit_t *ui; 1873 IOLOCK lock; 1874 int err; 1875 1876 /* 1877 * For multinode disksets number of ioctls are allowed to be 1878 * multithreaded. 1879 * A fundamental assumption made in this implementation is that 1880 * ioctls either do not interact with other md structures or the 1881 * ioctl to the admin device can only occur if the metadevice 1882 * device is open. i.e. avoid a race between metaclear and the 1883 * progress of a multithreaded ioctl. 1884 */ 1885 1886 if (!is_mt_ioctl(cmd) && md_ioctl_lock_enter() == EINTR) { 1887 return (EINTR); 1888 } 1889 1890 /* 1891 * initialize lock tracker 1892 */ 1893 IOLOCK_INIT(&lock); 1894 1895 /* Flag to indicate that MD_GBL_IOCTL_LOCK is not acquired */ 1896 1897 if (is_mt_ioctl(cmd)) { 1898 /* increment the md_mtioctl_cnt */ 1899 mutex_enter(&md_mx); 1900 md_mtioctl_cnt++; 1901 mutex_exit(&md_mx); 1902 lock.l_flags |= MD_MT_IOCTL; 1903 } 1904 1905 /* 1906 * this has been added to prevent notification from re-snarfing 1907 * so metaunload will work. It may interfere with other modules 1908 * halt process. 1909 */ 1910 if (md_get_status() & (MD_GBL_HALTED | MD_GBL_DAEMONS_DIE)) 1911 return (IOLOCK_RETURN(ENXIO, &lock)); 1912 1913 /* 1914 * admin device ioctls 1915 */ 1916 if (mnum == MD_ADM_MINOR) { 1917 err = md_admin_ioctl(md_expldev(dev), cmd, (void *) data, 1918 mode, &lock); 1919 } 1920 1921 /* 1922 * metadevice ioctls 1923 */ 1924 else if ((MD_MIN2SET(mnum) >= md_nsets) || 1925 (MD_MIN2UNIT(mnum) >= md_nunits) || 1926 (md_set[MD_MIN2SET(mnum)].s_ui == NULL) || 1927 ((ui = MDI_UNIT(mnum)) == NULL)) { 1928 err = ENXIO; 1929 } else if (md_ops[ui->ui_opsindex]->md_ioctl == NULL) { 1930 err = ENOTTY; 1931 } else { 1932 err = (*md_ops[ui->ui_opsindex]->md_ioctl) 1933 (dev, cmd, (void *) data, mode, &lock); 1934 } 1935 1936 /* 1937 * drop any locks we grabbed 1938 */ 1939 return (IOLOCK_RETURN_IOCTLEND(err, &lock)); 1940 } 1941 1942 static int 1943 mddump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk) 1944 { 1945 minor_t mnum; 1946 set_t setno; 1947 mdi_unit_t *ui; 1948 1949 if ((mnum = getminor(dev)) == MD_ADM_MINOR) 1950 return (ENXIO); 1951 1952 setno = MD_MIN2SET(mnum); 1953 1954 if ((setno >= md_nsets) || (MD_MIN2UNIT(mnum) >= md_nunits) || 1955 ((ui = MDI_UNIT(mnum)) == NULL)) 1956 return (ENXIO); 1957 1958 1959 if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0) 1960 return (ENXIO); 1961 1962 if (md_ops[ui->ui_opsindex]->md_dump != NULL) 1963 return ((*md_ops[ui->ui_opsindex]->md_dump) 1964 (dev, addr, blkno, nblk)); 1965 1966 return (ENXIO); 1967 } 1968 1969 /* 1970 * Metadevice unit number dispatcher 1971 * When this routine is called it will scan the 1972 * incore unit array and return the avail slot 1973 * hence the unit number to the caller 1974 * 1975 * Return -1 if there is nothing available 1976 */ 1977 unit_t 1978 md_get_nextunit(set_t setno) 1979 { 1980 unit_t un, start; 1981 1982 /* 1983 * If nothing available 1984 */ 1985 if (md_set[setno].s_un_avail == 0) { 1986 return (MD_UNITBAD); 1987 } 1988 1989 mutex_enter(&md_mx); 1990 start = un = md_set[setno].s_un_next; 1991 1992 /* LINTED: E_CONSTANT_CONDITION */ 1993 while (1) { 1994 if (md_set[setno].s_un[un] == NULL) { 1995 /* 1996 * Advance the starting index for the next 1997 * md_get_nextunit call 1998 */ 1999 if (un == MD_MAXUNITS - 1) { 2000 md_set[setno].s_un_next = 0; 2001 } else { 2002 md_set[setno].s_un_next = un + 1; 2003 } 2004 break; 2005 } 2006 2007 un = ((un == MD_MAXUNITS - 1) ? 0 : un + 1); 2008 2009 if (un == start) { 2010 un = MD_UNITBAD; 2011 break; 2012 } 2013 2014 } 2015 2016 mutex_exit(&md_mx); 2017 return (un); 2018 }