1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2011 Bayard G. Bell. All rights reserved.
25 */
26
27 /*
28 * Md - is the meta-disk driver. It sits below the UFS file system
29 * but above the 'real' disk drivers, xy, id, sd etc.
30 *
31 * To the UFS software, md looks like a normal driver, since it has
32 * the normal kinds of entries in the bdevsw and cdevsw arrays. So
33 * UFS accesses md in the usual ways. In particular, the strategy
34 * routine, mdstrategy(), gets called by fbiwrite(), ufs_getapage(),
35 * and ufs_writelbn().
36 *
37 * Md maintains an array of minor devices (meta-partitions). Each
38 * meta partition stands for a matrix of real partitions, in rows
39 * which are not necessarily of equal length. Md maintains a table,
40 * with one entry for each meta-partition, which lists the rows and
41 * columns of actual partitions, and the job of the strategy routine
42 * is to translate from the meta-partition device and block numbers
43 * known to UFS into the actual partitions' device and block numbers.
44 *
45 * See below, in mdstrategy(), mdreal(), and mddone() for details of
46 * this translation.
47 */
48
49 /*
50 * Driver for Virtual Disk.
51 */
52
53 #include <sys/user.h>
54 #include <sys/sysmacros.h>
55 #include <sys/conf.h>
56 #include <sys/stat.h>
57 #include <sys/errno.h>
58 #include <sys/param.h>
59 #include <sys/systm.h>
60 #include <sys/file.h>
61 #include <sys/open.h>
62 #include <sys/dkio.h>
63 #include <sys/vtoc.h>
64 #include <sys/cmn_err.h>
65 #include <sys/ddi.h>
66 #include <sys/sunddi.h>
67 #include <sys/debug.h>
68 #include <sys/utsname.h>
69 #include <sys/lvm/mdvar.h>
70 #include <sys/lvm/md_names.h>
71 #include <sys/lvm/md_mddb.h>
72 #include <sys/lvm/md_sp.h>
73 #include <sys/types.h>
74 #include <sys/kmem.h>
75 #include <sys/cladm.h>
76 #include <sys/priv_names.h>
77 #include <sys/modhash.h>
78
79 int md_init_debug = 0; /* module binding debug */
80
81 /*
82 * Tunable to turn off the failfast behavior.
83 */
84 int md_ff_disable = 0;
85
86 /*
87 * dynamically allocated list of non FF driver names - needs to
88 * be freed when md is detached.
89 */
90 char **non_ff_drivers = NULL;
91
92 md_krwlock_t md_unit_array_rw; /* protects all unit arrays */
93 md_krwlock_t nm_lock; /* protects all the name spaces */
94
95 md_resync_t md_cpr_resync;
96
97 extern char svm_bootpath[];
98 #define SVM_PSEUDO_STR "/pseudo/md@0:"
99
100 #define VERSION_LENGTH 6
101 #define VERSION "1.0"
102
103 /*
104 * Keep track of possible 'orphan' entries in the name space
105 */
106 int *md_nm_snarfed = NULL;
107
108 /*
109 * Global tunable giving the percentage of free space left in replica during
110 * conversion of non-devid style replica to devid style replica.
111 */
112 int md_conv_perc = MDDB_DEVID_CONV_PERC;
113
114 #ifdef DEBUG
115 /* debug code to verify framework exclusion guarantees */
116 int md_in;
117 kmutex_t md_in_mx; /* used to md global stuff */
118 #define IN_INIT 0x01
119 #define IN_FINI 0x02
120 #define IN_ATTACH 0x04
121 #define IN_DETACH 0x08
122 #define IN_OPEN 0x10
123 #define MD_SET_IN(x) { \
124 mutex_enter(&md_in_mx); \
125 if (md_in) \
126 debug_enter("MD_SET_IN exclusion lost"); \
127 if (md_in & x) \
128 debug_enter("MD_SET_IN already set"); \
129 md_in |= x; \
130 mutex_exit(&md_in_mx); \
131 }
132
133 #define MD_CLR_IN(x) { \
134 mutex_enter(&md_in_mx); \
135 if (md_in & ~(x)) \
136 debug_enter("MD_CLR_IN exclusion lost"); \
137 if (!(md_in & x)) \
138 debug_enter("MD_CLR_IN already clr"); \
139 md_in &= ~x; \
140 mutex_exit(&md_in_mx); \
141 }
142 #else /* DEBUG */
143 #define MD_SET_IN(x)
144 #define MD_CLR_IN(x)
145 #endif /* DEBUG */
146 hrtime_t savetime1, savetime2;
147
148
149 /*
150 * list things protected by md_mx even if they aren't
151 * used in this file.
152 */
153 kmutex_t md_mx; /* used to md global stuff */
154 kcondvar_t md_cv; /* md_status events */
155 int md_status = 0; /* global status for the meta-driver */
156 int md_num_daemons = 0;
157 int md_ioctl_cnt = 0;
158 int md_mtioctl_cnt = 0; /* multithreaded ioctl cnt */
159 uint_t md_mdelay = 10; /* variable so can be patched */
160
161 int (*mdv_strategy_tstpnt)(buf_t *, int, void*);
162
163 major_t md_major, md_major_targ;
164
165 unit_t md_nunits = MD_MAXUNITS;
166 set_t md_nsets = MD_MAXSETS;
167 int md_nmedh = 0;
168 char *md_med_trans_lst = NULL;
169 md_set_t md_set[MD_MAXSETS];
170 md_set_io_t md_set_io[MD_MAXSETS];
171
172 md_krwlock_t hsp_rwlp; /* protects hot_spare_interface */
173 md_krwlock_t ni_rwlp; /* protects notify_interface */
174 md_ops_t **md_ops = NULL;
175 ddi_modhandle_t *md_mods = NULL;
176 md_ops_t *md_opslist;
177 clock_t md_hz;
178 md_event_queue_t *md_event_queue = NULL;
179
180 int md_in_upgrade;
181 int md_keep_repl_state;
182 int md_devid_destroy;
183
184 /* for sending messages thru a door to userland */
185 door_handle_t mdmn_door_handle = NULL;
186 int mdmn_door_did = -1;
187
188 dev_info_t *md_devinfo = NULL;
189
190 md_mn_nodeid_t md_mn_mynode_id = ~0u; /* My node id (for multi-node sets) */
191
192 static uint_t md_ocnt[OTYPCNT];
193
194 static int mdinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
195 static int mdattach(dev_info_t *, ddi_attach_cmd_t);
196 static int mddetach(dev_info_t *, ddi_detach_cmd_t);
197 static int mdopen(dev_t *, int, int, cred_t *);
198 static int mdclose(dev_t, int, int, cred_t *);
199 static int mddump(dev_t, caddr_t, daddr_t, int);
200 static int mdread(dev_t, struct uio *, cred_t *);
201 static int mdwrite(dev_t, struct uio *, cred_t *);
202 static int mdaread(dev_t, struct aio_req *, cred_t *);
203 static int mdawrite(dev_t, struct aio_req *, cred_t *);
204 static int mdioctl(dev_t, int, intptr_t, int, cred_t *, int *);
205 static int mdprop_op(dev_t, dev_info_t *,
206 ddi_prop_op_t, int, char *, caddr_t, int *);
207
208 static struct cb_ops md_cb_ops = {
209 mdopen, /* open */
210 mdclose, /* close */
211 mdstrategy, /* strategy */
212 /* print routine -- none yet */
213 (int(*)(dev_t, char *))nulldev,
214 mddump, /* dump */
215 mdread, /* read */
216 mdwrite, /* write */
217 mdioctl, /* ioctl */
218 /* devmap */
219 (int(*)(dev_t, devmap_cookie_t, offset_t, size_t, size_t *,
220 uint_t))nodev,
221 /* mmap */
222 (int(*)(dev_t, off_t, int))nodev,
223 /* segmap */
224 (int(*)(dev_t, off_t, struct as *, caddr_t *, off_t, unsigned,
225 unsigned, unsigned, cred_t *))nodev,
226 nochpoll, /* poll */
227 mdprop_op, /* prop_op */
228 0, /* streamtab */
229 (D_64BIT|D_MP|D_NEW), /* driver compatibility flag */
230 CB_REV, /* cb_ops version */
231 mdaread, /* aread */
232 mdawrite, /* awrite */
233 };
234
235 static struct dev_ops md_devops = {
236 DEVO_REV, /* dev_ops version */
237 0, /* device reference count */
238 mdinfo, /* info routine */
239 nulldev, /* identify routine */
240 nulldev, /* probe - not defined */
241 mdattach, /* attach routine */
242 mddetach, /* detach routine */
243 nodev, /* reset - not defined */
244 &md_cb_ops, /* driver operations */
245 NULL, /* bus operations */
246 nodev, /* power management */
247 ddi_quiesce_not_needed, /* quiesce */
248 };
249
250 /*
251 * loadable module wrapper
252 */
253 #include <sys/modctl.h>
254
255 static struct modldrv modldrv = {
256 &mod_driverops, /* type of module -- a pseudodriver */
257 "Solaris Volume Manager base module", /* name of the module */
258 &md_devops, /* driver ops */
259 };
260
261 static struct modlinkage modlinkage = {
262 MODREV_1,
263 { (void *)&modldrv, NULL }
264 };
265
266
267 /* md_medd.c */
268 extern void med_init(void);
269 extern void med_fini(void);
270 extern void md_devid_cleanup(set_t, uint_t);
271
272 /* md_names.c */
273 extern struct nm_next_hdr *get_first_record(set_t, int, int);
274
275 int md_maxphys = 0; /* maximum io size in bytes */
276 #define MD_MAXBCOUNT (1024 * 1024)
277 unsigned md_maxbcount = 0; /* maximum physio size in bytes */
278
279 /*
280 * Some md ioctls trigger io framework device tree operations. An
281 * example is md ioctls that call md_resolve_bydevid(): which uses the
282 * io framework to resolve a devid. Such operations result in acquiring
283 * io framework locks (like ndi_devi_enter() of "/") while holding
284 * driver locks (like md_unit_writerlock()).
285 *
286 * The prop_op(9E) entry point is called from the devinfo driver with
287 * an active ndi_devi_enter of "/". To avoid deadlock, md's prop_op
288 * implementation must avoid taking a lock that is held per above md
289 * ioctl description: i.e. mdprop_op(9E) can't call md_unit_readerlock()
290 * without risking deadlock.
291 *
292 * To service "size" requests without risking deadlock, we maintain a
293 * "mnum->nblocks" sizemap (protected by a short-term global mutex).
294 */
295 static kmutex_t md_nblocks_mutex;
296 static mod_hash_t *md_nblocksmap; /* mnum -> nblocks */
297 int md_nblocksmap_size = 512;
298
299 /*
300 * Maintain "mnum->nblocks" sizemap for mdprop_op use:
301 *
302 * Create: any code that establishes a unit's un_total_blocks needs the
303 * following type of call to establish nblocks for mdprop_op():
304 * md_nblocks_set(mnum, un->c.un_total_blocks);"
305 * NOTE: locate via cscope md_create_minor_node/md_create_unit_incore
306 * ...or "MD_UNIT..*="
307 *
308 * Change: any code that changes a unit's un_total_blocks needs the
309 * following type of call to sync nblocks for mdprop_op():
310 * md_nblocks_set(mnum, un->c.un_total_blocks);"
311 * NOTE: locate via cscope for "un_total_blocks[ \t]*="
312 *
313 * Destroy: any code that deletes a unit needs the following type of call
314 * to sync nblocks for mdprop_op():
315 * md_nblocks_set(mnum, -1ULL);
316 * NOTE: locate via cscope md_remove_minor_node/md_destroy_unit_incore
317 * ...or "MD_UNIT..*="
318 */
319 void
320 md_nblocks_set(minor_t mnum, uint64_t nblocks)
321 {
322 mutex_enter(&md_nblocks_mutex);
323 if (nblocks == -1ULL)
324 (void) mod_hash_destroy(md_nblocksmap,
325 (mod_hash_key_t)(intptr_t)mnum);
326 else
327 (void) mod_hash_replace(md_nblocksmap,
328 (mod_hash_key_t)(intptr_t)mnum,
329 (mod_hash_val_t)(intptr_t)nblocks);
330 mutex_exit(&md_nblocks_mutex);
331 }
332
333 /* get the size of a mnum from "mnum->nblocks" sizemap */
334 uint64_t
335 md_nblocks_get(minor_t mnum)
336 {
337 mod_hash_val_t hv;
338
339 mutex_enter(&md_nblocks_mutex);
340 if (mod_hash_find(md_nblocksmap,
341 (mod_hash_key_t)(intptr_t)mnum, &hv) == 0) {
342 mutex_exit(&md_nblocks_mutex);
343 return ((uint64_t)(intptr_t)hv);
344 }
345 mutex_exit(&md_nblocks_mutex);
346 return (0);
347 }
348
349 /* allocate/free dynamic space associated with driver globals */
350 void
351 md_global_alloc_free(int alloc)
352 {
353 set_t s;
354
355 if (alloc) {
356 /* initialize driver global locks */
357 cv_init(&md_cv, NULL, CV_DEFAULT, NULL);
358 mutex_init(&md_mx, NULL, MUTEX_DEFAULT, NULL);
359 rw_init(&md_unit_array_rw.lock, NULL, RW_DEFAULT, NULL);
360 rw_init(&nm_lock.lock, NULL, RW_DEFAULT, NULL);
361 rw_init(&ni_rwlp.lock, NULL, RW_DRIVER, NULL);
362 rw_init(&hsp_rwlp.lock, NULL, RW_DRIVER, NULL);
363 mutex_init(&md_cpr_resync.md_resync_mutex, NULL,
364 MUTEX_DEFAULT, NULL);
365 mutex_init(&md_nblocks_mutex, NULL, MUTEX_DEFAULT, NULL);
366
367 /* initialize per set driver global locks */
368 for (s = 0; s < MD_MAXSETS; s++) {
369 /* initialize per set driver globals locks */
370 mutex_init(&md_set[s].s_dbmx,
371 NULL, MUTEX_DEFAULT, NULL);
372 mutex_init(&md_set_io[s].md_io_mx,
373 NULL, MUTEX_DEFAULT, NULL);
374 cv_init(&md_set_io[s].md_io_cv,
375 NULL, CV_DEFAULT, NULL);
376 }
377 } else {
378 /* destroy per set driver global locks */
379 for (s = 0; s < MD_MAXSETS; s++) {
380 cv_destroy(&md_set_io[s].md_io_cv);
381 mutex_destroy(&md_set_io[s].md_io_mx);
382 mutex_destroy(&md_set[s].s_dbmx);
383 }
384
385 /* destroy driver global locks */
386 mutex_destroy(&md_nblocks_mutex);
387 mutex_destroy(&md_cpr_resync.md_resync_mutex);
388 rw_destroy(&hsp_rwlp.lock);
389 rw_destroy(&ni_rwlp.lock);
390 rw_destroy(&nm_lock.lock);
391 rw_destroy(&md_unit_array_rw.lock);
392 mutex_destroy(&md_mx);
393 cv_destroy(&md_cv);
394 }
395 }
396
/*
 * _init(9E): loadable module entry point.  Allocates the driver's
 * global locks, initializes driver-wide globals and tunables, then
 * registers the module via mod_install().  On mod_install() failure
 * the global allocations are freed again before returning the error.
 */
int
_init(void)
{
	set_t	s;
	int	err;

	MD_SET_IN(IN_INIT);

	/* allocate dynamic space associated with driver globals */
	md_global_alloc_free(1);

	/* initialize driver globals */
	md_major = ddi_name_to_major("md");
	md_hz = drv_usectohz(NUM_USEC_IN_SEC);

	/* initialize tunable globals (only if not patched to non-zero) */
	if (md_maxphys == 0)		/* maximum io size in bytes */
		md_maxphys = maxphys;
	if (md_maxbcount == 0)		/* maximum physio size in bytes */
		md_maxbcount = MD_MAXBCOUNT;

	/* initialize per set driver globals */
	for (s = 0; s < MD_MAXSETS; s++)
		md_set_io[s].io_state = MD_SET_ACTIVE;

	/*
	 * NOTE: the framework does not currently guarantee exclusion
	 * between _init and attach after calling mod_install.
	 */
	MD_CLR_IN(IN_INIT);
	if ((err = mod_install(&modlinkage))) {
		/* re-enter the DEBUG exclusion zone for the cleanup */
		MD_SET_IN(IN_INIT);
		md_global_alloc_free(0);	/* free dynamic space */
		MD_CLR_IN(IN_INIT);
	}
	return (err);
}
434
435 int
436 _fini(void)
437 {
438 int err;
439
440 /*
441 * NOTE: the framework currently does not guarantee exclusion
442 * with attach until after mod_remove returns 0.
443 */
444 if ((err = mod_remove(&modlinkage)))
445 return (err);
446
447 MD_SET_IN(IN_FINI);
448 md_global_alloc_free(0); /* free dynamic space */
449 MD_CLR_IN(IN_FINI);
450 return (err);
451 }
452
/*
 * _info(9E): report module information via the standard mod_info()
 * call on our modlinkage.
 */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}
458
/*
 * mdattach:
 *	attach(9E) entry point.  Performs one-time driver set-up:
 *	initializes the metadevice database subsystem, starts the md
 *	daemons, reads the driver tunable properties, allocates the
 *	set 0 unit arrays, creates the privileged "admin" minor node
 *	and, when booting from a mirrored root, pre-creates the root
 *	metadevice's minor node.  Only DDI_ATTACH of a single instance
 *	is supported (md_devinfo must still be NULL).
 *
 *	Returns DDI_SUCCESS or DDI_FAILURE.  On a mid-attach failure,
 *	mddetach() is invoked directly to unwind partial allocations.
 */
/* ARGSUSED */
static int
mdattach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int	len;
	unit_t	i;
	size_t	sz;
	char	ver[VERSION_LENGTH];
	char	**maj_str_array;
	char	*str, *str2;

	MD_SET_IN(IN_ATTACH);
	md_in_upgrade = 0;
	md_keep_repl_state = 0;
	md_devid_destroy = 0;

	/* only plain DDI_ATTACH is supported (no resume) */
	if (cmd != DDI_ATTACH) {
		MD_CLR_IN(IN_ATTACH);
		return (DDI_FAILURE);
	}

	/* single-instance driver: fail a second attach */
	if (md_devinfo != NULL) {
		MD_CLR_IN(IN_ATTACH);
		return (DDI_FAILURE);
	}

	mddb_init();

	if (md_start_daemons(TRUE)) {
		MD_CLR_IN(IN_ATTACH);
		mddb_unload();		/* undo mddb_init() allocations */
		return (DDI_FAILURE);
	}

	/* clear the halted state */
	md_clr_status(MD_GBL_HALTED);

	/* see if the diagnostic switch is on */
	if (ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "md_init_debug", 0))
		md_init_debug++;

	/* see if the failfast disable switch is on */
	if (ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "md_ff_disable", 0))
		md_ff_disable++;

	/* try and get the md_nmedh property; clamp to the legal range */
	md_nmedh = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "md_nmedh", MED_DEF_HOSTS);
	if ((md_nmedh <= 0) || (md_nmedh > MED_MAX_HOSTS))
		md_nmedh = MED_DEF_HOSTS;

	/*
	 * Try and get the md_med_trans_lst property; fall back to the
	 * default "tcp" transport when the property is absent, empty,
	 * or cannot be read.
	 */
	len = 0;
	if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN,
	    0, "md_med_trans_lst", NULL, &len) != DDI_PROP_SUCCESS ||
	    len == 0) {
		md_med_trans_lst = md_strdup("tcp");
	} else {
		md_med_trans_lst = kmem_zalloc((size_t)len, KM_SLEEP);
		if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN_AND_VAL_BUF,
		    0, "md_med_trans_lst", md_med_trans_lst, &len) !=
		    DDI_PROP_SUCCESS) {
			kmem_free(md_med_trans_lst, (size_t)len);
			md_med_trans_lst = md_strdup("tcp");
		}
	}

	/*
	 * Must initialize the internal data structures before the
	 * any possible calls to 'goto attach_failure' as _fini
	 * routine references them.
	 */
	med_init();

	md_ops = (md_ops_t **)kmem_zalloc(
	    sizeof (md_ops_t *) * MD_NOPS, KM_SLEEP);
	md_mods = (ddi_modhandle_t *)kmem_zalloc(
	    sizeof (ddi_modhandle_t) * MD_NOPS, KM_SLEEP);

	/* try and get the md_xlate property */
	/* Should we only do this if upgrade? */
	len = sizeof (char) * 5;
	if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN_AND_VAL_BUF,
	    0, "md_xlate_ver", ver, &len) == DDI_PROP_SUCCESS) {
		/* version must match exactly or the table layout is unknown */
		if (strcmp(ver, VERSION) == 0) {
			len = 0;
			if (ddi_prop_op(DDI_DEV_T_ANY, dip,
			    PROP_LEN_AND_VAL_ALLOC, 0, "md_xlate",
			    (caddr_t)&md_tuple_table, &len) !=
			    DDI_PROP_SUCCESS) {
				if (md_init_debug)
					cmn_err(CE_WARN,
					    "md_xlate ddi_prop_op failed");
				goto attach_failure;
			} else {
				/* each tuple is a pair of dev32_t values */
				md_tuple_length =
				    len/(2 * ((int)sizeof (dev32_t)));
				md_in_upgrade = 1;
			}

			/* Get target's name to major table */
			if (ddi_prop_lookup_string_array(DDI_DEV_T_ANY,
			    dip, DDI_PROP_DONTPASS,
			    "md_targ_nm_table", &maj_str_array,
			    &md_majortab_len) != DDI_PROP_SUCCESS) {
				md_majortab_len = 0;
				if (md_init_debug)
					cmn_err(CE_WARN, "md_targ_nm_table "
					    "ddi_prop_lookup_string_array "
					    "failed");
				goto attach_failure;
			}

			md_major_tuple_table =
			    (struct md_xlate_major_table *)
			    kmem_zalloc(md_majortab_len *
			    sizeof (struct md_xlate_major_table), KM_SLEEP);

			/*
			 * Each entry looks like "<drvname> <major>";
			 * split on the space and parse both halves.
			 */
			for (i = 0; i < md_majortab_len; i++) {
				/* Getting major name */
				str = strchr(maj_str_array[i], ' ');
				if (str == NULL)
					continue;
				*str = '\0';
				md_major_tuple_table[i].drv_name =
				    md_strdup(maj_str_array[i]);

				/* Simplified atoi to get major number */
				str2 = str + 1;
				md_major_tuple_table[i].targ_maj = 0;
				while ((*str2 >= '0') && (*str2 <= '9')) {
					md_major_tuple_table[i].targ_maj *= 10;
					md_major_tuple_table[i].targ_maj +=
					    *str2++ - '0';
				}
				/* restore the space before freeing the array */
				*str = ' ';
			}
			ddi_prop_free((void *)maj_str_array);
		} else {
			if (md_init_debug)
				cmn_err(CE_WARN, "md_xlate_ver is incorrect");
			goto attach_failure;
		}
	}

	/*
	 * Check for properties:
	 * 	md_keep_repl_state and md_devid_destroy
	 * and set globals if these exist.
	 */
	md_keep_repl_state = ddi_getprop(DDI_DEV_T_ANY, dip,
	    0, "md_keep_repl_state", 0);

	md_devid_destroy = ddi_getprop(DDI_DEV_T_ANY, dip,
	    0, "md_devid_destroy", 0);

	if (MD_UPGRADE)
		md_major_targ = md_targ_name_to_major("md");
	else
		md_major_targ = 0;

	/* allocate admin device node */
	if (ddi_create_priv_minor_node(dip, "admin", S_IFCHR,
	    MD_ADM_MINOR, DDI_PSEUDO, 0, NULL, PRIV_SYS_CONFIG, 0640))
		goto attach_failure;

	if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
	    DDI_KERNEL_IOCTL, NULL, 0) != DDI_SUCCESS)
		goto attach_failure;

	if (ddi_prop_update_int(DDI_DEV_T_NONE, dip,
	    "ddi-abrwrite-supported", 1) != DDI_SUCCESS)
		goto attach_failure;

	/* these could have been cleared by a detach */
	md_nunits = MD_MAXUNITS;
	md_nsets = MD_MAXSETS;

	sz = sizeof (void *) * MD_MAXUNITS;
	if (md_set[0].s_un == NULL)
		md_set[0].s_un = kmem_zalloc(sz, KM_SLEEP);
	if (md_set[0].s_ui == NULL)
		md_set[0].s_ui = kmem_zalloc(sz, KM_SLEEP);

	md_devinfo = dip;

	/*
	 * Only allocate device node for root mirror metadevice.
	 * Don't pre-allocate unnecessary device nodes (thus slowing down a
	 * boot when we attach).
	 * We can't read the mddbs in attach.  The mddbs will be read
	 * by metainit during the boot process when it is doing the
	 * auto-take processing and any other minor nodes will be
	 * allocated at that point.
	 *
	 * There are two scenarios to be aware of here:
	 * 1) when we are booting from a mirrored root we need the root
	 *    metadevice to exist very early (during vfs_mountroot processing)
	 * 2) we need all of the nodes to be created so that any mnttab entries
	 *    will succeed (handled by metainit reading the mddb during boot).
	 */
	if (strncmp(SVM_PSEUDO_STR, svm_bootpath, sizeof (SVM_PSEUDO_STR) - 1)
	    == 0) {
		char *p;
		int mnum = 0;

		/*
		 * The svm_bootpath string looks something like
		 * /pseudo/md@0:0,150,blk where 150 is the minor number
		 * in this example so we need to set the pointer p onto
		 * the first digit of the minor number and convert it
		 * from ascii.
		 */
		for (p = svm_bootpath + sizeof (SVM_PSEUDO_STR) + 1;
		    *p >= '0' && *p <= '9'; p++) {
			mnum *= 10;
			mnum += *p - '0';
		}

		if (md_create_minor_node(0, mnum)) {
			kmem_free(md_set[0].s_un, sz);
			kmem_free(md_set[0].s_ui, sz);
			goto attach_failure;
		}
	}

	/* create the hash to store the meta device sizes */
	md_nblocksmap = mod_hash_create_idhash("md_nblocksmap",
	    md_nblocksmap_size, mod_hash_null_valdtor);

	MD_CLR_IN(IN_ATTACH);
	return (DDI_SUCCESS);

attach_failure:
	/*
	 * Use our own detach routine to toss any stuff we allocated above.
	 * NOTE: detach will call md_halt to free the mddb_init allocations.
	 */
	MD_CLR_IN(IN_ATTACH);
	if (mddetach(dip, DDI_DETACH) != DDI_SUCCESS)
		cmn_err(CE_WARN, "detach from attach failed");
	return (DDI_FAILURE);
}
704
/*
 * mddetach:
 *	detach(9E) entry point.  If the driver has no active
 *	configuration it halts the volume manager (which unloads the
 *	mddb) and then frees everything mdattach() allocated: per-set
 *	unit arrays, the non-failfast driver name list, transport list,
 *	module/ops arrays, upgrade translation tables, properties,
 *	minor nodes and the nblocks size hash.
 *
 *	Returns DDI_SUCCESS, or DDI_FAILURE when a halt cannot be
 *	achieved (active configuration present).
 */
/* ARGSUSED */
static int
mddetach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	extern int	check_active_locators();
	set_t		s;
	size_t		sz;
	int		len;

	MD_SET_IN(IN_DETACH);

	/* check command */
	if (cmd != DDI_DETACH) {
		MD_CLR_IN(IN_DETACH);
		return (DDI_FAILURE);
	}

	/*
	 * if we have not already halted yet we have no active config
	 * then automatically initiate a halt so we can detach.
	 */
	if (!(md_get_status() & MD_GBL_HALTED)) {
		if (check_active_locators() == 0) {
			/*
			 * NOTE: a successful md_halt will have done the
			 * mddb_unload to free allocations done in mddb_init
			 */
			if (md_halt(MD_NO_GBL_LOCKS_HELD)) {
				cmn_err(CE_NOTE, "md:detach: "
				    "Could not halt Solaris Volume Manager");
				MD_CLR_IN(IN_DETACH);
				return (DDI_FAILURE);
			}
		}

		/* fail detach if we have not halted */
		if (!(md_get_status() & MD_GBL_HALTED)) {
			MD_CLR_IN(IN_DETACH);
			return (DDI_FAILURE);
		}
	}

	/* must be in halted state, this will be cleared on next attach */
	ASSERT(md_get_status() & MD_GBL_HALTED);

	/* cleanup attach allocations and initializations */
	md_major_targ = 0;

	/* free the per-set unit and unit-incore arrays */
	sz = sizeof (void *) * md_nunits;
	for (s = 0; s < md_nsets; s++) {
		if (md_set[s].s_un != NULL) {
			kmem_free(md_set[s].s_un, sz);
			md_set[s].s_un = NULL;
		}

		if (md_set[s].s_ui != NULL) {
			kmem_free(md_set[s].s_ui, sz);
			md_set[s].s_ui = NULL;
		}
	}
	/* cleared here; restored to MD_MAXUNITS/MD_MAXSETS on next attach */
	md_nunits = 0;
	md_nsets = 0;
	md_nmedh = 0;

	if (non_ff_drivers != NULL) {
		int	i;

		for (i = 0; non_ff_drivers[i] != NULL; i++)
			kmem_free(non_ff_drivers[i],
			    strlen(non_ff_drivers[i]) + 1);

		/* free i+1 entries because there is a null entry at list end */
		kmem_free(non_ff_drivers, (i + 1) * sizeof (char *));
		non_ff_drivers = NULL;
	}

	if (md_med_trans_lst != NULL) {
		kmem_free(md_med_trans_lst, strlen(md_med_trans_lst) + 1);
		md_med_trans_lst = NULL;
	}

	if (md_mods != NULL) {
		kmem_free(md_mods, sizeof (ddi_modhandle_t) * MD_NOPS);
		md_mods = NULL;
	}

	if (md_ops != NULL) {
		kmem_free(md_ops, sizeof (md_ops_t *) * MD_NOPS);
		md_ops = NULL;
	}

	/* free upgrade-only translation tables (see mdattach md_xlate) */
	if (MD_UPGRADE) {
		len = md_tuple_length * (2 * ((int)sizeof (dev32_t)));
		md_in_upgrade = 0;
		md_xlate_free(len);
		md_majortab_free();
	}

	/*
	 * Undo what we did in mdattach, freeing resources
	 * and removing things we installed.  The system
	 * framework guarantees we are not active with this devinfo
	 * node in any other entry points at this time.
	 */
	ddi_prop_remove_all(dip);
	ddi_remove_minor_node(dip, NULL);

	med_fini();

	mod_hash_destroy_idhash(md_nblocksmap);

	md_devinfo = NULL;

	MD_CLR_IN(IN_DETACH);
	return (DDI_SUCCESS);
}
821
822
823 /*
824 * Given the device number return the devinfo pointer
825 * given to md via md_attach
826 */
827 /*ARGSUSED*/
828 static int
829 mdinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
830 {
831 int error = DDI_FAILURE;
832
833 switch (infocmd) {
834 case DDI_INFO_DEVT2DEVINFO:
835 if (md_devinfo) {
836 *result = (void *)md_devinfo;
837 error = DDI_SUCCESS;
838 }
839 break;
840
841 case DDI_INFO_DEVT2INSTANCE:
842 *result = (void *)0;
843 error = DDI_SUCCESS;
844 break;
845 }
846 return (error);
847 }
848
849 /*
850 * property operation routine. return the number of blocks for the partition
851 * in question or forward the request to the property facilities.
852 */
853 static int
854 mdprop_op(
855 dev_t dev, /* device number associated with device */
856 dev_info_t *dip, /* device info struct for this device */
857 ddi_prop_op_t prop_op, /* property operator */
858 int mod_flags, /* property flags */
859 char *name, /* name of property */
860 caddr_t valuep, /* where to put property value */
861 int *lengthp) /* put length of property here */
862 {
863 return (ddi_prop_op_nblocks(dev, dip, prop_op, mod_flags,
864 name, valuep, lengthp, md_nblocks_get(getminor(dev))));
865 }
866
867 static void
868 snarf_user_data(set_t setno)
869 {
870 mddb_recid_t recid;
871 mddb_recstatus_t status;
872
873 recid = mddb_makerecid(setno, 0);
874 while ((recid = mddb_getnextrec(recid, MDDB_USER, 0)) > 0) {
875 if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
876 continue;
877
878 status = mddb_getrecstatus(recid);
879 if (status == MDDB_STALE)
880 continue;
881
882 if (status == MDDB_NODATA) {
883 mddb_setrecprivate(recid, MD_PRV_PENDDEL);
884 continue;
885 }
886
887 ASSERT(status == MDDB_OK);
888
889 mddb_setrecprivate(recid, MD_PRV_GOTIT);
890 }
891 }
892
/*
 * md_print_block_usage:
 *	Emit console warnings listing each replica (locator) in the set
 *	that is too small to accommodate 'blks' additional blocks, along
 *	with how many more blocks it would need.  Called when a replica
 *	cannot be converted to devid style for lack of space (see the
 *	caller in md_snarf_db_set()).
 *
 * Input:
 *	s	- mddb set whose locators are examined
 *	blks	- additional blocks needed for the conversion
 */
static void
md_print_block_usage(mddb_set_t *s, uint_t blks)
{
	uint_t		ib;
	int		li;
	mddb_mb_ic_t	*mbip;
	uint_t		max_blk_needed;
	mddb_lb_t	*lbp;
	mddb_sidelocator_t	*slp;
	int		drv_index;
	md_splitname	sn;
	char		*name;
	char		*suffix;
	size_t		prefixlen;
	size_t		suffixlen;
	int		alloc_sz;


	/* required capacity: blocks already in use plus the new ones */
	max_blk_needed = s->s_totalblkcnt - s->s_freeblkcnt + blks;

	cmn_err(CE_WARN, "Blocks in Metadevice State Database: %d\n"
	    "            Additional Blocks Needed:            %d\n\n"
	    "            Increase size of following replicas for\n"
	    "            device relocatability by deleting listed\n"
	    "            replica and re-adding replica with\n"
	    "            increased size (see metadb(1M)):\n"
	    "                Replica                   Increase By",
	    s->s_totalblkcnt, (blks - s->s_freeblkcnt));

	lbp = s->s_lbp;

	for (li = 0; li < lbp->lb_loccnt; li++) {
		if (lbp->lb_locators[li].l_flags & MDDB_F_DELETED)
			continue;
		/* sum this replica's capacity over its master blocks */
		ib = 0;
		for (mbip = s->s_mbiarray[li]; mbip != NULL;
		    mbip = mbip->mbi_next) {
			ib += (uint_t)mbip->mbi_mddb_mb.mb_blkcnt;
		}
		if (ib == 0)
			continue;
		if (ib < max_blk_needed) {
			slp = &lbp->lb_sidelocators[s->s_sideno][li];
			drv_index = slp->l_drvnm_index;
			mddb_locatorblock2splitname(s->s_lnp, li, s->s_sideno,
			    &sn);
			prefixlen = SPN_PREFIX(&sn).pre_len;
			suffixlen = SPN_SUFFIX(&sn).suf_len;
			/* prefix + '/' + suffix + terminating NUL */
			alloc_sz = (int)(prefixlen + suffixlen + 2);
			name = (char *)kmem_alloc(alloc_sz, KM_SLEEP);
			/*
			 * strncpy is safe here: both copies are exactly
			 * bounded and the NUL is stored explicitly below.
			 */
			(void) strncpy(name, SPN_PREFIX(&sn).pre_data,
			    prefixlen);
			name[prefixlen] = '/';
			suffix = name + (prefixlen + 1);
			(void) strncpy(suffix, SPN_SUFFIX(&sn).suf_data,
			    suffixlen);
			name[prefixlen + suffixlen + 1] = '\0';
			cmn_err(CE_WARN,
			    "  %s (%s:%d:%d)   %d blocks",
			    name, lbp->lb_drvnm[drv_index].dn_data,
			    slp->l_mnum, lbp->lb_locators[li].l_blkno,
			    (max_blk_needed - ib));
			kmem_free(name, alloc_sz);
		}
	}
}
959
960 /*
961 * md_create_minor_node:
962 * Create the minor device for the given set and un_self_id.
963 *
964 * Input:
965 * setno - set number
966 * mnum - selfID of unit
967 *
968 * Output:
969 * None.
970 *
971 * Returns 0 for success, 1 for failure.
972 *
973 * Side-effects:
974 * None.
975 */
976 int
977 md_create_minor_node(set_t setno, minor_t mnum)
978 {
979 char name[20];
980
981 /* Check for valid arguments */
982 if (setno >= MD_MAXSETS || MD_MIN2UNIT(mnum) >= MD_MAXUNITS)
983 return (1);
984
985 (void) snprintf(name, 20, "%u,%u,blk",
986 (unsigned)setno, (unsigned)MD_MIN2UNIT(mnum));
987
988 if (ddi_create_minor_node(md_devinfo, name, S_IFBLK,
989 MD_MKMIN(setno, mnum), DDI_PSEUDO, 0))
990 return (1);
991
992 (void) snprintf(name, 20, "%u,%u,raw",
993 (unsigned)setno, (unsigned)MD_MIN2UNIT(mnum));
994
995 if (ddi_create_minor_node(md_devinfo, name, S_IFCHR,
996 MD_MKMIN(setno, mnum), DDI_PSEUDO, 0))
997 return (1);
998
999 return (0);
1000 }
1001
1002 /*
1003 * For a given key check if it is an orphaned record.
1004 * The following conditions are used to determine an orphan.
1005 * 1. The device associated with that key is not a metadevice.
1006 * 2. If DEVID_STYLE then the physical device does not have a device Id
1007 * associated with it.
1008 *
1009 * If a key does not have an entry in the devid namespace it could be
1010 * a device that does not support device ids. Hence the record is not
1011 * deleted.
1012 */
1013
1014 static int
1015 md_verify_orphaned_record(set_t setno, mdkey_t key)
1016 {
1017 md_dev64_t odev; /* orphaned dev */
1018 mddb_set_t *s;
1019 side_t side = 0;
1020 struct nm_next_hdr *did_nh = NULL;
1021
1022 s = (mddb_set_t *)md_set[setno].s_db;
1023 if ((did_nh = get_first_record(setno, 1, (NM_DEVID | NM_NOTSHARED)))
1024 == NULL)
1025 return (0);
1026 /*
1027 * If devid style is set then get the dev_t using MD_NOTRUST_DEVT
1028 */
1029 if (s->s_lbp->lb_flags & MDDB_DEVID_STYLE) {
1030 odev = md_getdevnum(setno, side, key, MD_NOTRUST_DEVT);
1031 if ((odev == NODEV64) || (md_getmajor(odev) == md_major))
1032 return (0);
1033 if (lookup_entry(did_nh, setno, side, key, odev, NM_DEVID) ==
1034 NULL)
1035 return (1);
1036 }
1037 return (0);
1038 }
1039
/*
 * Snarf (load) the replica database for the given set into the kernel:
 * start the md daemons if needed, load the name spaces, optionally
 * convert the replica to device-id format, load any driver modules
 * referenced by database records, let each driver build its incore
 * units, then commit/delete pending records and clean up orphaned
 * namespace entries.
 *
 * Returns 0 on success (including "already snarfed"), -1 on failure
 * with *ep (when non-NULL) describing the error.  Single-threaded per
 * set via md_haltsnarf_enter/md_haltsnarf_exit.
 */
int
md_snarf_db_set(set_t setno, md_error_t *ep)
{
	int			err = 0;
	int			i;
	mddb_recid_t		recid;
	mddb_type_t		drvrid;
	mddb_recstatus_t	status;
	md_ops_t		*ops;
	uint_t			privat;
	mddb_set_t		*s;
	uint_t			cvt_blks;
	struct nm_next_hdr	*nh;
	mdkey_t			key = MD_KEYWILD;
	side_t			side = 0;
	int			size;
	int			devid_flag;
	int			retval;
	uint_t			un;
	int			un_next_set = 0;

	md_haltsnarf_enter(setno);

	/* nothing to do if another thread already snarfed this set */
	mutex_enter(&md_mx);
	if (md_set[setno].s_status & MD_SET_SNARFED) {
		mutex_exit(&md_mx);
		md_haltsnarf_exit(setno);
		return (0);
	}
	mutex_exit(&md_mx);

	/* make sure the md daemon threads are running before we proceed */
	if (! (md_get_status() & MD_GBL_DAEMONS_LIVE)) {
		if (md_start_daemons(TRUE)) {
			if (ep != NULL)
				(void) mdsyserror(ep, ENXIO);
			err = -1;
			goto out;
		}
	}


	/*
	 * Load the devid name space if it exists
	 */
	(void) md_load_namespace(setno, NULL, NM_DEVID);
	if (!md_load_namespace(setno, ep, 0L)) {
		/*
		 * Unload the devid namespace
		 */
		(void) md_unload_namespace(setno, NM_DEVID);
		err = -1;
		goto out;
	}

	/*
	 * If replica is in non-devid state, convert if:
	 *	- not in probe during upgrade (md_keep_repl_state = 0)
	 *	- enough space available in replica
	 *	- local set
	 *	- not a multi-node diskset
	 *	- clustering is not present (for non-local set)
	 */
	s = (mddb_set_t *)md_set[setno].s_db;
	devid_flag = 0;
	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE) && !md_keep_repl_state)
		devid_flag = 1;
	if (cluster_bootflags & CLUSTER_CONFIGURED)
		if (setno != MD_LOCAL_SET)
			devid_flag = 0;
	if (MD_MNSET_SETNO(setno))
		devid_flag = 0;
	if ((md_devid_destroy == 1) && (md_keep_repl_state == 1))
		devid_flag = 0;

	/*
	 * if we weren't devid style before and md_keep_repl_state=1
	 * we need to stay non-devid
	 */
	if ((md_keep_repl_state == 1) &&
	    ((s->s_lbp->lb_flags & MDDB_DEVID_STYLE) == 0))
		devid_flag = 0;
	if (devid_flag) {
		/*
		 * Determine number of free blocks needed to convert
		 * entire replica to device id format - locator blocks
		 * and namespace.
		 */
		cvt_blks = 0;
		/* first pass (flag 0) only sizes the conversion */
		if (mddb_lb_did_convert(s, 0, &cvt_blks) != 0) {
			if (ep != NULL)
				(void) mdsyserror(ep, EIO);
			err = -1;
			goto out;

		}
		cvt_blks += md_nm_did_chkspace(setno);

		/* add MDDB_DEVID_CONV_PERC% */
		if ((md_conv_perc > 0) && (md_conv_perc <= 100)) {
			cvt_blks = cvt_blks * (100 + md_conv_perc) / 100;
		}

		if (cvt_blks <= s->s_freeblkcnt) {
			/* second pass (flag 1) performs the conversion */
			if (mddb_lb_did_convert(s, 1, &cvt_blks) != 0) {
				if (ep != NULL)
					(void) mdsyserror(ep, EIO);
				err = -1;
				goto out;
			}

		} else {
			/*
			 * Print message that replica can't be converted for
			 * lack of space. No failure - just continue to
			 * run without device ids.
			 */
			cmn_err(CE_WARN,
			    "Unable to add Solaris Volume Manager device "
			    "relocation data.\n"
			    " To use device relocation feature:\n"
			    " - Increase size of listed replicas\n"
			    " - Reboot");
			md_print_block_usage(s, cvt_blks);
			cmn_err(CE_WARN,
			    "Loading set without device relocation data.\n"
			    " Solaris Volume Manager disk movement "
			    "not tracked in local set.");
		}
	}

	/*
	 * go through and load any modules referenced in
	 * data base
	 */
	recid = mddb_makerecid(setno, 0);
	while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) {
		status = mddb_getrecstatus(recid);
		if (status == MDDB_STALE) {
			if (! (md_get_setstatus(setno) & MD_SET_STALE)) {
				md_set_setstatus(setno, MD_SET_STALE);
				cmn_err(CE_WARN,
				    "md: state database is stale");
			}
		} else if (status == MDDB_NODATA) {
			/* record carries no data; mark it for deletion */
			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
			continue;
		}
		drvrid = mddb_getrectype1(recid);
		if (drvrid < MDDB_FIRST_MODID)
			continue;
		if (md_loadsubmod(setno, md_getshared_name(setno, drvrid),
		    drvrid) < 0) {
			cmn_err(CE_NOTE, "md: could not load misc/%s",
			    md_getshared_name(setno, drvrid));
		}
	}

	/* a negative recid from mddb_getnextrec indicates a scan error */
	if (recid < 0)
		goto out;

	snarf_user_data(setno);

	/*
	 * Initialize the md_nm_snarfed array
	 * this array is indexed by the key and
	 * is set by md_getdevnum during the snarf time
	 */
	if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) != NULL) {
		size = (int)((((struct nm_rec_hdr *)nh->nmn_record)->
		    r_next_key) * (sizeof (int)));
		md_nm_snarfed = (int *)kmem_zalloc(size, KM_SLEEP);
	}

	/*
	 * go through and snarf until nothing gets added
	 */
	do {
		i = 0;
		for (ops = md_opslist; ops != NULL; ops = ops->md_next) {
			if (ops->md_snarf != NULL) {
				retval = ops->md_snarf(MD_SNARF_DOIT, setno);
				if (retval == -1) {
					err = -1;
					/* Don't know the failed unit */
					(void) mdmderror(ep, MDE_RR_ALLOC_ERROR,
					    0);
					(void) md_halt_set(setno, MD_HALT_ALL);
					(void) mddb_unload_set(setno);
					md_haltsnarf_exit(setno);
					return (err);
				} else {
					i += retval;
				}
			}
		}
	} while (i);

	/*
	 * Set the first available slot and availability
	 */
	md_set[setno].s_un_avail = 0;
	for (un = 0; un < MD_MAXUNITS; un++) {
		if (md_set[setno].s_un[un] != NULL) {
			continue;
		} else {
			if (!un_next_set) {
				md_set[setno].s_un_next = un;
				un_next_set = 1;
			}
			md_set[setno].s_un_avail++;
		}
	}

	md_set_setstatus(setno, MD_SET_SNARFED);

	/* commit every record the drivers marked for commit during snarf */
	recid = mddb_makerecid(setno, 0);
	while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) {
		privat = mddb_getrecprivate(recid);
		if (privat & MD_PRV_COMMIT) {
			if (mddb_commitrec(recid)) {
				if (!(md_get_setstatus(setno) & MD_SET_STALE)) {
					md_set_setstatus(setno, MD_SET_STALE);
					cmn_err(CE_WARN,
					    "md: state database is stale");
				}
			}
			mddb_setrecprivate(recid, MD_PRV_GOTIT);
		}
	}

	/* Deletes must happen after all the commits */
	recid = mddb_makerecid(setno, 0);
	while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) {
		privat = mddb_getrecprivate(recid);
		if (privat & MD_PRV_DELETE) {
			if (mddb_deleterec(recid)) {
				if (!(md_get_setstatus(setno) & MD_SET_STALE)) {
					md_set_setstatus(setno, MD_SET_STALE);
					cmn_err(CE_WARN,
					    "md: state database is stale");
				}
				mddb_setrecprivate(recid, MD_PRV_GOTIT);
			}
			/* deletion invalidates the scan; restart it */
			recid = mddb_makerecid(setno, 0);
		}
	}

	/*
	 * go through and clean up records until nothing gets cleaned up.
	 */
	do {
		i = 0;
		for (ops = md_opslist; ops != NULL; ops = ops->md_next)
			if (ops->md_snarf != NULL)
				i += ops->md_snarf(MD_SNARF_CLEANUP, setno);
	} while (i);

	if (md_nm_snarfed != NULL &&
	    !(md_get_setstatus(setno) & MD_SET_STALE)) {
		/*
		 * go thru and cleanup the namespace and the device id
		 * name space
		 */
		for (key = 1;
		    key < ((struct nm_rec_hdr *)nh->nmn_record)->r_next_key;
		    key++) {
			/*
			 * Is the entry an 'orphan'?
			 */
			if (lookup_entry(nh, setno, side, key, NODEV64, 0L) !=
			    NULL) {
				/*
				 * If the value is not set then apparently
				 * it is not part of the current configuration,
				 * remove it this can happen when system panic
				 * between the primary name space update and
				 * the device id name space update
				 */
				if (md_nm_snarfed[key] == 0) {
					if (md_verify_orphaned_record(setno,
					    key) == 1)
						(void) remove_entry(nh,
						    side, key, 0L);
				}
			}
		}
	}

	if (md_nm_snarfed != NULL) {
		/*
		 * Done and free the memory
		 */
		kmem_free(md_nm_snarfed, size);
		md_nm_snarfed = NULL;
	}

	if (s->s_lbp->lb_flags & MDDB_DEVID_STYLE &&
	    !(md_get_setstatus(setno) & MD_SET_STALE)) {
		/*
		 * if the destroy flag has been set and
		 * the MD_SET_DIDCLUP bit is not set in
		 * the set's status field, cleanup the
		 * entire device id namespace
		 */
		if (md_devid_destroy &&
		    !(md_get_setstatus(setno) & MD_SET_DIDCLUP)) {
			(void) md_devid_cleanup(setno, 1);
			md_set_setstatus(setno, MD_SET_DIDCLUP);
		} else
			(void) md_devid_cleanup(setno, 0);
	}

	/*
	 * clear single threading on snarf, return success or error
	 */
out:
	md_haltsnarf_exit(setno);
	return (err);
}
1359
1360 void
1361 get_minfo(struct dk_minfo *info, minor_t mnum)
1362 {
1363 md_unit_t *un;
1364 mdi_unit_t *ui;
1365
1366 info->dki_capacity = 0;
1367 info->dki_lbsize = 0;
1368 info->dki_media_type = 0;
1369
1370 if ((ui = MDI_UNIT(mnum)) == NULL) {
1371 return;
1372 }
1373 un = (md_unit_t *)md_unit_readerlock(ui);
1374 info->dki_capacity = un->c.un_total_blocks;
1375 md_unit_readerexit(ui);
1376 info->dki_lbsize = DEV_BSIZE;
1377 info->dki_media_type = DK_UNKNOWN;
1378 }
1379
1380
1381 void
1382 get_info(struct dk_cinfo *info, minor_t mnum)
1383 {
1384 /*
1385 * Controller Information
1386 */
1387 info->dki_ctype = DKC_MD;
1388 info->dki_cnum = ddi_get_instance(ddi_get_parent(md_devinfo));
1389 (void) strcpy(info->dki_cname,
1390 ddi_get_name(ddi_get_parent(md_devinfo)));
1391 /*
1392 * Unit Information
1393 */
1394 info->dki_unit = mnum;
1395 info->dki_slave = 0;
1396 (void) strcpy(info->dki_dname, ddi_driver_name(md_devinfo));
1397 info->dki_flags = 0;
1398 info->dki_partition = 0;
1399 info->dki_maxtransfer = (ushort_t)(md_maxphys / DEV_BSIZE);
1400
1401 /*
1402 * We can't get from here to there yet
1403 */
1404 info->dki_addr = 0;
1405 info->dki_space = 0;
1406 info->dki_prio = 0;
1407 info->dki_vec = 0;
1408 }
1409
1410 /*
1411 * open admin device
1412 */
1413 static int
1414 mdadminopen(
1415 int flag,
1416 int otyp)
1417 {
1418 int err = 0;
1419
1420 /* single thread */
1421 mutex_enter(&md_mx);
1422
1423 /* check type and flags */
1424 if ((otyp != OTYP_CHR) && (otyp != OTYP_LYR)) {
1425 err = EINVAL;
1426 goto out;
1427 }
1428 if (((flag & FEXCL) && (md_status & MD_GBL_OPEN)) ||
1429 (md_status & MD_GBL_EXCL)) {
1430 err = EBUSY;
1431 goto out;
1432 }
1433
1434 /* count and flag open */
1435 md_ocnt[otyp]++;
1436 md_status |= MD_GBL_OPEN;
1437 if (flag & FEXCL)
1438 md_status |= MD_GBL_EXCL;
1439
1440 /* unlock return success */
1441 out:
1442 mutex_exit(&md_mx);
1443 return (err);
1444 }
1445
/*
 * open entry point
 *
 * Dispatches admin-device opens to mdadminopen().  For metadevice
 * minors: validates the minor number, snarfs the local set and checks
 * the target set was snarfed, waits out any in-progress multi-owner
 * open, refuses writable opens of stale sets and of subdevices, and
 * finally either calls the underlying driver's open routine or counts
 * the open itself.  Returns 0 or an errno value.
 */
static int
mdopen(
	dev_t		*dev,
	int		flag,
	int		otyp,
	cred_t		*cred_p)
{
	minor_t		mnum = getminor(*dev);
	unit_t		unit = MD_MIN2UNIT(mnum);
	set_t		setno = MD_MIN2SET(mnum);
	mdi_unit_t	*ui = NULL;
	int		err = 0;
	md_parent_t	parent;

	/* dispatch admin device opens */
	if (mnum == MD_ADM_MINOR)
		return (mdadminopen(flag, otyp));

	/* lock, check status */
	rw_enter(&md_unit_array_rw.lock, RW_READER);

tryagain:
	if (md_get_status() & MD_GBL_HALTED) {
		err = ENODEV;
		goto out;
	}

	/* check minor */
	if ((setno >= md_nsets) || (unit >= md_nunits)) {
		err = ENXIO;
		goto out;
	}

	/* make sure we're snarfed; the local set is snarfed on demand */
	if ((md_get_setstatus(MD_LOCAL_SET) & MD_SET_SNARFED) == 0) {
		if (md_snarf_db_set(MD_LOCAL_SET, NULL) != 0) {
			err = ENODEV;
			goto out;
		}
	}
	if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0) {
		err = ENODEV;
		goto out;
	}

	/* check unit */
	if ((ui = MDI_UNIT(mnum)) == NULL) {
		err = ENXIO;
		goto out;
	}

	/*
	 * The softpart open routine may do an I/O during the open, in
	 * which case the open routine will set the OPENINPROGRESS flag
	 * and drop all locks during the I/O.  If this thread sees
	 * the OPENINPROGRESS flag set, it should wait until the flag
	 * is reset before calling the driver's open routine.  It must
	 * also revalidate the world after it grabs the unit_array lock
	 * since the set may have been released or the metadevice cleared
	 * during the sleep.
	 */
	if (MD_MNSET_SETNO(setno)) {
		mutex_enter(&ui->ui_mx);
		if (ui->ui_lock & MD_UL_OPENINPROGRESS) {
			rw_exit(&md_unit_array_rw.lock);
			cv_wait(&ui->ui_cv, &ui->ui_mx);
			rw_enter(&md_unit_array_rw.lock, RW_READER);
			mutex_exit(&ui->ui_mx);
			/* revalidate everything after the sleep */
			goto tryagain;
		}
		mutex_exit(&ui->ui_mx);
	}

	/* Test if device is openable */
	if ((ui->ui_tstate & MD_NOTOPENABLE) != 0) {
		err = ENXIO;
		goto out;
	}

	/* don't allow opens w/WRITE flag if stale */
	if ((flag & FWRITE) && (md_get_setstatus(setno) & MD_SET_STALE)) {
		err = EROFS;
		goto out;
	}

	/* don't allow writes to subdevices */
	parent = md_get_parent(md_expldev(*dev));
	if ((flag & FWRITE) && MD_HAS_PARENT(parent)) {
		err = EROFS;
		goto out;
	}

	/* open underlying driver */
	if (md_ops[ui->ui_opsindex]->md_open != NULL) {
		if ((err = (*md_ops[ui->ui_opsindex]->md_open)
		    (dev, flag, otyp, cred_p, 0)) != 0)
			goto out;
	}

	/* or do it ourselves */
	else {
		/* single thread */
		(void) md_unit_openclose_enter(ui);
		err = md_unit_incopen(mnum, flag, otyp);
		md_unit_openclose_exit(ui);
		if (err != 0)
			goto out;
	}

	/* unlock, return status */
out:
	rw_exit(&md_unit_array_rw.lock);
	return (err);
}
1563
1564 /*
1565 * close admin device
1566 */
1567 static int
1568 mdadminclose(
1569 int otyp)
1570 {
1571 int i;
1572 int err = 0;
1573
1574 /* single thread */
1575 mutex_enter(&md_mx);
1576
1577 /* check type and flags */
1578 if ((otyp < 0) || (otyp >= OTYPCNT)) {
1579 err = EINVAL;
1580 goto out;
1581 } else if (md_ocnt[otyp] == 0) {
1582 err = ENXIO;
1583 goto out;
1584 }
1585
1586 /* count and flag closed */
1587 if (otyp == OTYP_LYR)
1588 md_ocnt[otyp]--;
1589 else
1590 md_ocnt[otyp] = 0;
1591 md_status &= ~MD_GBL_OPEN;
1592 for (i = 0; (i < OTYPCNT); ++i)
1593 if (md_ocnt[i] != 0)
1594 md_status |= MD_GBL_OPEN;
1595 if (! (md_status & MD_GBL_OPEN))
1596 md_status &= ~MD_GBL_EXCL;
1597
1598 /* unlock return success */
1599 out:
1600 mutex_exit(&md_mx);
1601 return (err);
1602 }
1603
1604 /*
1605 * close entry point
1606 */
1607 static int
1608 mdclose(
1609 dev_t dev,
1610 int flag,
1611 int otyp,
1612 cred_t *cred_p)
1613 {
1614 minor_t mnum = getminor(dev);
1615 set_t setno = MD_MIN2SET(mnum);
1616 unit_t unit = MD_MIN2UNIT(mnum);
1617 mdi_unit_t *ui = NULL;
1618 int err = 0;
1619
1620 /* dispatch admin device closes */
1621 if (mnum == MD_ADM_MINOR)
1622 return (mdadminclose(otyp));
1623
1624 /* check minor */
1625 if ((setno >= md_nsets) || (unit >= md_nunits) ||
1626 ((ui = MDI_UNIT(mnum)) == NULL)) {
1627 err = ENXIO;
1628 goto out;
1629 }
1630
1631 /* close underlying driver */
1632 if (md_ops[ui->ui_opsindex]->md_close != NULL) {
1633 if ((err = (*md_ops[ui->ui_opsindex]->md_close)
1634 (dev, flag, otyp, cred_p, 0)) != 0)
1635 goto out;
1636 }
1637
1638 /* or do it ourselves */
1639 else {
1640 /* single thread */
1641 (void) md_unit_openclose_enter(ui);
1642 err = md_unit_decopen(mnum, otyp);
1643 md_unit_openclose_exit(ui);
1644 if (err != 0)
1645 goto out;
1646 }
1647
1648 /* return success */
1649 out:
1650 return (err);
1651 }
1652
1653
1654 /*
1655 * This routine performs raw read operations. It is called from the
1656 * device switch at normal priority.
1657 *
1658 * The main catch is that the *uio struct which is passed to us may
1659 * specify a read which spans two buffers, which would be contiguous
1660 * on a single partition, but not on a striped partition. This will
1661 * be handled by mdstrategy.
1662 */
1663 /*ARGSUSED*/
1664 static int
1665 mdread(dev_t dev, struct uio *uio, cred_t *credp)
1666 {
1667 minor_t mnum;
1668 mdi_unit_t *ui;
1669 int error;
1670
1671 if (((mnum = getminor(dev)) == MD_ADM_MINOR) ||
1672 (MD_MIN2SET(mnum) >= md_nsets) ||
1673 (MD_MIN2UNIT(mnum) >= md_nunits) ||
1674 ((ui = MDI_UNIT(mnum)) == NULL))
1675 return (ENXIO);
1676
1677 if (md_ops[ui->ui_opsindex]->md_read != NULL)
1678 return ((*md_ops[ui->ui_opsindex]->md_read)
1679 (dev, uio, credp));
1680
1681 if ((error = md_chk_uio(uio)) != 0)
1682 return (error);
1683
1684 return (physio(mdstrategy, NULL, dev, B_READ, md_minphys, uio));
1685 }
1686
1687 /*
1688 * This routine performs async raw read operations. It is called from the
1689 * device switch at normal priority.
1690 *
1691 * The main catch is that the *aio struct which is passed to us may
1692 * specify a read which spans two buffers, which would be contiguous
1693 * on a single partition, but not on a striped partition. This will
1694 * be handled by mdstrategy.
1695 */
1696 /*ARGSUSED*/
1697 static int
1698 mdaread(dev_t dev, struct aio_req *aio, cred_t *credp)
1699 {
1700 minor_t mnum;
1701 mdi_unit_t *ui;
1702 int error;
1703
1704
1705 if (((mnum = getminor(dev)) == MD_ADM_MINOR) ||
1706 (MD_MIN2SET(mnum) >= md_nsets) ||
1707 (MD_MIN2UNIT(mnum) >= md_nunits) ||
1708 ((ui = MDI_UNIT(mnum)) == NULL))
1709 return (ENXIO);
1710
1711 if (md_ops[ui->ui_opsindex]->md_aread != NULL)
1712 return ((*md_ops[ui->ui_opsindex]->md_aread)
1713 (dev, aio, credp));
1714
1715 if ((error = md_chk_uio(aio->aio_uio)) != 0)
1716 return (error);
1717
1718 return (aphysio(mdstrategy, anocancel, dev, B_READ, md_minphys, aio));
1719 }
1720
1721 /*
1722 * This routine performs raw write operations. It is called from the
1723 * device switch at normal priority.
1724 *
1725 * The main catch is that the *uio struct which is passed to us may
1726 * specify a write which spans two buffers, which would be contiguous
1727 * on a single partition, but not on a striped partition. This is
1728 * handled by mdstrategy.
1729 *
1730 */
1731 /*ARGSUSED*/
1732 static int
1733 mdwrite(dev_t dev, struct uio *uio, cred_t *credp)
1734 {
1735 minor_t mnum;
1736 mdi_unit_t *ui;
1737 int error;
1738
1739 if (((mnum = getminor(dev)) == MD_ADM_MINOR) ||
1740 (MD_MIN2SET(mnum) >= md_nsets) ||
1741 (MD_MIN2UNIT(mnum) >= md_nunits) ||
1742 ((ui = MDI_UNIT(mnum)) == NULL))
1743 return (ENXIO);
1744
1745 if (md_ops[ui->ui_opsindex]->md_write != NULL)
1746 return ((*md_ops[ui->ui_opsindex]->md_write)
1747 (dev, uio, credp));
1748
1749 if ((error = md_chk_uio(uio)) != 0)
1750 return (error);
1751
1752 return (physio(mdstrategy, NULL, dev, B_WRITE, md_minphys, uio));
1753 }
1754
1755 /*
1756 * This routine performs async raw write operations. It is called from the
1757 * device switch at normal priority.
1758 *
1759 * The main catch is that the *aio struct which is passed to us may
1760 * specify a write which spans two buffers, which would be contiguous
1761 * on a single partition, but not on a striped partition. This is
1762 * handled by mdstrategy.
1763 *
1764 */
1765 /*ARGSUSED*/
1766 static int
1767 mdawrite(dev_t dev, struct aio_req *aio, cred_t *credp)
1768 {
1769 minor_t mnum;
1770 mdi_unit_t *ui;
1771 int error;
1772
1773
1774 if (((mnum = getminor(dev)) == MD_ADM_MINOR) ||
1775 (MD_MIN2SET(mnum) >= md_nsets) ||
1776 (MD_MIN2UNIT(mnum) >= md_nunits) ||
1777 ((ui = MDI_UNIT(mnum)) == NULL))
1778 return (ENXIO);
1779
1780 if (md_ops[ui->ui_opsindex]->md_awrite != NULL)
1781 return ((*md_ops[ui->ui_opsindex]->md_awrite)
1782 (dev, aio, credp));
1783
1784 if ((error = md_chk_uio(aio->aio_uio)) != 0)
1785 return (error);
1786
1787 return (aphysio(mdstrategy, anocancel, dev, B_WRITE, md_minphys, aio));
1788 }
1789
1790 int
1791 mdstrategy(struct buf *bp)
1792 {
1793 minor_t mnum;
1794 mdi_unit_t *ui;
1795
1796 ASSERT((bp->b_flags & B_DONE) == 0);
1797
1798 if (panicstr)
1799 md_clr_status(MD_GBL_DAEMONS_LIVE);
1800
1801 if (((mnum = getminor(bp->b_edev)) == MD_ADM_MINOR) ||
1802 (MD_MIN2SET(mnum) >= md_nsets) ||
1803 (MD_MIN2UNIT(mnum) >= md_nunits) ||
1804 ((ui = MDI_UNIT(mnum)) == NULL)) {
1805 bp->b_flags |= B_ERROR;
1806 bp->b_error = ENXIO;
1807 bp->b_resid = bp->b_bcount;
1808 biodone(bp);
1809 return (0);
1810 }
1811
1812 bp->b_flags &= ~(B_ERROR | B_DONE);
1813 if (md_ops[ui->ui_opsindex]->md_strategy != NULL) {
1814 (*md_ops[ui->ui_opsindex]->md_strategy) (bp, 0, NULL);
1815 } else {
1816 (void) errdone(ui, bp, ENXIO);
1817 }
1818 return (0);
1819 }
1820
1821 /*
1822 * Return true if the ioctl is allowed to be multithreaded.
1823 * All the ioctls with MN are sent only from the message handlers through
1824 * rpc.mdcommd, which (via it's own locking mechanism) takes care that not two
1825 * ioctl for the same metadevice are issued at the same time.
1826 * So we are safe here.
1827 * The other ioctls do not mess with any metadevice structures and therefor
1828 * are harmless too, if called multiple times at the same time.
1829 */
1830 static boolean_t
1831 is_mt_ioctl(int cmd) {
1832
1833 switch (cmd) {
1834 case MD_IOCGUNIQMSGID:
1835 case MD_IOCGVERSION:
1836 case MD_IOCISOPEN:
1837 case MD_MN_SET_MM_OWNER:
1838 case MD_MN_SET_STATE:
1839 case MD_MN_SUSPEND_WRITES:
1840 case MD_MN_ALLOCATE_HOTSPARE:
1841 case MD_MN_SET_SETFLAGS:
1842 case MD_MN_GET_SETFLAGS:
1843 case MD_MN_MDDB_OPTRECFIX:
1844 case MD_MN_MDDB_PARSE:
1845 case MD_MN_MDDB_BLOCK:
1846 case MD_MN_DB_USERREQ:
1847 case MD_IOC_SPSTATUS:
1848 case MD_MN_COMMD_ERR:
1849 case MD_MN_SET_COMMD_RUNNING:
1850 case MD_MN_RESYNC:
1851 case MD_MN_SETSYNC:
1852 case MD_MN_POKE_HOTSPARES:
1853 case MD_MN_RR_DIRTY:
1854 case MD_MN_RR_CLEAN:
1855 case MD_MN_IOC_SPUPDATEWM:
1856 return (1);
1857 default:
1858 return (0);
1859 }
1860 }
1861
/*
 * This routine implements the ioctl calls for the Virtual Disk System.
 * It is called from the device switch at normal priority.
 *
 * Most commands are single-threaded through the global ioctl lock;
 * commands accepted by is_mt_ioctl() may run concurrently (their
 * serialization is handled externally by rpc.mdcommd).  Admin-device
 * ioctls go to md_admin_ioctl(); metadevice ioctls are dispatched
 * through the unit's ops vector.  Any locks recorded in the IOLOCK
 * tracker are dropped on return by the IOLOCK_RETURN* macros.
 */
/* ARGSUSED */
static int
mdioctl(dev_t dev, int cmd, intptr_t data, int mode, cred_t *cred_p,
	int *rval_p)
{
	minor_t		mnum = getminor(dev);
	mdi_unit_t	*ui;
	IOLOCK		lock;
	int		err;

	/*
	 * For multinode disksets number of ioctls are allowed to be
	 * multithreaded.
	 * A fundamental assumption made in this implementation is that
	 * ioctls either do not interact with other md structures or the
	 * ioctl to the admin device can only occur if the metadevice
	 * device is open. i.e. avoid a race between metaclear and the
	 * progress of a multithreaded ioctl.
	 */

	if (!is_mt_ioctl(cmd) && md_ioctl_lock_enter() == EINTR) {
		return (EINTR);
	}

	/*
	 * initialize lock tracker
	 */
	IOLOCK_INIT(&lock);

	/* Flag to indicate that MD_GBL_IOCTL_LOCK is not acquired */

	if (is_mt_ioctl(cmd)) {
		/* increment the md_mtioctl_cnt */
		mutex_enter(&md_mx);
		md_mtioctl_cnt++;
		mutex_exit(&md_mx);
		/*
		 * MD_MT_IOCTL tells IOLOCK_RETURN* to undo the count --
		 * presumably decremented there; confirm in the macro.
		 */
		lock.l_flags |= MD_MT_IOCTL;
	}

	/*
	 * this has been added to prevent notification from re-snarfing
	 * so metaunload will work. It may interfere with other modules
	 * halt process.
	 */
	if (md_get_status() & (MD_GBL_HALTED | MD_GBL_DAEMONS_DIE))
		return (IOLOCK_RETURN(ENXIO, &lock));

	/*
	 * admin device ioctls
	 */
	if (mnum == MD_ADM_MINOR) {
		err = md_admin_ioctl(md_expldev(dev), cmd, (void *) data,
		    mode, &lock);
	}

	/*
	 * metadevice ioctls
	 */
	else if ((MD_MIN2SET(mnum) >= md_nsets) ||
	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
	    (md_set[MD_MIN2SET(mnum)].s_ui == NULL) ||
	    ((ui = MDI_UNIT(mnum)) == NULL)) {
		err = ENXIO;
	} else if (md_ops[ui->ui_opsindex]->md_ioctl == NULL) {
		/* the unit's driver provides no ioctl support */
		err = ENOTTY;
	} else {
		err = (*md_ops[ui->ui_opsindex]->md_ioctl)
		    (dev, cmd, (void *) data, mode, &lock);
	}

	/*
	 * drop any locks we grabbed
	 */
	return (IOLOCK_RETURN_IOCTLEND(err, &lock));
}
1941
1942 static int
1943 mddump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
1944 {
1945 minor_t mnum;
1946 set_t setno;
1947 mdi_unit_t *ui;
1948
1949 if ((mnum = getminor(dev)) == MD_ADM_MINOR)
1950 return (ENXIO);
1951
1952 setno = MD_MIN2SET(mnum);
1953
1954 if ((setno >= md_nsets) || (MD_MIN2UNIT(mnum) >= md_nunits) ||
1955 ((ui = MDI_UNIT(mnum)) == NULL))
1956 return (ENXIO);
1957
1958
1959 if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0)
1960 return (ENXIO);
1961
1962 if (md_ops[ui->ui_opsindex]->md_dump != NULL)
1963 return ((*md_ops[ui->ui_opsindex]->md_dump)
1964 (dev, addr, blkno, nblk));
1965
1966 return (ENXIO);
1967 }
1968
1969 /*
1970 * Metadevice unit number dispatcher
1971 * When this routine is called it will scan the
1972 * incore unit array and return the avail slot
1973 * hence the unit number to the caller
1974 *
1975 * Return -1 if there is nothing available
1976 */
1977 unit_t
1978 md_get_nextunit(set_t setno)
1979 {
1980 unit_t un, start;
1981
1982 /*
1983 * If nothing available
1984 */
1985 if (md_set[setno].s_un_avail == 0) {
1986 return (MD_UNITBAD);
1987 }
1988
1989 mutex_enter(&md_mx);
1990 start = un = md_set[setno].s_un_next;
1991
1992 /* LINTED: E_CONSTANT_CONDITION */
1993 while (1) {
1994 if (md_set[setno].s_un[un] == NULL) {
1995 /*
1996 * Advance the starting index for the next
1997 * md_get_nextunit call
1998 */
1999 if (un == MD_MAXUNITS - 1) {
2000 md_set[setno].s_un_next = 0;
2001 } else {
2002 md_set[setno].s_un_next = un + 1;
2003 }
2004 break;
2005 }
2006
2007 un = ((un == MD_MAXUNITS - 1) ? 0 : un + 1);
2008
2009 if (un == start) {
2010 un = MD_UNITBAD;
2011 break;
2012 }
2013
2014 }
2015
2016 mutex_exit(&md_mx);
2017 return (un);
2018 }