7127 remove -Wno-missing-braces from Makefile.uts
--- old/usr/src/uts/common/io/lvm/mirror/mirror.c
+++ new/usr/src/uts/common/io/lvm/mirror/mirror.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 * Copyright (c) 2011 Bayard G. Bell. All rights reserved.
25 25 */
26 26
27 27 #include <sys/param.h>
28 28 #include <sys/systm.h>
29 29 #include <sys/conf.h>
30 30 #include <sys/file.h>
31 31 #include <sys/user.h>
32 32 #include <sys/uio.h>
33 33 #include <sys/t_lock.h>
34 34 #include <sys/buf.h>
35 35 #include <sys/dkio.h>
36 36 #include <sys/vtoc.h>
37 37 #include <sys/kmem.h>
38 38 #include <vm/page.h>
39 39 #include <sys/cmn_err.h>
40 40 #include <sys/sysmacros.h>
41 41 #include <sys/types.h>
42 42 #include <sys/mkdev.h>
43 43 #include <sys/stat.h>
44 44 #include <sys/open.h>
45 45 #include <sys/modctl.h>
46 46 #include <sys/ddi.h>
47 47 #include <sys/sunddi.h>
48 48 #include <sys/debug.h>
49 49 #include <sys/dklabel.h>
50 50 #include <vm/hat.h>
51 51 #include <sys/lvm/mdvar.h>
52 52 #include <sys/lvm/md_mirror.h>
53 53 #include <sys/lvm/md_convert.h>
54 54 #include <sys/lvm/md_mddb.h>
55 55 #include <sys/esunddi.h>
56 56
57 57 #include <sys/sysevent/eventdefs.h>
58 58 #include <sys/sysevent/svm.h>
59 59 #include <sys/lvm/mdmn_commd.h>
60 60 #include <sys/avl.h>
61 61
62 62 md_ops_t mirror_md_ops;
63 63 #ifndef lint
64 64 md_ops_t *md_interface_ops = &mirror_md_ops;
65 65 #endif
66 66
67 67 extern mdq_anchor_t md_done_daemon;
68 68 extern mdq_anchor_t md_mstr_daemon;
69 69 extern mdq_anchor_t md_mirror_daemon;
70 70 extern mdq_anchor_t md_mirror_io_daemon;
71 71 extern mdq_anchor_t md_mirror_rs_daemon;
72 72 extern mdq_anchor_t md_mhs_daemon;
73 73
74 74 extern unit_t md_nunits;
75 75 extern set_t md_nsets;
76 76 extern md_set_t md_set[];
77 77
78 78 extern int md_status;
79 79 extern clock_t md_hz;
80 80
81 81 extern md_krwlock_t md_unit_array_rw;
82 82 extern kmutex_t md_mx;
83 83 extern kcondvar_t md_cv;
84 84 extern int md_mtioctl_cnt;
85 85
86 86 daemon_request_t mirror_timeout;
87 87 static daemon_request_t hotspare_request;
88 88 static daemon_request_t mn_hs_request[MD_MAXSETS]; /* Multinode hs req */
89 89
90 90 int md_mirror_mcs_buf_off;
91 91
92 92 /* Flags for mdmn_ksend_message to allow debugging */
93 93 int md_mirror_msg_flags;
94 94
94 94
95 95 #ifdef DEBUG
96 96 /* Flag to switch on debug messages */
97 97 int mirror_debug_flag = 0;
98 98 #endif
99 99
100 100 /*
101 101 * Struct used to hold count of DMR reads and the timestamp of last DMR read
102 102 * It is used to verify, using a debugger, that the DMR read ioctl has been
103 103 * executed.
104 104 */
105 -dmr_stats_t mirror_dmr_stats = {0, 0};
105 +dmr_stats_t mirror_dmr_stats = {0};
106 106
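The one-line change above is the substance of this webrev: the partially braced initializer {0, 0} is what -Wmissing-braces complains about once the suppression is removed from Makefile.uts. The real dmr_stats_t definition is not shown in this diff; below is a minimal sketch of the pattern, assuming (per the comment above) a structure that pairs a read count with a timestamp struct, with illustrative member names.

struct ts {                             /* stand-in for the timestamp member */
        int     tv_sec;
        int     tv_usec;
};

typedef struct dmr_stats {
        unsigned int    dmr_count;      /* number of DMR reads */
        struct ts       dmr_timestamp;  /* time of last DMR read */
} dmr_stats_t;

/*
 * With -Wmissing-braces in effect, gcc flags the old form because the
 * second 0 initializes dmr_timestamp.tv_sec without a brace level of
 * its own:
 *
 *      dmr_stats_t mirror_dmr_stats = {0, 0};
 *
 * The new form initializes only the first member explicitly; C
 * guarantees that the remaining members are zero-initialized, so the
 * whole aggregate still starts out as zeroes:
 */
dmr_stats_t mirror_dmr_stats = {0};

A side effect is that {0} does not hard-code the member count, so the initializer stays correct if fields are ever added to dmr_stats_t.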
107 107 /*
108 108 * Mutex protecting list of non-failfast drivers.
109 109 */
110 110 static kmutex_t non_ff_drv_mutex;
111 111 extern char **non_ff_drivers;
112 112
113 113 extern major_t md_major;
114 114
115 115 /*
116 116 * Write-On-Write memory pool.
117 117 */
118 118 static void copy_write_cont(wowhdr_t *wowhdr);
119 119 static kmem_cache_t *mirror_wowblk_cache = NULL;
120 120 static int md_wowbuf_size = 16384;
121 121 static size_t md_wowblk_size;
122 122
123 123 /*
124 124 * This is a flag that allows:
125 125 * - disabling the write-on-write mechanism.
126 126 * - logging occurrences of write-on-write
127 127 * - switching wow handling procedure processing
128 128  * Counter for occurrences of WOW.
129 129 */
130 130 static uint_t md_mirror_wow_flg = 0;
131 131 static int md_mirror_wow_cnt = 0;
132 132
133 133 /*
134 134 * Tunable to enable/disable dirty region
135 135 * processing when closing down a mirror.
136 136 */
137 137 static int new_resync = 1;
138 138 kmem_cache_t *mirror_parent_cache = NULL;
139 139 kmem_cache_t *mirror_child_cache = NULL;
140 140
141 141 extern int md_ff_disable; /* disable failfast */
142 142
143 143 static int mirror_map_write(mm_unit_t *, md_mcs_t *, md_mps_t *, int);
144 144 static void mirror_read_strategy(buf_t *, int, void *);
145 145 static void mirror_write_strategy(buf_t *, int, void *);
146 146 static void become_owner(daemon_queue_t *);
147 147 static int mirror_done(struct buf *cb);
148 148 static int mirror_done_common(struct buf *cb);
149 149 static void clear_retry_error(struct buf *cb);
150 150
151 151 /*
152 152 * patchables
153 153 */
154 154 int md_min_rr_size = 200; /* 2000 blocks, or 100k */
155 155 int md_def_num_rr = 1000; /* Default number of dirty regions */
156 156
157 157 /*
158 158 * patchable to change delay before rescheduling mirror ownership request.
159 159 * Value is clock ticks, default 0.5 seconds
160 160 */
161 161 clock_t md_mirror_owner_to = 500000;
162 162
163 163 /*ARGSUSED1*/
164 164 static int
165 165 mirror_parent_constructor(void *p, void *d1, int d2)
166 166 {
167 167 mutex_init(&((md_mps_t *)p)->ps_mx, NULL, MUTEX_DEFAULT, NULL);
168 168 return (0);
169 169 }
170 170
171 171 static void
172 172 mirror_parent_init(md_mps_t *ps)
173 173 {
174 174 bzero(ps, offsetof(md_mps_t, ps_mx));
175 175 bzero(&ps->ps_overlap_node, sizeof (avl_node_t));
176 176 }
177 177
178 178 /*ARGSUSED1*/
179 179 static void
180 180 mirror_parent_destructor(void *p, void *d)
181 181 {
182 182 mutex_destroy(&((md_mps_t *)p)->ps_mx);
183 183 }
184 184
185 185 /*ARGSUSED1*/
186 186 static int
187 187 mirror_child_constructor(void *p, void *d1, int d2)
188 188 {
189 189 bioinit(&((md_mcs_t *)p)->cs_buf);
190 190 return (0);
191 191 }
192 192
193 193 void
194 194 mirror_child_init(md_mcs_t *cs)
195 195 {
196 196 cs->cs_ps = NULL;
197 197 cs->cs_mdunit = 0;
198 198 md_bioreset(&cs->cs_buf);
199 199 }
200 200
201 201 /*ARGSUSED1*/
202 202 static void
203 203 mirror_child_destructor(void *p, void *d)
204 204 {
205 205 biofini(&((md_mcs_t *)p)->cs_buf);
206 206 }
207 207
208 208 static void
209 209 mirror_wowblk_init(wowhdr_t *p)
210 210 {
211 211 bzero(p, md_wowblk_size);
212 212 }
213 213
214 214 static void
215 215 send_poke_hotspares_msg(daemon_request_t *drq)
216 216 {
217 217 int rval;
218 218 int nretries = 0;
219 219 md_mn_msg_pokehsp_t pokehsp;
220 220 md_mn_kresult_t *kresult;
221 221 set_t setno = (set_t)drq->dq.qlen;
222 222
223 223 pokehsp.pokehsp_setno = setno;
224 224
225 225 kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
226 226
227 227 retry_sphmsg:
228 228 rval = mdmn_ksend_message(setno, MD_MN_MSG_POKE_HOTSPARES,
229 229 MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST, 0, (char *)&pokehsp,
230 230 sizeof (pokehsp), kresult);
231 231
232 232 if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
233 233 mdmn_ksend_show_error(rval, kresult, "POKE_HOTSPARES");
234 234 /* If we're shutting down already, pause things here. */
235 235 if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
236 236 while (!md_mn_is_commd_present()) {
237 237 delay(md_hz);
238 238 }
239 239 /*
240 240 * commd has become reachable again, so retry once.
241 241 * If this fails we'll panic as the system is in an
242 242 * unexpected state.
243 243 */
244 244 if (nretries++ == 0)
245 245 goto retry_sphmsg;
246 246 }
247 247 cmn_err(CE_PANIC,
248 248 "ksend_message failure: POKE_HOTSPARES");
249 249 }
250 250 kmem_free(kresult, sizeof (md_mn_kresult_t));
251 251
252 252 /* Allow further requests to use this set's queue structure */
253 253 mutex_enter(&drq->dr_mx);
254 254 drq->dr_pending = 0;
255 255 mutex_exit(&drq->dr_mx);
256 256 }
257 257
258 258 /*
259 259 * Send a poke_hotspares message to the master node. To avoid swamping the
260 260 * commd handler with requests we only send a message if there is not one
261 261 * already outstanding. We punt the request to a separate thread context as
262 262  * we cannot afford to block waiting on the request to be serviced. This is
263 263 * essential when a reconfig cycle is in progress as any open() of a multinode
264 264 * metadevice may result in a livelock.
265 265 */
266 266 static void
267 267 send_poke_hotspares(set_t setno)
268 268 {
269 269 daemon_request_t *drq = &mn_hs_request[setno];
270 270
271 271 mutex_enter(&drq->dr_mx);
272 272 if (drq->dr_pending == 0) {
273 273 drq->dr_pending = 1;
274 274 drq->dq.qlen = (int)setno;
275 275 daemon_request(&md_mhs_daemon,
276 276 send_poke_hotspares_msg, (daemon_queue_t *)drq, REQ_OLD);
277 277 }
278 278 mutex_exit(&drq->dr_mx);
279 279 }
280 280
281 281 void
282 282 mirror_set_sm_state(
283 283 mm_submirror_t *sm,
284 284 mm_submirror_ic_t *smic,
285 285 sm_state_t newstate,
286 286 int force)
287 287 {
288 288 int compcnt;
289 289 int i;
290 290 int errcnt;
291 291 sm_state_t origstate;
292 292 md_m_shared_t *shared;
293 293
294 294 if (force) {
295 295 sm->sm_state = newstate;
296 296 uniqtime32(&sm->sm_timestamp);
297 297 return;
298 298 }
299 299
300 300 origstate = newstate;
301 301
302 302 compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
303 303 for (i = 0, errcnt = 0; i < compcnt; i++) {
304 304 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
305 305 (sm->sm_dev, sm, i);
306 306 if (shared->ms_state & (CS_ERRED | CS_LAST_ERRED))
307 307 newstate |= SMS_COMP_ERRED;
308 308 if (shared->ms_state & (CS_RESYNC))
309 309 newstate |= SMS_COMP_RESYNC;
310 310 if (shared->ms_state & CS_ERRED)
311 311 errcnt++;
312 312 }
313 313
314 314 if ((newstate & (SMS_COMP_ERRED | SMS_COMP_RESYNC)) != 0)
315 315 newstate &= ~origstate;
316 316
317 317 if (errcnt == compcnt)
318 318 newstate |= SMS_ALL_ERRED;
319 319 else
320 320 newstate &= ~SMS_ALL_ERRED;
321 321
322 322 sm->sm_state = newstate;
323 323 uniqtime32(&sm->sm_timestamp);
324 324 }
325 325
326 326 static int
327 327 mirror_geterror(mm_unit_t *un, int *smi, int *cip, int clr_error,
328 328 int frm_probe)
329 329 {
330 330 mm_submirror_t *sm;
331 331 mm_submirror_ic_t *smic;
332 332 md_m_shared_t *shared;
333 333 int ci;
334 334 int i;
335 335 int compcnt;
336 336 int open_comp; /* flag for open component */
337 337
338 338 for (i = *smi; i < NMIRROR; i++) {
339 339 sm = &un->un_sm[i];
340 340 smic = &un->un_smic[i];
341 341
342 342 if (!SMS_IS(sm, SMS_INUSE))
343 343 continue;
344 344
345 345 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
346 346 for (ci = *cip; ci < compcnt; ci++) {
347 347 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
348 348 (sm->sm_dev, sm, ci);
349 349 /*
350 350 * if called from any routine but probe, we check for
351 351  * MDM_S_ISOPEN flag. Since probe does a pseudo open,
352 352 * it sets MDM_S_PROBEOPEN flag and we test for this
353 353 * flag. They are both exclusive tests.
354 354 */
355 355 open_comp = (frm_probe) ?
356 356 (shared->ms_flags & MDM_S_PROBEOPEN):
357 357 (shared->ms_flags & MDM_S_ISOPEN);
358 358 if (((shared->ms_flags & MDM_S_IOERR || !open_comp) &&
359 359 ((shared->ms_state == CS_OKAY) ||
360 360 (shared->ms_state == CS_RESYNC))) ||
361 361 (!open_comp &&
362 362 (shared->ms_state == CS_LAST_ERRED))) {
363 363 if (clr_error) {
364 364 shared->ms_flags &= ~MDM_S_IOERR;
365 365 }
366 366 *cip = ci;
367 367 *smi = i;
368 368 return (1);
369 369 }
370 370
371 371 if (clr_error && (shared->ms_flags & MDM_S_IOERR)) {
372 372 shared->ms_flags &= ~MDM_S_IOERR;
373 373 }
374 374 }
375 375
376 376 *cip = 0;
377 377 }
378 378 return (0);
379 379 }
380 380
381 381 /*ARGSUSED*/
382 382 static void
383 383 mirror_run_queue(void *d)
384 384 {
385 385 if (!(md_status & MD_GBL_DAEMONS_LIVE))
386 386 md_daemon(1, &md_done_daemon);
387 387 }
388 388 /*
389 389 * check_comp_4_hotspares
390 390 *
391 391 * This function attempts to allocate a hotspare for this component if the
392 392 * component is in error. In a MN set, the function can be called in 2 modes.
393 393 * It can be called either when a component error has been detected or when a
394 394 * new hotspare has been allocated. In this case, MD_HOTSPARE_XMIT is set
395 395 * in flags and the request is sent to all nodes.
396 396 * The handler on each of the nodes then calls this function with
397 397 * MD_HOTSPARE_XMIT unset and the hotspare allocation is then performed.
398 398 *
399 399 * For non-MN sets the function simply attempts to allocate a hotspare.
400 400 *
401 401 * On entry, the following locks are held
402 402 * mirror_md_ops.md_link_rw (if flags has MD_HOTSPARE_LINKHELD set)
403 403 * md_unit_writerlock
404 404 *
405 405 * Returns 0 if ok
406 406 * 1 if the unit containing the component has been cleared while
407 407 * the mdmn_ksend_message() was being executed
408 408 */
409 409 extern int
410 410 check_comp_4_hotspares(
411 411 mm_unit_t *un,
412 412 int smi,
413 413 int ci,
414 414 uint_t flags,
415 415 mddb_recid_t hs_id, /* Only used by MN disksets */
416 416 IOLOCK *lockp /* can be NULL */
417 417 )
418 418 {
419 419 mm_submirror_t *sm;
420 420 mm_submirror_ic_t *smic;
421 421 md_m_shared_t *shared;
422 422 mddb_recid_t recids[6];
423 423 minor_t mnum;
424 424 intptr_t (*hs_dev)();
425 425 void (*hs_done)();
426 426 void *hs_data;
427 427 md_error_t mde = mdnullerror;
428 428 set_t setno;
429 429 md_mn_msg_allochsp_t allochspmsg;
430 430 md_mn_kresult_t *kresult;
431 431 mm_unit_t *new_un;
432 432 int rval;
433 433 int nretries = 0;
434 434
435 435 mnum = MD_SID(un);
436 436 setno = MD_UN2SET(un);
437 437 sm = &un->un_sm[smi];
438 438 smic = &un->un_smic[smi];
439 439 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
440 440 (sm->sm_dev, sm, ci);
441 441
442 442 if (shared->ms_state != CS_ERRED)
443 443 return (0);
444 444
445 445 /* Don't start a new component resync if a resync is already running. */
446 446 if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
447 447 return (0);
448 448
449 449 if (MD_MNSET_SETNO(setno) && (flags & MD_HOTSPARE_XMIT)) {
450 450 uint_t msgflags;
451 451 md_mn_msgtype_t msgtype;
452 452
453 453 /* Send allocate hotspare message to all nodes */
454 454
455 455 allochspmsg.msg_allochsp_mnum = un->c.un_self_id;
456 456 allochspmsg.msg_allochsp_sm = smi;
457 457 allochspmsg.msg_allochsp_comp = ci;
458 458 allochspmsg.msg_allochsp_hs_id = shared->ms_hs_id;
459 459
460 460 /*
461 461 * Before calling mdmn_ksend_message(), release locks
462 462 * Can never be in the context of an ioctl.
463 463 */
464 464 md_unit_writerexit(MDI_UNIT(mnum));
465 465 if (flags & MD_HOTSPARE_LINKHELD)
466 466 rw_exit(&mirror_md_ops.md_link_rw.lock);
467 467 #ifdef DEBUG
468 468 if (mirror_debug_flag)
469 469 printf("send alloc hotspare, flags="
470 470 "0x%x %x, %x, %x, %x\n", flags,
471 471 allochspmsg.msg_allochsp_mnum,
472 472 allochspmsg.msg_allochsp_sm,
473 473 allochspmsg.msg_allochsp_comp,
474 474 allochspmsg.msg_allochsp_hs_id);
475 475 #endif
476 476 if (flags & MD_HOTSPARE_WMUPDATE) {
477 477 msgtype = MD_MN_MSG_ALLOCATE_HOTSPARE2;
478 478 /*
479 479 * When coming from an update of watermarks, there
480 480 * must already be a message logged that triggered
481 481 * this action. So, no need to log this message, too.
482 482 */
483 483 msgflags = MD_MSGF_NO_LOG;
484 484 } else {
485 485 msgtype = MD_MN_MSG_ALLOCATE_HOTSPARE;
486 486 msgflags = MD_MSGF_DEFAULT_FLAGS;
487 487 }
488 488
489 489 kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
490 490
491 491 cc4hs_msg:
492 492 rval = mdmn_ksend_message(setno, msgtype, msgflags, 0,
493 493 (char *)&allochspmsg, sizeof (allochspmsg),
494 494 kresult);
495 495
496 496 if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
497 497 #ifdef DEBUG
498 498 if (mirror_debug_flag)
499 499 mdmn_ksend_show_error(rval, kresult,
500 500 "ALLOCATE HOTSPARE");
501 501 #endif
502 502 /*
503 503 * If message is sent ok but exitval indicates an error
504 504 * it must be because the mirror has been cleared. In
505 505 * this case re-obtain lock and return an error
506 506 */
507 507 if ((rval == 0) && (kresult->kmmr_exitval != 0)) {
508 508 if (flags & MD_HOTSPARE_LINKHELD) {
509 509 rw_enter(&mirror_md_ops.md_link_rw.lock,
510 510 RW_READER);
511 511 }
512 512 kmem_free(kresult, sizeof (md_mn_kresult_t));
513 513 return (1);
514 514 }
515 515 /* If we're shutting down already, pause things here. */
516 516 if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
517 517 while (!md_mn_is_commd_present()) {
518 518 delay(md_hz);
519 519 }
520 520 /*
521 521 * commd has become reachable again, so retry
522 522 * once. If this fails we'll panic as the
523 523 * system is in an unexpected state.
524 524 */
525 525 if (nretries++ == 0)
526 526 goto cc4hs_msg;
527 527 }
528 528 cmn_err(CE_PANIC,
529 529 "ksend_message failure: ALLOCATE_HOTSPARE");
530 530 }
531 531 kmem_free(kresult, sizeof (md_mn_kresult_t));
532 532
533 533 /*
534 534 * re-obtain the locks
535 535 */
536 536 if (flags & MD_HOTSPARE_LINKHELD)
537 537 rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
538 538 new_un = md_unit_writerlock(MDI_UNIT(mnum));
539 539
540 540 /*
541 541 * As we had to release the locks in order to send the
542 542 * message to all nodes, we need to check to see if the
543 543 * unit has changed. If it has we release the writerlock
544 544 * and return fail.
545 545 */
546 546 if ((new_un != un) || (un->c.un_type != MD_METAMIRROR)) {
547 547 md_unit_writerexit(MDI_UNIT(mnum));
548 548 return (1);
549 549 }
550 550 } else {
551 551 if (MD_MNSET_SETNO(setno)) {
552 552 /*
553 553 * If 2 or more nodes simultaneously see a
554 554 * component failure, these nodes will each
555 555 * send an ALLOCATE_HOTSPARE[2] message.
556 556 * The first message will allocate the hotspare
557 557 * and the subsequent messages should do nothing.
558 558 *
559 559 * If a slave node doesn't have a hotspare allocated
560 560 * at the time the message is initiated, then the
561 561 * passed in hs_id will be 0. If the node
562 562 * executing this routine has a component shared
563 563 * ms_hs_id of non-zero, but the message shows a
564 564 * hs_id of 0, then just return since a hotspare
565 565 * has already been allocated for this failing
566 566 * component. When the slave node returns from
567 567 * the ksend_message the hotspare will have
568 568 * already been allocated.
569 569 *
570 570 * If the slave node does send an hs_id of non-zero,
571 571 * and the slave node's hs_id matches this node's
572 572 * ms_hs_id, then the hotspare has error'd and
573 573 * should be replaced.
574 574 *
575 575 * If the slave node sends an hs_id of non-zero and
576 576 * this node has a different shared ms_hs_id, then
577 577 * just return since this hotspare has already
578 578 * been hotspared.
579 579 */
580 580 if (shared->ms_hs_id != 0) {
581 581 if (hs_id == 0) {
582 582 #ifdef DEBUG
583 583 if (mirror_debug_flag) {
584 584 printf("check_comp_4_hotspares"
585 585 "(NOXMIT), short circuit "
586 586 "hs_id=0x%x, "
587 587 "ms_hs_id=0x%x\n",
588 588 hs_id, shared->ms_hs_id);
589 589 }
590 590 #endif
591 591 return (0);
592 592 }
593 593 if (hs_id != shared->ms_hs_id) {
594 594 #ifdef DEBUG
595 595 if (mirror_debug_flag) {
596 596 printf("check_comp_4_hotspares"
597 597 "(NOXMIT), short circuit2 "
598 598 "hs_id=0x%x, "
599 599 "ms_hs_id=0x%x\n",
600 600 hs_id, shared->ms_hs_id);
601 601 }
602 602 #endif
603 603 return (0);
604 604 }
605 605 }
606 606 }
607 607
608 608 sm = &un->un_sm[smi];
609 609 hs_dev = md_get_named_service(sm->sm_dev, 0,
610 610 "hotspare device", 0);
611 611 if ((*hs_dev)(sm->sm_dev, 0, ci, recids, 6, &hs_done,
612 612 &hs_data) != 0)
613 613 return (0);
614 614
615 615 /*
616 616 * set_sm_comp_state() commits the modified records.
617 617 * As we don't transmit the changes, no need to drop the lock.
618 618 */
619 619 set_sm_comp_state(un, smi, ci, CS_RESYNC, recids,
620 620 MD_STATE_NO_XMIT, (IOLOCK *)NULL);
621 621
622 622 (*hs_done)(sm->sm_dev, hs_data);
623 623
624 624 mirror_check_failfast(mnum);
625 625
626 626 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_HOTSPARED, SVM_TAG_METADEVICE,
627 627 setno, MD_SID(un));
628 628
629 629 /*
630 630 * For a multi-node set we need to reset the un_rs_type,
631 631 * un_rs_resync_done and un_rs_resync_2_do fields as the
632 632 * hot-spare resync must copy all applicable data.
633 633 */
634 634 if (MD_MNSET_SETNO(setno)) {
635 635 un->un_rs_type = MD_RS_NONE;
636 636 un->un_rs_resync_done = 0;
637 637 un->un_rs_resync_2_do = 0;
638 638 }
639 639
640 640 /*
641 641 * Must drop writer lock since mirror_resync_unit will
642 642 * open devices and must be able to grab readerlock.
643 643 * Don't need to drop IOLOCK since any descendent routines
644 644 * calling ksend_messages will drop the IOLOCK as needed.
645 645 *
646 646 */
647 647 if (lockp) {
648 648 md_ioctl_writerexit(lockp);
649 649 } else {
650 650 md_unit_writerexit(MDI_UNIT(mnum));
651 651 }
652 652
653 653 /* start resync */
654 654 (void) mirror_resync_unit(mnum, NULL, &mde, lockp);
655 655
656 656 if (lockp) {
657 657 new_un = md_ioctl_writerlock(lockp, MDI_UNIT(mnum));
658 658 } else {
659 659 new_un = md_unit_writerlock(MDI_UNIT(mnum));
660 660 }
661 661 }
662 662 return (0);
663 663 }
664 664
665 665 /*
666 666 * check_unit_4_hotspares
667 667 *
668 668 * For a given mirror, allocate hotspares, if available for any components
669 669 * that are in error
670 670 *
671 671 * Returns 0 if ok
672 672 * 1 if check_comp_4_hotspares returns non-zero. This will only
673 673 * happen for a MN unit where the unit has been cleared while
674 674 * the allocate hotspare message is sent to all nodes.
675 675 */
676 676 static int
677 677 check_unit_4_hotspares(mm_unit_t *un, int flags)
678 678 {
679 679 mm_submirror_t *sm;
680 680 mm_submirror_ic_t *smic;
681 681 int ci;
682 682 int i;
683 683 int compcnt;
684 684
685 685 if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
686 686 return (0);
687 687
688 688 for (i = 0; i < NMIRROR; i++) {
689 689 sm = &un->un_sm[i];
690 690 smic = &un->un_smic[i];
691 691 if (!SMS_IS(sm, SMS_INUSE))
692 692 continue;
693 693 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, sm);
694 694 for (ci = 0; ci < compcnt; ci++) {
695 695 md_m_shared_t *shared;
696 696
697 697 shared = (md_m_shared_t *)
698 698 (*(smic->sm_shared_by_indx))(sm->sm_dev, sm, ci);
699 699 /*
700 700 * Never called from ioctl context, so pass in
701 701 * (IOLOCK *)NULL. Pass through flags from calling
702 702 * routine, also setting XMIT flag.
703 703 */
704 704 if (check_comp_4_hotspares(un, i, ci,
705 705 (MD_HOTSPARE_XMIT | flags),
706 706 shared->ms_hs_id, (IOLOCK *)NULL) != 0)
707 707 return (1);
708 708 }
709 709 }
710 710 return (0);
711 711 }
712 712
713 713 static void
714 714 check_4_hotspares(daemon_request_t *drq)
715 715 {
716 716 mdi_unit_t *ui;
717 717 mm_unit_t *un;
718 718 md_link_t *next;
719 719 int x;
720 720
721 721 mutex_enter(&drq->dr_mx); /* clear up front so can poke */
722 722 drq->dr_pending = 0; /* again in low level routine if */
723 723 mutex_exit(&drq->dr_mx); /* something found to do */
724 724
725 725 /*
726 726 * Used to have a problem here. The disksets weren't marked as being
727 727 * MNHOLD. This opened a window where we could be searching for
728 728 * hotspares and have the disk set unloaded (released) from under
729 729 * us causing a panic in stripe_component_count().
730 730 * The way to prevent that is to mark the set MNHOLD which prevents
731 731 * any diskset from being released while we are scanning the mirrors,
732 732 * submirrors and components.
733 733 */
734 734
735 735 for (x = 0; x < md_nsets; x++)
736 736 md_holdset_enter(x);
737 737
738 738 rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
739 739 for (next = mirror_md_ops.md_head; next != NULL; next = next->ln_next) {
740 740 ui = MDI_UNIT(next->ln_id);
741 741
742 742 un = (mm_unit_t *)md_unit_readerlock(ui);
743 743
744 744 /*
745 745 * Only check the unit if we are the master for this set
746 746 * For an MN set, poke_hotspares() is only effective on the
747 747 * master
748 748 */
749 749 if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
750 750 md_set[MD_UN2SET(un)].s_am_i_master == 0) {
751 751 md_unit_readerexit(ui);
752 752 continue;
753 753 }
754 754 if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) {
755 755 md_unit_readerexit(ui);
756 756 continue;
757 757 }
758 758 md_unit_readerexit(ui);
759 759
760 760 un = (mm_unit_t *)md_unit_writerlock(ui);
761 761 /*
762 762 * check_unit_4_hotspares will exit 1 if the unit has been
763 763 * removed during the process of allocating the hotspare.
764 764 * This can only happen for a MN metadevice. If unit no longer
765 765 * exists, no need to release writerlock
766 766 */
767 767 if (check_unit_4_hotspares(un, MD_HOTSPARE_LINKHELD) == 0)
768 768 md_unit_writerexit(ui);
769 769 else {
770 770 /*
771 771 * If check_unit_4_hotspares failed, queue another
772 772 * request and break out of this one
773 773 */
774 774 (void) poke_hotspares();
775 775 break;
776 776 }
777 777 }
778 778 rw_exit(&mirror_md_ops.md_link_rw.lock);
779 779
780 780 for (x = 0; x < md_nsets; x++)
781 781 md_holdset_exit(x);
782 782 }
783 783
784 784 /*
785 785 * poke_hotspares
786 786 *
787 787  * If there is no poke_hotspares request pending, queue a request
788 788 * to call check_4_hotspares(). This will scan all mirrors and attempt to
789 789 * allocate hotspares for all components in error.
790 790 */
791 791 int
792 792 poke_hotspares()
793 793 {
794 794 mutex_enter(&hotspare_request.dr_mx);
795 795 if (hotspare_request.dr_pending == 0) {
796 796 hotspare_request.dr_pending = 1;
797 797 daemon_request(&md_mhs_daemon,
798 798 check_4_hotspares, (daemon_queue_t *)&hotspare_request,
799 799 REQ_OLD);
800 800 }
801 801 mutex_exit(&hotspare_request.dr_mx);
802 802 return (0);
803 803 }
804 804
805 805 static void
806 806 free_all_ecomps(err_comp_t *ecomp)
807 807 {
808 808 err_comp_t *d;
809 809
810 810 while (ecomp != NULL) {
811 811 d = ecomp;
812 812 ecomp = ecomp->ec_next;
813 813 kmem_free(d, sizeof (err_comp_t));
814 814 }
815 815 }
816 816
817 817 /*
818 818 * NAME: mirror_openfail_console_info
819 819 *
820 820  * DESCRIPTION: Prints an informative message to the console when the mirror
821 821 * cannot be opened.
822 822 *
823 823 * PARAMETERS: mm_unit_t un - pointer to mirror unit structure
824 824 * int smi - submirror index
825 825 * int ci - component index
826 826 */
827 827
828 828 void
829 829 mirror_openfail_console_info(mm_unit_t *un, int smi, int ci)
830 830 {
831 831 void (*get_dev)();
832 832 ms_cd_info_t cd;
833 833 md_dev64_t tmpdev;
834 834
835 835 tmpdev = un->un_sm[smi].sm_dev;
836 836 get_dev = (void (*)())md_get_named_service(tmpdev, 0, "get device", 0);
837 837 if (get_dev != NULL) {
838 838 (void) (*get_dev)(tmpdev, smi, ci, &cd);
839 839 cmn_err(CE_WARN, "md %s: open error on %s",
840 840 md_shortname(MD_SID(un)), md_devname(MD_UN2SET(un),
841 841 cd.cd_dev, NULL, 0));
842 842 } else {
843 843 cmn_err(CE_WARN, "md %s: open error",
844 844 md_shortname(MD_SID(un)));
845 845 }
846 846 }
847 847
848 848 static int
849 849 mirror_close_all_devs(mm_unit_t *un, int md_cflags)
850 850 {
851 851 int i;
852 852 md_dev64_t dev;
853 853
854 854 for (i = 0; i < NMIRROR; i++) {
855 855 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
856 856 continue;
857 857 dev = un->un_sm[i].sm_dev;
858 858 md_layered_close(dev, md_cflags);
859 859 }
860 860 return (0);
861 861 }
862 862
863 863 /*
864 864 * Keep track of drivers that don't support failfast. We use this so that
865 865 * we only log one diagnostic message for each of these drivers, no matter
866 866 * how many times we run the mirror_check_failfast function.
867 867 * Return 1 if this is a new driver that does not support failfast,
868 868 * return 0 if we have already seen this non-failfast driver.
869 869 */
870 870 static int
871 871 new_non_ff_driver(const char *s)
872 872 {
873 873 mutex_enter(&non_ff_drv_mutex);
874 874 if (non_ff_drivers == NULL) {
875 875 non_ff_drivers = (char **)kmem_alloc(2 * sizeof (char *),
876 876 KM_NOSLEEP);
877 877 if (non_ff_drivers == NULL) {
878 878 mutex_exit(&non_ff_drv_mutex);
879 879 return (1);
880 880 }
881 881
882 882 non_ff_drivers[0] = (char *)kmem_alloc(strlen(s) + 1,
883 883 KM_NOSLEEP);
884 884 if (non_ff_drivers[0] == NULL) {
885 885 kmem_free(non_ff_drivers, 2 * sizeof (char *));
886 886 non_ff_drivers = NULL;
887 887 mutex_exit(&non_ff_drv_mutex);
888 888 return (1);
889 889 }
890 890
891 891 (void) strcpy(non_ff_drivers[0], s);
892 892 non_ff_drivers[1] = NULL;
893 893
894 894 } else {
895 895 int i;
896 896 char **tnames;
897 897 char **tmp;
898 898
899 899 for (i = 0; non_ff_drivers[i] != NULL; i++) {
900 900 if (strcmp(s, non_ff_drivers[i]) == 0) {
901 901 mutex_exit(&non_ff_drv_mutex);
902 902 return (0);
903 903 }
904 904 }
905 905
906 906 /* allow for new element and null */
907 907 i += 2;
908 908 tnames = (char **)kmem_alloc(i * sizeof (char *), KM_NOSLEEP);
909 909 if (tnames == NULL) {
910 910 mutex_exit(&non_ff_drv_mutex);
911 911 return (1);
912 912 }
913 913
914 914 for (i = 0; non_ff_drivers[i] != NULL; i++)
915 915 tnames[i] = non_ff_drivers[i];
916 916
917 917 tnames[i] = (char *)kmem_alloc(strlen(s) + 1, KM_NOSLEEP);
918 918 if (tnames[i] == NULL) {
919 919 /* adjust i so that it is the right count to free */
920 920 kmem_free(tnames, (i + 2) * sizeof (char *));
921 921 mutex_exit(&non_ff_drv_mutex);
922 922 return (1);
923 923 }
924 924
925 925 (void) strcpy(tnames[i++], s);
926 926 tnames[i] = NULL;
927 927
928 928 tmp = non_ff_drivers;
929 929 non_ff_drivers = tnames;
930 930 /* i now represents the count we previously alloced */
931 931 kmem_free(tmp, i * sizeof (char *));
932 932 }
933 933 mutex_exit(&non_ff_drv_mutex);
934 934
935 935 return (1);
936 936 }
937 937
938 938 /*
939 939 * Check for the "ddi-failfast-supported" devtree property on each submirror
940 940 * component to indicate if we should do I/O to that submirror with the
941 941 * B_FAILFAST flag set or not. This check is made at various state transitions
942 942 * in the mirror code (e.g. open, enable, hotspare, etc.). Sometimes we
943 943 * only need to check one drive (e.g. hotspare) but since the check is
944 944 * fast and infrequent and sometimes needs to be done on all components we
945 945 * just check all components on each call.
946 946 */
947 947 void
948 948 mirror_check_failfast(minor_t mnum)
949 949 {
950 950 int i;
951 951 mm_unit_t *un;
952 952
953 953 if (md_ff_disable)
954 954 return;
955 955
956 956 un = MD_UNIT(mnum);
957 957
958 958 for (i = 0; i < NMIRROR; i++) {
959 959 int ci;
960 960 int cnt;
961 961 int ff = 1;
962 962 mm_submirror_t *sm;
963 963 mm_submirror_ic_t *smic;
964 964 void (*get_dev)();
965 965
966 966 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
967 967 continue;
968 968
969 969 sm = &un->un_sm[i];
970 970 smic = &un->un_smic[i];
971 971
972 972 get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
973 973 "get device", 0);
974 974
975 975 cnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
976 976 for (ci = 0; ci < cnt; ci++) {
977 977 int found = 0;
978 978 dev_t ci_dev;
979 979 major_t major;
980 980 dev_info_t *devi;
981 981 ms_cd_info_t cd;
982 982
983 983 /*
984 984 * this already returns the hs
985 985 * dev if the device is spared
986 986 */
987 987 (void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
988 988
989 989 ci_dev = md_dev64_to_dev(cd.cd_dev);
990 990 major = getmajor(ci_dev);
991 991
992 992 if (major == md_major) {
993 993 /*
994 994 * this component must be a soft
995 995 * partition; get the real dev
996 996 */
997 997 minor_t dev_mnum;
998 998 mdi_unit_t *ui;
999 999 mp_unit_t *un;
1000 1000 set_t setno;
1001 1001 side_t side;
1002 1002 md_dev64_t tmpdev;
1003 1003
1004 1004 ui = MDI_UNIT(getminor(ci_dev));
1005 1005
1006 1006 /* grab necessary lock */
1007 1007 un = (mp_unit_t *)md_unit_readerlock(ui);
1008 1008
1009 1009 dev_mnum = MD_SID(un);
1010 1010 setno = MD_MIN2SET(dev_mnum);
1011 1011 side = mddb_getsidenum(setno);
1012 1012
1013 1013 tmpdev = un->un_dev;
1014 1014
1015 1015 /* Get dev by device id */
1016 1016 if (md_devid_found(setno, side,
1017 1017 un->un_key) == 1) {
1018 1018 tmpdev = md_resolve_bydevid(dev_mnum,
1019 1019 tmpdev, un->un_key);
1020 1020 }
1021 1021
1022 1022 md_unit_readerexit(ui);
1023 1023
1024 1024 ci_dev = md_dev64_to_dev(tmpdev);
1025 1025 major = getmajor(ci_dev);
1026 1026 }
1027 1027
1028 1028 if (ci_dev != NODEV32 &&
1029 1029 (devi = e_ddi_hold_devi_by_dev(ci_dev, 0))
1030 1030 != NULL) {
1031 1031 ddi_prop_op_t prop_op = PROP_LEN_AND_VAL_BUF;
1032 1032 int propvalue = 0;
1033 1033 int proplength = sizeof (int);
1034 1034 int error;
1035 1035 struct cb_ops *cb;
1036 1036
1037 1037 if ((cb = devopsp[major]->devo_cb_ops) !=
1038 1038 NULL) {
1039 1039 error = (*cb->cb_prop_op)
1040 1040 (DDI_DEV_T_ANY, devi, prop_op,
1041 1041 DDI_PROP_NOTPROM|DDI_PROP_DONTPASS,
1042 1042 "ddi-failfast-supported",
1043 1043 (caddr_t)&propvalue, &proplength);
1044 1044
1045 1045 if (error == DDI_PROP_SUCCESS)
1046 1046 found = 1;
1047 1047 }
1048 1048
1049 1049 if (!found && new_non_ff_driver(
1050 1050 ddi_driver_name(devi))) {
1051 1051 cmn_err(CE_NOTE, "!md: B_FAILFAST I/O"
1052 1052 "disabled on %s",
1053 1053 ddi_driver_name(devi));
1054 1054 }
1055 1055
1056 1056 ddi_release_devi(devi);
1057 1057 }
1058 1058
1059 1059 /*
1060 1060 * All components must support
1061 1061 * failfast in the submirror.
1062 1062 */
1063 1063 if (!found) {
1064 1064 ff = 0;
1065 1065 break;
1066 1066 }
1067 1067 }
1068 1068
1069 1069 if (ff) {
1070 1070 sm->sm_flags |= MD_SM_FAILFAST;
1071 1071 } else {
1072 1072 sm->sm_flags &= ~MD_SM_FAILFAST;
1073 1073 }
1074 1074 }
1075 1075 }
1076 1076
1077 1077 /*
1078 1078 * Return true if the submirror is unavailable.
1079 1079 * If any of the submirror components are opened then the submirror cannot
1080 1080 * be unavailable (MD_INACCESSIBLE).
1081 1081 * If any of the components are already in the errored state, then the submirror
1082 1082 * cannot be unavailable (MD_INACCESSIBLE).
1083 1083 */
1084 1084 static bool_t
1085 1085 submirror_unavailable(mm_unit_t *un, int smi, int from_probe)
1086 1086 {
1087 1087 mm_submirror_t *sm;
1088 1088 mm_submirror_ic_t *smic;
1089 1089 md_m_shared_t *shared;
1090 1090 int ci;
1091 1091 int compcnt;
1092 1092
1093 1093 sm = &un->un_sm[smi];
1094 1094 smic = &un->un_smic[smi];
1095 1095
1096 1096 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
1097 1097 for (ci = 0; ci < compcnt; ci++) {
1098 1098 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
1099 1099 (sm->sm_dev, sm, ci);
1100 1100 if (from_probe) {
1101 1101 if (shared->ms_flags & MDM_S_PROBEOPEN)
1102 1102 return (B_FALSE);
1103 1103 } else {
1104 1104 if (shared->ms_flags & MDM_S_ISOPEN)
1105 1105 return (B_FALSE);
1106 1106 }
1107 1107 if (shared->ms_state == CS_ERRED ||
1108 1108 shared->ms_state == CS_LAST_ERRED)
1109 1109 return (B_FALSE);
1110 1110 }
1111 1111
1112 1112 return (B_TRUE);
1113 1113 }
1114 1114
1115 1115 static int
1116 1116 mirror_open_all_devs(minor_t mnum, int md_oflags, IOLOCK *lockp)
1117 1117 {
1118 1118 int i;
1119 1119 mm_unit_t *un;
1120 1120 mdi_unit_t *ui;
1121 1121 int err;
1122 1122 int smi;
1123 1123 int ci;
1124 1124 err_comp_t *c;
1125 1125 err_comp_t *ecomps = NULL;
1126 1126 int smmask = 0;
1127 1127 set_t setno;
1128 1128 int sm_cnt;
1129 1129 int sm_unavail_cnt;
1130 1130
1131 1131 mirror_check_failfast(mnum);
1132 1132
1133 1133 un = MD_UNIT(mnum);
1134 1134 ui = MDI_UNIT(mnum);
1135 1135 setno = MD_UN2SET(un);
1136 1136
1137 1137 for (i = 0; i < NMIRROR; i++) {
1138 1138 md_dev64_t tmpdev = un->un_sm[i].sm_dev;
1139 1139
1140 1140 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
1141 1141 continue;
1142 1142 if (md_layered_open(mnum, &tmpdev, md_oflags))
1143 1143 smmask |= SMI2BIT(i);
1144 1144 un->un_sm[i].sm_dev = tmpdev;
1145 1145 }
1146 1146
1147 1147 /*
1148 1148 * If smmask is clear, all submirrors are accessible. Clear the
1149 1149 * MD_INACCESSIBLE bit in this case. This bit is also cleared for the
1150 1150 * mirror device. If smmask is set, we have to determine which of the
1151 1151 * submirrors are in error. If no submirror is accessible we mark the
1152 1152 * whole mirror as MD_INACCESSIBLE.
1153 1153 */
1154 1154 if (smmask == 0) {
1155 1155 if (lockp) {
1156 1156 md_ioctl_readerexit(lockp);
1157 1157 (void) md_ioctl_writerlock(lockp, ui);
1158 1158 } else {
1159 1159 md_unit_readerexit(ui);
1160 1160 (void) md_unit_writerlock(ui);
1161 1161 }
1162 1162 ui->ui_tstate &= ~MD_INACCESSIBLE;
1163 1163 if (lockp) {
1164 1164 md_ioctl_writerexit(lockp);
1165 1165 (void) md_ioctl_readerlock(lockp, ui);
1166 1166 } else {
1167 1167 md_unit_writerexit(ui);
1168 1168 (void) md_unit_readerlock(ui);
1169 1169 }
1170 1170
1171 1171 for (i = 0; i < NMIRROR; i++) {
1172 1172 md_dev64_t tmpdev;
1173 1173 mdi_unit_t *sm_ui;
1174 1174
1175 1175 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
1176 1176 continue;
1177 1177
1178 1178 tmpdev = un->un_sm[i].sm_dev;
1179 1179 sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
1180 1180 (void) md_unit_writerlock(sm_ui);
1181 1181 sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
1182 1182 md_unit_writerexit(sm_ui);
1183 1183 }
1184 1184
1185 1185 return (0);
1186 1186 }
1187 1187
1188 1188 for (i = 0; i < NMIRROR; i++) {
1189 1189 md_dev64_t tmpdev;
1190 1190
1191 1191 if (!(smmask & SMI2BIT(i)))
1192 1192 continue;
1193 1193
1194 1194 tmpdev = un->un_sm[i].sm_dev;
1195 1195 err = md_layered_open(mnum, &tmpdev, MD_OFLG_CONT_ERRS);
1196 1196 un->un_sm[i].sm_dev = tmpdev;
1197 1197 ASSERT(err == 0);
1198 1198 }
1199 1199
1200 1200 if (lockp) {
1201 1201 md_ioctl_readerexit(lockp);
1202 1202 un = (mm_unit_t *)md_ioctl_writerlock(lockp, ui);
1203 1203 } else {
1204 1204 md_unit_readerexit(ui);
1205 1205 un = (mm_unit_t *)md_unit_writerlock(ui);
1206 1206 }
1207 1207
1208 1208 /*
1209 1209 * We want to make sure the unavailable flag is not masking a real
1210 1210 * error on the submirror.
1211 1211 * For each submirror,
1212 1212 * if all of the submirror components couldn't be opened and there
1213 1213 * are no errors on the submirror, then set the unavailable flag
1214 1214 * otherwise, clear unavailable.
1215 1215 */
1216 1216 sm_cnt = 0;
1217 1217 sm_unavail_cnt = 0;
1218 1218 for (i = 0; i < NMIRROR; i++) {
1219 1219 md_dev64_t tmpdev;
1220 1220 mdi_unit_t *sm_ui;
1221 1221
1222 1222 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
1223 1223 continue;
1224 1224
1225 1225 sm_cnt++;
1226 1226 tmpdev = un->un_sm[i].sm_dev;
1227 1227 sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
1228 1228
1229 1229 (void) md_unit_writerlock(sm_ui);
1230 1230 if (submirror_unavailable(un, i, 0)) {
1231 1231 sm_ui->ui_tstate |= MD_INACCESSIBLE;
1232 1232 sm_unavail_cnt++;
1233 1233 } else {
1234 1234 sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
1235 1235 }
1236 1236 md_unit_writerexit(sm_ui);
1237 1237 }
1238 1238
1239 1239 /*
1240 1240 * If all of the submirrors are unavailable, the mirror is also
1241 1241 * unavailable.
1242 1242 */
1243 1243 if (sm_cnt == sm_unavail_cnt) {
1244 1244 ui->ui_tstate |= MD_INACCESSIBLE;
1245 1245 } else {
1246 1246 ui->ui_tstate &= ~MD_INACCESSIBLE;
1247 1247 }
1248 1248
1249 1249 smi = 0;
1250 1250 ci = 0;
1251 1251 while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) {
1252 1252 if (mirror_other_sources(un, smi, ci, 1) == 1) {
1253 1253
1254 1254 free_all_ecomps(ecomps);
1255 1255 (void) mirror_close_all_devs(un, md_oflags);
1256 1256 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL,
1257 1257 SVM_TAG_METADEVICE, setno, MD_SID(un));
1258 1258 mirror_openfail_console_info(un, smi, ci);
1259 1259 if (lockp) {
1260 1260 md_ioctl_writerexit(lockp);
1261 1261 (void) md_ioctl_readerlock(lockp, ui);
1262 1262 } else {
1263 1263 md_unit_writerexit(ui);
1264 1264 (void) md_unit_readerlock(ui);
1265 1265 }
1266 1266 return (ENXIO);
1267 1267 }
1268 1268
1269 1269 /* track all component states that need changing */
1270 1270 c = (err_comp_t *)kmem_alloc(sizeof (err_comp_t), KM_SLEEP);
1271 1271 c->ec_next = ecomps;
1272 1272 c->ec_smi = smi;
1273 1273 c->ec_ci = ci;
1274 1274 ecomps = c;
1275 1275 ci++;
1276 1276 }
1277 1277
1278 1278 /* Make all state changes and commit them */
1279 1279 for (c = ecomps; c != NULL; c = c->ec_next) {
1280 1280 /*
1281 1281 * If lockp is set, then entering kernel through ioctl.
1282 1282 * For a MN set, the only ioctl path is via a commd message
1283 1283 * (ALLOCATE_HOTSPARE or *RESYNC* messages) that is already
1284 1284 * being sent to each node.
1285 1285 * In this case, set NO_XMIT so that set_sm_comp_state
1286 1286 * won't attempt to send a message on a message.
1287 1287 *
1288 1288 * In !MN sets, the xmit flag is ignored, so it doesn't matter
1289 1289 * which flag is passed.
1290 1290 */
1291 1291 if (lockp) {
1292 1292 set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0,
1293 1293 MD_STATE_NO_XMIT, lockp);
1294 1294 } else {
1295 1295 set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0,
1296 1296 (MD_STATE_XMIT | MD_STATE_OCHELD), lockp);
1297 1297 }
1298 1298 /*
1299 1299 * For a MN set, the NOTIFY is done when the state change is
1300 1300 * processed on each node
1301 1301 */
1302 1302 if (!MD_MNSET_SETNO(setno)) {
1303 1303 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
1304 1304 SVM_TAG_METADEVICE, setno, MD_SID(un));
1305 1305 }
1306 1306 }
1307 1307
1308 1308 if (lockp) {
1309 1309 md_ioctl_writerexit(lockp);
1310 1310 (void) md_ioctl_readerlock(lockp, ui);
1311 1311 } else {
1312 1312 md_unit_writerexit(ui);
1313 1313 (void) md_unit_readerlock(ui);
1314 1314 }
1315 1315
1316 1316 free_all_ecomps(ecomps);
1317 1317
1318 1318 /* allocate hotspares for all errored components */
1319 1319 if (MD_MNSET_SETNO(setno)) {
1320 1320 /*
1321 1321 * If we're called from an ioctl (lockp set) then we cannot
1322 1322 * directly call send_poke_hotspares as this will block until
1323 1323 * the message gets despatched to all nodes. If the cluster is
1324 1324 * going through a reconfig cycle then the message will block
1325 1325 * until the cycle is complete, and as we originate from a
1326 1326 * service call from commd we will livelock.
1327 1327 */
1328 1328 if (lockp == NULL) {
1329 1329 md_unit_readerexit(ui);
1330 1330 send_poke_hotspares(setno);
1331 1331 (void) md_unit_readerlock(ui);
1332 1332 }
1333 1333 } else {
1334 1334 (void) poke_hotspares();
1335 1335 }
1336 1336 return (0);
1337 1337 }
1338 1338
1339 1339 void
1340 1340 mirror_overlap_tree_remove(md_mps_t *ps)
1341 1341 {
1342 1342 mm_unit_t *un;
1343 1343
1344 1344 if (panicstr)
1345 1345 return;
1346 1346
1347 1347 VERIFY(ps->ps_flags & MD_MPS_ON_OVERLAP);
1348 1348 un = ps->ps_un;
1349 1349
1350 1350 mutex_enter(&un->un_overlap_tree_mx);
1351 1351 avl_remove(&un->un_overlap_root, ps);
1352 1352 ps->ps_flags &= ~MD_MPS_ON_OVERLAP;
1353 1353 if (un->un_overlap_tree_flag != 0) {
1354 1354 un->un_overlap_tree_flag = 0;
1355 1355 cv_broadcast(&un->un_overlap_tree_cv);
1356 1356 }
1357 1357 mutex_exit(&un->un_overlap_tree_mx);
1358 1358 }
1359 1359
1360 1360
1361 1361 /*
1362 1362 * wait_for_overlaps:
1363 1363 * -----------------
1364 1364 * Check that given i/o request does not cause an overlap with already pending
1365 1365 * i/o. If it does, block until the overlapped i/o completes.
1366 1366 *
1367 1367 * The flag argument has MD_OVERLAP_ALLOW_REPEAT set if it is ok for the parent
1368 1368 * structure to be already in the overlap tree and MD_OVERLAP_NO_REPEAT if
1369 1369 * it must not already be in the tree.
1370 1370 */
1371 1371 static void
1372 1372 wait_for_overlaps(md_mps_t *ps, int flags)
1373 1373 {
1374 1374 mm_unit_t *un;
1375 1375 avl_index_t where;
1376 1376 md_mps_t *ps1;
1377 1377
1378 1378 if (panicstr)
1379 1379 return;
1380 1380
1381 1381 un = ps->ps_un;
1382 1382 mutex_enter(&un->un_overlap_tree_mx);
1383 1383 if ((flags & MD_OVERLAP_ALLOW_REPEAT) &&
1384 1384 (ps->ps_flags & MD_MPS_ON_OVERLAP)) {
1385 1385 mutex_exit(&un->un_overlap_tree_mx);
1386 1386 return;
1387 1387 }
1388 1388
1389 1389 VERIFY(!(ps->ps_flags & MD_MPS_ON_OVERLAP));
1390 1390
1391 1391 do {
1392 1392 ps1 = avl_find(&un->un_overlap_root, ps, &where);
1393 1393 if (ps1 == NULL) {
1394 1394 /*
1395 1395 * The candidate range does not overlap with any
1396 1396 * range in the tree. Insert it and be done.
1397 1397 */
1398 1398 avl_insert(&un->un_overlap_root, ps, where);
1399 1399 ps->ps_flags |= MD_MPS_ON_OVERLAP;
1400 1400 } else {
1401 1401 /*
1402 1402 * The candidate range would overlap. Set the flag
1403 1403 * indicating we need to be woken up, and sleep
1404 1404 * until another thread removes a range. If upon
1405 1405 * waking up we find this mps was put on the tree
1406 1406 * by another thread, the loop terminates.
1407 1407 */
1408 1408 un->un_overlap_tree_flag = 1;
1409 1409 cv_wait(&un->un_overlap_tree_cv,
1410 1410 &un->un_overlap_tree_mx);
1411 1411 }
1412 1412 } while (!(ps->ps_flags & MD_MPS_ON_OVERLAP));
1413 1413 mutex_exit(&un->un_overlap_tree_mx);
1414 1414 }
1415 1415
1416 1416 /*
1417 1417 * This function is called from mirror_done to check whether any pages have
1418 1418 * been modified while a mirrored write was in progress. Returns 0 if
1419 1419 * all pages associated with bp are clean, 1 otherwise.
1420 1420 */
1421 1421 static int
1422 1422 any_pages_dirty(struct buf *bp)
1423 1423 {
1424 1424 int rval;
1425 1425
1426 1426 rval = biomodified(bp);
1427 1427 if (rval == -1)
1428 1428 rval = 0;
1429 1429
1430 1430 return (rval);
1431 1431 }
1432 1432
1433 1433 #define MAX_EXTRAS 10
1434 1434
1435 1435 void
1436 1436 mirror_commit(
1437 1437 mm_unit_t *un,
1438 1438 int smmask,
1439 1439 mddb_recid_t *extras
1440 1440 )
1441 1441 {
1442 1442 mm_submirror_t *sm;
1443 1443 md_unit_t *su;
1444 1444 int i;
1445 1445
1446 1446 /* 2=mirror,null id */
1447 1447 mddb_recid_t recids[NMIRROR+2+MAX_EXTRAS];
1448 1448
1449 1449 int ri = 0;
1450 1450
1451 1451 if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)
1452 1452 return;
1453 1453
1454 1454 /* Add two, this includes the mirror unit and the null recid */
1455 1455 if (extras != NULL) {
1456 1456 int nrecids = 0;
1457 1457 while (extras[nrecids] != 0) {
1458 1458 nrecids++;
1459 1459 }
1460 1460 ASSERT(nrecids <= MAX_EXTRAS);
1461 1461 }
1462 1462
1463 1463 if (un != NULL)
1464 1464 recids[ri++] = un->c.un_record_id;
1465 1465 for (i = 0; i < NMIRROR; i++) {
1466 1466 if (!(smmask & SMI2BIT(i)))
1467 1467 continue;
1468 1468 sm = &un->un_sm[i];
1469 1469 if (!SMS_IS(sm, SMS_INUSE))
1470 1470 continue;
1471 1471 if (md_getmajor(sm->sm_dev) != md_major)
1472 1472 continue;
1473 1473 su = MD_UNIT(md_getminor(sm->sm_dev));
1474 1474 recids[ri++] = su->c.un_record_id;
1475 1475 }
1476 1476
1477 1477 if (extras != NULL)
1478 1478 while (*extras != 0) {
1479 1479 recids[ri++] = *extras;
1480 1480 extras++;
1481 1481 }
1482 1482
1483 1483 if (ri == 0)
1484 1484 return;
1485 1485 recids[ri] = 0;
1486 1486
1487 1487 /*
1488 1488 * Ok to hold ioctl lock across record commit to mddb as
1489 1489 * long as the record(s) being committed aren't resync records.
1490 1490 */
1491 1491 mddb_commitrecs_wrapper(recids);
1492 1492 }
1493 1493
1494 1494
1495 1495 /*
1496 1496 * This routine is used to set a bit in the writable_bm bitmap
1497 1497 * which represents each submirror in a metamirror which
1498 1498 * is writable. The first writable submirror index is assigned
1499 1499 * to the sm_index. The number of writable submirrors are returned in nunits.
1500 1500 *
1501 1501 * This routine returns the submirror's unit number.
1502 1502 */
1503 1503
1504 1504 static void
1505 1505 select_write_units(struct mm_unit *un, md_mps_t *ps)
1506 1506 {
1507 1507
1508 1508 int i;
1509 1509 unsigned writable_bm = 0;
1510 1510 unsigned nunits = 0;
1511 1511
1512 1512 for (i = 0; i < NMIRROR; i++) {
1513 1513 if (SUBMIRROR_IS_WRITEABLE(un, i)) {
1514 1514 /* set bit of all writable units */
1515 1515 writable_bm |= SMI2BIT(i);
1516 1516 nunits++;
1517 1517 }
1518 1518 }
1519 1519 ps->ps_writable_sm = writable_bm;
1520 1520 ps->ps_active_cnt = nunits;
1521 1521 ps->ps_current_sm = 0;
1522 1522 }
1523 1523
1524 1524 static
1525 1525 unsigned
1526 1526 select_write_after_read_units(struct mm_unit *un, md_mps_t *ps)
1527 1527 {
1528 1528
1529 1529 int i;
1530 1530 unsigned writable_bm = 0;
1531 1531 unsigned nunits = 0;
1532 1532
1533 1533 for (i = 0; i < NMIRROR; i++) {
1534 1534 if (SUBMIRROR_IS_WRITEABLE(un, i) &&
1535 1535 un->un_sm[i].sm_flags & MD_SM_RESYNC_TARGET) {
1536 1536 writable_bm |= SMI2BIT(i);
1537 1537 nunits++;
1538 1538 }
1539 1539 }
1540 1540 if ((writable_bm & ps->ps_allfrom_sm) != 0) {
1541 1541 writable_bm &= ~ps->ps_allfrom_sm;
1542 1542 nunits--;
1543 1543 }
1544 1544 ps->ps_writable_sm = writable_bm;
1545 1545 ps->ps_active_cnt = nunits;
1546 1546 ps->ps_current_sm = 0;
1547 1547 return (nunits);
1548 1548 }
1549 1549
1550 1550 static md_dev64_t
1551 1551 select_read_unit(
1552 1552 mm_unit_t *un,
1553 1553 diskaddr_t blkno,
1554 1554 u_longlong_t reqcount,
1555 1555 u_longlong_t *cando,
1556 1556 int must_be_opened,
1557 1557 md_m_shared_t **shared,
1558 1558 md_mcs_t *cs)
1559 1559 {
1560 1560 int i;
1561 1561 md_m_shared_t *s;
1562 1562 uint_t lasterrcnt = 0;
1563 1563 md_dev64_t dev = 0;
1564 1564 u_longlong_t cnt;
1565 1565 u_longlong_t mincnt;
1566 1566 mm_submirror_t *sm;
1567 1567 mm_submirror_ic_t *smic;
1568 1568 mdi_unit_t *ui;
1569 1569
1570 1570 mincnt = reqcount;
1571 1571 for (i = 0; i < NMIRROR; i++) {
1572 1572 if (!SUBMIRROR_IS_READABLE(un, i))
1573 1573 continue;
1574 1574 sm = &un->un_sm[i];
1575 1575 smic = &un->un_smic[i];
1576 1576 cnt = reqcount;
1577 1577
1578 1578 /*
1579 1579 * If the current submirror is marked as inaccessible, do not
1580 1580 * try to access it.
1581 1581 */
1582 1582 ui = MDI_UNIT(getminor(expldev(sm->sm_dev)));
1583 1583 (void) md_unit_readerlock(ui);
1584 1584 if (ui->ui_tstate & MD_INACCESSIBLE) {
1585 1585 md_unit_readerexit(ui);
1586 1586 continue;
1587 1587 }
1588 1588 md_unit_readerexit(ui);
1589 1589
1590 1590 s = (md_m_shared_t *)(*(smic->sm_shared_by_blk))
1591 1591 (sm->sm_dev, sm, blkno, &cnt);
1592 1592
1593 1593 if (must_be_opened && !(s->ms_flags & MDM_S_ISOPEN))
1594 1594 continue;
1595 1595 if (s->ms_state == CS_OKAY) {
1596 1596 *cando = cnt;
1597 1597 if (shared != NULL)
1598 1598 *shared = s;
1599 1599
1600 1600 if (un->un_sm[i].sm_flags & MD_SM_FAILFAST &&
1601 1601 cs != NULL) {
1602 1602 cs->cs_buf.b_flags |= B_FAILFAST;
1603 1603 }
1604 1604
1605 1605 return (un->un_sm[i].sm_dev);
1606 1606 }
1607 1607 if (s->ms_state != CS_LAST_ERRED)
1608 1608 continue;
1609 1609
1610 1610 /* don't use B_FAILFAST since we're Last Erred */
1611 1611
1612 1612 if (mincnt > cnt)
1613 1613 mincnt = cnt;
1614 1614 if (s->ms_lasterrcnt > lasterrcnt) {
1615 1615 lasterrcnt = s->ms_lasterrcnt;
1616 1616 if (shared != NULL)
1617 1617 *shared = s;
1618 1618 dev = un->un_sm[i].sm_dev;
1619 1619 }
1620 1620 }
1621 1621 *cando = mincnt;
1622 1622 return (dev);
1623 1623 }
1624 1624
1625 1625 /*
1626 1626 * Given a 32-bit bitmap, this routine will return the bit number
1627 1627 * of the nth bit set. The nth bit set is passed via the index integer.
1628 1628 *
1629 1629 * This routine is used to run through the writable submirror bitmap
1630 1630  * and start all of the writes. The value returned is the
1631 1631  * index to the appropriate submirror structure in the md_sm
1632 1632 * array for metamirrors.
1633 1633 */
1634 1634 static int
1635 1635 md_find_nth_unit(uint_t mask, int index)
1636 1636 {
1637 1637 int bit, nfound;
1638 1638
1639 1639 for (bit = -1, nfound = -1; nfound != index; bit++) {
1640 1640 ASSERT(mask != 0);
1641 1641 nfound += (mask & 1);
1642 1642 mask >>= 1;
1643 1643 }
1644 1644 return (bit);
1645 1645 }
1646 1646
1647 1647 static int
1648 1648 fast_select_read_unit(md_mps_t *ps, md_mcs_t *cs)
1649 1649 {
1650 1650 mm_unit_t *un;
1651 1651 buf_t *bp;
1652 1652 int i;
1653 1653 unsigned nunits = 0;
1654 1654 int iunit;
1655 1655 uint_t running_bm = 0;
1656 1656 uint_t sm_index;
1657 1657
1658 1658 bp = &cs->cs_buf;
1659 1659 un = ps->ps_un;
1660 1660
1661 1661 for (i = 0; i < NMIRROR; i++) {
1662 1662 if (!SMS_BY_INDEX_IS(un, i, SMS_RUNNING))
1663 1663 continue;
1664 1664 running_bm |= SMI2BIT(i);
1665 1665 nunits++;
1666 1666 }
1667 1667 if (nunits == 0)
1668 1668 return (1);
1669 1669
1670 1670 /*
1671 1671 * For directed mirror read (DMR) we only use the specified side and
1672 1672 * do not compute the source of the read.
1673 1673 * If we're running with MD_MPS_DIRTY_RD set we always return the
1674 1674 * first mirror side (this prevents unnecessary ownership switching).
1675 1675 * Otherwise we return the submirror according to the mirror read option
1676 1676 */
1677 1677 if (ps->ps_flags & MD_MPS_DMR) {
1678 1678 sm_index = un->un_dmr_last_read;
1679 1679 } else if (ps->ps_flags & MD_MPS_DIRTY_RD) {
1680 1680 sm_index = md_find_nth_unit(running_bm, 0);
1681 1681 } else {
1682 1682 /* Normal (non-DMR) operation */
1683 1683 switch (un->un_read_option) {
1684 1684 case RD_GEOMETRY:
1685 1685 iunit = (int)(bp->b_lblkno /
1686 1686 howmany(un->c.un_total_blocks, nunits));
1687 1687 sm_index = md_find_nth_unit(running_bm, iunit);
1688 1688 break;
1689 1689 case RD_FIRST:
1690 1690 sm_index = md_find_nth_unit(running_bm, 0);
1691 1691 break;
1692 1692 case RD_LOAD_BAL:
1693 1693 /* this is intentional to fall into the default */
1694 1694 default:
1695 1695 un->un_last_read = (un->un_last_read + 1) % nunits;
1696 1696 sm_index = md_find_nth_unit(running_bm,
1697 1697 un->un_last_read);
1698 1698 break;
1699 1699 }
1700 1700 }
1701 1701 bp->b_edev = md_dev64_to_dev(un->un_sm[sm_index].sm_dev);
1702 1702 ps->ps_allfrom_sm = SMI2BIT(sm_index);
1703 1703
1704 1704 if (un->un_sm[sm_index].sm_flags & MD_SM_FAILFAST) {
1705 1705 bp->b_flags |= B_FAILFAST;
1706 1706 }
1707 1707
1708 1708 return (0);
1709 1709 }
1710 1710
1711 1711 static
1712 1712 int
1713 1713 mirror_are_submirrors_available(mm_unit_t *un)
1714 1714 {
1715 1715 int i;
1716 1716 for (i = 0; i < NMIRROR; i++) {
1717 1717 md_dev64_t tmpdev = un->un_sm[i].sm_dev;
1718 1718
1719 1719 if ((!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) ||
1720 1720 md_getmajor(tmpdev) != md_major)
1721 1721 continue;
1722 1722
1723 1723 if ((MD_MIN2SET(md_getminor(tmpdev)) >= md_nsets) ||
1724 1724 (MD_MIN2UNIT(md_getminor(tmpdev)) >= md_nunits))
1725 1725 return (0);
1726 1726
1727 1727 if (MDI_UNIT(md_getminor(tmpdev)) == NULL)
1728 1728 return (0);
1729 1729 }
1730 1730 return (1);
1731 1731 }
1732 1732
1733 1733 void
1734 1734 build_submirror(mm_unit_t *un, int i, int snarfing)
1735 1735 {
1736 1736 struct mm_submirror *sm;
1737 1737 struct mm_submirror_ic *smic;
1738 1738 md_unit_t *su;
1739 1739 set_t setno;
1740 1740
1741 1741 sm = &un->un_sm[i];
1742 1742 smic = &un->un_smic[i];
1743 1743
1744 1744 sm->sm_flags = 0; /* sometime we may need to do more here */
1745 1745
1746 1746 setno = MD_UN2SET(un);
1747 1747
1748 1748 if (!SMS_IS(sm, SMS_INUSE))
1749 1749 return;
1750 1750 if (snarfing) {
1751 1751 sm->sm_dev = md_getdevnum(setno, mddb_getsidenum(setno),
1752 1752 sm->sm_key, MD_NOTRUST_DEVT);
1753 1753 } else {
1754 1754 if (md_getmajor(sm->sm_dev) == md_major) {
1755 1755 su = MD_UNIT(md_getminor(sm->sm_dev));
1756 1756 un->c.un_flag |= (su->c.un_flag & MD_LABELED);
1757 1757 /* submirror can no longer be soft partitioned */
1758 1758 MD_CAPAB(su) &= (~MD_CAN_SP);
1759 1759 }
1760 1760 }
1761 1761 smic->sm_shared_by_blk = md_get_named_service(sm->sm_dev,
1762 1762 0, "shared by blk", 0);
1763 1763 smic->sm_shared_by_indx = md_get_named_service(sm->sm_dev,
1764 1764 0, "shared by indx", 0);
1765 1765 smic->sm_get_component_count = (int (*)())md_get_named_service(
1766 1766 sm->sm_dev, 0, "get component count", 0);
1767 1767 smic->sm_get_bcss = (int (*)())md_get_named_service(sm->sm_dev, 0,
1768 1768 "get block count skip size", 0);
1769 1769 sm->sm_state &= ~SMS_IGNORE;
1770 1770 if (SMS_IS(sm, SMS_OFFLINE))
1771 1771 MD_STATUS(un) |= MD_UN_OFFLINE_SM;
1772 1772 md_set_parent(sm->sm_dev, MD_SID(un));
1773 1773 }
1774 1774
1775 1775 static void
1776 1776 mirror_cleanup(mm_unit_t *un)
1777 1777 {
1778 1778 mddb_recid_t recid;
1779 1779 int smi;
1780 1780 sv_dev_t sv[NMIRROR];
1781 1781 int nsv = 0;
1782 1782
1783 1783 /*
1784 1784 * If a MN diskset and this node is not the master, do
1785 1785 * not delete any records on snarf of the mirror records.
1786 1786 */
1787 1787 if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
1788 1788 md_set[MD_UN2SET(un)].s_am_i_master == 0) {
1789 1789 return;
1790 1790 }
1791 1791
1792 1792 for (smi = 0; smi < NMIRROR; smi++) {
1793 1793 if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
1794 1794 continue;
1795 1795 sv[nsv].setno = MD_UN2SET(un);
1796 1796 sv[nsv++].key = un->un_sm[smi].sm_key;
1797 1797 }
1798 1798
1799 1799 recid = un->un_rr_dirty_recid;
1800 1800 mddb_deleterec_wrapper(un->c.un_record_id);
1801 1801 if (recid > 0)
1802 1802 mddb_deleterec_wrapper(recid);
1803 1803
1804 1804 md_rem_names(sv, nsv);
1805 1805 }
1806 1806
1807 1807 /*
1808 1808 * Comparison function for the avl tree which tracks
1809 1809 * outstanding writes on submirrors.
1810 1810 *
1811 1811 * Returns:
1812 1812 * -1: ps1 < ps2
1813 1813 * 0: ps1 and ps2 overlap
1814 1814 * 1: ps1 > ps2
1815 1815 */
1816 1816 static int
1817 1817 mirror_overlap_compare(const void *p1, const void *p2)
1818 1818 {
1819 1819 const md_mps_t *ps1 = (md_mps_t *)p1;
1820 1820 const md_mps_t *ps2 = (md_mps_t *)p2;
1821 1821
1822 1822 if (ps1->ps_firstblk < ps2->ps_firstblk) {
1823 1823 if (ps1->ps_lastblk >= ps2->ps_firstblk)
1824 1824 return (0);
1825 1825 return (-1);
1826 1826 }
1827 1827
1828 1828 if (ps1->ps_firstblk > ps2->ps_firstblk) {
1829 1829 if (ps1->ps_firstblk <= ps2->ps_lastblk)
1830 1830 return (0);
1831 1831 return (1);
1832 1832 }
1833 1833
1834 1834 return (0);
1835 1835 }
1836 1836
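The comparator treats any two ranges that share at least one block as equal, which is what lets the AVL tree double as an overlap lookup for in-flight writes. A small stand-alone sketch of the same three-way classification, using a hypothetical range_t in place of md_mps_t:

#include <assert.h>

typedef struct { long long first, last; } range_t;

static int
range_compare(const range_t *r1, const range_t *r2)
{
	if (r1->first < r2->first)
		return ((r1->last >= r2->first) ? 0 : -1);
	if (r1->first > r2->first)
		return ((r1->first <= r2->last) ? 0 : 1);
	return (0);
}

int
main(void)
{
	range_t a = { 0, 99 }, b = { 50, 149 }, c = { 200, 299 };

	assert(range_compare(&a, &b) == 0);	/* ranges overlap */
	assert(range_compare(&a, &c) == -1);	/* a entirely before c */
	assert(range_compare(&c, &a) == 1);	/* c entirely after a */
	return (0);
}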
1837 1837 /*
1838 1838 * Collapse any sparse submirror entries snarfed from the on-disk replica.
1839 1839 * Only the in-core entries are updated. The replica will be updated on-disk
1840 1840 * when the in-core replica is committed on shutdown of the SVM subsystem.
1841 1841 */
1842 1842 static void
1843 1843 collapse_submirrors(mm_unit_t *un)
1844 1844 {
1845 1845 int smi, nremovals, smiremove;
1846 1846 mm_submirror_t *sm, *new_sm, *old_sm;
1847 1847 mm_submirror_ic_t *smic;
1848 1848 int nsmidx = un->un_nsm - 1;
1849 1849
1850 1850 rescan:
1851 1851 nremovals = 0;
1852 1852 smiremove = -1;
1853 1853
1854 1854 for (smi = 0; smi <= nsmidx; smi++) {
1855 1855 sm = &un->un_sm[smi];
1856 1856
1857 1857 /*
1858 1858 * Check to see if this submirror is marked as in-use.
1859 1859 * If it isn't then it is a potential sparse entry and
1860 1860 * may need to be cleared from the configuration.
1861 1861 * The records should _already_ have been cleared by the
1862 1862 * original mirror_detach() code, but we need to shuffle
1863 1863 * any NULL entries in un_sm[] to the end of the array.
1864 1864 * Any NULL un_smic[] entries need to be reset to the underlying
1865 1865 * submirror/slice accessor functions.
1866 1866 */
1867 1867 if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) {
1868 1868 nremovals++;
1869 1869 smiremove = smi;
1870 1870 break;
1871 1871 }
1872 1872 }
1873 1873
1874 1874 if (nremovals == 0) {
1875 1875 /*
1876 1876 * Ensure that we have a matching contiguous set of un_smic[]
1877 1877 * entries for the corresponding un_sm[] entries
1878 1878 */
1879 1879 for (smi = 0; smi <= nsmidx; smi++) {
1880 1880 smic = &un->un_smic[smi];
1881 1881 sm = &un->un_sm[smi];
1882 1882
1883 1883 smic->sm_shared_by_blk =
1884 1884 md_get_named_service(sm->sm_dev, 0,
1885 1885 			    "shared by blk", 0);
1886 1886 smic->sm_shared_by_indx =
1887 1887 md_get_named_service(sm->sm_dev, 0,
1888 1888 "shared by indx", 0);
1889 1889 smic->sm_get_component_count =
1890 1890 (int (*)())md_get_named_service(sm->sm_dev, 0,
1891 1891 "get component count", 0);
1892 1892 smic->sm_get_bcss =
1893 1893 (int (*)())md_get_named_service(sm->sm_dev, 0,
1894 1894 "get block count skip size", 0);
1895 1895 }
1896 1896 return;
1897 1897 }
1898 1898
1899 1899 /*
1900 1900 * Reshuffle the submirror devices so that we do not have a dead record
1901 1901 * in the middle of the array. Once we've done this we need to rescan
1902 1902 * the mirror to check for any other holes.
1903 1903 */
1904 1904 for (smi = 0; smi < NMIRROR; smi++) {
1905 1905 if (smi < smiremove)
1906 1906 continue;
1907 1907 if (smi > smiremove) {
1908 1908 old_sm = &un->un_sm[smi];
1909 1909 new_sm = &un->un_sm[smi - 1];
1910 1910 bcopy(old_sm, new_sm, sizeof (mm_submirror_t));
1911 1911 bzero(old_sm, sizeof (mm_submirror_t));
1912 1912 }
1913 1913 }
1914 1914
1915 1915 /*
1916 1916 * Now we need to rescan the array to find the next potential dead
1917 1917 * entry.
1918 1918 */
1919 1919 goto rescan;
1920 1920 }
1921 1921
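collapse_submirrors() packs the in-use entries to the front of un_sm[]/un_smic[] by shifting everything past a hole down one slot and then rescanning from the top. As a rough user-level sketch, the end state is the same as a single-pass compaction over a plain array (compact_slots() below is a hypothetical stand-in, not driver code):

#include <assert.h>

#define	NSLOTS	4

/* Pack non-zero (in-use) entries to the front, zero the tail. */
static int
compact_slots(int *slot, int n)
{
	int i, j = 0;

	for (i = 0; i < n; i++)
		if (slot[i] != 0)
			slot[j++] = slot[i];
	for (i = j; i < n; i++)
		slot[i] = 0;
	return (j);		/* number of in-use entries */
}

int
main(void)
{
	int slot[NSLOTS] = { 7, 0, 9, 0 };	/* sparse: hole at index 1 */

	assert(compact_slots(slot, NSLOTS) == 2);
	assert(slot[0] == 7 && slot[1] == 9 && slot[2] == 0);
	return (0);
}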
1922 1922 /* Return -1 if optimized record is unavailable and the set should be released */
1923 1923 int
1924 1924 mirror_build_incore(mm_unit_t *un, int snarfing)
1925 1925 {
1926 1926 int i;
1927 1927
1928 1928 if (MD_STATUS(un) & MD_UN_BEING_RESET) {
1929 1929 mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCLEAN);
1930 1930 return (1);
1931 1931 }
1932 1932
1933 1933 if (mirror_are_submirrors_available(un) == 0)
1934 1934 return (1);
1935 1935
1936 1936 if (MD_UNIT(MD_SID(un)) != NULL)
1937 1937 return (0);
1938 1938
1939 1939 MD_STATUS(un) = 0;
1940 1940
1941 1941 /* pre-4.1 didn't define CAN_META_CHILD capability */
1942 1942 MD_CAPAB(un) = MD_CAN_META_CHILD | MD_CAN_PARENT | MD_CAN_SP;
1943 1943
1944 1944 un->un_overlap_tree_flag = 0;
1945 1945 avl_create(&un->un_overlap_root, mirror_overlap_compare,
1946 1946 sizeof (md_mps_t), offsetof(md_mps_t, ps_overlap_node));
1947 1947
1948 1948 /*
1949 1949 * We need to collapse any sparse submirror entries into a non-sparse
1950 1950 * array. This is to cover the case where we have an old replica image
1951 1951 * which has not been updated (i.e. snarfed) since being modified.
1952 1952 * The new code expects all submirror access to be sequential (i.e.
1953 1953 * both the un_sm[] and un_smic[] entries correspond to non-empty
1954 1954 	 * submirrors).
1955 1955 */
1956 1956
1957 1957 collapse_submirrors(un);
1958 1958
1959 1959 for (i = 0; i < NMIRROR; i++)
1960 1960 build_submirror(un, i, snarfing);
1961 1961
1962 1962 if (unit_setup_resync(un, snarfing) != 0) {
1963 1963 if (snarfing) {
1964 1964 mddb_setrecprivate(un->c.un_record_id, MD_PRV_GOTIT);
1965 1965 /*
1966 1966 * If a MN set and set is not stale, then return -1
1967 1967 * which will force the caller to unload the set.
1968 1968 * The MN diskset nodes will return failure if
1969 1969 * unit_setup_resync fails so that nodes won't
1970 1970 * get out of sync.
1971 1971 *
1972 1972 * If set is STALE, the master node can't allocate
1973 1973 * a resync record (if needed), but node needs to
1974 1974 * join the set so that user can delete broken mddbs.
1975 1975 * So, if set is STALE, just continue on.
1976 1976 */
1977 1977 if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
1978 1978 !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) {
1979 1979 return (-1);
1980 1980 }
1981 1981 } else
1982 1982 return (1);
1983 1983 }
1984 1984
1985 1985 mutex_init(&un->un_overlap_tree_mx, NULL, MUTEX_DEFAULT, NULL);
1986 1986 cv_init(&un->un_overlap_tree_cv, NULL, CV_DEFAULT, NULL);
1987 1987
1988 1988 un->un_suspend_wr_flag = 0;
1989 1989 mutex_init(&un->un_suspend_wr_mx, NULL, MUTEX_DEFAULT, NULL);
1990 1990 cv_init(&un->un_suspend_wr_cv, NULL, CV_DEFAULT, NULL);
1991 1991
1992 1992 /*
1993 1993 * Allocate mutexes for mirror-owner and resync-owner changes.
1994 1994 * All references to the owner message state field must be guarded
1995 1995 * by this mutex.
1996 1996 */
1997 1997 mutex_init(&un->un_owner_mx, NULL, MUTEX_DEFAULT, NULL);
1998 1998
1999 1999 /*
2000 2000 * Allocate mutex and condvar for resync thread manipulation. These
2001 2001 * will be used by mirror_resync_unit/mirror_ioctl_resync
2002 2002 */
2003 2003 mutex_init(&un->un_rs_thread_mx, NULL, MUTEX_DEFAULT, NULL);
2004 2004 cv_init(&un->un_rs_thread_cv, NULL, CV_DEFAULT, NULL);
2005 2005
2006 2006 /*
2007 2007 * Allocate mutex and condvar for resync progress thread manipulation.
2008 2008 * This allows resyncs to be continued across an intervening reboot.
2009 2009 */
2010 2010 mutex_init(&un->un_rs_progress_mx, NULL, MUTEX_DEFAULT, NULL);
2011 2011 cv_init(&un->un_rs_progress_cv, NULL, CV_DEFAULT, NULL);
2012 2012
2013 2013 /*
2014 2014 * Allocate mutex and condvar for Directed Mirror Reads (DMR). This
2015 2015 * provides synchronization between a user-ioctl and the resulting
2016 2016 * strategy() call that performs the read().
2017 2017 */
2018 2018 mutex_init(&un->un_dmr_mx, NULL, MUTEX_DEFAULT, NULL);
2019 2019 cv_init(&un->un_dmr_cv, NULL, CV_DEFAULT, NULL);
2020 2020
2021 2021 /*
2022 2022 	 * Allocate rwlocks for accessing un_pernode_dirty_bm.
2023 2023 */
2024 2024 for (i = 0; i < MD_MNMAXSIDES; i++) {
2025 2025 rw_init(&un->un_pernode_dirty_mx[i], NULL, RW_DEFAULT, NULL);
2026 2026 }
2027 2027
2028 2028 /* place various information in the in-core data structures */
2029 2029 md_nblocks_set(MD_SID(un), un->c.un_total_blocks);
2030 2030 MD_UNIT(MD_SID(un)) = un;
2031 2031
2032 2032 return (0);
2033 2033 }
2034 2034
2035 2035
2036 2036 void
2037 2037 reset_mirror(struct mm_unit *un, minor_t mnum, int removing)
2038 2038 {
2039 2039 mddb_recid_t recid, vtoc_id;
2040 2040 size_t bitcnt;
2041 2041 size_t shortcnt;
2042 2042 int smi;
2043 2043 sv_dev_t sv[NMIRROR];
2044 2044 int nsv = 0;
2045 2045 uint_t bits = 0;
2046 2046 minor_t selfid;
2047 2047 md_unit_t *su;
2048 2048 int i;
2049 2049
2050 2050 md_destroy_unit_incore(mnum, &mirror_md_ops);
2051 2051
2052 2052 shortcnt = un->un_rrd_num * sizeof (short);
2053 2053 bitcnt = howmany(un->un_rrd_num, NBBY);
2054 2054
2055 2055 if (un->un_outstanding_writes)
2056 2056 kmem_free((caddr_t)un->un_outstanding_writes, shortcnt);
2057 2057 if (un->un_goingclean_bm)
2058 2058 kmem_free((caddr_t)un->un_goingclean_bm, bitcnt);
2059 2059 if (un->un_goingdirty_bm)
2060 2060 kmem_free((caddr_t)un->un_goingdirty_bm, bitcnt);
2061 2061 if (un->un_resync_bm)
2062 2062 kmem_free((caddr_t)un->un_resync_bm, bitcnt);
2063 2063 if (un->un_pernode_dirty_sum)
2064 2064 kmem_free((caddr_t)un->un_pernode_dirty_sum, un->un_rrd_num);
2065 2065
2066 2066 /*
2067 2067 * Destroy the taskq for deferred processing of DRL clean requests.
2068 2068 * This taskq will only be present for Multi Owner mirrors.
2069 2069 */
2070 2070 if (un->un_drl_task != NULL)
2071 2071 ddi_taskq_destroy(un->un_drl_task);
2072 2072
2073 2073 md_nblocks_set(mnum, -1ULL);
2074 2074 MD_UNIT(mnum) = NULL;
2075 2075
2076 2076 /*
2077 2077 * Attempt release of its minor node
2078 2078 */
2079 2079 md_remove_minor_node(mnum);
2080 2080
2081 2081 if (!removing)
2082 2082 return;
2083 2083
2084 2084 for (smi = 0; smi < NMIRROR; smi++) {
2085 2085 if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
2086 2086 continue;
2087 2087 /* reallow soft partitioning of submirror and reset parent */
2088 2088 su = MD_UNIT(md_getminor(un->un_sm[smi].sm_dev));
2089 2089 MD_CAPAB(su) |= MD_CAN_SP;
2090 2090 md_reset_parent(un->un_sm[smi].sm_dev);
2091 2091 reset_comp_states(&un->un_sm[smi], &un->un_smic[smi]);
2092 2092
2093 2093 sv[nsv].setno = MD_MIN2SET(mnum);
2094 2094 sv[nsv++].key = un->un_sm[smi].sm_key;
2095 2095 bits |= SMI2BIT(smi);
2096 2096 }
2097 2097
2098 2098 MD_STATUS(un) |= MD_UN_BEING_RESET;
2099 2099 recid = un->un_rr_dirty_recid;
2100 2100 vtoc_id = un->c.un_vtoc_id;
2101 2101 selfid = MD_SID(un);
2102 2102
2103 2103 mirror_commit(un, bits, 0);
2104 2104
2105 2105 avl_destroy(&un->un_overlap_root);
2106 2106
2107 2107 /* Destroy all mutexes and condvars before returning. */
2108 2108 mutex_destroy(&un->un_suspend_wr_mx);
2109 2109 cv_destroy(&un->un_suspend_wr_cv);
2110 2110 mutex_destroy(&un->un_overlap_tree_mx);
2111 2111 cv_destroy(&un->un_overlap_tree_cv);
2112 2112 mutex_destroy(&un->un_owner_mx);
2113 2113 mutex_destroy(&un->un_rs_thread_mx);
2114 2114 cv_destroy(&un->un_rs_thread_cv);
2115 2115 mutex_destroy(&un->un_rs_progress_mx);
2116 2116 cv_destroy(&un->un_rs_progress_cv);
2117 2117 mutex_destroy(&un->un_dmr_mx);
2118 2118 cv_destroy(&un->un_dmr_cv);
2119 2119
2120 2120 for (i = 0; i < MD_MNMAXSIDES; i++) {
2121 2121 rw_destroy(&un->un_pernode_dirty_mx[i]);
2122 2122 if (un->un_pernode_dirty_bm[i])
2123 2123 kmem_free((caddr_t)un->un_pernode_dirty_bm[i], bitcnt);
2124 2124 }
2125 2125
2126 2126 /*
2127 2127 * Remove self from the namespace
2128 2128 */
2129 2129 if (un->c.un_revision & MD_FN_META_DEV) {
2130 2130 (void) md_rem_selfname(un->c.un_self_id);
2131 2131 }
2132 2132
2133 2133 /* This frees the unit structure. */
2134 2134 mddb_deleterec_wrapper(un->c.un_record_id);
2135 2135
2136 2136 if (recid != 0)
2137 2137 mddb_deleterec_wrapper(recid);
2138 2138
2139 2139 /* Remove the vtoc, if present */
2140 2140 if (vtoc_id)
2141 2141 mddb_deleterec_wrapper(vtoc_id);
2142 2142
2143 2143 md_rem_names(sv, nsv);
2144 2144
2145 2145 SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE,
2146 2146 MD_MIN2SET(selfid), selfid);
2147 2147 }
2148 2148
2149 2149 int
2150 2150 mirror_internal_open(
2151 2151 minor_t mnum,
2152 2152 int flag,
2153 2153 int otyp,
2154 2154 int md_oflags,
2155 2155 IOLOCK *lockp /* can be NULL */
2156 2156 )
2157 2157 {
2158 2158 mdi_unit_t *ui = MDI_UNIT(mnum);
2159 2159 int err = 0;
2160 2160
2161 2161 tryagain:
2162 2162 /* single thread */
2163 2163 if (lockp) {
2164 2164 /*
2165 2165 * If ioctl lock is held, use openclose_enter
2166 2166 * routine that will set the ioctl flag when
2167 2167 * grabbing the readerlock.
2168 2168 */
2169 2169 (void) md_ioctl_openclose_enter(lockp, ui);
2170 2170 } else {
2171 2171 (void) md_unit_openclose_enter(ui);
2172 2172 }
2173 2173
2174 2174 /*
2175 2175 * The mirror_open_all_devs routine may end up sending a STATE_UPDATE
2176 2176 * message in a MN diskset and this requires that the openclose
2177 2177 * lock is dropped in order to send this message. So, another
2178 2178 * flag (MD_UL_OPENINPROGRESS) is used to keep another thread from
2179 2179 * attempting an open while this thread has an open in progress.
2180 2180 * Call the *_lh version of the lock exit routines since the ui_mx
2181 2181 * mutex must be held from checking for OPENINPROGRESS until
2182 2182 * after the cv_wait call.
2183 2183 */
2184 2184 mutex_enter(&ui->ui_mx);
2185 2185 if (ui->ui_lock & MD_UL_OPENINPROGRESS) {
2186 2186 if (lockp) {
2187 2187 (void) md_ioctl_openclose_exit_lh(lockp);
2188 2188 } else {
2189 2189 md_unit_openclose_exit_lh(ui);
2190 2190 }
2191 2191 cv_wait(&ui->ui_cv, &ui->ui_mx);
2192 2192 mutex_exit(&ui->ui_mx);
2193 2193 goto tryagain;
2194 2194 }
2195 2195
2196 2196 ui->ui_lock |= MD_UL_OPENINPROGRESS;
2197 2197 mutex_exit(&ui->ui_mx);
2198 2198
2199 2199 /* open devices, if necessary */
2200 2200 if (! md_unit_isopen(ui) || (ui->ui_tstate & MD_INACCESSIBLE)) {
2201 2201 if ((err = mirror_open_all_devs(mnum, md_oflags, lockp)) != 0)
2202 2202 goto out;
2203 2203 }
2204 2204
2205 2205 /* count open */
2206 2206 if ((err = md_unit_incopen(mnum, flag, otyp)) != 0)
2207 2207 goto out;
2208 2208
2209 2209 /* unlock, return success */
2210 2210 out:
2211 2211 mutex_enter(&ui->ui_mx);
2212 2212 ui->ui_lock &= ~MD_UL_OPENINPROGRESS;
2213 2213 mutex_exit(&ui->ui_mx);
2214 2214
2215 2215 if (lockp) {
2216 2216 /*
2217 2217 * If ioctl lock is held, use openclose_exit
2218 2218 * routine that will clear the lockp reader flag.
2219 2219 */
2220 2220 (void) md_ioctl_openclose_exit(lockp);
2221 2221 } else {
2222 2222 md_unit_openclose_exit(ui);
2223 2223 }
2224 2224 return (err);
2225 2225 }
2226 2226
2227 2227 int
2228 2228 mirror_internal_close(
2229 2229 minor_t mnum,
2230 2230 int otyp,
2231 2231 int md_cflags,
2232 2232 IOLOCK *lockp /* can be NULL */
2233 2233 )
2234 2234 {
2235 2235 mdi_unit_t *ui = MDI_UNIT(mnum);
2236 2236 mm_unit_t *un;
2237 2237 int err = 0;
2238 2238
2239 2239 /* single thread */
2240 2240 if (lockp) {
2241 2241 /*
2242 2242 * If ioctl lock is held, use openclose_enter
2243 2243 * routine that will set the ioctl flag when
2244 2244 * grabbing the readerlock.
2245 2245 */
2246 2246 un = (mm_unit_t *)md_ioctl_openclose_enter(lockp, ui);
2247 2247 } else {
2248 2248 un = (mm_unit_t *)md_unit_openclose_enter(ui);
2249 2249 }
2250 2250
2251 2251 /* count closed */
2252 2252 if ((err = md_unit_decopen(mnum, otyp)) != 0)
2253 2253 goto out;
2254 2254
2255 2255 /* close devices, if necessary */
2256 2256 if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
2257 2257 /*
2258 2258 * Clean up dirty bitmap for this unit. Do this
2259 2259 * before closing the underlying devices to avoid
2260 2260 * race conditions with reset_mirror() as a
2261 2261 * result of a 'metaset -r' command running in
2262 2262 * parallel. This might cause deallocation of
2263 2263 * dirty region bitmaps; with underlying metadevices
2264 2264 * in place this can't happen.
2265 2265 		 * Don't do this for a MN set with ABR set.
2266 2266 */
2267 2267 if (new_resync && !(MD_STATUS(un) & MD_UN_KEEP_DIRTY)) {
2268 2268 if (!MD_MNSET_SETNO(MD_UN2SET(un)) ||
2269 2269 !(ui->ui_tstate & MD_ABR_CAP))
2270 2270 mirror_process_unit_resync(un);
2271 2271 }
2272 2272 (void) mirror_close_all_devs(un, md_cflags);
2273 2273
2274 2274 /*
2275 2275 		 * For a MN set with transient capabilities (e.g. ABR/DMR) set,
2276 2276 		 * clear these capabilities on the last close in the cluster.
2277 2277 		 * To do this we send a message to all nodes to see if the
2278 2278 * device is open.
2279 2279 */
2280 2280 if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
2281 2281 (ui->ui_tstate & (MD_ABR_CAP|MD_DMR_CAP))) {
2282 2282 if (lockp) {
2283 2283 (void) md_ioctl_openclose_exit(lockp);
2284 2284 } else {
2285 2285 md_unit_openclose_exit(ui);
2286 2286 }
2287 2287
2288 2288 /*
2289 2289 * if we are in the context of an ioctl, drop the
2290 2290 * ioctl lock.
2291 2291 * Otherwise, no other locks should be held.
2292 2292 */
2293 2293 if (lockp) {
2294 2294 IOLOCK_RETURN_RELEASE(0, lockp);
2295 2295 }
2296 2296
2297 2297 mdmn_clear_all_capabilities(mnum);
2298 2298
2299 2299 /* if dropped the lock previously, regain it */
2300 2300 if (lockp) {
2301 2301 IOLOCK_RETURN_REACQUIRE(lockp);
2302 2302 }
2303 2303 return (0);
2304 2304 }
2305 2305 /* unlock and return success */
2306 2306 }
2307 2307 out:
2308 2308 /* Call whether lockp is NULL or not. */
2309 2309 if (lockp) {
2310 2310 md_ioctl_openclose_exit(lockp);
2311 2311 } else {
2312 2312 md_unit_openclose_exit(ui);
2313 2313 }
2314 2314 return (err);
2315 2315 }
2316 2316
2317 2317 /*
2318 2318 * When a component has completed resyncing and is now ok, check if the
2319 2319 * corresponding component in the other submirrors is in the Last Erred
2320 2320 * state. If it is, we want to change that to the Erred state so we stop
2321 2321 * using that component and start using this good component instead.
2322 2322 *
2323 2323 * This is called from set_sm_comp_state and recursively calls
2324 2324 * set_sm_comp_state if it needs to change the Last Erred state.
2325 2325 */
2326 2326 static void
2327 2327 reset_lasterred(mm_unit_t *un, int smi, mddb_recid_t *extras, uint_t flags,
2328 2328 IOLOCK *lockp)
2329 2329 {
2330 2330 mm_submirror_t *sm;
2331 2331 mm_submirror_ic_t *smic;
2332 2332 int ci;
2333 2333 int i;
2334 2334 int compcnt;
2335 2335 int changed = 0;
2336 2336
2337 2337 for (i = 0; i < NMIRROR; i++) {
2338 2338 sm = &un->un_sm[i];
2339 2339 smic = &un->un_smic[i];
2340 2340
2341 2341 if (!SMS_IS(sm, SMS_INUSE))
2342 2342 continue;
2343 2343
2344 2344 /* ignore the submirror that we just made ok */
2345 2345 if (i == smi)
2346 2346 continue;
2347 2347
2348 2348 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
2349 2349 for (ci = 0; ci < compcnt; ci++) {
2350 2350 md_m_shared_t *shared;
2351 2351
2352 2352 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
2353 2353 (sm->sm_dev, sm, ci);
2354 2354
2355 2355 if ((shared->ms_state & CS_LAST_ERRED) &&
2356 2356 !mirror_other_sources(un, i, ci, 1)) {
2357 2357
2358 2358 set_sm_comp_state(un, i, ci, CS_ERRED, extras,
2359 2359 flags, lockp);
2360 2360 changed = 1;
2361 2361 }
2362 2362 }
2363 2363 }
2364 2364
2365 2365 /* maybe there is a hotspare for this newly erred component */
2366 2366 if (changed) {
2367 2367 set_t setno;
2368 2368
2369 2369 setno = MD_UN2SET(un);
2370 2370 if (MD_MNSET_SETNO(setno)) {
2371 2371 send_poke_hotspares(setno);
2372 2372 } else {
2373 2373 (void) poke_hotspares();
2374 2374 }
2375 2375 }
2376 2376 }
2377 2377
2378 2378 /*
2379 2379 * set_sm_comp_state
2380 2380 *
2381 2381 * Set the state of a submirror component to the specified new state.
2382 2382 * If the mirror is in a multi-node set, send messages to all nodes to
2383 2383 * block all writes to the mirror and then update the state and release the
2384 2384 * writes. These messages are only sent if MD_STATE_XMIT is set in flags.
2385 2385 * MD_STATE_XMIT will be unset in 2 cases:
2386 2386 * 1. When the state is changed to CS_RESYNC as this state change
2387 2387 * will already have been updated on each node by the processing of the
2388 2388 * distributed metasync command, hence no need to xmit.
2389 2389  * 2. When the state is changed to CS_OKAY after a resync has completed. Again
2390 2390 * the resync completion will already have been processed on each node by
2391 2391 * the processing of the MD_MN_MSG_RESYNC_PHASE_DONE message for a component
2392 2392 * resync, hence no need to xmit.
2393 2393 *
2394 2394  * If we are called while updating watermarks (MD_STATE_WMUPDATE will
2395 2395  * then be set in ps->flags), the call results from a metainit or similar.
2396 2396  * In this case the message that we send to propagate the state change
2397 2397  * must not be a class1 message, as that would deadlock with the metainit
2398 2398  * command that is still being processed.
2399 2399  * We achieve this by creating a class2 message, MD_MN_MSG_STATE_UPDATE2,
2400 2400  * instead. This also makes the submessage generator create a class2
2401 2401  * submessage rather than a class1 (which would also block).
2402 2402 *
2403 2403 * On entry, unit_writerlock is held
2404 2404 * If MD_STATE_OCHELD is set in flags, then unit_openclose lock is
2405 2405 * also held.
2406 2406 */
2407 2407 void
2408 2408 set_sm_comp_state(
2409 2409 mm_unit_t *un,
2410 2410 int smi,
2411 2411 int ci,
2412 2412 int newstate,
2413 2413 mddb_recid_t *extras,
2414 2414 uint_t flags,
2415 2415 IOLOCK *lockp
2416 2416 )
2417 2417 {
2418 2418 mm_submirror_t *sm;
2419 2419 mm_submirror_ic_t *smic;
2420 2420 md_m_shared_t *shared;
2421 2421 int origstate;
2422 2422 void (*get_dev)();
2423 2423 ms_cd_info_t cd;
2424 2424 char devname[MD_MAX_CTDLEN];
2425 2425 int err;
2426 2426 set_t setno = MD_UN2SET(un);
2427 2427 md_mn_msg_stch_t stchmsg;
2428 2428 mdi_unit_t *ui = MDI_UNIT(MD_SID(un));
2429 2429 md_mn_kresult_t *kresult;
2430 2430 int rval;
2431 2431 uint_t msgflags;
2432 2432 md_mn_msgtype_t msgtype;
2433 2433 int save_lock = 0;
2434 2434 mdi_unit_t *ui_sm;
2435 2435 int nretries = 0;
2436 2436
2437 2437 sm = &un->un_sm[smi];
2438 2438 smic = &un->un_smic[smi];
2439 2439
2440 2440 /* If we have a real error status then turn off MD_INACCESSIBLE. */
2441 2441 ui_sm = MDI_UNIT(getminor(md_dev64_to_dev(sm->sm_dev)));
2442 2442 if (newstate & (CS_ERRED | CS_RESYNC | CS_LAST_ERRED) &&
2443 2443 ui_sm->ui_tstate & MD_INACCESSIBLE) {
2444 2444 ui_sm->ui_tstate &= ~MD_INACCESSIBLE;
2445 2445 }
2446 2446
2447 2447 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
2448 2448 (sm->sm_dev, sm, ci);
2449 2449 origstate = shared->ms_state;
2450 2450
2451 2451 /*
2452 2452 * If the new state is an error and the old one wasn't, generate
2453 2453 * a console message. We do this before we send the state to other
2454 2454 * nodes in a MN set because the state change may change the component
2455 2455 * name if a hotspare is allocated.
2456 2456 */
2457 2457 if ((! (origstate & (CS_ERRED|CS_LAST_ERRED))) &&
2458 2458 (newstate & (CS_ERRED|CS_LAST_ERRED))) {
2459 2459
2460 2460 get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
2461 2461 "get device", 0);
2462 2462 (void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
2463 2463
2464 2464 err = md_getdevname(setno, mddb_getsidenum(setno), 0,
2465 2465 cd.cd_dev, devname, sizeof (devname));
2466 2466
2467 2467 if (err == ENOENT) {
2468 2468 (void) md_devname(setno, cd.cd_dev, devname,
2469 2469 sizeof (devname));
2470 2470 }
2471 2471
2472 2472 cmn_err(CE_WARN, "md: %s: %s needs maintenance",
2473 2473 md_shortname(md_getminor(sm->sm_dev)), devname);
2474 2474
2475 2475 if (newstate & CS_LAST_ERRED) {
2476 2476 cmn_err(CE_WARN, "md: %s: %s last erred",
2477 2477 md_shortname(md_getminor(sm->sm_dev)),
2478 2478 devname);
2479 2479
2480 2480 } else if (shared->ms_flags & MDM_S_ISOPEN) {
2481 2481 /*
2482 2482 * Close the broken device and clear the open flag on
2483 2483 * it. Closing the device means the RCM framework will
2484 2484 * be able to unconfigure the device if required.
2485 2485 *
2486 2486 * We have to check that the device is open, otherwise
2487 2487 * the first open on it has resulted in the error that
2488 2488 * is being processed and the actual cd.cd_dev will be
2489 2489 * NODEV64.
2490 2490 *
2491 2491 * If this is a multi-node mirror, then the multinode
2492 2492 * state checks following this code will cause the
2493 2493 * slave nodes to close the mirror in the function
2494 2494 * mirror_set_state().
2495 2495 */
2496 2496 md_layered_close(cd.cd_dev, MD_OFLG_NULL);
2497 2497 shared->ms_flags &= ~MDM_S_ISOPEN;
2498 2498 }
2499 2499
2500 2500 } else if ((origstate & CS_LAST_ERRED) && (newstate & CS_ERRED) &&
2501 2501 (shared->ms_flags & MDM_S_ISOPEN)) {
2502 2502 /*
2503 2503 * Similar to logic above except no log messages since we
2504 2504 * are just transitioning from Last Erred to Erred.
2505 2505 */
2506 2506 get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
2507 2507 "get device", 0);
2508 2508 (void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
2509 2509
2510 2510 md_layered_close(cd.cd_dev, MD_OFLG_NULL);
2511 2511 shared->ms_flags &= ~MDM_S_ISOPEN;
2512 2512 }
2513 2513
2514 2514 if ((MD_MNSET_SETNO(setno)) && (origstate != newstate) &&
2515 2515 (flags & MD_STATE_XMIT) && !(ui->ui_tstate & MD_ERR_PENDING)) {
2516 2516 /*
2517 2517 * For a multi-node mirror, send the state change to the
2518 2518 * master, which broadcasts to all nodes, including this
2519 2519 * one. Once the message is received, the state is set
2520 2520 * in-core and the master commits the change to disk.
2521 2521 * There is a case, comp_replace, where this function
2522 2522 * can be called from within an ioctl and therefore in this
2523 2523 * case, as the ioctl will already be called on each node,
2524 2524 * there is no need to xmit the state change to the master for
2525 2525 * distribution to the other nodes. MD_STATE_XMIT flag is used
2526 2526 * to indicate whether a xmit is required. The mirror's
2527 2527 * transient state is set to MD_ERR_PENDING to avoid sending
2528 2528 * multiple messages.
2529 2529 */
2530 2530 if (newstate & (CS_ERRED|CS_LAST_ERRED))
2531 2531 ui->ui_tstate |= MD_ERR_PENDING;
2532 2532
2533 2533 /*
2534 2534 * Send a state update message to all nodes. This message
2535 2535 * will generate 2 submessages, the first one to suspend
2536 2536 * all writes to the mirror and the second to update the
2537 2537 * state and resume writes.
2538 2538 */
2539 2539 stchmsg.msg_stch_mnum = un->c.un_self_id;
2540 2540 stchmsg.msg_stch_sm = smi;
2541 2541 stchmsg.msg_stch_comp = ci;
2542 2542 stchmsg.msg_stch_new_state = newstate;
2543 2543 stchmsg.msg_stch_hs_id = shared->ms_hs_id;
2544 2544 #ifdef DEBUG
2545 2545 if (mirror_debug_flag)
2546 2546 printf("send set state, %x, %x, %x, %x, %x\n",
2547 2547 stchmsg.msg_stch_mnum, stchmsg.msg_stch_sm,
2548 2548 stchmsg.msg_stch_comp, stchmsg.msg_stch_new_state,
2549 2549 stchmsg.msg_stch_hs_id);
2550 2550 #endif
2551 2551 if (flags & MD_STATE_WMUPDATE) {
2552 2552 msgtype = MD_MN_MSG_STATE_UPDATE2;
2553 2553 /*
2554 2554 * When coming from an update of watermarks, there
2555 2555 * must already be a message logged that triggered
2556 2556 * this action. So, no need to log this message, too.
2557 2557 */
2558 2558 msgflags = MD_MSGF_NO_LOG;
2559 2559 } else {
2560 2560 msgtype = MD_MN_MSG_STATE_UPDATE;
2561 2561 msgflags = MD_MSGF_DEFAULT_FLAGS;
2562 2562 }
2563 2563
2564 2564 /*
2565 2565 * If we are in the context of an ioctl, drop the ioctl lock.
2566 2566 * lockp holds the list of locks held.
2567 2567 *
2568 2568 * Otherwise, increment the appropriate reacquire counters.
2569 2569 * If openclose lock is *held, then must reacquire reader
2570 2570 * lock before releasing the openclose lock.
2571 2571 * Do not drop the ARRAY_WRITER lock as we may not be able
2572 2572 * to reacquire it.
2573 2573 */
2574 2574 if (lockp) {
2575 2575 if (lockp->l_flags & MD_ARRAY_WRITER) {
2576 2576 save_lock = MD_ARRAY_WRITER;
2577 2577 lockp->l_flags &= ~MD_ARRAY_WRITER;
2578 2578 } else if (lockp->l_flags & MD_ARRAY_READER) {
2579 2579 save_lock = MD_ARRAY_READER;
2580 2580 lockp->l_flags &= ~MD_ARRAY_READER;
2581 2581 }
2582 2582 IOLOCK_RETURN_RELEASE(0, lockp);
2583 2583 } else {
2584 2584 if (flags & MD_STATE_OCHELD) {
2585 2585 md_unit_writerexit(ui);
2586 2586 (void) md_unit_readerlock(ui);
2587 2587 md_unit_openclose_exit(ui);
2588 2588 } else {
2589 2589 md_unit_writerexit(ui);
2590 2590 }
2591 2591 }
2592 2592
2593 2593 kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
2594 2594 sscs_msg:
2595 2595 rval = mdmn_ksend_message(setno, msgtype, msgflags, 0,
2596 2596 (char *)&stchmsg, sizeof (stchmsg), kresult);
2597 2597
2598 2598 if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
2599 2599 mdmn_ksend_show_error(rval, kresult, "STATE UPDATE");
2600 2600 /* If we're shutting down already, pause things here. */
2601 2601 if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
2602 2602 while (!md_mn_is_commd_present()) {
2603 2603 delay(md_hz);
2604 2604 }
2605 2605 /*
2606 2606 * commd is now available; retry the message
2607 2607 * one time. If that fails we fall through and
2608 2608 * panic as the system is in an unexpected state
2609 2609 */
2610 2610 if (nretries++ == 0)
2611 2611 goto sscs_msg;
2612 2612 }
2613 2613 cmn_err(CE_PANIC,
2614 2614 "ksend_message failure: STATE_UPDATE");
2615 2615 }
2616 2616 kmem_free(kresult, sizeof (md_mn_kresult_t));
2617 2617
2618 2618 /* if dropped the lock previously, regain it */
2619 2619 if (lockp) {
2620 2620 IOLOCK_RETURN_REACQUIRE(lockp);
2621 2621 lockp->l_flags |= save_lock;
2622 2622 } else {
2623 2623 /*
2624 2624 * Reacquire dropped locks and update acquirecnts
2625 2625 * appropriately.
2626 2626 */
2627 2627 if (flags & MD_STATE_OCHELD) {
2628 2628 /*
2629 2629 * openclose also grabs readerlock.
2630 2630 */
2631 2631 (void) md_unit_openclose_enter(ui);
2632 2632 md_unit_readerexit(ui);
2633 2633 (void) md_unit_writerlock(ui);
2634 2634 } else {
2635 2635 (void) md_unit_writerlock(ui);
2636 2636 }
2637 2637 }
2638 2638
2639 2639 ui->ui_tstate &= ~MD_ERR_PENDING;
2640 2640 } else {
2641 2641 shared->ms_state = newstate;
2642 2642 uniqtime32(&shared->ms_timestamp);
2643 2643
2644 2644 if (newstate == CS_ERRED)
2645 2645 shared->ms_flags |= MDM_S_NOWRITE;
2646 2646 else
2647 2647 shared->ms_flags &= ~MDM_S_NOWRITE;
2648 2648
2649 2649 shared->ms_flags &= ~MDM_S_IOERR;
2650 2650 un->un_changecnt++;
2651 2651 shared->ms_lasterrcnt = un->un_changecnt;
2652 2652
2653 2653 mirror_set_sm_state(sm, smic, SMS_RUNNING, 0);
2654 2654 mirror_commit(un, SMI2BIT(smi), extras);
2655 2655 }
2656 2656
2657 2657 if ((origstate & CS_RESYNC) && (newstate & CS_OKAY)) {
2658 2658 /*
2659 2659 * Resetting the Last Erred state will recursively call back
2660 2660 * into this function (set_sm_comp_state) to update the state.
2661 2661 */
2662 2662 reset_lasterred(un, smi, extras, flags, lockp);
2663 2663 }
2664 2664 }
2665 2665
2666 2666 static int
2667 2667 find_another_logical(
2668 2668 mm_unit_t *un,
2669 2669 mm_submirror_t *esm,
2670 2670 diskaddr_t blk,
2671 2671 u_longlong_t cnt,
2672 2672 int must_be_open,
2673 2673 int state,
2674 2674 int err_cnt)
2675 2675 {
2676 2676 u_longlong_t cando;
2677 2677 md_dev64_t dev;
2678 2678 md_m_shared_t *s;
2679 2679
2680 2680 esm->sm_state |= SMS_IGNORE;
2681 2681 while (cnt != 0) {
2682 2682 u_longlong_t mcnt;
2683 2683
2684 2684 mcnt = MIN(cnt, lbtodb(1024 * 1024 * 1024)); /* 1 Gig Blks */
2685 2685
2686 2686 dev = select_read_unit(un, blk, mcnt, &cando,
2687 2687 must_be_open, &s, NULL);
2688 2688 if (dev == (md_dev64_t)0)
2689 2689 break;
2690 2690
2691 2691 if ((state == CS_LAST_ERRED) &&
2692 2692 (s->ms_state == CS_LAST_ERRED) &&
2693 2693 (err_cnt > s->ms_lasterrcnt))
2694 2694 break;
2695 2695
2696 2696 cnt -= cando;
2697 2697 blk += cando;
2698 2698 }
2699 2699 esm->sm_state &= ~SMS_IGNORE;
2700 2700 return (cnt != 0);
2701 2701 }
2702 2702
2703 2703 int
2704 2704 mirror_other_sources(mm_unit_t *un, int smi, int ci, int must_be_open)
2705 2705 {
2706 2706 mm_submirror_t *sm;
2707 2707 mm_submirror_ic_t *smic;
2708 2708 size_t count;
2709 2709 diskaddr_t block;
2710 2710 u_longlong_t skip;
2711 2711 u_longlong_t size;
2712 2712 md_dev64_t dev;
2713 2713 int cnt;
2714 2714 md_m_shared_t *s;
2715 2715 int not_found;
2716 2716
2717 2717 sm = &un->un_sm[smi];
2718 2718 smic = &un->un_smic[smi];
2719 2719 dev = sm->sm_dev;
2720 2720
2721 2721 /*
2722 2722 * Make sure every component of the submirror
2723 2723 * has other sources.
2724 2724 */
2725 2725 if (ci < 0) {
2726 2726 /* Find the highest lasterrcnt */
2727 2727 cnt = (*(smic->sm_get_component_count))(dev, sm);
2728 2728 for (ci = 0; ci < cnt; ci++) {
2729 2729 not_found = mirror_other_sources(un, smi, ci,
2730 2730 must_be_open);
2731 2731 if (not_found)
2732 2732 return (1);
2733 2733 }
2734 2734 return (0);
2735 2735 }
2736 2736
2737 2737 /*
2738 2738 * Make sure this component has other sources
2739 2739 */
2740 2740 (void) (*(smic->sm_get_bcss))
2741 2741 (dev, sm, ci, &block, &count, &skip, &size);
2742 2742
2743 2743 if (count == 0)
2744 2744 return (1);
2745 2745
2746 2746 s = (md_m_shared_t *)(*(smic->sm_shared_by_indx))(dev, sm, ci);
2747 2747
2748 2748 while (count--) {
2749 2749 if (block >= un->c.un_total_blocks)
2750 2750 return (0);
2751 2751
2752 2752 if ((block + size) > un->c.un_total_blocks)
2753 2753 size = un->c.un_total_blocks - block;
2754 2754
2755 2755 not_found = find_another_logical(un, sm, block, size,
2756 2756 must_be_open, s->ms_state, s->ms_lasterrcnt);
2757 2757 if (not_found)
2758 2758 return (1);
2759 2759
2760 2760 block += size + skip;
2761 2761 }
2762 2762 return (0);
2763 2763 }
2764 2764
2765 2765 static void
2766 2766 finish_error(md_mps_t *ps)
2767 2767 {
2768 2768 struct buf *pb;
2769 2769 mm_unit_t *un;
2770 2770 mdi_unit_t *ui;
2771 2771 uint_t new_str_flags;
2772 2772
2773 2773 pb = ps->ps_bp;
2774 2774 un = ps->ps_un;
2775 2775 ui = ps->ps_ui;
2776 2776
2777 2777 /*
2778 2778 * Must flag any error to the resync originator if we're performing
2779 2779 * a Write-after-Read. This corresponds to an i/o error on a resync
2780 2780 * target device and in this case we ought to abort the resync as there
2781 2781 * is nothing that can be done to recover from this without operator
2782 2782 * intervention. If we don't set the B_ERROR flag we will continue
2783 2783 * reading from the mirror but won't write to the target (as it will
2784 2784 * have been placed into an errored state).
2785 2785 * To handle the case of multiple components within a submirror we only
2786 2786 * set the B_ERROR bit if explicitly requested to via MD_MPS_FLAG_ERROR.
2787 2787 * The originator of the resync read will cause this bit to be set if
2788 2788 * the underlying component count is one for a submirror resync. All
2789 2789 * other resync types will have the flag set as there is no underlying
2790 2790 * resync which can be performed on a contained metadevice for these
2791 2791 * resync types (optimized or component).
2792 2792 */
2793 2793
2794 2794 if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ) {
2795 2795 if (ps->ps_flags & MD_MPS_FLAG_ERROR)
2796 2796 pb->b_flags |= B_ERROR;
2797 2797 md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
2798 2798 MPS_FREE(mirror_parent_cache, ps);
2799 2799 md_unit_readerexit(ui);
2800 2800 md_biodone(pb);
2801 2801 return;
2802 2802 }
2803 2803 /*
2804 2804 * Set the MD_IO_COUNTED flag as we are retrying the same I/O
2805 2805 	 * operation; therefore this I/O request has already been counted,
2806 2806 * the I/O count variable will be decremented by mirror_done()'s
2807 2807 * call to md_biodone().
2808 2808 */
2809 2809 if (ps->ps_changecnt != un->un_changecnt) {
2810 2810 new_str_flags = MD_STR_NOTTOP | MD_IO_COUNTED;
2811 2811 if (ps->ps_flags & MD_MPS_WOW)
2812 2812 new_str_flags |= MD_STR_WOW;
2813 2813 if (ps->ps_flags & MD_MPS_MAPPED)
2814 2814 new_str_flags |= MD_STR_MAPPED;
2815 2815 /*
2816 2816 * If this I/O request was a read that was part of a resync,
2817 2817 * set MD_STR_WAR for the retried read to ensure that the
2818 2818 * resync write (i.e. write-after-read) will be performed
2819 2819 */
2820 2820 if (ps->ps_flags & MD_MPS_RESYNC_READ)
2821 2821 new_str_flags |= MD_STR_WAR;
2822 2822 md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
2823 2823 MPS_FREE(mirror_parent_cache, ps);
2824 2824 md_unit_readerexit(ui);
2825 2825 (void) md_mirror_strategy(pb, new_str_flags, NULL);
2826 2826 return;
2827 2827 }
2828 2828
2829 2829 pb->b_flags |= B_ERROR;
2830 2830 md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
2831 2831 MPS_FREE(mirror_parent_cache, ps);
2832 2832 md_unit_readerexit(ui);
2833 2833 md_biodone(pb);
2834 2834 }
2835 2835
2836 2836 static void
2837 2837 error_update_unit(md_mps_t *ps)
2838 2838 {
2839 2839 mm_unit_t *un;
2840 2840 mdi_unit_t *ui;
2841 2841 int smi; /* sub mirror index */
2842 2842 int ci; /* errored component */
2843 2843 set_t setno;
2844 2844 uint_t flags; /* for set_sm_comp_state() */
2845 2845 uint_t hspflags; /* for check_comp_4_hotspares() */
2846 2846
2847 2847 ui = ps->ps_ui;
2848 2848 un = (mm_unit_t *)md_unit_writerlock(ui);
2849 2849 setno = MD_UN2SET(un);
2850 2850
2851 2851 	/* All of these updates have to be propagated in the case of a MN set */
2852 2852 flags = MD_STATE_XMIT;
2853 2853 hspflags = MD_HOTSPARE_XMIT;
2854 2854
2855 2855 /* special treatment if we are called during updating watermarks */
2856 2856 if (ps->ps_flags & MD_MPS_WMUPDATE) {
2857 2857 flags |= MD_STATE_WMUPDATE;
2858 2858 hspflags |= MD_HOTSPARE_WMUPDATE;
2859 2859 }
2860 2860 smi = 0;
2861 2861 ci = 0;
2862 2862 while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) {
2863 2863 if (mirror_other_sources(un, smi, ci, 0) == 1) {
2864 2864
2865 2865 /* Never called from ioctl context, so (IOLOCK *)NULL */
2866 2866 set_sm_comp_state(un, smi, ci, CS_LAST_ERRED, 0, flags,
2867 2867 (IOLOCK *)NULL);
2868 2868 /*
2869 2869 * For a MN set, the NOTIFY is done when the state
2870 2870 * change is processed on each node
2871 2871 */
2872 2872 if (!MD_MNSET_SETNO(MD_UN2SET(un))) {
2873 2873 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED,
2874 2874 SVM_TAG_METADEVICE, setno, MD_SID(un));
2875 2875 }
2876 2876 continue;
2877 2877 }
2878 2878 /* Never called from ioctl context, so (IOLOCK *)NULL */
2879 2879 set_sm_comp_state(un, smi, ci, CS_ERRED, 0, flags,
2880 2880 (IOLOCK *)NULL);
2881 2881 /*
2882 2882 * For a MN set, the NOTIFY is done when the state
2883 2883 * change is processed on each node
2884 2884 */
2885 2885 if (!MD_MNSET_SETNO(MD_UN2SET(un))) {
2886 2886 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
2887 2887 SVM_TAG_METADEVICE, setno, MD_SID(un));
2888 2888 }
2889 2889 smi = 0;
2890 2890 ci = 0;
2891 2891 }
2892 2892
2893 2893 md_unit_writerexit(ui);
2894 2894 if (MD_MNSET_SETNO(setno)) {
2895 2895 send_poke_hotspares(setno);
2896 2896 } else {
2897 2897 (void) poke_hotspares();
2898 2898 }
2899 2899 (void) md_unit_readerlock(ui);
2900 2900
2901 2901 finish_error(ps);
2902 2902 }
2903 2903
2904 2904 /*
2905 2905 * When we have a B_FAILFAST IO error on a Last Erred component we need to
2906 2906 * retry the IO without B_FAILFAST set so that we try to ensure that the
2907 2907 * component "sees" each IO.
2908 2908 */
2909 2909 static void
2910 2910 last_err_retry(md_mcs_t *cs)
2911 2911 {
2912 2912 struct buf *cb;
2913 2913 md_mps_t *ps;
2914 2914 uint_t flags;
2915 2915
2916 2916 cb = &cs->cs_buf;
2917 2917 cb->b_flags &= ~B_FAILFAST;
2918 2918
2919 2919 	/* if we're panicking just let this I/O error out */
2920 2920 if (panicstr) {
2921 2921 (void) mirror_done(cb);
2922 2922 return;
2923 2923 }
2924 2924
2925 2925 /* reissue the I/O */
2926 2926
2927 2927 ps = cs->cs_ps;
2928 2928
2929 2929 bioerror(cb, 0);
2930 2930
2931 2931 mutex_enter(&ps->ps_mx);
2932 2932
2933 2933 flags = MD_STR_NOTTOP;
2934 2934 if (ps->ps_flags & MD_MPS_MAPPED)
2935 2935 flags |= MD_STR_MAPPED;
2936 2936 if (ps->ps_flags & MD_MPS_NOBLOCK)
2937 2937 flags |= MD_NOBLOCK;
2938 2938
2939 2939 mutex_exit(&ps->ps_mx);
2940 2940
2941 2941 clear_retry_error(cb);
2942 2942
2943 2943 cmn_err(CE_NOTE, "!md: %s: Last Erred, retry I/O without B_FAILFAST",
2944 2944 md_shortname(getminor(cb->b_edev)));
2945 2945
2946 2946 md_call_strategy(cb, flags, NULL);
2947 2947 }
2948 2948
2949 2949 static void
2950 2950 mirror_error(md_mps_t *ps)
2951 2951 {
2952 2952 int smi; /* sub mirror index */
2953 2953 int ci; /* errored component */
2954 2954
2955 2955 if (panicstr) {
2956 2956 finish_error(ps);
2957 2957 return;
2958 2958 }
2959 2959
2960 2960 if (ps->ps_flags & MD_MPS_ON_OVERLAP)
2961 2961 mirror_overlap_tree_remove(ps);
2962 2962
2963 2963 smi = 0;
2964 2964 ci = 0;
2965 2965 if (mirror_geterror(ps->ps_un, &smi, &ci, 0, 0) != 0) {
2966 2966 md_unit_readerexit(ps->ps_ui);
2967 2967 daemon_request(&md_mstr_daemon, error_update_unit,
2968 2968 (daemon_queue_t *)ps, REQ_OLD);
2969 2969 return;
2970 2970 }
2971 2971
2972 2972 finish_error(ps);
2973 2973 }
2974 2974
2975 2975 static int
2976 2976 copy_write_done(struct buf *cb)
2977 2977 {
2978 2978 md_mps_t *ps;
2979 2979 buf_t *pb;
2980 2980 char *wowbuf;
2981 2981 wowhdr_t *wowhdr;
2982 2982 ssize_t wow_resid;
2983 2983
2984 2984 	/* get wowbuf and save structure */
2985 2985 wowbuf = cb->b_un.b_addr;
2986 2986 wowhdr = WOWBUF_HDR(wowbuf);
2987 2987 ps = wowhdr->wow_ps;
2988 2988 pb = ps->ps_bp;
2989 2989
2990 2990 /* Save error information, then free cb */
2991 2991 if (cb->b_flags & B_ERROR)
2992 2992 pb->b_flags |= B_ERROR;
2993 2993
2994 2994 if (cb->b_flags & B_REMAPPED)
2995 2995 bp_mapout(cb);
2996 2996
2997 2997 freerbuf(cb);
2998 2998
2999 2999 /* update residual and continue if needed */
3000 3000 if ((pb->b_flags & B_ERROR) == 0) {
3001 3001 wow_resid = pb->b_bcount - wowhdr->wow_offset;
3002 3002 pb->b_resid = wow_resid;
3003 3003 if (wow_resid > 0) {
3004 3004 daemon_request(&md_mstr_daemon, copy_write_cont,
3005 3005 (daemon_queue_t *)wowhdr, REQ_OLD);
3006 3006 return (1);
3007 3007 }
3008 3008 }
3009 3009
3010 3010 /* Write is complete, release resources. */
3011 3011 kmem_cache_free(mirror_wowblk_cache, wowhdr);
3012 3012 ASSERT(!(ps->ps_flags & MD_MPS_ON_OVERLAP));
3013 3013 md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
3014 3014 MPS_FREE(mirror_parent_cache, ps);
3015 3015 md_biodone(pb);
3016 3016 return (0);
3017 3017 }
3018 3018
3019 3019 static void
3020 3020 copy_write_cont(wowhdr_t *wowhdr)
3021 3021 {
3022 3022 buf_t *pb;
3023 3023 buf_t *cb;
3024 3024 char *wowbuf;
3025 3025 int wow_offset;
3026 3026 size_t wow_resid;
3027 3027 diskaddr_t wow_blkno;
3028 3028
3029 3029 wowbuf = WOWHDR_BUF(wowhdr);
3030 3030 pb = wowhdr->wow_ps->ps_bp;
3031 3031
3032 3032 /* get data on current location */
3033 3033 wow_offset = wowhdr->wow_offset;
3034 3034 wow_resid = pb->b_bcount - wow_offset;
3035 3035 wow_blkno = pb->b_lblkno + lbtodb(wow_offset);
3036 3036
3037 3037 /* setup child buffer */
3038 3038 cb = getrbuf(KM_SLEEP);
3039 3039 cb->b_flags = B_WRITE;
3040 3040 cb->b_edev = pb->b_edev;
3041 3041 cb->b_un.b_addr = wowbuf; /* change to point at WOWBUF */
3042 3042 cb->b_bufsize = md_wowbuf_size; /* change to wowbuf_size */
3043 3043 cb->b_iodone = copy_write_done;
3044 3044 cb->b_bcount = MIN(md_wowbuf_size, wow_resid);
3045 3045 cb->b_lblkno = wow_blkno;
3046 3046
3047 3047 /* move offset to next section */
3048 3048 wowhdr->wow_offset += cb->b_bcount;
3049 3049
3050 3050 /* copy and setup write for current section */
3051 3051 bcopy(&pb->b_un.b_addr[wow_offset], wowbuf, cb->b_bcount);
3052 3052
3053 3053 /* do it */
3054 3054 /*
3055 3055 * Do not set the MD_IO_COUNTED flag as this is a new I/O request
3056 3056 * that handles the WOW condition. The resultant increment on the
3057 3057 * I/O count variable is cleared by copy_write_done()'s call to
3058 3058 * md_biodone().
3059 3059 */
3060 3060 (void) md_mirror_strategy(cb, MD_STR_NOTTOP | MD_STR_WOW
3061 3061 | MD_STR_MAPPED, NULL);
3062 3062 }
3063 3063
3064 3064 static void
3065 3065 md_mirror_copy_write(md_mps_t *ps)
3066 3066 {
3067 3067 wowhdr_t *wowhdr;
3068 3068
3069 3069 wowhdr = kmem_cache_alloc(mirror_wowblk_cache, MD_ALLOCFLAGS);
3070 3070 mirror_wowblk_init(wowhdr);
3071 3071 wowhdr->wow_ps = ps;
3072 3072 wowhdr->wow_offset = 0;
3073 3073 copy_write_cont(wowhdr);
3074 3074 }
3075 3075
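copy_write_cont()/copy_write_done() above walk the parent write in md_wowbuf_size pieces, copying each piece into a private buffer before re-issuing it, so later modification of the caller's pages cannot change data already in flight. A user-level sketch of just the chunking arithmetic (chunk_copy() and CHUNK are hypothetical; the driver re-issues each piece through md_mirror_strategy() rather than copying in a loop):

#include <string.h>
#include <assert.h>

#define	CHUNK	16

static void
chunk_copy(const char *src, char *dst, size_t len)
{
	size_t off = 0;

	while (off < len) {
		size_t piece = (len - off < CHUNK) ? (len - off) : CHUNK;

		/* one "child write": copy the current piece, then advance */
		memcpy(dst + off, src + off, piece);
		off += piece;
	}
}

int
main(void)
{
	char src[40], dst[40];

	memset(src, 'x', sizeof (src));
	memset(dst, 0, sizeof (dst));
	chunk_copy(src, dst, sizeof (src));
	assert(memcmp(src, dst, sizeof (src)) == 0);
	return (0);
}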
3076 3076 static void
3077 3077 handle_wow(md_mps_t *ps)
3078 3078 {
3079 3079 buf_t *pb;
3080 3080
3081 3081 pb = ps->ps_bp;
3082 3082
3083 3083 bp_mapin(pb);
3084 3084
3085 3085 md_mirror_wow_cnt++;
3086 3086 if (!(pb->b_flags & B_PHYS) && (md_mirror_wow_flg & WOW_LOGIT)) {
3087 3087 cmn_err(CE_NOTE,
3088 3088 "md: %s, blk %lld, cnt %ld: Write on write %d occurred",
3089 3089 md_shortname(getminor(pb->b_edev)),
3090 3090 (longlong_t)pb->b_lblkno, pb->b_bcount, md_mirror_wow_cnt);
3091 3091 }
3092 3092
3093 3093 /*
3094 3094 * Set the MD_IO_COUNTED flag as we are retrying the same I/O
3095 3095 	 * operation; therefore this I/O request has already been counted,
3096 3096 * the I/O count variable will be decremented by mirror_done()'s
3097 3097 * call to md_biodone().
3098 3098 */
3099 3099 if (md_mirror_wow_flg & WOW_NOCOPY)
3100 3100 (void) md_mirror_strategy(pb, MD_STR_NOTTOP | MD_STR_WOW |
3101 3101 MD_STR_MAPPED | MD_IO_COUNTED, ps);
3102 3102 else
3103 3103 md_mirror_copy_write(ps);
3104 3104 }
3105 3105
3106 3106 /*
3107 3107 * Return true if the specified submirror is either in the Last Erred
3108 3108 * state or is transitioning into the Last Erred state.
3109 3109 */
3110 3110 static bool_t
3111 3111 submirror_is_lasterred(mm_unit_t *un, int smi)
3112 3112 {
3113 3113 mm_submirror_t *sm;
3114 3114 mm_submirror_ic_t *smic;
3115 3115 md_m_shared_t *shared;
3116 3116 int ci;
3117 3117 int compcnt;
3118 3118
3119 3119 sm = &un->un_sm[smi];
3120 3120 smic = &un->un_smic[smi];
3121 3121
3122 3122 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
3123 3123 for (ci = 0; ci < compcnt; ci++) {
3124 3124 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
3125 3125 (sm->sm_dev, sm, ci);
3126 3126
3127 3127 if (shared->ms_state == CS_LAST_ERRED)
3128 3128 return (B_TRUE);
3129 3129
3130 3130 /*
3131 3131 * It is not currently Last Erred, check if entering Last Erred.
3132 3132 */
3133 3133 if ((shared->ms_flags & MDM_S_IOERR) &&
3134 3134 ((shared->ms_state == CS_OKAY) ||
3135 3135 (shared->ms_state == CS_RESYNC))) {
3136 3136 if (mirror_other_sources(un, smi, ci, 0) == 1)
3137 3137 return (B_TRUE);
3138 3138 }
3139 3139 }
3140 3140
3141 3141 return (B_FALSE);
3142 3142 }
3143 3143
3144 3144
3145 3145 static int
3146 3146 mirror_done(struct buf *cb)
3147 3147 {
3148 3148 md_mps_t *ps;
3149 3149 md_mcs_t *cs;
3150 3150
3151 3151 /*LINTED*/
3152 3152 cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
3153 3153 ps = cs->cs_ps;
3154 3154
3155 3155 mutex_enter(&ps->ps_mx);
3156 3156
3157 3157 /* check if we need to retry an errored failfast I/O */
3158 3158 if (cb->b_flags & B_ERROR) {
3159 3159 struct buf *pb = ps->ps_bp;
3160 3160
3161 3161 if (cb->b_flags & B_FAILFAST) {
3162 3162 int i;
3163 3163 mm_unit_t *un = ps->ps_un;
3164 3164
3165 3165 for (i = 0; i < NMIRROR; i++) {
3166 3166 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
3167 3167 continue;
3168 3168
3169 3169 if (cb->b_edev ==
3170 3170 md_dev64_to_dev(un->un_sm[i].sm_dev)) {
3171 3171
3172 3172 /*
3173 3173 * This is the submirror that had the
3174 3174 * error. Check if it is Last Erred.
3175 3175 */
3176 3176 if (submirror_is_lasterred(un, i)) {
3177 3177 daemon_queue_t *dqp;
3178 3178
3179 3179 mutex_exit(&ps->ps_mx);
3180 3180 dqp = (daemon_queue_t *)cs;
3181 3181 dqp->dq_prev = NULL;
3182 3182 dqp->dq_next = NULL;
3183 3183 daemon_request(&md_done_daemon,
3184 3184 last_err_retry, dqp,
3185 3185 REQ_OLD);
3186 3186 return (1);
3187 3187 }
3188 3188 break;
3189 3189 }
3190 3190 }
3191 3191 }
3192 3192
3193 3193 /* continue to process the buf without doing a retry */
3194 3194 ps->ps_flags |= MD_MPS_ERROR;
3195 3195 pb->b_error = cb->b_error;
3196 3196 }
3197 3197
3198 3198 return (mirror_done_common(cb));
3199 3199 }
3200 3200
3201 3201 /*
3202 3202 * Split from the original mirror_done function so we can handle bufs after a
3203 3203 * retry.
3204 3204 * ps->ps_mx is already held in the caller of this function and the cb error
3205 3205 * has already been checked and handled in the caller.
3206 3206 */
3207 3207 static int
3208 3208 mirror_done_common(struct buf *cb)
3209 3209 {
3210 3210 struct buf *pb;
3211 3211 mm_unit_t *un;
3212 3212 mdi_unit_t *ui;
3213 3213 md_mps_t *ps;
3214 3214 md_mcs_t *cs;
3215 3215 size_t end_rr, start_rr, current_rr;
3216 3216
3217 3217 /*LINTED*/
3218 3218 cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
3219 3219 ps = cs->cs_ps;
3220 3220 pb = ps->ps_bp;
3221 3221
3222 3222 if (cb->b_flags & B_REMAPPED)
3223 3223 bp_mapout(cb);
3224 3224
3225 3225 ps->ps_frags--;
3226 3226 if (ps->ps_frags != 0) {
3227 3227 mutex_exit(&ps->ps_mx);
3228 3228 kmem_cache_free(mirror_child_cache, cs);
3229 3229 return (1);
3230 3230 }
3231 3231 un = ps->ps_un;
3232 3232 ui = ps->ps_ui;
3233 3233
3234 3234 /*
3235 3235 * Do not update outstanding_writes if we're running with ABR
3236 3236 * set for this mirror or the write() was issued with MD_STR_ABR set.
3237 3237 * Also a resync initiated write() has no outstanding_writes update
3238 3238 * either.
3239 3239 */
3240 3240 if (((cb->b_flags & B_READ) == 0) &&
3241 3241 (un->un_nsm >= 2) &&
3242 3242 (ps->ps_call == NULL) &&
3243 3243 !((ui->ui_tstate & MD_ABR_CAP) || (ps->ps_flags & MD_MPS_ABR)) &&
3244 3244 !(ps->ps_flags & MD_MPS_WRITE_AFTER_READ)) {
3245 3245 BLK_TO_RR(end_rr, ps->ps_lastblk, un);
3246 3246 BLK_TO_RR(start_rr, ps->ps_firstblk, un);
3247 3247 mutex_enter(&un->un_resync_mx);
3248 3248 for (current_rr = start_rr; current_rr <= end_rr; current_rr++)
3249 3249 un->un_outstanding_writes[current_rr]--;
3250 3250 mutex_exit(&un->un_resync_mx);
3251 3251 }
3252 3252 kmem_cache_free(mirror_child_cache, cs);
3253 3253 mutex_exit(&ps->ps_mx);
3254 3254
3255 3255 if (ps->ps_call != NULL) {
3256 3256 daemon_request(&md_done_daemon, ps->ps_call,
3257 3257 (daemon_queue_t *)ps, REQ_OLD);
3258 3258 return (1);
3259 3259 }
3260 3260
3261 3261 if ((ps->ps_flags & MD_MPS_ERROR)) {
3262 3262 daemon_request(&md_done_daemon, mirror_error,
3263 3263 (daemon_queue_t *)ps, REQ_OLD);
3264 3264 return (1);
3265 3265 }
3266 3266
3267 3267 if (ps->ps_flags & MD_MPS_ON_OVERLAP)
3268 3268 mirror_overlap_tree_remove(ps);
3269 3269
3270 3270 /*
3271 3271 * Handle Write-on-Write problem.
3272 3272 	 * Skip in the case of raw and direct I/O as they are
3273 3273 * handled earlier.
3274 3274 *
3275 3275 */
3276 3276 if (!(md_mirror_wow_flg & WOW_DISABLE) &&
3277 3277 !(pb->b_flags & B_READ) &&
3278 3278 !(ps->ps_flags & MD_MPS_WOW) &&
3279 3279 !(pb->b_flags & B_PHYS) &&
3280 3280 any_pages_dirty(pb)) {
3281 3281 md_unit_readerexit(ps->ps_ui);
3282 3282 daemon_request(&md_mstr_daemon, handle_wow,
3283 3283 (daemon_queue_t *)ps, REQ_OLD);
3284 3284 return (1);
3285 3285 }
3286 3286
3287 3287 md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
3288 3288 MPS_FREE(mirror_parent_cache, ps);
3289 3289 md_unit_readerexit(ui);
3290 3290 md_biodone(pb);
3291 3291 return (0);
3292 3292 }
3293 3293
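The loop above that decrements un_outstanding_writes visits every dirty region touched by the write, using BLK_TO_RR() to convert block numbers into region indices. A sketch of that bookkeeping under the assumption of a fixed region size of 1024 blocks (the real size is derived from the unit; blk_to_rr() and drop_outstanding() are hypothetical):

#include <assert.h>

#define	BLKS_PER_RR	1024ULL

static unsigned long long
blk_to_rr(unsigned long long blk)
{
	return (blk / BLKS_PER_RR);
}

/* On completion, drop the per-region counter for every region the write hit */
static void
drop_outstanding(unsigned short *outstanding, unsigned long long firstblk,
    unsigned long long lastblk)
{
	unsigned long long rr;

	for (rr = blk_to_rr(firstblk); rr <= blk_to_rr(lastblk); rr++)
		outstanding[rr]--;
}

int
main(void)
{
	unsigned short outstanding[8] = { 0 };

	outstanding[0] = outstanding[1] = 1;	/* write spanning blocks 0..2047 */
	drop_outstanding(outstanding, 0, 2047);
	assert(outstanding[0] == 0 && outstanding[1] == 0);
	return (0);
}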
3294 3294 /*
3295 3295 * Clear error state in submirror component if the retry worked after
3296 3296 * a failfast error.
3297 3297 */
3298 3298 static void
3299 3299 clear_retry_error(struct buf *cb)
3300 3300 {
3301 3301 int smi;
3302 3302 md_mcs_t *cs;
3303 3303 mm_unit_t *un;
3304 3304 mdi_unit_t *ui_sm;
3305 3305 mm_submirror_t *sm;
3306 3306 mm_submirror_ic_t *smic;
3307 3307 u_longlong_t cnt;
3308 3308 md_m_shared_t *shared;
3309 3309
3310 3310 /*LINTED*/
3311 3311 cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
3312 3312 un = cs->cs_ps->ps_un;
3313 3313
3314 3314 for (smi = 0; smi < NMIRROR; smi++) {
3315 3315 if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
3316 3316 continue;
3317 3317
3318 3318 if (cb->b_edev == md_dev64_to_dev(un->un_sm[smi].sm_dev))
3319 3319 break;
3320 3320 }
3321 3321
3322 3322 if (smi >= NMIRROR)
3323 3323 return;
3324 3324
3325 3325 sm = &un->un_sm[smi];
3326 3326 smic = &un->un_smic[smi];
3327 3327 cnt = cb->b_bcount;
3328 3328
3329 3329 ui_sm = MDI_UNIT(getminor(cb->b_edev));
3330 3330 (void) md_unit_writerlock(ui_sm);
3331 3331
3332 3332 shared = (md_m_shared_t *)(*(smic->sm_shared_by_blk))(sm->sm_dev, sm,
3333 3333 cb->b_blkno, &cnt);
3334 3334
3335 3335 if (shared->ms_flags & MDM_S_IOERR) {
3336 3336 shared->ms_flags &= ~MDM_S_IOERR;
3337 3337
3338 3338 } else {
3339 3339 /* the buf spans components and the first one is not erred */
3340 3340 int cnt;
3341 3341 int i;
3342 3342
3343 3343 cnt = (*(smic->sm_get_component_count))(sm->sm_dev, un);
3344 3344 for (i = 0; i < cnt; i++) {
3345 3345 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
3346 3346 (sm->sm_dev, sm, i);
3347 3347
3348 3348 if (shared->ms_flags & MDM_S_IOERR &&
3349 3349 shared->ms_state == CS_OKAY) {
3350 3350
3351 3351 shared->ms_flags &= ~MDM_S_IOERR;
3352 3352 break;
3353 3353 }
3354 3354 }
3355 3355 }
3356 3356
3357 3357 md_unit_writerexit(ui_sm);
3358 3358 }
3359 3359
3360 3360 static size_t
3361 3361 mirror_map_read(
3362 3362 md_mps_t *ps,
3363 3363 md_mcs_t *cs,
3364 3364 diskaddr_t blkno,
3365 3365 u_longlong_t count
3366 3366 )
3367 3367 {
3368 3368 mm_unit_t *un;
3369 3369 buf_t *bp;
3370 3370 u_longlong_t cando;
3371 3371
3372 3372 bp = &cs->cs_buf;
3373 3373 un = ps->ps_un;
3374 3374
3375 3375 bp->b_lblkno = blkno;
3376 3376 if (fast_select_read_unit(ps, cs) == 0) {
3377 3377 bp->b_bcount = ldbtob(count);
3378 3378 return (0);
3379 3379 }
3380 3380 bp->b_edev = md_dev64_to_dev(select_read_unit(un, blkno,
3381 3381 count, &cando, 0, NULL, cs));
3382 3382 bp->b_bcount = ldbtob(cando);
3383 3383 if (count != cando)
3384 3384 return (cando);
3385 3385 return (0);
3386 3386 }
3387 3387
3388 3388 static void
3389 3389 write_after_read(md_mps_t *ps)
3390 3390 {
3391 3391 struct buf *pb;
3392 3392 int flags;
3393 3393
3394 3394 if (ps->ps_flags & MD_MPS_ERROR) {
3395 3395 mirror_error(ps);
3396 3396 return;
3397 3397 }
3398 3398
3399 3399 pb = ps->ps_bp;
3400 3400 md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
3401 3401 ps->ps_call = NULL;
3402 3402 ps->ps_flags |= MD_MPS_WRITE_AFTER_READ;
3403 3403 flags = MD_STR_NOTTOP | MD_STR_WAR;
3404 3404 if (ps->ps_flags & MD_MPS_MAPPED)
3405 3405 flags |= MD_STR_MAPPED;
3406 3406 if (ps->ps_flags & MD_MPS_NOBLOCK)
3407 3407 flags |= MD_NOBLOCK;
3408 3408 if (ps->ps_flags & MD_MPS_DIRTY_RD)
3409 3409 flags |= MD_STR_DIRTY_RD;
3410 3410 (void) mirror_write_strategy(pb, flags, ps);
3411 3411 }
3412 3412
3413 3413 static void
3414 3414 continue_serial(md_mps_t *ps)
3415 3415 {
3416 3416 md_mcs_t *cs;
3417 3417 buf_t *cb;
3418 3418 mm_unit_t *un;
3419 3419 int flags;
3420 3420
3421 3421 un = ps->ps_un;
3422 3422 cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
3423 3423 mirror_child_init(cs);
3424 3424 cb = &cs->cs_buf;
3425 3425 ps->ps_call = NULL;
3426 3426 ps->ps_frags = 1;
3427 3427 (void) mirror_map_write(un, cs, ps, 0);
3428 3428 flags = MD_STR_NOTTOP;
3429 3429 if (ps->ps_flags & MD_MPS_MAPPED)
3430 3430 flags |= MD_STR_MAPPED;
3431 3431 md_call_strategy(cb, flags, NULL);
3432 3432 }
3433 3433
3434 3434 static int
3435 3435 mirror_map_write(mm_unit_t *un, md_mcs_t *cs, md_mps_t *ps, int war)
3436 3436 {
3437 3437 int i;
3438 3438 dev_t dev; /* needed for bioclone, so not md_dev64_t */
3439 3439 buf_t *cb;
3440 3440 buf_t *pb;
3441 3441 diskaddr_t blkno;
3442 3442 size_t bcount;
3443 3443 off_t offset;
3444 3444
3445 3445 pb = ps->ps_bp;
3446 3446 cb = &cs->cs_buf;
3447 3447 cs->cs_ps = ps;
3448 3448
3449 3449 i = md_find_nth_unit(ps->ps_writable_sm, ps->ps_current_sm);
3450 3450
3451 3451 dev = md_dev64_to_dev(un->un_sm[i].sm_dev);
3452 3452
3453 3453 blkno = pb->b_lblkno;
3454 3454 bcount = pb->b_bcount;
3455 3455 offset = 0;
3456 3456 if (war && (blkno == 0) && (un->c.un_flag & MD_LABELED)) {
3457 3457 blkno = DK_LABEL_LOC + 1;
3458 3458 /*
3459 3459 * This handles the case where we're requesting
3460 3460 * a write to block 0 on a label partition
3461 3461 * and the request size was smaller than the
3462 3462 * size of the label. If this is the case
3463 3463 * then we'll return -1. Failure to do so will
3464 3464 * either cause the calling thread to hang due to
3465 3465 		 * an ssd bug, or, worse, allow the bcount to go
3466 3466 		 * negative (i.e. wrap to a very large value).
3467 3467 */
3468 3468 if (bcount <= DEV_BSIZE*(DK_LABEL_LOC + 1))
3469 3469 return (-1);
3470 3470 bcount -= (DEV_BSIZE*(DK_LABEL_LOC + 1));
3471 3471 offset = (DEV_BSIZE*(DK_LABEL_LOC + 1));
3472 3472 }
3473 3473
3474 3474 cb = md_bioclone(pb, offset, bcount, dev, blkno, mirror_done,
3475 3475 cb, KM_NOSLEEP);
3476 3476 if (war)
3477 3477 cb->b_flags = (cb->b_flags & ~B_READ) | B_WRITE;
3478 3478
3479 3479 /*
3480 3480 	 * If the submirror is in the erred state, check if any component is
3481 3481 * in the Last Erred state. If so, we don't want to use the B_FAILFAST
3482 3482 * flag on the IO.
3483 3483 *
3484 3484 * Provide a fast path for the non-erred case (which should be the
3485 3485 * normal case).
3486 3486 */
3487 3487 if (un->un_sm[i].sm_flags & MD_SM_FAILFAST) {
3488 3488 if (un->un_sm[i].sm_state & SMS_COMP_ERRED) {
3489 3489 mm_submirror_t *sm;
3490 3490 mm_submirror_ic_t *smic;
3491 3491 int ci;
3492 3492 int compcnt;
3493 3493
3494 3494 sm = &un->un_sm[i];
3495 3495 smic = &un->un_smic[i];
3496 3496
3497 3497 compcnt = (*(smic->sm_get_component_count))
3498 3498 (sm->sm_dev, un);
3499 3499 for (ci = 0; ci < compcnt; ci++) {
3500 3500 md_m_shared_t *shared;
3501 3501
3502 3502 shared = (md_m_shared_t *)
3503 3503 (*(smic->sm_shared_by_indx))(sm->sm_dev,
3504 3504 sm, ci);
3505 3505
3506 3506 if (shared->ms_state == CS_LAST_ERRED)
3507 3507 break;
3508 3508 }
3509 3509 if (ci >= compcnt)
3510 3510 cb->b_flags |= B_FAILFAST;
3511 3511
3512 3512 } else {
3513 3513 cb->b_flags |= B_FAILFAST;
3514 3514 }
3515 3515 }
3516 3516
3517 3517 ps->ps_current_sm++;
3518 3518 if (ps->ps_current_sm != ps->ps_active_cnt) {
3519 3519 if (un->un_write_option == WR_SERIAL) {
3520 3520 ps->ps_call = continue_serial;
3521 3521 return (0);
3522 3522 }
3523 3523 return (1);
3524 3524 }
3525 3525 return (0);
3526 3526 }
3527 3527
3528 3528 /*
3529 3529 * directed_read_done:
3530 3530 * ------------------
3531 3531 * Completion routine called when a DMR request has been returned from the
3532 3532  * underlying driver. Wake up the original ioctl() and return the data to
3533 3533 * the user.
3534 3534 */
3535 3535 static void
3536 3536 directed_read_done(md_mps_t *ps)
3537 3537 {
3538 3538 mm_unit_t *un;
3539 3539 mdi_unit_t *ui;
3540 3540
3541 3541 un = ps->ps_un;
3542 3542 ui = ps->ps_ui;
3543 3543
3544 3544 md_unit_readerexit(ui);
3545 3545 md_kstat_done(ui, ps->ps_bp, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
3546 3546 ps->ps_call = NULL;
3547 3547
3548 3548 mutex_enter(&un->un_dmr_mx);
3549 3549 cv_signal(&un->un_dmr_cv);
3550 3550 mutex_exit(&un->un_dmr_mx);
3551 3551
3552 3552 /* release the parent structure */
3553 3553 kmem_cache_free(mirror_parent_cache, ps);
3554 3554 }
3555 3555
3556 3556 /*
3557 3557 * daemon_io:
3558 3558 * ------------
3559 3559  * Called to issue a mirror_write_strategy() or mirror_read_strategy()
3560 3560  * call from a blockable context. NOTE: no mutex can be held on entry to
3561 3561  * this routine.
3562 3562 */
3563 3563 static void
3564 3564 daemon_io(daemon_queue_t *dq)
3565 3565 {
3566 3566 md_mps_t *ps = (md_mps_t *)dq;
3567 3567 int flag = MD_STR_NOTTOP;
3568 3568 buf_t *pb = ps->ps_bp;
3569 3569
3570 3570 if (ps->ps_flags & MD_MPS_MAPPED)
3571 3571 flag |= MD_STR_MAPPED;
3572 3572 if (ps->ps_flags & MD_MPS_WOW)
3573 3573 flag |= MD_STR_WOW;
3574 3574 if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)
3575 3575 flag |= MD_STR_WAR;
3576 3576 if (ps->ps_flags & MD_MPS_ABR)
3577 3577 flag |= MD_STR_ABR;
3578 3578 if (ps->ps_flags & MD_MPS_BLOCKABLE_IO)
3579 3579 flag |= MD_STR_BLOCK_OK;
3580 3580
3581 3581 /*
3582 3582 	 * If this is a resync read (i.e. MD_STR_DIRTY_RD is not set), set
3583 3583 	 * MD_STR_WAR before calling mirror_read_strategy().
3584 3584 */
3585 3585 if (pb->b_flags & B_READ) {
3586 3586 if (!(ps->ps_flags & MD_MPS_DIRTY_RD))
3587 3587 flag |= MD_STR_WAR;
3588 3588 mirror_read_strategy(pb, flag, ps);
3589 3589 } else
3590 3590 mirror_write_strategy(pb, flag, ps);
3591 3591 }
3592 3592
3593 3593 /*
3594 3594 * update_resync:
3595 3595 * -------------
3596 3596 * Called to update the in-core version of the resync record with the latest
3597 3597 * version that was committed to disk when the previous mirror owner
3598 3598 * relinquished ownership. This call is likely to block as we must hold-off
3599 3599 * any current resync processing that may be occurring.
3600 3600 * On completion of the resync record update we issue the mirror_write_strategy
3601 3601 * call to complete the i/o that first started this sequence. To remove a race
3602 3602 * condition between a new write() request which is submitted and the resync
3603 3603 * record update we acquire the writerlock. This will hold off all i/o to the
3604 3604 * mirror until the resync update has completed.
3605 3605 * NOTE: no mutex can be held on entry to this routine
3606 3606 */
3607 3607 static void
3608 3608 update_resync(daemon_queue_t *dq)
3609 3609 {
3610 3610 md_mps_t *ps = (md_mps_t *)dq;
3611 3611 buf_t *pb = ps->ps_bp;
3612 3612 mdi_unit_t *ui = ps->ps_ui;
3613 3613 mm_unit_t *un = MD_UNIT(ui->ui_link.ln_id);
3614 3614 set_t setno;
3615 3615 int restart_resync;
3616 3616
3617 3617 mutex_enter(&un->un_rrp_inflight_mx);
3618 3618 (void) md_unit_writerlock(ui);
3619 3619 ps->ps_un = un;
3620 3620 setno = MD_MIN2SET(getminor(pb->b_edev));
3621 3621 if (mddb_reread_rr(setno, un->un_rr_dirty_recid) == 0) {
3622 3622 /*
3623 3623 * Synchronize our in-core view of what regions need to be
3624 3624 * resync'd with the on-disk version.
3625 3625 */
3626 3626 mirror_copy_rr(howmany(un->un_rrd_num, NBBY), un->un_resync_bm,
3627 3627 un->un_dirty_bm);
3628 3628
3629 3629 /* Region dirty map is now up to date */
3630 3630 }
3631 3631 restart_resync = (un->un_rs_thread_flags & MD_RI_BLOCK_OWNER) ? 1 : 0;
3632 3632 md_unit_writerexit(ui);
3633 3633 mutex_exit(&un->un_rrp_inflight_mx);
3634 3634
3635 3635 /* Restart the resync thread if it was previously blocked */
3636 3636 if (restart_resync) {
3637 3637 mutex_enter(&un->un_rs_thread_mx);
3638 3638 un->un_rs_thread_flags &= ~MD_RI_BLOCK_OWNER;
3639 3639 cv_signal(&un->un_rs_thread_cv);
3640 3640 mutex_exit(&un->un_rs_thread_mx);
3641 3641 }
3642 3642 /* Continue with original deferred i/o */
3643 3643 daemon_io(dq);
3644 3644 }
3645 3645
3646 3646 /*
3647 3647 * owner_timeout:
3648 3648 * -------------
3649 3649 * Called if the original mdmn_ksend_message() failed and the request is to be
3650 3650 * retried. Reattempt the original ownership change.
3651 3651 *
3652 3652 * NOTE: called at interrupt context (see timeout(9f)).
3653 3653 */
3654 3654 static void
3655 3655 owner_timeout(void *arg)
3656 3656 {
3657 3657 daemon_queue_t *dq = (daemon_queue_t *)arg;
3658 3658
3659 3659 daemon_request(&md_mirror_daemon, become_owner, dq, REQ_OLD);
3660 3660 }
3661 3661
3662 3662 /*
3663 3663 * become_owner:
3664 3664 * ------------
3665 3665 * Called to issue RPC request to become the owner of the mirror
3666 3666 * associated with this i/o request. We assume that the ownership request
3667 3667 * is synchronous, so if it succeeds we will issue the request via
3668 3668 * mirror_write_strategy().
3669 3669 * If multiple i/o's are outstanding we will be called from the mirror_daemon
3670 3670 * service thread.
3671 3671 * NOTE: no mutex should be held on entry to this routine.
3672 3672 */
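/*
 * In outline (see the code below): if we are already the owner, the i/o
 * is simply requeued to daemon_io() on the md_mirror_rs_daemon (resync
 * read) or md_mirror_io_daemon (write) queue.  Otherwise, if no
 * ownership request is outstanding, MD_MN_MSG_REQUIRE_OWNER is sent
 * synchronously; if we become the owner the i/o is requeued to
 * update_resync(), which rereads the resync record before reissuing it.
 * In every other case owner_timeout() is re-armed to retry after
 * md_mirror_owner_to usecs, and the i/o is failed with EIO once
 * MD_OWNER_RETRIES attempts (tracked in dq->qlen) have been made.
 */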
3673 3673 static void
3674 3674 become_owner(daemon_queue_t *dq)
3675 3675 {
3676 3676 md_mps_t *ps = (md_mps_t *)dq;
3677 3677 mm_unit_t *un = ps->ps_un;
3678 3678 buf_t *pb = ps->ps_bp;
3679 3679 set_t setno;
3680 3680 md_mn_kresult_t *kres;
3681 3681 int msg_flags = md_mirror_msg_flags;
3682 3682 md_mps_t *ps1;
3683 3683
3684 3684 ASSERT(dq->dq_next == NULL && dq->dq_prev == NULL);
3685 3685
3686 3686 /*
3687 3687 * If we're already the mirror owner we do not need to send a message
3688 3688 * but can simply process the i/o request immediately.
3689 3689 * If we've already sent the request to become owner we requeue the
3690 3690 * request as we're waiting for the synchronous ownership message to
3691 3691 * be processed.
3692 3692 */
3693 3693 if (MD_MN_MIRROR_OWNER(un)) {
3694 3694 /*
3695 3695 * As the strategy() call will potentially block we need to
3696 3696 * punt this to a separate thread and complete this request
3697 3697 * as quickly as possible. Note: if we're a read request
3698 3698 * this must be a resync, we cannot afford to be queued
3699 3699 * behind any intervening i/o requests. In this case we put the
3700 3700 * request on the md_mirror_rs_daemon queue.
3701 3701 */
3702 3702 if (pb->b_flags & B_READ) {
3703 3703 daemon_request(&md_mirror_rs_daemon, daemon_io, dq,
3704 3704 REQ_OLD);
3705 3705 } else {
3706 3706 daemon_request(&md_mirror_io_daemon, daemon_io, dq,
3707 3707 REQ_OLD);
3708 3708 }
3709 3709 } else {
3710 3710 mutex_enter(&un->un_owner_mx);
3711 3711 if ((un->un_owner_state & MM_MN_OWNER_SENT) == 0) {
3712 3712 md_mn_req_owner_t *msg;
3713 3713 int rval = 0;
3714 3714
3715 3715 /*
3716 3716 * Check to see that we haven't exceeded the maximum
3717 3717 			 * retry count. If we have, we fail the i/o as the
3718 3718 * comms mechanism has become wedged beyond recovery.
3719 3719 */
3720 3720 if (dq->qlen++ >= MD_OWNER_RETRIES) {
3721 3721 mutex_exit(&un->un_owner_mx);
3722 3722 cmn_err(CE_WARN,
3723 3723 "md_mirror: Request exhausted ownership "
3724 3724 "retry limit of %d attempts", dq->qlen);
3725 3725 pb->b_error = EIO;
3726 3726 pb->b_flags |= B_ERROR;
3727 3727 pb->b_resid = pb->b_bcount;
3728 3728 kmem_cache_free(mirror_parent_cache, ps);
3729 3729 md_biodone(pb);
3730 3730 return;
3731 3731 }
3732 3732
3733 3733 /*
3734 3734 * Issue request to change ownership. The call is
3735 3735 * synchronous so when it returns we can complete the
3736 3736 * i/o (if successful), or enqueue it again so that
3737 3737 * the operation will be retried.
3738 3738 */
3739 3739 un->un_owner_state |= MM_MN_OWNER_SENT;
3740 3740 mutex_exit(&un->un_owner_mx);
3741 3741
3742 3742 msg = kmem_zalloc(sizeof (md_mn_req_owner_t), KM_SLEEP);
3743 3743 setno = MD_MIN2SET(getminor(pb->b_edev));
3744 3744 msg->mnum = MD_SID(un);
3745 3745 msg->owner = md_mn_mynode_id;
3746 3746 msg_flags |= MD_MSGF_NO_LOG;
3747 3747 /*
3748 3748 * If this IO is triggered by updating a watermark,
3749 3749 * it might be issued by the creation of a softpartition
3750 3750 * while the commd subsystem is suspended.
3751 3751 * We don't want this message to block.
3752 3752 */
3753 3753 if (ps->ps_flags & MD_MPS_WMUPDATE) {
3754 3754 msg_flags |= MD_MSGF_OVERRIDE_SUSPEND;
3755 3755 }
3756 3756
3757 3757 kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
3758 3758 rval = mdmn_ksend_message(setno,
3759 3759 MD_MN_MSG_REQUIRE_OWNER, msg_flags, 0,
3760 3760 (char *)msg, sizeof (md_mn_req_owner_t), kres);
3761 3761
3762 3762 kmem_free(msg, sizeof (md_mn_req_owner_t));
3763 3763
3764 3764 if (MDMN_KSEND_MSG_OK(rval, kres)) {
3765 3765 dq->qlen = 0;
3766 3766 /*
3767 3767 * Successfully changed owner, reread the
3768 3768 * resync record so that we have a valid idea of
3769 3769 * any previously committed incomplete write()s.
3770 3770 * NOTE: As we need to acquire the resync mutex
3771 3771 * this may block, so we defer it to a separate
3772 3772 * thread handler. This makes us (effectively)
3773 3773 * non-blocking once the ownership message
3774 3774 * handling has completed.
3775 3775 */
3776 3776 mutex_enter(&un->un_owner_mx);
3777 3777 if (un->un_owner_state & MM_MN_BECOME_OWNER) {
3778 3778 un->un_mirror_owner = md_mn_mynode_id;
3779 3779 /* Sets owner of un_rr_dirty record */
3780 3780 if (un->un_rr_dirty_recid)
3781 3781 (void) mddb_setowner(
3782 3782 un->un_rr_dirty_recid,
3783 3783 md_mn_mynode_id);
3784 3784 un->un_owner_state &=
3785 3785 ~MM_MN_BECOME_OWNER;
3786 3786 /*
3787 3787 * Release the block on the current
3788 3788 * resync region if it is blocked
3789 3789 */
3790 3790 ps1 = un->un_rs_prev_overlap;
3791 3791 if ((ps1 != NULL) &&
3792 3792 (ps1->ps_flags & MD_MPS_ON_OVERLAP))
3793 3793 mirror_overlap_tree_remove(ps1);
3794 3794 mutex_exit(&un->un_owner_mx);
3795 3795
3796 3796 /*
3797 3797 * If we're a read, this must be a
3798 3798 * resync request, issue
3799 3799 * the i/o request on the
3800 3800 * md_mirror_rs_daemon queue. This is
3801 3801 * to avoid a deadlock between the
3802 3802 * resync_unit thread and
3803 3803 * subsequent i/o requests that may
3804 3804 * block on the resync region.
3805 3805 */
3806 3806 if (pb->b_flags & B_READ) {
3807 3807 daemon_request(
3808 3808 &md_mirror_rs_daemon,
3809 3809 update_resync, dq, REQ_OLD);
3810 3810 } else {
3811 3811 daemon_request(
3812 3812 &md_mirror_io_daemon,
3813 3813 update_resync, dq, REQ_OLD);
3814 3814 }
3815 3815 kmem_free(kres,
3816 3816 sizeof (md_mn_kresult_t));
3817 3817 return;
3818 3818 } else {
3819 3819 /*
3820 3820 * Some other node has beaten us to
3821 3821 * obtain ownership. We need to
3822 3822 * reschedule our ownership request
3823 3823 */
3824 3824 mutex_exit(&un->un_owner_mx);
3825 3825 }
3826 3826 } else {
3827 3827 mdmn_ksend_show_error(rval, kres,
3828 3828 "MD_MN_MSG_REQUIRE_OWNER");
3829 3829 /*
3830 3830 * Message transport failure is handled by the
3831 3831 * comms layer. If the ownership change request
3832 3832 * does not succeed we need to flag the error to
3833 3833 * the initiator of the i/o. This is handled by
3834 3834 * the retry logic above. As the request failed
3835 3835 * we do not know _who_ the owner of the mirror
3836 3836 * currently is. We reset our idea of the owner
3837 3837 * to None so that any further write()s will
3838 3838 * attempt to become the owner again. This stops
3839 3839 * multiple nodes writing to the same mirror
3840 3840 * simultaneously.
3841 3841 */
3842 3842 mutex_enter(&un->un_owner_mx);
3843 3843 un->un_owner_state &=
3844 3844 ~(MM_MN_OWNER_SENT|MM_MN_BECOME_OWNER);
3845 3845 un->un_mirror_owner = MD_MN_MIRROR_UNOWNED;
3846 3846 mutex_exit(&un->un_owner_mx);
3847 3847 }
3848 3848 kmem_free(kres, sizeof (md_mn_kresult_t));
3849 3849 } else
3850 3850 mutex_exit(&un->un_owner_mx);
3851 3851
3852 3852 /*
3853 3853 * Re-enqueue this request on the deferred i/o list. Delay the
3854 3854 * request for md_mirror_owner_to usecs to stop thrashing.
3855 3855 */
3856 3856 (void) timeout(owner_timeout, dq,
3857 3857 drv_usectohz(md_mirror_owner_to));
3858 3858 }
3859 3859 }
3860 3860
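/*
 * mirror_write_strategy:
 * ---------------------
 * Top-level write path for a mirror.  In outline: honour any multi-node
 * write-suspend, either by busy-waiting (MD_STR_BLOCK_OK) or by
 * deferring the request to daemon_io(); for a normal (non-resync) write
 * reserve the block range in the overlap tree; become the mirror owner
 * via become_owner() if a multi-node, resync-region mirror has no
 * owner; mark the resync region dirty unless this is a resync or ABR
 * write; hand raw/direct-i/o (B_PHYS) candidates to handle_wow(); and
 * finally clone one child buf per submirror with mirror_map_write() and
 * issue each via md_call_strategy().
 */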
3861 3861 static void
3862 3862 mirror_write_strategy(buf_t *pb, int flag, void *private)
3863 3863 {
3864 3864 md_mps_t *ps;
3865 3865 md_mcs_t *cs;
3866 3866 int more;
3867 3867 mm_unit_t *un;
3868 3868 mdi_unit_t *ui;
3869 3869 buf_t *cb; /* child buf pointer */
3870 3870 set_t setno;
3871 3871 int rs_on_overlap = 0;
3872 3872
3873 3873 ui = MDI_UNIT(getminor(pb->b_edev));
3874 3874 un = (mm_unit_t *)MD_UNIT(getminor(pb->b_edev));
3875 3875
3876 3876
3877 3877 md_kstat_waitq_enter(ui);
3878 3878
3879 3879 /*
3880 3880 * If a state change is in progress for this mirror in a MN set,
3881 3881 * suspend all non-resync writes until the state change is complete.
3882 3882 * The objective of this suspend is to ensure that it is not
3883 3883 * possible for one node to read data from a submirror that another node
3884 3884 * has not written to because of the state change. Therefore we
3885 3885 * suspend all writes until the state change has been made. As it is
3886 3886 * not possible to read from the target of a resync, there is no need
3887 3887 * to suspend resync writes.
3888 3888 * Note that we only block here if the caller can handle a busy-wait.
3889 3889 * The MD_STR_BLOCK_OK flag is set for daemon_io originated i/o only.
3890 3890 */
3891 3891
3892 3892 if (!(flag & MD_STR_WAR)) {
3893 3893 if (flag & MD_STR_BLOCK_OK) {
3894 3894 mutex_enter(&un->un_suspend_wr_mx);
3895 3895 while (un->un_suspend_wr_flag) {
3896 3896 cv_wait(&un->un_suspend_wr_cv,
3897 3897 &un->un_suspend_wr_mx);
3898 3898 }
3899 3899 mutex_exit(&un->un_suspend_wr_mx);
3900 3900 }
3901 3901 (void) md_unit_readerlock(ui);
3902 3902 }
3903 3903
3904 3904 if (!(flag & MD_STR_NOTTOP)) {
3905 3905 if (md_checkbuf(ui, (md_unit_t *)un, pb)) {
3906 3906 md_kstat_waitq_exit(ui);
3907 3907 return;
3908 3908 }
3909 3909 }
3910 3910
3911 3911 setno = MD_MIN2SET(getminor(pb->b_edev));
3912 3912
3913 3913 /* If an ABR write has been requested, set MD_STR_ABR flag */
3914 3914 if (MD_MNSET_SETNO(setno) && (pb->b_flags & B_ABRWRITE))
3915 3915 flag |= MD_STR_ABR;
3916 3916
3917 3917 if (private == NULL) {
3918 3918 ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS);
3919 3919 mirror_parent_init(ps);
3920 3920 } else {
3921 3921 ps = private;
3922 3922 private = NULL;
3923 3923 }
3924 3924 if (flag & MD_STR_MAPPED)
3925 3925 ps->ps_flags |= MD_MPS_MAPPED;
3926 3926
3927 3927 if (flag & MD_STR_WOW)
3928 3928 ps->ps_flags |= MD_MPS_WOW;
3929 3929
3930 3930 if (flag & MD_STR_ABR)
3931 3931 ps->ps_flags |= MD_MPS_ABR;
3932 3932
3933 3933 if (flag & MD_STR_WMUPDATE)
3934 3934 ps->ps_flags |= MD_MPS_WMUPDATE;
3935 3935
3936 3936 /*
3937 3937 * Save essential information from the original buffhdr
3938 3938 * in the md_save structure.
3939 3939 */
3940 3940 ps->ps_un = un;
3941 3941 ps->ps_ui = ui;
3942 3942 ps->ps_bp = pb;
3943 3943 ps->ps_addr = pb->b_un.b_addr;
3944 3944 ps->ps_firstblk = pb->b_lblkno;
3945 3945 ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1;
3946 3946 ps->ps_changecnt = un->un_changecnt;
3947 3947
3948 3948 /*
3949 3949 * Check for suspended writes here. This is where we can defer the
3950 3950 * write request to the daemon_io queue which will then call us with
3951 3951 * the MD_STR_BLOCK_OK flag set and we'll busy-wait (if necessary) at
3952 3952 * the top of this routine.
3953 3953 */
3954 3954 if (!(flag & MD_STR_WAR) && !(flag & MD_STR_BLOCK_OK)) {
3955 3955 mutex_enter(&un->un_suspend_wr_mx);
3956 3956 if (un->un_suspend_wr_flag) {
3957 3957 ps->ps_flags |= MD_MPS_BLOCKABLE_IO;
3958 3958 mutex_exit(&un->un_suspend_wr_mx);
3959 3959 md_unit_readerexit(ui);
3960 3960 daemon_request(&md_mirror_daemon, daemon_io,
3961 3961 (daemon_queue_t *)ps, REQ_OLD);
3962 3962 return;
3963 3963 }
3964 3964 mutex_exit(&un->un_suspend_wr_mx);
3965 3965 }
3966 3966
3967 3967 /*
3968 3968 * If not MN owner and this is an ABR write, make sure the current
3969 3969 * resync region is in the overlaps tree
3970 3970 */
3971 3971 mutex_enter(&un->un_owner_mx);
3972 3972 if (MD_MNSET_SETNO(setno) && (!(MD_MN_MIRROR_OWNER(un))) &&
3973 3973 ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) {
3974 3974 md_mps_t *ps1;
3975 3975 /* Block the current resync region, if not already blocked */
3976 3976 ps1 = un->un_rs_prev_overlap;
3977 3977
3978 3978 if ((ps1 != NULL) && ((ps1->ps_firstblk != 0) ||
3979 3979 (ps1->ps_lastblk != 0))) {
3980 3980 /* Drop locks to avoid deadlock */
3981 3981 mutex_exit(&un->un_owner_mx);
3982 3982 md_unit_readerexit(ui);
3983 3983 wait_for_overlaps(ps1, MD_OVERLAP_ALLOW_REPEAT);
3984 3984 rs_on_overlap = 1;
3985 3985 (void) md_unit_readerlock(ui);
3986 3986 mutex_enter(&un->un_owner_mx);
3987 3987 /*
3988 3988 * Check to see if we have obtained ownership
3989 3989 * while waiting for overlaps. If we have, remove
3990 3990 * the resync_region entry from the overlap tree
3991 3991 */
3992 3992 if (MD_MN_MIRROR_OWNER(un) &&
3993 3993 (ps1->ps_flags & MD_MPS_ON_OVERLAP)) {
3994 3994 mirror_overlap_tree_remove(ps1);
3995 3995 rs_on_overlap = 0;
3996 3996 }
3997 3997 }
3998 3998 }
3999 3999 mutex_exit(&un->un_owner_mx);
4000 4000
4001 4001
4002 4002 /*
4003 4003 	 * The following keeps a write-after-read from writing to the
4004 4004 	 * source in the case where it all came from one place.
4005 4005 */
4006 4006 if (flag & MD_STR_WAR) {
4007 4007 int abort_write = 0;
4008 4008 /*
4009 4009 		 * We are performing a write-after-read. This is either the
4010 4010 		 * result of a resync read or of a read in a dirty resync
4011 4011 		 * region when the optimized resync is not complete. If this
4012 4012 		 * is a MN set and a resync-generated i/o, and the current
4013 4013 		 * block is not in the current resync region, terminate the
4014 4014 		 * write as another node must have completed this resync
4015 4015 		 * region.
4016 4016 */
4017 4017 if ((MD_MNSET_SETNO(MD_UN2SET(un))) &&
4018 4018 (!(flag & MD_STR_DIRTY_RD))) {
4019 4019 if (!IN_RESYNC_REGION(un, ps))
4020 4020 abort_write = 1;
4021 4021 }
4022 4022 if ((select_write_after_read_units(un, ps) == 0) ||
4023 4023 (abort_write)) {
4024 4024 #ifdef DEBUG
4025 4025 if (mirror_debug_flag)
4026 4026 printf("Abort resync write on %x, block %lld\n",
4027 4027 MD_SID(un), ps->ps_firstblk);
4028 4028 #endif
4029 4029 if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4030 4030 mirror_overlap_tree_remove(ps);
4031 4031 kmem_cache_free(mirror_parent_cache, ps);
4032 4032 md_kstat_waitq_exit(ui);
4033 4033 md_unit_readerexit(ui);
4034 4034 md_biodone(pb);
4035 4035 return;
4036 4036 }
4037 4037 } else {
4038 4038 select_write_units(un, ps);
4039 4039
4040 4040 /* Drop readerlock to avoid deadlock */
4041 4041 md_unit_readerexit(ui);
4042 4042 wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT);
4043 4043 un = md_unit_readerlock(ui);
4044 4044 /*
4045 4045 * For a MN set with an ABR write, if we are now the
4046 4046 * owner and we have a resync region in the overlap
4047 4047 * tree, remove the entry from overlaps and retry the write.
4048 4048 */
4049 4049
4050 4050 if (MD_MNSET_SETNO(setno) &&
4051 4051 ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) {
4052 4052 mutex_enter(&un->un_owner_mx);
4053 4053 if (((MD_MN_MIRROR_OWNER(un))) && rs_on_overlap) {
4054 4054 mirror_overlap_tree_remove(ps);
4055 4055 md_kstat_waitq_exit(ui);
4056 4056 mutex_exit(&un->un_owner_mx);
4057 4057 md_unit_readerexit(ui);
4058 4058 daemon_request(&md_mirror_daemon, daemon_io,
4059 4059 (daemon_queue_t *)ps, REQ_OLD);
4060 4060 return;
4061 4061 }
4062 4062 mutex_exit(&un->un_owner_mx);
4063 4063 }
4064 4064 }
4065 4065
4066 4066 /*
4067 4067 * For Multinode mirrors with no owner and a Resync Region (not ABR)
4068 4068 * we need to become the mirror owner before continuing with the
4069 4069 * write(). For ABR mirrors we check that we 'own' the resync if
4070 4070 * we're in write-after-read mode. We do this _after_ ensuring that
4071 4071 * there are no overlaps to ensure that once we know that we are
4072 4072 * the owner, the readerlock will not be released until the write is
4073 4073 * complete. As a change of ownership in a MN set requires the
4074 4074 * writerlock, this ensures that ownership cannot be changed until
4075 4075 * the write is complete.
4076 4076 */
4077 4077 if (MD_MNSET_SETNO(setno) && (!((ui->ui_tstate & MD_ABR_CAP) ||
4078 4078 (flag & MD_STR_ABR)) || (flag & MD_STR_WAR))) {
4079 4079 if (MD_MN_NO_MIRROR_OWNER(un)) {
4080 4080 if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4081 4081 mirror_overlap_tree_remove(ps);
4082 4082 md_kstat_waitq_exit(ui);
4083 4083 ASSERT(!(flag & MD_STR_WAR));
4084 4084 md_unit_readerexit(ui);
4085 4085 daemon_request(&md_mirror_daemon, become_owner,
4086 4086 (daemon_queue_t *)ps, REQ_OLD);
4087 4087 return;
4088 4088 }
4089 4089 }
4090 4090
4091 4091 /*
4092 4092 * Mark resync region if mirror has a Resync Region _and_ we are not
4093 4093 * a resync initiated write(). Don't mark region if we're flagged as
4094 4094 * an ABR write.
4095 4095 */
4096 4096 if (!((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR)) &&
4097 4097 !(flag & MD_STR_WAR)) {
4098 4098 if (mirror_mark_resync_region(un, ps->ps_firstblk,
4099 4099 ps->ps_lastblk, md_mn_mynode_id)) {
4100 4100 pb->b_flags |= B_ERROR;
4101 4101 pb->b_resid = pb->b_bcount;
4102 4102 if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4103 4103 mirror_overlap_tree_remove(ps);
4104 4104 kmem_cache_free(mirror_parent_cache, ps);
4105 4105 md_kstat_waitq_exit(ui);
4106 4106 md_unit_readerexit(ui);
4107 4107 md_biodone(pb);
4108 4108 return;
4109 4109 }
4110 4110 }
4111 4111
4112 4112 ps->ps_childbflags = pb->b_flags | B_WRITE;
4113 4113 ps->ps_childbflags &= ~B_READ;
4114 4114 if (flag & MD_STR_MAPPED)
4115 4115 ps->ps_childbflags &= ~B_PAGEIO;
4116 4116
4117 4117 if (!(flag & MD_STR_NOTTOP) && panicstr)
4118 4118 /* Disable WOW and don't free ps */
4119 4119 ps->ps_flags |= (MD_MPS_WOW|MD_MPS_DONTFREE);
4120 4120
4121 4121 md_kstat_waitq_to_runq(ui);
4122 4122
4123 4123 /*
4124 4124 * Treat Raw and Direct I/O as Write-on-Write always
4125 4125 */
4126 4126
4127 4127 if (!(md_mirror_wow_flg & WOW_DISABLE) &&
4128 4128 (md_mirror_wow_flg & WOW_PHYS_ENABLE) &&
4129 4129 (pb->b_flags & B_PHYS) &&
4130 4130 !(ps->ps_flags & MD_MPS_WOW)) {
4131 4131 if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4132 4132 mirror_overlap_tree_remove(ps);
4133 4133 md_unit_readerexit(ui);
4134 4134 daemon_request(&md_mstr_daemon, handle_wow,
4135 4135 (daemon_queue_t *)ps, REQ_OLD);
4136 4136 return;
4137 4137 }
4138 4138
4139 4139 ps->ps_frags = 1;
4140 4140 do {
4141 4141 cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
4142 4142 mirror_child_init(cs);
4143 4143 cb = &cs->cs_buf;
4144 4144 more = mirror_map_write(un, cs, ps, (flag & MD_STR_WAR));
4145 4145
4146 4146 /*
4147 4147 * This handles the case where we're requesting
4148 4148 * a write to block 0 on a label partition. (more < 0)
4149 4149 * means that the request size was smaller than the
4150 4150 * size of the label. If so this request is done.
4151 4151 */
4152 4152 if (more < 0) {
4153 4153 if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4154 4154 mirror_overlap_tree_remove(ps);
4155 4155 md_kstat_runq_exit(ui);
4156 4156 kmem_cache_free(mirror_child_cache, cs);
4157 4157 kmem_cache_free(mirror_parent_cache, ps);
4158 4158 md_unit_readerexit(ui);
4159 4159 md_biodone(pb);
4160 4160 return;
4161 4161 }
4162 4162 if (more) {
4163 4163 mutex_enter(&ps->ps_mx);
4164 4164 ps->ps_frags++;
4165 4165 mutex_exit(&ps->ps_mx);
4166 4166 }
4167 4167 md_call_strategy(cb, flag, private);
4168 4168 } while (more);
4169 4169
4170 4170 if (!(flag & MD_STR_NOTTOP) && panicstr) {
4171 4171 while (!(ps->ps_flags & MD_MPS_DONE)) {
4172 4172 md_daemon(1, &md_done_daemon);
4173 4173 drv_usecwait(10);
4174 4174 }
4175 4175 kmem_cache_free(mirror_parent_cache, ps);
4176 4176 }
4177 4177 }
4178 4178
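/*
 * mirror_read_strategy:
 * --------------------
 * Top-level read path for a mirror.  In outline: decide whether the
 * read must be followed by a write-after-read (any resync-generated
 * read, or a read of a dirty region while an optimized resync is
 * outstanding) and, if so, reserve the block range in the overlap tree
 * and, for a multi-node set, ensure we are the mirror owner; route DMR
 * (DKIOCDMR) completions to directed_read_done(); then clone one child
 * buf per fragment with mirror_map_read() and issue each via
 * md_call_strategy().
 */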
4179 4179 static void
4180 4180 mirror_read_strategy(buf_t *pb, int flag, void *private)
4181 4181 {
4182 4182 md_mps_t *ps;
4183 4183 md_mcs_t *cs;
4184 4184 size_t more;
4185 4185 mm_unit_t *un;
4186 4186 mdi_unit_t *ui;
4187 4187 size_t current_count;
4188 4188 diskaddr_t current_blkno;
4189 4189 off_t current_offset;
4190 4190 buf_t *cb; /* child buf pointer */
4191 4191 set_t setno;
4192 4192
4193 4193 ui = MDI_UNIT(getminor(pb->b_edev));
4194 4194
4195 4195 md_kstat_waitq_enter(ui);
4196 4196
4197 4197 un = (mm_unit_t *)md_unit_readerlock(ui);
4198 4198
4199 4199 if (!(flag & MD_STR_NOTTOP)) {
4200 4200 if (md_checkbuf(ui, (md_unit_t *)un, pb)) {
4201 4201 md_kstat_waitq_exit(ui);
4202 4202 return;
4203 4203 }
4204 4204 }
4205 4205
4206 4206 if (private == NULL) {
4207 4207 ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS);
4208 4208 mirror_parent_init(ps);
4209 4209 } else {
4210 4210 ps = private;
4211 4211 private = NULL;
4212 4212 }
4213 4213
4214 4214 if (flag & MD_STR_MAPPED)
4215 4215 ps->ps_flags |= MD_MPS_MAPPED;
4216 4216 if (flag & MD_NOBLOCK)
4217 4217 ps->ps_flags |= MD_MPS_NOBLOCK;
4218 4218 if (flag & MD_STR_WMUPDATE)
4219 4219 ps->ps_flags |= MD_MPS_WMUPDATE;
4220 4220
4221 4221 /*
4222 4222 * Check to see if this is a DMR driven read. If so we need to use the
4223 4223 * specified side (in un->un_dmr_last_read) for the source of the data.
4224 4224 */
4225 4225 if (flag & MD_STR_DMR)
4226 4226 ps->ps_flags |= MD_MPS_DMR;
4227 4227
4228 4228 /*
4229 4229 * Save essential information from the original buffhdr
4230 4230 * in the md_save structure.
4231 4231 */
4232 4232 ps->ps_un = un;
4233 4233 ps->ps_ui = ui;
4234 4234 ps->ps_bp = pb;
4235 4235 ps->ps_addr = pb->b_un.b_addr;
4236 4236 ps->ps_firstblk = pb->b_lblkno;
4237 4237 ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1;
4238 4238 ps->ps_changecnt = un->un_changecnt;
4239 4239
4240 4240 current_count = btodb(pb->b_bcount);
4241 4241 current_blkno = pb->b_lblkno;
4242 4242 current_offset = 0;
4243 4243
4244 4244 /*
4245 4245 * If flag has MD_STR_WAR set this means that the read is issued by a
4246 4246 * resync thread which may or may not be an optimised resync.
4247 4247 *
4248 4248 * If MD_UN_OPT_NOT_DONE is set this means that the optimized resync
4249 4249 * code has not completed; either a resync has not started since snarf,
4250 4250 * or there is an optimized resync in progress.
4251 4251 *
4252 4252 * We need to generate a write after this read in the following two
4253 4253 * cases,
4254 4254 *
4255 4255 * 1. Any Resync-Generated read
4256 4256 *
4257 4257 * 2. Any read to a DIRTY REGION if there is an optimized resync
4258 4258 * pending or in progress.
4259 4259 *
4260 4260 * The write after read is done in these cases to ensure that all sides
4261 4261 * of the mirror are in sync with the read data and that it is not
4262 4262 * possible for an application to read the same block multiple times
4263 4263 * and get different data.
4264 4264 *
4265 4265 * This would be possible if the block was in a dirty region.
4266 4266 *
4267 4267 * If we're performing a directed read we don't write the data out as
4268 4268 * the application is responsible for restoring the mirror to a known
4269 4269 * state.
4270 4270 */
4271 4271 if (((MD_STATUS(un) & MD_UN_OPT_NOT_DONE) || (flag & MD_STR_WAR)) &&
4272 4272 !(flag & MD_STR_DMR)) {
4273 4273 size_t start_rr, i, end_rr;
4274 4274 int region_dirty = 1;
4275 4275
4276 4276 /*
4277 4277 * We enter here under three circumstances,
4278 4278 *
4279 4279 * MD_UN_OPT_NOT_DONE MD_STR_WAR
4280 4280 * 0 1
4281 4281 * 1 0
4282 4282 * 1 1
4283 4283 *
4284 4284 * To be optimal we only care to explicitly check for dirty
4285 4285 * regions in the second case since if MD_STR_WAR is set we
4286 4286 * always do the write after read.
4287 4287 */
4288 4288 if (!(flag & MD_STR_WAR)) {
4289 4289 BLK_TO_RR(end_rr, ps->ps_lastblk, un);
4290 4290 BLK_TO_RR(start_rr, ps->ps_firstblk, un);
4291 4291
4292 4292 for (i = start_rr; i <= end_rr; i++)
4293 4293 if ((region_dirty = IS_KEEPDIRTY(i, un)) != 0)
4294 4294 break;
4295 4295 }
4296 4296
4297 4297 if ((region_dirty) &&
4298 4298 !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) {
4299 4299 ps->ps_call = write_after_read;
4300 4300 /*
4301 4301 * Mark this as a RESYNC_READ in ps_flags.
4302 4302 * This is used if the read fails during a
4303 4303 * resync of a 3-way mirror to ensure that
4304 4304 * the retried read to the remaining
4305 4305 * good submirror has MD_STR_WAR set. This
4306 4306 * is needed to ensure that the resync write
4307 4307 * (write-after-read) takes place.
4308 4308 */
4309 4309 ps->ps_flags |= MD_MPS_RESYNC_READ;
4310 4310
4311 4311 /*
4312 4312 * If MD_STR_FLAG_ERR is set in the flags we
4313 4313 * set MD_MPS_FLAG_ERROR so that an error on the resync
4314 4314 * write (issued by write_after_read) will be flagged
4315 4315 * to the biowait'ing resync thread. This allows us to
4316 4316 * avoid issuing further resync requests to a device
4317 4317 * that has had a write failure.
4318 4318 */
4319 4319 if (flag & MD_STR_FLAG_ERR)
4320 4320 ps->ps_flags |= MD_MPS_FLAG_ERROR;
4321 4321
4322 4322 setno = MD_UN2SET(un);
4323 4323 /*
4324 4324 * Drop the readerlock to avoid
4325 4325 * deadlock
4326 4326 */
4327 4327 md_unit_readerexit(ui);
4328 4328 wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT);
4329 4329 un = md_unit_readerlock(ui);
4330 4330 /*
4331 4331 * Ensure that we are owner
4332 4332 */
4333 4333 if (MD_MNSET_SETNO(setno)) {
4334 4334 /*
4335 4335 * For a non-resync read that requires a
4336 4336 * write-after-read to be done, set a flag
4337 4337 * in the parent structure, so that the
4338 4338 * write_strategy routine can omit the
4339 4339 * test that the write is still within the
4340 4340 * resync region
4341 4341 */
4342 4342 if (!(flag & MD_STR_WAR))
4343 4343 ps->ps_flags |= MD_MPS_DIRTY_RD;
4344 4344
4345 4345 /*
4346 4346 * Before reading the buffer, see if
4347 4347 * there is an owner.
4348 4348 */
4349 4349 if (MD_MN_NO_MIRROR_OWNER(un)) {
4350 4350 ps->ps_call = NULL;
4351 4351 mirror_overlap_tree_remove(ps);
4352 4352 md_kstat_waitq_exit(ui);
4353 4353 md_unit_readerexit(ui);
4354 4354 daemon_request(
4355 4355 &md_mirror_daemon,
4356 4356 become_owner,
4357 4357 (daemon_queue_t *)ps,
4358 4358 REQ_OLD);
4359 4359 return;
4360 4360 }
4361 4361 /*
4362 4362 * For a resync read, check to see if I/O is
4363 4363 * outside of the current resync region, or
4364 4364 * the resync has finished. If so
4365 4365 * just terminate the I/O
4366 4366 */
4367 4367 if ((flag & MD_STR_WAR) &&
4368 4368 (!(un->c.un_status & MD_UN_WAR) ||
4369 4369 (!IN_RESYNC_REGION(un, ps)))) {
4370 4370 #ifdef DEBUG
4371 4371 if (mirror_debug_flag)
4372 4372 printf("Abort resync read "
4373 4373 "%x: %lld\n",
4374 4374 MD_SID(un),
4375 4375 ps->ps_firstblk);
4376 4376 #endif
4377 4377 mirror_overlap_tree_remove(ps);
4378 4378 kmem_cache_free(mirror_parent_cache,
4379 4379 ps);
4380 4380 md_kstat_waitq_exit(ui);
4381 4381 md_unit_readerexit(ui);
4382 4382 md_biodone(pb);
4383 4383 return;
4384 4384 }
4385 4385 }
4386 4386 }
4387 4387 }
4388 4388
4389 4389 if (flag & MD_STR_DMR) {
4390 4390 ps->ps_call = directed_read_done;
4391 4391 }
4392 4392
4393 4393 if (!(flag & MD_STR_NOTTOP) && panicstr)
4394 4394 ps->ps_flags |= MD_MPS_DONTFREE;
4395 4395
4396 4396 md_kstat_waitq_to_runq(ui);
4397 4397
4398 4398 ps->ps_frags++;
4399 4399 do {
4400 4400 cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
4401 4401 mirror_child_init(cs);
4402 4402 cb = &cs->cs_buf;
4403 4403 cs->cs_ps = ps;
4404 4404
4405 4405 cb = md_bioclone(pb, current_offset, current_count, NODEV,
4406 4406 current_blkno, mirror_done, cb, KM_NOSLEEP);
4407 4407
4408 4408 more = mirror_map_read(ps, cs, current_blkno,
4409 4409 (u_longlong_t)current_count);
4410 4410 if (more) {
4411 4411 mutex_enter(&ps->ps_mx);
4412 4412 ps->ps_frags++;
4413 4413 mutex_exit(&ps->ps_mx);
4414 4414 }
4415 4415
4416 4416 /*
4417 4417 * Do these calculations now,
4418 4418 		 * so that we pick up a valid b_bcount from the child bp.
4419 4419 */
4420 4420 current_count -= more;
4421 4421 current_offset += cb->b_bcount;
4422 4422 current_blkno += more;
4423 4423 md_call_strategy(cb, flag, private);
4424 4424 } while (more);
4425 4425
4426 4426 if (!(flag & MD_STR_NOTTOP) && panicstr) {
4427 4427 while (!(ps->ps_flags & MD_MPS_DONE)) {
4428 4428 md_daemon(1, &md_done_daemon);
4429 4429 drv_usecwait(10);
4430 4430 }
4431 4431 kmem_cache_free(mirror_parent_cache, ps);
4432 4432 }
4433 4433 }
4434 4434
4435 4435 void
4436 4436 md_mirror_strategy(buf_t *bp, int flag, void *private)
4437 4437 {
4438 4438 set_t setno = MD_MIN2SET(getminor(bp->b_edev));
4439 4439
4440 4440 /*
4441 4441 * When doing IO to a multi owner meta device, check if set is halted.
4442 4442 * We do this check without the needed lock held, for performance
4443 4443 * reasons.
4444 4444 * If an IO just slips through while the set is locked via an
4445 4445 * MD_MN_SUSPEND_SET, we don't care about it.
4446 4446 * Only check for suspension if we are a top-level i/o request
4447 4447 * (MD_STR_NOTTOP is cleared in 'flag').
4448 4448 */
4449 4449 if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) ==
4450 4450 (MD_SET_HALTED | MD_SET_MNSET)) {
4451 4451 if ((flag & MD_STR_NOTTOP) == 0) {
4452 4452 mutex_enter(&md_mx);
4453 4453 /* Here we loop until the set is no longer halted */
4454 4454 while (md_set[setno].s_status & MD_SET_HALTED) {
4455 4455 cv_wait(&md_cv, &md_mx);
4456 4456 }
4457 4457 mutex_exit(&md_mx);
4458 4458 }
4459 4459 }
4460 4460
4461 4461 if ((flag & MD_IO_COUNTED) == 0) {
4462 4462 if ((flag & MD_NOBLOCK) == 0) {
4463 4463 if (md_inc_iocount(setno) != 0) {
4464 4464 bp->b_flags |= B_ERROR;
4465 4465 bp->b_error = ENXIO;
4466 4466 bp->b_resid = bp->b_bcount;
4467 4467 biodone(bp);
4468 4468 return;
4469 4469 }
4470 4470 } else {
4471 4471 md_inc_iocount_noblock(setno);
4472 4472 }
4473 4473 }
4474 4474
4475 4475 if (bp->b_flags & B_READ)
4476 4476 mirror_read_strategy(bp, flag, private);
4477 4477 else
4478 4478 mirror_write_strategy(bp, flag, private);
4479 4479 }
4480 4480
4481 4481 /*
4482 4482 * mirror_directed_read:
4483 4483 * --------------------
4484 4484 * Entry-point for the DKIOCDMR ioctl. We issue a read to a specified sub-mirror
4485 4485 * so that the application can determine what (if any) resync needs to be
4486 4486 * performed. The data is copied out to the user-supplied buffer.
4487 4487 *
4488 4488 * Parameters:
4489 4489 * mdev - dev_t for the mirror device
4490 4490 * vdr - directed read parameters specifying location and submirror
4491 4491 * to perform the read from
4492 4492 * mode - used to ddi_copyout() any resulting data from the read
4493 4493 *
4494 4494 * Returns:
4495 4495 * 0 success
4496 4496 * !0 error code
4497 4497 * EINVAL - invalid request format
4498 4498 */
4499 4499 int
4500 4500 mirror_directed_read(dev_t mdev, vol_directed_rd_t *vdr, int mode)
4501 4501 {
4502 4502 buf_t *bp;
4503 4503 minor_t mnum = getminor(mdev);
4504 4504 mdi_unit_t *ui = MDI_UNIT(mnum);
4505 4505 mm_unit_t *un;
4506 4506 mm_submirror_t *sm;
4507 4507 char *sm_nm;
4508 4508 uint_t next_side;
4509 4509 void *kbuffer;
4510 4510
4511 4511 if (ui == NULL)
4512 4512 return (ENXIO);
4513 4513
4514 4514 if (!(vdr->vdr_flags & DKV_DMR_NEXT_SIDE)) {
4515 4515 return (EINVAL);
4516 4516 }
4517 4517
4518 4518 /* Check for aligned block access. We disallow non-aligned requests. */
4519 4519 if (vdr->vdr_offset % DEV_BSIZE) {
4520 4520 return (EINVAL);
4521 4521 }
4522 4522
4523 4523 /*
4524 4524 * Allocate kernel buffer for target of read(). If we had a reliable
4525 4525 * (sorry functional) DDI this wouldn't be needed.
4526 4526 */
4527 4527 kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP);
4528 4528 if (kbuffer == NULL) {
4529 4529 cmn_err(CE_WARN, "mirror_directed_read: couldn't allocate %lx"
4530 4530 " bytes\n", vdr->vdr_nbytes);
4531 4531 return (ENOMEM);
4532 4532 }
4533 4533
4534 4534 bp = getrbuf(KM_SLEEP);
4535 4535
4536 4536 bp->b_un.b_addr = kbuffer;
4537 4537 bp->b_flags = B_READ;
4538 4538 bp->b_bcount = vdr->vdr_nbytes;
4539 4539 bp->b_lblkno = lbtodb(vdr->vdr_offset);
4540 4540 bp->b_edev = mdev;
4541 4541
4542 4542 un = md_unit_readerlock(ui);
4543 4543
4544 4544 /*
4545 4545 * If DKV_SIDE_INIT is set we need to determine the first available
4546 4546 * side to start reading from. If it isn't set we increment to the
4547 4547 * next readable submirror.
4548 4548 * If there are no readable submirrors we error out with DKV_DMR_ERROR.
4549 4549 * Note: we check for a readable submirror on completion of the i/o so
4550 4550 * we should _always_ have one available. If this becomes unavailable
4551 4551 * we have missed the 'DKV_DMR_DONE' opportunity. This could happen if
4552 4552 * a metadetach is made between the completion of one DKIOCDMR ioctl
4553 4553 * and the start of the next (i.e. a sys-admin 'accident' occurred).
4554 4554 * The chance of this is small, but not non-existent.
4555 4555 */
4556 4556 if (vdr->vdr_side == DKV_SIDE_INIT) {
4557 4557 next_side = 0;
4558 4558 } else {
4559 4559 next_side = vdr->vdr_side + 1;
4560 4560 }
4561 4561 while ((next_side < NMIRROR) &&
4562 4562 !SUBMIRROR_IS_READABLE(un, next_side))
4563 4563 next_side++;
4564 4564 if (next_side >= NMIRROR) {
4565 4565 vdr->vdr_flags |= DKV_DMR_ERROR;
4566 4566 freerbuf(bp);
4567 4567 vdr->vdr_bytesread = 0;
4568 4568 md_unit_readerexit(ui);
4569 4569 return (0);
4570 4570 }
4571 4571
4572 4572 /* Set the side to read from */
4573 4573 un->un_dmr_last_read = next_side;
4574 4574
4575 4575 md_unit_readerexit(ui);
4576 4576
4577 4577 /*
4578 4578 * Save timestamp for verification purposes. Can be read by debugger
4579 4579 * to verify that this ioctl has been executed and to find the number
4580 4580 * of DMR reads and the time of the last DMR read.
4581 4581 */
4582 4582 uniqtime(&mirror_dmr_stats.dmr_timestamp);
4583 4583 mirror_dmr_stats.dmr_count++;
4584 4584
4585 4585 /* Issue READ request and wait for completion */
4586 4586 mirror_read_strategy(bp, MD_STR_DMR|MD_NOBLOCK|MD_STR_NOTTOP, NULL);
4587 4587
4588 4588 mutex_enter(&un->un_dmr_mx);
4589 4589 cv_wait(&un->un_dmr_cv, &un->un_dmr_mx);
4590 4590 mutex_exit(&un->un_dmr_mx);
4591 4591
4592 4592 /*
4593 4593 * Check to see if we encountered an error during the read. If so we
4594 4594 * can make no guarantee about any possibly returned data.
4595 4595 */
4596 4596 if ((bp->b_flags & B_ERROR) == 0) {
4597 4597 vdr->vdr_flags &= ~DKV_DMR_ERROR;
4598 4598 if (bp->b_resid) {
4599 4599 vdr->vdr_flags |= DKV_DMR_SHORT;
4600 4600 vdr->vdr_bytesread = vdr->vdr_nbytes - bp->b_resid;
4601 4601 } else {
4602 4602 vdr->vdr_flags |= DKV_DMR_SUCCESS;
4603 4603 vdr->vdr_bytesread = vdr->vdr_nbytes;
4604 4604 }
4605 4605 /* Copy the data read back out to the user supplied buffer */
4606 4606 if (ddi_copyout(kbuffer, vdr->vdr_data, vdr->vdr_bytesread,
4607 4607 mode)) {
4608 4608 kmem_free(kbuffer, vdr->vdr_nbytes);
4609 4609 return (EFAULT);
4610 4610 }
4611 4611
4612 4612 } else {
4613 4613 /* Error out with DKV_DMR_ERROR */
4614 4614 vdr->vdr_flags |= DKV_DMR_ERROR;
4615 4615 vdr->vdr_flags &= ~(DKV_DMR_SUCCESS|DKV_DMR_SHORT|DKV_DMR_DONE);
4616 4616 }
4617 4617 /*
4618 4618 * Update the DMR parameters with the side and name of submirror that
4619 4619 * we have just read from (un->un_dmr_last_read)
4620 4620 */
4621 4621 un = md_unit_readerlock(ui);
4622 4622
4623 4623 vdr->vdr_side = un->un_dmr_last_read;
4624 4624 sm = &un->un_sm[un->un_dmr_last_read];
4625 4625 sm_nm = md_shortname(md_getminor(sm->sm_dev));
4626 4626
4627 4627 (void) strncpy(vdr->vdr_side_name, sm_nm, sizeof (vdr->vdr_side_name));
4628 4628
4629 4629 /*
4630 4630 * Determine if we've completed the read cycle. This is true iff the
4631 4631 * next computed submirror (side) equals or exceeds NMIRROR. We cannot
4632 4632 * use un_nsm as we need to handle a sparse array of submirrors (which
4633 4633 * can occur if a submirror is metadetached).
4634 4634 */
4635 4635 next_side = un->un_dmr_last_read + 1;
4636 4636 while ((next_side < NMIRROR) &&
4637 4637 !SUBMIRROR_IS_READABLE(un, next_side))
4638 4638 next_side++;
4639 4639 if (next_side >= NMIRROR) {
4640 4640 /* We've finished */
4641 4641 vdr->vdr_flags |= DKV_DMR_DONE;
4642 4642 }
4643 4643
4644 4644 md_unit_readerexit(ui);
4645 4645 freerbuf(bp);
4646 4646 kmem_free(kbuffer, vdr->vdr_nbytes);
4647 4647
4648 4648 return (0);
4649 4649 }
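/*
 * For illustration only, a minimal sketch of how a userland caller
 * might drive DKIOCDMR across all submirrors; the file descriptor and
 * the header providing vol_directed_rd_t are assumptions, while the
 * fields and DKV_* flags are those handled above.  After each
 * successful call the first vdr_bytesread bytes of buf can be compared
 * against the data returned for the other sides (error handling
 * omitted):
 *
 *	vol_directed_rd_t vdr;
 *	char buf[DEV_BSIZE];
 *
 *	(void) memset(&vdr, 0, sizeof (vdr));
 *	vdr.vdr_flags = DKV_DMR_NEXT_SIDE;
 *	vdr.vdr_side = DKV_SIDE_INIT;
 *	vdr.vdr_offset = 0;
 *	vdr.vdr_nbytes = sizeof (buf);
 *	vdr.vdr_data = buf;
 *	do {
 *		if (ioctl(fd, DKIOCDMR, &vdr) != 0 ||
 *		    (vdr.vdr_flags & DKV_DMR_ERROR))
 *			break;
 *		vdr.vdr_flags &= ~(DKV_DMR_SUCCESS | DKV_DMR_SHORT);
 *		vdr.vdr_flags |= DKV_DMR_NEXT_SIDE;
 *	} while (!(vdr.vdr_flags & DKV_DMR_DONE));
 */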
4650 4650
4651 4651 /*
4652 4652 * mirror_resync_message:
4653 4653 * ---------------------
4654 4654 * Handle the multi-node resync messages that keep all nodes within a given
4655 4655 * disk-set in sync with their view of a mirror's resync status.
4656 4656 *
4657 4657 * The message types dealt with are:
4658 4658 * MD_MN_MSG_RESYNC_STARTING - start a resync thread for a unit
4659 4659  * MD_MN_MSG_RESYNC_NEXT - specifies the next region to be resynced
4660 4660 * MD_MN_MSG_RESYNC_FINISH - stop the resync thread for a unit
4661 4661 * MD_MN_MSG_RESYNC_PHASE_DONE - end of a resync phase, opt, submirror or comp
4662 4662 *
4663 4663 * Returns:
4664 4664 * 0 Success
4665 4665 * >0 Failure error number
4666 4666 */
4667 4667 int
4668 4668 mirror_resync_message(md_mn_rs_params_t *p, IOLOCK *lockp)
4669 4669 {
4670 4670 mdi_unit_t *ui;
4671 4671 mm_unit_t *un;
4672 4672 set_t setno;
4673 4673 int is_ABR;
4674 4674 int smi;
4675 4675 int ci;
4676 4676 sm_state_t state;
4677 4677 int broke_out;
4678 4678 mm_submirror_t *sm;
4679 4679 mm_submirror_ic_t *smic;
4680 4680 md_m_shared_t *shared;
4681 4681 md_error_t mde = mdnullerror;
4682 4682 md_mps_t *ps;
4683 4683 int rs_active;
4684 4684 int rr, rr_start, rr_end;
4685 4685
4686 4686 /* Check that the given device is part of a multi-node set */
4687 4687 setno = MD_MIN2SET(p->mnum);
4688 4688 if (setno >= md_nsets) {
4689 4689 return (ENXIO);
4690 4690 }
4691 4691 if (!MD_MNSET_SETNO(setno)) {
4692 4692 return (EINVAL);
4693 4693 }
4694 4694
4695 4695 if ((un = mirror_getun(p->mnum, &p->mde, NO_LOCK, NULL)) == NULL)
4696 4696 return (EINVAL);
4697 4697 if ((ui = MDI_UNIT(p->mnum)) == NULL)
4698 4698 return (EINVAL);
4699 4699 is_ABR = (ui->ui_tstate & MD_ABR_CAP);
4700 4700
4701 4701 /* Obtain the current resync status */
4702 4702 (void) md_ioctl_readerlock(lockp, ui);
4703 4703 rs_active = (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ? 1 : 0;
4704 4704 md_ioctl_readerexit(lockp);
4705 4705
4706 4706 switch ((md_mn_msgtype_t)p->msg_type) {
4707 4707 case MD_MN_MSG_RESYNC_STARTING:
4708 4708 /* Start the resync thread for the mirror */
4709 4709 (void) mirror_resync_unit(p->mnum, NULL, &p->mde, lockp);
4710 4710 break;
4711 4711
4712 4712 case MD_MN_MSG_RESYNC_NEXT:
4713 4713 /*
4714 4714 * We have to release any previously marked overlap regions
4715 4715 * so that i/o can resume. Then we need to block the region
4716 4716 		 * from [rs_start..rs_start+rs_size) so that no i/o is issued.
4717 4717 * Update un_rs_resync_done and un_rs_resync_2_do.
4718 4718 */
4719 4719 (void) md_ioctl_readerlock(lockp, ui);
4720 4720 /*
4721 4721 * Ignore the message if there is no active resync thread or
4722 4722 * if it is for a resync type that we have already completed.
4723 4723 * un_resync_completed is set to the last resync completed
4724 4724 * when processing a PHASE_DONE message.
4725 4725 */
4726 4726 if (!rs_active || (p->rs_type == un->un_resync_completed))
4727 4727 break;
4728 4728 /*
4729 4729 * If this message is for the same resync and is for an earlier
4730 4730 * resync region, just ignore it. This can only occur if this
4731 4731 * node has progressed on to the next resync region before
4732 4732 * we receive this message. This can occur if the class for
4733 4733 * this message is busy and the originator has to retry thus
4734 4734 * allowing this node to move onto the next resync_region.
4735 4735 */
4736 4736 if ((p->rs_type == un->un_rs_type) &&
4737 4737 (p->rs_start < un->un_resync_startbl))
4738 4738 break;
4739 4739 ps = un->un_rs_prev_overlap;
4740 4740
4741 4741 /* Allocate previous overlap reference if needed */
4742 4742 if (ps == NULL) {
4743 4743 ps = kmem_cache_alloc(mirror_parent_cache,
4744 4744 MD_ALLOCFLAGS);
4745 4745 ps->ps_un = un;
4746 4746 ps->ps_ui = ui;
4747 4747 ps->ps_firstblk = 0;
4748 4748 ps->ps_lastblk = 0;
4749 4749 ps->ps_flags = 0;
4750 4750 md_ioctl_readerexit(lockp);
4751 4751 (void) md_ioctl_writerlock(lockp, ui);
4752 4752 un->un_rs_prev_overlap = ps;
4753 4753 md_ioctl_writerexit(lockp);
4754 4754 } else
4755 4755 md_ioctl_readerexit(lockp);
4756 4756
4757 4757 if (p->rs_originator != md_mn_mynode_id) {
4758 4758 /*
4759 4759 * Clear our un_resync_bm for the regions completed.
4760 4760 * The owner (originator) will take care of itself.
4761 4761 */
4762 4762 BLK_TO_RR(rr_end, ps->ps_lastblk, un);
4763 4763 BLK_TO_RR(rr_start, p->rs_start, un);
4764 4764 if (ps->ps_lastblk && rr_end < rr_start) {
4765 4765 BLK_TO_RR(rr_start, ps->ps_firstblk, un);
4766 4766 mutex_enter(&un->un_resync_mx);
4767 4767 /*
4768 4768 * Update our resync bitmap to reflect that
4769 4769 * another node has synchronized this range.
4770 4770 */
4771 4771 for (rr = rr_start; rr <= rr_end; rr++) {
4772 4772 CLR_KEEPDIRTY(rr, un);
4773 4773 }
4774 4774 mutex_exit(&un->un_resync_mx);
4775 4775 }
4776 4776
4777 4777 /*
4778 4778 * On all but the originating node, first update
4779 4779 * the resync state, then unblock the previous
4780 4780 * region and block the next one. No need
4781 4781 * to do this if the region is already blocked.
4782 4782 * Update the submirror state and flags from the
4783 4783 * originator. This keeps the cluster in sync with
4784 4784 * regards to the resync status.
4785 4785 */
4786 4786
4787 4787 (void) md_ioctl_writerlock(lockp, ui);
4788 4788 un->un_rs_resync_done = p->rs_done;
4789 4789 un->un_rs_resync_2_do = p->rs_2_do;
4790 4790 un->un_rs_type = p->rs_type;
4791 4791 un->un_resync_startbl = p->rs_start;
4792 4792 md_ioctl_writerexit(lockp);
4793 4793 /*
4794 4794 * Use un_owner_mx to ensure that an ownership change
4795 4795 * cannot happen at the same time as this message
4796 4796 */
4797 4797 mutex_enter(&un->un_owner_mx);
4798 4798 if (MD_MN_MIRROR_OWNER(un)) {
4799 4799 ps->ps_firstblk = p->rs_start;
4800 4800 ps->ps_lastblk = ps->ps_firstblk +
4801 4801 p->rs_size - 1;
4802 4802 } else {
4803 4803 if ((ps->ps_firstblk != p->rs_start) ||
4804 4804 (ps->ps_lastblk != p->rs_start +
4805 4805 p->rs_size - 1)) {
4806 4806 /* Remove previous overlap range */
4807 4807 if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4808 4808 mirror_overlap_tree_remove(ps);
4809 4809
4810 4810 ps->ps_firstblk = p->rs_start;
4811 4811 ps->ps_lastblk = ps->ps_firstblk +
4812 4812 p->rs_size - 1;
4813 4813
4814 4814 mutex_exit(&un->un_owner_mx);
4815 4815 /* Block this range from all i/o. */
4816 4816 if (ps->ps_firstblk != 0 ||
4817 4817 ps->ps_lastblk != 0)
4818 4818 wait_for_overlaps(ps,
4819 4819 MD_OVERLAP_ALLOW_REPEAT);
4820 4820 mutex_enter(&un->un_owner_mx);
4821 4821 /*
4822 4822 * Check to see if we have obtained
4823 4823 * ownership while waiting for
4824 4824 * overlaps. If we have, remove
4825 4825 * the resync_region entry from the
4826 4826 * overlap tree
4827 4827 */
4828 4828 if (MD_MN_MIRROR_OWNER(un) &&
4829 4829 (ps->ps_flags & MD_MPS_ON_OVERLAP))
4830 4830 mirror_overlap_tree_remove(ps);
4831 4831 }
4832 4832 }
4833 4833 mutex_exit(&un->un_owner_mx);
4834 4834
4835 4835 /*
4836 4836 * If this is the first RESYNC_NEXT message (i.e.
4837 4837 * MD_MN_RS_FIRST_RESYNC_NEXT set in p->rs_flags),
4838 4838 * issue RESYNC_START NOTIFY event
4839 4839 */
4840 4840 if (p->rs_flags & MD_MN_RS_FIRST_RESYNC_NEXT) {
4841 4841 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_START,
4842 4842 SVM_TAG_METADEVICE, MD_UN2SET(un),
4843 4843 MD_SID(un));
4844 4844 }
4845 4845
4846 4846 /* Ensure that our local resync thread is running */
4847 4847 if (un->un_rs_thread == NULL) {
4848 4848 (void) mirror_resync_unit(p->mnum, NULL,
4849 4849 &p->mde, lockp);
4850 4850 }
4851 4851 }
4852 4852
4853 4853 break;
4854 4854 case MD_MN_MSG_RESYNC_FINISH:
4855 4855 /*
4856 4856 * Complete the resync by stopping the resync thread.
4857 4857 * Also release the previous overlap region field.
4858 4858 * Update the resync_progress_thread by cv_signal'ing it so
4859 4859 * that we mark the end of the resync as soon as possible. This
4860 4860 	 * avoids an unnecessary delay should we panic after resync
4861 4861 * completion.
4862 4862 */
4863 4863 #ifdef DEBUG
4864 4864 if (!rs_active) {
4865 4865 if (mirror_debug_flag)
4866 4866 printf("RESYNC_FINISH (mnum = %x), "
4867 4867 "Resync *NOT* active",
4868 4868 p->mnum);
4869 4869 }
4870 4870 #endif
4871 4871
4872 4872 if ((un->c.un_status & MD_UN_RESYNC_ACTIVE) &&
4873 4873 (p->rs_originator != md_mn_mynode_id)) {
4874 4874 mutex_enter(&un->un_rs_thread_mx);
4875 4875 un->c.un_status &= ~MD_UN_RESYNC_CANCEL;
4876 4876 un->un_rs_thread_flags |= MD_RI_SHUTDOWN;
4877 4877 un->un_rs_thread_flags &=
4878 4878 ~(MD_RI_BLOCK|MD_RI_BLOCK_OWNER);
4879 4879 cv_signal(&un->un_rs_thread_cv);
4880 4880 mutex_exit(&un->un_rs_thread_mx);
4881 4881 }
4882 4882 if (is_ABR) {
4883 4883 /* Resync finished, if ABR set owner to NULL */
4884 4884 mutex_enter(&un->un_owner_mx);
4885 4885 un->un_mirror_owner = 0;
4886 4886 mutex_exit(&un->un_owner_mx);
4887 4887 }
4888 4888 (void) md_ioctl_writerlock(lockp, ui);
4889 4889 ps = un->un_rs_prev_overlap;
4890 4890 if (ps != NULL) {
4891 4891 /* Remove previous overlap range */
4892 4892 if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4893 4893 mirror_overlap_tree_remove(ps);
4894 4894 /*
4895 4895 * Release the overlap range reference
4896 4896 */
4897 4897 un->un_rs_prev_overlap = NULL;
4898 4898 kmem_cache_free(mirror_parent_cache,
4899 4899 ps);
4900 4900 }
4901 4901 md_ioctl_writerexit(lockp);
4902 4902
4903 4903 /* Mark the resync as complete in the metadb */
4904 4904 un->un_rs_resync_done = p->rs_done;
4905 4905 un->un_rs_resync_2_do = p->rs_2_do;
4906 4906 un->un_rs_type = p->rs_type;
4907 4907 mutex_enter(&un->un_rs_progress_mx);
4908 4908 cv_signal(&un->un_rs_progress_cv);
4909 4909 mutex_exit(&un->un_rs_progress_mx);
4910 4910
4911 4911 un = md_ioctl_writerlock(lockp, ui);
4912 4912 un->c.un_status &= ~MD_UN_RESYNC_ACTIVE;
4913 4913 /* Deal with any pending grow_unit */
4914 4914 if (un->c.un_status & MD_UN_GROW_PENDING) {
4915 4915 if ((mirror_grow_unit(un, &mde) != 0) ||
4916 4916 (! mdismderror(&mde, MDE_GROW_DELAYED))) {
4917 4917 un->c.un_status &= ~MD_UN_GROW_PENDING;
4918 4918 }
4919 4919 }
4920 4920 md_ioctl_writerexit(lockp);
4921 4921 break;
4922 4922
4923 4923 case MD_MN_MSG_RESYNC_PHASE_DONE:
4924 4924 /*
4925 4925 	 * A phase of the resync (optimized, component or
4926 4926 	 * submirror) is complete. Update mirror status.
4927 4927 	 * If the flag CLEAR_OPT_NOT_DONE is set, it means that the
4928 4928 	 * mirror owner is performing a resync. If we have just snarfed
4929 4929 * this set, then we must clear any of the flags set at snarf
4930 4930 * time by unit_setup_resync().
4931 4931 * Note that unit_setup_resync() sets up these flags to
4932 4932 * indicate that an optimized resync is required. These flags
4933 4933 * need to be reset because if we get here, the mirror owner
4934 4934 * will have handled the optimized resync.
4935 4935 * The flags that must be cleared are MD_UN_OPT_NOT_DONE and
4936 4936 * MD_UN_WAR. In addition, for each submirror,
4937 4937 * MD_SM_RESYNC_TARGET must be cleared and SMS_OFFLINE_RESYNC
4938 4938 * set to SMS_OFFLINE.
4939 4939 */
4940 4940 #ifdef DEBUG
4941 4941 if (mirror_debug_flag)
4942 4942 printf("phase done mess received from %d, mnum=%x,"
4943 4943 "type=%x, flags=%x\n", p->rs_originator, p->mnum,
4944 4944 p->rs_type, p->rs_flags);
4945 4945 #endif
4946 4946 /*
4947 4947 * Ignore the message if there is no active resync thread.
4948 4948 */
4949 4949 if (!rs_active)
4950 4950 break;
4951 4951
4952 4952 broke_out = p->rs_flags & MD_MN_RS_ERR;
4953 4953 switch (RS_TYPE(p->rs_type)) {
4954 4954 case MD_RS_OPTIMIZED:
4955 4955 un = md_ioctl_writerlock(lockp, ui);
4956 4956 if (p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE) {
4957 4957 /* If we are originator, just clear rs_type */
4958 4958 if (p->rs_originator == md_mn_mynode_id) {
4959 4959 SET_RS_TYPE_NONE(un->un_rs_type);
4960 4960 md_ioctl_writerexit(lockp);
4961 4961 break;
4962 4962 }
4963 4963 /*
4964 4964 * If CLEAR_OPT_NOT_DONE is set, only clear the
4965 4965 * flags if OPT_NOT_DONE is set *and* rs_type
4966 4966 * is MD_RS_NONE.
4967 4967 */
4968 4968 if ((un->c.un_status & MD_UN_OPT_NOT_DONE) &&
4969 4969 (RS_TYPE(un->un_rs_type) == MD_RS_NONE)) {
4970 4970 /* No resync in progress */
4971 4971 un->c.un_status &= ~MD_UN_OPT_NOT_DONE;
4972 4972 un->c.un_status &= ~MD_UN_WAR;
4973 4973 } else {
4974 4974 /*
4975 4975 * We are in the middle of an
4976 4976 * optimized resync and this message
4977 4977 * should be ignored.
4978 4978 */
4979 4979 md_ioctl_writerexit(lockp);
4980 4980 break;
4981 4981 }
4982 4982 } else {
4983 4983 /*
4984 4984 * This is the end of an optimized resync,
4985 4985 * clear the OPT_NOT_DONE and OFFLINE_SM flags
4986 4986 */
4987 4987
4988 4988 un->c.un_status &= ~MD_UN_KEEP_DIRTY;
4989 4989 if (!broke_out)
4990 4990 un->c.un_status &= ~MD_UN_WAR;
4991 4991
4992 4992 /*
4993 4993 * Clear our un_resync_bm for the regions
4994 4994 * completed. The owner (originator) will
4995 4995 * take care of itself.
4996 4996 */
4997 4997 if (p->rs_originator != md_mn_mynode_id &&
4998 4998 (ps = un->un_rs_prev_overlap) != NULL) {
4999 4999 BLK_TO_RR(rr_start, ps->ps_firstblk,
5000 5000 un);
5001 5001 BLK_TO_RR(rr_end, ps->ps_lastblk, un);
5002 5002 mutex_enter(&un->un_resync_mx);
5003 5003 for (rr = rr_start; rr <= rr_end;
5004 5004 rr++) {
5005 5005 CLR_KEEPDIRTY(rr, un);
5006 5006 }
5007 5007 mutex_exit(&un->un_resync_mx);
5008 5008 }
5009 5009 }
5010 5010
5011 5011 /*
5012 5012 * Set resync_completed to last resync type and then
5013 5013 * clear resync_type to indicate no resync in progress
5014 5014 */
5015 5015 un->un_resync_completed = un->un_rs_type;
5016 5016 SET_RS_TYPE_NONE(un->un_rs_type);
5017 5017
5018 5018 /*
5019 5019 * If resync is as a result of a submirror ONLINE,
5020 5020 * reset the submirror state to SMS_RUNNING if the
5021 5021 * resync was ok else set back to SMS_OFFLINE.
5022 5022 */
5023 5023 for (smi = 0; smi < NMIRROR; smi++) {
5024 5024 un->un_sm[smi].sm_flags &=
5025 5025 ~MD_SM_RESYNC_TARGET;
5026 5026 if (SMS_BY_INDEX_IS(un, smi,
5027 5027 SMS_OFFLINE_RESYNC)) {
5028 5028 if (p->rs_flags &
5029 5029 MD_MN_RS_CLEAR_OPT_NOT_DONE) {
5030 5030 state = SMS_OFFLINE;
5031 5031 } else {
5032 5032 state = (broke_out ?
5033 5033 SMS_OFFLINE : SMS_RUNNING);
5034 5034 }
5035 5035 mirror_set_sm_state(
5036 5036 &un->un_sm[smi],
5037 5037 &un->un_smic[smi], state,
5038 5038 broke_out);
5039 5039 mirror_commit(un, NO_SUBMIRRORS,
5040 5040 0);
5041 5041 }
5042 5042 /*
5043 5043 * If we still have an offline submirror, reset
5044 5044 * the OFFLINE_SM flag in the mirror status
5045 5045 */
5046 5046 if (SMS_BY_INDEX_IS(un, smi,
5047 5047 SMS_OFFLINE))
5048 5048 un->c.un_status |=
5049 5049 MD_UN_OFFLINE_SM;
5050 5050 }
5051 5051 md_ioctl_writerexit(lockp);
5052 5052 break;
5053 5053 case MD_RS_SUBMIRROR:
5054 5054 un = md_ioctl_writerlock(lockp, ui);
5055 5055 smi = RS_SMI(p->rs_type);
5056 5056 sm = &un->un_sm[smi];
5057 5057 smic = &un->un_smic[smi];
5058 5058 /* Clear RESYNC target */
5059 5059 un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET;
5060 5060 /*
5061 5061 * Set resync_completed to last resync type and then
5062 5062 * clear resync_type to indicate no resync in progress
5063 5063 */
5064 5064 un->un_resync_completed = un->un_rs_type;
5065 5065 SET_RS_TYPE_NONE(un->un_rs_type);
5066 5066 /*
5067 5067 * If the resync completed ok reset the submirror
5068 5068 * state to SMS_RUNNING else reset it to SMS_ATTACHED
5069 5069 */
5070 5070 state = (broke_out ?
5071 5071 SMS_ATTACHED : SMS_RUNNING);
5072 5072 mirror_set_sm_state(sm, smic, state, broke_out);
5073 5073 un->c.un_status &= ~MD_UN_WAR;
5074 5074 mirror_commit(un, SMI2BIT(smi), 0);
5075 5075 md_ioctl_writerexit(lockp);
5076 5076 break;
5077 5077 case MD_RS_COMPONENT:
5078 5078 un = md_ioctl_writerlock(lockp, ui);
5079 5079 smi = RS_SMI(p->rs_type);
5080 5080 ci = RS_CI(p->rs_type);
5081 5081 sm = &un->un_sm[smi];
5082 5082 smic = &un->un_smic[smi];
5083 5083 shared = (md_m_shared_t *)
5084 5084 (*(smic->sm_shared_by_indx))
5085 5085 (sm->sm_dev, sm, ci);
5086 5086 un->c.un_status &= ~MD_UN_WAR;
5087 5087 /* Clear RESYNC target */
5088 5088 un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET;
5089 5089 /*
5090 5090 * Set resync_completed to last resync type and then
5091 5091 * clear resync_type to indicate no resync in progress
5092 5092 */
5093 5093 un->un_resync_completed = un->un_rs_type;
5094 5094 SET_RS_TYPE_NONE(un->un_rs_type);
5095 5095
5096 5096 /*
5097 5097 * If the resync completed ok, set the component state
5098 5098 * to CS_OKAY.
5099 5099 */
5100 5100 if (broke_out)
5101 5101 shared->ms_flags |= MDM_S_RS_TRIED;
5102 5102 else {
5103 5103 /*
5104 5104 * As we don't transmit the changes,
5105 5105 * no need to drop the lock.
5106 5106 */
5107 5107 set_sm_comp_state(un, smi, ci, CS_OKAY, 0,
5108 5108 MD_STATE_NO_XMIT, (IOLOCK *)NULL);
5109 5109 }
5110 5110 md_ioctl_writerexit(lockp);
5111 5111 default:
5112 5112 break;
5113 5113 }
5114 5114 /*
5115 5115 * If the purpose of this PHASE_DONE message is just to
5116 5116 * indicate to all other nodes that the optimized resync
5117 5117 * required (OPT_NOT_DONE) flag is to be cleared, there is
5118 5118 * no need to generate a notify event as there has not
5119 5119 * actually been a resync.
5120 5120 */
5121 5121 if (!(p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE)) {
5122 5122 if (broke_out) {
5123 5123 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_FAILED,
5124 5124 SVM_TAG_METADEVICE, MD_UN2SET(un),
5125 5125 MD_SID(un));
5126 5126 } else {
5127 5127 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_DONE,
5128 5128 SVM_TAG_METADEVICE, MD_UN2SET(un),
5129 5129 MD_SID(un));
5130 5130 }
5131 5131 }
5132 5132 break;
5133 5133
5134 5134 default:
5135 5135 #ifdef DEBUG
5136 5136 cmn_err(CE_PANIC, "mirror_resync_message: Unknown message type"
5137 5137 " %x\n", p->msg_type);
5138 5138 #endif
5139 5139 return (EINVAL);
5140 5140 }
5141 5141 return (0);
5142 5142 }
5143 5143
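[Editor's note] The KEEPDIRTY clearing in the RESYNC_PHASE_DONE case above walks the resync-region bitmap between the first and last block of the previous overlap request. As a rough, standalone illustration of that block-to-region arithmetic (a sketch only, not the driver's BLK_TO_RR/CLR_KEEPDIRTY macros from md_mirror.h; the region size and bitmap layout here are assumptions):

    #include <sys/types.h>

    /* Hypothetical model only -- not the md_mirror.h macros. */
    #define	RR_BLKS	1024			/* assumed blocks per resync region */

    /* Map a block number to its resync-region index. */
    static uint_t
    blk_to_rr(diskaddr_t blk)
    {
    	return ((uint_t)(blk / RR_BLKS));
    }

    /* Clear the "keep dirty" bit for every region covering [first, last]. */
    static void
    clear_keepdirty_range(uchar_t *bitmap, diskaddr_t first, diskaddr_t last)
    {
    	uint_t rr;

    	for (rr = blk_to_rr(first); rr <= blk_to_rr(last); rr++)
    		bitmap[rr >> 3] &= ~(1 << (rr & 7));
    }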
5144 5144	/* Return -1 if snarf of an optimized record failed and the set should be released */
5145 5145 static int
5146 5146 mirror_snarf(md_snarfcmd_t cmd, set_t setno)
5147 5147 {
5148 5148 mddb_recid_t recid;
5149 5149 int gotsomething;
5150 5150 int all_mirrors_gotten;
5151 5151 mm_unit_t *un;
5152 5152 mddb_type_t typ1;
5153 5153 mddb_de_ic_t *dep;
5154 5154 mddb_rb32_t *rbp;
5155 5155 size_t newreqsize;
5156 5156 mm_unit_t *big_un;
5157 5157 mm_unit32_od_t *small_un;
5158 5158 int retval;
5159 5159 mdi_unit_t *ui;
5160 5160
5161 5161 if (cmd == MD_SNARF_CLEANUP) {
5162 5162 if (md_get_setstatus(setno) & MD_SET_STALE)
5163 5163 return (0);
5164 5164
5165 5165 recid = mddb_makerecid(setno, 0);
5166 5166 typ1 = (mddb_type_t)md_getshared_key(setno,
5167 5167 mirror_md_ops.md_driver.md_drivername);
5168 5168 while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
5169 5169 if (mddb_getrecprivate(recid) & MD_PRV_CLEANUP) {
5170 5170 un = (mm_unit_t *)mddb_getrecaddr(recid);
5171 5171 mirror_cleanup(un);
5172 5172 recid = mddb_makerecid(setno, 0);
5173 5173 }
5174 5174 }
5175 5175 return (0);
5176 5176 }
5177 5177
5178 5178 all_mirrors_gotten = 1;
5179 5179 gotsomething = 0;
5180 5180
5181 5181 recid = mddb_makerecid(setno, 0);
5182 5182 typ1 = (mddb_type_t)md_getshared_key(setno,
5183 5183 mirror_md_ops.md_driver.md_drivername);
5184 5184
5185 5185 while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
5186 5186 if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
5187 5187 continue;
5188 5188
5189 5189 dep = mddb_getrecdep(recid);
5190 5190 dep->de_flags = MDDB_F_MIRROR;
5191 5191 rbp = dep->de_rb;
5192 5192
5193 5193 switch (rbp->rb_revision) {
5194 5194 case MDDB_REV_RB:
5195 5195 case MDDB_REV_RBFN:
5196 5196 if ((rbp->rb_private & MD_PRV_CONVD) == 0) {
5197 5197 /*
5198 5198 * This means, we have an old and small
5199 5199 * record and this record hasn't already
5200 5200 * been converted. Before we create an
5201 5201 * incore metadevice from this we have to
5202 5202 * convert it to a big record.
5203 5203 */
5204 5204 small_un =
5205 5205 (mm_unit32_od_t *)mddb_getrecaddr(recid);
5206 5206 newreqsize = sizeof (mm_unit_t);
5207 5207 big_un = (mm_unit_t *)kmem_zalloc(newreqsize,
5208 5208 KM_SLEEP);
5209 5209 mirror_convert((caddr_t)small_un,
5210 5210 (caddr_t)big_un, SMALL_2_BIG);
5211 5211 kmem_free(small_un, dep->de_reqsize);
5212 5212
5213 5213 /*
5214 5214				 * Update the userdata and incore userdata pointers;
5215 5215				 * the incore data is at the end of un.
5216 5216 */
5217 5217 dep->de_rb_userdata_ic = big_un;
5218 5218 dep->de_rb_userdata = big_un;
5219 5219 dep->de_icreqsize = newreqsize;
5220 5220 un = big_un;
5221 5221 rbp->rb_private |= MD_PRV_CONVD;
5222 5222 } else {
5223 5223 /*
5224 5224 * Unit already converted, just get the
5225 5225 * record address.
5226 5226 */
5227 5227 un = (mm_unit_t *)mddb_getrecaddr_resize(recid,
5228 5228 sizeof (*un), 0);
5229 5229 }
5230 5230 un->c.un_revision &= ~MD_64BIT_META_DEV;
5231 5231 break;
5232 5232 case MDDB_REV_RB64:
5233 5233 case MDDB_REV_RB64FN:
5234 5234 /* Big device */
5235 5235 un = (mm_unit_t *)mddb_getrecaddr_resize(recid,
5236 5236 sizeof (*un), 0);
5237 5237 un->c.un_revision |= MD_64BIT_META_DEV;
5238 5238 un->c.un_flag |= MD_EFILABEL;
5239 5239 break;
5240 5240 }
5241 5241 MDDB_NOTE_FN(rbp->rb_revision, un->c.un_revision);
5242 5242
5243 5243 /*
5244 5244 * Create minor device node for snarfed entry.
5245 5245 */
5246 5246 (void) md_create_minor_node(setno, MD_SID(un));
5247 5247
5248 5248 if (MD_UNIT(MD_SID(un)) != NULL) {
5249 5249 mddb_setrecprivate(recid, MD_PRV_PENDDEL);
5250 5250 continue;
5251 5251 }
5252 5252 all_mirrors_gotten = 0;
5253 5253 retval = mirror_build_incore(un, 1);
5254 5254 if (retval == 0) {
5255 5255 mddb_setrecprivate(recid, MD_PRV_GOTIT);
5256 5256 md_create_unit_incore(MD_SID(un), &mirror_md_ops, 0);
5257 5257 resync_start_timeout(setno);
5258 5258 gotsomething = 1;
5259 5259 } else {
5260 5260 return (retval);
5261 5261 }
5262 5262 /*
5263 5263 * Set flag to indicate that the mirror has not yet
5264 5264 * been through a reconfig. This flag is used for MN sets
5265 5265 * when determining whether to update the mirror state from
5266 5266 * the Master node.
5267 5267 */
5268 5268 if (MD_MNSET_SETNO(setno)) {
5269 5269 ui = MDI_UNIT(MD_SID(un));
5270 5270 ui->ui_tstate |= MD_RESYNC_NOT_DONE;
5271 5271 }
5272 5272 }
5273 5273
5274 5274 if (!all_mirrors_gotten)
5275 5275 return (gotsomething);
5276 5276
5277 5277 recid = mddb_makerecid(setno, 0);
5278 5278 while ((recid = mddb_getnextrec(recid, typ1, RESYNC_REC)) > 0)
5279 5279 if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
5280 5280 mddb_setrecprivate(recid, MD_PRV_PENDDEL);
5281 5281
5282 5282 return (0);
5283 5283 }
5284 5284
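[Editor's note] The SMALL_2_BIG path in mirror_snarf() follows a common snarf-time pattern: allocate the larger in-core structure, convert the 32-bit on-disk record into it, free the old buffer, and repoint the directory entry at the new copy. A simplified, hypothetical sketch of that pattern (with invented small_rec_t/big_rec_t types standing in for mm_unit32_od_t/mm_unit_t, and no directory-entry bookkeeping):

    #include <sys/types.h>
    #include <sys/kmem.h>

    /* Hypothetical record layouts, not the real unit structures. */
    typedef struct small_rec { uint32_t sr_total_blocks; } small_rec_t;
    typedef struct big_rec { uint64_t br_total_blocks; } big_rec_t;

    /*
     * Convert a small on-disk record into its large in-core form:
     * allocate the big structure, widen the fields, free the old copy.
     */
    static big_rec_t *
    convert_small_to_big(small_rec_t *small, size_t small_size)
    {
    	big_rec_t *big = kmem_zalloc(sizeof (big_rec_t), KM_SLEEP);

    	big->br_total_blocks = small->sr_total_blocks;
    	kmem_free(small, small_size);
    	return (big);
    }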
5285 5285 static int
5286 5286 mirror_halt(md_haltcmd_t cmd, set_t setno)
5287 5287 {
5288 5288 unit_t i;
5289 5289 mdi_unit_t *ui;
5290 5290 minor_t mnum;
5291 5291 int reset_mirror_flag = 0;
5292 5292
5293 5293 if (cmd == MD_HALT_CLOSE)
5294 5294 return (0);
5295 5295
5296 5296 if (cmd == MD_HALT_OPEN)
5297 5297 return (0);
5298 5298
5299 5299 if (cmd == MD_HALT_UNLOAD)
5300 5300 return (0);
5301 5301
5302 5302 if (cmd == MD_HALT_CHECK) {
5303 5303 for (i = 0; i < md_nunits; i++) {
5304 5304 mnum = MD_MKMIN(setno, i);
5305 5305 if ((ui = MDI_UNIT(mnum)) == NULL)
5306 5306 continue;
5307 5307 if (ui->ui_opsindex != mirror_md_ops.md_selfindex)
5308 5308 continue;
5309 5309 if (md_unit_isopen(ui))
5310 5310 return (1);
5311 5311 }
5312 5312 return (0);
5313 5313 }
5314 5314
5315 5315 if (cmd != MD_HALT_DOIT)
5316 5316 return (1);
5317 5317
5318 5318 for (i = 0; i < md_nunits; i++) {
5319 5319 mnum = MD_MKMIN(setno, i);
5320 5320 if ((ui = MDI_UNIT(mnum)) == NULL)
5321 5321 continue;
5322 5322 if (ui->ui_opsindex != mirror_md_ops.md_selfindex)
5323 5323 continue;
5324 5324 reset_mirror((mm_unit_t *)MD_UNIT(mnum), mnum, 0);
5325 5325
5326 5326 /* Set a flag if there is at least one mirror metadevice. */
5327 5327 reset_mirror_flag = 1;
5328 5328 }
5329 5329
5330 5330 /*
5331 5331 * Only wait for the global dr_timeout to finish
5332 5332 * - if there are mirror metadevices in this diskset or
5333 5333 * - if this is the local set since an unload of the md_mirror
5334 5334 * driver could follow a successful mirror halt in the local set.
5335 5335 */
5336 5336 if ((reset_mirror_flag != 0) || (setno == MD_LOCAL_SET)) {
5337 5337 while ((mirror_md_ops.md_head == NULL) &&
5338 5338 (mirror_timeout.dr_timeout_id != 0))
5339 5339 delay(md_hz);
5340 5340 }
5341 5341
5342 5342 return (0);
5343 5343 }
5344 5344
5345 5345 /*ARGSUSED3*/
5346 5346 static int
5347 5347 mirror_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags)
5348 5348 {
5349 5349 IOLOCK lock;
5350 5350 minor_t mnum = getminor(*dev);
5351 5351 set_t setno;
5352 5352
5353 5353 /*
5354 5354 * When doing an open of a multi owner metadevice, check to see if this
5355 5355 * node is a starting node and if a reconfig cycle is underway.
5356 5356	 * If so, the system isn't sufficiently set up to handle the
5357 5357 * open (which involves I/O during sp_validate), so fail with ENXIO.
5358 5358 */
5359 5359 setno = MD_MIN2SET(mnum);
5360 5360 if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) ==
5361 5361 (MD_SET_MNSET | MD_SET_MN_START_RC)) {
5362 5362 return (ENXIO);
5363 5363 }
5364 5364
5365 5365 if (md_oflags & MD_OFLG_FROMIOCTL) {
5366 5366 /*
5367 5367 * This indicates that the caller is an ioctl service routine.
5368 5368 * In this case we initialise our stack-based IOLOCK and pass
5369 5369 * this into the internal open routine. This allows multi-owner
5370 5370 * metadevices to avoid deadlocking if an error is encountered
5371 5371 * during the open() attempt. The failure case is:
5372 5372 * s-p -> mirror -> s-p (with error). Attempting to metaclear
5373 5373 * this configuration would deadlock as the mirror code has to
5374 5374 * send a state-update to the other nodes when it detects the
5375 5375 * failure of the underlying submirror with an errored soft-part
5376 5376 * on it. As there is a class1 message in progress (metaclear)
5377 5377 * set_sm_comp_state() cannot send another class1 message;
5378 5378 * instead we do not send a state_update message as the
5379 5379 * metaclear is distributed and the failed submirror will be
5380 5380 * cleared from the configuration by the metaclear.
5381 5381 */
5382 5382 IOLOCK_INIT(&lock);
5383 5383 return (mirror_internal_open(getminor(*dev), flag, otyp,
5384 5384 md_oflags, &lock));
5385 5385 } else {
5386 5386 return (mirror_internal_open(getminor(*dev), flag, otyp,
5387 5387 md_oflags, (IOLOCK *)NULL));
5388 5388 }
5389 5389 }
5390 5390
5391 5391
5392 5392 /*ARGSUSED1*/
5393 5393 static int
5394 5394 mirror_close(dev_t dev, int flag, int otyp, cred_t *cred_p, int md_cflags)
5395 5395 {
5396 5396 return (mirror_internal_close(getminor(dev), otyp, md_cflags,
5397 5397 (IOLOCK *)NULL));
5398 5398 }
5399 5399
5400 5400
5401 5401 /*
5402 5402 * This routine dumps memory to the disk. It assumes that the memory has
5403 5403 * already been mapped into mainbus space. It is called at disk interrupt
5404 5404 * priority when the system is in trouble.
5405 5405 *
5406 5406 */
5407 5407 static int
5408 5408 mirror_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
5409 5409 {
5410 5410 mm_unit_t *un;
5411 5411 dev_t mapdev;
5412 5412 int result;
5413 5413 int smi;
5414 5414 int any_succeed = 0;
5415 5415 int save_result = 0;
5416 5416
5417 5417 /*
5418 5418	 * We don't need to grab the unit lock,
5419 5419	 * because nothing else is supposed to be happening.
5420 5420	 * Also, dump is not supposed to sleep.
5421 5421 */
5422 5422 un = (mm_unit_t *)MD_UNIT(getminor(dev));
5423 5423
5424 5424 if ((diskaddr_t)blkno >= un->c.un_total_blocks)
5425 5425 return (EINVAL);
5426 5426
5427 5427 if ((diskaddr_t)blkno + nblk > un->c.un_total_blocks)
5428 5428 return (EINVAL);
5429 5429
5430 5430 for (smi = 0; smi < NMIRROR; smi++) {
5431 5431 if (!SUBMIRROR_IS_WRITEABLE(un, smi))
5432 5432 continue;
5433 5433 mapdev = md_dev64_to_dev(un->un_sm[smi].sm_dev);
5434 5434 result = bdev_dump(mapdev, addr, blkno, nblk);
5435 5435 if (result)
5436 5436 save_result = result;
5437 5437
5438 5438 if (result == 0)
5439 5439 any_succeed++;
5440 5440 }
5441 5441
5442 5442 if (any_succeed)
5443 5443 return (0);
5444 5444
5445 5445 return (save_result);
5446 5446 }
5447 5447
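[Editor's note] mirror_dump() treats the dump as successful if any writable submirror accepts the write, and only reports the last error when every attempt fails. A stripped-down, hypothetical sketch of that "any one succeeds" aggregation (using an invented write_replica() callback rather than bdev_dump()):

    /*
     * Write the same buffer to every replica; succeed if at least one
     * write succeeds, otherwise return the last error seen.
     */
    static int
    write_any_replica(int (*write_replica)(int idx), int nreplicas)
    {
    	int i, err, last_err = 0, any_ok = 0;

    	for (i = 0; i < nreplicas; i++) {
    		err = write_replica(i);
    		if (err == 0)
    			any_ok++;
    		else
    			last_err = err;
    	}
    	return (any_ok ? 0 : last_err);
    }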
5448 5448 /*
5449 5449 * NAME: mirror_probe_dev
5450 5450 *
5451 5451	 * DESCRIPTION: force-opens every component of a mirror.
5452 5452 *
5453 5453 * On entry the unit writerlock is held
5454 5454 */
5455 5455 static int
5456 5456 mirror_probe_dev(mdi_unit_t *ui, minor_t mnum)
5457 5457 {
5458 5458 int i;
5459 5459 int smi;
5460 5460 int ci;
5461 5461 mm_unit_t *un;
5462 5462 int md_devopen = 0;
5463 5463 set_t setno;
5464 5464 int sm_cnt;
5465 5465 int sm_unavail_cnt;
5466 5466
5467 5467 if (md_unit_isopen(ui))
5468 5468 md_devopen++;
5469 5469
5470 5470 un = MD_UNIT(mnum);
5471 5471 setno = MD_UN2SET(un);
5472 5472
5473 5473 sm_cnt = 0;
5474 5474 sm_unavail_cnt = 0;
5475 5475 for (i = 0; i < NMIRROR; i++) {
5476 5476 md_dev64_t tmpdev;
5477 5477 mdi_unit_t *sm_ui;
5478 5478
5479 5479 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) {
5480 5480 continue;
5481 5481 }
5482 5482
5483 5483 sm_cnt++;
5484 5484 tmpdev = un->un_sm[i].sm_dev;
5485 5485 (void) md_layered_open(mnum, &tmpdev,
5486 5486 MD_OFLG_CONT_ERRS | MD_OFLG_PROBEDEV);
5487 5487 un->un_sm[i].sm_dev = tmpdev;
5488 5488
5489 5489 sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
5490 5490
5491 5491 /*
5492 5492 * Logic similar to that in mirror_open_all_devs. We set or
5493 5493 * clear the submirror Unavailable bit.
5494 5494 */
5495 5495 (void) md_unit_writerlock(sm_ui);
5496 5496 if (submirror_unavailable(un, i, 1)) {
5497 5497 sm_ui->ui_tstate |= MD_INACCESSIBLE;
5498 5498 sm_unavail_cnt++;
5499 5499 } else {
5500 5500 sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
5501 5501 }
5502 5502 md_unit_writerexit(sm_ui);
5503 5503 }
5504 5504
5505 5505 /*
5506 5506 * If all of the submirrors are unavailable, the mirror is also
5507 5507 * unavailable.
5508 5508 */
5509 5509 if (sm_cnt == sm_unavail_cnt) {
5510 5510 ui->ui_tstate |= MD_INACCESSIBLE;
5511 5511 } else {
5512 5512 ui->ui_tstate &= ~MD_INACCESSIBLE;
5513 5513 }
5514 5514
5515 5515 /*
5516 5516	 * Start checking for probe failures. If failures occur we
5517 5517	 * set the appropriate erred state only if the metadevice is in
5518 5518	 * use. This is specifically to prevent unnecessary resyncs.
5519 5519	 * For instance, if the disks were accidentally disconnected when
5520 5520	 * the system booted up, then until the metadevice is accessed
5521 5521	 * (e.g. a file system mount) the user can shut down, re-cable and
5522 5522	 * reboot without incurring a potentially huge resync.
5523 5523 */
5524 5524
5525 5525 smi = 0;
5526 5526 ci = 0;
5527 5527 while (mirror_geterror(un, &smi, &ci, 1, 1) != 0) {
5528 5528
5529 5529 if (mirror_other_sources(un, smi, ci, 0) == 1) {
5530 5530 /*
5531 5531 * Note that for a MN set, there is no need to call
5532 5532 * SE_NOTIFY as that is done when processing the
5533 5533 * state change
5534 5534 */
5535 5535 if (md_devopen) {
5536 5536 /*
5537 5537 * Never called from ioctl context,
5538 5538 * so (IOLOCK *)NULL
5539 5539 */
5540 5540 set_sm_comp_state(un, smi, ci, CS_LAST_ERRED,
5541 5541 0, MD_STATE_XMIT, (IOLOCK *)NULL);
5542 5542 if (!MD_MNSET_SETNO(setno)) {
5543 5543 SE_NOTIFY(EC_SVM_STATE,
5544 5544 ESC_SVM_LASTERRED,
5545 5545 SVM_TAG_METADEVICE, setno,
5546 5546 MD_SID(un));
5547 5547 }
5548 5548 continue;
5549 5549 } else {
5550 5550 (void) mirror_close_all_devs(un,
5551 5551 MD_OFLG_PROBEDEV);
5552 5552 if (!MD_MNSET_SETNO(setno)) {
5553 5553 SE_NOTIFY(EC_SVM_STATE,
5554 5554 ESC_SVM_OPEN_FAIL,
5555 5555 SVM_TAG_METADEVICE, setno,
5556 5556 MD_SID(un));
5557 5557 }
5558 5558 mirror_openfail_console_info(un, smi, ci);
5559 5559 return (ENXIO);
5560 5560 }
5561 5561 }
5562 5562
5563 5563 /*
5564 5564 * Note that for a MN set, there is no need to call
5565 5565 * SE_NOTIFY as that is done when processing the
5566 5566 * state change
5567 5567 */
5568 5568 if (md_devopen) {
5569 5569 /* Never called from ioctl context, so (IOLOCK *)NULL */
5570 5570 set_sm_comp_state(un, smi, ci, CS_ERRED, 0,
5571 5571 MD_STATE_XMIT, (IOLOCK *)NULL);
5572 5572 if (!MD_MNSET_SETNO(setno)) {
5573 5573 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
5574 5574 SVM_TAG_METADEVICE, setno,
5575 5575 MD_SID(un));
5576 5576 }
5577 5577 }
5578 5578 mirror_openfail_console_info(un, smi, ci);
5579 5579 ci++;
5580 5580 }
5581 5581
5582 5582 if (MD_MNSET_SETNO(setno)) {
5583 5583 send_poke_hotspares(setno);
5584 5584 } else {
5585 5585 (void) poke_hotspares();
5586 5586 }
5587 5587 (void) mirror_close_all_devs(un, MD_OFLG_PROBEDEV);
5588 5588
5589 5589 return (0);
5590 5590 }
5591 5591
5592 5592
5593 5593 static int
5594 5594 mirror_imp_set(
5595 5595 set_t setno
5596 5596 )
5597 5597 {
5598 5598
5599 5599 mddb_recid_t recid;
5600 5600 int gotsomething, i;
5601 5601 mddb_type_t typ1;
5602 5602 mddb_de_ic_t *dep;
5603 5603 mddb_rb32_t *rbp;
5604 5604 mm_unit32_od_t *un32;
5605 5605 mm_unit_t *un64;
5606 5606 md_dev64_t self_devt;
5607 5607 minor_t *self_id; /* minor needs to be updated */
5608 5608 md_parent_t *parent_id; /* parent needs to be updated */
5609 5609 mddb_recid_t *record_id; /* record id needs to be updated */
5610 5610 mddb_recid_t *optrec_id;
5611 5611 md_dev64_t tmpdev;
5612 5612
5613 5613
5614 5614 gotsomething = 0;
5615 5615
5616 5616 typ1 = (mddb_type_t)md_getshared_key(setno,
5617 5617 mirror_md_ops.md_driver.md_drivername);
5618 5618 recid = mddb_makerecid(setno, 0);
5619 5619
5620 5620 while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
5621 5621 if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
5622 5622 continue;
5623 5623
5624 5624 dep = mddb_getrecdep(recid);
5625 5625 rbp = dep->de_rb;
5626 5626
5627 5627 switch (rbp->rb_revision) {
5628 5628 case MDDB_REV_RB:
5629 5629 case MDDB_REV_RBFN:
5630 5630 /*
5631 5631 * Small device
5632 5632 */
5633 5633 un32 = (mm_unit32_od_t *)mddb_getrecaddr(recid);
5634 5634 self_id = &(un32->c.un_self_id);
5635 5635 parent_id = &(un32->c.un_parent);
5636 5636 record_id = &(un32->c.un_record_id);
5637 5637 optrec_id = &(un32->un_rr_dirty_recid);
5638 5638
5639 5639 for (i = 0; i < un32->un_nsm; i++) {
5640 5640 tmpdev = md_expldev(un32->un_sm[i].sm_dev);
5641 5641 un32->un_sm[i].sm_dev = md_cmpldev
5642 5642 (md_makedevice(md_major, MD_MKMIN(setno,
5643 5643 MD_MIN2UNIT(md_getminor(tmpdev)))));
5644 5644
5645 5645 if (!md_update_minor(setno, mddb_getsidenum
5646 5646 (setno), un32->un_sm[i].sm_key))
5647 5647 goto out;
5648 5648 }
5649 5649 break;
5650 5650 case MDDB_REV_RB64:
5651 5651 case MDDB_REV_RB64FN:
5652 5652 un64 = (mm_unit_t *)mddb_getrecaddr(recid);
5653 5653 self_id = &(un64->c.un_self_id);
5654 5654 parent_id = &(un64->c.un_parent);
5655 5655 record_id = &(un64->c.un_record_id);
5656 5656 optrec_id = &(un64->un_rr_dirty_recid);
5657 5657
5658 5658 for (i = 0; i < un64->un_nsm; i++) {
5659 5659 tmpdev = un64->un_sm[i].sm_dev;
5660 5660 un64->un_sm[i].sm_dev = md_makedevice
5661 5661 (md_major, MD_MKMIN(setno, MD_MIN2UNIT
5662 5662 (md_getminor(tmpdev))));
5663 5663
5664 5664 if (!md_update_minor(setno, mddb_getsidenum
5665 5665 (setno), un64->un_sm[i].sm_key))
5666 5666 goto out;
5667 5667 }
5668 5668 break;
5669 5669 }
5670 5670
5671 5671 /*
5672 5672 * If this is a top level and a friendly name metadevice,
5673 5673 * update its minor in the namespace.
5674 5674 */
5675 5675 if ((*parent_id == MD_NO_PARENT) &&
5676 5676 ((rbp->rb_revision == MDDB_REV_RBFN) ||
5677 5677 (rbp->rb_revision == MDDB_REV_RB64FN))) {
5678 5678
5679 5679 self_devt = md_makedevice(md_major, *self_id);
5680 5680 if (!md_update_top_device_minor(setno,
5681 5681 mddb_getsidenum(setno), self_devt))
5682 5682 goto out;
5683 5683 }
5684 5684
5685 5685 /*
5686 5686 * Update unit with the imported setno
5687 5687 *
5688 5688 */
5689 5689 mddb_setrecprivate(recid, MD_PRV_GOTIT);
5690 5690
5691 5691 *self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));
5692 5692 if (*parent_id != MD_NO_PARENT)
5693 5693 *parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
5694 5694 *record_id = MAKERECID(setno, DBID(*record_id));
5695 5695 *optrec_id = MAKERECID(setno, DBID(*optrec_id));
5696 5696
5697 5697 gotsomething = 1;
5698 5698 }
5699 5699
5700 5700 out:
5701 5701 return (gotsomething);
5702 5702 }
5703 5703
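[Editor's note] mirror_imp_set() rewrites every minor number and record id so that the imported units land in the new set. As a rough illustration of why the MD_MIN2UNIT/MD_MKMIN pair is needed, here is a standalone sketch under the assumption that a minor number simply encodes set * UNITS_PER_SET + unit; the real macros in the md headers (driven by md_nunits) may pack the fields differently:

    #include <sys/types.h>

    #define	UNITS_PER_SET	128		/* assumed; the driver uses md_nunits */

    /* Extract the unit index from a minor number (models MD_MIN2UNIT). */
    static minor_t
    min2unit(minor_t mnum)
    {
    	return (mnum % UNITS_PER_SET);
    }

    /* Rebuild a minor number in a given set (models MD_MKMIN). */
    static minor_t
    mkmin(uint_t setno, minor_t unit)
    {
    	return (setno * UNITS_PER_SET + unit);
    }

    /* Re-home an imported unit's minor number into its new set. */
    static minor_t
    remap_minor(minor_t old_mnum, uint_t new_setno)
    {
    	return (mkmin(new_setno, min2unit(old_mnum)));
    }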
5704 5704 /*
5705 5705 * NAME: mirror_check_offline
5706 5706 *
5707 5707 * DESCRIPTION: return offline_status = 1 if any submirrors are offline
5708 5708 *
5709 5709 * Called from ioctl, so access to MD_UN_OFFLINE_SM in un_status is
5710 5710 * protected by the global ioctl lock as it is only set by the MD_IOCOFFLINE
5711 5711 * ioctl.
5712 5712 */
5713 5713 int
5714 5714 mirror_check_offline(md_dev64_t dev, int *offline_status)
5715 5715 {
5716 5716 mm_unit_t *un;
5717 5717 md_error_t mde = mdnullerror;
5718 5718
5719 5719 if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
5720 5720 return (EINVAL);
5721 5721 *offline_status = 0;
5722 5722 if (un->c.un_status & MD_UN_OFFLINE_SM)
5723 5723 *offline_status = 1;
5724 5724 return (0);
5725 5725 }
5726 5726
5727 5727 /*
5728 5728 * NAME: mirror_inc_abr_count
5729 5729 *
5730 5730 * DESCRIPTION: increment the count of layered soft parts with ABR set
5731 5731 *
5732 5732 * Called from ioctl, so access to un_abr_count is protected by the global
5733 5733 * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl.
5734 5734 */
5735 5735 int
5736 5736 mirror_inc_abr_count(md_dev64_t dev)
5737 5737 {
5738 5738 mm_unit_t *un;
5739 5739 md_error_t mde = mdnullerror;
5740 5740
5741 5741 if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
5742 5742 return (EINVAL);
5743 5743 un->un_abr_count++;
5744 5744 return (0);
5745 5745 }
5746 5746
5747 5747 /*
5748 5748 * NAME: mirror_dec_abr_count
5749 5749 *
5750 5750 * DESCRIPTION: decrement the count of layered soft parts with ABR set
5751 5751 *
5752 5752 * Called from ioctl, so access to un_abr_count is protected by the global
5753 5753 * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl.
5754 5754 */
5755 5755 int
5756 5756 mirror_dec_abr_count(md_dev64_t dev)
5757 5757 {
5758 5758 mm_unit_t *un;
5759 5759 md_error_t mde = mdnullerror;
5760 5760
5761 5761 if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
5762 5762 return (EINVAL);
5763 5763 un->un_abr_count--;
5764 5764 return (0);
5765 5765 }
5766 5766
5767 5767 static md_named_services_t mirror_named_services[] = {
5768 5768 {(intptr_t (*)()) poke_hotspares, "poke hotspares" },
5769 5769 {(intptr_t (*)()) mirror_rename_listkids, MDRNM_LIST_URKIDS },
5770 5770 {mirror_rename_check, MDRNM_CHECK },
5771 5771 {(intptr_t (*)()) mirror_renexch_update_kids, MDRNM_UPDATE_KIDS },
5772 5772 {(intptr_t (*)()) mirror_exchange_parent_update_to,
5773 5773 MDRNM_PARENT_UPDATE_TO},
5774 5774 {(intptr_t (*)()) mirror_exchange_self_update_from_down,
5775 5775 MDRNM_SELF_UPDATE_FROM_DOWN },
5776 5776 {(intptr_t (*)())mirror_probe_dev, "probe open test" },
5777 5777 {(intptr_t (*)())mirror_check_offline, MD_CHECK_OFFLINE },
5778 5778 {(intptr_t (*)())mirror_inc_abr_count, MD_INC_ABR_COUNT },
5779 5779 {(intptr_t (*)())mirror_dec_abr_count, MD_DEC_ABR_COUNT },
5780 5780 { NULL, 0 }
5781 5781 };
5782 5782
5783 5783 md_ops_t mirror_md_ops = {
5784 5784 mirror_open, /* open */
5785 5785 mirror_close, /* close */
5786 5786 md_mirror_strategy, /* strategy */
5787 5787 NULL, /* print */
5788 5788 mirror_dump, /* dump */
5789 5789 NULL, /* read */
5790 5790 NULL, /* write */
5791 5791 md_mirror_ioctl, /* mirror_ioctl, */
5792 5792 mirror_snarf, /* mirror_snarf */
5793 5793 mirror_halt, /* mirror_halt */
5794 5794 NULL, /* aread */
5795 5795 NULL, /* awrite */
5796 5796 mirror_imp_set, /* import set */
5797 5797 mirror_named_services
5798 5798 };
5799 5799
5800 5800	/* module-specific initialization */
5801 5801 static void
5802 5802 init_init()
5803 5803 {
5804 5804 md_mirror_mcs_buf_off = sizeof (md_mcs_t) - sizeof (buf_t);
5805 5805
5806 5806 /* Initialize the parent and child save memory pools */
5807 5807 mirror_parent_cache = kmem_cache_create("md_mirror_parent",
5808 5808 sizeof (md_mps_t), 0, mirror_parent_constructor,
5809 5809 mirror_parent_destructor, mirror_run_queue, NULL, NULL,
5810 5810 0);
5811 5811
5812 5812 mirror_child_cache = kmem_cache_create("md_mirror_child",
5813 5813 sizeof (md_mcs_t) - sizeof (buf_t) + biosize(), 0,
5814 5814 mirror_child_constructor, mirror_child_destructor,
5815 5815 mirror_run_queue, NULL, NULL, 0);
5816 5816
5817 5817 /*
5818 5818	 * Ensure wowbuf_size is a multiple of DEV_BSIZE,
5819 5819 * then initialize wowbuf memory pool.
5820 5820 */
5821 5821 md_wowbuf_size = roundup(md_wowbuf_size, DEV_BSIZE);
5822 5822 if (md_wowbuf_size <= 0)
5823 5823 md_wowbuf_size = 2 * DEV_BSIZE;
5824 5824 if (md_wowbuf_size > (32 * DEV_BSIZE))
5825 5825 md_wowbuf_size = (32 * DEV_BSIZE);
5826 5826
5827 5827 md_wowblk_size = md_wowbuf_size + sizeof (wowhdr_t);
5828 5828 mirror_wowblk_cache = kmem_cache_create("md_mirror_wow",
5829 5829 md_wowblk_size, 0, NULL, NULL, NULL, NULL, NULL, 0);
5830 5830
5831 5831 mutex_init(&mirror_timeout.dr_mx, NULL, MUTEX_DEFAULT, NULL);
5832 5832 mutex_init(&hotspare_request.dr_mx, NULL, MUTEX_DEFAULT, NULL);
5833 5833
5834 5834 mutex_init(&non_ff_drv_mutex, NULL, MUTEX_DEFAULT, NULL);
5835 5835 }
5836 5836
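[Editor's note] The wowbuf sizing in init_init() first rounds the tunable up to a DEV_BSIZE multiple and then clamps it to the range [2*DEV_BSIZE, 32*DEV_BSIZE]. A small standalone sketch of that clamp with a worked example, assuming DEV_BSIZE is 512:

    #include <sys/param.h>		/* DEV_BSIZE */
    #include <sys/sysmacros.h>	/* roundup() */

    /*
     * Round a requested wowbuf size to a legal value: a DEV_BSIZE multiple
     * between 2 and 32 blocks. E.g. a request of 700 bytes rounds up to
     * 1024 (2 blocks) and is kept; a request of 64K is clamped to 16K.
     */
    static int
    clamp_wowbuf_size(int req)
    {
    	int sz = roundup(req, DEV_BSIZE);

    	if (sz <= 0)
    		sz = 2 * DEV_BSIZE;
    	if (sz > 32 * DEV_BSIZE)
    		sz = 32 * DEV_BSIZE;
    	return (sz);
    }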
5837 5837	/* module-specific uninitialization (undo init_init()) */
5838 5838 static void
5839 5839 fini_uninit()
5840 5840 {
5841 5841 kmem_cache_destroy(mirror_parent_cache);
5842 5842 kmem_cache_destroy(mirror_child_cache);
5843 5843 kmem_cache_destroy(mirror_wowblk_cache);
5844 5844 mirror_parent_cache = mirror_child_cache =
5845 5845 mirror_wowblk_cache = NULL;
5846 5846
5847 5847 mutex_destroy(&mirror_timeout.dr_mx);
5848 5848 mutex_destroy(&hotspare_request.dr_mx);
5849 5849 mutex_destroy(&non_ff_drv_mutex);
5850 5850 }
5851 5851
5852 5852 /* define the module linkage */
5853 5853 MD_PLUGIN_MISC_MODULE("mirrors module", init_init(), fini_uninit())