3006 VERIFY[S,U,P] and ASSERT[S,U,P] frequently check if first argument is zero
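This delta applies the idiom introduced by issue 3006: assertions of the form ASSERT3S(x, ==, 0) are rewritten as ASSERT0(x). The real macros live in the illumos debug headers; the standalone userland sketch below is only an assumption about their shape, meant to illustrate the single rewrite this file receives (old line 772 in metaslab_fini()).

	#include <assert.h>
	#include <stdint.h>

	/*
	 * Userland sketch only; not the actual <sys/debug.h> definitions.
	 * ASSERT3S is modeled here as a signed three-way comparison assert,
	 * and ASSERT0 is layered on top of it, mirroring the rewrite below.
	 */
	#define	ASSERT3S(l, op, r)	assert((int64_t)(l) op (int64_t)(r))
	#define	ASSERT0(x)		ASSERT3S((x), ==, 0)

	int
	main(void)
	{
		int64_t ms_deferspace = 0;

		ASSERT3S(ms_deferspace, ==, 0);	/* old idiom */
		ASSERT0(ms_deferspace);		/* new, equivalent idiom */
		return (0);
	}

The two forms are functionally equivalent; ASSERT0() simply states the common "must be zero" check directly. The synopsis also covers VERIFY[S,U,P], for which the same zero-comparison shorthand exists, but only the ASSERT3S case appears in this file.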
--- old/usr/src/uts/common/fs/zfs/metaslab.c
+++ new/usr/src/uts/common/fs/zfs/metaslab.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright (c) 2012 by Delphix. All rights reserved.
24 24 */
25 25
26 26 #include <sys/zfs_context.h>
27 27 #include <sys/dmu.h>
28 28 #include <sys/dmu_tx.h>
29 29 #include <sys/space_map.h>
30 30 #include <sys/metaslab_impl.h>
31 31 #include <sys/vdev_impl.h>
32 32 #include <sys/zio.h>
33 33
34 34 /*
35 35 * Allow allocations to switch to gang blocks quickly. We do this to
36 36 * avoid having to load lots of space_maps in a given txg. There are,
37 37 * however, some cases where we want to avoid "fast" ganging and instead
38 38 * we want to do an exhaustive search of all metaslabs on this device.
39 39 * Currently we don't allow any gang, zil, or dump device related allocations
40 40 * to "fast" gang.
41 41 */
42 42 #define CAN_FASTGANG(flags) \
43 43 (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
44 44 METASLAB_GANG_AVOID)))
45 45
46 46 uint64_t metaslab_aliquot = 512ULL << 10;
47 47 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
48 48
49 49 /*
50 50 * This value defines the number of allowed allocation failures per vdev.
51 51 * If a device reaches this threshold in a given txg then we consider skipping
52 52 * allocations on that device.
53 53 */
54 54 int zfs_mg_alloc_failures;
55 55
56 56 /*
57 57 * Metaslab debugging: when set, keeps all space maps in core to verify frees.
58 58 */
59 59 static int metaslab_debug = 0;
60 60
61 61 /*
62 62 * Minimum size which forces the dynamic allocator to change
63 63 * its allocation strategy. Once the space map cannot satisfy
64 64 * an allocation of this size then it switches to using a more
65 65 * aggressive strategy (i.e. search by size rather than offset).
66 66 */
67 67 uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;
68 68
69 69 /*
70 70 * The minimum free space, in percent, which must be available
71 71 * in a space map to continue allocations in a first-fit fashion.
72 72 * Once the space_map's free space drops below this level we dynamically
73 73 * switch to using best-fit allocations.
74 74 */
75 75 int metaslab_df_free_pct = 4;
76 76
77 77 /*
78 78 * A metaslab is considered "free" if it contains a contiguous
79 79 * segment which is greater than metaslab_min_alloc_size.
80 80 */
81 81 uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
82 82
83 83 /*
84 84 * Max number of space_maps to prefetch.
85 85 */
86 86 int metaslab_prefetch_limit = SPA_DVAS_PER_BP;
87 87
88 88 /*
89 89 * Percentage bonus multiplier for metaslabs that are in the bonus area.
90 90 */
91 91 int metaslab_smo_bonus_pct = 150;
92 92
93 93 /*
94 94 * ==========================================================================
95 95 * Metaslab classes
96 96 * ==========================================================================
97 97 */
98 98 metaslab_class_t *
99 99 metaslab_class_create(spa_t *spa, space_map_ops_t *ops)
100 100 {
101 101 metaslab_class_t *mc;
102 102
103 103 mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
104 104
105 105 mc->mc_spa = spa;
106 106 mc->mc_rotor = NULL;
107 107 mc->mc_ops = ops;
108 108
109 109 return (mc);
110 110 }
111 111
112 112 void
113 113 metaslab_class_destroy(metaslab_class_t *mc)
114 114 {
115 115 ASSERT(mc->mc_rotor == NULL);
116 116 ASSERT(mc->mc_alloc == 0);
117 117 ASSERT(mc->mc_deferred == 0);
118 118 ASSERT(mc->mc_space == 0);
119 119 ASSERT(mc->mc_dspace == 0);
120 120
121 121 kmem_free(mc, sizeof (metaslab_class_t));
122 122 }
123 123
124 124 int
125 125 metaslab_class_validate(metaslab_class_t *mc)
126 126 {
127 127 metaslab_group_t *mg;
128 128 vdev_t *vd;
129 129
130 130 /*
131 131 * Must hold one of the spa_config locks.
132 132 */
133 133 ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
134 134 spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
135 135
136 136 if ((mg = mc->mc_rotor) == NULL)
137 137 return (0);
138 138
139 139 do {
140 140 vd = mg->mg_vd;
141 141 ASSERT(vd->vdev_mg != NULL);
142 142 ASSERT3P(vd->vdev_top, ==, vd);
143 143 ASSERT3P(mg->mg_class, ==, mc);
144 144 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
145 145 } while ((mg = mg->mg_next) != mc->mc_rotor);
146 146
147 147 return (0);
148 148 }
149 149
150 150 void
151 151 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
152 152 int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
153 153 {
154 154 atomic_add_64(&mc->mc_alloc, alloc_delta);
155 155 atomic_add_64(&mc->mc_deferred, defer_delta);
156 156 atomic_add_64(&mc->mc_space, space_delta);
157 157 atomic_add_64(&mc->mc_dspace, dspace_delta);
158 158 }
159 159
160 160 uint64_t
161 161 metaslab_class_get_alloc(metaslab_class_t *mc)
162 162 {
163 163 return (mc->mc_alloc);
164 164 }
165 165
166 166 uint64_t
167 167 metaslab_class_get_deferred(metaslab_class_t *mc)
168 168 {
169 169 return (mc->mc_deferred);
170 170 }
171 171
172 172 uint64_t
173 173 metaslab_class_get_space(metaslab_class_t *mc)
174 174 {
175 175 return (mc->mc_space);
176 176 }
177 177
178 178 uint64_t
179 179 metaslab_class_get_dspace(metaslab_class_t *mc)
180 180 {
181 181 return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
182 182 }
183 183
184 184 /*
185 185 * ==========================================================================
186 186 * Metaslab groups
187 187 * ==========================================================================
188 188 */
189 189 static int
190 190 metaslab_compare(const void *x1, const void *x2)
191 191 {
192 192 const metaslab_t *m1 = x1;
193 193 const metaslab_t *m2 = x2;
194 194
195 195 if (m1->ms_weight < m2->ms_weight)
196 196 return (1);
197 197 if (m1->ms_weight > m2->ms_weight)
198 198 return (-1);
199 199
200 200 /*
201 201 * If the weights are identical, use the offset to force uniqueness.
202 202 */
203 203 if (m1->ms_map.sm_start < m2->ms_map.sm_start)
204 204 return (-1);
205 205 if (m1->ms_map.sm_start > m2->ms_map.sm_start)
206 206 return (1);
207 207
208 208 ASSERT3P(m1, ==, m2);
209 209
210 210 return (0);
211 211 }
212 212
213 213 metaslab_group_t *
214 214 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
215 215 {
216 216 metaslab_group_t *mg;
217 217
218 218 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
219 219 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
220 220 avl_create(&mg->mg_metaslab_tree, metaslab_compare,
221 221 sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
222 222 mg->mg_vd = vd;
223 223 mg->mg_class = mc;
224 224 mg->mg_activation_count = 0;
225 225
226 226 return (mg);
227 227 }
228 228
229 229 void
230 230 metaslab_group_destroy(metaslab_group_t *mg)
231 231 {
232 232 ASSERT(mg->mg_prev == NULL);
233 233 ASSERT(mg->mg_next == NULL);
234 234 /*
235 235 * We may have gone below zero with the activation count
236 236 * either because we never activated in the first place or
237 237 * because we're done, and possibly removing the vdev.
238 238 */
239 239 ASSERT(mg->mg_activation_count <= 0);
240 240
241 241 avl_destroy(&mg->mg_metaslab_tree);
242 242 mutex_destroy(&mg->mg_lock);
243 243 kmem_free(mg, sizeof (metaslab_group_t));
244 244 }
245 245
246 246 void
247 247 metaslab_group_activate(metaslab_group_t *mg)
248 248 {
249 249 metaslab_class_t *mc = mg->mg_class;
250 250 metaslab_group_t *mgprev, *mgnext;
251 251
252 252 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
253 253
254 254 ASSERT(mc->mc_rotor != mg);
255 255 ASSERT(mg->mg_prev == NULL);
256 256 ASSERT(mg->mg_next == NULL);
257 257 ASSERT(mg->mg_activation_count <= 0);
258 258
259 259 if (++mg->mg_activation_count <= 0)
260 260 return;
261 261
262 262 mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
263 263
264 264 if ((mgprev = mc->mc_rotor) == NULL) {
265 265 mg->mg_prev = mg;
266 266 mg->mg_next = mg;
267 267 } else {
268 268 mgnext = mgprev->mg_next;
269 269 mg->mg_prev = mgprev;
270 270 mg->mg_next = mgnext;
271 271 mgprev->mg_next = mg;
272 272 mgnext->mg_prev = mg;
273 273 }
274 274 mc->mc_rotor = mg;
275 275 }
276 276
277 277 void
278 278 metaslab_group_passivate(metaslab_group_t *mg)
279 279 {
280 280 metaslab_class_t *mc = mg->mg_class;
281 281 metaslab_group_t *mgprev, *mgnext;
282 282
283 283 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
284 284
285 285 if (--mg->mg_activation_count != 0) {
286 286 ASSERT(mc->mc_rotor != mg);
287 287 ASSERT(mg->mg_prev == NULL);
288 288 ASSERT(mg->mg_next == NULL);
289 289 ASSERT(mg->mg_activation_count < 0);
290 290 return;
291 291 }
292 292
293 293 mgprev = mg->mg_prev;
294 294 mgnext = mg->mg_next;
295 295
296 296 if (mg == mgnext) {
297 297 mc->mc_rotor = NULL;
298 298 } else {
299 299 mc->mc_rotor = mgnext;
300 300 mgprev->mg_next = mgnext;
301 301 mgnext->mg_prev = mgprev;
302 302 }
303 303
304 304 mg->mg_prev = NULL;
305 305 mg->mg_next = NULL;
306 306 }
307 307
308 308 static void
309 309 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
310 310 {
311 311 mutex_enter(&mg->mg_lock);
312 312 ASSERT(msp->ms_group == NULL);
313 313 msp->ms_group = mg;
314 314 msp->ms_weight = 0;
315 315 avl_add(&mg->mg_metaslab_tree, msp);
316 316 mutex_exit(&mg->mg_lock);
317 317 }
318 318
319 319 static void
320 320 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
321 321 {
322 322 mutex_enter(&mg->mg_lock);
323 323 ASSERT(msp->ms_group == mg);
324 324 avl_remove(&mg->mg_metaslab_tree, msp);
325 325 msp->ms_group = NULL;
326 326 mutex_exit(&mg->mg_lock);
327 327 }
328 328
329 329 static void
330 330 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
331 331 {
332 332 /*
333 333 * Although in principle the weight can be any value, in
334 334 * practice we do not use values in the range [1, 510].
335 335 */
336 336 ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
337 337 ASSERT(MUTEX_HELD(&msp->ms_lock));
338 338
339 339 mutex_enter(&mg->mg_lock);
340 340 ASSERT(msp->ms_group == mg);
341 341 avl_remove(&mg->mg_metaslab_tree, msp);
342 342 msp->ms_weight = weight;
343 343 avl_add(&mg->mg_metaslab_tree, msp);
344 344 mutex_exit(&mg->mg_lock);
345 345 }
346 346
347 347 /*
348 348 * ==========================================================================
349 349 * Common allocator routines
350 350 * ==========================================================================
351 351 */
352 352 static int
353 353 metaslab_segsize_compare(const void *x1, const void *x2)
354 354 {
355 355 const space_seg_t *s1 = x1;
356 356 const space_seg_t *s2 = x2;
357 357 uint64_t ss_size1 = s1->ss_end - s1->ss_start;
358 358 uint64_t ss_size2 = s2->ss_end - s2->ss_start;
359 359
360 360 if (ss_size1 < ss_size2)
361 361 return (-1);
362 362 if (ss_size1 > ss_size2)
363 363 return (1);
364 364
365 365 if (s1->ss_start < s2->ss_start)
366 366 return (-1);
367 367 if (s1->ss_start > s2->ss_start)
368 368 return (1);
369 369
370 370 return (0);
371 371 }
372 372
373 373 /*
374 374 * This is a helper function that can be used by the allocator to find
375 375 * a suitable block to allocate. This will search the specified AVL
376 376 * tree looking for a block that matches the specified criteria.
377 377 */
378 378 static uint64_t
379 379 metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
380 380 uint64_t align)
381 381 {
382 382 space_seg_t *ss, ssearch;
383 383 avl_index_t where;
384 384
385 385 ssearch.ss_start = *cursor;
386 386 ssearch.ss_end = *cursor + size;
387 387
388 388 ss = avl_find(t, &ssearch, &where);
389 389 if (ss == NULL)
390 390 ss = avl_nearest(t, where, AVL_AFTER);
391 391
392 392 while (ss != NULL) {
393 393 uint64_t offset = P2ROUNDUP(ss->ss_start, align);
394 394
395 395 if (offset + size <= ss->ss_end) {
396 396 *cursor = offset + size;
397 397 return (offset);
398 398 }
399 399 ss = AVL_NEXT(t, ss);
400 400 }
401 401
402 402 /*
403 403 * If we know we've searched the whole map (*cursor == 0), give up.
404 404 * Otherwise, reset the cursor to the beginning and try again.
405 405 */
406 406 if (*cursor == 0)
407 407 return (-1ULL);
408 408
409 409 *cursor = 0;
410 410 return (metaslab_block_picker(t, cursor, size, align));
411 411 }
412 412
413 413 static void
414 414 metaslab_pp_load(space_map_t *sm)
415 415 {
416 416 space_seg_t *ss;
417 417
418 418 ASSERT(sm->sm_ppd == NULL);
419 419 sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
420 420
421 421 sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
422 422 avl_create(sm->sm_pp_root, metaslab_segsize_compare,
423 423 sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));
424 424
425 425 for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
426 426 avl_add(sm->sm_pp_root, ss);
427 427 }
428 428
429 429 static void
430 430 metaslab_pp_unload(space_map_t *sm)
431 431 {
432 432 void *cookie = NULL;
433 433
434 434 kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
435 435 sm->sm_ppd = NULL;
436 436
437 437 while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) {
438 438 /* tear down the tree */
439 439 }
440 440
441 441 avl_destroy(sm->sm_pp_root);
442 442 kmem_free(sm->sm_pp_root, sizeof (avl_tree_t));
443 443 sm->sm_pp_root = NULL;
444 444 }
445 445
446 446 /* ARGSUSED */
447 447 static void
448 448 metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size)
449 449 {
450 450 /* No need to update cursor */
451 451 }
452 452
453 453 /* ARGSUSED */
454 454 static void
455 455 metaslab_pp_free(space_map_t *sm, uint64_t start, uint64_t size)
456 456 {
457 457 /* No need to update cursor */
458 458 }
459 459
460 460 /*
461 461 * Return the maximum contiguous segment within the metaslab.
462 462 */
463 463 uint64_t
464 464 metaslab_pp_maxsize(space_map_t *sm)
465 465 {
466 466 avl_tree_t *t = sm->sm_pp_root;
467 467 space_seg_t *ss;
468 468
469 469 if (t == NULL || (ss = avl_last(t)) == NULL)
470 470 return (0ULL);
471 471
472 472 return (ss->ss_end - ss->ss_start);
473 473 }
474 474
475 475 /*
476 476 * ==========================================================================
477 477 * The first-fit block allocator
478 478 * ==========================================================================
479 479 */
480 480 static uint64_t
481 481 metaslab_ff_alloc(space_map_t *sm, uint64_t size)
482 482 {
483 483 avl_tree_t *t = &sm->sm_root;
484 484 uint64_t align = size & -size;
485 485 uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
486 486
487 487 return (metaslab_block_picker(t, cursor, size, align));
488 488 }
489 489
490 490 /* ARGSUSED */
491 491 boolean_t
492 492 metaslab_ff_fragmented(space_map_t *sm)
493 493 {
494 494 return (B_TRUE);
495 495 }
496 496
497 497 static space_map_ops_t metaslab_ff_ops = {
498 498 metaslab_pp_load,
499 499 metaslab_pp_unload,
500 500 metaslab_ff_alloc,
501 501 metaslab_pp_claim,
502 502 metaslab_pp_free,
503 503 metaslab_pp_maxsize,
504 504 metaslab_ff_fragmented
505 505 };
506 506
507 507 /*
508 508 * ==========================================================================
509 509 * Dynamic block allocator -
510 510 * Uses the first-fit allocation scheme until space gets low and then
511 511 * adjusts to a best-fit allocation method. Uses metaslab_df_alloc_threshold
512 512 * and metaslab_df_free_pct to determine when to switch the allocation scheme.
513 513 * ==========================================================================
514 514 */
515 515 static uint64_t
516 516 metaslab_df_alloc(space_map_t *sm, uint64_t size)
517 517 {
518 518 avl_tree_t *t = &sm->sm_root;
519 519 uint64_t align = size & -size;
520 520 uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
521 521 uint64_t max_size = metaslab_pp_maxsize(sm);
522 522 int free_pct = sm->sm_space * 100 / sm->sm_size;
523 523
524 524 ASSERT(MUTEX_HELD(sm->sm_lock));
525 525 ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
526 526
527 527 if (max_size < size)
528 528 return (-1ULL);
529 529
530 530 /*
531 531 * If we're running low on space switch to using the size
532 532 * sorted AVL tree (best-fit).
533 533 */
534 534 if (max_size < metaslab_df_alloc_threshold ||
535 535 free_pct < metaslab_df_free_pct) {
536 536 t = sm->sm_pp_root;
537 537 *cursor = 0;
538 538 }
539 539
540 540 return (metaslab_block_picker(t, cursor, size, 1ULL));
541 541 }
542 542
543 543 static boolean_t
544 544 metaslab_df_fragmented(space_map_t *sm)
545 545 {
546 546 uint64_t max_size = metaslab_pp_maxsize(sm);
547 547 int free_pct = sm->sm_space * 100 / sm->sm_size;
548 548
549 549 if (max_size >= metaslab_df_alloc_threshold &&
550 550 free_pct >= metaslab_df_free_pct)
551 551 return (B_FALSE);
552 552
553 553 return (B_TRUE);
554 554 }
555 555
556 556 static space_map_ops_t metaslab_df_ops = {
557 557 metaslab_pp_load,
558 558 metaslab_pp_unload,
559 559 metaslab_df_alloc,
560 560 metaslab_pp_claim,
561 561 metaslab_pp_free,
562 562 metaslab_pp_maxsize,
563 563 metaslab_df_fragmented
564 564 };
565 565
566 566 /*
567 567 * ==========================================================================
568 568 * Other experimental allocators
569 569 * ==========================================================================
570 570 */
571 571 static uint64_t
572 572 metaslab_cdf_alloc(space_map_t *sm, uint64_t size)
573 573 {
574 574 avl_tree_t *t = &sm->sm_root;
575 575 uint64_t *cursor = (uint64_t *)sm->sm_ppd;
576 576 uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1;
577 577 uint64_t max_size = metaslab_pp_maxsize(sm);
578 578 uint64_t rsize = size;
579 579 uint64_t offset = 0;
580 580
581 581 ASSERT(MUTEX_HELD(sm->sm_lock));
582 582 ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
583 583
584 584 if (max_size < size)
585 585 return (-1ULL);
586 586
587 587 ASSERT3U(*extent_end, >=, *cursor);
588 588
589 589 /*
590 590 * If we're running low on space switch to using the size
591 591 * sorted AVL tree (best-fit).
592 592 */
593 593 if ((*cursor + size) > *extent_end) {
594 594
595 595 t = sm->sm_pp_root;
596 596 *cursor = *extent_end = 0;
597 597
598 598 if (max_size > 2 * SPA_MAXBLOCKSIZE)
599 599 rsize = MIN(metaslab_min_alloc_size, max_size);
600 600 offset = metaslab_block_picker(t, extent_end, rsize, 1ULL);
601 601 if (offset != -1)
602 602 *cursor = offset + size;
603 603 } else {
604 604 offset = metaslab_block_picker(t, cursor, rsize, 1ULL);
605 605 }
606 606 ASSERT3U(*cursor, <=, *extent_end);
607 607 return (offset);
608 608 }
609 609
610 610 static boolean_t
611 611 metaslab_cdf_fragmented(space_map_t *sm)
612 612 {
613 613 uint64_t max_size = metaslab_pp_maxsize(sm);
614 614
615 615 if (max_size > (metaslab_min_alloc_size * 10))
616 616 return (B_FALSE);
617 617 return (B_TRUE);
618 618 }
619 619
620 620 static space_map_ops_t metaslab_cdf_ops = {
621 621 metaslab_pp_load,
622 622 metaslab_pp_unload,
623 623 metaslab_cdf_alloc,
624 624 metaslab_pp_claim,
625 625 metaslab_pp_free,
626 626 metaslab_pp_maxsize,
627 627 metaslab_cdf_fragmented
628 628 };
629 629
630 630 uint64_t metaslab_ndf_clump_shift = 4;
631 631
632 632 static uint64_t
633 633 metaslab_ndf_alloc(space_map_t *sm, uint64_t size)
634 634 {
635 635 avl_tree_t *t = &sm->sm_root;
636 636 avl_index_t where;
637 637 space_seg_t *ss, ssearch;
638 638 uint64_t hbit = highbit(size);
639 639 uint64_t *cursor = (uint64_t *)sm->sm_ppd + hbit - 1;
640 640 uint64_t max_size = metaslab_pp_maxsize(sm);
641 641
642 642 ASSERT(MUTEX_HELD(sm->sm_lock));
643 643 ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
644 644
645 645 if (max_size < size)
646 646 return (-1ULL);
647 647
648 648 ssearch.ss_start = *cursor;
649 649 ssearch.ss_end = *cursor + size;
650 650
651 651 ss = avl_find(t, &ssearch, &where);
652 652 if (ss == NULL || (ss->ss_start + size > ss->ss_end)) {
653 653 t = sm->sm_pp_root;
654 654
655 655 ssearch.ss_start = 0;
656 656 ssearch.ss_end = MIN(max_size,
657 657 1ULL << (hbit + metaslab_ndf_clump_shift));
658 658 ss = avl_find(t, &ssearch, &where);
659 659 if (ss == NULL)
660 660 ss = avl_nearest(t, where, AVL_AFTER);
661 661 ASSERT(ss != NULL);
662 662 }
663 663
664 664 if (ss != NULL) {
665 665 if (ss->ss_start + size <= ss->ss_end) {
666 666 *cursor = ss->ss_start + size;
667 667 return (ss->ss_start);
668 668 }
669 669 }
670 670 return (-1ULL);
671 671 }
672 672
673 673 static boolean_t
674 674 metaslab_ndf_fragmented(space_map_t *sm)
675 675 {
676 676 uint64_t max_size = metaslab_pp_maxsize(sm);
677 677
678 678 if (max_size > (metaslab_min_alloc_size << metaslab_ndf_clump_shift))
679 679 return (B_FALSE);
680 680 return (B_TRUE);
681 681 }
682 682
683 683
684 684 static space_map_ops_t metaslab_ndf_ops = {
685 685 metaslab_pp_load,
686 686 metaslab_pp_unload,
687 687 metaslab_ndf_alloc,
688 688 metaslab_pp_claim,
689 689 metaslab_pp_free,
690 690 metaslab_pp_maxsize,
691 691 metaslab_ndf_fragmented
692 692 };
693 693
694 694 space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
695 695
696 696 /*
697 697 * ==========================================================================
698 698 * Metaslabs
699 699 * ==========================================================================
700 700 */
701 701 metaslab_t *
702 702 metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
703 703 uint64_t start, uint64_t size, uint64_t txg)
704 704 {
705 705 vdev_t *vd = mg->mg_vd;
706 706 metaslab_t *msp;
707 707
708 708 msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
709 709 mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);
710 710
711 711 msp->ms_smo_syncing = *smo;
712 712
713 713 /*
714 714 * We create the main space map here, but we don't create the
715 715 * allocmaps and freemaps until metaslab_sync_done(). This serves
716 716 * two purposes: it allows metaslab_sync_done() to detect the
717 717 * addition of new space; and for debugging, it ensures that we'd
718 718 * data fault on any attempt to use this metaslab before it's ready.
719 719 */
720 720 space_map_create(&msp->ms_map, start, size,
721 721 vd->vdev_ashift, &msp->ms_lock);
722 722
723 723 metaslab_group_add(mg, msp);
724 724
725 725 if (metaslab_debug && smo->smo_object != 0) {
726 726 mutex_enter(&msp->ms_lock);
727 727 VERIFY(space_map_load(&msp->ms_map, mg->mg_class->mc_ops,
728 728 SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0);
729 729 mutex_exit(&msp->ms_lock);
730 730 }
731 731
732 732 /*
733 733 * If we're opening an existing pool (txg == 0) or creating
734 734 * a new one (txg == TXG_INITIAL), all space is available now.
735 735 * If we're adding space to an existing pool, the new space
736 736 * does not become available until after this txg has synced.
737 737 */
738 738 if (txg <= TXG_INITIAL)
739 739 metaslab_sync_done(msp, 0);
740 740
741 741 if (txg != 0) {
742 742 vdev_dirty(vd, 0, NULL, txg);
743 743 vdev_dirty(vd, VDD_METASLAB, msp, txg);
744 744 }
745 745
746 746 return (msp);
747 747 }
748 748
749 749 void
750 750 metaslab_fini(metaslab_t *msp)
751 751 {
752 752 metaslab_group_t *mg = msp->ms_group;
753 753
754 754 vdev_space_update(mg->mg_vd,
755 755 -msp->ms_smo.smo_alloc, 0, -msp->ms_map.sm_size);
756 756
757 757 metaslab_group_remove(mg, msp);
758 758
759 759 mutex_enter(&msp->ms_lock);
760 760
761 761 space_map_unload(&msp->ms_map);
762 762 space_map_destroy(&msp->ms_map);
763 763
764 764 for (int t = 0; t < TXG_SIZE; t++) {
765 765 space_map_destroy(&msp->ms_allocmap[t]);
766 766 space_map_destroy(&msp->ms_freemap[t]);
767 767 }
768 768
769 769 for (int t = 0; t < TXG_DEFER_SIZE; t++)
770 770 space_map_destroy(&msp->ms_defermap[t]);
771 771
772 - ASSERT3S(msp->ms_deferspace, ==, 0);
772 + ASSERT0(msp->ms_deferspace);
773 773
774 774 mutex_exit(&msp->ms_lock);
775 775 mutex_destroy(&msp->ms_lock);
776 776
777 777 kmem_free(msp, sizeof (metaslab_t));
778 778 }
779 779
780 780 #define METASLAB_WEIGHT_PRIMARY (1ULL << 63)
781 781 #define METASLAB_WEIGHT_SECONDARY (1ULL << 62)
782 782 #define METASLAB_ACTIVE_MASK \
783 783 (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
784 784
785 785 static uint64_t
786 786 metaslab_weight(metaslab_t *msp)
787 787 {
788 788 metaslab_group_t *mg = msp->ms_group;
789 789 space_map_t *sm = &msp->ms_map;
790 790 space_map_obj_t *smo = &msp->ms_smo;
791 791 vdev_t *vd = mg->mg_vd;
792 792 uint64_t weight, space;
793 793
794 794 ASSERT(MUTEX_HELD(&msp->ms_lock));
795 795
796 796 /*
797 797 * The baseline weight is the metaslab's free space.
798 798 */
799 799 space = sm->sm_size - smo->smo_alloc;
800 800 weight = space;
801 801
802 802 /*
803 803 * Modern disks have uniform bit density and constant angular velocity.
804 804 * Therefore, the outer recording zones are faster (higher bandwidth)
805 805 * than the inner zones by the ratio of outer to inner track diameter,
806 806 * which is typically around 2:1. We account for this by assigning
807 807 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
808 808 * In effect, this means that we'll select the metaslab with the most
809 809 * free bandwidth rather than simply the one with the most free space.
810 810 */
811 811 weight = 2 * weight -
812 812 ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count;
813 813 ASSERT(weight >= space && weight <= 2 * space);
814 814
815 815 /*
816 816 * For locality, assign higher weight to metaslabs which have
817 817 * a lower offset than what we've already activated.
818 818 */
819 819 if (sm->sm_start <= mg->mg_bonus_area)
820 820 weight *= (metaslab_smo_bonus_pct / 100);
821 821 ASSERT(weight >= space &&
822 822 weight <= 2 * (metaslab_smo_bonus_pct / 100) * space);
823 823
824 824 if (sm->sm_loaded && !sm->sm_ops->smop_fragmented(sm)) {
825 825 /*
826 826 * If this metaslab is one we're actively using, adjust its
827 827 * weight to make it preferable to any inactive metaslab so
828 828 * we'll polish it off.
829 829 */
830 830 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
831 831 }
832 832 return (weight);
833 833 }
834 834
835 835 static void
836 836 metaslab_prefetch(metaslab_group_t *mg)
837 837 {
838 838 spa_t *spa = mg->mg_vd->vdev_spa;
839 839 metaslab_t *msp;
840 840 avl_tree_t *t = &mg->mg_metaslab_tree;
841 841 int m;
842 842
843 843 mutex_enter(&mg->mg_lock);
844 844
845 845 /*
846 846 * Prefetch the next potential metaslabs
847 847 */
848 848 for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) {
849 849 space_map_t *sm = &msp->ms_map;
850 850 space_map_obj_t *smo = &msp->ms_smo;
851 851
852 852 /* If we have reached our prefetch limit then we're done */
853 853 if (m >= metaslab_prefetch_limit)
854 854 break;
855 855
856 856 if (!sm->sm_loaded && smo->smo_object != 0) {
857 857 mutex_exit(&mg->mg_lock);
858 858 dmu_prefetch(spa_meta_objset(spa), smo->smo_object,
859 859 0ULL, smo->smo_objsize);
860 860 mutex_enter(&mg->mg_lock);
861 861 }
862 862 }
863 863 mutex_exit(&mg->mg_lock);
864 864 }
865 865
866 866 static int
867 867 metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
868 868 {
869 869 metaslab_group_t *mg = msp->ms_group;
870 870 space_map_t *sm = &msp->ms_map;
871 871 space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;
872 872
873 873 ASSERT(MUTEX_HELD(&msp->ms_lock));
874 874
875 875 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
876 876 space_map_load_wait(sm);
877 877 if (!sm->sm_loaded) {
878 878 int error = space_map_load(sm, sm_ops, SM_FREE,
879 879 &msp->ms_smo,
880 880 spa_meta_objset(msp->ms_group->mg_vd->vdev_spa));
881 881 if (error) {
882 882 metaslab_group_sort(msp->ms_group, msp, 0);
883 883 return (error);
884 884 }
885 885 for (int t = 0; t < TXG_DEFER_SIZE; t++)
886 886 space_map_walk(&msp->ms_defermap[t],
887 887 space_map_claim, sm);
888 888
889 889 }
890 890
891 891 /*
892 892 * Track the bonus area as we activate new metaslabs.
893 893 */
894 894 if (sm->sm_start > mg->mg_bonus_area) {
895 895 mutex_enter(&mg->mg_lock);
896 896 mg->mg_bonus_area = sm->sm_start;
897 897 mutex_exit(&mg->mg_lock);
898 898 }
899 899
900 900 metaslab_group_sort(msp->ms_group, msp,
901 901 msp->ms_weight | activation_weight);
902 902 }
903 903 ASSERT(sm->sm_loaded);
904 904 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
905 905
906 906 return (0);
907 907 }
908 908
909 909 static void
910 910 metaslab_passivate(metaslab_t *msp, uint64_t size)
911 911 {
912 912 /*
913 913 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
914 914 * this metaslab again. In that case, it had better be empty,
915 915 * or we would be leaving space on the table.
916 916 */
917 917 ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0);
918 918 metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
919 919 ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
920 920 }
921 921
922 922 /*
923 923 * Write a metaslab to disk in the context of the specified transaction group.
924 924 */
925 925 void
926 926 metaslab_sync(metaslab_t *msp, uint64_t txg)
927 927 {
928 928 vdev_t *vd = msp->ms_group->mg_vd;
929 929 spa_t *spa = vd->vdev_spa;
930 930 objset_t *mos = spa_meta_objset(spa);
931 931 space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK];
932 932 space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK];
933 933 space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
934 934 space_map_t *sm = &msp->ms_map;
935 935 space_map_obj_t *smo = &msp->ms_smo_syncing;
936 936 dmu_buf_t *db;
937 937 dmu_tx_t *tx;
938 938
939 939 ASSERT(!vd->vdev_ishole);
940 940
941 941 if (allocmap->sm_space == 0 && freemap->sm_space == 0)
942 942 return;
943 943
944 944 /*
945 945 * The only state that can actually be changing concurrently with
946 946 * metaslab_sync() is the metaslab's ms_map. No other thread can
947 947 * be modifying this txg's allocmap, freemap, freed_map, or smo.
948 948 * Therefore, we only hold ms_lock to satisfy space_map ASSERTs.
949 949 * We drop it whenever we call into the DMU, because the DMU
950 950 * can call down to us (e.g. via zio_free()) at any time.
951 951 */
952 952
953 953 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
954 954
955 955 if (smo->smo_object == 0) {
956 956 ASSERT(smo->smo_objsize == 0);
957 957 ASSERT(smo->smo_alloc == 0);
958 958 smo->smo_object = dmu_object_alloc(mos,
959 959 DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
960 960 DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
961 961 ASSERT(smo->smo_object != 0);
962 962 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
963 963 (sm->sm_start >> vd->vdev_ms_shift),
964 964 sizeof (uint64_t), &smo->smo_object, tx);
965 965 }
966 966
967 967 mutex_enter(&msp->ms_lock);
968 968
969 969 space_map_walk(freemap, space_map_add, freed_map);
970 970
971 971 if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >=
972 972 2 * sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) {
973 973 /*
974 974 * The in-core space map representation is twice as compact
975 975 * as the on-disk one, so it's time to condense the latter
976 976 * by generating a pure allocmap from first principles.
977 977 *
978 978 * This metaslab is 100% allocated,
979 979 * minus the content of the in-core map (sm),
980 980 * minus what's been freed this txg (freed_map),
981 981 * minus deferred frees (ms_defermap[]),
982 982 * minus allocations from txgs in the future
983 983 * (because they haven't been committed yet).
984 984 */
985 985 space_map_vacate(allocmap, NULL, NULL);
986 986 space_map_vacate(freemap, NULL, NULL);
987 987
988 988 space_map_add(allocmap, allocmap->sm_start, allocmap->sm_size);
989 989
990 990 space_map_walk(sm, space_map_remove, allocmap);
991 991 space_map_walk(freed_map, space_map_remove, allocmap);
992 992
993 993 for (int t = 0; t < TXG_DEFER_SIZE; t++)
994 994 space_map_walk(&msp->ms_defermap[t],
995 995 space_map_remove, allocmap);
996 996
997 997 for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
998 998 space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK],
999 999 space_map_remove, allocmap);
1000 1000
1001 1001 mutex_exit(&msp->ms_lock);
1002 1002 space_map_truncate(smo, mos, tx);
1003 1003 mutex_enter(&msp->ms_lock);
1004 1004 }
1005 1005
1006 1006 space_map_sync(allocmap, SM_ALLOC, smo, mos, tx);
1007 1007 space_map_sync(freemap, SM_FREE, smo, mos, tx);
1008 1008
1009 1009 mutex_exit(&msp->ms_lock);
1010 1010
1011 1011 VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
1012 1012 dmu_buf_will_dirty(db, tx);
1013 1013 ASSERT3U(db->db_size, >=, sizeof (*smo));
1014 1014 bcopy(smo, db->db_data, sizeof (*smo));
1015 1015 dmu_buf_rele(db, FTAG);
1016 1016
1017 1017 dmu_tx_commit(tx);
1018 1018 }
1019 1019
1020 1020 /*
1021 1021 * Called after a transaction group has completely synced to mark
1022 1022 * all of the metaslab's free space as usable.
1023 1023 */
1024 1024 void
1025 1025 metaslab_sync_done(metaslab_t *msp, uint64_t txg)
1026 1026 {
1027 1027 space_map_obj_t *smo = &msp->ms_smo;
1028 1028 space_map_obj_t *smosync = &msp->ms_smo_syncing;
1029 1029 space_map_t *sm = &msp->ms_map;
1030 1030 space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
1031 1031 space_map_t *defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE];
1032 1032 metaslab_group_t *mg = msp->ms_group;
1033 1033 vdev_t *vd = mg->mg_vd;
1034 1034 int64_t alloc_delta, defer_delta;
1035 1035
1036 1036 ASSERT(!vd->vdev_ishole);
1037 1037
1038 1038 mutex_enter(&msp->ms_lock);
1039 1039
1040 1040 /*
1041 1041 * If this metaslab is just becoming available, initialize its
1042 1042 * allocmaps and freemaps and add its capacity to the vdev.
1043 1043 */
1044 1044 if (freed_map->sm_size == 0) {
1045 1045 for (int t = 0; t < TXG_SIZE; t++) {
1046 1046 space_map_create(&msp->ms_allocmap[t], sm->sm_start,
1047 1047 sm->sm_size, sm->sm_shift, sm->sm_lock);
1048 1048 space_map_create(&msp->ms_freemap[t], sm->sm_start,
1049 1049 sm->sm_size, sm->sm_shift, sm->sm_lock);
1050 1050 }
1051 1051
1052 1052 for (int t = 0; t < TXG_DEFER_SIZE; t++)
1053 1053 space_map_create(&msp->ms_defermap[t], sm->sm_start,
1054 1054 sm->sm_size, sm->sm_shift, sm->sm_lock);
1055 1055
1056 1056 vdev_space_update(vd, 0, 0, sm->sm_size);
1057 1057 }
1058 1058
1059 1059 alloc_delta = smosync->smo_alloc - smo->smo_alloc;
1060 1060 defer_delta = freed_map->sm_space - defer_map->sm_space;
1061 1061
1062 1062 vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
1063 1063
1064 1064 ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0);
1065 1065 ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0);
1066 1066
1067 1067 /*
1068 1068 * If there's a space_map_load() in progress, wait for it to complete
1069 1069 * so that we have a consistent view of the in-core space map.
1070 1070 * Then, add defer_map (oldest deferred frees) to this map and
1071 1071 * transfer freed_map (this txg's frees) to defer_map.
1072 1072 */
1073 1073 space_map_load_wait(sm);
1074 1074 space_map_vacate(defer_map, sm->sm_loaded ? space_map_free : NULL, sm);
1075 1075 space_map_vacate(freed_map, space_map_add, defer_map);
1076 1076
1077 1077 *smo = *smosync;
1078 1078
1079 1079 msp->ms_deferspace += defer_delta;
1080 1080 ASSERT3S(msp->ms_deferspace, >=, 0);
1081 1081 ASSERT3S(msp->ms_deferspace, <=, sm->sm_size);
1082 1082 if (msp->ms_deferspace != 0) {
1083 1083 /*
1084 1084 * Keep syncing this metaslab until all deferred frees
1085 1085 * are back in circulation.
1086 1086 */
1087 1087 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
1088 1088 }
1089 1089
1090 1090 /*
1091 1091 * If the map is loaded but no longer active, evict it as soon as all
1092 1092 * future allocations have synced. (If we unloaded it now and then
1093 1093 * loaded a moment later, the map wouldn't reflect those allocations.)
1094 1094 */
1095 1095 if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
1096 1096 int evictable = 1;
1097 1097
1098 1098 for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
1099 1099 if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space)
1100 1100 evictable = 0;
1101 1101
1102 1102 if (evictable && !metaslab_debug)
1103 1103 space_map_unload(sm);
1104 1104 }
1105 1105
1106 1106 metaslab_group_sort(mg, msp, metaslab_weight(msp));
1107 1107
1108 1108 mutex_exit(&msp->ms_lock);
1109 1109 }
1110 1110
1111 1111 void
1112 1112 metaslab_sync_reassess(metaslab_group_t *mg)
1113 1113 {
1114 1114 vdev_t *vd = mg->mg_vd;
1115 1115 int64_t failures = mg->mg_alloc_failures;
1116 1116
1117 1117 /*
1118 1118 * Re-evaluate all metaslabs which have lower offsets than the
1119 1119 * bonus area.
1120 1120 */
1121 1121 for (int m = 0; m < vd->vdev_ms_count; m++) {
1122 1122 metaslab_t *msp = vd->vdev_ms[m];
1123 1123
1124 1124 if (msp->ms_map.sm_start > mg->mg_bonus_area)
1125 1125 break;
1126 1126
1127 1127 mutex_enter(&msp->ms_lock);
1128 1128 metaslab_group_sort(mg, msp, metaslab_weight(msp));
1129 1129 mutex_exit(&msp->ms_lock);
1130 1130 }
1131 1131
1132 1132 atomic_add_64(&mg->mg_alloc_failures, -failures);
1133 1133
1134 1134 /*
1135 1135 * Prefetch the next potential metaslabs
1136 1136 */
1137 1137 metaslab_prefetch(mg);
1138 1138 }
1139 1139
1140 1140 static uint64_t
1141 1141 metaslab_distance(metaslab_t *msp, dva_t *dva)
1142 1142 {
1143 1143 uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
1144 1144 uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
1145 1145 uint64_t start = msp->ms_map.sm_start >> ms_shift;
1146 1146
1147 1147 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
1148 1148 return (1ULL << 63);
1149 1149
1150 1150 if (offset < start)
1151 1151 return ((start - offset) << ms_shift);
1152 1152 if (offset > start)
1153 1153 return ((offset - start) << ms_shift);
1154 1154 return (0);
1155 1155 }
1156 1156
1157 1157 static uint64_t
1158 1158 metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
1159 1159 uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags)
1160 1160 {
1161 1161 spa_t *spa = mg->mg_vd->vdev_spa;
1162 1162 metaslab_t *msp = NULL;
1163 1163 uint64_t offset = -1ULL;
1164 1164 avl_tree_t *t = &mg->mg_metaslab_tree;
1165 1165 uint64_t activation_weight;
1166 1166 uint64_t target_distance;
1167 1167 int i;
1168 1168
1169 1169 activation_weight = METASLAB_WEIGHT_PRIMARY;
1170 1170 for (i = 0; i < d; i++) {
1171 1171 if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
1172 1172 activation_weight = METASLAB_WEIGHT_SECONDARY;
1173 1173 break;
1174 1174 }
1175 1175 }
1176 1176
1177 1177 for (;;) {
1178 1178 boolean_t was_active;
1179 1179
1180 1180 mutex_enter(&mg->mg_lock);
1181 1181 for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
1182 1182 if (msp->ms_weight < asize) {
1183 1183 spa_dbgmsg(spa, "%s: failed to meet weight "
1184 1184 "requirement: vdev %llu, txg %llu, mg %p, "
1185 1185 "msp %p, psize %llu, asize %llu, "
1186 1186 "failures %llu, weight %llu",
1187 1187 spa_name(spa), mg->mg_vd->vdev_id, txg,
1188 1188 mg, msp, psize, asize,
1189 1189 mg->mg_alloc_failures, msp->ms_weight);
1190 1190 mutex_exit(&mg->mg_lock);
1191 1191 return (-1ULL);
1192 1192 }
1193 1193 was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
1194 1194 if (activation_weight == METASLAB_WEIGHT_PRIMARY)
1195 1195 break;
1196 1196
1197 1197 target_distance = min_distance +
1198 1198 (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1);
1199 1199
1200 1200 for (i = 0; i < d; i++)
1201 1201 if (metaslab_distance(msp, &dva[i]) <
1202 1202 target_distance)
1203 1203 break;
1204 1204 if (i == d)
1205 1205 break;
1206 1206 }
1207 1207 mutex_exit(&mg->mg_lock);
1208 1208 if (msp == NULL)
1209 1209 return (-1ULL);
1210 1210
1211 1211 /*
1212 1212 * If we've already reached the allowable number of failed
1213 1213 * allocation attempts on this metaslab group then we
1214 1214 * consider skipping it. We skip it only if we're allowed
1215 1215 * to "fast" gang, the physical size is larger than
1216 1216 * a gang block, and we're attempting to allocate from
1217 1217 * the primary metaslab.
1218 1218 */
1219 1219 if (mg->mg_alloc_failures > zfs_mg_alloc_failures &&
1220 1220 CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE &&
1221 1221 activation_weight == METASLAB_WEIGHT_PRIMARY) {
1222 1222 spa_dbgmsg(spa, "%s: skipping metaslab group: "
1223 1223 "vdev %llu, txg %llu, mg %p, psize %llu, "
1224 1224 "asize %llu, failures %llu", spa_name(spa),
1225 1225 mg->mg_vd->vdev_id, txg, mg, psize, asize,
1226 1226 mg->mg_alloc_failures);
1227 1227 return (-1ULL);
1228 1228 }
1229 1229
1230 1230 mutex_enter(&msp->ms_lock);
1231 1231
1232 1232 /*
1233 1233 * Ensure that the metaslab we have selected is still
1234 1234 * capable of handling our request. It's possible that
1235 1235 * another thread may have changed the weight while we
1236 1236 * were blocked on the metaslab lock.
1237 1237 */
1238 1238 if (msp->ms_weight < asize || (was_active &&
1239 1239 !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
1240 1240 activation_weight == METASLAB_WEIGHT_PRIMARY)) {
1241 1241 mutex_exit(&msp->ms_lock);
1242 1242 continue;
1243 1243 }
1244 1244
1245 1245 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
1246 1246 activation_weight == METASLAB_WEIGHT_PRIMARY) {
1247 1247 metaslab_passivate(msp,
1248 1248 msp->ms_weight & ~METASLAB_ACTIVE_MASK);
1249 1249 mutex_exit(&msp->ms_lock);
1250 1250 continue;
1251 1251 }
1252 1252
1253 1253 if (metaslab_activate(msp, activation_weight) != 0) {
1254 1254 mutex_exit(&msp->ms_lock);
1255 1255 continue;
1256 1256 }
1257 1257
1258 1258 if ((offset = space_map_alloc(&msp->ms_map, asize)) != -1ULL)
1259 1259 break;
1260 1260
1261 1261 atomic_inc_64(&mg->mg_alloc_failures);
1262 1262
1263 1263 metaslab_passivate(msp, space_map_maxsize(&msp->ms_map));
1264 1264
1265 1265 mutex_exit(&msp->ms_lock);
1266 1266 }
1267 1267
1268 1268 if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
1269 1269 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
1270 1270
1271 1271 space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, asize);
1272 1272
1273 1273 mutex_exit(&msp->ms_lock);
1274 1274
1275 1275 return (offset);
1276 1276 }
1277 1277
1278 1278 /*
1279 1279 * Allocate a block for the specified i/o.
1280 1280 */
1281 1281 static int
1282 1282 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
1283 1283 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
1284 1284 {
1285 1285 metaslab_group_t *mg, *rotor;
1286 1286 vdev_t *vd;
1287 1287 int dshift = 3;
1288 1288 int all_zero;
1289 1289 int zio_lock = B_FALSE;
1290 1290 boolean_t allocatable;
1291 1291 uint64_t offset = -1ULL;
1292 1292 uint64_t asize;
1293 1293 uint64_t distance;
1294 1294
1295 1295 ASSERT(!DVA_IS_VALID(&dva[d]));
1296 1296
1297 1297 /*
1298 1298 * For testing, make some blocks above a certain size be gang blocks.
1299 1299 */
1300 1300 if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0)
1301 1301 return (ENOSPC);
1302 1302
1303 1303 /*
1304 1304 * Start at the rotor and loop through all mgs until we find something.
1305 1305 * Note that there's no locking on mc_rotor or mc_aliquot because
1306 1306 * nothing actually breaks if we miss a few updates -- we just won't
1307 1307 * allocate quite as evenly. It all balances out over time.
1308 1308 *
1309 1309 * If we are doing ditto or log blocks, try to spread them across
1310 1310 * consecutive vdevs. If we're forced to reuse a vdev before we've
1311 1311 * allocated all of our ditto blocks, then try and spread them out on
1312 1312 * that vdev as much as possible. If it turns out to not be possible,
1313 1313 * gradually lower our standards until anything becomes acceptable.
1314 1314 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
1315 1315 * gives us hope of containing our fault domains to something we're
1316 1316 * able to reason about. Otherwise, any two top-level vdev failures
1317 1317 * will guarantee the loss of data. With consecutive allocation,
1318 1318 * only two adjacent top-level vdev failures will result in data loss.
1319 1319 *
1320 1320 * If we are doing gang blocks (hintdva is non-NULL), try to keep
1321 1321 * ourselves on the same vdev as our gang block header. That
1322 1322 * way, we can hope for locality in vdev_cache, plus it makes our
1323 1323 * fault domains something tractable.
1324 1324 */
1325 1325 if (hintdva) {
1326 1326 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
1327 1327
1328 1328 /*
1329 1329 * It's possible the vdev we're using as the hint no
1330 1330 * longer exists (i.e. removed). Consult the rotor when
1331 1331 * all else fails.
1332 1332 */
1333 1333 if (vd != NULL) {
1334 1334 mg = vd->vdev_mg;
1335 1335
1336 1336 if (flags & METASLAB_HINTBP_AVOID &&
1337 1337 mg->mg_next != NULL)
1338 1338 mg = mg->mg_next;
1339 1339 } else {
1340 1340 mg = mc->mc_rotor;
1341 1341 }
1342 1342 } else if (d != 0) {
1343 1343 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
1344 1344 mg = vd->vdev_mg->mg_next;
1345 1345 } else {
1346 1346 mg = mc->mc_rotor;
1347 1347 }
1348 1348
1349 1349 /*
1350 1350 * If the hint put us into the wrong metaslab class, or into a
1351 1351 * metaslab group that has been passivated, just follow the rotor.
1352 1352 */
1353 1353 if (mg->mg_class != mc || mg->mg_activation_count <= 0)
1354 1354 mg = mc->mc_rotor;
1355 1355
1356 1356 rotor = mg;
1357 1357 top:
1358 1358 all_zero = B_TRUE;
1359 1359 do {
1360 1360 ASSERT(mg->mg_activation_count == 1);
1361 1361
1362 1362 vd = mg->mg_vd;
1363 1363
1364 1364 /*
1365 1365 * Don't allocate from faulted devices.
1366 1366 */
1367 1367 if (zio_lock) {
1368 1368 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
1369 1369 allocatable = vdev_allocatable(vd);
1370 1370 spa_config_exit(spa, SCL_ZIO, FTAG);
1371 1371 } else {
1372 1372 allocatable = vdev_allocatable(vd);
1373 1373 }
1374 1374 if (!allocatable)
1375 1375 goto next;
1376 1376
1377 1377 /*
1378 1378 * Avoid writing single-copy data to a failing vdev
1379 1379 */
1380 1380 if ((vd->vdev_stat.vs_write_errors > 0 ||
1381 1381 vd->vdev_state < VDEV_STATE_HEALTHY) &&
1382 1382 d == 0 && dshift == 3) {
1383 1383 all_zero = B_FALSE;
1384 1384 goto next;
1385 1385 }
1386 1386
1387 1387 ASSERT(mg->mg_class == mc);
1388 1388
1389 1389 distance = vd->vdev_asize >> dshift;
1390 1390 if (distance <= (1ULL << vd->vdev_ms_shift))
1391 1391 distance = 0;
1392 1392 else
1393 1393 all_zero = B_FALSE;
1394 1394
1395 1395 asize = vdev_psize_to_asize(vd, psize);
1396 1396 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
1397 1397
1398 1398 offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
1399 1399 dva, d, flags);
1400 1400 if (offset != -1ULL) {
1401 1401 /*
1402 1402 * If we've just selected this metaslab group,
1403 1403 * figure out whether the corresponding vdev is
1404 1404 * over- or under-used relative to the pool,
1405 1405 * and set an allocation bias to even it out.
1406 1406 */
1407 1407 if (mc->mc_aliquot == 0) {
1408 1408 vdev_stat_t *vs = &vd->vdev_stat;
1409 1409 int64_t vu, cu;
1410 1410
1411 1411 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
1412 1412 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
1413 1413
1414 1414 /*
1415 1415 * Calculate how much more or less we should
1416 1416 * try to allocate from this device during
1417 1417 * this iteration around the rotor.
1418 1418 * For example, if a device is 80% full
1419 1419 * and the pool is 20% full then we should
1420 1420 * reduce allocations by 60% on this device.
1421 1421 *
1422 1422 * mg_bias = (20 - 80) * 512K / 100 = -307K
1423 1423 *
1424 1424 * This reduces allocations by 307K for this
1425 1425 * iteration.
1426 1426 */
1427 1427 mg->mg_bias = ((cu - vu) *
1428 1428 (int64_t)mg->mg_aliquot) / 100;
1429 1429 }
1430 1430
1431 1431 if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
1432 1432 mg->mg_aliquot + mg->mg_bias) {
1433 1433 mc->mc_rotor = mg->mg_next;
1434 1434 mc->mc_aliquot = 0;
1435 1435 }
1436 1436
1437 1437 DVA_SET_VDEV(&dva[d], vd->vdev_id);
1438 1438 DVA_SET_OFFSET(&dva[d], offset);
1439 1439 DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
1440 1440 DVA_SET_ASIZE(&dva[d], asize);
1441 1441
1442 1442 return (0);
1443 1443 }
1444 1444 next:
1445 1445 mc->mc_rotor = mg->mg_next;
1446 1446 mc->mc_aliquot = 0;
1447 1447 } while ((mg = mg->mg_next) != rotor);
1448 1448
1449 1449 if (!all_zero) {
1450 1450 dshift++;
1451 1451 ASSERT(dshift < 64);
1452 1452 goto top;
1453 1453 }
1454 1454
1455 1455 if (!allocatable && !zio_lock) {
1456 1456 dshift = 3;
1457 1457 zio_lock = B_TRUE;
1458 1458 goto top;
1459 1459 }
1460 1460
1461 1461 bzero(&dva[d], sizeof (dva_t));
1462 1462
1463 1463 return (ENOSPC);
1464 1464 }
1465 1465
1466 1466 /*
1467 1467 * Free the block represented by DVA in the context of the specified
1468 1468 * transaction group.
1469 1469 */
1470 1470 static void
1471 1471 metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
1472 1472 {
1473 1473 uint64_t vdev = DVA_GET_VDEV(dva);
1474 1474 uint64_t offset = DVA_GET_OFFSET(dva);
1475 1475 uint64_t size = DVA_GET_ASIZE(dva);
1476 1476 vdev_t *vd;
1477 1477 metaslab_t *msp;
1478 1478
1479 1479 ASSERT(DVA_IS_VALID(dva));
1480 1480
1481 1481 if (txg > spa_freeze_txg(spa))
1482 1482 return;
1483 1483
1484 1484 if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
1485 1485 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
1486 1486 cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
1487 1487 (u_longlong_t)vdev, (u_longlong_t)offset);
1488 1488 ASSERT(0);
1489 1489 return;
1490 1490 }
1491 1491
1492 1492 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
1493 1493
1494 1494 if (DVA_GET_GANG(dva))
1495 1495 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
1496 1496
1497 1497 mutex_enter(&msp->ms_lock);
1498 1498
1499 1499 if (now) {
1500 1500 space_map_remove(&msp->ms_allocmap[txg & TXG_MASK],
1501 1501 offset, size);
1502 1502 space_map_free(&msp->ms_map, offset, size);
1503 1503 } else {
1504 1504 if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0)
1505 1505 vdev_dirty(vd, VDD_METASLAB, msp, txg);
1506 1506 space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size);
1507 1507 }
1508 1508
1509 1509 mutex_exit(&msp->ms_lock);
1510 1510 }
1511 1511
1512 1512 /*
1513 1513 * Intent log support: upon opening the pool after a crash, notify the SPA
1514 1514 * of blocks that the intent log has allocated for immediate write, but
1515 1515 * which are still considered free by the SPA because the last transaction
1516 1516 * group didn't commit yet.
1517 1517 */
1518 1518 static int
1519 1519 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
1520 1520 {
1521 1521 uint64_t vdev = DVA_GET_VDEV(dva);
1522 1522 uint64_t offset = DVA_GET_OFFSET(dva);
1523 1523 uint64_t size = DVA_GET_ASIZE(dva);
1524 1524 vdev_t *vd;
1525 1525 metaslab_t *msp;
1526 1526 int error = 0;
1527 1527
1528 1528 ASSERT(DVA_IS_VALID(dva));
1529 1529
1530 1530 if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
1531 1531 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
1532 1532 return (ENXIO);
1533 1533
1534 1534 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
1535 1535
1536 1536 if (DVA_GET_GANG(dva))
1537 1537 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
1538 1538
1539 1539 mutex_enter(&msp->ms_lock);
1540 1540
1541 1541 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded)
1542 1542 error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
1543 1543
1544 1544 if (error == 0 && !space_map_contains(&msp->ms_map, offset, size))
1545 1545 error = ENOENT;
1546 1546
1547 1547 if (error || txg == 0) { /* txg == 0 indicates dry run */
1548 1548 mutex_exit(&msp->ms_lock);
1549 1549 return (error);
1550 1550 }
1551 1551
1552 1552 space_map_claim(&msp->ms_map, offset, size);
1553 1553
1554 1554 if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */
1555 1555 if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
1556 1556 vdev_dirty(vd, VDD_METASLAB, msp, txg);
1557 1557 space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
1558 1558 }
1559 1559
1560 1560 mutex_exit(&msp->ms_lock);
1561 1561
1562 1562 return (0);
1563 1563 }
1564 1564
1565 1565 int
1566 1566 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
1567 1567 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
1568 1568 {
1569 1569 dva_t *dva = bp->blk_dva;
1570 1570 dva_t *hintdva = hintbp->blk_dva;
1571 1571 int error = 0;
1572 1572
1573 1573 ASSERT(bp->blk_birth == 0);
1574 1574 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
1575 1575
1576 1576 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
1577 1577
1578 1578 if (mc->mc_rotor == NULL) { /* no vdevs in this class */
1579 1579 spa_config_exit(spa, SCL_ALLOC, FTAG);
1580 1580 return (ENOSPC);
1581 1581 }
1582 1582
1583 1583 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
1584 1584 ASSERT(BP_GET_NDVAS(bp) == 0);
1585 1585 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
1586 1586
1587 1587 for (int d = 0; d < ndvas; d++) {
1588 1588 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
1589 1589 txg, flags);
1590 1590 if (error) {
1591 1591 for (d--; d >= 0; d--) {
1592 1592 metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
1593 1593 bzero(&dva[d], sizeof (dva_t));
1594 1594 }
1595 1595 spa_config_exit(spa, SCL_ALLOC, FTAG);
1596 1596 return (error);
1597 1597 }
1598 1598 }
1599 1599 ASSERT(error == 0);
1600 1600 ASSERT(BP_GET_NDVAS(bp) == ndvas);
1601 1601
1602 1602 spa_config_exit(spa, SCL_ALLOC, FTAG);
1603 1603
1604 1604 BP_SET_BIRTH(bp, txg, txg);
1605 1605
1606 1606 return (0);
1607 1607 }
1608 1608
1609 1609 void
1610 1610 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
1611 1611 {
1612 1612 const dva_t *dva = bp->blk_dva;
1613 1613 int ndvas = BP_GET_NDVAS(bp);
1614 1614
1615 1615 ASSERT(!BP_IS_HOLE(bp));
1616 1616 ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
1617 1617
1618 1618 spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
1619 1619
1620 1620 for (int d = 0; d < ndvas; d++)
1621 1621 metaslab_free_dva(spa, &dva[d], txg, now);
1622 1622
1623 1623 spa_config_exit(spa, SCL_FREE, FTAG);
1624 1624 }
1625 1625
1626 1626 int
1627 1627 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
1628 1628 {
1629 1629 const dva_t *dva = bp->blk_dva;
1630 1630 int ndvas = BP_GET_NDVAS(bp);
1631 1631 int error = 0;
1632 1632
1633 1633 ASSERT(!BP_IS_HOLE(bp));
1634 1634
1635 1635 if (txg != 0) {
1636 1636 /*
1637 1637 * First do a dry run to make sure all DVAs are claimable,
1638 1638 * so we don't have to unwind from partial failures below.
1639 1639 */
1640 1640 if ((error = metaslab_claim(spa, bp, 0)) != 0)
1641 1641 return (error);
1642 1642 }
1643 1643
1644 1644 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
1645 1645
1646 1646 for (int d = 0; d < ndvas; d++)
1647 1647 if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
1648 1648 break;
1649 1649
1650 1650 spa_config_exit(spa, SCL_ALLOC, FTAG);
1651 1651
1652 1652 ASSERT(error == 0 || txg == 0);
1653 1653
1654 1654 return (error);
1655 1655 }