41 * to "fast" gang.
42 */
43 #define CAN_FASTGANG(flags) \
44 (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
45 METASLAB_GANG_AVOID)))
46
47 uint64_t metaslab_aliquot = 512ULL << 10;
48 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
49
50 /*
51 * The in-core space map representation is more compact than its on-disk form.
52 * The zfs_condense_pct determines how much more compact the in-core
53 * space_map representation must be before we compact it on-disk.
54 * Values should be greater than or equal to 100.
55 */
56 int zfs_condense_pct = 200;
57
58 /*
59 * This value defines the number of allowed allocation failures per vdev.
60 * If a device reaches this threshold in a given txg then we consider skipping
61 * allocations on that device.
62 */
63 int zfs_mg_alloc_failures;
64
65 /*
66 * Metaslab debugging: when set, keeps all space maps in core to verify frees.
67 */
68 static int metaslab_debug = 0;
69
70 /*
71 * Minimum size which forces the dynamic allocator to change
72	 * its allocation strategy. Once the space map cannot satisfy
73	 * an allocation of this size then it switches to using a more
74	 * aggressive strategy (i.e. search by size rather than offset).
75 */
76 uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;
77
78 /*
79 * The minimum free space, in percent, which must be available
80 * in a space map to continue allocations in a first-fit fashion.
81 * Once the space_map's free space drops below this level we dynamically
82 * switch to using best-fit allocations.
83 */
84 int metaslab_df_free_pct = 4;
85
207 const metaslab_t *m2 = x2;
208
209 if (m1->ms_weight < m2->ms_weight)
210 return (1);
211 if (m1->ms_weight > m2->ms_weight)
212 return (-1);
213
214 /*
215 * If the weights are identical, use the offset to force uniqueness.
216 */
217 if (m1->ms_map->sm_start < m2->ms_map->sm_start)
218 return (-1);
219 if (m1->ms_map->sm_start > m2->ms_map->sm_start)
220 return (1);
221
222 ASSERT3P(m1, ==, m2);
223
224 return (0);
225 }
226
227 metaslab_group_t *
228 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
229 {
230 metaslab_group_t *mg;
231
232 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
233 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
234 avl_create(&mg->mg_metaslab_tree, metaslab_compare,
235 sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
236 mg->mg_vd = vd;
237 mg->mg_class = mc;
238 mg->mg_activation_count = 0;
239
240 return (mg);
241 }
242
243 void
244 metaslab_group_destroy(metaslab_group_t *mg)
245 {
246 ASSERT(mg->mg_prev == NULL);
257 kmem_free(mg, sizeof (metaslab_group_t));
258 }
259
260 void
261 metaslab_group_activate(metaslab_group_t *mg)
262 {
263 metaslab_class_t *mc = mg->mg_class;
264 metaslab_group_t *mgprev, *mgnext;
265
266 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
267
268 ASSERT(mc->mc_rotor != mg);
269 ASSERT(mg->mg_prev == NULL);
270 ASSERT(mg->mg_next == NULL);
271 ASSERT(mg->mg_activation_count <= 0);
272
273 if (++mg->mg_activation_count <= 0)
274 return;
275
276 mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
277
278 if ((mgprev = mc->mc_rotor) == NULL) {
279 mg->mg_prev = mg;
280 mg->mg_next = mg;
281 } else {
282 mgnext = mgprev->mg_next;
283 mg->mg_prev = mgprev;
284 mg->mg_next = mgnext;
285 mgprev->mg_next = mg;
286 mgnext->mg_prev = mg;
287 }
288 mc->mc_rotor = mg;
289 }
290
291 void
292 metaslab_group_passivate(metaslab_group_t *mg)
293 {
294 metaslab_class_t *mc = mg->mg_class;
295 metaslab_group_t *mgprev, *mgnext;
296
342
343 static void
344 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
345 {
346 /*
347 * Although in principle the weight can be any value, in
348 * practice we do not use values in the range [1, 510].
349 */
350 ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
351 ASSERT(MUTEX_HELD(&msp->ms_lock));
352
353 mutex_enter(&mg->mg_lock);
354 ASSERT(msp->ms_group == mg);
355 avl_remove(&mg->mg_metaslab_tree, msp);
356 msp->ms_weight = weight;
357 avl_add(&mg->mg_metaslab_tree, msp);
358 mutex_exit(&mg->mg_lock);
359 }
360
361 /*
362 * ==========================================================================
363 * Common allocator routines
364 * ==========================================================================
365 */
366 static int
367 metaslab_segsize_compare(const void *x1, const void *x2)
368 {
369 const space_seg_t *s1 = x1;
370 const space_seg_t *s2 = x2;
371 uint64_t ss_size1 = s1->ss_end - s1->ss_start;
372 uint64_t ss_size2 = s2->ss_end - s2->ss_start;
373
374 if (ss_size1 < ss_size2)
375 return (-1);
376 if (ss_size1 > ss_size2)
377 return (1);
378
379 if (s1->ss_start < s2->ss_start)
380 return (-1);
381 if (s1->ss_start > s2->ss_start)
1290
1291 for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
1292 if (msp->ms_allocmap[(txg + t) & TXG_MASK]->sm_space)
1293 evictable = 0;
1294
1295 if (evictable && !metaslab_debug)
1296 space_map_unload(sm);
1297 }
1298
1299 metaslab_group_sort(mg, msp, metaslab_weight(msp));
1300
1301 mutex_exit(&msp->ms_lock);
1302 }
1303
1304 void
1305 metaslab_sync_reassess(metaslab_group_t *mg)
1306 {
1307 vdev_t *vd = mg->mg_vd;
1308 int64_t failures = mg->mg_alloc_failures;
1309
1310 /*
1311 * Re-evaluate all metaslabs which have lower offsets than the
1312 * bonus area.
1313 */
1314 for (int m = 0; m < vd->vdev_ms_count; m++) {
1315 metaslab_t *msp = vd->vdev_ms[m];
1316
1317 if (msp->ms_map->sm_start > mg->mg_bonus_area)
1318 break;
1319
1320 mutex_enter(&msp->ms_lock);
1321 metaslab_group_sort(mg, msp, metaslab_weight(msp));
1322 mutex_exit(&msp->ms_lock);
1323 }
1324
1325 atomic_add_64(&mg->mg_alloc_failures, -failures);
1326
1327 /*
1328 * Prefetch the next potential metaslabs
1329 */
1391 continue;
1392
1393 was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
1394 if (activation_weight == METASLAB_WEIGHT_PRIMARY)
1395 break;
1396
1397 target_distance = min_distance +
1398 (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1);
1399
1400 for (i = 0; i < d; i++)
1401 if (metaslab_distance(msp, &dva[i]) <
1402 target_distance)
1403 break;
1404 if (i == d)
1405 break;
1406 }
1407 mutex_exit(&mg->mg_lock);
1408 if (msp == NULL)
1409 return (-1ULL);
1410
1411 /*
1412 * If we've already reached the allowable number of failed
1413 * allocation attempts on this metaslab group then we
1414 * consider skipping it. We skip it only if we're allowed
1415 * to "fast" gang, the physical size is larger than
1416 * a gang block, and we're attempting to allocate from
1417 * the primary metaslab.
1418 */
1419 if (mg->mg_alloc_failures > zfs_mg_alloc_failures &&
1420 CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE &&
1421 activation_weight == METASLAB_WEIGHT_PRIMARY) {
1422 spa_dbgmsg(spa, "%s: skipping metaslab group: "
1423 "vdev %llu, txg %llu, mg %p, psize %llu, "
1424 "asize %llu, failures %llu", spa_name(spa),
1425 mg->mg_vd->vdev_id, txg, mg, psize, asize,
1426 mg->mg_alloc_failures);
1427 return (-1ULL);
1428 }
1429
1430 mutex_enter(&msp->ms_lock);
1431
1432 /*
1433 * Ensure that the metaslab we have selected is still
1434 * capable of handling our request. It's possible that
1435 * another thread may have changed the weight while we
1436 * were blocked on the metaslab lock.
1437 */
1438 if (msp->ms_weight < asize || (was_active &&
1439 !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
1440 activation_weight == METASLAB_WEIGHT_PRIMARY)) {
1441 mutex_exit(&msp->ms_lock);
1442 continue;
1443 }
1444
1445 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
1446 activation_weight == METASLAB_WEIGHT_PRIMARY) {
1447 metaslab_passivate(msp,
1448 msp->ms_weight & ~METASLAB_ACTIVE_MASK);
1449 mutex_exit(&msp->ms_lock);
1450 continue;
1451 }
1564 mg = mc->mc_rotor;
1565
1566 rotor = mg;
1567 top:
1568 all_zero = B_TRUE;
1569 do {
1570 ASSERT(mg->mg_activation_count == 1);
1571
1572 vd = mg->mg_vd;
1573
1574 /*
1575 * Don't allocate from faulted devices.
1576 */
1577 if (zio_lock) {
1578 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
1579 allocatable = vdev_allocatable(vd);
1580 spa_config_exit(spa, SCL_ZIO, FTAG);
1581 } else {
1582 allocatable = vdev_allocatable(vd);
1583 }
1584 if (!allocatable)
1585 goto next;
1586
1587 /*
1588 * Avoid writing single-copy data to a failing vdev
1589 * unless the user instructs us that it is okay.
1590 */
1591 if ((vd->vdev_stat.vs_write_errors > 0 ||
1592 vd->vdev_state < VDEV_STATE_HEALTHY) &&
1593 d == 0 && dshift == 3 &&
1594 !(zfs_write_to_degraded && vd->vdev_state ==
1595 VDEV_STATE_DEGRADED)) {
1596 all_zero = B_FALSE;
1597 goto next;
1598 }
1599
1600 ASSERT(mg->mg_class == mc);
1601
1602 distance = vd->vdev_asize >> dshift;
1603 if (distance <= (1ULL << vd->vdev_ms_shift))
|
41 * to "fast" gang.
42 */
43 #define CAN_FASTGANG(flags) \
44 (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
45 METASLAB_GANG_AVOID)))
46
47 uint64_t metaslab_aliquot = 512ULL << 10;
48 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
49
50 /*
51 * The in-core space map representation is more compact than its on-disk form.
52 * The zfs_condense_pct determines how much more compact the in-core
53 * space_map representation must be before we compact it on-disk.
54 * Values should be greater than or equal to 100.
55 */
56 int zfs_condense_pct = 200;
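/*
 * Illustrative sketch, not the actual condense test: a check of this
 * shape would fire once the on-disk representation is at least
 * zfs_condense_pct/100 times the in-core size, i.e. with the default
 * of 200 the on-disk space map must be at least twice as large as its
 * in-core form before it is worth condensing on disk.
 */
static boolean_t
example_should_condense(uint64_t incore_size, uint64_t ondisk_size)
{
	return (ondisk_size >= incore_size * zfs_condense_pct / 100);
}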
57
58 /*
59 * This value defines the number of allowed allocation failures per vdev.
60 * If a device reaches this threshold in a given txg then we consider skipping
61 * allocations on that device. The value of zfs_mg_alloc_failures is computed
62 * in zio_init() unless it has been overridden in /etc/system.
63 */
64 int zfs_mg_alloc_failures = 0;
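/*
 * For example, an administrator could override the computed default
 * from /etc/system with a line such as the following (the value is
 * purely illustrative):
 *
 *	set zfs:zfs_mg_alloc_failures = 10
 */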
65
66 /*
67 * The zfs_mg_noalloc_threshold defines which metaslab groups should
68 * be eligible for allocation. The value is defined as a percentage of
69	 * free space. Metaslab groups that have more free space than
70 * zfs_mg_noalloc_threshold are always eligible for allocations. Once
71 * a metaslab group's free space is less than or equal to the
72 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
73 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
74 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
75 * groups are allowed to accept allocations. Gang blocks are always
76 * eligible to allocate on any metaslab group. The default value of 0 means
77 * no metaslab group will be excluded based on this criterion.
78 */
79 int zfs_mg_noalloc_threshold = 0;
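/*
 * Worked example (values illustrative): with zfs_mg_noalloc_threshold
 * set to 10, a group sitting at 8% free is skipped for normal-class
 * allocations as long as some other group is still above 10% free.
 * Once every group in the pool has dropped to 10% free or below, all
 * groups accept allocations again; gang block allocations are never
 * restricted by this check. The bookkeeping lives in
 * metaslab_group_alloc_update() and metaslab_group_allocatable() below.
 */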
80
81 /*
82 * Metaslab debugging: when set, keeps all space maps in core to verify frees.
83 */
84 static int metaslab_debug = 0;
85
86 /*
87 * Minimum size which forces the dynamic allocator to change
88	 * its allocation strategy. Once the space map cannot satisfy
89	 * an allocation of this size then it switches to using a more
90	 * aggressive strategy (i.e. search by size rather than offset).
91 */
92 uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;
93
94 /*
95 * The minimum free space, in percent, which must be available
96 * in a space map to continue allocations in a first-fit fashion.
97 * Once the space_map's free space drops below this level we dynamically
98 * switch to using best-fit allocations.
99 */
100 int metaslab_df_free_pct = 4;
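/*
 * Illustrative sketch, not the actual allocator code: the dynamic (df)
 * allocator keeps using its offset-ordered, first-fit search until
 * either the largest free segment can no longer satisfy
 * metaslab_df_alloc_threshold or the space map's free space falls
 * below metaslab_df_free_pct, at which point it switches to searching
 * by segment size (best-fit).
 */
static boolean_t
example_df_should_switch(uint64_t max_free_seg, uint64_t free_space,
    uint64_t total_space)
{
	int free_pct = free_space * 100 / total_space;

	return (max_free_seg < metaslab_df_alloc_threshold ||
	    free_pct < metaslab_df_free_pct);
}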
101
223 const metaslab_t *m2 = x2;
224
225 if (m1->ms_weight < m2->ms_weight)
226 return (1);
227 if (m1->ms_weight > m2->ms_weight)
228 return (-1);
229
230 /*
231 * If the weights are identical, use the offset to force uniqueness.
232 */
233 if (m1->ms_map->sm_start < m2->ms_map->sm_start)
234 return (-1);
235 if (m1->ms_map->sm_start > m2->ms_map->sm_start)
236 return (1);
237
238 ASSERT3P(m1, ==, m2);
239
240 return (0);
241 }
242
243 /*
244 * Update the allocatable flag and the metaslab group's capacity.
245	 * The allocatable flag is set to true if the group's free capacity
246	 * is above the zfs_mg_noalloc_threshold. If a metaslab group transitions
247 * from allocatable to non-allocatable or vice versa then the metaslab
248 * group's class is updated to reflect the transition.
249 */
250 static void
251 metaslab_group_alloc_update(metaslab_group_t *mg)
252 {
253 vdev_t *vd = mg->mg_vd;
254 metaslab_class_t *mc = mg->mg_class;
255 vdev_stat_t *vs = &vd->vdev_stat;
256 boolean_t was_allocatable;
257
258 ASSERT(vd == vd->vdev_top);
259
260 mutex_enter(&mg->mg_lock);
261 was_allocatable = mg->mg_allocatable;
262
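	/*
	 * Free capacity expressed as a percentage of the vdev's total
	 * space; the +1 in the divisor avoids a divide-by-zero while
	 * vs_space is still zero.
	 */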
263 mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
264 (vs->vs_space + 1);
265
266 mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold);
267
268 /*
269	 * The mc_alloc_groups field maintains a count of the number of
270 * groups in this metaslab class that are still above the
271 * zfs_mg_noalloc_threshold. This is used by the allocating
272 * threads to determine if they should avoid allocations to
273 * a given group. The allocator will avoid allocations to a group
274 * if that group has reached or is below the zfs_mg_noalloc_threshold
275 * and there are still other groups that are above the threshold.
276 * When a group transitions from allocatable to non-allocatable or
277 * vice versa we update the metaslab class to reflect that change.
278 * When the mc_alloc_groups value drops to 0 that means that all
279 * groups have reached the zfs_mg_noalloc_threshold making all groups
280 * eligible for allocations. This effectively means that all devices
281 * are balanced again.
282 */
283 if (was_allocatable && !mg->mg_allocatable)
284 mc->mc_alloc_groups--;
285 else if (!was_allocatable && mg->mg_allocatable)
286 mc->mc_alloc_groups++;
287 mutex_exit(&mg->mg_lock);
288 }
289
290 metaslab_group_t *
291 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
292 {
293 metaslab_group_t *mg;
294
295 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
296 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
297 avl_create(&mg->mg_metaslab_tree, metaslab_compare,
298 sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
299 mg->mg_vd = vd;
300 mg->mg_class = mc;
301 mg->mg_activation_count = 0;
302
303 return (mg);
304 }
305
306 void
307 metaslab_group_destroy(metaslab_group_t *mg)
308 {
309 ASSERT(mg->mg_prev == NULL);
320 kmem_free(mg, sizeof (metaslab_group_t));
321 }
322
323 void
324 metaslab_group_activate(metaslab_group_t *mg)
325 {
326 metaslab_class_t *mc = mg->mg_class;
327 metaslab_group_t *mgprev, *mgnext;
328
329 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
330
331 ASSERT(mc->mc_rotor != mg);
332 ASSERT(mg->mg_prev == NULL);
333 ASSERT(mg->mg_next == NULL);
334 ASSERT(mg->mg_activation_count <= 0);
335
336 if (++mg->mg_activation_count <= 0)
337 return;
338
339 mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
340 metaslab_group_alloc_update(mg);
341
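	/*
	 * Link the group into the class's circular rotor list and leave
	 * mc_rotor pointing at the newly activated group.
	 */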
342 if ((mgprev = mc->mc_rotor) == NULL) {
343 mg->mg_prev = mg;
344 mg->mg_next = mg;
345 } else {
346 mgnext = mgprev->mg_next;
347 mg->mg_prev = mgprev;
348 mg->mg_next = mgnext;
349 mgprev->mg_next = mg;
350 mgnext->mg_prev = mg;
351 }
352 mc->mc_rotor = mg;
353 }
354
355 void
356 metaslab_group_passivate(metaslab_group_t *mg)
357 {
358 metaslab_class_t *mc = mg->mg_class;
359 metaslab_group_t *mgprev, *mgnext;
360
406
407 static void
408 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
409 {
410 /*
411 * Although in principle the weight can be any value, in
412 * practice we do not use values in the range [1, 510].
413 */
414 ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
415 ASSERT(MUTEX_HELD(&msp->ms_lock));
416
417 mutex_enter(&mg->mg_lock);
418 ASSERT(msp->ms_group == mg);
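	/*
	 * The AVL tree is ordered by weight, so the metaslab must be
	 * removed and re-inserted for the new weight to take effect.
	 */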
419 avl_remove(&mg->mg_metaslab_tree, msp);
420 msp->ms_weight = weight;
421 avl_add(&mg->mg_metaslab_tree, msp);
422 mutex_exit(&mg->mg_lock);
423 }
424
425 /*
426 * Determine if a given metaslab group should skip allocations. A metaslab
427	 * group should avoid allocations if its free capacity has dropped to
428	 * or below the zfs_mg_noalloc_threshold and there is at least one
429	 * other metaslab group that can still handle allocations.
430 */
431 static boolean_t
432 metaslab_group_allocatable(metaslab_group_t *mg)
433 {
434 vdev_t *vd = mg->mg_vd;
435 spa_t *spa = vd->vdev_spa;
436 metaslab_class_t *mc = mg->mg_class;
437
438 /*
439 * A metaslab group is considered allocatable if its free capacity
440 * is greater than the set value of zfs_mg_noalloc_threshold, it's
441 * associated with a slog, or there are no other metaslab groups
442 * with free capacity greater than zfs_mg_noalloc_threshold.
443 */
444 return (mg->mg_free_capacity > zfs_mg_noalloc_threshold ||
445 mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0);
446 }
447
448 /*
449 * ==========================================================================
450 * Common allocator routines
451 * ==========================================================================
452 */
453 static int
454 metaslab_segsize_compare(const void *x1, const void *x2)
455 {
456 const space_seg_t *s1 = x1;
457 const space_seg_t *s2 = x2;
458 uint64_t ss_size1 = s1->ss_end - s1->ss_start;
459 uint64_t ss_size2 = s2->ss_end - s2->ss_start;
460
461 if (ss_size1 < ss_size2)
462 return (-1);
463 if (ss_size1 > ss_size2)
464 return (1);
465
466 if (s1->ss_start < s2->ss_start)
467 return (-1);
468 if (s1->ss_start > s2->ss_start)
1377
1378 for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
1379 if (msp->ms_allocmap[(txg + t) & TXG_MASK]->sm_space)
1380 evictable = 0;
1381
1382 if (evictable && !metaslab_debug)
1383 space_map_unload(sm);
1384 }
1385
1386 metaslab_group_sort(mg, msp, metaslab_weight(msp));
1387
1388 mutex_exit(&msp->ms_lock);
1389 }
1390
1391 void
1392 metaslab_sync_reassess(metaslab_group_t *mg)
1393 {
1394 vdev_t *vd = mg->mg_vd;
1395 int64_t failures = mg->mg_alloc_failures;
1396
1397 metaslab_group_alloc_update(mg);
1398
1399 /*
1400 * Re-evaluate all metaslabs which have lower offsets than the
1401 * bonus area.
1402 */
1403 for (int m = 0; m < vd->vdev_ms_count; m++) {
1404 metaslab_t *msp = vd->vdev_ms[m];
1405
1406 if (msp->ms_map->sm_start > mg->mg_bonus_area)
1407 break;
1408
1409 mutex_enter(&msp->ms_lock);
1410 metaslab_group_sort(mg, msp, metaslab_weight(msp));
1411 mutex_exit(&msp->ms_lock);
1412 }
1413
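	/*
	 * Subtract the failure count sampled above rather than storing
	 * zero, so that failures recorded concurrently since the snapshot
	 * are not lost.
	 */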
1414 atomic_add_64(&mg->mg_alloc_failures, -failures);
1415
1416 /*
1417 * Prefetch the next potential metaslabs
1418 */
1480 continue;
1481
1482 was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
1483 if (activation_weight == METASLAB_WEIGHT_PRIMARY)
1484 break;
1485
1486 target_distance = min_distance +
1487 (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1);
1488
1489 for (i = 0; i < d; i++)
1490 if (metaslab_distance(msp, &dva[i]) <
1491 target_distance)
1492 break;
1493 if (i == d)
1494 break;
1495 }
1496 mutex_exit(&mg->mg_lock);
1497 if (msp == NULL)
1498 return (-1ULL);
1499
1500 mutex_enter(&msp->ms_lock);
1501
1502 /*
1503 * If we've already reached the allowable number of failed
1504 * allocation attempts on this metaslab group then we
1505 * consider skipping it. We skip it only if we're allowed
1506 * to "fast" gang, the physical size is larger than
1507 * a gang block, and we're attempting to allocate from
1508 * the primary metaslab.
1509 */
1510 if (mg->mg_alloc_failures > zfs_mg_alloc_failures &&
1511 CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE &&
1512 activation_weight == METASLAB_WEIGHT_PRIMARY) {
1513 spa_dbgmsg(spa, "%s: skipping metaslab group: "
1514 "vdev %llu, txg %llu, mg %p, psize %llu, "
1515 "asize %llu, failures %llu", spa_name(spa),
1516 mg->mg_vd->vdev_id, txg, mg, psize, asize,
1517 mg->mg_alloc_failures);
1518 mutex_exit(&msp->ms_lock);
1519 return (-1ULL);
1520 }
1521
1522 /*
1523 * Ensure that the metaslab we have selected is still
1524 * capable of handling our request. It's possible that
1525 * another thread may have changed the weight while we
1526 * were blocked on the metaslab lock.
1527 */
1528 if (msp->ms_weight < asize || (was_active &&
1529 !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
1530 activation_weight == METASLAB_WEIGHT_PRIMARY)) {
1531 mutex_exit(&msp->ms_lock);
1532 continue;
1533 }
1534
1535 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
1536 activation_weight == METASLAB_WEIGHT_PRIMARY) {
1537 metaslab_passivate(msp,
1538 msp->ms_weight & ~METASLAB_ACTIVE_MASK);
1539 mutex_exit(&msp->ms_lock);
1540 continue;
1541 }
1654 mg = mc->mc_rotor;
1655
1656 rotor = mg;
1657 top:
1658 all_zero = B_TRUE;
1659 do {
1660 ASSERT(mg->mg_activation_count == 1);
1661
1662 vd = mg->mg_vd;
1663
1664 /*
1665 * Don't allocate from faulted devices.
1666 */
1667 if (zio_lock) {
1668 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
1669 allocatable = vdev_allocatable(vd);
1670 spa_config_exit(spa, SCL_ZIO, FTAG);
1671 } else {
1672 allocatable = vdev_allocatable(vd);
1673 }
1674
1675 /*
1676 * Determine if the selected metaslab group is eligible
1677 * for allocations. If we're ganging or have requested
1678 * an allocation for the smallest gang block size
1679	 * then we don't want to avoid allocating to this
1680 * metaslab group. If we're in this condition we should
1681 * try to allocate from any device possible so that we
1682 * don't inadvertently return ENOSPC and suspend the pool
1683 * even though space is still available.
1684 */
1685 if (allocatable && CAN_FASTGANG(flags) &&
1686 psize > SPA_GANGBLOCKSIZE)
1687 allocatable = metaslab_group_allocatable(mg);
1688
1689 if (!allocatable)
1690 goto next;
1691
1692 /*
1693 * Avoid writing single-copy data to a failing vdev
1694 * unless the user instructs us that it is okay.
1695 */
1696 if ((vd->vdev_stat.vs_write_errors > 0 ||
1697 vd->vdev_state < VDEV_STATE_HEALTHY) &&
1698 d == 0 && dshift == 3 &&
1699 !(zfs_write_to_degraded && vd->vdev_state ==
1700 VDEV_STATE_DEGRADED)) {
1701 all_zero = B_FALSE;
1702 goto next;
1703 }
1704
1705 ASSERT(mg->mg_class == mc);
1706
1707 distance = vd->vdev_asize >> dshift;
1708 if (distance <= (1ULL << vd->vdev_ms_shift))
|