3954 metaslabs continue to load even after hitting zfs_mg_alloc_failure limit
4080 zpool clear fails to clear pool
4081 need zfs_mg_noalloc_threshold
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>


  41  * to "fast" gang.
  42  */
  43 #define CAN_FASTGANG(flags) \
  44         (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
  45         METASLAB_GANG_AVOID)))
  46 
  47 uint64_t metaslab_aliquot = 512ULL << 10;
  48 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;     /* force gang blocks */
  49 
  50 /*
  51  * The in-core space map representation is more compact than its on-disk form.
  52  * The zfs_condense_pct determines how much more compact the in-core
  53  * space_map representation must be before we compact it on-disk.
  54  * Values should be greater than or equal to 100.
  55  */
  56 int zfs_condense_pct = 200;
  57 
  58 /*
  59  * This value defines the number of allowed allocation failures per vdev.
  60  * If a device reaches this threshold in a given txg then we consider skipping
  61  * allocations on that device.
  62  */
  63 int zfs_mg_alloc_failures;
  64 
  65 /*
  66  * Metaslab debugging: when set, keeps all space maps in core to verify frees.
  67  */
  68 static int metaslab_debug = 0;
  69 
  70 /*
  71  * Minimum size which forces the dynamic allocator to change
  72  * its allocation strategy.  Once the space map cannot satisfy
  73  * an allocation of this size then it switches to using a more
  74  * aggressive strategy (i.e., search by size rather than offset).
  75  */
  76 uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;
  77 
  78 /*
  79  * The minimum free space, in percent, which must be available
  80  * in a space map to continue allocations in a first-fit fashion.
  81  * Once the space_map's free space drops below this level we dynamically
  82  * switch to using best-fit allocations.
  83  */
  84 int metaslab_df_free_pct = 4;
  85 


 207         const metaslab_t *m2 = x2;
 208 
 209         if (m1->ms_weight < m2->ms_weight)
 210                 return (1);
 211         if (m1->ms_weight > m2->ms_weight)
 212                 return (-1);
 213 
 214         /*
 215          * If the weights are identical, use the offset to force uniqueness.
 216          */
 217         if (m1->ms_map->sm_start < m2->ms_map->sm_start)
 218                 return (-1);
 219         if (m1->ms_map->sm_start > m2->ms_map->sm_start)
 220                 return (1);
 221 
 222         ASSERT3P(m1, ==, m2);
 223 
 224         return (0);
 225 }
 226 
 227 metaslab_group_t *
 228 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
 229 {
 230         metaslab_group_t *mg;
 231 
 232         mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
 233         mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
 234         avl_create(&mg->mg_metaslab_tree, metaslab_compare,
 235             sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
 236         mg->mg_vd = vd;
 237         mg->mg_class = mc;
 238         mg->mg_activation_count = 0;
 239 
 240         return (mg);
 241 }
 242 
 243 void
 244 metaslab_group_destroy(metaslab_group_t *mg)
 245 {
 246         ASSERT(mg->mg_prev == NULL);


 257         kmem_free(mg, sizeof (metaslab_group_t));
 258 }
 259 
 260 void
 261 metaslab_group_activate(metaslab_group_t *mg)
 262 {
 263         metaslab_class_t *mc = mg->mg_class;
 264         metaslab_group_t *mgprev, *mgnext;
 265 
 266         ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
 267 
 268         ASSERT(mc->mc_rotor != mg);
 269         ASSERT(mg->mg_prev == NULL);
 270         ASSERT(mg->mg_next == NULL);
 271         ASSERT(mg->mg_activation_count <= 0);
 272 
 273         if (++mg->mg_activation_count <= 0)
 274                 return;
 275 
 276         mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
 277 
 278         if ((mgprev = mc->mc_rotor) == NULL) {
 279                 mg->mg_prev = mg;
 280                 mg->mg_next = mg;
 281         } else {
 282                 mgnext = mgprev->mg_next;
 283                 mg->mg_prev = mgprev;
 284                 mg->mg_next = mgnext;
 285                 mgprev->mg_next = mg;
 286                 mgnext->mg_prev = mg;
 287         }
 288         mc->mc_rotor = mg;
 289 }
 290 
 291 void
 292 metaslab_group_passivate(metaslab_group_t *mg)
 293 {
 294         metaslab_class_t *mc = mg->mg_class;
 295         metaslab_group_t *mgprev, *mgnext;
 296 


 342 
 343 static void
 344 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
 345 {
 346         /*
 347          * Although in principle the weight can be any value, in
 348          * practice we do not use values in the range [1, 510].
 349          */
 350         ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
 351         ASSERT(MUTEX_HELD(&msp->ms_lock));
 352 
 353         mutex_enter(&mg->mg_lock);
 354         ASSERT(msp->ms_group == mg);
 355         avl_remove(&mg->mg_metaslab_tree, msp);
 356         msp->ms_weight = weight;
 357         avl_add(&mg->mg_metaslab_tree, msp);
 358         mutex_exit(&mg->mg_lock);
 359 }
 360 
 361 /*
 362  * ==========================================================================
 363  * Common allocator routines
 364  * ==========================================================================
 365  */
 366 static int
 367 metaslab_segsize_compare(const void *x1, const void *x2)
 368 {
 369         const space_seg_t *s1 = x1;
 370         const space_seg_t *s2 = x2;
 371         uint64_t ss_size1 = s1->ss_end - s1->ss_start;
 372         uint64_t ss_size2 = s2->ss_end - s2->ss_start;
 373 
 374         if (ss_size1 < ss_size2)
 375                 return (-1);
 376         if (ss_size1 > ss_size2)
 377                 return (1);
 378 
 379         if (s1->ss_start < s2->ss_start)
 380                 return (-1);
 381         if (s1->ss_start > s2->ss_start)


1290 
1291                 for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
1292                         if (msp->ms_allocmap[(txg + t) & TXG_MASK]->sm_space)
1293                                 evictable = 0;
1294 
1295                 if (evictable && !metaslab_debug)
1296                         space_map_unload(sm);
1297         }
1298 
1299         metaslab_group_sort(mg, msp, metaslab_weight(msp));
1300 
1301         mutex_exit(&msp->ms_lock);
1302 }
1303 
1304 void
1305 metaslab_sync_reassess(metaslab_group_t *mg)
1306 {
1307         vdev_t *vd = mg->mg_vd;
1308         int64_t failures = mg->mg_alloc_failures;
1309 
1310         /*
1311          * Re-evaluate all metaslabs which have lower offsets than the
1312          * bonus area.
1313          */
1314         for (int m = 0; m < vd->vdev_ms_count; m++) {
1315                 metaslab_t *msp = vd->vdev_ms[m];
1316 
1317                 if (msp->ms_map->sm_start > mg->mg_bonus_area)
1318                         break;
1319 
1320                 mutex_enter(&msp->ms_lock);
1321                 metaslab_group_sort(mg, msp, metaslab_weight(msp));
1322                 mutex_exit(&msp->ms_lock);
1323         }
1324 
1325         atomic_add_64(&mg->mg_alloc_failures, -failures);
1326 
1327         /*
1328          * Prefetch the next potential metaslabs
1329          */


1391                                 continue;
1392 
1393                         was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
1394                         if (activation_weight == METASLAB_WEIGHT_PRIMARY)
1395                                 break;
1396 
1397                         target_distance = min_distance +
1398                             (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1);
1399 
1400                         for (i = 0; i < d; i++)
1401                                 if (metaslab_distance(msp, &dva[i]) <
1402                                     target_distance)
1403                                         break;
1404                         if (i == d)
1405                                 break;
1406                 }
1407                 mutex_exit(&mg->mg_lock);
1408                 if (msp == NULL)
1409                         return (-1ULL);
1410 
1411                 /*
1412                  * If we've already reached the allowable number of failed
1413                  * allocation attempts on this metaslab group then we
1414                  * consider skipping it. We skip it only if we're allowed
1415                  * to "fast" gang, the physical size is larger than
1416                  * a gang block, and we're attempting to allocate from
1417                  * the primary metaslab.
1418                  */
1419                 if (mg->mg_alloc_failures > zfs_mg_alloc_failures &&
1420                     CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE &&
1421                     activation_weight == METASLAB_WEIGHT_PRIMARY) {
1422                         spa_dbgmsg(spa, "%s: skipping metaslab group: "
1423                             "vdev %llu, txg %llu, mg %p, psize %llu, "
1424                             "asize %llu, failures %llu", spa_name(spa),
1425                             mg->mg_vd->vdev_id, txg, mg, psize, asize,
1426                             mg->mg_alloc_failures);
1427                         return (-1ULL);
1428                 }
1429 
1430                 mutex_enter(&msp->ms_lock);
1431 
1432                 /*
1433                  * Ensure that the metaslab we have selected is still
1434                  * capable of handling our request. It's possible that
1435                  * another thread may have changed the weight while we
1436                  * were blocked on the metaslab lock.
1437                  */
1438                 if (msp->ms_weight < asize || (was_active &&
1439                     !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
1440                     activation_weight == METASLAB_WEIGHT_PRIMARY)) {
1441                         mutex_exit(&msp->ms_lock);
1442                         continue;
1443                 }
1444 
1445                 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
1446                     activation_weight == METASLAB_WEIGHT_PRIMARY) {
1447                         metaslab_passivate(msp,
1448                             msp->ms_weight & ~METASLAB_ACTIVE_MASK);
1449                         mutex_exit(&msp->ms_lock);
1450                         continue;
1451                 }


1564                 mg = mc->mc_rotor;
1565 
1566         rotor = mg;
1567 top:
1568         all_zero = B_TRUE;
1569         do {
1570                 ASSERT(mg->mg_activation_count == 1);
1571 
1572                 vd = mg->mg_vd;
1573 
1574                 /*
1575                  * Don't allocate from faulted devices.
1576                  */
1577                 if (zio_lock) {
1578                         spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
1579                         allocatable = vdev_allocatable(vd);
1580                         spa_config_exit(spa, SCL_ZIO, FTAG);
1581                 } else {
1582                         allocatable = vdev_allocatable(vd);
1583                 }
1584                 if (!allocatable)
1585                         goto next;
1586 
1587                 /*
1588                  * Avoid writing single-copy data to a failing vdev
1589                  * unless the user instructs us that it is okay.
1590                  */
1591                 if ((vd->vdev_stat.vs_write_errors > 0 ||
1592                     vd->vdev_state < VDEV_STATE_HEALTHY) &&
1593                     d == 0 && dshift == 3 &&
1594                     !(zfs_write_to_degraded && vd->vdev_state ==
1595                     VDEV_STATE_DEGRADED)) {
1596                         all_zero = B_FALSE;
1597                         goto next;
1598                 }
1599 
1600                 ASSERT(mg->mg_class == mc);
1601 
1602                 distance = vd->vdev_asize >> dshift;
1603                 if (distance <= (1ULL << vd->vdev_ms_shift))




  41  * to "fast" gang.
  42  */
  43 #define CAN_FASTGANG(flags) \
  44         (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
  45         METASLAB_GANG_AVOID)))
  46 
  47 uint64_t metaslab_aliquot = 512ULL << 10;
  48 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;     /* force gang blocks */
  49 
  50 /*
  51  * The in-core space map representation is more compact than its on-disk form.
  52  * The zfs_condense_pct determines how much more compact the in-core
  53  * space_map representation must be before we compact it on-disk.
  54  * Values should be greater than or equal to 100.
  55  */
  56 int zfs_condense_pct = 200;
  57 
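/*
 * A hypothetical sketch (not actual metaslab.c code) of how this ratio is
 * applied: with the default zfs_condense_pct of 200, the on-disk space map
 * is worth condensing once it is at least twice the size of its in-core
 * representation.  The helper name and byte-size arguments below are
 * illustrative assumptions; the real check lives in the space map sync path.
 */
static boolean_t
example_should_condense(uint64_t incore_bytes, uint64_t ondisk_bytes)
{
        return (ondisk_bytes >= incore_bytes * zfs_condense_pct / 100);
}
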
  58 /*
  59  * This value defines the number of allowed allocation failures per vdev.
  60  * If a device reaches this threshold in a given txg then we consider skipping
  61  * allocations on that device. The value of zfs_mg_alloc_failures is computed
  62  * in zio_init() unless it has been overridden in /etc/system.
  63  */
  64 int zfs_mg_alloc_failures = 0;
  65 
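/*
 * Hypothetical sketch of the gate this limit drives (the real check is the
 * "skipping metaslab group" logic in metaslab_group_alloc() further down,
 * and metaslab_sync_reassess() clears the counter again each txg): once a
 * group has accumulated more failed attempts in the current txg than
 * zfs_mg_alloc_failures allows, non-gang allocations stop considering that
 * group.  The helper name is illustrative only.
 */
static boolean_t
example_group_over_failure_limit(metaslab_group_t *mg)
{
        return (mg->mg_alloc_failures > zfs_mg_alloc_failures);
}
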
  66 /*
  67  * The zfs_mg_noalloc_threshold defines which metaslab groups should
  68  * be eligible for allocation. The value is defined as a percentage of
  69  * free space. Metaslab groups that have more free space than
  70  * zfs_mg_noalloc_threshold are always eligible for allocations. Once
  71  * a metaslab group's free space is less than or equal to the
  72  * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
  73  * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
  74  * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
  75  * groups are allowed to accept allocations. Gang blocks are always
  76  * eligible to allocate on any metaslab group. The default value of 0 means
  77  * no metaslab group will be excluded based on this criterion.
  78  */
  79 int zfs_mg_noalloc_threshold = 0;
  80 
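/*
 * Worked example of the threshold (values hypothetical, helper name
 * illustrative): with zfs_mg_noalloc_threshold = 30, a 100 GB top-level vdev
 * with 80 GB allocated reports roughly 19% free capacity using the arithmetic
 * from metaslab_group_alloc_update() below, so normal allocations skip it
 * while any other group in the class is still above 30%; once every group
 * drops to 30% or less, all of them accept allocations again.
 */
static boolean_t
example_group_above_threshold(uint64_t vs_space, uint64_t vs_alloc)
{
        uint64_t free_capacity;

        /* Same computation as metaslab_group_alloc_update(). */
        free_capacity = ((vs_space - vs_alloc) * 100) / (vs_space + 1);

        return (free_capacity > zfs_mg_noalloc_threshold);
}
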
  81 /*
  82  * Metaslab debugging: when set, keeps all space maps in core to verify frees.
  83  */
  84 static int metaslab_debug = 0;
  85 
  86 /*
  87  * Minimum size which forces the dynamic allocator to change
  88  * its allocation strategy.  Once the space map cannot satisfy
  89  * an allocation of this size then it switches to using a more
  90  * aggressive strategy (i.e., search by size rather than offset).
  91  */
  92 uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;
  93 
  94 /*
  95  * The minimum free space, in percent, which must be available
  96  * in a space map to continue allocations in a first-fit fashion.
  97  * Once the space_map's free space drops below this level we dynamically
  98  * switch to using best-fit allocations.
  99  */
 100 int metaslab_df_free_pct = 4;
 101 
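/*
 * Hypothetical sketch of the strategy switch these two tunables control in
 * the dynamic (df) allocator: stay with first-fit by offset while the
 * metaslab is healthy, and fall back to best-fit by segment size once free
 * space drops below metaslab_df_free_pct or the largest free segment can no
 * longer satisfy metaslab_df_alloc_threshold.  The helper name and arguments
 * are illustrative assumptions.
 */
static boolean_t
example_df_use_best_fit(uint64_t max_free_seg, int free_pct)
{
        return (max_free_seg < metaslab_df_alloc_threshold ||
            free_pct < metaslab_df_free_pct);
}
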


 223         const metaslab_t *m2 = x2;
 224 
 225         if (m1->ms_weight < m2->ms_weight)
 226                 return (1);
 227         if (m1->ms_weight > m2->ms_weight)
 228                 return (-1);
 229 
 230         /*
 231          * If the weights are identical, use the offset to force uniqueness.
 232          */
 233         if (m1->ms_map->sm_start < m2->ms_map->sm_start)
 234                 return (-1);
 235         if (m1->ms_map->sm_start > m2->ms_map->sm_start)
 236                 return (1);
 237 
 238         ASSERT3P(m1, ==, m2);
 239 
 240         return (0);
 241 }
 242 
 243 /*
 244  * Update the allocatable flag and the metaslab group's capacity.
 245  * The allocatable flag is set to true if the group's free capacity
 246  * is above the zfs_mg_noalloc_threshold. If a metaslab group transitions
 247  * from allocatable to non-allocatable or vice versa then the metaslab
 248  * group's class is updated to reflect the transition.
 249  */
 250 static void
 251 metaslab_group_alloc_update(metaslab_group_t *mg)
 252 {
 253         vdev_t *vd = mg->mg_vd;
 254         metaslab_class_t *mc = mg->mg_class;
 255         vdev_stat_t *vs = &vd->vdev_stat;
 256         boolean_t was_allocatable;
 257 
 258         ASSERT(vd == vd->vdev_top);
 259 
 260         mutex_enter(&mg->mg_lock);
 261         was_allocatable = mg->mg_allocatable;
 262 
 263         mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
 264             (vs->vs_space + 1);
 265 
 266         mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold);
 267 
 268         /*
 269          * The mc_alloc_groups value maintains a count of the number of
 270          * groups in this metaslab class that are still above the
 271          * zfs_mg_noalloc_threshold. This is used by the allocating
 272          * threads to determine if they should avoid allocations to
 273          * a given group. The allocator will avoid allocations to a group
 274          * if that group has reached or is below the zfs_mg_noalloc_threshold
 275          * and there are still other groups that are above the threshold.
 276          * When a group transitions from allocatable to non-allocatable or
 277          * vice versa we update the metaslab class to reflect that change.
 278          * When the mc_alloc_groups value drops to 0 that means that all
 279          * groups have reached the zfs_mg_noalloc_threshold making all groups
 280          * eligible for allocations. This effectively means that all devices
 281          * are balanced again.
 282          */
 283         if (was_allocatable && !mg->mg_allocatable)
 284                 mc->mc_alloc_groups--;
 285         else if (!was_allocatable && mg->mg_allocatable)
 286                 mc->mc_alloc_groups++;
 287         mutex_exit(&mg->mg_lock);
 288 }
 289 
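/*
 * Hypothetical illustration of the invariant metaslab_group_alloc_update()
 * maintains for the class: mc_alloc_groups always equals the number of
 * groups whose free capacity is still above zfs_mg_noalloc_threshold, and a
 * value of 0 means every group is eligible again.
 */
static int
example_count_alloc_groups(const uint64_t *free_capacity, int ngroups)
{
        int count = 0;

        for (int g = 0; g < ngroups; g++) {
                if (free_capacity[g] > zfs_mg_noalloc_threshold)
                        count++;
        }
        return (count);
}
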
 290 metaslab_group_t *
 291 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
 292 {
 293         metaslab_group_t *mg;
 294 
 295         mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
 296         mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
 297         avl_create(&mg->mg_metaslab_tree, metaslab_compare,
 298             sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
 299         mg->mg_vd = vd;
 300         mg->mg_class = mc;
 301         mg->mg_activation_count = 0;
 302 
 303         return (mg);
 304 }
 305 
 306 void
 307 metaslab_group_destroy(metaslab_group_t *mg)
 308 {
 309         ASSERT(mg->mg_prev == NULL);


 320         kmem_free(mg, sizeof (metaslab_group_t));
 321 }
 322 
 323 void
 324 metaslab_group_activate(metaslab_group_t *mg)
 325 {
 326         metaslab_class_t *mc = mg->mg_class;
 327         metaslab_group_t *mgprev, *mgnext;
 328 
 329         ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
 330 
 331         ASSERT(mc->mc_rotor != mg);
 332         ASSERT(mg->mg_prev == NULL);
 333         ASSERT(mg->mg_next == NULL);
 334         ASSERT(mg->mg_activation_count <= 0);
 335 
 336         if (++mg->mg_activation_count <= 0)
 337                 return;
 338 
 339         mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
 340         metaslab_group_alloc_update(mg);
 341 
 342         if ((mgprev = mc->mc_rotor) == NULL) {
 343                 mg->mg_prev = mg;
 344                 mg->mg_next = mg;
 345         } else {
 346                 mgnext = mgprev->mg_next;
 347                 mg->mg_prev = mgprev;
 348                 mg->mg_next = mgnext;
 349                 mgprev->mg_next = mg;
 350                 mgnext->mg_prev = mg;
 351         }
 352         mc->mc_rotor = mg;
 353 }
 354 
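/*
 * Design note: metaslab_group_activate() splices the group into the class's
 * circular, doubly linked rotor list.  The allocator walks that list
 * round-robin, roughly as sketched below (paraphrased from
 * metaslab_alloc_dva() further down; allocation and error handling omitted,
 * helper name illustrative).
 */
static void
example_walk_rotor(metaslab_class_t *mc)
{
        metaslab_group_t *rotor, *mg;

        if ((rotor = mc->mc_rotor) == NULL)
                return;

        mg = rotor;
        do {
                /* try to allocate from mg->mg_vd here ... */
        } while ((mg = mg->mg_next) != rotor);
}
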
 355 void
 356 metaslab_group_passivate(metaslab_group_t *mg)
 357 {
 358         metaslab_class_t *mc = mg->mg_class;
 359         metaslab_group_t *mgprev, *mgnext;
 360 


 406 
 407 static void
 408 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
 409 {
 410         /*
 411          * Although in principle the weight can be any value, in
 412          * practice we do not use values in the range [1, 510].
 413          */
 414         ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
 415         ASSERT(MUTEX_HELD(&msp->ms_lock));
 416 
 417         mutex_enter(&mg->mg_lock);
 418         ASSERT(msp->ms_group == mg);
 419         avl_remove(&mg->mg_metaslab_tree, msp);
 420         msp->ms_weight = weight;
 421         avl_add(&mg->mg_metaslab_tree, msp);
 422         mutex_exit(&mg->mg_lock);
 423 }
 424 
 425 /*
 426  * Determine if a given metaslab group should skip allocations. A metaslab
 427  * group should avoid allocations if its free capacity has dropped to the
 428  * zfs_mg_noalloc_threshold or below and there is at least one metaslab group
 429  * that can still handle allocations.
 430  */
 431 static boolean_t
 432 metaslab_group_allocatable(metaslab_group_t *mg)
 433 {
 434         vdev_t *vd = mg->mg_vd;
 435         spa_t *spa = vd->vdev_spa;
 436         metaslab_class_t *mc = mg->mg_class;
 437 
 438         /*
 439          * A metaslab group is considered allocatable if its free capacity
 440          * is greater than the set value of zfs_mg_noalloc_threshold, it's
 441          * associated with a slog, or there are no other metaslab groups
 442          * with free capacity greater than zfs_mg_noalloc_threshold.
 443          */
 444         return (mg->mg_free_capacity > zfs_mg_noalloc_threshold ||
 445             mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0);
 446 }
 447 
 448 /*
 449  * ==========================================================================
 450  * Common allocator routines
 451  * ==========================================================================
 452  */
 453 static int
 454 metaslab_segsize_compare(const void *x1, const void *x2)
 455 {
 456         const space_seg_t *s1 = x1;
 457         const space_seg_t *s2 = x2;
 458         uint64_t ss_size1 = s1->ss_end - s1->ss_start;
 459         uint64_t ss_size2 = s2->ss_end - s2->ss_start;
 460 
 461         if (ss_size1 < ss_size2)
 462                 return (-1);
 463         if (ss_size1 > ss_size2)
 464                 return (1);
 465 
 466         if (s1->ss_start < s2->ss_start)
 467                 return (-1);
 468         if (s1->ss_start > s2->ss_start)


1377 
1378                 for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
1379                         if (msp->ms_allocmap[(txg + t) & TXG_MASK]->sm_space)
1380                                 evictable = 0;
1381 
1382                 if (evictable && !metaslab_debug)
1383                         space_map_unload(sm);
1384         }
1385 
1386         metaslab_group_sort(mg, msp, metaslab_weight(msp));
1387 
1388         mutex_exit(&msp->ms_lock);
1389 }
1390 
1391 void
1392 metaslab_sync_reassess(metaslab_group_t *mg)
1393 {
1394         vdev_t *vd = mg->mg_vd;
1395         int64_t failures = mg->mg_alloc_failures;
1396 
1397         metaslab_group_alloc_update(mg);
1398 
1399         /*
1400          * Re-evaluate all metaslabs which have lower offsets than the
1401          * bonus area.
1402          */
1403         for (int m = 0; m < vd->vdev_ms_count; m++) {
1404                 metaslab_t *msp = vd->vdev_ms[m];
1405 
1406                 if (msp->ms_map->sm_start > mg->mg_bonus_area)
1407                         break;
1408 
1409                 mutex_enter(&msp->ms_lock);
1410                 metaslab_group_sort(mg, msp, metaslab_weight(msp));
1411                 mutex_exit(&msp->ms_lock);
1412         }
1413 
1414         atomic_add_64(&mg->mg_alloc_failures, -failures);
1415 
1416         /*
1417          * Prefetch the next potential metaslabs
1418          */


1480                                 continue;
1481 
1482                         was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
1483                         if (activation_weight == METASLAB_WEIGHT_PRIMARY)
1484                                 break;
1485 
1486                         target_distance = min_distance +
1487                             (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1);
1488 
1489                         for (i = 0; i < d; i++)
1490                                 if (metaslab_distance(msp, &dva[i]) <
1491                                     target_distance)
1492                                         break;
1493                         if (i == d)
1494                                 break;
1495                 }
1496                 mutex_exit(&mg->mg_lock);
1497                 if (msp == NULL)
1498                         return (-1ULL);
1499 
1500                 mutex_enter(&msp->ms_lock);
1501 
1502                 /*
1503                  * If we've already reached the allowable number of failed
1504                  * allocation attempts on this metaslab group then we
1505                  * consider skipping it. We skip it only if we're allowed
1506                  * to "fast" gang, the physical size is larger than
1507                  * a gang block, and we're attempting to allocate from
1508                  * the primary metaslab.
1509                  */
1510                 if (mg->mg_alloc_failures > zfs_mg_alloc_failures &&
1511                     CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE &&
1512                     activation_weight == METASLAB_WEIGHT_PRIMARY) {
1513                         spa_dbgmsg(spa, "%s: skipping metaslab group: "
1514                             "vdev %llu, txg %llu, mg %p, psize %llu, "
1515                             "asize %llu, failures %llu", spa_name(spa),
1516                             mg->mg_vd->vdev_id, txg, mg, psize, asize,
1517                             mg->mg_alloc_failures);
1518                         mutex_exit(&msp->ms_lock);
1519                         return (-1ULL);
1520                 }
1521 
1522                 /*
1523                  * Ensure that the metaslab we have selected is still
1524                  * capable of handling our request. It's possible that
1525                  * another thread may have changed the weight while we
1526                  * were blocked on the metaslab lock.
1527                  */
1528                 if (msp->ms_weight < asize || (was_active &&
1529                     !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
1530                     activation_weight == METASLAB_WEIGHT_PRIMARY)) {
1531                         mutex_exit(&msp->ms_lock);
1532                         continue;
1533                 }
1534 
1535                 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
1536                     activation_weight == METASLAB_WEIGHT_PRIMARY) {
1537                         metaslab_passivate(msp,
1538                             msp->ms_weight & ~METASLAB_ACTIVE_MASK);
1539                         mutex_exit(&msp->ms_lock);
1540                         continue;
1541                 }


1654                 mg = mc->mc_rotor;
1655 
1656         rotor = mg;
1657 top:
1658         all_zero = B_TRUE;
1659         do {
1660                 ASSERT(mg->mg_activation_count == 1);
1661 
1662                 vd = mg->mg_vd;
1663 
1664                 /*
1665                  * Don't allocate from faulted devices.
1666                  */
1667                 if (zio_lock) {
1668                         spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
1669                         allocatable = vdev_allocatable(vd);
1670                         spa_config_exit(spa, SCL_ZIO, FTAG);
1671                 } else {
1672                         allocatable = vdev_allocatable(vd);
1673                 }
1674 
1675                 /*
1676                  * Determine if the selected metaslab group is eligible
1677                  * for allocations. If we're ganging or have requested
1678                  * an allocation for the smallest gang block size
1679                  * then we don't want to avoid allocating to this
1680                  * metaslab group. If we're in this condition we should
1681                  * try to allocate from any device possible so that we
1682                  * don't inadvertently return ENOSPC and suspend the pool
1683                  * even though space is still available.
1684                  */
1685                 if (allocatable && CAN_FASTGANG(flags) &&
1686                     psize > SPA_GANGBLOCKSIZE)
1687                         allocatable = metaslab_group_allocatable(mg);
1688 
1689                 if (!allocatable)
1690                         goto next;
1691 
1692                 /*
1693                  * Avoid writing single-copy data to a failing vdev
1694                  * unless the user instructs us that it is okay.
1695                  */
1696                 if ((vd->vdev_stat.vs_write_errors > 0 ||
1697                     vd->vdev_state < VDEV_STATE_HEALTHY) &&
1698                     d == 0 && dshift == 3 &&
1699                     !(zfs_write_to_degraded && vd->vdev_state ==
1700                     VDEV_STATE_DEGRADED)) {
1701                         all_zero = B_FALSE;
1702                         goto next;
1703                 }
1704 
1705                 ASSERT(mg->mg_class == mc);
1706 
1707                 distance = vd->vdev_asize >> dshift;
1708                 if (distance <= (1ULL << vd->vdev_ms_shift))