4045 zfs write throttle & i/o scheduler performance work
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>


 233 static kmutex_t spa_l2cache_lock;
 234 static avl_tree_t spa_l2cache_avl;
 235 
 236 kmem_cache_t *spa_buffer_pool;
 237 int spa_mode_global;
 238 
 239 #ifdef ZFS_DEBUG
 240 /* Everything except dprintf and spa is on by default in debug builds */
 241 int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SPA);
 242 #else
 243 int zfs_flags = 0;
 244 #endif
 245 
 246 /*
 247  * zfs_recover can be set to nonzero to attempt to recover from
 248  * otherwise-fatal errors, typically caused by on-disk corruption.  When
 249  * set, calls to zfs_panic_recover() will turn into warning messages.
 250  */
 251 int zfs_recover = 0;
 252 
 253 extern int zfs_txg_synctime_ms;
 254 
 255 /*
 256  * Expiration time in units of zfs_txg_synctime_ms. This value has two
 257  * meanings. First it is used to determine when the spa_deadman logic
 258  * should fire. By default the spa_deadman will fire if spa_sync has
 259  * not completed in 1000 * zfs_txg_synctime_ms (i.e. 1000 seconds).
 260  * Secondly, the value determines if an I/O is considered "hung".
 261  * Any I/O that has not completed in zfs_deadman_synctime is considered
 262  * "hung" resulting in a system panic.
 263  */
 264 uint64_t zfs_deadman_synctime = 1000ULL;
 265 
 266 /*
 267  * Override the zfs deadman behavior via /etc/system. By default the
 268  * deadman is enabled except on VMware and sparc deployments.
 269  */
 270 int zfs_deadman_enabled = -1;
 271 
 272 
 273 /*
 274  * ==========================================================================
 275  * SPA config locking
 276  * ==========================================================================
 277  */
 278 static void
 279 spa_config_lock_init(spa_t *spa)
 280 {
 281         for (int i = 0; i < SCL_LOCKS; i++) {
 282                 spa_config_lock_t *scl = &spa->spa_config_lock[i];
 283                 mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL);
 284                 cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL);
 285                 refcount_create_untracked(&scl->scl_count);
 286                 scl->scl_writer = NULL;
 287                 scl->scl_write_wanted = 0;
 288         }
 289 }
 290 
 291 static void


 482         cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
 483         cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL);
 484         cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
 485         cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);
 486 
 487         for (int t = 0; t < TXG_SIZE; t++)
 488                 bplist_create(&spa->spa_free_bplist[t]);
 489 
 490         (void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name));
 491         spa->spa_state = POOL_STATE_UNINITIALIZED;
 492         spa->spa_freeze_txg = UINT64_MAX;
 493         spa->spa_final_txg = UINT64_MAX;
 494         spa->spa_load_max_txg = UINT64_MAX;
 495         spa->spa_proc = &p0;
 496         spa->spa_proc_state = SPA_PROC_NONE;
 497 
 498         hdlr.cyh_func = spa_deadman;
 499         hdlr.cyh_arg = spa;
 500         hdlr.cyh_level = CY_LOW_LEVEL;
 501 
 502         spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime *
 503             zfs_txg_synctime_ms);
 504 
 505         /*
 506          * This determines how often we need to check for hung I/Os after
 507          * the cyclic has already fired. Since checking for hung I/Os is
 508          * an expensive operation we don't want to check too frequently.
 509          * Instead wait for 5 synctimes before checking again.
 510          */
 511         when.cyt_interval = MSEC2NSEC(5 * zfs_txg_synctime_ms);
 512         when.cyt_when = CY_INFINITY;
 513         mutex_enter(&cpu_lock);
 514         spa->spa_deadman_cycid = cyclic_add(&hdlr, &when);
 515         mutex_exit(&cpu_lock);
 516 
 517         refcount_create(&spa->spa_refcount);
 518         spa_config_lock_init(spa);
 519 
 520         avl_add(&spa_namespace_avl, spa);
 521 
 522         /*
 523          * Set the alternate root, if there is one.
 524          */
 525         if (altroot) {
 526                 spa->spa_root = spa_strdup(altroot);
 527                 spa_active_count++;
 528         }
 529 
 530         /*
 531          * Every pool starts with the default cachefile


1482 {
1483         return (spa->spa_state);
1484 }
1485 
1486 spa_load_state_t
1487 spa_load_state(spa_t *spa)
1488 {
1489         return (spa->spa_load_state);
1490 }
1491 
1492 uint64_t
1493 spa_freeze_txg(spa_t *spa)
1494 {
1495         return (spa->spa_freeze_txg);
1496 }
1497 
1498 /* ARGSUSED */
1499 uint64_t
1500 spa_get_asize(spa_t *spa, uint64_t lsize)
1501 {
1502         /*
1503          * The worst case is single-sector max-parity RAID-Z blocks, in which
1504          * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
1505          * times the size; so just assume that.  Add to this the fact that
1506          * we can have up to 3 DVAs per bp, and one more factor of 2 because
1507          * the block may be dittoed with up to 3 DVAs by ddt_sync().
1508          */
1509         return (lsize * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2);
1510 }
1511 
1512 uint64_t
1513 spa_get_dspace(spa_t *spa)
1514 {
1515         return (spa->spa_dspace);
1516 }
1517 
1518 void
1519 spa_update_dspace(spa_t *spa)
1520 {
1521         spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) +
1522             ddt_get_dedup_dspace(spa);
1523 }
1524 
1525 /*
1526  * Return the failure mode that has been set to this pool. The default
1527  * behavior will be to block all I/Os when a complete failure occurs.
1528  */
1529 uint8_t




 233 static kmutex_t spa_l2cache_lock;
 234 static avl_tree_t spa_l2cache_avl;
 235 
 236 kmem_cache_t *spa_buffer_pool;
 237 int spa_mode_global;
 238 
 239 #ifdef ZFS_DEBUG
 240 /* Everything except dprintf and spa is on by default in debug builds */
 241 int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SPA);
 242 #else
 243 int zfs_flags = 0;
 244 #endif
 245 
 246 /*
 247  * zfs_recover can be set to nonzero to attempt to recover from
 248  * otherwise-fatal errors, typically caused by on-disk corruption.  When
 249  * set, calls to zfs_panic_recover() will turn into warning messages.
 250  */
 251 int zfs_recover = 0;
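
A minimal sketch of how a zfs module global such as zfs_recover is conventionally enabled on an illumos system, via /etc/system at boot; the set zfs:... syntax is an assumption to verify against the target release and is shown only for illustration:

    * Attempt to recover from otherwise-fatal errors (debugging aid, assumed tunable name)
    set zfs:zfs_recover = 1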
 252 
 253 /*
 254  * Expiration time in milliseconds. This value has two meanings. First it is
 255  * used to determine when the spa_deadman() logic should fire. By default the
 256  * spa_deadman() will fire if spa_sync() has not completed in 1000 seconds.
 257  * Secondly, the value determines if an I/O is considered "hung". Any I/O that
 258  * has not completed in zfs_deadman_synctime_ms is considered "hung" resulting
 259  * in a system panic.
 260  */
 261 uint64_t zfs_deadman_synctime_ms = 1000000ULL;
 262 
 263 /*
 264  * Check time in milliseconds. This defines the frequency at which we check
 265  * for hung I/O.
 266  */
 267 uint64_t zfs_deadman_checktime_ms = 5000ULL;
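
An illustrative /etc/system sketch for the two deadman timing tunables above; the values are arbitrary examples (the defaults remain 1,000,000 ms, i.e. 1000 seconds, and 5000 ms), and the set zfs:... form is assumed rather than prescribed by this change:

    * Declare spa_sync() hung after 600 seconds instead of 1000 (illustrative value)
    set zfs:zfs_deadman_synctime_ms = 600000
    * Re-check for hung I/O every 10 seconds once the deadman has fired (illustrative value)
    set zfs:zfs_deadman_checktime_ms = 10000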
 268 
 269 /*
 270  * Override the zfs deadman behavior via /etc/system. By default the
 271  * deadman is enabled except on VMware and sparc deployments.
 272  */
 273 int zfs_deadman_enabled = -1;
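
A sketch of the /etc/system override the comment refers to, assuming 0 disables the deadman and any nonzero value forces it on regardless of the VMware/sparc defaults:

    * Force the zfs deadman off (assumed semantics: 0 = disabled)
    set zfs:zfs_deadman_enabled = 0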
 274 
 275 /*
 276  * The worst case is single-sector max-parity RAID-Z blocks, in which
 277  * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
 278  * times the size; so just assume that.  Add to this the fact that
 279  * we can have up to 3 DVAs per bp, and one more factor of 2 because
 280  * the block may be dittoed with up to 3 DVAs by ddt_sync().  All together,
 281  * the worst case is:
 282  *     (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24
 283  */
 284 int spa_asize_inflation = 24;
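
The factor of 24 above is (VDEV_RAIDZ_MAXPARITY + 1) = 4 for a single-sector raidz3 block, times SPA_DVAS_PER_BP = 3, times 2 for dedup-triggered dittoing, so a 128 KB logical write is charged up to 3 MB in the worst case. As a hedged illustration only, a pool known to use at most single-parity raidz and no dedup would have a worst case of 2 * 3 = 6, and the inflation factor could in principle be lowered from /etc/system; the tunable's settability and the set zfs:... form are assumptions:

    * Worst-case inflation for raidz1 with no dedup (illustrative, not a recommendation)
    set zfs:spa_asize_inflation = 6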
 285 
 286 /*
 287  * ==========================================================================
 288  * SPA config locking
 289  * ==========================================================================
 290  */
 291 static void
 292 spa_config_lock_init(spa_t *spa)
 293 {
 294         for (int i = 0; i < SCL_LOCKS; i++) {
 295                 spa_config_lock_t *scl = &spa->spa_config_lock[i];
 296                 mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL);
 297                 cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL);
 298                 refcount_create_untracked(&scl->scl_count);
 299                 scl->scl_writer = NULL;
 300                 scl->scl_write_wanted = 0;
 301         }
 302 }
 303 
 304 static void


 495         cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
 496         cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL);
 497         cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
 498         cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);
 499 
 500         for (int t = 0; t < TXG_SIZE; t++)
 501                 bplist_create(&spa->spa_free_bplist[t]);
 502 
 503         (void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name));
 504         spa->spa_state = POOL_STATE_UNINITIALIZED;
 505         spa->spa_freeze_txg = UINT64_MAX;
 506         spa->spa_final_txg = UINT64_MAX;
 507         spa->spa_load_max_txg = UINT64_MAX;
 508         spa->spa_proc = &p0;
 509         spa->spa_proc_state = SPA_PROC_NONE;
 510 
 511         hdlr.cyh_func = spa_deadman;
 512         hdlr.cyh_arg = spa;
 513         hdlr.cyh_level = CY_LOW_LEVEL;
 514 
 515         spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms);
 516 
 517         /*
 518          * This determines how often we need to check for hung I/Os after
 519          * the cyclic has already fired. Since checking for hung I/Os is
 520          * an expensive operation we don't want to check too frequently.
 521          * Instead wait for 5 seconds before checking again.
 522          */
 523         when.cyt_interval = MSEC2NSEC(zfs_deadman_checktime_ms);
 524         when.cyt_when = CY_INFINITY;
 525         mutex_enter(&cpu_lock);
 526         spa->spa_deadman_cycid = cyclic_add(&hdlr, &when);
 527         mutex_exit(&cpu_lock);
 528 
 529         refcount_create(&spa->spa_refcount);
 530         spa_config_lock_init(spa);
 531 
 532         avl_add(&spa_namespace_avl, spa);
 533 
 534         /*
 535          * Set the alternate root, if there is one.
 536          */
 537         if (altroot) {
 538                 spa->spa_root = spa_strdup(altroot);
 539                 spa_active_count++;
 540         }
 541 
 542         /*
 543          * Every pool starts with the default cachefile


1494 {
1495         return (spa->spa_state);
1496 }
1497 
1498 spa_load_state_t
1499 spa_load_state(spa_t *spa)
1500 {
1501         return (spa->spa_load_state);
1502 }
1503 
1504 uint64_t
1505 spa_freeze_txg(spa_t *spa)
1506 {
1507         return (spa->spa_freeze_txg);
1508 }
1509 
1510 /* ARGSUSED */
1511 uint64_t
1512 spa_get_asize(spa_t *spa, uint64_t lsize)
1513 {
1514         return (lsize * spa_asize_inflation);
1515 }
1516 
1517 uint64_t
1518 spa_get_dspace(spa_t *spa)
1519 {
1520         return (spa->spa_dspace);
1521 }
1522 
1523 void
1524 spa_update_dspace(spa_t *spa)
1525 {
1526         spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) +
1527             ddt_get_dedup_dspace(spa);
1528 }
1529 
1530 /*
1531  * Return the failure mode that has been set to this pool. The default
1532  * behavior will be to block all I/Os when a complete failure occurs.
1533  */
1534 uint8_t