Print this page
4045 zfs write throttle & i/o scheduler performance work
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/fs/zfs/spa_misc.c
          +++ new/usr/src/uts/common/fs/zfs/spa_misc.c
↓ open down ↓ 242 lines elided ↑ open up ↑
 243  243  int zfs_flags = 0;
 244  244  #endif
 245  245  
 246  246  /*
 247  247   * zfs_recover can be set to nonzero to attempt to recover from
 248  248   * otherwise-fatal errors, typically caused by on-disk corruption.  When
 249  249   * set, calls to zfs_panic_recover() will turn into warning messages.
 250  250   */
 251  251  int zfs_recover = 0;
 252  252  
 253      -extern int zfs_txg_synctime_ms;
      253 +/*
      254 + * Expiration time in milliseconds. This value has two meanings. First it is
      255 + * used to determine when the spa_deadman() logic should fire. By default the
      256 + * spa_deadman() will fire if spa_sync() has not completed in 1000 seconds.
      257 + * Second, the value determines if an I/O is considered "hung". Any I/O that
      258 + * has not completed in zfs_deadman_synctime_ms is considered "hung" resulting
      259 + * in a system panic.
      260 + */
      261 +uint64_t zfs_deadman_synctime_ms = 1000000ULL;
 254  262  
 255  263  /*
 256      - * Expiration time in units of zfs_txg_synctime_ms. This value has two
 257      - * meanings. First it is used to determine when the spa_deadman logic
 258      - * should fire. By default the spa_deadman will fire if spa_sync has
 259      - * not completed in 1000 * zfs_txg_synctime_ms (i.e. 1000 seconds).
 260      - * Secondly, the value determines if an I/O is considered "hung".
 261      - * Any I/O that has not completed in zfs_deadman_synctime is considered
 262      - * "hung" resulting in a system panic.
      264 + * Check time in milliseconds. This defines the interval at which we check
      265 + * for hung I/O.
 263  266   */
 264      -uint64_t zfs_deadman_synctime = 1000ULL;
      267 +uint64_t zfs_deadman_checktime_ms = 5000ULL;
 265  268  
 266  269  /*
 267  270   * Override the zfs deadman behavior via /etc/system. By default the
 268  271   * deadman is enabled except on VMware and sparc deployments.
 269  272   */
 270  273  int zfs_deadman_enabled = -1;
 271  274  
      275 +/*
      276 + * The worst case is single-sector max-parity RAID-Z blocks, in which
      277 + * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
      278 + * times the size; so just assume that.  Add to this the fact that
      279 + * we can have up to 3 DVAs per bp, and one more factor of 2 because
      280 + * the block may be dittoed with up to 3 DVAs by ddt_sync().  Altogether,
      281 + * the worst case is:
      282 + *     (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24
      283 + */
      284 +int spa_asize_inflation = 24;
 272  285  
 273  286  /*
 274  287   * ==========================================================================
 275  288   * SPA config locking
 276  289   * ==========================================================================
 277  290   */
 278  291  static void
 279  292  spa_config_lock_init(spa_t *spa)
 280  293  {
 281  294          for (int i = 0; i < SCL_LOCKS; i++) {
↓ open down ↓ 210 lines elided ↑ open up ↑
 492  505          spa->spa_freeze_txg = UINT64_MAX;
 493  506          spa->spa_final_txg = UINT64_MAX;
 494  507          spa->spa_load_max_txg = UINT64_MAX;
 495  508          spa->spa_proc = &p0;
 496  509          spa->spa_proc_state = SPA_PROC_NONE;
 497  510  
 498  511          hdlr.cyh_func = spa_deadman;
 499  512          hdlr.cyh_arg = spa;
 500  513          hdlr.cyh_level = CY_LOW_LEVEL;
 501  514  
 502      -        spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime *
 503      -            zfs_txg_synctime_ms);
      515 +        spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms);
 504  516  
 505  517          /*
 506  518           * This determines how often we need to check for hung I/Os after
 507  519           * the cyclic has already fired. Since checking for hung I/Os is
 508  520           * an expensive operation we don't want to check too frequently.
 509      -         * Instead wait for 5 synctimes before checking again.
      521 +         * Instead wait for 5 seconds before checking again.
 510  522           */
 511      -        when.cyt_interval = MSEC2NSEC(5 * zfs_txg_synctime_ms);
      523 +        when.cyt_interval = MSEC2NSEC(zfs_deadman_checktime_ms);
 512  524          when.cyt_when = CY_INFINITY;
 513  525          mutex_enter(&cpu_lock);
 514  526          spa->spa_deadman_cycid = cyclic_add(&hdlr, &when);
 515  527          mutex_exit(&cpu_lock);
 516  528  
 517  529          refcount_create(&spa->spa_refcount);
 518  530          spa_config_lock_init(spa);
 519  531  
 520  532          avl_add(&spa_namespace_avl, spa);
 521  533  
↓ open down ↓ 970 lines elided ↑ open up ↑
1492 1504  uint64_t
1493 1505  spa_freeze_txg(spa_t *spa)
1494 1506  {
1495 1507          return (spa->spa_freeze_txg);
1496 1508  }
1497 1509  
1498 1510  /* ARGSUSED */
1499 1511  uint64_t
1500 1512  spa_get_asize(spa_t *spa, uint64_t lsize)
1501 1513  {
1502      -        /*
1503      -         * The worst case is single-sector max-parity RAID-Z blocks, in which
1504      -         * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
1505      -         * times the size; so just assume that.  Add to this the fact that
1506      -         * we can have up to 3 DVAs per bp, and one more factor of 2 because
1507      -         * the block may be dittoed with up to 3 DVAs by ddt_sync().
1508      -         */
1509      -        return (lsize * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2);
     1514 +        return (lsize * spa_asize_inflation);
1510 1515  }
1511 1516  
1512 1517  uint64_t
1513 1518  spa_get_dspace(spa_t *spa)
1514 1519  {
1515 1520          return (spa->spa_dspace);
1516 1521  }
1517 1522  
1518 1523  void
1519 1524  spa_update_dspace(spa_t *spa)
↓ open down ↓ 329 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX