illumos6537-1 Wdiff usr/src/uts/common/fs/zfs/dsl_scan.c

Print this page

6537 Panic on zpool scrub with DEBUG kernel

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/fs/zfs/dsl_scan.c
          +++ new/usr/src/uts/common/fs/zfs/dsl_scan.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each

↓ open down ↓

13 lines elided

↑ open up ↑

  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
       24 + * Copyright 2016 Gary Mills
  24   25   */
  25   26  
  26   27  #include <sys/dsl_scan.h>
  27   28  #include <sys/dsl_pool.h>
  28   29  #include <sys/dsl_dataset.h>
  29   30  #include <sys/dsl_prop.h>
  30   31  #include <sys/dsl_dir.h>
  31   32  #include <sys/dsl_synctask.h>
  32   33  #include <sys/dnode.h>
  33   34  #include <sys/dmu_tx.h>

  34   35  #include <sys/dmu_objset.h>
  35   36  #include <sys/arc.h>
  36   37  #include <sys/zap.h>
  37   38  #include <sys/zio.h>
  38   39  #include <sys/zfs_context.h>
  39   40  #include <sys/fs/zfs.h>
  40   41  #include <sys/zfs_znode.h>
  41   42  #include <sys/spa_impl.h>
  42   43  #include <sys/vdev_impl.h>
  43   44  #include <sys/zil_impl.h>
  44   45  #include <sys/zio_checksum.h>
  45   46  #include <sys/ddt.h>
  46   47  #include <sys/sa.h>
  47   48  #include <sys/sa_impl.h>
  48   49  #include <sys/zfeature.h>
  49   50  #ifdef _KERNEL
  50   51  #include <sys/zfs_vfsops.h>
  51   52  #endif
  52   53  
  53   54  typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *,
  54   55      const zbookmark_phys_t *);
  55   56  
  56   57  static scan_cb_t dsl_scan_scrub_cb;
  57   58  static void dsl_scan_cancel_sync(void *, dmu_tx_t *);
  58   59  static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx);
  59   60  
  60   61  int zfs_top_maxinflight = 32;           /* maximum I/Os per top-level */
  61   62  int zfs_resilver_delay = 2;             /* number of ticks to delay resilver */
  62   63  int zfs_scrub_delay = 4;                /* number of ticks to delay scrub */
  63   64  int zfs_scan_idle = 50;                 /* idle window in clock ticks */
  64   65  
  65   66  int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */
  66   67  int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
  67   68  int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
  68   69  boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
  69   70  boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */
  70   71  enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
  71   72  int dsl_scan_delay_completion = B_FALSE; /* set to delay scan completion */
  72   73  /* max number of blocks to free in a single TXG */
  73   74  uint64_t zfs_free_max_blocks = UINT64_MAX;
  74   75  
  /*
   * Nonzero when the scan's recorded function is a scrub or a resilver.
   * The argument may be expanded twice, so it must not have side effects.
   */
  #define DSL_SCAN_IS_SCRUB_RESILVER(scn)                         \
          ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB ||         \
          (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)
  78   79  
  79   80  extern int zfs_txg_timeout;
  80   81  
  81   82  /*
  82   83   * Enable/disable the processing of the free_bpobj object.
  83   84   */
  84   85  boolean_t zfs_free_bpobj_enabled = B_TRUE;
  85   86  
  86   87  /* the order has to match pool_scan_type */
  87   88  static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
  88   89          NULL,
  89   90          dsl_scan_scrub_cb,      /* POOL_SCAN_SCRUB */
  90   91          dsl_scan_scrub_cb,      /* POOL_SCAN_RESILVER */
  91   92  };
  92   93  
  93   94  int
  94   95  dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
  95   96  {
  96   97          int err;
  97   98          dsl_scan_t *scn;
  98   99          spa_t *spa = dp->dp_spa;
  99  100          uint64_t f;
 100  101  
 101  102          scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP);
 102  103          scn->scn_dp = dp;
 103  104  
 104  105          /*
 105  106           * It's possible that we're resuming a scan after a reboot so
 106  107           * make sure that the scan_async_destroying flag is initialized
 107  108           * appropriately.
 108  109           */
 109  110          ASSERT(!scn->scn_async_destroying);
 110  111          scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa,
 111  112              SPA_FEATURE_ASYNC_DESTROY);
 112  113  
 113  114          err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 114  115              "scrub_func", sizeof (uint64_t), 1, &f);
 115  116          if (err == 0) {
 116  117                  /*
 117  118                   * There was an old-style scrub in progress.  Restart a
 118  119                   * new-style scrub from the beginning.
 119  120                   */
 120  121                  scn->scn_restart_txg = txg;
 121  122                  zfs_dbgmsg("old-style scrub was in progress; "
 122  123                      "restarting new-style scrub in txg %llu",
 123  124                      scn->scn_restart_txg);
 124  125  
 125  126                  /*
 126  127                   * Load the queue obj from the old location so that it
 127  128                   * can be freed by dsl_scan_done().
 128  129                   */
 129  130                  (void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 130  131                      "scrub_queue", sizeof (uint64_t), 1,
 131  132                      &scn->scn_phys.scn_queue_obj);
 132  133          } else {
 133  134                  err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 134  135                      DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
 135  136                      &scn->scn_phys);
 136  137                  if (err == ENOENT)
 137  138                          return (0);
 138  139                  else if (err)
 139  140                          return (err);
 140  141  
 141  142                  if (scn->scn_phys.scn_state == DSS_SCANNING &&
 142  143                      spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) {
 143  144                          /*
 144  145                           * A new-type scrub was in progress on an old
 145  146                           * pool, and the pool was accessed by old
 146  147                           * software.  Restart from the beginning, since
 147  148                           * the old software may have changed the pool in
 148  149                           * the meantime.
 149  150                           */
 150  151                          scn->scn_restart_txg = txg;
 151  152                          zfs_dbgmsg("new-style scrub was modified "
 152  153                              "by old software; restarting in txg %llu",
 153  154                              scn->scn_restart_txg);
 154  155                  }
 155  156          }
 156  157  
 157  158          spa_scan_stat_init(spa);
 158  159          return (0);
 159  160  }
 160  161  
 161  162  void
 162  163  dsl_scan_fini(dsl_pool_t *dp)
 163  164  {
 164  165          if (dp->dp_scan) {
 165  166                  kmem_free(dp->dp_scan, sizeof (dsl_scan_t));
 166  167                  dp->dp_scan = NULL;
 167  168          }
 168  169  }
 169  170  
 170  171  /* ARGSUSED */
 171  172  static int
 172  173  dsl_scan_setup_check(void *arg, dmu_tx_t *tx)
 173  174  {
 174  175          dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
 175  176  
 176  177          if (scn->scn_phys.scn_state == DSS_SCANNING)
 177  178                  return (SET_ERROR(EBUSY));
 178  179  
 179  180          return (0);
 180  181  }
 181  182  
 182  183  static void
 183  184  dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
 184  185  {
 185  186          dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
 186  187          pool_scan_func_t *funcp = arg;
 187  188          dmu_object_type_t ot = 0;
 188  189          dsl_pool_t *dp = scn->scn_dp;
 189  190          spa_t *spa = dp->dp_spa;
 190  191  
 191  192          ASSERT(scn->scn_phys.scn_state != DSS_SCANNING);
 192  193          ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS);
 193  194          bzero(&scn->scn_phys, sizeof (scn->scn_phys));
 194  195          scn->scn_phys.scn_func = *funcp;
 195  196          scn->scn_phys.scn_state = DSS_SCANNING;
 196  197          scn->scn_phys.scn_min_txg = 0;
 197  198          scn->scn_phys.scn_max_txg = tx->tx_txg;
 198  199          scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */
 199  200          scn->scn_phys.scn_start_time = gethrestime_sec();
 200  201          scn->scn_phys.scn_errors = 0;
 201  202          scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc;
 202  203          scn->scn_restart_txg = 0;
 203  204          scn->scn_done_txg = 0;
 204  205          spa_scan_stat_init(spa);
 205  206  
 206  207          if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
 207  208                  scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max;
 208  209  
 209  210                  /* rewrite all disk labels */
 210  211                  vdev_config_dirty(spa->spa_root_vdev);
 211  212  
 212  213                  if (vdev_resilver_needed(spa->spa_root_vdev,
 213  214                      &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) {
 214  215                          spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START);
 215  216                  } else {
 216  217                          spa_event_notify(spa, NULL, ESC_ZFS_SCRUB_START);
 217  218                  }
 218  219  
 219  220                  spa->spa_scrub_started = B_TRUE;
 220  221                  /*
 221  222                   * If this is an incremental scrub, limit the DDT scrub phase
 222  223                   * to just the auto-ditto class (for correctness); the rest
 223  224                   * of the scrub should go faster using top-down pruning.
 224  225                   */
 225  226                  if (scn->scn_phys.scn_min_txg > TXG_INITIAL)
 226  227                          scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO;
 227  228  
 228  229          }
 229  230  
 230  231          /* back to the generic stuff */
 231  232  
 232  233          if (dp->dp_blkstats == NULL) {
 233  234                  dp->dp_blkstats =
 234  235                      kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
 235  236          }
 236  237          bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
 237  238  
 238  239          if (spa_version(spa) < SPA_VERSION_DSL_SCRUB)
 239  240                  ot = DMU_OT_ZAP_OTHER;
 240  241  
 241  242          scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset,
 242  243              ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx);
 243  244  
 244  245          dsl_scan_sync_state(scn, tx);
 245  246  
 246  247          spa_history_log_internal(spa, "scan setup", tx,
 247  248              "func=%u mintxg=%llu maxtxg=%llu",
 248  249              *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg);
 249  250  }
 250  251  
 251  252  /* ARGSUSED */
 252  253  static void
 253  254  dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
 254  255  {
 255  256          static const char *old_names[] = {
 256  257                  "scrub_bookmark",
 257  258                  "scrub_ddt_bookmark",
 258  259                  "scrub_ddt_class_max",
 259  260                  "scrub_queue",
 260  261                  "scrub_min_txg",
 261  262                  "scrub_max_txg",
 262  263                  "scrub_func",
 263  264                  "scrub_errors",
 264  265                  NULL
 265  266          };
 266  267  
 267  268          dsl_pool_t *dp = scn->scn_dp;
 268  269          spa_t *spa = dp->dp_spa;
 269  270          int i;
 270  271  
 271  272          /* Remove any remnants of an old-style scrub. */
 272  273          for (i = 0; old_names[i]; i++) {
 273  274                  (void) zap_remove(dp->dp_meta_objset,
 274  275                      DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx);
 275  276          }
 276  277  
 277  278          if (scn->scn_phys.scn_queue_obj != 0) {
 278  279                  VERIFY(0 == dmu_object_free(dp->dp_meta_objset,
 279  280                      scn->scn_phys.scn_queue_obj, tx));
 280  281                  scn->scn_phys.scn_queue_obj = 0;
 281  282          }
 282  283  
 283  284          /*
 284  285           * If we were "restarted" from a stopped state, don't bother
 285  286           * with anything else.
 286  287           */
 287  288          if (scn->scn_phys.scn_state != DSS_SCANNING)
 288  289                  return;
 289  290  
 290  291          if (complete)
 291  292                  scn->scn_phys.scn_state = DSS_FINISHED;
 292  293          else
 293  294                  scn->scn_phys.scn_state = DSS_CANCELED;
 294  295  
 295  296          spa_history_log_internal(spa, "scan done", tx,
 296  297              "complete=%u", complete);
 297  298  
 298  299          if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
 299  300                  mutex_enter(&spa->spa_scrub_lock);
 300  301                  while (spa->spa_scrub_inflight > 0) {
 301  302                          cv_wait(&spa->spa_scrub_io_cv,
 302  303                              &spa->spa_scrub_lock);
 303  304                  }
 304  305                  mutex_exit(&spa->spa_scrub_lock);
 305  306                  spa->spa_scrub_started = B_FALSE;
 306  307                  spa->spa_scrub_active = B_FALSE;
 307  308  
 308  309                  /*
 309  310                   * If the scrub/resilver completed, update all DTLs to
 310  311                   * reflect this.  Whether it succeeded or not, vacate
 311  312                   * all temporary scrub DTLs.
 312  313                   */
 313  314                  vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
 314  315                      complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE);
 315  316                  if (complete) {
 316  317                          spa_event_notify(spa, NULL, scn->scn_phys.scn_min_txg ?
 317  318                              ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
 318  319                  }
 319  320                  spa_errlog_rotate(spa);
 320  321  
 321  322                  /*
 322  323                   * We may have finished replacing a device.
 323  324                   * Let the async thread assess this and handle the detach.
 324  325                   */
 325  326                  spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
 326  327          }
 327  328  
 328  329          scn->scn_phys.scn_end_time = gethrestime_sec();
 329  330  }
 330  331  
 331  332  /* ARGSUSED */
 332  333  static int
 333  334  dsl_scan_cancel_check(void *arg, dmu_tx_t *tx)
 334  335  {
 335  336          dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
 336  337  
 337  338          if (scn->scn_phys.scn_state != DSS_SCANNING)
 338  339                  return (SET_ERROR(ENOENT));
 339  340          return (0);
 340  341  }
 341  342  
 342  343  /* ARGSUSED */
 343  344  static void
 344  345  dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx)
 345  346  {
 346  347          dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
 347  348  
 348  349          dsl_scan_done(scn, B_FALSE, tx);
 349  350          dsl_scan_sync_state(scn, tx);
 350  351  }
 351  352  
 352  353  int
 353  354  dsl_scan_cancel(dsl_pool_t *dp)
 354  355  {
 355  356          return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check,
 356  357              dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED));
 357  358  }
 358  359  
 359  360  static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
 360  361      dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
 361  362      dmu_objset_type_t ostype, dmu_tx_t *tx);
 362  363  static void dsl_scan_visitdnode(dsl_scan_t *, dsl_dataset_t *ds,
 363  364      dmu_objset_type_t ostype,
 364  365      dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx);
 365  366  
 366  367  void
 367  368  dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp)
 368  369  {
 369  370          zio_free(dp->dp_spa, txg, bp);
 370  371  }
 371  372  
 372  373  void
 373  374  dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
 374  375  {
 375  376          ASSERT(dsl_pool_sync_context(dp));
 376  377          zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags));
 377  378  }
 378  379  
 379  380  static uint64_t
 380  381  dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
 381  382  {
 382  383          uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
 383  384          if (ds->ds_is_snapshot)
 384  385                  return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg));
 385  386          return (smt);
 386  387  }
 387  388  
 388  389  static void
 389  390  dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx)
 390  391  {
 391  392          VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
 392  393              DMU_POOL_DIRECTORY_OBJECT,
 393  394              DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
 394  395              &scn->scn_phys, tx));
 395  396  }
 396  397  
 397  398  extern int zfs_vdev_async_write_active_min_dirty_percent;
 398  399  
 399  400  static boolean_t
 400  401  dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_phys_t *zb)
 401  402  {
 402  403          /* we never skip user/group accounting objects */
 403  404          if (zb && (int64_t)zb->zb_object < 0)
 404  405                  return (B_FALSE);
 405  406  
 406  407          if (scn->scn_pausing)
 407  408                  return (B_TRUE); /* we're already pausing */
 408  409  
 409  410          if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark))
 410  411                  return (B_FALSE); /* we're resuming */
 411  412  
 412  413          /* We only know how to resume from level-0 blocks. */
 413  414          if (zb && zb->zb_level != 0)
 414  415                  return (B_FALSE);
 415  416  
 416  417          /*
 417  418           * We pause if:
 418  419           *  - we have scanned for the maximum time: an entire txg
 419  420           *    timeout (default 5 sec)
 420  421           *  or
 421  422           *  - we have scanned for at least the minimum time (default 1 sec
 422  423           *    for scrub, 3 sec for resilver), and either we have sufficient
 423  424           *    dirty data that we are starting to write more quickly
 424  425           *    (default 30%), or someone is explicitly waiting for this txg
 425  426           *    to complete.
 426  427           *  or
 427  428           *  - the spa is shutting down because this pool is being exported
 428  429           *    or the machine is rebooting.
 429  430           */
 430  431          int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
 431  432              zfs_resilver_min_time_ms : zfs_scan_min_time_ms;
 432  433          uint64_t elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
 433  434          int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max;
 434  435          if (elapsed_nanosecs / NANOSEC >= zfs_txg_timeout ||
 435  436              (NSEC2MSEC(elapsed_nanosecs) > mintime &&
 436  437              (txg_sync_waiting(scn->scn_dp) ||
 437  438              dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent)) ||
 438  439              spa_shutting_down(scn->scn_dp->dp_spa)) {
 439  440                  if (zb) {
 440  441                          dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n",
 441  442                              (longlong_t)zb->zb_objset,
 442  443                              (longlong_t)zb->zb_object,
 443  444                              (longlong_t)zb->zb_level,
 444  445                              (longlong_t)zb->zb_blkid);
 445  446                          scn->scn_phys.scn_bookmark = *zb;
 446  447                  }
 447  448                  dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n",
 448  449                      (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
 449  450                      (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
 450  451                      (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
 451  452                      (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
 452  453                  scn->scn_pausing = B_TRUE;
 453  454                  return (B_TRUE);
 454  455          }
 455  456          return (B_FALSE);
 456  457  }
 457  458  
 458  459  typedef struct zil_scan_arg {
 459  460          dsl_pool_t      *zsa_dp;
 460  461          zil_header_t    *zsa_zh;
 461  462  } zil_scan_arg_t;
 462  463  
 463  464  /* ARGSUSED */
 464  465  static int
 465  466  dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
 466  467  {
 467  468          zil_scan_arg_t *zsa = arg;
 468  469          dsl_pool_t *dp = zsa->zsa_dp;
 469  470          dsl_scan_t *scn = dp->dp_scan;
 470  471          zil_header_t *zh = zsa->zsa_zh;
 471  472          zbookmark_phys_t zb;
 472  473  
 473  474          if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
 474  475                  return (0);
 475  476  
 476  477          /*
 477  478           * One block ("stubby") can be allocated a long time ago; we
 478  479           * want to visit that one because it has been allocated
 479  480           * (on-disk) even if it hasn't been claimed (even though for
 480  481           * scrub there's nothing to do to it).
 481  482           */
 482  483          if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
 483  484                  return (0);
 484  485  
 485  486          SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
 486  487              ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
 487  488  
 488  489          VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
 489  490          return (0);
 490  491  }
 491  492  
 492  493  /* ARGSUSED */
 493  494  static int
 494  495  dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
 495  496  {
 496  497          if (lrc->lrc_txtype == TX_WRITE) {
 497  498                  zil_scan_arg_t *zsa = arg;
 498  499                  dsl_pool_t *dp = zsa->zsa_dp;
 499  500                  dsl_scan_t *scn = dp->dp_scan;
 500  501                  zil_header_t *zh = zsa->zsa_zh;
 501  502                  lr_write_t *lr = (lr_write_t *)lrc;
 502  503                  blkptr_t *bp = &lr->lr_blkptr;
 503  504                  zbookmark_phys_t zb;
 504  505  
 505  506                  if (BP_IS_HOLE(bp) ||
 506  507                      bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
 507  508                          return (0);
 508  509  
 509  510                  /*
 510  511                   * birth can be < claim_txg if this record's txg is
 511  512                   * already txg sync'ed (but this log block contains
 512  513                   * other records that are not synced)
 513  514                   */
 514  515                  if (claim_txg == 0 || bp->blk_birth < claim_txg)
 515  516                          return (0);
 516  517  
 517  518                  SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
 518  519                      lr->lr_foid, ZB_ZIL_LEVEL,
 519  520                      lr->lr_offset / BP_GET_LSIZE(bp));
 520  521  
 521  522                  VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
 522  523          }
 523  524          return (0);
 524  525  }
 525  526  
 526  527  static void
 527  528  dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh)
 528  529  {
 529  530          uint64_t claim_txg = zh->zh_claim_txg;
 530  531          zil_scan_arg_t zsa = { dp, zh };
 531  532          zilog_t *zilog;
 532  533  
 533  534          /*
 534  535           * We only want to visit blocks that have been claimed but not yet
 535  536           * replayed (or, in read-only mode, blocks that *would* be claimed).
 536  537           */
 537  538          if (claim_txg == 0 && spa_writeable(dp->dp_spa))
 538  539                  return;
 539  540  
 540  541          zilog = zil_alloc(dp->dp_meta_objset, zh);
 541  542  
 542  543          (void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa,
 543  544              claim_txg);
 544  545  
 545  546          zil_free(zilog);
 546  547  }
 547  548  
 548  549  /* ARGSUSED */
 549  550  static void
 550  551  dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp,
 551  552      uint64_t objset, uint64_t object, uint64_t blkid)
 552  553  {
 553  554          zbookmark_phys_t czb;
 554  555          arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
 555  556  
 556  557          if (zfs_no_scrub_prefetch)
 557  558                  return;
 558  559  
 559  560          if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_min_txg ||
 560  561              (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE))
 561  562                  return;
 562  563  
 563  564          SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid);
 564  565  
 565  566          (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp,
 566  567              NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
 567  568              ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, &czb);
 568  569  }
 569  570  
 570  571  static boolean_t
 571  572  dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
 572  573      const zbookmark_phys_t *zb)
 573  574  {
 574  575          /*
 575  576           * We never skip over user/group accounting objects (obj<0)
 576  577           */
 577  578          if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) &&
 578  579              (int64_t)zb->zb_object >= 0) {
 579  580                  /*
 580  581                   * If we already visited this bp & everything below (in
 581  582                   * a prior txg sync), don't bother doing it again.
 582  583                   */
 583  584                  if (zbookmark_subtree_completed(dnp, zb,
 584  585                      &scn->scn_phys.scn_bookmark))
 585  586                          return (B_TRUE);
 586  587  
 587  588                  /*
 588  589                   * If we found the block we're trying to resume from, or
 589  590                   * we went past it to a different object, zero it out to
 590  591                   * indicate that it's OK to start checking for pausing
 591  592                   * again.
 592  593                   */
 593  594                  if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 ||
 594  595                      zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) {
 595  596                          dprintf("resuming at %llx/%llx/%llx/%llx\n",
 596  597                              (longlong_t)zb->zb_objset,
 597  598                              (longlong_t)zb->zb_object,
 598  599                              (longlong_t)zb->zb_level,
 599  600                              (longlong_t)zb->zb_blkid);
 600  601                          bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb));
 601  602                  }
 602  603          }
 603  604          return (B_FALSE);
 604  605  }
 605  606  
 606  607  /*
 607  608   * Return nonzero on i/o error.
 608  609   * Return new buf to write out in *bufp.
 609  610   */
 610  611  static int
 611  612  dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
 612  613      dnode_phys_t *dnp, const blkptr_t *bp,
 613  614      const zbookmark_phys_t *zb, dmu_tx_t *tx)
 614  615  {
 615  616          dsl_pool_t *dp = scn->scn_dp;
 616  617          int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
 617  618          int err;
 618  619  
 619  620          if (BP_GET_LEVEL(bp) > 0) {
 620  621                  arc_flags_t flags = ARC_FLAG_WAIT;
 621  622                  int i;
 622  623                  blkptr_t *cbp;
 623  624                  int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
 624  625                  arc_buf_t *buf;
 625  626  
 626  627                  err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
 627  628                      ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
 628  629                  if (err) {
 629  630                          scn->scn_phys.scn_errors++;
 630  631                          return (err);
 631  632                  }
 632  633                  for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
 633  634                          dsl_scan_prefetch(scn, buf, cbp, zb->zb_objset,
 634  635                              zb->zb_object, zb->zb_blkid * epb + i);
 635  636                  }
 636  637                  for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
 637  638                          zbookmark_phys_t czb;
 638  639  
 639  640                          SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
 640  641                              zb->zb_level - 1,
 641  642                              zb->zb_blkid * epb + i);
 642  643                          dsl_scan_visitbp(cbp, &czb, dnp,
 643  644                              ds, scn, ostype, tx);
 644  645                  }
 645  646                  (void) arc_buf_remove_ref(buf, &buf);
 646  647          } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
 647  648                  arc_flags_t flags = ARC_FLAG_WAIT;
 648  649                  dnode_phys_t *cdnp;
 649  650                  int i, j;
 650  651                  int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
 651  652                  arc_buf_t *buf;
 652  653  
 653  654                  err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
 654  655                      ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
 655  656                  if (err) {
 656  657                          scn->scn_phys.scn_errors++;
 657  658                          return (err);
 658  659                  }
 659  660                  for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
 660  661                          for (j = 0; j < cdnp->dn_nblkptr; j++) {
 661  662                                  blkptr_t *cbp = &cdnp->dn_blkptr[j];
 662  663                                  dsl_scan_prefetch(scn, buf, cbp,
 663  664                                      zb->zb_objset, zb->zb_blkid * epb + i, j);
 664  665                          }
 665  666                  }
 666  667                  for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
 667  668                          dsl_scan_visitdnode(scn, ds, ostype,
 668  669                              cdnp, zb->zb_blkid * epb + i, tx);
 669  670                  }
 670  671  
 671  672                  (void) arc_buf_remove_ref(buf, &buf);
 672  673          } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
 673  674                  arc_flags_t flags = ARC_FLAG_WAIT;
 674  675                  objset_phys_t *osp;
 675  676                  arc_buf_t *buf;
 676  677  
 677  678                  err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
 678  679                      ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
 679  680                  if (err) {
 680  681                          scn->scn_phys.scn_errors++;
 681  682                          return (err);
 682  683                  }
 683  684  
 684  685                  osp = buf->b_data;
 685  686  
 686  687                  dsl_scan_visitdnode(scn, ds, osp->os_type,
 687  688                      &osp->os_meta_dnode, DMU_META_DNODE_OBJECT, tx);
 688  689  
 689  690                  if (OBJSET_BUF_HAS_USERUSED(buf)) {
 690  691                          /*
 691  692                           * We also always visit user/group accounting
 692  693                           * objects, and never skip them, even if we are
 693  694                           * pausing.  This is necessary so that the space
 694  695                           * deltas from this txg get integrated.
 695  696                           */
 696  697                          dsl_scan_visitdnode(scn, ds, osp->os_type,
 697  698                              &osp->os_groupused_dnode,
 698  699                              DMU_GROUPUSED_OBJECT, tx);
 699  700                          dsl_scan_visitdnode(scn, ds, osp->os_type,
 700  701                              &osp->os_userused_dnode,
 701  702                              DMU_USERUSED_OBJECT, tx);
 702  703                  }
 703  704                  (void) arc_buf_remove_ref(buf, &buf);
 704  705          }
 705  706  
 706  707          return (0);
 707  708  }
 708  709  
 709  710  static void
 710  711  dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds,
 711  712      dmu_objset_type_t ostype, dnode_phys_t *dnp,
 712  713      uint64_t object, dmu_tx_t *tx)
 713  714  {
 714  715          int j;
 715  716  
 716  717          for (j = 0; j < dnp->dn_nblkptr; j++) {
 717  718                  zbookmark_phys_t czb;
 718  719  
 719  720                  SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
 720  721                      dnp->dn_nlevels - 1, j);
 721  722                  dsl_scan_visitbp(&dnp->dn_blkptr[j],
 722  723                      &czb, dnp, ds, scn, ostype, tx);
 723  724          }
 724  725  
 725  726          if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
 726  727                  zbookmark_phys_t czb;
 727  728                  SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
 728  729                      0, DMU_SPILL_BLKID);
 729  730                  dsl_scan_visitbp(&dnp->dn_spill,
 730  731                      &czb, dnp, ds, scn, ostype, tx);
 731  732          }
 732  733  }
 733  734  
 734  735  /*
 735  736   * The arguments are in this order because mdb can only print the
 736  737   * first 5; we want them to be useful.
 737  738   */
 738  739  static void
 739  740  dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
 740  741      dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
 741  742      dmu_objset_type_t ostype, dmu_tx_t *tx)
 742  743  {
 743  744          dsl_pool_t *dp = scn->scn_dp;
 744  745          arc_buf_t *buf = NULL;
 745  746          blkptr_t bp_toread = *bp;
 746  747  
 747  748          /* ASSERT(pbuf == NULL || arc_released(pbuf)); */
 748  749  
 749  750          if (dsl_scan_check_pause(scn, zb))
 750  751                  return;
 751  752  
 752  753          if (dsl_scan_check_resume(scn, dnp, zb))
 753  754                  return;
 754  755  
 755  756          if (BP_IS_HOLE(bp))
 756  757                  return;
 757  758  
 758  759          scn->scn_visited_this_txg++;
 759  760  
 760  761          dprintf_bp(bp,
 761  762              "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx bp=%p",
 762  763              ds, ds ? ds->ds_object : 0,
 763  764              zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
 764  765              bp);
 765  766  
 766  767          if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
 767  768                  return;
 768  769  
 769  770          if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx) != 0)
 770  771                  return;
 771  772  
 772  773          /*
 773  774           * If dsl_scan_ddt() has aready visited this block, it will have
 774  775           * already done any translations or scrubbing, so don't call the
 775  776           * callback again.
 776  777           */
 777  778          if (ddt_class_contains(dp->dp_spa,
 778  779              scn->scn_phys.scn_ddt_class_max, bp)) {
 779  780                  ASSERT(buf == NULL);
 780  781                  return;
 781  782          }
 782  783  
 783  784          /*
 784  785           * If this block is from the future (after cur_max_txg), then we
 785  786           * are doing this on behalf of a deleted snapshot, and we will
 786  787           * revisit the future block on the next pass of this dataset.
 787  788           * Don't scan it now unless we need to because something
 788  789           * under it was modified.
 789  790           */
 790  791          if (BP_PHYSICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_max_txg) {
 791  792                  scan_funcs[scn->scn_phys.scn_func](dp, bp, zb);
 792  793          }
 793  794  }
 794  795  
 795  796  static void
 796  797  dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp,
 797  798      dmu_tx_t *tx)
 798  799  {
 799  800          zbookmark_phys_t zb;
 800  801  
 801  802          SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
 802  803              ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 803  804          dsl_scan_visitbp(bp, &zb, NULL,
 804  805              ds, scn, DMU_OST_NONE, tx);
 805  806  
 806  807          dprintf_ds(ds, "finished scan%s", "");
 807  808  }
 808  809  
 809  810  void
 810  811  dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
 811  812  {
 812  813          dsl_pool_t *dp = ds->ds_dir->dd_pool;
 813  814          dsl_scan_t *scn = dp->dp_scan;
 814  815          uint64_t mintxg;
 815  816  
 816  817          if (scn->scn_phys.scn_state != DSS_SCANNING)
 817  818                  return;
 818  819  
 819  820          if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
 820  821                  if (ds->ds_is_snapshot) {
 821  822                          /* Note, scn_cur_{min,max}_txg stays the same. */
 822  823                          scn->scn_phys.scn_bookmark.zb_objset =
 823  824                              dsl_dataset_phys(ds)->ds_next_snap_obj;
 824  825                          zfs_dbgmsg("destroying ds %llu; currently traversing; "
 825  826                              "reset zb_objset to %llu",
 826  827                              (u_longlong_t)ds->ds_object,
 827  828                              (u_longlong_t)dsl_dataset_phys(ds)->
 828  829                              ds_next_snap_obj);
 829  830                          scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN;
 830  831                  } else {
 831  832                          SET_BOOKMARK(&scn->scn_phys.scn_bookmark,
 832  833                              ZB_DESTROYED_OBJSET, 0, 0, 0);
 833  834                          zfs_dbgmsg("destroying ds %llu; currently traversing; "
 834  835                              "reset bookmark to -1,0,0,0",
 835  836                              (u_longlong_t)ds->ds_object);
 836  837                  }
 837  838          } else if (zap_lookup_int_key(dp->dp_meta_objset,
 838  839              scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
 839  840                  ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
 840  841                  VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
 841  842                      scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
 842  843                  if (ds->ds_is_snapshot) {
 843  844                          /*
 844  845                           * We keep the same mintxg; it could be >
 845  846                           * ds_creation_txg if the previous snapshot was
 846  847                           * deleted too.
 847  848                           */
 848  849                          VERIFY(zap_add_int_key(dp->dp_meta_objset,
 849  850                              scn->scn_phys.scn_queue_obj,
 850  851                              dsl_dataset_phys(ds)->ds_next_snap_obj,
 851  852                              mintxg, tx) == 0);
 852  853                          zfs_dbgmsg("destroying ds %llu; in queue; "
 853  854                              "replacing with %llu",
 854  855                              (u_longlong_t)ds->ds_object,
 855  856                              (u_longlong_t)dsl_dataset_phys(ds)->
 856  857                              ds_next_snap_obj);
 857  858                  } else {
 858  859                          zfs_dbgmsg("destroying ds %llu; in queue; removing",
 859  860                              (u_longlong_t)ds->ds_object);
 860  861                  }
 861  862          } else {
 862  863                  zfs_dbgmsg("destroying ds %llu; ignoring",
 863  864                      (u_longlong_t)ds->ds_object);
 864  865          }
 865  866  
 866  867          /*
 867  868           * dsl_scan_sync() should be called after this, and should sync
 868  869           * out our changed state, but just to be safe, do it here.
 869  870           */
 870  871          dsl_scan_sync_state(scn, tx);
 871  872  }
 872  873  
 873  874  void
 874  875  dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
 875  876  {
 876  877          dsl_pool_t *dp = ds->ds_dir->dd_pool;
 877  878          dsl_scan_t *scn = dp->dp_scan;
 878  879          uint64_t mintxg;
 879  880  
 880  881          if (scn->scn_phys.scn_state != DSS_SCANNING)
 881  882                  return;
 882  883  
 883  884          ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
 884  885  
 885  886          if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
 886  887                  scn->scn_phys.scn_bookmark.zb_objset =
 887  888                      dsl_dataset_phys(ds)->ds_prev_snap_obj;
 888  889                  zfs_dbgmsg("snapshotting ds %llu; currently traversing; "
 889  890                      "reset zb_objset to %llu",
 890  891                      (u_longlong_t)ds->ds_object,
 891  892                      (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
 892  893          } else if (zap_lookup_int_key(dp->dp_meta_objset,
 893  894              scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
 894  895                  VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
 895  896                      scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
 896  897                  VERIFY(zap_add_int_key(dp->dp_meta_objset,
 897  898                      scn->scn_phys.scn_queue_obj,
 898  899                      dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg, tx) == 0);
 899  900                  zfs_dbgmsg("snapshotting ds %llu; in queue; "
 900  901                      "replacing with %llu",
 901  902                      (u_longlong_t)ds->ds_object,
 902  903                      (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
 903  904          }
 904  905          dsl_scan_sync_state(scn, tx);
 905  906  }
 906  907  
 907  908  void
 908  909  dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
 909  910  {
 910  911          dsl_pool_t *dp = ds1->ds_dir->dd_pool;
 911  912          dsl_scan_t *scn = dp->dp_scan;
 912  913          uint64_t mintxg;
 913  914  
 914  915          if (scn->scn_phys.scn_state != DSS_SCANNING)
 915  916                  return;
 916  917  
 917  918          if (scn->scn_phys.scn_bookmark.zb_objset == ds1->ds_object) {
 918  919                  scn->scn_phys.scn_bookmark.zb_objset = ds2->ds_object;
 919  920                  zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
 920  921                      "reset zb_objset to %llu",
 921  922                      (u_longlong_t)ds1->ds_object,
 922  923                      (u_longlong_t)ds2->ds_object);
 923  924          } else if (scn->scn_phys.scn_bookmark.zb_objset == ds2->ds_object) {
 924  925                  scn->scn_phys.scn_bookmark.zb_objset = ds1->ds_object;
 925  926                  zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
 926  927                      "reset zb_objset to %llu",
 927  928                      (u_longlong_t)ds2->ds_object,
 928  929                      (u_longlong_t)ds1->ds_object);
 929  930          }
 930  931  
 931  932          if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
 932  933              ds1->ds_object, &mintxg) == 0) {
 933  934                  int err;
 934  935  
 935  936                  ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
 936  937                  ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
 937  938                  VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
 938  939                      scn->scn_phys.scn_queue_obj, ds1->ds_object, tx));
 939  940                  err = zap_add_int_key(dp->dp_meta_objset,
 940  941                      scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg, tx);
 941  942                  VERIFY(err == 0 || err == EEXIST);
 942  943                  if (err == EEXIST) {
 943  944                          /* Both were there to begin with */
 944  945                          VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
 945  946                              scn->scn_phys.scn_queue_obj,
 946  947                              ds1->ds_object, mintxg, tx));
 947  948                  }
 948  949                  zfs_dbgmsg("clone_swap ds %llu; in queue; "
 949  950                      "replacing with %llu",
 950  951                      (u_longlong_t)ds1->ds_object,
 951  952                      (u_longlong_t)ds2->ds_object);
 952  953          } else if (zap_lookup_int_key(dp->dp_meta_objset,
 953  954              scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) {
 954  955                  ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
 955  956                  ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
 956  957                  VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
 957  958                      scn->scn_phys.scn_queue_obj, ds2->ds_object, tx));
 958  959                  VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
 959  960                      scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx));
 960  961                  zfs_dbgmsg("clone_swap ds %llu; in queue; "
 961  962                      "replacing with %llu",
 962  963                      (u_longlong_t)ds2->ds_object,
 963  964                      (u_longlong_t)ds1->ds_object);
 964  965          }
 965  966  
 966  967          dsl_scan_sync_state(scn, tx);
 967  968  }
 968  969  
 969  970  struct enqueue_clones_arg {
 970  971          dmu_tx_t *tx;
 971  972          uint64_t originobj;
 972  973  };
 973  974  
 974  975  /* ARGSUSED */
 975  976  static int
 976  977  enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
 977  978  {
 978  979          struct enqueue_clones_arg *eca = arg;
 979  980          dsl_dataset_t *ds;
 980  981          int err;
 981  982          dsl_scan_t *scn = dp->dp_scan;
 982  983  
 983  984          if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != eca->originobj)
 984  985                  return (0);
 985  986  
 986  987          err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
 987  988          if (err)
 988  989                  return (err);
 989  990  
 990  991          while (dsl_dataset_phys(ds)->ds_prev_snap_obj != eca->originobj) {
 991  992                  dsl_dataset_t *prev;
 992  993                  err = dsl_dataset_hold_obj(dp,
 993  994                      dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
 994  995  
 995  996                  dsl_dataset_rele(ds, FTAG);
 996  997                  if (err)
 997  998                          return (err);
 998  999                  ds = prev;
 999 1000          }
1000 1001          VERIFY(zap_add_int_key(dp->dp_meta_objset,
1001 1002              scn->scn_phys.scn_queue_obj, ds->ds_object,
1002 1003              dsl_dataset_phys(ds)->ds_prev_snap_txg, eca->tx) == 0);
1003 1004          dsl_dataset_rele(ds, FTAG);
1004 1005          return (0);
1005 1006  }
1006 1007  
1007 1008  static void
1008 1009  dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
1009 1010  {
1010 1011          dsl_pool_t *dp = scn->scn_dp;
1011 1012          dsl_dataset_t *ds;
1012 1013          objset_t *os;
1013 1014  
1014 1015          VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
1015 1016  
1016 1017          if (dmu_objset_from_ds(ds, &os))
1017 1018                  goto out;
1018 1019  
1019 1020          /*
1020 1021           * Only the ZIL in the head (non-snapshot) is valid.  Even though
1021 1022           * snapshots can have ZIL block pointers (which may be the same
1022 1023           * BP as in the head), they must be ignored.  So we traverse the
1023 1024           * ZIL here, rather than in scan_recurse(), because the regular
1024 1025           * snapshot block-sharing rules don't apply to it.
1025 1026           */
1026 1027          if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !ds->ds_is_snapshot)
1027 1028                  dsl_scan_zil(dp, &os->os_zil_header);
1028 1029  
1029 1030          /*
1030 1031           * Iterate over the bps in this ds.
1031 1032           */
1032 1033          dmu_buf_will_dirty(ds->ds_dbuf, tx);
1033 1034          dsl_scan_visit_rootbp(scn, ds, &dsl_dataset_phys(ds)->ds_bp, tx);
1034 1035  
1035 1036          char *dsname = kmem_alloc(ZFS_MAXNAMELEN, KM_SLEEP);
1036 1037          dsl_dataset_name(ds, dsname);
1037 1038          zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; "
1038 1039              "pausing=%u",
1039 1040              (longlong_t)dsobj, dsname,
1040 1041              (longlong_t)scn->scn_phys.scn_cur_min_txg,
1041 1042              (longlong_t)scn->scn_phys.scn_cur_max_txg,
1042 1043              (int)scn->scn_pausing);
1043 1044          kmem_free(dsname, ZFS_MAXNAMELEN);
1044 1045  
1045 1046          if (scn->scn_pausing)
1046 1047                  goto out;
1047 1048  
1048 1049          /*
1049 1050           * We've finished this pass over this dataset.
1050 1051           */
1051 1052  
1052 1053          /*
1053 1054           * If we did not completely visit this dataset, do another pass.
1054 1055           */
1055 1056          if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) {
1056 1057                  zfs_dbgmsg("incomplete pass; visiting again");
1057 1058                  scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN;
1058 1059                  VERIFY(zap_add_int_key(dp->dp_meta_objset,
1059 1060                      scn->scn_phys.scn_queue_obj, ds->ds_object,
1060 1061                      scn->scn_phys.scn_cur_max_txg, tx) == 0);
1061 1062                  goto out;
1062 1063          }
1063 1064  
1064 1065          /*
1065 1066           * Add descendent datasets to work queue.
1066 1067           */
1067 1068          if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) {
1068 1069                  VERIFY(zap_add_int_key(dp->dp_meta_objset,
1069 1070                      scn->scn_phys.scn_queue_obj,
1070 1071                      dsl_dataset_phys(ds)->ds_next_snap_obj,
1071 1072                      dsl_dataset_phys(ds)->ds_creation_txg, tx) == 0);
1072 1073          }
1073 1074          if (dsl_dataset_phys(ds)->ds_num_children > 1) {
1074 1075                  boolean_t usenext = B_FALSE;
1075 1076                  if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
1076 1077                          uint64_t count;
1077 1078                          /*
1078 1079                           * A bug in a previous version of the code could
1079 1080                           * cause upgrade_clones_cb() to not set
1080 1081                           * ds_next_snap_obj when it should, leading to a
1081 1082                           * missing entry.  Therefore we can only use the
1082 1083                           * next_clones_obj when its count is correct.
1083 1084                           */
1084 1085                          int err = zap_count(dp->dp_meta_objset,
1085 1086                              dsl_dataset_phys(ds)->ds_next_clones_obj, &count);
1086 1087                          if (err == 0 &&
1087 1088                              count == dsl_dataset_phys(ds)->ds_num_children - 1)
1088 1089                                  usenext = B_TRUE;
1089 1090                  }
1090 1091  
1091 1092                  if (usenext) {
1092 1093                          VERIFY0(zap_join_key(dp->dp_meta_objset,
1093 1094                              dsl_dataset_phys(ds)->ds_next_clones_obj,
1094 1095                              scn->scn_phys.scn_queue_obj,
1095 1096                              dsl_dataset_phys(ds)->ds_creation_txg, tx));
1096 1097                  } else {
1097 1098                          struct enqueue_clones_arg eca;
1098 1099                          eca.tx = tx;
1099 1100                          eca.originobj = ds->ds_object;
1100 1101  
1101 1102                          VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
1102 1103                              enqueue_clones_cb, &eca, DS_FIND_CHILDREN));
1103 1104                  }
1104 1105          }
1105 1106  
1106 1107  out:
1107 1108          dsl_dataset_rele(ds, FTAG);
1108 1109  }
1109 1110  
1110 1111  /* ARGSUSED */
1111 1112  static int
1112 1113  enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
1113 1114  {
1114 1115          dmu_tx_t *tx = arg;
1115 1116          dsl_dataset_t *ds;
1116 1117          int err;
1117 1118          dsl_scan_t *scn = dp->dp_scan;
1118 1119  
1119 1120          err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
1120 1121          if (err)
1121 1122                  return (err);
1122 1123  
1123 1124          while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
1124 1125                  dsl_dataset_t *prev;
1125 1126                  err = dsl_dataset_hold_obj(dp,
1126 1127                      dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
1127 1128                  if (err) {
1128 1129                          dsl_dataset_rele(ds, FTAG);
1129 1130                          return (err);
1130 1131                  }
1131 1132  
1132 1133                  /*
1133 1134                   * If this is a clone, we don't need to worry about it for now.
1134 1135                   */
1135 1136                  if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) {
1136 1137                          dsl_dataset_rele(ds, FTAG);
1137 1138                          dsl_dataset_rele(prev, FTAG);
1138 1139                          return (0);
1139 1140                  }
1140 1141                  dsl_dataset_rele(ds, FTAG);
1141 1142                  ds = prev;
1142 1143          }
1143 1144  
1144 1145          VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
1145 1146              ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg, tx) == 0);
1146 1147          dsl_dataset_rele(ds, FTAG);
1147 1148          return (0);
1148 1149  }
1149 1150  
1150 1151  /*
1151 1152   * Scrub/dedup interaction.
1152 1153   *
1153 1154   * If there are N references to a deduped block, we don't want to scrub it
1154 1155   * N times -- ideally, we should scrub it exactly once.
1155 1156   *
1156 1157   * We leverage the fact that the dde's replication class (enum ddt_class)
1157 1158   * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest
1158 1159   * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order.
1159 1160   *
1160 1161   * To prevent excess scrubbing, the scrub begins by walking the DDT
1161 1162   * to find all blocks with refcnt > 1, and scrubs each of these once.
1162 1163   * Since there are two replication classes which contain blocks with
1163 1164   * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first.
1164 1165   * Finally the top-down scrub begins, only visiting blocks with refcnt == 1.
1165 1166   *
1166 1167   * There would be nothing more to say if a block's refcnt couldn't change
1167 1168   * during a scrub, but of course it can so we must account for changes
1168 1169   * in a block's replication class.
1169 1170   *
1170 1171   * Here's an example of what can occur:
1171 1172   *
1172 1173   * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1
1173 1174   * when visited during the top-down scrub phase, it will be scrubbed twice.
1174 1175   * This negates our scrub optimization, but is otherwise harmless.
1175 1176   *
1176 1177   * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1
1177 1178   * on each visit during the top-down scrub phase, it will never be scrubbed.
1178 1179   * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's
1179 1180   * reference class transitions to a higher level (i.e DDT_CLASS_UNIQUE to
1180 1181   * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1
1181 1182   * while a scrub is in progress, it scrubs the block right then.
1182 1183   */
1183 1184  static void
1184 1185  dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
1185 1186  {
1186 1187          ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark;
1187 1188          ddt_entry_t dde = { 0 };
1188 1189          int error;
1189 1190          uint64_t n = 0;
1190 1191  
1191 1192          while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) {
1192 1193                  ddt_t *ddt;
1193 1194  
1194 1195                  if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max)
1195 1196                          break;
1196 1197                  dprintf("visiting ddb=%llu/%llu/%llu/%llx\n",
1197 1198                      (longlong_t)ddb->ddb_class,
1198 1199                      (longlong_t)ddb->ddb_type,
1199 1200                      (longlong_t)ddb->ddb_checksum,
1200 1201                      (longlong_t)ddb->ddb_cursor);
1201 1202  
1202 1203                  /* There should be no pending changes to the dedup table */
1203 1204                  ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum];
1204 1205                  ASSERT(avl_first(&ddt->ddt_tree) == NULL);
1205 1206  
1206 1207                  dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx);
1207 1208                  n++;
1208 1209  
1209 1210                  if (dsl_scan_check_pause(scn, NULL))
1210 1211                          break;
1211 1212          }
1212 1213  
1213 1214          zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; pausing=%u",
1214 1215              (longlong_t)n, (int)scn->scn_phys.scn_ddt_class_max,
1215 1216              (int)scn->scn_pausing);
1216 1217  
1217 1218          ASSERT(error == 0 || error == ENOENT);
1218 1219          ASSERT(error != ENOENT ||
1219 1220              ddb->ddb_class > scn->scn_phys.scn_ddt_class_max);
1220 1221  }
1221 1222  
1222 1223  /* ARGSUSED */
1223 1224  void
1224 1225  dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
1225 1226      ddt_entry_t *dde, dmu_tx_t *tx)
1226 1227  {
1227 1228          const ddt_key_t *ddk = &dde->dde_key;
1228 1229          ddt_phys_t *ddp = dde->dde_phys;
1229 1230          blkptr_t bp;
1230 1231          zbookmark_phys_t zb = { 0 };
1231 1232  
1232 1233          if (scn->scn_phys.scn_state != DSS_SCANNING)
1233 1234                  return;
1234 1235  
1235 1236          for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
1236 1237                  if (ddp->ddp_phys_birth == 0 ||
1237 1238                      ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg)
1238 1239                          continue;
1239 1240                  ddt_bp_create(checksum, ddk, ddp, &bp);
1240 1241  
1241 1242                  scn->scn_visited_this_txg++;
1242 1243                  scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb);
1243 1244          }
1244 1245  }
1245 1246  
1246 1247  static void
1247 1248  dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
1248 1249  {
1249 1250          dsl_pool_t *dp = scn->scn_dp;
1250 1251          zap_cursor_t zc;
1251 1252          zap_attribute_t za;
1252 1253  
1253 1254          if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
1254 1255              scn->scn_phys.scn_ddt_class_max) {
1255 1256                  scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
1256 1257                  scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
1257 1258                  dsl_scan_ddt(scn, tx);
1258 1259                  if (scn->scn_pausing)
1259 1260                          return;
1260 1261          }
1261 1262  
1262 1263          if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) {
1263 1264                  /* First do the MOS & ORIGIN */
1264 1265  
1265 1266                  scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
1266 1267                  scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
1267 1268                  dsl_scan_visit_rootbp(scn, NULL,
1268 1269                      &dp->dp_meta_rootbp, tx);
1269 1270                  spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
1270 1271                  if (scn->scn_pausing)
1271 1272                          return;
1272 1273  
1273 1274                  if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
1274 1275                          VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
1275 1276                              enqueue_cb, tx, DS_FIND_CHILDREN));
1276 1277                  } else {
1277 1278                          dsl_scan_visitds(scn,
1278 1279                              dp->dp_origin_snap->ds_object, tx);
1279 1280                  }
1280 1281                  ASSERT(!scn->scn_pausing);
1281 1282          } else if (scn->scn_phys.scn_bookmark.zb_objset !=
1282 1283              ZB_DESTROYED_OBJSET) {
1283 1284                  /*
1284 1285                   * If we were paused, continue from here.  Note if the
1285 1286                   * ds we were paused on was deleted, the zb_objset may
1286 1287                   * be -1, so we will skip this and find a new objset
1287 1288                   * below.
1288 1289                   */
1289 1290                  dsl_scan_visitds(scn, scn->scn_phys.scn_bookmark.zb_objset, tx);
1290 1291                  if (scn->scn_pausing)
1291 1292                          return;
1292 1293          }
1293 1294  
1294 1295          /*
1295 1296           * In case we were paused right at the end of the ds, zero the
1296 1297           * bookmark so we don't think that we're still trying to resume.
1297 1298           */
1298 1299          bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_phys_t));
1299 1300  
1300 1301          /* keep pulling things out of the zap-object-as-queue */
1301 1302          while (zap_cursor_init(&zc, dp->dp_meta_objset,
1302 1303              scn->scn_phys.scn_queue_obj),
1303 1304              zap_cursor_retrieve(&zc, &za) == 0) {
1304 1305                  dsl_dataset_t *ds;
1305 1306                  uint64_t dsobj;
1306 1307  
1307 1308                  dsobj = strtonum(za.za_name, NULL);
1308 1309                  VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
1309 1310                      scn->scn_phys.scn_queue_obj, dsobj, tx));
1310 1311  
1311 1312                  /* Set up min/max txg */
1312 1313                  VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
1313 1314                  if (za.za_first_integer != 0) {
1314 1315                          scn->scn_phys.scn_cur_min_txg =
1315 1316                              MAX(scn->scn_phys.scn_min_txg,
1316 1317                              za.za_first_integer);
1317 1318                  } else {
1318 1319                          scn->scn_phys.scn_cur_min_txg =
1319 1320                              MAX(scn->scn_phys.scn_min_txg,
1320 1321                              dsl_dataset_phys(ds)->ds_prev_snap_txg);
1321 1322                  }
1322 1323                  scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds);
1323 1324                  dsl_dataset_rele(ds, FTAG);
1324 1325  
1325 1326                  dsl_scan_visitds(scn, dsobj, tx);
1326 1327                  zap_cursor_fini(&zc);
1327 1328                  if (scn->scn_pausing)
1328 1329                          return;
1329 1330          }
1330 1331          zap_cursor_fini(&zc);
1331 1332  }
1332 1333  
1333 1334  static boolean_t
1334 1335  dsl_scan_free_should_pause(dsl_scan_t *scn)
1335 1336  {
1336 1337          uint64_t elapsed_nanosecs;
1337 1338  
1338 1339          if (zfs_recover)
1339 1340                  return (B_FALSE);
1340 1341  
1341 1342          if (scn->scn_visited_this_txg >= zfs_free_max_blocks)
1342 1343                  return (B_TRUE);
1343 1344  
1344 1345          elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
1345 1346          return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
1346 1347              (NSEC2MSEC(elapsed_nanosecs) > zfs_free_min_time_ms &&
1347 1348              txg_sync_waiting(scn->scn_dp)) ||
1348 1349              spa_shutting_down(scn->scn_dp->dp_spa));
1349 1350  }
1350 1351  
1351 1352  static int
1352 1353  dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
1353 1354  {
1354 1355          dsl_scan_t *scn = arg;
1355 1356  
1356 1357          if (!scn->scn_is_bptree ||
1357 1358              (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) {
1358 1359                  if (dsl_scan_free_should_pause(scn))
1359 1360                          return (SET_ERROR(ERESTART));
1360 1361          }
1361 1362  
1362 1363          zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
1363 1364              dmu_tx_get_txg(tx), bp, 0));
1364 1365          dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
1365 1366              -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),
1366 1367              -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
1367 1368          scn->scn_visited_this_txg++;
1368 1369          return (0);
1369 1370  }
1370 1371  
1371 1372  boolean_t
1372 1373  dsl_scan_active(dsl_scan_t *scn)
1373 1374  {
1374 1375          spa_t *spa = scn->scn_dp->dp_spa;
1375 1376          uint64_t used = 0, comp, uncomp;
1376 1377  
1377 1378          if (spa->spa_load_state != SPA_LOAD_NONE)
1378 1379                  return (B_FALSE);
1379 1380          if (spa_shutting_down(spa))
1380 1381                  return (B_FALSE);
1381 1382          if (scn->scn_phys.scn_state == DSS_SCANNING ||
1382 1383              (scn->scn_async_destroying && !scn->scn_async_stalled))
1383 1384                  return (B_TRUE);
1384 1385  
1385 1386          if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
1386 1387                  (void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
1387 1388                      &used, &comp, &uncomp);
1388 1389          }
1389 1390          return (used != 0);
1390 1391  }
1391 1392  
1392 1393  void
1393 1394  dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
1394 1395  {
1395 1396          dsl_scan_t *scn = dp->dp_scan;
1396 1397          spa_t *spa = dp->dp_spa;
1397 1398          int err = 0;
1398 1399  
1399 1400          /*
1400 1401           * Check for scn_restart_txg before checking spa_load_state, so
1401 1402           * that we can restart an old-style scan while the pool is being
1402 1403           * imported (see dsl_scan_init).
1403 1404           */
1404 1405          if (scn->scn_restart_txg != 0 &&
1405 1406              scn->scn_restart_txg <= tx->tx_txg) {
1406 1407                  pool_scan_func_t func = POOL_SCAN_SCRUB;
1407 1408                  dsl_scan_done(scn, B_FALSE, tx);
1408 1409                  if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
1409 1410                          func = POOL_SCAN_RESILVER;
1410 1411                  zfs_dbgmsg("restarting scan func=%u txg=%llu",
1411 1412                      func, tx->tx_txg);
1412 1413                  dsl_scan_setup_sync(&func, tx);
1413 1414          }
1414 1415  
1415 1416          /*
1416 1417           * If the scan is inactive due to a stalled async destroy, try again.
1417 1418           */
1418 1419          if ((!scn->scn_async_stalled && !dsl_scan_active(scn)) ||
1419 1420              spa_sync_pass(dp->dp_spa) > 1)
1420 1421                  return;
1421 1422  
1422 1423          scn->scn_visited_this_txg = 0;
1423 1424          scn->scn_pausing = B_FALSE;
1424 1425          scn->scn_sync_start_time = gethrtime();
1425 1426          spa->spa_scrub_active = B_TRUE;
1426 1427  
1427 1428          /*
1428 1429           * First process the async destroys.  If we pause, don't do
1429 1430           * any scrubbing or resilvering.  This ensures that there are no
1430 1431           * async destroys while we are scanning, so the scan code doesn't
1431 1432           * have to worry about traversing it.  It is also faster to free the
1432 1433           * blocks than to scrub them.
1433 1434           */
1434 1435          if (zfs_free_bpobj_enabled &&
1435 1436              spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
1436 1437                  scn->scn_is_bptree = B_FALSE;
1437 1438                  scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
1438 1439                      NULL, ZIO_FLAG_MUSTSUCCEED);
1439 1440                  err = bpobj_iterate(&dp->dp_free_bpobj,
1440 1441                      dsl_scan_free_block_cb, scn, tx);
1441 1442                  VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
1442 1443  
1443 1444                  if (err != 0 && err != ERESTART)
1444 1445                          zfs_panic_recover("error %u from bpobj_iterate()", err);
1445 1446          }
1446 1447  
1447 1448          if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
1448 1449                  ASSERT(scn->scn_async_destroying);
1449 1450                  scn->scn_is_bptree = B_TRUE;
1450 1451                  scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
1451 1452                      NULL, ZIO_FLAG_MUSTSUCCEED);
1452 1453                  err = bptree_iterate(dp->dp_meta_objset,
1453 1454                      dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx);
1454 1455                  VERIFY0(zio_wait(scn->scn_zio_root));
1455 1456  
1456 1457                  if (err == EIO || err == ECKSUM) {
1457 1458                          err = 0;
1458 1459                  } else if (err != 0 && err != ERESTART) {
1459 1460                          zfs_panic_recover("error %u from "
1460 1461                              "traverse_dataset_destroyed()", err);
1461 1462                  }
1462 1463  
1463 1464                  if (bptree_is_empty(dp->dp_meta_objset, dp->dp_bptree_obj)) {
1464 1465                          /* finished; deactivate async destroy feature */
1465 1466                          spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx);
1466 1467                          ASSERT(!spa_feature_is_active(spa,
1467 1468                              SPA_FEATURE_ASYNC_DESTROY));
1468 1469                          VERIFY0(zap_remove(dp->dp_meta_objset,
1469 1470                              DMU_POOL_DIRECTORY_OBJECT,
1470 1471                              DMU_POOL_BPTREE_OBJ, tx));
1471 1472                          VERIFY0(bptree_free(dp->dp_meta_objset,
1472 1473                              dp->dp_bptree_obj, tx));
1473 1474                          dp->dp_bptree_obj = 0;
1474 1475                          scn->scn_async_destroying = B_FALSE;
1475 1476                          scn->scn_async_stalled = B_FALSE;
1476 1477                  } else {
1477 1478                          /*
1478 1479                           * If we didn't make progress, mark the async
1479 1480                           * destroy as stalled, so that we will not initiate
1480 1481                           * a spa_sync() on its behalf.  Note that we only
1481 1482                           * check this if we are not finished, because if the
1482 1483                           * bptree had no blocks for us to visit, we can
1483 1484                           * finish without "making progress".
1484 1485                           */
1485 1486                          scn->scn_async_stalled =
1486 1487                              (scn->scn_visited_this_txg == 0);
1487 1488                  }
1488 1489          }
1489 1490          if (scn->scn_visited_this_txg) {
1490 1491                  zfs_dbgmsg("freed %llu blocks in %llums from "
1491 1492                      "free_bpobj/bptree txg %llu; err=%u",
1492 1493                      (longlong_t)scn->scn_visited_this_txg,
1493 1494                      (longlong_t)
1494 1495                      NSEC2MSEC(gethrtime() - scn->scn_sync_start_time),
1495 1496                      (longlong_t)tx->tx_txg, err);
1496 1497                  scn->scn_visited_this_txg = 0;

↓ open down ↓

1463 lines elided

↑ open up ↑

1497 1498  
1498 1499                  /*
1499 1500                   * Write out changes to the DDT that may be required as a
1500 1501                   * result of the blocks freed.  This ensures that the DDT
1501 1502                   * is clean when a scrub/resilver runs.
1502 1503                   */
1503 1504                  ddt_sync(spa, tx->tx_txg);
1504 1505          }
1505 1506          if (err != 0)
1506 1507                  return;
1507      -        if (!scn->scn_async_destroying && zfs_free_leak_on_eio &&
     1508 +        if (dp->dp_free_dir != NULL && !scn->scn_async_destroying &&
     1509 +            zfs_free_leak_on_eio && 
1508 1510              (dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes != 0 ||
1509 1511              dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes != 0 ||
1510 1512              dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes != 0)) {
1511 1513                  /*
1512 1514                   * We have finished background destroying, but there is still
1513 1515                   * some space left in the dp_free_dir. Transfer this leaked
1514 1516                   * space to the dp_leak_dir.
1515 1517                   */
1516 1518                  if (dp->dp_leak_dir == NULL) {
1517 1519                          rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);

1518 1520                          (void) dsl_dir_create_sync(dp, dp->dp_root_dir,
1519 1521                              LEAK_DIR_NAME, tx);
1520 1522                          VERIFY0(dsl_pool_open_special_dir(dp,
1521 1523                              LEAK_DIR_NAME, &dp->dp_leak_dir));
1522 1524                          rrw_exit(&dp->dp_config_rwlock, FTAG);

↓ open down ↓

5 lines elided

↑ open up ↑

1523 1525                  }
1524 1526                  dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD,
1525 1527                      dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes,
1526 1528                      dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes,
1527 1529                      dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx);
1528 1530                  dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
1529 1531                      -dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes,
1530 1532                      -dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes,
1531 1533                      -dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx);
1532 1534          }
1533      -        if (!scn->scn_async_destroying) {
     1535 +        if (dp->dp_free_dir != NULL && !scn->scn_async_destroying) {
1534 1536                  /* finished; verify that space accounting went to zero */
1535 1537                  ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes);
1536 1538                  ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes);
1537 1539                  ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes);
1538 1540          }
1539 1541  
1540 1542          if (scn->scn_phys.scn_state != DSS_SCANNING)
1541 1543                  return;
1542 1544  
1543 1545          if (scn->scn_done_txg == tx->tx_txg) {

1544 1546                  ASSERT(!scn->scn_pausing);
1545 1547                  /* finished with scan. */
1546 1548                  zfs_dbgmsg("txg %llu scan complete", tx->tx_txg);
1547 1549                  dsl_scan_done(scn, B_TRUE, tx);
1548 1550                  ASSERT3U(spa->spa_scrub_inflight, ==, 0);
1549 1551                  dsl_scan_sync_state(scn, tx);
1550 1552                  return;
1551 1553          }
1552 1554  
1553 1555          if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
1554 1556              scn->scn_phys.scn_ddt_class_max) {
1555 1557                  zfs_dbgmsg("doing scan sync txg %llu; "
1556 1558                      "ddt bm=%llu/%llu/%llu/%llx",
1557 1559                      (longlong_t)tx->tx_txg,
1558 1560                      (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
1559 1561                      (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
1560 1562                      (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
1561 1563                      (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
1562 1564                  ASSERT(scn->scn_phys.scn_bookmark.zb_objset == 0);
1563 1565                  ASSERT(scn->scn_phys.scn_bookmark.zb_object == 0);
1564 1566                  ASSERT(scn->scn_phys.scn_bookmark.zb_level == 0);
1565 1567                  ASSERT(scn->scn_phys.scn_bookmark.zb_blkid == 0);
1566 1568          } else {
1567 1569                  zfs_dbgmsg("doing scan sync txg %llu; bm=%llu/%llu/%llu/%llu",
1568 1570                      (longlong_t)tx->tx_txg,
1569 1571                      (longlong_t)scn->scn_phys.scn_bookmark.zb_objset,
1570 1572                      (longlong_t)scn->scn_phys.scn_bookmark.zb_object,
1571 1573                      (longlong_t)scn->scn_phys.scn_bookmark.zb_level,
1572 1574                      (longlong_t)scn->scn_phys.scn_bookmark.zb_blkid);
1573 1575          }
1574 1576  
1575 1577          scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
1576 1578              NULL, ZIO_FLAG_CANFAIL);
1577 1579          dsl_pool_config_enter(dp, FTAG);
1578 1580          dsl_scan_visit(scn, tx);
1579 1581          dsl_pool_config_exit(dp, FTAG);
1580 1582          (void) zio_wait(scn->scn_zio_root);
1581 1583          scn->scn_zio_root = NULL;
1582 1584  
1583 1585          zfs_dbgmsg("visited %llu blocks in %llums",
1584 1586              (longlong_t)scn->scn_visited_this_txg,
1585 1587              (longlong_t)NSEC2MSEC(gethrtime() - scn->scn_sync_start_time));
1586 1588  
1587 1589          if (!scn->scn_pausing) {
1588 1590                  scn->scn_done_txg = tx->tx_txg + 1;
1589 1591                  zfs_dbgmsg("txg %llu traversal complete, waiting till txg %llu",
1590 1592                      tx->tx_txg, scn->scn_done_txg);
1591 1593          }
1592 1594  
1593 1595          if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
1594 1596                  mutex_enter(&spa->spa_scrub_lock);
1595 1597                  while (spa->spa_scrub_inflight > 0) {
1596 1598                          cv_wait(&spa->spa_scrub_io_cv,
1597 1599                              &spa->spa_scrub_lock);
1598 1600                  }
1599 1601                  mutex_exit(&spa->spa_scrub_lock);
1600 1602          }
1601 1603  
1602 1604          dsl_scan_sync_state(scn, tx);
1603 1605  }
1604 1606  
1605 1607  /*
1606 1608   * This will start a new scan, or restart an existing one.
1607 1609   */
1608 1610  void
1609 1611  dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg)
1610 1612  {
1611 1613          if (txg == 0) {
1612 1614                  dmu_tx_t *tx;
1613 1615                  tx = dmu_tx_create_dd(dp->dp_mos_dir);
1614 1616                  VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));
1615 1617  
1616 1618                  txg = dmu_tx_get_txg(tx);
1617 1619                  dp->dp_scan->scn_restart_txg = txg;
1618 1620                  dmu_tx_commit(tx);
1619 1621          } else {
1620 1622                  dp->dp_scan->scn_restart_txg = txg;
1621 1623          }
1622 1624          zfs_dbgmsg("restarting resilver txg=%llu", txg);
1623 1625  }
1624 1626  
1625 1627  boolean_t
1626 1628  dsl_scan_resilvering(dsl_pool_t *dp)
1627 1629  {
1628 1630          return (dp->dp_scan->scn_phys.scn_state == DSS_SCANNING &&
1629 1631              dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER);
1630 1632  }
1631 1633  
1632 1634  /*
1633 1635   * scrub consumers
1634 1636   */
1635 1637  
1636 1638  static void
1637 1639  count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
1638 1640  {
1639 1641          int i;
1640 1642  
1641 1643          /*
1642 1644           * If we resume after a reboot, zab will be NULL; don't record
1643 1645           * incomplete stats in that case.
1644 1646           */
1645 1647          if (zab == NULL)
1646 1648                  return;
1647 1649  
1648 1650          for (i = 0; i < 4; i++) {
1649 1651                  int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
1650 1652                  int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
1651 1653                  if (t & DMU_OT_NEWTYPE)
1652 1654                          t = DMU_OT_OTHER;
1653 1655                  zfs_blkstat_t *zb = &zab->zab_type[l][t];
1654 1656                  int equal;
1655 1657  
1656 1658                  zb->zb_count++;
1657 1659                  zb->zb_asize += BP_GET_ASIZE(bp);
1658 1660                  zb->zb_lsize += BP_GET_LSIZE(bp);
1659 1661                  zb->zb_psize += BP_GET_PSIZE(bp);
1660 1662                  zb->zb_gangs += BP_COUNT_GANG(bp);
1661 1663  
1662 1664                  switch (BP_GET_NDVAS(bp)) {
1663 1665                  case 2:
1664 1666                          if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
1665 1667                              DVA_GET_VDEV(&bp->blk_dva[1]))
1666 1668                                  zb->zb_ditto_2_of_2_samevdev++;
1667 1669                          break;
1668 1670                  case 3:
1669 1671                          equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
1670 1672                              DVA_GET_VDEV(&bp->blk_dva[1])) +
1671 1673                              (DVA_GET_VDEV(&bp->blk_dva[0]) ==
1672 1674                              DVA_GET_VDEV(&bp->blk_dva[2])) +
1673 1675                              (DVA_GET_VDEV(&bp->blk_dva[1]) ==
1674 1676                              DVA_GET_VDEV(&bp->blk_dva[2]));
1675 1677                          if (equal == 1)
1676 1678                                  zb->zb_ditto_2_of_3_samevdev++;
1677 1679                          else if (equal == 3)
1678 1680                                  zb->zb_ditto_3_of_3_samevdev++;
1679 1681                          break;
1680 1682                  }
1681 1683          }
1682 1684  }
1683 1685  
1684 1686  static void
1685 1687  dsl_scan_scrub_done(zio_t *zio)
1686 1688  {
1687 1689          spa_t *spa = zio->io_spa;
1688 1690  
1689 1691          zio_data_buf_free(zio->io_data, zio->io_size);
1690 1692  
1691 1693          mutex_enter(&spa->spa_scrub_lock);
1692 1694          spa->spa_scrub_inflight--;
1693 1695          cv_broadcast(&spa->spa_scrub_io_cv);
1694 1696  
1695 1697          if (zio->io_error && (zio->io_error != ECKSUM ||
1696 1698              !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) {
1697 1699                  spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors++;
1698 1700          }
1699 1701          mutex_exit(&spa->spa_scrub_lock);
1700 1702  }
1701 1703  
1702 1704  static int
1703 1705  dsl_scan_scrub_cb(dsl_pool_t *dp,
1704 1706      const blkptr_t *bp, const zbookmark_phys_t *zb)
1705 1707  {
1706 1708          dsl_scan_t *scn = dp->dp_scan;
1707 1709          size_t size = BP_GET_PSIZE(bp);
1708 1710          spa_t *spa = dp->dp_spa;
1709 1711          uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
1710 1712          boolean_t needs_io;
1711 1713          int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
1712 1714          int scan_delay = 0;
1713 1715  
1714 1716          if (phys_birth <= scn->scn_phys.scn_min_txg ||
1715 1717              phys_birth >= scn->scn_phys.scn_max_txg)
1716 1718                  return (0);
1717 1719  
1718 1720          count_block(dp->dp_blkstats, bp);
1719 1721  
1720 1722          if (BP_IS_EMBEDDED(bp))
1721 1723                  return (0);
1722 1724  
1723 1725          ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
1724 1726          if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) {
1725 1727                  zio_flags |= ZIO_FLAG_SCRUB;
1726 1728                  needs_io = B_TRUE;
1727 1729                  scan_delay = zfs_scrub_delay;
1728 1730          } else {
1729 1731                  ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER);
1730 1732                  zio_flags |= ZIO_FLAG_RESILVER;
1731 1733                  needs_io = B_FALSE;
1732 1734                  scan_delay = zfs_resilver_delay;
1733 1735          }
1734 1736  
1735 1737          /* If it's an intent log block, failure is expected. */
1736 1738          if (zb->zb_level == ZB_ZIL_LEVEL)
1737 1739                  zio_flags |= ZIO_FLAG_SPECULATIVE;
1738 1740  
1739 1741          for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
1740 1742                  vdev_t *vd = vdev_lookup_top(spa,
1741 1743                      DVA_GET_VDEV(&bp->blk_dva[d]));
1742 1744  
1743 1745                  /*
1744 1746                   * Keep track of how much data we've examined so that
1745 1747                   * zpool(1M) status can make useful progress reports.
1746 1748                   */
1747 1749                  scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]);
1748 1750                  spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]);
1749 1751  
1750 1752                  /* if it's a resilver, this may not be in the target range */
1751 1753                  if (!needs_io) {
1752 1754                          if (DVA_GET_GANG(&bp->blk_dva[d])) {
1753 1755                                  /*
1754 1756                                   * Gang members may be spread across multiple
1755 1757                                   * vdevs, so the best estimate we have is the
1756 1758                                   * scrub range, which has already been checked.
1757 1759                                   * XXX -- it would be better to change our
1758 1760                                   * allocation policy to ensure that all
1759 1761                                   * gang members reside on the same vdev.
1760 1762                                   */
1761 1763                                  needs_io = B_TRUE;
1762 1764                          } else {
1763 1765                                  needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
1764 1766                                      phys_birth, 1);
1765 1767                          }
1766 1768                  }
1767 1769          }
1768 1770  
1769 1771          if (needs_io && !zfs_no_scrub_io) {
1770 1772                  vdev_t *rvd = spa->spa_root_vdev;
1771 1773                  uint64_t maxinflight = rvd->vdev_children * zfs_top_maxinflight;
1772 1774                  void *data = zio_data_buf_alloc(size);
1773 1775  
1774 1776                  mutex_enter(&spa->spa_scrub_lock);
1775 1777                  while (spa->spa_scrub_inflight >= maxinflight)
1776 1778                          cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
1777 1779                  spa->spa_scrub_inflight++;
1778 1780                  mutex_exit(&spa->spa_scrub_lock);
1779 1781  
1780 1782                  /*
1781 1783                   * If we're seeing recent (zfs_scan_idle) "important" I/Os
1782 1784                   * then throttle our workload to limit the impact of a scan.
1783 1785                   */
1784 1786                  if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle)
1785 1787                          delay(scan_delay);
1786 1788  
1787 1789                  zio_nowait(zio_read(NULL, spa, bp, data, size,
1788 1790                      dsl_scan_scrub_done, NULL, ZIO_PRIORITY_SCRUB,
1789 1791                      zio_flags, zb));
1790 1792          }
1791 1793  
1792 1794          /* do not relocate this block */
1793 1795          return (0);
1794 1796  }
1795 1797  
1796 1798  int
1797 1799  dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
1798 1800  {
1799 1801          spa_t *spa = dp->dp_spa;
1800 1802  
1801 1803          /*
1802 1804           * Purge all vdev caches and probe all devices.  We do this here
1803 1805           * rather than in sync context because this requires a writer lock
1804 1806           * on the spa_config lock, which we can't do from sync context.  The
1805 1807           * spa_scrub_reopen flag indicates that vdev_open() should not
1806 1808           * attempt to start another scrub.
1807 1809           */
1808 1810          spa_vdev_state_enter(spa, SCL_NONE);
1809 1811          spa->spa_scrub_reopen = B_TRUE;
1810 1812          vdev_reopen(spa->spa_root_vdev);
1811 1813          spa->spa_scrub_reopen = B_FALSE;
1812 1814          (void) spa_vdev_state_exit(spa, NULL, 0);
1813 1815  
1814 1816          return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check,
1815 1817              dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_NONE));
1816 1818  }

↓ open down ↓

273 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX