illumos-gate.git Wdiff usr/src/uts/common/fs/zfs/spa_misc.c

Print this page

4185 New hash algorithm support

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/fs/zfs/spa_misc.c
          +++ new/usr/src/uts/common/fs/zfs/spa_misc.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.

↓ open down ↓

14 lines elided

↑ open up ↑

  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2013 by Delphix. All rights reserved.
  24   24   * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
       25 + * Copyright 2013 Saso Kiselkov. All rights reserved.
  25   26   */
  26   27  
  27   28  #include <sys/zfs_context.h>
  28   29  #include <sys/spa_impl.h>
  29   30  #include <sys/spa_boot.h>
  30   31  #include <sys/zio.h>
  31   32  #include <sys/zio_checksum.h>
  32   33  #include <sys/zio_compress.h>
  33   34  #include <sys/dmu.h>
  34   35  #include <sys/dmu_tx.h>

  35   36  #include <sys/zap.h>
  36   37  #include <sys/zil.h>
  37   38  #include <sys/vdev_impl.h>
  38   39  #include <sys/metaslab.h>
  39   40  #include <sys/uberblock_impl.h>
  40   41  #include <sys/txg.h>
  41   42  #include <sys/avl.h>

↓ open down ↓

7 lines elided

↑ open up ↑

  42   43  #include <sys/unique.h>
  43   44  #include <sys/dsl_pool.h>
  44   45  #include <sys/dsl_dir.h>
  45   46  #include <sys/dsl_prop.h>
  46   47  #include <sys/dsl_scan.h>
  47   48  #include <sys/fs/zfs.h>
  48   49  #include <sys/metaslab_impl.h>
  49   50  #include <sys/arc.h>
  50   51  #include <sys/ddt.h>
  51   52  #include "zfs_prop.h"
  52      -#include "zfeature_common.h"
       53 +#include <sys/zfeature.h>
  53   54  
  54   55  /*
  55   56   * SPA locking
  56   57   *
  57   58   * There are four basic locks for managing spa_t structures:
  58   59   *
  59   60   * spa_namespace_lock (global mutex)
  60   61   *
  61   62   *      This lock must be acquired to do any of the following:
  62   63   *

  63   64   *              - Lookup a spa_t by name
  64   65   *              - Add or remove a spa_t from the namespace
  65   66   *              - Increase spa_refcount from zero
  66   67   *              - Check if spa_refcount is zero
  67   68   *              - Rename a spa_t
  68   69   *              - add/remove/attach/detach devices
  69   70   *              - Held for the duration of create/destroy/import/export
  70   71   *
  71   72   *      It does not need to handle recursion.  A create or destroy may
  72   73   *      reference objects (files or zvols) in other pools, but by
  73   74   *      definition they must have an existing reference, and will never need
  74   75   *      to lookup a spa_t by name.
  75   76   *
  76   77   * spa_refcount (per-spa refcount_t protected by mutex)
  77   78   *
  78   79   *      This reference count keeps track of any active users of the spa_t.  The
  79   80   *      spa_t cannot be destroyed or freed while this is non-zero.  Internally,
  80   81   *      the refcount is never really 'zero' - opening a pool implicitly keeps
  81   82   *      some references in the DMU.  Internally we check against spa_minref, but
  82   83   *      present the image of a zero/non-zero value to consumers.
  83   84   *
  84   85   * spa_config_lock[] (per-spa array of rwlocks)
  85   86   *
  86   87   *      This protects the spa_t from config changes, and must be held in
  87   88   *      the following circumstances:
  88   89   *
  89   90   *              - RW_READER to perform I/O to the spa
  90   91   *              - RW_WRITER to change the vdev config
  91   92   *
  92   93   * The locking order is fairly straightforward:
  93   94   *
  94   95   *              spa_namespace_lock      ->      spa_refcount
  95   96   *
  96   97   *      The namespace lock must be acquired to increase the refcount from 0
  97   98   *      or to check if it is zero.
  98   99   *
  99  100   *              spa_refcount            ->      spa_config_lock[]
 100  101   *
 101  102   *      There must be at least one valid reference on the spa_t to acquire
 102  103   *      the config lock.
 103  104   *
 104  105   *              spa_namespace_lock      ->      spa_config_lock[]
 105  106   *
 106  107   *      The namespace lock must always be taken before the config lock.
 107  108   *
 108  109   *
 109  110   * The spa_namespace_lock can be acquired directly and is globally visible.
 110  111   *
 111  112   * The namespace is manipulated using the following functions, all of which
 112  113   * require the spa_namespace_lock to be held.
 113  114   *
 114  115   *      spa_lookup()            Lookup a spa_t by name.
 115  116   *
 116  117   *      spa_add()               Create a new spa_t in the namespace.
 117  118   *
 118  119   *      spa_remove()            Remove a spa_t from the namespace.  This also
 119  120   *                              frees up any memory associated with the spa_t.
 120  121   *
 121  122   *      spa_next()              Returns the next spa_t in the system, or the
 122  123   *                              first if NULL is passed.
 123  124   *
 124  125   *      spa_evict_all()         Shutdown and remove all spa_t structures in
 125  126   *                              the system.
 126  127   *
 127  128   *      spa_guid_exists()       Determine whether a pool/device guid exists.
 128  129   *
 129  130   * The spa_refcount is manipulated using the following functions:
 130  131   *
 131  132   *      spa_open_ref()          Adds a reference to the given spa_t.  Must be
 132  133   *                              called with spa_namespace_lock held if the
 133  134   *                              refcount is currently zero.
 134  135   *
 135  136   *      spa_close()             Remove a reference from the spa_t.  This will
 136  137   *                              not free the spa_t or remove it from the
 137  138   *                              namespace.  No locking is required.
 138  139   *
 139  140   *      spa_refcount_zero()     Returns true if the refcount is currently
 140  141   *                              zero.  Must be called with spa_namespace_lock
 141  142   *                              held.
 142  143   *
 143  144   * The spa_config_lock[] is an array of rwlocks, ordered as follows:
 144  145   * SCL_CONFIG > SCL_STATE > SCL_ALLOC > SCL_ZIO > SCL_FREE > SCL_VDEV.
 145  146   * spa_config_lock[] is manipulated with spa_config_{enter,exit,held}().
 146  147   *
 147  148   * To read the configuration, it suffices to hold one of these locks as reader.
 148  149   * To modify the configuration, you must hold all locks as writer.  To modify
 149  150   * vdev state without altering the vdev tree's topology (e.g. online/offline),
 150  151   * you must hold SCL_STATE and SCL_ZIO as writer.
 151  152   *
 152  153   * We use these distinct config locks to avoid recursive lock entry.
 153  154   * For example, spa_sync() (which holds SCL_CONFIG as reader) induces
 154  155   * block allocations (SCL_ALLOC), which may require reading space maps
 155  156   * from disk (dmu_read() -> zio_read() -> SCL_ZIO).
 156  157   *
 157  158   * The spa config locks cannot be normal rwlocks because we need the
 158  159   * ability to hand off ownership.  For example, SCL_ZIO is acquired
 159  160   * by the issuing thread and later released by an interrupt thread.
 160  161   * They do, however, obey the usual write-wanted semantics to prevent
 161  162   * writer (i.e. system administrator) starvation.
 162  163   *
 163  164   * The lock acquisition rules are as follows:
 164  165   *
 165  166   * SCL_CONFIG
 166  167   *      Protects changes to the vdev tree topology, such as vdev
 167  168   *      add/remove/attach/detach.  Protects the dirty config list
 168  169   *      (spa_config_dirty_list) and the set of spares and l2arc devices.
 169  170   *
 170  171   * SCL_STATE
 171  172   *      Protects changes to pool state and vdev state, such as vdev
 172  173   *      online/offline/fault/degrade/clear.  Protects the dirty state list
 173  174   *      (spa_state_dirty_list) and global pool state (spa_state).
 174  175   *
 175  176   * SCL_ALLOC
 176  177   *      Protects changes to metaslab groups and classes.
 177  178   *      Held as reader by metaslab_alloc() and metaslab_claim().
 178  179   *
 179  180   * SCL_ZIO
 180  181   *      Held by bp-level zios (those which have no io_vd upon entry)
 181  182   *      to prevent changes to the vdev tree.  The bp-level zio implicitly
 182  183   *      protects all of its vdev child zios, which do not hold SCL_ZIO.
 183  184   *
 184  185   * SCL_FREE
 185  186   *      Protects changes to metaslab groups and classes.
 186  187   *      Held as reader by metaslab_free().  SCL_FREE is distinct from
 187  188   *      SCL_ALLOC, and lower than SCL_ZIO, so that we can safely free
 188  189   *      blocks in zio_done() while another i/o that holds either
 189  190   *      SCL_ALLOC or SCL_ZIO is waiting for this i/o to complete.
 190  191   *
 191  192   * SCL_VDEV
 192  193   *      Held as reader to prevent changes to the vdev tree during trivial
 193  194   *      inquiries such as bp_get_dsize().  SCL_VDEV is distinct from the
 194  195   *      other locks, and lower than all of them, to ensure that it's safe
 195  196   *      to acquire regardless of caller context.
 196  197   *
 197  198   * In addition, the following rules apply:
 198  199   *
 199  200   * (a)  spa_props_lock protects pool properties, spa_config and spa_config_list.
 200  201   *      The lock ordering is SCL_CONFIG > spa_props_lock.
 201  202   *
 202  203   * (b)  I/O operations on leaf vdevs.  For any zio operation that takes
 203  204   *      an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(),
 204  205   *      or zio_write_phys() -- the caller must ensure that the config cannot
 205  206   *      change in the interim, and that the vdev cannot be reopened.
 206  207   *      SCL_STATE as reader suffices for both.
 207  208   *
 208  209   * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit().
 209  210   *
 210  211   *      spa_vdev_enter()        Acquire the namespace lock and the config lock
 211  212   *                              for writing.
 212  213   *
 213  214   *      spa_vdev_exit()         Release the config lock, wait for all I/O
 214  215   *                              to complete, sync the updated configs to the
 215  216   *                              cache, and release the namespace lock.
 216  217   *
 217  218   * vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit().
 218  219   * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual
 219  220   * locking is always based on spa_namespace_lock and spa_config_lock[].
 220  221   *
 221  222   * spa_rename() is also implemented within this file since it requires
 222  223   * manipulation of the namespace.
 223  224   */
 224  225  
 225  226  static avl_tree_t spa_namespace_avl;
 226  227  kmutex_t spa_namespace_lock;
 227  228  static kcondvar_t spa_namespace_cv;
 228  229  static int spa_active_count;
 229  230  int spa_max_replication_override = SPA_DVAS_PER_BP;
 230  231  
 231  232  static kmutex_t spa_spare_lock;
 232  233  static avl_tree_t spa_spare_avl;
 233  234  static kmutex_t spa_l2cache_lock;
 234  235  static avl_tree_t spa_l2cache_avl;
 235  236  
 236  237  kmem_cache_t *spa_buffer_pool;
 237  238  int spa_mode_global;
 238  239  
 239  240  #ifdef ZFS_DEBUG
 240  241  /* Everything except dprintf and spa is on by default in debug builds */
 241  242  int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SPA);
 242  243  #else
 243  244  int zfs_flags = 0;
 244  245  #endif
 245  246  
 246  247  /*
 247  248   * zfs_recover can be set to nonzero to attempt to recover from
 248  249   * otherwise-fatal errors, typically caused by on-disk corruption.  When
 249  250   * set, calls to zfs_panic_recover() will turn into warning messages.
 250  251   */
 251  252  int zfs_recover = 0;
 252  253  
 253  254  /*
 254  255   * Expiration time in milliseconds. This value has two meanings. First it is
 255  256   * used to determine when the spa_deadman() logic should fire. By default the
 256  257   * spa_deadman() will fire if spa_sync() has not completed in 1000 seconds.
 257  258   * Secondly, the value determines if an I/O is considered "hung". Any I/O that
 258  259   * has not completed in zfs_deadman_synctime_ms is considered "hung" resulting
 259  260   * in a system panic.
 260  261   */
 261  262  uint64_t zfs_deadman_synctime_ms = 1000000ULL;
 262  263  
 263  264  /*
 264  265   * Check time in milliseconds. This defines the frequency at which we check
 265  266   * for hung I/O.
 266  267   */
 267  268  uint64_t zfs_deadman_checktime_ms = 5000ULL;
 268  269  
 269  270  /*
 270  271   * Override the zfs deadman behavior via /etc/system. By default the
 271  272   * deadman is enabled except on VMware and sparc deployments.
 272  273   */
 273  274  int zfs_deadman_enabled = -1;
 274  275  
 275  276  /*
 276  277   * The worst case is single-sector max-parity RAID-Z blocks, in which
 277  278   * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
 278  279   * times the size; so just assume that.  Add to this the fact that
 279  280   * we can have up to 3 DVAs per bp, and one more factor of 2 because
 280  281   * the block may be dittoed with up to 3 DVAs by ddt_sync().  All together,
 281  282   * the worst case is:
 282  283   *     (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24
 283  284   */
 284  285  int spa_asize_inflation = 24;
 285  286  
 286  287  /*
 287  288   * ==========================================================================
 288  289   * SPA config locking
 289  290   * ==========================================================================
 290  291   */
 291  292  static void
 292  293  spa_config_lock_init(spa_t *spa)
 293  294  {
 294  295          for (int i = 0; i < SCL_LOCKS; i++) {
 295  296                  spa_config_lock_t *scl = &spa->spa_config_lock[i];
 296  297                  mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL);
 297  298                  cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL);
 298  299                  refcount_create_untracked(&scl->scl_count);
 299  300                  scl->scl_writer = NULL;
 300  301                  scl->scl_write_wanted = 0;
 301  302          }
 302  303  }
 303  304  
 304  305  static void
 305  306  spa_config_lock_destroy(spa_t *spa)
 306  307  {
 307  308          for (int i = 0; i < SCL_LOCKS; i++) {
 308  309                  spa_config_lock_t *scl = &spa->spa_config_lock[i];
 309  310                  mutex_destroy(&scl->scl_lock);
 310  311                  cv_destroy(&scl->scl_cv);
 311  312                  refcount_destroy(&scl->scl_count);
 312  313                  ASSERT(scl->scl_writer == NULL);
 313  314                  ASSERT(scl->scl_write_wanted == 0);
 314  315          }
 315  316  }
 316  317  
 317  318  int
 318  319  spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw)
 319  320  {
 320  321          for (int i = 0; i < SCL_LOCKS; i++) {
 321  322                  spa_config_lock_t *scl = &spa->spa_config_lock[i];
 322  323                  if (!(locks & (1 << i)))
 323  324                          continue;
 324  325                  mutex_enter(&scl->scl_lock);
 325  326                  if (rw == RW_READER) {
 326  327                          if (scl->scl_writer || scl->scl_write_wanted) {
 327  328                                  mutex_exit(&scl->scl_lock);
 328  329                                  spa_config_exit(spa, locks ^ (1 << i), tag);
 329  330                                  return (0);
 330  331                          }
 331  332                  } else {
 332  333                          ASSERT(scl->scl_writer != curthread);
 333  334                          if (!refcount_is_zero(&scl->scl_count)) {
 334  335                                  mutex_exit(&scl->scl_lock);
 335  336                                  spa_config_exit(spa, locks ^ (1 << i), tag);
 336  337                                  return (0);
 337  338                          }
 338  339                          scl->scl_writer = curthread;
 339  340                  }
 340  341                  (void) refcount_add(&scl->scl_count, tag);
 341  342                  mutex_exit(&scl->scl_lock);
 342  343          }
 343  344          return (1);
 344  345  }
 345  346  
 346  347  void
 347  348  spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw)
 348  349  {
 349  350          int wlocks_held = 0;
 350  351  
 351  352          ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY);
 352  353  
 353  354          for (int i = 0; i < SCL_LOCKS; i++) {
 354  355                  spa_config_lock_t *scl = &spa->spa_config_lock[i];
 355  356                  if (scl->scl_writer == curthread)
 356  357                          wlocks_held |= (1 << i);
 357  358                  if (!(locks & (1 << i)))
 358  359                          continue;
 359  360                  mutex_enter(&scl->scl_lock);
 360  361                  if (rw == RW_READER) {
 361  362                          while (scl->scl_writer || scl->scl_write_wanted) {
 362  363                                  cv_wait(&scl->scl_cv, &scl->scl_lock);
 363  364                          }
 364  365                  } else {
 365  366                          ASSERT(scl->scl_writer != curthread);
 366  367                          while (!refcount_is_zero(&scl->scl_count)) {
 367  368                                  scl->scl_write_wanted++;
 368  369                                  cv_wait(&scl->scl_cv, &scl->scl_lock);
 369  370                                  scl->scl_write_wanted--;
 370  371                          }
 371  372                          scl->scl_writer = curthread;
 372  373                  }
 373  374                  (void) refcount_add(&scl->scl_count, tag);
 374  375                  mutex_exit(&scl->scl_lock);
 375  376          }
 376  377          ASSERT(wlocks_held <= locks);
 377  378  }
 378  379  
 379  380  void
 380  381  spa_config_exit(spa_t *spa, int locks, void *tag)
 381  382  {
 382  383          for (int i = SCL_LOCKS - 1; i >= 0; i--) {
 383  384                  spa_config_lock_t *scl = &spa->spa_config_lock[i];
 384  385                  if (!(locks & (1 << i)))
 385  386                          continue;
 386  387                  mutex_enter(&scl->scl_lock);
 387  388                  ASSERT(!refcount_is_zero(&scl->scl_count));
 388  389                  if (refcount_remove(&scl->scl_count, tag) == 0) {
 389  390                          ASSERT(scl->scl_writer == NULL ||
 390  391                              scl->scl_writer == curthread);
 391  392                          scl->scl_writer = NULL; /* OK in either case */
 392  393                          cv_broadcast(&scl->scl_cv);
 393  394                  }
 394  395                  mutex_exit(&scl->scl_lock);
 395  396          }
 396  397  }
 397  398  
 398  399  int
 399  400  spa_config_held(spa_t *spa, int locks, krw_t rw)
 400  401  {
 401  402          int locks_held = 0;
 402  403  
 403  404          for (int i = 0; i < SCL_LOCKS; i++) {
 404  405                  spa_config_lock_t *scl = &spa->spa_config_lock[i];
 405  406                  if (!(locks & (1 << i)))
 406  407                          continue;
 407  408                  if ((rw == RW_READER && !refcount_is_zero(&scl->scl_count)) ||
 408  409                      (rw == RW_WRITER && scl->scl_writer == curthread))
 409  410                          locks_held |= 1 << i;
 410  411          }
 411  412  
 412  413          return (locks_held);
 413  414  }
 414  415  
 415  416  /*
 416  417   * ==========================================================================
 417  418   * SPA namespace functions
 418  419   * ==========================================================================
 419  420   */
 420  421  
 421  422  /*
 422  423   * Lookup the named spa_t in the AVL tree.  The spa_namespace_lock must be held.
 423  424   * Returns NULL if no matching spa_t is found.
 424  425   */
 425  426  spa_t *
 426  427  spa_lookup(const char *name)
 427  428  {
 428  429          static spa_t search;    /* spa_t is large; don't allocate on stack */
 429  430          spa_t *spa;
 430  431          avl_index_t where;
 431  432          char *cp;
 432  433  
 433  434          ASSERT(MUTEX_HELD(&spa_namespace_lock));
 434  435  
 435  436          (void) strlcpy(search.spa_name, name, sizeof (search.spa_name));
 436  437  
 437  438          /*
 438  439           * If it's a full dataset name, figure out the pool name and
 439  440           * just use that.
 440  441           */
 441  442          cp = strpbrk(search.spa_name, "/@");
 442  443          if (cp != NULL)
 443  444                  *cp = '\0';
 444  445  
 445  446          spa = avl_find(&spa_namespace_avl, &search, &where);
 446  447  
 447  448          return (spa);
 448  449  }
 449  450  
 450  451  /*
 451  452   * Fires when spa_sync has not completed within zfs_deadman_synctime_ms.
 452  453   * If the zfs_deadman_enabled flag is set then it inspects all vdev queues
 453  454   * looking for potentially hung I/Os.
 454  455   */
 455  456  void
 456  457  spa_deadman(void *arg)
 457  458  {
 458  459          spa_t *spa = arg;
 459  460  
 460  461          /*
 461  462           * Disable the deadman timer if the pool is suspended.
 462  463           */
 463  464          if (spa_suspended(spa)) {
 464  465                  VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
 465  466                  return;
 466  467          }
 467  468  
 468  469          zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu",
 469  470              (gethrtime() - spa->spa_sync_starttime) / NANOSEC,
 470  471              ++spa->spa_deadman_calls);
 471  472          if (zfs_deadman_enabled)
 472  473                  vdev_deadman(spa->spa_root_vdev);
 473  474  }
 474  475  
 475  476  /*
 476  477   * Create an uninitialized spa_t with the given name.  Requires
 477  478   * spa_namespace_lock.  The caller must ensure that the spa_t doesn't already
 478  479   * exist by calling spa_lookup() first.
 479  480   */
 480  481  spa_t *
 481  482  spa_add(const char *name, nvlist_t *config, const char *altroot)
 482  483  {
 483  484          spa_t *spa;
 484  485          spa_config_dirent_t *dp;
 485  486          cyc_handler_t hdlr;
 486  487          cyc_time_t when;
 487  488

↓ open down ↓

425 lines elided

↑ open up ↑

 488  489          ASSERT(MUTEX_HELD(&spa_namespace_lock));
 489  490  
 490  491          spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP);
 491  492  
 492  493          mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
 493  494          mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
 494  495          mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
 495  496          mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
 496  497          mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL);
 497  498          mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
      499 +        mutex_init(&spa->spa_cksum_tmpls_lock, NULL, MUTEX_DEFAULT, NULL);
 498  500          mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
 499  501          mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
 500  502          mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);
 501  503          mutex_init(&spa->spa_iokstat_lock, NULL, MUTEX_DEFAULT, NULL);
 502  504  
 503  505          cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
 504  506          cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL);
 505  507          cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
 506  508          cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);
 507  509

 508  510          for (int t = 0; t < TXG_SIZE; t++)
 509  511                  bplist_create(&spa->spa_free_bplist[t]);
 510  512  
 511  513          (void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name));
 512  514          spa->spa_state = POOL_STATE_UNINITIALIZED;
 513  515          spa->spa_freeze_txg = UINT64_MAX;
 514  516          spa->spa_final_txg = UINT64_MAX;
 515  517          spa->spa_load_max_txg = UINT64_MAX;
 516  518          spa->spa_proc = &p0;
 517  519          spa->spa_proc_state = SPA_PROC_NONE;
 518  520  
 519  521          hdlr.cyh_func = spa_deadman;
 520  522          hdlr.cyh_arg = spa;
 521  523          hdlr.cyh_level = CY_LOW_LEVEL;
 522  524  
 523  525          spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms);
 524  526  
 525  527          /*
 526  528           * This determines how often we need to check for hung I/Os after
 527  529           * the cyclic has already fired. Since checking for hung I/Os is
 528  530           * an expensive operation we don't want to check too frequently.
 529  531           * Instead wait for 5 seconds before checking again.
 530  532           */
 531  533          when.cyt_interval = MSEC2NSEC(zfs_deadman_checktime_ms);
 532  534          when.cyt_when = CY_INFINITY;
 533  535          mutex_enter(&cpu_lock);
 534  536          spa->spa_deadman_cycid = cyclic_add(&hdlr, &when);
 535  537          mutex_exit(&cpu_lock);
 536  538  
 537  539          refcount_create(&spa->spa_refcount);
 538  540          spa_config_lock_init(spa);
 539  541  
 540  542          avl_add(&spa_namespace_avl, spa);
 541  543  
 542  544          /*
 543  545           * Set the alternate root, if there is one.
 544  546           */
 545  547          if (altroot) {
 546  548                  spa->spa_root = spa_strdup(altroot);
 547  549                  spa_active_count++;
 548  550          }
 549  551  
 550  552          /*
 551  553           * Every pool starts with the default cachefile
 552  554           */
 553  555          list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t),
 554  556              offsetof(spa_config_dirent_t, scd_link));
 555  557  
 556  558          dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP);
 557  559          dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path);
 558  560          list_insert_head(&spa->spa_config_list, dp);
 559  561  
 560  562          VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME,
 561  563              KM_SLEEP) == 0);
 562  564  
 563  565          if (config != NULL) {
 564  566                  nvlist_t *features;
 565  567  
 566  568                  if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ,
 567  569                      &features) == 0) {
 568  570                          VERIFY(nvlist_dup(features, &spa->spa_label_features,
 569  571                              0) == 0);
 570  572                  }
 571  573  
 572  574                  VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
 573  575          }
 574  576  
 575  577          if (spa->spa_label_features == NULL) {
 576  578                  VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME,
 577  579                      KM_SLEEP) == 0);
 578  580          }
 579  581  
 580  582          spa->spa_iokstat = kstat_create("zfs", 0, name,
 581  583              "disk", KSTAT_TYPE_IO, 1, 0);
 582  584          if (spa->spa_iokstat) {
 583  585                  spa->spa_iokstat->ks_lock = &spa->spa_iokstat_lock;
 584  586                  kstat_install(spa->spa_iokstat);
 585  587          }
 586  588  
 587  589          spa->spa_debug = ((zfs_flags & ZFS_DEBUG_SPA) != 0);
 588  590  
 589  591          return (spa);
 590  592  }
 591  593  
 592  594  /*
 593  595   * Removes a spa_t from the namespace, freeing up any memory used.  Requires
 594  596   * spa_namespace_lock.  This is called only after the spa_t has been closed and
 595  597   * deactivated.
 596  598   */
 597  599  void
 598  600  spa_remove(spa_t *spa)
 599  601  {
 600  602          spa_config_dirent_t *dp;
 601  603  
 602  604          ASSERT(MUTEX_HELD(&spa_namespace_lock));
 603  605          ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
 604  606  
 605  607          nvlist_free(spa->spa_config_splitting);
 606  608  
 607  609          avl_remove(&spa_namespace_avl, spa);
 608  610          cv_broadcast(&spa_namespace_cv);
 609  611  
 610  612          if (spa->spa_root) {
 611  613                  spa_strfree(spa->spa_root);
 612  614                  spa_active_count--;
 613  615          }
 614  616  
 615  617          while ((dp = list_head(&spa->spa_config_list)) != NULL) {
 616  618                  list_remove(&spa->spa_config_list, dp);
 617  619                  if (dp->scd_path != NULL)
 618  620                          spa_strfree(dp->scd_path);
 619  621                  kmem_free(dp, sizeof (spa_config_dirent_t));
 620  622          }
 621  623  
 622  624          list_destroy(&spa->spa_config_list);
 623  625  
 624  626          nvlist_free(spa->spa_label_features);
 625  627          nvlist_free(spa->spa_load_info);
 626  628          spa_config_set(spa, NULL);
 627  629  
 628  630          mutex_enter(&cpu_lock);
 629  631          if (spa->spa_deadman_cycid != CYCLIC_NONE)
 630  632                  cyclic_remove(spa->spa_deadman_cycid);
 631  633          mutex_exit(&cpu_lock);
 632  634          spa->spa_deadman_cycid = CYCLIC_NONE;
 633  635

↓ open down ↓

126 lines elided

↑ open up ↑

 634  636          refcount_destroy(&spa->spa_refcount);
 635  637  
 636  638          spa_config_lock_destroy(spa);
 637  639  
 638  640          kstat_delete(spa->spa_iokstat);
 639  641          spa->spa_iokstat = NULL;
 640  642  
 641  643          for (int t = 0; t < TXG_SIZE; t++)
 642  644                  bplist_destroy(&spa->spa_free_bplist[t]);
 643  645  
      646 +        zio_checksum_templates_free(spa);
      647 +
 644  648          cv_destroy(&spa->spa_async_cv);
 645  649          cv_destroy(&spa->spa_proc_cv);
 646  650          cv_destroy(&spa->spa_scrub_io_cv);
 647  651          cv_destroy(&spa->spa_suspend_cv);
 648  652  
 649  653          mutex_destroy(&spa->spa_async_lock);
 650  654          mutex_destroy(&spa->spa_errlist_lock);
 651  655          mutex_destroy(&spa->spa_errlog_lock);
 652  656          mutex_destroy(&spa->spa_history_lock);
 653  657          mutex_destroy(&spa->spa_proc_lock);
 654  658          mutex_destroy(&spa->spa_props_lock);
      659 +        mutex_destroy(&spa->spa_cksum_tmpls_lock);
 655  660          mutex_destroy(&spa->spa_scrub_lock);
 656  661          mutex_destroy(&spa->spa_suspend_lock);
 657  662          mutex_destroy(&spa->spa_vdev_top_lock);
 658  663          mutex_destroy(&spa->spa_iokstat_lock);
 659  664  
 660  665          kmem_free(spa, sizeof (spa_t));
 661  666  }
 662  667  
 663  668  /*
 664  669   * Given a pool, return the next pool in the namespace, or NULL if there is

 665  670   * none.  If 'prev' is NULL, return the first pool.
 666  671   */
 667  672  spa_t *
 668  673  spa_next(spa_t *prev)
 669  674  {
 670  675          ASSERT(MUTEX_HELD(&spa_namespace_lock));
 671  676  
 672  677          if (prev)
 673  678                  return (AVL_NEXT(&spa_namespace_avl, prev));
 674  679          else
 675  680                  return (avl_first(&spa_namespace_avl));
 676  681  }
 677  682  
 678  683  /*
 679  684   * ==========================================================================
 680  685   * SPA refcount functions
 681  686   * ==========================================================================
 682  687   */
 683  688  
 684  689  /*
 685  690   * Add a reference to the given spa_t.  Must have at least one reference, or
 686  691   * have the namespace lock held.
 687  692   */
 688  693  void
 689  694  spa_open_ref(spa_t *spa, void *tag)
 690  695  {
 691  696          ASSERT(refcount_count(&spa->spa_refcount) >= spa->spa_minref ||
 692  697              MUTEX_HELD(&spa_namespace_lock));
 693  698          (void) refcount_add(&spa->spa_refcount, tag);
 694  699  }
 695  700  
 696  701  /*
 697  702   * Remove a reference to the given spa_t.  Must have at least one reference, or
 698  703   * have the namespace lock held.
 699  704   */
 700  705  void
 701  706  spa_close(spa_t *spa, void *tag)
 702  707  {
 703  708          ASSERT(refcount_count(&spa->spa_refcount) > spa->spa_minref ||
 704  709              MUTEX_HELD(&spa_namespace_lock));
 705  710          (void) refcount_remove(&spa->spa_refcount, tag);
 706  711  }
 707  712  
 708  713  /*
 709  714   * Check to see if the spa refcount is zero.  Must be called with
 710  715   * spa_namespace_lock held.  We really compare against spa_minref, which is the
 711  716   * number of references acquired when opening a pool
 712  717   */
 713  718  boolean_t
 714  719  spa_refcount_zero(spa_t *spa)
 715  720  {
 716  721          ASSERT(MUTEX_HELD(&spa_namespace_lock));
 717  722  
 718  723          return (refcount_count(&spa->spa_refcount) == spa->spa_minref);
 719  724  }
 720  725  
 721  726  /*
 722  727   * ==========================================================================
 723  728   * SPA spare and l2cache tracking
 724  729   * ==========================================================================
 725  730   */
 726  731  
 727  732  /*
 728  733   * Hot spares and cache devices are tracked using the same code below,
 729  734   * for 'auxiliary' devices.
 730  735   */
 731  736  
 732  737  typedef struct spa_aux {
 733  738          uint64_t        aux_guid;
 734  739          uint64_t        aux_pool;
 735  740          avl_node_t      aux_avl;
 736  741          int             aux_count;
 737  742  } spa_aux_t;
 738  743  
 739  744  static int
 740  745  spa_aux_compare(const void *a, const void *b)
 741  746  {
 742  747          const spa_aux_t *sa = a;
 743  748          const spa_aux_t *sb = b;
 744  749  
 745  750          if (sa->aux_guid < sb->aux_guid)
 746  751                  return (-1);
 747  752          else if (sa->aux_guid > sb->aux_guid)
 748  753                  return (1);
 749  754          else
 750  755                  return (0);
 751  756  }
 752  757  
 753  758  void
 754  759  spa_aux_add(vdev_t *vd, avl_tree_t *avl)
 755  760  {
 756  761          avl_index_t where;
 757  762          spa_aux_t search;
 758  763          spa_aux_t *aux;
 759  764  
 760  765          search.aux_guid = vd->vdev_guid;
 761  766          if ((aux = avl_find(avl, &search, &where)) != NULL) {
 762  767                  aux->aux_count++;
 763  768          } else {
 764  769                  aux = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP);
 765  770                  aux->aux_guid = vd->vdev_guid;
 766  771                  aux->aux_count = 1;
 767  772                  avl_insert(avl, aux, where);
 768  773          }
 769  774  }
 770  775  
 771  776  void
 772  777  spa_aux_remove(vdev_t *vd, avl_tree_t *avl)
 773  778  {
 774  779          spa_aux_t search;
 775  780          spa_aux_t *aux;
 776  781          avl_index_t where;
 777  782  
 778  783          search.aux_guid = vd->vdev_guid;
 779  784          aux = avl_find(avl, &search, &where);
 780  785  
 781  786          ASSERT(aux != NULL);
 782  787  
 783  788          if (--aux->aux_count == 0) {
 784  789                  avl_remove(avl, aux);
 785  790                  kmem_free(aux, sizeof (spa_aux_t));
 786  791          } else if (aux->aux_pool == spa_guid(vd->vdev_spa)) {
 787  792                  aux->aux_pool = 0ULL;
 788  793          }
 789  794  }
 790  795  
 791  796  boolean_t
 792  797  spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl)
 793  798  {
 794  799          spa_aux_t search, *found;
 795  800  
 796  801          search.aux_guid = guid;
 797  802          found = avl_find(avl, &search, NULL);
 798  803  
 799  804          if (pool) {
 800  805                  if (found)
 801  806                          *pool = found->aux_pool;
 802  807                  else
 803  808                          *pool = 0ULL;
 804  809          }
 805  810  
 806  811          if (refcnt) {
 807  812                  if (found)
 808  813                          *refcnt = found->aux_count;
 809  814                  else
 810  815                          *refcnt = 0;
 811  816          }
 812  817  
 813  818          return (found != NULL);
 814  819  }
 815  820  
 816  821  void
 817  822  spa_aux_activate(vdev_t *vd, avl_tree_t *avl)
 818  823  {
 819  824          spa_aux_t search, *found;
 820  825          avl_index_t where;
 821  826  
 822  827          search.aux_guid = vd->vdev_guid;
 823  828          found = avl_find(avl, &search, &where);
 824  829          ASSERT(found != NULL);
 825  830          ASSERT(found->aux_pool == 0ULL);
 826  831  
 827  832          found->aux_pool = spa_guid(vd->vdev_spa);
 828  833  }
 829  834  
 830  835  /*
 831  836   * Spares are tracked globally due to the following constraints:
 832  837   *
 833  838   *      - A spare may be part of multiple pools.
 834  839   *      - A spare may be added to a pool even if it's actively in use within
 835  840   *        another pool.
 836  841   *      - A spare in use in any pool can only be the source of a replacement if
 837  842   *        the target is a spare in the same pool.
 838  843   *
 839  844   * We keep track of all spares on the system through the use of a reference
 840  845   * counted AVL tree.  When a vdev is added as a spare, or used as a replacement
 841  846   * spare, then we bump the reference count in the AVL tree.  In addition, we set
 842  847   * the 'vdev_isspare' member to indicate that the device is a spare (active or
 843  848   * inactive).  When a spare is made active (used to replace a device in the
 844  849   * pool), we also keep track of which pool it's been made a part of.
 845  850   *
 846  851   * The 'spa_spare_lock' protects the AVL tree.  These functions are normally
 847  852   * called under the spa_namespace lock as part of vdev reconfiguration.  The
 848  853   * separate spare lock exists for the status query path, which does not need to
 849  854   * be completely consistent with respect to other vdev configuration changes.
 850  855   */
 851  856  
 852  857  static int
 853  858  spa_spare_compare(const void *a, const void *b)
 854  859  {
 855  860          return (spa_aux_compare(a, b));
 856  861  }
 857  862  
 858  863  void
 859  864  spa_spare_add(vdev_t *vd)
 860  865  {
 861  866          mutex_enter(&spa_spare_lock);
 862  867          ASSERT(!vd->vdev_isspare);
 863  868          spa_aux_add(vd, &spa_spare_avl);
 864  869          vd->vdev_isspare = B_TRUE;
 865  870          mutex_exit(&spa_spare_lock);
 866  871  }
 867  872  
 868  873  void
 869  874  spa_spare_remove(vdev_t *vd)
 870  875  {
 871  876          mutex_enter(&spa_spare_lock);
 872  877          ASSERT(vd->vdev_isspare);
 873  878          spa_aux_remove(vd, &spa_spare_avl);
 874  879          vd->vdev_isspare = B_FALSE;
 875  880          mutex_exit(&spa_spare_lock);
 876  881  }
 877  882  
 878  883  boolean_t
 879  884  spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt)
 880  885  {
 881  886          boolean_t found;
 882  887  
 883  888          mutex_enter(&spa_spare_lock);
 884  889          found = spa_aux_exists(guid, pool, refcnt, &spa_spare_avl);
 885  890          mutex_exit(&spa_spare_lock);
 886  891  
 887  892          return (found);
 888  893  }
 889  894  
 890  895  void
 891  896  spa_spare_activate(vdev_t *vd)
 892  897  {
 893  898          mutex_enter(&spa_spare_lock);
 894  899          ASSERT(vd->vdev_isspare);
 895  900          spa_aux_activate(vd, &spa_spare_avl);
 896  901          mutex_exit(&spa_spare_lock);
 897  902  }
 898  903  
 899  904  /*
 900  905   * Level 2 ARC devices are tracked globally for the same reasons as spares.
 901  906   * Cache devices currently only support one pool per cache device, and so
 902  907   * for these devices the aux reference count is currently unused beyond 1.
 903  908   */
 904  909  
 905  910  static int
 906  911  spa_l2cache_compare(const void *a, const void *b)
 907  912  {
 908  913          return (spa_aux_compare(a, b));
 909  914  }
 910  915  
 911  916  void
 912  917  spa_l2cache_add(vdev_t *vd)
 913  918  {
 914  919          mutex_enter(&spa_l2cache_lock);
 915  920          ASSERT(!vd->vdev_isl2cache);
 916  921          spa_aux_add(vd, &spa_l2cache_avl);
 917  922          vd->vdev_isl2cache = B_TRUE;
 918  923          mutex_exit(&spa_l2cache_lock);
 919  924  }
 920  925  
 921  926  void
 922  927  spa_l2cache_remove(vdev_t *vd)
 923  928  {
 924  929          mutex_enter(&spa_l2cache_lock);
 925  930          ASSERT(vd->vdev_isl2cache);
 926  931          spa_aux_remove(vd, &spa_l2cache_avl);
 927  932          vd->vdev_isl2cache = B_FALSE;
 928  933          mutex_exit(&spa_l2cache_lock);
 929  934  }
 930  935  
 931  936  boolean_t
 932  937  spa_l2cache_exists(uint64_t guid, uint64_t *pool)
 933  938  {
 934  939          boolean_t found;
 935  940  
 936  941          mutex_enter(&spa_l2cache_lock);
 937  942          found = spa_aux_exists(guid, pool, NULL, &spa_l2cache_avl);
 938  943          mutex_exit(&spa_l2cache_lock);
 939  944  
 940  945          return (found);
 941  946  }
 942  947  
 943  948  void
 944  949  spa_l2cache_activate(vdev_t *vd)
 945  950  {
 946  951          mutex_enter(&spa_l2cache_lock);
 947  952          ASSERT(vd->vdev_isl2cache);
 948  953          spa_aux_activate(vd, &spa_l2cache_avl);
 949  954          mutex_exit(&spa_l2cache_lock);
 950  955  }
 951  956  
 952  957  /*
 953  958   * ==========================================================================
 954  959   * SPA vdev locking
 955  960   * ==========================================================================
 956  961   */
 957  962  
 958  963  /*
 959  964   * Lock the given spa_t for the purpose of adding or removing a vdev.
 960  965   * Grabs the global spa_namespace_lock plus the spa config lock for writing.
 961  966   * It returns the next transaction group for the spa_t.
 962  967   */
 963  968  uint64_t
 964  969  spa_vdev_enter(spa_t *spa)
 965  970  {
 966  971          mutex_enter(&spa->spa_vdev_top_lock);
 967  972          mutex_enter(&spa_namespace_lock);
 968  973          return (spa_vdev_config_enter(spa));
 969  974  }
 970  975  
 971  976  /*
 972  977   * Internal implementation for spa_vdev_enter().  Used when a vdev
 973  978   * operation requires multiple syncs (i.e. removing a device) while
 974  979   * keeping the spa_namespace_lock held.
 975  980   */
 976  981  uint64_t
 977  982  spa_vdev_config_enter(spa_t *spa)
 978  983  {
 979  984          ASSERT(MUTEX_HELD(&spa_namespace_lock));
 980  985  
 981  986          spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
 982  987  
 983  988          return (spa_last_synced_txg(spa) + 1);
 984  989  }
 985  990  
 986  991  /*
 987  992   * Used in combination with spa_vdev_config_enter() to allow the syncing
 988  993   * of multiple transactions without releasing the spa_namespace_lock.
 989  994   */
 990  995  void
 991  996  spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
 992  997  {
 993  998          ASSERT(MUTEX_HELD(&spa_namespace_lock));
 994  999  
 995 1000          int config_changed = B_FALSE;
 996 1001  
 997 1002          ASSERT(txg > spa_last_synced_txg(spa));
 998 1003  
 999 1004          spa->spa_pending_vdev = NULL;
1000 1005  
1001 1006          /*
1002 1007           * Reassess the DTLs.
1003 1008           */
1004 1009          vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);
1005 1010  
1006 1011          if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) {
1007 1012                  config_changed = B_TRUE;
1008 1013                  spa->spa_config_generation++;
1009 1014          }
1010 1015  
1011 1016          /*
1012 1017           * Verify the metaslab classes.
1013 1018           */
1014 1019          ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0);
1015 1020          ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0);
1016 1021  
1017 1022          spa_config_exit(spa, SCL_ALL, spa);
1018 1023  
1019 1024          /*
1020 1025           * Panic the system if the specified tag requires it.  This
1021 1026           * is useful for ensuring that configurations are updated
1022 1027           * transactionally.
1023 1028           */
1024 1029          if (zio_injection_enabled)
1025 1030                  zio_handle_panic_injection(spa, tag, 0);
1026 1031  
1027 1032          /*
1028 1033           * Note: this txg_wait_synced() is important because it ensures
1029 1034           * that there won't be more than one config change per txg.
1030 1035           * This allows us to use the txg as the generation number.
1031 1036           */
1032 1037          if (error == 0)
1033 1038                  txg_wait_synced(spa->spa_dsl_pool, txg);
1034 1039  
1035 1040          if (vd != NULL) {
1036 1041                  ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL);
1037 1042                  spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
1038 1043                  vdev_free(vd);
1039 1044                  spa_config_exit(spa, SCL_ALL, spa);
1040 1045          }
1041 1046  
1042 1047          /*
1043 1048           * If the config changed, update the config cache.
1044 1049           */
1045 1050          if (config_changed)
1046 1051                  spa_config_sync(spa, B_FALSE, B_TRUE);
1047 1052  }
1048 1053  
1049 1054  /*
1050 1055   * Unlock the spa_t after adding or removing a vdev.  Besides undoing the
1051 1056   * locking of spa_vdev_enter(), we also want make sure the transactions have
1052 1057   * synced to disk, and then update the global configuration cache with the new
1053 1058   * information.
1054 1059   */
1055 1060  int
1056 1061  spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
1057 1062  {
1058 1063          spa_vdev_config_exit(spa, vd, txg, error, FTAG);
1059 1064          mutex_exit(&spa_namespace_lock);
1060 1065          mutex_exit(&spa->spa_vdev_top_lock);
1061 1066  
1062 1067          return (error);
1063 1068  }
1064 1069  
1065 1070  /*
1066 1071   * Lock the given spa_t for the purpose of changing vdev state.
1067 1072   */
1068 1073  void
1069 1074  spa_vdev_state_enter(spa_t *spa, int oplocks)
1070 1075  {
1071 1076          int locks = SCL_STATE_ALL | oplocks;
1072 1077  
1073 1078          /*
1074 1079           * Root pools may need to read of the underlying devfs filesystem
1075 1080           * when opening up a vdev.  Unfortunately if we're holding the
1076 1081           * SCL_ZIO lock it will result in a deadlock when we try to issue
1077 1082           * the read from the root filesystem.  Instead we "prefetch"
1078 1083           * the associated vnodes that we need prior to opening the
1079 1084           * underlying devices and cache them so that we can prevent
1080 1085           * any I/O when we are doing the actual open.
1081 1086           */
1082 1087          if (spa_is_root(spa)) {
1083 1088                  int low = locks & ~(SCL_ZIO - 1);
1084 1089                  int high = locks & ~low;
1085 1090  
1086 1091                  spa_config_enter(spa, high, spa, RW_WRITER);
1087 1092                  vdev_hold(spa->spa_root_vdev);
1088 1093                  spa_config_enter(spa, low, spa, RW_WRITER);
1089 1094          } else {
1090 1095                  spa_config_enter(spa, locks, spa, RW_WRITER);
1091 1096          }
1092 1097          spa->spa_vdev_locks = locks;
1093 1098  }
1094 1099  
1095 1100  int
1096 1101  spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
1097 1102  {
1098 1103          boolean_t config_changed = B_FALSE;
1099 1104  
1100 1105          if (vd != NULL || error == 0)
1101 1106                  vdev_dtl_reassess(vd ? vd->vdev_top : spa->spa_root_vdev,
1102 1107                      0, 0, B_FALSE);
1103 1108  
1104 1109          if (vd != NULL) {
1105 1110                  vdev_state_dirty(vd->vdev_top);
1106 1111                  config_changed = B_TRUE;
1107 1112                  spa->spa_config_generation++;
1108 1113          }
1109 1114  
1110 1115          if (spa_is_root(spa))
1111 1116                  vdev_rele(spa->spa_root_vdev);
1112 1117  
1113 1118          ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL);
1114 1119          spa_config_exit(spa, spa->spa_vdev_locks, spa);
1115 1120  
1116 1121          /*
1117 1122           * If anything changed, wait for it to sync.  This ensures that,
1118 1123           * from the system administrator's perspective, zpool(1M) commands
1119 1124           * are synchronous.  This is important for things like zpool offline:
1120 1125           * when the command completes, you expect no further I/O from ZFS.
1121 1126           */
1122 1127          if (vd != NULL)
1123 1128                  txg_wait_synced(spa->spa_dsl_pool, 0);
1124 1129  
1125 1130          /*
1126 1131           * If the config changed, update the config cache.
1127 1132           */
1128 1133          if (config_changed) {
1129 1134                  mutex_enter(&spa_namespace_lock);
1130 1135                  spa_config_sync(spa, B_FALSE, B_TRUE);
1131 1136                  mutex_exit(&spa_namespace_lock);
1132 1137          }
1133 1138  
1134 1139          return (error);
1135 1140  }
1136 1141  
1137 1142  /*
1138 1143   * ==========================================================================
1139 1144   * Miscellaneous functions
1140 1145   * ==========================================================================
1141 1146   */
1142 1147  
1143 1148  void
1144 1149  spa_activate_mos_feature(spa_t *spa, const char *feature)
1145 1150  {
1146 1151          (void) nvlist_add_boolean(spa->spa_label_features, feature);
1147 1152          vdev_config_dirty(spa->spa_root_vdev);
1148 1153  }
1149 1154  
1150 1155  void
1151 1156  spa_deactivate_mos_feature(spa_t *spa, const char *feature)
1152 1157  {
1153 1158          (void) nvlist_remove_all(spa->spa_label_features, feature);
1154 1159          vdev_config_dirty(spa->spa_root_vdev);
1155 1160  }
1156 1161  
1157 1162  /*
1158 1163   * Rename a spa_t.
1159 1164   */
1160 1165  int
1161 1166  spa_rename(const char *name, const char *newname)
1162 1167  {
1163 1168          spa_t *spa;
1164 1169          int err;
1165 1170  
1166 1171          /*
1167 1172           * Lookup the spa_t and grab the config lock for writing.  We need to
1168 1173           * actually open the pool so that we can sync out the necessary labels.
1169 1174           * It's OK to call spa_open() with the namespace lock held because we
1170 1175           * allow recursive calls for other reasons.
1171 1176           */
1172 1177          mutex_enter(&spa_namespace_lock);
1173 1178          if ((err = spa_open(name, &spa, FTAG)) != 0) {
1174 1179                  mutex_exit(&spa_namespace_lock);
1175 1180                  return (err);
1176 1181          }
1177 1182  
1178 1183          spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1179 1184  
1180 1185          avl_remove(&spa_namespace_avl, spa);
1181 1186          (void) strlcpy(spa->spa_name, newname, sizeof (spa->spa_name));
1182 1187          avl_add(&spa_namespace_avl, spa);
1183 1188  
1184 1189          /*
1185 1190           * Sync all labels to disk with the new names by marking the root vdev
1186 1191           * dirty and waiting for it to sync.  It will pick up the new pool name
1187 1192           * during the sync.
1188 1193           */
1189 1194          vdev_config_dirty(spa->spa_root_vdev);
1190 1195  
1191 1196          spa_config_exit(spa, SCL_ALL, FTAG);
1192 1197  
1193 1198          txg_wait_synced(spa->spa_dsl_pool, 0);
1194 1199  
1195 1200          /*
1196 1201           * Sync the updated config cache.
1197 1202           */
1198 1203          spa_config_sync(spa, B_FALSE, B_TRUE);
1199 1204  
1200 1205          spa_close(spa, FTAG);
1201 1206  
1202 1207          mutex_exit(&spa_namespace_lock);
1203 1208  
1204 1209          return (0);
1205 1210  }
1206 1211  
1207 1212  /*
1208 1213   * Return the spa_t associated with given pool_guid, if it exists.  If
1209 1214   * device_guid is non-zero, determine whether the pool exists *and* contains
1210 1215   * a device with the specified device_guid.
1211 1216   */
1212 1217  spa_t *
1213 1218  spa_by_guid(uint64_t pool_guid, uint64_t device_guid)
1214 1219  {
1215 1220          spa_t *spa;
1216 1221          avl_tree_t *t = &spa_namespace_avl;
1217 1222  
1218 1223          ASSERT(MUTEX_HELD(&spa_namespace_lock));
1219 1224  
1220 1225          for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) {
1221 1226                  if (spa->spa_state == POOL_STATE_UNINITIALIZED)
1222 1227                          continue;
1223 1228                  if (spa->spa_root_vdev == NULL)
1224 1229                          continue;
1225 1230                  if (spa_guid(spa) == pool_guid) {
1226 1231                          if (device_guid == 0)
1227 1232                                  break;
1228 1233  
1229 1234                          if (vdev_lookup_by_guid(spa->spa_root_vdev,
1230 1235                              device_guid) != NULL)
1231 1236                                  break;
1232 1237  
1233 1238                          /*
1234 1239                           * Check any devices we may be in the process of adding.
1235 1240                           */
1236 1241                          if (spa->spa_pending_vdev) {
1237 1242                                  if (vdev_lookup_by_guid(spa->spa_pending_vdev,
1238 1243                                      device_guid) != NULL)
1239 1244                                          break;
1240 1245                          }
1241 1246                  }
1242 1247          }
1243 1248  
1244 1249          return (spa);
1245 1250  }
1246 1251  
1247 1252  /*
1248 1253   * Determine whether a pool with the given pool_guid exists.
1249 1254   */
1250 1255  boolean_t
1251 1256  spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
1252 1257  {
1253 1258          return (spa_by_guid(pool_guid, device_guid) != NULL);
1254 1259  }
1255 1260  
1256 1261  char *
1257 1262  spa_strdup(const char *s)
1258 1263  {
1259 1264          size_t len;
1260 1265          char *new;
1261 1266  
1262 1267          len = strlen(s);
1263 1268          new = kmem_alloc(len + 1, KM_SLEEP);
1264 1269          bcopy(s, new, len);
1265 1270          new[len] = '\0';
1266 1271  
1267 1272          return (new);
1268 1273  }
1269 1274  
/*
 * Free a string with kmem_free().  The size passed to kmem_free() is
 * recomputed from the string's current length, so the string must still
 * have the length it was allocated with.
 */
void
spa_strfree(char *s)
{
	size_t size = strlen(s) + 1;

	kmem_free(s, size);
}
1275 1280  
/*
 * Return a pseudo-random value in [0, range).  'range' must be non-zero.
 * The value is 64 pseudo-random bits reduced modulo 'range'; the return
 * status of random_get_pseudo_bytes() is ignored.
 */
uint64_t
spa_get_random(uint64_t range)
{
	uint64_t bits;

	ASSERT(range != 0);

	(void) random_get_pseudo_bytes((void *)&bits, sizeof (uint64_t));

	return (bits % range);
}
1287 1292  
1288 1293  uint64_t
1289 1294  spa_generate_guid(spa_t *spa)
1290 1295  {
1291 1296          uint64_t guid = spa_get_random(-1ULL);
1292 1297  
1293 1298          if (spa != NULL) {
1294 1299                  while (guid == 0 || spa_guid_exists(spa_guid(spa), guid))
1295 1300                          guid = spa_get_random(-1ULL);
1296 1301          } else {
1297 1302                  while (guid == 0 || spa_guid_exists(guid, 0))
1298 1303                          guid = spa_get_random(-1ULL);
1299 1304          }
1300 1305  
1301 1306          return (guid);
1302 1307  }
1303 1308  
1304 1309  void
1305 1310  sprintf_blkptr(char *buf, const blkptr_t *bp)
1306 1311  {
1307 1312          char type[256];
1308 1313          char *checksum = NULL;
1309 1314          char *compress = NULL;
1310 1315  
1311 1316          if (bp != NULL) {
1312 1317                  if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) {
1313 1318                          dmu_object_byteswap_t bswap =
1314 1319                              DMU_OT_BYTESWAP(BP_GET_TYPE(bp));
1315 1320                          (void) snprintf(type, sizeof (type), "bswap %s %s",
1316 1321                              DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ?
1317 1322                              "metadata" : "data",
1318 1323                              dmu_ot_byteswap[bswap].ob_name);
1319 1324                  } else {
1320 1325                          (void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name,
1321 1326                              sizeof (type));
1322 1327                  }
1323 1328                  checksum = zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name;
1324 1329                  compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name;
1325 1330          }
1326 1331  
1327 1332          SPRINTF_BLKPTR(snprintf, ' ', buf, bp, type, checksum, compress);
1328 1333  }
1329 1334  
1330 1335  void
1331 1336  spa_freeze(spa_t *spa)
1332 1337  {
1333 1338          uint64_t freeze_txg = 0;
1334 1339  
1335 1340          spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1336 1341          if (spa->spa_freeze_txg == UINT64_MAX) {
1337 1342                  freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE;
1338 1343                  spa->spa_freeze_txg = freeze_txg;
1339 1344          }
1340 1345          spa_config_exit(spa, SCL_ALL, FTAG);
1341 1346          if (freeze_txg != 0)
1342 1347                  txg_wait_synced(spa_get_dsl(spa), freeze_txg);
1343 1348  }
1344 1349  
1345 1350  void
1346 1351  zfs_panic_recover(const char *fmt, ...)
1347 1352  {
1348 1353          va_list adx;
1349 1354  
1350 1355          va_start(adx, fmt);
1351 1356          vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx);
1352 1357          va_end(adx);
1353 1358  }
1354 1359  
/*
 * A stripped-down strtoull(): converts a run of lowercase hexadecimal
 * digits ('0'-'9', 'a'-'f') at the start of 'str' into a number.  There is
 * no "0x" prefix handling, no uppercase support and no overflow detection.
 *
 * Conversion stops at the first character that is not such a digit.  If
 * 'nptr' is non-NULL it is set to point at that character (or at the
 * terminating NUL).  Returns the converted value, 0 if there were no digits.
 */
uint64_t
strtonum(const char *str, char **nptr)
{
	uint64_t val = 0;
	const char *p;

	for (p = str; *p != '\0'; p++) {
		char c = *p;
		int digit;

		if (c >= '0' && c <= '9')
			digit = c - '0';
		else if (c >= 'a' && c <= 'f')
			digit = c - 'a' + 10;
		else
			break;

		val = val * 16 + digit;
	}

	if (nptr != NULL)
		*nptr = (char *)p;

	return (val);
}
1385 1390  
1386 1391  /*
1387 1392   * ==========================================================================
1388 1393   * Accessor functions
1389 1394   * ==========================================================================
1390 1395   */
1391 1396  
1392 1397  boolean_t
1393 1398  spa_shutting_down(spa_t *spa)
1394 1399  {
1395 1400          return (spa->spa_async_suspended);
1396 1401  }
1397 1402  
1398 1403  dsl_pool_t *
1399 1404  spa_get_dsl(spa_t *spa)
1400 1405  {
1401 1406          return (spa->spa_dsl_pool);
1402 1407  }
1403 1408  
1404 1409  boolean_t
1405 1410  spa_is_initializing(spa_t *spa)
1406 1411  {
1407 1412          return (spa->spa_is_initializing);
1408 1413  }
1409 1414  
1410 1415  blkptr_t *
1411 1416  spa_get_rootblkptr(spa_t *spa)
1412 1417  {
1413 1418          return (&spa->spa_ubsync.ub_rootbp);
1414 1419  }
1415 1420  
1416 1421  void
1417 1422  spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp)
1418 1423  {
1419 1424          spa->spa_uberblock.ub_rootbp = *bp;
1420 1425  }
1421 1426  
1422 1427  void
1423 1428  spa_altroot(spa_t *spa, char *buf, size_t buflen)
1424 1429  {
1425 1430          if (spa->spa_root == NULL)
1426 1431                  buf[0] = '\0';
1427 1432          else
1428 1433                  (void) strncpy(buf, spa->spa_root, buflen);
1429 1434  }
1430 1435  
1431 1436  int
1432 1437  spa_sync_pass(spa_t *spa)
1433 1438  {
1434 1439          return (spa->spa_sync_pass);
1435 1440  }
1436 1441  
1437 1442  char *
1438 1443  spa_name(spa_t *spa)
1439 1444  {
1440 1445          return (spa->spa_name);
1441 1446  }
1442 1447  
1443 1448  uint64_t
1444 1449  spa_guid(spa_t *spa)
1445 1450  {
1446 1451          dsl_pool_t *dp = spa_get_dsl(spa);
1447 1452          uint64_t guid;
1448 1453  
1449 1454          /*
1450 1455           * If we fail to parse the config during spa_load(), we can go through
1451 1456           * the error path (which posts an ereport) and end up here with no root
1452 1457           * vdev.  We stash the original pool guid in 'spa_config_guid' to handle
1453 1458           * this case.
1454 1459           */
1455 1460          if (spa->spa_root_vdev == NULL)
1456 1461                  return (spa->spa_config_guid);
1457 1462  
1458 1463          guid = spa->spa_last_synced_guid != 0 ?
1459 1464              spa->spa_last_synced_guid : spa->spa_root_vdev->vdev_guid;
1460 1465  
1461 1466          /*
1462 1467           * Return the most recently synced out guid unless we're
1463 1468           * in syncing context.
1464 1469           */
1465 1470          if (dp && dsl_pool_sync_context(dp))
1466 1471                  return (spa->spa_root_vdev->vdev_guid);
1467 1472          else
1468 1473                  return (guid);
1469 1474  }
1470 1475  
1471 1476  uint64_t
1472 1477  spa_load_guid(spa_t *spa)
1473 1478  {
1474 1479          /*
1475 1480           * This is a GUID that exists solely as a reference for the
1476 1481           * purposes of the arc.  It is generated at load time, and
1477 1482           * is never written to persistent storage.
1478 1483           */
1479 1484          return (spa->spa_load_guid);
1480 1485  }
1481 1486  
1482 1487  uint64_t
1483 1488  spa_last_synced_txg(spa_t *spa)
1484 1489  {
1485 1490          return (spa->spa_ubsync.ub_txg);
1486 1491  }
1487 1492  
1488 1493  uint64_t
1489 1494  spa_first_txg(spa_t *spa)
1490 1495  {
1491 1496          return (spa->spa_first_txg);
1492 1497  }
1493 1498  
1494 1499  uint64_t
1495 1500  spa_syncing_txg(spa_t *spa)
1496 1501  {
1497 1502          return (spa->spa_syncing_txg);
1498 1503  }
1499 1504  
1500 1505  pool_state_t
1501 1506  spa_state(spa_t *spa)
1502 1507  {
1503 1508          return (spa->spa_state);
1504 1509  }
1505 1510  
1506 1511  spa_load_state_t
1507 1512  spa_load_state(spa_t *spa)
1508 1513  {
1509 1514          return (spa->spa_load_state);
1510 1515  }
1511 1516  
1512 1517  uint64_t
1513 1518  spa_freeze_txg(spa_t *spa)
1514 1519  {
1515 1520          return (spa->spa_freeze_txg);
1516 1521  }
1517 1522  
1518 1523  /* ARGSUSED */
1519 1524  uint64_t
1520 1525  spa_get_asize(spa_t *spa, uint64_t lsize)
1521 1526  {
1522 1527          return (lsize * spa_asize_inflation);
1523 1528  }
1524 1529  
1525 1530  uint64_t
1526 1531  spa_get_dspace(spa_t *spa)
1527 1532  {
1528 1533          return (spa->spa_dspace);
1529 1534  }
1530 1535  
1531 1536  void
1532 1537  spa_update_dspace(spa_t *spa)
1533 1538  {
1534 1539          spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) +
1535 1540              ddt_get_dedup_dspace(spa);
1536 1541  }
1537 1542  
1538 1543  /*
1539 1544   * Return the failure mode that has been set to this pool. The default
1540 1545   * behavior will be to block all I/Os when a complete failure occurs.
1541 1546   */
1542 1547  uint8_t
1543 1548  spa_get_failmode(spa_t *spa)
1544 1549  {
1545 1550          return (spa->spa_failmode);
1546 1551  }
1547 1552  
1548 1553  boolean_t
1549 1554  spa_suspended(spa_t *spa)
1550 1555  {
1551 1556          return (spa->spa_suspended);
1552 1557  }
1553 1558  
1554 1559  uint64_t
1555 1560  spa_version(spa_t *spa)
1556 1561  {
1557 1562          return (spa->spa_ubsync.ub_version);
1558 1563  }
1559 1564  
1560 1565  boolean_t
1561 1566  spa_deflate(spa_t *spa)
1562 1567  {
1563 1568          return (spa->spa_deflate);
1564 1569  }
1565 1570  
1566 1571  metaslab_class_t *
1567 1572  spa_normal_class(spa_t *spa)
1568 1573  {
1569 1574          return (spa->spa_normal_class);
1570 1575  }
1571 1576  
1572 1577  metaslab_class_t *
1573 1578  spa_log_class(spa_t *spa)
1574 1579  {
1575 1580          return (spa->spa_log_class);
1576 1581  }
1577 1582  
1578 1583  int
1579 1584  spa_max_replication(spa_t *spa)
1580 1585  {
1581 1586          /*
1582 1587           * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to
1583 1588           * handle BPs with more than one DVA allocated.  Set our max
1584 1589           * replication level accordingly.
1585 1590           */
1586 1591          if (spa_version(spa) < SPA_VERSION_DITTO_BLOCKS)
1587 1592                  return (1);
1588 1593          return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override));
1589 1594  }
1590 1595  
1591 1596  int
1592 1597  spa_prev_software_version(spa_t *spa)
1593 1598  {
1594 1599          return (spa->spa_prev_software_version);
1595 1600  }
1596 1601  
1597 1602  uint64_t
1598 1603  spa_deadman_synctime(spa_t *spa)
1599 1604  {
1600 1605          return (spa->spa_deadman_synctime);
1601 1606  }
1602 1607  
1603 1608  uint64_t
1604 1609  dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
1605 1610  {
1606 1611          uint64_t asize = DVA_GET_ASIZE(dva);
1607 1612          uint64_t dsize = asize;
1608 1613  
1609 1614          ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
1610 1615  
1611 1616          if (asize != 0 && spa->spa_deflate) {
1612 1617                  vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
1613 1618                  dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
1614 1619          }
1615 1620  
1616 1621          return (dsize);
1617 1622  }
1618 1623  
1619 1624  uint64_t
1620 1625  bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp)
1621 1626  {
1622 1627          uint64_t dsize = 0;
1623 1628  
1624 1629          for (int d = 0; d < SPA_DVAS_PER_BP; d++)
1625 1630                  dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
1626 1631  
1627 1632          return (dsize);
1628 1633  }
1629 1634  
1630 1635  uint64_t
1631 1636  bp_get_dsize(spa_t *spa, const blkptr_t *bp)
1632 1637  {
1633 1638          uint64_t dsize = 0;
1634 1639  
1635 1640          spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
1636 1641  
1637 1642          for (int d = 0; d < SPA_DVAS_PER_BP; d++)
1638 1643                  dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
1639 1644  
1640 1645          spa_config_exit(spa, SCL_VDEV, FTAG);
1641 1646  
1642 1647          return (dsize);
1643 1648  }
1644 1649  
1645 1650  /*
1646 1651   * ==========================================================================
1647 1652   * Initialization and Termination
1648 1653   * ==========================================================================
1649 1654   */
1650 1655  
1651 1656  static int
1652 1657  spa_name_compare(const void *a1, const void *a2)
1653 1658  {
1654 1659          const spa_t *s1 = a1;
1655 1660          const spa_t *s2 = a2;
1656 1661          int s;
1657 1662  
1658 1663          s = strcmp(s1->spa_name, s2->spa_name);
1659 1664          if (s > 0)
1660 1665                  return (1);
1661 1666          if (s < 0)
1662 1667                  return (-1);
1663 1668          return (0);
1664 1669  }
1665 1670  
1666 1671  int
1667 1672  spa_busy(void)
1668 1673  {
1669 1674          return (spa_active_count);
1670 1675  }
1671 1676  
/*
 * Boot-time initialization: load the pool configuration via
 * spa_config_load().  Takes no arguments.
 */
void
spa_boot_init(void)
{
        spa_config_load();
}
1677 1682  
1678 1683  void
1679 1684  spa_init(int mode)
1680 1685  {
1681 1686          mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL);
1682 1687          mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL);
1683 1688          mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL);
1684 1689          cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL);
1685 1690  
1686 1691          avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t),
1687 1692              offsetof(spa_t, spa_avl));
1688 1693  
1689 1694          avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t),
1690 1695              offsetof(spa_aux_t, aux_avl));
1691 1696  
1692 1697          avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t),
1693 1698              offsetof(spa_aux_t, aux_avl));
1694 1699  
1695 1700          spa_mode_global = mode;
1696 1701  
1697 1702  #ifdef _KERNEL
1698 1703          spa_arch_init();
1699 1704  #else
1700 1705          if (spa_mode_global != FREAD && dprintf_find_string("watch")) {
1701 1706                  arc_procfd = open("/proc/self/ctl", O_WRONLY);
1702 1707                  if (arc_procfd == -1) {
1703 1708                          perror("could not enable watchpoints: "
1704 1709                              "opening /proc/self/ctl failed: ");
1705 1710                  } else {
1706 1711                          arc_watch = B_TRUE;
1707 1712                  }
1708 1713          }
1709 1714  #endif
1710 1715  
1711 1716          refcount_init();
1712 1717          unique_init();
1713 1718          range_tree_init();
1714 1719          zio_init();
1715 1720          dmu_init();
1716 1721          zil_init();
1717 1722          vdev_cache_stat_init();
1718 1723          zfs_prop_init();
1719 1724          zpool_prop_init();
1720 1725          zpool_feature_init();
1721 1726          spa_config_load();
1722 1727          l2arc_start();
1723 1728  }
1724 1729  
1725 1730  void
1726 1731  spa_fini(void)
1727 1732  {
1728 1733          l2arc_stop();
1729 1734  
1730 1735          spa_evict_all();
1731 1736  
1732 1737          vdev_cache_stat_fini();
1733 1738          zil_fini();
1734 1739          dmu_fini();
1735 1740          zio_fini();
1736 1741          range_tree_fini();
1737 1742          unique_fini();
1738 1743          refcount_fini();
1739 1744  
1740 1745          avl_destroy(&spa_namespace_avl);
1741 1746          avl_destroy(&spa_spare_avl);
1742 1747          avl_destroy(&spa_l2cache_avl);
1743 1748  
1744 1749          cv_destroy(&spa_namespace_cv);
1745 1750          mutex_destroy(&spa_namespace_lock);
1746 1751          mutex_destroy(&spa_spare_lock);
1747 1752          mutex_destroy(&spa_l2cache_lock);
1748 1753  }
1749 1754  
1750 1755  /*
1751 1756   * Return whether this pool has slogs. No locking needed.
1752 1757   * It's not a problem if the wrong answer is returned as it's only for
1753 1758   * performance and not correctness
1754 1759   */
1755 1760  boolean_t
1756 1761  spa_has_slogs(spa_t *spa)
1757 1762  {
1758 1763          return (spa->spa_log_class->mc_rotor != NULL);
1759 1764  }
1760 1765  
1761 1766  spa_log_state_t
1762 1767  spa_get_log_state(spa_t *spa)
1763 1768  {
1764 1769          return (spa->spa_log_state);
1765 1770  }
1766 1771  
1767 1772  void
1768 1773  spa_set_log_state(spa_t *spa, spa_log_state_t state)
1769 1774  {
1770 1775          spa->spa_log_state = state;
1771 1776  }
1772 1777  
1773 1778  boolean_t
1774 1779  spa_is_root(spa_t *spa)

↓ open down ↓

1110 lines elided

↑ open up ↑

1775 1780  {
1776 1781          return (spa->spa_is_root);
1777 1782  }
1778 1783  
1779 1784  boolean_t
1780 1785  spa_writeable(spa_t *spa)
1781 1786  {
1782 1787          return (!!(spa->spa_mode & FWRITE));
1783 1788  }
1784 1789  
     1790 +static int
     1791 +activate_salted_cksum_check(zfeature_info_t *feature, dmu_tx_t *tx)
     1792 +{
     1793 +        spa_t   *spa = dmu_tx_pool(tx)->dp_spa;
     1794 +
     1795 +        if (!spa_feature_is_active(spa, feature))
     1796 +                return (0);
     1797 +        else
     1798 +                return (SET_ERROR(EBUSY));
     1799 +}
     1800 +
     1801 +static void
     1802 +activate_salted_cksum_sync(zfeature_info_t *feature, dmu_tx_t *tx)
     1803 +{
     1804 +        spa_t   *spa = dmu_tx_pool(tx)->dp_spa;
     1805 +
     1806 +        spa_feature_incr(spa, feature, tx);
     1807 +        /*
     1808 +         * This is the first salted checksum that's been activated, so
     1809 +         * create the persistent checksum salt object now.
     1810 +         */
     1811 +        if (spa->spa_cksum_salt_obj == 0) {
     1812 +                spa->spa_cksum_salt_obj = zap_create_link(spa->spa_meta_objset,
     1813 +                    DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
     1814 +                    DMU_POOL_CHECKSUM_SALT, tx);
     1815 +                VERIFY3U(zap_add(spa->spa_meta_objset,
     1816 +                    spa->spa_cksum_salt_obj, DMU_POOL_CHECKSUM_SALT, 1,
     1817 +                    sizeof (spa->spa_cksum_salt.zcs_bytes),
     1818 +                    spa->spa_cksum_salt.zcs_bytes, tx), ==, 0);
     1819 +        }
     1820 +}
     1821 +
     1822 +/*
     1823 + * Activates a feature associated with a salted checksum. You must call this
     1824 + * function instead of calling spa_feature_incr() directly, because we may
     1825 + * also need to sync the MOS object holding the checksum salt.
     1826 + * Arguments:
     1827 + *      spa     Pool on which to activate the salted checksum feature.
     1828 + *      feature Salted checksum algorithm feature to activate (see
     1829 + *              spa_feature_table).
     1830 + */
     1831 +int
     1832 +spa_activate_salted_cksum(spa_t *spa, struct zfeature_info *feature)
     1833 +{
     1834 +        int err;
     1835 +
     1836 +        /* EBUSY here indicates that the feature is already active */
     1837 +        err = dsl_sync_task(spa_name(spa),
     1838 +            (dsl_checkfunc_t *)activate_salted_cksum_check,
     1839 +            (dsl_syncfunc_t *)activate_salted_cksum_sync, feature, 2);
     1840 +
     1841 +        if (err != 0 && err != EBUSY)
     1842 +                return (err);
     1843 +        else
     1844 +                return (0);
     1845 +}
     1846 +
1785 1847  int
1786 1848  spa_mode(spa_t *spa)
1787 1849  {
1788 1850          return (spa->spa_mode);
1789 1851  }
1790 1852  
1791 1853  uint64_t
1792 1854  spa_bootfs(spa_t *spa)
1793 1855  {
1794 1856          return (spa->spa_bootfs);

1795 1857  }
1796 1858  
1797 1859  uint64_t
1798 1860  spa_delegation(spa_t *spa)
1799 1861  {
1800 1862          return (spa->spa_delegation);
1801 1863  }
1802 1864  
1803 1865  objset_t *
1804 1866  spa_meta_objset(spa_t *spa)
1805 1867  {
1806 1868          return (spa->spa_meta_objset);
1807 1869  }
1808 1870  
1809 1871  enum zio_checksum
1810 1872  spa_dedup_checksum(spa_t *spa)
1811 1873  {
1812 1874          return (spa->spa_dedup_checksum);
1813 1875  }
1814 1876  
1815 1877  /*
1816 1878   * Reset pool scan stat per scan pass (or reboot).
1817 1879   */
1818 1880  void
1819 1881  spa_scan_stat_init(spa_t *spa)
1820 1882  {
1821 1883          /* data not stored on disk */
1822 1884          spa->spa_scan_pass_start = gethrestime_sec();
1823 1885          spa->spa_scan_pass_exam = 0;
1824 1886          vdev_scan_stat_init(spa->spa_root_vdev);
1825 1887  }
1826 1888  
1827 1889  /*
1828 1890   * Get scan stats for zpool status reports
1829 1891   */
1830 1892  int
1831 1893  spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
1832 1894  {
1833 1895          dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL;
1834 1896  
1835 1897          if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE)
1836 1898                  return (SET_ERROR(ENOENT));
1837 1899          bzero(ps, sizeof (pool_scan_stat_t));
1838 1900  
1839 1901          /* data stored on disk */
1840 1902          ps->pss_func = scn->scn_phys.scn_func;
1841 1903          ps->pss_start_time = scn->scn_phys.scn_start_time;
1842 1904          ps->pss_end_time = scn->scn_phys.scn_end_time;
1843 1905          ps->pss_to_examine = scn->scn_phys.scn_to_examine;
1844 1906          ps->pss_examined = scn->scn_phys.scn_examined;
1845 1907          ps->pss_to_process = scn->scn_phys.scn_to_process;
1846 1908          ps->pss_processed = scn->scn_phys.scn_processed;
1847 1909          ps->pss_errors = scn->scn_phys.scn_errors;
1848 1910          ps->pss_state = scn->scn_phys.scn_state;
1849 1911  
1850 1912          /* data not stored on disk */
1851 1913          ps->pss_pass_start = spa->spa_scan_pass_start;
1852 1914          ps->pss_pass_exam = spa->spa_scan_pass_exam;
1853 1915  
1854 1916          return (0);
1855 1917  }
1856 1918  
1857 1919  boolean_t
1858 1920  spa_debug_enabled(spa_t *spa)
1859 1921  {
1860 1922          return (spa->spa_debug);
1861 1923  }

↓ open down ↓

67 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX