Print this page
4045 zfs write throttle & i/o scheduler performance work
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/fs/zfs/spa_misc.c
          +++ new/usr/src/uts/common/fs/zfs/spa_misc.c
↓ open down ↓ 242 lines elided ↑ open up ↑
 243  243  int zfs_flags = 0;
 244  244  #endif
 245  245  
 246  246  /*
 247  247   * zfs_recover can be set to nonzero to attempt to recover from
 248  248   * otherwise-fatal errors, typically caused by on-disk corruption.  When
 249  249   * set, calls to zfs_panic_recover() will turn into warning messages.
 250  250   */
 251  251  int zfs_recover = 0;
 252  252  
 253      -extern int zfs_txg_synctime_ms;
      253 +/*
      254 + * Expiration time in milliseconds. This value has two meanings. First it is
      255 + * used to determine when the spa_deadman() logic should fire. By default the
      256 + * spa_deadman() will fire if spa_sync() has not completed in 1000 seconds.
      257 + * Second, the value determines if an I/O is considered "hung". Any I/O that
      258 + * has not completed in zfs_deadman_synctime_ms is considered "hung" resulting
      259 + * in a system panic.
      260 + */
      261 +uint64_t zfs_deadman_synctime_ms = 1000000ULL;
 254  262  
 255  263  /*
 256      - * Expiration time in units of zfs_txg_synctime_ms. This value has two
 257      - * meanings. First it is used to determine when the spa_deadman logic
 258      - * should fire. By default the spa_deadman will fire if spa_sync has
 259      - * not completed in 1000 * zfs_txg_synctime_ms (i.e. 1000 seconds).
 260      - * Secondly, the value determines if an I/O is considered "hung".
 261      - * Any I/O that has not completed in zfs_deadman_synctime is considered
 262      - * "hung" resulting in a system panic.
      264 + * Check time in milliseconds. This defines the interval at which we check
      265 + * for hung I/O.
 263  266   */
 264      -uint64_t zfs_deadman_synctime = 1000ULL;
      267 +uint64_t zfs_deadman_checktime_ms = 5000ULL;
 265  268  
 266  269  /*
 267  270   * Override the zfs deadman behavior via /etc/system. By default the
 268  271   * deadman is enabled except on VMware and sparc deployments.
 269  272   */
 270  273  int zfs_deadman_enabled = -1;
 271  274  
      275 +/*
      276 + * The worst case is single-sector max-parity RAID-Z blocks, in which
      277 + * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
      278 + * times the size; so just assume that.  Add to this the fact that
      279 + * we can have up to 3 DVAs per bp, and one more factor of 2 because
      280 + * the block may be dittoed with up to 3 DVAs by ddt_sync().  Altogether,
      281 + * the worst case is:
      282 + *     (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24
      283 + */
      284 +int spa_asize_inflation = 24;
 272  285  
 273  286  /*
 274  287   * ==========================================================================
 275  288   * SPA config locking
 276  289   * ==========================================================================
 277  290   */
 278  291  static void
 279  292  spa_config_lock_init(spa_t *spa)
 280  293  {
 281  294          for (int i = 0; i < SCL_LOCKS; i++) {
↓ open down ↓ 210 lines elided ↑ open up ↑
 492  505          spa->spa_freeze_txg = UINT64_MAX;
 493  506          spa->spa_final_txg = UINT64_MAX;
 494  507          spa->spa_load_max_txg = UINT64_MAX;
 495  508          spa->spa_proc = &p0;
 496  509          spa->spa_proc_state = SPA_PROC_NONE;
 497  510  
 498  511          hdlr.cyh_func = spa_deadman;
 499  512          hdlr.cyh_arg = spa;
 500  513          hdlr.cyh_level = CY_LOW_LEVEL;
 501  514  
 502      -        spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime *
 503      -            zfs_txg_synctime_ms);
      515 +        spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms);
 504  516  
 505  517          /*
 506  518           * This determines how often we need to check for hung I/Os after
 507  519           * the cyclic has already fired. Since checking for hung I/Os is
 508  520           * an expensive operation we don't want to check too frequently.
 509      -         * Instead wait for 5 synctimes before checking again.
      521 +         * Instead wait for 5 seconds before checking again.
 510  522           */
 511      -        when.cyt_interval = MSEC2NSEC(5 * zfs_txg_synctime_ms);
      523 +        when.cyt_interval = MSEC2NSEC(zfs_deadman_checktime_ms);
 512  524          when.cyt_when = CY_INFINITY;
 513  525          mutex_enter(&cpu_lock);
 514  526          spa->spa_deadman_cycid = cyclic_add(&hdlr, &when);
 515  527          mutex_exit(&cpu_lock);
 516  528  
 517  529          refcount_create(&spa->spa_refcount);
 518  530          spa_config_lock_init(spa);
 519  531  
 520  532          avl_add(&spa_namespace_avl, spa);
 521  533  
↓ open down ↓ 970 lines elided ↑ open up ↑
1492 1504  uint64_t
1493 1505  spa_freeze_txg(spa_t *spa)
1494 1506  {
1495 1507          return (spa->spa_freeze_txg);
1496 1508  }
1497 1509  
1498 1510  /* ARGSUSED */
1499 1511  uint64_t
1500 1512  spa_get_asize(spa_t *spa, uint64_t lsize)
1501 1513  {
1502      -        /*
1503      -         * The worst case is single-sector max-parity RAID-Z blocks, in which
1504      -         * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
1505      -         * times the size; so just assume that.  Add to this the fact that
1506      -         * we can have up to 3 DVAs per bp, and one more factor of 2 because
1507      -         * the block may be dittoed with up to 3 DVAs by ddt_sync().
1508      -         */
1509      -        return (lsize * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2);
     1514 +        return (lsize * spa_asize_inflation);
1510 1515  }
1511 1516  
1512 1517  uint64_t
1513 1518  spa_get_dspace(spa_t *spa)
1514 1519  {
1515 1520          return (spa->spa_dspace);
1516 1521  }
1517 1522  
1518 1523  void
1519 1524  spa_update_dspace(spa_t *spa)
↓ open down ↓ 329 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX