Print this page
    
Possibility to physically reserve space without writing leaf blocks
    
      
        | Split | Close | 
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/fs/zfs/sys/dbuf.h
          +++ new/usr/src/uts/common/fs/zfs/sys/dbuf.h
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  24   24   * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  25   25   */
  26   26  
  27   27  #ifndef _SYS_DBUF_H
  28   28  #define _SYS_DBUF_H
  29   29  
  30   30  #include <sys/dmu.h>
  31   31  #include <sys/spa.h>
  32   32  #include <sys/txg.h>
  33   33  #include <sys/zio.h>
  34   34  #include <sys/arc.h>
  35   35  #include <sys/zfs_context.h>
  36   36  #include <sys/refcount.h>
  37   37  #include <sys/zrlock.h>
  38   38  
  39   39  #ifdef  __cplusplus
  40   40  extern "C" {
  41   41  #endif
  42   42  
  43   43  #define IN_DMU_SYNC 2
  44   44  
  45   45  /*
  46   46   * define flags for dbuf_read
  47   47   */
  48   48  
  49   49  #define DB_RF_MUST_SUCCEED      (1 << 0)
  50   50  #define DB_RF_CANFAIL           (1 << 1)
  51   51  #define DB_RF_HAVESTRUCT        (1 << 2)
  52   52  #define DB_RF_NOPREFETCH        (1 << 3)
  53   53  #define DB_RF_NEVERWAIT         (1 << 4)
  54   54  #define DB_RF_CACHED            (1 << 5)
  55   55  
  56   56  /*
  57   57   * The simplified state transition diagram for dbufs looks like:
  58   58   *
  59   59   *              +----> READ ----+
  60   60   *              |               |
  61   61   *              |               V
  62   62   *  (alloc)-->UNCACHED       CACHED-->EVICTING-->(free)
  63   63   *              |               ^        ^
  64   64   *              |               |        |
  65   65   *              +----> FILL ----+        |
  66   66   *              |                        |
  67   67   *              |                        |
  68   68   *              +--------> NOFILL -------+
  69   69   *
  70   70   * DB_SEARCH is an invalid state for a dbuf. It is used by dbuf_free_range
  71   71   * to find all dbufs in a range of a dnode and must be less than any other
  72   72   * dbuf_states_t (see comment on dn_dbufs in dnode.h).
  73   73   */
  74   74  typedef enum dbuf_states {
  75   75          DB_SEARCH = -1,
  76   76          DB_UNCACHED,
  77   77          DB_FILL,
  78   78          DB_NOFILL,
  79   79          DB_READ,
  80   80          DB_CACHED,
  81   81          DB_EVICTING
  82   82  } dbuf_states_t;
  83   83  
  84   84  struct dnode;
  85   85  struct dmu_tx;
  86   86  
  87   87  /*
  88   88   * level = 0 means the user data
  89   89   * level = 1 means the single indirect block
  90   90   * etc.
  91   91   */
  92   92  
  93   93  struct dmu_buf_impl;
  94   94  
  95   95  typedef enum override_states {
  96   96          DR_NOT_OVERRIDDEN,
  97   97          DR_IN_DMU_SYNC,
  98   98          DR_OVERRIDDEN
  99   99  } override_states_t;
 100  100  
 101  101  typedef struct dbuf_dirty_record {
  102  102          /* link on our parent's dirty list */
 103  103          list_node_t dr_dirty_node;
 104  104  
 105  105          /* transaction group this data will sync in */
 106  106          uint64_t dr_txg;
 107  107  
 108  108          /* zio of outstanding write IO */
 109  109          zio_t *dr_zio;
 110  110  
 111  111          /* pointer back to our dbuf */
 112  112          struct dmu_buf_impl *dr_dbuf;
 113  113  
 114  114          /* pointer to next dirty record */
 115  115          struct dbuf_dirty_record *dr_next;
 116  116  
 117  117          /* pointer to parent dirty record */
 118  118          struct dbuf_dirty_record *dr_parent;
 119  119  
  120  120          /* How much space was charged to dsl_pool_dirty_space() for this? */
 121  121          unsigned int dr_accounted;
 122  122  
 123  123          union dirty_types {
 124  124                  struct dirty_indirect {
 125  125  
 126  126                          /* protect access to list */
 127  127                          kmutex_t dr_mtx;
 128  128  
 129  129                          /* Our list of dirty children */
 130  130                          list_t dr_children;
 131  131                  } di;
 132  132                  struct dirty_leaf {
 133  133  
 134  134                          /*
 135  135                           * dr_data is set when we dirty the buffer
  
    | ↓ open down ↓ | 135 lines elided | ↑ open up ↑ | 
 136  136                           * so that we can retain the pointer even if it
 137  137                           * gets COW'd in a subsequent transaction group.
 138  138                           */
 139  139                          arc_buf_t *dr_data;
 140  140                          blkptr_t dr_overridden_by;
 141  141                          override_states_t dr_override_state;
 142  142                          uint8_t dr_copies;
 143  143                          boolean_t dr_nopwrite;
 144  144                  } dl;
 145  145          } dt;
      146 +
      147 +        boolean_t dr_zero_write;
 146  148  } dbuf_dirty_record_t;
 147  149  
 148  150  typedef struct dmu_buf_impl {
 149  151          /*
 150  152           * The following members are immutable, with the exception of
 151  153           * db.db_data, which is protected by db_mtx.
 152  154           */
 153  155  
 154  156          /* the publicly visible structure */
 155  157          dmu_buf_t db;
 156  158  
 157  159          /* the objset we belong to */
 158  160          struct objset *db_objset;
 159  161  
 160  162          /*
 161  163           * handle to safely access the dnode we belong to (NULL when evicted)
 162  164           */
 163  165          struct dnode_handle *db_dnode_handle;
 164  166  
 165  167          /*
 166  168           * our parent buffer; if the dnode points to us directly,
 167  169           * db_parent == db_dnode_handle->dnh_dnode->dn_dbuf
 168  170           * only accessed by sync thread ???
 169  171           * (NULL when evicted)
 170  172           * May change from NULL to non-NULL under the protection of db_mtx
 171  173           * (see dbuf_check_blkptr())
 172  174           */
 173  175          struct dmu_buf_impl *db_parent;
 174  176  
 175  177          /*
 176  178           * link for hash table of all dmu_buf_impl_t's
 177  179           */
 178  180          struct dmu_buf_impl *db_hash_next;
 179  181  
 180  182          /* our block number */
 181  183          uint64_t db_blkid;
 182  184  
 183  185          /*
 184  186           * Pointer to the blkptr_t which points to us. May be NULL if we
 185  187           * don't have one yet. (NULL when evicted)
 186  188           */
 187  189          blkptr_t *db_blkptr;
 188  190  
 189  191          /*
 190  192           * Our indirection level.  Data buffers have db_level==0.
 191  193           * Indirect buffers which point to data buffers have
 192  194           * db_level==1. etc.  Buffers which contain dnodes have
 193  195           * db_level==0, since the dnodes are stored in a file.
 194  196           */
 195  197          uint8_t db_level;
 196  198  
 197  199          /* db_mtx protects the members below */
 198  200          kmutex_t db_mtx;
 199  201  
 200  202          /*
 201  203           * Current state of the buffer
 202  204           */
 203  205          dbuf_states_t db_state;
 204  206  
 205  207          /*
 206  208           * Refcount accessed by dmu_buf_{hold,rele}.
 207  209           * If nonzero, the buffer can't be destroyed.
 208  210           * Protected by db_mtx.
 209  211           */
 210  212          refcount_t db_holds;
 211  213  
 212  214          /* buffer holding our data */
 213  215          arc_buf_t *db_buf;
 214  216  
 215  217          kcondvar_t db_changed;
 216  218          dbuf_dirty_record_t *db_data_pending;
 217  219  
 218  220          /* pointer to most recent dirty record for this buffer */
 219  221          dbuf_dirty_record_t *db_last_dirty;
 220  222  
 221  223          /*
  222  224           * Our link on the owner dnode's dn_dbufs list.
 223  225           * Protected by its dn_dbufs_mtx.
 224  226           */
 225  227          avl_node_t db_link;
 226  228  
 227  229          /* Data which is unique to data (leaf) blocks: */
 228  230  
 229  231          /* stuff we store for the user (see dmu_buf_set_user) */
 230  232          void *db_user_ptr;
 231  233          void **db_user_data_ptr_ptr;
 232  234          dmu_buf_evict_func_t *db_evict_func;
 233  235  
 234  236          uint8_t db_immediate_evict;
 235  237          uint8_t db_freed_in_flight;
 236  238  
 237  239          uint8_t db_dirtycnt;
 238  240  } dmu_buf_impl_t;
 239  241  
 240  242  /* Note: the dbuf hash table is exposed only for the mdb module */
 241  243  #define DBUF_MUTEXES 256
 242  244  #define DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)])
 243  245  typedef struct dbuf_hash_table {
 244  246          uint64_t hash_table_mask;
 245  247          dmu_buf_impl_t **hash_table;
 246  248          kmutex_t hash_mutexes[DBUF_MUTEXES];
 247  249  } dbuf_hash_table_t;
 248  250  
 249  251  
 250  252  uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset);
 251  253  
 252  254  dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data);
 253  255  void dbuf_create_bonus(struct dnode *dn);
 254  256  int dbuf_spill_set_blksz(dmu_buf_t *db, uint64_t blksz, dmu_tx_t *tx);
 255  257  void dbuf_spill_hold(struct dnode *dn, dmu_buf_impl_t **dbp, void *tag);
 256  258  
 257  259  void dbuf_rm_spill(struct dnode *dn, dmu_tx_t *tx);
 258  260  
 259  261  dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag);
 260  262  dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid,
 261  263      void *tag);
 262  264  int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, int create,
 263  265      void *tag, dmu_buf_impl_t **dbp);
 264  266  
 265  267  void dbuf_prefetch(struct dnode *dn, uint64_t blkid, zio_priority_t prio);
 266  268  
 267  269  void dbuf_add_ref(dmu_buf_impl_t *db, void *tag);
  
    | ↓ open down ↓ | 112 lines elided | ↑ open up ↑ | 
 268  270  uint64_t dbuf_refcount(dmu_buf_impl_t *db);
 269  271  
 270  272  void dbuf_rele(dmu_buf_impl_t *db, void *tag);
 271  273  void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag);
 272  274  
 273  275  dmu_buf_impl_t *dbuf_find(struct dnode *dn, uint8_t level, uint64_t blkid);
 274  276  
 275  277  int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags);
 276  278  void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx);
 277  279  void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
      280 +void dmu_buf_will_zero_fill(dmu_buf_t *db, dmu_tx_t *tx);
 278  281  void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
 279  282  void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
 280      -dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
      283 +dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx, boolean_t zero_write);
      284 +dbuf_dirty_record_t *dbuf_zero_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 281  285  arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db);
 282  286  void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
 283  287      bp_embedded_type_t etype, enum zio_compress comp,
 284  288      int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx);
 285  289  
 286  290  void dbuf_clear(dmu_buf_impl_t *db);
 287  291  void dbuf_evict(dmu_buf_impl_t *db);
 288  292  
 289  293  void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 290  294  void dbuf_unoverride(dbuf_dirty_record_t *dr);
 291  295  void dbuf_sync_list(list_t *list, dmu_tx_t *tx);
 292  296  void dbuf_release_bp(dmu_buf_impl_t *db);
 293  297  
 294  298  void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end,
 295  299      struct dmu_tx *);
 296  300  
 297  301  void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx);
 298  302  
 299  303  #define DB_DNODE(_db)           ((_db)->db_dnode_handle->dnh_dnode)
 300  304  #define DB_DNODE_LOCK(_db)      ((_db)->db_dnode_handle->dnh_zrlock)
 301  305  #define DB_DNODE_ENTER(_db)     (zrl_add(&DB_DNODE_LOCK(_db)))
 302  306  #define DB_DNODE_EXIT(_db)      (zrl_remove(&DB_DNODE_LOCK(_db)))
 303  307  #define DB_DNODE_HELD(_db)      (!zrl_is_zero(&DB_DNODE_LOCK(_db)))
 304  308  
 305  309  void dbuf_init(void);
 306  310  void dbuf_fini(void);
 307  311  
 308  312  boolean_t dbuf_is_metadata(dmu_buf_impl_t *db);
 309  313  
 310  314  #define DBUF_GET_BUFC_TYPE(_db) \
 311  315          (dbuf_is_metadata(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
 312  316  
 313  317  #define DBUF_IS_CACHEABLE(_db)                                          \
 314  318          ((_db)->db_objset->os_primary_cache == ZFS_CACHE_ALL ||         \
 315  319          (dbuf_is_metadata(_db) &&                                       \
 316  320          ((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA)))
 317  321  
 318  322  #define DBUF_IS_L2CACHEABLE(_db)                                        \
 319  323          ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL ||       \
 320  324          (dbuf_is_metadata(_db) &&                                       \
 321  325          ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA)))
 322  326  
 323  327  #define DBUF_IS_L2COMPRESSIBLE(_db)                                     \
 324  328          ((_db)->db_objset->os_compress != ZIO_COMPRESS_OFF ||           \
 325  329          (dbuf_is_metadata(_db) && zfs_mdcomp_disable == B_FALSE))
 326  330  
 327  331  #ifdef ZFS_DEBUG
 328  332  
 329  333  /*
 330  334   * There should be a ## between the string literal and fmt, to make it
 331  335   * clear that we're joining two strings together, but gcc does not
 332  336   * support that preprocessor token.
 333  337   */
 334  338  #define dprintf_dbuf(dbuf, fmt, ...) do { \
 335  339          if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
 336  340          char __db_buf[32]; \
 337  341          uint64_t __db_obj = (dbuf)->db.db_object; \
 338  342          if (__db_obj == DMU_META_DNODE_OBJECT) \
 339  343                  (void) strcpy(__db_buf, "mdn"); \
 340  344          else \
 341  345                  (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \
 342  346                      (u_longlong_t)__db_obj); \
 343  347          dprintf_ds((dbuf)->db_objset->os_dsl_dataset, \
 344  348              "obj=%s lvl=%u blkid=%lld " fmt, \
 345  349              __db_buf, (dbuf)->db_level, \
 346  350              (u_longlong_t)(dbuf)->db_blkid, __VA_ARGS__); \
 347  351          } \
 348  352  _NOTE(CONSTCOND) } while (0)
 349  353  
 350  354  #define dprintf_dbuf_bp(db, bp, fmt, ...) do {                  \
 351  355          if (zfs_flags & ZFS_DEBUG_DPRINTF) {                    \
 352  356          char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP);  \
 353  357          snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, bp);          \
 354  358          dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf);   \
 355  359          kmem_free(__blkbuf, BP_SPRINTF_LEN);                    \
 356  360          }                                                       \
 357  361  _NOTE(CONSTCOND) } while (0)
 358  362  
 359  363  #define DBUF_VERIFY(db) dbuf_verify(db)
 360  364  
 361  365  #else
 362  366  
 363  367  #define dprintf_dbuf(db, fmt, ...)
 364  368  #define dprintf_dbuf_bp(db, bp, fmt, ...)
 365  369  #define DBUF_VERIFY(db)
 366  370  
 367  371  #endif
 368  372  
 369  373  
 370  374  #ifdef  __cplusplus
 371  375  }
 372  376  #endif
 373  377  
 374  378  #endif /* _SYS_DBUF_H */
  
    | ↓ open down ↓ | 84 lines elided | ↑ open up ↑ | 
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX