3006 VERIFY[S,U,P] and ASSERT[S,U,P] frequently check if first argument is zero
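
For reference, the new single-argument macros replace the common pattern
ASSERT3U(x, ==, 0) / VERIFY3U(x, ==, 0) throughout the tree. A minimal
sketch of the idea, assuming the wrappers simply delegate to the existing
three-operand macros (the actual definitions in sys/debug.h and
sys/zfs_context.h may differ in detail):

    /*
     * Hedged sketch, not the committed definitions: assert/verify that a
     * single expression evaluates to zero by delegating to the existing
     * three-operand unsigned comparison macros.
     */
    #define VERIFY0(x)   VERIFY3U((x), ==, 0)
    #define ASSERT0(x)   ASSERT3U((x), ==, 0)

With these, a call site such as ASSERT3U(dn->dn_maxblkid, ==, 0) in the
diff below becomes simply ASSERT0(dn->dn_maxblkid).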
    
      
--- old/usr/src/uts/common/fs/zfs/dnode.c
+++ new/usr/src/uts/common/fs/zfs/dnode.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2012 by Delphix. All rights reserved.
  24   24   */
  25   25  
  26   26  #include <sys/zfs_context.h>
  27   27  #include <sys/dbuf.h>
  28   28  #include <sys/dnode.h>
  29   29  #include <sys/dmu.h>
  30   30  #include <sys/dmu_impl.h>
  31   31  #include <sys/dmu_tx.h>
  32   32  #include <sys/dmu_objset.h>
  33   33  #include <sys/dsl_dir.h>
  34   34  #include <sys/dsl_dataset.h>
  35   35  #include <sys/spa.h>
  36   36  #include <sys/zio.h>
  37   37  #include <sys/dmu_zfetch.h>
  38   38  
  39   39  static int free_range_compar(const void *node1, const void *node2);
  40   40  
  41   41  static kmem_cache_t *dnode_cache;
  42   42  /*
  43   43   * Define DNODE_STATS to turn on statistic gathering. By default, it is only
  44   44   * turned on when DEBUG is also defined.
  45   45   */
  46   46  #ifdef  DEBUG
  47   47  #define DNODE_STATS
  48   48  #endif  /* DEBUG */
  49   49  
  50   50  #ifdef  DNODE_STATS
  51   51  #define DNODE_STAT_ADD(stat)                    ((stat)++)
  52   52  #else
  53   53  #define DNODE_STAT_ADD(stat)                    /* nothing */
  54   54  #endif  /* DNODE_STATS */
  55   55  
  56   56  static dnode_phys_t dnode_phys_zero;
  57   57  
  58   58  int zfs_default_bs = SPA_MINBLOCKSHIFT;
  59   59  int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
  60   60  
  61   61  static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
  62   62  
  63   63  /* ARGSUSED */
  64   64  static int
  65   65  dnode_cons(void *arg, void *unused, int kmflag)
  66   66  {
  67   67          dnode_t *dn = arg;
  68   68          int i;
  69   69  
  70   70          rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL);
  71   71          mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
  72   72          mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL);
  73   73          cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL);
  74   74  
  75   75          refcount_create(&dn->dn_holds);
  76   76          refcount_create(&dn->dn_tx_holds);
  77   77          list_link_init(&dn->dn_link);
  78   78  
  79   79          bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr));
  80   80          bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels));
  81   81          bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift));
  82   82          bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype));
  83   83          bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk));
  84   84          bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen));
  85   85          bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz));
  86   86  
  87   87          for (i = 0; i < TXG_SIZE; i++) {
  88   88                  list_link_init(&dn->dn_dirty_link[i]);
  89   89                  avl_create(&dn->dn_ranges[i], free_range_compar,
  90   90                      sizeof (free_range_t),
  91   91                      offsetof(struct free_range, fr_node));
  92   92                  list_create(&dn->dn_dirty_records[i],
  93   93                      sizeof (dbuf_dirty_record_t),
  94   94                      offsetof(dbuf_dirty_record_t, dr_dirty_node));
  95   95          }
  96   96  
  97   97          dn->dn_allocated_txg = 0;
  98   98          dn->dn_free_txg = 0;
  99   99          dn->dn_assigned_txg = 0;
 100  100          dn->dn_dirtyctx = 0;
 101  101          dn->dn_dirtyctx_firstset = NULL;
 102  102          dn->dn_bonus = NULL;
 103  103          dn->dn_have_spill = B_FALSE;
 104  104          dn->dn_zio = NULL;
 105  105          dn->dn_oldused = 0;
 106  106          dn->dn_oldflags = 0;
 107  107          dn->dn_olduid = 0;
 108  108          dn->dn_oldgid = 0;
 109  109          dn->dn_newuid = 0;
 110  110          dn->dn_newgid = 0;
 111  111          dn->dn_id_flags = 0;
 112  112  
 113  113          dn->dn_dbufs_count = 0;
 114  114          list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t),
 115  115              offsetof(dmu_buf_impl_t, db_link));
 116  116  
 117  117          dn->dn_moved = 0;
 118  118          return (0);
 119  119  }
 120  120  
 121  121  /* ARGSUSED */
 122  122  static void
 123  123  dnode_dest(void *arg, void *unused)
 124  124  {
 125  125          int i;
 126  126          dnode_t *dn = arg;
 127  127  
 128  128          rw_destroy(&dn->dn_struct_rwlock);
 129  129          mutex_destroy(&dn->dn_mtx);
  
 130  130          mutex_destroy(&dn->dn_dbufs_mtx);
 131  131          cv_destroy(&dn->dn_notxholds);
 132  132          refcount_destroy(&dn->dn_holds);
 133  133          refcount_destroy(&dn->dn_tx_holds);
 134  134          ASSERT(!list_link_active(&dn->dn_link));
 135  135  
 136  136          for (i = 0; i < TXG_SIZE; i++) {
 137  137                  ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
 138  138                  avl_destroy(&dn->dn_ranges[i]);
 139  139                  list_destroy(&dn->dn_dirty_records[i]);
 140      -                ASSERT3U(dn->dn_next_nblkptr[i], ==, 0);
 141      -                ASSERT3U(dn->dn_next_nlevels[i], ==, 0);
 142      -                ASSERT3U(dn->dn_next_indblkshift[i], ==, 0);
 143      -                ASSERT3U(dn->dn_next_bonustype[i], ==, 0);
 144      -                ASSERT3U(dn->dn_rm_spillblk[i], ==, 0);
 145      -                ASSERT3U(dn->dn_next_bonuslen[i], ==, 0);
 146      -                ASSERT3U(dn->dn_next_blksz[i], ==, 0);
      140 +                ASSERT0(dn->dn_next_nblkptr[i]);
      141 +                ASSERT0(dn->dn_next_nlevels[i]);
      142 +                ASSERT0(dn->dn_next_indblkshift[i]);
      143 +                ASSERT0(dn->dn_next_bonustype[i]);
      144 +                ASSERT0(dn->dn_rm_spillblk[i]);
      145 +                ASSERT0(dn->dn_next_bonuslen[i]);
      146 +                ASSERT0(dn->dn_next_blksz[i]);
 147  147          }
 148  148  
 149      -        ASSERT3U(dn->dn_allocated_txg, ==, 0);
 150      -        ASSERT3U(dn->dn_free_txg, ==, 0);
 151      -        ASSERT3U(dn->dn_assigned_txg, ==, 0);
 152      -        ASSERT3U(dn->dn_dirtyctx, ==, 0);
      149 +        ASSERT0(dn->dn_allocated_txg);
      150 +        ASSERT0(dn->dn_free_txg);
      151 +        ASSERT0(dn->dn_assigned_txg);
      152 +        ASSERT0(dn->dn_dirtyctx);
 153  153          ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL);
 154  154          ASSERT3P(dn->dn_bonus, ==, NULL);
 155  155          ASSERT(!dn->dn_have_spill);
 156  156          ASSERT3P(dn->dn_zio, ==, NULL);
 157      -        ASSERT3U(dn->dn_oldused, ==, 0);
 158      -        ASSERT3U(dn->dn_oldflags, ==, 0);
 159      -        ASSERT3U(dn->dn_olduid, ==, 0);
 160      -        ASSERT3U(dn->dn_oldgid, ==, 0);
 161      -        ASSERT3U(dn->dn_newuid, ==, 0);
 162      -        ASSERT3U(dn->dn_newgid, ==, 0);
 163      -        ASSERT3U(dn->dn_id_flags, ==, 0);
      157 +        ASSERT0(dn->dn_oldused);
      158 +        ASSERT0(dn->dn_oldflags);
      159 +        ASSERT0(dn->dn_olduid);
      160 +        ASSERT0(dn->dn_oldgid);
      161 +        ASSERT0(dn->dn_newuid);
      162 +        ASSERT0(dn->dn_newgid);
      163 +        ASSERT0(dn->dn_id_flags);
 164  164  
 165      -        ASSERT3U(dn->dn_dbufs_count, ==, 0);
      165 +        ASSERT0(dn->dn_dbufs_count);
 166  166          list_destroy(&dn->dn_dbufs);
 167  167  }
 168  168  
 169  169  void
 170  170  dnode_init(void)
 171  171  {
 172  172          ASSERT(dnode_cache == NULL);
 173  173          dnode_cache = kmem_cache_create("dnode_t",
 174  174              sizeof (dnode_t),
 175  175              0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0);
 176  176          kmem_cache_set_move(dnode_cache, dnode_move);
 177  177  }
 178  178  
 179  179  void
 180  180  dnode_fini(void)
 181  181  {
 182  182          kmem_cache_destroy(dnode_cache);
 183  183          dnode_cache = NULL;
 184  184  }
 185  185  
 186  186  
 187  187  #ifdef ZFS_DEBUG
 188  188  void
 189  189  dnode_verify(dnode_t *dn)
 190  190  {
 191  191          int drop_struct_lock = FALSE;
 192  192  
 193  193          ASSERT(dn->dn_phys);
 194  194          ASSERT(dn->dn_objset);
 195  195          ASSERT(dn->dn_handle->dnh_dnode == dn);
 196  196  
 197  197          ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
 198  198  
 199  199          if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY))
 200  200                  return;
 201  201  
 202  202          if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
 203  203                  rw_enter(&dn->dn_struct_rwlock, RW_READER);
 204  204                  drop_struct_lock = TRUE;
 205  205          }
 206  206          if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) {
 207  207                  int i;
 208  208                  ASSERT3U(dn->dn_indblkshift, >=, 0);
 209  209                  ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT);
 210  210                  if (dn->dn_datablkshift) {
 211  211                          ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT);
 212  212                          ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT);
 213  213                          ASSERT3U(1<<dn->dn_datablkshift, ==, dn->dn_datablksz);
 214  214                  }
 215  215                  ASSERT3U(dn->dn_nlevels, <=, 30);
 216  216                  ASSERT(DMU_OT_IS_VALID(dn->dn_type));
 217  217                  ASSERT3U(dn->dn_nblkptr, >=, 1);
 218  218                  ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
 219  219                  ASSERT3U(dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
 220  220                  ASSERT3U(dn->dn_datablksz, ==,
 221  221                      dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
 222  222                  ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0);
 223  223                  ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) +
 224  224                      dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
 225  225                  for (i = 0; i < TXG_SIZE; i++) {
 226  226                          ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels);
 227  227                  }
 228  228          }
 229  229          if (dn->dn_phys->dn_type != DMU_OT_NONE)
 230  230                  ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels);
 231  231          ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL);
 232  232          if (dn->dn_dbuf != NULL) {
 233  233                  ASSERT3P(dn->dn_phys, ==,
 234  234                      (dnode_phys_t *)dn->dn_dbuf->db.db_data +
 235  235                      (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT)));
 236  236          }
 237  237          if (drop_struct_lock)
 238  238                  rw_exit(&dn->dn_struct_rwlock);
 239  239  }
 240  240  #endif
 241  241  
 242  242  void
 243  243  dnode_byteswap(dnode_phys_t *dnp)
 244  244  {
 245  245          uint64_t *buf64 = (void*)&dnp->dn_blkptr;
 246  246          int i;
 247  247  
 248  248          if (dnp->dn_type == DMU_OT_NONE) {
 249  249                  bzero(dnp, sizeof (dnode_phys_t));
 250  250                  return;
 251  251          }
 252  252  
 253  253          dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec);
 254  254          dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen);
 255  255          dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid);
 256  256          dnp->dn_used = BSWAP_64(dnp->dn_used);
 257  257  
 258  258          /*
 259  259           * dn_nblkptr is only one byte, so it's OK to read it in either
 260  260           * byte order.  We can't read dn_bonuslen.
 261  261           */
 262  262          ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT);
 263  263          ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR);
 264  264          for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++)
 265  265                  buf64[i] = BSWAP_64(buf64[i]);
 266  266  
 267  267          /*
 268  268           * OK to check dn_bonuslen for zero, because it won't matter if
 269  269           * we have the wrong byte order.  This is necessary because the
 270  270           * dnode dnode is smaller than a regular dnode.
 271  271           */
 272  272          if (dnp->dn_bonuslen != 0) {
 273  273                  /*
 274  274                   * Note that the bonus length calculated here may be
 275  275                   * longer than the actual bonus buffer.  This is because
 276  276                   * we always put the bonus buffer after the last block
 277  277                   * pointer (instead of packing it against the end of the
 278  278                   * dnode buffer).
 279  279                   */
 280  280                  int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t);
 281  281                  size_t len = DN_MAX_BONUSLEN - off;
 282  282                  ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype));
 283  283                  dmu_object_byteswap_t byteswap =
 284  284                      DMU_OT_BYTESWAP(dnp->dn_bonustype);
 285  285                  dmu_ot_byteswap[byteswap].ob_func(dnp->dn_bonus + off, len);
 286  286          }
 287  287  
 288  288          /* Swap SPILL block if we have one */
 289  289          if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
 290  290                  byteswap_uint64_array(&dnp->dn_spill, sizeof (blkptr_t));
 291  291  
 292  292  }
 293  293  
 294  294  void
 295  295  dnode_buf_byteswap(void *vbuf, size_t size)
 296  296  {
 297  297          dnode_phys_t *buf = vbuf;
 298  298          int i;
 299  299  
 300  300          ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
 301  301          ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0);
 302  302  
 303  303          size >>= DNODE_SHIFT;
 304  304          for (i = 0; i < size; i++) {
 305  305                  dnode_byteswap(buf);
 306  306                  buf++;
 307  307          }
 308  308  }
 309  309  
 310  310  static int
 311  311  free_range_compar(const void *node1, const void *node2)
 312  312  {
 313  313          const free_range_t *rp1 = node1;
 314  314          const free_range_t *rp2 = node2;
 315  315  
 316  316          if (rp1->fr_blkid < rp2->fr_blkid)
 317  317                  return (-1);
 318  318          else if (rp1->fr_blkid > rp2->fr_blkid)
 319  319                  return (1);
 320  320          else return (0);
 321  321  }
 322  322  
 323  323  void
 324  324  dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
 325  325  {
 326  326          ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
 327  327  
 328  328          dnode_setdirty(dn, tx);
 329  329          rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 330  330          ASSERT3U(newsize, <=, DN_MAX_BONUSLEN -
 331  331              (dn->dn_nblkptr-1) * sizeof (blkptr_t));
 332  332          dn->dn_bonuslen = newsize;
 333  333          if (newsize == 0)
 334  334                  dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN;
 335  335          else
 336  336                  dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
 337  337          rw_exit(&dn->dn_struct_rwlock);
 338  338  }
 339  339  
 340  340  void
 341  341  dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx)
 342  342  {
 343  343          ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
 344  344          dnode_setdirty(dn, tx);
 345  345          rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 346  346          dn->dn_bonustype = newtype;
 347  347          dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
 348  348          rw_exit(&dn->dn_struct_rwlock);
 349  349  }
 350  350  
 351  351  void
 352  352  dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx)
 353  353  {
  
 354  354          ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
 355  355          ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
 356  356          dnode_setdirty(dn, tx);
 357  357          dn->dn_rm_spillblk[tx->tx_txg&TXG_MASK] = DN_KILL_SPILLBLK;
 358  358          dn->dn_have_spill = B_FALSE;
 359  359  }
 360  360  
 361  361  static void
 362  362  dnode_setdblksz(dnode_t *dn, int size)
 363  363  {
 364      -        ASSERT3U(P2PHASE(size, SPA_MINBLOCKSIZE), ==, 0);
      364 +        ASSERT0(P2PHASE(size, SPA_MINBLOCKSIZE));
 365  365          ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
 366  366          ASSERT3U(size, >=, SPA_MINBLOCKSIZE);
 367  367          ASSERT3U(size >> SPA_MINBLOCKSHIFT, <,
 368  368              1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
 369  369          dn->dn_datablksz = size;
 370  370          dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
 371  371          dn->dn_datablkshift = ISP2(size) ? highbit(size - 1) : 0;
 372  372  }
 373  373  
 374  374  static dnode_t *
 375  375  dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
 376  376      uint64_t object, dnode_handle_t *dnh)
 377  377  {
 378  378          dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
 379  379  
 380  380          ASSERT(!POINTER_IS_VALID(dn->dn_objset));
 381  381          dn->dn_moved = 0;
 382  382  
 383  383          /*
 384  384           * Defer setting dn_objset until the dnode is ready to be a candidate
 385  385           * for the dnode_move() callback.
 386  386           */
 387  387          dn->dn_object = object;
 388  388          dn->dn_dbuf = db;
 389  389          dn->dn_handle = dnh;
 390  390          dn->dn_phys = dnp;
 391  391  
 392  392          if (dnp->dn_datablkszsec) {
 393  393                  dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
 394  394          } else {
 395  395                  dn->dn_datablksz = 0;
 396  396                  dn->dn_datablkszsec = 0;
 397  397                  dn->dn_datablkshift = 0;
 398  398          }
 399  399          dn->dn_indblkshift = dnp->dn_indblkshift;
 400  400          dn->dn_nlevels = dnp->dn_nlevels;
 401  401          dn->dn_type = dnp->dn_type;
 402  402          dn->dn_nblkptr = dnp->dn_nblkptr;
 403  403          dn->dn_checksum = dnp->dn_checksum;
 404  404          dn->dn_compress = dnp->dn_compress;
 405  405          dn->dn_bonustype = dnp->dn_bonustype;
 406  406          dn->dn_bonuslen = dnp->dn_bonuslen;
 407  407          dn->dn_maxblkid = dnp->dn_maxblkid;
 408  408          dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
 409  409          dn->dn_id_flags = 0;
 410  410  
 411  411          dmu_zfetch_init(&dn->dn_zfetch, dn);
 412  412  
 413  413          ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
 414  414  
 415  415          mutex_enter(&os->os_lock);
 416  416          list_insert_head(&os->os_dnodes, dn);
 417  417          membar_producer();
 418  418          /*
 419  419           * Everything else must be valid before assigning dn_objset makes the
 420  420           * dnode eligible for dnode_move().
 421  421           */
 422  422          dn->dn_objset = os;
 423  423          mutex_exit(&os->os_lock);
 424  424  
 425  425          arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER);
 426  426          return (dn);
 427  427  }
 428  428  
 429  429  /*
 430  430   * Caller must be holding the dnode handle, which is released upon return.
 431  431   */
 432  432  static void
 433  433  dnode_destroy(dnode_t *dn)
 434  434  {
 435  435          objset_t *os = dn->dn_objset;
 436  436  
 437  437          ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0);
 438  438  
 439  439          mutex_enter(&os->os_lock);
 440  440          POINTER_INVALIDATE(&dn->dn_objset);
 441  441          list_remove(&os->os_dnodes, dn);
 442  442          mutex_exit(&os->os_lock);
 443  443  
 444  444          /* the dnode can no longer move, so we can release the handle */
 445  445          zrl_remove(&dn->dn_handle->dnh_zrlock);
 446  446  
 447  447          dn->dn_allocated_txg = 0;
 448  448          dn->dn_free_txg = 0;
 449  449          dn->dn_assigned_txg = 0;
 450  450  
 451  451          dn->dn_dirtyctx = 0;
 452  452          if (dn->dn_dirtyctx_firstset != NULL) {
 453  453                  kmem_free(dn->dn_dirtyctx_firstset, 1);
 454  454                  dn->dn_dirtyctx_firstset = NULL;
 455  455          }
 456  456          if (dn->dn_bonus != NULL) {
 457  457                  mutex_enter(&dn->dn_bonus->db_mtx);
 458  458                  dbuf_evict(dn->dn_bonus);
 459  459                  dn->dn_bonus = NULL;
 460  460          }
 461  461          dn->dn_zio = NULL;
 462  462  
 463  463          dn->dn_have_spill = B_FALSE;
 464  464          dn->dn_oldused = 0;
 465  465          dn->dn_oldflags = 0;
 466  466          dn->dn_olduid = 0;
 467  467          dn->dn_oldgid = 0;
 468  468          dn->dn_newuid = 0;
 469  469          dn->dn_newgid = 0;
 470  470          dn->dn_id_flags = 0;
 471  471  
 472  472          dmu_zfetch_rele(&dn->dn_zfetch);
 473  473          kmem_cache_free(dnode_cache, dn);
 474  474          arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER);
 475  475  }
 476  476  
 477  477  void
 478  478  dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
 479  479      dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 480  480  {
 481  481          int i;
 482  482  
 483  483          if (blocksize == 0)
 484  484                  blocksize = 1 << zfs_default_bs;
 485  485          else if (blocksize > SPA_MAXBLOCKSIZE)
 486  486                  blocksize = SPA_MAXBLOCKSIZE;
 487  487          else
 488  488                  blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);
 489  489  
 490  490          if (ibs == 0)
 491  491                  ibs = zfs_default_ibs;
 492  492  
 493  493          ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT);
 494  494  
 495  495          dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d\n", dn->dn_objset,
 496  496              dn->dn_object, tx->tx_txg, blocksize, ibs);
 497  497  
 498  498          ASSERT(dn->dn_type == DMU_OT_NONE);
  
 499  499          ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0);
 500  500          ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE);
 501  501          ASSERT(ot != DMU_OT_NONE);
 502  502          ASSERT(DMU_OT_IS_VALID(ot));
 503  503          ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
 504  504              (bonustype == DMU_OT_SA && bonuslen == 0) ||
 505  505              (bonustype != DMU_OT_NONE && bonuslen != 0));
 506  506          ASSERT(DMU_OT_IS_VALID(bonustype));
 507  507          ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
 508  508          ASSERT(dn->dn_type == DMU_OT_NONE);
 509      -        ASSERT3U(dn->dn_maxblkid, ==, 0);
 510      -        ASSERT3U(dn->dn_allocated_txg, ==, 0);
 511      -        ASSERT3U(dn->dn_assigned_txg, ==, 0);
      509 +        ASSERT0(dn->dn_maxblkid);
      510 +        ASSERT0(dn->dn_allocated_txg);
      511 +        ASSERT0(dn->dn_assigned_txg);
 512  512          ASSERT(refcount_is_zero(&dn->dn_tx_holds));
 513  513          ASSERT3U(refcount_count(&dn->dn_holds), <=, 1);
 514  514          ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
 515  515  
 516  516          for (i = 0; i < TXG_SIZE; i++) {
 517      -                ASSERT3U(dn->dn_next_nblkptr[i], ==, 0);
 518      -                ASSERT3U(dn->dn_next_nlevels[i], ==, 0);
 519      -                ASSERT3U(dn->dn_next_indblkshift[i], ==, 0);
 520      -                ASSERT3U(dn->dn_next_bonuslen[i], ==, 0);
 521      -                ASSERT3U(dn->dn_next_bonustype[i], ==, 0);
 522      -                ASSERT3U(dn->dn_rm_spillblk[i], ==, 0);
 523      -                ASSERT3U(dn->dn_next_blksz[i], ==, 0);
      517 +                ASSERT0(dn->dn_next_nblkptr[i]);
      518 +                ASSERT0(dn->dn_next_nlevels[i]);
      519 +                ASSERT0(dn->dn_next_indblkshift[i]);
      520 +                ASSERT0(dn->dn_next_bonuslen[i]);
      521 +                ASSERT0(dn->dn_next_bonustype[i]);
      522 +                ASSERT0(dn->dn_rm_spillblk[i]);
      523 +                ASSERT0(dn->dn_next_blksz[i]);
 524  524                  ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
 525  525                  ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
 526      -                ASSERT3U(avl_numnodes(&dn->dn_ranges[i]), ==, 0);
      526 +                ASSERT0(avl_numnodes(&dn->dn_ranges[i]));
 527  527          }
 528  528  
 529  529          dn->dn_type = ot;
 530  530          dnode_setdblksz(dn, blocksize);
 531  531          dn->dn_indblkshift = ibs;
 532  532          dn->dn_nlevels = 1;
 533  533          if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
 534  534                  dn->dn_nblkptr = 1;
 535  535          else
 536  536                  dn->dn_nblkptr = 1 +
 537  537                      ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
 538  538          dn->dn_bonustype = bonustype;
 539  539          dn->dn_bonuslen = bonuslen;
 540  540          dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
 541  541          dn->dn_compress = ZIO_COMPRESS_INHERIT;
 542  542          dn->dn_dirtyctx = 0;
 543  543  
 544  544          dn->dn_free_txg = 0;
 545  545          if (dn->dn_dirtyctx_firstset) {
 546  546                  kmem_free(dn->dn_dirtyctx_firstset, 1);
 547  547                  dn->dn_dirtyctx_firstset = NULL;
 548  548          }
 549  549  
 550  550          dn->dn_allocated_txg = tx->tx_txg;
 551  551          dn->dn_id_flags = 0;
 552  552  
 553  553          dnode_setdirty(dn, tx);
 554  554          dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
 555  555          dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
 556  556          dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
 557  557          dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz;
  
 558  558  }
 559  559  
 560  560  void
 561  561  dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
 562  562      dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 563  563  {
 564  564          int nblkptr;
 565  565  
 566  566          ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
 567  567          ASSERT3U(blocksize, <=, SPA_MAXBLOCKSIZE);
 568      -        ASSERT3U(blocksize % SPA_MINBLOCKSIZE, ==, 0);
      568 +        ASSERT0(blocksize % SPA_MINBLOCKSIZE);
 569  569          ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
 570  570          ASSERT(tx->tx_txg != 0);
 571  571          ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
 572  572              (bonustype != DMU_OT_NONE && bonuslen != 0) ||
 573  573              (bonustype == DMU_OT_SA && bonuslen == 0));
 574  574          ASSERT(DMU_OT_IS_VALID(bonustype));
 575  575          ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
 576  576  
 577  577          /* clean up any unreferenced dbufs */
 578  578          dnode_evict_dbufs(dn);
 579  579  
 580  580          dn->dn_id_flags = 0;
 581  581  
 582  582          rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 583  583          dnode_setdirty(dn, tx);
 584  584          if (dn->dn_datablksz != blocksize) {
 585  585                  /* change blocksize */
 586  586                  ASSERT(dn->dn_maxblkid == 0 &&
 587  587                      (BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) ||
 588  588                      dnode_block_freed(dn, 0)));
 589  589                  dnode_setdblksz(dn, blocksize);
 590  590                  dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize;
 591  591          }
 592  592          if (dn->dn_bonuslen != bonuslen)
 593  593                  dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen;
 594  594  
 595  595          if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
 596  596                  nblkptr = 1;
 597  597          else
 598  598                  nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
 599  599          if (dn->dn_bonustype != bonustype)
 600  600                  dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype;
 601  601          if (dn->dn_nblkptr != nblkptr)
 602  602                  dn->dn_next_nblkptr[tx->tx_txg&TXG_MASK] = nblkptr;
 603  603          if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
 604  604                  dbuf_rm_spill(dn, tx);
 605  605                  dnode_rm_spill(dn, tx);
 606  606          }
 607  607          rw_exit(&dn->dn_struct_rwlock);
 608  608  
 609  609          /* change type */
 610  610          dn->dn_type = ot;
 611  611  
 612  612          /* change bonus size and type */
 613  613          mutex_enter(&dn->dn_mtx);
 614  614          dn->dn_bonustype = bonustype;
 615  615          dn->dn_bonuslen = bonuslen;
 616  616          dn->dn_nblkptr = nblkptr;
 617  617          dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
 618  618          dn->dn_compress = ZIO_COMPRESS_INHERIT;
 619  619          ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
 620  620  
 621  621          /* fix up the bonus db_size */
 622  622          if (dn->dn_bonus) {
 623  623                  dn->dn_bonus->db.db_size =
 624  624                      DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t);
 625  625                  ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size);
 626  626          }
 627  627  
 628  628          dn->dn_allocated_txg = tx->tx_txg;
 629  629          mutex_exit(&dn->dn_mtx);
 630  630  }
 631  631  
 632  632  #ifdef  DNODE_STATS
 633  633  static struct {
 634  634          uint64_t dms_dnode_invalid;
 635  635          uint64_t dms_dnode_recheck1;
 636  636          uint64_t dms_dnode_recheck2;
 637  637          uint64_t dms_dnode_special;
 638  638          uint64_t dms_dnode_handle;
 639  639          uint64_t dms_dnode_rwlock;
 640  640          uint64_t dms_dnode_active;
 641  641  } dnode_move_stats;
 642  642  #endif  /* DNODE_STATS */
 643  643  
 644  644  static void
 645  645  dnode_move_impl(dnode_t *odn, dnode_t *ndn)
 646  646  {
 647  647          int i;
 648  648  
 649  649          ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock));
 650  650          ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx));
 651  651          ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx));
 652  652          ASSERT(!RW_LOCK_HELD(&odn->dn_zfetch.zf_rwlock));
 653  653  
 654  654          /* Copy fields. */
 655  655          ndn->dn_objset = odn->dn_objset;
 656  656          ndn->dn_object = odn->dn_object;
 657  657          ndn->dn_dbuf = odn->dn_dbuf;
 658  658          ndn->dn_handle = odn->dn_handle;
 659  659          ndn->dn_phys = odn->dn_phys;
 660  660          ndn->dn_type = odn->dn_type;
 661  661          ndn->dn_bonuslen = odn->dn_bonuslen;
 662  662          ndn->dn_bonustype = odn->dn_bonustype;
 663  663          ndn->dn_nblkptr = odn->dn_nblkptr;
 664  664          ndn->dn_checksum = odn->dn_checksum;
 665  665          ndn->dn_compress = odn->dn_compress;
 666  666          ndn->dn_nlevels = odn->dn_nlevels;
 667  667          ndn->dn_indblkshift = odn->dn_indblkshift;
 668  668          ndn->dn_datablkshift = odn->dn_datablkshift;
 669  669          ndn->dn_datablkszsec = odn->dn_datablkszsec;
 670  670          ndn->dn_datablksz = odn->dn_datablksz;
 671  671          ndn->dn_maxblkid = odn->dn_maxblkid;
 672  672          bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0],
 673  673              sizeof (odn->dn_next_nblkptr));
 674  674          bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0],
 675  675              sizeof (odn->dn_next_nlevels));
 676  676          bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0],
 677  677              sizeof (odn->dn_next_indblkshift));
 678  678          bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0],
 679  679              sizeof (odn->dn_next_bonustype));
 680  680          bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0],
 681  681              sizeof (odn->dn_rm_spillblk));
 682  682          bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0],
 683  683              sizeof (odn->dn_next_bonuslen));
 684  684          bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0],
 685  685              sizeof (odn->dn_next_blksz));
 686  686          for (i = 0; i < TXG_SIZE; i++) {
 687  687                  list_move_tail(&ndn->dn_dirty_records[i],
 688  688                      &odn->dn_dirty_records[i]);
 689  689          }
 690  690          bcopy(&odn->dn_ranges[0], &ndn->dn_ranges[0], sizeof (odn->dn_ranges));
 691  691          ndn->dn_allocated_txg = odn->dn_allocated_txg;
 692  692          ndn->dn_free_txg = odn->dn_free_txg;
 693  693          ndn->dn_assigned_txg = odn->dn_assigned_txg;
 694  694          ndn->dn_dirtyctx = odn->dn_dirtyctx;
 695  695          ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
 696  696          ASSERT(refcount_count(&odn->dn_tx_holds) == 0);
 697  697          refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
 698  698          ASSERT(list_is_empty(&ndn->dn_dbufs));
 699  699          list_move_tail(&ndn->dn_dbufs, &odn->dn_dbufs);
 700  700          ndn->dn_dbufs_count = odn->dn_dbufs_count;
 701  701          ndn->dn_bonus = odn->dn_bonus;
 702  702          ndn->dn_have_spill = odn->dn_have_spill;
 703  703          ndn->dn_zio = odn->dn_zio;
 704  704          ndn->dn_oldused = odn->dn_oldused;
 705  705          ndn->dn_oldflags = odn->dn_oldflags;
 706  706          ndn->dn_olduid = odn->dn_olduid;
 707  707          ndn->dn_oldgid = odn->dn_oldgid;
 708  708          ndn->dn_newuid = odn->dn_newuid;
 709  709          ndn->dn_newgid = odn->dn_newgid;
 710  710          ndn->dn_id_flags = odn->dn_id_flags;
 711  711          dmu_zfetch_init(&ndn->dn_zfetch, NULL);
 712  712          list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream);
 713  713          ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode;
 714  714          ndn->dn_zfetch.zf_stream_cnt = odn->dn_zfetch.zf_stream_cnt;
 715  715          ndn->dn_zfetch.zf_alloc_fail = odn->dn_zfetch.zf_alloc_fail;
 716  716  
 717  717          /*
 718  718           * Update back pointers. Updating the handle fixes the back pointer of
 719  719           * every descendant dbuf as well as the bonus dbuf.
 720  720           */
 721  721          ASSERT(ndn->dn_handle->dnh_dnode == odn);
 722  722          ndn->dn_handle->dnh_dnode = ndn;
 723  723          if (ndn->dn_zfetch.zf_dnode == odn) {
 724  724                  ndn->dn_zfetch.zf_dnode = ndn;
 725  725          }
 726  726  
 727  727          /*
 728  728           * Invalidate the original dnode by clearing all of its back pointers.
 729  729           */
 730  730          odn->dn_dbuf = NULL;
 731  731          odn->dn_handle = NULL;
 732  732          list_create(&odn->dn_dbufs, sizeof (dmu_buf_impl_t),
 733  733              offsetof(dmu_buf_impl_t, db_link));
 734  734          odn->dn_dbufs_count = 0;
 735  735          odn->dn_bonus = NULL;
 736  736          odn->dn_zfetch.zf_dnode = NULL;
 737  737  
 738  738          /*
 739  739           * Set the low bit of the objset pointer to ensure that dnode_move()
 740  740           * recognizes the dnode as invalid in any subsequent callback.
 741  741           */
 742  742          POINTER_INVALIDATE(&odn->dn_objset);
 743  743  
 744  744          /*
 745  745           * Satisfy the destructor.
 746  746           */
 747  747          for (i = 0; i < TXG_SIZE; i++) {
 748  748                  list_create(&odn->dn_dirty_records[i],
 749  749                      sizeof (dbuf_dirty_record_t),
 750  750                      offsetof(dbuf_dirty_record_t, dr_dirty_node));
 751  751                  odn->dn_ranges[i].avl_root = NULL;
 752  752                  odn->dn_ranges[i].avl_numnodes = 0;
 753  753                  odn->dn_next_nlevels[i] = 0;
 754  754                  odn->dn_next_indblkshift[i] = 0;
 755  755                  odn->dn_next_bonustype[i] = 0;
 756  756                  odn->dn_rm_spillblk[i] = 0;
 757  757                  odn->dn_next_bonuslen[i] = 0;
 758  758                  odn->dn_next_blksz[i] = 0;
 759  759          }
 760  760          odn->dn_allocated_txg = 0;
 761  761          odn->dn_free_txg = 0;
 762  762          odn->dn_assigned_txg = 0;
 763  763          odn->dn_dirtyctx = 0;
 764  764          odn->dn_dirtyctx_firstset = NULL;
 765  765          odn->dn_have_spill = B_FALSE;
 766  766          odn->dn_zio = NULL;
 767  767          odn->dn_oldused = 0;
 768  768          odn->dn_oldflags = 0;
 769  769          odn->dn_olduid = 0;
 770  770          odn->dn_oldgid = 0;
 771  771          odn->dn_newuid = 0;
 772  772          odn->dn_newgid = 0;
 773  773          odn->dn_id_flags = 0;
 774  774  
 775  775          /*
 776  776           * Mark the dnode.
 777  777           */
 778  778          ndn->dn_moved = 1;
 779  779          odn->dn_moved = (uint8_t)-1;
 780  780  }
 781  781  
 782  782  #ifdef  _KERNEL
 783  783  /*ARGSUSED*/
 784  784  static kmem_cbrc_t
 785  785  dnode_move(void *buf, void *newbuf, size_t size, void *arg)
 786  786  {
 787  787          dnode_t *odn = buf, *ndn = newbuf;
 788  788          objset_t *os;
 789  789          int64_t refcount;
 790  790          uint32_t dbufs;
 791  791  
 792  792          /*
 793  793           * The dnode is on the objset's list of known dnodes if the objset
 794  794           * pointer is valid. We set the low bit of the objset pointer when
 795  795           * freeing the dnode to invalidate it, and the memory patterns written
 796  796           * by kmem (baddcafe and deadbeef) set at least one of the two low bits.
 797  797           * A newly created dnode sets the objset pointer last of all to indicate
 798  798           * that the dnode is known and in a valid state to be moved by this
 799  799           * function.
 800  800           */
 801  801          os = odn->dn_objset;
 802  802          if (!POINTER_IS_VALID(os)) {
 803  803                  DNODE_STAT_ADD(dnode_move_stats.dms_dnode_invalid);
 804  804                  return (KMEM_CBRC_DONT_KNOW);
 805  805          }
 806  806  
 807  807          /*
 808  808           * Ensure that the objset does not go away during the move.
 809  809           */
 810  810          rw_enter(&os_lock, RW_WRITER);
 811  811          if (os != odn->dn_objset) {
 812  812                  rw_exit(&os_lock);
 813  813                  DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck1);
 814  814                  return (KMEM_CBRC_DONT_KNOW);
 815  815          }
 816  816  
 817  817          /*
 818  818           * If the dnode is still valid, then so is the objset. We know that no
 819  819           * valid objset can be freed while we hold os_lock, so we can safely
 820  820           * ensure that the objset remains in use.
 821  821           */
 822  822          mutex_enter(&os->os_lock);
 823  823  
 824  824          /*
 825  825           * Recheck the objset pointer in case the dnode was removed just before
 826  826           * acquiring the lock.
 827  827           */
 828  828          if (os != odn->dn_objset) {
 829  829                  mutex_exit(&os->os_lock);
 830  830                  rw_exit(&os_lock);
 831  831                  DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck2);
 832  832                  return (KMEM_CBRC_DONT_KNOW);
 833  833          }
 834  834  
 835  835          /*
 836  836           * At this point we know that as long as we hold os->os_lock, the dnode
 837  837           * cannot be freed and fields within the dnode can be safely accessed.
 838  838           * The objset listing this dnode cannot go away as long as this dnode is
 839  839           * on its list.
 840  840           */
 841  841          rw_exit(&os_lock);
 842  842          if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) {
 843  843                  mutex_exit(&os->os_lock);
 844  844                  DNODE_STAT_ADD(dnode_move_stats.dms_dnode_special);
 845  845                  return (KMEM_CBRC_NO);
 846  846          }
 847  847          ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */
 848  848  
 849  849          /*
 850  850           * Lock the dnode handle to prevent the dnode from obtaining any new
 851  851           * holds. This also prevents the descendant dbufs and the bonus dbuf
 852  852           * from accessing the dnode, so that we can discount their holds. The
 853  853           * handle is safe to access because we know that while the dnode cannot
 854  854           * go away, neither can its handle. Once we hold dnh_zrlock, we can
 855  855           * safely move any dnode referenced only by dbufs.
 856  856           */
 857  857          if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) {
 858  858                  mutex_exit(&os->os_lock);
 859  859                  DNODE_STAT_ADD(dnode_move_stats.dms_dnode_handle);
 860  860                  return (KMEM_CBRC_LATER);
 861  861          }
 862  862  
 863  863          /*
 864  864           * Ensure a consistent view of the dnode's holds and the dnode's dbufs.
 865  865           * We need to guarantee that there is a hold for every dbuf in order to
 866  866           * determine whether the dnode is actively referenced. Falsely matching
 867  867           * a dbuf to an active hold would lead to an unsafe move. It's possible
 868  868           * that a thread already having an active dnode hold is about to add a
 869  869           * dbuf, and we can't compare hold and dbuf counts while the add is in
 870  870           * progress.
 871  871           */
 872  872          if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) {
 873  873                  zrl_exit(&odn->dn_handle->dnh_zrlock);
 874  874                  mutex_exit(&os->os_lock);
 875  875                  DNODE_STAT_ADD(dnode_move_stats.dms_dnode_rwlock);
 876  876                  return (KMEM_CBRC_LATER);
 877  877          }
 878  878  
 879  879          /*
 880  880           * A dbuf may be removed (evicted) without an active dnode hold. In that
 881  881           * case, the dbuf count is decremented under the handle lock before the
 882  882           * dbuf's hold is released. This order ensures that if we count the hold
 883  883           * after the dbuf is removed but before its hold is released, we will
 884  884           * treat the unmatched hold as active and exit safely. If we count the
 885  885           * hold before the dbuf is removed, the hold is discounted, and the
 886  886           * removal is blocked until the move completes.
 887  887           */
 888  888          refcount = refcount_count(&odn->dn_holds);
 889  889          ASSERT(refcount >= 0);
 890  890          dbufs = odn->dn_dbufs_count;
 891  891  
 892  892          /* We can't have more dbufs than dnode holds. */
 893  893          ASSERT3U(dbufs, <=, refcount);
 894  894          DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount,
 895  895              uint32_t, dbufs);
 896  896  
 897  897          if (refcount > dbufs) {
 898  898                  rw_exit(&odn->dn_struct_rwlock);
 899  899                  zrl_exit(&odn->dn_handle->dnh_zrlock);
 900  900                  mutex_exit(&os->os_lock);
 901  901                  DNODE_STAT_ADD(dnode_move_stats.dms_dnode_active);
 902  902                  return (KMEM_CBRC_LATER);
 903  903          }
 904  904  
 905  905          rw_exit(&odn->dn_struct_rwlock);
 906  906  
 907  907          /*
 908  908           * At this point we know that anyone with a hold on the dnode is not
 909  909           * actively referencing it. The dnode is known and in a valid state to
 910  910           * move. We're holding the locks needed to execute the critical section.
 911  911           */
 912  912          dnode_move_impl(odn, ndn);
 913  913  
 914  914          list_link_replace(&odn->dn_link, &ndn->dn_link);
 915  915          /* If the dnode was safe to move, the refcount cannot have changed. */
 916  916          ASSERT(refcount == refcount_count(&ndn->dn_holds));
 917  917          ASSERT(dbufs == ndn->dn_dbufs_count);
 918  918          zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */
 919  919          mutex_exit(&os->os_lock);
 920  920  
 921  921          return (KMEM_CBRC_YES);
 922  922  }
 923  923  #endif  /* _KERNEL */
 924  924  
 925  925  void
 926  926  dnode_special_close(dnode_handle_t *dnh)
 927  927  {
 928  928          dnode_t *dn = dnh->dnh_dnode;
 929  929  
 930  930          /*
 931  931           * Wait for final references to the dnode to clear.  This can
 932  932           * only happen if the arc is asynchronously evicting state that
 933  933           * has a hold on this dnode while we are trying to evict this
 934  934           * dnode.
 935  935           */
 936  936          while (refcount_count(&dn->dn_holds) > 0)
 937  937                  delay(1);
 938  938          zrl_add(&dnh->dnh_zrlock);
 939  939          dnode_destroy(dn); /* implicit zrl_remove() */
 940  940          zrl_destroy(&dnh->dnh_zrlock);
 941  941          dnh->dnh_dnode = NULL;
 942  942  }
 943  943  
 944  944  dnode_t *
 945  945  dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
 946  946      dnode_handle_t *dnh)
 947  947  {
 948  948          dnode_t *dn = dnode_create(os, dnp, NULL, object, dnh);
 949  949          dnh->dnh_dnode = dn;
 950  950          zrl_init(&dnh->dnh_zrlock);
 951  951          DNODE_VERIFY(dn);
 952  952          return (dn);
 953  953  }
 954  954  
 955  955  static void
 956  956  dnode_buf_pageout(dmu_buf_t *db, void *arg)
 957  957  {
 958  958          dnode_children_t *children_dnodes = arg;
 959  959          int i;
 960  960          int epb = db->db_size >> DNODE_SHIFT;
 961  961  
 962  962          ASSERT(epb == children_dnodes->dnc_count);
 963  963  
 964  964          for (i = 0; i < epb; i++) {
 965  965                  dnode_handle_t *dnh = &children_dnodes->dnc_children[i];
 966  966                  dnode_t *dn;
 967  967  
 968  968                  /*
 969  969                   * The dnode handle lock guards against the dnode moving to
 970  970                   * another valid address, so there is no need here to guard
 971  971                   * against changes to or from NULL.
 972  972                   */
 973  973                  if (dnh->dnh_dnode == NULL) {
 974  974                          zrl_destroy(&dnh->dnh_zrlock);
 975  975                          continue;
 976  976                  }
 977  977  
 978  978                  zrl_add(&dnh->dnh_zrlock);
 979  979                  dn = dnh->dnh_dnode;
 980  980                  /*
 981  981                   * If there are holds on this dnode, then there should
 982  982                   * be holds on the dnode's containing dbuf as well; thus
 983  983                   * it wouldn't be eligible for eviction and this function
 984  984                   * would not have been called.
 985  985                   */
 986  986                  ASSERT(refcount_is_zero(&dn->dn_holds));
 987  987                  ASSERT(refcount_is_zero(&dn->dn_tx_holds));
 988  988  
 989  989                  dnode_destroy(dn); /* implicit zrl_remove() */
 990  990                  zrl_destroy(&dnh->dnh_zrlock);
 991  991                  dnh->dnh_dnode = NULL;
 992  992          }
 993  993          kmem_free(children_dnodes, sizeof (dnode_children_t) +
 994  994              (epb - 1) * sizeof (dnode_handle_t));
 995  995  }
 996  996  
 997  997  /*
 998  998   * errors:
 999  999   * EINVAL - invalid object number.
1000 1000   * EIO - i/o error.
1001 1001   * succeeds even for free dnodes.
1002 1002   */
1003 1003  int
1004 1004  dnode_hold_impl(objset_t *os, uint64_t object, int flag,
1005 1005      void *tag, dnode_t **dnp)
1006 1006  {
1007 1007          int epb, idx, err;
1008 1008          int drop_struct_lock = FALSE;
1009 1009          int type;
1010 1010          uint64_t blk;
1011 1011          dnode_t *mdn, *dn;
1012 1012          dmu_buf_impl_t *db;
1013 1013          dnode_children_t *children_dnodes;
1014 1014          dnode_handle_t *dnh;
1015 1015  
1016 1016          /*
1017 1017           * If you are holding the spa config lock as writer, you shouldn't
1018 1018           * be asking the DMU to do *anything* unless it's the root pool
1019 1019           * which may require us to read from the root filesystem while
1020 1020           * holding some (not all) of the locks as writer.
1021 1021           */
1022 1022          ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 ||
1023 1023              (spa_is_root(os->os_spa) &&
1024 1024              spa_config_held(os->os_spa, SCL_STATE, RW_WRITER)));
1025 1025  
1026 1026          if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT) {
1027 1027                  dn = (object == DMU_USERUSED_OBJECT) ?
1028 1028                      DMU_USERUSED_DNODE(os) : DMU_GROUPUSED_DNODE(os);
1029 1029                  if (dn == NULL)
1030 1030                          return (ENOENT);
1031 1031                  type = dn->dn_type;
1032 1032                  if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE)
1033 1033                          return (ENOENT);
1034 1034                  if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)
1035 1035                          return (EEXIST);
1036 1036                  DNODE_VERIFY(dn);
1037 1037                  (void) refcount_add(&dn->dn_holds, tag);
1038 1038                  *dnp = dn;
1039 1039                  return (0);
1040 1040          }
1041 1041  
1042 1042          if (object == 0 || object >= DN_MAX_OBJECT)
1043 1043                  return (EINVAL);
1044 1044  
1045 1045          mdn = DMU_META_DNODE(os);
1046 1046          ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT);
1047 1047  
1048 1048          DNODE_VERIFY(mdn);
1049 1049  
1050 1050          if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) {
1051 1051                  rw_enter(&mdn->dn_struct_rwlock, RW_READER);
1052 1052                  drop_struct_lock = TRUE;
1053 1053          }
1054 1054  
1055 1055          blk = dbuf_whichblock(mdn, object * sizeof (dnode_phys_t));
1056 1056  
1057 1057          db = dbuf_hold(mdn, blk, FTAG);
1058 1058          if (drop_struct_lock)
1059 1059                  rw_exit(&mdn->dn_struct_rwlock);
1060 1060          if (db == NULL)
1061 1061                  return (EIO);
1062 1062          err = dbuf_read(db, NULL, DB_RF_CANFAIL);
1063 1063          if (err) {
1064 1064                  dbuf_rele(db, FTAG);
1065 1065                  return (err);
1066 1066          }
1067 1067  
1068 1068          ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
1069 1069          epb = db->db.db_size >> DNODE_SHIFT;
1070 1070  
1071 1071          idx = object & (epb-1);
1072 1072  
1073 1073          ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
1074 1074          children_dnodes = dmu_buf_get_user(&db->db);
1075 1075          if (children_dnodes == NULL) {
1076 1076                  int i;
1077 1077                  dnode_children_t *winner;
1078 1078                  children_dnodes = kmem_alloc(sizeof (dnode_children_t) +
1079 1079                      (epb - 1) * sizeof (dnode_handle_t), KM_SLEEP);
1080 1080                  children_dnodes->dnc_count = epb;
1081 1081                  dnh = &children_dnodes->dnc_children[0];
1082 1082                  for (i = 0; i < epb; i++) {
1083 1083                          zrl_init(&dnh[i].dnh_zrlock);
1084 1084                          dnh[i].dnh_dnode = NULL;
1085 1085                  }
1086 1086                  if (winner = dmu_buf_set_user(&db->db, children_dnodes, NULL,
1087 1087                      dnode_buf_pageout)) {
1088 1088                          kmem_free(children_dnodes, sizeof (dnode_children_t) +
1089 1089                              (epb - 1) * sizeof (dnode_handle_t));
1090 1090                          children_dnodes = winner;
1091 1091                  }
1092 1092          }
1093 1093          ASSERT(children_dnodes->dnc_count == epb);
1094 1094  
1095 1095          dnh = &children_dnodes->dnc_children[idx];
1096 1096          zrl_add(&dnh->dnh_zrlock);
1097 1097          if ((dn = dnh->dnh_dnode) == NULL) {
1098 1098                  dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx;
1099 1099                  dnode_t *winner;
1100 1100  
1101 1101                  dn = dnode_create(os, phys, db, object, dnh);
1102 1102                  winner = atomic_cas_ptr(&dnh->dnh_dnode, NULL, dn);
1103 1103                  if (winner != NULL) {
1104 1104                          zrl_add(&dnh->dnh_zrlock);
1105 1105                          dnode_destroy(dn); /* implicit zrl_remove() */
1106 1106                          dn = winner;
1107 1107                  }
1108 1108          }
1109 1109  
1110 1110          mutex_enter(&dn->dn_mtx);
1111 1111          type = dn->dn_type;
1112 1112          if (dn->dn_free_txg ||
1113 1113              ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) ||
1114 1114              ((flag & DNODE_MUST_BE_FREE) &&
1115 1115              (type != DMU_OT_NONE || !refcount_is_zero(&dn->dn_holds)))) {
1116 1116                  mutex_exit(&dn->dn_mtx);
1117 1117                  zrl_remove(&dnh->dnh_zrlock);
1118 1118                  dbuf_rele(db, FTAG);
1119 1119                  return (type == DMU_OT_NONE ? ENOENT : EEXIST);
1120 1120          }
1121 1121          mutex_exit(&dn->dn_mtx);
1122 1122  
1123 1123          if (refcount_add(&dn->dn_holds, tag) == 1)
1124 1124                  dbuf_add_ref(db, dnh);
1125 1125          /* Now we can rely on the hold to prevent the dnode from moving. */
1126 1126          zrl_remove(&dnh->dnh_zrlock);
1127 1127  
1128 1128          DNODE_VERIFY(dn);
1129 1129          ASSERT3P(dn->dn_dbuf, ==, db);
1130 1130          ASSERT3U(dn->dn_object, ==, object);
1131 1131          dbuf_rele(db, FTAG);
1132 1132  
1133 1133          *dnp = dn;
1134 1134          return (0);
1135 1135  }
1136 1136  
1137 1137  /*
1138 1138   * Return held dnode if the object is allocated, NULL if not.
1139 1139   */
1140 1140  int
1141 1141  dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp)
1142 1142  {
1143 1143          return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp));
1144 1144  }
1145 1145  
1146 1146  /*
1147 1147   * Can only add a reference if there is already at least one
1148 1148   * reference on the dnode.  Returns FALSE if unable to add a
1149 1149   * new reference.
1150 1150   */
1151 1151  boolean_t
1152 1152  dnode_add_ref(dnode_t *dn, void *tag)
1153 1153  {
1154 1154          mutex_enter(&dn->dn_mtx);
1155 1155          if (refcount_is_zero(&dn->dn_holds)) {
1156 1156                  mutex_exit(&dn->dn_mtx);
1157 1157                  return (FALSE);
1158 1158          }
1159 1159          VERIFY(1 < refcount_add(&dn->dn_holds, tag));
1160 1160          mutex_exit(&dn->dn_mtx);
1161 1161          return (TRUE);
1162 1162  }
1163 1163  
1164 1164  void
1165 1165  dnode_rele(dnode_t *dn, void *tag)
1166 1166  {
1167 1167          uint64_t refs;
1168 1168          /* Get while the hold prevents the dnode from moving. */
1169 1169          dmu_buf_impl_t *db = dn->dn_dbuf;
1170 1170          dnode_handle_t *dnh = dn->dn_handle;
1171 1171  
1172 1172          mutex_enter(&dn->dn_mtx);
1173 1173          refs = refcount_remove(&dn->dn_holds, tag);
1174 1174          mutex_exit(&dn->dn_mtx);
1175 1175  
1176 1176          /*
1177 1177           * It's unsafe to release the last hold on a dnode by dnode_rele() or
1178 1178           * indirectly by dbuf_rele() while relying on the dnode handle to
1179 1179           * prevent the dnode from moving, since releasing the last hold could
1180 1180           * result in the dnode's parent dbuf evicting its dnode handles. For
1181 1181           * that reason anyone calling dnode_rele() or dbuf_rele() without some
1182 1182           * other direct or indirect hold on the dnode must first drop the dnode
1183 1183           * handle.
1184 1184           */
1185 1185          ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread);
1186 1186  
1187 1187          /* NOTE: the DNODE_DNODE does not have a dn_dbuf */
1188 1188          if (refs == 0 && db != NULL) {
1189 1189                  /*
1190 1190                   * Another thread could add a hold to the dnode handle in
1191 1191                   * dnode_hold_impl() while holding the parent dbuf. Since the
1192 1192                   * hold on the parent dbuf prevents the handle from being
1193 1193                   * destroyed, the hold on the handle is OK. We can't yet assert
1194 1194                   * that the handle has zero references, but that will be
1195 1195                   * asserted anyway when the handle gets destroyed.
1196 1196                   */
1197 1197                  dbuf_rele(db, dnh);
1198 1198          }
1199 1199  }
1200 1200  
1201 1201  void
1202 1202  dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
1203 1203  {
1204 1204          objset_t *os = dn->dn_objset;
1205 1205          uint64_t txg = tx->tx_txg;
1206 1206  
1207 1207          if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
1208 1208                  dsl_dataset_dirty(os->os_dsl_dataset, tx);
1209 1209                  return;
1210 1210          }
1211 1211  
1212 1212          DNODE_VERIFY(dn);
1213 1213  
1214 1214  #ifdef ZFS_DEBUG
1215 1215          mutex_enter(&dn->dn_mtx);
1216 1216          ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg);
1217 1217          ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg);
1218 1218          mutex_exit(&dn->dn_mtx);
1219 1219  #endif
1220 1220  
1221 1221          /*
1222 1222           * Determine old uid/gid when necessary
1223 1223           */
1224 1224          dmu_objset_userquota_get_ids(dn, B_TRUE, tx);
1225 1225  
1226 1226          mutex_enter(&os->os_lock);
1227 1227  
1228 1228          /*
1229 1229           * If we are already marked dirty, we're done.
1230 1230           */
1231 1231          if (list_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) {
1232 1232                  mutex_exit(&os->os_lock);
1233 1233                  return;
1234 1234          }
1235 1235  
1236 1236          ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs));
1237 1237          ASSERT(dn->dn_datablksz != 0);
1238      -        ASSERT3U(dn->dn_next_bonuslen[txg&TXG_MASK], ==, 0);
1239      -        ASSERT3U(dn->dn_next_blksz[txg&TXG_MASK], ==, 0);
1240      -        ASSERT3U(dn->dn_next_bonustype[txg&TXG_MASK], ==, 0);
     1238 +        ASSERT0(dn->dn_next_bonuslen[txg&TXG_MASK]);
     1239 +        ASSERT0(dn->dn_next_blksz[txg&TXG_MASK]);
     1240 +        ASSERT0(dn->dn_next_bonustype[txg&TXG_MASK]);
1241 1241  
1242 1242          dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
1243 1243              dn->dn_object, txg);
1244 1244  
1245 1245          if (dn->dn_free_txg > 0 && dn->dn_free_txg <= txg) {
1246 1246                  list_insert_tail(&os->os_free_dnodes[txg&TXG_MASK], dn);
1247 1247          } else {
1248 1248                  list_insert_tail(&os->os_dirty_dnodes[txg&TXG_MASK], dn);
1249 1249          }
1250 1250  
1251 1251          mutex_exit(&os->os_lock);
1252 1252  
1253 1253          /*
1254 1254           * The dnode maintains a hold on its containing dbuf as
1255 1255           * long as there are holds on it.  Each instantiated child
1256 1256           * dbuf maintains a hold on the dnode.  When the last child
1257 1257           * drops its hold, the dnode will drop its hold on the
1258 1258           * containing dbuf. We add a "dirty hold" here so that the
1259 1259           * dnode will hang around after we finish processing its
1260 1260           * children.
1261 1261           */
1262 1262          VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg));
1263 1263  
1264 1264          (void) dbuf_dirty(dn->dn_dbuf, tx);
1265 1265  
1266 1266          dsl_dataset_dirty(os->os_dsl_dataset, tx);
1267 1267  }
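
dnode_setdirty() indexes the per-txg state arrays with txg & TXG_MASK, so
each of the TXG_SIZE in-flight transaction groups gets its own slot, and a
slot must still be zero when the dnode first becomes dirty in that txg;
that is precisely what the new ASSERT0() calls verify. A user-space sketch
of both ideas, where TXG_SIZE and the ASSERT0() definition are simplified
stand-ins for the illumos versions in the kernel headers:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define TXG_SIZE        4               /* illumos keeps 4 txgs in flight */
#define TXG_MASK        (TXG_SIZE - 1)
#define ASSERT0(x)      assert((x) == 0)        /* simplified stand-in */

int
main(void)
{
        uint32_t next_blksz[TXG_SIZE] = { 0 };
        uint64_t txg = 13;

        /* txgs 13 and 17 map to the same slot, since 13 & 3 == 17 & 3 */
        ASSERT0(next_blksz[txg & TXG_MASK]);    /* slot starts clean */
        next_blksz[txg & TXG_MASK] = 131072;

        printf("txg %llu uses slot %llu\n",
            (unsigned long long)txg, (unsigned long long)(txg & TXG_MASK));
        return (0);
}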
1268 1268  
1269 1269  void
1270 1270  dnode_free(dnode_t *dn, dmu_tx_t *tx)
1271 1271  {
1272 1272          int txgoff = tx->tx_txg & TXG_MASK;
1273 1273  
1274 1274          dprintf("dn=%p txg=%llu\n", dn, tx->tx_txg);
1275 1275  
1276 1276          /* we should be the only holder... hopefully */
1277 1277          /* ASSERT3U(refcount_count(&dn->dn_holds), ==, 1); */
1278 1278  
1279 1279          mutex_enter(&dn->dn_mtx);
1280 1280          if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) {
1281 1281                  mutex_exit(&dn->dn_mtx);
1282 1282                  return;
1283 1283          }
1284 1284          dn->dn_free_txg = tx->tx_txg;
1285 1285          mutex_exit(&dn->dn_mtx);
1286 1286  
1287 1287          /*
1288 1288           * If the dnode is already dirty, it needs to be moved from
1289 1289           * the dirty list to the free list.
1290 1290           */
1291 1291          mutex_enter(&dn->dn_objset->os_lock);
1292 1292          if (list_link_active(&dn->dn_dirty_link[txgoff])) {
1293 1293                  list_remove(&dn->dn_objset->os_dirty_dnodes[txgoff], dn);
1294 1294                  list_insert_tail(&dn->dn_objset->os_free_dnodes[txgoff], dn);
1295 1295                  mutex_exit(&dn->dn_objset->os_lock);
1296 1296          } else {
1297 1297                  mutex_exit(&dn->dn_objset->os_lock);
1298 1298                  dnode_setdirty(dn, tx);
1299 1299          }
1300 1300  }
1301 1301  
1302 1302  /*
1303 1303   * Try to change the block size for the indicated dnode.  This can only
1304 1304   * succeed if there are no blocks allocated or dirty beyond the first block.
1305 1305   */
1306 1306  int
1307 1307  dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
1308 1308  {
1309 1309          dmu_buf_impl_t *db, *db_next;
1310 1310          int err;
1311 1311  
1312 1312          if (size == 0)
1313 1313                  size = SPA_MINBLOCKSIZE;
1314 1314          if (size > SPA_MAXBLOCKSIZE)
1315 1315                  size = SPA_MAXBLOCKSIZE;
1316 1316          else
1317 1317                  size = P2ROUNDUP(size, SPA_MINBLOCKSIZE);
1318 1318  
1319 1319          if (ibs == dn->dn_indblkshift)
1320 1320                  ibs = 0;
1321 1321  
1322 1322          if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0)
1323 1323                  return (0);
1324 1324  
1325 1325          rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
1326 1326  
1327 1327          /* Check for any allocated blocks beyond the first */
1328 1328          if (dn->dn_phys->dn_maxblkid != 0)
1329 1329                  goto fail;
1330 1330  
1331 1331          mutex_enter(&dn->dn_dbufs_mtx);
1332 1332          for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
1333 1333                  db_next = list_next(&dn->dn_dbufs, db);
1334 1334  
1335 1335                  if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID &&
1336 1336                      db->db_blkid != DMU_SPILL_BLKID) {
1337 1337                          mutex_exit(&dn->dn_dbufs_mtx);
1338 1338                          goto fail;
1339 1339                  }
1340 1340          }
1341 1341          mutex_exit(&dn->dn_dbufs_mtx);
1342 1342  
1343 1343          if (ibs && dn->dn_nlevels != 1)
1344 1344                  goto fail;
1345 1345  
1346 1346          /* resize the old block */
1347 1347          err = dbuf_hold_impl(dn, 0, 0, TRUE, FTAG, &db);
1348 1348          if (err == 0)
1349 1349                  dbuf_new_size(db, size, tx);
1350 1350          else if (err != ENOENT)
1351 1351                  goto fail;
1352 1352  
1353 1353          dnode_setdblksz(dn, size);
1354 1354          dnode_setdirty(dn, tx);
1355 1355          dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size;
1356 1356          if (ibs) {
1357 1357                  dn->dn_indblkshift = ibs;
1358 1358                  dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
1359 1359          }
1360 1360          /* rele after we have fixed the blocksize in the dnode */
1361 1361          if (db)
1362 1362                  dbuf_rele(db, FTAG);
1363 1363  
1364 1364          rw_exit(&dn->dn_struct_rwlock);
1365 1365          return (0);
1366 1366  
1367 1367  fail:
1368 1368          rw_exit(&dn->dn_struct_rwlock);
1369 1369          return (ENOTSUP);
1370 1370  }
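
dnode_set_blksz() first normalizes the requested size: zero selects the
minimum, oversized requests are clamped, and anything else is rounded up
to a multiple of SPA_MINBLOCKSIZE with P2ROUNDUP(). A standalone sketch of
that arithmetic, with the constants and the P2ROUNDUP() definition copied
in for illustration (SPA_MAXBLOCKSIZE was 128K at the time of this change):

#include <stdint.h>
#include <stdio.h>

#define SPA_MINBLOCKSHIFT       9
#define SPA_MINBLOCKSIZE        (1ULL << SPA_MINBLOCKSHIFT)     /* 512 */
#define SPA_MAXBLOCKSIZE        (1ULL << 17)                    /* 128K */
/* from illumos <sys/sysmacros.h>: round x up to a power-of-2 boundary */
#define P2ROUNDUP(x, align)     (-(-(x) & -(align)))

static uint64_t
normalize_blksz(uint64_t size)
{
        if (size == 0)
                size = SPA_MINBLOCKSIZE;
        if (size > SPA_MAXBLOCKSIZE)
                size = SPA_MAXBLOCKSIZE;
        else
                size = P2ROUNDUP(size, SPA_MINBLOCKSIZE);
        return (size);
}

int
main(void)
{
        /* 1000 rounds up to 1024, 0 becomes 512, 1M clamps to 128K */
        printf("%llu %llu %llu\n",
            (unsigned long long)normalize_blksz(1000),
            (unsigned long long)normalize_blksz(0),
            (unsigned long long)normalize_blksz(1ULL << 20));
        return (0);
}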
1371 1371  
1372 1372  /* read-holding callers must not rely on the lock being continuously held */
1373 1373  void
1374 1374  dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
1375 1375  {
1376 1376          uint64_t txgoff = tx->tx_txg & TXG_MASK;
1377 1377          int epbs, new_nlevels;
1378 1378          uint64_t sz;
1379 1379  
1380 1380          ASSERT(blkid != DMU_BONUS_BLKID);
1381 1381  
1382 1382          ASSERT(have_read ?
1383 1383              RW_READ_HELD(&dn->dn_struct_rwlock) :
1384 1384              RW_WRITE_HELD(&dn->dn_struct_rwlock));
1385 1385  
1386 1386          /*
1387 1387           * if we have a read-lock, check to see if we need to do any work
1388 1388           * before upgrading to a write-lock.
1389 1389           */
1390 1390          if (have_read) {
1391 1391                  if (blkid <= dn->dn_maxblkid)
1392 1392                          return;
1393 1393  
1394 1394                  if (!rw_tryupgrade(&dn->dn_struct_rwlock)) {
1395 1395                          rw_exit(&dn->dn_struct_rwlock);
1396 1396                          rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
1397 1397                  }
1398 1398          }
1399 1399  
1400 1400          if (blkid <= dn->dn_maxblkid)
1401 1401                  goto out;
1402 1402  
1403 1403          dn->dn_maxblkid = blkid;
1404 1404  
1405 1405          /*
1406 1406           * Compute the number of levels necessary to support the new maxblkid.
1407 1407           */
1408 1408          new_nlevels = 1;
1409 1409          epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1410 1410          for (sz = dn->dn_nblkptr;
1411 1411              sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs)
1412 1412                  new_nlevels++;
1413 1413  
1414 1414          if (new_nlevels > dn->dn_nlevels) {
1415 1415                  int old_nlevels = dn->dn_nlevels;
1416 1416                  dmu_buf_impl_t *db;
1417 1417                  list_t *list;
1418 1418                  dbuf_dirty_record_t *new, *dr, *dr_next;
1419 1419  
1420 1420                  dn->dn_nlevels = new_nlevels;
1421 1421  
1422 1422                  ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]);
1423 1423                  dn->dn_next_nlevels[txgoff] = new_nlevels;
1424 1424  
1425 1425                  /* dirty the left indirects */
1426 1426                  db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
1427 1427                  ASSERT(db != NULL);
1428 1428                  new = dbuf_dirty(db, tx);
1429 1429                  dbuf_rele(db, FTAG);
1430 1430  
1431 1431                  /* transfer the dirty records to the new indirect */
1432 1432                  mutex_enter(&dn->dn_mtx);
1433 1433                  mutex_enter(&new->dt.di.dr_mtx);
1434 1434                  list = &dn->dn_dirty_records[txgoff];
1435 1435                  for (dr = list_head(list); dr; dr = dr_next) {
1436 1436                          dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
1437 1437                          if (dr->dr_dbuf->db_level != new_nlevels-1 &&
1438 1438                              dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
1439 1439                              dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
1440 1440                                  ASSERT(dr->dr_dbuf->db_level == old_nlevels-1);
1441 1441                                  list_remove(&dn->dn_dirty_records[txgoff], dr);
1442 1442                                  list_insert_tail(&new->dt.di.dr_children, dr);
1443 1443                                  dr->dr_parent = new;
1444 1444                          }
1445 1445                  }
1446 1446                  mutex_exit(&new->dt.di.dr_mtx);
1447 1447                  mutex_exit(&dn->dn_mtx);
1448 1448          }
1449 1449  
1450 1450  out:
1451 1451          if (have_read)
1452 1452                  rw_downgrade(&dn->dn_struct_rwlock);
1453 1453  }
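
The level computation in dnode_new_blkid() grows sz by a factor of 2^epbs
(block pointers per indirect block) per level until it covers the new
maxblkid; the "sz >= dn->dn_nblkptr" clause stops the loop if sz ever wraps
around on a shift. A standalone sketch with illustrative values (nblkptr =
3 direct block pointers, epbs = 7):

#include <stdint.h>
#include <stdio.h>

static int
levels_needed(uint64_t blkid, uint64_t nblkptr, int epbs)
{
        int nlevels = 1;
        uint64_t sz;

        /* same loop shape as dnode_new_blkid(); stop if sz wraps */
        for (sz = nblkptr; sz <= blkid && sz >= nblkptr; sz <<= epbs)
                nlevels++;
        return (nlevels);
}

int
main(void)
{
        /* blkids 0..2 fit in the 3 direct blkptrs; 3..383 add a level */
        printf("%d %d %d\n",
            levels_needed(2, 3, 7),     /* 1 */
            levels_needed(3, 3, 7),     /* 2 */
            levels_needed(384, 3, 7));  /* 3 */
        return (0);
}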
1454 1454  
1455 1455  void
1456 1456  dnode_clear_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
1457 1457  {
1458 1458          avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK];
1459 1459          avl_index_t where;
1460 1460          free_range_t *rp;
1461 1461          free_range_t rp_tofind;
1462 1462          uint64_t endblk = blkid + nblks;
1463 1463  
1464 1464          ASSERT(MUTEX_HELD(&dn->dn_mtx));
1465 1465          ASSERT(nblks <= UINT64_MAX - blkid); /* no overflow */
1466 1466  
1467 1467          dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
1468 1468              blkid, nblks, tx->tx_txg);
1469 1469          rp_tofind.fr_blkid = blkid;
1470 1470          rp = avl_find(tree, &rp_tofind, &where);
1471 1471          if (rp == NULL)
1472 1472                  rp = avl_nearest(tree, where, AVL_BEFORE);
1473 1473          if (rp == NULL)
1474 1474                  rp = avl_nearest(tree, where, AVL_AFTER);
1475 1475  
1476 1476          while (rp && rp->fr_blkid <= endblk) {
1477 1477                  uint64_t fr_endblk = rp->fr_blkid + rp->fr_nblks;
1478 1478                  free_range_t *nrp = AVL_NEXT(tree, rp);
1479 1479  
1480 1480                  if (blkid <= rp->fr_blkid && endblk >= fr_endblk) {
1481 1481                          /* clear this entire range */
1482 1482                          avl_remove(tree, rp);
1483 1483                          kmem_free(rp, sizeof (free_range_t));
1484 1484                  } else if (blkid <= rp->fr_blkid &&
1485 1485                      endblk > rp->fr_blkid && endblk < fr_endblk) {
1486 1486                          /* clear the beginning of this range */
1487 1487                          rp->fr_blkid = endblk;
1488 1488                          rp->fr_nblks = fr_endblk - endblk;
1489 1489                  } else if (blkid > rp->fr_blkid && blkid < fr_endblk &&
1490 1490                      endblk >= fr_endblk) {
1491 1491                          /* clear the end of this range */
1492 1492                          rp->fr_nblks = blkid - rp->fr_blkid;
1493 1493                  } else if (blkid > rp->fr_blkid && endblk < fr_endblk) {
1494 1494                          /* clear a chunk out of this range */
1495 1495                          free_range_t *new_rp =
1496 1496                              kmem_alloc(sizeof (free_range_t), KM_SLEEP);
1497 1497  
1498 1498                          new_rp->fr_blkid = endblk;
1499 1499                          new_rp->fr_nblks = fr_endblk - endblk;
1500 1500                          avl_insert_here(tree, new_rp, rp, AVL_AFTER);
1501 1501                          rp->fr_nblks = blkid - rp->fr_blkid;
1502 1502                  }
1503 1503                  /* there may be no overlap */
1504 1504                  rp = nrp;
1505 1505          }
1506 1506  }
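
dnode_clear_range() distinguishes four overlaps between the clear request
[blkid, endblk) and an existing free range [fr_blkid, fr_endblk): full cover
(remove the range), head overlap (trim its front), tail overlap (trim its
back), and interior (split it in two). A standalone sketch of the same case
analysis on a single range, using a plain struct in place of the AVL tree;
clear_from_range() is an illustrative name:

#include <stdint.h>
#include <stdio.h>

typedef struct {
        uint64_t fr_blkid;
        uint64_t fr_nblks;
} range_t;

/* Clear [blkid, endblk) out of *rp; may produce a second range in *split. */
static void
clear_from_range(range_t *rp, uint64_t blkid, uint64_t endblk,
    range_t *split, int *nranges)
{
        uint64_t fr_endblk = rp->fr_blkid + rp->fr_nblks;

        *nranges = 1;
        if (blkid <= rp->fr_blkid && endblk >= fr_endblk) {
                *nranges = 0;                           /* fully covered */
        } else if (blkid <= rp->fr_blkid &&
            endblk > rp->fr_blkid && endblk < fr_endblk) {
                rp->fr_blkid = endblk;                  /* trim the front */
                rp->fr_nblks = fr_endblk - endblk;
        } else if (blkid > rp->fr_blkid && blkid < fr_endblk &&
            endblk >= fr_endblk) {
                rp->fr_nblks = blkid - rp->fr_blkid;    /* trim the back */
        } else if (blkid > rp->fr_blkid && endblk < fr_endblk) {
                split->fr_blkid = endblk;               /* split in two */
                split->fr_nblks = fr_endblk - endblk;
                rp->fr_nblks = blkid - rp->fr_blkid;
                *nranges = 2;
        }
        /* else: no overlap, range untouched */
}

int
main(void)
{
        range_t r = { 10, 10 }, s;      /* covers blkids [10, 20) */
        int n;

        clear_from_range(&r, 12, 15, &s, &n);   /* interior clear */
        /* leaves [10, 12) and [15, 20) */
        printf("n=%d [%llu,+%llu) [%llu,+%llu)\n", n,
            (unsigned long long)r.fr_blkid, (unsigned long long)r.fr_nblks,
            (unsigned long long)s.fr_blkid, (unsigned long long)s.fr_nblks);
        return (0);
}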
1507 1507  
1508 1508  void
1509 1509  dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
1510 1510  {
1511 1511          dmu_buf_impl_t *db;
1512 1512          uint64_t blkoff, blkid, nblks;
1513 1513          int blksz, blkshift, head, tail;
1514 1514          int trunc = FALSE;
1515 1515          int epbs;
1516 1516  
1517 1517          rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
1518 1518          blksz = dn->dn_datablksz;
1519 1519          blkshift = dn->dn_datablkshift;
1520 1520          epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1521 1521  
1522 1522          if (len == -1ULL) {
1523 1523                  len = UINT64_MAX - off;
1524 1524                  trunc = TRUE;
1525 1525          }
1526 1526  
1527 1527          /*
1528 1528           * First, block align the region to free:
1529 1529           */
1530 1530          if (ISP2(blksz)) {
1531 1531                  head = P2NPHASE(off, blksz);
1532 1532                  blkoff = P2PHASE(off, blksz);
1533 1533                  if ((off >> blkshift) > dn->dn_maxblkid)
1534 1534                          goto out;
1535 1535          } else {
1536 1536                  ASSERT(dn->dn_maxblkid == 0);
1537 1537                  if (off == 0 && len >= blksz) {
1538 1538                          /* Freeing the whole block; fast-track this request */
1539 1539                          blkid = 0;
1540 1540                          nblks = 1;
1541 1541                          goto done;
1542 1542                  } else if (off >= blksz) {
1543 1543                          /* Freeing past end-of-data */
1544 1544                          goto out;
1545 1545                  } else {
1546 1546                          /* Freeing part of the block. */
1547 1547                          head = blksz - off;
1548 1548                          ASSERT3U(head, >, 0);
1549 1549                  }
1550 1550                  blkoff = off;
1551 1551          }
1552 1552          /* zero out any partial block data at the start of the range */
1553 1553          if (head) {
1554 1554                  ASSERT3U(blkoff + head, ==, blksz);
1555 1555                  if (len < head)
1556 1556                          head = len;
1557 1557                  if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE,
1558 1558                      FTAG, &db) == 0) {
1559 1559                          caddr_t data;
1560 1560  
1561 1561                          /* don't dirty if it isn't on disk and isn't dirty */
1562 1562                          if (db->db_last_dirty ||
1563 1563                              (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
1564 1564                                  rw_exit(&dn->dn_struct_rwlock);
1565 1565                                  dbuf_will_dirty(db, tx);
1566 1566                                  rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
1567 1567                                  data = db->db.db_data;
1568 1568                                  bzero(data + blkoff, head);
1569 1569                          }
1570 1570                          dbuf_rele(db, FTAG);
1571 1571                  }
1572 1572                  off += head;
1573 1573                  len -= head;
1574 1574          }
1575 1575  
1576 1576          /* If the range was less than one block, we're done */
1577 1577          if (len == 0)
1578 1578                  goto out;
1579 1579  
1580 1580          /* If the remaining range is past end of file, we're done */
1581 1581          if ((off >> blkshift) > dn->dn_maxblkid)
1582 1582                  goto out;
1583 1583  
1584 1584          ASSERT(ISP2(blksz));
1585 1585          if (trunc)
1586 1586                  tail = 0;
1587 1587          else
1588 1588                  tail = P2PHASE(len, blksz);
1589 1589  
1590      -        ASSERT3U(P2PHASE(off, blksz), ==, 0);
     1590 +        ASSERT0(P2PHASE(off, blksz));
1591 1591          /* zero out any partial block data at the end of the range */
1592 1592          if (tail) {
1593 1593                  if (len < tail)
1594 1594                          tail = len;
1595 1595                  if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len),
1596 1596                      TRUE, FTAG, &db) == 0) {
1597 1597                          /* don't dirty if not on disk and not dirty */
1598 1598                          if (db->db_last_dirty ||
1599 1599                              (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
1600 1600                                  rw_exit(&dn->dn_struct_rwlock);
1601 1601                                  dbuf_will_dirty(db, tx);
1602 1602                                  rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
1603 1603                                  bzero(db->db.db_data, tail);
1604 1604                          }
1605 1605                          dbuf_rele(db, FTAG);
1606 1606                  }
1607 1607                  len -= tail;
1608 1608          }
1609 1609  
1610 1610          /* If the range did not include a full block, we are done */
1611 1611          if (len == 0)
1612 1612                  goto out;
1613 1613  
1614 1614          ASSERT(IS_P2ALIGNED(off, blksz));
1615 1615          ASSERT(trunc || IS_P2ALIGNED(len, blksz));
1616 1616          blkid = off >> blkshift;
1617 1617          nblks = len >> blkshift;
1618 1618          if (trunc)
1619 1619                  nblks += 1;
1620 1620  
1621 1621          /*
1622 1622           * Read in and mark all the level-1 indirects dirty,
1623 1623           * so that they will stay in memory until syncing phase.
1624 1624           * Always dirty the first and last indirect to make sure
1625 1625           * we dirty all the partial indirects.
1626 1626           */
1627 1627          if (dn->dn_nlevels > 1) {
1628 1628                  uint64_t i, first, last;
1629 1629                  int shift = epbs + dn->dn_datablkshift;
1630 1630  
1631 1631                  first = blkid >> epbs;
1632 1632                  if ((db = dbuf_hold_level(dn, 1, first, FTAG)) != NULL) {
1633 1633                          dbuf_will_dirty(db, tx);
1634 1634                          dbuf_rele(db, FTAG);
1635 1635                  }
1636 1636                  if (trunc)
1637 1637                          last = dn->dn_maxblkid >> epbs;
1638 1638                  else
1639 1639                          last = (blkid + nblks - 1) >> epbs;
1640 1640                  if (last > first && (db = dbuf_hold_level(dn, 1, last, FTAG))) {
1641 1641                          dbuf_will_dirty(db, tx);
1642 1642                          dbuf_rele(db, FTAG);
1643 1643                  }
1644 1644                  for (i = first + 1; i < last; i++) {
1645 1645                          uint64_t ibyte = i << shift;
1646 1646                          int err;
1647 1647  
1648 1648                          err = dnode_next_offset(dn,
1649 1649                              DNODE_FIND_HAVELOCK, &ibyte, 1, 1, 0);
1650 1650                          i = ibyte >> shift;
1651 1651                          if (err == ESRCH || i >= last)
1652 1652                                  break;
1653 1653                          ASSERT(err == 0);
1654 1654                          db = dbuf_hold_level(dn, 1, i, FTAG);
1655 1655                          if (db) {
1656 1656                                  dbuf_will_dirty(db, tx);
1657 1657                                  dbuf_rele(db, FTAG);
1658 1658                          }
1659 1659                  }
1660 1660          }
1661 1661  done:
1662 1662          /*
1663 1663           * Add this range to the dnode range list.
1664 1664           * We will finish up this free operation in the syncing phase.
1665 1665           */
1666 1666          mutex_enter(&dn->dn_mtx);
1667 1667          dnode_clear_range(dn, blkid, nblks, tx);
1668 1668          {
1669 1669                  free_range_t *rp, *found;
1670 1670                  avl_index_t where;
1671 1671                  avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK];
1672 1672  
1673 1673                  /* Add new range to dn_ranges */
1674 1674                  rp = kmem_alloc(sizeof (free_range_t), KM_SLEEP);
1675 1675                  rp->fr_blkid = blkid;
1676 1676                  rp->fr_nblks = nblks;
1677 1677                  found = avl_find(tree, rp, &where);
1678 1678                  ASSERT(found == NULL);
1679 1679                  avl_insert(tree, rp, where);
1680 1680                  dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
1681 1681                      blkid, nblks, tx->tx_txg);
1682 1682          }
1683 1683          mutex_exit(&dn->dn_mtx);
1684 1684  
1685 1685          dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
1686 1686          dnode_setdirty(dn, tx);
1687 1687  out:
1688 1688          if (trunc && dn->dn_maxblkid >= (off >> blkshift))
1689 1689                  dn->dn_maxblkid = (off >> blkshift ? (off >> blkshift) - 1 : 0);
1690 1690  
1691 1691          rw_exit(&dn->dn_struct_rwlock);
1692 1692  }
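
The head/tail handling in dnode_free_range() leans on the power-of-2 phase
macros: P2PHASE(off, blksz) is the offset within a block and
P2NPHASE(off, blksz) is the distance to the next block boundary, so for a
partial first block the two must sum to blksz, which the function asserts.
A standalone check of that identity, with the macro definitions copied in
from <sys/sysmacros.h> for illustration:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* from illumos <sys/sysmacros.h>; 'align' must be a power of 2 */
#define P2PHASE(x, align)       ((x) & ((align) - 1))
#define P2NPHASE(x, align)      (-(x) & ((align) - 1))

int
main(void)
{
        uint64_t off = 1000, blksz = 512;
        uint64_t blkoff = P2PHASE(off, blksz);  /* 488: offset in block */
        uint64_t head = P2NPHASE(off, blksz);   /* 24: bytes to boundary */

        /* the identity dnode_free_range() asserts for a partial head */
        assert(blkoff + head == blksz);
        printf("blkoff=%llu head=%llu\n",
            (unsigned long long)blkoff, (unsigned long long)head);
        return (0);
}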
1693 1693  
1694 1694  static boolean_t
1695 1695  dnode_spill_freed(dnode_t *dn)
1696 1696  {
1697 1697          int i;
1698 1698  
1699 1699          mutex_enter(&dn->dn_mtx);
1700 1700          for (i = 0; i < TXG_SIZE; i++) {
1701 1701                  if (dn->dn_rm_spillblk[i] == DN_KILL_SPILLBLK)
1702 1702                          break;
1703 1703          }
1704 1704          mutex_exit(&dn->dn_mtx);
1705 1705          return (i < TXG_SIZE);
1706 1706  }
1707 1707  
1708 1708  /* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */
1709 1709  uint64_t
1710 1710  dnode_block_freed(dnode_t *dn, uint64_t blkid)
1711 1711  {
1712 1712          free_range_t range_tofind;
1713 1713          void *dp = spa_get_dsl(dn->dn_objset->os_spa);
1714 1714          int i;
1715 1715  
1716 1716          if (blkid == DMU_BONUS_BLKID)
1717 1717                  return (FALSE);
1718 1718  
1719 1719          /*
1720 1720           * If we're in the process of opening the pool, dp will not be
1721 1721           * set yet, but there shouldn't be anything dirty.
1722 1722           */
1723 1723          if (dp == NULL)
1724 1724                  return (FALSE);
1725 1725  
1726 1726          if (dn->dn_free_txg)
1727 1727                  return (TRUE);
1728 1728  
1729 1729          if (blkid == DMU_SPILL_BLKID)
1730 1730                  return (dnode_spill_freed(dn));
1731 1731  
1732 1732          range_tofind.fr_blkid = blkid;
1733 1733          mutex_enter(&dn->dn_mtx);
1734 1734          for (i = 0; i < TXG_SIZE; i++) {
1735 1735                  free_range_t *range_found;
1736 1736                  avl_index_t idx;
1737 1737  
1738 1738                  range_found = avl_find(&dn->dn_ranges[i], &range_tofind, &idx);
1739 1739                  if (range_found) {
1740 1740                          ASSERT(range_found->fr_nblks > 0);
1741 1741                          break;
1742 1742                  }
1743 1743                  range_found = avl_nearest(&dn->dn_ranges[i], idx, AVL_BEFORE);
1744 1744                  if (range_found &&
1745 1745                      range_found->fr_blkid + range_found->fr_nblks > blkid)
1746 1746                          break;
1747 1747          }
1748 1748          mutex_exit(&dn->dn_mtx);
1749 1749          return (i < TXG_SIZE);
1750 1750  }
1751 1751  
1752 1752  /* call from syncing context when we actually write/free space for this dnode */
1753 1753  void
1754 1754  dnode_diduse_space(dnode_t *dn, int64_t delta)
1755 1755  {
1756 1756          uint64_t space;
1757 1757          dprintf_dnode(dn, "dn=%p dnp=%p used=%llu delta=%lld\n",
1758 1758              dn, dn->dn_phys,
1759 1759              (u_longlong_t)dn->dn_phys->dn_used,
1760 1760              (longlong_t)delta);
1761 1761  
1762 1762          mutex_enter(&dn->dn_mtx);
1763 1763          space = DN_USED_BYTES(dn->dn_phys);
1764 1764          if (delta > 0) {
1765 1765                  ASSERT3U(space + delta, >=, space); /* no overflow */
1766 1766          } else {
1767 1767                  ASSERT3U(space, >=, -delta); /* no underflow */
1768 1768          }
1769 1769          space += delta;
1770 1770          if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_DNODE_BYTES) {
1771 1771                  ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0);
1772      -                ASSERT3U(P2PHASE(space, 1<<DEV_BSHIFT), ==, 0);
     1772 +                ASSERT0(P2PHASE(space, 1<<DEV_BSHIFT));
1773 1773                  dn->dn_phys->dn_used = space >> DEV_BSHIFT;
1774 1774          } else {
1775 1775                  dn->dn_phys->dn_used = space;
1776 1776                  dn->dn_phys->dn_flags |= DNODE_FLAG_USED_BYTES;
1777 1777          }
1778 1778          mutex_exit(&dn->dn_mtx);
1779 1779  }
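
On pools older than SPA_VERSION_DNODE_BYTES, dn_used stores 512-byte disk
sectors rather than bytes, which is why dnode_diduse_space() asserts that
the byte count is sector-aligned (the new ASSERT0(P2PHASE(...)) form)
before shifting it down by DEV_BSHIFT. A standalone sketch of the two
encodings, with DEV_BSHIFT = 9 as on illumos:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define DEV_BSHIFT      9                       /* 512-byte disk sectors */
#define P2PHASE(x, align)       ((x) & ((align) - 1))

int
main(void)
{
        uint64_t space = 3 * 512;               /* bytes used */

        /* legacy encoding: must be sector-aligned, stored as sectors */
        assert(P2PHASE(space, (uint64_t)1 << DEV_BSHIFT) == 0);
        printf("legacy dn_used = %llu sectors\n",
            (unsigned long long)(space >> DEV_BSHIFT));

        /* SPA_VERSION_DNODE_BYTES encoding: raw byte count plus a flag */
        printf("byte-accounting dn_used = %llu bytes\n",
            (unsigned long long)space);
        return (0);
}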
1780 1780  
1781 1781  /*
1782 1782   * Call when we think we're going to write/free space in open context.
1783 1783   * Be conservative (i.e. OK to write less than this or free more than
1784 1784   * this, but don't write more or free less).
1785 1785   */
1786 1786  void
1787 1787  dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
1788 1788  {
1789 1789          objset_t *os = dn->dn_objset;
1790 1790          dsl_dataset_t *ds = os->os_dsl_dataset;
1791 1791  
1792 1792          if (space > 0)
1793 1793                  space = spa_get_asize(os->os_spa, space);
1794 1794  
1795 1795          if (ds)
1796 1796                  dsl_dir_willuse_space(ds->ds_dir, space, tx);
1797 1797  
1798 1798          dmu_tx_willuse_space(tx, space);
1799 1799  }
1800 1800  
1801 1801  /*
1802 1802   * This function scans a block at the indicated "level" looking for
1803 1803   * a hole or data (depending on 'flags').  If level > 0, then we are
1804 1804   * scanning an indirect block looking at its pointers.  If level == 0,
1805 1805   * then we are looking at a block of dnodes.  If we don't find what we
1806 1806   * are looking for in the block, we return ESRCH.  Otherwise, return
1807 1807   * with *offset pointing to the beginning (if searching forwards) or
1808 1808   * end (if searching backwards) of the range covered by the block
1809 1809   * pointer we matched on (or dnode).
1810 1810   *
1811 1811   * The basic search algorithm used below by dnode_next_offset() is to
1812 1812   * use this function to search up the block tree (widen the search) until
1813 1813   * we find something (i.e., we don't return ESRCH) and then search back
1814 1814   * down the tree (narrow the search) until we reach our original search
1815 1815   * level.
1816 1816   */
1817 1817  static int
1818 1818  dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
1819 1819          int lvl, uint64_t blkfill, uint64_t txg)
1820 1820  {
1821 1821          dmu_buf_impl_t *db = NULL;
1822 1822          void *data = NULL;
1823 1823          uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
1824 1824          uint64_t epb = 1ULL << epbs;
1825 1825          uint64_t minfill, maxfill;
1826 1826          boolean_t hole;
1827 1827          int i, inc, error, span;
1828 1828  
1829 1829          dprintf("probing object %llu offset %llx level %d of %u\n",
1830 1830              dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels);
1831 1831  
1832 1832          hole = ((flags & DNODE_FIND_HOLE) != 0);
1833 1833          inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
1834 1834          ASSERT(txg == 0 || !hole);
1835 1835  
1836 1836          if (lvl == dn->dn_phys->dn_nlevels) {
1837 1837                  error = 0;
1838 1838                  epb = dn->dn_phys->dn_nblkptr;
1839 1839                  data = dn->dn_phys->dn_blkptr;
1840 1840          } else {
1841 1841                  uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl);
1842 1842                  error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db);
1843 1843                  if (error) {
1844 1844                          if (error != ENOENT)
1845 1845                                  return (error);
1846 1846                          if (hole)
1847 1847                                  return (0);
1848 1848                          /*
1849 1849                           * This can only happen when we are searching up
1850 1850                           * the block tree for data.  We don't really need to
1851 1851                           * adjust the offset, as we will just end up looking
1852 1852                           * at the pointer to this block in its parent, and it's
1853 1853                           * going to be unallocated, so we will skip over it.
1854 1854                           */
1855 1855                          return (ESRCH);
1856 1856                  }
1857 1857                  error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT);
1858 1858                  if (error) {
1859 1859                          dbuf_rele(db, FTAG);
1860 1860                          return (error);
1861 1861                  }
1862 1862                  data = db->db.db_data;
1863 1863          }
1864 1864  
1865 1865          if (db && txg &&
1866 1866              (db->db_blkptr == NULL || db->db_blkptr->blk_birth <= txg)) {
1867 1867                  /*
1868 1868                   * This can only happen when we are searching up the tree
1869 1869                   * and these conditions mean that we need to keep climbing.
1870 1870                   */
1871 1871                  error = ESRCH;
1872 1872          } else if (lvl == 0) {
1873 1873                  dnode_phys_t *dnp = data;
1874 1874                  span = DNODE_SHIFT;
1875 1875                  ASSERT(dn->dn_type == DMU_OT_DNODE);
1876 1876  
1877 1877                  for (i = (*offset >> span) & (blkfill - 1);
1878 1878                      i >= 0 && i < blkfill; i += inc) {
1879 1879                          if ((dnp[i].dn_type == DMU_OT_NONE) == hole)
1880 1880                                  break;
1881 1881                          *offset += (1ULL << span) * inc;
1882 1882                  }
1883 1883                  if (i < 0 || i == blkfill)
1884 1884                          error = ESRCH;
1885 1885          } else {
1886 1886                  blkptr_t *bp = data;
1887 1887                  uint64_t start = *offset;
1888 1888                  span = (lvl - 1) * epbs + dn->dn_datablkshift;
1889 1889                  minfill = 0;
1890 1890                  maxfill = blkfill << ((lvl - 1) * epbs);
1891 1891  
1892 1892                  if (hole)
1893 1893                          maxfill--;
1894 1894                  else
1895 1895                          minfill++;
1896 1896  
1897 1897                  *offset = *offset >> span;
1898 1898                  for (i = BF64_GET(*offset, 0, epbs);
1899 1899                      i >= 0 && i < epb; i += inc) {
1900 1900                          if (bp[i].blk_fill >= minfill &&
1901 1901                              bp[i].blk_fill <= maxfill &&
1902 1902                              (hole || bp[i].blk_birth > txg))
1903 1903                                  break;
1904 1904                          if (inc > 0 || *offset > 0)
1905 1905                                  *offset += inc;
1906 1906                  }
1907 1907                  *offset = *offset << span;
1908 1908                  if (inc < 0) {
1909 1909                          /* traversing backwards; position offset at the end */
1910 1910                          ASSERT3U(*offset, <=, start);
1911 1911                          *offset = MIN(*offset + (1ULL << span) - 1, start);
1912 1912                  } else if (*offset < start) {
1913 1913                          *offset = start;
1914 1914                  }
1915 1915                  if (i < 0 || i >= epb)
1916 1916                          error = ESRCH;
1917 1917          }
1918 1918  
1919 1919          if (db)
1920 1920                  dbuf_rele(db, FTAG);
1921 1921  
1922 1922          return (error);
1923 1923  }
1924 1924  
1925 1925  /*
1926 1926   * Find the next hole, data, or sparse region at or after *offset.
1927 1927   * The value 'blkfill' tells us how many items we expect to find
1928 1928   * in an L0 data block; this value is 1 for normal objects,
1929 1929   * DNODES_PER_BLOCK for the meta dnode, and some fraction of
1930 1930   * DNODES_PER_BLOCK when searching for sparse regions thereof.
1931 1931   *
1932 1932   * Examples:
1933 1933   *
1934 1934   * dnode_next_offset(dn, flags, offset, 1, 1, 0);
1935 1935   *      Finds the next/previous hole/data in a file.
1936 1936   *      Used in dmu_offset_next().
1937 1937   *
1938 1938   * dnode_next_offset(mdn, flags, offset, 0, DNODES_PER_BLOCK, txg);
1939 1939   *      Finds the next free/allocated dnode in an objset's meta-dnode.
1940 1940   *      Only finds objects that have new contents since txg (ie.
1941 1941   *      bonus buffer changes and content removal are ignored).
1942 1942   *      Used in dmu_object_next().
1943 1943   *
1944 1944   * dnode_next_offset(mdn, DNODE_FIND_HOLE, offset, 2, DNODES_PER_BLOCK >> 2, 0);
1945 1945   *      Finds the next L2 meta-dnode bp that's at most 1/4 full.
1946 1946   *      Used in dmu_object_alloc().
1947 1947   */
1948 1948  int
1949 1949  dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
1950 1950      int minlvl, uint64_t blkfill, uint64_t txg)
1951 1951  {
1952 1952          uint64_t initial_offset = *offset;
1953 1953          int lvl, maxlvl;
1954 1954          int error = 0;
1955 1955  
1956 1956          if (!(flags & DNODE_FIND_HAVELOCK))
1957 1957                  rw_enter(&dn->dn_struct_rwlock, RW_READER);
1958 1958  
1959 1959          if (dn->dn_phys->dn_nlevels == 0) {
1960 1960                  error = ESRCH;
1961 1961                  goto out;
1962 1962          }
1963 1963  
1964 1964          if (dn->dn_datablkshift == 0) {
1965 1965                  if (*offset < dn->dn_datablksz) {
1966 1966                          if (flags & DNODE_FIND_HOLE)
1967 1967                                  *offset = dn->dn_datablksz;
1968 1968                  } else {
1969 1969                          error = ESRCH;
1970 1970                  }
1971 1971                  goto out;
1972 1972          }
1973 1973  
1974 1974          maxlvl = dn->dn_phys->dn_nlevels;
1975 1975  
1976 1976          for (lvl = minlvl; lvl <= maxlvl; lvl++) {
1977 1977                  error = dnode_next_offset_level(dn,
1978 1978                      flags, offset, lvl, blkfill, txg);
1979 1979                  if (error != ESRCH)
1980 1980                          break;
1981 1981          }
1982 1982  
1983 1983          while (error == 0 && --lvl >= minlvl) {
1984 1984                  error = dnode_next_offset_level(dn,
1985 1985                      flags, offset, lvl, blkfill, txg);
1986 1986          }
1987 1987  
1988 1988          if (error == 0 && (flags & DNODE_FIND_BACKWARDS ?
1989 1989              initial_offset < *offset : initial_offset > *offset))
1990 1990                  error = ESRCH;
1991 1991  out:
1992 1992          if (!(flags & DNODE_FIND_HAVELOCK))
1993 1993                  rw_exit(&dn->dn_struct_rwlock);
1994 1994  
1995 1995          return (error);
1996 1996  }
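
As the first example in the comment above suggests, a hole-finding caller
drives dnode_next_offset() with minlvl = 1 and blkfill = 1. A minimal
sketch assuming the ZFS kernel context; example_next_hole() is a
hypothetical wrapper, not part of this file:

#include <sys/zfs_context.h>
#include <sys/dnode.h>

/*
 * Find the first hole at or after *off in a held dnode, in the style of
 * dmu_offset_next().  Returns ESRCH if no hole is found.
 */
static int
example_next_hole(dnode_t *dn, uint64_t *off)
{
        return (dnode_next_offset(dn, DNODE_FIND_HOLE, off, 1, 1, 0));
}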