1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2012 by Delphix. All rights reserved.
  24  */
  25 
  26 #include <sys/arc.h>
  27 #include <sys/bptree.h>
  28 #include <sys/dmu.h>
  29 #include <sys/dmu_objset.h>
  30 #include <sys/dmu_tx.h>
  31 #include <sys/dmu_traverse.h>
  32 #include <sys/dsl_dataset.h>
  33 #include <sys/dsl_dir.h>
  34 #include <sys/dsl_pool.h>
  35 #include <sys/dnode.h>
  36 #include <sys/refcount.h>
  37 #include <sys/spa.h>
  38 
  39 /*
  40  * A bptree is a queue of root block pointers from destroyed datasets. When a
  41  * dataset is destroyed its root block pointer is put on the end of the pool's
  42  * bptree queue so the dataset's blocks can be freed asynchronously by
  43  * dsl_scan_sync. This allows the delete operation to finish without traversing
  44  * all the dataset's blocks.
  45  *
  46  * Note that while bt_begin and bt_end are only ever incremented in this code
  47  * they are effectively reset to 0 every time the entire bptree is freed because
  48  * the bptree's object is destroyed and re-created.
  49  */
  50 
  51 struct bptree_args {
  52         bptree_phys_t *ba_phys; /* data in bonus buffer, dirtied if freeing */
  53         boolean_t ba_free;      /* true if freeing during traversal */
  54 
  55         bptree_itor_t *ba_func; /* function to call for each blockpointer */
  56         void *ba_arg;           /* caller supplied argument to ba_func */
  57         dmu_tx_t *ba_tx;        /* caller supplied tx, NULL if not freeing */
  58 } bptree_args_t;
  59 
  60 uint64_t
  61 bptree_alloc(objset_t *os, dmu_tx_t *tx)
  62 {
  63         uint64_t obj;
  64         dmu_buf_t *db;
  65         bptree_phys_t *bt;
  66 
  67         obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA,
  68             SPA_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA,
  69             sizeof (bptree_phys_t), tx);
  70 
  71         /*
  72          * Bonus buffer contents are already initialized to 0, but for
  73          * readability we make it explicit.
  74          */
  75         VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
  76         dmu_buf_will_dirty(db, tx);
  77         bt = db->db_data;
  78         bt->bt_begin = 0;
  79         bt->bt_end = 0;
  80         bt->bt_bytes = 0;
  81         bt->bt_comp = 0;
  82         bt->bt_uncomp = 0;
  83         dmu_buf_rele(db, FTAG);
  84 
  85         return (obj);
  86 }
  87 
  88 int
  89 bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
  90 {
  91         dmu_buf_t *db;
  92         bptree_phys_t *bt;
  93 
  94         VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
  95         bt = db->db_data;
  96         ASSERT3U(bt->bt_begin, ==, bt->bt_end);
  97         ASSERT3U(bt->bt_bytes, ==, 0);
  98         ASSERT3U(bt->bt_comp, ==, 0);
  99         ASSERT3U(bt->bt_uncomp, ==, 0);
 100         dmu_buf_rele(db, FTAG);
 101 
 102         return (dmu_object_free(os, obj, tx));
 103 }
 104 
 105 void
 106 bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
 107     uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx)
 108 {
 109         dmu_buf_t *db;
 110         bptree_phys_t *bt;
 111         bptree_entry_phys_t bte;
 112 
 113         /*
 114          * bptree objects are in the pool mos, therefore they can only be
 115          * modified in syncing context. Furthermore, this is only modified
 116          * by the sync thread, so no locking is necessary.
 117          */
 118         ASSERT(dmu_tx_is_syncing(tx));
 119 
 120         VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
 121         bt = db->db_data;
 122 
 123         bte.be_birth_txg = birth_txg;
 124         bte.be_bp = *bp;
 125         bzero(&bte.be_zb, sizeof (bte.be_zb));
 126         dmu_write(os, obj, bt->bt_end * sizeof (bte), sizeof (bte), &bte, tx);
 127 
 128         dmu_buf_will_dirty(db, tx);
 129         bt->bt_end++;
 130         bt->bt_bytes += bytes;
 131         bt->bt_comp += comp;
 132         bt->bt_uncomp += uncomp;
 133         dmu_buf_rele(db, FTAG);
 134 }
 135 
 136 /* ARGSUSED */
 137 static int
 138 bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
 139     const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
 140 {
 141         int err;
 142         struct bptree_args *ba = arg;
 143 
 144         if (bp == NULL)
 145                 return (0);
 146 
 147         err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx);
 148         if (err == 0 && ba->ba_free) {
 149                 ba->ba_phys->bt_bytes -= bp_get_dsize_sync(spa, bp);
 150                 ba->ba_phys->bt_comp -= BP_GET_PSIZE(bp);
 151                 ba->ba_phys->bt_uncomp -= BP_GET_UCSIZE(bp);
 152         }
 153         return (err);
 154 }
 155 
 156 int
 157 bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
 158     void *arg, dmu_tx_t *tx)
 159 {
 160         int err;
 161         uint64_t i;
 162         dmu_buf_t *db;
 163         struct bptree_args ba;
 164 
 165         ASSERT(!free || dmu_tx_is_syncing(tx));
 166 
 167         err = dmu_bonus_hold(os, obj, FTAG, &db);
 168         if (err != 0)
 169                 return (err);
 170 
 171         if (free)
 172                 dmu_buf_will_dirty(db, tx);
 173 
 174         ba.ba_phys = db->db_data;
 175         ba.ba_free = free;
 176         ba.ba_func = func;
 177         ba.ba_arg = arg;
 178         ba.ba_tx = tx;
 179 
 180         err = 0;
 181         for (i = ba.ba_phys->bt_begin; i < ba.ba_phys->bt_end; i++) {
 182                 bptree_entry_phys_t bte;
 183 
 184                 ASSERT(!free || i == ba.ba_phys->bt_begin);
 185 
 186                 err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte),
 187                     &bte, DMU_READ_NO_PREFETCH);
 188                 if (err != 0)
 189                         break;
 190 
 191                 err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp,
 192                     bte.be_birth_txg, &bte.be_zb, TRAVERSE_POST,
 193                     bptree_visit_cb, &ba);
 194                 if (free) {
 195                         ASSERT(err == 0 || err == ERESTART);
 196                         if (err != 0) {
 197                                 /* save bookmark for future resume */
 198                                 ASSERT3U(bte.be_zb.zb_objset, ==,
 199                                     ZB_DESTROYED_OBJSET);
 200                                 ASSERT3U(bte.be_zb.zb_level, ==, 0);
 201                                 dmu_write(os, obj, i * sizeof (bte),
 202                                     sizeof (bte), &bte, tx);
 203                                 break;
 204                         } else {
 205                                 ba.ba_phys->bt_begin++;
 206                                 (void) dmu_free_range(os, obj,
 207                                     i * sizeof (bte), sizeof (bte), tx);
 208                         }
 209                 }
 210         }
 211 
 212         ASSERT(!free || err != 0 || ba.ba_phys->bt_begin == ba.ba_phys->bt_end);
 213 
 214         /* if all blocks are free there should be no used space */
 215         if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) {
 216                 ASSERT3U(ba.ba_phys->bt_bytes, ==, 0);
 217                 ASSERT3U(ba.ba_phys->bt_comp, ==, 0);
 218                 ASSERT3U(ba.ba_phys->bt_uncomp, ==, 0);
 219         }
 220 
 221         dmu_buf_rele(db, FTAG);
 222 
 223         return (err);
 224 }