/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

#include <sys/arc.h>
#include <sys/bptree.h>
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_pool.h>
#include <sys/dnode.h>
#include <sys/refcount.h>
#include <sys/spa.h>

/*
 * A bptree is a queue of root block pointers from destroyed datasets. When a
 * dataset is destroyed its root block pointer is put on the end of the pool's
 * bptree queue so the dataset's blocks can be freed asynchronously by
 * dsl_scan_sync. This allows the delete operation to finish without
 * traversing all the dataset's blocks.
 *
 * Note that while bt_begin and bt_end are only ever incremented in this code,
 * they are effectively reset to 0 every time the entire bptree is freed
 * because the bptree's object is destroyed and re-created.
 */

typedef struct bptree_args {
	bptree_phys_t *ba_phys;	/* data in bonus buffer, dirtied if freeing */
	boolean_t ba_free;	/* true if freeing during traversal */

	bptree_itor_t *ba_func;	/* function to call for each blockpointer */
	void *ba_arg;		/* caller supplied argument to ba_func */
	dmu_tx_t *ba_tx;	/* caller supplied tx, NULL if not freeing */
} bptree_args_t;
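/*
 * For reference, the on-disk structures used below are declared in
 * sys/bptree.h. A sketch of that layout (hedged; consult the header for
 * the authoritative definitions):
 *
 *	typedef struct bptree_phys {
 *		uint64_t bt_begin;	index of the first live entry
 *		uint64_t bt_end;	index one past the last entry
 *		uint64_t bt_bytes;	bytes remaining to be freed
 *		uint64_t bt_comp;	compressed bytes remaining
 *		uint64_t bt_uncomp;	uncompressed bytes remaining
 *	} bptree_phys_t;
 *
 *	typedef struct bptree_entry_phys {
 *		blkptr_t be_bp;
 *		uint64_t be_birth_txg;	only free blocks born after this txg
 *		zbookmark_phys_t be_zb;	traversal resume point, if any
 *	} bptree_entry_phys_t;
 *
 * The bptree_phys_t lives in the object's bonus buffer; the entries are
 * fixed-size records written at offset i * sizeof (bptree_entry_phys_t)
 * in the object's data blocks, so the queue needs no indirection of its own.
 */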
uint64_t
bptree_alloc(objset_t *os, dmu_tx_t *tx)
{
	uint64_t obj;
	dmu_buf_t *db;
	bptree_phys_t *bt;

	obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA,
	    SPA_OLD_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA,
	    sizeof (bptree_phys_t), tx);

	/*
	 * Bonus buffer contents are already initialized to 0, but for
	 * readability we make it explicit.
	 */
	VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	bt = db->db_data;
	bt->bt_begin = 0;
	bt->bt_end = 0;
	bt->bt_bytes = 0;
	bt->bt_comp = 0;
	bt->bt_uncomp = 0;
	dmu_buf_rele(db, FTAG);

	return (obj);
}

int
bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
{
	dmu_buf_t *db;
	bptree_phys_t *bt;

	VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
	bt = db->db_data;
	ASSERT3U(bt->bt_begin, ==, bt->bt_end);
	ASSERT0(bt->bt_bytes);
	ASSERT0(bt->bt_comp);
	ASSERT0(bt->bt_uncomp);
	dmu_buf_rele(db, FTAG);

	return (dmu_object_free(os, obj, tx));
}

boolean_t
bptree_is_empty(objset_t *os, uint64_t obj)
{
	dmu_buf_t *db;
	bptree_phys_t *bt;
	boolean_t rv;

	VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db));
	bt = db->db_data;
	rv = (bt->bt_begin == bt->bt_end);
	dmu_buf_rele(db, FTAG);
	return (rv);
}

void
bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
    uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx)
{
	dmu_buf_t *db;
	bptree_phys_t *bt;
	bptree_entry_phys_t bte = { .be_birth_txg = 0 };

	/*
	 * bptree objects are in the pool mos, therefore they can only be
	 * modified in syncing context. Furthermore, this is only modified
	 * by the sync thread, so no locking is necessary.
	 */
	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
	bt = db->db_data;

	bte.be_birth_txg = birth_txg;
	bte.be_bp = *bp;
	dmu_write(os, obj, bt->bt_end * sizeof (bte), sizeof (bte), &bte, tx);

	dmu_buf_will_dirty(db, tx);
	bt->bt_end++;
	bt->bt_bytes += bytes;
	bt->bt_comp += comp;
	bt->bt_uncomp += uncomp;
	dmu_buf_rele(db, FTAG);
}

/* ARGSUSED */
static int
bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
	int err;
	struct bptree_args *ba = arg;

	if (bp == NULL || BP_IS_HOLE(bp))
		return (0);

	err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx);
	if (err == 0 && ba->ba_free) {
		ba->ba_phys->bt_bytes -= bp_get_dsize_sync(spa, bp);
		ba->ba_phys->bt_comp -= BP_GET_PSIZE(bp);
		ba->ba_phys->bt_uncomp -= BP_GET_UCSIZE(bp);
	}
	return (err);
}
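/*
 * Enqueue sketch (hedged; the real caller is the async destroy path in
 * dsl_destroy.c). A destroyed dataset's root bp is appended roughly like
 * this, with "used", "comp" and "uncomp" taken from the dataset's space
 * accounting:
 *
 *	if (dp->dp_bptree_obj == 0)
 *		dp->dp_bptree_obj = bptree_alloc(dp->dp_meta_objset, tx);
 *	bptree_add(dp->dp_meta_objset, dp->dp_bptree_obj,
 *	    &dsl_dataset_phys(ds)->ds_bp,
 *	    dsl_dataset_phys(ds)->ds_prev_snap_txg,
 *	    used, comp, uncomp, tx);
 */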
/*
 * If "free" is set:
 * - It is assumed that "func" will be freeing the block pointers.
 * - If "func" returns nonzero, the bookmark will be remembered and
 *   iteration will be restarted from this point on next invocation.
 * - If an i/o error is encountered (e.g. "func" returns EIO or ECKSUM),
 *   bptree_iterate will remember the bookmark, continue traversing
 *   any additional entries, and return 0.
 *
 * If "free" is not set, traversal will stop and return an error if
 * an i/o error is encountered.
 *
 * In either case, if zfs_free_leak_on_eio is set, i/o errors will be
 * ignored and traversal will continue (i.e. TRAVERSE_HARD will be passed to
 * traverse_dataset_destroyed()).
 */
int
bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
    void *arg, dmu_tx_t *tx)
{
	boolean_t ioerr = B_FALSE;
	int err;
	uint64_t i;
	dmu_buf_t *db;
	struct bptree_args ba;

	ASSERT(!free || dmu_tx_is_syncing(tx));

	err = dmu_bonus_hold(os, obj, FTAG, &db);
	if (err != 0)
		return (err);

	if (free)
		dmu_buf_will_dirty(db, tx);

	ba.ba_phys = db->db_data;
	ba.ba_free = free;
	ba.ba_func = func;
	ba.ba_arg = arg;
	ba.ba_tx = tx;

	err = 0;
	for (i = ba.ba_phys->bt_begin; i < ba.ba_phys->bt_end; i++) {
		bptree_entry_phys_t bte;
		int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST;

		err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte),
		    &bte, DMU_READ_NO_PREFETCH);
		if (err != 0)
			break;

		if (zfs_free_leak_on_eio)
			flags |= TRAVERSE_HARD;
		zfs_dbgmsg("bptree index %lld: traversing from min_txg=%lld "
		    "bookmark %lld/%lld/%lld/%lld",
		    (longlong_t)i,
		    (longlong_t)bte.be_birth_txg,
		    (longlong_t)bte.be_zb.zb_objset,
		    (longlong_t)bte.be_zb.zb_object,
		    (longlong_t)bte.be_zb.zb_level,
		    (longlong_t)bte.be_zb.zb_blkid);
		err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp,
		    bte.be_birth_txg, &bte.be_zb, flags,
		    bptree_visit_cb, &ba);
		if (free) {
			/*
			 * The callback has freed the visited block pointers.
			 * Record our traversal progress on disk, either by
			 * updating this record's bookmark, or by logically
			 * removing this record by advancing bt_begin.
			 */
			if (err != 0) {
				/* save bookmark for future resume */
				ASSERT3U(bte.be_zb.zb_objset, ==,
				    ZB_DESTROYED_OBJSET);
				ASSERT0(bte.be_zb.zb_level);
				dmu_write(os, obj, i * sizeof (bte),
				    sizeof (bte), &bte, tx);
				if (err == EIO || err == ECKSUM ||
				    err == ENXIO) {
					/*
					 * Skip the rest of this tree and
					 * continue on to the next entry.
					 */
					err = 0;
					ioerr = B_TRUE;
				} else {
					break;
				}
			} else if (ioerr) {
				/*
				 * This entry is finished, but there were
				 * i/o errors on previous entries, so we
				 * can't adjust bt_begin. Set this entry's
				 * be_birth_txg such that it will be
				 * treated as a no-op in future traversals.
				 */
				bte.be_birth_txg = UINT64_MAX;
				dmu_write(os, obj, i * sizeof (bte),
				    sizeof (bte), &bte, tx);
			}

			if (!ioerr) {
				ba.ba_phys->bt_begin++;
				(void) dmu_free_range(os, obj,
				    i * sizeof (bte), sizeof (bte), tx);
			}
		} else if (err != 0) {
			break;
		}
	}

	ASSERT(!free || err != 0 || ioerr ||
	    ba.ba_phys->bt_begin == ba.ba_phys->bt_end);

	/* if all blocks are free there should be no used space */
	if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) {
		if (zfs_free_leak_on_eio) {
			ba.ba_phys->bt_bytes = 0;
			ba.ba_phys->bt_comp = 0;
			ba.ba_phys->bt_uncomp = 0;
		}

		ASSERT0(ba.ba_phys->bt_bytes);
		ASSERT0(ba.ba_phys->bt_comp);
		ASSERT0(ba.ba_phys->bt_uncomp);
	}

	dmu_buf_rele(db, FTAG);

	return (err);
}
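/*
 * Drain and teardown sketch (hedged; the real logic is in dsl_scan_sync()).
 * The sync thread frees queued trees a bit at a time, then reclaims the
 * object once every entry has been consumed, which is what effectively
 * resets bt_begin and bt_end to 0:
 *
 *	err = bptree_iterate(dp->dp_meta_objset, dp->dp_bptree_obj,
 *	    B_TRUE, dsl_scan_free_block_cb, scn, tx);
 *
 *	if (err == 0 && bptree_is_empty(dp->dp_meta_objset,
 *	    dp->dp_bptree_obj)) {
 *		VERIFY0(zap_remove(dp->dp_meta_objset,
 *		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_BPTREE_OBJ, tx));
 *		VERIFY0(bptree_free(dp->dp_meta_objset,
 *		    dp->dp_bptree_obj, tx));
 *		dp->dp_bptree_obj = 0;
 *	}
 *
 * A fresh object is then allocated by the next bptree_alloc() call when
 * another dataset is destroyed.
 */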