1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2012 by Delphix. All rights reserved. 24 */ 25 26 #include <sys/dsl_dataset.h> 27 #include <sys/dmu.h> 28 #include <sys/refcount.h> 29 #include <sys/zap.h> 30 #include <sys/zfs_context.h> 31 #include <sys/dsl_pool.h> 32 33 /* 34 * Deadlist concurrency: 35 * 36 * Deadlists can only be modified from the syncing thread. 37 * 38 * Except for dsl_deadlist_insert(), it can only be modified with the 39 * dp_config_rwlock held with RW_WRITER. 40 * 41 * The accessors (dsl_deadlist_space() and dsl_deadlist_space_range()) can 42 * be called concurrently, from open context, with the dl_config_rwlock held 43 * with RW_READER. 44 * 45 * Therefore, we only need to provide locking between dsl_deadlist_insert() and 46 * the accessors, protecting: 47 * dl_phys->dl_used,comp,uncomp 48 * and protecting the dl_tree from being loaded. 49 * The locking is provided by dl_lock. Note that locking on the bpobj_t 50 * provides its own locking, and dl_oldfmt is immutable. 51 */ 52 53 static int 54 dsl_deadlist_compare(const void *arg1, const void *arg2) 55 { 56 const dsl_deadlist_entry_t *dle1 = arg1; 57 const dsl_deadlist_entry_t *dle2 = arg2; 58 59 if (dle1->dle_mintxg < dle2->dle_mintxg) 60 return (-1); 61 else if (dle1->dle_mintxg > dle2->dle_mintxg) 62 return (+1); 63 else 64 return (0); 65 } 66 67 static void 68 dsl_deadlist_load_tree(dsl_deadlist_t *dl) 69 { 70 zap_cursor_t zc; 71 zap_attribute_t za; 72 73 ASSERT(!dl->dl_oldfmt); 74 if (dl->dl_havetree) 75 return; 76 77 avl_create(&dl->dl_tree, dsl_deadlist_compare, 78 sizeof (dsl_deadlist_entry_t), 79 offsetof(dsl_deadlist_entry_t, dle_node)); 80 for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object); 81 zap_cursor_retrieve(&zc, &za) == 0; 82 zap_cursor_advance(&zc)) { 83 dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP); 84 dle->dle_mintxg = strtonum(za.za_name, NULL); 85 VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, 86 za.za_first_integer)); 87 avl_add(&dl->dl_tree, dle); 88 } 89 zap_cursor_fini(&zc); 90 dl->dl_havetree = B_TRUE; 91 } 92 93 void 94 dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object) 95 { 96 dmu_object_info_t doi; 97 98 mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL); 99 dl->dl_os = os; 100 dl->dl_object = object; 101 VERIFY0(dmu_bonus_hold(os, object, dl, &dl->dl_dbuf)); 102 dmu_object_info_from_db(dl->dl_dbuf, &doi); 103 if (doi.doi_type == DMU_OT_BPOBJ) { 104 dmu_buf_rele(dl->dl_dbuf, dl); 105 dl->dl_dbuf = NULL; 106 dl->dl_oldfmt = B_TRUE; 107 VERIFY0(bpobj_open(&dl->dl_bpobj, os, object)); 108 return; 109 } 110 111 dl->dl_oldfmt = B_FALSE; 112 dl->dl_phys = dl->dl_dbuf->db_data; 113 dl->dl_havetree = B_FALSE; 114 } 115 116 void 117 dsl_deadlist_close(dsl_deadlist_t *dl) 118 { 119 void *cookie = NULL; 120 dsl_deadlist_entry_t *dle; 121 122 if (dl->dl_oldfmt) { 123 dl->dl_oldfmt = B_FALSE; 124 bpobj_close(&dl->dl_bpobj); 125 return; 126 } 127 128 if (dl->dl_havetree) { 129 while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie)) 130 != NULL) { 131 bpobj_close(&dle->dle_bpobj); 132 kmem_free(dle, sizeof (*dle)); 133 } 134 avl_destroy(&dl->dl_tree); 135 } 136 dmu_buf_rele(dl->dl_dbuf, dl); 137 mutex_destroy(&dl->dl_lock); 138 dl->dl_dbuf = NULL; 139 dl->dl_phys = NULL; 140 } 141 142 uint64_t 143 dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx) 144 { 145 if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) 146 return (bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx)); 147 return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR, 148 sizeof (dsl_deadlist_phys_t), tx)); 149 } 150 151 void 152 dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx) 153 { 154 dmu_object_info_t doi; 155 zap_cursor_t zc; 156 zap_attribute_t za; 157 158 VERIFY0(dmu_object_info(os, dlobj, &doi)); 159 if (doi.doi_type == DMU_OT_BPOBJ) { 160 bpobj_free(os, dlobj, tx); 161 return; 162 } 163 164 for (zap_cursor_init(&zc, os, dlobj); 165 zap_cursor_retrieve(&zc, &za) == 0; 166 zap_cursor_advance(&zc)) 167 bpobj_free(os, za.za_first_integer, tx); 168 zap_cursor_fini(&zc); 169 VERIFY0(dmu_object_free(os, dlobj, tx)); 170 } 171 172 void 173 dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx) 174 { 175 dsl_deadlist_entry_t dle_tofind; 176 dsl_deadlist_entry_t *dle; 177 avl_index_t where; 178 179 if (dl->dl_oldfmt) { 180 bpobj_enqueue(&dl->dl_bpobj, bp, tx); 181 return; 182 } 183 184 dsl_deadlist_load_tree(dl); 185 186 dmu_buf_will_dirty(dl->dl_dbuf, tx); 187 mutex_enter(&dl->dl_lock); 188 dl->dl_phys->dl_used += 189 bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp); 190 dl->dl_phys->dl_comp += BP_GET_PSIZE(bp); 191 dl->dl_phys->dl_uncomp += BP_GET_UCSIZE(bp); 192 mutex_exit(&dl->dl_lock); 193 194 dle_tofind.dle_mintxg = bp->blk_birth; 195 dle = avl_find(&dl->dl_tree, &dle_tofind, &where); 196 if (dle == NULL) 197 dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); 198 else 199 dle = AVL_PREV(&dl->dl_tree, dle); 200 bpobj_enqueue(&dle->dle_bpobj, bp, tx); 201 } 202 203 /* 204 * Insert new key in deadlist, which must be > all current entries. 205 * mintxg is not inclusive. 206 */ 207 void 208 dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) 209 { 210 uint64_t obj; 211 dsl_deadlist_entry_t *dle; 212 213 if (dl->dl_oldfmt) 214 return; 215 216 dsl_deadlist_load_tree(dl); 217 218 dle = kmem_alloc(sizeof (*dle), KM_SLEEP); 219 dle->dle_mintxg = mintxg; 220 obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx); 221 VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); 222 avl_add(&dl->dl_tree, dle); 223 224 VERIFY0(zap_add_int_key(dl->dl_os, dl->dl_object, 225 mintxg, obj, tx)); 226 } 227 228 /* 229 * Remove this key, merging its entries into the previous key. 230 */ 231 void 232 dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) 233 { 234 dsl_deadlist_entry_t dle_tofind; 235 dsl_deadlist_entry_t *dle, *dle_prev; 236 237 if (dl->dl_oldfmt) 238 return; 239 240 dsl_deadlist_load_tree(dl); 241 242 dle_tofind.dle_mintxg = mintxg; 243 dle = avl_find(&dl->dl_tree, &dle_tofind, NULL); 244 dle_prev = AVL_PREV(&dl->dl_tree, dle); 245 246 bpobj_enqueue_subobj(&dle_prev->dle_bpobj, 247 dle->dle_bpobj.bpo_object, tx); 248 249 avl_remove(&dl->dl_tree, dle); 250 bpobj_close(&dle->dle_bpobj); 251 kmem_free(dle, sizeof (*dle)); 252 253 VERIFY0(zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx)); 254 } 255 256 /* 257 * Walk ds's snapshots to regenerate generate ZAP & AVL. 258 */ 259 static void 260 dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj, 261 uint64_t mrs_obj, dmu_tx_t *tx) 262 { 263 dsl_deadlist_t dl; 264 dsl_pool_t *dp = dmu_objset_pool(os); 265 266 dsl_deadlist_open(&dl, os, dlobj); 267 if (dl.dl_oldfmt) { 268 dsl_deadlist_close(&dl); 269 return; 270 } 271 272 while (mrs_obj != 0) { 273 dsl_dataset_t *ds; 274 VERIFY0(dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds)); 275 dsl_deadlist_add_key(&dl, ds->ds_phys->ds_prev_snap_txg, tx); 276 mrs_obj = ds->ds_phys->ds_prev_snap_obj; 277 dsl_dataset_rele(ds, FTAG); 278 } 279 dsl_deadlist_close(&dl); 280 } 281 282 uint64_t 283 dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg, 284 uint64_t mrs_obj, dmu_tx_t *tx) 285 { 286 dsl_deadlist_entry_t *dle; 287 uint64_t newobj; 288 289 newobj = dsl_deadlist_alloc(dl->dl_os, tx); 290 291 if (dl->dl_oldfmt) { 292 dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx); 293 return (newobj); 294 } 295 296 dsl_deadlist_load_tree(dl); 297 298 for (dle = avl_first(&dl->dl_tree); dle; 299 dle = AVL_NEXT(&dl->dl_tree, dle)) { 300 uint64_t obj; 301 302 if (dle->dle_mintxg >= maxtxg) 303 break; 304 305 obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx); 306 VERIFY0(zap_add_int_key(dl->dl_os, newobj, 307 dle->dle_mintxg, obj, tx)); 308 } 309 return (newobj); 310 } 311 312 void 313 dsl_deadlist_space(dsl_deadlist_t *dl, 314 uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) 315 { 316 if (dl->dl_oldfmt) { 317 VERIFY0(bpobj_space(&dl->dl_bpobj, 318 usedp, compp, uncompp)); 319 return; 320 } 321 322 mutex_enter(&dl->dl_lock); 323 *usedp = dl->dl_phys->dl_used; 324 *compp = dl->dl_phys->dl_comp; 325 *uncompp = dl->dl_phys->dl_uncomp; 326 mutex_exit(&dl->dl_lock); 327 } 328 329 /* 330 * return space used in the range (mintxg, maxtxg]. 331 * Includes maxtxg, does not include mintxg. 332 * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is 333 * larger than any bp in the deadlist (eg. UINT64_MAX)). 334 */ 335 void 336 dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg, 337 uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) 338 { 339 dsl_deadlist_entry_t *dle; 340 dsl_deadlist_entry_t dle_tofind; 341 avl_index_t where; 342 343 if (dl->dl_oldfmt) { 344 VERIFY0(bpobj_space_range(&dl->dl_bpobj, 345 mintxg, maxtxg, usedp, compp, uncompp)); 346 return; 347 } 348 349 *usedp = *compp = *uncompp = 0; 350 351 mutex_enter(&dl->dl_lock); 352 dsl_deadlist_load_tree(dl); 353 dle_tofind.dle_mintxg = mintxg; 354 dle = avl_find(&dl->dl_tree, &dle_tofind, &where); 355 /* 356 * If we don't find this mintxg, there shouldn't be anything 357 * after it either. 358 */ 359 ASSERT(dle != NULL || 360 avl_nearest(&dl->dl_tree, where, AVL_AFTER) == NULL); 361 362 for (; dle && dle->dle_mintxg < maxtxg; 363 dle = AVL_NEXT(&dl->dl_tree, dle)) { 364 uint64_t used, comp, uncomp; 365 366 VERIFY0(bpobj_space(&dle->dle_bpobj, 367 &used, &comp, &uncomp)); 368 369 *usedp += used; 370 *compp += comp; 371 *uncompp += uncomp; 372 } 373 mutex_exit(&dl->dl_lock); 374 } 375 376 static void 377 dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth, 378 dmu_tx_t *tx) 379 { 380 dsl_deadlist_entry_t dle_tofind; 381 dsl_deadlist_entry_t *dle; 382 avl_index_t where; 383 uint64_t used, comp, uncomp; 384 bpobj_t bpo; 385 386 VERIFY0(bpobj_open(&bpo, dl->dl_os, obj)); 387 VERIFY0(bpobj_space(&bpo, &used, &comp, &uncomp)); 388 bpobj_close(&bpo); 389 390 dsl_deadlist_load_tree(dl); 391 392 dmu_buf_will_dirty(dl->dl_dbuf, tx); 393 mutex_enter(&dl->dl_lock); 394 dl->dl_phys->dl_used += used; 395 dl->dl_phys->dl_comp += comp; 396 dl->dl_phys->dl_uncomp += uncomp; 397 mutex_exit(&dl->dl_lock); 398 399 dle_tofind.dle_mintxg = birth; 400 dle = avl_find(&dl->dl_tree, &dle_tofind, &where); 401 if (dle == NULL) 402 dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); 403 bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx); 404 } 405 406 static int 407 dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 408 { 409 dsl_deadlist_t *dl = arg; 410 dsl_deadlist_insert(dl, bp, tx); 411 return (0); 412 } 413 414 /* 415 * Merge the deadlist pointed to by 'obj' into dl. obj will be left as 416 * an empty deadlist. 417 */ 418 void 419 dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) 420 { 421 zap_cursor_t zc; 422 zap_attribute_t za; 423 dmu_buf_t *bonus; 424 dsl_deadlist_phys_t *dlp; 425 dmu_object_info_t doi; 426 427 VERIFY0(dmu_object_info(dl->dl_os, obj, &doi)); 428 if (doi.doi_type == DMU_OT_BPOBJ) { 429 bpobj_t bpo; 430 VERIFY0(bpobj_open(&bpo, dl->dl_os, obj)); 431 VERIFY0(bpobj_iterate(&bpo, 432 dsl_deadlist_insert_cb, dl, tx)); 433 bpobj_close(&bpo); 434 return; 435 } 436 437 for (zap_cursor_init(&zc, dl->dl_os, obj); 438 zap_cursor_retrieve(&zc, &za) == 0; 439 zap_cursor_advance(&zc)) { 440 uint64_t mintxg = strtonum(za.za_name, NULL); 441 dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx); 442 VERIFY0(zap_remove_int(dl->dl_os, obj, mintxg, tx)); 443 } 444 zap_cursor_fini(&zc); 445 446 VERIFY0(dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus)); 447 dlp = bonus->db_data; 448 dmu_buf_will_dirty(bonus, tx); 449 bzero(dlp, sizeof (*dlp)); 450 dmu_buf_rele(bonus, FTAG); 451 } 452 453 /* 454 * Remove entries on dl that are >= mintxg, and put them on the bpobj. 455 */ 456 void 457 dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, 458 dmu_tx_t *tx) 459 { 460 dsl_deadlist_entry_t dle_tofind; 461 dsl_deadlist_entry_t *dle; 462 avl_index_t where; 463 464 ASSERT(!dl->dl_oldfmt); 465 dmu_buf_will_dirty(dl->dl_dbuf, tx); 466 dsl_deadlist_load_tree(dl); 467 468 dle_tofind.dle_mintxg = mintxg; 469 dle = avl_find(&dl->dl_tree, &dle_tofind, &where); 470 if (dle == NULL) 471 dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER); 472 while (dle) { 473 uint64_t used, comp, uncomp; 474 dsl_deadlist_entry_t *dle_next; 475 476 bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx); 477 478 VERIFY0(bpobj_space(&dle->dle_bpobj, 479 &used, &comp, &uncomp)); 480 mutex_enter(&dl->dl_lock); 481 ASSERT3U(dl->dl_phys->dl_used, >=, used); 482 ASSERT3U(dl->dl_phys->dl_comp, >=, comp); 483 ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp); 484 dl->dl_phys->dl_used -= used; 485 dl->dl_phys->dl_comp -= comp; 486 dl->dl_phys->dl_uncomp -= uncomp; 487 mutex_exit(&dl->dl_lock); 488 489 VERIFY0(zap_remove_int(dl->dl_os, dl->dl_object, 490 dle->dle_mintxg, tx)); 491 492 dle_next = AVL_NEXT(&dl->dl_tree, dle); 493 avl_remove(&dl->dl_tree, dle); 494 bpobj_close(&dle->dle_bpobj); 495 kmem_free(dle, sizeof (*dle)); 496 dle = dle_next; 497 } 498 }