1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011 by Delphix. All rights reserved. 24 */ 25 26 #include <sys/bpobj.h> 27 #include <sys/zfs_context.h> 28 #include <sys/refcount.h> 29 #include <sys/dsl_pool.h> 30 31 uint64_t 32 bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx) 33 { 34 int size; 35 36 if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT) 37 size = BPOBJ_SIZE_V0; 38 else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) 39 size = BPOBJ_SIZE_V1; 40 else 41 size = sizeof (bpobj_phys_t); 42 43 return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize, 44 DMU_OT_BPOBJ_HDR, size, tx)); 45 } 46 47 void 48 bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx) 49 { 50 int64_t i; 51 bpobj_t bpo; 52 dmu_object_info_t doi; 53 int epb; 54 dmu_buf_t *dbuf = NULL; 55 56 VERIFY3U(0, ==, bpobj_open(&bpo, os, obj)); 57 58 mutex_enter(&bpo.bpo_lock); 59 60 if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0) 61 goto out; 62 63 VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi)); 64 epb = doi.doi_data_block_size / sizeof (uint64_t); 65 66 for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) { 67 uint64_t *objarray; 68 uint64_t offset, blkoff; 69 70 offset = i * sizeof (uint64_t); 71 blkoff = P2PHASE(i, epb); 72 73 if (dbuf == NULL || dbuf->db_offset > offset) { 74 if (dbuf) 75 dmu_buf_rele(dbuf, FTAG); 76 VERIFY3U(0, ==, dmu_buf_hold(os, 77 bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0)); 78 } 79 80 ASSERT3U(offset, >=, dbuf->db_offset); 81 ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); 82 83 objarray = dbuf->db_data; 84 bpobj_free(os, objarray[blkoff], tx); 85 } 86 if (dbuf) { 87 dmu_buf_rele(dbuf, FTAG); 88 dbuf = NULL; 89 } 90 VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx)); 91 92 out: 93 mutex_exit(&bpo.bpo_lock); 94 bpobj_close(&bpo); 95 96 VERIFY3U(0, ==, dmu_object_free(os, obj, tx)); 97 } 98 99 int 100 bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object) 101 { 102 dmu_object_info_t doi; 103 int err; 104 105 err = dmu_object_info(os, object, &doi); 106 if (err) 107 return (err); 108 109 bzero(bpo, sizeof (*bpo)); 110 mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL); 111 112 ASSERT(bpo->bpo_dbuf == NULL); 113 ASSERT(bpo->bpo_phys == NULL); 114 ASSERT(object != 0); 115 ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ); 116 ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR); 117 118 err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf); 119 if (err) 120 return (err); 121 122 bpo->bpo_os = os; 123 bpo->bpo_object = object; 124 bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT; 125 bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0); 126 bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1); 127 bpo->bpo_phys = bpo->bpo_dbuf->db_data; 128 return (0); 129 } 130 131 void 132 bpobj_close(bpobj_t *bpo) 133 { 134 /* Lame workaround for closing a bpobj that was never opened. */ 135 if (bpo->bpo_object == 0) 136 return; 137 138 dmu_buf_rele(bpo->bpo_dbuf, bpo); 139 if (bpo->bpo_cached_dbuf != NULL) 140 dmu_buf_rele(bpo->bpo_cached_dbuf, bpo); 141 bpo->bpo_dbuf = NULL; 142 bpo->bpo_phys = NULL; 143 bpo->bpo_cached_dbuf = NULL; 144 bpo->bpo_object = 0; 145 146 mutex_destroy(&bpo->bpo_lock); 147 } 148 149 static int 150 bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx, 151 boolean_t free) 152 { 153 dmu_object_info_t doi; 154 int epb; 155 int64_t i; 156 int err = 0; 157 dmu_buf_t *dbuf = NULL; 158 159 mutex_enter(&bpo->bpo_lock); 160 161 if (free) 162 dmu_buf_will_dirty(bpo->bpo_dbuf, tx); 163 164 for (i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) { 165 blkptr_t *bparray; 166 blkptr_t *bp; 167 uint64_t offset, blkoff; 168 169 offset = i * sizeof (blkptr_t); 170 blkoff = P2PHASE(i, bpo->bpo_epb); 171 172 if (dbuf == NULL || dbuf->db_offset > offset) { 173 if (dbuf) 174 dmu_buf_rele(dbuf, FTAG); 175 err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset, 176 FTAG, &dbuf, 0); 177 if (err) 178 break; 179 } 180 181 ASSERT3U(offset, >=, dbuf->db_offset); 182 ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); 183 184 bparray = dbuf->db_data; 185 bp = &bparray[blkoff]; 186 err = func(arg, bp, tx); 187 if (err) 188 break; 189 if (free) { 190 bpo->bpo_phys->bpo_bytes -= 191 bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp); 192 ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0); 193 if (bpo->bpo_havecomp) { 194 bpo->bpo_phys->bpo_comp -= BP_GET_PSIZE(bp); 195 bpo->bpo_phys->bpo_uncomp -= BP_GET_UCSIZE(bp); 196 } 197 bpo->bpo_phys->bpo_num_blkptrs--; 198 ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0); 199 } 200 } 201 if (dbuf) { 202 dmu_buf_rele(dbuf, FTAG); 203 dbuf = NULL; 204 } 205 if (free) { 206 i++; 207 VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_object, 208 i * sizeof (blkptr_t), -1ULL, tx)); 209 } 210 if (err || !bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0) 211 goto out; 212 213 ASSERT(bpo->bpo_havecomp); 214 err = dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi); 215 if (err) { 216 mutex_exit(&bpo->bpo_lock); 217 return (err); 218 } 219 epb = doi.doi_data_block_size / sizeof (uint64_t); 220 221 for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) { 222 uint64_t *objarray; 223 uint64_t offset, blkoff; 224 bpobj_t sublist; 225 uint64_t used_before, comp_before, uncomp_before; 226 uint64_t used_after, comp_after, uncomp_after; 227 228 offset = i * sizeof (uint64_t); 229 blkoff = P2PHASE(i, epb); 230 231 if (dbuf == NULL || dbuf->db_offset > offset) { 232 if (dbuf) 233 dmu_buf_rele(dbuf, FTAG); 234 err = dmu_buf_hold(bpo->bpo_os, 235 bpo->bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0); 236 if (err) 237 break; 238 } 239 240 ASSERT3U(offset, >=, dbuf->db_offset); 241 ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); 242 243 objarray = dbuf->db_data; 244 err = bpobj_open(&sublist, bpo->bpo_os, objarray[blkoff]); 245 if (err) 246 break; 247 if (free) { 248 err = bpobj_space(&sublist, 249 &used_before, &comp_before, &uncomp_before); 250 if (err) 251 break; 252 } 253 err = bpobj_iterate_impl(&sublist, func, arg, tx, free); 254 if (free) { 255 VERIFY3U(0, ==, bpobj_space(&sublist, 256 &used_after, &comp_after, &uncomp_after)); 257 bpo->bpo_phys->bpo_bytes -= used_before - used_after; 258 ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0); 259 bpo->bpo_phys->bpo_comp -= comp_before - comp_after; 260 bpo->bpo_phys->bpo_uncomp -= 261 uncomp_before - uncomp_after; 262 } 263 264 bpobj_close(&sublist); 265 if (err) 266 break; 267 if (free) { 268 err = dmu_object_free(bpo->bpo_os, 269 objarray[blkoff], tx); 270 if (err) 271 break; 272 bpo->bpo_phys->bpo_num_subobjs--; 273 ASSERT3S(bpo->bpo_phys->bpo_num_subobjs, >=, 0); 274 } 275 } 276 if (dbuf) { 277 dmu_buf_rele(dbuf, FTAG); 278 dbuf = NULL; 279 } 280 if (free) { 281 VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, 282 bpo->bpo_phys->bpo_subobjs, 283 (i + 1) * sizeof (uint64_t), -1ULL, tx)); 284 } 285 286 out: 287 /* If there are no entries, there should be no bytes. */ 288 ASSERT(bpo->bpo_phys->bpo_num_blkptrs > 0 || 289 (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_num_subobjs > 0) || 290 bpo->bpo_phys->bpo_bytes == 0); 291 292 mutex_exit(&bpo->bpo_lock); 293 return (err); 294 } 295 296 /* 297 * Iterate and remove the entries. If func returns nonzero, iteration 298 * will stop and that entry will not be removed. 299 */ 300 int 301 bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx) 302 { 303 return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE)); 304 } 305 306 /* 307 * Iterate the entries. If func returns nonzero, iteration will stop. 308 */ 309 int 310 bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx) 311 { 312 return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE)); 313 } 314 315 void 316 bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) 317 { 318 bpobj_t subbpo; 319 uint64_t used, comp, uncomp, subsubobjs; 320 321 ASSERT(bpo->bpo_havesubobj); 322 ASSERT(bpo->bpo_havecomp); 323 324 VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj)); 325 VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp)); 326 327 if (used == 0) { 328 /* No point in having an empty subobj. */ 329 bpobj_close(&subbpo); 330 bpobj_free(bpo->bpo_os, subobj, tx); 331 return; 332 } 333 334 dmu_buf_will_dirty(bpo->bpo_dbuf, tx); 335 if (bpo->bpo_phys->bpo_subobjs == 0) { 336 bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os, 337 DMU_OT_BPOBJ_SUBOBJ, SPA_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx); 338 } 339 340 mutex_enter(&bpo->bpo_lock); 341 dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 342 bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 343 sizeof (subobj), &subobj, tx); 344 bpo->bpo_phys->bpo_num_subobjs++; 345 346 /* 347 * If subobj has only one block of subobjs, then move subobj's 348 * subobjs to bpo's subobj list directly. This reduces 349 * recursion in bpobj_iterate due to nested subobjs. 350 */ 351 subsubobjs = subbpo.bpo_phys->bpo_subobjs; 352 if (subsubobjs != 0) { 353 dmu_object_info_t doi; 354 355 VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subsubobjs, &doi)); 356 if (doi.doi_max_offset == doi.doi_data_block_size) { 357 dmu_buf_t *subdb; 358 uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs; 359 360 VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, subsubobjs, 361 0, FTAG, &subdb, 0)); 362 dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 363 bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 364 numsubsub * sizeof (subobj), subdb->db_data, tx); 365 dmu_buf_rele(subdb, FTAG); 366 bpo->bpo_phys->bpo_num_subobjs += numsubsub; 367 368 dmu_buf_will_dirty(subbpo.bpo_dbuf, tx); 369 subbpo.bpo_phys->bpo_subobjs = 0; 370 VERIFY3U(0, ==, dmu_object_free(bpo->bpo_os, 371 subsubobjs, tx)); 372 } 373 } 374 bpo->bpo_phys->bpo_bytes += used; 375 bpo->bpo_phys->bpo_comp += comp; 376 bpo->bpo_phys->bpo_uncomp += uncomp; 377 mutex_exit(&bpo->bpo_lock); 378 379 bpobj_close(&subbpo); 380 } 381 382 void 383 bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx) 384 { 385 blkptr_t stored_bp = *bp; 386 uint64_t offset; 387 int blkoff; 388 blkptr_t *bparray; 389 390 ASSERT(!BP_IS_HOLE(bp)); 391 392 /* We never need the fill count. */ 393 stored_bp.blk_fill = 0; 394 395 /* The bpobj will compress better if we can leave off the checksum */ 396 if (!BP_GET_DEDUP(bp)) 397 bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum)); 398 399 mutex_enter(&bpo->bpo_lock); 400 401 offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp); 402 blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb); 403 404 if (bpo->bpo_cached_dbuf == NULL || 405 offset < bpo->bpo_cached_dbuf->db_offset || 406 offset >= bpo->bpo_cached_dbuf->db_offset + 407 bpo->bpo_cached_dbuf->db_size) { 408 if (bpo->bpo_cached_dbuf) 409 dmu_buf_rele(bpo->bpo_cached_dbuf, bpo); 410 VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, 411 offset, bpo, &bpo->bpo_cached_dbuf, 0)); 412 } 413 414 dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx); 415 bparray = bpo->bpo_cached_dbuf->db_data; 416 bparray[blkoff] = stored_bp; 417 418 dmu_buf_will_dirty(bpo->bpo_dbuf, tx); 419 bpo->bpo_phys->bpo_num_blkptrs++; 420 bpo->bpo_phys->bpo_bytes += 421 bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp); 422 if (bpo->bpo_havecomp) { 423 bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp); 424 bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp); 425 } 426 mutex_exit(&bpo->bpo_lock); 427 } 428 429 struct space_range_arg { 430 spa_t *spa; 431 uint64_t mintxg; 432 uint64_t maxtxg; 433 uint64_t used; 434 uint64_t comp; 435 uint64_t uncomp; 436 }; 437 438 /* ARGSUSED */ 439 static int 440 space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 441 { 442 struct space_range_arg *sra = arg; 443 444 if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) { 445 if (dsl_pool_sync_context(spa_get_dsl(sra->spa))) 446 sra->used += bp_get_dsize_sync(sra->spa, bp); 447 else 448 sra->used += bp_get_dsize(sra->spa, bp); 449 sra->comp += BP_GET_PSIZE(bp); 450 sra->uncomp += BP_GET_UCSIZE(bp); 451 } 452 return (0); 453 } 454 455 int 456 bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) 457 { 458 mutex_enter(&bpo->bpo_lock); 459 460 *usedp = bpo->bpo_phys->bpo_bytes; 461 if (bpo->bpo_havecomp) { 462 *compp = bpo->bpo_phys->bpo_comp; 463 *uncompp = bpo->bpo_phys->bpo_uncomp; 464 mutex_exit(&bpo->bpo_lock); 465 return (0); 466 } else { 467 mutex_exit(&bpo->bpo_lock); 468 return (bpobj_space_range(bpo, 0, UINT64_MAX, 469 usedp, compp, uncompp)); 470 } 471 } 472 473 /* 474 * Return the amount of space in the bpobj which is: 475 * mintxg < blk_birth <= maxtxg 476 */ 477 int 478 bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, 479 uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) 480 { 481 struct space_range_arg sra = { 0 }; 482 int err; 483 484 /* 485 * As an optimization, if they want the whole txg range, just 486 * get bpo_bytes rather than iterating over the bps. 487 */ 488 if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp) 489 return (bpobj_space(bpo, usedp, compp, uncompp)); 490 491 sra.spa = dmu_objset_spa(bpo->bpo_os); 492 sra.mintxg = mintxg; 493 sra.maxtxg = maxtxg; 494 495 err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL); 496 *usedp = sra.used; 497 *compp = sra.comp; 498 *uncompp = sra.uncomp; 499 return (err); 500 }