1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2012 by Delphix. All rights reserved.
  24  */
  25 
  26 #include <sys/bpobj.h>
  27 #include <sys/zfs_context.h>
  28 #include <sys/refcount.h>
  29 #include <sys/dsl_pool.h>
  30 
  31 uint64_t
  32 bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
  33 {
  34         int size;
  35 
  36         if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT)
  37                 size = BPOBJ_SIZE_V0;
  38         else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
  39                 size = BPOBJ_SIZE_V1;
  40         else
  41                 size = sizeof (bpobj_phys_t);
  42 
  43         return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize,
  44             DMU_OT_BPOBJ_HDR, size, tx));
  45 }
  46 
  47 void
  48 bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
  49 {
  50         int64_t i;
  51         bpobj_t bpo;
  52         dmu_object_info_t doi;
  53         int epb;
  54         dmu_buf_t *dbuf = NULL;
  55 
  56         VERIFY0(bpobj_open(&bpo, os, obj));
  57 
  58         mutex_enter(&bpo.bpo_lock);
  59 
  60         if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0)
  61                 goto out;
  62 
  63         VERIFY0(dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi));
  64         epb = doi.doi_data_block_size / sizeof (uint64_t);
  65 
  66         for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
  67                 uint64_t *objarray;
  68                 uint64_t offset, blkoff;
  69 
  70                 offset = i * sizeof (uint64_t);
  71                 blkoff = P2PHASE(i, epb);
  72 
  73                 if (dbuf == NULL || dbuf->db_offset > offset) {
  74                         if (dbuf)
  75                                 dmu_buf_rele(dbuf, FTAG);
  76                         VERIFY0(dmu_buf_hold(os,
  77                             bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0));
  78                 }
  79 
  80                 ASSERT3U(offset, >=, dbuf->db_offset);
  81                 ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
  82 
  83                 objarray = dbuf->db_data;
  84                 bpobj_free(os, objarray[blkoff], tx);
  85         }
  86         if (dbuf) {
  87                 dmu_buf_rele(dbuf, FTAG);
  88                 dbuf = NULL;
  89         }
  90         VERIFY0(dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx));
  91 
  92 out:
  93         mutex_exit(&bpo.bpo_lock);
  94         bpobj_close(&bpo);
  95 
  96         VERIFY0(dmu_object_free(os, obj, tx));
  97 }
  98 
  99 int
 100 bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
 101 {
 102         dmu_object_info_t doi;
 103         int err;
 104 
 105         err = dmu_object_info(os, object, &doi);
 106         if (err)
 107                 return (err);
 108 
 109         bzero(bpo, sizeof (*bpo));
 110         mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL);
 111 
 112         ASSERT(bpo->bpo_dbuf == NULL);
 113         ASSERT(bpo->bpo_phys == NULL);
 114         ASSERT(object != 0);
 115         ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ);
 116         ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR);
 117 
 118         err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf);
 119         if (err)
 120                 return (err);
 121 
 122         bpo->bpo_os = os;
 123         bpo->bpo_object = object;
 124         bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT;
 125         bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
 126         bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
 127         bpo->bpo_phys = bpo->bpo_dbuf->db_data;
 128         return (0);
 129 }
 130 
 131 void
 132 bpobj_close(bpobj_t *bpo)
 133 {
 134         /* Lame workaround for closing a bpobj that was never opened. */
 135         if (bpo->bpo_object == 0)
 136                 return;
 137 
 138         dmu_buf_rele(bpo->bpo_dbuf, bpo);
 139         if (bpo->bpo_cached_dbuf != NULL)
 140                 dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
 141         bpo->bpo_dbuf = NULL;
 142         bpo->bpo_phys = NULL;
 143         bpo->bpo_cached_dbuf = NULL;
 144         bpo->bpo_object = 0;
 145 
 146         mutex_destroy(&bpo->bpo_lock);
 147 }
 148 
 149 static int
 150 bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
 151     boolean_t free)
 152 {
 153         dmu_object_info_t doi;
 154         int epb;
 155         int64_t i;
 156         int err = 0;
 157         dmu_buf_t *dbuf = NULL;
 158 
 159         mutex_enter(&bpo->bpo_lock);
 160 
 161         if (free)
 162                 dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
 163 
 164         for (i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) {
 165                 blkptr_t *bparray;
 166                 blkptr_t *bp;
 167                 uint64_t offset, blkoff;
 168 
 169                 offset = i * sizeof (blkptr_t);
 170                 blkoff = P2PHASE(i, bpo->bpo_epb);
 171 
 172                 if (dbuf == NULL || dbuf->db_offset > offset) {
 173                         if (dbuf)
 174                                 dmu_buf_rele(dbuf, FTAG);
 175                         err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset,
 176                             FTAG, &dbuf, 0);
 177                         if (err)
 178                                 break;
 179                 }
 180 
 181                 ASSERT3U(offset, >=, dbuf->db_offset);
 182                 ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
 183 
 184                 bparray = dbuf->db_data;
 185                 bp = &bparray[blkoff];
 186                 err = func(arg, bp, tx);
 187                 if (err)
 188                         break;
 189                 if (free) {
 190                         bpo->bpo_phys->bpo_bytes -=
 191                             bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
 192                         ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
 193                         if (bpo->bpo_havecomp) {
 194                                 bpo->bpo_phys->bpo_comp -= BP_GET_PSIZE(bp);
 195                                 bpo->bpo_phys->bpo_uncomp -= BP_GET_UCSIZE(bp);
 196                         }
 197                         bpo->bpo_phys->bpo_num_blkptrs--;
 198                         ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
 199                 }
 200         }
 201         if (dbuf) {
 202                 dmu_buf_rele(dbuf, FTAG);
 203                 dbuf = NULL;
 204         }
 205         if (free) {
 206                 i++;
 207                 VERIFY0(dmu_free_range(bpo->bpo_os, bpo->bpo_object,
 208                     i * sizeof (blkptr_t), -1ULL, tx));
 209         }
 210         if (err || !bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0)
 211                 goto out;
 212 
 213         ASSERT(bpo->bpo_havecomp);
 214         err = dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi);
 215         if (err) {
 216                 mutex_exit(&bpo->bpo_lock);
 217                 return (err);
 218         }
 219         epb = doi.doi_data_block_size / sizeof (uint64_t);
 220 
 221         for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
 222                 uint64_t *objarray;
 223                 uint64_t offset, blkoff;
 224                 bpobj_t sublist;
 225                 uint64_t used_before, comp_before, uncomp_before;
 226                 uint64_t used_after, comp_after, uncomp_after;
 227 
 228                 offset = i * sizeof (uint64_t);
 229                 blkoff = P2PHASE(i, epb);
 230 
 231                 if (dbuf == NULL || dbuf->db_offset > offset) {
 232                         if (dbuf)
 233                                 dmu_buf_rele(dbuf, FTAG);
 234                         err = dmu_buf_hold(bpo->bpo_os,
 235                             bpo->bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0);
 236                         if (err)
 237                                 break;
 238                 }
 239 
 240                 ASSERT3U(offset, >=, dbuf->db_offset);
 241                 ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
 242 
 243                 objarray = dbuf->db_data;
 244                 err = bpobj_open(&sublist, bpo->bpo_os, objarray[blkoff]);
 245                 if (err)
 246                         break;
 247                 if (free) {
 248                         err = bpobj_space(&sublist,
 249                             &used_before, &comp_before, &uncomp_before);
 250                         if (err)
 251                                 break;
 252                 }
 253                 err = bpobj_iterate_impl(&sublist, func, arg, tx, free);
 254                 if (free) {
 255                         VERIFY0(bpobj_space(&sublist,
 256                             &used_after, &comp_after, &uncomp_after));
 257                         bpo->bpo_phys->bpo_bytes -= used_before - used_after;
 258                         ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
 259                         bpo->bpo_phys->bpo_comp -= comp_before - comp_after;
 260                         bpo->bpo_phys->bpo_uncomp -=
 261                             uncomp_before - uncomp_after;
 262                 }
 263 
 264                 bpobj_close(&sublist);
 265                 if (err)
 266                         break;
 267                 if (free) {
 268                         err = dmu_object_free(bpo->bpo_os,
 269                             objarray[blkoff], tx);
 270                         if (err)
 271                                 break;
 272                         bpo->bpo_phys->bpo_num_subobjs--;
 273                         ASSERT3S(bpo->bpo_phys->bpo_num_subobjs, >=, 0);
 274                 }
 275         }
 276         if (dbuf) {
 277                 dmu_buf_rele(dbuf, FTAG);
 278                 dbuf = NULL;
 279         }
 280         if (free) {
 281                 VERIFY0(dmu_free_range(bpo->bpo_os,
 282                     bpo->bpo_phys->bpo_subobjs,
 283                     (i + 1) * sizeof (uint64_t), -1ULL, tx));
 284         }
 285 
 286 out:
 287         /* If there are no entries, there should be no bytes. */
 288         ASSERT(bpo->bpo_phys->bpo_num_blkptrs > 0 ||
 289             (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_num_subobjs > 0) ||
 290             bpo->bpo_phys->bpo_bytes == 0);
 291 
 292         mutex_exit(&bpo->bpo_lock);
 293         return (err);
 294 }
 295 
 296 /*
 297  * Iterate and remove the entries.  If func returns nonzero, iteration
 298  * will stop and that entry will not be removed.
 299  */
 300 int
 301 bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
 302 {
 303         return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE));
 304 }
 305 
 306 /*
 307  * Iterate the entries.  If func returns nonzero, iteration will stop.
 308  */
 309 int
 310 bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
 311 {
 312         return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE));
 313 }
 314 
 315 void
 316 bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
 317 {
 318         bpobj_t subbpo;
 319         uint64_t used, comp, uncomp, subsubobjs;
 320 
 321         ASSERT(bpo->bpo_havesubobj);
 322         ASSERT(bpo->bpo_havecomp);
 323 
 324         VERIFY0(bpobj_open(&subbpo, bpo->bpo_os, subobj));
 325         VERIFY0(bpobj_space(&subbpo, &used, &comp, &uncomp));
 326 
 327         if (used == 0) {
 328                 /* No point in having an empty subobj. */
 329                 bpobj_close(&subbpo);
 330                 bpobj_free(bpo->bpo_os, subobj, tx);
 331                 return;
 332         }
 333 
 334         dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
 335         if (bpo->bpo_phys->bpo_subobjs == 0) {
 336                 bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os,
 337                     DMU_OT_BPOBJ_SUBOBJ, SPA_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx);
 338         }
 339 
 340         mutex_enter(&bpo->bpo_lock);
 341         dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
 342             bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
 343             sizeof (subobj), &subobj, tx);
 344         bpo->bpo_phys->bpo_num_subobjs++;
 345 
 346         /*
 347          * If subobj has only one block of subobjs, then move subobj's
 348          * subobjs to bpo's subobj list directly.  This reduces
 349          * recursion in bpobj_iterate due to nested subobjs.
 350          */
 351         subsubobjs = subbpo.bpo_phys->bpo_subobjs;
 352         if (subsubobjs != 0) {
 353                 dmu_object_info_t doi;
 354 
 355                 VERIFY0(dmu_object_info(bpo->bpo_os, subsubobjs, &doi));
 356                 if (doi.doi_max_offset == doi.doi_data_block_size) {
 357                         dmu_buf_t *subdb;
 358                         uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs;
 359 
 360                         VERIFY0(dmu_buf_hold(bpo->bpo_os, subsubobjs,
 361                             0, FTAG, &subdb, 0));
 362                         dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
 363                             bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
 364                             numsubsub * sizeof (subobj), subdb->db_data, tx);
 365                         dmu_buf_rele(subdb, FTAG);
 366                         bpo->bpo_phys->bpo_num_subobjs += numsubsub;
 367 
 368                         dmu_buf_will_dirty(subbpo.bpo_dbuf, tx);
 369                         subbpo.bpo_phys->bpo_subobjs = 0;
 370                         VERIFY0(dmu_object_free(bpo->bpo_os,
 371                             subsubobjs, tx));
 372                 }
 373         }
 374         bpo->bpo_phys->bpo_bytes += used;
 375         bpo->bpo_phys->bpo_comp += comp;
 376         bpo->bpo_phys->bpo_uncomp += uncomp;
 377         mutex_exit(&bpo->bpo_lock);
 378 
 379         bpobj_close(&subbpo);
 380 }
 381 
 382 void
 383 bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
 384 {
 385         blkptr_t stored_bp = *bp;
 386         uint64_t offset;
 387         int blkoff;
 388         blkptr_t *bparray;
 389 
 390         ASSERT(!BP_IS_HOLE(bp));
 391 
 392         /* We never need the fill count. */
 393         stored_bp.blk_fill = 0;
 394 
 395         /* The bpobj will compress better if we can leave off the checksum */
 396         if (!BP_GET_DEDUP(bp))
 397                 bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
 398 
 399         mutex_enter(&bpo->bpo_lock);
 400 
 401         offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp);
 402         blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb);
 403 
 404         if (bpo->bpo_cached_dbuf == NULL ||
 405             offset < bpo->bpo_cached_dbuf->db_offset ||
 406             offset >= bpo->bpo_cached_dbuf->db_offset +
 407             bpo->bpo_cached_dbuf->db_size) {
 408                 if (bpo->bpo_cached_dbuf)
 409                         dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
 410                 VERIFY0(dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
 411                     offset, bpo, &bpo->bpo_cached_dbuf, 0));
 412         }
 413 
 414         dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx);
 415         bparray = bpo->bpo_cached_dbuf->db_data;
 416         bparray[blkoff] = stored_bp;
 417 
 418         dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
 419         bpo->bpo_phys->bpo_num_blkptrs++;
 420         bpo->bpo_phys->bpo_bytes +=
 421             bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
 422         if (bpo->bpo_havecomp) {
 423                 bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp);
 424                 bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp);
 425         }
 426         mutex_exit(&bpo->bpo_lock);
 427 }
 428 
/*
 * State handed to space_range_cb(): the birth-txg window to match against
 * and the running totals for the block pointers that fall inside it.
 */
struct space_range_arg {
        spa_t *spa;             /* pool, used to compute each bp's dsize */
        uint64_t mintxg;        /* exclusive lower bound on blk_birth */
        uint64_t maxtxg;        /* inclusive upper bound on blk_birth */
        uint64_t used;          /* sum of dsize over matching bps */
        uint64_t comp;          /* sum of BP_GET_PSIZE over matching bps */
        uint64_t uncomp;        /* sum of BP_GET_UCSIZE over matching bps */
};
 437 
 438 /* ARGSUSED */
 439 static int
 440 space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 441 {
 442         struct space_range_arg *sra = arg;
 443 
 444         if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) {
 445                 if (dsl_pool_sync_context(spa_get_dsl(sra->spa)))
 446                         sra->used += bp_get_dsize_sync(sra->spa, bp);
 447                 else
 448                         sra->used += bp_get_dsize(sra->spa, bp);
 449                 sra->comp += BP_GET_PSIZE(bp);
 450                 sra->uncomp += BP_GET_UCSIZE(bp);
 451         }
 452         return (0);
 453 }
 454 
 455 int
 456 bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
 457 {
 458         mutex_enter(&bpo->bpo_lock);
 459 
 460         *usedp = bpo->bpo_phys->bpo_bytes;
 461         if (bpo->bpo_havecomp) {
 462                 *compp = bpo->bpo_phys->bpo_comp;
 463                 *uncompp = bpo->bpo_phys->bpo_uncomp;
 464                 mutex_exit(&bpo->bpo_lock);
 465                 return (0);
 466         } else {
 467                 mutex_exit(&bpo->bpo_lock);
 468                 return (bpobj_space_range(bpo, 0, UINT64_MAX,
 469                     usedp, compp, uncompp));
 470         }
 471 }
 472 
 473 /*
 474  * Return the amount of space in the bpobj which is:
 475  * mintxg < blk_birth <= maxtxg
 476  */
 477 int
 478 bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
 479     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
 480 {
 481         struct space_range_arg sra = { 0 };
 482         int err;
 483 
 484         /*
 485          * As an optimization, if they want the whole txg range, just
 486          * get bpo_bytes rather than iterating over the bps.
 487          */
 488         if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp)
 489                 return (bpobj_space(bpo, usedp, compp, uncompp));
 490 
 491         sra.spa = dmu_objset_spa(bpo->bpo_os);
 492         sra.mintxg = mintxg;
 493         sra.maxtxg = maxtxg;
 494 
 495         err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL);
 496         *usedp = sra.used;
 497         *compp = sra.comp;
 498         *uncompp = sra.uncomp;
 499         return (err);
 500 }