1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2013 by Delphix. All rights reserved.
  25  */
  26 
  27 #include <sys/zfs_context.h>
  28 #include <sys/spa.h>
  29 #include <sys/spa_impl.h>
  30 #include <sys/zio.h>
  31 #include <sys/ddt.h>
  32 #include <sys/zap.h>
  33 #include <sys/dmu_tx.h>
  34 #include <sys/arc.h>
  35 #include <sys/dsl_pool.h>
  36 #include <sys/zio_checksum.h>
  37 #include <sys/zio_compress.h>
  38 #include <sys/dsl_scan.h>
  39 
  40 /*
  41  * Enable/disable prefetching of dedup-ed blocks which are going to be freed.
  42  */
  43 int zfs_dedup_prefetch = 1;
  44 
  45 static const ddt_ops_t *ddt_ops[DDT_TYPES] = {
  46         &ddt_zap_ops,
  47 };
  48 
  49 static const char *ddt_class_name[DDT_CLASSES] = {
  50         "ditto",
  51         "duplicate",
  52         "unique",
  53 };
  54 
  55 static void
  56 ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
  57     dmu_tx_t *tx)
  58 {
  59         spa_t *spa = ddt->ddt_spa;
  60         objset_t *os = ddt->ddt_os;
  61         uint64_t *objectp = &ddt->ddt_object[type][class];
  62         boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_dedup;
  63         char name[DDT_NAMELEN];
  64 
  65         ddt_object_name(ddt, type, class, name);
  66 
  67         ASSERT(*objectp == 0);
  68         VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0);
  69         ASSERT(*objectp != 0);
  70 
  71         VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name,
  72             sizeof (uint64_t), 1, objectp, tx) == 0);
  73 
  74         VERIFY(zap_add(os, spa->spa_ddt_stat_object, name,
  75             sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
  76             &ddt->ddt_histogram[type][class], tx) == 0);
  77 }
  78 
  79 static void
  80 ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
  81     dmu_tx_t *tx)
  82 {
  83         spa_t *spa = ddt->ddt_spa;
  84         objset_t *os = ddt->ddt_os;
  85         uint64_t *objectp = &ddt->ddt_object[type][class];
  86         char name[DDT_NAMELEN];
  87 
  88         ddt_object_name(ddt, type, class, name);
  89 
  90         ASSERT(*objectp != 0);
  91         ASSERT(ddt_object_count(ddt, type, class) == 0);
  92         ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class]));
  93         VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0);
  94         VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0);
  95         VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0);
  96         bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t));
  97 
  98         *objectp = 0;
  99 }
 100 
 101 static int
 102 ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
 103 {
 104         ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
 105         dmu_object_info_t doi;
 106         char name[DDT_NAMELEN];
 107         int error;
 108 
 109         ddt_object_name(ddt, type, class, name);
 110 
 111         error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name,
 112             sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);
 113 
 114         if (error)
 115                 return (error);
 116 
 117         error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
 118             sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
 119             &ddt->ddt_histogram[type][class]);
 120 
 121         /*
 122          * Seed the cached statistics.
 123          */
 124         VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
 125 
 126         ddo->ddo_count = ddt_object_count(ddt, type, class);
 127         ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
 128         ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
 129 
 130         ASSERT(error == 0);
 131         return (error);
 132 }
 133 
 134 static void
 135 ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 136     dmu_tx_t *tx)
 137 {
 138         ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
 139         dmu_object_info_t doi;
 140         char name[DDT_NAMELEN];
 141 
 142         ddt_object_name(ddt, type, class, name);
 143 
 144         VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
 145             sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
 146             &ddt->ddt_histogram[type][class], tx) == 0);
 147 
 148         /*
 149          * Cache DDT statistics; this is the only time they'll change.
 150          */
 151         VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
 152 
 153         ddo->ddo_count = ddt_object_count(ddt, type, class);
 154         ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
 155         ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
 156 }
 157 
 158 static int
 159 ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 160     ddt_entry_t *dde)
 161 {
 162         if (!ddt_object_exists(ddt, type, class))
 163                 return (SET_ERROR(ENOENT));
 164 
 165         return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os,
 166             ddt->ddt_object[type][class], dde));
 167 }
 168 
 169 static void
 170 ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 171     ddt_entry_t *dde)
 172 {
 173         if (!ddt_object_exists(ddt, type, class))
 174                 return;
 175 
 176         ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os,
 177             ddt->ddt_object[type][class], dde);
 178 }
 179 
 180 int
 181 ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class ddt_class,
 182     ddt_entry_t *dde, dmu_tx_t *tx)
 183 {
 184         ASSERT(ddt_object_exists(ddt, type, ddt_class));
 185 
 186         return (ddt_ops[type]->ddt_op_update(ddt->ddt_os,
 187             ddt->ddt_object[type][ddt_class], dde, tx));
 188 }
 189 
 190 static int
 191 ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 192     ddt_entry_t *dde, dmu_tx_t *tx)
 193 {
 194         ASSERT(ddt_object_exists(ddt, type, class));
 195 
 196         return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os,
 197             ddt->ddt_object[type][class], dde, tx));
 198 }
 199 
 200 int
 201 ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class ddt_class,
 202     uint64_t *walk, ddt_entry_t *dde)
 203 {
 204         ASSERT(ddt_object_exists(ddt, type, ddt_class));
 205 
 206         return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os,
 207             ddt->ddt_object[type][ddt_class], dde, walk));
 208 }
 209 
 210 uint64_t
 211 ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class ddt_class)
 212 {
 213         ASSERT(ddt_object_exists(ddt, type, ddt_class));
 214 
 215         return (ddt_ops[type]->ddt_op_count(ddt->ddt_os,
 216             ddt->ddt_object[type][ddt_class]));
 217 }
 218 
 219 int
 220 ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class ddt_class,
 221     dmu_object_info_t *doi)
 222 {
 223         if (!ddt_object_exists(ddt, type, ddt_class))
 224                 return (SET_ERROR(ENOENT));
 225 
 226         return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][ddt_class],
 227             doi));
 228 }
 229 
 230 boolean_t
 231 ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class ddt_class)
 232 {
 233         return (!!ddt->ddt_object[type][ddt_class]);
 234 }
 235 
 236 void
 237 ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class ddt_class,
 238     char *name)
 239 {
 240         (void) sprintf(name, DMU_POOL_DDT,
 241             zio_checksum_table[ddt->ddt_checksum].ci_name,
 242             ddt_ops[type]->ddt_op_name, ddt_class_name[ddt_class]);
 243 }
 244 
 245 void
 246 ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg)
 247 {
 248         ASSERT(txg != 0);
 249 
 250         for (int d = 0; d < SPA_DVAS_PER_BP; d++)
 251                 bp->blk_dva[d] = ddp->ddp_dva[d];
 252         BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth);
 253 }
 254 
 255 void
 256 ddt_bp_create(enum zio_checksum checksum,
 257     const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp)
 258 {
 259         BP_ZERO(bp);
 260 
 261         if (ddp != NULL)
 262                 ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth);
 263 
 264         bp->blk_cksum = ddk->ddk_cksum;
 265         bp->blk_fill = 1;
 266 
 267         BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk));
 268         BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk));
 269         BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk));
 270         BP_SET_CHECKSUM(bp, checksum);
 271         BP_SET_TYPE(bp, DMU_OT_DEDUP);
 272         BP_SET_LEVEL(bp, 0);
 273         BP_SET_DEDUP(bp, 0);
 274         BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
 275 }
 276 
 277 void
 278 ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp)
 279 {
 280         ddk->ddk_cksum = bp->blk_cksum;
 281         ddk->ddk_prop = 0;
 282 
 283         DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp));
 284         DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp));
 285         DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp));
 286 }
 287 
 288 void
 289 ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp)
 290 {
 291         ASSERT(ddp->ddp_phys_birth == 0);
 292 
 293         for (int d = 0; d < SPA_DVAS_PER_BP; d++)
 294                 ddp->ddp_dva[d] = bp->blk_dva[d];
 295         ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp);
 296 }
 297 
 298 void
 299 ddt_phys_clear(ddt_phys_t *ddp)
 300 {
 301         bzero(ddp, sizeof (*ddp));
 302 }
 303 
 304 void
 305 ddt_phys_addref(ddt_phys_t *ddp)
 306 {
 307         ddp->ddp_refcnt++;
 308 }
 309 
 310 void
 311 ddt_phys_decref(ddt_phys_t *ddp)
 312 {
 313         ASSERT((int64_t)ddp->ddp_refcnt > 0);
 314         ddp->ddp_refcnt--;
 315 }
 316 
 317 void
 318 ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg)
 319 {
 320         blkptr_t blk;
 321 
 322         ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
 323         ddt_phys_clear(ddp);
 324         zio_free(ddt->ddt_spa, txg, &blk);
 325 }
 326 
 327 ddt_phys_t *
 328 ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp)
 329 {
 330         ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys;
 331 
 332         for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
 333                 if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) &&
 334                     BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth)
 335                         return (ddp);
 336         }
 337         return (NULL);
 338 }
 339 
 340 uint64_t
 341 ddt_phys_total_refcnt(const ddt_entry_t *dde)
 342 {
 343         uint64_t refcnt = 0;
 344 
 345         for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++)
 346                 refcnt += dde->dde_phys[p].ddp_refcnt;
 347 
 348         return (refcnt);
 349 }
 350 
 351 static void
 352 ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
 353 {
 354         spa_t *spa = ddt->ddt_spa;
 355         ddt_phys_t *ddp = dde->dde_phys;
 356         ddt_key_t *ddk = &dde->dde_key;
 357         uint64_t lsize = DDK_GET_LSIZE(ddk);
 358         uint64_t psize = DDK_GET_PSIZE(ddk);
 359 
 360         bzero(dds, sizeof (*dds));
 361 
 362         for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
 363                 uint64_t dsize = 0;
 364                 uint64_t refcnt = ddp->ddp_refcnt;
 365 
 366                 if (ddp->ddp_phys_birth == 0)
 367                         continue;
 368 
 369                 for (int d = 0; d < SPA_DVAS_PER_BP; d++)
 370                         dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]);
 371 
 372                 dds->dds_blocks += 1;
 373                 dds->dds_lsize += lsize;
 374                 dds->dds_psize += psize;
 375                 dds->dds_dsize += dsize;
 376 
 377                 dds->dds_ref_blocks += refcnt;
 378                 dds->dds_ref_lsize += lsize * refcnt;
 379                 dds->dds_ref_psize += psize * refcnt;
 380                 dds->dds_ref_dsize += dsize * refcnt;
 381         }
 382 }
 383 
 384 void
 385 ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg)
 386 {
 387         const uint64_t *s = (const uint64_t *)src;
 388         uint64_t *d = (uint64_t *)dst;
 389         uint64_t *d_end = (uint64_t *)(dst + 1);
 390 
 391         ASSERT(neg == 0 || neg == -1ULL);       /* add or subtract */
 392 
 393         while (d < d_end)
 394                 *d++ += (*s++ ^ neg) - neg;
 395 }
 396 
 397 static void
 398 ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg)
 399 {
 400         ddt_stat_t dds;
 401         ddt_histogram_t *ddh;
 402         int bucket;
 403 
 404         ddt_stat_generate(ddt, dde, &dds);
 405 
 406         bucket = highbit(dds.dds_ref_blocks) - 1;
 407         ASSERT(bucket >= 0);
 408 
 409         ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
 410 
 411         ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg);
 412 }
 413 
 414 void
 415 ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src)
 416 {
 417         for (int h = 0; h < 64; h++)
 418                 ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0);
 419 }
 420 
 421 void
 422 ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh)
 423 {
 424         bzero(dds, sizeof (*dds));
 425 
 426         for (int h = 0; h < 64; h++)
 427                 ddt_stat_add(dds, &ddh->ddh_stat[h], 0);
 428 }
 429 
 430 boolean_t
 431 ddt_histogram_empty(const ddt_histogram_t *ddh)
 432 {
 433         const uint64_t *s = (const uint64_t *)ddh;
 434         const uint64_t *s_end = (const uint64_t *)(ddh + 1);
 435 
 436         while (s < s_end)
 437                 if (*s++ != 0)
 438                         return (B_FALSE);
 439 
 440         return (B_TRUE);
 441 }
 442 
 443 void
 444 ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
 445 {
 446         /* Sum the statistics we cached in ddt_object_sync(). */
 447         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 448                 ddt_t *ddt = spa->spa_ddt[c];
 449                 for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 450                         for (enum ddt_class class = 0; class < DDT_CLASSES;
 451                             class++) {
 452                                 ddt_object_t *ddo =
 453                                     &ddt->ddt_object_stats[type][class];
 454                                 ddo_total->ddo_count += ddo->ddo_count;
 455                                 ddo_total->ddo_dspace += ddo->ddo_dspace;
 456                                 ddo_total->ddo_mspace += ddo->ddo_mspace;
 457                         }
 458                 }
 459         }
 460 
 461         /* ... and compute the averages. */
 462         if (ddo_total->ddo_count != 0) {
 463                 ddo_total->ddo_dspace /= ddo_total->ddo_count;
 464                 ddo_total->ddo_mspace /= ddo_total->ddo_count;
 465         }
 466 }
 467 
 468 void
 469 ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh)
 470 {
 471         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 472                 ddt_t *ddt = spa->spa_ddt[c];
 473                 for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 474                         for (enum ddt_class class = 0; class < DDT_CLASSES;
 475                             class++) {
 476                                 ddt_histogram_add(ddh,
 477                                     &ddt->ddt_histogram_cache[type][class]);
 478                         }
 479                 }
 480         }
 481 }
 482 
 483 void
 484 ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total)
 485 {
 486         ddt_histogram_t *ddh_total;
 487 
 488         ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
 489         ddt_get_dedup_histogram(spa, ddh_total);
 490         ddt_histogram_stat(dds_total, ddh_total);
 491         kmem_free(ddh_total, sizeof (ddt_histogram_t));
 492 }
 493 
 494 uint64_t
 495 ddt_get_dedup_dspace(spa_t *spa)
 496 {
 497         ddt_stat_t dds_total = { 0 };
 498 
 499         ddt_get_dedup_stats(spa, &dds_total);
 500         return (dds_total.dds_ref_dsize - dds_total.dds_dsize);
 501 }
 502 
 503 uint64_t
 504 ddt_get_pool_dedup_ratio(spa_t *spa)
 505 {
 506         ddt_stat_t dds_total = { 0 };
 507 
 508         ddt_get_dedup_stats(spa, &dds_total);
 509         if (dds_total.dds_dsize == 0)
 510                 return (100);
 511 
 512         return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize);
 513 }
 514 
 515 int
 516 ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref)
 517 {
 518         spa_t *spa = ddt->ddt_spa;
 519         uint64_t total_refcnt = 0;
 520         uint64_t ditto = spa->spa_dedup_ditto;
 521         int total_copies = 0;
 522         int desired_copies = 0;
 523 
 524         for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
 525                 ddt_phys_t *ddp = &dde->dde_phys[p];
 526                 zio_t *zio = dde->dde_lead_zio[p];
 527                 uint64_t refcnt = ddp->ddp_refcnt;   /* committed refs */
 528                 if (zio != NULL)
 529                         refcnt += zio->io_parent_count;      /* pending refs */
 530                 if (ddp == ddp_willref)
 531                         refcnt++;                       /* caller's ref */
 532                 if (refcnt != 0) {
 533                         total_refcnt += refcnt;
 534                         total_copies += p;
 535                 }
 536         }
 537 
 538         if (ditto == 0 || ditto > UINT32_MAX)
 539                 ditto = UINT32_MAX;
 540 
 541         if (total_refcnt >= 1)
 542                 desired_copies++;
 543         if (total_refcnt >= ditto)
 544                 desired_copies++;
 545         if (total_refcnt >= ditto * ditto)
 546                 desired_copies++;
 547 
 548         return (MAX(desired_copies, total_copies) - total_copies);
 549 }
 550 
 551 int
 552 ddt_ditto_copies_present(ddt_entry_t *dde)
 553 {
 554         ddt_phys_t *ddp = &dde->dde_phys[DDT_PHYS_DITTO];
 555         dva_t *dva = ddp->ddp_dva;
 556         int copies = 0 - DVA_GET_GANG(dva);
 557 
 558         for (int d = 0; d < SPA_DVAS_PER_BP; d++, dva++)
 559                 if (DVA_IS_VALID(dva))
 560                         copies++;
 561 
 562         ASSERT(copies >= 0 && copies < SPA_DVAS_PER_BP);
 563 
 564         return (copies);
 565 }
 566 
 567 size_t
 568 ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len)
 569 {
 570         uchar_t *version = dst++;
 571         int cpfunc = ZIO_COMPRESS_ZLE;
 572         zio_compress_info_t *ci = &zio_compress_table[cpfunc];
 573         size_t c_len;
 574 
 575         ASSERT(d_len >= s_len + 1);  /* no compression plus version byte */
 576 
 577         c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level);
 578 
 579         if (c_len == s_len) {
 580                 cpfunc = ZIO_COMPRESS_OFF;
 581                 bcopy(src, dst, s_len);
 582         }
 583 
 584         *version = (ZFS_HOST_BYTEORDER & DDT_COMPRESS_BYTEORDER_MASK) | cpfunc;
 585 
 586         return (c_len + 1);
 587 }
 588 
 589 void
 590 ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len)
 591 {
 592         uchar_t version = *src++;
 593         int cpfunc = version & DDT_COMPRESS_FUNCTION_MASK;
 594         zio_compress_info_t *ci = &zio_compress_table[cpfunc];
 595 
 596         if (ci->ci_decompress != NULL)
 597                 (void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level);
 598         else
 599                 bcopy(src, dst, d_len);
 600 
 601         if ((version ^ ZFS_HOST_BYTEORDER) & DDT_COMPRESS_BYTEORDER_MASK)
 602                 byteswap_uint64_array(dst, d_len);
 603 }
 604 
 605 ddt_t *
 606 ddt_select_by_checksum(spa_t *spa, enum zio_checksum c)
 607 {
 608         return (spa->spa_ddt[c]);
 609 }
 610 
 611 ddt_t *
 612 ddt_select(spa_t *spa, const blkptr_t *bp)
 613 {
 614         return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]);
 615 }
 616 
 617 void
 618 ddt_enter(ddt_t *ddt)
 619 {
 620         mutex_enter(&ddt->ddt_lock);
 621 }
 622 
 623 void
 624 ddt_exit(ddt_t *ddt)
 625 {
 626         mutex_exit(&ddt->ddt_lock);
 627 }
 628 
 629 static ddt_entry_t *
 630 ddt_alloc(const ddt_key_t *ddk)
 631 {
 632         ddt_entry_t *dde;
 633 
 634         dde = kmem_zalloc(sizeof (ddt_entry_t), KM_SLEEP);
 635         cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);
 636 
 637         dde->dde_key = *ddk;
 638 
 639         return (dde);
 640 }
 641 
 642 static void
 643 ddt_free(ddt_entry_t *dde)
 644 {
 645         ASSERT(!dde->dde_loading);
 646 
 647         for (int p = 0; p < DDT_PHYS_TYPES; p++)
 648                 ASSERT(dde->dde_lead_zio[p] == NULL);
 649 
 650         if (dde->dde_repair_data != NULL)
 651                 zio_buf_free(dde->dde_repair_data,
 652                     DDK_GET_PSIZE(&dde->dde_key));
 653 
 654         cv_destroy(&dde->dde_cv);
 655         kmem_free(dde, sizeof (*dde));
 656 }
 657 
 658 void
 659 ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
 660 {
 661         ASSERT(MUTEX_HELD(&ddt->ddt_lock));
 662 
 663         avl_remove(&ddt->ddt_tree, dde);
 664         ddt_free(dde);
 665 }
 666 
 667 ddt_entry_t *
 668 ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
 669 {
 670         ddt_entry_t *dde, dde_search;
 671         enum ddt_type type;
 672         enum ddt_class class;
 673         avl_index_t where;
 674         int error;
 675 
 676         ASSERT(MUTEX_HELD(&ddt->ddt_lock));
 677 
 678         ddt_key_fill(&dde_search.dde_key, bp);
 679 
 680         dde = avl_find(&ddt->ddt_tree, &dde_search, &where);
 681         if (dde == NULL) {
 682                 if (!add)
 683                         return (NULL);
 684                 dde = ddt_alloc(&dde_search.dde_key);
 685                 avl_insert(&ddt->ddt_tree, dde, where);
 686         }
 687 
 688         while (dde->dde_loading)
 689                 cv_wait(&dde->dde_cv, &ddt->ddt_lock);
 690 
 691         if (dde->dde_loaded)
 692                 return (dde);
 693 
 694         dde->dde_loading = B_TRUE;
 695 
 696         ddt_exit(ddt);
 697 
 698         error = ENOENT;
 699 
 700         for (type = 0; type < DDT_TYPES; type++) {
 701                 for (class = 0; class < DDT_CLASSES; class++) {
 702                         error = ddt_object_lookup(ddt, type, class, dde);
 703                         if (error != ENOENT)
 704                                 break;
 705                 }
 706                 if (error != ENOENT)
 707                         break;
 708         }
 709 
 710         ASSERT(error == 0 || error == ENOENT);
 711 
 712         ddt_enter(ddt);
 713 
 714         ASSERT(dde->dde_loaded == B_FALSE);
 715         ASSERT(dde->dde_loading == B_TRUE);
 716 
 717         dde->dde_type = type;        /* will be DDT_TYPES if no entry found */
 718         dde->dde_class = class;      /* will be DDT_CLASSES if no entry found */
 719         dde->dde_loaded = B_TRUE;
 720         dde->dde_loading = B_FALSE;
 721 
 722         if (error == 0)
 723                 ddt_stat_update(ddt, dde, -1ULL);
 724 
 725         cv_broadcast(&dde->dde_cv);
 726 
 727         return (dde);
 728 }
 729 
 730 void
 731 ddt_prefetch(spa_t *spa, const blkptr_t *bp)
 732 {
 733         ddt_t *ddt;
 734         ddt_entry_t dde;
 735 
 736         if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp))
 737                 return;
 738 
 739         /*
 740          * We only remove the DDT once all tables are empty and only
 741          * prefetch dedup blocks when there are entries in the DDT.
 742          * Thus no locking is required as the DDT can't disappear on us.
 743          */
 744         ddt = ddt_select(spa, bp);
 745         ddt_key_fill(&dde.dde_key, bp);
 746 
 747         for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 748                 for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
 749                         ddt_object_prefetch(ddt, type, class, &dde);
 750                 }
 751         }
 752 }
 753 
 754 int
 755 ddt_entry_compare(const void *x1, const void *x2)
 756 {
 757         const ddt_entry_t *dde1 = x1;
 758         const ddt_entry_t *dde2 = x2;
 759         const uint64_t *u1 = (const uint64_t *)&dde1->dde_key;
 760         const uint64_t *u2 = (const uint64_t *)&dde2->dde_key;
 761 
 762         for (int i = 0; i < DDT_KEY_WORDS; i++) {
 763                 if (u1[i] < u2[i])
 764                         return (-1);
 765                 if (u1[i] > u2[i])
 766                         return (1);
 767         }
 768 
 769         return (0);
 770 }
 771 
 772 static ddt_t *
 773 ddt_table_alloc(spa_t *spa, enum zio_checksum c)
 774 {
 775         ddt_t *ddt;
 776 
 777         ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP);
 778 
 779         mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL);
 780         avl_create(&ddt->ddt_tree, ddt_entry_compare,
 781             sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
 782         avl_create(&ddt->ddt_repair_tree, ddt_entry_compare,
 783             sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
 784         ddt->ddt_checksum = c;
 785         ddt->ddt_spa = spa;
 786         ddt->ddt_os = spa->spa_meta_objset;
 787 
 788         return (ddt);
 789 }
 790 
 791 static void
 792 ddt_table_free(ddt_t *ddt)
 793 {
 794         ASSERT(avl_numnodes(&ddt->ddt_tree) == 0);
 795         ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0);
 796         avl_destroy(&ddt->ddt_tree);
 797         avl_destroy(&ddt->ddt_repair_tree);
 798         mutex_destroy(&ddt->ddt_lock);
 799         kmem_free(ddt, sizeof (*ddt));
 800 }
 801 
 802 void
 803 ddt_create(spa_t *spa)
 804 {
 805         spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM;
 806 
 807         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++)
 808                 spa->spa_ddt[c] = ddt_table_alloc(spa, c);
 809 }
 810 
 811 int
 812 ddt_load(spa_t *spa)
 813 {
 814         int error;
 815 
 816         ddt_create(spa);
 817 
 818         error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 819             DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
 820             &spa->spa_ddt_stat_object);
 821 
 822         if (error)
 823                 return (error == ENOENT ? 0 : error);
 824 
 825         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 826                 ddt_t *ddt = spa->spa_ddt[c];
 827                 for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 828                         for (enum ddt_class class = 0; class < DDT_CLASSES;
 829                             class++) {
 830                                 error = ddt_object_load(ddt, type, class);
 831                                 if (error != 0 && error != ENOENT)
 832                                         return (error);
 833                         }
 834                 }
 835 
 836                 /*
 837                  * Seed the cached histograms.
 838                  */
 839                 bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
 840                     sizeof (ddt->ddt_histogram));
 841         }
 842 
 843         return (0);
 844 }
 845 
 846 void
 847 ddt_unload(spa_t *spa)
 848 {
 849         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 850                 if (spa->spa_ddt[c]) {
 851                         ddt_table_free(spa->spa_ddt[c]);
 852                         spa->spa_ddt[c] = NULL;
 853                 }
 854         }
 855 }
 856 
 857 boolean_t
 858 ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp)
 859 {
 860         ddt_t *ddt;
 861         ddt_entry_t dde;
 862 
 863         if (!BP_GET_DEDUP(bp))
 864                 return (B_FALSE);
 865 
 866         if (max_class == DDT_CLASS_UNIQUE)
 867                 return (B_TRUE);
 868 
 869         ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)];
 870 
 871         ddt_key_fill(&dde.dde_key, bp);
 872 
 873         for (enum ddt_type type = 0; type < DDT_TYPES; type++)
 874                 for (enum ddt_class class = 0; class <= max_class; class++)
 875                         if (ddt_object_lookup(ddt, type, class, &dde) == 0)
 876                                 return (B_TRUE);
 877 
 878         return (B_FALSE);
 879 }
 880 
 881 ddt_entry_t *
 882 ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
 883 {
 884         ddt_key_t ddk;
 885         ddt_entry_t *dde;
 886 
 887         ddt_key_fill(&ddk, bp);
 888 
 889         dde = ddt_alloc(&ddk);
 890 
 891         for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 892                 for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
 893                         /*
 894                          * We can only do repair if there are multiple copies
 895                          * of the block.  For anything in the UNIQUE class,
 896                          * there's definitely only one copy, so don't even try.
 897                          */
 898                         if (class != DDT_CLASS_UNIQUE &&
 899                             ddt_object_lookup(ddt, type, class, dde) == 0)
 900                                 return (dde);
 901                 }
 902         }
 903 
 904         bzero(dde->dde_phys, sizeof (dde->dde_phys));
 905 
 906         return (dde);
 907 }
 908 
 909 void
 910 ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)
 911 {
 912         avl_index_t where;
 913 
 914         ddt_enter(ddt);
 915 
 916         if (dde->dde_repair_data != NULL && spa_writeable(ddt->ddt_spa) &&
 917             avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL)
 918                 avl_insert(&ddt->ddt_repair_tree, dde, where);
 919         else
 920                 ddt_free(dde);
 921 
 922         ddt_exit(ddt);
 923 }
 924 
 925 static void
 926 ddt_repair_entry_done(zio_t *zio)
 927 {
 928         ddt_entry_t *rdde = zio->io_private;
 929 
 930         ddt_free(rdde);
 931 }
 932 
 933 static void
 934 ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
 935 {
 936         ddt_phys_t *ddp = dde->dde_phys;
 937         ddt_phys_t *rddp = rdde->dde_phys;
 938         ddt_key_t *ddk = &dde->dde_key;
 939         ddt_key_t *rddk = &rdde->dde_key;
 940         zio_t *zio;
 941         blkptr_t blk;
 942 
 943         zio = zio_null(rio, rio->io_spa, NULL,
 944             ddt_repair_entry_done, rdde, rio->io_flags);
 945 
 946         for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) {
 947                 if (ddp->ddp_phys_birth == 0 ||
 948                     ddp->ddp_phys_birth != rddp->ddp_phys_birth ||
 949                     bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva)))
 950                         continue;
 951                 ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
 952                 zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
 953                     rdde->dde_repair_data, DDK_GET_PSIZE(rddk), NULL, NULL,
 954                     ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL));
 955         }
 956 
 957         zio_nowait(zio);
 958 }
 959 
 960 static void
 961 ddt_repair_table(ddt_t *ddt, zio_t *rio)
 962 {
 963         spa_t *spa = ddt->ddt_spa;
 964         ddt_entry_t *dde, *rdde_next, *rdde;
 965         avl_tree_t *t = &ddt->ddt_repair_tree;
 966         blkptr_t blk;
 967 
 968         if (spa_sync_pass(spa) > 1)
 969                 return;
 970 
 971         ddt_enter(ddt);
 972         for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) {
 973                 rdde_next = AVL_NEXT(t, rdde);
 974                 avl_remove(&ddt->ddt_repair_tree, rdde);
 975                 ddt_exit(ddt);
 976                 ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk);
 977                 dde = ddt_repair_start(ddt, &blk);
 978                 ddt_repair_entry(ddt, dde, rdde, rio);
 979                 ddt_repair_done(ddt, dde);
 980                 ddt_enter(ddt);
 981         }
 982         ddt_exit(ddt);
 983 }
 984 
 985 static void
 986 ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
 987 {
 988         dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool;
 989         ddt_phys_t *ddp = dde->dde_phys;
 990         ddt_key_t *ddk = &dde->dde_key;
 991         enum ddt_type otype = dde->dde_type;
 992         enum ddt_type ntype = DDT_TYPE_CURRENT;
 993         enum ddt_class oclass = dde->dde_class;
 994         enum ddt_class nclass;
 995         uint64_t total_refcnt = 0;
 996 
 997         ASSERT(dde->dde_loaded);
 998         ASSERT(!dde->dde_loading);
 999 
1000         for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
1001                 ASSERT(dde->dde_lead_zio[p] == NULL);
1002                 ASSERT((int64_t)ddp->ddp_refcnt >= 0);
1003                 if (ddp->ddp_phys_birth == 0) {
1004                         ASSERT(ddp->ddp_refcnt == 0);
1005                         continue;
1006                 }
1007                 if (p == DDT_PHYS_DITTO) {
1008                         if (ddt_ditto_copies_needed(ddt, dde, NULL) == 0)
1009                                 ddt_phys_free(ddt, ddk, ddp, txg);
1010                         continue;
1011                 }
1012                 if (ddp->ddp_refcnt == 0)
1013                         ddt_phys_free(ddt, ddk, ddp, txg);
1014                 total_refcnt += ddp->ddp_refcnt;
1015         }
1016 
1017         if (dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth != 0)
1018                 nclass = DDT_CLASS_DITTO;
1019         else if (total_refcnt > 1)
1020                 nclass = DDT_CLASS_DUPLICATE;
1021         else
1022                 nclass = DDT_CLASS_UNIQUE;
1023 
1024         if (otype != DDT_TYPES &&
1025             (otype != ntype || oclass != nclass || total_refcnt == 0)) {
1026                 VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0);
1027                 ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT);
1028         }
1029 
1030         if (total_refcnt != 0) {
1031                 dde->dde_type = ntype;
1032                 dde->dde_class = nclass;
1033                 ddt_stat_update(ddt, dde, 0);
1034                 if (!ddt_object_exists(ddt, ntype, nclass))
1035                         ddt_object_create(ddt, ntype, nclass, tx);
1036                 VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0);
1037 
1038                 /*
1039                  * If the class changes, the order that we scan this bp
1040                  * changes.  If it decreases, we could miss it, so
1041                  * scan it right now.  (This covers both class changing
1042                  * while we are doing ddt_walk(), and when we are
1043                  * traversing.)
1044                  */
1045                 if (nclass < oclass) {
1046                         dsl_scan_ddt_entry(dp->dp_scan,
1047                             ddt->ddt_checksum, dde, tx);
1048                 }
1049         }
1050 }
1051 
1052 static void
1053 ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
1054 {
1055         spa_t *spa = ddt->ddt_spa;
1056         ddt_entry_t *dde;
1057         void *cookie = NULL;
1058 
1059         if (avl_numnodes(&ddt->ddt_tree) == 0)
1060                 return;
1061 
1062         ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP);
1063 
1064         if (spa->spa_ddt_stat_object == 0) {
1065                 spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os,
1066                     DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT,
1067                     DMU_POOL_DDT_STATS, tx);
1068         }
1069 
1070         while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
1071                 ddt_sync_entry(ddt, dde, tx, txg);
1072                 ddt_free(dde);
1073         }
1074 
1075         for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
1076                 uint64_t count = 0;
1077                 for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
1078                         if (ddt_object_exists(ddt, type, class)) {
1079                                 ddt_object_sync(ddt, type, class, tx);
1080                                 count += ddt_object_count(ddt, type, class);
1081                         }
1082                 }
1083                 for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
1084                         if (count == 0 && ddt_object_exists(ddt, type, class))
1085                                 ddt_object_destroy(ddt, type, class, tx);
1086                 }
1087         }
1088 
1089         bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
1090             sizeof (ddt->ddt_histogram));
1091 }
1092 
1093 void
1094 ddt_sync(spa_t *spa, uint64_t txg)
1095 {
1096         dmu_tx_t *tx;
1097         zio_t *rio = zio_root(spa, NULL, NULL,
1098             ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
1099 
1100         ASSERT(spa_syncing_txg(spa) == txg);
1101 
1102         tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1103 
1104         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
1105                 ddt_t *ddt = spa->spa_ddt[c];
1106                 if (ddt == NULL)
1107                         continue;
1108                 ddt_sync_table(ddt, tx, txg);
1109                 ddt_repair_table(ddt, rio);
1110         }
1111 
1112         (void) zio_wait(rio);
1113 
1114         dmu_tx_commit(tx);
1115 }
1116 
1117 int
1118 ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde)
1119 {
1120         do {
1121                 do {
1122                         do {
1123                                 ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum];
1124                                 int error = ENOENT;
1125                                 if (ddt_object_exists(ddt, ddb->ddb_type,
1126                                     ddb->ddb_class)) {
1127                                         error = ddt_object_walk(ddt,
1128                                             ddb->ddb_type, ddb->ddb_class,
1129                                             &ddb->ddb_cursor, dde);
1130                                 }
1131                                 dde->dde_type = ddb->ddb_type;
1132                                 dde->dde_class = ddb->ddb_class;
1133                                 if (error == 0)
1134                                         return (0);
1135                                 if (error != ENOENT)
1136                                         return (error);
1137                                 ddb->ddb_cursor = 0;
1138                         } while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS);
1139                         ddb->ddb_checksum = 0;
1140                 } while (++ddb->ddb_type < DDT_TYPES);
1141                 ddb->ddb_type = 0;
1142         } while (++ddb->ddb_class < DDT_CLASSES);
1143 
1144         return (SET_ERROR(ENOENT));
1145 }