1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 
  26 #include <sys/zfs_context.h>
  27 #include <sys/spa.h>
  28 #include <sys/spa_impl.h>
  29 #include <sys/zio.h>
  30 #include <sys/ddt.h>
  31 #include <sys/zap.h>
  32 #include <sys/dmu_tx.h>
  33 #include <sys/arc.h>
  34 #include <sys/dsl_pool.h>
  35 #include <sys/zio_checksum.h>
  36 #include <sys/zio_compress.h>
  37 #include <sys/dsl_scan.h>
  38 
  39 /*
  40  * Enable/disable prefetching of dedup-ed blocks which are going to be freed.
  41  */
  42 int zfs_dedup_prefetch = 1;
  43 
  44 static const ddt_ops_t *ddt_ops[DDT_TYPES] = {
  45         &ddt_zap_ops,
  46 };
  47 
  48 static const char *ddt_class_name[DDT_CLASSES] = {
  49         "ditto",
  50         "duplicate",
  51         "unique",
  52 };
  53 
  54 static void
  55 ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
  56     dmu_tx_t *tx)
  57 {
  58         spa_t *spa = ddt->ddt_spa;
  59         objset_t *os = ddt->ddt_os;
  60         uint64_t *objectp = &ddt->ddt_object[type][class];
  61         boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_dedup;
  62         char name[DDT_NAMELEN];
  63 
  64         ddt_object_name(ddt, type, class, name);
  65 
  66         ASSERT(*objectp == 0);
  67         VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0);
  68         ASSERT(*objectp != 0);
  69 
  70         VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name,
  71             sizeof (uint64_t), 1, objectp, tx) == 0);
  72 
  73         VERIFY(zap_add(os, spa->spa_ddt_stat_object, name,
  74             sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
  75             &ddt->ddt_histogram[type][class], tx) == 0);
  76 }
  77 
  78 static void
  79 ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
  80     dmu_tx_t *tx)
  81 {
  82         spa_t *spa = ddt->ddt_spa;
  83         objset_t *os = ddt->ddt_os;
  84         uint64_t *objectp = &ddt->ddt_object[type][class];
  85         char name[DDT_NAMELEN];
  86 
  87         ddt_object_name(ddt, type, class, name);
  88 
  89         ASSERT(*objectp != 0);
  90         ASSERT(ddt_object_count(ddt, type, class) == 0);
  91         ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class]));
  92         VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0);
  93         VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0);
  94         VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0);
  95         bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t));
  96 
  97         *objectp = 0;
  98 }
  99 
 100 static int
 101 ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
 102 {
 103         ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
 104         dmu_object_info_t doi;
 105         char name[DDT_NAMELEN];
 106         int error;
 107 
 108         ddt_object_name(ddt, type, class, name);
 109 
 110         error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name,
 111             sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);
 112 
 113         if (error)
 114                 return (error);
 115 
 116         error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
 117             sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
 118             &ddt->ddt_histogram[type][class]);
 119 
 120         /*
 121          * Seed the cached statistics.
 122          */
 123         VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
 124 
 125         ddo->ddo_count = ddt_object_count(ddt, type, class);
 126         ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
 127         ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
 128 
 129         ASSERT(error == 0);
 130         return (error);
 131 }
 132 
 133 static void
 134 ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 135     dmu_tx_t *tx)
 136 {
 137         ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
 138         dmu_object_info_t doi;
 139         char name[DDT_NAMELEN];
 140 
 141         ddt_object_name(ddt, type, class, name);
 142 
 143         VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
 144             sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
 145             &ddt->ddt_histogram[type][class], tx) == 0);
 146 
 147         /*
 148          * Cache DDT statistics; this is the only time they'll change.
 149          */
 150         VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
 151 
 152         ddo->ddo_count = ddt_object_count(ddt, type, class);
 153         ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
 154         ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
 155 }
 156 
 157 static int
 158 ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 159     ddt_entry_t *dde)
 160 {
 161         if (!ddt_object_exists(ddt, type, class))
 162                 return (ENOENT);
 163 
 164         return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os,
 165             ddt->ddt_object[type][class], dde));
 166 }
 167 
 168 static void
 169 ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 170     ddt_entry_t *dde)
 171 {
 172         if (!ddt_object_exists(ddt, type, class))
 173                 return;
 174 
 175         ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os,
 176             ddt->ddt_object[type][class], dde);
 177 }
 178 
 179 int
 180 ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 181     ddt_entry_t *dde, dmu_tx_t *tx)
 182 {
 183         ASSERT(ddt_object_exists(ddt, type, class));
 184 
 185         return (ddt_ops[type]->ddt_op_update(ddt->ddt_os,
 186             ddt->ddt_object[type][class], dde, tx));
 187 }
 188 
 189 static int
 190 ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 191     ddt_entry_t *dde, dmu_tx_t *tx)
 192 {
 193         ASSERT(ddt_object_exists(ddt, type, class));
 194 
 195         return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os,
 196             ddt->ddt_object[type][class], dde, tx));
 197 }
 198 
 199 int
 200 ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 201     uint64_t *walk, ddt_entry_t *dde)
 202 {
 203         ASSERT(ddt_object_exists(ddt, type, class));
 204 
 205         return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os,
 206             ddt->ddt_object[type][class], dde, walk));
 207 }
 208 
 209 uint64_t
 210 ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
 211 {
 212         ASSERT(ddt_object_exists(ddt, type, class));
 213 
 214         return (ddt_ops[type]->ddt_op_count(ddt->ddt_os,
 215             ddt->ddt_object[type][class]));
 216 }
 217 
 218 int
 219 ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 220     dmu_object_info_t *doi)
 221 {
 222         if (!ddt_object_exists(ddt, type, class))
 223                 return (ENOENT);
 224 
 225         return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class],
 226             doi));
 227 }
 228 
 229 boolean_t
 230 ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
 231 {
 232         return (!!ddt->ddt_object[type][class]);
 233 }
 234 
 235 void
 236 ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 237     char *name)
 238 {
 239         (void) sprintf(name, DMU_POOL_DDT,
 240             zio_checksum_table[ddt->ddt_checksum].ci_name,
 241             ddt_ops[type]->ddt_op_name, ddt_class_name[class]);
 242 }
 243 
 244 void
 245 ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg)
 246 {
 247         ASSERT(txg != 0);
 248 
 249         for (int d = 0; d < SPA_DVAS_PER_BP; d++)
 250                 bp->blk_dva[d] = ddp->ddp_dva[d];
 251         BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth);
 252 }
 253 
 254 void
 255 ddt_bp_create(enum zio_checksum checksum,
 256     const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp)
 257 {
 258         BP_ZERO(bp);
 259 
 260         if (ddp != NULL)
 261                 ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth);
 262 
 263         bp->blk_cksum = ddk->ddk_cksum;
 264         bp->blk_fill = 1;
 265 
 266         BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk));
 267         BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk));
 268         BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk));
 269         BP_SET_CHECKSUM(bp, checksum);
 270         BP_SET_TYPE(bp, DMU_OT_DEDUP);
 271         BP_SET_LEVEL(bp, 0);
 272         BP_SET_DEDUP(bp, 0);
 273         BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
 274 }
 275 
 276 void
 277 ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp)
 278 {
 279         ddk->ddk_cksum = bp->blk_cksum;
 280         ddk->ddk_prop = 0;
 281 
 282         DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp));
 283         DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp));
 284         DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp));
 285 }
 286 
 287 void
 288 ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp)
 289 {
 290         ASSERT(ddp != NULL);
 291         ASSERT(ddp->ddp_phys_birth == 0);
 292 
 293         for (int d = 0; d < SPA_DVAS_PER_BP; d++)
 294                 ddp->ddp_dva[d] = bp->blk_dva[d];
 295         ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp);
 296 }
 297 
 298 void
 299 ddt_phys_clear(ddt_phys_t *ddp)
 300 {
 301         ASSERT(ddp != NULL);
 302         if (ddp) {
 303                 bzero(ddp, sizeof (*ddp));
 304         }
 305 }
 306 
 307 void
 308 ddt_phys_addref(ddt_phys_t *ddp)
 309 {
 310         ASSERT(ddp != NULL);
 311         if (ddp) {
 312                 ddp->ddp_refcnt++;
 313         }
 314 }
 315 
 316 void
 317 ddt_phys_decref(ddt_phys_t *ddp)
 318 {
 319 //      ASSERT(ddp != NULL);
 320         if (ddp) {
 321                 ASSERT((int64_t)ddp->ddp_refcnt > 0);
 322                 ddp->ddp_refcnt--;
 323         }
 324 }
 325 
 326 void
 327 ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg)
 328 {
 329         blkptr_t blk;
 330 
 331         ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
 332         ddt_phys_clear(ddp);
 333         zio_free(ddt->ddt_spa, txg, &blk);
 334 }
 335 
 336 ddt_phys_t *
 337 ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp)
 338 {
 339         ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys;
 340 
 341         for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
 342                 if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) &&
 343                     BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth)
 344                         return (ddp);
 345         }
 346         (void) printf("ddt_phys_select() found nothing for "
 347             "DVA[BP]=<%llu:%llx:%llx> and phys_birth[BP]=%llu\n",
 348             (u_longlong_t)DVA_GET_VDEV(BP_IDENTITY(bp)),
 349             (u_longlong_t)DVA_GET_OFFSET(BP_IDENTITY(bp)),
 350             (u_longlong_t)DVA_GET_ASIZE(BP_IDENTITY(bp)),
 351             (u_longlong_t)BP_PHYSICAL_BIRTH(bp)
 352         );
 353         return (NULL);
 354 }
 355 
 356 uint64_t
 357 ddt_phys_total_refcnt(const ddt_entry_t *dde)
 358 {
 359         uint64_t refcnt = 0;
 360 
 361         for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++)
 362                 refcnt += dde->dde_phys[p].ddp_refcnt;
 363 
 364         return (refcnt);
 365 }
 366 
 367 static void
 368 ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
 369 {
 370         spa_t *spa = ddt->ddt_spa;
 371         ddt_phys_t *ddp = dde->dde_phys;
 372         ddt_key_t *ddk = &dde->dde_key;
 373         uint64_t lsize = DDK_GET_LSIZE(ddk);
 374         uint64_t psize = DDK_GET_PSIZE(ddk);
 375 
 376         bzero(dds, sizeof (*dds));
 377 
 378         for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
 379                 uint64_t dsize = 0;
 380                 uint64_t refcnt = ddp->ddp_refcnt;
 381 
 382                 if (ddp->ddp_phys_birth == 0)
 383                         continue;
 384 
 385                 for (int d = 0; d < SPA_DVAS_PER_BP; d++)
 386                         dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]);
 387 
 388                 dds->dds_blocks += 1;
 389                 dds->dds_lsize += lsize;
 390                 dds->dds_psize += psize;
 391                 dds->dds_dsize += dsize;
 392 
 393                 dds->dds_ref_blocks += refcnt;
 394                 dds->dds_ref_lsize += lsize * refcnt;
 395                 dds->dds_ref_psize += psize * refcnt;
 396                 dds->dds_ref_dsize += dsize * refcnt;
 397         }
 398 }
 399 
 400 void
 401 ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg)
 402 {
 403         const uint64_t *s = (const uint64_t *)src;
 404         uint64_t *d = (uint64_t *)dst;
 405         uint64_t *d_end = (uint64_t *)(dst + 1);
 406 
 407         ASSERT(neg == 0 || neg == -1ULL);       /* add or subtract */
 408 
 409         while (d < d_end)
 410                 *d++ += (*s++ ^ neg) - neg;
 411 }
 412 
 413 static void
 414 ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg)
 415 {
 416         ddt_stat_t dds;
 417         ddt_histogram_t *ddh;
 418         int bucket;
 419 
 420         ddt_stat_generate(ddt, dde, &dds);
 421 
 422         bucket = highbit(dds.dds_ref_blocks) - 1;
 423         ASSERT(bucket >= 0);
 424 
 425         ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
 426 
 427         ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg);
 428 }
 429 
 430 void
 431 ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src)
 432 {
 433         for (int h = 0; h < 64; h++)
 434                 ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0);
 435 }
 436 
 437 void
 438 ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh)
 439 {
 440         bzero(dds, sizeof (*dds));
 441 
 442         for (int h = 0; h < 64; h++)
 443                 ddt_stat_add(dds, &ddh->ddh_stat[h], 0);
 444 }
 445 
 446 boolean_t
 447 ddt_histogram_empty(const ddt_histogram_t *ddh)
 448 {
 449         const uint64_t *s = (const uint64_t *)ddh;
 450         const uint64_t *s_end = (const uint64_t *)(ddh + 1);
 451 
 452         while (s < s_end)
 453                 if (*s++ != 0)
 454                         return (B_FALSE);
 455 
 456         return (B_TRUE);
 457 }
 458 
 459 void
 460 ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
 461 {
 462         /* Sum the statistics we cached in ddt_object_sync(). */
 463         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 464                 ddt_t *ddt = spa->spa_ddt[c];
 465                 for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 466                         for (enum ddt_class class = 0; class < DDT_CLASSES;
 467                             class++) {
 468                                 ddt_object_t *ddo =
 469                                     &ddt->ddt_object_stats[type][class];
 470                                 ddo_total->ddo_count += ddo->ddo_count;
 471                                 ddo_total->ddo_dspace += ddo->ddo_dspace;
 472                                 ddo_total->ddo_mspace += ddo->ddo_mspace;
 473                         }
 474                 }
 475         }
 476 
 477         /* ... and compute the averages. */
 478         if (ddo_total->ddo_count != 0) {
 479                 ddo_total->ddo_dspace /= ddo_total->ddo_count;
 480                 ddo_total->ddo_mspace /= ddo_total->ddo_count;
 481         }
 482 }
 483 
 484 void
 485 ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh)
 486 {
 487         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 488                 ddt_t *ddt = spa->spa_ddt[c];
 489                 for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 490                         for (enum ddt_class class = 0; class < DDT_CLASSES;
 491                             class++) {
 492                                 ddt_histogram_add(ddh,
 493                                     &ddt->ddt_histogram_cache[type][class]);
 494                         }
 495                 }
 496         }
 497 }
 498 
 499 void
 500 ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total)
 501 {
 502         ddt_histogram_t *ddh_total;
 503 
 504         ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
 505         ddt_get_dedup_histogram(spa, ddh_total);
 506         ddt_histogram_stat(dds_total, ddh_total);
 507         kmem_free(ddh_total, sizeof (ddt_histogram_t));
 508 }
 509 
 510 uint64_t
 511 ddt_get_dedup_dspace(spa_t *spa)
 512 {
 513         ddt_stat_t dds_total = { 0 };
 514 
 515         ddt_get_dedup_stats(spa, &dds_total);
 516         return (dds_total.dds_ref_dsize - dds_total.dds_dsize);
 517 }
 518 
 519 uint64_t
 520 ddt_get_pool_dedup_ratio(spa_t *spa)
 521 {
 522         ddt_stat_t dds_total = { 0 };
 523 
 524         ddt_get_dedup_stats(spa, &dds_total);
 525         if (dds_total.dds_dsize == 0)
 526                 return (100);
 527 
 528         return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize);
 529 }
 530 
 531 int
 532 ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref)
 533 {
 534         spa_t *spa = ddt->ddt_spa;
 535         uint64_t total_refcnt = 0;
 536         uint64_t ditto = spa->spa_dedup_ditto;
 537         int total_copies = 0;
 538         int desired_copies = 0;
 539 
 540         for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
 541                 ddt_phys_t *ddp = &dde->dde_phys[p];
 542                 zio_t *zio = dde->dde_lead_zio[p];
 543                 uint64_t refcnt = ddp->ddp_refcnt;   /* committed refs */
 544                 if (zio != NULL)
 545                         refcnt += zio->io_parent_count;      /* pending refs */
 546                 if (ddp == ddp_willref)
 547                         refcnt++;                       /* caller's ref */
 548                 if (refcnt != 0) {
 549                         total_refcnt += refcnt;
 550                         total_copies += p;
 551                 }
 552         }
 553 
 554         if (ditto == 0 || ditto > UINT32_MAX)
 555                 ditto = UINT32_MAX;
 556 
 557         if (total_refcnt >= 1)
 558                 desired_copies++;
 559         if (total_refcnt >= ditto)
 560                 desired_copies++;
 561         if (total_refcnt >= ditto * ditto)
 562                 desired_copies++;
 563 
 564         return (MAX(desired_copies, total_copies) - total_copies);
 565 }
 566 
 567 int
 568 ddt_ditto_copies_present(ddt_entry_t *dde)
 569 {
 570         ddt_phys_t *ddp = &dde->dde_phys[DDT_PHYS_DITTO];
 571         dva_t *dva = ddp->ddp_dva;
 572         int copies = 0 - DVA_GET_GANG(dva);
 573 
 574         for (int d = 0; d < SPA_DVAS_PER_BP; d++, dva++)
 575                 if (DVA_IS_VALID(dva))
 576                         copies++;
 577 
 578         ASSERT(copies >= 0 && copies < SPA_DVAS_PER_BP);
 579 
 580         return (copies);
 581 }
 582 
 583 size_t
 584 ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len)
 585 {
 586         uchar_t *version = dst++;
 587         int cpfunc = ZIO_COMPRESS_ZLE;
 588         zio_compress_info_t *ci = &zio_compress_table[cpfunc];
 589         size_t c_len;
 590 
 591         ASSERT(d_len >= s_len + 1);  /* no compression plus version byte */
 592 
 593         c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level);
 594 
 595         if (c_len == s_len) {
 596                 cpfunc = ZIO_COMPRESS_OFF;
 597                 bcopy(src, dst, s_len);
 598         }
 599 
 600         *version = (ZFS_HOST_BYTEORDER & DDT_COMPRESS_BYTEORDER_MASK) | cpfunc;
 601 
 602         return (c_len + 1);
 603 }
 604 
 605 void
 606 ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len)
 607 {
 608         uchar_t version = *src++;
 609         int cpfunc = version & DDT_COMPRESS_FUNCTION_MASK;
 610         zio_compress_info_t *ci = &zio_compress_table[cpfunc];
 611 
 612         if (ci->ci_decompress != NULL)
 613                 (void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level);
 614         else
 615                 bcopy(src, dst, d_len);
 616 
 617         if ((version ^ ZFS_HOST_BYTEORDER) & DDT_COMPRESS_BYTEORDER_MASK)
 618                 byteswap_uint64_array(dst, d_len);
 619 }
 620 
 621 ddt_t *
 622 ddt_select_by_checksum(spa_t *spa, enum zio_checksum c)
 623 {
 624         return (spa->spa_ddt[c]);
 625 }
 626 
 627 ddt_t *
 628 ddt_select(spa_t *spa, const blkptr_t *bp)
 629 {
 630         return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]);
 631 }
 632 
 633 void
 634 ddt_enter(ddt_t *ddt)
 635 {
 636         mutex_enter(&ddt->ddt_lock);
 637 }
 638 
 639 void
 640 ddt_exit(ddt_t *ddt)
 641 {
 642         mutex_exit(&ddt->ddt_lock);
 643 }
 644 
 645 static ddt_entry_t *
 646 ddt_alloc(const ddt_key_t *ddk)
 647 {
 648         ddt_entry_t *dde;
 649 
 650         dde = kmem_zalloc(sizeof (ddt_entry_t), KM_SLEEP);
 651         cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);
 652 
 653         dde->dde_key = *ddk;
 654 
 655         return (dde);
 656 }
 657 
 658 static void
 659 ddt_free(ddt_entry_t *dde)
 660 {
 661         ASSERT(!dde->dde_loading);
 662 
 663         for (int p = 0; p < DDT_PHYS_TYPES; p++)
 664                 ASSERT(dde->dde_lead_zio[p] == NULL);
 665 
 666         if (dde->dde_repair_data != NULL)
 667                 zio_buf_free(dde->dde_repair_data,
 668                     DDK_GET_PSIZE(&dde->dde_key));
 669 
 670         cv_destroy(&dde->dde_cv);
 671         kmem_free(dde, sizeof (*dde));
 672 }
 673 
 674 void
 675 ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
 676 {
 677         ASSERT(MUTEX_HELD(&ddt->ddt_lock));
 678 
 679         avl_remove(&ddt->ddt_tree, dde);
 680         ddt_free(dde);
 681 }
 682 
 683 ddt_entry_t *
 684 ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
 685 {
 686         ddt_entry_t *dde, dde_search;
 687         enum ddt_type type;
 688         enum ddt_class class;
 689         avl_index_t where;
 690         int error;
 691 
 692         ASSERT(MUTEX_HELD(&ddt->ddt_lock));
 693 
 694         ddt_key_fill(&dde_search.dde_key, bp);
 695 
 696         dde = avl_find(&ddt->ddt_tree, &dde_search, &where);
 697         if (dde == NULL) {
 698                 if (!add)
 699                         return (NULL);
 700                 dde = ddt_alloc(&dde_search.dde_key);
 701                 avl_insert(&ddt->ddt_tree, dde, where);
 702         }
 703 
 704         while (dde->dde_loading)
 705                 cv_wait(&dde->dde_cv, &ddt->ddt_lock);
 706 
 707         if (dde->dde_loaded)
 708                 return (dde);
 709 
 710         dde->dde_loading = B_TRUE;
 711 
 712         ddt_exit(ddt);
 713 
 714         error = ENOENT;
 715 
 716         for (type = 0; type < DDT_TYPES; type++) {
 717                 for (class = 0; class < DDT_CLASSES; class++) {
 718                         error = ddt_object_lookup(ddt, type, class, dde);
 719                         if (error != ENOENT)
 720                                 break;
 721                 }
 722                 if (error != ENOENT)
 723                         break;
 724         }
 725 
 726         ASSERT(error == 0 || error == ENOENT);
 727 
 728         ddt_enter(ddt);
 729 
 730         ASSERT(dde->dde_loaded == B_FALSE);
 731         ASSERT(dde->dde_loading == B_TRUE);
 732 
 733         dde->dde_type = type;        /* will be DDT_TYPES if no entry found */
 734         dde->dde_class = class;      /* will be DDT_CLASSES if no entry found */
 735         dde->dde_loaded = B_TRUE;
 736         dde->dde_loading = B_FALSE;
 737 
 738         if (error == 0)
 739                 ddt_stat_update(ddt, dde, -1ULL);
 740 
 741         cv_broadcast(&dde->dde_cv);
 742 
 743         return (dde);
 744 }
 745 
 746 void
 747 ddt_prefetch(spa_t *spa, const blkptr_t *bp)
 748 {
 749         ddt_t *ddt;
 750         ddt_entry_t dde;
 751 
 752         if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp))
 753                 return;
 754 
 755         /*
 756          * We only remove the DDT once all tables are empty and only
 757          * prefetch dedup blocks when there are entries in the DDT.
 758          * Thus no locking is required as the DDT can't disappear on us.
 759          */
 760         ddt = ddt_select(spa, bp);
 761         ddt_key_fill(&dde.dde_key, bp);
 762 
 763         for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 764                 for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
 765                         ddt_object_prefetch(ddt, type, class, &dde);
 766                 }
 767         }
 768 }
 769 
 770 int
 771 ddt_entry_compare(const void *x1, const void *x2)
 772 {
 773         const ddt_entry_t *dde1 = x1;
 774         const ddt_entry_t *dde2 = x2;
 775         const uint64_t *u1 = (const uint64_t *)&dde1->dde_key;
 776         const uint64_t *u2 = (const uint64_t *)&dde2->dde_key;
 777 
 778         for (int i = 0; i < DDT_KEY_WORDS; i++) {
 779                 if (u1[i] < u2[i])
 780                         return (-1);
 781                 if (u1[i] > u2[i])
 782                         return (1);
 783         }
 784 
 785         return (0);
 786 }
 787 
 788 static ddt_t *
 789 ddt_table_alloc(spa_t *spa, enum zio_checksum c)
 790 {
 791         ddt_t *ddt;
 792 
 793         ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP);
 794 
 795         mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL);
 796         avl_create(&ddt->ddt_tree, ddt_entry_compare,
 797             sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
 798         avl_create(&ddt->ddt_repair_tree, ddt_entry_compare,
 799             sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
 800         ddt->ddt_checksum = c;
 801         ddt->ddt_spa = spa;
 802         ddt->ddt_os = spa->spa_meta_objset;
 803 
 804         return (ddt);
 805 }
 806 
 807 static void
 808 ddt_table_free(ddt_t *ddt)
 809 {
 810         ASSERT(avl_numnodes(&ddt->ddt_tree) == 0);
 811         ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0);
 812         avl_destroy(&ddt->ddt_tree);
 813         avl_destroy(&ddt->ddt_repair_tree);
 814         mutex_destroy(&ddt->ddt_lock);
 815         kmem_free(ddt, sizeof (*ddt));
 816 }
 817 
 818 void
 819 ddt_create(spa_t *spa)
 820 {
 821         spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM;
 822 
 823         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++)
 824                 spa->spa_ddt[c] = ddt_table_alloc(spa, c);
 825 }
 826 
 827 int
 828 ddt_load(spa_t *spa)
 829 {
 830         int error;
 831 
 832         ddt_create(spa);
 833 
 834         error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 835             DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
 836             &spa->spa_ddt_stat_object);
 837 
 838         if (error)
 839                 return (error == ENOENT ? 0 : error);
 840 
 841         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 842                 ddt_t *ddt = spa->spa_ddt[c];
 843                 for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 844                         for (enum ddt_class class = 0; class < DDT_CLASSES;
 845                             class++) {
 846                                 error = ddt_object_load(ddt, type, class);
 847                                 if (error != 0 && error != ENOENT)
 848                                         return (error);
 849                         }
 850                 }
 851 
 852                 /*
 853                  * Seed the cached histograms.
 854                  */
 855                 bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
 856                     sizeof (ddt->ddt_histogram));
 857         }
 858 
 859         return (0);
 860 }
 861 
 862 void
 863 ddt_unload(spa_t *spa)
 864 {
 865         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 866                 if (spa->spa_ddt[c]) {
 867                         ddt_table_free(spa->spa_ddt[c]);
 868                         spa->spa_ddt[c] = NULL;
 869                 }
 870         }
 871 }
 872 
 873 boolean_t
 874 ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp)
 875 {
 876         ddt_t *ddt;
 877         ddt_entry_t dde;
 878 
 879         if (!BP_GET_DEDUP(bp))
 880                 return (B_FALSE);
 881 
 882         if (max_class == DDT_CLASS_UNIQUE)
 883                 return (B_TRUE);
 884 
 885         ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)];
 886 
 887         ddt_key_fill(&dde.dde_key, bp);
 888 
 889         for (enum ddt_type type = 0; type < DDT_TYPES; type++)
 890                 for (enum ddt_class class = 0; class <= max_class; class++)
 891                         if (ddt_object_lookup(ddt, type, class, &dde) == 0)
 892                                 return (B_TRUE);
 893 
 894         return (B_FALSE);
 895 }
 896 
 897 ddt_entry_t *
 898 ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
 899 {
 900         ddt_key_t ddk;
 901         ddt_entry_t *dde;
 902 
 903         ddt_key_fill(&ddk, bp);
 904 
 905         dde = ddt_alloc(&ddk);
 906 
 907         for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 908                 for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
 909                         /*
 910                          * We can only do repair if there are multiple copies
 911                          * of the block.  For anything in the UNIQUE class,
 912                          * there's definitely only one copy, so don't even try.
 913                          */
 914                         if (class != DDT_CLASS_UNIQUE &&
 915                             ddt_object_lookup(ddt, type, class, dde) == 0)
 916                                 return (dde);
 917                 }
 918         }
 919 
 920         bzero(dde->dde_phys, sizeof (dde->dde_phys));
 921 
 922         return (dde);
 923 }
 924 
 925 void
 926 ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)
 927 {
 928         avl_index_t where;
 929 
 930         ddt_enter(ddt);
 931 
 932         if (dde->dde_repair_data != NULL && spa_writeable(ddt->ddt_spa) &&
 933             avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL)
 934                 avl_insert(&ddt->ddt_repair_tree, dde, where);
 935         else
 936                 ddt_free(dde);
 937 
 938         ddt_exit(ddt);
 939 }
 940 
 941 static void
 942 ddt_repair_entry_done(zio_t *zio)
 943 {
 944         ddt_entry_t *rdde = zio->io_private;
 945 
 946         ddt_free(rdde);
 947 }
 948 
 949 static void
 950 ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
 951 {
 952         ddt_phys_t *ddp = dde->dde_phys;
 953         ddt_phys_t *rddp = rdde->dde_phys;
 954         ddt_key_t *ddk = &dde->dde_key;
 955         ddt_key_t *rddk = &rdde->dde_key;
 956         zio_t *zio;
 957         blkptr_t blk;
 958 
 959         zio = zio_null(rio, rio->io_spa, NULL,
 960             ddt_repair_entry_done, rdde, rio->io_flags);
 961 
 962         for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) {
 963                 if (ddp->ddp_phys_birth == 0 ||
 964                     ddp->ddp_phys_birth != rddp->ddp_phys_birth ||
 965                     bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva)))
 966                         continue;
 967                 ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
 968                 zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
 969                     rdde->dde_repair_data, DDK_GET_PSIZE(rddk), NULL, NULL,
 970                     ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL));
 971         }
 972 
 973         zio_nowait(zio);
 974 }
 975 
 976 static void
 977 ddt_repair_table(ddt_t *ddt, zio_t *rio)
 978 {
 979         spa_t *spa = ddt->ddt_spa;
 980         ddt_entry_t *dde, *rdde_next, *rdde;
 981         avl_tree_t *t = &ddt->ddt_repair_tree;
 982         blkptr_t blk;
 983 
 984         if (spa_sync_pass(spa) > 1)
 985                 return;
 986 
 987         ddt_enter(ddt);
 988         for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) {
 989                 rdde_next = AVL_NEXT(t, rdde);
 990                 avl_remove(&ddt->ddt_repair_tree, rdde);
 991                 ddt_exit(ddt);
 992                 ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk);
 993                 dde = ddt_repair_start(ddt, &blk);
 994                 ddt_repair_entry(ddt, dde, rdde, rio);
 995                 ddt_repair_done(ddt, dde);
 996                 ddt_enter(ddt);
 997         }
 998         ddt_exit(ddt);
 999 }
1000 
1001 static void
1002 ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
1003 {
1004         dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool;
1005         ddt_phys_t *ddp = dde->dde_phys;
1006         ddt_key_t *ddk = &dde->dde_key;
1007         enum ddt_type otype = dde->dde_type;
1008         enum ddt_type ntype = DDT_TYPE_CURRENT;
1009         enum ddt_class oclass = dde->dde_class;
1010         enum ddt_class nclass;
1011         uint64_t total_refcnt = 0;
1012 
1013         ASSERT(dde->dde_loaded);
1014         ASSERT(!dde->dde_loading);
1015 
1016         for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
1017                 ASSERT(dde->dde_lead_zio[p] == NULL);
1018                 ASSERT((int64_t)ddp->ddp_refcnt >= 0);
1019                 if (ddp->ddp_phys_birth == 0) {
1020                         ASSERT(ddp->ddp_refcnt == 0);
1021                         continue;
1022                 }
1023                 if (p == DDT_PHYS_DITTO) {
1024                         if (ddt_ditto_copies_needed(ddt, dde, NULL) == 0)
1025                                 ddt_phys_free(ddt, ddk, ddp, txg);
1026                         continue;
1027                 }
1028                 if (ddp->ddp_refcnt == 0)
1029                         ddt_phys_free(ddt, ddk, ddp, txg);
1030                 total_refcnt += ddp->ddp_refcnt;
1031         }
1032 
1033         if (dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth != 0)
1034                 nclass = DDT_CLASS_DITTO;
1035         else if (total_refcnt > 1)
1036                 nclass = DDT_CLASS_DUPLICATE;
1037         else
1038                 nclass = DDT_CLASS_UNIQUE;
1039 
1040         if (otype != DDT_TYPES &&
1041             (otype != ntype || oclass != nclass || total_refcnt == 0)) {
1042                 VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0);
1043                 ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT);
1044         }
1045 
1046         if (total_refcnt != 0) {
1047                 dde->dde_type = ntype;
1048                 dde->dde_class = nclass;
1049                 ddt_stat_update(ddt, dde, 0);
1050                 if (!ddt_object_exists(ddt, ntype, nclass))
1051                         ddt_object_create(ddt, ntype, nclass, tx);
1052                 VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0);
1053 
1054                 /*
1055                  * If the class changes, the order that we scan this bp
1056                  * changes.  If it decreases, we could miss it, so
1057                  * scan it right now.  (This covers both class changing
1058                  * while we are doing ddt_walk(), and when we are
1059                  * traversing.)
1060                  */
1061                 if (nclass < oclass) {
1062                         dsl_scan_ddt_entry(dp->dp_scan,
1063                             ddt->ddt_checksum, dde, tx);
1064                 }
1065         }
1066 }
1067 
1068 static void
1069 ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
1070 {
1071         spa_t *spa = ddt->ddt_spa;
1072         ddt_entry_t *dde;
1073         void *cookie = NULL;
1074 
1075         if (avl_numnodes(&ddt->ddt_tree) == 0)
1076                 return;
1077 
1078         ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP);
1079 
1080         if (spa->spa_ddt_stat_object == 0) {
1081                 spa->spa_ddt_stat_object = zap_create(ddt->ddt_os,
1082                     DMU_OT_DDT_STATS, DMU_OT_NONE, 0, tx);
1083                 VERIFY(zap_add(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT,
1084                     DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
1085                     &spa->spa_ddt_stat_object, tx) == 0);
1086         }
1087 
1088         while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
1089                 ddt_sync_entry(ddt, dde, tx, txg);
1090                 ddt_free(dde);
1091         }
1092 
1093         for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
1094                 uint64_t count = 0;
1095                 for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
1096                         if (ddt_object_exists(ddt, type, class)) {
1097                                 ddt_object_sync(ddt, type, class, tx);
1098                                 count += ddt_object_count(ddt, type, class);
1099                         }
1100                 }
1101                 for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
1102                         if (count == 0 && ddt_object_exists(ddt, type, class))
1103                                 ddt_object_destroy(ddt, type, class, tx);
1104                 }
1105         }
1106 
1107         bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
1108             sizeof (ddt->ddt_histogram));
1109 }
1110 
1111 void
1112 ddt_sync(spa_t *spa, uint64_t txg)
1113 {
1114         dmu_tx_t *tx;
1115         zio_t *rio = zio_root(spa, NULL, NULL,
1116             ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
1117 
1118         ASSERT(spa_syncing_txg(spa) == txg);
1119 
1120         tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1121 
1122         for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
1123                 ddt_t *ddt = spa->spa_ddt[c];
1124                 if (ddt == NULL)
1125                         continue;
1126                 ddt_sync_table(ddt, tx, txg);
1127                 ddt_repair_table(ddt, rio);
1128         }
1129 
1130         (void) zio_wait(rio);
1131 
1132         dmu_tx_commit(tx);
1133 }
1134 
1135 int
1136 ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde)
1137 {
1138         do {
1139                 do {
1140                         do {
1141                                 ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum];
1142                                 int error = ENOENT;
1143                                 if (ddt_object_exists(ddt, ddb->ddb_type,
1144                                     ddb->ddb_class)) {
1145                                         error = ddt_object_walk(ddt,
1146                                             ddb->ddb_type, ddb->ddb_class,
1147                                             &ddb->ddb_cursor, dde);
1148                                 }
1149                                 dde->dde_type = ddb->ddb_type;
1150                                 dde->dde_class = ddb->ddb_class;
1151                                 if (error == 0)
1152                                         return (0);
1153                                 if (error != ENOENT)
1154                                         return (error);
1155                                 ddb->ddb_cursor = 0;
1156                         } while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS);
1157                         ddb->ddb_checksum = 0;
1158                 } while (++ddb->ddb_type < DDT_TYPES);
1159                 ddb->ddb_type = 0;
1160         } while (++ddb->ddb_class < DDT_CLASSES);
1161 
1162         return (ENOENT);
1163 }