1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2013 by Delphix. All rights reserved.
  24  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  25  * Copyright 2013 Saso Kiselkov. All rights reserved.
  26  */
  27 
  28 #include <sys/zfs_context.h>
  29 #include <sys/spa.h>
  30 #include <sys/spa_impl.h>
  31 #include <sys/zio.h>
  32 #include <sys/zio_checksum.h>
  33 #include <sys/zil.h>
  34 #include <zfs_fletcher.h>
  35 
  36 /*
  37  * Checksum vectors.
  38  *
  39  * In the SPA, everything is checksummed.  We support checksum vectors
  40  * for three distinct reasons:
  41  *
  42  *   1. Different kinds of data need different levels of protection.
  43  *      For SPA metadata, we always want a very strong checksum.
  44  *      For user data, we let users make the trade-off between speed
  45  *      and checksum strength.
  46  *
  47  *   2. Cryptographic hash and MAC algorithms are an area of active research.
  48  *      It is likely that in future hash functions will be at least as strong
  49  *      as current best-of-breed, and may be substantially faster as well.
  50  *      We want the ability to take advantage of these new hashes as soon as
  51  *      they become available.
  52  *
  53  *   3. If someone develops hardware that can compute a strong hash quickly,
  54  *      we want the ability to take advantage of that hardware.
  55  *
  56  * Of course, we don't want a checksum upgrade to invalidate existing
  57  * data, so we store the checksum *function* in eight bits of the bp.
  58  * This gives us room for up to 256 different checksum functions.
  59  *
  60  * When writing a block, we always checksum it with the latest-and-greatest
  61  * checksum function of the appropriate strength.  When reading a block,
  62  * we compare the expected checksum against the actual checksum, which we
  63  * compute via the checksum function specified by BP_GET_CHECKSUM(bp).
  64  *
  65  * SALTED CHECKSUMS
  66  *
  67  * To enable the use of non-cryptographically secure hash algorithms in
  68  * dedup we introduce the notion of salted checksums (MACs, really). A salted
  69  * checksum is fed both a random 256-bit value (the salt) and the data to be
  70  * checksummed. This salt is kept secret (stored on the pool, but never shown
  71  * to the user), thus even if an attacker knew of collision weaknesses in the
  72  * hash algorithm, they won't be able to mount a known plaintext attack on
  73  * the DDT, since the actual hash value cannot be known ahead of time. How
  74  * the salt is used is algorithm-specific (some might simply prefix it to the
  75  * data block, others might need to utilize a full-blown HMAC). On disk the
  76  * salt is stored in a ZAP object in the MOS (DMU_POOL_CHECKSUM_SALT).
  77  *
  78  * CONTEXT TEMPLATES
  79  *
  80  * Some hashing algorithms need to perform a substantial amount of
  81  * initialization work (e.g. salted checksums above may need to pre-hash the
  82  * salt) before being able to process data. Performing this redundant work
  83  * for each block would be very wasteful, so we instead allow a checksum
  84  * algorithm to do the work once (the first time it's used) and then keep
  85  * this pre-initialized context as a template inside the spa_t
  86  * (spa_cksum_tmpls). If the zio_checksum_info_t contains non-NULL
  87  * ci_tmpl_init and ci_tmpl_free callbacks, they are used to construct and
  88  * destruct the pre-initialized checksum context. The pre-initialized
  89  * context is then reused during each checksum invocation and passed to the
  90  * checksum function.
  91  */
  92 
  93 /*ARGSUSED*/
  94 static void
  95 zio_checksum_off(const void *buf, uint64_t size, const zio_cksum_salt_t *salt,
  96     const void *ctx_template, zio_cksum_t *zcp)
  97 {
  98         ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
  99 }
 100 
 101 zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
 102         {{NULL, NULL}, NULL, NULL, 0, 0, 0, 0, "inherit"},
 103         {{NULL, NULL}, NULL, NULL, 0, 0, 0, 0, "on"},
 104         {{zio_checksum_off,             zio_checksum_off},
 105             NULL, NULL, 0, 0, 0, 0, "off"},
 106         {{zio_checksum_SHA256,          zio_checksum_SHA256},
 107             NULL, NULL, 1, 1, 0, 0, "label"},
 108         {{zio_checksum_SHA256,          zio_checksum_SHA256},
 109             NULL, NULL, 1, 1, 0, 0, "gang_header"},
 110         {{fletcher_2_native,            fletcher_2_byteswap},
 111             NULL, NULL, 0, 1, 0, 0, "zilog"},
 112         {{fletcher_2_native,            fletcher_2_byteswap},
 113             NULL, NULL, 0, 0, 0, 0, "fletcher2"},
 114         {{fletcher_4_native,            fletcher_4_byteswap},
 115             NULL, NULL, 1, 0, 0, 0, "fletcher4"},
 116         {{zio_checksum_SHA256,          zio_checksum_SHA256},
 117             NULL, NULL, 1, 0, 1, 0, "sha256"},
 118         {{fletcher_4_native,            fletcher_4_byteswap},
 119             NULL, NULL, 0, 1, 0, 0, "zilog2"},
 120         {{zio_checksum_off,             zio_checksum_off},
 121             NULL, NULL, 0, 0, 0, 0, "noparity"},
 122         {{zio_checksum_SHA512_native, zio_checksum_SHA512_byteswap},
 123             NULL, NULL, 1, 0, 1, 0, "sha512"},
 124         {{zio_checksum_skein_native,    zio_checksum_skein_byteswap},
 125             zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
 126             1, 0, 1, 1, "skein"},
 127         {{zio_checksum_edonr_native,    zio_checksum_edonr_byteswap},
 128             zio_checksum_edonr_tmpl_init, zio_checksum_edonr_tmpl_free,
 129             1, 0, 1, 1, "edonr"}
 130 };
 131 
 132 enum zio_checksum
 133 zio_checksum_select(enum zio_checksum child, enum zio_checksum parent)
 134 {
 135         ASSERT(child < ZIO_CHECKSUM_FUNCTIONS);
 136         ASSERT(parent < ZIO_CHECKSUM_FUNCTIONS);
 137         ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON);
 138 
 139         if (child == ZIO_CHECKSUM_INHERIT)
 140                 return (parent);
 141 
 142         if (child == ZIO_CHECKSUM_ON)
 143                 return (ZIO_CHECKSUM_ON_VALUE);
 144 
 145         return (child);
 146 }
 147 
 148 enum zio_checksum
 149 zio_checksum_dedup_select(spa_t *spa, enum zio_checksum child,
 150     enum zio_checksum parent)
 151 {
 152         ASSERT((child & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS);
 153         ASSERT((parent & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS);
 154         ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON);
 155 
 156         if (child == ZIO_CHECKSUM_INHERIT)
 157                 return (parent);
 158 
 159         if (child == ZIO_CHECKSUM_ON)
 160                 return (spa_dedup_checksum(spa));
 161 
 162         if (child == (ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY))
 163                 return (spa_dedup_checksum(spa) | ZIO_CHECKSUM_VERIFY);
 164 
 165         ASSERT(zio_checksum_table[child & ZIO_CHECKSUM_MASK].ci_dedup ||
 166             (child & ZIO_CHECKSUM_VERIFY) || child == ZIO_CHECKSUM_OFF);
 167 
 168         return (child);
 169 }
 170 
 171 /*
 172  * Set the external verifier for a gang block based on <vdev, offset, txg>,
 173  * a tuple which is guaranteed to be unique for the life of the pool.
 174  */
 175 static void
 176 zio_checksum_gang_verifier(zio_cksum_t *zcp, blkptr_t *bp)
 177 {
 178         dva_t *dva = BP_IDENTITY(bp);
 179         uint64_t txg = BP_PHYSICAL_BIRTH(bp);
 180 
 181         ASSERT(BP_IS_GANG(bp));
 182 
 183         ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
 184 }
 185 
 186 /*
 187  * Set the external verifier for a label block based on its offset.
 188  * The vdev is implicit, and the txg is unknowable at pool open time --
 189  * hence the logic in vdev_uberblock_load() to find the most recent copy.
 190  */
 191 static void
 192 zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
 193 {
 194         ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
 195 }
 196 
 197 /*
 198  * Calls the template init function of a checksum which supports context
 199  * templates and installs the template into the spa_t.
 200  */
 201 static void
 202 zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
 203 {
 204         zio_checksum_info_t *ci = &zio_checksum_table[checksum];
 205 
 206         VERIFY(ci->ci_tmpl_init != NULL && ci->ci_tmpl_free != NULL);
 207         mutex_enter(&spa->spa_cksum_tmpls_lock);
 208         if (spa->spa_cksum_tmpls[checksum] == NULL) {
 209                 spa->spa_cksum_tmpls[checksum] =
 210                     ci->ci_tmpl_init(&spa->spa_cksum_salt);
 211                 VERIFY(spa->spa_cksum_tmpls[checksum] != NULL);
 212         }
 213         mutex_exit(&spa->spa_cksum_tmpls_lock);
 214 }
 215 
 216 /*
 217  * Generate the checksum.
 218  */
 219 void
 220 zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
 221         void *data, uint64_t size)
 222 {
 223         blkptr_t *bp = zio->io_bp;
 224         uint64_t offset = zio->io_offset;
 225         zio_checksum_info_t *ci = &zio_checksum_table[checksum];
 226         zio_cksum_t cksum;
 227         spa_t *spa = zio->io_spa;
 228 
 229         ASSERT((uint_t)checksum < ZIO_CHECKSUM_FUNCTIONS);
 230         ASSERT(ci->ci_func[0] != NULL);
 231 
 232         if (ci->ci_tmpl_init != NULL && spa->spa_cksum_tmpls[checksum] == NULL)
 233                 zio_checksum_template_init(checksum, spa);
 234 
 235         if (ci->ci_eck) {
 236                 zio_eck_t *eck;
 237 
 238                 if (checksum == ZIO_CHECKSUM_ZILOG2) {
 239                         zil_chain_t *zilc = data;
 240 
 241                         size = P2ROUNDUP_TYPED(zilc->zc_nused, ZIL_MIN_BLKSZ,
 242                             uint64_t);
 243                         eck = &zilc->zc_eck;
 244                 } else {
 245                         eck = (zio_eck_t *)((char *)data + size) - 1;
 246                 }
 247                 if (checksum == ZIO_CHECKSUM_GANG_HEADER)
 248                         zio_checksum_gang_verifier(&eck->zec_cksum, bp);
 249                 else if (checksum == ZIO_CHECKSUM_LABEL)
 250                         zio_checksum_label_verifier(&eck->zec_cksum, offset);
 251                 else
 252                         bp->blk_cksum = eck->zec_cksum;
 253                 eck->zec_magic = ZEC_MAGIC;
 254                 ci->ci_func[0](data, size, &spa->spa_cksum_salt,
 255                     spa->spa_cksum_tmpls[checksum], &cksum);
 256                 eck->zec_cksum = cksum;
 257         } else {
 258                 ci->ci_func[0](data, size, &spa->spa_cksum_salt,
 259                     spa->spa_cksum_tmpls[checksum], &bp->blk_cksum);
 260         }
 261 }
 262 
 263 int
 264 zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
 265 {
 266         blkptr_t *bp = zio->io_bp;
 267         uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum :
 268             (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
 269         int byteswap;
 270         int error;
 271         uint64_t size = (bp == NULL ? zio->io_size :
 272             (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp)));
 273         uint64_t offset = zio->io_offset;
 274         void *data = zio->io_data;
 275         zio_checksum_info_t *ci = &zio_checksum_table[checksum];
 276         zio_cksum_t actual_cksum, expected_cksum, verifier;
 277         spa_t *spa = zio->io_spa;
 278 
 279         if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
 280                 return (SET_ERROR(EINVAL));
 281 
 282         if (ci->ci_tmpl_init != NULL && spa->spa_cksum_tmpls[checksum] == NULL)
 283                 zio_checksum_template_init(checksum, spa);
 284 
 285         if (ci->ci_eck) {
 286                 zio_eck_t *eck;
 287 
 288                 if (checksum == ZIO_CHECKSUM_ZILOG2) {
 289                         zil_chain_t *zilc = data;
 290                         uint64_t nused;
 291 
 292                         eck = &zilc->zc_eck;
 293                         if (eck->zec_magic == ZEC_MAGIC)
 294                                 nused = zilc->zc_nused;
 295                         else if (eck->zec_magic == BSWAP_64(ZEC_MAGIC))
 296                                 nused = BSWAP_64(zilc->zc_nused);
 297                         else
 298                                 return (SET_ERROR(ECKSUM));
 299 
 300                         if (nused > size)
 301                                 return (SET_ERROR(ECKSUM));
 302 
 303                         size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t);
 304                 } else {
 305                         eck = (zio_eck_t *)((char *)data + size) - 1;
 306                 }
 307 
 308                 if (checksum == ZIO_CHECKSUM_GANG_HEADER)
 309                         zio_checksum_gang_verifier(&verifier, bp);
 310                 else if (checksum == ZIO_CHECKSUM_LABEL)
 311                         zio_checksum_label_verifier(&verifier, offset);
 312                 else
 313                         verifier = bp->blk_cksum;
 314 
 315                 byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));
 316 
 317                 if (byteswap)
 318                         byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
 319 
 320                 expected_cksum = eck->zec_cksum;
 321                 eck->zec_cksum = verifier;
 322                 ci->ci_func[byteswap](data, size, &spa->spa_cksum_salt,
 323                     spa->spa_cksum_tmpls[checksum], &actual_cksum);
 324                 eck->zec_cksum = expected_cksum;
 325 
 326                 if (byteswap)
 327                         byteswap_uint64_array(&expected_cksum,
 328                             sizeof (zio_cksum_t));
 329         } else {
 330                 ASSERT(!BP_IS_GANG(bp));
 331                 byteswap = BP_SHOULD_BYTESWAP(bp);
 332                 expected_cksum = bp->blk_cksum;
 333                 ci->ci_func[byteswap](data, size, &spa->spa_cksum_salt,
 334                     spa->spa_cksum_tmpls[checksum], &actual_cksum);
 335         }
 336 
 337         info->zbc_expected = expected_cksum;
 338         info->zbc_actual = actual_cksum;
 339         info->zbc_checksum_name = ci->ci_name;
 340         info->zbc_byteswapped = byteswap;
 341         info->zbc_injected = 0;
 342         info->zbc_has_cksum = 1;
 343 
 344         if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum))
 345                 return (SET_ERROR(ECKSUM));
 346 
 347         if (zio_injection_enabled && !zio->io_error &&
 348             (error = zio_handle_fault_injection(zio, ECKSUM)) != 0) {
 349 
 350                 info->zbc_injected = 1;
 351                 return (error);
 352         }
 353 
 354         return (0);
 355 }
 356 
 357 /*
 358  * Called by a spa_t that's about to be deallocated. This steps through
 359  * all of the checksum context templates and deallocates any that were
 360  * initialized using the algorithm-specific template init function.
 361  */
 362 void
 363 zio_checksum_templates_free(spa_t *spa)
 364 {
 365         for (int checksum = 0; checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
 366                 if (spa->spa_cksum_tmpls[checksum] != NULL) {
 367                         zio_checksum_info_t *ci = &zio_checksum_table[checksum];
 368 
 369                         VERIFY(ci->ci_tmpl_free != NULL);
 370                         ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
 371                         spa->spa_cksum_tmpls[checksum] = NULL;
 372                 }
 373         }
 374 }