Print this page
    
4185 New hash algorithm support
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/fs/zfs/zio_checksum.c
          +++ new/usr/src/uts/common/fs/zfs/zio_checksum.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  
    | 
      ↓ open down ↓ | 
    14 lines elided | 
    
      ↑ open up ↑ | 
  
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2013 by Delphix. All rights reserved.
  24   24   * Copyright (c) 2013, Joyent, Inc. All rights reserved.
       25 + * Copyright 2013 Saso Kiselkov. All rights reserved.
  25   26   */
  26   27  
  27   28  #include <sys/zfs_context.h>
  28   29  #include <sys/spa.h>
       30 +#include <sys/spa_impl.h>
  29   31  #include <sys/zio.h>
  30   32  #include <sys/zio_checksum.h>
  31   33  #include <sys/zil.h>
  32   34  #include <zfs_fletcher.h>
  33   35  
  34   36  /*
  35   37   * Checksum vectors.
  36   38   *
  37   39   * In the SPA, everything is checksummed.  We support checksum vectors
  38   40   * for three distinct reasons:
  39   41   *
  40   42   *   1. Different kinds of data need different levels of protection.
  41   43   *      For SPA metadata, we always want a very strong checksum.
  42   44   *      For user data, we let users make the trade-off between speed
  43   45   *      and checksum strength.
  44   46   *
  45   47   *   2. Cryptographic hash and MAC algorithms are an area of active research.
  46   48   *      It is likely that future hash functions will be at least as strong
  47   49   *      as current best-of-breed, and may be substantially faster as well.
  48   50   *      We want the ability to take advantage of these new hashes as soon as
  49   51   *      they become available.
  50   52   *
  51   53   *   3. If someone develops hardware that can compute a strong hash quickly,
  
    | 
      ↓ open down ↓ | 
    13 lines elided | 
    
      ↑ open up ↑ | 
  
  52   54   *      we want the ability to take advantage of that hardware.
  53   55   *
  54   56   * Of course, we don't want a checksum upgrade to invalidate existing
  55   57   * data, so we store the checksum *function* in eight bits of the bp.
  56   58   * This gives us room for up to 256 different checksum functions.
  57   59   *
  58   60   * When writing a block, we always checksum it with the latest-and-greatest
  59   61   * checksum function of the appropriate strength.  When reading a block,
  60   62   * we compare the expected checksum against the actual checksum, which we
  61   63   * compute via the checksum function specified by BP_GET_CHECKSUM(bp).
       64 + *
       65 + * SALTED CHECKSUMS
       66 + *
       67 + * To enable the use of hash algorithms that are not cryptographically
       68 + * secure in dedup, we introduce salted checksums (MACs, really). A salted
       69 + * checksum is fed both a random 256-bit value (the salt) and the data to be
       70 + * checksummed. This salt is kept secret (stored on the pool, but never shown
       71 + * to the user), thus even if an attacker knew of collision weaknesses in the
       72 + * hash algorithm, they won't be able to mount a known plaintext attack on
       73 + * the DDT, since the actual hash value cannot be known ahead of time. How
       74 + * the salt is used is algorithm-specific (some might simply prefix it to the
       75 + * data block, others might need to utilize a full-blown HMAC). On disk the
       76 + * salt is stored in a ZAP object in the MOS (DMU_POOL_CHECKSUM_SALT).
       77 + *
       78 + * CONTEXT TEMPLATES
       79 + *
       80 + * Some hashing algorithms need to perform a substantial amount of
       81 + * initialization work (e.g. salted checksums above may need to pre-hash the
       82 + * salt) before being able to process data. Performing this redundant work
       83 + * for each block would be very wasteful, so we instead allow a checksum
       84 + * algorithm to do the work once (the first time it's used) and then keep
       85 + * this pre-initialized context as a template inside the spa_t
       86 + * (spa_cksum_tmpls). If the zio_checksum_info_t contains non-NULL
       87 + * ci_tmpl_init and ci_tmpl_free callbacks, they are used to construct and
       88 + * destruct the pre-initialized checksum context. The pre-initialized
       89 + * context is then reused during each checksum invocation and passed to the
       90 + * checksum function.
  62   91   */
  63   92  
  64   93  /*ARGSUSED*/
  65   94  static void
  66      -zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
       95 +zio_checksum_off(const void *buf, uint64_t size, const zio_cksum_salt_t *salt,
       96 +    const void *ctx_template, zio_cksum_t *zcp)
  67   97  {
                   /*
                    * Checksum function used by the "off" and "noparity" table
                    * entries: buf, size, salt and ctx_template are ignored and
                    * zeros are passed for all four checksum words of *zcp.
                    */
  68   98          ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
  69   99  }
  70  100  
  71  101  zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
                   /*
                    * Entries are looked up as zio_checksum_table[checksum], so their
                    * order has to line up with the values of enum zio_checksum.
                    *
                    * Each entry holds: the { native, byteswap } function pair
                    * (ci_func), the optional context-template constructor and
                    * destructor (ci_tmpl_init, ci_tmpl_free; NULL when the
                    * algorithm keeps no template), four integer flags, and the
                    * algorithm name (ci_name).
                    *
                    * NOTE(review): this change adds the two template callbacks and
                    * a fourth flag that is set only for "skein" and "edonr" --
                    * presumably marking salted checksums.  Confirm the meaning and
                    * order of the flags against zio_checksum_info_t in
                    * sys/zio_checksum.h.
                    */
  72      -        {{NULL,                 NULL},                  0, 0, 0, "inherit"},
  73      -        {{NULL,                 NULL},                  0, 0, 0, "on"},
  74      -        {{zio_checksum_off,     zio_checksum_off},      0, 0, 0, "off"},
  75      -        {{zio_checksum_SHA256,  zio_checksum_SHA256},   1, 1, 0, "label"},
  76      -        {{zio_checksum_SHA256,  zio_checksum_SHA256},   1, 1, 0, "gang_header"},
  77      -        {{fletcher_2_native,    fletcher_2_byteswap},   0, 1, 0, "zilog"},
  78      -        {{fletcher_2_native,    fletcher_2_byteswap},   0, 0, 0, "fletcher2"},
  79      -        {{fletcher_4_native,    fletcher_4_byteswap},   1, 0, 0, "fletcher4"},
  80      -        {{zio_checksum_SHA256,  zio_checksum_SHA256},   1, 0, 1, "sha256"},
  81      -        {{fletcher_4_native,    fletcher_4_byteswap},   0, 1, 0, "zilog2"},
  82      -        {{zio_checksum_off,     zio_checksum_off},      0, 0, 0, "noparity"},
      102 +        {{NULL, NULL}, NULL, NULL, 0, 0, 0, 0, "inherit"},
      103 +        {{NULL, NULL}, NULL, NULL, 0, 0, 0, 0, "on"},
      104 +        {{zio_checksum_off,             zio_checksum_off},
      105 +            NULL, NULL, 0, 0, 0, 0, "off"},
      106 +        {{zio_checksum_SHA256,          zio_checksum_SHA256},
      107 +            NULL, NULL, 1, 1, 0, 0, "label"},
      108 +        {{zio_checksum_SHA256,          zio_checksum_SHA256},
      109 +            NULL, NULL, 1, 1, 0, 0, "gang_header"},
      110 +        {{fletcher_2_native,            fletcher_2_byteswap},
      111 +            NULL, NULL, 0, 1, 0, 0, "zilog"},
      112 +        {{fletcher_2_native,            fletcher_2_byteswap},
      113 +            NULL, NULL, 0, 0, 0, 0, "fletcher2"},
      114 +        {{fletcher_4_native,            fletcher_4_byteswap},
      115 +            NULL, NULL, 1, 0, 0, 0, "fletcher4"},
      116 +        {{zio_checksum_SHA256,          zio_checksum_SHA256},
      117 +            NULL, NULL, 1, 0, 1, 0, "sha256"},
      118 +        {{fletcher_4_native,            fletcher_4_byteswap},
      119 +            NULL, NULL, 0, 1, 0, 0, "zilog2"},
      120 +        {{zio_checksum_off,             zio_checksum_off},
      121 +            NULL, NULL, 0, 0, 0, 0, "noparity"},
      122 +        {{zio_checksum_SHA512_native, zio_checksum_SHA512_byteswap},
      123 +            NULL, NULL, 1, 0, 1, 0, "sha512"},
      124 +        {{zio_checksum_skein_native,    zio_checksum_skein_byteswap},
      125 +            zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
      126 +            1, 0, 1, 1, "skein"},
      127 +        {{zio_checksum_edonr_native,    zio_checksum_edonr_byteswap},
      128 +            zio_checksum_edonr_tmpl_init, zio_checksum_edonr_tmpl_free,
      129 +            1, 0, 1, 1, "edonr"}
  83  130  };
  84  131  
  85  132  enum zio_checksum
  86  133  zio_checksum_select(enum zio_checksum child, enum zio_checksum parent)
  87  134  {
                   /*
                    * Resolve a child's checksum setting against its parent's:
                    * INHERIT yields the parent's value, ON yields
                    * ZIO_CHECKSUM_ON_VALUE, and any other value is returned
                    * unchanged.  The parent must already be resolved, i.e. be
                    * neither INHERIT nor ON.
                    */
  88  135          ASSERT(child < ZIO_CHECKSUM_FUNCTIONS);
  89  136          ASSERT(parent < ZIO_CHECKSUM_FUNCTIONS);
  90  137          ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON);
  91  138  
  92  139          if (child == ZIO_CHECKSUM_INHERIT)
  93  140                  return (parent);
  94  141  
  95  142          if (child == ZIO_CHECKSUM_ON)
  96  143                  return (ZIO_CHECKSUM_ON_VALUE);
  97  144  
  98  145          return (child);
  99  146  }
 100  147  
 101  148  enum zio_checksum
 102  149  zio_checksum_dedup_select(spa_t *spa, enum zio_checksum child,
 103  150      enum zio_checksum parent)
 104  151  {
                   /*
                    * Dedup counterpart of zio_checksum_select().  Both values may
                    * carry the ZIO_CHECKSUM_VERIFY bit, so the table-range checks
                    * mask it off with ZIO_CHECKSUM_MASK first.
                    */
 105  152          ASSERT((child & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS);
 106  153          ASSERT((parent & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS);
 107  154          ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON);
 108  155  
 109  156          if (child == ZIO_CHECKSUM_INHERIT)
 110  157                  return (parent);
 111  158  
                   /* "on" resolves to the pool's dedup checksum ... */
 112  159          if (child == ZIO_CHECKSUM_ON)
 113  160                  return (spa_dedup_checksum(spa));
 114  161  
                   /* ... and "on" with verify keeps the verify bit set. */
 115  162          if (child == (ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY))
 116  163                  return (spa_dedup_checksum(spa) | ZIO_CHECKSUM_VERIFY);
 117  164  
                   /*
                    * An explicitly chosen algorithm must be flagged ci_dedup in the
                    * table, or be combined with verify, or be OFF.
                    */
 118  165          ASSERT(zio_checksum_table[child & ZIO_CHECKSUM_MASK].ci_dedup ||
 119  166              (child & ZIO_CHECKSUM_VERIFY) || child == ZIO_CHECKSUM_OFF);
 120  167  
 121  168          return (child);
 122  169  }
 123  170  
 124  171  /*
 125  172   * Set the external verifier for a gang block based on <vdev, offset, txg>,
 126  173   * a tuple which is guaranteed to be unique for the life of the pool.
 127  174   */
 128  175  static void
 129  176  zio_checksum_gang_verifier(zio_cksum_t *zcp, blkptr_t *bp)
 130  177  {
                   /* The vdev and offset come from the DVA given by BP_IDENTITY(). */
 131  178          dva_t *dva = BP_IDENTITY(bp);
                   /* The txg component is the bp's physical birth txg. */
 132  179          uint64_t txg = BP_PHYSICAL_BIRTH(bp);
 133  180  
 134  181          ASSERT(BP_IS_GANG(bp));
 135  182  
                   /* Checksum words are <vdev, offset, txg, 0>. */
 136  183          ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
 137  184  }
 138  185  
 139  186  /*
 140  187   * Set the external verifier for a label block based on its offset.
  
    | 
      ↓ open down ↓ | 
    48 lines elided | 
    
      ↑ open up ↑ | 
  
 141  188   * The vdev is implicit, and the txg is unknowable at pool open time --
 142  189   * hence the logic in vdev_uberblock_load() to find the most recent copy.
 143  190   */
 144  191  static void
 145  192  zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
 146  193  {
                   /* Only the first checksum word is used; it carries the offset. */
 147  194          ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
 148  195  }
 149  196  
 150  197  /*
      198 + * Calls the template init function of a checksum which supports context
      199 + * templates and installs the template into the spa_t.
      200 + */
      201 +static void
      202 +zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
      203 +{
      204 +        zio_checksum_info_t *ci = &zio_checksum_table[checksum];
      205 +
                   /*
                    * Must only be called for algorithms that provide both template
                    * callbacks; anything else is a fatal programming error.
                    */
      206 +        VERIFY(ci->ci_tmpl_init != NULL && ci->ci_tmpl_free != NULL);
                   /*
                    * The callers in this file test the slot for NULL without holding
                    * spa_cksum_tmpls_lock, so several threads can get here for the
                    * same checksum.  The slot is tested again under the lock so that
                    * the template is built and installed only once.
                    */
      207 +        mutex_enter(&spa->spa_cksum_tmpls_lock);
      208 +        if (spa->spa_cksum_tmpls[checksum] == NULL) {
      209 +                spa->spa_cksum_tmpls[checksum] =
      210 +                    ci->ci_tmpl_init(&spa->spa_cksum_salt);
                           /* A NULL template is treated as fatal, not retried. */
      211 +                VERIFY(spa->spa_cksum_tmpls[checksum] != NULL);
      212 +        }
      213 +        mutex_exit(&spa->spa_cksum_tmpls_lock);
      214 +}
      215 +
      216 +/*
 151  217   * Generate the checksum.
            *
            * Runs ci_func[0] (the native-order function) of the selected algorithm
            * over data.  For embedded-checksum algorithms (ci_eck) the result is
            * stored in the zio_eck_t inside the buffer; otherwise it is stored in
            * bp->blk_cksum.
 152  218   */
 153  219  void
 154  220  zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
 155  221          void *data, uint64_t size)
 156  222  {
 157  223          blkptr_t *bp = zio->io_bp;
 158  224          uint64_t offset = zio->io_offset;
 159  225          zio_checksum_info_t *ci = &zio_checksum_table[checksum];
 160  226          zio_cksum_t cksum;
      227 +        spa_t *spa = zio->io_spa;
 161  228  
                   /*
                    * NOTE(review): ci was already derived from checksum above, and
                    * these range checks are ASSERTs -- presumably compiled out of
                    * non-debug builds.  Confirm callers never pass an out-of-range
                    * value.
                    */
 162  229          ASSERT((uint_t)checksum < ZIO_CHECKSUM_FUNCTIONS);
 163  230          ASSERT(ci->ci_func[0] != NULL);
 164  231  
                   /*
                    * Build the per-pool context template on first use.  The slot is
                    * tested here without spa_cksum_tmpls_lock;
                    * zio_checksum_template_init() repeats the test under the lock.
                    * NOTE(review): confirm the unlocked read is acceptable on all
                    * supported platforms.
                    */
      232 +        if (ci->ci_tmpl_init != NULL && spa->spa_cksum_tmpls[checksum] == NULL)
      233 +                zio_checksum_template_init(checksum, spa);
      234 +
 165  235          if (ci->ci_eck) {
 166  236                  zio_eck_t *eck;
 167  237  
 168  238                  if (checksum == ZIO_CHECKSUM_ZILOG2) {
                                   /*
                                    * ZILOG2: the eck is the zc_eck member of the
                                    * zil_chain_t at the start of the buffer, and only
                                    * zc_nused bytes, rounded up to ZIL_MIN_BLKSZ, are
                                    * checksummed.
                                    */
 169  239                          zil_chain_t *zilc = data;
 170  240  
 171  241                          size = P2ROUNDUP_TYPED(zilc->zc_nused, ZIL_MIN_BLKSZ,
 172  242                              uint64_t);
 173  243                          eck = &zilc->zc_eck;
 174  244                  } else {
                                   /* Otherwise the eck is the last zio_eck_t in the buffer. */
 175  245                          eck = (zio_eck_t *)((char *)data + size) - 1;
 176  246                  }
                           /*
                            * Seed the embedded checksum field with the verifier before
                            * hashing; it is overwritten with the computed checksum
                            * below.  In the final case the value already in the eck is
                            * copied into bp->blk_cksum instead.
                            */
 177  247                  if (checksum == ZIO_CHECKSUM_GANG_HEADER)
 178  248                          zio_checksum_gang_verifier(&eck->zec_cksum, bp);
 179  249                  else if (checksum == ZIO_CHECKSUM_LABEL)
 180  250                          zio_checksum_label_verifier(&eck->zec_cksum, offset);
 181  251                  else
 182  252                          bp->blk_cksum = eck->zec_cksum;
 183  253                  eck->zec_magic = ZEC_MAGIC;
 184      -                ci->ci_func[0](data, size, &cksum);
      254 +                ci->ci_func[0](data, size, &spa->spa_cksum_salt,
      255 +                    spa->spa_cksum_tmpls[checksum], &cksum);
 185  256                  eck->zec_cksum = cksum;
 186  257          } else {
 187      -                ci->ci_func[0](data, size, &bp->blk_cksum);
      258 +                ci->ci_func[0](data, size, &spa->spa_cksum_salt,
      259 +                    spa->spa_cksum_tmpls[checksum], &bp->blk_cksum);
 188  260          }
 189  261  }
 190  262  
 191  264  int
 192  265  zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
 193  266  {
                   /*
                    * Verify the checksum of the data attached to zio.  Returns 0 on
                    * a match, EINVAL for an unknown checksum function, ECKSUM for a
                    * mismatch or an invalid ZILOG2 header, or the error produced by
                    * fault injection.  *info is filled in only on the paths that
                    * reach the comparison; the early returns leave it untouched.
                    *
                    * Without a bp, the checksum type and size come from the zio
                    * itself; gang blocks always use the gang-header checksum and
                    * SPA_GANGBLOCKSIZE.
                    */
 194  266          blkptr_t *bp = zio->io_bp;
 195  267          uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum :
 196  268              (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
 197  269          int byteswap;
 198  270          int error;
 199  271          uint64_t size = (bp == NULL ? zio->io_size :
 200  272              (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp)));
 201  273          uint64_t offset = zio->io_offset;
 202  274          void *data = zio->io_data;
 203  275          zio_checksum_info_t *ci = &zio_checksum_table[checksum];
 204  276          zio_cksum_t actual_cksum, expected_cksum, verifier;
      277 +        spa_t *spa = zio->io_spa;
 205  278  
                   /*
                    * ci was formed from checksum before this range check; the
                    * short-circuit || keeps it from being dereferenced when checksum
                    * is out of range.
                    */
 206  279          if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
 207  280                  return (SET_ERROR(EINVAL));
 208  281  
                   /*
                    * Build the per-pool context template on first use.  The slot is
                    * tested here without spa_cksum_tmpls_lock;
                    * zio_checksum_template_init() repeats the test under the lock.
                    */
      282 +        if (ci->ci_tmpl_init != NULL && spa->spa_cksum_tmpls[checksum] == NULL)
      283 +                zio_checksum_template_init(checksum, spa);
      284 +
 209  285          if (ci->ci_eck) {
 210  286                  zio_eck_t *eck;
 211  287  
 212  288                  if (checksum == ZIO_CHECKSUM_ZILOG2) {
 213  289                          zil_chain_t *zilc = data;
 214  290                          uint64_t nused;
 215  291  
                                   /*
                                    * The magic value tells whether zc_nused can be
                                    * read as is or must be byteswapped; any other
                                    * magic is reported as a checksum error.
                                    */
 216  292                          eck = &zilc->zc_eck;
 217  293                          if (eck->zec_magic == ZEC_MAGIC)
 218  294                                  nused = zilc->zc_nused;
 219  295                          else if (eck->zec_magic == BSWAP_64(ZEC_MAGIC))
 220  296                                  nused = BSWAP_64(zilc->zc_nused);
 221  297                          else
 222  298                                  return (SET_ERROR(ECKSUM));
 223  299  
                                   /* A used length larger than the block is rejected. */
 224  300                          if (nused > size)
 225  301                                  return (SET_ERROR(ECKSUM));
 226  302  
 227  303                          size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t);
 228  304                  } else {
                                   /* Otherwise the eck is the last zio_eck_t in the buffer. */
 229  305                          eck = (zio_eck_t *)((char *)data + size) - 1;
 230  306                  }
 231  307  
 232  308                  if (checksum == ZIO_CHECKSUM_GANG_HEADER)
 233  309                          zio_checksum_gang_verifier(&verifier, bp);
 234  310                  else if (checksum == ZIO_CHECKSUM_LABEL)
 235  311                          zio_checksum_label_verifier(&verifier, offset);
  
    | 
      ↓ open down ↓ | 
    17 lines elided | 
    
      ↑ open up ↑ | 
  
 236  312                  else
 237  313                          verifier = bp->blk_cksum;
 238  314  
 239  315                  byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));
 240  316  
 241  317                  if (byteswap)
 242  318                          byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
 243  319  
                           /*
                            * Temporarily put the verifier where the embedded checksum
                            * is stored, mirroring what zio_checksum_compute() hashed,
                            * then restore the stored checksum after hashing.
                            */
 244  320                  expected_cksum = eck->zec_cksum;
 245  321                  eck->zec_cksum = verifier;
 246      -                ci->ci_func[byteswap](data, size, &actual_cksum);
      322 +                ci->ci_func[byteswap](data, size, &spa->spa_cksum_salt,
      323 +                    spa->spa_cksum_tmpls[checksum], &actual_cksum);
 247  324                  eck->zec_cksum = expected_cksum;
 248  325  
                           /*
                            * Presumably the byteswapping function yields a native-order
                            * result, so the stored checksum is swapped to match before
                            * the comparison -- TODO confirm.
                            */
 249  326                  if (byteswap)
 250  327                          byteswap_uint64_array(&expected_cksum,
 251  328                              sizeof (zio_cksum_t));
 252  329          } else {
 253  330                  ASSERT(!BP_IS_GANG(bp));
 254  331                  byteswap = BP_SHOULD_BYTESWAP(bp);
 255  332                  expected_cksum = bp->blk_cksum;
 256      -                ci->ci_func[byteswap](data, size, &actual_cksum);
      333 +                ci->ci_func[byteswap](data, size, &spa->spa_cksum_salt,
      334 +                    spa->spa_cksum_tmpls[checksum], &actual_cksum);
 257  335          }
 258  336  
 259  337          info->zbc_expected = expected_cksum;
 260  338          info->zbc_actual = actual_cksum;
 261  339          info->zbc_checksum_name = ci->ci_name;
 262  340          info->zbc_byteswapped = byteswap;
 263  341          info->zbc_injected = 0;
 264  342          info->zbc_has_cksum = 1;
 265  343  
 266  344          if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum))
 267  345                  return (SET_ERROR(ECKSUM));
 268  346  
                   /*
                    * The checksum matched.  If fault injection is enabled and the
                    * zio has no error yet, an injected error may still be returned;
                    * zbc_injected records that it was injected.
                    */
 269  347          if (zio_injection_enabled && !zio->io_error &&
 270  348              (error = zio_handle_fault_injection(zio, ECKSUM)) != 0) {
 271  349  
 272  350                  info->zbc_injected = 1;
 273  351                  return (error);
 274  352          }
 275  353  
 276  354          return (0);
      355 +}
      356 +
      357 +/*
      358 + * Called by a spa_t that's about to be deallocated. This steps through
      359 + * all of the checksum context templates and deallocates any that were
      360 + * initialized using the algorithm-specific template init function.
      361 + */
      362 +void
      363 +zio_checksum_templates_free(spa_t *spa)
      364 +{
                   /*
                    * NOTE(review): spa_cksum_tmpls_lock is not taken here; presumably
                    * nothing else can use the spa at this point -- confirm against
                    * the caller.
                    */
      365 +        for (int checksum = 0; checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
                           /* Only slots that were populated hold a template. */
      366 +                if (spa->spa_cksum_tmpls[checksum] != NULL) {
      367 +                        zio_checksum_info_t *ci = &zio_checksum_table[checksum];
      368 +
                                   /* A populated slot without a free callback is fatal. */
      369 +                        VERIFY(ci->ci_tmpl_free != NULL);
      370 +                        ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
                                   /* Reset the slot so the freed pointer is not kept. */
      371 +                        spa->spa_cksum_tmpls[checksum] = NULL;
      372 +                }
      373 +        }
 277  374  }
    
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX