4896 Performance improvements for KCF AES modes

          --- old/usr/src/common/crypto/aes/aes_modes.c
          +++ new/usr/src/common/crypto/aes/aes_modes.c
          (14 lines elided)
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23   23   * Use is subject to license terms.
  24   24   */
       25 +/*
       26 + * Copyright 2015 by Saso Kiselkov. All rights reserved.
       27 + */
  25   28  
  26   29  #include <sys/types.h>
  27   30  #include <sys/sysmacros.h>
  28   31  #include <modes/modes.h>
  29   32  #include "aes_impl.h"
  30   33  #ifndef _KERNEL
  31   34  #include <stdlib.h>
  32   35  #endif  /* !_KERNEL */
  33   36  
       37 +#if defined(__amd64)
  34   38  
       39 +/*
        40 + * XORs a range of contiguous AES blocks in `data' with blocks in `dst'
       41 + * and places the result in `dst'. On x86-64 this exploits the 128-bit
       42 + * floating point registers (xmm) to maximize performance.
       43 + */
       44 +static void
       45 +aes_xor_range(const uint8_t *data, uint8_t *dst, uint64_t length)
       46 +{
       47 +        uint64_t i = 0;
       48 +
       49 +        /* First use the unrolled version. */
       50 +        for (; i + 8 * AES_BLOCK_LEN <= length; i += 8 * AES_BLOCK_LEN)
       51 +                aes_xor_intel8(&data[i], &dst[i]);
       52 +        /* Finish the rest in single blocks. */
       53 +        for (; i < length; i += AES_BLOCK_LEN)
       54 +                aes_xor_intel(&data[i], &dst[i]);
       55 +}
       56 +
       57 +#else   /* !__amd64 */
       58 +
       59 +/*
        60 + * XORs a range of contiguous AES blocks in `data' with blocks in `dst'
       61 + * and places the result in `dst'.
       62 + */
       63 +static void
       64 +aes_xor_range(const uint8_t *data, uint8_t *dst, uint64_t length)
       65 +{
       66 +        uint64_t i = 0;
       67 +
       68 +        if (IS_P2ALIGNED2(dst, data, sizeof (uint64_t))) {
        69 +                /* Unroll the loop for efficiency. */
       70 +                for (; i + 8 * AES_BLOCK_LEN < length; i += 8 * AES_BLOCK_LEN) {
       71 +                        AES_XOR_BLOCK_ALIGNED(&data[i + 0x00], &dst[i + 0x00]);
       72 +                        AES_XOR_BLOCK_ALIGNED(&data[i + 0x10], &dst[i + 0x10]);
       73 +                        AES_XOR_BLOCK_ALIGNED(&data[i + 0x20], &dst[i + 0x20]);
       74 +                        AES_XOR_BLOCK_ALIGNED(&data[i + 0x30], &dst[i + 0x30]);
       75 +                        AES_XOR_BLOCK_ALIGNED(&data[i + 0x40], &dst[i + 0x40]);
       76 +                        AES_XOR_BLOCK_ALIGNED(&data[i + 0x50], &dst[i + 0x50]);
       77 +                        AES_XOR_BLOCK_ALIGNED(&data[i + 0x60], &dst[i + 0x60]);
       78 +                        AES_XOR_BLOCK_ALIGNED(&data[i + 0x70], &dst[i + 0x70]);
       79 +                }
       80 +        }
       81 +        /* Finish the rest in single blocks. */
       82 +        for (; i < length; i += AES_BLOCK_LEN)
       83 +                AES_XOR_BLOCK(&data[i], &dst[i]);
       84 +}
       85 +
       86 +#endif  /* !__amd64 */
       87 +
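For readers without the rest of the webrev at hand: aes_xor_intel() and aes_xor_intel8() are not defined in this file and appear to be supplied by amd64 assembly elsewhere in the patch. Below is a minimal C sketch of the XMM-register block XOR they are assumed to perform, written with SSE2 intrinsics purely for illustration; the names and the intrinsic-based approach are not part of the patch.

#include <stdint.h>
#include <emmintrin.h>          /* SSE2 intrinsics */

#define AES_BLOCK_LEN   16

/* XOR one 16-byte block from `data' into `dst' using a 128-bit XMM register. */
static void
xor_block_xmm(const uint8_t *data, uint8_t *dst)
{
        __m128i d = _mm_loadu_si128((const __m128i *)data);
        __m128i t = _mm_loadu_si128((const __m128i *)dst);

        _mm_storeu_si128((__m128i *)dst, _mm_xor_si128(t, d));
}

/* XOR eight consecutive blocks, mirroring the unrolled aes_xor_intel8(). */
static void
xor_block8_xmm(const uint8_t *data, uint8_t *dst)
{
        for (int i = 0; i < 8; i++)
                xor_block_xmm(&data[i * AES_BLOCK_LEN],
                    &dst[i * AES_BLOCK_LEN]);
}
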
  35   88  /* Copy a 16-byte AES block from "in" to "out" */
  36   89  void
  37      -aes_copy_block(uint8_t *in, uint8_t *out)
       90 +aes_copy_block(const uint8_t *in, uint8_t *out)
  38   91  {
  39   92          if (IS_P2ALIGNED2(in, out, sizeof (uint32_t))) {
  40      -                /* LINTED: pointer alignment */
  41      -                *(uint32_t *)&out[0] = *(uint32_t *)&in[0];
  42      -                /* LINTED: pointer alignment */
  43      -                *(uint32_t *)&out[4] = *(uint32_t *)&in[4];
  44      -                /* LINTED: pointer alignment */
  45      -                *(uint32_t *)&out[8] = *(uint32_t *)&in[8];
  46      -                /* LINTED: pointer alignment */
  47      -                *(uint32_t *)&out[12] = *(uint32_t *)&in[12];
       93 +                AES_COPY_BLOCK_ALIGNED(in, out);
  48   94          } else {
  49      -                AES_COPY_BLOCK(in, out);
       95 +                AES_COPY_BLOCK_UNALIGNED(in, out);
  50   96          }
  51   97  }
  52   98  
  53      -
  54   99  /* XOR a 16-byte AES block of data into dst */
  55  100  void
  56      -aes_xor_block(uint8_t *data, uint8_t *dst)
      101 +aes_xor_block(const uint8_t *data, uint8_t *dst)
  57  102  {
  58  103          if (IS_P2ALIGNED2(dst, data, sizeof (uint32_t))) {
  59      -                /* LINTED: pointer alignment */
  60      -                *(uint32_t *)&dst[0] ^= *(uint32_t *)&data[0];
  61      -                /* LINTED: pointer alignment */
  62      -                *(uint32_t *)&dst[4] ^= *(uint32_t *)&data[4];
  63      -                /* LINTED: pointer alignment */
  64      -                *(uint32_t *)&dst[8] ^= *(uint32_t *)&data[8];
  65      -                /* LINTED: pointer alignment */
  66      -                *(uint32_t *)&dst[12] ^= *(uint32_t *)&data[12];
      104 +                AES_XOR_BLOCK_ALIGNED(data, dst);
  67  105          } else {
  68      -                AES_XOR_BLOCK(data, dst);
      106 +                AES_XOR_BLOCK_UNALIGNED(data, dst);
  69  107          }
  70  108  }
  71  109  
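The word-wise bodies removed above are what the new AES_COPY_BLOCK_ALIGNED and AES_XOR_BLOCK_ALIGNED macros stand in for; the macros themselves are defined in aes_impl.h and are not shown in this hunk. A sketch of plausible expansions, reconstructed only from the deleted code (the actual definitions may differ):

/* Sketch reconstructed from the removed code; actual macros are in aes_impl.h. */
#define AES_COPY_BLOCK_ALIGNED(in, out) { \
        *(uint32_t *)&(out)[0]  = *(const uint32_t *)&(in)[0];  \
        *(uint32_t *)&(out)[4]  = *(const uint32_t *)&(in)[4];  \
        *(uint32_t *)&(out)[8]  = *(const uint32_t *)&(in)[8];  \
        *(uint32_t *)&(out)[12] = *(const uint32_t *)&(in)[12]; \
}

#define AES_XOR_BLOCK_ALIGNED(data, dst) { \
        *(uint32_t *)&(dst)[0]  ^= *(const uint32_t *)&(data)[0];  \
        *(uint32_t *)&(dst)[4]  ^= *(const uint32_t *)&(data)[4];  \
        *(uint32_t *)&(dst)[8]  ^= *(const uint32_t *)&(data)[8];  \
        *(uint32_t *)&(dst)[12] ^= *(const uint32_t *)&(data)[12]; \
}
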
  72      -
  73  110  /*
  74  111   * Encrypt multiple blocks of data according to mode.
  75  112   */
  76  113  int
  77  114  aes_encrypt_contiguous_blocks(void *ctx, char *data, size_t length,
  78  115      crypto_data_t *out)
  79  116  {
  80  117          aes_ctx_t *aes_ctx = ctx;
  81      -        int rv;
      118 +        int rv = CRYPTO_SUCCESS;
  82  119  
  83      -        if (aes_ctx->ac_flags & CTR_MODE) {
  84      -                rv = ctr_mode_contiguous_blocks(ctx, data, length, out,
  85      -                    AES_BLOCK_LEN, aes_encrypt_block, aes_xor_block);
      120 +        for (size_t i = 0; i < length; i += AES_OPSZ) {
      121 +                size_t opsz = MIN(length - i, AES_OPSZ);
      122 +                AES_ACCEL_SAVESTATE(savestate);
      123 +                aes_accel_enter(savestate);
      124 +
      125 +                if (aes_ctx->ac_flags & CTR_MODE) {
      126 +                        rv = ctr_mode_contiguous_blocks(ctx, &data[i], opsz,
      127 +                            out, AES_BLOCK_LEN, aes_encrypt_block,
      128 +                            AES_XOR_BLOCK, aes_ctr_mode);
  86  129  #ifdef _KERNEL
  87      -        } else if (aes_ctx->ac_flags & CCM_MODE) {
  88      -                rv = ccm_mode_encrypt_contiguous_blocks(ctx, data, length,
  89      -                    out, AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block,
  90      -                    aes_xor_block);
  91      -        } else if (aes_ctx->ac_flags & (GCM_MODE|GMAC_MODE)) {
  92      -                rv = gcm_mode_encrypt_contiguous_blocks(ctx, data, length,
  93      -                    out, AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block,
  94      -                    aes_xor_block);
      130 +                } else if (aes_ctx->ac_flags & CCM_MODE) {
      131 +                        rv = ccm_mode_encrypt_contiguous_blocks(ctx, &data[i],
      132 +                            opsz, out, AES_BLOCK_LEN, aes_encrypt_block,
      133 +                            AES_COPY_BLOCK, AES_XOR_BLOCK);
      134 +                } else if (aes_ctx->ac_flags & (GCM_MODE|GMAC_MODE)) {
      135 +                        rv = gcm_mode_encrypt_contiguous_blocks(ctx, &data[i],
      136 +                            opsz, out, AES_BLOCK_LEN, aes_encrypt_block,
      137 +                            AES_COPY_BLOCK, AES_XOR_BLOCK, aes_ctr_mode);
  95  138  #endif
  96      -        } else if (aes_ctx->ac_flags & CBC_MODE) {
  97      -                rv = cbc_encrypt_contiguous_blocks(ctx,
  98      -                    data, length, out, AES_BLOCK_LEN, aes_encrypt_block,
  99      -                    aes_copy_block, aes_xor_block);
 100      -        } else {
 101      -                rv = ecb_cipher_contiguous_blocks(ctx, data, length, out,
 102      -                    AES_BLOCK_LEN, aes_encrypt_block);
      139 +                } else if (aes_ctx->ac_flags & CBC_MODE) {
      140 +                        rv = cbc_encrypt_contiguous_blocks(ctx, &data[i], opsz,
      141 +                            out, AES_BLOCK_LEN, aes_encrypt_block,
      142 +                            AES_COPY_BLOCK, AES_XOR_BLOCK, aes_encrypt_cbc);
      143 +                } else {
      144 +                        rv = ecb_cipher_contiguous_blocks(ctx, &data[i], opsz,
      145 +                            out, AES_BLOCK_LEN, aes_encrypt_block,
      146 +                            aes_encrypt_ecb);
      147 +                }
      148 +
      149 +                aes_accel_exit(savestate);
      150 +
      151 +                if (rv != CRYPTO_SUCCESS)
       152 +                        break;
 103  153          }
      154 +
 104  155          return (rv);
 105  156  }
 106  157  
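The rewritten driver loop above is the heart of the change: instead of handing the entire request to the mode routine at once, it now processes at most AES_OPSZ bytes per iteration and brackets each chunk with aes_accel_enter()/aes_accel_exit(), so the accelerated FPU/XMM state is presumably saved and restored around a bounded amount of work. AES_OPSZ, AES_ACCEL_SAVESTATE and the enter/exit pair are not defined in this file. A stripped-down sketch of the pattern, assuming this file's headers and those patch-supplied helpers; process_chunk() is a hypothetical stand-in for the per-mode *_contiguous_blocks call:

static int
aes_chunked_op(char *data, size_t length,
    int (*process_chunk)(char *, size_t))
{
        int rv = CRYPTO_SUCCESS;

        for (size_t i = 0; i < length; i += AES_OPSZ) {
                size_t opsz = MIN(length - i, AES_OPSZ);
                AES_ACCEL_SAVESTATE(savestate);

                /* Make the FPU/XMM registers available for this chunk only. */
                aes_accel_enter(savestate);
                rv = process_chunk(&data[i], opsz);
                aes_accel_exit(savestate);

                if (rv != CRYPTO_SUCCESS)
                        break;
        }
        return (rv);
}
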
 107      -
 108  158  /*
 109  159   * Decrypt multiple blocks of data according to mode.
 110  160   */
 111  161  int
 112  162  aes_decrypt_contiguous_blocks(void *ctx, char *data, size_t length,
 113  163      crypto_data_t *out)
 114  164  {
 115  165          aes_ctx_t *aes_ctx = ctx;
 116      -        int rv;
      166 +        int rv = CRYPTO_SUCCESS;
 117  167  
 118      -        if (aes_ctx->ac_flags & CTR_MODE) {
 119      -                rv = ctr_mode_contiguous_blocks(ctx, data, length, out,
 120      -                    AES_BLOCK_LEN, aes_encrypt_block, aes_xor_block);
 121      -                if (rv == CRYPTO_DATA_LEN_RANGE)
 122      -                        rv = CRYPTO_ENCRYPTED_DATA_LEN_RANGE;
      168 +
      169 +        for (size_t i = 0; i < length; i += AES_OPSZ) {
      170 +                size_t opsz = MIN(length - i, AES_OPSZ);
      171 +                AES_ACCEL_SAVESTATE(savestate);
      172 +                aes_accel_enter(savestate);
      173 +
      174 +                if (aes_ctx->ac_flags & CTR_MODE) {
      175 +                        rv = ctr_mode_contiguous_blocks(ctx, &data[i], opsz,
      176 +                            out, AES_BLOCK_LEN, aes_encrypt_block,
      177 +                            AES_XOR_BLOCK, aes_ctr_mode);
      178 +                        if (rv == CRYPTO_DATA_LEN_RANGE)
      179 +                                rv = CRYPTO_ENCRYPTED_DATA_LEN_RANGE;
 123  180  #ifdef _KERNEL
 124      -        } else if (aes_ctx->ac_flags & CCM_MODE) {
 125      -                rv = ccm_mode_decrypt_contiguous_blocks(ctx, data, length,
 126      -                    out, AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block,
 127      -                    aes_xor_block);
 128      -        } else if (aes_ctx->ac_flags & (GCM_MODE|GMAC_MODE)) {
 129      -                rv = gcm_mode_decrypt_contiguous_blocks(ctx, data, length,
 130      -                    out, AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block,
 131      -                    aes_xor_block);
      181 +                } else if (aes_ctx->ac_flags & CCM_MODE) {
      182 +                        rv = ccm_mode_decrypt_contiguous_blocks(ctx, &data[i],
      183 +                            opsz, out, AES_BLOCK_LEN, aes_encrypt_block,
      184 +                            AES_COPY_BLOCK, AES_XOR_BLOCK);
      185 +                } else if (aes_ctx->ac_flags & (GCM_MODE|GMAC_MODE)) {
      186 +                        rv = gcm_mode_decrypt_contiguous_blocks(ctx, &data[i],
      187 +                            opsz, out, AES_BLOCK_LEN, aes_encrypt_block,
      188 +                            AES_COPY_BLOCK, AES_XOR_BLOCK, aes_ctr_mode);
 132  189  #endif
 133      -        } else if (aes_ctx->ac_flags & CBC_MODE) {
 134      -                rv = cbc_decrypt_contiguous_blocks(ctx, data, length, out,
 135      -                    AES_BLOCK_LEN, aes_decrypt_block, aes_copy_block,
 136      -                    aes_xor_block);
 137      -        } else {
 138      -                rv = ecb_cipher_contiguous_blocks(ctx, data, length, out,
 139      -                    AES_BLOCK_LEN, aes_decrypt_block);
 140      -                if (rv == CRYPTO_DATA_LEN_RANGE)
 141      -                        rv = CRYPTO_ENCRYPTED_DATA_LEN_RANGE;
      190 +                } else if (aes_ctx->ac_flags & CBC_MODE) {
      191 +                        rv = cbc_decrypt_contiguous_blocks(ctx, &data[i],
      192 +                            opsz, out, AES_BLOCK_LEN, aes_decrypt_block,
      193 +                            AES_COPY_BLOCK, AES_XOR_BLOCK, aes_decrypt_ecb,
      194 +                            aes_xor_range);
      195 +                } else {
      196 +                        rv = ecb_cipher_contiguous_blocks(ctx, &data[i],
      197 +                            opsz, out, AES_BLOCK_LEN, aes_decrypt_block,
      198 +                            aes_decrypt_ecb);
      199 +                        if (rv == CRYPTO_DATA_LEN_RANGE)
      200 +                                rv = CRYPTO_ENCRYPTED_DATA_LEN_RANGE;
      201 +                }
      202 +
      203 +                aes_accel_exit(savestate);
      204 +
      205 +                if (rv != CRYPTO_SUCCESS)
       206 +                        break;
 142  207          }
      208 +
 143  209          return (rv);
 144  210  }
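
A note on the two extra callbacks now passed for CBC decryption (aes_decrypt_ecb and aes_xor_range): because every ciphertext block is available up front, CBC decryption can raw-decrypt a whole run of blocks in one bulk call and then XOR the run against the preceding ciphertext blocks, which is the operation aes_xor_range() performs; presumably that is what cbc_decrypt_contiguous_blocks() uses these arguments for. A minimal sketch of the idea, assuming this file's headers and non-overlapping buffers; decrypt_ecb_bulk() is a hypothetical stand-in for a bulk raw-AES decryption primitive and ks for the expanded key schedule:

/*
 * Sketch only: bulk CBC decryption of `length' bytes (a multiple of
 * AES_BLOCK_LEN, at least one block) from `ct' into `pt'.
 */
static void
cbc_decrypt_range_sketch(const void *ks, const uint8_t *ct, uint8_t *pt,
    uint64_t length, uint8_t iv[AES_BLOCK_LEN],
    void (*decrypt_ecb_bulk)(const void *, const uint8_t *, uint8_t *,
    uint64_t))
{
        /* Raw-decrypt the whole run of ciphertext blocks in one call. */
        decrypt_ecb_bulk(ks, ct, pt, length);

        /* The first plaintext block gets XORed with the IV ... */
        aes_xor_block(iv, pt);
        /* ... and every later block with the ciphertext block before it. */
        aes_xor_range(ct, &pt[AES_BLOCK_LEN], length - AES_BLOCK_LEN);

        /* The last ciphertext block chains into the next call's IV. */
        aes_copy_block(&ct[length - AES_BLOCK_LEN], iv);
}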
    