Print this page
4896 Performance improvements for KCF AES modes

@@ -20,32 +20,70 @@
  */
 /*
  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
+/*
+ * Copyright 2015 by Saso Kiselkov. All rights reserved.
+ */
 
 #ifndef _KERNEL
 #include <strings.h>
 #include <limits.h>
 #include <assert.h>
 #include <security/cryptoki.h>
 #endif
 
 #include <sys/types.h>
+#define INLINE_CRYPTO_GET_PTRS
 #include <modes/modes.h>
 #include <sys/crypto/common.h>
 #include <sys/crypto/impl.h>
 
+boolean_t cbc_fastpath_enabled = B_TRUE;
+
+static void
+cbc_decrypt_fastpath(cbc_ctx_t *ctx, const uint8_t *data, size_t length,
+    uint8_t *out, size_t block_size,
+    int (*decrypt)(const void *, const uint8_t *, uint8_t *),
+    int (*decrypt_ecb)(const void *, const uint8_t *, uint8_t *, uint64_t),
+    void (*xor_block)(const uint8_t *, uint8_t *),
+    void (*xor_block_range)(const uint8_t *, uint8_t *, uint64_t))
+{
+        const uint8_t *iv = (uint8_t *)ctx->cbc_iv;
+
+        /* Use bulk decryption when available. */
+        if (decrypt_ecb != NULL) {
+                decrypt_ecb(ctx->cbc_keysched, data, out, length);
+        } else {
+                for (size_t i = 0; i < length; i += block_size)
+                        decrypt(ctx->cbc_keysched, &data[i], &out[i]);
+        }
+
+        /* Use bulk XOR when available. */
+        if (xor_block_range != NULL && length >= 2 * block_size) {
+                xor_block(iv, out);
+                xor_block_range(data, &out[block_size], length - block_size);
+        } else {
+                for (size_t i = 0; i < length; i += block_size) {
+                        xor_block(iv, &out[i]);
+                        iv = &data[i];
+                }
+        }
+}
+
 /*
  * Algorithm independent CBC functions.
  */
 int
 cbc_encrypt_contiguous_blocks(cbc_ctx_t *ctx, char *data, size_t length,
     crypto_data_t *out, size_t block_size,
     int (*encrypt)(const void *, const uint8_t *, uint8_t *),
-    void (*copy_block)(uint8_t *, uint8_t *),
-    void (*xor_block)(uint8_t *, uint8_t *))
+    void (*copy_block)(const uint8_t *, uint8_t *),
+    void (*xor_block)(const uint8_t *, uint8_t *),
+    int (*encrypt_cbc)(const void *, const uint8_t *, uint8_t *,
+    const uint8_t *, uint64_t))
 {
         size_t remainder = length;
         size_t need;
         uint8_t *datap = (uint8_t *)data;
         uint8_t *blockp;

@@ -54,10 +92,35 @@
         offset_t offset;
         uint8_t *out_data_1;
         uint8_t *out_data_2;
         size_t out_data_1_len;
 
+        /*
+         * CBC encryption fastpath requirements:
+         * - fastpath is enabled
+         * - algorithm-specific acceleration function is available
+         * - input is block-aligned
+         * - output is a single contiguous region or the user requested that
+         *   we overwrite their input buffer (input/output aliasing allowed)
+         */
+        if (cbc_fastpath_enabled && encrypt_cbc != NULL && length != 0 &&
+            ctx->cbc_remainder_len == 0 && (length & (block_size - 1)) == 0 &&
+            (out == NULL || CRYPTO_DATA_IS_SINGLE_BLOCK(out))) {
+                if (out == NULL) {
+                        encrypt_cbc(ctx->cbc_keysched, (uint8_t *)data,
+                            (uint8_t *)data, (uint8_t *)ctx->cbc_iv, length);
+                        ctx->cbc_lastp = (uint8_t *)&data[length - block_size];
+                } else {
+                        uint8_t *outp = CRYPTO_DATA_FIRST_BLOCK(out);
+                        encrypt_cbc(ctx->cbc_keysched, (uint8_t *)data, outp,
+                            (uint8_t *)ctx->cbc_iv, length);
+                        out->cd_offset += length;
+                        ctx->cbc_lastp = &outp[length - block_size];
+                }
+                goto out;
+        }
+
         if (length + ctx->cbc_remainder_len < block_size) {
                 /* accumulate bytes here and return */
                 bcopy(datap,
                     (uint8_t *)ctx->cbc_remainder + ctx->cbc_remainder_len,
                     length);

@@ -167,12 +230,14 @@
 /* ARGSUSED */
 int
 cbc_decrypt_contiguous_blocks(cbc_ctx_t *ctx, char *data, size_t length,
     crypto_data_t *out, size_t block_size,
     int (*decrypt)(const void *, const uint8_t *, uint8_t *),
-    void (*copy_block)(uint8_t *, uint8_t *),
-    void (*xor_block)(uint8_t *, uint8_t *))
+    void (*copy_block)(const uint8_t *, uint8_t *),
+    void (*xor_block)(const uint8_t *, uint8_t *),
+    int (*decrypt_ecb)(const void *, const uint8_t *, uint8_t *, uint64_t),
+    void (*xor_block_range)(const uint8_t *, uint8_t *, uint64_t))
 {
         size_t remainder = length;
         size_t need;
         uint8_t *datap = (uint8_t *)data;
         uint8_t *blockp;

@@ -181,10 +246,30 @@
         offset_t offset;
         uint8_t *out_data_1;
         uint8_t *out_data_2;
         size_t out_data_1_len;
 
+        /*
+         * CBC decryption fastpath requirements:
+         * - fastpath is enabled
+         * - input is block-aligned
+         * - output is a single contiguous region and doesn't alias input
+         */
+        if (cbc_fastpath_enabled && ctx->cbc_remainder_len == 0 &&
+            length != 0 && (length & (block_size - 1)) == 0 &&
+            CRYPTO_DATA_IS_SINGLE_BLOCK(out)) {
+                uint8_t *outp = CRYPTO_DATA_FIRST_BLOCK(out);
+
+                cbc_decrypt_fastpath(ctx, (uint8_t *)data, length, outp,
+                    block_size, decrypt, decrypt_ecb, xor_block,
+                    xor_block_range);
+                out->cd_offset += length;
+                bcopy(&data[length - block_size], ctx->cbc_iv, block_size);
+                ctx->cbc_lastp = (uint8_t *)ctx->cbc_iv;
+                return (CRYPTO_SUCCESS);
+        }
+
         if (length + ctx->cbc_remainder_len < block_size) {
                 /* accumulate bytes here and return */
                 bcopy(datap,
                     (uint8_t *)ctx->cbc_remainder + ctx->cbc_remainder_len,
                     length);

@@ -278,11 +363,11 @@
         return (CRYPTO_SUCCESS);
 }
 
 int
 cbc_init_ctx(cbc_ctx_t *cbc_ctx, char *param, size_t param_len,
-    size_t block_size, void (*copy_block)(uint8_t *, uint64_t *))
+    size_t block_size, void (*copy_block)(const uint8_t *, uint64_t *))
 {
         /*
          * Copy IV into context.
          *
          * If cm_param == NULL then the IV comes from the