4896 Performance improvements for KCF AES modes

@@ -20,125 +20,191 @@
  */
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
+/*
+ * Copyright 2015 by Saso Kiselkov. All rights reserved.
+ */
 
 #include <sys/types.h>
 #include <sys/sysmacros.h>
 #include <modes/modes.h>
 #include "aes_impl.h"
 #ifndef _KERNEL
 #include <stdlib.h>
 #endif  /* !_KERNEL */
 
+#if defined(__amd64)
 
+/*
+ * XORs a range of contiguous AES blocks in `data' with blocks in `dst'
+ * and places the result in `dst'. On x86-64 this uses the 128-bit
+ * XMM (SSE) vector registers to maximize throughput.
+ */
+static void
+aes_xor_range(const uint8_t *data, uint8_t *dst, uint64_t length)
+{
+        uint64_t i = 0;
+
+        /* First use the unrolled version. */
+        for (; i + 8 * AES_BLOCK_LEN <= length; i += 8 * AES_BLOCK_LEN)
+                aes_xor_intel8(&data[i], &dst[i]);
+        /* Finish the rest in single blocks. */
+        for (; i < length; i += AES_BLOCK_LEN)
+                aes_xor_intel(&data[i], &dst[i]);
+}
+
+#else   /* !__amd64 */
+
+/*
+ * XORs a range of contiguous AES blocks in `data' with blocks in `dst'
+ * and places the result in `dst'.
+ */
+static void
+aes_xor_range(const uint8_t *data, uint8_t *dst, uint64_t length)
+{
+        uint64_t i = 0;
+
+        if (IS_P2ALIGNED2(dst, data, sizeof (uint64_t))) {
+                /* Process eight blocks per iteration for efficiency. */
+                for (; i + 8 * AES_BLOCK_LEN <= length; i += 8 * AES_BLOCK_LEN) {
+                        AES_XOR_BLOCK_ALIGNED(&data[i + 0x00], &dst[i + 0x00]);
+                        AES_XOR_BLOCK_ALIGNED(&data[i + 0x10], &dst[i + 0x10]);
+                        AES_XOR_BLOCK_ALIGNED(&data[i + 0x20], &dst[i + 0x20]);
+                        AES_XOR_BLOCK_ALIGNED(&data[i + 0x30], &dst[i + 0x30]);
+                        AES_XOR_BLOCK_ALIGNED(&data[i + 0x40], &dst[i + 0x40]);
+                        AES_XOR_BLOCK_ALIGNED(&data[i + 0x50], &dst[i + 0x50]);
+                        AES_XOR_BLOCK_ALIGNED(&data[i + 0x60], &dst[i + 0x60]);
+                        AES_XOR_BLOCK_ALIGNED(&data[i + 0x70], &dst[i + 0x70]);
+                }
+        }
+        /* Finish the rest in single blocks. */
+        for (; i < length; i += AES_BLOCK_LEN)
+                AES_XOR_BLOCK(&data[i], &dst[i]);
+}
+
+#endif  /* !__amd64 */
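
aes_xor_intel8() and aes_xor_intel() are assembly routines added
elsewhere in this patch. Purely as an illustration (not part of the
patch), the eight-block XOR could be approximated in C with SSE2
intrinsics; the function name below is invented for the example:

        #include <emmintrin.h>  /* SSE2 intrinsics */
        #include <stdint.h>

        /* Illustrative only: XOR eight 16-byte AES blocks via XMM. */
        static void
        xor8_blocks_sse2(const uint8_t *data, uint8_t *dst)
        {
                for (int i = 0; i < 8; i++) {
                        __m128i d = _mm_loadu_si128(
                            (const __m128i *)&data[16 * i]);
                        __m128i x = _mm_loadu_si128(
                            (const __m128i *)&dst[16 * i]);
                        _mm_storeu_si128((__m128i *)&dst[16 * i],
                            _mm_xor_si128(d, x));
                }
        }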
+
 /* Copy a 16-byte AES block from "in" to "out" */
 void
-aes_copy_block(uint8_t *in, uint8_t *out)
+aes_copy_block(const uint8_t *in, uint8_t *out)
 {
         if (IS_P2ALIGNED2(in, out, sizeof (uint32_t))) {
-                /* LINTED: pointer alignment */
-                *(uint32_t *)&out[0] = *(uint32_t *)&in[0];
-                /* LINTED: pointer alignment */
-                *(uint32_t *)&out[4] = *(uint32_t *)&in[4];
-                /* LINTED: pointer alignment */
-                *(uint32_t *)&out[8] = *(uint32_t *)&in[8];
-                /* LINTED: pointer alignment */
-                *(uint32_t *)&out[12] = *(uint32_t *)&in[12];
+                AES_COPY_BLOCK_ALIGNED(in, out);
         } else {
-                AES_COPY_BLOCK(in, out);
+                AES_COPY_BLOCK_UNALIGNED(in, out);
         }
 }
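
IS_P2ALIGNED2() tests the alignment of two pointers at once, presumably
defined alongside IS_P2ALIGNED in <sys/sysmacros.h> (included above). A
plausible shape, shown here only for reference and assuming a
power-of-two alignment, ORs the two addresses so a single mask test
covers both:

        /* Plausible shape of the two-pointer alignment test. */
        #define IS_P2ALIGNED2(a, b, align) \
                ((((uintptr_t)(a) | (uintptr_t)(b)) & \
                ((uintptr_t)(align) - 1)) == 0)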
 
-
 /* XOR a 16-byte AES block of data into dst */
 void
-aes_xor_block(uint8_t *data, uint8_t *dst)
+aes_xor_block(const uint8_t *data, uint8_t *dst)
 {
         if (IS_P2ALIGNED2(dst, data, sizeof (uint32_t))) {
-                /* LINTED: pointer alignment */
-                *(uint32_t *)&dst[0] ^= *(uint32_t *)&data[0];
-                /* LINTED: pointer alignment */
-                *(uint32_t *)&dst[4] ^= *(uint32_t *)&data[4];
-                /* LINTED: pointer alignment */
-                *(uint32_t *)&dst[8] ^= *(uint32_t *)&data[8];
-                /* LINTED: pointer alignment */
-                *(uint32_t *)&dst[12] ^= *(uint32_t *)&data[12];
+                AES_XOR_BLOCK_ALIGNED(data, dst);
         } else {
-                AES_XOR_BLOCK(data, dst);
+                AES_XOR_BLOCK_UNALIGNED(data, dst);
         }
 }
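
The AES_COPY_BLOCK_ALIGNED and AES_XOR_BLOCK_ALIGNED macros replace the
open-coded 32-bit loads and stores deleted above. Assuming they keep
the same word size (their actual definitions are in this patch's
headers), the XOR variant would expand to something like:

        /* Hypothetical expansion: one 16-byte block as four 32-bit XORs. */
        #define AES_XOR_BLOCK_ALIGNED(src, dst) \
                do { \
                        ((uint32_t *)(dst))[0] ^= ((const uint32_t *)(src))[0]; \
                        ((uint32_t *)(dst))[1] ^= ((const uint32_t *)(src))[1]; \
                        ((uint32_t *)(dst))[2] ^= ((const uint32_t *)(src))[2]; \
                        ((uint32_t *)(dst))[3] ^= ((const uint32_t *)(src))[3]; \
                } while (0)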
 
-
 /*
  * Encrypt multiple blocks of data according to mode.
  */
 int
 aes_encrypt_contiguous_blocks(void *ctx, char *data, size_t length,
     crypto_data_t *out)
 {
         aes_ctx_t *aes_ctx = ctx;
-        int rv;
+        int rv = CRYPTO_SUCCESS;
 
+        for (size_t i = 0; i < length; i += AES_OPSZ) {
+                size_t opsz = MIN(length - i, AES_OPSZ);
+                AES_ACCEL_SAVESTATE(savestate);
+                aes_accel_enter(savestate);
+
         if (aes_ctx->ac_flags & CTR_MODE) {
-                rv = ctr_mode_contiguous_blocks(ctx, data, length, out,
-                    AES_BLOCK_LEN, aes_encrypt_block, aes_xor_block);
+                        rv = ctr_mode_contiguous_blocks(ctx, &data[i], opsz,
+                            out, AES_BLOCK_LEN, aes_encrypt_block,
+                            AES_XOR_BLOCK, aes_ctr_mode);
 #ifdef _KERNEL
         } else if (aes_ctx->ac_flags & CCM_MODE) {
-                rv = ccm_mode_encrypt_contiguous_blocks(ctx, data, length,
-                    out, AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block,
-                    aes_xor_block);
+                        rv = ccm_mode_encrypt_contiguous_blocks(ctx, &data[i],
+                            opsz, out, AES_BLOCK_LEN, aes_encrypt_block,
+                            AES_COPY_BLOCK, AES_XOR_BLOCK);
         } else if (aes_ctx->ac_flags & (GCM_MODE|GMAC_MODE)) {
-                rv = gcm_mode_encrypt_contiguous_blocks(ctx, data, length,
-                    out, AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block,
-                    aes_xor_block);
+                        rv = gcm_mode_encrypt_contiguous_blocks(ctx, &data[i],
+                            opsz, out, AES_BLOCK_LEN, aes_encrypt_block,
+                            AES_COPY_BLOCK, AES_XOR_BLOCK, aes_ctr_mode);
 #endif
         } else if (aes_ctx->ac_flags & CBC_MODE) {
-                rv = cbc_encrypt_contiguous_blocks(ctx,
-                    data, length, out, AES_BLOCK_LEN, aes_encrypt_block,
-                    aes_copy_block, aes_xor_block);
+                        rv = cbc_encrypt_contiguous_blocks(ctx, &data[i], opsz,
+                            out, AES_BLOCK_LEN, aes_encrypt_block,
+                            AES_COPY_BLOCK, AES_XOR_BLOCK, aes_encrypt_cbc);
         } else {
-                rv = ecb_cipher_contiguous_blocks(ctx, data, length, out,
-                    AES_BLOCK_LEN, aes_encrypt_block);
+                        rv = ecb_cipher_contiguous_blocks(ctx, &data[i], opsz,
+                            out, AES_BLOCK_LEN, aes_encrypt_block,
+                            aes_encrypt_ecb);
         }
+
+                aes_accel_exit(savestate);
+
+                if (rv != CRYPTO_SUCCESS)
+                        break;
+        }
+
         return (rv);
 }
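
The new outer loop splits each request into AES_OPSZ-sized chunks and
brackets every chunk with aes_accel_enter()/aes_accel_exit(). On amd64
in kernel context these calls save and restore the FPU/XMM state, so
the chunking bounds how long a thread monopolizes the FPU. The
distilled pattern, with the per-mode dispatch reduced to a hypothetical
do_one_chunk() placeholder:

        for (size_t i = 0; i < length; i += AES_OPSZ) {
                size_t opsz = MIN(length - i, AES_OPSZ);
                AES_ACCEL_SAVESTATE(savestate); /* stack space for state */

                aes_accel_enter(savestate);     /* save FPU/XMM, enable use */
                rv = do_one_chunk(ctx, &data[i], opsz, out);    /* placeholder */
                aes_accel_exit(savestate);      /* restore FPU/XMM state */

                if (rv != CRYPTO_SUCCESS)
                        break;
        }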
 
-
 /*
  * Decrypt multiple blocks of data according to mode.
  */
 int
 aes_decrypt_contiguous_blocks(void *ctx, char *data, size_t length,
     crypto_data_t *out)
 {
         aes_ctx_t *aes_ctx = ctx;
-        int rv;
+        int rv = CRYPTO_SUCCESS;
 
+        for (size_t i = 0; i < length; i += AES_OPSZ) {
+                size_t opsz = MIN(length - i, AES_OPSZ);
+                AES_ACCEL_SAVESTATE(savestate);
+                aes_accel_enter(savestate);
+
         if (aes_ctx->ac_flags & CTR_MODE) {
-                rv = ctr_mode_contiguous_blocks(ctx, data, length, out,
-                    AES_BLOCK_LEN, aes_encrypt_block, aes_xor_block);
+                        rv = ctr_mode_contiguous_blocks(ctx, &data[i], opsz,
+                            out, AES_BLOCK_LEN, aes_encrypt_block,
+                            AES_XOR_BLOCK, aes_ctr_mode);
                 if (rv == CRYPTO_DATA_LEN_RANGE)
                         rv = CRYPTO_ENCRYPTED_DATA_LEN_RANGE;
 #ifdef _KERNEL
         } else if (aes_ctx->ac_flags & CCM_MODE) {
-                rv = ccm_mode_decrypt_contiguous_blocks(ctx, data, length,
-                    out, AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block,
-                    aes_xor_block);
+                        rv = ccm_mode_decrypt_contiguous_blocks(ctx, &data[i],
+                            opsz, out, AES_BLOCK_LEN, aes_encrypt_block,
+                            AES_COPY_BLOCK, AES_XOR_BLOCK);
         } else if (aes_ctx->ac_flags & (GCM_MODE|GMAC_MODE)) {
-                rv = gcm_mode_decrypt_contiguous_blocks(ctx, data, length,
-                    out, AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block,
-                    aes_xor_block);
+                        rv = gcm_mode_decrypt_contiguous_blocks(ctx, &data[i],
+                            opsz, out, AES_BLOCK_LEN, aes_encrypt_block,
+                            AES_COPY_BLOCK, AES_XOR_BLOCK, aes_ctr_mode);
 #endif
         } else if (aes_ctx->ac_flags & CBC_MODE) {
-                rv = cbc_decrypt_contiguous_blocks(ctx, data, length, out,
-                    AES_BLOCK_LEN, aes_decrypt_block, aes_copy_block,
-                    aes_xor_block);
+                        rv = cbc_decrypt_contiguous_blocks(ctx, &data[i],
+                            opsz, out, AES_BLOCK_LEN, aes_decrypt_block,
+                            AES_COPY_BLOCK, AES_XOR_BLOCK, aes_decrypt_ecb,
+                            aes_xor_range);
         } else {
-                rv = ecb_cipher_contiguous_blocks(ctx, data, length, out,
-                    AES_BLOCK_LEN, aes_decrypt_block);
+                        rv = ecb_cipher_contiguous_blocks(ctx, &data[i],
+                            opsz, out, AES_BLOCK_LEN, aes_decrypt_block,
+                            aes_decrypt_ecb);
                 if (rv == CRYPTO_DATA_LEN_RANGE)
                         rv = CRYPTO_ENCRYPTED_DATA_LEN_RANGE;
         }
+
+                aes_accel_exit(savestate);
+
+                if (rv != CRYPTO_SUCCESS)
+                        break;
+        }
+
         return (rv);
 }
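
Note that the CTR path passes aes_encrypt_block() in both directions:
CTR mode turns the block cipher into a stream cipher that XORs
E_k(counter) onto the text, and applying the same keystream twice is an
identity. A one-byte standalone illustration:

        #include <assert.h>
        #include <stdint.h>

        int
        main(void)
        {
                /* Keystream XOR is an involution: (p ^ ks) ^ ks == p. */
                uint8_t p = 0x5a, ks = 0x3c;
                uint8_t c = p ^ ks;     /* "encrypt" one byte */
                assert((c ^ ks) == p);  /* "decrypt" with same keystream */
                return (0);
        }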