4896 Performance improvements for KCF AES modes

@@ -18,10 +18,11 @@
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015 by Saso Kiselkov. All rights reserved.
  */
 
 
 #ifndef _KERNEL
 #include <strings.h>

@@ -28,38 +29,83 @@
 #include <limits.h>
 #include <assert.h>
 #include <security/cryptoki.h>
 #endif  /* _KERNEL */
 
-
+#include <sys/cmn_err.h>
 #include <sys/types.h>
 #include <sys/kmem.h>
+#define INLINE_CRYPTO_GET_PTRS
 #include <modes/modes.h>
 #include <sys/crypto/common.h>
 #include <sys/crypto/impl.h>
 #include <sys/byteorder.h>
 
+#define COUNTER_MASK    0x00000000ffffffffULL
+
+#ifdef  _KERNEL
+#include <sys/sdt.h>            /* SET_ERROR */
+#endif  /* _KERNEL */
+
 #ifdef __amd64
 
 #ifdef _KERNEL
 #include <sys/cpuvar.h>         /* cpu_t, CPU */
 #include <sys/x86_archext.h>    /* x86_featureset, X86FSET_*, CPUID_* */
 #include <sys/disp.h>           /* kpreempt_disable(), kpreempt_enable */
 /* Workaround for no XMM kernel thread save/restore */
-#define KPREEMPT_DISABLE        kpreempt_disable()
-#define KPREEMPT_ENABLE         kpreempt_enable()
+extern void gcm_accel_save(void *savestate);
+extern void gcm_accel_restore(void *savestate);
 
+#if     defined(lint) || defined(__lint)
+#define GCM_ACCEL_SAVESTATE(name)       uint8_t name[16 * 16 + 8]
 #else
+#define GCM_ACCEL_SAVESTATE(name) \
+        /* stack space for xmm0--xmm15 and cr0 (16 x 128 bits + 64 bits) */ \
+        uint8_t name[16 * 16 + 8] __attribute__((aligned(16)))
+#endif
+
+/*
+ * Disables kernel thread preemption and saves the FPU state via
+ * gcm_accel_save() iff Intel PCLMULQDQ support is present. Must be
+ * balanced by GCM_ACCEL_EXIT. This must be present in all externally
+ * callable GCM functions which invoke FPU-accelerated GHASH operations,
+ * or call static functions which do (such as gcm_fastpath128()).
+ */
+#define GCM_ACCEL_ENTER \
+        GCM_ACCEL_SAVESTATE(savestate); \
+        do { \
+                if (intel_pclmulqdq_instruction_present()) { \
+                        kpreempt_disable(); \
+                        gcm_accel_save(savestate); \
+                } \
+                _NOTE(CONSTCOND) \
+        } while (0)
+#define GCM_ACCEL_EXIT \
+        do { \
+                if (intel_pclmulqdq_instruction_present()) { \
+                        gcm_accel_restore(savestate); \
+                        kpreempt_enable(); \
+                } \
+                _NOTE(CONSTCOND) \
+        } while (0)
+
+#else   /* _KERNEL */
 #include <sys/auxv.h>           /* getisax() */
 #include <sys/auxv_386.h>       /* AV_386_PCLMULQDQ bit */
-#define KPREEMPT_DISABLE
-#define KPREEMPT_ENABLE
+#define SET_ERROR(x)    (x)
 #endif  /* _KERNEL */
 
 extern void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
-static int intel_pclmulqdq_instruction_present(void);
-#endif  /* __amd64 */
+extern void gcm_init_clmul(const uint64_t hash_init[2], uint8_t Htable[256]);
+extern void gcm_ghash_clmul(uint64_t ghash[2], const uint8_t Htable[256],
+    const uint8_t *inp, size_t length);
+static inline int intel_pclmulqdq_instruction_present(void);
+#else   /* !__amd64 */
+#define GCM_ACCEL_ENTER
+#define GCM_ACCEL_EXIT
+#endif  /* !__amd64 */
 
 struct aes_block {
         uint64_t a;
         uint64_t b;
 };

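A minimal sketch of the calling discipline the GCM_ACCEL_ENTER/GCM_ACCEL_EXIT pair above imposes; the function and its arguments here are hypothetical, shown only to illustrate usage (the real entry points appear later in this diff):

static int
example_ghash_user(gcm_ctx_t *ctx, const uint8_t *block,
    void (*xor_block)(const uint8_t *, uint8_t *))
{
        /* Declares `savestate', saves the FPU state, disables preemption */
        GCM_ACCEL_ENTER;

        /*
         * FPU-accelerated GHASH()/gcm_mul() calls are only safe inside the
         * ENTER/EXIT bracket; every return path must pass through
         * GCM_ACCEL_EXIT (the real functions funnel errors to an `out:'
         * label for this reason).
         */
        GHASH(ctx, block, ctx->gcm_ghash);

        /* Restores the FPU state and re-enables preemption */
        GCM_ACCEL_EXIT;
        return (CRYPTO_SUCCESS);
}
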
@@ -73,18 +119,20 @@
  * Byte swap the input (*x_in and *y) and the output (*res).
  *
  * Note: x_in, y, and res all point to 16-byte numbers (an array of two
  * 64-bit integers).
  */
-void
+static inline void
 gcm_mul(uint64_t *x_in, uint64_t *y, uint64_t *res)
 {
 #ifdef __amd64
         if (intel_pclmulqdq_instruction_present()) {
-                KPREEMPT_DISABLE;
+                /*
+                 * FPU context will have been saved and kernel thread
+                 * preemption disabled already.
+                 */
                 gcm_mul_pclmulqdq(x_in, y, res);
-                KPREEMPT_ENABLE;
         } else
 #endif  /* __amd64 */
         {
                 static const uint64_t R = 0xe100000000000000ULL;
                 struct aes_block z = {0, 0};

@@ -114,27 +162,96 @@
                 res[0] = htonll(z.a);
                 res[1] = htonll(z.b);
         }
 }
 
-
 #define GHASH(c, d, t) \
+        do { \
         xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \
         gcm_mul((uint64_t *)(void *)(c)->gcm_ghash, (c)->gcm_H, \
-        (uint64_t *)(void *)(t));
+                    (uint64_t *)(void *)(t)); \
+                _NOTE(CONSTCOND) \
+        } while (0)
 
+boolean_t gcm_fastpath_enabled = B_TRUE;
 
-/*
- * Encrypt multiple blocks of data in GCM mode.  Decrypt for GCM mode
- * is done in another function.
+static void
+gcm_fastpath128(gcm_ctx_t *ctx, const uint8_t *data, size_t length,
+    uint8_t *out, boolean_t encrypt,
+    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+    void (*xor_block)(const uint8_t *, uint8_t *),
+    int (*cipher_ctr)(const void *, const uint8_t *, uint8_t *, uint64_t,
+    uint64_t *))
+{
+        /* When decrypting, `data' holds the ciphertext we need to GHASH. */
+        if (!encrypt) {
+#ifdef  __amd64
+                if (intel_pclmulqdq_instruction_present())
+                        gcm_ghash_clmul(ctx->gcm_ghash, ctx->gcm_H_table,
+                            data, length);
+                else
+#endif  /* __amd64 */
+                        for (size_t i = 0; i < length; i += 16)
+                                GHASH(ctx, &data[i], ctx->gcm_ghash);
+        }
+
+        if (cipher_ctr != NULL) {
+                /*
+                 * GCM is almost but not quite like CTR. GCM increments the
+                 * counter value *before* processing the first input block,
+                 * whereas CTR does so afterwards. So we need to increment
+                 * the counter before calling CTR and decrement it afterwards.
  */
-int
-gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
-    crypto_data_t *out, size_t block_size,
+                uint64_t counter = ntohll(ctx->gcm_cb[1]);
+
+                ctx->gcm_cb[1] = htonll((counter & ~COUNTER_MASK) |
+                    ((counter & COUNTER_MASK) + 1));
+                cipher_ctr(ctx->gcm_keysched, data, out, length, ctx->gcm_cb);
+                counter = ntohll(ctx->gcm_cb[1]);
+                ctx->gcm_cb[1] = htonll((counter & ~COUNTER_MASK) |
+                    ((counter & COUNTER_MASK) - 1));
+        } else {
+                uint64_t counter = ntohll(ctx->gcm_cb[1]);
+
+                for (size_t i = 0; i < length; i += 16) {
+                        /*LINTED(E_BAD_PTR_CAST_ALIGN)*/
+                        *(uint64_t *)&out[i] = ctx->gcm_cb[0];
+                        /*LINTED(E_BAD_PTR_CAST_ALIGN)*/
+                        *(uint64_t *)&out[i + 8] = htonll(counter++);
+                        encrypt_block(ctx->gcm_keysched, &out[i], &out[i]);
+                        xor_block(&data[i], &out[i]);
+                }
+
+                ctx->gcm_cb[1] = htonll(counter);
+        }
+
+        /* When encrypting, `out' holds the ciphertext we need to GHASH. */
+        if (encrypt) {
+#ifdef  __amd64
+                if (intel_pclmulqdq_instruction_present())
+                        gcm_ghash_clmul(ctx->gcm_ghash, ctx->gcm_H_table,
+                            out, length);
+                else
+#endif  /* __amd64 */
+                        for (size_t i = 0; i < length; i += 16)
+                                GHASH(ctx, &out[i], ctx->gcm_ghash);
+
+                /* If no more data comes in, the last block is the auth tag. */
+                bcopy(&out[length - 16], ctx->gcm_tmp, 16);
+        }
+
+        ctx->gcm_processed_data_len += length;
+}
+
+static int
+gcm_process_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
+    crypto_data_t *out, size_t block_size, boolean_t encrypt,
     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
-    void (*copy_block)(uint8_t *, uint8_t *),
-    void (*xor_block)(uint8_t *, uint8_t *))
+    void (*copy_block)(const uint8_t *, uint8_t *),
+    void (*xor_block)(const uint8_t *, uint8_t *),
+    int (*cipher_ctr)(const void *, const uint8_t *, uint8_t *, uint64_t,
+    uint64_t *))
 {
         size_t remainder = length;
         size_t need;
         uint8_t *datap = (uint8_t *)data;
         uint8_t *blockp;

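The +1/-1 dance around cipher_ctr() above follows from how the two modes advance the counter; a short worked note (CB0 is just shorthand for the initial counter block, not an identifier from the code):

/*
 * GCM bumps the counter before each block, so block i (1-based) is
 * encrypted with E(K, CB0 + i).  A CTR primitive started at counter C
 * bumps it after each block, so block i uses E(K, C + i - 1).  Handing
 * cipher_ctr() a counter of CB0 + 1 therefore produces the same
 * keystream.  After n blocks the helper leaves the counter at
 * CB0 + 1 + n, one past where GCM's own loop would stop (CB0 + n),
 * hence the decrement; COUNTER_MASK confines both fixups to the low
 * 32 bits of the counter block.
 */
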
@@ -144,19 +261,41 @@
         uint8_t *out_data_1;
         uint8_t *out_data_2;
         size_t out_data_1_len;
         uint64_t counter;
         uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
+        int rv = CRYPTO_SUCCESS;
 
+        GCM_ACCEL_ENTER;
+
+        /*
+         * GCM mode fastpath requirements:
+         * - fastpath is enabled
+         * - block size is 128 bits
+         * - input is block-aligned
+         * - the counter value won't overflow
+         * - output is a single contiguous region and doesn't alias input
+         */
+        if (gcm_fastpath_enabled && block_size == 16 &&
+            ctx->gcm_remainder_len == 0 && (length & (block_size - 1)) == 0 &&
+            ntohll(ctx->gcm_cb[1] & counter_mask) <= ntohll(counter_mask) -
+            length / block_size && CRYPTO_DATA_IS_SINGLE_BLOCK(out)) {
+                gcm_fastpath128(ctx, (uint8_t *)data, length,
+                    CRYPTO_DATA_FIRST_BLOCK(out), encrypt, encrypt_block,
+                    xor_block, cipher_ctr);
+                out->cd_offset += length;
+                goto out;
+        }
+
         if (length + ctx->gcm_remainder_len < block_size) {
                 /* accumulate bytes here and return */
                 bcopy(datap,
                     (uint8_t *)ctx->gcm_remainder + ctx->gcm_remainder_len,
                     length);
                 ctx->gcm_remainder_len += length;
                 ctx->gcm_copy_to = datap;
-                return (CRYPTO_SUCCESS);
+                goto out;
         }
 
         lastp = (uint8_t *)ctx->gcm_cb;
         if (out != NULL)
                 crypto_init_ptrs(out, &iov_or_mp, &offset);

@@ -164,21 +303,27 @@
         do {
                 /* Unprocessed data from last call. */
                 if (ctx->gcm_remainder_len > 0) {
                         need = block_size - ctx->gcm_remainder_len;
 
-                        if (need > remainder)
-                                return (CRYPTO_DATA_LEN_RANGE);
+                        if (need > remainder) {
+                                rv = SET_ERROR(CRYPTO_DATA_LEN_RANGE);
+                                goto out;
+                        }
 
                         bcopy(datap, &((uint8_t *)ctx->gcm_remainder)
                             [ctx->gcm_remainder_len], need);
 
                         blockp = (uint8_t *)ctx->gcm_remainder;
                 } else {
                         blockp = datap;
                 }
 
+                /* add ciphertext to the hash */
+                if (!encrypt)
+                        GHASH(ctx, blockp, ctx->gcm_ghash);
+
                 /*
                  * Increment counter. Counter bits are confined
                  * to the bottom 32 bits of the counter block.
                  */
                 counter = ntohll(ctx->gcm_cb[1] & counter_mask);

@@ -219,10 +364,11 @@
                         /* update offset */
                         out->cd_offset += block_size;
                 }
 
                 /* add ciphertext to the hash */
+                if (encrypt)
                 GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash);
 
                 /* Update pointer to next block of data to be processed. */
                 if (ctx->gcm_remainder_len != 0) {
                         datap += need;

@@ -242,27 +388,51 @@
                 }
                 ctx->gcm_copy_to = NULL;
 
         } while (remainder > 0);
 out:
-        return (CRYPTO_SUCCESS);
+        GCM_ACCEL_EXIT;
+
+        return (rv);
 }
 
+
+/*
+ * Encrypt multiple blocks of data in GCM mode.  Decrypt for GCM mode
+ * is done in another function.
+ */
+/*ARGSUSED*/
+int
+gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
+    crypto_data_t *out, size_t block_size,
+    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+    void (*copy_block)(const uint8_t *, uint8_t *),
+    void (*xor_block)(const uint8_t *, uint8_t *),
+    int (*cipher_ctr)(const void *, const uint8_t *, uint8_t *, uint64_t,
+    uint64_t *))
+{
+        return (gcm_process_contiguous_blocks(ctx, data, length, out,
+            block_size, B_TRUE, encrypt_block, copy_block, xor_block,
+            cipher_ctr));
+}
+
 /* ARGSUSED */
 int
 gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
-    void (*copy_block)(uint8_t *, uint8_t *),
-    void (*xor_block)(uint8_t *, uint8_t *))
+    void (*copy_block)(const uint8_t *, uint8_t *),
+    void (*xor_block)(const uint8_t *, uint8_t *))
 {
         uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
         uint8_t *ghash, *macp;
         int i, rv;
 
-        if (out->cd_length <
-            (ctx->gcm_remainder_len + ctx->gcm_tag_len)) {
-                return (CRYPTO_DATA_LEN_RANGE);
+        GCM_ACCEL_ENTER;
+
+        if (out->cd_length < (ctx->gcm_remainder_len + ctx->gcm_tag_len)) {
+                rv = CRYPTO_DATA_LEN_RANGE;
+                goto out;
         }
 
         ghash = (uint8_t *)ctx->gcm_ghash;
 
         if (ctx->gcm_remainder_len > 0) {

@@ -308,179 +478,240 @@
         xor_block((uint8_t *)ctx->gcm_J0, ghash);
 
         if (ctx->gcm_remainder_len > 0) {
                 rv = crypto_put_output_data(macp, out, ctx->gcm_remainder_len);
                 if (rv != CRYPTO_SUCCESS)
-                        return (rv);
+                        goto out;
         }
         out->cd_offset += ctx->gcm_remainder_len;
         ctx->gcm_remainder_len = 0;
         rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
         if (rv != CRYPTO_SUCCESS)
-                return (rv);
+                goto out;
         out->cd_offset += ctx->gcm_tag_len;
-
-        return (CRYPTO_SUCCESS);
+out:
+        GCM_ACCEL_EXIT;
+        return (rv);
 }
 
 /*
  * This will only deal with decrypting the last block of the input that
  * might not be a multiple of block length.
  */
+/*ARGSUSED*/
 static void
-gcm_decrypt_incomplete_block(gcm_ctx_t *ctx, size_t block_size, size_t index,
+gcm_decrypt_incomplete_block(gcm_ctx_t *ctx, uint8_t *data, size_t length,
+    size_t block_size, crypto_data_t *out,
     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
-    void (*xor_block)(uint8_t *, uint8_t *))
+    void (*xor_block)(const uint8_t *, uint8_t *))
 {
-        uint8_t *datap, *outp, *counterp;
         uint64_t counter;
         uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
-        int i;
 
+        /* pad last block and add to GHASH */
+        bcopy(data, ctx->gcm_tmp, length);
+        bzero(((uint8_t *)ctx->gcm_tmp) + length,
+            sizeof (ctx->gcm_tmp) - length);
+        GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash);
+
         /*
          * Increment counter.
-         * Counter bits are confined to the bottom 32 bits
+         * Counter bits are confined to the bottom 32 bits.
          */
         counter = ntohll(ctx->gcm_cb[1] & counter_mask);
         counter = htonll(counter + 1);
         counter &= counter_mask;
         ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
 
-        datap = (uint8_t *)ctx->gcm_remainder;
-        outp = &((ctx->gcm_pt_buf)[index]);
-        counterp = (uint8_t *)ctx->gcm_tmp;
+        encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
+            (uint8_t *)ctx->gcm_tmp);
 
-        /* authentication tag */
-        bzero((uint8_t *)ctx->gcm_tmp, block_size);
-        bcopy(datap, (uint8_t *)ctx->gcm_tmp, ctx->gcm_remainder_len);
-
-        /* add ciphertext to the hash */
-        GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash);
-
-        /* decrypt remaining ciphertext */
-        encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, counterp);
-
         /* XOR with counter block */
-        for (i = 0; i < ctx->gcm_remainder_len; i++) {
-                outp[i] = datap[i] ^ counterp[i];
+        for (size_t i = 0; i < length; i++)
+                ((uint8_t *)ctx->gcm_tmp)[i] ^= data[i];
+
+        if (out != NULL) {
+                (void) crypto_put_output_data((uchar_t *)ctx->gcm_tmp, out,
+                    length);
+                out->cd_offset += length;
+        } else {
+                bcopy(ctx->gcm_tmp, data, length);
         }
 }
 
 /* ARGSUSED */
 int
 gcm_mode_decrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
     crypto_data_t *out, size_t block_size,
     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
-    void (*copy_block)(uint8_t *, uint8_t *),
-    void (*xor_block)(uint8_t *, uint8_t *))
+    void (*copy_block)(const uint8_t *, uint8_t *),
+    void (*xor_block)(const uint8_t *, uint8_t *),
+    int (*cipher_ctr)(const void *, const uint8_t *, uint8_t *, uint64_t,
+    uint64_t *))
 {
-        size_t new_len;
-        uint8_t *new;
+        int rv = CRYPTO_SUCCESS;
 
+        GCM_ACCEL_ENTER;
+
         /*
-         * Copy contiguous ciphertext input blocks to plaintext buffer.
-         * Ciphertext will be decrypted in the final.
+         * Previous calls accumulate data in the input buffer to make sure
+         * we have the auth tag (the last part of the ciphertext) when we
+         * receive a final() call.
          */
-        if (length > 0) {
-                new_len = ctx->gcm_pt_buf_len + length;
-#ifdef _KERNEL
-                new = kmem_alloc(new_len, ctx->gcm_kmflag);
-                bcopy(ctx->gcm_pt_buf, new, ctx->gcm_pt_buf_len);
-                kmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
-#else
-                new = malloc(new_len);
-                bcopy(ctx->gcm_pt_buf, new, ctx->gcm_pt_buf_len);
-                free(ctx->gcm_pt_buf);
-#endif
-                if (new == NULL)
-                        return (CRYPTO_HOST_MEMORY);
+        if (ctx->gcm_last_input_fill > 0) {
+                /* Try to complete the input buffer */
+                size_t to_copy = MIN(length,
+                    sizeof (ctx->gcm_last_input) - ctx->gcm_last_input_fill);
 
-                ctx->gcm_pt_buf = new;
-                ctx->gcm_pt_buf_len = new_len;
-                bcopy(data, &ctx->gcm_pt_buf[ctx->gcm_processed_data_len],
-                    length);
-                ctx->gcm_processed_data_len += length;
+                bcopy(data, ctx->gcm_last_input + ctx->gcm_last_input_fill,
+                    to_copy);
+                data += to_copy;
+                ctx->gcm_last_input_fill += to_copy;
+                length -= to_copy;
+
+                if (ctx->gcm_last_input_fill < sizeof (ctx->gcm_last_input))
+                        /* Not enough input data to continue */
+                        goto out;
+
+                if (length < ctx->gcm_tag_len) {
+                        /*
+                         * There isn't enough data ahead to constitute a full
+                         * auth tag, so only crunch one input block and copy
+                         * the remainder of the input into our buffer.
+                         */
+                        rv = gcm_process_contiguous_blocks(ctx,
+                            (char *)ctx->gcm_last_input, block_size, out,
+                            block_size, B_FALSE, encrypt_block, copy_block,
+                            xor_block, cipher_ctr);
+                        if (rv != CRYPTO_SUCCESS)
+                                goto out;
+                        ctx->gcm_last_input_fill -= block_size;
+                        bcopy(ctx->gcm_last_input + block_size,
+                            ctx->gcm_last_input, ctx->gcm_last_input_fill);
+                        bcopy(data, ctx->gcm_last_input +
+                            ctx->gcm_last_input_fill, length);
+                        ctx->gcm_last_input_fill += length;
+                        /* No more input left */
+                        goto out;
         }
+                /*
+                 * There is enough data ahead for the auth tag, so crunch
+                 * everything in our buffer now and empty it.
+                 */
+                rv = gcm_process_contiguous_blocks(ctx,
+                    (char *)ctx->gcm_last_input, ctx->gcm_last_input_fill,
+                    out, block_size, B_FALSE, encrypt_block, copy_block,
+                    xor_block, cipher_ctr);
+                if (rv != CRYPTO_SUCCESS)
+                        goto out;
+                ctx->gcm_last_input_fill = 0;
+        }
+        /*
+         * The last-input buffer is now empty, so the remaining input starts
+         * on a block boundary. Crunch all full blocks except the trailing
+         * bytes that might be the auth tag, which we must NOT decrypt.
+         */
+        ASSERT(ctx->gcm_last_input_fill == 0);
+        if (length >= block_size + ctx->gcm_tag_len) {
+                size_t to_decrypt = (length - ctx->gcm_tag_len) &
+                    ~(block_size - 1);
 
-        ctx->gcm_remainder_len = 0;
-        return (CRYPTO_SUCCESS);
+                rv = gcm_process_contiguous_blocks(ctx, data, to_decrypt, out,
+                    block_size, B_FALSE, encrypt_block, copy_block, xor_block,
+                    cipher_ctr);
+                if (rv != CRYPTO_SUCCESS)
+                        goto out;
+                data += to_decrypt;
+                length -= to_decrypt;
+        }
+        /*
+         * Copy the remainder into our input buffer; it potentially
+         * holds the auth tag and a last partial block.
+         */
+        ASSERT(length < sizeof (ctx->gcm_last_input));
+        bcopy(data, ctx->gcm_last_input, length);
+        ctx->gcm_last_input_fill += length;
+out:
+        GCM_ACCEL_EXIT;
+
+        return (rv);
 }
 
 int
 gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
-    void (*xor_block)(uint8_t *, uint8_t *))
+    void (*copy_block)(const uint8_t *, uint8_t *),
+    void (*xor_block)(const uint8_t *, uint8_t *),
+    int (*cipher_ctr)(const void *, const uint8_t *, uint8_t *, uint64_t,
+    uint64_t *))
 {
-        size_t pt_len;
-        size_t remainder;
-        uint8_t *ghash;
-        uint8_t *blockp;
-        uint8_t *cbp;
-        uint64_t counter;
-        uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
-        int processed = 0, rv;
+        int rv = CRYPTO_SUCCESS;
 
-        ASSERT(ctx->gcm_processed_data_len == ctx->gcm_pt_buf_len);
+        /* Check there's enough data to at least compute a tag */
+        if (ctx->gcm_last_input_fill < ctx->gcm_tag_len)
+                return (SET_ERROR(CRYPTO_DATA_LEN_RANGE));
 
-        pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
-        ghash = (uint8_t *)ctx->gcm_ghash;
-        blockp = ctx->gcm_pt_buf;
-        remainder = pt_len;
-        while (remainder > 0) {
-                /* Incomplete last block */
-                if (remainder < block_size) {
-                        bcopy(blockp, ctx->gcm_remainder, remainder);
-                        ctx->gcm_remainder_len = remainder;
-                        /*
-                         * not expecting anymore ciphertext, just
-                         * compute plaintext for the remaining input
-                         */
-                        gcm_decrypt_incomplete_block(ctx, block_size,
-                            processed, encrypt_block, xor_block);
-                        ctx->gcm_remainder_len = 0;
-                        goto out;
-                }
-                /* add ciphertext to the hash */
-                GHASH(ctx, blockp, ghash);
+        GCM_ACCEL_ENTER;
 
-                /*
-                 * Increment counter.
-                 * Counter bits are confined to the bottom 32 bits
-                 */
-                counter = ntohll(ctx->gcm_cb[1] & counter_mask);
-                counter = htonll(counter + 1);
-                counter &= counter_mask;
-                ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
+        /* Finish any unprocessed input */
+        if (ctx->gcm_last_input_fill > ctx->gcm_tag_len) {
+                size_t last_blk_len = MIN(block_size,
+                    ctx->gcm_last_input_fill - ctx->gcm_tag_len);
 
-                cbp = (uint8_t *)ctx->gcm_tmp;
-                encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, cbp);
+                /* Finish last full block */
+                if (last_blk_len >= block_size) {
+                        rv = gcm_process_contiguous_blocks(ctx,
+                            (char *)ctx->gcm_last_input, block_size, out,
+                            block_size, B_FALSE, encrypt_block, copy_block,
+                            xor_block, cipher_ctr);
+                        if (rv != CRYPTO_SUCCESS)
+                                goto errout;
 
-                /* XOR with ciphertext */
-                xor_block(cbp, blockp);
+                        last_blk_len -= block_size;
+                        ctx->gcm_processed_data_len += block_size;
+                        ctx->gcm_last_input_fill -= block_size;
 
-                processed += block_size;
-                blockp += block_size;
-                remainder -= block_size;
+                        /* Shift what remains in the input buffer forward */
+                        bcopy(ctx->gcm_last_input + block_size,
+                            ctx->gcm_last_input, ctx->gcm_last_input_fill);
         }
-out:
-        ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
-        GHASH(ctx, ctx->gcm_len_a_len_c, ghash);
+                /* Finish last incomplete block before auth tag */
+                if (last_blk_len > 0) {
+                        gcm_decrypt_incomplete_block(ctx, ctx->gcm_last_input,
+                            last_blk_len, block_size, out, encrypt_block,
+                            xor_block);
+
+                        ctx->gcm_processed_data_len += last_blk_len;
+                        ctx->gcm_last_input_fill -= last_blk_len;
+
+                        /* Shift what remains in the input buffer forward */
+                        bcopy(ctx->gcm_last_input + last_blk_len,
+                            ctx->gcm_last_input, ctx->gcm_last_input_fill);
+                }
+                /* Now the last_input buffer holds just the auth tag */
+        }
+
+        ASSERT(ctx->gcm_last_input_fill == ctx->gcm_tag_len);
+
+        ctx->gcm_len_a_len_c[1] =
+            htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
+        GHASH(ctx, ctx->gcm_len_a_len_c, ctx->gcm_ghash);
         encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
             (uint8_t *)ctx->gcm_J0);
-        xor_block((uint8_t *)ctx->gcm_J0, ghash);
+        xor_block((uint8_t *)ctx->gcm_J0, (uint8_t *)ctx->gcm_ghash);
 
+        GCM_ACCEL_EXIT;
+
         /* compare the input authentication tag with what we calculated */
-        if (bcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
-                /* They don't match */
-                return (CRYPTO_INVALID_MAC);
-        } else {
-                rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
-                if (rv != CRYPTO_SUCCESS)
-                        return (rv);
-                out->cd_offset += pt_len;
-        }
+        if (bcmp(&ctx->gcm_last_input, ctx->gcm_ghash, ctx->gcm_tag_len) != 0)
+                return (SET_ERROR(CRYPTO_INVALID_MAC));
+
         return (CRYPTO_SUCCESS);
+
+errout:
+        GCM_ACCEL_EXIT;
+        return (rv);
 }
 
 static int
 gcm_validate_args(CK_AES_GCM_PARAMS *gcm_param)
 {

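A worked trace of the decrypt-side buffering implemented above, assuming block_size = 16, gcm_tag_len = 16 and a two-block (32-byte) gcm_last_input, which is what the fill/shift logic implies (sizes are illustrative; the buffer's actual definition lives in modes.h and is not part of this hunk):

/*
 * update(40 bytes): (40 - 16) rounded down to a block = 16 bytes are
 *     decrypted; the trailing 24 bytes stay in gcm_last_input because
 *     they may contain the auth tag.
 * update(8 bytes):  the buffer tops out at 32 bytes, so one more block
 *     is decrypted from it and 16 bytes remain buffered.
 * final():          the 16 buffered bytes are taken as the auth tag and
 *     compared against the computed GHASH value.
 */
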
@@ -498,24 +729,25 @@
         case 112:
         case 120:
         case 128:
                 break;
         default:
-                return (CRYPTO_MECHANISM_PARAM_INVALID);
+                return (SET_ERROR(CRYPTO_MECHANISM_PARAM_INVALID));
         }
 
         if (gcm_param->ulIvLen == 0)
-                return (CRYPTO_MECHANISM_PARAM_INVALID);
+                return (SET_ERROR(CRYPTO_MECHANISM_PARAM_INVALID));
 
         return (CRYPTO_SUCCESS);
 }
 
+/*ARGSUSED*/
 static void
 gcm_format_initial_blocks(uchar_t *iv, ulong_t iv_len,
     gcm_ctx_t *ctx, size_t block_size,
-    void (*copy_block)(uint8_t *, uint8_t *),
-    void (*xor_block)(uint8_t *, uint8_t *))
+    void (*copy_block)(const uint8_t *, uint8_t *),
+    void (*xor_block)(const uint8_t *, uint8_t *))
 {
         uint8_t *cb;
         ulong_t remainder = iv_len;
         ulong_t processed = 0;
         uint8_t *datap, *ghash;

@@ -562,24 +794,36 @@
  */
 int
 gcm_init(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len,
     unsigned char *auth_data, size_t auth_data_len, size_t block_size,
     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
-    void (*copy_block)(uint8_t *, uint8_t *),
-    void (*xor_block)(uint8_t *, uint8_t *))
+    void (*copy_block)(const uint8_t *, uint8_t *),
+    void (*xor_block)(const uint8_t *, uint8_t *))
 {
         uint8_t *ghash, *datap, *authp;
         size_t remainder, processed;
 
+        GCM_ACCEL_ENTER;
+
         /* encrypt zero block to get subkey H */
         bzero(ctx->gcm_H, sizeof (ctx->gcm_H));
         encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_H,
             (uint8_t *)ctx->gcm_H);
 
         gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
             copy_block, xor_block);
 
+#ifdef  __amd64
+        if (intel_pclmulqdq_instruction_present()) {
+                uint64_t H_bswap64[2] = {
+                    ntohll(ctx->gcm_H[0]), ntohll(ctx->gcm_H[1])
+                };
+
+                gcm_init_clmul(H_bswap64, ctx->gcm_H_table);
+        }
+#endif
+
         authp = (uint8_t *)ctx->gcm_tmp;
         ghash = (uint8_t *)ctx->gcm_ghash;
         bzero(authp, block_size);
         bzero(ghash, block_size);
 

@@ -604,19 +848,25 @@
                 /* add auth data to the hash */
                 GHASH(ctx, datap, ghash);
 
         } while (remainder > 0);
 
+        GCM_ACCEL_EXIT;
+
         return (CRYPTO_SUCCESS);
 }
 
 int
 gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size,
     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
-    void (*copy_block)(uint8_t *, uint8_t *),
-    void (*xor_block)(uint8_t *, uint8_t *))
+    void (*copy_block)(const uint8_t *, uint8_t *),
+    void (*xor_block)(const uint8_t *, uint8_t *))
 {
+        /*
+         * No GHASH invocations in this function and gcm_init does its own
+         * FPU saving, so no need to GCM_ACCEL_ENTER/GCM_ACCEL_EXIT here.
+         */
         int rv;
         CK_AES_GCM_PARAMS *gcm_param;
 
         if (param != NULL) {
                 gcm_param = (CK_AES_GCM_PARAMS *)(void *)param;

@@ -650,13 +900,17 @@
 }
 
 int
 gmac_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size,
     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
-    void (*copy_block)(uint8_t *, uint8_t *),
-    void (*xor_block)(uint8_t *, uint8_t *))
+    void (*copy_block)(const uint8_t *, uint8_t *),
+    void (*xor_block)(const uint8_t *, uint8_t *))
 {
+        /*
+         * No GHASH invocations in this function and gcm_init does its own
+         * FPU saving, so no need to GCM_ACCEL_ENTER/GCM_ACCEL_EXIT here.
+         */
         int rv;
         CK_AES_GMAC_PARAMS *gmac_param;
 
         if (param != NULL) {
                 gmac_param = (CK_AES_GMAC_PARAMS *)(void *)param;

@@ -730,11 +984,11 @@
  * Cache the result, as the CPU can't change.
  *
  * Note: the userland version uses getisax().  The kernel version uses
  * is_x86_featureset().
  */
-static int
+static inline int
 intel_pclmulqdq_instruction_present(void)
 {
         static int      cached_result = -1;
 
         if (cached_result == -1) { /* first time */
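The rest of this function falls outside the hunk. For orientation, a hedged sketch of the conventional completion, using the interfaces named in the comment above (is_x86_feature()/x86_featureset in the kernel, getisax() with AV_386_PCLMULQDQ in userland); the actual body may differ:

static inline int
intel_pclmulqdq_instruction_present(void)
{
        static int      cached_result = -1;

        if (cached_result == -1) { /* first time */
#ifdef _KERNEL
                cached_result =
                    is_x86_feature(x86_featureset, X86FSET_PCLMULQDQ);
#else
                uint_t          ui = 0;

                (void) getisax(&ui, 1);
                cached_result = (ui & AV_386_PCLMULQDQ) != 0;
#endif  /* _KERNEL */
        }
        return (cached_result);
}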