4896 Performance improvements for KCF AES modes

          --- old/usr/src/common/crypto/aes/aes_impl.c
          +++ new/usr/src/common/crypto/aes/aes_impl.c
... 12 lines elided ...
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
       23 + * Copyright 2015 by Saso Kiselkov. All rights reserved.
  23   24   */
  24   25  
  25   26  #include <sys/types.h>
  26   27  #include <sys/systm.h>
  27   28  #include <sys/sysmacros.h>
  28   29  #include <netinet/in.h>
  29   30  #include "aes_impl.h"
  30   31  #ifndef _KERNEL
  31   32  #include <strings.h>
  32   33  #include <stdlib.h>
       34 +#include <sys/note.h>
  33   35  #endif  /* !_KERNEL */
  34   36  
  35   37  #ifdef __amd64
  36   38  
  37   39  #ifdef _KERNEL
  38   40  #include <sys/cpuvar.h>         /* cpu_t, CPU */
  39   41  #include <sys/x86_archext.h>    /* x86_featureset, X86FSET_AES */
  40   42  #include <sys/disp.h>           /* kpreempt_disable(), kpreempt_enable */
  41   43  
  42   44  /* Workaround for no XMM kernel thread save/restore */
... 44 lines elided ...
  87   89   * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  88   90   */
  89   91  
  90   92  #if defined(sun4u)
  91   93  /* External assembly functions: */
  92   94  extern void aes_encrypt_impl(const uint32_t rk[], int Nr, const uint32_t pt[4],
  93   95          uint32_t ct[4]);
  94   96  extern void aes_decrypt_impl(const uint32_t rk[], int Nr, const uint32_t ct[4],
  95   97          uint32_t pt[4]);
  96   98  
  97      -#define AES_ENCRYPT_IMPL(a, b, c, d, e) aes_encrypt_impl(a, b, c, d)
  98      -#define AES_DECRYPT_IMPL(a, b, c, d, e) aes_decrypt_impl(a, b, c, d)
       99 +#define AES_ENCRYPT_IMPL(a, b, c, d)    aes_encrypt_impl(a, b, c, d)
      100 +#define AES_DECRYPT_IMPL(a, b, c, d)    aes_decrypt_impl(a, b, c, d)
  99  101  
 100  102  #elif defined(__amd64)
 101  103  
 102  104  /* These functions are used to execute amd64 instructions for AMD or Intel: */
 103  105  extern int rijndael_key_setup_enc_amd64(uint32_t rk[],
 104  106          const uint32_t cipherKey[], int keyBits);
 105  107  extern int rijndael_key_setup_dec_amd64(uint32_t rk[],
 106  108          const uint32_t cipherKey[], int keyBits);
 107  109  extern void aes_encrypt_amd64(const uint32_t rk[], int Nr,
 108  110          const uint32_t pt[4], uint32_t ct[4]);
... 2 lines elided ...
 111  113  
 112  114  /* These functions are used to execute Intel-specific AES-NI instructions: */
 113  115  extern int rijndael_key_setup_enc_intel(uint32_t rk[],
 114  116          const uint32_t cipherKey[], uint64_t keyBits);
 115  117  extern int rijndael_key_setup_dec_intel(uint32_t rk[],
 116  118          const uint32_t cipherKey[], uint64_t keyBits);
 117  119  extern void aes_encrypt_intel(const uint32_t rk[], int Nr,
 118  120          const uint32_t pt[4], uint32_t ct[4]);
 119  121  extern void aes_decrypt_intel(const uint32_t rk[], int Nr,
 120  122          const uint32_t ct[4], uint32_t pt[4]);
      123 +extern void aes_encrypt_intel8(const uint32_t rk[], int Nr,
      124 +        const void *pt, void *ct);
      125 +extern void aes_decrypt_intel8(const uint32_t rk[], int Nr,
      126 +        const void *ct, void *pt);
      127 +extern void aes_encrypt_cbc_intel8(const uint32_t rk[], int Nr,
      128 +        const void *pt, void *ct, const void *iv);
      129 +extern void aes_ctr_intel8(const uint32_t rk[], int Nr,
      130 +        const void *input, void *output, uint64_t counter_upper_BE,
      131 +        uint64_t counter_lower_LE);
      132 +extern void aes_xor_intel(const uint8_t *, uint8_t *);
 121  133  
 122      -static int intel_aes_instructions_present(void);
      134 +static inline int intel_aes_instructions_present(void);
 123  135  
 124      -#define AES_ENCRYPT_IMPL(a, b, c, d, e) rijndael_encrypt(a, b, c, d, e)
 125      -#define AES_DECRYPT_IMPL(a, b, c, d, e) rijndael_decrypt(a, b, c, d, e)
      136 +#ifdef  _KERNEL
      137 +/*
      138 + * Some form of floating-point acceleration is available, so declare these.
      139 + * The implementations will be in a platform-specific assembly file (e.g.
      140 + * amd64/aes_intel.s for SSE2/AES-NI).
      141 + */
      142 +extern void aes_accel_save(void *savestate);
      143 +extern void aes_accel_restore(void *savestate);
      144 +#endif  /* _KERNEL */
 126  145  
 127  146  #else /* Generic C implementation */
 128      -
 129      -#define AES_ENCRYPT_IMPL(a, b, c, d, e) rijndael_encrypt(a, b, c, d)
 130      -#define AES_DECRYPT_IMPL(a, b, c, d, e) rijndael_decrypt(a, b, c, d)
      147 +static void rijndael_encrypt(const uint32_t rk[], int Nr, const uint32_t pt[4],
      148 +    uint32_t ct[4]);
      149 +static void rijndael_decrypt(const uint32_t rk[], int Nr, const uint32_t pt[4],
      150 +    uint32_t ct[4]);
 131  151  #define rijndael_key_setup_enc_raw      rijndael_key_setup_enc
      152 +#define AES_ENCRYPT_IMPL(a, b, c, d)    rijndael_encrypt(a, b, c, d)
      153 +#define AES_DECRYPT_IMPL(a, b, c, d)    rijndael_decrypt(a, b, c, d)
 132  154  #endif  /* sun4u || __amd64 */
 133  155  
 134  156  #if defined(_LITTLE_ENDIAN) && !defined(__amd64)
 135  157  #define AES_BYTE_SWAP
 136  158  #endif
 137  159  
 138  160  
 139  161  #if !defined(__amd64)
 140  162  /*
 141  163   *  Constant tables
... 991 lines elided ...
1133 1155   * key schedules.
1134 1156   *
1135 1157   * Parameters:
1136 1158   * key          AES key schedule to be initialized
1137 1159   * keyarr32     User key
1138 1160   * keyBits      AES key size (128, 192, or 256 bits)
1139 1161   */
1140 1162  static void
1141 1163  aes_setupkeys(aes_key_t *key, const uint32_t *keyarr32, int keybits)
1142 1164  {
     1165 +        AES_ACCEL_SAVESTATE(savestate);
     1166 +        aes_accel_enter(savestate);
     1167 +
1143 1168          if (intel_aes_instructions_present()) {
1144      -                key->flags = INTEL_AES_NI_CAPABLE;
1145      -                KPREEMPT_DISABLE;
1146 1169                  key->nr = rijndael_key_setup_enc_intel(&(key->encr_ks.ks32[0]),
1147 1170                      keyarr32, keybits);
1148 1171                  key->nr = rijndael_key_setup_dec_intel(&(key->decr_ks.ks32[0]),
1149 1172                      keyarr32, keybits);
1150      -                KPREEMPT_ENABLE;
1151 1173          } else {
1152      -                key->flags = 0;
1153 1174                  key->nr = rijndael_key_setup_enc_amd64(&(key->encr_ks.ks32[0]),
1154 1175                      keyarr32, keybits);
1155 1176                  key->nr = rijndael_key_setup_dec_amd64(&(key->decr_ks.ks32[0]),
1156 1177                      keyarr32, keybits);
1157 1178          }
1158 1179  
     1180 +        aes_accel_exit(savestate);
1159 1181          key->type = AES_32BIT_KS;
1160 1182  }
1161 1183  
1162      -/*
1163      - * Encrypt one block of data. The block is assumed to be an array
1164      - * of four uint32_t values, so copy for alignment (and byte-order
1165      - * reversal for little endian systems might be necessary on the
1166      - * input and output byte streams.
1167      - * The size of the key schedule depends on the number of rounds
1168      - * (which can be computed from the size of the key), i.e. 4*(Nr + 1).
1169      - *
1170      - * Parameters:
1171      - * rk           Key schedule, of aes_ks_t (60 32-bit integers)
1172      - * Nr           Number of rounds
1173      - * pt           Input block (plain text)
1174      - * ct           Output block (crypto text).  Can overlap with pt
1175      - * flags        Indicates whether we're on Intel AES-NI-capable hardware
1176      - */
1177      -static void
1178      -rijndael_encrypt(const uint32_t rk[], int Nr, const uint32_t pt[4],
1179      -    uint32_t ct[4], int flags) {
1180      -        if (flags & INTEL_AES_NI_CAPABLE) {
1181      -                KPREEMPT_DISABLE;
1182      -                aes_encrypt_intel(rk, Nr, pt, ct);
1183      -                KPREEMPT_ENABLE;
1184      -        } else {
1185      -                aes_encrypt_amd64(rk, Nr, pt, ct);
1186      -        }
1187      -}
1188      -
1189      -/*
1190      - * Decrypt one block of data. The block is assumed to be an array
1191      - * of four uint32_t values, so copy for alignment (and byte-order
1192      - * reversal for little endian systems might be necessary on the
1193      - * input and output byte streams.
1194      - * The size of the key schedule depends on the number of rounds
1195      - * (which can be computed from the size of the key), i.e. 4*(Nr + 1).
1196      - *
1197      - * Parameters:
1198      - * rk           Key schedule, of aes_ks_t (60 32-bit integers)
1199      - * Nr           Number of rounds
1200      - * ct           Input block (crypto text)
1201      - * pt           Output block (plain text). Can overlap with pt
1202      - * flags        Indicates whether we're on Intel AES-NI-capable hardware
1203      - */
1204      -static void
1205      -rijndael_decrypt(const uint32_t rk[], int Nr, const uint32_t ct[4],
1206      -    uint32_t pt[4], int flags) {
1207      -        if (flags & INTEL_AES_NI_CAPABLE) {
1208      -                KPREEMPT_DISABLE;
1209      -                aes_decrypt_intel(rk, Nr, ct, pt);
1210      -                KPREEMPT_ENABLE;
1211      -        } else {
1212      -                aes_decrypt_amd64(rk, Nr, ct, pt);
1213      -        }
1214      -}
1215      -
1216      -
1217 1184  #else /* generic C implementation */
1218 1185  
1219 1186  /*
1220 1187   *  Expand the cipher key into the decryption key schedule.
1221 1188   *  Return the number of rounds for the given cipher key size.
1222 1189   *  The size of the key schedule depends on the number of rounds
1223 1190   *  (which can be computed from the size of the key), i.e. 4*(Nr + 1).
1224 1191   *
1225 1192   * Parameters:
1226 1193   * rk           AES key schedule 32-bit array to be initialized
... 388 lines elided ...
1615 1582  
1616 1583  #else   /* byte swap */
1617 1584          for (i = 0, j = 0; j < keysize; i++, j += 4) {
1618 1585                  keyarr.ka32[i] = htonl(*(uint32_t *)(void *)&cipherKey[j]);
1619 1586          }
1620 1587  #endif
1621 1588  
1622 1589          aes_setupkeys(newbie, keyarr.ka32, keyBits);
1623 1590  }
1624 1591  
     1592 +#if     defined(__amd64) && defined(_KERNEL)
     1593 +void
     1594 +aes_accel_enter(void *savestate)
     1595 +{
     1596 +        KPREEMPT_DISABLE;
     1597 +        aes_accel_save(savestate);
     1598 +}
1625 1599  
     1600 +void
     1601 +aes_accel_exit(void *savestate)
     1602 +{
     1603 +        aes_accel_restore(savestate);
     1604 +        KPREEMPT_ENABLE;
     1605 +}
     1606 +#endif  /* defined(__amd64) && defined(_KERNEL) */
     1607 +
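
The enter/exit pair above brackets any use of the XMM/AES-NI state in kernel context; the bulk block routines in this file do not appear to take it themselves, so a caller is expected to hold it across the whole operation, just as aes_setupkeys() does around key expansion. A minimal sketch of that pattern follows (illustrative only, not part of this change; AES_ACCEL_SAVESTATE is assumed to be the save-area declarator from aes_impl.h, and the example function name is hypothetical):

#if defined(__amd64) && defined(_KERNEL)
/*
 * Sketch: bulk-encrypt a buffer while holding the FPU/XMM state,
 * mirroring the pattern used by aes_setupkeys() above.
 */
static int
aes_bulk_ecb_example(const void *ks, const uint8_t *pt, uint8_t *ct,
    uint64_t length)
{
        AES_ACCEL_SAVESTATE(savestate);
        int rv;

        aes_accel_enter(savestate);     /* kpreempt_disable() + save XMM */
        rv = aes_encrypt_ecb(ks, pt, ct, length);
        aes_accel_exit(savestate);      /* restore XMM + kpreempt_enable() */

        return (rv);
}
#endif  /* __amd64 && _KERNEL */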
1626 1608  /*
1627 1609   * Encrypt one block using AES.
1628 1610   * Align if needed and (for x86 32-bit only) byte-swap.
1629 1611   *
1630 1612   * Parameters:
1631 1613   * ks   Key schedule, of type aes_key_t
1632 1614   * pt   Input block (plain text)
1633 1615   * ct   Output block (crypto text).  Can overlap with pt
1634 1616   */
1635 1617  int
1636 1618  aes_encrypt_block(const void *ks, const uint8_t *pt, uint8_t *ct)
1637 1619  {
1638 1620          aes_key_t       *ksch = (aes_key_t *)ks;
1639 1621  
     1622 +#ifdef  __amd64
     1623 +        if (intel_aes_instructions_present())
     1624 +                aes_encrypt_intel(&ksch->encr_ks.ks32[0], ksch->nr,
     1625 +                    /* LINTED:  pointer alignment */
     1626 +                    (uint32_t *)pt, (uint32_t *)ct);
     1627 +        else
     1628 +                aes_encrypt_amd64(&ksch->encr_ks.ks32[0], ksch->nr,
     1629 +                    /* LINTED:  pointer alignment */
     1630 +                    (uint32_t *)pt, (uint32_t *)ct);
     1631 +#else   /* !__amd64 */
1640 1632  #ifndef AES_BYTE_SWAP
1641 1633          if (IS_P2ALIGNED2(pt, ct, sizeof (uint32_t))) {
1642      -                /* LINTED:  pointer alignment */
1643 1634                  AES_ENCRYPT_IMPL(&ksch->encr_ks.ks32[0], ksch->nr,
1644 1635                      /* LINTED:  pointer alignment */
1645      -                    (uint32_t *)pt, (uint32_t *)ct, ksch->flags);
     1636 +                    (uint32_t *)pt, (uint32_t *)ct);
1646 1637          } else {
1647 1638  #endif
1648 1639                  uint32_t buffer[AES_BLOCK_LEN / sizeof (uint32_t)];
1649 1640  
1650 1641                  /* Copy input block into buffer */
1651 1642  #ifndef AES_BYTE_SWAP
1652 1643                  bcopy(pt, &buffer, AES_BLOCK_LEN);
1653 1644  
1654 1645  #else   /* byte swap */
1655 1646                  buffer[0] = htonl(*(uint32_t *)(void *)&pt[0]);
1656 1647                  buffer[1] = htonl(*(uint32_t *)(void *)&pt[4]);
1657 1648                  buffer[2] = htonl(*(uint32_t *)(void *)&pt[8]);
1658 1649                  buffer[3] = htonl(*(uint32_t *)(void *)&pt[12]);
1659      -#endif
     1650 +#endif  /* byte swap */
1660 1651  
1661 1652                  AES_ENCRYPT_IMPL(&ksch->encr_ks.ks32[0], ksch->nr,
1662      -                    buffer, buffer, ksch->flags);
     1653 +                    buffer, buffer);
1663 1654  
1664 1655                  /* Copy result from buffer to output block */
1665 1656  #ifndef AES_BYTE_SWAP
1666 1657                  bcopy(&buffer, ct, AES_BLOCK_LEN);
1667 1658          }
1668 1659  
1669 1660  #else   /* byte swap */
1670 1661                  *(uint32_t *)(void *)&ct[0] = htonl(buffer[0]);
1671 1662                  *(uint32_t *)(void *)&ct[4] = htonl(buffer[1]);
1672 1663                  *(uint32_t *)(void *)&ct[8] = htonl(buffer[2]);
1673 1664                  *(uint32_t *)(void *)&ct[12] = htonl(buffer[3]);
1674      -#endif
     1665 +#endif  /* byte swap */
     1666 +#endif  /* !__amd64 */
     1667 +
1675 1668          return (CRYPTO_SUCCESS);
1676 1669  }
1677 1670  
1678 1671  
1679 1672  /*
1680 1673   * Decrypt one block using AES.
1681 1674   * Align and byte-swap if needed.
1682 1675   *
1683 1676   * Parameters:
1684 1677   * ks   Key schedule, of type aes_key_t
1685 1678   * ct   Input block (crypto text)
1686 1679   * pt   Output block (plain text). Can overlap with pt
1687 1680   */
1688 1681  int
1689 1682  aes_decrypt_block(const void *ks, const uint8_t *ct, uint8_t *pt)
1690 1683  {
1691 1684          aes_key_t       *ksch = (aes_key_t *)ks;
1692 1685  
     1686 +#ifdef  __amd64
     1687 +        if (intel_aes_instructions_present())
     1688 +                aes_decrypt_intel(&ksch->decr_ks.ks32[0], ksch->nr,
     1689 +                    /* LINTED:  pointer alignment */
     1690 +                    (uint32_t *)ct, (uint32_t *)pt);
     1691 +        else
     1692 +                aes_decrypt_amd64(&ksch->decr_ks.ks32[0], ksch->nr,
     1693 +                    /* LINTED:  pointer alignment */
     1694 +                    (uint32_t *)ct, (uint32_t *)pt);
     1695 +#else   /* !__amd64 */
1693 1696  #ifndef AES_BYTE_SWAP
1694 1697          if (IS_P2ALIGNED2(ct, pt, sizeof (uint32_t))) {
1695      -                /* LINTED:  pointer alignment */
1696 1698                  AES_DECRYPT_IMPL(&ksch->decr_ks.ks32[0], ksch->nr,
1697 1699                      /* LINTED:  pointer alignment */
1698      -                    (uint32_t *)ct, (uint32_t *)pt, ksch->flags);
     1700 +                    (uint32_t *)ct, (uint32_t *)pt);
1699 1701          } else {
1700 1702  #endif
1701 1703                  uint32_t buffer[AES_BLOCK_LEN / sizeof (uint32_t)];
1702 1704  
1703 1705                  /* Copy input block into buffer */
1704 1706  #ifndef AES_BYTE_SWAP
1705 1707                  bcopy(ct, &buffer, AES_BLOCK_LEN);
1706 1708  
1707 1709  #else   /* byte swap */
1708 1710                  buffer[0] = htonl(*(uint32_t *)(void *)&ct[0]);
1709 1711                  buffer[1] = htonl(*(uint32_t *)(void *)&ct[4]);
1710 1712                  buffer[2] = htonl(*(uint32_t *)(void *)&ct[8]);
1711 1713                  buffer[3] = htonl(*(uint32_t *)(void *)&ct[12]);
1712      -#endif
     1714 +#endif  /* byte swap */
1713 1715  
1714 1716                  AES_DECRYPT_IMPL(&ksch->decr_ks.ks32[0], ksch->nr,
1715      -                    buffer, buffer, ksch->flags);
     1717 +                    buffer, buffer);
1716 1718  
1717 1719                  /* Copy result from buffer to output block */
1718 1720  #ifndef AES_BYTE_SWAP
1719 1721                  bcopy(&buffer, pt, AES_BLOCK_LEN);
1720 1722          }
1721 1723  
1722 1724  #else   /* byte swap */
1723 1725          *(uint32_t *)(void *)&pt[0] = htonl(buffer[0]);
1724 1726          *(uint32_t *)(void *)&pt[4] = htonl(buffer[1]);
1725 1727          *(uint32_t *)(void *)&pt[8] = htonl(buffer[2]);
1726 1728          *(uint32_t *)(void *)&pt[12] = htonl(buffer[3]);
1727      -#endif
     1729 +#endif  /* byte swap */
     1730 +#endif  /* !__amd64 */
1728 1731  
1729 1732          return (CRYPTO_SUCCESS);
1730 1733  }
1731 1734  
     1735 +#define ECB_LOOP(ciph_func)                                             \
     1736 +        do {                                                            \
     1737 +                for (; i < length; i += AES_BLOCK_LEN)                  \
     1738 +                        ciph_func;                                      \
     1739 +                _NOTE(CONSTCOND)                                        \
     1740 +        } while (0)
     1741 +#define ECB_LOOP_4P(ciph_func, enc_or_dec, in, out)                     \
     1742 +        ECB_LOOP(ciph_func(&ksch->enc_or_dec ## r_ks.ks32[0],           \
     1743 +            ksch->nr, (void *)&in[i], (void *)&out[i]))
     1744 +#define ECB_LOOP_3P(ciph_func, in, out)                                 \
     1745 +        ECB_LOOP(ciph_func(ksch, (void *)&in[i], (void *)&out[i]))
1732 1746  
     1747 +#ifdef  __amd64
     1748 +#define ECB_INTEL_IMPL(enc_or_dec, in, out)                             \
     1749 +        do {                                                            \
     1750 +                if (intel_aes_instructions_present()) {                 \
     1751 +                        /* first use the accelerated function */        \
     1752 +                        for (; i + 8 * AES_BLOCK_LEN <= length;         \
     1753 +                            i += 8 * AES_BLOCK_LEN)                     \
     1754 +                                aes_ ## enc_or_dec ## rypt_intel8(      \
     1755 +                                    &ksch->enc_or_dec ## r_ks.ks32[0],  \
     1756 +                                    ksch->nr, &in[i], &out[i]);         \
     1757 +                        /* finish off the remainder per-block */        \
     1758 +                        ECB_LOOP_4P(aes_ ## enc_or_dec ## rypt_intel,   \
     1759 +                            enc_or_dec, in, out);                       \
     1760 +                } else {                                                \
     1761 +                        ECB_LOOP_4P(aes_ ## enc_or_dec ## rypt_amd64,   \
     1762 +                            enc_or_dec, in, out);                       \
     1763 +                }                                                       \
     1764 +                _NOTE(CONSTCOND)                                        \
     1765 +        } while (0)
     1766 +#endif  /* __amd64 */
     1767 +
1733 1768  /*
     1769 + * Perform AES ECB encryption on a sequence of blocks. On x86-64 CPUs with
     1770 + * the AES-NI extension, this performs the encryption in increments of 8
     1771 + * blocks at a time, exploiting instruction parallelism more efficiently.
     1772 + * On other platforms, this simply encrypts the blocks in sequence.
     1773 + */
     1774 +int
     1775 +aes_encrypt_ecb(const void *ks, const uint8_t *pt, uint8_t *ct, uint64_t length)
     1776 +{
     1777 +        aes_key_t *ksch = (aes_key_t *)ks;
     1778 +        uint64_t i = 0;
     1779 +
     1780 +#ifdef  __amd64
     1781 +        ECB_INTEL_IMPL(enc, pt, ct);
     1782 +#elif   defined(sun4u)
     1783 +        ECB_LOOP_4P(aes_encrypt_impl, enc, pt, ct);
     1784 +#else   /* Generic C implementation */
     1785 +        ECB_LOOP_3P((void) aes_encrypt_block, pt, ct);
     1786 +#endif  /* Generic C implementation */
     1787 +
     1788 +        return (CRYPTO_SUCCESS);
     1789 +}
     1790 +
     1791 +/*
     1792 + * Same as aes_encrypt_ecb, but performs decryption.
     1793 + */
     1794 +int
     1795 +aes_decrypt_ecb(const void *ks, const uint8_t *ct, uint8_t *pt, uint64_t length)
     1796 +{
     1797 +        aes_key_t *ksch = (aes_key_t *)ks;
     1798 +        uint64_t i = 0;
     1799 +
     1800 +#ifdef  __amd64
     1801 +        ECB_INTEL_IMPL(dec, ct, pt);
     1802 +#elif   defined(sun4u)
     1803 +        ECB_LOOP_4P(aes_decrypt_impl, dec, ct, pt);
     1804 +#else   /* Generic C implementation */
     1805 +        ECB_LOOP_3P((void) aes_decrypt_block, ct, pt);
     1806 +#endif  /* Generic C implementation */
     1807 +
     1808 +        return (CRYPTO_SUCCESS);
     1809 +}
     1810 +#ifdef  __amd64
     1811 +#undef  ECB_INTEL_IMPL
     1812 +#endif  /* __amd64 */
     1813 +
     1814 +#undef  ECB_LOOP
     1815 +#undef  ECB_LOOP_4P
     1816 +#undef  ECB_LOOP_3P
     1817 +
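For readability, here is what the amd64 path of aes_encrypt_ecb() above amounts to once ECB_INTEL_IMPL(enc, pt, ct) and the ECB_LOOP_* macros are expanded. This is a de-macroed sketch for illustration only (the _expanded name is not part of the change):

/*
 * Approximate expansion of the amd64 branch of aes_encrypt_ecb().
 */
int
aes_encrypt_ecb_expanded(const void *ks, const uint8_t *pt, uint8_t *ct,
    uint64_t length)
{
        aes_key_t *ksch = (aes_key_t *)ks;
        uint64_t i = 0;

        if (intel_aes_instructions_present()) {
                /* 8 blocks at a time using the wide AES-NI routine */
                for (; i + 8 * AES_BLOCK_LEN <= length;
                    i += 8 * AES_BLOCK_LEN)
                        aes_encrypt_intel8(&ksch->encr_ks.ks32[0],
                            ksch->nr, &pt[i], &ct[i]);
                /* finish the remainder one block at a time */
                for (; i < length; i += AES_BLOCK_LEN)
                        aes_encrypt_intel(&ksch->encr_ks.ks32[0],
                            ksch->nr, (void *)&pt[i], (void *)&ct[i]);
        } else {
                for (; i < length; i += AES_BLOCK_LEN)
                        aes_encrypt_amd64(&ksch->encr_ks.ks32[0],
                            ksch->nr, (void *)&pt[i], (void *)&ct[i]);
        }

        return (CRYPTO_SUCCESS);
}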
     1818 +#define CBC_LOOP(enc_func, xor_func)                                    \
     1819 +        do {                                                            \
     1820 +                for (; i < length; i += AES_BLOCK_LEN) {                \
     1821 +                        /* copy IV to ciphertext */                     \
     1822 +                        bcopy(iv, &ct[i], AES_BLOCK_LEN);               \
     1824 +                        /* XOR the plaintext block into the IV copy */  \
     1824 +                        xor_func(&pt[i], &ct[i]);                       \
     1825 +                        /* encrypt the block in place in the output */  \
     1826 +                        enc_func;                                       \
     1827 +                        iv = &ct[i];                                    \
     1828 +                }                                                       \
     1829 +                _NOTE(CONSTCOND)                                        \
     1830 +        } while (0)
     1831 +#define CBC_LOOP_4P(enc_func, xor_func)                                 \
     1832 +        CBC_LOOP(enc_func(&ksch->encr_ks.ks32[0],                       \
     1833 +            ksch->nr, (void *)&ct[i], (void *)&ct[i]), xor_func)
     1834 +#define CBC_LOOP_3P(enc_func, xor_func)                                 \
     1835 +        CBC_LOOP(enc_func(ksch, (void *)&ct[i], (void *)&ct[i]), xor_func)
     1836 +
     1837 +/*
     1838 + * Encrypts a sequence of consecutive AES blocks in CBC mode. On x86-64
     1839 + * with the AES-NI extension, the encryption is performed on 8 blocks at
     1840 + * a time using an optimized assembly implementation, giving a speed boost
     1841 + * of around 75%. On other platforms, this simply performs CBC encryption
     1842 + * in sequence on the blocks.
     1843 + *
     1844 + * Decryption acceleration is implemented in the kernel kcf block cipher
     1845 + * modes code (cbc.c), because that doesn't require a complete hand-tuned
     1846 + * CBC implementation in assembly.
     1847 + */
     1848 +int
     1849 +aes_encrypt_cbc(const void *ks, const uint8_t *pt, uint8_t *ct,
     1850 +    const uint8_t *iv, uint64_t length)
     1851 +{
     1852 +        aes_key_t *ksch = (aes_key_t *)ks;
     1853 +        size_t i = 0;
     1854 +
     1855 +#ifdef  __amd64
     1856 +        if (intel_aes_instructions_present()) {
     1857 +                for (; i + 8 * AES_BLOCK_LEN <= length;
     1858 +                    i += 8 * AES_BLOCK_LEN) {
     1859 +                        aes_encrypt_cbc_intel8(&ksch->encr_ks.ks32[0],
     1860 +                            ksch->nr, &ct[i], &ct[i], iv);
     1861 +                        iv = &ct[i + 7 * AES_BLOCK_LEN];
     1862 +                }
     1863 +                CBC_LOOP_4P(aes_encrypt_intel, aes_xor_intel);
     1864 +        } else {
     1865 +                CBC_LOOP_4P(aes_encrypt_amd64, aes_xor_intel);
     1866 +        }
     1867 +#elif   defined(sun4u)
     1868 +        CBC_LOOP_4P(aes_encrypt_impl, aes_xor_block);
     1869 +#else   /* Generic C implementation */
     1870 +        CBC_LOOP_3P((void) aes_encrypt_block, aes_xor_block);
     1871 +#endif  /* Generic C implementation */
     1872 +
     1873 +        return (CRYPTO_SUCCESS);
     1874 +}
     1875 +#undef  CBC_LOOP
     1876 +#undef  CBC_LOOP_4P
     1877 +#undef  CBC_LOOP_3P
     1878 +
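
The comment above notes that CBC decryption is accelerated in the kcf modes code (cbc.c) rather than here. The reason it parallelizes while encryption does not is visible in a minimal reference loop (a sketch, not part of this change; assumes pt and ct do not overlap): every aes_decrypt_block() call is independent, and the chaining block is only consumed by the final XOR.

/*
 * Reference CBC decryption sketch (illustrative only): the block
 * decryptions carry no data dependency on one another, so a batched
 * implementation can decrypt many blocks at once (e.g. via
 * aes_decrypt_ecb()) and apply the chaining XORs afterwards.
 */
static int
aes_decrypt_cbc_sketch(const void *ks, const uint8_t *ct, uint8_t *pt,
    const uint8_t *iv, uint64_t length)
{
        uint64_t i, j;

        for (i = 0; i < length; i += AES_BLOCK_LEN) {
                (void) aes_decrypt_block(ks, &ct[i], &pt[i]);
                /* P[i] = D(C[i]) ^ C[i-1], with C[-1] being the IV */
                for (j = 0; j < AES_BLOCK_LEN; j++)
                        pt[i + j] ^= iv[j];
                iv = &ct[i];
        }

        return (CRYPTO_SUCCESS);
}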
     1879 +#define CTR_LOOP(enc_func, xor_func)                                    \
     1880 +        do {                                                            \
     1881 +                for (; i < length; i += AES_BLOCK_LEN) {                \
     1882 +                        /* set up counter in output region */           \
     1883 +                        *(uint64_t *)(void *)&output[i] = counter[0];   \
     1884 +                        *(uint64_t *)(void *)&output[i + 8] =           \
     1885 +                            htonll(counter[1]++);                       \
     1886 +                        /* encrypt counter in output region */          \
     1887 +                        enc_func;                                       \
     1888 +                        /* XOR encrypted counter with input */          \
     1889 +                        xor_func(&input[i], &output[i]);                \
     1890 +                }                                                       \
     1891 +                _NOTE(CONSTCOND)                                        \
     1892 +        } while (0)
     1893 +#define CTR_LOOP_4P(enc_func, xor_func)                                 \
     1894 +        CTR_LOOP(enc_func(&ksch->encr_ks.ks32[0], ksch->nr,             \
     1895 +            (void *)&output[i], (void *)&output[i]), xor_func)
     1896 +#define CTR_LOOP_3P(enc_func, xor_func)                                 \
     1897 +        CTR_LOOP(enc_func(ksch, (void *)&output[i], (void *)&output[i]),\
     1898 +            xor_func)
     1899 +/*
     1900 + * Performs high-performance counter mode encryption and decryption on
     1901 + * a sequence of blocks. In CTR mode, encryption and decryption are the
     1902 + * same operation, just with the plaintext and ciphertext reversed:
     1903 + * plaintext = CTR(CTR(plaintext, K), K)
     1904 + * Blocks also do not interdepend on each other, so it is an excellent
     1905 + * mode when high performance is required and data authentication/integrity
     1906 + * checking is provided via some other means, or isn't necessary.
     1907 + *
     1908 + * On x86-64 with the AES-NI extension, this code performs CTR mode
     1909 + * encryption in parallel on 8 blocks at a time and can provide in
     1910 + * excess of 3GB/s/core of encryption/decryption performance (<1 CPB).
     1911 + */
     1912 +int
     1913 +aes_ctr_mode(const void *ks, const uint8_t *input, uint8_t *output,
     1914 +    uint64_t length, uint64_t counter[2])
     1915 +{
     1916 +        aes_key_t *ksch = (aes_key_t *)ks;
     1917 +        uint64_t i = 0;
     1918 +
     1919 +        /* swap lower part to host order for computations */
     1920 +        counter[1] = ntohll(counter[1]);
     1921 +
     1922 +#ifdef  __amd64
     1923 +        if (intel_aes_instructions_present()) {
     1924 +                /* first use the wide-register accelerated function */
     1925 +                for (; i + 8 * AES_BLOCK_LEN <= length;
     1926 +                    i += 8 * AES_BLOCK_LEN) {
     1927 +                        aes_ctr_intel8(&ksch->encr_ks.ks32[0], ksch->nr,
     1928 +                            &input[i], &output[i], counter[0], counter[1]);
     1929 +                        counter[1] += 8;
     1930 +                }
     1931 +                /* finish off the remainder using the slow per-block method */
     1932 +                CTR_LOOP_4P(aes_encrypt_intel, aes_xor_intel);
     1933 +        } else {
     1934 +                CTR_LOOP_4P(aes_encrypt_amd64, aes_xor_intel);
     1935 +        }
     1936 +#elif   defined(sun4u)
     1937 +        CTR_LOOP_4P(aes_encrypt_impl, aes_xor_block);
     1938 +#else   /* Generic C implementation */
     1939 +        CTR_LOOP_3P((void) aes_encrypt_block, aes_xor_block);
     1940 +#endif  /* Generic C implementation */
     1941 +
     1942 +        /* swap lower part back to big endian */
     1943 +        counter[1] = htonll(counter[1]);
     1944 +
     1945 +        return (CRYPTO_SUCCESS);
     1946 +}
     1947 +#undef  CTR_LOOP
     1948 +
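Since CTR encryption and decryption are the same keystream XOR, a round trip is just two calls to aes_ctr_mode() with the counter reset in between. A usage sketch (illustrative only; the function name is hypothetical, and the buffers are kept distinct because the per-block fallback loop above writes the counter block into the output before XORing in the input):

/*
 * Usage sketch: encrypt pt into ct, then run the same operation on ct
 * to recover the plaintext into out.  The counter is passed as two
 * big-endian 64-bit halves and is advanced in place by aes_ctr_mode(),
 * so it must be reset before the second call.
 */
void
aes_ctr_roundtrip_sketch(const void *ks, const uint8_t *pt, uint8_t *ct,
    uint8_t *out, uint64_t length, uint64_t iv_upper_be, uint64_t iv_lower_be)
{
        uint64_t counter[2];

        counter[0] = iv_upper_be;       /* upper half, big-endian */
        counter[1] = iv_lower_be;       /* lower half, big-endian */
        (void) aes_ctr_mode(ks, pt, ct, length, counter);       /* encrypt */

        counter[0] = iv_upper_be;       /* reset for the second pass */
        counter[1] = iv_lower_be;
        (void) aes_ctr_mode(ks, ct, out, length, counter);      /* decrypt */
        /* out now matches pt */
}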
     1949 +/*
1734 1950   * Allocate key schedule for AES.
1735 1951   *
1736 1952   * Return the pointer and set size to the number of bytes allocated.
1737 1953   * Memory allocated must be freed by the caller when done.
1738 1954   *
1739 1955   * Parameters:
1740 1956   * size         Size of key schedule allocated, in bytes
1741 1957   * kmflag       Flag passed to kmem_alloc(9F); ignored in userland.
1742 1958   */
1743 1959  /* ARGSUSED */
... 11 lines elided ...
1755 1971          if (keysched != NULL) {
1756 1972                  *size = sizeof (aes_key_t);
1757 1973                  return (keysched);
1758 1974          }
1759 1975          return (NULL);
1760 1976  }
1761 1977  
1762 1978  
1763 1979  #ifdef __amd64
1764 1980  /*
1765      - * Return 1 if executing on Intel with AES-NI instructions,
1766      - * otherwise 0 (i.e., Intel without AES-NI or AMD64).
     1981 + * Return 1 if executing on x86-64 with AES-NI instructions, otherwise 0.
1767 1982   * Cache the result, as the CPU can't change.
1768 1983   *
1769 1984   * Note: the userland version uses getisax().  The kernel version uses
1770 1985   * global variable x86_featureset.
1771 1986   */
1772      -static int
     1987 +static inline int
1773 1988  intel_aes_instructions_present(void)
1774 1989  {
1775 1990          static int      cached_result = -1;
1776 1991  
1777 1992          if (cached_result == -1) { /* first time */
1778 1993  #ifdef _KERNEL
1779 1994                  cached_result = is_x86_feature(x86_featureset, X86FSET_AES);
1780 1995  #else
1781 1996                  uint_t          ui = 0;
1782 1997  
1783 1998                  (void) getisax(&ui, 1);
1784 1999                  cached_result = (ui & AV_386_AES) != 0;
1785 2000  #endif  /* _KERNEL */
1786 2001          }
1787 2002  
1788 2003          return (cached_result);
1789 2004  }
1790 2005  #endif  /* __amd64 */
    