1 /* ====================================================================
   2  * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
   3  *
   4  * Redistribution and use in source and binary forms, with or without
   5  * modification, are permitted provided that the following conditions
   6  * are met:
   7  *
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  *
  11  * 2. Redistributions in binary form must reproduce the above copyright
  12  *    notice, this list of conditions and the following disclaimer in
  13  *    the documentation and/or other materials provided with the
  14  *    distribution.
  15  *
  16  * 3. All advertising materials mentioning features or use of this
  17  *    software must display the following acknowledgment:
  18  *    "This product includes software developed by the OpenSSL Project
  19  *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
  20  *
  21  * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
  22  *    endorse or promote products derived from this software without
  23  *    prior written permission. For written permission, please contact
  24  *    openssl-core@openssl.org.
  25  *
  26  * 5. Products derived from this software may not be called "OpenSSL"
  27  *    nor may "OpenSSL" appear in their names without prior written
  28  *    permission of the OpenSSL Project.
  29  *
  30  * 6. Redistributions of any form whatsoever must retain the following
  31  *    acknowledgment:
  32  *    "This product includes software developed by the OpenSSL Project
  33  *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
  34  *
  35  * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
  36  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  37  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  38  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
  39  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  40  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  41  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  42  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  43  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  44  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  45  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
  46  * OF THE POSSIBILITY OF SUCH DAMAGE.
  47  * ====================================================================
  48  */
  49 
  50 #define OPENSSL_FIPSAPI
  51 
  52 #include <openssl/crypto.h>
  53 #include "modes_lcl.h"
  54 #include <string.h>
  55 
  56 #ifndef MODES_DEBUG
  57 # ifndef NDEBUG
  58 #  define NDEBUG
  59 # endif
  60 #endif
  61 #include <assert.h>
  62 
  63 #if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
  64 /* redefine, because alignment is ensured */
  65 #undef  GETU32
  66 #define GETU32(p)       BSWAP4(*(const u32 *)(p))
  67 #undef  PUTU32
  68 #define PUTU32(p,v)     *(u32 *)(p) = BSWAP4(v)
  69 #endif
  70 
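/*
 * PACK() places a 16-bit reduction constant in the top 16 bits of a
 * size_t, so the same rem_4bit/rem_8bit tables serve both 32- and
 * 64-bit builds. REDUCE1BIT(V) multiplies V by x in GF(2^128): a
 * one-bit right shift in the bit-reflected GHASH representation,
 * folding the constant 0xE1... into the high word when the shifted-out
 * bit was set.
 */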
  71 #define PACK(s)         ((size_t)(s)<<(sizeof(size_t)*8-16))
  72 #define REDUCE1BIT(V)   do { \
  73         if (sizeof(size_t)==8) { \
  74                 u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
  75                 V.lo  = (V.hi<<63)|(V.lo>>1); \
  76                 V.hi  = (V.hi>>1 )^T; \
  77         } \
  78         else { \
  79                 u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
  80                 V.lo  = (V.hi<<63)|(V.lo>>1); \
  81                 V.hi  = (V.hi>>1 )^((u64)T<<32); \
  82         } \
  83 } while(0)
  84 
/*
 * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
 * never be set to 8. 8 is effectively reserved for testing purposes.
 * TABLE_BITS>1 selects the lookup-table-driven implementations referred
 * to as "Shoup's" in the GCM specification; in other words OpenSSL does
 * not cover the whole spectrum of possible table-driven implementations.
 * Why? In the non-"Shoup's" case the memory access pattern is segmented
 * in such a manner that cache-timing information can trivially reveal a
 * fair portion of the intermediate hash value. Given that the ciphertext
 * is always available to an attacker, this would let him attempt to
 * deduce the secret parameter H and, if successful, tamper with messages
 * [which is trivial in CTR mode]. In the "Shoup's" case it is not as
 * easy, but there is no reason to believe the approach is resistant to
 * cache-timing attacks either. As for the "8-bit" implementation, it
 * consumes 16 (sixteen) times more memory, 4KB per individual key + 1KB
 * shared. On the pro side it should be roughly twice as fast as the
 * "4-bit" version, and for gcc-generated x86[_64] code the "8-bit"
 * version was observed to run ~75% faster, closer to 100% for commercial
 * compilers... Yet the "4-bit" procedure is preferred, because it is
 * believed to provide a better security-performance balance and adequate
 * all-round performance. "All-round" refers to things like:
 *
 * - shorter setup time effectively improves overall timing for
 *   handling short messages;
 * - larger table allocation can become unbearable because of VM
 *   subsystem penalties (for example on Windows a large enough free()
 *   results in VM working-set trimming, meaning that a subsequent
 *   malloc would immediately incur working-set expansion);
 * - a larger table has a larger cache footprint, which can affect the
 *   performance of other code paths (not necessarily even from the same
 *   thread in a Hyper-Threading world);
 *
 * A value of 1 is not appropriate for performance reasons.
 */
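/*
 * Informal sketch of the 4-bit ("Shoup's") approach used below: Htable[]
 * caches the 16 multiples of H that a 4-bit nibble can select, and
 * gcm_gmult_4bit scans Xi a nibble at a time starting from the last
 * byte. After each table lookup the accumulator is multiplied by x^4
 * (a 4-bit right shift in this bit-reflected representation), with
 * rem_4bit[] supplying the reduction of the four bits shifted out,
 * modulo the GCM polynomial x^128 + x^7 + x^2 + x + 1.
 */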
 119 #if     TABLE_BITS==8
 120 
 121 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
 122 {
 123         int  i, j;
 124         u128 V;
 125 
 126         Htable[0].hi = 0;
 127         Htable[0].lo = 0;
 128         V.hi = H[0];
 129         V.lo = H[1];
 130 
 131         for (Htable[128]=V, i=64; i>0; i>>=1) {
 132                 REDUCE1BIT(V);
 133                 Htable[i] = V;
 134         }
 135 
 136         for (i=2; i<256; i<<=1) {
 137                 u128 *Hi = Htable+i, H0 = *Hi;
 138                 for (j=1; j<i; ++j) {
 139                         Hi[j].hi = H0.hi^Htable[j].hi;
 140                         Hi[j].lo = H0.lo^Htable[j].lo;
 141                 }
 142         }
 143 }
 144 
 145 static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
 146 {
 147         u128 Z = { 0, 0};
 148         const u8 *xi = (const u8 *)Xi+15;
 149         size_t rem, n = *xi;
 150         const union { long one; char little; } is_endian = {1};
 151         static const size_t rem_8bit[256] = {
 152                 PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
 153                 PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
 154                 PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
 155                 PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
 156                 PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
 157                 PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
 158                 PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
 159                 PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
 160                 PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
 161                 PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
 162                 PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
 163                 PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
 164                 PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
 165                 PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
 166                 PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
 167                 PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
 168                 PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
 169                 PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
 170                 PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
 171                 PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
 172                 PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
 173                 PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
 174                 PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
 175                 PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
 176                 PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
 177                 PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
 178                 PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
 179                 PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
 180                 PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
 181                 PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
 182                 PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
 183                 PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
 184                 PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
 185                 PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
 186                 PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
 187                 PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
 188                 PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
 189                 PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
 190                 PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
 191                 PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
 192                 PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
 193                 PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
 194                 PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
 195                 PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
 196                 PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
 197                 PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
 198                 PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
 199                 PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
 200                 PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
 201                 PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
 202                 PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
 203                 PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
 204                 PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
 205                 PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
 206                 PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
 207                 PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
 208                 PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
 209                 PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
 210                 PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
 211                 PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
 212                 PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
 213                 PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
 214                 PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
 215                 PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
 216 
 217         while (1) {
 218                 Z.hi ^= Htable[n].hi;
 219                 Z.lo ^= Htable[n].lo;
 220 
 221                 if ((u8 *)Xi==xi)       break;
 222 
 223                 n = *(--xi);
 224 
 225                 rem  = (size_t)Z.lo&0xff;
 226                 Z.lo = (Z.hi<<56)|(Z.lo>>8);
 227                 Z.hi = (Z.hi>>8);
 228                 if (sizeof(size_t)==8)
 229                         Z.hi ^= rem_8bit[rem];
 230                 else
 231                         Z.hi ^= (u64)rem_8bit[rem]<<32;
 232         }
 233 
 234         if (is_endian.little) {
 235 #ifdef BSWAP8
 236                 Xi[0] = BSWAP8(Z.hi);
 237                 Xi[1] = BSWAP8(Z.lo);
 238 #else
 239                 u8 *p = (u8 *)Xi;
 240                 u32 v;
 241                 v = (u32)(Z.hi>>32);      PUTU32(p,v);
 242                 v = (u32)(Z.hi);        PUTU32(p+4,v);
 243                 v = (u32)(Z.lo>>32);      PUTU32(p+8,v);
 244                 v = (u32)(Z.lo);        PUTU32(p+12,v);
 245 #endif
 246         }
 247         else {
 248                 Xi[0] = Z.hi;
 249                 Xi[1] = Z.lo;
 250         }
 251 }
 252 #define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
 253 
 254 #elif   TABLE_BITS==4
 255 
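/*
 * gcm_init_4bit fills Htable[] with the 16 multiples of H that a 4-bit
 * nibble can select, either via a loop (small-footprint build) or fully
 * unrolled, and fixes up the word order expected by the ARM assembler
 * where applicable.
 */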
 256 static void gcm_init_4bit(u128 Htable[16], u64 H[2])
 257 {
 258         u128 V;
 259 #if defined(OPENSSL_SMALL_FOOTPRINT)
 260         int  i;
 261 #endif
 262 
 263         Htable[0].hi = 0;
 264         Htable[0].lo = 0;
 265         V.hi = H[0];
 266         V.lo = H[1];
 267 
 268 #if defined(OPENSSL_SMALL_FOOTPRINT)
 269         for (Htable[8]=V, i=4; i>0; i>>=1) {
 270                 REDUCE1BIT(V);
 271                 Htable[i] = V;
 272         }
 273 
 274         for (i=2; i<16; i<<=1) {
 275                 u128 *Hi = Htable+i;
 276                 int   j;
 277                 for (V=*Hi, j=1; j<i; ++j) {
 278                         Hi[j].hi = V.hi^Htable[j].hi;
 279                         Hi[j].lo = V.lo^Htable[j].lo;
 280                 }
 281         }
 282 #else
 283         Htable[8] = V;
 284         REDUCE1BIT(V);
 285         Htable[4] = V;
 286         REDUCE1BIT(V);
 287         Htable[2] = V;
 288         REDUCE1BIT(V);
 289         Htable[1] = V;
 290         Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
 291         V=Htable[4];
 292         Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
 293         Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
 294         Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
 295         V=Htable[8];
 296         Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
 297         Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
 298         Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
 299         Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
 300         Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
 301         Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
 302         Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
 303 #endif
 304 #if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
 305         /*
 306          * ARM assembler expects specific dword order in Htable.
 307          */
 308         {
 309         int j;
 310         const union { long one; char little; } is_endian = {1};
 311 
 312         if (is_endian.little)
 313                 for (j=0;j<16;++j) {
 314                         V = Htable[j];
 315                         Htable[j].hi = V.lo;
 316                         Htable[j].lo = V.hi;
 317                 }
 318         else
 319                 for (j=0;j<16;++j) {
 320                         V = Htable[j];
 321                         Htable[j].hi = V.lo<<32|V.lo>>32;
 322                         Htable[j].lo = V.hi<<32|V.hi>>32;
 323                 }
 324         }
 325 #endif
 326 }
 327 
 328 #ifndef GHASH_ASM
 329 static const size_t rem_4bit[16] = {
 330         PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
 331         PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
 332         PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
 333         PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
 334 
 335 static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
 336 {
 337         u128 Z;
 338         int cnt = 15;
 339         size_t rem, nlo, nhi;
 340         const union { long one; char little; } is_endian = {1};
 341 
 342         nlo  = ((const u8 *)Xi)[15];
 343         nhi  = nlo>>4;
 344         nlo &= 0xf;
 345 
 346         Z.hi = Htable[nlo].hi;
 347         Z.lo = Htable[nlo].lo;
 348 
 349         while (1) {
 350                 rem  = (size_t)Z.lo&0xf;
 351                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
 352                 Z.hi = (Z.hi>>4);
 353                 if (sizeof(size_t)==8)
 354                         Z.hi ^= rem_4bit[rem];
 355                 else
 356                         Z.hi ^= (u64)rem_4bit[rem]<<32;
 357 
 358                 Z.hi ^= Htable[nhi].hi;
 359                 Z.lo ^= Htable[nhi].lo;
 360 
 361                 if (--cnt<0)         break;
 362 
 363                 nlo  = ((const u8 *)Xi)[cnt];
 364                 nhi  = nlo>>4;
 365                 nlo &= 0xf;
 366 
 367                 rem  = (size_t)Z.lo&0xf;
 368                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
 369                 Z.hi = (Z.hi>>4);
 370                 if (sizeof(size_t)==8)
 371                         Z.hi ^= rem_4bit[rem];
 372                 else
 373                         Z.hi ^= (u64)rem_4bit[rem]<<32;
 374 
 375                 Z.hi ^= Htable[nlo].hi;
 376                 Z.lo ^= Htable[nlo].lo;
 377         }
 378 
 379         if (is_endian.little) {
 380 #ifdef BSWAP8
 381                 Xi[0] = BSWAP8(Z.hi);
 382                 Xi[1] = BSWAP8(Z.lo);
 383 #else
 384                 u8 *p = (u8 *)Xi;
 385                 u32 v;
 386                 v = (u32)(Z.hi>>32);      PUTU32(p,v);
 387                 v = (u32)(Z.hi);        PUTU32(p+4,v);
 388                 v = (u32)(Z.lo>>32);      PUTU32(p+8,v);
 389                 v = (u32)(Z.lo);        PUTU32(p+12,v);
 390 #endif
 391         }
 392         else {
 393                 Xi[0] = Z.hi;
 394                 Xi[1] = Z.lo;
 395         }
 396 }
 397 
 398 #if !defined(OPENSSL_SMALL_FOOTPRINT)
/*
 * Streamed version of gcm_gmult_4bit, see CRYPTO_gcm128_[en|de]crypt
 * for details... Compiler-generated code doesn't seem to give any
 * performance improvement, at least not on x86[_64]. It's here mostly
 * as a reference and a placeholder for possible future non-trivial
 * optimization[s]...
 */
 406 static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
 407                                 const u8 *inp,size_t len)
 408 {
 409     u128 Z;
 410     int cnt;
 411     size_t rem, nlo, nhi;
 412     const union { long one; char little; } is_endian = {1};
 413 
 414 #if 1
 415     do {
 416         cnt  = 15;
 417         nlo  = ((const u8 *)Xi)[15];
 418         nlo ^= inp[15];
 419         nhi  = nlo>>4;
 420         nlo &= 0xf;
 421 
 422         Z.hi = Htable[nlo].hi;
 423         Z.lo = Htable[nlo].lo;
 424 
 425         while (1) {
 426                 rem  = (size_t)Z.lo&0xf;
 427                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
 428                 Z.hi = (Z.hi>>4);
 429                 if (sizeof(size_t)==8)
 430                         Z.hi ^= rem_4bit[rem];
 431                 else
 432                         Z.hi ^= (u64)rem_4bit[rem]<<32;
 433 
 434                 Z.hi ^= Htable[nhi].hi;
 435                 Z.lo ^= Htable[nhi].lo;
 436 
 437                 if (--cnt<0)         break;
 438 
 439                 nlo  = ((const u8 *)Xi)[cnt];
 440                 nlo ^= inp[cnt];
 441                 nhi  = nlo>>4;
 442                 nlo &= 0xf;
 443 
 444                 rem  = (size_t)Z.lo&0xf;
 445                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
 446                 Z.hi = (Z.hi>>4);
 447                 if (sizeof(size_t)==8)
 448                         Z.hi ^= rem_4bit[rem];
 449                 else
 450                         Z.hi ^= (u64)rem_4bit[rem]<<32;
 451 
 452                 Z.hi ^= Htable[nlo].hi;
 453                 Z.lo ^= Htable[nlo].lo;
 454         }
 455 #else
    /*
     * An extra 256+16 bytes per key plus a 512-byte shared table
     * [should] give a ~50% improvement... One could have PACK()-ed
     * rem_8bit even here, but the priority is to minimize the
     * cache footprint...
     */
 462     u128 Hshr4[16];     /* Htable shifted right by 4 bits */
 463     u8   Hshl4[16];     /* Htable shifted left  by 4 bits */
 464     static const unsigned short rem_8bit[256] = {
 465         0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
 466         0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
 467         0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
 468         0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
 469         0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
 470         0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
 471         0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
 472         0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
 473         0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
 474         0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
 475         0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
 476         0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
 477         0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
 478         0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
 479         0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
 480         0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
 481         0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
 482         0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
 483         0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
 484         0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
 485         0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
 486         0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
 487         0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
 488         0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
 489         0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
 490         0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
 491         0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
 492         0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
 493         0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
 494         0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
 495         0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
 496         0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
    /*
     * This pre-processing phase slows the procedure down by roughly as
     * much as it speeds up each loop iteration. In other words,
     * single-block performance is about the same as the straightforward
     * "4-bit" implementation, and from there on it only gets faster...
     */
 503     for (cnt=0; cnt<16; ++cnt) {
 504         Z.hi = Htable[cnt].hi;
 505         Z.lo = Htable[cnt].lo;
 506         Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
 507         Hshr4[cnt].hi = (Z.hi>>4);
 508         Hshl4[cnt]    = (u8)(Z.lo<<4);
 509     }
 510 
 511     do {
 512         for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
 513                 nlo  = ((const u8 *)Xi)[cnt];
 514                 nlo ^= inp[cnt];
 515                 nhi  = nlo>>4;
 516                 nlo &= 0xf;
 517 
 518                 Z.hi ^= Htable[nlo].hi;
 519                 Z.lo ^= Htable[nlo].lo;
 520 
 521                 rem = (size_t)Z.lo&0xff;
 522 
 523                 Z.lo = (Z.hi<<56)|(Z.lo>>8);
 524                 Z.hi = (Z.hi>>8);
 525 
 526                 Z.hi ^= Hshr4[nhi].hi;
 527                 Z.lo ^= Hshr4[nhi].lo;
 528                 Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
 529         }
 530 
 531         nlo  = ((const u8 *)Xi)[0];
 532         nlo ^= inp[0];
 533         nhi  = nlo>>4;
 534         nlo &= 0xf;
 535 
 536         Z.hi ^= Htable[nlo].hi;
 537         Z.lo ^= Htable[nlo].lo;
 538 
 539         rem = (size_t)Z.lo&0xf;
 540 
 541         Z.lo = (Z.hi<<60)|(Z.lo>>4);
 542         Z.hi = (Z.hi>>4);
 543 
 544         Z.hi ^= Htable[nhi].hi;
 545         Z.lo ^= Htable[nhi].lo;
 546         Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
 547 #endif
 548 
 549         if (is_endian.little) {
 550 #ifdef BSWAP8
 551                 Xi[0] = BSWAP8(Z.hi);
 552                 Xi[1] = BSWAP8(Z.lo);
 553 #else
 554                 u8 *p = (u8 *)Xi;
 555                 u32 v;
 556                 v = (u32)(Z.hi>>32);      PUTU32(p,v);
 557                 v = (u32)(Z.hi);        PUTU32(p+4,v);
 558                 v = (u32)(Z.lo>>32);      PUTU32(p+8,v);
 559                 v = (u32)(Z.lo);        PUTU32(p+12,v);
 560 #endif
 561         }
 562         else {
 563                 Xi[0] = Z.hi;
 564                 Xi[1] = Z.lo;
 565         }
 566     } while (inp+=16, len-=16);
 567 }
 568 #endif
 569 #else
 570 void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
 571 void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
 572 #endif
 573 
 574 #define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
 575 #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
 576 #define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
/* GHASH_CHUNK is a "stride parameter" whose purpose is to mitigate
 * cache-thrashing effects. In other words, the idea is to hash data
 * while it's still in the L1 cache after the encryption pass... */
 580 #define GHASH_CHUNK       (3*1024)
 581 #endif
 582 
 583 #else   /* TABLE_BITS */
 584 
 585 static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
 586 {
 587         u128 V,Z = { 0,0 };
 588         long X;
 589         int  i,j;
 590         const long *xi = (const long *)Xi;
 591         const union { long one; char little; } is_endian = {1};
 592 
 593         V.hi = H[0];    /* H is in host byte order, no byte swapping */
 594         V.lo = H[1];
 595 
 596         for (j=0; j<16/sizeof(long); ++j) {
 597                 if (is_endian.little) {
 598                         if (sizeof(long)==8) {
 599 #ifdef BSWAP8
 600                                 X = (long)(BSWAP8(xi[j]));
 601 #else
 602                                 const u8 *p = (const u8 *)(xi+j);
 603                                 X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
 604 #endif
 605                         }
 606                         else {
 607                                 const u8 *p = (const u8 *)(xi+j);
 608                                 X = (long)GETU32(p);
 609                         }
 610                 }
 611                 else
 612                         X = xi[j];
 613 
 614                 for (i=0; i<8*sizeof(long); ++i, X<<=1) {
 615                         u64 M = (u64)(X>>(8*sizeof(long)-1));
 616                         Z.hi ^= V.hi&M;
 617                         Z.lo ^= V.lo&M;
 618 
 619                         REDUCE1BIT(V);
 620                 }
 621         }
 622 
 623         if (is_endian.little) {
 624 #ifdef BSWAP8
 625                 Xi[0] = BSWAP8(Z.hi);
 626                 Xi[1] = BSWAP8(Z.lo);
 627 #else
 628                 u8 *p = (u8 *)Xi;
 629                 u32 v;
 630                 v = (u32)(Z.hi>>32);      PUTU32(p,v);
 631                 v = (u32)(Z.hi);        PUTU32(p+4,v);
 632                 v = (u32)(Z.lo>>32);      PUTU32(p+8,v);
 633                 v = (u32)(Z.lo);        PUTU32(p+12,v);
 634 #endif
 635         }
 636         else {
 637                 Xi[0] = Z.hi;
 638                 Xi[1] = Z.lo;
 639         }
 640 }
 641 #define GCM_MUL(ctx,Xi)   gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
 642 
 643 #endif
 644 
 645 #if     TABLE_BITS==4 && defined(GHASH_ASM)
 646 # if    !defined(I386_ONLY) && \
 647         (defined(__i386)        || defined(__i386__)    || \
 648          defined(__x86_64)      || defined(__x86_64__)  || \
 649          defined(_M_IX86)       || defined(_M_AMD64)    || defined(_M_X64))
 650 #  define GHASH_ASM_X86_OR_64
 651 #  define GCM_FUNCREF_4BIT
 652 extern unsigned int OPENSSL_ia32cap_P[2];
 653 
 654 void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
 655 void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
 656 void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
 657 
 658 #  if   defined(__i386) || defined(__i386__) || defined(_M_IX86)
 659 #   define GHASH_ASM_X86
 660 void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
 661 void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
 662 
 663 void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
 664 void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
 665 #  endif
 666 # elif defined(__arm__) || defined(__arm)
 667 #  include "arm_arch.h"
 668 #  if __ARM_ARCH__>=7
 669 #   define GHASH_ASM_ARM
 670 #   define GCM_FUNCREF_4BIT
 671 void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
 672 void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
 673 #  endif
 674 # endif
 675 #endif
 676 
 677 #ifdef GCM_FUNCREF_4BIT
 678 # undef  GCM_MUL
 679 # define GCM_MUL(ctx,Xi)        (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
 680 # ifdef GHASH
 681 #  undef  GHASH
 682 #  define GHASH(ctx,in,len)     (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
 683 # endif
 684 #endif
 685 
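/*
 * CRYPTO_gcm128_init initializes |ctx| for use with the cipher |block|
 * (e.g. an AES block function) keyed by |key|: it computes the hash key
 * H = E_K(0^128) and precomputes the GHASH lookup table, selecting an
 * assembler/PCLMULQDQ/NEON path where one is available.
 *
 * Typical call sequence (an illustrative sketch only; aes_ks, the
 * buffers and their lengths are assumed to be set up by the caller,
 * and AES_encrypt is just one possible block128_f):
 *
 *      GCM128_CONTEXT gcm;
 *      CRYPTO_gcm128_init(&gcm, &aes_ks, (block128_f)AES_encrypt);
 *      CRYPTO_gcm128_setiv(&gcm, iv, iv_len);
 *      CRYPTO_gcm128_aad(&gcm, aad, aad_len);
 *      CRYPTO_gcm128_encrypt(&gcm, plaintext, ciphertext, len);
 *
 * after which CRYPTO_gcm128_finish() completes the GHASH computation
 * over the lengths so the authentication tag can be verified.
 */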
 686 void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
 687 {
 688         const union { long one; char little; } is_endian = {1};
 689 
 690         memset(ctx,0,sizeof(*ctx));
 691         ctx->block = block;
 692         ctx->key   = key;
 693 
 694         (*block)(ctx->H.c,ctx->H.c,key);
 695 
 696         if (is_endian.little) {
 697                 /* H is stored in host byte order */
 698 #ifdef BSWAP8
 699                 ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
 700                 ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
 701 #else
 702                 u8 *p = ctx->H.c;
 703                 u64 hi,lo;
 704                 hi = (u64)GETU32(p)  <<32|GETU32(p+4);
 705                 lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
 706                 ctx->H.u[0] = hi;
 707                 ctx->H.u[1] = lo;
 708 #endif
 709         }
 710 
 711 #if     TABLE_BITS==8
 712         gcm_init_8bit(ctx->Htable,ctx->H.u);
 713 #elif   TABLE_BITS==4
 714 # if    defined(GHASH_ASM_X86_OR_64)
 715 #  if   !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
 716         if (OPENSSL_ia32cap_P[0]&(1<<24) &&   /* check FXSR bit */
 717             OPENSSL_ia32cap_P[1]&(1<<1) ) {   /* check PCLMULQDQ bit */
 718                 gcm_init_clmul(ctx->Htable,ctx->H.u);
 719                 ctx->gmult = gcm_gmult_clmul;
 720                 ctx->ghash = gcm_ghash_clmul;
 721                 return;
 722         }
 723 #  endif
 724         gcm_init_4bit(ctx->Htable,ctx->H.u);
 725 #  if   defined(GHASH_ASM_X86)                  /* x86 only */
 726 #   if  defined(OPENSSL_IA32_SSE2)
 727         if (OPENSSL_ia32cap_P[0]&(1<<25)) {   /* check SSE bit */
 728 #   else
 729         if (OPENSSL_ia32cap_P[0]&(1<<23)) {   /* check MMX bit */
 730 #   endif
 731                 ctx->gmult = gcm_gmult_4bit_mmx;
 732                 ctx->ghash = gcm_ghash_4bit_mmx;
 733         } else {
 734                 ctx->gmult = gcm_gmult_4bit_x86;
 735                 ctx->ghash = gcm_ghash_4bit_x86;
 736         }
 737 #  else
 738         ctx->gmult = gcm_gmult_4bit;
 739         ctx->ghash = gcm_ghash_4bit;
 740 #  endif
 741 # elif  defined(GHASH_ASM_ARM)
 742         if (OPENSSL_armcap_P & ARMV7_NEON) {
 743                 ctx->gmult = gcm_gmult_neon;
 744                 ctx->ghash = gcm_ghash_neon;
 745         } else {
 746                 gcm_init_4bit(ctx->Htable,ctx->H.u);
 747                 ctx->gmult = gcm_gmult_4bit;
 748                 ctx->ghash = gcm_ghash_4bit;
 749         }
 750 # else
 751         gcm_init_4bit(ctx->Htable,ctx->H.u);
 752 # endif
 753 #endif
 754 }
 755 
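/*
 * CRYPTO_gcm128_setiv resets the per-message state (Yi, Xi, lengths)
 * and installs the IV: a 96-bit IV is used directly as Y0 with the
 * counter set to 1, while any other length is GHASHed together with its
 * bit length to derive Y0. It then computes EK0 = E_K(Y0), which is
 * needed for the final tag.
 */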
 756 void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
 757 {
 758         const union { long one; char little; } is_endian = {1};
 759         unsigned int ctr;
 760 #ifdef GCM_FUNCREF_4BIT
 761         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
 762 #endif
 763 
 764         ctx->Yi.u[0]  = 0;
 765         ctx->Yi.u[1]  = 0;
 766         ctx->Xi.u[0]  = 0;
 767         ctx->Xi.u[1]  = 0;
 768         ctx->len.u[0] = 0;   /* AAD length */
 769         ctx->len.u[1] = 0;   /* message length */
 770         ctx->ares = 0;
 771         ctx->mres = 0;
 772 
 773         if (len==12) {
 774                 memcpy(ctx->Yi.c,iv,12);
 775                 ctx->Yi.c[15]=1;
 776                 ctr=1;
 777         }
 778         else {
 779                 size_t i;
 780                 u64 len0 = len;
 781 
 782                 while (len>=16) {
 783                         for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
 784                         GCM_MUL(ctx,Yi);
 785                         iv += 16;
 786                         len -= 16;
 787                 }
 788                 if (len) {
 789                         for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
 790                         GCM_MUL(ctx,Yi);
 791                 }
 792                 len0 <<= 3;
 793                 if (is_endian.little) {
 794 #ifdef BSWAP8
 795                         ctx->Yi.u[1]  ^= BSWAP8(len0);
 796 #else
 797                         ctx->Yi.c[8]  ^= (u8)(len0>>56);
 798                         ctx->Yi.c[9]  ^= (u8)(len0>>48);
 799                         ctx->Yi.c[10] ^= (u8)(len0>>40);
 800                         ctx->Yi.c[11] ^= (u8)(len0>>32);
 801                         ctx->Yi.c[12] ^= (u8)(len0>>24);
 802                         ctx->Yi.c[13] ^= (u8)(len0>>16);
 803                         ctx->Yi.c[14] ^= (u8)(len0>>8);
 804                         ctx->Yi.c[15] ^= (u8)(len0);
 805 #endif
 806                 }
 807                 else
 808                         ctx->Yi.u[1]  ^= len0;
 809 
 810                 GCM_MUL(ctx,Yi);
 811 
 812                 if (is_endian.little)
 813 #ifdef BSWAP4
 814                         ctr = BSWAP4(ctx->Yi.d[3]);
 815 #else
 816                         ctr = GETU32(ctx->Yi.c+12);
 817 #endif
 818                 else
 819                         ctr = ctx->Yi.d[3];
 820         }
 821 
 822         (*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
 823         ++ctr;
 824         if (is_endian.little)
 825 #ifdef BSWAP4
 826                 ctx->Yi.d[3] = BSWAP4(ctr);
 827 #else
 828                 PUTU32(ctx->Yi.c+12,ctr);
 829 #endif
 830         else
 831                 ctx->Yi.d[3] = ctr;
 832 }
 833 
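/*
 * CRYPTO_gcm128_aad feeds additional authenticated data into GHASH. It
 * must be called after setiv and before any encrypt/decrypt call;
 * returns -2 if message data has already been processed, -1 if the AAD
 * length limit is exceeded, and 0 on success.
 */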
 834 int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
 835 {
 836         size_t i;
 837         unsigned int n;
 838         u64 alen = ctx->len.u[0];
 839 #ifdef GCM_FUNCREF_4BIT
 840         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
 841 # ifdef GHASH
 842         void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
 843                                 const u8 *inp,size_t len)       = ctx->ghash;
 844 # endif
 845 #endif
 846 
 847         if (ctx->len.u[1]) return -2;
 848 
 849         alen += len;
 850         if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
 851                 return -1;
 852         ctx->len.u[0] = alen;
 853 
 854         n = ctx->ares;
 855         if (n) {
 856                 while (n && len) {
 857                         ctx->Xi.c[n] ^= *(aad++);
 858                         --len;
 859                         n = (n+1)%16;
 860                 }
 861                 if (n==0) GCM_MUL(ctx,Xi);
 862                 else {
 863                         ctx->ares = n;
 864                         return 0;
 865                 }
 866         }
 867 
 868 #ifdef GHASH
 869         if ((i = (len&(size_t)-16))) {
 870                 GHASH(ctx,aad,i);
 871                 aad += i;
 872                 len -= i;
 873         }
 874 #else
 875         while (len>=16) {
 876                 for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
 877                 GCM_MUL(ctx,Xi);
 878                 aad += 16;
 879                 len -= 16;
 880         }
 881 #endif
 882         if (len) {
 883                 n = (unsigned int)len;
 884                 for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
 885         }
 886 
 887         ctx->ares = n;
 888         return 0;
 889 }
 890 
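/*
 * CRYPTO_gcm128_encrypt CTR-encrypts |len| bytes from |in| to |out| and
 * accumulates the resulting ciphertext into GHASH. It may be called
 * repeatedly to process a message in pieces; returns -1 once the total
 * plaintext length exceeds the GCM limit of 2^36-32 bytes, 0 otherwise.
 */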
 891 int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
 892                 const unsigned char *in, unsigned char *out,
 893                 size_t len)
 894 {
 895         const union { long one; char little; } is_endian = {1};
 896         unsigned int n, ctr;
 897         size_t i;
 898         u64        mlen  = ctx->len.u[1];
 899         block128_f block = ctx->block;
 900         void      *key   = ctx->key;
 901 #ifdef GCM_FUNCREF_4BIT
 902         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
 903 # ifdef GHASH
 904         void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
 905                                 const u8 *inp,size_t len)       = ctx->ghash;
 906 # endif
 907 #endif
 908 
 909 #if 0
 910         n = (unsigned int)mlen%16; /* alternative to ctx->mres */
 911 #endif
 912         mlen += len;
 913         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
 914                 return -1;
 915         ctx->len.u[1] = mlen;
 916 
 917         if (ctx->ares) {
 918                 /* First call to encrypt finalizes GHASH(AAD) */
 919                 GCM_MUL(ctx,Xi);
 920                 ctx->ares = 0;
 921         }
 922 
 923         if (is_endian.little)
 924 #ifdef BSWAP4
 925                 ctr = BSWAP4(ctx->Yi.d[3]);
 926 #else
 927                 ctr = GETU32(ctx->Yi.c+12);
 928 #endif
 929         else
 930                 ctr = ctx->Yi.d[3];
 931 
 932         n = ctx->mres;
 933 #if !defined(OPENSSL_SMALL_FOOTPRINT)
 934         if (16%sizeof(size_t) == 0) do {        /* always true actually */
 935                 if (n) {
 936                         while (n && len) {
 937                                 ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
 938                                 --len;
 939                                 n = (n+1)%16;
 940                         }
 941                         if (n==0) GCM_MUL(ctx,Xi);
 942                         else {
 943                                 ctx->mres = n;
 944                                 return 0;
 945                         }
 946                 }
 947 #if defined(STRICT_ALIGNMENT)
 948                 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
 949                         break;
 950 #endif
 951 #if defined(GHASH) && defined(GHASH_CHUNK)
 952                 while (len>=GHASH_CHUNK) {
 953                     size_t j=GHASH_CHUNK;
 954 
 955                     while (j) {
 956                         size_t *out_t=(size_t *)out;
 957                         const size_t *in_t=(const size_t *)in;
 958 
 959                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
 960                         ++ctr;
 961                         if (is_endian.little)
 962 #ifdef BSWAP4
 963                                 ctx->Yi.d[3] = BSWAP4(ctr);
 964 #else
 965                                 PUTU32(ctx->Yi.c+12,ctr);
 966 #endif
 967                         else
 968                                 ctx->Yi.d[3] = ctr;
 969                         for (i=0; i<16/sizeof(size_t); ++i)
 970                                 out_t[i] = in_t[i] ^ ctx->EKi.t[i];
 971                         out += 16;
 972                         in  += 16;
 973                         j   -= 16;
 974                     }
 975                     GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
 976                     len -= GHASH_CHUNK;
 977                 }
 978                 if ((i = (len&(size_t)-16))) {
 979                     size_t j=i;
 980 
 981                     while (len>=16) {
 982                         size_t *out_t=(size_t *)out;
 983                         const size_t *in_t=(const size_t *)in;
 984 
 985                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
 986                         ++ctr;
 987                         if (is_endian.little)
 988 #ifdef BSWAP4
 989                                 ctx->Yi.d[3] = BSWAP4(ctr);
 990 #else
 991                                 PUTU32(ctx->Yi.c+12,ctr);
 992 #endif
 993                         else
 994                                 ctx->Yi.d[3] = ctr;
 995                         for (i=0; i<16/sizeof(size_t); ++i)
 996                                 out_t[i] = in_t[i] ^ ctx->EKi.t[i];
 997                         out += 16;
 998                         in  += 16;
 999                         len -= 16;
1000                     }
1001                     GHASH(ctx,out-j,j);
1002                 }
1003 #else
1004                 while (len>=16) {
1005                         size_t *out_t=(size_t *)out;
1006                         const size_t *in_t=(const size_t *)in;
1007 
1008                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1009                         ++ctr;
1010                         if (is_endian.little)
1011 #ifdef BSWAP4
1012                                 ctx->Yi.d[3] = BSWAP4(ctr);
1013 #else
1014                                 PUTU32(ctx->Yi.c+12,ctr);
1015 #endif
1016                         else
1017                                 ctx->Yi.d[3] = ctr;
1018                         for (i=0; i<16/sizeof(size_t); ++i)
1019                                 ctx->Xi.t[i] ^=
1020                                 out_t[i] = in_t[i]^ctx->EKi.t[i];
1021                         GCM_MUL(ctx,Xi);
1022                         out += 16;
1023                         in  += 16;
1024                         len -= 16;
1025                 }
1026 #endif
1027                 if (len) {
1028                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1029                         ++ctr;
1030                         if (is_endian.little)
1031 #ifdef BSWAP4
1032                                 ctx->Yi.d[3] = BSWAP4(ctr);
1033 #else
1034                                 PUTU32(ctx->Yi.c+12,ctr);
1035 #endif
1036                         else
1037                                 ctx->Yi.d[3] = ctr;
1038                         while (len--) {
1039                                 ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1040                                 ++n;
1041                         }
1042                 }
1043 
1044                 ctx->mres = n;
1045                 return 0;
1046         } while(0);
1047 #endif
1048         for (i=0;i<len;++i) {
1049                 if (n==0) {
1050                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1051                         ++ctr;
1052                         if (is_endian.little)
1053 #ifdef BSWAP4
1054                                 ctx->Yi.d[3] = BSWAP4(ctr);
1055 #else
1056                                 PUTU32(ctx->Yi.c+12,ctr);
1057 #endif
1058                         else
1059                                 ctx->Yi.d[3] = ctr;
1060                 }
1061                 ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
1062                 n = (n+1)%16;
1063                 if (n==0)
1064                         GCM_MUL(ctx,Xi);
1065         }
1066 
1067         ctx->mres = n;
1068         return 0;
1069 }
1070 
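/*
 * CRYPTO_gcm128_decrypt is the mirror image of CRYPTO_gcm128_encrypt:
 * the ciphertext |in| is accumulated into GHASH and CTR-decrypted into
 * |out|. Same length limit and return convention as the encrypt path.
 */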
1071 int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1072                 const unsigned char *in, unsigned char *out,
1073                 size_t len)
1074 {
1075         const union { long one; char little; } is_endian = {1};
1076         unsigned int n, ctr;
1077         size_t i;
1078         u64        mlen  = ctx->len.u[1];
1079         block128_f block = ctx->block;
1080         void      *key   = ctx->key;
1081 #ifdef GCM_FUNCREF_4BIT
1082         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
1083 # ifdef GHASH
1084         void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1085                                 const u8 *inp,size_t len)       = ctx->ghash;
1086 # endif
1087 #endif
1088 
1089         mlen += len;
1090         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1091                 return -1;
1092         ctx->len.u[1] = mlen;
1093 
1094         if (ctx->ares) {
1095                 /* First call to decrypt finalizes GHASH(AAD) */
1096                 GCM_MUL(ctx,Xi);
1097                 ctx->ares = 0;
1098         }
1099 
1100         if (is_endian.little)
1101 #ifdef BSWAP4
1102                 ctr = BSWAP4(ctx->Yi.d[3]);
1103 #else
1104                 ctr = GETU32(ctx->Yi.c+12);
1105 #endif
1106         else
1107                 ctr = ctx->Yi.d[3];
1108 
1109         n = ctx->mres;
1110 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1111         if (16%sizeof(size_t) == 0) do {        /* always true actually */
1112                 if (n) {
1113                         while (n && len) {
1114                                 u8 c = *(in++);
1115                                 *(out++) = c^ctx->EKi.c[n];
1116                                 ctx->Xi.c[n] ^= c;
1117                                 --len;
1118                                 n = (n+1)%16;
1119                         }
1120                         if (n==0) GCM_MUL (ctx,Xi);
1121                         else {
1122                                 ctx->mres = n;
1123                                 return 0;
1124                         }
1125                 }
1126 #if defined(STRICT_ALIGNMENT)
1127                 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
1128                         break;
1129 #endif
1130 #if defined(GHASH) && defined(GHASH_CHUNK)
1131                 while (len>=GHASH_CHUNK) {
1132                     size_t j=GHASH_CHUNK;
1133 
1134                     GHASH(ctx,in,GHASH_CHUNK);
1135                     while (j) {
1136                         size_t *out_t=(size_t *)out;
1137                         const size_t *in_t=(const size_t *)in;
1138 
1139                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1140                         ++ctr;
1141                         if (is_endian.little)
1142 #ifdef BSWAP4
1143                                 ctx->Yi.d[3] = BSWAP4(ctr);
1144 #else
1145                                 PUTU32(ctx->Yi.c+12,ctr);
1146 #endif
1147                         else
1148                                 ctx->Yi.d[3] = ctr;
1149                         for (i=0; i<16/sizeof(size_t); ++i)
1150                                 out_t[i] = in_t[i]^ctx->EKi.t[i];
1151                         out += 16;
1152                         in  += 16;
1153                         j   -= 16;
1154                     }
1155                     len -= GHASH_CHUNK;
1156                 }
1157                 if ((i = (len&(size_t)-16))) {
1158                     GHASH(ctx,in,i);
1159                     while (len>=16) {
1160                         size_t *out_t=(size_t *)out;
1161                         const size_t *in_t=(const size_t *)in;
1162 
1163                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1164                         ++ctr;
1165                         if (is_endian.little)
1166 #ifdef BSWAP4
1167                                 ctx->Yi.d[3] = BSWAP4(ctr);
1168 #else
1169                                 PUTU32(ctx->Yi.c+12,ctr);
1170 #endif
1171                         else
1172                                 ctx->Yi.d[3] = ctr;
1173                         for (i=0; i<16/sizeof(size_t); ++i)
1174                                 out_t[i] = in_t[i]^ctx->EKi.t[i];
1175                         out += 16;
1176                         in  += 16;
1177                         len -= 16;
1178                     }
1179                 }
1180 #else
1181                 while (len>=16) {
1182                         size_t *out_t=(size_t *)out;
1183                         const size_t *in_t=(const size_t *)in;
1184 
1185                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1186                         ++ctr;
1187                         if (is_endian.little)
1188 #ifdef BSWAP4
1189                                 ctx->Yi.d[3] = BSWAP4(ctr);
1190 #else
1191                                 PUTU32(ctx->Yi.c+12,ctr);
1192 #endif
1193                         else
1194                                 ctx->Yi.d[3] = ctr;
1195                         for (i=0; i<16/sizeof(size_t); ++i) {
1196                                 size_t c = in[i];
1197                                 out[i] = c^ctx->EKi.t[i];
1198                                 ctx->Xi.t[i] ^= c;
1199                         }
1200                         GCM_MUL(ctx,Xi);
1201                         out += 16;
1202                         in  += 16;
1203                         len -= 16;
1204                 }
1205 #endif
1206                 if (len) {
1207                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1208                         ++ctr;
1209                         if (is_endian.little)
1210 #ifdef BSWAP4
1211                                 ctx->Yi.d[3] = BSWAP4(ctr);
1212 #else
1213                                 PUTU32(ctx->Yi.c+12,ctr);
1214 #endif
1215                         else
1216                                 ctx->Yi.d[3] = ctr;
1217                         while (len--) {
1218                                 u8 c = in[n];
1219                                 ctx->Xi.c[n] ^= c;
1220                                 out[n] = c^ctx->EKi.c[n];
1221                                 ++n;
1222                         }
1223                 }
1224 
1225                 ctx->mres = n;
1226                 return 0;
1227         } while(0);
1228 #endif
1229         for (i=0;i<len;++i) {
1230                 u8 c;
1231                 if (n==0) {
1232                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1233                         ++ctr;
1234                         if (is_endian.little)
1235 #ifdef BSWAP4
1236                                 ctx->Yi.d[3] = BSWAP4(ctr);
1237 #else
1238                                 PUTU32(ctx->Yi.c+12,ctr);
1239 #endif
1240                         else
1241                                 ctx->Yi.d[3] = ctr;
1242                 }
1243                 c = in[i];
1244                 out[i] = c^ctx->EKi.c[n];
1245                 ctx->Xi.c[n] ^= c;
1246                 n = (n+1)%16;
1247                 if (n==0)
1248                         GCM_MUL(ctx,Xi);
1249         }
1250 
1251         ctx->mres = n;
1252         return 0;
1253 }
1254 
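/*
 * CRYPTO_gcm128_encrypt_ctr32 behaves like CRYPTO_gcm128_encrypt but
 * delegates whole 16-byte blocks to the caller-supplied ctr128_f
 * |stream| (typically a bulk counter-mode routine), falling back to
 * ctx->block only for a trailing partial block.
 */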
1255 int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
1256                 const unsigned char *in, unsigned char *out,
1257                 size_t len, ctr128_f stream)
1258 {
1259         const union { long one; char little; } is_endian = {1};
1260         unsigned int n, ctr;
1261         size_t i;
1262         u64   mlen = ctx->len.u[1];
1263         void *key  = ctx->key;
1264 #ifdef GCM_FUNCREF_4BIT
1265         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
1266 # ifdef GHASH
1267         void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1268                                 const u8 *inp,size_t len)       = ctx->ghash;
1269 # endif
1270 #endif
1271 
1272         mlen += len;
1273         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1274                 return -1;
1275         ctx->len.u[1] = mlen;
1276 
1277         if (ctx->ares) {
1278                 /* First call to encrypt finalizes GHASH(AAD) */
1279                 GCM_MUL(ctx,Xi);
1280                 ctx->ares = 0;
1281         }
1282 
1283         if (is_endian.little)
1284 #ifdef BSWAP4
1285                 ctr = BSWAP4(ctx->Yi.d[3]);
1286 #else
1287                 ctr = GETU32(ctx->Yi.c+12);
1288 #endif
1289         else
1290                 ctr = ctx->Yi.d[3];
1291 
1292         n = ctx->mres;
1293         if (n) {
1294                 while (n && len) {
1295                         ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
1296                         --len;
1297                         n = (n+1)%16;
1298                 }
1299                 if (n==0) GCM_MUL(ctx,Xi);
1300                 else {
1301                         ctx->mres = n;
1302                         return 0;
1303                 }
1304         }
1305 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1306         while (len>=GHASH_CHUNK) {
1307                 (*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
1308                 ctr += GHASH_CHUNK/16;
1309                 if (is_endian.little)
1310 #ifdef BSWAP4
1311                         ctx->Yi.d[3] = BSWAP4(ctr);
1312 #else
1313                         PUTU32(ctx->Yi.c+12,ctr);
1314 #endif
1315                 else
1316                         ctx->Yi.d[3] = ctr;
1317                 GHASH(ctx,out,GHASH_CHUNK);
1318                 out += GHASH_CHUNK;
1319                 in  += GHASH_CHUNK;
1320                 len -= GHASH_CHUNK;
1321         }
1322 #endif
1323         if ((i = (len&(size_t)-16))) {
1324                 size_t j=i/16;
1325 
1326                 (*stream)(in,out,j,key,ctx->Yi.c);
1327                 ctr += (unsigned int)j;
1328                 if (is_endian.little)
1329 #ifdef BSWAP4
1330                         ctx->Yi.d[3] = BSWAP4(ctr);
1331 #else
1332                         PUTU32(ctx->Yi.c+12,ctr);
1333 #endif
1334                 else
1335                         ctx->Yi.d[3] = ctr;
1336                 in  += i;
1337                 len -= i;
1338 #if defined(GHASH)
1339                 GHASH(ctx,out,i);
1340                 out += i;
1341 #else
1342                 while (j--) {
1343                         for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
1344                         GCM_MUL(ctx,Xi);
1345                         out += 16;
1346                 }
1347 #endif
1348         }
1349         if (len) {
1350                 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
1351                 ++ctr;
1352                 if (is_endian.little)
1353 #ifdef BSWAP4
1354                         ctx->Yi.d[3] = BSWAP4(ctr);
1355 #else
1356                         PUTU32(ctx->Yi.c+12,ctr);
1357 #endif
1358                 else
1359                         ctx->Yi.d[3] = ctr;
1360                 while (len--) {
1361                         ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1362                         ++n;
1363                 }
1364         }
1365 
1366         ctx->mres = n;
1367         return 0;
1368 }
1369 
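/*
 * CRYPTO_gcm128_decrypt_ctr32 is the decrypt counterpart of
 * CRYPTO_gcm128_encrypt_ctr32: the ciphertext is hashed first, then
 * whole blocks are decrypted via |stream| and any trailing partial
 * block via ctx->block.
 */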
1370 int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1371                 const unsigned char *in, unsigned char *out,
1372                 size_t len,ctr128_f stream)
1373 {
1374         const union { long one; char little; } is_endian = {1};
1375         unsigned int n, ctr;
1376         size_t i;
1377         u64   mlen = ctx->len.u[1];
1378         void *key  = ctx->key;
1379 #ifdef GCM_FUNCREF_4BIT
1380         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
1381 # ifdef GHASH
1382         void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1383                                 const u8 *inp,size_t len)       = ctx->ghash;
1384 # endif
1385 #endif
1386 
1387         mlen += len;
1388         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1389                 return -1;
1390         ctx->len.u[1] = mlen;
1391 
1392         if (ctx->ares) {
1393                 /* First call to decrypt finalizes GHASH(AAD) */
1394                 GCM_MUL(ctx,Xi);
1395                 ctx->ares = 0;
1396         }
1397 
1398         if (is_endian.little)
1399 #ifdef BSWAP4
1400                 ctr = BSWAP4(ctx->Yi.d[3]);
1401 #else
1402                 ctr = GETU32(ctx->Yi.c+12);
1403 #endif
1404         else
1405                 ctr = ctx->Yi.d[3];
1406 
1407         n = ctx->mres;
1408         if (n) {
1409                 while (n && len) {
1410                         u8 c = *(in++);
1411                         *(out++) = c^ctx->EKi.c[n];
1412                         ctx->Xi.c[n] ^= c;
1413                         --len;
1414                         n = (n+1)%16;
1415                 }
1416                 if (n==0) GCM_MUL (ctx,Xi);
1417                 else {
1418                         ctx->mres = n;
1419                         return 0;
1420                 }
1421         }
1422 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1423         while (len>=GHASH_CHUNK) {
1424                 GHASH(ctx,in,GHASH_CHUNK);
1425                 (*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
1426                 ctr += GHASH_CHUNK/16;
1427                 if (is_endian.little)
1428 #ifdef BSWAP4
1429                         ctx->Yi.d[3] = BSWAP4(ctr);
1430 #else
1431                         PUTU32(ctx->Yi.c+12,ctr);
1432 #endif
1433                 else
1434                         ctx->Yi.d[3] = ctr;
1435                 out += GHASH_CHUNK;
1436                 in  += GHASH_CHUNK;
1437                 len -= GHASH_CHUNK;
1438         }
1439 #endif
1440         if ((i = (len&(size_t)-16))) {
1441                 size_t j=i/16;
1442 
1443 #if defined(GHASH)
1444                 GHASH(ctx,in,i);
1445 #else
1446                 while (j--) {
1447                         size_t k;
1448                         for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
1449                         GCM_MUL(ctx,Xi);
1450                         in += 16;
1451                 }
1452                 j   = i/16;
1453                 in -= i;
1454 #endif
1455                 (*stream)(in,out,j,key,ctx->Yi.c);
1456                 ctr += (unsigned int)j;
1457                 if (is_endian.little)
1458 #ifdef BSWAP4
1459                         ctx->Yi.d[3] = BSWAP4(ctr);
1460 #else
1461                         PUTU32(ctx->Yi.c+12,ctr);
1462 #endif
1463                 else
1464                         ctx->Yi.d[3] = ctr;
1465                 out += i;
1466                 in  += i;
1467                 len -= i;
1468         }
1469         if (len) {
1470                 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
1471                 ++ctr;
1472                 if (is_endian.little)
1473 #ifdef BSWAP4
1474                         ctx->Yi.d[3] = BSWAP4(ctr);
1475 #else
1476                         PUTU32(ctx->Yi.c+12,ctr);
1477 #endif
1478                 else
1479                         ctx->Yi.d[3] = ctr;
1480                 while (len--) {
1481                         u8 c = in[n];
1482                         ctx->Xi.c[n] ^= c;
1483                         out[n] = c^ctx->EKi.c[n];
1484                         ++n;
1485                 }
1486         }
1487 
1488         ctx->mres = n;
1489         return 0;
1490 }
1491 
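/*
 * CRYPTO_gcm128_finish completes authentication: the bit lengths of the AAD
 * and of the ciphertext are folded into GHASH, and the result is XORed with
 * EK0 (the encrypted initial counter block) to form the tag in ctx->Xi.  If
 * |tag| is non-NULL and |len| does not exceed 16, the computed tag is
 * compared against it and 0 is returned on a match.  Note that the
 * comparison below uses memcmp(), which is not guaranteed to run in
 * constant time; callers concerned about timing side channels may prefer a
 * constant-time comparison.
 */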
1492 int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
1493                         size_t len)
1494 {
1495         const union { long one; char little; } is_endian = {1};
1496         u64 alen = ctx->len.u[0]<<3;
1497         u64 clen = ctx->len.u[1]<<3;
1498 #ifdef GCM_FUNCREF_4BIT
1499         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
1500 #endif
1501 
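        /* Fold any pending partial block (mres) or unfinalized AAD (ares)
         * into GHASH before appending the lengths. */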
1502         if (ctx->mres || ctx->ares)
1503                 GCM_MUL(ctx,Xi);
1504 
1505         if (is_endian.little) {
1506 #ifdef BSWAP8
1507                 alen = BSWAP8(alen);
1508                 clen = BSWAP8(clen);
1509 #else
1510                 u8 *p = ctx->len.c;
1511 
1512                 ctx->len.u[0] = alen;
1513                 ctx->len.u[1] = clen;
1514 
1515                 alen = (u64)GETU32(p)  <<32|GETU32(p+4);
1516                 clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
1517 #endif
1518         }
1519 
1520         ctx->Xi.u[0] ^= alen;
1521         ctx->Xi.u[1] ^= clen;
1522         GCM_MUL(ctx,Xi);
1523 
1524         ctx->Xi.u[0] ^= ctx->EK0.u[0];
1525         ctx->Xi.u[1] ^= ctx->EK0.u[1];
1526 
1527         if (tag && len<=sizeof(ctx->Xi))
1528                 return memcmp(ctx->Xi.c,tag,len);
1529         else
1530                 return -1;
1531 }
1532 
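/*
 * CRYPTO_gcm128_tag copies out up to 16 bytes of the computed tag without
 * performing any comparison; it is typically used on the encryption side,
 * where the caller transmits the tag rather than verifying one.
 */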
1533 void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1534 {
1535         CRYPTO_gcm128_finish(ctx, NULL, 0);
1536         memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
1537 }
1538 
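/*
 * CRYPTO_gcm128_new allocates and initializes a context on the heap;
 * CRYPTO_gcm128_release scrubs and frees it.  A minimal caller-side sketch
 * follows (identifiers such as |aes|, |iv|, |aad|, |pt|, |ct| and |tag| are
 * illustrative placeholders, not part of this API):
 *
 *      GCM128_CONTEXT *gcm = CRYPTO_gcm128_new(&aes,(block128_f)AES_encrypt);
 *      CRYPTO_gcm128_setiv(gcm, iv, sizeof(iv));
 *      CRYPTO_gcm128_aad(gcm, aad, sizeof(aad));
 *      CRYPTO_gcm128_encrypt(gcm, pt, ct, sizeof(pt));
 *      CRYPTO_gcm128_tag(gcm, tag, 16);
 *      CRYPTO_gcm128_release(gcm);
 */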
1539 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1540 {
1541         GCM128_CONTEXT *ret;
1542 
1543         if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
1544                 CRYPTO_gcm128_init(ret,key,block);
1545 
1546         return ret;
1547 }
1548 
1549 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1550 {
1551         if (ctx) {
1552                 OPENSSL_cleanse(ctx,sizeof(*ctx));
1553                 OPENSSL_free(ctx);
1554         }
1555 }
1556 
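/*
 * Compiling this file with -DSELFTEST produces a standalone program that
 * runs the GCM test vectors below (the first eighteen follow the numbering
 * of the test cases in the GCM specification; cases 19 and 20 additionally
 * exercise a long AAD and a 64-byte IV with a 288-byte message) and, when
 * OPENSSL_CPUID_OBJ is defined, a small cycle-count benchmark.
 */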
1557 #if defined(SELFTEST)
1558 #include <stdio.h>
1559 #include <openssl/aes.h>
1560 
1561 /* Test Case 1 */
1562 static const u8 K1[16],
1563                 *P1=NULL,
1564                 *A1=NULL,
1565                 IV1[12],
1566                 *C1=NULL,
1567                 T1[]=  {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};
1568 
1569 /* Test Case 2 */
1570 #define K2 K1
1571 #define A2 A1
1572 #define IV2 IV1
1573 static const u8 P2[16],
1574                 C2[]=  {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
1575                 T2[]=  {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};
1576 
1577 /* Test Case 3 */
1578 #define A3 A2
1579 static const u8 K3[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1580                 P3[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1581                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1582                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1583                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1584                 IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1585                 C3[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1586                         0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1587                         0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1588                         0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
1589                 T3[]=  {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};
1590 
1591 /* Test Case 4 */
1592 #define K4 K3
1593 #define IV4 IV3
1594 static const u8 P4[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1595                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1596                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1597                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1598                 A4[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1599                         0xab,0xad,0xda,0xd2},
1600                 C4[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1601                         0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1602                         0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1603                         0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
1604                 T4[]=  {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};
1605 
1606 /* Test Case 5 */
1607 #define K5 K4
1608 #define P5 P4
1609 #define A5 A4
1610 static const u8 IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1611                 C5[]=  {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
1612                         0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
1613                         0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
1614                         0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
1615                 T5[]=  {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};
1616 
1617 /* Test Case 6 */
1618 #define K6 K5
1619 #define P6 P5
1620 #define A6 A5
1621 static const u8 IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1622                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1623                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1624                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1625                 C6[]=  {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
1626                         0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
1627                         0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
1628                         0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
1629                 T6[]=  {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
1630 
1631 /* Test Case 7 */
1632 static const u8 K7[24],
1633                 *P7=NULL,
1634                 *A7=NULL,
1635                 IV7[12],
1636                 *C7=NULL,
1637                 T7[]=  {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};
1638 
1639 /* Test Case 8 */
1640 #define K8 K7
1641 #define IV8 IV7
1642 #define A8 A7
1643 static const u8 P8[16],
1644                 C8[]=  {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
1645                 T8[]=  {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};
1646 
1647 /* Test Case 9 */
1648 #define A9 A8
1649 static const u8 K9[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1650                         0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
1651                 P9[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1652                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1653                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1654                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1655                 IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1656                 C9[]=  {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1657                         0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1658                         0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1659                         0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
1660                 T9[]=  {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};
1661 
1662 /* Test Case 10 */
1663 #define K10 K9
1664 #define IV10 IV9
1665 static const u8 P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1666                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1667                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1668                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1669                 A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1670                         0xab,0xad,0xda,0xd2},
1671                 C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1672                         0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1673                         0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1674                         0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
1675                 T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};
1676 
1677 /* Test Case 11 */
1678 #define K11 K10
1679 #define P11 P10
1680 #define A11 A10
1681 static const u8 IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1682                 C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
1683                         0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
1684                         0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
1685                         0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
1686                 T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};
1687 
1688 /* Test Case 12 */
1689 #define K12 K11
1690 #define P12 P11
1691 #define A12 A11
1692 static const u8 IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1693                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1694                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1695                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1696                 C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
1697                         0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
1698                         0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
1699                         0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
1700                 T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
1701 
1702 /* Test Case 13 */
1703 static const u8 K13[32],
1704                 *P13=NULL,
1705                 *A13=NULL,
1706                 IV13[12],
1707                 *C13=NULL,
1708                 T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};
1709 
1710 /* Test Case 14 */
1711 #define K14 K13
1712 #define A14 A13
1713 static const u8 P14[16],
1714                 IV14[12],
1715                 C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
1716                 T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};
1717 
1718 /* Test Case 15 */
1719 #define A15 A14
1720 static const u8 K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1721                         0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1722                 P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1723                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1724                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1725                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1726                 IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1727                 C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1728                         0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1729                         0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1730                         0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
1731                 T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};
1732 
1733 /* Test Case 16 */
1734 #define K16 K15
1735 #define IV16 IV15
1736 static const u8 P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1737                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1738                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1739                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1740                 A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1741                         0xab,0xad,0xda,0xd2},
1742                 C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1743                         0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1744                         0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1745                         0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
1746                 T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};
1747 
1748 /* Test Case 17 */
1749 #define K17 K16
1750 #define P17 P16
1751 #define A17 A16
1752 static const u8 IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1753                 C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
1754                         0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
1755                         0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
1756                         0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
1757                 T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};
1758 
1759 /* Test Case 18 */
1760 #define K18 K17
1761 #define P18 P17
1762 #define A18 A17
1763 static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1764                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1765                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1766                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1767                 C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
1768                         0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
1769                         0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
1770                         0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
1771                 T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
1772 
1773 /* Test Case 19 */
1774 #define K19 K1
1775 #define P19 P1
1776 #define IV19 IV1
1777 #define C19 C1
1778 static const u8 A19[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1779                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1780                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1781                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55,
1782                         0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1783                         0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1784                         0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1785                         0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
1786                 T19[]= {0x5f,0xea,0x79,0x3a,0x2d,0x6f,0x97,0x4d,0x37,0xe6,0x8e,0x0c,0xb8,0xff,0x94,0x92};
1787 
1788 /* Test Case 20 */
1789 #define K20 K1
1790 #define A20 A1
1791 static const u8 IV20[64]={0xff,0xff,0xff,0xff}, /* this results in 0xff in counter LSB */
1792                 P20[288],
1793                 C20[]= {0x56,0xb3,0x37,0x3c,0xa9,0xef,0x6e,0x4a,0x2b,0x64,0xfe,0x1e,0x9a,0x17,0xb6,0x14,
1794                         0x25,0xf1,0x0d,0x47,0xa7,0x5a,0x5f,0xce,0x13,0xef,0xc6,0xbc,0x78,0x4a,0xf2,0x4f,
1795                         0x41,0x41,0xbd,0xd4,0x8c,0xf7,0xc7,0x70,0x88,0x7a,0xfd,0x57,0x3c,0xca,0x54,0x18,
1796                         0xa9,0xae,0xff,0xcd,0x7c,0x5c,0xed,0xdf,0xc6,0xa7,0x83,0x97,0xb9,0xa8,0x5b,0x49,
1797                         0x9d,0xa5,0x58,0x25,0x72,0x67,0xca,0xab,0x2a,0xd0,0xb2,0x3c,0xa4,0x76,0xa5,0x3c,
1798                         0xb1,0x7f,0xb4,0x1c,0x4b,0x8b,0x47,0x5c,0xb4,0xf3,0xf7,0x16,0x50,0x94,0xc2,0x29,
1799                         0xc9,0xe8,0xc4,0xdc,0x0a,0x2a,0x5f,0xf1,0x90,0x3e,0x50,0x15,0x11,0x22,0x13,0x76,
1800                         0xa1,0xcd,0xb8,0x36,0x4c,0x50,0x61,0xa2,0x0c,0xae,0x74,0xbc,0x4a,0xcd,0x76,0xce,
1801                         0xb0,0xab,0xc9,0xfd,0x32,0x17,0xef,0x9f,0x8c,0x90,0xbe,0x40,0x2d,0xdf,0x6d,0x86,
1802                         0x97,0xf4,0xf8,0x80,0xdf,0xf1,0x5b,0xfb,0x7a,0x6b,0x28,0x24,0x1e,0xc8,0xfe,0x18,
1803                         0x3c,0x2d,0x59,0xe3,0xf9,0xdf,0xff,0x65,0x3c,0x71,0x26,0xf0,0xac,0xb9,0xe6,0x42,
1804                         0x11,0xf4,0x2b,0xae,0x12,0xaf,0x46,0x2b,0x10,0x70,0xbe,0xf1,0xab,0x5e,0x36,0x06,
1805                         0x87,0x2c,0xa1,0x0d,0xee,0x15,0xb3,0x24,0x9b,0x1a,0x1b,0x95,0x8f,0x23,0x13,0x4c,
1806                         0x4b,0xcc,0xb7,0xd0,0x32,0x00,0xbc,0xe4,0x20,0xa2,0xf8,0xeb,0x66,0xdc,0xf3,0x64,
1807                         0x4d,0x14,0x23,0xc1,0xb5,0x69,0x90,0x03,0xc1,0x3e,0xce,0xf4,0xbf,0x38,0xa3,0xb6,
1808                         0x0e,0xed,0xc3,0x40,0x33,0xba,0xc1,0x90,0x27,0x83,0xdc,0x6d,0x89,0xe2,0xe7,0x74,
1809                         0x18,0x8a,0x43,0x9c,0x7e,0xbc,0xc0,0x67,0x2d,0xbd,0xa4,0xdd,0xcf,0xb2,0x79,0x46,
1810                         0x13,0xb0,0xbe,0x41,0x31,0x5e,0xf7,0x78,0x70,0x8a,0x70,0xee,0x7d,0x75,0x16,0x5c},
1811                 T20[]= {0x8b,0x30,0x7f,0x6b,0x33,0x28,0x6d,0x0a,0xb0,0x26,0xa9,0xed,0x3f,0xe1,0xe8,0x5f};
1812 
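/*
 * TEST_CASE(n) exercises test vector n in both directions: encrypt P##n and
 * check the resulting ciphertext and tag, then decrypt C##n and check the
 * recovered plaintext and tag.  Empty plaintext/AAD/ciphertext are
 * represented by NULL pointers and simply skipped.
 */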
1813 #define TEST_CASE(n)    do {                                    \
1814         u8 out[sizeof(P##n)];                                   \
1815         AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key);              \
1816         CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);  \
1817         CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));              \
1818         memset(out,0,sizeof(out));                              \
1819         if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));        \
1820         if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out)); \
1821         if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||           \
1822             (C##n && memcmp(out,C##n,sizeof(out))))             \
1823                 ret++, printf ("encrypt test#%d failed.\n",n);  \
1824         CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));              \
1825         memset(out,0,sizeof(out));                              \
1826         if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));        \
1827         if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out)); \
1828         if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||           \
1829             (P##n && memcmp(out,P##n,sizeof(out))))             \
1830                 ret++, printf ("decrypt test#%d failed.\n",n);  \
1831         } while(0)
1832 
1833 int main()
1834 {
1835         GCM128_CONTEXT ctx;
1836         AES_KEY key;
1837         int ret=0;
1838 
1839         TEST_CASE(1);
1840         TEST_CASE(2);
1841         TEST_CASE(3);
1842         TEST_CASE(4);
1843         TEST_CASE(5);
1844         TEST_CASE(6);
1845         TEST_CASE(7);
1846         TEST_CASE(8);
1847         TEST_CASE(9);
1848         TEST_CASE(10);
1849         TEST_CASE(11);
1850         TEST_CASE(12);
1851         TEST_CASE(13);
1852         TEST_CASE(14);
1853         TEST_CASE(15);
1854         TEST_CASE(16);
1855         TEST_CASE(17);
1856         TEST_CASE(18);
1857         TEST_CASE(19);
1858         TEST_CASE(20);
1859 
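/*
 * When the CPUID object is available, time encryption of a 1KB buffer with
 * OPENSSL_rdtsc() and print cycles per byte for GCM, for plain CTR, and for
 * their difference (the GHASH overhead); when a dedicated GHASH routine is
 * compiled in, GHASH alone is also timed over 100 iterations.
 */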
1860 #ifdef OPENSSL_CPUID_OBJ
1861         {
1862         size_t start,gcm_t,ctr_t,OPENSSL_rdtsc();
1863         union { u64 u; u8 c[1024]; } buf;
1864         int i;
1865 
1866         AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
1867         CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
1868         CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
1869 
1870         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1871         start = OPENSSL_rdtsc();
1872         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1873         gcm_t = OPENSSL_rdtsc() - start;
1874 
1875         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1876                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1877                         (block128_f)AES_encrypt);
1878         start = OPENSSL_rdtsc();
1879         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1880                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1881                         (block128_f)AES_encrypt);
1882         ctr_t = OPENSSL_rdtsc() - start;
1883 
1884         printf("%.2f-%.2f=%.2f\n",
1885                         gcm_t/(double)sizeof(buf),
1886                         ctr_t/(double)sizeof(buf),
1887                         (gcm_t-ctr_t)/(double)sizeof(buf));
1888 #ifdef GHASH
1889         {
1890         void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1891                                 const u8 *inp,size_t len)       = ctx.ghash;
1892 
1893         GHASH((&ctx),buf.c,sizeof(buf));
1894         start = OPENSSL_rdtsc();
1895         for (i=0;i<100;++i) GHASH((&ctx),buf.c,sizeof(buf));
1896         gcm_t = OPENSSL_rdtsc() - start;
1897         printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
1898         }
1899 #endif
1900         }
1901 #endif
1902 
1903         return ret;
1904 }
1905 #endif