1 /*
   2  * Implementation of the Skein block functions.
   3  * Source code author: Doug Whiting, 2008.
   4  * This algorithm and source code is released to the public domain.
   5  * Compile-time switches:
   6  *  SKEIN_USE_ASM  -- set bits (256/512/1024) to select which
   7  *                    versions use ASM code for block processing
   8  *                    [default: use C for all block sizes]
   9  */
  10 /* Copyright 2013 Doug Whiting. This code is released to the public domain. */
  11 
  12 #include <sys/skein.h>
  13 #include "skein_impl.h"
  14 
  15 #ifndef SKEIN_USE_ASM
  16 #define SKEIN_USE_ASM   (0)     /* default is all C code (no ASM) */
  17 #endif
  18 
     /*
      * SKEIN_LOOP is decoded below as three decimal digits, one per block
      * size: (SKEIN_LOOP / 100) % 10 for 256, (SKEIN_LOOP / 10) % 10 for 512
      * and SKEIN_LOOP % 10 for 1024.  A digit of 0 selects the fully unrolled
      * code; a non-zero digit is the number of 8-round groups emitted per
      * iteration of the round loop.
      *
      * NOTE: a value written with a leading zero is an octal constant in C.
      * The default 001 is still 1, but e.g. 011 would be 9, not eleven.
      */
  19 #ifndef SKEIN_LOOP
  20 #define SKEIN_LOOP 001          /* default: unroll 256 and 512, but not 1024 */
  21 #endif
  22 
  23 /* some useful definitions for code here */
     /*
      * Each block function declares a local array kw[] and an enum constant
      * WCNT.  ts (tweak words) aliases kw + KW_TWK_BASE, i.e. kw[0..2], and
      * ks (key schedule words) aliases kw + KW_KEY_BASE, i.e. kw[3] onward.
      * BLK_BITS is the block size in bits derived from the local WCNT.
      */
  24 #define BLK_BITS        (WCNT*64)
  25 #define KW_TWK_BASE     (0)
  26 #define KW_KEY_BASE     (3)
  27 #define ks              (kw + KW_KEY_BASE)
  28 #define ts              (kw + KW_TWK_BASE)
  29 
  30 /* no debugging in Illumos version */
     /* DebugSaveTweak() therefore expands to nothing */
  31 #define DebugSaveTweak(ctx)
  32 
  33 /* Skein_256 */
  34 #if     !(SKEIN_USE_ASM & 256)
  /*
   * Skein_256_Process_Block: C version of the Skein-256 block function
   * (built only when the 256 bit of SKEIN_USE_ASM is clear).
   *
   *   ctx        - hashing context; ctx->X (chaining words) and ctx->h.T
   *                (tweak words) are read on entry and updated on return
   *   blkPtr     - input data, blkCnt blocks of SKEIN_256_BLOCK_BYTES each
   *   blkCnt     - number of blocks to process; must not be 0 (the
   *                do/while loop below always runs at least once)
   *   byteCntAdd - amount added to tweak word T[0] before each block
   *
   * Per block: build the key schedule from ctx->X and the tweak, add the
   * first subkey to the input words, run RCNT groups of eight rounds with a
   * key injection after every fourth round, then store (state XOR input)
   * back into ctx->X and clear SKEIN_T1_FLAG_FIRST in the local T[1].
   */
  35 void
  36 Skein_256_Process_Block(Skein_256_Ctxt_t *ctx, const uint8_t *blkPtr,
  37     size_t blkCnt, size_t byteCntAdd)
  38 {                               /* do it in C */
  39         enum {
  40                 WCNT = SKEIN_256_STATE_WORDS
  41         };
             /* RCNT: number of 8-round groups (see R256_8_rounds below) */
  42 #undef  RCNT
  43 #define RCNT  (SKEIN_256_ROUNDS_TOTAL / 8)
  44 
             /*
              * SKEIN_UNROLL_256 is the hundreds digit of SKEIN_LOOP:
              * 0 emits every round inline; a non-zero value is the number of
              * 8-round groups emitted per iteration of the round loop.
              */
  45 #ifdef  SKEIN_LOOP              /* configure how much to unroll the loop */
  46 #define SKEIN_UNROLL_256 (((SKEIN_LOOP) / 100) % 10)
  47 #else
  48 #define SKEIN_UNROLL_256 (0)
  49 #endif
  50 
  51 #if     SKEIN_UNROLL_256
  52 #if     (RCNT % SKEIN_UNROLL_256)
  53 #error "Invalid SKEIN_UNROLL_256"       /* sanity check on unroll count */
  54 #endif
  55         size_t r;
  56         /* key schedule words : chaining vars + tweak + "rotation" */
             /*
              * the extra RCNT * 2 words hold the copies that the looped
              * I256() appends to ks[] and ts[] on each key injection
              */
  57         uint64_t kw[WCNT + 4 + RCNT * 2];
  58 #else
  59         uint64_t kw[WCNT + 4];  /* key schedule words : chaining vars + tweak */
  60 #endif
  61         /* local copy of context vars, for speed */
  62         uint64_t X0, X1, X2, X3;
  63         uint64_t w[WCNT];               /* local copy of input block */
  64 #ifdef  SKEIN_DEBUG
  65         /* use for debugging (help compiler put Xn in registers) */
  66         const uint64_t *Xptr[4];
  67         Xptr[0] = &X0;
  68         Xptr[1] = &X1;
  69         Xptr[2] = &X2;
  70         Xptr[3] = &X3;
  71 #endif
  72         Skein_assert(blkCnt != 0);      /* never call with blkCnt == 0! */
             /* work on a local copy of the tweak; written back after the loop */
  73         ts[0] = ctx->h.T[0];
  74         ts[1] = ctx->h.T[1];
  75         do {
  76                 /*
  77                  * this implementation only supports 2**64 input bytes
  78                  * (no carry out here)
  79                  */
  80                 ts[0] += byteCntAdd;    /* update processed length */
  81 
  82                 /* precompute the key schedule for this block */
  83                 ks[0] = ctx->X[0];
  84                 ks[1] = ctx->X[1];
  85                 ks[2] = ctx->X[2];
  86                 ks[3] = ctx->X[3];
                     /* extra key word: XOR of all key words and the parity constant */
  87                 ks[4] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ SKEIN_KS_PARITY;
  88 
                     /* third tweak word is the XOR of the first two */
  89                 ts[2] = ts[0] ^ ts[1];
  90 
  91                 /* get input block in little-endian format */
  92                 Skein_Get64_LSB_First(w, blkPtr, WCNT);
  93                 DebugSaveTweak(ctx);
                     /*
                      * NOTE(review): the Skein_Show_* calls presumably expand to
                      * nothing unless SKEIN_DEBUG is defined, since Xptr is only
                      * declared in that case -- confirm in skein_impl.h
                      */
  94                 Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);
  95 
  96                 X0 = w[0] + ks[0];      /* do the first full key injection */
  97                 X1 = w[1] + ks[1] + ts[0];
  98                 X2 = w[2] + ks[2] + ts[1];
  99                 X3 = w[3] + ks[3];
 100 
 101                 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL,
 102                     Xptr);      /* show starting state values */
 103 
                     /* input is already copied into w[]; step to the next block */
 104                 blkPtr += SKEIN_256_BLOCK_BYTES;
 105 
 106                 /* run the rounds */
 107 
                     /*
                      * Round256: one round = two add / rotate-left / xor mixes,
                      * on the word pairs (p0, p1) and (p2, p3).  ROT##_0 and
                      * ROT##_1 name the two rotation constants; rNum is not used
                      * by this macro itself.
                      */
 108 #define Round256(p0, p1, p2, p3, ROT, rNum)                          \
 109     X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0; \
 110     X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2; \
 111 
                     /*
                      * R256 = one round followed by the trace hook.
                      * I256(R) = key injection number (R) + 1.  The unrolled form
                      * indexes ks[] modulo 5 and ts[] modulo 3; the looped form
                      * indexes relative to r and appends a copy of the oldest
                      * live ks/ts word so that no modulo is needed.
                      */
 112 #if     SKEIN_UNROLL_256 == 0
 113 #define R256(p0, p1, p2, p3, ROT, rNum)         /* fully unrolled */    \
 114     Round256(p0, p1, p2, p3, ROT, rNum)                                 \
 115     Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr);
 116 
 117 #define I256(R)                                                         \
 118     X0 += ks[((R) + 1) % 5];    /* inject the key schedule value */     \
 119     X1 += ks[((R) + 2) % 5] + ts[((R) + 1) % 3];                        \
 120     X2 += ks[((R) + 3) % 5] + ts[((R) + 2) % 3];                        \
 121     X3 += ks[((R) + 4) % 5] + (R) + 1;                                  \
 122     Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
 123 #else                           /* looping version */
 124 #define R256(p0, p1, p2, p3, ROT, rNum)                             \
 125     Round256(p0, p1, p2, p3, ROT, rNum)                             \
 126     Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr);
 127 
 128 #define I256(R)                                                         \
 129         X0 += ks[r + (R) + 0];  /* inject the key schedule value */     \
 130         X1 += ks[r + (R) + 1] + ts[r + (R) + 0];                        \
 131         X2 += ks[r + (R) + 2] + ts[r + (R) + 1];                        \
 132         X3 += ks[r + (R) + 3] + r + (R);                                \
 133         ks[r + (R) + 4] = ks[r + (R) - 1];   /* rotate key schedule */  \
 134     ts[r + (R) + 2] = ts[r + (R) - 1];                                  \
 135     Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
 136 
 137                 /* loop thru it */
 138                 for (r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_256)
 139 #endif
 140                 {
                     /*
                      * R256_8_rounds(R): 8-round group number R -- four rounds,
                      * a key injection, four more rounds, another key injection.
                      */
 141 #define R256_8_rounds(R)                         \
 142         R256(0, 1, 2, 3, R_256_0, 8 * (R) + 1);  \
 143         R256(0, 3, 2, 1, R_256_1, 8 * (R) + 2);  \
 144         R256(0, 1, 2, 3, R_256_2, 8 * (R) + 3);  \
 145         R256(0, 3, 2, 1, R_256_3, 8 * (R) + 4);  \
 146         I256(2 * (R));                           \
 147         R256(0, 1, 2, 3, R_256_4, 8 * (R) + 5);  \
 148         R256(0, 3, 2, 1, R_256_5, 8 * (R) + 6);  \
 149         R256(0, 1, 2, 3, R_256_6, 8 * (R) + 7);  \
 150         R256(0, 3, 2, 1, R_256_7, 8 * (R) + 8);  \
 151         I256(2 * (R) + 1);
 152 
 153                         R256_8_rounds(0);
 154 
                     /*
                      * R256_Unroll_R(NN) is true when 8-round group NN must be
                      * emitted: every group below RCNT when fully unrolled,
                      * otherwise every group below the unroll count.
                      */
 155 #define R256_Unroll_R(NN) \
 156         ((SKEIN_UNROLL_256 == 0 && SKEIN_256_ROUNDS_TOTAL / 8 > (NN)) || \
 157         (SKEIN_UNROLL_256 > (NN)))
 158 
 159 #if     R256_Unroll_R(1)
 160                         R256_8_rounds(1);
 161 #endif
 162 #if     R256_Unroll_R(2)
 163                         R256_8_rounds(2);
 164 #endif
 165 #if     R256_Unroll_R(3)
 166                         R256_8_rounds(3);
 167 #endif
 168 #if     R256_Unroll_R(4)
 169                         R256_8_rounds(4);
 170 #endif
 171 #if     R256_Unroll_R(5)
 172                         R256_8_rounds(5);
 173 #endif
 174 #if     R256_Unroll_R(6)
 175                         R256_8_rounds(6);
 176 #endif
 177 #if     R256_Unroll_R(7)
 178                         R256_8_rounds(7);
 179 #endif
 180 #if     R256_Unroll_R(8)
 181                         R256_8_rounds(8);
 182 #endif
 183 #if     R256_Unroll_R(9)
 184                         R256_8_rounds(9);
 185 #endif
 186 #if     R256_Unroll_R(10)
 187                         R256_8_rounds(10);
 188 #endif
 189 #if     R256_Unroll_R(11)
 190                         R256_8_rounds(11);
 191 #endif
 192 #if     R256_Unroll_R(12)
 193                         R256_8_rounds(12);
 194 #endif
 195 #if     R256_Unroll_R(13)
 196                         R256_8_rounds(13);
 197 #endif
 198 #if     R256_Unroll_R(14)
 199                         R256_8_rounds(14);
 200 #endif
 201 #if     (SKEIN_UNROLL_256 > 14)
 202 #error  "need more unrolling in Skein_256_Process_Block"
 203 #endif
 204                 }
 205                 /*
 206                  * do the final "feedforward" xor, update context chaining vars
 207                  */
 208                 ctx->X[0] = X0 ^ w[0];
 209                 ctx->X[1] = X1 ^ w[1];
 210                 ctx->X[2] = X2 ^ w[2];
 211                 ctx->X[3] = X3 ^ w[3];
 212 
 213                 Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);
 214 
                     /* any further block in this call runs with the FIRST flag clear */
 215                 ts[1] &= ~SKEIN_T1_FLAG_FIRST;
 216         }
 217         while (--blkCnt);
             /* publish the updated tweak words back to the context */
 218         ctx->h.T[0] = ts[0];
 219         ctx->h.T[1] = ts[1];
 220 }
 221 
 222 #if     defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
 /*
  * Skein_256_Process_Block_CodeSize: returns the byte distance from the
  * address of Skein_256_Process_Block to the address of this function.
  * Only built when SKEIN_CODE_SIZE or SKEIN_PERF is defined.
  *
  * NOTE(review): the result is a code-size figure only if this function is
  * placed directly after Skein_256_Process_Block in the output; converting
  * function pointers to uint8_t * and subtracting them is not defined by
  * ISO C, so this relies on the compiler and platform.
  */
 223 size_t
 224 Skein_256_Process_Block_CodeSize(void)
 225 {
 226         return ((uint8_t *)Skein_256_Process_Block_CodeSize) -
 227             ((uint8_t *)Skein_256_Process_Block);
 228 }
 229 
 /*
  * Skein_256_Unroll_Cnt: returns the compile-time value of
  * SKEIN_UNROLL_256 (0 means the round code is fully unrolled; otherwise
  * the number of 8-round groups per loop iteration).
  */
 230 uint_t
 231 Skein_256_Unroll_Cnt(void)
 232 {
 233         return (SKEIN_UNROLL_256);
 234 }
 235 #endif
 236 #endif
 237 
 238 /* Skein_512 */
 239 #if     !(SKEIN_USE_ASM & 512)
 /*
  * Skein_512_Process_Block: C version of the Skein-512 block function
  * (built only when the 512 bit of SKEIN_USE_ASM is clear).
  *
  *   ctx        - hashing context; ctx->X (chaining words) and ctx->h.T
  *                (tweak words) are read on entry and updated on return
  *   blkPtr     - input data, blkCnt blocks of SKEIN_512_BLOCK_BYTES each
  *   blkCnt     - number of blocks to process; must not be 0 (the
  *                do/while loop below always runs at least once)
  *   byteCntAdd - amount added to tweak word T[0] before each block
  *
  * Per block: build the key schedule from ctx->X and the tweak, add the
  * first subkey to the input words, run RCNT groups of eight rounds with a
  * key injection after every fourth round, then store (state XOR input)
  * back into ctx->X and clear SKEIN_T1_FLAG_FIRST in the local T[1].
  */
 240 void
 241 Skein_512_Process_Block(Skein_512_Ctxt_t *ctx, const uint8_t *blkPtr,
 242     size_t blkCnt, size_t byteCntAdd)
 243 {                               /* do it in C */
 244         enum {
 245                 WCNT = SKEIN_512_STATE_WORDS
 246         };
             /* RCNT: number of 8-round groups (see R512_8_rounds below) */
 247 #undef  RCNT
 248 #define RCNT  (SKEIN_512_ROUNDS_TOTAL / 8)
 249 
             /*
              * SKEIN_UNROLL_512 is the tens digit of SKEIN_LOOP:
              * 0 emits every round inline; a non-zero value is the number of
              * 8-round groups emitted per iteration of the round loop.
              */
 250 #ifdef  SKEIN_LOOP              /* configure how much to unroll the loop */
 251 #define SKEIN_UNROLL_512 (((SKEIN_LOOP) / 10) % 10)
 252 #else
 253 #define SKEIN_UNROLL_512 (0)
 254 #endif
 255 
 256 #if     SKEIN_UNROLL_512
 257 #if     (RCNT % SKEIN_UNROLL_512)
 258 #error "Invalid SKEIN_UNROLL_512"       /* sanity check on unroll count */
 259 #endif
 260         size_t r;
 261         /* key schedule words : chaining vars + tweak + "rotation" */
             /*
              * the extra RCNT * 2 words hold the copies that the looped
              * I512() appends to ks[] and ts[] on each key injection
              */
 262         uint64_t kw[WCNT + 4 + RCNT * 2];
 263 #else
 264         uint64_t kw[WCNT + 4];  /* key schedule words : chaining vars + tweak */
 265 #endif
 266         /* local copy of vars, for speed */
 267         uint64_t X0, X1, X2, X3, X4, X5, X6, X7;
 268         uint64_t w[WCNT];               /* local copy of input block */
 269 #ifdef  SKEIN_DEBUG
 270         /* use for debugging (help compiler put Xn in registers) */
 271         const uint64_t *Xptr[8];
 272         Xptr[0] = &X0;
 273         Xptr[1] = &X1;
 274         Xptr[2] = &X2;
 275         Xptr[3] = &X3;
 276         Xptr[4] = &X4;
 277         Xptr[5] = &X5;
 278         Xptr[6] = &X6;
 279         Xptr[7] = &X7;
 280 #endif
 281 
 282         Skein_assert(blkCnt != 0);      /* never call with blkCnt == 0! */
             /* work on a local copy of the tweak; written back after the loop */
 283         ts[0] = ctx->h.T[0];
 284         ts[1] = ctx->h.T[1];
 285         do {
 286                 /*
 287                  * this implementation only supports 2**64 input bytes
 288                  * (no carry out here)
 289                  */
 290                 ts[0] += byteCntAdd;    /* update processed length */
 291 
 292                 /* precompute the key schedule for this block */
 293                 ks[0] = ctx->X[0];
 294                 ks[1] = ctx->X[1];
 295                 ks[2] = ctx->X[2];
 296                 ks[3] = ctx->X[3];
 297                 ks[4] = ctx->X[4];
 298                 ks[5] = ctx->X[5];
 299                 ks[6] = ctx->X[6];
 300                 ks[7] = ctx->X[7];
                     /* extra key word: XOR of all key words and the parity constant */
 301                 ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
 302                     ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY;
 303 
                     /* third tweak word is the XOR of the first two */
 304                 ts[2] = ts[0] ^ ts[1];
 305 
 306                 /* get input block in little-endian format */
 307                 Skein_Get64_LSB_First(w, blkPtr, WCNT);
 308                 DebugSaveTweak(ctx);
                     /*
                      * NOTE(review): the Skein_Show_* calls presumably expand to
                      * nothing unless SKEIN_DEBUG is defined, since Xptr is only
                      * declared in that case -- confirm in skein_impl.h
                      */
 309                 Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);
 310 
                     /* tweak words go into the third- and second-to-last words */
 311                 X0 = w[0] + ks[0];      /* do the first full key injection */
 312                 X1 = w[1] + ks[1];
 313                 X2 = w[2] + ks[2];
 314                 X3 = w[3] + ks[3];
 315                 X4 = w[4] + ks[4];
 316                 X5 = w[5] + ks[5] + ts[0];
 317                 X6 = w[6] + ks[6] + ts[1];
 318                 X7 = w[7] + ks[7];
 319 
                     /* input is already copied into w[]; step to the next block */
 320                 blkPtr += SKEIN_512_BLOCK_BYTES;
 321 
 322                 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL,
 323                     Xptr);
 324                 /* run the rounds */
                     /*
                      * Round512: one round = four add / rotate-left / xor mixes
                      * on the word pairs (p0, p1) ... (p6, p7).  ROT##_0..ROT##_3
                      * name the rotation constants; rNum is not used by this
                      * macro itself.
                      */
 325 #define Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum)             \
 326         X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0;\
 327         X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2;\
 328         X##p4 += X##p5; X##p5 = RotL_64(X##p5, ROT##_2); X##p5 ^= X##p4;\
 329         X##p6 += X##p7; X##p7 = RotL_64(X##p7, ROT##_3); X##p7 ^= X##p6;
 330 
                     /*
                      * R512 = one round followed by the trace hook.
                      * I512(R) = key injection number (R) + 1.  The unrolled form
                      * indexes ks[] modulo 9 and ts[] modulo 3; the looped form
                      * indexes relative to r and appends a copy of the oldest
                      * live ks/ts word so that no modulo is needed.
                      */
 331 #if     SKEIN_UNROLL_512 == 0
 332 #define R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) /* unrolled */  \
 333         Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum)             \
 334         Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr);
 335 
 336 #define I512(R)                                                         \
 337         X0 += ks[((R) + 1) % 9];        /* inject the key schedule value */\
 338         X1 += ks[((R) + 2) % 9];                                        \
 339         X2 += ks[((R) + 3) % 9];                                        \
 340         X3 += ks[((R) + 4) % 9];                                        \
 341         X4 += ks[((R) + 5) % 9];                                        \
 342         X5 += ks[((R) + 6) % 9] + ts[((R) + 1) % 3];                    \
 343         X6 += ks[((R) + 7) % 9] + ts[((R) + 2) % 3];                    \
 344         X7 += ks[((R) + 8) % 9] + (R) + 1;                              \
 345         Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
 346 #else                           /* looping version */
 347 #define R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum)                 \
 348         Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum)             \
 349         Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr);
 350 
 351 #define I512(R)                                                         \
 352         X0 += ks[r + (R) + 0];  /* inject the key schedule value */     \
 353         X1 += ks[r + (R) + 1];                                          \
 354         X2 += ks[r + (R) + 2];                                          \
 355         X3 += ks[r + (R) + 3];                                          \
 356         X4 += ks[r + (R) + 4];                                          \
 357         X5 += ks[r + (R) + 5] + ts[r + (R) + 0];                        \
 358         X6 += ks[r + (R) + 6] + ts[r + (R) + 1];                        \
 359         X7 += ks[r + (R) + 7] + r + (R);                                \
 360         ks[r + (R)+8] = ks[r + (R) - 1];        /* rotate key schedule */\
 361         ts[r + (R)+2] = ts[r + (R) - 1];                                \
 362         Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
 363 
 364                 /* loop thru it */
 365                 for (r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_512)
 366 #endif                          /* end of looped code definitions */
 367                 {
 368 #define R512_8_rounds(R)        /* do 8 full rounds */                  \
 369         R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_0, 8 * (R) + 1);             \
 370         R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_1, 8 * (R) + 2);             \
 371         R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_2, 8 * (R) + 3);             \
 372         R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_3, 8 * (R) + 4);             \
 373         I512(2 * (R));                                                  \
 374         R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_4, 8 * (R) + 5);             \
 375         R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_5, 8 * (R) + 6);             \
 376         R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_6, 8 * (R) + 7);             \
 377         R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_7, 8 * (R) + 8);             \
 378         I512(2*(R) + 1);                /* and key injection */
 379 
 380                         R512_8_rounds(0);
 381 
                     /*
                      * R512_Unroll_R(NN) is true when 8-round group NN must be
                      * emitted: every group below RCNT when fully unrolled,
                      * otherwise every group below the unroll count.
                      */
 382 #define R512_Unroll_R(NN) \
 383         ((SKEIN_UNROLL_512 == 0 && SKEIN_512_ROUNDS_TOTAL / 8 > (NN)) || \
 384         (SKEIN_UNROLL_512 > (NN)))
 385 
 386 #if     R512_Unroll_R(1)
 387                         R512_8_rounds(1);
 388 #endif
 389 #if     R512_Unroll_R(2)
 390                         R512_8_rounds(2);
 391 #endif
 392 #if     R512_Unroll_R(3)
 393                         R512_8_rounds(3);
 394 #endif
 395 #if     R512_Unroll_R(4)
 396                         R512_8_rounds(4);
 397 #endif
 398 #if     R512_Unroll_R(5)
 399                         R512_8_rounds(5);
 400 #endif
 401 #if     R512_Unroll_R(6)
 402                         R512_8_rounds(6);
 403 #endif
 404 #if     R512_Unroll_R(7)
 405                         R512_8_rounds(7);
 406 #endif
 407 #if     R512_Unroll_R(8)
 408                         R512_8_rounds(8);
 409 #endif
 410 #if     R512_Unroll_R(9)
 411                         R512_8_rounds(9);
 412 #endif
 413 #if     R512_Unroll_R(10)
 414                         R512_8_rounds(10);
 415 #endif
 416 #if     R512_Unroll_R(11)
 417                         R512_8_rounds(11);
 418 #endif
 419 #if     R512_Unroll_R(12)
 420                         R512_8_rounds(12);
 421 #endif
 422 #if     R512_Unroll_R(13)
 423                         R512_8_rounds(13);
 424 #endif
 425 #if     R512_Unroll_R(14)
 426                         R512_8_rounds(14);
 427 #endif
 428 #if     (SKEIN_UNROLL_512 > 14)
 429 #error "need more unrolling in Skein_512_Process_Block"
 430 #endif
 431                 }
 432 
 433                 /*
 434                  * do the final "feedforward" xor, update context chaining vars
 435                  */
 436                 ctx->X[0] = X0 ^ w[0];
 437                 ctx->X[1] = X1 ^ w[1];
 438                 ctx->X[2] = X2 ^ w[2];
 439                 ctx->X[3] = X3 ^ w[3];
 440                 ctx->X[4] = X4 ^ w[4];
 441                 ctx->X[5] = X5 ^ w[5];
 442                 ctx->X[6] = X6 ^ w[6];
 443                 ctx->X[7] = X7 ^ w[7];
 444                 Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);
 445 
                     /* any further block in this call runs with the FIRST flag clear */
 446                 ts[1] &= ~SKEIN_T1_FLAG_FIRST;
 447         }
 448         while (--blkCnt);
             /* publish the updated tweak words back to the context */
 449         ctx->h.T[0] = ts[0];
 450         ctx->h.T[1] = ts[1];
 451 }
 452 
 453 #if     defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
 /*
  * Skein_512_Process_Block_CodeSize: returns the byte distance from the
  * address of Skein_512_Process_Block to the address of this function.
  * Only built when SKEIN_CODE_SIZE or SKEIN_PERF is defined.
  *
  * NOTE(review): the result is a code-size figure only if this function is
  * placed directly after Skein_512_Process_Block in the output; converting
  * function pointers to uint8_t * and subtracting them is not defined by
  * ISO C, so this relies on the compiler and platform.
  */
 454 size_t
 455 Skein_512_Process_Block_CodeSize(void)
 456 {
 457         return ((uint8_t *)Skein_512_Process_Block_CodeSize) -
 458             ((uint8_t *)Skein_512_Process_Block);
 459 }
 460 
 /*
  * Skein_512_Unroll_Cnt: returns the compile-time value of
  * SKEIN_UNROLL_512 (0 means the round code is fully unrolled; otherwise
  * the number of 8-round groups per loop iteration).
  */
 461 uint_t
 462 Skein_512_Unroll_Cnt(void)
 463 {
 464         return (SKEIN_UNROLL_512);
 465 }
 466 #endif
 467 #endif
 468 
 469 /*  Skein1024 */
 470 #if     !(SKEIN_USE_ASM & 1024)
 /*
  * Skein1024_Process_Block: C version of the Skein-1024 block function
  * (built only when the 1024 bit of SKEIN_USE_ASM is clear).
  *
  *   ctx        - hashing context; ctx->X (chaining words) and ctx->h.T
  *                (tweak words) are read on entry and updated on return
  *   blkPtr     - input data, blkCnt blocks of SKEIN1024_BLOCK_BYTES each
  *   blkCnt     - number of blocks to process; must not be 0 (the
  *                do/while loop below always runs at least once)
  *   byteCntAdd - amount added to tweak word T[0] before each block
  *
  * Per block: build the key schedule from ctx->X and the tweak, add the
  * first subkey to the input words, run RCNT groups of eight rounds with a
  * key injection after every fourth round, then store (state XOR input)
  * back into ctx->X and clear SKEIN_T1_FLAG_FIRST in the local T[1].
  */
 471 void
 472 Skein1024_Process_Block(Skein1024_Ctxt_t *ctx, const uint8_t *blkPtr,
 473     size_t blkCnt, size_t byteCntAdd)
 474 {
             /*
              * NOTE(review): the comment below says "always looping", but the
              * code still honours SKEIN_UNROLL_1024 == 0, which selects the
              * fully unrolled path; looping is only the SKEIN_LOOP default.
              */
 475         /* do it in C, always looping (unrolled is bigger AND slower!) */
 476         enum {
 477                 WCNT = SKEIN1024_STATE_WORDS
 478         };
             /* RCNT: number of 8-round groups (see R1024_8_rounds below) */
 479 #undef  RCNT
 480 #define RCNT  (SKEIN1024_ROUNDS_TOTAL/8)
 481 
             /*
              * SKEIN_UNROLL_1024 is the ones digit of SKEIN_LOOP:
              * 0 emits every round inline; a non-zero value is the number of
              * 8-round groups emitted per iteration of the round loop.
              */
 482 #ifdef  SKEIN_LOOP              /* configure how much to unroll the loop */
 483 #define SKEIN_UNROLL_1024 ((SKEIN_LOOP)%10)
 484 #else
 485 #define SKEIN_UNROLL_1024 (0)
 486 #endif
 487 
 488 #if     (SKEIN_UNROLL_1024 != 0)
 489 #if     (RCNT % SKEIN_UNROLL_1024)
 490 #error "Invalid SKEIN_UNROLL_1024"      /* sanity check on unroll count */
 491 #endif
 492         size_t r;
 493         /* key schedule words : chaining vars + tweak + "rotation" */
             /*
              * the extra RCNT * 2 words hold the copies that the looped
              * I1024() appends to ks[] and ts[] on each key injection
              */
 494         uint64_t kw[WCNT + 4 + RCNT * 2];
 495 #else
 496         uint64_t kw[WCNT + 4];  /* key schedule words : chaining vars + tweak */
 497 #endif
 498 
 499         /* local copy of vars, for speed */
 500         uint64_t X00, X01, X02, X03, X04, X05, X06, X07, X08, X09, X10, X11,
 501             X12, X13, X14, X15;
 502         uint64_t w[WCNT];               /* local copy of input block */
 503 #ifdef  SKEIN_DEBUG
 504         /* use for debugging (help compiler put Xn in registers) */
 505         const uint64_t *Xptr[16];
 506         Xptr[0] = &X00;
 507         Xptr[1] = &X01;
 508         Xptr[2] = &X02;
 509         Xptr[3] = &X03;
 510         Xptr[4] = &X04;
 511         Xptr[5] = &X05;
 512         Xptr[6] = &X06;
 513         Xptr[7] = &X07;
 514         Xptr[8] = &X08;
 515         Xptr[9] = &X09;
 516         Xptr[10] = &X10;
 517         Xptr[11] = &X11;
 518         Xptr[12] = &X12;
 519         Xptr[13] = &X13;
 520         Xptr[14] = &X14;
 521         Xptr[15] = &X15;
 522 #endif
 523 
 524         Skein_assert(blkCnt != 0);      /* never call with blkCnt == 0! */
             /* work on a local copy of the tweak; written back after the loop */
 525         ts[0] = ctx->h.T[0];
 526         ts[1] = ctx->h.T[1];
 527         do {
 528                 /*
 529                  * this implementation only supports 2**64 input bytes
 530                  * (no carry out here)
 531                  */
 532                 ts[0] += byteCntAdd;    /* update processed length */
 533 
 534                 /* precompute the key schedule for this block */
 535                 ks[0] = ctx->X[0];
 536                 ks[1] = ctx->X[1];
 537                 ks[2] = ctx->X[2];
 538                 ks[3] = ctx->X[3];
 539                 ks[4] = ctx->X[4];
 540                 ks[5] = ctx->X[5];
 541                 ks[6] = ctx->X[6];
 542                 ks[7] = ctx->X[7];
 543                 ks[8] = ctx->X[8];
 544                 ks[9] = ctx->X[9];
 545                 ks[10] = ctx->X[10];
 546                 ks[11] = ctx->X[11];
 547                 ks[12] = ctx->X[12];
 548                 ks[13] = ctx->X[13];
 549                 ks[14] = ctx->X[14];
 550                 ks[15] = ctx->X[15];
                     /* extra key word: XOR of all key words and the parity constant */
 551                 ks[16] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
 552                     ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^
 553                     ks[8] ^ ks[9] ^ ks[10] ^ ks[11] ^
 554                     ks[12] ^ ks[13] ^ ks[14] ^ ks[15] ^ SKEIN_KS_PARITY;
 555 
                     /* third tweak word is the XOR of the first two */
 556                 ts[2] = ts[0] ^ ts[1];
 557 
 558                 /* get input block in little-endian format */
 559                 Skein_Get64_LSB_First(w, blkPtr, WCNT);
 560                 DebugSaveTweak(ctx);
                     /*
                      * NOTE(review): the Skein_Show_* calls presumably expand to
                      * nothing unless SKEIN_DEBUG is defined, since Xptr is only
                      * declared in that case -- confirm in skein_impl.h
                      */
 561                 Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);
 562 
                     /* tweak words go into the third- and second-to-last words */
 563                 X00 = w[0] + ks[0];     /* do the first full key injection */
 564                 X01 = w[1] + ks[1];
 565                 X02 = w[2] + ks[2];
 566                 X03 = w[3] + ks[3];
 567                 X04 = w[4] + ks[4];
 568                 X05 = w[5] + ks[5];
 569                 X06 = w[6] + ks[6];
 570                 X07 = w[7] + ks[7];
 571                 X08 = w[8] + ks[8];
 572                 X09 = w[9] + ks[9];
 573                 X10 = w[10] + ks[10];
 574                 X11 = w[11] + ks[11];
 575                 X12 = w[12] + ks[12];
 576                 X13 = w[13] + ks[13] + ts[0];
 577                 X14 = w[14] + ks[14] + ts[1];
 578                 X15 = w[15] + ks[15];
 579 
 580                 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL,
 581                     Xptr);
 582 
                     /*
                      * Round1024: one round = eight add / rotate-left / xor mixes
                      * on the word pairs (p0, p1) ... (pE, pF).  ROT##_0..ROT##_7
                      * name the rotation constants; rNum is not used by this
                      * macro itself.
                      */
 583 #define Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC,   \
 584         pD, pE, pF, ROT, rNum)                                          \
 585         X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0;\
 586         X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2;\
 587         X##p4 += X##p5; X##p5 = RotL_64(X##p5, ROT##_2); X##p5 ^= X##p4;\
 588         X##p6 += X##p7; X##p7 = RotL_64(X##p7, ROT##_3); X##p7 ^= X##p6;\
 589         X##p8 += X##p9; X##p9 = RotL_64(X##p9, ROT##_4); X##p9 ^= X##p8;\
 590         X##pA += X##pB; X##pB = RotL_64(X##pB, ROT##_5); X##pB ^= X##pA;\
 591         X##pC += X##pD; X##pD = RotL_64(X##pD, ROT##_6); X##pD ^= X##pC;\
 592         X##pE += X##pF; X##pF = RotL_64(X##pF, ROT##_7); X##pF ^= X##pE;
 593 
                     /*
                      * R1024 = one round followed by the trace hook.
                      * I1024(R) = key injection number (R) + 1.  The unrolled form
                      * indexes ks[] modulo 17 and ts[] modulo 3; the looped form
                      * indexes relative to r and appends a copy of the oldest
                      * live ks/ts word so that no modulo is needed.
                      */
 594 #if     SKEIN_UNROLL_1024 == 0
 595 #define R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD,   \
 596         pE, pF, ROT, rn)                                                \
 597         Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC,   \
 598         pD, pE, pF, ROT, rn)                                            \
 599         Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rn, Xptr);
 600 
 601 #define I1024(R)                                                        \
 602         X00 += ks[((R) + 1) % 17];      /* inject the key schedule value */\
 603         X01 += ks[((R) + 2) % 17];                                      \
 604         X02 += ks[((R) + 3) % 17];                                      \
 605         X03 += ks[((R) + 4) % 17];                                      \
 606         X04 += ks[((R) + 5) % 17];                                      \
 607         X05 += ks[((R) + 6) % 17];                                      \
 608         X06 += ks[((R) + 7) % 17];                                      \
 609         X07 += ks[((R) + 8) % 17];                                      \
 610         X08 += ks[((R) + 9) % 17];                                      \
 611         X09 += ks[((R) + 10) % 17];                                     \
 612         X10 += ks[((R) + 11) % 17];                                     \
 613         X11 += ks[((R) + 12) % 17];                                     \
 614         X12 += ks[((R) + 13) % 17];                                     \
 615         X13 += ks[((R) + 14) % 17] + ts[((R) + 1) % 3];                 \
 616         X14 += ks[((R) + 15) % 17] + ts[((R) + 2) % 3];                 \
 617         X15 += ks[((R) + 16) % 17] + (R) +1;                            \
 618         Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
 619 #else                           /* looping version */
 620 #define R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD,   \
 621         pE, pF, ROT, rn)                                                \
 622         Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC,   \
 623         pD, pE, pF, ROT, rn)                                            \
 624         Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rn, Xptr);
 625 
 626 #define I1024(R)                                                        \
 627         X00 += ks[r + (R) + 0]; /* inject the key schedule value */     \
 628         X01 += ks[r + (R) + 1];                                         \
 629         X02 += ks[r + (R) + 2];                                         \
 630         X03 += ks[r + (R) + 3];                                         \
 631         X04 += ks[r + (R) + 4];                                         \
 632         X05 += ks[r + (R) + 5];                                         \
 633         X06 += ks[r + (R) + 6];                                         \
 634         X07 += ks[r + (R) + 7];                                         \
 635         X08 += ks[r + (R) + 8];                                         \
 636         X09 += ks[r + (R) + 9];                                         \
 637         X10 += ks[r + (R) + 10];                                        \
 638         X11 += ks[r + (R) + 11];                                        \
 639         X12 += ks[r + (R) + 12];                                        \
 640         X13 += ks[r + (R) + 13] + ts[r + (R) + 0];                      \
 641         X14 += ks[r + (R) + 14] + ts[r + (R) + 1];                      \
 642         X15 += ks[r + (R) + 15] +  r + (R);                             \
 643         ks[r + (R) + 16] = ks[r + (R) - 1];     /* rotate key schedule */\
 644         ts[r + (R) + 2] = ts[r + (R) - 1];                              \
 645         Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
 646 
                     /*
                      * r starts at 1 and advances by an even step, so it is
                      * always odd; against the even bound 2 * RCNT the "<="
                      * here stops at the same point as "<" would.
                      */
 647                 /* loop thru it */
 648                 for (r = 1; r <= 2 * RCNT; r += 2 * SKEIN_UNROLL_1024)
 649 #endif
 650                 {
 651 #define R1024_8_rounds(R)       /* do 8 full rounds */                  \
 652         R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13,   \
 653             14, 15, R1024_0, 8 * (R) + 1);                              \
 654         R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, 05,   \
 655             08, 01, R1024_1, 8 * (R) + 2);                              \
 656         R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, 11,   \
 657             10, 09, R1024_2, 8 * (R) + 3);                              \
 658         R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, 03,   \
 659             12, 07, R1024_3, 8 * (R) + 4);                              \
 660         I1024(2 * (R));                                                 \
 661         R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13,   \
 662             14, 15, R1024_4, 8 * (R) + 5);                              \
 663         R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, 05,   \
 664             08, 01, R1024_5, 8 * (R) + 6);                              \
 665         R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, 11,   \
 666             10, 09, R1024_6, 8 * (R) + 7);                              \
 667         R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, 03,   \
 668             12, 07, R1024_7, 8 * (R) + 8);                              \
 669         I1024(2 * (R) + 1);
 670 
 671                         R1024_8_rounds(0);
 672 
                     /*
                      * R1024_Unroll_R(NN) is true when 8-round group NN must be
                      * emitted: every group below RCNT when fully unrolled,
                      * otherwise every group below the unroll count.
                      */
 673 #define R1024_Unroll_R(NN)                                              \
 674         ((SKEIN_UNROLL_1024 == 0 && SKEIN1024_ROUNDS_TOTAL/8 > (NN)) ||      \
 675         (SKEIN_UNROLL_1024 > (NN)))
 676 
 677 #if     R1024_Unroll_R(1)
 678                         R1024_8_rounds(1);
 679 #endif
 680 #if     R1024_Unroll_R(2)
 681                         R1024_8_rounds(2);
 682 #endif
 683 #if     R1024_Unroll_R(3)
 684                         R1024_8_rounds(3);
 685 #endif
 686 #if     R1024_Unroll_R(4)
 687                         R1024_8_rounds(4);
 688 #endif
 689 #if     R1024_Unroll_R(5)
 690                         R1024_8_rounds(5);
 691 #endif
 692 #if     R1024_Unroll_R(6)
 693                         R1024_8_rounds(6);
 694 #endif
 695 #if     R1024_Unroll_R(7)
 696                         R1024_8_rounds(7);
 697 #endif
 698 #if     R1024_Unroll_R(8)
 699                         R1024_8_rounds(8);
 700 #endif
 701 #if     R1024_Unroll_R(9)
 702                         R1024_8_rounds(9);
 703 #endif
 704 #if     R1024_Unroll_R(10)
 705                         R1024_8_rounds(10);
 706 #endif
 707 #if     R1024_Unroll_R(11)
 708                         R1024_8_rounds(11);
 709 #endif
 710 #if     R1024_Unroll_R(12)
 711                         R1024_8_rounds(12);
 712 #endif
 713 #if     R1024_Unroll_R(13)
 714                         R1024_8_rounds(13);
 715 #endif
 716 #if     R1024_Unroll_R(14)
 717                         R1024_8_rounds(14);
 718 #endif
 719 #if     (SKEIN_UNROLL_1024 > 14)
 720 #error  "need more unrolling in Skein_1024_Process_Block"
 721 #endif
 722                 }
 723                 /*
 724                  * do the final "feedforward" xor, update context chaining vars
 725                  */
 726 
 727                 ctx->X[0] = X00 ^ w[0];
 728                 ctx->X[1] = X01 ^ w[1];
 729                 ctx->X[2] = X02 ^ w[2];
 730                 ctx->X[3] = X03 ^ w[3];
 731                 ctx->X[4] = X04 ^ w[4];
 732                 ctx->X[5] = X05 ^ w[5];
 733                 ctx->X[6] = X06 ^ w[6];
 734                 ctx->X[7] = X07 ^ w[7];
 735                 ctx->X[8] = X08 ^ w[8];
 736                 ctx->X[9] = X09 ^ w[9];
 737                 ctx->X[10] = X10 ^ w[10];
 738                 ctx->X[11] = X11 ^ w[11];
 739                 ctx->X[12] = X12 ^ w[12];
 740                 ctx->X[13] = X13 ^ w[13];
 741                 ctx->X[14] = X14 ^ w[14];
 742                 ctx->X[15] = X15 ^ w[15];
 743 
 744                 Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);
 745 
                     /* any further block in this call runs with the FIRST flag clear */
 746                 ts[1] &= ~SKEIN_T1_FLAG_FIRST;
                     /*
                      * unlike the 256/512 versions, the input pointer is advanced
                      * here at the end of the iteration
                      */
 747                 blkPtr += SKEIN1024_BLOCK_BYTES;
 748         } while (--blkCnt);
             /* publish the updated tweak words back to the context */
 749         ctx->h.T[0] = ts[0];
 750         ctx->h.T[1] = ts[1];
 751 }
 752 
 753 #if     defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
 754 size_t
 /*
  * Returns the byte distance from the address of Skein1024_Process_Block
  * to the address of this function.  Only built when SKEIN_CODE_SIZE or
  * SKEIN_PERF is defined.
  *
  * NOTE(review): the result is a code-size figure only if this function is
  * placed directly after Skein1024_Process_Block in the output; converting
  * function pointers to uint8_t * and subtracting them is not defined by
  * ISO C, so this relies on the compiler and platform.
  */
 755 Skein1024_Process_Block_CodeSize(void)
 756 {
 757         return ((uint8_t *)Skein1024_Process_Block_CodeSize) -
 758             ((uint8_t *)Skein1024_Process_Block);
 759 }
 760 
 /*
  * Skein1024_Unroll_Cnt: returns the compile-time value of
  * SKEIN_UNROLL_1024 (0 means the round code is fully unrolled; otherwise
  * the number of 8-round groups per loop iteration).
  */
 761 uint_t
 762 Skein1024_Unroll_Cnt(void)
 763 {
 764         return (SKEIN_UNROLL_1024);
 765 }
 766 #endif
 767 #endif