1 /*
   2  * match.S -- optimized version of longest_match()
   3  * based on the similar work by Gilles Vollant, and Brian Raiter, written 1998
   4  *
   5  * This is free software; you can redistribute it and/or modify it
   6  * under the terms of the BSD License. Use by owners of Che Guevarra
   7  * parafernalia is prohibited, where possible, and highly discouraged
   8  * elsewhere.
   9  */
  10 
  11 #ifndef NO_UNDERLINE
  12 #       define  match_init      _match_init
  13 #       define  longest_match   _longest_match
  14 #endif
  15 
  16 #define scanend         ebx
  17 #define scanendw        bx
  18 #define chainlenwmask   edx /* high word: current chain len low word: s->wmask */
  19 #define curmatch        rsi
  20 #define curmatchd       esi
  21 #define windowbestlen   r8
  22 #define scanalign       r9
  23 #define scanalignd      r9d
  24 #define window          r10
  25 #define bestlen         r11
  26 #define bestlend        r11d
  27 #define scanstart       r12d
  28 #define scanstartw      r12w
  29 #define scan            r13
  30 #define nicematch       r14d
  31 #define limit           r15
  32 #define limitd          r15d
  33 #define prev            rcx
  34 
  35 /*
  36  * The 258 is a "magic number, not a parameter -- changing it
  37  * breaks the hell loose
  38  */
  39 #define MAX_MATCH       (258)
  40 #define MIN_MATCH       (3)
  41 #define MIN_LOOKAHEAD   (MAX_MATCH + MIN_MATCH + 1)
  42 #define MAX_MATCH_8     ((MAX_MATCH + 7) & ~7)
  43 
  44 /* stack frame offsets */
  45 #define LocalVarsSize   (112)
  46 #define _chainlenwmask  ( 8-LocalVarsSize)(%rsp)
  47 #define _windowbestlen  (16-LocalVarsSize)(%rsp)
  48 #define save_r14        (24-LocalVarsSize)(%rsp)
  49 #define save_rsi        (32-LocalVarsSize)(%rsp)
  50 #define save_rbx        (40-LocalVarsSize)(%rsp)
  51 #define save_r12        (56-LocalVarsSize)(%rsp)
  52 #define save_r13        (64-LocalVarsSize)(%rsp)
  53 #define save_r15        (80-LocalVarsSize)(%rsp)
  54 
  55 
  56 .globl  match_init, longest_match
  57 
  58 /*
  59  * On AMD64 the first argument of a function (in our case -- the pointer to
  60  * deflate_state structure) is passed in %rdi, hence our offsets below are
  61  * all off of that.
  62  */
  63 
  64 /* you can check the structure offset by running
  65 
  66 #include <stdlib.h>
  67 #include <stdio.h>
  68 #include "deflate.h"
  69 
  70 void print_depl()
  71 {
  72 deflate_state ds;
  73 deflate_state *s=&ds;
  74 printf("size pointer=%u\n",(int)sizeof(void*));
  75 
  76 printf("#define dsWSize         (%3u)(%%rdi)\n",(int)(((char*)&(s->w_size))-((char*)s)));
  77 printf("#define dsWMask         (%3u)(%%rdi)\n",(int)(((char*)&(s->w_mask))-((char*)s)));
  78 printf("#define dsWindow        (%3u)(%%rdi)\n",(int)(((char*)&(s->window))-((char*)s)));
  79 printf("#define dsPrev          (%3u)(%%rdi)\n",(int)(((char*)&(s->prev))-((char*)s)));
  80 printf("#define dsMatchLen      (%3u)(%%rdi)\n",(int)(((char*)&(s->match_length))-((char*)s)));
  81 printf("#define dsPrevMatch     (%3u)(%%rdi)\n",(int)(((char*)&(s->prev_match))-((char*)s)));
  82 printf("#define dsStrStart      (%3u)(%%rdi)\n",(int)(((char*)&(s->strstart))-((char*)s)));
  83 printf("#define dsMatchStart    (%3u)(%%rdi)\n",(int)(((char*)&(s->match_start))-((char*)s)));
  84 printf("#define dsLookahead     (%3u)(%%rdi)\n",(int)(((char*)&(s->lookahead))-((char*)s)));
  85 printf("#define dsPrevLen       (%3u)(%%rdi)\n",(int)(((char*)&(s->prev_length))-((char*)s)));
  86 printf("#define dsMaxChainLen   (%3u)(%%rdi)\n",(int)(((char*)&(s->max_chain_length))-((char*)s)));
  87 printf("#define dsGoodMatch     (%3u)(%%rdi)\n",(int)(((char*)&(s->good_match))-((char*)s)));
  88 printf("#define dsNiceMatch     (%3u)(%%rdi)\n",(int)(((char*)&(s->nice_match))-((char*)s)));
  89 }
  90 
  91 */
  92 
  93 
  94 /*
  95   to compile for XCode 3.2 on MacOSX x86_64
  96   - run "gcc -g -c -DXCODE_MAC_X64_STRUCTURE amd64-match.S"
  97  */
  98 
  99 
 100 #ifndef CURRENT_LINX_XCODE_MAC_X64_STRUCTURE
 101 #define dsWSize         ( 68)(%rdi)
 102 #define dsWMask         ( 76)(%rdi)
 103 #define dsWindow        ( 80)(%rdi)
 104 #define dsPrev          ( 96)(%rdi)
 105 #define dsMatchLen      (144)(%rdi)
 106 #define dsPrevMatch     (148)(%rdi)
 107 #define dsStrStart      (156)(%rdi)
 108 #define dsMatchStart    (160)(%rdi)
 109 #define dsLookahead     (164)(%rdi)
 110 #define dsPrevLen       (168)(%rdi)
 111 #define dsMaxChainLen   (172)(%rdi)
 112 #define dsGoodMatch     (188)(%rdi)
 113 #define dsNiceMatch     (192)(%rdi)
 114 
 115 #else 
 116 
 117 #ifndef STRUCT_OFFSET
 118 #       define STRUCT_OFFSET    (0)
 119 #endif
 120 
 121 
 122 #define dsWSize         ( 56 + STRUCT_OFFSET)(%rdi)
 123 #define dsWMask         ( 64 + STRUCT_OFFSET)(%rdi)
 124 #define dsWindow        ( 72 + STRUCT_OFFSET)(%rdi)
 125 #define dsPrev          ( 88 + STRUCT_OFFSET)(%rdi)
 126 #define dsMatchLen      (136 + STRUCT_OFFSET)(%rdi)
 127 #define dsPrevMatch     (140 + STRUCT_OFFSET)(%rdi)
 128 #define dsStrStart      (148 + STRUCT_OFFSET)(%rdi)
 129 #define dsMatchStart    (152 + STRUCT_OFFSET)(%rdi)
 130 #define dsLookahead     (156 + STRUCT_OFFSET)(%rdi)
 131 #define dsPrevLen       (160 + STRUCT_OFFSET)(%rdi)
 132 #define dsMaxChainLen   (164 + STRUCT_OFFSET)(%rdi)
 133 #define dsGoodMatch     (180 + STRUCT_OFFSET)(%rdi)
 134 #define dsNiceMatch     (184 + STRUCT_OFFSET)(%rdi)
 135 
 136 #endif
 137 
 138 
 139 
 140 
 141 .text
 142 
 143 /* uInt longest_match(deflate_state *deflatestate, IPos curmatch) */
 144 
 145 longest_match:
 146 /*
 147  * Retrieve the function arguments. %curmatch will hold cur_match
 148  * throughout the entire function (passed via rsi on amd64).
 149  * rdi will hold the pointer to the deflate_state (first arg on amd64)
 150  */
 151                 mov     %rsi, save_rsi
 152                 mov     %rbx, save_rbx
 153                 mov     %r12, save_r12
 154                 mov     %r13, save_r13
 155                 mov     %r14, save_r14
 156                 mov     %r15, save_r15
 157 
 158 /* uInt wmask = s->w_mask;                                           */
 159 /* unsigned chain_length = s->max_chain_length;                              */
 160 /* if (s->prev_length >= s->good_match) {                              */
 161 /*     chain_length >>= 2;                                                */
 162 /* }                                                                    */
 163 
 164                 movl    dsPrevLen, %eax
 165                 movl    dsGoodMatch, %ebx
 166                 cmpl    %ebx, %eax
 167                 movl    dsWMask, %eax
 168                 movl    dsMaxChainLen, %chainlenwmask
 169                 jl      LastMatchGood
 170                 shrl    $2, %chainlenwmask
 171 LastMatchGood:
 172 
 173 /* chainlen is decremented once beforehand so that the function can     */
 174 /* use the sign flag instead of the zero flag for the exit test.        */
 175 /* It is then shifted into the high word, to make room for the wmask    */
 176 /* value, which it will always accompany.                               */
 177 
 178                 decl    %chainlenwmask
 179                 shll    $16, %chainlenwmask
 180                 orl     %eax, %chainlenwmask
 181 
 182 /* if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;     */
 183 
 184                 movl    dsNiceMatch, %eax
 185                 movl    dsLookahead, %ebx
 186                 cmpl    %eax, %ebx
 187                 jl      LookaheadLess
 188                 movl    %eax, %ebx
 189 LookaheadLess:  movl    %ebx, %nicematch
 190 
 191 /* register Bytef *scan = s->window + s->strstart;                        */
 192 
 193                 mov     dsWindow, %window
 194                 movl    dsStrStart, %limitd
 195                 lea     (%limit, %window), %scan
 196 
 197 /* Determine how many bytes the scan ptr is off from being              */
 198 /* dword-aligned.                                                       */
 199 
 200                 mov     %scan, %scanalign
 201                 negl    %scanalignd
 202                 andl    $3, %scanalignd
 203 
 204 /* IPos limit = s->strstart > (IPos)MAX_DIST(s) ?                 */
 205 /*     s->strstart - (IPos)MAX_DIST(s) : NIL;                                */
 206 
 207                 movl    dsWSize, %eax
 208                 subl    $MIN_LOOKAHEAD, %eax
 209                 xorl    %ecx, %ecx
 210                 subl    %eax, %limitd
 211                 cmovng  %ecx, %limitd
 212 
 213 /* int best_len = s->prev_length;                                    */
 214 
 215                 movl    dsPrevLen, %bestlend
 216 
 217 /* Store the sum of s->window + best_len in %windowbestlen locally, and in memory.   */
 218 
 219                 lea     (%window, %bestlen), %windowbestlen
 220                 mov     %windowbestlen, _windowbestlen
 221 
 222 /* register ush scan_start = *(ushf*)scan;                              */
 223 /* register ush scan_end   = *(ushf*)(scan+best_len-1);                 */
 224 /* Posf *prev = s->prev;                                             */
 225 
 226                 movzwl  (%scan), %scanstart
 227                 movzwl  -1(%scan, %bestlen), %scanend
 228                 mov     dsPrev, %prev
 229 
 230 /* Jump into the main loop.                                             */
 231 
 232                 movl    %chainlenwmask, _chainlenwmask
 233                 jmp     LoopEntry
 234 
 235 .balign 16
 236 
 237 /* do {
 238  *     match = s->window + cur_match;
 239  *     if (*(ushf*)(match+best_len-1) != scan_end ||
 240  *         *(ushf*)match != scan_start) continue;
 241  *     [...]
 242  * } while ((cur_match = prev[cur_match & wmask]) > limit
 243  *          && --chain_length != 0);
 244  *
 245  * Here is the inner loop of the function. The function will spend the
 246  * majority of its time in this loop, and majority of that time will
 247  * be spent in the first ten instructions.
 248  */
 249 LookupLoop:
 250                 andl    %chainlenwmask, %curmatchd
 251                 movzwl  (%prev, %curmatch, 2), %curmatchd
 252                 cmpl    %limitd, %curmatchd
 253                 jbe     LeaveNow
 254                 subl    $0x00010000, %chainlenwmask
 255                 js      LeaveNow
 256 LoopEntry:      cmpw    -1(%windowbestlen, %curmatch), %scanendw
 257                 jne     LookupLoop
 258                 cmpw    %scanstartw, (%window, %curmatch)
 259                 jne     LookupLoop
 260 
 261 /* Store the current value of chainlen.                                 */
 262                 movl    %chainlenwmask, _chainlenwmask
 263 
 264 /* %scan is the string under scrutiny, and %prev to the string we       */
 265 /* are hoping to match it up with. In actuality, %esi and %edi are      */
 266 /* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is      */
 267 /* initialized to -(MAX_MATCH_8 - scanalign).                           */
 268 
 269                 mov     $(-MAX_MATCH_8), %rdx
 270                 lea     (%curmatch, %window), %windowbestlen
 271                 lea     MAX_MATCH_8(%windowbestlen, %scanalign), %windowbestlen
 272                 lea     MAX_MATCH_8(%scan, %scanalign), %prev
 273 
 274 /* the prefetching below makes very little difference... */
 275                 prefetcht1      (%windowbestlen, %rdx)
 276                 prefetcht1      (%prev, %rdx)
 277 
 278 /*
 279  * Test the strings for equality, 8 bytes at a time. At the end,
 280  * adjust %rdx so that it is offset to the exact byte that mismatched.
 281  *
 282  * It should be confessed that this loop usually does not represent
 283  * much of the total running time. Replacing it with a more
 284  * straightforward "rep cmpsb" would not drastically degrade
 285  * performance -- unrolling it, for example, makes no difference.
 286  */
 287 
 288 #undef USE_SSE  /* works, but is 6-7% slower, than non-SSE... */
 289 
 290 LoopCmps:
 291 #ifdef USE_SSE
 292                 /* Preload the SSE registers */
 293                 movdqu    (%windowbestlen, %rdx), %xmm1
 294                 movdqu    (%prev, %rdx), %xmm2
 295                 pcmpeqb %xmm2, %xmm1
 296                 movdqu  16(%windowbestlen, %rdx), %xmm3
 297                 movdqu  16(%prev, %rdx), %xmm4
 298                 pcmpeqb %xmm4, %xmm3
 299                 movdqu  32(%windowbestlen, %rdx), %xmm5
 300                 movdqu  32(%prev, %rdx), %xmm6
 301                 pcmpeqb %xmm6, %xmm5
 302                 movdqu  48(%windowbestlen, %rdx), %xmm7
 303                 movdqu  48(%prev, %rdx), %xmm8
 304                 pcmpeqb %xmm8, %xmm7
 305 
 306                 /* Check the comparisions' results */
 307                 pmovmskb %xmm1, %rax
 308                 notw    %ax
 309                 bsfw    %ax, %ax
 310                 jnz     LeaveLoopCmps
 311                 
 312                 /* this is the only iteration of the loop with a possibility of having
 313                    incremented rdx by 0x108 (each loop iteration add 16*4 = 0x40 
 314                    and (0x40*4)+8=0x108 */
 315                 add     $8, %rdx
 316                 jz LenMaximum
 317                 add     $8, %rdx
 318 
 319                 
 320                 pmovmskb %xmm3, %rax
 321                 notw    %ax
 322                 bsfw    %ax, %ax
 323                 jnz     LeaveLoopCmps
 324                 
 325                 
 326                 add     $16, %rdx
 327 
 328 
 329                 pmovmskb %xmm5, %rax
 330                 notw    %ax
 331                 bsfw    %ax, %ax
 332                 jnz     LeaveLoopCmps
 333                 
 334                 add     $16, %rdx
 335 
 336 
 337                 pmovmskb %xmm7, %rax
 338                 notw    %ax
 339                 bsfw    %ax, %ax
 340                 jnz     LeaveLoopCmps
 341                 
 342                 add     $16, %rdx
 343                 
 344                 jmp     LoopCmps
 345 LeaveLoopCmps:  add     %rax, %rdx
 346 #else
 347                 mov     (%windowbestlen, %rdx), %rax
 348                 xor     (%prev, %rdx), %rax
 349                 jnz     LeaveLoopCmps
 350                 
 351                 mov     8(%windowbestlen, %rdx), %rax
 352                 xor     8(%prev, %rdx), %rax
 353                 jnz     LeaveLoopCmps8
 354 
 355                 mov     16(%windowbestlen, %rdx), %rax
 356                 xor     16(%prev, %rdx), %rax
 357                 jnz     LeaveLoopCmps16
 358                                 
 359                 add     $24, %rdx
 360                 jnz     LoopCmps
 361                 jmp     LenMaximum
 362 #       if 0
 363 /*
 364  * This three-liner is tantalizingly simple, but bsf is a slow instruction,
 365  * and the complicated alternative down below is quite a bit faster. Sad...
 366  */
 367 
 368 LeaveLoopCmps:  bsf     %rax, %rax /* find the first non-zero bit */
 369                 shrl    $3, %eax /* divide by 8 to get the byte */
 370                 add     %rax, %rdx
 371 #       else
 372 LeaveLoopCmps16:
 373                 add     $8, %rdx
 374 LeaveLoopCmps8:
 375                 add     $8, %rdx
 376 LeaveLoopCmps:  testl   $0xFFFFFFFF, %eax /* Check the first 4 bytes */
 377                 jnz     Check16
 378                 add     $4, %rdx
 379                 shr     $32, %rax
 380 Check16:        testw   $0xFFFF, %ax
 381                 jnz     LenLower
 382                 add     $2, %rdx
 383                 shrl    $16, %eax
 384 LenLower:       subb    $1, %al
 385                 adc     $0, %rdx
 386 #       endif
 387 #endif
 388 
 389 /* Calculate the length of the match. If it is longer than MAX_MATCH,   */
 390 /* then automatically accept it as the best possible match and leave.   */
 391 
 392                 lea     (%prev, %rdx), %rax
 393                 sub     %scan, %rax
 394                 cmpl    $MAX_MATCH, %eax
 395                 jge     LenMaximum
 396 
 397 /* If the length of the match is not longer than the best match we      */
 398 /* have so far, then forget it and return to the lookup loop.           */
 399 
 400                 cmpl    %bestlend, %eax
 401                 jg      LongerMatch
 402                 mov     _windowbestlen, %windowbestlen
 403                 mov     dsPrev, %prev
 404                 movl    _chainlenwmask, %edx
 405                 jmp     LookupLoop
 406 
 407 /*         s->match_start = cur_match;                                       */
 408 /*         best_len = len;                                              */
 409 /*         if (len >= nice_match) break;                             */
 410 /*         scan_end = *(ushf*)(scan+best_len-1);                        */
 411 
 412 LongerMatch:
 413                 movl    %eax, %bestlend
 414                 movl    %curmatchd, dsMatchStart
 415                 cmpl    %nicematch, %eax
 416                 jge     LeaveNow
 417 
 418                 lea     (%window, %bestlen), %windowbestlen
 419                 mov     %windowbestlen, _windowbestlen
 420 
 421                 movzwl  -1(%scan, %rax), %scanend
 422                 mov     dsPrev, %prev
 423                 movl    _chainlenwmask, %chainlenwmask
 424                 jmp     LookupLoop
 425 
 426 /* Accept the current string, with the maximum possible length.         */
 427 
 428 LenMaximum:
 429                 movl    $MAX_MATCH, %bestlend
 430                 movl    %curmatchd, dsMatchStart
 431 
 432 /* if ((uInt)best_len <= s->lookahead) return (uInt)best_len;             */
 433 /* return s->lookahead;                                                      */
 434 
 435 LeaveNow:
 436                 movl    dsLookahead, %eax
 437                 cmpl    %eax, %bestlend
 438                 cmovngl %bestlend, %eax
 439 LookaheadRet:
 440 
 441 /* Restore the registers and return from whence we came.                        */
 442 
 443         mov     save_rsi, %rsi
 444         mov     save_rbx, %rbx
 445         mov     save_r12, %r12
 446         mov     save_r13, %r13
 447         mov     save_r14, %r14
 448         mov     save_r15, %r15
 449 
 450         ret
 451 
 452 match_init:     ret