1 ;uInt longest_match_x64(
   2 ;    deflate_state *s,
   3 ;    IPos cur_match);                             /* current match */
   4 
   5 ; gvmat64.asm -- Asm portion of the optimized longest_match for 32 bits x86_64
   6 ;  (AMD64 on Athlon 64, Opteron, Phenom
   7 ;     and Intel EM64T on Pentium 4 with EM64T, Pentium D, Core 2 Duo, Core I5/I7)
   8 ; Copyright (C) 1995-2010 Jean-loup Gailly, Brian Raiter and Gilles Vollant.
   9 ;
  10 ; File written by Gilles Vollant, by converting to assembly the longest_match
  11 ;  from Jean-loup Gailly in deflate.c of zLib and infoZip zip.
  12 ;
  13 ;  and by taking inspiration on asm686 with masm, optimised assembly code
  14 ;        from Brian Raiter, written 1998
  15 ;
  16 ;  This software is provided 'as-is', without any express or implied
  17 ;  warranty.  In no event will the authors be held liable for any damages
  18 ;  arising from the use of this software.
  19 ;
  20 ;  Permission is granted to anyone to use this software for any purpose,
  21 ;  including commercial applications, and to alter it and redistribute it
  22 ;  freely, subject to the following restrictions:
  23 ;
  24 ;  1. The origin of this software must not be misrepresented; you must not
  25 ;     claim that you wrote the original software. If you use this software
  26 ;     in a product, an acknowledgment in the product documentation would be
  27 ;     appreciated but is not required.
  28 ;  2. Altered source versions must be plainly marked as such, and must not be
  29 ;     misrepresented as being the original software
  30 ;  3. This notice may not be removed or altered from any source distribution.
  31 ;
  32 ;
  33 ;
  34 ;         http://www.zlib.net
  35 ;         http://www.winimage.com/zLibDll
  36 ;         http://www.muppetlabs.com/~breadbox/software/assembly.html
  37 ;
  38 ; to compile this file for infozip Zip, I use option:
  39 ;   ml64.exe /Flgvmat64 /c /Zi /DINFOZIP gvmat64.asm
  40 ;
  41 ; to compile this file for zLib, I use option:
  42 ;   ml64.exe /Flgvmat64 /c /Zi gvmat64.asm
  43 ; Be carrefull to adapt zlib1222add below to your version of zLib
  44 ;   (if you use a version of zLib before 1.0.4 or after 1.2.2.2, change
  45 ;    value of zlib1222add later)
  46 ;
  47 ; This file compile with Microsoft Macro Assembler (x64) for AMD64
  48 ;
  49 ;   ml64.exe is given with Visual Studio 2005/2008/2010 and Windows WDK
  50 ;
  51 ;   (you can get Windows WDK with ml64 for AMD64 from
  52 ;      http://www.microsoft.com/whdc/Devtools/wdk/default.mspx for low price)
  53 ;
  54 
  55 
  56 ;uInt longest_match(s, cur_match)
  57 ;    deflate_state *s;
  58 ;    IPos cur_match;                             /* current match */
  59 .code
  60 longest_match PROC
  61 
  62 
  63 ;LocalVarsSize   equ 88
  64  LocalVarsSize   equ 72
  65 
  66 ; register used : rax,rbx,rcx,rdx,rsi,rdi,r8,r9,r10,r11,r12
  67 ; free register :  r14,r15
  68 ; register can be saved : rsp
  69 
  70  chainlenwmask   equ  rsp + 8 - LocalVarsSize    ; high word: current chain len
  71                                                  ; low word: s->wmask
  72 ;window          equ  rsp + xx - LocalVarsSize   ; local copy of s->window ; stored in r10
  73 ;windowbestlen   equ  rsp + xx - LocalVarsSize   ; s->window + bestlen , use r10+r11
  74 ;scanstart       equ  rsp + xx - LocalVarsSize   ; first two bytes of string ; stored in r12w
  75 ;scanend         equ  rsp + xx - LocalVarsSize   ; last two bytes of string use ebx
  76 ;scanalign       equ  rsp + xx - LocalVarsSize   ; dword-misalignment of string r13
  77 ;bestlen         equ  rsp + xx - LocalVarsSize   ; size of best match so far -> r11d
  78 ;scan            equ  rsp + xx - LocalVarsSize   ; ptr to string wanting match -> r9
  79 IFDEF INFOZIP
  80 ELSE
  81  nicematch       equ  (rsp + 16 - LocalVarsSize) ; a good enough match size
  82 ENDIF
  83 
  84 save_rdi        equ  rsp + 24 - LocalVarsSize
  85 save_rsi        equ  rsp + 32 - LocalVarsSize
  86 save_rbx        equ  rsp + 40 - LocalVarsSize
  87 save_rbp        equ  rsp + 48 - LocalVarsSize
  88 save_r12        equ  rsp + 56 - LocalVarsSize
  89 save_r13        equ  rsp + 64 - LocalVarsSize
  90 ;save_r14        equ  rsp + 72 - LocalVarsSize
  91 ;save_r15        equ  rsp + 80 - LocalVarsSize
  92 
  93 
  94 ; summary of register usage
  95 ; scanend     ebx
  96 ; scanendw    bx
  97 ; chainlenwmask   edx
  98 ; curmatch    rsi
  99 ; curmatchd   esi
 100 ; windowbestlen   r8
 101 ; scanalign   r9
 102 ; scanalignd  r9d
 103 ; window      r10
 104 ; bestlen     r11
 105 ; bestlend    r11d
 106 ; scanstart   r12d
 107 ; scanstartw  r12w
 108 ; scan        r13
 109 ; nicematch   r14d
 110 ; limit       r15
 111 ; limitd      r15d
 112 ; prev        rcx
 113 
 114 ;  all the +4 offsets are due to the addition of pending_buf_size (in zlib
 115 ;  in the deflate_state structure since the asm code was first written
 116 ;  (if you compile with zlib 1.0.4 or older, remove the +4).
 117 ;  Note : these value are good with a 8 bytes boundary pack structure
 118 
 119 
 120     MAX_MATCH           equ     258
 121     MIN_MATCH           equ     3
 122     MIN_LOOKAHEAD       equ     (MAX_MATCH+MIN_MATCH+1)
 123 
 124 
 125 ;;; Offsets for fields in the deflate_state structure. These numbers
 126 ;;; are calculated from the definition of deflate_state, with the
 127 ;;; assumption that the compiler will dword-align the fields. (Thus,
 128 ;;; changing the definition of deflate_state could easily cause this
 129 ;;; program to crash horribly, without so much as a warning at
 130 ;;; compile time. Sigh.)
 131 
 132 ;  all the +zlib1222add offsets are due to the addition of fields
 133 ;  in zlib in the deflate_state structure since the asm code was first written
 134 ;  (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
 135 ;  (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
 136 ;  if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8").
 137 
 138 
 139 IFDEF INFOZIP
 140 
 141 _DATA   SEGMENT
 142 COMM    window_size:DWORD
 143 ; WMask ; 7fff
 144 COMM    window:BYTE:010040H
 145 COMM    prev:WORD:08000H
 146 ; MatchLen : unused
 147 ; PrevMatch : unused
 148 COMM    strstart:DWORD
 149 COMM    match_start:DWORD
 150 ; Lookahead : ignore
 151 COMM    prev_length:DWORD ; PrevLen
 152 COMM    max_chain_length:DWORD
 153 COMM    good_match:DWORD
 154 COMM    nice_match:DWORD
 155 prev_ad equ OFFSET prev
 156 window_ad equ OFFSET window
 157 nicematch equ nice_match
 158 _DATA ENDS
 159 WMask equ 07fffh
 160 
 161 ELSE
 162 
 163   IFNDEF zlib1222add
 164     zlib1222add equ 8
 165   ENDIF
 166 dsWSize         equ 56+zlib1222add+(zlib1222add/2)
 167 dsWMask         equ 64+zlib1222add+(zlib1222add/2)
 168 dsWindow        equ 72+zlib1222add
 169 dsPrev          equ 88+zlib1222add
 170 dsMatchLen      equ 128+zlib1222add
 171 dsPrevMatch     equ 132+zlib1222add
 172 dsStrStart      equ 140+zlib1222add
 173 dsMatchStart    equ 144+zlib1222add
 174 dsLookahead     equ 148+zlib1222add
 175 dsPrevLen       equ 152+zlib1222add
 176 dsMaxChainLen   equ 156+zlib1222add
 177 dsGoodMatch     equ 172+zlib1222add
 178 dsNiceMatch     equ 176+zlib1222add
 179 
 180 window_size     equ [ rcx + dsWSize]
 181 WMask           equ [ rcx + dsWMask]
 182 window_ad       equ [ rcx + dsWindow]
 183 prev_ad         equ [ rcx + dsPrev]
 184 strstart        equ [ rcx + dsStrStart]
 185 match_start     equ [ rcx + dsMatchStart]
 186 Lookahead       equ [ rcx + dsLookahead] ; 0ffffffffh on infozip
 187 prev_length     equ [ rcx + dsPrevLen]
 188 max_chain_length equ [ rcx + dsMaxChainLen]
 189 good_match      equ [ rcx + dsGoodMatch]
 190 nice_match      equ [ rcx + dsNiceMatch]
 191 ENDIF
 192 
 193 ; parameter 1 in r8(deflate state s), param 2 in rdx (cur match)
 194 
 195 ; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and
 196 ; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp
 197 ;
 198 ; All registers must be preserved across the call, except for
 199 ;   rax, rcx, rdx, r8, r9, r10, and r11, which are scratch.
 200 
 201 
 202 
 203 ;;; Save registers that the compiler may be using, and adjust esp to
 204 ;;; make room for our stack frame.
 205 
 206 
 207 ;;; Retrieve the function arguments. r8d will hold cur_match
 208 ;;; throughout the entire function. edx will hold the pointer to the
 209 ;;; deflate_state structure during the function's setup (before
 210 ;;; entering the main loop.
 211 
 212 ; parameter 1 in rcx (deflate_state* s), param 2 in edx -> r8 (cur match)
 213 
 214 ; this clear high 32 bits of r8, which can be garbage in both r8 and rdx
 215 
 216         mov [save_rdi],rdi
 217         mov [save_rsi],rsi
 218         mov [save_rbx],rbx
 219         mov [save_rbp],rbp
 220 IFDEF INFOZIP
 221         mov r8d,ecx
 222 ELSE
 223         mov r8d,edx
 224 ENDIF
 225         mov [save_r12],r12
 226         mov [save_r13],r13
 227 ;        mov [save_r14],r14
 228 ;        mov [save_r15],r15
 229 
 230 
 231 ;;; uInt wmask = s->w_mask;
 232 ;;; unsigned chain_length = s->max_chain_length;
 233 ;;; if (s->prev_length >= s->good_match) {
 234 ;;;     chain_length >>= 2;
 235 ;;; }
 236 
 237         mov edi, prev_length
 238         mov esi, good_match
 239         mov eax, WMask
 240         mov ebx, max_chain_length
 241         cmp edi, esi
 242         jl  LastMatchGood
 243         shr ebx, 2
 244 LastMatchGood:
 245 
 246 ;;; chainlen is decremented once beforehand so that the function can
 247 ;;; use the sign flag instead of the zero flag for the exit test.
 248 ;;; It is then shifted into the high word, to make room for the wmask
 249 ;;; value, which it will always accompany.
 250 
 251         dec ebx
 252         shl ebx, 16
 253         or  ebx, eax
 254 
 255 ;;; on zlib only
 256 ;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
 257 
 258 IFDEF INFOZIP
 259         mov [chainlenwmask], ebx
 260 ; on infozip nice_match = [nice_match]
 261 ELSE
 262         mov eax, nice_match
 263         mov [chainlenwmask], ebx
 264         mov r10d, Lookahead
 265         cmp r10d, eax
 266         cmovnl r10d, eax
 267         mov [nicematch],r10d
 268 ENDIF
 269 
 270 ;;; register Bytef *scan = s->window + s->strstart;
 271         mov r10, window_ad
 272         mov ebp, strstart
 273         lea r13, [r10 + rbp]
 274 
 275 ;;; Determine how many bytes the scan ptr is off from being
 276 ;;; dword-aligned.
 277 
 278          mov r9,r13
 279          neg r13
 280          and r13,3
 281 
 282 ;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
 283 ;;;     s->strstart - (IPos)MAX_DIST(s) : NIL;
 284 IFDEF INFOZIP
 285         mov eax,07efah ; MAX_DIST = (WSIZE-MIN_LOOKAHEAD) (0x8000-(3+8+1))
 286 ELSE
 287         mov eax, window_size
 288         sub eax, MIN_LOOKAHEAD
 289 ENDIF
 290         xor edi,edi
 291         sub ebp, eax
 292 
 293         mov r11d, prev_length
 294 
 295         cmovng ebp,edi
 296 
 297 ;;; int best_len = s->prev_length;
 298 
 299 
 300 ;;; Store the sum of s->window + best_len in esi locally, and in esi.
 301 
 302        lea  rsi,[r10+r11]
 303 
 304 ;;; register ush scan_start = *(ushf*)scan;
 305 ;;; register ush scan_end   = *(ushf*)(scan+best_len-1);
 306 ;;; Posf *prev = s->prev;
 307 
 308         movzx r12d,word ptr [r9]
 309         movzx ebx, word ptr [r9 + r11 - 1]
 310 
 311         mov rdi, prev_ad
 312 
 313 ;;; Jump into the main loop.
 314 
 315         mov edx, [chainlenwmask]
 316 
 317         cmp bx,word ptr [rsi + r8 - 1]
 318         jz  LookupLoopIsZero
 319 
 320 LookupLoop1:
 321         and r8d, edx
 322 
 323         movzx   r8d, word ptr [rdi + r8*2]
 324         cmp r8d, ebp
 325         jbe LeaveNow
 326         sub edx, 00010000h
 327         js  LeaveNow
 328 
 329 LoopEntry1:
 330         cmp bx,word ptr [rsi + r8 - 1]
 331         jz  LookupLoopIsZero
 332 
 333 LookupLoop2:
 334         and r8d, edx
 335 
 336         movzx   r8d, word ptr [rdi + r8*2]
 337         cmp r8d, ebp
 338         jbe LeaveNow
 339         sub edx, 00010000h
 340         js  LeaveNow
 341 
 342 LoopEntry2:
 343         cmp bx,word ptr [rsi + r8 - 1]
 344         jz  LookupLoopIsZero
 345 
 346 LookupLoop4:
 347         and r8d, edx
 348 
 349         movzx   r8d, word ptr [rdi + r8*2]
 350         cmp r8d, ebp
 351         jbe LeaveNow
 352         sub edx, 00010000h
 353         js  LeaveNow
 354 
 355 LoopEntry4:
 356 
 357         cmp bx,word ptr [rsi + r8 - 1]
 358         jnz LookupLoop1
 359         jmp LookupLoopIsZero
 360 
 361 
 362 ;;; do {
 363 ;;;     match = s->window + cur_match;
 364 ;;;     if (*(ushf*)(match+best_len-1) != scan_end ||
 365 ;;;         *(ushf*)match != scan_start) continue;
 366 ;;;     [...]
 367 ;;; } while ((cur_match = prev[cur_match & wmask]) > limit
 368 ;;;          && --chain_length != 0);
 369 ;;;
 370 ;;; Here is the inner loop of the function. The function will spend the
 371 ;;; majority of its time in this loop, and majority of that time will
 372 ;;; be spent in the first ten instructions.
 373 ;;;
 374 ;;; Within this loop:
 375 ;;; ebx = scanend
 376 ;;; r8d = curmatch
 377 ;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
 378 ;;; esi = windowbestlen - i.e., (window + bestlen)
 379 ;;; edi = prev
 380 ;;; ebp = limit
 381 
 382 LookupLoop:
 383         and r8d, edx
 384 
 385         movzx   r8d, word ptr [rdi + r8*2]
 386         cmp r8d, ebp
 387         jbe LeaveNow
 388         sub edx, 00010000h
 389         js  LeaveNow
 390 
 391 LoopEntry:
 392 
 393         cmp bx,word ptr [rsi + r8 - 1]
 394         jnz LookupLoop1
 395 LookupLoopIsZero:
 396         cmp     r12w, word ptr [r10 + r8]
 397         jnz LookupLoop1
 398 
 399 
 400 ;;; Store the current value of chainlen.
 401         mov [chainlenwmask], edx
 402 
 403 ;;; Point edi to the string under scrutiny, and esi to the string we
 404 ;;; are hoping to match it up with. In actuality, esi and edi are
 405 ;;; both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and edx is
 406 ;;; initialized to -(MAX_MATCH_8 - scanalign).
 407 
 408         lea rsi,[r8+r10]
 409         mov rdx, 0fffffffffffffef8h; -(MAX_MATCH_8)
 410         lea rsi, [rsi + r13 + 0108h] ;MAX_MATCH_8]
 411         lea rdi, [r9 + r13 + 0108h] ;MAX_MATCH_8]
 412 
 413         prefetcht1 [rsi+rdx]
 414         prefetcht1 [rdi+rdx]
 415 
 416 
 417 ;;; Test the strings for equality, 8 bytes at a time. At the end,
 418 ;;; adjust rdx so that it is offset to the exact byte that mismatched.
 419 ;;;
 420 ;;; We already know at this point that the first three bytes of the
 421 ;;; strings match each other, and they can be safely passed over before
 422 ;;; starting the compare loop. So what this code does is skip over 0-3
 423 ;;; bytes, as much as necessary in order to dword-align the edi
 424 ;;; pointer. (rsi will still be misaligned three times out of four.)
 425 ;;;
 426 ;;; It should be confessed that this loop usually does not represent
 427 ;;; much of the total running time. Replacing it with a more
 428 ;;; straightforward "rep cmpsb" would not drastically degrade
 429 ;;; performance.
 430 
 431 
 432 LoopCmps:
 433         mov rax, [rsi + rdx]
 434         xor rax, [rdi + rdx]
 435         jnz LeaveLoopCmps
 436 
 437         mov rax, [rsi + rdx + 8]
 438         xor rax, [rdi + rdx + 8]
 439         jnz LeaveLoopCmps8
 440 
 441 
 442         mov rax, [rsi + rdx + 8+8]
 443         xor rax, [rdi + rdx + 8+8]
 444         jnz LeaveLoopCmps16
 445 
 446         add rdx,8+8+8
 447 
 448         jnz short LoopCmps
 449         jmp short LenMaximum
 450 LeaveLoopCmps16: add rdx,8
 451 LeaveLoopCmps8: add rdx,8
 452 LeaveLoopCmps:
 453 
 454         test    eax, 0000FFFFh
 455         jnz LenLower
 456 
 457         test eax,0ffffffffh
 458 
 459         jnz LenLower32
 460 
 461         add rdx,4
 462         shr rax,32
 463         or ax,ax
 464         jnz LenLower
 465 
 466 LenLower32:
 467         shr eax,16
 468         add rdx,2
 469 LenLower:   sub al, 1
 470         adc rdx, 0
 471 ;;; Calculate the length of the match. If it is longer than MAX_MATCH,
 472 ;;; then automatically accept it as the best possible match and leave.
 473 
 474         lea rax, [rdi + rdx]
 475         sub rax, r9
 476         cmp eax, MAX_MATCH
 477         jge LenMaximum
 478 
 479 ;;; If the length of the match is not longer than the best match we
 480 ;;; have so far, then forget it and return to the lookup loop.
 481 ;///////////////////////////////////
 482 
 483         cmp eax, r11d
 484         jg  LongerMatch
 485 
 486         lea rsi,[r10+r11]
 487 
 488         mov rdi, prev_ad
 489         mov edx, [chainlenwmask]
 490         jmp LookupLoop
 491 
 492 ;;;         s->match_start = cur_match;
 493 ;;;         best_len = len;
 494 ;;;         if (len >= nice_match) break;
 495 ;;;         scan_end = *(ushf*)(scan+best_len-1);
 496 
 497 LongerMatch:
 498         mov r11d, eax
 499         mov match_start, r8d
 500         cmp eax, [nicematch]
 501         jge LeaveNow
 502 
 503         lea rsi,[r10+rax]
 504 
 505         movzx   ebx, word ptr [r9 + rax - 1]
 506         mov rdi, prev_ad
 507         mov edx, [chainlenwmask]
 508         jmp LookupLoop
 509 
 510 ;;; Accept the current string, with the maximum possible length.
 511 
 512 LenMaximum:
 513         mov r11d,MAX_MATCH
 514         mov match_start, r8d
 515 
 516 ;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
 517 ;;; return s->lookahead;
 518 
 519 LeaveNow:
 520 IFDEF INFOZIP
 521         mov eax,r11d
 522 ELSE
 523         mov eax, Lookahead
 524         cmp r11d, eax
 525         cmovng eax, r11d
 526 ENDIF
 527 
 528 ;;; Restore the stack and return from whence we came.
 529 
 530 
 531         mov rsi,[save_rsi]
 532         mov rdi,[save_rdi]
 533         mov rbx,[save_rbx]
 534         mov rbp,[save_rbp]
 535         mov r12,[save_r12]
 536         mov r13,[save_r13]
 537 ;        mov r14,[save_r14]
 538 ;        mov r15,[save_r15]
 539 
 540 
 541         ret 0
 542 ; please don't remove this string !
 543 ; Your can freely use gvmat64 in any free or commercial app
 544 ; but it is far better don't remove the string in the binary!
 545     db     0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998, converted to amd 64 by Gilles Vollant 2005",0dh,0ah,0
 546 longest_match   ENDP
 547 
 548 match_init PROC
 549   ret 0
 550 match_init ENDP
 551 
 552 
 553 END