1 ;uInt longest_match_x64( 2 ; deflate_state *s, 3 ; IPos cur_match); /* current match */ 4 5 ; gvmat64.asm -- Asm portion of the optimized longest_match for 32 bits x86_64 6 ; (AMD64 on Athlon 64, Opteron, Phenom 7 ; and Intel EM64T on Pentium 4 with EM64T, Pentium D, Core 2 Duo, Core I5/I7) 8 ; Copyright (C) 1995-2010 Jean-loup Gailly, Brian Raiter and Gilles Vollant. 9 ; 10 ; File written by Gilles Vollant, by converting to assembly the longest_match 11 ; from Jean-loup Gailly in deflate.c of zLib and infoZip zip. 12 ; 13 ; and by taking inspiration on asm686 with masm, optimised assembly code 14 ; from Brian Raiter, written 1998 15 ; 16 ; This software is provided 'as-is', without any express or implied 17 ; warranty. In no event will the authors be held liable for any damages 18 ; arising from the use of this software. 19 ; 20 ; Permission is granted to anyone to use this software for any purpose, 21 ; including commercial applications, and to alter it and redistribute it 22 ; freely, subject to the following restrictions: 23 ; 24 ; 1. The origin of this software must not be misrepresented; you must not 25 ; claim that you wrote the original software. If you use this software 26 ; in a product, an acknowledgment in the product documentation would be 27 ; appreciated but is not required. 28 ; 2. Altered source versions must be plainly marked as such, and must not be 29 ; misrepresented as being the original software 30 ; 3. This notice may not be removed or altered from any source distribution. 31 ; 32 ; 33 ; 34 ; http://www.zlib.net 35 ; http://www.winimage.com/zLibDll 36 ; http://www.muppetlabs.com/~breadbox/software/assembly.html 37 ; 38 ; to compile this file for infozip Zip, I use option: 39 ; ml64.exe /Flgvmat64 /c /Zi /DINFOZIP gvmat64.asm 40 ; 41 ; to compile this file for zLib, I use option: 42 ; ml64.exe /Flgvmat64 /c /Zi gvmat64.asm 43 ; Be carrefull to adapt zlib1222add below to your version of zLib 44 ; (if you use a version of zLib before 1.0.4 or after 1.2.2.2, change 45 ; value of zlib1222add later) 46 ; 47 ; This file compile with Microsoft Macro Assembler (x64) for AMD64 48 ; 49 ; ml64.exe is given with Visual Studio 2005/2008/2010 and Windows WDK 50 ; 51 ; (you can get Windows WDK with ml64 for AMD64 from 52 ; http://www.microsoft.com/whdc/Devtools/wdk/default.mspx for low price) 53 ; 54 55 56 ;uInt longest_match(s, cur_match) 57 ; deflate_state *s; 58 ; IPos cur_match; /* current match */ 59 .code 60 longest_match PROC 61 62 63 ;LocalVarsSize equ 88 64 LocalVarsSize equ 72 65 66 ; register used : rax,rbx,rcx,rdx,rsi,rdi,r8,r9,r10,r11,r12 67 ; free register : r14,r15 68 ; register can be saved : rsp 69 70 chainlenwmask equ rsp + 8 - LocalVarsSize ; high word: current chain len 71 ; low word: s->wmask 72 ;window equ rsp + xx - LocalVarsSize ; local copy of s->window ; stored in r10 73 ;windowbestlen equ rsp + xx - LocalVarsSize ; s->window + bestlen , use r10+r11 74 ;scanstart equ rsp + xx - LocalVarsSize ; first two bytes of string ; stored in r12w 75 ;scanend equ rsp + xx - LocalVarsSize ; last two bytes of string use ebx 76 ;scanalign equ rsp + xx - LocalVarsSize ; dword-misalignment of string r13 77 ;bestlen equ rsp + xx - LocalVarsSize ; size of best match so far -> r11d 78 ;scan equ rsp + xx - LocalVarsSize ; ptr to string wanting match -> r9 79 IFDEF INFOZIP 80 ELSE 81 nicematch equ (rsp + 16 - LocalVarsSize) ; a good enough match size 82 ENDIF 83 84 save_rdi equ rsp + 24 - LocalVarsSize 85 save_rsi equ rsp + 32 - LocalVarsSize 86 save_rbx equ rsp + 40 - LocalVarsSize 87 save_rbp equ rsp + 48 - LocalVarsSize 88 save_r12 equ rsp + 56 - LocalVarsSize 89 save_r13 equ rsp + 64 - LocalVarsSize 90 ;save_r14 equ rsp + 72 - LocalVarsSize 91 ;save_r15 equ rsp + 80 - LocalVarsSize 92 93 94 ; summary of register usage 95 ; scanend ebx 96 ; scanendw bx 97 ; chainlenwmask edx 98 ; curmatch rsi 99 ; curmatchd esi 100 ; windowbestlen r8 101 ; scanalign r9 102 ; scanalignd r9d 103 ; window r10 104 ; bestlen r11 105 ; bestlend r11d 106 ; scanstart r12d 107 ; scanstartw r12w 108 ; scan r13 109 ; nicematch r14d 110 ; limit r15 111 ; limitd r15d 112 ; prev rcx 113 114 ; all the +4 offsets are due to the addition of pending_buf_size (in zlib 115 ; in the deflate_state structure since the asm code was first written 116 ; (if you compile with zlib 1.0.4 or older, remove the +4). 117 ; Note : these value are good with a 8 bytes boundary pack structure 118 119 120 MAX_MATCH equ 258 121 MIN_MATCH equ 3 122 MIN_LOOKAHEAD equ (MAX_MATCH+MIN_MATCH+1) 123 124 125 ;;; Offsets for fields in the deflate_state structure. These numbers 126 ;;; are calculated from the definition of deflate_state, with the 127 ;;; assumption that the compiler will dword-align the fields. (Thus, 128 ;;; changing the definition of deflate_state could easily cause this 129 ;;; program to crash horribly, without so much as a warning at 130 ;;; compile time. Sigh.) 131 132 ; all the +zlib1222add offsets are due to the addition of fields 133 ; in zlib in the deflate_state structure since the asm code was first written 134 ; (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)"). 135 ; (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0"). 136 ; if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8"). 137 138 139 IFDEF INFOZIP 140 141 _DATA SEGMENT 142 COMM window_size:DWORD 143 ; WMask ; 7fff 144 COMM window:BYTE:010040H 145 COMM prev:WORD:08000H 146 ; MatchLen : unused 147 ; PrevMatch : unused 148 COMM strstart:DWORD 149 COMM match_start:DWORD 150 ; Lookahead : ignore 151 COMM prev_length:DWORD ; PrevLen 152 COMM max_chain_length:DWORD 153 COMM good_match:DWORD 154 COMM nice_match:DWORD 155 prev_ad equ OFFSET prev 156 window_ad equ OFFSET window 157 nicematch equ nice_match 158 _DATA ENDS 159 WMask equ 07fffh 160 161 ELSE 162 163 IFNDEF zlib1222add 164 zlib1222add equ 8 165 ENDIF 166 dsWSize equ 56+zlib1222add+(zlib1222add/2) 167 dsWMask equ 64+zlib1222add+(zlib1222add/2) 168 dsWindow equ 72+zlib1222add 169 dsPrev equ 88+zlib1222add 170 dsMatchLen equ 128+zlib1222add 171 dsPrevMatch equ 132+zlib1222add 172 dsStrStart equ 140+zlib1222add 173 dsMatchStart equ 144+zlib1222add 174 dsLookahead equ 148+zlib1222add 175 dsPrevLen equ 152+zlib1222add 176 dsMaxChainLen equ 156+zlib1222add 177 dsGoodMatch equ 172+zlib1222add 178 dsNiceMatch equ 176+zlib1222add 179 180 window_size equ [ rcx + dsWSize] 181 WMask equ [ rcx + dsWMask] 182 window_ad equ [ rcx + dsWindow] 183 prev_ad equ [ rcx + dsPrev] 184 strstart equ [ rcx + dsStrStart] 185 match_start equ [ rcx + dsMatchStart] 186 Lookahead equ [ rcx + dsLookahead] ; 0ffffffffh on infozip 187 prev_length equ [ rcx + dsPrevLen] 188 max_chain_length equ [ rcx + dsMaxChainLen] 189 good_match equ [ rcx + dsGoodMatch] 190 nice_match equ [ rcx + dsNiceMatch] 191 ENDIF 192 193 ; parameter 1 in r8(deflate state s), param 2 in rdx (cur match) 194 195 ; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and 196 ; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp 197 ; 198 ; All registers must be preserved across the call, except for 199 ; rax, rcx, rdx, r8, r9, r10, and r11, which are scratch. 200 201 202 203 ;;; Save registers that the compiler may be using, and adjust esp to 204 ;;; make room for our stack frame. 205 206 207 ;;; Retrieve the function arguments. r8d will hold cur_match 208 ;;; throughout the entire function. edx will hold the pointer to the 209 ;;; deflate_state structure during the function's setup (before 210 ;;; entering the main loop. 211 212 ; parameter 1 in rcx (deflate_state* s), param 2 in edx -> r8 (cur match) 213 214 ; this clear high 32 bits of r8, which can be garbage in both r8 and rdx 215 216 mov [save_rdi],rdi 217 mov [save_rsi],rsi 218 mov [save_rbx],rbx 219 mov [save_rbp],rbp 220 IFDEF INFOZIP 221 mov r8d,ecx 222 ELSE 223 mov r8d,edx 224 ENDIF 225 mov [save_r12],r12 226 mov [save_r13],r13 227 ; mov [save_r14],r14 228 ; mov [save_r15],r15 229 230 231 ;;; uInt wmask = s->w_mask; 232 ;;; unsigned chain_length = s->max_chain_length; 233 ;;; if (s->prev_length >= s->good_match) { 234 ;;; chain_length >>= 2; 235 ;;; } 236 237 mov edi, prev_length 238 mov esi, good_match 239 mov eax, WMask 240 mov ebx, max_chain_length 241 cmp edi, esi 242 jl LastMatchGood 243 shr ebx, 2 244 LastMatchGood: 245 246 ;;; chainlen is decremented once beforehand so that the function can 247 ;;; use the sign flag instead of the zero flag for the exit test. 248 ;;; It is then shifted into the high word, to make room for the wmask 249 ;;; value, which it will always accompany. 250 251 dec ebx 252 shl ebx, 16 253 or ebx, eax 254 255 ;;; on zlib only 256 ;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; 257 258 IFDEF INFOZIP 259 mov [chainlenwmask], ebx 260 ; on infozip nice_match = [nice_match] 261 ELSE 262 mov eax, nice_match 263 mov [chainlenwmask], ebx 264 mov r10d, Lookahead 265 cmp r10d, eax 266 cmovnl r10d, eax 267 mov [nicematch],r10d 268 ENDIF 269 270 ;;; register Bytef *scan = s->window + s->strstart; 271 mov r10, window_ad 272 mov ebp, strstart 273 lea r13, [r10 + rbp] 274 275 ;;; Determine how many bytes the scan ptr is off from being 276 ;;; dword-aligned. 277 278 mov r9,r13 279 neg r13 280 and r13,3 281 282 ;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ? 283 ;;; s->strstart - (IPos)MAX_DIST(s) : NIL; 284 IFDEF INFOZIP 285 mov eax,07efah ; MAX_DIST = (WSIZE-MIN_LOOKAHEAD) (0x8000-(3+8+1)) 286 ELSE 287 mov eax, window_size 288 sub eax, MIN_LOOKAHEAD 289 ENDIF 290 xor edi,edi 291 sub ebp, eax 292 293 mov r11d, prev_length 294 295 cmovng ebp,edi 296 297 ;;; int best_len = s->prev_length; 298 299 300 ;;; Store the sum of s->window + best_len in esi locally, and in esi. 301 302 lea rsi,[r10+r11] 303 304 ;;; register ush scan_start = *(ushf*)scan; 305 ;;; register ush scan_end = *(ushf*)(scan+best_len-1); 306 ;;; Posf *prev = s->prev; 307 308 movzx r12d,word ptr [r9] 309 movzx ebx, word ptr [r9 + r11 - 1] 310 311 mov rdi, prev_ad 312 313 ;;; Jump into the main loop. 314 315 mov edx, [chainlenwmask] 316 317 cmp bx,word ptr [rsi + r8 - 1] 318 jz LookupLoopIsZero 319 320 LookupLoop1: 321 and r8d, edx 322 323 movzx r8d, word ptr [rdi + r8*2] 324 cmp r8d, ebp 325 jbe LeaveNow 326 sub edx, 00010000h 327 js LeaveNow 328 329 LoopEntry1: 330 cmp bx,word ptr [rsi + r8 - 1] 331 jz LookupLoopIsZero 332 333 LookupLoop2: 334 and r8d, edx 335 336 movzx r8d, word ptr [rdi + r8*2] 337 cmp r8d, ebp 338 jbe LeaveNow 339 sub edx, 00010000h 340 js LeaveNow 341 342 LoopEntry2: 343 cmp bx,word ptr [rsi + r8 - 1] 344 jz LookupLoopIsZero 345 346 LookupLoop4: 347 and r8d, edx 348 349 movzx r8d, word ptr [rdi + r8*2] 350 cmp r8d, ebp 351 jbe LeaveNow 352 sub edx, 00010000h 353 js LeaveNow 354 355 LoopEntry4: 356 357 cmp bx,word ptr [rsi + r8 - 1] 358 jnz LookupLoop1 359 jmp LookupLoopIsZero 360 361 362 ;;; do { 363 ;;; match = s->window + cur_match; 364 ;;; if (*(ushf*)(match+best_len-1) != scan_end || 365 ;;; *(ushf*)match != scan_start) continue; 366 ;;; [...] 367 ;;; } while ((cur_match = prev[cur_match & wmask]) > limit 368 ;;; && --chain_length != 0); 369 ;;; 370 ;;; Here is the inner loop of the function. The function will spend the 371 ;;; majority of its time in this loop, and majority of that time will 372 ;;; be spent in the first ten instructions. 373 ;;; 374 ;;; Within this loop: 375 ;;; ebx = scanend 376 ;;; r8d = curmatch 377 ;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask) 378 ;;; esi = windowbestlen - i.e., (window + bestlen) 379 ;;; edi = prev 380 ;;; ebp = limit 381 382 LookupLoop: 383 and r8d, edx 384 385 movzx r8d, word ptr [rdi + r8*2] 386 cmp r8d, ebp 387 jbe LeaveNow 388 sub edx, 00010000h 389 js LeaveNow 390 391 LoopEntry: 392 393 cmp bx,word ptr [rsi + r8 - 1] 394 jnz LookupLoop1 395 LookupLoopIsZero: 396 cmp r12w, word ptr [r10 + r8] 397 jnz LookupLoop1 398 399 400 ;;; Store the current value of chainlen. 401 mov [chainlenwmask], edx 402 403 ;;; Point edi to the string under scrutiny, and esi to the string we 404 ;;; are hoping to match it up with. In actuality, esi and edi are 405 ;;; both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and edx is 406 ;;; initialized to -(MAX_MATCH_8 - scanalign). 407 408 lea rsi,[r8+r10] 409 mov rdx, 0fffffffffffffef8h; -(MAX_MATCH_8) 410 lea rsi, [rsi + r13 + 0108h] ;MAX_MATCH_8] 411 lea rdi, [r9 + r13 + 0108h] ;MAX_MATCH_8] 412 413 prefetcht1 [rsi+rdx] 414 prefetcht1 [rdi+rdx] 415 416 417 ;;; Test the strings for equality, 8 bytes at a time. At the end, 418 ;;; adjust rdx so that it is offset to the exact byte that mismatched. 419 ;;; 420 ;;; We already know at this point that the first three bytes of the 421 ;;; strings match each other, and they can be safely passed over before 422 ;;; starting the compare loop. So what this code does is skip over 0-3 423 ;;; bytes, as much as necessary in order to dword-align the edi 424 ;;; pointer. (rsi will still be misaligned three times out of four.) 425 ;;; 426 ;;; It should be confessed that this loop usually does not represent 427 ;;; much of the total running time. Replacing it with a more 428 ;;; straightforward "rep cmpsb" would not drastically degrade 429 ;;; performance. 430 431 432 LoopCmps: 433 mov rax, [rsi + rdx] 434 xor rax, [rdi + rdx] 435 jnz LeaveLoopCmps 436 437 mov rax, [rsi + rdx + 8] 438 xor rax, [rdi + rdx + 8] 439 jnz LeaveLoopCmps8 440 441 442 mov rax, [rsi + rdx + 8+8] 443 xor rax, [rdi + rdx + 8+8] 444 jnz LeaveLoopCmps16 445 446 add rdx,8+8+8 447 448 jnz short LoopCmps 449 jmp short LenMaximum 450 LeaveLoopCmps16: add rdx,8 451 LeaveLoopCmps8: add rdx,8 452 LeaveLoopCmps: 453 454 test eax, 0000FFFFh 455 jnz LenLower 456 457 test eax,0ffffffffh 458 459 jnz LenLower32 460 461 add rdx,4 462 shr rax,32 463 or ax,ax 464 jnz LenLower 465 466 LenLower32: 467 shr eax,16 468 add rdx,2 469 LenLower: sub al, 1 470 adc rdx, 0 471 ;;; Calculate the length of the match. If it is longer than MAX_MATCH, 472 ;;; then automatically accept it as the best possible match and leave. 473 474 lea rax, [rdi + rdx] 475 sub rax, r9 476 cmp eax, MAX_MATCH 477 jge LenMaximum 478 479 ;;; If the length of the match is not longer than the best match we 480 ;;; have so far, then forget it and return to the lookup loop. 481 ;/////////////////////////////////// 482 483 cmp eax, r11d 484 jg LongerMatch 485 486 lea rsi,[r10+r11] 487 488 mov rdi, prev_ad 489 mov edx, [chainlenwmask] 490 jmp LookupLoop 491 492 ;;; s->match_start = cur_match; 493 ;;; best_len = len; 494 ;;; if (len >= nice_match) break; 495 ;;; scan_end = *(ushf*)(scan+best_len-1); 496 497 LongerMatch: 498 mov r11d, eax 499 mov match_start, r8d 500 cmp eax, [nicematch] 501 jge LeaveNow 502 503 lea rsi,[r10+rax] 504 505 movzx ebx, word ptr [r9 + rax - 1] 506 mov rdi, prev_ad 507 mov edx, [chainlenwmask] 508 jmp LookupLoop 509 510 ;;; Accept the current string, with the maximum possible length. 511 512 LenMaximum: 513 mov r11d,MAX_MATCH 514 mov match_start, r8d 515 516 ;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len; 517 ;;; return s->lookahead; 518 519 LeaveNow: 520 IFDEF INFOZIP 521 mov eax,r11d 522 ELSE 523 mov eax, Lookahead 524 cmp r11d, eax 525 cmovng eax, r11d 526 ENDIF 527 528 ;;; Restore the stack and return from whence we came. 529 530 531 mov rsi,[save_rsi] 532 mov rdi,[save_rdi] 533 mov rbx,[save_rbx] 534 mov rbp,[save_rbp] 535 mov r12,[save_r12] 536 mov r13,[save_r13] 537 ; mov r14,[save_r14] 538 ; mov r15,[save_r15] 539 540 541 ret 0 542 ; please don't remove this string ! 543 ; Your can freely use gvmat64 in any free or commercial app 544 ; but it is far better don't remove the string in the binary! 545 db 0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998, converted to amd 64 by Gilles Vollant 2005",0dh,0ah,0 546 longest_match ENDP 547 548 match_init PROC 549 ret 0 550 match_init ENDP 551 552 553 END