1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* Copyright (c) 1988 AT&T */ 23 /* All Rights Reserved */ 24 25 26 /* 27 * Copyright 2004 Sun Microsystems, Inc. All rights reserved. 28 * Use is subject to license terms. 29 */ 30 31 #pragma ident "%Z%%M% %I% %E% SMI" 32 33 #include <ctype.h> 34 #include <stdio.h> 35 #include <sys/types.h> 36 #include <sys/sysmacros.h> 37 #include <sys/byteorder.h> 38 #if SHARE 39 #include <sys/ipc.h> 40 #include <sys/shm.h> 41 #define ERR -1 42 #endif 43 #include "invlib.h" 44 #include "library.h" 45 46 #define DEBUG 0 /* debugging code and realloc messages */ 47 #define BLOCKSIZE 2 * BUFSIZ /* logical block size */ 48 #define LINEMAX 1000 /* sorted posting line max size */ 49 #define POSTINC 10000 /* posting buffer size increment */ 50 #define SEP ' ' /* sorted posting field separator */ 51 #define SETINC 100 /* posting set size increment */ 52 #define STATS 0 /* print statistics */ 53 #define SUPERINC 10000 /* super index size increment */ 54 #define TERMMAX 512 /* term max size */ 55 #define VERSION 1 /* inverted index format version */ 56 #define ZIPFSIZE 200 /* zipf curve size */ 57 #define FREAD "r" /* fopen for reading */ 58 #define FREADP "r+" /* fopen for update */ 59 #define FWRITE "w" /* fopen truncate or create for writing */ 60 #define FWRITEP "w+" /* fopen truncate or create for update */ 61 62 extern char *argv0; /* command name (must be set in main function) */ 63 64 int invbreak; 65 66 #if STATS 67 int showzipf; /* show postings per term distribution */ 68 #endif 69 70 static POSTING *item, *enditem, *item1 = NULL, *item2 = NULL; 71 static unsigned setsize1, setsize2; 72 static long numitems, totterm, zerolong; 73 static char *indexfile, *postingfile; 74 static FILE *outfile, *fpost; 75 static unsigned supersize = SUPERINC, supintsize; 76 static int numpost, numlogblk, amtused, nextpost, 77 lastinblk, numinvitems; 78 static POSTING *POST, *postptr; 79 static unsigned long *SUPINT, *supint, nextsupfing; 80 static char *SUPFING, *supfing; 81 static char thisterm[TERMMAX]; 82 static union { 83 long invblk[BLOCKSIZE / sizeof (long)]; 84 char chrblk[BLOCKSIZE]; 85 } logicalblk; 86 87 #if DEBUG || STATS 88 static long totpost; 89 #endif 90 91 #if STATS 92 static int zipf[ZIPFSIZE + 1]; 93 #endif 94 95 static void invcannotalloc(size_t n); 96 static void invcannotopen(char *file); 97 static void invcannotwrite(char *file); 98 static int invnewterm(void); 99 static int boolready(void); 100 101 long 102 invmake(char *invname, char *invpost, FILE *infile) 103 { 104 unsigned char *s; 105 long num; 106 int i; 107 long fileindex; 108 unsigned postsize = POSTINC * sizeof (POSTING); 109 unsigned long *intptr; 110 char line[LINEMAX]; 111 long tlong; 112 PARAM param; 113 POSTING posting; 114 #if STATS 115 int j; 116 unsigned maxtermlen = 0; 117 #endif 118 /* output file */ 119 if ((outfile = vpfopen(invname, FWRITEP)) == NULL) { 120 invcannotopen(invname); 121 return (0); 122 } 123 indexfile = invname; 124 (void) fseek(outfile, (long)BUFSIZ, 0); 125 126 /* posting file */ 127 if ((fpost = vpfopen(invpost, FWRITE)) == NULL) { 128 invcannotopen(invpost); 129 return (0); 130 } 131 postingfile = invpost; 132 nextpost = 0; 133 /* get space for the postings list */ 134 if ((POST = (POSTING *)malloc(postsize)) == NULL) { 135 invcannotalloc(postsize); 136 return (0); 137 } 138 postptr = POST; 139 /* get space for the superfinger (superindex) */ 140 if ((SUPFING = malloc(supersize)) == NULL) { 141 invcannotalloc(supersize); 142 return (0); 143 } 144 supfing = SUPFING; 145 supintsize = supersize / 40; 146 /* also for the superfinger index */ 147 if ((SUPINT = malloc(supintsize * sizeof (long))) == NULL) { 148 invcannotalloc(supintsize * sizeof (long)); 149 return (0); 150 } 151 supint = SUPINT; 152 supint++; /* leave first term open for a count */ 153 /* initialize using an empty term */ 154 (void) strcpy(thisterm, ""); 155 *supint++ = 0; 156 *supfing++ = ' '; 157 *supfing++ = '\0'; 158 nextsupfing = 2; 159 #if DEBUG || STATS 160 totpost = 0L; 161 #endif 162 totterm = 0L; 163 numpost = 1; 164 165 /* 166 * set up as though a block had come and gone, i.e., set up for 167 * new block 168 */ 169 amtused = 16; /* leave no space - init 3 words + one for luck */ 170 numinvitems = 0; 171 numlogblk = 0; 172 lastinblk = BLOCKSIZE; 173 174 /* now loop as long as more to read (till eof) */ 175 while (fgets(line, LINEMAX, infile) != NULL) { 176 #if DEBUG || STATS 177 ++totpost; 178 #endif 179 s = (unsigned char *) strchr(line, SEP); 180 if (s == NULL) /* where did this line come from ??? */ 181 continue; /* workaround: just skip it */ 182 *s = '\0'; 183 #if STATS 184 if ((i = strlen(line)) > maxtermlen) { 185 maxtermlen = i; 186 } 187 #endif 188 #if DEBUG 189 (void) printf("%ld: %s ", totpost, line); 190 (void) fflush(stdout); 191 #endif 192 if (strcmp(thisterm, line) == 0) { 193 if (postptr + 10 > POST + postsize / sizeof (POSTING)) { 194 i = postptr - POST; 195 postsize += POSTINC * sizeof (POSTING); 196 if ((POST = realloc(POST, postsize)) == NULL) { 197 invcannotalloc(postsize); 198 return (0); 199 } 200 postptr = i + POST; 201 #if DEBUG 202 (void) printf("reallocated post space to %u, " 203 "totpost=%ld\n", postsize, totpost); 204 #endif 205 } 206 numpost++; 207 } else { 208 /* have a new term */ 209 if (!invnewterm()) { 210 return (0); 211 } 212 (void) strcpy(thisterm, line); 213 numpost = 1; 214 postptr = POST; 215 fileindex = 0; 216 } 217 /* get the new posting */ 218 num = *++s - '!'; 219 i = 1; 220 do { 221 num = BASE * num + *++s - '!'; 222 } while (++i < PRECISION); 223 posting.lineoffset = num; 224 while (++fileindex < nsrcoffset && num > srcoffset[fileindex]) { 225 ; 226 } 227 posting.fileindex = --fileindex; 228 posting.type = *++s; 229 num = *++s - '!'; 230 if (*s != '\n') { 231 num = *++s - '!'; 232 while (*++s != '\n') { 233 num = BASE * num + *s - '!'; 234 } 235 posting.fcnoffset = num; 236 } else { 237 posting.fcnoffset = 0; 238 } 239 *postptr++ = posting; 240 #if DEBUG 241 (void) printf("%ld %ld %ld %ld\n", posting.fileindex, 242 posting.fcnoffset, posting.lineoffset, posting.type); 243 (void) fflush(stdout); 244 #endif 245 } 246 if (!invnewterm()) { 247 return (0); 248 } 249 /* now clean up final block */ 250 logicalblk.invblk[0] = numinvitems; 251 /* loops pointer around to start */ 252 logicalblk.invblk[1] = 0; 253 logicalblk.invblk[2] = numlogblk - 1; 254 if (fwrite((char *)&logicalblk, BLOCKSIZE, 1, outfile) == 0) { 255 goto cannotwrite; 256 } 257 numlogblk++; 258 /* write out block to save space. what in it doesn't matter */ 259 if (fwrite((char *)&logicalblk, BLOCKSIZE, 1, outfile) == 0) { 260 goto cannotwrite; 261 } 262 /* finish up the super finger */ 263 *SUPINT = numlogblk; 264 /* add to the offsets the size of the offset pointers */ 265 intptr = (SUPINT + 1); 266 i = (char *)supint - (char *)SUPINT; 267 while (intptr < supint) 268 *intptr++ += i; 269 /* write out the offsets (1 for the N at start) and the super finger */ 270 if (fwrite((char *)SUPINT, sizeof (*SUPINT), numlogblk + 1, 271 outfile) == 0 || 272 fwrite(SUPFING, 1, supfing - SUPFING, outfile) == 0) { 273 goto cannotwrite; 274 } 275 /* save the size for reference later */ 276 nextsupfing = sizeof (long) + sizeof (long) * numlogblk + 277 (supfing - SUPFING); 278 /* 279 * make sure the file ends at a logical block boundary. This is 280 * necessary for invinsert to correctly create extended blocks 281 */ 282 i = nextsupfing % BLOCKSIZE; 283 /* write out junk to fill log blk */ 284 if (fwrite(SUPFING, BLOCKSIZE - i, 1, outfile) == 0 || 285 fflush(outfile) == EOF) { 286 /* rewind doesn't check for write failure */ 287 goto cannotwrite; 288 } 289 /* write the control area */ 290 rewind(outfile); 291 param.version = VERSION; 292 param.filestat = 0; 293 param.sizeblk = BLOCKSIZE; 294 param.startbyte = (numlogblk + 1) * BLOCKSIZE + BUFSIZ; 295 param.supsize = nextsupfing; 296 param.cntlsize = BUFSIZ; 297 param.share = 0; 298 if (fwrite((char *)¶m, sizeof (param), 1, outfile) == 0) { 299 goto cannotwrite; 300 } 301 for (i = 0; i < 10; i++) /* for future use */ 302 if (fwrite((char *)&zerolong, sizeof (zerolong), 303 1, outfile) == 0) { 304 goto cannotwrite; 305 } 306 307 /* make first block loop backwards to last block */ 308 if (fflush(outfile) == EOF) { 309 /* fseek doesn't check for write failure */ 310 goto cannotwrite; 311 } 312 /* get to second word first block */ 313 (void) fseek(outfile, (long)BUFSIZ + 8, 0); 314 tlong = numlogblk - 1; 315 if (fwrite((char *)&tlong, sizeof (tlong), 1, outfile) == 0 || 316 fclose(outfile) == EOF) { 317 cannotwrite: 318 invcannotwrite(invname); 319 return (0); 320 } 321 if (fclose(fpost) == EOF) { 322 invcannotwrite(postingfile); 323 return (0); 324 } 325 --totterm; /* don't count null term */ 326 #if STATS 327 (void) printf("logical blocks = %d, postings = %ld, terms = %ld, " 328 "max term length = %d\n", numlogblk, totpost, totterm, maxtermlen); 329 if (showzipf) { 330 (void) printf( 331 "\n************* ZIPF curve ****************\n"); 332 for (j = ZIPFSIZE; j > 1; j--) 333 if (zipf[j]) 334 break; 335 for (i = 1; i < j; ++i) { 336 (void) printf("%3d -%6d ", i, zipf[i]); 337 if (i % 6 == 0) (void) putchar('\n'); 338 } 339 (void) printf(">%d-%6d\n", ZIPFSIZE, zipf[0]); 340 } 341 #endif 342 /* free all malloc'd memory */ 343 free(POST); 344 free(SUPFING); 345 free(SUPINT); 346 return (totterm); 347 } 348 349 /* add a term to the data base */ 350 351 static int 352 invnewterm(void) 353 { 354 int backupflag, i, j, maxback, holditems, gooditems, howfar; 355 int len, numwilluse, wdlen; 356 char *tptr, *tptr2, *tptr3; 357 union { 358 unsigned long packword[2]; 359 ENTRY e; 360 } iteminfo; 361 362 totterm++; 363 #if STATS 364 /* keep zipfian info on the distribution */ 365 if (numpost <= ZIPFSIZE) 366 zipf[numpost]++; 367 else 368 zipf[0]++; 369 #endif 370 len = strlen(thisterm); 371 wdlen = (len + (sizeof (long) - 1)) / sizeof (long); 372 numwilluse = (wdlen + 3) * sizeof (long); 373 /* new block if at least 1 item in block */ 374 if (numinvitems && numwilluse + amtused > BLOCKSIZE) { 375 /* set up new block */ 376 if (supfing + 500 > SUPFING + supersize) { 377 i = supfing - SUPFING; 378 supersize += 20000; 379 if ((SUPFING = realloc(SUPFING, supersize)) == NULL) { 380 invcannotalloc(supersize); 381 return (0); 382 } 383 supfing = i + SUPFING; 384 #if DEBUG 385 (void) printf("reallocated superfinger space to %d, " 386 "totpost=%ld\n", supersize, totpost); 387 #endif 388 } 389 /* check that room for the offset as well */ 390 if ((numlogblk + 10) > supintsize) { 391 i = supint - SUPINT; 392 supintsize += SUPERINC; 393 if ((SUPINT = realloc((char *)SUPINT, 394 supintsize * sizeof (long))) == NULL) { 395 invcannotalloc(supintsize * sizeof (long)); 396 return (0); 397 } 398 supint = i + SUPINT; 399 #if DEBUG 400 (void) printf("reallocated superfinger offset to %d, " 401 "totpost = %ld\n", supintsize * sizeof (long), 402 totpost); 403 #endif 404 } 405 /* See if backup is efficatious */ 406 backupflag = 0; 407 maxback = strlen(thisterm) / 10; 408 holditems = numinvitems; 409 if (maxback > numinvitems) 410 maxback = numinvitems - 2; 411 howfar = 0; 412 while (--maxback > 0) { 413 howfar++; 414 iteminfo.packword[0] = 415 logicalblk.invblk[--holditems * 2 + 416 (sizeof (long) - 1)]; 417 if ((i = iteminfo.e.size / 10) < maxback) { 418 maxback = i; 419 backupflag = howfar; 420 gooditems = holditems; 421 tptr2 = logicalblk.chrblk + iteminfo.e.offset; 422 } 423 } 424 /* see if backup will occur */ 425 if (backupflag) { 426 numinvitems = gooditems; 427 } 428 logicalblk.invblk[0] = numinvitems; 429 /* set forward pointer pointing to next */ 430 logicalblk.invblk[1] = numlogblk + 1; 431 /* set back pointer to last block */ 432 logicalblk.invblk[2] = numlogblk - 1; 433 if (fwrite((char *)logicalblk.chrblk, 1, 434 BLOCKSIZE, outfile) == 0) { 435 invcannotwrite(indexfile); 436 return (0); 437 } 438 amtused = 16; 439 numlogblk++; 440 /* check if had to back up, if so do it */ 441 if (backupflag) { 442 /* find out where the end of the new block is */ 443 iteminfo.packword[0] = 444 logicalblk.invblk[numinvitems * 2 + 1]; 445 tptr3 = logicalblk.chrblk + iteminfo.e.offset; 446 /* move the index for this block */ 447 for (i = 3; i <= (backupflag * 2 + 2); i++) { 448 logicalblk.invblk[i] = 449 logicalblk.invblk[numinvitems * 2+i]; 450 } 451 /* move the word into the super index */ 452 iteminfo.packword[0] = logicalblk.invblk[3]; 453 iteminfo.packword[1] = logicalblk.invblk[4]; 454 tptr2 = logicalblk.chrblk + iteminfo.e.offset; 455 (void) strncpy(supfing, tptr2, (int)iteminfo.e.size); 456 *(supfing + iteminfo.e.size) = '\0'; 457 #if DEBUG 458 (void) printf("backup %d at term=%s to term=%s\n", 459 backupflag, thisterm, supfing); 460 #endif 461 *supint++ = nextsupfing; 462 nextsupfing += strlen(supfing) + 1; 463 supfing += strlen(supfing) + 1; 464 /* now fix up the logical block */ 465 tptr = logicalblk.chrblk + lastinblk; 466 lastinblk = BLOCKSIZE; 467 tptr2 = logicalblk.chrblk + lastinblk; 468 j = tptr3 - tptr; 469 while (tptr3 > tptr) 470 *--tptr2 = *--tptr3; 471 lastinblk -= j; 472 amtused += (8 * backupflag + j); 473 for (i = 3; i < (backupflag * 2 + 2); i += 2) { 474 iteminfo.packword[0] = logicalblk.invblk[i]; 475 iteminfo.e.offset += (tptr2 - tptr3); 476 logicalblk.invblk[i] = iteminfo.packword[0]; 477 } 478 numinvitems = backupflag; 479 } else { /* no backup needed */ 480 numinvitems = 0; 481 lastinblk = BLOCKSIZE; 482 /* add new term to superindex */ 483 (void) strcpy(supfing, thisterm); 484 supfing += strlen(thisterm) + 1; 485 *supint++ = nextsupfing; 486 nextsupfing += strlen(thisterm) + 1; 487 } 488 } 489 lastinblk -= (numwilluse - 8); 490 iteminfo.e.offset = lastinblk; 491 iteminfo.e.size = (char)len; 492 iteminfo.e.space = 0; 493 iteminfo.e.post = numpost; 494 (void) strncpy(logicalblk.chrblk + lastinblk, thisterm, len); 495 amtused += numwilluse; 496 logicalblk.invblk[(lastinblk/sizeof (long))+wdlen] = nextpost; 497 if ((i = postptr - POST) > 0) { 498 if (fwrite((char *)POST, sizeof (POSTING), i, fpost) == 0) { 499 invcannotwrite(postingfile); 500 return (0); 501 } 502 nextpost += i * sizeof (POSTING); 503 } 504 logicalblk.invblk[3+2*numinvitems++] = iteminfo.packword[0]; 505 logicalblk.invblk[2+2*numinvitems] = iteminfo.packword[1]; 506 return (1); 507 } 508 509 static void 510 swap_ints(void *p, size_t sz) 511 { 512 uint32_t *s; 513 uint32_t *e = (uint32_t *)p + (sz / sizeof (uint32_t)); 514 515 for (s = p; s < e; s++) 516 *s = BSWAP_32(*s); 517 } 518 519 static void 520 write_param(INVCONTROL *invcntl) 521 { 522 if (invcntl->swap) 523 swap_ints(&invcntl->param, sizeof (invcntl->param)); 524 525 rewind(invcntl->invfile); 526 (void) fwrite((char *)&invcntl->param, sizeof (invcntl->param), 1, 527 invcntl->invfile); 528 529 if (invcntl->swap) 530 swap_ints(&invcntl->param, sizeof (invcntl->param)); 531 } 532 533 static void 534 read_superfinger(INVCONTROL *invcntl) 535 { 536 size_t count; 537 538 (void) fseek(invcntl->invfile, invcntl->param.startbyte, SEEK_SET); 539 (void) fread(invcntl->iindex, (int)invcntl->param.supsize, 540 1, invcntl->invfile); 541 542 if (invcntl->swap) { 543 /* 544 * The superfinger consists of a count, followed by 545 * count offsets, followed by a string table (which 546 * the offsets reference). 547 * 548 * We need to swap the count and the offsets. 549 */ 550 count = 1 + BSWAP_32(*(uint32_t *)invcntl->iindex); 551 swap_ints(invcntl->iindex, count * sizeof (unsigned long)); 552 } 553 } 554 555 static void 556 read_logblock(INVCONTROL *invcntl, int block) 557 { 558 /* note always fetch it if the file is busy */ 559 if ((block != invcntl->numblk) || 560 (invcntl->param.filestat >= INVBUSY)) { 561 (void) fseek(invcntl->invfile, 562 (block * invcntl->param.sizeblk) + invcntl->param.cntlsize, 563 SEEK_SET); 564 invcntl->numblk = block; 565 (void) fread(invcntl->logblk, (int)invcntl->param.sizeblk, 566 1, invcntl->invfile); 567 568 if (invcntl->swap) { 569 size_t count; 570 ENTRY *ecur, *eend; 571 uint32_t *postptr; 572 573 /* 574 * A logblock consists of a count, a next block id, 575 * and a previous block id, followed by count 576 * ENTRYs, followed by alternating strings and 577 * offsets. 578 */ 579 swap_ints(invcntl->logblk, 3 * sizeof (unsigned long)); 580 581 count = *(uint32_t *)invcntl->logblk; 582 583 ecur = (ENTRY *)((uint32_t *)invcntl->logblk + 3); 584 eend = ecur + count; 585 586 for (; ecur < eend; ecur++) { 587 ecur->offset = BSWAP_16(ecur->offset); 588 ecur->post = BSWAP_32(ecur->post); 589 590 /* 591 * After the string is the posting offset. 592 */ 593 postptr = (uint32_t *)(invcntl->logblk + 594 ecur->offset + 595 P2ROUNDUP(ecur->size, sizeof (long))); 596 597 *postptr = BSWAP_32(*postptr); 598 } 599 } 600 } 601 } 602 603 void 604 read_next_posting(INVCONTROL *invcntl, POSTING *posting) 605 { 606 (void) fread((char *)posting, sizeof (*posting), 1, invcntl->postfile); 607 if (invcntl->swap) { 608 posting->lineoffset = BSWAP_32(posting->lineoffset); 609 posting->fcnoffset = BSWAP_32(posting->fcnoffset); 610 /* 611 * fileindex is a 24-bit field, so shift it before swapping 612 */ 613 posting->fileindex = BSWAP_32(posting->fileindex << 8); 614 } 615 } 616 617 int 618 invopen(INVCONTROL *invcntl, char *invname, char *invpost, int stat) 619 { 620 int read_index; 621 622 if ((invcntl->invfile = vpfopen(invname, 623 ((stat == 0) ? FREAD : FREADP))) == NULL) { 624 invcannotopen(invname); 625 return (-1); 626 } 627 if (fread((char *)&invcntl->param, sizeof (invcntl->param), 1, 628 invcntl->invfile) == 0) { 629 (void) fprintf(stderr, "%s: empty inverted file\n", argv0); 630 goto closeinv; 631 } 632 if (invcntl->param.version != VERSION && 633 BSWAP_32(invcntl->param.version) != VERSION) { 634 (void) fprintf(stderr, 635 "%s: cannot read old index format; use -U option to " 636 "force database to rebuild\n", argv0); 637 goto closeinv; 638 } 639 invcntl->swap = (invcntl->param.version != VERSION); 640 if (invcntl->swap) 641 swap_ints(&invcntl->param, sizeof (invcntl->param)); 642 643 if (stat == 0 && invcntl->param.filestat == INVALONE) { 644 (void) fprintf(stderr, "%s: inverted file is locked\n", argv0); 645 goto closeinv; 646 } 647 if ((invcntl->postfile = vpfopen(invpost, 648 ((stat == 0) ? FREAD : FREADP))) == NULL) { 649 invcannotopen(invpost); 650 goto closeinv; 651 } 652 /* allocate core for a logical block */ 653 if ((invcntl->logblk = malloc(invcntl->param.sizeblk)) == NULL) { 654 invcannotalloc((unsigned)invcntl->param.sizeblk); 655 goto closeboth; 656 } 657 /* allocate for and read in superfinger */ 658 read_index = 1; 659 invcntl->iindex = NULL; 660 #if SHARE 661 if (invcntl->param.share == 1) { 662 key_t ftok(), shm_key; 663 struct shmid_ds shm_buf; 664 char *shmat(); 665 int shm_id; 666 667 /* see if the shared segment exists */ 668 shm_key = ftok(invname, 2); 669 shm_id = shmget(shm_key, 0, 0); 670 /* 671 * Failure simply means (hopefully) that segment doesn't 672 * exist 673 */ 674 if (shm_id == -1) { 675 /* 676 * Have to give general write permission due to AMdahl 677 * not having protected segments 678 */ 679 shm_id = shmget(shm_key, 680 invcntl->param.supsize + sizeof (long), 681 IPC_CREAT | 0666); 682 if (shm_id == -1) 683 perror("Could not create shared " 684 "memory segment"); 685 } else 686 read_index = 0; 687 688 if (shm_id != -1) { 689 invcntl->iindex = shmat(shm_id, 0, 690 ((read_index) ? 0 : SHM_RDONLY)); 691 if (invcntl->iindex == (char *)ERR) { 692 (void) fprintf(stderr, 693 "%s: shared memory link failed\n", argv0); 694 invcntl->iindex = NULL; 695 read_index = 1; 696 } 697 } 698 } 699 #endif 700 if (invcntl->iindex == NULL) 701 invcntl->iindex = malloc(invcntl->param.supsize + 16); 702 if (invcntl->iindex == NULL) { 703 invcannotalloc((unsigned)invcntl->param.supsize); 704 free(invcntl->logblk); 705 goto closeboth; 706 } 707 if (read_index) { 708 read_superfinger(invcntl); 709 } 710 invcntl->numblk = -1; 711 if (boolready() == -1) { 712 closeboth: 713 (void) fclose(invcntl->postfile); 714 closeinv: 715 (void) fclose(invcntl->invfile); 716 return (-1); 717 } 718 /* write back out the control block if anything changed */ 719 invcntl->param.filestat = stat; 720 if (stat > invcntl->param.filestat) 721 write_param(invcntl); 722 return (1); 723 } 724 725 /* invclose must be called to wrap things up and deallocate core */ 726 void 727 invclose(INVCONTROL *invcntl) 728 { 729 /* write out the control block in case anything changed */ 730 if (invcntl->param.filestat > 0) { 731 invcntl->param.filestat = 0; 732 write_param(invcntl); 733 } 734 (void) fclose(invcntl->invfile); 735 (void) fclose(invcntl->postfile); 736 #if SHARE 737 if (invcntl->param.share > 0) { 738 shmdt(invcntl->iindex); 739 invcntl->iindex = NULL; 740 } 741 #endif 742 if (invcntl->iindex != NULL) 743 free(invcntl->iindex); 744 free(invcntl->logblk); 745 } 746 747 /* invstep steps the inverted file forward one item */ 748 void 749 invstep(INVCONTROL *invcntl) 750 { 751 if (invcntl->keypnt < (*(int *)invcntl->logblk - 1)) { 752 invcntl->keypnt++; 753 return; 754 } 755 756 /* move forward a block else wrap */ 757 read_logblock(invcntl, *(int *)(invcntl->logblk + sizeof (long))); 758 759 invcntl->keypnt = 0; 760 } 761 762 /* invforward moves forward one term in the inverted file */ 763 int 764 invforward(INVCONTROL *invcntl) 765 { 766 invstep(invcntl); 767 /* skip things with 0 postings */ 768 while (((ENTRY *)(invcntl->logblk + 12) + invcntl->keypnt)->post == 0) { 769 invstep(invcntl); 770 } 771 /* Check for having wrapped - reached start of inverted file! */ 772 if ((invcntl->numblk == 0) && (invcntl->keypnt == 0)) 773 return (0); 774 return (1); 775 } 776 777 /* invterm gets the present term from the present logical block */ 778 int 779 invterm(INVCONTROL *invcntl, char *term) 780 { 781 ENTRY * entryptr; 782 783 entryptr = (ENTRY *)(invcntl->logblk + 12) + invcntl->keypnt; 784 (void) strncpy(term, entryptr->offset + invcntl->logblk, 785 (int)entryptr->size); 786 *(term + entryptr->size) = '\0'; 787 return (entryptr->post); 788 } 789 790 /* invfind searches for an individual item in the inverted file */ 791 long 792 invfind(INVCONTROL *invcntl, char *searchterm) 793 { 794 int imid, ilow, ihigh; 795 long num; 796 int i; 797 unsigned long *intptr, *intptr2; 798 ENTRY *entryptr; 799 800 /* make sure it is initialized via invready */ 801 if (invcntl->invfile == 0) 802 return (-1L); 803 804 /* now search for the appropriate finger block */ 805 intptr = (unsigned long *)invcntl->iindex; 806 807 ilow = 0; 808 ihigh = *intptr++ - 1; 809 while (ilow <= ihigh) { 810 imid = (ilow + ihigh) / 2; 811 intptr2 = intptr + imid; 812 i = strcmp(searchterm, (invcntl->iindex + *intptr2)); 813 if (i < 0) 814 ihigh = imid - 1; 815 else if (i > 0) 816 ilow = ++imid; 817 else { 818 ilow = imid + 1; 819 break; 820 } 821 } 822 /* be careful about case where searchterm is after last in this block */ 823 imid = (ilow) ? ilow - 1 : 0; 824 825 /* fetch the appropriate logical block if not in core */ 826 read_logblock(invcntl, imid); 827 828 srch_ext: 829 /* now find the term in this block. tricky this */ 830 intptr = (unsigned long *)invcntl->logblk; 831 832 ilow = 0; 833 ihigh = *intptr - 1; 834 intptr += 3; 835 num = 0; 836 while (ilow <= ihigh) { 837 imid = (ilow + ihigh) / 2; 838 entryptr = (ENTRY *)intptr + imid; 839 i = strncmp(searchterm, (invcntl->logblk + entryptr->offset), 840 (int)entryptr->size); 841 if (i == 0) 842 i = strlen(searchterm) - entryptr->size; 843 if (i < 0) 844 ihigh = imid - 1; 845 else if (i > 0) 846 ilow = ++imid; 847 else { 848 num = entryptr->post; 849 break; 850 } 851 } 852 /* be careful about case where searchterm is after last in this block */ 853 if (imid >= *(long *)invcntl->logblk) { 854 invcntl->keypnt = *(long *)invcntl->logblk; 855 invstep(invcntl); 856 /* note if this happens the term could be in extended block */ 857 if (invcntl->param.startbyte < 858 invcntl->numblk * invcntl->param.sizeblk) 859 goto srch_ext; 860 } else 861 invcntl->keypnt = imid; 862 return (num); 863 } 864 865 #if DEBUG 866 867 /* invdump dumps the block the term parameter is in */ 868 void 869 invdump(INVCONTROL *invcntl, char *term) 870 { 871 long i, j, n, *longptr; 872 ENTRY * entryptr; 873 char temp[512], *ptr; 874 875 /* dump superindex if term is "-" */ 876 if (*term == '-') { 877 j = atoi(term + 1); 878 longptr = (long *)invcntl->iindex; 879 n = *longptr++; 880 (void) printf("Superindex dump, num blocks=%ld\n", n); 881 longptr += j; 882 while ((longptr <= ((long *)invcntl->iindex) + n) && 883 invbreak == 0) { 884 (void) printf("%2ld %6ld %s\n", j++, *longptr, 885 invcntl->iindex + *longptr); 886 longptr++; 887 } 888 return; 889 } else if (*term == '#') { 890 j = atoi(term + 1); 891 /* fetch the appropriate logical block */ 892 read_logblock(invcntl, j); 893 } else 894 i = abs((int)invfind(invcntl, term)); 895 longptr = (long *)invcntl->logblk; 896 n = *longptr++; 897 (void) printf("Entry term to invdump=%s, postings=%ld, " 898 "forward ptr=%ld, back ptr=%ld\n", term, i, *(longptr), 899 *(longptr + 1)); 900 entryptr = (ENTRY *)(invcntl->logblk + 12); 901 (void) printf("%ld terms in this block, block=%ld\n", n, 902 invcntl->numblk); 903 (void) printf("\tterm\t\t\tposts\tsize\toffset\tspace\t1st word\n"); 904 for (j = 0; j < n && invbreak == 0; j++) { 905 ptr = invcntl->logblk + entryptr->offset; 906 (void) strncpy(temp, ptr, (int)entryptr->size); 907 temp[entryptr->size] = '\0'; 908 ptr += (sizeof (long) * 909 (long)((entryptr->size + 910 (sizeof (long) - 1)) / sizeof (long))); 911 (void) printf("%2ld %-24s\t%5ld\t%3d\t%d\t%d\t%ld\n", j, temp, 912 entryptr->post, entryptr->size, entryptr->offset, 913 entryptr->space, *(long *)ptr); 914 entryptr++; 915 } 916 } 917 #endif 918 919 static int 920 boolready(void) 921 { 922 numitems = 0; 923 if (item1 != NULL) 924 free(item1); 925 setsize1 = SETINC; 926 if ((item1 = (POSTING *)malloc(SETINC * sizeof (POSTING))) == NULL) { 927 invcannotalloc(SETINC); 928 return (-1); 929 } 930 if (item2 != NULL) 931 free(item2); 932 setsize2 = SETINC; 933 if ((item2 = (POSTING *)malloc(SETINC * sizeof (POSTING))) == NULL) { 934 invcannotalloc(SETINC); 935 return (-1); 936 } 937 item = item1; 938 enditem = item; 939 return (0); 940 } 941 942 void 943 boolclear(void) 944 { 945 numitems = 0; 946 item = item1; 947 enditem = item; 948 } 949 950 POSTING * 951 boolfile(INVCONTROL *invcntl, long *num, int bool) 952 { 953 ENTRY *entryptr; 954 FILE *file; 955 char *ptr; 956 unsigned long *ptr2; 957 POSTING *newitem; 958 POSTING posting; 959 unsigned u; 960 POSTING *newsetp, *set1p; 961 long newsetc, set1c, set2c; 962 963 entryptr = (ENTRY *) (invcntl->logblk + 12) + invcntl->keypnt; 964 ptr = invcntl->logblk + entryptr->offset; 965 ptr2 = ((unsigned long *)ptr) + 966 (entryptr->size + (sizeof (long) - 1)) / sizeof (long); 967 *num = entryptr->post; 968 switch (bool) { 969 case OR: 970 case NOT: 971 if (*num == 0) { 972 *num = numitems; 973 return (item); 974 } 975 } 976 /* make room for the new set */ 977 u = 0; 978 switch (bool) { 979 case AND: 980 case NOT: 981 newsetp = set1p = item; 982 break; 983 984 case OR: 985 u = enditem - item; 986 /* FALLTHROUGH */ 987 case REVERSENOT: 988 u += *num; 989 if (item == item2) { 990 if (u > setsize1) { 991 u += SETINC; 992 if ((item1 = (POSTING *) realloc(item1, 993 u * sizeof (POSTING))) == NULL) { 994 goto cannotalloc; 995 } 996 setsize1 = u; 997 } 998 newitem = item1; 999 } else { 1000 if (u > setsize2) { 1001 u += SETINC; 1002 if ((item2 = (POSTING *)realloc(item2, 1003 u * sizeof (POSTING))) == NULL) { 1004 cannotalloc: 1005 invcannotalloc(u * sizeof (POSTING)); 1006 (void) boolready(); 1007 *num = -1; 1008 return (NULL); 1009 } 1010 setsize2 = u; 1011 } 1012 newitem = item2; 1013 } 1014 set1p = item; 1015 newsetp = newitem; 1016 } 1017 file = invcntl->postfile; 1018 (void) fseek(file, (long)*ptr2, SEEK_SET); 1019 read_next_posting(invcntl, &posting); 1020 newsetc = 0; 1021 switch (bool) { 1022 case OR: 1023 /* while something in both sets */ 1024 set1p = item; 1025 newsetp = newitem; 1026 for (set1c = 0, set2c = 0; 1027 set1c < numitems && set2c < *num; newsetc++) { 1028 if (set1p->lineoffset < posting.lineoffset) { 1029 *newsetp++ = *set1p++; 1030 set1c++; 1031 } else if (set1p->lineoffset > posting.lineoffset) { 1032 *newsetp++ = posting; 1033 read_next_posting(invcntl, &posting); 1034 set2c++; 1035 } else if (set1p->type < posting.type) { 1036 *newsetp++ = *set1p++; 1037 set1c++; 1038 } else if (set1p->type > posting.type) { 1039 *newsetp++ = posting; 1040 read_next_posting(invcntl, &posting); 1041 set2c++; 1042 } else { /* identical postings */ 1043 *newsetp++ = *set1p++; 1044 set1c++; 1045 read_next_posting(invcntl, &posting); 1046 set2c++; 1047 } 1048 } 1049 /* find out what ran out and move the rest in */ 1050 if (set1c < numitems) { 1051 newsetc += numitems - set1c; 1052 while (set1c++ < numitems) { 1053 *newsetp++ = *set1p++; 1054 } 1055 } else { 1056 while (set2c++ < *num) { 1057 *newsetp++ = posting; 1058 newsetc++; 1059 read_next_posting(invcntl, &posting); 1060 } 1061 } 1062 item = newitem; 1063 break; /* end of OR */ 1064 #if 0 1065 case AND: 1066 set1c = 0; 1067 set2c = 0; 1068 while (set1c < numitems && set2c < *num) { 1069 if (set1p->lineoffset < posting.lineoffset) { 1070 set1p++; 1071 set1c++; 1072 } else if (set1p->lineoffset > posting.lineoffset) { 1073 read_next_posting(invcntl, &posting); 1074 set2c++; 1075 } else if (set1p->type < posting.type) { 1076 *set1p++; 1077 set1c++; 1078 } else if (set1p->type > posting.type) { 1079 read_next_posting(invcntl, &posting); 1080 set2c++; 1081 } else { /* identical postings */ 1082 *newsetp++ = *set1p++; 1083 newsetc++; 1084 set1c++; 1085 read_next_posting(invcntl, &posting); 1086 set2c++; 1087 } 1088 } 1089 break; /* end of AND */ 1090 1091 case NOT: 1092 set1c = 0; 1093 set2c = 0; 1094 while (set1c < numitems && set2c < *num) { 1095 if (set1p->lineoffset < posting.lineoffset) { 1096 *newsetp++ = *set1p++; 1097 newsetc++; 1098 set1c++; 1099 } else if (set1p->lineoffset > posting.lineoffset) { 1100 read_next_posting(invcntl, &posting); 1101 set2c++; 1102 } else if (set1p->type < posting.type) { 1103 *newsetp++ = *set1p++; 1104 newsetc++; 1105 set1c++; 1106 } else if (set1p->type > posting.type) { 1107 read_next_posting(invcntl, &posting); 1108 set2c++; 1109 } else { /* identical postings */ 1110 set1c++; 1111 set1p++; 1112 read_next_posting(invcntl, &posting); 1113 set2c++; 1114 } 1115 } 1116 newsetc += numitems - set1c; 1117 while (set1c++ < numitems) { 1118 *newsetp++ = *set1p++; 1119 } 1120 break; /* end of NOT */ 1121 1122 case REVERSENOT: /* core NOT incoming set */ 1123 set1c = 0; 1124 set2c = 0; 1125 while (set1c < numitems && set2c < *num) { 1126 if (set1p->lineoffset < posting.lineoffset) { 1127 set1p++; 1128 set1c++; 1129 } else if (set1p->lineoffset > posting.lineoffset) { 1130 *newsetp++ = posting; 1131 read_next_posting(invcntl, &posting); 1132 set2c++; 1133 } else if (set1p->type < posting.type) { 1134 set1p++; 1135 set1c++; 1136 } else if (set1p->type > posting.type) { 1137 *newsetp++ = posting; 1138 read_next_posting(invcntl, &posting); 1139 set2c++; 1140 } else { /* identical postings */ 1141 set1c++; 1142 set1p++; 1143 read_next_posting(invcntl, &posting); 1144 set2c++; 1145 } 1146 } 1147 while (set2c++ < *num) { 1148 *newsetp++ = posting; 1149 newsetc++; 1150 read_next_posting(invcntl, &posting); 1151 } 1152 item = newitem; 1153 break; /* end of REVERSENOT */ 1154 #endif 1155 } 1156 numitems = newsetc; 1157 *num = newsetc; 1158 enditem = (POSTING *)newsetp; 1159 return ((POSTING *)item); 1160 } 1161 1162 #if 0 1163 POSTING * 1164 boolsave(int clear) 1165 { 1166 int i; 1167 POSTING *ptr; 1168 POSTING *oldstuff, *newstuff; 1169 1170 if (numitems == 0) { 1171 if (clear) 1172 boolclear(); 1173 return (NULL); 1174 } 1175 /* 1176 * if clear then give them what we have and use (void) 1177 * boolready to realloc 1178 */ 1179 if (clear) { 1180 ptr = item; 1181 /* free up the space we didn't give them */ 1182 if (item == item1) 1183 item1 = NULL; 1184 else 1185 item2 = NULL; 1186 (void) boolready(); 1187 return (ptr); 1188 } 1189 i = (enditem - item) * sizeof (POSTING) + 100; 1190 if ((ptr = (POSTING *)malloc(i))r == NULL) { 1191 invcannotalloc(i); 1192 return (ptr); 1193 } 1194 /* move present set into place */ 1195 oldstuff = item; 1196 newstuff = ptr; 1197 while (oldstuff < enditem) 1198 *newstuff++ = *oldstuff++; 1199 return (ptr); 1200 } 1201 #endif 1202 1203 static void 1204 invcannotalloc(size_t n) 1205 { 1206 (void) fprintf(stderr, "%s: cannot allocate %u bytes\n", argv0, n); 1207 } 1208 1209 static void 1210 invcannotopen(char *file) 1211 { 1212 (void) fprintf(stderr, "%s: cannot open file %s\n", argv0, file); 1213 } 1214 1215 static void 1216 invcannotwrite(char *file) 1217 { 1218 (void) perror(argv0); /* must be first to preserve errno */ 1219 (void) fprintf(stderr, "%s: write to file %s failed\n", argv0, file); 1220 }