1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License, Version 1.0 only
   6  * (the "License").  You may not use this file except in compliance
   7  * with the License.
   8  *
   9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  10  * or http://www.opensolaris.org/os/licensing.
  11  * See the License for the specific language governing permissions
  12  * and limitations under the License.
  13  *
  14  * When distributing Covered Code, include this CDDL HEADER in each
  15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  16  * If applicable, add the following below this CDDL HEADER, with the
  17  * fields enclosed by brackets "[]" replaced with your own identifying
  18  * information: Portions Copyright [yyyy] [name of copyright owner]
  19  *
  20  * CDDL HEADER END
  21  */
  22 /*      Copyright (c) 1988 AT&T     */
  23 /*        All Rights Reserved   */
  24 
  25 
  26 /*
  27  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
  28  * Use is subject to license terms.
  29  */
  30 
  31 #pragma ident   "%Z%%M% %I%     %E% SMI"
  32 
  33 #include <ctype.h>
  34 #include <stdio.h>
  35 #include <sys/types.h>
  36 #include <sys/sysmacros.h>
  37 #include <sys/byteorder.h>
  38 #if SHARE
  39 #include <sys/ipc.h>
  40 #include <sys/shm.h>
  41 #define ERR  -1
  42 #endif
  43 #include "invlib.h"
  44 #include "library.h"
  45 
  46 #define DEBUG           0       /* debugging code and realloc messages */
  47 #define BLOCKSIZE       2 * BUFSIZ      /* logical block size */
  48 #define LINEMAX         1000    /* sorted posting line max size */
  49 #define POSTINC         10000   /* posting buffer size increment */
  50 #define SEP             ' '     /* sorted posting field separator */
  51 #define SETINC          100     /* posting set size increment */
  52 #define STATS           0       /* print statistics */
  53 #define SUPERINC        10000   /* super index size increment */
  54 #define TERMMAX         512     /* term max size */
  55 #define VERSION         1       /* inverted index format version */
  56 #define ZIPFSIZE        200     /* zipf curve size */
  57 #define FREAD   "r"             /* fopen for reading */
  58 #define FREADP  "r+"            /* fopen for update */
  59 #define FWRITE  "w"             /* fopen truncate or create for writing */
  60 #define FWRITEP "w+"            /* fopen truncate or create for update */
  61 
  62 extern  char    *argv0; /* command name (must be set in main function) */
  63 
  64 int     invbreak;
  65 
  66 #if STATS
  67 int     showzipf;       /* show postings per term distribution */
  68 #endif
  69 
  70 static  POSTING *item, *enditem, *item1 = NULL, *item2 = NULL;
  71 static  unsigned setsize1, setsize2;
  72 static  long    numitems, totterm, zerolong;
  73 static  char    *indexfile, *postingfile;
  74 static  FILE    *outfile, *fpost;
  75 static  unsigned supersize = SUPERINC, supintsize;
  76 static  int     numpost, numlogblk, amtused, nextpost,
  77                 lastinblk, numinvitems;
  78 static  POSTING *POST, *postptr;
  79 static  unsigned long   *SUPINT, *supint, nextsupfing;
  80 static  char    *SUPFING, *supfing;
  81 static  char    thisterm[TERMMAX];
  82 static  union {
  83         long    invblk[BLOCKSIZE / sizeof (long)];
  84         char    chrblk[BLOCKSIZE];
  85 } logicalblk;
  86 
  87 #if DEBUG || STATS
  88 static  long    totpost;
  89 #endif
  90 
  91 #if STATS
  92 static  int     zipf[ZIPFSIZE + 1];
  93 #endif
  94 
  95 static void invcannotalloc(size_t n);
  96 static void invcannotopen(char *file);
  97 static void invcannotwrite(char *file);
  98 static int invnewterm(void);
  99 static int boolready(void);
 100 
 101 long
 102 invmake(char *invname, char *invpost, FILE *infile)
 103 {
 104         unsigned char   *s;
 105         long    num;
 106         int     i;
 107         long    fileindex;
 108         unsigned postsize = POSTINC * sizeof (POSTING);
 109         unsigned long   *intptr;
 110         char    line[LINEMAX];
 111         long    tlong;
 112         PARAM   param;
 113         POSTING posting;
 114 #if STATS
 115         int     j;
 116         unsigned maxtermlen = 0;
 117 #endif
 118         /* output file */
 119         if ((outfile = vpfopen(invname, FWRITEP)) == NULL) {
 120                 invcannotopen(invname);
 121                 return (0);
 122         }
 123         indexfile = invname;
 124         (void) fseek(outfile, (long)BUFSIZ, 0);
 125 
 126         /* posting file  */
 127         if ((fpost = vpfopen(invpost, FWRITE)) == NULL) {
 128                 invcannotopen(invpost);
 129                 return (0);
 130         }
 131         postingfile = invpost;
 132         nextpost = 0;
 133         /* get space for the postings list */
 134         if ((POST = (POSTING *)malloc(postsize)) == NULL) {
 135                 invcannotalloc(postsize);
 136                 return (0);
 137         }
 138         postptr = POST;
 139         /* get space for the superfinger (superindex) */
 140         if ((SUPFING = malloc(supersize)) == NULL) {
 141                 invcannotalloc(supersize);
 142                 return (0);
 143         }
 144         supfing = SUPFING;
 145         supintsize = supersize / 40;
 146         /* also for the superfinger index */
 147         if ((SUPINT = malloc(supintsize * sizeof (long))) == NULL) {
 148                 invcannotalloc(supintsize * sizeof (long));
 149                 return (0);
 150         }
 151         supint = SUPINT;
 152         supint++; /* leave first term open for a count */
 153         /* initialize using an empty term */
 154         (void) strcpy(thisterm, "");
 155         *supint++ = 0;
 156         *supfing++ = ' ';
 157         *supfing++ = '\0';
 158         nextsupfing = 2;
 159 #if DEBUG || STATS
 160         totpost = 0L;
 161 #endif
 162         totterm = 0L;
 163         numpost = 1;
 164 
 165         /*
 166          * set up as though a block had come and gone, i.e., set up for
 167          * new block
 168          */
 169         amtused = 16; /* leave no space - init 3 words + one for luck */
 170         numinvitems = 0;
 171         numlogblk = 0;
 172         lastinblk = BLOCKSIZE;
 173 
 174         /* now loop as long as more to read (till eof)  */
 175         while (fgets(line, LINEMAX, infile) != NULL) {
 176 #if DEBUG || STATS
 177                 ++totpost;
 178 #endif
 179                 s = (unsigned char *) strchr(line, SEP);
 180                 if (s == NULL)          /* where did this line come from ??? */
 181                         continue;       /* workaround: just skip it */
 182                 *s = '\0';
 183 #if STATS
 184                 if ((i = strlen(line)) > maxtermlen) {
 185                         maxtermlen = i;
 186                 }
 187 #endif
 188 #if DEBUG
 189                 (void) printf("%ld: %s ", totpost, line);
 190                 (void) fflush(stdout);
 191 #endif
 192                 if (strcmp(thisterm, line) == 0) {
 193                         if (postptr + 10 > POST + postsize / sizeof (POSTING)) {
 194                                 i = postptr - POST;
 195                                 postsize += POSTINC * sizeof (POSTING);
 196                                 if ((POST = realloc(POST, postsize)) == NULL) {
 197                                         invcannotalloc(postsize);
 198                                         return (0);
 199                                 }
 200                                 postptr = i + POST;
 201 #if DEBUG
 202                                 (void) printf("reallocated post space to %u, "
 203                                     "totpost=%ld\n", postsize, totpost);
 204 #endif
 205                         }
 206                         numpost++;
 207                 } else {
 208                         /* have a new term */
 209                         if (!invnewterm()) {
 210                                 return (0);
 211                         }
 212                         (void) strcpy(thisterm, line);
 213                         numpost = 1;
 214                         postptr = POST;
 215                         fileindex = 0;
 216                 }
 217                 /* get the new posting */
 218                 num = *++s - '!';
 219                 i = 1;
 220                 do {
 221                         num = BASE * num + *++s - '!';
 222                 } while (++i < PRECISION);
 223                 posting.lineoffset = num;
 224                 while (++fileindex < nsrcoffset && num > srcoffset[fileindex]) {
 225                         ;
 226                 }
 227                 posting.fileindex = --fileindex;
 228                 posting.type = *++s;
 229                 num = *++s - '!';
 230                 if (*s != '\n') {
 231                         num = *++s - '!';
 232                         while (*++s != '\n') {
 233                                 num = BASE * num + *s - '!';
 234                         }
 235                         posting.fcnoffset = num;
 236                 } else {
 237                         posting.fcnoffset = 0;
 238                 }
 239                 *postptr++ = posting;
 240 #if DEBUG
 241                 (void) printf("%ld %ld %ld %ld\n", posting.fileindex,
 242                     posting.fcnoffset, posting.lineoffset, posting.type);
 243                 (void) fflush(stdout);
 244 #endif
 245         }
 246         if (!invnewterm()) {
 247                 return (0);
 248         }
 249         /* now clean up final block  */
 250         logicalblk.invblk[0] = numinvitems;
 251         /* loops pointer around to start */
 252         logicalblk.invblk[1] = 0;
 253         logicalblk.invblk[2] = numlogblk - 1;
 254         if (fwrite((char *)&logicalblk, BLOCKSIZE, 1, outfile) == 0) {
 255                 goto cannotwrite;
 256         }
 257         numlogblk++;
 258         /* write out block to save space. what in it doesn't matter */
 259         if (fwrite((char *)&logicalblk, BLOCKSIZE, 1, outfile) == 0) {
 260                 goto cannotwrite;
 261         }
 262         /* finish up the super finger */
 263         *SUPINT = numlogblk;
 264         /* add to the offsets the size of the offset pointers */
 265         intptr = (SUPINT + 1);
 266         i = (char *)supint - (char *)SUPINT;
 267         while (intptr < supint)
 268                 *intptr++ += i;
 269         /* write out the offsets (1 for the N at start) and the super finger */
 270         if (fwrite((char *)SUPINT, sizeof (*SUPINT), numlogblk + 1,
 271             outfile) == 0 ||
 272             fwrite(SUPFING, 1, supfing - SUPFING, outfile) == 0) {
 273                 goto cannotwrite;
 274         }
 275         /* save the size for reference later */
 276         nextsupfing = sizeof (long) + sizeof (long) * numlogblk +
 277             (supfing - SUPFING);
 278         /*
 279          * make sure the file ends at a logical block boundary.  This is
 280          * necessary for invinsert to correctly create extended blocks
 281          */
 282         i = nextsupfing % BLOCKSIZE;
 283         /* write out junk to fill log blk */
 284         if (fwrite(SUPFING, BLOCKSIZE - i, 1, outfile) == 0 ||
 285             fflush(outfile) == EOF) {
 286                 /* rewind doesn't check for write failure */
 287                 goto cannotwrite;
 288         }
 289         /* write the control area */
 290         rewind(outfile);
 291         param.version = VERSION;
 292         param.filestat = 0;
 293         param.sizeblk = BLOCKSIZE;
 294         param.startbyte = (numlogblk + 1) * BLOCKSIZE + BUFSIZ;
 295         param.supsize = nextsupfing;
 296         param.cntlsize = BUFSIZ;
 297         param.share = 0;
 298         if (fwrite((char *)&param, sizeof (param), 1, outfile) == 0) {
 299                 goto cannotwrite;
 300         }
 301         for (i = 0; i < 10; i++)     /* for future use */
 302                 if (fwrite((char *)&zerolong, sizeof (zerolong),
 303                     1, outfile) == 0) {
 304                         goto cannotwrite;
 305                 }
 306 
 307         /* make first block loop backwards to last block */
 308         if (fflush(outfile) == EOF) {
 309                 /* fseek doesn't check for write failure */
 310                 goto cannotwrite;
 311         }
 312         /* get to second word first block */
 313         (void) fseek(outfile, (long)BUFSIZ + 8, 0);
 314         tlong = numlogblk - 1;
 315         if (fwrite((char *)&tlong, sizeof (tlong), 1, outfile) == 0 ||
 316             fclose(outfile) == EOF) {
 317 cannotwrite:
 318                 invcannotwrite(invname);
 319                 return (0);
 320         }
 321         if (fclose(fpost) == EOF) {
 322                 invcannotwrite(postingfile);
 323                 return (0);
 324         }
 325         --totterm;      /* don't count null term */
 326 #if STATS
 327         (void) printf("logical blocks = %d, postings = %ld, terms = %ld, "
 328             "max term length = %d\n", numlogblk, totpost, totterm, maxtermlen);
 329         if (showzipf) {
 330                 (void) printf(
 331                     "\n*************   ZIPF curve  ****************\n");
 332                 for (j = ZIPFSIZE; j > 1; j--)
 333                         if (zipf[j])
 334                                 break;
 335                 for (i = 1; i < j; ++i) {
 336                         (void) printf("%3d -%6d ", i, zipf[i]);
 337                         if (i % 6 == 0) (void) putchar('\n');
 338                 }
 339                 (void) printf(">%d-%6d\n", ZIPFSIZE, zipf[0]);
 340         }
 341 #endif
 342         /* free all malloc'd memory */
 343         free(POST);
 344         free(SUPFING);
 345         free(SUPINT);
 346         return (totterm);
 347 }
 348 
 349 /* add a term to the data base */
 350 
 351 static int
 352 invnewterm(void)
 353 {
 354         int     backupflag, i, j, maxback, holditems, gooditems, howfar;
 355         int     len, numwilluse, wdlen;
 356         char    *tptr, *tptr2, *tptr3;
 357         union {
 358                 unsigned long   packword[2];
 359                 ENTRY           e;
 360         } iteminfo;
 361 
 362         totterm++;
 363 #if STATS
 364         /* keep zipfian info on the distribution */
 365         if (numpost <= ZIPFSIZE)
 366                 zipf[numpost]++;
 367         else
 368                 zipf[0]++;
 369 #endif
 370         len = strlen(thisterm);
 371         wdlen = (len + (sizeof (long) - 1)) / sizeof (long);
 372         numwilluse = (wdlen + 3) * sizeof (long);
 373         /* new block if at least 1 item in block */
 374         if (numinvitems && numwilluse + amtused > BLOCKSIZE) {
 375                 /* set up new block */
 376                 if (supfing + 500 > SUPFING + supersize) {
 377                         i = supfing - SUPFING;
 378                         supersize += 20000;
 379                         if ((SUPFING = realloc(SUPFING, supersize)) == NULL) {
 380                                 invcannotalloc(supersize);
 381                                 return (0);
 382                         }
 383                         supfing = i + SUPFING;
 384 #if DEBUG
 385                         (void) printf("reallocated superfinger space to %d, "
 386                             "totpost=%ld\n", supersize, totpost);
 387 #endif
 388                 }
 389                 /* check that room for the offset as well */
 390                 if ((numlogblk + 10) > supintsize) {
 391                         i = supint - SUPINT;
 392                         supintsize += SUPERINC;
 393                         if ((SUPINT = realloc((char *)SUPINT,
 394                             supintsize * sizeof (long))) == NULL) {
 395                                 invcannotalloc(supintsize * sizeof (long));
 396                                 return (0);
 397                         }
 398                         supint = i + SUPINT;
 399 #if DEBUG
 400                         (void) printf("reallocated superfinger offset to %d, "
 401                             "totpost = %ld\n", supintsize * sizeof (long),
 402                             totpost);
 403 #endif
 404                 }
 405                 /* See if backup is efficatious  */
 406                 backupflag = 0;
 407                 maxback = strlen(thisterm) / 10;
 408                 holditems = numinvitems;
 409                 if (maxback > numinvitems)
 410                         maxback = numinvitems - 2;
 411                 howfar = 0;
 412                 while (--maxback > 0) {
 413                         howfar++;
 414                         iteminfo.packword[0] =
 415                             logicalblk.invblk[--holditems * 2 +
 416                             (sizeof (long) - 1)];
 417                         if ((i = iteminfo.e.size / 10) < maxback) {
 418                                 maxback = i;
 419                                 backupflag = howfar;
 420                                 gooditems = holditems;
 421                                 tptr2 = logicalblk.chrblk + iteminfo.e.offset;
 422                         }
 423                 }
 424                 /* see if backup will occur  */
 425                 if (backupflag) {
 426                         numinvitems = gooditems;
 427                 }
 428                 logicalblk.invblk[0] = numinvitems;
 429                 /* set forward pointer pointing to next */
 430                 logicalblk.invblk[1] = numlogblk + 1;
 431                 /* set back pointer to last block */
 432                 logicalblk.invblk[2] = numlogblk - 1;
 433                 if (fwrite((char *)logicalblk.chrblk, 1,
 434                     BLOCKSIZE, outfile) == 0) {
 435                         invcannotwrite(indexfile);
 436                         return (0);
 437                 }
 438                 amtused = 16;
 439                 numlogblk++;
 440                 /* check if had to back up, if so do it */
 441                 if (backupflag) {
 442                         /* find out where the end of the new block is */
 443                         iteminfo.packword[0] =
 444                             logicalblk.invblk[numinvitems * 2 + 1];
 445                         tptr3 = logicalblk.chrblk + iteminfo.e.offset;
 446                         /* move the index for this block */
 447                         for (i = 3; i <= (backupflag * 2 + 2); i++) {
 448                                 logicalblk.invblk[i] =
 449                                     logicalblk.invblk[numinvitems * 2+i];
 450                         }
 451                         /* move the word into the super index */
 452                         iteminfo.packword[0] = logicalblk.invblk[3];
 453                         iteminfo.packword[1] = logicalblk.invblk[4];
 454                         tptr2 = logicalblk.chrblk + iteminfo.e.offset;
 455                         (void) strncpy(supfing, tptr2, (int)iteminfo.e.size);
 456                         *(supfing + iteminfo.e.size) = '\0';
 457 #if DEBUG
 458                         (void) printf("backup %d at term=%s to term=%s\n",
 459                             backupflag, thisterm, supfing);
 460 #endif
 461                         *supint++ = nextsupfing;
 462                         nextsupfing += strlen(supfing) + 1;
 463                         supfing += strlen(supfing) + 1;
 464                         /* now fix up the logical block */
 465                         tptr = logicalblk.chrblk + lastinblk;
 466                         lastinblk = BLOCKSIZE;
 467                         tptr2 = logicalblk.chrblk + lastinblk;
 468                         j = tptr3 - tptr;
 469                         while (tptr3 > tptr)
 470                                 *--tptr2 = *--tptr3;
 471                         lastinblk -= j;
 472                         amtused += (8 * backupflag + j);
 473                         for (i = 3; i < (backupflag * 2 + 2); i += 2) {
 474                                 iteminfo.packword[0] = logicalblk.invblk[i];
 475                                 iteminfo.e.offset += (tptr2 - tptr3);
 476                                 logicalblk.invblk[i] = iteminfo.packword[0];
 477                         }
 478                         numinvitems = backupflag;
 479                 } else { /* no backup needed */
 480                         numinvitems = 0;
 481                         lastinblk = BLOCKSIZE;
 482                         /* add new term to superindex */
 483                         (void) strcpy(supfing, thisterm);
 484                         supfing += strlen(thisterm) + 1;
 485                         *supint++ = nextsupfing;
 486                         nextsupfing += strlen(thisterm) + 1;
 487                 }
 488         }
 489         lastinblk -= (numwilluse - 8);
 490         iteminfo.e.offset = lastinblk;
 491         iteminfo.e.size = (char)len;
 492         iteminfo.e.space = 0;
 493         iteminfo.e.post = numpost;
 494         (void) strncpy(logicalblk.chrblk + lastinblk, thisterm, len);
 495         amtused += numwilluse;
 496         logicalblk.invblk[(lastinblk/sizeof (long))+wdlen] = nextpost;
 497         if ((i = postptr - POST) > 0) {
 498                 if (fwrite((char *)POST, sizeof (POSTING), i, fpost) == 0) {
 499                         invcannotwrite(postingfile);
 500                         return (0);
 501                 }
 502                 nextpost += i * sizeof (POSTING);
 503         }
 504         logicalblk.invblk[3+2*numinvitems++] = iteminfo.packword[0];
 505         logicalblk.invblk[2+2*numinvitems] = iteminfo.packword[1];
 506         return (1);
 507 }
 508 
 509 static void
 510 swap_ints(void *p, size_t sz)
 511 {
 512         uint32_t *s;
 513         uint32_t *e = (uint32_t *)p + (sz / sizeof (uint32_t));
 514 
 515         for (s = p; s < e; s++)
 516                 *s = BSWAP_32(*s);
 517 }
 518 
 519 static void
 520 write_param(INVCONTROL *invcntl)
 521 {
 522         if (invcntl->swap)
 523                 swap_ints(&invcntl->param, sizeof (invcntl->param));
 524 
 525         rewind(invcntl->invfile);
 526         (void) fwrite((char *)&invcntl->param, sizeof (invcntl->param), 1,
 527             invcntl->invfile);
 528 
 529         if (invcntl->swap)
 530                 swap_ints(&invcntl->param, sizeof (invcntl->param));
 531 }
 532 
 533 static void
 534 read_superfinger(INVCONTROL *invcntl)
 535 {
 536         size_t count;
 537 
 538         (void) fseek(invcntl->invfile, invcntl->param.startbyte, SEEK_SET);
 539         (void) fread(invcntl->iindex, (int)invcntl->param.supsize,
 540             1, invcntl->invfile);
 541 
 542         if (invcntl->swap) {
 543                 /*
 544                  * The superfinger consists of a count, followed by
 545                  * count offsets, followed by a string table (which
 546                  * the offsets reference).
 547                  *
 548                  * We need to swap the count and the offsets.
 549                  */
 550                 count = 1 + BSWAP_32(*(uint32_t *)invcntl->iindex);
 551                 swap_ints(invcntl->iindex, count * sizeof (unsigned long));
 552         }
 553 }
 554 
 555 static void
 556 read_logblock(INVCONTROL *invcntl, int block)
 557 {
 558         /* note always fetch it if the file is busy */
 559         if ((block != invcntl->numblk) ||
 560             (invcntl->param.filestat >= INVBUSY)) {
 561                 (void) fseek(invcntl->invfile,
 562                     (block * invcntl->param.sizeblk) + invcntl->param.cntlsize,
 563                     SEEK_SET);
 564                 invcntl->numblk = block;
 565                 (void) fread(invcntl->logblk, (int)invcntl->param.sizeblk,
 566                     1, invcntl->invfile);
 567 
 568                 if (invcntl->swap) {
 569                         size_t count;
 570                         ENTRY *ecur, *eend;
 571                         uint32_t *postptr;
 572 
 573                         /*
 574                          * A logblock consists of a count, a next block id,
 575                          * and a previous block id, followed by count
 576                          * ENTRYs, followed by alternating strings and
 577                          * offsets.
 578                          */
 579                         swap_ints(invcntl->logblk, 3 * sizeof (unsigned long));
 580 
 581                         count = *(uint32_t *)invcntl->logblk;
 582 
 583                         ecur = (ENTRY *)((uint32_t *)invcntl->logblk + 3);
 584                         eend = ecur + count;
 585 
 586                         for (; ecur < eend; ecur++) {
 587                                 ecur->offset = BSWAP_16(ecur->offset);
 588                                 ecur->post = BSWAP_32(ecur->post);
 589 
 590                                 /*
 591                                  * After the string is the posting offset.
 592                                  */
 593                                 postptr = (uint32_t *)(invcntl->logblk +
 594                                     ecur->offset +
 595                                     P2ROUNDUP(ecur->size, sizeof (long)));
 596 
 597                                 *postptr = BSWAP_32(*postptr);
 598                         }
 599                 }
 600         }
 601 }
 602 
 603 void
 604 read_next_posting(INVCONTROL *invcntl, POSTING *posting)
 605 {
 606         (void) fread((char *)posting, sizeof (*posting), 1, invcntl->postfile);
 607         if (invcntl->swap) {
 608                 posting->lineoffset = BSWAP_32(posting->lineoffset);
 609                 posting->fcnoffset = BSWAP_32(posting->fcnoffset);
 610                 /*
 611                  * fileindex is a 24-bit field, so shift it before swapping
 612                  */
 613                 posting->fileindex = BSWAP_32(posting->fileindex << 8);
 614         }
 615 }
 616 
 617 int
 618 invopen(INVCONTROL *invcntl, char *invname, char *invpost, int stat)
 619 {
 620         int     read_index;
 621 
 622         if ((invcntl->invfile = vpfopen(invname,
 623             ((stat == 0) ? FREAD : FREADP))) == NULL) {
 624                 invcannotopen(invname);
 625                 return (-1);
 626         }
 627         if (fread((char *)&invcntl->param, sizeof (invcntl->param), 1,
 628             invcntl->invfile) == 0) {
 629                 (void) fprintf(stderr, "%s: empty inverted file\n", argv0);
 630                 goto closeinv;
 631         }
 632         if (invcntl->param.version != VERSION &&
 633             BSWAP_32(invcntl->param.version) != VERSION) {
 634                 (void) fprintf(stderr,
 635                     "%s: cannot read old index format; use -U option to "
 636                     "force database to rebuild\n", argv0);
 637                 goto closeinv;
 638         }
 639         invcntl->swap = (invcntl->param.version != VERSION);
 640         if (invcntl->swap)
 641                 swap_ints(&invcntl->param, sizeof (invcntl->param));
 642 
 643         if (stat == 0 && invcntl->param.filestat == INVALONE) {
 644                 (void) fprintf(stderr, "%s: inverted file is locked\n", argv0);
 645                 goto closeinv;
 646         }
 647         if ((invcntl->postfile = vpfopen(invpost,
 648             ((stat == 0) ? FREAD : FREADP))) == NULL) {
 649                 invcannotopen(invpost);
 650                 goto closeinv;
 651         }
 652         /* allocate core for a logical block  */
 653         if ((invcntl->logblk = malloc(invcntl->param.sizeblk)) == NULL) {
 654                 invcannotalloc((unsigned)invcntl->param.sizeblk);
 655                 goto closeboth;
 656         }
 657         /* allocate for and read in superfinger  */
 658         read_index = 1;
 659         invcntl->iindex = NULL;
 660 #if SHARE
 661         if (invcntl->param.share == 1) {
 662                 key_t ftok(), shm_key;
 663                 struct shmid_ds shm_buf;
 664                 char    *shmat();
 665                 int     shm_id;
 666 
 667                 /* see if the shared segment exists */
 668                 shm_key = ftok(invname, 2);
 669                 shm_id = shmget(shm_key, 0, 0);
 670                 /*
 671                  * Failure simply means (hopefully) that segment doesn't
 672                  * exist
 673                  */
 674                 if (shm_id == -1) {
 675                         /*
 676                          * Have to give general write permission due to AMdahl
 677                          * not having protected segments
 678                          */
 679                         shm_id = shmget(shm_key,
 680                             invcntl->param.supsize + sizeof (long),
 681                             IPC_CREAT | 0666);
 682                         if (shm_id == -1)
 683                                 perror("Could not create shared "
 684                                     "memory segment");
 685                 } else
 686                         read_index = 0;
 687 
 688                 if (shm_id != -1) {
 689                         invcntl->iindex = shmat(shm_id, 0,
 690                             ((read_index) ? 0 : SHM_RDONLY));
 691                         if (invcntl->iindex == (char *)ERR) {
 692                                 (void) fprintf(stderr,
 693                                     "%s: shared memory link failed\n", argv0);
 694                                 invcntl->iindex = NULL;
 695                                 read_index = 1;
 696                         }
 697                 }
 698         }
 699 #endif
 700         if (invcntl->iindex == NULL)
 701                 invcntl->iindex = malloc(invcntl->param.supsize + 16);
 702         if (invcntl->iindex == NULL) {
 703                 invcannotalloc((unsigned)invcntl->param.supsize);
 704                 free(invcntl->logblk);
 705                 goto closeboth;
 706         }
 707         if (read_index) {
 708                 read_superfinger(invcntl);
 709         }
 710         invcntl->numblk = -1;
 711         if (boolready() == -1) {
 712         closeboth:
 713                 (void) fclose(invcntl->postfile);
 714         closeinv:
 715                 (void) fclose(invcntl->invfile);
 716                 return (-1);
 717         }
 718         /* write back out the control block if anything changed */
 719         invcntl->param.filestat = stat;
 720         if (stat > invcntl->param.filestat)
 721                 write_param(invcntl);
 722         return (1);
 723 }
 724 
 725 /* invclose must be called to wrap things up and deallocate core */
 726 void
 727 invclose(INVCONTROL *invcntl)
 728 {
 729         /* write out the control block in case anything changed */
 730         if (invcntl->param.filestat > 0) {
 731                 invcntl->param.filestat = 0;
 732                 write_param(invcntl);
 733         }
 734         (void) fclose(invcntl->invfile);
 735         (void) fclose(invcntl->postfile);
 736 #if SHARE
 737         if (invcntl->param.share > 0) {
 738                 shmdt(invcntl->iindex);
 739                 invcntl->iindex = NULL;
 740         }
 741 #endif
 742         if (invcntl->iindex != NULL)
 743                 free(invcntl->iindex);
 744         free(invcntl->logblk);
 745 }
 746 
 747 /* invstep steps the inverted file forward one item */
 748 void
 749 invstep(INVCONTROL *invcntl)
 750 {
 751         if (invcntl->keypnt < (*(int *)invcntl->logblk - 1)) {
 752                 invcntl->keypnt++;
 753                 return;
 754         }
 755 
 756         /* move forward a block else wrap */
 757         read_logblock(invcntl, *(int *)(invcntl->logblk + sizeof (long)));
 758 
 759         invcntl->keypnt = 0;
 760 }
 761 
 762 /* invforward moves forward one term in the inverted file  */
 763 int
 764 invforward(INVCONTROL *invcntl)
 765 {
 766         invstep(invcntl);
 767         /* skip things with 0 postings */
 768         while (((ENTRY *)(invcntl->logblk + 12) + invcntl->keypnt)->post == 0) {
 769                 invstep(invcntl);
 770         }
 771         /* Check for having wrapped - reached start of inverted file! */
 772         if ((invcntl->numblk == 0) && (invcntl->keypnt == 0))
 773                 return (0);
 774         return (1);
 775 }
 776 
 777 /*  invterm gets the present term from the present logical block  */
 778 int
 779 invterm(INVCONTROL *invcntl, char *term)
 780 {
 781         ENTRY * entryptr;
 782 
 783         entryptr = (ENTRY *)(invcntl->logblk + 12) + invcntl->keypnt;
 784         (void) strncpy(term, entryptr->offset + invcntl->logblk,
 785             (int)entryptr->size);
 786         *(term + entryptr->size) = '\0';
 787         return (entryptr->post);
 788 }
 789 
 790 /* invfind searches for an individual item in the inverted file */
 791 long
 792 invfind(INVCONTROL *invcntl, char *searchterm)
 793 {
 794         int     imid, ilow, ihigh;
 795         long    num;
 796         int     i;
 797         unsigned long   *intptr, *intptr2;
 798         ENTRY *entryptr;
 799 
 800         /* make sure it is initialized via invready  */
 801         if (invcntl->invfile == 0)
 802                 return (-1L);
 803 
 804         /* now search for the appropriate finger block */
 805         intptr = (unsigned long *)invcntl->iindex;
 806 
 807         ilow = 0;
 808         ihigh = *intptr++ - 1;
 809         while (ilow <= ihigh) {
 810                 imid = (ilow + ihigh) / 2;
 811                 intptr2 = intptr + imid;
 812                 i = strcmp(searchterm, (invcntl->iindex + *intptr2));
 813                 if (i < 0)
 814                         ihigh = imid - 1;
 815                 else if (i > 0)
 816                         ilow = ++imid;
 817                 else {
 818                         ilow = imid + 1;
 819                         break;
 820                 }
 821         }
 822         /* be careful about case where searchterm is after last in this block */
 823         imid = (ilow) ? ilow - 1 : 0;
 824 
 825         /* fetch the appropriate logical block if not in core  */
 826         read_logblock(invcntl, imid);
 827 
 828 srch_ext:
 829         /* now find the term in this block. tricky this  */
 830         intptr = (unsigned long *)invcntl->logblk;
 831 
 832         ilow = 0;
 833         ihigh = *intptr - 1;
 834         intptr += 3;
 835         num = 0;
 836         while (ilow <= ihigh) {
 837                 imid = (ilow + ihigh) / 2;
 838                 entryptr = (ENTRY *)intptr + imid;
 839                 i = strncmp(searchterm, (invcntl->logblk + entryptr->offset),
 840                     (int)entryptr->size);
 841                 if (i == 0)
 842                         i = strlen(searchterm) - entryptr->size;
 843                 if (i < 0)
 844                         ihigh = imid - 1;
 845                 else if (i > 0)
 846                         ilow = ++imid;
 847                 else {
 848                         num = entryptr->post;
 849                         break;
 850                 }
 851         }
 852         /* be careful about case where searchterm is after last in this block */
 853         if (imid >= *(long *)invcntl->logblk) {
 854                 invcntl->keypnt = *(long *)invcntl->logblk;
 855                 invstep(invcntl);
 856                 /* note if this happens the term could be in extended block */
 857                 if (invcntl->param.startbyte <
 858                     invcntl->numblk * invcntl->param.sizeblk)
 859                         goto srch_ext;
 860         } else
 861                 invcntl->keypnt = imid;
 862         return (num);
 863 }
 864 
 865 #if DEBUG
 866 
 867 /* invdump dumps the block the term parameter is in */
 868 void
 869 invdump(INVCONTROL *invcntl, char *term)
 870 {
 871         long    i, j, n, *longptr;
 872         ENTRY * entryptr;
 873         char    temp[512], *ptr;
 874 
 875         /* dump superindex if term is "-"  */
 876         if (*term == '-') {
 877                 j = atoi(term + 1);
 878                 longptr = (long *)invcntl->iindex;
 879                 n = *longptr++;
 880                 (void) printf("Superindex dump, num blocks=%ld\n", n);
 881                 longptr += j;
 882                 while ((longptr <= ((long *)invcntl->iindex) + n) &&
 883                     invbreak == 0) {
 884                         (void) printf("%2ld  %6ld %s\n", j++, *longptr,
 885                             invcntl->iindex + *longptr);
 886                         longptr++;
 887                 }
 888                 return;
 889         } else if (*term == '#') {
 890                 j = atoi(term + 1);
 891                 /* fetch the appropriate logical block */
 892                 read_logblock(invcntl, j);
 893         } else
 894                 i = abs((int)invfind(invcntl, term));
 895         longptr = (long *)invcntl->logblk;
 896         n = *longptr++;
 897         (void) printf("Entry term to invdump=%s, postings=%ld, "
 898             "forward ptr=%ld, back ptr=%ld\n", term, i, *(longptr),
 899             *(longptr + 1));
 900         entryptr = (ENTRY *)(invcntl->logblk + 12);
 901         (void) printf("%ld terms in this block, block=%ld\n", n,
 902             invcntl->numblk);
 903         (void) printf("\tterm\t\t\tposts\tsize\toffset\tspace\t1st word\n");
 904         for (j = 0; j < n && invbreak == 0; j++) {
 905                 ptr = invcntl->logblk + entryptr->offset;
 906                 (void) strncpy(temp, ptr, (int)entryptr->size);
 907                 temp[entryptr->size] = '\0';
 908                 ptr += (sizeof (long) *
 909                     (long)((entryptr->size +
 910                     (sizeof (long) - 1)) / sizeof (long)));
 911                 (void) printf("%2ld  %-24s\t%5ld\t%3d\t%d\t%d\t%ld\n", j, temp,
 912                     entryptr->post, entryptr->size, entryptr->offset,
 913                     entryptr->space, *(long *)ptr);
 914                 entryptr++;
 915         }
 916 }
 917 #endif
 918 
 919 static int
 920 boolready(void)
 921 {
 922         numitems = 0;
 923         if (item1 != NULL)
 924                 free(item1);
 925         setsize1 = SETINC;
 926         if ((item1 = (POSTING *)malloc(SETINC * sizeof (POSTING))) == NULL) {
 927                 invcannotalloc(SETINC);
 928                 return (-1);
 929         }
 930         if (item2 != NULL)
 931                 free(item2);
 932         setsize2 = SETINC;
 933         if ((item2 = (POSTING *)malloc(SETINC * sizeof (POSTING))) == NULL) {
 934                 invcannotalloc(SETINC);
 935                 return (-1);
 936         }
 937         item = item1;
 938         enditem = item;
 939         return (0);
 940 }
 941 
 942 void
 943 boolclear(void)
 944 {
 945         numitems = 0;
 946         item = item1;
 947         enditem = item;
 948 }
 949 
 950 POSTING *
 951 boolfile(INVCONTROL *invcntl, long *num, int bool)
 952 {
 953         ENTRY   *entryptr;
 954         FILE    *file;
 955         char    *ptr;
 956         unsigned long   *ptr2;
 957         POSTING *newitem;
 958         POSTING posting;
 959         unsigned u;
 960         POSTING *newsetp, *set1p;
 961         long    newsetc, set1c, set2c;
 962 
 963         entryptr = (ENTRY *) (invcntl->logblk + 12) + invcntl->keypnt;
 964         ptr = invcntl->logblk + entryptr->offset;
 965         ptr2 = ((unsigned long *)ptr) +
 966             (entryptr->size + (sizeof (long) - 1)) / sizeof (long);
 967         *num = entryptr->post;
 968         switch (bool) {
 969         case OR:
 970         case NOT:
 971                 if (*num == 0) {
 972                         *num = numitems;
 973                         return (item);
 974                 }
 975         }
 976         /* make room for the new set */
 977         u = 0;
 978         switch (bool) {
 979         case AND:
 980         case NOT:
 981                 newsetp = set1p = item;
 982                 break;
 983 
 984         case OR:
 985                 u = enditem - item;
 986                 /* FALLTHROUGH */
 987         case REVERSENOT:
 988                 u += *num;
 989                 if (item == item2) {
 990                         if (u > setsize1) {
 991                                 u += SETINC;
 992                                 if ((item1 = (POSTING *) realloc(item1,
 993                                     u * sizeof (POSTING))) == NULL) {
 994                                         goto cannotalloc;
 995                                 }
 996                                 setsize1 = u;
 997                         }
 998                         newitem = item1;
 999                 } else {
1000                         if (u > setsize2) {
1001                                 u += SETINC;
1002                                 if ((item2 = (POSTING *)realloc(item2,
1003                                     u * sizeof (POSTING))) == NULL) {
1004                                 cannotalloc:
1005                                         invcannotalloc(u * sizeof (POSTING));
1006                                         (void) boolready();
1007                                         *num = -1;
1008                                         return (NULL);
1009                                 }
1010                                 setsize2 = u;
1011                         }
1012                         newitem = item2;
1013                 }
1014                 set1p = item;
1015                 newsetp = newitem;
1016         }
1017         file = invcntl->postfile;
1018         (void) fseek(file, (long)*ptr2, SEEK_SET);
1019         read_next_posting(invcntl, &posting);
1020         newsetc = 0;
1021         switch (bool) {
1022         case OR:
1023                 /* while something in both sets */
1024                 set1p = item;
1025                 newsetp = newitem;
1026                 for (set1c = 0, set2c = 0;
1027                     set1c < numitems && set2c < *num; newsetc++) {
1028                         if (set1p->lineoffset < posting.lineoffset) {
1029                                 *newsetp++ = *set1p++;
1030                                 set1c++;
1031                         } else if (set1p->lineoffset > posting.lineoffset) {
1032                                 *newsetp++ = posting;
1033                                 read_next_posting(invcntl, &posting);
1034                                 set2c++;
1035                         } else if (set1p->type < posting.type) {
1036                                 *newsetp++ = *set1p++;
1037                                 set1c++;
1038                         } else if (set1p->type > posting.type) {
1039                                 *newsetp++ = posting;
1040                                 read_next_posting(invcntl, &posting);
1041                                 set2c++;
1042                         } else {        /* identical postings */
1043                                 *newsetp++ = *set1p++;
1044                                 set1c++;
1045                                 read_next_posting(invcntl, &posting);
1046                                 set2c++;
1047                         }
1048                 }
1049                 /* find out what ran out and move the rest in */
1050                 if (set1c < numitems) {
1051                         newsetc += numitems - set1c;
1052                         while (set1c++ < numitems) {
1053                                 *newsetp++ = *set1p++;
1054                         }
1055                 } else {
1056                         while (set2c++ < *num) {
1057                                 *newsetp++ = posting;
1058                                 newsetc++;
1059                                 read_next_posting(invcntl, &posting);
1060                         }
1061                 }
1062                 item = newitem;
1063                 break; /* end of OR */
1064 #if 0
1065         case AND:
1066                 set1c = 0;
1067                 set2c = 0;
1068                 while (set1c < numitems && set2c < *num) {
1069                         if (set1p->lineoffset < posting.lineoffset) {
1070                                 set1p++;
1071                                 set1c++;
1072                         } else if (set1p->lineoffset > posting.lineoffset) {
1073                                 read_next_posting(invcntl, &posting);
1074                                 set2c++;
1075                         } else if (set1p->type < posting.type)  {
1076                                 *set1p++;
1077                                 set1c++;
1078                         } else if (set1p->type > posting.type) {
1079                                 read_next_posting(invcntl, &posting);
1080                                 set2c++;
1081                         } else {        /* identical postings */
1082                                 *newsetp++ = *set1p++;
1083                                 newsetc++;
1084                                 set1c++;
1085                                 read_next_posting(invcntl, &posting);
1086                                 set2c++;
1087                         }
1088                 }
1089                 break; /* end of AND */
1090 
1091         case NOT:
1092                 set1c = 0;
1093                 set2c = 0;
1094                 while (set1c < numitems && set2c < *num) {
1095                         if (set1p->lineoffset < posting.lineoffset) {
1096                                 *newsetp++ = *set1p++;
1097                                 newsetc++;
1098                                 set1c++;
1099                         } else if (set1p->lineoffset > posting.lineoffset) {
1100                                 read_next_posting(invcntl, &posting);
1101                                 set2c++;
1102                         } else if (set1p->type < posting.type) {
1103                                 *newsetp++ = *set1p++;
1104                                 newsetc++;
1105                                 set1c++;
1106                         } else if (set1p->type > posting.type) {
1107                                 read_next_posting(invcntl, &posting);
1108                                 set2c++;
1109                         } else {        /* identical postings */
1110                                 set1c++;
1111                                 set1p++;
1112                                 read_next_posting(invcntl, &posting);
1113                                 set2c++;
1114                         }
1115                 }
1116                 newsetc += numitems - set1c;
1117                 while (set1c++ < numitems) {
1118                         *newsetp++ = *set1p++;
1119                 }
1120                 break; /* end of NOT */
1121 
1122         case REVERSENOT:  /* core NOT incoming set */
1123                 set1c = 0;
1124                 set2c = 0;
1125                 while (set1c < numitems && set2c < *num) {
1126                         if (set1p->lineoffset < posting.lineoffset) {
1127                                 set1p++;
1128                                 set1c++;
1129                         } else if (set1p->lineoffset > posting.lineoffset) {
1130                                 *newsetp++ = posting;
1131                                 read_next_posting(invcntl, &posting);
1132                                 set2c++;
1133                         } else if (set1p->type < posting.type) {
1134                                 set1p++;
1135                                 set1c++;
1136                         } else if (set1p->type > posting.type) {
1137                                 *newsetp++ = posting;
1138                                 read_next_posting(invcntl, &posting);
1139                                 set2c++;
1140                         } else {        /* identical postings */
1141                                 set1c++;
1142                                 set1p++;
1143                                 read_next_posting(invcntl, &posting);
1144                                 set2c++;
1145                         }
1146                 }
1147                 while (set2c++ < *num) {
1148                         *newsetp++ = posting;
1149                         newsetc++;
1150                         read_next_posting(invcntl, &posting);
1151                 }
1152                 item = newitem;
1153                 break; /* end of REVERSENOT  */
1154 #endif
1155         }
1156         numitems = newsetc;
1157         *num = newsetc;
1158         enditem = (POSTING *)newsetp;
1159         return ((POSTING *)item);
1160 }
1161 
1162 #if 0
1163 POSTING *
1164 boolsave(int clear)
1165 {
1166         int     i;
1167         POSTING *ptr;
1168         POSTING *oldstuff, *newstuff;
1169 
1170         if (numitems == 0) {
1171                 if (clear)
1172                         boolclear();
1173                 return (NULL);
1174         }
1175         /*
1176          * if clear then give them what we have and use (void)
1177          * boolready to realloc
1178          */
1179         if (clear) {
1180                 ptr = item;
1181                 /* free up the space we didn't give them */
1182                 if (item == item1)
1183                         item1 = NULL;
1184                 else
1185                         item2 = NULL;
1186                 (void) boolready();
1187                 return (ptr);
1188         }
1189         i = (enditem - item) * sizeof (POSTING) + 100;
1190         if ((ptr = (POSTING *)malloc(i))r == NULL) {
1191                 invcannotalloc(i);
1192                 return (ptr);
1193         }
1194         /* move present set into place  */
1195         oldstuff = item;
1196         newstuff = ptr;
1197         while (oldstuff < enditem)
1198                 *newstuff++ = *oldstuff++;
1199         return (ptr);
1200 }
1201 #endif
1202 
1203 static void
1204 invcannotalloc(size_t n)
1205 {
1206         (void) fprintf(stderr, "%s: cannot allocate %u bytes\n", argv0, n);
1207 }
1208 
1209 static void
1210 invcannotopen(char *file)
1211 {
1212         (void) fprintf(stderr, "%s: cannot open file %s\n", argv0, file);
1213 }
1214 
1215 static void
1216 invcannotwrite(char *file)
1217 {
1218         (void) perror(argv0);   /* must be first to preserve errno */
1219         (void) fprintf(stderr, "%s: write to file %s failed\n", argv0, file);
1220 }