1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License, Version 1.0 only
   6  * (the "License").  You may not use this file except in compliance
   7  * with the License.
   8  *
   9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  10  * or http://www.opensolaris.org/os/licensing.
  11  * See the License for the specific language governing permissions
  12  * and limitations under the License.
  13  *
  14  * When distributing Covered Code, include this CDDL HEADER in each
  15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  16  * If applicable, add the following below this CDDL HEADER, with the
  17  * fields enclosed by brackets "[]" replaced with your own identifying
  18  * information: Portions Copyright [yyyy] [name of copyright owner]
  19  *
  20  * CDDL HEADER END
  21  */
  22 /*      Copyright (c) 1988 AT&T     */
  23 /*        All Rights Reserved   */
  24 
  25 
  26 /*
  27  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
  28  * Use is subject to license terms.
  29  */
  30 
  31 #include <ctype.h>
  32 #include <stdio.h>
  33 #include <sys/types.h>
  34 #include <sys/sysmacros.h>
  35 #include <sys/byteorder.h>
  36 #if SHARE
  37 #include <sys/ipc.h>
  38 #include <sys/shm.h>
  39 #define ERR  -1
  40 #endif
  41 #include "invlib.h"
  42 #include "library.h"
  43 
  44 #define DEBUG           0       /* debugging code and realloc messages */
  45 #define BLOCKSIZE       2 * BUFSIZ      /* logical block size */
  46 #define LINEMAX         1000    /* sorted posting line max size */
  47 #define POSTINC         10000   /* posting buffer size increment */
  48 #define SEP             ' '     /* sorted posting field separator */
  49 #define SETINC          100     /* posting set size increment */
  50 #define STATS           0       /* print statistics */
  51 #define SUPERINC        10000   /* super index size increment */
  52 #define TERMMAX         512     /* term max size */
  53 #define VERSION         1       /* inverted index format version */
  54 #define ZIPFSIZE        200     /* zipf curve size */
  55 #define FREAD   "r"             /* fopen for reading */
  56 #define FREADP  "r+"            /* fopen for update */
  57 #define FWRITE  "w"             /* fopen truncate or create for writing */
  58 #define FWRITEP "w+"            /* fopen truncate or create for update */
  59 
  60 extern  char    *argv0; /* command name (must be set in main function) */
  61 
  62 int     invbreak;
  63 
  64 #if STATS
  65 int     showzipf;       /* show postings per term distribution */
  66 #endif
  67 
  68 static  POSTING *item, *enditem, *item1 = NULL, *item2 = NULL;
  69 static  unsigned setsize1, setsize2;
  70 static  long    numitems, totterm, zerolong;
  71 static  char    *indexfile, *postingfile;
  72 static  FILE    *outfile, *fpost;
  73 static  unsigned supersize = SUPERINC, supintsize;
  74 static  int     numpost, numlogblk, amtused, nextpost,
  75                 lastinblk, numinvitems;
  76 static  POSTING *POST, *postptr;
  77 static  unsigned long   *SUPINT, *supint, nextsupfing;
  78 static  char    *SUPFING, *supfing;
  79 static  char    thisterm[TERMMAX];
  80 static  union {
  81         long    invblk[BLOCKSIZE / sizeof (long)];
  82         char    chrblk[BLOCKSIZE];
  83 } logicalblk;
  84 
  85 #if DEBUG || STATS
  86 static  long    totpost;
  87 #endif
  88 
  89 #if STATS
  90 static  int     zipf[ZIPFSIZE + 1];
  91 #endif
  92 
  93 static void invcannotalloc(size_t n);
  94 static void invcannotopen(char *file);
  95 static void invcannotwrite(char *file);
  96 static int invnewterm(void);
  97 static int boolready(void);
  98 
  99 long
 100 invmake(char *invname, char *invpost, FILE *infile)
 101 {
 102         unsigned char   *s;
 103         long    num;
 104         int     i;
 105         long    fileindex;
 106         unsigned postsize = POSTINC * sizeof (POSTING);
 107         unsigned long   *intptr;
 108         char    line[LINEMAX];
 109         long    tlong;
 110         PARAM   param;
 111         POSTING posting;
 112 #if STATS
 113         int     j;
 114         unsigned maxtermlen = 0;
 115 #endif
 116         /* output file */
 117         if ((outfile = vpfopen(invname, FWRITEP)) == NULL) {
 118                 invcannotopen(invname);
 119                 return (0);
 120         }
 121         indexfile = invname;
 122         (void) fseek(outfile, (long)BUFSIZ, 0);
 123 
 124         /* posting file  */
 125         if ((fpost = vpfopen(invpost, FWRITE)) == NULL) {
 126                 invcannotopen(invpost);
 127                 return (0);
 128         }
 129         postingfile = invpost;
 130         nextpost = 0;
 131         /* get space for the postings list */
 132         if ((POST = (POSTING *)malloc(postsize)) == NULL) {
 133                 invcannotalloc(postsize);
 134                 return (0);
 135         }
 136         postptr = POST;
 137         /* get space for the superfinger (superindex) */
 138         if ((SUPFING = malloc(supersize)) == NULL) {
 139                 invcannotalloc(supersize);
 140                 return (0);
 141         }
 142         supfing = SUPFING;
 143         supintsize = supersize / 40;
 144         /* also for the superfinger index */
 145         if ((SUPINT = malloc(supintsize * sizeof (long))) == NULL) {
 146                 invcannotalloc(supintsize * sizeof (long));
 147                 return (0);
 148         }
 149         supint = SUPINT;
 150         supint++; /* leave first term open for a count */
 151         /* initialize using an empty term */
 152         (void) strcpy(thisterm, "");
 153         *supint++ = 0;
 154         *supfing++ = ' ';
 155         *supfing++ = '\0';
 156         nextsupfing = 2;
 157 #if DEBUG || STATS
 158         totpost = 0L;
 159 #endif
 160         totterm = 0L;
 161         numpost = 1;
 162 
 163         /*
 164          * set up as though a block had come and gone, i.e., set up for
 165          * new block
 166          */
 167         amtused = 16; /* leave no space - init 3 words + one for luck */
 168         numinvitems = 0;
 169         numlogblk = 0;
 170         lastinblk = BLOCKSIZE;
 171 
 172         /* now loop as long as more to read (till eof)  */
 173         while (fgets(line, LINEMAX, infile) != NULL) {
 174 #if DEBUG || STATS
 175                 ++totpost;
 176 #endif
 177                 s = (unsigned char *) strchr(line, SEP);
 178                 if (s == NULL)          /* where did this line come from ??? */
 179                         continue;       /* workaround: just skip it */
 180                 *s = '\0';
 181 #if STATS
 182                 if ((i = strlen(line)) > maxtermlen) {
 183                         maxtermlen = i;
 184                 }
 185 #endif
 186 #if DEBUG
 187                 (void) printf("%ld: %s ", totpost, line);
 188                 (void) fflush(stdout);
 189 #endif
 190                 if (strcmp(thisterm, line) == 0) {
 191                         if (postptr + 10 > POST + postsize / sizeof (POSTING)) {
 192                                 i = postptr - POST;
 193                                 postsize += POSTINC * sizeof (POSTING);
 194                                 if ((POST = realloc(POST, postsize)) == NULL) {
 195                                         invcannotalloc(postsize);
 196                                         return (0);
 197                                 }
 198                                 postptr = i + POST;
 199 #if DEBUG
 200                                 (void) printf("reallocated post space to %u, "
 201                                     "totpost=%ld\n", postsize, totpost);
 202 #endif
 203                         }
 204                         numpost++;
 205                 } else {
 206                         /* have a new term */
 207                         if (!invnewterm()) {
 208                                 return (0);
 209                         }
 210                         (void) strcpy(thisterm, line);
 211                         numpost = 1;
 212                         postptr = POST;
 213                         fileindex = 0;
 214                 }
 215                 /* get the new posting */
 216                 num = *++s - '!';
 217                 i = 1;
 218                 do {
 219                         num = BASE * num + *++s - '!';
 220                 } while (++i < PRECISION);
 221                 posting.lineoffset = num;
 222                 while (++fileindex < nsrcoffset && num > srcoffset[fileindex]) {
 223                         ;
 224                 }
 225                 posting.fileindex = --fileindex;
 226                 posting.type = *++s;
 227                 num = *++s - '!';
 228                 if (*s != '\n') {
 229                         num = *++s - '!';
 230                         while (*++s != '\n') {
 231                                 num = BASE * num + *s - '!';
 232                         }
 233                         posting.fcnoffset = num;
 234                 } else {
 235                         posting.fcnoffset = 0;
 236                 }
 237                 *postptr++ = posting;
 238 #if DEBUG
 239                 (void) printf("%ld %ld %ld %ld\n", posting.fileindex,
 240                     posting.fcnoffset, posting.lineoffset, posting.type);
 241                 (void) fflush(stdout);
 242 #endif
 243         }
 244         if (!invnewterm()) {
 245                 return (0);
 246         }
 247         /* now clean up final block  */
 248         logicalblk.invblk[0] = numinvitems;
 249         /* loops pointer around to start */
 250         logicalblk.invblk[1] = 0;
 251         logicalblk.invblk[2] = numlogblk - 1;
 252         if (fwrite((char *)&logicalblk, BLOCKSIZE, 1, outfile) == 0) {
 253                 goto cannotwrite;
 254         }
 255         numlogblk++;
 256         /* write out block to save space. what in it doesn't matter */
 257         if (fwrite((char *)&logicalblk, BLOCKSIZE, 1, outfile) == 0) {
 258                 goto cannotwrite;
 259         }
 260         /* finish up the super finger */
 261         *SUPINT = numlogblk;
 262         /* add to the offsets the size of the offset pointers */
 263         intptr = (SUPINT + 1);
 264         i = (char *)supint - (char *)SUPINT;
 265         while (intptr < supint)
 266                 *intptr++ += i;
 267         /* write out the offsets (1 for the N at start) and the super finger */
 268         if (fwrite((char *)SUPINT, sizeof (*SUPINT), numlogblk + 1,
 269             outfile) == 0 ||
 270             fwrite(SUPFING, 1, supfing - SUPFING, outfile) == 0) {
 271                 goto cannotwrite;
 272         }
 273         /* save the size for reference later */
 274         nextsupfing = sizeof (long) + sizeof (long) * numlogblk +
 275             (supfing - SUPFING);
 276         /*
 277          * make sure the file ends at a logical block boundary.  This is
 278          * necessary for invinsert to correctly create extended blocks
 279          */
 280         i = nextsupfing % BLOCKSIZE;
 281         /* write out junk to fill log blk */
 282         if (fwrite(SUPFING, BLOCKSIZE - i, 1, outfile) == 0 ||
 283             fflush(outfile) == EOF) {
 284                 /* rewind doesn't check for write failure */
 285                 goto cannotwrite;
 286         }
 287         /* write the control area */
 288         rewind(outfile);
 289         param.version = VERSION;
 290         param.filestat = 0;
 291         param.sizeblk = BLOCKSIZE;
 292         param.startbyte = (numlogblk + 1) * BLOCKSIZE + BUFSIZ;
 293         param.supsize = nextsupfing;
 294         param.cntlsize = BUFSIZ;
 295         param.share = 0;
 296         if (fwrite((char *)&param, sizeof (param), 1, outfile) == 0) {
 297                 goto cannotwrite;
 298         }
 299         for (i = 0; i < 10; i++)     /* for future use */
 300                 if (fwrite((char *)&zerolong, sizeof (zerolong),
 301                     1, outfile) == 0) {
 302                         goto cannotwrite;
 303                 }
 304 
 305         /* make first block loop backwards to last block */
 306         if (fflush(outfile) == EOF) {
 307                 /* fseek doesn't check for write failure */
 308                 goto cannotwrite;
 309         }
 310         /* get to second word first block */
 311         (void) fseek(outfile, (long)BUFSIZ + 8, 0);
 312         tlong = numlogblk - 1;
 313         if (fwrite((char *)&tlong, sizeof (tlong), 1, outfile) == 0 ||
 314             fclose(outfile) == EOF) {
 315 cannotwrite:
 316                 invcannotwrite(invname);
 317                 return (0);
 318         }
 319         if (fclose(fpost) == EOF) {
 320                 invcannotwrite(postingfile);
 321                 return (0);
 322         }
 323         --totterm;      /* don't count null term */
 324 #if STATS
 325         (void) printf("logical blocks = %d, postings = %ld, terms = %ld, "
 326             "max term length = %d\n", numlogblk, totpost, totterm, maxtermlen);
 327         if (showzipf) {
 328                 (void) printf(
 329                     "\n*************   ZIPF curve  ****************\n");
 330                 for (j = ZIPFSIZE; j > 1; j--)
 331                         if (zipf[j])
 332                                 break;
 333                 for (i = 1; i < j; ++i) {
 334                         (void) printf("%3d -%6d ", i, zipf[i]);
 335                         if (i % 6 == 0) (void) putchar('\n');
 336                 }
 337                 (void) printf(">%d-%6d\n", ZIPFSIZE, zipf[0]);
 338         }
 339 #endif
 340         /* free all malloc'd memory */
 341         free(POST);
 342         free(SUPFING);
 343         free(SUPINT);
 344         return (totterm);
 345 }
 346 
 347 /* add a term to the data base */
 348 
 349 static int
 350 invnewterm(void)
 351 {
 352         int     backupflag, i, j, maxback, holditems, gooditems, howfar;
 353         int     len, numwilluse, wdlen;
 354         char    *tptr, *tptr2, *tptr3;
 355         union {
 356                 unsigned long   packword[2];
 357                 ENTRY           e;
 358         } iteminfo;
 359 
 360         totterm++;
 361 #if STATS
 362         /* keep zipfian info on the distribution */
 363         if (numpost <= ZIPFSIZE)
 364                 zipf[numpost]++;
 365         else
 366                 zipf[0]++;
 367 #endif
 368         len = strlen(thisterm);
 369         wdlen = (len + (sizeof (long) - 1)) / sizeof (long);
 370         numwilluse = (wdlen + 3) * sizeof (long);
 371         /* new block if at least 1 item in block */
 372         if (numinvitems && numwilluse + amtused > BLOCKSIZE) {
 373                 /* set up new block */
 374                 if (supfing + 500 > SUPFING + supersize) {
 375                         i = supfing - SUPFING;
 376                         supersize += 20000;
 377                         if ((SUPFING = realloc(SUPFING, supersize)) == NULL) {
 378                                 invcannotalloc(supersize);
 379                                 return (0);
 380                         }
 381                         supfing = i + SUPFING;
 382 #if DEBUG
 383                         (void) printf("reallocated superfinger space to %d, "
 384                             "totpost=%ld\n", supersize, totpost);
 385 #endif
 386                 }
 387                 /* check that room for the offset as well */
 388                 if ((numlogblk + 10) > supintsize) {
 389                         i = supint - SUPINT;
 390                         supintsize += SUPERINC;
 391                         if ((SUPINT = realloc((char *)SUPINT,
 392                             supintsize * sizeof (long))) == NULL) {
 393                                 invcannotalloc(supintsize * sizeof (long));
 394                                 return (0);
 395                         }
 396                         supint = i + SUPINT;
 397 #if DEBUG
 398                         (void) printf("reallocated superfinger offset to %d, "
 399                             "totpost = %ld\n", supintsize * sizeof (long),
 400                             totpost);
 401 #endif
 402                 }
 403                 /* See if backup is efficatious  */
 404                 backupflag = 0;
 405                 maxback = strlen(thisterm) / 10;
 406                 holditems = numinvitems;
 407                 if (maxback > numinvitems)
 408                         maxback = numinvitems - 2;
 409                 howfar = 0;
 410                 while (--maxback > 0) {
 411                         howfar++;
 412                         iteminfo.packword[0] =
 413                             logicalblk.invblk[--holditems * 2 +
 414                             (sizeof (long) - 1)];
 415                         if ((i = iteminfo.e.size / 10) < maxback) {
 416                                 maxback = i;
 417                                 backupflag = howfar;
 418                                 gooditems = holditems;
 419                                 tptr2 = logicalblk.chrblk + iteminfo.e.offset;
 420                         }
 421                 }
 422                 /* see if backup will occur  */
 423                 if (backupflag) {
 424                         numinvitems = gooditems;
 425                 }
 426                 logicalblk.invblk[0] = numinvitems;
 427                 /* set forward pointer pointing to next */
 428                 logicalblk.invblk[1] = numlogblk + 1;
 429                 /* set back pointer to last block */
 430                 logicalblk.invblk[2] = numlogblk - 1;
 431                 if (fwrite((char *)logicalblk.chrblk, 1,
 432                     BLOCKSIZE, outfile) == 0) {
 433                         invcannotwrite(indexfile);
 434                         return (0);
 435                 }
 436                 amtused = 16;
 437                 numlogblk++;
 438                 /* check if had to back up, if so do it */
 439                 if (backupflag) {
 440                         /* find out where the end of the new block is */
 441                         iteminfo.packword[0] =
 442                             logicalblk.invblk[numinvitems * 2 + 1];
 443                         tptr3 = logicalblk.chrblk + iteminfo.e.offset;
 444                         /* move the index for this block */
 445                         for (i = 3; i <= (backupflag * 2 + 2); i++) {
 446                                 logicalblk.invblk[i] =
 447                                     logicalblk.invblk[numinvitems * 2+i];
 448                         }
 449                         /* move the word into the super index */
 450                         iteminfo.packword[0] = logicalblk.invblk[3];
 451                         iteminfo.packword[1] = logicalblk.invblk[4];
 452                         tptr2 = logicalblk.chrblk + iteminfo.e.offset;
 453                         (void) strncpy(supfing, tptr2, (int)iteminfo.e.size);
 454                         *(supfing + iteminfo.e.size) = '\0';
 455 #if DEBUG
 456                         (void) printf("backup %d at term=%s to term=%s\n",
 457                             backupflag, thisterm, supfing);
 458 #endif
 459                         *supint++ = nextsupfing;
 460                         nextsupfing += strlen(supfing) + 1;
 461                         supfing += strlen(supfing) + 1;
 462                         /* now fix up the logical block */
 463                         tptr = logicalblk.chrblk + lastinblk;
 464                         lastinblk = BLOCKSIZE;
 465                         tptr2 = logicalblk.chrblk + lastinblk;
 466                         j = tptr3 - tptr;
 467                         while (tptr3 > tptr)
 468                                 *--tptr2 = *--tptr3;
 469                         lastinblk -= j;
 470                         amtused += (8 * backupflag + j);
 471                         for (i = 3; i < (backupflag * 2 + 2); i += 2) {
 472                                 iteminfo.packword[0] = logicalblk.invblk[i];
 473                                 iteminfo.e.offset += (tptr2 - tptr3);
 474                                 logicalblk.invblk[i] = iteminfo.packword[0];
 475                         }
 476                         numinvitems = backupflag;
 477                 } else { /* no backup needed */
 478                         numinvitems = 0;
 479                         lastinblk = BLOCKSIZE;
 480                         /* add new term to superindex */
 481                         (void) strcpy(supfing, thisterm);
 482                         supfing += strlen(thisterm) + 1;
 483                         *supint++ = nextsupfing;
 484                         nextsupfing += strlen(thisterm) + 1;
 485                 }
 486         }
 487         lastinblk -= (numwilluse - 8);
 488         iteminfo.e.offset = lastinblk;
 489         iteminfo.e.size = (char)len;
 490         iteminfo.e.space = 0;
 491         iteminfo.e.post = numpost;
 492         (void) strncpy(logicalblk.chrblk + lastinblk, thisterm, len);
 493         amtused += numwilluse;
 494         logicalblk.invblk[(lastinblk/sizeof (long))+wdlen] = nextpost;
 495         if ((i = postptr - POST) > 0) {
 496                 if (fwrite((char *)POST, sizeof (POSTING), i, fpost) == 0) {
 497                         invcannotwrite(postingfile);
 498                         return (0);
 499                 }
 500                 nextpost += i * sizeof (POSTING);
 501         }
 502         logicalblk.invblk[3+2*numinvitems++] = iteminfo.packword[0];
 503         logicalblk.invblk[2+2*numinvitems] = iteminfo.packword[1];
 504         return (1);
 505 }
 506 
 507 static void
 508 swap_ints(void *p, size_t sz)
 509 {
 510         uint32_t *s;
 511         uint32_t *e = (uint32_t *)p + (sz / sizeof (uint32_t));
 512 
 513         for (s = p; s < e; s++)
 514                 *s = BSWAP_32(*s);
 515 }
 516 
 517 static void
 518 write_param(INVCONTROL *invcntl)
 519 {
 520         if (invcntl->swap)
 521                 swap_ints(&invcntl->param, sizeof (invcntl->param));
 522 
 523         rewind(invcntl->invfile);
 524         (void) fwrite((char *)&invcntl->param, sizeof (invcntl->param), 1,
 525             invcntl->invfile);
 526 
 527         if (invcntl->swap)
 528                 swap_ints(&invcntl->param, sizeof (invcntl->param));
 529 }
 530 
 531 static void
 532 read_superfinger(INVCONTROL *invcntl)
 533 {
 534         size_t count;
 535 
 536         (void) fseek(invcntl->invfile, invcntl->param.startbyte, SEEK_SET);
 537         (void) fread(invcntl->iindex, (int)invcntl->param.supsize,
 538             1, invcntl->invfile);
 539 
 540         if (invcntl->swap) {
 541                 /*
 542                  * The superfinger consists of a count, followed by
 543                  * count offsets, followed by a string table (which
 544                  * the offsets reference).
 545                  *
 546                  * We need to swap the count and the offsets.
 547                  */
 548                 count = 1 + BSWAP_32(*(uint32_t *)invcntl->iindex);
 549                 swap_ints(invcntl->iindex, count * sizeof (unsigned long));
 550         }
 551 }
 552 
 553 static void
 554 read_logblock(INVCONTROL *invcntl, int block)
 555 {
 556         /* note always fetch it if the file is busy */
 557         if ((block != invcntl->numblk) ||
 558             (invcntl->param.filestat >= INVBUSY)) {
 559                 (void) fseek(invcntl->invfile,
 560                     (block * invcntl->param.sizeblk) + invcntl->param.cntlsize,
 561                     SEEK_SET);
 562                 invcntl->numblk = block;
 563                 (void) fread(invcntl->logblk, (int)invcntl->param.sizeblk,
 564                     1, invcntl->invfile);
 565 
 566                 if (invcntl->swap) {
 567                         size_t count;
 568                         ENTRY *ecur, *eend;
 569                         uint32_t *postptr;
 570 
 571                         /*
 572                          * A logblock consists of a count, a next block id,
 573                          * and a previous block id, followed by count
 574                          * ENTRYs, followed by alternating strings and
 575                          * offsets.
 576                          */
 577                         swap_ints(invcntl->logblk, 3 * sizeof (unsigned long));
 578 
 579                         count = *(uint32_t *)invcntl->logblk;
 580 
 581                         ecur = (ENTRY *)((uint32_t *)invcntl->logblk + 3);
 582                         eend = ecur + count;
 583 
 584                         for (; ecur < eend; ecur++) {
 585                                 ecur->offset = BSWAP_16(ecur->offset);
 586                                 ecur->post = BSWAP_32(ecur->post);
 587 
 588                                 /*
 589                                  * After the string is the posting offset.
 590                                  */
 591                                 postptr = (uint32_t *)(invcntl->logblk +
 592                                     ecur->offset +
 593                                     P2ROUNDUP(ecur->size, sizeof (long)));
 594 
 595                                 *postptr = BSWAP_32(*postptr);
 596                         }
 597                 }
 598         }
 599 }
 600 
 601 void
 602 read_next_posting(INVCONTROL *invcntl, POSTING *posting)
 603 {
 604         (void) fread((char *)posting, sizeof (*posting), 1, invcntl->postfile);
 605         if (invcntl->swap) {
 606                 posting->lineoffset = BSWAP_32(posting->lineoffset);
 607                 posting->fcnoffset = BSWAP_32(posting->fcnoffset);
 608                 /*
 609                  * fileindex is a 24-bit field, so shift it before swapping
 610                  */
 611                 posting->fileindex = BSWAP_32(posting->fileindex << 8);
 612         }
 613 }
 614 
 615 int
 616 invopen(INVCONTROL *invcntl, char *invname, char *invpost, int stat)
 617 {
 618         int     read_index;
 619 
 620         if ((invcntl->invfile = vpfopen(invname,
 621             ((stat == 0) ? FREAD : FREADP))) == NULL) {
 622                 invcannotopen(invname);
 623                 return (-1);
 624         }
 625         if (fread((char *)&invcntl->param, sizeof (invcntl->param), 1,
 626             invcntl->invfile) == 0) {
 627                 (void) fprintf(stderr, "%s: empty inverted file\n", argv0);
 628                 goto closeinv;
 629         }
 630         if (invcntl->param.version != VERSION &&
 631             BSWAP_32(invcntl->param.version) != VERSION) {
 632                 (void) fprintf(stderr,
 633                     "%s: cannot read old index format; use -U option to "
 634                     "force database to rebuild\n", argv0);
 635                 goto closeinv;
 636         }
 637         invcntl->swap = (invcntl->param.version != VERSION);
 638         if (invcntl->swap)
 639                 swap_ints(&invcntl->param, sizeof (invcntl->param));
 640 
 641         if (stat == 0 && invcntl->param.filestat == INVALONE) {
 642                 (void) fprintf(stderr, "%s: inverted file is locked\n", argv0);
 643                 goto closeinv;
 644         }
 645         if ((invcntl->postfile = vpfopen(invpost,
 646             ((stat == 0) ? FREAD : FREADP))) == NULL) {
 647                 invcannotopen(invpost);
 648                 goto closeinv;
 649         }
 650         /* allocate core for a logical block  */
 651         if ((invcntl->logblk = malloc(invcntl->param.sizeblk)) == NULL) {
 652                 invcannotalloc((unsigned)invcntl->param.sizeblk);
 653                 goto closeboth;
 654         }
 655         /* allocate for and read in superfinger  */
 656         read_index = 1;
 657         invcntl->iindex = NULL;
 658 #if SHARE
 659         if (invcntl->param.share == 1) {
 660                 key_t ftok(), shm_key;
 661                 struct shmid_ds shm_buf;
 662                 char    *shmat();
 663                 int     shm_id;
 664 
 665                 /* see if the shared segment exists */
 666                 shm_key = ftok(invname, 2);
 667                 shm_id = shmget(shm_key, 0, 0);
 668                 /*
 669                  * Failure simply means (hopefully) that segment doesn't
 670                  * exist
 671                  */
 672                 if (shm_id == -1) {
 673                         /*
 674                          * Have to give general write permission due to AMdahl
 675                          * not having protected segments
 676                          */
 677                         shm_id = shmget(shm_key,
 678                             invcntl->param.supsize + sizeof (long),
 679                             IPC_CREAT | 0666);
 680                         if (shm_id == -1)
 681                                 perror("Could not create shared "
 682                                     "memory segment");
 683                 } else
 684                         read_index = 0;
 685 
 686                 if (shm_id != -1) {
 687                         invcntl->iindex = shmat(shm_id, 0,
 688                             ((read_index) ? 0 : SHM_RDONLY));
 689                         if (invcntl->iindex == (char *)ERR) {
 690                                 (void) fprintf(stderr,
 691                                     "%s: shared memory link failed\n", argv0);
 692                                 invcntl->iindex = NULL;
 693                                 read_index = 1;
 694                         }
 695                 }
 696         }
 697 #endif
 698         if (invcntl->iindex == NULL)
 699                 invcntl->iindex = malloc(invcntl->param.supsize + 16);
 700         if (invcntl->iindex == NULL) {
 701                 invcannotalloc((unsigned)invcntl->param.supsize);
 702                 free(invcntl->logblk);
 703                 goto closeboth;
 704         }
 705         if (read_index) {
 706                 read_superfinger(invcntl);
 707         }
 708         invcntl->numblk = -1;
 709         if (boolready() == -1) {
 710         closeboth:
 711                 (void) fclose(invcntl->postfile);
 712         closeinv:
 713                 (void) fclose(invcntl->invfile);
 714                 return (-1);
 715         }
 716         /* write back out the control block if anything changed */
 717         invcntl->param.filestat = stat;
 718         if (stat > invcntl->param.filestat)
 719                 write_param(invcntl);
 720         return (1);
 721 }
 722 
 723 /* invclose must be called to wrap things up and deallocate core */
 724 void
 725 invclose(INVCONTROL *invcntl)
 726 {
 727         /* write out the control block in case anything changed */
 728         if (invcntl->param.filestat > 0) {
 729                 invcntl->param.filestat = 0;
 730                 write_param(invcntl);
 731         }
 732         (void) fclose(invcntl->invfile);
 733         (void) fclose(invcntl->postfile);
 734 #if SHARE
 735         if (invcntl->param.share > 0) {
 736                 shmdt(invcntl->iindex);
 737                 invcntl->iindex = NULL;
 738         }
 739 #endif
 740         if (invcntl->iindex != NULL)
 741                 free(invcntl->iindex);
 742         free(invcntl->logblk);
 743 }
 744 
 745 /* invstep steps the inverted file forward one item */
 746 void
 747 invstep(INVCONTROL *invcntl)
 748 {
 749         if (invcntl->keypnt < (*(int *)invcntl->logblk - 1)) {
 750                 invcntl->keypnt++;
 751                 return;
 752         }
 753 
 754         /* move forward a block else wrap */
 755         read_logblock(invcntl, *(int *)(invcntl->logblk + sizeof (long)));
 756 
 757         invcntl->keypnt = 0;
 758 }
 759 
 760 /* invforward moves forward one term in the inverted file  */
 761 int
 762 invforward(INVCONTROL *invcntl)
 763 {
 764         invstep(invcntl);
 765         /* skip things with 0 postings */
 766         while (((ENTRY *)(invcntl->logblk + 12) + invcntl->keypnt)->post == 0) {
 767                 invstep(invcntl);
 768         }
 769         /* Check for having wrapped - reached start of inverted file! */
 770         if ((invcntl->numblk == 0) && (invcntl->keypnt == 0))
 771                 return (0);
 772         return (1);
 773 }
 774 
 775 /*  invterm gets the present term from the present logical block  */
 776 int
 777 invterm(INVCONTROL *invcntl, char *term)
 778 {
 779         ENTRY * entryptr;
 780 
 781         entryptr = (ENTRY *)(invcntl->logblk + 12) + invcntl->keypnt;
 782         (void) strncpy(term, entryptr->offset + invcntl->logblk,
 783             (int)entryptr->size);
 784         *(term + entryptr->size) = '\0';
 785         return (entryptr->post);
 786 }
 787 
 788 /* invfind searches for an individual item in the inverted file */
 789 long
 790 invfind(INVCONTROL *invcntl, char *searchterm)
 791 {
 792         int     imid, ilow, ihigh;
 793         long    num;
 794         int     i;
 795         unsigned long   *intptr, *intptr2;
 796         ENTRY *entryptr;
 797 
 798         /* make sure it is initialized via invready  */
 799         if (invcntl->invfile == 0)
 800                 return (-1L);
 801 
 802         /* now search for the appropriate finger block */
 803         intptr = (unsigned long *)invcntl->iindex;
 804 
 805         ilow = 0;
 806         ihigh = *intptr++ - 1;
 807         while (ilow <= ihigh) {
 808                 imid = (ilow + ihigh) / 2;
 809                 intptr2 = intptr + imid;
 810                 i = strcmp(searchterm, (invcntl->iindex + *intptr2));
 811                 if (i < 0)
 812                         ihigh = imid - 1;
 813                 else if (i > 0)
 814                         ilow = ++imid;
 815                 else {
 816                         ilow = imid + 1;
 817                         break;
 818                 }
 819         }
 820         /* be careful about case where searchterm is after last in this block */
 821         imid = (ilow) ? ilow - 1 : 0;
 822 
 823         /* fetch the appropriate logical block if not in core  */
 824         read_logblock(invcntl, imid);
 825 
 826 srch_ext:
 827         /* now find the term in this block. tricky this  */
 828         intptr = (unsigned long *)invcntl->logblk;
 829 
 830         ilow = 0;
 831         ihigh = *intptr - 1;
 832         intptr += 3;
 833         num = 0;
 834         while (ilow <= ihigh) {
 835                 imid = (ilow + ihigh) / 2;
 836                 entryptr = (ENTRY *)intptr + imid;
 837                 i = strncmp(searchterm, (invcntl->logblk + entryptr->offset),
 838                     (int)entryptr->size);
 839                 if (i == 0)
 840                         i = strlen(searchterm) - entryptr->size;
 841                 if (i < 0)
 842                         ihigh = imid - 1;
 843                 else if (i > 0)
 844                         ilow = ++imid;
 845                 else {
 846                         num = entryptr->post;
 847                         break;
 848                 }
 849         }
 850         /* be careful about case where searchterm is after last in this block */
 851         if (imid >= *(long *)invcntl->logblk) {
 852                 invcntl->keypnt = *(long *)invcntl->logblk;
 853                 invstep(invcntl);
 854                 /* note if this happens the term could be in extended block */
 855                 if (invcntl->param.startbyte <
 856                     invcntl->numblk * invcntl->param.sizeblk)
 857                         goto srch_ext;
 858         } else
 859                 invcntl->keypnt = imid;
 860         return (num);
 861 }
 862 
 863 #if DEBUG
 864 
 865 /* invdump dumps the block the term parameter is in */
 866 void
 867 invdump(INVCONTROL *invcntl, char *term)
 868 {
 869         long    i, j, n, *longptr;
 870         ENTRY * entryptr;
 871         char    temp[512], *ptr;
 872 
 873         /* dump superindex if term is "-"  */
 874         if (*term == '-') {
 875                 j = atoi(term + 1);
 876                 longptr = (long *)invcntl->iindex;
 877                 n = *longptr++;
 878                 (void) printf("Superindex dump, num blocks=%ld\n", n);
 879                 longptr += j;
 880                 while ((longptr <= ((long *)invcntl->iindex) + n) &&
 881                     invbreak == 0) {
 882                         (void) printf("%2ld  %6ld %s\n", j++, *longptr,
 883                             invcntl->iindex + *longptr);
 884                         longptr++;
 885                 }
 886                 return;
 887         } else if (*term == '#') {
 888                 j = atoi(term + 1);
 889                 /* fetch the appropriate logical block */
 890                 read_logblock(invcntl, j);
 891         } else
 892                 i = abs((int)invfind(invcntl, term));
 893         longptr = (long *)invcntl->logblk;
 894         n = *longptr++;
 895         (void) printf("Entry term to invdump=%s, postings=%ld, "
 896             "forward ptr=%ld, back ptr=%ld\n", term, i, *(longptr),
 897             *(longptr + 1));
 898         entryptr = (ENTRY *)(invcntl->logblk + 12);
 899         (void) printf("%ld terms in this block, block=%ld\n", n,
 900             invcntl->numblk);
 901         (void) printf("\tterm\t\t\tposts\tsize\toffset\tspace\t1st word\n");
 902         for (j = 0; j < n && invbreak == 0; j++) {
 903                 ptr = invcntl->logblk + entryptr->offset;
 904                 (void) strncpy(temp, ptr, (int)entryptr->size);
 905                 temp[entryptr->size] = '\0';
 906                 ptr += (sizeof (long) *
 907                     (long)((entryptr->size +
 908                     (sizeof (long) - 1)) / sizeof (long)));
 909                 (void) printf("%2ld  %-24s\t%5ld\t%3d\t%d\t%d\t%ld\n", j, temp,
 910                     entryptr->post, entryptr->size, entryptr->offset,
 911                     entryptr->space, *(long *)ptr);
 912                 entryptr++;
 913         }
 914 }
 915 #endif
 916 
 917 static int
 918 boolready(void)
 919 {
 920         numitems = 0;
 921         if (item1 != NULL)
 922                 free(item1);
 923         setsize1 = SETINC;
 924         if ((item1 = (POSTING *)malloc(SETINC * sizeof (POSTING))) == NULL) {
 925                 invcannotalloc(SETINC);
 926                 return (-1);
 927         }
 928         if (item2 != NULL)
 929                 free(item2);
 930         setsize2 = SETINC;
 931         if ((item2 = (POSTING *)malloc(SETINC * sizeof (POSTING))) == NULL) {
 932                 invcannotalloc(SETINC);
 933                 return (-1);
 934         }
 935         item = item1;
 936         enditem = item;
 937         return (0);
 938 }
 939 
 940 void
 941 boolclear(void)
 942 {
 943         numitems = 0;
 944         item = item1;
 945         enditem = item;
 946 }
 947 
 948 POSTING *
 949 boolfile(INVCONTROL *invcntl, long *num, int op)
 950 {
 951         ENTRY   *entryptr;
 952         FILE    *file;
 953         char    *ptr;
 954         unsigned long   *ptr2;
 955         POSTING *newitem;
 956         POSTING posting;
 957         unsigned u;
 958         POSTING *newsetp, *set1p;
 959         long    newsetc, set1c, set2c;
 960 
 961         entryptr = (ENTRY *) (invcntl->logblk + 12) + invcntl->keypnt;
 962         ptr = invcntl->logblk + entryptr->offset;
 963         ptr2 = ((unsigned long *)ptr) +
 964             (entryptr->size + (sizeof (long) - 1)) / sizeof (long);
 965         *num = entryptr->post;
 966         switch (op) {
 967         case OR:
 968         case NOT:
 969                 if (*num == 0) {
 970                         *num = numitems;
 971                         return (item);
 972                 }
 973         }
 974         /* make room for the new set */
 975         u = 0;
 976         switch (op) {
 977         case AND:
 978         case NOT:
 979                 newsetp = set1p = item;
 980                 break;
 981 
 982         case OR:
 983                 u = enditem - item;
 984                 /* FALLTHROUGH */
 985         case REVERSENOT:
 986                 u += *num;
 987                 if (item == item2) {
 988                         if (u > setsize1) {
 989                                 u += SETINC;
 990                                 if ((item1 = (POSTING *) realloc(item1,
 991                                     u * sizeof (POSTING))) == NULL) {
 992                                         goto cannotalloc;
 993                                 }
 994                                 setsize1 = u;
 995                         }
 996                         newitem = item1;
 997                 } else {
 998                         if (u > setsize2) {
 999                                 u += SETINC;
1000                                 if ((item2 = (POSTING *)realloc(item2,
1001                                     u * sizeof (POSTING))) == NULL) {
1002                                 cannotalloc:
1003                                         invcannotalloc(u * sizeof (POSTING));
1004                                         (void) boolready();
1005                                         *num = -1;
1006                                         return (NULL);
1007                                 }
1008                                 setsize2 = u;
1009                         }
1010                         newitem = item2;
1011                 }
1012                 set1p = item;
1013                 newsetp = newitem;
1014         }
1015         file = invcntl->postfile;
1016         (void) fseek(file, (long)*ptr2, SEEK_SET);
1017         read_next_posting(invcntl, &posting);
1018         newsetc = 0;
1019         switch (op) {
1020         case OR:
1021                 /* while something in both sets */
1022                 set1p = item;
1023                 newsetp = newitem;
1024                 for (set1c = 0, set2c = 0;
1025                     set1c < numitems && set2c < *num; newsetc++) {
1026                         if (set1p->lineoffset < posting.lineoffset) {
1027                                 *newsetp++ = *set1p++;
1028                                 set1c++;
1029                         } else if (set1p->lineoffset > posting.lineoffset) {
1030                                 *newsetp++ = posting;
1031                                 read_next_posting(invcntl, &posting);
1032                                 set2c++;
1033                         } else if (set1p->type < posting.type) {
1034                                 *newsetp++ = *set1p++;
1035                                 set1c++;
1036                         } else if (set1p->type > posting.type) {
1037                                 *newsetp++ = posting;
1038                                 read_next_posting(invcntl, &posting);
1039                                 set2c++;
1040                         } else {        /* identical postings */
1041                                 *newsetp++ = *set1p++;
1042                                 set1c++;
1043                                 read_next_posting(invcntl, &posting);
1044                                 set2c++;
1045                         }
1046                 }
1047                 /* find out what ran out and move the rest in */
1048                 if (set1c < numitems) {
1049                         newsetc += numitems - set1c;
1050                         while (set1c++ < numitems) {
1051                                 *newsetp++ = *set1p++;
1052                         }
1053                 } else {
1054                         while (set2c++ < *num) {
1055                                 *newsetp++ = posting;
1056                                 newsetc++;
1057                                 read_next_posting(invcntl, &posting);
1058                         }
1059                 }
1060                 item = newitem;
1061                 break; /* end of OR */
1062 #if 0
1063         case AND:
1064                 set1c = 0;
1065                 set2c = 0;
1066                 while (set1c < numitems && set2c < *num) {
1067                         if (set1p->lineoffset < posting.lineoffset) {
1068                                 set1p++;
1069                                 set1c++;
1070                         } else if (set1p->lineoffset > posting.lineoffset) {
1071                                 read_next_posting(invcntl, &posting);
1072                                 set2c++;
1073                         } else if (set1p->type < posting.type)  {
1074                                 *set1p++;
1075                                 set1c++;
1076                         } else if (set1p->type > posting.type) {
1077                                 read_next_posting(invcntl, &posting);
1078                                 set2c++;
1079                         } else {        /* identical postings */
1080                                 *newsetp++ = *set1p++;
1081                                 newsetc++;
1082                                 set1c++;
1083                                 read_next_posting(invcntl, &posting);
1084                                 set2c++;
1085                         }
1086                 }
1087                 break; /* end of AND */
1088 
1089         case NOT:
1090                 set1c = 0;
1091                 set2c = 0;
1092                 while (set1c < numitems && set2c < *num) {
1093                         if (set1p->lineoffset < posting.lineoffset) {
1094                                 *newsetp++ = *set1p++;
1095                                 newsetc++;
1096                                 set1c++;
1097                         } else if (set1p->lineoffset > posting.lineoffset) {
1098                                 read_next_posting(invcntl, &posting);
1099                                 set2c++;
1100                         } else if (set1p->type < posting.type) {
1101                                 *newsetp++ = *set1p++;
1102                                 newsetc++;
1103                                 set1c++;
1104                         } else if (set1p->type > posting.type) {
1105                                 read_next_posting(invcntl, &posting);
1106                                 set2c++;
1107                         } else {        /* identical postings */
1108                                 set1c++;
1109                                 set1p++;
1110                                 read_next_posting(invcntl, &posting);
1111                                 set2c++;
1112                         }
1113                 }
1114                 newsetc += numitems - set1c;
1115                 while (set1c++ < numitems) {
1116                         *newsetp++ = *set1p++;
1117                 }
1118                 break; /* end of NOT */
1119 
1120         case REVERSENOT:  /* core NOT incoming set */
1121                 set1c = 0;
1122                 set2c = 0;
1123                 while (set1c < numitems && set2c < *num) {
1124                         if (set1p->lineoffset < posting.lineoffset) {
1125                                 set1p++;
1126                                 set1c++;
1127                         } else if (set1p->lineoffset > posting.lineoffset) {
1128                                 *newsetp++ = posting;
1129                                 read_next_posting(invcntl, &posting);
1130                                 set2c++;
1131                         } else if (set1p->type < posting.type) {
1132                                 set1p++;
1133                                 set1c++;
1134                         } else if (set1p->type > posting.type) {
1135                                 *newsetp++ = posting;
1136                                 read_next_posting(invcntl, &posting);
1137                                 set2c++;
1138                         } else {        /* identical postings */
1139                                 set1c++;
1140                                 set1p++;
1141                                 read_next_posting(invcntl, &posting);
1142                                 set2c++;
1143                         }
1144                 }
1145                 while (set2c++ < *num) {
1146                         *newsetp++ = posting;
1147                         newsetc++;
1148                         read_next_posting(invcntl, &posting);
1149                 }
1150                 item = newitem;
1151                 break; /* end of REVERSENOT  */
1152 #endif
1153         }
1154         numitems = newsetc;
1155         *num = newsetc;
1156         enditem = (POSTING *)newsetp;
1157         return ((POSTING *)item);
1158 }
1159 
1160 #if 0
1161 POSTING *
1162 boolsave(int clear)
1163 {
1164         int     i;
1165         POSTING *ptr;
1166         POSTING *oldstuff, *newstuff;
1167 
1168         if (numitems == 0) {
1169                 if (clear)
1170                         boolclear();
1171                 return (NULL);
1172         }
1173         /*
1174          * if clear then give them what we have and use (void)
1175          * boolready to realloc
1176          */
1177         if (clear) {
1178                 ptr = item;
1179                 /* free up the space we didn't give them */
1180                 if (item == item1)
1181                         item1 = NULL;
1182                 else
1183                         item2 = NULL;
1184                 (void) boolready();
1185                 return (ptr);
1186         }
1187         i = (enditem - item) * sizeof (POSTING) + 100;
1188         if ((ptr = (POSTING *)malloc(i))r == NULL) {
1189                 invcannotalloc(i);
1190                 return (ptr);
1191         }
1192         /* move present set into place  */
1193         oldstuff = item;
1194         newstuff = ptr;
1195         while (oldstuff < enditem)
1196                 *newstuff++ = *oldstuff++;
1197         return (ptr);
1198 }
1199 #endif
1200 
1201 static void
1202 invcannotalloc(size_t n)
1203 {
1204         (void) fprintf(stderr, "%s: cannot allocate %u bytes\n", argv0, n);
1205 }
1206 
1207 static void
1208 invcannotopen(char *file)
1209 {
1210         (void) fprintf(stderr, "%s: cannot open file %s\n", argv0, file);
1211 }
1212 
1213 static void
1214 invcannotwrite(char *file)
1215 {
1216         (void) perror(argv0);   /* must be first to preserve errno */
1217         (void) fprintf(stderr, "%s: write to file %s failed\n", argv0, file);
1218 }