1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License, Version 1.0 only
   6  * (the "License").  You may not use this file except in compliance
   7  * with the License.
   8  *
   9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  10  * or http://www.opensolaris.org/os/licensing.
  11  * See the License for the specific language governing permissions
  12  * and limitations under the License.
  13  *
  14  * When distributing Covered Code, include this CDDL HEADER in each
  15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  16  * If applicable, add the following below this CDDL HEADER, with the
  17  * fields enclosed by brackets "[]" replaced with your own identifying
  18  * information: Portions Copyright [yyyy] [name of copyright owner]
  19  *
  20  * CDDL HEADER END
  21  */
  22 /*
  23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
  28 /*        All Rights Reserved   */
  29 
  30 #pragma ident   "%Z%%M% %I%     %E% SMI"
  31 
  32 #include <stdlib.h>
  33 #include <unistd.h>
  34 #include <limits.h>
  35 #include <string.h>
  36 #include <stdio.h>
  37 #include <ctype.h>
  38 #include <locale.h>
  39 #include "hash.h"
  40 
  41 #define Tolower(c) (isupper(c)?tolower(c):c)
  42 #define DLEV 2
  43 
/*
 * ANSI prototypes
 *
 * The suffix handlers (ily, s, es, subst, bility, i_to_y, CCe, y_to_e,
 * strip, ize, tion, an, VCe) share one argument list:
 *      (end of stem, "-x+y" style note, "+y" style note, derivation level)
 * nop is the exception and takes no arguments.
 * prime() is not defined in this file.
 */
static int      ily(char *, char *, char *, int);
static int      s(char *, char *, char *, int);
static int      es(char *, char *, char *, int);
static int      subst(char *, char *, char *, int);
static int      nop(void);
static int      bility(char *, char *, char *, int);
static int      i_to_y(char *, char *, char *, int);
static int      CCe(char *, char *, char *, int);
static int      y_to_e(char *, char *, char *, int);
static int      strip(char *, char *, char *, int);
static int      ize(char *, char *, char *, int);
static int      tion(char *, char *, char *, int);
static int      an(char *, char *, char *, int);
int             prime(char *);
static void     ise(void);
static int      tryword(char *, char *, int);
static int      trypref(char *, char *, int);
static int      trysuff(char *, int);
static int      vowel(int);
static int      dict(char *, char *);
static int      monosyl(char *, char *);
static int      VCe(char *, char *, char *, int);
static char     *skipv(char *);
static void     ztos(char *);
  71 
  72 static struct suftab {
  73         char *suf;
  74         int (*p1)();
  75         int n1;
  76         char *d1;
  77         char *a1;
  78         int (*p2)();
  79         int n2;
  80         char *d2;
  81         char *a2;
  82 } suftab[] = {
  83         {"ssen", ily, 4, "-y+iness", "+ness" },
  84         {"ssel", ily, 4, "-y+i+less", "+less" },
  85         {"se", s, 1, "", "+s",  es, 2, "-y+ies", "+es" },
  86         {"s'", s, 2, "", "+'s"},
  87         {"s", s, 1, "", "+s"},
  88         {"ecn", subst, 1, "-t+ce", ""},
  89         {"ycn", subst, 1, "-t+cy", ""},
  90         {"ytilb", nop, 0, "", ""},
  91         {"ytilib", bility, 5, "-le+ility", ""},
  92         {"elbaif", i_to_y, 4, "-y+iable", ""},
  93         {"elba", CCe, 4, "-e+able", "+able"},
  94         {"yti", CCe, 3, "-e+ity", "+ity"},
  95         {"ylb", y_to_e, 1, "-e+y", ""},
  96         {"yl", ily, 2, "-y+ily", "+ly"},
  97         {"laci", strip, 2, "", "+al"},
  98         {"latnem", strip, 2, "", "+al"},
  99         {"lanoi", strip, 2, "", "+al"},
 100         {"tnem", strip, 4, "", "+ment"},
 101         {"gni", CCe, 3, "-e+ing", "+ing"},
 102         {"reta", nop, 0, "", ""},
 103         {"retc", nop, 0, "", ""},
 104         {"re", strip, 1, "", "+r", i_to_y, 2, "-y+ier", "+er"},
 105         {"de", strip, 1, "", "+d", i_to_y, 2, "-y+ied", "+ed"},
 106         {"citsi", strip, 2, "", "+ic"},
 107         {"citi", ize, 1, "-ic+e", ""},
 108         {"cihparg", i_to_y, 1, "-y+ic", ""},
 109         {"tse", strip, 2, "", "+st",    i_to_y, 3, "-y+iest", "+est"},
 110         {"cirtem", i_to_y, 1, "-y+ic", ""},
 111         {"yrtem", subst, 0, "-er+ry", ""},
 112         {"cigol", i_to_y, 1, "-y+ic", ""},
 113         {"tsigol", i_to_y, 2, "-y+ist", ""},
 114         {"tsi", CCe, 3, "-e+ist", "+ist"},
 115         {"msi", CCe, 3, "-e+ism", "+ist"},
 116         {"noitacifi", i_to_y, 6, "-y+ication", ""},
 117         {"noitazi", ize, 4, "-e+ation", ""},
 118         {"rota", tion, 2, "-e+or", ""},
 119         {"rotc", tion, 2, "", "+or"},
 120         {"noit", tion, 3, "-e+ion", "+ion"},
 121         {"naino", an, 3, "", "+ian"},
 122         {"na", an, 1, "", "+n"},
 123         {"evi", subst, 0, "-ion+ive", ""},
 124         {"ezi", CCe, 3, "-e+ize", "+ize"},
 125         {"pihs", strip, 4, "", "+ship"},
 126         {"dooh", ily, 4, "-y+ihood", "+hood"},
 127         {"luf", ily, 3, "-y+iful", "+ful"},
 128         {"ekil", strip, 4, "", "+like"},
 129         0
 130 };
 131 
/*
 * Prefixes that lookuppref() may strip from the front of a word.  The
 * table is scanned from the top and the first entry that fits is used, so
 * a prefix that begins with another entry must come before it.  The list
 * ends with a null pointer.
 */
static char *preftab[] = {
        "anti",
        "auto",
        "bio",
        "counter",
        "dis",
        "electro",
        "en",
        "fore",
        "geo",
        "hyper",
        "intra",
        "inter",
        "iso",
        "kilo",
        "magneto",
        "meta",
        "micro",
        "mid",
        "milli",
        "mis",
        "mono",
        "multi",
        "non",
        "out",
        "over",
        "photo",
        "poly",
        "pre",
        "pseudo",
        "psycho",
        "re",
        "semi",
        "stereo",
        "sub",
        "super",
        "tele",
        "thermo",
        "ultra",
        "under",        /* must precede un */
        "un",
        0
};
 175 
static int vflag;               /* -v: tryword() collects derivations */
static int xflag;               /* -x: dict() prints every string it looks up */
static char *prog;              /* argv[0], used in error messages */
static char word[LINE_MAX];     /* current word; the handlers edit it in place */
static char original[LINE_MAX]; /* copy of the word as it was read */
static char *deriv[LINE_MAX];   /* derivation notes, indexed by level */
static char affix[LINE_MAX];    /* notes collected for the current word */
static FILE *file, *found;      /* current output stream; derivation file */
/*
 *      deriv is stack of pointers to notes like +micro +ed
 *      affix is concatenated string of notes
 *      word, original and affix are each LINE_MAX bytes long.
 */
 189 
 190 /*
 191  *      in an attempt to defray future maintenance misunderstandings, here is
 192  *      an attempt to describe the input/output expectations of the spell
 193  *      program.
 194  *
 195  *      spellprog is intended to be called from the shell file spell.
 196  *      because of this, there is little error checking (this is historical, not
 197  *      necessarily advisable).
 198  *
 199  *      spellprog options hashed-list pass
 200  *
 201  *      the hashed-list is a list of the form made by spellin.
 202  *      there are 2 types of hashed lists:
 203  *              1. a stop list: this specifies words that by the rules embodied
 204  *                 in spellprog would be recognized as correct, BUT are really
 205  *                 errors.
 206  *              2. a dictionary of correctly spelled words.
 207  *      the pass number determines how the words found in the specified
 208  *      hashed-list are treated. If the pass number is 1, the hashed-list is
 209  *      treated as the stop-list, otherwise, it is treated as the regular
 210  *      dictionary list. in this case, the value of "pass" is a filename. Found
 211  *      words are written to this file.
 212  *
 213  *      In the normal case, the filename = /dev/null. However, if the v option
 214  *      is specified, the derivations are written to this file.
 *      The spellprog looks up words in the hashed-list; if a word is found, it
 *      is printed to the stdout. If the hashed-list was the stop-list, the
 *      words found are presumed to be misspellings. In this case,
 *      a control character is printed (a "-" is appended to the word;
 *      a hyphen will never occur naturally in the input list because deroff
 *      is used in the shell file before calling spellprog).
 *      If the regular spelling list was used (hlista or hlistb), the words
 *      are correct, and may be ditched (unless the -v option was used -
 *      see the manual page).
 224  *
 225  *      spellprog should be called twice : first with the stop-list, to flag all
 226  *      a priori incorrectly spelled words; second with the dictionary.
 227  *
 228  *      spellprog hstop 1 |\
 229  *      spellprog hlista /dev/null
 230  *
 231  *      for a complete scenario, see the shell file: spell.
 232  *
 233  */
 234 
 235 int
 236 main(int argc, char **argv)
 237 {
 238         char *ep, *cp;
 239         char *dp;
 240         int fold;
 241         int c, j;
 242         int pass;
 243 
 244         /* Set locale environment variables local definitions */
 245         (void) setlocale(LC_ALL, "");
 246 #if !defined(TEXT_DOMAIN)       /* Should be defined by cc -D */
 247 #define TEXT_DOMAIN "SYS_TEST"  /* Use this only if it wasn't */
 248 #endif
 249         (void) textdomain(TEXT_DOMAIN);
 250 
 251 
 252         prog = argv[0];
 253         while ((c = getopt(argc, argv, "bvx")) != EOF) {
 254                 switch (c) {
 255                 case 'b':
 256                         ise();
 257                         break;
 258                 case 'v':
 259                         vflag++;
 260                         break;
 261                 case 'x':
 262                         xflag++;
 263                         break;
 264                 }
 265         }
 266 
 267         argc -= optind;
 268         argv = &argv[optind];
 269 
 270         if ((argc < 2) || !prime(*argv)) {
 271                 (void) fprintf(stderr,
 272                     gettext("%s: cannot initialize hash table\n"), prog);
 273                 exit(1);
 274         }
 275         argc--;
 276         argv++;
 277 
 278 /*
 279  *      if pass is not 1, it is assumed to be a filename.
 280  *      found words are written to this file.
 281  */
 282         pass = **argv;
 283         if (pass != '1')
 284                 found = fopen(*argv, "w");
 285 
 286         for (;;) {
 287                 affix[0] = 0;
 288                 file = stdout;
 289                 for (ep = word; (*ep = j = getchar()) != '\n'; ep++)
 290                         if (j == EOF)
 291                                 exit(0);
 292 /*
 293  *      here is the hyphen processing. these words were found in the stop
 294  *      list. however, if they exist as is, (no derivations tried) in the
 295  *      dictionary, let them through as correct.
 296  *
 297  */
 298                 if (ep[-1] == '-') {
 299                         *--ep = 0;
 300                         if (!tryword(word, ep, 0))
 301                                 (void) fprintf(file, "%s\n", word);
 302                         continue;
 303                 }
 304                 for (cp = word, dp = original; cp < ep; )
 305                         *dp++ = *cp++;
 306                 *dp = 0;
 307                 fold = 0;
 308                 for (cp = word; cp < ep; cp++)
 309                         if (islower(*cp))
 310                                 goto lcase;
 311                 if (((ep - word) == 1) &&
 312                     ((word[0] == 'A') || (word[0] == 'I')))
 313                         continue;
 314                 if (trypref(ep, ".", 0))
 315                         goto foundit;
 316                 ++fold;
 317                 for (cp = original+1, dp = word+1; dp < ep; dp++, cp++)
 318                         *dp = Tolower(*cp);
 319 lcase:
 320                 if (((ep - word) == 1) && (word[0] == 'a'))
 321                         continue;
 322                 if (trypref(ep, ".", 0)||trysuff(ep, 0))
 323                         goto foundit;
 324                 if (isupper(word[0])) {
 325                         for (cp = original, dp = word; *dp = *cp++; dp++)
 326                                 if (fold) *dp = Tolower(*dp);
 327                         word[0] = Tolower(word[0]);
 328                         goto lcase;
 329                 }
 330                 (void) fprintf(file, "%s\n", original);
 331                 continue;
 332 
 333 foundit:
 334                 if (pass == '1')
 335                         (void) fprintf(file, "%s-\n", original);
 336                 else if (affix[0] != 0 && affix[0] != '.') {
 337                         file = found;
 338                         (void) fprintf(file, "%s\t%s\n", affix,
 339                             original);
 340                 }
 341         }
 342 }
 343 
 344 /*
 345  *      strip exactly one suffix and do
 346  *      indicated routine(s), which may recursively
 347  *      strip suffixes
 348  */
 349 
 350 static int
 351 trysuff(char *ep, int lev)
 352 {
 353         struct suftab   *t;
 354         char *cp, *sp;
 355 
 356         lev += DLEV;
 357         deriv[lev] = deriv[lev-1] = 0;
 358         for (t = &suftab[0]; (sp = t->suf) != 0; t++) {
 359                 cp = ep;
 360                 while (*sp)
 361                         if (*--cp != *sp++)
 362                                 goto next;
 363                 for (sp = cp; --sp >= word && !vowel(*sp); );
 364                 if (sp < word)
 365                         return (0);
 366                 if ((*t->p1)(ep-t->n1, t->d1, t->a1, lev+1))
 367                         return (1);
 368                 if (t->p2 != 0) {
 369                         deriv[lev] = deriv[lev+1] = 0;
 370                         return ((*t->p2)(ep-t->n2, t->d2, t->a2, lev));
 371                 }
 372                 return (0);
 373 next:;
 374         }
 375         return (0);
 376 }
 377 
 378 static int
 379 nop(void)
 380 {
 381         return (0);
 382 }
 383 
 384 /* ARGSUSED */
 385 static int
 386 strip(char *ep, char *d, char *a, int lev)
 387 {
 388         return (trypref(ep, a, lev)||trysuff(ep, lev));
 389 }
 390 
 391 static int
 392 s(char *ep, char *d, char *a, int lev)
 393 {
 394         if (lev > DLEV+1)
 395                 return (0);
 396         if (*ep == 's' && ep[-1] == 's')
 397                 return (0);
 398         return (strip(ep, d, a, lev));
 399 }
 400 
 401 /* ARGSUSED */
 402 static int
 403 an(char *ep, char *d, char *a, int lev)
 404 {
 405         if (!isupper(*word))    /* must be proper name */
 406                 return (0);
 407         return (trypref(ep, a, lev));
 408 }
 409 
 410 /* ARGSUSED */
 411 static int
 412 ize(char *ep, char *d, char *a, int lev)
 413 {
 414         ep[-1] = 'e';
 415         return (strip(ep, "", d, lev));
 416 }
 417 
 418 /* ARGSUSED */
 419 static int
 420 y_to_e(char *ep, char *d, char *a, int lev)
 421 {
 422         *ep++ = 'e';
 423         return (strip(ep, "", d, lev));
 424 }
 425 
 426 static int
 427 ily(char *ep, char *d, char *a, int lev)
 428 {
 429         if (ep[-1] == 'i')
 430                 return (i_to_y(ep, d, a, lev));
 431         else
 432                 return (strip(ep, d, a, lev));
 433 }
 434 
 435 static int
 436 bility(char *ep, char *d, char *a, int lev)
 437 {
 438         *ep++ = 'l';
 439         return (y_to_e(ep, d, a, lev));
 440 }
 441 
 442 static int
 443 i_to_y(char *ep, char *d, char *a, int lev)
 444 {
 445         if (ep[-1] == 'i') {
 446                 ep[-1] = 'y';
 447                 a = d;
 448         }
 449         return (strip(ep, "", a, lev));
 450 }
 451 
 452 static int
 453 es(char *ep, char *d, char *a, int lev)
 454 {
 455         if (lev > DLEV)
 456                 return (0);
 457         switch (ep[-1]) {
 458         default:
 459                 return (0);
 460         case 'i':
 461                 return (i_to_y(ep, d, a, lev));
 462         case 's':
 463         case 'h':
 464         case 'z':
 465         case 'x':
 466                 return (strip(ep, d, a, lev));
 467         }
 468 }
 469 
 470 /* ARGSUSED */
 471 static int
 472 subst(char *ep, char *d, char *a, int lev)
 473 {
 474         char *u, *t;
 475 
 476         if (skipv(skipv(ep-1)) < word)
 477                 return (0);
 478         for (t = d; *t != '+'; t++)
 479                 continue;
 480         for (u = ep; *--t != '-'; )
 481                 *--u = *t;
 482         return (strip(ep, "", d, lev));
 483 }
 484 
 485 
 486 static int
 487 tion(char *ep, char *d, char *a, int lev)
 488 {
 489         switch (ep[-2]) {
 490         case 'c':
 491         case 'r':
 492                 return (trypref(ep, a, lev));
 493         case 'a':
 494                 return (y_to_e(ep, d, a, lev));
 495         }
 496         return (0);
 497 }
 498 
/*
 *      possible consonant-consonant-e ending
 *
 *      ep[-1] and ep[-2] are the last two characters of the stem and *ep
 *      is the character that follows it.  Depending on those characters
 *      the stem is tried with an 'e' added (y_to_e), rejected outright,
 *      or passed on to VCe().  Note that y_to_e() stores 'e' at *ep, so
 *      each test of *ep is made before y_to_e() is called.
 */
static int
CCe(char *ep, char *d, char *a, int lev)
{
        switch (ep[-1]) {
        case 'r':
                /* stem ends in "tr": the only attempt is with 'e' added */
                if (ep[-2] == 't')
                        return (y_to_e(ep, d, a, lev));
                break;
        case 'l':
                /* vowel, 'l', 'r' or 'w' before the 'l': leave it to VCe() */
                if (vowel(ep[-2]))
                        break;
                switch (ep[-2]) {
                case 'l':
                case 'r':
                case 'w':
                        break;
                default:
                        return (y_to_e(ep, d, a, lev));
                }
                break;
        case 's':
                if (ep[-2] == 's')
                        break;
                if (*ep == 'a')
                        return (0);
                if (vowel(ep[-2]))
                        break;
                if (y_to_e(ep, d, a, lev))
                        return (1);
                /*
                 * NOTE(review): ep[-1] was 's' on entry to this case, so
                 * unless the failed y_to_e() attempt rewrote it this test
                 * cannot find "ng" and 0 is returned - confirm intent.
                 */
                if (!(ep[-2] == 'n' && ep[-1] == 'g'))
                        return (0);
                break;
        case 'c':
        case 'g':
                if (*ep == 'a')
                        return (0);
                if (vowel(ep[-2]))
                        break;
                if (y_to_e(ep, d, a, lev))
                        return (1);
                /* only a stem ending in "ng" goes on to VCe() */
                if (!(ep[-2] == 'n' && ep[-1] == 'g'))
                        return (0);
                break;
        case 'v':
        case 'z':
                if (vowel(ep[-2]))
                        break;
                if (y_to_e(ep, d, a, lev))
                        return (1);
                /* NOTE(review): same "ng" test as in case 's' above */
                if (!(ep[-2] == 'n' && ep[-1] == 'g'))
                        return (0);
                break;
        case 'u':
                if (y_to_e(ep, d, a, lev))
                        return (1);
                /* NOTE(review): same "ng" test as in case 's' above */
                if (!(ep[-2] == 'n' && ep[-1] == 'g'))
                        return (0);
                break;
        }
        return (VCe(ep, d, a, lev));
}
 561 
 562 /*      possible consonant-vowel-consonant-e ending */
 563 static int
 564 VCe(char *ep, char *d, char *a, int lev)
 565 {
 566         char c;
 567         c = ep[-1];
 568         if (c == 'e')
 569                 return (0);
 570         if (!vowel(c) && vowel(ep[-2])) {
 571                 c = *ep;
 572                 *ep++ = 'e';
 573                 if (trypref(ep, d, lev)||trysuff(ep, lev))
 574                         return (1);
 575                 ep--;
 576                 *ep = c;
 577         }
 578         return (strip(ep, d, a, lev));
 579 }
 580 
 581 static char *
 582 lookuppref(char **wp, char *ep)
 583 {
 584         char **sp;
 585         char *bp, *cp;
 586 
 587         for (sp = preftab; *sp; sp++) {
 588                 bp = *wp;
 589                 for (cp = *sp; *cp; cp++, bp++)
 590                         if (Tolower(*bp) != *cp)
 591                                 goto next;
 592                 for (cp = bp; cp < ep; cp++)
 593                         if (vowel(*cp)) {
 594                                 *wp = bp;
 595                                 return (*sp);
 596                         }
 597 next:;
 598         }
 599         return (0);
 600 }
 601 
 602 /*
 603  *      while word is not in dictionary try stripping
 604  *      prefixes. Fail if no more prefixes.
 605  */
 606 static int
 607 trypref(char *ep, char *a, int lev)
 608 {
 609         char *cp;
 610         char *bp;
 611         char *pp;
 612         int val = 0;
 613         char space[LINE_MAX * 2];
 614         deriv[lev] = a;
 615         if (tryword(word, ep, lev))
 616                 return (1);
 617         bp = word;
 618         pp = space;
 619         deriv[lev+1] = pp;
 620         while (cp = lookuppref(&bp, ep)) {
 621                 *pp++ = '+';
 622                 while (*pp = *cp++)
 623                         pp++;
 624                 if (tryword(bp, ep, lev+1)) {
 625                         val = 1;
 626                         break;
 627                 }
 628         }
 629         deriv[lev+1] = deriv[lev+2] = 0;
 630         return (val);
 631 }
 632 
 633 static int
 634 tryword(char *bp, char *ep, int lev)
 635 {
 636         int i, j;
 637         char duple[3];
 638         if (ep-bp <= 1)
 639                 return (0);
 640         if (vowel(*ep)) {
 641                 if (monosyl(bp, ep))
 642                         return (0);
 643         }
 644         i = dict(bp, ep);
 645         if (i == 0 && vowel(*ep) && ep[-1] == ep[-2] && monosyl(bp, ep-1)) {
 646                 ep--;
 647                 deriv[++lev] = duple;
 648                 duple[0] = '+';
 649                 duple[1] = *ep;
 650                 duple[2] = 0;
 651                 i = dict(bp, ep);
 652         }
 653         if (vflag == 0 || i == 0)
 654                 return (i);
 655         /*
 656          *      when derivations are wanted, collect them
 657          *      for printing
 658          */
 659         j = lev;
 660         do {
 661                 if (deriv[j])
 662                         (void) strcat(affix, deriv[j]);
 663         } while (--j > 0);
 664         return (i);
 665 }
 666 
 667 
 668 static int
 669 monosyl(char *bp, char *ep)
 670 {
 671         if (ep < bp+2)
 672                 return (0);
 673         if (vowel(*--ep) || !vowel(*--ep) || ep[1] == 'x' || ep[1] == 'w')
 674                 return (0);
 675         while (--ep >= bp)
 676                 if (vowel(*ep))
 677                         return (0);
 678         return (1);
 679 }
 680 
 681 static char *
 682 skipv(char *s)
 683 {
 684         if (s >= word&&vowel(*s))
 685                 s--;
 686         while (s >= word && !vowel(*s))
 687                 s--;
 688         return (s);
 689 }
 690 
 691 static int
 692 vowel(int c)
 693 {
 694         switch (Tolower(c)) {
 695         case 'a':
 696         case 'e':
 697         case 'i':
 698         case 'o':
 699         case 'u':
 700         case 'y':
 701                 return (1);
 702         }
 703         return (0);
 704 }
 705 
 706 /* crummy way to Britishise */
 707 static void
 708 ise(void)
 709 {
 710         struct suftab *p;
 711 
 712         for (p = suftab; p->suf; p++) {
 713                 ztos(p->suf);
 714                 ztos(p->d1);
 715                 ztos(p->a1);
 716         }
 717 }
 718 
 719 static void
 720 ztos(char *s)
 721 {
 722         for (; *s; s++)
 723                 if (*s == 'z')
 724                         *s = 's';
 725 }
 726 
 727 static int
 728 dict(char *bp, char *ep)
 729 {
 730         int temp, result;
 731         if (xflag)
 732                 (void) fprintf(stdout, "=%.*s\n", ep-bp, bp);
 733         temp = *ep;
 734         *ep = 0;
 735         result = hashlook(bp);
 736         *ep = temp;
 737         return (result);
 738 }