1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 28 /* All Rights Reserved */ 29 30 #pragma ident "%Z%%M% %I% %E% SMI" 31 32 #include <stdlib.h> 33 #include <unistd.h> 34 #include <limits.h> 35 #include <string.h> 36 #include <stdio.h> 37 #include <ctype.h> 38 #include <locale.h> 39 #include "hash.h" 40 41 #define Tolower(c) (isupper(c)?tolower(c):c) 42 #define DLEV 2 43 44 /* 45 * ANSI prototypes 46 */ 47 static int ily(char *, char *, char *, int); 48 static int s(char *, char *, char *, int); 49 static int es(char *, char *, char *, int); 50 static int subst(char *, char *, char *, int); 51 static int nop(void); 52 static int bility(char *, char *, char *, int); 53 static int i_to_y(char *, char *, char *, int); 54 static int CCe(char *, char *, char *, int); 55 static int y_to_e(char *, char *, char *, int); 56 static int strip(char *, char *, char *, int); 57 static int ize(char *, char *, char *, int); 58 static int tion(char *, char *, char *, int); 59 static int an(char *, char *, char *, int); 60 int prime(char *); 61 static void ise(void); 62 static int tryword(char *, char *, int); 63 static int trypref(char *, char *, int); 64 static int trysuff(char *, int); 65 static int vowel(int); 66 static int dict(char *, char *); 67 static int monosyl(char *, char *); 68 static int VCe(char *, char *, char *, int); 69 static char *skipv(char *); 70 static void ztos(char *); 71 72 static struct suftab { 73 char *suf; 74 int (*p1)(); 75 int n1; 76 char *d1; 77 char *a1; 78 int (*p2)(); 79 int n2; 80 char *d2; 81 char *a2; 82 } suftab[] = { 83 {"ssen", ily, 4, "-y+iness", "+ness" }, 84 {"ssel", ily, 4, "-y+i+less", "+less" }, 85 {"se", s, 1, "", "+s", es, 2, "-y+ies", "+es" }, 86 {"s'", s, 2, "", "+'s"}, 87 {"s", s, 1, "", "+s"}, 88 {"ecn", subst, 1, "-t+ce", ""}, 89 {"ycn", subst, 1, "-t+cy", ""}, 90 {"ytilb", nop, 0, "", ""}, 91 {"ytilib", bility, 5, "-le+ility", ""}, 92 {"elbaif", i_to_y, 4, "-y+iable", ""}, 93 {"elba", CCe, 4, "-e+able", "+able"}, 94 {"yti", CCe, 3, "-e+ity", "+ity"}, 95 {"ylb", y_to_e, 1, "-e+y", ""}, 96 {"yl", ily, 2, "-y+ily", "+ly"}, 97 {"laci", strip, 2, "", "+al"}, 98 {"latnem", strip, 2, "", "+al"}, 99 {"lanoi", strip, 2, "", "+al"}, 100 {"tnem", strip, 4, "", "+ment"}, 101 {"gni", CCe, 3, "-e+ing", "+ing"}, 102 {"reta", nop, 0, "", ""}, 103 {"retc", nop, 0, "", ""}, 104 {"re", strip, 1, "", "+r", i_to_y, 2, "-y+ier", "+er"}, 105 {"de", strip, 1, "", "+d", i_to_y, 2, "-y+ied", "+ed"}, 106 {"citsi", strip, 2, "", "+ic"}, 107 {"citi", ize, 1, "-ic+e", ""}, 108 {"cihparg", i_to_y, 1, "-y+ic", ""}, 109 {"tse", strip, 2, "", "+st", i_to_y, 3, "-y+iest", "+est"}, 110 {"cirtem", i_to_y, 1, "-y+ic", ""}, 111 {"yrtem", subst, 0, "-er+ry", ""}, 112 {"cigol", i_to_y, 1, "-y+ic", ""}, 113 {"tsigol", i_to_y, 2, "-y+ist", ""}, 114 {"tsi", CCe, 3, "-e+ist", "+ist"}, 115 {"msi", CCe, 3, "-e+ism", "+ist"}, 116 {"noitacifi", i_to_y, 6, "-y+ication", ""}, 117 {"noitazi", ize, 4, "-e+ation", ""}, 118 {"rota", tion, 2, "-e+or", ""}, 119 {"rotc", tion, 2, "", "+or"}, 120 {"noit", tion, 3, "-e+ion", "+ion"}, 121 {"naino", an, 3, "", "+ian"}, 122 {"na", an, 1, "", "+n"}, 123 {"evi", subst, 0, "-ion+ive", ""}, 124 {"ezi", CCe, 3, "-e+ize", "+ize"}, 125 {"pihs", strip, 4, "", "+ship"}, 126 {"dooh", ily, 4, "-y+ihood", "+hood"}, 127 {"luf", ily, 3, "-y+iful", "+ful"}, 128 {"ekil", strip, 4, "", "+like"}, 129 0 130 }; 131 132 static char *preftab[] = { 133 "anti", 134 "auto", 135 "bio", 136 "counter", 137 "dis", 138 "electro", 139 "en", 140 "fore", 141 "geo", 142 "hyper", 143 "intra", 144 "inter", 145 "iso", 146 "kilo", 147 "magneto", 148 "meta", 149 "micro", 150 "mid", 151 "milli", 152 "mis", 153 "mono", 154 "multi", 155 "non", 156 "out", 157 "over", 158 "photo", 159 "poly", 160 "pre", 161 "pseudo", 162 "psycho", 163 "re", 164 "semi", 165 "stereo", 166 "sub", 167 "super", 168 "tele", 169 "thermo", 170 "ultra", 171 "under", /* must precede un */ 172 "un", 173 0 174 }; 175 176 static int vflag; 177 static int xflag; 178 static char *prog; 179 static char word[LINE_MAX]; 180 static char original[LINE_MAX]; 181 static char *deriv[LINE_MAX]; 182 static char affix[LINE_MAX]; 183 static FILE *file, *found; 184 /* 185 * deriv is stack of pointers to notes like +micro +ed 186 * affix is concatenated string of notes 187 * the buffer size 141 stems from the sizes of original and affix. 188 */ 189 190 /* 191 * in an attempt to defray future maintenance misunderstandings, here is 192 * an attempt to describe the input/output expectations of the spell 193 * program. 194 * 195 * spellprog is intended to be called from the shell file spell. 196 * because of this, there is little error checking (this is historical, not 197 * necessarily advisable). 198 * 199 * spellprog options hashed-list pass 200 * 201 * the hashed-list is a list of the form made by spellin. 202 * there are 2 types of hashed lists: 203 * 1. a stop list: this specifies words that by the rules embodied 204 * in spellprog would be recognized as correct, BUT are really 205 * errors. 206 * 2. a dictionary of correctly spelled words. 207 * the pass number determines how the words found in the specified 208 * hashed-list are treated. If the pass number is 1, the hashed-list is 209 * treated as the stop-list, otherwise, it is treated as the regular 210 * dictionary list. in this case, the value of "pass" is a filename. Found 211 * words are written to this file. 212 * 213 * In the normal case, the filename = /dev/null. However, if the v option 214 * is specified, the derivations are written to this file. 215 * The spellprog looks up words in the hashed-list; if a word is found, it 216 * is printed to the stdout. If the hashed-list was the stop-list, the 217 * words found are presumed to be misspellings. in this case, 218 * a control character is printed ( a "-" is appended to the word. 219 * a hyphen will never occur naturally in the input list because deroff 220 * is used in the shell file before calling spellprog.) 221 * If the regualar spelling list was used (hlista or hlistb), the words 222 * are correct, and may be ditched. (unless the -v option was used - 223 * see the manual page). 224 * 225 * spellprog should be called twice : first with the stop-list, to flag all 226 * a priori incorrectly spelled words; second with the dictionary. 227 * 228 * spellprog hstop 1 |\ 229 * spellprog hlista /dev/null 230 * 231 * for a complete scenario, see the shell file: spell. 232 * 233 */ 234 235 int 236 main(int argc, char **argv) 237 { 238 char *ep, *cp; 239 char *dp; 240 int fold; 241 int c, j; 242 int pass; 243 244 /* Set locale environment variables local definitions */ 245 (void) setlocale(LC_ALL, ""); 246 #if !defined(TEXT_DOMAIN) /* Should be defined by cc -D */ 247 #define TEXT_DOMAIN "SYS_TEST" /* Use this only if it wasn't */ 248 #endif 249 (void) textdomain(TEXT_DOMAIN); 250 251 252 prog = argv[0]; 253 while ((c = getopt(argc, argv, "bvx")) != EOF) { 254 switch (c) { 255 case 'b': 256 ise(); 257 break; 258 case 'v': 259 vflag++; 260 break; 261 case 'x': 262 xflag++; 263 break; 264 } 265 } 266 267 argc -= optind; 268 argv = &argv[optind]; 269 270 if ((argc < 2) || !prime(*argv)) { 271 (void) fprintf(stderr, 272 gettext("%s: cannot initialize hash table\n"), prog); 273 exit(1); 274 } 275 argc--; 276 argv++; 277 278 /* 279 * if pass is not 1, it is assumed to be a filename. 280 * found words are written to this file. 281 */ 282 pass = **argv; 283 if (pass != '1') 284 found = fopen(*argv, "w"); 285 286 for (;;) { 287 affix[0] = 0; 288 file = stdout; 289 for (ep = word; (*ep = j = getchar()) != '\n'; ep++) 290 if (j == EOF) 291 exit(0); 292 /* 293 * here is the hyphen processing. these words were found in the stop 294 * list. however, if they exist as is, (no derivations tried) in the 295 * dictionary, let them through as correct. 296 * 297 */ 298 if (ep[-1] == '-') { 299 *--ep = 0; 300 if (!tryword(word, ep, 0)) 301 (void) fprintf(file, "%s\n", word); 302 continue; 303 } 304 for (cp = word, dp = original; cp < ep; ) 305 *dp++ = *cp++; 306 *dp = 0; 307 fold = 0; 308 for (cp = word; cp < ep; cp++) 309 if (islower(*cp)) 310 goto lcase; 311 if (((ep - word) == 1) && 312 ((word[0] == 'A') || (word[0] == 'I'))) 313 continue; 314 if (trypref(ep, ".", 0)) 315 goto foundit; 316 ++fold; 317 for (cp = original+1, dp = word+1; dp < ep; dp++, cp++) 318 *dp = Tolower(*cp); 319 lcase: 320 if (((ep - word) == 1) && (word[0] == 'a')) 321 continue; 322 if (trypref(ep, ".", 0)||trysuff(ep, 0)) 323 goto foundit; 324 if (isupper(word[0])) { 325 for (cp = original, dp = word; *dp = *cp++; dp++) 326 if (fold) *dp = Tolower(*dp); 327 word[0] = Tolower(word[0]); 328 goto lcase; 329 } 330 (void) fprintf(file, "%s\n", original); 331 continue; 332 333 foundit: 334 if (pass == '1') 335 (void) fprintf(file, "%s-\n", original); 336 else if (affix[0] != 0 && affix[0] != '.') { 337 file = found; 338 (void) fprintf(file, "%s\t%s\n", affix, 339 original); 340 } 341 } 342 } 343 344 /* 345 * strip exactly one suffix and do 346 * indicated routine(s), which may recursively 347 * strip suffixes 348 */ 349 350 static int 351 trysuff(char *ep, int lev) 352 { 353 struct suftab *t; 354 char *cp, *sp; 355 356 lev += DLEV; 357 deriv[lev] = deriv[lev-1] = 0; 358 for (t = &suftab[0]; (sp = t->suf) != 0; t++) { 359 cp = ep; 360 while (*sp) 361 if (*--cp != *sp++) 362 goto next; 363 for (sp = cp; --sp >= word && !vowel(*sp); ); 364 if (sp < word) 365 return (0); 366 if ((*t->p1)(ep-t->n1, t->d1, t->a1, lev+1)) 367 return (1); 368 if (t->p2 != 0) { 369 deriv[lev] = deriv[lev+1] = 0; 370 return ((*t->p2)(ep-t->n2, t->d2, t->a2, lev)); 371 } 372 return (0); 373 next:; 374 } 375 return (0); 376 } 377 378 static int 379 nop(void) 380 { 381 return (0); 382 } 383 384 /* ARGSUSED */ 385 static int 386 strip(char *ep, char *d, char *a, int lev) 387 { 388 return (trypref(ep, a, lev)||trysuff(ep, lev)); 389 } 390 391 static int 392 s(char *ep, char *d, char *a, int lev) 393 { 394 if (lev > DLEV+1) 395 return (0); 396 if (*ep == 's' && ep[-1] == 's') 397 return (0); 398 return (strip(ep, d, a, lev)); 399 } 400 401 /* ARGSUSED */ 402 static int 403 an(char *ep, char *d, char *a, int lev) 404 { 405 if (!isupper(*word)) /* must be proper name */ 406 return (0); 407 return (trypref(ep, a, lev)); 408 } 409 410 /* ARGSUSED */ 411 static int 412 ize(char *ep, char *d, char *a, int lev) 413 { 414 ep[-1] = 'e'; 415 return (strip(ep, "", d, lev)); 416 } 417 418 /* ARGSUSED */ 419 static int 420 y_to_e(char *ep, char *d, char *a, int lev) 421 { 422 *ep++ = 'e'; 423 return (strip(ep, "", d, lev)); 424 } 425 426 static int 427 ily(char *ep, char *d, char *a, int lev) 428 { 429 if (ep[-1] == 'i') 430 return (i_to_y(ep, d, a, lev)); 431 else 432 return (strip(ep, d, a, lev)); 433 } 434 435 static int 436 bility(char *ep, char *d, char *a, int lev) 437 { 438 *ep++ = 'l'; 439 return (y_to_e(ep, d, a, lev)); 440 } 441 442 static int 443 i_to_y(char *ep, char *d, char *a, int lev) 444 { 445 if (ep[-1] == 'i') { 446 ep[-1] = 'y'; 447 a = d; 448 } 449 return (strip(ep, "", a, lev)); 450 } 451 452 static int 453 es(char *ep, char *d, char *a, int lev) 454 { 455 if (lev > DLEV) 456 return (0); 457 switch (ep[-1]) { 458 default: 459 return (0); 460 case 'i': 461 return (i_to_y(ep, d, a, lev)); 462 case 's': 463 case 'h': 464 case 'z': 465 case 'x': 466 return (strip(ep, d, a, lev)); 467 } 468 } 469 470 /* ARGSUSED */ 471 static int 472 subst(char *ep, char *d, char *a, int lev) 473 { 474 char *u, *t; 475 476 if (skipv(skipv(ep-1)) < word) 477 return (0); 478 for (t = d; *t != '+'; t++) 479 continue; 480 for (u = ep; *--t != '-'; ) 481 *--u = *t; 482 return (strip(ep, "", d, lev)); 483 } 484 485 486 static int 487 tion(char *ep, char *d, char *a, int lev) 488 { 489 switch (ep[-2]) { 490 case 'c': 491 case 'r': 492 return (trypref(ep, a, lev)); 493 case 'a': 494 return (y_to_e(ep, d, a, lev)); 495 } 496 return (0); 497 } 498 499 /* possible consonant-consonant-e ending */ 500 static int 501 CCe(char *ep, char *d, char *a, int lev) 502 { 503 switch (ep[-1]) { 504 case 'r': 505 if (ep[-2] == 't') 506 return (y_to_e(ep, d, a, lev)); 507 break; 508 case 'l': 509 if (vowel(ep[-2])) 510 break; 511 switch (ep[-2]) { 512 case 'l': 513 case 'r': 514 case 'w': 515 break; 516 default: 517 return (y_to_e(ep, d, a, lev)); 518 } 519 break; 520 case 's': 521 if (ep[-2] == 's') 522 break; 523 if (*ep == 'a') 524 return (0); 525 if (vowel(ep[-2])) 526 break; 527 if (y_to_e(ep, d, a, lev)) 528 return (1); 529 if (!(ep[-2] == 'n' && ep[-1] == 'g')) 530 return (0); 531 break; 532 case 'c': 533 case 'g': 534 if (*ep == 'a') 535 return (0); 536 if (vowel(ep[-2])) 537 break; 538 if (y_to_e(ep, d, a, lev)) 539 return (1); 540 if (!(ep[-2] == 'n' && ep[-1] == 'g')) 541 return (0); 542 break; 543 case 'v': 544 case 'z': 545 if (vowel(ep[-2])) 546 break; 547 if (y_to_e(ep, d, a, lev)) 548 return (1); 549 if (!(ep[-2] == 'n' && ep[-1] == 'g')) 550 return (0); 551 break; 552 case 'u': 553 if (y_to_e(ep, d, a, lev)) 554 return (1); 555 if (!(ep[-2] == 'n' && ep[-1] == 'g')) 556 return (0); 557 break; 558 } 559 return (VCe(ep, d, a, lev)); 560 } 561 562 /* possible consonant-vowel-consonant-e ending */ 563 static int 564 VCe(char *ep, char *d, char *a, int lev) 565 { 566 char c; 567 c = ep[-1]; 568 if (c == 'e') 569 return (0); 570 if (!vowel(c) && vowel(ep[-2])) { 571 c = *ep; 572 *ep++ = 'e'; 573 if (trypref(ep, d, lev)||trysuff(ep, lev)) 574 return (1); 575 ep--; 576 *ep = c; 577 } 578 return (strip(ep, d, a, lev)); 579 } 580 581 static char * 582 lookuppref(char **wp, char *ep) 583 { 584 char **sp; 585 char *bp, *cp; 586 587 for (sp = preftab; *sp; sp++) { 588 bp = *wp; 589 for (cp = *sp; *cp; cp++, bp++) 590 if (Tolower(*bp) != *cp) 591 goto next; 592 for (cp = bp; cp < ep; cp++) 593 if (vowel(*cp)) { 594 *wp = bp; 595 return (*sp); 596 } 597 next:; 598 } 599 return (0); 600 } 601 602 /* 603 * while word is not in dictionary try stripping 604 * prefixes. Fail if no more prefixes. 605 */ 606 static int 607 trypref(char *ep, char *a, int lev) 608 { 609 char *cp; 610 char *bp; 611 char *pp; 612 int val = 0; 613 char space[LINE_MAX * 2]; 614 deriv[lev] = a; 615 if (tryword(word, ep, lev)) 616 return (1); 617 bp = word; 618 pp = space; 619 deriv[lev+1] = pp; 620 while (cp = lookuppref(&bp, ep)) { 621 *pp++ = '+'; 622 while (*pp = *cp++) 623 pp++; 624 if (tryword(bp, ep, lev+1)) { 625 val = 1; 626 break; 627 } 628 } 629 deriv[lev+1] = deriv[lev+2] = 0; 630 return (val); 631 } 632 633 static int 634 tryword(char *bp, char *ep, int lev) 635 { 636 int i, j; 637 char duple[3]; 638 if (ep-bp <= 1) 639 return (0); 640 if (vowel(*ep)) { 641 if (monosyl(bp, ep)) 642 return (0); 643 } 644 i = dict(bp, ep); 645 if (i == 0 && vowel(*ep) && ep[-1] == ep[-2] && monosyl(bp, ep-1)) { 646 ep--; 647 deriv[++lev] = duple; 648 duple[0] = '+'; 649 duple[1] = *ep; 650 duple[2] = 0; 651 i = dict(bp, ep); 652 } 653 if (vflag == 0 || i == 0) 654 return (i); 655 /* 656 * when derivations are wanted, collect them 657 * for printing 658 */ 659 j = lev; 660 do { 661 if (deriv[j]) 662 (void) strcat(affix, deriv[j]); 663 } while (--j > 0); 664 return (i); 665 } 666 667 668 static int 669 monosyl(char *bp, char *ep) 670 { 671 if (ep < bp+2) 672 return (0); 673 if (vowel(*--ep) || !vowel(*--ep) || ep[1] == 'x' || ep[1] == 'w') 674 return (0); 675 while (--ep >= bp) 676 if (vowel(*ep)) 677 return (0); 678 return (1); 679 } 680 681 static char * 682 skipv(char *s) 683 { 684 if (s >= word&&vowel(*s)) 685 s--; 686 while (s >= word && !vowel(*s)) 687 s--; 688 return (s); 689 } 690 691 static int 692 vowel(int c) 693 { 694 switch (Tolower(c)) { 695 case 'a': 696 case 'e': 697 case 'i': 698 case 'o': 699 case 'u': 700 case 'y': 701 return (1); 702 } 703 return (0); 704 } 705 706 /* crummy way to Britishise */ 707 static void 708 ise(void) 709 { 710 struct suftab *p; 711 712 for (p = suftab; p->suf; p++) { 713 ztos(p->suf); 714 ztos(p->d1); 715 ztos(p->a1); 716 } 717 } 718 719 static void 720 ztos(char *s) 721 { 722 for (; *s; s++) 723 if (*s == 'z') 724 *s = 's'; 725 } 726 727 static int 728 dict(char *bp, char *ep) 729 { 730 int temp, result; 731 if (xflag) 732 (void) fprintf(stdout, "=%.*s\n", ep-bp, bp); 733 temp = *ep; 734 *ep = 0; 735 result = hashlook(bp); 736 *ep = temp; 737 return (result); 738 }