1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 #include <_string_table.h>
  28 #include <strings.h>
  29 #include <sgs.h>
  30 #include <stdio.h>
  31 
  32 /*
  33  * This file provides the interfaces to build a Str_tbl suitable for use by
  34  * either the sgsmsg message system, or a standard ELF string table (SHT_STRTAB)
  35  * as created by ld(1).
  36  *
  37  * There are two modes which can be used when constructing a string table:
  38  *
  39  *      st_new(0)
  40  *              standard string table - no compression.  This is the
  41  *              traditional, fast method.
  42  *
  43  *      st_new(FLG_STTAB_COMPRESS)
  44  *              builds a compressed string table which both eliminates
  45  *              duplicate strings, and permits strings with common suffixes
  46  *              (atexit vs. exit) to overlap in the table.  This provides space
  47  *              savings for many string tables.  Although more work than the
  48  *              traditional method, the algorithms used are designed to scale
  49  *              and keep any overhead at a minimum.
  50  *
  51  * These string tables are built with a common interface in a two-pass manner.
  52  * The first pass finds all of the strings required for the string-table and
  53  * calculates the size required for the final string table.
  54  *
  55  * The second pass allocates the string table, populates the strings into the
  56  * table and returns the offsets the strings have been assigned.
  57  *
  58  * The calling sequence to build and populate a string table is:
  59  *
  60  *              st_new();               // initialize strtab
  61  *
  62  *              st_insert(st1);         // first pass of strings ...
  63  *                                      // calculates size required for
  64  *                                      // string table
  65  *
  66  *              st_delstring(st?);      // remove string previously
  67  *                                      // inserted
  68  *              st_insert(stN);
  69  *
  70  *              st_getstrtab_sz();      // freezes strtab and computes
  71  *                                      // size of table.
  72  *
  73  *              st_setstrbuf();         // associates a final destination
  74  *                                      // for the string table
  75  *
  76  *              st_setstring(st1);      // populate the string table
  77  *              ...                     // offsets are based off of second
  78  *                                      // pass through the string table
  79  *              st_setstring(stN);
  80  *
  81  *              st_destroy();           // tear down string table
  82  *                                      // structures.
  83  *
  84  * String Suffix Compression Algorithm:
  85  *
  86  *   Here's a quick high level overview of the Suffix String
  87  *   compression algorithm used.  First - the heart of the algorithm
  88  *   is a Hash table list which represents a dictionary of all unique
  89  *   strings inserted into the string table.  The hash function for
  90  *   this table is a standard string hash except that the hash starts
  91  *   at the last character in the string (&str[n - 1]) and works towards
  92  *   the first character in the function (&str[0]).  As we compute the
  93  *   HASH value for a given string, we also compute the hash values
  94  *   for all of the possible suffix strings for that string.
  95  *
  96  *   As we compute the hash - at each character see if the current
  97  *   suffix string for that hash is already present in the table.  If
  98  *   it is, and the string is a master string.  Then change that
  99  *   string to a suffix string of the new string being inserted.
 100  *
 101  *   When the final hash value is found (hash for str[0...n]), check
 102  *   to see if it is in the hash table - if so increment the reference
 103  *   count for the string.  If it is not yet in the table, insert a
 104  *   new hash table entry for a master string.
 105  *
 106  *   The above method will find all suffixes of a given string given
 107  *   that the strings are inserted from shortest to longest.  That is
 108  *   why this is a two phase method, we first collect all of the
 109  *   strings and store them based off of their length in an AVL tree.
 110  *   Once all of the strings have been submitted we then start the
 111  *   hash table build by traversing the AVL tree in order and
 112  *   inserting the strings from shortest to longest as described
 113  *   above.
 114  */
 115 
 116 /* LINTLIBRARY */
 117 
 118 static int
 119 avl_len_compare(const void *n1, const void *n2)
 120 {
 121         size_t  len1, len2;
 122 
 123         len1 = ((LenNode *)n1)->ln_strlen;
 124         len2 = ((LenNode *)n2)->ln_strlen;
 125 
 126         if (len1 == len2)
 127                 return (0);
 128         if (len2 < len1)
 129                 return (1);
 130         return (-1);
 131 }
 132 
 133 static int
 134 avl_str_compare(const void *n1, const void *n2)
 135 {
 136         const char      *str1, *str2;
 137         int             rc;
 138 
 139         str1 = ((StrNode *)n1)->sn_str;
 140         str2 = ((StrNode *)n2)->sn_str;
 141 
 142         rc = strcmp(str1, str2);
 143         if (rc > 0)
 144                 return (1);
 145         if (rc < 0)
 146                 return (-1);
 147         return (0);
 148 }
 149 
 150 /*
 151  * Return an initialized Str_tbl - returns NULL on failure.
 152  *
 153  * flags:
 154  *      FLG_STTAB_COMPRESS - build a compressed string table
 155  */
 156 Str_tbl *
 157 st_new(uint_t flags)
 158 {
 159         Str_tbl *stp;
 160 
 161         if ((stp = calloc(sizeof (*stp), 1)) == NULL)
 162                 return (NULL);
 163 
 164         /*
 165          * Start with a leading '\0' - it's tradition.
 166          */
 167         stp->st_strsize = stp->st_fullstrsize = stp->st_nextoff = 1;
 168 
 169         /*
 170          * Do we compress this string table?
 171          */
 172         stp->st_flags = flags;
 173         if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0)
 174                 return (stp);
 175 
 176         if ((stp->st_lentree = calloc(sizeof (*stp->st_lentree), 1)) == NULL)
 177                 return (NULL);
 178 
 179         avl_create(stp->st_lentree, &avl_len_compare, sizeof (LenNode),
 180             SGSOFFSETOF(LenNode, ln_avlnode));
 181 
 182         return (stp);
 183 }
 184 
 185 /*
 186  * Insert a new string into the Str_tbl.  There are two AVL trees used.
 187  *
 188  *  -   The first LenNode AVL tree maintains a tree of nodes based on string
 189  *      sizes.
 190  *  -   Each LenNode maintains a StrNode AVL tree for each string.  Large
 191  *      applications have been known to contribute thousands of strings of
 192  *      the same size.  Should strings need to be removed (-z ignore), then
 193  *      the string AVL tree makes this removal efficient and scalable.
 194  */
 195 int
 196 st_insert(Str_tbl *stp, const char *str)
 197 {
 198         size_t          len;
 199         StrNode         *snp, sn = { 0 };
 200         LenNode         *lnp, ln = { 0 };
 201         avl_index_t     where;
 202 
 203         /*
 204          * String table can't have been cooked
 205          */
 206         assert((stp->st_flags & FLG_STTAB_COOKED) == 0);
 207 
 208         /*
 209          * Null strings always point to the head of the string
 210          * table - no reason to keep searching.
 211          */
 212         if ((len = strlen(str)) == 0)
 213                 return (0);
 214 
 215         stp->st_fullstrsize += len + 1;
 216         stp->st_strcnt++;
 217 
 218         if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0)
 219                 return (0);
 220 
 221         /*
 222          * From the controlling string table, determine which LenNode AVL node
 223          * provides for this string length.  If the node doesn't exist, insert
 224          * a new node to represent this string length.
 225          */
 226         ln.ln_strlen = len;
 227         if ((lnp = avl_find(stp->st_lentree, &ln, &where)) == NULL) {
 228                 if ((lnp = calloc(sizeof (*lnp), 1)) == NULL)
 229                         return (-1);
 230                 lnp->ln_strlen = len;
 231                 avl_insert(stp->st_lentree, lnp, where);
 232 
 233                 if ((lnp->ln_strtree = calloc(sizeof (*lnp->ln_strtree), 1)) ==
 234                     NULL)
 235                         return (0);
 236 
 237                 avl_create(lnp->ln_strtree, &avl_str_compare, sizeof (StrNode),
 238                     SGSOFFSETOF(StrNode, sn_avlnode));
 239         }
 240 
 241         /*
 242          * From the string length AVL node determine whether a StrNode AVL node
 243          * provides this string.  If the node doesn't exist, insert a new node
 244          * to represent this string.
 245          */
 246         sn.sn_str = str;
 247         if ((snp = avl_find(lnp->ln_strtree, &sn, &where)) == NULL) {
 248                 if ((snp = calloc(sizeof (*snp), 1)) == NULL)
 249                         return (-1);
 250                 snp->sn_str = str;
 251                 avl_insert(lnp->ln_strtree, snp, where);
 252         }
 253         snp->sn_refcnt++;
 254 
 255         return (0);
 256 }
 257 
 258 /*
 259  * Remove a previously inserted string from the Str_tbl.
 260  */
 261 int
 262 st_delstring(Str_tbl *stp, const char *str)
 263 {
 264         size_t          len;
 265         LenNode         *lnp, ln = { 0 };
 266         StrNode         *snp, sn = { 0 };
 267 
 268         /*
 269          * String table can't have been cooked
 270          */
 271         assert((stp->st_flags & FLG_STTAB_COOKED) == 0);
 272 
 273         len = strlen(str);
 274         stp->st_fullstrsize -= len + 1;
 275 
 276         if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0)
 277                 return (0);
 278 
 279         /*
 280          * Determine which LenNode AVL node provides for this string length.
 281          */
 282         ln.ln_strlen = len;
 283         if ((lnp = avl_find(stp->st_lentree, &ln, 0)) != NULL) {
 284                 sn.sn_str = str;
 285                 if ((snp = avl_find(lnp->ln_strtree, &sn, 0)) != NULL) {
 286                         /*
 287                          * Reduce the reference count, and if zero remove the
 288                          * node.
 289                          */
 290                         if (--snp->sn_refcnt == 0)
 291                                 avl_remove(lnp->ln_strtree, snp);
 292                         return (0);
 293                 }
 294         }
 295 
 296         /*
 297          * No strings of this length, or no string itself - someone goofed.
 298          */
 299         return (-1);
 300 }
 301 
 302 /*
 303  * Tear down a String_Table structure.
 304  */
 305 void
 306 st_destroy(Str_tbl *stp)
 307 {
 308         Str_hash        *sthash, *psthash;
 309         Str_master      *mstr, *pmstr;
 310         uint_t          i;
 311 
 312         /*
 313          * cleanup the master strings
 314          */
 315         for (mstr = stp->st_mstrlist, pmstr = 0; mstr;
 316             mstr = mstr->sm_next) {
 317                 if (pmstr)
 318                         free(pmstr);
 319                 pmstr = mstr;
 320         }
 321         if (pmstr)
 322                 free(pmstr);
 323 
 324         if (stp->st_hashbcks) {
 325                 for (i = 0; i < stp->st_hbckcnt; i++) {
 326                         for (sthash = stp->st_hashbcks[i], psthash = 0;
 327                             sthash; sthash = sthash->hi_next) {
 328                                 if (psthash)
 329                                         free(psthash);
 330                                 psthash = sthash;
 331                         }
 332                         if (psthash)
 333                                 free(psthash);
 334                 }
 335                 free(stp->st_hashbcks);
 336         }
 337         free(stp);
 338 }
 339 
 340 
 341 /*
 342  * For a given string - copy it into the buffer associated with
 343  * the string table - and return the offset it has been assigned.
 344  *
 345  * If a value of '-1' is returned - the string was not found in
 346  * the Str_tbl.
 347  */
 348 int
 349 st_setstring(Str_tbl *stp, const char *str, size_t *stoff)
 350 {
 351         size_t          stlen;
 352         uint_t          hashval;
 353         Str_hash        *sthash;
 354         Str_master      *mstr;
 355         int             i;
 356 
 357         /*
 358          * String table *must* have been previously cooked
 359          */
 360         assert(stp->st_strbuf);
 361 
 362         assert(stp->st_flags & FLG_STTAB_COOKED);
 363         stlen = strlen(str);
 364         /*
 365          * Null string always points to head of string table
 366          */
 367         if (stlen == 0) {
 368                 *stoff = 0;
 369                 return (0);
 370         }
 371 
 372         if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) {
 373                 size_t          _stoff;
 374 
 375                 stlen++;        /* count for trailing '\0' */
 376                 _stoff = stp->st_nextoff;
 377                 /*
 378                  * Have we overflowed our assigned buffer?
 379                  */
 380                 if ((_stoff + stlen) > stp->st_fullstrsize)
 381                         return (-1);
 382                 memcpy(stp->st_strbuf + _stoff, str, stlen);
 383                 *stoff = _stoff;
 384                 stp->st_nextoff += stlen;
 385                 return (0);
 386         }
 387 
 388         /*
 389          * Calculate reverse hash for string.
 390          */
 391         hashval = HASHSEED;
 392         for (i = stlen; i >= 0; i--) {
 393                 hashval = ((hashval << 5) + hashval) +
 394                     str[i];                     /* h = ((h * 33) + c) */
 395         }
 396 
 397         for (sthash = stp->st_hashbcks[hashval % stp->st_hbckcnt]; sthash;
 398             sthash = sthash->hi_next) {
 399                 const char      *hstr;
 400 
 401                 if (sthash->hi_hashval != hashval)
 402                         continue;
 403 
 404                 hstr = &sthash->hi_mstr->sm_str[sthash->hi_mstr->sm_strlen -
 405                     sthash->hi_strlen];
 406                 if (strcmp(str, hstr) == 0)
 407                         break;
 408         }
 409 
 410         /*
 411          * Did we find the string?
 412          */
 413         if (sthash == 0)
 414                 return (-1);
 415 
 416         /*
 417          * Has this string been copied into the string table?
 418          */
 419         mstr = sthash->hi_mstr;
 420         if (mstr->sm_stroff == 0) {
 421                 size_t  mstrlen = mstr->sm_strlen + 1;
 422 
 423                 mstr->sm_stroff = stp->st_nextoff;
 424 
 425                 /*
 426                  * Have we overflowed our assigned buffer?
 427                  */
 428                 if ((mstr->sm_stroff + mstrlen) > stp->st_fullstrsize)
 429                         return (-1);
 430 
 431                 (void) memcpy(stp->st_strbuf + mstr->sm_stroff,
 432                     mstr->sm_str, mstrlen);
 433                 stp->st_nextoff += mstrlen;
 434         }
 435 
 436         /*
 437          * Calculate offset of (sub)string.
 438          */
 439         *stoff = mstr->sm_stroff + mstr->sm_strlen - sthash->hi_strlen;
 440 
 441         return (0);
 442 }
 443 
 444 
 445 static int
 446 st_hash_insert(Str_tbl *stp, const char *str, size_t len)
 447 {
 448         int             i;
 449         uint_t          hashval = HASHSEED;
 450         uint_t          bckcnt = stp->st_hbckcnt;
 451         Str_hash        **hashbcks = stp->st_hashbcks;
 452         Str_hash        *sthash;
 453         Str_master      *mstr = 0;
 454 
 455         /*
 456          * We use a classic 'Bernstein k=33' hash function.  But
 457          * instead of hashing from the start of the string to the
 458          * end, we do it in reverse.
 459          *
 460          * This way - we are essentially building all of the
 461          * suffix hashvalues as we go.  We can check to see if
 462          * any suffixes already exist in the tree as we generate
 463          * the hash.
 464          */
 465         for (i = len; i >= 0; i--) {
 466                 hashval = ((hashval << 5) + hashval) +
 467                     str[i];                     /* h = ((h * 33) + c) */
 468 
 469                 for (sthash = hashbcks[hashval % bckcnt];
 470                     sthash; sthash = sthash->hi_next) {
 471                         const char      *hstr;
 472                         Str_master      *_mstr;
 473 
 474                         if (sthash->hi_hashval != hashval)
 475                                 continue;
 476 
 477                         _mstr = sthash->hi_mstr;
 478                         hstr = &_mstr->sm_str[_mstr->sm_strlen -
 479                             sthash->hi_strlen];
 480 
 481                         if (strcmp(&str[i], hstr))
 482                                 continue;
 483 
 484                         if (i == 0) {
 485                                 /*
 486                                  * Entry already in table, increment refcnt and
 487                                  * get out.
 488                                  */
 489                                 sthash->hi_refcnt++;
 490                                 return (0);
 491                         } else {
 492                                 /*
 493                                  * If this 'suffix' is presently a 'master
 494                                  * string, then take over it's record.
 495                                  */
 496                                 if (sthash->hi_strlen == _mstr->sm_strlen) {
 497                                         /*
 498                                          * we should only do this once.
 499                                          */
 500                                         assert(mstr == 0);
 501                                         mstr = _mstr;
 502                                 }
 503                         }
 504                 }
 505         }
 506 
 507         /*
 508          * Do we need a new master string, or can we take over
 509          * one we already found in the table?
 510          */
 511         if (mstr == 0) {
 512                 /*
 513                  * allocate a new master string
 514                  */
 515                 if ((mstr = calloc(sizeof (*mstr), 1)) == 0)
 516                         return (-1);
 517                 mstr->sm_next = stp->st_mstrlist;
 518                 stp->st_mstrlist = mstr;
 519                 stp->st_strsize += len + 1;
 520         } else {
 521                 /*
 522                  * We are taking over a existing master string, the string size
 523                  * only increments by the difference between the current string
 524                  * and the previous master.
 525                  */
 526                 assert(len > mstr->sm_strlen);
 527                 stp->st_strsize += len - mstr->sm_strlen;
 528         }
 529 
 530         if ((sthash = calloc(sizeof (*sthash), 1)) == 0)
 531                 return (-1);
 532 
 533         mstr->sm_hashval = sthash->hi_hashval = hashval;
 534         mstr->sm_strlen = sthash->hi_strlen = len;
 535         mstr->sm_str = str;
 536         sthash->hi_refcnt = 1;
 537         sthash->hi_mstr = mstr;
 538 
 539         /*
 540          * Insert string element into head of hash list
 541          */
 542         hashval = hashval % bckcnt;
 543         sthash->hi_next = hashbcks[hashval];
 544         hashbcks[hashval] = sthash;
 545         return (0);
 546 }
 547 
 548 /*
 549  * Return amount of space required for the string table.
 550  */
 551 size_t
 552 st_getstrtab_sz(Str_tbl *stp)
 553 {
 554         assert(stp->st_fullstrsize > 0);
 555 
 556         if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) {
 557                 stp->st_flags |= FLG_STTAB_COOKED;
 558                 return (stp->st_fullstrsize);
 559         }
 560 
 561         if ((stp->st_flags & FLG_STTAB_COOKED) == 0) {
 562                 LenNode         *lnp;
 563                 void            *cookie;
 564 
 565                 stp->st_flags |= FLG_STTAB_COOKED;
 566                 /*
 567                  * allocate a hash table about the size of # of
 568                  * strings input.
 569                  */
 570                 stp->st_hbckcnt = findprime(stp->st_strcnt);
 571                 if ((stp->st_hashbcks = calloc(sizeof (*stp->st_hashbcks),
 572                     stp->st_hbckcnt)) == NULL)
 573                         return (0);
 574 
 575                 /*
 576                  * We now walk all of the strings in the list, from shortest to
 577                  * longest, and insert them into the hashtable.
 578                  */
 579                 if ((lnp = avl_first(stp->st_lentree)) == NULL) {
 580                         /*
 581                          * Is it possible we have an empty string table, if so,
 582                          * the table still contains '\0', so return the size.
 583                          */
 584                         if (avl_numnodes(stp->st_lentree) == 0) {
 585                                 assert(stp->st_strsize == 1);
 586                                 return (stp->st_strsize);
 587                         }
 588                         return (0);
 589                 }
 590 
 591                 while (lnp) {
 592                         StrNode *snp;
 593 
 594                         /*
 595                          * Walk the string lists and insert them into the hash
 596                          * list.  Once a string is inserted we no longer need
 597                          * it's entry, so the string can be freed.
 598                          */
 599                         for (snp = avl_first(lnp->ln_strtree); snp;
 600                             snp = AVL_NEXT(lnp->ln_strtree, snp)) {
 601                                 if (st_hash_insert(stp, snp->sn_str,
 602                                     lnp->ln_strlen) == -1)
 603                                         return (0);
 604                         }
 605 
 606                         /*
 607                          * Now that the strings have been copied, walk the
 608                          * StrNode tree and free all the AVL nodes.  Note,
 609                          * avl_destroy_nodes() beats avl_remove() as the
 610                          * latter balances the nodes as they are removed.
 611                          * We just want to tear the whole thing down fast.
 612                          */
 613                         cookie = NULL;
 614                         while ((snp = avl_destroy_nodes(lnp->ln_strtree,
 615                             &cookie)) != NULL)
 616                                 free(snp);
 617                         avl_destroy(lnp->ln_strtree);
 618                         free(lnp->ln_strtree);
 619                         lnp->ln_strtree = NULL;
 620 
 621                         /*
 622                          * Move on to the next LenNode.
 623                          */
 624                         lnp = AVL_NEXT(stp->st_lentree, lnp);
 625                 }
 626 
 627                 /*
 628                  * Now that all of the strings have been freed, walk the
 629                  * LenNode tree and free all of the AVL nodes.  Note,
 630                  * avl_destroy_nodes() beats avl_remove() as the latter
 631                  * balances the nodes as they are removed. We just want to
 632                  * tear the whole thing down fast.
 633                  */
 634                 cookie = NULL;
 635                 while ((lnp = avl_destroy_nodes(stp->st_lentree,
 636                     &cookie)) != NULL)
 637                         free(lnp);
 638                 avl_destroy(stp->st_lentree);
 639                 free(stp->st_lentree);
 640                 stp->st_lentree = 0;
 641         }
 642 
 643         assert(stp->st_strsize > 0);
 644         assert(stp->st_fullstrsize >= stp->st_strsize);
 645 
 646         return (stp->st_strsize);
 647 }
 648 
 649 /*
 650  * Associate a buffer with a string table.
 651  */
 652 const char *
 653 st_getstrbuf(Str_tbl *stp)
 654 {
 655         return (stp->st_strbuf);
 656 }
 657 
 658 int
 659 st_setstrbuf(Str_tbl *stp, char *stbuf, size_t bufsize)
 660 {
 661         assert(stp->st_flags & FLG_STTAB_COOKED);
 662 
 663         if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) {
 664                 if (bufsize < stp->st_fullstrsize)
 665                         return (-1);
 666         } else {
 667                 if (bufsize < stp->st_strsize)
 668                         return (-1);
 669         }
 670 
 671         stp->st_strbuf = stbuf;
 672 #ifdef  DEBUG
 673         /*
 674          * for debug builds - start with a stringtable filled in
 675          * with '0xff'.  This makes it very easy to spot unfilled
 676          * holes in the strtab.
 677          */
 678         memset(stbuf, 0xff, bufsize);
 679         stbuf[0] = '\0';
 680 #else
 681         memset(stbuf, 0x0, bufsize);
 682 #endif
 683         return (0);
 684 }