1 /*
   2  * This file and its contents are supplied under the terms of the
   3  * Common Development and Distribution License ("CDDL"), version 1.0.
   4  * You may only use this file in accordance with the terms of version
   5  * 1.0 of the CDDL.
   6  *
   7  * A full copy of the text of the CDDL should have accompanied this
   8  * source.  A copy of the CDDL is also available via the Internet at
   9  * http://www.illumos.org/license/CDDL.
  10  */
  11 
  12 /*
  13  * Copyright 2019, Joyent, Inc.
  14  */
  15 
  16 #include <errno.h>
  17 #include <libcustr.h>
  18 #include <limits.h>
  19 #include <string.h>
  20 #include <sys/ctype.h>    /* We want the C locale ISXXX() versions */
  21 #include <sys/debug.h>
  22 #include <stdio.h>
  23 #include <sys/sysmacros.h>
  24 
  25 #include "strview.h"
  26 #include "demangle_int.h"
  27 
  28 /*
  29  * Unfortunately, there is currently no official specification for the rust
  30  * name mangling.  This is an attempt to document the understanding of the
  31  * mangling used here.  It is based off examination of
  32  *     https://docs.rs/rustc-demangle/0.1.13/rustc_demangle/
  33  *
  34  * A mangled rust name is:
  35  *     <prefix> <name> <hash> E
  36  *
 * <prefix>       ::=     _ZN
 *                      __ZN
  39  *
  40  * <name> ::= <name-segment>+
  41  *
  42  * <name-segment> ::= <len> <name-chars>{len}
  43  *
 * <len>  ::= [1-9][0-9]*
  45  *
  46  * <name-chars>   ::=     <[A-Za-z]> <[A-Za-z0-9]>*
  47  *                      <separator>
  48  *                      <special>
  49  *
  50  * <separator>    ::=     '..'    # '::'
  51  *
 * <special>      ::=     $SP$    # '@'
  53  *                      $BP$    # '*'
  54  *                      $RF$    # '&'
  55  *                      $LT$    # '<'
  56  *                      $GT$    # '>'
  57  *                      $LP$    # '('
  58  *                      $RP$    # ')'
  59  *                      $C$     # ','
  60  *                      $u7e$   # '~'
  61  *                      $u20$   # ' '
  62  *                      $u27$   # '\''
  63  *                      $u3d$   # '='
  64  *                      $u5b$   # '['
  65  *                      $u5d$   # ']'
  66  *                      $u7b$   # '{'
  67  *                      $u7d$   # '}'
  68  *                      $u3b$   # ';'
  69  *                      $u2b$   # '+'
  70  *                      $u22$   # '"'
  71  *
  72  * <hash> := <len> h <hex-digits>+
  73  *
  74  * <hex-digits>   := <[0-9a-f]>
  75  */
  76 
  77 typedef struct rustdem_state {
  78         const char      *rds_str;
  79         custr_t         *rds_demangled;
  80         sysdem_ops_t    *rds_ops;
  81         int             rds_error;
  82 } rustdem_state_t;
  83 
  84 static const struct rust_charmap {
  85         const char      *ruc_seq;
  86         char            ruc_ch;
  87 } rust_charmap[] = {
  88         { "$SP$", '@' },
  89         { "$BP$", '*' },
  90         { "$RF$", '&' },
  91         { "$LT$", '<' },
  92         { "$GT$", '>' },
  93         { "$LP$", '(' },
  94         { "$RP$", ')' },
  95         { "$C$", ',' },
  96         { "$u7e$", '~' },
  97         { "$u20$", ' ' },
  98         { "$u27$", '\'' },
  99         { "$u3d$", '=' },
 100         { "$u5b$", '[' },
 101         { "$u5d$", ']' },
 102         { "$u7b$", '{' },
 103         { "$u7d$", '}' },
 104         { "$u3b$", ';' },
 105         { "$u2b$", '+' },
 106         { "$u22$", '"' }
 107 };
 108 static const size_t rust_charmap_sz = ARRAY_SIZE(rust_charmap);
 109 
 110 static void *rustdem_alloc(custr_alloc_t *, size_t);
 111 static void rustdem_free(custr_alloc_t *, void *, size_t);
 112 
 113 static boolean_t rustdem_append_c(rustdem_state_t *, char);
 114 static boolean_t rustdem_all_ascii(const strview_t *);
 115 
 116 static boolean_t rustdem_parse_prefix(rustdem_state_t *, strview_t *);
 117 static boolean_t rustdem_parse_name(rustdem_state_t *, strview_t *);
 118 static boolean_t rustdem_parse_hash(rustdem_state_t *, strview_t *);
 119 static boolean_t rustdem_parse_num(rustdem_state_t *, strview_t *, uint64_t *);
 120 static boolean_t rustdem_parse_special(rustdem_state_t *, strview_t *);
 121 static boolean_t rustdem_add_sep(rustdem_state_t *);
 122 
 123 char *
 124 rust_demangle(const char *s, size_t slen, sysdem_ops_t *ops)
 125 {
 126         rustdem_state_t st = {
 127                 .rds_str = s,
 128                 .rds_ops = ops,
 129         };
 130         custr_alloc_ops_t custr_ops = {
 131                 .custr_ao_alloc = rustdem_alloc,
 132                 .custr_ao_free = rustdem_free
 133         };
 134         custr_alloc_t custr_alloc = {
 135                 .cua_version = CUSTR_VERSION
 136         };
 137         strview_t sv;
 138         int ret;
 139 
 140         if (custr_alloc_init(&custr_alloc, &custr_ops) != 0)
 141                 return (NULL);
 142         custr_alloc.cua_arg = &st;
 143 
 144         sv_init_str(&sv, s, s + slen);
 145 
 146         if (sv_remaining(&sv) < 1 || sv_peek(&sv, -1) != 'E') {
 147                 DEMDEBUG("ERROR: string is either too small or does not end "
 148                     "with 'E'");
 149                 errno = EINVAL;
 150                 return (NULL);
 151         }
 152 
 153         if (!rustdem_parse_prefix(&st, &sv)) {
 154                 DEMDEBUG("ERROR: could not parse prefix");
 155                 errno = EINVAL;
 156                 return (NULL);
 157         }
 158         DEMDEBUG("parsed prefix; remaining='%.*s'", SV_PRINT(&sv));
 159 
 160         if (!rustdem_all_ascii(&sv)) {
 161                 /* rustdem_all_ascii() provides debug output */
 162                 errno = EINVAL;
 163                 return (NULL);
 164         }
 165 
 166         if ((ret = custr_xalloc(&st.rds_demangled, &custr_alloc)) != 0)
 167                 return (NULL);
 168 
 169         while (sv_remaining(&sv) > 1) {
 170                 if (rustdem_parse_name(&st, &sv))
 171                         continue;
 172                 if (st.rds_error != 0)
 173                         goto fail;
 174         }
 175 
 176         if (st.rds_error != 0 || !sv_consume_if_c(&sv, 'E'))
 177                 goto fail;
 178 
 179         char *res = xstrdup(ops, custr_cstr(st.rds_demangled));
 180         if (res == NULL) {
 181                 st.rds_error = errno;
 182                 goto fail;
 183         }
 184 
 185         custr_free(st.rds_demangled);
 186         DEMDEBUG("result = '%s'", res);
 187         return (res);
 188 
 189 fail:
 190         custr_free(st.rds_demangled);
 191         errno = st.rds_error;
 192         return (NULL);
 193 }
 194 
 195 static boolean_t
 196 rustdem_parse_prefix(rustdem_state_t *st, strview_t *svp)
 197 {
 198         strview_t pfx;
 199 
 200         sv_init_sv(&pfx, svp);
 201 
 202         DEMDEBUG("checking for '_ZN' or '__ZN' in '%.*s'", SV_PRINT(&pfx));
 203 
 204         if (st->rds_error != 0)
 205                 return (B_FALSE);
 206 
 207         if (!sv_consume_if_c(&pfx, '_'))
 208                 return (B_FALSE);
 209 
 210         (void) sv_consume_if_c(&pfx, '_');
 211 
 212         if (!sv_consume_if_c(&pfx, 'Z') || !sv_consume_if_c(&pfx, 'N'))
 213                 return (B_FALSE);
 214 
 215         /* Update svp with new position */
 216         sv_init_sv(svp, &pfx);
 217         return (B_TRUE);
 218 }
 219 
 220 static boolean_t
 221 rustdem_parse_name_segment(rustdem_state_t *st, strview_t *svp, boolean_t first)
 222 {
 223         strview_t sv;
 224         strview_t name;
 225         uint64_t len;
 226         size_t rem;
 227         boolean_t last = B_FALSE;
 228 
 229         if (st->rds_error != 0 || sv_remaining(svp) == 0)
 230                 return (B_FALSE);
 231 
 232         sv_init_sv(&sv, svp);
 233 
 234         if (!rustdem_parse_num(st, &sv, &len)) {
 235                 DEMDEBUG("ERROR: no leading length");
 236                 st->rds_error = EINVAL;
 237                 return (B_FALSE);
 238         }
 239 
 240         rem = sv_remaining(&sv);
 241 
 242         if (rem < len || len > SIZE_MAX) {
 243                 st->rds_error = EINVAL;
 244                 return (B_FALSE);
 245         }
 246 
 247         /* Is this the last segment before the terminating E? */
 248         if (rem == len + 1) {
 249                 VERIFY3U(sv_peek(&sv, -1), ==, 'E');
 250                 last = B_TRUE;
 251         }
 252 
 253         if (!first && !rustdem_add_sep(st))
 254                 return (B_FALSE);
 255 
 256         /* Reduce length of seg to the length we parsed */
 257         (void) sv_init_sv_range(&name, &sv, len);
 258 
 259         DEMDEBUG("%s: segment='%.*s'", __func__, SV_PRINT(&name));
 260 
 261         /*
 262          * A rust hash starts with 'h', and is the last component of a name
 263          * before the terminating 'E'
 264          */
 265         if (sv_peek(&name, 0) == 'h' && last) {
 266                 if (!rustdem_parse_hash(st, &name))
 267                         return (B_FALSE);
 268                 goto done;
 269         }
 270 
 271         while (sv_remaining(&name) > 0) {
 272                 switch (sv_peek(&name, 0)) {
 273                 case '$':
 274                         if (rustdem_parse_special(st, &name))
 275                                 continue;
 276                         break;
 277                 case '_':
 278                         if (sv_peek(&name, 1) == '$') {
 279                                 /*
 280                                  * Only consume/ignore '_'.  Leave
 281                                  * $ for next round.
 282                                  */
 283                                 sv_consume_n(&name, 1);
 284                                 continue;
 285                         }
 286                         break;
 287                 case '.':
 288                         /* Convert '..' to '::' */
 289                         if (sv_peek(&name, 1) != '.')
 290                                 break;
 291 
 292                         if (!rustdem_add_sep(st))
 293                                 return (B_FALSE);
 294 
 295                         sv_consume_n(&name, 2);
 296                         continue;
 297                 default:
 298                         break;
 299                 }
 300 
 301                 if (custr_appendc(st->rds_demangled,
 302                     sv_consume_c(&name)) != 0) {
 303                         st->rds_error = ENOMEM;
 304                         return (B_FALSE);
 305                 }
 306         }
 307 
 308 done:
 309         DEMDEBUG("%s: consumed '%.*s'", __func__, (int)len, svp->sv_first);
 310         sv_consume_n(&sv, len);
 311         sv_init_sv(svp, &sv);
 312         return (B_TRUE);
 313 }
 314 
 315 static boolean_t
 316 rustdem_parse_name(rustdem_state_t *st, strview_t *svp)
 317 {
 318         strview_t name;
 319         boolean_t first = B_TRUE;
 320 
 321         if (st->rds_error != 0)
 322                 return (B_FALSE);
 323 
 324         sv_init_sv(&name, svp);
 325 
 326         if (sv_remaining(&name) == 0)
 327                 return (B_FALSE);
 328 
 329         while (sv_remaining(&name) > 0 && sv_peek(&name, 0) != 'E') {
 330                 if (!rustdem_parse_name_segment(st, &name, first))
 331                         return (B_FALSE);
 332                 first = B_FALSE;
 333         }
 334 
 335         sv_init_sv(svp, &name);
 336         return (B_TRUE);
 337 }
 338 
 339 static boolean_t
 340 rustdem_parse_hash(rustdem_state_t *st, strview_t *svp)
 341 {
 342         strview_t sv;
 343 
 344         sv_init_sv(&sv, svp);
 345 
 346         VERIFY(sv_consume_if_c(&sv, 'h'));
 347         if (!rustdem_append_c(st, 'h'))
 348                 return (B_FALSE);
 349 
 350         while (sv_remaining(&sv) > 0) {
 351                 char c = sv_consume_c(&sv);
 352 
 353                 switch (c) {
 354                 /*
 355                  * The upper-case hex digits (A-F) are excluded as valid
 356                  * hash values for several reasons:
 357                  *
 358                  * 1. It would result in two different possible names for
 359                  * the same function, leading to ambiguity in linking (among
 360                  * other things).
 361                  *
 362                  * 2. It would cause potential ambiguity in parsing -- is a
 363                  * trailing 'E' part of the hash, or the terminating character
 364                  * in the mangled name?
 365                  *
 366                  * 3. No examples were able to be found in the wild where
 367                  * uppercase digits are used, and other rust demanglers all
 368                  * seem to assume the hash must contain lower-case hex digits.
 369                  */
 370                 case '0': case '1': case '2': case '3':
 371                 case '4': case '5': case '6': case '7':
 372                 case '8': case '9': case 'a': case 'b':
 373                 case 'c': case 'd': case 'e': case 'f':
 374                         if (!rustdem_append_c(st, c))
 375                                 return (B_FALSE);
 376                         break;
 377                 default:
 378                         return (B_FALSE);
 379                 }
 380         }
 381 
 382         sv_init_sv(svp, &sv);
 383         return (B_TRUE);
 384 }
 385 
 386 /*
 387  * A 10 digit value would imply a name 1Gb or larger in size.  It seems
 388  * unlikely to the point of absurdity any such value could every possibly
 389  * be valid (or even have compiled properly).  This also prevents the
 390  * uint64_t conversion from possibly overflowing since the value must always
 391  * be below 10 * UINT32_MAX.
 392  */
 393 #define MAX_DIGITS 10
 394 
 395 static boolean_t
 396 rustdem_parse_num(rustdem_state_t *restrict st, strview_t *restrict svp,
 397     uint64_t *restrict valp)
 398 {
 399         strview_t snum;
 400         uint64_t v = 0;
 401         size_t ndigits = 0;
 402         char c;
 403 
 404         if (st->rds_error != 0)
 405                 return (B_FALSE);
 406 
 407         sv_init_sv(&snum, svp);
 408 
 409         DEMDEBUG("%s: str='%.*s'", __func__, SV_PRINT(&snum));
 410 
 411         c = sv_peek(&snum, 0);
 412         if (!ISDIGIT(c)) {
 413                 DEMDEBUG("%s: ERROR no digits in str\n", __func__);
 414                 st->rds_error = EINVAL;
 415                 return (B_FALSE);
 416         }
 417 
 418         /*
 419          * Since there is currently no official specification on rust name
 420          * mangling, only that it has been stated that rust follows what
 421          * C++ mangling does.  In the Itanium C++ ABI (what practically
 422          * every non-Windows C++ implementation uses these days), it
 423          * explicitly disallows leading 0s in numeric values (except for
 424          * substition and template indexes, which aren't relevant here).
 425          * We enforce the same restriction -- if a rust implementation allowed
 426          * leading zeros in numbers (basically segment lengths) it'd
 427          * cause all sorts of ambiguity problems with names that likely lead
 428          * to much bigger problems with linking and such, so this seems
 429          * reasonable.
 430          */
 431         if (c == '0') {
 432                 DEMDEBUG("%s: ERROR number starts with leading 0\n", __func__);
 433                 st->rds_error = EINVAL;
 434                 return (B_FALSE);
 435         }
 436 
 437         while (sv_remaining(&snum) > 0 && ndigits <= MAX_DIGITS) {
 438                 c = sv_consume_c(&snum);
 439 
 440                 if (!ISDIGIT(c))
 441                         break;
 442 
 443                 v *= 10;
 444                 v += c - '0';
 445                 ndigits++;
 446         }
 447 
 448         if (ndigits > MAX_DIGITS) {
 449                 DEMDEBUG("%s: value %llu is too large\n", __func__, v);
 450                 st->rds_error = ERANGE;
 451                 return (B_FALSE);
 452         }
 453 
 454         DEMDEBUG("%s: num=%llu", __func__, v);
 455 
 456         *valp = v;
 457         sv_consume_n(svp, ndigits);
 458         return (B_TRUE);
 459 }
 460 
 461 static boolean_t
 462 rustdem_parse_special(rustdem_state_t *restrict st, strview_t *restrict svp)
 463 {
 464         if (st->rds_error != 0)
 465                 return (B_FALSE);
 466 
 467         if (sv_peek(svp, 0) != '$')
 468                 return (B_FALSE);
 469 
 470         for (size_t i = 0; i < rust_charmap_sz; i++) {
 471                 if (sv_consume_if(svp, rust_charmap[i].ruc_seq)) {
 472                         if (!rustdem_append_c(st, rust_charmap[i].ruc_ch))
 473                                 return (B_FALSE);
 474                         return (B_TRUE);
 475                 }
 476         }
 477         return (B_FALSE);
 478 }
 479 
 480 static boolean_t
 481 rustdem_add_sep(rustdem_state_t *st)
 482 {
 483         if (st->rds_error != 0)
 484                 return (B_FALSE);
 485 
 486         if (!rustdem_append_c(st, ':') ||
 487             !rustdem_append_c(st, ':'))
 488                 return (B_FALSE);
 489 
 490         return (B_TRUE);
 491 }
 492 
 493 static boolean_t
 494 rustdem_append_c(rustdem_state_t *st, char c)
 495 {
 496         if (st->rds_error != 0)
 497                 return (B_FALSE);
 498 
 499         if (custr_appendc(st->rds_demangled, c) == 0)
 500                 return (B_TRUE);
 501 
 502         st->rds_error = errno;
 503         return (B_FALSE);
 504 }
 505 
 506 static boolean_t
 507 rustdem_all_ascii(const strview_t *svp)
 508 {
 509         strview_t p;
 510 
 511         sv_init_sv(&p, svp);
 512 
 513         while (sv_remaining(&p) > 0) {
 514                 char c = sv_consume_c(&p);
 515 
 516                 /*
 517                  * #including <sys/ctype.h> conflicts with <ctype.h>.  Since
 518                  * we want the C locale macros (ISDIGIT, etc), it also means
 519                  * we can't use isascii(3C).
 520                  */
 521                 if ((c & 0x80) != 0) {
 522                         DEMDEBUG("%s: found non-ascii character 0x%02hhx at "
 523                             "offset %tu", __func__, c,
 524                             (ptrdiff_t)(p.sv_first - svp->sv_first));
 525                         return (B_FALSE);
 526                 }
 527         }
 528         return (B_TRUE);
 529 }
 530 
 531 static void *
 532 rustdem_alloc(custr_alloc_t *cao, size_t len)
 533 {
 534         rustdem_state_t *st = cao->cua_arg;
 535         return (zalloc(st->rds_ops, len));
 536 }
 537 
 538 static void
 539 rustdem_free(custr_alloc_t *cao, void *p, size_t len)
 540 {
 541         rustdem_state_t *st = cao->cua_arg;
 542         xfree(st->rds_ops, p, len);
 543 }