1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2019, Joyent, Inc. 14 */ 15 16 #include <errno.h> 17 #include <libcustr.h> 18 #include <limits.h> 19 #include <string.h> 20 #include <sys/ctype.h> /* We want the C locale ISXXX() versions */ 21 #include <sys/debug.h> 22 #include <stdio.h> 23 #include <sys/sysmacros.h> 24 25 #include "strview.h" 26 #include "demangle_int.h" 27 28 /* 29 * Unfortunately, there is currently no official specification for the rust 30 * name mangling. This is an attempt to document the understanding of the 31 * mangling used here. It is based off examination of 32 * https://docs.rs/rustc-demangle/0.1.13/rustc_demangle/ 33 * 34 * A mangled rust name is: 35 * <prefix> <name> <hash> E 36 * 37 * <prefix> ::= _Z 38 * __Z 39 * 40 * <name> ::= <name-segment>+ 41 * 42 * <name-segment> ::= <len> <name-chars>{len} 43 * 44 * <len> ::= [1-9][0-9]+ 45 * 46 * <name-chars> ::= <[A-Za-z]> <[A-Za-z0-9]>* 47 * <separator> 48 * <special> 49 * 50 * <separator> ::= '..' # '::' 51 * 52 * <special> ::= $SP$ # ' ' 53 * $BP$ # '*' 54 * $RF$ # '&' 55 * $LT$ # '<' 56 * $GT$ # '>' 57 * $LP$ # '(' 58 * $RP$ # ')' 59 * $C$ # ',' 60 * $u7e$ # '~' 61 * $u20$ # ' ' 62 * $u27$ # '\'' 63 * $u3d$ # '=' 64 * $u5b$ # '[' 65 * $u5d$ # ']' 66 * $u7b$ # '{' 67 * $u7d$ # '}' 68 * $u3b$ # ';' 69 * $u2b$ # '+' 70 * $u22$ # '"' 71 * 72 * <hash> := <len> h <hex-digits>+ 73 * 74 * <hex-digits> := <[0-9a-f]> 75 */ 76 77 typedef struct rustdem_state { 78 const char *rds_str; 79 custr_t *rds_demangled; 80 sysdem_ops_t *rds_ops; 81 int rds_error; 82 } rustdem_state_t; 83 84 static const struct rust_charmap { 85 const char *ruc_seq; 86 char ruc_ch; 87 } rust_charmap[] = { 88 { "$SP$", '@' }, 89 { "$BP$", '*' }, 90 { "$RF$", '&' }, 91 { "$LT$", '<' }, 92 { "$GT$", '>' }, 93 { "$LP$", '(' }, 94 { "$RP$", ')' }, 95 { "$C$", ',' }, 96 { "$u7e$", '~' }, 97 { "$u20$", ' ' }, 98 { "$u27$", '\'' }, 99 { "$u3d$", '=' }, 100 { "$u5b$", '[' }, 101 { "$u5d$", ']' }, 102 { "$u7b$", '{' }, 103 { "$u7d$", '}' }, 104 { "$u3b$", ';' }, 105 { "$u2b$", '+' }, 106 { "$u22$", '"' } 107 }; 108 static const size_t rust_charmap_sz = ARRAY_SIZE(rust_charmap); 109 110 static void *rustdem_alloc(custr_alloc_t *, size_t); 111 static void rustdem_free(custr_alloc_t *, void *, size_t); 112 113 static boolean_t rustdem_append_c(rustdem_state_t *, char); 114 static boolean_t rustdem_all_ascii(const strview_t *); 115 116 static boolean_t rustdem_parse_prefix(rustdem_state_t *, strview_t *); 117 static boolean_t rustdem_parse_name(rustdem_state_t *, strview_t *); 118 static boolean_t rustdem_parse_hash(rustdem_state_t *, strview_t *); 119 static boolean_t rustdem_parse_num(rustdem_state_t *, strview_t *, uint64_t *); 120 static boolean_t rustdem_parse_special(rustdem_state_t *, strview_t *); 121 static boolean_t rustdem_add_sep(rustdem_state_t *); 122 123 char * 124 rust_demangle(const char *s, size_t slen, sysdem_ops_t *ops) 125 { 126 rustdem_state_t st = { 127 .rds_str = s, 128 .rds_ops = ops, 129 }; 130 custr_alloc_ops_t custr_ops = { 131 .custr_ao_alloc = rustdem_alloc, 132 .custr_ao_free = rustdem_free 133 }; 134 custr_alloc_t custr_alloc = { 135 .cua_version = CUSTR_VERSION 136 }; 137 strview_t sv; 138 int ret; 139 140 if (custr_alloc_init(&custr_alloc, &custr_ops) != 0) 141 return (NULL); 142 custr_alloc.cua_arg = &st; 143 144 sv_init_str(&sv, s, s + slen); 145 146 if (sv_remaining(&sv) < 1 || sv_peek(&sv, -1) != 'E') { 147 DEMDEBUG("ERROR: string is either too small or does not end " 148 "with 'E'"); 149 errno = EINVAL; 150 return (NULL); 151 } 152 153 if (!rustdem_parse_prefix(&st, &sv)) { 154 DEMDEBUG("ERROR: could not parse prefix"); 155 errno = EINVAL; 156 return (NULL); 157 } 158 DEMDEBUG("parsed prefix; remaining='%.*s'", SV_PRINT(&sv)); 159 160 if (!rustdem_all_ascii(&sv)) { 161 /* rustdem_all_ascii() provides debug output */ 162 errno = EINVAL; 163 return (NULL); 164 } 165 166 if ((ret = custr_xalloc(&st.rds_demangled, &custr_alloc)) != 0) 167 return (NULL); 168 169 while (sv_remaining(&sv) > 1) { 170 if (rustdem_parse_name(&st, &sv)) 171 continue; 172 if (st.rds_error != 0) 173 goto fail; 174 } 175 176 if (st.rds_error != 0 || !sv_consume_if_c(&sv, 'E')) 177 goto fail; 178 179 char *res = xstrdup(ops, custr_cstr(st.rds_demangled)); 180 if (res == NULL) { 181 st.rds_error = errno; 182 goto fail; 183 } 184 185 custr_free(st.rds_demangled); 186 DEMDEBUG("result = '%s'", res); 187 return (res); 188 189 fail: 190 custr_free(st.rds_demangled); 191 errno = st.rds_error; 192 return (NULL); 193 } 194 195 static boolean_t 196 rustdem_parse_prefix(rustdem_state_t *st, strview_t *svp) 197 { 198 strview_t pfx; 199 200 sv_init_sv(&pfx, svp); 201 202 DEMDEBUG("checking for '_ZN' or '__ZN' in '%.*s'", SV_PRINT(&pfx)); 203 204 if (st->rds_error != 0) 205 return (B_FALSE); 206 207 if (!sv_consume_if_c(&pfx, '_')) 208 return (B_FALSE); 209 210 (void) sv_consume_if_c(&pfx, '_'); 211 212 if (!sv_consume_if_c(&pfx, 'Z') || !sv_consume_if_c(&pfx, 'N')) 213 return (B_FALSE); 214 215 /* Update svp with new position */ 216 sv_init_sv(svp, &pfx); 217 return (B_TRUE); 218 } 219 220 static boolean_t 221 rustdem_parse_name_segment(rustdem_state_t *st, strview_t *svp, boolean_t first) 222 { 223 strview_t sv; 224 strview_t name; 225 uint64_t len; 226 size_t rem; 227 boolean_t last = B_FALSE; 228 229 if (st->rds_error != 0 || sv_remaining(svp) == 0) 230 return (B_FALSE); 231 232 sv_init_sv(&sv, svp); 233 234 if (!rustdem_parse_num(st, &sv, &len)) { 235 DEMDEBUG("ERROR: no leading length"); 236 st->rds_error = EINVAL; 237 return (B_FALSE); 238 } 239 240 rem = sv_remaining(&sv); 241 242 if (rem < len || len > SIZE_MAX) { 243 st->rds_error = EINVAL; 244 return (B_FALSE); 245 } 246 247 /* Is this the last segment before the terminating E? */ 248 if (rem == len + 1) { 249 VERIFY3U(sv_peek(&sv, -1), ==, 'E'); 250 last = B_TRUE; 251 } 252 253 if (!first && !rustdem_add_sep(st)) 254 return (B_FALSE); 255 256 /* Reduce length of seg to the length we parsed */ 257 (void) sv_init_sv_range(&name, &sv, len); 258 259 DEMDEBUG("%s: segment='%.*s'", __func__, SV_PRINT(&name)); 260 261 /* 262 * A rust hash starts with 'h', and is the last component of a name 263 * before the terminating 'E' 264 */ 265 if (sv_peek(&name, 0) == 'h' && last) { 266 if (!rustdem_parse_hash(st, &name)) 267 return (B_FALSE); 268 goto done; 269 } 270 271 while (sv_remaining(&name) > 0) { 272 switch (sv_peek(&name, 0)) { 273 case '$': 274 if (rustdem_parse_special(st, &name)) 275 continue; 276 break; 277 case '_': 278 if (sv_peek(&name, 1) == '$') { 279 /* 280 * Only consume/ignore '_'. Leave 281 * $ for next round. 282 */ 283 sv_consume_n(&name, 1); 284 continue; 285 } 286 break; 287 case '.': 288 /* Convert '..' to '::' */ 289 if (sv_peek(&name, 1) != '.') 290 break; 291 292 if (!rustdem_add_sep(st)) 293 return (B_FALSE); 294 295 sv_consume_n(&name, 2); 296 continue; 297 default: 298 break; 299 } 300 301 if (custr_appendc(st->rds_demangled, 302 sv_consume_c(&name)) != 0) { 303 st->rds_error = ENOMEM; 304 return (B_FALSE); 305 } 306 } 307 308 done: 309 DEMDEBUG("%s: consumed '%.*s'", __func__, (int)len, svp->sv_first); 310 sv_consume_n(&sv, len); 311 sv_init_sv(svp, &sv); 312 return (B_TRUE); 313 } 314 315 static boolean_t 316 rustdem_parse_name(rustdem_state_t *st, strview_t *svp) 317 { 318 strview_t name; 319 boolean_t first = B_TRUE; 320 321 if (st->rds_error != 0) 322 return (B_FALSE); 323 324 sv_init_sv(&name, svp); 325 326 if (sv_remaining(&name) == 0) 327 return (B_FALSE); 328 329 while (sv_remaining(&name) > 0 && sv_peek(&name, 0) != 'E') { 330 if (!rustdem_parse_name_segment(st, &name, first)) 331 return (B_FALSE); 332 first = B_FALSE; 333 } 334 335 sv_init_sv(svp, &name); 336 return (B_TRUE); 337 } 338 339 static boolean_t 340 rustdem_parse_hash(rustdem_state_t *st, strview_t *svp) 341 { 342 strview_t sv; 343 344 sv_init_sv(&sv, svp); 345 346 VERIFY(sv_consume_if_c(&sv, 'h')); 347 if (!rustdem_append_c(st, 'h')) 348 return (B_FALSE); 349 350 while (sv_remaining(&sv) > 0) { 351 char c = sv_consume_c(&sv); 352 353 switch (c) { 354 /* 355 * The upper-case hex digits (A-F) are excluded as valid 356 * hash values for several reasons: 357 * 358 * 1. It would result in two different possible names for 359 * the same function, leading to ambiguity in linking (among 360 * other things). 361 * 362 * 2. It would cause potential ambiguity in parsing -- is a 363 * trailing 'E' part of the hash, or the terminating character 364 * in the mangled name? 365 * 366 * 3. No examples were able to be found in the wild where 367 * uppercase digits are used, and other rust demanglers all 368 * seem to assume the hash must contain lower-case hex digits. 369 */ 370 case '0': case '1': case '2': case '3': 371 case '4': case '5': case '6': case '7': 372 case '8': case '9': case 'a': case 'b': 373 case 'c': case 'd': case 'e': case 'f': 374 if (!rustdem_append_c(st, c)) 375 return (B_FALSE); 376 break; 377 default: 378 return (B_FALSE); 379 } 380 } 381 382 sv_init_sv(svp, &sv); 383 return (B_TRUE); 384 } 385 386 /* 387 * A 10 digit value would imply a name 1Gb or larger in size. It seems 388 * unlikely to the point of absurdity any such value could every possibly 389 * be valid (or even have compiled properly). This also prevents the 390 * uint64_t conversion from possibly overflowing since the value must always 391 * be below 10 * UINT32_MAX. 392 */ 393 #define MAX_DIGITS 10 394 395 static boolean_t 396 rustdem_parse_num(rustdem_state_t *restrict st, strview_t *restrict svp, 397 uint64_t *restrict valp) 398 { 399 strview_t snum; 400 uint64_t v = 0; 401 size_t ndigits = 0; 402 char c; 403 404 if (st->rds_error != 0) 405 return (B_FALSE); 406 407 sv_init_sv(&snum, svp); 408 409 DEMDEBUG("%s: str='%.*s'", __func__, SV_PRINT(&snum)); 410 411 c = sv_peek(&snum, 0); 412 if (!ISDIGIT(c)) { 413 DEMDEBUG("%s: ERROR no digits in str\n", __func__); 414 st->rds_error = EINVAL; 415 return (B_FALSE); 416 } 417 418 /* 419 * Since there is currently no official specification on rust name 420 * mangling, only that it has been stated that rust follows what 421 * C++ mangling does. In the Itanium C++ ABI (what practically 422 * every non-Windows C++ implementation uses these days), it 423 * explicitly disallows leading 0s in numeric values (except for 424 * substition and template indexes, which aren't relevant here). 425 * We enforce the same restriction -- if a rust implementation allowed 426 * leading zeros in numbers (basically segment lengths) it'd 427 * cause all sorts of ambiguity problems with names that likely lead 428 * to much bigger problems with linking and such, so this seems 429 * reasonable. 430 */ 431 if (c == '0') { 432 DEMDEBUG("%s: ERROR number starts with leading 0\n", __func__); 433 st->rds_error = EINVAL; 434 return (B_FALSE); 435 } 436 437 while (sv_remaining(&snum) > 0 && ndigits <= MAX_DIGITS) { 438 c = sv_consume_c(&snum); 439 440 if (!ISDIGIT(c)) 441 break; 442 443 v *= 10; 444 v += c - '0'; 445 ndigits++; 446 } 447 448 if (ndigits > MAX_DIGITS) { 449 DEMDEBUG("%s: value %llu is too large\n", __func__, v); 450 st->rds_error = ERANGE; 451 return (B_FALSE); 452 } 453 454 DEMDEBUG("%s: num=%llu", __func__, v); 455 456 *valp = v; 457 sv_consume_n(svp, ndigits); 458 return (B_TRUE); 459 } 460 461 static boolean_t 462 rustdem_parse_special(rustdem_state_t *restrict st, strview_t *restrict svp) 463 { 464 if (st->rds_error != 0) 465 return (B_FALSE); 466 467 if (sv_peek(svp, 0) != '$') 468 return (B_FALSE); 469 470 for (size_t i = 0; i < rust_charmap_sz; i++) { 471 if (sv_consume_if(svp, rust_charmap[i].ruc_seq)) { 472 if (!rustdem_append_c(st, rust_charmap[i].ruc_ch)) 473 return (B_FALSE); 474 return (B_TRUE); 475 } 476 } 477 return (B_FALSE); 478 } 479 480 static boolean_t 481 rustdem_add_sep(rustdem_state_t *st) 482 { 483 if (st->rds_error != 0) 484 return (B_FALSE); 485 486 if (!rustdem_append_c(st, ':') || 487 !rustdem_append_c(st, ':')) 488 return (B_FALSE); 489 490 return (B_TRUE); 491 } 492 493 static boolean_t 494 rustdem_append_c(rustdem_state_t *st, char c) 495 { 496 if (st->rds_error != 0) 497 return (B_FALSE); 498 499 if (custr_appendc(st->rds_demangled, c) == 0) 500 return (B_TRUE); 501 502 st->rds_error = errno; 503 return (B_FALSE); 504 } 505 506 static boolean_t 507 rustdem_all_ascii(const strview_t *svp) 508 { 509 strview_t p; 510 511 sv_init_sv(&p, svp); 512 513 while (sv_remaining(&p) > 0) { 514 char c = sv_consume_c(&p); 515 516 /* 517 * #including <sys/ctype.h> conflicts with <ctype.h>. Since 518 * we want the C locale macros (ISDIGIT, etc), it also means 519 * we can't use isascii(3C). 520 */ 521 if ((c & 0x80) != 0) { 522 DEMDEBUG("%s: found non-ascii character 0x%02hhx at " 523 "offset %tu", __func__, c, 524 (ptrdiff_t)(p.sv_first - svp->sv_first)); 525 return (B_FALSE); 526 } 527 } 528 return (B_TRUE); 529 } 530 531 static void * 532 rustdem_alloc(custr_alloc_t *cao, size_t len) 533 { 534 rustdem_state_t *st = cao->cua_arg; 535 return (zalloc(st->rds_ops, len)); 536 } 537 538 static void 539 rustdem_free(custr_alloc_t *cao, void *p, size_t len) 540 { 541 rustdem_state_t *st = cao->cua_arg; 542 xfree(st->rds_ops, p, len); 543 }