Print this page
11472 fix libdemangle rust changes
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/lib/libdemangle/common/rust.c
+++ new/usr/src/lib/libdemangle/common/rust.c
1 1 /*
2 2 * This file and its contents are supplied under the terms of the
3 3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 4 * You may only use this file in accordance with the terms of version
5 5 * 1.0 of the CDDL.
6 6 *
7 7 * A full copy of the text of the CDDL should have accompanied this
8 8 * source. A copy of the CDDL is also available via the Internet at
9 9 * http://www.illumos.org/license/CDDL.
10 10 */
11 11
12 12 /*
13 13 * Copyright 2019, Joyent, Inc.
14 14 */
15 15
16 16 #include <errno.h>
17 17 #include <libcustr.h>
18 18 #include <limits.h>
19 19 #include <string.h>
20 20 #include <sys/ctype.h> /* We want the C locale ISXXX() versions */
21 21 #include <sys/debug.h>
22 22 #include <stdio.h>
23 23 #include <sys/sysmacros.h>
24 24
25 25 #include "strview.h"
26 26 #include "demangle_int.h"
27 27
28 28 /*
29 29 * Unfortunately, there is currently no official specification for the rust
30 30 * name mangling. This is an attempt to document the understanding of the
31 31 * mangling used here. It is based off examination of
32 32 * https://docs.rs/rustc-demangle/0.1.13/rustc_demangle/
33 33 *
34 34 * A mangled rust name is:
35 35 * <prefix> <name> <hash> E
36 36 *
37 37 * <prefix> ::= _ZN
38 38 * __ZN
39 39 *
40 40 * <name> ::= <name-segment>+
41 41 *
42 42 * <name-segment> ::= <len> <name-chars>{len}
43 43 *
44 44 * <len> ::= [1-9][0-9]*
45 45 *
46 46 * <name-chars> ::= <[A-Za-z]> <[A-Za-z0-9]>*
47 47 * <separator>
48 48 * <special>
49 49 *
50 50 * <separator> ::= '..' # '::'
51 51 *
52 52 * <special> ::= $SP$ # '@'
53 53 * $BP$ # '*'
54 54 * $RF$ # '&'
55 55 * $LT$ # '<'
56 56 * $GT$ # '>'
57 57 * $LP$ # '('
58 58 * $RP$ # ')'
59 59 * $C$ # ','
60 60 * $u7e$ # '~'
61 61 * $u20$ # ' '
62 62 * $u27$ # '\''
63 63 * $u3d$ # '='
64 64 * $u5b$ # '['
65 65 * $u5d$ # ']'
66 66 * $u7b$ # '{'
67 67 * $u7d$ # '}'
68 68 * $u3b$ # ';'
69 69 * $u2b$ # '+'
70 70 * $u22$ # '"'
71 71 *
72 72 * <hash> ::= <len> h <hex-digits>+
73 73 *
74 74 * <hex-digits> ::= <[0-9a-f]>
75 75 */
76 76
77 77 typedef struct rustdem_state {
78 78 const char *rds_str;
79 79 custr_t *rds_demangled;
80 80 sysdem_ops_t *rds_ops;
81 81 int rds_error;
82 82 } rustdem_state_t;
83 83
/*
 * Escape sequences that may appear inside a name segment, and the single
 * character each one is replaced with in the demangled output.
 */
static const struct rust_charmap {
	const char *ruc_seq;	/* escape sequence as found in the input */
	char ruc_ch;		/* character emitted in its place */
} rust_charmap[] = {
	{ "$SP$", '@' },
	{ "$BP$", '*' },
	{ "$RF$", '&' },
	{ "$LT$", '<' },
	{ "$GT$", '>' },
	{ "$LP$", '(' },
	{ "$RP$", ')' },
	{ "$C$", ',' },
	{ "$u7e$", '~' },
	{ "$u20$", ' ' },
	{ "$u27$", '\'' },
	{ "$u3d$", '=' },
	{ "$u5b$", '[' },
	{ "$u5d$", ']' },
	{ "$u7b$", '{' },
	{ "$u7d$", '}' },
	{ "$u3b$", ';' },
	{ "$u2b$", '+' },
	{ "$u22$", '"' }
};
static const size_t rust_charmap_sz =
    sizeof (rust_charmap) / sizeof (rust_charmap[0]);
109 109
110 110 static void *rustdem_alloc(custr_alloc_t *, size_t);
111 111 static void rustdem_free(custr_alloc_t *, void *, size_t);
112 112
113 113 static boolean_t rustdem_append_c(rustdem_state_t *, char);
114 114 static boolean_t rustdem_all_ascii(const strview_t *);
115 115
116 116 static boolean_t rustdem_parse_prefix(rustdem_state_t *, strview_t *);
117 117 static boolean_t rustdem_parse_name(rustdem_state_t *, strview_t *);
118 118 static boolean_t rustdem_parse_hash(rustdem_state_t *, strview_t *);
119 119 static boolean_t rustdem_parse_num(rustdem_state_t *, strview_t *, uint64_t *);
120 120 static boolean_t rustdem_parse_special(rustdem_state_t *, strview_t *);
121 121 static boolean_t rustdem_add_sep(rustdem_state_t *);
122 122
123 123 char *
124 124 rust_demangle(const char *s, size_t slen, sysdem_ops_t *ops)
125 125 {
126 126 rustdem_state_t st = {
127 127 .rds_str = s,
128 128 .rds_ops = ops,
129 129 };
130 130 custr_alloc_ops_t custr_ops = {
131 131 .custr_ao_alloc = rustdem_alloc,
132 132 .custr_ao_free = rustdem_free
133 133 };
134 134 custr_alloc_t custr_alloc = {
135 135 .cua_version = CUSTR_VERSION
136 136 };
137 137 strview_t sv;
138 138 int ret;
139 139
140 140 if (custr_alloc_init(&custr_alloc, &custr_ops) != 0)
141 141 return (NULL);
142 142 custr_alloc.cua_arg = &st;
143 143
144 144 sv_init_str(&sv, s, s + slen);
145 145
146 146 if (sv_remaining(&sv) < 1 || sv_peek(&sv, -1) != 'E') {
147 147 DEMDEBUG("ERROR: string is either too small or does not end "
148 148 "with 'E'");
149 149 errno = EINVAL;
150 150 return (NULL);
151 151 }
152 152
153 153 if (!rustdem_parse_prefix(&st, &sv)) {
154 154 DEMDEBUG("ERROR: could not parse prefix");
155 155 errno = EINVAL;
156 156 return (NULL);
157 157 }
158 158 DEMDEBUG("parsed prefix; remaining='%.*s'", SV_PRINT(&sv));
159 159
160 160 if (!rustdem_all_ascii(&sv)) {
161 161 /* rustdem_all_ascii() provides debug output */
162 162 errno = EINVAL;
163 163 return (NULL);
164 164 }
165 165
166 166 if ((ret = custr_xalloc(&st.rds_demangled, &custr_alloc)) != 0)
167 167 return (NULL);
168 168
169 169 while (sv_remaining(&sv) > 1) {
170 170 if (rustdem_parse_name(&st, &sv))
171 171 continue;
172 172 if (st.rds_error != 0)
173 173 goto fail;
174 174 }
175 175
176 176 if (st.rds_error != 0 || !sv_consume_if_c(&sv, 'E'))
177 177 goto fail;
178 178
179 179 char *res = xstrdup(ops, custr_cstr(st.rds_demangled));
180 180 if (res == NULL) {
181 181 st.rds_error = errno;
182 182 goto fail;
183 183 }
184 184
185 185 custr_free(st.rds_demangled);
186 186 DEMDEBUG("result = '%s'", res);
187 187 return (res);
188 188
189 189 fail:
190 190 custr_free(st.rds_demangled);
191 191 errno = st.rds_error;
192 192 return (NULL);
193 193 }
194 194
195 195 static boolean_t
196 196 rustdem_parse_prefix(rustdem_state_t *st, strview_t *svp)
197 197 {
198 198 strview_t pfx;
199 199
200 200 sv_init_sv(&pfx, svp);
201 201
202 202 DEMDEBUG("checking for '_ZN' or '__ZN' in '%.*s'", SV_PRINT(&pfx));
203 203
204 204 if (st->rds_error != 0)
205 205 return (B_FALSE);
206 206
207 207 if (!sv_consume_if_c(&pfx, '_'))
208 208 return (B_FALSE);
209 209
210 210 (void) sv_consume_if_c(&pfx, '_');
211 211
212 212 if (!sv_consume_if_c(&pfx, 'Z') || !sv_consume_if_c(&pfx, 'N'))
213 213 return (B_FALSE);
214 214
215 215 /* Update svp with new position */
216 216 sv_init_sv(svp, &pfx);
217 217 return (B_TRUE);
218 218 }
219 219
220 220 static boolean_t
221 221 rustdem_parse_name_segment(rustdem_state_t *st, strview_t *svp, boolean_t first)
222 222 {
223 223 strview_t sv;
224 224 strview_t name;
225 225 uint64_t len;
226 226 size_t rem;
227 227 boolean_t last = B_FALSE;
228 228
229 229 if (st->rds_error != 0 || sv_remaining(svp) == 0)
230 230 return (B_FALSE);
231 231
↓ open down ↓ |
231 lines elided |
↑ open up ↑ |
232 232 sv_init_sv(&sv, svp);
233 233
234 234 if (!rustdem_parse_num(st, &sv, &len)) {
235 235 DEMDEBUG("ERROR: no leading length");
236 236 st->rds_error = EINVAL;
237 237 return (B_FALSE);
238 238 }
239 239
240 240 rem = sv_remaining(&sv);
241 241
242 - if (rem < len || len == SIZE_MAX) {
242 + if (rem < len) {
243 243 st->rds_error = EINVAL;
244 244 return (B_FALSE);
245 245 }
246 246
247 247 /* Is this the last segment before the terminating E? */
248 248 if (rem == len + 1) {
249 249 VERIFY3U(sv_peek(&sv, -1), ==, 'E');
250 250 last = B_TRUE;
251 251 }
252 252
253 253 if (!first && !rustdem_add_sep(st))
254 254 return (B_FALSE);
255 255
256 256 /* Reduce length of seg to the length we parsed */
257 257 (void) sv_init_sv_range(&name, &sv, len);
258 258
259 259 DEMDEBUG("%s: segment='%.*s'", __func__, SV_PRINT(&name));
260 260
261 261 /*
262 262 * A rust hash starts with 'h', and is the last component of a name
263 263 * before the terminating 'E'
264 264 */
265 265 if (sv_peek(&name, 0) == 'h' && last) {
266 266 if (!rustdem_parse_hash(st, &name))
267 267 return (B_FALSE);
268 268 goto done;
269 269 }
270 270
271 271 while (sv_remaining(&name) > 0) {
272 272 switch (sv_peek(&name, 0)) {
273 273 case '$':
274 274 if (rustdem_parse_special(st, &name))
275 275 continue;
276 276 break;
277 277 case '_':
278 278 if (sv_peek(&name, 1) == '$') {
279 279 /*
280 280 * Only consume/ignore '_'. Leave
281 281 * $ for next round.
282 282 */
283 283 sv_consume_n(&name, 1);
284 284 continue;
285 285 }
286 286 break;
287 287 case '.':
288 288 /* Convert '..' to '::' */
289 289 if (sv_peek(&name, 1) != '.')
290 290 break;
291 291
292 292 if (!rustdem_add_sep(st))
293 293 return (B_FALSE);
294 294
295 295 sv_consume_n(&name, 2);
296 296 continue;
297 297 default:
298 298 break;
299 299 }
300 300
301 301 if (custr_appendc(st->rds_demangled,
302 302 sv_consume_c(&name)) != 0) {
303 303 st->rds_error = ENOMEM;
304 304 return (B_FALSE);
305 305 }
306 306 }
307 307
308 308 done:
309 309 DEMDEBUG("%s: consumed '%.*s'", __func__, (int)len, svp->sv_first);
310 310 sv_consume_n(&sv, len);
311 311 sv_init_sv(svp, &sv);
312 312 return (B_TRUE);
313 313 }
314 314
315 315 static boolean_t
316 316 rustdem_parse_name(rustdem_state_t *st, strview_t *svp)
317 317 {
318 318 strview_t name;
319 319 boolean_t first = B_TRUE;
320 320
321 321 if (st->rds_error != 0)
322 322 return (B_FALSE);
323 323
324 324 sv_init_sv(&name, svp);
325 325
326 326 if (sv_remaining(&name) == 0)
327 327 return (B_FALSE);
328 328
329 329 while (sv_remaining(&name) > 0 && sv_peek(&name, 0) != 'E') {
330 330 if (!rustdem_parse_name_segment(st, &name, first))
331 331 return (B_FALSE);
332 332 first = B_FALSE;
333 333 }
334 334
335 335 sv_init_sv(svp, &name);
336 336 return (B_TRUE);
337 337 }
338 338
339 339 static boolean_t
340 340 rustdem_parse_hash(rustdem_state_t *st, strview_t *svp)
341 341 {
342 342 strview_t sv;
343 343
344 344 sv_init_sv(&sv, svp);
345 345
346 346 VERIFY(sv_consume_if_c(&sv, 'h'));
347 347 if (!rustdem_append_c(st, 'h'))
348 348 return (B_FALSE);
349 349
350 350 while (sv_remaining(&sv) > 0) {
351 351 char c = sv_consume_c(&sv);
352 352
353 353 switch (c) {
354 354 /*
355 355 * The upper-case hex digits (A-F) are excluded as valid
356 356 * hash values for several reasons:
357 357 *
358 358 * 1. It would result in two different possible names for
359 359 * the same function, leading to ambiguity in linking (among
360 360 * other things).
361 361 *
362 362 * 2. It would cause potential ambiguity in parsing -- is a
363 363 * trailing 'E' part of the hash, or the terminating character
364 364 * in the mangled name?
365 365 *
366 366 * 3. No examples were able to be found in the wild where
367 367 * uppercase digits are used, and other rust demanglers all
368 368 * seem to assume the hash must contain lower-case hex digits.
369 369 */
370 370 case '0': case '1': case '2': case '3':
371 371 case '4': case '5': case '6': case '7':
372 372 case '8': case '9': case 'a': case 'b':
373 373 case 'c': case 'd': case 'e': case 'f':
374 374 if (!rustdem_append_c(st, c))
375 375 return (B_FALSE);
376 376 break;
↓ open down ↓ |
124 lines elided |
↑ open up ↑ |
377 377 default:
378 378 return (B_FALSE);
379 379 }
380 380 }
381 381
382 382 sv_init_sv(svp, &sv);
383 383 return (B_TRUE);
384 384 }
385 385
386 386 /*
387 - * A 10 digit value would imply a name 1Gb or larger in size. It seems
388 - * unlikely to the point of absurdity any such value could every possibly
389 - * be valid (or even have compiled properly). This also prevents the
390 - * uint64_t conversion from possibly overflowing since the value must always
391 - * be below 10 * UINT32_MAX.
387 + * We have to pick an arbitrary limit here; 999,999,999 fits comfortably
388 + * within an int32_t, so let's go with that, as it seems unlikely we'd
389 + * ever see a larger value in context.
392 390 */
393 -#define MAX_DIGITS 10
391 +#define MAX_DIGITS 9
394 392
395 393 static boolean_t
396 394 rustdem_parse_num(rustdem_state_t *restrict st, strview_t *restrict svp,
397 395 uint64_t *restrict valp)
398 396 {
399 397 strview_t snum;
400 398 uint64_t v = 0;
401 399 size_t ndigits = 0;
402 400 char c;
403 401
404 402 if (st->rds_error != 0)
405 403 return (B_FALSE);
406 404
407 405 sv_init_sv(&snum, svp);
408 406
409 407 DEMDEBUG("%s: str='%.*s'", __func__, SV_PRINT(&snum));
410 408
411 409 c = sv_peek(&snum, 0);
412 410 if (!ISDIGIT(c)) {
413 411 DEMDEBUG("%s: ERROR no digits in str\n", __func__);
414 412 st->rds_error = EINVAL;
415 413 return (B_FALSE);
416 414 }
417 415
418 416 /*
419 417 * Since there is currently no official specification on rust name
420 418 * mangling, only that it has been stated that rust follows what
421 419 * C++ mangling does. In the Itanium C++ ABI (what practically
422 420 * every non-Windows C++ implementation uses these days), it
423 421 * explicitly disallows leading 0s in numeric values (except for
424 422 * substition and template indexes, which aren't relevant here).
425 423 * We enforce the same restriction -- if a rust implementation allowed
426 424 * leading zeros in numbers (basically segment lengths) it'd
427 425 * cause all sorts of ambiguity problems with names that likely lead
428 426 * to much bigger problems with linking and such, so this seems
429 427 * reasonable.
430 428 */
431 429 if (c == '0') {
432 430 DEMDEBUG("%s: ERROR number starts with leading 0\n", __func__);
433 431 st->rds_error = EINVAL;
434 432 return (B_FALSE);
435 433 }
436 434
437 435 while (sv_remaining(&snum) > 0 && ndigits <= MAX_DIGITS) {
438 436 c = sv_consume_c(&snum);
439 437
440 438 if (!ISDIGIT(c))
441 439 break;
442 440
443 441 v *= 10;
444 442 v += c - '0';
445 443 ndigits++;
446 444 }
447 445
448 446 if (ndigits > MAX_DIGITS) {
449 447 DEMDEBUG("%s: value %llu is too large\n", __func__, v);
450 448 st->rds_error = ERANGE;
451 449 return (B_FALSE);
452 450 }
453 451
454 452 DEMDEBUG("%s: num=%llu", __func__, v);
455 453
456 454 *valp = v;
457 455 sv_consume_n(svp, ndigits);
458 456 return (B_TRUE);
459 457 }
460 458
461 459 static boolean_t
462 460 rustdem_parse_special(rustdem_state_t *restrict st, strview_t *restrict svp)
463 461 {
464 462 if (st->rds_error != 0)
465 463 return (B_FALSE);
466 464
467 465 if (sv_peek(svp, 0) != '$')
468 466 return (B_FALSE);
469 467
470 468 for (size_t i = 0; i < rust_charmap_sz; i++) {
471 469 if (sv_consume_if(svp, rust_charmap[i].ruc_seq)) {
472 470 if (!rustdem_append_c(st, rust_charmap[i].ruc_ch))
473 471 return (B_FALSE);
474 472 return (B_TRUE);
475 473 }
476 474 }
477 475 return (B_FALSE);
478 476 }
479 477
480 478 static boolean_t
481 479 rustdem_add_sep(rustdem_state_t *st)
482 480 {
483 481 if (st->rds_error != 0)
484 482 return (B_FALSE);
485 483
486 484 if (!rustdem_append_c(st, ':') ||
487 485 !rustdem_append_c(st, ':'))
488 486 return (B_FALSE);
489 487
490 488 return (B_TRUE);
491 489 }
492 490
493 491 static boolean_t
494 492 rustdem_append_c(rustdem_state_t *st, char c)
495 493 {
496 494 if (st->rds_error != 0)
497 495 return (B_FALSE);
498 496
499 497 if (custr_appendc(st->rds_demangled, c) == 0)
500 498 return (B_TRUE);
501 499
502 500 st->rds_error = errno;
503 501 return (B_FALSE);
504 502 }
505 503
506 504 static boolean_t
507 505 rustdem_all_ascii(const strview_t *svp)
508 506 {
509 507 strview_t p;
510 508
511 509 sv_init_sv(&p, svp);
512 510
513 511 while (sv_remaining(&p) > 0) {
514 512 char c = sv_consume_c(&p);
515 513
516 514 /*
517 515 * #including <sys/ctype.h> conflicts with <ctype.h>. Since
518 516 * we want the C locale macros (ISDIGIT, etc), it also means
519 517 * we can't use isascii(3C).
520 518 */
521 519 if ((c & 0x80) != 0) {
522 520 DEMDEBUG("%s: found non-ascii character 0x%02hhx at "
523 521 "offset %tu", __func__, c,
524 522 (ptrdiff_t)(p.sv_first - svp->sv_first));
525 523 return (B_FALSE);
526 524 }
527 525 }
528 526 return (B_TRUE);
529 527 }
530 528
531 529 static void *
532 530 rustdem_alloc(custr_alloc_t *cao, size_t len)
533 531 {
534 532 rustdem_state_t *st = cao->cua_arg;
535 533 return (zalloc(st->rds_ops, len));
536 534 }
537 535
538 536 static void
539 537 rustdem_free(custr_alloc_t *cao, void *p, size_t len)
540 538 {
541 539 rustdem_state_t *st = cao->cua_arg;
542 540 xfree(st->rds_ops, p, len);
543 541 }
↓ open down ↓ |
140 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX