11414 Fix smatch issue in libdemangle
1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2019, Joyent, Inc.
14 */
15
16 #include <errno.h>
17 #include <libcustr.h>
18 #include <limits.h>
19 #include <string.h>
20 #include <sys/ctype.h> /* We want the C locale ISXXX() versions */
21 #include <sys/debug.h>
22 #include <stdio.h>
23 #include <sys/sysmacros.h>
24
25 #include "strview.h"
26 #include "demangle_int.h"
27
28 /*
29 * Unfortunately, there is currently no official specification for the rust
30 * name mangling. This is an attempt to document the understanding of the
31 * mangling used here. It is based off examination of
32 * https://docs.rs/rustc-demangle/0.1.13/rustc_demangle/
33 *
34 * A mangled rust name is:
35 * <prefix> <name> <hash> E
36 *
37 * <prefix> ::= _ZN
38 * __ZN
39 *
40 * <name> ::= <name-segment>+
41 *
42 * <name-segment> ::= <len> <name-chars>{len}
43 *
44 * <len> ::= [1-9][0-9]*
45 *
46 * <name-chars> ::= <[A-Za-z]> <[A-Za-z0-9]>*
47 * <separator>
48 * <special>
49 *
50 * <separator> ::= '..' # '::'
51 *
52 * <special> ::= $SP$ # '@'
53 * $BP$ # '*'
54 * $RF$ # '&'
55 * $LT$ # '<'
56 * $GT$ # '>'
57 * $LP$ # '('
58 * $RP$ # ')'
59 * $C$ # ','
60 * $u7e$ # '~'
61 * $u20$ # ' '
62 * $u27$ # '\''
63 * $u3d$ # '='
64 * $u5b$ # '['
65 * $u5d$ # ']'
66 * $u7b$ # '{'
67 * $u7d$ # '}'
68 * $u3b$ # ';'
69 * $u2b$ # '+'
70 * $u22$ # '"'
71 *
72 * <hash> := <len> h <hex-digits>+
73 *
74 * <hex-digits> := <[0-9a-f]>
75 */
76
77 typedef struct rustdem_state {
78 const char *rds_str;
79 custr_t *rds_demangled;
80 sysdem_ops_t *rds_ops;
81 int rds_error;
82 } rustdem_state_t;
83
/*
 * The '$...$' escape sequences that can appear within a name segment, each
 * paired with the single character that replaces it in the demangled output.
 * rustdem_parse_special() walks this table in order and uses the first
 * sequence that matches.
 */
struct rust_charmap {
	const char	*ruc_seq;	/* escape sequence, including both '$' */
	char		ruc_ch;		/* character emitted in its place */
};

static const struct rust_charmap rust_charmap[] = {
	{ "$SP$", '@' },
	{ "$BP$", '*' },
	{ "$RF$", '&' },
	{ "$LT$", '<' },
	{ "$GT$", '>' },
	{ "$LP$", '(' },
	{ "$RP$", ')' },
	{ "$C$", ',' },
	{ "$u7e$", '~' },
	{ "$u20$", ' ' },
	{ "$u27$", '\'' },
	{ "$u3d$", '=' },
	{ "$u5b$", '[' },
	{ "$u5d$", ']' },
	{ "$u7b$", '{' },
	{ "$u7d$", '}' },
	{ "$u3b$", ';' },
	{ "$u2b$", '+' },
	{ "$u22$", '"' }
};

/* Number of entries in rust_charmap[] */
static const size_t rust_charmap_sz =
    sizeof (rust_charmap) / sizeof (rust_charmap[0]);
109
110 static void *rustdem_alloc(custr_alloc_t *, size_t);
111 static void rustdem_free(custr_alloc_t *, void *, size_t);
112
113 static boolean_t rustdem_append_c(rustdem_state_t *, char);
114 static boolean_t rustdem_all_ascii(const strview_t *);
115
116 static boolean_t rustdem_parse_prefix(rustdem_state_t *, strview_t *);
117 static boolean_t rustdem_parse_name(rustdem_state_t *, strview_t *);
118 static boolean_t rustdem_parse_hash(rustdem_state_t *, strview_t *);
119 static boolean_t rustdem_parse_num(rustdem_state_t *, strview_t *, uint64_t *);
120 static boolean_t rustdem_parse_special(rustdem_state_t *, strview_t *);
121 static boolean_t rustdem_add_sep(rustdem_state_t *);
122
123 char *
124 rust_demangle(const char *s, size_t slen, sysdem_ops_t *ops)
125 {
126 rustdem_state_t st = {
127 .rds_str = s,
128 .rds_ops = ops,
129 };
130 custr_alloc_ops_t custr_ops = {
131 .custr_ao_alloc = rustdem_alloc,
132 .custr_ao_free = rustdem_free
133 };
134 custr_alloc_t custr_alloc = {
135 .cua_version = CUSTR_VERSION
136 };
137 strview_t sv;
138 int ret;
139
140 if (custr_alloc_init(&custr_alloc, &custr_ops) != 0)
141 return (NULL);
142 custr_alloc.cua_arg = &st;
143
144 sv_init_str(&sv, s, s + slen);
145
146 if (sv_remaining(&sv) < 1 || sv_peek(&sv, -1) != 'E') {
147 DEMDEBUG("ERROR: string is either too small or does not end "
148 "with 'E'");
149 errno = EINVAL;
150 return (NULL);
151 }
152
153 if (!rustdem_parse_prefix(&st, &sv)) {
154 DEMDEBUG("ERROR: could not parse prefix");
155 errno = EINVAL;
156 return (NULL);
157 }
158 DEMDEBUG("parsed prefix; remaining='%.*s'", SV_PRINT(&sv));
159
160 if (!rustdem_all_ascii(&sv)) {
161 /* rustdem_all_ascii() provides debug output */
162 errno = EINVAL;
163 return (NULL);
164 }
165
166 if ((ret = custr_xalloc(&st.rds_demangled, &custr_alloc)) != 0)
167 return (NULL);
168
169 while (sv_remaining(&sv) > 1) {
170 if (rustdem_parse_name(&st, &sv))
171 continue;
172 if (st.rds_error != 0)
173 goto fail;
174 }
175
176 if (st.rds_error != 0 || !sv_consume_if_c(&sv, 'E'))
177 goto fail;
178
179 char *res = xstrdup(ops, custr_cstr(st.rds_demangled));
180 if (res == NULL) {
181 st.rds_error = errno;
182 goto fail;
183 }
184
185 custr_free(st.rds_demangled);
186 DEMDEBUG("result = '%s'", res);
187 return (res);
188
189 fail:
190 custr_free(st.rds_demangled);
191 errno = st.rds_error;
192 return (NULL);
193 }
194
195 static boolean_t
196 rustdem_parse_prefix(rustdem_state_t *st, strview_t *svp)
197 {
198 strview_t pfx;
199
200 sv_init_sv(&pfx, svp);
201
202 DEMDEBUG("checking for '_ZN' or '__ZN' in '%.*s'", SV_PRINT(&pfx));
203
204 if (st->rds_error != 0)
205 return (B_FALSE);
206
207 if (!sv_consume_if_c(&pfx, '_'))
208 return (B_FALSE);
209
210 (void) sv_consume_if_c(&pfx, '_');
211
212 if (!sv_consume_if_c(&pfx, 'Z') || !sv_consume_if_c(&pfx, 'N'))
213 return (B_FALSE);
214
215 /* Update svp with new position */
216 sv_init_sv(svp, &pfx);
217 return (B_TRUE);
218 }
219
220 static boolean_t
221 rustdem_parse_name_segment(rustdem_state_t *st, strview_t *svp, boolean_t first)
222 {
223 strview_t sv;
224 strview_t name;
225 uint64_t len;
226 size_t rem;
227 boolean_t last = B_FALSE;
228
229 if (st->rds_error != 0 || sv_remaining(svp) == 0)
230 return (B_FALSE);
231
232 sv_init_sv(&sv, svp);
233
234 if (!rustdem_parse_num(st, &sv, &len)) {
235 DEMDEBUG("ERROR: no leading length");
236 st->rds_error = EINVAL;
237 return (B_FALSE);
238 }
239
240 rem = sv_remaining(&sv);
241
242 if (rem < len || len == SIZE_MAX) {
243 st->rds_error = EINVAL;
244 return (B_FALSE);
245 }
246
247 /* Is this the last segment before the terminating E? */
248 if (rem == len + 1) {
249 VERIFY3U(sv_peek(&sv, -1), ==, 'E');
250 last = B_TRUE;
251 }
252
253 if (!first && !rustdem_add_sep(st))
254 return (B_FALSE);
255
256 /* Reduce length of seg to the length we parsed */
257 (void) sv_init_sv_range(&name, &sv, len);
258
259 DEMDEBUG("%s: segment='%.*s'", __func__, SV_PRINT(&name));
260
261 /*
262 * A rust hash starts with 'h', and is the last component of a name
263 * before the terminating 'E'
264 */
265 if (sv_peek(&name, 0) == 'h' && last) {
266 if (!rustdem_parse_hash(st, &name))
267 return (B_FALSE);
268 goto done;
269 }
270
271 while (sv_remaining(&name) > 0) {
272 switch (sv_peek(&name, 0)) {
273 case '$':
274 if (rustdem_parse_special(st, &name))
275 continue;
276 break;
277 case '_':
278 if (sv_peek(&name, 1) == '$') {
279 /*
280 * Only consume/ignore '_'. Leave
281 * $ for next round.
282 */
283 sv_consume_n(&name, 1);
284 continue;
285 }
286 break;
287 case '.':
288 /* Convert '..' to '::' */
289 if (sv_peek(&name, 1) != '.')
290 break;
291
292 if (!rustdem_add_sep(st))
293 return (B_FALSE);
294
295 sv_consume_n(&name, 2);
296 continue;
297 default:
298 break;
299 }
300
301 if (custr_appendc(st->rds_demangled,
302 sv_consume_c(&name)) != 0) {
303 st->rds_error = ENOMEM;
304 return (B_FALSE);
305 }
306 }
307
308 done:
309 DEMDEBUG("%s: consumed '%.*s'", __func__, (int)len, svp->sv_first);
310 sv_consume_n(&sv, len);
311 sv_init_sv(svp, &sv);
312 return (B_TRUE);
313 }
314
315 static boolean_t
316 rustdem_parse_name(rustdem_state_t *st, strview_t *svp)
317 {
318 strview_t name;
319 boolean_t first = B_TRUE;
320
321 if (st->rds_error != 0)
322 return (B_FALSE);
323
324 sv_init_sv(&name, svp);
325
326 if (sv_remaining(&name) == 0)
327 return (B_FALSE);
328
329 while (sv_remaining(&name) > 0 && sv_peek(&name, 0) != 'E') {
330 if (!rustdem_parse_name_segment(st, &name, first))
331 return (B_FALSE);
332 first = B_FALSE;
333 }
334
335 sv_init_sv(svp, &name);
336 return (B_TRUE);
337 }
338
339 static boolean_t
340 rustdem_parse_hash(rustdem_state_t *st, strview_t *svp)
341 {
342 strview_t sv;
343
344 sv_init_sv(&sv, svp);
345
346 VERIFY(sv_consume_if_c(&sv, 'h'));
347 if (!rustdem_append_c(st, 'h'))
348 return (B_FALSE);
349
350 while (sv_remaining(&sv) > 0) {
351 char c = sv_consume_c(&sv);
352
353 switch (c) {
354 /*
355 * The upper-case hex digits (A-F) are excluded as valid
356 * hash values for several reasons:
357 *
358 * 1. It would result in two different possible names for
359 * the same function, leading to ambiguity in linking (among
360 * other things).
361 *
362 * 2. It would cause potential ambiguity in parsing -- is a
363 * trailing 'E' part of the hash, or the terminating character
364 * in the mangled name?
365 *
366 * 3. No examples were able to be found in the wild where
367 * uppercase digits are used, and other rust demanglers all
368 * seem to assume the hash must contain lower-case hex digits.
369 */
370 case '0': case '1': case '2': case '3':
371 case '4': case '5': case '6': case '7':
372 case '8': case '9': case 'a': case 'b':
373 case 'c': case 'd': case 'e': case 'f':
374 if (!rustdem_append_c(st, c))
375 return (B_FALSE);
376 break;
377 default:
378 return (B_FALSE);
379 }
380 }
381
382 sv_init_sv(svp, &sv);
383 return (B_TRUE);
384 }
385
386 /*
387 * A 10 digit value would imply a name 1Gb or larger in size. It seems
388 * unlikely to the point of absurdity any such value could every possibly
389 * be valid (or even have compiled properly). This also prevents the
390 * uint64_t conversion from possibly overflowing since the value must always
391 * be below 10 * UINT32_MAX.
392 */
393 #define MAX_DIGITS 10
394
395 static boolean_t
396 rustdem_parse_num(rustdem_state_t *restrict st, strview_t *restrict svp,
397 uint64_t *restrict valp)
398 {
399 strview_t snum;
400 uint64_t v = 0;
401 size_t ndigits = 0;
402 char c;
403
404 if (st->rds_error != 0)
405 return (B_FALSE);
406
407 sv_init_sv(&snum, svp);
408
409 DEMDEBUG("%s: str='%.*s'", __func__, SV_PRINT(&snum));
410
411 c = sv_peek(&snum, 0);
412 if (!ISDIGIT(c)) {
413 DEMDEBUG("%s: ERROR no digits in str\n", __func__);
414 st->rds_error = EINVAL;
415 return (B_FALSE);
416 }
417
418 /*
419 * Since there is currently no official specification on rust name
420 * mangling, only that it has been stated that rust follows what
421 * C++ mangling does. In the Itanium C++ ABI (what practically
422 * every non-Windows C++ implementation uses these days), it
423 * explicitly disallows leading 0s in numeric values (except for
424 * substition and template indexes, which aren't relevant here).
425 * We enforce the same restriction -- if a rust implementation allowed
426 * leading zeros in numbers (basically segment lengths) it'd
427 * cause all sorts of ambiguity problems with names that likely lead
428 * to much bigger problems with linking and such, so this seems
429 * reasonable.
430 */
431 if (c == '0') {
432 DEMDEBUG("%s: ERROR number starts with leading 0\n", __func__);
433 st->rds_error = EINVAL;
434 return (B_FALSE);
435 }
436
437 while (sv_remaining(&snum) > 0 && ndigits <= MAX_DIGITS) {
438 c = sv_consume_c(&snum);
439
440 if (!ISDIGIT(c))
441 break;
442
443 v *= 10;
444 v += c - '0';
445 ndigits++;
446 }
447
448 if (ndigits > MAX_DIGITS) {
449 DEMDEBUG("%s: value %llu is too large\n", __func__, v);
450 st->rds_error = ERANGE;
451 return (B_FALSE);
452 }
453
454 DEMDEBUG("%s: num=%llu", __func__, v);
455
456 *valp = v;
457 sv_consume_n(svp, ndigits);
458 return (B_TRUE);
459 }
460
461 static boolean_t
462 rustdem_parse_special(rustdem_state_t *restrict st, strview_t *restrict svp)
463 {
464 if (st->rds_error != 0)
465 return (B_FALSE);
466
467 if (sv_peek(svp, 0) != '$')
468 return (B_FALSE);
469
470 for (size_t i = 0; i < rust_charmap_sz; i++) {
471 if (sv_consume_if(svp, rust_charmap[i].ruc_seq)) {
472 if (!rustdem_append_c(st, rust_charmap[i].ruc_ch))
473 return (B_FALSE);
474 return (B_TRUE);
475 }
476 }
477 return (B_FALSE);
478 }
479
480 static boolean_t
481 rustdem_add_sep(rustdem_state_t *st)
482 {
483 if (st->rds_error != 0)
484 return (B_FALSE);
485
486 if (!rustdem_append_c(st, ':') ||
487 !rustdem_append_c(st, ':'))
488 return (B_FALSE);
489
490 return (B_TRUE);
491 }
492
493 static boolean_t
494 rustdem_append_c(rustdem_state_t *st, char c)
495 {
496 if (st->rds_error != 0)
497 return (B_FALSE);
498
499 if (custr_appendc(st->rds_demangled, c) == 0)
500 return (B_TRUE);
501
502 st->rds_error = errno;
503 return (B_FALSE);
504 }
505
506 static boolean_t
507 rustdem_all_ascii(const strview_t *svp)
508 {
509 strview_t p;
510
511 sv_init_sv(&p, svp);
512
513 while (sv_remaining(&p) > 0) {
514 char c = sv_consume_c(&p);
515
516 /*
517 * #including <sys/ctype.h> conflicts with <ctype.h>. Since
518 * we want the C locale macros (ISDIGIT, etc), it also means
519 * we can't use isascii(3C).
520 */
521 if ((c & 0x80) != 0) {
522 DEMDEBUG("%s: found non-ascii character 0x%02hhx at "
523 "offset %tu", __func__, c,
524 (ptrdiff_t)(p.sv_first - svp->sv_first));
525 return (B_FALSE);
526 }
527 }
528 return (B_TRUE);
529 }
530
531 static void *
532 rustdem_alloc(custr_alloc_t *cao, size_t len)
533 {
534 rustdem_state_t *st = cao->cua_arg;
535 return (zalloc(st->rds_ops, len));
536 }
537
538 static void
539 rustdem_free(custr_alloc_t *cao, void *p, size_t len)
540 {
541 rustdem_state_t *st = cao->cua_arg;
542 xfree(st->rds_ops, p, len);
543 }
--- EOF ---