5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2018 Joyent, Inc.
25 */
26
27 #include <sys/types.h>
28 #include <sys/param.h>
29 #include <sys/systm.h>
30 #include <sys/vm.h>
31 #include <sys/proc.h>
32 #include <sys/file.h>
33 #include <sys/conf.h>
34 #include <sys/kmem.h>
35 #include <sys/mem.h>
36 #include <sys/mman.h>
37 #include <sys/vnode.h>
38 #include <sys/errno.h>
39 #include <sys/memlist.h>
40 #include <sys/dumphdr.h>
41 #include <sys/dumpadm.h>
42 #include <sys/ksyms.h>
43 #include <sys/compress.h>
44 #include <sys/stream.h>
54 #include <sys/debug.h>
55 #include <sys/sunddi.h>
56 #include <fs/fs_subr.h>
57 #include <sys/fs/snode.h>
58 #include <sys/ontrap.h>
59 #include <sys/panic.h>
60 #include <sys/dkio.h>
61 #include <sys/vtoc.h>
62 #include <sys/errorq.h>
63 #include <sys/fm/util.h>
64 #include <sys/fs/zfs.h>
65
66 #include <vm/hat.h>
67 #include <vm/as.h>
68 #include <vm/page.h>
69 #include <vm/pvn.h>
70 #include <vm/seg.h>
71 #include <vm/seg_kmem.h>
72 #include <sys/clock_impl.h>
73 #include <sys/hold_page.h>
74
75 #include <bzip2/bzlib.h>
76
77 #define ONE_GIG (1024 * 1024 * 1024UL)
78
79 /*
80 * Crash dump time is dominated by disk write time. To reduce this,
81 * the stronger compression method bzip2 is applied to reduce the dump
82 * size and hence reduce I/O time. However, bzip2 is much more
83 * computationally expensive than the existing lzjb algorithm, so to
84 * avoid increasing compression time, CPUs that are otherwise idle
85 * during panic are employed to parallelize the compression task.
86 * Many helper CPUs are needed to prevent bzip2 from being a
87 * bottleneck, and on systems with too few CPUs, the lzjb algorithm is
88 * parallelized instead. Lastly, I/O and compression are performed by
89 * different CPUs, and are hence overlapped in time, unlike the older
90 * serial code.
91 *
92 * Another important consideration is the speed of the dump
93 * device. Faster disks need fewer CPUs in order to benefit from
423 /*
424 * The dump I/O buffer.
425 *
426 * There is one I/O buffer used by dumpvp_write and dumpvp_flush. It is
427 * sized according to the optimum device transfer speed.
428 */
429 typedef struct dumpbuf {
430 vnode_t *cdev_vp; /* VCHR open of the dump device */
431 len_t vp_limit; /* maximum write offset */
432 offset_t vp_off; /* current dump device offset */
433 char *cur; /* dump write pointer */
434 char *start; /* dump buffer address */
435 char *end; /* dump buffer end */
436 size_t size; /* size of dumpbuf in bytes */
437 size_t iosize; /* best transfer size for device */
438 } dumpbuf_t;
439
440 dumpbuf_t dumpbuf; /* I/O buffer */
441
442 /*
443 * The dump I/O buffer must be at least one page, at most xfer_size
444 * bytes, and should scale with physmem in between. The transfer size
445 * passed in will either represent a global default (maxphys) or the
446 * best size for the device. The size of the dumpbuf I/O buffer is
447 * limited by dumpbuf_limit (8MB by default) because the dump
448 * performance saturates beyond a certain size. The default is to
449 * select 1/4096 of the memory.
450 */
451 static int dumpbuf_fraction = 12; /* memory size scale factor */
452 static size_t dumpbuf_limit = 8 * DUMP_1MB; /* max I/O buf size */
453
454 static size_t
455 dumpbuf_iosize(size_t xfer_size)
456 {
457 size_t iosize = ptob(physmem >> dumpbuf_fraction);
458
459 if (iosize < PAGESIZE)
460 iosize = PAGESIZE;
461 else if (iosize > xfer_size)
462 iosize = xfer_size;
2284 dumpvp_write(&csize, sizeof (csize));
2285 dumpvp_write(hp->lzbuf, csize);
2286 HRSTOP(hp->perpage, write);
2287 }
2288 CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP);
2289 hp->cpin = NULL;
2290 }
2291
2292 /*
2293 * Main task to dump pages. This is called on the dump CPU.
2294 */
2295 static void
2296 dumpsys_main_task(void *arg)
2297 {
2298 dumpsync_t *ds = arg;
2299 pgcnt_t pagenum = 0, bitnum = 0, hibitnum;
2300 dumpmlw_t mlw;
2301 cbuf_t *cp;
2302 pgcnt_t baseoff, pfnoff;
2303 pfn_t base, pfn;
2304 int i, dumpserial;
2305
2306 /*
2307 * Fall back to serial mode if there are no helpers.
2308 * dump_plat_mincpu can be set to 0 at any time.
2309 * dumpcfg.helpermap must contain at least one member.
2310 */
2311 dumpserial = 1;
2312
2313 if (dump_plat_mincpu != 0 && dumpcfg.clevel != 0) {
2314 for (i = 0; i < BT_BITOUL(NCPU); ++i) {
2315 if (dumpcfg.helpermap[i] != 0) {
2316 dumpserial = 0;
2317 break;
2318 }
2319 }
2320 }
2321
2322 if (dumpserial) {
2323 dumpcfg.clevel = 0;
2324 if (dumpcfg.helper[0].lzbuf == NULL)
2325 dumpcfg.helper[0].lzbuf = dumpcfg.helper[1].page;
2326 }
2327
2328 dump_init_memlist_walker(&mlw);
2329
2330 for (;;) {
2331 int sec = (gethrtime() - ds->start) / NANOSEC;
2332
2333 /*
2334 * Render a simple progress display on the system console to
2335 * make clear to the operator that the system has not hung.
2336 * Emit an update when dump progress has advanced by one
2337 * percent, or when no update has been drawn in the last
2338 * second.
2339 */
2340 if (ds->percent > ds->percent_done || sec > ds->sec_done) {
2341 ds->sec_done = sec;
2342 ds->percent_done = ds->percent;
2343 uprintf("^\rdumping: %2d:%02d %3d%% done",
2344 sec / 60, sec % 60, ds->percent);
2345 ds->neednl = 1;
2346 }
2450
2451 dump_timeleft = dump_timeout;
2452 cp->used = ptob(pagenum - cp->pagenum);
2453
2454 HRSTART(ds->perpage, map);
2455 hat_devload(kas.a_hat, cp->buf, cp->size, base,
2456 PROT_READ, HAT_LOAD_NOCONSIST);
2457 HRSTOP(ds->perpage, map);
2458
2459 ds->pages_mapped += btop(cp->size);
2460 ds->pages_used += pagenum - cp->pagenum;
2461
2462 CQ_OPEN(mainq);
2463
2464 /*
2465 * If there are no helpers the main task does
2466 * non-streams lzjb compress.
2467 */
2468 if (dumpserial) {
2469 dumpsys_lzjb_page(dumpcfg.helper, cp);
2470 break;
2471 }
2472
2473 /* pass mapped pages to a helper */
2474 CQ_PUT(helperq, cp, CBUF_INREADY);
2475
2476 /* the last page was done */
2477 if (bitnum >= dumpcfg.bitmapsize)
2478 CQ_CLOSE(helperq);
2479
2480 break;
2481
2482 case CBUF_USEDMAP:
2483
2484 ds->npages += btop(cp->used);
2485
2486 HRSTART(ds->perpage, unmap);
2487 hat_unload(kas.a_hat, cp->buf, cp->size, HAT_UNLOAD);
2488 HRSTOP(ds->perpage, unmap);
2489
2490 if (bitnum < dumpcfg.bitmapsize)
2491 CQ_PUT(mainq, cp, CBUF_FREEMAP);
2492 CQ_CLOSE(mainq);
2493
2494 ASSERT(ds->npages <= dumphdr->dump_npages);
2560 for (i = 0; i < ncpus; i++) {
2561 if ((i & 15) == 0)
2562 P(",,%03d,", i);
2563 if (i == myid)
2564 P(" M");
2565 else if (BT_TEST(cfg->helpermap, i))
2566 P("%4d", cpu_seq[i]->cpu_id);
2567 else
2568 P(" *");
2569 if ((i & 15) == 15)
2570 P("\n");
2571 }
2572
2573 P("ncbuf_used,%d\n", cfg->ncbuf_used);
2574 P("ncmap,%d\n", cfg->ncmap);
2575
2576 P("Found %ldM ranges,%ld\n", (CBUF_MAPSIZE / DUMP_1MB), cfg->found4m);
2577 P("Found small pages,%ld\n", cfg->foundsm);
2578
2579 P("Compression level,%d\n", cfg->clevel);
2580 P("Compression type,%s %s\n", cfg->clevel == 0 ? "serial" : "parallel",
2581 cfg->clevel >= DUMP_CLEVEL_BZIP2 ? "bzip2" : "lzjb");
2582 P("Compression ratio,%d.%02d\n", compress_ratio / 100, compress_ratio %
2583 100);
2584 P("nhelper_used,%d\n", cfg->nhelper_used);
2585
2586 P("Dump I/O rate MBS,%d.%02d\n", iorate / 100, iorate % 100);
2587 P("..total bytes,%lld\n", (u_longlong_t)ds->nwrite);
2588 P("..total nsec,%lld\n", (u_longlong_t)ds->iotime);
2589 P("dumpbuf.iosize,%ld\n", dumpbuf.iosize);
2590 P("dumpbuf.size,%ld\n", dumpbuf.size);
2591
2592 P("Dump pages/sec,%llu\n", (u_longlong_t)ds->npages / sec);
2593 P("Dump pages,%llu\n", (u_longlong_t)ds->npages);
2594 P("Dump time,%d\n", sec);
2595
2596 if (ds->pages_mapped > 0)
2597 P("per-cent map utilization,%d\n", (int)((100 * ds->pages_used)
2598 / ds->pages_mapped));
2599
2600 P("\nPer-page metrics:\n");
2601 if (ds->npages > 0) {
|
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2018 Joyent, Inc.
25 * Copyright 2018 Nexenta Systems, Inc. All rights reserved.
26 */
27
28 #include <sys/types.h>
29 #include <sys/param.h>
30 #include <sys/systm.h>
31 #include <sys/vm.h>
32 #include <sys/proc.h>
33 #include <sys/file.h>
34 #include <sys/conf.h>
35 #include <sys/kmem.h>
36 #include <sys/mem.h>
37 #include <sys/mman.h>
38 #include <sys/vnode.h>
39 #include <sys/errno.h>
40 #include <sys/memlist.h>
41 #include <sys/dumphdr.h>
42 #include <sys/dumpadm.h>
43 #include <sys/ksyms.h>
44 #include <sys/compress.h>
45 #include <sys/stream.h>
55 #include <sys/debug.h>
56 #include <sys/sunddi.h>
57 #include <fs/fs_subr.h>
58 #include <sys/fs/snode.h>
59 #include <sys/ontrap.h>
60 #include <sys/panic.h>
61 #include <sys/dkio.h>
62 #include <sys/vtoc.h>
63 #include <sys/errorq.h>
64 #include <sys/fm/util.h>
65 #include <sys/fs/zfs.h>
66
67 #include <vm/hat.h>
68 #include <vm/as.h>
69 #include <vm/page.h>
70 #include <vm/pvn.h>
71 #include <vm/seg.h>
72 #include <vm/seg_kmem.h>
73 #include <sys/clock_impl.h>
74 #include <sys/hold_page.h>
75 #include <sys/cpu.h>
76
77 #include <bzip2/bzlib.h>
78
79 #define ONE_GIG (1024 * 1024 * 1024UL)
80
81 /*
82 * Crash dump time is dominated by disk write time. To reduce this,
83 * the stronger compression method bzip2 is applied to reduce the dump
84 * size and hence reduce I/O time. However, bzip2 is much more
85 * computationally expensive than the existing lzjb algorithm, so to
86 * avoid increasing compression time, CPUs that are otherwise idle
87 * during panic are employed to parallelize the compression task.
88 * Many helper CPUs are needed to prevent bzip2 from being a
89 * bottleneck, and on systems with too few CPUs, the lzjb algorithm is
90 * parallelized instead. Lastly, I/O and compression are performed by
91 * different CPUs, and are hence overlapped in time, unlike the older
92 * serial code.
93 *
94 * Another important consideration is the speed of the dump
95 * device. Faster disks need fewer CPUs in order to benefit from
425 /*
426 * The dump I/O buffer.
427 *
428 * There is one I/O buffer used by dumpvp_write and dumpvp_flush. It is
429 * sized according to the optimum device transfer speed.
430 */
431 typedef struct dumpbuf {
432 vnode_t *cdev_vp; /* VCHR open of the dump device */
433 len_t vp_limit; /* maximum write offset */
434 offset_t vp_off; /* current dump device offset */
435 char *cur; /* dump write pointer */
436 char *start; /* dump buffer address */
437 char *end; /* dump buffer end */
438 size_t size; /* size of dumpbuf in bytes */
439 size_t iosize; /* best transfer size for device */
440 } dumpbuf_t;
441
442 dumpbuf_t dumpbuf; /* I/O buffer */
443
444 /*
445 * For parallel dump, defines maximum time main task thread will wait
446 * for at least one helper to register in dumpcfg.helpermap, before
447 * assuming there are no helpers and falling back to serial mode.
448 * Value is chosen arbitrarily and provides a *really* long wait for any
449 * available helper to register.
450 */
451 #define DUMP_HELPER_MAX_WAIT 1000 /* millisec */
452
453 /*
454 * The dump I/O buffer must be at least one page, at most xfer_size
455 * bytes, and should scale with physmem in between. The transfer size
456 * passed in will either represent a global default (maxphys) or the
457 * best size for the device. The size of the dumpbuf I/O buffer is
458 * limited by dumpbuf_limit (8MB by default) because the dump
459 * performance saturates beyond a certain size. The default is to
460 * select 1/4096 of the memory.
461 */
462 static int dumpbuf_fraction = 12; /* memory size scale factor */
463 static size_t dumpbuf_limit = 8 * DUMP_1MB; /* max I/O buf size */
464
465 static size_t
466 dumpbuf_iosize(size_t xfer_size)
467 {
468 size_t iosize = ptob(physmem >> dumpbuf_fraction);
469
470 if (iosize < PAGESIZE)
471 iosize = PAGESIZE;
472 else if (iosize > xfer_size)
473 iosize = xfer_size;
2295 dumpvp_write(&csize, sizeof (csize));
2296 dumpvp_write(hp->lzbuf, csize);
2297 HRSTOP(hp->perpage, write);
2298 }
2299 CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP);
2300 hp->cpin = NULL;
2301 }
2302
2303 /*
2304 * Main task to dump pages. This is called on the dump CPU.
2305 */
2306 static void
2307 dumpsys_main_task(void *arg)
2308 {
2309 dumpsync_t *ds = arg;
2310 pgcnt_t pagenum = 0, bitnum = 0, hibitnum;
2311 dumpmlw_t mlw;
2312 cbuf_t *cp;
2313 pgcnt_t baseoff, pfnoff;
2314 pfn_t base, pfn;
2315 boolean_t dumpserial;
2316 int i;
2317
2318 /*
2319 * Fall back to serial mode if there are no helpers.
2320 * dump_plat_mincpu can be set to 0 at any time.
2321 * dumpcfg.helpermap must contain at least one member.
2322 *
2323 * It is possible that the helpers haven't registered
2324 * in helpermap yet; wait up to DUMP_HELPER_MAX_WAIT for
2325 * at least one helper to register.
2326 */
2327 dumpserial = B_TRUE;
2328 if (dump_plat_mincpu != 0 && dumpcfg.clevel != 0) {
2329 hrtime_t hrtmax = MSEC2NSEC(DUMP_HELPER_MAX_WAIT);
2330 hrtime_t hrtstart = gethrtime();
2331
2332 for (;;) {
2333 for (i = 0; i < BT_BITOUL(NCPU); ++i) {
2334 if (dumpcfg.helpermap[i] != 0) {
2335 dumpserial = B_FALSE;
2336 break;
2337 }
2338 }
2339
2340 if ((!dumpserial) ||
2341 ((gethrtime() - hrtstart) >= hrtmax)) {
2342 break;
2343 }
2344
2345 SMT_PAUSE();
2346 }
2347
2348 if (dumpserial) {
2349 dumpcfg.clevel = 0;
2350 if (dumpcfg.helper[0].lzbuf == NULL) {
2351 dumpcfg.helper[0].lzbuf =
2352 dumpcfg.helper[1].page;
2353 }
2354 }
2355 }
2356
2357 dump_init_memlist_walker(&mlw);
2358
2359 for (;;) {
2360 int sec = (gethrtime() - ds->start) / NANOSEC;
2361
2362 /*
2363 * Render a simple progress display on the system console to
2364 * make clear to the operator that the system has not hung.
2365 * Emit an update when dump progress has advanced by one
2366 * percent, or when no update has been drawn in the last
2367 * second.
2368 */
2369 if (ds->percent > ds->percent_done || sec > ds->sec_done) {
2370 ds->sec_done = sec;
2371 ds->percent_done = ds->percent;
2372 uprintf("^\rdumping: %2d:%02d %3d%% done",
2373 sec / 60, sec % 60, ds->percent);
2374 ds->neednl = 1;
2375 }
2479
2480 dump_timeleft = dump_timeout;
2481 cp->used = ptob(pagenum - cp->pagenum);
2482
2483 HRSTART(ds->perpage, map);
2484 hat_devload(kas.a_hat, cp->buf, cp->size, base,
2485 PROT_READ, HAT_LOAD_NOCONSIST);
2486 HRSTOP(ds->perpage, map);
2487
2488 ds->pages_mapped += btop(cp->size);
2489 ds->pages_used += pagenum - cp->pagenum;
2490
2491 CQ_OPEN(mainq);
2492
2493 /*
2494 * If there are no helpers the main task does
2495 * non-streams lzjb compress.
2496 */
2497 if (dumpserial) {
2498 dumpsys_lzjb_page(dumpcfg.helper, cp);
2499 } else {
2500 /* pass mapped pages to a helper */
2501 CQ_PUT(helperq, cp, CBUF_INREADY);
2502 }
2503
2504 /* the last page was done */
2505 if (bitnum >= dumpcfg.bitmapsize)
2506 CQ_CLOSE(helperq);
2507
2508 break;
2509
2510 case CBUF_USEDMAP:
2511
2512 ds->npages += btop(cp->used);
2513
2514 HRSTART(ds->perpage, unmap);
2515 hat_unload(kas.a_hat, cp->buf, cp->size, HAT_UNLOAD);
2516 HRSTOP(ds->perpage, unmap);
2517
2518 if (bitnum < dumpcfg.bitmapsize)
2519 CQ_PUT(mainq, cp, CBUF_FREEMAP);
2520 CQ_CLOSE(mainq);
2521
2522 ASSERT(ds->npages <= dumphdr->dump_npages);
2588 for (i = 0; i < ncpus; i++) {
2589 if ((i & 15) == 0)
2590 P(",,%03d,", i);
2591 if (i == myid)
2592 P(" M");
2593 else if (BT_TEST(cfg->helpermap, i))
2594 P("%4d", cpu_seq[i]->cpu_id);
2595 else
2596 P(" *");
2597 if ((i & 15) == 15)
2598 P("\n");
2599 }
2600
2601 P("ncbuf_used,%d\n", cfg->ncbuf_used);
2602 P("ncmap,%d\n", cfg->ncmap);
2603
2604 P("Found %ldM ranges,%ld\n", (CBUF_MAPSIZE / DUMP_1MB), cfg->found4m);
2605 P("Found small pages,%ld\n", cfg->foundsm);
2606
2607 P("Compression level,%d\n", cfg->clevel);
2608 P("Compression type,%s %s", cfg->clevel == 0 ? "serial" : "parallel",
2609 cfg->clevel >= DUMP_CLEVEL_BZIP2 ? "bzip2" : "lzjb");
2610 if (cfg->clevel >= DUMP_CLEVEL_BZIP2)
2611 P(" (level %d)\n", dump_bzip2_level);
2612 else
2613 P("\n");
2614 P("Compression ratio,%d.%02d\n", compress_ratio / 100, compress_ratio %
2615 100);
2616 P("nhelper_used,%d\n", cfg->nhelper_used);
2617
2618 P("Dump I/O rate MBS,%d.%02d\n", iorate / 100, iorate % 100);
2619 P("..total bytes,%lld\n", (u_longlong_t)ds->nwrite);
2620 P("..total nsec,%lld\n", (u_longlong_t)ds->iotime);
2621 P("dumpbuf.iosize,%ld\n", dumpbuf.iosize);
2622 P("dumpbuf.size,%ld\n", dumpbuf.size);
2623
2624 P("Dump pages/sec,%llu\n", (u_longlong_t)ds->npages / sec);
2625 P("Dump pages,%llu\n", (u_longlong_t)ds->npages);
2626 P("Dump time,%d\n", sec);
2627
2628 if (ds->pages_mapped > 0)
2629 P("per-cent map utilization,%d\n", (int)((100 * ds->pages_used)
2630 / ds->pages_mapped));
2631
2632 P("\nPer-page metrics:\n");
2633 if (ds->npages > 0) {
|