4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2016 Joyent, Inc.
25 */
26
27 #include <sys/types.h>
28 #include <sys/param.h>
29 #include <sys/systm.h>
30 #include <sys/vm.h>
31 #include <sys/proc.h>
32 #include <sys/file.h>
33 #include <sys/conf.h>
34 #include <sys/kmem.h>
35 #include <sys/mem.h>
36 #include <sys/mman.h>
37 #include <sys/vnode.h>
38 #include <sys/errno.h>
39 #include <sys/memlist.h>
40 #include <sys/dumphdr.h>
41 #include <sys/dumpadm.h>
42 #include <sys/ksyms.h>
43 #include <sys/compress.h>
44 #include <sys/stream.h>
57 #include <sys/fs/snode.h>
58 #include <sys/ontrap.h>
59 #include <sys/panic.h>
60 #include <sys/dkio.h>
61 #include <sys/vtoc.h>
62 #include <sys/errorq.h>
63 #include <sys/fm/util.h>
64 #include <sys/fs/zfs.h>
65
66 #include <vm/hat.h>
67 #include <vm/as.h>
68 #include <vm/page.h>
69 #include <vm/pvn.h>
70 #include <vm/seg.h>
71 #include <vm/seg_kmem.h>
72 #include <sys/clock_impl.h>
73 #include <sys/hold_page.h>
74
75 #include <bzip2/bzlib.h>
76
77 /*
78 * Crash dump time is dominated by disk write time. To reduce this,
79 * the stronger compression method bzip2 is applied to reduce the dump
80 * size and hence reduce I/O time. However, bzip2 is much more
81 * computationally expensive than the existing lzjb algorithm, so to
82 * avoid increasing compression time, CPUs that are otherwise idle
83 * during panic are employed to parallelize the compression task.
84 * Many helper CPUs are needed to prevent bzip2 from being a
85 * bottleneck, and on systems with too few CPUs, the lzjb algorithm is
86 * parallelized instead. Lastly, I/O and compression are performed by
87 * different CPUs, and are hence overlapped in time, unlike the older
88 * serial code.
89 *
90 * Another important consideration is the speed of the dump
91 * device. Faster disks need fewer CPUs in order to benefit from
92 * parallel lzjb versus parallel bzip2. Therefore, the CPU count
93 * threshold for switching from parallel lzjb to parallel bzip2 is
94 * elevated for faster disks. The dump device speed is inferred from
95 * the setting for dumpbuf.iosize, see dump_update_clevel.
96 */
121 * Higher numbers give greater compression, but take more memory
122 * and time. Memory used per helper is ~(dump_bzip2_level * 1MB).
123 *
124 * dump_plat_mincpu the cross-over limit for using bzip2 (per platform):
125 * if dump_plat_mincpu == 0, then always do single threaded dump
126 * if ncpu >= dump_plat_mincpu then try to use bzip2
127 *
128 * dump_metrics_on if set, metrics are collected in the kernel, passed
129 * to savecore via the dump file, and recorded by savecore in
130 * METRICS.txt.
131 */
132 uint_t dump_ncpu_low = 4; /* minimum config for parallel lzjb */
133 uint_t dump_bzip2_level = 1; /* bzip2 level (1-9) */
134
135 /* Use dump_plat_mincpu_default unless this variable is set by /etc/system */
136 #define MINCPU_NOT_SET ((uint_t)-1)
137 uint_t dump_plat_mincpu = MINCPU_NOT_SET;
138
139 /* tunables sizing the pre-reserved heap handed to kmem_dump_init() */
140 uint_t dump_kmem_permap = 1024; /* bytes reserved per cmap entry */
141 uint_t dump_kmem_pages = 8; /* extra pages added to the reserve */
142
143 /* Define multiple buffers per helper to avoid stalling */
144 #define NCBUF_PER_HELPER 2
145 #define NCMAP_PER_HELPER 4
146
147 /* minimum number of helpers configured */
148 #define MINHELPERS (dump_ncpu_low)
149 #define MINCBUFS (MINHELPERS * NCBUF_PER_HELPER)
150
151 /*
152 * Define constant parameters.
153 *
154 * CBUF_SIZE size of an output buffer
155 *
156 * CBUF_MAPSIZE size of virtual range for mapping pages
157 *
158 * CBUF_MAPNP size of virtual range in pages
159 *
160 */
161 #define DUMP_1KB ((size_t)1 << 10)
662 }
663
664 new->cmap = kmem_zalloc(new->ncmap * sizeof (cbuf_t), KM_SLEEP);
665 cpend = &new->cmap[new->ncmap];
666 for (cp = new->cmap; cp != cpend; cp++) {
667 cp->state = CBUF_FREEMAP;
668 cp->size = CBUF_MAPSIZE;
669 cp->buf = vmem_xalloc(heap_arena, CBUF_MAPSIZE, CBUF_MAPSIZE,
670 0, 0, NULL, NULL, VM_SLEEP);
671 }
672
673 /* reserve VA to be backed with spare pages at crash time */
674 if (new->maxsize > 0) {
675 new->maxsize = P2ROUNDUP(new->maxsize, PAGESIZE);
676 new->maxvmsize = P2ROUNDUP(new->maxsize, CBUF_MAPSIZE);
677 new->maxvm = vmem_xalloc(heap_arena, new->maxvmsize,
678 CBUF_MAPSIZE, 0, 0, NULL, NULL, VM_SLEEP);
679 }
680
681 /*
682 * Reserve memory for kmem allocation calls made during crash
683 * dump. The hat layer allocates memory for each mapping
684 * created, and the I/O path allocates buffers and data structs.
685 * Add a few pages for safety.
686 */
687 kmem_dump_init((new->ncmap * dump_kmem_permap) +
688 (dump_kmem_pages * PAGESIZE));
689
690 /* set new config pointers */
691 *old = *new;
692 }
693
694 /*
695 * Define a struct memlist walker to optimize bitnum to pfn
696 * lookup. The walker maintains the state of the list traversal.
697 */
698 typedef struct dumpmlw {
699 struct memlist *mp; /* memlist the walker is currently on */
700 pgcnt_t basenum; /* bitnum base offset of mp */
701 pgcnt_t mppages; /* size of current memlist, in pages */
702 pgcnt_t mpleft; /* pages left to end of current memlist */
703 pfn_t mpaddr; /* first pfn in current memlist */
704 } dumpmlw_t;
705
706 /* initialize the walker */
|
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2018 Joyent, Inc.
25 */
26
27 #include <sys/types.h>
28 #include <sys/param.h>
29 #include <sys/systm.h>
30 #include <sys/vm.h>
31 #include <sys/proc.h>
32 #include <sys/file.h>
33 #include <sys/conf.h>
34 #include <sys/kmem.h>
35 #include <sys/mem.h>
36 #include <sys/mman.h>
37 #include <sys/vnode.h>
38 #include <sys/errno.h>
39 #include <sys/memlist.h>
40 #include <sys/dumphdr.h>
41 #include <sys/dumpadm.h>
42 #include <sys/ksyms.h>
43 #include <sys/compress.h>
44 #include <sys/stream.h>
57 #include <sys/fs/snode.h>
58 #include <sys/ontrap.h>
59 #include <sys/panic.h>
60 #include <sys/dkio.h>
61 #include <sys/vtoc.h>
62 #include <sys/errorq.h>
63 #include <sys/fm/util.h>
64 #include <sys/fs/zfs.h>
65
66 #include <vm/hat.h>
67 #include <vm/as.h>
68 #include <vm/page.h>
69 #include <vm/pvn.h>
70 #include <vm/seg.h>
71 #include <vm/seg_kmem.h>
72 #include <sys/clock_impl.h>
73 #include <sys/hold_page.h>
74
75 #include <bzip2/bzlib.h>
76
77 #define ONE_GIG (1024 * 1024 * 1024UL) /* bytes in 1 GiB, as unsigned long */
78
79 /*
80 * Crash dump time is dominated by disk write time. To reduce this,
81 * the stronger compression method bzip2 is applied to reduce the dump
82 * size and hence reduce I/O time. However, bzip2 is much more
83 * computationally expensive than the existing lzjb algorithm, so to
84 * avoid increasing compression time, CPUs that are otherwise idle
85 * during panic are employed to parallelize the compression task.
86 * Many helper CPUs are needed to prevent bzip2 from being a
87 * bottleneck, and on systems with too few CPUs, the lzjb algorithm is
88 * parallelized instead. Lastly, I/O and compression are performed by
89 * different CPUs, and are hence overlapped in time, unlike the older
90 * serial code.
91 *
92 * Another important consideration is the speed of the dump
93 * device. Faster disks need fewer CPUs in order to benefit from
94 * parallel lzjb versus parallel bzip2. Therefore, the CPU count
95 * threshold for switching from parallel lzjb to parallel bzip2 is
96 * elevated for faster disks. The dump device speed is inferred from
97 * the setting for dumpbuf.iosize, see dump_update_clevel.
98 */
123 * Higher numbers give greater compression, but take more memory
124 * and time. Memory used per helper is ~(dump_bzip2_level * 1MB).
125 *
126 * dump_plat_mincpu the cross-over limit for using bzip2 (per platform):
127 * if dump_plat_mincpu == 0, then always do single threaded dump
128 * if ncpu >= dump_plat_mincpu then try to use bzip2
129 *
130 * dump_metrics_on if set, metrics are collected in the kernel, passed
131 * to savecore via the dump file, and recorded by savecore in
132 * METRICS.txt.
133 */
134 uint_t dump_ncpu_low = 4; /* minimum config for parallel lzjb */
135 uint_t dump_bzip2_level = 1; /* bzip2 level (1-9) */
136
137 /* Use dump_plat_mincpu_default unless this variable is set by /etc/system */
138 #define MINCPU_NOT_SET ((uint_t)-1)
139 uint_t dump_plat_mincpu = MINCPU_NOT_SET;
140
141 /* tunables sizing the pre-reserved heap handed to kmem_dump_init() */
142 uint_t dump_kmem_permap = 1024; /* bytes reserved per cmap entry */
143 uint_t dump_kmem_pages = 0; /* extra pages; 0 = pick 8 or 20 by physmem */
144
145 /* Define multiple buffers per helper to avoid stalling */
146 #define NCBUF_PER_HELPER 2
147 #define NCMAP_PER_HELPER 4
148
149 /* minimum number of helpers configured */
150 #define MINHELPERS (dump_ncpu_low)
151 #define MINCBUFS (MINHELPERS * NCBUF_PER_HELPER)
152
153 /*
154 * Define constant parameters.
155 *
156 * CBUF_SIZE size of an output buffer
157 *
158 * CBUF_MAPSIZE size of virtual range for mapping pages
159 *
160 * CBUF_MAPNP size of virtual range in pages
161 *
162 */
163 #define DUMP_1KB ((size_t)1 << 10)
664 }
665
666 new->cmap = kmem_zalloc(new->ncmap * sizeof (cbuf_t), KM_SLEEP);
667 cpend = &new->cmap[new->ncmap];
668 for (cp = new->cmap; cp != cpend; cp++) {
669 cp->state = CBUF_FREEMAP;
670 cp->size = CBUF_MAPSIZE;
671 cp->buf = vmem_xalloc(heap_arena, CBUF_MAPSIZE, CBUF_MAPSIZE,
672 0, 0, NULL, NULL, VM_SLEEP);
673 }
674
675 /* reserve VA to be backed with spare pages at crash time */
676 if (new->maxsize > 0) {
677 new->maxsize = P2ROUNDUP(new->maxsize, PAGESIZE);
678 new->maxvmsize = P2ROUNDUP(new->maxsize, CBUF_MAPSIZE);
679 new->maxvm = vmem_xalloc(heap_arena, new->maxvmsize,
680 CBUF_MAPSIZE, 0, 0, NULL, NULL, VM_SLEEP);
681 }
682
683 /*
684 * Reserve memory for kmem allocation calls made during crash dump. The
685 * hat layer allocates memory for each mapping created, and the I/O path
686 * allocates buffers and data structs.
687 *
688 * On larger systems, we easily exceed the lower amount, so we need some
689 * more space; the cut-over point is relatively arbitrary. If we run
690 * out, the only impact is that kmem state in the dump becomes
691 * inconsistent.
692 */
693
694 if (dump_kmem_pages == 0) {
695 if (physmem > (16 * ONE_GIG) / PAGESIZE)
696 dump_kmem_pages = 20;
697 else
698 dump_kmem_pages = 8;
699 }
700
701 kmem_dump_init((new->ncmap * dump_kmem_permap) +
702 (dump_kmem_pages * PAGESIZE));
703
704 /* set new config pointers */
705 *old = *new;
706 }
707
708 /*
709 * Define a struct memlist walker to optimize bitnum to pfn
710 * lookup. The walker maintains the state of the list traversal.
711 */
712 typedef struct dumpmlw {
713 struct memlist *mp; /* memlist the walker is currently on */
714 pgcnt_t basenum; /* bitnum base offset of mp */
715 pgcnt_t mppages; /* size of current memlist, in pages */
716 pgcnt_t mpleft; /* pages left to end of current memlist */
717 pfn_t mpaddr; /* first pfn in current memlist */
718 } dumpmlw_t;
719
720 /* initialize the walker */
|