1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2018 Joyent, Inc.
25 */
26
27 #include <sys/types.h>
28 #include <sys/param.h>
29 #include <sys/systm.h>
30 #include <sys/vm.h>
31 #include <sys/proc.h>
32 #include <sys/file.h>
33 #include <sys/conf.h>
34 #include <sys/kmem.h>
35 #include <sys/mem.h>
36 #include <sys/mman.h>
37 #include <sys/vnode.h>
38 #include <sys/errno.h>
39 #include <sys/memlist.h>
40 #include <sys/dumphdr.h>
41 #include <sys/dumpadm.h>
42 #include <sys/ksyms.h>
43 #include <sys/compress.h>
44 #include <sys/stream.h>
45 #include <sys/strsun.h>
46 #include <sys/cmn_err.h>
47 #include <sys/bitmap.h>
48 #include <sys/modctl.h>
49 #include <sys/utsname.h>
50 #include <sys/systeminfo.h>
51 #include <sys/vmem.h>
52 #include <sys/log.h>
53 #include <sys/var.h>
54 #include <sys/debug.h>
55 #include <sys/sunddi.h>
56 #include <fs/fs_subr.h>
57 #include <sys/fs/snode.h>
58 #include <sys/ontrap.h>
59 #include <sys/panic.h>
60 #include <sys/dkio.h>
61 #include <sys/vtoc.h>
62 #include <sys/errorq.h>
63 #include <sys/fm/util.h>
64 #include <sys/fs/zfs.h>
65
66 #include <vm/hat.h>
67 #include <vm/as.h>
68 #include <vm/page.h>
69 #include <vm/pvn.h>
70 #include <vm/seg.h>
71 #include <vm/seg_kmem.h>
72 #include <sys/clock_impl.h>
73 #include <sys/hold_page.h>
74
75 #include <bzip2/bzlib.h>
76
77 #define ONE_GIG (1024 * 1024 * 1024UL)
78
79 /*
80 * Crash dump time is dominated by disk write time. To reduce this,
81 * the stronger compression method bzip2 is applied to reduce the dump
82 * size and hence reduce I/O time. However, bzip2 is much more
83 * computationally expensive than the existing lzjb algorithm, so to
84 * avoid increasing compression time, CPUs that are otherwise idle
85 * during panic are employed to parallelize the compression task.
86 * Many helper CPUs are needed to prevent bzip2 from being a
87 * bottleneck, and on systems with too few CPUs, the lzjb algorithm is
88 * parallelized instead. Lastly, I/O and compression are performed by
89 * different CPUs, and are hence overlapped in time, unlike the older
90 * serial code.
91 *
92 * Another important consideration is the speed of the dump
 * device. Faster disks need fewer CPUs in order to benefit from
 * parallel lzjb versus parallel bzip2. Therefore, the CPU count
 * threshold for switching from parallel lzjb to parallel bzip2 is
 * elevated for faster disks. The dump device speed is inferred from
 * the setting of dumpbuf.iosize; see dump_update_clevel().
98 */
99
100 /*
101 * exported vars
102 */
103 kmutex_t dump_lock; /* lock for dump configuration */
104 dumphdr_t *dumphdr; /* dump header */
105 int dump_conflags = DUMP_KERNEL; /* dump configuration flags */
106 vnode_t *dumpvp; /* dump device vnode pointer */
107 u_offset_t dumpvp_size; /* size of dump device, in bytes */
108 char *dumppath; /* pathname of dump device */
109 int dump_timeout = 120; /* timeout for dumping pages */
110 int dump_timeleft; /* portion of dump_timeout remaining */
111 int dump_ioerr; /* dump i/o error */
112 int dump_check_used; /* enable check for used pages */
113 char *dump_stack_scratch; /* scratch area for saving stack summary */
114
115 /*
116 * Tunables for dump compression and parallelism. These can be set via
117 * /etc/system.
118 *
119 * dump_ncpu_low number of helpers for parallel lzjb
120 * This is also the minimum configuration.
121 *
122 * dump_bzip2_level bzip2 compression level: 1-9
123 * Higher numbers give greater compression, but take more memory
124 * and time. Memory used per helper is ~(dump_bzip2_level * 1MB).
125 *
126 * dump_plat_mincpu the cross-over limit for using bzip2 (per platform):
127 * if dump_plat_mincpu == 0, then always do single threaded dump
128 * if ncpu >= dump_plat_mincpu then try to use bzip2
129 *
130 * dump_metrics_on if set, metrics are collected in the kernel, passed
131 * to savecore via the dump file, and recorded by savecore in
132 * METRICS.txt.
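 *
 * These tunables are set from /etc/system with "set" directives, for
 * example (the values shown are purely illustrative):
 *
 *	set dump_ncpu_low = 4
 *	set dump_bzip2_level = 2
 *	set dump_plat_mincpu = 0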
133 */
134 uint_t dump_ncpu_low = 4; /* minimum config for parallel lzjb */
135 uint_t dump_bzip2_level = 1; /* bzip2 level (1-9) */
136
137 /* Use dump_plat_mincpu_default unless this variable is set by /etc/system */
138 #define MINCPU_NOT_SET ((uint_t)-1)
139 uint_t dump_plat_mincpu = MINCPU_NOT_SET;
140
141 /* tunables for pre-reserved heap */
142 uint_t dump_kmem_permap = 1024;
143 uint_t dump_kmem_pages = 0;
144
145 /* Define multiple buffers per helper to avoid stalling */
146 #define NCBUF_PER_HELPER 2
147 #define NCMAP_PER_HELPER 4
148
149 /* minimum number of helpers configured */
150 #define MINHELPERS (dump_ncpu_low)
151 #define MINCBUFS (MINHELPERS * NCBUF_PER_HELPER)
152
153 /*
154 * Define constant parameters.
155 *
156 * CBUF_SIZE size of an output buffer
157 *
158 * CBUF_MAPSIZE size of virtual range for mapping pages
159 *
160 * CBUF_MAPNP size of virtual range in pages
161 *
162 */
163 #define DUMP_1KB ((size_t)1 << 10)
164 #define DUMP_1MB ((size_t)1 << 20)
165 #define CBUF_SIZE ((size_t)1 << 17)
166 #define CBUF_MAPSHIFT (22)
167 #define CBUF_MAPSIZE ((size_t)1 << CBUF_MAPSHIFT)
168 #define CBUF_MAPNP ((size_t)1 << (CBUF_MAPSHIFT - PAGESHIFT))
169
170 /*
171 * Compression metrics are accumulated nano-second subtotals. The
172 * results are normalized by the number of pages dumped. A report is
173 * generated when dumpsys() completes and is saved in the dump image
174 * after the trailing dump header.
175 *
176 * Metrics are always collected. Set the variable dump_metrics_on to
177 * cause metrics to be saved in the crash file, where savecore will
 * save them in the file METRICS.txt.
179 */
180 #define PERPAGES \
181 PERPAGE(bitmap) PERPAGE(map) PERPAGE(unmap) \
182 PERPAGE(copy) PERPAGE(compress) \
183 PERPAGE(write) \
184 PERPAGE(inwait) PERPAGE(outwait)
185
186 typedef struct perpage {
187 #define PERPAGE(x) hrtime_t x;
188 PERPAGES
189 #undef PERPAGE
190 } perpage_t;
191
192 /*
193 * This macro controls the code generation for collecting dump
194 * performance information. By default, the code is generated, but
195 * automatic saving of the information is disabled. If dump_metrics_on
196 * is set to 1, the timing information is passed to savecore via the
197 * crash file, where it is appended to the file dump-dir/METRICS.txt.
198 */
199 #define COLLECT_METRICS
200
201 #ifdef COLLECT_METRICS
202 uint_t dump_metrics_on = 0; /* set to 1 to enable recording metrics */
203
204 #define HRSTART(v, m) v##ts.m = gethrtime()
205 #define HRSTOP(v, m) v.m += gethrtime() - v##ts.m
206 #define HRBEGIN(v, m, s) v##ts.m = gethrtime(); v.size += s
207 #define HREND(v, m) v.m += gethrtime() - v##ts.m
208 #define HRNORM(v, m, n) v.m /= (n)
209
210 #else
211 #define HRSTART(v, m)
212 #define HRSTOP(v, m)
213 #define HRBEGIN(v, m, s)
214 #define HREND(v, m)
215 #define HRNORM(v, m, n)
216 #endif /* COLLECT_METRICS */
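
/*
 * Typical usage brackets a measured operation. Because of the "##"
 * token pasting, HRSTART(hp->perpage, copy) records a timestamp in
 * hp->perpagets.copy, and the matching HRSTOP accumulates the elapsed
 * nanoseconds into hp->perpage.copy, as in dumpsys_sread() below:
 *
 *	HRSTART(hp->perpage, copy);
 *	hp->in = dumpsys_copy_page(hp, hp->in);
 *	HRSTOP(hp->perpage, copy);
 */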
217
218 /*
219 * Buffers for copying and compressing memory pages.
220 *
221 * cbuf_t buffer controllers: used for both input and output.
222 *
223 * The buffer state indicates how it is being used:
224 *
225 * CBUF_FREEMAP: CBUF_MAPSIZE virtual address range is available for
226 * mapping input pages.
227 *
228 * CBUF_INREADY: input pages are mapped and ready for compression by a
229 * helper.
230 *
231 * CBUF_USEDMAP: mapping has been consumed by a helper. Needs unmap.
232 *
233 * CBUF_FREEBUF: CBUF_SIZE output buffer, which is available.
234 *
235 * CBUF_WRITE: CBUF_SIZE block of compressed pages from a helper,
236 * ready to write out.
237 *
238 * CBUF_ERRMSG: CBUF_SIZE block of error messages from a helper
239 * (reports UE errors.)
240 */
241
242 typedef enum cbufstate {
243 CBUF_FREEMAP,
244 CBUF_INREADY,
245 CBUF_USEDMAP,
246 CBUF_FREEBUF,
247 CBUF_WRITE,
248 CBUF_ERRMSG
249 } cbufstate_t;
250
251 typedef struct cbuf cbuf_t;
252
253 struct cbuf {
254 cbuf_t *next; /* next in list */
255 cbufstate_t state; /* processing state */
256 size_t used; /* amount used */
257 size_t size; /* mem size */
258 char *buf; /* kmem or vmem */
259 pgcnt_t pagenum; /* index to pfn map */
260 pgcnt_t bitnum; /* first set bitnum */
261 pfn_t pfn; /* first pfn in mapped range */
262 int off; /* byte offset to first pfn */
263 };
264
265 static char dump_osimage_uuid[36 + 1];
266
267 #define isdigit(ch) ((ch) >= '0' && (ch) <= '9')
268 #define isxdigit(ch) (isdigit(ch) || ((ch) >= 'a' && (ch) <= 'f') || \
269 ((ch) >= 'A' && (ch) <= 'F'))
270
271 /*
272 * cqueue_t queues: a uni-directional channel for communication
273 * from the master to helper tasks or vice-versa using put and
274 * get primitives. Both mappings and data buffers are passed via
275 * queues. Producers close a queue when done. The number of
276 * active producers is reference counted so the consumer can
277 * detect end of data. Concurrent access is mediated by atomic
278 * operations for panic dump, or mutex/cv for live dump.
279 *
 * There are four queues, used as follows:
281 *
282 * Queue Dataflow NewState
283 * --------------------------------------------------
284 * mainq master -> master FREEMAP
285 * master has initialized or unmapped an input buffer
286 * --------------------------------------------------
287 * helperq master -> helper INREADY
288 * master has mapped input for use by helper
289 * --------------------------------------------------
290 * mainq master <- helper USEDMAP
291 * helper is done with input
292 * --------------------------------------------------
293 * freebufq master -> helper FREEBUF
294 * master has initialized or written an output buffer
295 * --------------------------------------------------
296 * mainq master <- helper WRITE
297 * block of compressed pages from a helper
298 * --------------------------------------------------
299 * mainq master <- helper ERRMSG
300 * error messages from a helper (memory error case)
301 * --------------------------------------------------
302 * writerq master <- master WRITE
303 * non-blocking queue of blocks to write
304 * --------------------------------------------------
305 */
306 typedef struct cqueue {
307 cbuf_t *volatile first; /* first in list */
308 cbuf_t *last; /* last in list */
309 hrtime_t ts; /* timestamp */
310 hrtime_t empty; /* total time empty */
311 kmutex_t mutex; /* live state lock */
312 kcondvar_t cv; /* live wait var */
313 lock_t spinlock; /* panic mode spin lock */
314 volatile uint_t open; /* producer ref count */
315 } cqueue_t;
316
317 /*
318 * Convenience macros for using the cqueue functions
319 * Note that the caller must have defined "dumpsync_t *ds"
320 */
321 #define CQ_IS_EMPTY(q) \
322 (ds->q.first == NULL)
323
324 #define CQ_OPEN(q) \
325 atomic_inc_uint(&ds->q.open)
326
327 #define CQ_CLOSE(q) \
328 dumpsys_close_cq(&ds->q, ds->live)
329
330 #define CQ_PUT(q, cp, st) \
331 dumpsys_put_cq(&ds->q, cp, st, ds->live)
332
333 #define CQ_GET(q) \
334 dumpsys_get_cq(&ds->q, ds->live)
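
/*
 * For example (a sketch of the helper side; see dumpsys_sread below),
 * a helper returns a consumed input mapping to the main task and then
 * blocks waiting for the next one:
 *
 *	dumpsync_t *ds = hp->ds;
 *
 *	CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP);
 *	hp->cpin = CQ_GET(helperq);
 */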
335
336 /*
337 * Dynamic state when dumpsys() is running.
338 */
339 typedef struct dumpsync {
340 pgcnt_t npages; /* subtotal of pages dumped */
341 pgcnt_t pages_mapped; /* subtotal of pages mapped */
342 pgcnt_t pages_used; /* subtotal of pages used per map */
343 size_t nwrite; /* subtotal of bytes written */
344 uint_t live; /* running live dump */
345 uint_t neednl; /* will need to print a newline */
346 uint_t percent; /* dump progress */
347 uint_t percent_done; /* dump progress reported */
348 int sec_done; /* dump progress last report time */
349 cqueue_t freebufq; /* free kmem bufs for writing */
350 cqueue_t mainq; /* input for main task */
351 cqueue_t helperq; /* input for helpers */
352 cqueue_t writerq; /* input for writer */
353 hrtime_t start; /* start time */
354 hrtime_t elapsed; /* elapsed time when completed */
355 hrtime_t iotime; /* time spent writing nwrite bytes */
356 hrtime_t iowait; /* time spent waiting for output */
357 hrtime_t iowaitts; /* iowait timestamp */
358 perpage_t perpage; /* metrics */
359 perpage_t perpagets;
360 int dumpcpu; /* master cpu */
361 } dumpsync_t;
362
363 static dumpsync_t dumpsync; /* synchronization vars */
364
365 /*
366 * helper_t helpers: contains the context for a stream. CPUs run in
367 * parallel at dump time; each CPU creates a single stream of
368 * compression data. Stream data is divided into CBUF_SIZE blocks.
369 * The blocks are written in order within a stream. But, blocks from
370 * multiple streams can be interleaved. Each stream is identified by a
371 * unique tag.
372 */
373 typedef struct helper {
374 int helper; /* bound helper id */
375 int tag; /* compression stream tag */
376 perpage_t perpage; /* per page metrics */
377 perpage_t perpagets; /* per page metrics (timestamps) */
378 taskqid_t taskqid; /* live dump task ptr */
379 int in, out; /* buffer offsets */
380 cbuf_t *cpin, *cpout, *cperr; /* cbuf objects in process */
381 dumpsync_t *ds; /* pointer to sync vars */
382 size_t used; /* counts input consumed */
383 char *page; /* buffer for page copy */
384 char *lzbuf; /* lzjb output */
385 bz_stream bzstream; /* bzip2 state */
386 } helper_t;
387
388 #define MAINHELPER (-1) /* helper is also the main task */
389 #define FREEHELPER (-2) /* unbound helper */
390 #define DONEHELPER (-3) /* helper finished */
391
392 /*
393 * configuration vars for dumpsys
394 */
395 typedef struct dumpcfg {
396 int threshold; /* ncpu threshold for bzip2 */
397 int nhelper; /* number of helpers */
398 int nhelper_used; /* actual number of helpers used */
399 int ncmap; /* number VA pages for compression */
400 int ncbuf; /* number of bufs for compression */
401 int ncbuf_used; /* number of bufs in use */
402 uint_t clevel; /* dump compression level */
403 helper_t *helper; /* array of helpers */
404 cbuf_t *cmap; /* array of input (map) buffers */
405 cbuf_t *cbuf; /* array of output buffers */
406 ulong_t *helpermap; /* set of dumpsys helper CPU ids */
407 ulong_t *bitmap; /* bitmap for marking pages to dump */
408 ulong_t *rbitmap; /* bitmap for used CBUF_MAPSIZE ranges */
409 pgcnt_t bitmapsize; /* size of bitmap */
410 pgcnt_t rbitmapsize; /* size of bitmap for ranges */
411 pgcnt_t found4m; /* number ranges allocated by dump */
412 pgcnt_t foundsm; /* number small pages allocated by dump */
413 pid_t *pids; /* list of process IDs at dump time */
414 size_t maxsize; /* memory size needed at dump time */
415 size_t maxvmsize; /* size of reserved VM */
416 char *maxvm; /* reserved VM for spare pages */
417 lock_t helper_lock; /* protect helper state */
418 char helpers_wanted; /* flag to enable parallelism */
419 } dumpcfg_t;
420
421 static dumpcfg_t dumpcfg; /* config vars */
422
423 /*
424 * The dump I/O buffer.
425 *
 * There is one I/O buffer used by dumpvp_write and dumpvp_flush. It is
427 * sized according to the optimum device transfer speed.
428 */
429 typedef struct dumpbuf {
430 vnode_t *cdev_vp; /* VCHR open of the dump device */
431 len_t vp_limit; /* maximum write offset */
432 offset_t vp_off; /* current dump device offset */
433 char *cur; /* dump write pointer */
434 char *start; /* dump buffer address */
435 char *end; /* dump buffer end */
436 size_t size; /* size of dumpbuf in bytes */
437 size_t iosize; /* best transfer size for device */
438 } dumpbuf_t;
439
440 dumpbuf_t dumpbuf; /* I/O buffer */
441
442 /*
443 * The dump I/O buffer must be at least one page, at most xfer_size
444 * bytes, and should scale with physmem in between. The transfer size
445 * passed in will either represent a global default (maxphys) or the
446 * best size for the device. The size of the dumpbuf I/O buffer is
447 * limited by dumpbuf_limit (8MB by default) because the dump
448 * performance saturates beyond a certain size. The default is to
449 * select 1/4096 of the memory.
450 */
451 static int dumpbuf_fraction = 12; /* memory size scale factor */
452 static size_t dumpbuf_limit = 8 * DUMP_1MB; /* max I/O buf size */
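
/*
 * For example, with 16GB of memory and 4KB pages, physmem is 4M
 * pages, so ptob(physmem >> dumpbuf_fraction) yields a 4MB buffer;
 * dumpbuf_iosize() below then clamps the result to at least PAGESIZE
 * and to at most xfer_size and dumpbuf_limit.
 */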
453
454 static size_t
455 dumpbuf_iosize(size_t xfer_size)
456 {
457 size_t iosize = ptob(physmem >> dumpbuf_fraction);
458
459 if (iosize < PAGESIZE)
460 iosize = PAGESIZE;
461 else if (iosize > xfer_size)
462 iosize = xfer_size;
463 if (iosize > dumpbuf_limit)
464 iosize = dumpbuf_limit;
465 return (iosize & PAGEMASK);
466 }
467
468 /*
469 * resize the I/O buffer
470 */
471 static void
472 dumpbuf_resize(void)
473 {
474 char *old_buf = dumpbuf.start;
475 size_t old_size = dumpbuf.size;
476 char *new_buf;
477 size_t new_size;
478
479 ASSERT(MUTEX_HELD(&dump_lock));
480
481 new_size = dumpbuf_iosize(MAX(dumpbuf.iosize, maxphys));
482 if (new_size <= old_size)
483 return; /* no need to reallocate buffer */
484
485 new_buf = kmem_alloc(new_size, KM_SLEEP);
486 dumpbuf.size = new_size;
487 dumpbuf.start = new_buf;
488 dumpbuf.end = new_buf + new_size;
489 kmem_free(old_buf, old_size);
490 }
491
492 /*
493 * dump_update_clevel is called when dumpadm configures the dump device.
494 * Calculate number of helpers and buffers.
495 * Allocate the minimum configuration for now.
496 *
497 * When the dump file is configured we reserve a minimum amount of
498 * memory for use at crash time. But we reserve VA for all the memory
499 * we really want in order to do the fastest dump possible. The VA is
500 * backed by pages not being dumped, according to the bitmap. If
501 * there is insufficient spare memory, however, we fall back to the
502 * minimum.
503 *
504 * Live dump (savecore -L) always uses the minimum config.
505 *
506 * clevel 0 is single threaded lzjb
507 * clevel 1 is parallel lzjb
508 * clevel 2 is parallel bzip2
509 *
510 * The ncpu threshold is selected with dump_plat_mincpu.
511 * On OPL, set_platform_defaults() overrides the sun4u setting.
512 * The actual values are defined via DUMP_PLAT_*_MINCPU macros.
513 *
514 * Architecture Threshold Algorithm
515 * sun4u < 51 parallel lzjb
516 * sun4u >= 51 parallel bzip2(*)
517 * sun4u OPL < 8 parallel lzjb
518 * sun4u OPL >= 8 parallel bzip2(*)
519 * sun4v < 128 parallel lzjb
520 * sun4v >= 128 parallel bzip2(*)
521 * x86 < 11 parallel lzjb
522 * x86 >= 11 parallel bzip2(*)
523 * 32-bit N/A single-threaded lzjb
524 *
525 * (*) bzip2 is only chosen if there is sufficient available
526 * memory for buffers at dump time. See dumpsys_get_maxmem().
527 *
528 * Faster dump devices have larger I/O buffers. The threshold value is
529 * increased according to the size of the dump I/O buffer, because
530 * parallel lzjb performs better with faster disks. For buffers >= 1MB
531 * the threshold is 3X; for buffers >= 256K threshold is 2X.
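 *
 * For example, with the x86 default threshold of 11 and an I/O
 * buffer of 1MB or more, the effective threshold becomes 33, so a
 * 32-CPU x86 system (31 helpers plus the main CPU) still selects
 * parallel lzjb.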
532 *
533 * For parallel dumps, the number of helpers is ncpu-1. The CPU
534 * running panic runs the main task. For single-threaded dumps, the
535 * panic CPU does lzjb compression (it is tagged as MAINHELPER.)
536 *
537 * Need multiple buffers per helper so that they do not block waiting
538 * for the main task.
539 * parallel single-threaded
540 * Number of output buffers: nhelper*2 1
541 * Number of mapping buffers: nhelper*4 1
542 *
543 */
544 static void
dump_update_clevel(void)
546 {
547 int tag;
548 size_t bz2size;
549 helper_t *hp, *hpend;
550 cbuf_t *cp, *cpend;
551 dumpcfg_t *old = &dumpcfg;
552 dumpcfg_t newcfg = *old;
553 dumpcfg_t *new = &newcfg;
554
555 ASSERT(MUTEX_HELD(&dump_lock));
556
557 /*
558 * Free the previously allocated bufs and VM.
559 */
560 if (old->helper != NULL) {
561
562 /* helpers */
563 hpend = &old->helper[old->nhelper];
564 for (hp = old->helper; hp != hpend; hp++) {
565 if (hp->lzbuf != NULL)
566 kmem_free(hp->lzbuf, PAGESIZE);
567 if (hp->page != NULL)
568 kmem_free(hp->page, PAGESIZE);
569 }
570 kmem_free(old->helper, old->nhelper * sizeof (helper_t));
571
572 /* VM space for mapping pages */
573 cpend = &old->cmap[old->ncmap];
574 for (cp = old->cmap; cp != cpend; cp++)
575 vmem_xfree(heap_arena, cp->buf, CBUF_MAPSIZE);
576 kmem_free(old->cmap, old->ncmap * sizeof (cbuf_t));
577
578 /* output bufs */
579 cpend = &old->cbuf[old->ncbuf];
580 for (cp = old->cbuf; cp != cpend; cp++)
581 if (cp->buf != NULL)
582 kmem_free(cp->buf, cp->size);
583 kmem_free(old->cbuf, old->ncbuf * sizeof (cbuf_t));
584
585 /* reserved VM for dumpsys_get_maxmem */
586 if (old->maxvmsize > 0)
587 vmem_xfree(heap_arena, old->maxvm, old->maxvmsize);
588 }
589
590 /*
591 * Allocate memory and VM.
592 * One CPU runs dumpsys, the rest are helpers.
593 */
594 new->nhelper = ncpus - 1;
595 if (new->nhelper < 1)
596 new->nhelper = 1;
597
598 if (new->nhelper > DUMP_MAX_NHELPER)
599 new->nhelper = DUMP_MAX_NHELPER;
600
601 /* use platform default, unless /etc/system overrides */
602 if (dump_plat_mincpu == MINCPU_NOT_SET)
603 dump_plat_mincpu = dump_plat_mincpu_default;
604
605 /* increase threshold for faster disks */
606 new->threshold = dump_plat_mincpu;
607 if (dumpbuf.iosize >= DUMP_1MB)
608 new->threshold *= 3;
609 else if (dumpbuf.iosize >= (256 * DUMP_1KB))
610 new->threshold *= 2;
611
612 /* figure compression level based upon the computed threshold. */
613 if (dump_plat_mincpu == 0 || new->nhelper < 2) {
614 new->clevel = 0;
615 new->nhelper = 1;
616 } else if ((new->nhelper + 1) >= new->threshold) {
617 new->clevel = DUMP_CLEVEL_BZIP2;
618 } else {
619 new->clevel = DUMP_CLEVEL_LZJB;
620 }
621
622 if (new->clevel == 0) {
623 new->ncbuf = 1;
624 new->ncmap = 1;
625 } else {
626 new->ncbuf = NCBUF_PER_HELPER * new->nhelper;
627 new->ncmap = NCMAP_PER_HELPER * new->nhelper;
628 }
629
630 /*
631 * Allocate new data structures and buffers for MINHELPERS,
632 * and also figure the max desired size.
633 */
634 bz2size = BZ2_bzCompressInitSize(dump_bzip2_level);
635 new->maxsize = 0;
636 new->maxvmsize = 0;
637 new->maxvm = NULL;
638 tag = 1;
639 new->helper = kmem_zalloc(new->nhelper * sizeof (helper_t), KM_SLEEP);
640 hpend = &new->helper[new->nhelper];
641 for (hp = new->helper; hp != hpend; hp++) {
642 hp->tag = tag++;
643 if (hp < &new->helper[MINHELPERS]) {
644 hp->lzbuf = kmem_alloc(PAGESIZE, KM_SLEEP);
645 hp->page = kmem_alloc(PAGESIZE, KM_SLEEP);
646 } else if (new->clevel < DUMP_CLEVEL_BZIP2) {
647 new->maxsize += 2 * PAGESIZE;
648 } else {
649 new->maxsize += PAGESIZE;
650 }
651 if (new->clevel >= DUMP_CLEVEL_BZIP2)
652 new->maxsize += bz2size;
653 }
654
655 new->cbuf = kmem_zalloc(new->ncbuf * sizeof (cbuf_t), KM_SLEEP);
656 cpend = &new->cbuf[new->ncbuf];
657 for (cp = new->cbuf; cp != cpend; cp++) {
658 cp->state = CBUF_FREEBUF;
659 cp->size = CBUF_SIZE;
660 if (cp < &new->cbuf[MINCBUFS])
661 cp->buf = kmem_alloc(cp->size, KM_SLEEP);
662 else
663 new->maxsize += cp->size;
664 }
665
666 new->cmap = kmem_zalloc(new->ncmap * sizeof (cbuf_t), KM_SLEEP);
667 cpend = &new->cmap[new->ncmap];
668 for (cp = new->cmap; cp != cpend; cp++) {
669 cp->state = CBUF_FREEMAP;
670 cp->size = CBUF_MAPSIZE;
671 cp->buf = vmem_xalloc(heap_arena, CBUF_MAPSIZE, CBUF_MAPSIZE,
672 0, 0, NULL, NULL, VM_SLEEP);
673 }
674
675 /* reserve VA to be backed with spare pages at crash time */
676 if (new->maxsize > 0) {
677 new->maxsize = P2ROUNDUP(new->maxsize, PAGESIZE);
678 new->maxvmsize = P2ROUNDUP(new->maxsize, CBUF_MAPSIZE);
679 new->maxvm = vmem_xalloc(heap_arena, new->maxvmsize,
680 CBUF_MAPSIZE, 0, 0, NULL, NULL, VM_SLEEP);
681 }
682
683 /*
684 * Reserve memory for kmem allocation calls made during crash dump. The
685 * hat layer allocates memory for each mapping created, and the I/O path
686 * allocates buffers and data structs.
687 *
 * On larger systems, we easily exceed the smaller default reservation
 * below, so we need more space; the cut-over point is relatively
 * arbitrary. If we run
690 * out, the only impact is that kmem state in the dump becomes
691 * inconsistent.
692 */
693
694 if (dump_kmem_pages == 0) {
695 if (physmem > (16 * ONE_GIG) / PAGESIZE)
696 dump_kmem_pages = 20;
697 else
698 dump_kmem_pages = 8;
699 }
700
701 kmem_dump_init((new->ncmap * dump_kmem_permap) +
702 (dump_kmem_pages * PAGESIZE));
703
704 /* set new config pointers */
705 *old = *new;
706 }
707
708 /*
709 * Define a struct memlist walker to optimize bitnum to pfn
710 * lookup. The walker maintains the state of the list traversal.
711 */
712 typedef struct dumpmlw {
713 struct memlist *mp; /* current memlist */
714 pgcnt_t basenum; /* bitnum base offset */
715 pgcnt_t mppages; /* current memlist size */
716 pgcnt_t mpleft; /* size to end of current memlist */
717 pfn_t mpaddr; /* first pfn in memlist */
718 } dumpmlw_t;
719
720 /* initialize the walker */
721 static inline void
722 dump_init_memlist_walker(dumpmlw_t *pw)
723 {
724 pw->mp = phys_install;
725 pw->basenum = 0;
726 pw->mppages = pw->mp->ml_size >> PAGESHIFT;
727 pw->mpleft = pw->mppages;
728 pw->mpaddr = pw->mp->ml_address >> PAGESHIFT;
729 }
730
731 /*
732 * Lookup pfn given bitnum. The memlist can be quite long on some
733 * systems (e.g.: one per board). To optimize sequential lookups, the
734 * caller initializes and presents a memlist walker.
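 *
 * A typical sequential scan looks like this (see dumpsys_get_maxmem):
 *
 *	dumpmlw_t mlw;
 *
 *	dump_init_memlist_walker(&mlw);
 *	for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum++)
 *		pfn = dump_bitnum_to_pfn(bitnum, &mlw);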
735 */
736 static pfn_t
737 dump_bitnum_to_pfn(pgcnt_t bitnum, dumpmlw_t *pw)
738 {
739 bitnum -= pw->basenum;
740 while (pw->mp != NULL) {
741 if (bitnum < pw->mppages) {
742 pw->mpleft = pw->mppages - bitnum;
743 return (pw->mpaddr + bitnum);
744 }
745 bitnum -= pw->mppages;
746 pw->basenum += pw->mppages;
747 pw->mp = pw->mp->ml_next;
748 if (pw->mp != NULL) {
749 pw->mppages = pw->mp->ml_size >> PAGESHIFT;
750 pw->mpleft = pw->mppages;
751 pw->mpaddr = pw->mp->ml_address >> PAGESHIFT;
752 }
753 }
754 return (PFN_INVALID);
755 }
756
757 static pgcnt_t
758 dump_pfn_to_bitnum(pfn_t pfn)
759 {
760 struct memlist *mp;
761 pgcnt_t bitnum = 0;
762
763 for (mp = phys_install; mp != NULL; mp = mp->ml_next) {
764 if (pfn >= (mp->ml_address >> PAGESHIFT) &&
765 pfn < ((mp->ml_address + mp->ml_size) >> PAGESHIFT))
766 return (bitnum + pfn - (mp->ml_address >> PAGESHIFT));
767 bitnum += mp->ml_size >> PAGESHIFT;
768 }
769 return ((pgcnt_t)-1);
770 }
771
772 /*
773 * Set/test bitmap for a CBUF_MAPSIZE range which includes pfn. The
774 * mapping of pfn to range index is imperfect because pfn and bitnum
775 * do not have the same phase. To make sure a CBUF_MAPSIZE range is
776 * covered, call this for both ends:
777 * dump_set_used(base)
778 * dump_set_used(base+CBUF_MAPNP-1)
779 *
780 * This is used during a panic dump to mark pages allocated by
781 * dumpsys_get_maxmem(). The macro IS_DUMP_PAGE(pp) is used by
782 * page_get_mnode_freelist() to make sure pages used by dump are never
783 * allocated.
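 *
 * With 4KB pages (PAGESHIFT == 12) and CBUF_MAPSHIFT == 22,
 * CBUF_MAPP2R(x) reduces to (x) >> 10, so one range bit covers 1024
 * pages.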
784 */
785 #define CBUF_MAPP2R(pfn) ((pfn) >> (CBUF_MAPSHIFT - PAGESHIFT))
786
787 static void
788 dump_set_used(pfn_t pfn)
789 {
791 pgcnt_t bitnum, rbitnum;
792
793 bitnum = dump_pfn_to_bitnum(pfn);
794 ASSERT(bitnum != (pgcnt_t)-1);
795
796 rbitnum = CBUF_MAPP2R(bitnum);
797 ASSERT(rbitnum < dumpcfg.rbitmapsize);
798
799 BT_SET(dumpcfg.rbitmap, rbitnum);
800 }
801
802 int
803 dump_test_used(pfn_t pfn)
804 {
805 pgcnt_t bitnum, rbitnum;
806
807 bitnum = dump_pfn_to_bitnum(pfn);
808 ASSERT(bitnum != (pgcnt_t)-1);
809
810 rbitnum = CBUF_MAPP2R(bitnum);
811 ASSERT(rbitnum < dumpcfg.rbitmapsize);
812
813 return (BT_TEST(dumpcfg.rbitmap, rbitnum));
814 }
815
816 /*
817 * dumpbzalloc and dumpbzfree are callbacks from the bzip2 library.
818 * dumpsys_get_maxmem() uses them for BZ2_bzCompressInit().
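 *
 * Allocation simply bumps *opaque, a byte offset into the
 * pre-reserved maxvm range, so dumpbzfree is a no-op: bzip2 allocates
 * only at init time, and the entire range is reclaimed en masse.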
819 */
820 static void *
821 dumpbzalloc(void *opaque, int items, int size)
822 {
823 size_t *sz;
824 char *ret;
825
826 ASSERT(opaque != NULL);
827 sz = opaque;
828 ret = dumpcfg.maxvm + *sz;
829 *sz += items * size;
830 *sz = P2ROUNDUP(*sz, BZ2_BZALLOC_ALIGN);
831 ASSERT(*sz <= dumpcfg.maxvmsize);
832 return (ret);
833 }
834
835 /*ARGSUSED*/
836 static void
837 dumpbzfree(void *opaque, void *addr)
838 {
839 }
840
841 /*
842 * Perform additional checks on the page to see if we can really use
843 * it. The kernel (kas) pages are always set in the bitmap. However,
844 * boot memory pages (prom_ppages or P_BOOTPAGES) are not in the
845 * bitmap. So we check for them.
846 */
847 static inline int
848 dump_pfn_check(pfn_t pfn)
849 {
850 page_t *pp = page_numtopp_nolock(pfn);
851 if (pp == NULL || pp->p_pagenum != pfn ||
852 #if defined(__sparc)
853 pp->p_vnode == &promvp ||
854 #else
855 PP_ISBOOTPAGES(pp) ||
856 #endif
857 pp->p_toxic != 0)
858 return (0);
859 return (1);
860 }
861
862 /*
863 * Check a range to see if all contained pages are available and
864 * return non-zero if the range can be used.
865 */
866 static inline int
867 dump_range_check(pgcnt_t start, pgcnt_t end, pfn_t pfn)
868 {
869 for (; start < end; start++, pfn++) {
870 if (BT_TEST(dumpcfg.bitmap, start))
871 return (0);
872 if (!dump_pfn_check(pfn))
873 return (0);
874 }
875 return (1);
876 }
877
878 /*
879 * dumpsys_get_maxmem() is called during panic. Find unused ranges
880 * and use them for buffers. If we find enough memory switch to
881 * parallel bzip2, otherwise use parallel lzjb.
882 *
883 * It searches the dump bitmap in 2 passes. The first time it looks
884 * for CBUF_MAPSIZE ranges. On the second pass it uses small pages.
885 */
886 static void
dumpsys_get_maxmem(void)
888 {
889 dumpcfg_t *cfg = &dumpcfg;
890 cbuf_t *endcp = &cfg->cbuf[cfg->ncbuf];
891 helper_t *endhp = &cfg->helper[cfg->nhelper];
892 pgcnt_t bitnum, end;
893 size_t sz, endsz, bz2size;
894 pfn_t pfn, off;
895 cbuf_t *cp;
896 helper_t *hp, *ohp;
897 dumpmlw_t mlw;
898 int k;
899
900 /*
901 * Setting dump_plat_mincpu to 0 at any time forces a serial
902 * dump.
903 */
904 if (dump_plat_mincpu == 0) {
905 cfg->clevel = 0;
906 return;
907 }
908
909 /*
910 * There may be no point in looking for spare memory. If
911 * dumping all memory, then none is spare. If doing a serial
	 * dump, then we already have buffers.
913 */
914 if (cfg->maxsize == 0 || cfg->clevel < DUMP_CLEVEL_LZJB ||
915 (dump_conflags & DUMP_ALL) != 0) {
916 if (cfg->clevel > DUMP_CLEVEL_LZJB)
917 cfg->clevel = DUMP_CLEVEL_LZJB;
918 return;
919 }
920
921 sz = 0;
922 cfg->found4m = 0;
923 cfg->foundsm = 0;
924
925 /* bitmap of ranges used to estimate which pfns are being used */
926 bzero(dumpcfg.rbitmap, BT_SIZEOFMAP(dumpcfg.rbitmapsize));
927
928 /* find ranges that are not being dumped to use for buffers */
929 dump_init_memlist_walker(&mlw);
930 for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum = end) {
931 dump_timeleft = dump_timeout;
932 end = bitnum + CBUF_MAPNP;
933 pfn = dump_bitnum_to_pfn(bitnum, &mlw);
934 ASSERT(pfn != PFN_INVALID);
935
936 /* skip partial range at end of mem segment */
937 if (mlw.mpleft < CBUF_MAPNP) {
938 end = bitnum + mlw.mpleft;
939 continue;
940 }
941
		/* skip non-aligned pages */
943 off = P2PHASE(pfn, CBUF_MAPNP);
944 if (off != 0) {
945 end -= off;
946 continue;
947 }
948
949 if (!dump_range_check(bitnum, end, pfn))
950 continue;
951
952 ASSERT((sz + CBUF_MAPSIZE) <= cfg->maxvmsize);
953 hat_devload(kas.a_hat, cfg->maxvm + sz, CBUF_MAPSIZE, pfn,
954 PROT_READ | PROT_WRITE, HAT_LOAD_NOCONSIST);
955 sz += CBUF_MAPSIZE;
956 cfg->found4m++;
957
958 /* set the bitmap for both ends to be sure to cover the range */
959 dump_set_used(pfn);
960 dump_set_used(pfn + CBUF_MAPNP - 1);
961
962 if (sz >= cfg->maxsize)
963 goto foundmax;
964 }
965
966 /* Add small pages if we can't find enough large pages. */
967 dump_init_memlist_walker(&mlw);
968 for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum = end) {
969 dump_timeleft = dump_timeout;
970 end = bitnum + CBUF_MAPNP;
971 pfn = dump_bitnum_to_pfn(bitnum, &mlw);
972 ASSERT(pfn != PFN_INVALID);
973
974 /* Find any non-aligned pages at start and end of segment. */
975 off = P2PHASE(pfn, CBUF_MAPNP);
976 if (mlw.mpleft < CBUF_MAPNP) {
977 end = bitnum + mlw.mpleft;
978 } else if (off != 0) {
979 end -= off;
980 } else if (cfg->found4m && dump_test_used(pfn)) {
981 continue;
982 }
983
984 for (; bitnum < end; bitnum++, pfn++) {
985 dump_timeleft = dump_timeout;
986 if (BT_TEST(dumpcfg.bitmap, bitnum))
987 continue;
988 if (!dump_pfn_check(pfn))
989 continue;
990 ASSERT((sz + PAGESIZE) <= cfg->maxvmsize);
991 hat_devload(kas.a_hat, cfg->maxvm + sz, PAGESIZE, pfn,
992 PROT_READ | PROT_WRITE, HAT_LOAD_NOCONSIST);
993 sz += PAGESIZE;
994 cfg->foundsm++;
995 dump_set_used(pfn);
996 if (sz >= cfg->maxsize)
997 goto foundmax;
998 }
999 }
1000
1001 /* Fall back to lzjb if we did not get enough memory for bzip2. */
1002 endsz = (cfg->maxsize * cfg->threshold) / cfg->nhelper;
1003 if (sz < endsz) {
1004 cfg->clevel = DUMP_CLEVEL_LZJB;
1005 }
1006
1007 /* Allocate memory for as many helpers as we can. */
1008 foundmax:
1009
1010 /* Byte offsets into memory found and mapped above */
1011 endsz = sz;
1012 sz = 0;
1013
1014 /* Set the size for bzip2 state. Only bzip2 needs it. */
1015 bz2size = BZ2_bzCompressInitSize(dump_bzip2_level);
1016
	/* Skip the preallocated output buffers. */
1018 cp = &cfg->cbuf[MINCBUFS];
1019
1020 /* Use this to move memory up from the preallocated helpers. */
1021 ohp = cfg->helper;
1022
1023 /* Loop over all helpers and allocate memory. */
1024 for (hp = cfg->helper; hp < endhp; hp++) {
1025
1026 /* Skip preallocated helpers by checking hp->page. */
1027 if (hp->page == NULL) {
1028 if (cfg->clevel <= DUMP_CLEVEL_LZJB) {
1029 /* lzjb needs 2 1-page buffers */
1030 if ((sz + (2 * PAGESIZE)) > endsz)
1031 break;
1032 hp->page = cfg->maxvm + sz;
1033 sz += PAGESIZE;
1034 hp->lzbuf = cfg->maxvm + sz;
1035 sz += PAGESIZE;
1036
1037 } else if (ohp->lzbuf != NULL) {
				/* re-use the preallocated lzjb page for bzip2 */
1039 hp->page = ohp->lzbuf;
1040 ohp->lzbuf = NULL;
1041 ++ohp;
1042
1043 } else {
1044 /* bzip2 needs a 1-page buffer */
1045 if ((sz + PAGESIZE) > endsz)
1046 break;
1047 hp->page = cfg->maxvm + sz;
1048 sz += PAGESIZE;
1049 }
1050 }
1051
1052 /*
1053 * Add output buffers per helper. The number of
1054 * buffers per helper is determined by the ratio of
1055 * ncbuf to nhelper.
1056 */
1057 for (k = 0; cp < endcp && (sz + CBUF_SIZE) <= endsz &&
1058 k < NCBUF_PER_HELPER; k++) {
1059 cp->state = CBUF_FREEBUF;
1060 cp->size = CBUF_SIZE;
1061 cp->buf = cfg->maxvm + sz;
1062 sz += CBUF_SIZE;
1063 ++cp;
1064 }
1065
1066 /*
1067 * bzip2 needs compression state. Use the dumpbzalloc
1068 * and dumpbzfree callbacks to allocate the memory.
1069 * bzip2 does allocation only at init time.
1070 */
1071 if (cfg->clevel >= DUMP_CLEVEL_BZIP2) {
1072 if ((sz + bz2size) > endsz) {
1073 hp->page = NULL;
1074 break;
1075 } else {
1076 hp->bzstream.opaque = &sz;
1077 hp->bzstream.bzalloc = dumpbzalloc;
1078 hp->bzstream.bzfree = dumpbzfree;
1079 (void) BZ2_bzCompressInit(&hp->bzstream,
1080 dump_bzip2_level, 0, 0);
1081 hp->bzstream.opaque = NULL;
1082 }
1083 }
1084 }
1085
1086 /* Finish allocating output buffers */
1087 for (; cp < endcp && (sz + CBUF_SIZE) <= endsz; cp++) {
1088 cp->state = CBUF_FREEBUF;
1089 cp->size = CBUF_SIZE;
1090 cp->buf = cfg->maxvm + sz;
1091 sz += CBUF_SIZE;
1092 }
1093
1094 /* Enable IS_DUMP_PAGE macro, which checks for pages we took. */
1095 if (cfg->found4m || cfg->foundsm)
1096 dump_check_used = 1;
1097
1098 ASSERT(sz <= endsz);
1099 }
1100
1101 static void
1102 dumphdr_init(void)
1103 {
1104 pgcnt_t npages = 0;
1105
1106 ASSERT(MUTEX_HELD(&dump_lock));
1107
1108 if (dumphdr == NULL) {
1109 dumphdr = kmem_zalloc(sizeof (dumphdr_t), KM_SLEEP);
1110 dumphdr->dump_magic = DUMP_MAGIC;
1111 dumphdr->dump_version = DUMP_VERSION;
1112 dumphdr->dump_wordsize = DUMP_WORDSIZE;
1113 dumphdr->dump_pageshift = PAGESHIFT;
1114 dumphdr->dump_pagesize = PAGESIZE;
1115 dumphdr->dump_utsname = utsname;
1116 (void) strcpy(dumphdr->dump_platform, platform);
1117 dumpbuf.size = dumpbuf_iosize(maxphys);
1118 dumpbuf.start = kmem_alloc(dumpbuf.size, KM_SLEEP);
1119 dumpbuf.end = dumpbuf.start + dumpbuf.size;
1120 dumpcfg.pids = kmem_alloc(v.v_proc * sizeof (pid_t), KM_SLEEP);
1121 dumpcfg.helpermap = kmem_zalloc(BT_SIZEOFMAP(NCPU), KM_SLEEP);
1122 LOCK_INIT_HELD(&dumpcfg.helper_lock);
1123 dump_stack_scratch = kmem_alloc(STACK_BUF_SIZE, KM_SLEEP);
1124 (void) strncpy(dumphdr->dump_uuid, dump_get_uuid(),
1125 sizeof (dumphdr->dump_uuid));
1126 }
1127
1128 npages = num_phys_pages();
1129
1130 if (dumpcfg.bitmapsize != npages) {
1131 size_t rlen = CBUF_MAPP2R(P2ROUNDUP(npages, CBUF_MAPNP));
1132 void *map = kmem_alloc(BT_SIZEOFMAP(npages), KM_SLEEP);
1133 void *rmap = kmem_alloc(BT_SIZEOFMAP(rlen), KM_SLEEP);
1134
1135 if (dumpcfg.bitmap != NULL)
1136 kmem_free(dumpcfg.bitmap, BT_SIZEOFMAP(dumpcfg.
1137 bitmapsize));
1138 if (dumpcfg.rbitmap != NULL)
1139 kmem_free(dumpcfg.rbitmap, BT_SIZEOFMAP(dumpcfg.
1140 rbitmapsize));
1141 dumpcfg.bitmap = map;
1142 dumpcfg.bitmapsize = npages;
1143 dumpcfg.rbitmap = rmap;
1144 dumpcfg.rbitmapsize = rlen;
1145 }
1146 }
1147
1148 /*
1149 * Establish a new dump device.
1150 */
1151 int
1152 dumpinit(vnode_t *vp, char *name, int justchecking)
1153 {
1154 vnode_t *cvp;
1155 vattr_t vattr;
1156 vnode_t *cdev_vp;
1157 int error = 0;
1158
1159 ASSERT(MUTEX_HELD(&dump_lock));
1160
1161 dumphdr_init();
1162
1163 cvp = common_specvp(vp);
1164 if (cvp == dumpvp)
1165 return (0);
1166
1167 /*
1168 * Determine whether this is a plausible dump device. We want either:
1169 * (1) a real device that's not mounted and has a cb_dump routine, or
1170 * (2) a swapfile on some filesystem that has a vop_dump routine.
1171 */
1172 if ((error = VOP_OPEN(&cvp, FREAD | FWRITE, kcred, NULL)) != 0)
1173 return (error);
1174
1175 vattr.va_mask = AT_SIZE | AT_TYPE | AT_RDEV;
1176 if ((error = VOP_GETATTR(cvp, &vattr, 0, kcred, NULL)) == 0) {
1177 if (vattr.va_type == VBLK || vattr.va_type == VCHR) {
1178 if (devopsp[getmajor(vattr.va_rdev)]->
1179 devo_cb_ops->cb_dump == nodev)
1180 error = ENOTSUP;
1181 else if (vfs_devismounted(vattr.va_rdev))
1182 error = EBUSY;
1183 if (strcmp(ddi_driver_name(VTOS(cvp)->s_dip),
1184 ZFS_DRIVER) == 0 &&
1185 IS_SWAPVP(common_specvp(cvp)))
1186 error = EBUSY;
1187 } else {
1188 if (vn_matchopval(cvp, VOPNAME_DUMP, fs_nosys) ||
1189 !IS_SWAPVP(cvp))
1190 error = ENOTSUP;
1191 }
1192 }
1193
1194 if (error == 0 && vattr.va_size < 2 * DUMP_LOGSIZE + DUMP_ERPTSIZE)
1195 error = ENOSPC;
1196
1197 if (error || justchecking) {
1198 (void) VOP_CLOSE(cvp, FREAD | FWRITE, 1, (offset_t)0,
1199 kcred, NULL);
1200 return (error);
1201 }
1202
1203 VN_HOLD(cvp);
1204
1205 if (dumpvp != NULL)
1206 dumpfini(); /* unconfigure the old dump device */
1207
1208 dumpvp = cvp;
1209 dumpvp_size = vattr.va_size & -DUMP_OFFSET;
1210 dumppath = kmem_alloc(strlen(name) + 1, KM_SLEEP);
1211 (void) strcpy(dumppath, name);
1212 dumpbuf.iosize = 0;
1213
1214 /*
1215 * If the dump device is a block device, attempt to open up the
1216 * corresponding character device and determine its maximum transfer
1217 * size. We use this information to potentially resize dumpbuf to a
 * larger and more optimal size for performing I/O to the dump device.
1219 */
1220 if (cvp->v_type == VBLK &&
1221 (cdev_vp = makespecvp(VTOS(cvp)->s_dev, VCHR)) != NULL) {
1222 if (VOP_OPEN(&cdev_vp, FREAD | FWRITE, kcred, NULL) == 0) {
1223 size_t blk_size;
1224 struct dk_cinfo dki;
1225 struct dk_minfo minf;
1226
1227 if (VOP_IOCTL(cdev_vp, DKIOCGMEDIAINFO,
1228 (intptr_t)&minf, FKIOCTL, kcred, NULL, NULL)
1229 == 0 && minf.dki_lbsize != 0)
1230 blk_size = minf.dki_lbsize;
1231 else
1232 blk_size = DEV_BSIZE;
1233
			if (VOP_IOCTL(cdev_vp, DKIOCINFO, (intptr_t)&dki,
			    FKIOCTL, kcred, NULL, NULL) == 0) {
				dumpbuf.iosize = dki.dki_maxtransfer * blk_size;
				dumpbuf_resize();

				/*
				 * If we are working with a zvol then
				 * dumpify it if it's not being used as
				 * swap. Note that dki is only valid
				 * here, after a successful DKIOCINFO.
				 */
				if (strcmp(dki.dki_dname, ZVOL_DRIVER) == 0) {
					if (IS_SWAPVP(common_specvp(cvp)))
						error = EBUSY;
					else if ((error = VOP_IOCTL(cdev_vp,
					    DKIOCDUMPINIT, NULL, FKIOCTL,
					    kcred, NULL, NULL)) != 0)
						dumpfini();
				}
			}
1251
1252 (void) VOP_CLOSE(cdev_vp, FREAD | FWRITE, 1, 0,
1253 kcred, NULL);
1254 }
1255
1256 VN_RELE(cdev_vp);
1257 }
1258
1259 cmn_err(CE_CONT, "?dump on %s size %llu MB\n", name, dumpvp_size >> 20);
1260
1261 dump_update_clevel();
1262
1263 return (error);
1264 }
1265
1266 void
1267 dumpfini(void)
1268 {
1269 vattr_t vattr;
1270 boolean_t is_zfs = B_FALSE;
1271 vnode_t *cdev_vp;
1272 ASSERT(MUTEX_HELD(&dump_lock));
1273
1274 kmem_free(dumppath, strlen(dumppath) + 1);
1275
1276 /*
1277 * Determine if we are using zvols for our dump device
1278 */
1279 vattr.va_mask = AT_RDEV;
1280 if (VOP_GETATTR(dumpvp, &vattr, 0, kcred, NULL) == 0) {
1281 is_zfs = (getmajor(vattr.va_rdev) ==
1282 ddi_name_to_major(ZFS_DRIVER)) ? B_TRUE : B_FALSE;
1283 }
1284
1285 /*
1286 * If we have a zvol dump device then we call into zfs so
1287 * that it may have a chance to cleanup.
1288 */
1289 if (is_zfs &&
1290 (cdev_vp = makespecvp(VTOS(dumpvp)->s_dev, VCHR)) != NULL) {
1291 if (VOP_OPEN(&cdev_vp, FREAD | FWRITE, kcred, NULL) == 0) {
1292 (void) VOP_IOCTL(cdev_vp, DKIOCDUMPFINI, NULL, FKIOCTL,
1293 kcred, NULL, NULL);
1294 (void) VOP_CLOSE(cdev_vp, FREAD | FWRITE, 1, 0,
1295 kcred, NULL);
1296 }
1297 VN_RELE(cdev_vp);
1298 }
1299
1300 (void) VOP_CLOSE(dumpvp, FREAD | FWRITE, 1, (offset_t)0, kcred, NULL);
1301
1302 VN_RELE(dumpvp);
1303
1304 dumpvp = NULL;
1305 dumpvp_size = 0;
1306 dumppath = NULL;
1307 }
1308
1309 static offset_t
1310 dumpvp_flush(void)
1311 {
1312 size_t size = P2ROUNDUP(dumpbuf.cur - dumpbuf.start, PAGESIZE);
1313 hrtime_t iotime;
1314 int err;
1315
1316 if (dumpbuf.vp_off + size > dumpbuf.vp_limit) {
1317 dump_ioerr = ENOSPC;
1318 dumpbuf.vp_off = dumpbuf.vp_limit;
1319 } else if (size != 0) {
1320 iotime = gethrtime();
1321 dumpsync.iowait += iotime - dumpsync.iowaitts;
1322 if (panicstr)
1323 err = VOP_DUMP(dumpvp, dumpbuf.start,
1324 lbtodb(dumpbuf.vp_off), btod(size), NULL);
1325 else
1326 err = vn_rdwr(UIO_WRITE, dumpbuf.cdev_vp != NULL ?
1327 dumpbuf.cdev_vp : dumpvp, dumpbuf.start, size,
1328 dumpbuf.vp_off, UIO_SYSSPACE, 0, dumpbuf.vp_limit,
1329 kcred, 0);
1330 if (err && dump_ioerr == 0)
1331 dump_ioerr = err;
1332 dumpsync.iowaitts = gethrtime();
1333 dumpsync.iotime += dumpsync.iowaitts - iotime;
1334 dumpsync.nwrite += size;
1335 dumpbuf.vp_off += size;
1336 }
1337 dumpbuf.cur = dumpbuf.start;
1338 dump_timeleft = dump_timeout;
1339 return (dumpbuf.vp_off);
1340 }
1341
1342 /* maximize write speed by keeping seek offset aligned with size */
1343 void
1344 dumpvp_write(const void *va, size_t size)
1345 {
1346 size_t len, off, sz;
1347
1348 while (size != 0) {
1349 len = MIN(size, dumpbuf.end - dumpbuf.cur);
1350 if (len == 0) {
1351 off = P2PHASE(dumpbuf.vp_off, dumpbuf.size);
1352 if (off == 0 || !ISP2(dumpbuf.size)) {
1353 (void) dumpvp_flush();
1354 } else {
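				/*
				 * Flush just enough (sz bytes) to
				 * realign vp_off with the buffer
				 * size, then slide the unwritten
				 * tail (off bytes) back to the start
				 * of the buffer.
				 */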
1355 sz = dumpbuf.size - off;
1356 dumpbuf.cur = dumpbuf.start + sz;
1357 (void) dumpvp_flush();
1358 ovbcopy(dumpbuf.start + sz, dumpbuf.start, off);
1359 dumpbuf.cur += off;
1360 }
1361 } else {
1362 bcopy(va, dumpbuf.cur, len);
1363 va = (char *)va + len;
1364 dumpbuf.cur += len;
1365 size -= len;
1366 }
1367 }
1368 }
1369
1370 /*ARGSUSED*/
1371 static void
1372 dumpvp_ksyms_write(const void *src, void *dst, size_t size)
1373 {
1374 dumpvp_write(src, size);
1375 }
1376
1377 /*
1378 * Mark 'pfn' in the bitmap and dump its translation table entry.
1379 */
1380 void
1381 dump_addpage(struct as *as, void *va, pfn_t pfn)
1382 {
1383 mem_vtop_t mem_vtop;
1384 pgcnt_t bitnum;
1385
1386 if ((bitnum = dump_pfn_to_bitnum(pfn)) != (pgcnt_t)-1) {
1387 if (!BT_TEST(dumpcfg.bitmap, bitnum)) {
1388 dumphdr->dump_npages++;
1389 BT_SET(dumpcfg.bitmap, bitnum);
1390 }
1391 dumphdr->dump_nvtop++;
1392 mem_vtop.m_as = as;
1393 mem_vtop.m_va = va;
1394 mem_vtop.m_pfn = pfn;
1395 dumpvp_write(&mem_vtop, sizeof (mem_vtop_t));
1396 }
1397 dump_timeleft = dump_timeout;
1398 }
1399
1400 /*
1401 * Mark 'pfn' in the bitmap
1402 */
1403 void
1404 dump_page(pfn_t pfn)
1405 {
1406 pgcnt_t bitnum;
1407
1408 if ((bitnum = dump_pfn_to_bitnum(pfn)) != (pgcnt_t)-1) {
1409 if (!BT_TEST(dumpcfg.bitmap, bitnum)) {
1410 dumphdr->dump_npages++;
1411 BT_SET(dumpcfg.bitmap, bitnum);
1412 }
1413 }
1414 dump_timeleft = dump_timeout;
1415 }
1416
1417 /*
1418 * Dump the <as, va, pfn> information for a given address space.
1419 * SEGOP_DUMP() will call dump_addpage() for each page in the segment.
1420 */
1421 static void
1422 dump_as(struct as *as)
1423 {
1424 struct seg *seg;
1425
1426 AS_LOCK_ENTER(as, RW_READER);
1427 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
1428 if (seg->s_as != as)
1429 break;
1430 if (seg->s_ops == NULL)
1431 continue;
1432 SEGOP_DUMP(seg);
1433 }
1434 AS_LOCK_EXIT(as);
1435
1436 if (seg != NULL)
1437 cmn_err(CE_WARN, "invalid segment %p in address space %p",
1438 (void *)seg, (void *)as);
1439 }
1440
1441 static int
1442 dump_process(pid_t pid)
1443 {
1444 proc_t *p = sprlock(pid);
1445
1446 if (p == NULL)
1447 return (-1);
1448 if (p->p_as != &kas) {
1449 mutex_exit(&p->p_lock);
1450 dump_as(p->p_as);
1451 mutex_enter(&p->p_lock);
1452 }
1453
1454 sprunlock(p);
1455
1456 return (0);
1457 }
1458
1459 /*
1460 * The following functions (dump_summary(), dump_ereports(), and
1461 * dump_messages()), write data to an uncompressed area within the
1462 * crashdump. The layout of these is
1463 *
1464 * +------------------------------------------------------------+
1465 * | compressed pages | summary | ereports | messages |
1466 * +------------------------------------------------------------+
1467 *
1468 * With the advent of saving a compressed crash dump by default, we
1469 * need to save a little more data to describe the failure mode in
1470 * an uncompressed buffer available before savecore uncompresses
1471 * the dump. Initially this is a copy of the stack trace. Additional
1472 * summary information should be added here.
1473 */
1474
1475 void
1476 dump_summary(void)
1477 {
1478 u_offset_t dumpvp_start;
1479 summary_dump_t sd;
1480
1481 if (dumpvp == NULL || dumphdr == NULL)
1482 return;
1483
1484 dumpbuf.cur = dumpbuf.start;
1485
1486 dumpbuf.vp_limit = dumpvp_size - (DUMP_OFFSET + DUMP_LOGSIZE +
1487 DUMP_ERPTSIZE);
1488 dumpvp_start = dumpbuf.vp_limit - DUMP_SUMMARYSIZE;
1489 dumpbuf.vp_off = dumpvp_start;
1490
1491 sd.sd_magic = SUMMARY_MAGIC;
1492 sd.sd_ssum = checksum32(dump_stack_scratch, STACK_BUF_SIZE);
1493 dumpvp_write(&sd, sizeof (sd));
1494 dumpvp_write(dump_stack_scratch, STACK_BUF_SIZE);
1495
1496 sd.sd_magic = 0; /* indicate end of summary */
1497 dumpvp_write(&sd, sizeof (sd));
1498 (void) dumpvp_flush();
1499 }
1500
1501 void
1502 dump_ereports(void)
1503 {
1504 u_offset_t dumpvp_start;
1505 erpt_dump_t ed;
1506
1507 if (dumpvp == NULL || dumphdr == NULL)
1508 return;
1509
1510 dumpbuf.cur = dumpbuf.start;
1511 dumpbuf.vp_limit = dumpvp_size - (DUMP_OFFSET + DUMP_LOGSIZE);
1512 dumpvp_start = dumpbuf.vp_limit - DUMP_ERPTSIZE;
1513 dumpbuf.vp_off = dumpvp_start;
1514
1515 fm_ereport_dump();
1516 if (panicstr)
1517 errorq_dump();
1518
1519 bzero(&ed, sizeof (ed)); /* indicate end of ereports */
1520 dumpvp_write(&ed, sizeof (ed));
1521 (void) dumpvp_flush();
1522
1523 if (!panicstr) {
1524 (void) VOP_PUTPAGE(dumpvp, dumpvp_start,
1525 (size_t)(dumpbuf.vp_off - dumpvp_start),
1526 B_INVAL | B_FORCE, kcred, NULL);
1527 }
1528 }
1529
1530 void
1531 dump_messages(void)
1532 {
1533 log_dump_t ld;
1534 mblk_t *mctl, *mdata;
1535 queue_t *q, *qlast;
1536 u_offset_t dumpvp_start;
1537
1538 if (dumpvp == NULL || dumphdr == NULL || log_consq == NULL)
1539 return;
1540
1541 dumpbuf.cur = dumpbuf.start;
1542 dumpbuf.vp_limit = dumpvp_size - DUMP_OFFSET;
1543 dumpvp_start = dumpbuf.vp_limit - DUMP_LOGSIZE;
1544 dumpbuf.vp_off = dumpvp_start;
1545
1546 qlast = NULL;
1547 do {
1548 for (q = log_consq; q->q_next != qlast; q = q->q_next)
1549 continue;
1550 for (mctl = q->q_first; mctl != NULL; mctl = mctl->b_next) {
1551 dump_timeleft = dump_timeout;
1552 mdata = mctl->b_cont;
1553 ld.ld_magic = LOG_MAGIC;
1554 ld.ld_msgsize = MBLKL(mctl->b_cont);
1555 ld.ld_csum = checksum32(mctl->b_rptr, MBLKL(mctl));
1556 ld.ld_msum = checksum32(mdata->b_rptr, MBLKL(mdata));
1557 dumpvp_write(&ld, sizeof (ld));
1558 dumpvp_write(mctl->b_rptr, MBLKL(mctl));
1559 dumpvp_write(mdata->b_rptr, MBLKL(mdata));
1560 }
1561 } while ((qlast = q) != log_consq);
1562
1563 ld.ld_magic = 0; /* indicate end of messages */
1564 dumpvp_write(&ld, sizeof (ld));
1565 (void) dumpvp_flush();
1566 if (!panicstr) {
1567 (void) VOP_PUTPAGE(dumpvp, dumpvp_start,
1568 (size_t)(dumpbuf.vp_off - dumpvp_start),
1569 B_INVAL | B_FORCE, kcred, NULL);
1570 }
1571 }
1572
1573 /*
1574 * The following functions are called on multiple CPUs during dump.
1575 * They must not use most kernel services, because all cross-calls are
1576 * disabled during panic. Therefore, blocking locks and cache flushes
1577 * will not work.
1578 */
1579
1580 /*
1581 * Copy pages, trapping ECC errors. Also, for robustness, trap data
1582 * access in case something goes wrong in the hat layer and the
1583 * mapping is broken.
1584 */
1585 static int
1586 dump_pagecopy(void *src, void *dst)
1587 {
1588 long *wsrc = (long *)src;
1589 long *wdst = (long *)dst;
1590 const ulong_t ncopies = PAGESIZE / sizeof (long);
1591 volatile int w = 0;
1592 volatile int ueoff = -1;
1593 on_trap_data_t otd;
1594
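	/*
	 * If an ECC or data access trap fires during the copy loop
	 * below, control resumes here with on_trap() returning
	 * nonzero; the faulting word is replaced with a recognizable
	 * sentinel and the copy continues at the next word (w is
	 * volatile so the resumed code sees the current index).
	 */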
1595 if (on_trap(&otd, OT_DATA_EC | OT_DATA_ACCESS)) {
1596 if (ueoff == -1)
1597 ueoff = w * sizeof (long);
1598 /* report "bad ECC" or "bad address" */
1599 #ifdef _LP64
1600 if (otd.ot_trap & OT_DATA_EC)
1601 wdst[w++] = 0x00badecc00badecc;
1602 else
1603 wdst[w++] = 0x00badadd00badadd;
1604 #else
1605 if (otd.ot_trap & OT_DATA_EC)
1606 wdst[w++] = 0x00badecc;
1607 else
1608 wdst[w++] = 0x00badadd;
1609 #endif
1610 }
1611 while (w < ncopies) {
1612 wdst[w] = wsrc[w];
1613 w++;
1614 }
1615 no_trap();
1616 return (ueoff);
1617 }
1618
1619 static void
1620 dumpsys_close_cq(cqueue_t *cq, int live)
1621 {
1622 if (live) {
1623 mutex_enter(&cq->mutex);
1624 atomic_dec_uint(&cq->open);
1625 cv_signal(&cq->cv);
1626 mutex_exit(&cq->mutex);
1627 } else {
1628 atomic_dec_uint(&cq->open);
1629 }
1630 }
1631
1632 static inline void
1633 dumpsys_spinlock(lock_t *lp)
1634 {
1635 uint_t backoff = 0;
1636 int loop_count = 0;
1637
1638 while (LOCK_HELD(lp) || !lock_spin_try(lp)) {
1639 if (++loop_count >= ncpus) {
1640 backoff = mutex_lock_backoff(0);
1641 loop_count = 0;
1642 } else {
1643 backoff = mutex_lock_backoff(backoff);
1644 }
1645 mutex_lock_delay(backoff);
1646 }
1647 }
1648
1649 static inline void
1650 dumpsys_spinunlock(lock_t *lp)
1651 {
1652 lock_clear(lp);
1653 }
1654
1655 static inline void
1656 dumpsys_lock(cqueue_t *cq, int live)
1657 {
1658 if (live)
1659 mutex_enter(&cq->mutex);
1660 else
1661 dumpsys_spinlock(&cq->spinlock);
1662 }
1663
1664 static inline void
1665 dumpsys_unlock(cqueue_t *cq, int live, int signal)
1666 {
1667 if (live) {
1668 if (signal)
1669 cv_signal(&cq->cv);
1670 mutex_exit(&cq->mutex);
1671 } else {
1672 dumpsys_spinunlock(&cq->spinlock);
1673 }
1674 }
1675
1676 static void
1677 dumpsys_wait_cq(cqueue_t *cq, int live)
1678 {
1679 if (live) {
1680 cv_wait(&cq->cv, &cq->mutex);
1681 } else {
1682 dumpsys_spinunlock(&cq->spinlock);
1683 while (cq->open)
1684 if (cq->first)
1685 break;
1686 dumpsys_spinlock(&cq->spinlock);
1687 }
1688 }
1689
1690 static void
1691 dumpsys_put_cq(cqueue_t *cq, cbuf_t *cp, int newstate, int live)
1692 {
1693 if (cp == NULL)
1694 return;
1695
1696 dumpsys_lock(cq, live);
1697
1698 if (cq->ts != 0) {
1699 cq->empty += gethrtime() - cq->ts;
1700 cq->ts = 0;
1701 }
1702
1703 cp->state = newstate;
1704 cp->next = NULL;
1705 if (cq->last == NULL)
1706 cq->first = cp;
1707 else
1708 cq->last->next = cp;
1709 cq->last = cp;
1710
1711 dumpsys_unlock(cq, live, 1);
1712 }
1713
1714 static cbuf_t *
1715 dumpsys_get_cq(cqueue_t *cq, int live)
1716 {
1717 cbuf_t *cp;
1718 hrtime_t now = gethrtime();
1719
1720 dumpsys_lock(cq, live);
1721
1722 /* CONSTCOND */
1723 while (1) {
1724 cp = (cbuf_t *)cq->first;
1725 if (cp == NULL) {
1726 if (cq->open == 0)
1727 break;
1728 dumpsys_wait_cq(cq, live);
1729 continue;
1730 }
1731 cq->first = cp->next;
1732 if (cq->first == NULL) {
1733 cq->last = NULL;
1734 cq->ts = now;
1735 }
1736 break;
1737 }
1738
1739 dumpsys_unlock(cq, live, cq->first != NULL || cq->open == 0);
1740 return (cp);
1741 }
1742
1743 /*
 * Send an error message to the console. If the main task is running,
 * just write the message via uprintf. If a helper is running, the
 * message has to be put on a queue for the main task. Setting fmt to
1747 * NULL means flush the error message buffer. If fmt is not NULL, just
1748 * add the text to the existing buffer.
1749 */
1750 static void
1751 dumpsys_errmsg(helper_t *hp, const char *fmt, ...)
1752 {
1753 dumpsync_t *ds = hp->ds;
1754 cbuf_t *cp = hp->cperr;
1755 va_list adx;
1756
1757 if (hp->helper == MAINHELPER) {
1758 if (fmt != NULL) {
1759 if (ds->neednl) {
1760 uprintf("\n");
1761 ds->neednl = 0;
1762 }
1763 va_start(adx, fmt);
1764 vuprintf(fmt, adx);
1765 va_end(adx);
1766 }
1767 } else if (fmt == NULL) {
1768 if (cp != NULL) {
1769 CQ_PUT(mainq, cp, CBUF_ERRMSG);
1770 hp->cperr = NULL;
1771 }
1772 } else {
1773 if (hp->cperr == NULL) {
1774 cp = CQ_GET(freebufq);
1775 hp->cperr = cp;
1776 cp->used = 0;
1777 }
1778 va_start(adx, fmt);
1779 cp->used += vsnprintf(cp->buf + cp->used, cp->size - cp->used,
1780 fmt, adx);
1781 va_end(adx);
1782 if ((cp->used + LOG_MSGSIZE) > cp->size) {
1783 CQ_PUT(mainq, cp, CBUF_ERRMSG);
1784 hp->cperr = NULL;
1785 }
1786 }
1787 }
1788
1789 /*
 * Write an output buffer to the dump file. If the main task is
 * running, just write the data. If a helper is running, the output is
 * placed on a queue for the main task.
1793 */
1794 static void
1795 dumpsys_swrite(helper_t *hp, cbuf_t *cp, size_t used)
1796 {
1797 dumpsync_t *ds = hp->ds;
1798
1799 if (hp->helper == MAINHELPER) {
1800 HRSTART(ds->perpage, write);
1801 dumpvp_write(cp->buf, used);
1802 HRSTOP(ds->perpage, write);
1803 CQ_PUT(freebufq, cp, CBUF_FREEBUF);
1804 } else {
1805 cp->used = used;
1806 CQ_PUT(mainq, cp, CBUF_WRITE);
1807 }
1808 }
1809
1810 /*
1811 * Copy one page within the mapped range. The offset starts at 0 and
1812 * is relative to the first pfn. cp->buf + cp->off is the address of
1813 * the first pfn. If dump_pagecopy returns a UE offset, create an
1814 * error message. Returns the offset to the next pfn in the range
1815 * selected by the bitmap.
1816 */
1817 static int
1818 dumpsys_copy_page(helper_t *hp, int offset)
1819 {
1820 cbuf_t *cp = hp->cpin;
1821 int ueoff;
1822
1823 ASSERT(cp->off + offset + PAGESIZE <= cp->size);
1824 ASSERT(BT_TEST(dumpcfg.bitmap, cp->bitnum));
1825
1826 ueoff = dump_pagecopy(cp->buf + cp->off + offset, hp->page);
1827
1828 /* ueoff is the offset in the page to a UE error */
1829 if (ueoff != -1) {
1830 uint64_t pa = ptob(cp->pfn) + offset + ueoff;
1831
1832 dumpsys_errmsg(hp, "cpu %d: memory error at PA 0x%08x.%08x\n",
1833 CPU->cpu_id, (uint32_t)(pa >> 32), (uint32_t)pa);
1834 }
1835
1836 /*
1837 * Advance bitnum and offset to the next input page for the
1838 * next call to this function.
1839 */
1840 offset += PAGESIZE;
1841 cp->bitnum++;
1842 while (cp->off + offset < cp->size) {
1843 if (BT_TEST(dumpcfg.bitmap, cp->bitnum))
1844 break;
1845 offset += PAGESIZE;
1846 cp->bitnum++;
1847 }
1848
1849 return (offset);
1850 }
1851
1852 /*
1853 * Read the helper queue, and copy one mapped page. Return 0 when
1854 * done. Return 1 when a page has been copied into hp->page.
1855 */
1856 static int
1857 dumpsys_sread(helper_t *hp)
1858 {
1859 dumpsync_t *ds = hp->ds;
1860
1861 /* CONSTCOND */
1862 while (1) {
1863
1864 /* Find the next input buffer. */
1865 if (hp->cpin == NULL) {
1866 HRSTART(hp->perpage, inwait);
1867
1868 /* CONSTCOND */
1869 while (1) {
1870 hp->cpin = CQ_GET(helperq);
1871 dump_timeleft = dump_timeout;
1872
1873 /*
1874 * NULL return means the helper queue
1875 * is closed and empty.
1876 */
1877 if (hp->cpin == NULL)
1878 break;
1879
1880 /* Have input, check for dump I/O error. */
1881 if (!dump_ioerr)
1882 break;
1883
1884 /*
1885 * If an I/O error occurs, stay in the
1886 * loop in order to empty the helper
				 * queue. Return each buffer to the
				 * main task to be unmapped and freed.
1889 */
1890 hp->cpin->used = 0;
1891 CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP);
1892 }
1893 HRSTOP(hp->perpage, inwait);
1894
1895 /* Stop here when the helper queue is closed. */
1896 if (hp->cpin == NULL)
1897 break;
1898
1899 /* Set the offset to 0 to get the first pfn. */
1900 hp->in = 0;
1901
1902 /* Set the total processed to 0. */
1903 hp->used = 0;
1904 }
1905
1906 /* Process the next page. */
1907 if (hp->used < hp->cpin->used) {
1908
1909 /*
1910 * Get the next page from the input buffer and
1911 * return a copy.
1912 */
1913 ASSERT(hp->in != -1);
1914 HRSTART(hp->perpage, copy);
1915 hp->in = dumpsys_copy_page(hp, hp->in);
1916 hp->used += PAGESIZE;
1917 HRSTOP(hp->perpage, copy);
1918 break;
1919
1920 } else {
1921
1922 /*
1923 * Done with the input. Flush the VM and
1924 * return the buffer to the main task.
1925 */
1926 if (panicstr && hp->helper != MAINHELPER)
1927 hat_flush_range(kas.a_hat,
1928 hp->cpin->buf, hp->cpin->size);
1929 dumpsys_errmsg(hp, NULL);
1930 CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP);
1931 hp->cpin = NULL;
1932 }
1933 }
1934
1935 return (hp->cpin != NULL);
1936 }
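
/*
 * Consumer pattern, as used by dumpsys_bz2compress() and
 * dumpsys_lzjbcompress() below: each successful return leaves exactly
 * one copied page in hp->page, so a compressor is simply
 *
 *	while (dumpsys_sread(hp)) {
 *		... compress hp->page ...
 *	}
 *
 * A 0 return means the helper queue has been closed and drained, at
 * which point the helper flushes its partial output and exits.
 */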
1937
1938 /*
1939 * Compress size bytes starting at buf with bzip2.
1940 * mode:
1941 * BZ_RUN consume one more input page
1942 * BZ_FINISH no more input, flush the stream state
1943 */
1944 static void
1945 dumpsys_bzrun(helper_t *hp, void *buf, size_t size, int mode)
1946 {
1947 dumpsync_t *ds = hp->ds;
1948 const int CSIZE = sizeof (dumpcsize_t);
1949 bz_stream *ps = &hp->bzstream;
1950 int rc = 0;
1951 uint32_t csize;
1952 dumpcsize_t cs;
1953
1954 /* Set input pointers to new input page */
1955 if (size > 0) {
1956 ps->avail_in = size;
1957 ps->next_in = buf;
1958 }
1959
1960 /* CONSTCOND */
1961 while (1) {
1962
1963 /* Quit when all input has been consumed */
1964 if (ps->avail_in == 0 && mode == BZ_RUN)
1965 break;
1966
1967 /* Get a new output buffer */
1968 if (hp->cpout == NULL) {
1969 HRSTART(hp->perpage, outwait);
1970 hp->cpout = CQ_GET(freebufq);
1971 HRSTOP(hp->perpage, outwait);
1972 ps->avail_out = hp->cpout->size - CSIZE;
1973 ps->next_out = hp->cpout->buf + CSIZE;
1974 }
1975
1976 /* Compress input, or finalize */
1977 HRSTART(hp->perpage, compress);
1978 rc = BZ2_bzCompress(ps, mode);
1979 HRSTOP(hp->perpage, compress);
1980
1981 /* Check for error */
1982 if (mode == BZ_RUN && rc != BZ_RUN_OK) {
1983 dumpsys_errmsg(hp, "%d: BZ_RUN error %s at page %lx\n",
1984 hp->helper, BZ2_bzErrorString(rc),
1985 hp->cpin->pagenum);
1986 break;
1987 }
1988
1989 /* Write the buffer if it is full, or we are flushing */
1990 if (ps->avail_out == 0 || mode == BZ_FINISH) {
1991 csize = hp->cpout->size - CSIZE - ps->avail_out;
1992 cs = DUMP_SET_TAG(csize, hp->tag);
1993 if (csize > 0) {
1994 (void) memcpy(hp->cpout->buf, &cs, CSIZE);
1995 dumpsys_swrite(hp, hp->cpout, csize + CSIZE);
1996 hp->cpout = NULL;
1997 }
1998 }
1999
2000 /* Check for final completion */
2001 if (mode == BZ_FINISH) {
2002 if (rc == BZ_STREAM_END)
2003 break;
2004 if (rc != BZ_FINISH_OK) {
2005 dumpsys_errmsg(hp, "%d: BZ_FINISH error %s\n",
2006 hp->helper, BZ2_bzErrorString(rc));
2007 break;
2008 }
2009 }
2010 }
2011
2012 /* Cleanup state and buffers */
2013 if (mode == BZ_FINISH) {
2014
2015 /* Reset state so that it is re-usable. */
2016 (void) BZ2_bzCompressReset(&hp->bzstream);
2017
2018 /* Give any unused output buffer to the main task */
2019 if (hp->cpout != NULL) {
2020 hp->cpout->used = 0;
2021 CQ_PUT(mainq, hp->cpout, CBUF_ERRMSG);
2022 hp->cpout = NULL;
2023 }
2024 }
2025 }
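
/*
 * Each block handed to dumpsys_swrite() above is self-framing: one
 * dumpcsize_t word carrying the helper's tag and the compressed size,
 * followed by that many bytes of bzip2 output. A reader of the data
 * section could walk it roughly as follows. This is a sketch only;
 * DUMP_GET_TAG() and DUMP_GET_CSIZE() are stand-ins for whatever
 * decoding of DUMP_SET_TAG() the dump format headers provide:
 *
 *	dumpcsize_t cs;
 *	for (;;) {
 *		(read sizeof (cs) bytes into cs)
 *		if (cs == 0)
 *			break;			(terminator, see dumpsys())
 *		tag = DUMP_GET_TAG(cs);		(which helper stream)
 *		csize = DUMP_GET_CSIZE(cs);	(payload length)
 *		(read csize bytes and feed them to the bzip2
 *		decompressor associated with tag)
 *	}
 */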
2026
2027 static void
2028 dumpsys_bz2compress(helper_t *hp)
2029 {
2030 dumpsync_t *ds = hp->ds;
2031 dumpstreamhdr_t sh;
2032
2033 (void) strcpy(sh.stream_magic, DUMP_STREAM_MAGIC);
2034 sh.stream_pagenum = (pgcnt_t)-1;
2035 sh.stream_npages = 0;
2036 hp->cpin = NULL;
2037 hp->cpout = NULL;
2038 hp->cperr = NULL;
2039 hp->in = 0;
2040 hp->out = 0;
2041 hp->bzstream.avail_in = 0;
2042
2043 /* Bump reference to mainq while we are running */
2044 CQ_OPEN(mainq);
2045
2046 /* Get one page at a time */
2047 while (dumpsys_sread(hp)) {
2048 if (sh.stream_pagenum != hp->cpin->pagenum) {
2049 sh.stream_pagenum = hp->cpin->pagenum;
2050 sh.stream_npages = btop(hp->cpin->used);
2051 dumpsys_bzrun(hp, &sh, sizeof (sh), BZ_RUN);
2052 }
2053 dumpsys_bzrun(hp, hp->page, PAGESIZE, BZ_RUN);
2054 }
2055
2056 /* Done with input, flush any partial buffer */
2057 if (sh.stream_pagenum != (pgcnt_t)-1) {
2058 dumpsys_bzrun(hp, NULL, 0, BZ_FINISH);
2059 dumpsys_errmsg(hp, NULL);
2060 }
2061
2062 ASSERT(hp->cpin == NULL && hp->cpout == NULL && hp->cperr == NULL);
2063
2064 /* Decrement main queue count, we are done */
2065 CQ_CLOSE(mainq);
2066 }
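
/*
 * Decompressed, each helper's stream is a sequence of records of the
 * form
 *
 *	dumpstreamhdr_t	(DUMP_STREAM_MAGIC, stream_pagenum,
 *			 stream_npages)
 *	stream_npages pages of data
 *
 * one record per input mapping, since the header is re-emitted
 * whenever hp->cpin->pagenum changes. stream_pagenum is the logical
 * page number assigned by the main task, so a reader can place pages
 * in order no matter which helper compressed them.
 */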
2067
2068 /*
2069 * Compress with lzjb.
2070 * Write the stream block when it is full or when size==0.
2071 * If csize==0, write a stream header; else write <csize, data>.
2072 * A call with size==0 flushes the current buffer.
2073 * hp->cpout is the buffer we are flushing or filling.
2074 * hp->out is the next index at which to store data.
2075 * osize is either CSIZE plus the data size, or the size of a stream header.
2076 */
2077 static void
2078 dumpsys_lzjbrun(helper_t *hp, size_t csize, void *buf, size_t size)
2079 {
2080 dumpsync_t *ds = hp->ds;
2081 const int CSIZE = sizeof (dumpcsize_t);
2082 dumpcsize_t cs;
2083 size_t osize = csize > 0 ? CSIZE + size : size;
2084
2085 /* If flush, and there is no buffer, just return */
2086 if (size == 0 && hp->cpout == NULL)
2087 return;
2088
2089 /* If flush, or cpout is full, write it out */
2090 if (size == 0 ||
2091 (hp->cpout != NULL && hp->out + osize > hp->cpout->size)) {
2092
2093 /* Set tag+size word at the front of the stream block. */
2094 cs = DUMP_SET_TAG(hp->out - CSIZE, hp->tag);
2095 (void) memcpy(hp->cpout->buf, &cs, CSIZE);
2096
2097 /* Write block to dump file. */
2098 dumpsys_swrite(hp, hp->cpout, hp->out);
2099
2100 /* Clear pointer to indicate we need a new buffer */
2101 hp->cpout = NULL;
2102
2103 /* flushing, we are done */
2104 if (size == 0)
2105 return;
2106 }
2107
2108 /* Get an output buffer if we don't have one. */
2109 if (hp->cpout == NULL) {
2110 HRSTART(hp->perpage, outwait);
2111 hp->cpout = CQ_GET(freebufq);
2112 HRSTOP(hp->perpage, outwait);
2113 hp->out = CSIZE;
2114 }
2115
2116 /* Store csize word. This is the size of compressed data. */
2117 if (csize > 0) {
2118 cs = DUMP_SET_TAG(csize, 0);
2119 (void) memcpy(hp->cpout->buf + hp->out, &cs, CSIZE);
2120 hp->out += CSIZE;
2121 }
2122
2123 /* Store the data. */
2124 (void) memcpy(hp->cpout->buf + hp->out, buf, size);
2125 hp->out += size;
2126 }
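
/*
 * Resulting stream block layout (sketch):
 *
 *	+--------------------------------+  hp->cpout->buf
 *	| dumpcsize_t: tag + block size  |
 *	+--------------------------------+  buf + CSIZE
 *	| stream header, or              |
 *	| <csize word><csize data bytes> |  repeated until the block
 *	| ...                            |  cannot hold the next item
 *	+--------------------------------+  buf + hp->out
 *
 * The inner csize words carry tag 0; only the leading word of each
 * block identifies the helper stream.
 */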
2127
2128 static void
2129 dumpsys_lzjbcompress(helper_t *hp)
2130 {
2131 dumpsync_t *ds = hp->ds;
2132 size_t csize;
2133 dumpstreamhdr_t sh;
2134
2135 (void) strcpy(sh.stream_magic, DUMP_STREAM_MAGIC);
2136 sh.stream_pagenum = (pgcnt_t)-1;
2137 sh.stream_npages = 0;
2138 hp->cpin = NULL;
2139 hp->cpout = NULL;
2140 hp->cperr = NULL;
2141 hp->in = 0;
2142 hp->out = 0;
2143
2144 /* Bump reference to mainq while we are running */
2145 CQ_OPEN(mainq);
2146
2147 /* Get one page at a time */
2148 while (dumpsys_sread(hp)) {
2149
2150 /* Create a stream header for each new input map */
2151 if (sh.stream_pagenum != hp->cpin->pagenum) {
2152 sh.stream_pagenum = hp->cpin->pagenum;
2153 sh.stream_npages = btop(hp->cpin->used);
2154 dumpsys_lzjbrun(hp, 0, &sh, sizeof (sh));
2155 }
2156
2157 /* Compress one page */
2158 HRSTART(hp->perpage, compress);
2159 csize = compress(hp->page, hp->lzbuf, PAGESIZE);
2160 HRSTOP(hp->perpage, compress);
2161
2162 /* Add csize+data to output block */
2163 ASSERT(csize > 0 && csize <= PAGESIZE);
2164 dumpsys_lzjbrun(hp, csize, hp->lzbuf, csize);
2165 }
2166
2167 /* Done with input, flush any partial buffer */
2168 if (sh.stream_pagenum != (pgcnt_t)-1) {
2169 dumpsys_lzjbrun(hp, 0, NULL, 0);
2170 dumpsys_errmsg(hp, NULL);
2171 }
2172
2173 ASSERT(hp->cpin == NULL && hp->cpout == NULL && hp->cperr == NULL);
2174
2175 /* Decrement main queue count, we are done */
2176 CQ_CLOSE(mainq);
2177 }
2178
2179 /*
2180 * Dump helper called from panic_idle() to compress pages. CPUs in
2181 * this path must not call most kernel services.
2182 *
2183 * During panic, all but one of the CPUs are idle. These CPUs are
2184 * used as helpers, working in parallel to copy and compress memory
2185 * pages. While panicking, however, these processors cannot call
2186 * any kernel services, because mutexes become no-ops during panic
2187 * and cross-call interrupts are inhibited. Therefore, during a
2188 * panic dump the helper CPUs communicate with the panic CPU using
2189 * memory variables. All memory mapping and I/O is performed by the
2190 * panic CPU.
2191 *
2192 * At dump configuration time, helper_lock is set and helpers_wanted
2193 * is 0. dumpsys() decides whether to set helpers_wanted before
2194 * clearing helper_lock.
2195 *
2196 * At panic time, idle CPUs spin-wait on helper_lock, then either
2197 * take the lock and become a helper, or return.
2198 */
2199 void
2200 dumpsys_helper()
2201 {
2202 dumpsys_spinlock(&dumpcfg.helper_lock);
2203 if (dumpcfg.helpers_wanted) {
2204 helper_t *hp, *hpend = &dumpcfg.helper[dumpcfg.nhelper];
2205
2206 for (hp = dumpcfg.helper; hp != hpend; hp++) {
2207 if (hp->helper == FREEHELPER) {
2208 hp->helper = CPU->cpu_id;
2209 BT_SET(dumpcfg.helpermap, CPU->cpu_seqid);
2210
2211 dumpsys_spinunlock(&dumpcfg.helper_lock);
2212
2213 if (dumpcfg.clevel < DUMP_CLEVEL_BZIP2)
2214 dumpsys_lzjbcompress(hp);
2215 else
2216 dumpsys_bz2compress(hp);
2217
2218 hp->helper = DONEHELPER;
2219 return;
2220 }
2221 }
2222
2223 /* No more helpers are needed. */
2224 dumpcfg.helpers_wanted = 0;
2226 }
2227 dumpsys_spinunlock(&dumpcfg.helper_lock);
2228 }
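
/*
 * The handshake in timeline form (illustrative):
 *
 *	configuration:	helper_lock is locked, helpers_wanted is 0
 *	panic CPU:	dumpsys() sets helpers_wanted if clevel > 0,
 *			then drops helper_lock
 *	each idle CPU:	spins on helper_lock in dumpsys_helper(),
 *			claims a FREEHELPER slot, sets its bit in
 *			helpermap, drops the lock, and compresses
 *			until the helper queue closes
 *	on exit:	the slot is marked DONEHELPER and the CPU
 *			returns to the panic_idle() loop
 */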
2229
2230 /*
2231 * No-wait helper callable in spin loops.
2232 *
2233 * Do not wait for helper_lock. Just check helpers_wanted. The caller
2234 * may decide to continue. This is the "c)ontinue, s)ync, r)eset? s"
2235 * case.
2236 */
2237 void
2238 dumpsys_helper_nw()
2239 {
2240 if (dumpcfg.helpers_wanted)
2241 dumpsys_helper();
2242 }
2243
2244 /*
2245 * Dump helper for live dumps.
2246 * These run as a system task.
2247 */
2248 static void
2249 dumpsys_live_helper(void *arg)
2250 {
2251 helper_t *hp = arg;
2252
2253 BT_ATOMIC_SET(dumpcfg.helpermap, CPU->cpu_seqid);
2254 if (dumpcfg.clevel < DUMP_CLEVEL_BZIP2)
2255 dumpsys_lzjbcompress(hp);
2256 else
2257 dumpsys_bz2compress(hp);
2258 }
2259
2260 /*
2261 * Compress one page with lzjb (single threaded case)
2262 */
2263 static void
2264 dumpsys_lzjb_page(helper_t *hp, cbuf_t *cp)
2265 {
2266 dumpsync_t *ds = hp->ds;
2267 uint32_t csize;
2268
2269 hp->helper = MAINHELPER;
2270 hp->in = 0;
2271 hp->used = 0;
2272 hp->cpin = cp;
2273 while (hp->used < cp->used) {
2274 HRSTART(hp->perpage, copy);
2275 hp->in = dumpsys_copy_page(hp, hp->in);
2276 hp->used += PAGESIZE;
2277 HRSTOP(hp->perpage, copy);
2278
2279 HRSTART(hp->perpage, compress);
2280 csize = compress(hp->page, hp->lzbuf, PAGESIZE);
2281 HRSTOP(hp->perpage, compress);
2282
2283 HRSTART(hp->perpage, write);
2284 dumpvp_write(&csize, sizeof (csize));
2285 dumpvp_write(hp->lzbuf, csize);
2286 HRSTOP(hp->perpage, write);
2287 }
2288 CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP);
2289 hp->cpin = NULL;
2290 }
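
/*
 * In this serial path each page is written directly as a bare
 * <csize, data> pair:
 *
 *	uint32_t csize;		raw lzjb size, 0 < csize <= PAGESIZE
 *	char data[csize];	lzjb-compressed page
 *
 * with no stream headers and no tag words, which is why
 * dumpsys_main_task() forces dumpcfg.clevel to 0 before taking this
 * path: the compression level recorded in the data header must match
 * the single-stream format actually written.
 */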
2291
2292 /*
2293 * Main task to dump pages. This is called on the dump CPU.
2294 */
2295 static void
2296 dumpsys_main_task(void *arg)
2297 {
2298 dumpsync_t *ds = arg;
2299 pgcnt_t pagenum = 0, bitnum = 0, hibitnum;
2300 dumpmlw_t mlw;
2301 cbuf_t *cp;
2302 pgcnt_t baseoff, pfnoff;
2303 pfn_t base, pfn;
2304 int i, dumpserial;
2305
2306 /*
2307 * Fall back to serial mode if there are no helpers.
2308 * dump_plat_mincpu can be set to 0 at any time.
2309 * dumpcfg.helpermap must contain at least one member.
2310 */
2311 dumpserial = 1;
2312
2313 if (dump_plat_mincpu != 0 && dumpcfg.clevel != 0) {
2314 for (i = 0; i < BT_BITOUL(NCPU); ++i) {
2315 if (dumpcfg.helpermap[i] != 0) {
2316 dumpserial = 0;
2317 break;
2318 }
2319 }
2320 }
2321
2322 if (dumpserial) {
2323 dumpcfg.clevel = 0;
2324 if (dumpcfg.helper[0].lzbuf == NULL)
2325 dumpcfg.helper[0].lzbuf = dumpcfg.helper[1].page;
2326 }
2327
2328 dump_init_memlist_walker(&mlw);
2329
2330 for (;;) {
2331 int sec = (gethrtime() - ds->start) / NANOSEC;
2332
2333 /*
2334 * Render a simple progress display on the system console to
2335 * make clear to the operator that the system has not hung.
2336 * Emit an update when dump progress has advanced by one
2337 * percent, or when no update has been drawn in the last
2338 * second.
2339 */
2340 if (ds->percent > ds->percent_done || sec > ds->sec_done) {
2341 ds->sec_done = sec;
2342 ds->percent_done = ds->percent;
2343 uprintf("^\rdumping: %2d:%02d %3d%% done",
2344 sec / 60, sec % 60, ds->percent);
2345 ds->neednl = 1;
2346 }
2347
2348 while (CQ_IS_EMPTY(mainq) && !CQ_IS_EMPTY(writerq)) {
2349
2350 /* the writerq never blocks */
2351 cp = CQ_GET(writerq);
2352 if (cp == NULL)
2353 break;
2354
2355 dump_timeleft = dump_timeout;
2356
2357 HRSTART(ds->perpage, write);
2358 dumpvp_write(cp->buf, cp->used);
2359 HRSTOP(ds->perpage, write);
2360
2361 CQ_PUT(freebufq, cp, CBUF_FREEBUF);
2362 }
2363
2364 /*
2365 * Wait here for some buffers to process. Returns NULL
2366 * when all helpers have terminated and all buffers
2367 * have been processed.
2368 */
2369 cp = CQ_GET(mainq);
2370
2371 if (cp == NULL) {
2372
2373 /* Drain the write queue. */
2374 if (!CQ_IS_EMPTY(writerq))
2375 continue;
2376
2377 /* Main task exits here. */
2378 break;
2379 }
2380
2381 dump_timeleft = dump_timeout;
2382
2383 switch (cp->state) {
2384
2385 case CBUF_FREEMAP:
2386
2387 /*
2388 * Note that we drop CBUF_FREEMAP buffers on
2389 * the floor (they will not be on any cqueue)
2390 * when we no longer need them.
2391 */
2392 if (bitnum >= dumpcfg.bitmapsize)
2393 break;
2394
2395 if (dump_ioerr) {
2396 bitnum = dumpcfg.bitmapsize;
2397 CQ_CLOSE(helperq);
2398 break;
2399 }
2400
2401 HRSTART(ds->perpage, bitmap);
2402 for (; bitnum < dumpcfg.bitmapsize; bitnum++)
2403 if (BT_TEST(dumpcfg.bitmap, bitnum))
2404 break;
2405 HRSTOP(ds->perpage, bitmap);
2406 dump_timeleft = dump_timeout;
2407
2408 if (bitnum >= dumpcfg.bitmapsize) {
2409 CQ_CLOSE(helperq);
2410 break;
2411 }
2412
2413 /*
2414 * Try to map CBUF_MAPSIZE ranges. Can't
2415 * assume that memory segment size is a
2416 * multiple of CBUF_MAPSIZE. Can't assume that
2417 * the segment starts on a CBUF_MAPSIZE
2418 * boundary.
2419 */
2420 pfn = dump_bitnum_to_pfn(bitnum, &mlw);
2421 ASSERT(pfn != PFN_INVALID);
2422 ASSERT(bitnum + mlw.mpleft <= dumpcfg.bitmapsize);
2423
2424 base = P2ALIGN(pfn, CBUF_MAPNP);
2425 if (base < mlw.mpaddr) {
2426 base = mlw.mpaddr;
2427 baseoff = P2PHASE(base, CBUF_MAPNP);
2428 } else {
2429 baseoff = 0;
2430 }
2431
2432 pfnoff = pfn - base;
2433 if (pfnoff + mlw.mpleft < CBUF_MAPNP) {
2434 hibitnum = bitnum + mlw.mpleft;
2435 cp->size = ptob(pfnoff + mlw.mpleft);
2436 } else {
2437 hibitnum = bitnum - pfnoff + CBUF_MAPNP -
2438 baseoff;
2439 cp->size = CBUF_MAPSIZE - ptob(baseoff);
2440 }
2441
2442 cp->pfn = pfn;
2443 cp->bitnum = bitnum++;
2444 cp->pagenum = pagenum++;
2445 cp->off = ptob(pfnoff);
2446
2447 for (; bitnum < hibitnum; bitnum++)
2448 if (BT_TEST(dumpcfg.bitmap, bitnum))
2449 pagenum++;
2450
2451 dump_timeleft = dump_timeout;
2452 cp->used = ptob(pagenum - cp->pagenum);
2453
2454 HRSTART(ds->perpage, map);
2455 hat_devload(kas.a_hat, cp->buf, cp->size, base,
2456 PROT_READ, HAT_LOAD_NOCONSIST);
2457 HRSTOP(ds->perpage, map);
2458
2459 ds->pages_mapped += btop(cp->size);
2460 ds->pages_used += pagenum - cp->pagenum;
2461
2462 CQ_OPEN(mainq);
2463
2464 /*
2465 * If there are no helpers, the main task does
2466 * non-streams lzjb compression.
2467 */
2468 if (dumpserial) {
2469 dumpsys_lzjb_page(dumpcfg.helper, cp);
2470 break;
2471 }
2472
2473 /* pass mapped pages to a helper */
2474 CQ_PUT(helperq, cp, CBUF_INREADY);
2475
2476 /* the last page was done */
2477 if (bitnum >= dumpcfg.bitmapsize)
2478 CQ_CLOSE(helperq);
2479
2480 break;
2481
2482 case CBUF_USEDMAP:
2483
2484 ds->npages += btop(cp->used);
2485
2486 HRSTART(ds->perpage, unmap);
2487 hat_unload(kas.a_hat, cp->buf, cp->size, HAT_UNLOAD);
2488 HRSTOP(ds->perpage, unmap);
2489
2490 if (bitnum < dumpcfg.bitmapsize)
2491 CQ_PUT(mainq, cp, CBUF_FREEMAP);
2492 CQ_CLOSE(mainq);
2493
2494 ASSERT(ds->npages <= dumphdr->dump_npages);
2495 ds->percent = ds->npages * 100LL / dumphdr->dump_npages;
2496 break;
2497
2498 case CBUF_WRITE:
2499
2500 CQ_PUT(writerq, cp, CBUF_WRITE);
2501 break;
2502
2503 case CBUF_ERRMSG:
2504
2505 if (cp->used > 0) {
2506 cp->buf[cp->size - 2] = '\n';
2507 cp->buf[cp->size - 1] = '\0';
2508 if (ds->neednl) {
2509 uprintf("\n%s", cp->buf);
2510 ds->neednl = 0;
2511 } else {
2512 uprintf("%s", cp->buf);
2513 }
2514 /* wait for console output */
2515 drv_usecwait(200000);
2516 dump_timeleft = dump_timeout;
2517 }
2518 CQ_PUT(freebufq, cp, CBUF_FREEBUF);
2519 break;
2520
2521 default:
2522 uprintf("dump: unexpected buffer state %d, "
2523 "buffer will be lost\n", cp->state);
2524 break;
2525
2526 } /* end switch */
2527 }
2528 }
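
/*
 * Buffer flow handled by the loop above, in sketch form:
 *
 *	mapping buffers (cmap):
 *	    CBUF_FREEMAP -> map pages -> CBUF_INREADY on helperq
 *	    -> helper consumes -> CBUF_USEDMAP on mainq
 *	    -> unmap -> CBUF_FREEMAP again, until the bitmap is done
 *
 *	data buffers (cbuf):
 *	    CBUF_FREEBUF on freebufq -> helper fills
 *	    -> CBUF_WRITE on mainq -> writerq -> dumpvp_write()
 *	    -> CBUF_FREEBUF again
 *	    or CBUF_ERRMSG on mainq -> printed -> CBUF_FREEBUF
 */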
2529
2530 #ifdef COLLECT_METRICS
2531 size_t
2532 dumpsys_metrics(dumpsync_t *ds, char *buf, size_t size)
2533 {
2534 dumpcfg_t *cfg = &dumpcfg;
2535 int myid = CPU->cpu_seqid;
2536 int i, compress_ratio;
2537 int sec, iorate;
2538 helper_t *hp, *hpend = &cfg->helper[cfg->nhelper];
2539 char *e = buf + size;
2540 char *p = buf;
2541
2542 sec = ds->elapsed / (1000 * 1000 * 1000ULL);
2543 if (sec < 1)
2544 sec = 1;
2545
2546 if (ds->iotime < 1)
2547 ds->iotime = 1;
2548 iorate = (ds->nwrite * 100000ULL) / ds->iotime;
2549
2550 compress_ratio = 100LL * ds->npages / btopr(ds->nwrite + 1);
2551
2552 #define P(...) (p += p < e ? snprintf(p, e - p, __VA_ARGS__) : 0)
2553
2554 P("Master cpu_seqid,%d\n", CPU->cpu_seqid);
2555 P("Master cpu_id,%d\n", CPU->cpu_id);
2556 P("dump_flags,0x%x\n", dumphdr->dump_flags);
2557 P("dump_ioerr,%d\n", dump_ioerr);
2558
2559 P("Helpers:\n");
2560 for (i = 0; i < ncpus; i++) {
2561 if ((i & 15) == 0)
2562 P(",,%03d,", i);
2563 if (i == myid)
2564 P(" M");
2565 else if (BT_TEST(cfg->helpermap, i))
2566 P("%4d", cpu_seq[i]->cpu_id);
2567 else
2568 P(" *");
2569 if ((i & 15) == 15)
2570 P("\n");
2571 }
2572
2573 P("ncbuf_used,%d\n", cfg->ncbuf_used);
2574 P("ncmap,%d\n", cfg->ncmap);
2575
2576 P("Found %ldM ranges,%ld\n", (CBUF_MAPSIZE / DUMP_1MB), cfg->found4m);
2577 P("Found small pages,%ld\n", cfg->foundsm);
2578
2579 P("Compression level,%d\n", cfg->clevel);
2580 P("Compression type,%s %s\n", cfg->clevel == 0 ? "serial" : "parallel",
2581 cfg->clevel >= DUMP_CLEVEL_BZIP2 ? "bzip2" : "lzjb");
2582 P("Compression ratio,%d.%02d\n", compress_ratio / 100, compress_ratio %
2583 100);
2584 P("nhelper_used,%d\n", cfg->nhelper_used);
2585
2586 P("Dump I/O rate MBS,%d.%02d\n", iorate / 100, iorate % 100);
2587 P("..total bytes,%lld\n", (u_longlong_t)ds->nwrite);
2588 P("..total nsec,%lld\n", (u_longlong_t)ds->iotime);
2589 P("dumpbuf.iosize,%ld\n", dumpbuf.iosize);
2590 P("dumpbuf.size,%ld\n", dumpbuf.size);
2591
2592 P("Dump pages/sec,%llu\n", (u_longlong_t)ds->npages / sec);
2593 P("Dump pages,%llu\n", (u_longlong_t)ds->npages);
2594 P("Dump time,%d\n", sec);
2595
2596 if (ds->pages_mapped > 0)
2597 P("per-cent map utilization,%d\n", (int)((100 * ds->pages_used)
2598 / ds->pages_mapped));
2599
2600 P("\nPer-page metrics:\n");
2601 if (ds->npages > 0) {
2602 for (hp = cfg->helper; hp != hpend; hp++) {
2603 #define PERPAGE(x) ds->perpage.x += hp->perpage.x;
2604 PERPAGES;
2605 #undef PERPAGE
2606 }
2607 #define PERPAGE(x) \
2608 P("%s nsec/page,%d\n", #x, (int)(ds->perpage.x / ds->npages));
2609 PERPAGES;
2610 #undef PERPAGE
2611 P("freebufq.empty,%d\n", (int)(ds->freebufq.empty /
2612 ds->npages));
2613 P("helperq.empty,%d\n", (int)(ds->helperq.empty /
2614 ds->npages));
2615 P("writerq.empty,%d\n", (int)(ds->writerq.empty /
2616 ds->npages));
2617 P("mainq.empty,%d\n", (int)(ds->mainq.empty / ds->npages));
2618
2619 P("I/O wait nsec/page,%llu\n", (u_longlong_t)(ds->iowait /
2620 ds->npages));
2621 }
2622 #undef P
2623 if (p < e)
2624 bzero(p, e - p);
2625 return (p - buf);
2626 }
2627 #endif /* COLLECT_METRICS */
2628
2629 /*
2630 * Dump the system.
2631 */
2632 void
2633 dumpsys(void)
2634 {
2635 dumpsync_t *ds = &dumpsync;
2636 taskq_t *livetaskq = NULL;
2637 pfn_t pfn;
2638 pgcnt_t bitnum;
2639 proc_t *p;
2640 helper_t *hp, *hpend = &dumpcfg.helper[dumpcfg.nhelper];
2641 cbuf_t *cp;
2642 pid_t npids, pidx;
2643 char *content;
2644 char *buf;
2645 size_t size;
2646 int save_dump_clevel;
2647 dumpmlw_t mlw;
2648 dumpcsize_t datatag;
2649 dumpdatahdr_t datahdr;
2650
2651 if (dumpvp == NULL || dumphdr == NULL) {
2652 uprintf("skipping system dump - no dump device configured\n");
2653 if (panicstr) {
2654 dumpcfg.helpers_wanted = 0;
2655 dumpsys_spinunlock(&dumpcfg.helper_lock);
2656 }
2657 return;
2658 }
2659 dumpbuf.cur = dumpbuf.start;
2660
2661 /* clear the sync variables */
2662 ASSERT(dumpcfg.nhelper > 0);
2663 bzero(ds, sizeof (*ds));
2664 ds->dumpcpu = CPU->cpu_id;
2665
2666 /*
2667 * Calculate the starting block for dump. If we're dumping on a
2668 * swap device, start 1/5 of the way in; otherwise, start at the
2669 * beginning. And never use the first page -- it may be a disk label.
2670 */
2671 if (dumpvp->v_flag & VISSWAP)
2672 dumphdr->dump_start = P2ROUNDUP(dumpvp_size / 5, DUMP_OFFSET);
2673 else
2674 dumphdr->dump_start = DUMP_OFFSET;
2675
2676 dumphdr->dump_flags = DF_VALID | DF_COMPLETE | DF_LIVE | DF_COMPRESSED;
2677 dumphdr->dump_crashtime = gethrestime_sec();
2678 dumphdr->dump_npages = 0;
2679 dumphdr->dump_nvtop = 0;
2680 bzero(dumpcfg.bitmap, BT_SIZEOFMAP(dumpcfg.bitmapsize));
2681 dump_timeleft = dump_timeout;
2682
2683 if (panicstr) {
2684 dumphdr->dump_flags &= ~DF_LIVE;
2685 (void) VOP_DUMPCTL(dumpvp, DUMP_FREE, NULL, NULL);
2686 (void) VOP_DUMPCTL(dumpvp, DUMP_ALLOC, NULL, NULL);
2687 (void) vsnprintf(dumphdr->dump_panicstring, DUMP_PANICSIZE,
2688 panicstr, panicargs);
2690 }
2691
2692 if (dump_conflags & DUMP_ALL)
2693 content = "all";
2694 else if (dump_conflags & DUMP_CURPROC)
2695 content = "kernel + curproc";
2696 else
2697 content = "kernel";
2698 uprintf("dumping to %s, offset %lld, content: %s\n", dumppath,
2699 dumphdr->dump_start, content);
2700
2701 /* Make sure nodename is current */
2702 bcopy(utsname.nodename, dumphdr->dump_utsname.nodename, SYS_NMLN);
2703
2704 /*
2705 * If this is a live dump, try to open a VCHR vnode for better
2706 * performance. We must take care to flush the buffer cache
2707 * first.
2708 */
2709 if (!panicstr) {
2710 vnode_t *cdev_vp, *cmn_cdev_vp;
2711
2712 ASSERT(dumpbuf.cdev_vp == NULL);
2713 cdev_vp = makespecvp(VTOS(dumpvp)->s_dev, VCHR);
2714 if (cdev_vp != NULL) {
2715 cmn_cdev_vp = common_specvp(cdev_vp);
2716 if (VOP_OPEN(&cmn_cdev_vp, FREAD | FWRITE, kcred, NULL)
2717 == 0) {
2718 if (vn_has_cached_data(dumpvp))
2719 (void) pvn_vplist_dirty(dumpvp, 0, NULL,
2720 B_INVAL | B_TRUNC, kcred);
2721 dumpbuf.cdev_vp = cmn_cdev_vp;
2722 } else {
2723 VN_RELE(cdev_vp);
2724 }
2725 }
2726 }
2727
2728 /*
2729 * Store a hires timestamp so we can look it up during debugging.
2730 */
2731 lbolt_debug_entry();
2732
2733 /*
2734 * Leave room for the message and ereport save areas and terminal dump
2735 * header.
2736 */
2737 dumpbuf.vp_limit = dumpvp_size - DUMP_LOGSIZE - DUMP_OFFSET -
2738 DUMP_ERPTSIZE;
2739
2740 /*
2741 * Write out the symbol table. It's no longer compressed,
2742 * so its 'size' and 'csize' are equal.
2743 */
2744 dumpbuf.vp_off = dumphdr->dump_ksyms = dumphdr->dump_start + PAGESIZE;
2745 dumphdr->dump_ksyms_size = dumphdr->dump_ksyms_csize =
2746 ksyms_snapshot(dumpvp_ksyms_write, NULL, LONG_MAX);
2747
2748 /*
2749 * Write out the translation map.
2750 */
2751 dumphdr->dump_map = dumpvp_flush();
2752 dump_as(&kas);
2753 dumphdr->dump_nvtop += dump_plat_addr();
2754
2755 /*
2756 * call into hat, which may have unmapped pages that also need to
2757 * be in the dump
2758 */
2759 hat_dump();
2760
2761 if (dump_conflags & DUMP_ALL) {
2762 mutex_enter(&pidlock);
2763
2764 for (npids = 0, p = practive; p != NULL; p = p->p_next)
2765 dumpcfg.pids[npids++] = p->p_pid;
2766
2767 mutex_exit(&pidlock);
2768
2769 for (pidx = 0; pidx < npids; pidx++)
2770 (void) dump_process(dumpcfg.pids[pidx]);
2771
2772 dump_init_memlist_walker(&mlw);
2773 for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum++) {
2774 dump_timeleft = dump_timeout;
2775 pfn = dump_bitnum_to_pfn(bitnum, &mlw);
2776 /*
2777 * Some hypervisors do not have all pages available to
2778 * be accessed by the guest OS. Check for page
2779 * accessibility.
2780 */
2781 if (plat_hold_page(pfn, PLAT_HOLD_NO_LOCK, NULL) !=
2782 PLAT_HOLD_OK)
2783 continue;
2784 BT_SET(dumpcfg.bitmap, bitnum);
2785 }
2786 dumphdr->dump_npages = dumpcfg.bitmapsize;
2787 dumphdr->dump_flags |= DF_ALL;
2788
2789 } else if (dump_conflags & DUMP_CURPROC) {
2790 /*
2791 * Determine which pid is to be dumped. If we're panicking, we
2792 * dump the process associated with panic_thread (if any). If
2793 * this is a live dump, we dump the process associated with
2794 * curthread.
2795 */
2796 npids = 0;
2797 if (panicstr) {
2798 if (panic_thread != NULL &&
2799 panic_thread->t_procp != NULL &&
2800 panic_thread->t_procp != &p0) {
2801 dumpcfg.pids[npids++] =
2802 panic_thread->t_procp->p_pid;
2803 }
2804 } else {
2805 dumpcfg.pids[npids++] = curthread->t_procp->p_pid;
2806 }
2807
2808 if (npids && dump_process(dumpcfg.pids[0]) == 0)
2809 dumphdr->dump_flags |= DF_CURPROC;
2810 else
2811 dumphdr->dump_flags |= DF_KERNEL;
2812
2813 } else {
2814 dumphdr->dump_flags |= DF_KERNEL;
2815 }
2816
2817 dumphdr->dump_hashmask = (1 << highbit(dumphdr->dump_nvtop - 1)) - 1;
2818
2819 /*
2820 * Write out the pfn table.
2821 */
2822 dumphdr->dump_pfn = dumpvp_flush();
2823 dump_init_memlist_walker(&mlw);
2824 for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum++) {
2825 dump_timeleft = dump_timeout;
2826 if (!BT_TEST(dumpcfg.bitmap, bitnum))
2827 continue;
2828 pfn = dump_bitnum_to_pfn(bitnum, &mlw);
2829 ASSERT(pfn != PFN_INVALID);
2830 dumpvp_write(&pfn, sizeof (pfn_t));
2831 }
2832 dump_plat_pfn();
2833
2834 /*
2835 * Write out all the pages.
2836 * Map pages, copy them handling UEs, compress, and write them out.
2837 * Cooperate with any helpers running on CPUs in panic_idle().
2838 */
2839 dumphdr->dump_data = dumpvp_flush();
2840
2841 bzero(dumpcfg.helpermap, BT_SIZEOFMAP(NCPU));
2842 ds->live = dumpcfg.clevel > 0 &&
2843 (dumphdr->dump_flags & DF_LIVE) != 0;
2844
2845 save_dump_clevel = dumpcfg.clevel;
2846 if (panicstr)
2847 dumpsys_get_maxmem();
2848 else if (dumpcfg.clevel >= DUMP_CLEVEL_BZIP2)
2849 dumpcfg.clevel = DUMP_CLEVEL_LZJB;
2850
2851 dumpcfg.nhelper_used = 0;
2852 for (hp = dumpcfg.helper; hp != hpend; hp++) {
2853 if (hp->page == NULL) {
2854 hp->helper = DONEHELPER;
2855 continue;
2856 }
2857 ++dumpcfg.nhelper_used;
2858 hp->helper = FREEHELPER;
2859 hp->taskqid = NULL;
2860 hp->ds = ds;
2861 bzero(&hp->perpage, sizeof (hp->perpage));
2862 if (dumpcfg.clevel >= DUMP_CLEVEL_BZIP2)
2863 (void) BZ2_bzCompressReset(&hp->bzstream);
2864 }
2865
2866 CQ_OPEN(freebufq);
2867 CQ_OPEN(helperq);
2868
2869 dumpcfg.ncbuf_used = 0;
2870 for (cp = dumpcfg.cbuf; cp != &dumpcfg.cbuf[dumpcfg.ncbuf]; cp++) {
2871 if (cp->buf != NULL) {
2872 CQ_PUT(freebufq, cp, CBUF_FREEBUF);
2873 ++dumpcfg.ncbuf_used;
2874 }
2875 }
2876
2877 for (cp = dumpcfg.cmap; cp != &dumpcfg.cmap[dumpcfg.ncmap]; cp++)
2878 CQ_PUT(mainq, cp, CBUF_FREEMAP);
2879
2880 ds->start = gethrtime();
2881 ds->iowaitts = ds->start;
2882
2883 /* start helpers */
2884 if (ds->live) {
2885 int n = dumpcfg.nhelper_used;
2886 int pri = MINCLSYSPRI - 25;
2887
2888 livetaskq = taskq_create("LiveDump", n, pri, n, n,
2889 TASKQ_PREPOPULATE);
2890 for (hp = dumpcfg.helper; hp != hpend; hp++) {
2891 if (hp->page == NULL)
2892 continue;
2893 hp->helper = hp - dumpcfg.helper;
2894 hp->taskqid = taskq_dispatch(livetaskq,
2895 dumpsys_live_helper, (void *)hp, TQ_NOSLEEP);
2896 }
2898 } else {
2899 if (panicstr)
2900 kmem_dump_begin();
2901 dumpcfg.helpers_wanted = dumpcfg.clevel > 0;
2902 dumpsys_spinunlock(&dumpcfg.helper_lock);
2903 }
2904
2905 /* run main task */
2906 dumpsys_main_task(ds);
2907
2908 ds->elapsed = gethrtime() - ds->start;
2909 if (ds->elapsed < 1)
2910 ds->elapsed = 1;
2911
2912 if (livetaskq != NULL)
2913 taskq_destroy(livetaskq);
2914
2915 if (ds->neednl) {
2916 uprintf("\n");
2917 ds->neednl = 0;
2918 }
2919
2920 /* record actual pages dumped */
2921 dumphdr->dump_npages = ds->npages;
2922
2923 /* platform-specific data */
2924 dumphdr->dump_npages += dump_plat_data(dumpcfg.cbuf[0].buf);
2925
2926 /* note any errors by clearing DF_COMPLETE */
2927 if (dump_ioerr || ds->npages < dumphdr->dump_npages)
2928 dumphdr->dump_flags &= ~DF_COMPLETE;
2929
2930 /* end of stream blocks */
2931 datatag = 0;
2932 dumpvp_write(&datatag, sizeof (datatag));
2933
2934 bzero(&datahdr, sizeof (datahdr));
2935
2936 /* buffer for metrics */
2937 buf = dumpcfg.cbuf[0].buf;
2938 size = MIN(dumpcfg.cbuf[0].size, DUMP_OFFSET - sizeof (dumphdr_t) -
2939 sizeof (dumpdatahdr_t));
2940
2941 /* finish the kmem intercepts, collect kmem verbose info */
2942 if (panicstr) {
2943 datahdr.dump_metrics = kmem_dump_finish(buf, size);
2944 buf += datahdr.dump_metrics;
2945 size -= datahdr.dump_metrics;
2946 }
2947
2948 /* record in the header whether this is a fault-management panic */
2949 if (panicstr)
2950 dumphdr->dump_fm_panic = is_fm_panic();
2951
2952 /* compression info in data header */
2953 datahdr.dump_datahdr_magic = DUMP_DATAHDR_MAGIC;
2954 datahdr.dump_datahdr_version = DUMP_DATAHDR_VERSION;
2955 datahdr.dump_maxcsize = CBUF_SIZE;
2956 datahdr.dump_maxrange = CBUF_MAPSIZE / PAGESIZE;
2957 datahdr.dump_nstreams = dumpcfg.nhelper_used;
2958 datahdr.dump_clevel = dumpcfg.clevel;
2959 #ifdef COLLECT_METRICS
2960 if (dump_metrics_on)
2961 datahdr.dump_metrics += dumpsys_metrics(ds, buf, size);
2962 #endif
2963 datahdr.dump_data_csize = dumpvp_flush() - dumphdr->dump_data;
2964
2965 /*
2966 * Write out the initial and terminal dump headers.
2967 */
2968 dumpbuf.vp_off = dumphdr->dump_start;
2969 dumpvp_write(dumphdr, sizeof (dumphdr_t));
2970 (void) dumpvp_flush();
2971
2972 dumpbuf.vp_limit = dumpvp_size;
2973 dumpbuf.vp_off = dumpbuf.vp_limit - DUMP_OFFSET;
2974 dumpvp_write(dumphdr, sizeof (dumphdr_t));
2975 dumpvp_write(&datahdr, sizeof (dumpdatahdr_t));
2976 dumpvp_write(dumpcfg.cbuf[0].buf, datahdr.dump_metrics);
2977
2978 (void) dumpvp_flush();
2979
2980 uprintf("\r%3d%% done: %llu pages dumped, ",
2981 ds->percent_done, (u_longlong_t)ds->npages);
2982
2983 if (dump_ioerr == 0) {
2984 uprintf("dump succeeded\n");
2985 } else {
2986 uprintf("dump failed: error %d\n", dump_ioerr);
2987 #ifdef DEBUG
2988 if (panicstr)
2989 debug_enter("dump failed");
2990 #endif
2991 }
2992
2993 /*
2994 * Write out all undelivered messages. This has to be the *last*
2995 * thing we do because the dump process itself emits messages.
2996 */
2997 if (panicstr) {
2998 dump_summary();
2999 dump_ereports();
3000 dump_messages();
3001 }
3002
3003 delay(2 * hz); /* let people see the 'done' message */
3004 dump_timeleft = 0;
3005 dump_ioerr = 0;
3006
3007 /* restore settings after live dump completes */
3008 if (!panicstr) {
3009 dumpcfg.clevel = save_dump_clevel;
3010
3011 /* release any VCHR open of the dump device */
3012 if (dumpbuf.cdev_vp != NULL) {
3013 (void) VOP_CLOSE(dumpbuf.cdev_vp, FREAD | FWRITE, 1, 0,
3014 kcred, NULL);
3015 VN_RELE(dumpbuf.cdev_vp);
3016 dumpbuf.cdev_vp = NULL;
3017 }
3018 }
3019 }
3020
3021 /*
3022 * This function is called whenever the memory size, as represented
3023 * by the phys_install list, changes.
3024 */
3025 void
3026 dump_resize()
3027 {
3028 mutex_enter(&dump_lock);
3029 dumphdr_init();
3030 dumpbuf_resize();
3031 dump_update_clevel();
3032 mutex_exit(&dump_lock);
3033 }
3034
3035 /*
3036 * This function allows for dynamic resizing of a dump area. It assumes
3037 * that the underlying device has updated its size(9P) property accordingly.
3038 */
3039 int
3040 dumpvp_resize()
3041 {
3042 int error;
3043 vattr_t vattr;
3044
3045 mutex_enter(&dump_lock);
3046 vattr.va_mask = AT_SIZE;
3047 if ((error = VOP_GETATTR(dumpvp, &vattr, 0, kcred, NULL)) != 0) {
3048 mutex_exit(&dump_lock);
3049 return (error);
3050 }
3051
3052 if (vattr.va_size < 2 * DUMP_LOGSIZE + DUMP_ERPTSIZE) {
3053 mutex_exit(&dump_lock);
3054 return (ENOSPC);
3055 }
3056
3057 dumpvp_size = vattr.va_size & -DUMP_OFFSET;
3058 mutex_exit(&dump_lock);
3059 return (0);
3060 }
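
/*
 * Since DUMP_OFFSET is a power of two, vattr.va_size & -DUMP_OFFSET
 * above rounds the device size down to a DUMP_OFFSET boundary, the
 * same result as P2ALIGN(vattr.va_size, DUMP_OFFSET). For example,
 * assuming DUMP_OFFSET were 64K:
 *
 *	0x1234567 & -0x10000 == 0x1230000
 */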
3061
3062 int
3063 dump_set_uuid(const char *uuidstr)
3064 {
3065 const char *ptr;
3066 int i;
3067
3068 if (uuidstr == NULL || strnlen(uuidstr, 36 + 1) != 36)
3069 return (EINVAL);
3070
3071 /* uuid_parse is not common code so check manually */
3072 for (i = 0, ptr = uuidstr; i < 36; i++, ptr++) {
3073 switch (i) {
3074 case 8:
3075 case 13:
3076 case 18:
3077 case 23:
3078 if (*ptr != '-')
3079 return (EINVAL);
3080 break;
3081
3082 default:
3083 if (!isxdigit(*ptr))
3084 return (EINVAL);
3085 break;
3086 }
3087 }
3088
3089 if (dump_osimage_uuid[0] != '\0')
3090 return (EALREADY);
3091
3092 (void) strncpy(dump_osimage_uuid, uuidstr, 36 + 1);
3093
3094 cmn_err(CE_CONT, "?This Solaris instance has UUID %s\n",
3095 dump_osimage_uuid);
3096
3097 return (0);
3098 }
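
/*
 * Usage sketch: the caller passes the textual RFC 4122 form, e.g.
 *
 *	(void) dump_set_uuid("f81d4fae-7dec-11d0-a765-00a0c91e6bf6");
 *
 * Exactly 36 characters with dashes at offsets 8, 13, 18 and 23 are
 * accepted; anything else returns EINVAL. A second call is refused
 * with EALREADY, so the recorded OS image UUID is write-once.
 */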
3099
3100 const char *
3101 dump_get_uuid(void)
3102 {
3103 return (dump_osimage_uuid[0] != '\0' ? dump_osimage_uuid : "");
3104 }