9525 kmem_dump_size is a corrupting influence
--- old/usr/src/uts/common/os/dumpsubr.c
+++ new/usr/src/uts/common/os/dumpsubr.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
24 - * Copyright 2016 Joyent, Inc.
24 + * Copyright 2018 Joyent, Inc.
25 25 */
26 26
27 27 #include <sys/types.h>
28 28 #include <sys/param.h>
29 29 #include <sys/systm.h>
30 30 #include <sys/vm.h>
31 31 #include <sys/proc.h>
32 32 #include <sys/file.h>
33 33 #include <sys/conf.h>
34 34 #include <sys/kmem.h>
35 35 #include <sys/mem.h>
36 36 #include <sys/mman.h>
37 37 #include <sys/vnode.h>
38 38 #include <sys/errno.h>
39 39 #include <sys/memlist.h>
40 40 #include <sys/dumphdr.h>
41 41 #include <sys/dumpadm.h>
42 42 #include <sys/ksyms.h>
43 43 #include <sys/compress.h>
44 44 #include <sys/stream.h>
45 45 #include <sys/strsun.h>
46 46 #include <sys/cmn_err.h>
47 47 #include <sys/bitmap.h>
48 48 #include <sys/modctl.h>
49 49 #include <sys/utsname.h>
50 50 #include <sys/systeminfo.h>
51 51 #include <sys/vmem.h>
52 52 #include <sys/log.h>
53 53 #include <sys/var.h>
54 54 #include <sys/debug.h>
55 55 #include <sys/sunddi.h>
56 56 #include <fs/fs_subr.h>
57 57 #include <sys/fs/snode.h>
58 58 #include <sys/ontrap.h>
59 59 #include <sys/panic.h>
60 60 #include <sys/dkio.h>
61 61 #include <sys/vtoc.h>
62 62 #include <sys/errorq.h>
63 63 #include <sys/fm/util.h>
64 64 #include <sys/fs/zfs.h>
65 65
66 66 #include <vm/hat.h>
67 67 #include <vm/as.h>
68 68 #include <vm/page.h>
69 69 #include <vm/pvn.h>
70 70 #include <vm/seg.h>
71 71 #include <vm/seg_kmem.h>
72 72 #include <sys/clock_impl.h>
73 73 #include <sys/hold_page.h>
74 74
75 75 #include <bzip2/bzlib.h>
76 76
77 +#define ONE_GIG (1024 * 1024 * 1024UL)
78 +
77 79 /*
78 80 * Crash dump time is dominated by disk write time. To reduce this,
79 81 * the stronger compression method bzip2 is applied to reduce the dump
80 82 * size and hence reduce I/O time. However, bzip2 is much more
81 83 * computationally expensive than the existing lzjb algorithm, so to
82 84 * avoid increasing compression time, CPUs that are otherwise idle
83 85 * during panic are employed to parallelize the compression task.
84 86 * Many helper CPUs are needed to prevent bzip2 from being a
85 87 * bottleneck, and on systems with too few CPUs, the lzjb algorithm is
86 88 * parallelized instead. Lastly, I/O and compression are performed by
87 89 * different CPUs, and are hence overlapped in time, unlike the older
88 90 * serial code.
89 91 *
90 92 * Another important consideration is the speed of the dump
91 93 * device. Faster disks need fewer CPUs in order to benefit from
92 94 * parallel lzjb versus parallel bzip2. Therefore, the CPU count
93 95 * threshold for switching from parallel lzjb to parallel bzip2 is
94 96 * elevated for faster disks. The dump device speed is inferred from
95 97 * the setting for dumpbuf.iosize; see dump_update_clevel.
96 98 */
97 99
98 100 /*
99 101 * exported vars
100 102 */
101 103 kmutex_t dump_lock; /* lock for dump configuration */
102 104 dumphdr_t *dumphdr; /* dump header */
103 105 int dump_conflags = DUMP_KERNEL; /* dump configuration flags */
104 106 vnode_t *dumpvp; /* dump device vnode pointer */
105 107 u_offset_t dumpvp_size; /* size of dump device, in bytes */
106 108 char *dumppath; /* pathname of dump device */
107 109 int dump_timeout = 120; /* timeout for dumping pages */
108 110 int dump_timeleft; /* portion of dump_timeout remaining */
109 111 int dump_ioerr; /* dump i/o error */
110 112 int dump_check_used; /* enable check for used pages */
111 113 char *dump_stack_scratch; /* scratch area for saving stack summary */
112 114
113 115 /*
114 116 * Tunables for dump compression and parallelism. These can be set via
115 117 * /etc/system.
116 118 *
117 119 * dump_ncpu_low number of helpers for parallel lzjb
118 120 * This is also the minimum configuration.
119 121 *
120 122 * dump_bzip2_level bzip2 compression level: 1-9
121 123 * Higher numbers give greater compression, but take more memory
122 124 * and time. Memory used per helper is ~(dump_bzip2_level * 1MB).
123 125 *
124 126 * dump_plat_mincpu the cross-over limit for using bzip2 (per platform):
125 127 * if dump_plat_mincpu == 0, then always do single threaded dump
126 128 * if ncpu >= dump_plat_mincpu then try to use bzip2
127 129 *
128 130 * dump_metrics_on if set, metrics are collected in the kernel, passed
129 131 * to savecore via the dump file, and recorded by savecore in
130 132 * METRICS.txt.
131 133 */
132 134 uint_t dump_ncpu_low = 4; /* minimum config for parallel lzjb */
133 135 uint_t dump_bzip2_level = 1; /* bzip2 level (1-9) */
134 136
135 137 /* Use dump_plat_mincpu_default unless this variable is set by /etc/system */
136 138 #define MINCPU_NOT_SET ((uint_t)-1)
137 139 uint_t dump_plat_mincpu = MINCPU_NOT_SET;
138 140
139 141 /* tunables for pre-reserved heap */
140 142 uint_t dump_kmem_permap = 1024;
141 -uint_t dump_kmem_pages = 8;
143 +uint_t dump_kmem_pages = 0;
142 144
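As the comments above note, these are tunable via /etc/system. A hypothetical set of overrides (the values here are illustrative only, not recommendations) might look like:

    * Example /etc/system entries (illustrative values):
    set dump_ncpu_low=8
    * Force a single-threaded (serial) dump:
    set dump_plat_mincpu=0
    * Pin the kmem reserve rather than letting it auto-size:
    set dump_kmem_pages=32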
143 145 /* Define multiple buffers per helper to avoid stalling */
144 146 #define NCBUF_PER_HELPER 2
145 147 #define NCMAP_PER_HELPER 4
146 148
147 149 /* minimum number of helpers configured */
148 150 #define MINHELPERS (dump_ncpu_low)
149 151 #define MINCBUFS (MINHELPERS * NCBUF_PER_HELPER)
150 152
151 153 /*
152 154 * Define constant parameters.
153 155 *
154 156 * CBUF_SIZE size of an output buffer
155 157 *
156 158 * CBUF_MAPSIZE size of virtual range for mapping pages
157 159 *
158 160 * CBUF_MAPNP size of virtual range in pages
159 161 *
160 162 */
161 163 #define DUMP_1KB ((size_t)1 << 10)
162 164 #define DUMP_1MB ((size_t)1 << 20)
163 165 #define CBUF_SIZE ((size_t)1 << 17)
164 166 #define CBUF_MAPSHIFT (22)
165 167 #define CBUF_MAPSIZE ((size_t)1 << CBUF_MAPSHIFT)
166 168 #define CBUF_MAPNP ((size_t)1 << (CBUF_MAPSHIFT - PAGESHIFT))
167 169
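A quick arithmetic check of these constants, as a standalone C11 sketch (this assumes 4 KB pages, i.e. PAGESHIFT == 12):

    /* Values implied by the definitions above for 4 KB pages. */
    _Static_assert((1UL << 17) == 128UL * 1024, "CBUF_SIZE is 128 KB");
    _Static_assert((1UL << 22) == 4UL * 1024 * 1024, "CBUF_MAPSIZE is 4 MB");
    _Static_assert((1UL << (22 - 12)) == 1024UL, "CBUF_MAPNP is 1024 pages");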
168 170 /*
169 171 * Compression metrics are accumulated nano-second subtotals. The
170 172 * results are normalized by the number of pages dumped. A report is
171 173 * generated when dumpsys() completes and is saved in the dump image
172 174 * after the trailing dump header.
173 175 *
174 176 * Metrics are always collected. Set the variable dump_metrics_on to
175 177 * cause metrics to be saved in the crash file, where savecore will
176 178 * save it in the file METRICS.txt.
177 179 */
178 180 #define PERPAGES \
179 181 PERPAGE(bitmap) PERPAGE(map) PERPAGE(unmap) \
180 182 PERPAGE(copy) PERPAGE(compress) \
181 183 PERPAGE(write) \
182 184 PERPAGE(inwait) PERPAGE(outwait)
183 185
184 186 typedef struct perpage {
185 187 #define PERPAGE(x) hrtime_t x;
186 188 PERPAGES
187 189 #undef PERPAGE
188 190 } perpage_t;
189 191
190 192 /*
191 193 * This macro controls the code generation for collecting dump
192 194 * performance information. By default, the code is generated, but
193 195 * automatic saving of the information is disabled. If dump_metrics_on
194 196 * is set to 1, the timing information is passed to savecore via the
195 197 * crash file, where it is appended to the file dump-dir/METRICS.txt.
196 198 */
197 199 #define COLLECT_METRICS
198 200
199 201 #ifdef COLLECT_METRICS
200 202 uint_t dump_metrics_on = 0; /* set to 1 to enable recording metrics */
201 203
202 204 #define HRSTART(v, m) v##ts.m = gethrtime()
203 205 #define HRSTOP(v, m) v.m += gethrtime() - v##ts.m
204 206 #define HRBEGIN(v, m, s) v##ts.m = gethrtime(); v.size += s
205 207 #define HREND(v, m) v.m += gethrtime() - v##ts.m
206 208 #define HRNORM(v, m, n) v.m /= (n)
207 209
208 210 #else
209 211 #define HRSTART(v, m)
210 212 #define HRSTOP(v, m)
211 213 #define HRBEGIN(v, m, s)
212 214 #define HREND(v, m)
213 215 #define HRNORM(v, m, n)
214 216 #endif /* COLLECT_METRICS */
215 217
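A hedged usage sketch of the timing macros (the real call sites are elided from this hunk; compress_one_block() and npages are hypothetical names). The token pasting in HRSTART/HRSTOP requires a sibling struct whose name appends "ts", such as the perpage/perpagets pairs declared in helper_t and dumpsync_t below:

    HRSTART(hp->perpage, compress);  /* hp->perpagets.compress = gethrtime(); */
    compress_one_block(hp);          /* hypothetical work being timed */
    HRSTOP(hp->perpage, compress);   /* adds the elapsed time to hp->perpage.compress */
    HRNORM(hp->perpage, compress, npages);  /* later: average per page dumped */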
216 218 /*
217 219 * Buffers for copying and compressing memory pages.
218 220 *
219 221 * cbuf_t buffer controllers: used for both input and output.
220 222 *
221 223 * The buffer state indicates how it is being used:
222 224 *
223 225 * CBUF_FREEMAP: CBUF_MAPSIZE virtual address range is available for
224 226 * mapping input pages.
225 227 *
226 228 * CBUF_INREADY: input pages are mapped and ready for compression by a
227 229 * helper.
228 230 *
229 231 * CBUF_USEDMAP: mapping has been consumed by a helper. Needs unmap.
230 232 *
231 233 * CBUF_FREEBUF: CBUF_SIZE output buffer, which is available.
232 234 *
233 235 * CBUF_WRITE: CBUF_SIZE block of compressed pages from a helper,
234 236 * ready to write out.
235 237 *
236 238 * CBUF_ERRMSG: CBUF_SIZE block of error messages from a helper
237 239 * (reports UE errors.)
238 240 */
239 241
240 242 typedef enum cbufstate {
241 243 CBUF_FREEMAP,
242 244 CBUF_INREADY,
243 245 CBUF_USEDMAP,
244 246 CBUF_FREEBUF,
245 247 CBUF_WRITE,
246 248 CBUF_ERRMSG
247 249 } cbufstate_t;
248 250
249 251 typedef struct cbuf cbuf_t;
250 252
251 253 struct cbuf {
252 254 cbuf_t *next; /* next in list */
253 255 cbufstate_t state; /* processing state */
254 256 size_t used; /* amount used */
255 257 size_t size; /* mem size */
256 258 char *buf; /* kmem or vmem */
257 259 pgcnt_t pagenum; /* index to pfn map */
258 260 pgcnt_t bitnum; /* first set bitnum */
259 261 pfn_t pfn; /* first pfn in mapped range */
260 262 int off; /* byte offset to first pfn */
261 263 };
262 264
263 265 static char dump_osimage_uuid[36 + 1];
264 266
265 267 #define isdigit(ch) ((ch) >= '0' && (ch) <= '9')
266 268 #define isxdigit(ch) (isdigit(ch) || ((ch) >= 'a' && (ch) <= 'f') || \
267 269 ((ch) >= 'A' && (ch) <= 'F'))
268 270
269 271 /*
270 272 * cqueue_t queues: a uni-directional channel for communication
271 273 * from the master to helper tasks or vice-versa using put and
272 274 * get primitives. Both mappings and data buffers are passed via
273 275 * queues. Producers close a queue when done. The number of
274 276 * active producers is reference counted so the consumer can
275 277 * detect end of data. Concurrent access is mediated by atomic
276 278 * operations for panic dump, or mutex/cv for live dump.
277 279 *
278 280 * There are four queues, used as follows:
279 281 *
280 282 * Queue Dataflow NewState
281 283 * --------------------------------------------------
282 284 * mainq master -> master FREEMAP
283 285 * master has initialized or unmapped an input buffer
284 286 * --------------------------------------------------
285 287 * helperq master -> helper INREADY
286 288 * master has mapped input for use by helper
287 289 * --------------------------------------------------
288 290 * mainq master <- helper USEDMAP
289 291 * helper is done with input
290 292 * --------------------------------------------------
291 293 * freebufq master -> helper FREEBUF
292 294 * master has initialized or written an output buffer
293 295 * --------------------------------------------------
294 296 * mainq master <- helper WRITE
295 297 * block of compressed pages from a helper
296 298 * --------------------------------------------------
297 299 * mainq master <- helper ERRMSG
298 300 * error messages from a helper (memory error case)
299 301 * --------------------------------------------------
300 302 * writerq master <- master WRITE
301 303 * non-blocking queue of blocks to write
302 304 * --------------------------------------------------
303 305 */
304 306 typedef struct cqueue {
305 307 cbuf_t *volatile first; /* first in list */
306 308 cbuf_t *last; /* last in list */
307 309 hrtime_t ts; /* timestamp */
308 310 hrtime_t empty; /* total time empty */
309 311 kmutex_t mutex; /* live state lock */
310 312 kcondvar_t cv; /* live wait var */
311 313 lock_t spinlock; /* panic mode spin lock */
312 314 volatile uint_t open; /* producer ref count */
313 315 } cqueue_t;
314 316
315 317 /*
316 318 * Convenience macros for using the cqueue functions
317 319 * Note that the caller must have defined "dumpsync_t *ds"
318 320 */
319 321 #define CQ_IS_EMPTY(q) \
320 322 (ds->q.first == NULL)
321 323
322 324 #define CQ_OPEN(q) \
323 325 atomic_inc_uint(&ds->q.open)
324 326
325 327 #define CQ_CLOSE(q) \
326 328 dumpsys_close_cq(&ds->q, ds->live)
327 329
328 330 #define CQ_PUT(q, cp, st) \
329 331 dumpsys_put_cq(&ds->q, cp, st, ds->live)
330 332
331 333 #define CQ_GET(q) \
332 334 dumpsys_get_cq(&ds->q, ds->live)
333 335
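To make the dataflow table above concrete, here is a hedged sketch of a helper-side consumer loop (the real helper code is elided from this hunk; dumpsync_t and its queues are defined just below, and the sketch assumes CQ_GET() returns NULL once every producer has closed the queue):

    dumpsync_t *ds = &dumpsync;     /* the CQ_* macros require "ds" */
    cbuf_t *cp;

    while ((cp = CQ_GET(helperq)) != NULL) {
            /* ... compress the pages mapped at cp->buf ... */
            CQ_PUT(mainq, cp, CBUF_USEDMAP);    /* hand mapping back for unmap */
    }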
334 336 /*
335 337 * Dynamic state when dumpsys() is running.
336 338 */
337 339 typedef struct dumpsync {
338 340 pgcnt_t npages; /* subtotal of pages dumped */
339 341 pgcnt_t pages_mapped; /* subtotal of pages mapped */
340 342 pgcnt_t pages_used; /* subtotal of pages used per map */
341 343 size_t nwrite; /* subtotal of bytes written */
342 344 uint_t live; /* running live dump */
343 345 uint_t neednl; /* will need to print a newline */
344 346 uint_t percent; /* dump progress */
345 347 uint_t percent_done; /* dump progress reported */
346 348 int sec_done; /* dump progress last report time */
347 349 cqueue_t freebufq; /* free kmem bufs for writing */
348 350 cqueue_t mainq; /* input for main task */
349 351 cqueue_t helperq; /* input for helpers */
350 352 cqueue_t writerq; /* input for writer */
351 353 hrtime_t start; /* start time */
352 354 hrtime_t elapsed; /* elapsed time when completed */
353 355 hrtime_t iotime; /* time spent writing nwrite bytes */
354 356 hrtime_t iowait; /* time spent waiting for output */
355 357 hrtime_t iowaitts; /* iowait timestamp */
356 358 perpage_t perpage; /* metrics */
357 359 perpage_t perpagets;
358 360 int dumpcpu; /* master cpu */
359 361 } dumpsync_t;
360 362
361 363 static dumpsync_t dumpsync; /* synchronization vars */
362 364
363 365 /*
364 366 * helper_t helpers: contains the context for a stream. CPUs run in
365 367 * parallel at dump time; each CPU creates a single stream of
366 368 * compression data. Stream data is divided into CBUF_SIZE blocks.
367 369 * The blocks are written in order within a stream. But, blocks from
368 370 * multiple streams can be interleaved. Each stream is identified by a
369 371 * unique tag.
370 372 */
371 373 typedef struct helper {
372 374 int helper; /* bound helper id */
373 375 int tag; /* compression stream tag */
374 376 perpage_t perpage; /* per page metrics */
375 377 perpage_t perpagets; /* per page metrics (timestamps) */
376 378 taskqid_t taskqid; /* live dump task ptr */
377 379 int in, out; /* buffer offsets */
378 380 cbuf_t *cpin, *cpout, *cperr; /* cbuf objects in process */
379 381 dumpsync_t *ds; /* pointer to sync vars */
380 382 size_t used; /* counts input consumed */
381 383 char *page; /* buffer for page copy */
382 384 char *lzbuf; /* lzjb output */
383 385 bz_stream bzstream; /* bzip2 state */
384 386 } helper_t;
385 387
386 388 #define MAINHELPER (-1) /* helper is also the main task */
387 389 #define FREEHELPER (-2) /* unbound helper */
388 390 #define DONEHELPER (-3) /* helper finished */
389 391
390 392 /*
391 393 * configuration vars for dumpsys
392 394 */
393 395 typedef struct dumpcfg {
394 396 int threshold; /* ncpu threshold for bzip2 */
395 397 int nhelper; /* number of helpers */
396 398 int nhelper_used; /* actual number of helpers used */
397 399 int ncmap; /* number VA pages for compression */
398 400 int ncbuf; /* number of bufs for compression */
399 401 int ncbuf_used; /* number of bufs in use */
400 402 uint_t clevel; /* dump compression level */
401 403 helper_t *helper; /* array of helpers */
402 404 cbuf_t *cmap; /* array of input (map) buffers */
403 405 cbuf_t *cbuf; /* array of output buffers */
404 406 ulong_t *helpermap; /* set of dumpsys helper CPU ids */
405 407 ulong_t *bitmap; /* bitmap for marking pages to dump */
406 408 ulong_t *rbitmap; /* bitmap for used CBUF_MAPSIZE ranges */
407 409 pgcnt_t bitmapsize; /* size of bitmap */
408 410 pgcnt_t rbitmapsize; /* size of bitmap for ranges */
409 411 pgcnt_t found4m; /* number ranges allocated by dump */
410 412 pgcnt_t foundsm; /* number small pages allocated by dump */
411 413 pid_t *pids; /* list of process IDs at dump time */
412 414 size_t maxsize; /* memory size needed at dump time */
413 415 size_t maxvmsize; /* size of reserved VM */
414 416 char *maxvm; /* reserved VM for spare pages */
415 417 lock_t helper_lock; /* protect helper state */
416 418 char helpers_wanted; /* flag to enable parallelism */
417 419 } dumpcfg_t;
418 420
419 421 static dumpcfg_t dumpcfg; /* config vars */
420 422
421 423 /*
422 424 * The dump I/O buffer.
423 425 *
424 426 * There is one I/O buffer used by dumpvp_write and dumpvp_flush. It is
425 427 * sized according to the optimum device transfer speed.
426 428 */
427 429 typedef struct dumpbuf {
428 430 vnode_t *cdev_vp; /* VCHR open of the dump device */
429 431 len_t vp_limit; /* maximum write offset */
430 432 offset_t vp_off; /* current dump device offset */
431 433 char *cur; /* dump write pointer */
432 434 char *start; /* dump buffer address */
433 435 char *end; /* dump buffer end */
434 436 size_t size; /* size of dumpbuf in bytes */
435 437 size_t iosize; /* best transfer size for device */
436 438 } dumpbuf_t;
437 439
438 440 dumpbuf_t dumpbuf; /* I/O buffer */
439 441
440 442 /*
441 443 * The dump I/O buffer must be at least one page, at most xfer_size
442 444 * bytes, and should scale with physmem in between. The transfer size
443 445 * passed in will either represent a global default (maxphys) or the
444 446 * best size for the device. The size of the dumpbuf I/O buffer is
445 447 * limited by dumpbuf_limit (8MB by default) because the dump
446 448 * performance saturates beyond a certain size. The default is to
447 449 * select 1/4096 of the memory.
448 450 */
449 451 static int dumpbuf_fraction = 12; /* memory size scale factor */
450 452 static size_t dumpbuf_limit = 8 * DUMP_1MB; /* max I/O buf size */
451 453
452 454 static size_t
453 455 dumpbuf_iosize(size_t xfer_size)
454 456 {
455 457 size_t iosize = ptob(physmem >> dumpbuf_fraction);
456 458
457 459 if (iosize < PAGESIZE)
458 460 iosize = PAGESIZE;
459 461 else if (iosize > xfer_size)
460 462 iosize = xfer_size;
461 463 if (iosize > dumpbuf_limit)
462 464 iosize = dumpbuf_limit;
463 465 return (iosize & PAGEMASK);
464 466 }
465 467
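For a concrete sense of the scaling, a worked example (assuming 4 KB pages):

    /*
     * 8 GB of physmem is 2M pages; ptob(2M >> 12) == ptob(512) == 2 MB,
     * which lies between PAGESIZE and dumpbuf_limit, so iosize is 2 MB
     * (subject to the xfer_size cap). At 64 GB the computed 16 MB is
     * clipped to the 8 MB dumpbuf_limit.
     */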
466 468 /*
467 469 * resize the I/O buffer
468 470 */
469 471 static void
470 472 dumpbuf_resize(void)
471 473 {
472 474 char *old_buf = dumpbuf.start;
473 475 size_t old_size = dumpbuf.size;
474 476 char *new_buf;
475 477 size_t new_size;
476 478
477 479 ASSERT(MUTEX_HELD(&dump_lock));
478 480
479 481 new_size = dumpbuf_iosize(MAX(dumpbuf.iosize, maxphys));
480 482 if (new_size <= old_size)
481 483 return; /* no need to reallocate buffer */
482 484
483 485 new_buf = kmem_alloc(new_size, KM_SLEEP);
484 486 dumpbuf.size = new_size;
485 487 dumpbuf.start = new_buf;
486 488 dumpbuf.end = new_buf + new_size;
487 489 kmem_free(old_buf, old_size);
488 490 }
489 491
490 492 /*
491 493 * dump_update_clevel is called when dumpadm configures the dump device.
492 494 * Calculate number of helpers and buffers.
493 495 * Allocate the minimum configuration for now.
494 496 *
495 497 * When the dump file is configured we reserve a minimum amount of
496 498 * memory for use at crash time. But we reserve VA for all the memory
497 499 * we really want in order to do the fastest dump possible. The VA is
498 500 * backed by pages not being dumped, according to the bitmap. If
499 501 * there is insufficient spare memory, however, we fall back to the
500 502 * minimum.
501 503 *
502 504 * Live dump (savecore -L) always uses the minimum config.
503 505 *
504 506 * clevel 0 is single threaded lzjb
505 507 * clevel 1 is parallel lzjb
506 508 * clevel 2 is parallel bzip2
507 509 *
508 510 * The ncpu threshold is selected with dump_plat_mincpu.
509 511 * On OPL, set_platform_defaults() overrides the sun4u setting.
510 512 * The actual values are defined via DUMP_PLAT_*_MINCPU macros.
511 513 *
512 514 * Architecture Threshold Algorithm
513 515 * sun4u < 51 parallel lzjb
514 516 * sun4u >= 51 parallel bzip2(*)
515 517 * sun4u OPL < 8 parallel lzjb
516 518 * sun4u OPL >= 8 parallel bzip2(*)
517 519 * sun4v < 128 parallel lzjb
518 520 * sun4v >= 128 parallel bzip2(*)
519 521 * x86 < 11 parallel lzjb
520 522 * x86 >= 11 parallel bzip2(*)
521 523 * 32-bit N/A single-threaded lzjb
522 524 *
523 525 * (*) bzip2 is only chosen if there is sufficient available
524 526 * memory for buffers at dump time. See dumpsys_get_maxmem().
525 527 *
526 528 * Faster dump devices have larger I/O buffers. The threshold value is
527 529 * increased according to the size of the dump I/O buffer, because
528 530 * parallel lzjb performs better with faster disks. For buffers >= 1MB
529 531 * the threshold is 3X; for buffers >= 256K the threshold is 2X.
530 532 *
531 533 * For parallel dumps, the number of helpers is ncpu-1. The CPU
532 534 * running panic runs the main task. For single-threaded dumps, the
533 535 * panic CPU does lzjb compression (it is tagged as MAINHELPER.)
534 536 *
535 537 * Need multiple buffers per helper so that they do not block waiting
536 538 * for the main task.
537 539 * parallel single-threaded
538 540 * Number of output buffers: nhelper*2 1
539 541 * Number of mapping buffers: nhelper*4 1
540 542 *
541 543 */
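As a worked instance of the threshold scaling (numbers taken from the table above; illustrative only):

    /*
     * x86 with dump_plat_mincpu == 11 and a >= 1 MB dump I/O buffer:
     * threshold = 11 * 3 = 33. A 32-CPU system has nhelper = 31, and
     * 31 + 1 < 33, so it stays with parallel lzjb; 33 or more CPUs
     * would attempt parallel bzip2 (memory permitting).
     */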
542 544 static void
543 545 dump_update_clevel()
544 546 {
545 547 int tag;
546 548 size_t bz2size;
547 549 helper_t *hp, *hpend;
548 550 cbuf_t *cp, *cpend;
549 551 dumpcfg_t *old = &dumpcfg;
550 552 dumpcfg_t newcfg = *old;
551 553 dumpcfg_t *new = &newcfg;
552 554
553 555 ASSERT(MUTEX_HELD(&dump_lock));
554 556
555 557 /*
556 558 * Free the previously allocated bufs and VM.
557 559 */
558 560 if (old->helper != NULL) {
559 561
560 562 /* helpers */
561 563 hpend = &old->helper[old->nhelper];
562 564 for (hp = old->helper; hp != hpend; hp++) {
563 565 if (hp->lzbuf != NULL)
564 566 kmem_free(hp->lzbuf, PAGESIZE);
565 567 if (hp->page != NULL)
566 568 kmem_free(hp->page, PAGESIZE);
567 569 }
568 570 kmem_free(old->helper, old->nhelper * sizeof (helper_t));
569 571
570 572 /* VM space for mapping pages */
571 573 cpend = &old->cmap[old->ncmap];
572 574 for (cp = old->cmap; cp != cpend; cp++)
573 575 vmem_xfree(heap_arena, cp->buf, CBUF_MAPSIZE);
574 576 kmem_free(old->cmap, old->ncmap * sizeof (cbuf_t));
575 577
576 578 /* output bufs */
577 579 cpend = &old->cbuf[old->ncbuf];
578 580 for (cp = old->cbuf; cp != cpend; cp++)
579 581 if (cp->buf != NULL)
580 582 kmem_free(cp->buf, cp->size);
581 583 kmem_free(old->cbuf, old->ncbuf * sizeof (cbuf_t));
582 584
583 585 /* reserved VM for dumpsys_get_maxmem */
584 586 if (old->maxvmsize > 0)
585 587 vmem_xfree(heap_arena, old->maxvm, old->maxvmsize);
586 588 }
587 589
588 590 /*
589 591 * Allocate memory and VM.
590 592 * One CPU runs dumpsys, the rest are helpers.
591 593 */
592 594 new->nhelper = ncpus - 1;
593 595 if (new->nhelper < 1)
594 596 new->nhelper = 1;
595 597
596 598 if (new->nhelper > DUMP_MAX_NHELPER)
597 599 new->nhelper = DUMP_MAX_NHELPER;
598 600
599 601 /* use platform default, unless /etc/system overrides */
600 602 if (dump_plat_mincpu == MINCPU_NOT_SET)
601 603 dump_plat_mincpu = dump_plat_mincpu_default;
602 604
603 605 /* increase threshold for faster disks */
604 606 new->threshold = dump_plat_mincpu;
605 607 if (dumpbuf.iosize >= DUMP_1MB)
606 608 new->threshold *= 3;
607 609 else if (dumpbuf.iosize >= (256 * DUMP_1KB))
608 610 new->threshold *= 2;
609 611
610 612 /* figure compression level based upon the computed threshold. */
611 613 if (dump_plat_mincpu == 0 || new->nhelper < 2) {
612 614 new->clevel = 0;
613 615 new->nhelper = 1;
614 616 } else if ((new->nhelper + 1) >= new->threshold) {
615 617 new->clevel = DUMP_CLEVEL_BZIP2;
616 618 } else {
617 619 new->clevel = DUMP_CLEVEL_LZJB;
618 620 }
619 621
620 622 if (new->clevel == 0) {
621 623 new->ncbuf = 1;
622 624 new->ncmap = 1;
623 625 } else {
624 626 new->ncbuf = NCBUF_PER_HELPER * new->nhelper;
625 627 new->ncmap = NCMAP_PER_HELPER * new->nhelper;
626 628 }
627 629
628 630 /*
629 631 * Allocate new data structures and buffers for MINHELPERS,
630 632 * and also figure the max desired size.
631 633 */
632 634 bz2size = BZ2_bzCompressInitSize(dump_bzip2_level);
633 635 new->maxsize = 0;
634 636 new->maxvmsize = 0;
635 637 new->maxvm = NULL;
636 638 tag = 1;
637 639 new->helper = kmem_zalloc(new->nhelper * sizeof (helper_t), KM_SLEEP);
638 640 hpend = &new->helper[new->nhelper];
639 641 for (hp = new->helper; hp != hpend; hp++) {
640 642 hp->tag = tag++;
641 643 if (hp < &new->helper[MINHELPERS]) {
642 644 hp->lzbuf = kmem_alloc(PAGESIZE, KM_SLEEP);
643 645 hp->page = kmem_alloc(PAGESIZE, KM_SLEEP);
644 646 } else if (new->clevel < DUMP_CLEVEL_BZIP2) {
645 647 new->maxsize += 2 * PAGESIZE;
646 648 } else {
647 649 new->maxsize += PAGESIZE;
648 650 }
649 651 if (new->clevel >= DUMP_CLEVEL_BZIP2)
650 652 new->maxsize += bz2size;
651 653 }
652 654
653 655 new->cbuf = kmem_zalloc(new->ncbuf * sizeof (cbuf_t), KM_SLEEP);
654 656 cpend = &new->cbuf[new->ncbuf];
655 657 for (cp = new->cbuf; cp != cpend; cp++) {
656 658 cp->state = CBUF_FREEBUF;
657 659 cp->size = CBUF_SIZE;
658 660 if (cp < &new->cbuf[MINCBUFS])
659 661 cp->buf = kmem_alloc(cp->size, KM_SLEEP);
660 662 else
661 663 new->maxsize += cp->size;
662 664 }
663 665
664 666 new->cmap = kmem_zalloc(new->ncmap * sizeof (cbuf_t), KM_SLEEP);
665 667 cpend = &new->cmap[new->ncmap];
666 668 for (cp = new->cmap; cp != cpend; cp++) {
667 669 cp->state = CBUF_FREEMAP;
668 670 cp->size = CBUF_MAPSIZE;
669 671 cp->buf = vmem_xalloc(heap_arena, CBUF_MAPSIZE, CBUF_MAPSIZE,
670 672 0, 0, NULL, NULL, VM_SLEEP);
671 673 }
672 674
673 675 /* reserve VA to be backed with spare pages at crash time */
674 676 if (new->maxsize > 0) {
675 677 new->maxsize = P2ROUNDUP(new->maxsize, PAGESIZE);
676 678 new->maxvmsize = P2ROUNDUP(new->maxsize, CBUF_MAPSIZE);
677 679 new->maxvm = vmem_xalloc(heap_arena, new->maxvmsize,
678 680 CBUF_MAPSIZE, 0, 0, NULL, NULL, VM_SLEEP);
679 681 }
680 682
681 683 /*
682 - * Reserve memory for kmem allocation calls made during crash
683 - * dump. The hat layer allocates memory for each mapping
684 - * created, and the I/O path allocates buffers and data structs.
685 - * Add a few pages for safety.
684 + * Reserve memory for kmem allocation calls made during crash dump. The
685 + * hat layer allocates memory for each mapping created, and the I/O path
686 + * allocates buffers and data structs.
687 + *
688 + * On larger systems, we easily exceed the lower amount, so we need some
689 + * more space; the cut-over point is relatively arbitrary. If we run
690 + * out, the only impact is that kmem state in the dump becomes
691 + * inconsistent.
686 692 */
693 +
694 + if (dump_kmem_pages == 0) {
695 + if (physmem > (16 * ONE_GIG) / PAGESIZE)
696 + dump_kmem_pages = 20;
697 + else
698 + dump_kmem_pages = 8;
699 + }
700 +
687 701 kmem_dump_init((new->ncmap * dump_kmem_permap) +
688 702 (dump_kmem_pages * PAGESIZE));
689 703
690 704 /* set new config pointers */
691 705 *old = *new;
692 706 }
693 707
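For scale, a worked example of the reserve requested by the kmem_dump_init() call above (the ncmap value is hypothetical; assumes 4 KB pages):

    /*
     * With ncmap == 64: a > 16 GB system reserves 64 * 1024 + 20 * 4096
     * bytes (144 KB); a smaller system reserves 64 * 1024 + 8 * 4096
     * bytes (96 KB).
     */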
694 708 /*
695 709 * Define a struct memlist walker to optimize bitnum to pfn
696 710 * lookup. The walker maintains the state of the list traversal.
697 711 */
698 712 typedef struct dumpmlw {
699 713 struct memlist *mp; /* current memlist */
700 714 pgcnt_t basenum; /* bitnum base offset */
701 715 pgcnt_t mppages; /* current memlist size */
702 716 pgcnt_t mpleft; /* size to end of current memlist */
703 717 pfn_t mpaddr; /* first pfn in memlist */
704 718 } dumpmlw_t;
705 719
706 720 /* initialize the walker */
707 721 static inline void
708 722 dump_init_memlist_walker(dumpmlw_t *pw)
709 723 {
710 724 pw->mp = phys_install;
711 725 pw->basenum = 0;
712 726 pw->mppages = pw->mp->ml_size >> PAGESHIFT;
713 727 pw->mpleft = pw->mppages;
714 728 pw->mpaddr = pw->mp->ml_address >> PAGESHIFT;
715 729 }
716 730
717 731 /*
718 732 * Lookup pfn given bitnum. The memlist can be quite long on some
719 733 * systems (e.g.: one per board). To optimize sequential lookups, the
720 734 * caller initializes and presents a memlist walker.
721 735 */
722 736 static pfn_t
723 737 dump_bitnum_to_pfn(pgcnt_t bitnum, dumpmlw_t *pw)
724 738 {
725 739 bitnum -= pw->basenum;
726 740 while (pw->mp != NULL) {
727 741 if (bitnum < pw->mppages) {
728 742 pw->mpleft = pw->mppages - bitnum;
729 743 return (pw->mpaddr + bitnum);
730 744 }
731 745 bitnum -= pw->mppages;
732 746 pw->basenum += pw->mppages;
733 747 pw->mp = pw->mp->ml_next;
734 748 if (pw->mp != NULL) {
735 749 pw->mppages = pw->mp->ml_size >> PAGESHIFT;
736 750 pw->mpleft = pw->mppages;
737 751 pw->mpaddr = pw->mp->ml_address >> PAGESHIFT;
738 752 }
739 753 }
740 754 return (PFN_INVALID);
741 755 }
742 756
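A hedged sketch of the intended sequential use, mirroring the loops that appear later in this file:

    dumpmlw_t mlw;
    pgcnt_t bitnum;

    dump_init_memlist_walker(&mlw);
    for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum++) {
            pfn_t pfn = dump_bitnum_to_pfn(bitnum, &mlw);
            /* ... use pfn; PFN_INVALID only past the end of the list ... */
    }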
743 757 static pgcnt_t
744 758 dump_pfn_to_bitnum(pfn_t pfn)
745 759 {
746 760 struct memlist *mp;
747 761 pgcnt_t bitnum = 0;
748 762
749 763 for (mp = phys_install; mp != NULL; mp = mp->ml_next) {
750 764 if (pfn >= (mp->ml_address >> PAGESHIFT) &&
751 765 pfn < ((mp->ml_address + mp->ml_size) >> PAGESHIFT))
752 766 return (bitnum + pfn - (mp->ml_address >> PAGESHIFT));
753 767 bitnum += mp->ml_size >> PAGESHIFT;
754 768 }
755 769 return ((pgcnt_t)-1);
756 770 }
757 771
758 772 /*
759 773 * Set/test bitmap for a CBUF_MAPSIZE range which includes pfn. The
760 774 * mapping of pfn to range index is imperfect because pfn and bitnum
761 775 * do not have the same phase. To make sure a CBUF_MAPSIZE range is
762 776 * covered, call this for both ends:
763 777 * dump_set_used(base)
764 778 * dump_set_used(base+CBUF_MAPNP-1)
765 779 *
766 780 * This is used during a panic dump to mark pages allocated by
767 781 * dumpsys_get_maxmem(). The macro IS_DUMP_PAGE(pp) is used by
768 782 * page_get_mnode_freelist() to make sure pages used by dump are never
769 783 * allocated.
770 784 */
771 785 #define CBUF_MAPP2R(pfn) ((pfn) >> (CBUF_MAPSHIFT - PAGESHIFT))
772 786
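To make the phase issue concrete, a worked example (4 KB pages):

    /*
     * CBUF_MAPP2R shifts by 22 - 12 = 10 bits, so one range bit covers
     * 1024 bitnums. A CBUF_MAPNP (1024-page) buffer whose first bitnum
     * is 1500 spans range bits 1500 >> 10 == 1 and 2523 >> 10 == 2,
     * which is why both ends of the range are marked.
     */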
773 787 static void
774 788 dump_set_used(pfn_t pfn)
775 789 {
776 790
777 791 pgcnt_t bitnum, rbitnum;
778 792
779 793 bitnum = dump_pfn_to_bitnum(pfn);
780 794 ASSERT(bitnum != (pgcnt_t)-1);
781 795
782 796 rbitnum = CBUF_MAPP2R(bitnum);
783 797 ASSERT(rbitnum < dumpcfg.rbitmapsize);
784 798
785 799 BT_SET(dumpcfg.rbitmap, rbitnum);
786 800 }
787 801
788 802 int
789 803 dump_test_used(pfn_t pfn)
790 804 {
791 805 pgcnt_t bitnum, rbitnum;
792 806
793 807 bitnum = dump_pfn_to_bitnum(pfn);
794 808 ASSERT(bitnum != (pgcnt_t)-1);
795 809
796 810 rbitnum = CBUF_MAPP2R(bitnum);
797 811 ASSERT(rbitnum < dumpcfg.rbitmapsize);
798 812
799 813 return (BT_TEST(dumpcfg.rbitmap, rbitnum));
800 814 }
801 815
802 816 /*
803 817 * dumpbzalloc and dumpbzfree are callbacks from the bzip2 library.
804 818 * dumpsys_get_maxmem() uses them for BZ2_bzCompressInit().
805 819 */
806 820 static void *
807 821 dumpbzalloc(void *opaque, int items, int size)
808 822 {
809 823 size_t *sz;
810 824 char *ret;
811 825
812 826 ASSERT(opaque != NULL);
813 827 sz = opaque;
814 828 ret = dumpcfg.maxvm + *sz;
815 829 *sz += items * size;
816 830 *sz = P2ROUNDUP(*sz, BZ2_BZALLOC_ALIGN);
817 831 ASSERT(*sz <= dumpcfg.maxvmsize);
818 832 return (ret);
819 833 }
820 834
821 835 /*ARGSUSED*/
822 836 static void
823 837 dumpbzfree(void *opaque, void *addr)
824 838 {
825 839 }
826 840
827 841 /*
828 842 * Perform additional checks on the page to see if we can really use
829 843 * it. The kernel (kas) pages are always set in the bitmap. However,
830 844 * boot memory pages (prom_ppages or P_BOOTPAGES) are not in the
831 845 * bitmap. So we check for them.
832 846 */
833 847 static inline int
834 848 dump_pfn_check(pfn_t pfn)
835 849 {
836 850 page_t *pp = page_numtopp_nolock(pfn);
837 851 if (pp == NULL || pp->p_pagenum != pfn ||
838 852 #if defined(__sparc)
839 853 pp->p_vnode == &promvp ||
840 854 #else
841 855 PP_ISBOOTPAGES(pp) ||
842 856 #endif
843 857 pp->p_toxic != 0)
844 858 return (0);
845 859 return (1);
846 860 }
847 861
848 862 /*
849 863 * Check a range to see if all contained pages are available and
850 864 * return non-zero if the range can be used.
851 865 */
852 866 static inline int
853 867 dump_range_check(pgcnt_t start, pgcnt_t end, pfn_t pfn)
854 868 {
855 869 for (; start < end; start++, pfn++) {
856 870 if (BT_TEST(dumpcfg.bitmap, start))
857 871 return (0);
858 872 if (!dump_pfn_check(pfn))
859 873 return (0);
860 874 }
861 875 return (1);
862 876 }
863 877
864 878 /*
865 879 * dumpsys_get_maxmem() is called during panic. Find unused ranges
866 880 * and use them for buffers. If we find enough memory switch to
867 881 * parallel bzip2, otherwise use parallel lzjb.
868 882 *
869 883 * It searches the dump bitmap in 2 passes. The first time it looks
870 884 * for CBUF_MAPSIZE ranges. On the second pass it uses small pages.
871 885 */
872 886 static void
873 887 dumpsys_get_maxmem()
874 888 {
875 889 dumpcfg_t *cfg = &dumpcfg;
876 890 cbuf_t *endcp = &cfg->cbuf[cfg->ncbuf];
877 891 helper_t *endhp = &cfg->helper[cfg->nhelper];
878 892 pgcnt_t bitnum, end;
879 893 size_t sz, endsz, bz2size;
880 894 pfn_t pfn, off;
881 895 cbuf_t *cp;
882 896 helper_t *hp, *ohp;
883 897 dumpmlw_t mlw;
884 898 int k;
885 899
886 900 /*
887 901 * Setting dump_plat_mincpu to 0 at any time forces a serial
888 902 * dump.
889 903 */
890 904 if (dump_plat_mincpu == 0) {
891 905 cfg->clevel = 0;
892 906 return;
893 907 }
894 908
895 909 /*
896 910 * There may be no point in looking for spare memory. If
897 911 * dumping all memory, then none is spare. If doing a serial
898 912 * dump, then we already have buffers.
899 913 */
900 914 if (cfg->maxsize == 0 || cfg->clevel < DUMP_CLEVEL_LZJB ||
901 915 (dump_conflags & DUMP_ALL) != 0) {
902 916 if (cfg->clevel > DUMP_CLEVEL_LZJB)
903 917 cfg->clevel = DUMP_CLEVEL_LZJB;
904 918 return;
905 919 }
906 920
907 921 sz = 0;
908 922 cfg->found4m = 0;
909 923 cfg->foundsm = 0;
910 924
911 925 /* bitmap of ranges used to estimate which pfns are being used */
912 926 bzero(dumpcfg.rbitmap, BT_SIZEOFMAP(dumpcfg.rbitmapsize));
913 927
914 928 /* find ranges that are not being dumped to use for buffers */
915 929 dump_init_memlist_walker(&mlw);
916 930 for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum = end) {
917 931 dump_timeleft = dump_timeout;
918 932 end = bitnum + CBUF_MAPNP;
919 933 pfn = dump_bitnum_to_pfn(bitnum, &mlw);
920 934 ASSERT(pfn != PFN_INVALID);
921 935
922 936 /* skip partial range at end of mem segment */
923 937 if (mlw.mpleft < CBUF_MAPNP) {
924 938 end = bitnum + mlw.mpleft;
925 939 continue;
926 940 }
927 941
928 942 /* skip non aligned pages */
929 943 off = P2PHASE(pfn, CBUF_MAPNP);
930 944 if (off != 0) {
931 945 end -= off;
932 946 continue;
933 947 }
934 948
935 949 if (!dump_range_check(bitnum, end, pfn))
936 950 continue;
937 951
938 952 ASSERT((sz + CBUF_MAPSIZE) <= cfg->maxvmsize);
939 953 hat_devload(kas.a_hat, cfg->maxvm + sz, CBUF_MAPSIZE, pfn,
940 954 PROT_READ | PROT_WRITE, HAT_LOAD_NOCONSIST);
941 955 sz += CBUF_MAPSIZE;
942 956 cfg->found4m++;
943 957
944 958 /* set the bitmap for both ends to be sure to cover the range */
945 959 dump_set_used(pfn);
946 960 dump_set_used(pfn + CBUF_MAPNP - 1);
947 961
948 962 if (sz >= cfg->maxsize)
949 963 goto foundmax;
950 964 }
951 965
952 966 /* Add small pages if we can't find enough large pages. */
953 967 dump_init_memlist_walker(&mlw);
954 968 for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum = end) {
955 969 dump_timeleft = dump_timeout;
956 970 end = bitnum + CBUF_MAPNP;
957 971 pfn = dump_bitnum_to_pfn(bitnum, &mlw);
958 972 ASSERT(pfn != PFN_INVALID);
959 973
960 974 /* Find any non-aligned pages at start and end of segment. */
961 975 off = P2PHASE(pfn, CBUF_MAPNP);
962 976 if (mlw.mpleft < CBUF_MAPNP) {
963 977 end = bitnum + mlw.mpleft;
964 978 } else if (off != 0) {
965 979 end -= off;
966 980 } else if (cfg->found4m && dump_test_used(pfn)) {
967 981 continue;
968 982 }
969 983
970 984 for (; bitnum < end; bitnum++, pfn++) {
971 985 dump_timeleft = dump_timeout;
972 986 if (BT_TEST(dumpcfg.bitmap, bitnum))
973 987 continue;
974 988 if (!dump_pfn_check(pfn))
975 989 continue;
976 990 ASSERT((sz + PAGESIZE) <= cfg->maxvmsize);
977 991 hat_devload(kas.a_hat, cfg->maxvm + sz, PAGESIZE, pfn,
978 992 PROT_READ | PROT_WRITE, HAT_LOAD_NOCONSIST);
979 993 sz += PAGESIZE;
980 994 cfg->foundsm++;
981 995 dump_set_used(pfn);
982 996 if (sz >= cfg->maxsize)
983 997 goto foundmax;
984 998 }
985 999 }
986 1000
987 1001 /* Fall back to lzjb if we did not get enough memory for bzip2. */
988 1002 endsz = (cfg->maxsize * cfg->threshold) / cfg->nhelper;
989 1003 if (sz < endsz) {
990 1004 cfg->clevel = DUMP_CLEVEL_LZJB;
991 1005 }
992 1006
993 1007 /* Allocate memory for as many helpers as we can. */
994 1008 foundmax:
995 1009
996 1010 /* Byte offsets into memory found and mapped above */
997 1011 endsz = sz;
998 1012 sz = 0;
999 1013
1000 1014 /* Set the size for bzip2 state. Only bzip2 needs it. */
1001 1015 bz2size = BZ2_bzCompressInitSize(dump_bzip2_level);
1002 1016
1003 1017 /* Skip the preallocated output buffers. */
1004 1018 cp = &cfg->cbuf[MINCBUFS];
1005 1019
1006 1020 /* Use this to move memory up from the preallocated helpers. */
1007 1021 ohp = cfg->helper;
1008 1022
1009 1023 /* Loop over all helpers and allocate memory. */
1010 1024 for (hp = cfg->helper; hp < endhp; hp++) {
1011 1025
1012 1026 /* Skip preallocated helpers by checking hp->page. */
1013 1027 if (hp->page == NULL) {
1014 1028 if (cfg->clevel <= DUMP_CLEVEL_LZJB) {
1015 1029 /* lzjb needs 2 1-page buffers */
1016 1030 if ((sz + (2 * PAGESIZE)) > endsz)
1017 1031 break;
1018 1032 hp->page = cfg->maxvm + sz;
1019 1033 sz += PAGESIZE;
1020 1034 hp->lzbuf = cfg->maxvm + sz;
1021 1035 sz += PAGESIZE;
1022 1036
1023 1037 } else if (ohp->lzbuf != NULL) {
1024 1038 /* re-use the preallocated lzjb page for bzip2 */
1025 1039 hp->page = ohp->lzbuf;
1026 1040 ohp->lzbuf = NULL;
1027 1041 ++ohp;
1028 1042
1029 1043 } else {
1030 1044 /* bzip2 needs a 1-page buffer */
1031 1045 if ((sz + PAGESIZE) > endsz)
1032 1046 break;
1033 1047 hp->page = cfg->maxvm + sz;
1034 1048 sz += PAGESIZE;
1035 1049 }
1036 1050 }
1037 1051
1038 1052 /*
1039 1053 * Add output buffers per helper. The number of
1040 1054 * buffers per helper is determined by the ratio of
1041 1055 * ncbuf to nhelper.
1042 1056 */
1043 1057 for (k = 0; cp < endcp && (sz + CBUF_SIZE) <= endsz &&
1044 1058 k < NCBUF_PER_HELPER; k++) {
1045 1059 cp->state = CBUF_FREEBUF;
1046 1060 cp->size = CBUF_SIZE;
1047 1061 cp->buf = cfg->maxvm + sz;
1048 1062 sz += CBUF_SIZE;
1049 1063 ++cp;
1050 1064 }
1051 1065
1052 1066 /*
1053 1067 * bzip2 needs compression state. Use the dumpbzalloc
1054 1068 * and dumpbzfree callbacks to allocate the memory.
1055 1069 * bzip2 does allocation only at init time.
1056 1070 */
1057 1071 if (cfg->clevel >= DUMP_CLEVEL_BZIP2) {
1058 1072 if ((sz + bz2size) > endsz) {
1059 1073 hp->page = NULL;
1060 1074 break;
1061 1075 } else {
1062 1076 hp->bzstream.opaque = &sz;
1063 1077 hp->bzstream.bzalloc = dumpbzalloc;
1064 1078 hp->bzstream.bzfree = dumpbzfree;
1065 1079 (void) BZ2_bzCompressInit(&hp->bzstream,
1066 1080 dump_bzip2_level, 0, 0);
1067 1081 hp->bzstream.opaque = NULL;
1068 1082 }
1069 1083 }
1070 1084 }
1071 1085
1072 1086 /* Finish allocating output buffers */
1073 1087 for (; cp < endcp && (sz + CBUF_SIZE) <= endsz; cp++) {
1074 1088 cp->state = CBUF_FREEBUF;
1075 1089 cp->size = CBUF_SIZE;
1076 1090 cp->buf = cfg->maxvm + sz;
1077 1091 sz += CBUF_SIZE;
1078 1092 }
1079 1093
1080 1094 /* Enable IS_DUMP_PAGE macro, which checks for pages we took. */
1081 1095 if (cfg->found4m || cfg->foundsm)
1082 1096 dump_check_used = 1;
1083 1097
1084 1098 ASSERT(sz <= endsz);
1085 1099 }
1086 1100
1087 1101 static void
1088 1102 dumphdr_init(void)
1089 1103 {
1090 1104 pgcnt_t npages = 0;
1091 1105
1092 1106 ASSERT(MUTEX_HELD(&dump_lock));
1093 1107
1094 1108 if (dumphdr == NULL) {
1095 1109 dumphdr = kmem_zalloc(sizeof (dumphdr_t), KM_SLEEP);
1096 1110 dumphdr->dump_magic = DUMP_MAGIC;
1097 1111 dumphdr->dump_version = DUMP_VERSION;
1098 1112 dumphdr->dump_wordsize = DUMP_WORDSIZE;
1099 1113 dumphdr->dump_pageshift = PAGESHIFT;
1100 1114 dumphdr->dump_pagesize = PAGESIZE;
1101 1115 dumphdr->dump_utsname = utsname;
1102 1116 (void) strcpy(dumphdr->dump_platform, platform);
1103 1117 dumpbuf.size = dumpbuf_iosize(maxphys);
1104 1118 dumpbuf.start = kmem_alloc(dumpbuf.size, KM_SLEEP);
1105 1119 dumpbuf.end = dumpbuf.start + dumpbuf.size;
1106 1120 dumpcfg.pids = kmem_alloc(v.v_proc * sizeof (pid_t), KM_SLEEP);
1107 1121 dumpcfg.helpermap = kmem_zalloc(BT_SIZEOFMAP(NCPU), KM_SLEEP);
1108 1122 LOCK_INIT_HELD(&dumpcfg.helper_lock);
1109 1123 dump_stack_scratch = kmem_alloc(STACK_BUF_SIZE, KM_SLEEP);
1110 1124 (void) strncpy(dumphdr->dump_uuid, dump_get_uuid(),
1111 1125 sizeof (dumphdr->dump_uuid));
1112 1126 }
1113 1127
1114 1128 npages = num_phys_pages();
1115 1129
1116 1130 if (dumpcfg.bitmapsize != npages) {
1117 1131 size_t rlen = CBUF_MAPP2R(P2ROUNDUP(npages, CBUF_MAPNP));
1118 1132 void *map = kmem_alloc(BT_SIZEOFMAP(npages), KM_SLEEP);
1119 1133 void *rmap = kmem_alloc(BT_SIZEOFMAP(rlen), KM_SLEEP);
1120 1134
1121 1135 if (dumpcfg.bitmap != NULL)
1122 1136 kmem_free(dumpcfg.bitmap, BT_SIZEOFMAP(dumpcfg.
1123 1137 bitmapsize));
1124 1138 if (dumpcfg.rbitmap != NULL)
1125 1139 kmem_free(dumpcfg.rbitmap, BT_SIZEOFMAP(dumpcfg.
1126 1140 rbitmapsize));
1127 1141 dumpcfg.bitmap = map;
1128 1142 dumpcfg.bitmapsize = npages;
1129 1143 dumpcfg.rbitmap = rmap;
1130 1144 dumpcfg.rbitmapsize = rlen;
1131 1145 }
1132 1146 }
1133 1147
1134 1148 /*
1135 1149 * Establish a new dump device.
1136 1150 */
1137 1151 int
1138 1152 dumpinit(vnode_t *vp, char *name, int justchecking)
1139 1153 {
1140 1154 vnode_t *cvp;
1141 1155 vattr_t vattr;
1142 1156 vnode_t *cdev_vp;
1143 1157 int error = 0;
1144 1158
1145 1159 ASSERT(MUTEX_HELD(&dump_lock));
1146 1160
1147 1161 dumphdr_init();
1148 1162
1149 1163 cvp = common_specvp(vp);
1150 1164 if (cvp == dumpvp)
1151 1165 return (0);
1152 1166
1153 1167 /*
1154 1168 * Determine whether this is a plausible dump device. We want either:
1155 1169 * (1) a real device that's not mounted and has a cb_dump routine, or
1156 1170 * (2) a swapfile on some filesystem that has a vop_dump routine.
1157 1171 */
1158 1172 if ((error = VOP_OPEN(&cvp, FREAD | FWRITE, kcred, NULL)) != 0)
1159 1173 return (error);
1160 1174
1161 1175 vattr.va_mask = AT_SIZE | AT_TYPE | AT_RDEV;
1162 1176 if ((error = VOP_GETATTR(cvp, &vattr, 0, kcred, NULL)) == 0) {
1163 1177 if (vattr.va_type == VBLK || vattr.va_type == VCHR) {
1164 1178 if (devopsp[getmajor(vattr.va_rdev)]->
1165 1179 devo_cb_ops->cb_dump == nodev)
1166 1180 error = ENOTSUP;
1167 1181 else if (vfs_devismounted(vattr.va_rdev))
1168 1182 error = EBUSY;
1169 1183 if (strcmp(ddi_driver_name(VTOS(cvp)->s_dip),
1170 1184 ZFS_DRIVER) == 0 &&
1171 1185 IS_SWAPVP(common_specvp(cvp)))
1172 1186 error = EBUSY;
1173 1187 } else {
1174 1188 if (vn_matchopval(cvp, VOPNAME_DUMP, fs_nosys) ||
1175 1189 !IS_SWAPVP(cvp))
1176 1190 error = ENOTSUP;
1177 1191 }
1178 1192 }
1179 1193
1180 1194 if (error == 0 && vattr.va_size < 2 * DUMP_LOGSIZE + DUMP_ERPTSIZE)
1181 1195 error = ENOSPC;
1182 1196
1183 1197 if (error || justchecking) {
1184 1198 (void) VOP_CLOSE(cvp, FREAD | FWRITE, 1, (offset_t)0,
1185 1199 kcred, NULL);
1186 1200 return (error);
1187 1201 }
1188 1202
1189 1203 VN_HOLD(cvp);
1190 1204
1191 1205 if (dumpvp != NULL)
1192 1206 dumpfini(); /* unconfigure the old dump device */
1193 1207
1194 1208 dumpvp = cvp;
1195 1209 dumpvp_size = vattr.va_size & -DUMP_OFFSET;
1196 1210 dumppath = kmem_alloc(strlen(name) + 1, KM_SLEEP);
1197 1211 (void) strcpy(dumppath, name);
1198 1212 dumpbuf.iosize = 0;
1199 1213
1200 1214 /*
1201 1215 * If the dump device is a block device, attempt to open up the
1202 1216 * corresponding character device and determine its maximum transfer
1203 1217 * size. We use this information to potentially resize dumpbuf to a
1204 1218 * larger and more optimal size for performing i/o to the dump device.
1205 1219 */
1206 1220 if (cvp->v_type == VBLK &&
1207 1221 (cdev_vp = makespecvp(VTOS(cvp)->s_dev, VCHR)) != NULL) {
1208 1222 if (VOP_OPEN(&cdev_vp, FREAD | FWRITE, kcred, NULL) == 0) {
1209 1223 size_t blk_size;
1210 1224 struct dk_cinfo dki;
1211 1225 struct dk_minfo minf;
1212 1226
1213 1227 if (VOP_IOCTL(cdev_vp, DKIOCGMEDIAINFO,
1214 1228 (intptr_t)&minf, FKIOCTL, kcred, NULL, NULL)
1215 1229 == 0 && minf.dki_lbsize != 0)
1216 1230 blk_size = minf.dki_lbsize;
1217 1231 else
1218 1232 blk_size = DEV_BSIZE;
1219 1233
1220 1234 if (VOP_IOCTL(cdev_vp, DKIOCINFO, (intptr_t)&dki,
1221 1235 FKIOCTL, kcred, NULL, NULL) == 0) {
1222 1236 dumpbuf.iosize = dki.dki_maxtransfer * blk_size;
1223 1237 dumpbuf_resize();
1224 1238 }
1225 1239 /*
1226 1240 * If we are working with a zvol then dumpify it
1227 1241 * if it's not being used as swap.
1228 1242 */
1229 1243 if (strcmp(dki.dki_dname, ZVOL_DRIVER) == 0) {
1230 1244 if (IS_SWAPVP(common_specvp(cvp)))
1231 1245 error = EBUSY;
1232 1246 else if ((error = VOP_IOCTL(cdev_vp,
1233 1247 DKIOCDUMPINIT, NULL, FKIOCTL, kcred,
1234 1248 NULL, NULL)) != 0)
1235 1249 dumpfini();
1236 1250 }
1237 1251
1238 1252 (void) VOP_CLOSE(cdev_vp, FREAD | FWRITE, 1, 0,
1239 1253 kcred, NULL);
1240 1254 }
1241 1255
1242 1256 VN_RELE(cdev_vp);
1243 1257 }
1244 1258
1245 1259 cmn_err(CE_CONT, "?dump on %s size %llu MB\n", name, dumpvp_size >> 20);
1246 1260
1247 1261 dump_update_clevel();
1248 1262
1249 1263 return (error);
1250 1264 }
1251 1265
1252 1266 void
1253 1267 dumpfini(void)
1254 1268 {
1255 1269 vattr_t vattr;
1256 1270 boolean_t is_zfs = B_FALSE;
1257 1271 vnode_t *cdev_vp;
1258 1272 ASSERT(MUTEX_HELD(&dump_lock));
1259 1273
1260 1274 kmem_free(dumppath, strlen(dumppath) + 1);
1261 1275
1262 1276 /*
1263 1277 * Determine if we are using zvols for our dump device
1264 1278 */
1265 1279 vattr.va_mask = AT_RDEV;
1266 1280 if (VOP_GETATTR(dumpvp, &vattr, 0, kcred, NULL) == 0) {
1267 1281 is_zfs = (getmajor(vattr.va_rdev) ==
1268 1282 ddi_name_to_major(ZFS_DRIVER)) ? B_TRUE : B_FALSE;
1269 1283 }
1270 1284
1271 1285 /*
1272 1286 * If we have a zvol dump device then we call into zfs so
1273 1287 * that it may have a chance to cleanup.
1274 1288 */
1275 1289 if (is_zfs &&
1276 1290 (cdev_vp = makespecvp(VTOS(dumpvp)->s_dev, VCHR)) != NULL) {
1277 1291 if (VOP_OPEN(&cdev_vp, FREAD | FWRITE, kcred, NULL) == 0) {
1278 1292 (void) VOP_IOCTL(cdev_vp, DKIOCDUMPFINI, NULL, FKIOCTL,
1279 1293 kcred, NULL, NULL);
1280 1294 (void) VOP_CLOSE(cdev_vp, FREAD | FWRITE, 1, 0,
1281 1295 kcred, NULL);
1282 1296 }
1283 1297 VN_RELE(cdev_vp);
1284 1298 }
1285 1299
1286 1300 (void) VOP_CLOSE(dumpvp, FREAD | FWRITE, 1, (offset_t)0, kcred, NULL);
1287 1301
1288 1302 VN_RELE(dumpvp);
1289 1303
1290 1304 dumpvp = NULL;
1291 1305 dumpvp_size = 0;
1292 1306 dumppath = NULL;
1293 1307 }
1294 1308
1295 1309 static offset_t
1296 1310 dumpvp_flush(void)
1297 1311 {
1298 1312 size_t size = P2ROUNDUP(dumpbuf.cur - dumpbuf.start, PAGESIZE);
1299 1313 hrtime_t iotime;
1300 1314 int err;
1301 1315
1302 1316 if (dumpbuf.vp_off + size > dumpbuf.vp_limit) {
1303 1317 dump_ioerr = ENOSPC;
1304 1318 dumpbuf.vp_off = dumpbuf.vp_limit;
1305 1319 } else if (size != 0) {
1306 1320 iotime = gethrtime();
1307 1321 dumpsync.iowait += iotime - dumpsync.iowaitts;
1308 1322 if (panicstr)
1309 1323 err = VOP_DUMP(dumpvp, dumpbuf.start,
1310 1324 lbtodb(dumpbuf.vp_off), btod(size), NULL);
1311 1325 else
1312 1326 err = vn_rdwr(UIO_WRITE, dumpbuf.cdev_vp != NULL ?
1313 1327 dumpbuf.cdev_vp : dumpvp, dumpbuf.start, size,
1314 1328 dumpbuf.vp_off, UIO_SYSSPACE, 0, dumpbuf.vp_limit,
1315 1329 kcred, 0);
1316 1330 if (err && dump_ioerr == 0)
1317 1331 dump_ioerr = err;
1318 1332 dumpsync.iowaitts = gethrtime();
1319 1333 dumpsync.iotime += dumpsync.iowaitts - iotime;
1320 1334 dumpsync.nwrite += size;
1321 1335 dumpbuf.vp_off += size;
1322 1336 }
1323 1337 dumpbuf.cur = dumpbuf.start;
1324 1338 dump_timeleft = dump_timeout;
1325 1339 return (dumpbuf.vp_off);
1326 1340 }
1327 1341
1328 1342 /* maximize write speed by keeping seek offset aligned with size */
1329 1343 void
1330 1344 dumpvp_write(const void *va, size_t size)
1331 1345 {
1332 1346 size_t len, off, sz;
1333 1347
1334 1348 while (size != 0) {
1335 1349 len = MIN(size, dumpbuf.end - dumpbuf.cur);
1336 1350 if (len == 0) {
1337 1351 off = P2PHASE(dumpbuf.vp_off, dumpbuf.size);
1338 1352 if (off == 0 || !ISP2(dumpbuf.size)) {
1339 1353 (void) dumpvp_flush();
1340 1354 } else {
1341 1355 sz = dumpbuf.size - off;
1342 1356 dumpbuf.cur = dumpbuf.start + sz;
1343 1357 (void) dumpvp_flush();
1344 1358 ovbcopy(dumpbuf.start + sz, dumpbuf.start, off);
1345 1359 dumpbuf.cur += off;
1346 1360 }
1347 1361 } else {
1348 1362 bcopy(va, dumpbuf.cur, len);
1349 1363 va = (char *)va + len;
1350 1364 dumpbuf.cur += len;
1351 1365 size -= len;
1352 1366 }
1353 1367 }
1354 1368 }
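A worked example of the realignment path above (illustrative sizes):

    /*
     * With a 2 MB dumpbuf and vp_off == 3 MB: off = P2PHASE(3 MB, 2 MB)
     * == 1 MB, so sz == 1 MB. The flush writes that 1 MB, realigning
     * vp_off to 4 MB, and ovbcopy() slides the remaining 1 MB to the
     * front of the buffer to go out with the next flush.
     */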
1355 1369
1356 1370 /*ARGSUSED*/
1357 1371 static void
1358 1372 dumpvp_ksyms_write(const void *src, void *dst, size_t size)
1359 1373 {
1360 1374 dumpvp_write(src, size);
1361 1375 }
1362 1376
1363 1377 /*
1364 1378 * Mark 'pfn' in the bitmap and dump its translation table entry.
1365 1379 */
1366 1380 void
1367 1381 dump_addpage(struct as *as, void *va, pfn_t pfn)
1368 1382 {
1369 1383 mem_vtop_t mem_vtop;
1370 1384 pgcnt_t bitnum;
1371 1385
1372 1386 if ((bitnum = dump_pfn_to_bitnum(pfn)) != (pgcnt_t)-1) {
1373 1387 if (!BT_TEST(dumpcfg.bitmap, bitnum)) {
1374 1388 dumphdr->dump_npages++;
1375 1389 BT_SET(dumpcfg.bitmap, bitnum);
1376 1390 }
1377 1391 dumphdr->dump_nvtop++;
1378 1392 mem_vtop.m_as = as;
1379 1393 mem_vtop.m_va = va;
1380 1394 mem_vtop.m_pfn = pfn;
1381 1395 dumpvp_write(&mem_vtop, sizeof (mem_vtop_t));
1382 1396 }
1383 1397 dump_timeleft = dump_timeout;
1384 1398 }
1385 1399
1386 1400 /*
1387 1401 * Mark 'pfn' in the bitmap
1388 1402 */
1389 1403 void
1390 1404 dump_page(pfn_t pfn)
1391 1405 {
1392 1406 pgcnt_t bitnum;
1393 1407
1394 1408 if ((bitnum = dump_pfn_to_bitnum(pfn)) != (pgcnt_t)-1) {
1395 1409 if (!BT_TEST(dumpcfg.bitmap, bitnum)) {
1396 1410 dumphdr->dump_npages++;
1397 1411 BT_SET(dumpcfg.bitmap, bitnum);
1398 1412 }
1399 1413 }
1400 1414 dump_timeleft = dump_timeout;
1401 1415 }
1402 1416
1403 1417 /*
1404 1418 * Dump the <as, va, pfn> information for a given address space.
1405 1419 * SEGOP_DUMP() will call dump_addpage() for each page in the segment.
1406 1420 */
1407 1421 static void
1408 1422 dump_as(struct as *as)
1409 1423 {
1410 1424 struct seg *seg;
1411 1425
1412 1426 AS_LOCK_ENTER(as, RW_READER);
1413 1427 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
1414 1428 if (seg->s_as != as)
1415 1429 break;
1416 1430 if (seg->s_ops == NULL)
1417 1431 continue;
1418 1432 SEGOP_DUMP(seg);
1419 1433 }
1420 1434 AS_LOCK_EXIT(as);
1421 1435
1422 1436 if (seg != NULL)
1423 1437 cmn_err(CE_WARN, "invalid segment %p in address space %p",
1424 1438 (void *)seg, (void *)as);
1425 1439 }
1426 1440
1427 1441 static int
1428 1442 dump_process(pid_t pid)
1429 1443 {
1430 1444 proc_t *p = sprlock(pid);
1431 1445
1432 1446 if (p == NULL)
1433 1447 return (-1);
1434 1448 if (p->p_as != &kas) {
1435 1449 mutex_exit(&p->p_lock);
1436 1450 dump_as(p->p_as);
1437 1451 mutex_enter(&p->p_lock);
1438 1452 }
1439 1453
1440 1454 sprunlock(p);
1441 1455
1442 1456 return (0);
1443 1457 }
1444 1458
1445 1459 /*
1446 1460 * The following functions (dump_summary(), dump_ereports(), and
1447 1461 * dump_messages()), write data to an uncompressed area within the
1448 1462 * crashdump. The layout of these is
1449 1463 *
1450 1464 * +------------------------------------------------------------+
1451 1465 * | compressed pages | summary | ereports | messages |
1452 1466 * +------------------------------------------------------------+
1453 1467 *
1454 1468 * With the advent of saving a compressed crash dump by default, we
1455 1469 * need to save a little more data to describe the failure mode in
1456 1470 * an uncompressed buffer available before savecore uncompresses
1457 1471 * the dump. Initially this is a copy of the stack trace. Additional
1458 1472 * summary information should be added here.
1459 1473 */
1460 1474
1461 1475 void
1462 1476 dump_summary(void)
1463 1477 {
1464 1478 u_offset_t dumpvp_start;
1465 1479 summary_dump_t sd;
1466 1480
1467 1481 if (dumpvp == NULL || dumphdr == NULL)
1468 1482 return;
1469 1483
1470 1484 dumpbuf.cur = dumpbuf.start;
1471 1485
1472 1486 dumpbuf.vp_limit = dumpvp_size - (DUMP_OFFSET + DUMP_LOGSIZE +
1473 1487 DUMP_ERPTSIZE);
1474 1488 dumpvp_start = dumpbuf.vp_limit - DUMP_SUMMARYSIZE;
1475 1489 dumpbuf.vp_off = dumpvp_start;
1476 1490
1477 1491 sd.sd_magic = SUMMARY_MAGIC;
1478 1492 sd.sd_ssum = checksum32(dump_stack_scratch, STACK_BUF_SIZE);
1479 1493 dumpvp_write(&sd, sizeof (sd));
1480 1494 dumpvp_write(dump_stack_scratch, STACK_BUF_SIZE);
1481 1495
1482 1496 sd.sd_magic = 0; /* indicate end of summary */
1483 1497 dumpvp_write(&sd, sizeof (sd));
1484 1498 (void) dumpvp_flush();
1485 1499 }
1486 1500
1487 1501 void
1488 1502 dump_ereports(void)
1489 1503 {
1490 1504 u_offset_t dumpvp_start;
1491 1505 erpt_dump_t ed;
1492 1506
1493 1507 if (dumpvp == NULL || dumphdr == NULL)
1494 1508 return;
1495 1509
1496 1510 dumpbuf.cur = dumpbuf.start;
1497 1511 dumpbuf.vp_limit = dumpvp_size - (DUMP_OFFSET + DUMP_LOGSIZE);
1498 1512 dumpvp_start = dumpbuf.vp_limit - DUMP_ERPTSIZE;
1499 1513 dumpbuf.vp_off = dumpvp_start;
1500 1514
1501 1515 fm_ereport_dump();
1502 1516 if (panicstr)
1503 1517 errorq_dump();
1504 1518
1505 1519 bzero(&ed, sizeof (ed)); /* indicate end of ereports */
1506 1520 dumpvp_write(&ed, sizeof (ed));
1507 1521 (void) dumpvp_flush();
1508 1522
1509 1523 if (!panicstr) {
1510 1524 (void) VOP_PUTPAGE(dumpvp, dumpvp_start,
1511 1525 (size_t)(dumpbuf.vp_off - dumpvp_start),
1512 1526 B_INVAL | B_FORCE, kcred, NULL);
1513 1527 }
1514 1528 }
1515 1529
1516 1530 void
1517 1531 dump_messages(void)
1518 1532 {
1519 1533 log_dump_t ld;
1520 1534 mblk_t *mctl, *mdata;
1521 1535 queue_t *q, *qlast;
1522 1536 u_offset_t dumpvp_start;
1523 1537
1524 1538 if (dumpvp == NULL || dumphdr == NULL || log_consq == NULL)
1525 1539 return;
1526 1540
1527 1541 dumpbuf.cur = dumpbuf.start;
1528 1542 dumpbuf.vp_limit = dumpvp_size - DUMP_OFFSET;
1529 1543 dumpvp_start = dumpbuf.vp_limit - DUMP_LOGSIZE;
1530 1544 dumpbuf.vp_off = dumpvp_start;
1531 1545
1532 1546 qlast = NULL;
1533 1547 do {
1534 1548 for (q = log_consq; q->q_next != qlast; q = q->q_next)
1535 1549 continue;
1536 1550 for (mctl = q->q_first; mctl != NULL; mctl = mctl->b_next) {
1537 1551 dump_timeleft = dump_timeout;
1538 1552 mdata = mctl->b_cont;
1539 1553 ld.ld_magic = LOG_MAGIC;
1540 1554 ld.ld_msgsize = MBLKL(mctl->b_cont);
1541 1555 ld.ld_csum = checksum32(mctl->b_rptr, MBLKL(mctl));
1542 1556 ld.ld_msum = checksum32(mdata->b_rptr, MBLKL(mdata));
1543 1557 dumpvp_write(&ld, sizeof (ld));
1544 1558 dumpvp_write(mctl->b_rptr, MBLKL(mctl));
1545 1559 dumpvp_write(mdata->b_rptr, MBLKL(mdata));
1546 1560 }
1547 1561 } while ((qlast = q) != log_consq);
1548 1562
1549 1563 ld.ld_magic = 0; /* indicate end of messages */
1550 1564 dumpvp_write(&ld, sizeof (ld));
1551 1565 (void) dumpvp_flush();
1552 1566 if (!panicstr) {
1553 1567 (void) VOP_PUTPAGE(dumpvp, dumpvp_start,
1554 1568 (size_t)(dumpbuf.vp_off - dumpvp_start),
1555 1569 B_INVAL | B_FORCE, kcred, NULL);
1556 1570 }
1557 1571 }
1558 1572
1559 1573 /*
1560 1574 * The following functions are called on multiple CPUs during dump.
1561 1575 * They must not use most kernel services, because all cross-calls are
1562 1576 * disabled during panic. Therefore, blocking locks and cache flushes
1563 1577 * will not work.
1564 1578 */
1565 1579
1566 1580 /*
1567 1581 * Copy pages, trapping ECC errors. Also, for robustness, trap data
1568 1582 * access in case something goes wrong in the hat layer and the
1569 1583 * mapping is broken.
1570 1584 */
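/*
 * The replacement words written below are poison patterns that
 * spell out the failure in hex: 0x00badecc for an uncorrectable
 * ECC error and 0x00badadd for a bad address, doubled to fill a
 * 64-bit long under _LP64.
 */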
1571 1585 static int
1572 1586 dump_pagecopy(void *src, void *dst)
1573 1587 {
1574 1588 long *wsrc = (long *)src;
1575 1589 long *wdst = (long *)dst;
1576 1590 const ulong_t ncopies = PAGESIZE / sizeof (long);
1577 1591 volatile int w = 0;
1578 1592 volatile int ueoff = -1;
1579 1593 on_trap_data_t otd;
1580 1594
1581 1595 if (on_trap(&otd, OT_DATA_EC | OT_DATA_ACCESS)) {
1582 1596 if (ueoff == -1)
1583 1597 ueoff = w * sizeof (long);
1584 1598 /* report "bad ECC" or "bad address" */
1585 1599 #ifdef _LP64
1586 1600 if (otd.ot_trap & OT_DATA_EC)
1587 1601 wdst[w++] = 0x00badecc00badecc;
1588 1602 else
1589 1603 wdst[w++] = 0x00badadd00badadd;
1590 1604 #else
1591 1605 if (otd.ot_trap & OT_DATA_EC)
1592 1606 wdst[w++] = 0x00badecc;
1593 1607 else
1594 1608 wdst[w++] = 0x00badadd;
1595 1609 #endif
1596 1610 }
1597 1611 while (w < ncopies) {
1598 1612 wdst[w] = wsrc[w];
1599 1613 w++;
1600 1614 }
1601 1615 no_trap();
1602 1616 return (ueoff);
1603 1617 }
1604 1618
1605 1619 static void
1606 1620 dumpsys_close_cq(cqueue_t *cq, int live)
1607 1621 {
1608 1622 if (live) {
1609 1623 mutex_enter(&cq->mutex);
1610 1624 atomic_dec_uint(&cq->open);
1611 1625 cv_signal(&cq->cv);
1612 1626 mutex_exit(&cq->mutex);
1613 1627 } else {
1614 1628 atomic_dec_uint(&cq->open);
1615 1629 }
1616 1630 }
1617 1631
1618 1632 static inline void
1619 1633 dumpsys_spinlock(lock_t *lp)
1620 1634 {
1621 1635 uint_t backoff = 0;
1622 1636 int loop_count = 0;
1623 1637
1624 1638 while (LOCK_HELD(lp) || !lock_spin_try(lp)) {
1625 1639 if (++loop_count >= ncpus) {
1626 1640 backoff = mutex_lock_backoff(0);
1627 1641 loop_count = 0;
1628 1642 } else {
1629 1643 backoff = mutex_lock_backoff(backoff);
1630 1644 }
1631 1645 mutex_lock_delay(backoff);
1632 1646 }
1633 1647 }
1634 1648
1635 1649 static inline void
1636 1650 dumpsys_spinunlock(lock_t *lp)
1637 1651 {
1638 1652 lock_clear(lp);
1639 1653 }
1640 1654
1641 1655 static inline void
1642 1656 dumpsys_lock(cqueue_t *cq, int live)
1643 1657 {
1644 1658 if (live)
1645 1659 mutex_enter(&cq->mutex);
1646 1660 else
1647 1661 dumpsys_spinlock(&cq->spinlock);
1648 1662 }
1649 1663
1650 1664 static inline void
1651 1665 dumpsys_unlock(cqueue_t *cq, int live, int signal)
1652 1666 {
1653 1667 if (live) {
1654 1668 if (signal)
1655 1669 cv_signal(&cq->cv);
1656 1670 mutex_exit(&cq->mutex);
1657 1671 } else {
1658 1672 dumpsys_spinunlock(&cq->spinlock);
1659 1673 }
1660 1674 }
1661 1675
1662 1676 static void
1663 1677 dumpsys_wait_cq(cqueue_t *cq, int live)
1664 1678 {
1665 1679 if (live) {
1666 1680 cv_wait(&cq->cv, &cq->mutex);
1667 1681 } else {
1668 1682 dumpsys_spinunlock(&cq->spinlock);
1669 1683 while (cq->open)
1670 1684 if (cq->first)
1671 1685 break;
1672 1686 dumpsys_spinlock(&cq->spinlock);
1673 1687 }
1674 1688 }
1675 1689
1676 1690 static void
1677 1691 dumpsys_put_cq(cqueue_t *cq, cbuf_t *cp, int newstate, int live)
1678 1692 {
1679 1693 if (cp == NULL)
1680 1694 return;
1681 1695
1682 1696 dumpsys_lock(cq, live);
1683 1697
1684 1698 if (cq->ts != 0) {
1685 1699 cq->empty += gethrtime() - cq->ts;
1686 1700 cq->ts = 0;
1687 1701 }
1688 1702
1689 1703 cp->state = newstate;
1690 1704 cp->next = NULL;
1691 1705 if (cq->last == NULL)
1692 1706 cq->first = cp;
1693 1707 else
1694 1708 cq->last->next = cp;
1695 1709 cq->last = cp;
1696 1710
1697 1711 dumpsys_unlock(cq, live, 1);
1698 1712 }
1699 1713
1700 1714 static cbuf_t *
1701 1715 dumpsys_get_cq(cqueue_t *cq, int live)
1702 1716 {
1703 1717 cbuf_t *cp;
1704 1718 hrtime_t now = gethrtime();
1705 1719
1706 1720 dumpsys_lock(cq, live);
1707 1721
1708 1722 /* CONSTCOND */
1709 1723 while (1) {
1710 1724 cp = (cbuf_t *)cq->first;
1711 1725 if (cp == NULL) {
1712 1726 if (cq->open == 0)
1713 1727 break;
1714 1728 dumpsys_wait_cq(cq, live);
1715 1729 continue;
1716 1730 }
1717 1731 cq->first = cp->next;
1718 1732 if (cq->first == NULL) {
1719 1733 cq->last = NULL;
1720 1734 cq->ts = now;
1721 1735 }
1722 1736 break;
1723 1737 }
1724 1738
1725 1739 dumpsys_unlock(cq, live, cq->first != NULL || cq->open == 0);
1726 1740 return (cp);
1727 1741 }
1728 1742
1729 1743 /*
1730 1744 * Send an error message to the console. If the main task is running,
1731 1745 * just write the message via uprintf. If a helper is running, the
1732 1746 * message has to be put on a queue for the main task. Setting fmt to
1733 1747 * NULL means flush the error message buffer. If fmt is not NULL, just
1734 1748 * add the text to the existing buffer.
1735 1749 */
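/*
 * A typical helper-side sequence, as a sketch (both calls appear
 * in dumpsys_copy_page() and dumpsys_sread() below):
 *
 *	dumpsys_errmsg(hp, "cpu %d: ...\n", ...);	append text
 *	dumpsys_errmsg(hp, NULL);			flush to mainq
 */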
1736 1750 static void
1737 1751 dumpsys_errmsg(helper_t *hp, const char *fmt, ...)
1738 1752 {
1739 1753 dumpsync_t *ds = hp->ds;
1740 1754 cbuf_t *cp = hp->cperr;
1741 1755 va_list adx;
1742 1756
1743 1757 if (hp->helper == MAINHELPER) {
1744 1758 if (fmt != NULL) {
1745 1759 if (ds->neednl) {
1746 1760 uprintf("\n");
1747 1761 ds->neednl = 0;
1748 1762 }
1749 1763 va_start(adx, fmt);
1750 1764 vuprintf(fmt, adx);
1751 1765 va_end(adx);
1752 1766 }
1753 1767 } else if (fmt == NULL) {
1754 1768 if (cp != NULL) {
1755 1769 CQ_PUT(mainq, cp, CBUF_ERRMSG);
1756 1770 hp->cperr = NULL;
1757 1771 }
1758 1772 } else {
1759 1773 if (hp->cperr == NULL) {
1760 1774 cp = CQ_GET(freebufq);
1761 1775 hp->cperr = cp;
1762 1776 cp->used = 0;
1763 1777 }
1764 1778 va_start(adx, fmt);
1765 1779 cp->used += vsnprintf(cp->buf + cp->used, cp->size - cp->used,
1766 1780 fmt, adx);
1767 1781 va_end(adx);
1768 1782 if ((cp->used + LOG_MSGSIZE) > cp->size) {
1769 1783 CQ_PUT(mainq, cp, CBUF_ERRMSG);
1770 1784 hp->cperr = NULL;
1771 1785 }
1772 1786 }
1773 1787 }
1774 1788
1775 1789 /*
1776 1790 * Write an output buffer to the dump file. If the main task is
1777 1791 * running, just write the data. If a helper is running, the output is
1778 1792 * placed on a queue for the main task.
1779 1793 */
1780 1794 static void
1781 1795 dumpsys_swrite(helper_t *hp, cbuf_t *cp, size_t used)
1782 1796 {
1783 1797 dumpsync_t *ds = hp->ds;
1784 1798
1785 1799 if (hp->helper == MAINHELPER) {
1786 1800 HRSTART(ds->perpage, write);
1787 1801 dumpvp_write(cp->buf, used);
1788 1802 HRSTOP(ds->perpage, write);
1789 1803 CQ_PUT(freebufq, cp, CBUF_FREEBUF);
1790 1804 } else {
1791 1805 cp->used = used;
1792 1806 CQ_PUT(mainq, cp, CBUF_WRITE);
1793 1807 }
1794 1808 }
1795 1809
1796 1810 /*
1797 1811 * Copy one page within the mapped range. The offset starts at 0 and
1798 1812 * is relative to the first pfn. cp->buf + cp->off is the address of
1799 1813 * the first pfn. If dump_pagecopy returns a UE offset, create an
1800 1814 * error message. Returns the offset to the next pfn in the range
1801 1815 * selected by the bitmap.
1802 1816 */
1803 1817 static int
1804 1818 dumpsys_copy_page(helper_t *hp, int offset)
1805 1819 {
1806 1820 cbuf_t *cp = hp->cpin;
1807 1821 int ueoff;
1808 1822
1809 1823 ASSERT(cp->off + offset + PAGESIZE <= cp->size);
1810 1824 ASSERT(BT_TEST(dumpcfg.bitmap, cp->bitnum));
1811 1825
1812 1826 ueoff = dump_pagecopy(cp->buf + cp->off + offset, hp->page);
1813 1827
1814 1828 /* ueoff is the offset in the page to a UE error */
1815 1829 if (ueoff != -1) {
1816 1830 uint64_t pa = ptob(cp->pfn) + offset + ueoff;
1817 1831
1818 1832 dumpsys_errmsg(hp, "cpu %d: memory error at PA 0x%08x.%08x\n",
1819 1833 CPU->cpu_id, (uint32_t)(pa >> 32), (uint32_t)pa);
1820 1834 }
1821 1835
1822 1836 /*
1823 1837 * Advance bitnum and offset to the next input page for the
1824 1838 * next call to this function.
1825 1839 */
1826 1840 offset += PAGESIZE;
1827 1841 cp->bitnum++;
1828 1842 while (cp->off + offset < cp->size) {
1829 1843 if (BT_TEST(dumpcfg.bitmap, cp->bitnum))
1830 1844 break;
1831 1845 offset += PAGESIZE;
1832 1846 cp->bitnum++;
1833 1847 }
1834 1848
1835 1849 return (offset);
1836 1850 }
1837 1851
1838 1852 /*
1839 1853 * Read the helper queue, and copy one mapped page. Return 0 when
1840 1854 * done. Return 1 when a page has been copied into hp->page.
1841 1855 */
1842 1856 static int
1843 1857 dumpsys_sread(helper_t *hp)
1844 1858 {
1845 1859 dumpsync_t *ds = hp->ds;
1846 1860
1847 1861 /* CONSTCOND */
1848 1862 while (1) {
1849 1863
1850 1864 /* Find the next input buffer. */
1851 1865 if (hp->cpin == NULL) {
1852 1866 HRSTART(hp->perpage, inwait);
1853 1867
1854 1868 /* CONSTCOND */
1855 1869 while (1) {
1856 1870 hp->cpin = CQ_GET(helperq);
1857 1871 dump_timeleft = dump_timeout;
1858 1872
1859 1873 /*
1860 1874 * NULL return means the helper queue
1861 1875 * is closed and empty.
1862 1876 */
1863 1877 if (hp->cpin == NULL)
1864 1878 break;
1865 1879
1866 1880 /* Have input, check for dump I/O error. */
1867 1881 if (!dump_ioerr)
1868 1882 break;
1869 1883
1870 1884 /*
1871 1885 * If an I/O error occurs, stay in the
1872 1886 * loop in order to empty the helper
1873 1887 * queue. Return the buffers to the
1874 1888 * main task to unmap and free them.
1875 1889 */
1876 1890 hp->cpin->used = 0;
1877 1891 CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP);
1878 1892 }
1879 1893 HRSTOP(hp->perpage, inwait);
1880 1894
1881 1895 /* Stop here when the helper queue is closed. */
1882 1896 if (hp->cpin == NULL)
1883 1897 break;
1884 1898
1885 1899 /* Set the offset to 0 to get the first pfn. */
1886 1900 hp->in = 0;
1887 1901
1888 1902 /* Set the total processed to 0 */
1889 1903 hp->used = 0;
1890 1904 }
1891 1905
1892 1906 /* Process the next page. */
1893 1907 if (hp->used < hp->cpin->used) {
1894 1908
1895 1909 /*
1896 1910 * Get the next page from the input buffer and
1897 1911 * return a copy.
1898 1912 */
1899 1913 ASSERT(hp->in != -1);
1900 1914 HRSTART(hp->perpage, copy);
1901 1915 hp->in = dumpsys_copy_page(hp, hp->in);
1902 1916 hp->used += PAGESIZE;
1903 1917 HRSTOP(hp->perpage, copy);
1904 1918 break;
1905 1919
1906 1920 } else {
1907 1921
1908 1922 /*
1909 1923 * Done with the input. Flush the VM and
1910 1924 * return the buffer to the main task.
1911 1925 */
1912 1926 if (panicstr && hp->helper != MAINHELPER)
1913 1927 hat_flush_range(kas.a_hat,
1914 1928 hp->cpin->buf, hp->cpin->size);
1915 1929 dumpsys_errmsg(hp, NULL);
1916 1930 CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP);
1917 1931 hp->cpin = NULL;
1918 1932 }
1919 1933 }
1920 1934
1921 1935 return (hp->cpin != NULL);
1922 1936 }
1923 1937
1924 1938 /*
1925 1939 * Compress size bytes starting at buf with bzip2
1926 1940 * mode:
1927 1941 * BZ_RUN add one more compressed page
1928 1942 * BZ_FINISH no more input, flush the state
1929 1943 */
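/*
 * Sketch of the on-disk layout of each flushed block, with CSIZE
 * standing for sizeof (dumpcsize_t) as in the code below:
 *
 *	+---------------------------+
 *	| DUMP_SET_TAG(csize, tag)  |  CSIZE bytes
 *	+---------------------------+
 *	| csize bytes of bzip2 data |
 *	+---------------------------+
 */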
1930 1944 static void
1931 1945 dumpsys_bzrun(helper_t *hp, void *buf, size_t size, int mode)
1932 1946 {
1933 1947 dumpsync_t *ds = hp->ds;
1934 1948 const int CSIZE = sizeof (dumpcsize_t);
1935 1949 bz_stream *ps = &hp->bzstream;
1936 1950 int rc = 0;
1937 1951 uint32_t csize;
1938 1952 dumpcsize_t cs;
1939 1953
1940 1954 /* Set input pointers to new input page */
1941 1955 if (size > 0) {
1942 1956 ps->avail_in = size;
1943 1957 ps->next_in = buf;
1944 1958 }
1945 1959
1946 1960 /* CONSTCOND */
1947 1961 while (1) {
1948 1962
1949 1963 /* Quit when all input has been consumed */
1950 1964 if (ps->avail_in == 0 && mode == BZ_RUN)
1951 1965 break;
1952 1966
1953 1967 /* Get a new output buffer */
1954 1968 if (hp->cpout == NULL) {
1955 1969 HRSTART(hp->perpage, outwait);
1956 1970 hp->cpout = CQ_GET(freebufq);
1957 1971 HRSTOP(hp->perpage, outwait);
1958 1972 ps->avail_out = hp->cpout->size - CSIZE;
1959 1973 ps->next_out = hp->cpout->buf + CSIZE;
1960 1974 }
1961 1975
1962 1976 /* Compress input, or finalize */
1963 1977 HRSTART(hp->perpage, compress);
1964 1978 rc = BZ2_bzCompress(ps, mode);
1965 1979 HRSTOP(hp->perpage, compress);
1966 1980
1967 1981 /* Check for error */
1968 1982 if (mode == BZ_RUN && rc != BZ_RUN_OK) {
1969 1983 dumpsys_errmsg(hp, "%d: BZ_RUN error %s at page %lx\n",
1970 1984 hp->helper, BZ2_bzErrorString(rc),
1971 1985 hp->cpin->pagenum);
1972 1986 break;
1973 1987 }
1974 1988
1975 1989 /* Write the buffer if it is full, or we are flushing */
1976 1990 if (ps->avail_out == 0 || mode == BZ_FINISH) {
1977 1991 csize = hp->cpout->size - CSIZE - ps->avail_out;
1978 1992 cs = DUMP_SET_TAG(csize, hp->tag);
1979 1993 if (csize > 0) {
1980 1994 (void) memcpy(hp->cpout->buf, &cs, CSIZE);
1981 1995 dumpsys_swrite(hp, hp->cpout, csize + CSIZE);
1982 1996 hp->cpout = NULL;
1983 1997 }
1984 1998 }
1985 1999
1986 2000 /* Check for final complete */
1987 2001 if (mode == BZ_FINISH) {
1988 2002 if (rc == BZ_STREAM_END)
1989 2003 break;
1990 2004 if (rc != BZ_FINISH_OK) {
1991 2005 dumpsys_errmsg(hp, "%d: BZ_FINISH error %s\n",
1992 2006 hp->helper, BZ2_bzErrorString(rc));
1993 2007 break;
1994 2008 }
1995 2009 }
1996 2010 }
1997 2011
1998 2012 /* Cleanup state and buffers */
1999 2013 if (mode == BZ_FINISH) {
2000 2014
2001 2015 /* Reset state so that it is re-usable. */
2002 2016 (void) BZ2_bzCompressReset(&hp->bzstream);
2003 2017
2004 2018 /* Give any unused output buffer to the main task */
2005 2019 if (hp->cpout != NULL) {
2006 2020 hp->cpout->used = 0;
2007 2021 CQ_PUT(mainq, hp->cpout, CBUF_ERRMSG);
2008 2022 hp->cpout = NULL;
2009 2023 }
2010 2024 }
2011 2025 }
2012 2026
2013 2027 static void
2014 2028 dumpsys_bz2compress(helper_t *hp)
2015 2029 {
2016 2030 dumpsync_t *ds = hp->ds;
2017 2031 dumpstreamhdr_t sh;
2018 2032
2019 2033 (void) strcpy(sh.stream_magic, DUMP_STREAM_MAGIC);
2020 2034 sh.stream_pagenum = (pgcnt_t)-1;
2021 2035 sh.stream_npages = 0;
2022 2036 hp->cpin = NULL;
2023 2037 hp->cpout = NULL;
2024 2038 hp->cperr = NULL;
2025 2039 hp->in = 0;
2026 2040 hp->out = 0;
2027 2041 hp->bzstream.avail_in = 0;
2028 2042
2029 2043 /* Bump reference to mainq while we are running */
2030 2044 CQ_OPEN(mainq);
2031 2045
2032 2046 /* Get one page at a time */
2033 2047 while (dumpsys_sread(hp)) {
2034 2048 if (sh.stream_pagenum != hp->cpin->pagenum) {
2035 2049 sh.stream_pagenum = hp->cpin->pagenum;
2036 2050 sh.stream_npages = btop(hp->cpin->used);
2037 2051 dumpsys_bzrun(hp, &sh, sizeof (sh), BZ_RUN);
2038 2052 }
2039 2053 dumpsys_bzrun(hp, hp->page, PAGESIZE, BZ_RUN);
2040 2054 }
2041 2055
2042 2056 /* Done with input, flush any partial buffer */
2043 2057 if (sh.stream_pagenum != (pgcnt_t)-1) {
2044 2058 dumpsys_bzrun(hp, NULL, 0, BZ_FINISH);
2045 2059 dumpsys_errmsg(hp, NULL);
2046 2060 }
2047 2061
2048 2062 ASSERT(hp->cpin == NULL && hp->cpout == NULL && hp->cperr == NULL);
2049 2063
2050 2064 /* Decrement main queue count, we are done */
2051 2065 CQ_CLOSE(mainq);
2052 2066 }
2053 2067
2054 2068 /*
2055 2069 * Compress with lzjb
2056 2070 * write stream block if full or size==0
2057 2071 * if csize==0 write stream header, else write <csize, data>
2058 2072 * size==0 is a call to flush a buffer
2059 2073 * hp->cpout is the buffer we are flushing or filling
2060 2074 * hp->out is the next index to fill data
2061 2075 * osize is either csize+data, or the size of a stream header
2062 2076 */
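/*
 * Sketch of one assembled lzjb stream block; the leading tag+size
 * word is patched in at flush time:
 *
 *	+-------------------------------+  <- hp->cpout->buf
 *	| DUMP_SET_TAG(out - CSIZE, tag)|
 *	+-------------------------------+
 *	| stream header (csize == 0)    |
 *	| DUMP_SET_TAG(csize, 0), data  |  repeated while it fits
 *	+-------------------------------+
 */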
2063 2077 static void
2064 2078 dumpsys_lzjbrun(helper_t *hp, size_t csize, void *buf, size_t size)
2065 2079 {
2066 2080 dumpsync_t *ds = hp->ds;
2067 2081 const int CSIZE = sizeof (dumpcsize_t);
2068 2082 dumpcsize_t cs;
2069 2083 size_t osize = csize > 0 ? CSIZE + size : size;
2070 2084
2071 2085 /* If flush, and there is no buffer, just return */
2072 2086 if (size == 0 && hp->cpout == NULL)
2073 2087 return;
2074 2088
2075 2089 /* If flush, or cpout is full, write it out */
2076 2090 if (size == 0 ||
2077 2091 (hp->cpout != NULL && hp->out + osize > hp->cpout->size)) {
2078 2092
2079 2093 /* Set tag+size word at the front of the stream block. */
2080 2094 cs = DUMP_SET_TAG(hp->out - CSIZE, hp->tag);
2081 2095 (void) memcpy(hp->cpout->buf, &cs, CSIZE);
2082 2096
2083 2097 /* Write block to dump file. */
2084 2098 dumpsys_swrite(hp, hp->cpout, hp->out);
2085 2099
2086 2100 /* Clear pointer to indicate we need a new buffer */
2087 2101 hp->cpout = NULL;
2088 2102
2089 2103 /* flushing, we are done */
2090 2104 if (size == 0)
2091 2105 return;
2092 2106 }
2093 2107
2094 2108 /* Get an output buffer if we dont have one. */
2095 2109 if (hp->cpout == NULL) {
2096 2110 HRSTART(hp->perpage, outwait);
2097 2111 hp->cpout = CQ_GET(freebufq);
2098 2112 HRSTOP(hp->perpage, outwait);
2099 2113 hp->out = CSIZE;
2100 2114 }
2101 2115
2102 2116 /* Store csize word. This is the size of compressed data. */
2103 2117 if (csize > 0) {
2104 2118 cs = DUMP_SET_TAG(csize, 0);
2105 2119 (void) memcpy(hp->cpout->buf + hp->out, &cs, CSIZE);
2106 2120 hp->out += CSIZE;
2107 2121 }
2108 2122
2109 2123 /* Store the data. */
2110 2124 (void) memcpy(hp->cpout->buf + hp->out, buf, size);
2111 2125 hp->out += size;
2112 2126 }
2113 2127
2114 2128 static void
2115 2129 dumpsys_lzjbcompress(helper_t *hp)
2116 2130 {
2117 2131 dumpsync_t *ds = hp->ds;
2118 2132 size_t csize;
2119 2133 dumpstreamhdr_t sh;
2120 2134
2121 2135 (void) strcpy(sh.stream_magic, DUMP_STREAM_MAGIC);
2122 2136 sh.stream_pagenum = (pfn_t)-1;
2123 2137 sh.stream_npages = 0;
2124 2138 hp->cpin = NULL;
2125 2139 hp->cpout = NULL;
2126 2140 hp->cperr = NULL;
2127 2141 hp->in = 0;
2128 2142 hp->out = 0;
2129 2143
2130 2144 /* Bump reference to mainq while we are running */
2131 2145 CQ_OPEN(mainq);
2132 2146
2133 2147 /* Get one page at a time */
2134 2148 while (dumpsys_sread(hp)) {
2135 2149
2136 2150 /* Create a stream header for each new input map */
2137 2151 if (sh.stream_pagenum != hp->cpin->pagenum) {
2138 2152 sh.stream_pagenum = hp->cpin->pagenum;
2139 2153 sh.stream_npages = btop(hp->cpin->used);
2140 2154 dumpsys_lzjbrun(hp, 0, &sh, sizeof (sh));
2141 2155 }
2142 2156
2143 2157 /* Compress one page */
2144 2158 HRSTART(hp->perpage, compress);
2145 2159 csize = compress(hp->page, hp->lzbuf, PAGESIZE);
2146 2160 HRSTOP(hp->perpage, compress);
2147 2161
2148 2162 /* Add csize+data to output block */
2149 2163 ASSERT(csize > 0 && csize <= PAGESIZE);
2150 2164 dumpsys_lzjbrun(hp, csize, hp->lzbuf, csize);
2151 2165 }
2152 2166
2153 2167 /* Done with input, flush any partial buffer */
2154 2168 if (sh.stream_pagenum != (pfn_t)-1) {
2155 2169 dumpsys_lzjbrun(hp, 0, NULL, 0);
2156 2170 dumpsys_errmsg(hp, NULL);
2157 2171 }
2158 2172
2159 2173 ASSERT(hp->cpin == NULL && hp->cpout == NULL && hp->cperr == NULL);
2160 2174
2161 2175 /* Decrement main queue count, we are done */
2162 2176 CQ_CLOSE(mainq);
2163 2177 }
2164 2178
2165 2179 /*
2166 2180 * Dump helper called from panic_idle() to compress pages. CPUs in
2167 2181 * this path must not call most kernel services.
2168 2182 *
2169 2183 * During panic, all but one of the CPUs is idle. These CPUs are used
2170 2184 * as helpers working in parallel to copy and compress memory
2171 2185 * pages. During a panic, however, these processors cannot call any
2172 2186 * kernel services. This is because mutexes become no-ops during
2173 2187 * panic, and cross-call interrupts are inhibited. Therefore, during
2174 2188 * panic dump the helper CPUs communicate with the panic CPU using
2175 2189 * memory variables. All memory mapping and I/O is performed by the
2176 2190 * panic CPU.
2177 2191 *
2178 2192 * At dump configuration time, helper_lock is set and helpers_wanted
2179 2193 * is 0. dumpsys() decides whether to set helpers_wanted before
2180 2194 * clearing helper_lock.
2181 2195 *
2182 2196 * At panic time, idle CPUs spin-wait on helper_lock, then alternately
2183 2197 * take the lock and become a helper, or return.
2184 2198 */
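/*
 * A rough sketch of that handshake:
 *
 *	dumpsys() (panic CPU)		panic_idle() CPUs
 *	---------------------		-----------------
 *	holds helper_lock		spin in dumpsys_helper()
 *	helpers_wanted = clevel > 0
 *	drops helper_lock		take the lock; claim a
 *					FREEHELPER slot, or clear
 *					helpers_wanted when none is
 *					left; drop the lock
 */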
2185 2199 void
2186 2200 dumpsys_helper()
2187 2201 {
2188 2202 dumpsys_spinlock(&dumpcfg.helper_lock);
2189 2203 if (dumpcfg.helpers_wanted) {
2190 2204 helper_t *hp, *hpend = &dumpcfg.helper[dumpcfg.nhelper];
2191 2205
2192 2206 for (hp = dumpcfg.helper; hp != hpend; hp++) {
2193 2207 if (hp->helper == FREEHELPER) {
2194 2208 hp->helper = CPU->cpu_id;
2195 2209 BT_SET(dumpcfg.helpermap, CPU->cpu_seqid);
2196 2210
2197 2211 dumpsys_spinunlock(&dumpcfg.helper_lock);
2198 2212
2199 2213 if (dumpcfg.clevel < DUMP_CLEVEL_BZIP2)
2200 2214 dumpsys_lzjbcompress(hp);
2201 2215 else
2202 2216 dumpsys_bz2compress(hp);
2203 2217
2204 2218 hp->helper = DONEHELPER;
2205 2219 return;
2206 2220 }
2207 2221 }
2208 2222
2209 2223 /* No more helpers are needed. */
2210 2224 dumpcfg.helpers_wanted = 0;
2211 2225
2212 2226 }
2213 2227 dumpsys_spinunlock(&dumpcfg.helper_lock);
2214 2228 }
2215 2229
2216 2230 /*
2217 2231 * No-wait helper callable in spin loops.
2218 2232 *
2219 2233 * Do not wait for helper_lock. Just check helpers_wanted. The caller
2220 2234 * may decide to continue. This is the "c)ontinue, s)ync, r)eset? s"
2221 2235 * case.
2222 2236 */
2223 2237 void
2224 2238 dumpsys_helper_nw()
2225 2239 {
2226 2240 if (dumpcfg.helpers_wanted)
2227 2241 dumpsys_helper();
2228 2242 }
2229 2243
2230 2244 /*
2231 2245 * Dump helper for live dumps.
2232 2246 * These run as a system task.
2233 2247 */
2234 2248 static void
2235 2249 dumpsys_live_helper(void *arg)
2236 2250 {
2237 2251 helper_t *hp = arg;
2238 2252
2239 2253 BT_ATOMIC_SET(dumpcfg.helpermap, CPU->cpu_seqid);
2240 2254 if (dumpcfg.clevel < DUMP_CLEVEL_BZIP2)
2241 2255 dumpsys_lzjbcompress(hp);
2242 2256 else
2243 2257 dumpsys_bz2compress(hp);
2244 2258 }
2245 2259
2246 2260 /*
2247 2261 * Compress one page with lzjb (single threaded case)
2248 2262 */
2249 2263 static void
2250 2264 dumpsys_lzjb_page(helper_t *hp, cbuf_t *cp)
2251 2265 {
2252 2266 dumpsync_t *ds = hp->ds;
2253 2267 uint32_t csize;
2254 2268
2255 2269 hp->helper = MAINHELPER;
2256 2270 hp->in = 0;
2257 2271 hp->used = 0;
2258 2272 hp->cpin = cp;
2259 2273 while (hp->used < cp->used) {
2260 2274 HRSTART(hp->perpage, copy);
2261 2275 hp->in = dumpsys_copy_page(hp, hp->in);
2262 2276 hp->used += PAGESIZE;
2263 2277 HRSTOP(hp->perpage, copy);
2264 2278
2265 2279 HRSTART(hp->perpage, compress);
2266 2280 csize = compress(hp->page, hp->lzbuf, PAGESIZE);
2267 2281 HRSTOP(hp->perpage, compress);
2268 2282
2269 2283 HRSTART(hp->perpage, write);
2270 2284 dumpvp_write(&csize, sizeof (csize));
2271 2285 dumpvp_write(hp->lzbuf, csize);
2272 2286 HRSTOP(hp->perpage, write);
2273 2287 }
2274 2288 CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP);
2275 2289 hp->cpin = NULL;
2276 2290 }
2277 2291
2278 2292 /*
2279 2293 * Main task to dump pages. This is called on the dump CPU.
2280 2294 */
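/*
 * Sketch of the buffer life cycle driven by the switch below,
 * using the CBUF state names from the cases:
 *
 *	CBUF_FREEMAP --mapped--> CBUF_INREADY (helperq)
 *	CBUF_USEDMAP --unmapped--> CBUF_FREEMAP (back on mainq)
 *	CBUF_WRITE --> writerq --> written --> CBUF_FREEBUF
 *	CBUF_ERRMSG --> printed --> CBUF_FREEBUF
 */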
2281 2295 static void
2282 2296 dumpsys_main_task(void *arg)
2283 2297 {
2284 2298 dumpsync_t *ds = arg;
2285 2299 pgcnt_t pagenum = 0, bitnum = 0, hibitnum;
2286 2300 dumpmlw_t mlw;
2287 2301 cbuf_t *cp;
2288 2302 pgcnt_t baseoff, pfnoff;
2289 2303 pfn_t base, pfn;
2290 2304 int i, dumpserial;
2291 2305
2292 2306 /*
2293 2307 * Fall back to serial mode if there are no helpers.
2294 2308 * dump_plat_mincpu can be set to 0 at any time.
2295 2309 * dumpcfg.helpermap must contain at least one member.
2296 2310 */
2297 2311 dumpserial = 1;
2298 2312
2299 2313 if (dump_plat_mincpu != 0 && dumpcfg.clevel != 0) {
2300 2314 for (i = 0; i < BT_BITOUL(NCPU); ++i) {
2301 2315 if (dumpcfg.helpermap[i] != 0) {
2302 2316 dumpserial = 0;
2303 2317 break;
2304 2318 }
2305 2319 }
2306 2320 }
2307 2321
2308 2322 if (dumpserial) {
2309 2323 dumpcfg.clevel = 0;
2310 2324 if (dumpcfg.helper[0].lzbuf == NULL)
2311 2325 dumpcfg.helper[0].lzbuf = dumpcfg.helper[1].page;
2312 2326 }
2313 2327
2314 2328 dump_init_memlist_walker(&mlw);
2315 2329
2316 2330 for (;;) {
2317 2331 int sec = (gethrtime() - ds->start) / NANOSEC;
2318 2332
2319 2333 /*
2320 2334 * Render a simple progress display on the system console to
2321 2335 * make clear to the operator that the system has not hung.
2322 2336 * Emit an update when dump progress has advanced by one
2323 2337 * percent, or when no update has been drawn in the last
2324 2338 * second.
2325 2339 */
2326 2340 if (ds->percent > ds->percent_done || sec > ds->sec_done) {
2327 2341 ds->sec_done = sec;
2328 2342 ds->percent_done = ds->percent;
2329 2343 uprintf("^\rdumping: %2d:%02d %3d%% done",
2330 2344 sec / 60, sec % 60, ds->percent);
2331 2345 ds->neednl = 1;
2332 2346 }
2333 2347
2334 2348 while (CQ_IS_EMPTY(mainq) && !CQ_IS_EMPTY(writerq)) {
2335 2349
2336 2350 /* the writerq never blocks */
2337 2351 cp = CQ_GET(writerq);
2338 2352 if (cp == NULL)
2339 2353 break;
2340 2354
2341 2355 dump_timeleft = dump_timeout;
2342 2356
2343 2357 HRSTART(ds->perpage, write);
2344 2358 dumpvp_write(cp->buf, cp->used);
2345 2359 HRSTOP(ds->perpage, write);
2346 2360
2347 2361 CQ_PUT(freebufq, cp, CBUF_FREEBUF);
2348 2362 }
2349 2363
2350 2364 /*
2351 2365 * Wait here for some buffers to process. Returns NULL
2352 2366 * when all helpers have terminated and all buffers
2353 2367 * have been processed.
2354 2368 */
2355 2369 cp = CQ_GET(mainq);
2356 2370
2357 2371 if (cp == NULL) {
2358 2372
2359 2373 /* Drain the write queue. */
2360 2374 if (!CQ_IS_EMPTY(writerq))
2361 2375 continue;
2362 2376
2363 2377 /* Main task exits here. */
2364 2378 break;
2365 2379 }
2366 2380
2367 2381 dump_timeleft = dump_timeout;
2368 2382
2369 2383 switch (cp->state) {
2370 2384
2371 2385 case CBUF_FREEMAP:
2372 2386
2373 2387 /*
2374 2388 * Note that we drop CBUF_FREEMAP buffers on
2375 2389 * the floor (they will not be on any cqueue)
2376 2390 * when we no longer need them.
2377 2391 */
2378 2392 if (bitnum >= dumpcfg.bitmapsize)
2379 2393 break;
2380 2394
2381 2395 if (dump_ioerr) {
2382 2396 bitnum = dumpcfg.bitmapsize;
2383 2397 CQ_CLOSE(helperq);
2384 2398 break;
2385 2399 }
2386 2400
2387 2401 HRSTART(ds->perpage, bitmap);
2388 2402 for (; bitnum < dumpcfg.bitmapsize; bitnum++)
2389 2403 if (BT_TEST(dumpcfg.bitmap, bitnum))
2390 2404 break;
2391 2405 HRSTOP(ds->perpage, bitmap);
2392 2406 dump_timeleft = dump_timeout;
2393 2407
2394 2408 if (bitnum >= dumpcfg.bitmapsize) {
2395 2409 CQ_CLOSE(helperq);
2396 2410 break;
2397 2411 }
2398 2412
2399 2413 /*
2400 2414 * Try to map CBUF_MAPSIZE ranges. Can't
2401 2415 * assume that memory segment size is a
2402 2416 * multiple of CBUF_MAPSIZE. Can't assume that
2403 2417 * the segment starts on a CBUF_MAPSIZE
2404 2418 * boundary.
2405 2419 */
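/*
 * For example (hypothetical layout): if a memory segment begins
 * part-way into a CBUF_MAPNP-page window, base is pulled up to
 * mlw.mpaddr, baseoff records the phase within the window, and
 * cp->size is trimmed so the mapping stays inside the segment.
 */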
2406 2420 pfn = dump_bitnum_to_pfn(bitnum, &mlw);
2407 2421 ASSERT(pfn != PFN_INVALID);
2408 2422 ASSERT(bitnum + mlw.mpleft <= dumpcfg.bitmapsize);
2409 2423
2410 2424 base = P2ALIGN(pfn, CBUF_MAPNP);
2411 2425 if (base < mlw.mpaddr) {
2412 2426 base = mlw.mpaddr;
2413 2427 baseoff = P2PHASE(base, CBUF_MAPNP);
2414 2428 } else {
2415 2429 baseoff = 0;
2416 2430 }
2417 2431
2418 2432 pfnoff = pfn - base;
2419 2433 if (pfnoff + mlw.mpleft < CBUF_MAPNP) {
2420 2434 hibitnum = bitnum + mlw.mpleft;
2421 2435 cp->size = ptob(pfnoff + mlw.mpleft);
2422 2436 } else {
2423 2437 hibitnum = bitnum - pfnoff + CBUF_MAPNP -
2424 2438 baseoff;
2425 2439 cp->size = CBUF_MAPSIZE - ptob(baseoff);
2426 2440 }
2427 2441
2428 2442 cp->pfn = pfn;
2429 2443 cp->bitnum = bitnum++;
2430 2444 cp->pagenum = pagenum++;
2431 2445 cp->off = ptob(pfnoff);
2432 2446
2433 2447 for (; bitnum < hibitnum; bitnum++)
2434 2448 if (BT_TEST(dumpcfg.bitmap, bitnum))
2435 2449 pagenum++;
2436 2450
2437 2451 dump_timeleft = dump_timeout;
2438 2452 cp->used = ptob(pagenum - cp->pagenum);
2439 2453
2440 2454 HRSTART(ds->perpage, map);
2441 2455 hat_devload(kas.a_hat, cp->buf, cp->size, base,
2442 2456 PROT_READ, HAT_LOAD_NOCONSIST);
2443 2457 HRSTOP(ds->perpage, map);
2444 2458
2445 2459 ds->pages_mapped += btop(cp->size);
2446 2460 ds->pages_used += pagenum - cp->pagenum;
2447 2461
2448 2462 CQ_OPEN(mainq);
2449 2463
2450 2464 /*
2451 2465 * If there are no helpers the main task does
2452 2466 * non-streams lzjb compress.
2453 2467 */
2454 2468 if (dumpserial) {
2455 2469 dumpsys_lzjb_page(dumpcfg.helper, cp);
2456 2470 break;
2457 2471 }
2458 2472
2459 2473 /* pass mapped pages to a helper */
2460 2474 CQ_PUT(helperq, cp, CBUF_INREADY);
2461 2475
2462 2476 /* the last page was done */
2463 2477 if (bitnum >= dumpcfg.bitmapsize)
2464 2478 CQ_CLOSE(helperq);
2465 2479
2466 2480 break;
2467 2481
2468 2482 case CBUF_USEDMAP:
2469 2483
2470 2484 ds->npages += btop(cp->used);
2471 2485
2472 2486 HRSTART(ds->perpage, unmap);
2473 2487 hat_unload(kas.a_hat, cp->buf, cp->size, HAT_UNLOAD);
2474 2488 HRSTOP(ds->perpage, unmap);
2475 2489
2476 2490 if (bitnum < dumpcfg.bitmapsize)
2477 2491 CQ_PUT(mainq, cp, CBUF_FREEMAP);
2478 2492 CQ_CLOSE(mainq);
2479 2493
2480 2494 ASSERT(ds->npages <= dumphdr->dump_npages);
2481 2495 ds->percent = ds->npages * 100LL / dumphdr->dump_npages;
2482 2496 break;
2483 2497
2484 2498 case CBUF_WRITE:
2485 2499
2486 2500 CQ_PUT(writerq, cp, CBUF_WRITE);
2487 2501 break;
2488 2502
2489 2503 case CBUF_ERRMSG:
2490 2504
2491 2505 if (cp->used > 0) {
2492 2506 cp->buf[cp->size - 2] = '\n';
2493 2507 cp->buf[cp->size - 1] = '\0';
2494 2508 if (ds->neednl) {
2495 2509 uprintf("\n%s", cp->buf);
2496 2510 ds->neednl = 0;
2497 2511 } else {
2498 2512 uprintf("%s", cp->buf);
2499 2513 }
2500 2514 /* wait for console output */
2501 2515 drv_usecwait(200000);
2502 2516 dump_timeleft = dump_timeout;
2503 2517 }
2504 2518 CQ_PUT(freebufq, cp, CBUF_FREEBUF);
2505 2519 break;
2506 2520
2507 2521 default:
2508 2522 uprintf("dump: unexpected buffer state %d, "
2509 2523 "buffer will be lost\n", cp->state);
2510 2524 break;
2511 2525
2512 2526 } /* end switch */
2513 2527 }
2514 2528 }
2515 2529
2516 2530 #ifdef COLLECT_METRICS
2517 2531 size_t
2518 2532 dumpsys_metrics(dumpsync_t *ds, char *buf, size_t size)
2519 2533 {
2520 2534 dumpcfg_t *cfg = &dumpcfg;
2521 2535 int myid = CPU->cpu_seqid;
2522 2536 int i, compress_ratio;
2523 2537 int sec, iorate;
2524 2538 helper_t *hp, *hpend = &cfg->helper[cfg->nhelper];
2525 2539 char *e = buf + size;
2526 2540 char *p = buf;
2527 2541
2528 2542 sec = ds->elapsed / (1000 * 1000 * 1000ULL);
2529 2543 if (sec < 1)
2530 2544 sec = 1;
2531 2545
2532 2546 if (ds->iotime < 1)
2533 2547 ds->iotime = 1;
2534 2548 iorate = (ds->nwrite * 100000ULL) / ds->iotime;
2535 2549
2536 2550 compress_ratio = 100LL * ds->npages / btopr(ds->nwrite + 1);
2537 2551
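/* P() appends formatted text at p, and appends nothing once p reaches e */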
2538 2552 #define P(...) (p += p < e ? snprintf(p, e - p, __VA_ARGS__) : 0)
2539 2553
2540 2554 P("Master cpu_seqid,%d\n", CPU->cpu_seqid);
2541 2555 P("Master cpu_id,%d\n", CPU->cpu_id);
2542 2556 P("dump_flags,0x%x\n", dumphdr->dump_flags);
2543 2557 P("dump_ioerr,%d\n", dump_ioerr);
2544 2558
2545 2559 P("Helpers:\n");
2546 2560 for (i = 0; i < ncpus; i++) {
2547 2561 if ((i & 15) == 0)
2548 2562 P(",,%03d,", i);
2549 2563 if (i == myid)
2550 2564 P(" M");
2551 2565 else if (BT_TEST(cfg->helpermap, i))
2552 2566 P("%4d", cpu_seq[i]->cpu_id);
2553 2567 else
2554 2568 P(" *");
2555 2569 if ((i & 15) == 15)
2556 2570 P("\n");
2557 2571 }
2558 2572
2559 2573 P("ncbuf_used,%d\n", cfg->ncbuf_used);
2560 2574 P("ncmap,%d\n", cfg->ncmap);
2561 2575
2562 2576 P("Found %ldM ranges,%ld\n", (CBUF_MAPSIZE / DUMP_1MB), cfg->found4m);
2563 2577 P("Found small pages,%ld\n", cfg->foundsm);
2564 2578
2565 2579 P("Compression level,%d\n", cfg->clevel);
2566 2580 P("Compression type,%s %s\n", cfg->clevel == 0 ? "serial" : "parallel",
2567 2581 cfg->clevel >= DUMP_CLEVEL_BZIP2 ? "bzip2" : "lzjb");
2568 2582 P("Compression ratio,%d.%02d\n", compress_ratio / 100, compress_ratio %
2569 2583 100);
2570 2584 P("nhelper_used,%d\n", cfg->nhelper_used);
2571 2585
2572 2586 P("Dump I/O rate MBS,%d.%02d\n", iorate / 100, iorate % 100);
2573 2587 P("..total bytes,%lld\n", (u_longlong_t)ds->nwrite);
2574 2588 P("..total nsec,%lld\n", (u_longlong_t)ds->iotime);
2575 2589 P("dumpbuf.iosize,%ld\n", dumpbuf.iosize);
2576 2590 P("dumpbuf.size,%ld\n", dumpbuf.size);
2577 2591
2578 2592 P("Dump pages/sec,%llu\n", (u_longlong_t)ds->npages / sec);
2579 2593 P("Dump pages,%llu\n", (u_longlong_t)ds->npages);
2580 2594 P("Dump time,%d\n", sec);
2581 2595
2582 2596 if (ds->pages_mapped > 0)
2583 2597 P("per-cent map utilization,%d\n", (int)((100 * ds->pages_used)
2584 2598 / ds->pages_mapped));
2585 2599
2586 2600 P("\nPer-page metrics:\n");
2587 2601 if (ds->npages > 0) {
2588 2602 for (hp = cfg->helper; hp != hpend; hp++) {
2589 2603 #define PERPAGE(x) ds->perpage.x += hp->perpage.x;
2590 2604 PERPAGES;
2591 2605 #undef PERPAGE
2592 2606 }
2593 2607 #define PERPAGE(x) \
2594 2608 P("%s nsec/page,%d\n", #x, (int)(ds->perpage.x / ds->npages));
2595 2609 PERPAGES;
2596 2610 #undef PERPAGE
2597 2611 P("freebufq.empty,%d\n", (int)(ds->freebufq.empty /
2598 2612 ds->npages));
2599 2613 P("helperq.empty,%d\n", (int)(ds->helperq.empty /
2600 2614 ds->npages));
2601 2615 P("writerq.empty,%d\n", (int)(ds->writerq.empty /
2602 2616 ds->npages));
2603 2617 P("mainq.empty,%d\n", (int)(ds->mainq.empty / ds->npages));
2604 2618
2605 2619 P("I/O wait nsec/page,%llu\n", (u_longlong_t)(ds->iowait /
2606 2620 ds->npages));
2607 2621 }
2608 2622 #undef P
2609 2623 if (p < e)
2610 2624 bzero(p, e - p);
2611 2625 return (p - buf);
2612 2626 }
2613 2627 #endif /* COLLECT_METRICS */
2614 2628
2615 2629 /*
2616 2630 * Dump the system.
2617 2631 */
2618 2632 void
2619 2633 dumpsys(void)
2620 2634 {
2621 2635 dumpsync_t *ds = &dumpsync;
2622 2636 taskq_t *livetaskq = NULL;
2623 2637 pfn_t pfn;
2624 2638 pgcnt_t bitnum;
2625 2639 proc_t *p;
2626 2640 helper_t *hp, *hpend = &dumpcfg.helper[dumpcfg.nhelper];
2627 2641 cbuf_t *cp;
2628 2642 pid_t npids, pidx;
2629 2643 char *content;
2630 2644 char *buf;
2631 2645 size_t size;
2632 2646 int save_dump_clevel;
2633 2647 dumpmlw_t mlw;
2634 2648 dumpcsize_t datatag;
2635 2649 dumpdatahdr_t datahdr;
2636 2650
2637 2651 if (dumpvp == NULL || dumphdr == NULL) {
2638 2652 uprintf("skipping system dump - no dump device configured\n");
2639 2653 if (panicstr) {
2640 2654 dumpcfg.helpers_wanted = 0;
2641 2655 dumpsys_spinunlock(&dumpcfg.helper_lock);
2642 2656 }
2643 2657 return;
2644 2658 }
2645 2659 dumpbuf.cur = dumpbuf.start;
2646 2660
2647 2661 /* clear the sync variables */
2648 2662 ASSERT(dumpcfg.nhelper > 0);
2649 2663 bzero(ds, sizeof (*ds));
2650 2664 ds->dumpcpu = CPU->cpu_id;
2651 2665
2652 2666 /*
2653 2667 * Calculate the starting block for dump. If we're dumping on a
2654 2668 * swap device, start 1/5 of the way in; otherwise, start at the
2655 2669 * beginning. And never use the first page -- it may be a disk label.
2656 2670 */
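/*
 * For example (hypothetical size): on a 10G swap device the dump
 * starts at the first multiple of DUMP_OFFSET at or above the 2G
 * mark; a dedicated dump device starts at DUMP_OFFSET itself.
 */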
2657 2671 if (dumpvp->v_flag & VISSWAP)
2658 2672 dumphdr->dump_start = P2ROUNDUP(dumpvp_size / 5, DUMP_OFFSET);
2659 2673 else
2660 2674 dumphdr->dump_start = DUMP_OFFSET;
2661 2675
2662 2676 dumphdr->dump_flags = DF_VALID | DF_COMPLETE | DF_LIVE | DF_COMPRESSED;
2663 2677 dumphdr->dump_crashtime = gethrestime_sec();
2664 2678 dumphdr->dump_npages = 0;
2665 2679 dumphdr->dump_nvtop = 0;
2666 2680 bzero(dumpcfg.bitmap, BT_SIZEOFMAP(dumpcfg.bitmapsize));
2667 2681 dump_timeleft = dump_timeout;
2668 2682
2669 2683 if (panicstr) {
2670 2684 dumphdr->dump_flags &= ~DF_LIVE;
2671 2685 (void) VOP_DUMPCTL(dumpvp, DUMP_FREE, NULL, NULL);
2672 2686 (void) VOP_DUMPCTL(dumpvp, DUMP_ALLOC, NULL, NULL);
2673 2687 (void) vsnprintf(dumphdr->dump_panicstring, DUMP_PANICSIZE,
2674 2688 panicstr, panicargs);
2675 2689
2676 2690 }
2677 2691
2678 2692 if (dump_conflags & DUMP_ALL)
2679 2693 content = "all";
2680 2694 else if (dump_conflags & DUMP_CURPROC)
2681 2695 content = "kernel + curproc";
2682 2696 else
2683 2697 content = "kernel";
2684 2698 uprintf("dumping to %s, offset %lld, content: %s\n", dumppath,
2685 2699 dumphdr->dump_start, content);
2686 2700
2687 2701 /* Make sure nodename is current */
2688 2702 bcopy(utsname.nodename, dumphdr->dump_utsname.nodename, SYS_NMLN);
2689 2703
2690 2704 /*
2691 2705 * If this is a live dump, try to open a VCHR vnode for better
2692 2706 * performance. We must take care to flush the buffer cache
2693 2707 * first.
2694 2708 */
2695 2709 if (!panicstr) {
2696 2710 vnode_t *cdev_vp, *cmn_cdev_vp;
2697 2711
2698 2712 ASSERT(dumpbuf.cdev_vp == NULL);
2699 2713 cdev_vp = makespecvp(VTOS(dumpvp)->s_dev, VCHR);
2700 2714 if (cdev_vp != NULL) {
2701 2715 cmn_cdev_vp = common_specvp(cdev_vp);
2702 2716 if (VOP_OPEN(&cmn_cdev_vp, FREAD | FWRITE, kcred, NULL)
2703 2717 == 0) {
2704 2718 if (vn_has_cached_data(dumpvp))
2705 2719 (void) pvn_vplist_dirty(dumpvp, 0, NULL,
2706 2720 B_INVAL | B_TRUNC, kcred);
2707 2721 dumpbuf.cdev_vp = cmn_cdev_vp;
2708 2722 } else {
2709 2723 VN_RELE(cdev_vp);
2710 2724 }
2711 2725 }
2712 2726 }
2713 2727
2714 2728 /*
2715 2729 * Store a hires timestamp so we can look it up during debugging.
2716 2730 */
2717 2731 lbolt_debug_entry();
2718 2732
2719 2733 /*
2720 2734 * Leave room for the message and ereport save areas and terminal dump
2721 2735 * header.
2722 2736 */
2723 2737 dumpbuf.vp_limit = dumpvp_size - DUMP_LOGSIZE - DUMP_OFFSET -
2724 2738 DUMP_ERPTSIZE;
2725 2739
2726 2740 /*
2727 2741 * Write out the symbol table. It's no longer compressed,
2728 2742 * so its 'size' and 'csize' are equal.
2729 2743 */
2730 2744 dumpbuf.vp_off = dumphdr->dump_ksyms = dumphdr->dump_start + PAGESIZE;
2731 2745 dumphdr->dump_ksyms_size = dumphdr->dump_ksyms_csize =
2732 2746 ksyms_snapshot(dumpvp_ksyms_write, NULL, LONG_MAX);
2733 2747
2734 2748 /*
2735 2749 * Write out the translation map.
2736 2750 */
2737 2751 dumphdr->dump_map = dumpvp_flush();
2738 2752 dump_as(&kas);
2739 2753 dumphdr->dump_nvtop += dump_plat_addr();
2740 2754
2741 2755 /*
2742 2756 * call into hat, which may have unmapped pages that also need to
2743 2757 * be in the dump
2744 2758 */
2745 2759 hat_dump();
2746 2760
2747 2761 if (dump_conflags & DUMP_ALL) {
2748 2762 mutex_enter(&pidlock);
2749 2763
2750 2764 for (npids = 0, p = practive; p != NULL; p = p->p_next)
2751 2765 dumpcfg.pids[npids++] = p->p_pid;
2752 2766
2753 2767 mutex_exit(&pidlock);
2754 2768
2755 2769 for (pidx = 0; pidx < npids; pidx++)
2756 2770 (void) dump_process(dumpcfg.pids[pidx]);
2757 2771
2758 2772 dump_init_memlist_walker(&mlw);
2759 2773 for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum++) {
2760 2774 dump_timeleft = dump_timeout;
2761 2775 pfn = dump_bitnum_to_pfn(bitnum, &mlw);
2762 2776 /*
2763 2777 * Some hypervisors do not have all pages available to
2764 2778 * be accessed by the guest OS. Check for page
2765 2779 * accessibility.
2766 2780 */
2767 2781 if (plat_hold_page(pfn, PLAT_HOLD_NO_LOCK, NULL) !=
2768 2782 PLAT_HOLD_OK)
2769 2783 continue;
2770 2784 BT_SET(dumpcfg.bitmap, bitnum);
2771 2785 }
2772 2786 dumphdr->dump_npages = dumpcfg.bitmapsize;
2773 2787 dumphdr->dump_flags |= DF_ALL;
2774 2788
2775 2789 } else if (dump_conflags & DUMP_CURPROC) {
2776 2790 /*
2777 2791 * Determine which pid is to be dumped. If we're panicking, we
2778 2792 * dump the process associated with panic_thread (if any). If
2779 2793 * this is a live dump, we dump the process associated with
2780 2794 * curthread.
2781 2795 */
2782 2796 npids = 0;
2783 2797 if (panicstr) {
2784 2798 if (panic_thread != NULL &&
2785 2799 panic_thread->t_procp != NULL &&
2786 2800 panic_thread->t_procp != &p0) {
2787 2801 dumpcfg.pids[npids++] =
2788 2802 panic_thread->t_procp->p_pid;
2789 2803 }
2790 2804 } else {
2791 2805 dumpcfg.pids[npids++] = curthread->t_procp->p_pid;
2792 2806 }
2793 2807
2794 2808 if (npids && dump_process(dumpcfg.pids[0]) == 0)
2795 2809 dumphdr->dump_flags |= DF_CURPROC;
2796 2810 else
2797 2811 dumphdr->dump_flags |= DF_KERNEL;
2798 2812
2799 2813 } else {
2800 2814 dumphdr->dump_flags |= DF_KERNEL;
2801 2815 }
2802 2816
2803 2817 dumphdr->dump_hashmask = (1 << highbit(dumphdr->dump_nvtop - 1)) - 1;
2804 2818
2805 2819 /*
2806 2820 * Write out the pfn table.
2807 2821 */
2808 2822 dumphdr->dump_pfn = dumpvp_flush();
2809 2823 dump_init_memlist_walker(&mlw);
2810 2824 for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum++) {
2811 2825 dump_timeleft = dump_timeout;
2812 2826 if (!BT_TEST(dumpcfg.bitmap, bitnum))
2813 2827 continue;
2814 2828 pfn = dump_bitnum_to_pfn(bitnum, &mlw);
2815 2829 ASSERT(pfn != PFN_INVALID);
2816 2830 dumpvp_write(&pfn, sizeof (pfn_t));
2817 2831 }
2818 2832 dump_plat_pfn();
2819 2833
2820 2834 /*
2821 2835 * Write out all the pages.
2822 2836 * Map pages, copy them handling UEs, compress, and write them out.
2823 2837 * Cooperate with any helpers running on CPUs in panic_idle().
2824 2838 */
2825 2839 dumphdr->dump_data = dumpvp_flush();
2826 2840
2827 2841 bzero(dumpcfg.helpermap, BT_SIZEOFMAP(NCPU));
2828 2842 ds->live = dumpcfg.clevel > 0 &&
2829 2843 (dumphdr->dump_flags & DF_LIVE) != 0;
2830 2844
2831 2845 save_dump_clevel = dumpcfg.clevel;
2832 2846 if (panicstr)
2833 2847 dumpsys_get_maxmem();
2834 2848 else if (dumpcfg.clevel >= DUMP_CLEVEL_BZIP2)
2835 2849 dumpcfg.clevel = DUMP_CLEVEL_LZJB;
2836 2850
2837 2851 dumpcfg.nhelper_used = 0;
2838 2852 for (hp = dumpcfg.helper; hp != hpend; hp++) {
2839 2853 if (hp->page == NULL) {
2840 2854 hp->helper = DONEHELPER;
2841 2855 continue;
2842 2856 }
2843 2857 ++dumpcfg.nhelper_used;
2844 2858 hp->helper = FREEHELPER;
2845 2859 hp->taskqid = NULL;
2846 2860 hp->ds = ds;
2847 2861 bzero(&hp->perpage, sizeof (hp->perpage));
2848 2862 if (dumpcfg.clevel >= DUMP_CLEVEL_BZIP2)
2849 2863 (void) BZ2_bzCompressReset(&hp->bzstream);
2850 2864 }
2851 2865
2852 2866 CQ_OPEN(freebufq);
2853 2867 CQ_OPEN(helperq);
2854 2868
2855 2869 dumpcfg.ncbuf_used = 0;
2856 2870 for (cp = dumpcfg.cbuf; cp != &dumpcfg.cbuf[dumpcfg.ncbuf]; cp++) {
2857 2871 if (cp->buf != NULL) {
2858 2872 CQ_PUT(freebufq, cp, CBUF_FREEBUF);
2859 2873 ++dumpcfg.ncbuf_used;
2860 2874 }
2861 2875 }
2862 2876
2863 2877 for (cp = dumpcfg.cmap; cp != &dumpcfg.cmap[dumpcfg.ncmap]; cp++)
2864 2878 CQ_PUT(mainq, cp, CBUF_FREEMAP);
2865 2879
2866 2880 ds->start = gethrtime();
2867 2881 ds->iowaitts = ds->start;
2868 2882
2869 2883 /* start helpers */
2870 2884 if (ds->live) {
2871 2885 int n = dumpcfg.nhelper_used;
2872 2886 int pri = MINCLSYSPRI - 25;
2873 2887
2874 2888 livetaskq = taskq_create("LiveDump", n, pri, n, n,
2875 2889 TASKQ_PREPOPULATE);
2876 2890 for (hp = dumpcfg.helper; hp != hpend; hp++) {
2877 2891 if (hp->page == NULL)
2878 2892 continue;
2879 2893 hp->helper = hp - dumpcfg.helper;
2880 2894 hp->taskqid = taskq_dispatch(livetaskq,
2881 2895 dumpsys_live_helper, (void *)hp, TQ_NOSLEEP);
2882 2896 }
2883 2897
2884 2898 } else {
2885 2899 if (panicstr)
2886 2900 kmem_dump_begin();
2887 2901 dumpcfg.helpers_wanted = dumpcfg.clevel > 0;
2888 2902 dumpsys_spinunlock(&dumpcfg.helper_lock);
2889 2903 }
2890 2904
2891 2905 /* run main task */
2892 2906 dumpsys_main_task(ds);
2893 2907
2894 2908 ds->elapsed = gethrtime() - ds->start;
2895 2909 if (ds->elapsed < 1)
2896 2910 ds->elapsed = 1;
2897 2911
2898 2912 if (livetaskq != NULL)
2899 2913 taskq_destroy(livetaskq);
2900 2914
2901 2915 if (ds->neednl) {
2902 2916 uprintf("\n");
2903 2917 ds->neednl = 0;
2904 2918 }
2905 2919
2906 2920 /* record actual pages dumped */
2907 2921 dumphdr->dump_npages = ds->npages;
2908 2922
2909 2923 /* platform-specific data */
2910 2924 dumphdr->dump_npages += dump_plat_data(dumpcfg.cbuf[0].buf);
2911 2925
2912 2926 /* note any errors by clearing DF_COMPLETE */
2913 2927 if (dump_ioerr || ds->npages < dumphdr->dump_npages)
2914 2928 dumphdr->dump_flags &= ~DF_COMPLETE;
2915 2929
2916 2930 /* end of stream blocks */
2917 2931 datatag = 0;
2918 2932 dumpvp_write(&datatag, sizeof (datatag));
2919 2933
2920 2934 bzero(&datahdr, sizeof (datahdr));
2921 2935
2922 2936 /* buffer for metrics */
2923 2937 buf = dumpcfg.cbuf[0].buf;
2924 2938 size = MIN(dumpcfg.cbuf[0].size, DUMP_OFFSET - sizeof (dumphdr_t) -
2925 2939 sizeof (dumpdatahdr_t));
2926 2940
2927 2941 /* finish the kmem intercepts, collect kmem verbose info */
2928 2942 if (panicstr) {
2929 2943 datahdr.dump_metrics = kmem_dump_finish(buf, size);
2930 2944 buf += datahdr.dump_metrics;
2931 2945 size -= datahdr.dump_metrics;
2932 2946 }
2933 2947
2934 2948 /* record in the header whether this is a fault-management panic */
2935 2949 if (panicstr)
2936 2950 dumphdr->dump_fm_panic = is_fm_panic();
2937 2951
2938 2952 /* compression info in data header */
2939 2953 datahdr.dump_datahdr_magic = DUMP_DATAHDR_MAGIC;
2940 2954 datahdr.dump_datahdr_version = DUMP_DATAHDR_VERSION;
2941 2955 datahdr.dump_maxcsize = CBUF_SIZE;
2942 2956 datahdr.dump_maxrange = CBUF_MAPSIZE / PAGESIZE;
2943 2957 datahdr.dump_nstreams = dumpcfg.nhelper_used;
2944 2958 datahdr.dump_clevel = dumpcfg.clevel;
2945 2959 #ifdef COLLECT_METRICS
2946 2960 if (dump_metrics_on)
2947 2961 datahdr.dump_metrics += dumpsys_metrics(ds, buf, size);
2948 2962 #endif
2949 2963 datahdr.dump_data_csize = dumpvp_flush() - dumphdr->dump_data;
2950 2964
2951 2965 /*
2952 2966 * Write out the initial and terminal dump headers.
2953 2967 */
2954 2968 dumpbuf.vp_off = dumphdr->dump_start;
2955 2969 dumpvp_write(dumphdr, sizeof (dumphdr_t));
2956 2970 (void) dumpvp_flush();
2957 2971
2958 2972 dumpbuf.vp_limit = dumpvp_size;
2959 2973 dumpbuf.vp_off = dumpbuf.vp_limit - DUMP_OFFSET;
2960 2974 dumpvp_write(dumphdr, sizeof (dumphdr_t));
2961 2975 dumpvp_write(&datahdr, sizeof (dumpdatahdr_t));
2962 2976 dumpvp_write(dumpcfg.cbuf[0].buf, datahdr.dump_metrics);
2963 2977
2964 2978 (void) dumpvp_flush();
2965 2979
2966 2980 uprintf("\r%3d%% done: %llu pages dumped, ",
2967 2981 ds->percent_done, (u_longlong_t)ds->npages);
2968 2982
2969 2983 if (dump_ioerr == 0) {
2970 2984 uprintf("dump succeeded\n");
2971 2985 } else {
2972 2986 uprintf("dump failed: error %d\n", dump_ioerr);
2973 2987 #ifdef DEBUG
2974 2988 if (panicstr)
2975 2989 debug_enter("dump failed");
2976 2990 #endif
2977 2991 }
2978 2992
2979 2993 /*
2980 2994 * Write out all undelivered messages. This has to be the *last*
2981 2995 * thing we do because the dump process itself emits messages.
2982 2996 */
2983 2997 if (panicstr) {
2984 2998 dump_summary();
2985 2999 dump_ereports();
2986 3000 dump_messages();
2987 3001 }
2988 3002
2989 3003 delay(2 * hz); /* let people see the 'done' message */
2990 3004 dump_timeleft = 0;
2991 3005 dump_ioerr = 0;
2992 3006
2993 3007 /* restore settings after live dump completes */
2994 3008 if (!panicstr) {
2995 3009 dumpcfg.clevel = save_dump_clevel;
2996 3010
2997 3011 /* release any VCHR open of the dump device */
2998 3012 if (dumpbuf.cdev_vp != NULL) {
2999 3013 (void) VOP_CLOSE(dumpbuf.cdev_vp, FREAD | FWRITE, 1, 0,
3000 3014 kcred, NULL);
3001 3015 VN_RELE(dumpbuf.cdev_vp);
3002 3016 dumpbuf.cdev_vp = NULL;
3003 3017 }
3004 3018 }
3005 3019 }
3006 3020
3007 3021 /*
3008 3022 * This function is called whenever the memory size, as represented
3009 3023 * by the phys_install list, changes.
3010 3024 */
3011 3025 void
3012 3026 dump_resize()
3013 3027 {
3014 3028 mutex_enter(&dump_lock);
3015 3029 dumphdr_init();
3016 3030 dumpbuf_resize();
3017 3031 dump_update_clevel();
3018 3032 mutex_exit(&dump_lock);
3019 3033 }
3020 3034
3021 3035 /*
3022 3036 * This function allows for dynamic resizing of a dump area. It assumes that
3023 3037 * the underlying device has updated its size(9P) property.
3024 3038 */
3025 3039 int
3026 3040 dumpvp_resize()
3027 3041 {
3028 3042 int error;
3029 3043 vattr_t vattr;
3030 3044
3031 3045 mutex_enter(&dump_lock);
3032 3046 vattr.va_mask = AT_SIZE;
3033 3047 if ((error = VOP_GETATTR(dumpvp, &vattr, 0, kcred, NULL)) != 0) {
3034 3048 mutex_exit(&dump_lock);
3035 3049 return (error);
3036 3050 }
3037 3051
3038 3052 if (error == 0 && vattr.va_size < 2 * DUMP_LOGSIZE + DUMP_ERPTSIZE) {
3039 3053 mutex_exit(&dump_lock);
3040 3054 return (ENOSPC);
3041 3055 }
3042 3056
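/*
 * Masking with -DUMP_OFFSET rounds va_size down to a multiple of
 * DUMP_OFFSET, equivalent to P2ALIGN(va_size, DUMP_OFFSET); the
 * negation trick assumes DUMP_OFFSET is a power of two.
 */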
3043 3057 dumpvp_size = vattr.va_size & -DUMP_OFFSET;
3044 3058 mutex_exit(&dump_lock);
3045 3059 return (0);
3046 3060 }
3047 3061
3048 3062 int
3049 3063 dump_set_uuid(const char *uuidstr)
3050 3064 {
3051 3065 const char *ptr;
3052 3066 int i;
3053 3067
3054 3068 if (uuidstr == NULL || strnlen(uuidstr, 36 + 1) != 36)
3055 3069 return (EINVAL);
3056 3070
3057 3071 /* uuid_parse is not common code so check manually */
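/*
 * Accepts only the canonical 8-4-4-4-12 layout, e.g.
 * "00112233-4455-6677-8899-aabbccddeeff": 36 characters with
 * hyphens at offsets 8, 13, 18 and 23 and hex digits elsewhere.
 */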
3058 3072 for (i = 0, ptr = uuidstr; i < 36; i++, ptr++) {
3059 3073 switch (i) {
3060 3074 case 8:
3061 3075 case 13:
3062 3076 case 18:
3063 3077 case 23:
3064 3078 if (*ptr != '-')
3065 3079 return (EINVAL);
3066 3080 break;
3067 3081
3068 3082 default:
3069 3083 if (!isxdigit(*ptr))
3070 3084 return (EINVAL);
3071 3085 break;
3072 3086 }
3073 3087 }
3074 3088
3075 3089 if (dump_osimage_uuid[0] != '\0')
3076 3090 return (EALREADY);
3077 3091
3078 3092 (void) strncpy(dump_osimage_uuid, uuidstr, 36 + 1);
3079 3093
3080 3094 cmn_err(CE_CONT, "?This Solaris instance has UUID %s\n",
3081 3095 dump_osimage_uuid);
3082 3096
3083 3097 return (0);
3084 3098 }
3085 3099
3086 3100 const char *
3087 3101 dump_get_uuid(void)
3088 3102 {
3089 3103 return (dump_osimage_uuid[0] != '\0' ? dump_osimage_uuid : "");
3090 3104 }
↓ open down ↓ |
2394 lines elided |
↑ open up ↑ |