9694 Parallel dump hangs
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
Reviewed by: Evan Layton <evan.layton@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: John Levon <levon@movementarian.org>
--- old/usr/src/uts/common/os/dumpsubr.c
+++ new/usr/src/uts/common/os/dumpsubr.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 * Copyright 2018 Joyent, Inc.
25 + * Copyright 2018 Nexenta Systems, Inc. All rights reserved.
25 26 */
26 27
27 28 #include <sys/types.h>
28 29 #include <sys/param.h>
29 30 #include <sys/systm.h>
30 31 #include <sys/vm.h>
31 32 #include <sys/proc.h>
32 33 #include <sys/file.h>
33 34 #include <sys/conf.h>
34 35 #include <sys/kmem.h>
35 36 #include <sys/mem.h>
36 37 #include <sys/mman.h>
37 38 #include <sys/vnode.h>
38 39 #include <sys/errno.h>
39 40 #include <sys/memlist.h>
40 41 #include <sys/dumphdr.h>
41 42 #include <sys/dumpadm.h>
42 43 #include <sys/ksyms.h>
43 44 #include <sys/compress.h>
44 45 #include <sys/stream.h>
45 46 #include <sys/strsun.h>
46 47 #include <sys/cmn_err.h>
47 48 #include <sys/bitmap.h>
48 49 #include <sys/modctl.h>
49 50 #include <sys/utsname.h>
50 51 #include <sys/systeminfo.h>
51 52 #include <sys/vmem.h>
52 53 #include <sys/log.h>
53 54 #include <sys/var.h>
54 55 #include <sys/debug.h>
55 56 #include <sys/sunddi.h>
56 57 #include <fs/fs_subr.h>
57 58 #include <sys/fs/snode.h>
58 59 #include <sys/ontrap.h>
59 60 #include <sys/panic.h>
60 61 #include <sys/dkio.h>
61 62 #include <sys/vtoc.h>
62 63 #include <sys/errorq.h>
63 64 #include <sys/fm/util.h>
64 65 #include <sys/fs/zfs.h>
65 66
66 67 #include <vm/hat.h>
67 68 #include <vm/as.h>
68 69 #include <vm/page.h>
69 70 #include <vm/pvn.h>
70 71 #include <vm/seg.h>
71 72 #include <vm/seg_kmem.h>
72 73 #include <sys/clock_impl.h>
73 74 #include <sys/hold_page.h>
75 +#include <sys/cpu.h>
74 76
75 77 #include <bzip2/bzlib.h>
76 78
77 79 #define ONE_GIG (1024 * 1024 * 1024UL)
78 80
79 81 /*
80 82 * Crash dump time is dominated by disk write time. To reduce this,
81 83 * the stronger compression method bzip2 is applied to reduce the dump
82 84 * size and hence reduce I/O time. However, bzip2 is much more
83 85 * computationally expensive than the existing lzjb algorithm, so to
84 86 * avoid increasing compression time, CPUs that are otherwise idle
85 87 * during panic are employed to parallelize the compression task.
86 88 * Many helper CPUs are needed to prevent bzip2 from being a
87 89 * bottleneck, and on systems with too few CPUs, the lzjb algorithm is
88 90 * parallelized instead. Lastly, I/O and compression are performed by
89 91 * different CPUs, and are hence overlapped in time, unlike the older
90 92 * serial code.
91 93 *
92 94 * Another important consideration is the speed of the dump
93 95 * device. Faster disks need fewer CPUs in order to benefit from
94 96 * parallel lzjb versus parallel bzip2. Therefore, the CPU count
95 97 * threshold for switching from parallel lzjb to parallel bzip2 is
96 98 * elevated for faster disks. The dump device speed is inferred from
97 99 * the setting for dumpbuf.iosize, see dump_update_clevel.
98 100 */
99 101
100 102 /*
101 103 * exported vars
102 104 */
103 105 kmutex_t dump_lock; /* lock for dump configuration */
104 106 dumphdr_t *dumphdr; /* dump header */
105 107 int dump_conflags = DUMP_KERNEL; /* dump configuration flags */
106 108 vnode_t *dumpvp; /* dump device vnode pointer */
107 109 u_offset_t dumpvp_size; /* size of dump device, in bytes */
108 110 char *dumppath; /* pathname of dump device */
109 111 int dump_timeout = 120; /* timeout for dumping pages */
110 112 int dump_timeleft; /* portion of dump_timeout remaining */
111 113 int dump_ioerr; /* dump i/o error */
112 114 int dump_check_used; /* enable check for used pages */
113 115 char *dump_stack_scratch; /* scratch area for saving stack summary */
114 116
115 117 /*
116 118 * Tunables for dump compression and parallelism. These can be set via
117 119 * /etc/system.
118 120 *
119 121 * dump_ncpu_low number of helpers for parallel lzjb
120 122 * This is also the minimum configuration.
121 123 *
122 124 * dump_bzip2_level bzip2 compression level: 1-9
123 125 * Higher numbers give greater compression, but take more memory
124 126 * and time. Memory used per helper is ~(dump_bzip2_level * 1MB).
125 127 *
126 128 * dump_plat_mincpu the cross-over limit for using bzip2 (per platform):
127 129 * if dump_plat_mincpu == 0, then always do single threaded dump
128 130 * if ncpu >= dump_plat_mincpu then try to use bzip2
129 131 *
130 132 * dump_metrics_on if set, metrics are collected in the kernel, passed
131 133 * to savecore via the dump file, and recorded by savecore in
132 134 * METRICS.txt.
133 135 */
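/*
 * Illustrative sketch (not part of this change): the tunables above are
 * set from /etc/system, e.g.
 *
 *	set dump_plat_mincpu = 0	(always dump single-threaded)
 *	set dump_plat_mincpu = 4	(try bzip2 once ncpu >= 4)
 *	set dump_bzip2_level = 9	(best compression, ~9MB per helper)
 *
 * The names are the real variables; the values are only examples.
 */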
134 136 uint_t dump_ncpu_low = 4; /* minimum config for parallel lzjb */
135 137 uint_t dump_bzip2_level = 1; /* bzip2 level (1-9) */
136 138
137 139 /* Use dump_plat_mincpu_default unless this variable is set by /etc/system */
138 140 #define MINCPU_NOT_SET ((uint_t)-1)
139 141 uint_t dump_plat_mincpu = MINCPU_NOT_SET;
140 142
141 143 /* tunables for pre-reserved heap */
142 144 uint_t dump_kmem_permap = 1024;
143 145 uint_t dump_kmem_pages = 0;
144 146
145 147 /* Define multiple buffers per helper to avoid stalling */
146 148 #define NCBUF_PER_HELPER 2
147 149 #define NCMAP_PER_HELPER 4
148 150
149 151 /* minimum number of helpers configured */
150 152 #define MINHELPERS (dump_ncpu_low)
151 153 #define MINCBUFS (MINHELPERS * NCBUF_PER_HELPER)
152 154
153 155 /*
154 156 * Define constant parameters.
155 157 *
156 158 * CBUF_SIZE size of an output buffer
157 159 *
158 160 * CBUF_MAPSIZE size of virtual range for mapping pages
159 161 *
160 162 * CBUF_MAPNP size of virtual range in pages
161 163 *
162 164 */
163 165 #define DUMP_1KB ((size_t)1 << 10)
164 166 #define DUMP_1MB ((size_t)1 << 20)
165 167 #define CBUF_SIZE ((size_t)1 << 17)
166 168 #define CBUF_MAPSHIFT (22)
167 169 #define CBUF_MAPSIZE ((size_t)1 << CBUF_MAPSHIFT)
168 170 #define CBUF_MAPNP ((size_t)1 << (CBUF_MAPSHIFT - PAGESHIFT))
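/*
 * For concreteness (assuming 4K pages, i.e. PAGESHIFT == 12): CBUF_SIZE
 * is 128K, CBUF_MAPSIZE is 4M, and CBUF_MAPNP is 1024, so each mapping
 * buffer covers one 4M-aligned range of 1024 page frames.
 */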
169 171
170 172 /*
171 173 * Compression metrics are accumulated nano-second subtotals. The
172 174 * results are normalized by the number of pages dumped. A report is
173 175 * generated when dumpsys() completes and is saved in the dump image
174 176 * after the trailing dump header.
175 177 *
176 178 * Metrics are always collected. Set the variable dump_metrics_on to
177 179 * cause metrics to be saved in the crash file, where savecore will
178 180 * save it in the file METRICS.txt.
179 181 */
180 182 #define PERPAGES \
181 183 PERPAGE(bitmap) PERPAGE(map) PERPAGE(unmap) \
182 184 PERPAGE(copy) PERPAGE(compress) \
183 185 PERPAGE(write) \
184 186 PERPAGE(inwait) PERPAGE(outwait)
185 187
186 188 typedef struct perpage {
187 189 #define PERPAGE(x) hrtime_t x;
188 190 PERPAGES
189 191 #undef PERPAGE
190 192 } perpage_t;
191 193
192 194 /*
193 195 * This macro controls the code generation for collecting dump
194 196 * performance information. By default, the code is generated, but
195 197 * automatic saving of the information is disabled. If dump_metrics_on
196 198 * is set to 1, the timing information is passed to savecore via the
197 199 * crash file, where it is appended to the file dump-dir/METRICS.txt.
198 200 */
199 201 #define COLLECT_METRICS
200 202
201 203 #ifdef COLLECT_METRICS
202 204 uint_t dump_metrics_on = 0; /* set to 1 to enable recording metrics */
203 205
204 206 #define HRSTART(v, m) v##ts.m = gethrtime()
205 207 #define HRSTOP(v, m) v.m += gethrtime() - v##ts.m
206 208 #define HRBEGIN(v, m, s) v##ts.m = gethrtime(); v.size += s
207 209 #define HREND(v, m) v.m += gethrtime() - v##ts.m
208 210 #define HRNORM(v, m, n) v.m /= (n)
209 211
210 212 #else
211 213 #define HRSTART(v, m)
212 214 #define HRSTOP(v, m)
213 215 #define HRBEGIN(v, m, s)
214 216 #define HREND(v, m)
215 217 #define HRNORM(v, m, n)
216 218 #endif /* COLLECT_METRICS */
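/*
 * A sketch of how the timer macros pair up (hypothetical call site; the
 * perpage/perpagets member pair lives in helper_t and dumpsync_t below,
 * and "##ts" pastes onto the final member name):
 *
 *	HRSTART(hp->perpage, compress);	expands to
 *					hp->perpagets.compress = gethrtime()
 *	... the work being timed ...
 *	HRSTOP(hp->perpage, compress);	accumulates the delta into
 *					hp->perpage.compress
 */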
217 219
218 220 /*
219 221 * Buffers for copying and compressing memory pages.
220 222 *
221 223 * cbuf_t buffer controllers: used for both input and output.
222 224 *
223 225 * The buffer state indicates how it is being used:
224 226 *
225 227 * CBUF_FREEMAP: CBUF_MAPSIZE virtual address range is available for
226 228 * mapping input pages.
227 229 *
228 230 * CBUF_INREADY: input pages are mapped and ready for compression by a
229 231 * helper.
230 232 *
231 233 * CBUF_USEDMAP: mapping has been consumed by a helper. Needs unmap.
232 234 *
233 235 * CBUF_FREEBUF: CBUF_SIZE output buffer, which is available.
234 236 *
235 237 * CBUF_WRITE: CBUF_SIZE block of compressed pages from a helper,
236 238 * ready to write out.
237 239 *
238 240 * CBUF_ERRMSG: CBUF_SIZE block of error messages from a helper
239 241 * (reports UE errors.)
240 242 */
241 243
242 244 typedef enum cbufstate {
243 245 CBUF_FREEMAP,
244 246 CBUF_INREADY,
245 247 CBUF_USEDMAP,
246 248 CBUF_FREEBUF,
247 249 CBUF_WRITE,
248 250 CBUF_ERRMSG
249 251 } cbufstate_t;
250 252
251 253 typedef struct cbuf cbuf_t;
252 254
253 255 struct cbuf {
254 256 cbuf_t *next; /* next in list */
255 257 cbufstate_t state; /* processing state */
256 258 size_t used; /* amount used */
257 259 size_t size; /* mem size */
258 260 char *buf; /* kmem or vmem */
259 261 pgcnt_t pagenum; /* index to pfn map */
260 262 pgcnt_t bitnum; /* first set bitnum */
261 263 pfn_t pfn; /* first pfn in mapped range */
262 264 int off; /* byte offset to first pfn */
263 265 };
264 266
265 267 static char dump_osimage_uuid[36 + 1];
266 268
267 269 #define isdigit(ch) ((ch) >= '0' && (ch) <= '9')
268 270 #define isxdigit(ch) (isdigit(ch) || ((ch) >= 'a' && (ch) <= 'f') || \
269 271 ((ch) >= 'A' && (ch) <= 'F'))
270 272
271 273 /*
272 274 * cqueue_t queues: a uni-directional channel for communication
273 275 * from the master to helper tasks or vice-versa using put and
274 276 * get primitives. Both mappings and data buffers are passed via
275 277 * queues. Producers close a queue when done. The number of
276 278 * active producers is reference counted so the consumer can
277 279 * detect end of data. Concurrent access is mediated by atomic
278 280 * operations for panic dump, or mutex/cv for live dump.
279 281 *
280 282 * There are four queues, used as follows:
281 283 *
282 284 * Queue Dataflow NewState
283 285 * --------------------------------------------------
284 286 * mainq master -> master FREEMAP
285 287 * master has initialized or unmapped an input buffer
286 288 * --------------------------------------------------
287 289 * helperq master -> helper INREADY
288 290 * master has mapped input for use by helper
289 291 * --------------------------------------------------
290 292 * mainq master <- helper USEDMAP
291 293 * helper is done with input
292 294 * --------------------------------------------------
293 295 * freebufq master -> helper FREEBUF
294 296 * master has initialized or written an output buffer
295 297 * --------------------------------------------------
296 298 * mainq master <- helper WRITE
297 299 * block of compressed pages from a helper
298 300 * --------------------------------------------------
299 301 * mainq master <- helper ERRMSG
300 302 * error messages from a helper (memory error case)
301 303 * --------------------------------------------------
302 304 * writerq master <- master WRITE
303 305 * non-blocking queue of blocks to write
304 306 * --------------------------------------------------
305 307 */
306 308 typedef struct cqueue {
307 309 cbuf_t *volatile first; /* first in list */
308 310 cbuf_t *last; /* last in list */
309 311 hrtime_t ts; /* timestamp */
310 312 hrtime_t empty; /* total time empty */
311 313 kmutex_t mutex; /* live state lock */
312 314 kcondvar_t cv; /* live wait var */
313 315 lock_t spinlock; /* panic mode spin lock */
314 316 volatile uint_t open; /* producer ref count */
315 317 } cqueue_t;
316 318
317 319 /*
318 320 * Convenience macros for using the cqueue functions
319 321 * Note that the caller must have defined "dumpsync_t *ds"
320 322 */
321 323 #define CQ_IS_EMPTY(q) \
322 324 (ds->q.first == NULL)
323 325
324 326 #define CQ_OPEN(q) \
325 327 atomic_inc_uint(&ds->q.open)
326 328
327 329 #define CQ_CLOSE(q) \
328 330 dumpsys_close_cq(&ds->q, ds->live)
329 331
330 332 #define CQ_PUT(q, cp, st) \
331 333 dumpsys_put_cq(&ds->q, cp, st, ds->live)
332 334
333 335 #define CQ_GET(q) \
334 336 dumpsys_get_cq(&ds->q, ds->live)
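/*
 * A minimal sketch of a consumer loop built on these macros
 * (hypothetical; assumes "dumpsync_t *ds" is in scope as required, and
 * that CQ_GET returns NULL once every producer has closed the queue):
 *
 *	cbuf_t *cp;
 *	while ((cp = CQ_GET(helperq)) != NULL) {
 *		... compress the pages mapped at cp->buf ...
 *		CQ_PUT(mainq, cp, CBUF_USEDMAP);
 *	}
 */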
335 337
336 338 /*
337 339 * Dynamic state when dumpsys() is running.
338 340 */
339 341 typedef struct dumpsync {
340 342 pgcnt_t npages; /* subtotal of pages dumped */
341 343 pgcnt_t pages_mapped; /* subtotal of pages mapped */
342 344 pgcnt_t pages_used; /* subtotal of pages used per map */
343 345 size_t nwrite; /* subtotal of bytes written */
344 346 uint_t live; /* running live dump */
345 347 uint_t neednl; /* will need to print a newline */
346 348 uint_t percent; /* dump progress */
347 349 uint_t percent_done; /* dump progress reported */
348 350 int sec_done; /* dump progress last report time */
349 351 cqueue_t freebufq; /* free kmem bufs for writing */
350 352 cqueue_t mainq; /* input for main task */
351 353 cqueue_t helperq; /* input for helpers */
352 354 cqueue_t writerq; /* input for writer */
353 355 hrtime_t start; /* start time */
354 356 hrtime_t elapsed; /* elapsed time when completed */
355 357 hrtime_t iotime; /* time spent writing nwrite bytes */
356 358 hrtime_t iowait; /* time spent waiting for output */
357 359 hrtime_t iowaitts; /* iowait timestamp */
358 360 perpage_t perpage; /* metrics */
359 361 perpage_t perpagets;
360 362 int dumpcpu; /* master cpu */
361 363 } dumpsync_t;
362 364
363 365 static dumpsync_t dumpsync; /* synchronization vars */
364 366
365 367 /*
366 368 * helper_t helpers: contains the context for a stream. CPUs run in
367 369 * parallel at dump time; each CPU creates a single stream of
368 370 * compression data. Stream data is divided into CBUF_SIZE blocks.
369 371 * The blocks are written in order within a stream. But, blocks from
370 372 * multiple streams can be interleaved. Each stream is identified by a
371 373 * unique tag.
372 374 */
373 375 typedef struct helper {
374 376 int helper; /* bound helper id */
375 377 int tag; /* compression stream tag */
376 378 perpage_t perpage; /* per page metrics */
377 379 perpage_t perpagets; /* per page metrics (timestamps) */
378 380 taskqid_t taskqid; /* live dump task ptr */
379 381 int in, out; /* buffer offsets */
380 382 cbuf_t *cpin, *cpout, *cperr; /* cbuf objects in process */
381 383 dumpsync_t *ds; /* pointer to sync vars */
382 384 size_t used; /* counts input consumed */
383 385 char *page; /* buffer for page copy */
384 386 char *lzbuf; /* lzjb output */
385 387 bz_stream bzstream; /* bzip2 state */
386 388 } helper_t;
387 389
388 390 #define MAINHELPER (-1) /* helper is also the main task */
389 391 #define FREEHELPER (-2) /* unbound helper */
390 392 #define DONEHELPER (-3) /* helper finished */
391 393
392 394 /*
393 395 * configuration vars for dumpsys
394 396 */
395 397 typedef struct dumpcfg {
396 398 int threshold; /* ncpu threshold for bzip2 */
397 399 int nhelper; /* number of helpers */
398 400 int nhelper_used; /* actual number of helpers used */
399 401 int ncmap; /* number of VA pages for compression */
400 402 int ncbuf; /* number of bufs for compression */
401 403 int ncbuf_used; /* number of bufs in use */
402 404 uint_t clevel; /* dump compression level */
403 405 helper_t *helper; /* array of helpers */
404 406 cbuf_t *cmap; /* array of input (map) buffers */
405 407 cbuf_t *cbuf; /* array of output buffers */
406 408 ulong_t *helpermap; /* set of dumpsys helper CPU ids */
407 409 ulong_t *bitmap; /* bitmap for marking pages to dump */
408 410 ulong_t *rbitmap; /* bitmap for used CBUF_MAPSIZE ranges */
409 411 pgcnt_t bitmapsize; /* size of bitmap */
410 412 pgcnt_t rbitmapsize; /* size of bitmap for ranges */
411 413 pgcnt_t found4m; /* number of ranges allocated by dump */
412 414 pgcnt_t foundsm; /* number of small pages allocated by dump */
413 415 pid_t *pids; /* list of process IDs at dump time */
414 416 size_t maxsize; /* memory size needed at dump time */
415 417 size_t maxvmsize; /* size of reserved VM */
416 418 char *maxvm; /* reserved VM for spare pages */
417 419 lock_t helper_lock; /* protect helper state */
418 420 char helpers_wanted; /* flag to enable parallelism */
419 421 } dumpcfg_t;
420 422
421 423 static dumpcfg_t dumpcfg; /* config vars */
422 424
423 425 /*
424 426 * The dump I/O buffer.
425 427 *
426 428 * There is one I/O buffer used by dumpvp_write and dumpvp_flush. It is
427 429 * sized according to the optimum device transfer speed.
428 430 */
429 431 typedef struct dumpbuf {
430 432 vnode_t *cdev_vp; /* VCHR open of the dump device */
431 433 len_t vp_limit; /* maximum write offset */
432 434 offset_t vp_off; /* current dump device offset */
433 435 char *cur; /* dump write pointer */
434 436 char *start; /* dump buffer address */
435 437 char *end; /* dump buffer end */
436 438 size_t size; /* size of dumpbuf in bytes */
437 439 size_t iosize; /* best transfer size for device */
438 440 } dumpbuf_t;
439 441
440 442 dumpbuf_t dumpbuf; /* I/O buffer */
441 443
442 444 /*
445 + * For parallel dump, defines the maximum time the main task thread
446 + * will wait for at least one helper to register in dumpcfg.helpermap
447 + * before assuming there are no helpers and falling back to serial
448 + * mode. The value is chosen arbitrarily and provides a *really* long
449 + * wait for any available helper to register.
450 + */
451 +#define DUMP_HELPER_MAX_WAIT 1000 /* millisec */
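/*
 * A sketch of how this bound can be applied (hypothetical; the hunk
 * that actually consumes the constant is elsewhere in this file).
 * BT_TEST, dumpcfg.helpermap, and dumpcfg.helpers_wanted are the real
 * names; the loop shape here is only an illustration:
 *
 *	int k, seen = 0;
 *	hrtime_t deadline = gethrtime() +
 *	    DUMP_HELPER_MAX_WAIT * (NANOSEC / MILLISEC);
 *	while (!seen && gethrtime() < deadline) {
 *		for (k = 0; k < NCPU; k++)
 *			if (BT_TEST(dumpcfg.helpermap, k))
 *				seen = 1;
 *	}
 *	if (!seen)
 *		dumpcfg.helpers_wanted = 0;	(fall back to serial)
 */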
452 +
453 +/*
443 454 * The dump I/O buffer must be at least one page, at most xfer_size
444 455 * bytes, and should scale with physmem in between. The transfer size
445 456 * passed in will either represent a global default (maxphys) or the
446 457 * best size for the device. The size of the dumpbuf I/O buffer is
447 458 * limited by dumpbuf_limit (8MB by default) because the dump
448 459 * performance saturates beyond a certain size. The default is to
449 460 * select 1/4096 of the memory.
450 461 */
451 462 static int dumpbuf_fraction = 12; /* memory size scale factor */
452 463 static size_t dumpbuf_limit = 8 * DUMP_1MB; /* max I/O buf size */
453 464
454 465 static size_t
455 466 dumpbuf_iosize(size_t xfer_size)
456 467 {
457 468 size_t iosize = ptob(physmem >> dumpbuf_fraction);
458 469
459 470 if (iosize < PAGESIZE)
460 471 iosize = PAGESIZE;
461 472 else if (iosize > xfer_size)
462 473 iosize = xfer_size;
463 474 if (iosize > dumpbuf_limit)
464 475 iosize = dumpbuf_limit;
465 476 return (iosize & PAGEMASK);
466 477 }
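/*
 * Worked example (assuming 4K pages): on a 16GB system physmem is 4M
 * pages, so ptob(physmem >> 12) is 4MB, exactly 1/4096 of memory, which
 * passes both the xfer_size cap (for a device that allows it) and the
 * 8MB dumpbuf_limit.
 */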
467 478
468 479 /*
469 480 * resize the I/O buffer
470 481 */
471 482 static void
472 483 dumpbuf_resize(void)
473 484 {
474 485 char *old_buf = dumpbuf.start;
475 486 size_t old_size = dumpbuf.size;
476 487 char *new_buf;
477 488 size_t new_size;
478 489
479 490 ASSERT(MUTEX_HELD(&dump_lock));
480 491
481 492 new_size = dumpbuf_iosize(MAX(dumpbuf.iosize, maxphys));
482 493 if (new_size <= old_size)
483 494 return; /* no need to reallocate buffer */
484 495
485 496 new_buf = kmem_alloc(new_size, KM_SLEEP);
486 497 dumpbuf.size = new_size;
487 498 dumpbuf.start = new_buf;
488 499 dumpbuf.end = new_buf + new_size;
489 500 kmem_free(old_buf, old_size);
490 501 }
491 502
492 503 /*
493 504 * dump_update_clevel is called when dumpadm configures the dump device.
494 - * Calculate number of helpers and buffers.
495 - * Allocate the minimum configuration for now.
505 + * Calculate number of helpers and buffers.
506 + * Allocate the minimum configuration for now.
496 507 *
497 508 * When the dump file is configured we reserve a minimum amount of
498 509 * memory for use at crash time. But we reserve VA for all the memory
499 510 * we really want in order to do the fastest dump possible. The VA is
500 511 * backed by pages not being dumped, according to the bitmap. If
501 512 * there is insufficient spare memory, however, we fall back to the
502 513 * minimum.
503 514 *
504 515 * Live dump (savecore -L) always uses the minimum config.
505 516 *
506 517 * clevel 0 is single threaded lzjb
507 518 * clevel 1 is parallel lzjb
508 519 * clevel 2 is parallel bzip2
509 520 *
510 521 * The ncpu threshold is selected with dump_plat_mincpu.
511 522 * On OPL, set_platform_defaults() overrides the sun4u setting.
512 523 * The actual values are defined via DUMP_PLAT_*_MINCPU macros.
513 524 *
514 525 * Architecture Threshold Algorithm
515 - * sun4u < 51 parallel lzjb
516 - * sun4u >= 51 parallel bzip2(*)
517 - * sun4u OPL < 8 parallel lzjb
518 - * sun4u OPL >= 8 parallel bzip2(*)
519 - * sun4v < 128 parallel lzjb
520 - * sun4v >= 128 parallel bzip2(*)
526 + * sun4u < 51 parallel lzjb
527 + * sun4u >= 51 parallel bzip2(*)
528 + * sun4u OPL < 8 parallel lzjb
529 + * sun4u OPL >= 8 parallel bzip2(*)
530 + * sun4v < 128 parallel lzjb
531 + * sun4v >= 128 parallel bzip2(*)
521 532 * x86 < 11 parallel lzjb
522 533 * x86 >= 11 parallel bzip2(*)
523 - * 32-bit N/A single-threaded lzjb
534 + * 32-bit N/A single-threaded lzjb
524 535 *
525 536 * (*) bzip2 is only chosen if there is sufficient available
526 537 * memory for buffers at dump time. See dumpsys_get_maxmem().
527 538 *
528 539 * Faster dump devices have larger I/O buffers. The threshold value is
529 540 * increased according to the size of the dump I/O buffer, because
530 541 * parallel lzjb performs better with faster disks. For buffers >= 1MB
531 542 * the threshold is 3X; for buffers >= 256K threshold is 2X.
532 543 *
533 544 * For parallel dumps, the number of helpers is ncpu-1. The CPU
534 545 * running panic runs the main task. For single-threaded dumps, the
535 546 * panic CPU does lzjb compression (it is tagged as MAINHELPER.)
536 547 *
537 548 * Need multiple buffers per helper so that they do not block waiting
538 549 * for the main task.
539 550 * parallel single-threaded
540 551 * Number of output buffers: nhelper*2 1
541 552 * Number of mapping buffers: nhelper*4 1
542 553 *
543 554 */
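/*
 * Worked example of the selection below: a 16-CPU x86 system with a
 * fast dump device (dumpbuf.iosize >= 1MB) gets threshold 11 * 3 = 33.
 * nhelper is 15, and since 15 + 1 < 33 the result is parallel lzjb
 * (clevel 1), with 30 output buffers and 60 mapping buffers.
 */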
544 555 static void
545 556 dump_update_clevel()
546 557 {
547 558 int tag;
548 559 size_t bz2size;
549 560 helper_t *hp, *hpend;
550 561 cbuf_t *cp, *cpend;
551 562 dumpcfg_t *old = &dumpcfg;
552 563 dumpcfg_t newcfg = *old;
553 564 dumpcfg_t *new = &newcfg;
554 565
555 566 ASSERT(MUTEX_HELD(&dump_lock));
556 567
557 568 /*
558 569 * Free the previously allocated bufs and VM.
559 570 */
560 571 if (old->helper != NULL) {
561 572
562 573 /* helpers */
563 574 hpend = &old->helper[old->nhelper];
564 575 for (hp = old->helper; hp != hpend; hp++) {
565 576 if (hp->lzbuf != NULL)
566 577 kmem_free(hp->lzbuf, PAGESIZE);
567 578 if (hp->page != NULL)
568 579 kmem_free(hp->page, PAGESIZE);
569 580 }
570 581 kmem_free(old->helper, old->nhelper * sizeof (helper_t));
571 582
572 583 /* VM space for mapping pages */
573 584 cpend = &old->cmap[old->ncmap];
574 585 for (cp = old->cmap; cp != cpend; cp++)
575 586 vmem_xfree(heap_arena, cp->buf, CBUF_MAPSIZE);
576 587 kmem_free(old->cmap, old->ncmap * sizeof (cbuf_t));
577 588
578 589 /* output bufs */
579 590 cpend = &old->cbuf[old->ncbuf];
580 591 for (cp = old->cbuf; cp != cpend; cp++)
581 592 if (cp->buf != NULL)
582 593 kmem_free(cp->buf, cp->size);
583 594 kmem_free(old->cbuf, old->ncbuf * sizeof (cbuf_t));
584 595
585 596 /* reserved VM for dumpsys_get_maxmem */
586 597 if (old->maxvmsize > 0)
587 598 vmem_xfree(heap_arena, old->maxvm, old->maxvmsize);
588 599 }
589 600
590 601 /*
591 602 * Allocate memory and VM.
592 603 * One CPU runs dumpsys, the rest are helpers.
593 604 */
594 605 new->nhelper = ncpus - 1;
595 606 if (new->nhelper < 1)
596 607 new->nhelper = 1;
597 608
598 609 if (new->nhelper > DUMP_MAX_NHELPER)
599 610 new->nhelper = DUMP_MAX_NHELPER;
600 611
601 612 /* use platform default, unless /etc/system overrides */
602 613 if (dump_plat_mincpu == MINCPU_NOT_SET)
603 614 dump_plat_mincpu = dump_plat_mincpu_default;
604 615
605 616 /* increase threshold for faster disks */
606 617 new->threshold = dump_plat_mincpu;
607 618 if (dumpbuf.iosize >= DUMP_1MB)
608 619 new->threshold *= 3;
609 620 else if (dumpbuf.iosize >= (256 * DUMP_1KB))
610 621 new->threshold *= 2;
611 622
612 623 /* figure compression level based upon the computed threshold. */
613 624 if (dump_plat_mincpu == 0 || new->nhelper < 2) {
614 625 new->clevel = 0;
615 626 new->nhelper = 1;
616 627 } else if ((new->nhelper + 1) >= new->threshold) {
617 628 new->clevel = DUMP_CLEVEL_BZIP2;
618 629 } else {
619 630 new->clevel = DUMP_CLEVEL_LZJB;
620 631 }
621 632
622 633 if (new->clevel == 0) {
623 634 new->ncbuf = 1;
624 635 new->ncmap = 1;
625 636 } else {
626 637 new->ncbuf = NCBUF_PER_HELPER * new->nhelper;
627 638 new->ncmap = NCMAP_PER_HELPER * new->nhelper;
628 639 }
629 640
630 641 /*
631 642 * Allocate new data structures and buffers for MINHELPERS,
632 643 * and also figure the max desired size.
633 644 */
634 645 bz2size = BZ2_bzCompressInitSize(dump_bzip2_level);
635 646 new->maxsize = 0;
636 647 new->maxvmsize = 0;
637 648 new->maxvm = NULL;
638 649 tag = 1;
639 650 new->helper = kmem_zalloc(new->nhelper * sizeof (helper_t), KM_SLEEP);
640 651 hpend = &new->helper[new->nhelper];
641 652 for (hp = new->helper; hp != hpend; hp++) {
642 653 hp->tag = tag++;
643 654 if (hp < &new->helper[MINHELPERS]) {
644 655 hp->lzbuf = kmem_alloc(PAGESIZE, KM_SLEEP);
645 656 hp->page = kmem_alloc(PAGESIZE, KM_SLEEP);
646 657 } else if (new->clevel < DUMP_CLEVEL_BZIP2) {
647 658 new->maxsize += 2 * PAGESIZE;
648 659 } else {
649 660 new->maxsize += PAGESIZE;
650 661 }
651 662 if (new->clevel >= DUMP_CLEVEL_BZIP2)
652 663 new->maxsize += bz2size;
653 664 }
654 665
655 666 new->cbuf = kmem_zalloc(new->ncbuf * sizeof (cbuf_t), KM_SLEEP);
656 667 cpend = &new->cbuf[new->ncbuf];
657 668 for (cp = new->cbuf; cp != cpend; cp++) {
658 669 cp->state = CBUF_FREEBUF;
659 670 cp->size = CBUF_SIZE;
660 671 if (cp < &new->cbuf[MINCBUFS])
661 672 cp->buf = kmem_alloc(cp->size, KM_SLEEP);
662 673 else
663 674 new->maxsize += cp->size;
664 675 }
665 676
666 677 new->cmap = kmem_zalloc(new->ncmap * sizeof (cbuf_t), KM_SLEEP);
667 678 cpend = &new->cmap[new->ncmap];
668 679 for (cp = new->cmap; cp != cpend; cp++) {
669 680 cp->state = CBUF_FREEMAP;
670 681 cp->size = CBUF_MAPSIZE;
671 682 cp->buf = vmem_xalloc(heap_arena, CBUF_MAPSIZE, CBUF_MAPSIZE,
672 683 0, 0, NULL, NULL, VM_SLEEP);
673 684 }
674 685
675 686 /* reserve VA to be backed with spare pages at crash time */
676 687 if (new->maxsize > 0) {
677 688 new->maxsize = P2ROUNDUP(new->maxsize, PAGESIZE);
678 689 new->maxvmsize = P2ROUNDUP(new->maxsize, CBUF_MAPSIZE);
679 690 new->maxvm = vmem_xalloc(heap_arena, new->maxvmsize,
680 691 CBUF_MAPSIZE, 0, 0, NULL, NULL, VM_SLEEP);
681 692 }
682 693
683 694 /*
684 695 * Reserve memory for kmem allocation calls made during crash dump. The
685 696 * hat layer allocates memory for each mapping created, and the I/O path
686 697 * allocates buffers and data structs.
687 698 *
688 699 * On larger systems, we easily exceed the lower amount, so we need some
689 700 * more space; the cut-over point is relatively arbitrary. If we run
690 701 * out, the only impact is that kmem state in the dump becomes
691 702 * inconsistent.
692 703 */
693 704
694 705 if (dump_kmem_pages == 0) {
695 706 if (physmem > (16 * ONE_GIG) / PAGESIZE)
696 707 dump_kmem_pages = 20;
697 708 else
698 709 dump_kmem_pages = 8;
699 710 }
700 711
701 712 kmem_dump_init((new->ncmap * dump_kmem_permap) +
702 713 (dump_kmem_pages * PAGESIZE));
703 714
704 715 /* set new config pointers */
705 716 *old = *new;
706 717 }
707 718
708 719 /*
709 720 * Define a struct memlist walker to optimize bitnum to pfn
710 721 * lookup. The walker maintains the state of the list traversal.
711 722 */
712 723 typedef struct dumpmlw {
713 724 struct memlist *mp; /* current memlist */
714 725 pgcnt_t basenum; /* bitnum base offset */
715 726 pgcnt_t mppages; /* current memlist size */
716 727 pgcnt_t mpleft; /* size to end of current memlist */
717 728 pfn_t mpaddr; /* first pfn in memlist */
718 729 } dumpmlw_t;
719 730
720 731 /* initialize the walker */
721 732 static inline void
722 733 dump_init_memlist_walker(dumpmlw_t *pw)
723 734 {
724 735 pw->mp = phys_install;
725 736 pw->basenum = 0;
726 737 pw->mppages = pw->mp->ml_size >> PAGESHIFT;
727 738 pw->mpleft = pw->mppages;
728 739 pw->mpaddr = pw->mp->ml_address >> PAGESHIFT;
729 740 }
730 741
731 742 /*
732 743 * Lookup pfn given bitnum. The memlist can be quite long on some
733 744 * systems (e.g.: one per board). To optimize sequential lookups, the
734 745 * caller initializes and presents a memlist walker.
735 746 */
736 747 static pfn_t
737 748 dump_bitnum_to_pfn(pgcnt_t bitnum, dumpmlw_t *pw)
738 749 {
739 750 bitnum -= pw->basenum;
740 751 while (pw->mp != NULL) {
741 752 if (bitnum < pw->mppages) {
742 753 pw->mpleft = pw->mppages - bitnum;
743 754 return (pw->mpaddr + bitnum);
744 755 }
745 756 bitnum -= pw->mppages;
746 757 pw->basenum += pw->mppages;
747 758 pw->mp = pw->mp->ml_next;
748 759 if (pw->mp != NULL) {
749 760 pw->mppages = pw->mp->ml_size >> PAGESHIFT;
750 761 pw->mpleft = pw->mppages;
751 762 pw->mpaddr = pw->mp->ml_address >> PAGESHIFT;
752 763 }
753 764 }
754 765 return (PFN_INVALID);
755 766 }
756 767
757 768 static pgcnt_t
758 769 dump_pfn_to_bitnum(pfn_t pfn)
759 770 {
760 771 struct memlist *mp;
761 772 pgcnt_t bitnum = 0;
762 773
763 774 for (mp = phys_install; mp != NULL; mp = mp->ml_next) {
764 775 if (pfn >= (mp->ml_address >> PAGESHIFT) &&
765 776 pfn < ((mp->ml_address + mp->ml_size) >> PAGESHIFT))
766 777 return (bitnum + pfn - (mp->ml_address >> PAGESHIFT));
767 778 bitnum += mp->ml_size >> PAGESHIFT;
768 779 }
769 780 return ((pgcnt_t)-1);
770 781 }
771 782
772 783 /*
773 784 * Set/test bitmap for a CBUF_MAPSIZE range which includes pfn. The
774 785 * mapping of pfn to range index is imperfect because pfn and bitnum
775 786 * do not have the same phase. To make sure a CBUF_MAPSIZE range is
776 787 * covered, call this for both ends:
777 788 * dump_set_used(base)
778 789 * dump_set_used(base+CBUF_MAPNP-1)
779 790 *
780 791 * This is used during a panic dump to mark pages allocated by
781 792 * dumpsys_get_maxmem(). The macro IS_DUMP_PAGE(pp) is used by
782 793 * page_get_mnode_freelist() to make sure pages used by dump are never
783 794 * allocated.
784 795 */
785 796 #define CBUF_MAPP2R(pfn) ((pfn) >> (CBUF_MAPSHIFT - PAGESHIFT))
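/*
 * With CBUF_MAPSHIFT 22 and 4K pages (PAGESHIFT 12), CBUF_MAPP2R is a
 * shift by 10: one rbitmap bit stands for 1024 page bits, which is why
 * both ends of a range are marked, as described above, when the pfn and
 * bitnum phases differ.
 */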
786 797
787 798 static void
788 799 dump_set_used(pfn_t pfn)
789 800 {
790 801
791 802 pgcnt_t bitnum, rbitnum;
792 803
793 804 bitnum = dump_pfn_to_bitnum(pfn);
794 805 ASSERT(bitnum != (pgcnt_t)-1);
795 806
796 807 rbitnum = CBUF_MAPP2R(bitnum);
797 808 ASSERT(rbitnum < dumpcfg.rbitmapsize);
798 809
799 810 BT_SET(dumpcfg.rbitmap, rbitnum);
800 811 }
801 812
802 813 int
803 814 dump_test_used(pfn_t pfn)
804 815 {
805 816 pgcnt_t bitnum, rbitnum;
806 817
807 818 bitnum = dump_pfn_to_bitnum(pfn);
808 819 ASSERT(bitnum != (pgcnt_t)-1);
809 820
810 821 rbitnum = CBUF_MAPP2R(bitnum);
811 822 ASSERT(rbitnum < dumpcfg.rbitmapsize);
812 823
813 824 return (BT_TEST(dumpcfg.rbitmap, rbitnum));
814 825 }
815 826
816 827 /*
817 828 * dumpbzalloc and dumpbzfree are callbacks from the bzip2 library.
818 829 * dumpsys_get_maxmem() uses them for BZ2_bzCompressInit().
819 830 */
820 831 static void *
821 832 dumpbzalloc(void *opaque, int items, int size)
822 833 {
823 834 size_t *sz;
824 835 char *ret;
825 836
826 837 ASSERT(opaque != NULL);
827 838 sz = opaque;
828 839 ret = dumpcfg.maxvm + *sz;
829 840 *sz += items * size;
830 841 *sz = P2ROUNDUP(*sz, BZ2_BZALLOC_ALIGN);
831 842 ASSERT(*sz <= dumpcfg.maxvmsize);
832 843 return (ret);
833 844 }
834 845
835 846 /*ARGSUSED*/
836 847 static void
837 848 dumpbzfree(void *opaque, void *addr)
838 849 {
839 850 }
840 851
841 852 /*
842 853 * Perform additional checks on the page to see if we can really use
843 854 * it. The kernel (kas) pages are always set in the bitmap. However,
844 855 * boot memory pages (prom_ppages or P_BOOTPAGES) are not in the
845 856 * bitmap. So we check for them.
846 857 */
847 858 static inline int
848 859 dump_pfn_check(pfn_t pfn)
849 860 {
850 861 page_t *pp = page_numtopp_nolock(pfn);
851 862 if (pp == NULL || pp->p_pagenum != pfn ||
852 863 #if defined(__sparc)
853 864 pp->p_vnode == &promvp ||
854 865 #else
855 866 PP_ISBOOTPAGES(pp) ||
856 867 #endif
857 868 pp->p_toxic != 0)
858 869 return (0);
859 870 return (1);
860 871 }
861 872
862 873 /*
863 874 * Check a range to see if all contained pages are available and
864 875 * return non-zero if the range can be used.
865 876 */
866 877 static inline int
867 878 dump_range_check(pgcnt_t start, pgcnt_t end, pfn_t pfn)
868 879 {
869 880 for (; start < end; start++, pfn++) {
870 881 if (BT_TEST(dumpcfg.bitmap, start))
871 882 return (0);
872 883 if (!dump_pfn_check(pfn))
873 884 return (0);
874 885 }
875 886 return (1);
876 887 }
877 888
878 889 /*
879 890 * dumpsys_get_maxmem() is called during panic. Find unused ranges
880 891 * and use them for buffers. If we find enough memory, switch to
881 892 * parallel bzip2, otherwise use parallel lzjb.
882 893 *
883 894 * It searches the dump bitmap in 2 passes. The first time it looks
884 895 * for CBUF_MAPSIZE ranges. On the second pass it uses small pages.
885 896 */
886 897 static void
887 898 dumpsys_get_maxmem()
888 899 {
889 900 dumpcfg_t *cfg = &dumpcfg;
890 901 cbuf_t *endcp = &cfg->cbuf[cfg->ncbuf];
891 902 helper_t *endhp = &cfg->helper[cfg->nhelper];
892 903 pgcnt_t bitnum, end;
893 904 size_t sz, endsz, bz2size;
894 905 pfn_t pfn, off;
895 906 cbuf_t *cp;
896 907 helper_t *hp, *ohp;
897 908 dumpmlw_t mlw;
898 909 int k;
899 910
900 911 /*
901 912 * Setting dump_plat_mincpu to 0 at any time forces a serial
902 913 * dump.
903 914 */
904 915 if (dump_plat_mincpu == 0) {
905 916 cfg->clevel = 0;
906 917 return;
907 918 }
908 919
909 920 /*
910 921 * There may be no point in looking for spare memory. If
911 922 * dumping all memory, then none is spare. If doing a serial
912 923 * dump, we already have buffers.
913 924 */
914 925 if (cfg->maxsize == 0 || cfg->clevel < DUMP_CLEVEL_LZJB ||
915 926 (dump_conflags & DUMP_ALL) != 0) {
916 927 if (cfg->clevel > DUMP_CLEVEL_LZJB)
917 928 cfg->clevel = DUMP_CLEVEL_LZJB;
918 929 return;
919 930 }
920 931
921 932 sz = 0;
922 933 cfg->found4m = 0;
923 934 cfg->foundsm = 0;
924 935
925 936 /* bitmap of ranges used to estimate which pfns are being used */
926 937 bzero(dumpcfg.rbitmap, BT_SIZEOFMAP(dumpcfg.rbitmapsize));
927 938
928 939 /* find ranges that are not being dumped to use for buffers */
929 940 dump_init_memlist_walker(&mlw);
930 941 for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum = end) {
931 942 dump_timeleft = dump_timeout;
932 943 end = bitnum + CBUF_MAPNP;
933 944 pfn = dump_bitnum_to_pfn(bitnum, &mlw);
934 945 ASSERT(pfn != PFN_INVALID);
935 946
936 947 /* skip partial range at end of mem segment */
937 948 if (mlw.mpleft < CBUF_MAPNP) {
938 949 end = bitnum + mlw.mpleft;
939 950 continue;
940 951 }
941 952
942 953 /* skip non-aligned pages */
943 954 off = P2PHASE(pfn, CBUF_MAPNP);
944 955 if (off != 0) {
945 956 end -= off;
946 957 continue;
947 958 }
948 959
949 960 if (!dump_range_check(bitnum, end, pfn))
950 961 continue;
951 962
952 963 ASSERT((sz + CBUF_MAPSIZE) <= cfg->maxvmsize);
953 964 hat_devload(kas.a_hat, cfg->maxvm + sz, CBUF_MAPSIZE, pfn,
954 965 PROT_READ | PROT_WRITE, HAT_LOAD_NOCONSIST);
955 966 sz += CBUF_MAPSIZE;
956 967 cfg->found4m++;
957 968
958 969 /* set the bitmap for both ends to be sure to cover the range */
959 970 dump_set_used(pfn);
960 971 dump_set_used(pfn + CBUF_MAPNP - 1);
961 972
962 973 if (sz >= cfg->maxsize)
963 974 goto foundmax;
964 975 }
965 976
966 977 /* Add small pages if we can't find enough large pages. */
967 978 dump_init_memlist_walker(&mlw);
968 979 for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum = end) {
969 980 dump_timeleft = dump_timeout;
970 981 end = bitnum + CBUF_MAPNP;
971 982 pfn = dump_bitnum_to_pfn(bitnum, &mlw);
972 983 ASSERT(pfn != PFN_INVALID);
973 984
974 985 /* Find any non-aligned pages at start and end of segment. */
975 986 off = P2PHASE(pfn, CBUF_MAPNP);
976 987 if (mlw.mpleft < CBUF_MAPNP) {
977 988 end = bitnum + mlw.mpleft;
978 989 } else if (off != 0) {
979 990 end -= off;
980 991 } else if (cfg->found4m && dump_test_used(pfn)) {
981 992 continue;
982 993 }
983 994
984 995 for (; bitnum < end; bitnum++, pfn++) {
985 996 dump_timeleft = dump_timeout;
986 997 if (BT_TEST(dumpcfg.bitmap, bitnum))
987 998 continue;
988 999 if (!dump_pfn_check(pfn))
989 1000 continue;
990 1001 ASSERT((sz + PAGESIZE) <= cfg->maxvmsize);
991 1002 hat_devload(kas.a_hat, cfg->maxvm + sz, PAGESIZE, pfn,
992 1003 PROT_READ | PROT_WRITE, HAT_LOAD_NOCONSIST);
993 1004 sz += PAGESIZE;
994 1005 cfg->foundsm++;
995 1006 dump_set_used(pfn);
996 1007 if (sz >= cfg->maxsize)
997 1008 goto foundmax;
998 1009 }
999 1010 }
1000 1011
1001 1012 /* Fall back to lzjb if we did not get enough memory for bzip2. */
1002 1013 endsz = (cfg->maxsize * cfg->threshold) / cfg->nhelper;
1003 1014 if (sz < endsz) {
1004 1015 cfg->clevel = DUMP_CLEVEL_LZJB;
1005 1016 }
1006 1017
1007 1018 /* Allocate memory for as many helpers as we can. */
1008 1019 foundmax:
1009 1020
1010 1021 /* Byte offsets into memory found and mapped above */
1011 1022 endsz = sz;
1012 1023 sz = 0;
1013 1024
1014 1025 /* Set the size for bzip2 state. Only bzip2 needs it. */
1015 1026 bz2size = BZ2_bzCompressInitSize(dump_bzip2_level);
1016 1027
1017 1028 /* Skip the preallocated output buffers. */
1018 1029 cp = &cfg->cbuf[MINCBUFS];
1019 1030
1020 1031 /* Use this to move memory up from the preallocated helpers. */
1021 1032 ohp = cfg->helper;
1022 1033
1023 1034 /* Loop over all helpers and allocate memory. */
1024 1035 for (hp = cfg->helper; hp < endhp; hp++) {
1025 1036
1026 1037 /* Skip preallocated helpers by checking hp->page. */
1027 1038 if (hp->page == NULL) {
1028 1039 if (cfg->clevel <= DUMP_CLEVEL_LZJB) {
1029 1040 /* lzjb needs 2 1-page buffers */
1030 1041 if ((sz + (2 * PAGESIZE)) > endsz)
1031 1042 break;
1032 1043 hp->page = cfg->maxvm + sz;
1033 1044 sz += PAGESIZE;
1034 1045 hp->lzbuf = cfg->maxvm + sz;
1035 1046 sz += PAGESIZE;
1036 1047
1037 1048 } else if (ohp->lzbuf != NULL) {
1038 1049 /* re-use the preallocated lzjb page for bzip2 */
1039 1050 hp->page = ohp->lzbuf;
1040 1051 ohp->lzbuf = NULL;
1041 1052 ++ohp;
1042 1053
1043 1054 } else {
1044 1055 /* bzip2 needs a 1-page buffer */
1045 1056 if ((sz + PAGESIZE) > endsz)
1046 1057 break;
1047 1058 hp->page = cfg->maxvm + sz;
1048 1059 sz += PAGESIZE;
1049 1060 }
1050 1061 }
1051 1062
1052 1063 /*
1053 1064 * Add output buffers per helper. The number of
1054 1065 * buffers per helper is determined by the ratio of
1055 1066 * ncbuf to nhelper.
1056 1067 */
1057 1068 for (k = 0; cp < endcp && (sz + CBUF_SIZE) <= endsz &&
1058 1069 k < NCBUF_PER_HELPER; k++) {
1059 1070 cp->state = CBUF_FREEBUF;
1060 1071 cp->size = CBUF_SIZE;
1061 1072 cp->buf = cfg->maxvm + sz;
1062 1073 sz += CBUF_SIZE;
1063 1074 ++cp;
1064 1075 }
1065 1076
1066 1077 /*
1067 1078 * bzip2 needs compression state. Use the dumpbzalloc
1068 1079 * and dumpbzfree callbacks to allocate the memory.
1069 1080 * bzip2 does allocation only at init time.
1070 1081 */
1071 1082 if (cfg->clevel >= DUMP_CLEVEL_BZIP2) {
1072 1083 if ((sz + bz2size) > endsz) {
1073 1084 hp->page = NULL;
1074 1085 break;
1075 1086 } else {
1076 1087 hp->bzstream.opaque = &sz;
1077 1088 hp->bzstream.bzalloc = dumpbzalloc;
1078 1089 hp->bzstream.bzfree = dumpbzfree;
1079 1090 (void) BZ2_bzCompressInit(&hp->bzstream,
1080 1091 dump_bzip2_level, 0, 0);
1081 1092 hp->bzstream.opaque = NULL;
1082 1093 }
1083 1094 }
1084 1095 }
1085 1096
1086 1097 /* Finish allocating output buffers */
1087 1098 for (; cp < endcp && (sz + CBUF_SIZE) <= endsz; cp++) {
1088 1099 cp->state = CBUF_FREEBUF;
1089 1100 cp->size = CBUF_SIZE;
1090 1101 cp->buf = cfg->maxvm + sz;
1091 1102 sz += CBUF_SIZE;
1092 1103 }
1093 1104
1094 1105 /* Enable IS_DUMP_PAGE macro, which checks for pages we took. */
1095 1106 if (cfg->found4m || cfg->foundsm)
1096 1107 dump_check_used = 1;
1097 1108
1098 1109 ASSERT(sz <= endsz);
1099 1110 }
1100 1111
1101 1112 static void
1102 1113 dumphdr_init(void)
1103 1114 {
1104 1115 pgcnt_t npages = 0;
1105 1116
1106 1117 ASSERT(MUTEX_HELD(&dump_lock));
1107 1118
1108 1119 if (dumphdr == NULL) {
1109 1120 dumphdr = kmem_zalloc(sizeof (dumphdr_t), KM_SLEEP);
1110 1121 dumphdr->dump_magic = DUMP_MAGIC;
1111 1122 dumphdr->dump_version = DUMP_VERSION;
1112 1123 dumphdr->dump_wordsize = DUMP_WORDSIZE;
1113 1124 dumphdr->dump_pageshift = PAGESHIFT;
1114 1125 dumphdr->dump_pagesize = PAGESIZE;
1115 1126 dumphdr->dump_utsname = utsname;
1116 1127 (void) strcpy(dumphdr->dump_platform, platform);
1117 1128 dumpbuf.size = dumpbuf_iosize(maxphys);
1118 1129 dumpbuf.start = kmem_alloc(dumpbuf.size, KM_SLEEP);
1119 1130 dumpbuf.end = dumpbuf.start + dumpbuf.size;
1120 1131 dumpcfg.pids = kmem_alloc(v.v_proc * sizeof (pid_t), KM_SLEEP);
1121 1132 dumpcfg.helpermap = kmem_zalloc(BT_SIZEOFMAP(NCPU), KM_SLEEP);
1122 1133 LOCK_INIT_HELD(&dumpcfg.helper_lock);
1123 1134 dump_stack_scratch = kmem_alloc(STACK_BUF_SIZE, KM_SLEEP);
1124 1135 (void) strncpy(dumphdr->dump_uuid, dump_get_uuid(),
1125 1136 sizeof (dumphdr->dump_uuid));
1126 1137 }
1127 1138
1128 1139 npages = num_phys_pages();
1129 1140
1130 1141 if (dumpcfg.bitmapsize != npages) {
1131 1142 size_t rlen = CBUF_MAPP2R(P2ROUNDUP(npages, CBUF_MAPNP));
1132 1143 void *map = kmem_alloc(BT_SIZEOFMAP(npages), KM_SLEEP);
1133 1144 void *rmap = kmem_alloc(BT_SIZEOFMAP(rlen), KM_SLEEP);
1134 1145
1135 1146 if (dumpcfg.bitmap != NULL)
1136 1147 kmem_free(dumpcfg.bitmap, BT_SIZEOFMAP(dumpcfg.
1137 1148 bitmapsize));
1138 1149 if (dumpcfg.rbitmap != NULL)
1139 1150 kmem_free(dumpcfg.rbitmap, BT_SIZEOFMAP(dumpcfg.
1140 1151 rbitmapsize));
1141 1152 dumpcfg.bitmap = map;
1142 1153 dumpcfg.bitmapsize = npages;
1143 1154 dumpcfg.rbitmap = rmap;
1144 1155 dumpcfg.rbitmapsize = rlen;
1145 1156 }
1146 1157 }
1147 1158
1148 1159 /*
1149 1160 * Establish a new dump device.
1150 1161 */
1151 1162 int
1152 1163 dumpinit(vnode_t *vp, char *name, int justchecking)
1153 1164 {
1154 1165 vnode_t *cvp;
1155 1166 vattr_t vattr;
1156 1167 vnode_t *cdev_vp;
1157 1168 int error = 0;
1158 1169
1159 1170 ASSERT(MUTEX_HELD(&dump_lock));
1160 1171
1161 1172 dumphdr_init();
1162 1173
1163 1174 cvp = common_specvp(vp);
1164 1175 if (cvp == dumpvp)
1165 1176 return (0);
1166 1177
1167 1178 /*
1168 1179 * Determine whether this is a plausible dump device. We want either:
1169 1180 * (1) a real device that's not mounted and has a cb_dump routine, or
1170 1181 * (2) a swapfile on some filesystem that has a vop_dump routine.
1171 1182 */
1172 1183 if ((error = VOP_OPEN(&cvp, FREAD | FWRITE, kcred, NULL)) != 0)
1173 1184 return (error);
1174 1185
1175 1186 vattr.va_mask = AT_SIZE | AT_TYPE | AT_RDEV;
1176 1187 if ((error = VOP_GETATTR(cvp, &vattr, 0, kcred, NULL)) == 0) {
1177 1188 if (vattr.va_type == VBLK || vattr.va_type == VCHR) {
1178 1189 if (devopsp[getmajor(vattr.va_rdev)]->
1179 1190 devo_cb_ops->cb_dump == nodev)
1180 1191 error = ENOTSUP;
1181 1192 else if (vfs_devismounted(vattr.va_rdev))
1182 1193 error = EBUSY;
1183 1194 if (strcmp(ddi_driver_name(VTOS(cvp)->s_dip),
1184 1195 ZFS_DRIVER) == 0 &&
1185 1196 IS_SWAPVP(common_specvp(cvp)))
1186 1197 error = EBUSY;
1187 1198 } else {
1188 1199 if (vn_matchopval(cvp, VOPNAME_DUMP, fs_nosys) ||
1189 1200 !IS_SWAPVP(cvp))
1190 1201 error = ENOTSUP;
1191 1202 }
1192 1203 }
1193 1204
1194 1205 if (error == 0 && vattr.va_size < 2 * DUMP_LOGSIZE + DUMP_ERPTSIZE)
1195 1206 error = ENOSPC;
1196 1207
1197 1208 if (error || justchecking) {
1198 1209 (void) VOP_CLOSE(cvp, FREAD | FWRITE, 1, (offset_t)0,
1199 1210 kcred, NULL);
1200 1211 return (error);
1201 1212 }
1202 1213
1203 1214 VN_HOLD(cvp);
1204 1215
1205 1216 if (dumpvp != NULL)
1206 1217 dumpfini(); /* unconfigure the old dump device */
1207 1218
1208 1219 dumpvp = cvp;
1209 1220 dumpvp_size = vattr.va_size & -DUMP_OFFSET;
1210 1221 dumppath = kmem_alloc(strlen(name) + 1, KM_SLEEP);
1211 1222 (void) strcpy(dumppath, name);
1212 1223 dumpbuf.iosize = 0;
1213 1224
1214 1225 /*
1215 1226 * If the dump device is a block device, attempt to open up the
1216 1227 * corresponding character device and determine its maximum transfer
1217 1228 * size. We use this information to potentially resize dumpbuf to a
1218 1229 * larger and more optimal size for performing i/o to the dump device.
1219 1230 */
1220 1231 if (cvp->v_type == VBLK &&
1221 1232 (cdev_vp = makespecvp(VTOS(cvp)->s_dev, VCHR)) != NULL) {
1222 1233 if (VOP_OPEN(&cdev_vp, FREAD | FWRITE, kcred, NULL) == 0) {
1223 1234 size_t blk_size;
1224 1235 struct dk_cinfo dki;
1225 1236 struct dk_minfo minf;
1226 1237
1227 1238 if (VOP_IOCTL(cdev_vp, DKIOCGMEDIAINFO,
1228 1239 (intptr_t)&minf, FKIOCTL, kcred, NULL, NULL)
1229 1240 == 0 && minf.dki_lbsize != 0)
1230 1241 blk_size = minf.dki_lbsize;
1231 1242 else
1232 1243 blk_size = DEV_BSIZE;
1233 1244
1234 1245 if (VOP_IOCTL(cdev_vp, DKIOCINFO, (intptr_t)&dki,
1235 1246 FKIOCTL, kcred, NULL, NULL) == 0) {
1236 1247 dumpbuf.iosize = dki.dki_maxtransfer * blk_size;
1237 1248 dumpbuf_resize();
1238 1249 }
1239 1250 /*
1240 1251 * If we are working with a zvol then dumpify it
1241 1252 * if it's not being used as swap.
1242 1253 */
1243 1254 if (strcmp(dki.dki_dname, ZVOL_DRIVER) == 0) {
1244 1255 if (IS_SWAPVP(common_specvp(cvp)))
1245 1256 error = EBUSY;
1246 1257 else if ((error = VOP_IOCTL(cdev_vp,
1247 1258 DKIOCDUMPINIT, NULL, FKIOCTL, kcred,
1248 1259 NULL, NULL)) != 0)
1249 1260 dumpfini();
1250 1261 }
1251 1262
1252 1263 (void) VOP_CLOSE(cdev_vp, FREAD | FWRITE, 1, 0,
1253 1264 kcred, NULL);
1254 1265 }
1255 1266
1256 1267 VN_RELE(cdev_vp);
1257 1268 }
1258 1269
1259 1270 cmn_err(CE_CONT, "?dump on %s size %llu MB\n", name, dumpvp_size >> 20);
1260 1271
1261 1272 dump_update_clevel();
1262 1273
1263 1274 return (error);
1264 1275 }
1265 1276
1266 1277 void
1267 1278 dumpfini(void)
1268 1279 {
1269 1280 vattr_t vattr;
1270 1281 boolean_t is_zfs = B_FALSE;
1271 1282 vnode_t *cdev_vp;
1272 1283 ASSERT(MUTEX_HELD(&dump_lock));
1273 1284
1274 1285 kmem_free(dumppath, strlen(dumppath) + 1);
1275 1286
1276 1287 /*
1277 1288 * Determine if we are using zvols for our dump device
1278 1289 */
1279 1290 vattr.va_mask = AT_RDEV;
1280 1291 if (VOP_GETATTR(dumpvp, &vattr, 0, kcred, NULL) == 0) {
1281 1292 is_zfs = (getmajor(vattr.va_rdev) ==
1282 1293 ddi_name_to_major(ZFS_DRIVER)) ? B_TRUE : B_FALSE;
1283 1294 }
1284 1295
1285 1296 /*
1286 1297 * If we have a zvol dump device then we call into zfs so
1287 1298 * that it may have a chance to cleanup.
1288 1299 */
1289 1300 if (is_zfs &&
1290 1301 (cdev_vp = makespecvp(VTOS(dumpvp)->s_dev, VCHR)) != NULL) {
1291 1302 if (VOP_OPEN(&cdev_vp, FREAD | FWRITE, kcred, NULL) == 0) {
1292 1303 (void) VOP_IOCTL(cdev_vp, DKIOCDUMPFINI, NULL, FKIOCTL,
1293 1304 kcred, NULL, NULL);
1294 1305 (void) VOP_CLOSE(cdev_vp, FREAD | FWRITE, 1, 0,
1295 1306 kcred, NULL);
1296 1307 }
1297 1308 VN_RELE(cdev_vp);
1298 1309 }
1299 1310
1300 1311 (void) VOP_CLOSE(dumpvp, FREAD | FWRITE, 1, (offset_t)0, kcred, NULL);
1301 1312
1302 1313 VN_RELE(dumpvp);
1303 1314
1304 1315 dumpvp = NULL;
1305 1316 dumpvp_size = 0;
1306 1317 dumppath = NULL;
1307 1318 }
1308 1319
1309 1320 static offset_t
1310 1321 dumpvp_flush(void)
1311 1322 {
1312 1323 size_t size = P2ROUNDUP(dumpbuf.cur - dumpbuf.start, PAGESIZE);
1313 1324 hrtime_t iotime;
1314 1325 int err;
1315 1326
1316 1327 if (dumpbuf.vp_off + size > dumpbuf.vp_limit) {
1317 1328 dump_ioerr = ENOSPC;
1318 1329 dumpbuf.vp_off = dumpbuf.vp_limit;
1319 1330 } else if (size != 0) {
1320 1331 iotime = gethrtime();
1321 1332 dumpsync.iowait += iotime - dumpsync.iowaitts;
1322 1333 if (panicstr)
1323 1334 err = VOP_DUMP(dumpvp, dumpbuf.start,
1324 1335 lbtodb(dumpbuf.vp_off), btod(size), NULL);
1325 1336 else
1326 1337 err = vn_rdwr(UIO_WRITE, dumpbuf.cdev_vp != NULL ?
1327 1338 dumpbuf.cdev_vp : dumpvp, dumpbuf.start, size,
1328 1339 dumpbuf.vp_off, UIO_SYSSPACE, 0, dumpbuf.vp_limit,
1329 1340 kcred, 0);
1330 1341 if (err && dump_ioerr == 0)
1331 1342 dump_ioerr = err;
1332 1343 dumpsync.iowaitts = gethrtime();
1333 1344 dumpsync.iotime += dumpsync.iowaitts - iotime;
1334 1345 dumpsync.nwrite += size;
1335 1346 dumpbuf.vp_off += size;
1336 1347 }
1337 1348 dumpbuf.cur = dumpbuf.start;
1338 1349 dump_timeleft = dump_timeout;
1339 1350 return (dumpbuf.vp_off);
1340 1351 }
1341 1352
1342 1353 /* maximize write speed by keeping seek offset aligned with size */
1343 1354 void
1344 1355 dumpvp_write(const void *va, size_t size)
1345 1356 {
1346 1357 size_t len, off, sz;
1347 1358
1348 1359 while (size != 0) {
1349 1360 len = MIN(size, dumpbuf.end - dumpbuf.cur);
1350 1361 if (len == 0) {
1351 1362 off = P2PHASE(dumpbuf.vp_off, dumpbuf.size);
1352 1363 if (off == 0 || !ISP2(dumpbuf.size)) {
1353 1364 (void) dumpvp_flush();
1354 1365 } else {
1355 1366 sz = dumpbuf.size - off;
1356 1367 dumpbuf.cur = dumpbuf.start + sz;
1357 1368 (void) dumpvp_flush();
1358 1369 ovbcopy(dumpbuf.start + sz, dumpbuf.start, off);
1359 1370 dumpbuf.cur += off;
1360 1371 }
1361 1372 } else {
1362 1373 bcopy(va, dumpbuf.cur, len);
1363 1374 va = (char *)va + len;
1364 1375 dumpbuf.cur += len;
1365 1376 size -= len;
1366 1377 }
1367 1378 }
1368 1379 }
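/*
 * Illustration of the realignment above (hypothetical numbers): with a
 * 4MB dumpbuf and vp_off sitting 1MB past a 4MB boundary (off = 1MB), a
 * full buffer is flushed as sz = 3MB, the remaining 1MB is slid back to
 * the start of the buffer, and every subsequent flush begins on a
 * 4MB-aligned device offset.
 */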
1369 1380
1370 1381 /*ARGSUSED*/
1371 1382 static void
1372 1383 dumpvp_ksyms_write(const void *src, void *dst, size_t size)
1373 1384 {
1374 1385 dumpvp_write(src, size);
1375 1386 }
1376 1387
1377 1388 /*
1378 1389 * Mark 'pfn' in the bitmap and dump its translation table entry.
1379 1390 */
1380 1391 void
1381 1392 dump_addpage(struct as *as, void *va, pfn_t pfn)
1382 1393 {
1383 1394 mem_vtop_t mem_vtop;
1384 1395 pgcnt_t bitnum;
1385 1396
1386 1397 if ((bitnum = dump_pfn_to_bitnum(pfn)) != (pgcnt_t)-1) {
1387 1398 if (!BT_TEST(dumpcfg.bitmap, bitnum)) {
1388 1399 dumphdr->dump_npages++;
1389 1400 BT_SET(dumpcfg.bitmap, bitnum);
1390 1401 }
1391 1402 dumphdr->dump_nvtop++;
1392 1403 mem_vtop.m_as = as;
1393 1404 mem_vtop.m_va = va;
1394 1405 mem_vtop.m_pfn = pfn;
1395 1406 dumpvp_write(&mem_vtop, sizeof (mem_vtop_t));
1396 1407 }
1397 1408 dump_timeleft = dump_timeout;
1398 1409 }
1399 1410
1400 1411 /*
1401 1412 * Mark 'pfn' in the bitmap
1402 1413 */
1403 1414 void
1404 1415 dump_page(pfn_t pfn)
1405 1416 {
1406 1417 pgcnt_t bitnum;
1407 1418
1408 1419 if ((bitnum = dump_pfn_to_bitnum(pfn)) != (pgcnt_t)-1) {
1409 1420 if (!BT_TEST(dumpcfg.bitmap, bitnum)) {
1410 1421 dumphdr->dump_npages++;
1411 1422 BT_SET(dumpcfg.bitmap, bitnum);
1412 1423 }
1413 1424 }
1414 1425 dump_timeleft = dump_timeout;
1415 1426 }
1416 1427
1417 1428 /*
1418 1429 * Dump the <as, va, pfn> information for a given address space.
1419 1430 * SEGOP_DUMP() will call dump_addpage() for each page in the segment.
1420 1431 */
1421 1432 static void
1422 1433 dump_as(struct as *as)
1423 1434 {
1424 1435 struct seg *seg;
1425 1436
1426 1437 AS_LOCK_ENTER(as, RW_READER);
1427 1438 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
1428 1439 if (seg->s_as != as)
1429 1440 break;
1430 1441 if (seg->s_ops == NULL)
1431 1442 continue;
1432 1443 SEGOP_DUMP(seg);
1433 1444 }
1434 1445 AS_LOCK_EXIT(as);
1435 1446
1436 1447 if (seg != NULL)
1437 1448 cmn_err(CE_WARN, "invalid segment %p in address space %p",
1438 1449 (void *)seg, (void *)as);
1439 1450 }
1440 1451
1441 1452 static int
1442 1453 dump_process(pid_t pid)
1443 1454 {
1444 1455 proc_t *p = sprlock(pid);
1445 1456
1446 1457 if (p == NULL)
1447 1458 return (-1);
1448 1459 if (p->p_as != &kas) {
1449 1460 mutex_exit(&p->p_lock);
1450 1461 dump_as(p->p_as);
1451 1462 mutex_enter(&p->p_lock);
1452 1463 }
1453 1464
1454 1465 sprunlock(p);
1455 1466
1456 1467 return (0);
1457 1468 }
1458 1469
1459 1470 /*
1460 1471 * The following functions (dump_summary(), dump_ereports(), and
1461 1472 * dump_messages()) write data to an uncompressed area within the
1462 1473 * crashdump. The layout of this area is
1463 1474 *
1464 1475 * +------------------------------------------------------------+
1465 1476 * | compressed pages | summary | ereports | messages |
1466 1477 * +------------------------------------------------------------+
1467 1478 *
1468 1479 * With the advent of saving a compressed crash dump by default, we
1469 1480 * need to save a little more data to describe the failure mode in
1470 1481 * an uncompressed buffer available before savecore uncompresses
1471 1482 * the dump. Initially this is a copy of the stack trace. Additional
1472 1483 * summary information should be added here.
1473 1484 */
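/*
 * Offsets used by the three writers below, measured from the end of the
 * dump device: messages end DUMP_OFFSET bytes from the end, ereports
 * end DUMP_LOGSIZE bytes before that, and the summary ends another
 * DUMP_ERPTSIZE earlier, matching the layout pictured above.
 */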
1474 1485
1475 1486 void
1476 1487 dump_summary(void)
1477 1488 {
1478 1489 u_offset_t dumpvp_start;
1479 1490 summary_dump_t sd;
1480 1491
1481 1492 if (dumpvp == NULL || dumphdr == NULL)
1482 1493 return;
1483 1494
1484 1495 dumpbuf.cur = dumpbuf.start;
1485 1496
1486 1497 dumpbuf.vp_limit = dumpvp_size - (DUMP_OFFSET + DUMP_LOGSIZE +
1487 1498 DUMP_ERPTSIZE);
1488 1499 dumpvp_start = dumpbuf.vp_limit - DUMP_SUMMARYSIZE;
1489 1500 dumpbuf.vp_off = dumpvp_start;
1490 1501
1491 1502 sd.sd_magic = SUMMARY_MAGIC;
1492 1503 sd.sd_ssum = checksum32(dump_stack_scratch, STACK_BUF_SIZE);
1493 1504 dumpvp_write(&sd, sizeof (sd));
1494 1505 dumpvp_write(dump_stack_scratch, STACK_BUF_SIZE);
1495 1506
1496 1507 sd.sd_magic = 0; /* indicate end of summary */
1497 1508 dumpvp_write(&sd, sizeof (sd));
1498 1509 (void) dumpvp_flush();
1499 1510 }
1500 1511
1501 1512 void
1502 1513 dump_ereports(void)
1503 1514 {
1504 1515 u_offset_t dumpvp_start;
1505 1516 erpt_dump_t ed;
1506 1517
1507 1518 if (dumpvp == NULL || dumphdr == NULL)
1508 1519 return;
1509 1520
1510 1521 dumpbuf.cur = dumpbuf.start;
1511 1522 dumpbuf.vp_limit = dumpvp_size - (DUMP_OFFSET + DUMP_LOGSIZE);
1512 1523 dumpvp_start = dumpbuf.vp_limit - DUMP_ERPTSIZE;
1513 1524 dumpbuf.vp_off = dumpvp_start;
1514 1525
1515 1526 fm_ereport_dump();
1516 1527 if (panicstr)
1517 1528 errorq_dump();
1518 1529
1519 1530 bzero(&ed, sizeof (ed)); /* indicate end of ereports */
1520 1531 dumpvp_write(&ed, sizeof (ed));
1521 1532 (void) dumpvp_flush();
1522 1533
1523 1534 if (!panicstr) {
1524 1535 (void) VOP_PUTPAGE(dumpvp, dumpvp_start,
1525 1536 (size_t)(dumpbuf.vp_off - dumpvp_start),
1526 1537 B_INVAL | B_FORCE, kcred, NULL);
1527 1538 }
1528 1539 }
1529 1540
1530 1541 void
1531 1542 dump_messages(void)
1532 1543 {
1533 1544 log_dump_t ld;
1534 1545 mblk_t *mctl, *mdata;
1535 1546 queue_t *q, *qlast;
1536 1547 u_offset_t dumpvp_start;
1537 1548
1538 1549 if (dumpvp == NULL || dumphdr == NULL || log_consq == NULL)
1539 1550 return;
1540 1551
1541 1552 dumpbuf.cur = dumpbuf.start;
1542 1553 dumpbuf.vp_limit = dumpvp_size - DUMP_OFFSET;
1543 1554 dumpvp_start = dumpbuf.vp_limit - DUMP_LOGSIZE;
1544 1555 dumpbuf.vp_off = dumpvp_start;
1545 1556
1546 1557 qlast = NULL;
1547 1558 do {
1548 1559 for (q = log_consq; q->q_next != qlast; q = q->q_next)
1549 1560 continue;
1550 1561 for (mctl = q->q_first; mctl != NULL; mctl = mctl->b_next) {
1551 1562 dump_timeleft = dump_timeout;
1552 1563 mdata = mctl->b_cont;
1553 1564 ld.ld_magic = LOG_MAGIC;
1554 1565 ld.ld_msgsize = MBLKL(mctl->b_cont);
1555 1566 ld.ld_csum = checksum32(mctl->b_rptr, MBLKL(mctl));
1556 1567 ld.ld_msum = checksum32(mdata->b_rptr, MBLKL(mdata));
1557 1568 dumpvp_write(&ld, sizeof (ld));
1558 1569 dumpvp_write(mctl->b_rptr, MBLKL(mctl));
1559 1570 dumpvp_write(mdata->b_rptr, MBLKL(mdata));
1560 1571 }
1561 1572 } while ((qlast = q) != log_consq);
1562 1573
1563 1574 ld.ld_magic = 0; /* indicate end of messages */
1564 1575 dumpvp_write(&ld, sizeof (ld));
1565 1576 (void) dumpvp_flush();
1566 1577 if (!panicstr) {
1567 1578 (void) VOP_PUTPAGE(dumpvp, dumpvp_start,
1568 1579 (size_t)(dumpbuf.vp_off - dumpvp_start),
1569 1580 B_INVAL | B_FORCE, kcred, NULL);
1570 1581 }
1571 1582 }
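
Editor's note: the three functions above carve fixed-size regions out of the tail of the dump device, working backwards from dumpvp_size exactly as the layout comment describes. A minimal userland sketch of that offset arithmetic follows; the region sizes here are placeholders, since the real constants come from <sys/dumphdr.h>.

	#include <stdio.h>
	#include <stdint.h>

	typedef uint64_t u_offset_t;

	/* Placeholder sizes; the real constants live in <sys/dumphdr.h>. */
	#define	DUMP_OFFSET		(1ULL << 16)
	#define	DUMP_LOGSIZE		(4ULL << 20)
	#define	DUMP_ERPTSIZE		(1ULL << 20)
	#define	DUMP_SUMMARYSIZE	(1ULL << 12)

	int
	main(void)
	{
		u_offset_t dumpvp_size = 2ULL << 30;	/* example: 2 GiB device */

		/* messages occupy [limit - DUMP_LOGSIZE, limit) */
		u_offset_t msg_limit = dumpvp_size - DUMP_OFFSET;
		u_offset_t msg_start = msg_limit - DUMP_LOGSIZE;

		/* ereports sit immediately below the messages */
		u_offset_t erpt_limit = msg_start;
		u_offset_t erpt_start = erpt_limit - DUMP_ERPTSIZE;

		/* the summary sits immediately below the ereports */
		u_offset_t sum_limit = erpt_start;
		u_offset_t sum_start = sum_limit - DUMP_SUMMARYSIZE;

		(void) printf("summary  [%llu, %llu)\n",
		    (unsigned long long)sum_start, (unsigned long long)sum_limit);
		(void) printf("ereports [%llu, %llu)\n",
		    (unsigned long long)erpt_start, (unsigned long long)erpt_limit);
		(void) printf("messages [%llu, %llu)\n",
		    (unsigned long long)msg_start, (unsigned long long)msg_limit);
		return (0);
	}
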
1572 1583
1573 1584 /*
1574 1585 * The following functions are called on multiple CPUs during dump.
1575 1586 * They must not use most kernel services, because all cross-calls are
1576 1587 * disabled during panic. Therefore, blocking locks and cache flushes
1577 1588 * will not work.
1578 1589 */
1579 1590
1580 1591 /*
1581 1592 * Copy pages, trapping ECC errors. Also, for robustness, trap data
1582 1593 * access in case something goes wrong in the hat layer and the
1583 1594 * mapping is broken.
1584 1595 */
1585 1596 static int
1586 1597 dump_pagecopy(void *src, void *dst)
1587 1598 {
1588 1599 long *wsrc = (long *)src;
1589 1600 long *wdst = (long *)dst;
1590 1601 const ulong_t ncopies = PAGESIZE / sizeof (long);
1591 1602 volatile int w = 0;
1592 1603 volatile int ueoff = -1;
1593 1604 on_trap_data_t otd;
1594 1605
1595 1606 if (on_trap(&otd, OT_DATA_EC | OT_DATA_ACCESS)) {
1596 1607 if (ueoff == -1)
1597 1608 ueoff = w * sizeof (long);
1598 1609 /* report "bad ECC" or "bad address" */
1599 1610 #ifdef _LP64
1600 1611 if (otd.ot_trap & OT_DATA_EC)
1601 1612 wdst[w++] = 0x00badecc00badecc;
1602 1613 else
1603 1614 wdst[w++] = 0x00badadd00badadd;
1604 1615 #else
1605 1616 if (otd.ot_trap & OT_DATA_EC)
1606 1617 wdst[w++] = 0x00badecc;
1607 1618 else
1608 1619 wdst[w++] = 0x00badadd;
1609 1620 #endif
1610 1621 }
1611 1622 while (w < ncopies) {
1612 1623 wdst[w] = wsrc[w];
1613 1624 w++;
1614 1625 }
1615 1626 no_trap();
1616 1627 return (ueoff);
1617 1628 }
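
Editor's note: dump_pagecopy() reports failures in-band by overwriting the bad word with a recognizable sentinel and returning the byte offset of the first bad word. A userland analogue of that convention is sketched below; since on_trap() is kernel-only, the trap is simulated with a fault-injection index, and an LP64 environment is assumed.

	#include <stdio.h>
	#include <stdint.h>

	#define	PAGESIZE	4096

	/*
	 * Copy one page a word at a time; pretend the word at index
	 * fault_at takes an uncorrectable-error trap. Returns the byte
	 * offset of the first bad word, or -1 if the copy was clean.
	 */
	static int
	pagecopy_sim(const long *src, long *dst, long fault_at)
	{
		const unsigned long ncopies = PAGESIZE / sizeof (long);
		long w, ueoff = -1;

		for (w = 0; w < (long)ncopies; w++) {
			if (w == fault_at) {
				if (ueoff == -1)
					ueoff = w * sizeof (long);
				/* LP64 "bad ECC" sentinel, as above */
				dst[w] = (long)0x00badecc00badeccULL;
				continue;
			}
			dst[w] = src[w];
		}
		return ((int)ueoff);
	}

	int
	main(void)
	{
		static long src[PAGESIZE / sizeof (long)];
		static long dst[PAGESIZE / sizeof (long)];

		(void) printf("ueoff = %d\n", pagecopy_sim(src, dst, 3));
		return (0);	/* prints: ueoff = 24 */
	}
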
1618 1629
1619 1630 static void
1620 1631 dumpsys_close_cq(cqueue_t *cq, int live)
1621 1632 {
1622 1633 if (live) {
1623 1634 mutex_enter(&cq->mutex);
1624 1635 atomic_dec_uint(&cq->open);
1625 1636 cv_signal(&cq->cv);
1626 1637 mutex_exit(&cq->mutex);
1627 1638 } else {
1628 1639 atomic_dec_uint(&cq->open);
1629 1640 }
1630 1641 }
1631 1642
1632 1643 static inline void
1633 1644 dumpsys_spinlock(lock_t *lp)
1634 1645 {
1635 1646 uint_t backoff = 0;
1636 1647 int loop_count = 0;
1637 1648
1638 1649 while (LOCK_HELD(lp) || !lock_spin_try(lp)) {
1639 1650 if (++loop_count >= ncpus) {
1640 1651 backoff = mutex_lock_backoff(0);
1641 1652 loop_count = 0;
1642 1653 } else {
1643 1654 backoff = mutex_lock_backoff(backoff);
1644 1655 }
1645 1656 mutex_lock_delay(backoff);
1646 1657 }
1647 1658 }
1648 1659
1649 1660 static inline void
1650 1661 dumpsys_spinunlock(lock_t *lp)
1651 1662 {
1652 1663 lock_clear(lp);
1653 1664 }
1654 1665
1655 1666 static inline void
1656 1667 dumpsys_lock(cqueue_t *cq, int live)
1657 1668 {
1658 1669 if (live)
1659 1670 mutex_enter(&cq->mutex);
1660 1671 else
1661 1672 dumpsys_spinlock(&cq->spinlock);
1662 1673 }
1663 1674
1664 1675 static inline void
1665 1676 dumpsys_unlock(cqueue_t *cq, int live, int signal)
1666 1677 {
1667 1678 if (live) {
1668 1679 if (signal)
1669 1680 cv_signal(&cq->cv);
1670 1681 mutex_exit(&cq->mutex);
1671 1682 } else {
1672 1683 dumpsys_spinunlock(&cq->spinlock);
1673 1684 }
1674 1685 }
1675 1686
1676 1687 static void
1677 1688 dumpsys_wait_cq(cqueue_t *cq, int live)
1678 1689 {
1679 1690 if (live) {
1680 1691 cv_wait(&cq->cv, &cq->mutex);
1681 1692 } else {
1682 1693 dumpsys_spinunlock(&cq->spinlock);
1683 1694 while (cq->open)
1684 1695 if (cq->first)
1685 1696 break;
1686 1697 dumpsys_spinlock(&cq->spinlock);
1687 1698 }
1688 1699 }
1689 1700
1690 1701 static void
1691 1702 dumpsys_put_cq(cqueue_t *cq, cbuf_t *cp, int newstate, int live)
1692 1703 {
1693 1704 if (cp == NULL)
1694 1705 return;
1695 1706
1696 1707 dumpsys_lock(cq, live);
1697 1708
1698 1709 if (cq->ts != 0) {
1699 1710 cq->empty += gethrtime() - cq->ts;
1700 1711 cq->ts = 0;
1701 1712 }
1702 1713
1703 1714 cp->state = newstate;
1704 1715 cp->next = NULL;
1705 1716 if (cq->last == NULL)
1706 1717 cq->first = cp;
1707 1718 else
1708 1719 cq->last->next = cp;
1709 1720 cq->last = cp;
1710 1721
1711 1722 dumpsys_unlock(cq, live, 1);
1712 1723 }
1713 1724
1714 1725 static cbuf_t *
1715 1726 dumpsys_get_cq(cqueue_t *cq, int live)
1716 1727 {
1717 1728 cbuf_t *cp;
1718 1729 hrtime_t now = gethrtime();
1719 1730
1720 1731 dumpsys_lock(cq, live);
1721 1732
1722 1733 /* CONSTCOND */
1723 1734 while (1) {
1724 1735 cp = (cbuf_t *)cq->first;
1725 1736 if (cp == NULL) {
1726 1737 if (cq->open == 0)
1727 1738 break;
1728 1739 dumpsys_wait_cq(cq, live);
1729 1740 continue;
1730 1741 }
1731 1742 cq->first = cp->next;
1732 1743 if (cq->first == NULL) {
1733 1744 cq->last = NULL;
1734 1745 cq->ts = now;
1735 1746 }
1736 1747 break;
1737 1748 }
1738 1749
1739 1750 dumpsys_unlock(cq, live, cq->first != NULL || cq->open == 0);
1740 1751 return (cp);
1741 1752 }
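
Editor's note: in the live path the cqueue behaves as a closeable FIFO — CQ_GET() returns NULL only once the queue is both empty and every producer has dropped its reference via CQ_CLOSE(). A hedged pthread analogue of the live-dump variant is sketched below (the panic-path spinlock/spin-wait variant is omitted); all names here are illustrative, not from the source.

	#include <pthread.h>
	#include <stdio.h>

	typedef struct node {
		struct node *next;
		int v;
	} node_t;

	typedef struct cq {
		pthread_mutex_t mutex;
		pthread_cond_t cv;
		node_t *first;
		node_t *last;
		unsigned open;		/* producers still attached */
	} cq_t;

	static void
	cq_put(cq_t *q, node_t *n)
	{
		(void) pthread_mutex_lock(&q->mutex);
		n->next = NULL;
		if (q->last == NULL)
			q->first = n;
		else
			q->last->next = n;
		q->last = n;
		(void) pthread_cond_signal(&q->cv);
		(void) pthread_mutex_unlock(&q->mutex);
	}

	static node_t *
	cq_get(cq_t *q)
	{
		node_t *n;

		(void) pthread_mutex_lock(&q->mutex);
		/* Sleep until a buffer arrives or the queue is closed and empty. */
		while ((n = q->first) == NULL && q->open > 0)
			(void) pthread_cond_wait(&q->cv, &q->mutex);
		if (n != NULL && (q->first = n->next) == NULL)
			q->last = NULL;
		(void) pthread_mutex_unlock(&q->mutex);
		return (n);		/* NULL means closed and drained */
	}

	static void
	cq_close(cq_t *q)
	{
		(void) pthread_mutex_lock(&q->mutex);
		q->open--;
		(void) pthread_cond_broadcast(&q->cv);
		(void) pthread_mutex_unlock(&q->mutex);
	}

	int
	main(void)
	{
		cq_t q = { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER,
		    NULL, NULL, 1 };
		node_t a = { NULL, 1 }, b = { NULL, 2 };
		node_t *n;

		cq_put(&q, &a);
		cq_put(&q, &b);
		cq_close(&q);		/* the only producer detaches */
		while ((n = cq_get(&q)) != NULL)
			(void) printf("%d\n", n->v);
		return (0);
	}
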
1742 1753
1743 1754 /*
1744 1755 * Send an error message to the console. If the main task is running,
1745 1756 * just write the message via uprintf. If a helper is running, the
1746 1757 * message has to be put on a queue for the main task. Setting fmt to
1747 1758 * NULL means flush the error message buffer. If fmt is not NULL, just
1748 1759 * add the text to the existing buffer.
1749 1760 */
1750 1761 static void
1751 1762 dumpsys_errmsg(helper_t *hp, const char *fmt, ...)
1752 1763 {
1753 1764 dumpsync_t *ds = hp->ds;
1754 1765 cbuf_t *cp = hp->cperr;
1755 1766 va_list adx;
1756 1767
1757 1768 if (hp->helper == MAINHELPER) {
1758 1769 if (fmt != NULL) {
1759 1770 if (ds->neednl) {
1760 1771 uprintf("\n");
1761 1772 ds->neednl = 0;
1762 1773 }
1763 1774 va_start(adx, fmt);
1764 1775 vuprintf(fmt, adx);
1765 1776 va_end(adx);
1766 1777 }
1767 1778 } else if (fmt == NULL) {
1768 1779 if (cp != NULL) {
1769 1780 CQ_PUT(mainq, cp, CBUF_ERRMSG);
1770 1781 hp->cperr = NULL;
1771 1782 }
1772 1783 } else {
1773 1784 if (hp->cperr == NULL) {
1774 1785 cp = CQ_GET(freebufq);
1775 1786 hp->cperr = cp;
1776 1787 cp->used = 0;
1777 1788 }
1778 1789 va_start(adx, fmt);
1779 1790 cp->used += vsnprintf(cp->buf + cp->used, cp->size - cp->used,
1780 1791 fmt, adx);
1781 1792 va_end(adx);
1782 1793 if ((cp->used + LOG_MSGSIZE) > cp->size) {
1783 1794 CQ_PUT(mainq, cp, CBUF_ERRMSG);
1784 1795 hp->cperr = NULL;
1785 1796 }
1786 1797 }
1787 1798 }
1788 1799
1789 1800 /*
1790 1801 * Write an output buffer to the dump file. If the main task is
1791 1802 * running, just write the data. If a helper is running, the output is
1792 1803 * placed on a queue for the main task.
1793 1804 */
1794 1805 static void
1795 1806 dumpsys_swrite(helper_t *hp, cbuf_t *cp, size_t used)
1796 1807 {
1797 1808 dumpsync_t *ds = hp->ds;
1798 1809
1799 1810 if (hp->helper == MAINHELPER) {
1800 1811 HRSTART(ds->perpage, write);
1801 1812 dumpvp_write(cp->buf, used);
1802 1813 HRSTOP(ds->perpage, write);
1803 1814 CQ_PUT(freebufq, cp, CBUF_FREEBUF);
1804 1815 } else {
1805 1816 cp->used = used;
1806 1817 CQ_PUT(mainq, cp, CBUF_WRITE);
1807 1818 }
1808 1819 }
1809 1820
1810 1821 /*
1811 1822 * Copy one page within the mapped range. The offset starts at 0 and
1812 1823 * is relative to the first pfn. cp->buf + cp->off is the address of
1813 1824 * the first pfn. If dump_pagecopy returns a UE offset, create an
1814 1825 * error message. Returns the offset to the next pfn in the range
1815 1826 * selected by the bitmap.
1816 1827 */
1817 1828 static int
1818 1829 dumpsys_copy_page(helper_t *hp, int offset)
1819 1830 {
1820 1831 cbuf_t *cp = hp->cpin;
1821 1832 int ueoff;
1822 1833
1823 1834 ASSERT(cp->off + offset + PAGESIZE <= cp->size);
1824 1835 ASSERT(BT_TEST(dumpcfg.bitmap, cp->bitnum));
1825 1836
1826 1837 ueoff = dump_pagecopy(cp->buf + cp->off + offset, hp->page);
1827 1838
1828 1839 /* ueoff is the offset in the page to a UE error */
1829 1840 if (ueoff != -1) {
1830 1841 uint64_t pa = ptob(cp->pfn) + offset + ueoff;
1831 1842
1832 1843 dumpsys_errmsg(hp, "cpu %d: memory error at PA 0x%08x.%08x\n",
1833 1844 CPU->cpu_id, (uint32_t)(pa >> 32), (uint32_t)pa);
1834 1845 }
1835 1846
1836 1847 /*
1837 1848 * Advance bitnum and offset to the next input page for the
1838 1849 * next call to this function.
1839 1850 */
1840 1851 offset += PAGESIZE;
1841 1852 cp->bitnum++;
1842 1853 while (cp->off + offset < cp->size) {
1843 1854 if (BT_TEST(dumpcfg.bitmap, cp->bitnum))
1844 1855 break;
1845 1856 offset += PAGESIZE;
1846 1857 cp->bitnum++;
1847 1858 }
1848 1859
1849 1860 return (offset);
1850 1861 }
1851 1862
1852 1863 /*
1853 1864 * Read the helper queue, and copy one mapped page. Return 0 when
1854 1865 * done. Return 1 when a page has been copied into hp->page.
1855 1866 */
1856 1867 static int
1857 1868 dumpsys_sread(helper_t *hp)
1858 1869 {
1859 1870 dumpsync_t *ds = hp->ds;
1860 1871
1861 1872 /* CONSTCOND */
1862 1873 while (1) {
1863 1874
1864 1875 /* Find the next input buffer. */
1865 1876 if (hp->cpin == NULL) {
1866 1877 HRSTART(hp->perpage, inwait);
1867 1878
1868 1879 /* CONSTCOND */
1869 1880 while (1) {
1870 1881 hp->cpin = CQ_GET(helperq);
1871 1882 dump_timeleft = dump_timeout;
1872 1883
1873 1884 /*
1874 1885 * NULL return means the helper queue
1875 1886 * is closed and empty.
1876 1887 */
1877 1888 if (hp->cpin == NULL)
1878 1889 break;
1879 1890
1880 1891 /* Have input, check for dump I/O error. */
1881 1892 if (!dump_ioerr)
1882 1893 break;
1883 1894
1884 1895 /*
1885 1896 * If an I/O error occurs, stay in the
1886 1897 * loop in order to empty the helper
1887 1898 * queue. Return the buffers to the
1888 1899 * main task to unmap and free them.
1889 1900 */
1890 1901 hp->cpin->used = 0;
1891 1902 CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP);
1892 1903 }
1893 1904 HRSTOP(hp->perpage, inwait);
1894 1905
1895 1906 /* Stop here when the helper queue is closed. */
1896 1907 if (hp->cpin == NULL)
1897 1908 break;
1898 1909
1899 1910 /* Set offset to 0 to get the first pfn. */
1900 1911 hp->in = 0;
1901 1912
1902 1913 /* Set the total processed to 0 */
1903 1914 hp->used = 0;
1904 1915 }
1905 1916
1906 1917 /* Process the next page. */
1907 1918 if (hp->used < hp->cpin->used) {
1908 1919
1909 1920 /*
1910 1921 * Get the next page from the input buffer and
1911 1922 * return a copy.
1912 1923 */
1913 1924 ASSERT(hp->in != -1);
1914 1925 HRSTART(hp->perpage, copy);
1915 1926 hp->in = dumpsys_copy_page(hp, hp->in);
1916 1927 hp->used += PAGESIZE;
1917 1928 HRSTOP(hp->perpage, copy);
1918 1929 break;
1919 1930
1920 1931 } else {
1921 1932
1922 1933 /*
1923 1934 * Done with the input. Flush the VM and
1924 1935 * return the buffer to the main task.
1925 1936 */
1926 1937 if (panicstr && hp->helper != MAINHELPER)
1927 1938 hat_flush_range(kas.a_hat,
1928 1939 hp->cpin->buf, hp->cpin->size);
1929 1940 dumpsys_errmsg(hp, NULL);
1930 1941 CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP);
1931 1942 hp->cpin = NULL;
1932 1943 }
1933 1944 }
1934 1945
1935 1946 return (hp->cpin != NULL);
1936 1947 }
1937 1948
1938 1949 /*
1939 1950 * Compress size bytes starting at buf with bzip2
1940 1951 * mode:
1941 1952 * BZ_RUN add one more compressed page
1942 1953 * BZ_FINISH no more input, flush the state
1943 1954 */
1944 1955 static void
1945 1956 dumpsys_bzrun(helper_t *hp, void *buf, size_t size, int mode)
1946 1957 {
1947 1958 dumpsync_t *ds = hp->ds;
1948 1959 const int CSIZE = sizeof (dumpcsize_t);
1949 1960 bz_stream *ps = &hp->bzstream;
1950 1961 int rc = 0;
1951 1962 uint32_t csize;
1952 1963 dumpcsize_t cs;
1953 1964
1954 1965 /* Set input pointers to new input page */
1955 1966 if (size > 0) {
1956 1967 ps->avail_in = size;
1957 1968 ps->next_in = buf;
1958 1969 }
1959 1970
1960 1971 /* CONSTCOND */
1961 1972 while (1) {
1962 1973
1963 1974 /* Quit when all input has been consumed */
1964 1975 if (ps->avail_in == 0 && mode == BZ_RUN)
1965 1976 break;
1966 1977
1967 1978 /* Get a new output buffer */
1968 1979 if (hp->cpout == NULL) {
1969 1980 HRSTART(hp->perpage, outwait);
1970 1981 hp->cpout = CQ_GET(freebufq);
1971 1982 HRSTOP(hp->perpage, outwait);
1972 1983 ps->avail_out = hp->cpout->size - CSIZE;
1973 1984 ps->next_out = hp->cpout->buf + CSIZE;
1974 1985 }
1975 1986
1976 1987 /* Compress input, or finalize */
1977 1988 HRSTART(hp->perpage, compress);
1978 1989 rc = BZ2_bzCompress(ps, mode);
1979 1990 HRSTOP(hp->perpage, compress);
1980 1991
1981 1992 /* Check for error */
1982 1993 if (mode == BZ_RUN && rc != BZ_RUN_OK) {
1983 1994 dumpsys_errmsg(hp, "%d: BZ_RUN error %s at page %lx\n",
1984 1995 hp->helper, BZ2_bzErrorString(rc),
1985 1996 hp->cpin->pagenum);
1986 1997 break;
1987 1998 }
1988 1999
1989 2000 /* Write the buffer if it is full, or we are flushing */
1990 2001 if (ps->avail_out == 0 || mode == BZ_FINISH) {
1991 2002 csize = hp->cpout->size - CSIZE - ps->avail_out;
1992 2003 cs = DUMP_SET_TAG(csize, hp->tag);
1993 2004 if (csize > 0) {
1994 2005 (void) memcpy(hp->cpout->buf, &cs, CSIZE);
1995 2006 dumpsys_swrite(hp, hp->cpout, csize + CSIZE);
1996 2007 hp->cpout = NULL;
1997 2008 }
1998 2009 }
1999 2010
2000 2011 /* Check for final completion */
2001 2012 if (mode == BZ_FINISH) {
2002 2013 if (rc == BZ_STREAM_END)
2003 2014 break;
2004 2015 if (rc != BZ_FINISH_OK) {
2005 2016 dumpsys_errmsg(hp, "%d: BZ_FINISH error %s\n",
2006 2017 hp->helper, BZ2_bzErrorString(rc));
2007 2018 break;
2008 2019 }
2009 2020 }
2010 2021 }
2011 2022
2012 2023 /* Cleanup state and buffers */
2013 2024 if (mode == BZ_FINISH) {
2014 2025
2015 2026 /* Reset state so that it is re-usable. */
2016 2027 (void) BZ2_bzCompressReset(&hp->bzstream);
2017 2028
2018 2029 /* Give any unused output buffer to the main task */
2019 2030 if (hp->cpout != NULL) {
2020 2031 hp->cpout->used = 0;
2021 2032 CQ_PUT(mainq, hp->cpout, CBUF_ERRMSG);
2022 2033 hp->cpout = NULL;
2023 2034 }
2024 2035 }
2025 2036 }
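
Editor's note: the BZ_RUN/BZ_FINISH protocol above is the standard bzlib streaming interface. For reference, a minimal userland use of the same calls is sketched below (compile with -lbz2); the input buffer contents are arbitrary.

	#include <stdio.h>
	#include <string.h>
	#include <bzlib.h>

	int
	main(void)
	{
		char in[] = "hello hello hello hello hello hello";
		char out[4096];
		bz_stream bz;
		int rc;

		(void) memset(&bz, 0, sizeof (bz));	/* default allocators */
		if (BZ2_bzCompressInit(&bz, 1, 0, 0) != BZ_OK)
			return (1);

		bz.next_in = in;
		bz.avail_in = sizeof (in);
		bz.next_out = out;
		bz.avail_out = sizeof (out);

		/* BZ_RUN consumes input; further calls would add more input. */
		while ((rc = BZ2_bzCompress(&bz, BZ_RUN)) == BZ_RUN_OK &&
		    bz.avail_in > 0)
			;

		/* BZ_FINISH flushes internal state until the stream ends. */
		while ((rc = BZ2_bzCompress(&bz, BZ_FINISH)) == BZ_FINISH_OK)
			;
		if (rc != BZ_STREAM_END)
			return (1);

		(void) printf("compressed %zu -> %u bytes\n", sizeof (in),
		    (unsigned)(sizeof (out) - bz.avail_out));
		(void) BZ2_bzCompressEnd(&bz);
		return (0);
	}
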
2026 2037
2027 2038 static void
2028 2039 dumpsys_bz2compress(helper_t *hp)
2029 2040 {
2030 2041 dumpsync_t *ds = hp->ds;
2031 2042 dumpstreamhdr_t sh;
2032 2043
2033 2044 (void) strcpy(sh.stream_magic, DUMP_STREAM_MAGIC);
2034 2045 sh.stream_pagenum = (pgcnt_t)-1;
2035 2046 sh.stream_npages = 0;
2036 2047 hp->cpin = NULL;
2037 2048 hp->cpout = NULL;
2038 2049 hp->cperr = NULL;
2039 2050 hp->in = 0;
2040 2051 hp->out = 0;
2041 2052 hp->bzstream.avail_in = 0;
2042 2053
2043 2054 /* Bump reference to mainq while we are running */
2044 2055 CQ_OPEN(mainq);
2045 2056
2046 2057 /* Get one page at a time */
2047 2058 while (dumpsys_sread(hp)) {
2048 2059 if (sh.stream_pagenum != hp->cpin->pagenum) {
2049 2060 sh.stream_pagenum = hp->cpin->pagenum;
2050 2061 sh.stream_npages = btop(hp->cpin->used);
2051 2062 dumpsys_bzrun(hp, &sh, sizeof (sh), BZ_RUN);
2052 2063 }
2053 2064 dumpsys_bzrun(hp, hp->page, PAGESIZE, 0);
2054 2065 }
2055 2066
2056 2067 /* Done with input, flush any partial buffer */
2057 2068 if (sh.stream_pagenum != (pgcnt_t)-1) {
2058 2069 dumpsys_bzrun(hp, NULL, 0, BZ_FINISH);
2059 2070 dumpsys_errmsg(hp, NULL);
2060 2071 }
2061 2072
2062 2073 ASSERT(hp->cpin == NULL && hp->cpout == NULL && hp->cperr == NULL);
2063 2074
2064 2075 /* Decrement main queue count, we are done */
2065 2076 CQ_CLOSE(mainq);
2066 2077 }
2067 2078
2068 2079 /*
2069 2080 * Compress with lzjb
2070 2081 * write stream block if full or size==0
2071 2082 * if csize==0 write stream header, else write <csize, data>
2072 2083 * size==0 is a call to flush a buffer
2073 2084 * hp->cpout is the buffer we are flushing or filling
2074 2085 * hp->out is the next index to fill data
2075 2086 * osize is either csize+data, or the size of a stream header
2076 2087 */
2077 2088 static void
2078 2089 dumpsys_lzjbrun(helper_t *hp, size_t csize, void *buf, size_t size)
2079 2090 {
2080 2091 dumpsync_t *ds = hp->ds;
2081 2092 const int CSIZE = sizeof (dumpcsize_t);
2082 2093 dumpcsize_t cs;
2083 2094 size_t osize = csize > 0 ? CSIZE + size : size;
2084 2095
2085 2096 /* If flush, and there is no buffer, just return */
2086 2097 if (size == 0 && hp->cpout == NULL)
2087 2098 return;
2088 2099
2089 2100 /* If flush, or cpout is full, write it out */
2090 2101 if (size == 0 ||
2091 2102 hp->cpout != NULL && hp->out + osize > hp->cpout->size) {
2092 2103
2093 2104 /* Set tag+size word at the front of the stream block. */
2094 2105 cs = DUMP_SET_TAG(hp->out - CSIZE, hp->tag);
2095 2106 (void) memcpy(hp->cpout->buf, &cs, CSIZE);
2096 2107
2097 2108 /* Write block to dump file. */
2098 2109 dumpsys_swrite(hp, hp->cpout, hp->out);
2099 2110
2100 2111 /* Clear pointer to indicate we need a new buffer */
2101 2112 hp->cpout = NULL;
2102 2113
2103 2114 /* flushing, we are done */
2104 2115 if (size == 0)
2105 2116 return;
2106 2117 }
2107 2118
2108 2119 /* Get an output buffer if we don't have one. */
2109 2120 if (hp->cpout == NULL) {
2110 2121 HRSTART(hp->perpage, outwait);
2111 2122 hp->cpout = CQ_GET(freebufq);
2112 2123 HRSTOP(hp->perpage, outwait);
2113 2124 hp->out = CSIZE;
2114 2125 }
2115 2126
2116 2127 /* Store csize word. This is the size of compressed data. */
2117 2128 if (csize > 0) {
2118 2129 cs = DUMP_SET_TAG(csize, 0);
2119 2130 (void) memcpy(hp->cpout->buf + hp->out, &cs, CSIZE);
2120 2131 hp->out += CSIZE;
2121 2132 }
2122 2133
2123 2134 /* Store the data. */
2124 2135 (void) memcpy(hp->cpout->buf + hp->out, buf, size);
2125 2136 hp->out += size;
2126 2137 }
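
Editor's note: both compressors frame their output with a dumpcsize_t word that packs a helper tag together with a byte count via DUMP_SET_TAG(). A small pack/unpack round trip is sketched below; the macro bodies are assumptions modeled on the DUMP_SET_TAG() usage above, not copied from <sys/dumphdr.h>.

	#include <stdio.h>
	#include <stdint.h>

	typedef uint32_t dumpcsize_t;

	/* Assumed layout: tag in the top byte, size in the low 24 bits. */
	#define	SET_TAG(csize, tag)	(((dumpcsize_t)(tag) << 24) | (csize))
	#define	GET_TAG(w)		(((w) >> 24) & 0xffU)
	#define	GET_CSIZE(w)		((w) & 0xffffffU)

	int
	main(void)
	{
		dumpcsize_t cs = SET_TAG(0x1234, 5);

		(void) printf("word 0x%08x: tag %u, csize 0x%x\n",
		    cs, GET_TAG(cs), GET_CSIZE(cs));
		return (0);	/* word 0x05001234: tag 5, csize 0x1234 */
	}
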
2127 2138
2128 2139 static void
2129 2140 dumpsys_lzjbcompress(helper_t *hp)
2130 2141 {
2131 2142 dumpsync_t *ds = hp->ds;
2132 2143 size_t csize;
2133 2144 dumpstreamhdr_t sh;
2134 2145
2135 2146 (void) strcpy(sh.stream_magic, DUMP_STREAM_MAGIC);
2136 2147 sh.stream_pagenum = (pfn_t)-1;
2137 2148 sh.stream_npages = 0;
2138 2149 hp->cpin = NULL;
2139 2150 hp->cpout = NULL;
2140 2151 hp->cperr = NULL;
2141 2152 hp->in = 0;
2142 2153 hp->out = 0;
2143 2154
2144 2155 /* Bump reference to mainq while we are running */
2145 2156 CQ_OPEN(mainq);
2146 2157
2147 2158 /* Get one page at a time */
2148 2159 while (dumpsys_sread(hp)) {
2149 2160
2150 2161 /* Create a stream header for each new input map */
2151 2162 if (sh.stream_pagenum != hp->cpin->pagenum) {
2152 2163 sh.stream_pagenum = hp->cpin->pagenum;
2153 2164 sh.stream_npages = btop(hp->cpin->used);
2154 2165 dumpsys_lzjbrun(hp, 0, &sh, sizeof (sh));
2155 2166 }
2156 2167
2157 2168 /* Compress one page */
2158 2169 HRSTART(hp->perpage, compress);
2159 2170 csize = compress(hp->page, hp->lzbuf, PAGESIZE);
2160 2171 HRSTOP(hp->perpage, compress);
2161 2172
2162 2173 /* Add csize+data to output block */
2163 2174 ASSERT(csize > 0 && csize <= PAGESIZE);
2164 2175 dumpsys_lzjbrun(hp, csize, hp->lzbuf, csize);
2165 2176 }
2166 2177
2167 2178 /* Done with input, flush any partial buffer */
2168 2179 if (sh.stream_pagenum != (pfn_t)-1) {
2169 2180 dumpsys_lzjbrun(hp, 0, NULL, 0);
2170 2181 dumpsys_errmsg(hp, NULL);
2171 2182 }
2172 2183
2173 2184 ASSERT(hp->cpin == NULL && hp->cpout == NULL && hp->cperr == NULL);
2174 2185
2175 2186 /* Decrement main queue count, we are done */
2176 2187 CQ_CLOSE(mainq);
2177 2188 }
2178 2189
2179 2190 /*
2180 2191 * Dump helper called from panic_idle() to compress pages. CPUs in
2181 2192 * this path must not call most kernel services.
2182 2193 *
2183 2194 * During panic, all but one of the CPUs is idle. These CPUs are used
2184 2195 * as helpers working in parallel to copy and compress memory
2185 2196 * pages. During a panic, however, these processors cannot call any
2186 2197 * kernel services. This is because mutexes become no-ops during
2187 2198 * panic, and cross-call interrupts are inhibited. Therefore, during
2188 2199 * panic dump the helper CPUs communicate with the panic CPU using
2189 2200 * memory variables. All memory mapping and I/O is performed by the
2190 2201 * panic CPU.
2191 2202 *
2192 2203 * At dump configuration time, helper_lock is set and helpers_wanted
2193 2204 * is 0. dumpsys() decides whether to set helpers_wanted before
2194 2205 * clearing helper_lock.
2195 2206 *
2196 2207 * At panic time, idle CPUs spin-wait on helper_lock, then alternately
2197 2208 * take the lock and become a helper, or return.
2198 2209 */
2199 2210 void
2200 2211 dumpsys_helper()
2201 2212 {
2202 2213 dumpsys_spinlock(&dumpcfg.helper_lock);
2203 2214 if (dumpcfg.helpers_wanted) {
2204 2215 helper_t *hp, *hpend = &dumpcfg.helper[dumpcfg.nhelper];
2205 2216
2206 2217 for (hp = dumpcfg.helper; hp != hpend; hp++) {
2207 2218 if (hp->helper == FREEHELPER) {
2208 2219 hp->helper = CPU->cpu_id;
2209 2220 BT_SET(dumpcfg.helpermap, CPU->cpu_seqid);
2210 2221
2211 2222 dumpsys_spinunlock(&dumpcfg.helper_lock);
2212 2223
2213 2224 if (dumpcfg.clevel < DUMP_CLEVEL_BZIP2)
2214 2225 dumpsys_lzjbcompress(hp);
2215 2226 else
2216 2227 dumpsys_bz2compress(hp);
2217 2228
2218 2229 hp->helper = DONEHELPER;
2219 2230 return;
2220 2231 }
2221 2232 }
2222 2233
2223 2234 /* No more helpers are needed. */
2224 2235 dumpcfg.helpers_wanted = 0;
2225 2236
2226 2237 }
2227 2238 dumpsys_spinunlock(&dumpcfg.helper_lock);
2228 2239 }
2229 2240
2230 2241 /*
2231 2242 * No-wait helper callable in spin loops.
2232 2243 *
2233 2244 * Do not wait for helper_lock. Just check helpers_wanted. The caller
2234 2245 * may decide to continue. This is the "c)ontinue, s)ync, r)eset? s"
2235 2246 * case.
2236 2247 */
2237 2248 void
2238 2249 dumpsys_helper_nw()
2239 2250 {
2240 2251 if (dumpcfg.helpers_wanted)
2241 2252 dumpsys_helper();
2242 2253 }
2243 2254
2244 2255 /*
2245 2256 * Dump helper for live dumps.
2246 2257 * These run as a system task.
2247 2258 */
2248 2259 static void
2249 2260 dumpsys_live_helper(void *arg)
2250 2261 {
2251 2262 helper_t *hp = arg;
2252 2263
2253 2264 BT_ATOMIC_SET(dumpcfg.helpermap, CPU->cpu_seqid);
2254 2265 if (dumpcfg.clevel < DUMP_CLEVEL_BZIP2)
2255 2266 dumpsys_lzjbcompress(hp);
2256 2267 else
2257 2268 dumpsys_bz2compress(hp);
2258 2269 }
2259 2270
2260 2271 /*
2261 2272 * Compress one page with lzjb (single threaded case)
2262 2273 */
2263 2274 static void
2264 2275 dumpsys_lzjb_page(helper_t *hp, cbuf_t *cp)
2265 2276 {
2266 2277 dumpsync_t *ds = hp->ds;
2267 2278 uint32_t csize;
2268 2279
2269 2280 hp->helper = MAINHELPER;
2270 2281 hp->in = 0;
2271 2282 hp->used = 0;
2272 2283 hp->cpin = cp;
2273 2284 while (hp->used < cp->used) {
2274 2285 HRSTART(hp->perpage, copy);
2275 2286 hp->in = dumpsys_copy_page(hp, hp->in);
2276 2287 hp->used += PAGESIZE;
2277 2288 HRSTOP(hp->perpage, copy);
2278 2289
2279 2290 HRSTART(hp->perpage, compress);
2280 2291 csize = compress(hp->page, hp->lzbuf, PAGESIZE);
2281 2292 HRSTOP(hp->perpage, compress);
2282 2293
2283 2294 HRSTART(hp->perpage, write);
2284 2295 dumpvp_write(&csize, sizeof (csize));
2285 2296 dumpvp_write(hp->lzbuf, csize);
2286 2297 HRSTOP(hp->perpage, write);
2287 2298 }
2288 2299 CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP);
2289 2300 hp->cpin = NULL;
2290 2301 }
2291 2302
2292 2303 /*
2293 2304 * Main task to dump pages. This is called on the dump CPU.
2294 2305 */
2295 2306 static void
2296 2307 dumpsys_main_task(void *arg)
2297 2308 {
2298 2309 dumpsync_t *ds = arg;
2299 2310 pgcnt_t pagenum = 0, bitnum = 0, hibitnum;
2300 2311 dumpmlw_t mlw;
2301 2312 cbuf_t *cp;
2302 2313 pgcnt_t baseoff, pfnoff;
2303 2314 pfn_t base, pfn;
2304 - int i, dumpserial;
2315 + boolean_t dumpserial;
2316 + int i;
2305 2317
2306 2318 /*
2307 2319 * Fall back to serial mode if there are no helpers.
2308 2320 * dump_plat_mincpu can be set to 0 at any time.
2309 2321 * dumpcfg.helpermap must contain at least one member.
2322 + *
2323 + * It is possible that the helpers haven't registered
2324 + * in helpermap yet; wait up to DUMP_HELPER_MAX_WAIT for
2325 + * at least one helper to register.
2310 2326 */
2311 - dumpserial = 1;
2312 -
2327 + dumpserial = B_TRUE;
2313 2328 if (dump_plat_mincpu != 0 && dumpcfg.clevel != 0) {
2314 - for (i = 0; i < BT_BITOUL(NCPU); ++i) {
2315 - if (dumpcfg.helpermap[i] != 0) {
2316 - dumpserial = 0;
2329 + hrtime_t hrtmax = MSEC2NSEC(DUMP_HELPER_MAX_WAIT);
2330 + hrtime_t hrtstart = gethrtime();
2331 +
2332 + for (;;) {
2333 + for (i = 0; i < BT_BITOUL(NCPU); ++i) {
2334 + if (dumpcfg.helpermap[i] != 0) {
2335 + dumpserial = B_FALSE;
2336 + break;
2337 + }
2338 + }
2339 +
2340 + if ((!dumpserial) ||
2341 + ((gethrtime() - hrtstart) >= hrtmax)) {
2317 2342 break;
2318 2343 }
2344 +
2345 + SMT_PAUSE();
2319 2346 }
2320 - }
2321 2347
2322 - if (dumpserial) {
2323 - dumpcfg.clevel = 0;
2324 - if (dumpcfg.helper[0].lzbuf == NULL)
2325 - dumpcfg.helper[0].lzbuf = dumpcfg.helper[1].page;
2348 + if (dumpserial) {
2349 + dumpcfg.clevel = 0;
2350 + if (dumpcfg.helper[0].lzbuf == NULL) {
2351 + dumpcfg.helper[0].lzbuf =
2352 + dumpcfg.helper[1].page;
2353 + }
2354 + }
2326 2355 }
2327 2356
2328 2357 dump_init_memlist_walker(&mlw);
2329 2358
2330 2359 for (;;) {
2331 2360 int sec = (gethrtime() - ds->start) / NANOSEC;
2332 2361
2333 2362 /*
2334 2363 * Render a simple progress display on the system console to
2335 2364 * make clear to the operator that the system has not hung.
2336 2365 * Emit an update when dump progress has advanced by one
2337 2366 * percent, or when no update has been drawn in the last
2338 2367 * second.
2339 2368 */
2340 2369 if (ds->percent > ds->percent_done || sec > ds->sec_done) {
2341 2370 ds->sec_done = sec;
2342 2371 ds->percent_done = ds->percent;
2343 2372 uprintf("^\rdumping: %2d:%02d %3d%% done",
2344 2373 sec / 60, sec % 60, ds->percent);
2345 2374 ds->neednl = 1;
2346 2375 }
2347 2376
2348 2377 while (CQ_IS_EMPTY(mainq) && !CQ_IS_EMPTY(writerq)) {
2349 2378
2350 2379 /* the writerq never blocks */
2351 2380 cp = CQ_GET(writerq);
2352 2381 if (cp == NULL)
2353 2382 break;
2354 2383
2355 2384 dump_timeleft = dump_timeout;
2356 2385
2357 2386 HRSTART(ds->perpage, write);
2358 2387 dumpvp_write(cp->buf, cp->used);
2359 2388 HRSTOP(ds->perpage, write);
2360 2389
2361 2390 CQ_PUT(freebufq, cp, CBUF_FREEBUF);
2362 2391 }
2363 2392
2364 2393 /*
2365 2394 * Wait here for some buffers to process. Returns NULL
2366 2395 * when all helpers have terminated and all buffers
2367 2396 * have been processed.
2368 2397 */
2369 2398 cp = CQ_GET(mainq);
2370 2399
2371 2400 if (cp == NULL) {
2372 2401
2373 2402 /* Drain the write queue. */
2374 2403 if (!CQ_IS_EMPTY(writerq))
2375 2404 continue;
2376 2405
2377 2406 /* Main task exits here. */
2378 2407 break;
2379 2408 }
2380 2409
2381 2410 dump_timeleft = dump_timeout;
2382 2411
2383 2412 switch (cp->state) {
2384 2413
2385 2414 case CBUF_FREEMAP:
2386 2415
2387 2416 /*
2388 2417 * Note that we drop CBUF_FREEMAP buffers on
2389 2418 * the floor (they will not be on any cqueue)
2390 2419 * when we no longer need them.
2391 2420 */
2392 2421 if (bitnum >= dumpcfg.bitmapsize)
2393 2422 break;
2394 2423
2395 2424 if (dump_ioerr) {
2396 2425 bitnum = dumpcfg.bitmapsize;
2397 2426 CQ_CLOSE(helperq);
2398 2427 break;
2399 2428 }
2400 2429
2401 2430 HRSTART(ds->perpage, bitmap);
2402 2431 for (; bitnum < dumpcfg.bitmapsize; bitnum++)
2403 2432 if (BT_TEST(dumpcfg.bitmap, bitnum))
2404 2433 break;
2405 2434 HRSTOP(ds->perpage, bitmap);
2406 2435 dump_timeleft = dump_timeout;
2407 2436
2408 2437 if (bitnum >= dumpcfg.bitmapsize) {
2409 2438 CQ_CLOSE(helperq);
2410 2439 break;
2411 2440 }
2412 2441
2413 2442 /*
2414 2443 * Try to map CBUF_MAPSIZE ranges. Can't
2415 2444 * assume that memory segment size is a
2416 2445 * multiple of CBUF_MAPSIZE. Can't assume that
2417 2446 * the segment starts on a CBUF_MAPSIZE
2418 2447 * boundary.
2419 2448 */
2420 2449 pfn = dump_bitnum_to_pfn(bitnum, &mlw);
2421 2450 ASSERT(pfn != PFN_INVALID);
2422 2451 ASSERT(bitnum + mlw.mpleft <= dumpcfg.bitmapsize);
2423 2452
2424 2453 base = P2ALIGN(pfn, CBUF_MAPNP);
2425 2454 if (base < mlw.mpaddr) {
2426 2455 base = mlw.mpaddr;
2427 2456 baseoff = P2PHASE(base, CBUF_MAPNP);
2428 2457 } else {
2429 2458 baseoff = 0;
2430 2459 }
2431 2460
2432 2461 pfnoff = pfn - base;
2433 2462 if (pfnoff + mlw.mpleft < CBUF_MAPNP) {
2434 2463 hibitnum = bitnum + mlw.mpleft;
2435 2464 cp->size = ptob(pfnoff + mlw.mpleft);
2436 2465 } else {
2437 2466 hibitnum = bitnum - pfnoff + CBUF_MAPNP -
2438 2467 baseoff;
2439 2468 cp->size = CBUF_MAPSIZE - ptob(baseoff);
2440 2469 }
2441 2470
2442 2471 cp->pfn = pfn;
2443 2472 cp->bitnum = bitnum++;
2444 2473 cp->pagenum = pagenum++;
2445 2474 cp->off = ptob(pfnoff);
2446 2475
2447 2476 for (; bitnum < hibitnum; bitnum++)
2448 2477 if (BT_TEST(dumpcfg.bitmap, bitnum))
2449 2478 pagenum++;
2450 2479
2451 2480 dump_timeleft = dump_timeout;
2452 2481 cp->used = ptob(pagenum - cp->pagenum);
2453 2482
2454 2483 HRSTART(ds->perpage, map);
2455 2484 hat_devload(kas.a_hat, cp->buf, cp->size, base,
2456 2485 PROT_READ, HAT_LOAD_NOCONSIST);
2457 2486 HRSTOP(ds->perpage, map);
2458 2487
2459 2488 ds->pages_mapped += btop(cp->size);
2460 2489 ds->pages_used += pagenum - cp->pagenum;
2461 2490
2462 2491 CQ_OPEN(mainq);
2463 2492
2464 2493 /*
2465 2494 * If there are no helpers the main task does
2466 2495 * non-streams lzjb compress.
2467 2496 */
2468 2497 if (dumpserial) {
2469 2498 dumpsys_lzjb_page(dumpcfg.helper, cp);
2470 - break;
2499 + } else {
2500 + /* pass mapped pages to a helper */
2501 + CQ_PUT(helperq, cp, CBUF_INREADY);
2471 2502 }
2472 2503
2473 - /* pass mapped pages to a helper */
2474 - CQ_PUT(helperq, cp, CBUF_INREADY);
2475 -
2476 2504 /* the last page was done */
2477 2505 if (bitnum >= dumpcfg.bitmapsize)
2478 2506 CQ_CLOSE(helperq);
2479 2507
2480 2508 break;
2481 2509
2482 2510 case CBUF_USEDMAP:
2483 2511
2484 2512 ds->npages += btop(cp->used);
2485 2513
2486 2514 HRSTART(ds->perpage, unmap);
2487 2515 hat_unload(kas.a_hat, cp->buf, cp->size, HAT_UNLOAD);
2488 2516 HRSTOP(ds->perpage, unmap);
2489 2517
2490 2518 if (bitnum < dumpcfg.bitmapsize)
2491 2519 CQ_PUT(mainq, cp, CBUF_FREEMAP);
2492 2520 CQ_CLOSE(mainq);
2493 2521
2494 2522 ASSERT(ds->npages <= dumphdr->dump_npages);
2495 2523 ds->percent = ds->npages * 100LL / dumphdr->dump_npages;
2496 2524 break;
2497 2525
2498 2526 case CBUF_WRITE:
2499 2527
2500 2528 CQ_PUT(writerq, cp, CBUF_WRITE);
2501 2529 break;
2502 2530
2503 2531 case CBUF_ERRMSG:
2504 2532
2505 2533 if (cp->used > 0) {
2506 2534 cp->buf[cp->size - 2] = '\n';
2507 2535 cp->buf[cp->size - 1] = '\0';
2508 2536 if (ds->neednl) {
2509 2537 uprintf("\n%s", cp->buf);
2510 2538 ds->neednl = 0;
2511 2539 } else {
2512 2540 uprintf("%s", cp->buf);
2513 2541 }
2514 2542 /* wait for console output */
2515 2543 drv_usecwait(200000);
2516 2544 dump_timeleft = dump_timeout;
2517 2545 }
2518 2546 CQ_PUT(freebufq, cp, CBUF_FREEBUF);
2519 2547 break;
2520 2548
2521 2549 default:
2522 2550 uprintf("dump: unexpected buffer state %d, "
2523 2551 "buffer will be lost\n", cp->state);
2524 2552 break;
2525 2553
2526 2554 } /* end switch */
2527 2555 }
2528 2556 }
2529 2557
2530 2558 #ifdef COLLECT_METRICS
2531 2559 size_t
2532 2560 dumpsys_metrics(dumpsync_t *ds, char *buf, size_t size)
2533 2561 {
2534 2562 dumpcfg_t *cfg = &dumpcfg;
2535 2563 int myid = CPU->cpu_seqid;
2536 2564 int i, compress_ratio;
2537 2565 int sec, iorate;
2538 2566 helper_t *hp, *hpend = &cfg->helper[cfg->nhelper];
2539 2567 char *e = buf + size;
2540 2568 char *p = buf;
2541 2569
2542 2570 sec = ds->elapsed / (1000 * 1000 * 1000ULL);
2543 2571 if (sec < 1)
2544 2572 sec = 1;
2545 2573
2546 2574 if (ds->iotime < 1)
2547 2575 ds->iotime = 1;
2548 2576 iorate = (ds->nwrite * 100000ULL) / ds->iotime;
2549 2577
2550 2578 compress_ratio = 100LL * ds->npages / btopr(ds->nwrite + 1);
2551 2579
2552 2580 #define P(...) (p += p < e ? snprintf(p, e - p, __VA_ARGS__) : 0)
2553 2581
2554 2582 P("Master cpu_seqid,%d\n", CPU->cpu_seqid);
2555 2583 P("Master cpu_id,%d\n", CPU->cpu_id);
2556 2584 P("dump_flags,0x%x\n", dumphdr->dump_flags);
2557 2585 P("dump_ioerr,%d\n", dump_ioerr);
2558 2586
2559 2587 P("Helpers:\n");
2560 2588 for (i = 0; i < ncpus; i++) {
2561 2589 if ((i & 15) == 0)
2562 2590 P(",,%03d,", i);
2563 2591 if (i == myid)
2564 2592 P(" M");
2565 2593 else if (BT_TEST(cfg->helpermap, i))
2566 2594 P("%4d", cpu_seq[i]->cpu_id);
2567 2595 else
2568 2596 P(" *");
2569 2597 if ((i & 15) == 15)
2570 2598 P("\n");
2571 2599 }
2572 2600
2573 2601 P("ncbuf_used,%d\n", cfg->ncbuf_used);
2574 2602 P("ncmap,%d\n", cfg->ncmap);
2575 2603
2576 2604 P("Found %ldM ranges,%ld\n", (CBUF_MAPSIZE / DUMP_1MB), cfg->found4m);
2577 2605 P("Found small pages,%ld\n", cfg->foundsm);
2578 2606
2579 2607 P("Compression level,%d\n", cfg->clevel);
2580 - P("Compression type,%s %s\n", cfg->clevel == 0 ? "serial" : "parallel",
2608 + P("Compression type,%s %s", cfg->clevel == 0 ? "serial" : "parallel",
2581 2609 cfg->clevel >= DUMP_CLEVEL_BZIP2 ? "bzip2" : "lzjb");
2610 + if (cfg->clevel >= DUMP_CLEVEL_BZIP2)
2611 + P(" (level %d)\n", dump_bzip2_level);
2612 + else
2613 + P("\n");
2582 2614 P("Compression ratio,%d.%02d\n", compress_ratio / 100, compress_ratio %
2583 2615 100);
2584 2616 P("nhelper_used,%d\n", cfg->nhelper_used);
2585 2617
2586 2618 P("Dump I/O rate MBS,%d.%02d\n", iorate / 100, iorate % 100);
2587 2619 P("..total bytes,%lld\n", (u_longlong_t)ds->nwrite);
2588 2620 P("..total nsec,%lld\n", (u_longlong_t)ds->iotime);
2589 2621 P("dumpbuf.iosize,%ld\n", dumpbuf.iosize);
2590 2622 P("dumpbuf.size,%ld\n", dumpbuf.size);
2591 2623
2592 2624 P("Dump pages/sec,%llu\n", (u_longlong_t)ds->npages / sec);
2593 2625 P("Dump pages,%llu\n", (u_longlong_t)ds->npages);
2594 2626 P("Dump time,%d\n", sec);
2595 2627
2596 2628 if (ds->pages_mapped > 0)
2597 2629 P("per-cent map utilization,%d\n", (int)((100 * ds->pages_used)
2598 2630 / ds->pages_mapped));
2599 2631
2600 2632 P("\nPer-page metrics:\n");
2601 2633 if (ds->npages > 0) {
2602 2634 for (hp = cfg->helper; hp != hpend; hp++) {
2603 2635 #define PERPAGE(x) ds->perpage.x += hp->perpage.x;
2604 2636 PERPAGES;
2605 2637 #undef PERPAGE
2606 2638 }
2607 2639 #define PERPAGE(x) \
2608 2640 P("%s nsec/page,%d\n", #x, (int)(ds->perpage.x / ds->npages));
2609 2641 PERPAGES;
2610 2642 #undef PERPAGE
2611 2643 P("freebufq.empty,%d\n", (int)(ds->freebufq.empty /
2612 2644 ds->npages));
2613 2645 P("helperq.empty,%d\n", (int)(ds->helperq.empty /
2614 2646 ds->npages));
2615 2647 P("writerq.empty,%d\n", (int)(ds->writerq.empty /
2616 2648 ds->npages));
2617 2649 P("mainq.empty,%d\n", (int)(ds->mainq.empty / ds->npages));
2618 2650
2619 2651 P("I/O wait nsec/page,%llu\n", (u_longlong_t)(ds->iowait /
2620 2652 ds->npages));
2621 2653 }
2622 2654 #undef P
2623 2655 if (p < e)
2624 2656 bzero(p, e - p);
2625 2657 return (p - buf);
2626 2658 }
2627 2659 #endif /* COLLECT_METRICS */
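
Editor's note: the P() macro in dumpsys_metrics() is a bounded-append idiom — snprintf() returns the would-be length, and the p < e guard stops further writes once the buffer is exhausted. Shown in isolation below, with hypothetical field names.

	#include <stdio.h>

	int
	main(void)
	{
		char buf[64];
		char *p = buf;
		char *e = buf + sizeof (buf);

	#define	P(...) (p += p < e ? snprintf(p, e - p, __VA_ARGS__) : 0)
		P("pages,%d\n", 42);		/* hypothetical metrics */
		P("seconds,%d\n", 7);
	#undef	P

		/* p - buf is the length written (clamp in case of overrun) */
		(void) printf("%.*s", (int)((p < e ? p : e) - buf), buf);
		return (0);
	}
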
2628 2660
2629 2661 /*
2630 2662 * Dump the system.
2631 2663 */
2632 2664 void
2633 2665 dumpsys(void)
2634 2666 {
2635 2667 dumpsync_t *ds = &dumpsync;
2636 2668 taskq_t *livetaskq = NULL;
2637 2669 pfn_t pfn;
2638 2670 pgcnt_t bitnum;
2639 2671 proc_t *p;
2640 2672 helper_t *hp, *hpend = &dumpcfg.helper[dumpcfg.nhelper];
2641 2673 cbuf_t *cp;
2642 2674 pid_t npids, pidx;
2643 2675 char *content;
2644 2676 char *buf;
2645 2677 size_t size;
2646 2678 int save_dump_clevel;
2647 2679 dumpmlw_t mlw;
2648 2680 dumpcsize_t datatag;
2649 2681 dumpdatahdr_t datahdr;
2650 2682
2651 2683 if (dumpvp == NULL || dumphdr == NULL) {
2652 2684 uprintf("skipping system dump - no dump device configured\n");
2653 2685 if (panicstr) {
2654 2686 dumpcfg.helpers_wanted = 0;
2655 2687 dumpsys_spinunlock(&dumpcfg.helper_lock);
2656 2688 }
2657 2689 return;
2658 2690 }
2659 2691 dumpbuf.cur = dumpbuf.start;
2660 2692
2661 2693 /* clear the sync variables */
2662 2694 ASSERT(dumpcfg.nhelper > 0);
2663 2695 bzero(ds, sizeof (*ds));
2664 2696 ds->dumpcpu = CPU->cpu_id;
2665 2697
2666 2698 /*
2667 2699 * Calculate the starting block for dump. If we're dumping on a
2668 2700 * swap device, start 1/5 of the way in; otherwise, start at the
2669 2701 * beginning. And never use the first page -- it may be a disk label.
2670 2702 */
2671 2703 if (dumpvp->v_flag & VISSWAP)
2672 2704 dumphdr->dump_start = P2ROUNDUP(dumpvp_size / 5, DUMP_OFFSET);
2673 2705 else
2674 2706 dumphdr->dump_start = DUMP_OFFSET;
2675 2707
2676 2708 dumphdr->dump_flags = DF_VALID | DF_COMPLETE | DF_LIVE | DF_COMPRESSED;
2677 2709 dumphdr->dump_crashtime = gethrestime_sec();
2678 2710 dumphdr->dump_npages = 0;
2679 2711 dumphdr->dump_nvtop = 0;
2680 2712 bzero(dumpcfg.bitmap, BT_SIZEOFMAP(dumpcfg.bitmapsize));
2681 2713 dump_timeleft = dump_timeout;
2682 2714
2683 2715 if (panicstr) {
2684 2716 dumphdr->dump_flags &= ~DF_LIVE;
2685 2717 (void) VOP_DUMPCTL(dumpvp, DUMP_FREE, NULL, NULL);
2686 2718 (void) VOP_DUMPCTL(dumpvp, DUMP_ALLOC, NULL, NULL);
2687 2719 (void) vsnprintf(dumphdr->dump_panicstring, DUMP_PANICSIZE,
2688 2720 panicstr, panicargs);
2689 2721
2690 2722 }
2691 2723
2692 2724 if (dump_conflags & DUMP_ALL)
2693 2725 content = "all";
2694 2726 else if (dump_conflags & DUMP_CURPROC)
2695 2727 content = "kernel + curproc";
2696 2728 else
2697 2729 content = "kernel";
2698 2730 uprintf("dumping to %s, offset %lld, content: %s\n", dumppath,
2699 2731 dumphdr->dump_start, content);
2700 2732
2701 2733 /* Make sure nodename is current */
2702 2734 bcopy(utsname.nodename, dumphdr->dump_utsname.nodename, SYS_NMLN);
2703 2735
2704 2736 /*
2705 2737 * If this is a live dump, try to open a VCHR vnode for better
2706 2738 * performance. We must take care to flush the buffer cache
2707 2739 * first.
2708 2740 */
2709 2741 if (!panicstr) {
2710 2742 vnode_t *cdev_vp, *cmn_cdev_vp;
2711 2743
2712 2744 ASSERT(dumpbuf.cdev_vp == NULL);
2713 2745 cdev_vp = makespecvp(VTOS(dumpvp)->s_dev, VCHR);
2714 2746 if (cdev_vp != NULL) {
2715 2747 cmn_cdev_vp = common_specvp(cdev_vp);
2716 2748 if (VOP_OPEN(&cmn_cdev_vp, FREAD | FWRITE, kcred, NULL)
2717 2749 == 0) {
2718 2750 if (vn_has_cached_data(dumpvp))
2719 2751 (void) pvn_vplist_dirty(dumpvp, 0, NULL,
2720 2752 B_INVAL | B_TRUNC, kcred);
2721 2753 dumpbuf.cdev_vp = cmn_cdev_vp;
2722 2754 } else {
2723 2755 VN_RELE(cdev_vp);
2724 2756 }
2725 2757 }
2726 2758 }
2727 2759
2728 2760 /*
2729 2761 * Store a hires timestamp so we can look it up during debugging.
2730 2762 */
2731 2763 lbolt_debug_entry();
2732 2764
2733 2765 /*
2734 2766 * Leave room for the message and ereport save areas and terminal dump
2735 2767 * header.
2736 2768 */
2737 2769 dumpbuf.vp_limit = dumpvp_size - DUMP_LOGSIZE - DUMP_OFFSET -
2738 2770 DUMP_ERPTSIZE;
2739 2771
2740 2772 /*
2741 2773 * Write out the symbol table. It's no longer compressed,
2742 2774 * so its 'size' and 'csize' are equal.
2743 2775 */
2744 2776 dumpbuf.vp_off = dumphdr->dump_ksyms = dumphdr->dump_start + PAGESIZE;
2745 2777 dumphdr->dump_ksyms_size = dumphdr->dump_ksyms_csize =
2746 2778 ksyms_snapshot(dumpvp_ksyms_write, NULL, LONG_MAX);
2747 2779
2748 2780 /*
2749 2781 * Write out the translation map.
2750 2782 */
2751 2783 dumphdr->dump_map = dumpvp_flush();
2752 2784 dump_as(&kas);
2753 2785 dumphdr->dump_nvtop += dump_plat_addr();
2754 2786
2755 2787 /*
2756 2788 * call into hat, which may have unmapped pages that also need to
2757 2789 * be in the dump
2758 2790 */
2759 2791 hat_dump();
2760 2792
2761 2793 if (dump_conflags & DUMP_ALL) {
2762 2794 mutex_enter(&pidlock);
2763 2795
2764 2796 for (npids = 0, p = practive; p != NULL; p = p->p_next)
2765 2797 dumpcfg.pids[npids++] = p->p_pid;
2766 2798
2767 2799 mutex_exit(&pidlock);
2768 2800
2769 2801 for (pidx = 0; pidx < npids; pidx++)
2770 2802 (void) dump_process(dumpcfg.pids[pidx]);
2771 2803
2772 2804 dump_init_memlist_walker(&mlw);
2773 2805 for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum++) {
2774 2806 dump_timeleft = dump_timeout;
2775 2807 pfn = dump_bitnum_to_pfn(bitnum, &mlw);
2776 2808 /*
2777 2809 * Some hypervisors do not have all pages available to
2778 2810 * be accessed by the guest OS. Check for page
2779 2811 * accessibility.
2780 2812 */
2781 2813 if (plat_hold_page(pfn, PLAT_HOLD_NO_LOCK, NULL) !=
2782 2814 PLAT_HOLD_OK)
2783 2815 continue;
2784 2816 BT_SET(dumpcfg.bitmap, bitnum);
2785 2817 }
2786 2818 dumphdr->dump_npages = dumpcfg.bitmapsize;
2787 2819 dumphdr->dump_flags |= DF_ALL;
2788 2820
2789 2821 } else if (dump_conflags & DUMP_CURPROC) {
2790 2822 /*
2791 2823 * Determine which pid is to be dumped. If we're panicking, we
2792 2824 * dump the process associated with panic_thread (if any). If
2793 2825 * this is a live dump, we dump the process associated with
2794 2826 * curthread.
2795 2827 */
2796 2828 npids = 0;
2797 2829 if (panicstr) {
2798 2830 if (panic_thread != NULL &&
2799 2831 panic_thread->t_procp != NULL &&
2800 2832 panic_thread->t_procp != &p0) {
2801 2833 dumpcfg.pids[npids++] =
2802 2834 panic_thread->t_procp->p_pid;
2803 2835 }
2804 2836 } else {
2805 2837 dumpcfg.pids[npids++] = curthread->t_procp->p_pid;
2806 2838 }
2807 2839
2808 2840 if (npids && dump_process(dumpcfg.pids[0]) == 0)
2809 2841 dumphdr->dump_flags |= DF_CURPROC;
2810 2842 else
2811 2843 dumphdr->dump_flags |= DF_KERNEL;
2812 2844
2813 2845 } else {
2814 2846 dumphdr->dump_flags |= DF_KERNEL;
2815 2847 }
2816 2848
2817 2849 dumphdr->dump_hashmask = (1 << highbit(dumphdr->dump_nvtop - 1)) - 1;
2818 2850
2819 2851 /*
2820 2852 * Write out the pfn table.
2821 2853 */
2822 2854 dumphdr->dump_pfn = dumpvp_flush();
2823 2855 dump_init_memlist_walker(&mlw);
2824 2856 for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum++) {
2825 2857 dump_timeleft = dump_timeout;
2826 2858 if (!BT_TEST(dumpcfg.bitmap, bitnum))
2827 2859 continue;
2828 2860 pfn = dump_bitnum_to_pfn(bitnum, &mlw);
2829 2861 ASSERT(pfn != PFN_INVALID);
2830 2862 dumpvp_write(&pfn, sizeof (pfn_t));
2831 2863 }
2832 2864 dump_plat_pfn();
2833 2865
2834 2866 /*
2835 2867 * Write out all the pages.
2836 2868 * Map pages, copy them handling UEs, compress, and write them out.
2837 2869 * Cooperate with any helpers running on CPUs in panic_idle().
2838 2870 */
2839 2871 dumphdr->dump_data = dumpvp_flush();
2840 2872
2841 2873 bzero(dumpcfg.helpermap, BT_SIZEOFMAP(NCPU));
2842 2874 ds->live = dumpcfg.clevel > 0 &&
2843 2875 (dumphdr->dump_flags & DF_LIVE) != 0;
2844 2876
2845 2877 save_dump_clevel = dumpcfg.clevel;
2846 2878 if (panicstr)
2847 2879 dumpsys_get_maxmem();
2848 2880 else if (dumpcfg.clevel >= DUMP_CLEVEL_BZIP2)
2849 2881 dumpcfg.clevel = DUMP_CLEVEL_LZJB;
2850 2882
2851 2883 dumpcfg.nhelper_used = 0;
2852 2884 for (hp = dumpcfg.helper; hp != hpend; hp++) {
2853 2885 if (hp->page == NULL) {
2854 2886 hp->helper = DONEHELPER;
2855 2887 continue;
2856 2888 }
2857 2889 ++dumpcfg.nhelper_used;
2858 2890 hp->helper = FREEHELPER;
2859 2891 hp->taskqid = NULL;
2860 2892 hp->ds = ds;
2861 2893 bzero(&hp->perpage, sizeof (hp->perpage));
2862 2894 if (dumpcfg.clevel >= DUMP_CLEVEL_BZIP2)
2863 2895 (void) BZ2_bzCompressReset(&hp->bzstream);
2864 2896 }
2865 2897
2866 2898 CQ_OPEN(freebufq);
2867 2899 CQ_OPEN(helperq);
2868 2900
2869 2901 dumpcfg.ncbuf_used = 0;
2870 2902 for (cp = dumpcfg.cbuf; cp != &dumpcfg.cbuf[dumpcfg.ncbuf]; cp++) {
2871 2903 if (cp->buf != NULL) {
2872 2904 CQ_PUT(freebufq, cp, CBUF_FREEBUF);
2873 2905 ++dumpcfg.ncbuf_used;
2874 2906 }
2875 2907 }
2876 2908
2877 2909 for (cp = dumpcfg.cmap; cp != &dumpcfg.cmap[dumpcfg.ncmap]; cp++)
2878 2910 CQ_PUT(mainq, cp, CBUF_FREEMAP);
2879 2911
2880 2912 ds->start = gethrtime();
2881 2913 ds->iowaitts = ds->start;
2882 2914
2883 2915 /* start helpers */
2884 2916 if (ds->live) {
2885 2917 int n = dumpcfg.nhelper_used;
2886 2918 int pri = MINCLSYSPRI - 25;
2887 2919
2888 2920 livetaskq = taskq_create("LiveDump", n, pri, n, n,
2889 2921 TASKQ_PREPOPULATE);
2890 2922 for (hp = dumpcfg.helper; hp != hpend; hp++) {
2891 2923 if (hp->page == NULL)
2892 2924 continue;
2893 2925 hp->helper = hp - dumpcfg.helper;
2894 2926 hp->taskqid = taskq_dispatch(livetaskq,
2895 2927 dumpsys_live_helper, (void *)hp, TQ_NOSLEEP);
2896 2928 }
2897 2929
2898 2930 } else {
2899 2931 if (panicstr)
2900 2932 kmem_dump_begin();
2901 2933 dumpcfg.helpers_wanted = dumpcfg.clevel > 0;
2902 2934 dumpsys_spinunlock(&dumpcfg.helper_lock);
2903 2935 }
2904 2936
2905 2937 /* run main task */
2906 2938 dumpsys_main_task(ds);
2907 2939
2908 2940 ds->elapsed = gethrtime() - ds->start;
2909 2941 if (ds->elapsed < 1)
2910 2942 ds->elapsed = 1;
2911 2943
2912 2944 if (livetaskq != NULL)
2913 2945 taskq_destroy(livetaskq);
2914 2946
2915 2947 if (ds->neednl) {
2916 2948 uprintf("\n");
2917 2949 ds->neednl = 0;
2918 2950 }
2919 2951
2920 2952 /* record actual pages dumped */
2921 2953 dumphdr->dump_npages = ds->npages;
2922 2954
2923 2955 /* platform-specific data */
2924 2956 dumphdr->dump_npages += dump_plat_data(dumpcfg.cbuf[0].buf);
2925 2957
2926 2958 /* note any errors by clearing DF_COMPLETE */
2927 2959 if (dump_ioerr || ds->npages < dumphdr->dump_npages)
2928 2960 dumphdr->dump_flags &= ~DF_COMPLETE;
2929 2961
2930 2962 /* end of stream blocks */
2931 2963 datatag = 0;
2932 2964 dumpvp_write(&datatag, sizeof (datatag));
2933 2965
2934 2966 bzero(&datahdr, sizeof (datahdr));
2935 2967
2936 2968 /* buffer for metrics */
2937 2969 buf = dumpcfg.cbuf[0].buf;
2938 2970 size = MIN(dumpcfg.cbuf[0].size, DUMP_OFFSET - sizeof (dumphdr_t) -
2939 2971 sizeof (dumpdatahdr_t));
2940 2972
2941 2973 /* finish the kmem intercepts, collect kmem verbose info */
2942 2974 if (panicstr) {
2943 2975 datahdr.dump_metrics = kmem_dump_finish(buf, size);
2944 2976 buf += datahdr.dump_metrics;
2945 2977 size -= datahdr.dump_metrics;
2946 2978 }
2947 2979
2948 2980 /* record in the header whether this is a fault-management panic */
2949 2981 if (panicstr)
2950 2982 dumphdr->dump_fm_panic = is_fm_panic();
2951 2983
2952 2984 /* compression info in data header */
2953 2985 datahdr.dump_datahdr_magic = DUMP_DATAHDR_MAGIC;
2954 2986 datahdr.dump_datahdr_version = DUMP_DATAHDR_VERSION;
2955 2987 datahdr.dump_maxcsize = CBUF_SIZE;
2956 2988 datahdr.dump_maxrange = CBUF_MAPSIZE / PAGESIZE;
2957 2989 datahdr.dump_nstreams = dumpcfg.nhelper_used;
2958 2990 datahdr.dump_clevel = dumpcfg.clevel;
2959 2991 #ifdef COLLECT_METRICS
2960 2992 if (dump_metrics_on)
2961 2993 datahdr.dump_metrics += dumpsys_metrics(ds, buf, size);
2962 2994 #endif
2963 2995 datahdr.dump_data_csize = dumpvp_flush() - dumphdr->dump_data;
2964 2996
2965 2997 /*
2966 2998 * Write out the initial and terminal dump headers.
2967 2999 */
2968 3000 dumpbuf.vp_off = dumphdr->dump_start;
2969 3001 dumpvp_write(dumphdr, sizeof (dumphdr_t));
2970 3002 (void) dumpvp_flush();
2971 3003
2972 3004 dumpbuf.vp_limit = dumpvp_size;
2973 3005 dumpbuf.vp_off = dumpbuf.vp_limit - DUMP_OFFSET;
2974 3006 dumpvp_write(dumphdr, sizeof (dumphdr_t));
2975 3007 dumpvp_write(&datahdr, sizeof (dumpdatahdr_t));
2976 3008 dumpvp_write(dumpcfg.cbuf[0].buf, datahdr.dump_metrics);
2977 3009
2978 3010 (void) dumpvp_flush();
2979 3011
2980 3012 uprintf("\r%3d%% done: %llu pages dumped, ",
2981 3013 ds->percent_done, (u_longlong_t)ds->npages);
2982 3014
2983 3015 if (dump_ioerr == 0) {
2984 3016 uprintf("dump succeeded\n");
2985 3017 } else {
2986 3018 uprintf("dump failed: error %d\n", dump_ioerr);
2987 3019 #ifdef DEBUG
2988 3020 if (panicstr)
2989 3021 debug_enter("dump failed");
2990 3022 #endif
2991 3023 }
2992 3024
2993 3025 /*
2994 3026 * Write out all undelivered messages. This has to be the *last*
2995 3027 * thing we do because the dump process itself emits messages.
2996 3028 */
2997 3029 if (panicstr) {
2998 3030 dump_summary();
2999 3031 dump_ereports();
3000 3032 dump_messages();
3001 3033 }
3002 3034
3003 3035 delay(2 * hz); /* let people see the 'done' message */
3004 3036 dump_timeleft = 0;
3005 3037 dump_ioerr = 0;
3006 3038
3007 3039 /* restore settings after live dump completes */
3008 3040 if (!panicstr) {
3009 3041 dumpcfg.clevel = save_dump_clevel;
3010 3042
3011 3043 /* release any VCHR open of the dump device */
3012 3044 if (dumpbuf.cdev_vp != NULL) {
3013 3045 (void) VOP_CLOSE(dumpbuf.cdev_vp, FREAD | FWRITE, 1, 0,
3014 3046 kcred, NULL);
3015 3047 VN_RELE(dumpbuf.cdev_vp);
3016 3048 dumpbuf.cdev_vp = NULL;
3017 3049 }
3018 3050 }
3019 3051 }
3020 3052
3021 3053 /*
3022 3054 * This function is called whenever the memory size, as represented
3023 3055 * by the phys_install list, changes.
3024 3056 */
3025 3057 void
3026 3058 dump_resize()
3027 3059 {
3028 3060 mutex_enter(&dump_lock);
3029 3061 dumphdr_init();
3030 3062 dumpbuf_resize();
3031 3063 dump_update_clevel();
3032 3064 mutex_exit(&dump_lock);
3033 3065 }
3034 3066
3035 3067 /*
3036 3068 * This function allows for dynamic resizing of a dump area. It assumes that
3037 3069 * the underlying device has updated its size(9P) appropriately.
3038 3070 */
3039 3071 int
3040 3072 dumpvp_resize()
3041 3073 {
3042 3074 int error;
3043 3075 vattr_t vattr;
3044 3076
3045 3077 mutex_enter(&dump_lock);
3046 3078 vattr.va_mask = AT_SIZE;
3047 3079 if ((error = VOP_GETATTR(dumpvp, &vattr, 0, kcred, NULL)) != 0) {
3048 3080 mutex_exit(&dump_lock);
3049 3081 return (error);
3050 3082 }
3051 3083
3052 3084 if (error == 0 && vattr.va_size < 2 * DUMP_LOGSIZE + DUMP_ERPTSIZE) {
3053 3085 mutex_exit(&dump_lock);
3054 3086 return (ENOSPC);
3055 3087 }
3056 3088
3057 3089 dumpvp_size = vattr.va_size & -DUMP_OFFSET;
3058 3090 mutex_exit(&dump_lock);
3059 3091 return (0);
3060 3092 }
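
Editor's note: the va_size & -DUMP_OFFSET expression above truncates the device size down to a multiple of DUMP_OFFSET, which works only because DUMP_OFFSET is a power of two. A quick demonstration with a placeholder value:

	#include <stdio.h>
	#include <stdint.h>

	int
	main(void)
	{
		/* Placeholder; the real constant comes from <sys/dumphdr.h>. */
		const uint64_t DUMP_OFFSET = 1ULL << 16;
		uint64_t va_size = (1ULL << 30) + 12345;	/* odd-sized device */

		/* For a power-of-two D, x & -D clears the low log2(D) bits of x. */
		(void) printf("0x%llx -> 0x%llx\n",
		    (unsigned long long)va_size,
		    (unsigned long long)(va_size & -DUMP_OFFSET));
		return (0);	/* prints: 0x40003039 -> 0x40000000 */
	}
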
3061 3093
3062 3094 int
3063 3095 dump_set_uuid(const char *uuidstr)
3064 3096 {
3065 3097 const char *ptr;
3066 3098 int i;
3067 3099
3068 3100 if (uuidstr == NULL || strnlen(uuidstr, 36 + 1) != 36)
3069 3101 return (EINVAL);
3070 3102
3071 3103 /* uuid_parse is not common code so check manually */
3072 3104 for (i = 0, ptr = uuidstr; i < 36; i++, ptr++) {
3073 3105 switch (i) {
3074 3106 case 8:
3075 3107 case 13:
3076 3108 case 18:
3077 3109 case 23:
3078 3110 if (*ptr != '-')
3079 3111 return (EINVAL);
3080 3112 break;
3081 3113
3082 3114 default:
3083 3115 if (!isxdigit(*ptr))
3084 3116 return (EINVAL);
3085 3117 break;
3086 3118 }
3087 3119 }
3088 3120
3089 3121 if (dump_osimage_uuid[0] != '\0')
3090 3122 return (EALREADY);
3091 3123
3092 3124 (void) strncpy(dump_osimage_uuid, uuidstr, 36 + 1);
3093 3125
3094 3126 cmn_err(CE_CONT, "?This Solaris instance has UUID %s\n",
3095 3127 dump_osimage_uuid);
3096 3128
3097 3129 return (0);
3098 3130 }
3099 3131
3100 3132 const char *
3101 3133 dump_get_uuid(void)
3102 3134 {
3103 3135 return (dump_osimage_uuid[0] != '\0' ? dump_osimage_uuid : "");
3104 3136 }