/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2018 Joyent, Inc.
 * Copyright 2018 Nexenta Systems, Inc. All rights reserved.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/vm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/conf.h>
#include <sys/kmem.h>
#include <sys/mem.h>
#include <sys/mman.h>
#include <sys/vnode.h>
#include <sys/errno.h>
#include <sys/memlist.h>
#include <sys/dumphdr.h>
#include <sys/dumpadm.h>
#include <sys/ksyms.h>
#include <sys/compress.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/cmn_err.h>
#include <sys/bitmap.h>
#include <sys/modctl.h>
#include <sys/utsname.h>
#include <sys/systeminfo.h>
#include <sys/vmem.h>
#include <sys/log.h>
#include <sys/var.h>
#include <sys/debug.h>
#include <sys/sunddi.h>
#include <fs/fs_subr.h>
#include <sys/fs/snode.h>
#include <sys/ontrap.h>
#include <sys/panic.h>
#include <sys/dkio.h>
#include <sys/vtoc.h>
#include <sys/errorq.h>
#include <sys/fm/util.h>
#include <sys/fs/zfs.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <sys/clock_impl.h>
#include <sys/hold_page.h>
#include <sys/cpu.h>

#define	ONE_GIG		(1024 * 1024 * 1024UL)

/*
 * Parallel Dump:
 * CPUs that are otherwise idle during panic are employed to parallelize
 * the compression task. I/O and compression are performed by different
 * CPUs, and are hence overlapped in time, unlike the older serial code.
 */

/*
 * exported vars
 */
kmutex_t	dump_lock;		/* lock for dump configuration */
dumphdr_t	*dumphdr;		/* dump header */
int		dump_conflags = DUMP_KERNEL; /* dump configuration flags */
vnode_t		*dumpvp;		/* dump device vnode pointer */
u_offset_t	dumpvp_size;		/* size of dump device, in bytes */
char		*dumppath;		/* pathname of dump device */
int		dump_timeout = 120;	/* timeout for dumping pages */
int		dump_timeleft;		/* portion of dump_timeout remaining */
int		dump_ioerr;		/* dump i/o error */
int		dump_check_used;	/* enable check for used pages */
char		*dump_stack_scratch;	/* scratch area for saving stack summary */

/*
 * Tunables for dump compression and parallelism.
 * These can be set via /etc/system.
 *
 * dump_ncpu_low:
 *	This is the minimum configuration for parallel lzjb.
 *	A special value of 0 means that parallel dump will not be used.
 *
 * dump_metrics_on:
 *	If set, metrics are collected in the kernel, passed to savecore
 *	via the dump file, and recorded by savecore in METRICS.txt.
 */
uint_t dump_ncpu_low = 4;	/* minimum config for parallel lzjb */
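
/*
 * For example, parallel dump could be disabled, or the metrics report
 * enabled, from /etc/system (illustrative values, not the defaults):
 *
 *	set dump_ncpu_low = 0
 *	set dump_metrics_on = 1
 */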

/* tunables for pre-reserved heap */
uint_t dump_kmem_permap = 1024;
uint_t dump_kmem_pages = 0;

/* Define multiple buffers per helper to avoid stalling */
#define	NCBUF_PER_HELPER	2
#define	NCMAP_PER_HELPER	4

/* minimum number of helpers configured */
#define	MINHELPERS	(MAX(dump_ncpu_low, 1))
#define	MINCBUFS	(MINHELPERS * NCBUF_PER_HELPER)

/*
 * Define constant parameters.
 *
 * CBUF_SIZE		size of an output buffer
 *
 * CBUF_MAPSIZE		size of virtual range for mapping pages
 *
 * CBUF_MAPNP		size of virtual range in pages
 *
 */
#define	DUMP_1KB	((size_t)1 << 10)
#define	DUMP_1MB	((size_t)1 << 20)
#define	CBUF_SIZE	((size_t)1 << 17)
#define	CBUF_MAPSHIFT	(22)
#define	CBUF_MAPSIZE	((size_t)1 << CBUF_MAPSHIFT)
#define	CBUF_MAPNP	((size_t)1 << (CBUF_MAPSHIFT - PAGESHIFT))

/*
 * Compression metrics are accumulated nanosecond subtotals. The
 * results are normalized by the number of pages dumped. A report is
 * generated when dumpsys() completes and is saved in the dump image
 * after the trailing dump header.
 *
 * Metrics are always collected. Set the variable dump_metrics_on to
 * cause metrics to be saved in the crash file, where savecore will
 * save them in the file METRICS.txt.
 */
#define	PERPAGES \
	PERPAGE(bitmap) PERPAGE(map) PERPAGE(unmap) \
	PERPAGE(copy) PERPAGE(compress) \
	PERPAGE(write) \
	PERPAGE(inwait) PERPAGE(outwait)

typedef struct perpage {
#define	PERPAGE(x) hrtime_t x;
	PERPAGES
#undef PERPAGE
} perpage_t;

/*
 * This macro controls the code generation for collecting dump
 * performance information. By default, the code is generated, but
 * automatic saving of the information is disabled. If dump_metrics_on
 * is set to 1, the timing information is passed to savecore via the
 * crash file, where it is appended to the file dump-dir/METRICS.txt.
 */
#define	COLLECT_METRICS

#ifdef COLLECT_METRICS
uint_t dump_metrics_on = 0;	/* set to 1 to enable recording metrics */

#define	HRSTART(v, m)		v##ts.m = gethrtime()
#define	HRSTOP(v, m)		v.m += gethrtime() - v##ts.m
#define	HRBEGIN(v, m, s)	v##ts.m = gethrtime(); v.size += s
#define	HREND(v, m)		v.m += gethrtime() - v##ts.m
#define	HRNORM(v, m, n)		v.m /= (n)

#else
#define	HRSTART(v, m)
#define	HRSTOP(v, m)
#define	HRBEGIN(v, m, s)
#define	HREND(v, m)
#define	HRNORM(v, m, n)
#endif	/* COLLECT_METRICS */
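
/*
 * A minimal sketch of how the timing macros are used below: bracket an
 * operation with HRSTART/HRSTOP against a perpage_t and one of the
 * PERPAGES members, e.g.
 *
 *	HRSTART(ds->perpage, write);
 *	dumpvp_write(cp->buf, cp->used);
 *	HRSTOP(ds->perpage, write);
 *
 * The ## paste in HRSTART means the start timestamp lands in the "ts"
 * twin of the struct (here ds->perpagets); HRNORM later divides the
 * accumulated totals by the number of pages dumped.
 */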

/*
 * Buffers for copying and compressing memory pages.
 *
 * cbuf_t buffer controllers: used for both input and output.
 *
 * The buffer state indicates how it is being used:
 *
 * CBUF_FREEMAP: CBUF_MAPSIZE virtual address range is available for
 * mapping input pages.
 *
 * CBUF_INREADY: input pages are mapped and ready for compression by a
 * helper.
 *
 * CBUF_USEDMAP: mapping has been consumed by a helper. Needs unmap.
 *
 * CBUF_FREEBUF: CBUF_SIZE output buffer, which is available.
 *
 * CBUF_WRITE: CBUF_SIZE block of compressed pages from a helper,
 * ready to write out.
 *
 * CBUF_ERRMSG: CBUF_SIZE block of error messages from a helper
 * (reports UE errors).
 */

typedef enum cbufstate {
	CBUF_FREEMAP,
	CBUF_INREADY,
	CBUF_USEDMAP,
	CBUF_FREEBUF,
	CBUF_WRITE,
	CBUF_ERRMSG
} cbufstate_t;

typedef struct cbuf cbuf_t;

struct cbuf {
	cbuf_t *next;			/* next in list */
	cbufstate_t state;		/* processing state */
	size_t used;			/* amount used */
	size_t size;			/* mem size */
	char *buf;			/* kmem or vmem */
	pgcnt_t pagenum;		/* index to pfn map */
	pgcnt_t bitnum;			/* first set bitnum */
	pfn_t pfn;			/* first pfn in mapped range */
	int off;			/* byte offset to first pfn */
};

static char dump_osimage_uuid[36 + 1];

#define	isdigit(ch)	((ch) >= '0' && (ch) <= '9')
#define	isxdigit(ch)	(isdigit(ch) || ((ch) >= 'a' && (ch) <= 'f') || \
			((ch) >= 'A' && (ch) <= 'F'))

/*
 * cqueue_t queues: a uni-directional channel for communication
 * from the master to helper tasks or vice-versa using put and
 * get primitives. Both mappings and data buffers are passed via
 * queues. Producers close a queue when done. The number of
 * active producers is reference counted so the consumer can
 * detect end of data. Concurrent access is mediated by atomic
 * operations for panic dump, or mutex/cv for live dump.
 *
 * There are four queues, used as follows:
 *
 * Queue	Dataflow		NewState
 * --------------------------------------------------
 * mainq	master -> master	FREEMAP
 *	master has initialized or unmapped an input buffer
 * --------------------------------------------------
 * helperq	master -> helper	INREADY
 *	master has mapped input for use by helper
 * --------------------------------------------------
 * mainq	master <- helper	USEDMAP
 *	helper is done with input
 * --------------------------------------------------
 * freebufq	master -> helper	FREEBUF
 *	master has initialized or written an output buffer
 * --------------------------------------------------
 * mainq	master <- helper	WRITE
 *	block of compressed pages from a helper
 * --------------------------------------------------
 * mainq	master <- helper	ERRMSG
 *	error messages from a helper (memory error case)
 * --------------------------------------------------
 * writerq	master <- master	WRITE
 *	non-blocking queue of blocks to write
 * --------------------------------------------------
 */
typedef struct cqueue {
	cbuf_t *volatile first;		/* first in list */
	cbuf_t *last;			/* last in list */
	hrtime_t ts;			/* timestamp */
	hrtime_t empty;			/* total time empty */
	kmutex_t mutex;			/* live state lock */
	kcondvar_t cv;			/* live wait var */
	lock_t spinlock;		/* panic mode spin lock */
	volatile uint_t open;		/* producer ref count */
} cqueue_t;

/*
 * Convenience macros for using the cqueue functions
 * Note that the caller must have defined "dumpsync_t *ds"
 */
#define	CQ_IS_EMPTY(q) \
	(ds->q.first == NULL)

#define	CQ_OPEN(q) \
	atomic_inc_uint(&ds->q.open)

#define	CQ_CLOSE(q) \
	dumpsys_close_cq(&ds->q, ds->live)

#define	CQ_PUT(q, cp, st) \
	dumpsys_put_cq(&ds->q, cp, st, ds->live)

#define	CQ_GET(q) \
	dumpsys_get_cq(&ds->q, ds->live)
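
/*
 * A minimal sketch of the queue protocol as the helpers use it below
 * (see dumpsys_sread() and dumpsys_lzjbcompress() for the real thing):
 *
 *	cbuf_t *cp = CQ_GET(helperq);	 wait for a mapped input buffer
 *	... compress the pages in cp ...
 *	CQ_PUT(mainq, cp, CBUF_USEDMAP); hand it back for unmapping
 *
 * CQ_GET() returns NULL only once every producer has called CQ_CLOSE()
 * and the queue has drained; that is how consumers detect end of data.
 */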

/*
 * Dynamic state when dumpsys() is running.
 */
typedef struct dumpsync {
	pgcnt_t npages;			/* subtotal of pages dumped */
	pgcnt_t pages_mapped;		/* subtotal of pages mapped */
	pgcnt_t pages_used;		/* subtotal of pages used per map */
	size_t nwrite;			/* subtotal of bytes written */
	uint_t live;			/* running live dump */
	uint_t neednl;			/* will need to print a newline */
	uint_t percent;			/* dump progress */
	uint_t percent_done;		/* dump progress reported */
	int sec_done;			/* dump progress last report time */
	cqueue_t freebufq;		/* free kmem bufs for writing */
	cqueue_t mainq;			/* input for main task */
	cqueue_t helperq;		/* input for helpers */
	cqueue_t writerq;		/* input for writer */
	hrtime_t start;			/* start time */
	hrtime_t elapsed;		/* elapsed time when completed */
	hrtime_t iotime;		/* time spent writing nwrite bytes */
	hrtime_t iowait;		/* time spent waiting for output */
	hrtime_t iowaitts;		/* iowait timestamp */
	perpage_t perpage;		/* metrics */
	perpage_t perpagets;
	int dumpcpu;			/* master cpu */
} dumpsync_t;

static dumpsync_t dumpsync;		/* synchronization vars */

/*
 * helper_t helpers: contains the context for a stream. CPUs run in
 * parallel at dump time; each CPU creates a single stream of
 * compression data. Stream data is divided into CBUF_SIZE blocks.
 * The blocks are written in order within a stream. But, blocks from
 * multiple streams can be interleaved. Each stream is identified by a
 * unique tag.
 */
typedef struct helper {
	int helper;			/* bound helper id */
	int tag;			/* compression stream tag */
	perpage_t perpage;		/* per page metrics */
	perpage_t perpagets;		/* per page metrics (timestamps) */
	taskqid_t taskqid;		/* live dump task ptr */
	int in, out;			/* buffer offsets */
	cbuf_t *cpin, *cpout, *cperr;	/* cbuf objects in process */
	dumpsync_t *ds;			/* pointer to sync vars */
	size_t used;			/* counts input consumed */
	char *page;			/* buffer for page copy */
	char *lzbuf;			/* lzjb output */
} helper_t;

#define	MAINHELPER	(-1)		/* helper is also the main task */
#define	FREEHELPER	(-2)		/* unbound helper */
#define	DONEHELPER	(-3)		/* helper finished */

/*
 * configuration vars for dumpsys
 */
typedef struct dumpcfg {
	int nhelper;		/* number of helpers */
	int nhelper_used;	/* actual number of helpers used */
	int ncmap;		/* number VA pages for compression */
	int ncbuf;		/* number of bufs for compression */
	int ncbuf_used;		/* number of bufs in use */
	uint_t clevel;		/* dump compression level */
	helper_t *helper;	/* array of helpers */
	cbuf_t *cmap;		/* array of input (map) buffers */
	cbuf_t *cbuf;		/* array of output buffers */
	ulong_t *helpermap;	/* set of dumpsys helper CPU ids */
	ulong_t *bitmap;	/* bitmap for marking pages to dump */
	ulong_t *rbitmap;	/* bitmap for used CBUF_MAPSIZE ranges */
	pgcnt_t bitmapsize;	/* size of bitmap */
	pgcnt_t rbitmapsize;	/* size of bitmap for ranges */
	pgcnt_t found4m;	/* number ranges allocated by dump */
	pgcnt_t foundsm;	/* number small pages allocated by dump */
	pid_t *pids;		/* list of process IDs at dump time */
	size_t maxsize;		/* memory size needed at dump time */
	size_t maxvmsize;	/* size of reserved VM */
	char *maxvm;		/* reserved VM for spare pages */
	lock_t helper_lock;	/* protect helper state */
	char helpers_wanted;	/* flag to enable parallelism */
} dumpcfg_t;

static dumpcfg_t dumpcfg;	/* config vars */
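
/*
 * Worked example of the sizing that dump_update_clevel() performs
 * below, assuming an 8-CPU system with dump_ncpu_low at its default
 * of 4: nhelper = ncpus - 1 = 7, ncbuf = NCBUF_PER_HELPER * 7 = 14
 * output buffers (128KB each), and ncmap = NCMAP_PER_HELPER * 7 = 28
 * input map ranges (4MB of VA each). Only MINHELPERS (4) helpers and
 * MINCBUFS (8) output buffers are preallocated; the rest are backed
 * by spare pages found at panic time by dumpsys_get_maxmem().
 */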

/*
 * The dump I/O buffer.
 *
 * There is one I/O buffer used by dumpvp_write and dumpvp_flush. It is
 * sized according to the optimum device transfer speed.
 */
typedef struct dumpbuf {
	vnode_t	*cdev_vp;	/* VCHR open of the dump device */
	len_t	vp_limit;	/* maximum write offset */
	offset_t vp_off;	/* current dump device offset */
	char	*cur;		/* dump write pointer */
	char	*start;		/* dump buffer address */
	char	*end;		/* dump buffer end */
	size_t	size;		/* size of dumpbuf in bytes */
	size_t	iosize;		/* best transfer size for device */
} dumpbuf_t;

dumpbuf_t dumpbuf;		/* I/O buffer */

/*
 * For parallel dump, defines maximum time main task thread will wait
 * for at least one helper to register in dumpcfg.helpermap, before
 * assuming there are no helpers and falling back to serial mode.
 * The value is chosen arbitrarily and provides a *really* long wait
 * for any available helper to register.
 */
#define	DUMP_HELPER_MAX_WAIT	1000	/* millisec */

/*
 * The dump I/O buffer must be at least one page, at most xfer_size
 * bytes, and should scale with physmem in between. The transfer size
 * passed in will either represent a global default (maxphys) or the
 * best size for the device. The size of the dumpbuf I/O buffer is
 * limited by dumpbuf_limit (8MB by default) because the dump
 * performance saturates beyond a certain size. The default is to
 * select 1/4096 of the memory.
 */
static int	dumpbuf_fraction = 12;	/* memory size scale factor */
static size_t	dumpbuf_limit = 8 * DUMP_1MB;	/* max I/O buf size */

static size_t
dumpbuf_iosize(size_t xfer_size)
{
	size_t iosize = ptob(physmem >> dumpbuf_fraction);

	if (iosize < PAGESIZE)
		iosize = PAGESIZE;
	else if (iosize > xfer_size)
		iosize = xfer_size;
	if (iosize > dumpbuf_limit)
		iosize = dumpbuf_limit;
	return (iosize & PAGEMASK);
}

/*
 * resize the I/O buffer
 */
static void
dumpbuf_resize(void)
{
	char *old_buf = dumpbuf.start;
	size_t old_size = dumpbuf.size;
	char *new_buf;
	size_t new_size;

	ASSERT(MUTEX_HELD(&dump_lock));

	new_size = dumpbuf_iosize(MAX(dumpbuf.iosize, maxphys));
	if (new_size <= old_size)
		return; /* no need to reallocate buffer */

	new_buf = kmem_alloc(new_size, KM_SLEEP);
	dumpbuf.size = new_size;
	dumpbuf.start = new_buf;
	dumpbuf.end = new_buf + new_size;
	kmem_free(old_buf, old_size);
}
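
/*
 * Worked example of the sizing above, assuming 4KB pages: with 16GB
 * of physmem (4M pages), ptob(4M >> 12) selects a 4MB buffer, which
 * is 1/4096 of memory and is under the 8MB dumpbuf_limit cap. A 64GB
 * machine would compute 16MB and be clamped to 8MB.
 */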

/*
 * dump_update_clevel is called when dumpadm configures the dump device.
 * Determine the compression level / type
 * - DUMP_CLEVEL_SERIAL is single threaded lzjb
 * - DUMP_CLEVEL_LZJB is parallel lzjb
 * Calculate number of helpers and buffers.
 * Allocate the minimum configuration for now.
 *
 * When the dump file is configured we reserve a minimum amount of
 * memory for use at crash time. But we reserve VA for all the memory
 * we really want in order to do the fastest dump possible. The VA is
 * backed by pages not being dumped, according to the bitmap. If
 * there is insufficient spare memory, however, we fall back to the
 * minimum.
 *
 * Live dump (savecore -L) always uses the minimum config.
 *
 * For parallel dumps, the number of helpers is ncpu-1. The CPU
 * running panic runs the main task. For single-threaded dumps, the
 * panic CPU does lzjb compression (it is tagged as MAINHELPER).
 *
 * Need multiple buffers per helper so that they do not block waiting
 * for the main task.
 *				parallel	single-threaded
 * Number of output buffers:	nhelper*2	1
 * Number of mapping buffers:	nhelper*4	1
 *
 */
static void
dump_update_clevel()
{
	int tag;
	helper_t *hp, *hpend;
	cbuf_t *cp, *cpend;
	dumpcfg_t *old = &dumpcfg;
	dumpcfg_t newcfg = *old;
	dumpcfg_t *new = &newcfg;

	ASSERT(MUTEX_HELD(&dump_lock));

	/*
	 * Free the previously allocated bufs and VM.
	 */
	if (old->helper != NULL) {

		/* helpers */
		hpend = &old->helper[old->nhelper];
		for (hp = old->helper; hp != hpend; hp++) {
			if (hp->lzbuf != NULL)
				kmem_free(hp->lzbuf, PAGESIZE);
			if (hp->page != NULL)
				kmem_free(hp->page, PAGESIZE);
		}
		kmem_free(old->helper, old->nhelper * sizeof (helper_t));

		/* VM space for mapping pages */
		cpend = &old->cmap[old->ncmap];
		for (cp = old->cmap; cp != cpend; cp++)
			vmem_xfree(heap_arena, cp->buf, CBUF_MAPSIZE);
		kmem_free(old->cmap, old->ncmap * sizeof (cbuf_t));

		/* output bufs */
		cpend = &old->cbuf[old->ncbuf];
		for (cp = old->cbuf; cp != cpend; cp++)
			if (cp->buf != NULL)
				kmem_free(cp->buf, cp->size);
		kmem_free(old->cbuf, old->ncbuf * sizeof (cbuf_t));

		/* reserved VM for dumpsys_get_maxmem */
		if (old->maxvmsize > 0)
			vmem_xfree(heap_arena, old->maxvm, old->maxvmsize);
	}

	/*
	 * Allocate memory and VM.
	 * One CPU runs dumpsys, the rest are helpers.
	 */
	new->nhelper = ncpus - 1;
	if (new->nhelper < 1)
		new->nhelper = 1;

	if (new->nhelper > DUMP_MAX_NHELPER)
		new->nhelper = DUMP_MAX_NHELPER;

	/* If dump_ncpu_low is 0 or greater than ncpus, do serial dump */
	if (dump_ncpu_low == 0 || dump_ncpu_low > ncpus || new->nhelper < 2) {
		new->clevel = DUMP_CLEVEL_SERIAL;
		new->nhelper = 1;
		new->ncbuf = 1;
		new->ncmap = 1;
	} else {
		new->clevel = DUMP_CLEVEL_LZJB;
		new->ncbuf = NCBUF_PER_HELPER * new->nhelper;
		new->ncmap = NCMAP_PER_HELPER * new->nhelper;
	}

	/*
	 * Allocate new data structures and buffers for MINHELPERS,
	 * and also figure the max desired size.
	 */
	new->maxsize = 0;
	new->maxvmsize = 0;
	new->maxvm = NULL;
	tag = 1;
	new->helper = kmem_zalloc(new->nhelper * sizeof (helper_t), KM_SLEEP);
	hpend = &new->helper[new->nhelper];
	for (hp = new->helper; hp != hpend; hp++) {
		hp->tag = tag++;
		if (hp < &new->helper[MINHELPERS]) {
			hp->lzbuf = kmem_alloc(PAGESIZE, KM_SLEEP);
			hp->page = kmem_alloc(PAGESIZE, KM_SLEEP);
		} else {
			new->maxsize += 2 * PAGESIZE;
		}
	}

	new->cbuf = kmem_zalloc(new->ncbuf * sizeof (cbuf_t), KM_SLEEP);
	cpend = &new->cbuf[new->ncbuf];
	for (cp = new->cbuf; cp != cpend; cp++) {
		cp->state = CBUF_FREEBUF;
		cp->size = CBUF_SIZE;
		if (cp < &new->cbuf[MINCBUFS])
			cp->buf = kmem_alloc(cp->size, KM_SLEEP);
		else
			new->maxsize += cp->size;
	}

	new->cmap = kmem_zalloc(new->ncmap * sizeof (cbuf_t), KM_SLEEP);
	cpend = &new->cmap[new->ncmap];
	for (cp = new->cmap; cp != cpend; cp++) {
		cp->state = CBUF_FREEMAP;
		cp->size = CBUF_MAPSIZE;
		cp->buf = vmem_xalloc(heap_arena, CBUF_MAPSIZE, CBUF_MAPSIZE,
		    0, 0, NULL, NULL, VM_SLEEP);
	}

	/* reserve VA to be backed with spare pages at crash time */
	if (new->maxsize > 0) {
		new->maxsize = P2ROUNDUP(new->maxsize, PAGESIZE);
		new->maxvmsize = P2ROUNDUP(new->maxsize, CBUF_MAPSIZE);
		new->maxvm = vmem_xalloc(heap_arena, new->maxvmsize,
		    CBUF_MAPSIZE, 0, 0, NULL, NULL, VM_SLEEP);
	}

	/*
	 * Reserve memory for kmem allocation calls made during crash dump. The
	 * hat layer allocates memory for each mapping created, and the I/O path
	 * allocates buffers and data structs.
	 *
	 * On larger systems, we easily exceed the lower amount, so we need some
	 * more space; the cut-over point is relatively arbitrary. If we run
	 * out, the only impact is that kmem state in the dump becomes
	 * inconsistent.
	 */

	if (dump_kmem_pages == 0) {
		if (physmem > (16 * ONE_GIG) / PAGESIZE)
			dump_kmem_pages = 20;
		else
			dump_kmem_pages = 8;
	}

	kmem_dump_init((new->ncmap * dump_kmem_permap) +
	    (dump_kmem_pages * PAGESIZE));

	/* set new config pointers */
	*old = *new;
}

/*
 * Define a struct memlist walker to optimize bitnum to pfn
 * lookup. The walker maintains the state of the list traversal.
 */
typedef struct dumpmlw {
	struct memlist *mp;		/* current memlist */
	pgcnt_t basenum;		/* bitnum base offset */
	pgcnt_t mppages;		/* current memlist size */
	pgcnt_t mpleft;			/* size to end of current memlist */
	pfn_t mpaddr;			/* first pfn in memlist */
} dumpmlw_t;

/* initialize the walker */
static inline void
dump_init_memlist_walker(dumpmlw_t *pw)
{
	pw->mp = phys_install;
	pw->basenum = 0;
	pw->mppages = pw->mp->ml_size >> PAGESHIFT;
	pw->mpleft = pw->mppages;
	pw->mpaddr = pw->mp->ml_address >> PAGESHIFT;
}
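
/*
 * A minimal usage sketch (this is the pattern dumpsys_get_maxmem()
 * follows below): initialize a walker once, then feed it
 * monotonically increasing bitnums, so each lookup resumes from the
 * current memlist instead of rescanning from the head:
 *
 *	dumpmlw_t mlw;
 *
 *	dump_init_memlist_walker(&mlw);
 *	for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum++)
 *		pfn = dump_bitnum_to_pfn(bitnum, &mlw);
 */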

/*
 * Lookup pfn given bitnum. The memlist can be quite long on some
 * systems (e.g.: one per board). To optimize sequential lookups, the
 * caller initializes and presents a memlist walker.
 */
static pfn_t
dump_bitnum_to_pfn(pgcnt_t bitnum, dumpmlw_t *pw)
{
	bitnum -= pw->basenum;
	while (pw->mp != NULL) {
		if (bitnum < pw->mppages) {
			pw->mpleft = pw->mppages - bitnum;
			return (pw->mpaddr + bitnum);
		}
		bitnum -= pw->mppages;
		pw->basenum += pw->mppages;
		pw->mp = pw->mp->ml_next;
		if (pw->mp != NULL) {
			pw->mppages = pw->mp->ml_size >> PAGESHIFT;
			pw->mpleft = pw->mppages;
			pw->mpaddr = pw->mp->ml_address >> PAGESHIFT;
		}
	}
	return (PFN_INVALID);
}

static pgcnt_t
dump_pfn_to_bitnum(pfn_t pfn)
{
	struct memlist *mp;
	pgcnt_t bitnum = 0;

	for (mp = phys_install; mp != NULL; mp = mp->ml_next) {
		if (pfn >= (mp->ml_address >> PAGESHIFT) &&
		    pfn < ((mp->ml_address + mp->ml_size) >> PAGESHIFT))
			return (bitnum + pfn - (mp->ml_address >> PAGESHIFT));
		bitnum += mp->ml_size >> PAGESHIFT;
	}
	return ((pgcnt_t)-1);
}

/*
 * Set/test bitmap for a CBUF_MAPSIZE range which includes pfn. The
 * mapping of pfn to range index is imperfect because pfn and bitnum
 * do not have the same phase. To make sure a CBUF_MAPSIZE range is
 * covered, call this for both ends:
 *	dump_set_used(base)
 *	dump_set_used(base+CBUF_MAPNP-1)
 *
 * This is used during a panic dump to mark pages allocated by
 * dumpsys_get_maxmem(). The macro IS_DUMP_PAGE(pp) is used by
 * page_get_mnode_freelist() to make sure pages used by dump are never
 * allocated.
 */
#define	CBUF_MAPP2R(pfn)	((pfn) >> (CBUF_MAPSHIFT - PAGESHIFT))

static void
dump_set_used(pfn_t pfn)
{
	pgcnt_t bitnum, rbitnum;

	bitnum = dump_pfn_to_bitnum(pfn);
	ASSERT(bitnum != (pgcnt_t)-1);

	rbitnum = CBUF_MAPP2R(bitnum);
	ASSERT(rbitnum < dumpcfg.rbitmapsize);

	BT_SET(dumpcfg.rbitmap, rbitnum);
}

int
dump_test_used(pfn_t pfn)
{
	pgcnt_t bitnum, rbitnum;

	bitnum = dump_pfn_to_bitnum(pfn);
	ASSERT(bitnum != (pgcnt_t)-1);

	rbitnum = CBUF_MAPP2R(bitnum);
	ASSERT(rbitnum < dumpcfg.rbitmapsize);

	return (BT_TEST(dumpcfg.rbitmap, rbitnum));
}
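
/*
 * Arithmetic note, assuming 4KB pages (PAGESHIFT == 12): with
 * CBUF_MAPSHIFT at 22, CBUF_MAPP2R shifts a bitnum right by 10, so
 * each rbitmap bit stands for one 1024-page (4MB) range. Because a
 * run of pfns can straddle two bitnum ranges, marking both ends as
 * described above is what guarantees full coverage.
 */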

/*
 * Perform additional checks on the page to see if we can really use
 * it. The kernel (kas) pages are always set in the bitmap. However,
 * boot memory pages (prom_ppages or P_BOOTPAGES) are not in the
 * bitmap. So we check for them.
 */
static inline int
dump_pfn_check(pfn_t pfn)
{
	page_t *pp = page_numtopp_nolock(pfn);

	if (pp == NULL || pp->p_pagenum != pfn ||
#if defined(__sparc)
	    pp->p_vnode == &promvp ||
#else
	    PP_ISBOOTPAGES(pp) ||
#endif
	    pp->p_toxic != 0)
		return (0);
	return (1);
}

/*
 * Check a range to see if all contained pages are available and
 * return non-zero if the range can be used.
 */
static inline int
dump_range_check(pgcnt_t start, pgcnt_t end, pfn_t pfn)
{
	for (; start < end; start++, pfn++) {
		if (BT_TEST(dumpcfg.bitmap, start))
			return (0);
		if (!dump_pfn_check(pfn))
			return (0);
	}
	return (1);
}

/*
 * dumpsys_get_maxmem() is called during panic. Find unused ranges
 * and use them for buffers.
 * It searches the dump bitmap in 2 passes. The first time it looks
 * for CBUF_MAPSIZE ranges. On the second pass it uses small pages.
 */
static void
dumpsys_get_maxmem()
{
	dumpcfg_t *cfg = &dumpcfg;
	cbuf_t *endcp = &cfg->cbuf[cfg->ncbuf];
	helper_t *endhp = &cfg->helper[cfg->nhelper];
	pgcnt_t bitnum, end;
	size_t sz, endsz;
	pfn_t pfn, off;
	cbuf_t *cp;
	helper_t *hp;
	dumpmlw_t mlw;
	int k;

	/*
	 * Setting dump_ncpu_low to 0 forces a single threaded dump.
	 */
	if (dump_ncpu_low == 0) {
		cfg->clevel = DUMP_CLEVEL_SERIAL;
		return;
	}

	/*
	 * There may be no point in looking for spare memory. If
	 * dumping all memory, then none is spare. If doing a serial
	 * dump, then we already have buffers.
	 */
	if (cfg->maxsize == 0 || cfg->clevel == DUMP_CLEVEL_SERIAL ||
	    (dump_conflags & DUMP_ALL) != 0) {
		return;
	}

	sz = 0;
	cfg->found4m = 0;
	cfg->foundsm = 0;

	/* bitmap of ranges used to estimate which pfns are being used */
	bzero(dumpcfg.rbitmap, BT_SIZEOFMAP(dumpcfg.rbitmapsize));

	/* find ranges that are not being dumped to use for buffers */
	dump_init_memlist_walker(&mlw);
	for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum = end) {
		dump_timeleft = dump_timeout;
		end = bitnum + CBUF_MAPNP;
		pfn = dump_bitnum_to_pfn(bitnum, &mlw);
		ASSERT(pfn != PFN_INVALID);

		/* skip partial range at end of mem segment */
		if (mlw.mpleft < CBUF_MAPNP) {
			end = bitnum + mlw.mpleft;
			continue;
		}

		/* skip non-aligned pages */
		off = P2PHASE(pfn, CBUF_MAPNP);
		if (off != 0) {
			end -= off;
			continue;
		}

		if (!dump_range_check(bitnum, end, pfn))
			continue;

		ASSERT((sz + CBUF_MAPSIZE) <= cfg->maxvmsize);
		hat_devload(kas.a_hat, cfg->maxvm + sz, CBUF_MAPSIZE, pfn,
		    PROT_READ | PROT_WRITE, HAT_LOAD_NOCONSIST);
		sz += CBUF_MAPSIZE;
		cfg->found4m++;

		/* set the bitmap for both ends to be sure to cover the range */
		dump_set_used(pfn);
		dump_set_used(pfn + CBUF_MAPNP - 1);

		if (sz >= cfg->maxsize)
			goto foundmax;
	}

	/* Add small pages if we can't find enough large pages. */
	dump_init_memlist_walker(&mlw);
	for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum = end) {
		dump_timeleft = dump_timeout;
		end = bitnum + CBUF_MAPNP;
		pfn = dump_bitnum_to_pfn(bitnum, &mlw);
		ASSERT(pfn != PFN_INVALID);

		/* Find any non-aligned pages at start and end of segment. */
		off = P2PHASE(pfn, CBUF_MAPNP);
		if (mlw.mpleft < CBUF_MAPNP) {
			end = bitnum + mlw.mpleft;
		} else if (off != 0) {
			end -= off;
		} else if (cfg->found4m && dump_test_used(pfn)) {
			continue;
		}

		for (; bitnum < end; bitnum++, pfn++) {
			dump_timeleft = dump_timeout;
			if (BT_TEST(dumpcfg.bitmap, bitnum))
				continue;
			if (!dump_pfn_check(pfn))
				continue;
			ASSERT((sz + PAGESIZE) <= cfg->maxvmsize);
			hat_devload(kas.a_hat, cfg->maxvm + sz, PAGESIZE, pfn,
			    PROT_READ | PROT_WRITE, HAT_LOAD_NOCONSIST);
			sz += PAGESIZE;
			cfg->foundsm++;
			dump_set_used(pfn);
			if (sz >= cfg->maxsize)
				goto foundmax;
		}
	}

	/* Allocate memory for as many helpers as we can. */
foundmax:

	/* Byte offsets into memory found and mapped above */
	endsz = sz;
	sz = 0;

	/* Skip the preallocated output buffers. */
	cp = &cfg->cbuf[MINCBUFS];

	/* Loop over all helpers and allocate memory. */
	for (hp = cfg->helper; hp < endhp; hp++) {

		/* Skip preallocated helpers by checking hp->page. */
		if (hp->page == NULL) {
			/* lzjb needs 2 1-page buffers */
			if ((sz + (2 * PAGESIZE)) > endsz)
				break;
			hp->page = cfg->maxvm + sz;
			sz += PAGESIZE;
			hp->lzbuf = cfg->maxvm + sz;
			sz += PAGESIZE;
		}

		/*
		 * Add output buffers per helper. The number of
		 * buffers per helper is determined by the ratio of
		 * ncbuf to nhelper.
		 */
		for (k = 0; cp < endcp && (sz + CBUF_SIZE) <= endsz &&
		    k < NCBUF_PER_HELPER; k++) {
			cp->state = CBUF_FREEBUF;
			cp->size = CBUF_SIZE;
			cp->buf = cfg->maxvm + sz;
			sz += CBUF_SIZE;
			++cp;
		}
	}

	/* Finish allocating output buffers */
	for (; cp < endcp && (sz + CBUF_SIZE) <= endsz; cp++) {
		cp->state = CBUF_FREEBUF;
		cp->size = CBUF_SIZE;
		cp->buf = cfg->maxvm + sz;
		sz += CBUF_SIZE;
	}

	/* Enable IS_DUMP_PAGE macro, which checks for pages we took. */
	if (cfg->found4m || cfg->foundsm)
		dump_check_used = 1;

	ASSERT(sz <= endsz);
}

static void
dumphdr_init(void)
{
	pgcnt_t npages = 0;

	ASSERT(MUTEX_HELD(&dump_lock));

	if (dumphdr == NULL) {
		dumphdr = kmem_zalloc(sizeof (dumphdr_t), KM_SLEEP);
		dumphdr->dump_magic = DUMP_MAGIC;
		dumphdr->dump_version = DUMP_VERSION;
		dumphdr->dump_wordsize = DUMP_WORDSIZE;
		dumphdr->dump_pageshift = PAGESHIFT;
		dumphdr->dump_pagesize = PAGESIZE;
		dumphdr->dump_utsname = utsname;
		(void) strcpy(dumphdr->dump_platform, platform);
		dumpbuf.size = dumpbuf_iosize(maxphys);
		dumpbuf.start = kmem_alloc(dumpbuf.size, KM_SLEEP);
		dumpbuf.end = dumpbuf.start + dumpbuf.size;
		dumpcfg.pids = kmem_alloc(v.v_proc * sizeof (pid_t), KM_SLEEP);
		dumpcfg.helpermap = kmem_zalloc(BT_SIZEOFMAP(NCPU), KM_SLEEP);
		LOCK_INIT_HELD(&dumpcfg.helper_lock);
		dump_stack_scratch = kmem_alloc(STACK_BUF_SIZE, KM_SLEEP);
		(void) strncpy(dumphdr->dump_uuid, dump_get_uuid(),
		    sizeof (dumphdr->dump_uuid));
	}

	npages = num_phys_pages();

	if (dumpcfg.bitmapsize != npages) {
		size_t rlen = CBUF_MAPP2R(P2ROUNDUP(npages, CBUF_MAPNP));
		void *map = kmem_alloc(BT_SIZEOFMAP(npages), KM_SLEEP);
		void *rmap = kmem_alloc(BT_SIZEOFMAP(rlen), KM_SLEEP);

		if (dumpcfg.bitmap != NULL)
			kmem_free(dumpcfg.bitmap,
			    BT_SIZEOFMAP(dumpcfg.bitmapsize));
		if (dumpcfg.rbitmap != NULL)
			kmem_free(dumpcfg.rbitmap,
			    BT_SIZEOFMAP(dumpcfg.rbitmapsize));
		dumpcfg.bitmap = map;
		dumpcfg.bitmapsize = npages;
		dumpcfg.rbitmap = rmap;
		dumpcfg.rbitmapsize = rlen;
	}
}
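
/*
 * Sizing note for the bitmaps above, assuming 4KB pages: a 16GB
 * machine has npages = 4M, so the page bitmap occupies 512KB, while
 * the range bitmap (one bit per 4MB range, rlen = 4096) occupies
 * only 512 bytes.
 */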

/*
 * Establish a new dump device.
 */
int
dumpinit(vnode_t *vp, char *name, int justchecking)
{
	vnode_t *cvp;
	vattr_t vattr;
	vnode_t *cdev_vp;
	int error = 0;

	ASSERT(MUTEX_HELD(&dump_lock));

	dumphdr_init();

	cvp = common_specvp(vp);
	if (cvp == dumpvp)
		return (0);

	/*
	 * Determine whether this is a plausible dump device. We want either:
	 * (1) a real device that's not mounted and has a cb_dump routine, or
	 * (2) a swapfile on some filesystem that has a vop_dump routine.
	 */
	if ((error = VOP_OPEN(&cvp, FREAD | FWRITE, kcred, NULL)) != 0)
		return (error);

	vattr.va_mask = AT_SIZE | AT_TYPE | AT_RDEV;
	if ((error = VOP_GETATTR(cvp, &vattr, 0, kcred, NULL)) == 0) {
		if (vattr.va_type == VBLK || vattr.va_type == VCHR) {
			if (devopsp[getmajor(vattr.va_rdev)]->
			    devo_cb_ops->cb_dump == nodev)
				error = ENOTSUP;
			else if (vfs_devismounted(vattr.va_rdev))
				error = EBUSY;
			if (strcmp(ddi_driver_name(VTOS(cvp)->s_dip),
			    ZFS_DRIVER) == 0 &&
			    IS_SWAPVP(common_specvp(cvp)))
				error = EBUSY;
		} else {
			if (vn_matchopval(cvp, VOPNAME_DUMP, fs_nosys) ||
			    !IS_SWAPVP(cvp))
				error = ENOTSUP;
		}
	}

	if (error == 0 && vattr.va_size < 2 * DUMP_LOGSIZE + DUMP_ERPTSIZE)
		error = ENOSPC;

	if (error || justchecking) {
		(void) VOP_CLOSE(cvp, FREAD | FWRITE, 1, (offset_t)0,
		    kcred, NULL);
		return (error);
	}

	VN_HOLD(cvp);

	if (dumpvp != NULL)
		dumpfini();	/* unconfigure the old dump device */

	dumpvp = cvp;
	dumpvp_size = vattr.va_size & -DUMP_OFFSET;
	dumppath = kmem_alloc(strlen(name) + 1, KM_SLEEP);
	(void) strcpy(dumppath, name);
	dumpbuf.iosize = 0;

	/*
	 * If the dump device is a block device, attempt to open up the
	 * corresponding character device and determine its maximum transfer
	 * size. We use this information to potentially resize dumpbuf to a
	 * larger and more optimal size for performing i/o to the dump device.
	 */
	if (cvp->v_type == VBLK &&
	    (cdev_vp = makespecvp(VTOS(cvp)->s_dev, VCHR)) != NULL) {
		if (VOP_OPEN(&cdev_vp, FREAD | FWRITE, kcred, NULL) == 0) {
			size_t blk_size;
			struct dk_cinfo dki;
			struct dk_minfo minf;

			if (VOP_IOCTL(cdev_vp, DKIOCGMEDIAINFO,
			    (intptr_t)&minf, FKIOCTL, kcred, NULL, NULL)
			    == 0 && minf.dki_lbsize != 0)
				blk_size = minf.dki_lbsize;
			else
				blk_size = DEV_BSIZE;

			if (VOP_IOCTL(cdev_vp, DKIOCINFO, (intptr_t)&dki,
			    FKIOCTL, kcred, NULL, NULL) == 0) {
				dumpbuf.iosize = dki.dki_maxtransfer * blk_size;
				dumpbuf_resize();
			}
			/*
			 * If we are working with a zvol then dumpify it
			 * if it's not being used as swap.
			 */
			if (strcmp(dki.dki_dname, ZVOL_DRIVER) == 0) {
				if (IS_SWAPVP(common_specvp(cvp)))
					error = EBUSY;
				else if ((error = VOP_IOCTL(cdev_vp,
				    DKIOCDUMPINIT, NULL, FKIOCTL, kcred,
				    NULL, NULL)) != 0)
					dumpfini();
			}

			(void) VOP_CLOSE(cdev_vp, FREAD | FWRITE, 1, 0,
			    kcred, NULL);
		}

		VN_RELE(cdev_vp);
	}

	cmn_err(CE_CONT, "?dump on %s size %llu MB\n", name, dumpvp_size >> 20);

	dump_update_clevel();

	return (error);
}
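
/*
 * dumpinit() is normally reached via dumpadm(1M), which is the usual
 * way to point the system at a new dump device, e.g. (illustrative):
 *
 *	# dumpadm -d /dev/zvol/dsk/rpool/dump
 */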

void
dumpfini(void)
{
	vattr_t vattr;
	boolean_t is_zfs = B_FALSE;
	vnode_t *cdev_vp;
	ASSERT(MUTEX_HELD(&dump_lock));

	kmem_free(dumppath, strlen(dumppath) + 1);

	/*
	 * Determine if we are using zvols for our dump device
	 */
	vattr.va_mask = AT_RDEV;
	if (VOP_GETATTR(dumpvp, &vattr, 0, kcred, NULL) == 0) {
		is_zfs = (getmajor(vattr.va_rdev) ==
		    ddi_name_to_major(ZFS_DRIVER)) ? B_TRUE : B_FALSE;
	}

	/*
	 * If we have a zvol dump device then we call into zfs so
	 * that it may have a chance to cleanup.
	 */
	if (is_zfs &&
	    (cdev_vp = makespecvp(VTOS(dumpvp)->s_dev, VCHR)) != NULL) {
		if (VOP_OPEN(&cdev_vp, FREAD | FWRITE, kcred, NULL) == 0) {
			(void) VOP_IOCTL(cdev_vp, DKIOCDUMPFINI, NULL, FKIOCTL,
			    kcred, NULL, NULL);
			(void) VOP_CLOSE(cdev_vp, FREAD | FWRITE, 1, 0,
			    kcred, NULL);
		}
		VN_RELE(cdev_vp);
	}

	(void) VOP_CLOSE(dumpvp, FREAD | FWRITE, 1, (offset_t)0, kcred, NULL);

	VN_RELE(dumpvp);

	dumpvp = NULL;
	dumpvp_size = 0;
	dumppath = NULL;
}

static offset_t
dumpvp_flush(void)
{
	size_t size = P2ROUNDUP(dumpbuf.cur - dumpbuf.start, PAGESIZE);
	hrtime_t iotime;
	int err;

	if (dumpbuf.vp_off + size > dumpbuf.vp_limit) {
		dump_ioerr = ENOSPC;
		dumpbuf.vp_off = dumpbuf.vp_limit;
	} else if (size != 0) {
		iotime = gethrtime();
		dumpsync.iowait += iotime - dumpsync.iowaitts;
		if (panicstr)
			err = VOP_DUMP(dumpvp, dumpbuf.start,
			    lbtodb(dumpbuf.vp_off), btod(size), NULL);
		else
			err = vn_rdwr(UIO_WRITE, dumpbuf.cdev_vp != NULL ?
			    dumpbuf.cdev_vp : dumpvp, dumpbuf.start, size,
			    dumpbuf.vp_off, UIO_SYSSPACE, 0, dumpbuf.vp_limit,
			    kcred, 0);
		if (err && dump_ioerr == 0)
			dump_ioerr = err;
		dumpsync.iowaitts = gethrtime();
		dumpsync.iotime += dumpsync.iowaitts - iotime;
		dumpsync.nwrite += size;
		dumpbuf.vp_off += size;
	}
	dumpbuf.cur = dumpbuf.start;
	dump_timeleft = dump_timeout;
	return (dumpbuf.vp_off);
}

/* maximize write speed by keeping seek offset aligned with size */
void
dumpvp_write(const void *va, size_t size)
{
	size_t len, off, sz;

	while (size != 0) {
		len = MIN(size, dumpbuf.end - dumpbuf.cur);
		if (len == 0) {
			off = P2PHASE(dumpbuf.vp_off, dumpbuf.size);
			if (off == 0 || !ISP2(dumpbuf.size)) {
				(void) dumpvp_flush();
			} else {
				sz = dumpbuf.size - off;
				dumpbuf.cur = dumpbuf.start + sz;
				(void) dumpvp_flush();
				ovbcopy(dumpbuf.start + sz, dumpbuf.start, off);
				dumpbuf.cur += off;
			}
		} else {
			bcopy(va, dumpbuf.cur, len);
			va = (char *)va + len;
			dumpbuf.cur += len;
			size -= len;
		}
	}
}

/*ARGSUSED*/
static void
dumpvp_ksyms_write(const void *src, void *dst, size_t size)
{
	dumpvp_write(src, size);
}
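
/*
 * Worked example of the realignment above: with a 4MB dumpbuf that is
 * full but whose device offset sits 1MB past an alignment boundary
 * (off = 1MB), only the first 3MB are flushed; the trailing 1MB is
 * slid back to the front of the buffer with ovbcopy() so the next
 * flush starts on a dumpbuf.size-aligned device offset.
 */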

/*
 * Mark 'pfn' in the bitmap and dump its translation table entry.
 */
void
dump_addpage(struct as *as, void *va, pfn_t pfn)
{
	mem_vtop_t mem_vtop;
	pgcnt_t bitnum;

	if ((bitnum = dump_pfn_to_bitnum(pfn)) != (pgcnt_t)-1) {
		if (!BT_TEST(dumpcfg.bitmap, bitnum)) {
			dumphdr->dump_npages++;
			BT_SET(dumpcfg.bitmap, bitnum);
		}
		dumphdr->dump_nvtop++;
		mem_vtop.m_as = as;
		mem_vtop.m_va = va;
		mem_vtop.m_pfn = pfn;
		dumpvp_write(&mem_vtop, sizeof (mem_vtop_t));
	}
	dump_timeleft = dump_timeout;
}

/*
 * Mark 'pfn' in the bitmap
 */
void
dump_page(pfn_t pfn)
{
	pgcnt_t bitnum;

	if ((bitnum = dump_pfn_to_bitnum(pfn)) != (pgcnt_t)-1) {
		if (!BT_TEST(dumpcfg.bitmap, bitnum)) {
			dumphdr->dump_npages++;
			BT_SET(dumpcfg.bitmap, bitnum);
		}
	}
	dump_timeleft = dump_timeout;
}

/*
 * Dump the <as, va, pfn> information for a given address space.
 * SEGOP_DUMP() will call dump_addpage() for each page in the segment.
 */
static void
dump_as(struct as *as)
{
	struct seg *seg;

	AS_LOCK_ENTER(as, RW_READER);
	for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
		if (seg->s_as != as)
			break;
		if (seg->s_ops == NULL)
			continue;
		SEGOP_DUMP(seg);
	}
	AS_LOCK_EXIT(as);

	if (seg != NULL)
		cmn_err(CE_WARN, "invalid segment %p in address space %p",
		    (void *)seg, (void *)as);
}

static int
dump_process(pid_t pid)
{
	proc_t *p = sprlock(pid);

	if (p == NULL)
		return (-1);
	if (p->p_as != &kas) {
		mutex_exit(&p->p_lock);
		dump_as(p->p_as);
		mutex_enter(&p->p_lock);
	}

	sprunlock(p);

	return (0);
}

/*
 * The following functions (dump_summary(), dump_ereports(), and
 * dump_messages()), write data to an uncompressed area within the
 * crashdump. The layout of these is
 *
 * +------------------------------------------------------------+
 * |     compressed pages       | summary | ereports | messages |
 * +------------------------------------------------------------+
 *
 * With the advent of saving a compressed crash dump by default, we
 * need to save a little more data to describe the failure mode in
 * an uncompressed buffer available before savecore uncompresses
 * the dump. Initially this is a copy of the stack trace. Additional
 * summary information should be added here.
 */
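
/*
 * In terms of offsets from the end of the dump device (a sketch of
 * the vp_limit arithmetic used below): DUMP_OFFSET bytes are reserved
 * at the very end, the DUMP_LOGSIZE message area precedes that, the
 * DUMP_ERPTSIZE ereport area precedes the messages, and the
 * DUMP_SUMMARYSIZE summary area precedes the ereports.
 */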

void
dump_summary(void)
{
	u_offset_t dumpvp_start;
	summary_dump_t sd;

	if (dumpvp == NULL || dumphdr == NULL)
		return;

	dumpbuf.cur = dumpbuf.start;

	dumpbuf.vp_limit = dumpvp_size - (DUMP_OFFSET + DUMP_LOGSIZE +
	    DUMP_ERPTSIZE);
	dumpvp_start = dumpbuf.vp_limit - DUMP_SUMMARYSIZE;
	dumpbuf.vp_off = dumpvp_start;

	sd.sd_magic = SUMMARY_MAGIC;
	sd.sd_ssum = checksum32(dump_stack_scratch, STACK_BUF_SIZE);
	dumpvp_write(&sd, sizeof (sd));
	dumpvp_write(dump_stack_scratch, STACK_BUF_SIZE);

	sd.sd_magic = 0;	/* indicate end of summary */
	dumpvp_write(&sd, sizeof (sd));
	(void) dumpvp_flush();
}

void
dump_ereports(void)
{
	u_offset_t dumpvp_start;
	erpt_dump_t ed;

	if (dumpvp == NULL || dumphdr == NULL)
		return;

	dumpbuf.cur = dumpbuf.start;
	dumpbuf.vp_limit = dumpvp_size - (DUMP_OFFSET + DUMP_LOGSIZE);
	dumpvp_start = dumpbuf.vp_limit - DUMP_ERPTSIZE;
	dumpbuf.vp_off = dumpvp_start;

	fm_ereport_dump();
	if (panicstr)
		errorq_dump();

	bzero(&ed, sizeof (ed));	/* indicate end of ereports */
	dumpvp_write(&ed, sizeof (ed));
	(void) dumpvp_flush();

	if (!panicstr) {
		(void) VOP_PUTPAGE(dumpvp, dumpvp_start,
		    (size_t)(dumpbuf.vp_off - dumpvp_start),
		    B_INVAL | B_FORCE, kcred, NULL);
	}
}

void
dump_messages(void)
{
	log_dump_t ld;
	mblk_t *mctl, *mdata;
	queue_t *q, *qlast;
	u_offset_t dumpvp_start;

	if (dumpvp == NULL || dumphdr == NULL || log_consq == NULL)
		return;

	dumpbuf.cur = dumpbuf.start;
	dumpbuf.vp_limit = dumpvp_size - DUMP_OFFSET;
	dumpvp_start = dumpbuf.vp_limit - DUMP_LOGSIZE;
	dumpbuf.vp_off = dumpvp_start;

	qlast = NULL;
	do {
		for (q = log_consq; q->q_next != qlast; q = q->q_next)
			continue;
		for (mctl = q->q_first; mctl != NULL; mctl = mctl->b_next) {
			dump_timeleft = dump_timeout;
			mdata = mctl->b_cont;
			ld.ld_magic = LOG_MAGIC;
			ld.ld_msgsize = MBLKL(mctl->b_cont);
			ld.ld_csum = checksum32(mctl->b_rptr, MBLKL(mctl));
			ld.ld_msum = checksum32(mdata->b_rptr, MBLKL(mdata));
			dumpvp_write(&ld, sizeof (ld));
			dumpvp_write(mctl->b_rptr, MBLKL(mctl));
			dumpvp_write(mdata->b_rptr, MBLKL(mdata));
		}
	} while ((qlast = q) != log_consq);

	ld.ld_magic = 0;	/* indicate end of messages */
	dumpvp_write(&ld, sizeof (ld));
	(void) dumpvp_flush();
	if (!panicstr) {
		(void) VOP_PUTPAGE(dumpvp, dumpvp_start,
		    (size_t)(dumpbuf.vp_off - dumpvp_start),
		    B_INVAL | B_FORCE, kcred, NULL);
	}
}

/*
 * The following functions are called on multiple CPUs during dump.
 * They must not use most kernel services, because all cross-calls are
 * disabled during panic. Therefore, blocking locks and cache flushes
 * will not work.
 */

/*
 * Copy pages, trapping ECC errors. Also, for robustness, trap data
 * access in case something goes wrong in the hat layer and the
 * mapping is broken.
 */
static int
dump_pagecopy(void *src, void *dst)
{
	long *wsrc = (long *)src;
	long *wdst = (long *)dst;
	const ulong_t ncopies = PAGESIZE / sizeof (long);
	volatile int w = 0;
	volatile int ueoff = -1;
	on_trap_data_t otd;

	if (on_trap(&otd, OT_DATA_EC | OT_DATA_ACCESS)) {
		if (ueoff == -1)
			ueoff = w * sizeof (long);
		/* report "bad ECC" or "bad address" */
#ifdef _LP64
		if (otd.ot_trap & OT_DATA_EC)
			wdst[w++] = 0x00badecc00badecc;
		else
			wdst[w++] = 0x00badadd00badadd;
#else
		if (otd.ot_trap & OT_DATA_EC)
			wdst[w++] = 0x00badecc;
		else
			wdst[w++] = 0x00badadd;
#endif
	}
	while (w < ncopies) {
		wdst[w] = wsrc[w];
		w++;
	}
	no_trap();
	return (ueoff);
}

static void
dumpsys_close_cq(cqueue_t *cq, int live)
{
	if (live) {
		mutex_enter(&cq->mutex);
		atomic_dec_uint(&cq->open);
		cv_signal(&cq->cv);
		mutex_exit(&cq->mutex);
	} else {
		atomic_dec_uint(&cq->open);
	}
}

static inline void
dumpsys_spinlock(lock_t *lp)
{
	uint_t backoff = 0;
	int loop_count = 0;

	while (LOCK_HELD(lp) || !lock_spin_try(lp)) {
		if (++loop_count >= ncpus) {
			backoff = mutex_lock_backoff(0);
			loop_count = 0;
		} else {
			backoff = mutex_lock_backoff(backoff);
		}
		mutex_lock_delay(backoff);
	}
}

static inline void
dumpsys_spinunlock(lock_t *lp)
{
	lock_clear(lp);
}

static inline void
dumpsys_lock(cqueue_t *cq, int live)
{
	if (live)
		mutex_enter(&cq->mutex);
	else
		dumpsys_spinlock(&cq->spinlock);
}

static inline void
dumpsys_unlock(cqueue_t *cq, int live, int signal)
{
	if (live) {
		if (signal)
			cv_signal(&cq->cv);
		mutex_exit(&cq->mutex);
	} else {
		dumpsys_spinunlock(&cq->spinlock);
	}
}

static void
dumpsys_wait_cq(cqueue_t *cq, int live)
{
	if (live) {
		cv_wait(&cq->cv, &cq->mutex);
	} else {
		dumpsys_spinunlock(&cq->spinlock);
		while (cq->open)
			if (cq->first)
				break;
		dumpsys_spinlock(&cq->spinlock);
	}
}

static void
dumpsys_put_cq(cqueue_t *cq, cbuf_t *cp, int newstate, int live)
{
	if (cp == NULL)
		return;

	dumpsys_lock(cq, live);

	if (cq->ts != 0) {
		cq->empty += gethrtime() - cq->ts;
		cq->ts = 0;
	}

	cp->state = newstate;
	cp->next = NULL;
	if (cq->last == NULL)
		cq->first = cp;
	else
		cq->last->next = cp;
	cq->last = cp;

	dumpsys_unlock(cq, live, 1);
}

static cbuf_t *
dumpsys_get_cq(cqueue_t *cq, int live)
{
	cbuf_t *cp;
	hrtime_t now = gethrtime();

	dumpsys_lock(cq, live);

	/* CONSTCOND */
	while (1) {
		cp = (cbuf_t *)cq->first;
		if (cp == NULL) {
			if (cq->open == 0)
				break;
			dumpsys_wait_cq(cq, live);
			continue;
		}
		cq->first = cp->next;
		if (cq->first == NULL) {
			cq->last = NULL;
			cq->ts = now;
		}
		break;
	}

	dumpsys_unlock(cq, live, cq->first != NULL || cq->open == 0);
	return (cp);
}

/*
 * Send an error message to the console. If the main task is running,
 * just write the message via uprintf. If a helper is running, the
 * message has to be put on a queue for the main task. Setting fmt to
 * NULL means flush the error message buffer. If fmt is not NULL, just
 * add the text to the existing buffer.
 */
static void
dumpsys_errmsg(helper_t *hp, const char *fmt, ...)
{
	dumpsync_t *ds = hp->ds;
	cbuf_t *cp = hp->cperr;
	va_list adx;

	if (hp->helper == MAINHELPER) {
		if (fmt != NULL) {
			if (ds->neednl) {
				uprintf("\n");
				ds->neednl = 0;
			}
			va_start(adx, fmt);
			vuprintf(fmt, adx);
			va_end(adx);
		}
	} else if (fmt == NULL) {
		if (cp != NULL) {
			CQ_PUT(mainq, cp, CBUF_ERRMSG);
			hp->cperr = NULL;
		}
	} else {
		if (hp->cperr == NULL) {
			cp = CQ_GET(freebufq);
			hp->cperr = cp;
			cp->used = 0;
		}
		va_start(adx, fmt);
		cp->used += vsnprintf(cp->buf + cp->used, cp->size - cp->used,
		    fmt, adx);
		va_end(adx);
		if ((cp->used + LOG_MSGSIZE) > cp->size) {
			CQ_PUT(mainq, cp, CBUF_ERRMSG);
			hp->cperr = NULL;
		}
	}
}

/*
 * Write an output buffer to the dump file. If the main task is
 * running just write the data. If a helper is running the output is
 * placed on a queue for the main task.
 */
static void
dumpsys_swrite(helper_t *hp, cbuf_t *cp, size_t used)
{
	dumpsync_t *ds = hp->ds;

	if (hp->helper == MAINHELPER) {
		HRSTART(ds->perpage, write);
		dumpvp_write(cp->buf, used);
		HRSTOP(ds->perpage, write);
		CQ_PUT(freebufq, cp, CBUF_FREEBUF);
	} else {
		cp->used = used;
		CQ_PUT(mainq, cp, CBUF_WRITE);
	}
}

/*
 * Copy one page within the mapped range. The offset starts at 0 and
 * is relative to the first pfn. cp->buf + cp->off is the address of
 * the first pfn. If dump_pagecopy returns a UE offset, create an
 * error message. Returns the offset to the next pfn in the range
 * selected by the bitmap.
 */
static int
dumpsys_copy_page(helper_t *hp, int offset)
{
	cbuf_t *cp = hp->cpin;
	int ueoff;

	ASSERT(cp->off + offset + PAGESIZE <= cp->size);
	ASSERT(BT_TEST(dumpcfg.bitmap, cp->bitnum));

	ueoff = dump_pagecopy(cp->buf + cp->off + offset, hp->page);

	/* ueoff is the offset in the page to a UE error */
	if (ueoff != -1) {
		uint64_t pa = ptob(cp->pfn) + offset + ueoff;

		dumpsys_errmsg(hp, "cpu %d: memory error at PA 0x%08x.%08x\n",
		    CPU->cpu_id, (uint32_t)(pa >> 32), (uint32_t)pa);
	}

	/*
	 * Advance bitnum and offset to the next input page for the
	 * next call to this function.
	 */
	offset += PAGESIZE;
	cp->bitnum++;
	while (cp->off + offset < cp->size) {
		if (BT_TEST(dumpcfg.bitmap, cp->bitnum))
			break;
		offset += PAGESIZE;
		cp->bitnum++;
	}

	return (offset);
}

/*
 * Read the helper queue, and copy one mapped page. Return 0 when
 * done. Return 1 when a page has been copied into hp->page.
 */
static int
dumpsys_sread(helper_t *hp)
{
	dumpsync_t *ds = hp->ds;

	/* CONSTCOND */
	while (1) {

		/* Find the next input buffer. */
		if (hp->cpin == NULL) {
			HRSTART(hp->perpage, inwait);

			/* CONSTCOND */
			while (1) {
				hp->cpin = CQ_GET(helperq);
				dump_timeleft = dump_timeout;

				/*
				 * NULL return means the helper queue
				 * is closed and empty.
				 */
				if (hp->cpin == NULL)
					break;

				/* Have input, check for dump I/O error. */
				if (!dump_ioerr)
					break;

				/*
				 * If an I/O error occurs, stay in the
				 * loop in order to empty the helper
				 * queue. Return the buffers to the
				 * main task to unmap and free them.
				 */
				hp->cpin->used = 0;
				CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP);
			}
			HRSTOP(hp->perpage, inwait);

			/* Stop here when the helper queue is closed. */
			if (hp->cpin == NULL)
				break;

			/* Set the offset=0 to get the first pfn. */
			hp->in = 0;

			/* Set the total processed to 0 */
			hp->used = 0;
		}

		/* Process the next page. */
		if (hp->used < hp->cpin->used) {

			/*
			 * Get the next page from the input buffer and
			 * return a copy.
			 */
			ASSERT(hp->in != -1);
			HRSTART(hp->perpage, copy);
			hp->in = dumpsys_copy_page(hp, hp->in);
			hp->used += PAGESIZE;
			HRSTOP(hp->perpage, copy);
			break;

		} else {

			/*
			 * Done with the input. Flush the VM and
			 * return the buffer to the main task.
			 */
			if (panicstr && hp->helper != MAINHELPER)
				hat_flush_range(kas.a_hat,
				    hp->cpin->buf, hp->cpin->size);
			dumpsys_errmsg(hp, NULL);
			CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP);
			hp->cpin = NULL;
		}
	}

	return (hp->cpin != NULL);
}
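
/*
 * A sketch of the stream block layout produced below. Each CBUF_SIZE
 * block begins with a dumpcsize_t word carrying the stream tag and
 * the number of payload bytes; the payload is a sequence of records,
 * each a dumpcsize_t (tag 0, compressed size) followed by that many
 * bytes of lzjb output, with a stream header record interspersed
 * whenever a new input mapping begins:
 *
 *	[tag|used] [0|csize][data...] [0|csize][data...] ...
 *
 * Since blocks from different helpers are interleaved in the dump
 * file, the tag is what lets savecore reassemble each stream in
 * order.
 */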

/*
 * Compress with lzjb
 * write stream block if full or size==0
 * if csize==0 write stream header, else write <csize, data>
 * size==0 is a call to flush a buffer
 * hp->cpout is the buffer we are flushing or filling
 * hp->out is the next index to fill data
 * osize is either csize+data, or the size of a stream header
 */
static void
dumpsys_lzjbrun(helper_t *hp, size_t csize, void *buf, size_t size)
{
	dumpsync_t *ds = hp->ds;
	const int CSIZE = sizeof (dumpcsize_t);
	dumpcsize_t cs;
	size_t osize = csize > 0 ? CSIZE + size : size;

	/* If flush, and there is no buffer, just return */
	if (size == 0 && hp->cpout == NULL)
		return;

	/* If flush, or cpout is full, write it out */
	if (size == 0 ||
	    (hp->cpout != NULL && hp->out + osize > hp->cpout->size)) {

		/* Set tag+size word at the front of the stream block. */
		cs = DUMP_SET_TAG(hp->out - CSIZE, hp->tag);
		(void) memcpy(hp->cpout->buf, &cs, CSIZE);

		/* Write block to dump file. */
		dumpsys_swrite(hp, hp->cpout, hp->out);

		/* Clear pointer to indicate we need a new buffer */
		hp->cpout = NULL;

		/* flushing, we are done */
		if (size == 0)
			return;
	}

	/* Get an output buffer if we don't have one. */
	if (hp->cpout == NULL) {
		HRSTART(hp->perpage, outwait);
		hp->cpout = CQ_GET(freebufq);
		HRSTOP(hp->perpage, outwait);
		hp->out = CSIZE;
	}

	/* Store csize word. This is the size of compressed data. */
	if (csize > 0) {
		cs = DUMP_SET_TAG(csize, 0);
		(void) memcpy(hp->cpout->buf + hp->out, &cs, CSIZE);
		hp->out += CSIZE;
	}

	/* Store the data. */
	(void) memcpy(hp->cpout->buf + hp->out, buf, size);
	hp->out += size;
}

static void
dumpsys_lzjbcompress(helper_t *hp)
{
	dumpsync_t *ds = hp->ds;
	size_t csize;
	dumpstreamhdr_t sh;

	(void) strcpy(sh.stream_magic, DUMP_STREAM_MAGIC);
	sh.stream_pagenum = (pfn_t)-1;
	sh.stream_npages = 0;
	hp->cpin = NULL;
	hp->cpout = NULL;
	hp->cperr = NULL;
	hp->in = 0;
	hp->out = 0;

	/* Bump reference to mainq while we are running */
	CQ_OPEN(mainq);

	/* Get one page at a time */
	while (dumpsys_sread(hp)) {

		/* Create a stream header for each new input map */
		if (sh.stream_pagenum != hp->cpin->pagenum) {
			sh.stream_pagenum = hp->cpin->pagenum;
			sh.stream_npages = btop(hp->cpin->used);
			dumpsys_lzjbrun(hp, 0, &sh, sizeof (sh));
		}

		/* Compress one page */
		HRSTART(hp->perpage, compress);
		csize = compress(hp->page, hp->lzbuf, PAGESIZE);
		HRSTOP(hp->perpage, compress);

		/* Add csize+data to output block */
		ASSERT(csize > 0 && csize <= PAGESIZE);
		dumpsys_lzjbrun(hp, csize, hp->lzbuf, csize);
	}

	/* Done with input, flush any partial buffer */
	if (sh.stream_pagenum != (pfn_t)-1) {
		dumpsys_lzjbrun(hp, 0, NULL, 0);
		dumpsys_errmsg(hp, NULL);
	}

	ASSERT(hp->cpin == NULL && hp->cpout == NULL && hp->cperr == NULL);

	/* Decrement main queue count, we are done */
	CQ_CLOSE(mainq);
}
1923 * 1924 * At panic time, idle CPUs spin-wait on helper_lock, then take the 1925 * lock one at a time and either become a helper or return. 1926 */ 1927 void 1928 dumpsys_helper() 1929 { 1930 dumpsys_spinlock(&dumpcfg.helper_lock); 1931 if (dumpcfg.helpers_wanted) { 1932 helper_t *hp, *hpend = &dumpcfg.helper[dumpcfg.nhelper]; 1933 1934 for (hp = dumpcfg.helper; hp != hpend; hp++) { 1935 if (hp->helper == FREEHELPER) { 1936 hp->helper = CPU->cpu_id; 1937 BT_SET(dumpcfg.helpermap, CPU->cpu_seqid); 1938 dumpsys_spinunlock(&dumpcfg.helper_lock); 1939 dumpsys_lzjbcompress(hp); 1940 hp->helper = DONEHELPER; 1941 return; 1942 } 1943 } 1944 1945 /* No more helpers are needed. */ 1946 dumpcfg.helpers_wanted = 0; 1947 1948 } 1949 dumpsys_spinunlock(&dumpcfg.helper_lock); 1950 } 1951 1952 /* 1953 * No-wait helper callable in spin loops. 1954 * 1955 * Do not wait for helper_lock. Just check helpers_wanted. The caller 1956 * may decide to continue. This is the "c)ontinue, s)ync, r)eset? s" 1957 * case. 1958 */ 1959 void 1960 dumpsys_helper_nw() 1961 { 1962 if (dumpcfg.helpers_wanted) 1963 dumpsys_helper(); 1964 } 1965 1966 /* 1967 * Dump helper for live dumps. 1968 * These run as system tasks. 1969 */ 1970 static void 1971 dumpsys_live_helper(void *arg) 1972 { 1973 helper_t *hp = arg; 1974 1975 BT_ATOMIC_SET(dumpcfg.helpermap, CPU->cpu_seqid); 1976 dumpsys_lzjbcompress(hp); 1977 } 1978 1979 /* 1980 * Compress one page with lzjb (single-threaded case) 1981 */ 1982 static void 1983 dumpsys_lzjb_page(helper_t *hp, cbuf_t *cp) 1984 { 1985 dumpsync_t *ds = hp->ds; 1986 uint32_t csize; 1987 1988 hp->helper = MAINHELPER; 1989 hp->in = 0; 1990 hp->used = 0; 1991 hp->cpin = cp; 1992 while (hp->used < cp->used) { 1993 HRSTART(hp->perpage, copy); 1994 hp->in = dumpsys_copy_page(hp, hp->in); 1995 hp->used += PAGESIZE; 1996 HRSTOP(hp->perpage, copy); 1997 1998 HRSTART(hp->perpage, compress); 1999 csize = compress(hp->page, hp->lzbuf, PAGESIZE); 2000 HRSTOP(hp->perpage, compress); 2001 2002 HRSTART(hp->perpage, write); 2003 dumpvp_write(&csize, sizeof (csize)); 2004 dumpvp_write(hp->lzbuf, csize); 2005 HRSTOP(hp->perpage, write); 2006 } 2007 CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP); 2008 hp->cpin = NULL; 2009 } 2010 2011 /* 2012 * Main task to dump pages. This is called on the dump CPU. 2013 */ 2014 static void 2015 dumpsys_main_task(void *arg) 2016 { 2017 dumpsync_t *ds = arg; 2018 pgcnt_t pagenum = 0, bitnum = 0, hibitnum; 2019 dumpmlw_t mlw; 2020 cbuf_t *cp; 2021 pgcnt_t baseoff, pfnoff; 2022 pfn_t base, pfn; 2023 int i; 2024 2025 /* 2026 * Fall back to serial mode if there are no helpers. 2027 * dump_ncpu_low can be set to 0 at any time. 2028 * dumpcfg.helpermap must contain at least one member. 2029 * 2030 * It is possible that the helpers haven't registered 2031 * in helpermap yet; wait up to DUMP_HELPER_MAX_WAIT for 2032 * at least one helper to register.
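 *
 * In outline, the wait behaves like this (a paraphrase of the loop
 * that follows, not additional logic):
 *
 *	deadline = now + MSEC2NSEC(DUMP_HELPER_MAX_WAIT);
 *	while (no bit is set in dumpcfg.helpermap && now < deadline)
 *		SMT_PAUSE();
 *	if (still no helper registered)
 *		dumpcfg.clevel = DUMP_CLEVEL_SERIAL;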
2033 */ 2034 if (dump_ncpu_low != 0 && dumpcfg.clevel != DUMP_CLEVEL_SERIAL) { 2035 boolean_t dumpserial = B_TRUE; 2036 hrtime_t hrtmax = MSEC2NSEC(DUMP_HELPER_MAX_WAIT); 2037 hrtime_t hrtstart = gethrtime(); 2038 2039 for (;;) { 2040 for (i = 0; i < BT_BITOUL(NCPU); ++i) { 2041 if (dumpcfg.helpermap[i] != 0) { 2042 dumpserial = B_FALSE; 2043 break; 2044 } 2045 } 2046 2047 if ((!dumpserial) || 2048 ((gethrtime() - hrtstart) >= hrtmax)) { 2049 break; 2050 } 2051 2052 SMT_PAUSE(); 2053 } 2054 2055 if (dumpserial) { 2056 dumpcfg.clevel = DUMP_CLEVEL_SERIAL; 2057 if (dumpcfg.helper[0].lzbuf == NULL) { 2058 dumpcfg.helper[0].lzbuf = 2059 dumpcfg.helper[1].page; 2060 } 2061 } 2062 } 2063 2064 dump_init_memlist_walker(&mlw); 2065 2066 for (;;) { 2067 int sec = (gethrtime() - ds->start) / NANOSEC; 2068 2069 /* 2070 * Render a simple progress display on the system console to 2071 * make clear to the operator that the system has not hung. 2072 * Emit an update when dump progress has advanced by one 2073 * percent, or when no update has been drawn in the last 2074 * second. 2075 */ 2076 if (ds->percent > ds->percent_done || sec > ds->sec_done) { 2077 ds->sec_done = sec; 2078 ds->percent_done = ds->percent; 2079 uprintf("^\rdumping: %2d:%02d %3d%% done", 2080 sec / 60, sec % 60, ds->percent); 2081 ds->neednl = 1; 2082 } 2083 2084 while (CQ_IS_EMPTY(mainq) && !CQ_IS_EMPTY(writerq)) { 2085 2086 /* the writerq never blocks */ 2087 cp = CQ_GET(writerq); 2088 if (cp == NULL) 2089 break; 2090 2091 dump_timeleft = dump_timeout; 2092 2093 HRSTART(ds->perpage, write); 2094 dumpvp_write(cp->buf, cp->used); 2095 HRSTOP(ds->perpage, write); 2096 2097 CQ_PUT(freebufq, cp, CBUF_FREEBUF); 2098 } 2099 2100 /* 2101 * Wait here for some buffers to process. Returns NULL 2102 * when all helpers have terminated and all buffers 2103 * have been processed. 2104 */ 2105 cp = CQ_GET(mainq); 2106 2107 if (cp == NULL) { 2108 2109 /* Drain the write queue. */ 2110 if (!CQ_IS_EMPTY(writerq)) 2111 continue; 2112 2113 /* Main task exits here. */ 2114 break; 2115 } 2116 2117 dump_timeleft = dump_timeout; 2118 2119 switch (cp->state) { 2120 2121 case CBUF_FREEMAP: 2122 2123 /* 2124 * Note that we drop CBUF_FREEMAP buffers on 2125 * the floor (they will not be on any cqueue) 2126 * when we no longer need them. 2127 */ 2128 if (bitnum >= dumpcfg.bitmapsize) 2129 break; 2130 2131 if (dump_ioerr) { 2132 bitnum = dumpcfg.bitmapsize; 2133 CQ_CLOSE(helperq); 2134 break; 2135 } 2136 2137 HRSTART(ds->perpage, bitmap); 2138 for (; bitnum < dumpcfg.bitmapsize; bitnum++) 2139 if (BT_TEST(dumpcfg.bitmap, bitnum)) 2140 break; 2141 HRSTOP(ds->perpage, bitmap); 2142 dump_timeleft = dump_timeout; 2143 2144 if (bitnum >= dumpcfg.bitmapsize) { 2145 CQ_CLOSE(helperq); 2146 break; 2147 } 2148 2149 /* 2150 * Try to map CBUF_MAPSIZE ranges. Can't 2151 * assume that memory segment size is a 2152 * multiple of CBUF_MAPSIZE. Can't assume that 2153 * the segment starts on a CBUF_MAPSIZE 2154 * boundary. 
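 *
 * A worked example, assuming 4 KB pages (so CBUF_MAPNP == 1024):
 * suppose pfn == 1500 but the segment begins at mpaddr == 1200 with
 * many pages remaining.  P2ALIGN(1500, 1024) == 1024 falls below
 * mpaddr, so base becomes 1200 and baseoff == P2PHASE(1200, 1024) ==
 * 176.  The window then covers pfns 1200..2047, i.e. cp->size ==
 * CBUF_MAPSIZE - ptob(176) bytes, ending on the next 4 MB boundary.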
2155 */ 2156 pfn = dump_bitnum_to_pfn(bitnum, &mlw); 2157 ASSERT(pfn != PFN_INVALID); 2158 ASSERT(bitnum + mlw.mpleft <= dumpcfg.bitmapsize); 2159 2160 base = P2ALIGN(pfn, CBUF_MAPNP); 2161 if (base < mlw.mpaddr) { 2162 base = mlw.mpaddr; 2163 baseoff = P2PHASE(base, CBUF_MAPNP); 2164 } else { 2165 baseoff = 0; 2166 } 2167 2168 pfnoff = pfn - base; 2169 if (pfnoff + mlw.mpleft < CBUF_MAPNP) { 2170 hibitnum = bitnum + mlw.mpleft; 2171 cp->size = ptob(pfnoff + mlw.mpleft); 2172 } else { 2173 hibitnum = bitnum - pfnoff + CBUF_MAPNP - 2174 baseoff; 2175 cp->size = CBUF_MAPSIZE - ptob(baseoff); 2176 } 2177 2178 cp->pfn = pfn; 2179 cp->bitnum = bitnum++; 2180 cp->pagenum = pagenum++; 2181 cp->off = ptob(pfnoff); 2182 2183 for (; bitnum < hibitnum; bitnum++) 2184 if (BT_TEST(dumpcfg.bitmap, bitnum)) 2185 pagenum++; 2186 2187 dump_timeleft = dump_timeout; 2188 cp->used = ptob(pagenum - cp->pagenum); 2189 2190 HRSTART(ds->perpage, map); 2191 hat_devload(kas.a_hat, cp->buf, cp->size, base, 2192 PROT_READ, HAT_LOAD_NOCONSIST); 2193 HRSTOP(ds->perpage, map); 2194 2195 ds->pages_mapped += btop(cp->size); 2196 ds->pages_used += pagenum - cp->pagenum; 2197 2198 CQ_OPEN(mainq); 2199 2200 /* 2201 * If there are no helpers, the main task does 2202 * non-streamed lzjb compression. 2203 */ 2204 if (dumpcfg.clevel == DUMP_CLEVEL_SERIAL) { 2205 dumpsys_lzjb_page(dumpcfg.helper, cp); 2206 } else { 2207 /* pass mapped pages to a helper */ 2208 CQ_PUT(helperq, cp, CBUF_INREADY); 2209 } 2210 2211 /* the last page has been dispatched */ 2212 if (bitnum >= dumpcfg.bitmapsize) 2213 CQ_CLOSE(helperq); 2214 2215 break; 2216 2217 case CBUF_USEDMAP: 2218 2219 ds->npages += btop(cp->used); 2220 2221 HRSTART(ds->perpage, unmap); 2222 hat_unload(kas.a_hat, cp->buf, cp->size, HAT_UNLOAD); 2223 HRSTOP(ds->perpage, unmap); 2224 2225 if (bitnum < dumpcfg.bitmapsize) 2226 CQ_PUT(mainq, cp, CBUF_FREEMAP); 2227 CQ_CLOSE(mainq); 2228 2229 ASSERT(ds->npages <= dumphdr->dump_npages); 2230 ds->percent = ds->npages * 100LL / dumphdr->dump_npages; 2231 break; 2232 2233 case CBUF_WRITE: 2234 2235 CQ_PUT(writerq, cp, CBUF_WRITE); 2236 break; 2237 2238 case CBUF_ERRMSG: 2239 2240 if (cp->used > 0) { 2241 cp->buf[cp->size - 2] = '\n'; 2242 cp->buf[cp->size - 1] = '\0'; 2243 if (ds->neednl) { 2244 uprintf("\n%s", cp->buf); 2245 ds->neednl = 0; 2246 } else { 2247 uprintf("%s", cp->buf); 2248 } 2249 /* wait for console output */ 2250 drv_usecwait(200000); 2251 dump_timeleft = dump_timeout; 2252 } 2253 CQ_PUT(freebufq, cp, CBUF_FREEBUF); 2254 break; 2255 2256 default: 2257 uprintf("dump: unexpected buffer state %d, " 2258 "buffer will be lost\n", cp->state); 2259 break; 2260 2261 } /* end switch */ 2262 } 2263 } 2264 2265 #ifdef COLLECT_METRICS 2266 size_t 2267 dumpsys_metrics(dumpsync_t *ds, char *buf, size_t size) 2268 { 2269 dumpcfg_t *cfg = &dumpcfg; 2270 int myid = CPU->cpu_seqid; 2271 int i, compress_ratio; 2272 int sec, iorate; 2273 helper_t *hp, *hpend = &cfg->helper[cfg->nhelper]; 2274 char *e = buf + size; 2275 char *p = buf; 2276 2277 sec = ds->elapsed / (1000 * 1000 * 1000ULL); 2278 if (sec < 1) 2279 sec = 1; 2280 2281 if (ds->iotime < 1) 2282 ds->iotime = 1; 2283 iorate = (ds->nwrite * 100000ULL) / ds->iotime; 2284 2285 compress_ratio = 100LL * ds->npages / btopr(ds->nwrite + 1); 2286 2287 #define P(...) (p += p < e ?
snprintf(p, e - p, __VA_ARGS__) : 0) 2288 2289 P("Master cpu_seqid,%d\n", CPU->cpu_seqid); 2290 P("Master cpu_id,%d\n", CPU->cpu_id); 2291 P("dump_flags,0x%x\n", dumphdr->dump_flags); 2292 P("dump_ioerr,%d\n", dump_ioerr); 2293 2294 P("Helpers:\n"); 2295 for (i = 0; i < ncpus; i++) { 2296 if ((i & 15) == 0) 2297 P(",,%03d,", i); 2298 if (i == myid) 2299 P(" M"); 2300 else if (BT_TEST(cfg->helpermap, i)) 2301 P("%4d", cpu_seq[i]->cpu_id); 2302 else 2303 P(" *"); 2304 if ((i & 15) == 15) 2305 P("\n"); 2306 } 2307 2308 P("ncbuf_used,%d\n", cfg->ncbuf_used); 2309 P("ncmap,%d\n", cfg->ncmap); 2310 2311 P("Found %ldM ranges,%ld\n", (CBUF_MAPSIZE / DUMP_1MB), cfg->found4m); 2312 P("Found small pages,%ld\n", cfg->foundsm); 2313 2314 P("Compression level,%d\n", cfg->clevel); 2315 P("Compression type,%s lzjb\n", 2316 cfg->clevel == DUMP_CLEVEL_SERIAL ? "serial" : "parallel"); 2317 P("Compression ratio,%d.%02d\n", compress_ratio / 100, compress_ratio % 2318 100); 2319 P("nhelper_used,%d\n", cfg->nhelper_used); 2320 2321 P("Dump I/O rate MBS,%d.%02d\n", iorate / 100, iorate % 100); 2322 P("..total bytes,%lld\n", (u_longlong_t)ds->nwrite); 2323 P("..total nsec,%lld\n", (u_longlong_t)ds->iotime); 2324 P("dumpbuf.iosize,%ld\n", dumpbuf.iosize); 2325 P("dumpbuf.size,%ld\n", dumpbuf.size); 2326 2327 P("Dump pages/sec,%llu\n", (u_longlong_t)ds->npages / sec); 2328 P("Dump pages,%llu\n", (u_longlong_t)ds->npages); 2329 P("Dump time,%d\n", sec); 2330 2331 if (ds->pages_mapped > 0) 2332 P("per-cent map utilization,%d\n", (int)((100 * ds->pages_used) 2333 / ds->pages_mapped)); 2334 2335 P("\nPer-page metrics:\n"); 2336 if (ds->npages > 0) { 2337 for (hp = cfg->helper; hp != hpend; hp++) { 2338 #define PERPAGE(x) ds->perpage.x += hp->perpage.x; 2339 PERPAGES; 2340 #undef PERPAGE 2341 } 2342 #define PERPAGE(x) \ 2343 P("%s nsec/page,%d\n", #x, (int)(ds->perpage.x / ds->npages)); 2344 PERPAGES; 2345 #undef PERPAGE 2346 P("freebufq.empty,%d\n", (int)(ds->freebufq.empty / 2347 ds->npages)); 2348 P("helperq.empty,%d\n", (int)(ds->helperq.empty / 2349 ds->npages)); 2350 P("writerq.empty,%d\n", (int)(ds->writerq.empty / 2351 ds->npages)); 2352 P("mainq.empty,%d\n", (int)(ds->mainq.empty / ds->npages)); 2353 2354 P("I/O wait nsec/page,%llu\n", (u_longlong_t)(ds->iowait / 2355 ds->npages)); 2356 } 2357 #undef P 2358 if (p < e) 2359 bzero(p, e - p); 2360 return (p - buf); 2361 } 2362 #endif /* COLLECT_METRICS */ 2363 2364 /* 2365 * Dump the system. 2366 */ 2367 void 2368 dumpsys(void) 2369 { 2370 dumpsync_t *ds = &dumpsync; 2371 taskq_t *livetaskq = NULL; 2372 pfn_t pfn; 2373 pgcnt_t bitnum; 2374 proc_t *p; 2375 helper_t *hp, *hpend = &dumpcfg.helper[dumpcfg.nhelper]; 2376 cbuf_t *cp; 2377 pid_t npids, pidx; 2378 char *content; 2379 char *buf; 2380 size_t size; 2381 int save_dump_clevel; 2382 dumpmlw_t mlw; 2383 dumpcsize_t datatag; 2384 dumpdatahdr_t datahdr; 2385 2386 if (dumpvp == NULL || dumphdr == NULL) { 2387 uprintf("skipping system dump - no dump device configured\n"); 2388 if (panicstr) { 2389 dumpcfg.helpers_wanted = 0; 2390 dumpsys_spinunlock(&dumpcfg.helper_lock); 2391 } 2392 return; 2393 } 2394 dumpbuf.cur = dumpbuf.start; 2395 2396 /* clear the sync variables */ 2397 ASSERT(dumpcfg.nhelper > 0); 2398 bzero(ds, sizeof (*ds)); 2399 ds->dumpcpu = CPU->cpu_id; 2400 2401 /* 2402 * Calculate the starting block for dump. If we're dumping on a 2403 * swap device, start 1/5 of the way in; otherwise, start at the 2404 * beginning. And never use the first page -- it may be a disk label. 
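 *
 * For example, with hypothetical sizes: on a 10 GB swap device,
 * dump_start is P2ROUNDUP(10G / 5, DUMP_OFFSET), i.e. roughly 2 GB
 * into the device; on a dedicated dump device it is just DUMP_OFFSET,
 * which also keeps us off the label in page 0.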
2405 */ 2406 if (dumpvp->v_flag & VISSWAP) 2407 dumphdr->dump_start = P2ROUNDUP(dumpvp_size / 5, DUMP_OFFSET); 2408 else 2409 dumphdr->dump_start = DUMP_OFFSET; 2410 2411 dumphdr->dump_flags = DF_VALID | DF_COMPLETE | DF_LIVE | DF_COMPRESSED; 2412 dumphdr->dump_crashtime = gethrestime_sec(); 2413 dumphdr->dump_npages = 0; 2414 dumphdr->dump_nvtop = 0; 2415 bzero(dumpcfg.bitmap, BT_SIZEOFMAP(dumpcfg.bitmapsize)); 2416 dump_timeleft = dump_timeout; 2417 2418 if (panicstr) { 2419 dumphdr->dump_flags &= ~DF_LIVE; 2420 (void) VOP_DUMPCTL(dumpvp, DUMP_FREE, NULL, NULL); 2421 (void) VOP_DUMPCTL(dumpvp, DUMP_ALLOC, NULL, NULL); 2422 (void) vsnprintf(dumphdr->dump_panicstring, DUMP_PANICSIZE, 2423 panicstr, panicargs); 2424 2425 } 2426 2427 if (dump_conflags & DUMP_ALL) 2428 content = "all"; 2429 else if (dump_conflags & DUMP_CURPROC) 2430 content = "kernel + curproc"; 2431 else 2432 content = "kernel"; 2433 uprintf("dumping to %s, offset %lld, content: %s\n", dumppath, 2434 dumphdr->dump_start, content); 2435 2436 /* Make sure nodename is current */ 2437 bcopy(utsname.nodename, dumphdr->dump_utsname.nodename, SYS_NMLN); 2438 2439 /* 2440 * If this is a live dump, try to open a VCHR vnode for better 2441 * performance. We must take care to flush the buffer cache 2442 * first. 2443 */ 2444 if (!panicstr) { 2445 vnode_t *cdev_vp, *cmn_cdev_vp; 2446 2447 ASSERT(dumpbuf.cdev_vp == NULL); 2448 cdev_vp = makespecvp(VTOS(dumpvp)->s_dev, VCHR); 2449 if (cdev_vp != NULL) { 2450 cmn_cdev_vp = common_specvp(cdev_vp); 2451 if (VOP_OPEN(&cmn_cdev_vp, FREAD | FWRITE, kcred, NULL) 2452 == 0) { 2453 if (vn_has_cached_data(dumpvp)) 2454 (void) pvn_vplist_dirty(dumpvp, 0, NULL, 2455 B_INVAL | B_TRUNC, kcred); 2456 dumpbuf.cdev_vp = cmn_cdev_vp; 2457 } else { 2458 VN_RELE(cdev_vp); 2459 } 2460 } 2461 } 2462 2463 /* 2464 * Store a hires timestamp so we can look it up during debugging. 2465 */ 2466 lbolt_debug_entry(); 2467 2468 /* 2469 * Leave room for the message and ereport save areas and terminal dump 2470 * header. 2471 */ 2472 dumpbuf.vp_limit = dumpvp_size - DUMP_LOGSIZE - DUMP_OFFSET - 2473 DUMP_ERPTSIZE; 2474 2475 /* 2476 * Write out the symbol table. It's no longer compressed, 2477 * so its 'size' and 'csize' are equal. 2478 */ 2479 dumpbuf.vp_off = dumphdr->dump_ksyms = dumphdr->dump_start + PAGESIZE; 2480 dumphdr->dump_ksyms_size = dumphdr->dump_ksyms_csize = 2481 ksyms_snapshot(dumpvp_ksyms_write, NULL, LONG_MAX); 2482 2483 /* 2484 * Write out the translation map. 2485 */ 2486 dumphdr->dump_map = dumpvp_flush(); 2487 dump_as(&kas); 2488 dumphdr->dump_nvtop += dump_plat_addr(); 2489 2490 /* 2491 * call into hat, which may have unmapped pages that also need to 2492 * be in the dump 2493 */ 2494 hat_dump(); 2495 2496 if (dump_conflags & DUMP_ALL) { 2497 mutex_enter(&pidlock); 2498 2499 for (npids = 0, p = practive; p != NULL; p = p->p_next) 2500 dumpcfg.pids[npids++] = p->p_pid; 2501 2502 mutex_exit(&pidlock); 2503 2504 for (pidx = 0; pidx < npids; pidx++) 2505 (void) dump_process(dumpcfg.pids[pidx]); 2506 2507 dump_init_memlist_walker(&mlw); 2508 for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum++) { 2509 dump_timeleft = dump_timeout; 2510 pfn = dump_bitnum_to_pfn(bitnum, &mlw); 2511 /* 2512 * Some hypervisors do not have all pages available to 2513 * be accessed by the guest OS. Check for page 2514 * accessibility. 
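 *
 * Pages for which plat_hold_page() does not return PLAT_HOLD_OK
 * are simply skipped: their bits are never set in the bitmap, so
 * the main task never attempts to map or copy them.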
2515 */ 2516 if (plat_hold_page(pfn, PLAT_HOLD_NO_LOCK, NULL) != 2517 PLAT_HOLD_OK) 2518 continue; 2519 BT_SET(dumpcfg.bitmap, bitnum); 2520 } 2521 dumphdr->dump_npages = dumpcfg.bitmapsize; 2522 dumphdr->dump_flags |= DF_ALL; 2523 2524 } else if (dump_conflags & DUMP_CURPROC) { 2525 /* 2526 * Determine which pid is to be dumped. If we're panicking, we 2527 * dump the process associated with panic_thread (if any). If 2528 * this is a live dump, we dump the process associated with 2529 * curthread. 2530 */ 2531 npids = 0; 2532 if (panicstr) { 2533 if (panic_thread != NULL && 2534 panic_thread->t_procp != NULL && 2535 panic_thread->t_procp != &p0) { 2536 dumpcfg.pids[npids++] = 2537 panic_thread->t_procp->p_pid; 2538 } 2539 } else { 2540 dumpcfg.pids[npids++] = curthread->t_procp->p_pid; 2541 } 2542 2543 if (npids && dump_process(dumpcfg.pids[0]) == 0) 2544 dumphdr->dump_flags |= DF_CURPROC; 2545 else 2546 dumphdr->dump_flags |= DF_KERNEL; 2547 2548 } else { 2549 dumphdr->dump_flags |= DF_KERNEL; 2550 } 2551 2552 dumphdr->dump_hashmask = (1 << highbit(dumphdr->dump_nvtop - 1)) - 1; 2553 2554 /* 2555 * Write out the pfn table. 2556 */ 2557 dumphdr->dump_pfn = dumpvp_flush(); 2558 dump_init_memlist_walker(&mlw); 2559 for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum++) { 2560 dump_timeleft = dump_timeout; 2561 if (!BT_TEST(dumpcfg.bitmap, bitnum)) 2562 continue; 2563 pfn = dump_bitnum_to_pfn(bitnum, &mlw); 2564 ASSERT(pfn != PFN_INVALID); 2565 dumpvp_write(&pfn, sizeof (pfn_t)); 2566 } 2567 dump_plat_pfn(); 2568 2569 /* 2570 * Write out all the pages. 2571 * Map pages, copy them handling UEs, compress, and write them out. 2572 * Cooperate with any helpers running on CPUs in panic_idle(). 2573 */ 2574 dumphdr->dump_data = dumpvp_flush(); 2575 2576 bzero(dumpcfg.helpermap, BT_SIZEOFMAP(NCPU)); 2577 ds->live = dumpcfg.clevel > DUMP_CLEVEL_SERIAL && 2578 (dumphdr->dump_flags & DF_LIVE) != 0; 2579 2580 save_dump_clevel = dumpcfg.clevel; 2581 if (panicstr) 2582 dumpsys_get_maxmem(); 2583 2584 dumpcfg.nhelper_used = 0; 2585 for (hp = dumpcfg.helper; hp != hpend; hp++) { 2586 if (hp->page == NULL) { 2587 hp->helper = DONEHELPER; 2588 continue; 2589 } 2590 ++dumpcfg.nhelper_used; 2591 hp->helper = FREEHELPER; 2592 hp->taskqid = NULL; 2593 hp->ds = ds; 2594 bzero(&hp->perpage, sizeof (hp->perpage)); 2595 } 2596 2597 CQ_OPEN(freebufq); 2598 CQ_OPEN(helperq); 2599 2600 dumpcfg.ncbuf_used = 0; 2601 for (cp = dumpcfg.cbuf; cp != &dumpcfg.cbuf[dumpcfg.ncbuf]; cp++) { 2602 if (cp->buf != NULL) { 2603 CQ_PUT(freebufq, cp, CBUF_FREEBUF); 2604 ++dumpcfg.ncbuf_used; 2605 } 2606 } 2607 2608 for (cp = dumpcfg.cmap; cp != &dumpcfg.cmap[dumpcfg.ncmap]; cp++) 2609 CQ_PUT(mainq, cp, CBUF_FREEMAP); 2610 2611 ds->start = gethrtime(); 2612 ds->iowaitts = ds->start; 2613 2614 /* start helpers */ 2615 if (ds->live) { 2616 int n = dumpcfg.nhelper_used; 2617 int pri = MINCLSYSPRI - 25; 2618 2619 livetaskq = taskq_create("LiveDump", n, pri, n, n, 2620 TASKQ_PREPOPULATE); 2621 for (hp = dumpcfg.helper; hp != hpend; hp++) { 2622 if (hp->page == NULL) 2623 continue; 2624 hp->helper = hp - dumpcfg.helper; 2625 hp->taskqid = taskq_dispatch(livetaskq, 2626 dumpsys_live_helper, (void *)hp, TQ_NOSLEEP); 2627 } 2628 2629 } else { 2630 if (panicstr) 2631 kmem_dump_begin(); 2632 dumpcfg.helpers_wanted = dumpcfg.clevel > DUMP_CLEVEL_SERIAL; 2633 dumpsys_spinunlock(&dumpcfg.helper_lock); 2634 } 2635 2636 /* run main task */ 2637 dumpsys_main_task(ds); 2638 2639 ds->elapsed = gethrtime() - ds->start; 2640 if (ds->elapsed < 1) 2641 
ds->elapsed = 1; 2642 2643 if (livetaskq != NULL) 2644 taskq_destroy(livetaskq); 2645 2646 if (ds->neednl) { 2647 uprintf("\n"); 2648 ds->neednl = 0; 2649 } 2650 2651 /* record actual pages dumped */ 2652 dumphdr->dump_npages = ds->npages; 2653 2654 /* platform-specific data */ 2655 dumphdr->dump_npages += dump_plat_data(dumpcfg.cbuf[0].buf); 2656 2657 /* note any errors by clearing DF_COMPLETE */ 2658 if (dump_ioerr || ds->npages < dumphdr->dump_npages) 2659 dumphdr->dump_flags &= ~DF_COMPLETE; 2660 2661 /* end of stream blocks */ 2662 datatag = 0; 2663 dumpvp_write(&datatag, sizeof (datatag)); 2664 2665 bzero(&datahdr, sizeof (datahdr)); 2666 2667 /* buffer for metrics */ 2668 buf = dumpcfg.cbuf[0].buf; 2669 size = MIN(dumpcfg.cbuf[0].size, DUMP_OFFSET - sizeof (dumphdr_t) - 2670 sizeof (dumpdatahdr_t)); 2671 2672 /* finish the kmem intercepts, collect kmem verbose info */ 2673 if (panicstr) { 2674 datahdr.dump_metrics = kmem_dump_finish(buf, size); 2675 buf += datahdr.dump_metrics; 2676 size -= datahdr.dump_metrics; 2677 } 2678 2679 /* record in the header whether this is a fault-management panic */ 2680 if (panicstr) 2681 dumphdr->dump_fm_panic = is_fm_panic(); 2682 2683 /* compression info in data header */ 2684 datahdr.dump_datahdr_magic = DUMP_DATAHDR_MAGIC; 2685 datahdr.dump_datahdr_version = DUMP_DATAHDR_VERSION; 2686 datahdr.dump_maxcsize = CBUF_SIZE; 2687 datahdr.dump_maxrange = CBUF_MAPSIZE / PAGESIZE; 2688 datahdr.dump_nstreams = dumpcfg.nhelper_used; 2689 datahdr.dump_clevel = dumpcfg.clevel; 2690 #ifdef COLLECT_METRICS 2691 if (dump_metrics_on) 2692 datahdr.dump_metrics += dumpsys_metrics(ds, buf, size); 2693 #endif 2694 datahdr.dump_data_csize = dumpvp_flush() - dumphdr->dump_data; 2695 2696 /* 2697 * Write out the initial and terminal dump headers. 2698 */ 2699 dumpbuf.vp_off = dumphdr->dump_start; 2700 dumpvp_write(dumphdr, sizeof (dumphdr_t)); 2701 (void) dumpvp_flush(); 2702 2703 dumpbuf.vp_limit = dumpvp_size; 2704 dumpbuf.vp_off = dumpbuf.vp_limit - DUMP_OFFSET; 2705 dumpvp_write(dumphdr, sizeof (dumphdr_t)); 2706 dumpvp_write(&datahdr, sizeof (dumpdatahdr_t)); 2707 dumpvp_write(dumpcfg.cbuf[0].buf, datahdr.dump_metrics); 2708 2709 (void) dumpvp_flush(); 2710 2711 uprintf("\r%3d%% done: %llu pages dumped, ", 2712 ds->percent_done, (u_longlong_t)ds->npages); 2713 2714 if (dump_ioerr == 0) { 2715 uprintf("dump succeeded\n"); 2716 } else { 2717 uprintf("dump failed: error %d\n", dump_ioerr); 2718 #ifdef DEBUG 2719 if (panicstr) 2720 debug_enter("dump failed"); 2721 #endif 2722 } 2723 2724 /* 2725 * Write out all undelivered messages. This has to be the *last* 2726 * thing we do because the dump process itself emits messages. 2727 */ 2728 if (panicstr) { 2729 dump_summary(); 2730 dump_ereports(); 2731 dump_messages(); 2732 } 2733 2734 delay(2 * hz); /* let people see the 'done' message */ 2735 dump_timeleft = 0; 2736 dump_ioerr = 0; 2737 2738 /* restore settings after live dump completes */ 2739 if (!panicstr) { 2740 dumpcfg.clevel = save_dump_clevel; 2741 2742 /* release any VCHR open of the dump device */ 2743 if (dumpbuf.cdev_vp != NULL) { 2744 (void) VOP_CLOSE(dumpbuf.cdev_vp, FREAD | FWRITE, 1, 0, 2745 kcred, NULL); 2746 VN_RELE(dumpbuf.cdev_vp); 2747 dumpbuf.cdev_vp = NULL; 2748 } 2749 } 2750 } 2751 2752 /* 2753 * This function is called whenever the memory size, as represented 2754 * by the phys_install list, changes. 
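 *
 * Everything that scales with physical memory is re-derived under
 * dump_lock: the dump header and bitmap (dumphdr_init()), the I/O
 * buffer (dumpbuf_resize()), and the compression configuration
 * (dump_update_clevel()).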
2755 */ 2756 void 2757 dump_resize() 2758 { 2759 mutex_enter(&dump_lock); 2760 dumphdr_init(); 2761 dumpbuf_resize(); 2762 dump_update_clevel(); 2763 mutex_exit(&dump_lock); 2764 } 2765 2766 /* 2767 * This function allows for dynamic resizing of a dump area. It assumes that 2768 * the underlying device has updated its size(9P) property. 2769 */ 2770 int 2771 dumpvp_resize() 2772 { 2773 int error; 2774 vattr_t vattr; 2775 2776 mutex_enter(&dump_lock); 2777 vattr.va_mask = AT_SIZE; 2778 if ((error = VOP_GETATTR(dumpvp, &vattr, 0, kcred, NULL)) != 0) { 2779 mutex_exit(&dump_lock); 2780 return (error); 2781 } 2782 2783 if (vattr.va_size < 2 * DUMP_LOGSIZE + DUMP_ERPTSIZE) { 2784 mutex_exit(&dump_lock); 2785 return (ENOSPC); 2786 } 2787 2788 dumpvp_size = vattr.va_size & -DUMP_OFFSET; 2789 mutex_exit(&dump_lock); 2790 return (0); 2791 } 2792 2793 int 2794 dump_set_uuid(const char *uuidstr) 2795 { 2796 const char *ptr; 2797 int i; 2798 2799 if (uuidstr == NULL || strnlen(uuidstr, 36 + 1) != 36) 2800 return (EINVAL); 2801 2802 /* uuid_parse is not common code so check manually */ 2803 for (i = 0, ptr = uuidstr; i < 36; i++, ptr++) { 2804 switch (i) { 2805 case 8: 2806 case 13: 2807 case 18: 2808 case 23: 2809 if (*ptr != '-') 2810 return (EINVAL); 2811 break; 2812 2813 default: 2814 if (!isxdigit(*ptr)) 2815 return (EINVAL); 2816 break; 2817 } 2818 } 2819 2820 if (dump_osimage_uuid[0] != '\0') 2821 return (EALREADY); 2822 2823 (void) strncpy(dump_osimage_uuid, uuidstr, 36 + 1); 2824 2825 cmn_err(CE_CONT, "?This Solaris instance has UUID %s\n", 2826 dump_osimage_uuid); 2827 2828 return (0); 2829 } 2830 2831 const char * 2832 dump_get_uuid(void) 2833 { 2834 return (dump_osimage_uuid[0] != '\0' ? dump_osimage_uuid : ""); 2835 }