1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2018 Joyent, Inc.
  25  * Copyright 2018 Nexenta Systems, Inc. All rights reserved.
  26  */
  27 
  28 #include <sys/types.h>
  29 #include <sys/param.h>
  30 #include <sys/systm.h>
  31 #include <sys/vm.h>
  32 #include <sys/proc.h>
  33 #include <sys/file.h>
  34 #include <sys/conf.h>
  35 #include <sys/kmem.h>
  36 #include <sys/mem.h>
  37 #include <sys/mman.h>
  38 #include <sys/vnode.h>
  39 #include <sys/errno.h>
  40 #include <sys/memlist.h>
  41 #include <sys/dumphdr.h>
  42 #include <sys/dumpadm.h>
  43 #include <sys/ksyms.h>
  44 #include <sys/compress.h>
  45 #include <sys/stream.h>
  46 #include <sys/strsun.h>
  47 #include <sys/cmn_err.h>
  48 #include <sys/bitmap.h>
  49 #include <sys/modctl.h>
  50 #include <sys/utsname.h>
  51 #include <sys/systeminfo.h>
  52 #include <sys/vmem.h>
  53 #include <sys/log.h>
  54 #include <sys/var.h>
  55 #include <sys/debug.h>
  56 #include <sys/sunddi.h>
  57 #include <fs/fs_subr.h>
  58 #include <sys/fs/snode.h>
  59 #include <sys/ontrap.h>
  60 #include <sys/panic.h>
  61 #include <sys/dkio.h>
  62 #include <sys/vtoc.h>
  63 #include <sys/errorq.h>
  64 #include <sys/fm/util.h>
  65 #include <sys/fs/zfs.h>
  66 
  67 #include <vm/hat.h>
  68 #include <vm/as.h>
  69 #include <vm/page.h>
  70 #include <vm/pvn.h>
  71 #include <vm/seg.h>
  72 #include <vm/seg_kmem.h>
  73 #include <sys/clock_impl.h>
  74 #include <sys/hold_page.h>
  75 #include <sys/cpu.h>
  76 
  77 #define ONE_GIG (1024 * 1024 * 1024UL)
  78 
  79 /*
  80  * Parallel Dump:
  81  * CPUs that are otherwise idle during panic are employed to parallelize
  82  * the compression task. I/O and compression are performed by different
  83  * CPUs, and are hence overlapped in time, unlike the older serial code.
  84  */
  85 
  86 /*
  87  * exported vars
  88  */
  89 kmutex_t        dump_lock;              /* lock for dump configuration */
  90 dumphdr_t       *dumphdr;               /* dump header */
  91 int             dump_conflags = DUMP_KERNEL; /* dump configuration flags */
  92 vnode_t         *dumpvp;                /* dump device vnode pointer */
  93 u_offset_t      dumpvp_size;            /* size of dump device, in bytes */
  94 char            *dumppath;              /* pathname of dump device */
  95 int             dump_timeout = 120;     /* timeout for dumping pages */
  96 int             dump_timeleft;          /* portion of dump_timeout remaining */
  97 int             dump_ioerr;             /* dump i/o error */
  98 int             dump_check_used;        /* enable check for used pages */
  99 char            *dump_stack_scratch; /* scratch area for saving stack summary */
 100 
 101 /*
 102  * Tunables for dump compression and parallelism.
 103  * These can be set via /etc/system.
 104  *
 105  * dump_ncpu_low:
 106  * This is the minimum configuration for parallel lzjb.
 107  * A special value of 0 means that parallel dump will not be used.
 108  *
 109  * dump_metrics_on:
 110  * If set, metrics are collected in the kernel, passed to savecore
 111  * via the dump file, and recorded by savecore in METRICS.txt.
 112  */
 113 uint_t dump_ncpu_low = 4;       /* minimum config for parallel lzjb */
 114 
 115 /* tunables for pre-reserved heap */
 116 uint_t dump_kmem_permap = 1024;
 117 uint_t dump_kmem_pages = 0;
 118 
 119 /* Define multiple buffers per helper to avoid stalling */
 120 #define NCBUF_PER_HELPER        2
 121 #define NCMAP_PER_HELPER        4
 122 
 123 /* minimum number of helpers configured */
 124 #define MINHELPERS      (MAX(dump_ncpu_low, 1))
 125 #define MINCBUFS        (MINHELPERS * NCBUF_PER_HELPER)
 126 
 127 /*
 128  * Define constant parameters.
 129  *
 130  * CBUF_SIZE            size of an output buffer
 131  *
 132  * CBUF_MAPSIZE         size of virtual range for mapping pages
 133  *
 134  * CBUF_MAPNP           size of virtual range in pages
 135  *
 136  */
 137 #define DUMP_1KB        ((size_t)1 << 10)
 138 #define DUMP_1MB        ((size_t)1 << 20)
 139 #define CBUF_SIZE       ((size_t)1 << 17)
 140 #define CBUF_MAPSHIFT   (22)
 141 #define CBUF_MAPSIZE    ((size_t)1 << CBUF_MAPSHIFT)
 142 #define CBUF_MAPNP      ((size_t)1 << (CBUF_MAPSHIFT - PAGESHIFT))
 143 
 144 /*
 145  * Compression metrics are accumulated nano-second subtotals. The
 146  * results are normalized by the number of pages dumped. A report is
 147  * generated when dumpsys() completes and is saved in the dump image
 148  * after the trailing dump header.
 149  *
 150  * Metrics are always collected. Set the variable dump_metrics_on to
 151  * cause metrics to be saved in the crash file, where savecore will
 152  * save it in the file METRICS.txt.
 153  */
 154 #define PERPAGES \
 155         PERPAGE(bitmap) PERPAGE(map) PERPAGE(unmap) \
 156         PERPAGE(copy) PERPAGE(compress) \
 157         PERPAGE(write) \
 158         PERPAGE(inwait) PERPAGE(outwait)
 159 
 160 typedef struct perpage {
 161 #define PERPAGE(x) hrtime_t x;
 162         PERPAGES
 163 #undef PERPAGE
 164 } perpage_t;
 165 
 166 /*
 167  * This macro controls the code generation for collecting dump
 168  * performance information. By default, the code is generated, but
 169  * automatic saving of the information is disabled. If dump_metrics_on
 170  * is set to 1, the timing information is passed to savecore via the
 171  * crash file, where it is appended to the file dump-dir/METRICS.txt.
 172  */
 173 #define COLLECT_METRICS
 174 
 175 #ifdef COLLECT_METRICS
 176 uint_t dump_metrics_on = 0;     /* set to 1 to enable recording metrics */
 177 
 178 #define HRSTART(v, m)           v##ts.m = gethrtime()
 179 #define HRSTOP(v, m)            v.m += gethrtime() - v##ts.m
 180 #define HRBEGIN(v, m, s)        v##ts.m = gethrtime(); v.size += s
 181 #define HREND(v, m)             v.m += gethrtime() - v##ts.m
 182 #define HRNORM(v, m, n)         v.m /= (n)
 183 
 184 #else
 185 #define HRSTART(v, m)
 186 #define HRSTOP(v, m)
 187 #define HRBEGIN(v, m, s)
 188 #define HREND(v, m)
 189 #define HRNORM(v, m, n)
 190 #endif  /* COLLECT_METRICS */
 191 
 192 /*
 193  * Buffers for copying and compressing memory pages.
 194  *
 195  * cbuf_t buffer controllers: used for both input and output.
 196  *
 197  * The buffer state indicates how it is being used:
 198  *
 199  * CBUF_FREEMAP: CBUF_MAPSIZE virtual address range is available for
 200  * mapping input pages.
 201  *
 202  * CBUF_INREADY: input pages are mapped and ready for compression by a
 203  * helper.
 204  *
 205  * CBUF_USEDMAP: mapping has been consumed by a helper. Needs unmap.
 206  *
 207  * CBUF_FREEBUF: CBUF_SIZE output buffer, which is available.
 208  *
 209  * CBUF_WRITE: CBUF_SIZE block of compressed pages from a helper,
 210  * ready to write out.
 211  *
 212  * CBUF_ERRMSG: CBUF_SIZE block of error messages from a helper
 213  * (reports UE errors.)
 214  */
 215 
typedef enum cbufstate {
	CBUF_FREEMAP,	/* VA range available for mapping input pages */
	CBUF_INREADY,	/* input pages mapped; ready for a helper */
	CBUF_USEDMAP,	/* mapping consumed by a helper; needs unmap */
	CBUF_FREEBUF,	/* output buffer available for use */
	CBUF_WRITE,	/* compressed block from a helper, ready to write */
	CBUF_ERRMSG	/* block of error messages from a helper */
} cbufstate_t;

/* forward declaration so struct cbuf can link to itself */
typedef struct cbuf cbuf_t;

/*
 * One buffer controller. The same structure describes both input
 * mappings (CBUF_MAPSIZE VA ranges) and output buffers (CBUF_SIZE
 * kmem blocks); "state" above says which role it currently plays.
 */
struct cbuf {
	cbuf_t *next;			/* next in list */
	cbufstate_t state;		/* processing state */
	size_t used;			/* amount used */
	size_t size;			/* mem size */
	char *buf;			/* kmem or vmem */
	pgcnt_t pagenum;		/* index to pfn map */
	pgcnt_t bitnum;			/* first set bitnum */
	pfn_t pfn;			/* first pfn in mapped range */
	int off;			/* byte offset to first pfn */
};
 238 
 239 static char dump_osimage_uuid[36 + 1];
 240 
 241 #define isdigit(ch)     ((ch) >= '0' && (ch) <= '9')
 242 #define isxdigit(ch)    (isdigit(ch) || ((ch) >= 'a' && (ch) <= 'f') || \
 243                         ((ch) >= 'A' && (ch) <= 'F'))
 244 
 245 /*
 246  * cqueue_t queues: a uni-directional channel for communication
 247  * from the master to helper tasks or vice-versa using put and
 248  * get primitives. Both mappings and data buffers are passed via
 249  * queues. Producers close a queue when done. The number of
 250  * active producers is reference counted so the consumer can
 251  * detect end of data. Concurrent access is mediated by atomic
 252  * operations for panic dump, or mutex/cv for live dump.
 253  *
 * There are four queues, used as follows:
 255  *
 256  * Queue                Dataflow                NewState
 257  * --------------------------------------------------
 258  * mainq                master -> master     FREEMAP
 259  * master has initialized or unmapped an input buffer
 260  * --------------------------------------------------
 261  * helperq              master -> helper     INREADY
 262  * master has mapped input for use by helper
 263  * --------------------------------------------------
 264  * mainq                master <- helper     USEDMAP
 265  * helper is done with input
 266  * --------------------------------------------------
 267  * freebufq             master -> helper     FREEBUF
 268  * master has initialized or written an output buffer
 269  * --------------------------------------------------
 270  * mainq                master <- helper     WRITE
 271  * block of compressed pages from a helper
 272  * --------------------------------------------------
 273  * mainq                master <- helper     ERRMSG
 274  * error messages from a helper (memory error case)
 275  * --------------------------------------------------
 276  * writerq              master <- master     WRITE
 277  * non-blocking queue of blocks to write
 278  * --------------------------------------------------
 279  */
/*
 * One uni-directional channel between master and helper tasks; the
 * dataflow per queue is given in the table above.
 */
typedef struct cqueue {
	cbuf_t *volatile first;		/* first in list */
	cbuf_t *last;			/* last in list */
	hrtime_t ts;			/* timestamp */
	hrtime_t empty;			/* total time empty */
	kmutex_t mutex;			/* live state lock */
	kcondvar_t cv;			/* live wait var */
	lock_t spinlock;		/* panic mode spin lock */
	volatile uint_t open;		/* producer ref count */
} cqueue_t;
 290 
 291 /*
 292  * Convenience macros for using the cqueue functions
 293  * Note that the caller must have defined "dumpsync_t *ds"
 294  */
 295 #define CQ_IS_EMPTY(q)                                  \
 296         (ds->q.first == NULL)
 297 
 298 #define CQ_OPEN(q)                                      \
 299         atomic_inc_uint(&ds->q.open)
 300 
 301 #define CQ_CLOSE(q)                                     \
 302         dumpsys_close_cq(&ds->q, ds->live)
 303 
 304 #define CQ_PUT(q, cp, st)                               \
 305         dumpsys_put_cq(&ds->q, cp, st, ds->live)
 306 
 307 #define CQ_GET(q)                                       \
 308         dumpsys_get_cq(&ds->q, ds->live)
 309 
 310 /*
 311  * Dynamic state when dumpsys() is running.
 312  */
typedef struct dumpsync {
	pgcnt_t npages;			/* subtotal of pages dumped */
	pgcnt_t pages_mapped;		/* subtotal of pages mapped */
	pgcnt_t pages_used;		/* subtotal of pages used per map */
	size_t nwrite;			/* subtotal of bytes written */
	uint_t live;			/* running live dump */
	uint_t neednl;			/* will need to print a newline */
	uint_t percent;			/* dump progress */
	uint_t percent_done;		/* dump progress reported */
	int sec_done;			/* dump progress last report time */
	/* the four cbuf_t channels described in the queue table above */
	cqueue_t freebufq;		/* free kmem bufs for writing */
	cqueue_t mainq;			/* input for main task */
	cqueue_t helperq;		/* input for helpers */
	cqueue_t writerq;		/* input for writer */
	hrtime_t start;			/* start time */
	hrtime_t elapsed;		/* elapsed time when completed */
	hrtime_t iotime;		/* time spent writing nwrite bytes */
	hrtime_t iowait;		/* time spent waiting for output */
	hrtime_t iowaitts;		/* iowait timestamp */
	perpage_t perpage;		/* metrics */
	perpage_t perpagets;		/* metrics timestamps (see HRSTART) */
	int dumpcpu;			/* master cpu */
} dumpsync_t;

static dumpsync_t dumpsync;		/* synchronization vars */
 338 
 339 /*
 340  * helper_t helpers: contains the context for a stream. CPUs run in
 341  * parallel at dump time; each CPU creates a single stream of
 342  * compression data.  Stream data is divided into CBUF_SIZE blocks.
 343  * The blocks are written in order within a stream. But, blocks from
 344  * multiple streams can be interleaved. Each stream is identified by a
 345  * unique tag.
 346  */
typedef struct helper {
	int helper;			/* bound helper id */
	int tag;			/* compression stream tag (unique, >= 1) */
	perpage_t perpage;		/* per page metrics */
	perpage_t perpagets;		/* per page metrics (timestamps) */
	taskqid_t taskqid;		/* live dump task ptr */
	int in, out;			/* buffer offsets */
	cbuf_t *cpin, *cpout, *cperr;	/* cbuf objects in process */
	dumpsync_t *ds;			/* pointer to sync vars */
	size_t used;			/* counts input consumed */
	char *page;			/* buffer for page copy */
	char *lzbuf;			/* lzjb output */
} helper_t;
 360 
 361 #define MAINHELPER      (-1)            /* helper is also the main task */
 362 #define FREEHELPER      (-2)            /* unbound helper */
 363 #define DONEHELPER      (-3)            /* helper finished */
 364 
 365 /*
 366  * configuration vars for dumpsys
 367  */
/*
 * Rebuilt by dump_update_clevel() each time the dump device is
 * configured; the live copy is the static dumpcfg below.
 */
typedef struct dumpcfg {
	int	nhelper;	/* number of helpers */
	int	nhelper_used;	/* actual number of helpers used */
	int	ncmap;		/* number VA pages for compression */
	int	ncbuf;		/* number of bufs for compression */
	int	ncbuf_used;	/* number of bufs in use */
	uint_t	clevel;		/* dump compression level */
	helper_t *helper;	/* array of helpers */
	cbuf_t	*cmap;		/* array of input (map) buffers */
	cbuf_t	*cbuf;		/* array of output buffers */
	ulong_t *helpermap;	/* set of dumpsys helper CPU ids */
	ulong_t *bitmap;	/* bitmap for marking pages to dump */
	ulong_t *rbitmap;	/* bitmap for used CBUF_MAPSIZE ranges */
	pgcnt_t bitmapsize;	/* size of bitmap */
	pgcnt_t rbitmapsize;	/* size of bitmap for ranges */
	pgcnt_t found4m;	/* number ranges allocated by dump */
	pgcnt_t foundsm;	/* number small pages allocated by dump */
	pid_t	*pids;		/* list of process IDs at dump time */
	size_t	maxsize;	/* memory size needed at dump time */
	size_t	maxvmsize;	/* size of reserved VM */
	char	*maxvm;		/* reserved VM for spare pages */
	lock_t	helper_lock;	/* protect helper state */
	char	helpers_wanted;	/* flag to enable parallelism */
} dumpcfg_t;

static dumpcfg_t dumpcfg;	/* config vars */
 394 
 395 /*
 396  * The dump I/O buffer.
 397  *
 * There is one I/O buffer used by dumpvp_write and dumpvp_flush. It is
 399  * sized according to the optimum device transfer speed.
 400  */
typedef struct dumpbuf {
	vnode_t *cdev_vp;	/* VCHR open of the dump device */
	len_t	vp_limit;	/* maximum write offset */
	offset_t vp_off;	/* current dump device offset */
	char	*cur;		/* dump write pointer */
	char	*start;		/* dump buffer address */
	char	*end;		/* dump buffer end */
	size_t	size;		/* size of dumpbuf in bytes */
	size_t	iosize;		/* best transfer size for device */
} dumpbuf_t;

/* resized under dump_lock; see dumpbuf_resize() */
dumpbuf_t dumpbuf;		/* I/O buffer */
 413 
 414 /*
 415  * For parallel dump, defines maximum time main task thread will wait
 416  * for at least one helper to register in dumpcfg.helpermap, before
 417  * assuming there are no helpers and falling back to serial mode.
 418  * Value is chosen arbitrary and provides *really* long wait for any
 419  * available helper to register.
 420  */
 421 #define DUMP_HELPER_MAX_WAIT    1000    /* millisec */
 422 
 423 /*
 424  * The dump I/O buffer must be at least one page, at most xfer_size
 425  * bytes, and should scale with physmem in between.  The transfer size
 426  * passed in will either represent a global default (maxphys) or the
 427  * best size for the device.  The size of the dumpbuf I/O buffer is
 428  * limited by dumpbuf_limit (8MB by default) because the dump
 429  * performance saturates beyond a certain size.  The default is to
 430  * select 1/4096 of the memory.
 431  */
 432 static int      dumpbuf_fraction = 12;  /* memory size scale factor */
 433 static size_t   dumpbuf_limit = 8 * DUMP_1MB;   /* max I/O buf size */
 434 
 435 static size_t
 436 dumpbuf_iosize(size_t xfer_size)
 437 {
 438         size_t iosize = ptob(physmem >> dumpbuf_fraction);
 439 
 440         if (iosize < PAGESIZE)
 441                 iosize = PAGESIZE;
 442         else if (iosize > xfer_size)
 443                 iosize = xfer_size;
 444         if (iosize > dumpbuf_limit)
 445                 iosize = dumpbuf_limit;
 446         return (iosize & PAGEMASK);
 447 }
 448 
 449 /*
 450  * resize the I/O buffer
 451  */
 452 static void
 453 dumpbuf_resize(void)
 454 {
 455         char *old_buf = dumpbuf.start;
 456         size_t old_size = dumpbuf.size;
 457         char *new_buf;
 458         size_t new_size;
 459 
 460         ASSERT(MUTEX_HELD(&dump_lock));
 461 
 462         new_size = dumpbuf_iosize(MAX(dumpbuf.iosize, maxphys));
 463         if (new_size <= old_size)
 464                 return; /* no need to reallocate buffer */
 465 
 466         new_buf = kmem_alloc(new_size, KM_SLEEP);
 467         dumpbuf.size = new_size;
 468         dumpbuf.start = new_buf;
 469         dumpbuf.end = new_buf + new_size;
 470         kmem_free(old_buf, old_size);
 471 }
 472 
 473 /*
 474  * dump_update_clevel is called when dumpadm configures the dump device.
 475  *      Determine the compression level / type
 476  *      - DUMP_CLEVEL_SERIAL is single threaded lzjb
 477  *      - DUMP_CLEVEL_LZJB   is parallel lzjb
 478  *      Calculate number of helpers and buffers.
 479  *      Allocate the minimum configuration for now.
 480  *
 481  * When the dump file is configured we reserve a minimum amount of
 482  * memory for use at crash time. But we reserve VA for all the memory
 483  * we really want in order to do the fastest dump possible. The VA is
 484  * backed by pages not being dumped, according to the bitmap. If
 485  * there is insufficient spare memory, however, we fall back to the
 486  * minimum.
 487  *
 488  * Live dump (savecore -L) always uses the minimum config.
 489  *
 490  * For parallel dumps, the number of helpers is ncpu-1. The CPU
 491  * running panic runs the main task. For single-threaded dumps, the
 492  * panic CPU does lzjb compression (it is tagged as MAINHELPER.)
 493  *
 494  * Need multiple buffers per helper so that they do not block waiting
 495  * for the main task.
 496  *                              parallel        single-threaded
 497  * Number of output buffers:    nhelper*2               1
 498  * Number of mapping buffers:   nhelper*4               1
 499  *
 500  */
 501 static void
 502 dump_update_clevel()
 503 {
 504         int tag;
 505         helper_t *hp, *hpend;
 506         cbuf_t *cp, *cpend;
 507         dumpcfg_t *old = &dumpcfg;
 508         dumpcfg_t newcfg = *old;
 509         dumpcfg_t *new = &newcfg;
 510 
 511         ASSERT(MUTEX_HELD(&dump_lock));
 512 
 513         /*
 514          * Free the previously allocated bufs and VM.
 515          */
 516         if (old->helper != NULL) {
 517 
 518                 /* helpers */
 519                 hpend = &old->helper[old->nhelper];
 520                 for (hp = old->helper; hp != hpend; hp++) {
 521                         if (hp->lzbuf != NULL)
 522                                 kmem_free(hp->lzbuf, PAGESIZE);
 523                         if (hp->page != NULL)
 524                                 kmem_free(hp->page, PAGESIZE);
 525                 }
 526                 kmem_free(old->helper, old->nhelper * sizeof (helper_t));
 527 
 528                 /* VM space for mapping pages */
 529                 cpend = &old->cmap[old->ncmap];
 530                 for (cp = old->cmap; cp != cpend; cp++)
 531                         vmem_xfree(heap_arena, cp->buf, CBUF_MAPSIZE);
 532                 kmem_free(old->cmap, old->ncmap * sizeof (cbuf_t));
 533 
 534                 /* output bufs */
 535                 cpend = &old->cbuf[old->ncbuf];
 536                 for (cp = old->cbuf; cp != cpend; cp++)
 537                         if (cp->buf != NULL)
 538                                 kmem_free(cp->buf, cp->size);
 539                 kmem_free(old->cbuf, old->ncbuf * sizeof (cbuf_t));
 540 
 541                 /* reserved VM for dumpsys_get_maxmem */
 542                 if (old->maxvmsize > 0)
 543                         vmem_xfree(heap_arena, old->maxvm, old->maxvmsize);
 544         }
 545 
 546         /*
 547          * Allocate memory and VM.
 548          * One CPU runs dumpsys, the rest are helpers.
 549          */
 550         new->nhelper = ncpus - 1;
 551         if (new->nhelper < 1)
 552                 new->nhelper = 1;
 553 
 554         if (new->nhelper > DUMP_MAX_NHELPER)
 555                 new->nhelper = DUMP_MAX_NHELPER;
 556 
 557         /* If dump_ncpu_low is 0 or greater than ncpus, do serial dump */
 558         if (dump_ncpu_low == 0 || dump_ncpu_low > ncpus || new->nhelper < 2) {
 559                 new->clevel = DUMP_CLEVEL_SERIAL;
 560                 new->nhelper = 1;
 561                 new->ncbuf = 1;
 562                 new->ncmap = 1;
 563         } else {
 564                 new->clevel = DUMP_CLEVEL_LZJB;
 565                 new->ncbuf = NCBUF_PER_HELPER * new->nhelper;
 566                 new->ncmap = NCMAP_PER_HELPER * new->nhelper;
 567         }
 568 
 569         /*
 570          * Allocate new data structures and buffers for MINHELPERS,
 571          * and also figure the max desired size.
 572          */
 573         new->maxsize = 0;
 574         new->maxvmsize = 0;
 575         new->maxvm = NULL;
 576         tag = 1;
 577         new->helper = kmem_zalloc(new->nhelper * sizeof (helper_t), KM_SLEEP);
 578         hpend = &new->helper[new->nhelper];
 579         for (hp = new->helper; hp != hpend; hp++) {
 580                 hp->tag = tag++;
 581                 if (hp < &new->helper[MINHELPERS]) {
 582                         hp->lzbuf = kmem_alloc(PAGESIZE, KM_SLEEP);
 583                         hp->page = kmem_alloc(PAGESIZE, KM_SLEEP);
 584                 } else  {
 585                         new->maxsize += 2 * PAGESIZE;
 586                 }
 587         }
 588 
 589         new->cbuf = kmem_zalloc(new->ncbuf * sizeof (cbuf_t), KM_SLEEP);
 590         cpend = &new->cbuf[new->ncbuf];
 591         for (cp = new->cbuf; cp != cpend; cp++) {
 592                 cp->state = CBUF_FREEBUF;
 593                 cp->size = CBUF_SIZE;
 594                 if (cp < &new->cbuf[MINCBUFS])
 595                         cp->buf = kmem_alloc(cp->size, KM_SLEEP);
 596                 else
 597                         new->maxsize += cp->size;
 598         }
 599 
 600         new->cmap = kmem_zalloc(new->ncmap * sizeof (cbuf_t), KM_SLEEP);
 601         cpend = &new->cmap[new->ncmap];
 602         for (cp = new->cmap; cp != cpend; cp++) {
 603                 cp->state = CBUF_FREEMAP;
 604                 cp->size = CBUF_MAPSIZE;
 605                 cp->buf = vmem_xalloc(heap_arena, CBUF_MAPSIZE, CBUF_MAPSIZE,
 606                     0, 0, NULL, NULL, VM_SLEEP);
 607         }
 608 
 609         /* reserve VA to be backed with spare pages at crash time */
 610         if (new->maxsize > 0) {
 611                 new->maxsize = P2ROUNDUP(new->maxsize, PAGESIZE);
 612                 new->maxvmsize = P2ROUNDUP(new->maxsize, CBUF_MAPSIZE);
 613                 new->maxvm = vmem_xalloc(heap_arena, new->maxvmsize,
 614                     CBUF_MAPSIZE, 0, 0, NULL, NULL, VM_SLEEP);
 615         }
 616 
 617         /*
 618          * Reserve memory for kmem allocation calls made during crash dump.  The
 619          * hat layer allocates memory for each mapping created, and the I/O path
 620          * allocates buffers and data structs.
 621          *
 622          * On larger systems, we easily exceed the lower amount, so we need some
 623          * more space; the cut-over point is relatively arbitrary.  If we run
 624          * out, the only impact is that kmem state in the dump becomes
 625          * inconsistent.
 626          */
 627 
 628         if (dump_kmem_pages == 0) {
 629                 if (physmem > (16 * ONE_GIG) / PAGESIZE)
 630                         dump_kmem_pages = 20;
 631                 else
 632                         dump_kmem_pages = 8;
 633         }
 634 
 635         kmem_dump_init((new->ncmap * dump_kmem_permap) +
 636             (dump_kmem_pages * PAGESIZE));
 637 
 638         /* set new config pointers */
 639         *old = *new;
 640 }
 641 
 642 /*
 643  * Define a struct memlist walker to optimize bitnum to pfn
 644  * lookup. The walker maintains the state of the list traversal.
 645  */
/*
 * Caches the traversal position so that (mostly) sequential bitnum
 * lookups do not rescan the memlist from the head each time.
 */
typedef struct dumpmlw {
	struct memlist	*mp;		/* current memlist */
	pgcnt_t		basenum;	/* bitnum base offset */
	pgcnt_t		mppages;	/* current memlist size */
	pgcnt_t		mpleft;		/* size to end of current memlist */
	pfn_t		mpaddr;		/* first pfn in memlist */
} dumpmlw_t;
 653 
 654 /* initialize the walker */
 655 static inline void
 656 dump_init_memlist_walker(dumpmlw_t *pw)
 657 {
 658         pw->mp = phys_install;
 659         pw->basenum = 0;
 660         pw->mppages = pw->mp->ml_size >> PAGESHIFT;
 661         pw->mpleft = pw->mppages;
 662         pw->mpaddr = pw->mp->ml_address >> PAGESHIFT;
 663 }
 664 
 665 /*
 666  * Lookup pfn given bitnum. The memlist can be quite long on some
 667  * systems (e.g.: one per board). To optimize sequential lookups, the
 668  * caller initializes and presents a memlist walker.
 669  */
 670 static pfn_t
 671 dump_bitnum_to_pfn(pgcnt_t bitnum, dumpmlw_t *pw)
 672 {
 673         bitnum -= pw->basenum;
 674         while (pw->mp != NULL) {
 675                 if (bitnum < pw->mppages) {
 676                         pw->mpleft = pw->mppages - bitnum;
 677                         return (pw->mpaddr + bitnum);
 678                 }
 679                 bitnum -= pw->mppages;
 680                 pw->basenum += pw->mppages;
 681                 pw->mp = pw->mp->ml_next;
 682                 if (pw->mp != NULL) {
 683                         pw->mppages = pw->mp->ml_size >> PAGESHIFT;
 684                         pw->mpleft = pw->mppages;
 685                         pw->mpaddr = pw->mp->ml_address >> PAGESHIFT;
 686                 }
 687         }
 688         return (PFN_INVALID);
 689 }
 690 
 691 static pgcnt_t
 692 dump_pfn_to_bitnum(pfn_t pfn)
 693 {
 694         struct memlist *mp;
 695         pgcnt_t bitnum = 0;
 696 
 697         for (mp = phys_install; mp != NULL; mp = mp->ml_next) {
 698                 if (pfn >= (mp->ml_address >> PAGESHIFT) &&
 699                     pfn < ((mp->ml_address + mp->ml_size) >> PAGESHIFT))
 700                         return (bitnum + pfn - (mp->ml_address >> PAGESHIFT));
 701                 bitnum += mp->ml_size >> PAGESHIFT;
 702         }
 703         return ((pgcnt_t)-1);
 704 }
 705 
 706 /*
 707  * Set/test bitmap for a CBUF_MAPSIZE range which includes pfn. The
 708  * mapping of pfn to range index is imperfect because pfn and bitnum
 709  * do not have the same phase. To make sure a CBUF_MAPSIZE range is
 710  * covered, call this for both ends:
 711  *      dump_set_used(base)
 712  *      dump_set_used(base+CBUF_MAPNP-1)
 713  *
 714  * This is used during a panic dump to mark pages allocated by
 715  * dumpsys_get_maxmem(). The macro IS_DUMP_PAGE(pp) is used by
 716  * page_get_mnode_freelist() to make sure pages used by dump are never
 717  * allocated.
 718  */
 719 #define CBUF_MAPP2R(pfn)        ((pfn) >> (CBUF_MAPSHIFT - PAGESHIFT))
 720 
 721 static void
 722 dump_set_used(pfn_t pfn)
 723 {
 724 
 725         pgcnt_t bitnum, rbitnum;
 726 
 727         bitnum = dump_pfn_to_bitnum(pfn);
 728         ASSERT(bitnum != (pgcnt_t)-1);
 729 
 730         rbitnum = CBUF_MAPP2R(bitnum);
 731         ASSERT(rbitnum < dumpcfg.rbitmapsize);
 732 
 733         BT_SET(dumpcfg.rbitmap, rbitnum);
 734 }
 735 
 736 int
 737 dump_test_used(pfn_t pfn)
 738 {
 739         pgcnt_t bitnum, rbitnum;
 740 
 741         bitnum = dump_pfn_to_bitnum(pfn);
 742         ASSERT(bitnum != (pgcnt_t)-1);
 743 
 744         rbitnum = CBUF_MAPP2R(bitnum);
 745         ASSERT(rbitnum < dumpcfg.rbitmapsize);
 746 
 747         return (BT_TEST(dumpcfg.rbitmap, rbitnum));
 748 }
 749 
 750 /*
 751  * Perform additional checks on the page to see if we can really use
 752  * it. The kernel (kas) pages are always set in the bitmap. However,
 753  * boot memory pages (prom_ppages or P_BOOTPAGES) are not in the
 754  * bitmap. So we check for them.
 755  */
static inline int
dump_pfn_check(pfn_t pfn)
{
	/* reject pfns with no page_t, or a stale pfn<->page_t binding */
	page_t *pp = page_numtopp_nolock(pfn);
	if (pp == NULL || pp->p_pagenum != pfn ||
#if defined(__sparc)
	    /* boot (PROM) pages are not in the bitmap; never use them */
	    pp->p_vnode == &promvp ||
#else
	    PP_ISBOOTPAGES(pp) ||
#endif
	    /* avoid pages with pending memory errors */
	    pp->p_toxic != 0)
		return (0);
	return (1);
}
 770 
 771 /*
 772  * Check a range to see if all contained pages are available and
 773  * return non-zero if the range can be used.
 774  */
 775 static inline int
 776 dump_range_check(pgcnt_t start, pgcnt_t end, pfn_t pfn)
 777 {
 778         for (; start < end; start++, pfn++) {
 779                 if (BT_TEST(dumpcfg.bitmap, start))
 780                         return (0);
 781                 if (!dump_pfn_check(pfn))
 782                         return (0);
 783         }
 784         return (1);
 785 }
 786 
 787 /*
 788  * dumpsys_get_maxmem() is called during panic. Find unused ranges
 789  * and use them for buffers.
 790  * It searches the dump bitmap in 2 passes. The first time it looks
 791  * for CBUF_MAPSIZE ranges. On the second pass it uses small pages.
 792  */
 793 static void
 794 dumpsys_get_maxmem()
 795 {
 796         dumpcfg_t *cfg = &dumpcfg;
 797         cbuf_t *endcp = &cfg->cbuf[cfg->ncbuf];
 798         helper_t *endhp = &cfg->helper[cfg->nhelper];
 799         pgcnt_t bitnum, end;
 800         size_t sz, endsz;
 801         pfn_t pfn, off;
 802         cbuf_t *cp;
 803         helper_t *hp;
 804         dumpmlw_t mlw;
 805         int k;
 806 
 807         /*
 808          * Setting dump_ncpu_low to 0 forces a single threaded dump.
 809          */
 810         if (dump_ncpu_low == 0) {
 811                 cfg->clevel = DUMP_CLEVEL_SERIAL;
 812                 return;
 813         }
 814 
 815         /*
 816          * There may be no point in looking for spare memory. If
 817          * dumping all memory, then none is spare. If doing a serial
 818          * dump, then already have buffers.
 819          */
 820         if (cfg->maxsize == 0 || cfg->clevel == DUMP_CLEVEL_SERIAL ||
 821             (dump_conflags & DUMP_ALL) != 0) {
 822                 return;
 823         }
 824 
 825         sz = 0;
 826         cfg->found4m = 0;
 827         cfg->foundsm = 0;
 828 
 829         /* bitmap of ranges used to estimate which pfns are being used */
 830         bzero(dumpcfg.rbitmap, BT_SIZEOFMAP(dumpcfg.rbitmapsize));
 831 
 832         /* find ranges that are not being dumped to use for buffers */
 833         dump_init_memlist_walker(&mlw);
 834         for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum = end) {
 835                 dump_timeleft = dump_timeout;
 836                 end = bitnum + CBUF_MAPNP;
 837                 pfn = dump_bitnum_to_pfn(bitnum, &mlw);
 838                 ASSERT(pfn != PFN_INVALID);
 839 
 840                 /* skip partial range at end of mem segment */
 841                 if (mlw.mpleft < CBUF_MAPNP) {
 842                         end = bitnum + mlw.mpleft;
 843                         continue;
 844                 }
 845 
 846                 /* skip non aligned pages */
 847                 off = P2PHASE(pfn, CBUF_MAPNP);
 848                 if (off != 0) {
 849                         end -= off;
 850                         continue;
 851                 }
 852 
 853                 if (!dump_range_check(bitnum, end, pfn))
 854                         continue;
 855 
 856                 ASSERT((sz + CBUF_MAPSIZE) <= cfg->maxvmsize);
 857                 hat_devload(kas.a_hat, cfg->maxvm + sz, CBUF_MAPSIZE, pfn,
 858                     PROT_READ | PROT_WRITE, HAT_LOAD_NOCONSIST);
 859                 sz += CBUF_MAPSIZE;
 860                 cfg->found4m++;
 861 
 862                 /* set the bitmap for both ends to be sure to cover the range */
 863                 dump_set_used(pfn);
 864                 dump_set_used(pfn + CBUF_MAPNP - 1);
 865 
 866                 if (sz >= cfg->maxsize)
 867                         goto foundmax;
 868         }
 869 
 870         /* Add small pages if we can't find enough large pages. */
 871         dump_init_memlist_walker(&mlw);
 872         for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum = end) {
 873                 dump_timeleft = dump_timeout;
 874                 end = bitnum + CBUF_MAPNP;
 875                 pfn = dump_bitnum_to_pfn(bitnum, &mlw);
 876                 ASSERT(pfn != PFN_INVALID);
 877 
 878                 /* Find any non-aligned pages at start and end of segment. */
 879                 off = P2PHASE(pfn, CBUF_MAPNP);
 880                 if (mlw.mpleft < CBUF_MAPNP) {
 881                         end = bitnum + mlw.mpleft;
 882                 } else if (off != 0) {
 883                         end -= off;
 884                 } else if (cfg->found4m && dump_test_used(pfn)) {
 885                         continue;
 886                 }
 887 
 888                 for (; bitnum < end; bitnum++, pfn++) {
 889                         dump_timeleft = dump_timeout;
 890                         if (BT_TEST(dumpcfg.bitmap, bitnum))
 891                                 continue;
 892                         if (!dump_pfn_check(pfn))
 893                                 continue;
 894                         ASSERT((sz + PAGESIZE) <= cfg->maxvmsize);
 895                         hat_devload(kas.a_hat, cfg->maxvm + sz, PAGESIZE, pfn,
 896                             PROT_READ | PROT_WRITE, HAT_LOAD_NOCONSIST);
 897                         sz += PAGESIZE;
 898                         cfg->foundsm++;
 899                         dump_set_used(pfn);
 900                         if (sz >= cfg->maxsize)
 901                                 goto foundmax;
 902                 }
 903         }
 904 
 905         /* Allocate memory for as many helpers as we can. */
 906 foundmax:
 907 
 908         /* Byte offsets into memory found and mapped above */
 909         endsz = sz;
 910         sz = 0;
 911 
 912         /* Skip the preallocate output buffers. */
 913         cp = &cfg->cbuf[MINCBUFS];
 914 
 915         /* Loop over all helpers and allocate memory. */
 916         for (hp = cfg->helper; hp < endhp; hp++) {
 917 
 918                 /* Skip preallocated helpers by checking hp->page. */
 919                 if (hp->page == NULL) {
 920                         /* lzjb needs 2 1-page buffers */
 921                         if ((sz + (2 * PAGESIZE)) > endsz)
 922                                 break;
 923                         hp->page = cfg->maxvm + sz;
 924                         sz += PAGESIZE;
 925                         hp->lzbuf = cfg->maxvm + sz;
 926                         sz += PAGESIZE;
 927                 }
 928 
 929                 /*
 930                  * Add output buffers per helper. The number of
 931                  * buffers per helper is determined by the ratio of
 932                  * ncbuf to nhelper.
 933                  */
 934                 for (k = 0; cp < endcp && (sz + CBUF_SIZE) <= endsz &&
 935                     k < NCBUF_PER_HELPER; k++) {
 936                         cp->state = CBUF_FREEBUF;
 937                         cp->size = CBUF_SIZE;
 938                         cp->buf = cfg->maxvm + sz;
 939                         sz += CBUF_SIZE;
 940                         ++cp;
 941                 }
 942         }
 943 
 944         /* Finish allocating output buffers */
 945         for (; cp < endcp && (sz + CBUF_SIZE) <= endsz; cp++) {
 946                 cp->state = CBUF_FREEBUF;
 947                 cp->size = CBUF_SIZE;
 948                 cp->buf = cfg->maxvm + sz;
 949                 sz += CBUF_SIZE;
 950         }
 951 
 952         /* Enable IS_DUMP_PAGE macro, which checks for pages we took. */
 953         if (cfg->found4m || cfg->foundsm)
 954                 dump_check_used = 1;
 955 
 956         ASSERT(sz <= endsz);
 957 }
 958 
/*
 * Initialize (or refresh) the dump header and the dump bitmaps.
 * Called with dump_lock held, from dumpinit().
 */
static void
dumphdr_init(void)
{
	pgcnt_t npages = 0;

	ASSERT(MUTEX_HELD(&dump_lock));

	/*
	 * One-time setup: allocate the dump header, I/O buffer, pid
	 * scratch array, helper map and stack-summary buffer. These
	 * persist for the life of the system.
	 */
	if (dumphdr == NULL) {
		dumphdr = kmem_zalloc(sizeof (dumphdr_t), KM_SLEEP);
		dumphdr->dump_magic = DUMP_MAGIC;
		dumphdr->dump_version = DUMP_VERSION;
		dumphdr->dump_wordsize = DUMP_WORDSIZE;
		dumphdr->dump_pageshift = PAGESHIFT;
		dumphdr->dump_pagesize = PAGESIZE;
		dumphdr->dump_utsname = utsname;
		(void) strcpy(dumphdr->dump_platform, platform);
		dumpbuf.size = dumpbuf_iosize(maxphys);
		dumpbuf.start = kmem_alloc(dumpbuf.size, KM_SLEEP);
		dumpbuf.end = dumpbuf.start + dumpbuf.size;
		dumpcfg.pids = kmem_alloc(v.v_proc * sizeof (pid_t), KM_SLEEP);
		dumpcfg.helpermap = kmem_zalloc(BT_SIZEOFMAP(NCPU), KM_SLEEP);
		LOCK_INIT_HELD(&dumpcfg.helper_lock);
		dump_stack_scratch = kmem_alloc(STACK_BUF_SIZE, KM_SLEEP);
		/*
		 * NOTE(review): strncpy does not NUL-terminate when the
		 * source fills the buffer — presumably dump_uuid is sized
		 * to hold the full UUID plus terminator; confirm.
		 */
		(void) strncpy(dumphdr->dump_uuid, dump_get_uuid(),
		    sizeof (dumphdr->dump_uuid));
	}

	npages = num_phys_pages();

	/*
	 * (Re)allocate the per-page bitmap and the coarser range bitmap
	 * whenever the physical page count has changed since the last
	 * call. The new maps are left uninitialized here; callers zero
	 * or fill them before use.
	 */
	if (dumpcfg.bitmapsize != npages) {
		size_t rlen = CBUF_MAPP2R(P2ROUNDUP(npages, CBUF_MAPNP));
		void *map = kmem_alloc(BT_SIZEOFMAP(npages), KM_SLEEP);
		void *rmap = kmem_alloc(BT_SIZEOFMAP(rlen), KM_SLEEP);

		if (dumpcfg.bitmap != NULL)
			kmem_free(dumpcfg.bitmap, BT_SIZEOFMAP(dumpcfg.
			    bitmapsize));
		if (dumpcfg.rbitmap != NULL)
			kmem_free(dumpcfg.rbitmap, BT_SIZEOFMAP(dumpcfg.
			    rbitmapsize));
		dumpcfg.bitmap = map;
		dumpcfg.bitmapsize = npages;
		dumpcfg.rbitmap = rmap;
		dumpcfg.rbitmapsize = rlen;
	}
}
1005 
1006 /*
1007  * Establish a new dump device.
1008  */
1009 int
1010 dumpinit(vnode_t *vp, char *name, int justchecking)
1011 {
1012         vnode_t *cvp;
1013         vattr_t vattr;
1014         vnode_t *cdev_vp;
1015         int error = 0;
1016 
1017         ASSERT(MUTEX_HELD(&dump_lock));
1018 
1019         dumphdr_init();
1020 
1021         cvp = common_specvp(vp);
1022         if (cvp == dumpvp)
1023                 return (0);
1024 
1025         /*
1026          * Determine whether this is a plausible dump device.  We want either:
1027          * (1) a real device that's not mounted and has a cb_dump routine, or
1028          * (2) a swapfile on some filesystem that has a vop_dump routine.
1029          */
1030         if ((error = VOP_OPEN(&cvp, FREAD | FWRITE, kcred, NULL)) != 0)
1031                 return (error);
1032 
1033         vattr.va_mask = AT_SIZE | AT_TYPE | AT_RDEV;
1034         if ((error = VOP_GETATTR(cvp, &vattr, 0, kcred, NULL)) == 0) {
1035                 if (vattr.va_type == VBLK || vattr.va_type == VCHR) {
1036                         if (devopsp[getmajor(vattr.va_rdev)]->
1037                             devo_cb_ops->cb_dump == nodev)
1038                                 error = ENOTSUP;
1039                         else if (vfs_devismounted(vattr.va_rdev))
1040                                 error = EBUSY;
1041                         if (strcmp(ddi_driver_name(VTOS(cvp)->s_dip),
1042                             ZFS_DRIVER) == 0 &&
1043                             IS_SWAPVP(common_specvp(cvp)))
1044                                         error = EBUSY;
1045                 } else {
1046                         if (vn_matchopval(cvp, VOPNAME_DUMP, fs_nosys) ||
1047                             !IS_SWAPVP(cvp))
1048                                 error = ENOTSUP;
1049                 }
1050         }
1051 
1052         if (error == 0 && vattr.va_size < 2 * DUMP_LOGSIZE + DUMP_ERPTSIZE)
1053                 error = ENOSPC;
1054 
1055         if (error || justchecking) {
1056                 (void) VOP_CLOSE(cvp, FREAD | FWRITE, 1, (offset_t)0,
1057                     kcred, NULL);
1058                 return (error);
1059         }
1060 
1061         VN_HOLD(cvp);
1062 
1063         if (dumpvp != NULL)
1064                 dumpfini();     /* unconfigure the old dump device */
1065 
1066         dumpvp = cvp;
1067         dumpvp_size = vattr.va_size & -DUMP_OFFSET;
1068         dumppath = kmem_alloc(strlen(name) + 1, KM_SLEEP);
1069         (void) strcpy(dumppath, name);
1070         dumpbuf.iosize = 0;
1071 
1072         /*
1073          * If the dump device is a block device, attempt to open up the
1074          * corresponding character device and determine its maximum transfer
1075          * size.  We use this information to potentially resize dumpbuf to a
1076          * larger and more optimal size for performing i/o to the dump device.
1077          */
1078         if (cvp->v_type == VBLK &&
1079             (cdev_vp = makespecvp(VTOS(cvp)->s_dev, VCHR)) != NULL) {
1080                 if (VOP_OPEN(&cdev_vp, FREAD | FWRITE, kcred, NULL) == 0) {
1081                         size_t blk_size;
1082                         struct dk_cinfo dki;
1083                         struct dk_minfo minf;
1084 
1085                         if (VOP_IOCTL(cdev_vp, DKIOCGMEDIAINFO,
1086                             (intptr_t)&minf, FKIOCTL, kcred, NULL, NULL)
1087                             == 0 && minf.dki_lbsize != 0)
1088                                 blk_size = minf.dki_lbsize;
1089                         else
1090                                 blk_size = DEV_BSIZE;
1091 
1092                         if (VOP_IOCTL(cdev_vp, DKIOCINFO, (intptr_t)&dki,
1093                             FKIOCTL, kcred, NULL, NULL) == 0) {
1094                                 dumpbuf.iosize = dki.dki_maxtransfer * blk_size;
1095                                 dumpbuf_resize();
1096                         }
1097                         /*
1098                          * If we are working with a zvol then dumpify it
1099                          * if it's not being used as swap.
1100                          */
1101                         if (strcmp(dki.dki_dname, ZVOL_DRIVER) == 0) {
1102                                 if (IS_SWAPVP(common_specvp(cvp)))
1103                                         error = EBUSY;
1104                                 else if ((error = VOP_IOCTL(cdev_vp,
1105                                     DKIOCDUMPINIT, NULL, FKIOCTL, kcred,
1106                                     NULL, NULL)) != 0)
1107                                         dumpfini();
1108                         }
1109 
1110                         (void) VOP_CLOSE(cdev_vp, FREAD | FWRITE, 1, 0,
1111                             kcred, NULL);
1112                 }
1113 
1114                 VN_RELE(cdev_vp);
1115         }
1116 
1117         cmn_err(CE_CONT, "?dump on %s size %llu MB\n", name, dumpvp_size >> 20);
1118 
1119         dump_update_clevel();
1120 
1121         return (error);
1122 }
1123 
/*
 * Unconfigure the current dump device: free dumppath, give a zvol dump
 * device a chance to clean up via DKIOCDUMPFINI, then close and release
 * dumpvp. Called with dump_lock held; dumpvp must be configured.
 */
void
dumpfini(void)
{
	vattr_t vattr;
	boolean_t is_zfs = B_FALSE;
	vnode_t *cdev_vp;
	ASSERT(MUTEX_HELD(&dump_lock));

	kmem_free(dumppath, strlen(dumppath) + 1);

	/*
	 * Determine if we are using zvols for our dump device
	 */
	vattr.va_mask = AT_RDEV;
	if (VOP_GETATTR(dumpvp, &vattr, 0, kcred, NULL) == 0) {
		is_zfs = (getmajor(vattr.va_rdev) ==
		    ddi_name_to_major(ZFS_DRIVER)) ? B_TRUE : B_FALSE;
	}

	/*
	 * If we have a zvol dump device then we call into zfs so
	 * that it may have a chance to cleanup.
	 */
	if (is_zfs &&
	    (cdev_vp = makespecvp(VTOS(dumpvp)->s_dev, VCHR)) != NULL) {
		if (VOP_OPEN(&cdev_vp, FREAD | FWRITE, kcred, NULL) == 0) {
			(void) VOP_IOCTL(cdev_vp, DKIOCDUMPFINI, NULL, FKIOCTL,
			    kcred, NULL, NULL);
			(void) VOP_CLOSE(cdev_vp, FREAD | FWRITE, 1, 0,
			    kcred, NULL);
		}
		VN_RELE(cdev_vp);
	}

	/* Drop the open and the hold taken by dumpinit(). */
	(void) VOP_CLOSE(dumpvp, FREAD | FWRITE, 1, (offset_t)0, kcred, NULL);

	VN_RELE(dumpvp);

	dumpvp = NULL;
	dumpvp_size = 0;
	dumppath = NULL;
}
1166 
/*
 * Flush the accumulated contents of dumpbuf to the dump device,
 * rounded up to a page. During panic the write goes through VOP_DUMP;
 * otherwise vn_rdwr() is used (via the character device if one was
 * opened). Records an ENOSPC in dump_ioerr when the write would pass
 * vp_limit, and accumulates I/O timing statistics in dumpsync.
 * Returns the updated device offset; resets dumpbuf.cur to the start.
 */
static offset_t
dumpvp_flush(void)
{
	size_t size = P2ROUNDUP(dumpbuf.cur - dumpbuf.start, PAGESIZE);
	hrtime_t iotime;
	int err;

	if (dumpbuf.vp_off + size > dumpbuf.vp_limit) {
		/* Out of room on the device; pin the offset at the limit. */
		dump_ioerr = ENOSPC;
		dumpbuf.vp_off = dumpbuf.vp_limit;
	} else if (size != 0) {
		iotime = gethrtime();
		/* Time since last flush completed counts as I/O wait. */
		dumpsync.iowait += iotime - dumpsync.iowaitts;
		if (panicstr)
			err = VOP_DUMP(dumpvp, dumpbuf.start,
			    lbtodb(dumpbuf.vp_off), btod(size), NULL);
		else
			err = vn_rdwr(UIO_WRITE, dumpbuf.cdev_vp != NULL ?
			    dumpbuf.cdev_vp : dumpvp, dumpbuf.start, size,
			    dumpbuf.vp_off, UIO_SYSSPACE, 0, dumpbuf.vp_limit,
			    kcred, 0);
		/* Remember only the first I/O error. */
		if (err && dump_ioerr == 0)
			dump_ioerr = err;
		dumpsync.iowaitts = gethrtime();
		dumpsync.iotime += dumpsync.iowaitts - iotime;
		dumpsync.nwrite += size;
		dumpbuf.vp_off += size;
	}
	dumpbuf.cur = dumpbuf.start;
	dump_timeleft = dump_timeout;
	return (dumpbuf.vp_off);
}
1199 
/*
 * Buffered write of size bytes from va to the dump device.
 * Data is staged in dumpbuf; when the buffer fills it is flushed.
 * To maximize write speed the flush keeps the seek offset aligned
 * with the buffer size: if vp_off is mid-buffer (and the size is a
 * power of two), only enough is written to restore alignment and the
 * unaligned tail is slid back to the front of the buffer.
 */
void
dumpvp_write(const void *va, size_t size)
{
	size_t len, off, sz;

	while (size != 0) {
		len = MIN(size, dumpbuf.end - dumpbuf.cur);
		if (len == 0) {
			/* Buffer is full; flush before copying more. */
			off = P2PHASE(dumpbuf.vp_off, dumpbuf.size);
			if (off == 0 || !ISP2(dumpbuf.size)) {
				(void) dumpvp_flush();
			} else {
				/*
				 * Flush only sz bytes so vp_off becomes
				 * buffer-aligned; carry the remaining off
				 * bytes to the front of the buffer.
				 */
				sz = dumpbuf.size - off;
				dumpbuf.cur = dumpbuf.start + sz;
				(void) dumpvp_flush();
				ovbcopy(dumpbuf.start + sz, dumpbuf.start, off);
				dumpbuf.cur += off;
			}
		} else {
			bcopy(va, dumpbuf.cur, len);
			va = (char *)va + len;
			dumpbuf.cur += len;
			size -= len;
		}
	}
}
1227 
/*
 * ksyms_snapshot() callback: stream symbol-table data straight to the
 * dump device. The dst and size-limit semantics of the callback are
 * unused here; everything goes through dumpvp_write().
 */
/*ARGSUSED*/
static void
dumpvp_ksyms_write(const void *src, void *dst, size_t size)
{
	dumpvp_write(src, size);
}
1234 
1235 /*
1236  * Mark 'pfn' in the bitmap and dump its translation table entry.
1237  */
1238 void
1239 dump_addpage(struct as *as, void *va, pfn_t pfn)
1240 {
1241         mem_vtop_t mem_vtop;
1242         pgcnt_t bitnum;
1243 
1244         if ((bitnum = dump_pfn_to_bitnum(pfn)) != (pgcnt_t)-1) {
1245                 if (!BT_TEST(dumpcfg.bitmap, bitnum)) {
1246                         dumphdr->dump_npages++;
1247                         BT_SET(dumpcfg.bitmap, bitnum);
1248                 }
1249                 dumphdr->dump_nvtop++;
1250                 mem_vtop.m_as = as;
1251                 mem_vtop.m_va = va;
1252                 mem_vtop.m_pfn = pfn;
1253                 dumpvp_write(&mem_vtop, sizeof (mem_vtop_t));
1254         }
1255         dump_timeleft = dump_timeout;
1256 }
1257 
1258 /*
1259  * Mark 'pfn' in the bitmap
1260  */
1261 void
1262 dump_page(pfn_t pfn)
1263 {
1264         pgcnt_t bitnum;
1265 
1266         if ((bitnum = dump_pfn_to_bitnum(pfn)) != (pgcnt_t)-1) {
1267                 if (!BT_TEST(dumpcfg.bitmap, bitnum)) {
1268                         dumphdr->dump_npages++;
1269                         BT_SET(dumpcfg.bitmap, bitnum);
1270                 }
1271         }
1272         dump_timeleft = dump_timeout;
1273 }
1274 
1275 /*
1276  * Dump the <as, va, pfn> information for a given address space.
1277  * SEGOP_DUMP() will call dump_addpage() for each page in the segment.
1278  */
1279 static void
1280 dump_as(struct as *as)
1281 {
1282         struct seg *seg;
1283 
1284         AS_LOCK_ENTER(as, RW_READER);
1285         for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
1286                 if (seg->s_as != as)
1287                         break;
1288                 if (seg->s_ops == NULL)
1289                         continue;
1290                 SEGOP_DUMP(seg);
1291         }
1292         AS_LOCK_EXIT(as);
1293 
1294         if (seg != NULL)
1295                 cmn_err(CE_WARN, "invalid segment %p in address space %p",
1296                     (void *)seg, (void *)as);
1297 }
1298 
1299 static int
1300 dump_process(pid_t pid)
1301 {
1302         proc_t *p = sprlock(pid);
1303 
1304         if (p == NULL)
1305                 return (-1);
1306         if (p->p_as != &kas) {
1307                 mutex_exit(&p->p_lock);
1308                 dump_as(p->p_as);
1309                 mutex_enter(&p->p_lock);
1310         }
1311 
1312         sprunlock(p);
1313 
1314         return (0);
1315 }
1316 
1317 /*
1318  * The following functions (dump_summary(), dump_ereports(), and
1319  * dump_messages()), write data to an uncompressed area within the
1320  * crashdump. The layout of these is
1321  *
1322  * +------------------------------------------------------------+
1323  * |     compressed pages       | summary | ereports | messages |
1324  * +------------------------------------------------------------+
1325  *
1326  * With the advent of saving a compressed crash dump by default, we
1327  * need to save a little more data to describe the failure mode in
1328  * an uncompressed buffer available before savecore uncompresses
1329  * the dump. Initially this is a copy of the stack trace. Additional
1330  * summary information should be added here.
1331  */
1332 
/*
 * Write the uncompressed summary area (a copy of the panic stack
 * trace, framed by summary_dump_t headers) just below the ereport
 * area at the end of the dump device. A zeroed sd_magic terminates
 * the summary.
 */
void
dump_summary(void)
{
	u_offset_t dumpvp_start;
	summary_dump_t sd;

	if (dumpvp == NULL || dumphdr == NULL)
		return;

	dumpbuf.cur = dumpbuf.start;

	/* Summary sits immediately before the ereport and log areas. */
	dumpbuf.vp_limit = dumpvp_size - (DUMP_OFFSET + DUMP_LOGSIZE +
	    DUMP_ERPTSIZE);
	dumpvp_start = dumpbuf.vp_limit - DUMP_SUMMARYSIZE;
	dumpbuf.vp_off = dumpvp_start;

	sd.sd_magic = SUMMARY_MAGIC;
	sd.sd_ssum = checksum32(dump_stack_scratch, STACK_BUF_SIZE);
	dumpvp_write(&sd, sizeof (sd));
	dumpvp_write(dump_stack_scratch, STACK_BUF_SIZE);

	sd.sd_magic = 0; /* indicate end of summary */
	dumpvp_write(&sd, sizeof (sd));
	(void) dumpvp_flush();
}
1358 
/*
 * Write pending FMA ereports into their reserved uncompressed area of
 * the dump device. A zeroed erpt_dump_t header marks the end. When not
 * panicking, invalidate any cached pages for the written range so a
 * later read sees the on-device data.
 */
void
dump_ereports(void)
{
	u_offset_t dumpvp_start;
	erpt_dump_t ed;

	if (dumpvp == NULL || dumphdr == NULL)
		return;

	dumpbuf.cur = dumpbuf.start;
	/* Ereport area sits between the summary and message log areas. */
	dumpbuf.vp_limit = dumpvp_size - (DUMP_OFFSET + DUMP_LOGSIZE);
	dumpvp_start = dumpbuf.vp_limit - DUMP_ERPTSIZE;
	dumpbuf.vp_off = dumpvp_start;

	fm_ereport_dump();
	if (panicstr)
		errorq_dump();

	bzero(&ed, sizeof (ed)); /* indicate end of ereports */
	dumpvp_write(&ed, sizeof (ed));
	(void) dumpvp_flush();

	if (!panicstr) {
		(void) VOP_PUTPAGE(dumpvp, dumpvp_start,
		    (size_t)(dumpbuf.vp_off - dumpvp_start),
		    B_INVAL | B_FORCE, kcred, NULL);
	}
}
1387 
/*
 * Write the console message log into its reserved uncompressed area of
 * the dump device. The log queues are walked from the one furthest
 * from log_consq back toward log_consq itself (each pass scans forward
 * to find the queue whose q_next is the previously-dumped one), so
 * messages are emitted oldest-queue-first. Each message is framed by a
 * log_dump_t header carrying checksums; a zeroed ld_magic terminates
 * the log.
 */
void
dump_messages(void)
{
	log_dump_t ld;
	mblk_t *mctl, *mdata;
	queue_t *q, *qlast;
	u_offset_t dumpvp_start;

	if (dumpvp == NULL || dumphdr == NULL || log_consq == NULL)
		return;

	dumpbuf.cur = dumpbuf.start;
	/* Message log occupies the final area before DUMP_OFFSET. */
	dumpbuf.vp_limit = dumpvp_size - DUMP_OFFSET;
	dumpvp_start = dumpbuf.vp_limit - DUMP_LOGSIZE;
	dumpbuf.vp_off = dumpvp_start;

	qlast = NULL;
	do {
		/* Find the queue immediately preceding the last one done. */
		for (q = log_consq; q->q_next != qlast; q = q->q_next)
			continue;
		for (mctl = q->q_first; mctl != NULL; mctl = mctl->b_next) {
			dump_timeleft = dump_timeout;
			mdata = mctl->b_cont;
			ld.ld_magic = LOG_MAGIC;
			ld.ld_msgsize = MBLKL(mctl->b_cont);
			ld.ld_csum = checksum32(mctl->b_rptr, MBLKL(mctl));
			ld.ld_msum = checksum32(mdata->b_rptr, MBLKL(mdata));
			dumpvp_write(&ld, sizeof (ld));
			dumpvp_write(mctl->b_rptr, MBLKL(mctl));
			dumpvp_write(mdata->b_rptr, MBLKL(mdata));
		}
	} while ((qlast = q) != log_consq);

	ld.ld_magic = 0;		/* indicate end of messages */
	dumpvp_write(&ld, sizeof (ld));
	(void) dumpvp_flush();
	if (!panicstr) {
		(void) VOP_PUTPAGE(dumpvp, dumpvp_start,
		    (size_t)(dumpbuf.vp_off - dumpvp_start),
		    B_INVAL | B_FORCE, kcred, NULL);
	}
}
1430 
1431 /*
1432  * The following functions are called on multiple CPUs during dump.
1433  * They must not use most kernel services, because all cross-calls are
1434  * disabled during panic. Therefore, blocking locks and cache flushes
1435  * will not work.
1436  */
1437 
1438 /*
1439  * Copy pages, trapping ECC errors. Also, for robustness, trap data
1440  * access in case something goes wrong in the hat layer and the
1441  * mapping is broken.
1442  */
1443 static int
1444 dump_pagecopy(void *src, void *dst)
1445 {
1446         long *wsrc = (long *)src;
1447         long *wdst = (long *)dst;
1448         const ulong_t ncopies = PAGESIZE / sizeof (long);
1449         volatile int w = 0;
1450         volatile int ueoff = -1;
1451         on_trap_data_t otd;
1452 
1453         if (on_trap(&otd, OT_DATA_EC | OT_DATA_ACCESS)) {
1454                 if (ueoff == -1)
1455                         ueoff = w * sizeof (long);
1456                 /* report "bad ECC" or "bad address" */
1457 #ifdef _LP64
1458                 if (otd.ot_trap & OT_DATA_EC)
1459                         wdst[w++] = 0x00badecc00badecc;
1460                 else
1461                         wdst[w++] = 0x00badadd00badadd;
1462 #else
1463                 if (otd.ot_trap & OT_DATA_EC)
1464                         wdst[w++] = 0x00badecc;
1465                 else
1466                         wdst[w++] = 0x00badadd;
1467 #endif
1468         }
1469         while (w < ncopies) {
1470                 wdst[w] = wsrc[w];
1471                 w++;
1472         }
1473         no_trap();
1474         return (ueoff);
1475 }
1476 
1477 static void
1478 dumpsys_close_cq(cqueue_t *cq, int live)
1479 {
1480         if (live) {
1481                 mutex_enter(&cq->mutex);
1482                 atomic_dec_uint(&cq->open);
1483                 cv_signal(&cq->cv);
1484                 mutex_exit(&cq->mutex);
1485         } else {
1486                 atomic_dec_uint(&cq->open);
1487         }
1488 }
1489 
1490 static inline void
1491 dumpsys_spinlock(lock_t *lp)
1492 {
1493         uint_t backoff = 0;
1494         int loop_count = 0;
1495 
1496         while (LOCK_HELD(lp) || !lock_spin_try(lp)) {
1497                 if (++loop_count >= ncpus) {
1498                         backoff = mutex_lock_backoff(0);
1499                         loop_count = 0;
1500                 } else {
1501                         backoff = mutex_lock_backoff(backoff);
1502                 }
1503                 mutex_lock_delay(backoff);
1504         }
1505 }
1506 
/* Release a spin lock taken with dumpsys_spinlock(). */
static inline void
dumpsys_spinunlock(lock_t *lp)
{
	lock_clear(lp);
}
1512 
1513 static inline void
1514 dumpsys_lock(cqueue_t *cq, int live)
1515 {
1516         if (live)
1517                 mutex_enter(&cq->mutex);
1518         else
1519                 dumpsys_spinlock(&cq->spinlock);
1520 }
1521 
1522 static inline void
1523 dumpsys_unlock(cqueue_t *cq, int live, int signal)
1524 {
1525         if (live) {
1526                 if (signal)
1527                         cv_signal(&cq->cv);
1528                 mutex_exit(&cq->mutex);
1529         } else {
1530                 dumpsys_spinunlock(&cq->spinlock);
1531         }
1532 }
1533 
/*
 * Wait for a cqueue to become non-empty (or fully closed). Live case:
 * sleep on the condition variable. Panic case: drop the spin lock and
 * busy-wait, polling cq->first while any producer still has the queue
 * open, then re-take the lock before returning so the caller's locking
 * state is unchanged.
 */
static void
dumpsys_wait_cq(cqueue_t *cq, int live)
{
	if (live) {
		cv_wait(&cq->cv, &cq->mutex);
	} else {
		dumpsys_spinlock(&cq->spinlock);
		while (cq->open)
			if (cq->first)
				break;
		dumpsys_spinlock(&cq->spinlock);
	}
}
1547 
1548 static void
1549 dumpsys_put_cq(cqueue_t *cq, cbuf_t *cp, int newstate, int live)
1550 {
1551         if (cp == NULL)
1552                 return;
1553 
1554         dumpsys_lock(cq, live);
1555 
1556         if (cq->ts != 0) {
1557                 cq->empty += gethrtime() - cq->ts;
1558                 cq->ts = 0;
1559         }
1560 
1561         cp->state = newstate;
1562         cp->next = NULL;
1563         if (cq->last == NULL)
1564                 cq->first = cp;
1565         else
1566                 cq->last->next = cp;
1567         cq->last = cp;
1568 
1569         dumpsys_unlock(cq, live, 1);
1570 }
1571 
/*
 * Remove and return the buffer at the head of a cqueue, waiting until
 * one is available. Returns NULL once the queue is empty and no longer
 * open (all producers done). When the last buffer is taken, the empty
 * timestamp is set so dumpsys_put_cq() can account empty time.
 */
static cbuf_t *
dumpsys_get_cq(cqueue_t *cq, int live)
{
	cbuf_t *cp;
	hrtime_t now = gethrtime();

	dumpsys_lock(cq, live);

	/* CONSTCOND */
	while (1) {
		cp = (cbuf_t *)cq->first;
		if (cp == NULL) {
			/* Empty and closed: nothing more will arrive. */
			if (cq->open == 0)
				break;
			dumpsys_wait_cq(cq, live);
			continue;
		}
		cq->first = cp->next;
		if (cq->first == NULL) {
			/* Queue just went empty; start the empty timer. */
			cq->last = NULL;
			cq->ts = now;
		}
		break;
	}

	/* Signal only if there is more work, or to propagate the close. */
	dumpsys_unlock(cq, live, cq->first != NULL || cq->open == 0);
	return (cp);
}
1600 
1601 /*
1602  * Send an error message to the console. If the main task is running
1603  * just write the message via uprintf. If a helper is running the
1604  * message has to be put on a queue for the main task. Setting fmt to
1605  * NULL means flush the error message buffer. If fmt is not NULL, just
1606  * add the text to the existing buffer.
1607  */
1608 static void
1609 dumpsys_errmsg(helper_t *hp, const char *fmt, ...)
1610 {
1611         dumpsync_t *ds = hp->ds;
1612         cbuf_t *cp = hp->cperr;
1613         va_list adx;
1614 
1615         if (hp->helper == MAINHELPER) {
1616                 if (fmt != NULL) {
1617                         if (ds->neednl) {
1618                                 uprintf("\n");
1619                                 ds->neednl = 0;
1620                         }
1621                         va_start(adx, fmt);
1622                         vuprintf(fmt, adx);
1623                         va_end(adx);
1624                 }
1625         } else if (fmt == NULL) {
1626                 if (cp != NULL) {
1627                         CQ_PUT(mainq, cp, CBUF_ERRMSG);
1628                         hp->cperr = NULL;
1629                 }
1630         } else {
1631                 if (hp->cperr == NULL) {
1632                         cp = CQ_GET(freebufq);
1633                         hp->cperr = cp;
1634                         cp->used = 0;
1635                 }
1636                 va_start(adx, fmt);
1637                 cp->used += vsnprintf(cp->buf + cp->used, cp->size - cp->used,
1638                     fmt, adx);
1639                 va_end(adx);
1640                 if ((cp->used + LOG_MSGSIZE) > cp->size) {
1641                         CQ_PUT(mainq, cp, CBUF_ERRMSG);
1642                         hp->cperr = NULL;
1643                 }
1644         }
1645 }
1646 
1647 /*
1648  * Write an output buffer to the dump file. If the main task is
1649  * running just write the data. If a helper is running the output is
1650  * placed on a queue for the main task.
1651  */
1652 static void
1653 dumpsys_swrite(helper_t *hp, cbuf_t *cp, size_t used)
1654 {
1655         dumpsync_t *ds = hp->ds;
1656 
1657         if (hp->helper == MAINHELPER) {
1658                 HRSTART(ds->perpage, write);
1659                 dumpvp_write(cp->buf, used);
1660                 HRSTOP(ds->perpage, write);
1661                 CQ_PUT(freebufq, cp, CBUF_FREEBUF);
1662         } else {
1663                 cp->used = used;
1664                 CQ_PUT(mainq, cp, CBUF_WRITE);
1665         }
1666 }
1667 
1668 /*
1669  * Copy one page within the mapped range. The offset starts at 0 and
1670  * is relative to the first pfn. cp->buf + cp->off is the address of
1671  * the first pfn. If dump_pagecopy returns a UE offset, create an
1672  * error message.  Returns the offset to the next pfn in the range
1673  * selected by the bitmap.
1674  */
1675 static int
1676 dumpsys_copy_page(helper_t *hp, int offset)
1677 {
1678         cbuf_t *cp = hp->cpin;
1679         int ueoff;
1680 
1681         ASSERT(cp->off + offset + PAGESIZE <= cp->size);
1682         ASSERT(BT_TEST(dumpcfg.bitmap, cp->bitnum));
1683 
1684         ueoff = dump_pagecopy(cp->buf + cp->off + offset, hp->page);
1685 
1686         /* ueoff is the offset in the page to a UE error */
1687         if (ueoff != -1) {
1688                 uint64_t pa = ptob(cp->pfn) + offset + ueoff;
1689 
1690                 dumpsys_errmsg(hp, "cpu %d: memory error at PA 0x%08x.%08x\n",
1691                     CPU->cpu_id, (uint32_t)(pa >> 32), (uint32_t)pa);
1692         }
1693 
1694         /*
1695          * Advance bitnum and offset to the next input page for the
1696          * next call to this function.
1697          */
1698         offset += PAGESIZE;
1699         cp->bitnum++;
1700         while (cp->off + offset < cp->size) {
1701                 if (BT_TEST(dumpcfg.bitmap, cp->bitnum))
1702                         break;
1703                 offset += PAGESIZE;
1704                 cp->bitnum++;
1705         }
1706 
1707         return (offset);
1708 }
1709 
1710 /*
1711  * Read the helper queue, and copy one mapped page. Return 0 when
1712  * done. Return 1 when a page has been copied into hp->page.
1713  */
1714 static int
1715 dumpsys_sread(helper_t *hp)
1716 {
1717         dumpsync_t *ds = hp->ds;
1718 
1719         /* CONSTCOND */
1720         while (1) {
1721 
1722                 /* Find the next input buffer. */
1723                 if (hp->cpin == NULL) {
1724                         HRSTART(hp->perpage, inwait);
1725 
1726                         /* CONSTCOND */
1727                         while (1) {
1728                                 hp->cpin = CQ_GET(helperq);
1729                                 dump_timeleft = dump_timeout;
1730 
1731                                 /*
1732                                  * NULL return means the helper queue
1733                                  * is closed and empty.
1734                                  */
1735                                 if (hp->cpin == NULL)
1736                                         break;
1737 
1738                                 /* Have input, check for dump I/O error. */
1739                                 if (!dump_ioerr)
1740                                         break;
1741 
1742                                 /*
1743                                  * If an I/O error occurs, stay in the
1744                                  * loop in order to empty the helper
1745                                  * queue. Return the buffers to the
1746                                  * main task to unmap and free it.
1747                                  */
1748                                 hp->cpin->used = 0;
1749                                 CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP);
1750                         }
1751                         HRSTOP(hp->perpage, inwait);
1752 
1753                         /* Stop here when the helper queue is closed. */
1754                         if (hp->cpin == NULL)
1755                                 break;
1756 
1757                         /* Set the offset=0 to get the first pfn. */
1758                         hp->in = 0;
1759 
1760                         /* Set the total processed to 0 */
1761                         hp->used = 0;
1762                 }
1763 
1764                 /* Process the next page. */
1765                 if (hp->used < hp->cpin->used) {
1766 
1767                         /*
1768                          * Get the next page from the input buffer and
1769                          * return a copy.
1770                          */
1771                         ASSERT(hp->in != -1);
1772                         HRSTART(hp->perpage, copy);
1773                         hp->in = dumpsys_copy_page(hp, hp->in);
1774                         hp->used += PAGESIZE;
1775                         HRSTOP(hp->perpage, copy);
1776                         break;
1777 
1778                 } else {
1779 
1780                         /*
1781                          * Done with the input. Flush the VM and
1782                          * return the buffer to the main task.
1783                          */
1784                         if (panicstr && hp->helper != MAINHELPER)
1785                                 hat_flush_range(kas.a_hat,
1786                                     hp->cpin->buf, hp->cpin->size);
1787                         dumpsys_errmsg(hp, NULL);
1788                         CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP);
1789                         hp->cpin = NULL;
1790                 }
1791         }
1792 
1793         return (hp->cpin != NULL);
1794 }
1795 
1796 /*
1797  * Compress with lzjb
1798  * write stream block if full or size==0
1799  * if csize==0 write stream header, else write <csize, data>
1800  * size==0 is a call to flush a buffer
1801  * hp->cpout is the buffer we are flushing or filling
1802  * hp->out is the next index to fill data
1803  * osize is either csize+data, or the size of a stream header
1804  */
1805 static void
1806 dumpsys_lzjbrun(helper_t *hp, size_t csize, void *buf, size_t size)
1807 {
1808         dumpsync_t *ds = hp->ds;
1809         const int CSIZE = sizeof (dumpcsize_t);
1810         dumpcsize_t cs;
1811         size_t osize = csize > 0 ? CSIZE + size : size;
1812 
1813         /* If flush, and there is no buffer, just return */
1814         if (size == 0 && hp->cpout == NULL)
1815                 return;
1816 
1817         /* If flush, or cpout is full, write it out */
1818         if (size == 0 ||
1819             hp->cpout != NULL && hp->out + osize > hp->cpout->size) {
1820 
1821                 /* Set tag+size word at the front of the stream block. */
1822                 cs = DUMP_SET_TAG(hp->out - CSIZE, hp->tag);
1823                 (void) memcpy(hp->cpout->buf, &cs, CSIZE);
1824 
1825                 /* Write block to dump file. */
1826                 dumpsys_swrite(hp, hp->cpout, hp->out);
1827 
1828                 /* Clear pointer to indicate we need a new buffer */
1829                 hp->cpout = NULL;
1830 
1831                 /* flushing, we are done */
1832                 if (size == 0)
1833                         return;
1834         }
1835 
1836         /* Get an output buffer if we dont have one. */
1837         if (hp->cpout == NULL) {
1838                 HRSTART(hp->perpage, outwait);
1839                 hp->cpout = CQ_GET(freebufq);
1840                 HRSTOP(hp->perpage, outwait);
1841                 hp->out = CSIZE;
1842         }
1843 
1844         /* Store csize word. This is the size of compressed data. */
1845         if (csize > 0) {
1846                 cs = DUMP_SET_TAG(csize, 0);
1847                 (void) memcpy(hp->cpout->buf + hp->out, &cs, CSIZE);
1848                 hp->out += CSIZE;
1849         }
1850 
1851         /* Store the data. */
1852         (void) memcpy(hp->cpout->buf + hp->out, buf, size);
1853         hp->out += size;
1854 }
1855 
1856 static void
1857 dumpsys_lzjbcompress(helper_t *hp)
1858 {
1859         dumpsync_t *ds = hp->ds;
1860         size_t csize;
1861         dumpstreamhdr_t sh;
1862 
1863         (void) strcpy(sh.stream_magic, DUMP_STREAM_MAGIC);
1864         sh.stream_pagenum = (pfn_t)-1;
1865         sh.stream_npages = 0;
1866         hp->cpin = NULL;
1867         hp->cpout = NULL;
1868         hp->cperr = NULL;
1869         hp->in = 0;
1870         hp->out = 0;
1871 
1872         /* Bump reference to mainq while we are running */
1873         CQ_OPEN(mainq);
1874 
1875         /* Get one page at a time */
1876         while (dumpsys_sread(hp)) {
1877 
1878                 /* Create a stream header for each new input map */
1879                 if (sh.stream_pagenum != hp->cpin->pagenum) {
1880                         sh.stream_pagenum = hp->cpin->pagenum;
1881                         sh.stream_npages = btop(hp->cpin->used);
1882                         dumpsys_lzjbrun(hp, 0, &sh, sizeof (sh));
1883                 }
1884 
1885                 /* Compress one page */
1886                 HRSTART(hp->perpage, compress);
1887                 csize = compress(hp->page, hp->lzbuf, PAGESIZE);
1888                 HRSTOP(hp->perpage, compress);
1889 
1890                 /* Add csize+data to output block */
1891                 ASSERT(csize > 0 && csize <= PAGESIZE);
1892                 dumpsys_lzjbrun(hp, csize, hp->lzbuf, csize);
1893         }
1894 
1895         /* Done with input, flush any partial buffer */
1896         if (sh.stream_pagenum != (pfn_t)-1) {
1897                 dumpsys_lzjbrun(hp, 0, NULL, 0);
1898                 dumpsys_errmsg(hp, NULL);
1899         }
1900 
1901         ASSERT(hp->cpin == NULL && hp->cpout == NULL && hp->cperr == NULL);
1902 
1903         /* Decrement main queue count, we are done */
1904         CQ_CLOSE(mainq);
1905 }
1906 
1907 /*
1908  * Dump helper called from panic_idle() to compress pages.  CPUs in
1909  * this path must not call most kernel services.
1910  *
1911  * During panic, all but one of the CPUs is idle. These CPUs are used
1912  * as helpers working in parallel to copy and compress memory
1913  * pages. During a panic, however, these processors cannot call any
1914  * kernel services. This is because mutexes become no-ops during
1915  * panic, and, cross-call interrupts are inhibited.  Therefore, during
1916  * panic dump the helper CPUs communicate with the panic CPU using
1917  * memory variables. All memory mapping and I/O is performed by the
1918  * panic CPU.
1919  *
1920  * At dump configuration time, helper_lock is set and helpers_wanted
1921  * is 0. dumpsys() decides whether to set helpers_wanted before
1922  * clearing helper_lock.
1923  *
1924  * At panic time, idle CPUs spin-wait on helper_lock, then alternately
1925  * take the lock and become a helper, or return.
1926  */
1927 void
1928 dumpsys_helper()
1929 {
1930         dumpsys_spinlock(&dumpcfg.helper_lock);
1931         if (dumpcfg.helpers_wanted) {
1932                 helper_t *hp, *hpend = &dumpcfg.helper[dumpcfg.nhelper];
1933 
1934                 for (hp = dumpcfg.helper; hp != hpend; hp++) {
1935                         if (hp->helper == FREEHELPER) {
1936                                 hp->helper = CPU->cpu_id;
1937                                 BT_SET(dumpcfg.helpermap, CPU->cpu_seqid);
1938                                 dumpsys_spinunlock(&dumpcfg.helper_lock);
1939                                 dumpsys_lzjbcompress(hp);
1940                                 hp->helper = DONEHELPER;
1941                                 return;
1942                         }
1943                 }
1944 
1945                 /* No more helpers are needed. */
1946                 dumpcfg.helpers_wanted = 0;
1947 
1948         }
1949         dumpsys_spinunlock(&dumpcfg.helper_lock);
1950 }
1951 
1952 /*
1953  * No-wait helper callable in spin loops.
1954  *
1955  * Do not wait for helper_lock. Just check helpers_wanted. The caller
1956  * may decide to continue. This is the "c)ontinue, s)ync, r)eset? s"
1957  * case.
1958  */
1959 void
1960 dumpsys_helper_nw()
1961 {
1962         if (dumpcfg.helpers_wanted)
1963                 dumpsys_helper();
1964 }
1965 
1966 /*
1967  * Dump helper for live dumps.
1968  * These run as a system task.
1969  */
1970 static void
1971 dumpsys_live_helper(void *arg)
1972 {
1973         helper_t *hp = arg;
1974 
1975         BT_ATOMIC_SET(dumpcfg.helpermap, CPU->cpu_seqid);
1976         dumpsys_lzjbcompress(hp);
1977 }
1978 
1979 /*
1980  * Compress one page with lzjb (single threaded case)
1981  */
1982 static void
1983 dumpsys_lzjb_page(helper_t *hp, cbuf_t *cp)
1984 {
1985         dumpsync_t *ds = hp->ds;
1986         uint32_t csize;
1987 
1988         hp->helper = MAINHELPER;
1989         hp->in = 0;
1990         hp->used = 0;
1991         hp->cpin = cp;
1992         while (hp->used < cp->used) {
1993                 HRSTART(hp->perpage, copy);
1994                 hp->in = dumpsys_copy_page(hp, hp->in);
1995                 hp->used += PAGESIZE;
1996                 HRSTOP(hp->perpage, copy);
1997 
1998                 HRSTART(hp->perpage, compress);
1999                 csize = compress(hp->page, hp->lzbuf, PAGESIZE);
2000                 HRSTOP(hp->perpage, compress);
2001 
2002                 HRSTART(hp->perpage, write);
2003                 dumpvp_write(&csize, sizeof (csize));
2004                 dumpvp_write(hp->lzbuf, csize);
2005                 HRSTOP(hp->perpage, write);
2006         }
2007         CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP);
2008         hp->cpin = NULL;
2009 }
2010 
2011 /*
2012  * Main task to dump pages. This is called on the dump CPU.
2013  */
2014 static void
2015 dumpsys_main_task(void *arg)
2016 {
2017         dumpsync_t *ds = arg;
2018         pgcnt_t pagenum = 0, bitnum = 0, hibitnum;
2019         dumpmlw_t mlw;
2020         cbuf_t *cp;
2021         pgcnt_t baseoff, pfnoff;
2022         pfn_t base, pfn;
2023         int i;
2024 
2025         /*
2026          * Fall back to serial mode if there are no helpers.
2027          * dump_ncpu_low can be set to 0 at any time.
2028          * dumpcfg.helpermap must contain at least one member.
2029          *
2030          * It is possible that the helpers haven't registered
2031          * in helpermap yet; wait up to DUMP_HELPER_MAX_WAIT for
2032          * at least one helper to register.
2033          */
2034         if (dump_ncpu_low != 0 && dumpcfg.clevel != DUMP_CLEVEL_SERIAL) {
2035                 boolean_t dumpserial = B_TRUE;
2036                 hrtime_t hrtmax = MSEC2NSEC(DUMP_HELPER_MAX_WAIT);
2037                 hrtime_t hrtstart = gethrtime();
2038 
2039                 for (;;) {
2040                         for (i = 0; i < BT_BITOUL(NCPU); ++i) {
2041                                 if (dumpcfg.helpermap[i] != 0) {
2042                                         dumpserial = B_FALSE;
2043                                         break;
2044                                 }
2045                         }
2046 
2047                         if ((!dumpserial) ||
2048                             ((gethrtime() - hrtstart) >= hrtmax)) {
2049                                 break;
2050                         }
2051 
2052                         SMT_PAUSE();
2053                 }
2054 
2055                 if (dumpserial) {
2056                         dumpcfg.clevel = DUMP_CLEVEL_SERIAL;
2057                         if (dumpcfg.helper[0].lzbuf == NULL) {
2058                                 dumpcfg.helper[0].lzbuf =
2059                                     dumpcfg.helper[1].page;
2060                         }
2061                 }
2062         }
2063 
2064         dump_init_memlist_walker(&mlw);
2065 
2066         for (;;) {
2067                 int sec = (gethrtime() - ds->start) / NANOSEC;
2068 
2069                 /*
2070                  * Render a simple progress display on the system console to
2071                  * make clear to the operator that the system has not hung.
2072                  * Emit an update when dump progress has advanced by one
2073                  * percent, or when no update has been drawn in the last
2074                  * second.
2075                  */
2076                 if (ds->percent > ds->percent_done || sec > ds->sec_done) {
2077                         ds->sec_done = sec;
2078                         ds->percent_done = ds->percent;
2079                         uprintf("^\rdumping: %2d:%02d %3d%% done",
2080                             sec / 60, sec % 60, ds->percent);
2081                         ds->neednl = 1;
2082                 }
2083 
2084                 while (CQ_IS_EMPTY(mainq) && !CQ_IS_EMPTY(writerq)) {
2085 
2086                         /* the writerq never blocks */
2087                         cp = CQ_GET(writerq);
2088                         if (cp == NULL)
2089                                 break;
2090 
2091                         dump_timeleft = dump_timeout;
2092 
2093                         HRSTART(ds->perpage, write);
2094                         dumpvp_write(cp->buf, cp->used);
2095                         HRSTOP(ds->perpage, write);
2096 
2097                         CQ_PUT(freebufq, cp, CBUF_FREEBUF);
2098                 }
2099 
2100                 /*
2101                  * Wait here for some buffers to process. Returns NULL
2102                  * when all helpers have terminated and all buffers
2103                  * have been processed.
2104                  */
2105                 cp = CQ_GET(mainq);
2106 
2107                 if (cp == NULL) {
2108 
2109                         /* Drain the write queue. */
2110                         if (!CQ_IS_EMPTY(writerq))
2111                                 continue;
2112 
2113                         /* Main task exits here. */
2114                         break;
2115                 }
2116 
2117                 dump_timeleft = dump_timeout;
2118 
2119                 switch (cp->state) {
2120 
2121                 case CBUF_FREEMAP:
2122 
2123                         /*
2124                          * Note that we drop CBUF_FREEMAP buffers on
2125                          * the floor (they will not be on any cqueue)
2126                          * when we no longer need them.
2127                          */
2128                         if (bitnum >= dumpcfg.bitmapsize)
2129                                 break;
2130 
2131                         if (dump_ioerr) {
2132                                 bitnum = dumpcfg.bitmapsize;
2133                                 CQ_CLOSE(helperq);
2134                                 break;
2135                         }
2136 
2137                         HRSTART(ds->perpage, bitmap);
2138                         for (; bitnum < dumpcfg.bitmapsize; bitnum++)
2139                                 if (BT_TEST(dumpcfg.bitmap, bitnum))
2140                                         break;
2141                         HRSTOP(ds->perpage, bitmap);
2142                         dump_timeleft = dump_timeout;
2143 
2144                         if (bitnum >= dumpcfg.bitmapsize) {
2145                                 CQ_CLOSE(helperq);
2146                                 break;
2147                         }
2148 
2149                         /*
2150                          * Try to map CBUF_MAPSIZE ranges. Can't
2151                          * assume that memory segment size is a
2152                          * multiple of CBUF_MAPSIZE. Can't assume that
2153                          * the segment starts on a CBUF_MAPSIZE
2154                          * boundary.
2155                          */
2156                         pfn = dump_bitnum_to_pfn(bitnum, &mlw);
2157                         ASSERT(pfn != PFN_INVALID);
2158                         ASSERT(bitnum + mlw.mpleft <= dumpcfg.bitmapsize);
2159 
2160                         base = P2ALIGN(pfn, CBUF_MAPNP);
2161                         if (base < mlw.mpaddr) {
2162                                 base = mlw.mpaddr;
2163                                 baseoff = P2PHASE(base, CBUF_MAPNP);
2164                         } else {
2165                                 baseoff = 0;
2166                         }
2167 
2168                         pfnoff = pfn - base;
2169                         if (pfnoff + mlw.mpleft < CBUF_MAPNP) {
2170                                 hibitnum = bitnum + mlw.mpleft;
2171                                 cp->size = ptob(pfnoff + mlw.mpleft);
2172                         } else {
2173                                 hibitnum = bitnum - pfnoff + CBUF_MAPNP -
2174                                     baseoff;
2175                                 cp->size = CBUF_MAPSIZE - ptob(baseoff);
2176                         }
2177 
2178                         cp->pfn = pfn;
2179                         cp->bitnum = bitnum++;
2180                         cp->pagenum = pagenum++;
2181                         cp->off = ptob(pfnoff);
2182 
2183                         for (; bitnum < hibitnum; bitnum++)
2184                                 if (BT_TEST(dumpcfg.bitmap, bitnum))
2185                                         pagenum++;
2186 
2187                         dump_timeleft = dump_timeout;
2188                         cp->used = ptob(pagenum - cp->pagenum);
2189 
2190                         HRSTART(ds->perpage, map);
2191                         hat_devload(kas.a_hat, cp->buf, cp->size, base,
2192                             PROT_READ, HAT_LOAD_NOCONSIST);
2193                         HRSTOP(ds->perpage, map);
2194 
2195                         ds->pages_mapped += btop(cp->size);
2196                         ds->pages_used += pagenum - cp->pagenum;
2197 
2198                         CQ_OPEN(mainq);
2199 
2200                         /*
2201                          * If there are no helpers the main task does
2202                          * non-streams lzjb compress.
2203                          */
2204                         if (dumpcfg.clevel == DUMP_CLEVEL_SERIAL) {
2205                                 dumpsys_lzjb_page(dumpcfg.helper, cp);
2206                         } else {
2207                                 /* pass mapped pages to a helper */
2208                                 CQ_PUT(helperq, cp, CBUF_INREADY);
2209                         }
2210 
2211                         /* the last page was done */
2212                         if (bitnum >= dumpcfg.bitmapsize)
2213                                 CQ_CLOSE(helperq);
2214 
2215                         break;
2216 
2217                 case CBUF_USEDMAP:
2218 
2219                         ds->npages += btop(cp->used);
2220 
2221                         HRSTART(ds->perpage, unmap);
2222                         hat_unload(kas.a_hat, cp->buf, cp->size, HAT_UNLOAD);
2223                         HRSTOP(ds->perpage, unmap);
2224 
2225                         if (bitnum < dumpcfg.bitmapsize)
2226                                 CQ_PUT(mainq, cp, CBUF_FREEMAP);
2227                         CQ_CLOSE(mainq);
2228 
2229                         ASSERT(ds->npages <= dumphdr->dump_npages);
2230                         ds->percent = ds->npages * 100LL / dumphdr->dump_npages;
2231                         break;
2232 
2233                 case CBUF_WRITE:
2234 
2235                         CQ_PUT(writerq, cp, CBUF_WRITE);
2236                         break;
2237 
2238                 case CBUF_ERRMSG:
2239 
2240                         if (cp->used > 0) {
2241                                 cp->buf[cp->size - 2] = '\n';
2242                                 cp->buf[cp->size - 1] = '\0';
2243                                 if (ds->neednl) {
2244                                         uprintf("\n%s", cp->buf);
2245                                         ds->neednl = 0;
2246                                 } else {
2247                                         uprintf("%s", cp->buf);
2248                                 }
2249                                 /* wait for console output */
2250                                 drv_usecwait(200000);
2251                                 dump_timeleft = dump_timeout;
2252                         }
2253                         CQ_PUT(freebufq, cp, CBUF_FREEBUF);
2254                         break;
2255 
2256                 default:
2257                         uprintf("dump: unexpected buffer state %d, "
2258                             "buffer will be lost\n", cp->state);
2259                         break;
2260 
2261                 } /* end switch */
2262         }
2263 }
2264 
2265 #ifdef  COLLECT_METRICS
/*
 * Format dump statistics into buf (at most size bytes) as
 * "name,value" lines for post-mortem analysis. Returns the number
 * of bytes written; the remainder of buf is zeroed.
 */
size_t
dumpsys_metrics(dumpsync_t *ds, char *buf, size_t size)
{
	dumpcfg_t *cfg = &dumpcfg;
	int myid = CPU->cpu_seqid;
	int i, compress_ratio;
	int sec, iorate;
	helper_t *hp, *hpend = &cfg->helper[cfg->nhelper];
	char *e = buf + size;
	char *p = buf;

	/* Avoid divide-by-zero for very short dumps. */
	sec = ds->elapsed / (1000 * 1000 * 1000ULL);
	if (sec < 1)
		sec = 1;

	if (ds->iotime < 1)
		ds->iotime = 1;
	iorate = (ds->nwrite * 100000ULL) / ds->iotime;

	compress_ratio = 100LL * ds->npages / btopr(ds->nwrite + 1);

/* Append formatted text to p, never writing past e. */
#define P(...) (p += p < e ? snprintf(p, e - p, __VA_ARGS__) : 0)

	P("Master cpu_seqid,%d\n", CPU->cpu_seqid);
	P("Master cpu_id,%d\n", CPU->cpu_id);
	P("dump_flags,0x%x\n", dumphdr->dump_flags);
	P("dump_ioerr,%d\n", dump_ioerr);

	/* One cell per CPU: M = master, cpu_id = helper, * = unused. */
	P("Helpers:\n");
	for (i = 0; i < ncpus; i++) {
		if ((i & 15) == 0)
			P(",,%03d,", i);
		if (i == myid)
			P("   M");
		else if (BT_TEST(cfg->helpermap, i))
			P("%4d", cpu_seq[i]->cpu_id);
		else
			P("   *");
		if ((i & 15) == 15)
			P("\n");
	}

	P("ncbuf_used,%d\n", cfg->ncbuf_used);
	P("ncmap,%d\n", cfg->ncmap);

	P("Found %ldM ranges,%ld\n", (CBUF_MAPSIZE / DUMP_1MB), cfg->found4m);
	P("Found small pages,%ld\n", cfg->foundsm);

	P("Compression level,%d\n", cfg->clevel);
	P("Compression type,%s lzjb\n",
	    cfg->clevel == DUMP_CLEVEL_SERIAL ? "serial" : "parallel");
	P("Compression ratio,%d.%02d\n", compress_ratio / 100, compress_ratio %
	    100);
	P("nhelper_used,%d\n", cfg->nhelper_used);

	P("Dump I/O rate MBS,%d.%02d\n", iorate / 100, iorate % 100);
	P("..total bytes,%lld\n", (u_longlong_t)ds->nwrite);
	P("..total nsec,%lld\n", (u_longlong_t)ds->iotime);
	P("dumpbuf.iosize,%ld\n", dumpbuf.iosize);
	P("dumpbuf.size,%ld\n", dumpbuf.size);

	P("Dump pages/sec,%llu\n", (u_longlong_t)ds->npages / sec);
	P("Dump pages,%llu\n", (u_longlong_t)ds->npages);
	P("Dump time,%d\n", sec);

	if (ds->pages_mapped > 0)
		P("per-cent map utilization,%d\n", (int)((100 * ds->pages_used)
		    / ds->pages_mapped));

	P("\nPer-page metrics:\n");
	if (ds->npages > 0) {
		/* Sum each helper's per-page timings into ds->perpage. */
		for (hp = cfg->helper; hp != hpend; hp++) {
#define PERPAGE(x)	ds->perpage.x += hp->perpage.x;
			PERPAGES;
#undef PERPAGE
		}
/* Report each accumulated timing as nsec per dumped page. */
#define PERPAGE(x) \
		P("%s nsec/page,%d\n", #x, (int)(ds->perpage.x / ds->npages));
		PERPAGES;
#undef PERPAGE
		P("freebufq.empty,%d\n", (int)(ds->freebufq.empty /
		    ds->npages));
		P("helperq.empty,%d\n", (int)(ds->helperq.empty /
		    ds->npages));
		P("writerq.empty,%d\n", (int)(ds->writerq.empty /
		    ds->npages));
		P("mainq.empty,%d\n", (int)(ds->mainq.empty / ds->npages));

		P("I/O wait nsec/page,%llu\n", (u_longlong_t)(ds->iowait /
		    ds->npages));
	}
#undef P
	/* Zero the unused tail of the caller's buffer. */
	if (p < e)
		bzero(p, e - p);
	return (p - buf);
}
2362 #endif  /* COLLECT_METRICS */
2363 
2364 /*
2365  * Dump the system.
2366  */
2367 void
2368 dumpsys(void)
2369 {
2370         dumpsync_t *ds = &dumpsync;
2371         taskq_t *livetaskq = NULL;
2372         pfn_t pfn;
2373         pgcnt_t bitnum;
2374         proc_t *p;
2375         helper_t *hp, *hpend = &dumpcfg.helper[dumpcfg.nhelper];
2376         cbuf_t *cp;
2377         pid_t npids, pidx;
2378         char *content;
2379         char *buf;
2380         size_t size;
2381         int save_dump_clevel;
2382         dumpmlw_t mlw;
2383         dumpcsize_t datatag;
2384         dumpdatahdr_t datahdr;
2385 
2386         if (dumpvp == NULL || dumphdr == NULL) {
2387                 uprintf("skipping system dump - no dump device configured\n");
2388                 if (panicstr) {
2389                         dumpcfg.helpers_wanted = 0;
2390                         dumpsys_spinunlock(&dumpcfg.helper_lock);
2391                 }
2392                 return;
2393         }
2394         dumpbuf.cur = dumpbuf.start;
2395 
2396         /* clear the sync variables */
2397         ASSERT(dumpcfg.nhelper > 0);
2398         bzero(ds, sizeof (*ds));
2399         ds->dumpcpu = CPU->cpu_id;
2400 
2401         /*
2402          * Calculate the starting block for dump.  If we're dumping on a
2403          * swap device, start 1/5 of the way in; otherwise, start at the
2404          * beginning.  And never use the first page -- it may be a disk label.
2405          */
2406         if (dumpvp->v_flag & VISSWAP)
2407                 dumphdr->dump_start = P2ROUNDUP(dumpvp_size / 5, DUMP_OFFSET);
2408         else
2409                 dumphdr->dump_start = DUMP_OFFSET;
2410 
2411         dumphdr->dump_flags = DF_VALID | DF_COMPLETE | DF_LIVE | DF_COMPRESSED;
2412         dumphdr->dump_crashtime = gethrestime_sec();
2413         dumphdr->dump_npages = 0;
2414         dumphdr->dump_nvtop = 0;
2415         bzero(dumpcfg.bitmap, BT_SIZEOFMAP(dumpcfg.bitmapsize));
2416         dump_timeleft = dump_timeout;
2417 
2418         if (panicstr) {
2419                 dumphdr->dump_flags &= ~DF_LIVE;
2420                 (void) VOP_DUMPCTL(dumpvp, DUMP_FREE, NULL, NULL);
2421                 (void) VOP_DUMPCTL(dumpvp, DUMP_ALLOC, NULL, NULL);
2422                 (void) vsnprintf(dumphdr->dump_panicstring, DUMP_PANICSIZE,
2423                     panicstr, panicargs);
2424 
2425         }
2426 
2427         if (dump_conflags & DUMP_ALL)
2428                 content = "all";
2429         else if (dump_conflags & DUMP_CURPROC)
2430                 content = "kernel + curproc";
2431         else
2432                 content = "kernel";
2433         uprintf("dumping to %s, offset %lld, content: %s\n", dumppath,
2434             dumphdr->dump_start, content);
2435 
2436         /* Make sure nodename is current */
2437         bcopy(utsname.nodename, dumphdr->dump_utsname.nodename, SYS_NMLN);
2438 
2439         /*
2440          * If this is a live dump, try to open a VCHR vnode for better
2441          * performance. We must take care to flush the buffer cache
2442          * first.
2443          */
2444         if (!panicstr) {
2445                 vnode_t *cdev_vp, *cmn_cdev_vp;
2446 
2447                 ASSERT(dumpbuf.cdev_vp == NULL);
2448                 cdev_vp = makespecvp(VTOS(dumpvp)->s_dev, VCHR);
2449                 if (cdev_vp != NULL) {
2450                         cmn_cdev_vp = common_specvp(cdev_vp);
2451                         if (VOP_OPEN(&cmn_cdev_vp, FREAD | FWRITE, kcred, NULL)
2452                             == 0) {
2453                                 if (vn_has_cached_data(dumpvp))
2454                                         (void) pvn_vplist_dirty(dumpvp, 0, NULL,
2455                                             B_INVAL | B_TRUNC, kcred);
2456                                 dumpbuf.cdev_vp = cmn_cdev_vp;
2457                         } else {
2458                                 VN_RELE(cdev_vp);
2459                         }
2460                 }
2461         }
2462 
2463         /*
2464          * Store a hires timestamp so we can look it up during debugging.
2465          */
2466         lbolt_debug_entry();
2467 
2468         /*
2469          * Leave room for the message and ereport save areas and terminal dump
2470          * header.
2471          */
2472         dumpbuf.vp_limit = dumpvp_size - DUMP_LOGSIZE - DUMP_OFFSET -
2473             DUMP_ERPTSIZE;
2474 
2475         /*
2476          * Write out the symbol table.  It's no longer compressed,
2477          * so its 'size' and 'csize' are equal.
2478          */
2479         dumpbuf.vp_off = dumphdr->dump_ksyms = dumphdr->dump_start + PAGESIZE;
2480         dumphdr->dump_ksyms_size = dumphdr->dump_ksyms_csize =
2481             ksyms_snapshot(dumpvp_ksyms_write, NULL, LONG_MAX);
2482 
2483         /*
2484          * Write out the translation map.
2485          */
2486         dumphdr->dump_map = dumpvp_flush();
2487         dump_as(&kas);
2488         dumphdr->dump_nvtop += dump_plat_addr();
2489 
2490         /*
2491          * call into hat, which may have unmapped pages that also need to
2492          * be in the dump
2493          */
2494         hat_dump();
2495 
2496         if (dump_conflags & DUMP_ALL) {
2497                 mutex_enter(&pidlock);
2498 
2499                 for (npids = 0, p = practive; p != NULL; p = p->p_next)
2500                         dumpcfg.pids[npids++] = p->p_pid;
2501 
2502                 mutex_exit(&pidlock);
2503 
2504                 for (pidx = 0; pidx < npids; pidx++)
2505                         (void) dump_process(dumpcfg.pids[pidx]);
2506 
2507                 dump_init_memlist_walker(&mlw);
2508                 for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum++) {
2509                         dump_timeleft = dump_timeout;
2510                         pfn = dump_bitnum_to_pfn(bitnum, &mlw);
2511                         /*
2512                          * Some hypervisors do not have all pages available to
2513                          * be accessed by the guest OS.  Check for page
2514                          * accessibility.
2515                          */
2516                         if (plat_hold_page(pfn, PLAT_HOLD_NO_LOCK, NULL) !=
2517                             PLAT_HOLD_OK)
2518                                 continue;
2519                         BT_SET(dumpcfg.bitmap, bitnum);
2520                 }
2521                 dumphdr->dump_npages = dumpcfg.bitmapsize;
2522                 dumphdr->dump_flags |= DF_ALL;
2523 
2524         } else if (dump_conflags & DUMP_CURPROC) {
2525                 /*
2526                  * Determine which pid is to be dumped.  If we're panicking, we
2527                  * dump the process associated with panic_thread (if any).  If
2528                  * this is a live dump, we dump the process associated with
2529                  * curthread.
2530                  */
2531                 npids = 0;
2532                 if (panicstr) {
2533                         if (panic_thread != NULL &&
2534                             panic_thread->t_procp != NULL &&
2535                             panic_thread->t_procp != &p0) {
2536                                 dumpcfg.pids[npids++] =
2537                                     panic_thread->t_procp->p_pid;
2538                         }
2539                 } else {
2540                         dumpcfg.pids[npids++] = curthread->t_procp->p_pid;
2541                 }
2542 
2543                 if (npids && dump_process(dumpcfg.pids[0]) == 0)
2544                         dumphdr->dump_flags |= DF_CURPROC;
2545                 else
2546                         dumphdr->dump_flags |= DF_KERNEL;
2547 
2548         } else {
2549                 dumphdr->dump_flags |= DF_KERNEL;
2550         }
2551 
2552         dumphdr->dump_hashmask = (1 << highbit(dumphdr->dump_nvtop - 1)) - 1;
2553 
2554         /*
2555          * Write out the pfn table.
2556          */
2557         dumphdr->dump_pfn = dumpvp_flush();
2558         dump_init_memlist_walker(&mlw);
2559         for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum++) {
2560                 dump_timeleft = dump_timeout;
2561                 if (!BT_TEST(dumpcfg.bitmap, bitnum))
2562                         continue;
2563                 pfn = dump_bitnum_to_pfn(bitnum, &mlw);
2564                 ASSERT(pfn != PFN_INVALID);
2565                 dumpvp_write(&pfn, sizeof (pfn_t));
2566         }
2567         dump_plat_pfn();
2568 
2569         /*
2570          * Write out all the pages.
2571          * Map pages, copy them handling UEs, compress, and write them out.
2572          * Cooperate with any helpers running on CPUs in panic_idle().
2573          */
2574         dumphdr->dump_data = dumpvp_flush();
2575 
2576         bzero(dumpcfg.helpermap, BT_SIZEOFMAP(NCPU));
2577         ds->live = dumpcfg.clevel > DUMP_CLEVEL_SERIAL &&
2578             (dumphdr->dump_flags & DF_LIVE) != 0;
2579 
2580         save_dump_clevel = dumpcfg.clevel;
2581         if (panicstr)
2582                 dumpsys_get_maxmem();
2583 
2584         dumpcfg.nhelper_used = 0;
2585         for (hp = dumpcfg.helper; hp != hpend; hp++) {
2586                 if (hp->page == NULL) {
2587                         hp->helper = DONEHELPER;
2588                         continue;
2589                 }
2590                 ++dumpcfg.nhelper_used;
2591                 hp->helper = FREEHELPER;
2592                 hp->taskqid = NULL;
2593                 hp->ds = ds;
2594                 bzero(&hp->perpage, sizeof (hp->perpage));
2595         }
2596 
2597         CQ_OPEN(freebufq);
2598         CQ_OPEN(helperq);
2599 
2600         dumpcfg.ncbuf_used = 0;
2601         for (cp = dumpcfg.cbuf; cp != &dumpcfg.cbuf[dumpcfg.ncbuf]; cp++) {
2602                 if (cp->buf != NULL) {
2603                         CQ_PUT(freebufq, cp, CBUF_FREEBUF);
2604                         ++dumpcfg.ncbuf_used;
2605                 }
2606         }
2607 
2608         for (cp = dumpcfg.cmap; cp != &dumpcfg.cmap[dumpcfg.ncmap]; cp++)
2609                 CQ_PUT(mainq, cp, CBUF_FREEMAP);
2610 
2611         ds->start = gethrtime();
2612         ds->iowaitts = ds->start;
2613 
2614         /* start helpers */
2615         if (ds->live) {
2616                 int n = dumpcfg.nhelper_used;
2617                 int pri = MINCLSYSPRI - 25;
2618 
2619                 livetaskq = taskq_create("LiveDump", n, pri, n, n,
2620                     TASKQ_PREPOPULATE);
2621                 for (hp = dumpcfg.helper; hp != hpend; hp++) {
2622                         if (hp->page == NULL)
2623                                 continue;
2624                         hp->helper = hp - dumpcfg.helper;
2625                         hp->taskqid = taskq_dispatch(livetaskq,
2626                             dumpsys_live_helper, (void *)hp, TQ_NOSLEEP);
2627                 }
2628 
2629         } else {
2630                 if (panicstr)
2631                         kmem_dump_begin();
2632                 dumpcfg.helpers_wanted = dumpcfg.clevel > DUMP_CLEVEL_SERIAL;
2633                 dumpsys_spinunlock(&dumpcfg.helper_lock);
2634         }
2635 
2636         /* run main task */
2637         dumpsys_main_task(ds);
2638 
2639         ds->elapsed = gethrtime() - ds->start;
2640         if (ds->elapsed < 1)
2641                 ds->elapsed = 1;
2642 
2643         if (livetaskq != NULL)
2644                 taskq_destroy(livetaskq);
2645 
2646         if (ds->neednl) {
2647                 uprintf("\n");
2648                 ds->neednl = 0;
2649         }
2650 
2651         /* record actual pages dumped */
2652         dumphdr->dump_npages = ds->npages;
2653 
2654         /* platform-specific data */
2655         dumphdr->dump_npages += dump_plat_data(dumpcfg.cbuf[0].buf);
2656 
2657         /* note any errors by clearing DF_COMPLETE */
2658         if (dump_ioerr || ds->npages < dumphdr->dump_npages)
2659                 dumphdr->dump_flags &= ~DF_COMPLETE;
2660 
2661         /* end of stream blocks */
2662         datatag = 0;
2663         dumpvp_write(&datatag, sizeof (datatag));
2664 
2665         bzero(&datahdr, sizeof (datahdr));
2666 
2667         /* buffer for metrics */
2668         buf = dumpcfg.cbuf[0].buf;
2669         size = MIN(dumpcfg.cbuf[0].size, DUMP_OFFSET - sizeof (dumphdr_t) -
2670             sizeof (dumpdatahdr_t));
2671 
2672         /* finish the kmem intercepts, collect kmem verbose info */
2673         if (panicstr) {
2674                 datahdr.dump_metrics = kmem_dump_finish(buf, size);
2675                 buf += datahdr.dump_metrics;
2676                 size -= datahdr.dump_metrics;
2677         }
2678 
2679         /* record in the header whether this is a fault-management panic */
2680         if (panicstr)
2681                 dumphdr->dump_fm_panic = is_fm_panic();
2682 
2683         /* compression info in data header */
2684         datahdr.dump_datahdr_magic = DUMP_DATAHDR_MAGIC;
2685         datahdr.dump_datahdr_version = DUMP_DATAHDR_VERSION;
2686         datahdr.dump_maxcsize = CBUF_SIZE;
2687         datahdr.dump_maxrange = CBUF_MAPSIZE / PAGESIZE;
2688         datahdr.dump_nstreams = dumpcfg.nhelper_used;
2689         datahdr.dump_clevel = dumpcfg.clevel;
2690 #ifdef COLLECT_METRICS
2691         if (dump_metrics_on)
2692                 datahdr.dump_metrics += dumpsys_metrics(ds, buf, size);
2693 #endif
2694         datahdr.dump_data_csize = dumpvp_flush() - dumphdr->dump_data;
2695 
2696         /*
2697          * Write out the initial and terminal dump headers.
2698          */
2699         dumpbuf.vp_off = dumphdr->dump_start;
2700         dumpvp_write(dumphdr, sizeof (dumphdr_t));
2701         (void) dumpvp_flush();
2702 
2703         dumpbuf.vp_limit = dumpvp_size;
2704         dumpbuf.vp_off = dumpbuf.vp_limit - DUMP_OFFSET;
2705         dumpvp_write(dumphdr, sizeof (dumphdr_t));
2706         dumpvp_write(&datahdr, sizeof (dumpdatahdr_t));
2707         dumpvp_write(dumpcfg.cbuf[0].buf, datahdr.dump_metrics);
2708 
2709         (void) dumpvp_flush();
2710 
2711         uprintf("\r%3d%% done: %llu pages dumped, ",
2712             ds->percent_done, (u_longlong_t)ds->npages);
2713 
2714         if (dump_ioerr == 0) {
2715                 uprintf("dump succeeded\n");
2716         } else {
2717                 uprintf("dump failed: error %d\n", dump_ioerr);
2718 #ifdef DEBUG
2719                 if (panicstr)
2720                         debug_enter("dump failed");
2721 #endif
2722         }
2723 
2724         /*
2725          * Write out all undelivered messages.  This has to be the *last*
2726          * thing we do because the dump process itself emits messages.
2727          */
2728         if (panicstr) {
2729                 dump_summary();
2730                 dump_ereports();
2731                 dump_messages();
2732         }
2733 
2734         delay(2 * hz);  /* let people see the 'done' message */
2735         dump_timeleft = 0;
2736         dump_ioerr = 0;
2737 
2738         /* restore settings after live dump completes */
2739         if (!panicstr) {
2740                 dumpcfg.clevel = save_dump_clevel;
2741 
2742                 /* release any VCHR open of the dump device */
2743                 if (dumpbuf.cdev_vp != NULL) {
2744                         (void) VOP_CLOSE(dumpbuf.cdev_vp, FREAD | FWRITE, 1, 0,
2745                             kcred, NULL);
2746                         VN_RELE(dumpbuf.cdev_vp);
2747                         dumpbuf.cdev_vp = NULL;
2748                 }
2749         }
2750 }
2751 
2752 /*
2753  * This function is called whenever the memory size, as represented
2754  * by the phys_install list, changes.
2755  */
2756 void
2757 dump_resize()
2758 {
2759         mutex_enter(&dump_lock);
2760         dumphdr_init();
2761         dumpbuf_resize();
2762         dump_update_clevel();
2763         mutex_exit(&dump_lock);
2764 }
2765 
2766 /*
2767  * This function allows for dynamic resizing of a dump area. It assumes that
2768  * the underlying device has update its appropriate size(9P).
2769  */
2770 int
2771 dumpvp_resize()
2772 {
2773         int error;
2774         vattr_t vattr;
2775 
2776         mutex_enter(&dump_lock);
2777         vattr.va_mask = AT_SIZE;
2778         if ((error = VOP_GETATTR(dumpvp, &vattr, 0, kcred, NULL)) != 0) {
2779                 mutex_exit(&dump_lock);
2780                 return (error);
2781         }
2782 
2783         if (error == 0 && vattr.va_size < 2 * DUMP_LOGSIZE + DUMP_ERPTSIZE) {
2784                 mutex_exit(&dump_lock);
2785                 return (ENOSPC);
2786         }
2787 
2788         dumpvp_size = vattr.va_size & -DUMP_OFFSET;
2789         mutex_exit(&dump_lock);
2790         return (0);
2791 }
2792 
2793 int
2794 dump_set_uuid(const char *uuidstr)
2795 {
2796         const char *ptr;
2797         int i;
2798 
2799         if (uuidstr == NULL || strnlen(uuidstr, 36 + 1) != 36)
2800                 return (EINVAL);
2801 
2802         /* uuid_parse is not common code so check manually */
2803         for (i = 0, ptr = uuidstr; i < 36; i++, ptr++) {
2804                 switch (i) {
2805                 case 8:
2806                 case 13:
2807                 case 18:
2808                 case 23:
2809                         if (*ptr != '-')
2810                                 return (EINVAL);
2811                         break;
2812 
2813                 default:
2814                         if (!isxdigit(*ptr))
2815                                 return (EINVAL);
2816                         break;
2817                 }
2818         }
2819 
2820         if (dump_osimage_uuid[0] != '\0')
2821                 return (EALREADY);
2822 
2823         (void) strncpy(dump_osimage_uuid, uuidstr, 36 + 1);
2824 
2825         cmn_err(CE_CONT, "?This Solaris instance has UUID %s\n",
2826             dump_osimage_uuid);
2827 
2828         return (0);
2829 }
2830 
2831 const char *
2832 dump_get_uuid(void)
2833 {
2834         return (dump_osimage_uuid[0] != '\0' ? dump_osimage_uuid : "");
2835 }