Print this page
9525 kmem_dump_size is a corrupting influence


   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2016 Joyent, Inc.
  25  */
  26 
  27 #include <sys/types.h>
  28 #include <sys/param.h>
  29 #include <sys/systm.h>
  30 #include <sys/vm.h>
  31 #include <sys/proc.h>
  32 #include <sys/file.h>
  33 #include <sys/conf.h>
  34 #include <sys/kmem.h>
  35 #include <sys/mem.h>
  36 #include <sys/mman.h>
  37 #include <sys/vnode.h>
  38 #include <sys/errno.h>
  39 #include <sys/memlist.h>
  40 #include <sys/dumphdr.h>
  41 #include <sys/dumpadm.h>
  42 #include <sys/ksyms.h>
  43 #include <sys/compress.h>
  44 #include <sys/stream.h>


  57 #include <sys/fs/snode.h>
  58 #include <sys/ontrap.h>
  59 #include <sys/panic.h>
  60 #include <sys/dkio.h>
  61 #include <sys/vtoc.h>
  62 #include <sys/errorq.h>
  63 #include <sys/fm/util.h>
  64 #include <sys/fs/zfs.h>
  65 
  66 #include <vm/hat.h>
  67 #include <vm/as.h>
  68 #include <vm/page.h>
  69 #include <vm/pvn.h>
  70 #include <vm/seg.h>
  71 #include <vm/seg_kmem.h>
  72 #include <sys/clock_impl.h>
  73 #include <sys/hold_page.h>
  74 
  75 #include <bzip2/bzlib.h>
  76 


  77 /*
  78  * Crash dump time is dominated by disk write time.  To reduce this,
  79  * the stronger compression method bzip2 is applied to reduce the dump
  80  * size and hence reduce I/O time.  However, bzip2 is much more
  81  * computationally expensive than the existing lzjb algorithm, so to
  82  * avoid increasing compression time, CPUs that are otherwise idle
  83  * during panic are employed to parallelize the compression task.
  84  * Many helper CPUs are needed to prevent bzip2 from being a
  85  * bottleneck, and on systems with too few CPUs, the lzjb algorithm is
  86  * parallelized instead. Lastly, I/O and compression are performed by
  87  * different CPUs, and are hence overlapped in time, unlike the older
  88  * serial code.
  89  *
  90  * Another important consideration is the speed of the dump
  91  * device. Faster disks need fewer CPUs in order to benefit from
  92  * parallel lzjb versus parallel bzip2. Therefore, the CPU count
  93  * threshold for switching from parallel lzjb to parallel bzip2 is
  94  * elevated for faster disks. The dump device speed is inferred from
  95  * the setting for dumpbuf.iosize, see dump_update_clevel.
  96  */


 121  *      Higher numbers give greater compression, but take more memory
 122  *      and time. Memory used per helper is ~(dump_bzip2_level * 1MB).
 123  *
 124  * dump_plat_mincpu     the cross-over limit for using bzip2 (per platform):
 125  *      if dump_plat_mincpu == 0, then always do single threaded dump
 126  *      if ncpu >= dump_plat_mincpu then try to use bzip2
 127  *
 128  * dump_metrics_on      if set, metrics are collected in the kernel, passed
 129  *      to savecore via the dump file, and recorded by savecore in
 130  *      METRICS.txt.
 131  */
 132 uint_t dump_ncpu_low = 4;       /* minimum config for parallel lzjb */
 133 uint_t dump_bzip2_level = 1;    /* bzip2 level (1-9) */
 134 
 135 /* Use dump_plat_mincpu_default unless this variable is set by /etc/system */
 136 #define MINCPU_NOT_SET  ((uint_t)-1)
 137 uint_t dump_plat_mincpu = MINCPU_NOT_SET;
 138 
 139 /* tunables for pre-reserved heap; both feed the kmem_dump_init() size */
 140 uint_t dump_kmem_permap = 1024; /* reserve per cmap entry (x ncmap) */
 141 uint_t dump_kmem_pages = 8;     /* extra reserve, in units of PAGESIZE */
 142 
 143 /* Define multiple buffers per helper to avoid stalling */
 144 #define NCBUF_PER_HELPER        2
 145 #define NCMAP_PER_HELPER        4
 146 
 147 /* minimum number of helpers configured (tracks dump_ncpu_low) */
 148 #define MINHELPERS      (dump_ncpu_low)
 149 #define MINCBUFS        (MINHELPERS * NCBUF_PER_HELPER)
 150 
 151 /*
 152  * Define constant parameters.
 153  *
 154  * CBUF_SIZE            size of an output buffer
 155  *
 156  * CBUF_MAPSIZE         size of virtual range for mapping pages
 157  *
 158  * CBUF_MAPNP           size of virtual range in pages
 159  *
 160  */
 161 #define DUMP_1KB        ((size_t)1 << 10)       /* 1024, typed as size_t */


 662         }
 663 
 664         new->cmap = kmem_zalloc(new->ncmap * sizeof (cbuf_t), KM_SLEEP);
 665         cpend = &new->cmap[new->ncmap];
 666         for (cp = new->cmap; cp != cpend; cp++) {
 667                 cp->state = CBUF_FREEMAP;
 668                 cp->size = CBUF_MAPSIZE;
 669                 cp->buf = vmem_xalloc(heap_arena, CBUF_MAPSIZE, CBUF_MAPSIZE,
 670                     0, 0, NULL, NULL, VM_SLEEP);
 671         }
 672 
 673         /* reserve VA to be backed with spare pages at crash time */
 674         if (new->maxsize > 0) {
 675                 new->maxsize = P2ROUNDUP(new->maxsize, PAGESIZE);
 676                 new->maxvmsize = P2ROUNDUP(new->maxsize, CBUF_MAPSIZE);
 677                 new->maxvm = vmem_xalloc(heap_arena, new->maxvmsize,
 678                     CBUF_MAPSIZE, 0, 0, NULL, NULL, VM_SLEEP);
 679         }
 680 
 681         /*
 682          * Reserve memory for kmem allocation calls made during crash
 683          * dump.  The hat layer allocates memory for each mapping
 684          * created, and the I/O path allocates buffers and data structs.
 685          * Add a few pages for safety.




 686          */








 687         kmem_dump_init((new->ncmap * dump_kmem_permap) +
 688             (dump_kmem_pages * PAGESIZE));
 689 
 690         /* set new config pointers */
 691         *old = *new;
 692 }
 693 
 694 /*
 695  * Walker over a list of struct memlist, used to optimize bitnum to pfn
 696  * lookup.  It holds the list-traversal state described per field below.
 697  */
 698 typedef struct dumpmlw {
 699         struct memlist  *mp;            /* current memlist entry */
 700         pgcnt_t         basenum;        /* bitnum base offset */
 701         pgcnt_t         mppages;        /* current memlist size, in pages */
 702         pgcnt_t         mpleft;         /* pages to end of current memlist */
 703         pfn_t           mpaddr;         /* first pfn in current memlist */
 704 } dumpmlw_t;
 705 
 706 /* initialize the walker */




   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2018 Joyent, Inc.
  25  */
  26 
  27 #include <sys/types.h>
  28 #include <sys/param.h>
  29 #include <sys/systm.h>
  30 #include <sys/vm.h>
  31 #include <sys/proc.h>
  32 #include <sys/file.h>
  33 #include <sys/conf.h>
  34 #include <sys/kmem.h>
  35 #include <sys/mem.h>
  36 #include <sys/mman.h>
  37 #include <sys/vnode.h>
  38 #include <sys/errno.h>
  39 #include <sys/memlist.h>
  40 #include <sys/dumphdr.h>
  41 #include <sys/dumpadm.h>
  42 #include <sys/ksyms.h>
  43 #include <sys/compress.h>
  44 #include <sys/stream.h>


  57 #include <sys/fs/snode.h>
  58 #include <sys/ontrap.h>
  59 #include <sys/panic.h>
  60 #include <sys/dkio.h>
  61 #include <sys/vtoc.h>
  62 #include <sys/errorq.h>
  63 #include <sys/fm/util.h>
  64 #include <sys/fs/zfs.h>
  65 
  66 #include <vm/hat.h>
  67 #include <vm/as.h>
  68 #include <vm/page.h>
  69 #include <vm/pvn.h>
  70 #include <vm/seg.h>
  71 #include <vm/seg_kmem.h>
  72 #include <sys/clock_impl.h>
  73 #include <sys/hold_page.h>
  74 
  75 #include <bzip2/bzlib.h>
  76 
  77 #define ONE_GIG (1024 * 1024 * 1024UL)  /* 2^30 bytes, as unsigned long */
  78 
  79 /*
  80  * Crash dump time is dominated by disk write time.  To reduce this,
  81  * the stronger compression method bzip2 is applied to reduce the dump
  82  * size and hence reduce I/O time.  However, bzip2 is much more
  83  * computationally expensive than the existing lzjb algorithm, so to
  84  * avoid increasing compression time, CPUs that are otherwise idle
  85  * during panic are employed to parallelize the compression task.
  86  * Many helper CPUs are needed to prevent bzip2 from being a
  87  * bottleneck, and on systems with too few CPUs, the lzjb algorithm is
  88  * parallelized instead. Lastly, I/O and compression are performed by
  89  * different CPUs, and are hence overlapped in time, unlike the older
  90  * serial code.
  91  *
  92  * Another important consideration is the speed of the dump
  93  * device. Faster disks need fewer CPUs in order to benefit from
  94  * parallel lzjb versus parallel bzip2. Therefore, the CPU count
  95  * threshold for switching from parallel lzjb to parallel bzip2 is
  96  * elevated for faster disks. The dump device speed is inferred from
  97  * the setting for dumpbuf.iosize, see dump_update_clevel.
  98  */


 123  *      Higher numbers give greater compression, but take more memory
 124  *      and time. Memory used per helper is ~(dump_bzip2_level * 1MB).
 125  *
 126  * dump_plat_mincpu     the cross-over limit for using bzip2 (per platform):
 127  *      if dump_plat_mincpu == 0, then always do single threaded dump
 128  *      if ncpu >= dump_plat_mincpu then try to use bzip2
 129  *
 130  * dump_metrics_on      if set, metrics are collected in the kernel, passed
 131  *      to savecore via the dump file, and recorded by savecore in
 132  *      METRICS.txt.
 133  */
 134 uint_t dump_ncpu_low = 4;       /* minimum config for parallel lzjb */
 135 uint_t dump_bzip2_level = 1;    /* bzip2 level (1-9) */
 136 
 137 /* Use dump_plat_mincpu_default unless this variable is set by /etc/system */
 138 #define MINCPU_NOT_SET  ((uint_t)-1)
 139 uint_t dump_plat_mincpu = MINCPU_NOT_SET;
 140 
 141 /* tunables for pre-reserved heap; both feed the kmem_dump_init() size */
 142 uint_t dump_kmem_permap = 1024; /* reserve per cmap entry (x ncmap) */
 143 uint_t dump_kmem_pages = 0;     /* extra pages; 0 = chosen from physmem */
 144 
 145 /* Define multiple buffers per helper to avoid stalling */
 146 #define NCBUF_PER_HELPER        2
 147 #define NCMAP_PER_HELPER        4
 148 
 149 /* minimum number of helpers configured (tracks dump_ncpu_low) */
 150 #define MINHELPERS      (dump_ncpu_low)
 151 #define MINCBUFS        (MINHELPERS * NCBUF_PER_HELPER)
 152 
 153 /*
 154  * Define constant parameters.
 155  *
 156  * CBUF_SIZE            size of an output buffer
 157  *
 158  * CBUF_MAPSIZE         size of virtual range for mapping pages
 159  *
 160  * CBUF_MAPNP           size of virtual range in pages
 161  *
 162  */
 163 #define DUMP_1KB        ((size_t)1 << 10)       /* 1024, typed as size_t */


 664         }
 665 
 666         new->cmap = kmem_zalloc(new->ncmap * sizeof (cbuf_t), KM_SLEEP);
 667         cpend = &new->cmap[new->ncmap];
 668         for (cp = new->cmap; cp != cpend; cp++) {
 669                 cp->state = CBUF_FREEMAP;
 670                 cp->size = CBUF_MAPSIZE;
 671                 cp->buf = vmem_xalloc(heap_arena, CBUF_MAPSIZE, CBUF_MAPSIZE,
 672                     0, 0, NULL, NULL, VM_SLEEP);
 673         }
 674 
 675         /* reserve VA to be backed with spare pages at crash time */
 676         if (new->maxsize > 0) {
 677                 new->maxsize = P2ROUNDUP(new->maxsize, PAGESIZE);
 678                 new->maxvmsize = P2ROUNDUP(new->maxsize, CBUF_MAPSIZE);
 679                 new->maxvm = vmem_xalloc(heap_arena, new->maxvmsize,
 680                     CBUF_MAPSIZE, 0, 0, NULL, NULL, VM_SLEEP);
 681         }
 682 
 683         /*
 684          * Reserve memory for kmem allocation calls made during crash dump.  The
 685          * hat layer allocates memory for each mapping created, and the I/O path
 686          * allocates buffers and data structs.
 687          *
 688          * On larger systems, we easily exceed the lower amount, so we need some
 689          * more space; the cut-over point is relatively arbitrary.  If we run
 690          * out, the only impact is that kmem state in the dump becomes
 691          * inconsistent.
 692          */
 693 
 694         if (dump_kmem_pages == 0) {
 695                 if (physmem > (16 * ONE_GIG) / PAGESIZE)
 696                         dump_kmem_pages = 20;
 697                 else
 698                         dump_kmem_pages = 8;
 699         }
 700 
 701         kmem_dump_init((new->ncmap * dump_kmem_permap) +
 702             (dump_kmem_pages * PAGESIZE));
 703 
 704         /* set new config pointers */
 705         *old = *new;
 706 }
 707 
 708 /*
 709  * Walker over a list of struct memlist, used to optimize bitnum to pfn
 710  * lookup.  It holds the list-traversal state described per field below.
 711  */
 712 typedef struct dumpmlw {
 713         struct memlist  *mp;            /* current memlist entry */
 714         pgcnt_t         basenum;        /* bitnum base offset */
 715         pgcnt_t         mppages;        /* current memlist size, in pages */
 716         pgcnt_t         mpleft;         /* pages to end of current memlist */
 717         pfn_t           mpaddr;         /* first pfn in current memlist */
 718 } dumpmlw_t;
 719 
 720 /* initialize the walker */