Print this page
9694 Parallel dump hangs
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
Reviewed by: Evan Layton <evan.layton@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: John Levon <levon@movementarian.org>


   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2018 Joyent, Inc.

  25  */
  26 
  27 #include <sys/types.h>
  28 #include <sys/param.h>
  29 #include <sys/systm.h>
  30 #include <sys/vm.h>
  31 #include <sys/proc.h>
  32 #include <sys/file.h>
  33 #include <sys/conf.h>
  34 #include <sys/kmem.h>
  35 #include <sys/mem.h>
  36 #include <sys/mman.h>
  37 #include <sys/vnode.h>
  38 #include <sys/errno.h>
  39 #include <sys/memlist.h>
  40 #include <sys/dumphdr.h>
  41 #include <sys/dumpadm.h>
  42 #include <sys/ksyms.h>
  43 #include <sys/compress.h>
  44 #include <sys/stream.h>


  54 #include <sys/debug.h>
  55 #include <sys/sunddi.h>
  56 #include <fs/fs_subr.h>
  57 #include <sys/fs/snode.h>
  58 #include <sys/ontrap.h>
  59 #include <sys/panic.h>
  60 #include <sys/dkio.h>
  61 #include <sys/vtoc.h>
  62 #include <sys/errorq.h>
  63 #include <sys/fm/util.h>
  64 #include <sys/fs/zfs.h>
  65 
  66 #include <vm/hat.h>
  67 #include <vm/as.h>
  68 #include <vm/page.h>
  69 #include <vm/pvn.h>
  70 #include <vm/seg.h>
  71 #include <vm/seg_kmem.h>
  72 #include <sys/clock_impl.h>
  73 #include <sys/hold_page.h>

  74 
  75 #include <bzip2/bzlib.h>
  76 
  77 #define ONE_GIG (1024 * 1024 * 1024UL)
  78 
  79 /*
  80  * Crash dump time is dominated by disk write time.  To reduce this,
  81  * the stronger compression method bzip2 is applied to reduce the dump
  82  * size and hence reduce I/O time.  However, bzip2 is much more
  83  * computationally expensive than the existing lzjb algorithm, so to
  84  * avoid increasing compression time, CPUs that are otherwise idle
  85  * during panic are employed to parallelize the compression task.
  86  * Many helper CPUs are needed to prevent bzip2 from being a
  87  * bottleneck, and on systems with too few CPUs, the lzjb algorithm is
  88  * parallelized instead. Lastly, I/O and compression are performed by
  89  * different CPUs, and are hence overlapped in time, unlike the older
  90  * serial code.
  91  *
  92  * Another important consideration is the speed of the dump
  93  * device. Faster disks need fewer CPUs in order to benefit from


 423 /*
 424  * The dump I/O buffer.
 425  *
 426  * There is one I/O buffer, the global dumpbuf, used by dumpvp_write and
 427  * dumpvp_flush. It is sized according to the optimum device transfer speed.
 428  */
 429 typedef struct dumpbuf {
 430         vnode_t *cdev_vp;       /* VCHR open of the dump device */
 431         len_t   vp_limit;       /* maximum write offset */
 432         offset_t vp_off;        /* current dump device offset */
 433         char    *cur;           /* dump write pointer */
 434         char    *start;         /* dump buffer address */
 435         char    *end;           /* dump buffer end */
 436         size_t  size;           /* size of dumpbuf in bytes */
 437         size_t  iosize;         /* best transfer size for device */
 438 } dumpbuf_t;
 439 
 440 dumpbuf_t dumpbuf;              /* I/O buffer */
 441 
 442 /*









 443  * The dump I/O buffer must be at least one page, at most xfer_size
 444  * bytes, and should scale with physmem in between.  The transfer size
 445  * passed in will either represent a global default (maxphys) or the
 446  * best size for the device.  The size of the dumpbuf I/O buffer is
 447  * limited by dumpbuf_limit (8MB by default) because the dump
 448  * performance saturates beyond a certain size.  The default is to
 449  * select 1/4096 of the memory.
 450  */
 451 static int      dumpbuf_fraction = 12;  /* memory size scale factor */
 452 static size_t   dumpbuf_limit = 8 * DUMP_1MB;   /* max I/O buf size */
 453 
 454 static size_t
 455 dumpbuf_iosize(size_t xfer_size)
 456 {
 457         size_t iosize = ptob(physmem >> dumpbuf_fraction);
 458 
 459         if (iosize < PAGESIZE)
 460                 iosize = PAGESIZE;
 461         else if (iosize > xfer_size)
 462                 iosize = xfer_size;


2284                 dumpvp_write(&csize, sizeof (csize));
2285                 dumpvp_write(hp->lzbuf, csize);
2286                 HRSTOP(hp->perpage, write);
2287         }
2288         CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP);
2289         hp->cpin = NULL;
2290 }
2291 
2292 /*
2293  * Main task to dump pages. This is called on the dump CPU.
2294  */
2295 static void
2296 dumpsys_main_task(void *arg)
2297 {
2298         dumpsync_t *ds = arg;
2299         pgcnt_t pagenum = 0, bitnum = 0, hibitnum;
2300         dumpmlw_t mlw;
2301         cbuf_t *cp;
2302         pgcnt_t baseoff, pfnoff;
2303         pfn_t base, pfn;
2304         int i, dumpserial;

2305 
2306         /*
2307          * Fall back to serial mode if there are no helpers.
2308          * dump_plat_mincpu can be set to 0 at any time.
2309          * dumpcfg.helpermap must contain at least one member.




2310          */
2311         dumpserial = 1;
2312 
2313         if (dump_plat_mincpu != 0 && dumpcfg.clevel != 0) {




2314                 for (i = 0; i < BT_BITOUL(NCPU); ++i) {
2315                         if (dumpcfg.helpermap[i] != 0) {
2316                                 dumpserial = 0;
2317                                 break;
2318                         }
2319                 }




2320         }
2321 



2322         if (dumpserial) {
2323                 dumpcfg.clevel = 0;
2324                 if (dumpcfg.helper[0].lzbuf == NULL)
2325                         dumpcfg.helper[0].lzbuf = dumpcfg.helper[1].page;

2326         }


2327 
2328         dump_init_memlist_walker(&mlw);
2329 
2330         for (;;) {
2331                 int sec = (gethrtime() - ds->start) / NANOSEC;
2332 
2333                 /*
2334                  * Render a simple progress display on the system console to
2335                  * make clear to the operator that the system has not hung.
2336                  * Emit an update when dump progress has advanced by one
2337                  * percent, or when no update has been drawn in the last
2338                  * second.
2339                  */
2340                 if (ds->percent > ds->percent_done || sec > ds->sec_done) {
2341                         ds->sec_done = sec;
2342                         ds->percent_done = ds->percent;
2343                         uprintf("^\rdumping: %2d:%02d %3d%% done",
2344                             sec / 60, sec % 60, ds->percent);
2345                         ds->neednl = 1;
2346                 }


2450 
2451                         dump_timeleft = dump_timeout;
2452                         cp->used = ptob(pagenum - cp->pagenum);
2453 
2454                         HRSTART(ds->perpage, map);
2455                         hat_devload(kas.a_hat, cp->buf, cp->size, base,
2456                             PROT_READ, HAT_LOAD_NOCONSIST);
2457                         HRSTOP(ds->perpage, map);
2458 
2459                         ds->pages_mapped += btop(cp->size);
2460                         ds->pages_used += pagenum - cp->pagenum;
2461 
2462                         CQ_OPEN(mainq);
2463 
2464                         /*
2465                          * If there are no helpers the main task does
2466                          * non-streams lzjb compress.
2467                          */
2468                         if (dumpserial) {
2469                                 dumpsys_lzjb_page(dumpcfg.helper, cp);
2470                                 break;
2471                         }
2472 
2473                         /* pass mapped pages to a helper */
2474                         CQ_PUT(helperq, cp, CBUF_INREADY);

2475 
2476                         /* the last page was done */
2477                         if (bitnum >= dumpcfg.bitmapsize)
2478                                 CQ_CLOSE(helperq);
2479 
2480                         break;
2481 
2482                 case CBUF_USEDMAP:
2483 
2484                         ds->npages += btop(cp->used);
2485 
2486                         HRSTART(ds->perpage, unmap);
2487                         hat_unload(kas.a_hat, cp->buf, cp->size, HAT_UNLOAD);
2488                         HRSTOP(ds->perpage, unmap);
2489 
2490                         if (bitnum < dumpcfg.bitmapsize)
2491                                 CQ_PUT(mainq, cp, CBUF_FREEMAP);
2492                         CQ_CLOSE(mainq);
2493 
2494                         ASSERT(ds->npages <= dumphdr->dump_npages);


2560         for (i = 0; i < ncpus; i++) {
2561                 if ((i & 15) == 0)
2562                         P(",,%03d,", i);
2563                 if (i == myid)
2564                         P("   M");
2565                 else if (BT_TEST(cfg->helpermap, i))
2566                         P("%4d", cpu_seq[i]->cpu_id);
2567                 else
2568                         P("   *");
2569                 if ((i & 15) == 15)
2570                         P("\n");
2571         }
2572 
2573         P("ncbuf_used,%d\n", cfg->ncbuf_used);
2574         P("ncmap,%d\n", cfg->ncmap);
2575 
2576         P("Found %ldM ranges,%ld\n", (CBUF_MAPSIZE / DUMP_1MB), cfg->found4m);
2577         P("Found small pages,%ld\n", cfg->foundsm);
2578 
2579         P("Compression level,%d\n", cfg->clevel);
2580         P("Compression type,%s %s\n", cfg->clevel == 0 ? "serial" : "parallel",
2581             cfg->clevel >= DUMP_CLEVEL_BZIP2 ? "bzip2" : "lzjb");




2582         P("Compression ratio,%d.%02d\n", compress_ratio / 100, compress_ratio %
2583             100);
2584         P("nhelper_used,%d\n", cfg->nhelper_used);
2585 
2586         P("Dump I/O rate MBS,%d.%02d\n", iorate / 100, iorate % 100);
2587         P("..total bytes,%lld\n", (u_longlong_t)ds->nwrite);
2588         P("..total nsec,%lld\n", (u_longlong_t)ds->iotime);
2589         P("dumpbuf.iosize,%ld\n", dumpbuf.iosize);
2590         P("dumpbuf.size,%ld\n", dumpbuf.size);
2591 
2592         P("Dump pages/sec,%llu\n", (u_longlong_t)ds->npages / sec);
2593         P("Dump pages,%llu\n", (u_longlong_t)ds->npages);
2594         P("Dump time,%d\n", sec);
2595 
2596         if (ds->pages_mapped > 0)
2597                 P("per-cent map utilization,%d\n", (int)((100 * ds->pages_used)
2598                     / ds->pages_mapped));
2599 
2600         P("\nPer-page metrics:\n");
2601         if (ds->npages > 0) {




   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2018 Joyent, Inc.
  25  * Copyright 2018 Nexenta Systems, Inc. All rights reserved.
  26  */
  27 
  28 #include <sys/types.h>
  29 #include <sys/param.h>
  30 #include <sys/systm.h>
  31 #include <sys/vm.h>
  32 #include <sys/proc.h>
  33 #include <sys/file.h>
  34 #include <sys/conf.h>
  35 #include <sys/kmem.h>
  36 #include <sys/mem.h>
  37 #include <sys/mman.h>
  38 #include <sys/vnode.h>
  39 #include <sys/errno.h>
  40 #include <sys/memlist.h>
  41 #include <sys/dumphdr.h>
  42 #include <sys/dumpadm.h>
  43 #include <sys/ksyms.h>
  44 #include <sys/compress.h>
  45 #include <sys/stream.h>


  55 #include <sys/debug.h>
  56 #include <sys/sunddi.h>
  57 #include <fs/fs_subr.h>
  58 #include <sys/fs/snode.h>
  59 #include <sys/ontrap.h>
  60 #include <sys/panic.h>
  61 #include <sys/dkio.h>
  62 #include <sys/vtoc.h>
  63 #include <sys/errorq.h>
  64 #include <sys/fm/util.h>
  65 #include <sys/fs/zfs.h>
  66 
  67 #include <vm/hat.h>
  68 #include <vm/as.h>
  69 #include <vm/page.h>
  70 #include <vm/pvn.h>
  71 #include <vm/seg.h>
  72 #include <vm/seg_kmem.h>
  73 #include <sys/clock_impl.h>
  74 #include <sys/hold_page.h>
  75 #include <sys/cpu.h>
  76 
  77 #include <bzip2/bzlib.h>
  78 
  79 #define ONE_GIG (1024 * 1024 * 1024UL)
  80 
  81 /*
  82  * Crash dump time is dominated by disk write time.  To reduce this,
  83  * the stronger compression method bzip2 is applied to reduce the dump
  84  * size and hence reduce I/O time.  However, bzip2 is much more
  85  * computationally expensive than the existing lzjb algorithm, so to
  86  * avoid increasing compression time, CPUs that are otherwise idle
  87  * during panic are employed to parallelize the compression task.
  88  * Many helper CPUs are needed to prevent bzip2 from being a
  89  * bottleneck, and on systems with too few CPUs, the lzjb algorithm is
  90  * parallelized instead. Lastly, I/O and compression are performed by
  91  * different CPUs, and are hence overlapped in time, unlike the older
  92  * serial code.
  93  *
  94  * Another important consideration is the speed of the dump
  95  * device. Faster disks need fewer CPUs in order to benefit from


 425 /*
 426  * The dump I/O buffer.
 427  *
 428  * There is one I/O buffer, the global dumpbuf, used by dumpvp_write and
 429  * dumpvp_flush. It is sized according to the optimum device transfer speed.
 430  */
 431 typedef struct dumpbuf {
 432         vnode_t *cdev_vp;       /* VCHR open of the dump device */
 433         len_t   vp_limit;       /* maximum write offset */
 434         offset_t vp_off;        /* current dump device offset */
 435         char    *cur;           /* dump write pointer */
 436         char    *start;         /* dump buffer address */
 437         char    *end;           /* dump buffer end */
 438         size_t  size;           /* size of dumpbuf in bytes */
 439         size_t  iosize;         /* best transfer size for device */
 440 } dumpbuf_t;
 441 
 442 dumpbuf_t dumpbuf;              /* I/O buffer */
 443 
 444 /*
 445  * For parallel dump, defines maximum time main task thread will wait
 446  * for at least one helper to register in dumpcfg.helpermap, before
 447  * assuming there are no helpers and falling back to serial mode.
 448  * The value is chosen arbitrarily and provides a *really* long wait for
 449  * any available helper to register.
 450  */
 451 #define DUMP_HELPER_MAX_WAIT    1000    /* millisec */
 452 
 453 /*
 454  * The dump I/O buffer must be at least one page, at most xfer_size
 455  * bytes, and should scale with physmem in between.  The transfer size
 456  * passed in will either represent a global default (maxphys) or the
 457  * best size for the device.  The size of the dumpbuf I/O buffer is
 458  * limited by dumpbuf_limit (8MB by default) because the dump
 459  * performance saturates beyond a certain size.  The default is to
 460  * select 1/4096 of the memory.
 461  */
 462 static int      dumpbuf_fraction = 12;  /* memory size scale factor */
 463 static size_t   dumpbuf_limit = 8 * DUMP_1MB;   /* max I/O buf size */
 464 
 465 static size_t
 466 dumpbuf_iosize(size_t xfer_size)
 467 {
 468         size_t iosize = ptob(physmem >> dumpbuf_fraction);
 469 
 470         if (iosize < PAGESIZE)
 471                 iosize = PAGESIZE;
 472         else if (iosize > xfer_size)
 473                 iosize = xfer_size;


2295                 dumpvp_write(&csize, sizeof (csize));
2296                 dumpvp_write(hp->lzbuf, csize);
2297                 HRSTOP(hp->perpage, write);
2298         }
2299         CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP);
2300         hp->cpin = NULL;
2301 }
2302 
2303 /*
2304  * Main task to dump pages. This is called on the dump CPU.
2305  */
2306 static void
2307 dumpsys_main_task(void *arg)
2308 {
2309         dumpsync_t *ds = arg;
2310         pgcnt_t pagenum = 0, bitnum = 0, hibitnum;
2311         dumpmlw_t mlw;
2312         cbuf_t *cp;
2313         pgcnt_t baseoff, pfnoff;
2314         pfn_t base, pfn;
2315         boolean_t dumpserial;
2316         int i;
2317 
2318         /*
2319          * Fall back to serial mode if there are no helpers.
2320          * dump_plat_mincpu can be set to 0 at any time.
2321          * dumpcfg.helpermap must contain at least one member.
2322          *
2323          * It is possible that the helpers haven't registered
2324          * in helpermap yet; wait up to DUMP_HELPER_MAX_WAIT for
2325          * at least one helper to register.
2326          */
2327         dumpserial = B_TRUE;

2328         if (dump_plat_mincpu != 0 && dumpcfg.clevel != 0) {
2329                 hrtime_t hrtmax = MSEC2NSEC(DUMP_HELPER_MAX_WAIT);
2330                 hrtime_t hrtstart = gethrtime();
2331 
2332                 for (;;) {
2333                         for (i = 0; i < BT_BITOUL(NCPU); ++i) {
2334                                 if (dumpcfg.helpermap[i] != 0) {
2335                                         dumpserial = B_FALSE;
2336                                         break;
2337                                 }
2338                         }
2339 
2340                         if ((!dumpserial) ||
2341                             ((gethrtime() - hrtstart) >= hrtmax)) {
2342                                 break;
2343                         }
2344 
2345                         SMT_PAUSE();
2346                 }
2347 
2348                 if (dumpserial) {
2349                         dumpcfg.clevel = 0;
2350                         if (dumpcfg.helper[0].lzbuf == NULL) {
2351                                 dumpcfg.helper[0].lzbuf =
2352                                     dumpcfg.helper[1].page;
2353                         }
2354                 }
2355         }
2356 
2357         dump_init_memlist_walker(&mlw);
2358 
2359         for (;;) {
2360                 int sec = (gethrtime() - ds->start) / NANOSEC;
2361 
2362                 /*
2363                  * Render a simple progress display on the system console to
2364                  * make clear to the operator that the system has not hung.
2365                  * Emit an update when dump progress has advanced by one
2366                  * percent, or when no update has been drawn in the last
2367                  * second.
2368                  */
2369                 if (ds->percent > ds->percent_done || sec > ds->sec_done) {
2370                         ds->sec_done = sec;
2371                         ds->percent_done = ds->percent;
2372                         uprintf("^\rdumping: %2d:%02d %3d%% done",
2373                             sec / 60, sec % 60, ds->percent);
2374                         ds->neednl = 1;
2375                 }


2479 
2480                         dump_timeleft = dump_timeout;
2481                         cp->used = ptob(pagenum - cp->pagenum);
2482 
2483                         HRSTART(ds->perpage, map);
2484                         hat_devload(kas.a_hat, cp->buf, cp->size, base,
2485                             PROT_READ, HAT_LOAD_NOCONSIST);
2486                         HRSTOP(ds->perpage, map);
2487 
2488                         ds->pages_mapped += btop(cp->size);
2489                         ds->pages_used += pagenum - cp->pagenum;
2490 
2491                         CQ_OPEN(mainq);
2492 
2493                         /*
2494                          * If there are no helpers the main task does
2495                          * non-streams lzjb compress.
2496                          */
2497                         if (dumpserial) {
2498                                 dumpsys_lzjb_page(dumpcfg.helper, cp);
2499                         } else {


2500                                 /* pass mapped pages to a helper */
2501                                 CQ_PUT(helperq, cp, CBUF_INREADY);
2502                         }
2503 
2504                         /* the last page was done */
2505                         if (bitnum >= dumpcfg.bitmapsize)
2506                                 CQ_CLOSE(helperq);
2507 
2508                         break;
2509 
2510                 case CBUF_USEDMAP:
2511 
2512                         ds->npages += btop(cp->used);
2513 
2514                         HRSTART(ds->perpage, unmap);
2515                         hat_unload(kas.a_hat, cp->buf, cp->size, HAT_UNLOAD);
2516                         HRSTOP(ds->perpage, unmap);
2517 
2518                         if (bitnum < dumpcfg.bitmapsize)
2519                                 CQ_PUT(mainq, cp, CBUF_FREEMAP);
2520                         CQ_CLOSE(mainq);
2521 
2522                         ASSERT(ds->npages <= dumphdr->dump_npages);


2588         for (i = 0; i < ncpus; i++) {
2589                 if ((i & 15) == 0)
2590                         P(",,%03d,", i);
2591                 if (i == myid)
2592                         P("   M");
2593                 else if (BT_TEST(cfg->helpermap, i))
2594                         P("%4d", cpu_seq[i]->cpu_id);
2595                 else
2596                         P("   *");
2597                 if ((i & 15) == 15)
2598                         P("\n");
2599         }
2600 
2601         P("ncbuf_used,%d\n", cfg->ncbuf_used);
2602         P("ncmap,%d\n", cfg->ncmap);
2603 
2604         P("Found %ldM ranges,%ld\n", (CBUF_MAPSIZE / DUMP_1MB), cfg->found4m);
2605         P("Found small pages,%ld\n", cfg->foundsm);
2606 
2607         P("Compression level,%d\n", cfg->clevel);
2608         P("Compression type,%s %s", cfg->clevel == 0 ? "serial" : "parallel",
2609             cfg->clevel >= DUMP_CLEVEL_BZIP2 ? "bzip2" : "lzjb");
2610         if (cfg->clevel >= DUMP_CLEVEL_BZIP2)
2611                 P(" (level %d)\n", dump_bzip2_level);
2612         else
2613                 P("\n");
2614         P("Compression ratio,%d.%02d\n", compress_ratio / 100, compress_ratio %
2615             100);
2616         P("nhelper_used,%d\n", cfg->nhelper_used);
2617 
2618         P("Dump I/O rate MBS,%d.%02d\n", iorate / 100, iorate % 100);
2619         P("..total bytes,%lld\n", (u_longlong_t)ds->nwrite);
2620         P("..total nsec,%lld\n", (u_longlong_t)ds->iotime);
2621         P("dumpbuf.iosize,%ld\n", dumpbuf.iosize);
2622         P("dumpbuf.size,%ld\n", dumpbuf.size);
2623 
2624         P("Dump pages/sec,%llu\n", (u_longlong_t)ds->npages / sec);
2625         P("Dump pages,%llu\n", (u_longlong_t)ds->npages);
2626         P("Dump time,%d\n", sec);
2627 
2628         if (ds->pages_mapped > 0)
2629                 P("per-cent map utilization,%d\n", (int)((100 * ds->pages_used)
2630                     / ds->pages_mapped));
2631 
2632         P("\nPer-page metrics:\n");
2633         if (ds->npages > 0) {