Print this page
9694 Parallel dump hangs
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
Reviewed by: Evan Layton <evan.layton@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: John Levon <levon@movementarian.org>

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/os/dumpsubr.c
          +++ new/usr/src/uts/common/os/dumpsubr.c
↓ open down ↓ 14 lines elided ↑ open up ↑
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   * Copyright 2018 Joyent, Inc.
       25 + * Copyright 2018 Nexenta Systems, Inc. All rights reserved.
  25   26   */
  26   27  
  27   28  #include <sys/types.h>
  28   29  #include <sys/param.h>
  29   30  #include <sys/systm.h>
  30   31  #include <sys/vm.h>
  31   32  #include <sys/proc.h>
  32   33  #include <sys/file.h>
  33   34  #include <sys/conf.h>
  34   35  #include <sys/kmem.h>
↓ open down ↓ 29 lines elided ↑ open up ↑
  64   65  #include <sys/fs/zfs.h>
  65   66  
  66   67  #include <vm/hat.h>
  67   68  #include <vm/as.h>
  68   69  #include <vm/page.h>
  69   70  #include <vm/pvn.h>
  70   71  #include <vm/seg.h>
  71   72  #include <vm/seg_kmem.h>
  72   73  #include <sys/clock_impl.h>
  73   74  #include <sys/hold_page.h>
       75 +#include <sys/cpu.h>
  74   76  
  75   77  #include <bzip2/bzlib.h>
  76   78  
  77   79  #define ONE_GIG (1024 * 1024 * 1024UL)
  78   80  
  79   81  /*
  80   82   * Crash dump time is dominated by disk write time.  To reduce this,
  81   83   * the stronger compression method bzip2 is applied to reduce the dump
  82   84   * size and hence reduce I/O time.  However, bzip2 is much more
  83   85   * computationally expensive than the existing lzjb algorithm, so to
↓ open down ↓ 349 lines elided ↑ open up ↑
 433  435          char    *cur;           /* dump write pointer */
 434  436          char    *start;         /* dump buffer address */
 435  437          char    *end;           /* dump buffer end */
 436  438          size_t  size;           /* size of dumpbuf in bytes */
 437  439          size_t  iosize;         /* best transfer size for device */
 438  440  } dumpbuf_t;
 439  441  
 440  442  dumpbuf_t dumpbuf;              /* I/O buffer */
 441  443  
 442  444  /*
      445 + * For parallel dump, defines the maximum time the main task thread will
      446 + * wait for at least one helper to register in dumpcfg.helpermap before
      447 + * assuming there are no helpers and falling back to serial mode.
      448 + * The value is chosen arbitrarily and provides a *really* long wait for
      449 + * any available helper to register.
      450 + */
      451 +#define DUMP_HELPER_MAX_WAIT    1000    /* millisec */
      452 +
      453 +/*
 443  454   * The dump I/O buffer must be at least one page, at most xfer_size
 444  455   * bytes, and should scale with physmem in between.  The transfer size
 445  456   * passed in will either represent a global default (maxphys) or the
 446  457   * best size for the device.  The size of the dumpbuf I/O buffer is
 447  458   * limited by dumpbuf_limit (8MB by default) because the dump
 448  459   * performance saturates beyond a certain size.  The default is to
 449  460   * select 1/4096 of the memory.
 450  461   */
 451  462  static int      dumpbuf_fraction = 12;  /* memory size scale factor */
 452  463  static size_t   dumpbuf_limit = 8 * DUMP_1MB;   /* max I/O buf size */
↓ open down ↓ 31 lines elided ↑ open up ↑
 484  495  
 485  496          new_buf = kmem_alloc(new_size, KM_SLEEP);
 486  497          dumpbuf.size = new_size;
 487  498          dumpbuf.start = new_buf;
 488  499          dumpbuf.end = new_buf + new_size;
 489  500          kmem_free(old_buf, old_size);
 490  501  }
 491  502  
 492  503  /*
 493  504   * dump_update_clevel is called when dumpadm configures the dump device.
 494      - *      Calculate number of helpers and buffers.
 495      - *      Allocate the minimum configuration for now.
      505 + *      Calculate number of helpers and buffers.
      506 + *      Allocate the minimum configuration for now.
 496  507   *
 497  508   * When the dump file is configured we reserve a minimum amount of
 498  509   * memory for use at crash time. But we reserve VA for all the memory
 499  510   * we really want in order to do the fastest dump possible. The VA is
 500  511   * backed by pages not being dumped, according to the bitmap. If
 501  512   * there is insufficient spare memory, however, we fall back to the
 502  513   * minimum.
 503  514   *
 504  515   * Live dump (savecore -L) always uses the minimum config.
 505  516   *
 506  517   * clevel 0 is single threaded lzjb
 507  518   * clevel 1 is parallel lzjb
 508  519   * clevel 2 is parallel bzip2
 509  520   *
 510  521   * The ncpu threshold is selected with dump_plat_mincpu.
 511  522   * On OPL, set_platform_defaults() overrides the sun4u setting.
 512  523   * The actual values are defined via DUMP_PLAT_*_MINCPU macros.
 513  524   *
 514  525   * Architecture         Threshold       Algorithm
 515      - * sun4u                <  51           parallel lzjb
 516      - * sun4u                >= 51           parallel bzip2(*)
 517      - * sun4u OPL            <  8            parallel lzjb
 518      - * sun4u OPL            >= 8            parallel bzip2(*)
 519      - * sun4v                <  128          parallel lzjb
 520      - * sun4v                >= 128          parallel bzip2(*)
      526 + * sun4u                <  51           parallel lzjb
      527 + * sun4u                >= 51           parallel bzip2(*)
      528 + * sun4u OPL            <  8            parallel lzjb
      529 + * sun4u OPL            >= 8            parallel bzip2(*)
      530 + * sun4v                <  128          parallel lzjb
      531 + * sun4v                >= 128          parallel bzip2(*)
 521  532   * x86                  < 11            parallel lzjb
 522  533   * x86                  >= 11           parallel bzip2(*)
 523      - * 32-bit               N/A             single-threaded lzjb
      534 + * 32-bit               N/A             single-threaded lzjb
 524  535   *
 525  536   * (*) bzip2 is only chosen if there is sufficient available
 526  537   * memory for buffers at dump time. See dumpsys_get_maxmem().
 527  538   *
 528  539   * Faster dump devices have larger I/O buffers. The threshold value is
 529  540   * increased according to the size of the dump I/O buffer, because
 530  541   * parallel lzjb performs better with faster disks. For buffers >= 1MB
 531  542   * the threshold is 3X; for buffers >= 256K threshold is 2X.
 532  543   *
 533  544   * For parallel dumps, the number of helpers is ncpu-1. The CPU
↓ open down ↓ 1760 lines elided ↑ open up ↑
2294 2305   */
2295 2306  static void
2296 2307  dumpsys_main_task(void *arg)
2297 2308  {
2298 2309          dumpsync_t *ds = arg;
2299 2310          pgcnt_t pagenum = 0, bitnum = 0, hibitnum;
2300 2311          dumpmlw_t mlw;
2301 2312          cbuf_t *cp;
2302 2313          pgcnt_t baseoff, pfnoff;
2303 2314          pfn_t base, pfn;
2304      -        int i, dumpserial;
     2315 +        boolean_t dumpserial;
     2316 +        int i;
2305 2317  
2306 2318          /*
2307 2319           * Fall back to serial mode if there are no helpers.
2308 2320           * dump_plat_mincpu can be set to 0 at any time.
2309 2321           * dumpcfg.helpermap must contain at least one member.
     2322 +         *
     2323 +         * It is possible that the helpers haven't registered
     2324 +         * in helpermap yet; wait up to DUMP_HELPER_MAX_WAIT
     2325 +         * milliseconds for at least one helper to register.
2310 2326           */
2311      -        dumpserial = 1;
2312      -
     2327 +        dumpserial = B_TRUE;
2313 2328          if (dump_plat_mincpu != 0 && dumpcfg.clevel != 0) {
2314      -                for (i = 0; i < BT_BITOUL(NCPU); ++i) {
2315      -                        if (dumpcfg.helpermap[i] != 0) {
2316      -                                dumpserial = 0;
     2329 +                hrtime_t hrtmax = MSEC2NSEC(DUMP_HELPER_MAX_WAIT);
     2330 +                hrtime_t hrtstart = gethrtime();
     2331 +
     2332 +                for (;;) {
     2333 +                        for (i = 0; i < BT_BITOUL(NCPU); ++i) {
     2334 +                                if (dumpcfg.helpermap[i] != 0) {
     2335 +                                        dumpserial = B_FALSE;
     2336 +                                        break;
     2337 +                                }
     2338 +                        }
     2339 +
     2340 +                        if ((!dumpserial) ||
     2341 +                            ((gethrtime() - hrtstart) >= hrtmax)) {
2317 2342                                  break;
2318 2343                          }
     2344 +
     2345 +                        SMT_PAUSE();
2319 2346                  }
2320      -        }
2321 2347  
2322      -        if (dumpserial) {
2323      -                dumpcfg.clevel = 0;
2324      -                if (dumpcfg.helper[0].lzbuf == NULL)
2325      -                        dumpcfg.helper[0].lzbuf = dumpcfg.helper[1].page;
     2348 +                if (dumpserial) {
     2349 +                        dumpcfg.clevel = 0;
     2350 +                        if (dumpcfg.helper[0].lzbuf == NULL) {
     2351 +                                dumpcfg.helper[0].lzbuf =
     2352 +                                    dumpcfg.helper[1].page;
     2353 +                        }
     2354 +                }
2326 2355          }
2327 2356  
2328 2357          dump_init_memlist_walker(&mlw);
2329 2358  
2330 2359          for (;;) {
2331 2360                  int sec = (gethrtime() - ds->start) / NANOSEC;
2332 2361  
2333 2362                  /*
2334 2363                   * Render a simple progress display on the system console to
2335 2364                   * make clear to the operator that the system has not hung.
↓ open down ↓ 124 lines elided ↑ open up ↑
2460 2489                          ds->pages_used += pagenum - cp->pagenum;
2461 2490  
2462 2491                          CQ_OPEN(mainq);
2463 2492  
2464 2493                          /*
2465 2494                           * If there are no helpers the main task does
2466 2495                           * non-streams lzjb compress.
2467 2496                           */
2468 2497                          if (dumpserial) {
2469 2498                                  dumpsys_lzjb_page(dumpcfg.helper, cp);
2470      -                                break;
     2499 +                        } else {
     2500 +                                /* pass mapped pages to a helper */
     2501 +                                CQ_PUT(helperq, cp, CBUF_INREADY);
2471 2502                          }
2472 2503  
2473      -                        /* pass mapped pages to a helper */
2474      -                        CQ_PUT(helperq, cp, CBUF_INREADY);
2475      -
2476 2504                          /* the last page was done */
2477 2505                          if (bitnum >= dumpcfg.bitmapsize)
2478 2506                                  CQ_CLOSE(helperq);
2479 2507  
2480 2508                          break;
2481 2509  
2482 2510                  case CBUF_USEDMAP:
2483 2511  
2484 2512                          ds->npages += btop(cp->used);
2485 2513  
↓ open down ↓ 84 lines elided ↑ open up ↑
2570 2598                          P("\n");
2571 2599          }
2572 2600  
2573 2601          P("ncbuf_used,%d\n", cfg->ncbuf_used);
2574 2602          P("ncmap,%d\n", cfg->ncmap);
2575 2603  
2576 2604          P("Found %ldM ranges,%ld\n", (CBUF_MAPSIZE / DUMP_1MB), cfg->found4m);
2577 2605          P("Found small pages,%ld\n", cfg->foundsm);
2578 2606  
2579 2607          P("Compression level,%d\n", cfg->clevel);
2580      -        P("Compression type,%s %s\n", cfg->clevel == 0 ? "serial" : "parallel",
     2608 +        P("Compression type,%s %s", cfg->clevel == 0 ? "serial" : "parallel",
2581 2609              cfg->clevel >= DUMP_CLEVEL_BZIP2 ? "bzip2" : "lzjb");
     2610 +        if (cfg->clevel >= DUMP_CLEVEL_BZIP2)
     2611 +                P(" (level %d)\n", dump_bzip2_level);
     2612 +        else
     2613 +                P("\n");
2582 2614          P("Compression ratio,%d.%02d\n", compress_ratio / 100, compress_ratio %
2583 2615              100);
2584 2616          P("nhelper_used,%d\n", cfg->nhelper_used);
2585 2617  
2586 2618          P("Dump I/O rate MBS,%d.%02d\n", iorate / 100, iorate % 100);
2587 2619          P("..total bytes,%lld\n", (u_longlong_t)ds->nwrite);
2588 2620          P("..total nsec,%lld\n", (u_longlong_t)ds->iotime);
2589 2621          P("dumpbuf.iosize,%ld\n", dumpbuf.iosize);
2590 2622          P("dumpbuf.size,%ld\n", dumpbuf.size);
2591 2623  
↓ open down ↓ 513 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX