10806 mnode_range_setup() makes assumptions about mnodes
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Toomas Soome <tsoome@me.com>
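
This change replaces the old O(N^2) linking pass in mnode_range_setup() with a sort of the mnoderange array by mnr_pfnlo (via the new mnoderange_cmp() comparator); once the array is sorted, each entry's mnr_next simply points at its predecessor and mtypetop is the last index. The standalone sketch below illustrates only that ordering step. The range_t structure, its field names, and the sample PFN values are simplified stand-ins for illustration, not the kernel's actual definitions.

#include <stdio.h>
#include <stdlib.h>

/*
 * Hypothetical, simplified stand-in for the kernel's mnoderange_t; only
 * the fields relevant to the ordering logic are kept.
 */
typedef struct {
	unsigned long mr_pfnlo;	/* lowest PFN in the range */
	unsigned long mr_pfnhi;	/* highest PFN in the range */
	int mr_next;		/* index of next-lower range, -1 at bottom */
} range_t;

/* Sort ascending by the low PFN of each range, as mnoderange_cmp() does. */
static int
range_cmp(const void *v1, const void *v2)
{
	const range_t *r1 = v1;
	const range_t *r2 = v2;

	if (r1->mr_pfnlo < r2->mr_pfnlo)
		return (-1);
	return (r1->mr_pfnlo > r2->mr_pfnlo);
}

int
main(void)
{
	/* Dummy ranges in arbitrary order, purely for illustration. */
	range_t ranges[] = {
		{ 0x100000, 0x1fffff, -1 },
		{ 0x000000, 0x000fff, -1 },
		{ 0x001000, 0x0fffff, -1 },
	};
	size_t nr = sizeof (ranges) / sizeof (ranges[0]);

	qsort(ranges, nr, sizeof (range_t), range_cmp);

	/* After sorting, each range simply points back at its predecessor. */
	for (size_t i = 1; i < nr; i++)
		ranges[i].mr_next = (int)(i - 1);

	int top = (int)(nr - 1);	/* analogue of mtypetop */

	for (size_t i = 0; i < nr; i++) {
		printf("range %zu: 0x%lx-0x%lx next=%d\n", i,
		    ranges[i].mr_pfnlo, ranges[i].mr_pfnhi,
		    ranges[i].mr_next);
	}
	printf("top index: %d\n", top);
	return (0);
}
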

          --- old/usr/src/uts/i86pc/vm/vm_machdep.c
          +++ new/usr/src/uts/i86pc/vm/vm_machdep.c
          ... 16 lines elided ...
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   */
  24   24  /*
  25   25   * Copyright (c) 2010, Intel Corporation.
  26   26   * All rights reserved.
  27      - * Copyright 2018 Joyent, Inc.
       27 + * Copyright 2019, Joyent, Inc.
  28   28   */
  29   29  
  30   30  /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  31   31  /*      All Rights Reserved   */
  32   32  
  33   33  /*
  34   34   * Portions of this source code were derived from Berkeley 4.3 BSD
  35   35   * under license from the Regents of the University of California.
  36   36   */
  37   37  
          ... 115 lines elided ...
 153  153          int     mnr_exists;
 154  154          /* maintain page list stats */
 155  155          pgcnt_t mnr_mt_clpgcnt;         /* cache list cnt */
 156  156          pgcnt_t mnr_mt_flpgcnt[MMU_PAGE_SIZES]; /* free list cnt per szc */
 157  157          pgcnt_t mnr_mt_totcnt;          /* sum of cache and free lists */
 158  158  #ifdef DEBUG
 159  159          struct mnr_mts {                /* mnode/mtype szc stats */
 160  160                  pgcnt_t mnr_mts_pgcnt;
 161  161                  int     mnr_mts_colors;
 162  162                  pgcnt_t *mnr_mtsc_pgcnt;
 163      -        }       *mnr_mts;
      163 +        }       *mnr_mts;
 164  164  #endif
 165  165  } mnoderange_t;
 166  166  
 167  167  #define MEMRANGEHI(mtype)                                               \
 168  168          ((mtype > 0) ? memranges[mtype - 1] - 1: physmax)
 169  169  #define MEMRANGELO(mtype)       (memranges[mtype])
 170  170  
 171  171  #define MTYPE_FREEMEM(mt)       (mnoderanges[mt].mnr_mt_totcnt)
 172  172  
 173  173  /*
          ... 21 lines elided ...
 195  195      PFN_16MEG,  /* pfn range for 16M-2G */
 196  196      0x00000,    /* pfn range for 0-16M */
 197  197  };
 198  198  pfn_t *memranges = &arch_memranges[0];
 199  199  int nranges = NUM_MEM_RANGES;
 200  200  
 201  201  /*
 202  202   * This combines mem_node_config and memranges into one data
 203  203   * structure to be used for page list management.
 204  204   */
 205      -mnoderange_t    *mnoderanges;
 206      -int             mnoderangecnt;
 207      -int             mtype4g;
 208      -int             mtype16m;
 209      -int             mtypetop;       /* index of highest pfn'ed mnoderange */
      205 +static mnoderange_t *mnoderanges;
      206 +static int mnoderangecnt;
      207 +static int mtype4g;
      208 +static int mtype16m;
      209 +static int mtypetop;
 210  210  
 211  211  /*
 212  212   * 4g memory management variables for systems with more than 4g of memory:
 213  213   *
 214  214   * physical memory below 4g is required for 32bit dma devices and, currently,
 215  215   * for kmem memory. On systems with more than 4g of memory, the pool of memory
 216  216   * below 4g can be depleted without any paging activity given that there is
 217  217   * likely to be sufficient memory above 4g.
 218  218   *
 219  219   * physmax4g is set true if the largest pfn is over 4g. The rest of the
          ... 35 lines elided ...
 255  255   * 16m or if the 16m pool drops below DESFREE16M.
 256  256   *
 257  257   * In this case, general page allocations via page_get_{free,cache}list
 258  258   * routines will be restricted from allocating from the 16m pool. Allocations
 259  259   * that require specific pfn ranges (page_get_anylist) and PG_PANIC allocations
 260  260   * are not restricted.
 261  261   */
 262  262  
 263  263  #define FREEMEM16M      MTYPE_FREEMEM(mtype16m)
 264  264  #define DESFREE16M      desfree16m
 265      -#define RESTRICT16M_ALLOC(freemem, pgcnt, flags)                \
 266      -        ((freemem != 0) && ((flags & PG_PANIC) == 0) &&         \
 267      -            ((freemem >= (FREEMEM16M)) ||                       \
      265 +#define RESTRICT16M_ALLOC(freemem, pgcnt, flags) \
      266 +        (mtype16m != -1 && (freemem != 0) && ((flags & PG_PANIC) == 0) && \
      267 +            ((freemem >= (FREEMEM16M)) || \
 268  268              (FREEMEM16M  < (DESFREE16M + pgcnt))))
 269  269  
 270  270  static pgcnt_t  desfree16m = 0x380;
 271  271  
 272  272  /*
 273  273   * This can be patched via /etc/system to allow old non-PAE aware device
 274  274   * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM.
 275  275   */
 276  276  int restricted_kmemalloc = 0;
 277  277  
          ... 1104 lines elided ...
1382 1382                                  mri--;
1383 1383                          else
1384 1384                                  break;
1385 1385                  }
1386 1386          }
1387 1387          ASSERT(mnrcnt <= MAX_MNODE_MRANGES);
1388 1388          return (mnrcnt);
1389 1389  #endif  /* __xpv */
1390 1390  }
1391 1391  
1392      -/*
1393      - * mnode_range_setup() initializes mnoderanges.
1394      - */
     1392 +static int
     1393 +mnoderange_cmp(const void *v1, const void *v2)
     1394 +{
     1395 +        const mnoderange_t *m1 = v1;
     1396 +        const mnoderange_t *m2 = v2;
     1397 +
     1398 +        if (m1->mnr_pfnlo < m2->mnr_pfnlo)
     1399 +                return (-1);
     1400 +        return (m1->mnr_pfnlo > m2->mnr_pfnlo);
     1401 +}
     1402 +
1395 1403  void
1396 1404  mnode_range_setup(mnoderange_t *mnoderanges)
1397 1405  {
1398      -        mnoderange_t *mp = mnoderanges;
1399      -        int     mnode, mri;
1400      -        int     mindex = 0;     /* current index into mnoderanges array */
1401      -        int     i, j;
1402      -        pfn_t   hipfn;
1403      -        int     last, hi;
     1406 +        mnoderange_t *mp;
     1407 +        size_t nr_ranges;
     1408 +        size_t mnode;
1404 1409  
1405      -        for (mnode = 0; mnode < max_mem_nodes; mnode++) {
     1410 +        for (mnode = 0, nr_ranges = 0, mp = mnoderanges;
     1411 +            mnode < max_mem_nodes; mnode++) {
     1412 +                size_t mri = nranges - 1;
     1413 +
1406 1414                  if (mem_node_config[mnode].exists == 0)
1407 1415                          continue;
1408 1416  
1409      -                mri = nranges - 1;
1410      -
1411 1417                  while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
1412 1418                          mri--;
1413 1419  
1414 1420                  while (mri >= 0 && mem_node_config[mnode].physmax >=
1415 1421                      MEMRANGELO(mri)) {
1416      -                        mnoderanges->mnr_pfnlo = MAX(MEMRANGELO(mri),
     1422 +                        mp->mnr_pfnlo = MAX(MEMRANGELO(mri),
1417 1423                              mem_node_config[mnode].physbase);
1418      -                        mnoderanges->mnr_pfnhi = MIN(MEMRANGEHI(mri),
     1424 +                        mp->mnr_pfnhi = MIN(MEMRANGEHI(mri),
1419 1425                              mem_node_config[mnode].physmax);
1420      -                        mnoderanges->mnr_mnode = mnode;
1421      -                        mnoderanges->mnr_memrange = mri;
1422      -                        mnoderanges->mnr_exists = 1;
1423      -                        mnoderanges++;
1424      -                        mindex++;
     1426 +                        mp->mnr_mnode = mnode;
     1427 +                        mp->mnr_memrange = mri;
     1428 +                        mp->mnr_next = -1;
     1429 +                        mp->mnr_exists = 1;
     1430 +                        mp++;
     1431 +                        nr_ranges++;
1425 1432                          if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
1426 1433                                  mri--;
1427 1434                          else
1428 1435                                  break;
1429 1436                  }
1430 1437          }
1431 1438  
1432 1439          /*
1433      -         * For now do a simple sort of the mnoderanges array to fill in
1434      -         * the mnr_next fields.  Since mindex is expected to be relatively
1435      -         * small, using a simple O(N^2) algorithm.
     1440 +         * mnoderangecnt can be larger than nr_ranges when memory DR is
     1441 +         * supposedly supported.
1436 1442           */
1437      -        for (i = 0; i < mindex; i++) {
1438      -                if (mp[i].mnr_pfnlo == 0)       /* find lowest */
1439      -                        break;
1440      -        }
1441      -        ASSERT(i < mindex);
1442      -        last = i;
1443      -        mtype16m = last;
1444      -        mp[last].mnr_next = -1;
1445      -        for (i = 0; i < mindex - 1; i++) {
1446      -                hipfn = (pfn_t)(-1);
1447      -                hi = -1;
1448      -                /* find next highest mnode range */
1449      -                for (j = 0; j < mindex; j++) {
1450      -                        if (mp[j].mnr_pfnlo > mp[last].mnr_pfnlo &&
1451      -                            mp[j].mnr_pfnlo < hipfn) {
1452      -                                hipfn = mp[j].mnr_pfnlo;
1453      -                                hi = j;
1454      -                        }
1455      -                }
1456      -                mp[hi].mnr_next = last;
1457      -                last = hi;
1458      -        }
1459      -        mtypetop = last;
     1443 +        VERIFY3U(nr_ranges, <=, mnoderangecnt);
     1444 +
     1445 +        qsort(mnoderanges, nr_ranges, sizeof (mnoderange_t), mnoderange_cmp);
     1446 +
     1447 +        /*
     1448 +         * If some intrepid soul takes the axe to the memory DR code, we can
     1449 +         * remove ->mnr_next altogether, as we just sorted by ->mnr_pfnlo order.
     1450 +         *
     1451 +         * The VERIFY3U() above can be "==" then too.
     1452 +         */
     1453 +        for (size_t i = 1; i < nr_ranges; i++)
     1454 +                mnoderanges[i].mnr_next = i - 1;
     1455 +
     1456 +        mtypetop = nr_ranges - 1;
     1457 +        mtype16m = pfn_2_mtype(PFN_16MEG - 1); /* Can be -1 ... */
     1458 +        if (physmax4g)
     1459 +                mtype4g = pfn_2_mtype(0xfffff);
1460 1460  }
1461 1461  
1462 1462  #ifndef __xpv
1463 1463  /*
1464 1464   * Update mnoderanges for memory hot-add DR operations.
1465 1465   */
1466 1466  static void
1467 1467  mnode_range_add(int mnode)
1468 1468  {
1469 1469          int     *prev;
          ... 501 lines elided ...
1971 1971          /*
1972 1972           * do page coloring setup
1973 1973           */
1974 1974          addr = pcmemaddr;
1975 1975  
1976 1976          mnoderanges = (mnoderange_t *)addr;
1977 1977          addr += (mnoderangecnt * sizeof (mnoderange_t));
1978 1978  
1979 1979          mnode_range_setup(mnoderanges);
1980 1980  
1981      -        if (physmax4g)
1982      -                mtype4g = pfn_2_mtype(0xfffff);
1983      -
1984 1981          for (k = 0; k < NPC_MUTEX; k++) {
1985 1982                  fpc_mutex[k] = (kmutex_t *)addr;
1986 1983                  addr += (max_mem_nodes * sizeof (kmutex_t));
1987 1984          }
1988 1985          for (k = 0; k < NPC_MUTEX; k++) {
1989 1986                  cpc_mutex[k] = (kmutex_t *)addr;
1990 1987                  addr += (max_mem_nodes * sizeof (kmutex_t));
1991 1988          }
1992 1989          page_freelists = (page_t ****)addr;
1993 1990          addr += (mnoderangecnt * sizeof (page_t ***));
          ... 2116 lines elided ...
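
One detail worth calling out from the RESTRICT16M_ALLOC hunk above: the macro now begins with an mtype16m != -1 check, since mtype16m is derived from pfn_2_mtype(PFN_16MEG - 1) and, per the new comment in mnode_range_setup(), can be -1 when no range below 16M exists. The sketch below shows only that short-circuit guard pattern with hypothetical names (mtype_lowmem, freecnt, RESTRICT_LOWMEM); it is not the kernel macro itself.

#include <stdio.h>

/* Hypothetical per-mtype free counts; an index of -1 means "no such range". */
static long freecnt[3] = { 10, 200, 3000 };
static int mtype_lowmem = -1;		/* stand-in for mtype16m */

/*
 * Sketch of the guard: the low-memory pool's counters are only consulted
 * when the pool actually exists, i.e. when the index is not -1, because
 * && short-circuits before freecnt[] is indexed.
 */
#define LOWMEM_FREE()	(freecnt[mtype_lowmem])
#define RESTRICT_LOWMEM(freepgs, want) \
	(mtype_lowmem != -1 && (freepgs) != 0 && \
	    ((freepgs) >= LOWMEM_FREE() || LOWMEM_FREE() < (want)))

int
main(void)
{
	/* With mtype_lowmem == -1, the macro short-circuits safely to 0. */
	printf("restricted: %d\n", RESTRICT_LOWMEM(5000L, 64L));

	mtype_lowmem = 0;	/* now pretend a low-memory range exists */
	printf("restricted: %d\n", RESTRICT_LOWMEM(5000L, 64L));
	return (0);
}
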