10806 mnode_range_setup() makes assumptions about mnodes
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Toomas Soome <tsoome@me.com>


   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 /*
  25  * Copyright (c) 2010, Intel Corporation.
  26  * All rights reserved.
  27  * Copyright 2018 Joyent, Inc.
  28  */
  29 
  30 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  31 /*      All Rights Reserved   */
  32 
  33 /*
  34  * Portions of this source code were derived from Berkeley 4.3 BSD
  35  * under license from the Regents of the University of California.
  36  */
  37 
  38 /*
  39  * UNIX machine dependent virtual memory support.
  40  */
  41 
  42 #include <sys/types.h>
  43 #include <sys/param.h>
  44 #include <sys/systm.h>
  45 #include <sys/user.h>
  46 #include <sys/proc.h>
  47 #include <sys/kmem.h>


 185 #define PFN_4GIG        0x100000
 186 #define PFN_16MEG       0x1000
 187 /* Indices into the memory range (arch_memranges) array. */
 188 #define MRI_4G          0
 189 #define MRI_2G          1
 190 #define MRI_16M         2
 191 #define MRI_0           3
 192 static pfn_t arch_memranges[NUM_MEM_RANGES] = {
 193     PFN_4GIG,   /* pfn range for 4G and above */
 194     0x80000,    /* pfn range for 2G-4G */
 195     PFN_16MEG,  /* pfn range for 16M-2G */
 196     0x00000,    /* pfn range for 0-16M */
 197 };
 198 pfn_t *memranges = &arch_memranges[0];
 199 int nranges = NUM_MEM_RANGES;
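The MEMRANGELO()/MEMRANGEHI() macros used throughout the routines below are defined in vm_dep.h rather than in this excerpt. A minimal sketch of their presumed shape, mapping a memrange index onto a [lo, hi] pfn window (with physmax standing in for the highest pfn on the system):

        /* Sketch only; the real definitions live in vm_dep.h. */
        #define MEMRANGELO(mri)  (memranges[(mri)])
        #define MEMRANGEHI(mri)  ((mri) > 0 ? memranges[(mri) - 1] - 1 : physmax)

Higher indices thus name lower physical ranges, which is why the loops below walk mri downward starting from nranges - 1.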
 200 
 201 /*
 202  * This combines mem_node_config and memranges into one data
 203  * structure to be used for page list management.
 204  */
 205 mnoderange_t    *mnoderanges;
 206 int             mnoderangecnt;
 207 int             mtype4g;
 208 int             mtype16m;
 209 int             mtypetop;       /* index of highest pfn'ed mnoderange */
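The mnoderange_t type itself is declared in vm_dep.h and is not part of this excerpt. Based on the fields referenced later in this review, it looks roughly like the sketch below; the real definition also carries per-range page-count statistics that are omitted here:

        typedef struct {
                pfn_t   mnr_pfnlo;      /* lowest pfn in this range */
                pfn_t   mnr_pfnhi;      /* highest pfn in this range */
                int     mnr_mnode;      /* memory node owning the range */
                int     mnr_memrange;   /* index into memranges[] */
                int     mnr_next;       /* next lower-pfn range, -1 if none */
                int     mnr_exists;
                /* ... page list statistics elided ... */
        } mnoderange_t;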
 210 
 211 /*
 212  * 4g memory management variables for systems with more than 4g of memory:
 213  *
 214  * physical memory below 4g is required for 32bit dma devices and, currently,
 215  * for kmem memory. On systems with more than 4g of memory, the pool of memory
 216  * below 4g can be depleted without any paging activity given that there is
 217  * likely to be sufficient memory above 4g.
 218  *
 219  * physmax4g is set true if the largest pfn is over 4g. The rest of the
 220  * 4g memory management code is enabled only when physmax4g is true.
 221  *
 222  * maxmem4g is the count of the maximum number of pages on the page lists
 223  * with physical addresses below 4g. It can be a lot less than 4g given that
 224  * BIOS may reserve large chunks of space below 4g for hot plug pci devices,
 225  * agp aperture, etc.
 226  *
 227  * freemem4g maintains the count of the number of available pages on the
 228  * page lists with physical addresses below 4g.
 229  *


 246 static int      physmax4g;
 247 static int      desfree4gshift = 4;     /* maxmem4g shift to derive DESFREE4G */
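desfree4gshift only makes sense together with the DESFREE4G threshold it derives, and that definition falls outside this excerpt. Presumably it is simply maxmem4g shifted down, so the default shift of 4 keeps the low-memory watermark at 1/16 of maxmem4g:

        /* Assumed shape of the threshold; the definition is not shown above. */
        #define DESFREE4G       (maxmem4g >> desfree4gshift)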
 248 
 249 /*
 250  * 16m memory management:
 251  *
 252  * reserve some amount of physical memory below 16m for legacy devices.
 253  *
 254  * RESTRICT16M_ALLOC returns true if there are sufficient free pages above
 255  * 16m or if the 16m pool drops below DESFREE16M.
 256  *
 257  * In this case, general page allocations via page_get_{free,cache}list
 258  * routines will be restricted from allocating from the 16m pool. Allocations
 259  * that require specific pfn ranges (page_get_anylist) and PG_PANIC allocations
 260  * are not restricted.
 261  */
 262 
 263 #define FREEMEM16M      MTYPE_FREEMEM(mtype16m)
 264 #define DESFREE16M      desfree16m
 265 #define RESTRICT16M_ALLOC(freemem, pgcnt, flags)                \
 266         ((freemem != 0) && ((flags & PG_PANIC) == 0) &&             \
 267             ((freemem >= (FREEMEM16M)) ||                    \
 268             (FREEMEM16M  < (DESFREE16M + pgcnt))))
 269 
 270 static pgcnt_t  desfree16m = 0x380;
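For context, a hypothetical caller sketch of how RESTRICT16M_ALLOC() is meant to be consulted by the general allocation paths; the local names (lo_mtype, npages) are illustrative and not taken from this file:

        /* Hypothetical sketch: keep ordinary allocations out of the 16m pool. */
        int lo_mtype = 0;                       /* lowest mtype the search may use */
        if (RESTRICT16M_ALLOC(freemem, npages, flags))
                lo_mtype = mtype16m + 1;        /* skip the 16m pool entirely */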
 271 
 272 /*
 273  * This can be patched via /etc/system to allow old non-PAE aware device
 274  * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM.
 275  */
 276 int restricted_kmemalloc = 0;
 277 
 278 #ifdef VM_STATS
 279 struct {
 280         ulong_t pga_alloc;
 281         ulong_t pga_notfullrange;
 282         ulong_t pga_nulldmaattr;
 283         ulong_t pga_allocok;
 284         ulong_t pga_allocfailed;
 285         ulong_t pgma_alloc;
 286         ulong_t pgma_allocok;


1372                         mri--;
1373 
1374                 /*
1375                  * increment mnode range counter when memranges or mnode
1376                  * boundary is reached.
1377                  */
1378                 while (mri >= 0 &&
1379                     mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
1380                         mnrcnt++;
1381                         if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
1382                                 mri--;
1383                         else
1384                                 break;
1385                 }
1386         }
1387         ASSERT(mnrcnt <= MAX_MNODE_MRANGES);
1388         return (mnrcnt);
1389 #endif  /* __xpv */
1390 }
1391 
1392 /*
1393  * mnode_range_setup() initializes mnoderanges.
1394  */








1395 void
1396 mnode_range_setup(mnoderange_t *mnoderanges)
1397 {
1398         mnoderange_t *mp = mnoderanges;
1399         int     mnode, mri;
1400         int     mindex = 0;     /* current index into mnoderanges array */
1401         int     i, j;
1402         pfn_t   hipfn;
1403         int     last, hi;
1404 
1405         for (mnode = 0; mnode < max_mem_nodes; mnode++) {



1406                 if (mem_node_config[mnode].exists == 0)
1407                         continue;
1408 
1409                 mri = nranges - 1;
1410 
1411                 while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
1412                         mri--;
1413 
1414                 while (mri >= 0 && mem_node_config[mnode].physmax >=
1415                     MEMRANGELO(mri)) {
1416                         mnoderanges->mnr_pfnlo = MAX(MEMRANGELO(mri),
1417                             mem_node_config[mnode].physbase);
1418                         mnoderanges->mnr_pfnhi = MIN(MEMRANGEHI(mri),
1419                             mem_node_config[mnode].physmax);
1420                         mnoderanges->mnr_mnode = mnode;
1421                         mnoderanges->mnr_memrange = mri;
1422                         mnoderanges->mnr_exists = 1;
1423                         mnoderanges++;
1424                         mindex++;

1425                         if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
1426                                 mri--;
1427                         else
1428                                 break;
1429                 }
1430         }
1431 
1432         /*
1433          * For now do a simple sort of the mnoderanges array to fill in
1434          * the mnr_next fields.  Since mindex is expected to be relatively
1435                  * small, a simple O(N^2) algorithm is used.
1436          */
1437         for (i = 0; i < mindex; i++) {
1438                 if (mp[i].mnr_pfnlo == 0)       /* find lowest */
1439                         break;
1440         }
1441         ASSERT(i < mindex);
1442         last = i;
1443         mtype16m = last;
1444         mp[last].mnr_next = -1;
1445         for (i = 0; i < mindex - 1; i++) {
1446                 hipfn = (pfn_t)(-1);
1447                 hi = -1;
1448                 /* find next highest mnode range */
1449                 for (j = 0; j < mindex; j++) {
1450                         if (mp[j].mnr_pfnlo > mp[last].mnr_pfnlo &&
1451                             mp[j].mnr_pfnlo < hipfn) {
1452                                 hipfn = mp[j].mnr_pfnlo;
1453                                 hi = j;
1454                         }
1455                 }
1456                 mp[hi].mnr_next = last;
1457                 last = hi;
1458         }
1459         mtypetop = last;
1460 }
1461 
1462 #ifndef __xpv
1463 /*
1464  * Update mnoderanges for memory hot-add DR operations.
1465  */
1466 static void
1467 mnode_range_add(int mnode)
1468 {
1469         int     *prev;
1470         int     n, mri;
1471         pfn_t   start, end;
1472         extern  void membar_sync(void);
1473 
1474         ASSERT(0 <= mnode && mnode < max_mem_nodes);
1475         ASSERT(mem_node_config[mnode].exists);
1476         start = mem_node_config[mnode].physbase;
1477         end = mem_node_config[mnode].physmax;
1478         ASSERT(start <= end);
1479         mutex_enter(&mnoderange_lock);


1961  */
1962 void
1963 page_coloring_setup(caddr_t pcmemaddr)
1964 {
1965         int     i;
1966         int     j;
1967         int     k;
1968         caddr_t addr;
1969         int     colors;
1970 
1971         /*
1972          * do page coloring setup
1973          */
1974         addr = pcmemaddr;
1975 
1976         mnoderanges = (mnoderange_t *)addr;
1977         addr += (mnoderangecnt * sizeof (mnoderange_t));
1978 
1979         mnode_range_setup(mnoderanges);
1980 
1981         if (physmax4g)
1982                 mtype4g = pfn_2_mtype(0xfffff);
1983 
1984         for (k = 0; k < NPC_MUTEX; k++) {
1985                 fpc_mutex[k] = (kmutex_t *)addr;
1986                 addr += (max_mem_nodes * sizeof (kmutex_t));
1987         }
1988         for (k = 0; k < NPC_MUTEX; k++) {
1989                 cpc_mutex[k] = (kmutex_t *)addr;
1990                 addr += (max_mem_nodes * sizeof (kmutex_t));
1991         }
1992         page_freelists = (page_t ****)addr;
1993         addr += (mnoderangecnt * sizeof (page_t ***));
1994 
1995         page_cachelists = (page_t ***)addr;
1996         addr += (mnoderangecnt * sizeof (page_t **));
1997 
1998         for (i = 0; i < mnoderangecnt; i++) {
1999                 page_freelists[i] = (page_t ***)addr;
2000                 addr += (mmu_page_sizes * sizeof (page_t **));
2001 
2002                 for (j = 0; j < mmu_page_sizes; j++) {
2003                         colors = page_get_pagecolors(j);




   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 /*
  25  * Copyright (c) 2010, Intel Corporation.
  26  * All rights reserved.
  27  * Copyright 2019, Joyent, Inc.
  28  */
  29 
  30 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  31 /*      All Rights Reserved   */
  32 
  33 /*
  34  * Portions of this source code were derived from Berkeley 4.3 BSD
  35  * under license from the Regents of the University of California.
  36  */
  37 
  38 /*
  39  * UNIX machine dependent virtual memory support.
  40  */
  41 
  42 #include <sys/types.h>
  43 #include <sys/param.h>
  44 #include <sys/systm.h>
  45 #include <sys/user.h>
  46 #include <sys/proc.h>
  47 #include <sys/kmem.h>


 185 #define PFN_4GIG        0x100000
 186 #define PFN_16MEG       0x1000
 187 /* Indices into the memory range (arch_memranges) array. */
 188 #define MRI_4G          0
 189 #define MRI_2G          1
 190 #define MRI_16M         2
 191 #define MRI_0           3
 192 static pfn_t arch_memranges[NUM_MEM_RANGES] = {
 193     PFN_4GIG,   /* pfn range for 4G and above */
 194     0x80000,    /* pfn range for 2G-4G */
 195     PFN_16MEG,  /* pfn range for 16M-2G */
 196     0x00000,    /* pfn range for 0-16M */
 197 };
 198 pfn_t *memranges = &arch_memranges[0];
 199 int nranges = NUM_MEM_RANGES;
 200 
 201 /*
 202  * This combines mem_node_config and memranges into one data
 203  * structure to be used for page list management.
 204  */
 205 static mnoderange_t *mnoderanges;
 206 static int mnoderangecnt;
 207 static int mtype4g;
 208 static int mtype16m;
 209 static int mtypetop;
 210 
 211 /*
 212  * 4g memory management variables for systems with more than 4g of memory:
 213  *
 214  * physical memory below 4g is required for 32bit dma devices and, currently,
 215  * for kmem memory. On systems with more than 4g of memory, the pool of memory
 216  * below 4g can be depleted without any paging activity given that there is
 217  * likely to be sufficient memory above 4g.
 218  *
 219  * physmax4g is set true if the largest pfn is over 4g. The rest of the
 220  * 4g memory management code is enabled only when physmax4g is true.
 221  *
 222  * maxmem4g is the count of the maximum number of pages on the page lists
 223  * with physical addresses below 4g. It can be a lot less than 4g given that
 224  * BIOS may reserve large chunks of space below 4g for hot plug pci devices,
 225  * agp aperture, etc.
 226  *
 227  * freemem4g maintains the count of the number of available pages on the
 228  * page lists with physical addresses below 4g.
 229  *


 246 static int      physmax4g;
 247 static int      desfree4gshift = 4;     /* maxmem4g shift to derive DESFREE4G */
 248 
 249 /*
 250  * 16m memory management:
 251  *
 252  * reserve some amount of physical memory below 16m for legacy devices.
 253  *
 254  * RESTRICT16M_ALLOC returns true if there are sufficient free pages above
 255  * 16m or if the 16m pool drops below DESFREE16M.
 256  *
 257  * In this case, general page allocations via page_get_{free,cache}list
 258  * routines will be restricted from allocating from the 16m pool. Allocations
 259  * that require specific pfn ranges (page_get_anylist) and PG_PANIC allocations
 260  * are not restricted.
 261  */
 262 
 263 #define FREEMEM16M      MTYPE_FREEMEM(mtype16m)
 264 #define DESFREE16M      desfree16m
 265 #define RESTRICT16M_ALLOC(freemem, pgcnt, flags) \
 266         (mtype16m != -1 && (freemem != 0) && ((flags & PG_PANIC) == 0) && \
 267             ((freemem >= (FREEMEM16M)) || \
 268             (FREEMEM16M  < (DESFREE16M + pgcnt))))
 269 
 270 static pgcnt_t  desfree16m = 0x380;
 271 
 272 /*
 273  * This can be patched via /etc/system to allow old non-PAE aware device
 274  * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM.
 275  */
 276 int restricted_kmemalloc = 0;
 277 
 278 #ifdef VM_STATS
 279 struct {
 280         ulong_t pga_alloc;
 281         ulong_t pga_notfullrange;
 282         ulong_t pga_nulldmaattr;
 283         ulong_t pga_allocok;
 284         ulong_t pga_allocfailed;
 285         ulong_t pgma_alloc;
 286         ulong_t pgma_allocok;


1372                         mri--;
1373 
1374                 /*
1375                  * increment mnode range counter when memranges or mnode
1376                  * boundary is reached.
1377                  */
1378                 while (mri >= 0 &&
1379                     mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
1380                         mnrcnt++;
1381                         if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
1382                                 mri--;
1383                         else
1384                                 break;
1385                 }
1386         }
1387         ASSERT(mnrcnt <= MAX_MNODE_MRANGES);
1388         return (mnrcnt);
1389 #endif  /* __xpv */
1390 }
1391 
1392 static int
1393 mnoderange_cmp(const void *v1, const void *v2)
1394 {
1395         const mnoderange_t *m1 = v1;
1396         const mnoderange_t *m2 = v2;
1397 
1398         if (m1->mnr_pfnlo < m2->mnr_pfnlo)
1399                 return (-1);
1400         return (m1->mnr_pfnlo > m2->mnr_pfnlo);
1401 }
1402 
1403 void
1404 mnode_range_setup(mnoderange_t *mnoderanges)
1405 {
1406         mnoderange_t *mp;
1407         size_t nr_ranges;
1408         size_t mnode;



1409 
1410         for (mnode = 0, nr_ranges = 0, mp = mnoderanges;
1411             mnode < max_mem_nodes; mnode++) {
1412                 size_t mri = nranges - 1;
1413 
1414                 if (mem_node_config[mnode].exists == 0)
1415                         continue;
1416 


1417                 while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
1418                         mri--;
1419 
1420                 while (mri >= 0 && mem_node_config[mnode].physmax >=
1421                     MEMRANGELO(mri)) {
1422                         mp->mnr_pfnlo = MAX(MEMRANGELO(mri),
1423                             mem_node_config[mnode].physbase);
1424                         mp->mnr_pfnhi = MIN(MEMRANGEHI(mri),
1425                             mem_node_config[mnode].physmax);
1426                         mp->mnr_mnode = mnode;
1427                         mp->mnr_memrange = mri;
1428                         mp->mnr_next = -1;
1429                         mp->mnr_exists = 1;
1430                         mp++;
1431                         nr_ranges++;
1432                         if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
1433                                 mri--;
1434                         else
1435                                 break;
1436                 }
1437         }
1438 
1439         /*
1440          * mnoderangecnt can be larger than nr_ranges when memory DR is
1441          * supposedly supported.

1442          */
1443         VERIFY3U(nr_ranges, <=, mnoderangecnt);
1444 
1445         qsort(mnoderanges, nr_ranges, sizeof (mnoderange_t), mnoderange_cmp);
1446 
1447         /*
1448          * If some intrepid soul takes the axe to the memory DR code, we can
1449          * remove ->mnr_next altogether, as we just sorted by ->mnr_pfnlo order.
1450          *
1451          * The VERIFY3U() above can be "==" then too.
1452          */
1453         for (size_t i = 1; i < nr_ranges; i++)
1454                 mnoderanges[i].mnr_next = i - 1;
1455 
1456         mtypetop = nr_ranges - 1;
1457         mtype16m = pfn_2_mtype(PFN_16MEG - 1); /* Can be -1 ... */
1458         if (physmax4g)
1459                 mtype4g = pfn_2_mtype(0xfffff);






1460 }
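To make the linking step above concrete: qsort() leaves the array in ascending ->mnr_pfnlo order, the construction loop already left ->mnr_next at -1, the loop then points each entry one slot down, and mtypetop names the highest range. A small worked example with a single mnode spanning pfns 0 through 0xfffff (values chosen only for illustration):

        index   mnr_pfnlo   mnr_pfnhi   mnr_memrange   mnr_next
        [0]     0x00000     0x00fff     MRI_0          -1      <- mtype16m
        [1]     0x01000     0x7ffff     MRI_16M         0
        [2]     0x80000     0xfffff     MRI_2G          1      <- mtypetop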
1461 
1462 #ifndef __xpv
1463 /*
1464  * Update mnoderanges for memory hot-add DR operations.
1465  */
1466 static void
1467 mnode_range_add(int mnode)
1468 {
1469         int     *prev;
1470         int     n, mri;
1471         pfn_t   start, end;
1472         extern  void membar_sync(void);
1473 
1474         ASSERT(0 <= mnode && mnode < max_mem_nodes);
1475         ASSERT(mem_node_config[mnode].exists);
1476         start = mem_node_config[mnode].physbase;
1477         end = mem_node_config[mnode].physmax;
1478         ASSERT(start <= end);
1479         mutex_enter(&mnoderange_lock);


1961  */
1962 void
1963 page_coloring_setup(caddr_t pcmemaddr)
1964 {
1965         int     i;
1966         int     j;
1967         int     k;
1968         caddr_t addr;
1969         int     colors;
1970 
1971         /*
1972          * do page coloring setup
1973          */
1974         addr = pcmemaddr;
1975 
1976         mnoderanges = (mnoderange_t *)addr;
1977         addr += (mnoderangecnt * sizeof (mnoderange_t));
1978 
1979         mnode_range_setup(mnoderanges);
1980 



1981         for (k = 0; k < NPC_MUTEX; k++) {
1982                 fpc_mutex[k] = (kmutex_t *)addr;
1983                 addr += (max_mem_nodes * sizeof (kmutex_t));
1984         }
1985         for (k = 0; k < NPC_MUTEX; k++) {
1986                 cpc_mutex[k] = (kmutex_t *)addr;
1987                 addr += (max_mem_nodes * sizeof (kmutex_t));
1988         }
1989         page_freelists = (page_t ****)addr;
1990         addr += (mnoderangecnt * sizeof (page_t ***));
1991 
1992         page_cachelists = (page_t ***)addr;
1993         addr += (mnoderangecnt * sizeof (page_t **));
1994 
1995         for (i = 0; i < mnoderangecnt; i++) {
1996                 page_freelists[i] = (page_t ***)addr;
1997                 addr += (mmu_page_sizes * sizeof (page_t **));
1998 
1999                 for (j = 0; j < mmu_page_sizes; j++) {
2000                         colors = page_get_pagecolors(j);
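Although the excerpt stops partway through the loop, the carve-out order above implies the minimum size the caller must have reserved at pcmemaddr. A rough sketch mirroring that order (the per-color arrays handed out inside the inner loop are only summarized, since that part is not shown here):

        /* Hypothetical sketch of the space consumed, in carve-out order. */
        size_t sz = mnoderangecnt * sizeof (mnoderange_t) +            /* mnoderanges */
            2 * NPC_MUTEX * max_mem_nodes * sizeof (kmutex_t) +        /* fpc/cpc mutexes */
            mnoderangecnt * sizeof (page_t ***) +                      /* page_freelists */
            mnoderangecnt * sizeof (page_t **) +                       /* page_cachelists */
            mnoderangecnt * mmu_page_sizes * sizeof (page_t **);       /* per-size freelist heads */
        /* ...plus, for each (range, page size), page_get_pagecolors(j) page_t * slots. */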