12045 segkmem_page_create(): Fire Walk With Me

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2016 Joyent, Inc.
  24  */
  25 
  26 #include <sys/types.h>
  27 #include <sys/t_lock.h>
  28 #include <sys/param.h>
  29 #include <sys/sysmacros.h>
  30 #include <sys/tuneable.h>
  31 #include <sys/systm.h>
  32 #include <sys/vm.h>
  33 #include <sys/kmem.h>
  34 #include <sys/vmem.h>
  35 #include <sys/mman.h>
  36 #include <sys/cmn_err.h>
  37 #include <sys/debug.h>
  38 #include <sys/dumphdr.h>
  39 #include <sys/bootconf.h>
  40 #include <sys/lgrp.h>
  41 #include <vm/seg_kmem.h>
  42 #include <vm/hat.h>
  43 #include <vm/page.h>
  44 #include <vm/vm_dep.h>
  45 #include <vm/faultcode.h>
  46 #include <sys/promif.h>
  47 #include <vm/seg_kp.h>
  48 #include <sys/bitmap.h>
  49 #include <sys/mem_cage.h>
  50 
  51 #ifdef __sparc
  52 #include <sys/ivintr.h>
  53 #include <sys/panic.h>
  54 #endif
  55 
  56 /*
  57  * seg_kmem is the primary kernel memory segment driver.  It
  58  * maps the kernel heap [kernelheap, ekernelheap), module text,
  59  * and all memory which was allocated before the VM was initialized
  60  * into kas.
  61  *
  62  * Pages which belong to seg_kmem are hashed into &kvp vnode at
  63  * an offset equal to (u_offset_t)virt_addr, and have p_lckcnt >= 1.
  64  * They must never be paged out since segkmem_fault() is a no-op to
  65  * prevent recursive faults.
  66  *
  67  * Currently, seg_kmem pages are sharelocked (p_sharelock == 1) on
  68  * __x86 and are unlocked (p_sharelock == 0) on __sparc.  Once __x86
  69  * supports relocation the #ifdef kludges can be removed.
  70  *
  71  * seg_kmem pages may be subject to relocation by page_relocate(),
  72  * provided that the HAT supports it; if this is so, segkmem_reloc
  73  * will be set to a nonzero value. All boot time allocated memory as
  74  * well as static memory is considered off limits to relocation.
  75  * Pages are "relocatable" if p_state does not have P_NORELOC set, so
  76  * we request P_NORELOC pages for memory that isn't safe to relocate.
  77  *
  78  * The kernel heap is logically divided up into four pieces:
  79  *
  80  *   heap32_arena is for allocations that require 32-bit absolute
  81  *   virtual addresses (e.g. code that uses 32-bit pointers/offsets).
  82  *
  83  *   heap_core is for allocations that require 2GB *relative*
  84  *   offsets; in other words all memory from heap_core is within
  85  *   2GB of all other memory from the same arena. This is a requirement
  86  *   of the addressing modes of some processors in supervisor code.
  87  *
  88  *   heap_arena is the general heap arena.
  89  *
  90  *   static_arena is the static memory arena.  Allocations from it
  91  *   are not subject to relocation so it is safe to use the memory
  92  *   physical address as well as the virtual address (e.g. the VA to
  93  *   PA translations are static).  Caches may import from static_arena;
  94  *   all other static memory allocations should use static_alloc_arena.
  95  *
  96  * On some platforms which have limited virtual address space, seg_kmem
  97  * may share [kernelheap, ekernelheap) with seg_kp; if this is so,
  98  * segkp_bitmap is non-NULL, and each bit represents a page of virtual
  99  * address space which is actually seg_kp mapped.
 100  */
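/*
 * Editor's illustration (not part of the original file): a minimal sketch,
 * under the rules described above, of how a caller picks between the general
 * heap and the static arenas.  The arena variables, vmem_alloc() and
 * segkmem_alloc() are the real interfaces declared and defined later in this
 * file; example_page_alloc() itself is hypothetical.
 */
static void *
example_page_alloc(int need_fixed_va_pa)
{
        /*
         * Memory whose VA->PA translation must never change (e.g. it will
         * also be used by physical address) comes from static_alloc_arena;
         * ordinary wired kernel heap memory comes from heap_arena via
         * segkmem_alloc().
         */
        if (need_fixed_va_pa)
                return (vmem_alloc(static_alloc_arena, PAGESIZE, VM_SLEEP));
        return (segkmem_alloc(heap_arena, PAGESIZE, VM_SLEEP));
}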
 101 
 102 extern ulong_t *segkp_bitmap;   /* Is set if segkp is from the kernel heap */
 103 
 104 char *kernelheap;               /* start of primary kernel heap */
 105 char *ekernelheap;              /* end of primary kernel heap */
 106 struct seg kvseg;               /* primary kernel heap segment */
 107 struct seg kvseg_core;          /* "core" kernel heap segment */
 108 struct seg kzioseg;             /* Segment for zio mappings */
 109 vmem_t *heap_arena;             /* primary kernel heap arena */
 110 vmem_t *heap_core_arena;        /* core kernel heap arena */
 111 char *heap_core_base;           /* start of core kernel heap arena */
 112 char *heap_lp_base;             /* start of kernel large page heap arena */
 113 char *heap_lp_end;              /* end of kernel large page heap arena */
 114 vmem_t *hat_memload_arena;      /* HAT translation data */
 115 struct seg kvseg32;             /* 32-bit kernel heap segment */
 116 vmem_t *heap32_arena;           /* 32-bit kernel heap arena */
 117 vmem_t *heaptext_arena;         /* heaptext arena */
 118 struct as kas;                  /* kernel address space */
 119 int segkmem_reloc;              /* enable/disable relocatable segkmem pages */
 120 vmem_t *static_arena;           /* arena for caches to import static memory */
 121 vmem_t *static_alloc_arena;     /* arena for allocating static memory */
 122 vmem_t *zio_arena = NULL;       /* arena for allocating zio memory */
 123 vmem_t *zio_alloc_arena = NULL; /* arena for allocating zio memory */
 124 
 125 /*
 126  * seg_kmem driver can map part of the kernel heap with large pages.
 127  * Currently this functionality is implemented for sparc platforms only.
 128  *
 129  * The large page size "segkmem_lpsize" for kernel heap is selected in the
 130  * platform specific code. It can also be modified via /etc/system file.
 131  * Setting segkmem_lpsize to PAGESIZE in /etc/system disables usage of large
 132  * pages for kernel heap. "segkmem_lpshift" is adjusted appropriately to
 133  * match segkmem_lpsize.
 134  *
 135  * At boot time we carve from kernel heap arena a range of virtual addresses
 136  * that will be used for large page mappings. This range [heap_lp_base,
 137  * heap_lp_end) is set up as a separate vmem arena - "heap_lp_arena". We also
  138  * create "kmem_lp_arena" that caches memory already backed by large
 139  * pages. kmem_lp_arena imports virtual segments from heap_lp_arena.
 140  */
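/*
 * Editor's illustration (not part of the original file): the tunables above
 * are normally adjusted from /etc/system, e.g.
 *
 *      set segkmem_lpsize = 0x2000
 *
 * which, on a sparc machine whose PAGESIZE is 8K, disables large pages for
 * the kernel heap as described above.  The value shown is an assumption
 * chosen only for illustration.
 */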
 141 
 142 size_t  segkmem_lpsize;
 143 static  uint_t  segkmem_lpshift = PAGESHIFT;
 144 int     segkmem_lpszc = 0;
 145 
 146 size_t  segkmem_kmemlp_quantum = 0x400000;      /* 4MB */
 147 size_t  segkmem_heaplp_quantum;
 148 vmem_t *heap_lp_arena;
 149 static  vmem_t *kmem_lp_arena;
 150 static  vmem_t *segkmem_ppa_arena;
 151 static  segkmem_lpcb_t segkmem_lpcb;
 152 
 153 /*
 154  * We use "segkmem_kmemlp_max" to limit the total amount of physical memory
 155  * consumed by the large page heap. By default this parameter is set to 1/8 of
 156  * physmem but can be adjusted through /etc/system either directly or
 157  * indirectly by setting "segkmem_kmemlp_pcnt" to the percent of physmem
 158  * we allow for large page heap.
 159  */
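/*
 * Editor's illustration (not part of the original file): either limit can be
 * set from /etc/system, e.g.
 *
 *      set segkmem_kmemlp_max = 0x40000000
 *      set segkmem_kmemlp_pcnt = 25
 *
 * i.e. cap the large page heap at 1GB, or at 25% of physmem.  The values are
 * assumptions chosen only for illustration.
 */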
 160 size_t  segkmem_kmemlp_max;
 161 static  uint_t  segkmem_kmemlp_pcnt;
 162 
 163 /*
 164  * Getting large pages for kernel heap could be problematic due to
  165  * physical memory fragmentation. That's why we allow "segkmem_kmemlp_min"
  166  * bytes to be preallocated at boot time.
 167  */
 168 static  size_t  segkmem_kmemlp_min;
 169 
 170 /*
 171  * Throttling is used to avoid expensive tries to allocate large pages
  172  * for kernel heap when a lot of successive attempts to do so fail.
 173  */
 174 static  ulong_t segkmem_lpthrottle_max = 0x400000;
 175 static  ulong_t segkmem_lpthrottle_start = 0x40;
 176 static  ulong_t segkmem_use_lpthrottle = 1;
 177 
 178 /*
 179  * Freed pages accumulate on a garbage list until segkmem is ready,
 180  * at which point we call segkmem_gc() to free it all.
 181  */
 182 typedef struct segkmem_gc_list {
 183         struct segkmem_gc_list  *gc_next;
 184         vmem_t                  *gc_arena;
 185         size_t                  gc_size;
 186 } segkmem_gc_list_t;
 187 
 188 static segkmem_gc_list_t *segkmem_gc_list;
 189 
 190 /*
 191  * Allocations from the hat_memload arena add VM_MEMLOAD to their
 192  * vmflags so that segkmem_xalloc() can inform the hat layer that it needs
 193  * to take steps to prevent infinite recursion.  HAT allocations also
 194  * must be non-relocatable to prevent recursive page faults.
 195  */
 196 static void *
 197 hat_memload_alloc(vmem_t *vmp, size_t size, int flags)
 198 {
 199         flags |= (VM_MEMLOAD | VM_NORELOC);
 200         return (segkmem_alloc(vmp, size, flags));
 201 }
 202 
 203 /*
 204  * Allocations from static_arena arena (or any other arena that uses
 205  * segkmem_alloc_permanent()) require non-relocatable (permanently
 206  * wired) memory pages, since these pages are referenced by physical
 207  * as well as virtual address.
 208  */
 209 void *
 210 segkmem_alloc_permanent(vmem_t *vmp, size_t size, int flags)
 211 {
 212         return (segkmem_alloc(vmp, size, flags | VM_NORELOC));
 213 }
 214 
 215 /*
 216  * Initialize kernel heap boundaries.
 217  */
 218 void
 219 kernelheap_init(
 220         void *heap_start,
 221         void *heap_end,
 222         char *first_avail,
 223         void *core_start,
 224         void *core_end)
 225 {
 226         uintptr_t textbase;
 227         size_t core_size;
 228         size_t heap_size;
 229         vmem_t *heaptext_parent;
 230         size_t  heap_lp_size = 0;
 231 #ifdef __sparc
 232         size_t kmem64_sz = kmem64_aligned_end - kmem64_base;
 233 #endif  /* __sparc */
 234 
 235         kernelheap = heap_start;
 236         ekernelheap = heap_end;
 237 
 238 #ifdef __sparc
 239         heap_lp_size = (((uintptr_t)heap_end - (uintptr_t)heap_start) / 4);
 240         /*
 241          * Bias heap_lp start address by kmem64_sz to reduce collisions
 242          * in 4M kernel TSB between kmem64 area and heap_lp
 243          */
 244         kmem64_sz = P2ROUNDUP(kmem64_sz, MMU_PAGESIZE256M);
 245         if (kmem64_sz <= heap_lp_size / 2)
 246                 heap_lp_size -= kmem64_sz;
 247         heap_lp_base = ekernelheap - heap_lp_size;
 248         heap_lp_end = heap_lp_base + heap_lp_size;
 249 #endif  /* __sparc */
 250 
 251         /*
 252          * If this platform has a 'core' heap area, then the space for
 253          * overflow module text should be carved out of the end of that
 254          * heap.  Otherwise, it gets carved out of the general purpose
 255          * heap.
 256          */
 257         core_size = (uintptr_t)core_end - (uintptr_t)core_start;
 258         if (core_size > 0) {
 259                 ASSERT(core_size >= HEAPTEXT_SIZE);
 260                 textbase = (uintptr_t)core_end - HEAPTEXT_SIZE;
 261                 core_size -= HEAPTEXT_SIZE;
 262         }
 263 #ifndef __sparc
 264         else {
 265                 ekernelheap -= HEAPTEXT_SIZE;
 266                 textbase = (uintptr_t)ekernelheap;
 267         }
 268 #endif
 269 
 270         heap_size = (uintptr_t)ekernelheap - (uintptr_t)kernelheap;
 271         heap_arena = vmem_init("heap", kernelheap, heap_size, PAGESIZE,
 272             segkmem_alloc, segkmem_free);
 273 
 274         if (core_size > 0) {
 275                 heap_core_arena = vmem_create("heap_core", core_start,
 276                     core_size, PAGESIZE, NULL, NULL, NULL, 0, VM_SLEEP);
 277                 heap_core_base = core_start;
 278         } else {
 279                 heap_core_arena = heap_arena;
 280                 heap_core_base = kernelheap;
 281         }
 282 
 283         /*
  284          * Reserve space for the large page heap. If large pages for the
  285          * kernel heap are enabled, the large page heap arena will be
  286          * created later in the boot sequence in segkmem_heap_lp_init().
  287          * Otherwise the allocated range will be returned to the heap_arena.
 288          */
 289         if (heap_lp_size) {
 290                 (void) vmem_xalloc(heap_arena, heap_lp_size, PAGESIZE, 0, 0,
 291                     heap_lp_base, heap_lp_end,
 292                     VM_NOSLEEP | VM_BESTFIT | VM_PANIC);
 293         }
 294 
 295         /*
 296          * Remove the already-spoken-for memory range [kernelheap, first_avail).
 297          */
 298         (void) vmem_xalloc(heap_arena, first_avail - kernelheap, PAGESIZE,
 299             0, 0, kernelheap, first_avail, VM_NOSLEEP | VM_BESTFIT | VM_PANIC);
 300 
 301 #ifdef __sparc
 302         heap32_arena = vmem_create("heap32", (void *)SYSBASE32,
 303             SYSLIMIT32 - SYSBASE32 - HEAPTEXT_SIZE, PAGESIZE, NULL,
 304             NULL, NULL, 0, VM_SLEEP);
 305         /*
  306          * The PROM claims the physical and virtual resources used by
  307          * panicbuf and intr_vec_table, so reserve space for them and the
  308          * reserved interrupt vector data structures from the 32-bit heap.
 309          */
 310         (void) vmem_xalloc(heap32_arena, PANICBUFSIZE, PAGESIZE, 0, 0,
 311             panicbuf, panicbuf + PANICBUFSIZE,
 312             VM_NOSLEEP | VM_BESTFIT | VM_PANIC);
 313 
 314         (void) vmem_xalloc(heap32_arena, IVSIZE, PAGESIZE, 0, 0,
 315             intr_vec_table, (caddr_t)intr_vec_table + IVSIZE,
 316             VM_NOSLEEP | VM_BESTFIT | VM_PANIC);
 317 
 318         textbase = SYSLIMIT32 - HEAPTEXT_SIZE;
 319         heaptext_parent = NULL;
 320 #else   /* __sparc */
 321         heap32_arena = heap_core_arena;
 322         heaptext_parent = heap_core_arena;
 323 #endif  /* __sparc */
 324 
 325         heaptext_arena = vmem_create("heaptext", (void *)textbase,
 326             HEAPTEXT_SIZE, PAGESIZE, NULL, NULL, heaptext_parent, 0, VM_SLEEP);
 327 
 328         /*
 329          * Create a set of arenas for memory with static translations
 330          * (e.g. VA -> PA translations cannot change).  Since using
 331          * kernel pages by physical address implies it isn't safe to
 332          * walk across page boundaries, the static_arena quantum must
 333          * be PAGESIZE.  Any kmem caches that require static memory
 334          * should source from static_arena, while direct allocations
 335          * should only use static_alloc_arena.
 336          */
 337         static_arena = vmem_create("static", NULL, 0, PAGESIZE,
 338             segkmem_alloc_permanent, segkmem_free, heap_arena, 0, VM_SLEEP);
 339         static_alloc_arena = vmem_create("static_alloc", NULL, 0,
 340             sizeof (uint64_t), vmem_alloc, vmem_free, static_arena,
 341             0, VM_SLEEP);
 342 
 343         /*
 344          * Create an arena for translation data (ptes, hmes, or hblks).
 345          * We need an arena for this because hat_memload() is essential
 346          * to vmem_populate() (see comments in common/os/vmem.c).
 347          *
 348          * Note: any kmem cache that allocates from hat_memload_arena
 349          * must be created as a KMC_NOHASH cache (i.e. no external slab
 350          * and bufctl structures to allocate) so that slab creation doesn't
 351          * require anything more than a single vmem_alloc().
 352          */
 353         hat_memload_arena = vmem_create("hat_memload", NULL, 0, PAGESIZE,
 354             hat_memload_alloc, segkmem_free, heap_arena, 0,
 355             VM_SLEEP | VMC_POPULATOR | VMC_DUMPSAFE);
 356 }
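/*
 * Editor's illustration (not part of the original file): the importer
 * relationships established above, written as "arena <- imports from":
 *
 *      static_alloc_arena <- static_arena <- heap_arena
 *      hat_memload_arena  <- heap_arena
 *      heaptext_arena     <- heap_core_arena (x86; on sparc it spans its own
 *                            range just below SYSLIMIT32)
 *
 * heap_arena itself directly owns [kernelheap, ekernelheap), minus the ranges
 * reserved above for the large page heap and for early boot allocations.
 */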
 357 
 358 void
 359 boot_mapin(caddr_t addr, size_t size)
 360 {
 361         caddr_t  eaddr;
 362         page_t  *pp;
 363         pfn_t    pfnum;
 364 
 365         if (page_resv(btop(size), KM_NOSLEEP) == 0)
 366                 panic("boot_mapin: page_resv failed");
 367 
 368         for (eaddr = addr + size; addr < eaddr; addr += PAGESIZE) {
 369                 pfnum = va_to_pfn(addr);
 370                 if (pfnum == PFN_INVALID)
 371                         continue;
 372                 if ((pp = page_numtopp_nolock(pfnum)) == NULL)
 373                         panic("boot_mapin(): No pp for pfnum = %lx", pfnum);
 374 
 375                 /*
  376                  * We must break up any large pages that may have
  377                  * constituent pages being utilized for BOP_ALLOC()s before
  378                  * calling page_numtopp(). The locking code (i.e.
  379                  * page_reclaim()) can't handle them.
 380                  */
 381                 if (pp->p_szc != 0)
 382                         page_boot_demote(pp);
 383 
 384                 pp = page_numtopp(pfnum, SE_EXCL);
 385                 if (pp == NULL || PP_ISFREE(pp))
 386                         panic("boot_alloc: pp is NULL or free");
 387 
 388                 /*
 389                  * If the cage is on but doesn't yet contain this page,
 390                  * mark it as non-relocatable.
 391                  */
 392                 if (kcage_on && !PP_ISNORELOC(pp)) {
 393                         PP_SETNORELOC(pp);
 394                         PLCNT_XFER_NORELOC(pp);
 395                 }
 396 
 397                 (void) page_hashin(pp, &kvp, (u_offset_t)(uintptr_t)addr, NULL);
 398                 pp->p_lckcnt = 1;
 399 #if defined(__x86)
 400                 page_downgrade(pp);
 401 #else
 402                 page_unlock(pp);
 403 #endif
 404         }
 405 }
 406 
 407 /*
 408  * Get pages from boot and hash them into the kernel's vp.
 409  * Used after page structs have been allocated, but before segkmem is ready.
 410  */
 411 void *
 412 boot_alloc(void *inaddr, size_t size, uint_t align)
 413 {
 414         caddr_t addr = inaddr;
 415 
 416         if (bootops == NULL)
 417                 prom_panic("boot_alloc: attempt to allocate memory after "
 418                     "BOP_GONE");
 419 
 420         size = ptob(btopr(size));
 421 #ifdef __sparc
 422         if (bop_alloc_chunk(addr, size, align) != (caddr_t)addr)
 423                 panic("boot_alloc: bop_alloc_chunk failed");
 424 #else
 425         if (BOP_ALLOC(bootops, addr, size, align) != addr)
 426                 panic("boot_alloc: BOP_ALLOC failed");
 427 #endif
 428         boot_mapin((caddr_t)addr, size);
 429         return (addr);
 430 }
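/*
 * Editor's sketch (hypothetical, not part of the original file): a typical
 * early-boot caller reserves virtual space from a vmem arena first and then
 * has boot_alloc() back and hash it, mirroring what segkmem_alloc_vn() does
 * below before kvseg exists.  example_early_alloc() is not a real function.
 */
static void *
example_early_alloc(vmem_t *vmp, size_t size)
{
        void *va = vmem_alloc(vmp, size, VM_SLEEP | VM_PANIC);

        return (boot_alloc(va, size, BO_NO_ALIGN));
}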
 431 
 432 static void
 433 segkmem_badop()
 434 {
 435         panic("segkmem_badop");
 436 }
 437 
 438 #define SEGKMEM_BADOP(t)        (t(*)())(uintptr_t)segkmem_badop
 439 
 440 /*ARGSUSED*/
 441 static faultcode_t
 442 segkmem_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t size,
 443     enum fault_type type, enum seg_rw rw)
 444 {
 445         pgcnt_t npages;
 446         spgcnt_t pg;
 447         page_t *pp;
 448         struct vnode *vp = seg->s_data;
 449 
 450         ASSERT(RW_READ_HELD(&seg->s_as->a_lock));
 451 
 452         if (seg->s_as != &kas || size > seg->s_size ||
 453             addr < seg->s_base || addr + size > seg->s_base + seg->s_size)
 454                 panic("segkmem_fault: bad args");
 455 
 456         /*
 457          * If it is one of segkp pages, call segkp_fault.
 458          */
 459         if (segkp_bitmap && seg == &kvseg &&
 460             BT_TEST(segkp_bitmap, btop((uintptr_t)(addr - seg->s_base))))
 461                 return (SEGOP_FAULT(hat, segkp, addr, size, type, rw));
 462 
 463         if (rw != S_READ && rw != S_WRITE && rw != S_OTHER)
 464                 return (FC_NOSUPPORT);
 465 
 466         npages = btopr(size);
 467 
 468         switch (type) {
 469         case F_SOFTLOCK:        /* lock down already-loaded translations */
 470                 for (pg = 0; pg < npages; pg++) {
 471                         pp = page_lookup(vp, (u_offset_t)(uintptr_t)addr,
 472                             SE_SHARED);
 473                         if (pp == NULL) {
 474                                 /*
 475                                  * Hmm, no page. Does a kernel mapping
 476                                  * exist for it?
 477                                  */
 478                                 if (!hat_probe(kas.a_hat, addr)) {
 479                                         addr -= PAGESIZE;
 480                                         while (--pg >= 0) {
 481                                                 pp = page_find(vp, (u_offset_t)
 482                                                     (uintptr_t)addr);
 483                                                 if (pp)
 484                                                         page_unlock(pp);
 485                                                 addr -= PAGESIZE;
 486                                         }
 487                                         return (FC_NOMAP);
 488                                 }
 489                         }
 490                         addr += PAGESIZE;
 491                 }
 492                 if (rw == S_OTHER)
 493                         hat_reserve(seg->s_as, addr, size);
 494                 return (0);
 495         case F_SOFTUNLOCK:
 496                 while (npages--) {
 497                         pp = page_find(vp, (u_offset_t)(uintptr_t)addr);
 498                         if (pp)
 499                                 page_unlock(pp);
 500                         addr += PAGESIZE;
 501                 }
 502                 return (0);
 503         default:
 504                 return (FC_NOSUPPORT);
 505         }
 506         /*NOTREACHED*/
 507 }
 508 
 509 static int
 510 segkmem_setprot(struct seg *seg, caddr_t addr, size_t size, uint_t prot)
 511 {
 512         ASSERT(RW_LOCK_HELD(&seg->s_as->a_lock));
 513 
 514         if (seg->s_as != &kas || size > seg->s_size ||
 515             addr < seg->s_base || addr + size > seg->s_base + seg->s_size)
 516                 panic("segkmem_setprot: bad args");
 517 
 518         /*
 519          * If it is one of segkp pages, call segkp.
 520          */
 521         if (segkp_bitmap && seg == &kvseg &&
 522             BT_TEST(segkp_bitmap, btop((uintptr_t)(addr - seg->s_base))))
 523                 return (SEGOP_SETPROT(segkp, addr, size, prot));
 524 
 525         if (prot == 0)
 526                 hat_unload(kas.a_hat, addr, size, HAT_UNLOAD);
 527         else
 528                 hat_chgprot(kas.a_hat, addr, size, prot);
 529         return (0);
 530 }
 531 
 532 /*
 533  * This is a dummy segkmem function overloaded to call segkp
 534  * when segkp is under the heap.
 535  */
 536 /* ARGSUSED */
 537 static int
 538 segkmem_checkprot(struct seg *seg, caddr_t addr, size_t size, uint_t prot)
 539 {
 540         ASSERT(RW_LOCK_HELD(&seg->s_as->a_lock));
 541 
 542         if (seg->s_as != &kas)
 543                 segkmem_badop();
 544 
 545         /*
 546          * If it is one of segkp pages, call into segkp.
 547          */
 548         if (segkp_bitmap && seg == &kvseg &&
 549             BT_TEST(segkp_bitmap, btop((uintptr_t)(addr - seg->s_base))))
 550                 return (SEGOP_CHECKPROT(segkp, addr, size, prot));
 551 
 552         segkmem_badop();
 553         return (0);
 554 }
 555 
 556 /*
 557  * This is a dummy segkmem function overloaded to call segkp
 558  * when segkp is under the heap.
 559  */
 560 /* ARGSUSED */
 561 static int
 562 segkmem_kluster(struct seg *seg, caddr_t addr, ssize_t delta)
 563 {
 564         ASSERT(RW_LOCK_HELD(&seg->s_as->a_lock));
 565 
 566         if (seg->s_as != &kas)
 567                 segkmem_badop();
 568 
 569         /*
 570          * If it is one of segkp pages, call into segkp.
 571          */
 572         if (segkp_bitmap && seg == &kvseg &&
 573             BT_TEST(segkp_bitmap, btop((uintptr_t)(addr - seg->s_base))))
 574                 return (SEGOP_KLUSTER(segkp, addr, delta));
 575 
 576         segkmem_badop();
 577         return (0);
 578 }
 579 
 580 static void
 581 segkmem_xdump_range(void *arg, void *start, size_t size)
 582 {
 583         struct as *as = arg;
 584         caddr_t addr = start;
 585         caddr_t addr_end = addr + size;
 586 
 587         while (addr < addr_end) {
 588                 pfn_t pfn = hat_getpfnum(kas.a_hat, addr);
 589                 if (pfn != PFN_INVALID && pfn <= physmax && pf_is_memory(pfn))
 590                         dump_addpage(as, addr, pfn);
 591                 addr += PAGESIZE;
 592                 dump_timeleft = dump_timeout;
 593         }
 594 }
 595 
 596 static void
 597 segkmem_dump_range(void *arg, void *start, size_t size)
 598 {
 599         caddr_t addr = start;
 600         caddr_t addr_end = addr + size;
 601 
 602         /*
  603          * If we are about to start dumping the range of addresses we
  604          * carved out of the kernel heap for the large page heap, walk
  605          * heap_lp_arena to find which segments are actually populated.
 606          */
 607         if (SEGKMEM_USE_LARGEPAGES &&
 608             addr == heap_lp_base && addr_end == heap_lp_end &&
 609             vmem_size(heap_lp_arena, VMEM_ALLOC) < size) {
 610                 vmem_walk(heap_lp_arena, VMEM_ALLOC | VMEM_REENTRANT,
 611                     segkmem_xdump_range, arg);
 612         } else {
 613                 segkmem_xdump_range(arg, start, size);
 614         }
 615 }
 616 
 617 static void
 618 segkmem_dump(struct seg *seg)
 619 {
 620         /*
 621          * The kernel's heap_arena (represented by kvseg) is a very large
 622          * VA space, most of which is typically unused.  To speed up dumping
 623          * we use vmem_walk() to quickly find the pieces of heap_arena that
 624          * are actually in use.  We do the same for heap32_arena and
 625          * heap_core.
 626          *
 627          * We specify VMEM_REENTRANT to vmem_walk() because dump_addpage()
 628          * may ultimately need to allocate memory.  Reentrant walks are
 629          * necessarily imperfect snapshots.  The kernel heap continues
 630          * to change during a live crash dump, for example.  For a normal
 631          * crash dump, however, we know that there won't be any other threads
 632          * messing with the heap.  Therefore, at worst, we may fail to dump
 633          * the pages that get allocated by the act of dumping; but we will
 634          * always dump every page that was allocated when the walk began.
 635          *
 636          * The other segkmem segments are dense (fully populated), so there's
 637          * no need to use this technique when dumping them.
 638          *
 639          * Note: when adding special dump handling for any new sparsely-
 640          * populated segments, be sure to add similar handling to the ::kgrep
 641          * code in mdb.
 642          */
 643         if (seg == &kvseg) {
 644                 vmem_walk(heap_arena, VMEM_ALLOC | VMEM_REENTRANT,
 645                     segkmem_dump_range, seg->s_as);
 646 #ifndef __sparc
 647                 vmem_walk(heaptext_arena, VMEM_ALLOC | VMEM_REENTRANT,
 648                     segkmem_dump_range, seg->s_as);
 649 #endif
 650         } else if (seg == &kvseg_core) {
 651                 vmem_walk(heap_core_arena, VMEM_ALLOC | VMEM_REENTRANT,
 652                     segkmem_dump_range, seg->s_as);
 653         } else if (seg == &kvseg32) {
 654                 vmem_walk(heap32_arena, VMEM_ALLOC | VMEM_REENTRANT,
 655                     segkmem_dump_range, seg->s_as);
 656                 vmem_walk(heaptext_arena, VMEM_ALLOC | VMEM_REENTRANT,
 657                     segkmem_dump_range, seg->s_as);
 658         } else if (seg == &kzioseg) {
 659                 /*
 660                  * We don't want to dump pages attached to kzioseg since they
 661                  * contain file data from ZFS.  If this page's segment is
 662                  * kzioseg return instead of writing it to the dump device.
 663                  */
 664                 return;
 665         } else {
 666                 segkmem_dump_range(seg->s_as, seg->s_base, seg->s_size);
 667         }
 668 }
 669 
 670 /*
 671  * lock/unlock kmem pages over a given range [addr, addr+len).
 672  * Returns a shadow list of pages in ppp. If there are holes
 673  * in the range (e.g. some of the kernel mappings do not have
 674  * underlying page_ts) returns ENOTSUP so that as_pagelock()
 675  * will handle the range via as_fault(F_SOFTLOCK).
 676  */
 677 /*ARGSUSED*/
 678 static int
 679 segkmem_pagelock(struct seg *seg, caddr_t addr, size_t len,
 680     page_t ***ppp, enum lock_type type, enum seg_rw rw)
 681 {
 682         page_t **pplist, *pp;
 683         pgcnt_t npages;
 684         spgcnt_t pg;
 685         size_t nb;
 686         struct vnode *vp = seg->s_data;
 687 
 688         ASSERT(ppp != NULL);
 689 
 690         /*
 691          * If it is one of segkp pages, call into segkp.
 692          */
 693         if (segkp_bitmap && seg == &kvseg &&
 694             BT_TEST(segkp_bitmap, btop((uintptr_t)(addr - seg->s_base))))
 695                 return (SEGOP_PAGELOCK(segkp, addr, len, ppp, type, rw));
 696 
 697         npages = btopr(len);
 698         nb = sizeof (page_t *) * npages;
 699 
 700         if (type == L_PAGEUNLOCK) {
 701                 pplist = *ppp;
 702                 ASSERT(pplist != NULL);
 703 
 704                 for (pg = 0; pg < npages; pg++) {
 705                         pp = pplist[pg];
 706                         page_unlock(pp);
 707                 }
 708                 kmem_free(pplist, nb);
 709                 return (0);
 710         }
 711 
 712         ASSERT(type == L_PAGELOCK);
 713 
 714         pplist = kmem_alloc(nb, KM_NOSLEEP);
 715         if (pplist == NULL) {
 716                 *ppp = NULL;
 717                 return (ENOTSUP);       /* take the slow path */
 718         }
 719 
 720         for (pg = 0; pg < npages; pg++) {
 721                 pp = page_lookup(vp, (u_offset_t)(uintptr_t)addr, SE_SHARED);
 722                 if (pp == NULL) {
 723                         while (--pg >= 0)
 724                                 page_unlock(pplist[pg]);
 725                         kmem_free(pplist, nb);
 726                         *ppp = NULL;
 727                         return (ENOTSUP);
 728                 }
 729                 pplist[pg] = pp;
 730                 addr += PAGESIZE;
 731         }
 732 
 733         *ppp = pplist;
 734         return (0);
 735 }
 736 
 737 /*
 738  * This is a dummy segkmem function overloaded to call segkp
 739  * when segkp is under the heap.
 740  */
 741 /* ARGSUSED */
 742 static int
 743 segkmem_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
 744 {
 745         ASSERT(RW_LOCK_HELD(&seg->s_as->a_lock));
 746 
 747         if (seg->s_as != &kas)
 748                 segkmem_badop();
 749 
 750         /*
 751          * If it is one of segkp pages, call into segkp.
 752          */
 753         if (segkp_bitmap && seg == &kvseg &&
 754             BT_TEST(segkp_bitmap, btop((uintptr_t)(addr - seg->s_base))))
 755                 return (SEGOP_GETMEMID(segkp, addr, memidp));
 756 
 757         segkmem_badop();
 758         return (0);
 759 }
 760 
 761 /*ARGSUSED*/
 762 static lgrp_mem_policy_info_t *
 763 segkmem_getpolicy(struct seg *seg, caddr_t addr)
 764 {
 765         return (NULL);
 766 }
 767 
 768 /*ARGSUSED*/
 769 static int
 770 segkmem_capable(struct seg *seg, segcapability_t capability)
 771 {
 772         if (capability == S_CAPABILITY_NOMINFLT)
 773                 return (1);
 774         return (0);
 775 }
 776 
 777 struct seg_ops segkmem_ops = {
 778         SEGKMEM_BADOP(int),             /* dup */
 779         SEGKMEM_BADOP(int),             /* unmap */
 780         SEGKMEM_BADOP(void),            /* free */
 781         segkmem_fault,
 782         SEGKMEM_BADOP(faultcode_t),     /* faulta */
 783         segkmem_setprot,
 784         segkmem_checkprot,
 785         segkmem_kluster,
 786         SEGKMEM_BADOP(size_t),          /* swapout */
 787         SEGKMEM_BADOP(int),             /* sync */
 788         SEGKMEM_BADOP(size_t),          /* incore */
 789         SEGKMEM_BADOP(int),             /* lockop */
 790         SEGKMEM_BADOP(int),             /* getprot */
 791         SEGKMEM_BADOP(u_offset_t),      /* getoffset */
 792         SEGKMEM_BADOP(int),             /* gettype */
 793         SEGKMEM_BADOP(int),             /* getvp */
 794         SEGKMEM_BADOP(int),             /* advise */
 795         segkmem_dump,
 796         segkmem_pagelock,
 797         SEGKMEM_BADOP(int),             /* setpgsz */
 798         segkmem_getmemid,
 799         segkmem_getpolicy,              /* getpolicy */
 800         segkmem_capable,                /* capable */
 801         seg_inherit_notsup              /* inherit */
 802 };
 803 
 804 int
 805 segkmem_zio_create(struct seg *seg)
 806 {
 807         ASSERT(seg->s_as == &kas && RW_WRITE_HELD(&kas.a_lock));
 808         seg->s_ops = &segkmem_ops;
 809         seg->s_data = &zvp;
 810         kas.a_size += seg->s_size;
 811         return (0);
 812 }
 813 
 814 int
 815 segkmem_create(struct seg *seg)
 816 {
 817         ASSERT(seg->s_as == &kas && RW_WRITE_HELD(&kas.a_lock));
 818         seg->s_ops = &segkmem_ops;
 819         seg->s_data = &kvp;
 820         kas.a_size += seg->s_size;
 821         return (0);
 822 }
 823 
 824 /*ARGSUSED*/
 825 page_t *
 826 segkmem_page_create(void *addr, size_t size, int vmflag, void *arg)
 827 {
 828         struct seg kseg;
 829         int pgflags;
 830         struct vnode *vp = arg;
 831 
 832         if (vp == NULL)
 833                 vp = &kvp;
 834 
 835         kseg.s_as = &kas;
 836         pgflags = PG_EXCL;
 837 
 838         if (segkmem_reloc == 0 || (vmflag & VM_NORELOC))
 839                 pgflags |= PG_NORELOC;
 840         if ((vmflag & VM_NOSLEEP) == 0)
 841                 pgflags |= PG_WAIT;
 842         if (vmflag & VM_PANIC)
 843                 pgflags |= PG_PANIC;
 844         if (vmflag & VM_PUSHPAGE)
 845                 pgflags |= PG_PUSHPAGE;
 846         if (vmflag & VM_NORMALPRI) {
 847                 ASSERT(vmflag & VM_NOSLEEP);
 848                 pgflags |= PG_NORMALPRI;
 849         }
 850 
 851         return (page_create_va(vp, (u_offset_t)(uintptr_t)addr, size,
 852             pgflags, &kseg, addr));
 853 }
 854 
 855 /*
 856  * Allocate pages to back the virtual address range [addr, addr + size).
 857  * If addr is NULL, allocate the virtual address space as well.
 858  */
 859 void *
 860 segkmem_xalloc(vmem_t *vmp, void *inaddr, size_t size, int vmflag, uint_t attr,
 861     page_t *(*page_create_func)(void *, size_t, int, void *), void *pcarg)
 862 {
 863         page_t *ppl;
 864         caddr_t addr = inaddr;
 865         pgcnt_t npages = btopr(size);
 866         int allocflag;
 867 
 868         if (inaddr == NULL && (addr = vmem_alloc(vmp, size, vmflag)) == NULL)
 869                 return (NULL);
 870 
 871         ASSERT(((uintptr_t)addr & PAGEOFFSET) == 0);
 872 
 873         if (page_resv(npages, vmflag & VM_KMFLAGS) == 0) {
 874                 if (inaddr == NULL)
 875                         vmem_free(vmp, addr, size);
 876                 return (NULL);
 877         }
 878 
 879         ppl = page_create_func(addr, size, vmflag, pcarg);
 880         if (ppl == NULL) {
 881                 if (inaddr == NULL)
 882                         vmem_free(vmp, addr, size);
 883                 page_unresv(npages);
 884                 return (NULL);
 885         }
 886 
 887         /*
 888          * Under certain conditions, we need to let the HAT layer know
 889          * that it cannot safely allocate memory.  Allocations from
 890          * the hat_memload vmem arena always need this, to prevent
 891          * infinite recursion.
 892          *
 893          * In addition, the x86 hat cannot safely do memory
 894          * allocations while in vmem_populate(), because there
 895          * is no simple bound on its usage.
 896          */
 897         if (vmflag & VM_MEMLOAD)
 898                 allocflag = HAT_NO_KALLOC;
 899 #if defined(__x86)
 900         else if (vmem_is_populator())
 901                 allocflag = HAT_NO_KALLOC;
 902 #endif
 903         else
 904                 allocflag = 0;
 905 
 906         while (ppl != NULL) {
 907                 page_t *pp = ppl;
 908                 page_sub(&ppl, pp);
 909                 ASSERT(page_iolock_assert(pp));
 910                 ASSERT(PAGE_EXCL(pp));
 911                 page_io_unlock(pp);
 912                 hat_memload(kas.a_hat, (caddr_t)(uintptr_t)pp->p_offset, pp,
 913                     (PROT_ALL & ~PROT_USER) | HAT_NOSYNC | attr,
 914                     HAT_LOAD_LOCK | allocflag);
 915                 pp->p_lckcnt = 1;
 916 #if defined(__x86)
 917                 page_downgrade(pp);
 918 #else
 919                 if (vmflag & SEGKMEM_SHARELOCKED)
 920                         page_downgrade(pp);
 921                 else
 922                         page_unlock(pp);
 923 #endif
 924         }
 925 
 926         return (addr);
 927 }
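/*
 * Editor's sketch (hypothetical, not part of the original file): how
 * segkmem_page_create() and segkmem_xalloc() fit together.  This mirrors the
 * call made by segkmem_alloc_vn() below; example_wired_page() itself is not
 * a real function.
 */
static void *
example_wired_page(void)
{
        /* one wired page, sleeping allocation, backed by &kvp */
        return (segkmem_xalloc(heap_arena, NULL, PAGESIZE, VM_SLEEP, 0,
            segkmem_page_create, &kvp));
}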
 928 
 929 static void *
 930 segkmem_alloc_vn(vmem_t *vmp, size_t size, int vmflag, struct vnode *vp)
 931 {
 932         void *addr;
 933         segkmem_gc_list_t *gcp, **prev_gcpp;
 934 
 935         ASSERT(vp != NULL);
 936 
 937         if (kvseg.s_base == NULL) {
 938 #ifndef __sparc
 939                 if (bootops->bsys_alloc == NULL)
 940                         halt("Memory allocation between bop_alloc() and "
 941                             "kmem_alloc().\n");
 942 #endif
 943 
 944                 /*
 945                  * There's not a lot of memory to go around during boot,
 946                  * so recycle it if we can.
 947                  */
 948                 for (prev_gcpp = &segkmem_gc_list; (gcp = *prev_gcpp) != NULL;
 949                     prev_gcpp = &gcp->gc_next) {
 950                         if (gcp->gc_arena == vmp && gcp->gc_size == size) {
 951                                 *prev_gcpp = gcp->gc_next;
 952                                 return (gcp);
 953                         }
 954                 }
 955 
 956                 addr = vmem_alloc(vmp, size, vmflag | VM_PANIC);
 957                 if (boot_alloc(addr, size, BO_NO_ALIGN) != addr)
 958                         panic("segkmem_alloc: boot_alloc failed");
 959                 return (addr);
 960         }
 961         return (segkmem_xalloc(vmp, NULL, size, vmflag, 0,
 962             segkmem_page_create, vp));
 963 }
 964 
 965 void *
 966 segkmem_alloc(vmem_t *vmp, size_t size, int vmflag)
 967 {
 968         return (segkmem_alloc_vn(vmp, size, vmflag, &kvp));
 969 }
 970 
 971 void *
 972 segkmem_zio_alloc(vmem_t *vmp, size_t size, int vmflag)
 973 {
 974         return (segkmem_alloc_vn(vmp, size, vmflag, &zvp));
 975 }
 976 
 977 /*
 978  * Any changes to this routine must also be carried over to
 979  * devmap_free_pages() in the seg_dev driver. This is because
 980  * we currently don't have a special kernel segment for non-paged
 981  * kernel memory that is exported by drivers to user space.
 982  */
 983 static void
 984 segkmem_free_vn(vmem_t *vmp, void *inaddr, size_t size, struct vnode *vp,
 985     void (*func)(page_t *))
 986 {
 987         page_t *pp;
 988         caddr_t addr = inaddr;
 989         caddr_t eaddr;
 990         pgcnt_t npages = btopr(size);
 991 
 992         ASSERT(((uintptr_t)addr & PAGEOFFSET) == 0);
 993         ASSERT(vp != NULL);
 994 
 995         if (kvseg.s_base == NULL) {
 996                 segkmem_gc_list_t *gc = inaddr;
 997                 gc->gc_arena = vmp;
 998                 gc->gc_size = size;
 999                 gc->gc_next = segkmem_gc_list;
1000                 segkmem_gc_list = gc;
1001                 return;
1002         }
1003 
1004         hat_unload(kas.a_hat, addr, size, HAT_UNLOAD_UNLOCK);
1005 
1006         for (eaddr = addr + size; addr < eaddr; addr += PAGESIZE) {
1007 #if defined(__x86)
1008                 pp = page_find(vp, (u_offset_t)(uintptr_t)addr);
1009                 if (pp == NULL)
1010                         panic("segkmem_free: page not found");
1011                 if (!page_tryupgrade(pp)) {
1012                         /*
1013                          * Some other thread has a sharelock. Wait for
1014                          * it to drop the lock so we can free this page.
1015                          */
1016                         page_unlock(pp);
1017                         pp = page_lookup(vp, (u_offset_t)(uintptr_t)addr,
1018                             SE_EXCL);
1019                 }
1020 #else
1021                 pp = page_lookup(vp, (u_offset_t)(uintptr_t)addr, SE_EXCL);
1022 #endif
1023                 if (pp == NULL)
1024                         panic("segkmem_free: page not found");
1025                 /* Clear p_lckcnt so page_destroy() doesn't update availrmem */
1026                 pp->p_lckcnt = 0;
1027                 if (func)
1028                         func(pp);
1029                 else
1030                         page_destroy(pp, 0);
1031         }
1032         if (func == NULL)
1033                 page_unresv(npages);
1034 
1035         if (vmp != NULL)
1036                 vmem_free(vmp, inaddr, size);
1037 
1038 }
1039 
1040 void
1041 segkmem_xfree(vmem_t *vmp, void *inaddr, size_t size, void (*func)(page_t *))
1042 {
1043         segkmem_free_vn(vmp, inaddr, size, &kvp, func);
1044 }
1045 
1046 void
1047 segkmem_free(vmem_t *vmp, void *inaddr, size_t size)
1048 {
1049         segkmem_free_vn(vmp, inaddr, size, &kvp, NULL);
1050 }
1051 
1052 void
1053 segkmem_zio_free(vmem_t *vmp, void *inaddr, size_t size)
1054 {
1055         segkmem_free_vn(vmp, inaddr, size, &zvp, NULL);
1056 }
1057 
1058 void
1059 segkmem_gc(void)
1060 {
1061         ASSERT(kvseg.s_base != NULL);
1062         while (segkmem_gc_list != NULL) {
1063                 segkmem_gc_list_t *gc = segkmem_gc_list;
1064                 segkmem_gc_list = gc->gc_next;
1065                 segkmem_free(gc->gc_arena, gc, gc->gc_size);
1066         }
1067 }
1068 
1069 /*
1070  * Legacy entry points from here to end of file.
1071  */
1072 void
1073 segkmem_mapin(struct seg *seg, void *addr, size_t size, uint_t vprot,
1074     pfn_t pfn, uint_t flags)
1075 {
1076         hat_unload(seg->s_as->a_hat, addr, size, HAT_UNLOAD_UNLOCK);
1077         hat_devload(seg->s_as->a_hat, addr, size, pfn, vprot,
1078             flags | HAT_LOAD_LOCK);
1079 }
1080 
1081 void
1082 segkmem_mapout(struct seg *seg, void *addr, size_t size)
1083 {
1084         hat_unload(seg->s_as->a_hat, addr, size, HAT_UNLOAD_UNLOCK);
1085 }
1086 
1087 void *
1088 kmem_getpages(pgcnt_t npages, int kmflag)
1089 {
1090         return (kmem_alloc(ptob(npages), kmflag));
1091 }
1092 
1093 void
1094 kmem_freepages(void *addr, pgcnt_t npages)
1095 {
1096         kmem_free(addr, ptob(npages));
1097 }
1098 
1099 /*
1100  * segkmem_page_create_large() allocates a large page to be used for the kmem
 1101  * caches. If kpr is enabled, we ask for a relocatable page unless requested
 1102  * otherwise. If kpr is disabled, we have to ask for a non-relocatable page.
1103  */
1104 static page_t *
1105 segkmem_page_create_large(void *addr, size_t size, int vmflag, void *arg)
1106 {
1107         int pgflags;
1108 
1109         pgflags = PG_EXCL;
1110 
1111         if (segkmem_reloc == 0 || (vmflag & VM_NORELOC))
1112                 pgflags |= PG_NORELOC;
1113         if (!(vmflag & VM_NOSLEEP))
1114                 pgflags |= PG_WAIT;
1115         if (vmflag & VM_PUSHPAGE)
1116                 pgflags |= PG_PUSHPAGE;
1117         if (vmflag & VM_NORMALPRI)
1118                 pgflags |= PG_NORMALPRI;
1119 
1120         return (page_create_va_large(&kvp, (u_offset_t)(uintptr_t)addr, size,
1121             pgflags, &kvseg, addr, arg));
1122 }
1123 
1124 /*
1125  * Allocate a large page to back the virtual address range
1126  * [addr, addr + size).  If addr is NULL, allocate the virtual address
1127  * space as well.
1128  */
1129 static void *
1130 segkmem_xalloc_lp(vmem_t *vmp, void *inaddr, size_t size, int vmflag,
1131     uint_t attr, page_t *(*page_create_func)(void *, size_t, int, void *),
1132     void *pcarg)
1133 {
1134         caddr_t addr = inaddr, pa;
1135         size_t  lpsize = segkmem_lpsize;
1136         pgcnt_t npages = btopr(size);
1137         pgcnt_t nbpages = btop(lpsize);
1138         pgcnt_t nlpages = size >> segkmem_lpshift;
1139         size_t  ppasize = nbpages * sizeof (page_t *);
1140         page_t *pp, *rootpp, **ppa, *pplist = NULL;
1141         int i;
1142 
1143         vmflag |= VM_NOSLEEP;
1144 
1145         if (page_resv(npages, vmflag & VM_KMFLAGS) == 0) {
1146                 return (NULL);
1147         }
1148 
1149         /*
1150          * allocate an array we need for hat_memload_array.
1151          * we use a separate arena to avoid recursion.
1152          * we will not need this array when hat_memload_array learns pp++
1153          */
1154         if ((ppa = vmem_alloc(segkmem_ppa_arena, ppasize, vmflag)) == NULL) {
1155                 goto fail_array_alloc;
1156         }
1157 
1158         if (inaddr == NULL && (addr = vmem_alloc(vmp, size, vmflag)) == NULL)
1159                 goto fail_vmem_alloc;
1160 
1161         ASSERT(((uintptr_t)addr & (lpsize - 1)) == 0);
1162 
1163         /* create all the pages */
1164         for (pa = addr, i = 0; i < nlpages; i++, pa += lpsize) {
1165                 if ((pp = page_create_func(pa, lpsize, vmflag, pcarg)) == NULL)
1166                         goto fail_page_create;
1167                 page_list_concat(&pplist, &pp);
1168         }
1169 
 1170         /* at this point we have all the resources to complete the request */
1171         while ((rootpp = pplist) != NULL) {
1172                 for (i = 0; i < nbpages; i++) {
1173                         ASSERT(pplist != NULL);
1174                         pp = pplist;
1175                         page_sub(&pplist, pp);
1176                         ASSERT(page_iolock_assert(pp));
1177                         page_io_unlock(pp);
1178                         ppa[i] = pp;
1179                 }
1180                 /*
1181                  * Load the locked entry. It's OK to preload the entry into the
1182                  * TSB since we now support large mappings in the kernel TSB.
1183                  */
1184                 hat_memload_array(kas.a_hat,
1185                     (caddr_t)(uintptr_t)rootpp->p_offset, lpsize,
1186                     ppa, (PROT_ALL & ~PROT_USER) | HAT_NOSYNC | attr,
1187                     HAT_LOAD_LOCK);
1188 
1189                 for (--i; i >= 0; --i) {
1190                         ppa[i]->p_lckcnt = 1;
1191                         page_unlock(ppa[i]);
1192                 }
1193         }
1194 
1195         vmem_free(segkmem_ppa_arena, ppa, ppasize);
1196         return (addr);
1197 
1198 fail_page_create:
1199         while ((rootpp = pplist) != NULL) {
1200                 for (i = 0, pp = pplist; i < nbpages; i++, pp = pplist) {
1201                         ASSERT(pp != NULL);
1202                         page_sub(&pplist, pp);
1203                         ASSERT(page_iolock_assert(pp));
1204                         page_io_unlock(pp);
1205                 }
1206                 page_destroy_pages(rootpp);
1207         }
1208 
1209         if (inaddr == NULL)
1210                 vmem_free(vmp, addr, size);
1211 
1212 fail_vmem_alloc:
1213         vmem_free(segkmem_ppa_arena, ppa, ppasize);
1214 
1215 fail_array_alloc:
1216         page_unresv(npages);
1217 
1218         return (NULL);
1219 }
1220 
1221 static void
1222 segkmem_free_one_lp(caddr_t addr, size_t size)
1223 {
1224         page_t          *pp, *rootpp = NULL;
1225         pgcnt_t         pgs_left = btopr(size);
1226 
1227         ASSERT(size == segkmem_lpsize);
1228 
1229         hat_unload(kas.a_hat, addr, size, HAT_UNLOAD_UNLOCK);
1230 
1231         for (; pgs_left > 0; addr += PAGESIZE, pgs_left--) {
1232                 pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)addr, SE_EXCL);
1233                 if (pp == NULL)
1234                         panic("segkmem_free_one_lp: page not found");
1235                 ASSERT(PAGE_EXCL(pp));
1236                 pp->p_lckcnt = 0;
1237                 if (rootpp == NULL)
1238                         rootpp = pp;
1239         }
1240         ASSERT(rootpp != NULL);
1241         page_destroy_pages(rootpp);
1242 
1243         /* page_unresv() is done by the caller */
1244 }
1245 
1246 /*
1247  * This function is called to import new spans into the vmem arenas like
1248  * kmem_default_arena and kmem_oversize_arena. It first tries to import
 1249  * spans from the large page arena, kmem_lp_arena. In order to do this it
 1250  * might have to "upgrade" the requested size to the kmem_lp_arena quantum.
 1251  * If it is unable to satisfy the upgraded request, it falls back to regular
 1252  * segkmem_alloc(), which satisfies the request by importing from the *vmp arena.
1253  */
1254 /*ARGSUSED*/
1255 void *
1256 segkmem_alloc_lp(vmem_t *vmp, size_t *sizep, size_t align, int vmflag)
1257 {
1258         size_t size;
1259         kthread_t *t = curthread;
1260         segkmem_lpcb_t *lpcb = &segkmem_lpcb;
1261 
1262         ASSERT(sizep != NULL);
1263 
1264         size = *sizep;
1265 
1266         if (lpcb->lp_uselp && !(t->t_flag & T_PANIC) &&
1267             !(vmflag & SEGKMEM_SHARELOCKED)) {
1268 
1269                 size_t kmemlp_qnt = segkmem_kmemlp_quantum;
1270                 size_t asize = P2ROUNDUP(size, kmemlp_qnt);
1271                 void  *addr = NULL;
1272                 ulong_t *lpthrtp = &lpcb->lp_throttle;
1273                 ulong_t lpthrt = *lpthrtp;
1274                 int     dowakeup = 0;
1275                 int     doalloc = 1;
1276 
1277                 ASSERT(kmem_lp_arena != NULL);
1278                 ASSERT(asize >= size);
1279 
1280                 if (lpthrt != 0) {
1281                         /* try to update the throttle value */
1282                         lpthrt = atomic_inc_ulong_nv(lpthrtp);
1283                         if (lpthrt >= segkmem_lpthrottle_max) {
1284                                 lpthrt = atomic_cas_ulong(lpthrtp, lpthrt,
1285                                     segkmem_lpthrottle_max / 4);
1286                         }
1287 
1288                         /*
 1289                          * when we get above the throttle start, do an
 1290                          * exponential backoff on trying large pages and reaping
1291                          */
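                        /*
                         * (Editor's note, not in the original file:) the
                         * !ISP2() test below lets an allocation attempt
                         * through only when the throttle counter is a power
                         * of two, and kmem_reap() fires on the very next
                         * call after each such attempt.
                         */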
1292                         if (lpthrt > segkmem_lpthrottle_start &&
1293                             !ISP2(lpthrt)) {
1294                                 lpcb->allocs_throttled++;
1295                                 lpthrt--;
1296                                 if (ISP2(lpthrt))
1297                                         kmem_reap();
1298                                 return (segkmem_alloc(vmp, size, vmflag));
1299                         }
1300                 }
1301 
1302                 if (!(vmflag & VM_NOSLEEP) &&
1303                     segkmem_heaplp_quantum >= (8 * kmemlp_qnt) &&
1304                     vmem_size(kmem_lp_arena, VMEM_FREE) <= kmemlp_qnt &&
1305                     asize < (segkmem_heaplp_quantum - kmemlp_qnt)) {
1306 
1307                         /*
 1308                          * We are low on free memory in kmem_lp_arena, so
 1309                          * we let only one thread allocate a heap_lp
 1310                          * quantum sized chunk that everybody is going to
 1311                          * share.
1312                          */
1313                         mutex_enter(&lpcb->lp_lock);
1314 
1315                         if (lpcb->lp_wait) {
1316 
1317                                 /* we are not the first one - wait */
1318                                 cv_wait(&lpcb->lp_cv, &lpcb->lp_lock);
1319                                 if (vmem_size(kmem_lp_arena, VMEM_FREE) <
1320                                     kmemlp_qnt)  {
1321                                         doalloc = 0;
1322                                 }
1323                         } else if (vmem_size(kmem_lp_arena, VMEM_FREE) <=
1324                             kmemlp_qnt) {
1325 
1326                                 /*
1327                                  * we are the first one, make sure we import
1328                                  * a large page
1329                                  */
1330                                 if (asize == kmemlp_qnt)
1331                                         asize += kmemlp_qnt;
1332                                 dowakeup = 1;
1333                                 lpcb->lp_wait = 1;
1334                         }
1335 
1336                         mutex_exit(&lpcb->lp_lock);
1337                 }
1338 
1339                 /*
1340                  * VM_ABORT flag prevents sleeps in vmem_xalloc when
1341                  * large pages are not available. In that case this allocation
1342                  * attempt will fail and we will retry allocation with small
1343                  * pages. We also do not want to panic if this allocation fails
1344                  * because we are going to retry.
1345                  */
1346                 if (doalloc) {
1347                         addr = vmem_alloc(kmem_lp_arena, asize,
1348                             (vmflag | VM_ABORT) & ~VM_PANIC);
1349 
1350                         if (dowakeup) {
1351                                 mutex_enter(&lpcb->lp_lock);
1352                                 ASSERT(lpcb->lp_wait != 0);
1353                                 lpcb->lp_wait = 0;
1354                                 cv_broadcast(&lpcb->lp_cv);
1355                                 mutex_exit(&lpcb->lp_lock);
1356                         }
1357                 }
1358 
1359                 if (addr != NULL) {
1360                         *sizep = asize;
1361                         *lpthrtp = 0;
1362                         return (addr);
1363                 }
1364 
1365                 if (vmflag & VM_NOSLEEP)
1366                         lpcb->nosleep_allocs_failed++;
1367                 else
1368                         lpcb->sleep_allocs_failed++;
1369                 lpcb->alloc_bytes_failed += size;
1370 
 1371                 /* if large page throttling is not started yet, start it */
1372                 if (segkmem_use_lpthrottle && lpthrt == 0) {
1373                         lpthrt = atomic_cas_ulong(lpthrtp, lpthrt, 1);
1374                 }
1375         }
1376         return (segkmem_alloc(vmp, size, vmflag));
1377 }
1378 
1379 void
1380 segkmem_free_lp(vmem_t *vmp, void *inaddr, size_t size)
1381 {
1382         if (kmem_lp_arena == NULL || !IS_KMEM_VA_LARGEPAGE((caddr_t)inaddr)) {
1383                 segkmem_free(vmp, inaddr, size);
1384         } else {
1385                 vmem_free(kmem_lp_arena, inaddr, size);
1386         }
1387 }
1388 
1389 /*
 1390  * segkmem_alloc_lpi() imports virtual memory from the large page heap arena
 1391  * into the kmem_lp arena. In the process it maps the imported segment with
 1392  * large pages.
1393  */
1394 static void *
1395 segkmem_alloc_lpi(vmem_t *vmp, size_t size, int vmflag)
1396 {
1397         segkmem_lpcb_t *lpcb = &segkmem_lpcb;
1398         void  *addr;
1399 
1400         ASSERT(size != 0);
1401         ASSERT(vmp == heap_lp_arena);
1402 
 1403         /* do not allow the large page heap to grow beyond its limits */
1404         if (vmem_size(vmp, VMEM_ALLOC) >= segkmem_kmemlp_max) {
1405                 lpcb->allocs_limited++;
1406                 return (NULL);
1407         }
1408 
1409         addr = segkmem_xalloc_lp(vmp, NULL, size, vmflag, 0,
1410             segkmem_page_create_large, NULL);
1411         return (addr);
1412 }
1413 
1414 /*
 1415  * segkmem_free_lpi() returns virtual memory back into the large page heap
 1416  * arena from the kmem_lp arena. Before doing this it unmaps the segment and
 1417  * frees the large pages used to map it.
1418  */
1419 static void
1420 segkmem_free_lpi(vmem_t *vmp, void *inaddr, size_t size)
1421 {
1422         pgcnt_t         nlpages = size >> segkmem_lpshift;
1423         size_t          lpsize = segkmem_lpsize;
1424         caddr_t         addr = inaddr;
1425         pgcnt_t         npages = btopr(size);
1426         int             i;
1427 
1428         ASSERT(vmp == heap_lp_arena);
1429         ASSERT(IS_KMEM_VA_LARGEPAGE(addr));
1430         ASSERT(((uintptr_t)inaddr & (lpsize - 1)) == 0);
1431 
1432         for (i = 0; i < nlpages; i++) {
1433                 segkmem_free_one_lp(addr, lpsize);
1434                 addr += lpsize;
1435         }
1436 
1437         page_unresv(npages);
1438 
1439         vmem_free(vmp, inaddr, size);
1440 }
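
The arithmetic in segkmem_free_lpi() is easiest to check with concrete
numbers. A stand-alone sketch, assuming a 4 MB large page and an 8 KB base
page (example values only, not implied by the code):

#include <stdio.h>
#include <stddef.h>

#define	EX_LPSHIFT	22	/* log2(4 MB) -- assumed segkmem_lpshift */
#define	EX_PAGESHIFT	13	/* log2(8 KB) -- assumed base page shift */

int
main(void)
{
	size_t size = 32UL * 1024 * 1024;	/* a 32 MB span being freed */
	size_t nlpages = size >> EX_LPSHIFT;	/* large pages to unmap/free */
	size_t npages = size >> EX_PAGESHIFT;	/* base pages for page_unresv() */

	/* prints: nlpages=8 npages=4096 */
	printf("nlpages=%zu npages=%zu\n", nlpages, npages);
	return (0);
}
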
1441 
1442 /*
1443  * This function is called at system boot time by kmem_init right after
1444  * the /etc/system file has been read. Based on the hardware configuration
1445  * and the /etc/system settings it decides whether the system is going to
1446  * use large pages. The initialization necessary to actually start using
1447  * large pages happens later, after segkmem_heap_lp_init() is called.
1448  */
1449 int
1450 segkmem_lpsetup()
1451 {
1452         int use_large_pages = 0;
1453 
1454 #ifdef __sparc
1455 
1456         size_t memtotal = physmem * PAGESIZE;
1457 
1458         if (heap_lp_base == NULL) {
1459                 segkmem_lpsize = PAGESIZE;
1460                 return (0);
1461         }
1462 
1463         /* get the platform-dependent large page size for the kernel heap */
1464         segkmem_lpsize = get_segkmem_lpsize(segkmem_lpsize);
1465 
1466         if (segkmem_lpsize <= PAGESIZE) {
1467                 /*
1468                  * return the virtual space reserved for the large page
1469                  * kernel heap to the regular heap
1470                  */
1471                 vmem_xfree(heap_arena, heap_lp_base,
1472                     heap_lp_end - heap_lp_base);
1473                 heap_lp_base = NULL;
1474                 heap_lp_end = NULL;
1475                 segkmem_lpsize = PAGESIZE;
1476                 return (0);
1477         }
1478 
1479         /* set heap_lp quantum if necessary */
1480         if (segkmem_heaplp_quantum == 0 || !ISP2(segkmem_heaplp_quantum) ||
1481             P2PHASE(segkmem_heaplp_quantum, segkmem_lpsize)) {
1482                 segkmem_heaplp_quantum = segkmem_lpsize;
1483         }
1484 
1485         /* set kmem_lp quantum if necessary */
1486         if (segkmem_kmemlp_quantum == 0 || !ISP2(segkmem_kmemlp_quantum) ||
1487             segkmem_kmemlp_quantum > segkmem_heaplp_quantum) {
1488                 segkmem_kmemlp_quantum = segkmem_heaplp_quantum;
1489         }
1490 
1491         /* set total amount of memory allowed for large page kernel heap */
1492         if (segkmem_kmemlp_max == 0) {
1493                 if (segkmem_kmemlp_pcnt == 0 || segkmem_kmemlp_pcnt > 100)
1494                         segkmem_kmemlp_pcnt = 12;
1495                 segkmem_kmemlp_max = (memtotal * segkmem_kmemlp_pcnt) / 100;
1496         }
1497         segkmem_kmemlp_max = P2ROUNDUP(segkmem_kmemlp_max,
1498             segkmem_heaplp_quantum);
1499 
1500         /* fix the lp kmem preallocation request if necessary */
1501         if (segkmem_kmemlp_min) {
1502                 segkmem_kmemlp_min = P2ROUNDUP(segkmem_kmemlp_min,
1503                     segkmem_heaplp_quantum);
1504                 if (segkmem_kmemlp_min > segkmem_kmemlp_max)
1505                         segkmem_kmemlp_min = segkmem_kmemlp_max;
1506         }
1507 
1508         use_large_pages = 1;
1509         segkmem_lpszc = page_szc(segkmem_lpsize);
1510         segkmem_lpshift = page_get_shift(segkmem_lpszc);
1511 
1512 #endif
1513         return (use_large_pages);
1514 }
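
The sizing defaults above are plain percentage-of-memory arithmetic followed
by power-of-two round-up. A user-level sketch of the same computation with
assumed inputs (8 GB of physical memory, the default 12%, a 4 MB heap_lp
quantum -- none of these values come from the code itself):

#include <stdio.h>
#include <stddef.h>

/* P2ROUNDUP() as in <sys/sysmacros.h>: round x up to a power-of-two align */
#define	P2ROUNDUP(x, align)	(-(-(size_t)(x) & -(size_t)(align)))

int
main(void)
{
	size_t memtotal = 8UL << 30;		/* assumed physmem * PAGESIZE */
	size_t pcnt = 12;			/* default segkmem_kmemlp_pcnt */
	size_t quantum = 4UL << 20;		/* assumed heaplp quantum */

	size_t max = (memtotal * pcnt) / 100;	/* ~983 MB before rounding */
	max = P2ROUNDUP(max, quantum);		/* align to the quantum */

	/* prints: kmemlp_max = 984 MB */
	printf("kmemlp_max = %zu MB\n", max >> 20);
	return (0);
}
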
1515 
1516 void
1517 segkmem_zio_init(void *zio_mem_base, size_t zio_mem_size)
1518 {
1519         ASSERT(zio_mem_base != NULL);
1520         ASSERT(zio_mem_size != 0);
1521 
1522         /*
1523          * To reduce VA space fragmentation, we set up quantum caches for the
1524          * smaller sizes;  we chose 32k because that translates to 128k VA
1525          * slabs, which matches nicely with the common 128k zio_data bufs.
1526          */
1527         zio_arena = vmem_create("zfs_file_data", zio_mem_base, zio_mem_size,
1528             PAGESIZE, NULL, NULL, NULL, 32 * 1024, VM_SLEEP);
1529 
1530         zio_alloc_arena = vmem_create("zfs_file_data_buf", NULL, 0, PAGESIZE,
1531             segkmem_zio_alloc, segkmem_zio_free, zio_arena, 0, VM_SLEEP);
1532 
1533         ASSERT(zio_arena != NULL);
1534         ASSERT(zio_alloc_arena != NULL);
1535 }
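
The 32 * 1024 passed to vmem_create() above is the arena's qcache_max:
requests of up to 32 KB against zio_arena are satisfied from per-size quantum
caches instead of the arena's segment list, which is what limits VA
fragmentation. A hypothetical consumer taking a 128 KB file-data buffer
straight from the outer arena would look roughly like this (illustration
only; ZFS normally goes through kmem caches that are themselves sourced from
zio_alloc_arena):

static void
example_zio_roundtrip(void)
{
	/* 128 KB matches the common zio_data buffer size cited above */
	void *buf = vmem_alloc(zio_alloc_arena, 128 * 1024, VM_SLEEP);

	/* ... fill buf with file data ... */

	vmem_free(zio_alloc_arena, buf, 128 * 1024);
}
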
1536 
1537 #ifdef __sparc
1538 
1539 
1540 static void *
1541 segkmem_alloc_ppa(vmem_t *vmp, size_t size, int vmflag)
1542 {
1543         size_t ppaquantum = btopr(segkmem_lpsize) * sizeof (page_t *);
1544         void   *addr;
1545 
1546         if (ppaquantum <= PAGESIZE)
1547                 return (segkmem_alloc(vmp, size, vmflag));
1548 
1549         ASSERT((size & (ppaquantum - 1)) == 0);
1550 
1551         addr = vmem_xalloc(vmp, size, ppaquantum, 0, 0, NULL, NULL, vmflag);
1552         if (addr != NULL && segkmem_xalloc(vmp, addr, size, vmflag, 0,
1553             segkmem_page_create, NULL) == NULL) {
1554                 vmem_xfree(vmp, addr, size);
1555                 addr = NULL;
1556         }
1557 
1558         return (addr);
1559 }
1560 
1561 static void
1562 segkmem_free_ppa(vmem_t *vmp, void *addr, size_t size)
1563 {
1564         size_t ppaquantum = btopr(segkmem_lpsize) * sizeof (page_t *);
1565 
1566         ASSERT(addr != NULL);
1567 
1568         if (ppaquantum <= PAGESIZE) {
1569                 segkmem_free(vmp, addr, size);
1570         } else {
1571                 segkmem_free(NULL, addr, size);
1572                 vmem_xfree(vmp, addr, size);
1573         }
1574 }
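
In both functions above, ppaquantum is the size of the page_t pointer array
that describes one large page; the branch on PAGESIZE decides whether that
array fits in an ordinary allocation. A quick stand-alone check of both
branches, assuming an 8 KB base page and 64-bit pointers (example values
only):

#include <stdio.h>
#include <stddef.h>

#define	EX_PAGESIZE	8192UL		/* assumed base page size */

/* bytes of page_t pointers needed to describe one large page */
static size_t
ex_ppaquantum(size_t lpsize)
{
	return ((lpsize / EX_PAGESIZE) * sizeof (void *));
}

int
main(void)
{
	/* 4 MB large page: 512 pointers = 4096 bytes <= PAGESIZE, simple path */
	printf("4 MB   -> %zu bytes\n", ex_ppaquantum(4UL << 20));

	/* 256 MB large page: 32768 pointers = 256 KB > PAGESIZE, xalloc path */
	printf("256 MB -> %zu bytes\n", ex_ppaquantum(256UL << 20));
	return (0);
}
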
1575 
1576 void
1577 segkmem_heap_lp_init()
1578 {
1579         segkmem_lpcb_t *lpcb = &segkmem_lpcb;
1580         size_t heap_lp_size = heap_lp_end - heap_lp_base;
1581         size_t lpsize = segkmem_lpsize;
1582         size_t ppaquantum;
1583         void   *addr;
1584 
1585         if (segkmem_lpsize <= PAGESIZE) {
1586                 ASSERT(heap_lp_base == NULL);
1587                 ASSERT(heap_lp_end == NULL);
1588                 return;
1589         }
1590 
1591         ASSERT(segkmem_heaplp_quantum >= lpsize);
1592         ASSERT((segkmem_heaplp_quantum & (lpsize - 1)) == 0);
1593         ASSERT(lpcb->lp_uselp == 0);
1594         ASSERT(heap_lp_base != NULL);
1595         ASSERT(heap_lp_end != NULL);
1596         ASSERT(heap_lp_base < heap_lp_end);
1597         ASSERT(heap_lp_arena == NULL);
1598         ASSERT(((uintptr_t)heap_lp_base & (lpsize - 1)) == 0);
1599         ASSERT(((uintptr_t)heap_lp_end & (lpsize - 1)) == 0);
1600 
1601         /* create large page heap arena */
1602         heap_lp_arena = vmem_create("heap_lp", heap_lp_base, heap_lp_size,
1603             segkmem_heaplp_quantum, NULL, NULL, NULL, 0, VM_SLEEP);
1604 
1605         ASSERT(heap_lp_arena != NULL);
1606 
1607         /* This arena caches memory already mapped with large pages */
1608         kmem_lp_arena = vmem_create("kmem_lp", NULL, 0, segkmem_kmemlp_quantum,
1609             segkmem_alloc_lpi, segkmem_free_lpi, heap_lp_arena, 0, VM_SLEEP);
1610 
1611         ASSERT(kmem_lp_arena != NULL);
1612 
1613         mutex_init(&lpcb->lp_lock, NULL, MUTEX_DEFAULT, NULL);
1614         cv_init(&lpcb->lp_cv, NULL, CV_DEFAULT, NULL);
1615 
1616         /*
1617          * this arena is used for the array of page_t pointers necessary
1618          * to call hat_memload_array()
1619          */
1620         ppaquantum = btopr(lpsize) * sizeof (page_t *);
1621         segkmem_ppa_arena = vmem_create("segkmem_ppa", NULL, 0, ppaquantum,
1622             segkmem_alloc_ppa, segkmem_free_ppa, heap_arena, ppaquantum,
1623             VM_SLEEP);
1624 
1625         ASSERT(segkmem_ppa_arena != NULL);
1626 
1627         /* preallocate some memory for the lp kernel heap */
1628         if (segkmem_kmemlp_min) {
1629 
1630                 ASSERT(P2PHASE(segkmem_kmemlp_min,
1631                     segkmem_heaplp_quantum) == 0);
1632 
1633                 if ((addr = segkmem_alloc_lpi(heap_lp_arena,
1634                     segkmem_kmemlp_min, VM_SLEEP)) != NULL) {
1635 
1636                         addr = vmem_add(kmem_lp_arena, addr,
1637                             segkmem_kmemlp_min, VM_SLEEP);
1638                         ASSERT(addr != NULL);
1639                 }
1640         }
1641 
1642         lpcb->lp_uselp = 1;
1643 }
1644 
1645 #endif
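
For orientation, the large-page arenas created above stack up roughly as
follows (a summary of the vmem_create() calls and import/release pairs in
this file, not new behavior; arrows denote vmem import relationships):

	heap_lp_arena ....... [heap_lp_base, heap_lp_end): large-page VA span
	        |
	        |  import: segkmem_alloc_lpi() / segkmem_free_lpi()
	        v
	kmem_lp_arena ....... VA already mapped with large pages, handed out
	                      by the large-page allocation path, which falls
	                      back to segkmem_alloc() when large pages are
	                      unavailable

	segkmem_ppa_arena ... page_t pointer arrays for hat_memload_array(),
	                      sourced from the regular heap_arena
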
--- EOF ---