il_vmem-man Wdiff usr/src/uts/common/os/vmem.c

Print this page

7831 want vmem manual pages
7832 big theory statements need a place in the manual

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/os/vmem.c
          +++ new/usr/src/uts/common/os/vmem.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  23   23   * Use is subject to license terms.
  24   24   */
  25   25  
  26   26  /*
  27   27   * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  28   28   * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  29   29   */
  30   30  
  31   31  /*
  32   32   * Big Theory Statement for the virtual memory allocator.
  33   33   *

↓ open down ↓

33 lines elided

↑ open up ↑

  34   34   * For a more complete description of the main ideas, see:
  35   35   *
  36   36   *      Jeff Bonwick and Jonathan Adams,
  37   37   *
  38   38   *      Magazines and vmem: Extending the Slab Allocator to Many CPUs and
  39   39   *      Arbitrary Resources.
  40   40   *
  41   41   *      Proceedings of the 2001 Usenix Conference.
  42   42   *      Available as http://www.usenix.org/event/usenix01/bonwick.html
  43   43   *
       44 + * Section 1, below, is also the primary contents of vmem(9).  If for some
       45 + * reason you are updating this comment, you will also wish to update the
       46 + * manual.
  44   47   *
  45   48   * 1. General Concepts
  46   49   * -------------------
  47   50   *
  48   51   * 1.1 Overview
  49   52   * ------------
  50   53   * We divide the kernel address space into a number of logically distinct
  51   54   * pieces, or *arenas*: text, data, heap, stack, and so on.  Within these
  52   55   * arenas we often subdivide further; for example, we use heap addresses
  53   56   * not only for the kernel heap (kmem_alloc() space), but also for DVMA,

  54   57   * bp_mapin(), /dev/kmem, and even some device mappings like the TOD chip.
  55   58   * The kernel address space, therefore, is most accurately described as
  56   59   * a tree of arenas in which each node of the tree *imports* some subset
  57   60   * of its parent.  The virtual memory allocator manages these arenas and
  58   61   * supports their natural hierarchical structure.
  59   62   *
  60   63   * 1.2 Arenas
  61   64   * ----------
  62   65   * An arena is nothing more than a set of integers.  These integers most
  63   66   * commonly represent virtual addresses, but in fact they can represent
  64   67   * anything at all.  For example, we could use an arena containing the
  65   68   * integers minpid through maxpid to allocate process IDs.  vmem_create()
  66   69   * and vmem_destroy() create and destroy vmem arenas.  In order to
  67   70   * differentiate between arenas used for addresses and arenas used for
  68   71   * identifiers, the VMC_IDENTIFIER flag is passed to vmem_create().  This
  69   72   * prevents identifier exhaustion from being diagnosed as general memory
  70   73   * failure.
  71   74   *
  72   75   * 1.3 Spans
  73   76   * ---------
  74   77   * We represent the integers in an arena as a collection of *spans*, or
  75   78   * contiguous ranges of integers.  For example, the kernel heap consists
  76   79   * of just one span: [kernelheap, ekernelheap).  Spans can be added to an
  77   80   * arena in two ways: explicitly, by vmem_add(), or implicitly, by
  78   81   * importing, as described in Section 1.5 below.
  79   82   *
  80   83   * 1.4 Segments
  81   84   * ------------
  82   85   * Spans are subdivided into *segments*, each of which is either allocated
  83   86   * or free.  A segment, like a span, is a contiguous range of integers.
  84   87   * Each allocated segment [addr, addr + size) represents exactly one
  85   88   * vmem_alloc(size) that returned addr.  Free segments represent the space
  86   89   * between allocated segments.  If two free segments are adjacent, we
  87   90   * coalesce them into one larger segment; that is, if segments [a, b) and
  88   91   * [b, c) are both free, we merge them into a single segment [a, c).
  89   92   * The segments within a span are linked together in increasing-address order
  90   93   * so we can easily determine whether coalescing is possible.
  91   94   *
  92   95   * Segments never cross span boundaries.  When all segments within
  93   96   * an imported span become free, we return the span to its source.
  94   97   *
  95   98   * 1.5 Imported Memory
  96   99   * -------------------
  97  100   * As mentioned in the overview, some arenas are logical subsets of
  98  101   * other arenas.  For example, kmem_va_arena (a virtual address cache
  99  102   * that satisfies most kmem_slab_create() requests) is just a subset
 100  103   * of heap_arena (the kernel heap) that provides caching for the most
 101  104   * common slab sizes.  When kmem_va_arena runs out of virtual memory,
 102  105   * it *imports* more from the heap; we say that heap_arena is the
 103  106   * *vmem source* for kmem_va_arena.  vmem_create() allows you to
 104  107   * specify any existing vmem arena as the source for your new arena.
 105  108   * Topologically, since every arena is a child of at most one source,
 106  109   * the set of all arenas forms a collection of trees.
 107  110   *
 108  111   * 1.6 Constrained Allocations
 109  112   * ---------------------------
 110  113   * Some vmem clients are quite picky about the kind of address they want.
 111  114   * For example, the DVMA code may need an address that is at a particular
 112  115   * phase with respect to some alignment (to get good cache coloring), or
 113  116   * that lies within certain limits (the addressable range of a device),
 114  117   * or that doesn't cross some boundary (a DMA counter restriction) --
 115  118   * or all of the above.  vmem_xalloc() allows the client to specify any
 116  119   * or all of these constraints.
 117  120   *
 118  121   * 1.7 The Vmem Quantum
 119  122   * --------------------
 120  123   * Every arena has a notion of 'quantum', specified at vmem_create() time,
 121  124   * that defines the arena's minimum unit of currency.  Most commonly the
 122  125   * quantum is either 1 or PAGESIZE, but any power of 2 is legal.
 123  126   * All vmem allocations are guaranteed to be quantum-aligned.
 124  127   *
 125  128   * 1.8 Quantum Caching
 126  129   * -------------------
 127  130   * A vmem arena may be so hot (frequently used) that the scalability of vmem
 128  131   * allocation is a significant concern.  We address this by allowing the most
 129  132   * common allocation sizes to be serviced by the kernel memory allocator,
 130  133   * which provides low-latency per-cpu caching.  The qcache_max argument to
 131  134   * vmem_create() specifies the largest allocation size to cache.
 132  135   *
 133  136   * 1.9 Relationship to Kernel Memory Allocator
 134  137   * -------------------------------------------
 135  138   * Every kmem cache has a vmem arena as its slab supplier.  The kernel memory
 136  139   * allocator uses vmem_alloc() and vmem_free() to create and destroy slabs.
 137  140   *
 138  141   *
 139  142   * 2. Implementation
 140  143   * -----------------
 141  144   *
 142  145   * 2.1 Segment lists and markers
 143  146   * -----------------------------
 144  147   * The segment structure (vmem_seg_t) contains two doubly-linked lists.
 145  148   *
 146  149   * The arena list (vs_anext/vs_aprev) links all segments in the arena.
 147  150   * In addition to the allocated and free segments, the arena contains
 148  151   * special marker segments at span boundaries.  Span markers simplify
 149  152   * coalescing and importing logic by making it easy to tell both when
 150  153   * we're at a span boundary (so we don't coalesce across it), and when
 151  154   * a span is completely free (its neighbors will both be span markers).
 152  155   *
 153  156   * Imported spans will have vs_import set.
 154  157   *
 155  158   * The next-of-kin list (vs_knext/vs_kprev) links segments of the same type:
 156  159   * (1) for allocated segments, vs_knext is the hash chain linkage;
 157  160   * (2) for free segments, vs_knext is the freelist linkage;
 158  161   * (3) for span marker segments, vs_knext is the next span marker.
 159  162   *
 160  163   * 2.2 Allocation hashing
 161  164   * ----------------------
 162  165   * We maintain a hash table of all allocated segments, hashed by address.
 163  166   * This allows vmem_free() to discover the target segment in constant time.
 164  167   * vmem_update() periodically resizes hash tables to keep hash chains short.
 165  168   *
 166  169   * 2.3 Freelist management
 167  170   * -----------------------
 168  171   * We maintain power-of-2 freelists for free segments, i.e. free segments
 169  172   * of size >= 2^n reside in vmp->vm_freelist[n].  To ensure constant-time
 170  173   * allocation, vmem_xalloc() looks not in the first freelist that *might*
 171  174   * satisfy the allocation, but in the first freelist that *definitely*
 172  175   * satisfies the allocation (unless VM_BESTFIT is specified, or all larger
 173  176   * freelists are empty).  For example, a 1000-byte allocation will be
 174  177   * satisfied not from the 512..1023-byte freelist, whose members *might*
 175  178   * contain a 1000-byte segment, but from a 1024-byte or larger freelist,
 176  179   * the first member of which will *definitely* satisfy the allocation.
 177  180   * This ensures that vmem_xalloc() works in constant time.
 178  181   *
 179  182   * We maintain a bit map to determine quickly which freelists are non-empty.
 180  183   * vmp->vm_freemap & (1 << n) is non-zero iff vmp->vm_freelist[n] is non-empty.
 181  184   *
 182  185   * The different freelists are linked together into one large freelist,
 183  186   * with the freelist heads serving as markers.  Freelist markers simplify
 184  187   * the maintenance of vm_freemap by making it easy to tell when we're taking
 185  188   * the last member of a freelist (both of its neighbors will be markers).
 186  189   *
 187  190   * 2.4 Vmem Locking
 188  191   * ----------------
 189  192   * For simplicity, all arena state is protected by a per-arena lock.
 190  193   * For very hot arenas, use quantum caching for scalability.
 191  194   *
 192  195   * 2.5 Vmem Population
 193  196   * -------------------
 194  197   * Any internal vmem routine that might need to allocate new segment
 195  198   * structures must prepare in advance by calling vmem_populate(), which
 196  199   * will preallocate enough vmem_seg_t's to get us through the entire
 197  200   * operation without dropping the arena lock.
 198  201   *
 199  202   * 2.6 Auditing
 200  203   * ------------
 201  204   * If KMF_AUDIT is set in kmem_flags, we audit vmem allocations as well.
 202  205   * Since virtual addresses cannot be scribbled on, there is no equivalent
 203  206   * in vmem to redzone checking, deadbeef, or other kmem debugging features.
 204  207   * Moreover, we do not audit frees because segment coalescing destroys the
 205  208   * association between an address and its segment structure.  Auditing is
 206  209   * thus intended primarily to keep track of who's consuming the arena.
 207  210   * Debugging support could certainly be extended in the future if it proves
 208  211   * necessary, but we do so much live checking via the allocation hash table
 209  212   * that even non-DEBUG systems get quite a bit of sanity checking already.
 210  213   */
 211  214  
 212  215  #include <sys/vmem_impl.h>
 213  216  #include <sys/kmem.h>
 214  217  #include <sys/kstat.h>
 215  218  #include <sys/param.h>
 216  219  #include <sys/systm.h>
 217  220  #include <sys/atomic.h>
 218  221  #include <sys/bitmap.h>
 219  222  #include <sys/sysmacros.h>
 220  223  #include <sys/cmn_err.h>
 221  224  #include <sys/debug.h>
 222  225  #include <sys/panic.h>
 223  226  
 224  227  #define VMEM_INITIAL            10      /* early vmem arenas */
 225  228  #define VMEM_SEG_INITIAL        200     /* early segments */
 226  229  
 227  230  /*
 228  231   * Adding a new span to an arena requires two segment structures: one to
 229  232   * represent the span, and one to represent the free segment it contains.
 230  233   */
 231  234  #define VMEM_SEGS_PER_SPAN_CREATE       2
 232  235  
 233  236  /*
 234  237   * Allocating a piece of an existing segment requires 0-2 segment structures
 235  238   * depending on how much of the segment we're allocating.
 236  239   *
 237  240   * To allocate the entire segment, no new segment structures are needed; we
 238  241   * simply move the existing segment structure from the freelist to the
 239  242   * allocation hash table.
 240  243   *
 241  244   * To allocate a piece from the left or right end of the segment, we must
 242  245   * split the segment into two pieces (allocated part and remainder), so we
 243  246   * need one new segment structure to represent the remainder.
 244  247   *
 245  248   * To allocate from the middle of a segment, we need two new segment structures
 246  249   * to represent the remainders on either side of the allocated part.
 247  250   */
 248  251  #define VMEM_SEGS_PER_EXACT_ALLOC       0
 249  252  #define VMEM_SEGS_PER_LEFT_ALLOC        1
 250  253  #define VMEM_SEGS_PER_RIGHT_ALLOC       1
 251  254  #define VMEM_SEGS_PER_MIDDLE_ALLOC      2
 252  255  
 253  256  /*
 254  257   * vmem_populate() preallocates segment structures for vmem to do its work.
 255  258   * It must preallocate enough for the worst case, which is when we must import
 256  259   * a new span and then allocate from the middle of it.
 257  260   */
 258  261  #define VMEM_SEGS_PER_ALLOC_MAX         \
 259  262          (VMEM_SEGS_PER_SPAN_CREATE + VMEM_SEGS_PER_MIDDLE_ALLOC)
 260  263  
 261  264  /*
 262  265   * The segment structures themselves are allocated from vmem_seg_arena, so
 263  266   * we have a recursion problem when vmem_seg_arena needs to populate itself.
 264  267   * We address this by working out the maximum number of segment structures
 265  268   * this act will require, and multiplying by the maximum number of threads
 266  269   * that we'll allow to do it simultaneously.
 267  270   *
 268  271   * The worst-case segment consumption to populate vmem_seg_arena is as
 269  272   * follows (depicted as a stack trace to indicate why events are occurring):
 270  273   *
 271  274   * (In order to lower the fragmentation in the heap_arena, we specify a
 272  275   * minimum import size for the vmem_metadata_arena which is the same size
 273  276   * as the kmem_va quantum cache allocations.  This causes the worst-case
 274  277   * allocation from the vmem_metadata_arena to be 3 segments.)
 275  278   *
 276  279   * vmem_alloc(vmem_seg_arena)           -> 2 segs (span create + exact alloc)
 277  280   *  segkmem_alloc(vmem_metadata_arena)
 278  281   *   vmem_alloc(vmem_metadata_arena)    -> 3 segs (span create + left alloc)
 279  282   *    vmem_alloc(heap_arena)            -> 1 seg (left alloc)
 280  283   *   page_create()
 281  284   *   hat_memload()
 282  285   *    kmem_cache_alloc()
 283  286   *     kmem_slab_create()
 284  287   *      vmem_alloc(hat_memload_arena)   -> 2 segs (span create + exact alloc)
 285  288   *       segkmem_alloc(heap_arena)
 286  289   *        vmem_alloc(heap_arena)        -> 1 seg (left alloc)
 287  290   *        page_create()
 288  291   *        hat_memload()         -> (hat layer won't recurse further)
 289  292   *
 290  293   * The worst-case consumption for each arena is 3 segment structures.
 291  294   * Of course, a 3-seg reserve could easily be blown by multiple threads.
 292  295   * Therefore, we serialize all allocations from vmem_seg_arena (which is OK
 293  296   * because they're rare).  We cannot allow a non-blocking allocation to get
 294  297   * tied up behind a blocking allocation, however, so we use separate locks
 295  298   * for VM_SLEEP and VM_NOSLEEP allocations.  Similarly, VM_PUSHPAGE allocations
 296  299   * must not block behind ordinary VM_SLEEPs.  In addition, if the system is
 297  300   * panicking then we must keep enough resources for panic_thread to do its
 298  301   * work.  Thus we have at most four threads trying to allocate from
 299  302   * vmem_seg_arena, and each thread consumes at most three segment structures,
 300  303   * so we must maintain a 12-seg reserve.
 301  304   */
 302  305  #define VMEM_POPULATE_RESERVE   12
 303  306  
 304  307  /*
 305  308   * vmem_populate() ensures that each arena has VMEM_MINFREE seg structures
 306  309   * so that it can satisfy the worst-case allocation *and* participate in
 307  310   * worst-case allocation from vmem_seg_arena.
 308  311   */
 309  312  #define VMEM_MINFREE    (VMEM_POPULATE_RESERVE + VMEM_SEGS_PER_ALLOC_MAX)
 310  313  
 311  314  static vmem_t vmem0[VMEM_INITIAL];
 312  315  static vmem_t *vmem_populator[VMEM_INITIAL];
 313  316  static uint32_t vmem_id;
 314  317  static uint32_t vmem_populators;
 315  318  static vmem_seg_t vmem_seg0[VMEM_SEG_INITIAL];
 316  319  static vmem_seg_t *vmem_segfree;       /* global free seg list, linked by vs_knext */
 317  320  static kmutex_t vmem_list_lock;
 318  321  static kmutex_t vmem_segfree_lock;     /* protects vmem_segfree */
 319  322  static kmutex_t vmem_sleep_lock;       /* populate lock; ownership tested by vmem_is_populator() */
 320  323  static kmutex_t vmem_nosleep_lock;     /* populate lock chosen for VM_NOSLEEP callers */
 321  324  static kmutex_t vmem_pushpage_lock;    /* populate lock; ownership tested by vmem_is_populator() */
 322  325  static kmutex_t vmem_panic_lock;       /* populate lock chosen when curthread is panic_thread */
 323  326  static vmem_t *vmem_list;
 324  327  static vmem_t *vmem_metadata_arena;
 325  328  static vmem_t *vmem_seg_arena;
 326  329  static vmem_t *vmem_hash_arena;
 327  330  static vmem_t *vmem_vmem_arena;
 328  331  static long vmem_update_interval = 15;  /* vmem_update() every 15 seconds */
 329  332  uint32_t vmem_mtbf;             /* mean time between failures [default: off] */
 330  333  size_t vmem_seg_size = sizeof (vmem_seg_t);    /* full size => vmem_hash_insert() records audit data */
 331  334  
 332  335  static vmem_kstat_t vmem_kstat_template = {    /* names and data types of the vmem kstats */
 333  336          { "mem_inuse",          KSTAT_DATA_UINT64 },
 334  337          { "mem_import",         KSTAT_DATA_UINT64 },
 335  338          { "mem_total",          KSTAT_DATA_UINT64 },
 336  339          { "vmem_source",        KSTAT_DATA_UINT32 },
 337  340          { "alloc",              KSTAT_DATA_UINT64 },
 338  341          { "free",               KSTAT_DATA_UINT64 },
 339  342          { "wait",               KSTAT_DATA_UINT64 },
 340  343          { "fail",               KSTAT_DATA_UINT64 },
 341  344          { "lookup",             KSTAT_DATA_UINT64 },
 342  345          { "search",             KSTAT_DATA_UINT64 },
 343  346          { "populate_wait",      KSTAT_DATA_UINT64 },
 344  347          { "populate_fail",      KSTAT_DATA_UINT64 },
 345  348          { "contains",           KSTAT_DATA_UINT64 },
 346  349          { "contains_search",    KSTAT_DATA_UINT64 },
 347  350  };
 348  351  
 349  352  /*
 350  353   * Insert/delete from arena list (type 'a') or next-of-kin list (type 'k').
            *
            * 'type' is token-pasted into the link field names, so 'a' selects
            * vs_anext/vs_aprev and 'k' selects vs_knext/vs_kprev.
            *
            * VMEM_INSERT links vsp immediately after vprev.  VMEM_DELETE unlinks
            * vsp by joining its two neighbors; vsp's own link fields are left
            * untouched.  Both macros dereference the neighbor pointers without
            * checking them, so neither neighbor may be NULL.
            *
            * The arguments are evaluated more than once, so pass expressions
            * without side effects.  Each macro expands to a bare { } block (not
            * do { } while (0)) and declares locals named vprev/vnext inside it.
 351  354   */
 352  355  #define VMEM_INSERT(vprev, vsp, type)                                   \
 353  356  {                                                                       \
 354  357          vmem_seg_t *vnext = (vprev)->vs_##type##next;                   \
 355  358          (vsp)->vs_##type##next = (vnext);                               \
 356  359          (vsp)->vs_##type##prev = (vprev);                               \
 357  360          (vprev)->vs_##type##next = (vsp);                               \
 358  361          (vnext)->vs_##type##prev = (vsp);                               \
 359  362  }
 360  363  
 361  364  #define VMEM_DELETE(vsp, type)                                          \
 362  365  {                                                                       \
 363  366          vmem_seg_t *vprev = (vsp)->vs_##type##prev;                     \
 364  367          vmem_seg_t *vnext = (vsp)->vs_##type##next;                     \
 365  368          (vprev)->vs_##type##next = (vnext);                             \
 366  369          (vnext)->vs_##type##prev = (vprev);                             \
 367  370  }
 368  371  
 369  372  /*
 370  373   * Get a vmem_seg_t from the global segfree list.
            *
            * Pops the head of vmem_segfree, a singly-linked list threaded through
            * vs_knext, while holding vmem_segfree_lock.  Returns the segment, or
            * NULL if the global list is empty.
 371  374   */
 372  375  static vmem_seg_t *
 373  376  vmem_getseg_global(void)
 374  377  {
 375  378          vmem_seg_t *vsp;
 376  379  
 377  380          mutex_enter(&vmem_segfree_lock);
 378  381          if ((vsp = vmem_segfree) != NULL)
 379  382                  vmem_segfree = vsp->vs_knext;
 380  383          mutex_exit(&vmem_segfree_lock);
 381  384  
 382  385          return (vsp);
 383  386  }
 384  387  
 385  388  /*
 386  389   * Put a vmem_seg_t on the global segfree list.
            *
            * Pushes vsp onto the head of vmem_segfree while holding
            * vmem_segfree_lock.  Only vsp->vs_knext is written; the other
            * fields of vsp are left as they were.
 387  390   */
 388  391  static void
 389  392  vmem_putseg_global(vmem_seg_t *vsp)
 390  393  {
 391  394          mutex_enter(&vmem_segfree_lock);
 392  395          vsp->vs_knext = vmem_segfree;
 393  396          vmem_segfree = vsp;
 394  397          mutex_exit(&vmem_segfree_lock);
 395  398  }
 396  399  
 397  400  /*
 398  401   * Get a vmem_seg_t from vmp's segfree list.
            *
            * Pops the head of vmp->vm_segfree (linked through vs_knext) and
            * decrements vm_nsegfree.  The list must be non-empty: this is only
            * ASSERTed, so on a non-DEBUG kernel an empty list means a NULL
            * dereference here.
            *
            * NOTE(review): no lock is taken in this function; presumably the
            * caller already holds vmp->vm_lock -- confirm against callers.
 399  402   */
 400  403  static vmem_seg_t *
 401  404  vmem_getseg(vmem_t *vmp)
 402  405  {
 403  406          vmem_seg_t *vsp;
 404  407  
 405  408          ASSERT(vmp->vm_nsegfree > 0);
 406  409  
 407  410          vsp = vmp->vm_segfree;
 408  411          vmp->vm_segfree = vsp->vs_knext;
 409  412          vmp->vm_nsegfree--;
 410  413  
 411  414          return (vsp);
 412  415  }
 413  416  
 414  417  /*
 415  418   * Put a vmem_seg_t on vmp's segfree list.
            *
            * Pushes vsp onto the head of vmp->vm_segfree (linked through
            * vs_knext) and increments vm_nsegfree.
            *
            * NOTE(review): no lock is taken in this function; presumably the
            * caller already holds vmp->vm_lock -- confirm against callers.
 416  419   */
 417  420  static void
 418  421  vmem_putseg(vmem_t *vmp, vmem_seg_t *vsp)
 419  422  {
 420  423          vsp->vs_knext = vmp->vm_segfree;
 421  424          vmp->vm_segfree = vsp;
 422  425          vmp->vm_nsegfree++;
 423  426  }
 424  427  
 425  428  /*
 426  429   * Add vsp to the appropriate freelist.
            *
            * The freelist is chosen from the segment's size:
            * vm_freelist[highbit(VS_SIZE(vsp)) - 1].  vsp is marked VMEM_FREE,
            * the matching bit is set in vm_freemap, and vsp is linked on the
            * next-of-kin list immediately after the freelist head.  Finally
            * vm_cv is broadcast so that any waiters on it are woken.
            *
            * NOTE(review): the freemap bit is taken from VS_SIZE() of the
            * freelist head, which presumes each head was initialised so that its
            * size is the bit for that list -- that setup is not in this function.
            *
            * The ASSERT only checks that vsp is not the first entry of its hash
            * bucket; it does not walk the chain.
 427  430   */
 428  431  static void
 429  432  vmem_freelist_insert(vmem_t *vmp, vmem_seg_t *vsp)
 430  433  {
 431  434          vmem_seg_t *vprev;
 432  435  
 433  436          ASSERT(*VMEM_HASH(vmp, vsp->vs_start) != vsp);
 434  437  
 435  438          vprev = (vmem_seg_t *)&vmp->vm_freelist[highbit(VS_SIZE(vsp)) - 1];
 436  439          vsp->vs_type = VMEM_FREE;
 437  440          vmp->vm_freemap |= VS_SIZE(vprev);
 438  441          VMEM_INSERT(vprev, vsp, k);
 439  442  
 440  443          cv_broadcast(&vmp->vm_cv);
 441  444  }
 442  445  
 443  446  /*
 444  447   * Take vsp from the freelist.
            *
            * vsp must be of type VMEM_FREE (ASSERTed).  If both of its
            * next-of-kin neighbors have vs_start == 0 they are taken to be
            * freelist heads, which means vsp is the only member of its list; in
            * that case the list's bit in vm_freemap is cleared (by XOR, after
            * asserting that it is currently set).  vsp is then unlinked from the
            * next-of-kin list.  vs_type is not changed here.
 445  448   */
 446  449  static void
 447  450  vmem_freelist_delete(vmem_t *vmp, vmem_seg_t *vsp)
 448  451  {
 449  452          ASSERT(*VMEM_HASH(vmp, vsp->vs_start) != vsp);
 450  453          ASSERT(vsp->vs_type == VMEM_FREE);
 451  454  
 452  455          if (vsp->vs_knext->vs_start == 0 && vsp->vs_kprev->vs_start == 0) {
 453  456                  /*
 454  457                   * The segments on both sides of 'vsp' are freelist heads,
 455  458                   * so taking vsp leaves the freelist at vsp->vs_kprev empty.
 456  459                   */
 457  460                  ASSERT(vmp->vm_freemap & VS_SIZE(vsp->vs_kprev));
 458  461                  vmp->vm_freemap ^= VS_SIZE(vsp->vs_kprev);
 459  462          }
 460  463          VMEM_DELETE(vsp, k);
 461  464  }
 462  465  
 463  466  /*
 464  467   * Add vsp to the allocated-segment hash table and update kstats.
            *
            * vsp is marked VMEM_ALLOC and pushed onto the head of the hash chain
            * selected by VMEM_HASH(vmp, vsp->vs_start); the chain is singly
            * linked through vs_knext.
            *
            * When vmem_seg_size equals sizeof (vmem_seg_t), the caller's stack
            * (up to VMEM_STACK_DEPTH frames), the current thread and a timestamp
            * are recorded in the segment; otherwise only vs_depth is written,
            * and it is set to 0.
            *
            * Bumps the vk_alloc counter and adds the segment size to
            * vk_mem_inuse.
 465  468   */
 466  469  static void
 467  470  vmem_hash_insert(vmem_t *vmp, vmem_seg_t *vsp)
 468  471  {
 469  472          vmem_seg_t **bucket;
 470  473  
 471  474          vsp->vs_type = VMEM_ALLOC;
 472  475          bucket = VMEM_HASH(vmp, vsp->vs_start);
 473  476          vsp->vs_knext = *bucket;
 474  477          *bucket = vsp;
 475  478  
 476  479          if (vmem_seg_size == sizeof (vmem_seg_t)) {
 477  480                  vsp->vs_depth = (uint8_t)getpcstack(vsp->vs_stack,
 478  481                      VMEM_STACK_DEPTH);
 479  482                  vsp->vs_thread = curthread;
 480  483                  vsp->vs_timestamp = gethrtime();
 481  484          } else {
 482  485                  vsp->vs_depth = 0;
 483  486          }
 484  487  
 485  488          vmp->vm_kstat.vk_alloc.value.ui64++;
 486  489          vmp->vm_kstat.vk_mem_inuse.value.ui64 += VS_SIZE(vsp);
 487  490  }
 488  491  
 489  492  /*
 490  493   * Remove vsp from the allocated-segment hash table and update kstats.
            *
            * Walks the hash chain for addr looking for a segment whose vs_start
            * equals addr, and unlinks it.  Every non-matching entry visited is
            * counted in vk_lookup.
            *
            * Panics if no segment starts at addr ("bad free"), or if the segment
            * found does not have exactly the given size ("wrong size").
            *
            * On success bumps vk_free, subtracts size from vk_mem_inuse and
            * returns the unlinked segment; the segment's other fields are not
            * modified here.
 491  494   */
 492  495  static vmem_seg_t *
 493  496  vmem_hash_delete(vmem_t *vmp, uintptr_t addr, size_t size)
 494  497  {
 495  498          vmem_seg_t *vsp, **prev_vspp;
 496  499  
 497  500          prev_vspp = VMEM_HASH(vmp, addr);
 498  501          while ((vsp = *prev_vspp) != NULL) {
 499  502                  if (vsp->vs_start == addr) {
 500  503                          *prev_vspp = vsp->vs_knext;
 501  504                          break;
 502  505                  }
 503  506                  vmp->vm_kstat.vk_lookup.value.ui64++;
 504  507                  prev_vspp = &vsp->vs_knext;
 505  508          }
 506  509  
 507  510          if (vsp == NULL)
 508  511                  panic("vmem_hash_delete(%p, %lx, %lu): bad free",
 509  512                      (void *)vmp, addr, size);
 510  513          if (VS_SIZE(vsp) != size)
 511  514                  panic("vmem_hash_delete(%p, %lx, %lu): wrong size (expect %lu)",
 512  515                      (void *)vmp, addr, size, VS_SIZE(vsp));
 513  516  
 514  517          vmp->vm_kstat.vk_free.value.ui64++;
 515  518          vmp->vm_kstat.vk_mem_inuse.value.ui64 -= size;
 516  519  
 517  520          return (vsp);
 518  521  }
 519  522  
 520  523  /*
 521  524   * Create a segment spanning the range [start, end) and add it to the arena.
            *
            * The segment structure comes from vmp's segfree list via
            * vmem_getseg(), so one must already be available there.  vs_type and
            * vs_import are zeroed; the caller is expected to give the segment
            * its real type afterwards (for example through
            * vmem_freelist_insert() or vmem_hash_insert()).
            *
            * The new segment is linked on the arena list immediately after
            * vprev.  It is not placed on any next-of-kin list.  Returns the new
            * segment.
 522  525   */
 523  526  static vmem_seg_t *
 524  527  vmem_seg_create(vmem_t *vmp, vmem_seg_t *vprev, uintptr_t start, uintptr_t end)
 525  528  {
 526  529          vmem_seg_t *newseg = vmem_getseg(vmp);
 527  530  
 528  531          newseg->vs_start = start;
 529  532          newseg->vs_end = end;
 530  533          newseg->vs_type = 0;
 531  534          newseg->vs_import = 0;
 532  535  
 533  536          VMEM_INSERT(vprev, newseg, a);
 534  537  
 535  538          return (newseg);
 536  539  }
 537  540  
 538  541  /*
 539  542   * Remove segment vsp from the arena.
            *
            * Unlinks vsp from the arena list and returns the structure to vmp's
            * segfree list.  vsp must not be a VMEM_ROTOR segment (ASSERTed).
            * The next-of-kin linkage is not touched here, so the caller must
            * already have taken vsp off whatever next-of-kin list it was on.
 540  543   */
 541  544  static void
 542  545  vmem_seg_destroy(vmem_t *vmp, vmem_seg_t *vsp)
 543  546  {
 544  547          ASSERT(vsp->vs_type != VMEM_ROTOR);
 545  548          VMEM_DELETE(vsp, a);
 546  549  
 547  550          vmem_putseg(vmp, vsp);
 548  551  }
 549  552  
 550  553  /*
 551  554   * Add the span [vaddr, vaddr + size) to vmp and update kstats.
            *
            * The caller must hold vmp->vm_lock (ASSERTed).  Panics if either end
            * of the span is not aligned to the arena's quantum.
            *
            * Two segment structures are consumed: a VMEM_SPAN marker, appended
            * at the tail of the arena list (just before vm_seg0) and at the tail
            * of vm_seg0's next-of-kin list, and a free segment covering the
            * whole span, linked right after the marker and placed on a freelist.
            *
            * 'import' is stored in the marker's vs_import; when it is non-zero
            * the size is also added to vk_mem_import.  vk_mem_total always grows
            * by size.  Returns the new free segment, not the span marker.
 552  555   */
 553  556  static vmem_seg_t *
 554  557  vmem_span_create(vmem_t *vmp, void *vaddr, size_t size, uint8_t import)
 555  558  {
 556  559          vmem_seg_t *newseg, *span;
 557  560          uintptr_t start = (uintptr_t)vaddr;
 558  561          uintptr_t end = start + size;
 559  562  
 560  563          ASSERT(MUTEX_HELD(&vmp->vm_lock));
 561  564  
 562  565          if ((start | end) & (vmp->vm_quantum - 1))
 563  566                  panic("vmem_span_create(%p, %p, %lu): misaligned",
 564  567                      (void *)vmp, vaddr, size);
 565  568  
 566  569          span = vmem_seg_create(vmp, vmp->vm_seg0.vs_aprev, start, end);
 567  570          span->vs_type = VMEM_SPAN;
 568  571          span->vs_import = import;
 569  572          VMEM_INSERT(vmp->vm_seg0.vs_kprev, span, k);
 570  573  
 571  574          newseg = vmem_seg_create(vmp, span, start, end);
 572  575          vmem_freelist_insert(vmp, newseg);
 573  576  
 574  577          if (import)
 575  578                  vmp->vm_kstat.vk_mem_import.value.ui64 += size;
 576  579          vmp->vm_kstat.vk_mem_total.value.ui64 += size;
 577  580  
 578  581          return (newseg);
 579  582  }
 580  583  
 581  584  /*
 582  585   * Remove span vsp from vmp and update kstats.
            *
            * vsp is the segment that covers the span; the span marker is taken
            * to be its arena-list predecessor, which is ASSERTed to be of type
            * VMEM_SPAN.  The caller must hold vmp->vm_lock (ASSERTed).
            *
            * The size of vsp is subtracted from vk_mem_total, and also from
            * vk_mem_import when the marker has vs_import set.  The marker is
            * unlinked from the next-of-kin list, and both structures are removed
            * from the arena list and returned to vmp's segfree list.
            *
            * NOTE(review): vsp's own next-of-kin linkage is not touched here, so
            * presumably the caller has already taken it off the freelist or hash
            * chain -- confirm against callers.
 583  586   */
 584  587  static void
 585  588  vmem_span_destroy(vmem_t *vmp, vmem_seg_t *vsp)
 586  589  {
 587  590          vmem_seg_t *span = vsp->vs_aprev;
 588  591          size_t size = VS_SIZE(vsp);
 589  592  
 590  593          ASSERT(MUTEX_HELD(&vmp->vm_lock));
 591  594          ASSERT(span->vs_type == VMEM_SPAN);
 592  595  
 593  596          if (span->vs_import)
 594  597                  vmp->vm_kstat.vk_mem_import.value.ui64 -= size;
 595  598          vmp->vm_kstat.vk_mem_total.value.ui64 -= size;
 596  599  
 597  600          VMEM_DELETE(span, k);
 598  601  
 599  602          vmem_seg_destroy(vmp, vsp);
 600  603          vmem_seg_destroy(vmp, span);
 601  604  }
 602  605  
 603  606  /*
 604  607   * Allocate the subrange [addr, addr + size) from segment vsp.
 605  608   * If there are leftovers on either side, place them on the freelist.
 606  609   * Returns a pointer to the segment representing [addr, addr + size).
            *
            * vsp must be a free segment that contains the requested range, and
            * both vsp->vs_start and addr must be quantum-aligned (all ASSERTed).
            *
            * The space carved out of vsp is size rounded up to the arena quantum
            * (realsize), but the allocated segment itself records the unrounded
            * range [addr, addr + size).
            *
            * Fast path: when allocating from the very start of vsp and the
            * remainder has the same high bit as the original size, vsp stays on
            * its freelist with its start advanced past the allocation, and a new
            * segment is created in front of it for the allocated range.
            *
            * Otherwise vsp is removed from its freelist, new free segments are
            * created for any right-hand and left-hand remainders, and vsp itself
            * is shrunk to the allocated range.
            *
            * Either way the allocated segment is entered into the hash table.
            * Between zero and two segment structures are taken from vmp's
            * segfree list.
            *
            * NOTE(review): the "- 1" on both sides of the range ASSERTs
            * presumably keeps the comparison valid when an end address has
            * wrapped around to 0 -- confirm.
 606  609   */
 608  611  static vmem_seg_t *
 609  612  vmem_seg_alloc(vmem_t *vmp, vmem_seg_t *vsp, uintptr_t addr, size_t size)
 610  613  {
 611  614          uintptr_t vs_start = vsp->vs_start;
 612  615          uintptr_t vs_end = vsp->vs_end;
 613  616          size_t vs_size = vs_end - vs_start;
 614  617          size_t realsize = P2ROUNDUP(size, vmp->vm_quantum);
 615  618          uintptr_t addr_end = addr + realsize;
 616  619  
 617  620          ASSERT(P2PHASE(vs_start, vmp->vm_quantum) == 0);
 618  621          ASSERT(P2PHASE(addr, vmp->vm_quantum) == 0);
 619  622          ASSERT(vsp->vs_type == VMEM_FREE);
 620  623          ASSERT(addr >= vs_start && addr_end - 1 <= vs_end - 1);
 621  624          ASSERT(addr - 1 <= addr_end - 1);
 622  625  
 623  626          /*
 624  627           * If we're allocating from the start of the segment, and the
 625  628           * remainder will be on the same freelist, we can save quite
 626  629           * a bit of work.
 627  630           */
 628  631          if (P2SAMEHIGHBIT(vs_size, vs_size - realsize) && addr == vs_start) {
 629  632                  ASSERT(highbit(vs_size) == highbit(vs_size - realsize));
 630  633                  vsp->vs_start = addr_end;
 631  634                  vsp = vmem_seg_create(vmp, vsp->vs_aprev, addr, addr + size);
 632  635                  vmem_hash_insert(vmp, vsp);
 633  636                  return (vsp);
 634  637          }
 635  638  
 636  639          vmem_freelist_delete(vmp, vsp);
 637  640  
 638  641          if (vs_end != addr_end)
 639  642                  vmem_freelist_insert(vmp,
 640  643                      vmem_seg_create(vmp, vsp, addr_end, vs_end));
 641  644  
 642  645          if (vs_start != addr)
 643  646                  vmem_freelist_insert(vmp,
 644  647                      vmem_seg_create(vmp, vsp->vs_aprev, vs_start, addr));
 645  648  
 646  649          vsp->vs_start = addr;
 647  650          vsp->vs_end = addr + size;
 648  651  
 649  652          vmem_hash_insert(vmp, vsp);
 650  653          return (vsp);
 651  654  }
 652  655  
 653  656  /*
 654  657   * Returns 1 if we are populating, 0 otherwise.
 655  658   * Call it if we want to prevent recursion from HAT.
 656  659   */
 657  660  int
 658  661  vmem_is_populator()
 659  662  {
 660  663          return (mutex_owner(&vmem_sleep_lock) == curthread ||
 661  664              mutex_owner(&vmem_nosleep_lock) == curthread ||
 662  665              mutex_owner(&vmem_pushpage_lock) == curthread ||
 663  666              mutex_owner(&vmem_panic_lock) == curthread);
 664  667  }
 665  668  
 666  669  /*
 667  670   * Populate vmp's segfree list with VMEM_MINFREE vmem_seg_t structures.
 668  671   */
 669  672  static int
 670  673  vmem_populate(vmem_t *vmp, int vmflag)
 671  674  {
 672  675          char *p;
 673  676          vmem_seg_t *vsp;
 674  677          ssize_t nseg;
 675  678          size_t size;
 676  679          kmutex_t *lp;
 677  680          int i;
 678  681  
 679  682          while (vmp->vm_nsegfree < VMEM_MINFREE &&
 680  683              (vsp = vmem_getseg_global()) != NULL)
 681  684                  vmem_putseg(vmp, vsp);
 682  685  
 683  686          if (vmp->vm_nsegfree >= VMEM_MINFREE)
 684  687                  return (1);
 685  688  
 686  689          /*
 687  690           * If we're already populating, tap the reserve.
 688  691           */
 689  692          if (vmem_is_populator()) {
 690  693                  ASSERT(vmp->vm_cflags & VMC_POPULATOR);
 691  694                  return (1);
 692  695          }
 693  696  
 694  697          mutex_exit(&vmp->vm_lock);
 695  698  
 696  699          if (panic_thread == curthread)
 697  700                  lp = &vmem_panic_lock;
 698  701          else if (vmflag & VM_NOSLEEP)
 699  702                  lp = &vmem_nosleep_lock;
 700  703          else if (vmflag & VM_PUSHPAGE)
 701  704                  lp = &vmem_pushpage_lock;
 702  705          else
 703  706                  lp = &vmem_sleep_lock;
 704  707  
 705  708          mutex_enter(lp);
 706  709  
 707  710          nseg = VMEM_MINFREE + vmem_populators * VMEM_POPULATE_RESERVE;
 708  711          size = P2ROUNDUP(nseg * vmem_seg_size, vmem_seg_arena->vm_quantum);
 709  712          nseg = size / vmem_seg_size;
 710  713  
 711  714          /*
 712  715           * The following vmem_alloc() may need to populate vmem_seg_arena
 713  716           * and all the things it imports from.  When doing so, it will tap
 714  717           * each arena's reserve to prevent recursion (see the block comment
 715  718           * above the definition of VMEM_POPULATE_RESERVE).
 716  719           */
 717  720          p = vmem_alloc(vmem_seg_arena, size, vmflag & VM_KMFLAGS);
 718  721          if (p == NULL) {
 719  722                  mutex_exit(lp);
 720  723                  mutex_enter(&vmp->vm_lock);
 721  724                  vmp->vm_kstat.vk_populate_fail.value.ui64++;
 722  725                  return (0);
 723  726          }
 724  727  
 725  728          /*
 726  729           * Restock the arenas that may have been depleted during population.
 727  730           */
 728  731          for (i = 0; i < vmem_populators; i++) {
 729  732                  mutex_enter(&vmem_populator[i]->vm_lock);
 730  733                  while (vmem_populator[i]->vm_nsegfree < VMEM_POPULATE_RESERVE)
 731  734                          vmem_putseg(vmem_populator[i],
 732  735                              (vmem_seg_t *)(p + --nseg * vmem_seg_size));
 733  736                  mutex_exit(&vmem_populator[i]->vm_lock);
 734  737          }
 735  738  
 736  739          mutex_exit(lp);
 737  740          mutex_enter(&vmp->vm_lock);
 738  741  
 739  742          /*
 740  743           * Now take our own segments.
 741  744           */
 742  745          ASSERT(nseg >= VMEM_MINFREE);
 743  746          while (vmp->vm_nsegfree < VMEM_MINFREE)
 744  747                  vmem_putseg(vmp, (vmem_seg_t *)(p + --nseg * vmem_seg_size));
 745  748  
 746  749          /*
 747  750           * Give the remainder to charity.
 748  751           */
 749  752          while (nseg > 0)
 750  753                  vmem_putseg_global((vmem_seg_t *)(p + --nseg * vmem_seg_size));
 751  754  
 752  755          return (1);
 753  756  }
 754  757  
 755  758  /*
 756  759   * Advance a walker from its previous position to 'afterme'.
 757  760   * Note: may drop and reacquire vmp->vm_lock.
 758  761   */
 759  762  static void
 760  763  vmem_advance(vmem_t *vmp, vmem_seg_t *walker, vmem_seg_t *afterme)
 761  764  {
 762  765          vmem_seg_t *vprev = walker->vs_aprev;
 763  766          vmem_seg_t *vnext = walker->vs_anext;
 764  767          vmem_seg_t *vsp = NULL;
 765  768  
 766  769          VMEM_DELETE(walker, a);
 767  770  
 768  771          if (afterme != NULL)
 769  772                  VMEM_INSERT(afterme, walker, a);
 770  773  
 771  774          /*
 772  775           * The walker segment's presence may have prevented its neighbors
 773  776           * from coalescing.  If so, coalesce them now.
 774  777           */
 775  778          if (vprev->vs_type == VMEM_FREE) {
 776  779                  if (vnext->vs_type == VMEM_FREE) {
 777  780                          ASSERT(vprev->vs_end == vnext->vs_start);
 778  781                          vmem_freelist_delete(vmp, vnext);
 779  782                          vmem_freelist_delete(vmp, vprev);
 780  783                          vprev->vs_end = vnext->vs_end;
 781  784                          vmem_freelist_insert(vmp, vprev);
 782  785                          vmem_seg_destroy(vmp, vnext);
 783  786                  }
 784  787                  vsp = vprev;
 785  788          } else if (vnext->vs_type == VMEM_FREE) {
 786  789                  vsp = vnext;
 787  790          }
 788  791  
 789  792          /*
 790  793           * vsp could represent a complete imported span,
 791  794           * in which case we must return it to the source.
 792  795           */
 793  796          if (vsp != NULL && vsp->vs_aprev->vs_import &&
 794  797              vmp->vm_source_free != NULL &&
 795  798              vsp->vs_aprev->vs_type == VMEM_SPAN &&
 796  799              vsp->vs_anext->vs_type == VMEM_SPAN) {
 797  800                  void *vaddr = (void *)vsp->vs_start;
 798  801                  size_t size = VS_SIZE(vsp);
 799  802                  ASSERT(size == VS_SIZE(vsp->vs_aprev));
 800  803                  vmem_freelist_delete(vmp, vsp);
 801  804                  vmem_span_destroy(vmp, vsp);
 802  805                  mutex_exit(&vmp->vm_lock);
 803  806                  vmp->vm_source_free(vmp->vm_source, vaddr, size);
 804  807                  mutex_enter(&vmp->vm_lock);
 805  808          }
 806  809  }
 807  810  
 808  811  /*
 809  812   * VM_NEXTFIT allocations deliberately cycle through all virtual addresses
 810  813   * in an arena, so that we avoid reusing addresses for as long as possible.
 811  814   * This helps to catch used-after-freed bugs.  It's also the perfect policy
 812  815   * for allocating things like process IDs, where we want to cycle through
 813  816   * all values in order.
 814  817   */
 815  818  static void *
 816  819  vmem_nextfit_alloc(vmem_t *vmp, size_t size, int vmflag)
 817  820  {
 818  821          vmem_seg_t *vsp, *rotor;
 819  822          uintptr_t addr;
 820  823          size_t realsize = P2ROUNDUP(size, vmp->vm_quantum);
 821  824          size_t vs_size;
 822  825  
 823  826          mutex_enter(&vmp->vm_lock);
 824  827  
 825  828          if (vmp->vm_nsegfree < VMEM_MINFREE && !vmem_populate(vmp, vmflag)) {
 826  829                  mutex_exit(&vmp->vm_lock);
 827  830                  return (NULL);
 828  831          }
 829  832  
 830  833          /*
 831  834           * The common case is that the segment right after the rotor is free,
 832  835           * and large enough that extracting 'size' bytes won't change which
 833  836           * freelist it's on.  In this case we can avoid a *lot* of work.
 834  837           * Instead of the normal vmem_seg_alloc(), we just advance the start
 835  838           * address of the victim segment.  Instead of moving the rotor, we
 836  839           * create the new segment structure *behind the rotor*, which has
 837  840           * the same effect.  And finally, we know we don't have to coalesce
 838  841           * the rotor's neighbors because the new segment lies between them.
 839  842           */
 840  843          rotor = &vmp->vm_rotor;
 841  844          vsp = rotor->vs_anext;
 842  845          if (vsp->vs_type == VMEM_FREE && (vs_size = VS_SIZE(vsp)) > realsize &&
 843  846              P2SAMEHIGHBIT(vs_size, vs_size - realsize)) {
 844  847                  ASSERT(highbit(vs_size) == highbit(vs_size - realsize));
 845  848                  addr = vsp->vs_start;
 846  849                  vsp->vs_start = addr + realsize;
 847  850                  vmem_hash_insert(vmp,
 848  851                      vmem_seg_create(vmp, rotor->vs_aprev, addr, addr + size));
 849  852                  mutex_exit(&vmp->vm_lock);
 850  853                  return ((void *)addr);
 851  854          }
 852  855  
 853  856          /*
 854  857           * Starting at the rotor, look for a segment large enough to
 855  858           * satisfy the allocation.
 856  859           */
 857  860          for (;;) {
 858  861                  vmp->vm_kstat.vk_search.value.ui64++;
 859  862                  if (vsp->vs_type == VMEM_FREE && VS_SIZE(vsp) >= size)
 860  863                          break;
 861  864                  vsp = vsp->vs_anext;
 862  865                  if (vsp == rotor) {
 863  866                          /*
 864  867                           * We've come full circle.  One possibility is that the
 865  868                           * there's actually enough space, but the rotor itself
 866  869                           * is preventing the allocation from succeeding because
 867  870                           * it's sitting between two free segments.  Therefore,
 868  871                           * we advance the rotor and see if that liberates a
 869  872                           * suitable segment.
 870  873                           */
 871  874                          vmem_advance(vmp, rotor, rotor->vs_anext);
 872  875                          vsp = rotor->vs_aprev;
 873  876                          if (vsp->vs_type == VMEM_FREE && VS_SIZE(vsp) >= size)
 874  877                                  break;
 875  878                          /*
 876  879                           * If there's a lower arena we can import from, or it's
 877  880                           * a VM_NOSLEEP allocation, let vmem_xalloc() handle it.
 878  881                           * Otherwise, wait until another thread frees something.
 879  882                           */
 880  883                          if (vmp->vm_source_alloc != NULL ||
 881  884                              (vmflag & VM_NOSLEEP)) {
 882  885                                  mutex_exit(&vmp->vm_lock);
 883  886                                  return (vmem_xalloc(vmp, size, vmp->vm_quantum,
 884  887                                      0, 0, NULL, NULL, vmflag & VM_KMFLAGS));
 885  888                          }
 886  889                          vmp->vm_kstat.vk_wait.value.ui64++;
 887  890                          cv_wait(&vmp->vm_cv, &vmp->vm_lock);
 888  891                          vsp = rotor->vs_anext;
 889  892                  }
 890  893          }
 891  894  
 892  895          /*
 893  896           * We found a segment.  Extract enough space to satisfy the allocation.
 894  897           */
 895  898          addr = vsp->vs_start;
 896  899          vsp = vmem_seg_alloc(vmp, vsp, addr, size);
 897  900          ASSERT(vsp->vs_type == VMEM_ALLOC &&
 898  901              vsp->vs_start == addr && vsp->vs_end == addr + size);
 899  902  
 900  903          /*
 901  904           * Advance the rotor to right after the newly-allocated segment.
 902  905           * That's where the next VM_NEXTFIT allocation will begin searching.
 903  906           */
 904  907          vmem_advance(vmp, rotor, vsp);
 905  908          mutex_exit(&vmp->vm_lock);
 906  909          return ((void *)addr);
 907  910  }
 908  911  
 909  912  /*
 910  913   * Checks if vmp is guaranteed to have a size-byte buffer somewhere on its
 911  914   * freelist.  If size is not a power-of-2, it can return a false-negative.
 912  915   *
 913  916   * Used to decide if a newly imported span is superfluous after re-acquiring
 914  917   * the arena lock.
 915  918   */
 916  919  static int
 917  920  vmem_canalloc(vmem_t *vmp, size_t size)
 918  921  {
 919  922          int hb;
 920  923          int flist = 0;
 921  924          ASSERT(MUTEX_HELD(&vmp->vm_lock));
 922  925  
 923  926          if (ISP2(size))
 924  927                  flist = lowbit(P2ALIGN(vmp->vm_freemap, size));
 925  928          else if ((hb = highbit(size)) < VMEM_FREELISTS)
 926  929                  flist = lowbit(P2ALIGN(vmp->vm_freemap, 1UL << hb));
 927  930  
 928  931          return (flist);
 929  932  }
 930  933  
/*
 * Allocate size bytes at offset phase from an align boundary such that the
 * resulting segment [addr, addr + size) is a subset of [minaddr, maxaddr)
 * that does not straddle a nocross-aligned boundary.
 *
 * Parameters:
 *      vmp             arena to allocate from
 *      size            number of bytes wanted
 *      align_arg       required alignment; 0 selects the arena's quantum
 *      phase           offset from an align boundary; must be < align
 *      nocross         boundary the segment must not straddle; when it is
 *                      0 the overconstraint check below is skipped
 *      minaddr,
 *      maxaddr         address range the segment must lie within
 *      vmflag          VM_* flags; this routine itself tests VM_NOSLEEP,
 *                      VM_PANIC, VM_ABORT, VM_BESTFIT, VM_FIRSTFIT and
 *                      VM_ENDALLOC
 *
 * Returns the allocated address on success and NULL on failure.  With
 * VM_PANIC set, a failed allocation panics instead of returning NULL.
 * Invalid argument combinations (not quantum aligned, overconstrained,
 * phase >= align, align or nocross not a power of 2) always panic.
 *
 * Space is imported from the arena's source only when the arena has a
 * vm_source_alloc routine, nocross is 0 and both minaddr and maxaddr
 * are NULL.
 */
void *
vmem_xalloc(vmem_t *vmp, size_t size, size_t align_arg, size_t phase,
    size_t nocross, void *minaddr, void *maxaddr, int vmflag)
{
        vmem_seg_t *vsp;
        vmem_seg_t *vbest = NULL;
        uintptr_t addr, taddr, start, end;
        uintptr_t align = (align_arg != 0) ? align_arg : vmp->vm_quantum;
        void *vaddr, *xvaddr = NULL;
        size_t xsize;
        int hb, flist, resv;
        uint32_t mtbf;

        if ((align | phase | nocross) & (vmp->vm_quantum - 1))
                panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): "
                    "parameters not vm_quantum aligned",
                    (void *)vmp, size, align_arg, phase, nocross,
                    minaddr, maxaddr, vmflag);

        if (nocross != 0 &&
            (align > nocross || P2ROUNDUP(phase + size, align) > nocross))
                panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): "
                    "overconstrained allocation",
                    (void *)vmp, size, align_arg, phase, nocross,
                    minaddr, maxaddr, vmflag);

        if (phase >= align || !ISP2(align) || !ISP2(nocross))
                panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): "
                    "parameters inconsistent or invalid",
                    (void *)vmp, size, align_arg, phase, nocross,
                    minaddr, maxaddr, vmflag);

        /*
         * Failure injection: when an mtbf value is set (globally or on
         * this arena) and the current hrtime is a multiple of it, fail
         * the request -- but only if it is VM_NOSLEEP without VM_PANIC,
         * i.e. a request that is allowed to fail.
         */
        if ((mtbf = vmem_mtbf | vmp->vm_mtbf) != 0 && gethrtime() % mtbf == 0 &&
            (vmflag & (VM_NOSLEEP | VM_PANIC)) == VM_NOSLEEP)
                return (NULL);

        mutex_enter(&vmp->vm_lock);
        for (;;) {
                if (vmp->vm_nsegfree < VMEM_MINFREE &&
                    !vmem_populate(vmp, vmflag))
                        break;
do_alloc:
                /*
                 * highbit() returns the highest bit + 1, which is exactly
                 * what we want: we want to search the first freelist whose
                 * members are *definitely* large enough to satisfy our
                 * allocation.  However, there are certain cases in which we
                 * want to look at the next-smallest freelist (which *might*
                 * be able to satisfy the allocation):
                 *
                 * (1)  The size is exactly a power of 2, in which case
                 *      the smaller freelist is always big enough;
                 *
                 * (2)  All other freelists are empty;
                 *
                 * (3)  We're in the highest possible freelist, which is
                 *      always empty (e.g. the 4GB freelist on 32-bit systems);
                 *
                 * (4)  We're doing a best-fit or first-fit allocation.
                 */
                if (ISP2(size)) {
                        flist = lowbit(P2ALIGN(vmp->vm_freemap, size));
                } else {
                        hb = highbit(size);
                        if ((vmp->vm_freemap >> hb) == 0 ||
                            hb == VMEM_FREELISTS ||
                            (vmflag & (VM_BESTFIT | VM_FIRSTFIT)))
                                hb--;
                        flist = lowbit(P2ALIGN(vmp->vm_freemap, 1UL << hb));
                }

                /*
                 * Walk the chosen freelist (flist is 1-based; 0 means no
                 * candidate list) looking for a segment that can hold an
                 * allocation meeting every constraint.  vbest/addr track
                 * the best candidate found so far.
                 * NOTE(review): the vs_start == 0 test below appears to
                 * identify a freelist head entry rather than a real
                 * segment -- confirm against the freelist setup code.
                 */
                for (vbest = NULL, vsp = (flist == 0) ? NULL :
                    vmp->vm_freelist[flist - 1].vs_knext;
                    vsp != NULL; vsp = vsp->vs_knext) {
                        vmp->vm_kstat.vk_search.value.ui64++;
                        if (vsp->vs_start == 0) {
                                /*
                                 * We're moving up to a larger freelist,
                                 * so if we've already found a candidate,
                                 * the fit can't possibly get any better.
                                 */
                                if (vbest != NULL)
                                        break;
                                /*
                                 * Find the next non-empty freelist.
                                 */
                                flist = lowbit(P2ALIGN(vmp->vm_freemap,
                                    VS_SIZE(vsp)));
                                if (flist-- == 0)
                                        break;
                                vsp = (vmem_seg_t *)&vmp->vm_freelist[flist];
                                ASSERT(vsp->vs_knext->vs_type == VMEM_FREE);
                                continue;
                        }
                        /* Skip segments entirely outside [minaddr, maxaddr). */
                        if (vsp->vs_end - 1 < (uintptr_t)minaddr)
                                continue;
                        if (vsp->vs_start > (uintptr_t)maxaddr - 1)
                                continue;
                        /*
                         * Clip the segment to the permitted range, then
                         * compute the first address in it with the right
                         * alignment/phase, moving it forward if the
                         * allocation would straddle a nocross boundary.
                         */
                        start = MAX(vsp->vs_start, (uintptr_t)minaddr);
                        end = MIN(vsp->vs_end - 1, (uintptr_t)maxaddr - 1) + 1;
                        taddr = P2PHASEUP(start, align, phase);
                        if (P2BOUNDARY(taddr, size, nocross))
                                taddr +=
                                    P2ROUNDUP(P2NPHASE(taddr, nocross), align);
                        /*
                         * Reject the segment if the allocation does not
                         * fit, or if it is no smaller than the candidate
                         * we already have.
                         */
                        if ((taddr - start) + size > end - start ||
                            (vbest != NULL && VS_SIZE(vsp) >= VS_SIZE(vbest)))
                                continue;
                        vbest = vsp;
                        addr = taddr;
                        /*
                         * Only best-fit keeps looking, and even then an
                         * exact-size match ends the search.
                         */
                        if (!(vmflag & VM_BESTFIT) || VS_SIZE(vbest) == size)
                                break;
                }
                if (vbest != NULL)
                        break;
                ASSERT(xvaddr == NULL);
                if (size == 0)
                        panic("vmem_xalloc(): size == 0");
                if (vmp->vm_source_alloc != NULL && nocross == 0 &&
                    minaddr == NULL && maxaddr == NULL) {
                        size_t aneeded, asize;
                        size_t aquantum = MAX(vmp->vm_quantum,
                            vmp->vm_source->vm_quantum);
                        size_t aphase = phase;
                        if ((align > aquantum) &&
                            !(vmp->vm_cflags & VMC_XALIGN)) {
                                aphase = (P2PHASE(phase, aquantum) != 0) ?
                                    align - vmp->vm_quantum : align - aquantum;
                                ASSERT(aphase >= phase);
                        }
                        aneeded = MAX(size + aphase, vmp->vm_min_import);
                        asize = P2ROUNDUP(aneeded, aquantum);

                        if (asize < size) {
                                /*
                                 * The rounding induced overflow; return NULL
                                 * if we are permitted to fail the allocation
                                 * (and explicitly panic if we aren't).
                                 */
                                if ((vmflag & VM_NOSLEEP) &&
                                    !(vmflag & VM_PANIC)) {
                                        mutex_exit(&vmp->vm_lock);
                                        return (NULL);
                                }

                                panic("vmem_xalloc(): size overflow");
                        }

                        /*
                         * Determine how many segment structures we'll consume.
                         * The calculation must be precise because if we're
                         * here on behalf of vmem_populate(), we are taking
                         * segments from a very limited reserve.
                         */
                        if (size == asize && !(vmp->vm_cflags & VMC_XALLOC))
                                resv = VMEM_SEGS_PER_SPAN_CREATE +
                                    VMEM_SEGS_PER_EXACT_ALLOC;
                        else if (phase == 0 &&
                            align <= vmp->vm_source->vm_quantum)
                                resv = VMEM_SEGS_PER_SPAN_CREATE +
                                    VMEM_SEGS_PER_LEFT_ALLOC;
                        else
                                resv = VMEM_SEGS_PER_ALLOC_MAX;

                        /*
                         * Set the reserved segment structures aside before
                         * dropping the lock, call the import routine
                         * without the lock held, and put the reservation
                         * back once the lock is held again.
                         */
                        ASSERT(vmp->vm_nsegfree >= resv);
                        vmp->vm_nsegfree -= resv;       /* reserve our segs */
                        mutex_exit(&vmp->vm_lock);
                        if (vmp->vm_cflags & VMC_XALLOC) {
                                /*
                                 * VMC_XALLOC arenas use the extended import
                                 * signature, which takes the alignment and
                                 * may adjust asize through the pointer.
                                 */
                                size_t oasize = asize;
                                vaddr = ((vmem_ximport_t *)
                                    vmp->vm_source_alloc)(vmp->vm_source,
                                    &asize, align, vmflag & VM_KMFLAGS);
                                ASSERT(asize >= oasize);
                                ASSERT(P2PHASE(asize,
                                    vmp->vm_source->vm_quantum) == 0);
                                ASSERT(!(vmp->vm_cflags & VMC_XALIGN) ||
                                    IS_P2ALIGNED(vaddr, align));
                        } else {
                                vaddr = vmp->vm_source_alloc(vmp->vm_source,
                                    asize, vmflag & VM_KMFLAGS);
                        }
                        mutex_enter(&vmp->vm_lock);
                        vmp->vm_nsegfree += resv;       /* claim reservation */
                        aneeded = size + align - vmp->vm_quantum;
                        aneeded = P2ROUNDUP(aneeded, vmp->vm_quantum);
                        if (vaddr != NULL) {
                                /*
                                 * Since we dropped the vmem lock while
                                 * calling the import function, other
                                 * threads could have imported space
                                 * and made our import unnecessary.  In
                                 * order to save space, we return
                                 * excess imports immediately.
                                 */
                                if (asize > aneeded &&
                                    vmp->vm_source_free != NULL &&
                                    vmem_canalloc(vmp, aneeded)) {
                                        ASSERT(resv >=
                                            VMEM_SEGS_PER_MIDDLE_ALLOC);
                                        /*
                                         * Remember the unneeded import; it
                                         * is handed back to the source
                                         * after the allocation succeeds.
                                         */
                                        xvaddr = vaddr;
                                        xsize = asize;
                                        goto do_alloc;
                                }
                                vbest = vmem_span_create(vmp, vaddr, asize, 1);
                                addr = P2PHASEUP(vbest->vs_start, align, phase);
                                break;
                        } else if (vmem_canalloc(vmp, aneeded)) {
                                /*
                                 * Our import failed, but another thread
                                 * added sufficient free memory to the arena
                                 * to satisfy our request.  Go back and
                                 * grab it.
                                 */
                                ASSERT(resv >= VMEM_SEGS_PER_MIDDLE_ALLOC);
                                goto do_alloc;
                        }
                }

                /*
                 * If the requestor chooses to fail the allocation attempt
                 * rather than reap wait and retry - get out of the loop.
                 */
                if (vmflag & VM_ABORT)
                        break;
                /*
                 * Reap with the arena lock dropped.  VM_NOSLEEP callers
                 * give up right after the reap; everyone else waits on
                 * the arena's condition variable and then retries.
                 */
                mutex_exit(&vmp->vm_lock);
                if (vmp->vm_cflags & VMC_IDENTIFIER)
                        kmem_reap_idspace();
                else
                        kmem_reap();
                mutex_enter(&vmp->vm_lock);
                if (vmflag & VM_NOSLEEP)
                        break;
                vmp->vm_kstat.vk_wait.value.ui64++;
                cv_wait(&vmp->vm_cv, &vmp->vm_lock);
        }
        if (vbest != NULL) {
                ASSERT(vbest->vs_type == VMEM_FREE);
                ASSERT(vbest->vs_knext != vbest);
                /* re-position to end of buffer */
                if (vmflag & VM_ENDALLOC) {
                        addr += ((vbest->vs_end - (addr + size)) / align) *
                            align;
                }
                (void) vmem_seg_alloc(vmp, vbest, addr, size);
                mutex_exit(&vmp->vm_lock);
                /*
                 * Give back an import that turned out to be superfluous,
                 * now that the arena lock is no longer held.
                 */
                if (xvaddr)
                        vmp->vm_source_free(vmp->vm_source, xvaddr, xsize);
                ASSERT(P2PHASE(addr, align) == phase);
                ASSERT(!P2BOUNDARY(addr, size, nocross));
                ASSERT(addr >= (uintptr_t)minaddr);
                ASSERT(addr + size - 1 <= (uintptr_t)maxaddr - 1);
                return ((void *)addr);
        }
        /* Failure: count it, then either panic (VM_PANIC) or return NULL. */
        vmp->vm_kstat.vk_fail.value.ui64++;
        mutex_exit(&vmp->vm_lock);
        if (vmflag & VM_PANIC)
                panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): "
                    "cannot satisfy mandatory allocation",
                    (void *)vmp, size, align_arg, phase, nocross,
                    minaddr, maxaddr, vmflag);
        ASSERT(xvaddr == NULL);
        return (NULL);
}
1198 1201  
1199 1202  /*
1200 1203   * Free the segment [vaddr, vaddr + size), where vaddr was a constrained
1201 1204   * allocation.  vmem_xalloc() and vmem_xfree() must always be paired because
1202 1205   * both routines bypass the quantum caches.
1203 1206   */
1204 1207  void
1205 1208  vmem_xfree(vmem_t *vmp, void *vaddr, size_t size)
1206 1209  {
1207 1210          vmem_seg_t *vsp, *vnext, *vprev;
1208 1211  
1209 1212          mutex_enter(&vmp->vm_lock);
1210 1213  
1211 1214          vsp = vmem_hash_delete(vmp, (uintptr_t)vaddr, size);
1212 1215          vsp->vs_end = P2ROUNDUP(vsp->vs_end, vmp->vm_quantum);
1213 1216  
1214 1217          /*
1215 1218           * Attempt to coalesce with the next segment.
1216 1219           */
1217 1220          vnext = vsp->vs_anext;
1218 1221          if (vnext->vs_type == VMEM_FREE) {
1219 1222                  ASSERT(vsp->vs_end == vnext->vs_start);
1220 1223                  vmem_freelist_delete(vmp, vnext);
1221 1224                  vsp->vs_end = vnext->vs_end;
1222 1225                  vmem_seg_destroy(vmp, vnext);
1223 1226          }
1224 1227  
1225 1228          /*
1226 1229           * Attempt to coalesce with the previous segment.
1227 1230           */
1228 1231          vprev = vsp->vs_aprev;
1229 1232          if (vprev->vs_type == VMEM_FREE) {
1230 1233                  ASSERT(vprev->vs_end == vsp->vs_start);
1231 1234                  vmem_freelist_delete(vmp, vprev);
1232 1235                  vprev->vs_end = vsp->vs_end;
1233 1236                  vmem_seg_destroy(vmp, vsp);
1234 1237                  vsp = vprev;
1235 1238          }
1236 1239  
1237 1240          /*
1238 1241           * If the entire span is free, return it to the source.
1239 1242           */
1240 1243          if (vsp->vs_aprev->vs_import && vmp->vm_source_free != NULL &&
1241 1244              vsp->vs_aprev->vs_type == VMEM_SPAN &&
1242 1245              vsp->vs_anext->vs_type == VMEM_SPAN) {
1243 1246                  vaddr = (void *)vsp->vs_start;
1244 1247                  size = VS_SIZE(vsp);
1245 1248                  ASSERT(size == VS_SIZE(vsp->vs_aprev));
1246 1249                  vmem_span_destroy(vmp, vsp);
1247 1250                  mutex_exit(&vmp->vm_lock);
1248 1251                  vmp->vm_source_free(vmp->vm_source, vaddr, size);
1249 1252          } else {
1250 1253                  vmem_freelist_insert(vmp, vsp);
1251 1254                  mutex_exit(&vmp->vm_lock);
1252 1255          }
1253 1256  }
1254 1257  
1255 1258  /*
1256 1259   * Allocate size bytes from arena vmp.  Returns the allocated address
1257 1260   * on success, NULL on failure.  vmflag specifies VM_SLEEP or VM_NOSLEEP,
1258 1261   * and may also specify best-fit, first-fit, or next-fit allocation policy
1259 1262   * instead of the default instant-fit policy.  VM_SLEEP allocations are
1260 1263   * guaranteed to succeed.
1261 1264   */
1262 1265  void *
1263 1266  vmem_alloc(vmem_t *vmp, size_t size, int vmflag)
1264 1267  {
1265 1268          vmem_seg_t *vsp;
1266 1269          uintptr_t addr;
1267 1270          int hb;
1268 1271          int flist = 0;
1269 1272          uint32_t mtbf;
1270 1273  
1271 1274          if (size - 1 < vmp->vm_qcache_max)
1272 1275                  return (kmem_cache_alloc(vmp->vm_qcache[(size - 1) >>
1273 1276                      vmp->vm_qshift], vmflag & VM_KMFLAGS));
1274 1277  
1275 1278          if ((mtbf = vmem_mtbf | vmp->vm_mtbf) != 0 && gethrtime() % mtbf == 0 &&
1276 1279              (vmflag & (VM_NOSLEEP | VM_PANIC)) == VM_NOSLEEP)
1277 1280                  return (NULL);
1278 1281  
1279 1282          if (vmflag & VM_NEXTFIT)
1280 1283                  return (vmem_nextfit_alloc(vmp, size, vmflag));
1281 1284  
1282 1285          if (vmflag & (VM_BESTFIT | VM_FIRSTFIT))
1283 1286                  return (vmem_xalloc(vmp, size, vmp->vm_quantum, 0, 0,
1284 1287                      NULL, NULL, vmflag));
1285 1288  
1286 1289          /*
1287 1290           * Unconstrained instant-fit allocation from the segment list.
1288 1291           */
1289 1292          mutex_enter(&vmp->vm_lock);
1290 1293  
1291 1294          if (vmp->vm_nsegfree >= VMEM_MINFREE || vmem_populate(vmp, vmflag)) {
1292 1295                  if (ISP2(size))
1293 1296                          flist = lowbit(P2ALIGN(vmp->vm_freemap, size));
1294 1297                  else if ((hb = highbit(size)) < VMEM_FREELISTS)
1295 1298                          flist = lowbit(P2ALIGN(vmp->vm_freemap, 1UL << hb));
1296 1299          }
1297 1300  
1298 1301          if (flist-- == 0) {
1299 1302                  mutex_exit(&vmp->vm_lock);
1300 1303                  return (vmem_xalloc(vmp, size, vmp->vm_quantum,
1301 1304                      0, 0, NULL, NULL, vmflag));
1302 1305          }
1303 1306  
1304 1307          ASSERT(size <= (1UL << flist));
1305 1308          vsp = vmp->vm_freelist[flist].vs_knext;
1306 1309          addr = vsp->vs_start;
1307 1310          if (vmflag & VM_ENDALLOC) {
1308 1311                  addr += vsp->vs_end - (addr + size);
1309 1312          }
1310 1313          (void) vmem_seg_alloc(vmp, vsp, addr, size);
1311 1314          mutex_exit(&vmp->vm_lock);
1312 1315          return ((void *)addr);
1313 1316  }
1314 1317  
1315 1318  /*
1316 1319   * Free the segment [vaddr, vaddr + size).
1317 1320   */
1318 1321  void
1319 1322  vmem_free(vmem_t *vmp, void *vaddr, size_t size)
1320 1323  {
1321 1324          if (size - 1 < vmp->vm_qcache_max)
1322 1325                  kmem_cache_free(vmp->vm_qcache[(size - 1) >> vmp->vm_qshift],
1323 1326                      vaddr);
1324 1327          else
1325 1328                  vmem_xfree(vmp, vaddr, size);
1326 1329  }
1327 1330  
1328 1331  /*
1329 1332   * Determine whether arena vmp contains the segment [vaddr, vaddr + size).
1330 1333   */
1331 1334  int
1332 1335  vmem_contains(vmem_t *vmp, void *vaddr, size_t size)
1333 1336  {
1334 1337          uintptr_t start = (uintptr_t)vaddr;
1335 1338          uintptr_t end = start + size;
1336 1339          vmem_seg_t *vsp;
1337 1340          vmem_seg_t *seg0 = &vmp->vm_seg0;
1338 1341  
1339 1342          mutex_enter(&vmp->vm_lock);
1340 1343          vmp->vm_kstat.vk_contains.value.ui64++;
1341 1344          for (vsp = seg0->vs_knext; vsp != seg0; vsp = vsp->vs_knext) {
1342 1345                  vmp->vm_kstat.vk_contains_search.value.ui64++;
1343 1346                  ASSERT(vsp->vs_type == VMEM_SPAN);
1344 1347                  if (start >= vsp->vs_start && end - 1 <= vsp->vs_end - 1)
1345 1348                          break;
1346 1349          }
1347 1350          mutex_exit(&vmp->vm_lock);
1348 1351          return (vsp != seg0);
1349 1352  }
1350 1353  
1351 1354  /*
1352 1355   * Add the span [vaddr, vaddr + size) to arena vmp.
1353 1356   */
1354 1357  void *
1355 1358  vmem_add(vmem_t *vmp, void *vaddr, size_t size, int vmflag)
1356 1359  {
1357 1360          if (vaddr == NULL || size == 0)
1358 1361                  panic("vmem_add(%p, %p, %lu): bad arguments",
1359 1362                      (void *)vmp, vaddr, size);
1360 1363  
1361 1364          ASSERT(!vmem_contains(vmp, vaddr, size));
1362 1365  
1363 1366          mutex_enter(&vmp->vm_lock);
1364 1367          if (vmem_populate(vmp, vmflag))
1365 1368                  (void) vmem_span_create(vmp, vaddr, size, 0);
1366 1369          else
1367 1370                  vaddr = NULL;
1368 1371          mutex_exit(&vmp->vm_lock);
1369 1372          return (vaddr);
1370 1373  }
1371 1374  
1372 1375  /*
1373 1376   * Walk the vmp arena, applying func to each segment matching typemask.
1374 1377   * If VMEM_REENTRANT is specified, the arena lock is dropped across each
1375 1378   * call to func(); otherwise, it is held for the duration of vmem_walk()
1376 1379   * to ensure a consistent snapshot.  Note that VMEM_REENTRANT callbacks
1377 1380   * are *not* necessarily consistent, so they may only be used when a hint
1378 1381   * is adequate.
1379 1382   */
1380 1383  void
1381 1384  vmem_walk(vmem_t *vmp, int typemask,
1382 1385      void (*func)(void *, void *, size_t), void *arg)
1383 1386  {
1384 1387          vmem_seg_t *vsp;
1385 1388          vmem_seg_t *seg0 = &vmp->vm_seg0;
1386 1389          vmem_seg_t walker;
1387 1390  
1388 1391          if (typemask & VMEM_WALKER)
1389 1392                  return;
1390 1393  
1391 1394          bzero(&walker, sizeof (walker));
1392 1395          walker.vs_type = VMEM_WALKER;
1393 1396  
1394 1397          mutex_enter(&vmp->vm_lock);
1395 1398          VMEM_INSERT(seg0, &walker, a);
1396 1399          for (vsp = seg0->vs_anext; vsp != seg0; vsp = vsp->vs_anext) {
1397 1400                  if (vsp->vs_type & typemask) {
1398 1401                          void *start = (void *)vsp->vs_start;
1399 1402                          size_t size = VS_SIZE(vsp);
1400 1403                          if (typemask & VMEM_REENTRANT) {
1401 1404                                  vmem_advance(vmp, &walker, vsp);
1402 1405                                  mutex_exit(&vmp->vm_lock);
1403 1406                                  func(arg, start, size);
1404 1407                                  mutex_enter(&vmp->vm_lock);
1405 1408                                  vsp = &walker;
1406 1409                          } else {
1407 1410                                  func(arg, start, size);
1408 1411                          }
1409 1412                  }
1410 1413          }
1411 1414          vmem_advance(vmp, &walker, NULL);
1412 1415          mutex_exit(&vmp->vm_lock);
1413 1416  }
1414 1417  
1415 1418  /*
1416 1419   * Return the total amount of memory whose type matches typemask.  Thus:
1417 1420   *
1418 1421   *      typemask VMEM_ALLOC yields total memory allocated (in use).
1419 1422   *      typemask VMEM_FREE yields total memory free (available).
1420 1423   *      typemask (VMEM_ALLOC | VMEM_FREE) yields total arena size.
1421 1424   */
1422 1425  size_t
1423 1426  vmem_size(vmem_t *vmp, int typemask)
1424 1427  {
1425 1428          uint64_t size = 0;
1426 1429  
1427 1430          if (typemask & VMEM_ALLOC)
1428 1431                  size += vmp->vm_kstat.vk_mem_inuse.value.ui64;
1429 1432          if (typemask & VMEM_FREE)
1430 1433                  size += vmp->vm_kstat.vk_mem_total.value.ui64 -
1431 1434                      vmp->vm_kstat.vk_mem_inuse.value.ui64;
1432 1435          return ((size_t)size);
1433 1436  }
1434 1437  
1435 1438  /*
1436 1439   * Create an arena called name whose initial span is [base, base + size).
1437 1440   * The arena's natural unit of currency is quantum, so vmem_alloc()
1438 1441   * guarantees quantum-aligned results.  The arena may import new spans
1439 1442   * by invoking afunc() on source, and may return those spans by invoking
1440 1443   * ffunc() on source.  To make small allocations fast and scalable,
1441 1444   * the arena offers high-performance caching for each integer multiple
1442 1445   * of quantum up to qcache_max.
1443 1446   */
1444 1447  static vmem_t *
1445 1448  vmem_create_common(const char *name, void *base, size_t size, size_t quantum,
1446 1449      void *(*afunc)(vmem_t *, size_t, int),
1447 1450      void (*ffunc)(vmem_t *, void *, size_t),
1448 1451      vmem_t *source, size_t qcache_max, int vmflag)
1449 1452  {
1450 1453          int i;
1451 1454          size_t nqcache;
1452 1455          vmem_t *vmp, *cur, **vmpp;
1453 1456          vmem_seg_t *vsp;
1454 1457          vmem_freelist_t *vfp;
1455 1458          uint32_t id = atomic_inc_32_nv(&vmem_id);
1456 1459  
1457 1460          if (vmem_vmem_arena != NULL) {
1458 1461                  vmp = vmem_alloc(vmem_vmem_arena, sizeof (vmem_t),
1459 1462                      vmflag & VM_KMFLAGS);
1460 1463          } else {
1461 1464                  ASSERT(id <= VMEM_INITIAL);
1462 1465                  vmp = &vmem0[id - 1];
1463 1466          }
1464 1467  
1465 1468          /* An identifier arena must inherit from another identifier arena */
1466 1469          ASSERT(source == NULL || ((source->vm_cflags & VMC_IDENTIFIER) ==
1467 1470              (vmflag & VMC_IDENTIFIER)));
1468 1471  
1469 1472          if (vmp == NULL)
1470 1473                  return (NULL);
1471 1474          bzero(vmp, sizeof (vmem_t));
1472 1475  
1473 1476          (void) snprintf(vmp->vm_name, VMEM_NAMELEN, "%s", name);
1474 1477          mutex_init(&vmp->vm_lock, NULL, MUTEX_DEFAULT, NULL);
1475 1478          cv_init(&vmp->vm_cv, NULL, CV_DEFAULT, NULL);
1476 1479          vmp->vm_cflags = vmflag;
1477 1480          vmflag &= VM_KMFLAGS;
1478 1481  
1479 1482          vmp->vm_quantum = quantum;
1480 1483          vmp->vm_qshift = highbit(quantum) - 1;
1481 1484          nqcache = MIN(qcache_max >> vmp->vm_qshift, VMEM_NQCACHE_MAX);
1482 1485  
1483 1486          for (i = 0; i <= VMEM_FREELISTS; i++) {
1484 1487                  vfp = &vmp->vm_freelist[i];
1485 1488                  vfp->vs_end = 1UL << i;
1486 1489                  vfp->vs_knext = (vmem_seg_t *)(vfp + 1);
1487 1490                  vfp->vs_kprev = (vmem_seg_t *)(vfp - 1);
1488 1491          }
1489 1492  
1490 1493          vmp->vm_freelist[0].vs_kprev = NULL;
1491 1494          vmp->vm_freelist[VMEM_FREELISTS].vs_knext = NULL;
1492 1495          vmp->vm_freelist[VMEM_FREELISTS].vs_end = 0;
1493 1496          vmp->vm_hash_table = vmp->vm_hash0;
1494 1497          vmp->vm_hash_mask = VMEM_HASH_INITIAL - 1;
1495 1498          vmp->vm_hash_shift = highbit(vmp->vm_hash_mask);
1496 1499  
1497 1500          vsp = &vmp->vm_seg0;
1498 1501          vsp->vs_anext = vsp;
1499 1502          vsp->vs_aprev = vsp;
1500 1503          vsp->vs_knext = vsp;
1501 1504          vsp->vs_kprev = vsp;
1502 1505          vsp->vs_type = VMEM_SPAN;
1503 1506  
1504 1507          vsp = &vmp->vm_rotor;
1505 1508          vsp->vs_type = VMEM_ROTOR;
1506 1509          VMEM_INSERT(&vmp->vm_seg0, vsp, a);
1507 1510  
1508 1511          bcopy(&vmem_kstat_template, &vmp->vm_kstat, sizeof (vmem_kstat_t));
1509 1512  
1510 1513          vmp->vm_id = id;
1511 1514          if (source != NULL)
1512 1515                  vmp->vm_kstat.vk_source_id.value.ui32 = source->vm_id;
1513 1516          vmp->vm_source = source;
1514 1517          vmp->vm_source_alloc = afunc;
1515 1518          vmp->vm_source_free = ffunc;
1516 1519  
1517 1520          /*
1518 1521           * Some arenas (like vmem_metadata and kmem_metadata) cannot
1519 1522           * use quantum caching to lower fragmentation.  Instead, we
1520 1523           * increase their imports, giving a similar effect.
1521 1524           */
1522 1525          if (vmp->vm_cflags & VMC_NO_QCACHE) {
1523 1526                  vmp->vm_min_import =
1524 1527                      VMEM_QCACHE_SLABSIZE(nqcache << vmp->vm_qshift);
1525 1528                  nqcache = 0;
1526 1529          }
1527 1530  
1528 1531          if (nqcache != 0) {
1529 1532                  ASSERT(!(vmflag & VM_NOSLEEP));
1530 1533                  vmp->vm_qcache_max = nqcache << vmp->vm_qshift;
1531 1534                  for (i = 0; i < nqcache; i++) {
1532 1535                          char buf[VMEM_NAMELEN + 21];
1533 1536                          (void) sprintf(buf, "%s_%lu", vmp->vm_name,
1534 1537                              (i + 1) * quantum);
1535 1538                          vmp->vm_qcache[i] = kmem_cache_create(buf,
1536 1539                              (i + 1) * quantum, quantum, NULL, NULL, NULL,
1537 1540                              NULL, vmp, KMC_QCACHE | KMC_NOTOUCH);
1538 1541                  }
1539 1542          }
1540 1543  
1541 1544          if ((vmp->vm_ksp = kstat_create("vmem", vmp->vm_id, vmp->vm_name,
1542 1545              "vmem", KSTAT_TYPE_NAMED, sizeof (vmem_kstat_t) /
1543 1546              sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL)) != NULL) {
1544 1547                  vmp->vm_ksp->ks_data = &vmp->vm_kstat;
1545 1548                  kstat_install(vmp->vm_ksp);
1546 1549          }
1547 1550  
1548 1551          mutex_enter(&vmem_list_lock);
1549 1552          vmpp = &vmem_list;
1550 1553          while ((cur = *vmpp) != NULL)
1551 1554                  vmpp = &cur->vm_next;
1552 1555          *vmpp = vmp;
1553 1556          mutex_exit(&vmem_list_lock);
1554 1557  
1555 1558          if (vmp->vm_cflags & VMC_POPULATOR) {
1556 1559                  ASSERT(vmem_populators < VMEM_INITIAL);
1557 1560                  vmem_populator[atomic_inc_32_nv(&vmem_populators) - 1] = vmp;
1558 1561                  mutex_enter(&vmp->vm_lock);
1559 1562                  (void) vmem_populate(vmp, vmflag | VM_PANIC);
1560 1563                  mutex_exit(&vmp->vm_lock);
1561 1564          }
1562 1565  
1563 1566          if ((base || size) && vmem_add(vmp, base, size, vmflag) == NULL) {
1564 1567                  vmem_destroy(vmp);
1565 1568                  return (NULL);
1566 1569          }
1567 1570  
1568 1571          return (vmp);
1569 1572  }
1570 1573  
1571 1574  vmem_t *
1572 1575  vmem_xcreate(const char *name, void *base, size_t size, size_t quantum,
1573 1576      vmem_ximport_t *afunc, vmem_free_t *ffunc, vmem_t *source,
1574 1577      size_t qcache_max, int vmflag)
1575 1578  {
1576 1579          ASSERT(!(vmflag & (VMC_POPULATOR | VMC_XALLOC)));
1577 1580          vmflag &= ~(VMC_POPULATOR | VMC_XALLOC);
1578 1581  
1579 1582          return (vmem_create_common(name, base, size, quantum,
1580 1583              (vmem_alloc_t *)afunc, ffunc, source, qcache_max,
1581 1584              vmflag | VMC_XALLOC));
1582 1585  }
1583 1586  
1584 1587  vmem_t *
1585 1588  vmem_create(const char *name, void *base, size_t size, size_t quantum,
1586 1589      vmem_alloc_t *afunc, vmem_free_t *ffunc, vmem_t *source,
1587 1590      size_t qcache_max, int vmflag)
1588 1591  {
1589 1592          ASSERT(!(vmflag & (VMC_XALLOC | VMC_XALIGN)));
1590 1593          vmflag &= ~(VMC_XALLOC | VMC_XALIGN);
1591 1594  
1592 1595          return (vmem_create_common(name, base, size, quantum,
1593 1596              afunc, ffunc, source, qcache_max, vmflag));
1594 1597  }
1595 1598  
1596 1599  /*
1597 1600   * Destroy arena vmp.
1598 1601   */
1599 1602  void
1600 1603  vmem_destroy(vmem_t *vmp)
1601 1604  {
1602 1605          vmem_t *cur, **vmpp;
1603 1606          vmem_seg_t *seg0 = &vmp->vm_seg0;
1604 1607          vmem_seg_t *vsp, *anext;
1605 1608          size_t leaked;
1606 1609          int i;
1607 1610  
1608 1611          mutex_enter(&vmem_list_lock);
1609 1612          vmpp = &vmem_list;
1610 1613          while ((cur = *vmpp) != vmp)
1611 1614                  vmpp = &cur->vm_next;
1612 1615          *vmpp = vmp->vm_next;
1613 1616          mutex_exit(&vmem_list_lock);
1614 1617  
1615 1618          for (i = 0; i < VMEM_NQCACHE_MAX; i++)
1616 1619                  if (vmp->vm_qcache[i])
1617 1620                          kmem_cache_destroy(vmp->vm_qcache[i]);
1618 1621  
1619 1622          leaked = vmem_size(vmp, VMEM_ALLOC);
1620 1623          if (leaked != 0)
1621 1624                  cmn_err(CE_WARN, "vmem_destroy('%s'): leaked %lu %s",
1622 1625                      vmp->vm_name, leaked, (vmp->vm_cflags & VMC_IDENTIFIER) ?
1623 1626                      "identifiers" : "bytes");
1624 1627  
1625 1628          if (vmp->vm_hash_table != vmp->vm_hash0)
1626 1629                  vmem_free(vmem_hash_arena, vmp->vm_hash_table,
1627 1630                      (vmp->vm_hash_mask + 1) * sizeof (void *));
1628 1631  
1629 1632          /*
1630 1633           * Give back the segment structures for anything that's left in the
1631 1634           * arena, e.g. the primary spans and their free segments.
1632 1635           */
1633 1636          VMEM_DELETE(&vmp->vm_rotor, a);
1634 1637          for (vsp = seg0->vs_anext; vsp != seg0; vsp = anext) {
1635 1638                  anext = vsp->vs_anext;
1636 1639                  vmem_putseg_global(vsp);
1637 1640          }
1638 1641  
1639 1642          while (vmp->vm_nsegfree > 0)
1640 1643                  vmem_putseg_global(vmem_getseg(vmp));
1641 1644  
1642 1645          kstat_delete(vmp->vm_ksp);
1643 1646  
1644 1647          mutex_destroy(&vmp->vm_lock);
1645 1648          cv_destroy(&vmp->vm_cv);
1646 1649          vmem_free(vmem_vmem_arena, vmp, sizeof (vmem_t));
1647 1650  }
1648 1651  
/*
 * Shrink threshold for the vmem hash table: the table is only shrunk
 * when it is at least 1 << vmem_rescale_minshift times (8x by default)
 * larger than necessary.
 */
int vmem_rescale_minshift = 3;
1654 1657  
1655 1658  /*
1656 1659   * Resize vmp's hash table to keep the average lookup depth near 1.0.
1657 1660   */
1658 1661  static void
1659 1662  vmem_hash_rescale(vmem_t *vmp)
1660 1663  {
1661 1664          vmem_seg_t **old_table, **new_table, *vsp;
1662 1665          size_t old_size, new_size, h, nseg;
1663 1666  
1664 1667          nseg = (size_t)(vmp->vm_kstat.vk_alloc.value.ui64 -
1665 1668              vmp->vm_kstat.vk_free.value.ui64);
1666 1669  
1667 1670          new_size = MAX(VMEM_HASH_INITIAL, 1 << (highbit(3 * nseg + 4) - 2));
1668 1671          old_size = vmp->vm_hash_mask + 1;
1669 1672  
1670 1673          if ((old_size >> vmem_rescale_minshift) <= new_size &&
1671 1674              new_size <= (old_size << 1))
1672 1675                  return;
1673 1676  
1674 1677          new_table = vmem_alloc(vmem_hash_arena, new_size * sizeof (void *),
1675 1678              VM_NOSLEEP);
1676 1679          if (new_table == NULL)
1677 1680                  return;
1678 1681          bzero(new_table, new_size * sizeof (void *));
1679 1682  
1680 1683          mutex_enter(&vmp->vm_lock);
1681 1684  
1682 1685          old_size = vmp->vm_hash_mask + 1;
1683 1686          old_table = vmp->vm_hash_table;
1684 1687  
1685 1688          vmp->vm_hash_mask = new_size - 1;
1686 1689          vmp->vm_hash_table = new_table;
1687 1690          vmp->vm_hash_shift = highbit(vmp->vm_hash_mask);
1688 1691  
1689 1692          for (h = 0; h < old_size; h++) {
1690 1693                  vsp = old_table[h];
1691 1694                  while (vsp != NULL) {
1692 1695                          uintptr_t addr = vsp->vs_start;
1693 1696                          vmem_seg_t *next_vsp = vsp->vs_knext;
1694 1697                          vmem_seg_t **hash_bucket = VMEM_HASH(vmp, addr);
1695 1698                          vsp->vs_knext = *hash_bucket;
1696 1699                          *hash_bucket = vsp;
1697 1700                          vsp = next_vsp;
1698 1701                  }
1699 1702          }
1700 1703  
1701 1704          mutex_exit(&vmp->vm_lock);
1702 1705  
1703 1706          if (old_table != vmp->vm_hash0)
1704 1707                  vmem_free(vmem_hash_arena, old_table,
1705 1708                      old_size * sizeof (void *));
1706 1709  }
1707 1710  
1708 1711  /*
1709 1712   * Perform periodic maintenance on all vmem arenas.
1710 1713   */
1711 1714  void
1712 1715  vmem_update(void *dummy)
1713 1716  {
1714 1717          vmem_t *vmp;
1715 1718  
1716 1719          mutex_enter(&vmem_list_lock);
1717 1720          for (vmp = vmem_list; vmp != NULL; vmp = vmp->vm_next) {
1718 1721                  /*
1719 1722                   * If threads are waiting for resources, wake them up
1720 1723                   * periodically so they can issue another kmem_reap()
1721 1724                   * to reclaim resources cached by the slab allocator.
1722 1725                   */
1723 1726                  cv_broadcast(&vmp->vm_cv);
1724 1727  
1725 1728                  /*
1726 1729                   * Rescale the hash table to keep the hash chains short.
1727 1730                   */
1728 1731                  vmem_hash_rescale(vmp);
1729 1732          }
1730 1733          mutex_exit(&vmem_list_lock);
1731 1734  
1732 1735          (void) timeout(vmem_update, dummy, vmem_update_interval * hz);
1733 1736  }
1734 1737  
1735 1738  void
1736 1739  vmem_qcache_reap(vmem_t *vmp)
1737 1740  {
1738 1741          int i;
1739 1742  
1740 1743          /*
1741 1744           * Reap any quantum caches that may be part of this vmem.
1742 1745           */
1743 1746          for (i = 0; i < VMEM_NQCACHE_MAX; i++)
1744 1747                  if (vmp->vm_qcache[i])
1745 1748                          kmem_cache_reap_now(vmp->vm_qcache[i]);
1746 1749  }
1747 1750  
1748 1751  /*
1749 1752   * Prepare vmem for use.
1750 1753   */
1751 1754  vmem_t *
1752 1755  vmem_init(const char *heap_name,
1753 1756      void *heap_start, size_t heap_size, size_t heap_quantum,
1754 1757      void *(*heap_alloc)(vmem_t *, size_t, int),
1755 1758      void (*heap_free)(vmem_t *, void *, size_t))
1756 1759  {
1757 1760          uint32_t id;
1758 1761          int nseg = VMEM_SEG_INITIAL;
1759 1762          vmem_t *heap;
1760 1763  
1761 1764          while (--nseg >= 0)
1762 1765                  vmem_putseg_global(&vmem_seg0[nseg]);
1763 1766  
1764 1767          heap = vmem_create(heap_name,
1765 1768              heap_start, heap_size, heap_quantum,
1766 1769              NULL, NULL, NULL, 0,
1767 1770              VM_SLEEP | VMC_POPULATOR);
1768 1771  
1769 1772          vmem_metadata_arena = vmem_create("vmem_metadata",
1770 1773              NULL, 0, heap_quantum,
1771 1774              vmem_alloc, vmem_free, heap, 8 * heap_quantum,
1772 1775              VM_SLEEP | VMC_POPULATOR | VMC_NO_QCACHE);
1773 1776  
1774 1777          vmem_seg_arena = vmem_create("vmem_seg",
1775 1778              NULL, 0, heap_quantum,
1776 1779              heap_alloc, heap_free, vmem_metadata_arena, 0,
1777 1780              VM_SLEEP | VMC_POPULATOR);
1778 1781  
1779 1782          vmem_hash_arena = vmem_create("vmem_hash",
1780 1783              NULL, 0, 8,
1781 1784              heap_alloc, heap_free, vmem_metadata_arena, 0,
1782 1785              VM_SLEEP);
1783 1786  
1784 1787          vmem_vmem_arena = vmem_create("vmem_vmem",
1785 1788              vmem0, sizeof (vmem0), 1,
1786 1789              heap_alloc, heap_free, vmem_metadata_arena, 0,
1787 1790              VM_SLEEP);
1788 1791  
1789 1792          for (id = 0; id < vmem_id; id++)
1790 1793                  (void) vmem_xalloc(vmem_vmem_arena, sizeof (vmem_t),
1791 1794                      1, 0, 0, &vmem0[id], &vmem0[id + 1],
1792 1795                      VM_NOSLEEP | VM_BESTFIT | VM_PANIC);
1793 1796  
1794 1797          return (heap);
1795 1798  }

↓ open down ↓

1742 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX