illumos-gate New usr/src/uts/common/sys/kmem

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2018 Joyent, Inc.
  25  */
  26 
  27 #ifndef _SYS_KMEM_IMPL_H
  28 #define _SYS_KMEM_IMPL_H
  29 
  30 #include <sys/kmem.h>
  31 #include <sys/vmem.h>
  32 #include <sys/thread.h>
  33 #include <sys/t_lock.h>
  34 #include <sys/time.h>
  35 #include <sys/kstat.h>
  36 #include <sys/cpuvar.h>
  37 #include <sys/systm.h>
  38 #include <vm/page.h>
  39 #include <sys/avl.h>
  40 #include <sys/list.h>
  41 
  42 #ifdef  __cplusplus
  43 extern "C" {
  44 #endif
  45 
  46 /*
  47  * kernel memory allocator: implementation-private data structures
  48  *
  49  * Lock order:
  50  * 1. cache_lock
  51  * 2. cc_lock in order by CPU ID
  52  * 3. cache_depot_lock
  53  *
  54  * Do not call kmem_cache_alloc() or taskq_dispatch() while holding any of the
  55  * above locks.
  56  */
  57 
  58 #define KMF_AUDIT       0x00000001      /* transaction auditing */
  59 #define KMF_DEADBEEF    0x00000002      /* deadbeef checking */
  60 #define KMF_REDZONE     0x00000004      /* redzone checking */
  61 #define KMF_CONTENTS    0x00000008      /* freed-buffer content logging */
  62 #define KMF_STICKY      0x00000010      /* if set, override /etc/system */
  63 #define KMF_NOMAGAZINE  0x00000020      /* disable per-cpu magazines */
  64 #define KMF_FIREWALL    0x00000040      /* put all bufs before unmapped pages */
  65 #define KMF_LITE        0x00000100      /* lightweight debugging */
  66 
  67 #define KMF_HASH        0x00000200      /* cache has hash table */
  68 #define KMF_RANDOMIZE   0x00000400      /* randomize other kmem_flags */
  69 
  70 #define KMF_DUMPDIVERT  0x00001000      /* use alternate memory at dump time */
  71 #define KMF_DUMPUNSAFE  0x00002000      /* flag caches used at dump time */
  72 #define KMF_PREFILL     0x00004000      /* Prefill the slab when created. */
  73 
  74 #define KMF_BUFTAG      (KMF_DEADBEEF | KMF_REDZONE)
  75 #define KMF_TOUCH       (KMF_BUFTAG | KMF_LITE | KMF_CONTENTS)
  76 #define KMF_RANDOM      (KMF_TOUCH | KMF_AUDIT | KMF_NOMAGAZINE)
  77 #define KMF_DEBUG       (KMF_RANDOM | KMF_FIREWALL)
  78 
  79 #define KMEM_STACK_DEPTH        15
  80 
  81 #define KMEM_FREE_PATTERN               0xdeadbeefdeadbeefULL
  82 #define KMEM_UNINITIALIZED_PATTERN      0xbaddcafebaddcafeULL
  83 #define KMEM_REDZONE_PATTERN            0xfeedfacefeedfaceULL
  84 #define KMEM_REDZONE_BYTE               0xbb
  85 
  86 /*
  87  * Redzone size encodings for kmem_alloc() / kmem_free().  We encode the
  88  * allocation size, rather than storing it directly, so that kmem_free()
  89  * can distinguish frees of the wrong size from redzone violations.
  90  *
  91  * A size of zero is never valid.
  92  */
  93 #define KMEM_SIZE_ENCODE(x)     (251 * (x) + 1)
  94 #define KMEM_SIZE_DECODE(x)     ((x) / 251)
  95 #define KMEM_SIZE_VALID(x)      ((x) % 251 == 1 && (x) != 1)
  96 
  97 
  98 #define KMEM_ALIGN              8       /* min guaranteed alignment */
  99 #define KMEM_ALIGN_SHIFT        3       /* log2(KMEM_ALIGN) */
 100 #define KMEM_VOID_FRACTION      8       /* never waste more than 1/8 of slab */
 101 
 102 #define KMEM_SLAB_IS_PARTIAL(sp)                \
 103         ((sp)->slab_refcnt > 0 && (sp)->slab_refcnt < (sp)->slab_chunks)
 104 #define KMEM_SLAB_IS_ALL_USED(sp)               \
 105         ((sp)->slab_refcnt == (sp)->slab_chunks)
 106 
 107 /*
 108  * The bufctl (buffer control) structure keeps some minimal information
 109  * about each buffer: its address, its slab, and its current linkage,
 110  * which is either on the slab's freelist (if the buffer is free), or
 111  * on the cache's buf-to-bufctl hash table (if the buffer is allocated).
 112  * In the case of non-hashed, or "raw", caches (the common case), only
 113  * the freelist linkage is necessary: the buffer address is at a fixed
 114  * offset from the bufctl address, and the slab is at the end of the page.
 115  *
 116  * NOTE: bc_next must be the first field; raw buffers have linkage only.
 117  */
 118 typedef struct kmem_bufctl {
 119         struct kmem_bufctl      *bc_next;       /* next bufctl struct */
 120         void                    *bc_addr;       /* address of buffer */
 121         struct kmem_slab        *bc_slab;       /* controlling slab */
 122 } kmem_bufctl_t;
 123 
 124 /*
 125  * The KMF_AUDIT version of the bufctl structure.  The beginning of this
 126  * structure must be identical to the normal bufctl structure so that
 127  * pointers are interchangeable.
 128  */
 129 typedef struct kmem_bufctl_audit {
 130         struct kmem_bufctl      *bc_next;       /* next bufctl struct */
 131         void                    *bc_addr;       /* address of buffer */
 132         struct kmem_slab        *bc_slab;       /* controlling slab */
 133         kmem_cache_t            *bc_cache;      /* controlling cache */
 134         hrtime_t                bc_timestamp;   /* transaction time */
 135         kthread_t               *bc_thread;     /* thread doing transaction */
 136         struct kmem_bufctl      *bc_lastlog;    /* last log entry */
 137         void                    *bc_contents;   /* contents at last free */
 138         int                     bc_depth;       /* stack depth */
 139         pc_t                    bc_stack[KMEM_STACK_DEPTH];     /* pc stack */
 140 } kmem_bufctl_audit_t;
 141 
  142 /*
  143  * A kmem_buftag structure is appended to each buffer whenever any of the
  144  * KMF_BUFTAG flags (KMF_DEADBEEF, KMF_REDZONE) are set.
  145  */
 146 typedef struct kmem_buftag {
 147         uint64_t                bt_redzone;     /* 64-bit redzone pattern */
 148         kmem_bufctl_t           *bt_bufctl;     /* bufctl */
 149         intptr_t                bt_bxstat;      /* bufctl ^ (alloc/free) */
 150 } kmem_buftag_t;
 151 
 152 /*
 153  * A variant of the kmem_buftag structure used for KMF_LITE caches.
 154  * Previous callers are stored in reverse chronological order. (i.e. most
 155  * recent first)
 156  */
 157 typedef struct kmem_buftag_lite {
 158         kmem_buftag_t           bt_buftag;      /* a normal buftag */
 159         pc_t                    bt_history[1];  /* zero or more callers */
 160 } kmem_buftag_lite_t;
 161 
 162 #define KMEM_BUFTAG_LITE_SIZE(f)        \
 163         (offsetof(kmem_buftag_lite_t, bt_history[f]))
 164 
 165 #define KMEM_BUFTAG(cp, buf)            \
 166         ((kmem_buftag_t *)((char *)(buf) + (cp)->cache_buftag))
 167 
 168 #define KMEM_BUFCTL(cp, buf)            \
 169         ((kmem_bufctl_t *)((char *)(buf) + (cp)->cache_bufctl))
 170 
 171 #define KMEM_BUF(cp, bcp)               \
 172         ((void *)((char *)(bcp) - (cp)->cache_bufctl))
 173 
 174 #define KMEM_SLAB(cp, buf)              \
 175         ((kmem_slab_t *)P2END((uintptr_t)(buf), (cp)->cache_slabsize) - 1)
 176 
 177 /*
 178  * Test for using alternate memory at dump time.
 179  */
 180 #define KMEM_DUMP(cp)           ((cp)->cache_flags & KMF_DUMPDIVERT)
 181 #define KMEM_DUMPCC(ccp)        ((ccp)->cc_flags & KMF_DUMPDIVERT)
 182 
 183 /*
 184  * The "CPU" macro loads a cpu_t that refers to the cpu that the current
 185  * thread is running on at the time the macro is executed.  A context switch
 186  * may occur immediately after loading this data structure, leaving this
 187  * thread pointing at the cpu_t for the previous cpu.  This is not a problem;
 188  * we'd just end up checking the previous cpu's per-cpu cache, and then check
 189  * the other layers of the kmem cache if need be.
 190  *
 191  * It's not even a problem if the old cpu gets DR'ed out during the context
 192  * switch.  The cpu-remove DR operation bzero()s the cpu_t, but doesn't free
 193  * it.  So the cpu_t's cpu_cache_offset would read as 0, causing us to use
 194  * cpu 0's per-cpu cache.
 195  *
 196  * So, there is no need to disable kernel preemption while using the CPU macro
 197  * below since if we have been context switched, there will not be any
 198  * correctness problem, just a momentary use of a different per-cpu cache.
 199  */
 200 
 201 #define KMEM_CPU_CACHE(cp)                                              \
 202         ((kmem_cpu_cache_t *)((char *)(&cp->cache_cpu) + CPU->cpu_cache_offset))
 203 
 204 #define KMEM_MAGAZINE_VALID(cp, mp)     \
 205         (((kmem_slab_t *)P2END((uintptr_t)(mp), PAGESIZE) - 1)->slab_cache == \
 206             (cp)->cache_magtype->mt_cache)
 207 
 208 #define KMEM_SLAB_OFFSET(sp, buf)       \
 209         ((size_t)((uintptr_t)(buf) - (uintptr_t)((sp)->slab_base)))
 210 
 211 #define KMEM_SLAB_MEMBER(sp, buf)       \
 212         (KMEM_SLAB_OFFSET(sp, buf) < (sp)->slab_cache->cache_slabsize)
 213 
 214 #define KMEM_BUFTAG_ALLOC       0xa110c8edUL
 215 #define KMEM_BUFTAG_FREE        0xf4eef4eeUL
 216 
 217 /* slab_later_count thresholds */
 218 #define KMEM_DISBELIEF          3
 219 
 220 /* slab_flags */
 221 #define KMEM_SLAB_NOMOVE        0x1
 222 #define KMEM_SLAB_MOVE_PENDING  0x2
 223 
 224 typedef struct kmem_slab {
 225         struct kmem_cache       *slab_cache;    /* controlling cache */
 226         void                    *slab_base;     /* base of allocated memory */
 227         avl_node_t              slab_link;      /* slab linkage */
 228         struct kmem_bufctl      *slab_head;     /* first free buffer */
 229         long                    slab_refcnt;    /* outstanding allocations */
 230         long                    slab_chunks;    /* chunks (bufs) in this slab */
 231         uint32_t                slab_stuck_offset; /* unmoved buffer offset */
 232         uint16_t                slab_later_count; /* cf KMEM_CBRC_LATER */
 233         uint16_t                slab_flags;     /* bits to mark the slab */
 234 } kmem_slab_t;
 235 
 236 #define KMEM_HASH_INITIAL       64
 237 
 238 #define KMEM_HASH(cp, buf)      \
 239         ((cp)->cache_hash_table +    \
 240         (((uintptr_t)(buf) >> (cp)->cache_hash_shift) & (cp)->cache_hash_mask))
 241 
 242 typedef struct kmem_magazine {
 243         void    *mag_next;
 244         void    *mag_round[1];          /* one or more rounds */
 245 } kmem_magazine_t;
 246 
 247 /*
 248  * The magazine types for fast per-cpu allocation
 249  */
 250 typedef struct kmem_magtype {
 251         short           mt_magsize;     /* magazine size (number of rounds) */
 252         int             mt_align;       /* magazine alignment */
 253         size_t          mt_minbuf;      /* all smaller buffers qualify */
 254         size_t          mt_maxbuf;      /* no larger buffers qualify */
 255         kmem_cache_t    *mt_cache;      /* magazine cache */
 256 } kmem_magtype_t;
 257 
 258 #define KMEM_CPU_CACHE_SIZE     64      /* must be power of 2 */
 259 #define KMEM_CPU_PAD            (KMEM_CPU_CACHE_SIZE - sizeof (kmutex_t) - \
 260         2 * sizeof (uint64_t) - 2 * sizeof (void *) - sizeof (int) - \
 261         5 * sizeof (short))
 262 #define KMEM_CACHE_SIZE(ncpus)  \
 263         ((size_t)(&((kmem_cache_t *)0)->cache_cpu[ncpus]))
 264 
 265 /* Offset from kmem_cache->cache_cpu for per cpu caches */
 266 #define KMEM_CPU_CACHE_OFFSET(cpuid)                                    \
 267         ((size_t)(&((kmem_cache_t *)0)->cache_cpu[cpuid]) -              \
 268         (size_t)(&((kmem_cache_t *)0)->cache_cpu))
 269 
 270 typedef struct kmem_cpu_cache {
 271         kmutex_t        cc_lock;        /* protects this cpu's local cache */
 272         uint64_t        cc_alloc;       /* allocations from this cpu */
 273         uint64_t        cc_free;        /* frees to this cpu */
 274         kmem_magazine_t *cc_loaded;     /* the currently loaded magazine */
 275         kmem_magazine_t *cc_ploaded;    /* the previously loaded magazine */
 276         int             cc_flags;       /* CPU-local copy of cache_flags */
 277         short           cc_rounds;      /* number of objects in loaded mag */
 278         short           cc_prounds;     /* number of objects in previous mag */
 279         short           cc_magsize;     /* number of rounds in a full mag */
 280         short           cc_dump_rounds; /* dump time copy of cc_rounds */
 281         short           cc_dump_prounds; /* dump time copy of cc_prounds */
 282         char            cc_pad[KMEM_CPU_PAD]; /* for nice alignment */
 283 } kmem_cpu_cache_t;
 284 
 285 /*
 286  * The magazine lists used in the depot.
 287  */
 288 typedef struct kmem_maglist {
 289         kmem_magazine_t *ml_list;       /* magazine list */
 290         long            ml_total;       /* number of magazines */
 291         long            ml_min;         /* min since last update */
 292         long            ml_reaplimit;   /* max reapable magazines */
 293         uint64_t        ml_alloc;       /* allocations from this list */
 294 } kmem_maglist_t;
 295 
 296 typedef struct kmem_defrag {
 297         /*
 298          * Statistics
 299          */
 300         uint64_t        kmd_callbacks;          /* move callbacks */
 301         uint64_t        kmd_yes;                /* KMEM_CBRC_YES responses */
 302         uint64_t        kmd_no;                 /* NO responses */
 303         uint64_t        kmd_later;              /* LATER responses */
 304         uint64_t        kmd_dont_need;          /* DONT_NEED responses */
 305         uint64_t        kmd_dont_know;          /* DONT_KNOW responses */
 306         uint64_t        kmd_slabs_freed;        /* slabs freed by moves */
 307         uint64_t        kmd_defrags;            /* kmem_cache_defrag() */
 308         uint64_t        kmd_scans;              /* kmem_cache_scan() */
 309 
 310         /*
 311          * Consolidator fields
 312          */
 313         avl_tree_t      kmd_moves_pending;      /* buffer moves pending */
 314         list_t          kmd_deadlist;           /* deferred slab frees */
 315         size_t          kmd_deadcount;          /* # of slabs in kmd_deadlist */
 316         uint8_t         kmd_reclaim_numer;      /* slab usage threshold */
 317         uint8_t         kmd_pad1;               /* compiler padding */
 318         uint16_t        kmd_consolidate;        /* triggers consolidator */
 319         uint32_t        kmd_pad2;               /* compiler padding */
 320         size_t          kmd_slabs_sought;       /* reclaimable slabs sought */
 321         size_t          kmd_slabs_found;        /* reclaimable slabs found */
 322         size_t          kmd_tries;              /* nth scan interval counter */
 323         /*
 324          * Fields used to ASSERT that the client does not kmem_cache_free()
 325          * objects passed to the move callback.
 326          */
 327         void            *kmd_from_buf;          /* object to move */
 328         void            *kmd_to_buf;            /* move destination */
 329         kthread_t       *kmd_thread;            /* thread calling move */
 330 } kmem_defrag_t;
 331 
 332 typedef struct kmem_dump {
 333         void            *kd_freelist;           /* heap during crash dump */
 334         uint_t          kd_alloc_fails;         /* # of allocation failures */
 335         uint_t          kd_unsafe;              /* cache was used, but unsafe */
 336 } kmem_dump_t;
 337 
 338 #define KMEM_CACHE_NAMELEN      31
 339 
  340 struct kmem_cache {
  341         /*
  342          * Statistics
  343          */
  344         uint64_t        cache_slab_create;      /* slab creates */
  345         uint64_t        cache_slab_destroy;     /* slab destroys */
  346         uint64_t        cache_slab_alloc;       /* slab layer allocations */
  347         uint64_t        cache_slab_free;        /* slab layer frees */
  348         uint64_t        cache_alloc_fail;       /* total failed allocations */
  349         uint64_t        cache_buftotal;         /* total buffers */
  350         uint64_t        cache_bufmax;           /* max buffers ever */
  351         uint64_t        cache_bufslab;          /* buffers free in slab layer */
  352         uint64_t        cache_reap;             /* cache reaps */
  353         uint64_t        cache_rescale;          /* hash table rescales */
  354         uint64_t        cache_lookup_depth;     /* hash lookup depth */
  355         uint64_t        cache_depot_contention; /* mutex contention count */
  356         uint64_t        cache_depot_contention_prev; /* previous snapshot */
  357 
  358         /*
  359          * Cache properties
  360          */
  361         char            cache_name[KMEM_CACHE_NAMELEN + 1];
  362         size_t          cache_bufsize;          /* object size */
  363         size_t          cache_align;            /* object alignment */
  364         int             (*cache_constructor)(void *, void *, int);
  365         void            (*cache_destructor)(void *, void *);
  366         void            (*cache_reclaim)(void *);
  367         kmem_cbrc_t     (*cache_move)(void *, void *, size_t, void *);
  368         void            *cache_private;         /* opaque arg to callbacks */
  369         vmem_t          *cache_arena;           /* vmem source for slabs */
  370         int             cache_cflags;           /* cache creation flags */
  371         int             cache_flags;            /* various cache state info */
  372         uint32_t        cache_mtbf;             /* induced alloc failure rate */
  373         uint32_t        cache_pad1;             /* compiler padding */
  374         kstat_t         *cache_kstat;           /* exported statistics */
  375         list_node_t     cache_link;             /* cache linkage */
  376 
  377         /*
  378          * Slab layer
  379          */
  380         kmutex_t        cache_lock;             /* protects slab layer */
  381         size_t          cache_chunksize;        /* buf + alignment [+ debug] */
  382         size_t          cache_slabsize;         /* size of a slab */
  383         size_t          cache_maxchunks;        /* max buffers per slab */
  384         size_t          cache_bufctl;           /* buf-to-bufctl distance */
  385         size_t          cache_buftag;           /* buf-to-buftag distance */
  386         size_t          cache_verify;           /* bytes to verify */
  387         size_t          cache_contents;         /* bytes of saved content */
  388         size_t          cache_color;            /* next slab color */
  389         size_t          cache_mincolor;         /* minimum slab color */
  390         size_t          cache_maxcolor;         /* maximum slab color */
  391         size_t          cache_hash_shift;       /* get to interesting bits */
  392         size_t          cache_hash_mask;        /* hash table mask */
  393         list_t          cache_complete_slabs;   /* completely allocated slabs */
  394         size_t          cache_complete_slab_count;
  395         avl_tree_t      cache_partial_slabs;    /* partial slab freelist */
  396         size_t          cache_partial_binshift; /* for AVL sort bins */
  397         kmem_cache_t    *cache_bufctl_cache;    /* source of bufctls */
  398         kmem_bufctl_t   **cache_hash_table;     /* hash table base */
  399         kmem_defrag_t   *cache_defrag;          /* slab consolidator fields */
  400 
  401         /*
  402          * Depot layer
  403          */
  404         kmutex_t        cache_depot_lock;       /* protects depot */
  405         kmem_magtype_t  *cache_magtype;         /* magazine type */
  406         kmem_maglist_t  cache_full;             /* full magazines */
  407         kmem_maglist_t  cache_empty;            /* empty magazines */
  408         kmem_dump_t     cache_dump;             /* used during crash dump */
  409 
  410         /*
  411          * Per-CPU layer
  412          */
  413         kmem_cpu_cache_t cache_cpu[1];          /* max_ncpus actual elements */
  414 };
 415 
 416 typedef struct kmem_cpu_log_header {
 417         kmutex_t        clh_lock;
 418         char            *clh_current;
 419         size_t          clh_avail;
 420         int             clh_chunk;
 421         int             clh_hits;
 422         char            clh_pad[64 - sizeof (kmutex_t) - sizeof (char *) -
 423                                 sizeof (size_t) - 2 * sizeof (int)];
 424 } kmem_cpu_log_header_t;
 425 
 426 typedef struct kmem_log_header {
 427         kmutex_t        lh_lock;
 428         char            *lh_base;
 429         int             *lh_free;
 430         size_t          lh_chunksize;
 431         int             lh_nchunks;
 432         int             lh_head;
 433         int             lh_tail;
 434         int             lh_hits;
 435         kmem_cpu_log_header_t lh_cpu[1];        /* ncpus actually allocated */
 436 } kmem_log_header_t;
 437 
 438 /* kmem_move kmm_flags */
 439 #define KMM_DESPERATE           0x1
 440 #define KMM_NOTIFY              0x2
 441 #define KMM_DEBUG               0x4
 442 
 443 typedef struct kmem_move {
 444         kmem_slab_t     *kmm_from_slab;
 445         void            *kmm_from_buf;
 446         void            *kmm_to_buf;
 447         avl_node_t      kmm_entry;
 448         int             kmm_flags;
 449 } kmem_move_t;
 450 
 451 /*
 452  * In order to consolidate partial slabs, it must be possible for the cache to
 453  * have partial slabs.
 454  */
 455 #define KMEM_IS_MOVABLE(cp)                                             \
 456         (((cp)->cache_chunksize * 2) <= (cp)->cache_slabsize)
 457 
 458 #ifdef  __cplusplus
 459 }
 460 #endif
 461 
 462 #endif  /* _SYS_KMEM_IMPL_H */