/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#ifndef _SYS_KMEM_IMPL_H
#define _SYS_KMEM_IMPL_H

#include <sys/kmem.h>
#include <sys/vmem.h>
#include <sys/thread.h>
#include <sys/t_lock.h>
#include <sys/time.h>
#include <sys/kstat.h>
#include <sys/cpuvar.h>
#include <sys/systm.h>
#include <vm/page.h>
#include <sys/avl.h>
#include <sys/list.h>

#ifdef  __cplusplus
extern "C" {
#endif

/*
 * kernel memory allocator: implementation-private data structures
 *
 * Lock order:
 * 1. cache_lock
 * 2. cc_lock in order by CPU ID
 * 3. cache_depot_lock
 *
 * Do not call kmem_cache_alloc() or taskq_dispatch() while holding any of the
 * above locks.
 */
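
/*
 * Illustrative sketch only (not part of this header's interface): the lock
 * order above means that code holding several of these locks at once must
 * acquire them as shown below, taking cc_lock for the lower CPU ID first.
 * The cache pointer cp and the CPU indexes i < j are hypothetical.
 *
 *	mutex_enter(&cp->cache_lock);
 *	mutex_enter(&cp->cache_cpu[i].cc_lock);
 *	mutex_enter(&cp->cache_cpu[j].cc_lock);
 *	mutex_enter(&cp->cache_depot_lock);
 */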

#define KMF_AUDIT       0x00000001      /* transaction auditing */
#define KMF_DEADBEEF    0x00000002      /* deadbeef checking */
#define KMF_REDZONE     0x00000004      /* redzone checking */
#define KMF_CONTENTS    0x00000008      /* freed-buffer content logging */
#define KMF_STICKY      0x00000010      /* if set, override /etc/system */
#define KMF_NOMAGAZINE  0x00000020      /* disable per-cpu magazines */
#define KMF_FIREWALL    0x00000040      /* put all bufs before unmapped pages */
#define KMF_LITE        0x00000100      /* lightweight debugging */

#define KMF_HASH        0x00000200      /* cache has hash table */
#define KMF_RANDOMIZE   0x00000400      /* randomize other kmem_flags */

#define KMF_DUMPDIVERT  0x00001000      /* use alternate memory at dump time */
#define KMF_DUMPUNSAFE  0x00002000      /* flag caches used at dump time */
#define KMF_PREFILL     0x00004000      /* Prefill the slab when created. */

#define KMF_BUFTAG      (KMF_DEADBEEF | KMF_REDZONE)
#define KMF_TOUCH       (KMF_BUFTAG | KMF_LITE | KMF_CONTENTS)
#define KMF_RANDOM      (KMF_TOUCH | KMF_AUDIT | KMF_NOMAGAZINE)
#define KMF_DEBUG       (KMF_RANDOM | KMF_FIREWALL)

#define KMEM_STACK_DEPTH        15

#define KMEM_FREE_PATTERN               0xdeadbeefdeadbeefULL
#define KMEM_UNINITIALIZED_PATTERN      0xbaddcafebaddcafeULL
#define KMEM_REDZONE_PATTERN            0xfeedfacefeedfaceULL
#define KMEM_REDZONE_BYTE               0xbb

/*
 * Redzone size encodings for kmem_alloc() / kmem_free().  We encode the
 * allocation size, rather than storing it directly, so that kmem_free()
 * can distinguish frees of the wrong size from redzone violations.
 *
 * A size of zero is never valid.
 */
#define KMEM_SIZE_ENCODE(x)     (251 * (x) + 1)
#define KMEM_SIZE_DECODE(x)     ((x) / 251)
#define KMEM_SIZE_VALID(x)      ((x) % 251 == 1 && (x) != 1)
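
/*
 * Worked example (illustrative values): KMEM_SIZE_ENCODE(8) is
 * 251 * 8 + 1 == 2009; KMEM_SIZE_DECODE(2009) is 2009 / 251 == 8 by integer
 * division; and KMEM_SIZE_VALID(2009) holds because 2009 % 251 == 1 and
 * 2009 != 1.  An overwritten encoding almost certainly fails the modulo
 * test, which is what lets kmem_free() tell a redzone violation apart from
 * a free of the wrong size.
 */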

#define KMEM_ALIGN              8       /* min guaranteed alignment */
#define KMEM_ALIGN_SHIFT        3       /* log2(KMEM_ALIGN) */
#define KMEM_VOID_FRACTION      8       /* never waste more than 1/8 of slab */

#define KMEM_SLAB_IS_PARTIAL(sp)                \
        ((sp)->slab_refcnt > 0 && (sp)->slab_refcnt < (sp)->slab_chunks)
#define KMEM_SLAB_IS_ALL_USED(sp)               \
        ((sp)->slab_refcnt == (sp)->slab_chunks)

/*
 * The bufctl (buffer control) structure keeps some minimal information
 * about each buffer: its address, its slab, and its current linkage,
 * which is either on the slab's freelist (if the buffer is free), or
 * on the cache's buf-to-bufctl hash table (if the buffer is allocated).
 * In the case of non-hashed, or "raw", caches (the common case), only
 * the freelist linkage is necessary: the buffer address is at a fixed
 * offset from the bufctl address, and the slab is at the end of the page.
 *
 * NOTE: bc_next must be the first field; raw buffers have linkage only.
 */
typedef struct kmem_bufctl {
        struct kmem_bufctl      *bc_next;       /* next bufctl struct */
        void                    *bc_addr;       /* address of buffer */
        struct kmem_slab        *bc_slab;       /* controlling slab */
} kmem_bufctl_t;

/*
 * The KMF_AUDIT version of the bufctl structure.  The beginning of this
 * structure must be identical to the normal bufctl structure so that
 * pointers are interchangeable.
 */
typedef struct kmem_bufctl_audit {
        struct kmem_bufctl      *bc_next;       /* next bufctl struct */
        void                    *bc_addr;       /* address of buffer */
        struct kmem_slab        *bc_slab;       /* controlling slab */
        kmem_cache_t            *bc_cache;      /* controlling cache */
        hrtime_t                bc_timestamp;   /* transaction time */
        kthread_t               *bc_thread;     /* thread doing transaction */
        struct kmem_bufctl      *bc_lastlog;    /* last log entry */
        void                    *bc_contents;   /* contents at last free */
        int                     bc_depth;       /* stack depth */
        pc_t                    bc_stack[KMEM_STACK_DEPTH];     /* pc stack */
} kmem_bufctl_audit_t;

/*
 * A kmem_buftag structure is appended to each buffer whenever any of the
 * KMF_BUFTAG flags (KMF_DEADBEEF, KMF_REDZONE) are set.
 */
typedef struct kmem_buftag {
        uint64_t                bt_redzone;     /* 64-bit redzone pattern */
        kmem_bufctl_t           *bt_bufctl;     /* bufctl */
        intptr_t                bt_bxstat;      /* bufctl ^ (alloc/free) */
} kmem_buftag_t;

/*
 * A variant of the kmem_buftag structure used for KMF_LITE caches.
 * Previous callers are stored in reverse chronological order, i.e. most
 * recent first.
 */
typedef struct kmem_buftag_lite {
        kmem_buftag_t           bt_buftag;      /* a normal buftag */
        pc_t                    bt_history[1];  /* zero or more callers */
} kmem_buftag_lite_t;

#define KMEM_BUFTAG_LITE_SIZE(f)        \
        (offsetof(kmem_buftag_lite_t, bt_history[f]))
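
/*
 * For example, KMEM_BUFTAG_LITE_SIZE(3) is the space needed for a lite
 * buftag whose bt_history[] records the three most recent callers.
 */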

#define KMEM_BUFTAG(cp, buf)            \
        ((kmem_buftag_t *)((char *)(buf) + (cp)->cache_buftag))

#define KMEM_BUFCTL(cp, buf)            \
        ((kmem_bufctl_t *)((char *)(buf) + (cp)->cache_bufctl))

#define KMEM_BUF(cp, bcp)               \
        ((void *)((char *)(bcp) - (cp)->cache_bufctl))

#define KMEM_SLAB(cp, buf)              \
        ((kmem_slab_t *)P2END((uintptr_t)(buf), (cp)->cache_slabsize) - 1)
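
/*
 * Notes on the address arithmetic above (illustrative, not normative):
 * KMEM_BUFCTL() and KMEM_BUF() are inverses, since both apply the same
 * cache_bufctl offset with opposite signs.  For a non-hashed cache,
 * KMEM_SLAB() rounds the buffer address up to the end of its slab (P2END)
 * and backs up by one kmem_slab_t, so the slab header sits at the tail of
 * the slab's memory, as described in the bufctl comment above.
 */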

/*
 * Test for using alternate memory at dump time.
 */
#define KMEM_DUMP(cp)           ((cp)->cache_flags & KMF_DUMPDIVERT)
#define KMEM_DUMPCC(ccp)        ((ccp)->cc_flags & KMF_DUMPDIVERT)

/*
 * The "CPU" macro loads a cpu_t that refers to the cpu that the current
 * thread is running on at the time the macro is executed.  A context switch
 * may occur immediately after loading this data structure, leaving this
 * thread pointing at the cpu_t for the previous cpu.  This is not a problem;
 * we'd just end up checking the previous cpu's per-cpu cache, and then check
 * the other layers of the kmem cache if need be.
 *
 * It's not even a problem if the old cpu gets DR'ed out during the context
 * switch.  The cpu-remove DR operation bzero()s the cpu_t, but doesn't free
 * it.  So the cpu_t's cpu_cache_offset would read as 0, causing us to use
 * cpu 0's per-cpu cache.
 *
 * So, there is no need to disable kernel preemption while using the CPU macro
 * below since if we have been context switched, there will not be any
 * correctness problem, just a momentary use of a different per-cpu cache.
 */

#define KMEM_CPU_CACHE(cp)                                              \
        ((kmem_cpu_cache_t *)((char *)(&cp->cache_cpu) + CPU->cpu_cache_offset))
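
/*
 * Typical use (an illustrative sketch of the allocator's fast path, not a
 * definition made by this header):
 *
 *	kmem_cpu_cache_t *ccp = KMEM_CPU_CACHE(cp);
 *	mutex_enter(&ccp->cc_lock);
 *	if (ccp->cc_rounds > 0)
 *		... take a round from ccp->cc_loaded ...
 *	mutex_exit(&ccp->cc_lock);
 */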

#define KMEM_MAGAZINE_VALID(cp, mp)     \
        (((kmem_slab_t *)P2END((uintptr_t)(mp), PAGESIZE) - 1)->slab_cache == \
            (cp)->cache_magtype->mt_cache)

#define KMEM_SLAB_OFFSET(sp, buf)       \
        ((size_t)((uintptr_t)(buf) - (uintptr_t)((sp)->slab_base)))

#define KMEM_SLAB_MEMBER(sp, buf)       \
        (KMEM_SLAB_OFFSET(sp, buf) < (sp)->slab_cache->cache_slabsize)

#define KMEM_BUFTAG_ALLOC       0xa110c8edUL
#define KMEM_BUFTAG_FREE        0xf4eef4eeUL

/* slab_later_count thresholds */
#define KMEM_DISBELIEF          3

/* slab_flags */
#define KMEM_SLAB_NOMOVE        0x1
#define KMEM_SLAB_MOVE_PENDING  0x2

typedef struct kmem_slab {
        struct kmem_cache       *slab_cache;    /* controlling cache */
        void                    *slab_base;     /* base of allocated memory */
        avl_node_t              slab_link;      /* slab linkage */
        struct kmem_bufctl      *slab_head;     /* first free buffer */
        long                    slab_refcnt;    /* outstanding allocations */
        long                    slab_chunks;    /* chunks (bufs) in this slab */
        uint32_t                slab_stuck_offset; /* unmoved buffer offset */
        uint16_t                slab_later_count; /* cf KMEM_CBRC_LATER */
        uint16_t                slab_flags;     /* bits to mark the slab */
} kmem_slab_t;

#define KMEM_HASH_INITIAL       64

#define KMEM_HASH(cp, buf)      \
        ((cp)->cache_hash_table +    \
        (((uintptr_t)(buf) >> (cp)->cache_hash_shift) & (cp)->cache_hash_mask))
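
/*
 * Hashing example with illustrative values: with cache_hash_shift == 8 and
 * cache_hash_mask == 63 (a 64-bucket table), a buffer at address 0xffff0300
 * hashes to bucket (0xffff0300 >> 8) & 63 == 3.  The shift exposes the
 * "interesting bits" of the buffer address and the mask selects the bucket.
 */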

typedef struct kmem_magazine {
        void    *mag_next;
        void    *mag_round[1];          /* one or more rounds */
} kmem_magazine_t;

/*
 * The magazine types for fast per-cpu allocation
 */
typedef struct kmem_magtype {
        short           mt_magsize;     /* magazine size (number of rounds) */
        int             mt_align;       /* magazine alignment */
        size_t          mt_minbuf;      /* all smaller buffers qualify */
        size_t          mt_maxbuf;      /* no larger buffers qualify */
        kmem_cache_t    *mt_cache;      /* magazine cache */
} kmem_magtype_t;

#define KMEM_CPU_CACHE_SIZE     64      /* must be power of 2 */
#define KMEM_CPU_PAD            (KMEM_CPU_CACHE_SIZE - sizeof (kmutex_t) - \
        2 * sizeof (uint64_t) - 2 * sizeof (void *) - sizeof (int) - \
        5 * sizeof (short))
#define KMEM_CACHE_SIZE(ncpus)  \
        ((size_t)(&((kmem_cache_t *)0)->cache_cpu[ncpus]))

/* Offset from kmem_cache->cache_cpu for per cpu caches */
#define KMEM_CPU_CACHE_OFFSET(cpuid)                                    \
        ((size_t)(&((kmem_cache_t *)0)->cache_cpu[cpuid]) -              \
        (size_t)(&((kmem_cache_t *)0)->cache_cpu))
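
/*
 * Both size macros above use the null-pointer offsetof idiom:
 * KMEM_CACHE_SIZE(ncpus) is effectively offsetof(kmem_cache_t,
 * cache_cpu[ncpus]), the size of a cache with ncpus per-CPU slots, and
 * KMEM_CPU_CACHE_OFFSET(cpuid) reduces to cpuid * sizeof (kmem_cpu_cache_t),
 * the distance of CPU cpuid's slot from cache_cpu.  KMEM_CPU_PAD sizes the
 * cc_pad member of kmem_cpu_cache_t below so that each slot is padded out
 * to KMEM_CPU_CACHE_SIZE bytes.
 */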

typedef struct kmem_cpu_cache {
        kmutex_t        cc_lock;        /* protects this cpu's local cache */
        uint64_t        cc_alloc;       /* allocations from this cpu */
        uint64_t        cc_free;        /* frees to this cpu */
        kmem_magazine_t *cc_loaded;     /* the currently loaded magazine */
        kmem_magazine_t *cc_ploaded;    /* the previously loaded magazine */
        int             cc_flags;       /* CPU-local copy of cache_flags */
        short           cc_rounds;      /* number of objects in loaded mag */
        short           cc_prounds;     /* number of objects in previous mag */
        short           cc_magsize;     /* number of rounds in a full mag */
        short           cc_dump_rounds; /* dump time copy of cc_rounds */
        short           cc_dump_prounds; /* dump time copy of cc_prounds */
        char            cc_pad[KMEM_CPU_PAD]; /* for nice alignment */
} kmem_cpu_cache_t;

/*
 * The magazine lists used in the depot.
 */
typedef struct kmem_maglist {
        kmem_magazine_t *ml_list;       /* magazine list */
        long            ml_total;       /* number of magazines */
        long            ml_min;         /* min since last update */
        long            ml_reaplimit;   /* max reapable magazines */
        uint64_t        ml_alloc;       /* allocations from this list */
} kmem_maglist_t;

typedef struct kmem_defrag {
        /*
         * Statistics
         */
        uint64_t        kmd_callbacks;          /* move callbacks */
        uint64_t        kmd_yes;                /* KMEM_CBRC_YES responses */
        uint64_t        kmd_no;                 /* NO responses */
        uint64_t        kmd_later;              /* LATER responses */
        uint64_t        kmd_dont_need;          /* DONT_NEED responses */
        uint64_t        kmd_dont_know;          /* DONT_KNOW responses */
        uint64_t        kmd_slabs_freed;        /* slabs freed by moves */
        uint64_t        kmd_defrags;            /* kmem_cache_defrag() */
        uint64_t        kmd_scans;              /* kmem_cache_scan() */

        /*
         * Consolidator fields
         */
        avl_tree_t      kmd_moves_pending;      /* buffer moves pending */
        list_t          kmd_deadlist;           /* deferred slab frees */
        size_t          kmd_deadcount;          /* # of slabs in kmd_deadlist */
        uint8_t         kmd_reclaim_numer;      /* slab usage threshold */
        uint8_t         kmd_pad1;               /* compiler padding */
        uint16_t        kmd_consolidate;        /* triggers consolidator */
        uint32_t        kmd_pad2;               /* compiler padding */
        size_t          kmd_slabs_sought;       /* reclaimable slabs sought */
        size_t          kmd_slabs_found;        /* reclaimable slabs found */
        size_t          kmd_tries;              /* nth scan interval counter */
        /*
         * Fields used to ASSERT that the client does not kmem_cache_free()
         * objects passed to the move callback.
         */
        void            *kmd_from_buf;          /* object to move */
        void            *kmd_to_buf;            /* move destination */
        kthread_t       *kmd_thread;            /* thread calling move */
} kmem_defrag_t;

#define KMEM_CACHE_NAMELEN      31

struct kmem_cache {
        /*
         * Statistics
         */
        uint64_t        cache_slab_create;      /* slab creates */
        uint64_t        cache_slab_destroy;     /* slab destroys */
        uint64_t        cache_slab_alloc;       /* slab layer allocations */
        uint64_t        cache_slab_free;        /* slab layer frees */
        uint64_t        cache_alloc_fail;       /* total failed allocations */
        uint64_t        cache_buftotal;         /* total buffers */
        uint64_t        cache_bufmax;           /* max buffers ever */
        uint64_t        cache_bufslab;          /* buffers free in slab layer */
        uint64_t        cache_reap;             /* cache reaps */
        uint64_t        cache_rescale;          /* hash table rescales */
        uint64_t        cache_lookup_depth;     /* hash lookup depth */
        uint64_t        cache_depot_contention; /* mutex contention count */
        uint64_t        cache_depot_contention_prev; /* previous snapshot */

        /*
         * Cache properties
         */
        char            cache_name[KMEM_CACHE_NAMELEN + 1];
        size_t          cache_bufsize;          /* object size */
        size_t          cache_align;            /* object alignment */
        int             (*cache_constructor)(void *, void *, int);
        void            (*cache_destructor)(void *, void *);
        void            (*cache_reclaim)(void *);
        kmem_cbrc_t     (*cache_move)(void *, void *, size_t, void *);
        void            *cache_private;         /* opaque arg to callbacks */
        vmem_t          *cache_arena;           /* vmem source for slabs */
        int             cache_cflags;           /* cache creation flags */
        int             cache_flags;            /* various cache state info */
        uint32_t        cache_mtbf;             /* induced alloc failure rate */
        uint32_t        cache_pad1;             /* compiler padding */
        kstat_t         *cache_kstat;           /* exported statistics */
        list_node_t     cache_link;             /* cache linkage */

        /*
         * Slab layer
         */
        kmutex_t        cache_lock;             /* protects slab layer */
        size_t          cache_chunksize;        /* buf + alignment [+ debug] */
        size_t          cache_slabsize;         /* size of a slab */
        size_t          cache_maxchunks;        /* max buffers per slab */
        size_t          cache_bufctl;           /* buf-to-bufctl distance */
        size_t          cache_buftag;           /* buf-to-buftag distance */
        size_t          cache_verify;           /* bytes to verify */
        size_t          cache_contents;         /* bytes of saved content */
        size_t          cache_color;            /* next slab color */
        size_t          cache_mincolor;         /* minimum slab color */
        size_t          cache_maxcolor;         /* maximum slab color */
        size_t          cache_hash_shift;       /* get to interesting bits */
        size_t          cache_hash_mask;        /* hash table mask */
        list_t          cache_complete_slabs;   /* completely allocated slabs */
        size_t          cache_complete_slab_count;
        avl_tree_t      cache_partial_slabs;    /* partial slab freelist */
        size_t          cache_partial_binshift; /* for AVL sort bins */
        kmem_cache_t    *cache_bufctl_cache;    /* source of bufctls */
        kmem_bufctl_t   **cache_hash_table;     /* hash table base */
        kmem_defrag_t   *cache_defrag;          /* slab consolidator fields */

        /*
         * Depot layer
         */
        kmutex_t        cache_depot_lock;       /* protects depot */
        kmem_magtype_t  *cache_magtype;         /* magazine type */
        kmem_maglist_t  cache_full;             /* full magazines */
        kmem_maglist_t  cache_empty;            /* empty magazines */
        void            *cache_dumpfreelist;    /* heap during crash dump */
        void            *cache_dumplog;         /* log entry during dump */

        /*
         * Per-CPU layer
         */
        kmem_cpu_cache_t cache_cpu[1];          /* max_ncpus actual elements */
};

typedef struct kmem_cpu_log_header {
        kmutex_t        clh_lock;
        char            *clh_current;
        size_t          clh_avail;
        int             clh_chunk;
        int             clh_hits;
        char            clh_pad[64 - sizeof (kmutex_t) - sizeof (char *) -
                                sizeof (size_t) - 2 * sizeof (int)];
} kmem_cpu_log_header_t;

typedef struct kmem_log_header {
        kmutex_t        lh_lock;
        char            *lh_base;
        int             *lh_free;
        size_t          lh_chunksize;
        int             lh_nchunks;
        int             lh_head;
        int             lh_tail;
        int             lh_hits;
        kmem_cpu_log_header_t lh_cpu[1];        /* ncpus actually allocated */
} kmem_log_header_t;

/* kmem_move kmm_flags */
#define KMM_DESPERATE           0x1
#define KMM_NOTIFY              0x2
#define KMM_DEBUG               0x4

typedef struct kmem_move {
        kmem_slab_t     *kmm_from_slab;
        void            *kmm_from_buf;
        void            *kmm_to_buf;
        avl_node_t      kmm_entry;
        int             kmm_flags;
} kmem_move_t;

/*
 * In order to consolidate partial slabs, it must be possible for the cache
 * to have partial slabs: each slab must hold at least two buffers, since a
 * single-buffer slab is always either completely empty or completely full.
 */
#define KMEM_IS_MOVABLE(cp)                                             \
        (((cp)->cache_chunksize * 2) <= (cp)->cache_slabsize)
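
/*
 * Illustrative numbers: with a 4096-byte slab, a cache of 1024-byte chunks
 * is movable (2 * 1024 <= 4096), while a cache of 3072-byte chunks is not
 * (2 * 3072 > 4096), because each of its slabs can hold only one buffer.
 */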

#ifdef  __cplusplus
}
#endif

#endif  /* _SYS_KMEM_IMPL_H */