illumos-gate New usr/src/uts/common/dtrace/dtrace.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  25  */
  26 
  27 /*
  28  * DTrace - Dynamic Tracing for Solaris
  29  *
  30  * This is the implementation of the Solaris Dynamic Tracing framework
  31  * (DTrace).  The user-visible interface to DTrace is described at length in
  32  * the "Solaris Dynamic Tracing Guide".  The interfaces between the libdtrace
  33  * library, the in-kernel DTrace framework, and the DTrace providers are
  34  * described in the block comments in the <sys/dtrace.h> header file.  The
  35  * internal architecture of DTrace is described in the block comments in the
  36  * <sys/dtrace_impl.h> header file.  The comments contained within the DTrace
  37  * implementation very much assume mastery of all of these sources; if one has
  38  * an unanswered question about the implementation, one should consult them
  39  * first.
  40  *
  41  * The functions here are ordered roughly as follows:
  42  *
  43  *   - Probe context functions
  44  *   - Probe hashing functions
  45  *   - Non-probe context utility functions
  46  *   - Matching functions
  47  *   - Provider-to-Framework API functions
  48  *   - Probe management functions
  49  *   - DIF object functions
  50  *   - Format functions
  51  *   - Predicate functions
  52  *   - ECB functions
  53  *   - Buffer functions
  54  *   - Enabling functions
  55  *   - DOF functions
  56  *   - Anonymous enabling functions
  57  *   - Consumer state functions
  58  *   - Helper functions
  59  *   - Hook functions
  60  *   - Driver cookbook functions
  61  *
  62  * Each group of functions begins with a block comment labelled the "DTrace
  63  * [Group] Functions", allowing one to find each block by searching forward
  64  * on capital-f functions.
  65  */
  66 #include <sys/errno.h>
  67 #include <sys/stat.h>
  68 #include <sys/modctl.h>
  69 #include <sys/conf.h>
  70 #include <sys/systm.h>
  71 #include <sys/ddi.h>
  72 #include <sys/sunddi.h>
  73 #include <sys/cpuvar.h>
  74 #include <sys/kmem.h>
  75 #include <sys/strsubr.h>
  76 #include <sys/sysmacros.h>
  77 #include <sys/dtrace_impl.h>
  78 #include <sys/atomic.h>
  79 #include <sys/cmn_err.h>
  80 #include <sys/mutex_impl.h>
  81 #include <sys/rwlock_impl.h>
  82 #include <sys/ctf_api.h>
  83 #include <sys/panic.h>
  84 #include <sys/priv_impl.h>
  85 #include <sys/policy.h>
  86 #include <sys/cred_impl.h>
  87 #include <sys/procfs_isa.h>
  88 #include <sys/taskq.h>
  89 #include <sys/mkdev.h>
  90 #include <sys/kdi.h>
  91 #include <sys/zone.h>
  92 #include <sys/socket.h>
  93 #include <netinet/in.h>
  94 
  95 /*
  96  * DTrace Tunable Variables
  97  *
  98  * The following variables may be tuned by adding a line to /etc/system that
  99  * includes both the name of the DTrace module ("dtrace") and the name of the
 100  * variable.  For example:
 101  *
 102  *   set dtrace:dtrace_destructive_disallow = 1
 103  *
 104  * In general, the only variables that one should be tuning this way are those
 105  * that affect system-wide DTrace behavior, and for which the default behavior
 106  * is undesirable.  Most of these variables are tunable on a per-consumer
 107  * basis using DTrace options, and need not be tuned on a system-wide basis.
 108  * When tuning these variables, avoid pathological values; while some attempt
 109  * is made to verify the integrity of these variables, they are not considered
 110  * part of the supported interface to DTrace, and they are therefore not
 111  * checked comprehensively.  Further, these variables should not be tuned
 112  * dynamically via "mdb -kw" or other means; they should only be tuned via
 113  * /etc/system.
 114  */
 115 int             dtrace_destructive_disallow = 0;
 116 dtrace_optval_t dtrace_nonroot_maxsize = (16 * 1024 * 1024);
 117 size_t          dtrace_difo_maxsize = (256 * 1024);
 118 dtrace_optval_t dtrace_dof_maxsize = (256 * 1024);
 119 size_t          dtrace_global_maxsize = (16 * 1024);
 120 size_t          dtrace_actions_max = (16 * 1024);
 121 size_t          dtrace_retain_max = 1024;
 122 dtrace_optval_t dtrace_helper_actions_max = 1024;
 123 dtrace_optval_t dtrace_helper_providers_max = 32;
 124 dtrace_optval_t dtrace_dstate_defsize = (1 * 1024 * 1024);
 125 size_t          dtrace_strsize_default = 256;
 126 dtrace_optval_t dtrace_cleanrate_default = 9900990;             /* 101 hz */
 127 dtrace_optval_t dtrace_cleanrate_min = 200000;                  /* 5000 hz */
 128 dtrace_optval_t dtrace_cleanrate_max = (uint64_t)60 * NANOSEC;  /* 1/minute */
 129 dtrace_optval_t dtrace_aggrate_default = NANOSEC;               /* 1 hz */
 130 dtrace_optval_t dtrace_statusrate_default = NANOSEC;            /* 1 hz */
 131 dtrace_optval_t dtrace_statusrate_max = (hrtime_t)10 * NANOSEC;  /* 6/minute */
 132 dtrace_optval_t dtrace_switchrate_default = NANOSEC;            /* 1 hz */
 133 dtrace_optval_t dtrace_nspec_default = 1;
 134 dtrace_optval_t dtrace_specsize_default = 32 * 1024;
 135 dtrace_optval_t dtrace_stackframes_default = 20;
 136 dtrace_optval_t dtrace_ustackframes_default = 20;
 137 dtrace_optval_t dtrace_jstackframes_default = 50;
 138 dtrace_optval_t dtrace_jstackstrsize_default = 512;
 139 int             dtrace_msgdsize_max = 128;
 140 hrtime_t        dtrace_chill_max = 500 * (NANOSEC / MILLISEC);  /* 500 ms */
 141 hrtime_t        dtrace_chill_interval = NANOSEC;                /* 1000 ms */
 142 int             dtrace_devdepth_max = 32;
 143 int             dtrace_err_verbose;
 144 hrtime_t        dtrace_deadman_interval = NANOSEC;
 145 hrtime_t        dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC;
 146 hrtime_t        dtrace_deadman_user = (hrtime_t)30 * NANOSEC;
 147 hrtime_t        dtrace_unregister_defunct_reap = (hrtime_t)60 * NANOSEC;
 148 
 149 /*
 150  * DTrace External Variables
 151  *
 152  * As dtrace(7D) is a kernel module, any DTrace variables are obviously
 153  * available to DTrace consumers via the backtick (`) syntax.  One of these,
 154  * dtrace_zero, is made deliberately so:  it is provided as a source of
 155  * well-known, zero-filled memory.  While this variable is not documented,
 156  * it is used by some translators as an implementation detail.
 157  */
 158 const char      dtrace_zero[256] = { 0 };       /* zero-filled memory */
 159 
 160 /*
 161  * DTrace Internal Variables
 162  */
 163 static dev_info_t       *dtrace_devi;           /* device info */
 164 static vmem_t           *dtrace_arena;          /* probe ID arena */
 165 static vmem_t           *dtrace_minor;          /* minor number arena */
 166 static taskq_t          *dtrace_taskq;          /* task queue */
 167 static dtrace_probe_t   **dtrace_probes;        /* array of all probes */
 168 static int              dtrace_nprobes;         /* number of probes */
 169 static dtrace_provider_t *dtrace_provider;      /* provider list */
 170 static dtrace_meta_t    *dtrace_meta_pid;       /* user-land meta provider */
 171 static int              dtrace_opens;           /* number of opens */
 172 static int              dtrace_helpers;         /* number of helpers */
 173 static void             *dtrace_softstate;      /* softstate pointer */
 174 static dtrace_hash_t    *dtrace_bymod;          /* probes hashed by module */
 175 static dtrace_hash_t    *dtrace_byfunc;         /* probes hashed by function */
 176 static dtrace_hash_t    *dtrace_byname;         /* probes hashed by name */
 177 static dtrace_toxrange_t *dtrace_toxrange;      /* toxic range array */
 178 static int              dtrace_toxranges;       /* number of toxic ranges */
 179 static int              dtrace_toxranges_max;   /* size of toxic range array */
 180 static dtrace_anon_t    dtrace_anon;            /* anonymous enabling */
 181 static kmem_cache_t     *dtrace_state_cache;    /* cache for dynamic state */
 182 static uint64_t         dtrace_vtime_references; /* number of vtimestamp refs */
 183 static kthread_t        *dtrace_panicked;       /* panicking thread */
 184 static dtrace_ecb_t     *dtrace_ecb_create_cache; /* cached created ECB */
 185 static dtrace_genid_t   dtrace_probegen;        /* current probe generation */
 186 static dtrace_helpers_t *dtrace_deferred_pid;   /* deferred helper list */
 187 static dtrace_enabling_t *dtrace_retained;      /* list of retained enablings */
 188 static dtrace_genid_t   dtrace_retained_gen;    /* current retained enab gen */
 189 static dtrace_dynvar_t  dtrace_dynhash_sink;    /* end of dynamic hash chains */
 190 static int              dtrace_dynvar_failclean; /* dynvars failed to clean */
 191 
 192 /*
 193  * DTrace Locking
 194  * DTrace is protected by three (relatively coarse-grained) locks:
 195  *
 196  * (1) dtrace_lock is required to manipulate essentially any DTrace state,
 197  *     including enabling state, probes, ECBs, consumer state, helper state,
 198  *     etc.  Importantly, dtrace_lock is _not_ required when in probe context;
 199  *     probe context is lock-free -- synchronization is handled via the
 200  *     dtrace_sync() cross call mechanism.
 201  *
 202  * (2) dtrace_provider_lock is required when manipulating provider state, or
 203  *     when provider state must be held constant.
 204  *
 205  * (3) dtrace_meta_lock is required when manipulating meta provider state, or
 206  *     when meta provider state must be held constant.
 207  *
 208  * The lock ordering between these three locks is dtrace_meta_lock before
 209  * dtrace_provider_lock before dtrace_lock.  (In particular, there are
 210  * several places where dtrace_provider_lock is held by the framework as it
 211  * calls into the providers -- which then call back into the framework,
 212  * grabbing dtrace_lock.)
 213  *
 214  * There are two other locks in the mix:  mod_lock and cpu_lock.  With respect
 215  * to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical
 216  * role as a coarse-grained lock; it is acquired before both of these locks.
 217  * With respect to dtrace_meta_lock, its behavior is stranger:  cpu_lock must
 218  * be acquired _between_ dtrace_meta_lock and any other DTrace locks.
 219  * mod_lock is similar with respect to dtrace_provider_lock in that it must be
 220  * acquired _between_ dtrace_provider_lock and dtrace_lock.
 221  */
 222 static kmutex_t         dtrace_lock;            /* probe state lock */
 223 static kmutex_t         dtrace_provider_lock;   /* provider state lock */
 224 static kmutex_t         dtrace_meta_lock;       /* meta-provider state lock */
 225 
 226 /*
 227  * DTrace Provider Variables
 228  *
 229  * These are the variables relating to DTrace as a provider (that is, the
 230  * provider of the BEGIN, END, and ERROR probes).
 231  */
/*
 * Stability attributes advertised by the framework's own provider.  The
 * initializer consists of five { stability, stability, class } triples; the
 * first, fourth and fifth are Stable/Stable/Common, while the second and
 * third are Private/Private/Unknown.
 *
 * NOTE(review): the rows presumably map, in order, onto the provider,
 * module, function, name and argument attributes of dtrace_pattr_t --
 * confirm against <sys/dtrace.h>.
 */
static dtrace_pattr_t   dtrace_provider_attr = {
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
};
 239 
 240 static void
 241 dtrace_nullop(void)
 242 {}
 243 
 244 static int
 245 dtrace_enable_nullop(void)
 246 {
 247         return (0);
 248 }
 249 
/*
 * Operations vector for the framework's own provider.  Every slot is either
 * NULL or one of the no-op functions (dtrace_nullop(), dtrace_enable_nullop())
 * cast to the signature that the slot expects.  The initializer is
 * positional, so the order of the entries must match the member order of
 * dtrace_pops_t.
 *
 * NOTE(review): the per-slot names in the comments below are presumed from
 * the cast signatures and the usual dtrace_pops_t layout -- confirm against
 * <sys/dtrace.h>.
 */
static dtrace_pops_t    dtrace_provider_ops = {
        /* probe-description callback (presumably "provide") */
        (void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop,
        /* module callback (presumably "provide module") */
        (void (*)(void *, struct modctl *))dtrace_nullop,
        /* int-returning per-probe callback (presumably "enable") */
        (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop,
        /* per-probe callbacks (presumably "disable", "suspend", "resume") */
        (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
        (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
        (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
        /* three unimplemented slots */
        NULL,
        NULL,
        NULL,
        /* final per-probe callback (presumably "destroy") */
        (void (*)(void *, dtrace_id_t, void *))dtrace_nullop
};
 262 
 263 static dtrace_id_t      dtrace_probeid_begin;   /* special BEGIN probe */
 264 static dtrace_id_t      dtrace_probeid_end;     /* special END probe */
 265 dtrace_id_t             dtrace_probeid_error;   /* special ERROR probe */
 266 
 267 /*
 268  * DTrace Helper Tracing Variables
 269  */
 270 uint32_t dtrace_helptrace_next = 0;
 271 uint32_t dtrace_helptrace_nlocals;
 272 char    *dtrace_helptrace_buffer;
 273 int     dtrace_helptrace_bufsize = 512 * 1024;
 274 
 275 #ifdef DEBUG
 276 int     dtrace_helptrace_enabled = 1;
 277 #else
 278 int     dtrace_helptrace_enabled = 0;
 279 #endif
 280 
 281 /*
 282  * DTrace Error Hashing
 283  *
  284  * On DEBUG kernels, DTrace will track the errors that it has seen in a hash
  285  * table.  This is very useful for checking coverage of tests that are
  286  * expected to induce DIF or DOF processing errors, and may be useful for
  287  * debugging problems in the DIF code generator or in DOF generation.  The
 288  * error hash may be examined with the ::dtrace_errhash MDB dcmd.
 289  */
 290 #ifdef DEBUG
 291 static dtrace_errhash_t dtrace_errhash[DTRACE_ERRHASHSZ];
 292 static const char *dtrace_errlast;
 293 static kthread_t *dtrace_errthread;
 294 static kmutex_t dtrace_errlock;
 295 #endif
 296 
 297 /*
 298  * DTrace Macros and Constants
 299  *
 300  * These are various macros that are useful in various spots in the
 301  * implementation, along with a few random constants that have no meaning
 302  * outside of the implementation.  There is no real structure to this cpp
 303  * mishmash -- but is there ever?
 304  */
 305 #define DTRACE_HASHSTR(hash, probe)     \
 306         dtrace_hash_str(*((char **)((uintptr_t)(probe) + (hash)->dth_stroffs)))
 307 
 308 #define DTRACE_HASHNEXT(hash, probe)    \
 309         (dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_nextoffs)
 310 
 311 #define DTRACE_HASHPREV(hash, probe)    \
 312         (dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_prevoffs)
 313 
 314 #define DTRACE_HASHEQ(hash, lhs, rhs)   \
 315         (strcmp(*((char **)((uintptr_t)(lhs) + (hash)->dth_stroffs)), \
 316             *((char **)((uintptr_t)(rhs) + (hash)->dth_stroffs))) == 0)
 317 
 318 #define DTRACE_AGGHASHSIZE_SLEW         17
 319 
 320 #define DTRACE_V4MAPPED_OFFSET          (sizeof (uint32_t) * 3)
 321 
 322 /*
 323  * The key for a thread-local variable consists of the lower 61 bits of the
 324  * t_did, plus the 3 bits of the highest active interrupt above LOCK_LEVEL.
 325  * We add DIF_VARIABLE_MAX to t_did to assure that the thread key is never
 326  * equal to a variable identifier.  This is necessary (but not sufficient) to
 327  * assure that global associative arrays never collide with thread-local
 328  * variables.  To guarantee that they cannot collide, we must also define the
 329  * order for keying dynamic variables.  That order is:
 330  *
 331  *   [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ]
 332  *
 333  * Because the variable-key and the tls-key are in orthogonal spaces, there is
 334  * no way for a global variable key signature to match a thread-local key
 335  * signature.
 336  */
/*
 * Implementation notes:  'intr' ends up as the bit length of
 * (cpu_intr_actv >> (LOCK_LEVEL + 1)) -- zero when no such bit is set.  It
 * is asserted to fit in three bits and is stored in bits 61-63 of (where);
 * the low 61 bits come from (t_did + DIF_VARIABLE_MAX).  The expansion is a
 * bare brace block (not an expression), and (where) is evaluated exactly
 * once.
 */
#define DTRACE_TLS_THRKEY(where) { \
        uint_t intr = 0; \
        uint_t actv = CPU->cpu_intr_actv >> (LOCK_LEVEL + 1); \
        for (; actv; actv >>= 1) \
                intr++; \
        ASSERT(intr < (1 << 3)); \
        (where) = ((curthread->t_did + DIF_VARIABLE_MAX) & \
            (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
}
 346 
 347 #define DT_BSWAP_8(x)   ((x) & 0xff)
 348 #define DT_BSWAP_16(x)  ((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8))
 349 #define DT_BSWAP_32(x)  ((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16))
 350 #define DT_BSWAP_64(x)  ((DT_BSWAP_32(x) << 32) | DT_BSWAP_32((x) >> 32))
 351 
 352 #define DT_MASK_LO 0x00000000FFFFFFFFULL
 353 
 354 #define DTRACE_STORE(type, tomax, offset, what) \
 355         *((type *)((uintptr_t)(tomax) + (uintptr_t)offset)) = (type)(what);
 356 
 357 #ifndef __i386
 358 #define DTRACE_ALIGNCHECK(addr, size, flags)                            \
 359         if (addr & (size - 1)) {                                    \
 360                 *flags |= CPU_DTRACE_BADALIGN;                          \
 361                 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;     \
 362                 return (0);                                             \
 363         }
 364 #else
 365 #define DTRACE_ALIGNCHECK(addr, size, flags)
 366 #endif
 367 
 368 /*
  369  * Test whether a range of memory starting at testaddr of size testsz falls
  370  * within the range of memory described by baseaddr, basesz.  We take care to avoid
 371  * problems with overflow and underflow of the unsigned quantities, and
 372  * disallow all negative sizes.  Ranges of size 0 are allowed.
 373  */
 374 #define DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
 375         ((testaddr) - (uintptr_t)(baseaddr) < (basesz) && \
 376         (testaddr) + (testsz) - (uintptr_t)(baseaddr) <= (basesz) && \
 377         (testaddr) + (testsz) >= (testaddr))
 378 
 379 /*
 380  * Test whether alloc_sz bytes will fit in the scratch region.  We isolate
 381  * alloc_sz on the righthand side of the comparison in order to avoid overflow
 382  * or underflow in the comparison with it.  This is simpler than the INRANGE
 383  * check above, because we know that the dtms_scratch_ptr is valid in the
 384  * range.  Allocations of size zero are allowed.
 385  */
 386 #define DTRACE_INSCRATCH(mstate, alloc_sz) \
 387         ((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - \
 388         (mstate)->dtms_scratch_ptr >= (alloc_sz))
 389 
/*
 * DTRACE_LOADFUNC(bits) defines dtrace_load<bits>(), which loads a
 * uint<bits>_t from an arbitrary address 'addr' as follows:
 *
 *   (1) DTRACE_ALIGNCHECK() may reject the address (returning 0).
 *
 *   (2) If [addr, addr + size) overlaps any entry of dtrace_toxrange[],
 *       CPU_DTRACE_BADADDR is set in this CPU's DTrace flags, the address is
 *       recorded as the illegal value, and 0 is returned.
 *
 *   (3) Otherwise CPU_DTRACE_NOFAULT is set around a single volatile load
 *       and cleared again afterwards.  The loaded value is returned unless
 *       CPU_DTRACE_FAULT is set in the flags after the load, in which case
 *       0 is returned.
 *
 * The order of the flag updates relative to the load is significant and
 * must not be changed.
 */
#define DTRACE_LOADFUNC(bits)                                           \
/*CSTYLED*/                                                             \
uint##bits##_t                                                          \
dtrace_load##bits(uintptr_t addr)                                       \
{                                                                       \
        size_t size = bits / NBBY;                                      \
        /*CSTYLED*/                                                     \
        uint##bits##_t rval;                                            \
        int i;                                                          \
        volatile uint16_t *flags = (volatile uint16_t *)                \
            &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;                    \
                                                                        \
        DTRACE_ALIGNCHECK(addr, size, flags);                           \
                                                                        \
        for (i = 0; i < dtrace_toxranges; i++) {                     \
                if (addr >= dtrace_toxrange[i].dtt_limit)            \
                        continue;                                       \
                                                                        \
                if (addr + size <= dtrace_toxrange[i].dtt_base)              \
                        continue;                                       \
                                                                        \
                /*                                                      \
                 * This address falls within a toxic region; return 0.  \
                 */                                                     \
                *flags |= CPU_DTRACE_BADADDR;                           \
                cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;     \
                return (0);                                             \
        }                                                               \
                                                                        \
        *flags |= CPU_DTRACE_NOFAULT;                                   \
        /*CSTYLED*/                                                     \
        rval = *((volatile uint##bits##_t *)addr);                      \
        *flags &= ~CPU_DTRACE_NOFAULT;                                      \
                                                                        \
        return (!(*flags & CPU_DTRACE_FAULT) ? rval : 0);           \
}
 426 
 427 #ifdef _LP64
 428 #define dtrace_loadptr  dtrace_load64
 429 #else
 430 #define dtrace_loadptr  dtrace_load32
 431 #endif
 432 
 433 #define DTRACE_DYNHASH_FREE     0
 434 #define DTRACE_DYNHASH_SINK     1
 435 #define DTRACE_DYNHASH_VALID    2
 436 
 437 #define DTRACE_MATCH_FAIL       -1
 438 #define DTRACE_MATCH_NEXT       0
 439 #define DTRACE_MATCH_DONE       1
 440 #define DTRACE_ANCHORED(probe)  ((probe)->dtpr_func[0] != '\0')
 441 #define DTRACE_STATE_ALIGN      64
 442 
/*
 * Map a set of CPU_DTRACE_* fault flags onto a single DTRACEFLT_* code.  The
 * flags are tested in the order written below and the first one found set
 * determines the result; if none of the listed flags is set, the result is
 * DTRACEFLT_UNKNOWN.
 */
#define DTRACE_FLAGS2FLT(flags)                                         \
        (((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR :               \
        ((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP :            \
        ((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO :                \
        ((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV :            \
        ((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV :            \
        ((flags) & CPU_DTRACE_TUPOFLOW) ?  DTRACEFLT_TUPOFLOW :             \
        ((flags) & CPU_DTRACE_BADALIGN) ?  DTRACEFLT_BADALIGN :             \
        ((flags) & CPU_DTRACE_NOSCRATCH) ?  DTRACEFLT_NOSCRATCH :   \
        ((flags) & CPU_DTRACE_BADSTACK) ?  DTRACEFLT_BADSTACK :             \
        DTRACEFLT_UNKNOWN)

/*
 * True iff the action is a DIF expression whose DIF object has a return
 * type of kind DIF_TYPE_STRING.  (act) is evaluated twice.
 */
#define DTRACEACT_ISSTRING(act)                                         \
        ((act)->dta_kind == DTRACEACT_DIFEXPR &&                     \
        (act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING)
 458 
/*
 * Forward declarations for file-local (static) functions that are referenced
 * before their definitions.
 */
static size_t dtrace_strlen(const char *, size_t);
static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id);
static void dtrace_enabling_provide(dtrace_provider_t *);
static int dtrace_enabling_match(dtrace_enabling_t *, int *);
static void dtrace_enabling_matchall(void);
static void dtrace_enabling_reap(void);
static dtrace_state_t *dtrace_anon_grab(void);
static uint64_t dtrace_helper(int, dtrace_mstate_t *,
    dtrace_state_t *, uint64_t, uint64_t);
static dtrace_helpers_t *dtrace_helpers_create(proc_t *);
static void dtrace_buffer_drop(dtrace_buffer_t *);
static int dtrace_buffer_consumed(dtrace_buffer_t *, hrtime_t when);
static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,
    dtrace_state_t *, dtrace_mstate_t *);
static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
    dtrace_optval_t);
static int dtrace_ecb_create_enable(dtrace_probe_t *, void *);
static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);
static int dtrace_priv_proc(dtrace_state_t *, dtrace_mstate_t *);
 478 
 479 /*
 480  * DTrace Probe Context Functions
 481  *
 482  * These functions are called from probe context.  Because probe context is
  483  * any context in which C may be called, arbitrary locks may be held,
 484  * interrupts may be disabled, we may be in arbitrary dispatched state, etc.
 485  * As a result, functions called from probe context may only call other DTrace
 486  * support functions -- they may not interact at all with the system at large.
 487  * (Note that the ASSERT macro is made probe-context safe by redefining it in
 488  * terms of dtrace_assfail(), a probe-context safe function.) If arbitrary
 489  * loads are to be performed from probe context, they _must_ be in terms of
 490  * the safe dtrace_load*() variants.
 491  *
 492  * Some functions in this block are not actually called from probe context;
 493  * for these functions, there will be a comment above the function reading
 494  * "Note:  not called from probe context."
 495  */
 496 void
 497 dtrace_panic(const char *format, ...)
 498 {
 499         va_list alist;
 500 
 501         va_start(alist, format);
 502         dtrace_vpanic(format, alist);
 503         va_end(alist);
 504 }
 505 
 506 int
 507 dtrace_assfail(const char *a, const char *f, int l)
 508 {
 509         dtrace_panic("assertion failed: %s, file: %s, line: %d", a, f, l);
 510 
 511         /*
 512          * We just need something here that even the most clever compiler
 513          * cannot optimize away.
 514          */
 515         return (a[(uintptr_t)f]);
 516 }
 517 
 518 /*
 519  * Atomically increment a specified error counter from probe context.
 520  */
 521 static void
 522 dtrace_error(uint32_t *counter)
 523 {
 524         /*
 525          * Most counters stored to in probe context are per-CPU counters.
 526          * However, there are some error conditions that are sufficiently
 527          * arcane that they don't merit per-CPU storage.  If these counters
 528          * are incremented concurrently on different CPUs, scalability will be
 529          * adversely affected -- but we don't expect them to be white-hot in a
 530          * correctly constructed enabling...
 531          */
 532         uint32_t oval, nval;
 533 
 534         do {
 535                 oval = *counter;
 536 
 537                 if ((nval = oval + 1) == 0) {
 538                         /*
 539                          * If the counter would wrap, set it to 1 -- assuring
 540                          * that the counter is never zero when we have seen
 541                          * errors.  (The counter must be 32-bits because we
 542                          * aren't guaranteed a 64-bit compare&swap operation.)
 543                          * To save this code both the infamy of being fingered
 544                          * by a priggish news story and the indignity of being
 545                          * the target of a neo-puritan witch trial, we're
 546                          * carefully avoiding any colorful description of the
 547                          * likelihood of this condition -- but suffice it to
 548                          * say that it is only slightly more likely than the
 549                          * overflow of predicate cache IDs, as discussed in
 550                          * dtrace_predicate_create().
 551                          */
 552                         nval = 1;
 553                 }
 554         } while (dtrace_cas32(counter, oval, nval) != oval);
 555 }
 556 
 557 /*
 558  * Use the DTRACE_LOADFUNC macro to define functions for each of loading a
 559  * uint8_t, a uint16_t, a uint32_t and a uint64_t.
 560  */
 561 DTRACE_LOADFUNC(8)
 562 DTRACE_LOADFUNC(16)
 563 DTRACE_LOADFUNC(32)
 564 DTRACE_LOADFUNC(64)
 565 
 566 static int
 567 dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate)
 568 {
 569         if (dest < mstate->dtms_scratch_base)
 570                 return (0);
 571 
 572         if (dest + size < dest)
 573                 return (0);
 574 
 575         if (dest + size > mstate->dtms_scratch_ptr)
 576                 return (0);
 577 
 578         return (1);
 579 }
 580 
 581 static int
 582 dtrace_canstore_statvar(uint64_t addr, size_t sz,
 583     dtrace_statvar_t **svars, int nsvars)
 584 {
 585         int i;
 586 
 587         for (i = 0; i < nsvars; i++) {
 588                 dtrace_statvar_t *svar = svars[i];
 589 
 590                 if (svar == NULL || svar->dtsv_size == 0)
 591                         continue;
 592 
 593                 if (DTRACE_INRANGE(addr, sz, svar->dtsv_data, svar->dtsv_size))
 594                         return (1);
 595         }
 596 
 597         return (0);
 598 }
 599 
 600 /*
 601  * Check to see if the address is within a memory region to which a store may
 602  * be issued.  This includes the DTrace scratch areas, and any DTrace variable
 603  * region.  The caller of dtrace_canstore() is responsible for performing any
 604  * alignment checks that are needed before stores are actually executed.
 605  */
 606 static int
 607 dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
 608     dtrace_vstate_t *vstate)
 609 {
 610         /*
 611          * First, check to see if the address is in scratch space...
 612          */
 613         if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base,
 614             mstate->dtms_scratch_size))
 615                 return (1);
 616 
 617         /*
 618          * Now check to see if it's a dynamic variable.  This check will pick
 619          * up both thread-local variables and any global dynamically-allocated
 620          * variables.
 621          */
 622         if (DTRACE_INRANGE(addr, sz, vstate->dtvs_dynvars.dtds_base,
 623             vstate->dtvs_dynvars.dtds_size)) {
 624                 dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
 625                 uintptr_t base = (uintptr_t)dstate->dtds_base +
 626                     (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
 627                 uintptr_t chunkoffs;
 628 
 629                 /*
 630                  * Before we assume that we can store here, we need to make
 631                  * sure that it isn't in our metadata -- storing to our
 632                  * dynamic variable metadata would corrupt our state.  For
 633                  * the range to not include any dynamic variable metadata,
 634                  * it must:
 635                  *
 636                  *      (1) Start above the hash table that is at the base of
 637                  *      the dynamic variable space
 638                  *
 639                  *      (2) Have a starting chunk offset that is beyond the
 640                  *      dtrace_dynvar_t that is at the base of every chunk
 641                  *
 642                  *      (3) Not span a chunk boundary
 643                  *
 644                  */
 645                 if (addr < base)
 646                         return (0);
 647 
 648                 chunkoffs = (addr - base) % dstate->dtds_chunksize;
 649 
 650                 if (chunkoffs < sizeof (dtrace_dynvar_t))
 651                         return (0);
 652 
 653                 if (chunkoffs + sz > dstate->dtds_chunksize)
 654                         return (0);
 655 
 656                 return (1);
 657         }
 658 
 659         /*
 660          * Finally, check the static local and global variables.  These checks
 661          * take the longest, so we perform them last.
 662          */
 663         if (dtrace_canstore_statvar(addr, sz,
 664             vstate->dtvs_locals, vstate->dtvs_nlocals))
 665                 return (1);
 666 
 667         if (dtrace_canstore_statvar(addr, sz,
 668             vstate->dtvs_globals, vstate->dtvs_nglobals))
 669                 return (1);
 670 
 671         return (0);
 672 }
 673 
 674 
 675 /*
 676  * Convenience routine to check to see if the address is within a memory
 677  * region in which a load may be issued given the user's privilege level;
 678  * if not, it sets the appropriate error flags and loads 'addr' into the
 679  * illegal value slot.
 680  *
 681  * DTrace subroutines (DIF_SUBR_*) should use this helper to implement
 682  * appropriate memory access protection.
 683  */
 684 static int
 685 dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
 686     dtrace_vstate_t *vstate)
 687 {
 688         volatile uintptr_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
 689 
 690         /*
 691          * If we hold the privilege to read from kernel memory, then
 692          * everything is readable.
 693          */
 694         if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
 695                 return (1);
 696 
 697         /*
 698          * You can obviously read that which you can store.
 699          */
 700         if (dtrace_canstore(addr, sz, mstate, vstate))
 701                 return (1);
 702 
 703         /*
 704          * We're allowed to read from our own string table.
 705          */
 706         if (DTRACE_INRANGE(addr, sz, mstate->dtms_difo->dtdo_strtab,
 707             mstate->dtms_difo->dtdo_strlen))
 708                 return (1);
 709 
 710         if (vstate->dtvs_state != NULL &&
 711             dtrace_priv_proc(vstate->dtvs_state, mstate)) {
 712                 proc_t *p;
 713 
 714                 /*
 715                  * When we have privileges to the current process, there are
 716                  * several context-related kernel structures that are safe to
 717                  * read, even absent the privilege to read from kernel memory.
 718                  * These reads are safe because these structures contain only
 719                  * state that (1) we're permitted to read, (2) is harmless or
 720                  * (3) contains pointers to additional kernel state that we're
 721                  * not permitted to read (and as such, do not present an
 722                  * opportunity for privilege escalation).  Finally (and
 723                  * critically), because of the nature of their relation with
 724                  * the current thread context, the memory associated with these
 725                  * structures cannot change over the duration of probe context,
 726                  * and it is therefore impossible for this memory to be
 727                  * deallocated and reallocated as something else while it's
 728                  * being operated upon.
 729                  */
 730                 if (DTRACE_INRANGE(addr, sz, curthread, sizeof (kthread_t)))
 731                         return (1);
 732 
 733                 if ((p = curthread->t_procp) != NULL && DTRACE_INRANGE(addr,
 734                     sz, curthread->t_procp, sizeof (proc_t))) {
 735                         return (1);
 736                 }
 737 
 738                 if (curthread->t_cred != NULL && DTRACE_INRANGE(addr, sz,
 739                     curthread->t_cred, sizeof (cred_t))) {
 740                         return (1);
 741                 }
 742 
 743                 if (p != NULL && p->p_pidp != NULL && DTRACE_INRANGE(addr, sz,
 744                     &(p->p_pidp->pid_id), sizeof (pid_t))) {
 745                         return (1);
 746                 }
 747 
 748                 if (curthread->t_cpu != NULL && DTRACE_INRANGE(addr, sz,
 749                     curthread->t_cpu, offsetof(cpu_t, cpu_pause_thread))) {
 750                         return (1);
 751                 }
 752         }
 753 
 754         DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
 755         *illval = addr;
 756         return (0);
 757 }
 758 
 759 /*
 760  * Convenience routine to check to see if a given string is within a memory
 761  * region in which a load may be issued given the user's privilege level;
 762  * this exists so that we don't need to issue unnecessary dtrace_strlen()
 763  * calls in the event that the user has all privileges.
 764  */
 765 static int
 766 dtrace_strcanload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
 767     dtrace_vstate_t *vstate)
 768 {
 769         size_t strsz;
 770 
 771         /*
 772          * If we hold the privilege to read from kernel memory, then
 773          * everything is readable.
 774          */
 775         if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
 776                 return (1);
 777 
 778         strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr, sz);
 779         if (dtrace_canload(addr, strsz, mstate, vstate))
 780                 return (1);
 781 
 782         return (0);
 783 }
 784 
 785 /*
 786  * Convenience routine to check to see if a given variable is within a memory
 787  * region in which a load may be issued given the user's privilege level.
 788  */
 789 static int
 790 dtrace_vcanload(void *src, dtrace_diftype_t *type, dtrace_mstate_t *mstate,
 791     dtrace_vstate_t *vstate)
 792 {
 793         size_t sz;
 794         ASSERT(type->dtdt_flags & DIF_TF_BYREF);
 795 
 796         /*
 797          * If we hold the privilege to read from kernel memory, then
 798          * everything is readable.
 799          */
 800         if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
 801                 return (1);
 802 
 803         if (type->dtdt_kind == DIF_TYPE_STRING)
 804                 sz = dtrace_strlen(src,
 805                     vstate->dtvs_state->dts_options[DTRACEOPT_STRSIZE]) + 1;
 806         else
 807                 sz = type->dtdt_size;
 808 
 809         return (dtrace_canload((uintptr_t)src, sz, mstate, vstate));
 810 }
 811 
 812 /*
 813  * Compare two strings using safe loads.
 814  */
 815 static int
 816 dtrace_strncmp(char *s1, char *s2, size_t limit)
 817 {
 818         uint8_t c1, c2;
 819         volatile uint16_t *flags;
 820 
 821         if (s1 == s2 || limit == 0)
 822                 return (0);
 823 
 824         flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
 825 
 826         do {
 827                 if (s1 == NULL) {
 828                         c1 = '\0';
 829                 } else {
 830                         c1 = dtrace_load8((uintptr_t)s1++);
 831                 }
 832 
 833                 if (s2 == NULL) {
 834                         c2 = '\0';
 835                 } else {
 836                         c2 = dtrace_load8((uintptr_t)s2++);
 837                 }
 838 
 839                 if (c1 != c2)
 840                         return (c1 - c2);
 841         } while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT));
 842 
 843         return (0);
 844 }
 845 
 846 /*
 847  * Compute strlen(s) for a string using safe memory accesses.  The additional
 848  * len parameter is used to specify a maximum length to ensure completion.
 849  */
 850 static size_t
 851 dtrace_strlen(const char *s, size_t lim)
 852 {
 853         uint_t len;
 854 
 855         for (len = 0; len != lim; len++) {
 856                 if (dtrace_load8((uintptr_t)s++) == '\0')
 857                         break;
 858         }
 859 
 860         return (len);
 861 }
 862 
 863 /*
 864  * Check if an address falls within a toxic region.
 865  */
 866 static int
 867 dtrace_istoxic(uintptr_t kaddr, size_t size)
 868 {
 869         uintptr_t taddr, tsize;
 870         int i;
 871 
 872         for (i = 0; i < dtrace_toxranges; i++) {
 873                 taddr = dtrace_toxrange[i].dtt_base;
 874                 tsize = dtrace_toxrange[i].dtt_limit - taddr;
 875 
 876                 if (kaddr - taddr < tsize) {
 877                         DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
 878                         cpu_core[CPU->cpu_id].cpuc_dtrace_illval = kaddr;
 879                         return (1);
 880                 }
 881 
 882                 if (taddr - kaddr < size) {
 883                         DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
 884                         cpu_core[CPU->cpu_id].cpuc_dtrace_illval = taddr;
 885                         return (1);
 886                 }
 887         }
 888 
 889         return (0);
 890 }
 891 
 892 /*
 893  * Copy src to dst using safe memory accesses.  The src is assumed to be unsafe
 894  * memory specified by the DIF program.  The dst is assumed to be safe memory
 895  * that we can store to directly because it is managed by DTrace.  As with
 896  * standard bcopy, overlapping copies are handled properly.
 897  */
 898 static void
 899 dtrace_bcopy(const void *src, void *dst, size_t len)
 900 {
 901         if (len != 0) {
 902                 uint8_t *s1 = dst;
 903                 const uint8_t *s2 = src;
 904 
 905                 if (s1 <= s2) {
 906                         do {
 907                                 *s1++ = dtrace_load8((uintptr_t)s2++);
 908                         } while (--len != 0);
 909                 } else {
 910                         s2 += len;
 911                         s1 += len;
 912 
 913                         do {
 914                                 *--s1 = dtrace_load8((uintptr_t)--s2);
 915                         } while (--len != 0);
 916                 }
 917         }
 918 }
 919 
 920 /*
 921  * Copy src to dst using safe memory accesses, up to either the specified
 922  * length, or the point that a nul byte is encountered.  The src is assumed to
 923  * be unsafe memory specified by the DIF program.  The dst is assumed to be
 924  * safe memory that we can store to directly because it is managed by DTrace.
 925  * Unlike dtrace_bcopy(), overlapping regions are not handled.
 926  */
 927 static void
 928 dtrace_strcpy(const void *src, void *dst, size_t len)
 929 {
 930         if (len != 0) {
 931                 uint8_t *s1 = dst, c;
 932                 const uint8_t *s2 = src;
 933 
 934                 do {
 935                         *s1++ = c = dtrace_load8((uintptr_t)s2++);
 936                 } while (--len != 0 && c != '\0');
 937         }
 938 }
 939 
 940 /*
 941  * Copy src to dst, deriving the size and type from the specified (BYREF)
 942  * variable type.  The src is assumed to be unsafe memory specified by the DIF
 943  * program.  The dst is assumed to be DTrace variable memory that is of the
 944  * specified type; we assume that we can store to directly.
 945  */
 946 static void
 947 dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type)
 948 {
 949         ASSERT(type->dtdt_flags & DIF_TF_BYREF);
 950 
 951         if (type->dtdt_kind == DIF_TYPE_STRING) {
 952                 dtrace_strcpy(src, dst, type->dtdt_size);
 953         } else {
 954                 dtrace_bcopy(src, dst, type->dtdt_size);
 955         }
 956 }
 957 
 958 /*
 959  * Compare s1 to s2 using safe memory accesses.  The s1 data is assumed to be
 960  * unsafe memory specified by the DIF program.  The s2 data is assumed to be
 961  * safe memory that we can access directly because it is managed by DTrace.
 962  */
 963 static int
 964 dtrace_bcmp(const void *s1, const void *s2, size_t len)
 965 {
 966         volatile uint16_t *flags;
 967 
 968         flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
 969 
 970         if (s1 == s2)
 971                 return (0);
 972 
 973         if (s1 == NULL || s2 == NULL)
 974                 return (1);
 975 
 976         if (s1 != s2 && len != 0) {
 977                 const uint8_t *ps1 = s1;
 978                 const uint8_t *ps2 = s2;
 979 
 980                 do {
 981                         if (dtrace_load8((uintptr_t)ps1++) != *ps2++)
 982                                 return (1);
 983                 } while (--len != 0 && !(*flags & CPU_DTRACE_FAULT));
 984         }
 985         return (0);
 986 }
 987 
 988 /*
 989  * Zero the specified region using a simple byte-by-byte loop.  Note that this
 990  * is for safe DTrace-managed memory only.
 991  */
 992 static void
 993 dtrace_bzero(void *dst, size_t len)
 994 {
 995         uchar_t *cp;
 996 
 997         for (cp = dst; len != 0; len--)
 998                 *cp++ = 0;
 999 }
1000 
/*
 * Add two 128-bit values, each held as a pair of 64-bit words with the low
 * word at index 0, and store the result in 'sum'.  Both words of the result
 * are computed before either is stored, so 'sum' may alias either addend.
 */
static void
dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
{
        uint64_t lo = addend1[0] + addend2[0];
        /* The low-word sum wrapped if it is smaller than either input. */
        uint64_t carry = (lo < addend1[0] || lo < addend2[0]) ? 1 : 0;
        uint64_t hi = addend1[1] + addend2[1] + carry;

        sum[0] = lo;
        sum[1] = hi;
}
1013 
/*
 * Shift the 128-bit value in a by b, in place.  The value is held as a pair
 * of 64-bit words with the low word at a[0] and the high word at a[1].  If b
 * is positive, shift left; if b is negative, shift right (logically) by -b.
 * A shift of 128 bits or more in either direction yields zero.
 */
static void
dtrace_shift_128(uint64_t *a, int b)
{
        unsigned int n;

        if (b == 0)
                return;

        if (b < 0) {
                /*
                 * Negate in unsigned arithmetic so that even the most
                 * negative int yields a well-defined magnitude.
                 */
                n = 0U - (unsigned int)b;

                if (n >= 128) {
                        a[0] = 0;
                        a[1] = 0;
                } else if (n >= 64) {
                        a[0] = a[1] >> (n - 64);
                        a[1] = 0;
                } else {
                        /*
                         * The low n bits of the high word become the top n
                         * bits of the low word.  Shifting the high word left
                         * by (64 - n) discards everything else, so no mask is
                         * needed (and a mask narrower than n bits would lose
                         * data when n exceeds 32).
                         */
                        a[0] = (a[0] >> n) | (a[1] << (64 - n));
                        a[1] >>= n;
                }
        } else {
                n = (unsigned int)b;

                if (n >= 128) {
                        a[0] = 0;
                        a[1] = 0;
                } else if (n >= 64) {
                        a[1] = a[0] << (n - 64);
                        a[0] = 0;
                } else {
                        /*
                         * The top n bits of the low word become the low n
                         * bits of the high word.
                         */
                        a[1] = (a[1] << n) | (a[0] >> (64 - n));
                        a[0] <<= n;
                }
        }
}
1050 
1051 /*
1052  * The basic idea is to break the 2 64-bit values into 4 32-bit values,
1053  * use native multiplication on those, and then re-combine into the
1054  * resulting 128-bit value.
1055  *
1056  * (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =
1057  *     hi1 * hi2 << 64 +
1058  *     hi1 * lo2 << 32 +
1059  *     hi2 * lo1 << 32 +
1060  *     lo1 * lo2
1061  */
1062 static void
1063 dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)
1064 {
1065         uint64_t hi1, hi2, lo1, lo2;
1066         uint64_t tmp[2];
1067 
1068         hi1 = factor1 >> 32;
1069         hi2 = factor2 >> 32;
1070 
1071         lo1 = factor1 & DT_MASK_LO;
1072         lo2 = factor2 & DT_MASK_LO;
1073 
1074         product[0] = lo1 * lo2;
1075         product[1] = hi1 * hi2;
1076 
1077         tmp[0] = hi1 * lo2;
1078         tmp[1] = 0;
1079         dtrace_shift_128(tmp, 32);
1080         dtrace_add_128(product, tmp, product);
1081 
1082         tmp[0] = hi2 * lo1;
1083         tmp[1] = 0;
1084         dtrace_shift_128(tmp, 32);
1085         dtrace_add_128(product, tmp, product);
1086 }
1087 
1088 /*
1089  * This privilege check should be used by actions and subroutines to
1090  * verify that the user credentials of the process that enabled the
1091  * invoking ECB match the target credentials
1092  */
1093 static int
1094 dtrace_priv_proc_common_user(dtrace_state_t *state)
1095 {
1096         cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1097 
1098         /*
1099          * We should always have a non-NULL state cred here, since if cred
1100          * is null (anonymous tracing), we fast-path bypass this routine.
1101          */
1102         ASSERT(s_cr != NULL);
1103 
1104         if ((cr = CRED()) != NULL &&
1105             s_cr->cr_uid == cr->cr_uid &&
1106             s_cr->cr_uid == cr->cr_ruid &&
1107             s_cr->cr_uid == cr->cr_suid &&
1108             s_cr->cr_gid == cr->cr_gid &&
1109             s_cr->cr_gid == cr->cr_rgid &&
1110             s_cr->cr_gid == cr->cr_sgid)
1111                 return (1);
1112 
1113         return (0);
1114 }
1115 
1116 /*
1117  * This privilege check should be used by actions and subroutines to
1118  * verify that the zone of the process that enabled the invoking ECB
1119  * matches the target credentials
1120  */
1121 static int
1122 dtrace_priv_proc_common_zone(dtrace_state_t *state)
1123 {
1124         cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1125 
1126         /*
1127          * We should always have a non-NULL state cred here, since if cred
1128          * is null (anonymous tracing), we fast-path bypass this routine.
1129          */
1130         ASSERT(s_cr != NULL);
1131 
1132         if ((cr = CRED()) != NULL &&
1133             s_cr->cr_zone == cr->cr_zone)
1134                 return (1);
1135 
1136         return (0);
1137 }
1138 
1139 /*
1140  * This privilege check should be used by actions and subroutines to
1141  * verify that the process has not setuid or changed credentials.
1142  */
1143 static int
1144 dtrace_priv_proc_common_nocd()
1145 {
1146         proc_t *proc;
1147 
1148         if ((proc = ttoproc(curthread)) != NULL &&
1149             !(proc->p_flag & SNOCD))
1150                 return (1);
1151 
1152         return (0);
1153 }
1154 
1155 static int
1156 dtrace_priv_proc_destructive(dtrace_state_t *state, dtrace_mstate_t *mstate)
1157 {
1158         int action = state->dts_cred.dcr_action;
1159 
1160         if (!(mstate->dtms_access & DTRACE_ACCESS_PROC))
1161                 goto bad;
1162 
1163         if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) &&
1164             dtrace_priv_proc_common_zone(state) == 0)
1165                 goto bad;
1166 
1167         if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == 0) &&
1168             dtrace_priv_proc_common_user(state) == 0)
1169                 goto bad;
1170 
1171         if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == 0) &&
1172             dtrace_priv_proc_common_nocd() == 0)
1173                 goto bad;
1174 
1175         return (1);
1176 
1177 bad:
1178         cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1179 
1180         return (0);
1181 }
1182 
1183 static int
1184 dtrace_priv_proc_control(dtrace_state_t *state, dtrace_mstate_t *mstate)
1185 {
1186         if (mstate->dtms_access & DTRACE_ACCESS_PROC) {
1187                 if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL)
1188                         return (1);
1189 
1190                 if (dtrace_priv_proc_common_zone(state) &&
1191                     dtrace_priv_proc_common_user(state) &&
1192                     dtrace_priv_proc_common_nocd())
1193                         return (1);
1194         }
1195 
1196         cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1197 
1198         return (0);
1199 }
1200 
1201 static int
1202 dtrace_priv_proc(dtrace_state_t *state, dtrace_mstate_t *mstate)
1203 {
1204         if ((mstate->dtms_access & DTRACE_ACCESS_PROC) &&
1205             (state->dts_cred.dcr_action & DTRACE_CRA_PROC))
1206                 return (1);
1207 
1208         cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1209 
1210         return (0);
1211 }
1212 
1213 static int
1214 dtrace_priv_kernel(dtrace_state_t *state)
1215 {
1216         if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL)
1217                 return (1);
1218 
1219         cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1220 
1221         return (0);
1222 }
1223 
1224 static int
1225 dtrace_priv_kernel_destructive(dtrace_state_t *state)
1226 {
1227         if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE)
1228                 return (1);
1229 
1230         cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1231 
1232         return (0);
1233 }
1234 
1235 /*
1236  * Determine if the dte_cond of the specified ECB allows for processing of
1237  * the current probe to continue.  Note that this routine may allow continued
1238  * processing, but with access(es) stripped from the mstate's dtms_access
1239  * field.
1240  */
1241 static int
1242 dtrace_priv_probe(dtrace_state_t *state, dtrace_mstate_t *mstate,
1243     dtrace_ecb_t *ecb)
1244 {
1245         dtrace_probe_t *probe = ecb->dte_probe;
1246         dtrace_provider_t *prov = probe->dtpr_provider;
1247         dtrace_pops_t *pops = &prov->dtpv_pops;
1248         int mode = DTRACE_MODE_NOPRIV_DROP;
1249 
1250         ASSERT(ecb->dte_cond);
1251 
1252         if (pops->dtps_mode != NULL) {
1253                 mode = pops->dtps_mode(prov->dtpv_arg,
1254                     probe->dtpr_id, probe->dtpr_arg);
1255 
1256                 ASSERT((mode & DTRACE_MODE_USER) ||
1257                     (mode & DTRACE_MODE_KERNEL));
1258                 ASSERT((mode & DTRACE_MODE_NOPRIV_RESTRICT) ||
1259                     (mode & DTRACE_MODE_NOPRIV_DROP));
1260         }
1261 
1262         /*
1263          * If the dte_cond bits indicate that this consumer is only allowed to
1264          * see user-mode firings of this probe, call the provider's dtps_mode()
1265          * entry point to check that the probe was fired while in a user
1266          * context.  If that's not the case, use the policy specified by the
1267          * provider to determine if we drop the probe or merely restrict
1268          * operation.
1269          */
1270         if (ecb->dte_cond & DTRACE_COND_USERMODE) {
1271                 ASSERT(mode != DTRACE_MODE_NOPRIV_DROP);
1272 
1273                 if (!(mode & DTRACE_MODE_USER)) {
1274                         if (mode & DTRACE_MODE_NOPRIV_DROP)
1275                                 return (0);
1276 
1277                         mstate->dtms_access &= ~DTRACE_ACCESS_ARGS;
1278                 }
1279         }
1280 
1281         /*
1282          * This is more subtle than it looks. We have to be absolutely certain
1283          * that CRED() isn't going to change out from under us so it's only
1284          * legit to examine that structure if we're in constrained situations.
1285          * Currently, the only times we'll this check is if a non-super-user
1286          * has enabled the profile or syscall providers -- providers that
1287          * allow visibility of all processes. For the profile case, the check
1288          * above will ensure that we're examining a user context.
1289          */
1290         if (ecb->dte_cond & DTRACE_COND_OWNER) {
1291                 cred_t *cr;
1292                 cred_t *s_cr = state->dts_cred.dcr_cred;
1293                 proc_t *proc;
1294 
1295                 ASSERT(s_cr != NULL);
1296 
1297                 if ((cr = CRED()) == NULL ||
1298                     s_cr->cr_uid != cr->cr_uid ||
1299                     s_cr->cr_uid != cr->cr_ruid ||
1300                     s_cr->cr_uid != cr->cr_suid ||
1301                     s_cr->cr_gid != cr->cr_gid ||
1302                     s_cr->cr_gid != cr->cr_rgid ||
1303                     s_cr->cr_gid != cr->cr_sgid ||
1304                     (proc = ttoproc(curthread)) == NULL ||
1305                     (proc->p_flag & SNOCD)) {
1306                         if (mode & DTRACE_MODE_NOPRIV_DROP)
1307                                 return (0);
1308 
1309                         mstate->dtms_access &= ~DTRACE_ACCESS_PROC;
1310                 }
1311         }
1312 
1313         /*
1314          * If our dte_cond is set to DTRACE_COND_ZONEOWNER and we are not
1315          * in our zone, check to see if our mode policy is to restrict rather
1316          * than to drop; if to restrict, strip away both DTRACE_ACCESS_PROC
1317          * and DTRACE_ACCESS_ARGS
1318          */
1319         if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
1320                 cred_t *cr;
1321                 cred_t *s_cr = state->dts_cred.dcr_cred;
1322 
1323                 ASSERT(s_cr != NULL);
1324 
1325                 if ((cr = CRED()) == NULL ||
1326                     s_cr->cr_zone->zone_id != cr->cr_zone->zone_id) {
1327                         if (mode & DTRACE_MODE_NOPRIV_DROP)
1328                                 return (0);
1329 
1330                         mstate->dtms_access &=
1331                             ~(DTRACE_ACCESS_PROC | DTRACE_ACCESS_ARGS);
1332                 }
1333         }
1334 
1335         return (1);
1336 }
1337 
1338 /*
1339  * Note:  not called from probe context.  This function is called
1340  * asynchronously (and at a regular interval) from outside of probe context to
1341  * clean the dirty dynamic variable lists on all CPUs.  Dynamic variable
1342  * cleaning is explained in detail in <sys/dtrace_impl.h>.
1343  */
1344 void
1345 dtrace_dynvar_clean(dtrace_dstate_t *dstate)
1346 {
1347         dtrace_dynvar_t *dirty;
1348         dtrace_dstate_percpu_t *dcpu;
1349         dtrace_dynvar_t **rinsep;
1350         int i, j, work = 0;
1351 
1352         for (i = 0; i < NCPU; i++) {
1353                 dcpu = &dstate->dtds_percpu[i];
1354                 rinsep = &dcpu->dtdsc_rinsing;
1355 
1356                 /*
1357                  * If the dirty list is NULL, there is no dirty work to do.
1358                  */
1359                 if (dcpu->dtdsc_dirty == NULL)
1360                         continue;
1361 
1362                 if (dcpu->dtdsc_rinsing != NULL) {
1363                         /*
1364                          * If the rinsing list is non-NULL, then it is because
1365                          * this CPU was selected to accept another CPU's
1366                          * dirty list -- and since that time, dirty buffers
1367                          * have accumulated.  This is a highly unlikely
1368                          * condition, but we choose to ignore the dirty
1369                          * buffers -- they'll be picked up a future cleanse.
1370                          */
1371                         continue;
1372                 }
1373 
1374                 if (dcpu->dtdsc_clean != NULL) {
1375                         /*
1376                          * If the clean list is non-NULL, then we're in a
1377                          * situation where a CPU has done deallocations (we
1378                          * have a non-NULL dirty list) but no allocations (we
1379                          * also have a non-NULL clean list).  We can't simply
1380                          * move the dirty list into the clean list on this
1381                          * CPU, yet we also don't want to allow this condition
1382                          * to persist, lest a short clean list prevent a
1383                          * massive dirty list from being cleaned (which in
1384                          * turn could lead to otherwise avoidable dynamic
1385                          * drops).  To deal with this, we look for some CPU
1386                          * with a NULL clean list, NULL dirty list, and NULL
1387                          * rinsing list -- and then we borrow this CPU to
1388                          * rinse our dirty list.
1389                          */
1390                         for (j = 0; j < NCPU; j++) {
1391                                 dtrace_dstate_percpu_t *rinser;
1392 
1393                                 rinser = &dstate->dtds_percpu[j];
1394 
1395                                 if (rinser->dtdsc_rinsing != NULL)
1396                                         continue;
1397 
1398                                 if (rinser->dtdsc_dirty != NULL)
1399                                         continue;
1400 
1401                                 if (rinser->dtdsc_clean != NULL)
1402                                         continue;
1403 
1404                                 rinsep = &rinser->dtdsc_rinsing;
1405                                 break;
1406                         }
1407 
1408                         if (j == NCPU) {
1409                                 /*
1410                                  * We were unable to find another CPU that
1411                                  * could accept this dirty list -- we are
1412                                  * therefore unable to clean it now.
1413                                  */
1414                                 dtrace_dynvar_failclean++;
1415                                 continue;
1416                         }
1417                 }
1418 
1419                 work = 1;
1420 
1421                 /*
1422                  * Atomically move the dirty list aside.
1423                  */
1424                 do {
1425                         dirty = dcpu->dtdsc_dirty;
1426 
1427                         /*
1428                          * Before we zap the dirty list, set the rinsing list.
1429                          * (This allows for a potential assertion in
1430                          * dtrace_dynvar():  if a free dynamic variable appears
1431                          * on a hash chain, either the dirty list or the
1432                          * rinsing list for some CPU must be non-NULL.)
1433                          */
1434                         *rinsep = dirty;
1435                         dtrace_membar_producer();
1436                 } while (dtrace_casptr(&dcpu->dtdsc_dirty,
1437                     dirty, NULL) != dirty);
1438         }
1439 
1440         if (!work) {
1441                 /*
1442                  * We have no work to do; we can simply return.
1443                  */
1444                 return;
1445         }
1446 
1447         dtrace_sync();
1448 
1449         for (i = 0; i < NCPU; i++) {
1450                 dcpu = &dstate->dtds_percpu[i];
1451 
1452                 if (dcpu->dtdsc_rinsing == NULL)
1453                         continue;
1454 
1455                 /*
1456                  * We are now guaranteed that no hash chain contains a pointer
1457                  * into this dirty list; we can make it clean.
1458                  */
1459                 ASSERT(dcpu->dtdsc_clean == NULL);
1460                 dcpu->dtdsc_clean = dcpu->dtdsc_rinsing;
1461                 dcpu->dtdsc_rinsing = NULL;
1462         }
1463 
1464         /*
1465          * Before we actually set the state to be DTRACE_DSTATE_CLEAN, make
1466          * sure that all CPUs have seen all of the dtdsc_clean pointers.
1467          * This prevents a race whereby a CPU incorrectly decides that
1468          * the state should be something other than DTRACE_DSTATE_CLEAN
1469          * after dtrace_dynvar_clean() has completed.
1470          */
1471         dtrace_sync();
1472 
1473         dstate->dtds_state = DTRACE_DSTATE_CLEAN;
1474 }
1475 
1476 /*
1477  * Depending on the value of the op parameter, this function looks-up,
1478  * allocates or deallocates an arbitrarily-keyed dynamic variable.  If an
1479  * allocation is requested, this function will return a pointer to a
1480  * dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no
1481  * variable can be allocated.  If NULL is returned, the appropriate counter
1482  * will be incremented.
      *
      * dstate supplies the hash table, the chunk size and the per-CPU free,
      * clean and dirty lists.  key is an array of nkeys (non-zero) entries:  an
      * entry whose dttk_size is zero is keyed by the value of dttk_value itself,
      * while a non-zero dttk_size makes dttk_value the address of that many
      * bytes of key data, which must pass dtrace_canload() against mstate and
      * vstate.  dsize is the amount of data space wanted with a new variable
      * and only matters when one is allocated.
      *
      * For DTRACE_DYNVAR_ALLOC and DTRACE_DYNVAR_NOALLOC an existing variable
      * with a matching tuple is returned; only DTRACE_DYNVAR_ALLOC goes on to
      * allocate one when there is no match.  DTRACE_DYNVAR_DEALLOC unlinks a
      * matching variable, pushes it onto a per-CPU dirty list and always
      * returns NULL.  NULL is also returned if a fault is flagged while the
      * key is being hashed.
1483  */
1484 dtrace_dynvar_t *
1485 dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys,
1486     dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op,
1487     dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1488 {
1489         uint64_t hashval = DTRACE_DYNHASH_VALID;
1490         dtrace_dynhash_t *hash = dstate->dtds_hash;
1491         dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL;
1492         processorid_t me = CPU->cpu_id, cpu = me;
1493         dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[me];
1494         size_t bucket, ksize;
1495         size_t chunksize = dstate->dtds_chunksize;
1496         uintptr_t kdata, lock, nstate;
1497         uint_t i;
1498
1499         ASSERT(nkeys != 0);
1500
1501         /*
1502          * Hash the key.  As with aggregations, we use Jenkins' "One-at-a-time"
1503          * algorithm.  For the by-value portions, we perform the algorithm in
1504          * 16-bit chunks (as opposed to 8-bit chunks).  This speeds things up a
1505          * bit, and seems to have only a minute effect on distribution.  For
1506          * the by-reference data, we perform "One-at-a-time" iterating (safely)
1507          * over each referenced byte.  It's painful to do this, but it's much
1508          * better than pathological hash distribution.  The efficacy of the
1509          * hashing algorithm (and a comparison with other algorithms) may be
1510          * found by running the ::dtrace_dynstat MDB dcmd.
1511          */
1512         for (i = 0; i < nkeys; i++) {
1513                 if (key[i].dttk_size == 0) {
1514                         uint64_t val = key[i].dttk_value;
1515
1516                         hashval += (val >> 48) & 0xffff;
1517                         hashval += (hashval << 10);
1518                         hashval ^= (hashval >> 6);
1519
1520                         hashval += (val >> 32) & 0xffff;
1521                         hashval += (hashval << 10);
1522                         hashval ^= (hashval >> 6);
1523
1524                         hashval += (val >> 16) & 0xffff;
1525                         hashval += (hashval << 10);
1526                         hashval ^= (hashval >> 6);
1527
1528                         hashval += val & 0xffff;
1529                         hashval += (hashval << 10);
1530                         hashval ^= (hashval >> 6);
1531                 } else {
1532                         /*
1533                          * This is incredibly painful, but it beats the hell
1534                          * out of the alternative.
1535                          */
1536                         uint64_t j, size = key[i].dttk_size;
1537                         uintptr_t base = (uintptr_t)key[i].dttk_value;
1538
1539                         if (!dtrace_canload(base, size, mstate, vstate))
1540                                 break;
1541
1542                         for (j = 0; j < size; j++) {
1543                                 hashval += dtrace_load8(base + j);
1544                                 hashval += (hashval << 10);
1545                                 hashval ^= (hashval >> 6);
1546                         }
1547                 }
1548         }
1549
             /*
              * If a fault was flagged while hashing (presumably by the
              * dtrace_canload() check that breaks out of the loop above, or by
              * one of the loads), give up before touching the hash table.
              */
1550         if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT))
1551                 return (NULL);
1552
1553         hashval += (hashval << 3);
1554         hashval ^= (hashval >> 11);
1555         hashval += (hashval << 15);
1556
1557         /*
1558          * There is a remote chance (ideally, 1 in 2^31) that our hashval
1559          * comes out to be one of our two sentinel hash values.  If this
1560          * actually happens, we set the hashval to be a value known to be a
1561          * non-sentinel value.
1562          */
1563         if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK)
1564                 hashval = DTRACE_DYNHASH_VALID;
1565
1566         /*
1567          * Yes, it's painful to do a divide here.  If the cycle count becomes
1568          * important here, tricks can be pulled to reduce it.  (However, it's
1569          * critical that hash collisions be kept to an absolute minimum;
1570          * they're much more painful than a divide.)  It's better to have a
1571          * solution that generates few collisions and still keeps things
1572          * relatively simple.
1573          */
1574         bucket = hashval % dstate->dtds_hashsize;
1575
1576         if (op == DTRACE_DYNVAR_DEALLOC) {
1577                 volatile uintptr_t *lockp = &hash[bucket].dtdh_lock;
1578
                     /*
                      * Take the bucket lock.  The low bit of the lock word
                      * being set means that the lock is held:  we spin until
                      * the bit is clear and then try to atomically increment
                      * the word, which sets the bit.  The unlock paths below
                      * increment the word once more, clearing the bit while
                      * leaving the word different from any value that a
                      * concurrent lookup may have snapshotted.
                      */
1579                 for (;;) {
1580                         while ((lock = *lockp) & 1)
1581                                 continue;
1582
1583                         if (dtrace_casptr((void *)lockp,
1584                             (void *)lock, (void *)(lock + 1)) == (void *)lock)
1585                                 break;
1586                 }
1587
1588                 dtrace_membar_producer();
1589         }
1590
1591 top:
             /*
              * Snapshot the lock word before reading the chain head; the
              * DTRACE_DYNVAR_NOALLOC path below compares against this snapshot
              * to decide whether a failed lookup can be trusted.
              */
1592         prev = NULL;
1593         lock = hash[bucket].dtdh_lock;
1594
1595         dtrace_membar_consumer();
1596
1597         start = hash[bucket].dtdh_chain;
1598         ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK ||
1599             start->dtdv_hashval != DTRACE_DYNHASH_FREE ||
1600             op != DTRACE_DYNVAR_DEALLOC));
1601
1602         for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) {
1603                 dtrace_tuple_t *dtuple = &dvar->dtdv_tuple;
1604                 dtrace_key_t *dkey = &dtuple->dtt_key[0];
1605
1606                 if (dvar->dtdv_hashval != hashval) {
1607                         if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) {
1608                                 /*
1609                                  * We've reached the sink, and therefore the
1610                                  * end of the hash chain; we can kick out of
1611                                  * the loop knowing that we have seen a valid
1612                                  * snapshot of state.
1613                                  */
1614                                 ASSERT(dvar->dtdv_next == NULL);
1615                                 ASSERT(dvar == &dtrace_dynhash_sink);
1616                                 break;
1617                         }
1618
1619                         if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) {
1620                                 /*
1621                                  * We've gone off the rails:  somewhere along
1622                                  * the line, one of the members of this hash
1623                                  * chain was deleted.  Note that we could also
1624                                  * detect this by simply letting this loop run
1625                                  * to completion, as we would eventually hit
1626                                  * the end of the dirty list.  However, we
1627                                  * want to avoid running the length of the
1628                                  * dirty list unnecessarily (it might be quite
1629                                  * long), so we catch this as early as
1630                                  * possible by detecting the hash marker.  In
1631                                  * this case, we simply set dvar to NULL and
1632                                  * break; the conditional after the loop will
1633                                  * send us back to top.
1634                                  */
1635                                 dvar = NULL;
1636                                 break;
1637                         }
1638
1639                         goto next;
1640                 }
1641
1642                 if (dtuple->dtt_nkeys != nkeys)
1643                         goto next;
1644
1645                 for (i = 0; i < nkeys; i++, dkey++) {
1646                         if (dkey->dttk_size != key[i].dttk_size)
1647                                 goto next; /* size or type mismatch */
1648
1649                         if (dkey->dttk_size != 0) {
1650                                 if (dtrace_bcmp(
1651                                     (void *)(uintptr_t)key[i].dttk_value,
1652                                     (void *)(uintptr_t)dkey->dttk_value,
1653                                     dkey->dttk_size))
1654                                         goto next;
1655                         } else {
1656                                 if (dkey->dttk_value != key[i].dttk_value)
1657                                         goto next;
1658                         }
1659                 }
1660
1661                 if (op != DTRACE_DYNVAR_DEALLOC)
1662                         return (dvar);
1663
1664                 ASSERT(dvar->dtdv_next == NULL ||
1665                     dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE);
1666
1667                 if (prev != NULL) {
1668                         ASSERT(hash[bucket].dtdh_chain != dvar);
1669                         ASSERT(start != dvar);
1670                         ASSERT(prev->dtdv_next == dvar);
1671                         prev->dtdv_next = dvar->dtdv_next;
1672                 } else {
1673                         if (dtrace_casptr(&hash[bucket].dtdh_chain,
1674                             start, dvar->dtdv_next) != start) {
1675                                 /*
1676                                  * We have failed to atomically swing the
1677                                  * hash table head pointer, presumably because
1678                                  * of a conflicting allocation on another CPU.
1679                                  * We need to reread the hash chain and try
1680                                  * again.
1681                                  */
1682                                 goto top;
1683                         }
1684                 }
1685
1686                 dtrace_membar_producer();
1687
1688                 /*
1689                  * Now set the hash value to indicate that it's free.
1690                  */
1691                 ASSERT(hash[bucket].dtdh_chain != dvar);
1692                 dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
1693
1694                 dtrace_membar_producer();
1695
1696                 /*
1697                  * Set the next pointer to point at the dirty list, and
1698                  * atomically swing the dirty pointer to the newly freed dvar.
1699                  */
1700                 do {
1701                         next = dcpu->dtdsc_dirty;
1702                         dvar->dtdv_next = next;
1703                 } while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next);
1704
1705                 /*
1706                  * Finally, unlock this hash bucket.
1707                  */
1708                 ASSERT(hash[bucket].dtdh_lock == lock);
1709                 ASSERT(lock & 1);
1710                 hash[bucket].dtdh_lock++;
1711
1712                 return (NULL);
1713 next:
                     /*
                      * No match:  remember this element as the predecessor so
                      * that a later match can be unlinked from it.
                      */
1714                 prev = dvar;
1715                 continue;
1716         }
1717
1718         if (dvar == NULL) {
1719                 /*
1720                  * If dvar is NULL, it is because we went off the rails:
1721                  * one of the elements that we traversed in the hash chain
1722                  * was deleted while we were traversing it.  In this case,
1723                  * we assert that we aren't doing a dealloc (deallocs lock
1724                  * the hash bucket to prevent themselves from racing with
1725                  * one another), and retry the hash chain traversal.
1726                  */
1727                 ASSERT(op != DTRACE_DYNVAR_DEALLOC);
1728                 goto top;
1729         }
1730
1731         if (op != DTRACE_DYNVAR_ALLOC) {
1732                 /*
1733                  * If we are not to allocate a new variable, we want to
1734                  * return NULL now.  Before we return, check that the value
1735                  * of the lock word hasn't changed.  If it has, we may have
1736                  * seen an inconsistent snapshot.
1737                  */
1738                 if (op == DTRACE_DYNVAR_NOALLOC) {
1739                         if (hash[bucket].dtdh_lock != lock)
1740                                 goto top;
1741                 } else {
1742                         ASSERT(op == DTRACE_DYNVAR_DEALLOC);
1743                         ASSERT(hash[bucket].dtdh_lock == lock);
1744                         ASSERT(lock & 1);
1745                         hash[bucket].dtdh_lock++;
1746                 }
1747
1748                 return (NULL);
1749         }
1750
1751         /*
1752          * We need to allocate a new dynamic variable.  The size we need is the
1753          * size of dtrace_dynvar plus the size of nkeys dtrace_key_t's plus the
1754          * size of any auxiliary key data (rounded up to 8-byte alignment) plus
1755          * the size of any referred-to data (dsize).  We then round the final
1756          * size up to the chunksize for allocation.
1757          */
1758         for (ksize = 0, i = 0; i < nkeys; i++)
1759                 ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
1760
1761         /*
1762          * This should be pretty much impossible, but could happen if, say,
1763          * strange DIF specified the tuple.  Ideally, this should be an
1764          * assertion and not an error condition -- but that requires that the
1765          * chunksize calculation in dtrace_difo_chunksize() be absolutely
1766          * bullet-proof.  (That is, it must not be able to be fooled by
1767          * malicious DIF.)  Given the lack of backwards branches in DIF,
1768          * solving this would presumably not amount to solving the Halting
1769          * Problem -- but it still seems awfully hard.
1770          */
1771         if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) +
1772             ksize + dsize > chunksize) {
1773                 dcpu->dtdsc_drops++;
1774                 return (NULL);
1775         }
1776
             /*
              * nstate accumulates the state that we will try to move the
              * dynamic variable state to (from DTRACE_DSTATE_CLEAN) if a full
              * lap over the CPUs finds neither a free nor a clean chunk.
              */
1777         nstate = DTRACE_DSTATE_EMPTY;
1778
1779         do {
1780 retry:
1781                 free = dcpu->dtdsc_free;
1782
1783                 if (free == NULL) {
1784                         dtrace_dynvar_t *clean = dcpu->dtdsc_clean;
1785                         void *rval;
1786
1787                         if (clean == NULL) {
1788                                 /*
1789                                  * We're out of dynamic variable space on
1790                                  * this CPU.  Unless we have tried all CPUs,
1791                                  * we'll try to allocate from a different
1792                                  * CPU.
1793                                  */
1794                                 switch (dstate->dtds_state) {
1795                                 case DTRACE_DSTATE_CLEAN: {
1796                                         void *sp = &dstate->dtds_state;
1797
1798                                         if (++cpu >= NCPU)
1799                                                 cpu = 0;
1800
1801                                         if (dcpu->dtdsc_dirty != NULL &&
1802                                             nstate == DTRACE_DSTATE_EMPTY)
1803                                                 nstate = DTRACE_DSTATE_DIRTY;
1804
1805                                         if (dcpu->dtdsc_rinsing != NULL)
1806                                                 nstate = DTRACE_DSTATE_RINSING;
1807
1808                                         dcpu = &dstate->dtds_percpu[cpu];
1809
1810                                         if (cpu != me)
1811                                                 goto retry;
1812
1813                                         (void) dtrace_cas32(sp,
1814                                             DTRACE_DSTATE_CLEAN, nstate);
1815
1816                                         /*
1817                                          * To increment the correct bean
1818                                          * counter, take another lap.
1819                                          */
1820                                         goto retry;
1821                                 }
1822
1823                                 case DTRACE_DSTATE_DIRTY:
1824                                         dcpu->dtdsc_dirty_drops++;
1825                                         break;
1826
1827                                 case DTRACE_DSTATE_RINSING:
1828                                         dcpu->dtdsc_rinsing_drops++;
1829                                         break;
1830
1831                                 case DTRACE_DSTATE_EMPTY:
1832                                         dcpu->dtdsc_drops++;
1833                                         break;
1834                                 }
1835
1836                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP);
1837                                 return (NULL);
1838                         }
1839
1840                         /*
1841                          * The clean list appears to be non-empty.  We want to
1842                          * move the clean list to the free list; we start by
1843                          * moving the clean pointer aside.
1844                          */
1845                         if (dtrace_casptr(&dcpu->dtdsc_clean,
1846                             clean, NULL) != clean) {
1847                                 /*
1848                                  * We are in one of two situations:
1849                                  *
1850                                  *  (a) The clean list was switched to the
1851                                  *      free list by another CPU.
1852                                  *
1853                                  *  (b) The clean list was added to by the
1854                                  *      cleansing cyclic.
1855                                  *
1856                                  * In either of these situations, we can
1857                                  * just reattempt the free list allocation.
1858                                  */
1859                                 goto retry;
1860                         }
1861
1862                         ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);
1863
1864                         /*
1865                          * Now we'll move the clean list to our free list.
1866                          * It's impossible for this to fail:  the only way
1867                          * the free list can be updated is through this
1868                          * code path, and only one CPU can own the clean list.
1869                          * Thus, it would only be possible for this to fail if
1870                          * this code were racing with dtrace_dynvar_clean().
1871                          * (That is, if dtrace_dynvar_clean() updated the clean
1872                          * list, and we ended up racing to update the free
1873                          * list.)  This race is prevented by the dtrace_sync()
1874                          * in dtrace_dynvar_clean() -- which flushes the
1875                          * owners of the clean lists out before resetting
1876                          * the clean lists.
1877                          */
1878                         dcpu = &dstate->dtds_percpu[me];
1879                         rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
1880                         ASSERT(rval == NULL);
1881                         goto retry;
1882                 }
1883
1884                 dvar = free;
1885                 new_free = dvar->dtdv_next;
1886         } while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free);
1887
1888         /*
1889          * We have now allocated a new chunk.  We copy the tuple keys into the
1890          * tuple array and copy any referenced key data into the data space
1891          * following the tuple array.  As we do this, we relocate dttk_value
1892          * in the final tuple to point to the key data address in the chunk.
1893          */
1894         kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys];
1895         dvar->dtdv_data = (void *)(kdata + ksize);
1896         dvar->dtdv_tuple.dtt_nkeys = nkeys;
1897
1898         for (i = 0; i < nkeys; i++) {
1899                 dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i];
1900                 size_t kesize = key[i].dttk_size;
1901
1902                 if (kesize != 0) {
1903                         dtrace_bcopy(
1904                             (const void *)(uintptr_t)key[i].dttk_value,
1905                             (void *)kdata, kesize);
1906                         dkey->dttk_value = kdata;
1907                         kdata += P2ROUNDUP(kesize, sizeof (uint64_t));
1908                 } else {
1909                         dkey->dttk_value = key[i].dttk_value;
1910                 }
1911
1912                 dkey->dttk_size = kesize;
1913         }
1914
             /*
              * Stamp the chunk with its hash value, link it in front of the
              * chain head that we traversed, and publish it by atomically
              * swinging the bucket's chain pointer from that head to the new
              * variable.
              */
1915         ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE);
1916         dvar->dtdv_hashval = hashval;
1917         dvar->dtdv_next = start;
1918
1919         if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start)
1920                 return (dvar);
1921
1922         /*
1923          * The cas has failed.  Either another CPU is adding an element to
1924          * this hash chain, or another CPU is deleting an element from this
1925          * hash chain.  The simplest way to deal with both of these cases
1926          * (though not necessarily the most efficient) is to free our
1927          * allocated block and tail-call ourselves.  Note that the free is
1928          * to the dirty list and _not_ to the free list.  This is to prevent
1929          * races with allocators, above.
1930          */
1931         dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
1932
1933         dtrace_membar_producer();
1934
1935         do {
1936                 free = dcpu->dtdsc_dirty;
1937                 dvar->dtdv_next = free;
1938         } while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free);
1939
1940         return (dtrace_dynvar(dstate, nkeys, key, dsize, op, mstate, vstate));
1941 }
1942 
/*
 * min() aggregating action:  keep the smaller of the stored value and the new
 * value, comparing both as signed 64-bit integers.  The stored value is only
 * replaced by a strictly smaller one.  arg is unused.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)
{
        int64_t current = (int64_t)*oval;
        int64_t candidate = (int64_t)nval;

        if (candidate >= current)
                return;

        *oval = nval;
}
1950 
/*
 * max() aggregating action:  keep the larger of the stored value and the new
 * value, comparing both as signed 64-bit integers.  The stored value is only
 * replaced by a strictly larger one.  arg is unused.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg)
{
        int64_t current = (int64_t)*oval;
        int64_t candidate = (int64_t)nval;

        if (candidate <= current)
                return;

        *oval = nval;
}
1958 
1959 static void
1960 dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr)
1961 {
1962         int i, zero = DTRACE_QUANTIZE_ZEROBUCKET;
1963         int64_t val = (int64_t)nval;
1964 
1965         if (val < 0) {
1966                 for (i = 0; i < zero; i++) {
1967                         if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) {
1968                                 quanta[i] += incr;
1969                                 return;
1970                         }
1971                 }
1972         } else {
1973                 for (i = zero + 1; i < DTRACE_QUANTIZE_NBUCKETS; i++) {
1974                         if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) {
1975                                 quanta[i - 1] += incr;
1976                                 return;
1977                         }
1978                 }
1979 
1980                 quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr;
1981                 return;
1982         }
1983 
1984         ASSERT(0);
1985 }
1986 
/*
 * lquantize() aggregating action.  The first word of lquanta holds the
 * encoded lquantize() argument, from which the base, the step and the number
 * of levels are decoded.  The words that follow it are the buckets:  one
 * underflow bucket, then one bucket per level, then one overflow bucket.
 * The bucket that nval falls into is incremented by incr.
 *
 * nval is treated as a signed 64-bit value.  It is deliberately _not_
 * narrowed to 32 bits:  doing so would make a value outside of the 32-bit
 * range alias a small value and land in the wrong bucket instead of in the
 * underflow or overflow bucket.
 */
static void
dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr)
{
        uint64_t arg = *lquanta++;
        int32_t base = DTRACE_LQUANTIZE_BASE(arg);
        uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
        uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
        int64_t val = (int64_t)nval;
        uint64_t level;

        ASSERT(step != 0);
        ASSERT(levels != 0);

        if (val < base) {
                /*
                 * This is an underflow.
                 */
                lquanta[0] += incr;
                return;
        }

        /*
         * We know that val >= base, so the distance between the two is
         * non-negative and always fits in 64 unsigned bits.  Computing it
         * in unsigned arithmetic avoids signed overflow when val is very
         * large and base is negative.
         */
        level = ((uint64_t)val - (uint64_t)(int64_t)base) / step;

        if (level < levels) {
                lquanta[level + 1] += incr;
                return;
        }

        /*
         * This is an overflow.
         */
        lquanta[levels + 1] += incr;
}
2019 
/*
 * Return the index of the log/linear bucket that value falls into, given the
 * factor, the low and high orders of magnitude and the number of steps per
 * order of magnitude.  Bucket zero takes the values that are less than the
 * factor raised to the low magnitude.  The index returned when the loop over
 * the orders of magnitude runs to completion is the top bucket.
 *
 * All of the powers of the factor are checked for 64-bit overflow _before_
 * they are computed (by comparing against the largest value that can still
 * be multiplied by the factor):  signed overflow is undefined in C, and a
 * wrapped product is not guaranteed to be smaller than its operand, so the
 * result of the multiplication cannot be used to detect the overflow.
 */
static int
dtrace_aggregate_llquantize_bucket(uint16_t factor, uint16_t low,
    uint16_t high, uint16_t nsteps, int64_t value)
{
        const int64_t max = (int64_t)(~(uint64_t)0 >> 1);
        const int64_t limit = (factor != 0) ? max / factor : max;
        int64_t this = 1, last, next;
        int base = 1, order;

        ASSERT(factor <= nsteps);
        ASSERT(nsteps % factor == 0);

        for (order = 0; order < low; order++) {
                if (this > limit) {
                        /*
                         * Our factor taken to the power of the low order of
                         * magnitude does not fit in 64 bits; every value is
                         * necessarily less than it.
                         */
                        return (0);
                }

                this *= factor;
        }

        /*
         * If our value is less than our factor taken to the power of the
         * low order of magnitude, it goes into the zeroth bucket.
         */
        if (value < (last = this))
                return (0);

        if (this > limit) {
                /*
                 * The upper bound of the very first order of magnitude
                 * cannot be represented; we clamp to the first bucket above
                 * the zeroth bucket.
                 */
                return (base);
        }

        for (this *= factor; order <= high; order++) {
                int nbuckets = this > nsteps ? nsteps : this;

                if (this > limit) {
                        /*
                         * We should not generally get log/linear quantizations
                         * with a high magnitude that allows 64-bits to
                         * overflow, but we nonetheless protect against this
                         * by explicitly checking for overflow, and clamping
                         * our value accordingly.  (The value of next is not
                         * used in this case, as the clamped value always
                         * lies within this order of magnitude.)
                         */
                        value = this - 1;
                        next = this;
                } else {
                        next = this * factor;
                }

                if (value < this) {
                        /*
                         * If our value lies within this order of magnitude,
                         * determine its position by taking the offset within
                         * the order of magnitude, dividing by the bucket
                         * width, and adding to our (accumulated) base.
                         */
                        return (base + (value - last) / (this / nbuckets));
                }

                base += nbuckets - (nbuckets / factor);
                last = this;
                this = next;
        }

        /*
         * Our value is greater than or equal to our factor taken to the
         * power of one plus the high magnitude -- return the top bucket.
         */
        return (base);
}
2075 
/*
 * llquantize() aggregating action.  The first word of llquanta holds the
 * encoded llquantize() argument; the words that follow it are the buckets.
 * The factor, low magnitude, high magnitude and number of steps are decoded
 * from the argument and handed to dtrace_aggregate_llquantize_bucket() to
 * select the bucket that is incremented by incr.
 */
static void
dtrace_aggregate_llquantize(uint64_t *llquanta, uint64_t nval, uint64_t incr)
{
        uint64_t arg = llquanta[0];
        uint64_t *buckets = llquanta + 1;
        int ndx;

        ndx = dtrace_aggregate_llquantize_bucket(
            DTRACE_LLQUANTIZE_FACTOR(arg), DTRACE_LLQUANTIZE_LOW(arg),
            DTRACE_LLQUANTIZE_HIGH(arg), DTRACE_LLQUANTIZE_NSTEP(arg), nval);

        buckets[ndx] += incr;
}
2088 
/*
 * avg() aggregating action:  data[0] counts the values seen and data[1]
 * accumulates their sum.  arg is unused.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
{
        uint64_t *count = &data[0];
        uint64_t *total = &data[1];

        *count += 1;
        *total = *total + nval;
}
2096 
/*
 * stddev() aggregating action:  data[0] counts the values seen, data[1]
 * accumulates their sum, and the two words starting at data[2] accumulate
 * the sum of their squares as a 128-bit quantity.  arg is unused.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg)
{
        uint64_t magnitude = nval;
        uint64_t tmp[2];

        data[0]++;
        data[1] += nval;

        /*
         * What we want to say here is:
         *
         * data[2] += nval * nval;
         *
         * But given that nval is 64-bit, we could easily overflow, so
         * we do this as 128-bit arithmetic.  We square the magnitude of the
         * (signed) value; the magnitude is computed with unsigned
         * arithmetic because negating the most negative 64-bit value as a
         * signed quantity would overflow.
         */
        if ((int64_t)nval < 0)
                magnitude = (uint64_t)0 - nval;

        dtrace_multiply_128(magnitude, magnitude, tmp);
        dtrace_add_128(data + 2, tmp, data + 2);
}
2121 
/*
 * count() aggregating action:  bump the stored count by one.  Neither nval
 * nor arg is used.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
{
        (*oval)++;
}
2128 
/*
 * sum() aggregating action:  add the new value to the stored total.  arg is
 * unused.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg)
{
        uint64_t total = *oval;

        total += nval;
        *oval = total;
}
2135 
2136 /*
2137  * Aggregate given the tuple in the principal data buffer, and the aggregating
2138  * action denoted by the specified dtrace_aggregation_t.  The aggregation
2139  * buffer is specified as the buf parameter.  This routine does not return
2140  * failure; if there is no space in the aggregation buffer, the data will be
2141  * dropped, and a corresponding counter incremented.
2142  */
2143 static void
2144 dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf,
2145     intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg)
2146 {
2147         dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec;
2148         uint32_t i, ndx, size, fsize;
2149         uint32_t align = sizeof (uint64_t) - 1;
2150         dtrace_aggbuffer_t *agb;
2151         dtrace_aggkey_t *key;
2152         uint32_t hashval = 0, limit, isstr;
2153         caddr_t tomax, data, kdata;
2154         dtrace_actkind_t action;
2155         dtrace_action_t *act;
2156         uintptr_t offs;
2157 
2158         if (buf == NULL)
2159                 return;
2160 
2161         if (!agg->dtag_hasarg) {
2162                 /*
2163                  * Currently, only quantize() and lquantize() take additional
2164                  * arguments, and they have the same semantics:  an increment
2165                  * value that defaults to 1 when not present.  If additional
2166                  * aggregating actions take arguments, the setting of the
2167                  * default argument value will presumably have to become more
2168                  * sophisticated...
2169                  */
2170                 arg = 1;
2171         }
2172 
2173         action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION;
2174         size = rec->dtrd_offset - agg->dtag_base;
2175         fsize = size + rec->dtrd_size;
2176 
2177         ASSERT(dbuf->dtb_tomax != NULL);
2178         data = dbuf->dtb_tomax + offset + agg->dtag_base;
2179 
2180         if ((tomax = buf->dtb_tomax) == NULL) {
2181                 dtrace_buffer_drop(buf);
2182                 return;
2183         }
2184 
2185         /*
2186          * The metastructure is always at the bottom of the buffer.
2187          */
2188         agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size -
2189             sizeof (dtrace_aggbuffer_t));
2190 
2191         if (buf->dtb_offset == 0) {
2192                 /*
2193                  * We just kludge up approximately 1/8th of the size to be
2194                  * buckets.  If this guess ends up being routinely
2195                  * off-the-mark, we may need to dynamically readjust this
2196                  * based on past performance.
2197                  */
2198                 uintptr_t hashsize = (buf->dtb_size >> 3) / sizeof (uintptr_t);
2199 
2200                 if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) <
2201                     (uintptr_t)tomax || hashsize == 0) {
2202                         /*
2203                          * We've been given a ludicrously small buffer;
2204                          * increment our drop count and leave.
2205                          */
2206                         dtrace_buffer_drop(buf);
2207                         return;
2208                 }
2209 
2210                 /*
2211                  * And now, a pathetic attempt to try to get a an odd (or
2212                  * perchance, a prime) hash size for better hash distribution.
2213                  */
2214                 if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3))
2215                         hashsize -= DTRACE_AGGHASHSIZE_SLEW;
2216 
2217                 agb->dtagb_hashsize = hashsize;
2218                 agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb -
2219                     agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *));
2220                 agb->dtagb_free = (uintptr_t)agb->dtagb_hash;
2221 
2222                 for (i = 0; i < agb->dtagb_hashsize; i++)
2223                         agb->dtagb_hash[i] = NULL;
2224         }
2225 
2226         ASSERT(agg->dtag_first != NULL);
2227         ASSERT(agg->dtag_first->dta_intuple);
2228 
2229         /*
2230          * Calculate the hash value based on the key.  Note that we _don't_
2231          * include the aggid in the hashing (but we will store it as part of
2232          * the key).  The hashing algorithm is Bob Jenkins' "One-at-a-time"
2233          * algorithm: a simple, quick algorithm that has no known funnels, and
2234          * gets good distribution in practice.  The efficacy of the hashing
2235          * algorithm (and a comparison with other algorithms) may be found by
2236          * running the ::dtrace_aggstat MDB dcmd.
2237          */
2238         for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2239                 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2240                 limit = i + act->dta_rec.dtrd_size;
2241                 ASSERT(limit <= size);
2242                 isstr = DTRACEACT_ISSTRING(act);
2243 
2244                 for (; i < limit; i++) {
2245                         hashval += data[i];
2246                         hashval += (hashval << 10);
2247                         hashval ^= (hashval >> 6);
2248 
2249                         if (isstr && data[i] == '\0')
2250                                 break;
2251                 }
2252         }
2253 
2254         hashval += (hashval << 3);
2255         hashval ^= (hashval >> 11);
2256         hashval += (hashval << 15);
2257 
2258         /*
2259          * Yes, the divide here is expensive -- but it's generally the least
2260          * of the performance issues given the amount of data that we iterate
2261          * over to compute hash values, compare data, etc.
2262          */
2263         ndx = hashval % agb->dtagb_hashsize;
2264 
2265         for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) {
2266                 ASSERT((caddr_t)key >= tomax);
2267                 ASSERT((caddr_t)key < tomax + buf->dtb_size);
2268 
2269                 if (hashval != key->dtak_hashval || key->dtak_size != size)
2270                         continue;
2271 
2272                 kdata = key->dtak_data;
2273                 ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size);
2274 
2275                 for (act = agg->dtag_first; act->dta_intuple;
2276                     act = act->dta_next) {
2277                         i = act->dta_rec.dtrd_offset - agg->dtag_base;
2278                         limit = i + act->dta_rec.dtrd_size;
2279                         ASSERT(limit <= size);
2280                         isstr = DTRACEACT_ISSTRING(act);
2281 
2282                         for (; i < limit; i++) {
2283                                 if (kdata[i] != data[i])
2284                                         goto next;
2285 
2286                                 if (isstr && data[i] == '\0')
2287                                         break;
2288                         }
2289                 }
2290 
2291                 if (action != key->dtak_action) {
2292                         /*
2293                          * We are aggregating on the same value in the same
2294                          * aggregation with two different aggregating actions.
2295                          * (This should have been picked up in the compiler,
2296                          * so we may be dealing with errant or devious DIF.)
2297                          * This is an error condition; we indicate as much,
2298                          * and return.
2299                          */
2300                         DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
2301                         return;
2302                 }
2303 
2304                 /*
2305                  * This is a hit:  we need to apply the aggregator to
2306                  * the value at this key.
2307                  */
2308                 agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg);
2309                 return;
2310 next:
2311                 continue;
2312         }
2313 
2314         /*
2315          * We didn't find it.  We need to allocate some zero-filled space,
2316          * link it into the hash table appropriately, and apply the aggregator
2317          * to the (zero-filled) value.
2318          */
2319         offs = buf->dtb_offset;
2320         while (offs & (align - 1))
2321                 offs += sizeof (uint32_t);
2322 
2323         /*
2324          * If we don't have enough room to both allocate a new key _and_
2325          * its associated data, increment the drop count and return.
2326          */
2327         if ((uintptr_t)tomax + offs + fsize >
2328             agb->dtagb_free - sizeof (dtrace_aggkey_t)) {
2329                 dtrace_buffer_drop(buf);
2330                 return;
2331         }
2332 
2333         /*CONSTCOND*/
2334         ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - 1)));
2335         key = (dtrace_aggkey_t *)(agb->dtagb_free - sizeof (dtrace_aggkey_t));
2336         agb->dtagb_free -= sizeof (dtrace_aggkey_t);
2337 
2338         key->dtak_data = kdata = tomax + offs;
2339         buf->dtb_offset = offs + fsize;
2340 
2341         /*
2342          * Now copy the data across.
2343          */
2344         *((dtrace_aggid_t *)kdata) = agg->dtag_id;
2345 
2346         for (i = sizeof (dtrace_aggid_t); i < size; i++)
2347                 kdata[i] = data[i];
2348 
2349         /*
2350          * Because strings are not zeroed out by default, we need to iterate
2351          * looking for actions that store strings, and we need to explicitly
2352          * pad these strings out with zeroes.
2353          */
2354         for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2355                 int nul;
2356 
2357                 if (!DTRACEACT_ISSTRING(act))
2358                         continue;
2359 
2360                 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2361                 limit = i + act->dta_rec.dtrd_size;
2362                 ASSERT(limit <= size);
2363 
2364                 for (nul = 0; i < limit; i++) {
2365                         if (nul) {
2366                                 kdata[i] = '\0';
2367                                 continue;
2368                         }
2369 
2370                         if (data[i] != '\0')
2371                                 continue;
2372 
2373                         nul = 1;
2374                 }
2375         }
2376 
2377         for (i = size; i < fsize; i++)
2378                 kdata[i] = 0;
2379 
2380         key->dtak_hashval = hashval;
2381         key->dtak_size = size;
2382         key->dtak_action = action;
2383         key->dtak_next = agb->dtagb_hash[ndx];
2384         agb->dtagb_hash[ndx] = key;
2385 
2386         /*
2387          * Finally, apply the aggregator.
2388          */
2389         *((uint64_t *)(key->dtak_data + size)) = agg->dtag_initial;
2390         agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg);
2391 }
2392 
2393 /*
2394  * Given consumer state, this routine finds a speculation in the INACTIVE
2395  * state and transitions it into the ACTIVE state.  If there is no speculation
2396  * in the INACTIVE state, 0 is returned.  In this case, no error counter is
2397  * incremented -- it is up to the caller to take appropriate action.
2398  */
2399 static int
2400 dtrace_speculation(dtrace_state_t *state)
2401 {
2402         int i = 0;
2403         dtrace_speculation_state_t current;
2404         uint32_t *stat = &state->dts_speculations_unavail, count;
2405 
2406         while (i < state->dts_nspeculations) {
2407                 dtrace_speculation_t *spec = &state->dts_speculations[i];
2408 
2409                 current = spec->dtsp_state;
2410 
2411                 if (current != DTRACESPEC_INACTIVE) {
2412                         if (current == DTRACESPEC_COMMITTINGMANY ||
2413                             current == DTRACESPEC_COMMITTING ||
2414                             current == DTRACESPEC_DISCARDING)
2415                                 stat = &state->dts_speculations_busy;
2416                         i++;
2417                         continue;
2418                 }
2419 
2420                 if (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2421                     current, DTRACESPEC_ACTIVE) == current)
2422                         return (i + 1);
2423         }
2424 
2425         /*
2426          * We couldn't find a speculation.  If we found as much as a single
2427          * busy speculation buffer, we'll attribute this failure as "busy"
2428          * instead of "unavail".
2429          */
2430         do {
2431                 count = *stat;
2432         } while (dtrace_cas32(stat, count, count + 1) != count);
2433 
2434         return (0);
2435 }
2436 
2437 /*
2438  * This routine commits an active speculation.  If the specified speculation
2439  * is not in a valid state to perform a commit(), this routine will silently do
2440  * nothing.  The state of the specified speculation is transitioned according
2441  * to the state transition diagram outlined in <sys/dtrace_impl.h>
2442  */
2443 static void
2444 dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
2445     dtrace_specid_t which)
2446 {
2447         dtrace_speculation_t *spec;
2448         dtrace_buffer_t *src, *dest;
2449         uintptr_t daddr, saddr, dlimit;
2450         dtrace_speculation_state_t current, new;
2451         intptr_t offs;
2452 
2453         if (which == 0)
2454                 return;
2455 
2456         if (which > state->dts_nspeculations) {
2457                 cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2458                 return;
2459         }
2460 
2461         spec = &state->dts_speculations[which - 1];
2462         src = &spec->dtsp_buffer[cpu];
2463         dest = &state->dts_buffer[cpu];
2464 
2465         do {
2466                 current = spec->dtsp_state;
2467 
2468                 if (current == DTRACESPEC_COMMITTINGMANY)
2469                         break;
2470 
2471                 switch (current) {
2472                 case DTRACESPEC_INACTIVE:
2473                 case DTRACESPEC_DISCARDING:
2474                         return;
2475 
2476                 case DTRACESPEC_COMMITTING:
2477                         /*
2478                          * This is only possible if we are (a) commit()'ing
2479                          * without having done a prior speculate() on this CPU
2480                          * and (b) racing with another commit() on a different
2481                          * CPU.  There's nothing to do -- we just assert that
2482                          * our offset is 0.
2483                          */
2484                         ASSERT(src->dtb_offset == 0);
2485                         return;
2486 
2487                 case DTRACESPEC_ACTIVE:
2488                         new = DTRACESPEC_COMMITTING;
2489                         break;
2490 
2491                 case DTRACESPEC_ACTIVEONE:
2492                         /*
2493                          * This speculation is active on one CPU.  If our
2494                          * buffer offset is non-zero, we know that the one CPU
2495                          * must be us.  Otherwise, we are committing on a
2496                          * different CPU from the speculate(), and we must
2497                          * rely on being asynchronously cleaned.
2498                          */
2499                         if (src->dtb_offset != 0) {
2500                                 new = DTRACESPEC_COMMITTING;
2501                                 break;
2502                         }
2503                         /*FALLTHROUGH*/
2504 
2505                 case DTRACESPEC_ACTIVEMANY:
2506                         new = DTRACESPEC_COMMITTINGMANY;
2507                         break;
2508 
2509                 default:
2510                         ASSERT(0);
2511                 }
2512         } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2513             current, new) != current);
2514 
2515         /*
2516          * We have set the state to indicate that we are committing this
2517          * speculation.  Now reserve the necessary space in the destination
2518          * buffer.
2519          */
2520         if ((offs = dtrace_buffer_reserve(dest, src->dtb_offset,
2521             sizeof (uint64_t), state, NULL)) < 0) {
2522                 dtrace_buffer_drop(dest);
2523                 goto out;
2524         }
2525 
2526         /*
2527          * We have the space; copy the buffer across.  (Note that this is a
2528          * highly subobtimal bcopy(); in the unlikely event that this becomes
2529          * a serious performance issue, a high-performance DTrace-specific
2530          * bcopy() should obviously be invented.)
2531          */
2532         daddr = (uintptr_t)dest->dtb_tomax + offs;
2533         dlimit = daddr + src->dtb_offset;
2534         saddr = (uintptr_t)src->dtb_tomax;
2535 
2536         /*
2537          * First, the aligned portion.
2538          */
2539         while (dlimit - daddr >= sizeof (uint64_t)) {
2540                 *((uint64_t *)daddr) = *((uint64_t *)saddr);
2541 
2542                 daddr += sizeof (uint64_t);
2543                 saddr += sizeof (uint64_t);
2544         }
2545 
2546         /*
2547          * Now any left-over bit...
2548          */
2549         while (dlimit - daddr)
2550                 *((uint8_t *)daddr++) = *((uint8_t *)saddr++);
2551 
2552         /*
2553          * Finally, commit the reserved space in the destination buffer.
2554          */
2555         dest->dtb_offset = offs + src->dtb_offset;
2556 
2557 out:
2558         /*
2559          * If we're lucky enough to be the only active CPU on this speculation
2560          * buffer, we can just set the state back to DTRACESPEC_INACTIVE.
2561          */
2562         if (current == DTRACESPEC_ACTIVE ||
2563             (current == DTRACESPEC_ACTIVEONE && new == DTRACESPEC_COMMITTING)) {
2564                 uint32_t rval = dtrace_cas32((uint32_t *)&spec->dtsp_state,
2565                     DTRACESPEC_COMMITTING, DTRACESPEC_INACTIVE);
2566 
2567                 ASSERT(rval == DTRACESPEC_COMMITTING);
2568         }
2569 
2570         src->dtb_offset = 0;
2571         src->dtb_xamot_drops += src->dtb_drops;
2572         src->dtb_drops = 0;
2573 }
2574 
2575 /*
2576  * This routine discards an active speculation.  If the specified speculation
2577  * is not in a valid state to perform a discard(), this routine will silently
2578  * do nothing.  The state of the specified speculation is transitioned
2579  * according to the state transition diagram outlined in <sys/dtrace_impl.h>
2580  */
2581 static void
2582 dtrace_speculation_discard(dtrace_state_t *state, processorid_t cpu,
2583     dtrace_specid_t which)
2584 {
2585         dtrace_speculation_t *spec;
2586         dtrace_speculation_state_t current, new;
2587         dtrace_buffer_t *buf;
2588 
2589         if (which == 0)
2590                 return;
2591 
2592         if (which > state->dts_nspeculations) {
2593                 cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2594                 return;
2595         }
2596 
2597         spec = &state->dts_speculations[which - 1];
2598         buf = &spec->dtsp_buffer[cpu];
2599 
2600         do {
2601                 current = spec->dtsp_state;
2602 
2603                 switch (current) {
2604                 case DTRACESPEC_INACTIVE:
2605                 case DTRACESPEC_COMMITTINGMANY:
2606                 case DTRACESPEC_COMMITTING:
2607                 case DTRACESPEC_DISCARDING:
2608                         return;
2609 
2610                 case DTRACESPEC_ACTIVE:
2611                 case DTRACESPEC_ACTIVEMANY:
2612                         new = DTRACESPEC_DISCARDING;
2613                         break;
2614 
2615                 case DTRACESPEC_ACTIVEONE:
2616                         if (buf->dtb_offset != 0) {
2617                                 new = DTRACESPEC_INACTIVE;
2618                         } else {
2619                                 new = DTRACESPEC_DISCARDING;
2620                         }
2621                         break;
2622 
2623                 default:
2624                         ASSERT(0);
2625                 }
2626         } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2627             current, new) != current);
2628 
2629         buf->dtb_offset = 0;
2630         buf->dtb_drops = 0;
2631 }
2632 
2633 /*
2634  * Note:  not called from probe context.  This function is called
2635  * asynchronously from cross call context to clean any speculations that are
2636  * in the COMMITTINGMANY or DISCARDING states.  These speculations may not be
2637  * transitioned back to the INACTIVE state until all CPUs have cleaned the
2638  * speculation.
2639  */
2640 static void
2641 dtrace_speculation_clean_here(dtrace_state_t *state)
2642 {
2643         dtrace_icookie_t cookie;
2644         processorid_t cpu = CPU->cpu_id;
2645         dtrace_buffer_t *dest = &state->dts_buffer[cpu];
2646         dtrace_specid_t i;
2647 
2648         cookie = dtrace_interrupt_disable();
2649 
2650         if (dest->dtb_tomax == NULL) {
2651                 dtrace_interrupt_enable(cookie);
2652                 return;
2653         }
2654 
2655         for (i = 0; i < state->dts_nspeculations; i++) {
2656                 dtrace_speculation_t *spec = &state->dts_speculations[i];
2657                 dtrace_buffer_t *src = &spec->dtsp_buffer[cpu];
2658 
2659                 if (src->dtb_tomax == NULL)
2660                         continue;
2661 
2662                 if (spec->dtsp_state == DTRACESPEC_DISCARDING) {
2663                         src->dtb_offset = 0;
2664                         continue;
2665                 }
2666 
2667                 if (spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2668                         continue;
2669 
2670                 if (src->dtb_offset == 0)
2671                         continue;
2672 
2673                 dtrace_speculation_commit(state, cpu, i + 1);
2674         }
2675 
2676         dtrace_interrupt_enable(cookie);
2677 }
2678 
2679 /*
2680  * Note:  not called from probe context.  This function is called
2681  * asynchronously (and at a regular interval) to clean any speculations that
2682  * are in the COMMITTINGMANY or DISCARDING states.  If it discovers that there
2683  * is work to be done, it cross calls all CPUs to perform that work;
2684  * COMMITMANY and DISCARDING speculations may not be transitioned back to the
2685  * INACTIVE state until they have been cleaned by all CPUs.
2686  */
2687 static void
2688 dtrace_speculation_clean(dtrace_state_t *state)
2689 {
2690         int work = 0, rv;
2691         dtrace_specid_t i;
2692 
2693         for (i = 0; i < state->dts_nspeculations; i++) {
2694                 dtrace_speculation_t *spec = &state->dts_speculations[i];
2695 
2696                 ASSERT(!spec->dtsp_cleaning);
2697 
2698                 if (spec->dtsp_state != DTRACESPEC_DISCARDING &&
2699                     spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2700                         continue;
2701 
2702                 work++;
2703                 spec->dtsp_cleaning = 1;
2704         }
2705 
2706         if (!work)
2707                 return;
2708 
2709         dtrace_xcall(DTRACE_CPUALL,
2710             (dtrace_xcall_t)dtrace_speculation_clean_here, state);
2711 
2712         /*
2713          * We now know that all CPUs have committed or discarded their
2714          * speculation buffers, as appropriate.  We can now set the state
2715          * to inactive.
2716          */
2717         for (i = 0; i < state->dts_nspeculations; i++) {
2718                 dtrace_speculation_t *spec = &state->dts_speculations[i];
2719                 dtrace_speculation_state_t current, new;
2720 
2721                 if (!spec->dtsp_cleaning)
2722                         continue;
2723 
2724                 current = spec->dtsp_state;
2725                 ASSERT(current == DTRACESPEC_DISCARDING ||
2726                     current == DTRACESPEC_COMMITTINGMANY);
2727 
2728                 new = DTRACESPEC_INACTIVE;
2729 
2730                 rv = dtrace_cas32((uint32_t *)&spec->dtsp_state, current, new);
2731                 ASSERT(rv == current);
2732                 spec->dtsp_cleaning = 0;
2733         }
2734 }
2735 
2736 /*
2737  * Called as part of a speculate() to get the speculative buffer associated
2738  * with a given speculation.  Returns NULL if the specified speculation is not
2739  * in an ACTIVE state.  If the speculation is in the ACTIVEONE state -- and
2740  * the active CPU is not the specified CPU -- the speculation will be
2741  * atomically transitioned into the ACTIVEMANY state.
2742  */
2743 static dtrace_buffer_t *
2744 dtrace_speculation_buffer(dtrace_state_t *state, processorid_t cpuid,
2745     dtrace_specid_t which)
2746 {
2747         dtrace_speculation_t *spec;
2748         dtrace_speculation_state_t current, new;
2749         dtrace_buffer_t *buf;
2750 
2751         if (which == 0)
2752                 return (NULL);
2753 
2754         if (which > state->dts_nspeculations) {
2755                 cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2756                 return (NULL);
2757         }
2758 
2759         spec = &state->dts_speculations[which - 1];
2760         buf = &spec->dtsp_buffer[cpuid];
2761 
2762         do {
2763                 current = spec->dtsp_state;
2764 
2765                 switch (current) {
2766                 case DTRACESPEC_INACTIVE:
2767                 case DTRACESPEC_COMMITTINGMANY:
2768                 case DTRACESPEC_DISCARDING:
2769                         return (NULL);
2770 
2771                 case DTRACESPEC_COMMITTING:
2772                         ASSERT(buf->dtb_offset == 0);
2773                         return (NULL);
2774 
2775                 case DTRACESPEC_ACTIVEONE:
2776                         /*
2777                          * This speculation is currently active on one CPU.
2778                          * Check the offset in the buffer; if it's non-zero,
2779                          * that CPU must be us (and we leave the state alone).
2780                          * If it's zero, assume that we're starting on a new
2781                          * CPU -- and change the state to indicate that the
2782                          * speculation is active on more than one CPU.
2783                          */
2784                         if (buf->dtb_offset != 0)
2785                                 return (buf);
2786 
2787                         new = DTRACESPEC_ACTIVEMANY;
2788                         break;
2789 
2790                 case DTRACESPEC_ACTIVEMANY:
2791                         return (buf);
2792 
2793                 case DTRACESPEC_ACTIVE:
2794                         new = DTRACESPEC_ACTIVEONE;
2795                         break;
2796 
2797                 default:
2798                         ASSERT(0);
2799                 }
2800         } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2801             current, new) != current);
2802 
2803         ASSERT(new == DTRACESPEC_ACTIVEONE || new == DTRACESPEC_ACTIVEMANY);
2804         return (buf);
2805 }
2806 
2807 /*
2808  * Return a string.  In the event that the user lacks the privilege to access
2809  * arbitrary kernel memory, we copy the string out to scratch memory so that we
2810  * don't fail access checking.
2811  *
2812  * dtrace_dif_variable() uses this routine as a helper for various
2813  * builtin values such as 'execname' and 'probefunc.'
2814  */
2815 uintptr_t
2816 dtrace_dif_varstr(uintptr_t addr, dtrace_state_t *state,
2817     dtrace_mstate_t *mstate)
2818 {
2819         uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
2820         uintptr_t ret;
2821         size_t strsz;
2822 
2823         /*
2824          * The easy case: this probe is allowed to read all of memory, so
2825          * we can just return this as a vanilla pointer.
2826          */
2827         if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
2828                 return (addr);
2829 
2830         /*
2831          * This is the tougher case: we copy the string in question from
2832          * kernel memory into scratch memory and return it that way: this
2833          * ensures that we won't trip up when access checking tests the
2834          * BYREF return value.
2835          */
2836         strsz = dtrace_strlen((char *)addr, size) + 1;
2837 
2838         if (mstate->dtms_scratch_ptr + strsz >
2839             mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
2840                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
2841                 return (NULL);
2842         }
2843 
2844         dtrace_strcpy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
2845             strsz);
2846         ret = mstate->dtms_scratch_ptr;
2847         mstate->dtms_scratch_ptr += strsz;
2848         return (ret);
2849 }
2850 
2851 /*
2852  * This function implements the DIF emulator's variable lookups.  The emulator
2853  * passes a reserved variable identifier and optional built-in array index.
2854  */
2855 static uint64_t
2856 dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v,
2857     uint64_t ndx)
2858 {
2859         /*
2860          * If we're accessing one of the uncached arguments, we'll turn this
2861          * into a reference in the args array.
2862          */
2863         if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9) {
2864                 ndx = v - DIF_VAR_ARG0;
2865                 v = DIF_VAR_ARGS;
2866         }
2867 
2868         switch (v) {
2869         case DIF_VAR_ARGS:
2870                 if (!(mstate->dtms_access & DTRACE_ACCESS_ARGS)) {
2871                         cpu_core[CPU->cpu_id].cpuc_dtrace_flags |=
2872                             CPU_DTRACE_KPRIV;
2873                         return (0);
2874                 }
2875 
2876                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS);
2877                 if (ndx >= sizeof (mstate->dtms_arg) /
2878                     sizeof (mstate->dtms_arg[0])) {
2879                         int aframes = mstate->dtms_probe->dtpr_aframes + 2;
2880                         dtrace_provider_t *pv;
2881                         uint64_t val;
2882 
2883                         pv = mstate->dtms_probe->dtpr_provider;
2884                         if (pv->dtpv_pops.dtps_getargval != NULL)
2885                                 val = pv->dtpv_pops.dtps_getargval(pv->dtpv_arg,
2886                                     mstate->dtms_probe->dtpr_id,
2887                                     mstate->dtms_probe->dtpr_arg, ndx, aframes);
2888                         else
2889                                 val = dtrace_getarg(ndx, aframes);
2890 
2891                         /*
2892                          * This is regrettably required to keep the compiler
2893                          * from tail-optimizing the call to dtrace_getarg().
2894                          * The condition always evaluates to true, but the
2895                          * compiler has no way of figuring that out a priori.
2896                          * (None of this would be necessary if the compiler
2897                          * could be relied upon to _always_ tail-optimize
2898                          * the call to dtrace_getarg() -- but it can't.)
2899                          */
2900                         if (mstate->dtms_probe != NULL)
2901                                 return (val);
2902 
2903                         ASSERT(0);
2904                 }
2905 
2906                 return (mstate->dtms_arg[ndx]);
2907 
2908         case DIF_VAR_UREGS: {
2909                 klwp_t *lwp;
2910 
2911                 if (!dtrace_priv_proc(state, mstate))
2912                         return (0);
2913 
2914                 if ((lwp = curthread->t_lwp) == NULL) {
2915                         DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
2916                         cpu_core[CPU->cpu_id].cpuc_dtrace_illval = NULL;
2917                         return (0);
2918                 }
2919 
2920                 return (dtrace_getreg(lwp->lwp_regs, ndx));
2921         }
2922 
2923         case DIF_VAR_VMREGS: {
2924                 uint64_t rval;
2925 
2926                 if (!dtrace_priv_kernel(state))
2927                         return (0);
2928 
2929                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
2930 
2931                 rval = dtrace_getvmreg(ndx,
2932                     &cpu_core[CPU->cpu_id].cpuc_dtrace_flags);
2933 
2934                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
2935 
2936                 return (rval);
2937         }
2938 
2939         case DIF_VAR_CURTHREAD:
2940                 if (!dtrace_priv_proc(state, mstate))
2941                         return (0);
2942                 return ((uint64_t)(uintptr_t)curthread);
2943 
2944         case DIF_VAR_TIMESTAMP:
2945                 if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
2946                         mstate->dtms_timestamp = dtrace_gethrtime();
2947                         mstate->dtms_present |= DTRACE_MSTATE_TIMESTAMP;
2948                 }
2949                 return (mstate->dtms_timestamp);
2950 
2951         case DIF_VAR_VTIMESTAMP:
2952                 ASSERT(dtrace_vtime_references != 0);
2953                 return (curthread->t_dtrace_vtime);
2954 
2955         case DIF_VAR_WALLTIMESTAMP:
2956                 if (!(mstate->dtms_present & DTRACE_MSTATE_WALLTIMESTAMP)) {
2957                         mstate->dtms_walltimestamp = dtrace_gethrestime();
2958                         mstate->dtms_present |= DTRACE_MSTATE_WALLTIMESTAMP;
2959                 }
2960                 return (mstate->dtms_walltimestamp);
2961 
2962         case DIF_VAR_IPL:
2963                 if (!dtrace_priv_kernel(state))
2964                         return (0);
2965                 if (!(mstate->dtms_present & DTRACE_MSTATE_IPL)) {
2966                         mstate->dtms_ipl = dtrace_getipl();
2967                         mstate->dtms_present |= DTRACE_MSTATE_IPL;
2968                 }
2969                 return (mstate->dtms_ipl);
2970 
2971         case DIF_VAR_EPID:
2972                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_EPID);
2973                 return (mstate->dtms_epid);
2974 
2975         case DIF_VAR_ID:
2976                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
2977                 return (mstate->dtms_probe->dtpr_id);
2978 
2979         case DIF_VAR_STACKDEPTH:
2980                 if (!dtrace_priv_kernel(state))
2981                         return (0);
2982                 if (!(mstate->dtms_present & DTRACE_MSTATE_STACKDEPTH)) {
2983                         int aframes = mstate->dtms_probe->dtpr_aframes + 2;
2984 
2985                         mstate->dtms_stackdepth = dtrace_getstackdepth(aframes);
2986                         mstate->dtms_present |= DTRACE_MSTATE_STACKDEPTH;
2987                 }
2988                 return (mstate->dtms_stackdepth);
2989 
2990         case DIF_VAR_USTACKDEPTH:
2991                 if (!dtrace_priv_proc(state, mstate))
2992                         return (0);
2993                 if (!(mstate->dtms_present & DTRACE_MSTATE_USTACKDEPTH)) {
2994                         /*
2995                          * See comment in DIF_VAR_PID.
2996                          */
2997                         if (DTRACE_ANCHORED(mstate->dtms_probe) &&
2998                             CPU_ON_INTR(CPU)) {
2999                                 mstate->dtms_ustackdepth = 0;
3000                         } else {
3001                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3002                                 mstate->dtms_ustackdepth =
3003                                     dtrace_getustackdepth();
3004                                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3005                         }
3006                         mstate->dtms_present |= DTRACE_MSTATE_USTACKDEPTH;
3007                 }
3008                 return (mstate->dtms_ustackdepth);
3009 
3010         case DIF_VAR_CALLER:
3011                 if (!dtrace_priv_kernel(state))
3012                         return (0);
3013                 if (!(mstate->dtms_present & DTRACE_MSTATE_CALLER)) {
3014                         int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3015 
3016                         if (!DTRACE_ANCHORED(mstate->dtms_probe)) {
3017                                 /*
3018                                  * If this is an unanchored probe, we are
3019                                  * required to go through the slow path:
3020                                  * dtrace_caller() only guarantees correct
3021                                  * results for anchored probes.
3022                                  */
3023                                 pc_t caller[2];
3024 
3025                                 dtrace_getpcstack(caller, 2, aframes,
3026                                     (uint32_t *)(uintptr_t)mstate->dtms_arg[0]);
3027                                 mstate->dtms_caller = caller[1];
3028                         } else if ((mstate->dtms_caller =
3029                             dtrace_caller(aframes)) == -1) {
3030                                 /*
3031                                  * We have failed to do this the quick way;
3032                                  * we must resort to the slower approach of
3033                                  * calling dtrace_getpcstack().
3034                                  */
3035                                 pc_t caller;
3036 
3037                                 dtrace_getpcstack(&caller, 1, aframes, NULL);
3038                                 mstate->dtms_caller = caller;
3039                         }
3040 
3041                         mstate->dtms_present |= DTRACE_MSTATE_CALLER;
3042                 }
3043                 return (mstate->dtms_caller);
3044 
3045         case DIF_VAR_UCALLER:
3046                 if (!dtrace_priv_proc(state, mstate))
3047                         return (0);
3048 
3049                 if (!(mstate->dtms_present & DTRACE_MSTATE_UCALLER)) {
3050                         uint64_t ustack[3];
3051 
3052                         /*
3053                          * dtrace_getupcstack() fills in the first uint64_t
3054                          * with the current PID.  The second uint64_t will
3055                          * be the program counter at user-level.  The third
3056                          * uint64_t will contain the caller, which is what
3057                          * we're after.
3058                          */
3059                         ustack[2] = NULL;
3060                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3061                         dtrace_getupcstack(ustack, 3);
3062                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3063                         mstate->dtms_ucaller = ustack[2];
3064                         mstate->dtms_present |= DTRACE_MSTATE_UCALLER;
3065                 }
3066 
3067                 return (mstate->dtms_ucaller);
3068 
3069         case DIF_VAR_PROBEPROV:
3070                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3071                 return (dtrace_dif_varstr(
3072                     (uintptr_t)mstate->dtms_probe->dtpr_provider->dtpv_name,
3073                     state, mstate));
3074 
3075         case DIF_VAR_PROBEMOD:
3076                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3077                 return (dtrace_dif_varstr(
3078                     (uintptr_t)mstate->dtms_probe->dtpr_mod,
3079                     state, mstate));
3080 
3081         case DIF_VAR_PROBEFUNC:
3082                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3083                 return (dtrace_dif_varstr(
3084                     (uintptr_t)mstate->dtms_probe->dtpr_func,
3085                     state, mstate));
3086 
3087         case DIF_VAR_PROBENAME:
3088                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3089                 return (dtrace_dif_varstr(
3090                     (uintptr_t)mstate->dtms_probe->dtpr_name,
3091                     state, mstate));
3092 
3093         case DIF_VAR_PID:
3094                 if (!dtrace_priv_proc(state, mstate))
3095                         return (0);
3096 
3097                 /*
3098                  * Note that we are assuming that an unanchored probe is
3099                  * always due to a high-level interrupt.  (And we're assuming
3100                  * that there is only a single high level interrupt.)
3101                  */
3102                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3103                         return (pid0.pid_id);
3104 
3105                 /*
3106                  * It is always safe to dereference one's own t_procp pointer:
3107                  * it always points to a valid, allocated proc structure.
3108                  * Further, it is always safe to dereference the p_pidp member
3109                  * of one's own proc structure.  (These are truisms because
3110                  * threads and processes don't clean up their own state --
3111                  * they leave that task to whomever reaps them.)
3112                  */
3113                 return ((uint64_t)curthread->t_procp->p_pidp->pid_id);
3114 
3115         case DIF_VAR_PPID:
3116                 if (!dtrace_priv_proc(state, mstate))
3117                         return (0);
3118 
3119                 /*
3120                  * See comment in DIF_VAR_PID.
3121                  */
3122                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3123                         return (pid0.pid_id);
3124 
3125                 /*
3126                  * It is always safe to dereference one's own t_procp pointer:
3127                  * it always points to a valid, allocated proc structure.
3128                  * (This is true because threads don't clean up their own
3129                  * state -- they leave that task to whomever reaps them.)
3130                  */
3131                 return ((uint64_t)curthread->t_procp->p_ppid);
3132 
3133         case DIF_VAR_TID:
3134                 /*
3135                  * See comment in DIF_VAR_PID.
3136                  */
3137                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3138                         return (0);
3139 
3140                 return ((uint64_t)curthread->t_tid);
3141 
3142         case DIF_VAR_EXECNAME:
3143                 if (!dtrace_priv_proc(state, mstate))
3144                         return (0);
3145 
3146                 /*
3147                  * See comment in DIF_VAR_PID.
3148                  */
3149                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3150                         return ((uint64_t)(uintptr_t)p0.p_user.u_comm);
3151 
3152                 /*
3153                  * It is always safe to dereference one's own t_procp pointer:
3154                  * it always points to a valid, allocated proc structure.
3155                  * (This is true because threads don't clean up their own
3156                  * state -- they leave that task to whomever reaps them.)
3157                  */
3158                 return (dtrace_dif_varstr(
3159                     (uintptr_t)curthread->t_procp->p_user.u_comm,
3160                     state, mstate));
3161 
3162         case DIF_VAR_ZONENAME:
3163                 if (!dtrace_priv_proc(state, mstate))
3164                         return (0);
3165 
3166                 /*
3167                  * See comment in DIF_VAR_PID.
3168                  */
3169                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3170                         return ((uint64_t)(uintptr_t)p0.p_zone->zone_name);
3171 
3172                 /*
3173                  * It is always safe to dereference one's own t_procp pointer:
3174                  * it always points to a valid, allocated proc structure.
3175                  * (This is true because threads don't clean up their own
3176                  * state -- they leave that task to whomever reaps them.)
3177                  */
3178                 return (dtrace_dif_varstr(
3179                     (uintptr_t)curthread->t_procp->p_zone->zone_name,
3180                     state, mstate));
3181 
3182         case DIF_VAR_UID:
3183                 if (!dtrace_priv_proc(state, mstate))
3184                         return (0);
3185 
3186                 /*
3187                  * See comment in DIF_VAR_PID.
3188                  */
3189                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3190                         return ((uint64_t)p0.p_cred->cr_uid);
3191 
3192                 /*
3193                  * It is always safe to dereference one's own t_procp pointer:
3194                  * it always points to a valid, allocated proc structure.
3195                  * (This is true because threads don't clean up their own
3196                  * state -- they leave that task to whomever reaps them.)
3197                  *
3198                  * Additionally, it is safe to dereference one's own process
3199                  * credential, since this is never NULL after process birth.
3200                  */
3201                 return ((uint64_t)curthread->t_procp->p_cred->cr_uid);
3202 
3203         case DIF_VAR_GID:
3204                 if (!dtrace_priv_proc(state, mstate))
3205                         return (0);
3206 
3207                 /*
3208                  * See comment in DIF_VAR_PID.
3209                  */
3210                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3211                         return ((uint64_t)p0.p_cred->cr_gid);
3212 
3213                 /*
3214                  * It is always safe to dereference one's own t_procp pointer:
3215                  * it always points to a valid, allocated proc structure.
3216                  * (This is true because threads don't clean up their own
3217                  * state -- they leave that task to whomever reaps them.)
3218                  *
3219                  * Additionally, it is safe to dereference one's own process
3220                  * credential, since this is never NULL after process birth.
3221                  */
3222                 return ((uint64_t)curthread->t_procp->p_cred->cr_gid);
3223 
3224         case DIF_VAR_ERRNO: {
3225                 klwp_t *lwp;
3226                 if (!dtrace_priv_proc(state, mstate))
3227                         return (0);
3228 
3229                 /*
3230                  * See comment in DIF_VAR_PID.
3231                  */
3232                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3233                         return (0);
3234 
3235                 /*
3236                  * It is always safe to dereference one's own t_lwp pointer in
3237                  * the event that this pointer is non-NULL.  (This is true
3238                  * because threads and lwps don't clean up their own state --
3239                  * they leave that task to whomever reaps them.)
3240                  */
3241                 if ((lwp = curthread->t_lwp) == NULL)
3242                         return (0);
3243 
3244                 return ((uint64_t)lwp->lwp_errno);
3245         }
3246         default:
3247                 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3248                 return (0);
3249         }
3250 }
3251 
3252 /*
3253  * Emulate the execution of DIF subroutines invoked by the call opcode.
3254  * Notice that we don't bother validating the proper number of arguments or
3255  * their types in the tuple stack.  This isn't needed because all argument
3256  * interpretation is safe because of our load safety -- the worst that can
3257  * happen is that a bogus program can obtain bogus results.
3258  */
3259 static void
3260 dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs,
3261     dtrace_key_t *tupregs, int nargs,
3262     dtrace_mstate_t *mstate, dtrace_state_t *state)
3263 {
3264         volatile uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
3265         volatile uintptr_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
3266         dtrace_vstate_t *vstate = &state->dts_vstate;
3267 
3268         union {
3269                 mutex_impl_t mi;
3270                 uint64_t mx;
3271         } m;
3272 
3273         union {
3274                 krwlock_t ri;
3275                 uintptr_t rw;
3276         } r;
3277 
3278         switch (subr) {
3279         case DIF_SUBR_RAND:
3280                 regs[rd] = (dtrace_gethrtime() * 2416 + 374441) % 1771875;
3281                 break;
3282 
3283         case DIF_SUBR_MUTEX_OWNED:
3284                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3285                     mstate, vstate)) {
3286                         regs[rd] = NULL;
3287                         break;
3288                 }
3289 
3290                 m.mx = dtrace_load64(tupregs[0].dttk_value);
3291                 if (MUTEX_TYPE_ADAPTIVE(&m.mi))
3292                         regs[rd] = MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER;
3293                 else
3294                         regs[rd] = LOCK_HELD(&m.mi.m_spin.m_spinlock);
3295                 break;
3296 
3297         case DIF_SUBR_MUTEX_OWNER:
3298                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3299                     mstate, vstate)) {
3300                         regs[rd] = NULL;
3301                         break;
3302                 }
3303 
3304                 m.mx = dtrace_load64(tupregs[0].dttk_value);
3305                 if (MUTEX_TYPE_ADAPTIVE(&m.mi) &&
3306                     MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER)
3307                         regs[rd] = (uintptr_t)MUTEX_OWNER(&m.mi);
3308                 else
3309                         regs[rd] = 0;
3310                 break;
3311 
3312         case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
3313                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3314                     mstate, vstate)) {
3315                         regs[rd] = NULL;
3316                         break;
3317                 }
3318 
3319                 m.mx = dtrace_load64(tupregs[0].dttk_value);
3320                 regs[rd] = MUTEX_TYPE_ADAPTIVE(&m.mi);
3321                 break;
3322 
3323         case DIF_SUBR_MUTEX_TYPE_SPIN:
3324                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3325                     mstate, vstate)) {
3326                         regs[rd] = NULL;
3327                         break;
3328                 }
3329 
3330                 m.mx = dtrace_load64(tupregs[0].dttk_value);
3331                 regs[rd] = MUTEX_TYPE_SPIN(&m.mi);
3332                 break;
3333 
3334         case DIF_SUBR_RW_READ_HELD: {
3335                 uintptr_t tmp;
3336 
3337                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
3338                     mstate, vstate)) {
3339                         regs[rd] = NULL;
3340                         break;
3341                 }
3342 
3343                 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3344                 regs[rd] = _RW_READ_HELD(&r.ri, tmp);
3345                 break;
3346         }
3347 
3348         case DIF_SUBR_RW_WRITE_HELD:
3349                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
3350                     mstate, vstate)) {
3351                         regs[rd] = NULL;
3352                         break;
3353                 }
3354 
3355                 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3356                 regs[rd] = _RW_WRITE_HELD(&r.ri);
3357                 break;
3358 
3359         case DIF_SUBR_RW_ISWRITER:
3360                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
3361                     mstate, vstate)) {
3362                         regs[rd] = NULL;
3363                         break;
3364                 }
3365 
3366                 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3367                 regs[rd] = _RW_ISWRITER(&r.ri);
3368                 break;
3369 
3370         case DIF_SUBR_BCOPY: {
3371                 /*
3372                  * We need to be sure that the destination is in the scratch
3373                  * region -- no other region is allowed.
3374                  */
3375                 uintptr_t src = tupregs[0].dttk_value;
3376                 uintptr_t dest = tupregs[1].dttk_value;
3377                 size_t size = tupregs[2].dttk_value;
3378 
3379                 if (!dtrace_inscratch(dest, size, mstate)) {
3380                         *flags |= CPU_DTRACE_BADADDR;
3381                         *illval = regs[rd];
3382                         break;
3383                 }
3384 
3385                 if (!dtrace_canload(src, size, mstate, vstate)) {
3386                         regs[rd] = NULL;
3387                         break;
3388                 }
3389 
3390                 dtrace_bcopy((void *)src, (void *)dest, size);
3391                 break;
3392         }
3393 
3394         case DIF_SUBR_ALLOCA:
3395         case DIF_SUBR_COPYIN: {
3396                 uintptr_t dest = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
3397                 uint64_t size =
3398                     tupregs[subr == DIF_SUBR_ALLOCA ? 0 : 1].dttk_value;
3399                 size_t scratch_size = (dest - mstate->dtms_scratch_ptr) + size;
3400 
3401                 /*
3402                  * This action doesn't require any credential checks since
3403                  * probes will not activate in user contexts to which the
3404                  * enabling user does not have permissions.
3405                  */
3406 
3407                 /*
3408                  * Rounding up the user allocation size could have overflowed
3409                  * a large, bogus allocation (like -1ULL) to 0.
3410                  */
3411                 if (scratch_size < size ||
3412                     !DTRACE_INSCRATCH(mstate, scratch_size)) {
3413                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3414                         regs[rd] = NULL;
3415                         break;
3416                 }
3417 
3418                 if (subr == DIF_SUBR_COPYIN) {
3419                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3420                         dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
3421                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3422                 }
3423 
3424                 mstate->dtms_scratch_ptr += scratch_size;
3425                 regs[rd] = dest;
3426                 break;
3427         }
3428 
3429         case DIF_SUBR_COPYINTO: {
3430                 uint64_t size = tupregs[1].dttk_value;
3431                 uintptr_t dest = tupregs[2].dttk_value;
3432 
3433                 /*
3434                  * This action doesn't require any credential checks since
3435                  * probes will not activate in user contexts to which the
3436                  * enabling user does not have permissions.
3437                  */
3438                 if (!dtrace_inscratch(dest, size, mstate)) {
3439                         *flags |= CPU_DTRACE_BADADDR;
3440                         *illval = regs[rd];
3441                         break;
3442                 }
3443 
3444                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3445                 dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
3446                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3447                 break;
3448         }
3449 
3450         case DIF_SUBR_COPYINSTR: {
3451                 uintptr_t dest = mstate->dtms_scratch_ptr;
3452                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3453 
3454                 if (nargs > 1 && tupregs[1].dttk_value < size)
3455                         size = tupregs[1].dttk_value + 1;
3456 
3457                 /*
3458                  * This action doesn't require any credential checks since
3459                  * probes will not activate in user contexts to which the
3460                  * enabling user does not have permissions.
3461                  */
3462                 if (!DTRACE_INSCRATCH(mstate, size)) {
3463                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3464                         regs[rd] = NULL;
3465                         break;
3466                 }
3467 
3468                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3469                 dtrace_copyinstr(tupregs[0].dttk_value, dest, size, flags);
3470                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3471 
3472                 ((char *)dest)[size - 1] = '\0';
3473                 mstate->dtms_scratch_ptr += size;
3474                 regs[rd] = dest;
3475                 break;
3476         }
3477 
3478         case DIF_SUBR_MSGSIZE:
3479         case DIF_SUBR_MSGDSIZE: {
3480                 uintptr_t baddr = tupregs[0].dttk_value, daddr;
3481                 uintptr_t wptr, rptr;
3482                 size_t count = 0;
3483                 int cont = 0;
3484 
3485                 while (baddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
3486 
3487                         if (!dtrace_canload(baddr, sizeof (mblk_t), mstate,
3488                             vstate)) {
3489                                 regs[rd] = NULL;
3490                                 break;
3491                         }
3492 
3493                         wptr = dtrace_loadptr(baddr +
3494                             offsetof(mblk_t, b_wptr));
3495 
3496                         rptr = dtrace_loadptr(baddr +
3497                             offsetof(mblk_t, b_rptr));
3498 
3499                         if (wptr < rptr) {
3500                                 *flags |= CPU_DTRACE_BADADDR;
3501                                 *illval = tupregs[0].dttk_value;
3502                                 break;
3503                         }
3504 
3505                         daddr = dtrace_loadptr(baddr +
3506                             offsetof(mblk_t, b_datap));
3507 
3508                         baddr = dtrace_loadptr(baddr +
3509                             offsetof(mblk_t, b_cont));
3510 
3511                         /*
3512                          * We want to protect against denial-of-service here,
3513                          * so we're only going to search the list for
3514                          * dtrace_msgdsize_max mblks.
3515                          */
3516                         if (cont++ > dtrace_msgdsize_max) {
3517                                 *flags |= CPU_DTRACE_ILLOP;
3518                                 break;
3519                         }
3520 
3521                         if (subr == DIF_SUBR_MSGDSIZE) {
3522                                 if (dtrace_load8(daddr +
3523                                     offsetof(dblk_t, db_type)) != M_DATA)
3524                                         continue;
3525                         }
3526 
3527                         count += wptr - rptr;
3528                 }
3529 
3530                 if (!(*flags & CPU_DTRACE_FAULT))
3531                         regs[rd] = count;
3532 
3533                 break;
3534         }
3535 
3536         case DIF_SUBR_PROGENYOF: {
3537                 pid_t pid = tupregs[0].dttk_value;
3538                 proc_t *p;
3539                 int rval = 0;
3540 
3541                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3542 
3543                 for (p = curthread->t_procp; p != NULL; p = p->p_parent) {
3544                         if (p->p_pidp->pid_id == pid) {
3545                                 rval = 1;
3546                                 break;
3547                         }
3548                 }
3549 
3550                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3551 
3552                 regs[rd] = rval;
3553                 break;
3554         }
3555 
3556         case DIF_SUBR_SPECULATION:
3557                 regs[rd] = dtrace_speculation(state);
3558                 break;
3559 
3560         case DIF_SUBR_COPYOUT: {
3561                 uintptr_t kaddr = tupregs[0].dttk_value;
3562                 uintptr_t uaddr = tupregs[1].dttk_value;
3563                 uint64_t size = tupregs[2].dttk_value;
3564 
3565                 if (!dtrace_destructive_disallow &&
3566                     dtrace_priv_proc_control(state, mstate) &&
3567                     !dtrace_istoxic(kaddr, size)) {
3568                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3569                         dtrace_copyout(kaddr, uaddr, size, flags);
3570                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3571                 }
3572                 break;
3573         }
3574 
3575         case DIF_SUBR_COPYOUTSTR: {
3576                 uintptr_t kaddr = tupregs[0].dttk_value;
3577                 uintptr_t uaddr = tupregs[1].dttk_value;
3578                 uint64_t size = tupregs[2].dttk_value;
3579 
3580                 if (!dtrace_destructive_disallow &&
3581                     dtrace_priv_proc_control(state, mstate) &&
3582                     !dtrace_istoxic(kaddr, size)) {
3583                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3584                         dtrace_copyoutstr(kaddr, uaddr, size, flags);
3585                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3586                 }
3587                 break;
3588         }
3589 
3590         case DIF_SUBR_STRLEN: {
3591                 size_t sz;
3592                 uintptr_t addr = (uintptr_t)tupregs[0].dttk_value;
3593                 sz = dtrace_strlen((char *)addr,
3594                     state->dts_options[DTRACEOPT_STRSIZE]);
3595 
3596                 if (!dtrace_canload(addr, sz + 1, mstate, vstate)) {
3597                         regs[rd] = NULL;
3598                         break;
3599                 }
3600 
3601                 regs[rd] = sz;
3602 
3603                 break;
3604         }
3605 
3606         case DIF_SUBR_STRCHR:
3607         case DIF_SUBR_STRRCHR: {
3608                 /*
3609                  * We're going to iterate over the string looking for the
3610                  * specified character.  We will iterate until we have reached
3611                  * the string length or we have found the character.  If this
3612                  * is DIF_SUBR_STRRCHR, we will look for the last occurrence
3613                  * of the specified character instead of the first.
3614                  */
3615                 uintptr_t saddr = tupregs[0].dttk_value;
3616                 uintptr_t addr = tupregs[0].dttk_value;
3617                 uintptr_t limit = addr + state->dts_options[DTRACEOPT_STRSIZE];
3618                 char c, target = (char)tupregs[1].dttk_value;
3619 
3620                 for (regs[rd] = NULL; addr < limit; addr++) {
3621                         if ((c = dtrace_load8(addr)) == target) {
3622                                 regs[rd] = addr;
3623 
3624                                 if (subr == DIF_SUBR_STRCHR)
3625                                         break;
3626                         }
3627 
3628                         if (c == '\0')
3629                                 break;
3630                 }
3631 
3632                 if (!dtrace_canload(saddr, addr - saddr, mstate, vstate)) {
3633                         regs[rd] = NULL;
3634                         break;
3635                 }
3636 
3637                 break;
3638         }
3639 
3640         case DIF_SUBR_STRSTR:
3641         case DIF_SUBR_INDEX:
3642         case DIF_SUBR_RINDEX: {
3643                 /*
3644                  * We're going to iterate over the string looking for the
3645                  * specified string.  We will iterate until we have reached
3646                  * the string length or we have found the string.  (Yes, this
3647                  * is done in the most naive way possible -- but considering
3648                  * that the string we're searching for is likely to be
3649                  * relatively short, the complexity of Rabin-Karp or similar
3650                  * hardly seems merited.)
3651                  */
3652                 char *addr = (char *)(uintptr_t)tupregs[0].dttk_value;
3653                 char *substr = (char *)(uintptr_t)tupregs[1].dttk_value;
3654                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3655                 size_t len = dtrace_strlen(addr, size);
3656                 size_t sublen = dtrace_strlen(substr, size);
3657                 char *limit = addr + len, *orig = addr;
3658                 int notfound = subr == DIF_SUBR_STRSTR ? 0 : -1;
3659                 int inc = 1;
3660 
3661                 regs[rd] = notfound;
3662 
3663                 if (!dtrace_canload((uintptr_t)addr, len + 1, mstate, vstate)) {
3664                         regs[rd] = NULL;
3665                         break;
3666                 }
3667 
3668                 if (!dtrace_canload((uintptr_t)substr, sublen + 1, mstate,
3669                     vstate)) {
3670                         regs[rd] = NULL;
3671                         break;
3672                 }
3673 
3674                 /*
3675                  * strstr() and index()/rindex() have similar semantics if
3676                  * both strings are the empty string: strstr() returns a
3677                  * pointer to the (empty) string, and index() and rindex()
3678                  * both return index 0 (regardless of any position argument).
3679                  */
3680                 if (sublen == 0 && len == 0) {
3681                         if (subr == DIF_SUBR_STRSTR)
3682                                 regs[rd] = (uintptr_t)addr;
3683                         else
3684                                 regs[rd] = 0;
3685                         break;
3686                 }
3687 
3688                 if (subr != DIF_SUBR_STRSTR) {
3689                         if (subr == DIF_SUBR_RINDEX) {
3690                                 limit = orig - 1;
3691                                 addr += len;
3692                                 inc = -1;
3693                         }
3694 
3695                         /*
3696                          * Both index() and rindex() take an optional position
3697                          * argument that denotes the starting position.
3698                          */
3699                         if (nargs == 3) {
3700                                 int64_t pos = (int64_t)tupregs[2].dttk_value;
3701 
3702                                 /*
3703                                  * If the position argument to index() is
3704                                  * negative, Perl implicitly clamps it at
3705                                  * zero.  This semantic is a little surprising
3706                                  * given the special meaning of negative
3707                                  * positions to similar Perl functions like
3708                                  * substr(), but it appears to reflect a
3709                                  * notion that index() can start from a
3710                                  * negative index and increment its way up to
3711                                  * the string.  Given this notion, Perl's
3712                                  * rindex() is at least self-consistent in
3713                                  * that it implicitly clamps positions greater
3714                                  * than the string length to be the string
3715                                  * length.  Where Perl completely loses
3716                                  * coherence, however, is when the specified
3717                                  * substring is the empty string ("").  In
3718                                  * this case, even if the position is
3719                                  * negative, rindex() returns 0 -- and even if
3720                                  * the position is greater than the length,
3721                                  * index() returns the string length.  These
3722                                  * semantics violate the notion that index()
3723                                  * should never return a value less than the
3724                                  * specified position and that rindex() should
3725                                  * never return a value greater than the
3726                                  * specified position.  (One assumes that
3727                                  * these semantics are artifacts of Perl's
3728                                  * implementation and not the results of
3729                                  * deliberate design -- it beggars belief that
3730                                  * even Larry Wall could desire such oddness.)
3731                                  * While in the abstract one would wish for
3732                                  * consistent position semantics across
3733                                  * substr(), index() and rindex() -- or at the
3734                                  * very least self-consistent position
3735                                  * semantics for index() and rindex() -- we
3736                                  * instead opt to keep with the extant Perl
3737                                  * semantics, in all their broken glory.  (Do
3738                                  * we have more desire to maintain Perl's
3739                                  * semantics than Perl does?  Probably.)
3740                                  */
3741                                 if (subr == DIF_SUBR_RINDEX) {
3742                                         if (pos < 0) {
3743                                                 if (sublen == 0)
3744                                                         regs[rd] = 0;
3745                                                 break;
3746                                         }
3747 
3748                                         if (pos > len)
3749                                                 pos = len;
3750                                 } else {
3751                                         if (pos < 0)
3752                                                 pos = 0;
3753 
3754                                         if (pos >= len) {
3755                                                 if (sublen == 0)
3756                                                         regs[rd] = len;
3757                                                 break;
3758                                         }
3759                                 }
3760 
3761                                 addr = orig + pos;
3762                         }
3763                 }
3764 
3765                 for (regs[rd] = notfound; addr != limit; addr += inc) {
3766                         if (dtrace_strncmp(addr, substr, sublen) == 0) {
3767                                 if (subr != DIF_SUBR_STRSTR) {
3768                                         /*
3769                                          * As D index() and rindex() are
3770                                          * modeled on Perl (and not on awk),
3771                                          * we return a zero-based (and not a
3772                                          * one-based) index.  (For you Perl
3773                                          * weenies: no, we're not going to add
3774                                          * $[ -- and shouldn't you be at a con
3775                                          * or something?)
3776                                          */
3777                                         regs[rd] = (uintptr_t)(addr - orig);
3778                                         break;
3779                                 }
3780 
3781                                 ASSERT(subr == DIF_SUBR_STRSTR);
3782                                 regs[rd] = (uintptr_t)addr;
3783                                 break;
3784                         }
3785                 }
3786 
3787                 break;
3788         }
3789 
3790         case DIF_SUBR_STRTOK: {
                     /*
                      * strtok():  tupregs[0] is the string to tokenize (or NULL
                      * to resume from the position saved in mstate->dtms_strtok)
                      * and tupregs[1] is the string of delimiter characters.  The
                      * next token is copied into scratch and its address stored in
                      * regs[rd]; regs[rd] is NULL if either string fails its
                      * access check, if scratch is exhausted, or if no token
                      * remains.
                      */
3791                 uintptr_t addr = tupregs[0].dttk_value;
3792                 uintptr_t tokaddr = tupregs[1].dttk_value;
3793                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3794                 uintptr_t limit, toklimit = tokaddr + size;
3795                 uint8_t c, tokmap[32];   /* 256 / 8 */
3796                 char *dest = (char *)mstate->dtms_scratch_ptr;
3797                 int i;
3798
3799                 /*
3800                  * Check both the token buffer and (later) the input buffer,
3801                  * since both could be non-scratch addresses.
3802                  */
3803                 if (!dtrace_strcanload(tokaddr, size, mstate, vstate)) {
3804                         regs[rd] = NULL;
3805                         break;
3806                 }
3807
3808                 if (!DTRACE_INSCRATCH(mstate, size)) {
3809                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3810                         regs[rd] = NULL;
3811                         break;
3812                 }
3813
3814                 if (addr == NULL) {
3815                         /*
3816                          * If the address specified is NULL, we use our saved
3817                          * strtok pointer from the mstate.  Note that this
3818                          * means that the saved strtok pointer is _only_
3819                          * valid within multiple enablings of the same probe --
3820                          * it behaves like an implicit clause-local variable.
3821                          */
3822                         addr = mstate->dtms_strtok;
3823                 } else {
3824                         /*
3825                          * If the user-specified address is non-NULL we must
3826                          * access check it.  This is the only time we have
3827                          * a chance to do so, since this address may reside
3828                          * in the string table of this clause-- future calls
3829                          * (when we fetch addr from mstate->dtms_strtok)
3830                          * would fail this access check.
3831                          */
3832                         if (!dtrace_strcanload(addr, size, mstate, vstate)) {
3833                                 regs[rd] = NULL;
3834                                 break;
3835                         }
3836                 }
3837
3838                 /*
3839                  * First, zero the token map, and then process the token
3840                  * string -- setting a bit in the map for every character
3841                  * found in the token string.
3842                  */
3843                 for (i = 0; i < sizeof (tokmap); i++)
3844                         tokmap[i] = 0;
3845
3846                 for (; tokaddr < toklimit; tokaddr++) {
3847                         if ((c = dtrace_load8(tokaddr)) == '\0')
3848                                 break;
3849
                             /*
                              * The map is one bit per possible byte value:
                              * byte (c >> 3), bit (c & 0x7).
                              */
3850                         ASSERT((c >> 3) < sizeof (tokmap));
3851                         tokmap[c >> 3] |= (1 << (c & 0x7));
3852                 }
3853
                     /*
                      * Skip over any leading delimiters, examining at most size
                      * bytes of the input string.
                      */
3854                 for (limit = addr + size; addr < limit; addr++) {
3855                         /*
3856                          * We're looking for a character that is _not_ contained
3857                          * in the token string.
3858                          */
3859                         if ((c = dtrace_load8(addr)) == '\0')
3860                                 break;
3861
3862                         if (!(tokmap[c >> 3] & (1 << (c & 0x7))))
3863                                 break;
3864                 }
3865
3866                 if (c == '\0') {
3867                         /*
3868                          * We reached the end of the string without finding
3869                          * any character that was not in the token string.
3870                          * We return NULL in this case, and we set the saved
3871                          * address to NULL as well.
3872                          */
3873                         regs[rd] = NULL;
3874                         mstate->dtms_strtok = NULL;
3875                         break;
3876                 }
3877
3878                 /*
3879                  * From here on, we're copying into the destination string.
                      * We stop at the terminating NUL, at the next delimiter, at
                      * the input limit, or once size - 1 bytes have been copied
                      * (leaving room for the NUL that we append).
3880                  */
3881                 for (i = 0; addr < limit && i < size - 1; addr++) {
3882                         if ((c = dtrace_load8(addr)) == '\0')
3883                                 break;
3884
3885                         if (tokmap[c >> 3] & (1 << (c & 0x7)))
3886                                 break;
3887
3888                         ASSERT(i < size);
3889                         dest[i++] = c;
3890                 }
3891
                     /*
                      * Terminate the token and publish it.  A full size bytes of
                      * scratch are consumed regardless of the token's length, and
                      * addr -- left at the character that ended the copy -- is
                      * saved as the resumption point for a subsequent call.
                      */
3892                 ASSERT(i < size);
3893                 dest[i] = '\0';
3894                 regs[rd] = (uintptr_t)dest;
3895                 mstate->dtms_scratch_ptr += size;
3896                 mstate->dtms_strtok = addr;
3897                 break;
3898         }
3899 
3900         case DIF_SUBR_SUBSTR: {
                     /*
                      * substr():  tupregs[0] is the source string, tupregs[1] the
                      * starting index and tupregs[2] the (optional) length.  The
                      * selected bytes are copied into scratch, NUL-terminated, and
                      * the scratch address is returned in regs[rd].  regs[rd] is
                      * NULL if the source cannot be loaded or scratch is exhausted.
                      */
3901                 uintptr_t s = tupregs[0].dttk_value;
3902                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3903                 char *d = (char *)mstate->dtms_scratch_ptr;
3904                 int64_t index = (int64_t)tupregs[1].dttk_value;
3905                 int64_t remaining = (int64_t)tupregs[2].dttk_value;
3906                 size_t len = dtrace_strlen((char *)s, size);
3907                 int64_t i;
3908
3909                 if (!dtrace_canload(s, len + 1, mstate, vstate)) {
3910                         regs[rd] = NULL;
3911                         break;
3912                 }
3913
3914                 if (!DTRACE_INSCRATCH(mstate, size)) {
3915                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3916                         regs[rd] = NULL;
3917                         break;
3918                 }
3919
                     /*
                      * If no length was specified, default it to the string size.
                      */
3920                 if (nargs <= 2)
3921                         remaining = (int64_t)size;
3922
                     /*
                      * A negative index is taken relative to the end of the
                      * string.  If it is still negative after that adjustment but
                      * the requested span reaches past the start of the string,
                      * the span is trimmed so that it begins at index zero.
                      */
3923                 if (index < 0) {
3924                         index += len;
3925
3926                         if (index < 0 && index + remaining > 0) {
3927                                 remaining += index;
3928                                 index = 0;
3929                         }
3930                 }
3931
                     /*
                      * An index outside of the string yields the empty string; a
                      * negative length is taken relative to the end of the string;
                      * and a span that would exceed the string size is clamped.
                      */
3932                 if (index >= len || index < 0) {
3933                         remaining = 0;
3934                 } else if (remaining < 0) {
3935                         remaining += len - index;
3936                 } else if (index + remaining > size) {
3937                         remaining = size - index;
3938                 }
3939
3940                 for (i = 0; i < remaining; i++) {
3941                         if ((d[i] = dtrace_load8(s + index + i)) == '\0')
3942                                 break;
3943                 }
3944
                     /*
                      * NOTE(review): if remaining can equal size here (index of
                      * zero with a source that has no NUL in its first size
                      * bytes), this store lands at d[size] -- one byte beyond
                      * what DTRACE_INSCRATCH() checked.  Confirm against the
                      * bound that dtrace_strlen() enforces.
                      */
3945                 d[i] = '\0';
3946
3947                 mstate->dtms_scratch_ptr += size;
3948                 regs[rd] = (uintptr_t)d;
3949                 break;
3950         }
3951 
3952         case DIF_SUBR_TOUPPER:
3953         case DIF_SUBR_TOLOWER: {
                     /*
                      * toupper() and tolower():  copy the string in tupregs[0]
                      * into scratch, converting the case of the ASCII letters, and
                      * return the scratch address in regs[rd].  regs[rd] is NULL
                      * if the source cannot be loaded or scratch is exhausted.
                      */
3954                 uintptr_t s = tupregs[0].dttk_value;
3955                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3956                 char *dest = (char *)mstate->dtms_scratch_ptr, c;
3957                 size_t len = dtrace_strlen((char *)s, size);
3958                 char lower, upper, convert;
3959                 int64_t i;
3960
                     /*
                      * [lower, upper] is the range of characters to be converted;
                      * convert is the character onto which lower is mapped.
                      */
3961                 if (subr == DIF_SUBR_TOUPPER) {
3962                         lower = 'a';
3963                         upper = 'z';
3964                         convert = 'A';
3965                 } else {
3966                         lower = 'A';
3967                         upper = 'Z';
3968                         convert = 'a';
3969                 }
3970
3971                 if (!dtrace_canload(s, len + 1, mstate, vstate)) {
3972                         regs[rd] = NULL;
3973                         break;
3974                 }
3975
3976                 if (!DTRACE_INSCRATCH(mstate, size)) {
3977                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3978                         regs[rd] = NULL;
3979                         break;
3980                 }
3981
                     /*
                      * Copy at most size - 1 characters, leaving room for the
                      * terminating NUL that is stored after the loop.
                      */
3982                 for (i = 0; i < size - 1; i++) {
3983                         if ((c = dtrace_load8(s + i)) == '\0')
3984                                 break;
3985
3986                         if (c >= lower && c <= upper)
3987                                 c = convert + (c - lower);
3988
3989                         dest[i] = c;
3990                 }
3991
3992                 ASSERT(i < size);
3993                 dest[i] = '\0';
3994                 regs[rd] = (uintptr_t)dest;
3995                 mstate->dtms_scratch_ptr += size;
3996                 break;
3997         }
3998 
3999         case DIF_SUBR_GETMAJOR:
                     /*
                      * getmajor():  shift the minor bits off of the device number
                      * in tupregs[0] and mask what remains with the maximum major
                      * number, using the 64-bit constants on _LP64 kernels.
                      */
4000 #ifdef _LP64
4001                 regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR64) & MAXMAJ64;
4002 #else
4003                 regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR) & MAXMAJ;
4004 #endif
4005                 break;
4006 
4007         case DIF_SUBR_GETMINOR:
                     /*
                      * getminor():  mask the device number in tupregs[0] with the
                      * maximum minor number, using the 64-bit constant on _LP64
                      * kernels.
                      */
4008 #ifdef _LP64
4009                 regs[rd] = tupregs[0].dttk_value & MAXMIN64;
4010 #else
4011                 regs[rd] = tupregs[0].dttk_value & MAXMIN;
4012 #endif
4013                 break;
4014 
4015         case DIF_SUBR_DDI_PATHNAME: {
4016                 /*
4017                  * This one is a galactic mess.  We are going to roughly
4018                  * emulate ddi_pathname(), but it's made more complicated
4019                  * by the fact that we (a) want to include the minor name and
4020                  * (b) must proceed iteratively instead of recursively.
                      *
                      * tupregs[0] is the address of the dev_info and tupregs[1] is
                      * the minor number (or -1 if no minor name is wanted).  The
                      * path is assembled backwards:  end starts at the last byte
                      * of the scratch buffer and is moved toward start as each
                      * component is prepended.  regs[rd] is set to end only if the
                      * walk made it all the way to the root of the tree.
4021                  */
4022                 uintptr_t dest = mstate->dtms_scratch_ptr;
4023                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4024                 char *start = (char *)dest, *end = start + size - 1;
4025                 uintptr_t daddr = tupregs[0].dttk_value;
4026                 int64_t minor = (int64_t)tupregs[1].dttk_value;
4027                 char *s;
4028                 int i, len, depth = 0;
4029
4030                 /*
4031                  * This routine chases kernel pointers freely, so we mandate
4032                  * kernel read privileges -- and bail at once if they're lacking.
4033                  */
4034                 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) == 0) {
4035                         *flags |= CPU_DTRACE_KPRIV;
4036                         *illval = daddr;
4037                         regs[rd] = NULL;
                             /*
                              * We must not fall through:  the code below
                              * writes to scratch and loads through daddr, and
                              * neither may be done on behalf of a consumer
                              * that failed the privilege check.
                              */
4038                         break;
4039                 }
4040
4041                 if (!DTRACE_INSCRATCH(mstate, size)) {
4042                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4043                         regs[rd] = NULL;
4044                         break;
4045                 }
4046
4047                 *end = '\0';
4048
4049                 /*
4050                  * We want to have a name for the minor.  In order to do this,
4051                  * we need to walk the minor list from the devinfo.  We want
4052                  * to be sure that we don't infinitely walk a circular list,
4053                  * so we check for circularity by sending a scout pointer
4054                  * ahead two elements for every element that we iterate over;
4055                  * if the list is circular, these will ultimately point to the
4056                  * same element.  You may recognize this little trick as the
4057                  * answer to a stupid interview question -- one that always
4058                  * seems to be asked by those who had to have it laboriously
4059                  * explained to them, and who can't even concisely describe
4060                  * the conditions under which one would be forced to resort to
4061                  * this technique.  Needless to say, those conditions are
4062                  * found here -- and probably only here.  Is this the only use
4063                  * of this infamous trick in shipping, production code?  If it
4064                  * isn't, it probably should be...
4065                  */
4066                 if (minor != -1) {
4067                         uintptr_t maddr = dtrace_loadptr(daddr +
4068                             offsetof(struct dev_info, devi_minor));
4069
4070                         uintptr_t next = offsetof(struct ddi_minor_data, next);
4071                         uintptr_t name = offsetof(struct ddi_minor_data,
4072                             d_minor) + offsetof(struct ddi_minor, name);
4073                         uintptr_t dev = offsetof(struct ddi_minor_data,
4074                             d_minor) + offsetof(struct ddi_minor, dev);
4075                         uintptr_t scout;
4076
4077                         if (maddr != NULL)
4078                                 scout = dtrace_loadptr(maddr + next);
4079
4080                         while (maddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
4081                                 uint64_t m;
4082 #ifdef _LP64
4083                                 m = dtrace_load64(maddr + dev) & MAXMIN64;
4084 #else
4085                                 m = dtrace_load32(maddr + dev) & MAXMIN;
4086 #endif
4087                                 if (m != minor) {
                                             /*
                                              * Not our minor:  advance maddr
                                              * by one element and the scout by
                                              * two, stopping the scout if it
                                              * runs off the end of the list.
                                              */
4088                                         maddr = dtrace_loadptr(maddr + next);
4089
4090                                         if (scout == NULL)
4091                                                 continue;
4092
4093                                         scout = dtrace_loadptr(scout + next);
4094
4095                                         if (scout == NULL)
4096                                                 continue;
4097
4098                                         scout = dtrace_loadptr(scout + next);
4099
4100                                         if (scout == NULL)
4101                                                 continue;
4102
4103                                         if (scout == maddr) {
4104                                                 *flags |= CPU_DTRACE_ILLOP;
4105                                                 break;
4106                                         }
4107
4108                                         continue;
4109                                 }
4110
4111                                 /*
4112                                  * We have the minor data.  Now we need to
4113                                  * copy the minor's name into the end of the
4114                                  * pathname.
4115                                  */
4116                                 s = (char *)dtrace_loadptr(maddr + name);
4117                                 len = dtrace_strlen(s, size);
4118
4119                                 if (*flags & CPU_DTRACE_FAULT)
4120                                         break;
4121
4122                                 if (len != 0) {
4123                                         if ((end -= (len + 1)) < start)
4124                                                 break;
4125
4126                                         *end = ':';
4127                                 }
4128
4129                                 for (i = 1; i <= len; i++)
4130                                         end[i] = dtrace_load8((uintptr_t)s++);
4131                                 break;
4132                         }
4133                 }
4134
                     /*
                      * Walk from the specified node up through its parents,
                      * prepending "/<node name>[@<address>]" for each one.
                      */
4135                 while (daddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
4136                         ddi_node_state_t devi_state;
4137
4138                         devi_state = dtrace_load32(daddr +
4139                             offsetof(struct dev_info, devi_node_state));
4140
4141                         if (*flags & CPU_DTRACE_FAULT)
4142                                 break;
4143
4144                         if (devi_state >= DS_INITIALIZED) {
4145                                 s = (char *)dtrace_loadptr(daddr +
4146                                     offsetof(struct dev_info, devi_addr));
4147                                 len = dtrace_strlen(s, size);
4148
4149                                 if (*flags & CPU_DTRACE_FAULT)
4150                                         break;
4151
4152                                 if (len != 0) {
4153                                         if ((end -= (len + 1)) < start)
4154                                                 break;
4155
4156                                         *end = '@';
4157                                 }
4158
4159                                 for (i = 1; i <= len; i++)
4160                                         end[i] = dtrace_load8((uintptr_t)s++);
4161                         }
4162
4163                         /*
4164                          * Now for the node name...
4165                          */
4166                         s = (char *)dtrace_loadptr(daddr +
4167                             offsetof(struct dev_info, devi_node_name));
4168
4169                         daddr = dtrace_loadptr(daddr +
4170                             offsetof(struct dev_info, devi_parent));
4171
4172                         /*
4173                          * If our parent is NULL (that is, if we're the root
4174                          * node), we're going to use the special path
4175                          * "devices".
4176                          */
4177                         if (daddr == NULL)
4178                                 s = "devices";
4179
4180                         len = dtrace_strlen(s, size);
4181                         if (*flags & CPU_DTRACE_FAULT)
4182                                 break;
4183
4184                         if ((end -= (len + 1)) < start)
4185                                 break;
4186
4187                         for (i = 1; i <= len; i++)
4188                                 end[i] = dtrace_load8((uintptr_t)s++);
4189                         *end = '/';
4190
                             /*
                              * Bound the depth of the walk so that a corrupt
                              * (e.g., circular) parent chain cannot spin us
                              * forever.
                              */
4191                         if (depth++ > dtrace_devdepth_max) {
4192                                 *flags |= CPU_DTRACE_ILLOP;
4193                                 break;
4194                         }
4195                 }
4196
                     /*
                      * If end was pushed below start, the path did not fit.
                      */
4197                 if (end < start)
4198                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4199
                     /*
                      * We only return a path (and consume scratch) if daddr is
                      * NULL at this point -- that is, if the walk reached the
                      * root rather than bailing out early.
                      */
4200                 if (daddr == NULL) {
4201                         regs[rd] = (uintptr_t)end;
4202                         mstate->dtms_scratch_ptr += size;
4203                 }
4204
4205                 break;
4206         }
4207 
4208         case DIF_SUBR_STRJOIN: {
                     /*
                      * strjoin():  concatenate the strings in tupregs[0] and
                      * tupregs[1] into scratch, returning the scratch address in
                      * regs[rd].  regs[rd] is NULL if either string fails its
                      * access check or if scratch is exhausted.
                      */
4209                 char *d = (char *)mstate->dtms_scratch_ptr;
4210                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4211                 uintptr_t s1 = tupregs[0].dttk_value;
4212                 uintptr_t s2 = tupregs[1].dttk_value;
4213                 int i = 0;
4214
4215                 if (!dtrace_strcanload(s1, size, mstate, vstate) ||
4216                     !dtrace_strcanload(s2, size, mstate, vstate)) {
4217                         regs[rd] = NULL;
4218                         break;
4219                 }
4220
4221                 if (!DTRACE_INSCRATCH(mstate, size)) {
4222                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4223                         regs[rd] = NULL;
4224                         break;
4225                 }
4226
                     /*
                      * Copy the first string.  When we hit its terminating NUL,
                      * we back i up by one so that the second string overwrites
                      * it.
                      */
4227                 for (;;) {
4228                         if (i >= size) {
4229                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4230                                 regs[rd] = NULL;
4231                                 break;
4232                         }
4233
4234                         if ((d[i++] = dtrace_load8(s1++)) == '\0') {
4235                                 i--;
4236                                 break;
4237                         }
4238                 }
4239
                     /*
                      * Append the second string, including its terminating NUL.
                      * If the first loop already ran out of room, i is still
                      * >= size and this loop bails on its first iteration.
                      */
4240                 for (;;) {
4241                         if (i >= size) {
4242                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4243                                 regs[rd] = NULL;
4244                                 break;
4245                         }
4246
4247                         if ((d[i++] = dtrace_load8(s2++)) == '\0')
4248                                 break;
4249                 }
4250
                     /*
                      * On success, only the i bytes actually used (including the
                      * NUL) are consumed from scratch -- not a full size bytes.
                      *
                      * NOTE(review): if the NUL lands exactly in d[size - 1], i
                      * is equal to size here and regs[rd] is left unassigned
                      * without any flag having been set -- confirm that this is
                      * intended.
                      */
4251                 if (i < size) {
4252                         mstate->dtms_scratch_ptr += i;
4253                         regs[rd] = (uintptr_t)d;
4254                 }
4255
4256                 break;
4257         }
4258 
4259         case DIF_SUBR_LLTOSTR: {
                     /*
                      * lltostr():  convert the 64-bit value in tupregs[0] into
                      * its string representation in scratch.  The optional base
                      * in tupregs[1] defaults to 10 and must be between 2 and 36
                      * (inclusive), or CPU_DTRACE_ILLOP is raised.  The digits are
                      * generated least-significant first, working backwards from
                      * the end of a 65-byte buffer; regs[rd] is set to the first
                      * character so generated.
                      */
4260                 int64_t i = (int64_t)tupregs[0].dttk_value;
4261                 uint64_t val, digit;
4262                 uint64_t size = 65;     /* enough room for 2^64 in binary */
4263                 char *end = (char *)mstate->dtms_scratch_ptr + size - 1;
4264                 int base = 10;
4265
4266                 if (nargs > 1) {
4267                         if ((base = tupregs[1].dttk_value) <= 1 ||
4268                             base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
4269                                 *flags |= CPU_DTRACE_ILLOP;
4270                                 break;
4271                         }
4272                 }
4273
                     /*
                      * Only base 10 is treated as signed; in every other base,
                      * a negative value is rendered as its unsigned bit pattern.
                      * The negation is performed on the unsigned value because
                      * negating the most negative int64_t as a signed quantity
                      * is undefined.
                      */
4274                 val = (base == 10 && i < 0) ? -(uint64_t)i : (uint64_t)i;
4275
4276                 if (!DTRACE_INSCRATCH(mstate, size)) {
4277                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4278                         regs[rd] = NULL;
4279                         break;
4280                 }
4281
                     /*
                      * Digits above nine are emitted as 'a' through 'z'.
                      */
4282                 for (*end-- = '\0'; val; val /= base) {
4283                         if ((digit = val % base) <= '9' - '0') {
4284                                 *end-- = '0' + digit;
4285                         } else {
4286                                 *end-- = 'a' + (digit - ('9' - '0') - 1);
4287                         }
4288                 }
4289
                     /*
                      * The loop above emits nothing for zero, so we emit the
                      * zero digit by hand.  We then prepend "0x" for base 16, "0"
                      * for base 8 and "-" for a negative value in base 10.
                      */
4290                 if (i == 0 && base == 16)
4291                         *end-- = '0';
4292
4293                 if (base == 16)
4294                         *end-- = 'x';
4295
4296                 if (i == 0 || base == 8 || base == 16)
4297                         *end-- = '0';
4298
4299                 if (i < 0 && base == 10)
4300                         *end-- = '-';
4301
4302                 regs[rd] = (uintptr_t)end + 1;
4303                 mstate->dtms_scratch_ptr += size;
4304                 break;
4305         }
4306 
4307         case DIF_SUBR_HTONS:
4308         case DIF_SUBR_NTOHS:
                     /*
                      * 16-bit host/network byte order conversion.  On a
                      * big-endian kernel the value is simply truncated to 16
                      * bits; otherwise, its bytes are swapped.  The same code
                      * serves both directions of the conversion.
                      */
4309 #ifdef _BIG_ENDIAN
4310                 regs[rd] = (uint16_t)tupregs[0].dttk_value;
4311 #else
4312                 regs[rd] = DT_BSWAP_16((uint16_t)tupregs[0].dttk_value);
4313 #endif
4314                 break;
4315
4316
4317         case DIF_SUBR_HTONL:
4318         case DIF_SUBR_NTOHL:
                     /*
                      * As above, but for 32-bit values.
                      */
4319 #ifdef _BIG_ENDIAN
4320                 regs[rd] = (uint32_t)tupregs[0].dttk_value;
4321 #else
4322                 regs[rd] = DT_BSWAP_32((uint32_t)tupregs[0].dttk_value);
4323 #endif
4324                 break;
4325
4326
4327         case DIF_SUBR_HTONLL:
4328         case DIF_SUBR_NTOHLL:
                     /*
                      * As above, but for 64-bit values.
                      */
4329 #ifdef _BIG_ENDIAN
4330                 regs[rd] = (uint64_t)tupregs[0].dttk_value;
4331 #else
4332                 regs[rd] = DT_BSWAP_64((uint64_t)tupregs[0].dttk_value);
4333 #endif
4334                 break;
4335 
4336 
4337         case DIF_SUBR_DIRNAME:
4338         case DIF_SUBR_BASENAME: {
                     /*
                      * dirname() and basename():  scan the path in tupregs[0]
                      * from its end to locate three positions -- the last and
                      * first characters of the final component (lastbase and
                      * firstbase) and the last character of the directory portion
                      * (lastdir) -- and then copy the selected span into scratch.
                      * A value of -1 for any of these denotes "not found".
                      * regs[rd] is NULL if the path cannot be loaded or scratch is
                      * exhausted.
                      */
4339                 char *dest = (char *)mstate->dtms_scratch_ptr;
4340                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4341                 uintptr_t src = tupregs[0].dttk_value;
4342                 int i, j, len = dtrace_strlen((char *)src, size);
4343                 int lastbase = -1, firstbase = -1, lastdir = -1;
4344                 int start, end;
4345
4346                 if (!dtrace_canload(src, len + 1, mstate, vstate)) {
4347                         regs[rd] = NULL;
4348                         break;
4349                 }
4350
4351                 if (!DTRACE_INSCRATCH(mstate, size)) {
4352                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4353                         regs[rd] = NULL;
4354                         break;
4355                 }
4356
4357                 /*
4358                  * The basename and dirname for a zero-length string is
4359                  * defined to be "."
4360                  */
4361                 if (len == 0) {
4362                         len = 1;
4363                         src = (uintptr_t)".";
4364                 }
4365
4366                 /*
4367                  * Start from the back of the string, moving back toward the
4368                  * front until we see a character that isn't a slash.  That
4369                  * character is the last character in the basename.
4370                  */
4371                 for (i = len - 1; i >= 0; i--) {
4372                         if (dtrace_load8(src + i) != '/')
4373                                 break;
4374                 }
4375
4376                 if (i >= 0)
4377                         lastbase = i;
4378
4379                 /*
4380                  * Starting from the last character in the basename, move
4381                  * towards the front until we find a slash.  The character
4382                  * that we processed immediately before that is the first
4383                  * character in the basename.
4384                  */
4385                 for (; i >= 0; i--) {
4386                         if (dtrace_load8(src + i) == '/')
4387                                 break;
4388                 }
4389
4390                 if (i >= 0)
4391                         firstbase = i + 1;
4392
4393                 /*
4394                  * Now keep going until we find a non-slash character.  That
4395                  * character is the last character in the dirname.
4396                  */
4397                 for (; i >= 0; i--) {
4398                         if (dtrace_load8(src + i) != '/')
4399                                 break;
4400                 }
4401
4402                 if (i >= 0)
4403                         lastdir = i;
4404
4405                 ASSERT(!(lastbase == -1 && firstbase != -1));
4406                 ASSERT(!(firstbase == -1 && lastdir != -1));
4407
4408                 if (lastbase == -1) {
4409                         /*
4410                          * We didn't find a non-slash character.  We know that
4411                          * the length is non-zero, so the whole string must be
4412                          * slashes.  In either the dirname or the basename
4413                          * case, we return '/'.
4414                          */
4415                         ASSERT(firstbase == -1);
4416                         firstbase = lastbase = lastdir = 0;
4417                 }
4418
4419                 if (firstbase == -1) {
4420                         /*
4421                          * The entire string consists only of a basename
4422                          * component.  If we're looking for dirname, we need
4423                          * to change our string to be just "."; if we're
4424                          * looking for a basename, we'll just set the first
4425                          * character of the basename to be 0.
4426                          */
4427                         if (subr == DIF_SUBR_DIRNAME) {
4428                                 ASSERT(lastdir == -1);
4429                                 src = (uintptr_t)".";
4430                                 lastdir = 0;
4431                         } else {
4432                                 firstbase = 0;
4433                         }
4434                 }
4435
4436                 if (subr == DIF_SUBR_DIRNAME) {
4437                         if (lastdir == -1) {
4438                                 /*
4439                                  * We know that we have a slash in the name --
4440                                  * or lastdir would be set to 0, above.  And
4441                                  * because lastdir is -1, we know that this
4442                                  * slash must be the first character.  (That
4443                                  * is, the full string must be of the form
4444                                  * "/basename".)  In this case, the last
4445                                  * character of the directory name is 0.
4446                                  */
4447                                 lastdir = 0;
4448                         }
4449
4450                         start = 0;
4451                         end = lastdir;
4452                 } else {
4453                         ASSERT(subr == DIF_SUBR_BASENAME);
4454                         ASSERT(firstbase != -1 && lastbase != -1);
4455                         start = firstbase;
4456                         end = lastbase;
4457                 }
4458
                     /*
                      * Copy the inclusive span [start, end] into scratch, never
                      * copying more than size - 1 bytes so that there is always
                      * room for the terminating NUL.
                      */
4459                 for (i = start, j = 0; i <= end && j < size - 1; i++, j++)
4460                         dest[j] = dtrace_load8(src + i);
4461
4462                 dest[j] = '\0';
4463                 regs[rd] = (uintptr_t)dest;
4464                 mstate->dtms_scratch_ptr += size;
4465                 break;
4466         }
4467 
4468         case DIF_SUBR_CLEANPATH: {
4469                 char *dest = (char *)mstate->dtms_scratch_ptr, c;
4470                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4471                 uintptr_t src = tupregs[0].dttk_value;
4472                 int i = 0, j = 0;
4473 
4474                 if (!dtrace_strcanload(src, size, mstate, vstate)) {
4475                         regs[rd] = NULL;
4476                         break;
4477                 }
4478 
4479                 if (!DTRACE_INSCRATCH(mstate, size)) {
4480                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4481                         regs[rd] = NULL;
4482                         break;
4483                 }
4484 
4485                 /*
4486                  * Move forward, loading each character.
4487                  */
4488                 do {
4489                         c = dtrace_load8(src + i++);
4490 next:
4491                         if (j + 5 >= size)   /* 5 = strlen("/..c\0") */
4492                                 break;
4493 
4494                         if (c != '/') {
4495                                 dest[j++] = c;
4496                                 continue;
4497                         }
4498 
4499                         c = dtrace_load8(src + i++);
4500 
4501                         if (c == '/') {
4502                                 /*
4503                                  * We have two slashes -- we can just advance
4504                                  * to the next character.
4505                                  */
4506                                 goto next;
4507                         }
4508 
4509                         if (c != '.') {
4510                                 /*
4511                                  * This is not "." and it's not ".." -- we can
4512                                  * just store the "/" and this character and
4513                                  * drive on.
4514                                  */
4515                                 dest[j++] = '/';
4516                                 dest[j++] = c;
4517                                 continue;
4518                         }
4519 
4520                         c = dtrace_load8(src + i++);
4521 
4522                         if (c == '/') {
4523                                 /*
4524                                  * This is a "/./" component.  We're not going
4525                                  * to store anything in the destination buffer;
4526                                  * we're just going to go to the next component.
4527                                  */
4528                                 goto next;
4529                         }
4530 
4531                         if (c != '.') {
4532                                 /*
4533                                  * This is not ".." -- we can just store the
4534                                  * "/." and this character and continue
4535                                  * processing.
4536                                  */
4537                                 dest[j++] = '/';
4538                                 dest[j++] = '.';
4539                                 dest[j++] = c;
4540                                 continue;
4541                         }
4542 
4543                         c = dtrace_load8(src + i++);
4544 
4545                         if (c != '/' && c != '\0') {
4546                                 /*
4547                                  * This is not ".." -- it's "..[mumble]".
4548                                  * We'll store the "/.." and this character
4549                                  * and continue processing.
4550                                  */
4551                                 dest[j++] = '/';
4552                                 dest[j++] = '.';
4553                                 dest[j++] = '.';
4554                                 dest[j++] = c;
4555                                 continue;
4556                         }
4557 
4558                         /*
4559                          * This is "/../" or "/..\0".  We need to back up
4560                          * our destination pointer until we find a "/".
4561                          */
4562                         i--;
4563                         while (j != 0 && dest[--j] != '/')
4564                                 continue;
4565 
4566                         if (c == '\0')
4567                                 dest[++j] = '/';
4568                 } while (c != '\0');
4569 
4570                 dest[j] = '\0';
4571                 regs[rd] = (uintptr_t)dest;
4572                 mstate->dtms_scratch_ptr += size;
4573                 break;
4574         }
4575 
4576         case DIF_SUBR_INET_NTOA:
4577         case DIF_SUBR_INET_NTOA6:
4578         case DIF_SUBR_INET_NTOP: {
4579                 size_t size;
4580                 int af, argi, i;
4581                 char *base, *end;
4582 
4583                 if (subr == DIF_SUBR_INET_NTOP) {
4584                         af = (int)tupregs[0].dttk_value;
4585                         argi = 1;
4586                 } else {
4587                         af = subr == DIF_SUBR_INET_NTOA ? AF_INET: AF_INET6;
4588                         argi = 0;
4589                 }
4590 
4591                 if (af == AF_INET) {
4592                         ipaddr_t ip4;
4593                         uint8_t *ptr8, val;
4594 
4595                         /*
4596                          * Safely load the IPv4 address.
4597                          */
4598                         ip4 = dtrace_load32(tupregs[argi].dttk_value);
4599 
4600                         /*
4601                          * Check an IPv4 string will fit in scratch.
4602                          */
4603                         size = INET_ADDRSTRLEN;
4604                         if (!DTRACE_INSCRATCH(mstate, size)) {
4605                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4606                                 regs[rd] = NULL;
4607                                 break;
4608                         }
4609                         base = (char *)mstate->dtms_scratch_ptr;
4610                         end = (char *)mstate->dtms_scratch_ptr + size - 1;
4611 
4612                         /*
4613                          * Stringify as a dotted decimal quad.
4614                          */
4615                         *end-- = '\0';
4616                         ptr8 = (uint8_t *)&ip4;
4617                         for (i = 3; i >= 0; i--) {
4618                                 val = ptr8[i];
4619 
4620                                 if (val == 0) {
4621                                         *end-- = '0';
4622                                 } else {
4623                                         for (; val; val /= 10) {
4624                                                 *end-- = '0' + (val % 10);
4625                                         }
4626                                 }
4627 
4628                                 if (i > 0)
4629                                         *end-- = '.';
4630                         }
4631                         ASSERT(end + 1 >= base);
4632 
4633                 } else if (af == AF_INET6) {
4634                         struct in6_addr ip6;
4635                         int firstzero, tryzero, numzero, v6end;
4636                         uint16_t val;
4637                         const char digits[] = "0123456789abcdef";
4638 
4639                         /*
4640                          * Stringify using RFC 1884 convention 2 - 16 bit
4641                          * hexadecimal values with a zero-run compression.
4642                          * Lower case hexadecimal digits are used.
4643                          *      eg, fe80::214:4fff:fe0b:76c8.
4644                          * The IPv4 embedded form is returned for inet_ntop,
4645                          * just the IPv4 string is returned for inet_ntoa6.
4646                          */
4647 
4648                         /*
4649                          * Safely load the IPv6 address.
4650                          */
4651                         dtrace_bcopy(
4652                             (void *)(uintptr_t)tupregs[argi].dttk_value,
4653                             (void *)(uintptr_t)&ip6, sizeof (struct in6_addr));
4654 
4655                         /*
4656                          * Check an IPv6 string will fit in scratch.
4657                          */
4658                         size = INET6_ADDRSTRLEN;
4659                         if (!DTRACE_INSCRATCH(mstate, size)) {
4660                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4661                                 regs[rd] = NULL;
4662                                 break;
4663                         }
4664                         base = (char *)mstate->dtms_scratch_ptr;
4665                         end = (char *)mstate->dtms_scratch_ptr + size - 1;
4666                         *end-- = '\0';
4667 
4668                         /*
4669                          * Find the longest run of 16 bit zero values
4670                          * for the single allowed zero compression - "::".
4671                          */
4672                         firstzero = -1;
4673                         tryzero = -1;
4674                         numzero = 1;
4675                         for (i = 0; i < sizeof (struct in6_addr); i++) {
4676                                 if (ip6._S6_un._S6_u8[i] == 0 &&
4677                                     tryzero == -1 && i % 2 == 0) {
4678                                         tryzero = i;
4679                                         continue;
4680                                 }
4681 
4682                                 if (tryzero != -1 &&
4683                                     (ip6._S6_un._S6_u8[i] != 0 ||
4684                                     i == sizeof (struct in6_addr) - 1)) {
4685 
4686                                         if (i - tryzero <= numzero) {
4687                                                 tryzero = -1;
4688                                                 continue;
4689                                         }
4690 
4691                                         firstzero = tryzero;
4692                                         numzero = i - i % 2 - tryzero;
4693                                         tryzero = -1;
4694 
4695                                         if (ip6._S6_un._S6_u8[i] == 0 &&
4696                                             i == sizeof (struct in6_addr) - 1)
4697                                                 numzero += 2;
4698                                 }
4699                         }
4700                         ASSERT(firstzero + numzero <= sizeof (struct in6_addr));
4701 
4702                         /*
4703                          * Check for an IPv4 embedded address.
4704                          */
4705                         v6end = sizeof (struct in6_addr) - 2;
4706                         if (IN6_IS_ADDR_V4MAPPED(&ip6) ||
4707                             IN6_IS_ADDR_V4COMPAT(&ip6)) {
4708                                 for (i = sizeof (struct in6_addr) - 1;
4709                                     i >= DTRACE_V4MAPPED_OFFSET; i--) {
4710                                         ASSERT(end >= base);
4711 
4712                                         val = ip6._S6_un._S6_u8[i];
4713 
4714                                         if (val == 0) {
4715                                                 *end-- = '0';
4716                                         } else {
4717                                                 for (; val; val /= 10) {
4718                                                         *end-- = '0' + val % 10;
4719                                                 }
4720                                         }
4721 
4722                                         if (i > DTRACE_V4MAPPED_OFFSET)
4723                                                 *end-- = '.';
4724                                 }
4725 
4726                                 if (subr == DIF_SUBR_INET_NTOA6)
4727                                         goto inetout;
4728 
4729                                 /*
4730                                  * Set v6end to skip the IPv4 address that
4731                                  * we have already stringified.
4732                                  */
4733                                 v6end = 10;
4734                         }
4735 
4736                         /*
4737                          * Build the IPv6 string by working through the
4738                          * address in reverse.
4739                          */
4740                         for (i = v6end; i >= 0; i -= 2) {
4741                                 ASSERT(end >= base);
4742 
4743                                 if (i == firstzero + numzero - 2) {
4744                                         *end-- = ':';
4745                                         *end-- = ':';
4746                                         i -= numzero - 2;
4747                                         continue;
4748                                 }
4749 
4750                                 if (i < 14 && i != firstzero - 2)
4751                                         *end-- = ':';
4752 
4753                                 val = (ip6._S6_un._S6_u8[i] << 8) +
4754                                     ip6._S6_un._S6_u8[i + 1];
4755 
4756                                 if (val == 0) {
4757                                         *end-- = '0';
4758                                 } else {
4759                                         for (; val; val /= 16) {
4760                                                 *end-- = digits[val % 16];
4761                                         }
4762                                 }
4763                         }
4764                         ASSERT(end + 1 >= base);
4765 
4766                 } else {
4767                         /*
4768                          * The user didn't use AF_INET or AF_INET6.
4769                          */
4770                         DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
4771                         regs[rd] = NULL;
4772                         break;
4773                 }
4774 
4775 inetout:        regs[rd] = (uintptr_t)end + 1;
4776                 mstate->dtms_scratch_ptr += size;
4777                 break;
4778         }
4779 
4780         }
4781 }
4782 
4783 /*
4784  * Emulate the execution of DTrace IR instructions specified by the given
4785  * DIF object.  This function is deliberately void of assertions as all of
4786  * the necessary checks are handled by a call to dtrace_difo_validate().
4787  */
4788 static uint64_t
4789 dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
4790     dtrace_vstate_t *vstate, dtrace_state_t *state)
4791 {
4792         const dif_instr_t *text = difo->dtdo_buf;
4793         const uint_t textlen = difo->dtdo_len;
4794         const char *strtab = difo->dtdo_strtab;
4795         const uint64_t *inttab = difo->dtdo_inttab;
4796 
4797         uint64_t rval = 0;
4798         dtrace_statvar_t *svar;
4799         dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
4800         dtrace_difv_t *v;
4801         volatile uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
4802         volatile uintptr_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
4803 
4804         dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
4805         uint64_t regs[DIF_DIR_NREGS];
4806         uint64_t *tmp;
4807 
4808         uint8_t cc_n = 0, cc_z = 0, cc_v = 0, cc_c = 0;
4809         int64_t cc_r;
4810         uint_t pc = 0, id, opc;
4811         uint8_t ttop = 0;
4812         dif_instr_t instr;
4813         uint_t r1, r2, rd;
4814 
4815         /*
4816          * We stash the current DIF object into the machine state: we need it
4817          * for subsequent access checking.
4818          */
4819         mstate->dtms_difo = difo;
4820 
4821         regs[DIF_REG_R0] = 0;           /* %r0 is fixed at zero */
4822 
4823         while (pc < textlen && !(*flags & CPU_DTRACE_FAULT)) {
4824                 opc = pc;
4825 
4826                 instr = text[pc++];
4827                 r1 = DIF_INSTR_R1(instr);
4828                 r2 = DIF_INSTR_R2(instr);
4829                 rd = DIF_INSTR_RD(instr);
4830 
4831                 switch (DIF_INSTR_OP(instr)) {
4832                 case DIF_OP_OR:
4833                         regs[rd] = regs[r1] | regs[r2];
4834                         break;
4835                 case DIF_OP_XOR:
4836                         regs[rd] = regs[r1] ^ regs[r2];
4837                         break;
4838                 case DIF_OP_AND:
4839                         regs[rd] = regs[r1] & regs[r2];
4840                         break;
4841                 case DIF_OP_SLL:
4842                         regs[rd] = regs[r1] << regs[r2];
4843                         break;
4844                 case DIF_OP_SRL:
4845                         regs[rd] = regs[r1] >> regs[r2];
4846                         break;
4847                 case DIF_OP_SUB:
4848                         regs[rd] = regs[r1] - regs[r2];
4849                         break;
4850                 case DIF_OP_ADD:
4851                         regs[rd] = regs[r1] + regs[r2];
4852                         break;
4853                 case DIF_OP_MUL:
4854                         regs[rd] = regs[r1] * regs[r2];
4855                         break;
4856                 case DIF_OP_SDIV:
4857                         if (regs[r2] == 0) {
4858                                 regs[rd] = 0;
4859                                 *flags |= CPU_DTRACE_DIVZERO;
4860                         } else {
4861                                 regs[rd] = (int64_t)regs[r1] /
4862                                     (int64_t)regs[r2];
4863                         }
4864                         break;
4865 
4866                 case DIF_OP_UDIV:
4867                         if (regs[r2] == 0) {
4868                                 regs[rd] = 0;
4869                                 *flags |= CPU_DTRACE_DIVZERO;
4870                         } else {
4871                                 regs[rd] = regs[r1] / regs[r2];
4872                         }
4873                         break;
4874 
4875                 case DIF_OP_SREM:
4876                         if (regs[r2] == 0) {
4877                                 regs[rd] = 0;
4878                                 *flags |= CPU_DTRACE_DIVZERO;
4879                         } else {
4880                                 regs[rd] = (int64_t)regs[r1] %
4881                                     (int64_t)regs[r2];
4882                         }
4883                         break;
4884 
4885                 case DIF_OP_UREM:
4886                         if (regs[r2] == 0) {
4887                                 regs[rd] = 0;
4888                                 *flags |= CPU_DTRACE_DIVZERO;
4889                         } else {
4890                                 regs[rd] = regs[r1] % regs[r2];
4891                         }
4892                         break;
4893 
4894                 case DIF_OP_NOT:
4895                         regs[rd] = ~regs[r1];
4896                         break;
4897                 case DIF_OP_MOV:
4898                         regs[rd] = regs[r1];
4899                         break;
4900                 case DIF_OP_CMP:
4901                         cc_r = regs[r1] - regs[r2];
4902                         cc_n = cc_r < 0;
4903                         cc_z = cc_r == 0;
4904                         cc_v = 0;
4905                         cc_c = regs[r1] < regs[r2];
4906                         break;
4907                 case DIF_OP_TST:
4908                         cc_n = cc_v = cc_c = 0;
4909                         cc_z = regs[r1] == 0;
4910                         break;
4911                 case DIF_OP_BA:
4912                         pc = DIF_INSTR_LABEL(instr);
4913                         break;
4914                 case DIF_OP_BE:
4915                         if (cc_z)
4916                                 pc = DIF_INSTR_LABEL(instr);
4917                         break;
4918                 case DIF_OP_BNE:
4919                         if (cc_z == 0)
4920                                 pc = DIF_INSTR_LABEL(instr);
4921                         break;
4922                 case DIF_OP_BG:
4923                         if ((cc_z | (cc_n ^ cc_v)) == 0)
4924                                 pc = DIF_INSTR_LABEL(instr);
4925                         break;
4926                 case DIF_OP_BGU:
4927                         if ((cc_c | cc_z) == 0)
4928                                 pc = DIF_INSTR_LABEL(instr);
4929                         break;
4930                 case DIF_OP_BGE:
4931                         if ((cc_n ^ cc_v) == 0)
4932                                 pc = DIF_INSTR_LABEL(instr);
4933                         break;
4934                 case DIF_OP_BGEU:
4935                         if (cc_c == 0)
4936                                 pc = DIF_INSTR_LABEL(instr);
4937                         break;
4938                 case DIF_OP_BL:
4939                         if (cc_n ^ cc_v)
4940                                 pc = DIF_INSTR_LABEL(instr);
4941                         break;
4942                 case DIF_OP_BLU:
4943                         if (cc_c)
4944                                 pc = DIF_INSTR_LABEL(instr);
4945                         break;
4946                 case DIF_OP_BLE:
4947                         if (cc_z | (cc_n ^ cc_v))
4948                                 pc = DIF_INSTR_LABEL(instr);
4949                         break;
4950                 case DIF_OP_BLEU:
4951                         if (cc_c | cc_z)
4952                                 pc = DIF_INSTR_LABEL(instr);
4953                         break;
4954                 case DIF_OP_RLDSB:
4955                         if (!dtrace_canload(regs[r1], 1, mstate, vstate))
4956                                 break;
4957                         /*FALLTHROUGH*/
4958                 case DIF_OP_LDSB:
4959                         regs[rd] = (int8_t)dtrace_load8(regs[r1]);
4960                         break;
4961                 case DIF_OP_RLDSH:
4962                         if (!dtrace_canload(regs[r1], 2, mstate, vstate))
4963                                 break;
4964                         /*FALLTHROUGH*/
4965                 case DIF_OP_LDSH:
4966                         regs[rd] = (int16_t)dtrace_load16(regs[r1]);
4967                         break;
4968                 case DIF_OP_RLDSW:
4969                         if (!dtrace_canload(regs[r1], 4, mstate, vstate))
4970                                 break;
4971                         /*FALLTHROUGH*/
4972                 case DIF_OP_LDSW:
4973                         regs[rd] = (int32_t)dtrace_load32(regs[r1]);
4974                         break;
4975                 case DIF_OP_RLDUB:
4976                         if (!dtrace_canload(regs[r1], 1, mstate, vstate))
4977                                 break;
4978                         /*FALLTHROUGH*/
4979                 case DIF_OP_LDUB:
4980                         regs[rd] = dtrace_load8(regs[r1]);
4981                         break;
4982                 case DIF_OP_RLDUH:
4983                         if (!dtrace_canload(regs[r1], 2, mstate, vstate))
4984                                 break;
4985                         /*FALLTHROUGH*/
4986                 case DIF_OP_LDUH:
4987                         regs[rd] = dtrace_load16(regs[r1]);
4988                         break;
4989                 case DIF_OP_RLDUW:
4990                         if (!dtrace_canload(regs[r1], 4, mstate, vstate))
4991                                 break;
4992                         /*FALLTHROUGH*/
4993                 case DIF_OP_LDUW:
4994                         regs[rd] = dtrace_load32(regs[r1]);
4995                         break;
4996                 case DIF_OP_RLDX:
4997                         if (!dtrace_canload(regs[r1], 8, mstate, vstate))
4998                                 break;
4999                         /*FALLTHROUGH*/
5000                 case DIF_OP_LDX:
5001                         regs[rd] = dtrace_load64(regs[r1]);
5002                         break;
5003                 case DIF_OP_ULDSB:
5004                         regs[rd] = (int8_t)
5005                             dtrace_fuword8((void *)(uintptr_t)regs[r1]);
5006                         break;
5007                 case DIF_OP_ULDSH:
5008                         regs[rd] = (int16_t)
5009                             dtrace_fuword16((void *)(uintptr_t)regs[r1]);
5010                         break;
5011                 case DIF_OP_ULDSW:
5012                         regs[rd] = (int32_t)
5013                             dtrace_fuword32((void *)(uintptr_t)regs[r1]);
5014                         break;
5015                 case DIF_OP_ULDUB:
5016                         regs[rd] =
5017                             dtrace_fuword8((void *)(uintptr_t)regs[r1]);
5018                         break;
5019                 case DIF_OP_ULDUH:
5020                         regs[rd] =
5021                             dtrace_fuword16((void *)(uintptr_t)regs[r1]);
5022                         break;
5023                 case DIF_OP_ULDUW:
5024                         regs[rd] =
5025                             dtrace_fuword32((void *)(uintptr_t)regs[r1]);
5026                         break;
5027                 case DIF_OP_ULDX:
5028                         regs[rd] =
5029                             dtrace_fuword64((void *)(uintptr_t)regs[r1]);
5030                         break;
5031                 case DIF_OP_RET:
5032                         rval = regs[rd];
5033                         pc = textlen;
5034                         break;
5035                 case DIF_OP_NOP:
5036                         break;
5037                 case DIF_OP_SETX:
5038                         regs[rd] = inttab[DIF_INSTR_INTEGER(instr)];
5039                         break;
5040                 case DIF_OP_SETS:
5041                         regs[rd] = (uint64_t)(uintptr_t)
5042                             (strtab + DIF_INSTR_STRING(instr));
5043                         break;
5044                 case DIF_OP_SCMP: {
5045                         size_t sz = state->dts_options[DTRACEOPT_STRSIZE];
5046                         uintptr_t s1 = regs[r1];
5047                         uintptr_t s2 = regs[r2];
5048 
5049                         if (s1 != NULL &&
5050                             !dtrace_strcanload(s1, sz, mstate, vstate))
5051                                 break;
5052                         if (s2 != NULL &&
5053                             !dtrace_strcanload(s2, sz, mstate, vstate))
5054                                 break;
5055 
5056                         cc_r = dtrace_strncmp((char *)s1, (char *)s2, sz);
5057 
5058                         cc_n = cc_r < 0;
5059                         cc_z = cc_r == 0;
5060                         cc_v = cc_c = 0;
5061                         break;
5062                 }
5063                 case DIF_OP_LDGA:
5064                         regs[rd] = dtrace_dif_variable(mstate, state,
5065                             r1, regs[r2]);
5066                         break;
5067                 case DIF_OP_LDGS:
5068                         id = DIF_INSTR_VAR(instr);
5069 
5070                         if (id >= DIF_VAR_OTHER_UBASE) {
5071                                 uintptr_t a;
5072 
5073                                 id -= DIF_VAR_OTHER_UBASE;
5074                                 svar = vstate->dtvs_globals[id];
5075                                 ASSERT(svar != NULL);
5076                                 v = &svar->dtsv_var;
5077 
5078                                 if (!(v->dtdv_type.dtdt_flags & DIF_TF_BYREF)) {
5079                                         regs[rd] = svar->dtsv_data;
5080                                         break;
5081                                 }
5082 
5083                                 a = (uintptr_t)svar->dtsv_data;
5084 
5085                                 if (*(uint8_t *)a == UINT8_MAX) {
5086                                         /*
5087                                          * If the 0th byte is set to UINT8_MAX
5088                                          * then this is to be treated as a
5089                                          * reference to a NULL variable.
5090                                          */
5091                                         regs[rd] = NULL;
5092                                 } else {
5093                                         regs[rd] = a + sizeof (uint64_t);
5094                                 }
5095 
5096                                 break;
5097                         }
5098 
5099                         regs[rd] = dtrace_dif_variable(mstate, state, id, 0);
5100                         break;
5101 
5102                 case DIF_OP_STGS:
5103                         id = DIF_INSTR_VAR(instr);
5104 
5105                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
5106                         id -= DIF_VAR_OTHER_UBASE;
5107 
5108                         svar = vstate->dtvs_globals[id];
5109                         ASSERT(svar != NULL);
5110                         v = &svar->dtsv_var;
5111 
5112                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5113                                 uintptr_t a = (uintptr_t)svar->dtsv_data;
5114 
5115                                 ASSERT(a != NULL);
5116                                 ASSERT(svar->dtsv_size != 0);
5117 
5118                                 if (regs[rd] == NULL) {
5119                                         *(uint8_t *)a = UINT8_MAX;
5120                                         break;
5121                                 } else {
5122                                         *(uint8_t *)a = 0;
5123                                         a += sizeof (uint64_t);
5124                                 }
5125                                 if (!dtrace_vcanload(
5126                                     (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5127                                     mstate, vstate))
5128                                         break;
5129 
5130                                 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5131                                     (void *)a, &v->dtdv_type);
5132                                 break;
5133                         }
5134 
5135                         svar->dtsv_data = regs[rd];
5136                         break;
5137 
5138                 case DIF_OP_LDTA:
5139                         /*
5140                          * There are no DTrace built-in thread-local arrays at
5141                          * present.  This opcode is saved for future work.
5142                          */
5143                         *flags |= CPU_DTRACE_ILLOP;
5144                         regs[rd] = 0;
5145                         break;
5146 
5147                 case DIF_OP_LDLS:
5148                         id = DIF_INSTR_VAR(instr);
5149 
5150                         if (id < DIF_VAR_OTHER_UBASE) {
5151                                 /*
5152                                  * For now, this has no meaning.
5153                                  */
5154                                 regs[rd] = 0;
5155                                 break;
5156                         }
5157 
5158                         id -= DIF_VAR_OTHER_UBASE;
5159 
5160                         ASSERT(id < vstate->dtvs_nlocals);
5161                         ASSERT(vstate->dtvs_locals != NULL);
5162 
5163                         svar = vstate->dtvs_locals[id];
5164                         ASSERT(svar != NULL);
5165                         v = &svar->dtsv_var;
5166 
5167                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5168                                 uintptr_t a = (uintptr_t)svar->dtsv_data;
5169                                 size_t sz = v->dtdv_type.dtdt_size;
5170 
5171                                 sz += sizeof (uint64_t);
5172                                 ASSERT(svar->dtsv_size == NCPU * sz);
5173                                 a += CPU->cpu_id * sz;
5174 
5175                                 if (*(uint8_t *)a == UINT8_MAX) {
5176                                         /*
5177                                          * If the 0th byte is set to UINT8_MAX
5178                                          * then this is to be treated as a
5179                                          * reference to a NULL variable.
5180                                          */
5181                                         regs[rd] = NULL;
5182                                 } else {
5183                                         regs[rd] = a + sizeof (uint64_t);
5184                                 }
5185 
5186                                 break;
5187                         }
5188 
5189                         ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
5190                         tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
5191                         regs[rd] = tmp[CPU->cpu_id];
5192                         break;
5193 
5194                 case DIF_OP_STLS:
5195                         id = DIF_INSTR_VAR(instr);
5196 
5197                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
5198                         id -= DIF_VAR_OTHER_UBASE;
5199                         ASSERT(id < vstate->dtvs_nlocals);
5200 
5201                         ASSERT(vstate->dtvs_locals != NULL);
5202                         svar = vstate->dtvs_locals[id];
5203                         ASSERT(svar != NULL);
5204                         v = &svar->dtsv_var;
5205 
5206                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5207                                 uintptr_t a = (uintptr_t)svar->dtsv_data;
5208                                 size_t sz = v->dtdv_type.dtdt_size;
5209 
5210                                 sz += sizeof (uint64_t);
5211                                 ASSERT(svar->dtsv_size == NCPU * sz);
5212                                 a += CPU->cpu_id * sz;
5213 
5214                                 if (regs[rd] == NULL) {
5215                                         *(uint8_t *)a = UINT8_MAX;
5216                                         break;
5217                                 } else {
5218                                         *(uint8_t *)a = 0;
5219                                         a += sizeof (uint64_t);
5220                                 }
5221 
5222                                 if (!dtrace_vcanload(
5223                                     (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5224                                     mstate, vstate))
5225                                         break;
5226 
5227                                 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5228                                     (void *)a, &v->dtdv_type);
5229                                 break;
5230                         }
5231 
5232                         ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
5233                         tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
5234                         tmp[CPU->cpu_id] = regs[rd];
5235                         break;
5236 
5237                 case DIF_OP_LDTS: {
5238                         dtrace_dynvar_t *dvar;
5239                         dtrace_key_t *key;
5240 
5241                         id = DIF_INSTR_VAR(instr);
5242                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
5243                         id -= DIF_VAR_OTHER_UBASE;
5244                         v = &vstate->dtvs_tlocals[id];
5245 
5246                         key = &tupregs[DIF_DTR_NREGS];
5247                         key[0].dttk_value = (uint64_t)id;
5248                         key[0].dttk_size = 0;
5249                         DTRACE_TLS_THRKEY(key[1].dttk_value);
5250                         key[1].dttk_size = 0;
5251 
5252                         dvar = dtrace_dynvar(dstate, 2, key,
5253                             sizeof (uint64_t), DTRACE_DYNVAR_NOALLOC,
5254                             mstate, vstate);
5255 
5256                         if (dvar == NULL) {
5257                                 regs[rd] = 0;
5258                                 break;
5259                         }
5260 
5261                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5262                                 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
5263                         } else {
5264                                 regs[rd] = *((uint64_t *)dvar->dtdv_data);
5265                         }
5266 
5267                         break;
5268                 }
5269 
5270                 case DIF_OP_STTS: {
5271                         dtrace_dynvar_t *dvar;
5272                         dtrace_key_t *key;
5273 
5274                         id = DIF_INSTR_VAR(instr);
5275                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
5276                         id -= DIF_VAR_OTHER_UBASE;
5277 
5278                         key = &tupregs[DIF_DTR_NREGS];
5279                         key[0].dttk_value = (uint64_t)id;
5280                         key[0].dttk_size = 0;
5281                         DTRACE_TLS_THRKEY(key[1].dttk_value);
5282                         key[1].dttk_size = 0;
5283                         v = &vstate->dtvs_tlocals[id];
5284 
5285                         dvar = dtrace_dynvar(dstate, 2, key,
5286                             v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5287                             v->dtdv_type.dtdt_size : sizeof (uint64_t),
5288                             regs[rd] ? DTRACE_DYNVAR_ALLOC :
5289                             DTRACE_DYNVAR_DEALLOC, mstate, vstate);
5290 
5291                         /*
5292                          * Given that we're storing to thread-local data,
5293                          * we need to flush our predicate cache.
5294                          */
5295                         curthread->t_predcache = NULL;
5296 
5297                         if (dvar == NULL)
5298                                 break;
5299 
5300                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5301                                 if (!dtrace_vcanload(
5302                                     (void *)(uintptr_t)regs[rd],
5303                                     &v->dtdv_type, mstate, vstate))
5304                                         break;
5305 
5306                                 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5307                                     dvar->dtdv_data, &v->dtdv_type);
5308                         } else {
5309                                 *((uint64_t *)dvar->dtdv_data) = regs[rd];
5310                         }
5311 
5312                         break;
5313                 }
5314 
5315                 case DIF_OP_SRA:
5316                         regs[rd] = (int64_t)regs[r1] >> regs[r2];
5317                         break;
5318 
5319                 case DIF_OP_CALL:
5320                         dtrace_dif_subr(DIF_INSTR_SUBR(instr), rd,
5321                             regs, tupregs, ttop, mstate, state);
5322                         break;
5323 
5324                 case DIF_OP_PUSHTR:
5325                         if (ttop == DIF_DTR_NREGS) {
5326                                 *flags |= CPU_DTRACE_TUPOFLOW;
5327                                 break;
5328                         }
5329 
5330                         if (r1 == DIF_TYPE_STRING) {
5331                                 /*
5332                                  * If this is a string type and the size is 0,
5333                                  * we'll use the system-wide default string
5334                                  * size.  Note that we are _not_ looking at
5335                                  * the value of the DTRACEOPT_STRSIZE option;
5336                                  * had this been set, we would expect to have
5337                                  * a non-zero size value in the "pushtr".
5338                                  */
5339                                 tupregs[ttop].dttk_size =
5340                                     dtrace_strlen((char *)(uintptr_t)regs[rd],
5341                                     regs[r2] ? regs[r2] :
5342                                     dtrace_strsize_default) + 1;
5343                         } else {
5344                                 tupregs[ttop].dttk_size = regs[r2];
5345                         }
5346 
5347                         tupregs[ttop++].dttk_value = regs[rd];
5348                         break;
5349 
5350                 case DIF_OP_PUSHTV:
5351                         if (ttop == DIF_DTR_NREGS) {
5352                                 *flags |= CPU_DTRACE_TUPOFLOW;
5353                                 break;
5354                         }
5355 
5356                         tupregs[ttop].dttk_value = regs[rd];
5357                         tupregs[ttop++].dttk_size = 0;
5358                         break;
5359 
5360                 case DIF_OP_POPTS:
5361                         if (ttop != 0)
5362                                 ttop--;
5363                         break;
5364 
5365                 case DIF_OP_FLUSHTS:
5366                         ttop = 0;
5367                         break;
5368 
5369                 case DIF_OP_LDGAA:
5370                 case DIF_OP_LDTAA: {
5371                         dtrace_dynvar_t *dvar;
5372                         dtrace_key_t *key = tupregs;
5373                         uint_t nkeys = ttop;
5374 
5375                         id = DIF_INSTR_VAR(instr);
5376                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
5377                         id -= DIF_VAR_OTHER_UBASE;
5378 
5379                         key[nkeys].dttk_value = (uint64_t)id;
5380                         key[nkeys++].dttk_size = 0;
5381 
5382                         if (DIF_INSTR_OP(instr) == DIF_OP_LDTAA) {
5383                                 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
5384                                 key[nkeys++].dttk_size = 0;
5385                                 v = &vstate->dtvs_tlocals[id];
5386                         } else {
5387                                 v = &vstate->dtvs_globals[id]->dtsv_var;
5388                         }
5389 
5390                         dvar = dtrace_dynvar(dstate, nkeys, key,
5391                             v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5392                             v->dtdv_type.dtdt_size : sizeof (uint64_t),
5393                             DTRACE_DYNVAR_NOALLOC, mstate, vstate);
5394 
5395                         if (dvar == NULL) {
5396                                 regs[rd] = 0;
5397                                 break;
5398                         }
5399 
5400                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5401                                 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
5402                         } else {
5403                                 regs[rd] = *((uint64_t *)dvar->dtdv_data);
5404                         }
5405 
5406                         break;
5407                 }
5408 
5409                 case DIF_OP_STGAA:
5410                 case DIF_OP_STTAA: {
5411                         dtrace_dynvar_t *dvar;
5412                         dtrace_key_t *key = tupregs;
5413                         uint_t nkeys = ttop;
5414 
5415                         id = DIF_INSTR_VAR(instr);
5416                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
5417                         id -= DIF_VAR_OTHER_UBASE;
5418 
5419                         key[nkeys].dttk_value = (uint64_t)id;
5420                         key[nkeys++].dttk_size = 0;
5421 
5422                         if (DIF_INSTR_OP(instr) == DIF_OP_STTAA) {
5423                                 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
5424                                 key[nkeys++].dttk_size = 0;
5425                                 v = &vstate->dtvs_tlocals[id];
5426                         } else {
5427                                 v = &vstate->dtvs_globals[id]->dtsv_var;
5428                         }
5429 
5430                         dvar = dtrace_dynvar(dstate, nkeys, key,
5431                             v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5432                             v->dtdv_type.dtdt_size : sizeof (uint64_t),
5433                             regs[rd] ? DTRACE_DYNVAR_ALLOC :
5434                             DTRACE_DYNVAR_DEALLOC, mstate, vstate);
5435 
5436                         if (dvar == NULL)
5437                                 break;
5438 
5439                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5440                                 if (!dtrace_vcanload(
5441                                     (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5442                                     mstate, vstate))
5443                                         break;
5444 
5445                                 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5446                                     dvar->dtdv_data, &v->dtdv_type);
5447                         } else {
5448                                 *((uint64_t *)dvar->dtdv_data) = regs[rd];
5449                         }
5450 
5451                         break;
5452                 }
5453 
5454                 case DIF_OP_ALLOCS: {
5455                         uintptr_t ptr = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
5456                         size_t size = ptr - mstate->dtms_scratch_ptr + regs[r1];
5457 
5458                         /*
5459                          * Rounding up the user allocation size could have
5460                          * overflowed large, bogus allocations (like -1ULL) to
5461                          * 0.
5462                          */
5463                         if (size < regs[r1] ||
5464                             !DTRACE_INSCRATCH(mstate, size)) {
5465                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5466                                 regs[rd] = NULL;
5467                                 break;
5468                         }
5469 
5470                         dtrace_bzero((void *) mstate->dtms_scratch_ptr, size);
5471                         mstate->dtms_scratch_ptr += size;
5472                         regs[rd] = ptr;
5473                         break;
5474                 }
5475 
5476                 case DIF_OP_COPYS:
5477                         if (!dtrace_canstore(regs[rd], regs[r2],
5478                             mstate, vstate)) {
5479                                 *flags |= CPU_DTRACE_BADADDR;
5480                                 *illval = regs[rd];
5481                                 break;
5482                         }
5483 
5484                         if (!dtrace_canload(regs[r1], regs[r2], mstate, vstate))
5485                                 break;
5486 
5487                         dtrace_bcopy((void *)(uintptr_t)regs[r1],
5488                             (void *)(uintptr_t)regs[rd], (size_t)regs[r2]);
5489                         break;
5490 
5491                 case DIF_OP_STB:
5492                         if (!dtrace_canstore(regs[rd], 1, mstate, vstate)) {
5493                                 *flags |= CPU_DTRACE_BADADDR;
5494                                 *illval = regs[rd];
5495                                 break;
5496                         }
5497                         *((uint8_t *)(uintptr_t)regs[rd]) = (uint8_t)regs[r1];
5498                         break;
5499 
5500                 case DIF_OP_STH:
5501                         if (!dtrace_canstore(regs[rd], 2, mstate, vstate)) {
5502                                 *flags |= CPU_DTRACE_BADADDR;
5503                                 *illval = regs[rd];
5504                                 break;
5505                         }
5506                         if (regs[rd] & 1) {
5507                                 *flags |= CPU_DTRACE_BADALIGN;
5508                                 *illval = regs[rd];
5509                                 break;
5510                         }
5511                         *((uint16_t *)(uintptr_t)regs[rd]) = (uint16_t)regs[r1];
5512                         break;
5513 
5514                 case DIF_OP_STW:
5515                         if (!dtrace_canstore(regs[rd], 4, mstate, vstate)) {
5516                                 *flags |= CPU_DTRACE_BADADDR;
5517                                 *illval = regs[rd];
5518                                 break;
5519                         }
5520                         if (regs[rd] & 3) {
5521                                 *flags |= CPU_DTRACE_BADALIGN;
5522                                 *illval = regs[rd];
5523                                 break;
5524                         }
5525                         *((uint32_t *)(uintptr_t)regs[rd]) = (uint32_t)regs[r1];
5526                         break;
5527 
5528                 case DIF_OP_STX:
5529                         if (!dtrace_canstore(regs[rd], 8, mstate, vstate)) {
5530                                 *flags |= CPU_DTRACE_BADADDR;
5531                                 *illval = regs[rd];
5532                                 break;
5533                         }
5534                         if (regs[rd] & 7) {
5535                                 *flags |= CPU_DTRACE_BADALIGN;
5536                                 *illval = regs[rd];
5537                                 break;
5538                         }
5539                         *((uint64_t *)(uintptr_t)regs[rd]) = regs[r1];
5540                         break;
5541                 }
5542         }
5543 
5544         if (!(*flags & CPU_DTRACE_FAULT))
5545                 return (rval);
5546 
5547         mstate->dtms_fltoffs = opc * sizeof (dif_instr_t);
5548         mstate->dtms_present |= DTRACE_MSTATE_FLTOFFS;
5549 
5550         return (0);
5551 }
5552 
5553 static void
5554 dtrace_action_breakpoint(dtrace_ecb_t *ecb)
5555 {
5556         dtrace_probe_t *probe = ecb->dte_probe;
5557         dtrace_provider_t *prov = probe->dtpr_provider;
5558         char c[DTRACE_FULLNAMELEN + 80], *str;
5559         char *msg = "dtrace: breakpoint action at probe ";
5560         char *ecbmsg = " (ecb ";
5561         uintptr_t mask = (0xf << (sizeof (uintptr_t) * NBBY / 4));
5562         uintptr_t val = (uintptr_t)ecb;
5563         int shift = (sizeof (uintptr_t) * NBBY) - 4, i = 0;
5564 
5565         if (dtrace_destructive_disallow)
5566                 return;
5567 
5568         /*
5569          * It's impossible to be taking action on the NULL probe.
5570          */
5571         ASSERT(probe != NULL);
5572 
5573         /*
5574          * This is a poor man's (destitute man's?) sprintf():  we want to
5575          * print the provider name, module name, function name and name of
5576          * the probe, along with the hex address of the ECB with the breakpoint
5577          * action -- all of which we must place in the character buffer by
5578          * hand.
5579          */
5580         while (*msg != '\0')
5581                 c[i++] = *msg++;
5582 
5583         for (str = prov->dtpv_name; *str != '\0'; str++)
5584                 c[i++] = *str;
5585         c[i++] = ':';
5586 
5587         for (str = probe->dtpr_mod; *str != '\0'; str++)
5588                 c[i++] = *str;
5589         c[i++] = ':';
5590 
5591         for (str = probe->dtpr_func; *str != '\0'; str++)
5592                 c[i++] = *str;
5593         c[i++] = ':';
5594 
5595         for (str = probe->dtpr_name; *str != '\0'; str++)
5596                 c[i++] = *str;
5597 
5598         while (*ecbmsg != '\0')
5599                 c[i++] = *ecbmsg++;
5600 
5601         while (shift >= 0) {
5602                 mask = (uintptr_t)0xf << shift;
5603 
5604                 if (val >= ((uintptr_t)1 << shift))
5605                         c[i++] = "0123456789abcdef"[(val & mask) >> shift];
5606                 shift -= 4;
5607         }
5608 
5609         c[i++] = ')';
5610         c[i] = '\0';
5611 
5612         debug_enter(c);
5613 }
5614 
5615 static void
5616 dtrace_action_panic(dtrace_ecb_t *ecb)
5617 {
5618         dtrace_probe_t *probe = ecb->dte_probe;
5619 
5620         /*
5621          * It's impossible to be taking action on the NULL probe.
5622          */
5623         ASSERT(probe != NULL);
5624 
5625         if (dtrace_destructive_disallow)
5626                 return;
5627 
5628         if (dtrace_panicked != NULL)
5629                 return;
5630 
5631         if (dtrace_casptr(&dtrace_panicked, NULL, curthread) != NULL)
5632                 return;
5633 
5634         /*
5635          * We won the right to panic.  (We want to be sure that only one
5636          * thread calls panic() from dtrace_probe(), and that panic() is
5637          * called exactly once.)
5638          */
5639         dtrace_panic("dtrace: panic action at probe %s:%s:%s:%s (ecb %p)",
5640             probe->dtpr_provider->dtpv_name, probe->dtpr_mod,
5641             probe->dtpr_func, probe->dtpr_name, (void *)ecb);
5642 }
5643 
5644 static void
5645 dtrace_action_raise(uint64_t sig)
5646 {
5647         if (dtrace_destructive_disallow)
5648                 return;
5649 
5650         if (sig >= NSIG) {
5651                 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5652                 return;
5653         }
5654 
5655         /*
5656          * raise() has a queue depth of 1 -- we ignore all subsequent
5657          * invocations of the raise() action.
5658          */
5659         if (curthread->t_dtrace_sig == 0)
5660                 curthread->t_dtrace_sig = (uint8_t)sig;
5661 
5662         curthread->t_sig_check = 1;
5663         aston(curthread);
5664 }
5665 
5666 static void
5667 dtrace_action_stop(void)
5668 {
5669         if (dtrace_destructive_disallow)
5670                 return;
5671 
5672         if (!curthread->t_dtrace_stop) {
5673                 curthread->t_dtrace_stop = 1;
5674                 curthread->t_sig_check = 1;
5675                 aston(curthread);
5676         }
5677 }
5678 
5679 static void
5680 dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val)
5681 {
5682         hrtime_t now;
5683         volatile uint16_t *flags;
5684         cpu_t *cpu = CPU;
5685 
5686         if (dtrace_destructive_disallow)
5687                 return;
5688 
5689         flags = (volatile uint16_t *)&cpu_core[cpu->cpu_id].cpuc_dtrace_flags;
5690 
5691         now = dtrace_gethrtime();
5692 
5693         if (now - cpu->cpu_dtrace_chillmark > dtrace_chill_interval) {
5694                 /*
5695                  * We need to advance the mark to the current time.
5696                  */
5697                 cpu->cpu_dtrace_chillmark = now;
5698                 cpu->cpu_dtrace_chilled = 0;
5699         }
5700 
5701         /*
5702          * Now check to see if the requested chill time would take us over
5703          * the maximum amount of time allowed in the chill interval.  (Or
5704          * worse, if the calculation itself induces overflow.)
5705          */
5706         if (cpu->cpu_dtrace_chilled + val > dtrace_chill_max ||
5707             cpu->cpu_dtrace_chilled + val < cpu->cpu_dtrace_chilled) {
5708                 *flags |= CPU_DTRACE_ILLOP;
5709                 return;
5710         }
5711 
5712         while (dtrace_gethrtime() - now < val)
5713                 continue;
5714 
5715         /*
5716          * Normally, we assure that the value of the variable "timestamp" does
5717          * not change within an ECB.  The presence of chill() represents an
5718          * exception to this rule, however.
5719          */
5720         mstate->dtms_present &= ~DTRACE_MSTATE_TIMESTAMP;
5721         cpu->cpu_dtrace_chilled += val;
5722 }
5723 
5724 static void
5725 dtrace_action_ustack(dtrace_mstate_t *mstate, dtrace_state_t *state,
5726     uint64_t *buf, uint64_t arg)
5727 {
5728         int nframes = DTRACE_USTACK_NFRAMES(arg);
5729         int strsize = DTRACE_USTACK_STRSIZE(arg);
5730         uint64_t *pcs = &buf[1], *fps;
5731         char *str = (char *)&pcs[nframes];
5732         int size, offs = 0, i, j;
5733         uintptr_t old = mstate->dtms_scratch_ptr, saved;
5734         uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
5735         char *sym;
5736 
5737         /*
5738          * Should be taking a faster path if string space has not been
5739          * allocated.
5740          */
5741         ASSERT(strsize != 0);
5742 
5743         /*
5744          * We will first allocate some temporary space for the frame pointers.
5745          */
5746         fps = (uint64_t *)P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
5747         size = (uintptr_t)fps - mstate->dtms_scratch_ptr +
5748             (nframes * sizeof (uint64_t));
5749 
5750         if (!DTRACE_INSCRATCH(mstate, size)) {
5751                 /*
5752                  * Not enough room for our frame pointers -- need to indicate
5753                  * that we ran out of scratch space.
5754                  */
5755                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5756                 return;
5757         }
5758 
5759         mstate->dtms_scratch_ptr += size;
5760         saved = mstate->dtms_scratch_ptr;
5761 
5762         /*
5763          * Now get a stack with both program counters and frame pointers.
5764          */
5765         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
5766         dtrace_getufpstack(buf, fps, nframes + 1);
5767         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
5768 
5769         /*
5770          * If that faulted, we're cooked.
5771          */
5772         if (*flags & CPU_DTRACE_FAULT)
5773                 goto out;
5774 
5775         /*
5776          * Now we want to walk up the stack, calling the USTACK helper.  For
5777          * each iteration, we restore the scratch pointer.
5778          */
5779         for (i = 0; i < nframes; i++) {
5780                 mstate->dtms_scratch_ptr = saved;
5781 
5782                 if (offs >= strsize)
5783                         break;
5784 
5785                 sym = (char *)(uintptr_t)dtrace_helper(
5786                     DTRACE_HELPER_ACTION_USTACK,
5787                     mstate, state, pcs[i], fps[i]);
5788 
5789                 /*
5790                  * If we faulted while running the helper, we're going to
5791                  * clear the fault and null out the corresponding string.
5792                  */
5793                 if (*flags & CPU_DTRACE_FAULT) {
5794                         *flags &= ~CPU_DTRACE_FAULT;
5795                         str[offs++] = '\0';
5796                         continue;
5797                 }
5798 
5799                 if (sym == NULL) {
5800                         str[offs++] = '\0';
5801                         continue;
5802                 }
5803 
5804                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
5805 
5806                 /*
5807                  * Now copy in the string that the helper returned to us.
5808                  */
5809                 for (j = 0; offs + j < strsize; j++) {
5810                         if ((str[offs + j] = sym[j]) == '\0')
5811                                 break;
5812                 }
5813 
5814                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
5815 
5816                 offs += j + 1;
5817         }
5818 
5819         if (offs >= strsize) {
5820                 /*
5821                  * If we didn't have room for all of the strings, we don't
5822                  * abort processing -- this needn't be a fatal error -- but we
5823                  * still want to increment a counter (dts_stkstroverflows) to
5824                  * allow this condition to be warned about.  (If this is from
5825                  * a jstack() action, it is easily tuned via jstackstrsize.)
5826                  */
5827                 dtrace_error(&state->dts_stkstroverflows);
5828         }
5829 
5830         while (offs < strsize)
5831                 str[offs++] = '\0';
5832 
5833 out:
5834         mstate->dtms_scratch_ptr = old;
5835 }
5836 
5837 /*
5838  * If you're looking for the epicenter of DTrace, you just found it.  This
5839  * is the function called by the provider to fire a probe -- from which all
5840  * subsequent probe-context DTrace activity emanates.
      *
      * The probe is looked up as dtrace_probes[id - 1], and arg0 through arg4
      * are copied into the machine state (dtms_arg[0] through dtms_arg[4]) for
      * use by predicates and actions.  Everything after the initial curthread
      * check runs between dtrace_interrupt_disable() and
      * dtrace_interrupt_enable().  Each ECB chained off of the probe (dtpr_ecb,
      * linked by dte_next) is processed in turn:  space is reserved in the
      * per-CPU buffer, the predicate (if any) is evaluated, and then each
      * action on the ECB is executed.  The buffer offset is only advanced once
      * an ECB has been processed without a drop or a fault.
5841  */
5842 void
5843 dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1,
5844     uintptr_t arg2, uintptr_t arg3, uintptr_t arg4)
5845 {
5846         processorid_t cpuid;
5847         dtrace_icookie_t cookie;
5848         dtrace_probe_t *probe;
5849         dtrace_mstate_t mstate;
5850         dtrace_ecb_t *ecb;
5851         dtrace_action_t *act;
5852         intptr_t offs;
5853         size_t size;
5854         int vtime, onintr;
5855         volatile uint16_t *flags;
5856         hrtime_t now;
5857 
5858         /*
5859          * Kick out immediately if this CPU is still being born (in which case
5860          * curthread will be set to -1) or the current thread can't allow
5861          * probes in its current context.
5862          */
5863         if (((uintptr_t)curthread & 1) || (curthread->t_flag & T_DONTDTRACE))
5864                 return;
5865 
5866         cookie = dtrace_interrupt_disable();
5867         probe = dtrace_probes[id - 1];
5868         cpuid = CPU->cpu_id;
5869         onintr = CPU_ON_INTR(CPU);
5870 
5871         if (!onintr && probe->dtpr_predcache != DTRACE_CACHEIDNONE &&
5872             probe->dtpr_predcache == curthread->t_predcache) {
5873                 /*
5874                  * We have hit in the predicate cache; we know that
5875                  * this predicate would evaluate to be false.
5876                  */
5877                 dtrace_interrupt_enable(cookie);
5878                 return;
5879         }
5880 
5881         if (panic_quiesce) {
5882                 /*
5883                  * We don't trace anything if we're panicking.
5884                  */
5885                 dtrace_interrupt_enable(cookie);
5886                 return;
5887         }
5888 
5889         now = dtrace_gethrtime();
5890         vtime = dtrace_vtime_references != 0;
5891 
             /*
              * If vtime is in use, charge the time since this thread's
              * t_dtrace_start to its t_dtrace_vtime.  (t_dtrace_start is set
              * again on the way out of this function.)
              */
5892         if (vtime && curthread->t_dtrace_start)
5893                 curthread->t_dtrace_vtime += now - curthread->t_dtrace_start;
5894 
5895         mstate.dtms_difo = NULL;
5896         mstate.dtms_probe = probe;
5897         mstate.dtms_strtok = NULL;
5898         mstate.dtms_arg[0] = arg0;
5899         mstate.dtms_arg[1] = arg1;
5900         mstate.dtms_arg[2] = arg2;
5901         mstate.dtms_arg[3] = arg3;
5902         mstate.dtms_arg[4] = arg4;
5903 
5904         flags = (volatile uint16_t *)&cpu_core[cpuid].cpuc_dtrace_flags;
5905 
             /*
              * Walk every ECB on this probe.  Note that a 'continue' from the
              * body of this loop abandons the current ECB without advancing
              * the buffer's dtb_offset.
              */
5906         for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
5907                 dtrace_predicate_t *pred = ecb->dte_predicate;
5908                 dtrace_state_t *state = ecb->dte_state;
5909                 dtrace_buffer_t *buf = &state->dts_buffer[cpuid];
5910                 dtrace_buffer_t *aggbuf = &state->dts_aggbuffer[cpuid];
5911                 dtrace_vstate_t *vstate = &state->dts_vstate;
5912                 dtrace_provider_t *prov = probe->dtpr_provider;
5913                 uint64_t tracememsize = 0;
5914                 int committed = 0;
5915                 caddr_t tomax;
5916 
5917                 /*
5918                  * A little subtlety with the following (seemingly innocuous)
5919                  * declaration of the automatic 'val':  by looking at the
5920                  * code, you might think that it could be declared in the
5921                  * action processing loop, below.  (That is, it's only used in
5922                  * the action processing loop.)  However, it must be declared
5923                  * out of that scope because in the case of DIF expression
5924                  * arguments to aggregating actions, one iteration of the
5925                  * action loop will use the last iteration's value.
5926                  */
5927 #ifdef lint
5928                 uint64_t val = 0;
5929 #else
5930                 uint64_t val;
5931 #endif
5932 
                     /*
                      * Reset the per-ECB portion of the machine state, and
                      * clear the CPU_DTRACE_ERROR bits so that errors are
                      * attributed to this ECB only.
                      */
5933                 mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE;
5934                 mstate.dtms_access = DTRACE_ACCESS_ARGS | DTRACE_ACCESS_PROC;
5935                 *flags &= ~CPU_DTRACE_ERROR;
5936 
5937                 if (prov == dtrace_provider) {
5938                         /*
5939                          * If dtrace itself is the provider of this probe,
5940                          * we're only going to continue processing the ECB if
5941                          * arg0 (the dtrace_state_t) is equal to the ECB's
5942                          * creating state.  (This prevents disjoint consumers
5943                          * from seeing one another's metaprobes.)
5944                          */
5945                         if (arg0 != (uint64_t)(uintptr_t)state)
5946                                 continue;
5947                 }
5948 
5949                 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE) {
5950                         /*
5951                          * We're not currently active.  If our provider isn't
5952                          * the dtrace pseudo provider, we're not interested.
5953                          */
5954                         if (prov != dtrace_provider)
5955                                 continue;
5956 
5957                         /*
5958                          * Now we must further check if we are in the BEGIN
5959                          * probe.  If we are, we will only continue processing
5960                          * if we're still in WARMUP -- if one BEGIN enabling
5961                          * has invoked the exit() action, we don't want to
5962                          * evaluate subsequent BEGIN enablings.
5963                          */
5964                         if (probe->dtpr_id == dtrace_probeid_begin &&
5965                             state->dts_activity != DTRACE_ACTIVITY_WARMUP) {
5966                                 ASSERT(state->dts_activity ==
5967                                     DTRACE_ACTIVITY_DRAINING);
5968                                 continue;
5969                         }
5970                 }
5971 
5972                 if (ecb->dte_cond && !dtrace_priv_probe(state, &mstate, ecb))
5973                         continue;
5974 
5975                 if (now - state->dts_alive > dtrace_deadman_timeout) {
5976                         /*
5977                          * We seem to be dead.  Unless we (a) have kernel
5978                          * destructive permissions (b) have explicitly enabled
5979                          * destructive actions and (c) destructive actions have
5980                          * not been disabled, we're going to transition into
5981                          * the KILLED state, from which no further processing
5982                          * on this state will be performed.
5983                          */
5984                         if (!dtrace_priv_kernel_destructive(state) ||
5985                             !state->dts_cred.dcr_destructive ||
5986                             dtrace_destructive_disallow) {
5987                                 void *activity = &state->dts_activity;
5988                                 dtrace_activity_t current;
5989 
5990                                 do {
5991                                         current = state->dts_activity;
5992                                 } while (dtrace_cas32(activity, current,
5993                                     DTRACE_ACTIVITY_KILLED) != current);
5994 
5995                                 continue;
5996                         }
5997                 }
5998 
                     /*
                      * Reserve space for this ECB's record in the per-CPU
                      * principal buffer; a negative return means that we skip
                      * this ECB entirely.
                      */
5999                 if ((offs = dtrace_buffer_reserve(buf, ecb->dte_needed,
6000                     ecb->dte_alignment, state, &mstate)) < 0)
6001                         continue;
6002 
6003                 tomax = buf->dtb_tomax;
6004                 ASSERT(tomax != NULL);
6005 
6006                 if (ecb->dte_size != 0)
6007                         DTRACE_STORE(uint32_t, tomax, offs, ecb->dte_epid);
6008 
6009                 mstate.dtms_epid = ecb->dte_epid;
6010                 mstate.dtms_present |= DTRACE_MSTATE_EPID;
6011 
6012                 if (state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)
6013                         mstate.dtms_access |= DTRACE_ACCESS_KERNEL;
6014 
6015                 if (pred != NULL) {
6016                         dtrace_difo_t *dp = pred->dtp_difo;
6017                         int rval;
6018 
6019                         rval = dtrace_dif_emulate(dp, &mstate, vstate, state);
6020 
6021                         if (!(*flags & CPU_DTRACE_ERROR) && !rval) {
6022                                 dtrace_cacheid_t cid = probe->dtpr_predcache;
6023 
6024                                 if (cid != DTRACE_CACHEIDNONE && !onintr) {
6025                                         /*
6026                                          * Update the predicate cache...
6027                                          */
6028                                         ASSERT(cid == pred->dtp_cacheid);
6029                                         curthread->t_predcache = cid;
6030                                 }
6031 
6032                                 continue;
6033                         }
6034                 }
6035 
                     /*
                      * Process each action on the ECB in order.  The loop
                      * terminates early if any of the CPU_DTRACE_ERROR bits
                      * become set, leaving 'act' pointing past the action that
                      * induced the error (see the fault handling, below).
                      */
6036                 for (act = ecb->dte_action; !(*flags & CPU_DTRACE_ERROR) &&
6037                     act != NULL; act = act->dta_next) {
6038                         size_t valoffs;
6039                         dtrace_difo_t *dp;
6040                         dtrace_recdesc_t *rec = &act->dta_rec;
6041 
6042                         size = rec->dtrd_size;
6043                         valoffs = offs + rec->dtrd_offset;
6044 
6045                         if (DTRACEACT_ISAGG(act->dta_kind)) {
6046                                 uint64_t v = 0xbad;
6047                                 dtrace_aggregation_t *agg;
6048 
6049                                 agg = (dtrace_aggregation_t *)act;
6050 
6051                                 if ((dp = act->dta_difo) != NULL)
6052                                         v = dtrace_dif_emulate(dp,
6053                                             &mstate, vstate, state);
6054 
6055                                 if (*flags & CPU_DTRACE_ERROR)
6056                                         continue;
6057 
6058                                 /*
6059                                  * Note that we always pass the expression
6060                                  * value from the previous iteration of the
6061                                  * action loop.  This value will only be used
6062                                  * if there is an expression argument to the
6063                                  * aggregating action, denoted by the
6064                                  * dtag_hasarg field.
6065                                  */
6066                                 dtrace_aggregate(agg, buf,
6067                                     offs, aggbuf, v, val);
6068                                 continue;
6069                         }
6070 
                             /*
                              * First, handle the actions that do not take a
                              * DIF expression.
                              */
6071                         switch (act->dta_kind) {
6072                         case DTRACEACT_STOP:
6073                                 if (dtrace_priv_proc_destructive(state,
6074                                     &mstate))
6075                                         dtrace_action_stop();
6076                                 continue;
6077 
6078                         case DTRACEACT_BREAKPOINT:
6079                                 if (dtrace_priv_kernel_destructive(state))
6080                                         dtrace_action_breakpoint(ecb);
6081                                 continue;
6082 
6083                         case DTRACEACT_PANIC:
6084                                 if (dtrace_priv_kernel_destructive(state))
6085                                         dtrace_action_panic(ecb);
6086                                 continue;
6087 
6088                         case DTRACEACT_STACK:
6089                                 if (!dtrace_priv_kernel(state))
6090                                         continue;
6091 
6092                                 dtrace_getpcstack((pc_t *)(tomax + valoffs),
6093                                     size / sizeof (pc_t), probe->dtpr_aframes,
6094                                     DTRACE_ANCHORED(probe) ? NULL :
6095                                     (uint32_t *)arg0);
6096 
6097                                 continue;
6098 
6099                         case DTRACEACT_JSTACK:
6100                         case DTRACEACT_USTACK:
6101                                 if (!dtrace_priv_proc(state, &mstate))
6102                                         continue;
6103 
6104                                 /*
6105                                  * See comment in DIF_VAR_PID.
6106                                  */
6107                                 if (DTRACE_ANCHORED(mstate.dtms_probe) &&
6108                                     CPU_ON_INTR(CPU)) {
6109                                         int depth = DTRACE_USTACK_NFRAMES(
6110                                             rec->dtrd_arg) + 1;
6111 
6112                                         dtrace_bzero((void *)(tomax + valoffs),
6113                                             DTRACE_USTACK_STRSIZE(rec->dtrd_arg)
6114                                             + depth * sizeof (uint64_t));
6115 
6116                                         continue;
6117                                 }
6118 
6119                                 if (DTRACE_USTACK_STRSIZE(rec->dtrd_arg) != 0 &&
6120                                     curproc->p_dtrace_helpers != NULL) {
6121                                         /*
6122                                          * This is the slow path -- we have
6123                                          * allocated string space, and we're
6124                                          * getting the stack of a process that
6125                                          * has helpers.  Call into a separate
6126                                          * routine to perform this processing.
6127                                          */
6128                                         dtrace_action_ustack(&mstate, state,
6129                                             (uint64_t *)(tomax + valoffs),
6130                                             rec->dtrd_arg);
6131                                         continue;
6132                                 }
6133 
6134                                 /*
6135                                  * Clear the string space, since there's no
6136                                  * helper to do it for us.
6137                                  */
6138                                 if (DTRACE_USTACK_STRSIZE(rec->dtrd_arg) != 0) {
6139                                         int depth = DTRACE_USTACK_NFRAMES(
6140                                             rec->dtrd_arg);
6141                                         size_t strsize = DTRACE_USTACK_STRSIZE(
6142                                             rec->dtrd_arg);
                                             /*
                                              * Note that this 'buf' shadows
                                              * the dtrace_buffer_t 'buf' of
                                              * the enclosing ECB loop.
                                              */
6143                                         uint64_t *buf = (uint64_t *)(tomax +
6144                                             valoffs);
6145                                         void *strspace = &buf[depth + 1];
6146 
6147                                         dtrace_bzero(strspace,
6148                                             MIN(depth, strsize));
6149                                 }
6150 
6151                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6152                                 dtrace_getupcstack((uint64_t *)
6153                                     (tomax + valoffs),
6154                                     DTRACE_USTACK_NFRAMES(rec->dtrd_arg) + 1);
6155                                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6156                                 continue;
6157 
6158                         default:
6159                                 break;
6160                         }
6161 
                             /*
                              * All remaining actions have a DIF object; we
                              * evaluate it into 'val' before dispatching on
                              * the kind of action.
                              */
6162                         dp = act->dta_difo;
6163                         ASSERT(dp != NULL);
6164 
6165                         val = dtrace_dif_emulate(dp, &mstate, vstate, state);
6166 
6167                         if (*flags & CPU_DTRACE_ERROR)
6168                                 continue;
6169 
6170                         switch (act->dta_kind) {
6171                         case DTRACEACT_SPECULATE:
6172                                 ASSERT(buf == &state->dts_buffer[cpuid]);
6173                                 buf = dtrace_speculation_buffer(state,
6174                                     cpuid, val);
6175 
6176                                 if (buf == NULL) {
6177                                         *flags |= CPU_DTRACE_DROP;
6178                                         continue;
6179                                 }
6180 
6181                                 offs = dtrace_buffer_reserve(buf,
6182                                     ecb->dte_needed, ecb->dte_alignment,
6183                                     state, NULL);
6184 
6185                                 if (offs < 0) {
6186                                         *flags |= CPU_DTRACE_DROP;
6187                                         continue;
6188                                 }
6189 
6190                                 tomax = buf->dtb_tomax;
6191                                 ASSERT(tomax != NULL);
6192 
6193                                 if (ecb->dte_size != 0)
6194                                         DTRACE_STORE(uint32_t, tomax, offs,
6195                                             ecb->dte_epid);
6196                                 continue;
6197 
6198                         case DTRACEACT_CHILL:
6199                                 if (dtrace_priv_kernel_destructive(state))
6200                                         dtrace_action_chill(&mstate, val);
6201                                 continue;
6202 
6203                         case DTRACEACT_RAISE:
6204                                 if (dtrace_priv_proc_destructive(state,
6205                                     &mstate))
6206                                         dtrace_action_raise(val);
6207                                 continue;
6208 
6209                         case DTRACEACT_COMMIT:
6210                                 ASSERT(!committed);
6211 
6212                                 /*
6213                                  * We need to commit our buffer state.
6214                                  */
6215                                 if (ecb->dte_size)
6216                                         buf->dtb_offset = offs + ecb->dte_size;
6217                                 buf = &state->dts_buffer[cpuid];
6218                                 dtrace_speculation_commit(state, cpuid, val);
6219                                 committed = 1;
6220                                 continue;
6221 
6222                         case DTRACEACT_DISCARD:
6223                                 dtrace_speculation_discard(state, cpuid, val);
6224                                 continue;
6225 
6226                         case DTRACEACT_DIFEXPR:
6227                         case DTRACEACT_LIBACT:
6228                         case DTRACEACT_PRINTF:
6229                         case DTRACEACT_PRINTA:
6230                         case DTRACEACT_SYSTEM:
6231                         case DTRACEACT_FREOPEN:
6232                         case DTRACEACT_TRACEMEM:
6233                                 break;
6234 
6235                         case DTRACEACT_TRACEMEM_DYNSIZE:
6236                                 tracememsize = val;
6237                                 break;
6238 
6239                         case DTRACEACT_SYM:
6240                         case DTRACEACT_MOD:
6241                                 if (!dtrace_priv_kernel(state))
6242                                         continue;
6243                                 break;
6244 
6245                         case DTRACEACT_USYM:
6246                         case DTRACEACT_UMOD:
6247                         case DTRACEACT_UADDR: {
6248                                 struct pid *pid = curthread->t_procp->p_pidp;
6249 
6250                                 if (!dtrace_priv_proc(state, &mstate))
6251                                         continue;
6252 
6253                                 DTRACE_STORE(uint64_t, tomax,
6254                                     valoffs, (uint64_t)pid->pid_id);
6255                                 DTRACE_STORE(uint64_t, tomax,
6256                                     valoffs + sizeof (uint64_t), val);
6257 
6258                                 continue;
6259                         }
6260 
6261                         case DTRACEACT_EXIT: {
6262                                 /*
6263                                  * For the exit action, we are going to attempt
6264                                  * to atomically set our activity to be
6265                                  * draining.  If this fails (either because
6266                                  * another CPU has beat us to the exit action,
6267                                  * or because our current activity is something
6268                                  * other than ACTIVE or WARMUP), we will
6269                                  * continue.  This assures that the exit action
6270                                  * can be successfully recorded at most once
6271                                  * when we're in the ACTIVE state.  If we're
6272                                  * encountering the exit() action while in
6273                                  * COOLDOWN, however, we want to honor the new
6274                                  * status code.  (We know that we're the only
6275                                  * thread in COOLDOWN, so there is no race.)
6276                                  */
6277                                 void *activity = &state->dts_activity;
6278                                 dtrace_activity_t current = state->dts_activity;
6279 
6280                                 if (current == DTRACE_ACTIVITY_COOLDOWN)
6281                                         break;
6282 
6283                                 if (current != DTRACE_ACTIVITY_WARMUP)
6284                                         current = DTRACE_ACTIVITY_ACTIVE;
6285 
6286                                 if (dtrace_cas32(activity, current,
6287                                     DTRACE_ACTIVITY_DRAINING) != current) {
6288                                         *flags |= CPU_DTRACE_DROP;
6289                                         continue;
6290                                 }
6291 
6292                                 break;
6293                         }
6294 
6295                         default:
6296                                 ASSERT(0);
6297                         }
6298 
                             /*
                              * If the DIF object returns its result by
                              * reference, 'val' is an address:  we copy the
                              * data that it refers to into the record, one
                              * byte at a time.
                              */
6299                         if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF) {
6300                                 uintptr_t end = valoffs + size;
6301 
6302                                 if (tracememsize != 0 &&
6303                                     valoffs + tracememsize < end) {
6304                                         end = valoffs + tracememsize;
6305                                         tracememsize = 0;
6306                                 }
6307 
6308                                 if (!dtrace_vcanload((void *)(uintptr_t)val,
6309                                     &dp->dtdo_rtype, &mstate, vstate))
6310                                         continue;
6311 
6312                                 /*
6313                                  * If this is a string, we're going to only
6314                                  * load until we find the zero byte -- after
6315                                  * which we'll store zero bytes.
6316                                  */
6317                                 if (dp->dtdo_rtype.dtdt_kind ==
6318                                     DIF_TYPE_STRING) {
6319                                         char c = '\0' + 1;
6320                                         int intuple = act->dta_intuple;
6321                                         size_t s;
6322 
6323                                         for (s = 0; s < size; s++) {
6324                                                 if (c != '\0')
6325                                                         c = dtrace_load8(val++);
6326 
6327                                                 DTRACE_STORE(uint8_t, tomax,
6328                                                     valoffs++, c);
6329 
6330                                                 if (c == '\0' && intuple)
6331                                                         break;
6332                                         }
6333 
6334                                         continue;
6335                                 }
6336 
6337                                 while (valoffs < end) {
6338                                         DTRACE_STORE(uint8_t, tomax, valoffs++,
6339                                             dtrace_load8(val++));
6340                                 }
6341 
6342                                 continue;
6343                         }
6344 
                             /*
                              * The result is by value:  store 'val' into the
                              * record using the width given by the record's
                              * size.
                              */
6345                         switch (size) {
6346                         case 0:
6347                                 break;
6348 
6349                         case sizeof (uint8_t):
6350                                 DTRACE_STORE(uint8_t, tomax, valoffs, val);
6351                                 break;
6352                         case sizeof (uint16_t):
6353                                 DTRACE_STORE(uint16_t, tomax, valoffs, val);
6354                                 break;
6355                         case sizeof (uint32_t):
6356                                 DTRACE_STORE(uint32_t, tomax, valoffs, val);
6357                                 break;
6358                         case sizeof (uint64_t):
6359                                 DTRACE_STORE(uint64_t, tomax, valoffs, val);
6360                                 break;
6361                         default:
6362                                 /*
6363                                  * Any other size should have been returned by
6364                                  * reference, not by value.
6365                                  */
6366                                 ASSERT(0);
6367                                 break;
6368                         }
6369                 }
6370 
6371                 if (*flags & CPU_DTRACE_DROP)
6372                         continue;
6373 
6374                 if (*flags & CPU_DTRACE_FAULT) {
6375                         int ndx;
6376                         dtrace_action_t *err;
6377 
6378                         buf->dtb_errors++;
6379 
6380                         if (probe->dtpr_id == dtrace_probeid_error) {
6381                                 /*
6382                                  * There's nothing we can do -- we had an
6383                                  * error on the error probe.  We bump an
6384                                  * error counter to at least indicate that
6385                                  * this condition happened.
6386                                  */
6387                                 dtrace_error(&state->dts_dblerrors);
6388                                 continue;
6389                         }
6390 
6391                         if (vtime) {
6392                                 /*
6393                                  * Before recursing on dtrace_probe(), we
6394                                  * need to explicitly clear out our start
6395                                  * time to prevent it from being accumulated
6396                                  * into t_dtrace_vtime.
6397                                  */
6398                                 curthread->t_dtrace_start = 0;
6399                         }
6400 
6401                         /*
6402                          * Iterate over the actions to figure out which action
6403                          * we were processing when we experienced the error.
6404                          * Note that act points _past_ the faulting action; if
6405                          * act is ecb->dte_action, the fault was in the
6406                          * predicate, if it's ecb->dte_action->dta_next it's
6407                          * in action #1, and so on.
6408                          */
6409                         for (err = ecb->dte_action, ndx = 0;
6410                             err != act; err = err->dta_next, ndx++)
6411                                 continue;
6412 
6413                         dtrace_probe_error(state, ecb->dte_epid, ndx,
6414                             (mstate.dtms_present & DTRACE_MSTATE_FLTOFFS) ?
6415                             mstate.dtms_fltoffs : -1, DTRACE_FLAGS2FLT(*flags),
6416                             cpu_core[cpuid].cpuc_dtrace_illval);
6417 
6418                         continue;
6419                 }
6420 
                     /*
                      * The ECB completed without a drop or a fault:  make its
                      * record visible by advancing the buffer offset (unless a
                      * commit() action has already done so).
                      */
6421                 if (!committed)
6422                         buf->dtb_offset = offs + ecb->dte_size;
6423         }
6424 
             /*
              * Restart the thread's vtime clock now that probe processing is
              * complete, so that time spent in this function is not charged
              * to t_dtrace_vtime.
              */
6425         if (vtime)
6426                 curthread->t_dtrace_start = dtrace_gethrtime();
6427 
6428         dtrace_interrupt_enable(cookie);
6429 }
6430 
6431 /*
6432  * DTrace Probe Hashing Functions
6433  *
6434  * The functions in this section (and indeed, the functions in remaining
6435  * sections) are not _called_ from probe context.  (Any exceptions to this are
6436  * marked with a "Note:".)  Rather, they are called from elsewhere in the
6437  * DTrace framework to look-up probes in, add probes to and remove probes from
6438  * the DTrace probe hashes.  (Each probe is hashed by each element of the
6439  * probe tuple -- allowing for fast lookups, regardless of what was
6440  * specified.)
6441  */
6442 static uint_t
6443 dtrace_hash_str(char *p)
6444 {
6445         unsigned int g;
6446         uint_t hval = 0;
6447 
6448         while (*p) {
6449                 hval = (hval << 4) + *p++;
6450                 if ((g = (hval & 0xf0000000)) != 0)
6451                         hval ^= g >> 24;
6452                 hval &= ~g;
6453         }
6454         return (hval);
6455 }
6456 
6457 static dtrace_hash_t *
6458 dtrace_hash_create(uintptr_t stroffs, uintptr_t nextoffs, uintptr_t prevoffs)
6459 {
6460         dtrace_hash_t *hash = kmem_zalloc(sizeof (dtrace_hash_t), KM_SLEEP);
6461 
6462         hash->dth_stroffs = stroffs;
6463         hash->dth_nextoffs = nextoffs;
6464         hash->dth_prevoffs = prevoffs;
6465 
6466         hash->dth_size = 1;
6467         hash->dth_mask = hash->dth_size - 1;
6468 
6469         hash->dth_tab = kmem_zalloc(hash->dth_size *
6470             sizeof (dtrace_hashbucket_t *), KM_SLEEP);
6471 
6472         return (hash);
6473 }
6474 
6475 static void
6476 dtrace_hash_destroy(dtrace_hash_t *hash)
6477 {
6478 #ifdef DEBUG
6479         int i;
6480 
6481         for (i = 0; i < hash->dth_size; i++)
6482                 ASSERT(hash->dth_tab[i] == NULL);
6483 #endif
6484 
6485         kmem_free(hash->dth_tab,
6486             hash->dth_size * sizeof (dtrace_hashbucket_t *));
6487         kmem_free(hash, sizeof (dtrace_hash_t));
6488 }
6489 
6490 static void
6491 dtrace_hash_resize(dtrace_hash_t *hash)
6492 {
6493         int size = hash->dth_size, i, ndx;
6494         int new_size = hash->dth_size << 1;
6495         int new_mask = new_size - 1;
6496         dtrace_hashbucket_t **new_tab, *bucket, *next;
6497 
6498         ASSERT((new_size & new_mask) == 0);
6499 
6500         new_tab = kmem_zalloc(new_size * sizeof (void *), KM_SLEEP);
6501 
6502         for (i = 0; i < size; i++) {
6503                 for (bucket = hash->dth_tab[i]; bucket != NULL; bucket = next) {
6504                         dtrace_probe_t *probe = bucket->dthb_chain;
6505 
6506                         ASSERT(probe != NULL);
6507                         ndx = DTRACE_HASHSTR(hash, probe) & new_mask;
6508 
6509                         next = bucket->dthb_next;
6510                         bucket->dthb_next = new_tab[ndx];
6511                         new_tab[ndx] = bucket;
6512                 }
6513         }
6514 
6515         kmem_free(hash->dth_tab, hash->dth_size * sizeof (void *));
6516         hash->dth_tab = new_tab;
6517         hash->dth_size = new_size;
6518         hash->dth_mask = new_mask;
6519 }
6520 
6521 static void
6522 dtrace_hash_add(dtrace_hash_t *hash, dtrace_probe_t *new)
6523 {
6524         int hashval = DTRACE_HASHSTR(hash, new);
6525         int ndx = hashval & hash->dth_mask;
6526         dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6527         dtrace_probe_t **nextp, **prevp;
6528 
6529         for (; bucket != NULL; bucket = bucket->dthb_next) {
6530                 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, new))
6531                         goto add;
6532         }
6533 
6534         if ((hash->dth_nbuckets >> 1) > hash->dth_size) {
6535                 dtrace_hash_resize(hash);
6536                 dtrace_hash_add(hash, new);
6537                 return;
6538         }
6539 
6540         bucket = kmem_zalloc(sizeof (dtrace_hashbucket_t), KM_SLEEP);
6541         bucket->dthb_next = hash->dth_tab[ndx];
6542         hash->dth_tab[ndx] = bucket;
6543         hash->dth_nbuckets++;
6544 
6545 add:
6546         nextp = DTRACE_HASHNEXT(hash, new);
6547         ASSERT(*nextp == NULL && *(DTRACE_HASHPREV(hash, new)) == NULL);
6548         *nextp = bucket->dthb_chain;
6549 
6550         if (bucket->dthb_chain != NULL) {
6551                 prevp = DTRACE_HASHPREV(hash, bucket->dthb_chain);
6552                 ASSERT(*prevp == NULL);
6553                 *prevp = new;
6554         }
6555 
6556         bucket->dthb_chain = new;
6557         bucket->dthb_len++;
6558 }
6559 
6560 static dtrace_probe_t *
6561 dtrace_hash_lookup(dtrace_hash_t *hash, dtrace_probe_t *template)
6562 {
6563         int hashval = DTRACE_HASHSTR(hash, template);
6564         int ndx = hashval & hash->dth_mask;
6565         dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6566 
6567         for (; bucket != NULL; bucket = bucket->dthb_next) {
6568                 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
6569                         return (bucket->dthb_chain);
6570         }
6571 
6572         return (NULL);
6573 }
6574 
6575 static int
6576 dtrace_hash_collisions(dtrace_hash_t *hash, dtrace_probe_t *template)
6577 {
6578         int hashval = DTRACE_HASHSTR(hash, template);
6579         int ndx = hashval & hash->dth_mask;
6580         dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6581 
6582         for (; bucket != NULL; bucket = bucket->dthb_next) {
6583                 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
6584                         return (bucket->dthb_len);
6585         }
6586 
6587         return (NULL);
6588 }
6589 
6590 static void
6591 dtrace_hash_remove(dtrace_hash_t *hash, dtrace_probe_t *probe)
6592 {
6593         int ndx = DTRACE_HASHSTR(hash, probe) & hash->dth_mask;
6594         dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6595 
6596         dtrace_probe_t **prevp = DTRACE_HASHPREV(hash, probe);
6597         dtrace_probe_t **nextp = DTRACE_HASHNEXT(hash, probe);
6598 
6599         /*
6600          * Find the bucket that we're removing this probe from.
6601          */
6602         for (; bucket != NULL; bucket = bucket->dthb_next) {
6603                 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, probe))
6604                         break;
6605         }
6606 
6607         ASSERT(bucket != NULL);
6608 
6609         if (*prevp == NULL) {
6610                 if (*nextp == NULL) {
6611                         /*
6612                          * The removed probe was the only probe on this
6613                          * bucket; we need to remove the bucket.
6614                          */
6615                         dtrace_hashbucket_t *b = hash->dth_tab[ndx];
6616 
6617                         ASSERT(bucket->dthb_chain == probe);
6618                         ASSERT(b != NULL);
6619 
6620                         if (b == bucket) {
6621                                 hash->dth_tab[ndx] = bucket->dthb_next;
6622                         } else {
6623                                 while (b->dthb_next != bucket)
6624                                         b = b->dthb_next;
6625                                 b->dthb_next = bucket->dthb_next;
6626                         }
6627 
6628                         ASSERT(hash->dth_nbuckets > 0);
6629                         hash->dth_nbuckets--;
6630                         kmem_free(bucket, sizeof (dtrace_hashbucket_t));
6631                         return;
6632                 }
6633 
6634                 bucket->dthb_chain = *nextp;
6635         } else {
6636                 *(DTRACE_HASHNEXT(hash, *prevp)) = *nextp;
6637         }
6638 
6639         if (*nextp != NULL)
6640                 *(DTRACE_HASHPREV(hash, *nextp)) = *prevp;
6641 }
6642 
6643 /*
6644  * DTrace Utility Functions
6645  *
6646  * These are random utility functions that are _not_ called from probe context.
6647  */
6648 static int
6649 dtrace_badattr(const dtrace_attribute_t *a)
6650 {
6651         return (a->dtat_name > DTRACE_STABILITY_MAX ||
6652             a->dtat_data > DTRACE_STABILITY_MAX ||
6653             a->dtat_class > DTRACE_CLASS_MAX);
6654 }
6655 
6656 /*
6657  * Return a duplicate copy of a string.  If the specified string is NULL,
6658  * this function returns a zero-length string.
6659  */
6660 static char *
6661 dtrace_strdup(const char *str)
6662 {
6663         char *new = kmem_zalloc((str != NULL ? strlen(str) : 0) + 1, KM_SLEEP);
6664 
6665         if (str != NULL)
6666                 (void) strcpy(new, str);
6667 
6668         return (new);
6669 }
6670 
#define DTRACE_ISALPHA(c)       \
        (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))

/*
 * Return non-zero if 's' is not an acceptable name.  NULL and the empty
 * string are both accepted (zero is returned).  Otherwise every character
 * must be an ASCII letter, '-', '_' or '.'; in addition, digits and '`' are
 * permitted anywhere except in the first position.
 */
static int
dtrace_badname(const char *s)
{
        const char *cp;

        if (s == NULL || *s == '\0')
                return (0);

        for (cp = s; *cp != '\0'; cp++) {
                char c = *cp;

                if (DTRACE_ISALPHA(c) || c == '-' || c == '_' || c == '.')
                        continue;

                /*
                 * Digits and backquote are legal, but not as the leading
                 * character.
                 */
                if (cp != s && ((c >= '0' && c <= '9') || c == '`'))
                        continue;

                return (1);
        }

        return (0);
}
6693 
6694 static void
6695 dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp)
6696 {
6697         uint32_t priv;
6698 
6699         if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
6700                 /*
6701                  * For DTRACE_PRIV_ALL, the uid and zoneid don't matter.
6702                  */
6703                 priv = DTRACE_PRIV_ALL;
6704         } else {
6705                 *uidp = crgetuid(cr);
6706                 *zoneidp = crgetzoneid(cr);
6707 
6708                 priv = 0;
6709                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE))
6710                         priv |= DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER;
6711                 else if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE))
6712                         priv |= DTRACE_PRIV_USER;
6713                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE))
6714                         priv |= DTRACE_PRIV_PROC;
6715                 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
6716                         priv |= DTRACE_PRIV_OWNER;
6717                 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
6718                         priv |= DTRACE_PRIV_ZONEOWNER;
6719         }
6720 
6721         *privp = priv;
6722 }
6723 
#ifdef DTRACE_ERRDEBUG
/*
 * Record an occurrence of the error message 'str'.  The message and the
 * current thread are remembered in dtrace_errlast and dtrace_errthread, and
 * a per-message count is kept in dtrace_errhash, an open-addressed table
 * probed linearly from the string's hash.  Entries are compared by pointer,
 * not by string contents.  If every slot is occupied by some other message,
 * the system panics.
 */
static void
dtrace_errdebug(const char *str)
{
        int hval = dtrace_hash_str((char *)str) % DTRACE_ERRHASHSZ;
        int probes;

        mutex_enter(&dtrace_errlock);
        dtrace_errlast = str;
        dtrace_errthread = curthread;

        for (probes = 0; probes < DTRACE_ERRHASHSZ; probes++) {
                if (dtrace_errhash[hval].dter_msg == str) {
                        /* Seen before: bump its count. */
                        dtrace_errhash[hval].dter_count++;
                        goto out;
                }

                if (dtrace_errhash[hval].dter_msg == NULL) {
                        /* Free slot: claim it for this message. */
                        dtrace_errhash[hval].dter_msg = str;
                        dtrace_errhash[hval].dter_count = 1;
                        goto out;
                }

                /* Slot belongs to another message; try the next one. */
                hval = (hval + 1) % DTRACE_ERRHASHSZ;
        }

        panic("dtrace: undersized error hash");
out:
        mutex_exit(&dtrace_errlock);
}
#endif
6756 
6757 /*
6758  * DTrace Matching Functions
6759  *
6760  * These functions are used to match groups of probes, given some elements of
6761  * a probe tuple, or some globbed expressions for elements of a probe tuple.
6762  */
6763 static int
6764 dtrace_match_priv(const dtrace_probe_t *prp, uint32_t priv, uid_t uid,
6765     zoneid_t zoneid)
6766 {
6767         if (priv != DTRACE_PRIV_ALL) {
6768                 uint32_t ppriv = prp->dtpr_provider->dtpv_priv.dtpp_flags;
6769                 uint32_t match = priv & ppriv;
6770 
6771                 /*
6772                  * No PRIV_DTRACE_* privileges...
6773                  */
6774                 if ((priv & (DTRACE_PRIV_PROC | DTRACE_PRIV_USER |
6775                     DTRACE_PRIV_KERNEL)) == 0)
6776                         return (0);
6777 
6778                 /*
6779                  * No matching bits, but there were bits to match...
6780                  */
6781                 if (match == 0 && ppriv != 0)
6782                         return (0);
6783 
6784                 /*
6785                  * Need to have permissions to the process, but don't...
6786                  */
6787                 if (((ppriv & ~match) & DTRACE_PRIV_OWNER) != 0 &&
6788                     uid != prp->dtpr_provider->dtpv_priv.dtpp_uid) {
6789                         return (0);
6790                 }
6791 
6792                 /*
6793                  * Need to be in the same zone unless we possess the
6794                  * privilege to examine all zones.
6795                  */
6796                 if (((ppriv & ~match) & DTRACE_PRIV_ZONEOWNER) != 0 &&
6797                     zoneid != prp->dtpr_provider->dtpv_priv.dtpp_zoneid) {
6798                         return (0);
6799                 }
6800         }
6801 
6802         return (1);
6803 }
6804 
6805 /*
6806  * dtrace_match_probe compares a dtrace_probe_t to a pre-compiled key, which
6807  * consists of input pattern strings and an ops-vector to evaluate them.
6808  * This function returns >0 for match, 0 for no match, and <0 for error.
6809  */
6810 static int
6811 dtrace_match_probe(const dtrace_probe_t *prp, const dtrace_probekey_t *pkp,
6812     uint32_t priv, uid_t uid, zoneid_t zoneid)
6813 {
6814         dtrace_provider_t *pvp = prp->dtpr_provider;
6815         int rv;
6816 
6817         if (pvp->dtpv_defunct)
6818                 return (0);
6819 
6820         if ((rv = pkp->dtpk_pmatch(pvp->dtpv_name, pkp->dtpk_prov, 0)) <= 0)
6821                 return (rv);
6822 
6823         if ((rv = pkp->dtpk_mmatch(prp->dtpr_mod, pkp->dtpk_mod, 0)) <= 0)
6824                 return (rv);
6825 
6826         if ((rv = pkp->dtpk_fmatch(prp->dtpr_func, pkp->dtpk_func, 0)) <= 0)
6827                 return (rv);
6828 
6829         if ((rv = pkp->dtpk_nmatch(prp->dtpr_name, pkp->dtpk_name, 0)) <= 0)
6830                 return (rv);
6831 
6832         if (dtrace_match_priv(prp, priv, uid, zoneid) == 0)
6833                 return (0);
6834 
6835         return (rv);
6836 }
6837 
6838 /*
6839  * dtrace_match_glob() is a safe kernel implementation of the gmatch(3GEN)
6840  * interface for matching a glob pattern 'p' to an input string 's'.  Unlike
6841  * libc's version, the kernel version only applies to 8-bit ASCII strings.
6842  * In addition, all of the recursion cases except for '*' matching have been
6843  * unwound.  For '*', we still implement recursive evaluation, but a depth
6844  * counter is maintained and matching is aborted if we recurse too deep.
6845  * The function returns 0 if no match, >0 if match, and <0 if recursion error.
6846  */
6847 static int
6848 dtrace_match_glob(const char *s, const char *p, int depth)
6849 {
6850         const char *olds;
6851         char s1, c;
6852         int gs;
6853 
6854         if (depth > DTRACE_PROBEKEY_MAXDEPTH)
6855                 return (-1);
6856 
6857         if (s == NULL)
6858                 s = ""; /* treat NULL as empty string */
6859 
6860 top:
6861         olds = s;
6862         s1 = *s++;
6863 
6864         if (p == NULL)
6865                 return (0);
6866 
6867         if ((c = *p++) == '\0')
6868                 return (s1 == '\0');
6869 
6870         switch (c) {
6871         case '[': {
6872                 int ok = 0, notflag = 0;
6873                 char lc = '\0';
6874 
6875                 if (s1 == '\0')
6876                         return (0);
6877 
6878                 if (*p == '!') {
6879                         notflag = 1;
6880                         p++;
6881                 }
6882 
6883                 if ((c = *p++) == '\0')
6884                         return (0);
6885 
6886                 do {
6887                         if (c == '-' && lc != '\0' && *p != ']') {
6888                                 if ((c = *p++) == '\0')
6889                                         return (0);
6890                                 if (c == '\\' && (c = *p++) == '\0')
6891                                         return (0);
6892 
6893                                 if (notflag) {
6894                                         if (s1 < lc || s1 > c)
6895                                                 ok++;
6896                                         else
6897                                                 return (0);
6898                                 } else if (lc <= s1 && s1 <= c)
6899                                         ok++;
6900 
6901                         } else if (c == '\\' && (c = *p++) == '\0')
6902                                 return (0);
6903 
6904                         lc = c; /* save left-hand 'c' for next iteration */
6905 
6906                         if (notflag) {
6907                                 if (s1 != c)
6908                                         ok++;
6909                                 else
6910                                         return (0);
6911                         } else if (s1 == c)
6912                                 ok++;
6913 
6914                         if ((c = *p++) == '\0')
6915                                 return (0);
6916 
6917                 } while (c != ']');
6918 
6919                 if (ok)
6920                         goto top;
6921 
6922                 return (0);
6923         }
6924 
6925         case '\\':
6926                 if ((c = *p++) == '\0')
6927                         return (0);
6928                 /*FALLTHRU*/
6929 
6930         default:
6931                 if (c != s1)
6932                         return (0);
6933                 /*FALLTHRU*/
6934 
6935         case '?':
6936                 if (s1 != '\0')
6937                         goto top;
6938                 return (0);
6939 
6940         case '*':
6941                 while (*p == '*')
6942                         p++; /* consecutive *'s are identical to a single one */
6943 
6944                 if (*p == '\0')
6945                         return (1);
6946 
6947                 for (s = olds; *s != '\0'; s++) {
6948                         if ((gs = dtrace_match_glob(s, p, depth + 1)) != 0)
6949                                 return (gs);
6950                 }
6951 
6952                 return (0);
6953         }
6954 }
6955 
/*
 * Exact-match comparator: returns 1 only if 's' is non-NULL and identical
 * to the pattern 'p'.  'depth' is unused.
 */
/*ARGSUSED*/
static int
dtrace_match_string(const char *s, const char *p, int depth)
{
        if (s == NULL)
                return (0);

        return (strcmp(s, p) == 0);
}
6962 
/*
 * Comparator for the empty pattern: every input matches, so none of the
 * arguments are examined.
 */
/*ARGSUSED*/
static int
dtrace_match_nul(const char *s, const char *p, int depth)
{
        return (1);
}
6969 
/*
 * Comparator that matches any non-NULL, non-empty input string.  The
 * pattern and depth arguments are unused.
 */
/*ARGSUSED*/
static int
dtrace_match_nonzero(const char *s, const char *p, int depth)
{
        if (s == NULL)
                return (0);

        return (*s != '\0');
}
6976 
6977 static int
6978 dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid,
6979     zoneid_t zoneid, int (*matched)(dtrace_probe_t *, void *), void *arg)
6980 {
6981         dtrace_probe_t template, *probe;
6982         dtrace_hash_t *hash = NULL;
6983         int len, rc, best = INT_MAX, nmatched = 0;
6984         dtrace_id_t i;
6985 
6986         ASSERT(MUTEX_HELD(&dtrace_lock));
6987 
6988         /*
6989          * If the probe ID is specified in the key, just lookup by ID and
6990          * invoke the match callback once if a matching probe is found.
6991          */
6992         if (pkp->dtpk_id != DTRACE_IDNONE) {
6993                 if ((probe = dtrace_probe_lookup_id(pkp->dtpk_id)) != NULL &&
6994                     dtrace_match_probe(probe, pkp, priv, uid, zoneid) > 0) {
6995                         if ((*matched)(probe, arg) == DTRACE_MATCH_FAIL)
6996                                 return (DTRACE_MATCH_FAIL);
6997                         nmatched++;
6998                 }
6999                 return (nmatched);
7000         }
7001 
7002         template.dtpr_mod = (char *)pkp->dtpk_mod;
7003         template.dtpr_func = (char *)pkp->dtpk_func;
7004         template.dtpr_name = (char *)pkp->dtpk_name;
7005 
7006         /*
7007          * We want to find the most distinct of the module name, function
7008          * name, and name.  So for each one that is not a glob pattern or
7009          * empty string, we perform a lookup in the corresponding hash and
7010          * use the hash table with the fewest collisions to do our search.
7011          */
7012         if (pkp->dtpk_mmatch == &dtrace_match_string &&
7013             (len = dtrace_hash_collisions(dtrace_bymod, &template)) < best) {
7014                 best = len;
7015                 hash = dtrace_bymod;
7016         }
7017 
7018         if (pkp->dtpk_fmatch == &dtrace_match_string &&
7019             (len = dtrace_hash_collisions(dtrace_byfunc, &template)) < best) {
7020                 best = len;
7021                 hash = dtrace_byfunc;
7022         }
7023 
7024         if (pkp->dtpk_nmatch == &dtrace_match_string &&
7025             (len = dtrace_hash_collisions(dtrace_byname, &template)) < best) {
7026                 best = len;
7027                 hash = dtrace_byname;
7028         }
7029 
7030         /*
7031          * If we did not select a hash table, iterate over every probe and
7032          * invoke our callback for each one that matches our input probe key.
7033          */
7034         if (hash == NULL) {
7035                 for (i = 0; i < dtrace_nprobes; i++) {
7036                         if ((probe = dtrace_probes[i]) == NULL ||
7037                             dtrace_match_probe(probe, pkp, priv, uid,
7038                             zoneid) <= 0)
7039                                 continue;
7040 
7041                         nmatched++;
7042 
7043                         if ((rc = (*matched)(probe, arg)) !=
7044                             DTRACE_MATCH_NEXT) {
7045                                 if (rc == DTRACE_MATCH_FAIL)
7046                                         return (DTRACE_MATCH_FAIL);
7047                                 break;
7048                         }
7049                 }
7050 
7051                 return (nmatched);
7052         }
7053 
7054         /*
7055          * If we selected a hash table, iterate over each probe of the same key
7056          * name and invoke the callback for every probe that matches the other
7057          * attributes of our input probe key.
7058          */
7059         for (probe = dtrace_hash_lookup(hash, &template); probe != NULL;
7060             probe = *(DTRACE_HASHNEXT(hash, probe))) {
7061 
7062                 if (dtrace_match_probe(probe, pkp, priv, uid, zoneid) <= 0)
7063                         continue;
7064 
7065                 nmatched++;
7066 
7067                 if ((rc = (*matched)(probe, arg)) != DTRACE_MATCH_NEXT) {
7068                         if (rc == DTRACE_MATCH_FAIL)
7069                                 return (DTRACE_MATCH_FAIL);
7070                         break;
7071                 }
7072         }
7073 
7074         return (nmatched);
7075 }
7076 
7077 /*
7078  * Return the function pointer dtrace_probecmp() should use to compare the
7079  * specified pattern with a string.  For NULL or empty patterns, we select
7080  * dtrace_match_nul().  For glob pattern strings, we use dtrace_match_glob().
7081  * For non-empty non-glob strings, we use dtrace_match_string().
7082  */
7083 static dtrace_probekey_f *
7084 dtrace_probekey_func(const char *p)
7085 {
7086         char c;
7087 
7088         if (p == NULL || *p == '\0')
7089                 return (&dtrace_match_nul);
7090 
7091         while ((c = *p++) != '\0') {
7092                 if (c == '[' || c == '?' || c == '*' || c == '\\')
7093                         return (&dtrace_match_glob);
7094         }
7095 
7096         return (&dtrace_match_string);
7097 }
7098 
7099 /*
7100  * Build a probe comparison key for use with dtrace_match_probe() from the
7101  * given probe description.  By convention, a null key only matches anchored
7102  * probes: if each field is the empty string, reset dtpk_fmatch to
7103  * dtrace_match_nonzero().
7104  */
7105 static void
7106 dtrace_probekey(const dtrace_probedesc_t *pdp, dtrace_probekey_t *pkp)
7107 {
7108         pkp->dtpk_prov = pdp->dtpd_provider;
7109         pkp->dtpk_pmatch = dtrace_probekey_func(pdp->dtpd_provider);
7110 
7111         pkp->dtpk_mod = pdp->dtpd_mod;
7112         pkp->dtpk_mmatch = dtrace_probekey_func(pdp->dtpd_mod);
7113 
7114         pkp->dtpk_func = pdp->dtpd_func;
7115         pkp->dtpk_fmatch = dtrace_probekey_func(pdp->dtpd_func);
7116 
7117         pkp->dtpk_name = pdp->dtpd_name;
7118         pkp->dtpk_nmatch = dtrace_probekey_func(pdp->dtpd_name);
7119 
7120         pkp->dtpk_id = pdp->dtpd_id;
7121 
7122         if (pkp->dtpk_id == DTRACE_IDNONE &&
7123             pkp->dtpk_pmatch == &dtrace_match_nul &&
7124             pkp->dtpk_mmatch == &dtrace_match_nul &&
7125             pkp->dtpk_fmatch == &dtrace_match_nul &&
7126             pkp->dtpk_nmatch == &dtrace_match_nul)
7127                 pkp->dtpk_fmatch = &dtrace_match_nonzero;
7128 }
7129 
7130 /*
7131  * DTrace Provider-to-Framework API Functions
7132  *
7133  * These functions implement much of the Provider-to-Framework API, as
7134  * described in <sys/dtrace.h>.  The parts of the API not in this section are
7135  * the functions in the API for probe management (found below), and
7136  * dtrace_probe() itself (found above).
7137  */
7138 
7139 /*
7140  * Register the calling provider with the DTrace framework.  This should
7141  * generally be called by DTrace providers in their attach(9E) entry point.
7142  */
7143 int
7144 dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv,
7145     cred_t *cr, const dtrace_pops_t *pops, void *arg, dtrace_provider_id_t *idp)
7146 {
7147         dtrace_provider_t *provider;
7148 
7149         if (name == NULL || pap == NULL || pops == NULL || idp == NULL) {
7150                 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7151                     "arguments", name ? name : "<NULL>");
7152                 return (EINVAL);
7153         }
7154 
7155         if (name[0] == '\0' || dtrace_badname(name)) {
7156                 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7157                     "provider name", name);
7158                 return (EINVAL);
7159         }
7160 
7161         if ((pops->dtps_provide == NULL && pops->dtps_provide_module == NULL) ||
7162             pops->dtps_enable == NULL || pops->dtps_disable == NULL ||
7163             pops->dtps_destroy == NULL ||
7164             ((pops->dtps_resume == NULL) != (pops->dtps_suspend == NULL))) {
7165                 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7166                     "provider ops", name);
7167                 return (EINVAL);
7168         }
7169 
7170         if (dtrace_badattr(&pap->dtpa_provider) ||
7171             dtrace_badattr(&pap->dtpa_mod) ||
7172             dtrace_badattr(&pap->dtpa_func) ||
7173             dtrace_badattr(&pap->dtpa_name) ||
7174             dtrace_badattr(&pap->dtpa_args)) {
7175                 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7176                     "provider attributes", name);
7177                 return (EINVAL);
7178         }
7179 
7180         if (priv & ~DTRACE_PRIV_ALL) {
7181                 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7182                     "privilege attributes", name);
7183                 return (EINVAL);
7184         }
7185 
7186         if ((priv & DTRACE_PRIV_KERNEL) &&
7187             (priv & (DTRACE_PRIV_USER | DTRACE_PRIV_OWNER)) &&
7188             pops->dtps_mode == NULL) {
7189                 cmn_err(CE_WARN, "failed to register provider '%s': need "
7190                     "dtps_mode() op for given privilege attributes", name);
7191                 return (EINVAL);
7192         }
7193 
7194         provider = kmem_zalloc(sizeof (dtrace_provider_t), KM_SLEEP);
7195         provider->dtpv_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
7196         (void) strcpy(provider->dtpv_name, name);
7197 
7198         provider->dtpv_attr = *pap;
7199         provider->dtpv_priv.dtpp_flags = priv;
7200         if (cr != NULL) {
7201                 provider->dtpv_priv.dtpp_uid = crgetuid(cr);
7202                 provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr);
7203         }
7204         provider->dtpv_pops = *pops;
7205 
7206         if (pops->dtps_provide == NULL) {
7207                 ASSERT(pops->dtps_provide_module != NULL);
7208                 provider->dtpv_pops.dtps_provide =
7209                     (void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop;
7210         }
7211 
7212         if (pops->dtps_provide_module == NULL) {
7213                 ASSERT(pops->dtps_provide != NULL);
7214                 provider->dtpv_pops.dtps_provide_module =
7215                     (void (*)(void *, struct modctl *))dtrace_nullop;
7216         }
7217 
7218         if (pops->dtps_suspend == NULL) {
7219                 ASSERT(pops->dtps_resume == NULL);
7220                 provider->dtpv_pops.dtps_suspend =
7221                     (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
7222                 provider->dtpv_pops.dtps_resume =
7223                     (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
7224         }
7225 
7226         provider->dtpv_arg = arg;
7227         *idp = (dtrace_provider_id_t)provider;
7228 
7229         if (pops == &dtrace_provider_ops) {
7230                 ASSERT(MUTEX_HELD(&dtrace_provider_lock));
7231                 ASSERT(MUTEX_HELD(&dtrace_lock));
7232                 ASSERT(dtrace_anon.dta_enabling == NULL);
7233 
7234                 /*
7235                  * We make sure that the DTrace provider is at the head of
7236                  * the provider chain.
7237                  */
7238                 provider->dtpv_next = dtrace_provider;
7239                 dtrace_provider = provider;
7240                 return (0);
7241         }
7242 
7243         mutex_enter(&dtrace_provider_lock);
7244         mutex_enter(&dtrace_lock);
7245 
7246         /*
7247          * If there is at least one provider registered, we'll add this
7248          * provider after the first provider.
7249          */
7250         if (dtrace_provider != NULL) {
7251                 provider->dtpv_next = dtrace_provider->dtpv_next;
7252                 dtrace_provider->dtpv_next = provider;
7253         } else {
7254                 dtrace_provider = provider;
7255         }
7256 
7257         if (dtrace_retained != NULL) {
7258                 dtrace_enabling_provide(provider);
7259 
7260                 /*
7261                  * Now we need to call dtrace_enabling_matchall() -- which
7262                  * will acquire cpu_lock and dtrace_lock.  We therefore need
7263                  * to drop all of our locks before calling into it...
7264                  */
7265                 mutex_exit(&dtrace_lock);
7266                 mutex_exit(&dtrace_provider_lock);
7267                 dtrace_enabling_matchall();
7268 
7269                 return (0);
7270         }
7271 
7272         mutex_exit(&dtrace_lock);
7273         mutex_exit(&dtrace_provider_lock);
7274 
7275         return (0);
7276 }
7277 
7278 /*
7279  * Unregister the specified provider from the DTrace framework.  This should
7280  * generally be called by DTrace providers in their detach(9E) entry point.
7281  */
7282 int
7283 dtrace_unregister(dtrace_provider_id_t id)
7284 {
7285         dtrace_provider_t *old = (dtrace_provider_t *)id;
7286         dtrace_provider_t *prev = NULL;
7287         int i, self = 0, noreap = 0;
7288         dtrace_probe_t *probe, *first = NULL;
7289 
7290         if (old->dtpv_pops.dtps_enable ==
7291             (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop) {
7292                 /*
7293                  * If DTrace itself is the provider, we're called with locks
7294                  * already held.
7295                  */
7296                 ASSERT(old == dtrace_provider);
7297                 ASSERT(dtrace_devi != NULL);
7298                 ASSERT(MUTEX_HELD(&dtrace_provider_lock));
7299                 ASSERT(MUTEX_HELD(&dtrace_lock));
7300                 self = 1;
7301 
7302                 if (dtrace_provider->dtpv_next != NULL) {
7303                         /*
7304                          * There's another provider here; return failure.
7305                          */
7306                         return (EBUSY);
7307                 }
7308         } else {
7309                 mutex_enter(&dtrace_provider_lock);
7310                 mutex_enter(&mod_lock);
7311                 mutex_enter(&dtrace_lock);
7312         }
7313 
7314         /*
7315          * If anyone has /dev/dtrace open, or if there are anonymous enabled
7316          * probes, we refuse to let providers slither away, unless this
7317          * provider has already been explicitly invalidated.
7318          */
7319         if (!old->dtpv_defunct &&
7320             (dtrace_opens || (dtrace_anon.dta_state != NULL &&
7321             dtrace_anon.dta_state->dts_necbs > 0))) {
7322                 if (!self) {
7323                         mutex_exit(&dtrace_lock);
7324                         mutex_exit(&mod_lock);
7325                         mutex_exit(&dtrace_provider_lock);
7326                 }
7327                 return (EBUSY);
7328         }
7329 
7330         /*
7331          * Attempt to destroy the probes associated with this provider.
7332          */
7333         for (i = 0; i < dtrace_nprobes; i++) {
7334                 if ((probe = dtrace_probes[i]) == NULL)
7335                         continue;
7336 
7337                 if (probe->dtpr_provider != old)
7338                         continue;
7339 
7340                 if (probe->dtpr_ecb == NULL)
7341                         continue;
7342 
7343                 /*
7344                  * If we are trying to unregister a defunct provider, and the
7345                  * provider was made defunct within the interval dictated by
7346                  * dtrace_unregister_defunct_reap, we'll (asynchronously)
7347                  * attempt to reap our enablings.  To denote that the provider
7348                  * should reattempt to unregister itself at some point in the
7349                  * future, we will return a differentiable error code (EAGAIN
7350                  * instead of EBUSY) in this case.
7351                  */
7352                 if (dtrace_gethrtime() - old->dtpv_defunct >
7353                     dtrace_unregister_defunct_reap)
7354                         noreap = 1;
7355 
7356                 if (!self) {
7357                         mutex_exit(&dtrace_lock);
7358                         mutex_exit(&mod_lock);
7359                         mutex_exit(&dtrace_provider_lock);
7360                 }
7361 
7362                 if (noreap)
7363                         return (EBUSY);
7364 
7365                 (void) taskq_dispatch(dtrace_taskq,
7366                     (task_func_t *)dtrace_enabling_reap, NULL, TQ_SLEEP);
7367 
7368                 return (EAGAIN);
7369         }
7370 
7371         /*
7372          * All of the probes for this provider are disabled; we can safely
7373          * remove all of them from their hash chains and from the probe array.
7374          */
7375         for (i = 0; i < dtrace_nprobes; i++) {
7376                 if ((probe = dtrace_probes[i]) == NULL)
7377                         continue;
7378 
7379                 if (probe->dtpr_provider != old)
7380                         continue;
7381 
7382                 dtrace_probes[i] = NULL;
7383 
7384                 dtrace_hash_remove(dtrace_bymod, probe);
7385                 dtrace_hash_remove(dtrace_byfunc, probe);
7386                 dtrace_hash_remove(dtrace_byname, probe);
7387 
7388                 if (first == NULL) {
7389                         first = probe;
7390                         probe->dtpr_nextmod = NULL;
7391                 } else {
7392                         probe->dtpr_nextmod = first;
7393                         first = probe;
7394                 }
7395         }
7396 
7397         /*
7398          * The provider's probes have been removed from the hash chains and
7399          * from the probe array.  Now issue a dtrace_sync() to be sure that
7400          * everyone has cleared out from any probe array processing.
7401          */
7402         dtrace_sync();
7403 
7404         for (probe = first; probe != NULL; probe = first) {
7405                 first = probe->dtpr_nextmod;
7406 
7407                 old->dtpv_pops.dtps_destroy(old->dtpv_arg, probe->dtpr_id,
7408                     probe->dtpr_arg);
7409                 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
7410                 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
7411                 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
7412                 vmem_free(dtrace_arena, (void *)(uintptr_t)(probe->dtpr_id), 1);
7413                 kmem_free(probe, sizeof (dtrace_probe_t));
7414         }
7415 
7416         if ((prev = dtrace_provider) == old) {
7417                 ASSERT(self || dtrace_devi == NULL);
7418                 ASSERT(old->dtpv_next == NULL || dtrace_devi == NULL);
7419                 dtrace_provider = old->dtpv_next;
7420         } else {
7421                 while (prev != NULL && prev->dtpv_next != old)
7422                         prev = prev->dtpv_next;
7423 
7424                 if (prev == NULL) {
7425                         panic("attempt to unregister non-existent "
7426                             "dtrace provider %p\n", (void *)id);
7427                 }
7428 
7429                 prev->dtpv_next = old->dtpv_next;
7430         }
7431 
7432         if (!self) {
7433                 mutex_exit(&dtrace_lock);
7434                 mutex_exit(&mod_lock);
7435                 mutex_exit(&dtrace_provider_lock);
7436         }
7437 
7438         kmem_free(old->dtpv_name, strlen(old->dtpv_name) + 1);
7439         kmem_free(old, sizeof (dtrace_provider_t));
7440 
7441         return (0);
7442 }
7443 
7444 /*
7445  * Invalidate the specified provider.  All subsequent probe lookups for the
7446  * specified provider will fail, but its probes will not be removed.
7447  */
7448 void
7449 dtrace_invalidate(dtrace_provider_id_t id)
7450 {
7451         dtrace_provider_t *pvp = (dtrace_provider_t *)id;
7452 
7453         ASSERT(pvp->dtpv_pops.dtps_enable !=
7454             (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop);
7455 
7456         mutex_enter(&dtrace_provider_lock);
7457         mutex_enter(&dtrace_lock);
7458 
7459         pvp->dtpv_defunct = dtrace_gethrtime();
7460 
7461         mutex_exit(&dtrace_lock);
7462         mutex_exit(&dtrace_provider_lock);
7463 }
7464 
7465 /*
7466  * Indicate whether or not DTrace has attached.
7467  */
7468 int
7469 dtrace_attached(void)
7470 {
7471         /*
7472          * dtrace_provider will be non-NULL iff the DTrace driver has
7473          * attached.  (It's non-NULL because DTrace is always itself a
7474          * provider.)
7475          */
7476         return (dtrace_provider != NULL);
7477 }
7478 
7479 /*
7480  * Remove all the unenabled probes for the given provider.  This function is
7481  * not unlike dtrace_unregister(), except that it doesn't remove the provider
7482  * -- just as many of its associated probes as it can.
7483  */
7484 int
7485 dtrace_condense(dtrace_provider_id_t id)
7486 {
7487         dtrace_provider_t *prov = (dtrace_provider_t *)id;
7488         int i;
7489         dtrace_probe_t *probe;
7490 
7491         /*
7492          * Make sure this isn't the dtrace provider itself.
7493          */
7494         ASSERT(prov->dtpv_pops.dtps_enable !=
7495             (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop);
7496 
7497         mutex_enter(&dtrace_provider_lock);
7498         mutex_enter(&dtrace_lock);
7499 
7500         /*
7501          * Attempt to destroy the probes associated with this provider.
7502          */
7503         for (i = 0; i < dtrace_nprobes; i++) {
7504                 if ((probe = dtrace_probes[i]) == NULL)
7505                         continue;
7506 
7507                 if (probe->dtpr_provider != prov)
7508                         continue;
7509 
7510                 if (probe->dtpr_ecb != NULL)
7511                         continue;
7512 
7513                 dtrace_probes[i] = NULL;
7514 
7515                 dtrace_hash_remove(dtrace_bymod, probe);
7516                 dtrace_hash_remove(dtrace_byfunc, probe);
7517                 dtrace_hash_remove(dtrace_byname, probe);
7518 
7519                 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, i + 1,
7520                     probe->dtpr_arg);
7521                 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
7522                 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
7523                 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
7524                 kmem_free(probe, sizeof (dtrace_probe_t));
7525                 vmem_free(dtrace_arena, (void *)((uintptr_t)i + 1), 1);
7526         }
7527 
7528         mutex_exit(&dtrace_lock);
7529         mutex_exit(&dtrace_provider_lock);
7530 
7531         return (0);
7532 }
7533 
7534 /*
7535  * DTrace Probe Management Functions
7536  *
7537  * The functions in this section perform the DTrace probe management,
7538  * including functions to create probes, look-up probes, and call into the
7539  * providers to request that probes be provided.  Some of these functions are
7540  * in the Provider-to-Framework API; these functions can be identified by the
7541  * fact that they are not declared "static".
7542  */
7543 
7544 /*
7545  * Create a probe with the specified module name, function name, and name.
7546  */
7547 dtrace_id_t
7548 dtrace_probe_create(dtrace_provider_id_t prov, const char *mod,
7549     const char *func, const char *name, int aframes, void *arg)
7550 {
7551         dtrace_probe_t *probe, **probes;
7552         dtrace_provider_t *provider = (dtrace_provider_t *)prov;
7553         dtrace_id_t id;
7554 
7555         if (provider == dtrace_provider) {
7556                 ASSERT(MUTEX_HELD(&dtrace_lock));
7557         } else {
7558                 mutex_enter(&dtrace_lock);
7559         }
7560 
7561         id = (dtrace_id_t)(uintptr_t)vmem_alloc(dtrace_arena, 1,
7562             VM_BESTFIT | VM_SLEEP);
7563         probe = kmem_zalloc(sizeof (dtrace_probe_t), KM_SLEEP);
7564 
7565         probe->dtpr_id = id;
7566         probe->dtpr_gen = dtrace_probegen++;
7567         probe->dtpr_mod = dtrace_strdup(mod);
7568         probe->dtpr_func = dtrace_strdup(func);
7569         probe->dtpr_name = dtrace_strdup(name);
7570         probe->dtpr_arg = arg;
7571         probe->dtpr_aframes = aframes;
7572         probe->dtpr_provider = provider;
7573 
7574         dtrace_hash_add(dtrace_bymod, probe);
7575         dtrace_hash_add(dtrace_byfunc, probe);
7576         dtrace_hash_add(dtrace_byname, probe);
7577 
7578         if (id - 1 >= dtrace_nprobes) {
7579                 size_t osize = dtrace_nprobes * sizeof (dtrace_probe_t *);
7580                 size_t nsize = osize << 1;
7581 
7582                 if (nsize == 0) {
7583                         ASSERT(osize == 0);
7584                         ASSERT(dtrace_probes == NULL);
7585                         nsize = sizeof (dtrace_probe_t *);
7586                 }
7587 
7588                 probes = kmem_zalloc(nsize, KM_SLEEP);
7589 
7590                 if (dtrace_probes == NULL) {
7591                         ASSERT(osize == 0);
7592                         dtrace_probes = probes;
7593                         dtrace_nprobes = 1;
7594                 } else {
7595                         dtrace_probe_t **oprobes = dtrace_probes;
7596 
7597                         bcopy(oprobes, probes, osize);
7598                         dtrace_membar_producer();
7599                         dtrace_probes = probes;
7600 
7601                         dtrace_sync();
7602 
7603                         /*
7604                          * All CPUs are now seeing the new probes array; we can
7605                          * safely free the old array.
7606                          */
7607                         kmem_free(oprobes, osize);
7608                         dtrace_nprobes <<= 1;
7609                 }
7610 
7611                 ASSERT(id - 1 < dtrace_nprobes);
7612         }
7613 
7614         ASSERT(dtrace_probes[id - 1] == NULL);
7615         dtrace_probes[id - 1] = probe;
7616 
7617         if (provider != dtrace_provider)
7618                 mutex_exit(&dtrace_lock);
7619 
7620         return (id);
7621 }
7622 
7623 static dtrace_probe_t *
7624 dtrace_probe_lookup_id(dtrace_id_t id)
7625 {
7626         ASSERT(MUTEX_HELD(&dtrace_lock));
7627 
7628         if (id == 0 || id > dtrace_nprobes)
7629                 return (NULL);
7630 
7631         return (dtrace_probes[id - 1]);
7632 }
7633 
7634 static int
7635 dtrace_probe_lookup_match(dtrace_probe_t *probe, void *arg)
7636 {
7637         *((dtrace_id_t *)arg) = probe->dtpr_id;
7638 
7639         return (DTRACE_MATCH_DONE);
7640 }
7641 
7642 /*
7643  * Look up a probe based on provider and one or more of module name, function
7644  * name and probe name.
7645  */
7646 dtrace_id_t
7647 dtrace_probe_lookup(dtrace_provider_id_t prid, const char *mod,
7648     const char *func, const char *name)
7649 {
7650         dtrace_probekey_t pkey;
7651         dtrace_id_t id;
7652         int match;
7653 
7654         pkey.dtpk_prov = ((dtrace_provider_t *)prid)->dtpv_name;
7655         pkey.dtpk_pmatch = &dtrace_match_string;
7656         pkey.dtpk_mod = mod;
7657         pkey.dtpk_mmatch = mod ? &dtrace_match_string : &dtrace_match_nul;
7658         pkey.dtpk_func = func;
7659         pkey.dtpk_fmatch = func ? &dtrace_match_string : &dtrace_match_nul;
7660         pkey.dtpk_name = name;
7661         pkey.dtpk_nmatch = name ? &dtrace_match_string : &dtrace_match_nul;
7662         pkey.dtpk_id = DTRACE_IDNONE;
7663 
7664         mutex_enter(&dtrace_lock);
7665         match = dtrace_match(&pkey, DTRACE_PRIV_ALL, 0, 0,
7666             dtrace_probe_lookup_match, &id);
7667         mutex_exit(&dtrace_lock);
7668 
7669         ASSERT(match == 1 || match == 0);
7670         return (match ? id : 0);
7671 }
7672 
7673 /*
7674  * Returns the probe argument associated with the specified probe.
7675  */
7676 void *
7677 dtrace_probe_arg(dtrace_provider_id_t id, dtrace_id_t pid)
7678 {
7679         dtrace_probe_t *probe;
7680         void *rval = NULL;
7681 
7682         mutex_enter(&dtrace_lock);
7683 
7684         if ((probe = dtrace_probe_lookup_id(pid)) != NULL &&
7685             probe->dtpr_provider == (dtrace_provider_t *)id)
7686                 rval = probe->dtpr_arg;
7687 
7688         mutex_exit(&dtrace_lock);
7689 
7690         return (rval);
7691 }
7692 
7693 /*
7694  * Copy a probe into a probe description.
7695  */
7696 static void
7697 dtrace_probe_description(const dtrace_probe_t *prp, dtrace_probedesc_t *pdp)
7698 {
7699         bzero(pdp, sizeof (dtrace_probedesc_t));
7700         pdp->dtpd_id = prp->dtpr_id;
7701 
7702         (void) strncpy(pdp->dtpd_provider,
7703             prp->dtpr_provider->dtpv_name, DTRACE_PROVNAMELEN - 1);
7704 
7705         (void) strncpy(pdp->dtpd_mod, prp->dtpr_mod, DTRACE_MODNAMELEN - 1);
7706         (void) strncpy(pdp->dtpd_func, prp->dtpr_func, DTRACE_FUNCNAMELEN - 1);
7707         (void) strncpy(pdp->dtpd_name, prp->dtpr_name, DTRACE_NAMELEN - 1);
7708 }
7709 
7710 /*
7711  * Called to indicate that a probe -- or probes -- should be provided by a
7712  * specfied provider.  If the specified description is NULL, the provider will
7713  * be told to provide all of its probes.  (This is done whenever a new
7714  * consumer comes along, or whenever a retained enabling is to be matched.) If
7715  * the specified description is non-NULL, the provider is given the
7716  * opportunity to dynamically provide the specified probe, allowing providers
7717  * to support the creation of probes on-the-fly.  (So-called _autocreated_
7718  * probes.)  If the provider is NULL, the operations will be applied to all
7719  * providers; if the provider is non-NULL the operations will only be applied
7720  * to the specified provider.  The dtrace_provider_lock must be held, and the
7721  * dtrace_lock must _not_ be held -- the provider's dtps_provide() operation
7722  * will need to grab the dtrace_lock when it reenters the framework through
7723  * dtrace_probe_lookup(), dtrace_probe_create(), etc.
7724  */
7725 static void
7726 dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv)
7727 {
7728         struct modctl *ctl;
7729         int all = 0;
7730 
7731         ASSERT(MUTEX_HELD(&dtrace_provider_lock));
7732 
7733         if (prv == NULL) {
7734                 all = 1;
7735                 prv = dtrace_provider;
7736         }
7737 
7738         do {
7739                 /*
7740                  * First, call the blanket provide operation.
7741                  */
7742                 prv->dtpv_pops.dtps_provide(prv->dtpv_arg, desc);
7743 
7744                 /*
7745                  * Now call the per-module provide operation.  We will grab
7746                  * mod_lock to prevent the list from being modified.  Note
7747                  * that this also prevents the mod_busy bits from changing.
7748                  * (mod_busy can only be changed with mod_lock held.)
7749                  */
7750                 mutex_enter(&mod_lock);
7751 
7752                 ctl = &modules;
7753                 do {
7754                         if (ctl->mod_busy || ctl->mod_mp == NULL)
7755                                 continue;
7756 
7757                         prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
7758 
7759                 } while ((ctl = ctl->mod_next) != &modules);
7760 
7761                 mutex_exit(&mod_lock);
7762         } while (all && (prv = prv->dtpv_next) != NULL);
7763 }
7764 
7765 /*
7766  * Iterate over each probe, and call the Framework-to-Provider API function
7767  * denoted by offs.
7768  */
7769 static void
7770 dtrace_probe_foreach(uintptr_t offs)
7771 {
7772         dtrace_provider_t *prov;
7773         void (*func)(void *, dtrace_id_t, void *);
7774         dtrace_probe_t *probe;
7775         dtrace_icookie_t cookie;
7776         int i;
7777 
7778         /*
7779          * We disable interrupts to walk through the probe array.  This is
7780          * safe -- the dtrace_sync() in dtrace_unregister() assures that we
7781          * won't see stale data.
7782          */
7783         cookie = dtrace_interrupt_disable();
7784 
7785         for (i = 0; i < dtrace_nprobes; i++) {
7786                 if ((probe = dtrace_probes[i]) == NULL)
7787                         continue;
7788 
7789                 if (probe->dtpr_ecb == NULL) {
7790                         /*
7791                          * This probe isn't enabled -- don't call the function.
7792                          */
7793                         continue;
7794                 }
7795 
7796                 prov = probe->dtpr_provider;
7797                 func = *((void(**)(void *, dtrace_id_t, void *))
7798                     ((uintptr_t)&prov->dtpv_pops + offs));
7799 
7800                 func(prov->dtpv_arg, i + 1, probe->dtpr_arg);
7801         }
7802 
7803         dtrace_interrupt_enable(cookie);
7804 }
7805 
7806 static int
7807 dtrace_probe_enable(const dtrace_probedesc_t *desc, dtrace_enabling_t *enab)
7808 {
7809         dtrace_probekey_t pkey;
7810         uint32_t priv;
7811         uid_t uid;
7812         zoneid_t zoneid;
7813 
7814         ASSERT(MUTEX_HELD(&dtrace_lock));
7815         dtrace_ecb_create_cache = NULL;
7816 
7817         if (desc == NULL) {
7818                 /*
7819                  * If we're passed a NULL description, we're being asked to
7820                  * create an ECB with a NULL probe.
7821                  */
7822                 (void) dtrace_ecb_create_enable(NULL, enab);
7823                 return (0);
7824         }
7825 
7826         dtrace_probekey(desc, &pkey);
7827         dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred,
7828             &priv, &uid, &zoneid);
7829 
7830         return (dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable,
7831             enab));
7832 }
7833 
7834 /*
7835  * DTrace Helper Provider Functions
7836  */
7837 static void
7838 dtrace_dofattr2attr(dtrace_attribute_t *attr, const dof_attr_t dofattr)
7839 {
7840         attr->dtat_name = DOF_ATTR_NAME(dofattr);
7841         attr->dtat_data = DOF_ATTR_DATA(dofattr);
7842         attr->dtat_class = DOF_ATTR_CLASS(dofattr);
7843 }
7844 
7845 static void
7846 dtrace_dofprov2hprov(dtrace_helper_provdesc_t *hprov,
7847     const dof_provider_t *dofprov, char *strtab)
7848 {
7849         hprov->dthpv_provname = strtab + dofprov->dofpv_name;
7850         dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_provider,
7851             dofprov->dofpv_provattr);
7852         dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_mod,
7853             dofprov->dofpv_modattr);
7854         dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_func,
7855             dofprov->dofpv_funcattr);
7856         dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_name,
7857             dofprov->dofpv_nameattr);
7858         dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_args,
7859             dofprov->dofpv_argsattr);
7860 }
7861 
7862 static void
7863 dtrace_helper_provide_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
7864 {
7865         uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
7866         dof_hdr_t *dof = (dof_hdr_t *)daddr;
7867         dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
7868         dof_provider_t *provider;
7869         dof_probe_t *probe;
7870         uint32_t *off, *enoff;
7871         uint8_t *arg;
7872         char *strtab;
7873         uint_t i, nprobes;
7874         dtrace_helper_provdesc_t dhpv;
7875         dtrace_helper_probedesc_t dhpb;
7876         dtrace_meta_t *meta = dtrace_meta_pid;
7877         dtrace_mops_t *mops = &meta->dtm_mops;
7878         void *parg;
7879 
7880         provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
7881         str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
7882             provider->dofpv_strtab * dof->dofh_secsize);
7883         prb_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
7884             provider->dofpv_probes * dof->dofh_secsize);
7885         arg_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
7886             provider->dofpv_prargs * dof->dofh_secsize);
7887         off_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
7888             provider->dofpv_proffs * dof->dofh_secsize);
7889 
7890         strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
7891         off = (uint32_t *)(uintptr_t)(daddr + off_sec->dofs_offset);
7892         arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
7893         enoff = NULL;
7894 
7895         /*
7896          * See dtrace_helper_provider_validate().
7897          */
7898         if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
7899             provider->dofpv_prenoffs != DOF_SECT_NONE) {
7900                 enoff_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
7901                     provider->dofpv_prenoffs * dof->dofh_secsize);
7902                 enoff = (uint32_t *)(uintptr_t)(daddr + enoff_sec->dofs_offset);
7903         }
7904 
7905         nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
7906 
7907         /*
7908          * Create the provider.
7909          */
7910         dtrace_dofprov2hprov(&dhpv, provider, strtab);
7911 
7912         if ((parg = mops->dtms_provide_pid(meta->dtm_arg, &dhpv, pid)) == NULL)
7913                 return;
7914 
7915         meta->dtm_count++;
7916 
7917         /*
7918          * Create the probes.
7919          */
7920         for (i = 0; i < nprobes; i++) {
7921                 probe = (dof_probe_t *)(uintptr_t)(daddr +
7922                     prb_sec->dofs_offset + i * prb_sec->dofs_entsize);
7923 
7924                 dhpb.dthpb_mod = dhp->dofhp_mod;
7925                 dhpb.dthpb_func = strtab + probe->dofpr_func;
7926                 dhpb.dthpb_name = strtab + probe->dofpr_name;
7927                 dhpb.dthpb_base = probe->dofpr_addr;
7928                 dhpb.dthpb_offs = off + probe->dofpr_offidx;
7929                 dhpb.dthpb_noffs = probe->dofpr_noffs;
7930                 if (enoff != NULL) {
7931                         dhpb.dthpb_enoffs = enoff + probe->dofpr_enoffidx;
7932                         dhpb.dthpb_nenoffs = probe->dofpr_nenoffs;
7933                 } else {
7934                         dhpb.dthpb_enoffs = NULL;
7935                         dhpb.dthpb_nenoffs = 0;
7936                 }
7937                 dhpb.dthpb_args = arg + probe->dofpr_argidx;
7938                 dhpb.dthpb_nargc = probe->dofpr_nargc;
7939                 dhpb.dthpb_xargc = probe->dofpr_xargc;
7940                 dhpb.dthpb_ntypes = strtab + probe->dofpr_nargv;
7941                 dhpb.dthpb_xtypes = strtab + probe->dofpr_xargv;
7942 
7943                 mops->dtms_create_probe(meta->dtm_arg, parg, &dhpb);
7944         }
7945 }
7946 
7947 static void
7948 dtrace_helper_provide(dof_helper_t *dhp, pid_t pid)
7949 {
7950         uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
7951         dof_hdr_t *dof = (dof_hdr_t *)daddr;
7952         int i;
7953 
7954         ASSERT(MUTEX_HELD(&dtrace_meta_lock));
7955 
7956         for (i = 0; i < dof->dofh_secnum; i++) {
7957                 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
7958                     dof->dofh_secoff + i * dof->dofh_secsize);
7959 
7960                 if (sec->dofs_type != DOF_SECT_PROVIDER)
7961                         continue;
7962 
7963                 dtrace_helper_provide_one(dhp, sec, pid);
7964         }
7965 
7966         /*
7967          * We may have just created probes, so we must now rematch against
7968          * any retained enablings.  Note that this call will acquire both
7969          * cpu_lock and dtrace_lock; the fact that we are holding
7970          * dtrace_meta_lock now is what defines the ordering with respect to
7971          * these three locks.
7972          */
7973         dtrace_enabling_matchall();
7974 }
7975 
7976 static void
7977 dtrace_helper_provider_remove_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
7978 {
7979         uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
7980         dof_hdr_t *dof = (dof_hdr_t *)daddr;
7981         dof_sec_t *str_sec;
7982         dof_provider_t *provider;
7983         char *strtab;
7984         dtrace_helper_provdesc_t dhpv;
7985         dtrace_meta_t *meta = dtrace_meta_pid;
7986         dtrace_mops_t *mops = &meta->dtm_mops;
7987 
7988         provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
7989         str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
7990             provider->dofpv_strtab * dof->dofh_secsize);
7991 
7992         strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
7993 
7994         /*
7995          * Create the provider.
7996          */
7997         dtrace_dofprov2hprov(&dhpv, provider, strtab);
7998 
7999         mops->dtms_remove_pid(meta->dtm_arg, &dhpv, pid);
8000 
8001         meta->dtm_count--;
8002 }
8003 
8004 static void
8005 dtrace_helper_provider_remove(dof_helper_t *dhp, pid_t pid)
8006 {
8007         uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
8008         dof_hdr_t *dof = (dof_hdr_t *)daddr;
8009         int i;
8010 
8011         ASSERT(MUTEX_HELD(&dtrace_meta_lock));
8012 
8013         for (i = 0; i < dof->dofh_secnum; i++) {
8014                 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
8015                     dof->dofh_secoff + i * dof->dofh_secsize);
8016 
8017                 if (sec->dofs_type != DOF_SECT_PROVIDER)
8018                         continue;
8019 
8020                 dtrace_helper_provider_remove_one(dhp, sec, pid);
8021         }
8022 }
8023 
8024 /*
8025  * DTrace Meta Provider-to-Framework API Functions
8026  *
8027  * These functions implement the Meta Provider-to-Framework API, as described
8028  * in <sys/dtrace.h>.
8029  */
8030 int
8031 dtrace_meta_register(const char *name, const dtrace_mops_t *mops, void *arg,
8032     dtrace_meta_provider_id_t *idp)
8033 {
8034         dtrace_meta_t *meta;
8035         dtrace_helpers_t *help, *next;
8036         int i;
8037 
8038         *idp = DTRACE_METAPROVNONE;
8039 
8040         /*
8041          * We strictly don't need the name, but we hold onto it for
8042          * debuggability. All hail error queues!
8043          */
8044         if (name == NULL) {
8045                 cmn_err(CE_WARN, "failed to register meta-provider: "
8046                     "invalid name");
8047                 return (EINVAL);
8048         }
8049 
8050         if (mops == NULL ||
8051             mops->dtms_create_probe == NULL ||
8052             mops->dtms_provide_pid == NULL ||
8053             mops->dtms_remove_pid == NULL) {
8054                 cmn_err(CE_WARN, "failed to register meta-register %s: "
8055                     "invalid ops", name);
8056                 return (EINVAL);
8057         }
8058 
8059         meta = kmem_zalloc(sizeof (dtrace_meta_t), KM_SLEEP);
8060         meta->dtm_mops = *mops;
8061         meta->dtm_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
8062         (void) strcpy(meta->dtm_name, name);
8063         meta->dtm_arg = arg;
8064 
8065         mutex_enter(&dtrace_meta_lock);
8066         mutex_enter(&dtrace_lock);
8067 
8068         if (dtrace_meta_pid != NULL) {
8069                 mutex_exit(&dtrace_lock);
8070                 mutex_exit(&dtrace_meta_lock);
8071                 cmn_err(CE_WARN, "failed to register meta-register %s: "
8072                     "user-land meta-provider exists", name);
8073                 kmem_free(meta->dtm_name, strlen(meta->dtm_name) + 1);
8074                 kmem_free(meta, sizeof (dtrace_meta_t));
8075                 return (EINVAL);
8076         }
8077 
8078         dtrace_meta_pid = meta;
8079         *idp = (dtrace_meta_provider_id_t)meta;
8080 
8081         /*
8082          * If there are providers and probes ready to go, pass them
8083          * off to the new meta provider now.
8084          */
8085 
8086         help = dtrace_deferred_pid;
8087         dtrace_deferred_pid = NULL;
8088 
8089         mutex_exit(&dtrace_lock);
8090 
8091         while (help != NULL) {
8092                 for (i = 0; i < help->dthps_nprovs; i++) {
8093                         dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
8094                             help->dthps_pid);
8095                 }
8096 
8097                 next = help->dthps_next;
8098                 help->dthps_next = NULL;
8099                 help->dthps_prev = NULL;
8100                 help->dthps_deferred = 0;
8101                 help = next;
8102         }
8103 
8104         mutex_exit(&dtrace_meta_lock);
8105 
8106         return (0);
8107 }
8108 
8109 int
8110 dtrace_meta_unregister(dtrace_meta_provider_id_t id)
8111 {
8112         dtrace_meta_t **pp, *old = (dtrace_meta_t *)id;
8113 
8114         mutex_enter(&dtrace_meta_lock);
8115         mutex_enter(&dtrace_lock);
8116 
8117         if (old == dtrace_meta_pid) {
8118                 pp = &dtrace_meta_pid;
8119         } else {
8120                 panic("attempt to unregister non-existent "
8121                     "dtrace meta-provider %p\n", (void *)old);
8122         }
8123 
8124         if (old->dtm_count != 0) {
8125                 mutex_exit(&dtrace_lock);
8126                 mutex_exit(&dtrace_meta_lock);
8127                 return (EBUSY);
8128         }
8129 
8130         *pp = NULL;
8131 
8132         mutex_exit(&dtrace_lock);
8133         mutex_exit(&dtrace_meta_lock);
8134 
8135         kmem_free(old->dtm_name, strlen(old->dtm_name) + 1);
8136         kmem_free(old, sizeof (dtrace_meta_t));
8137 
8138         return (0);
8139 }
8140 
8141 
8142 /*
8143  * DTrace DIF Object Functions
8144  */
8145 static int
8146 dtrace_difo_err(uint_t pc, const char *format, ...)
8147 {
8148         if (dtrace_err_verbose) {
8149                 va_list alist;
8150 
8151                 (void) uprintf("dtrace DIF object error: [%u]: ", pc);
8152                 va_start(alist, format);
8153                 (void) vuprintf(format, alist);
8154                 va_end(alist);
8155         }
8156 
8157 #ifdef DTRACE_ERRDEBUG
8158         dtrace_errdebug(format);
8159 #endif
8160         return (1);
8161 }
8162 
8163 /*
8164  * Validate a DTrace DIF object by checking the IR instructions.  The following
8165  * rules are currently enforced by dtrace_difo_validate():
8166  *
8167  * 1. Each instruction must have a valid opcode
8168  * 2. Each register, string, variable, or subroutine reference must be valid
8169  * 3. No instruction can modify register %r0 (must be zero)
8170  * 4. All instruction reserved bits must be set to zero
8171  * 5. The last instruction must be a "ret" instruction
8172  * 6. All branch targets must reference a valid instruction _after_ the branch
8173  */
8174 static int
8175 dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
8176     cred_t *cr)
8177 {
8178         int err = 0, i;
8179         int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
8180         int kcheckload;
8181         uint_t pc;
8182 
8183         kcheckload = cr == NULL ||
8184             (vstate->dtvs_state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) == 0;
8185 
8186         dp->dtdo_destructive = 0;
8187 
8188         for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
8189                 dif_instr_t instr = dp->dtdo_buf[pc];
8190 
8191                 uint_t r1 = DIF_INSTR_R1(instr);
8192                 uint_t r2 = DIF_INSTR_R2(instr);
8193                 uint_t rd = DIF_INSTR_RD(instr);
8194                 uint_t rs = DIF_INSTR_RS(instr);
8195                 uint_t label = DIF_INSTR_LABEL(instr);
8196                 uint_t v = DIF_INSTR_VAR(instr);
8197                 uint_t subr = DIF_INSTR_SUBR(instr);
8198                 uint_t type = DIF_INSTR_TYPE(instr);
8199                 uint_t op = DIF_INSTR_OP(instr);
8200 
8201                 switch (op) {
8202                 case DIF_OP_OR:
8203                 case DIF_OP_XOR:
8204                 case DIF_OP_AND:
8205                 case DIF_OP_SLL:
8206                 case DIF_OP_SRL:
8207                 case DIF_OP_SRA:
8208                 case DIF_OP_SUB:
8209                 case DIF_OP_ADD:
8210                 case DIF_OP_MUL:
8211                 case DIF_OP_SDIV:
8212                 case DIF_OP_UDIV:
8213                 case DIF_OP_SREM:
8214                 case DIF_OP_UREM:
8215                 case DIF_OP_COPYS:
8216                         if (r1 >= nregs)
8217                                 err += efunc(pc, "invalid register %u\n", r1);
8218                         if (r2 >= nregs)
8219                                 err += efunc(pc, "invalid register %u\n", r2);
8220                         if (rd >= nregs)
8221                                 err += efunc(pc, "invalid register %u\n", rd);
8222                         if (rd == 0)
8223                                 err += efunc(pc, "cannot write to %r0\n");
8224                         break;
8225                 case DIF_OP_NOT:
8226                 case DIF_OP_MOV:
8227                 case DIF_OP_ALLOCS:
8228                         if (r1 >= nregs)
8229                                 err += efunc(pc, "invalid register %u\n", r1);
8230                         if (r2 != 0)
8231                                 err += efunc(pc, "non-zero reserved bits\n");
8232                         if (rd >= nregs)
8233                                 err += efunc(pc, "invalid register %u\n", rd);
8234                         if (rd == 0)
8235                                 err += efunc(pc, "cannot write to %r0\n");
8236                         break;
8237                 case DIF_OP_LDSB:
8238                 case DIF_OP_LDSH:
8239                 case DIF_OP_LDSW:
8240                 case DIF_OP_LDUB:
8241                 case DIF_OP_LDUH:
8242                 case DIF_OP_LDUW:
8243                 case DIF_OP_LDX:
8244                         if (r1 >= nregs)
8245                                 err += efunc(pc, "invalid register %u\n", r1);
8246                         if (r2 != 0)
8247                                 err += efunc(pc, "non-zero reserved bits\n");
8248                         if (rd >= nregs)
8249                                 err += efunc(pc, "invalid register %u\n", rd);
8250                         if (rd == 0)
8251                                 err += efunc(pc, "cannot write to %r0\n");
8252                         if (kcheckload)
8253                                 dp->dtdo_buf[pc] = DIF_INSTR_LOAD(op +
8254                                     DIF_OP_RLDSB - DIF_OP_LDSB, r1, rd);
8255                         break;
8256                 case DIF_OP_RLDSB:
8257                 case DIF_OP_RLDSH:
8258                 case DIF_OP_RLDSW:
8259                 case DIF_OP_RLDUB:
8260                 case DIF_OP_RLDUH:
8261                 case DIF_OP_RLDUW:
8262                 case DIF_OP_RLDX:
8263                         if (r1 >= nregs)
8264                                 err += efunc(pc, "invalid register %u\n", r1);
8265                         if (r2 != 0)
8266                                 err += efunc(pc, "non-zero reserved bits\n");
8267                         if (rd >= nregs)
8268                                 err += efunc(pc, "invalid register %u\n", rd);
8269                         if (rd == 0)
8270                                 err += efunc(pc, "cannot write to %r0\n");
8271                         break;
8272                 case DIF_OP_ULDSB:
8273                 case DIF_OP_ULDSH:
8274                 case DIF_OP_ULDSW:
8275                 case DIF_OP_ULDUB:
8276                 case DIF_OP_ULDUH:
8277                 case DIF_OP_ULDUW:
8278                 case DIF_OP_ULDX:
8279                         if (r1 >= nregs)
8280                                 err += efunc(pc, "invalid register %u\n", r1);
8281                         if (r2 != 0)
8282                                 err += efunc(pc, "non-zero reserved bits\n");
8283                         if (rd >= nregs)
8284                                 err += efunc(pc, "invalid register %u\n", rd);
8285                         if (rd == 0)
8286                                 err += efunc(pc, "cannot write to %r0\n");
8287                         break;
8288                 case DIF_OP_STB:
8289                 case DIF_OP_STH:
8290                 case DIF_OP_STW:
8291                 case DIF_OP_STX:
8292                         if (r1 >= nregs)
8293                                 err += efunc(pc, "invalid register %u\n", r1);
8294                         if (r2 != 0)
8295                                 err += efunc(pc, "non-zero reserved bits\n");
8296                         if (rd >= nregs)
8297                                 err += efunc(pc, "invalid register %u\n", rd);
8298                         if (rd == 0)
8299                                 err += efunc(pc, "cannot write to 0 address\n");
8300                         break;
8301                 case DIF_OP_CMP:
8302                 case DIF_OP_SCMP:
8303                         if (r1 >= nregs)
8304                                 err += efunc(pc, "invalid register %u\n", r1);
8305                         if (r2 >= nregs)
8306                                 err += efunc(pc, "invalid register %u\n", r2);
8307                         if (rd != 0)
8308                                 err += efunc(pc, "non-zero reserved bits\n");
8309                         break;
8310                 case DIF_OP_TST:
8311                         if (r1 >= nregs)
8312                                 err += efunc(pc, "invalid register %u\n", r1);
8313                         if (r2 != 0 || rd != 0)
8314                                 err += efunc(pc, "non-zero reserved bits\n");
8315                         break;
8316                 case DIF_OP_BA:
8317                 case DIF_OP_BE:
8318                 case DIF_OP_BNE:
8319                 case DIF_OP_BG:
8320                 case DIF_OP_BGU:
8321                 case DIF_OP_BGE:
8322                 case DIF_OP_BGEU:
8323                 case DIF_OP_BL:
8324                 case DIF_OP_BLU:
8325                 case DIF_OP_BLE:
8326                 case DIF_OP_BLEU:
8327                         if (label >= dp->dtdo_len) {
8328                                 err += efunc(pc, "invalid branch target %u\n",
8329                                     label);
8330                         }
8331                         if (label <= pc) {
8332                                 err += efunc(pc, "backward branch to %u\n",
8333                                     label);
8334                         }
8335                         break;
8336                 case DIF_OP_RET:
8337                         if (r1 != 0 || r2 != 0)
8338                                 err += efunc(pc, "non-zero reserved bits\n");
8339                         if (rd >= nregs)
8340                                 err += efunc(pc, "invalid register %u\n", rd);
8341                         break;
8342                 case DIF_OP_NOP:
8343                 case DIF_OP_POPTS:
8344                 case DIF_OP_FLUSHTS:
8345                         if (r1 != 0 || r2 != 0 || rd != 0)
8346                                 err += efunc(pc, "non-zero reserved bits\n");
8347                         break;
8348                 case DIF_OP_SETX:
8349                         if (DIF_INSTR_INTEGER(instr) >= dp->dtdo_intlen) {
8350                                 err += efunc(pc, "invalid integer ref %u\n",
8351                                     DIF_INSTR_INTEGER(instr));
8352                         }
8353                         if (rd >= nregs)
8354                                 err += efunc(pc, "invalid register %u\n", rd);
8355                         if (rd == 0)
8356                                 err += efunc(pc, "cannot write to %r0\n");
8357                         break;
8358                 case DIF_OP_SETS:
8359                         if (DIF_INSTR_STRING(instr) >= dp->dtdo_strlen) {
8360                                 err += efunc(pc, "invalid string ref %u\n",
8361                                     DIF_INSTR_STRING(instr));
8362                         }
8363                         if (rd >= nregs)
8364                                 err += efunc(pc, "invalid register %u\n", rd);
8365                         if (rd == 0)
8366                                 err += efunc(pc, "cannot write to %r0\n");
8367                         break;
8368                 case DIF_OP_LDGA:
8369                 case DIF_OP_LDTA:
8370                         if (r1 > DIF_VAR_ARRAY_MAX)
8371                                 err += efunc(pc, "invalid array %u\n", r1);
8372                         if (r2 >= nregs)
8373                                 err += efunc(pc, "invalid register %u\n", r2);
8374                         if (rd >= nregs)
8375                                 err += efunc(pc, "invalid register %u\n", rd);
8376                         if (rd == 0)
8377                                 err += efunc(pc, "cannot write to %r0\n");
8378                         break;
8379                 case DIF_OP_LDGS:
8380                 case DIF_OP_LDTS:
8381                 case DIF_OP_LDLS:
8382                 case DIF_OP_LDGAA:
8383                 case DIF_OP_LDTAA:
8384                         if (v < DIF_VAR_OTHER_MIN || v > DIF_VAR_OTHER_MAX)
8385                                 err += efunc(pc, "invalid variable %u\n", v);
8386                         if (rd >= nregs)
8387                                 err += efunc(pc, "invalid register %u\n", rd);
8388                         if (rd == 0)
8389                                 err += efunc(pc, "cannot write to %r0\n");
8390                         break;
8391                 case DIF_OP_STGS:
8392                 case DIF_OP_STTS:
8393                 case DIF_OP_STLS:
8394                 case DIF_OP_STGAA:
8395                 case DIF_OP_STTAA:
8396                         if (v < DIF_VAR_OTHER_UBASE || v > DIF_VAR_OTHER_MAX)
8397                                 err += efunc(pc, "invalid variable %u\n", v);
8398                         if (rs >= nregs)
8399                                 err += efunc(pc, "invalid register %u\n", rd);
8400                         break;
8401                 case DIF_OP_CALL:
8402                         if (subr > DIF_SUBR_MAX)
8403                                 err += efunc(pc, "invalid subr %u\n", subr);
8404                         if (rd >= nregs)
8405                                 err += efunc(pc, "invalid register %u\n", rd);
8406                         if (rd == 0)
8407                                 err += efunc(pc, "cannot write to %r0\n");
8408 
8409                         if (subr == DIF_SUBR_COPYOUT ||
8410                             subr == DIF_SUBR_COPYOUTSTR) {
8411                                 dp->dtdo_destructive = 1;
8412                         }
8413                         break;
8414                 case DIF_OP_PUSHTR:
8415                         if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF)
8416                                 err += efunc(pc, "invalid ref type %u\n", type);
8417                         if (r2 >= nregs)
8418                                 err += efunc(pc, "invalid register %u\n", r2);
8419                         if (rs >= nregs)
8420                                 err += efunc(pc, "invalid register %u\n", rs);
8421                         break;
8422                 case DIF_OP_PUSHTV:
8423                         if (type != DIF_TYPE_CTF)
8424                                 err += efunc(pc, "invalid val type %u\n", type);
8425                         if (r2 >= nregs)
8426                                 err += efunc(pc, "invalid register %u\n", r2);
8427                         if (rs >= nregs)
8428                                 err += efunc(pc, "invalid register %u\n", rs);
8429                         break;
8430                 default:
8431                         err += efunc(pc, "invalid opcode %u\n",
8432                             DIF_INSTR_OP(instr));
8433                 }
8434         }
8435 
8436         if (dp->dtdo_len != 0 &&
8437             DIF_INSTR_OP(dp->dtdo_buf[dp->dtdo_len - 1]) != DIF_OP_RET) {
8438                 err += efunc(dp->dtdo_len - 1,
8439                     "expected 'ret' as last DIF instruction\n");
8440         }
8441 
8442         if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF)) {
8443                 /*
8444                  * If we're not returning by reference, the size must be either
8445                  * 0 or the size of one of the base types.
8446                  */
8447                 switch (dp->dtdo_rtype.dtdt_size) {
8448                 case 0:
8449                 case sizeof (uint8_t):
8450                 case sizeof (uint16_t):
8451                 case sizeof (uint32_t):
8452                 case sizeof (uint64_t):
8453                         break;
8454 
8455                 default:
8456                         err += efunc(dp->dtdo_len - 1, "bad return size\n");
8457                 }
8458         }
8459 
8460         for (i = 0; i < dp->dtdo_varlen && err == 0; i++) {
8461                 dtrace_difv_t *v = &dp->dtdo_vartab[i], *existing = NULL;
8462                 dtrace_diftype_t *vt, *et;
8463                 uint_t id, ndx;
8464 
8465                 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL &&
8466                     v->dtdv_scope != DIFV_SCOPE_THREAD &&
8467                     v->dtdv_scope != DIFV_SCOPE_LOCAL) {
8468                         err += efunc(i, "unrecognized variable scope %d\n",
8469                             v->dtdv_scope);
8470                         break;
8471                 }
8472 
8473                 if (v->dtdv_kind != DIFV_KIND_ARRAY &&
8474                     v->dtdv_kind != DIFV_KIND_SCALAR) {
8475                         err += efunc(i, "unrecognized variable type %d\n",
8476                             v->dtdv_kind);
8477                         break;
8478                 }
8479 
8480                 if ((id = v->dtdv_id) > DIF_VARIABLE_MAX) {
8481                         err += efunc(i, "%d exceeds variable id limit\n", id);
8482                         break;
8483                 }
8484 
8485                 if (id < DIF_VAR_OTHER_UBASE)
8486                         continue;
8487 
8488                 /*
8489                  * For user-defined variables, we need to check that this
8490                  * definition is identical to any previous definition that we
8491                  * encountered.
8492                  */
8493                 ndx = id - DIF_VAR_OTHER_UBASE;
8494 
8495                 switch (v->dtdv_scope) {
8496                 case DIFV_SCOPE_GLOBAL:
8497                         if (ndx < vstate->dtvs_nglobals) {
8498                                 dtrace_statvar_t *svar;
8499 
8500                                 if ((svar = vstate->dtvs_globals[ndx]) != NULL)
8501                                         existing = &svar->dtsv_var;
8502                         }
8503 
8504                         break;
8505 
8506                 case DIFV_SCOPE_THREAD:
8507                         if (ndx < vstate->dtvs_ntlocals)
8508                                 existing = &vstate->dtvs_tlocals[ndx];
8509                         break;
8510 
8511                 case DIFV_SCOPE_LOCAL:
8512                         if (ndx < vstate->dtvs_nlocals) {
8513                                 dtrace_statvar_t *svar;
8514 
8515                                 if ((svar = vstate->dtvs_locals[ndx]) != NULL)
8516                                         existing = &svar->dtsv_var;
8517                         }
8518 
8519                         break;
8520                 }
8521 
8522                 vt = &v->dtdv_type;
8523 
8524                 if (vt->dtdt_flags & DIF_TF_BYREF) {
8525                         if (vt->dtdt_size == 0) {
8526                                 err += efunc(i, "zero-sized variable\n");
8527                                 break;
8528                         }
8529 
8530                         if (v->dtdv_scope == DIFV_SCOPE_GLOBAL &&
8531                             vt->dtdt_size > dtrace_global_maxsize) {
8532                                 err += efunc(i, "oversized by-ref global\n");
8533                                 break;
8534                         }
8535                 }
8536 
8537                 if (existing == NULL || existing->dtdv_id == 0)
8538                         continue;
8539 
8540                 ASSERT(existing->dtdv_id == v->dtdv_id);
8541                 ASSERT(existing->dtdv_scope == v->dtdv_scope);
8542 
8543                 if (existing->dtdv_kind != v->dtdv_kind)
8544                         err += efunc(i, "%d changed variable kind\n", id);
8545 
8546                 et = &existing->dtdv_type;
8547 
8548                 if (vt->dtdt_flags != et->dtdt_flags) {
8549                         err += efunc(i, "%d changed variable type flags\n", id);
8550                         break;
8551                 }
8552 
8553                 if (vt->dtdt_size != 0 && vt->dtdt_size != et->dtdt_size) {
8554                         err += efunc(i, "%d changed variable type size\n", id);
8555                         break;
8556                 }
8557         }
8558 
8559         return (err);
8560 }
8561 
8562 /*
8563  * Validate a DTrace DIF object that it is to be used as a helper.  Helpers
8564  * are much more constrained than normal DIFOs.  Specifically, they may
8565  * not:
8566  *
8567  * 1. Make calls to subroutines other than copyin(), copyinstr() or
8568  *    miscellaneous string routines
8569  * 2. Access DTrace variables other than the args[] array, and the
8570  *    curthread, pid, ppid, tid, execname, zonename, uid and gid variables.
8571  * 3. Have thread-local variables.
8572  * 4. Have dynamic variables.
8573  */
8574 static int
8575 dtrace_difo_validate_helper(dtrace_difo_t *dp)
8576 {
8577         int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
8578         int err = 0;
8579         uint_t pc;
8580 
8581         for (pc = 0; pc < dp->dtdo_len; pc++) {
8582                 dif_instr_t instr = dp->dtdo_buf[pc];
8583 
8584                 uint_t v = DIF_INSTR_VAR(instr);
8585                 uint_t subr = DIF_INSTR_SUBR(instr);
8586                 uint_t op = DIF_INSTR_OP(instr);
8587 
8588                 switch (op) {
8589                 case DIF_OP_OR:
8590                 case DIF_OP_XOR:
8591                 case DIF_OP_AND:
8592                 case DIF_OP_SLL:
8593                 case DIF_OP_SRL:
8594                 case DIF_OP_SRA:
8595                 case DIF_OP_SUB:
8596                 case DIF_OP_ADD:
8597                 case DIF_OP_MUL:
8598                 case DIF_OP_SDIV:
8599                 case DIF_OP_UDIV:
8600                 case DIF_OP_SREM:
8601                 case DIF_OP_UREM:
8602                 case DIF_OP_COPYS:
8603                 case DIF_OP_NOT:
8604                 case DIF_OP_MOV:
8605                 case DIF_OP_RLDSB:
8606                 case DIF_OP_RLDSH:
8607                 case DIF_OP_RLDSW:
8608                 case DIF_OP_RLDUB:
8609                 case DIF_OP_RLDUH:
8610                 case DIF_OP_RLDUW:
8611                 case DIF_OP_RLDX:
8612                 case DIF_OP_ULDSB:
8613                 case DIF_OP_ULDSH:
8614                 case DIF_OP_ULDSW:
8615                 case DIF_OP_ULDUB:
8616                 case DIF_OP_ULDUH:
8617                 case DIF_OP_ULDUW:
8618                 case DIF_OP_ULDX:
8619                 case DIF_OP_STB:
8620                 case DIF_OP_STH:
8621                 case DIF_OP_STW:
8622                 case DIF_OP_STX:
8623                 case DIF_OP_ALLOCS:
8624                 case DIF_OP_CMP:
8625                 case DIF_OP_SCMP:
8626                 case DIF_OP_TST:
8627                 case DIF_OP_BA:
8628                 case DIF_OP_BE:
8629                 case DIF_OP_BNE:
8630                 case DIF_OP_BG:
8631                 case DIF_OP_BGU:
8632                 case DIF_OP_BGE:
8633                 case DIF_OP_BGEU:
8634                 case DIF_OP_BL:
8635                 case DIF_OP_BLU:
8636                 case DIF_OP_BLE:
8637                 case DIF_OP_BLEU:
8638                 case DIF_OP_RET:
8639                 case DIF_OP_NOP:
8640                 case DIF_OP_POPTS:
8641                 case DIF_OP_FLUSHTS:
8642                 case DIF_OP_SETX:
8643                 case DIF_OP_SETS:
8644                 case DIF_OP_LDGA:
8645                 case DIF_OP_LDLS:
8646                 case DIF_OP_STGS:
8647                 case DIF_OP_STLS:
8648                 case DIF_OP_PUSHTR:
8649                 case DIF_OP_PUSHTV:
8650                         break;
8651 
8652                 case DIF_OP_LDGS:
8653                         if (v >= DIF_VAR_OTHER_UBASE)
8654                                 break;
8655 
8656                         if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9)
8657                                 break;
8658 
8659                         if (v == DIF_VAR_CURTHREAD || v == DIF_VAR_PID ||
8660                             v == DIF_VAR_PPID || v == DIF_VAR_TID ||
8661                             v == DIF_VAR_EXECNAME || v == DIF_VAR_ZONENAME ||
8662                             v == DIF_VAR_UID || v == DIF_VAR_GID)
8663                                 break;
8664 
8665                         err += efunc(pc, "illegal variable %u\n", v);
8666                         break;
8667 
8668                 case DIF_OP_LDTA:
8669                 case DIF_OP_LDTS:
8670                 case DIF_OP_LDGAA:
8671                 case DIF_OP_LDTAA:
8672                         err += efunc(pc, "illegal dynamic variable load\n");
8673                         break;
8674 
8675                 case DIF_OP_STTS:
8676                 case DIF_OP_STGAA:
8677                 case DIF_OP_STTAA:
8678                         err += efunc(pc, "illegal dynamic variable store\n");
8679                         break;
8680 
8681                 case DIF_OP_CALL:
8682                         if (subr == DIF_SUBR_ALLOCA ||
8683                             subr == DIF_SUBR_BCOPY ||
8684                             subr == DIF_SUBR_COPYIN ||
8685                             subr == DIF_SUBR_COPYINTO ||
8686                             subr == DIF_SUBR_COPYINSTR ||
8687                             subr == DIF_SUBR_INDEX ||
8688                             subr == DIF_SUBR_INET_NTOA ||
8689                             subr == DIF_SUBR_INET_NTOA6 ||
8690                             subr == DIF_SUBR_INET_NTOP ||
8691                             subr == DIF_SUBR_LLTOSTR ||
8692                             subr == DIF_SUBR_RINDEX ||
8693                             subr == DIF_SUBR_STRCHR ||
8694                             subr == DIF_SUBR_STRJOIN ||
8695                             subr == DIF_SUBR_STRRCHR ||
8696                             subr == DIF_SUBR_STRSTR ||
8697                             subr == DIF_SUBR_HTONS ||
8698                             subr == DIF_SUBR_HTONL ||
8699                             subr == DIF_SUBR_HTONLL ||
8700                             subr == DIF_SUBR_NTOHS ||
8701                             subr == DIF_SUBR_NTOHL ||
8702                             subr == DIF_SUBR_NTOHLL)
8703                                 break;
8704 
8705                         err += efunc(pc, "invalid subr %u\n", subr);
8706                         break;
8707 
8708                 default:
8709                         err += efunc(pc, "invalid opcode %u\n",
8710                             DIF_INSTR_OP(instr));
8711                 }
8712         }
8713 
8714         return (err);
8715 }
8716 
8717 /*
8718  * Returns 1 if the expression in the DIF object can be cached on a per-thread
8719  * basis; 0 if not.
8720  */
8721 static int
8722 dtrace_difo_cacheable(dtrace_difo_t *dp)
8723 {
8724         int i;
8725 
8726         if (dp == NULL)
8727                 return (0);
8728 
8729         for (i = 0; i < dp->dtdo_varlen; i++) {
8730                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
8731 
8732                 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL)
8733                         continue;
8734 
8735                 switch (v->dtdv_id) {
8736                 case DIF_VAR_CURTHREAD:
8737                 case DIF_VAR_PID:
8738                 case DIF_VAR_TID:
8739                 case DIF_VAR_EXECNAME:
8740                 case DIF_VAR_ZONENAME:
8741                         break;
8742 
8743                 default:
8744                         return (0);
8745                 }
8746         }
8747 
8748         /*
8749          * This DIF object may be cacheable.  Now we need to look for any
8750          * array loading instructions, any memory loading instructions, or
8751          * any stores to thread-local variables.
8752          */
8753         for (i = 0; i < dp->dtdo_len; i++) {
8754                 uint_t op = DIF_INSTR_OP(dp->dtdo_buf[i]);
8755 
8756                 if ((op >= DIF_OP_LDSB && op <= DIF_OP_LDX) ||
8757                     (op >= DIF_OP_ULDSB && op <= DIF_OP_ULDX) ||
8758                     (op >= DIF_OP_RLDSB && op <= DIF_OP_RLDX) ||
8759                     op == DIF_OP_LDGA || op == DIF_OP_STTS)
8760                         return (0);
8761         }
8762 
8763         return (1);
8764 }
8765 
8766 static void
8767 dtrace_difo_hold(dtrace_difo_t *dp)
8768 {
8769         int i;
8770 
8771         ASSERT(MUTEX_HELD(&dtrace_lock));
8772 
8773         dp->dtdo_refcnt++;
8774         ASSERT(dp->dtdo_refcnt != 0);
8775 
8776         /*
8777          * We need to check this DIF object for references to the variable
8778          * DIF_VAR_VTIMESTAMP.
8779          */
8780         for (i = 0; i < dp->dtdo_varlen; i++) {
8781                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
8782 
8783                 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
8784                         continue;
8785 
8786                 if (dtrace_vtime_references++ == 0)
8787                         dtrace_vtime_enable();
8788         }
8789 }
8790 
8791 /*
8792  * This routine calculates the dynamic variable chunksize for a given DIF
8793  * object.  The calculation is not fool-proof, and can probably be tricked by
8794  * malicious DIF -- but it works for all compiler-generated DIF.  Because this
8795  * calculation is likely imperfect, dtrace_dynvar() is able to gracefully fail
8796  * if a dynamic variable size exceeds the chunksize.
8797  */
8798 static void
8799 dtrace_difo_chunksize(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
8800 {
8801         uint64_t sval;
8802         dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
8803         const dif_instr_t *text = dp->dtdo_buf;
8804         uint_t pc, srd = 0;
8805         uint_t ttop = 0;
8806         size_t size, ksize;
8807         uint_t id, i;
8808 
8809         for (pc = 0; pc < dp->dtdo_len; pc++) {
8810                 dif_instr_t instr = text[pc];
8811                 uint_t op = DIF_INSTR_OP(instr);
8812                 uint_t rd = DIF_INSTR_RD(instr);
8813                 uint_t r1 = DIF_INSTR_R1(instr);
8814                 uint_t nkeys = 0;
8815                 uchar_t scope;
8816 
8817                 dtrace_key_t *key = tupregs;
8818 
8819                 switch (op) {
8820                 case DIF_OP_SETX:
8821                         sval = dp->dtdo_inttab[DIF_INSTR_INTEGER(instr)];
8822                         srd = rd;
8823                         continue;
8824 
8825                 case DIF_OP_STTS:
8826                         key = &tupregs[DIF_DTR_NREGS];
8827                         key[0].dttk_size = 0;
8828                         key[1].dttk_size = 0;
8829                         nkeys = 2;
8830                         scope = DIFV_SCOPE_THREAD;
8831                         break;
8832 
8833                 case DIF_OP_STGAA:
8834                 case DIF_OP_STTAA:
8835                         nkeys = ttop;
8836 
8837                         if (DIF_INSTR_OP(instr) == DIF_OP_STTAA)
8838                                 key[nkeys++].dttk_size = 0;
8839 
8840                         key[nkeys++].dttk_size = 0;
8841 
8842                         if (op == DIF_OP_STTAA) {
8843                                 scope = DIFV_SCOPE_THREAD;
8844                         } else {
8845                                 scope = DIFV_SCOPE_GLOBAL;
8846                         }
8847 
8848                         break;
8849 
8850                 case DIF_OP_PUSHTR:
8851                         if (ttop == DIF_DTR_NREGS)
8852                                 return;
8853 
8854                         if ((srd == 0 || sval == 0) && r1 == DIF_TYPE_STRING) {
8855                                 /*
8856                                  * If the register for the size of the "pushtr"
8857                                  * is %r0 (or the value is 0) and the type is
8858                                  * a string, we'll use the system-wide default
8859                                  * string size.
8860                                  */
8861                                 tupregs[ttop++].dttk_size =
8862                                     dtrace_strsize_default;
8863                         } else {
8864                                 if (srd == 0)
8865                                         return;
8866 
8867                                 tupregs[ttop++].dttk_size = sval;
8868                         }
8869 
8870                         break;
8871 
8872                 case DIF_OP_PUSHTV:
8873                         if (ttop == DIF_DTR_NREGS)
8874                                 return;
8875 
8876                         tupregs[ttop++].dttk_size = 0;
8877                         break;
8878 
8879                 case DIF_OP_FLUSHTS:
8880                         ttop = 0;
8881                         break;
8882 
8883                 case DIF_OP_POPTS:
8884                         if (ttop != 0)
8885                                 ttop--;
8886                         break;
8887                 }
8888 
8889                 sval = 0;
8890                 srd = 0;
8891 
8892                 if (nkeys == 0)
8893                         continue;
8894 
8895                 /*
8896                  * We have a dynamic variable allocation; calculate its size.
8897                  */
8898                 for (ksize = 0, i = 0; i < nkeys; i++)
8899                         ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
8900 
8901                 size = sizeof (dtrace_dynvar_t);
8902                 size += sizeof (dtrace_key_t) * (nkeys - 1);
8903                 size += ksize;
8904 
8905                 /*
8906                  * Now we need to determine the size of the stored data.
8907                  */
8908                 id = DIF_INSTR_VAR(instr);
8909 
8910                 for (i = 0; i < dp->dtdo_varlen; i++) {
8911                         dtrace_difv_t *v = &dp->dtdo_vartab[i];
8912 
8913                         if (v->dtdv_id == id && v->dtdv_scope == scope) {
8914                                 size += v->dtdv_type.dtdt_size;
8915                                 break;
8916                         }
8917                 }
8918 
8919                 if (i == dp->dtdo_varlen)
8920                         return;
8921 
8922                 /*
8923                  * We have the size.  If this is larger than the chunk size
8924                  * for our dynamic variable state, reset the chunk size.
8925                  */
8926                 size = P2ROUNDUP(size, sizeof (uint64_t));
8927 
8928                 if (size > vstate->dtvs_dynvars.dtds_chunksize)
8929                         vstate->dtvs_dynvars.dtds_chunksize = size;
8930         }
8931 }
8932 
8933 static void
8934 dtrace_difo_init(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
8935 {
8936         int i, oldsvars, osz, nsz, otlocals, ntlocals;
8937         uint_t id;
8938 
8939         ASSERT(MUTEX_HELD(&dtrace_lock));
8940         ASSERT(dp->dtdo_buf != NULL && dp->dtdo_len != 0);
8941 
8942         for (i = 0; i < dp->dtdo_varlen; i++) {
8943                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
8944                 dtrace_statvar_t *svar, ***svarp;
8945                 size_t dsize = 0;
8946                 uint8_t scope = v->dtdv_scope;
8947                 int *np;
8948 
8949                 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
8950                         continue;
8951 
8952                 id -= DIF_VAR_OTHER_UBASE;
8953 
8954                 switch (scope) {
8955                 case DIFV_SCOPE_THREAD:
8956                         while (id >= (otlocals = vstate->dtvs_ntlocals)) {
8957                                 dtrace_difv_t *tlocals;
8958 
8959                                 if ((ntlocals = (otlocals << 1)) == 0)
8960                                         ntlocals = 1;
8961 
8962                                 osz = otlocals * sizeof (dtrace_difv_t);
8963                                 nsz = ntlocals * sizeof (dtrace_difv_t);
8964 
8965                                 tlocals = kmem_zalloc(nsz, KM_SLEEP);
8966 
8967                                 if (osz != 0) {
8968                                         bcopy(vstate->dtvs_tlocals,
8969                                             tlocals, osz);
8970                                         kmem_free(vstate->dtvs_tlocals, osz);
8971                                 }
8972 
8973                                 vstate->dtvs_tlocals = tlocals;
8974                                 vstate->dtvs_ntlocals = ntlocals;
8975                         }
8976 
8977                         vstate->dtvs_tlocals[id] = *v;
8978                         continue;
8979 
8980                 case DIFV_SCOPE_LOCAL:
8981                         np = &vstate->dtvs_nlocals;
8982                         svarp = &vstate->dtvs_locals;
8983 
8984                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
8985                                 dsize = NCPU * (v->dtdv_type.dtdt_size +
8986                                     sizeof (uint64_t));
8987                         else
8988                                 dsize = NCPU * sizeof (uint64_t);
8989 
8990                         break;
8991 
8992                 case DIFV_SCOPE_GLOBAL:
8993                         np = &vstate->dtvs_nglobals;
8994                         svarp = &vstate->dtvs_globals;
8995 
8996                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
8997                                 dsize = v->dtdv_type.dtdt_size +
8998                                     sizeof (uint64_t);
8999 
9000                         break;
9001 
9002                 default:
9003                         ASSERT(0);
9004                 }
9005 
9006                 while (id >= (oldsvars = *np)) {
9007                         dtrace_statvar_t **statics;
9008                         int newsvars, oldsize, newsize;
9009 
9010                         if ((newsvars = (oldsvars << 1)) == 0)
9011                                 newsvars = 1;
9012 
9013                         oldsize = oldsvars * sizeof (dtrace_statvar_t *);
9014                         newsize = newsvars * sizeof (dtrace_statvar_t *);
9015 
9016                         statics = kmem_zalloc(newsize, KM_SLEEP);
9017 
9018                         if (oldsize != 0) {
9019                                 bcopy(*svarp, statics, oldsize);
9020                                 kmem_free(*svarp, oldsize);
9021                         }
9022 
9023                         *svarp = statics;
9024                         *np = newsvars;
9025                 }
9026 
9027                 if ((svar = (*svarp)[id]) == NULL) {
9028                         svar = kmem_zalloc(sizeof (dtrace_statvar_t), KM_SLEEP);
9029                         svar->dtsv_var = *v;
9030 
9031                         if ((svar->dtsv_size = dsize) != 0) {
9032                                 svar->dtsv_data = (uint64_t)(uintptr_t)
9033                                     kmem_zalloc(dsize, KM_SLEEP);
9034                         }
9035 
9036                         (*svarp)[id] = svar;
9037                 }
9038 
9039                 svar->dtsv_refcnt++;
9040         }
9041 
9042         dtrace_difo_chunksize(dp, vstate);
9043         dtrace_difo_hold(dp);
9044 }
9045 
9046 static dtrace_difo_t *
9047 dtrace_difo_duplicate(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9048 {
9049         dtrace_difo_t *new;
9050         size_t sz;
9051 
9052         ASSERT(dp->dtdo_buf != NULL);
9053         ASSERT(dp->dtdo_refcnt != 0);
9054 
9055         new = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
9056 
9057         ASSERT(dp->dtdo_buf != NULL);
9058         sz = dp->dtdo_len * sizeof (dif_instr_t);
9059         new->dtdo_buf = kmem_alloc(sz, KM_SLEEP);
9060         bcopy(dp->dtdo_buf, new->dtdo_buf, sz);
9061         new->dtdo_len = dp->dtdo_len;
9062 
9063         if (dp->dtdo_strtab != NULL) {
9064                 ASSERT(dp->dtdo_strlen != 0);
9065                 new->dtdo_strtab = kmem_alloc(dp->dtdo_strlen, KM_SLEEP);
9066                 bcopy(dp->dtdo_strtab, new->dtdo_strtab, dp->dtdo_strlen);
9067                 new->dtdo_strlen = dp->dtdo_strlen;
9068         }
9069 
9070         if (dp->dtdo_inttab != NULL) {
9071                 ASSERT(dp->dtdo_intlen != 0);
9072                 sz = dp->dtdo_intlen * sizeof (uint64_t);
9073                 new->dtdo_inttab = kmem_alloc(sz, KM_SLEEP);
9074                 bcopy(dp->dtdo_inttab, new->dtdo_inttab, sz);
9075                 new->dtdo_intlen = dp->dtdo_intlen;
9076         }
9077 
9078         if (dp->dtdo_vartab != NULL) {
9079                 ASSERT(dp->dtdo_varlen != 0);
9080                 sz = dp->dtdo_varlen * sizeof (dtrace_difv_t);
9081                 new->dtdo_vartab = kmem_alloc(sz, KM_SLEEP);
9082                 bcopy(dp->dtdo_vartab, new->dtdo_vartab, sz);
9083                 new->dtdo_varlen = dp->dtdo_varlen;
9084         }
9085 
9086         dtrace_difo_init(new, vstate);
9087         return (new);
9088 }
9089 
9090 static void
9091 dtrace_difo_destroy(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9092 {
9093         int i;
9094 
9095         ASSERT(dp->dtdo_refcnt == 0);
9096 
9097         for (i = 0; i < dp->dtdo_varlen; i++) {
9098                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9099                 dtrace_statvar_t *svar, **svarp;
9100                 uint_t id;
9101                 uint8_t scope = v->dtdv_scope;
9102                 int *np;
9103 
9104                 switch (scope) {
9105                 case DIFV_SCOPE_THREAD:
9106                         continue;
9107 
9108                 case DIFV_SCOPE_LOCAL:
9109                         np = &vstate->dtvs_nlocals;
9110                         svarp = vstate->dtvs_locals;
9111                         break;
9112 
9113                 case DIFV_SCOPE_GLOBAL:
9114                         np = &vstate->dtvs_nglobals;
9115                         svarp = vstate->dtvs_globals;
9116                         break;
9117 
9118                 default:
9119                         ASSERT(0);
9120                 }
9121 
9122                 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
9123                         continue;
9124 
9125                 id -= DIF_VAR_OTHER_UBASE;
9126                 ASSERT(id < *np);
9127 
9128                 svar = svarp[id];
9129                 ASSERT(svar != NULL);
9130                 ASSERT(svar->dtsv_refcnt > 0);
9131 
9132                 if (--svar->dtsv_refcnt > 0)
9133                         continue;
9134 
9135                 if (svar->dtsv_size != 0) {
9136                         ASSERT(svar->dtsv_data != NULL);
9137                         kmem_free((void *)(uintptr_t)svar->dtsv_data,
9138                             svar->dtsv_size);
9139                 }
9140 
9141                 kmem_free(svar, sizeof (dtrace_statvar_t));
9142                 svarp[id] = NULL;
9143         }
9144 
9145         kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
9146         kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
9147         kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
9148         kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
9149 
9150         kmem_free(dp, sizeof (dtrace_difo_t));
9151 }
9152 
9153 static void
9154 dtrace_difo_release(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9155 {
9156         int i;
9157 
9158         ASSERT(MUTEX_HELD(&dtrace_lock));
9159         ASSERT(dp->dtdo_refcnt != 0);
9160 
9161         for (i = 0; i < dp->dtdo_varlen; i++) {
9162                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9163 
9164                 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
9165                         continue;
9166 
9167                 ASSERT(dtrace_vtime_references > 0);
9168                 if (--dtrace_vtime_references == 0)
9169                         dtrace_vtime_disable();
9170         }
9171 
9172         if (--dp->dtdo_refcnt == 0)
9173                 dtrace_difo_destroy(dp, vstate);
9174 }
9175 
9176 /*
9177  * DTrace Format Functions
9178  */
9179 static uint16_t
9180 dtrace_format_add(dtrace_state_t *state, char *str)
9181 {
9182         char *fmt, **new;
9183         uint16_t ndx, len = strlen(str) + 1;
9184 
9185         fmt = kmem_zalloc(len, KM_SLEEP);
9186         bcopy(str, fmt, len);
9187 
9188         for (ndx = 0; ndx < state->dts_nformats; ndx++) {
9189                 if (state->dts_formats[ndx] == NULL) {
9190                         state->dts_formats[ndx] = fmt;
9191                         return (ndx + 1);
9192                 }
9193         }
9194 
9195         if (state->dts_nformats == USHRT_MAX) {
9196                 /*
9197                  * This is only likely if a denial-of-service attack is being
9198                  * attempted.  As such, it's okay to fail silently here.
9199                  */
9200                 kmem_free(fmt, len);
9201                 return (0);
9202         }
9203 
9204         /*
9205          * For simplicity, we always resize the formats array to be exactly the
9206          * number of formats.
9207          */
9208         ndx = state->dts_nformats++;
9209         new = kmem_alloc((ndx + 1) * sizeof (char *), KM_SLEEP);
9210 
9211         if (state->dts_formats != NULL) {
9212                 ASSERT(ndx != 0);
9213                 bcopy(state->dts_formats, new, ndx * sizeof (char *));
9214                 kmem_free(state->dts_formats, ndx * sizeof (char *));
9215         }
9216 
9217         state->dts_formats = new;
9218         state->dts_formats[ndx] = fmt;
9219 
9220         return (ndx + 1);
9221 }
9222 
9223 static void
9224 dtrace_format_remove(dtrace_state_t *state, uint16_t format)
9225 {
9226         char *fmt;
9227 
9228         ASSERT(state->dts_formats != NULL);
9229         ASSERT(format <= state->dts_nformats);
9230         ASSERT(state->dts_formats[format - 1] != NULL);
9231 
9232         fmt = state->dts_formats[format - 1];
9233         kmem_free(fmt, strlen(fmt) + 1);
9234         state->dts_formats[format - 1] = NULL;
9235 }
9236 
9237 static void
9238 dtrace_format_destroy(dtrace_state_t *state)
9239 {
9240         int i;
9241 
9242         if (state->dts_nformats == 0) {
9243                 ASSERT(state->dts_formats == NULL);
9244                 return;
9245         }
9246 
9247         ASSERT(state->dts_formats != NULL);
9248 
9249         for (i = 0; i < state->dts_nformats; i++) {
9250                 char *fmt = state->dts_formats[i];
9251 
9252                 if (fmt == NULL)
9253                         continue;
9254 
9255                 kmem_free(fmt, strlen(fmt) + 1);
9256         }
9257 
9258         kmem_free(state->dts_formats, state->dts_nformats * sizeof (char *));
9259         state->dts_nformats = 0;
9260         state->dts_formats = NULL;
9261 }
9262 
9263 /*
9264  * DTrace Predicate Functions
9265  */
9266 static dtrace_predicate_t *
9267 dtrace_predicate_create(dtrace_difo_t *dp)
9268 {
9269         dtrace_predicate_t *pred;
9270 
9271         ASSERT(MUTEX_HELD(&dtrace_lock));
9272         ASSERT(dp->dtdo_refcnt != 0);
9273 
9274         pred = kmem_zalloc(sizeof (dtrace_predicate_t), KM_SLEEP);
9275         pred->dtp_difo = dp;
9276         pred->dtp_refcnt = 1;
9277 
9278         if (!dtrace_difo_cacheable(dp))
9279                 return (pred);
9280 
9281         if (dtrace_predcache_id == DTRACE_CACHEIDNONE) {
9282                 /*
9283                  * This is only theoretically possible -- we have had 2^32
9284                  * cacheable predicates on this machine.  We cannot allow any
9285                  * more predicates to become cacheable:  as unlikely as it is,
9286                  * there may be a thread caching a (now stale) predicate cache
9287                  * ID. (N.B.: the temptation is being successfully resisted to
9288                  * have this cmn_err() "Holy shit -- we executed this code!")
9289                  */
9290                 return (pred);
9291         }
9292 
9293         pred->dtp_cacheid = dtrace_predcache_id++;
9294 
9295         return (pred);
9296 }
9297 
9298 static void
9299 dtrace_predicate_hold(dtrace_predicate_t *pred)
9300 {
9301         ASSERT(MUTEX_HELD(&dtrace_lock));
9302         ASSERT(pred->dtp_difo != NULL && pred->dtp_difo->dtdo_refcnt != 0);
9303         ASSERT(pred->dtp_refcnt > 0);
9304 
9305         pred->dtp_refcnt++;
9306 }
9307 
9308 static void
9309 dtrace_predicate_release(dtrace_predicate_t *pred, dtrace_vstate_t *vstate)
9310 {
9311         dtrace_difo_t *dp = pred->dtp_difo;
9312 
9313         ASSERT(MUTEX_HELD(&dtrace_lock));
9314         ASSERT(dp != NULL && dp->dtdo_refcnt != 0);
9315         ASSERT(pred->dtp_refcnt > 0);
9316 
9317         if (--pred->dtp_refcnt == 0) {
9318                 dtrace_difo_release(pred->dtp_difo, vstate);
9319                 kmem_free(pred, sizeof (dtrace_predicate_t));
9320         }
9321 }
9322 
9323 /*
9324  * DTrace Action Description Functions
9325  */
9326 static dtrace_actdesc_t *
9327 dtrace_actdesc_create(dtrace_actkind_t kind, uint32_t ntuple,
9328     uint64_t uarg, uint64_t arg)
9329 {
9330         dtrace_actdesc_t *act;
9331 
9332         ASSERT(!DTRACEACT_ISPRINTFLIKE(kind) || (arg != NULL &&
9333             arg >= KERNELBASE) || (arg == NULL && kind == DTRACEACT_PRINTA));
9334 
9335         act = kmem_zalloc(sizeof (dtrace_actdesc_t), KM_SLEEP);
9336         act->dtad_kind = kind;
9337         act->dtad_ntuple = ntuple;
9338         act->dtad_uarg = uarg;
9339         act->dtad_arg = arg;
9340         act->dtad_refcnt = 1;
9341 
9342         return (act);
9343 }
9344 
9345 static void
9346 dtrace_actdesc_hold(dtrace_actdesc_t *act)
9347 {
9348         ASSERT(act->dtad_refcnt >= 1);
9349         act->dtad_refcnt++;
9350 }
9351 
9352 static void
9353 dtrace_actdesc_release(dtrace_actdesc_t *act, dtrace_vstate_t *vstate)
9354 {
9355         dtrace_actkind_t kind = act->dtad_kind;
9356         dtrace_difo_t *dp;
9357 
9358         ASSERT(act->dtad_refcnt >= 1);
9359 
9360         if (--act->dtad_refcnt != 0)
9361                 return;
9362 
9363         if ((dp = act->dtad_difo) != NULL)
9364                 dtrace_difo_release(dp, vstate);
9365 
9366         if (DTRACEACT_ISPRINTFLIKE(kind)) {
9367                 char *str = (char *)(uintptr_t)act->dtad_arg;
9368 
9369                 ASSERT((str != NULL && (uintptr_t)str >= KERNELBASE) ||
9370                     (str == NULL && act->dtad_kind == DTRACEACT_PRINTA));
9371 
9372                 if (str != NULL)
9373                         kmem_free(str, strlen(str) + 1);
9374         }
9375 
9376         kmem_free(act, sizeof (dtrace_actdesc_t));
9377 }
9378 
9379 /*
9380  * DTrace ECB Functions
9381  */
9382 static dtrace_ecb_t *
9383 dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe)
9384 {
9385         dtrace_ecb_t *ecb;
9386         dtrace_epid_t epid;
9387 
9388         ASSERT(MUTEX_HELD(&dtrace_lock));
9389 
9390         ecb = kmem_zalloc(sizeof (dtrace_ecb_t), KM_SLEEP);
9391         ecb->dte_predicate = NULL;
9392         ecb->dte_probe = probe;
9393 
9394         /*
9395          * The default size is the size of the default action: recording
9396          * the epid.
9397          */
9398         ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t);
9399         ecb->dte_alignment = sizeof (dtrace_epid_t);
9400 
9401         epid = state->dts_epid++;
9402 
9403         if (epid - 1 >= state->dts_necbs) {
9404                 dtrace_ecb_t **oecbs = state->dts_ecbs, **ecbs;
9405                 int necbs = state->dts_necbs << 1;
9406 
9407                 ASSERT(epid == state->dts_necbs + 1);
9408 
9409                 if (necbs == 0) {
9410                         ASSERT(oecbs == NULL);
9411                         necbs = 1;
9412                 }
9413 
9414                 ecbs = kmem_zalloc(necbs * sizeof (*ecbs), KM_SLEEP);
9415 
9416                 if (oecbs != NULL)
9417                         bcopy(oecbs, ecbs, state->dts_necbs * sizeof (*ecbs));
9418 
9419                 dtrace_membar_producer();
9420                 state->dts_ecbs = ecbs;
9421 
9422                 if (oecbs != NULL) {
9423                         /*
9424                          * If this state is active, we must dtrace_sync()
9425                          * before we can free the old dts_ecbs array:  we're
9426                          * coming in hot, and there may be active ring
9427                          * buffer processing (which indexes into the dts_ecbs
9428                          * array) on another CPU.
9429                          */
9430                         if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
9431                                 dtrace_sync();
9432 
9433                         kmem_free(oecbs, state->dts_necbs * sizeof (*ecbs));
9434                 }
9435 
9436                 dtrace_membar_producer();
9437                 state->dts_necbs = necbs;
9438         }
9439 
9440         ecb->dte_state = state;
9441 
9442         ASSERT(state->dts_ecbs[epid - 1] == NULL);
9443         dtrace_membar_producer();
9444         state->dts_ecbs[(ecb->dte_epid = epid) - 1] = ecb;
9445 
9446         return (ecb);
9447 }
9448 
9449 static int
9450 dtrace_ecb_enable(dtrace_ecb_t *ecb)
9451 {
9452         dtrace_probe_t *probe = ecb->dte_probe;
9453 
9454         ASSERT(MUTEX_HELD(&cpu_lock));
9455         ASSERT(MUTEX_HELD(&dtrace_lock));
9456         ASSERT(ecb->dte_next == NULL);
9457 
9458         if (probe == NULL) {
9459                 /*
9460                  * This is the NULL probe -- there's nothing to do.
9461                  */
9462                 return (0);
9463         }
9464 
9465         if (probe->dtpr_ecb == NULL) {
9466                 dtrace_provider_t *prov = probe->dtpr_provider;
9467 
9468                 /*
9469                  * We're the first ECB on this probe.
9470                  */
9471                 probe->dtpr_ecb = probe->dtpr_ecb_last = ecb;
9472 
9473                 if (ecb->dte_predicate != NULL)
9474                         probe->dtpr_predcache = ecb->dte_predicate->dtp_cacheid;
9475 
9476                 return (prov->dtpv_pops.dtps_enable(prov->dtpv_arg,
9477                     probe->dtpr_id, probe->dtpr_arg));
9478         } else {
9479                 /*
9480                  * This probe is already active.  Swing the last pointer to
9481                  * point to the new ECB, and issue a dtrace_sync() to assure
9482                  * that all CPUs have seen the change.
9483                  */
9484                 ASSERT(probe->dtpr_ecb_last != NULL);
9485                 probe->dtpr_ecb_last->dte_next = ecb;
9486                 probe->dtpr_ecb_last = ecb;
9487                 probe->dtpr_predcache = 0;
9488 
9489                 dtrace_sync();
9490                 return (0);
9491         }
9492 }
9493 
9494 static void
9495 dtrace_ecb_resize(dtrace_ecb_t *ecb)
9496 {
9497         uint32_t maxalign = sizeof (dtrace_epid_t);
9498         uint32_t align = sizeof (uint8_t), offs, diff;
9499         dtrace_action_t *act;
9500         int wastuple = 0;
9501         uint32_t aggbase = UINT32_MAX;
9502         dtrace_state_t *state = ecb->dte_state;
9503 
9504         /*
9505          * If we record anything, we always record the epid.  (And we always
9506          * record it first.)
9507          */
9508         offs = sizeof (dtrace_epid_t);
9509         ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t);
9510 
9511         for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
9512                 dtrace_recdesc_t *rec = &act->dta_rec;
9513 
9514                 if ((align = rec->dtrd_alignment) > maxalign)
9515                         maxalign = align;
9516 
9517                 if (!wastuple && act->dta_intuple) {
9518                         /*
9519                          * This is the first record in a tuple.  Align the
9520                          * offset to be at offset 4 in an 8-byte aligned
9521                          * block.
9522                          */
9523                         diff = offs + sizeof (dtrace_aggid_t);
9524 
9525                         if (diff = (diff & (sizeof (uint64_t) - 1)))
9526                                 offs += sizeof (uint64_t) - diff;
9527 
9528                         aggbase = offs - sizeof (dtrace_aggid_t);
9529                         ASSERT(!(aggbase & (sizeof (uint64_t) - 1)));
9530                 }
9531 
9532                 /*LINTED*/
9533                 if (rec->dtrd_size != 0 && (diff = (offs & (align - 1)))) {
9534                         /*
9535                          * The current offset is not properly aligned; align it.
9536                          */
9537                         offs += align - diff;
9538                 }
9539 
9540                 rec->dtrd_offset = offs;
9541 
9542                 if (offs + rec->dtrd_size > ecb->dte_needed) {
9543                         ecb->dte_needed = offs + rec->dtrd_size;
9544 
9545                         if (ecb->dte_needed > state->dts_needed)
9546                                 state->dts_needed = ecb->dte_needed;
9547                 }
9548 
9549                 if (DTRACEACT_ISAGG(act->dta_kind)) {
9550                         dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
9551                         dtrace_action_t *first = agg->dtag_first, *prev;
9552 
9553                         ASSERT(rec->dtrd_size != 0 && first != NULL);
9554                         ASSERT(wastuple);
9555                         ASSERT(aggbase != UINT32_MAX);
9556 
9557                         agg->dtag_base = aggbase;
9558 
9559                         while ((prev = first->dta_prev) != NULL &&
9560                             DTRACEACT_ISAGG(prev->dta_kind)) {
9561                                 agg = (dtrace_aggregation_t *)prev;
9562                                 first = agg->dtag_first;
9563                         }
9564 
9565                         if (prev != NULL) {
9566                                 offs = prev->dta_rec.dtrd_offset +
9567                                     prev->dta_rec.dtrd_size;
9568                         } else {
9569                                 offs = sizeof (dtrace_epid_t);
9570                         }
9571                         wastuple = 0;
9572                 } else {
9573                         if (!act->dta_intuple)
9574                                 ecb->dte_size = offs + rec->dtrd_size;
9575 
9576                         offs += rec->dtrd_size;
9577                 }
9578 
9579                 wastuple = act->dta_intuple;
9580         }
9581 
9582         if ((act = ecb->dte_action) != NULL &&
9583             !(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) &&
9584             ecb->dte_size == sizeof (dtrace_epid_t)) {
9585                 /*
9586                  * If the size is still sizeof (dtrace_epid_t), then all
9587                  * actions store no data; set the size to 0.
9588                  */
9589                 ecb->dte_alignment = maxalign;
9590                 ecb->dte_size = 0;
9591 
9592                 /*
9593                  * If the needed space is still sizeof (dtrace_epid_t), then
9594                  * all actions need no additional space; set the needed
9595                  * size to 0.
9596                  */
9597                 if (ecb->dte_needed == sizeof (dtrace_epid_t))
9598                         ecb->dte_needed = 0;
9599 
9600                 return;
9601         }
9602 
9603         /*
9604          * Set our alignment, and make sure that the dte_size and dte_needed
9605          * are aligned to the size of an EPID.
9606          */
9607         ecb->dte_alignment = maxalign;
9608         ecb->dte_size = (ecb->dte_size + (sizeof (dtrace_epid_t) - 1)) &
9609             ~(sizeof (dtrace_epid_t) - 1);
9610         ecb->dte_needed = (ecb->dte_needed + (sizeof (dtrace_epid_t) - 1)) &
9611             ~(sizeof (dtrace_epid_t) - 1);
9612         ASSERT(ecb->dte_size <= ecb->dte_needed);
9613 }
9614 
9615 static dtrace_action_t *
9616 dtrace_ecb_aggregation_create(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
9617 {
9618         dtrace_aggregation_t *agg;
9619         size_t size = sizeof (uint64_t);
9620         int ntuple = desc->dtad_ntuple;
9621         dtrace_action_t *act;
9622         dtrace_recdesc_t *frec;
9623         dtrace_aggid_t aggid;
9624         dtrace_state_t *state = ecb->dte_state;
9625 
9626         agg = kmem_zalloc(sizeof (dtrace_aggregation_t), KM_SLEEP);
9627         agg->dtag_ecb = ecb;
9628 
9629         ASSERT(DTRACEACT_ISAGG(desc->dtad_kind));
9630 
9631         switch (desc->dtad_kind) {
9632         case DTRACEAGG_MIN:
9633                 agg->dtag_initial = INT64_MAX;
9634                 agg->dtag_aggregate = dtrace_aggregate_min;
9635                 break;
9636 
9637         case DTRACEAGG_MAX:
9638                 agg->dtag_initial = INT64_MIN;
9639                 agg->dtag_aggregate = dtrace_aggregate_max;
9640                 break;
9641 
9642         case DTRACEAGG_COUNT:
9643                 agg->dtag_aggregate = dtrace_aggregate_count;
9644                 break;
9645 
9646         case DTRACEAGG_QUANTIZE:
9647                 agg->dtag_aggregate = dtrace_aggregate_quantize;
9648                 size = (((sizeof (uint64_t) * NBBY) - 1) * 2 + 1) *
9649                     sizeof (uint64_t);
9650                 break;
9651 
9652         case DTRACEAGG_LQUANTIZE: {
9653                 uint16_t step = DTRACE_LQUANTIZE_STEP(desc->dtad_arg);
9654                 uint16_t levels = DTRACE_LQUANTIZE_LEVELS(desc->dtad_arg);
9655 
9656                 agg->dtag_initial = desc->dtad_arg;
9657                 agg->dtag_aggregate = dtrace_aggregate_lquantize;
9658 
9659                 if (step == 0 || levels == 0)
9660                         goto err;
9661 
9662                 size = levels * sizeof (uint64_t) + 3 * sizeof (uint64_t);
9663                 break;
9664         }
9665 
9666         case DTRACEAGG_LLQUANTIZE: {
9667                 uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(desc->dtad_arg);
9668                 uint16_t low = DTRACE_LLQUANTIZE_LOW(desc->dtad_arg);
9669                 uint16_t high = DTRACE_LLQUANTIZE_HIGH(desc->dtad_arg);
9670                 uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(desc->dtad_arg);
9671                 int64_t v;
9672 
9673                 agg->dtag_initial = desc->dtad_arg;
9674                 agg->dtag_aggregate = dtrace_aggregate_llquantize;
9675 
9676                 if (factor < 2 || low >= high || nsteps < factor)
9677                         goto err;
9678 
9679                 /*
9680                  * Now check that the number of steps evenly divides a power
9681                  * of the factor.  (This assures both integer bucket size and
9682                  * linearity within each magnitude.)
9683                  */
9684                 for (v = factor; v < nsteps; v *= factor)
9685                         continue;
9686 
9687                 if ((v % nsteps) || (nsteps % factor))
9688                         goto err;
9689 
9690                 size = (dtrace_aggregate_llquantize_bucket(factor,
9691                     low, high, nsteps, INT64_MAX) + 2) * sizeof (uint64_t);
9692                 break;
9693         }
9694 
9695         case DTRACEAGG_AVG:
9696                 agg->dtag_aggregate = dtrace_aggregate_avg;
9697                 size = sizeof (uint64_t) * 2;
9698                 break;
9699 
9700         case DTRACEAGG_STDDEV:
9701                 agg->dtag_aggregate = dtrace_aggregate_stddev;
9702                 size = sizeof (uint64_t) * 4;
9703                 break;
9704 
9705         case DTRACEAGG_SUM:
9706                 agg->dtag_aggregate = dtrace_aggregate_sum;
9707                 break;
9708 
9709         default:
9710                 goto err;
9711         }
9712 
9713         agg->dtag_action.dta_rec.dtrd_size = size;
9714 
9715         if (ntuple == 0)
9716                 goto err;
9717 
9718         /*
9719          * We must make sure that we have enough actions for the n-tuple.
9720          */
9721         for (act = ecb->dte_action_last; act != NULL; act = act->dta_prev) {
9722                 if (DTRACEACT_ISAGG(act->dta_kind))
9723                         break;
9724 
9725                 if (--ntuple == 0) {
9726                         /*
9727                          * This is the action with which our n-tuple begins.
9728                          */
9729                         agg->dtag_first = act;
9730                         goto success;
9731                 }
9732         }
9733 
9734         /*
9735          * This n-tuple is short by ntuple elements.  Return failure.
9736          */
9737         ASSERT(ntuple != 0);
9738 err:
9739         kmem_free(agg, sizeof (dtrace_aggregation_t));
9740         return (NULL);
9741 
9742 success:
9743         /*
9744          * If the last action in the tuple has a size of zero, it's actually
9745          * an expression argument for the aggregating action.
9746          */
9747         ASSERT(ecb->dte_action_last != NULL);
9748         act = ecb->dte_action_last;
9749 
9750         if (act->dta_kind == DTRACEACT_DIFEXPR) {
9751                 ASSERT(act->dta_difo != NULL);
9752 
9753                 if (act->dta_difo->dtdo_rtype.dtdt_size == 0)
9754                         agg->dtag_hasarg = 1;
9755         }
9756 
9757         /*
9758          * We need to allocate an id for this aggregation.
9759          */
9760         aggid = (dtrace_aggid_t)(uintptr_t)vmem_alloc(state->dts_aggid_arena, 1,
9761             VM_BESTFIT | VM_SLEEP);
9762 
9763         if (aggid - 1 >= state->dts_naggregations) {
9764                 dtrace_aggregation_t **oaggs = state->dts_aggregations;
9765                 dtrace_aggregation_t **aggs;
9766                 int naggs = state->dts_naggregations << 1;
9767                 int onaggs = state->dts_naggregations;
9768 
9769                 ASSERT(aggid == state->dts_naggregations + 1);
9770 
9771                 if (naggs == 0) {
9772                         ASSERT(oaggs == NULL);
9773                         naggs = 1;
9774                 }
9775 
9776                 aggs = kmem_zalloc(naggs * sizeof (*aggs), KM_SLEEP);
9777 
9778                 if (oaggs != NULL) {
9779                         bcopy(oaggs, aggs, onaggs * sizeof (*aggs));
9780                         kmem_free(oaggs, onaggs * sizeof (*aggs));
9781                 }
9782 
9783                 state->dts_aggregations = aggs;
9784                 state->dts_naggregations = naggs;
9785         }
9786 
9787         ASSERT(state->dts_aggregations[aggid - 1] == NULL);
9788         state->dts_aggregations[(agg->dtag_id = aggid) - 1] = agg;
9789 
9790         frec = &agg->dtag_first->dta_rec;
9791         if (frec->dtrd_alignment < sizeof (dtrace_aggid_t))
9792                 frec->dtrd_alignment = sizeof (dtrace_aggid_t);
9793 
9794         for (act = agg->dtag_first; act != NULL; act = act->dta_next) {
9795                 ASSERT(!act->dta_intuple);
9796                 act->dta_intuple = 1;
9797         }
9798 
9799         return (&agg->dtag_action);
9800 }
9801 
9802 static void
9803 dtrace_ecb_aggregation_destroy(dtrace_ecb_t *ecb, dtrace_action_t *act)
9804 {
9805         dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
9806         dtrace_state_t *state = ecb->dte_state;
9807         dtrace_aggid_t aggid = agg->dtag_id;
9808 
9809         ASSERT(DTRACEACT_ISAGG(act->dta_kind));
9810         vmem_free(state->dts_aggid_arena, (void *)(uintptr_t)aggid, 1);
9811 
9812         ASSERT(state->dts_aggregations[aggid - 1] == agg);
9813         state->dts_aggregations[aggid - 1] = NULL;
9814 
9815         kmem_free(agg, sizeof (dtrace_aggregation_t));
9816 }
9817 
9818 static int
9819 dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
9820 {
9821         dtrace_action_t *action, *last;
9822         dtrace_difo_t *dp = desc->dtad_difo;
9823         uint32_t size = 0, align = sizeof (uint8_t), mask;
9824         uint16_t format = 0;
9825         dtrace_recdesc_t *rec;
9826         dtrace_state_t *state = ecb->dte_state;
9827         dtrace_optval_t *opt = state->dts_options, nframes, strsize;
9828         uint64_t arg = desc->dtad_arg;
9829 
9830         ASSERT(MUTEX_HELD(&dtrace_lock));
9831         ASSERT(ecb->dte_action == NULL || ecb->dte_action->dta_refcnt == 1);
9832 
9833         if (DTRACEACT_ISAGG(desc->dtad_kind)) {
9834                 /*
9835                  * If this is an aggregating action, there must be neither
9836                  * a speculate nor a commit on the action chain.
9837                  */
9838                 dtrace_action_t *act;
9839 
9840                 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
9841                         if (act->dta_kind == DTRACEACT_COMMIT)
9842                                 return (EINVAL);
9843 
9844                         if (act->dta_kind == DTRACEACT_SPECULATE)
9845                                 return (EINVAL);
9846                 }
9847 
9848                 action = dtrace_ecb_aggregation_create(ecb, desc);
9849 
9850                 if (action == NULL)
9851                         return (EINVAL);
9852         } else {
9853                 if (DTRACEACT_ISDESTRUCTIVE(desc->dtad_kind) ||
9854                     (desc->dtad_kind == DTRACEACT_DIFEXPR &&
9855                     dp != NULL && dp->dtdo_destructive)) {
9856                         state->dts_destructive = 1;
9857                 }
9858 
9859                 switch (desc->dtad_kind) {
9860                 case DTRACEACT_PRINTF:
9861                 case DTRACEACT_PRINTA:
9862                 case DTRACEACT_SYSTEM:
9863                 case DTRACEACT_FREOPEN:
9864                 case DTRACEACT_DIFEXPR:
9865                         /*
9866                          * We know that our arg is a string -- turn it into a
9867                          * format.
9868                          */
9869                         if (arg == NULL) {
9870                                 ASSERT(desc->dtad_kind == DTRACEACT_PRINTA ||
9871                                     desc->dtad_kind == DTRACEACT_DIFEXPR);
9872                                 format = 0;
9873                         } else {
9874                                 ASSERT(arg != NULL);
9875                                 ASSERT(arg > KERNELBASE);
9876                                 format = dtrace_format_add(state,
9877                                     (char *)(uintptr_t)arg);
9878                         }
9879 
9880                         /*FALLTHROUGH*/
9881                 case DTRACEACT_LIBACT:
9882                 case DTRACEACT_TRACEMEM:
9883                 case DTRACEACT_TRACEMEM_DYNSIZE:
9884                         if (dp == NULL)
9885                                 return (EINVAL);
9886 
9887                         if ((size = dp->dtdo_rtype.dtdt_size) != 0)
9888                                 break;
9889 
9890                         if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
9891                                 if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
9892                                         return (EINVAL);
9893 
9894                                 size = opt[DTRACEOPT_STRSIZE];
9895                         }
9896 
9897                         break;
9898 
9899                 case DTRACEACT_STACK:
9900                         if ((nframes = arg) == 0) {
9901                                 nframes = opt[DTRACEOPT_STACKFRAMES];
9902                                 ASSERT(nframes > 0);
9903                                 arg = nframes;
9904                         }
9905 
9906                         size = nframes * sizeof (pc_t);
9907                         break;
9908 
9909                 case DTRACEACT_JSTACK:
9910                         if ((strsize = DTRACE_USTACK_STRSIZE(arg)) == 0)
9911                                 strsize = opt[DTRACEOPT_JSTACKSTRSIZE];
9912 
9913                         if ((nframes = DTRACE_USTACK_NFRAMES(arg)) == 0)
9914                                 nframes = opt[DTRACEOPT_JSTACKFRAMES];
9915 
9916                         arg = DTRACE_USTACK_ARG(nframes, strsize);
9917 
9918                         /*FALLTHROUGH*/
9919                 case DTRACEACT_USTACK:
9920                         if (desc->dtad_kind != DTRACEACT_JSTACK &&
9921                             (nframes = DTRACE_USTACK_NFRAMES(arg)) == 0) {
9922                                 strsize = DTRACE_USTACK_STRSIZE(arg);
9923                                 nframes = opt[DTRACEOPT_USTACKFRAMES];
9924                                 ASSERT(nframes > 0);
9925                                 arg = DTRACE_USTACK_ARG(nframes, strsize);
9926                         }
9927 
9928                         /*
9929                          * Save a slot for the pid.
9930                          */
9931                         size = (nframes + 1) * sizeof (uint64_t);
9932                         size += DTRACE_USTACK_STRSIZE(arg);
9933                         size = P2ROUNDUP(size, (uint32_t)(sizeof (uintptr_t)));
9934 
9935                         break;
9936 
9937                 case DTRACEACT_SYM:
9938                 case DTRACEACT_MOD:
9939                         if (dp == NULL || ((size = dp->dtdo_rtype.dtdt_size) !=
9940                             sizeof (uint64_t)) ||
9941                             (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
9942                                 return (EINVAL);
9943                         break;
9944 
9945                 case DTRACEACT_USYM:
9946                 case DTRACEACT_UMOD:
9947                 case DTRACEACT_UADDR:
9948                         if (dp == NULL ||
9949                             (dp->dtdo_rtype.dtdt_size != sizeof (uint64_t)) ||
9950                             (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
9951                                 return (EINVAL);
9952 
9953                         /*
9954                          * We have a slot for the pid, plus a slot for the
9955                          * argument.  To keep things simple (aligned with
9956                          * bitness-neutral sizing), we store each as a 64-bit
9957                          * quantity.
9958                          */
9959                         size = 2 * sizeof (uint64_t);
9960                         break;
9961 
9962                 case DTRACEACT_STOP:
9963                 case DTRACEACT_BREAKPOINT:
9964                 case DTRACEACT_PANIC:
9965                         break;
9966 
9967                 case DTRACEACT_CHILL:
9968                 case DTRACEACT_DISCARD:
9969                 case DTRACEACT_RAISE:
9970                         if (dp == NULL)
9971                                 return (EINVAL);
9972                         break;
9973 
9974                 case DTRACEACT_EXIT:
9975                         if (dp == NULL ||
9976                             (size = dp->dtdo_rtype.dtdt_size) != sizeof (int) ||
9977                             (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
9978                                 return (EINVAL);
9979                         break;
9980 
9981                 case DTRACEACT_SPECULATE:
9982                         if (ecb->dte_size > sizeof (dtrace_epid_t))
9983                                 return (EINVAL);
9984 
9985                         if (dp == NULL)
9986                                 return (EINVAL);
9987 
9988                         state->dts_speculates = 1;
9989                         break;
9990 
9991                 case DTRACEACT_COMMIT: {
9992                         dtrace_action_t *act = ecb->dte_action;
9993 
9994                         for (; act != NULL; act = act->dta_next) {
9995                                 if (act->dta_kind == DTRACEACT_COMMIT)
9996                                         return (EINVAL);
9997                         }
9998 
9999                         if (dp == NULL)
10000                                 return (EINVAL);
10001                         break;
10002                 }
10003 
10004                 default:
10005                         return (EINVAL);
10006                 }
10007 
10008                 if (size != 0 || desc->dtad_kind == DTRACEACT_SPECULATE) {
10009                         /*
10010                          * If this is a data-storing action or a speculate,
10011                          * we must be sure that there isn't a commit on the
10012                          * action chain.
10013                          */
10014                         dtrace_action_t *act = ecb->dte_action;
10015 
10016                         for (; act != NULL; act = act->dta_next) {
10017                                 if (act->dta_kind == DTRACEACT_COMMIT)
10018                                         return (EINVAL);
10019                         }
10020                 }
10021 
10022                 action = kmem_zalloc(sizeof (dtrace_action_t), KM_SLEEP);
10023                 action->dta_rec.dtrd_size = size;
10024         }
10025 
10026         action->dta_refcnt = 1;
10027         rec = &action->dta_rec;
10028         size = rec->dtrd_size;
10029 
10030         for (mask = sizeof (uint64_t) - 1; size != 0 && mask > 0; mask >>= 1) {
10031                 if (!(size & mask)) {
10032                         align = mask + 1;
10033                         break;
10034                 }
10035         }
10036 
10037         action->dta_kind = desc->dtad_kind;
10038 
10039         if ((action->dta_difo = dp) != NULL)
10040                 dtrace_difo_hold(dp);
10041 
10042         rec->dtrd_action = action->dta_kind;
10043         rec->dtrd_arg = arg;
10044         rec->dtrd_uarg = desc->dtad_uarg;
10045         rec->dtrd_alignment = (uint16_t)align;
10046         rec->dtrd_format = format;
10047 
10048         if ((last = ecb->dte_action_last) != NULL) {
10049                 ASSERT(ecb->dte_action != NULL);
10050                 action->dta_prev = last;
10051                 last->dta_next = action;
10052         } else {
10053                 ASSERT(ecb->dte_action == NULL);
10054                 ecb->dte_action = action;
10055         }
10056 
10057         ecb->dte_action_last = action;
10058 
10059         return (0);
10060 }
10061 
10062 static void
10063 dtrace_ecb_action_remove(dtrace_ecb_t *ecb)
10064 {
10065         dtrace_action_t *act = ecb->dte_action, *next;
10066         dtrace_vstate_t *vstate = &ecb->dte_state->dts_vstate;
10067         dtrace_difo_t *dp;
10068         uint16_t format;
10069 
10070         if (act != NULL && act->dta_refcnt > 1) {
10071                 ASSERT(act->dta_next == NULL || act->dta_next->dta_refcnt == 1);
10072                 act->dta_refcnt--;
10073         } else {
10074                 for (; act != NULL; act = next) {
10075                         next = act->dta_next;
10076                         ASSERT(next != NULL || act == ecb->dte_action_last);
10077                         ASSERT(act->dta_refcnt == 1);
10078 
10079                         if ((format = act->dta_rec.dtrd_format) != 0)
10080                                 dtrace_format_remove(ecb->dte_state, format);
10081 
10082                         if ((dp = act->dta_difo) != NULL)
10083                                 dtrace_difo_release(dp, vstate);
10084 
10085                         if (DTRACEACT_ISAGG(act->dta_kind)) {
10086                                 dtrace_ecb_aggregation_destroy(ecb, act);
10087                         } else {
10088                                 kmem_free(act, sizeof (dtrace_action_t));
10089                         }
10090                 }
10091         }
10092 
10093         ecb->dte_action = NULL;
10094         ecb->dte_action_last = NULL;
10095         ecb->dte_size = sizeof (dtrace_epid_t);
10096 }
10097 
/*
 * Disable an ECB by unlinking it from its probe's singly-linked list of
 * ECBs.  After the unlink we dtrace_sync().  If the probe is then left
 * with no ECBs, its predicate cache ID is cleared and the provider's
 * dtps_disable() entry point is called for the probe, followed by a second
 * dtrace_sync().  If exactly one ECB remains, the probe takes on the
 * predicate cache ID of that ECB's predicate (if it has one).
 *
 * An ECB with no probe (the NULL probe) is left untouched.  The ECB itself
 * is not freed here.  The caller must hold dtrace_lock.
 */
static void
dtrace_ecb_disable(dtrace_ecb_t *ecb)
{
        /*
         * We disable the ECB by removing it from its probe.
         */
        dtrace_ecb_t *pecb, *prev = NULL;
        dtrace_probe_t *probe = ecb->dte_probe;

        ASSERT(MUTEX_HELD(&dtrace_lock));

        if (probe == NULL) {
                /*
                 * This is the NULL probe; there is nothing to disable.
                 */
                return;
        }

        /*
         * Walk the probe's ECB list to find our ECB, keeping track of its
         * predecessor (which remains NULL if we are at the head).
         */
        for (pecb = probe->dtpr_ecb; pecb != NULL; pecb = pecb->dte_next) {
                if (pecb == ecb)
                        break;
                prev = pecb;
        }

        ASSERT(pecb != NULL);

        /*
         * Unlink the ECB:  either the list head or the predecessor's next
         * pointer is made to skip over it.
         */
        if (prev == NULL) {
                probe->dtpr_ecb = ecb->dte_next;
        } else {
                prev->dte_next = ecb->dte_next;
        }

        /*
         * If we were the tail, the predecessor (possibly NULL) becomes the
         * new tail.
         */
        if (ecb == probe->dtpr_ecb_last) {
                ASSERT(ecb->dte_next == NULL);
                probe->dtpr_ecb_last = prev;
        }

        /*
         * The ECB has been disconnected from the probe; now sync to assure
         * that all CPUs have seen the change before returning.
         */
        dtrace_sync();

        if (probe->dtpr_ecb == NULL) {
                /*
                 * That was the last ECB on the probe; clear the predicate
                 * cache ID for the probe, disable it and sync one more time
                 * to assure that we'll never hit it again.
                 */
                dtrace_provider_t *prov = probe->dtpr_provider;

                ASSERT(ecb->dte_next == NULL);
                ASSERT(probe->dtpr_ecb_last == NULL);
                probe->dtpr_predcache = DTRACE_CACHEIDNONE;
                prov->dtpv_pops.dtps_disable(prov->dtpv_arg,
                    probe->dtpr_id, probe->dtpr_arg);
                dtrace_sync();
        } else {
                /*
                 * There is at least one ECB remaining on the probe.  If there
                 * is _exactly_ one, set the probe's predicate cache ID to be
                 * the predicate cache ID of the remaining ECB.
                 */
                ASSERT(probe->dtpr_ecb_last != NULL);
                ASSERT(probe->dtpr_predcache == DTRACE_CACHEIDNONE);

                if (probe->dtpr_ecb == probe->dtpr_ecb_last) {
                        dtrace_predicate_t *p = probe->dtpr_ecb->dte_predicate;

                        ASSERT(probe->dtpr_ecb->dte_next == NULL);

                        if (p != NULL)
                                probe->dtpr_predcache = p->dtp_cacheid;
                }

                /*
                 * Only now is the removed ECB's own next pointer cleared;
                 * the unlink above deliberately left it intact.
                 */
                ecb->dte_next = NULL;
        }
}
10176 
10177 static void
10178 dtrace_ecb_destroy(dtrace_ecb_t *ecb)
10179 {
10180         dtrace_state_t *state = ecb->dte_state;
10181         dtrace_vstate_t *vstate = &state->dts_vstate;
10182         dtrace_predicate_t *pred;
10183         dtrace_epid_t epid = ecb->dte_epid;
10184 
10185         ASSERT(MUTEX_HELD(&dtrace_lock));
10186         ASSERT(ecb->dte_next == NULL);
10187         ASSERT(ecb->dte_probe == NULL || ecb->dte_probe->dtpr_ecb != ecb);
10188 
10189         if ((pred = ecb->dte_predicate) != NULL)
10190                 dtrace_predicate_release(pred, vstate);
10191 
10192         dtrace_ecb_action_remove(ecb);
10193 
10194         ASSERT(state->dts_ecbs[epid - 1] == ecb);
10195         state->dts_ecbs[epid - 1] = NULL;
10196 
10197         kmem_free(ecb, sizeof (dtrace_ecb_t));
10198 }
10199 
10200 static dtrace_ecb_t *
10201 dtrace_ecb_create(dtrace_state_t *state, dtrace_probe_t *probe,
10202     dtrace_enabling_t *enab)
10203 {
10204         dtrace_ecb_t *ecb;
10205         dtrace_predicate_t *pred;
10206         dtrace_actdesc_t *act;
10207         dtrace_provider_t *prov;
10208         dtrace_ecbdesc_t *desc = enab->dten_current;
10209 
10210         ASSERT(MUTEX_HELD(&dtrace_lock));
10211         ASSERT(state != NULL);
10212 
10213         ecb = dtrace_ecb_add(state, probe);
10214         ecb->dte_uarg = desc->dted_uarg;
10215 
10216         if ((pred = desc->dted_pred.dtpdd_predicate) != NULL) {
10217                 dtrace_predicate_hold(pred);
10218                 ecb->dte_predicate = pred;
10219         }
10220 
10221         if (probe != NULL) {
10222                 /*
10223                  * If the provider shows more leg than the consumer is old
10224                  * enough to see, we need to enable the appropriate implicit
10225                  * predicate bits to prevent the ecb from activating at
10226                  * revealing times.
10227                  *
10228                  * Providers specifying DTRACE_PRIV_USER at register time
10229                  * are stating that they need the /proc-style privilege
10230                  * model to be enforced, and this is what DTRACE_COND_OWNER
10231                  * and DTRACE_COND_ZONEOWNER will then do at probe time.
10232                  */
10233                 prov = probe->dtpr_provider;
10234                 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLPROC) &&
10235                     (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
10236                         ecb->dte_cond |= DTRACE_COND_OWNER;
10237 
10238                 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLZONE) &&
10239                     (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
10240                         ecb->dte_cond |= DTRACE_COND_ZONEOWNER;
10241 
10242                 /*
10243                  * If the provider shows us kernel innards and the user
10244                  * is lacking sufficient privilege, enable the
10245                  * DTRACE_COND_USERMODE implicit predicate.
10246                  */
10247                 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) &&
10248                     (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_KERNEL))
10249                         ecb->dte_cond |= DTRACE_COND_USERMODE;
10250         }
10251 
10252         if (dtrace_ecb_create_cache != NULL) {
10253                 /*
10254                  * If we have a cached ecb, we'll use its action list instead
10255                  * of creating our own (saving both time and space).
10256                  */
10257                 dtrace_ecb_t *cached = dtrace_ecb_create_cache;
10258                 dtrace_action_t *act = cached->dte_action;
10259 
10260                 if (act != NULL) {
10261                         ASSERT(act->dta_refcnt > 0);
10262                         act->dta_refcnt++;
10263                         ecb->dte_action = act;
10264                         ecb->dte_action_last = cached->dte_action_last;
10265                         ecb->dte_needed = cached->dte_needed;
10266                         ecb->dte_size = cached->dte_size;
10267                         ecb->dte_alignment = cached->dte_alignment;
10268                 }
10269 
10270                 return (ecb);
10271         }
10272 
10273         for (act = desc->dted_action; act != NULL; act = act->dtad_next) {
10274                 if ((enab->dten_error = dtrace_ecb_action_add(ecb, act)) != 0) {
10275                         dtrace_ecb_destroy(ecb);
10276                         return (NULL);
10277                 }
10278         }
10279 
10280         dtrace_ecb_resize(ecb);
10281 
10282         return (dtrace_ecb_create_cache = ecb);
10283 }
10284 
10285 static int
10286 dtrace_ecb_create_enable(dtrace_probe_t *probe, void *arg)
10287 {
10288         dtrace_ecb_t *ecb;
10289         dtrace_enabling_t *enab = arg;
10290         dtrace_state_t *state = enab->dten_vstate->dtvs_state;
10291 
10292         ASSERT(state != NULL);
10293 
10294         if (probe != NULL && probe->dtpr_gen < enab->dten_probegen) {
10295                 /*
10296                  * This probe was created in a generation for which this
10297                  * enabling has previously created ECBs; we don't want to
10298                  * enable it again, so just kick out.
10299                  */
10300                 return (DTRACE_MATCH_NEXT);
10301         }
10302 
10303         if ((ecb = dtrace_ecb_create(state, probe, enab)) == NULL)
10304                 return (DTRACE_MATCH_DONE);
10305 
10306         if (dtrace_ecb_enable(ecb) < 0)
10307                 return (DTRACE_MATCH_FAIL);
10308 
10309         return (DTRACE_MATCH_NEXT);
10310 }
10311 
10312 static dtrace_ecb_t *
10313 dtrace_epid2ecb(dtrace_state_t *state, dtrace_epid_t id)
10314 {
10315         dtrace_ecb_t *ecb;
10316 
10317         ASSERT(MUTEX_HELD(&dtrace_lock));
10318 
10319         if (id == 0 || id > state->dts_necbs)
10320                 return (NULL);
10321 
10322         ASSERT(state->dts_necbs > 0 && state->dts_ecbs != NULL);
10323         ASSERT((ecb = state->dts_ecbs[id - 1]) == NULL || ecb->dte_epid == id);
10324 
10325         return (state->dts_ecbs[id - 1]);
10326 }
10327 
10328 static dtrace_aggregation_t *
10329 dtrace_aggid2agg(dtrace_state_t *state, dtrace_aggid_t id)
10330 {
10331         dtrace_aggregation_t *agg;
10332 
10333         ASSERT(MUTEX_HELD(&dtrace_lock));
10334 
10335         if (id == 0 || id > state->dts_naggregations)
10336                 return (NULL);
10337 
10338         ASSERT(state->dts_naggregations > 0 && state->dts_aggregations != NULL);
10339         ASSERT((agg = state->dts_aggregations[id - 1]) == NULL ||
10340             agg->dtag_id == id);
10341 
10342         return (state->dts_aggregations[id - 1]);
10343 }
10344 
10345 /*
10346  * DTrace Buffer Functions
10347  *
10348  * The following functions manipulate DTrace buffers.  Most of these functions
10349  * are called in the context of establishing or processing consumer state;
10350  * exceptions are explicitly noted.
10351  */
10352 
10353 /*
10354  * Note:  called from cross call context.  This function switches the two
10355  * buffers on a given CPU.  The atomicity of this operation is assured by
10356  * disabling interrupts while the actual switch takes place; the disabling of
10357  * interrupts serializes the execution with any execution of dtrace_probe() on
10358  * the same CPU.
10359  */
10360 static void
10361 dtrace_buffer_switch(dtrace_buffer_t *buf)
10362 {
10363         caddr_t tomax = buf->dtb_tomax;
10364         caddr_t xamot = buf->dtb_xamot;
10365         dtrace_icookie_t cookie;
10366         hrtime_t now = dtrace_gethrtime();
10367 
10368         ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
10369         ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));
10370 
10371         cookie = dtrace_interrupt_disable();
10372         buf->dtb_tomax = xamot;
10373         buf->dtb_xamot = tomax;
10374         buf->dtb_xamot_drops = buf->dtb_drops;
10375         buf->dtb_xamot_offset = buf->dtb_offset;
10376         buf->dtb_xamot_errors = buf->dtb_errors;
10377         buf->dtb_xamot_flags = buf->dtb_flags;
10378         buf->dtb_offset = 0;
10379         buf->dtb_drops = 0;
10380         buf->dtb_errors = 0;
10381         buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED);
10382         buf->dtb_interval = now - buf->dtb_switched;
10383         buf->dtb_switched = now;
10384         dtrace_interrupt_enable(cookie);
10385 }
10386 
10387 /*
10388  * Note:  called from cross call context.  This function activates a buffer
10389  * on a CPU.  As with dtrace_buffer_switch(), the atomicity of the operation
10390  * is guaranteed by the disabling of interrupts.
10391  */
10392 static void
10393 dtrace_buffer_activate(dtrace_state_t *state)
10394 {
10395         dtrace_buffer_t *buf;
10396         dtrace_icookie_t cookie = dtrace_interrupt_disable();
10397 
10398         buf = &state->dts_buffer[CPU->cpu_id];
10399 
10400         if (buf->dtb_tomax != NULL) {
10401                 /*
10402                  * We might like to assert that the buffer is marked inactive,
10403                  * but this isn't necessarily true:  the buffer for the CPU
10404                  * that processes the BEGIN probe has its buffer activated
10405                  * manually.  In this case, we take the (harmless) action
10406                  * re-clearing the bit INACTIVE bit.
10407                  */
10408                 buf->dtb_flags &= ~DTRACEBUF_INACTIVE;
10409         }
10410 
10411         dtrace_interrupt_enable(cookie);
10412 }
10413 
10414 static int
10415 dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t size, int flags,
10416     processorid_t cpu, int *factor)
10417 {
10418         cpu_t *cp;
10419         dtrace_buffer_t *buf;
10420         int allocated = 0, desired = 0;
10421 
10422         ASSERT(MUTEX_HELD(&cpu_lock));
10423         ASSERT(MUTEX_HELD(&dtrace_lock));
10424 
10425         *factor = 1;
10426 
10427         if (size > dtrace_nonroot_maxsize &&
10428             !PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE))
10429                 return (EFBIG);
10430 
10431         cp = cpu_list;
10432 
10433         do {
10434                 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
10435                         continue;
10436 
10437                 buf = &bufs[cp->cpu_id];
10438 
10439                 /*
10440                  * If there is already a buffer allocated for this CPU, it
10441                  * is only possible that this is a DR event.  In this case,
10442                  * the buffer size must match our specified size.
10443                  */
10444                 if (buf->dtb_tomax != NULL) {
10445                         ASSERT(buf->dtb_size == size);
10446                         continue;
10447                 }
10448 
10449                 ASSERT(buf->dtb_xamot == NULL);
10450 
10451                 if ((buf->dtb_tomax = kmem_zalloc(size,
10452                     KM_NOSLEEP | KM_NORMALPRI)) == NULL)
10453                         goto err;
10454 
10455                 buf->dtb_size = size;
10456                 buf->dtb_flags = flags;
10457                 buf->dtb_offset = 0;
10458                 buf->dtb_drops = 0;
10459 
10460                 if (flags & DTRACEBUF_NOSWITCH)
10461                         continue;
10462 
10463                 if ((buf->dtb_xamot = kmem_zalloc(size,
10464                     KM_NOSLEEP | KM_NORMALPRI)) == NULL)
10465                         goto err;
10466         } while ((cp = cp->cpu_next) != cpu_list);
10467 
10468         return (0);
10469 
10470 err:
10471         cp = cpu_list;
10472 
10473         do {
10474                 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
10475                         continue;
10476 
10477                 buf = &bufs[cp->cpu_id];
10478                 desired += 2;
10479 
10480                 if (buf->dtb_xamot != NULL) {
10481                         ASSERT(buf->dtb_tomax != NULL);
10482                         ASSERT(buf->dtb_size == size);
10483                         kmem_free(buf->dtb_xamot, size);
10484                         allocated++;
10485                 }
10486 
10487                 if (buf->dtb_tomax != NULL) {
10488                         ASSERT(buf->dtb_size == size);
10489                         kmem_free(buf->dtb_tomax, size);
10490                         allocated++;
10491                 }
10492 
10493                 buf->dtb_tomax = NULL;
10494                 buf->dtb_xamot = NULL;
10495                 buf->dtb_size = 0;
10496         } while ((cp = cp->cpu_next) != cpu_list);
10497 
10498         *factor = desired / (allocated > 0 ? allocated : 1);
10499 
10500         return (ENOMEM);
10501 }
10502 
10503 /*
10504  * Note:  called from probe context.  This function just increments the drop
10505  * count on a buffer.  It has been made a function to allow for the
10506  * possibility of understanding the source of mysterious drop counts.  (A
10507  * problem for which one may be particularly disappointed that DTrace cannot
10508  * be used to understand DTrace.)
10509  */
10510 static void
10511 dtrace_buffer_drop(dtrace_buffer_t *buf)
10512 {
10513         buf->dtb_drops++;
10514 }
10515 
10516 /*
10517  * Note:  called from probe context.  This function is called to reserve space
10518  * in a buffer.  If mstate is non-NULL, sets the scratch base and size in the
10519  * mstate.  Returns the new offset in the buffer, or a negative value if an
10520  * error has occurred.
10521  */
10522 static intptr_t
10523 dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
10524     dtrace_state_t *state, dtrace_mstate_t *mstate)
10525 {
10526         intptr_t offs = buf->dtb_offset, soffs;
10527         intptr_t woffs;
10528         caddr_t tomax;
10529         size_t total;
10530 
10531         if (buf->dtb_flags & DTRACEBUF_INACTIVE)
10532                 return (-1);
10533 
10534         if ((tomax = buf->dtb_tomax) == NULL) {
10535                 dtrace_buffer_drop(buf);
10536                 return (-1);
10537         }
10538 
10539         if (!(buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL))) {
10540                 while (offs & (align - 1)) {
10541                         /*
10542                          * Assert that our alignment is off by a number which
10543                          * is itself sizeof (uint32_t) aligned.
10544                          */
10545                         ASSERT(!((align - (offs & (align - 1))) &
10546                             (sizeof (uint32_t) - 1)));
10547                         DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
10548                         offs += sizeof (uint32_t);
10549                 }
10550 
10551                 if ((soffs = offs + needed) > buf->dtb_size) {
10552                         dtrace_buffer_drop(buf);
10553                         return (-1);
10554                 }
10555 
10556                 if (mstate == NULL)
10557                         return (offs);
10558 
10559                 mstate->dtms_scratch_base = (uintptr_t)tomax + soffs;
10560                 mstate->dtms_scratch_size = buf->dtb_size - soffs;
10561                 mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
10562 
10563                 return (offs);
10564         }
10565 
10566         if (buf->dtb_flags & DTRACEBUF_FILL) {
10567                 if (state->dts_activity != DTRACE_ACTIVITY_COOLDOWN &&
10568                     (buf->dtb_flags & DTRACEBUF_FULL))
10569                         return (-1);
10570                 goto out;
10571         }
10572 
10573         total = needed + (offs & (align - 1));
10574 
10575         /*
10576          * For a ring buffer, life is quite a bit more complicated.  Before
10577          * we can store any padding, we need to adjust our wrapping offset.
10578          * (If we've never before wrapped or we're not about to, no adjustment
10579          * is required.)
10580          */
10581         if ((buf->dtb_flags & DTRACEBUF_WRAPPED) ||
10582             offs + total > buf->dtb_size) {
10583                 woffs = buf->dtb_xamot_offset;
10584 
10585                 if (offs + total > buf->dtb_size) {
10586                         /*
10587                          * We can't fit in the end of the buffer.  First, a
10588                          * sanity check that we can fit in the buffer at all.
10589                          */
10590                         if (total > buf->dtb_size) {
10591                                 dtrace_buffer_drop(buf);
10592                                 return (-1);
10593                         }
10594 
10595                         /*
10596                          * We're going to be storing at the top of the buffer,
10597                          * so now we need to deal with the wrapped offset.  We
10598                          * only reset our wrapped offset to 0 if it is
10599                          * currently greater than the current offset.  If it
10600                          * is less than the current offset, it is because a
10601                          * previous allocation induced a wrap -- but the
10602                          * allocation didn't subsequently take the space due
10603                          * to an error or false predicate evaluation.  In this
10604                          * case, we'll just leave the wrapped offset alone: if
10605                          * the wrapped offset hasn't been advanced far enough
10606                          * for this allocation, it will be adjusted in the
10607                          * lower loop.
10608                          */
10609                         if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
10610                                 if (woffs >= offs)
10611                                         woffs = 0;
10612                         } else {
10613                                 woffs = 0;
10614                         }
10615 
10616                         /*
10617                          * Now we know that we're going to be storing to the
10618                          * top of the buffer and that there is room for us
10619                          * there.  We need to clear the buffer from the current
10620                          * offset to the end (there may be old gunk there).
10621                          */
10622                         while (offs < buf->dtb_size)
10623                                 tomax[offs++] = 0;
10624 
10625                         /*
10626                          * We need to set our offset to zero.  And because we
10627                          * are wrapping, we need to set the bit indicating as
10628                          * much.  We can also adjust our needed space back
10629                          * down to the space required by the ECB -- we know
10630                          * that the top of the buffer is aligned.
10631                          */
10632                         offs = 0;
10633                         total = needed;
10634                         buf->dtb_flags |= DTRACEBUF_WRAPPED;
10635                 } else {
10636                         /*
10637                          * There is room for us in the buffer, so we simply
10638                          * need to check the wrapped offset.
10639                          */
10640                         if (woffs < offs) {
10641                                 /*
10642                                  * The wrapped offset is less than the offset.
10643                                  * This can happen if we allocated buffer space
10644                                  * that induced a wrap, but then we didn't
10645                                  * subsequently take the space due to an error
10646                                  * or false predicate evaluation.  This is
10647                                  * okay; we know that _this_ allocation isn't
10648                                  * going to induce a wrap.  We still can't
10649                                  * reset the wrapped offset to be zero,
10650                                  * however: the space may have been trashed in
10651                                  * the previous failed probe attempt.  But at
10652                                  * least the wrapped offset doesn't need to
10653                                  * be adjusted at all...
10654                                  */
10655                                 goto out;
10656                         }
10657                 }
10658 
10659                 while (offs + total > woffs) {
10660                         dtrace_epid_t epid = *(uint32_t *)(tomax + woffs);
10661                         size_t size;
10662 
10663                         if (epid == DTRACE_EPIDNONE) {
10664                                 size = sizeof (uint32_t);
10665                         } else {
10666                                 ASSERT(epid <= state->dts_necbs);
10667                                 ASSERT(state->dts_ecbs[epid - 1] != NULL);
10668 
10669                                 size = state->dts_ecbs[epid - 1]->dte_size;
10670                         }
10671 
10672                         ASSERT(woffs + size <= buf->dtb_size);
10673                         ASSERT(size != 0);
10674 
10675                         if (woffs + size == buf->dtb_size) {
10676                                 /*
10677                                  * We've reached the end of the buffer; we want
10678                                  * to set the wrapped offset to 0 and break
10679                                  * out.  However, if the offs is 0, then we're
10680                                  * in a strange edge-condition:  the amount of
10681                                  * space that we want to reserve plus the size
10682                                  * of the record that we're overwriting is
10683                                  * greater than the size of the buffer.  This
10684                                  * is problematic because if we reserve the
10685                                  * space but subsequently don't consume it (due
10686                                  * to a failed predicate or error) the wrapped
10687                                  * offset will be 0 -- yet the EPID at offset 0
10688                                  * will not be committed.  This situation is
10689                                  * relatively easy to deal with:  if we're in
10690                                  * this case, the buffer is indistinguishable
10691                                  * from one that hasn't wrapped; we need only
10692                                  * finish the job by clearing the wrapped bit,
10693                                  * explicitly setting the offset to be 0, and
10694                                  * zero'ing out the old data in the buffer.
10695                                  */
10696                                 if (offs == 0) {
10697                                         buf->dtb_flags &= ~DTRACEBUF_WRAPPED;
10698                                         buf->dtb_offset = 0;
10699                                         woffs = total;
10700 
10701                                         while (woffs < buf->dtb_size)
10702                                                 tomax[woffs++] = 0;
10703                                 }
10704 
10705                                 woffs = 0;
10706                                 break;
10707                         }
10708 
10709                         woffs += size;
10710                 }
10711 
10712                 /*
10713                  * We have a wrapped offset.  It may be that the wrapped offset
10714                  * has become zero -- that's okay.
10715                  */
10716                 buf->dtb_xamot_offset = woffs;
10717         }
10718 
10719 out:
10720         /*
10721          * Now we can plow the buffer with any necessary padding.
10722          */
10723         while (offs & (align - 1)) {
10724                 /*
10725                  * Assert that our alignment is off by a number which
10726                  * is itself sizeof (uint32_t) aligned.
10727                  */
10728                 ASSERT(!((align - (offs & (align - 1))) &
10729                     (sizeof (uint32_t) - 1)));
10730                 DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
10731                 offs += sizeof (uint32_t);
10732         }
10733 
10734         if (buf->dtb_flags & DTRACEBUF_FILL) {
10735                 if (offs + needed > buf->dtb_size - state->dts_reserve) {
10736                         buf->dtb_flags |= DTRACEBUF_FULL;
10737                         return (-1);
10738                 }
10739         }
10740 
10741         if (mstate == NULL)
10742                 return (offs);
10743 
10744         /*
10745          * For ring buffers and fill buffers, the scratch space is always
10746          * the inactive buffer.
10747          */
10748         mstate->dtms_scratch_base = (uintptr_t)buf->dtb_xamot;
10749         mstate->dtms_scratch_size = buf->dtb_size;
10750         mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
10751 
10752         return (offs);
10753 }
10754 
10755 static void
10756 dtrace_buffer_polish(dtrace_buffer_t *buf)
10757 {
10758         ASSERT(buf->dtb_flags & DTRACEBUF_RING);
10759         ASSERT(MUTEX_HELD(&dtrace_lock));
10760 
10761         if (!(buf->dtb_flags & DTRACEBUF_WRAPPED))
10762                 return;
10763 
10764         /*
10765          * We need to polish the ring buffer.  There are three cases:
10766          *
10767          * - The first (and presumably most common) is that there is no gap
10768          *   between the buffer offset and the wrapped offset.  In this case,
10769          *   there is nothing in the buffer that isn't valid data; we can
10770          *   mark the buffer as polished and return.
10771          *
10772          * - The second (less common than the first but still more common
10773          *   than the third) is that there is a gap between the buffer offset
10774          *   and the wrapped offset, and the wrapped offset is larger than the
10775          *   buffer offset.  This can happen because of an alignment issue, or
10776          *   can happen because of a call to dtrace_buffer_reserve() that
10777          *   didn't subsequently consume the buffer space.  In this case,
10778          *   we need to zero the data from the buffer offset to the wrapped
10779          *   offset.
10780          *
10781          * - The third (and least common) is that there is a gap between the
10782          *   buffer offset and the wrapped offset, but the wrapped offset is
10783          *   _less_ than the buffer offset.  This can only happen because a
10784          *   call to dtrace_buffer_reserve() induced a wrap, but the space
10785          *   was not subsequently consumed.  In this case, we need to zero the
10786          *   space from the offset to the end of the buffer _and_ from the
10787          *   top of the buffer to the wrapped offset.
10788          */
10789         if (buf->dtb_offset < buf->dtb_xamot_offset) {
10790                 bzero(buf->dtb_tomax + buf->dtb_offset,
10791                     buf->dtb_xamot_offset - buf->dtb_offset);
10792         }
10793 
10794         if (buf->dtb_offset > buf->dtb_xamot_offset) {
10795                 bzero(buf->dtb_tomax + buf->dtb_offset,
10796                     buf->dtb_size - buf->dtb_offset);
10797                 bzero(buf->dtb_tomax, buf->dtb_xamot_offset);
10798         }
10799 }
10800 
10801 /*
10802  * This routine determines if data generated at the specified time has likely
10803  * been entirely consumed at user-level.  This routine is called to determine
10804  * if an ECB on a defunct probe (but for an active enabling) can be safely
10805  * disabled and destroyed.
10806  */
10807 static int
10808 dtrace_buffer_consumed(dtrace_buffer_t *bufs, hrtime_t when)
10809 {
10810         int i;
10811 
10812         for (i = 0; i < NCPU; i++) {
10813                 dtrace_buffer_t *buf = &bufs[i];
10814 
10815                 if (buf->dtb_size == 0)
10816                         continue;
10817 
10818                 if (buf->dtb_flags & DTRACEBUF_RING)
10819                         return (0);
10820 
10821                 if (!buf->dtb_switched && buf->dtb_offset != 0)
10822                         return (0);
10823 
10824                 if (buf->dtb_switched - buf->dtb_interval < when)
10825                         return (0);
10826         }
10827 
10828         return (1);
10829 }
10830 
10831 static void
10832 dtrace_buffer_free(dtrace_buffer_t *bufs)
10833 {
10834         int i;
10835 
10836         for (i = 0; i < NCPU; i++) {
10837                 dtrace_buffer_t *buf = &bufs[i];
10838 
10839                 if (buf->dtb_tomax == NULL) {
10840                         ASSERT(buf->dtb_xamot == NULL);
10841                         ASSERT(buf->dtb_size == 0);
10842                         continue;
10843                 }
10844 
10845                 if (buf->dtb_xamot != NULL) {
10846                         ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
10847                         kmem_free(buf->dtb_xamot, buf->dtb_size);
10848                 }
10849 
10850                 kmem_free(buf->dtb_tomax, buf->dtb_size);
10851                 buf->dtb_size = 0;
10852                 buf->dtb_tomax = NULL;
10853                 buf->dtb_xamot = NULL;
10854         }
10855 }
10856 
10857 /*
10858  * DTrace Enabling Functions
10859  */
10860 static dtrace_enabling_t *
10861 dtrace_enabling_create(dtrace_vstate_t *vstate)
10862 {
10863         dtrace_enabling_t *enab;
10864 
10865         enab = kmem_zalloc(sizeof (dtrace_enabling_t), KM_SLEEP);
10866         enab->dten_vstate = vstate;
10867 
10868         return (enab);
10869 }
10870 
10871 static void
10872 dtrace_enabling_add(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb)
10873 {
10874         dtrace_ecbdesc_t **ndesc;
10875         size_t osize, nsize;
10876 
10877         /*
10878          * We can't add to enablings after we've enabled them, or after we've
10879          * retained them.
10880          */
10881         ASSERT(enab->dten_probegen == 0);
10882         ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
10883 
10884         if (enab->dten_ndesc < enab->dten_maxdesc) {
10885                 enab->dten_desc[enab->dten_ndesc++] = ecb;
10886                 return;
10887         }
10888 
10889         osize = enab->dten_maxdesc * sizeof (dtrace_enabling_t *);
10890 
10891         if (enab->dten_maxdesc == 0) {
10892                 enab->dten_maxdesc = 1;
10893         } else {
10894                 enab->dten_maxdesc <<= 1;
10895         }
10896 
10897         ASSERT(enab->dten_ndesc < enab->dten_maxdesc);
10898 
10899         nsize = enab->dten_maxdesc * sizeof (dtrace_enabling_t *);
10900         ndesc = kmem_zalloc(nsize, KM_SLEEP);
10901         bcopy(enab->dten_desc, ndesc, osize);
10902         kmem_free(enab->dten_desc, osize);
10903 
10904         enab->dten_desc = ndesc;
10905         enab->dten_desc[enab->dten_ndesc++] = ecb;
10906 }
10907 
10908 static void
10909 dtrace_enabling_addlike(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb,
10910     dtrace_probedesc_t *pd)
10911 {
10912         dtrace_ecbdesc_t *new;
10913         dtrace_predicate_t *pred;
10914         dtrace_actdesc_t *act;
10915 
10916         /*
10917          * We're going to create a new ECB description that matches the
10918          * specified ECB in every way, but has the specified probe description.
10919          */
10920         new = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
10921 
10922         if ((pred = ecb->dted_pred.dtpdd_predicate) != NULL)
10923                 dtrace_predicate_hold(pred);
10924 
10925         for (act = ecb->dted_action; act != NULL; act = act->dtad_next)
10926                 dtrace_actdesc_hold(act);
10927 
10928         new->dted_action = ecb->dted_action;
10929         new->dted_pred = ecb->dted_pred;
10930         new->dted_probe = *pd;
10931         new->dted_uarg = ecb->dted_uarg;
10932 
10933         dtrace_enabling_add(enab, new);
10934 }
10935 
10936 static void
10937 dtrace_enabling_dump(dtrace_enabling_t *enab)
10938 {
10939         int i;
10940 
10941         for (i = 0; i < enab->dten_ndesc; i++) {
10942                 dtrace_probedesc_t *desc = &enab->dten_desc[i]->dted_probe;
10943 
10944                 cmn_err(CE_NOTE, "enabling probe %d (%s:%s:%s:%s)", i,
10945                     desc->dtpd_provider, desc->dtpd_mod,
10946                     desc->dtpd_func, desc->dtpd_name);
10947         }
10948 }
10949 
10950 static void
10951 dtrace_enabling_destroy(dtrace_enabling_t *enab)
10952 {
10953         int i;
10954         dtrace_ecbdesc_t *ep;
10955         dtrace_vstate_t *vstate = enab->dten_vstate;
10956 
10957         ASSERT(MUTEX_HELD(&dtrace_lock));
10958 
10959         for (i = 0; i < enab->dten_ndesc; i++) {
10960                 dtrace_actdesc_t *act, *next;
10961                 dtrace_predicate_t *pred;
10962 
10963                 ep = enab->dten_desc[i];
10964 
10965                 if ((pred = ep->dted_pred.dtpdd_predicate) != NULL)
10966                         dtrace_predicate_release(pred, vstate);
10967 
10968                 for (act = ep->dted_action; act != NULL; act = next) {
10969                         next = act->dtad_next;
10970                         dtrace_actdesc_release(act, vstate);
10971                 }
10972 
10973                 kmem_free(ep, sizeof (dtrace_ecbdesc_t));
10974         }
10975 
10976         kmem_free(enab->dten_desc,
10977             enab->dten_maxdesc * sizeof (dtrace_enabling_t *));
10978 
10979         /*
10980          * If this was a retained enabling, decrement the dts_nretained count
10981          * and take it off of the dtrace_retained list.
10982          */
10983         if (enab->dten_prev != NULL || enab->dten_next != NULL ||
10984             dtrace_retained == enab) {
10985                 ASSERT(enab->dten_vstate->dtvs_state != NULL);
10986                 ASSERT(enab->dten_vstate->dtvs_state->dts_nretained > 0);
10987                 enab->dten_vstate->dtvs_state->dts_nretained--;
10988                 dtrace_retained_gen++;
10989         }
10990 
10991         if (enab->dten_prev == NULL) {
10992                 if (dtrace_retained == enab) {
10993                         dtrace_retained = enab->dten_next;
10994 
10995                         if (dtrace_retained != NULL)
10996                                 dtrace_retained->dten_prev = NULL;
10997                 }
10998         } else {
10999                 ASSERT(enab != dtrace_retained);
11000                 ASSERT(dtrace_retained != NULL);
11001                 enab->dten_prev->dten_next = enab->dten_next;
11002         }
11003 
11004         if (enab->dten_next != NULL) {
11005                 ASSERT(dtrace_retained != NULL);
11006                 enab->dten_next->dten_prev = enab->dten_prev;
11007         }
11008 
11009         kmem_free(enab, sizeof (dtrace_enabling_t));
11010 }
11011 
11012 static int
11013 dtrace_enabling_retain(dtrace_enabling_t *enab)
11014 {
11015         dtrace_state_t *state;
11016 
11017         ASSERT(MUTEX_HELD(&dtrace_lock));
11018         ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
11019         ASSERT(enab->dten_vstate != NULL);
11020 
11021         state = enab->dten_vstate->dtvs_state;
11022         ASSERT(state != NULL);
11023 
11024         /*
11025          * We only allow each state to retain dtrace_retain_max enablings.
11026          */
11027         if (state->dts_nretained >= dtrace_retain_max)
11028                 return (ENOSPC);
11029 
11030         state->dts_nretained++;
11031         dtrace_retained_gen++;
11032 
11033         if (dtrace_retained == NULL) {
11034                 dtrace_retained = enab;
11035                 return (0);
11036         }
11037 
11038         enab->dten_next = dtrace_retained;
11039         dtrace_retained->dten_prev = enab;
11040         dtrace_retained = enab;
11041 
11042         return (0);
11043 }
11044 
11045 static int
11046 dtrace_enabling_replicate(dtrace_state_t *state, dtrace_probedesc_t *match,
11047     dtrace_probedesc_t *create)
11048 {
11049         dtrace_enabling_t *new, *enab;
11050         int found = 0, err = ENOENT;
11051 
11052         ASSERT(MUTEX_HELD(&dtrace_lock));
11053         ASSERT(strlen(match->dtpd_provider) < DTRACE_PROVNAMELEN);
11054         ASSERT(strlen(match->dtpd_mod) < DTRACE_MODNAMELEN);
11055         ASSERT(strlen(match->dtpd_func) < DTRACE_FUNCNAMELEN);
11056         ASSERT(strlen(match->dtpd_name) < DTRACE_NAMELEN);
11057 
11058         new = dtrace_enabling_create(&state->dts_vstate);
11059 
11060         /*
11061          * Iterate over all retained enablings, looking for enablings that
11062          * match the specified state.
11063          */
11064         for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
11065                 int i;
11066 
11067                 /*
11068                  * dtvs_state can only be NULL for helper enablings -- and
11069                  * helper enablings can't be retained.
11070                  */
11071                 ASSERT(enab->dten_vstate->dtvs_state != NULL);
11072 
11073                 if (enab->dten_vstate->dtvs_state != state)
11074                         continue;
11075 
11076                 /*
11077                  * Now iterate over each probe description; we're looking for
11078                  * an exact match to the specified probe description.
11079                  */
11080                 for (i = 0; i < enab->dten_ndesc; i++) {
11081                         dtrace_ecbdesc_t *ep = enab->dten_desc[i];
11082                         dtrace_probedesc_t *pd = &ep->dted_probe;
11083 
11084                         if (strcmp(pd->dtpd_provider, match->dtpd_provider))
11085                                 continue;
11086 
11087                         if (strcmp(pd->dtpd_mod, match->dtpd_mod))
11088                                 continue;
11089 
11090                         if (strcmp(pd->dtpd_func, match->dtpd_func))
11091                                 continue;
11092 
11093                         if (strcmp(pd->dtpd_name, match->dtpd_name))
11094                                 continue;
11095 
11096                         /*
11097                          * We have a winning probe!  Add it to our growing
11098                          * enabling.
11099                          */
11100                         found = 1;
11101                         dtrace_enabling_addlike(new, ep, create);
11102                 }
11103         }
11104 
11105         if (!found || (err = dtrace_enabling_retain(new)) != 0) {
11106                 dtrace_enabling_destroy(new);
11107                 return (err);
11108         }
11109 
11110         return (0);
11111 }
11112 
11113 static void
11114 dtrace_enabling_retract(dtrace_state_t *state)
11115 {
11116         dtrace_enabling_t *enab, *next;
11117 
11118         ASSERT(MUTEX_HELD(&dtrace_lock));
11119 
11120         /*
11121          * Iterate over all retained enablings, destroy the enablings retained
11122          * for the specified state.
11123          */
11124         for (enab = dtrace_retained; enab != NULL; enab = next) {
11125                 next = enab->dten_next;
11126 
11127                 /*
11128                  * dtvs_state can only be NULL for helper enablings -- and
11129                  * helper enablings can't be retained.
11130                  */
11131                 ASSERT(enab->dten_vstate->dtvs_state != NULL);
11132 
11133                 if (enab->dten_vstate->dtvs_state == state) {
11134                         ASSERT(state->dts_nretained > 0);
11135                         dtrace_enabling_destroy(enab);
11136                 }
11137         }
11138 
11139         ASSERT(state->dts_nretained == 0);
11140 }
11141 
11142 static int
11143 dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched)
11144 {
11145         int i = 0;
11146         int total_matched = 0, matched = 0;
11147 
11148         ASSERT(MUTEX_HELD(&cpu_lock));
11149         ASSERT(MUTEX_HELD(&dtrace_lock));
11150 
11151         for (i = 0; i < enab->dten_ndesc; i++) {
11152                 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
11153 
11154                 enab->dten_current = ep;
11155                 enab->dten_error = 0;
11156 
11157                 /*
11158                  * If a provider failed to enable a probe then get out and
11159                  * let the consumer know we failed.
11160                  */
11161                 if ((matched = dtrace_probe_enable(&ep->dted_probe, enab)) < 0)
11162                         return (EBUSY);
11163 
11164                 total_matched += matched;
11165 
11166                 if (enab->dten_error != 0) {
11167                         /*
11168                          * If we get an error half-way through enabling the
11169                          * probes, we kick out -- perhaps with some number of
11170                          * them enabled.  Leaving enabled probes enabled may
11171                          * be slightly confusing for user-level, but we expect
11172                          * that no one will attempt to actually drive on in
11173                          * the face of such errors.  If this is an anonymous
11174                          * enabling (indicated with a NULL nmatched pointer),
11175                          * we cmn_err() a message.  We aren't expecting to
11176                          * get such an error -- such as it can exist at all,
11177                          * it would be a result of corrupted DOF in the driver
11178                          * properties.
11179                          */
11180                         if (nmatched == NULL) {
11181                                 cmn_err(CE_WARN, "dtrace_enabling_match() "
11182                                     "error on %p: %d", (void *)ep,
11183                                     enab->dten_error);
11184                         }
11185 
11186                         return (enab->dten_error);
11187                 }
11188         }
11189 
11190         enab->dten_probegen = dtrace_probegen;
11191         if (nmatched != NULL)
11192                 *nmatched = total_matched;
11193 
11194         return (0);
11195 }
11196 
11197 static void
11198 dtrace_enabling_matchall(void)
11199 {
11200         dtrace_enabling_t *enab;
11201 
11202         mutex_enter(&cpu_lock);
11203         mutex_enter(&dtrace_lock);
11204 
11205         /*
11206          * Iterate over all retained enablings to see if any probes match
11207          * against them.  We only perform this operation on enablings for which
11208          * we have sufficient permissions by virtue of being in the global zone
11209          * or in the same zone as the DTrace client.  Because we can be called
11210          * after dtrace_detach() has been called, we cannot assert that there
11211          * are retained enablings.  We can safely load from dtrace_retained,
11212          * however:  the taskq_destroy() at the end of dtrace_detach() will
11213          * block pending our completion.
11214          */
11215         for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
11216                 dtrace_cred_t *dcr = &enab->dten_vstate->dtvs_state->dts_cred;
11217                 cred_t *cr = dcr->dcr_cred;
11218                 zoneid_t zone = cr != NULL ? crgetzoneid(cr) : 0;
11219 
11220                 if ((dcr->dcr_visible & DTRACE_CRV_ALLZONE) || (cr != NULL &&
11221                     (zone == GLOBAL_ZONEID || getzoneid() == zone)))
11222                         (void) dtrace_enabling_match(enab, NULL);
11223         }
11224 
11225         mutex_exit(&dtrace_lock);
11226         mutex_exit(&cpu_lock);
11227 }
11228 
11229 /*
11230  * If an enabling is to be enabled without having matched probes (that is, if
11231  * dtrace_state_go() is to be called on the underlying dtrace_state_t), the
11232  * enabling must be _primed_ by creating an ECB for every ECB description.
11233  * This must be done to assure that we know the number of speculations, the
11234  * number of aggregations, the minimum buffer size needed, etc. before we
11235  * transition out of DTRACE_ACTIVITY_INACTIVE.  To do this without actually
11236  * enabling any probes, we create ECBs for every ECB decription, but with a
11237  * NULL probe -- which is exactly what this function does.
11238  */
11239 static void
11240 dtrace_enabling_prime(dtrace_state_t *state)
11241 {
11242         dtrace_enabling_t *enab;
11243         int i;
11244 
11245         for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
11246                 ASSERT(enab->dten_vstate->dtvs_state != NULL);
11247 
11248                 if (enab->dten_vstate->dtvs_state != state)
11249                         continue;
11250 
11251                 /*
11252                  * We don't want to prime an enabling more than once, lest
11253                  * we allow a malicious user to induce resource exhaustion.
11254                  * (The ECBs that result from priming an enabling aren't
11255                  * leaked -- but they also aren't deallocated until the
11256                  * consumer state is destroyed.)
11257                  */
11258                 if (enab->dten_primed)
11259                         continue;
11260 
11261                 for (i = 0; i < enab->dten_ndesc; i++) {
11262                         enab->dten_current = enab->dten_desc[i];
11263                         (void) dtrace_probe_enable(NULL, enab);
11264                 }
11265 
11266                 enab->dten_primed = 1;
11267         }
11268 }
11269 
11270 /*
11271  * Called to indicate that probes should be provided due to retained
11272  * enablings.  This is implemented in terms of dtrace_probe_provide(), but it
11273  * must take an initial lap through the enabling calling the dtps_provide()
11274  * entry point explicitly to allow for autocreated probes.
11275  */
11276 static void
11277 dtrace_enabling_provide(dtrace_provider_t *prv)
11278 {
11279         int i, all = 0;
11280         dtrace_probedesc_t desc;
11281         dtrace_genid_t gen;
11282 
11283         ASSERT(MUTEX_HELD(&dtrace_lock));
11284         ASSERT(MUTEX_HELD(&dtrace_provider_lock));
11285 
11286         if (prv == NULL) {
11287                 all = 1;
11288                 prv = dtrace_provider;
11289         }
11290 
11291         do {
11292                 dtrace_enabling_t *enab;
11293                 void *parg = prv->dtpv_arg;
11294 
11295 retry:
11296                 gen = dtrace_retained_gen;
11297                 for (enab = dtrace_retained; enab != NULL;
11298                     enab = enab->dten_next) {
11299                         for (i = 0; i < enab->dten_ndesc; i++) {
11300                                 desc = enab->dten_desc[i]->dted_probe;
11301                                 mutex_exit(&dtrace_lock);
11302                                 prv->dtpv_pops.dtps_provide(parg, &desc);
11303                                 mutex_enter(&dtrace_lock);
11304                                 /*
11305                                  * Process the retained enablings again if
11306                                  * they have changed while we weren't holding
11307                                  * dtrace_lock.
11308                                  */
11309                                 if (gen != dtrace_retained_gen)
11310                                         goto retry;
11311                         }
11312                 }
11313         } while (all && (prv = prv->dtpv_next) != NULL);
11314 
11315         mutex_exit(&dtrace_lock);
11316         dtrace_probe_provide(NULL, all ? NULL : prv);
11317         mutex_enter(&dtrace_lock);
11318 }
11319 
11320 /*
11321  * Called to reap ECBs that are attached to probes from defunct providers.
11322  */
11323 static void
11324 dtrace_enabling_reap(void)
11325 {
11326         dtrace_provider_t *prov;
11327         dtrace_probe_t *probe;
11328         dtrace_ecb_t *ecb;
11329         hrtime_t when;
11330         int i;
11331 
11332         mutex_enter(&cpu_lock);
11333         mutex_enter(&dtrace_lock);
11334 
11335         for (i = 0; i < dtrace_nprobes; i++) {
11336                 if ((probe = dtrace_probes[i]) == NULL)
11337                         continue;
11338 
11339                 if (probe->dtpr_ecb == NULL)
11340                         continue;
11341 
11342                 prov = probe->dtpr_provider;
11343 
11344                 if ((when = prov->dtpv_defunct) == 0)
11345                         continue;
11346 
11347                 /*
11348                  * We have ECBs on a defunct provider:  we want to reap these
11349                  * ECBs to allow the provider to unregister.  The destruction
11350                  * of these ECBs must be done carefully:  if we destroy the ECB
11351                  * and the consumer later wishes to consume an EPID that
11352                  * corresponds to the destroyed ECB (and if the EPID metadata
11353                  * has not been previously consumed), the consumer will abort
11354                  * processing on the unknown EPID.  To reduce (but not, sadly,
11355                  * eliminate) the possibility of this, we will only destroy an
11356                  * ECB for a defunct provider if, for the state that
11357                  * corresponds to the ECB:
11358                  *
11359                  *  (a) There is no speculative tracing (which can effectively
11360                  *      cache an EPID for an arbitrary amount of time).
11361                  *
11362                  *  (b) The principal buffers have been switched twice since the
11363                  *      provider became defunct.
11364                  *
11365                  *  (c) The aggregation buffers are of zero size or have been
11366                  *      switched twice since the provider became defunct.
11367                  *
11368                  * We use dts_speculates to determine (a) and call a function
11369                  * (dtrace_buffer_consumed()) to determine (b) and (c).  Note
11370                  * that as soon as we've been unable to destroy one of the ECBs
11371                  * associated with the probe, we quit trying -- reaping is only
11372                  * fruitful in as much as we can destroy all ECBs associated
11373                  * with the defunct provider's probes.
11374                  */
11375                 while ((ecb = probe->dtpr_ecb) != NULL) {
11376                         dtrace_state_t *state = ecb->dte_state;
11377                         dtrace_buffer_t *buf = state->dts_buffer;
11378                         dtrace_buffer_t *aggbuf = state->dts_aggbuffer;
11379 
11380                         if (state->dts_speculates)
11381                                 break;
11382 
11383                         if (!dtrace_buffer_consumed(buf, when))
11384                                 break;
11385 
11386                         if (!dtrace_buffer_consumed(aggbuf, when))
11387                                 break;
11388 
11389                         dtrace_ecb_disable(ecb);
11390                         ASSERT(probe->dtpr_ecb != ecb);
11391                         dtrace_ecb_destroy(ecb);
11392                 }
11393         }
11394 
11395         mutex_exit(&dtrace_lock);
11396         mutex_exit(&cpu_lock);
11397 }
11398 
11399 /*
11400  * DTrace DOF Functions
11401  */
11402 /*ARGSUSED*/
11403 static void
11404 dtrace_dof_error(dof_hdr_t *dof, const char *str)
11405 {
11406         if (dtrace_err_verbose)
11407                 cmn_err(CE_WARN, "failed to process DOF: %s", str);
11408 
11409 #ifdef DTRACE_ERRDEBUG
11410         dtrace_errdebug(str);
11411 #endif
11412 }
11413 
11414 /*
11415  * Create DOF out of a currently enabled state.  Right now, we only create
11416  * DOF containing the run-time options -- but this could be expanded to create
11417  * complete DOF representing the enabled state.
11418  */
11419 static dof_hdr_t *
11420 dtrace_dof_create(dtrace_state_t *state)
11421 {
11422         dof_hdr_t *dof;
11423         dof_sec_t *sec;
11424         dof_optdesc_t *opt;
11425         int i, len = sizeof (dof_hdr_t) +
11426             roundup(sizeof (dof_sec_t), sizeof (uint64_t)) +
11427             sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
11428 
11429         ASSERT(MUTEX_HELD(&dtrace_lock));
11430 
11431         dof = kmem_zalloc(len, KM_SLEEP);
11432         dof->dofh_ident[DOF_ID_MAG0] = DOF_MAG_MAG0;
11433         dof->dofh_ident[DOF_ID_MAG1] = DOF_MAG_MAG1;
11434         dof->dofh_ident[DOF_ID_MAG2] = DOF_MAG_MAG2;
11435         dof->dofh_ident[DOF_ID_MAG3] = DOF_MAG_MAG3;
11436 
11437         dof->dofh_ident[DOF_ID_MODEL] = DOF_MODEL_NATIVE;
11438         dof->dofh_ident[DOF_ID_ENCODING] = DOF_ENCODE_NATIVE;
11439         dof->dofh_ident[DOF_ID_VERSION] = DOF_VERSION;
11440         dof->dofh_ident[DOF_ID_DIFVERS] = DIF_VERSION;
11441         dof->dofh_ident[DOF_ID_DIFIREG] = DIF_DIR_NREGS;
11442         dof->dofh_ident[DOF_ID_DIFTREG] = DIF_DTR_NREGS;
11443 
11444         dof->dofh_flags = 0;
11445         dof->dofh_hdrsize = sizeof (dof_hdr_t);
11446         dof->dofh_secsize = sizeof (dof_sec_t);
11447         dof->dofh_secnum = 1;        /* only DOF_SECT_OPTDESC */
11448         dof->dofh_secoff = sizeof (dof_hdr_t);
11449         dof->dofh_loadsz = len;
11450         dof->dofh_filesz = len;
11451         dof->dofh_pad = 0;
11452 
11453         /*
11454          * Fill in the option section header...
11455          */
11456         sec = (dof_sec_t *)((uintptr_t)dof + sizeof (dof_hdr_t));
11457         sec->dofs_type = DOF_SECT_OPTDESC;
11458         sec->dofs_align = sizeof (uint64_t);
11459         sec->dofs_flags = DOF_SECF_LOAD;
11460         sec->dofs_entsize = sizeof (dof_optdesc_t);
11461 
11462         opt = (dof_optdesc_t *)((uintptr_t)sec +
11463             roundup(sizeof (dof_sec_t), sizeof (uint64_t)));
11464 
11465         sec->dofs_offset = (uintptr_t)opt - (uintptr_t)dof;
11466         sec->dofs_size = sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
11467 
11468         for (i = 0; i < DTRACEOPT_MAX; i++) {
11469                 opt[i].dofo_option = i;
11470                 opt[i].dofo_strtab = DOF_SECIDX_NONE;
11471                 opt[i].dofo_value = state->dts_options[i];
11472         }
11473 
11474         return (dof);
11475 }
11476 
11477 static dof_hdr_t *
11478 dtrace_dof_copyin(uintptr_t uarg, int *errp)
11479 {
11480         dof_hdr_t hdr, *dof;
11481 
11482         ASSERT(!MUTEX_HELD(&dtrace_lock));
11483 
11484         /*
11485          * First, we're going to copyin() the sizeof (dof_hdr_t).
11486          */
11487         if (copyin((void *)uarg, &hdr, sizeof (hdr)) != 0) {
11488                 dtrace_dof_error(NULL, "failed to copyin DOF header");
11489                 *errp = EFAULT;
11490                 return (NULL);
11491         }
11492 
11493         /*
11494          * Now we'll allocate the entire DOF and copy it in -- provided
11495          * that the length isn't outrageous.
11496          */
11497         if (hdr.dofh_loadsz >= dtrace_dof_maxsize) {
11498                 dtrace_dof_error(&hdr, "load size exceeds maximum");
11499                 *errp = E2BIG;
11500                 return (NULL);
11501         }
11502 
11503         if (hdr.dofh_loadsz < sizeof (hdr)) {
11504                 dtrace_dof_error(&hdr, "invalid load size");
11505                 *errp = EINVAL;
11506                 return (NULL);
11507         }
11508 
11509         dof = kmem_alloc(hdr.dofh_loadsz, KM_SLEEP);
11510 
11511         if (copyin((void *)uarg, dof, hdr.dofh_loadsz) != 0 ||
11512             dof->dofh_loadsz != hdr.dofh_loadsz) {
11513                 kmem_free(dof, hdr.dofh_loadsz);
11514                 *errp = EFAULT;
11515                 return (NULL);
11516         }
11517 
11518         return (dof);
11519 }
11520 
11521 static dof_hdr_t *
11522 dtrace_dof_property(const char *name)
11523 {
11524         uchar_t *buf;
11525         uint64_t loadsz;
11526         unsigned int len, i;
11527         dof_hdr_t *dof;
11528 
11529         /*
11530          * Unfortunately, array of values in .conf files are always (and
11531          * only) interpreted to be integer arrays.  We must read our DOF
11532          * as an integer array, and then squeeze it into a byte array.
11533          */
11534         if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dtrace_devi, 0,
11535             (char *)name, (int **)&buf, &len) != DDI_PROP_SUCCESS)
11536                 return (NULL);
11537 
11538         for (i = 0; i < len; i++)
11539                 buf[i] = (uchar_t)(((int *)buf)[i]);
11540 
11541         if (len < sizeof (dof_hdr_t)) {
11542                 ddi_prop_free(buf);
11543                 dtrace_dof_error(NULL, "truncated header");
11544                 return (NULL);
11545         }
11546 
11547         if (len < (loadsz = ((dof_hdr_t *)buf)->dofh_loadsz)) {
11548                 ddi_prop_free(buf);
11549                 dtrace_dof_error(NULL, "truncated DOF");
11550                 return (NULL);
11551         }
11552 
11553         if (loadsz >= dtrace_dof_maxsize) {
11554                 ddi_prop_free(buf);
11555                 dtrace_dof_error(NULL, "oversized DOF");
11556                 return (NULL);
11557         }
11558 
11559         dof = kmem_alloc(loadsz, KM_SLEEP);
11560         bcopy(buf, dof, loadsz);
11561         ddi_prop_free(buf);
11562 
11563         return (dof);
11564 }
11565 
11566 static void
11567 dtrace_dof_destroy(dof_hdr_t *dof)
11568 {
11569         kmem_free(dof, dof->dofh_loadsz);
11570 }
11571 
11572 /*
11573  * Return the dof_sec_t pointer corresponding to a given section index.  If the
11574  * index is not valid, dtrace_dof_error() is called and NULL is returned.  If
11575  * a type other than DOF_SECT_NONE is specified, the header is checked against
11576  * this type and NULL is returned if the types do not match.
11577  */
11578 static dof_sec_t *
11579 dtrace_dof_sect(dof_hdr_t *dof, uint32_t type, dof_secidx_t i)
11580 {
11581         dof_sec_t *sec = (dof_sec_t *)(uintptr_t)
11582             ((uintptr_t)dof + dof->dofh_secoff + i * dof->dofh_secsize);
11583 
11584         if (i >= dof->dofh_secnum) {
11585                 dtrace_dof_error(dof, "referenced section index is invalid");
11586                 return (NULL);
11587         }
11588 
11589         if (!(sec->dofs_flags & DOF_SECF_LOAD)) {
11590                 dtrace_dof_error(dof, "referenced section is not loadable");
11591                 return (NULL);
11592         }
11593 
11594         if (type != DOF_SECT_NONE && type != sec->dofs_type) {
11595                 dtrace_dof_error(dof, "referenced section is the wrong type");
11596                 return (NULL);
11597         }
11598 
11599         return (sec);
11600 }
11601 
11602 static dtrace_probedesc_t *
11603 dtrace_dof_probedesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_probedesc_t *desc)
11604 {
11605         dof_probedesc_t *probe;
11606         dof_sec_t *strtab;
11607         uintptr_t daddr = (uintptr_t)dof;
11608         uintptr_t str;
11609         size_t size;
11610 
11611         if (sec->dofs_type != DOF_SECT_PROBEDESC) {
11612                 dtrace_dof_error(dof, "invalid probe section");
11613                 return (NULL);
11614         }
11615 
11616         if (sec->dofs_align != sizeof (dof_secidx_t)) {
11617                 dtrace_dof_error(dof, "bad alignment in probe description");
11618                 return (NULL);
11619         }
11620 
11621         if (sec->dofs_offset + sizeof (dof_probedesc_t) > dof->dofh_loadsz) {
11622                 dtrace_dof_error(dof, "truncated probe description");
11623                 return (NULL);
11624         }
11625 
11626         probe = (dof_probedesc_t *)(uintptr_t)(daddr + sec->dofs_offset);
11627         strtab = dtrace_dof_sect(dof, DOF_SECT_STRTAB, probe->dofp_strtab);
11628 
11629         if (strtab == NULL)
11630                 return (NULL);
11631 
11632         str = daddr + strtab->dofs_offset;
11633         size = strtab->dofs_size;
11634 
11635         if (probe->dofp_provider >= strtab->dofs_size) {
11636                 dtrace_dof_error(dof, "corrupt probe provider");
11637                 return (NULL);
11638         }
11639 
11640         (void) strncpy(desc->dtpd_provider,
11641             (char *)(str + probe->dofp_provider),
11642             MIN(DTRACE_PROVNAMELEN - 1, size - probe->dofp_provider));
11643 
11644         if (probe->dofp_mod >= strtab->dofs_size) {
11645                 dtrace_dof_error(dof, "corrupt probe module");
11646                 return (NULL);
11647         }
11648 
11649         (void) strncpy(desc->dtpd_mod, (char *)(str + probe->dofp_mod),
11650             MIN(DTRACE_MODNAMELEN - 1, size - probe->dofp_mod));
11651 
11652         if (probe->dofp_func >= strtab->dofs_size) {
11653                 dtrace_dof_error(dof, "corrupt probe function");
11654                 return (NULL);
11655         }
11656 
11657         (void) strncpy(desc->dtpd_func, (char *)(str + probe->dofp_func),
11658             MIN(DTRACE_FUNCNAMELEN - 1, size - probe->dofp_func));
11659 
11660         if (probe->dofp_name >= strtab->dofs_size) {
11661                 dtrace_dof_error(dof, "corrupt probe name");
11662                 return (NULL);
11663         }
11664 
11665         (void) strncpy(desc->dtpd_name, (char *)(str + probe->dofp_name),
11666             MIN(DTRACE_NAMELEN - 1, size - probe->dofp_name));
11667 
11668         return (desc);
11669 }
11670 
11671 static dtrace_difo_t *
11672 dtrace_dof_difo(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
11673     cred_t *cr)
11674 {
11675         dtrace_difo_t *dp;
11676         size_t ttl = 0;
11677         dof_difohdr_t *dofd;
11678         uintptr_t daddr = (uintptr_t)dof;
11679         size_t max = dtrace_difo_maxsize;
11680         int i, l, n;
11681 
11682         static const struct {
11683                 int section;
11684                 int bufoffs;
11685                 int lenoffs;
11686                 int entsize;
11687                 int align;
11688                 const char *msg;
11689         } difo[] = {
11690                 { DOF_SECT_DIF, offsetof(dtrace_difo_t, dtdo_buf),
11691                 offsetof(dtrace_difo_t, dtdo_len), sizeof (dif_instr_t),
11692                 sizeof (dif_instr_t), "multiple DIF sections" },
11693 
11694                 { DOF_SECT_INTTAB, offsetof(dtrace_difo_t, dtdo_inttab),
11695                 offsetof(dtrace_difo_t, dtdo_intlen), sizeof (uint64_t),
11696                 sizeof (uint64_t), "multiple integer tables" },
11697 
11698                 { DOF_SECT_STRTAB, offsetof(dtrace_difo_t, dtdo_strtab),
11699                 offsetof(dtrace_difo_t, dtdo_strlen), 0,
11700                 sizeof (char), "multiple string tables" },
11701 
11702                 { DOF_SECT_VARTAB, offsetof(dtrace_difo_t, dtdo_vartab),
11703                 offsetof(dtrace_difo_t, dtdo_varlen), sizeof (dtrace_difv_t),
11704                 sizeof (uint_t), "multiple variable tables" },
11705 
11706                 { DOF_SECT_NONE, 0, 0, 0, NULL }
11707         };
11708 
11709         if (sec->dofs_type != DOF_SECT_DIFOHDR) {
11710                 dtrace_dof_error(dof, "invalid DIFO header section");
11711                 return (NULL);
11712         }
11713 
11714         if (sec->dofs_align != sizeof (dof_secidx_t)) {
11715                 dtrace_dof_error(dof, "bad alignment in DIFO header");
11716                 return (NULL);
11717         }
11718 
11719         if (sec->dofs_size < sizeof (dof_difohdr_t) ||
11720             sec->dofs_size % sizeof (dof_secidx_t)) {
11721                 dtrace_dof_error(dof, "bad size in DIFO header");
11722                 return (NULL);
11723         }
11724 
11725         dofd = (dof_difohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
11726         n = (sec->dofs_size - sizeof (*dofd)) / sizeof (dof_secidx_t) + 1;
11727 
11728         dp = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
11729         dp->dtdo_rtype = dofd->dofd_rtype;
11730 
11731         for (l = 0; l < n; l++) {
11732                 dof_sec_t *subsec;
11733                 void **bufp;
11734                 uint32_t *lenp;
11735 
11736                 if ((subsec = dtrace_dof_sect(dof, DOF_SECT_NONE,
11737                     dofd->dofd_links[l])) == NULL)
11738                         goto err; /* invalid section link */
11739 
11740                 if (ttl + subsec->dofs_size > max) {
11741                         dtrace_dof_error(dof, "exceeds maximum size");
11742                         goto err;
11743                 }
11744 
11745                 ttl += subsec->dofs_size;
11746 
11747                 for (i = 0; difo[i].section != DOF_SECT_NONE; i++) {
11748                         if (subsec->dofs_type != difo[i].section)
11749                                 continue;
11750 
11751                         if (!(subsec->dofs_flags & DOF_SECF_LOAD)) {
11752                                 dtrace_dof_error(dof, "section not loaded");
11753                                 goto err;
11754                         }
11755 
11756                         if (subsec->dofs_align != difo[i].align) {
11757                                 dtrace_dof_error(dof, "bad alignment");
11758                                 goto err;
11759                         }
11760 
11761                         bufp = (void **)((uintptr_t)dp + difo[i].bufoffs);
11762                         lenp = (uint32_t *)((uintptr_t)dp + difo[i].lenoffs);
11763 
11764                         if (*bufp != NULL) {
11765                                 dtrace_dof_error(dof, difo[i].msg);
11766                                 goto err;
11767                         }
11768 
11769                         if (difo[i].entsize != subsec->dofs_entsize) {
11770                                 dtrace_dof_error(dof, "entry size mismatch");
11771                                 goto err;
11772                         }
11773 
11774                         if (subsec->dofs_entsize != 0 &&
11775                             (subsec->dofs_size % subsec->dofs_entsize) != 0) {
11776                                 dtrace_dof_error(dof, "corrupt entry size");
11777                                 goto err;
11778                         }
11779 
11780                         *lenp = subsec->dofs_size;
11781                         *bufp = kmem_alloc(subsec->dofs_size, KM_SLEEP);
11782                         bcopy((char *)(uintptr_t)(daddr + subsec->dofs_offset),
11783                             *bufp, subsec->dofs_size);
11784 
11785                         if (subsec->dofs_entsize != 0)
11786                                 *lenp /= subsec->dofs_entsize;
11787 
11788                         break;
11789                 }
11790 
11791                 /*
11792                  * If we encounter a loadable DIFO sub-section that is not
11793                  * known to us, assume this is a broken program and fail.
11794                  */
11795                 if (difo[i].section == DOF_SECT_NONE &&
11796                     (subsec->dofs_flags & DOF_SECF_LOAD)) {
11797                         dtrace_dof_error(dof, "unrecognized DIFO subsection");
11798                         goto err;
11799                 }
11800         }
11801 
11802         if (dp->dtdo_buf == NULL) {
11803                 /*
11804                  * We can't have a DIF object without DIF text.
11805                  */
11806                 dtrace_dof_error(dof, "missing DIF text");
11807                 goto err;
11808         }
11809 
11810         /*
11811          * Before we validate the DIF object, run through the variable table
11812          * looking for the strings -- if any of their size are under, we'll set
11813          * their size to be the system-wide default string size.  Note that
11814          * this should _not_ happen if the "strsize" option has been set --
11815          * in this case, the compiler should have set the size to reflect the
11816          * setting of the option.
11817          */
11818         for (i = 0; i < dp->dtdo_varlen; i++) {
11819                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
11820                 dtrace_diftype_t *t = &v->dtdv_type;
11821 
11822                 if (v->dtdv_id < DIF_VAR_OTHER_UBASE)
11823                         continue;
11824 
11825                 if (t->dtdt_kind == DIF_TYPE_STRING && t->dtdt_size == 0)
11826                         t->dtdt_size = dtrace_strsize_default;
11827         }
11828 
11829         if (dtrace_difo_validate(dp, vstate, DIF_DIR_NREGS, cr) != 0)
11830                 goto err;
11831 
11832         dtrace_difo_init(dp, vstate);
11833         return (dp);
11834 
11835 err:
11836         kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
11837         kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
11838         kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
11839         kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
11840 
11841         kmem_free(dp, sizeof (dtrace_difo_t));
11842         return (NULL);
11843 }
11844 
11845 static dtrace_predicate_t *
11846 dtrace_dof_predicate(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
11847     cred_t *cr)
11848 {
11849         dtrace_difo_t *dp;
11850 
11851         if ((dp = dtrace_dof_difo(dof, sec, vstate, cr)) == NULL)
11852                 return (NULL);
11853 
11854         return (dtrace_predicate_create(dp));
11855 }
11856 
11857 static dtrace_actdesc_t *
11858 dtrace_dof_actdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
11859     cred_t *cr)
11860 {
11861         dtrace_actdesc_t *act, *first = NULL, *last = NULL, *next;
11862         dof_actdesc_t *desc;
11863         dof_sec_t *difosec;
11864         size_t offs;
11865         uintptr_t daddr = (uintptr_t)dof;
11866         uint64_t arg;
11867         dtrace_actkind_t kind;
11868 
11869         if (sec->dofs_type != DOF_SECT_ACTDESC) {
11870                 dtrace_dof_error(dof, "invalid action section");
11871                 return (NULL);
11872         }
11873 
11874         if (sec->dofs_offset + sizeof (dof_actdesc_t) > dof->dofh_loadsz) {
11875                 dtrace_dof_error(dof, "truncated action description");
11876                 return (NULL);
11877         }
11878 
11879         if (sec->dofs_align != sizeof (uint64_t)) {
11880                 dtrace_dof_error(dof, "bad alignment in action description");
11881                 return (NULL);
11882         }
11883 
11884         if (sec->dofs_size < sec->dofs_entsize) {
11885                 dtrace_dof_error(dof, "section entry size exceeds total size");
11886                 return (NULL);
11887         }
11888 
11889         if (sec->dofs_entsize != sizeof (dof_actdesc_t)) {
11890                 dtrace_dof_error(dof, "bad entry size in action description");
11891                 return (NULL);
11892         }
11893 
11894         if (sec->dofs_size / sec->dofs_entsize > dtrace_actions_max) {
11895                 dtrace_dof_error(dof, "actions exceed dtrace_actions_max");
11896                 return (NULL);
11897         }
11898 
11899         for (offs = 0; offs < sec->dofs_size; offs += sec->dofs_entsize) {
11900                 desc = (dof_actdesc_t *)(daddr +
11901                     (uintptr_t)sec->dofs_offset + offs);
11902                 kind = (dtrace_actkind_t)desc->dofa_kind;
11903 
11904                 if ((DTRACEACT_ISPRINTFLIKE(kind) &&
11905                     (kind != DTRACEACT_PRINTA ||
11906                     desc->dofa_strtab != DOF_SECIDX_NONE)) ||
11907                     (kind == DTRACEACT_DIFEXPR &&
11908                     desc->dofa_strtab != DOF_SECIDX_NONE)) {
11909                         dof_sec_t *strtab;
11910                         char *str, *fmt;
11911                         uint64_t i;
11912 
11913                         /*
11914                          * The argument to these actions is an index into the
11915                          * DOF string table.  For printf()-like actions, this
11916                          * is the format string.  For print(), this is the
11917                          * CTF type of the expression result.
11918                          */
11919                         if ((strtab = dtrace_dof_sect(dof,
11920                             DOF_SECT_STRTAB, desc->dofa_strtab)) == NULL)
11921                                 goto err;
11922 
11923                         str = (char *)((uintptr_t)dof +
11924                             (uintptr_t)strtab->dofs_offset);
11925 
11926                         for (i = desc->dofa_arg; i < strtab->dofs_size; i++) {
11927                                 if (str[i] == '\0')
11928                                         break;
11929                         }
11930 
11931                         if (i >= strtab->dofs_size) {
11932                                 dtrace_dof_error(dof, "bogus format string");
11933                                 goto err;
11934                         }
11935 
11936                         if (i == desc->dofa_arg) {
11937                                 dtrace_dof_error(dof, "empty format string");
11938                                 goto err;
11939                         }
11940 
11941                         i -= desc->dofa_arg;
11942                         fmt = kmem_alloc(i + 1, KM_SLEEP);
11943                         bcopy(&str[desc->dofa_arg], fmt, i + 1);
11944                         arg = (uint64_t)(uintptr_t)fmt;
11945                 } else {
11946                         if (kind == DTRACEACT_PRINTA) {
11947                                 ASSERT(desc->dofa_strtab == DOF_SECIDX_NONE);
11948                                 arg = 0;
11949                         } else {
11950                                 arg = desc->dofa_arg;
11951                         }
11952                 }
11953 
11954                 act = dtrace_actdesc_create(kind, desc->dofa_ntuple,
11955                     desc->dofa_uarg, arg);
11956 
11957                 if (last != NULL) {
11958                         last->dtad_next = act;
11959                 } else {
11960                         first = act;
11961                 }
11962 
11963                 last = act;
11964 
11965                 if (desc->dofa_difo == DOF_SECIDX_NONE)
11966                         continue;
11967 
11968                 if ((difosec = dtrace_dof_sect(dof,
11969                     DOF_SECT_DIFOHDR, desc->dofa_difo)) == NULL)
11970                         goto err;
11971 
11972                 act->dtad_difo = dtrace_dof_difo(dof, difosec, vstate, cr);
11973 
11974                 if (act->dtad_difo == NULL)
11975                         goto err;
11976         }
11977 
11978         ASSERT(first != NULL);
11979         return (first);
11980 
11981 err:
11982         for (act = first; act != NULL; act = next) {
11983                 next = act->dtad_next;
11984                 dtrace_actdesc_release(act, vstate);
11985         }
11986 
11987         return (NULL);
11988 }
11989 
11990 static dtrace_ecbdesc_t *
11991 dtrace_dof_ecbdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
11992     cred_t *cr)
11993 {
11994         dtrace_ecbdesc_t *ep;
11995         dof_ecbdesc_t *ecb;
11996         dtrace_probedesc_t *desc;
11997         dtrace_predicate_t *pred = NULL;
11998 
11999         if (sec->dofs_size < sizeof (dof_ecbdesc_t)) {
12000                 dtrace_dof_error(dof, "truncated ECB description");
12001                 return (NULL);
12002         }
12003 
12004         if (sec->dofs_align != sizeof (uint64_t)) {
12005                 dtrace_dof_error(dof, "bad alignment in ECB description");
12006                 return (NULL);
12007         }
12008 
12009         ecb = (dof_ecbdesc_t *)((uintptr_t)dof + (uintptr_t)sec->dofs_offset);
12010         sec = dtrace_dof_sect(dof, DOF_SECT_PROBEDESC, ecb->dofe_probes);
12011 
12012         if (sec == NULL)
12013                 return (NULL);
12014 
12015         ep = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
12016         ep->dted_uarg = ecb->dofe_uarg;
12017         desc = &ep->dted_probe;
12018 
12019         if (dtrace_dof_probedesc(dof, sec, desc) == NULL)
12020                 goto err;
12021 
12022         if (ecb->dofe_pred != DOF_SECIDX_NONE) {
12023                 if ((sec = dtrace_dof_sect(dof,
12024                     DOF_SECT_DIFOHDR, ecb->dofe_pred)) == NULL)
12025                         goto err;
12026 
12027                 if ((pred = dtrace_dof_predicate(dof, sec, vstate, cr)) == NULL)
12028                         goto err;
12029 
12030                 ep->dted_pred.dtpdd_predicate = pred;
12031         }
12032 
12033         if (ecb->dofe_actions != DOF_SECIDX_NONE) {
12034                 if ((sec = dtrace_dof_sect(dof,
12035                     DOF_SECT_ACTDESC, ecb->dofe_actions)) == NULL)
12036                         goto err;
12037 
12038                 ep->dted_action = dtrace_dof_actdesc(dof, sec, vstate, cr);
12039 
12040                 if (ep->dted_action == NULL)
12041                         goto err;
12042         }
12043 
12044         return (ep);
12045 
12046 err:
12047         if (pred != NULL)
12048                 dtrace_predicate_release(pred, vstate);
12049         kmem_free(ep, sizeof (dtrace_ecbdesc_t));
12050         return (NULL);
12051 }
12052 
12053 /*
12054  * Apply the relocations from the specified 'sec' (a DOF_SECT_URELHDR) to the
12055  * specified DOF.  At present, this amounts to simply adding 'ubase' to the
12056  * site of any user SETX relocations to account for load object base address.
12057  * In the future, if we need other relocations, this function can be extended.
12058  */
12059 static int
12060 dtrace_dof_relocate(dof_hdr_t *dof, dof_sec_t *sec, uint64_t ubase)
12061 {
12062         uintptr_t daddr = (uintptr_t)dof;
12063         dof_relohdr_t *dofr =
12064             (dof_relohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
12065         dof_sec_t *ss, *rs, *ts;
12066         dof_relodesc_t *r;
12067         uint_t i, n;
12068 
12069         if (sec->dofs_size < sizeof (dof_relohdr_t) ||
12070             sec->dofs_align != sizeof (dof_secidx_t)) {
12071                 dtrace_dof_error(dof, "invalid relocation header");
12072                 return (-1);
12073         }
12074 
12075         ss = dtrace_dof_sect(dof, DOF_SECT_STRTAB, dofr->dofr_strtab);
12076         rs = dtrace_dof_sect(dof, DOF_SECT_RELTAB, dofr->dofr_relsec);
12077         ts = dtrace_dof_sect(dof, DOF_SECT_NONE, dofr->dofr_tgtsec);
12078 
12079         if (ss == NULL || rs == NULL || ts == NULL)
12080                 return (-1); /* dtrace_dof_error() has been called already */
12081 
12082         if (rs->dofs_entsize < sizeof (dof_relodesc_t) ||
12083             rs->dofs_align != sizeof (uint64_t)) {
12084                 dtrace_dof_error(dof, "invalid relocation section");
12085                 return (-1);
12086         }
12087 
12088         r = (dof_relodesc_t *)(uintptr_t)(daddr + rs->dofs_offset);
12089         n = rs->dofs_size / rs->dofs_entsize;
12090 
12091         for (i = 0; i < n; i++) {
12092                 uintptr_t taddr = daddr + ts->dofs_offset + r->dofr_offset;
12093 
12094                 switch (r->dofr_type) {
12095                 case DOF_RELO_NONE:
12096                         break;
12097                 case DOF_RELO_SETX:
12098                         if (r->dofr_offset >= ts->dofs_size || r->dofr_offset +
12099                             sizeof (uint64_t) > ts->dofs_size) {
12100                                 dtrace_dof_error(dof, "bad relocation offset");
12101                                 return (-1);
12102                         }
12103 
12104                         if (!IS_P2ALIGNED(taddr, sizeof (uint64_t))) {
12105                                 dtrace_dof_error(dof, "misaligned setx relo");
12106                                 return (-1);
12107                         }
12108 
12109                         *(uint64_t *)taddr += ubase;
12110                         break;
12111                 default:
12112                         dtrace_dof_error(dof, "invalid relocation type");
12113                         return (-1);
12114                 }
12115 
12116                 r = (dof_relodesc_t *)((uintptr_t)r + rs->dofs_entsize);
12117         }
12118 
12119         return (0);
12120 }
12121 
12122 /*
12123  * The dof_hdr_t passed to dtrace_dof_slurp() should be a partially validated
12124  * header:  it should be at the front of a memory region that is at least
12125  * sizeof (dof_hdr_t) in size -- and then at least dof_hdr.dofh_loadsz in
12126  * size.  It need not be validated in any other way.
12127  */
12128 static int
12129 dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr,
12130     dtrace_enabling_t **enabp, uint64_t ubase, int noprobes)
12131 {
12132         uint64_t len = dof->dofh_loadsz, seclen;
12133         uintptr_t daddr = (uintptr_t)dof;
12134         dtrace_ecbdesc_t *ep;
12135         dtrace_enabling_t *enab;
12136         uint_t i;
12137 
12138         ASSERT(MUTEX_HELD(&dtrace_lock));
12139         ASSERT(dof->dofh_loadsz >= sizeof (dof_hdr_t));
12140 
12141         /*
12142          * Check the DOF header identification bytes.  In addition to checking
12143          * valid settings, we also verify that unused bits/bytes are zeroed so
12144          * we can use them later without fear of regressing existing binaries.
12145          */
12146         if (bcmp(&dof->dofh_ident[DOF_ID_MAG0],
12147             DOF_MAG_STRING, DOF_MAG_STRLEN) != 0) {
12148                 dtrace_dof_error(dof, "DOF magic string mismatch");
12149                 return (-1);
12150         }
12151 
12152         if (dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_ILP32 &&
12153             dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_LP64) {
12154                 dtrace_dof_error(dof, "DOF has invalid data model");
12155                 return (-1);
12156         }
12157 
12158         if (dof->dofh_ident[DOF_ID_ENCODING] != DOF_ENCODE_NATIVE) {
12159                 dtrace_dof_error(dof, "DOF encoding mismatch");
12160                 return (-1);
12161         }
12162 
12163         if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
12164             dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_2) {
12165                 dtrace_dof_error(dof, "DOF version mismatch");
12166                 return (-1);
12167         }
12168 
12169         if (dof->dofh_ident[DOF_ID_DIFVERS] != DIF_VERSION_2) {
12170                 dtrace_dof_error(dof, "DOF uses unsupported instruction set");
12171                 return (-1);
12172         }
12173 
12174         if (dof->dofh_ident[DOF_ID_DIFIREG] > DIF_DIR_NREGS) {
12175                 dtrace_dof_error(dof, "DOF uses too many integer registers");
12176                 return (-1);
12177         }
12178 
12179         if (dof->dofh_ident[DOF_ID_DIFTREG] > DIF_DTR_NREGS) {
12180                 dtrace_dof_error(dof, "DOF uses too many tuple registers");
12181                 return (-1);
12182         }
12183 
12184         for (i = DOF_ID_PAD; i < DOF_ID_SIZE; i++) {
12185                 if (dof->dofh_ident[i] != 0) {
12186                         dtrace_dof_error(dof, "DOF has invalid ident byte set");
12187                         return (-1);
12188                 }
12189         }
12190 
12191         if (dof->dofh_flags & ~DOF_FL_VALID) {
12192                 dtrace_dof_error(dof, "DOF has invalid flag bits set");
12193                 return (-1);
12194         }
12195 
12196         if (dof->dofh_secsize == 0) {
12197                 dtrace_dof_error(dof, "zero section header size");
12198                 return (-1);
12199         }
12200 
12201         /*
12202          * Check that the section headers don't exceed the amount of DOF
12203          * data.  Note that we cast the section size and number of sections
12204          * to uint64_t's to prevent possible overflow in the multiplication.
12205          */
12206         seclen = (uint64_t)dof->dofh_secnum * (uint64_t)dof->dofh_secsize;
12207 
12208         if (dof->dofh_secoff > len || seclen > len ||
12209             dof->dofh_secoff + seclen > len) {
12210                 dtrace_dof_error(dof, "truncated section headers");
12211                 return (-1);
12212         }
12213 
12214         if (!IS_P2ALIGNED(dof->dofh_secoff, sizeof (uint64_t))) {
12215                 dtrace_dof_error(dof, "misaligned section headers");
12216                 return (-1);
12217         }
12218 
12219         if (!IS_P2ALIGNED(dof->dofh_secsize, sizeof (uint64_t))) {
12220                 dtrace_dof_error(dof, "misaligned section size");
12221                 return (-1);
12222         }
12223 
12224         /*
12225          * Take an initial pass through the section headers to be sure that
12226          * the headers don't have stray offsets.  If the 'noprobes' flag is
12227          * set, do not permit sections relating to providers, probes, or args.
12228          */
12229         for (i = 0; i < dof->dofh_secnum; i++) {
12230                 dof_sec_t *sec = (dof_sec_t *)(daddr +
12231                     (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
12232 
12233                 if (noprobes) {
12234                         switch (sec->dofs_type) {
12235                         case DOF_SECT_PROVIDER:
12236                         case DOF_SECT_PROBES:
12237                         case DOF_SECT_PRARGS:
12238                         case DOF_SECT_PROFFS:
12239                                 dtrace_dof_error(dof, "illegal sections "
12240                                     "for enabling");
12241                                 return (-1);
12242                         }
12243                 }
12244 
12245                 if (DOF_SEC_ISLOADABLE(sec->dofs_type) &&
12246                     !(sec->dofs_flags & DOF_SECF_LOAD)) {
12247                         dtrace_dof_error(dof, "loadable section with load "
12248                             "flag unset");
12249                         return (-1);
12250                 }
12251 
12252                 if (!(sec->dofs_flags & DOF_SECF_LOAD))
12253                         continue; /* just ignore non-loadable sections */
12254 
12255                 if (sec->dofs_align & (sec->dofs_align - 1)) {
12256                         dtrace_dof_error(dof, "bad section alignment");
12257                         return (-1);
12258                 }
12259 
12260                 if (sec->dofs_offset & (sec->dofs_align - 1)) {
12261                         dtrace_dof_error(dof, "misaligned section");
12262                         return (-1);
12263                 }
12264 
12265                 if (sec->dofs_offset > len || sec->dofs_size > len ||
12266                     sec->dofs_offset + sec->dofs_size > len) {
12267                         dtrace_dof_error(dof, "corrupt section header");
12268                         return (-1);
12269                 }
12270 
12271                 if (sec->dofs_type == DOF_SECT_STRTAB && *((char *)daddr +
12272                     sec->dofs_offset + sec->dofs_size - 1) != '\0') {
12273                         dtrace_dof_error(dof, "non-terminating string table");
12274                         return (-1);
12275                 }
12276         }
12277 
12278         /*
12279          * Take a second pass through the sections and locate and perform any
12280          * relocations that are present.  We do this after the first pass to
12281          * be sure that all sections have had their headers validated.
12282          */
12283         for (i = 0; i < dof->dofh_secnum; i++) {
12284                 dof_sec_t *sec = (dof_sec_t *)(daddr +
12285                     (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
12286 
12287                 if (!(sec->dofs_flags & DOF_SECF_LOAD))
12288                         continue; /* skip sections that are not loadable */
12289 
12290                 switch (sec->dofs_type) {
12291                 case DOF_SECT_URELHDR:
12292                         if (dtrace_dof_relocate(dof, sec, ubase) != 0)
12293                                 return (-1);
12294                         break;
12295                 }
12296         }
12297 
12298         if ((enab = *enabp) == NULL)
12299                 enab = *enabp = dtrace_enabling_create(vstate);
12300 
12301         for (i = 0; i < dof->dofh_secnum; i++) {
12302                 dof_sec_t *sec = (dof_sec_t *)(daddr +
12303                     (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
12304 
12305                 if (sec->dofs_type != DOF_SECT_ECBDESC)
12306                         continue;
12307 
12308                 if ((ep = dtrace_dof_ecbdesc(dof, sec, vstate, cr)) == NULL) {
12309                         dtrace_enabling_destroy(enab);
12310                         *enabp = NULL;
12311                         return (-1);
12312                 }
12313 
12314                 dtrace_enabling_add(enab, ep);
12315         }
12316 
12317         return (0);
12318 }
12319 
12320 /*
12321  * Process DOF for any options.  This routine assumes that the DOF has been
12322  * at least processed by dtrace_dof_slurp().
12323  */
12324 static int
12325 dtrace_dof_options(dof_hdr_t *dof, dtrace_state_t *state)
12326 {
12327         int i, rval;
12328         uint32_t entsize;
12329         size_t offs;
12330         dof_optdesc_t *desc;
12331 
12332         for (i = 0; i < dof->dofh_secnum; i++) {
12333                 dof_sec_t *sec = (dof_sec_t *)((uintptr_t)dof +
12334                     (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
12335 
12336                 if (sec->dofs_type != DOF_SECT_OPTDESC)
12337                         continue;
12338 
12339                 if (sec->dofs_align != sizeof (uint64_t)) {
12340                         dtrace_dof_error(dof, "bad alignment in "
12341                             "option description");
12342                         return (EINVAL);
12343                 }
12344 
12345                 if ((entsize = sec->dofs_entsize) == 0) {
12346                         dtrace_dof_error(dof, "zeroed option entry size");
12347                         return (EINVAL);
12348                 }
12349 
12350                 if (entsize < sizeof (dof_optdesc_t)) {
12351                         dtrace_dof_error(dof, "bad option entry size");
12352                         return (EINVAL);
12353                 }
12354 
12355                 for (offs = 0; offs < sec->dofs_size; offs += entsize) {
12356                         desc = (dof_optdesc_t *)((uintptr_t)dof +
12357                             (uintptr_t)sec->dofs_offset + offs);
12358 
12359                         if (desc->dofo_strtab != DOF_SECIDX_NONE) {
12360                                 dtrace_dof_error(dof, "non-zero option string");
12361                                 return (EINVAL);
12362                         }
12363 
12364                         if (desc->dofo_value == DTRACEOPT_UNSET) {
12365                                 dtrace_dof_error(dof, "unset option");
12366                                 return (EINVAL);
12367                         }
12368 
12369                         if ((rval = dtrace_state_option(state,
12370                             desc->dofo_option, desc->dofo_value)) != 0) {
12371                                 dtrace_dof_error(dof, "rejected option");
12372                                 return (rval);
12373                         }
12374                 }
12375         }
12376 
12377         return (0);
12378 }
12379 
12380 /*
12381  * DTrace Consumer State Functions
12382  */
12383 int
12384 dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
12385 {
12386         size_t hashsize, maxper, min, chunksize = dstate->dtds_chunksize;
12387         void *base;
12388         uintptr_t limit;
12389         dtrace_dynvar_t *dvar, *next, *start;
12390         int i;
12391 
12392         ASSERT(MUTEX_HELD(&dtrace_lock));
12393         ASSERT(dstate->dtds_base == NULL && dstate->dtds_percpu == NULL);
12394 
12395         bzero(dstate, sizeof (dtrace_dstate_t));
12396 
12397         if ((dstate->dtds_chunksize = chunksize) == 0)
12398                 dstate->dtds_chunksize = DTRACE_DYNVAR_CHUNKSIZE;
12399 
12400         if (size < (min = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)))
12401                 size = min;
12402 
12403         if ((base = kmem_zalloc(size, KM_NOSLEEP | KM_NORMALPRI)) == NULL)
12404                 return (ENOMEM);
12405 
12406         dstate->dtds_size = size;
12407         dstate->dtds_base = base;
12408         dstate->dtds_percpu = kmem_cache_alloc(dtrace_state_cache, KM_SLEEP);
12409         bzero(dstate->dtds_percpu, NCPU * sizeof (dtrace_dstate_percpu_t));
12410 
12411         hashsize = size / (dstate->dtds_chunksize + sizeof (dtrace_dynhash_t));
12412 
12413         if (hashsize != 1 && (hashsize & 1))
12414                 hashsize--;
12415 
12416         dstate->dtds_hashsize = hashsize;
12417         dstate->dtds_hash = dstate->dtds_base;
12418 
12419         /*
12420          * Set all of our hash buckets to point to the single sink, and (if
12421          * it hasn't already been set), set the sink's hash value to be the
12422          * sink sentinel value.  The sink is needed for dynamic variable
12423          * lookups to know that they have iterated over an entire, valid hash
12424          * chain.
12425          */
12426         for (i = 0; i < hashsize; i++)
12427                 dstate->dtds_hash[i].dtdh_chain = &dtrace_dynhash_sink;
12428 
12429         if (dtrace_dynhash_sink.dtdv_hashval != DTRACE_DYNHASH_SINK)
12430                 dtrace_dynhash_sink.dtdv_hashval = DTRACE_DYNHASH_SINK;
12431 
12432         /*
12433          * Determine number of active CPUs.  Divide free list evenly among
12434          * active CPUs.
12435          */
12436         start = (dtrace_dynvar_t *)
12437             ((uintptr_t)base + hashsize * sizeof (dtrace_dynhash_t));
12438         limit = (uintptr_t)base + size;
12439 
12440         maxper = (limit - (uintptr_t)start) / NCPU;
12441         maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize;
12442 
12443         for (i = 0; i < NCPU; i++) {
12444                 dstate->dtds_percpu[i].dtdsc_free = dvar = start;
12445 
12446                 /*
12447                  * If we don't even have enough chunks to make it once through
12448                  * NCPUs, we're just going to allocate everything to the first
12449                  * CPU.  And if we're on the last CPU, we're going to allocate
12450                  * whatever is left over.  In either case, we set the limit to
12451                  * be the limit of the dynamic variable space.
12452                  */
12453                 if (maxper == 0 || i == NCPU - 1) {
12454                         limit = (uintptr_t)base + size;
12455                         start = NULL;
12456                 } else {
12457                         limit = (uintptr_t)start + maxper;
12458                         start = (dtrace_dynvar_t *)limit;
12459                 }
12460 
12461                 ASSERT(limit <= (uintptr_t)base + size);
12462 
12463                 for (;;) {
12464                         next = (dtrace_dynvar_t *)((uintptr_t)dvar +
12465                             dstate->dtds_chunksize);
12466 
12467                         if ((uintptr_t)next + dstate->dtds_chunksize >= limit)
12468                                 break;
12469 
12470                         dvar->dtdv_next = next;
12471                         dvar = next;
12472                 }
12473 
12474                 if (maxper == 0)
12475                         break;
12476         }
12477 
12478         return (0);
12479 }
12480 
12481 void
12482 dtrace_dstate_fini(dtrace_dstate_t *dstate)
12483 {
12484         ASSERT(MUTEX_HELD(&cpu_lock));
12485 
12486         if (dstate->dtds_base == NULL)
12487                 return;
12488 
12489         kmem_free(dstate->dtds_base, dstate->dtds_size);
12490         kmem_cache_free(dtrace_state_cache, dstate->dtds_percpu);
12491 }
12492 
12493 static void
12494 dtrace_vstate_fini(dtrace_vstate_t *vstate)
12495 {
12496         /*
12497          * Logical XOR, where are you?
12498          */
12499         ASSERT((vstate->dtvs_nglobals == 0) ^ (vstate->dtvs_globals != NULL));
12500 
12501         if (vstate->dtvs_nglobals > 0) {
12502                 kmem_free(vstate->dtvs_globals, vstate->dtvs_nglobals *
12503                     sizeof (dtrace_statvar_t *));
12504         }
12505 
12506         if (vstate->dtvs_ntlocals > 0) {
12507                 kmem_free(vstate->dtvs_tlocals, vstate->dtvs_ntlocals *
12508                     sizeof (dtrace_difv_t));
12509         }
12510 
12511         ASSERT((vstate->dtvs_nlocals == 0) ^ (vstate->dtvs_locals != NULL));
12512 
12513         if (vstate->dtvs_nlocals > 0) {
12514                 kmem_free(vstate->dtvs_locals, vstate->dtvs_nlocals *
12515                     sizeof (dtrace_statvar_t *));
12516         }
12517 }
12518 
12519 static void
12520 dtrace_state_clean(dtrace_state_t *state)
12521 {
12522         if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
12523                 return;
12524 
12525         dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
12526         dtrace_speculation_clean(state);
12527 }
12528 
12529 static void
12530 dtrace_state_deadman(dtrace_state_t *state)
12531 {
12532         hrtime_t now;
12533 
12534         dtrace_sync();
12535 
12536         now = dtrace_gethrtime();
12537 
12538         if (state != dtrace_anon.dta_state &&
12539             now - state->dts_laststatus >= dtrace_deadman_user)
12540                 return;
12541 
12542         /*
12543          * We must be sure that dts_alive never appears to be less than the
12544          * value upon entry to dtrace_state_deadman(), and because we lack a
12545          * dtrace_cas64(), we cannot store to it atomically.  We thus instead
12546          * store INT64_MAX to it, followed by a memory barrier, followed by
12547          * the new value.  This assures that dts_alive never appears to be
12548          * less than its true value, regardless of the order in which the
12549          * stores to the underlying storage are issued.
12550          */
12551         state->dts_alive = INT64_MAX;
12552         dtrace_membar_producer();
12553         state->dts_alive = now;
12554 }
12555 
12556 dtrace_state_t *
12557 dtrace_state_create(dev_t *devp, cred_t *cr)
12558 {
12559         minor_t minor;
12560         major_t major;
12561         char c[30];
12562         dtrace_state_t *state;
12563         dtrace_optval_t *opt;
12564         int bufsize = NCPU * sizeof (dtrace_buffer_t), i;
12565 
12566         ASSERT(MUTEX_HELD(&dtrace_lock));
12567         ASSERT(MUTEX_HELD(&cpu_lock));
12568 
12569         minor = (minor_t)(uintptr_t)vmem_alloc(dtrace_minor, 1,
12570             VM_BESTFIT | VM_SLEEP);
12571 
12572         if (ddi_soft_state_zalloc(dtrace_softstate, minor) != DDI_SUCCESS) {
12573                 vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
12574                 return (NULL);
12575         }
12576 
12577         state = ddi_get_soft_state(dtrace_softstate, minor);
12578         state->dts_epid = DTRACE_EPIDNONE + 1;
12579 
12580         (void) snprintf(c, sizeof (c), "dtrace_aggid_%d", minor);
12581         state->dts_aggid_arena = vmem_create(c, (void *)1, UINT32_MAX, 1,
12582             NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
12583 
12584         if (devp != NULL) {
12585                 major = getemajor(*devp);
12586         } else {
12587                 major = ddi_driver_major(dtrace_devi);
12588         }
12589 
12590         state->dts_dev = makedevice(major, minor);
12591 
12592         if (devp != NULL)
12593                 *devp = state->dts_dev;
12594 
12595         /*
12596          * We allocate NCPU buffers.  On the one hand, this can be quite
12597          * a bit of memory per instance (nearly 36K on a Starcat).  On the
12598          * other hand, it saves an additional memory reference in the probe
12599          * path.
12600          */
12601         state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP);
12602         state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP);
12603         state->dts_cleaner = CYCLIC_NONE;
12604         state->dts_deadman = CYCLIC_NONE;
12605         state->dts_vstate.dtvs_state = state;
12606 
12607         for (i = 0; i < DTRACEOPT_MAX; i++)
12608                 state->dts_options[i] = DTRACEOPT_UNSET;
12609 
12610         /*
12611          * Set the default options.
12612          */
12613         opt = state->dts_options;
12614         opt[DTRACEOPT_BUFPOLICY] = DTRACEOPT_BUFPOLICY_SWITCH;
12615         opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_AUTO;
12616         opt[DTRACEOPT_NSPEC] = dtrace_nspec_default;
12617         opt[DTRACEOPT_SPECSIZE] = dtrace_specsize_default;
12618         opt[DTRACEOPT_CPU] = (dtrace_optval_t)DTRACE_CPUALL;
12619         opt[DTRACEOPT_STRSIZE] = dtrace_strsize_default;
12620         opt[DTRACEOPT_STACKFRAMES] = dtrace_stackframes_default;
12621         opt[DTRACEOPT_USTACKFRAMES] = dtrace_ustackframes_default;
12622         opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_default;
12623         opt[DTRACEOPT_AGGRATE] = dtrace_aggrate_default;
12624         opt[DTRACEOPT_SWITCHRATE] = dtrace_switchrate_default;
12625         opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_default;
12626         opt[DTRACEOPT_JSTACKFRAMES] = dtrace_jstackframes_default;
12627         opt[DTRACEOPT_JSTACKSTRSIZE] = dtrace_jstackstrsize_default;
12628 
12629         state->dts_activity = DTRACE_ACTIVITY_INACTIVE;
12630 
12631         /*
12632          * Depending on the user credentials, we set flag bits which alter probe
12633          * visibility or the amount of destructiveness allowed.  In the case of
12634          * actual anonymous tracing, or the possession of all privileges, all of
12635          * the normal checks are bypassed.
12636          */
12637         if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
12638                 state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
12639                 state->dts_cred.dcr_action = DTRACE_CRA_ALL;
12640         } else {
12641                 /*
12642                  * Set up the credentials for this instantiation.  We take a
12643                  * hold on the credential to prevent it from disappearing on
12644                  * us; this in turn prevents the zone_t referenced by this
12645                  * credential from disappearing.  This means that we can
12646                  * examine the credential and the zone from probe context.
12647                  */
12648                 crhold(cr);
12649                 state->dts_cred.dcr_cred = cr;
12650 
12651                 /*
12652                  * CRA_PROC means "we have *some* privilege for dtrace" and
12653                  * unlocks the use of variables like pid, zonename, etc.
12654                  */
12655                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE) ||
12656                     PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
12657                         state->dts_cred.dcr_action |= DTRACE_CRA_PROC;
12658                 }
12659 
12660                 /*
12661                  * dtrace_user allows use of syscall and profile providers.
12662                  * If the user also has proc_owner and/or proc_zone, we
12663                  * extend the scope to include additional visibility and
12664                  * destructive power.
12665                  */
12666                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE)) {
12667                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE)) {
12668                                 state->dts_cred.dcr_visible |=
12669                                     DTRACE_CRV_ALLPROC;
12670 
12671                                 state->dts_cred.dcr_action |=
12672                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
12673                         }
12674 
12675                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE)) {
12676                                 state->dts_cred.dcr_visible |=
12677                                     DTRACE_CRV_ALLZONE;
12678 
12679                                 state->dts_cred.dcr_action |=
12680                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
12681                         }
12682 
12683                         /*
12684                          * If we have all privs in whatever zone this is,
12685                          * we can do destructive things to processes which
12686                          * have altered credentials.
12687                          */
12688                         if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
12689                             cr->cr_zone->zone_privset)) {
12690                                 state->dts_cred.dcr_action |=
12691                                     DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
12692                         }
12693                 }
12694 
12695                 /*
12696                  * Holding the dtrace_kernel privilege also implies that
12697                  * the user has the dtrace_user privilege from a visibility
12698                  * perspective.  But without further privileges, some
12699                  * destructive actions are not available.
12700                  */
12701                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) {
12702                         /*
12703                          * Make all probes in all zones visible.  However,
12704                          * this doesn't mean that all actions become available
12705                          * to all zones.
12706                          */
12707                         state->dts_cred.dcr_visible |= DTRACE_CRV_KERNEL |
12708                             DTRACE_CRV_ALLPROC | DTRACE_CRV_ALLZONE;
12709 
12710                         state->dts_cred.dcr_action |= DTRACE_CRA_KERNEL |
12711                             DTRACE_CRA_PROC;
12712                         /*
12713                          * Holding proc_owner means that destructive actions
12714                          * for *this* zone are allowed.
12715                          */
12716                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
12717                                 state->dts_cred.dcr_action |=
12718                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
12719 
12720                         /*
12721                          * Holding proc_zone means that destructive actions
12722                          * for this user/group ID in all zones is allowed.
12723                          */
12724                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
12725                                 state->dts_cred.dcr_action |=
12726                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
12727 
12728                         /*
12729                          * If we have all privs in whatever zone this is,
12730                          * we can do destructive things to processes which
12731                          * have altered credentials.
12732                          */
12733                         if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
12734                             cr->cr_zone->zone_privset)) {
12735                                 state->dts_cred.dcr_action |=
12736                                     DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
12737                         }
12738                 }
12739 
12740                 /*
12741                  * Holding the dtrace_proc privilege gives control over fasttrap
12742                  * and pid providers.  We need to grant wider destructive
12743                  * privileges in the event that the user has proc_owner and/or
12744                  * proc_zone.
12745                  */
12746                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
12747                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
12748                                 state->dts_cred.dcr_action |=
12749                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
12750 
12751                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
12752                                 state->dts_cred.dcr_action |=
12753                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
12754                 }
12755         }
12756 
12757         return (state);
12758 }
12759 
12760 static int
12761 dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which)
12762 {
12763         dtrace_optval_t *opt = state->dts_options, size;
12764         processorid_t cpu;
12765         int flags = 0, rval, factor, divisor = 1;
12766 
12767         ASSERT(MUTEX_HELD(&dtrace_lock));
12768         ASSERT(MUTEX_HELD(&cpu_lock));
12769         ASSERT(which < DTRACEOPT_MAX);
12770         ASSERT(state->dts_activity == DTRACE_ACTIVITY_INACTIVE ||
12771             (state == dtrace_anon.dta_state &&
12772             state->dts_activity == DTRACE_ACTIVITY_ACTIVE));
12773 
12774         if (opt[which] == DTRACEOPT_UNSET || opt[which] == 0)
12775                 return (0);
12776 
12777         if (opt[DTRACEOPT_CPU] != DTRACEOPT_UNSET)
12778                 cpu = opt[DTRACEOPT_CPU];
12779 
12780         if (which == DTRACEOPT_SPECSIZE)
12781                 flags |= DTRACEBUF_NOSWITCH;
12782 
12783         if (which == DTRACEOPT_BUFSIZE) {
12784                 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_RING)
12785                         flags |= DTRACEBUF_RING;
12786 
12787                 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_FILL)
12788                         flags |= DTRACEBUF_FILL;
12789 
12790                 if (state != dtrace_anon.dta_state ||
12791                     state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
12792                         flags |= DTRACEBUF_INACTIVE;
12793         }
12794 
12795         for (size = opt[which]; size >= sizeof (uint64_t); size /= divisor) {
12796                 /*
12797                  * The size must be 8-byte aligned.  If the size is not 8-byte
12798                  * aligned, drop it down by the difference.
12799                  */
12800                 if (size & (sizeof (uint64_t) - 1))
12801                         size -= size & (sizeof (uint64_t) - 1);
12802 
12803                 if (size < state->dts_reserve) {
12804                         /*
12805                          * Buffers always must be large enough to accommodate
12806                          * their prereserved space.  We return E2BIG instead
12807                          * of ENOMEM in this case to allow for user-level
12808                          * software to differentiate the cases.
12809                          */
12810                         return (E2BIG);
12811                 }
12812 
12813                 rval = dtrace_buffer_alloc(buf, size, flags, cpu, &factor);
12814 
12815                 if (rval != ENOMEM) {
12816                         opt[which] = size;
12817                         return (rval);
12818                 }
12819 
12820                 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
12821                         return (rval);
12822 
12823                 for (divisor = 2; divisor < factor; divisor <<= 1)
12824                         continue;
12825         }
12826 
12827         return (ENOMEM);
12828 }
12829 
12830 static int
12831 dtrace_state_buffers(dtrace_state_t *state)
12832 {
12833         dtrace_speculation_t *spec = state->dts_speculations;
12834         int rval, i;
12835 
12836         if ((rval = dtrace_state_buffer(state, state->dts_buffer,
12837             DTRACEOPT_BUFSIZE)) != 0)
12838                 return (rval);
12839 
12840         if ((rval = dtrace_state_buffer(state, state->dts_aggbuffer,
12841             DTRACEOPT_AGGSIZE)) != 0)
12842                 return (rval);
12843 
12844         for (i = 0; i < state->dts_nspeculations; i++) {
12845                 if ((rval = dtrace_state_buffer(state,
12846                     spec[i].dtsp_buffer, DTRACEOPT_SPECSIZE)) != 0)
12847                         return (rval);
12848         }
12849 
12850         return (0);
12851 }
12852 
12853 static void
12854 dtrace_state_prereserve(dtrace_state_t *state)
12855 {
12856         dtrace_ecb_t *ecb;
12857         dtrace_probe_t *probe;
12858 
12859         state->dts_reserve = 0;
12860 
12861         if (state->dts_options[DTRACEOPT_BUFPOLICY] != DTRACEOPT_BUFPOLICY_FILL)
12862                 return;
12863 
12864         /*
12865          * If our buffer policy is a "fill" buffer policy, we need to set the
12866          * prereserved space to be the space required by the END probes.
12867          */
12868         probe = dtrace_probes[dtrace_probeid_end - 1];
12869         ASSERT(probe != NULL);
12870 
12871         for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
12872                 if (ecb->dte_state != state)
12873                         continue;
12874 
12875                 state->dts_reserve += ecb->dte_needed + ecb->dte_alignment;
12876         }
12877 }
12878 
12879 static int
12880 dtrace_state_go(dtrace_state_t *state, processorid_t *cpu)
12881 {
12882         dtrace_optval_t *opt = state->dts_options, sz, nspec;
12883         dtrace_speculation_t *spec;
12884         dtrace_buffer_t *buf;
12885         cyc_handler_t hdlr;
12886         cyc_time_t when;
12887         int rval = 0, i, bufsize = NCPU * sizeof (dtrace_buffer_t);
12888         dtrace_icookie_t cookie;
12889 
12890         mutex_enter(&cpu_lock);
12891         mutex_enter(&dtrace_lock);
12892 
12893         if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
12894                 rval = EBUSY;
12895                 goto out;
12896         }
12897 
12898         /*
12899          * Before we can perform any checks, we must prime all of the
12900          * retained enablings that correspond to this state.
12901          */
12902         dtrace_enabling_prime(state);
12903 
12904         if (state->dts_destructive && !state->dts_cred.dcr_destructive) {
12905                 rval = EACCES;
12906                 goto out;
12907         }
12908 
12909         dtrace_state_prereserve(state);
12910 
12911         /*
12912          * Now we want to do is try to allocate our speculations.
12913          * We do not automatically resize the number of speculations; if
12914          * this fails, we will fail the operation.
12915          */
12916         nspec = opt[DTRACEOPT_NSPEC];
12917         ASSERT(nspec != DTRACEOPT_UNSET);
12918 
12919         if (nspec > INT_MAX) {
12920                 rval = ENOMEM;
12921                 goto out;
12922         }
12923 
12924         spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t),
12925             KM_NOSLEEP | KM_NORMALPRI);
12926 
12927         if (spec == NULL) {
12928                 rval = ENOMEM;
12929                 goto out;
12930         }
12931 
12932         state->dts_speculations = spec;
12933         state->dts_nspeculations = (int)nspec;
12934 
12935         for (i = 0; i < nspec; i++) {
12936                 if ((buf = kmem_zalloc(bufsize,
12937                     KM_NOSLEEP | KM_NORMALPRI)) == NULL) {
12938                         rval = ENOMEM;
12939                         goto err;
12940                 }
12941 
12942                 spec[i].dtsp_buffer = buf;
12943         }
12944 
12945         if (opt[DTRACEOPT_GRABANON] != DTRACEOPT_UNSET) {
12946                 if (dtrace_anon.dta_state == NULL) {
12947                         rval = ENOENT;
12948                         goto out;
12949                 }
12950 
12951                 if (state->dts_necbs != 0) {
12952                         rval = EALREADY;
12953                         goto out;
12954                 }
12955 
12956                 state->dts_anon = dtrace_anon_grab();
12957                 ASSERT(state->dts_anon != NULL);
12958                 state = state->dts_anon;
12959 
12960                 /*
12961                  * We want "grabanon" to be set in the grabbed state, so we'll
12962                  * copy that option value from the grabbing state into the
12963                  * grabbed state.
12964                  */
12965                 state->dts_options[DTRACEOPT_GRABANON] =
12966                     opt[DTRACEOPT_GRABANON];
12967 
12968                 *cpu = dtrace_anon.dta_beganon;
12969 
12970                 /*
12971                  * If the anonymous state is active (as it almost certainly
12972                  * is if the anonymous enabling ultimately matched anything),
12973                  * we don't allow any further option processing -- but we
12974                  * don't return failure.
12975                  */
12976                 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
12977                         goto out;
12978         }
12979 
12980         if (opt[DTRACEOPT_AGGSIZE] != DTRACEOPT_UNSET &&
12981             opt[DTRACEOPT_AGGSIZE] != 0) {
12982                 if (state->dts_aggregations == NULL) {
12983                         /*
12984                          * We're not going to create an aggregation buffer
12985                          * because we don't have any ECBs that contain
12986                          * aggregations -- set this option to 0.
12987                          */
12988                         opt[DTRACEOPT_AGGSIZE] = 0;
12989                 } else {
12990                         /*
12991                          * If we have an aggregation buffer, we must also have
12992                          * a buffer to use as scratch.
12993                          */
12994                         if (opt[DTRACEOPT_BUFSIZE] == DTRACEOPT_UNSET ||
12995                             opt[DTRACEOPT_BUFSIZE] < state->dts_needed) {
12996                                 opt[DTRACEOPT_BUFSIZE] = state->dts_needed;
12997                         }
12998                 }
12999         }
13000 
13001         if (opt[DTRACEOPT_SPECSIZE] != DTRACEOPT_UNSET &&
13002             opt[DTRACEOPT_SPECSIZE] != 0) {
13003                 if (!state->dts_speculates) {
13004                         /*
13005                          * We're not going to create speculation buffers
13006                          * because we don't have any ECBs that actually
13007                          * speculate -- set the speculation size to 0.
13008                          */
13009                         opt[DTRACEOPT_SPECSIZE] = 0;
13010                 }
13011         }
13012 
13013         /*
13014          * The bare minimum size for any buffer that we're actually going to
13015          * do anything to is sizeof (uint64_t).
13016          */
13017         sz = sizeof (uint64_t);
13018 
13019         if ((state->dts_needed != 0 && opt[DTRACEOPT_BUFSIZE] < sz) ||
13020             (state->dts_speculates && opt[DTRACEOPT_SPECSIZE] < sz) ||
13021             (state->dts_aggregations != NULL && opt[DTRACEOPT_AGGSIZE] < sz)) {
13022                 /*
13023                  * A buffer size has been explicitly set to 0 (or to a size
13024                  * that will be adjusted to 0) and we need the space -- we
13025                  * need to return failure.  We return ENOSPC to differentiate
13026                  * it from failing to allocate a buffer due to failure to meet
13027                  * the reserve (for which we return E2BIG).
13028                  */
13029                 rval = ENOSPC;
13030                 goto out;
13031         }
13032 
13033         if ((rval = dtrace_state_buffers(state)) != 0)
13034                 goto err;
13035 
13036         if ((sz = opt[DTRACEOPT_DYNVARSIZE]) == DTRACEOPT_UNSET)
13037                 sz = dtrace_dstate_defsize;
13038 
13039         do {
13040                 rval = dtrace_dstate_init(&state->dts_vstate.dtvs_dynvars, sz);
13041 
13042                 if (rval == 0)
13043                         break;
13044 
13045                 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
13046                         goto err;
13047         } while (sz >>= 1);
13048 
13049         opt[DTRACEOPT_DYNVARSIZE] = sz;
13050 
13051         if (rval != 0)
13052                 goto err;
13053 
13054         if (opt[DTRACEOPT_STATUSRATE] > dtrace_statusrate_max)
13055                 opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_max;
13056 
13057         if (opt[DTRACEOPT_CLEANRATE] == 0)
13058                 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
13059 
13060         if (opt[DTRACEOPT_CLEANRATE] < dtrace_cleanrate_min)
13061                 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_min;
13062 
13063         if (opt[DTRACEOPT_CLEANRATE] > dtrace_cleanrate_max)
13064                 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
13065 
13066         hdlr.cyh_func = (cyc_func_t)dtrace_state_clean;
13067         hdlr.cyh_arg = state;
13068         hdlr.cyh_level = CY_LOW_LEVEL;
13069 
13070         when.cyt_when = 0;
13071         when.cyt_interval = opt[DTRACEOPT_CLEANRATE];
13072 
13073         state->dts_cleaner = cyclic_add(&hdlr, &when);
13074 
13075         hdlr.cyh_func = (cyc_func_t)dtrace_state_deadman;
13076         hdlr.cyh_arg = state;
13077         hdlr.cyh_level = CY_LOW_LEVEL;
13078 
13079         when.cyt_when = 0;
13080         when.cyt_interval = dtrace_deadman_interval;
13081 
13082         state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
13083         state->dts_deadman = cyclic_add(&hdlr, &when);
13084 
13085         state->dts_activity = DTRACE_ACTIVITY_WARMUP;
13086 
13087         /*
13088          * Now it's time to actually fire the BEGIN probe.  We need to disable
13089          * interrupts here both to record the CPU on which we fired the BEGIN
13090          * probe (the data from this CPU will be processed first at user
13091          * level) and to manually activate the buffer for this CPU.
13092          */
13093         cookie = dtrace_interrupt_disable();
13094         *cpu = CPU->cpu_id;
13095         ASSERT(state->dts_buffer[*cpu].dtb_flags & DTRACEBUF_INACTIVE);
13096         state->dts_buffer[*cpu].dtb_flags &= ~DTRACEBUF_INACTIVE;
13097 
13098         dtrace_probe(dtrace_probeid_begin,
13099             (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
13100         dtrace_interrupt_enable(cookie);
13101         /*
13102          * We may have had an exit action from a BEGIN probe; only change our
13103          * state to ACTIVE if we're still in WARMUP.
13104          */
13105         ASSERT(state->dts_activity == DTRACE_ACTIVITY_WARMUP ||
13106             state->dts_activity == DTRACE_ACTIVITY_DRAINING);
13107 
13108         if (state->dts_activity == DTRACE_ACTIVITY_WARMUP)
13109                 state->dts_activity = DTRACE_ACTIVITY_ACTIVE;
13110 
13111         /*
13112          * Regardless of whether or not now we're in ACTIVE or DRAINING, we
13113          * want each CPU to transition its principal buffer out of the
13114          * INACTIVE state.  Doing this assures that no CPU will suddenly begin
13115          * processing an ECB halfway down a probe's ECB chain; all CPUs will
13116          * atomically transition from processing none of a state's ECBs to
13117          * processing all of them.
13118          */
13119         dtrace_xcall(DTRACE_CPUALL,
13120             (dtrace_xcall_t)dtrace_buffer_activate, state);
13121         goto out;
13122 
13123 err:
13124         dtrace_buffer_free(state->dts_buffer);
13125         dtrace_buffer_free(state->dts_aggbuffer);
13126 
13127         if ((nspec = state->dts_nspeculations) == 0) {
13128                 ASSERT(state->dts_speculations == NULL);
13129                 goto out;
13130         }
13131 
13132         spec = state->dts_speculations;
13133         ASSERT(spec != NULL);
13134 
13135         for (i = 0; i < state->dts_nspeculations; i++) {
13136                 if ((buf = spec[i].dtsp_buffer) == NULL)
13137                         break;
13138 
13139                 dtrace_buffer_free(buf);
13140                 kmem_free(buf, bufsize);
13141         }
13142 
13143         kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
13144         state->dts_nspeculations = 0;
13145         state->dts_speculations = NULL;
13146 
13147 out:
13148         mutex_exit(&dtrace_lock);
13149         mutex_exit(&cpu_lock);
13150 
13151         return (rval);
13152 }
13153 
13154 static int
13155 dtrace_state_stop(dtrace_state_t *state, processorid_t *cpu)
13156 {
13157         dtrace_icookie_t cookie;
13158 
13159         ASSERT(MUTEX_HELD(&dtrace_lock));
13160 
13161         if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE &&
13162             state->dts_activity != DTRACE_ACTIVITY_DRAINING)
13163                 return (EINVAL);
13164 
13165         /*
13166          * We'll set the activity to DTRACE_ACTIVITY_DRAINING, and issue a sync
13167          * to be sure that every CPU has seen it.  See below for the details
13168          * on why this is done.
13169          */
13170         state->dts_activity = DTRACE_ACTIVITY_DRAINING;
13171         dtrace_sync();
13172 
13173         /*
13174          * By this point, it is impossible for any CPU to be still processing
13175          * with DTRACE_ACTIVITY_ACTIVE.  We can thus set our activity to
13176          * DTRACE_ACTIVITY_COOLDOWN and know that we're not racing with any
13177          * other CPU in dtrace_buffer_reserve().  This allows dtrace_probe()
13178          * and callees to know that the activity is DTRACE_ACTIVITY_COOLDOWN
13179          * iff we're in the END probe.
13180          */
13181         state->dts_activity = DTRACE_ACTIVITY_COOLDOWN;
13182         dtrace_sync();
13183         ASSERT(state->dts_activity == DTRACE_ACTIVITY_COOLDOWN);
13184 
13185         /*
13186          * Finally, we can release the reserve and call the END probe.  We
13187          * disable interrupts across calling the END probe to allow us to
13188          * return the CPU on which we actually called the END probe.  This
13189          * allows user-land to be sure that this CPU's principal buffer is
13190          * processed last.
13191          */
13192         state->dts_reserve = 0;
13193 
13194         cookie = dtrace_interrupt_disable();
13195         *cpu = CPU->cpu_id;
13196         dtrace_probe(dtrace_probeid_end,
13197             (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
13198         dtrace_interrupt_enable(cookie);
13199 
13200         state->dts_activity = DTRACE_ACTIVITY_STOPPED;
13201         dtrace_sync();
13202 
13203         return (0);
13204 }
13205 
13206 static int
13207 dtrace_state_option(dtrace_state_t *state, dtrace_optid_t option,
13208     dtrace_optval_t val)
13209 {
13210         ASSERT(MUTEX_HELD(&dtrace_lock));
13211 
13212         if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
13213                 return (EBUSY);
13214 
13215         if (option >= DTRACEOPT_MAX)
13216                 return (EINVAL);
13217 
13218         if (option != DTRACEOPT_CPU && val < 0)
13219                 return (EINVAL);
13220 
13221         switch (option) {
13222         case DTRACEOPT_DESTRUCTIVE:
13223                 if (dtrace_destructive_disallow)
13224                         return (EACCES);
13225 
13226                 state->dts_cred.dcr_destructive = 1;
13227                 break;
13228 
13229         case DTRACEOPT_BUFSIZE:
13230         case DTRACEOPT_DYNVARSIZE:
13231         case DTRACEOPT_AGGSIZE:
13232         case DTRACEOPT_SPECSIZE:
13233         case DTRACEOPT_STRSIZE:
13234                 if (val < 0)
13235                         return (EINVAL);
13236 
13237                 if (val >= LONG_MAX) {
13238                         /*
13239                          * If this is an otherwise negative value, set it to
13240                          * the highest multiple of 128m less than LONG_MAX.
13241                          * Technically, we're adjusting the size without
13242                          * regard to the buffer resizing policy, but in fact,
13243                          * this has no effect -- if we set the buffer size to
13244                          * ~LONG_MAX and the buffer policy is ultimately set to
13245                          * be "manual", the buffer allocation is guaranteed to
13246                          * fail, if only because the allocation requires two
13247                          * buffers.  (We set the the size to the highest
13248                          * multiple of 128m because it ensures that the size
13249                          * will remain a multiple of a megabyte when
13250                          * repeatedly halved -- all the way down to 15m.)
13251                          */
13252                         val = LONG_MAX - (1 << 27) + 1;
13253                 }
13254         }
13255 
13256         state->dts_options[option] = val;
13257 
13258         return (0);
13259 }
13260 
13261 static void
13262 dtrace_state_destroy(dtrace_state_t *state)
13263 {
13264         dtrace_ecb_t *ecb;
13265         dtrace_vstate_t *vstate = &state->dts_vstate;
13266         minor_t minor = getminor(state->dts_dev);
13267         int i, bufsize = NCPU * sizeof (dtrace_buffer_t);
13268         dtrace_speculation_t *spec = state->dts_speculations;
13269         int nspec = state->dts_nspeculations;
13270         uint32_t match;
13271 
13272         ASSERT(MUTEX_HELD(&dtrace_lock));
13273         ASSERT(MUTEX_HELD(&cpu_lock));
13274 
13275         /*
13276          * First, retract any retained enablings for this state.
13277          */
13278         dtrace_enabling_retract(state);
13279         ASSERT(state->dts_nretained == 0);
13280 
13281         if (state->dts_activity == DTRACE_ACTIVITY_ACTIVE ||
13282             state->dts_activity == DTRACE_ACTIVITY_DRAINING) {
13283                 /*
13284                  * We have managed to come into dtrace_state_destroy() on a
13285                  * hot enabling -- almost certainly because of a disorderly
13286                  * shutdown of a consumer.  (That is, a consumer that is
13287                  * exiting without having called dtrace_stop().) In this case,
13288                  * we're going to set our activity to be KILLED, and then
13289                  * issue a sync to be sure that everyone is out of probe
13290                  * context before we start blowing away ECBs.
13291                  */
13292                 state->dts_activity = DTRACE_ACTIVITY_KILLED;
13293                 dtrace_sync();
13294         }
13295 
13296         /*
13297          * Release the credential hold we took in dtrace_state_create().
13298          */
13299         if (state->dts_cred.dcr_cred != NULL)
13300                 crfree(state->dts_cred.dcr_cred);
13301 
13302         /*
13303          * Now we can safely disable and destroy any enabled probes.  Because
13304          * any DTRACE_PRIV_KERNEL probes may actually be slowing our progress
13305          * (especially if they're all enabled), we take two passes through the
13306          * ECBs:  in the first, we disable just DTRACE_PRIV_KERNEL probes, and
13307          * in the second we disable whatever is left over.
13308          */
13309         for (match = DTRACE_PRIV_KERNEL; ; match = 0) {
13310                 for (i = 0; i < state->dts_necbs; i++) {
13311                         if ((ecb = state->dts_ecbs[i]) == NULL)
13312                                 continue;
13313 
13314                         if (match && ecb->dte_probe != NULL) {
13315                                 dtrace_probe_t *probe = ecb->dte_probe;
13316                                 dtrace_provider_t *prov = probe->dtpr_provider;
13317 
13318                                 if (!(prov->dtpv_priv.dtpp_flags & match))
13319                                         continue;
13320                         }
13321 
13322                         dtrace_ecb_disable(ecb);
13323                         dtrace_ecb_destroy(ecb);
13324                 }
13325 
13326                 if (!match)
13327                         break;
13328         }
13329 
13330         /*
13331          * Before we free the buffers, perform one more sync to assure that
13332          * every CPU is out of probe context.
13333          */
13334         dtrace_sync();
13335 
13336         dtrace_buffer_free(state->dts_buffer);
13337         dtrace_buffer_free(state->dts_aggbuffer);
13338 
13339         for (i = 0; i < nspec; i++)
13340                 dtrace_buffer_free(spec[i].dtsp_buffer);
13341 
13342         if (state->dts_cleaner != CYCLIC_NONE)
13343                 cyclic_remove(state->dts_cleaner);
13344 
13345         if (state->dts_deadman != CYCLIC_NONE)
13346                 cyclic_remove(state->dts_deadman);
13347 
13348         dtrace_dstate_fini(&vstate->dtvs_dynvars);
13349         dtrace_vstate_fini(vstate);
13350         kmem_free(state->dts_ecbs, state->dts_necbs * sizeof (dtrace_ecb_t *));
13351 
13352         if (state->dts_aggregations != NULL) {
13353 #ifdef DEBUG
13354                 for (i = 0; i < state->dts_naggregations; i++)
13355                         ASSERT(state->dts_aggregations[i] == NULL);
13356 #endif
13357                 ASSERT(state->dts_naggregations > 0);
13358                 kmem_free(state->dts_aggregations,
13359                     state->dts_naggregations * sizeof (dtrace_aggregation_t *));
13360         }
13361 
13362         kmem_free(state->dts_buffer, bufsize);
13363         kmem_free(state->dts_aggbuffer, bufsize);
13364 
13365         for (i = 0; i < nspec; i++)
13366                 kmem_free(spec[i].dtsp_buffer, bufsize);
13367 
13368         kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
13369 
13370         dtrace_format_destroy(state);
13371 
13372         vmem_destroy(state->dts_aggid_arena);
13373         ddi_soft_state_free(dtrace_softstate, minor);
13374         vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
13375 }
13376 
13377 /*
13378  * DTrace Anonymous Enabling Functions
13379  */
13380 static dtrace_state_t *
13381 dtrace_anon_grab(void)
13382 {
13383         dtrace_state_t *state;
13384 
13385         ASSERT(MUTEX_HELD(&dtrace_lock));
13386 
13387         if ((state = dtrace_anon.dta_state) == NULL) {
13388                 ASSERT(dtrace_anon.dta_enabling == NULL);
13389                 return (NULL);
13390         }
13391 
13392         ASSERT(dtrace_anon.dta_enabling != NULL);
13393         ASSERT(dtrace_retained != NULL);
13394 
13395         dtrace_enabling_destroy(dtrace_anon.dta_enabling);
13396         dtrace_anon.dta_enabling = NULL;
13397         dtrace_anon.dta_state = NULL;
13398 
13399         return (state);
13400 }
13401 
13402 static void
13403 dtrace_anon_property(void)
13404 {
13405         int i, rv;
13406         dtrace_state_t *state;
13407         dof_hdr_t *dof;
13408         char c[32];             /* enough for "dof-data-" + digits */
13409 
13410         ASSERT(MUTEX_HELD(&dtrace_lock));
13411         ASSERT(MUTEX_HELD(&cpu_lock));
13412 
13413         for (i = 0; ; i++) {
13414                 (void) snprintf(c, sizeof (c), "dof-data-%d", i);
13415 
13416                 dtrace_err_verbose = 1;
13417 
13418                 if ((dof = dtrace_dof_property(c)) == NULL) {
13419                         dtrace_err_verbose = 0;
13420                         break;
13421                 }
13422 
13423                 /*
13424                  * We want to create anonymous state, so we need to transition
13425                  * the kernel debugger to indicate that DTrace is active.  If
13426                  * this fails (e.g. because the debugger has modified text in
13427                  * some way), we won't continue with the processing.
13428                  */
13429                 if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
13430                         cmn_err(CE_NOTE, "kernel debugger active; anonymous "
13431                             "enabling ignored.");
13432                         dtrace_dof_destroy(dof);
13433                         break;
13434                 }
13435 
13436                 /*
13437                  * If we haven't allocated an anonymous state, we'll do so now.
13438                  */
13439                 if ((state = dtrace_anon.dta_state) == NULL) {
13440                         state = dtrace_state_create(NULL, NULL);
13441                         dtrace_anon.dta_state = state;
13442 
13443                         if (state == NULL) {
13444                                 /*
13445                                  * This basically shouldn't happen:  the only
13446                                  * failure mode from dtrace_state_create() is a
13447                                  * failure of ddi_soft_state_zalloc() that
13448                                  * itself should never happen.  Still, the
13449                                  * interface allows for a failure mode, and
13450                                  * we want to fail as gracefully as possible:
13451                                  * we'll emit an error message and cease
13452                                  * processing anonymous state in this case.
13453                                  */
13454                                 cmn_err(CE_WARN, "failed to create "
13455                                     "anonymous state");
13456                                 dtrace_dof_destroy(dof);
13457                                 break;
13458                         }
13459                 }
13460 
13461                 rv = dtrace_dof_slurp(dof, &state->dts_vstate, CRED(),
13462                     &dtrace_anon.dta_enabling, 0, B_TRUE);
13463 
13464                 if (rv == 0)
13465                         rv = dtrace_dof_options(dof, state);
13466 
13467                 dtrace_err_verbose = 0;
13468                 dtrace_dof_destroy(dof);
13469 
13470                 if (rv != 0) {
13471                         /*
13472                          * This is malformed DOF; chuck any anonymous state
13473                          * that we created.
13474                          */
13475                         ASSERT(dtrace_anon.dta_enabling == NULL);
13476                         dtrace_state_destroy(state);
13477                         dtrace_anon.dta_state = NULL;
13478                         break;
13479                 }
13480 
13481                 ASSERT(dtrace_anon.dta_enabling != NULL);
13482         }
13483 
13484         if (dtrace_anon.dta_enabling != NULL) {
13485                 int rval;
13486 
13487                 /*
13488                  * dtrace_enabling_retain() can only fail because we are
13489                  * trying to retain more enablings than are allowed -- but
13490                  * we only have one anonymous enabling, and we are guaranteed
13491                  * to be allowed at least one retained enabling; we assert
13492                  * that dtrace_enabling_retain() returns success.
13493                  */
13494                 rval = dtrace_enabling_retain(dtrace_anon.dta_enabling);
13495                 ASSERT(rval == 0);
13496 
13497                 dtrace_enabling_dump(dtrace_anon.dta_enabling);
13498         }
13499 }
13500 
13501 /*
13502  * DTrace Helper Functions
13503  */
13504 static void
13505 dtrace_helper_trace(dtrace_helper_action_t *helper,
13506     dtrace_mstate_t *mstate, dtrace_vstate_t *vstate, int where)
13507 {
13508         uint32_t size, next, nnext, i;
13509         dtrace_helptrace_t *ent;
13510         uint16_t flags = cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
13511 
13512         if (!dtrace_helptrace_enabled)
13513                 return;
13514 
13515         ASSERT(vstate->dtvs_nlocals <= dtrace_helptrace_nlocals);
13516 
13517         /*
13518          * What would a tracing framework be without its own tracing
13519          * framework?  (Well, a hell of a lot simpler, for starters...)
13520          */
13521         size = sizeof (dtrace_helptrace_t) + dtrace_helptrace_nlocals *
13522             sizeof (uint64_t) - sizeof (uint64_t);
13523 
13524         /*
13525          * Iterate until we can allocate a slot in the trace buffer.
13526          */
13527         do {
13528                 next = dtrace_helptrace_next;
13529 
13530                 if (next + size < dtrace_helptrace_bufsize) {
13531                         nnext = next + size;
13532                 } else {
13533                         nnext = size;
13534                 }
13535         } while (dtrace_cas32(&dtrace_helptrace_next, next, nnext) != next);
13536 
13537         /*
13538          * We have our slot; fill it in.
13539          */
13540         if (nnext == size)
13541                 next = 0;
13542 
13543         ent = (dtrace_helptrace_t *)&dtrace_helptrace_buffer[next];
13544         ent->dtht_helper = helper;
13545         ent->dtht_where = where;
13546         ent->dtht_nlocals = vstate->dtvs_nlocals;
13547 
13548         ent->dtht_fltoffs = (mstate->dtms_present & DTRACE_MSTATE_FLTOFFS) ?
13549             mstate->dtms_fltoffs : -1;
13550         ent->dtht_fault = DTRACE_FLAGS2FLT(flags);
13551         ent->dtht_illval = cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
13552 
13553         for (i = 0; i < vstate->dtvs_nlocals; i++) {
13554                 dtrace_statvar_t *svar;
13555 
13556                 if ((svar = vstate->dtvs_locals[i]) == NULL)
13557                         continue;
13558 
13559                 ASSERT(svar->dtsv_size >= NCPU * sizeof (uint64_t));
13560                 ent->dtht_locals[i] =
13561                     ((uint64_t *)(uintptr_t)svar->dtsv_data)[CPU->cpu_id];
13562         }
13563 }
13564 
13565 static uint64_t
13566 dtrace_helper(int which, dtrace_mstate_t *mstate,
13567     dtrace_state_t *state, uint64_t arg0, uint64_t arg1)
13568 {
13569         uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
13570         uint64_t sarg0 = mstate->dtms_arg[0];
13571         uint64_t sarg1 = mstate->dtms_arg[1];
13572         uint64_t rval;
13573         dtrace_helpers_t *helpers = curproc->p_dtrace_helpers;
13574         dtrace_helper_action_t *helper;
13575         dtrace_vstate_t *vstate;
13576         dtrace_difo_t *pred;
13577         int i, trace = dtrace_helptrace_enabled;
13578 
13579         ASSERT(which >= 0 && which < DTRACE_NHELPER_ACTIONS);
13580 
13581         if (helpers == NULL)
13582                 return (0);
13583 
13584         if ((helper = helpers->dthps_actions[which]) == NULL)
13585                 return (0);
13586 
13587         vstate = &helpers->dthps_vstate;
13588         mstate->dtms_arg[0] = arg0;
13589         mstate->dtms_arg[1] = arg1;
13590 
13591         /*
13592          * Now iterate over each helper.  If its predicate evaluates to 'true',
13593          * we'll call the corresponding actions.  Note that the below calls
13594          * to dtrace_dif_emulate() may set faults in machine state.  This is
13595          * okay:  our caller (the outer dtrace_dif_emulate()) will simply plow
13596          * the stored DIF offset with its own (which is the desired behavior).
13597          * Also, note the calls to dtrace_dif_emulate() may allocate scratch
13598          * from machine state; this is okay, too.
13599          */
13600         for (; helper != NULL; helper = helper->dtha_next) {
13601                 if ((pred = helper->dtha_predicate) != NULL) {
13602                         if (trace)
13603                                 dtrace_helper_trace(helper, mstate, vstate, 0);
13604 
13605                         if (!dtrace_dif_emulate(pred, mstate, vstate, state))
13606                                 goto next;
13607 
13608                         if (*flags & CPU_DTRACE_FAULT)
13609                                 goto err;
13610                 }
13611 
13612                 for (i = 0; i < helper->dtha_nactions; i++) {
13613                         if (trace)
13614                                 dtrace_helper_trace(helper,
13615                                     mstate, vstate, i + 1);
13616 
13617                         rval = dtrace_dif_emulate(helper->dtha_actions[i],
13618                             mstate, vstate, state);
13619 
13620                         if (*flags & CPU_DTRACE_FAULT)
13621                                 goto err;
13622                 }
13623 
13624 next:
13625                 if (trace)
13626                         dtrace_helper_trace(helper, mstate, vstate,
13627                             DTRACE_HELPTRACE_NEXT);
13628         }
13629 
13630         if (trace)
13631                 dtrace_helper_trace(helper, mstate, vstate,
13632                     DTRACE_HELPTRACE_DONE);
13633 
13634         /*
13635          * Restore the arg0 that we saved upon entry.
13636          */
13637         mstate->dtms_arg[0] = sarg0;
13638         mstate->dtms_arg[1] = sarg1;
13639 
13640         return (rval);
13641 
13642 err:
13643         if (trace)
13644                 dtrace_helper_trace(helper, mstate, vstate,
13645                     DTRACE_HELPTRACE_ERR);
13646 
13647         /*
13648          * Restore the arg0 that we saved upon entry.
13649          */
13650         mstate->dtms_arg[0] = sarg0;
13651         mstate->dtms_arg[1] = sarg1;
13652 
13653         return (NULL);
13654 }
13655 
13656 static void
13657 dtrace_helper_action_destroy(dtrace_helper_action_t *helper,
13658     dtrace_vstate_t *vstate)
13659 {
13660         int i;
13661 
13662         if (helper->dtha_predicate != NULL)
13663                 dtrace_difo_release(helper->dtha_predicate, vstate);
13664 
13665         for (i = 0; i < helper->dtha_nactions; i++) {
13666                 ASSERT(helper->dtha_actions[i] != NULL);
13667                 dtrace_difo_release(helper->dtha_actions[i], vstate);
13668         }
13669 
13670         kmem_free(helper->dtha_actions,
13671             helper->dtha_nactions * sizeof (dtrace_difo_t *));
13672         kmem_free(helper, sizeof (dtrace_helper_action_t));
13673 }
13674 
13675 static int
13676 dtrace_helper_destroygen(int gen)
13677 {
13678         proc_t *p = curproc;
13679         dtrace_helpers_t *help = p->p_dtrace_helpers;
13680         dtrace_vstate_t *vstate;
13681         int i;
13682 
13683         ASSERT(MUTEX_HELD(&dtrace_lock));
13684 
13685         if (help == NULL || gen > help->dthps_generation)
13686                 return (EINVAL);
13687 
13688         vstate = &help->dthps_vstate;
13689 
13690         for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
13691                 dtrace_helper_action_t *last = NULL, *h, *next;
13692 
13693                 for (h = help->dthps_actions[i]; h != NULL; h = next) {
13694                         next = h->dtha_next;
13695 
13696                         if (h->dtha_generation == gen) {
13697                                 if (last != NULL) {
13698                                         last->dtha_next = next;
13699                                 } else {
13700                                         help->dthps_actions[i] = next;
13701                                 }
13702 
13703                                 dtrace_helper_action_destroy(h, vstate);
13704                         } else {
13705                                 last = h;
13706                         }
13707                 }
13708         }
13709 
13710         /*
13711          * Interate until we've cleared out all helper providers with the
13712          * given generation number.
13713          */
13714         for (;;) {
13715                 dtrace_helper_provider_t *prov;
13716 
13717                 /*
13718                  * Look for a helper provider with the right generation. We
13719                  * have to start back at the beginning of the list each time
13720                  * because we drop dtrace_lock. It's unlikely that we'll make
13721                  * more than two passes.
13722                  */
13723                 for (i = 0; i < help->dthps_nprovs; i++) {
13724                         prov = help->dthps_provs[i];
13725 
13726                         if (prov->dthp_generation == gen)
13727                                 break;
13728                 }
13729 
13730                 /*
13731                  * If there were no matches, we're done.
13732                  */
13733                 if (i == help->dthps_nprovs)
13734                         break;
13735 
13736                 /*
13737                  * Move the last helper provider into this slot.
13738                  */
13739                 help->dthps_nprovs--;
13740                 help->dthps_provs[i] = help->dthps_provs[help->dthps_nprovs];
13741                 help->dthps_provs[help->dthps_nprovs] = NULL;
13742 
13743                 mutex_exit(&dtrace_lock);
13744 
13745                 /*
13746                  * If we have a meta provider, remove this helper provider.
13747                  */
13748                 mutex_enter(&dtrace_meta_lock);
13749                 if (dtrace_meta_pid != NULL) {
13750                         ASSERT(dtrace_deferred_pid == NULL);
13751                         dtrace_helper_provider_remove(&prov->dthp_prov,
13752                             p->p_pid);
13753                 }
13754                 mutex_exit(&dtrace_meta_lock);
13755 
13756                 dtrace_helper_provider_destroy(prov);
13757 
13758                 mutex_enter(&dtrace_lock);
13759         }
13760 
13761         return (0);
13762 }
13763 
13764 static int
13765 dtrace_helper_validate(dtrace_helper_action_t *helper)
13766 {
13767         int err = 0, i;
13768         dtrace_difo_t *dp;
13769 
13770         if ((dp = helper->dtha_predicate) != NULL)
13771                 err += dtrace_difo_validate_helper(dp);
13772 
13773         for (i = 0; i < helper->dtha_nactions; i++)
13774                 err += dtrace_difo_validate_helper(helper->dtha_actions[i]);
13775 
13776         return (err == 0);
13777 }
13778 
13779 static int
13780 dtrace_helper_action_add(int which, dtrace_ecbdesc_t *ep)
13781 {
13782         dtrace_helpers_t *help;
13783         dtrace_helper_action_t *helper, *last;
13784         dtrace_actdesc_t *act;
13785         dtrace_vstate_t *vstate;
13786         dtrace_predicate_t *pred;
13787         int count = 0, nactions = 0, i;
13788 
13789         if (which < 0 || which >= DTRACE_NHELPER_ACTIONS)
13790                 return (EINVAL);
13791 
13792         help = curproc->p_dtrace_helpers;
13793         last = help->dthps_actions[which];
13794         vstate = &help->dthps_vstate;
13795 
13796         for (count = 0; last != NULL; last = last->dtha_next) {
13797                 count++;
13798                 if (last->dtha_next == NULL)
13799                         break;
13800         }
13801 
13802         /*
13803          * If we already have dtrace_helper_actions_max helper actions for this
13804          * helper action type, we'll refuse to add a new one.
13805          */
13806         if (count >= dtrace_helper_actions_max)
13807                 return (ENOSPC);
13808 
13809         helper = kmem_zalloc(sizeof (dtrace_helper_action_t), KM_SLEEP);
13810         helper->dtha_generation = help->dthps_generation;
13811 
13812         if ((pred = ep->dted_pred.dtpdd_predicate) != NULL) {
13813                 ASSERT(pred->dtp_difo != NULL);
13814                 dtrace_difo_hold(pred->dtp_difo);
13815                 helper->dtha_predicate = pred->dtp_difo;
13816         }
13817 
13818         for (act = ep->dted_action; act != NULL; act = act->dtad_next) {
13819                 if (act->dtad_kind != DTRACEACT_DIFEXPR)
13820                         goto err;
13821 
13822                 if (act->dtad_difo == NULL)
13823                         goto err;
13824 
13825                 nactions++;
13826         }
13827 
13828         helper->dtha_actions = kmem_zalloc(sizeof (dtrace_difo_t *) *
13829             (helper->dtha_nactions = nactions), KM_SLEEP);
13830 
13831         for (act = ep->dted_action, i = 0; act != NULL; act = act->dtad_next) {
13832                 dtrace_difo_hold(act->dtad_difo);
13833                 helper->dtha_actions[i++] = act->dtad_difo;
13834         }
13835 
13836         if (!dtrace_helper_validate(helper))
13837                 goto err;
13838 
13839         if (last == NULL) {
13840                 help->dthps_actions[which] = helper;
13841         } else {
13842                 last->dtha_next = helper;
13843         }
13844 
13845         if (vstate->dtvs_nlocals > dtrace_helptrace_nlocals) {
13846                 dtrace_helptrace_nlocals = vstate->dtvs_nlocals;
13847                 dtrace_helptrace_next = 0;
13848         }
13849 
13850         return (0);
13851 err:
13852         dtrace_helper_action_destroy(helper, vstate);
13853         return (EINVAL);
13854 }
13855 
13856 static void
13857 dtrace_helper_provider_register(proc_t *p, dtrace_helpers_t *help,
13858     dof_helper_t *dofhp)
13859 {
13860         ASSERT(MUTEX_NOT_HELD(&dtrace_lock));
13861 
13862         mutex_enter(&dtrace_meta_lock);
13863         mutex_enter(&dtrace_lock);
13864 
13865         if (!dtrace_attached() || dtrace_meta_pid == NULL) {
13866                 /*
13867                  * If the dtrace module is loaded but not attached, or if
13868                  * there aren't isn't a meta provider registered to deal with
13869                  * these provider descriptions, we need to postpone creating
13870                  * the actual providers until later.
13871                  */
13872 
13873                 if (help->dthps_next == NULL && help->dthps_prev == NULL &&
13874                     dtrace_deferred_pid != help) {
13875                         help->dthps_deferred = 1;
13876                         help->dthps_pid = p->p_pid;
13877                         help->dthps_next = dtrace_deferred_pid;
13878                         help->dthps_prev = NULL;
13879                         if (dtrace_deferred_pid != NULL)
13880                                 dtrace_deferred_pid->dthps_prev = help;
13881                         dtrace_deferred_pid = help;
13882                 }
13883 
13884                 mutex_exit(&dtrace_lock);
13885 
13886         } else if (dofhp != NULL) {
13887                 /*
13888                  * If the dtrace module is loaded and we have a particular
13889                  * helper provider description, pass that off to the
13890                  * meta provider.
13891                  */
13892 
13893                 mutex_exit(&dtrace_lock);
13894 
13895                 dtrace_helper_provide(dofhp, p->p_pid);
13896 
13897         } else {
13898                 /*
13899                  * Otherwise, just pass all the helper provider descriptions
13900                  * off to the meta provider.
13901                  */
13902 
13903                 int i;
13904                 mutex_exit(&dtrace_lock);
13905 
13906                 for (i = 0; i < help->dthps_nprovs; i++) {
13907                         dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
13908                             p->p_pid);
13909                 }
13910         }
13911 
13912         mutex_exit(&dtrace_meta_lock);
13913 }
13914 
13915 static int
13916 dtrace_helper_provider_add(dof_helper_t *dofhp, int gen)
13917 {
13918         dtrace_helpers_t *help;
13919         dtrace_helper_provider_t *hprov, **tmp_provs;
13920         uint_t tmp_maxprovs, i;
13921 
13922         ASSERT(MUTEX_HELD(&dtrace_lock));
13923 
13924         help = curproc->p_dtrace_helpers;
13925         ASSERT(help != NULL);
13926 
13927         /*
13928          * If we already have dtrace_helper_providers_max helper providers,
13929          * we're refuse to add a new one.
13930          */
13931         if (help->dthps_nprovs >= dtrace_helper_providers_max)
13932                 return (ENOSPC);
13933 
13934         /*
13935          * Check to make sure this isn't a duplicate.
13936          */
13937         for (i = 0; i < help->dthps_nprovs; i++) {
13938                 if (dofhp->dofhp_addr ==
13939                     help->dthps_provs[i]->dthp_prov.dofhp_addr)
13940                         return (EALREADY);
13941         }
13942 
13943         hprov = kmem_zalloc(sizeof (dtrace_helper_provider_t), KM_SLEEP);
13944         hprov->dthp_prov = *dofhp;
13945         hprov->dthp_ref = 1;
13946         hprov->dthp_generation = gen;
13947 
13948         /*
13949          * Allocate a bigger table for helper providers if it's already full.
13950          */
13951         if (help->dthps_maxprovs == help->dthps_nprovs) {
13952                 tmp_maxprovs = help->dthps_maxprovs;
13953                 tmp_provs = help->dthps_provs;
13954 
13955                 if (help->dthps_maxprovs == 0)
13956                         help->dthps_maxprovs = 2;
13957                 else
13958                         help->dthps_maxprovs *= 2;
13959                 if (help->dthps_maxprovs > dtrace_helper_providers_max)
13960                         help->dthps_maxprovs = dtrace_helper_providers_max;
13961 
13962                 ASSERT(tmp_maxprovs < help->dthps_maxprovs);
13963 
13964                 help->dthps_provs = kmem_zalloc(help->dthps_maxprovs *
13965                     sizeof (dtrace_helper_provider_t *), KM_SLEEP);
13966 
13967                 if (tmp_provs != NULL) {
13968                         bcopy(tmp_provs, help->dthps_provs, tmp_maxprovs *
13969                             sizeof (dtrace_helper_provider_t *));
13970                         kmem_free(tmp_provs, tmp_maxprovs *
13971                             sizeof (dtrace_helper_provider_t *));
13972                 }
13973         }
13974 
13975         help->dthps_provs[help->dthps_nprovs] = hprov;
13976         help->dthps_nprovs++;
13977 
13978         return (0);
13979 }
13980 
13981 static void
13982 dtrace_helper_provider_destroy(dtrace_helper_provider_t *hprov)
13983 {
13984         mutex_enter(&dtrace_lock);
13985 
13986         if (--hprov->dthp_ref == 0) {
13987                 dof_hdr_t *dof;
13988                 mutex_exit(&dtrace_lock);
13989                 dof = (dof_hdr_t *)(uintptr_t)hprov->dthp_prov.dofhp_dof;
13990                 dtrace_dof_destroy(dof);
13991                 kmem_free(hprov, sizeof (dtrace_helper_provider_t));
13992         } else {
13993                 mutex_exit(&dtrace_lock);
13994         }
13995 }
13996 
13997 static int
13998 dtrace_helper_provider_validate(dof_hdr_t *dof, dof_sec_t *sec)
13999 {
14000         uintptr_t daddr = (uintptr_t)dof;
14001         dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
14002         dof_provider_t *provider;
14003         dof_probe_t *probe;
14004         uint8_t *arg;
14005         char *strtab, *typestr;
14006         dof_stridx_t typeidx;
14007         size_t typesz;
14008         uint_t nprobes, j, k;
14009 
14010         ASSERT(sec->dofs_type == DOF_SECT_PROVIDER);
14011 
14012         if (sec->dofs_offset & (sizeof (uint_t) - 1)) {
14013                 dtrace_dof_error(dof, "misaligned section offset");
14014                 return (-1);
14015         }
14016 
14017         /*
14018          * The section needs to be large enough to contain the DOF provider
14019          * structure appropriate for the given version.
14020          */
14021         if (sec->dofs_size <
14022             ((dof->dofh_ident[DOF_ID_VERSION] == DOF_VERSION_1) ?
14023             offsetof(dof_provider_t, dofpv_prenoffs) :
14024             sizeof (dof_provider_t))) {
14025                 dtrace_dof_error(dof, "provider section too small");
14026                 return (-1);
14027         }
14028 
14029         provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
14030         str_sec = dtrace_dof_sect(dof, DOF_SECT_STRTAB, provider->dofpv_strtab);
14031         prb_sec = dtrace_dof_sect(dof, DOF_SECT_PROBES, provider->dofpv_probes);
14032         arg_sec = dtrace_dof_sect(dof, DOF_SECT_PRARGS, provider->dofpv_prargs);
14033         off_sec = dtrace_dof_sect(dof, DOF_SECT_PROFFS, provider->dofpv_proffs);
14034 
14035         if (str_sec == NULL || prb_sec == NULL ||
14036             arg_sec == NULL || off_sec == NULL)
14037                 return (-1);
14038 
14039         enoff_sec = NULL;
14040 
14041         if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
14042             provider->dofpv_prenoffs != DOF_SECT_NONE &&
14043             (enoff_sec = dtrace_dof_sect(dof, DOF_SECT_PRENOFFS,
14044             provider->dofpv_prenoffs)) == NULL)
14045                 return (-1);
14046 
14047         strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
14048 
14049         if (provider->dofpv_name >= str_sec->dofs_size ||
14050             strlen(strtab + provider->dofpv_name) >= DTRACE_PROVNAMELEN) {
14051                 dtrace_dof_error(dof, "invalid provider name");
14052                 return (-1);
14053         }
14054 
14055         if (prb_sec->dofs_entsize == 0 ||
14056             prb_sec->dofs_entsize > prb_sec->dofs_size) {
14057                 dtrace_dof_error(dof, "invalid entry size");
14058                 return (-1);
14059         }
14060 
14061         if (prb_sec->dofs_entsize & (sizeof (uintptr_t) - 1)) {
14062                 dtrace_dof_error(dof, "misaligned entry size");
14063                 return (-1);
14064         }
14065 
14066         if (off_sec->dofs_entsize != sizeof (uint32_t)) {
14067                 dtrace_dof_error(dof, "invalid entry size");
14068                 return (-1);
14069         }
14070 
14071         if (off_sec->dofs_offset & (sizeof (uint32_t) - 1)) {
14072                 dtrace_dof_error(dof, "misaligned section offset");
14073                 return (-1);
14074         }
14075 
14076         if (arg_sec->dofs_entsize != sizeof (uint8_t)) {
14077                 dtrace_dof_error(dof, "invalid entry size");
14078                 return (-1);
14079         }
14080 
14081         arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
14082 
14083         nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
14084 
14085         /*
14086          * Take a pass through the probes to check for errors.
14087          */
14088         for (j = 0; j < nprobes; j++) {
14089                 probe = (dof_probe_t *)(uintptr_t)(daddr +
14090                     prb_sec->dofs_offset + j * prb_sec->dofs_entsize);
14091 
14092                 if (probe->dofpr_func >= str_sec->dofs_size) {
14093                         dtrace_dof_error(dof, "invalid function name");
14094                         return (-1);
14095                 }
14096 
14097                 if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN) {
14098                         dtrace_dof_error(dof, "function name too long");
14099                         return (-1);
14100                 }
14101 
14102                 if (probe->dofpr_name >= str_sec->dofs_size ||
14103                     strlen(strtab + probe->dofpr_name) >= DTRACE_NAMELEN) {
14104                         dtrace_dof_error(dof, "invalid probe name");
14105                         return (-1);
14106                 }
14107 
14108                 /*
14109                  * The offset count must not wrap the index, and the offsets
14110                  * must also not overflow the section's data.
14111                  */
14112                 if (probe->dofpr_offidx + probe->dofpr_noffs <
14113                     probe->dofpr_offidx ||
14114                     (probe->dofpr_offidx + probe->dofpr_noffs) *
14115                     off_sec->dofs_entsize > off_sec->dofs_size) {
14116                         dtrace_dof_error(dof, "invalid probe offset");
14117                         return (-1);
14118                 }
14119 
14120                 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1) {
14121                         /*
14122                          * If there's no is-enabled offset section, make sure
14123                          * there aren't any is-enabled offsets. Otherwise
14124                          * perform the same checks as for probe offsets
14125                          * (immediately above).
14126                          */
14127                         if (enoff_sec == NULL) {
14128                                 if (probe->dofpr_enoffidx != 0 ||
14129                                     probe->dofpr_nenoffs != 0) {
14130                                         dtrace_dof_error(dof, "is-enabled "
14131                                             "offsets with null section");
14132                                         return (-1);
14133                                 }
14134                         } else if (probe->dofpr_enoffidx +
14135                             probe->dofpr_nenoffs < probe->dofpr_enoffidx ||
14136                             (probe->dofpr_enoffidx + probe->dofpr_nenoffs) *
14137                             enoff_sec->dofs_entsize > enoff_sec->dofs_size) {
14138                                 dtrace_dof_error(dof, "invalid is-enabled "
14139                                     "offset");
14140                                 return (-1);
14141                         }
14142 
14143                         if (probe->dofpr_noffs + probe->dofpr_nenoffs == 0) {
14144                                 dtrace_dof_error(dof, "zero probe and "
14145                                     "is-enabled offsets");
14146                                 return (-1);
14147                         }
14148                 } else if (probe->dofpr_noffs == 0) {
14149                         dtrace_dof_error(dof, "zero probe offsets");
14150                         return (-1);
14151                 }
14152 
14153                 if (probe->dofpr_argidx + probe->dofpr_xargc <
14154                     probe->dofpr_argidx ||
14155                     (probe->dofpr_argidx + probe->dofpr_xargc) *
14156                     arg_sec->dofs_entsize > arg_sec->dofs_size) {
14157                         dtrace_dof_error(dof, "invalid args");
14158                         return (-1);
14159                 }
14160 
14161                 typeidx = probe->dofpr_nargv;
14162                 typestr = strtab + probe->dofpr_nargv;
14163                 for (k = 0; k < probe->dofpr_nargc; k++) {
14164                         if (typeidx >= str_sec->dofs_size) {
14165                                 dtrace_dof_error(dof, "bad "
14166                                     "native argument type");
14167                                 return (-1);
14168                         }
14169 
14170                         typesz = strlen(typestr) + 1;
14171                         if (typesz > DTRACE_ARGTYPELEN) {
14172                                 dtrace_dof_error(dof, "native "
14173                                     "argument type too long");
14174                                 return (-1);
14175                         }
14176                         typeidx += typesz;
14177                         typestr += typesz;
14178                 }
14179 
14180                 typeidx = probe->dofpr_xargv;
14181                 typestr = strtab + probe->dofpr_xargv;
14182                 for (k = 0; k < probe->dofpr_xargc; k++) {
14183                         if (arg[probe->dofpr_argidx + k] > probe->dofpr_nargc) {
14184                                 dtrace_dof_error(dof, "bad "
14185                                     "native argument index");
14186                                 return (-1);
14187                         }
14188 
14189                         if (typeidx >= str_sec->dofs_size) {
14190                                 dtrace_dof_error(dof, "bad "
14191                                     "translated argument type");
14192                                 return (-1);
14193                         }
14194 
14195                         typesz = strlen(typestr) + 1;
14196                         if (typesz > DTRACE_ARGTYPELEN) {
14197                                 dtrace_dof_error(dof, "translated argument "
14198                                     "type too long");
14199                                 return (-1);
14200                         }
14201 
14202                         typeidx += typesz;
14203                         typestr += typesz;
14204                 }
14205         }
14206 
14207         return (0);
14208 }
14209 
14210 static int
14211 dtrace_helper_slurp(dof_hdr_t *dof, dof_helper_t *dhp)
14212 {
14213         dtrace_helpers_t *help;
14214         dtrace_vstate_t *vstate;
14215         dtrace_enabling_t *enab = NULL;
14216         int i, gen, rv, nhelpers = 0, nprovs = 0, destroy = 1;
14217         uintptr_t daddr = (uintptr_t)dof;
14218 
14219         ASSERT(MUTEX_HELD(&dtrace_lock));
14220 
14221         if ((help = curproc->p_dtrace_helpers) == NULL)
14222                 help = dtrace_helpers_create(curproc);
14223 
14224         vstate = &help->dthps_vstate;
14225 
14226         if ((rv = dtrace_dof_slurp(dof, vstate, NULL, &enab,
14227             dhp != NULL ? dhp->dofhp_addr : 0, B_FALSE)) != 0) {
14228                 dtrace_dof_destroy(dof);
14229                 return (rv);
14230         }
14231 
14232         /*
14233          * Look for helper providers and validate their descriptions.
14234          */
14235         if (dhp != NULL) {
14236                 for (i = 0; i < dof->dofh_secnum; i++) {
14237                         dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
14238                             dof->dofh_secoff + i * dof->dofh_secsize);
14239 
14240                         if (sec->dofs_type != DOF_SECT_PROVIDER)
14241                                 continue;
14242 
14243                         if (dtrace_helper_provider_validate(dof, sec) != 0) {
14244                                 dtrace_enabling_destroy(enab);
14245                                 dtrace_dof_destroy(dof);
14246                                 return (-1);
14247                         }
14248 
14249                         nprovs++;
14250                 }
14251         }
14252 
14253         /*
14254          * Now we need to walk through the ECB descriptions in the enabling.
14255          */
14256         for (i = 0; i < enab->dten_ndesc; i++) {
14257                 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
14258                 dtrace_probedesc_t *desc = &ep->dted_probe;
14259 
14260                 if (strcmp(desc->dtpd_provider, "dtrace") != 0)
14261                         continue;
14262 
14263                 if (strcmp(desc->dtpd_mod, "helper") != 0)
14264                         continue;
14265 
14266                 if (strcmp(desc->dtpd_func, "ustack") != 0)
14267                         continue;
14268 
14269                 if ((rv = dtrace_helper_action_add(DTRACE_HELPER_ACTION_USTACK,
14270                     ep)) != 0) {
14271                         /*
14272                          * Adding this helper action failed -- we are now going
14273                          * to rip out the entire generation and return failure.
14274                          */
14275                         (void) dtrace_helper_destroygen(help->dthps_generation);
14276                         dtrace_enabling_destroy(enab);
14277                         dtrace_dof_destroy(dof);
14278                         return (-1);
14279                 }
14280 
14281                 nhelpers++;
14282         }
14283 
14284         if (nhelpers < enab->dten_ndesc)
14285                 dtrace_dof_error(dof, "unmatched helpers");
14286 
14287         gen = help->dthps_generation++;
14288         dtrace_enabling_destroy(enab);
14289 
14290         if (dhp != NULL && nprovs > 0) {
14291                 dhp->dofhp_dof = (uint64_t)(uintptr_t)dof;
14292                 if (dtrace_helper_provider_add(dhp, gen) == 0) {
14293                         mutex_exit(&dtrace_lock);
14294                         dtrace_helper_provider_register(curproc, help, dhp);
14295                         mutex_enter(&dtrace_lock);
14296 
14297                         destroy = 0;
14298                 }
14299         }
14300 
14301         if (destroy)
14302                 dtrace_dof_destroy(dof);
14303 
14304         return (gen);
14305 }
14306 
14307 static dtrace_helpers_t *
14308 dtrace_helpers_create(proc_t *p)
14309 {
14310         dtrace_helpers_t *help;
14311 
14312         ASSERT(MUTEX_HELD(&dtrace_lock));
14313         ASSERT(p->p_dtrace_helpers == NULL);
14314 
14315         help = kmem_zalloc(sizeof (dtrace_helpers_t), KM_SLEEP);
14316         help->dthps_actions = kmem_zalloc(sizeof (dtrace_helper_action_t *) *
14317             DTRACE_NHELPER_ACTIONS, KM_SLEEP);
14318 
14319         p->p_dtrace_helpers = help;
14320         dtrace_helpers++;
14321 
14322         return (help);
14323 }
14324 
14325 static void
14326 dtrace_helpers_destroy(void)
14327 {
14328         dtrace_helpers_t *help;
14329         dtrace_vstate_t *vstate;
14330         proc_t *p = curproc;
14331         int i;
14332 
14333         mutex_enter(&dtrace_lock);
14334 
14335         ASSERT(p->p_dtrace_helpers != NULL);
14336         ASSERT(dtrace_helpers > 0);
14337 
14338         help = p->p_dtrace_helpers;
14339         vstate = &help->dthps_vstate;
14340 
14341         /*
14342          * We're now going to lose the help from this process.
14343          */
14344         p->p_dtrace_helpers = NULL;
14345         dtrace_sync();
14346 
14347         /*
14348          * Destory the helper actions.
14349          */
14350         for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
14351                 dtrace_helper_action_t *h, *next;
14352 
14353                 for (h = help->dthps_actions[i]; h != NULL; h = next) {
14354                         next = h->dtha_next;
14355                         dtrace_helper_action_destroy(h, vstate);
14356                         h = next;
14357                 }
14358         }
14359 
14360         mutex_exit(&dtrace_lock);
14361 
14362         /*
14363          * Destroy the helper providers.
14364          */
14365         if (help->dthps_maxprovs > 0) {
14366                 mutex_enter(&dtrace_meta_lock);
14367                 if (dtrace_meta_pid != NULL) {
14368                         ASSERT(dtrace_deferred_pid == NULL);
14369 
14370                         for (i = 0; i < help->dthps_nprovs; i++) {
14371                                 dtrace_helper_provider_remove(
14372                                     &help->dthps_provs[i]->dthp_prov, p->p_pid);
14373                         }
14374                 } else {
14375                         mutex_enter(&dtrace_lock);
14376                         ASSERT(help->dthps_deferred == 0 ||
14377                             help->dthps_next != NULL ||
14378                             help->dthps_prev != NULL ||
14379                             help == dtrace_deferred_pid);
14380 
14381                         /*
14382                          * Remove the helper from the deferred list.
14383                          */
14384                         if (help->dthps_next != NULL)
14385                                 help->dthps_next->dthps_prev = help->dthps_prev;
14386                         if (help->dthps_prev != NULL)
14387                                 help->dthps_prev->dthps_next = help->dthps_next;
14388                         if (dtrace_deferred_pid == help) {
14389                                 dtrace_deferred_pid = help->dthps_next;
14390                                 ASSERT(help->dthps_prev == NULL);
14391                         }
14392 
14393                         mutex_exit(&dtrace_lock);
14394                 }
14395 
14396                 mutex_exit(&dtrace_meta_lock);
14397 
14398                 for (i = 0; i < help->dthps_nprovs; i++) {
14399                         dtrace_helper_provider_destroy(help->dthps_provs[i]);
14400                 }
14401 
14402                 kmem_free(help->dthps_provs, help->dthps_maxprovs *
14403                     sizeof (dtrace_helper_provider_t *));
14404         }
14405 
14406         mutex_enter(&dtrace_lock);
14407 
14408         dtrace_vstate_fini(&help->dthps_vstate);
14409         kmem_free(help->dthps_actions,
14410             sizeof (dtrace_helper_action_t *) * DTRACE_NHELPER_ACTIONS);
14411         kmem_free(help, sizeof (dtrace_helpers_t));
14412 
14413         --dtrace_helpers;
14414         mutex_exit(&dtrace_lock);
14415 }
14416 
14417 static void
14418 dtrace_helpers_duplicate(proc_t *from, proc_t *to)
14419 {
14420         dtrace_helpers_t *help, *newhelp;
14421         dtrace_helper_action_t *helper, *new, *last;
14422         dtrace_difo_t *dp;
14423         dtrace_vstate_t *vstate;
14424         int i, j, sz, hasprovs = 0;
14425 
14426         mutex_enter(&dtrace_lock);
14427         ASSERT(from->p_dtrace_helpers != NULL);
14428         ASSERT(dtrace_helpers > 0);
14429 
14430         help = from->p_dtrace_helpers;
14431         newhelp = dtrace_helpers_create(to);
14432         ASSERT(to->p_dtrace_helpers != NULL);
14433 
14434         newhelp->dthps_generation = help->dthps_generation;
14435         vstate = &newhelp->dthps_vstate;
14436 
14437         /*
14438          * Duplicate the helper actions.
14439          */
14440         for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
14441                 if ((helper = help->dthps_actions[i]) == NULL)
14442                         continue;
14443 
14444                 for (last = NULL; helper != NULL; helper = helper->dtha_next) {
14445                         new = kmem_zalloc(sizeof (dtrace_helper_action_t),
14446                             KM_SLEEP);
14447                         new->dtha_generation = helper->dtha_generation;
14448 
14449                         if ((dp = helper->dtha_predicate) != NULL) {
14450                                 dp = dtrace_difo_duplicate(dp, vstate);
14451                                 new->dtha_predicate = dp;
14452                         }
14453 
14454                         new->dtha_nactions = helper->dtha_nactions;
14455                         sz = sizeof (dtrace_difo_t *) * new->dtha_nactions;
14456                         new->dtha_actions = kmem_alloc(sz, KM_SLEEP);
14457 
14458                         for (j = 0; j < new->dtha_nactions; j++) {
14459                                 dtrace_difo_t *dp = helper->dtha_actions[j];
14460 
14461                                 ASSERT(dp != NULL);
14462                                 dp = dtrace_difo_duplicate(dp, vstate);
14463                                 new->dtha_actions[j] = dp;
14464                         }
14465 
14466                         if (last != NULL) {
14467                                 last->dtha_next = new;
14468                         } else {
14469                                 newhelp->dthps_actions[i] = new;
14470                         }
14471 
14472                         last = new;
14473                 }
14474         }
14475 
14476         /*
14477          * Duplicate the helper providers and register them with the
14478          * DTrace framework.
14479          */
14480         if (help->dthps_nprovs > 0) {
14481                 newhelp->dthps_nprovs = help->dthps_nprovs;
14482                 newhelp->dthps_maxprovs = help->dthps_nprovs;
14483                 newhelp->dthps_provs = kmem_alloc(newhelp->dthps_nprovs *
14484                     sizeof (dtrace_helper_provider_t *), KM_SLEEP);
14485                 for (i = 0; i < newhelp->dthps_nprovs; i++) {
14486                         newhelp->dthps_provs[i] = help->dthps_provs[i];
14487                         newhelp->dthps_provs[i]->dthp_ref++;
14488                 }
14489 
14490                 hasprovs = 1;
14491         }
14492 
14493         mutex_exit(&dtrace_lock);
14494 
14495         if (hasprovs)
14496                 dtrace_helper_provider_register(to, newhelp, NULL);
14497 }
14498 
14499 /*
14500  * DTrace Hook Functions
14501  */
14502 static void
14503 dtrace_module_loaded(struct modctl *ctl)
14504 {
14505         dtrace_provider_t *prv;
14506 
14507         mutex_enter(&dtrace_provider_lock);
14508         mutex_enter(&mod_lock);
14509 
14510         ASSERT(ctl->mod_busy);
14511 
14512         /*
14513          * We're going to call each providers per-module provide operation
14514          * specifying only this module.
14515          */
14516         for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
14517                 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
14518 
14519         mutex_exit(&mod_lock);
14520         mutex_exit(&dtrace_provider_lock);
14521 
14522         /*
14523          * If we have any retained enablings, we need to match against them.
14524          * Enabling probes requires that cpu_lock be held, and we cannot hold
14525          * cpu_lock here -- it is legal for cpu_lock to be held when loading a
14526          * module.  (In particular, this happens when loading scheduling
14527          * classes.)  So if we have any retained enablings, we need to dispatch
14528          * our task queue to do the match for us.
14529          */
14530         mutex_enter(&dtrace_lock);
14531 
14532         if (dtrace_retained == NULL) {
14533                 mutex_exit(&dtrace_lock);
14534                 return;
14535         }
14536 
14537         (void) taskq_dispatch(dtrace_taskq,
14538             (task_func_t *)dtrace_enabling_matchall, NULL, TQ_SLEEP);
14539 
14540         mutex_exit(&dtrace_lock);
14541 
14542         /*
14543          * And now, for a little heuristic sleaze:  in general, we want to
14544          * match modules as soon as they load.  However, we cannot guarantee
14545          * this, because it would lead us to the lock ordering violation
14546          * outlined above.  The common case, of course, is that cpu_lock is
14547          * _not_ held -- so we delay here for a clock tick, hoping that that's
14548          * long enough for the task queue to do its work.  If it's not, it's
14549          * not a serious problem -- it just means that the module that we
14550          * just loaded may not be immediately instrumentable.
14551          */
14552         delay(1);
14553 }
14554 
14555 static void
14556 dtrace_module_unloaded(struct modctl *ctl)
14557 {
14558         dtrace_probe_t template, *probe, *first, *next;
14559         dtrace_provider_t *prov;
14560 
14561         template.dtpr_mod = ctl->mod_modname;
14562 
14563         mutex_enter(&dtrace_provider_lock);
14564         mutex_enter(&mod_lock);
14565         mutex_enter(&dtrace_lock);
14566 
14567         if (dtrace_bymod == NULL) {
14568                 /*
14569                  * The DTrace module is loaded (obviously) but not attached;
14570                  * we don't have any work to do.
14571                  */
14572                 mutex_exit(&dtrace_provider_lock);
14573                 mutex_exit(&mod_lock);
14574                 mutex_exit(&dtrace_lock);
14575                 return;
14576         }
14577 
14578         for (probe = first = dtrace_hash_lookup(dtrace_bymod, &template);
14579             probe != NULL; probe = probe->dtpr_nextmod) {
14580                 if (probe->dtpr_ecb != NULL) {
14581                         mutex_exit(&dtrace_provider_lock);
14582                         mutex_exit(&mod_lock);
14583                         mutex_exit(&dtrace_lock);
14584 
14585                         /*
14586                          * This shouldn't _actually_ be possible -- we're
14587                          * unloading a module that has an enabled probe in it.
14588                          * (It's normally up to the provider to make sure that
14589                          * this can't happen.)  However, because dtps_enable()
14590                          * doesn't have a failure mode, there can be an
14591                          * enable/unload race.  Upshot:  we don't want to
14592                          * assert, but we're not going to disable the
14593                          * probe, either.
14594                          */
14595                         if (dtrace_err_verbose) {
14596                                 cmn_err(CE_WARN, "unloaded module '%s' had "
14597                                     "enabled probes", ctl->mod_modname);
14598                         }
14599 
14600                         return;
14601                 }
14602         }
14603 
14604         probe = first;
14605 
14606         for (first = NULL; probe != NULL; probe = next) {
14607                 ASSERT(dtrace_probes[probe->dtpr_id - 1] == probe);
14608 
14609                 dtrace_probes[probe->dtpr_id - 1] = NULL;
14610 
14611                 next = probe->dtpr_nextmod;
14612                 dtrace_hash_remove(dtrace_bymod, probe);
14613                 dtrace_hash_remove(dtrace_byfunc, probe);
14614                 dtrace_hash_remove(dtrace_byname, probe);
14615 
14616                 if (first == NULL) {
14617                         first = probe;
14618                         probe->dtpr_nextmod = NULL;
14619                 } else {
14620                         probe->dtpr_nextmod = first;
14621                         first = probe;
14622                 }
14623         }
14624 
14625         /*
14626          * We've removed all of the module's probes from the hash chains and
14627          * from the probe array.  Now issue a dtrace_sync() to be sure that
14628          * everyone has cleared out from any probe array processing.
14629          */
14630         dtrace_sync();
14631 
14632         for (probe = first; probe != NULL; probe = first) {
14633                 first = probe->dtpr_nextmod;
14634                 prov = probe->dtpr_provider;
14635                 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
14636                     probe->dtpr_arg);
14637                 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
14638                 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
14639                 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
14640                 vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1);
14641                 kmem_free(probe, sizeof (dtrace_probe_t));
14642         }
14643 
14644         mutex_exit(&dtrace_lock);
14645         mutex_exit(&mod_lock);
14646         mutex_exit(&dtrace_provider_lock);
14647 }
14648 
14649 void
14650 dtrace_suspend(void)
14651 {
14652         dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend));
14653 }
14654 
14655 void
14656 dtrace_resume(void)
14657 {
14658         dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_resume));
14659 }
14660 
14661 static int
14662 dtrace_cpu_setup(cpu_setup_t what, processorid_t cpu)
14663 {
14664         ASSERT(MUTEX_HELD(&cpu_lock));
14665         mutex_enter(&dtrace_lock);
14666 
14667         switch (what) {
14668         case CPU_CONFIG: {
14669                 dtrace_state_t *state;
14670                 dtrace_optval_t *opt, rs, c;
14671 
14672                 /*
14673                  * For now, we only allocate a new buffer for anonymous state.
14674                  */
14675                 if ((state = dtrace_anon.dta_state) == NULL)
14676                         break;
14677 
14678                 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
14679                         break;
14680 
14681                 opt = state->dts_options;
14682                 c = opt[DTRACEOPT_CPU];
14683 
14684                 if (c != DTRACE_CPUALL && c != DTRACEOPT_UNSET && c != cpu)
14685                         break;
14686 
14687                 /*
14688                  * Regardless of what the actual policy is, we're going to
14689                  * temporarily set our resize policy to be manual.  We're
14690                  * also going to temporarily set our CPU option to denote
14691                  * the newly configured CPU.
14692                  */
14693                 rs = opt[DTRACEOPT_BUFRESIZE];
14694                 opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_MANUAL;
14695                 opt[DTRACEOPT_CPU] = (dtrace_optval_t)cpu;
14696 
14697                 (void) dtrace_state_buffers(state);
14698 
14699                 opt[DTRACEOPT_BUFRESIZE] = rs;
14700                 opt[DTRACEOPT_CPU] = c;
14701 
14702                 break;
14703         }
14704 
14705         case CPU_UNCONFIG:
14706                 /*
14707                  * We don't free the buffer in the CPU_UNCONFIG case.  (The
14708                  * buffer will be freed when the consumer exits.)
14709                  */
14710                 break;
14711 
14712         default:
14713                 break;
14714         }
14715 
14716         mutex_exit(&dtrace_lock);
14717         return (0);
14718 }
14719 
14720 static void
14721 dtrace_cpu_setup_initial(processorid_t cpu)
14722 {
14723         (void) dtrace_cpu_setup(CPU_CONFIG, cpu);
14724 }
14725 
14726 static void
14727 dtrace_toxrange_add(uintptr_t base, uintptr_t limit)
14728 {
14729         if (dtrace_toxranges >= dtrace_toxranges_max) {
14730                 int osize, nsize;
14731                 dtrace_toxrange_t *range;
14732 
14733                 osize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
14734 
14735                 if (osize == 0) {
14736                         ASSERT(dtrace_toxrange == NULL);
14737                         ASSERT(dtrace_toxranges_max == 0);
14738                         dtrace_toxranges_max = 1;
14739                 } else {
14740                         dtrace_toxranges_max <<= 1;
14741                 }
14742 
14743                 nsize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
14744                 range = kmem_zalloc(nsize, KM_SLEEP);
14745 
14746                 if (dtrace_toxrange != NULL) {
14747                         ASSERT(osize != 0);
14748                         bcopy(dtrace_toxrange, range, osize);
14749                         kmem_free(dtrace_toxrange, osize);
14750                 }
14751 
14752                 dtrace_toxrange = range;
14753         }
14754 
14755         ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_base == NULL);
14756         ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_limit == NULL);
14757 
14758         dtrace_toxrange[dtrace_toxranges].dtt_base = base;
14759         dtrace_toxrange[dtrace_toxranges].dtt_limit = limit;
14760         dtrace_toxranges++;
14761 }
14762 
14763 /*
14764  * DTrace Driver Cookbook Functions
14765  */
14766 /*ARGSUSED*/
14767 static int
14768 dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
14769 {
14770         dtrace_provider_id_t id;
14771         dtrace_state_t *state = NULL;
14772         dtrace_enabling_t *enab;
14773 
14774         mutex_enter(&cpu_lock);
14775         mutex_enter(&dtrace_provider_lock);
14776         mutex_enter(&dtrace_lock);
14777 
14778         if (ddi_soft_state_init(&dtrace_softstate,
14779             sizeof (dtrace_state_t), 0) != 0) {
14780                 cmn_err(CE_NOTE, "/dev/dtrace failed to initialize soft state");
14781                 mutex_exit(&cpu_lock);
14782                 mutex_exit(&dtrace_provider_lock);
14783                 mutex_exit(&dtrace_lock);
14784                 return (DDI_FAILURE);
14785         }
14786 
14787         if (ddi_create_minor_node(devi, DTRACEMNR_DTRACE, S_IFCHR,
14788             DTRACEMNRN_DTRACE, DDI_PSEUDO, NULL) == DDI_FAILURE ||
14789             ddi_create_minor_node(devi, DTRACEMNR_HELPER, S_IFCHR,
14790             DTRACEMNRN_HELPER, DDI_PSEUDO, NULL) == DDI_FAILURE) {
14791                 cmn_err(CE_NOTE, "/dev/dtrace couldn't create minor nodes");
14792                 ddi_remove_minor_node(devi, NULL);
14793                 ddi_soft_state_fini(&dtrace_softstate);
14794                 mutex_exit(&cpu_lock);
14795                 mutex_exit(&dtrace_provider_lock);
14796                 mutex_exit(&dtrace_lock);
14797                 return (DDI_FAILURE);
14798         }
14799 
14800         ddi_report_dev(devi);
14801         dtrace_devi = devi;
14802 
14803         dtrace_modload = dtrace_module_loaded;
14804         dtrace_modunload = dtrace_module_unloaded;
14805         dtrace_cpu_init = dtrace_cpu_setup_initial;
14806         dtrace_helpers_cleanup = dtrace_helpers_destroy;
14807         dtrace_helpers_fork = dtrace_helpers_duplicate;
14808         dtrace_cpustart_init = dtrace_suspend;
14809         dtrace_cpustart_fini = dtrace_resume;
14810         dtrace_debugger_init = dtrace_suspend;
14811         dtrace_debugger_fini = dtrace_resume;
14812 
14813         register_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
14814 
14815         ASSERT(MUTEX_HELD(&cpu_lock));
14816 
14817         dtrace_arena = vmem_create("dtrace", (void *)1, UINT32_MAX, 1,
14818             NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
14819         dtrace_minor = vmem_create("dtrace_minor", (void *)DTRACEMNRN_CLONE,
14820             UINT32_MAX - DTRACEMNRN_CLONE, 1, NULL, NULL, NULL, 0,
14821             VM_SLEEP | VMC_IDENTIFIER);
14822         dtrace_taskq = taskq_create("dtrace_taskq", 1, maxclsyspri,
14823             1, INT_MAX, 0);
14824 
14825         dtrace_state_cache = kmem_cache_create("dtrace_state_cache",
14826             sizeof (dtrace_dstate_percpu_t) * NCPU, DTRACE_STATE_ALIGN,
14827             NULL, NULL, NULL, NULL, NULL, 0);
14828 
14829         ASSERT(MUTEX_HELD(&cpu_lock));
14830         dtrace_bymod = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_mod),
14831             offsetof(dtrace_probe_t, dtpr_nextmod),
14832             offsetof(dtrace_probe_t, dtpr_prevmod));
14833 
14834         dtrace_byfunc = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_func),
14835             offsetof(dtrace_probe_t, dtpr_nextfunc),
14836             offsetof(dtrace_probe_t, dtpr_prevfunc));
14837 
14838         dtrace_byname = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_name),
14839             offsetof(dtrace_probe_t, dtpr_nextname),
14840             offsetof(dtrace_probe_t, dtpr_prevname));
14841 
14842         if (dtrace_retain_max < 1) {
14843                 cmn_err(CE_WARN, "illegal value (%lu) for dtrace_retain_max; "
14844                     "setting to 1", dtrace_retain_max);
14845                 dtrace_retain_max = 1;
14846         }
14847 
14848         /*
14849          * Now discover our toxic ranges.
14850          */
14851         dtrace_toxic_ranges(dtrace_toxrange_add);
14852 
14853         /*
14854          * Before we register ourselves as a provider to our own framework,
14855          * we would like to assert that dtrace_provider is NULL -- but that's
14856          * not true if we were loaded as a dependency of a DTrace provider.
14857          * Once we've registered, we can assert that dtrace_provider is our
14858          * pseudo provider.
14859          */
14860         (void) dtrace_register("dtrace", &dtrace_provider_attr,
14861             DTRACE_PRIV_NONE, 0, &dtrace_provider_ops, NULL, &id);
14862 
14863         ASSERT(dtrace_provider != NULL);
14864         ASSERT((dtrace_provider_id_t)dtrace_provider == id);
14865 
14866         dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
14867             dtrace_provider, NULL, NULL, "BEGIN", 0, NULL);
14868         dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
14869             dtrace_provider, NULL, NULL, "END", 0, NULL);
14870         dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
14871             dtrace_provider, NULL, NULL, "ERROR", 1, NULL);
14872 
14873         dtrace_anon_property();
14874         mutex_exit(&cpu_lock);
14875 
14876         /*
14877          * If DTrace helper tracing is enabled, we need to allocate the
14878          * trace buffer and initialize the values.
14879          */
14880         if (dtrace_helptrace_enabled) {
14881                 ASSERT(dtrace_helptrace_buffer == NULL);
14882                 dtrace_helptrace_buffer =
14883                     kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
14884                 dtrace_helptrace_next = 0;
14885         }
14886 
14887         /*
14888          * If there are already providers, we must ask them to provide their
14889          * probes, and then match any anonymous enabling against them.  Note
14890          * that there should be no other retained enablings at this time:
14891          * the only retained enablings at this time should be the anonymous
14892          * enabling.
14893          */
14894         if (dtrace_anon.dta_enabling != NULL) {
14895                 ASSERT(dtrace_retained == dtrace_anon.dta_enabling);
14896 
14897                 dtrace_enabling_provide(NULL);
14898                 state = dtrace_anon.dta_state;
14899 
14900                 /*
14901                  * We couldn't hold cpu_lock across the above call to
14902                  * dtrace_enabling_provide(), but we must hold it to actually
14903                  * enable the probes.  We have to drop all of our locks, pick
14904                  * up cpu_lock, and regain our locks before matching the
14905                  * retained anonymous enabling.
14906                  */
14907                 mutex_exit(&dtrace_lock);
14908                 mutex_exit(&dtrace_provider_lock);
14909 
14910                 mutex_enter(&cpu_lock);
14911                 mutex_enter(&dtrace_provider_lock);
14912                 mutex_enter(&dtrace_lock);
14913 
14914                 if ((enab = dtrace_anon.dta_enabling) != NULL)
14915                         (void) dtrace_enabling_match(enab, NULL);
14916 
14917                 mutex_exit(&cpu_lock);
14918         }
14919 
14920         mutex_exit(&dtrace_lock);
14921         mutex_exit(&dtrace_provider_lock);
14922 
14923         if (state != NULL) {
14924                 /*
14925                  * If we created any anonymous state, set it going now.
14926                  */
14927                 (void) dtrace_state_go(state, &dtrace_anon.dta_beganon);
14928         }
14929 
14930         return (DDI_SUCCESS);
14931 }
14932 
14933 /*ARGSUSED*/
14934 static int
14935 dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
14936 {
14937         dtrace_state_t *state;
14938         uint32_t priv;
14939         uid_t uid;
14940         zoneid_t zoneid;
14941 
14942         if (getminor(*devp) == DTRACEMNRN_HELPER)
14943                 return (0);
14944 
14945         /*
14946          * If this wasn't an open with the "helper" minor, then it must be
14947          * the "dtrace" minor.
14948          */
14949         if (getminor(*devp) != DTRACEMNRN_DTRACE)
14950                 return (ENXIO);
14951 
14952         /*
14953          * If no DTRACE_PRIV_* bits are set in the credential, then the
14954          * caller lacks sufficient permission to do anything with DTrace.
14955          */
14956         dtrace_cred2priv(cred_p, &priv, &uid, &zoneid);
14957         if (priv == DTRACE_PRIV_NONE)
14958                 return (EACCES);
14959 
14960         /*
14961          * Ask all providers to provide all their probes.
14962          */
14963         mutex_enter(&dtrace_provider_lock);
14964         dtrace_probe_provide(NULL, NULL);
14965         mutex_exit(&dtrace_provider_lock);
14966 
14967         mutex_enter(&cpu_lock);
14968         mutex_enter(&dtrace_lock);
14969         dtrace_opens++;
14970         dtrace_membar_producer();
14971 
14972         /*
14973          * If the kernel debugger is active (that is, if the kernel debugger
14974          * modified text in some way), we won't allow the open.
14975          */
14976         if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
14977                 dtrace_opens--;
14978                 mutex_exit(&cpu_lock);
14979                 mutex_exit(&dtrace_lock);
14980                 return (EBUSY);
14981         }
14982 
14983         state = dtrace_state_create(devp, cred_p);
14984         mutex_exit(&cpu_lock);
14985 
14986         if (state == NULL) {
14987                 if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
14988                         (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
14989                 mutex_exit(&dtrace_lock);
14990                 return (EAGAIN);
14991         }
14992 
14993         mutex_exit(&dtrace_lock);
14994 
14995         return (0);
14996 }
14997 
14998 /*ARGSUSED*/
14999 static int
15000 dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
15001 {
15002         minor_t minor = getminor(dev);
15003         dtrace_state_t *state;
15004 
15005         if (minor == DTRACEMNRN_HELPER)
15006                 return (0);
15007 
15008         state = ddi_get_soft_state(dtrace_softstate, minor);
15009 
15010         mutex_enter(&cpu_lock);
15011         mutex_enter(&dtrace_lock);
15012 
15013         if (state->dts_anon) {
15014                 /*
15015                  * There is anonymous state. Destroy that first.
15016                  */
15017                 ASSERT(dtrace_anon.dta_state == NULL);
15018                 dtrace_state_destroy(state->dts_anon);
15019         }
15020 
15021         dtrace_state_destroy(state);
15022         ASSERT(dtrace_opens > 0);
15023 
15024         /*
15025          * Only relinquish control of the kernel debugger interface when there
15026          * are no consumers and no anonymous enablings.
15027          */
15028         if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
15029                 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
15030 
15031         mutex_exit(&dtrace_lock);
15032         mutex_exit(&cpu_lock);
15033 
15034         return (0);
15035 }
15036 
15037 /*ARGSUSED*/
15038 static int
15039 dtrace_ioctl_helper(int cmd, intptr_t arg, int *rv)
15040 {
15041         int rval;
15042         dof_helper_t help, *dhp = NULL;
15043 
15044         switch (cmd) {
15045         case DTRACEHIOC_ADDDOF:
15046                 if (copyin((void *)arg, &help, sizeof (help)) != 0) {
15047                         dtrace_dof_error(NULL, "failed to copyin DOF helper");
15048                         return (EFAULT);
15049                 }
15050 
15051                 dhp = &help;
15052                 arg = (intptr_t)help.dofhp_dof;
15053                 /*FALLTHROUGH*/
15054 
15055         case DTRACEHIOC_ADD: {
15056                 dof_hdr_t *dof = dtrace_dof_copyin(arg, &rval);
15057 
15058                 if (dof == NULL)
15059                         return (rval);
15060 
15061                 mutex_enter(&dtrace_lock);
15062 
15063                 /*
15064                  * dtrace_helper_slurp() takes responsibility for the dof --
15065                  * it may free it now or it may save it and free it later.
15066                  */
15067                 if ((rval = dtrace_helper_slurp(dof, dhp)) != -1) {
15068                         *rv = rval;
15069                         rval = 0;
15070                 } else {
15071                         rval = EINVAL;
15072                 }
15073 
15074                 mutex_exit(&dtrace_lock);
15075                 return (rval);
15076         }
15077 
15078         case DTRACEHIOC_REMOVE: {
15079                 mutex_enter(&dtrace_lock);
15080                 rval = dtrace_helper_destroygen(arg);
15081                 mutex_exit(&dtrace_lock);
15082 
15083                 return (rval);
15084         }
15085 
15086         default:
15087                 break;
15088         }
15089 
15090         return (ENOTTY);
15091 }
15092 
15093 /*ARGSUSED*/
15094 static int
15095 dtrace_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
15096 {
15097         minor_t minor = getminor(dev);
15098         dtrace_state_t *state;
15099         int rval;
15100 
15101         if (minor == DTRACEMNRN_HELPER)
15102                 return (dtrace_ioctl_helper(cmd, arg, rv));
15103 
15104         state = ddi_get_soft_state(dtrace_softstate, minor);
15105 
15106         if (state->dts_anon) {
15107                 ASSERT(dtrace_anon.dta_state == NULL);
15108                 state = state->dts_anon;
15109         }
15110 
15111         switch (cmd) {
15112         case DTRACEIOC_PROVIDER: {
15113                 dtrace_providerdesc_t pvd;
15114                 dtrace_provider_t *pvp;
15115 
15116                 if (copyin((void *)arg, &pvd, sizeof (pvd)) != 0)
15117                         return (EFAULT);
15118 
15119                 pvd.dtvd_name[DTRACE_PROVNAMELEN - 1] = '\0';
15120                 mutex_enter(&dtrace_provider_lock);
15121 
15122                 for (pvp = dtrace_provider; pvp != NULL; pvp = pvp->dtpv_next) {
15123                         if (strcmp(pvp->dtpv_name, pvd.dtvd_name) == 0)
15124                                 break;
15125                 }
15126 
15127                 mutex_exit(&dtrace_provider_lock);
15128 
15129                 if (pvp == NULL)
15130                         return (ESRCH);
15131 
15132                 bcopy(&pvp->dtpv_priv, &pvd.dtvd_priv, sizeof (dtrace_ppriv_t));
15133                 bcopy(&pvp->dtpv_attr, &pvd.dtvd_attr, sizeof (dtrace_pattr_t));
15134                 if (copyout(&pvd, (void *)arg, sizeof (pvd)) != 0)
15135                         return (EFAULT);
15136 
15137                 return (0);
15138         }
15139 
15140         case DTRACEIOC_EPROBE: {
15141                 dtrace_eprobedesc_t epdesc;
15142                 dtrace_ecb_t *ecb;
15143                 dtrace_action_t *act;
15144                 void *buf;
15145                 size_t size;
15146                 uintptr_t dest;
15147                 int nrecs;
15148 
15149                 if (copyin((void *)arg, &epdesc, sizeof (epdesc)) != 0)
15150                         return (EFAULT);
15151 
15152                 mutex_enter(&dtrace_lock);
15153 
15154                 if ((ecb = dtrace_epid2ecb(state, epdesc.dtepd_epid)) == NULL) {
15155                         mutex_exit(&dtrace_lock);
15156                         return (EINVAL);
15157                 }
15158 
15159                 if (ecb->dte_probe == NULL) {
15160                         mutex_exit(&dtrace_lock);
15161                         return (EINVAL);
15162                 }
15163 
15164                 epdesc.dtepd_probeid = ecb->dte_probe->dtpr_id;
15165                 epdesc.dtepd_uarg = ecb->dte_uarg;
15166                 epdesc.dtepd_size = ecb->dte_size;
15167 
15168                 nrecs = epdesc.dtepd_nrecs;
15169                 epdesc.dtepd_nrecs = 0;
15170                 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
15171                         if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
15172                                 continue;
15173 
15174                         epdesc.dtepd_nrecs++;
15175                 }
15176 
15177                 /*
15178                  * Now that we have the size, we need to allocate a temporary
15179                  * buffer in which to store the complete description.  We need
15180                  * the temporary buffer to be able to drop dtrace_lock()
15181                  * across the copyout(), below.
15182                  */
15183                 size = sizeof (dtrace_eprobedesc_t) +
15184                     (epdesc.dtepd_nrecs * sizeof (dtrace_recdesc_t));
15185 
15186                 buf = kmem_alloc(size, KM_SLEEP);
15187                 dest = (uintptr_t)buf;
15188 
15189                 bcopy(&epdesc, (void *)dest, sizeof (epdesc));
15190                 dest += offsetof(dtrace_eprobedesc_t, dtepd_rec[0]);
15191 
15192                 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
15193                         if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
15194                                 continue;
15195 
15196                         if (nrecs-- == 0)
15197                                 break;
15198 
15199                         bcopy(&act->dta_rec, (void *)dest,
15200                             sizeof (dtrace_recdesc_t));
15201                         dest += sizeof (dtrace_recdesc_t);
15202                 }
15203 
15204                 mutex_exit(&dtrace_lock);
15205 
15206                 if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
15207                         kmem_free(buf, size);
15208                         return (EFAULT);
15209                 }
15210 
15211                 kmem_free(buf, size);
15212                 return (0);
15213         }
15214 
15215         case DTRACEIOC_AGGDESC: {
15216                 dtrace_aggdesc_t aggdesc;
15217                 dtrace_action_t *act;
15218                 dtrace_aggregation_t *agg;
15219                 int nrecs;
15220                 uint32_t offs;
15221                 dtrace_recdesc_t *lrec;
15222                 void *buf;
15223                 size_t size;
15224                 uintptr_t dest;
15225 
15226                 if (copyin((void *)arg, &aggdesc, sizeof (aggdesc)) != 0)
15227                         return (EFAULT);
15228 
15229                 mutex_enter(&dtrace_lock);
15230 
15231                 if ((agg = dtrace_aggid2agg(state, aggdesc.dtagd_id)) == NULL) {
15232                         mutex_exit(&dtrace_lock);
15233                         return (EINVAL);
15234                 }
15235 
15236                 aggdesc.dtagd_epid = agg->dtag_ecb->dte_epid;
15237 
15238                 nrecs = aggdesc.dtagd_nrecs;
15239                 aggdesc.dtagd_nrecs = 0;
15240 
15241                 offs = agg->dtag_base;
15242                 lrec = &agg->dtag_action.dta_rec;
15243                 aggdesc.dtagd_size = lrec->dtrd_offset + lrec->dtrd_size - offs;
15244 
15245                 for (act = agg->dtag_first; ; act = act->dta_next) {
15246                         ASSERT(act->dta_intuple ||
15247                             DTRACEACT_ISAGG(act->dta_kind));
15248 
15249                         /*
15250                          * If this action has a record size of zero, it
15251                          * denotes an argument to the aggregating action.
15252                          * Because the presence of this record doesn't (or
15253                          * shouldn't) affect the way the data is interpreted,
15254                          * we don't copy it out to save user-level the
15255                          * confusion of dealing with a zero-length record.
15256                          */
15257                         if (act->dta_rec.dtrd_size == 0) {
15258                                 ASSERT(agg->dtag_hasarg);
15259                                 continue;
15260                         }
15261 
15262                         aggdesc.dtagd_nrecs++;
15263 
15264                         if (act == &agg->dtag_action)
15265                                 break;
15266                 }
15267 
15268                 /*
15269                  * Now that we have the size, we need to allocate a temporary
15270                  * buffer in which to store the complete description.  We need
15271                  * the temporary buffer to be able to drop dtrace_lock()
15272                  * across the copyout(), below.
15273                  */
15274                 size = sizeof (dtrace_aggdesc_t) +
15275                     (aggdesc.dtagd_nrecs * sizeof (dtrace_recdesc_t));
15276 
15277                 buf = kmem_alloc(size, KM_SLEEP);
15278                 dest = (uintptr_t)buf;
15279 
15280                 bcopy(&aggdesc, (void *)dest, sizeof (aggdesc));
15281                 dest += offsetof(dtrace_aggdesc_t, dtagd_rec[0]);
15282 
15283                 for (act = agg->dtag_first; ; act = act->dta_next) {
15284                         dtrace_recdesc_t rec = act->dta_rec;
15285 
15286                         /*
15287                          * See the comment in the above loop for why we pass
15288                          * over zero-length records.
15289                          */
15290                         if (rec.dtrd_size == 0) {
15291                                 ASSERT(agg->dtag_hasarg);
15292                                 continue;
15293                         }
15294 
15295                         if (nrecs-- == 0)
15296                                 break;
15297 
15298                         rec.dtrd_offset -= offs;
15299                         bcopy(&rec, (void *)dest, sizeof (rec));
15300                         dest += sizeof (dtrace_recdesc_t);
15301 
15302                         if (act == &agg->dtag_action)
15303                                 break;
15304                 }
15305 
15306                 mutex_exit(&dtrace_lock);
15307 
15308                 if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
15309                         kmem_free(buf, size);
15310                         return (EFAULT);
15311                 }
15312 
15313                 kmem_free(buf, size);
15314                 return (0);
15315         }
15316 
15317         case DTRACEIOC_ENABLE: {
15318                 dof_hdr_t *dof;
15319                 dtrace_enabling_t *enab = NULL;
15320                 dtrace_vstate_t *vstate;
15321                 int err = 0;
15322 
15323                 *rv = 0;
15324 
15325                 /*
15326                  * If a NULL argument has been passed, we take this as our
15327                  * cue to reevaluate our enablings.
15328                  */
15329                 if (arg == NULL) {
15330                         dtrace_enabling_matchall();
15331 
15332                         return (0);
15333                 }
15334 
15335                 if ((dof = dtrace_dof_copyin(arg, &rval)) == NULL)
15336                         return (rval);
15337 
15338                 mutex_enter(&cpu_lock);
15339                 mutex_enter(&dtrace_lock);
15340                 vstate = &state->dts_vstate;
15341 
15342                 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
15343                         mutex_exit(&dtrace_lock);
15344                         mutex_exit(&cpu_lock);
15345                         dtrace_dof_destroy(dof);
15346                         return (EBUSY);
15347                 }
15348 
15349                 if (dtrace_dof_slurp(dof, vstate, cr, &enab, 0, B_TRUE) != 0) {
15350                         mutex_exit(&dtrace_lock);
15351                         mutex_exit(&cpu_lock);
15352                         dtrace_dof_destroy(dof);
15353                         return (EINVAL);
15354                 }
15355 
15356                 if ((rval = dtrace_dof_options(dof, state)) != 0) {
15357                         dtrace_enabling_destroy(enab);
15358                         mutex_exit(&dtrace_lock);
15359                         mutex_exit(&cpu_lock);
15360                         dtrace_dof_destroy(dof);
15361                         return (rval);
15362                 }
15363 
15364                 if ((err = dtrace_enabling_match(enab, rv)) == 0) {
15365                         err = dtrace_enabling_retain(enab);
15366                 } else {
15367                         dtrace_enabling_destroy(enab);
15368                 }
15369 
15370                 mutex_exit(&cpu_lock);
15371                 mutex_exit(&dtrace_lock);
15372                 dtrace_dof_destroy(dof);
15373 
15374                 return (err);
15375         }
15376 
15377         case DTRACEIOC_REPLICATE: {
15378                 dtrace_repldesc_t desc;
15379                 dtrace_probedesc_t *match = &desc.dtrpd_match;
15380                 dtrace_probedesc_t *create = &desc.dtrpd_create;
15381                 int err;
15382 
15383                 if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
15384                         return (EFAULT);
15385 
15386                 match->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
15387                 match->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
15388                 match->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
15389                 match->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
15390 
15391                 create->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
15392                 create->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
15393                 create->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
15394                 create->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
15395 
15396                 mutex_enter(&dtrace_lock);
15397                 err = dtrace_enabling_replicate(state, match, create);
15398                 mutex_exit(&dtrace_lock);
15399 
15400                 return (err);
15401         }
15402 
15403         case DTRACEIOC_PROBEMATCH:
15404         case DTRACEIOC_PROBES: {
15405                 dtrace_probe_t *probe = NULL;
15406                 dtrace_probedesc_t desc;
15407                 dtrace_probekey_t pkey;
15408                 dtrace_id_t i;
15409                 int m = 0;
15410                 uint32_t priv;
15411                 uid_t uid;
15412                 zoneid_t zoneid;
15413 
15414                 if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
15415                         return (EFAULT);
15416 
15417                 desc.dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
15418                 desc.dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
15419                 desc.dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
15420                 desc.dtpd_name[DTRACE_NAMELEN - 1] = '\0';
15421 
15422                 /*
15423                  * Before we attempt to match this probe, we want to give
15424                  * all providers the opportunity to provide it.
15425                  */
15426                 if (desc.dtpd_id == DTRACE_IDNONE) {
15427                         mutex_enter(&dtrace_provider_lock);
15428                         dtrace_probe_provide(&desc, NULL);
15429                         mutex_exit(&dtrace_provider_lock);
15430                         desc.dtpd_id++;
15431                 }
15432 
15433                 if (cmd == DTRACEIOC_PROBEMATCH)  {
15434                         dtrace_probekey(&desc, &pkey);
15435                         pkey.dtpk_id = DTRACE_IDNONE;
15436                 }
15437 
15438                 dtrace_cred2priv(cr, &priv, &uid, &zoneid);
15439 
15440                 mutex_enter(&dtrace_lock);
15441 
15442                 if (cmd == DTRACEIOC_PROBEMATCH) {
15443                         for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
15444                                 if ((probe = dtrace_probes[i - 1]) != NULL &&
15445                                     (m = dtrace_match_probe(probe, &pkey,
15446                                     priv, uid, zoneid)) != 0)
15447                                         break;
15448                         }
15449 
15450                         if (m < 0) {
15451                                 mutex_exit(&dtrace_lock);
15452                                 return (EINVAL);
15453                         }
15454 
15455                 } else {
15456                         for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
15457                                 if ((probe = dtrace_probes[i - 1]) != NULL &&
15458                                     dtrace_match_priv(probe, priv, uid, zoneid))
15459                                         break;
15460                         }
15461                 }
15462 
15463                 if (probe == NULL) {
15464                         mutex_exit(&dtrace_lock);
15465                         return (ESRCH);
15466                 }
15467 
15468                 dtrace_probe_description(probe, &desc);
15469                 mutex_exit(&dtrace_lock);
15470 
15471                 if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
15472                         return (EFAULT);
15473 
15474                 return (0);
15475         }
15476 
15477         case DTRACEIOC_PROBEARG: {
15478                 dtrace_argdesc_t desc;
15479                 dtrace_probe_t *probe;
15480                 dtrace_provider_t *prov;
15481 
15482                 if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
15483                         return (EFAULT);
15484 
15485                 if (desc.dtargd_id == DTRACE_IDNONE)
15486                         return (EINVAL);
15487 
15488                 if (desc.dtargd_ndx == DTRACE_ARGNONE)
15489                         return (EINVAL);
15490 
15491                 mutex_enter(&dtrace_provider_lock);
15492                 mutex_enter(&mod_lock);
15493                 mutex_enter(&dtrace_lock);
15494 
15495                 if (desc.dtargd_id > dtrace_nprobes) {
15496                         mutex_exit(&dtrace_lock);
15497                         mutex_exit(&mod_lock);
15498                         mutex_exit(&dtrace_provider_lock);
15499                         return (EINVAL);
15500                 }
15501 
15502                 if ((probe = dtrace_probes[desc.dtargd_id - 1]) == NULL) {
15503                         mutex_exit(&dtrace_lock);
15504                         mutex_exit(&mod_lock);
15505                         mutex_exit(&dtrace_provider_lock);
15506                         return (EINVAL);
15507                 }
15508 
15509                 mutex_exit(&dtrace_lock);
15510 
15511                 prov = probe->dtpr_provider;
15512 
15513                 if (prov->dtpv_pops.dtps_getargdesc == NULL) {
15514                         /*
15515                          * There isn't any typed information for this probe.
15516                          * Set the argument number to DTRACE_ARGNONE.
15517                          */
15518                         desc.dtargd_ndx = DTRACE_ARGNONE;
15519                 } else {
15520                         desc.dtargd_native[0] = '\0';
15521                         desc.dtargd_xlate[0] = '\0';
15522                         desc.dtargd_mapping = desc.dtargd_ndx;
15523 
15524                         prov->dtpv_pops.dtps_getargdesc(prov->dtpv_arg,
15525                             probe->dtpr_id, probe->dtpr_arg, &desc);
15526                 }
15527 
15528                 mutex_exit(&mod_lock);
15529                 mutex_exit(&dtrace_provider_lock);
15530 
15531                 if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
15532                         return (EFAULT);
15533 
15534                 return (0);
15535         }
15536 
15537         case DTRACEIOC_GO: {
15538                 processorid_t cpuid;
15539                 rval = dtrace_state_go(state, &cpuid);
15540 
15541                 if (rval != 0)
15542                         return (rval);
15543 
15544                 if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
15545                         return (EFAULT);
15546 
15547                 return (0);
15548         }
15549 
15550         case DTRACEIOC_STOP: {
15551                 processorid_t cpuid;
15552 
15553                 mutex_enter(&dtrace_lock);
15554                 rval = dtrace_state_stop(state, &cpuid);
15555                 mutex_exit(&dtrace_lock);
15556 
15557                 if (rval != 0)
15558                         return (rval);
15559 
15560                 if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
15561                         return (EFAULT);
15562 
15563                 return (0);
15564         }
15565 
15566         case DTRACEIOC_DOFGET: {
15567                 dof_hdr_t hdr, *dof;
15568                 uint64_t len;
15569 
15570                 if (copyin((void *)arg, &hdr, sizeof (hdr)) != 0)
15571                         return (EFAULT);
15572 
15573                 mutex_enter(&dtrace_lock);
15574                 dof = dtrace_dof_create(state);
15575                 mutex_exit(&dtrace_lock);
15576 
15577                 len = MIN(hdr.dofh_loadsz, dof->dofh_loadsz);
15578                 rval = copyout(dof, (void *)arg, len);
15579                 dtrace_dof_destroy(dof);
15580 
15581                 return (rval == 0 ? 0 : EFAULT);
15582         }
15583 
15584         case DTRACEIOC_AGGSNAP:
15585         case DTRACEIOC_BUFSNAP: {
15586                 dtrace_bufdesc_t desc;
15587                 caddr_t cached;
15588                 dtrace_buffer_t *buf;
15589 
15590                 if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
15591                         return (EFAULT);
15592 
15593                 if (desc.dtbd_cpu < 0 || desc.dtbd_cpu >= NCPU)
15594                         return (EINVAL);
15595 
15596                 mutex_enter(&dtrace_lock);
15597 
15598                 if (cmd == DTRACEIOC_BUFSNAP) {
15599                         buf = &state->dts_buffer[desc.dtbd_cpu];
15600                 } else {
15601                         buf = &state->dts_aggbuffer[desc.dtbd_cpu];
15602                 }
15603 
15604                 if (buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL)) {
15605                         size_t sz = buf->dtb_offset;
15606 
15607                         if (state->dts_activity != DTRACE_ACTIVITY_STOPPED) {
15608                                 mutex_exit(&dtrace_lock);
15609                                 return (EBUSY);
15610                         }
15611 
15612                         /*
15613                          * If this buffer has already been consumed, we're
15614                          * going to indicate that there's nothing left here
15615                          * to consume.
15616                          */
15617                         if (buf->dtb_flags & DTRACEBUF_CONSUMED) {
15618                                 mutex_exit(&dtrace_lock);
15619 
15620                                 desc.dtbd_size = 0;
15621                                 desc.dtbd_drops = 0;
15622                                 desc.dtbd_errors = 0;
15623                                 desc.dtbd_oldest = 0;
15624                                 sz = sizeof (desc);
15625 
15626                                 if (copyout(&desc, (void *)arg, sz) != 0)
15627                                         return (EFAULT);
15628 
15629                                 return (0);
15630                         }
15631 
15632                         /*
15633                          * If this is a ring buffer that has wrapped, we want
15634                          * to copy the whole thing out.
15635                          */
15636                         if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
15637                                 dtrace_buffer_polish(buf);
15638                                 sz = buf->dtb_size;
15639                         }
15640 
15641                         if (copyout(buf->dtb_tomax, desc.dtbd_data, sz) != 0) {
15642                                 mutex_exit(&dtrace_lock);
15643                                 return (EFAULT);
15644                         }
15645 
15646                         desc.dtbd_size = sz;
15647                         desc.dtbd_drops = buf->dtb_drops;
15648                         desc.dtbd_errors = buf->dtb_errors;
15649                         desc.dtbd_oldest = buf->dtb_xamot_offset;
15650 
15651                         mutex_exit(&dtrace_lock);
15652 
15653                         if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
15654                                 return (EFAULT);
15655 
15656                         buf->dtb_flags |= DTRACEBUF_CONSUMED;
15657 
15658                         return (0);
15659                 }
15660 
15661                 if (buf->dtb_tomax == NULL) {
15662                         ASSERT(buf->dtb_xamot == NULL);
15663                         mutex_exit(&dtrace_lock);
15664                         return (ENOENT);
15665                 }
15666 
15667                 cached = buf->dtb_tomax;
15668                 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
15669 
15670                 dtrace_xcall(desc.dtbd_cpu,
15671                     (dtrace_xcall_t)dtrace_buffer_switch, buf);
15672 
15673                 state->dts_errors += buf->dtb_xamot_errors;
15674 
15675                 /*
15676                  * If the buffers did not actually switch, then the cross call
15677                  * did not take place -- presumably because the given CPU is
15678                  * not in the ready set.  If this is the case, we'll return
15679                  * ENOENT.
15680                  */
15681                 if (buf->dtb_tomax == cached) {
15682                         ASSERT(buf->dtb_xamot != cached);
15683                         mutex_exit(&dtrace_lock);
15684                         return (ENOENT);
15685                 }
15686 
15687                 ASSERT(cached == buf->dtb_xamot);
15688 
15689                 /*
15690                  * We have our snapshot; now copy it out.
15691                  */
15692                 if (copyout(buf->dtb_xamot, desc.dtbd_data,
15693                     buf->dtb_xamot_offset) != 0) {
15694                         mutex_exit(&dtrace_lock);
15695                         return (EFAULT);
15696                 }
15697 
15698                 desc.dtbd_size = buf->dtb_xamot_offset;
15699                 desc.dtbd_drops = buf->dtb_xamot_drops;
15700                 desc.dtbd_errors = buf->dtb_xamot_errors;
15701                 desc.dtbd_oldest = 0;
15702 
15703                 mutex_exit(&dtrace_lock);
15704 
15705                 /*
15706                  * Finally, copy out the buffer description.
15707                  */
15708                 if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
15709                         return (EFAULT);
15710 
15711                 return (0);
15712         }
15713 
15714         case DTRACEIOC_CONF: {
15715                 dtrace_conf_t conf;
15716 
15717                 bzero(&conf, sizeof (conf));
15718                 conf.dtc_difversion = DIF_VERSION;
15719                 conf.dtc_difintregs = DIF_DIR_NREGS;
15720                 conf.dtc_diftupregs = DIF_DTR_NREGS;
15721                 conf.dtc_ctfmodel = CTF_MODEL_NATIVE;
15722 
15723                 if (copyout(&conf, (void *)arg, sizeof (conf)) != 0)
15724                         return (EFAULT);
15725 
15726                 return (0);
15727         }
15728 
15729         case DTRACEIOC_STATUS: {
15730                 dtrace_status_t stat;
15731                 dtrace_dstate_t *dstate;
15732                 int i, j;
15733                 uint64_t nerrs;
15734 
15735                 /*
15736                  * See the comment in dtrace_state_deadman() for the reason
15737                  * for setting dts_laststatus to INT64_MAX before setting
15738                  * it to the correct value.
15739                  */
15740                 state->dts_laststatus = INT64_MAX;
15741                 dtrace_membar_producer();
15742                 state->dts_laststatus = dtrace_gethrtime();
15743 
15744                 bzero(&stat, sizeof (stat));
15745 
15746                 mutex_enter(&dtrace_lock);
15747 
15748                 if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE) {
15749                         mutex_exit(&dtrace_lock);
15750                         return (ENOENT);
15751                 }
15752 
15753                 if (state->dts_activity == DTRACE_ACTIVITY_DRAINING)
15754                         stat.dtst_exiting = 1;
15755 
15756                 nerrs = state->dts_errors;
15757                 dstate = &state->dts_vstate.dtvs_dynvars;
15758 
15759                 for (i = 0; i < NCPU; i++) {
15760                         dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[i];
15761 
15762                         stat.dtst_dyndrops += dcpu->dtdsc_drops;
15763                         stat.dtst_dyndrops_dirty += dcpu->dtdsc_dirty_drops;
15764                         stat.dtst_dyndrops_rinsing += dcpu->dtdsc_rinsing_drops;
15765 
15766                         if (state->dts_buffer[i].dtb_flags & DTRACEBUF_FULL)
15767                                 stat.dtst_filled++;
15768 
15769                         nerrs += state->dts_buffer[i].dtb_errors;
15770 
15771                         for (j = 0; j < state->dts_nspeculations; j++) {
15772                                 dtrace_speculation_t *spec;
15773                                 dtrace_buffer_t *buf;
15774 
15775                                 spec = &state->dts_speculations[j];
15776                                 buf = &spec->dtsp_buffer[i];
15777                                 stat.dtst_specdrops += buf->dtb_xamot_drops;
15778                         }
15779                 }
15780 
15781                 stat.dtst_specdrops_busy = state->dts_speculations_busy;
15782                 stat.dtst_specdrops_unavail = state->dts_speculations_unavail;
15783                 stat.dtst_stkstroverflows = state->dts_stkstroverflows;
15784                 stat.dtst_dblerrors = state->dts_dblerrors;
15785                 stat.dtst_killed =
15786                     (state->dts_activity == DTRACE_ACTIVITY_KILLED);
15787                 stat.dtst_errors = nerrs;
15788 
15789                 mutex_exit(&dtrace_lock);
15790 
15791                 if (copyout(&stat, (void *)arg, sizeof (stat)) != 0)
15792                         return (EFAULT);
15793 
15794                 return (0);
15795         }
15796 
15797         case DTRACEIOC_FORMAT: {
15798                 dtrace_fmtdesc_t fmt;
15799                 char *str;
15800                 int len;
15801 
15802                 if (copyin((void *)arg, &fmt, sizeof (fmt)) != 0)
15803                         return (EFAULT);
15804 
15805                 mutex_enter(&dtrace_lock);
15806 
15807                 if (fmt.dtfd_format == 0 ||
15808                     fmt.dtfd_format > state->dts_nformats) {
15809                         mutex_exit(&dtrace_lock);
15810                         return (EINVAL);
15811                 }
15812 
15813                 /*
15814                  * Format strings are allocated contiguously and they are
15815                  * never freed; if a format index is less than the number
15816                  * of formats, we can assert that the format map is non-NULL
15817                  * and that the format for the specified index is non-NULL.
15818                  */
15819                 ASSERT(state->dts_formats != NULL);
15820                 str = state->dts_formats[fmt.dtfd_format - 1];
15821                 ASSERT(str != NULL);
15822 
15823                 len = strlen(str) + 1;
15824 
15825                 if (len > fmt.dtfd_length) {
15826                         fmt.dtfd_length = len;
15827 
15828                         if (copyout(&fmt, (void *)arg, sizeof (fmt)) != 0) {
15829                                 mutex_exit(&dtrace_lock);
15830                                 return (EINVAL);
15831                         }
15832                 } else {
15833                         if (copyout(str, fmt.dtfd_string, len) != 0) {
15834                                 mutex_exit(&dtrace_lock);
15835                                 return (EINVAL);
15836                         }
15837                 }
15838 
15839                 mutex_exit(&dtrace_lock);
15840                 return (0);
15841         }
15842 
15843         default:
15844                 break;
15845         }
15846 
15847         return (ENOTTY);
15848 }
15849 
15850 /*ARGSUSED*/
      /*
       * Detach handler for the DTrace device.  Only DDI_DETACH does any work:
       * DDI_SUSPEND returns DDI_SUCCESS immediately and every other command
       * returns DDI_FAILURE.  The dip argument is not used.
       *
       * With cpu_lock, dtrace_provider_lock and dtrace_lock held, the detach is
       * refused (DDI_FAILURE, nothing torn down) if dtrace_helpers is greater
       * than zero or if dtrace_unregister() on dtrace_provider fails.  Otherwise
       * any anonymous state is destroyed, the global function pointers set up
       * for the rest of the kernel are reset to NULL, and the probe array, probe
       * hashes, state cache, vmem arenas, toxic-range table, minor node and soft
       * state are released.  The task queue is destroyed last, after all locks
       * have been dropped, and DDI_SUCCESS is returned.
       */
15851 static int
15852 dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
15853 {
15854         dtrace_state_t *state;
15855 
15856         switch (cmd) {
15857         case DDI_DETACH:
15858                 break;
15859 
15860         case DDI_SUSPEND:
15861                 return (DDI_SUCCESS);
15862 
15863         default:
15864                 return (DDI_FAILURE);
15865         }
15866 
              /*
               * All three locks are taken for the teardown; each early failure
               * return below releases all of them before returning.
               */
15867         mutex_enter(&cpu_lock);
15868         mutex_enter(&dtrace_provider_lock);
15869         mutex_enter(&dtrace_lock);
15870 
15871         ASSERT(dtrace_opens == 0);
15872 
              /* Refuse to detach while dtrace_helpers is non-zero. */
15873         if (dtrace_helpers > 0) {
15874                 mutex_exit(&dtrace_provider_lock);
15875                 mutex_exit(&dtrace_lock);
15876                 mutex_exit(&cpu_lock);
15877                 return (DDI_FAILURE);
15878         }
15879 
              /* Likewise refuse if dtrace_provider cannot be unregistered. */
15880         if (dtrace_unregister((dtrace_provider_id_t)dtrace_provider) != 0) {
15881                 mutex_exit(&dtrace_provider_lock);
15882                 mutex_exit(&dtrace_lock);
15883                 mutex_exit(&cpu_lock);
15884                 return (DDI_FAILURE);
15885         }
15886 
15887         dtrace_provider = NULL;
15888 
15889         if ((state = dtrace_anon_grab()) != NULL) {
15890                 /*
15891                  * If there were ECBs on this state, the provider should
15892                  * not have been allowed to detach; assert that there are
15893                  * none.
15894                  */
15895                 ASSERT(state->dts_necbs == 0);
15896                 dtrace_state_destroy(state);
15897 
15898                 /*
15899                  * If we're being detached with anonymous state, we need to
15900                  * indicate to the kernel debugger that DTrace is now inactive.
15901                  */
15902                 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
15903         }
15904 
              /*
               * Clear the anonymous-state record, drop the CPU setup callback
               * and reset the global function pointers to NULL; all of this is
               * done before cpu_lock is released.
               */
15905         bzero(&dtrace_anon, sizeof (dtrace_anon_t));
15906         unregister_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
15907         dtrace_cpu_init = NULL;
15908         dtrace_helpers_cleanup = NULL;
15909         dtrace_helpers_fork = NULL;
15910         dtrace_cpustart_init = NULL;
15911         dtrace_cpustart_fini = NULL;
15912         dtrace_debugger_init = NULL;
15913         dtrace_debugger_fini = NULL;
15914         dtrace_modload = NULL;
15915         dtrace_modunload = NULL;
15916 
              /*
               * cpu_lock is dropped here; dtrace_lock and dtrace_provider_lock
               * stay held until the remaining resources have been released.
               */
15917         mutex_exit(&cpu_lock);
15918 
              /* The helper-trace buffer is freed only if the enabled flag is set. */
15919         if (dtrace_helptrace_enabled) {
15920                 kmem_free(dtrace_helptrace_buffer, dtrace_helptrace_bufsize);
15921                 dtrace_helptrace_buffer = NULL;
15922         }
15923 
              /* Free the array of dtrace_nprobes probe pointers. */
15924         kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *));
15925         dtrace_probes = NULL;
15926         dtrace_nprobes = 0;
15927 
15928         dtrace_hash_destroy(dtrace_bymod);
15929         dtrace_hash_destroy(dtrace_byfunc);
15930         dtrace_hash_destroy(dtrace_byname);
15931         dtrace_bymod = NULL;
15932         dtrace_byfunc = NULL;
15933         dtrace_byname = NULL;
15934 
15935         kmem_cache_destroy(dtrace_state_cache);
15936         vmem_destroy(dtrace_minor);
15937         vmem_destroy(dtrace_arena);
15938 
              /* The toxic-range table is freed using its allocated maximum. */
15939         if (dtrace_toxrange != NULL) {
15940                 kmem_free(dtrace_toxrange,
15941                     dtrace_toxranges_max * sizeof (dtrace_toxrange_t));
15942                 dtrace_toxrange = NULL;
15943                 dtrace_toxranges = 0;
15944                 dtrace_toxranges_max = 0;
15945         }
15946 
15947         ddi_remove_minor_node(dtrace_devi, NULL);
15948         dtrace_devi = NULL;
15949 
15950         ddi_soft_state_fini(&dtrace_softstate);
15951 
15952         ASSERT(dtrace_vtime_references == 0);
15953         ASSERT(dtrace_opens == 0);
15954         ASSERT(dtrace_retained == NULL);
15955 
15956         mutex_exit(&dtrace_lock);
15957         mutex_exit(&dtrace_provider_lock);
15958 
15959         /*
15960          * We don't destroy the task queue until after we have dropped our
15961          * locks (taskq_destroy() may block on running tasks).  To prevent
15962          * attempting to do work after we have effectively detached but before
15963          * the task queue has been destroyed, all tasks dispatched via the
15964          * task queue must check that DTrace is still attached before
15965          * performing any operation.
15966          */
15967         taskq_destroy(dtrace_taskq);
15968         dtrace_taskq = NULL;
15969 
15970         return (DDI_SUCCESS);
15971 }
15972 
15973 /*ARGSUSED*/
15974 static int
15975 dtrace_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
15976 {
15977         int error;
15978 
15979         switch (infocmd) {
15980         case DDI_INFO_DEVT2DEVINFO:
15981                 *result = (void *)dtrace_devi;
15982                 error = DDI_SUCCESS;
15983                 break;
15984         case DDI_INFO_DEVT2INSTANCE:
15985                 *result = (void *)0;
15986                 error = DDI_SUCCESS;
15987                 break;
15988         default:
15989                 error = DDI_FAILURE;
15990         }
15991         return (error);
15992 }
15993 
15994 static struct cb_ops dtrace_cb_ops = {
15995         dtrace_open,            /* open */
15996         dtrace_close,           /* close */
15997         nulldev,                /* strategy */
15998         nulldev,                /* print */
15999         nodev,                  /* dump */
16000         nodev,                  /* read */
16001         nodev,                  /* write */
16002         dtrace_ioctl,           /* ioctl */
16003         nodev,                  /* devmap */
16004         nodev,                  /* mmap */
16005         nodev,                  /* segmap */
16006         nochpoll,               /* poll */
16007         ddi_prop_op,            /* cb_prop_op */
16008         0,                      /* streamtab  */
16009         D_NEW | D_MP            /* Driver compatibility flag */
16010 };
16011 
16012 static struct dev_ops dtrace_ops = {
16013         DEVO_REV,               /* devo_rev */
16014         0,                      /* refcnt */
16015         dtrace_info,            /* get_dev_info */
16016         nulldev,                /* identify */
16017         nulldev,                /* probe */
16018         dtrace_attach,          /* attach */
16019         dtrace_detach,          /* detach */
16020         nodev,                  /* reset */
16021         &dtrace_cb_ops,             /* driver operations */
16022         NULL,                   /* bus operations */
16023         nodev,                  /* dev power */
16024         ddi_quiesce_not_needed,         /* quiesce */
16025 };
16026 
16027 static struct modldrv modldrv = {
16028         &mod_driverops,             /* module type (this is a pseudo driver) */
16029         "Dynamic Tracing",      /* name of module */
16030         &dtrace_ops,                /* driver ops */
16031 };
16032 
16033 static struct modlinkage modlinkage = {
16034         MODREV_1,
16035         (void *)&modldrv,
16036         NULL
16037 };
16038 
16039 int
16040 _init(void)
16041 {
16042         return (mod_install(&modlinkage));
16043 }
16044 
16045 int
16046 _info(struct modinfo *modinfop)
16047 {
16048         return (mod_info(&modlinkage, modinfop));
16049 }
16050 
16051 int
16052 _fini(void)
16053 {
16054         return (mod_remove(&modlinkage));
16055 }