2917 DTrace in a zone should have limited provider access
--- old/usr/src/uts/common/dtrace/dtrace.c
+++ new/usr/src/uts/common/dtrace/dtrace.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
25 25 */
26 26
27 27 /*
28 28 * DTrace - Dynamic Tracing for Solaris
29 29 *
30 30 * This is the implementation of the Solaris Dynamic Tracing framework
31 31 * (DTrace). The user-visible interface to DTrace is described at length in
32 32 * the "Solaris Dynamic Tracing Guide". The interfaces between the libdtrace
33 33 * library, the in-kernel DTrace framework, and the DTrace providers are
34 34 * described in the block comments in the <sys/dtrace.h> header file. The
35 35 * internal architecture of DTrace is described in the block comments in the
36 36 * <sys/dtrace_impl.h> header file. The comments contained within the DTrace
37 37 * implementation very much assume mastery of all of these sources; if one has
38 38 * an unanswered question about the implementation, one should consult them
39 39 * first.
40 40 *
41 41 * The functions here are ordered roughly as follows:
42 42 *
43 43 * - Probe context functions
44 44 * - Probe hashing functions
45 45 * - Non-probe context utility functions
46 46 * - Matching functions
47 47 * - Provider-to-Framework API functions
48 48 * - Probe management functions
49 49 * - DIF object functions
50 50 * - Format functions
51 51 * - Predicate functions
52 52 * - ECB functions
53 53 * - Buffer functions
54 54 * - Enabling functions
55 55 * - DOF functions
56 56 * - Anonymous enabling functions
57 57 * - Consumer state functions
58 58 * - Helper functions
59 59 * - Hook functions
60 60 * - Driver cookbook functions
61 61 *
62 62 * Each group of functions begins with a block comment labelled the "DTrace
63 63 * [Group] Functions", allowing one to find each block by searching forward
64 64 * on capital-f functions.
65 65 */
66 66 #include <sys/errno.h>
67 67 #include <sys/stat.h>
68 68 #include <sys/modctl.h>
69 69 #include <sys/conf.h>
70 70 #include <sys/systm.h>
71 71 #include <sys/ddi.h>
72 72 #include <sys/sunddi.h>
73 73 #include <sys/cpuvar.h>
74 74 #include <sys/kmem.h>
75 75 #include <sys/strsubr.h>
76 76 #include <sys/sysmacros.h>
77 77 #include <sys/dtrace_impl.h>
78 78 #include <sys/atomic.h>
79 79 #include <sys/cmn_err.h>
80 80 #include <sys/mutex_impl.h>
81 81 #include <sys/rwlock_impl.h>
82 82 #include <sys/ctf_api.h>
83 83 #include <sys/panic.h>
84 84 #include <sys/priv_impl.h>
85 85 #include <sys/policy.h>
86 86 #include <sys/cred_impl.h>
87 87 #include <sys/procfs_isa.h>
88 88 #include <sys/taskq.h>
89 89 #include <sys/mkdev.h>
90 90 #include <sys/kdi.h>
91 91 #include <sys/zone.h>
92 92 #include <sys/socket.h>
93 93 #include <netinet/in.h>
94 94
95 95 /*
96 96 * DTrace Tunable Variables
97 97 *
98 98 * The following variables may be tuned by adding a line to /etc/system that
99 99 * includes both the name of the DTrace module ("dtrace") and the name of the
100 100 * variable. For example:
101 101 *
102 102 * set dtrace:dtrace_destructive_disallow = 1
103 103 *
104 104 * In general, the only variables that one should be tuning this way are those
105 105 * that affect system-wide DTrace behavior, and for which the default behavior
106 106 * is undesirable. Most of these variables are tunable on a per-consumer
107 107 * basis using DTrace options, and need not be tuned on a system-wide basis.
108 108 * When tuning these variables, avoid pathological values; while some attempt
109 109 * is made to verify the integrity of these variables, they are not considered
110 110 * part of the supported interface to DTrace, and they are therefore not
111 111 * checked comprehensively. Further, these variables should not be tuned
112 112 * dynamically via "mdb -kw" or other means; they should only be tuned via
113 113 * /etc/system.
114 114 */
115 115 int dtrace_destructive_disallow = 0;
116 116 dtrace_optval_t dtrace_nonroot_maxsize = (16 * 1024 * 1024);
117 117 size_t dtrace_difo_maxsize = (256 * 1024);
118 118 dtrace_optval_t dtrace_dof_maxsize = (256 * 1024);
119 119 size_t dtrace_global_maxsize = (16 * 1024);
120 120 size_t dtrace_actions_max = (16 * 1024);
121 121 size_t dtrace_retain_max = 1024;
122 122 dtrace_optval_t dtrace_helper_actions_max = 1024;
123 123 dtrace_optval_t dtrace_helper_providers_max = 32;
124 124 dtrace_optval_t dtrace_dstate_defsize = (1 * 1024 * 1024);
125 125 size_t dtrace_strsize_default = 256;
126 126 dtrace_optval_t dtrace_cleanrate_default = 9900990; /* 101 hz */
127 127 dtrace_optval_t dtrace_cleanrate_min = 200000; /* 5000 hz */
128 128 dtrace_optval_t dtrace_cleanrate_max = (uint64_t)60 * NANOSEC; /* 1/minute */
129 129 dtrace_optval_t dtrace_aggrate_default = NANOSEC; /* 1 hz */
130 130 dtrace_optval_t dtrace_statusrate_default = NANOSEC; /* 1 hz */
131 131 dtrace_optval_t dtrace_statusrate_max = (hrtime_t)10 * NANOSEC; /* 6/minute */
132 132 dtrace_optval_t dtrace_switchrate_default = NANOSEC; /* 1 hz */
133 133 dtrace_optval_t dtrace_nspec_default = 1;
134 134 dtrace_optval_t dtrace_specsize_default = 32 * 1024;
135 135 dtrace_optval_t dtrace_stackframes_default = 20;
136 136 dtrace_optval_t dtrace_ustackframes_default = 20;
137 137 dtrace_optval_t dtrace_jstackframes_default = 50;
138 138 dtrace_optval_t dtrace_jstackstrsize_default = 512;
139 139 int dtrace_msgdsize_max = 128;
140 140 hrtime_t dtrace_chill_max = 500 * (NANOSEC / MILLISEC); /* 500 ms */
141 141 hrtime_t dtrace_chill_interval = NANOSEC; /* 1000 ms */
142 142 int dtrace_devdepth_max = 32;
143 143 int dtrace_err_verbose;
144 144 hrtime_t dtrace_deadman_interval = NANOSEC;
145 145 hrtime_t dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC;
146 146 hrtime_t dtrace_deadman_user = (hrtime_t)30 * NANOSEC;
147 147 hrtime_t dtrace_unregister_defunct_reap = (hrtime_t)60 * NANOSEC;
148 148
149 149 /*
150 150 * DTrace External Variables
151 151 *
152 152 * As dtrace(7D) is a kernel module, any DTrace variables are obviously
153 153 * available to DTrace consumers via the backtick (`) syntax. One of these,
154 154 * dtrace_zero, is made deliberately so: it is provided as a source of
155 155 * well-known, zero-filled memory. While this variable is not documented,
156 156 * it is used by some translators as an implementation detail.
157 157 */
158 158 const char dtrace_zero[256] = { 0 }; /* zero-filled memory */
159 159
160 160 /*
161 161 * DTrace Internal Variables
162 162 */
163 163 static dev_info_t *dtrace_devi; /* device info */
164 164 static vmem_t *dtrace_arena; /* probe ID arena */
165 165 static vmem_t *dtrace_minor; /* minor number arena */
166 166 static taskq_t *dtrace_taskq; /* task queue */
167 167 static dtrace_probe_t **dtrace_probes; /* array of all probes */
168 168 static int dtrace_nprobes; /* number of probes */
169 169 static dtrace_provider_t *dtrace_provider; /* provider list */
170 170 static dtrace_meta_t *dtrace_meta_pid; /* user-land meta provider */
171 171 static int dtrace_opens; /* number of opens */
172 172 static int dtrace_helpers; /* number of helpers */
173 173 static int dtrace_getf; /* number of unpriv getf()s */
174 174 static void *dtrace_softstate; /* softstate pointer */
175 175 static dtrace_hash_t *dtrace_bymod; /* probes hashed by module */
176 176 static dtrace_hash_t *dtrace_byfunc; /* probes hashed by function */
177 177 static dtrace_hash_t *dtrace_byname; /* probes hashed by name */
178 178 static dtrace_toxrange_t *dtrace_toxrange; /* toxic range array */
179 179 static int dtrace_toxranges; /* number of toxic ranges */
180 180 static int dtrace_toxranges_max; /* size of toxic range array */
181 181 static dtrace_anon_t dtrace_anon; /* anonymous enabling */
182 182 static kmem_cache_t *dtrace_state_cache; /* cache for dynamic state */
183 183 static uint64_t dtrace_vtime_references; /* number of vtimestamp refs */
184 184 static kthread_t *dtrace_panicked; /* panicking thread */
185 185 static dtrace_ecb_t *dtrace_ecb_create_cache; /* cached created ECB */
186 186 static dtrace_genid_t dtrace_probegen; /* current probe generation */
187 187 static dtrace_helpers_t *dtrace_deferred_pid; /* deferred helper list */
188 188 static dtrace_enabling_t *dtrace_retained; /* list of retained enablings */
189 189 static dtrace_genid_t dtrace_retained_gen; /* current retained enab gen */
190 190 static dtrace_dynvar_t dtrace_dynhash_sink; /* end of dynamic hash chains */
191 191 static int dtrace_dynvar_failclean; /* dynvars failed to clean */
192 192
193 193 /*
194 194 * DTrace Locking
195 195 * DTrace is protected by three (relatively coarse-grained) locks:
196 196 *
197 197 * (1) dtrace_lock is required to manipulate essentially any DTrace state,
198 198 * including enabling state, probes, ECBs, consumer state, helper state,
199 199 * etc. Importantly, dtrace_lock is _not_ required when in probe context;
200 200 * probe context is lock-free -- synchronization is handled via the
201 201 * dtrace_sync() cross call mechanism.
202 202 *
203 203 * (2) dtrace_provider_lock is required when manipulating provider state, or
204 204 * when provider state must be held constant.
205 205 *
206 206 * (3) dtrace_meta_lock is required when manipulating meta provider state, or
207 207 * when meta provider state must be held constant.
208 208 *
209 209 * The lock ordering between these three locks is dtrace_meta_lock before
210 210 * dtrace_provider_lock before dtrace_lock. (In particular, there are
211 211 * several places where dtrace_provider_lock is held by the framework as it
212 212 * calls into the providers -- which then call back into the framework,
213 213 * grabbing dtrace_lock.)
214 214 *
215 215 * There are two other locks in the mix: mod_lock and cpu_lock. With respect
216 216 * to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical
217 217 * role as a coarse-grained lock; it is acquired before both of these locks.
218 218 * With respect to dtrace_meta_lock, its behavior is stranger: cpu_lock must
219 219 * be acquired _between_ dtrace_meta_lock and any other DTrace locks.
220 220 * mod_lock is similar with respect to dtrace_provider_lock in that it must be
221 221 * acquired _between_ dtrace_provider_lock and dtrace_lock.
222 222 */
223 223 static kmutex_t dtrace_lock; /* probe state lock */
224 224 static kmutex_t dtrace_provider_lock; /* provider state lock */
225 225 static kmutex_t dtrace_meta_lock; /* meta-provider state lock */
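
	[Editorial note] The rules above imply a single consistent total order
	when all five locks are needed.  A minimal sketch (not part of the
	source), with the locks released in the conventional reverse order:

		mutex_enter(&dtrace_meta_lock);
		mutex_enter(&cpu_lock);
		mutex_enter(&dtrace_provider_lock);
		mutex_enter(&mod_lock);
		mutex_enter(&dtrace_lock);
		/* ... manipulate framework state ... */
		mutex_exit(&dtrace_lock);
		mutex_exit(&mod_lock);
		mutex_exit(&dtrace_provider_lock);
		mutex_exit(&cpu_lock);
		mutex_exit(&dtrace_meta_lock);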
226 226
227 227 /*
228 228 * DTrace Provider Variables
229 229 *
230 230 * These are the variables relating to DTrace as a provider (that is, the
231 231 * provider of the BEGIN, END, and ERROR probes).
232 232 */
233 233 static dtrace_pattr_t dtrace_provider_attr = {
234 234 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
235 235 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
236 236 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
237 237 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
238 238 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
239 239 };
240 240
241 241 static void
242 242 dtrace_nullop(void)
243 243 {}
244 244
245 245 static int
246 246 dtrace_enable_nullop(void)
247 247 {
248 248 return (0);
249 249 }
250 250
251 251 static dtrace_pops_t dtrace_provider_ops = {
252 252 (void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop,
253 253 (void (*)(void *, struct modctl *))dtrace_nullop,
254 254 (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop,
255 255 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
256 256 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
257 257 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
258 258 NULL,
259 259 NULL,
260 260 NULL,
261 261 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop
262 262 };
263 263
264 264 static dtrace_id_t dtrace_probeid_begin; /* special BEGIN probe */
265 265 static dtrace_id_t dtrace_probeid_end; /* special END probe */
266 266 dtrace_id_t dtrace_probeid_error; /* special ERROR probe */
267 267
268 268 /*
269 269 * DTrace Helper Tracing Variables
270 270 */
271 271 uint32_t dtrace_helptrace_next = 0;
272 272 uint32_t dtrace_helptrace_nlocals;
273 273 char *dtrace_helptrace_buffer;
274 274 int dtrace_helptrace_bufsize = 512 * 1024;
275 275
276 276 #ifdef DEBUG
277 277 int dtrace_helptrace_enabled = 1;
278 278 #else
279 279 int dtrace_helptrace_enabled = 0;
280 280 #endif
281 281
282 282 /*
283 283 * DTrace Error Hashing
284 284 *
285 285 * On DEBUG kernels, DTrace will track the errors that it has seen in a hash
286 286 * table. This is very useful for checking coverage of tests that are
287 287 * expected to induce DIF or DOF processing errors, and may be useful for
288 288 * debugging problems in the DIF code generator or in DOF generation. The
289 289 * error hash may be examined with the ::dtrace_errhash MDB dcmd.
290 290 */
291 291 #ifdef DEBUG
292 292 static dtrace_errhash_t dtrace_errhash[DTRACE_ERRHASHSZ];
293 293 static const char *dtrace_errlast;
294 294 static kthread_t *dtrace_errthread;
295 295 static kmutex_t dtrace_errlock;
296 296 #endif
297 297
298 298 /*
299 299 * DTrace Macros and Constants
300 300 *
301 301 * These are various macros that are useful in various spots in the
302 302 * implementation, along with a few random constants that have no meaning
303 303 * outside of the implementation. There is no real structure to this cpp
304 304 * mishmash -- but is there ever?
305 305 */
306 306 #define DTRACE_HASHSTR(hash, probe) \
307 307 dtrace_hash_str(*((char **)((uintptr_t)(probe) + (hash)->dth_stroffs)))
308 308
309 309 #define DTRACE_HASHNEXT(hash, probe) \
310 310 (dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_nextoffs)
311 311
312 312 #define DTRACE_HASHPREV(hash, probe) \
313 313 (dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_prevoffs)
314 314
315 315 #define DTRACE_HASHEQ(hash, lhs, rhs) \
316 316 (strcmp(*((char **)((uintptr_t)(lhs) + (hash)->dth_stroffs)), \
317 317 *((char **)((uintptr_t)(rhs) + (hash)->dth_stroffs))) == 0)
318 318
319 319 #define DTRACE_AGGHASHSIZE_SLEW 17
320 320
321 321 #define DTRACE_V4MAPPED_OFFSET (sizeof (uint32_t) * 3)
322 322
323 323 /*
324 324 * The key for a thread-local variable consists of the lower 61 bits of the
325 325 * t_did, plus the 3 bits of the highest active interrupt above LOCK_LEVEL.
326 326 * We add DIF_VARIABLE_MAX to t_did to assure that the thread key is never
327 327 * equal to a variable identifier. This is necessary (but not sufficient) to
328 328 * assure that global associative arrays never collide with thread-local
329 329 * variables. To guarantee that they cannot collide, we must also define the
330 330 * order for keying dynamic variables. That order is:
331 331 *
332 332 * [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ]
333 333 *
334 334 * Because the variable-key and the tls-key are in orthogonal spaces, there is
335 335 * no way for a global variable key signature to match a thread-local key
336 336 * signature.
337 337 */
338 338 #define DTRACE_TLS_THRKEY(where) { \
339 339 uint_t intr = 0; \
340 340 uint_t actv = CPU->cpu_intr_actv >> (LOCK_LEVEL + 1); \
341 341 for (; actv; actv >>= 1) \
342 342 intr++; \
343 343 ASSERT(intr < (1 << 3)); \
344 344 (where) = ((curthread->t_did + DIF_VARIABLE_MAX) & \
345 345 (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
346 346 }
347 347
348 348 #define DT_BSWAP_8(x) ((x) & 0xff)
349 349 #define DT_BSWAP_16(x) ((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8))
350 350 #define DT_BSWAP_32(x) ((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16))
351 351 #define DT_BSWAP_64(x) ((DT_BSWAP_32(x) << 32) | DT_BSWAP_32((x) >> 32))
352 352
353 353 #define DT_MASK_LO 0x00000000FFFFFFFFULL
354 354
355 355 #define DTRACE_STORE(type, tomax, offset, what) \
356 356 *((type *)((uintptr_t)(tomax) + (uintptr_t)offset)) = (type)(what);
357 357
358 358 #ifndef __i386
359 359 #define DTRACE_ALIGNCHECK(addr, size, flags) \
360 360 if (addr & (size - 1)) { \
361 361 *flags |= CPU_DTRACE_BADALIGN; \
362 362 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr; \
363 363 return (0); \
364 364 }
365 365 #else
366 366 #define DTRACE_ALIGNCHECK(addr, size, flags)
367 367 #endif
368 368
369 369 /*
370 370 * Test whether a range of memory starting at testaddr of size testsz falls
371 371 * within the range of memory described by addr, sz. We take care to avoid
372 372 * problems with overflow and underflow of the unsigned quantities, and
373 373 * disallow all negative sizes. Ranges of size 0 are allowed.
374 374 */
375 375 #define DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
376 376 ((testaddr) - (uintptr_t)(baseaddr) < (basesz) && \
377 377 (testaddr) + (testsz) - (uintptr_t)(baseaddr) <= (basesz) && \
378 378 (testaddr) + (testsz) >= (testaddr))
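
	[Editorial note] A user-space sketch (not part of the source) of why
	the third clause is needed: a "negative" testsz can wrap testaddr +
	testsz back into the base range and satisfy the first two clauses, so
	only the overflow check rejects it:

		#include <stdint.h>
		#include <stdio.h>

		#define	DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
			((testaddr) - (uintptr_t)(baseaddr) < (basesz) && \
			(testaddr) + (testsz) - (uintptr_t)(baseaddr) <= (basesz) && \
			(testaddr) + (testsz) >= (testaddr))

		int
		main(void)
		{
			uintptr_t base = 0x1000;

			/* An ordinary in-range request: accepted. */
			(void) printf("%d\n", DTRACE_INRANGE((uintptr_t)0x1100,
			    (uintptr_t)0x80, base, (uintptr_t)0x200)); /* 1 */

			/*
			 * A "negative" size: testaddr + testsz wraps to
			 * 0x1050, which passes the first two clauses; only
			 * the overflow clause fails.
			 */
			(void) printf("%d\n", DTRACE_INRANGE((uintptr_t)0x1100,
			    (uintptr_t)-0xb0, base, (uintptr_t)0x200)); /* 0 */
			return (0);
		}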
379 379
380 380 /*
381 381 * Test whether alloc_sz bytes will fit in the scratch region. We isolate
382 382 * alloc_sz on the righthand side of the comparison in order to avoid overflow
383 383 * or underflow in the comparison with it. This is simpler than the INRANGE
384 384 * check above, because we know that the dtms_scratch_ptr is valid in the
385 385 * range. Allocations of size zero are allowed.
386 386 */
387 387 #define DTRACE_INSCRATCH(mstate, alloc_sz) \
388 388 ((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - \
389 389 (mstate)->dtms_scratch_ptr >= (alloc_sz))
390 390
391 391 #define DTRACE_LOADFUNC(bits) \
392 392 /*CSTYLED*/ \
393 393 uint##bits##_t \
394 394 dtrace_load##bits(uintptr_t addr) \
395 395 { \
396 396 size_t size = bits / NBBY; \
397 397 /*CSTYLED*/ \
398 398 uint##bits##_t rval; \
399 399 int i; \
400 400 volatile uint16_t *flags = (volatile uint16_t *) \
401 401 &cpu_core[CPU->cpu_id].cpuc_dtrace_flags; \
402 402 \
403 403 DTRACE_ALIGNCHECK(addr, size, flags); \
404 404 \
405 405 for (i = 0; i < dtrace_toxranges; i++) { \
406 406 if (addr >= dtrace_toxrange[i].dtt_limit) \
407 407 continue; \
408 408 \
409 409 if (addr + size <= dtrace_toxrange[i].dtt_base) \
410 410 continue; \
411 411 \
412 412 /* \
413 413 * This address falls within a toxic region; return 0. \
414 414 */ \
415 415 *flags |= CPU_DTRACE_BADADDR; \
416 416 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr; \
417 417 return (0); \
418 418 } \
419 419 \
420 420 *flags |= CPU_DTRACE_NOFAULT; \
421 421 /*CSTYLED*/ \
422 422 rval = *((volatile uint##bits##_t *)addr); \
423 423 *flags &= ~CPU_DTRACE_NOFAULT; \
424 424 \
425 425 return (!(*flags & CPU_DTRACE_FAULT) ? rval : 0); \
426 426 }
427 427
428 428 #ifdef _LP64
429 429 #define dtrace_loadptr dtrace_load64
430 430 #else
431 431 #define dtrace_loadptr dtrace_load32
432 432 #endif
433 433
434 434 #define DTRACE_DYNHASH_FREE 0
435 435 #define DTRACE_DYNHASH_SINK 1
436 436 #define DTRACE_DYNHASH_VALID 2
437 437
438 438 #define DTRACE_MATCH_FAIL -1
439 439 #define DTRACE_MATCH_NEXT 0
440 440 #define DTRACE_MATCH_DONE 1
441 441 #define DTRACE_ANCHORED(probe) ((probe)->dtpr_func[0] != '\0')
442 442 #define DTRACE_STATE_ALIGN 64
443 443
444 444 #define DTRACE_FLAGS2FLT(flags) \
445 445 (((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR : \
446 446 ((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP : \
447 447 ((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO : \
448 448 ((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV : \
449 449 ((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV : \
450 450 ((flags) & CPU_DTRACE_TUPOFLOW) ? DTRACEFLT_TUPOFLOW : \
451 451 ((flags) & CPU_DTRACE_BADALIGN) ? DTRACEFLT_BADALIGN : \
452 452 ((flags) & CPU_DTRACE_NOSCRATCH) ? DTRACEFLT_NOSCRATCH : \
453 453 ((flags) & CPU_DTRACE_BADSTACK) ? DTRACEFLT_BADSTACK : \
454 454 DTRACEFLT_UNKNOWN)
455 455
456 456 #define DTRACEACT_ISSTRING(act) \
457 457 ((act)->dta_kind == DTRACEACT_DIFEXPR && \
458 458 (act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING)
459 459
460 460 static size_t dtrace_strlen(const char *, size_t);
461 461 static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id);
462 462 static void dtrace_enabling_provide(dtrace_provider_t *);
463 463 static int dtrace_enabling_match(dtrace_enabling_t *, int *);
464 464 static void dtrace_enabling_matchall(void);
465 465 static void dtrace_enabling_reap(void);
466 466 static dtrace_state_t *dtrace_anon_grab(void);
467 467 static uint64_t dtrace_helper(int, dtrace_mstate_t *,
468 468 dtrace_state_t *, uint64_t, uint64_t);
469 469 static dtrace_helpers_t *dtrace_helpers_create(proc_t *);
470 470 static void dtrace_buffer_drop(dtrace_buffer_t *);
471 471 static int dtrace_buffer_consumed(dtrace_buffer_t *, hrtime_t when);
472 472 static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,
473 473 dtrace_state_t *, dtrace_mstate_t *);
474 474 static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
475 475 dtrace_optval_t);
476 476 static int dtrace_ecb_create_enable(dtrace_probe_t *, void *);
477 477 static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);
478 478 static int dtrace_priv_proc(dtrace_state_t *, dtrace_mstate_t *);
479 479 static void dtrace_getf_barrier(void);
480 480
481 481 /*
482 482 * DTrace Probe Context Functions
483 483 *
484 484 * These functions are called from probe context. Because probe context is
485 485 * any context in which C may be called, arbitrary locks may be held,
486 486 * interrupts may be disabled, we may be in arbitrary dispatched state, etc.
487 487 * As a result, functions called from probe context may only call other DTrace
488 488 * support functions -- they may not interact at all with the system at large.
489 489 * (Note that the ASSERT macro is made probe-context safe by redefining it in
490 490 * terms of dtrace_assfail(), a probe-context safe function.) If arbitrary
491 491 * loads are to be performed from probe context, they _must_ be in terms of
492 492 * the safe dtrace_load*() variants.
493 493 *
494 494 * Some functions in this block are not actually called from probe context;
495 495 * for these functions, there will be a comment above the function reading
496 496 * "Note: not called from probe context."
497 497 */
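
	[Editorial note] An illustration of the load rule above: given an
	untrusted, DIF-supplied address addr, probe-context code never
	dereferences the pointer directly, but goes through the fault-tolerant
	loaders instantiated later in this file via DTRACE_LOADFUNC:

		uint8_t c;

		/* Unsafe in probe context -- a bad addr takes a real fault: */
		/* c = *(uint8_t *)addr; */

		/* Safe: a fault sets CPU_DTRACE_FAULT and yields 0 instead. */
		c = dtrace_load8(addr);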
498 498 void
499 499 dtrace_panic(const char *format, ...)
500 500 {
501 501 va_list alist;
502 502
503 503 va_start(alist, format);
504 504 dtrace_vpanic(format, alist);
505 505 va_end(alist);
506 506 }
507 507
508 508 int
509 509 dtrace_assfail(const char *a, const char *f, int l)
510 510 {
511 511 dtrace_panic("assertion failed: %s, file: %s, line: %d", a, f, l);
512 512
513 513 /*
514 514 * We just need something here that even the most clever compiler
515 515 * cannot optimize away.
516 516 */
517 517 return (a[(uintptr_t)f]);
518 518 }
519 519
520 520 /*
521 521 * Atomically increment a specified error counter from probe context.
522 522 */
523 523 static void
524 524 dtrace_error(uint32_t *counter)
525 525 {
526 526 /*
527 527 * Most counters stored to in probe context are per-CPU counters.
528 528 * However, there are some error conditions that are sufficiently
529 529 * arcane that they don't merit per-CPU storage. If these counters
530 530 * are incremented concurrently on different CPUs, scalability will be
531 531 * adversely affected -- but we don't expect them to be white-hot in a
532 532 * correctly constructed enabling...
533 533 */
534 534 uint32_t oval, nval;
535 535
536 536 do {
537 537 oval = *counter;
538 538
539 539 if ((nval = oval + 1) == 0) {
540 540 /*
541 541 * If the counter would wrap, set it to 1 -- assuring
542 542 * that the counter is never zero when we have seen
543 543 * errors. (The counter must be 32-bits because we
544 544 * aren't guaranteed a 64-bit compare&swap operation.)
545 545 * To save this code both the infamy of being fingered
546 546 * by a priggish news story and the indignity of being
547 547 * the target of a neo-puritan witch trial, we're
548 548 * carefully avoiding any colorful description of the
549 549 * likelihood of this condition -- but suffice it to
550 550 * say that it is only slightly more likely than the
551 551 * overflow of predicate cache IDs, as discussed in
552 552 * dtrace_predicate_create().
553 553 */
554 554 nval = 1;
555 555 }
556 556 } while (dtrace_cas32(counter, oval, nval) != oval);
557 557 }
558 558
559 559 /*
560 560 * Use the DTRACE_LOADFUNC macro to define functions for each of loading a
561 561 * uint8_t, a uint16_t, a uint32_t and a uint64_t.
562 562 */
563 563 DTRACE_LOADFUNC(8)
564 564 DTRACE_LOADFUNC(16)
565 565 DTRACE_LOADFUNC(32)
566 566 DTRACE_LOADFUNC(64)
567 567
568 568 static int
569 569 dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate)
570 570 {
571 571 if (dest < mstate->dtms_scratch_base)
572 572 return (0);
573 573
574 574 if (dest + size < dest)
575 575 return (0);
576 576
577 577 if (dest + size > mstate->dtms_scratch_ptr)
578 578 return (0);
579 579
580 580 return (1);
581 581 }
582 582
583 583 static int
584 584 dtrace_canstore_statvar(uint64_t addr, size_t sz,
585 585 dtrace_statvar_t **svars, int nsvars)
586 586 {
587 587 int i;
588 588
589 589 for (i = 0; i < nsvars; i++) {
590 590 dtrace_statvar_t *svar = svars[i];
591 591
592 592 if (svar == NULL || svar->dtsv_size == 0)
593 593 continue;
594 594
595 595 if (DTRACE_INRANGE(addr, sz, svar->dtsv_data, svar->dtsv_size))
596 596 return (1);
597 597 }
598 598
599 599 return (0);
600 600 }
601 601
602 602 /*
603 603 * Check to see if the address is within a memory region to which a store may
604 604 * be issued. This includes the DTrace scratch areas, and any DTrace variable
605 605 * region. The caller of dtrace_canstore() is responsible for performing any
606 606 * alignment checks that are needed before stores are actually executed.
607 607 */
608 608 static int
609 609 dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
610 610 dtrace_vstate_t *vstate)
611 611 {
612 612 /*
613 613 * First, check to see if the address is in scratch space...
614 614 */
615 615 if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base,
616 616 mstate->dtms_scratch_size))
617 617 return (1);
618 618
619 619 /*
620 620 * Now check to see if it's a dynamic variable. This check will pick
621 621 * up both thread-local variables and any global dynamically-allocated
622 622 * variables.
623 623 */
624 624 if (DTRACE_INRANGE(addr, sz, vstate->dtvs_dynvars.dtds_base,
625 625 vstate->dtvs_dynvars.dtds_size)) {
626 626 dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
627 627 uintptr_t base = (uintptr_t)dstate->dtds_base +
628 628 (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
629 629 uintptr_t chunkoffs;
630 630
631 631 /*
632 632 * Before we assume that we can store here, we need to make
633 633 * sure that it isn't in our metadata -- storing to our
634 634 * dynamic variable metadata would corrupt our state. For
635 635 * the range to not include any dynamic variable metadata,
636 636 * it must:
637 637 *
638 638 * (1) Start above the hash table that is at the base of
639 639 * the dynamic variable space
640 640 *
641 641 * (2) Have a starting chunk offset that is beyond the
642 642 * dtrace_dynvar_t that is at the base of every chunk
643 643 *
644 644 * (3) Not span a chunk boundary
645 645 *
646 646 */
647 647 if (addr < base)
648 648 return (0);
649 649
650 650 chunkoffs = (addr - base) % dstate->dtds_chunksize;
651 651
652 652 if (chunkoffs < sizeof (dtrace_dynvar_t))
653 653 return (0);
654 654
655 655 if (chunkoffs + sz > dstate->dtds_chunksize)
656 656 return (0);
657 657
658 658 return (1);
659 659 }
660 660
661 661 /*
662 662 * Finally, check the static local and global variables. These checks
663 663 * take the longest, so we perform them last.
664 664 */
665 665 if (dtrace_canstore_statvar(addr, sz,
666 666 vstate->dtvs_locals, vstate->dtvs_nlocals))
667 667 return (1);
668 668
669 669 if (dtrace_canstore_statvar(addr, sz,
670 670 vstate->dtvs_globals, vstate->dtvs_nglobals))
671 671 return (1);
672 672
673 673 return (0);
674 674 }
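
	[Editorial note] To picture the geometry that conditions (1) through
	(3) in dtrace_canstore() above protect, a sketch of the dynamic
	variable space layout:

	    dtds_base
	    |
	    v
	    +------------+-------------------+-------------------+--
	    | hash table | chunk 0           | chunk 1           | ...
	    +------------+-------------------+-------------------+--
	                 | dynvar_t |  data  | dynvar_t |  data  |
	                 +----------+--------+----------+--------+

	A store may only land in a "data" region: entirely above the hash
	table (1), at a chunk offset past the dtrace_dynvar_t metadata header
	(2), and without crossing into the following chunk (3).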
675 675
676 676
677 677 /*
678 678 * Convenience routine to check to see if the address is within a memory
679 679 * region in which a load may be issued given the user's privilege level;
680 680 * if not, it sets the appropriate error flags and loads 'addr' into the
681 681 * illegal value slot.
682 682 *
683 683 * DTrace subroutines (DIF_SUBR_*) should use this helper to implement
684 684 * appropriate memory access protection.
685 685 */
686 686 static int
687 687 dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
688 688 dtrace_vstate_t *vstate)
689 689 {
690 690 volatile uintptr_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
691 691 file_t *fp;
692 692
693 693 /*
694 694 * If we hold the privilege to read from kernel memory, then
695 695 * everything is readable.
696 696 */
697 697 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
698 698 return (1);
699 699
700 700 /*
701 701 * You can obviously read that which you can store.
702 702 */
703 703 if (dtrace_canstore(addr, sz, mstate, vstate))
704 704 return (1);
705 705
706 706 /*
707 707 * We're allowed to read from our own string table.
708 708 */
709 709 if (DTRACE_INRANGE(addr, sz, mstate->dtms_difo->dtdo_strtab,
710 710 mstate->dtms_difo->dtdo_strlen))
711 711 return (1);
712 712
713 713 if (vstate->dtvs_state != NULL &&
714 714 dtrace_priv_proc(vstate->dtvs_state, mstate)) {
715 715 proc_t *p;
716 716
717 717 /*
718 718 * When we have privileges to the current process, there are
719 719 * several context-related kernel structures that are safe to
720 720 * read, even absent the privilege to read from kernel memory.
721 721 * These reads are safe because these structures contain only
722 722 * state that (1) we're permitted to read, (2) is harmless or
723 723 * (3) contains pointers to additional kernel state that we're
724 724 * not permitted to read (and as such, do not present an
725 725 * opportunity for privilege escalation). Finally (and
726 726 * critically), because of the nature of their relation with
727 727 * the current thread context, the memory associated with these
728 728 * structures cannot change over the duration of probe context,
729 729 * and it is therefore impossible for this memory to be
730 730 * deallocated and reallocated as something else while it's
731 731 * being operated upon.
732 732 */
733 733 if (DTRACE_INRANGE(addr, sz, curthread, sizeof (kthread_t)))
734 734 return (1);
735 735
736 736 if ((p = curthread->t_procp) != NULL && DTRACE_INRANGE(addr,
737 737 sz, curthread->t_procp, sizeof (proc_t))) {
738 738 return (1);
739 739 }
740 740
741 741 if (curthread->t_cred != NULL && DTRACE_INRANGE(addr, sz,
742 742 curthread->t_cred, sizeof (cred_t))) {
743 743 return (1);
744 744 }
745 745
746 746 if (p != NULL && p->p_pidp != NULL && DTRACE_INRANGE(addr, sz,
747 747 &(p->p_pidp->pid_id), sizeof (pid_t))) {
748 748 return (1);
749 749 }
750 750
751 751 if (curthread->t_cpu != NULL && DTRACE_INRANGE(addr, sz,
752 752 curthread->t_cpu, offsetof(cpu_t, cpu_pause_thread))) {
753 753 return (1);
754 754 }
755 755 }
756 756
757 757 if ((fp = mstate->dtms_getf) != NULL) {
758 758 uintptr_t psz = sizeof (void *);
759 759 vnode_t *vp;
760 760 vnodeops_t *op;
761 761
762 762 /*
763 763 * When getf() returns a file_t, the enabling is implicitly
764 764 * granted the (transient) right to read the returned file_t
765 765 * as well as the v_path and v_op->vnop_name of the underlying
766 766 * vnode. These accesses are allowed after a successful
767 767 * getf() because the members that they refer to cannot change
768 768 * once set -- and the barrier logic in the kernel's closef()
769 769 * path assures that the file_t and its referenced vnode_t
770 770 * cannot themselves be stale (that is, it is impossible for
771 771 * either dtms_getf itself or its f_vnode member to reference
772 772 * freed memory).
773 773 */
774 774 if (DTRACE_INRANGE(addr, sz, fp, sizeof (file_t)))
775 775 return (1);
776 776
777 777 if ((vp = fp->f_vnode) != NULL) {
778 778 if (DTRACE_INRANGE(addr, sz, &vp->v_path, psz))
779 779 return (1);
780 780
781 781 if (vp->v_path != NULL && DTRACE_INRANGE(addr, sz,
782 782 vp->v_path, strlen(vp->v_path) + 1)) {
783 783 return (1);
784 784 }
785 785
786 786 if (DTRACE_INRANGE(addr, sz, &vp->v_op, psz))
787 787 return (1);
788 788
789 789 if ((op = vp->v_op) != NULL &&
790 790 DTRACE_INRANGE(addr, sz, &op->vnop_name, psz)) {
791 791 return (1);
792 792 }
793 793
794 794 if (op != NULL && op->vnop_name != NULL &&
795 795 DTRACE_INRANGE(addr, sz, op->vnop_name,
796 796 strlen(op->vnop_name) + 1)) {
797 797 return (1);
798 798 }
799 799 }
800 800 }
801 801
802 802 DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
803 803 *illval = addr;
804 804 return (0);
805 805 }
806 806
807 807 /*
808 808 * Convenience routine to check to see if a given string is within a memory
809 809 * region in which a load may be issued given the user's privilege level;
810 810 * this exists so that we don't need to issue unnecessary dtrace_strlen()
811 811 * calls in the event that the user has all privileges.
812 812 */
813 813 static int
814 814 dtrace_strcanload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
815 815 dtrace_vstate_t *vstate)
816 816 {
817 817 size_t strsz;
818 818
819 819 /*
820 820 * If we hold the privilege to read from kernel memory, then
821 821 * everything is readable.
822 822 */
823 823 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
824 824 return (1);
825 825
826 826 strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr, sz);
827 827 if (dtrace_canload(addr, strsz, mstate, vstate))
828 828 return (1);
829 829
830 830 return (0);
831 831 }
832 832
833 833 /*
834 834 * Convenience routine to check to see if a given variable is within a memory
835 835 * region in which a load may be issued given the user's privilege level.
836 836 */
837 837 static int
838 838 dtrace_vcanload(void *src, dtrace_diftype_t *type, dtrace_mstate_t *mstate,
839 839 dtrace_vstate_t *vstate)
840 840 {
841 841 size_t sz;
842 842 ASSERT(type->dtdt_flags & DIF_TF_BYREF);
843 843
844 844 /*
845 845 * If we hold the privilege to read from kernel memory, then
846 846 * everything is readable.
847 847 */
848 848 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
849 849 return (1);
850 850
851 851 if (type->dtdt_kind == DIF_TYPE_STRING)
852 852 sz = dtrace_strlen(src,
853 853 vstate->dtvs_state->dts_options[DTRACEOPT_STRSIZE]) + 1;
854 854 else
855 855 sz = type->dtdt_size;
856 856
857 857 return (dtrace_canload((uintptr_t)src, sz, mstate, vstate));
858 858 }
859 859
860 860 /*
861 861 * Compare two strings using safe loads.
862 862 */
863 863 static int
864 864 dtrace_strncmp(char *s1, char *s2, size_t limit)
865 865 {
866 866 uint8_t c1, c2;
867 867 volatile uint16_t *flags;
868 868
869 869 if (s1 == s2 || limit == 0)
870 870 return (0);
871 871
872 872 flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
873 873
874 874 do {
875 875 if (s1 == NULL) {
876 876 c1 = '\0';
877 877 } else {
878 878 c1 = dtrace_load8((uintptr_t)s1++);
879 879 }
880 880
881 881 if (s2 == NULL) {
882 882 c2 = '\0';
883 883 } else {
884 884 c2 = dtrace_load8((uintptr_t)s2++);
885 885 }
886 886
887 887 if (c1 != c2)
888 888 return (c1 - c2);
889 889 } while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT));
890 890
891 891 return (0);
892 892 }
893 893
894 894 /*
895 895 * Compute strlen(s) for a string using safe memory accesses. The additional
896 896 * lim parameter is used to specify a maximum length to ensure completion.
897 897 */
898 898 static size_t
899 899 dtrace_strlen(const char *s, size_t lim)
900 900 {
901 901 uint_t len;
902 902
903 903 for (len = 0; len != lim; len++) {
904 904 if (dtrace_load8((uintptr_t)s++) == '\0')
905 905 break;
906 906 }
907 907
908 908 return (len);
909 909 }
910 910
911 911 /*
912 912 * Check if the range [kaddr, kaddr + size) overlaps a toxic region.
913 913 */
914 914 static int
915 915 dtrace_istoxic(uintptr_t kaddr, size_t size)
916 916 {
917 917 uintptr_t taddr, tsize;
918 918 int i;
919 919
920 920 for (i = 0; i < dtrace_toxranges; i++) {
921 921 taddr = dtrace_toxrange[i].dtt_base;
922 922 tsize = dtrace_toxrange[i].dtt_limit - taddr;
923 923
924 924 if (kaddr - taddr < tsize) {
925 925 DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
926 926 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = kaddr;
927 927 return (1);
928 928 }
929 929
930 930 if (taddr - kaddr < size) {
931 931 DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
932 932 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = taddr;
933 933 return (1);
934 934 }
935 935 }
936 936
937 937 return (0);
938 938 }
939 939
940 940 /*
941 941 * Copy src to dst using safe memory accesses. The src is assumed to be unsafe
942 942 * memory specified by the DIF program. The dst is assumed to be safe memory
943 943 * that we can store to directly because it is managed by DTrace. As with
944 944 * standard bcopy, overlapping copies are handled properly.
945 945 */
946 946 static void
947 947 dtrace_bcopy(const void *src, void *dst, size_t len)
948 948 {
949 949 if (len != 0) {
950 950 uint8_t *s1 = dst;
951 951 const uint8_t *s2 = src;
952 952
953 953 if (s1 <= s2) {
954 954 do {
955 955 *s1++ = dtrace_load8((uintptr_t)s2++);
956 956 } while (--len != 0);
957 957 } else {
958 958 s2 += len;
959 959 s1 += len;
960 960
961 961 do {
962 962 *--s1 = dtrace_load8((uintptr_t)--s2);
963 963 } while (--len != 0);
964 964 }
965 965 }
966 966 }
967 967
968 968 /*
969 969 * Copy src to dst using safe memory accesses, up to either the specified
970 970 * length, or the point that a nul byte is encountered. The src is assumed to
971 971 * be unsafe memory specified by the DIF program. The dst is assumed to be
972 972 * safe memory that we can store to directly because it is managed by DTrace.
973 973 * Unlike dtrace_bcopy(), overlapping regions are not handled.
974 974 */
975 975 static void
976 976 dtrace_strcpy(const void *src, void *dst, size_t len)
977 977 {
978 978 if (len != 0) {
979 979 uint8_t *s1 = dst, c;
980 980 const uint8_t *s2 = src;
981 981
982 982 do {
983 983 *s1++ = c = dtrace_load8((uintptr_t)s2++);
984 984 } while (--len != 0 && c != '\0');
985 985 }
986 986 }
987 987
988 988 /*
989 989 * Copy src to dst, deriving the size and type from the specified (BYREF)
990 990 * variable type. The src is assumed to be unsafe memory specified by the DIF
991 991 * program. The dst is assumed to be DTrace variable memory that is of the
992 992 * specified type; we assume that we can store to it directly.
993 993 */
994 994 static void
995 995 dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type)
996 996 {
997 997 ASSERT(type->dtdt_flags & DIF_TF_BYREF);
998 998
999 999 if (type->dtdt_kind == DIF_TYPE_STRING) {
1000 1000 dtrace_strcpy(src, dst, type->dtdt_size);
1001 1001 } else {
1002 1002 dtrace_bcopy(src, dst, type->dtdt_size);
1003 1003 }
1004 1004 }
1005 1005
1006 1006 /*
1007 1007 * Compare s1 to s2 using safe memory accesses. The s1 data is assumed to be
1008 1008 * unsafe memory specified by the DIF program. The s2 data is assumed to be
1009 1009 * safe memory that we can access directly because it is managed by DTrace.
1010 1010 */
1011 1011 static int
1012 1012 dtrace_bcmp(const void *s1, const void *s2, size_t len)
1013 1013 {
1014 1014 volatile uint16_t *flags;
1015 1015
1016 1016 flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
1017 1017
1018 1018 if (s1 == s2)
1019 1019 return (0);
1020 1020
1021 1021 if (s1 == NULL || s2 == NULL)
1022 1022 return (1);
1023 1023
1024 1024 if (s1 != s2 && len != 0) {
1025 1025 const uint8_t *ps1 = s1;
1026 1026 const uint8_t *ps2 = s2;
1027 1027
1028 1028 do {
1029 1029 if (dtrace_load8((uintptr_t)ps1++) != *ps2++)
1030 1030 return (1);
1031 1031 } while (--len != 0 && !(*flags & CPU_DTRACE_FAULT));
1032 1032 }
1033 1033 return (0);
1034 1034 }
1035 1035
1036 1036 /*
1037 1037 * Zero the specified region using a simple byte-by-byte loop. Note that this
1038 1038 * is for safe DTrace-managed memory only.
1039 1039 */
1040 1040 static void
1041 1041 dtrace_bzero(void *dst, size_t len)
1042 1042 {
1043 1043 uchar_t *cp;
1044 1044
1045 1045 for (cp = dst; len != 0; len--)
1046 1046 *cp++ = 0;
1047 1047 }
1048 1048
1049 1049 static void
1050 1050 dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
1051 1051 {
1052 1052 uint64_t result[2];
1053 1053
1054 1054 result[0] = addend1[0] + addend2[0];
1055 1055 result[1] = addend1[1] + addend2[1] +
1056 1056 (result[0] < addend1[0] || result[0] < addend2[0] ? 1 : 0);
1057 1057
1058 1058 sum[0] = result[0];
1059 1059 sum[1] = result[1];
1060 1060 }
1061 1061
1062 1062 /*
1063 1063 * Shift the 128-bit value in a by b. If b is positive, shift left.
1064 1064 * If b is negative, shift right.
1065 1065 */
1066 1066 static void
1067 1067 dtrace_shift_128(uint64_t *a, int b)
1068 1068 {
1069 1069 uint64_t mask;
1070 1070
1071 1071 if (b == 0)
1072 1072 return;
1073 1073
1074 1074 if (b < 0) {
1075 1075 b = -b;
1076 1076 if (b >= 64) {
1077 1077 a[0] = a[1] >> (b - 64);
1078 1078 a[1] = 0;
1079 1079 } else {
1080 1080 a[0] >>= b;
1081 1081 mask = 1LL << (64 - b);
1082 1082 mask -= 1;
1083 1083 a[0] |= ((a[1] & mask) << (64 - b));
1084 1084 a[1] >>= b;
1085 1085 }
1086 1086 } else {
1087 1087 if (b >= 64) {
1088 1088 a[1] = a[0] << (b - 64);
1089 1089 a[0] = 0;
1090 1090 } else {
1091 1091 a[1] <<= b;
1092 1092 mask = a[0] >> (64 - b);
1093 1093 a[1] |= mask;
1094 1094 a[0] <<= b;
1095 1095 }
1096 1096 }
1097 1097 }
1098 1098
1099 1099 /*
1100 1100 * The basic idea is to break the 2 64-bit values into 4 32-bit values,
1101 1101 * use native multiplication on those, and then re-combine into the
1102 1102 * resulting 128-bit value.
1103 1103 *
1104 1104 * ((hi1 << 32) + lo1) * ((hi2 << 32) + lo2) =
1105 1105 * (hi1 * hi2) << 64 +
1106 1106 * (hi1 * lo2) << 32 +
1107 1107 * (hi2 * lo1) << 32 +
1108 1108 * lo1 * lo2
1109 1109 */
1110 1110 static void
1111 1111 dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)
1112 1112 {
1113 1113 uint64_t hi1, hi2, lo1, lo2;
1114 1114 uint64_t tmp[2];
1115 1115
1116 1116 hi1 = factor1 >> 32;
1117 1117 hi2 = factor2 >> 32;
1118 1118
1119 1119 lo1 = factor1 & DT_MASK_LO;
1120 1120 lo2 = factor2 & DT_MASK_LO;
1121 1121
1122 1122 product[0] = lo1 * lo2;
1123 1123 product[1] = hi1 * hi2;
1124 1124
1125 1125 tmp[0] = hi1 * lo2;
1126 1126 tmp[1] = 0;
1127 1127 dtrace_shift_128(tmp, 32);
1128 1128 dtrace_add_128(product, tmp, product);
1129 1129
1130 1130 tmp[0] = hi2 * lo1;
1131 1131 tmp[1] = 0;
1132 1132 dtrace_shift_128(tmp, 32);
1133 1133 dtrace_add_128(product, tmp, product);
1134 1134 }
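
	[Editorial note] A user-space sanity check of the decomposition (not
	part of the source; assumes a compiler with native 128-bit support):
	with factor1 = 2^32 + 3 and factor2 = 2^32 + 5, we have hi1 = hi2 = 1,
	lo1 = 3, lo2 = 5, so the four terms are 1 << 64, 5 << 32, 3 << 32, and
	15 -- i.e. product[1] == 1 and product[0] == 0x80000000f:

		#include <stdint.h>
		#include <stdio.h>

		int
		main(void)
		{
			uint64_t f1 = 0x100000003ULL;	/* 2^32 + 3 */
			uint64_t f2 = 0x100000005ULL;	/* 2^32 + 5 */
			unsigned __int128 p = (unsigned __int128)f1 * f2;

			/* Expect hi=1 lo=80000000f, the sum of the terms. */
			(void) printf("hi=%llx lo=%llx\n",
			    (unsigned long long)(p >> 64),
			    (unsigned long long)(uint64_t)p);
			return (0);
		}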
1135 1135
1136 1136 /*
1137 1137 * This privilege check should be used by actions and subroutines to
1138 1138 * verify that the user credentials of the process that enabled the
1139 1139 * invoking ECB match the target credentials.
1140 1140 */
1141 1141 static int
1142 1142 dtrace_priv_proc_common_user(dtrace_state_t *state)
1143 1143 {
1144 1144 cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1145 1145
1146 1146 /*
1147 1147 * We should always have a non-NULL state cred here, since if cred
1148 1148 * is null (anonymous tracing), we fast-path bypass this routine.
1149 1149 */
1150 1150 ASSERT(s_cr != NULL);
1151 1151
1152 1152 if ((cr = CRED()) != NULL &&
1153 1153 s_cr->cr_uid == cr->cr_uid &&
1154 1154 s_cr->cr_uid == cr->cr_ruid &&
1155 1155 s_cr->cr_uid == cr->cr_suid &&
1156 1156 s_cr->cr_gid == cr->cr_gid &&
1157 1157 s_cr->cr_gid == cr->cr_rgid &&
1158 1158 s_cr->cr_gid == cr->cr_sgid)
1159 1159 return (1);
1160 1160
1161 1161 return (0);
1162 1162 }
1163 1163
1164 1164 /*
1165 1165 * This privilege check should be used by actions and subroutines to
1166 1166 * verify that the zone of the process that enabled the invoking ECB
1167 1167 * matches the target credentials.
1168 1168 */
1169 1169 static int
1170 1170 dtrace_priv_proc_common_zone(dtrace_state_t *state)
1171 1171 {
1172 1172 cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1173 1173
1174 1174 /*
1175 1175 * We should always have a non-NULL state cred here, since if cred
1176 1176 * is null (anonymous tracing), we fast-path bypass this routine.
1177 1177 */
1178 1178 ASSERT(s_cr != NULL);
1179 1179
1180 1180 if ((cr = CRED()) != NULL && s_cr->cr_zone == cr->cr_zone)
1181 1181 return (1);
1182 1182
1183 1183 return (0);
1184 1184 }
1185 1185
1186 1186 /*
1187 1187 * This privilege check should be used by actions and subroutines to
1188 1188 * verify that the process has not setuid or changed credentials.
1189 1189 */
1190 1190 static int
1191 1191 dtrace_priv_proc_common_nocd()
1192 1192 {
1193 1193 proc_t *proc;
1194 1194
1195 1195 if ((proc = ttoproc(curthread)) != NULL &&
1196 1196 !(proc->p_flag & SNOCD))
1197 1197 return (1);
1198 1198
1199 1199 return (0);
1200 1200 }
1201 1201
1202 1202 static int
1203 1203 dtrace_priv_proc_destructive(dtrace_state_t *state, dtrace_mstate_t *mstate)
1204 1204 {
1205 1205 int action = state->dts_cred.dcr_action;
1206 1206
1207 1207 if (!(mstate->dtms_access & DTRACE_ACCESS_PROC))
1208 1208 goto bad;
1209 1209
1210 1210 if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) &&
1211 1211 dtrace_priv_proc_common_zone(state) == 0)
1212 1212 goto bad;
1213 1213
1214 1214 if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == 0) &&
1215 1215 dtrace_priv_proc_common_user(state) == 0)
1216 1216 goto bad;
1217 1217
1218 1218 if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == 0) &&
1219 1219 dtrace_priv_proc_common_nocd() == 0)
1220 1220 goto bad;
1221 1221
1222 1222 return (1);
1223 1223
1224 1224 bad:
1225 1225 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1226 1226
1227 1227 return (0);
1228 1228 }
1229 1229
1230 1230 static int
1231 1231 dtrace_priv_proc_control(dtrace_state_t *state, dtrace_mstate_t *mstate)
1232 1232 {
1233 1233 if (mstate->dtms_access & DTRACE_ACCESS_PROC) {
1234 1234 if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL)
1235 1235 return (1);
1236 1236
1237 1237 if (dtrace_priv_proc_common_zone(state) &&
1238 1238 dtrace_priv_proc_common_user(state) &&
1239 1239 dtrace_priv_proc_common_nocd())
1240 1240 return (1);
1241 1241 }
1242 1242
1243 1243 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1244 1244
1245 1245 return (0);
1246 1246 }
1247 1247
1248 1248 static int
1249 1249 dtrace_priv_proc(dtrace_state_t *state, dtrace_mstate_t *mstate)
1250 1250 {
1251 1251 if ((mstate->dtms_access & DTRACE_ACCESS_PROC) &&
1252 1252 (state->dts_cred.dcr_action & DTRACE_CRA_PROC))
1253 1253 return (1);
1254 1254
1255 1255 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1256 1256
1257 1257 return (0);
1258 1258 }
1259 1259
1260 1260 static int
1261 1261 dtrace_priv_kernel(dtrace_state_t *state)
1262 1262 {
1263 1263 if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL)
1264 1264 return (1);
1265 1265
1266 1266 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1267 1267
1268 1268 return (0);
1269 1269 }
1270 1270
1271 1271 static int
1272 1272 dtrace_priv_kernel_destructive(dtrace_state_t *state)
1273 1273 {
1274 1274 if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE)
1275 1275 return (1);
1276 1276
1277 1277 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1278 1278
1279 1279 return (0);
1280 1280 }
1281 1281
1282 1282 /*
1283 1283 * Determine if the dte_cond of the specified ECB allows for processing of
1284 1284 * the current probe to continue. Note that this routine may allow continued
1285 1285 * processing, but with access(es) stripped from the mstate's dtms_access
1286 1286 * field.
1287 1287 */
1288 1288 static int
1289 1289 dtrace_priv_probe(dtrace_state_t *state, dtrace_mstate_t *mstate,
1290 1290 dtrace_ecb_t *ecb)
1291 1291 {
1292 1292 dtrace_probe_t *probe = ecb->dte_probe;
1293 1293 dtrace_provider_t *prov = probe->dtpr_provider;
1294 1294 dtrace_pops_t *pops = &prov->dtpv_pops;
1295 1295 int mode = DTRACE_MODE_NOPRIV_DROP;
1296 1296
1297 1297 ASSERT(ecb->dte_cond);
1298 1298
1299 1299 if (pops->dtps_mode != NULL) {
1300 1300 mode = pops->dtps_mode(prov->dtpv_arg,
1301 1301 probe->dtpr_id, probe->dtpr_arg);
1302 1302
1303 - ASSERT((mode & DTRACE_MODE_USER) ||
1304 - (mode & DTRACE_MODE_KERNEL));
1305 - ASSERT((mode & DTRACE_MODE_NOPRIV_RESTRICT) ||
1306 - (mode & DTRACE_MODE_NOPRIV_DROP));
1303 + ASSERT(mode & (DTRACE_MODE_USER | DTRACE_MODE_KERNEL));
1304 + ASSERT(mode & (DTRACE_MODE_NOPRIV_RESTRICT |
1305 + DTRACE_MODE_NOPRIV_DROP));
1307 1306 }
1308 1307
1309 1308 /*
1310 1309 * If the dte_cond bits indicate that this consumer is only allowed to
1311 - * see user-mode firings of this probe, call the provider's dtps_mode()
1312 - * entry point to check that the probe was fired while in a user
1313 - * context. If that's not the case, use the policy specified by the
1314 - * provider to determine if we drop the probe or merely restrict
1315 - * operation.
1310 + * see user-mode firings of this probe, check that the probe was fired
1311 + * while in a user context. If that's not the case, use the policy
1312 + * specified by the provider to determine if we drop the probe or
1313 + * merely restrict operation.
1316 1314 */
1317 1315 if (ecb->dte_cond & DTRACE_COND_USERMODE) {
1318 1316 ASSERT(mode != DTRACE_MODE_NOPRIV_DROP);
1319 1317
1320 1318 if (!(mode & DTRACE_MODE_USER)) {
1321 1319 if (mode & DTRACE_MODE_NOPRIV_DROP)
1322 1320 return (0);
1323 1321
1324 1322 mstate->dtms_access &= ~DTRACE_ACCESS_ARGS;
1325 1323 }
1326 1324 }
1327 1325
1328 1326 /*
1329 1327 * This is more subtle than it looks. We have to be absolutely certain
1330 1328 * that CRED() isn't going to change out from under us so it's only
1331 1329 * legit to examine that structure if we're in constrained situations.
1332 1330 * Currently, the only time we'll do this check is if a non-super-user
1332 1330 * has enabled the profile or syscall providers -- providers that
1333 1331 * has enabled the profile or syscall providers -- providers that
1334 1332 * allow visibility of all processes. For the profile case, the check
1335 1333 * above will ensure that we're examining a user context.
1336 1334 */
1337 1335 if (ecb->dte_cond & DTRACE_COND_OWNER) {
1338 1336 cred_t *cr;
1339 1337 cred_t *s_cr = state->dts_cred.dcr_cred;
1340 1338 proc_t *proc;
1341 1339
1342 1340 ASSERT(s_cr != NULL);
1343 1341
1344 1342 if ((cr = CRED()) == NULL ||
1345 1343 s_cr->cr_uid != cr->cr_uid ||
1346 1344 s_cr->cr_uid != cr->cr_ruid ||
1347 1345 s_cr->cr_uid != cr->cr_suid ||
1348 1346 s_cr->cr_gid != cr->cr_gid ||
1349 1347 s_cr->cr_gid != cr->cr_rgid ||
1350 1348 s_cr->cr_gid != cr->cr_sgid ||
1351 1349 (proc = ttoproc(curthread)) == NULL ||
1352 1350 (proc->p_flag & SNOCD)) {
1353 1351 if (mode & DTRACE_MODE_NOPRIV_DROP)
1354 1352 return (0);
1355 1353
1356 1354 mstate->dtms_access &= ~DTRACE_ACCESS_PROC;
1357 1355 }
1358 1356 }
1359 1357
1360 1358 /*
1361 1359 * If our dte_cond is set to DTRACE_COND_ZONEOWNER and we are not
1362 1360 * in our zone, check to see if our mode policy is to restrict rather
1363 1361 * than to drop; if to restrict, strip away both DTRACE_ACCESS_PROC
1364 1362 * and DTRACE_ACCESS_ARGS.
1365 1363 */
1366 1364 if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
1367 1365 cred_t *cr;
1368 1366 cred_t *s_cr = state->dts_cred.dcr_cred;
1369 1367
1370 1368 ASSERT(s_cr != NULL);
1371 1369
1372 1370 if ((cr = CRED()) == NULL ||
1373 1371 s_cr->cr_zone->zone_id != cr->cr_zone->zone_id) {
1374 1372 if (mode & DTRACE_MODE_NOPRIV_DROP)
1375 1373 return (0);
1376 1374
1377 1375 mstate->dtms_access &=
1378 1376 ~(DTRACE_ACCESS_PROC | DTRACE_ACCESS_ARGS);
1379 1377 }
1380 1378 }
1381 1379
1380 + /*
1381 + * By merits of being in this code path at all, we have limited
1382 + * privileges. If the provider has indicated that limited privileges
1383 + * are to denote restricted operation, strip off the ability to access
1384 + * arguments.
1385 + */
1386 + if (mode & DTRACE_MODE_LIMITEDPRIV_RESTRICT)
1387 + mstate->dtms_access &= ~DTRACE_ACCESS_ARGS;
1388 +
1382 1389 return (1);
1383 1390 }
1384 1391
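
	[Editorial note] For context, a hypothetical provider-side sketch (the
	function and its name are invented; only the signature and the mode
	flags come from this file) of a dtps_mode entry point cooperating with
	the checks above:

		static int
		xyz_mode(void *arg, dtrace_id_t id, void *parg)
		{
			/*
			 * Per the ASSERTs in dtrace_priv_probe(), a mode
			 * must include USER or KERNEL, and NOPRIV_RESTRICT
			 * or NOPRIV_DROP.  A provider that wants
			 * limited-privilege (e.g. in-zone) consumers to see
			 * restricted rather than dropped firings also sets
			 * LIMITEDPRIV_RESTRICT, which strips
			 * DTRACE_ACCESS_ARGS above.
			 */
			return (DTRACE_MODE_KERNEL |
			    DTRACE_MODE_NOPRIV_RESTRICT |
			    DTRACE_MODE_LIMITEDPRIV_RESTRICT);
		}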
1385 1392 /*
1386 1393 * Note: not called from probe context. This function is called
1387 1394 * asynchronously (and at a regular interval) from outside of probe context to
1388 1395 * clean the dirty dynamic variable lists on all CPUs. Dynamic variable
1389 1396 * cleaning is explained in detail in <sys/dtrace_impl.h>.
1390 1397 */
1391 1398 void
1392 1399 dtrace_dynvar_clean(dtrace_dstate_t *dstate)
1393 1400 {
1394 1401 dtrace_dynvar_t *dirty;
1395 1402 dtrace_dstate_percpu_t *dcpu;
1396 1403 dtrace_dynvar_t **rinsep;
1397 1404 int i, j, work = 0;
1398 1405
1399 1406 for (i = 0; i < NCPU; i++) {
1400 1407 dcpu = &dstate->dtds_percpu[i];
1401 1408 rinsep = &dcpu->dtdsc_rinsing;
1402 1409
1403 1410 /*
1404 1411 * If the dirty list is NULL, there is no dirty work to do.
1405 1412 */
1406 1413 if (dcpu->dtdsc_dirty == NULL)
1407 1414 continue;
1408 1415
1409 1416 if (dcpu->dtdsc_rinsing != NULL) {
1410 1417 /*
1411 1418 * If the rinsing list is non-NULL, then it is because
1412 1419 * this CPU was selected to accept another CPU's
1413 1420 * dirty list -- and since that time, dirty buffers
1414 1421 * have accumulated. This is a highly unlikely
1415 1422 * condition, but we choose to ignore the dirty
1416 1423 * buffers -- they'll be picked up in a future cleanse.
1417 1424 */
1418 1425 continue;
1419 1426 }
1420 1427
1421 1428 if (dcpu->dtdsc_clean != NULL) {
1422 1429 /*
1423 1430 * If the clean list is non-NULL, then we're in a
1424 1431 * situation where a CPU has done deallocations (we
1425 1432 * have a non-NULL dirty list) but no allocations (we
1426 1433 * also have a non-NULL clean list). We can't simply
1427 1434 * move the dirty list into the clean list on this
1428 1435 * CPU, yet we also don't want to allow this condition
1429 1436 * to persist, lest a short clean list prevent a
1430 1437 * massive dirty list from being cleaned (which in
1431 1438 * turn could lead to otherwise avoidable dynamic
1432 1439 * drops). To deal with this, we look for some CPU
1433 1440 * with a NULL clean list, NULL dirty list, and NULL
1434 1441 * rinsing list -- and then we borrow this CPU to
1435 1442 * rinse our dirty list.
1436 1443 */
1437 1444 for (j = 0; j < NCPU; j++) {
1438 1445 dtrace_dstate_percpu_t *rinser;
1439 1446
1440 1447 rinser = &dstate->dtds_percpu[j];
1441 1448
1442 1449 if (rinser->dtdsc_rinsing != NULL)
1443 1450 continue;
1444 1451
1445 1452 if (rinser->dtdsc_dirty != NULL)
1446 1453 continue;
1447 1454
1448 1455 if (rinser->dtdsc_clean != NULL)
1449 1456 continue;
1450 1457
1451 1458 rinsep = &rinser->dtdsc_rinsing;
1452 1459 break;
1453 1460 }
1454 1461
1455 1462 if (j == NCPU) {
1456 1463 /*
1457 1464 * We were unable to find another CPU that
1458 1465 * could accept this dirty list -- we are
1459 1466 * therefore unable to clean it now.
1460 1467 */
1461 1468 dtrace_dynvar_failclean++;
1462 1469 continue;
1463 1470 }
1464 1471 }
1465 1472
1466 1473 work = 1;
1467 1474
1468 1475 /*
1469 1476 * Atomically move the dirty list aside.
1470 1477 */
1471 1478 do {
1472 1479 dirty = dcpu->dtdsc_dirty;
1473 1480
1474 1481 /*
1475 1482 * Before we zap the dirty list, set the rinsing list.
1476 1483 * (This allows for a potential assertion in
1477 1484 * dtrace_dynvar(): if a free dynamic variable appears
1478 1485 * on a hash chain, either the dirty list or the
1479 1486 * rinsing list for some CPU must be non-NULL.)
1480 1487 */
1481 1488 *rinsep = dirty;
1482 1489 dtrace_membar_producer();
1483 1490 } while (dtrace_casptr(&dcpu->dtdsc_dirty,
1484 1491 dirty, NULL) != dirty);
1485 1492 }
1486 1493
1487 1494 if (!work) {
1488 1495 /*
1489 1496 * We have no work to do; we can simply return.
1490 1497 */
1491 1498 return;
1492 1499 }
1493 1500
1494 1501 dtrace_sync();
1495 1502
1496 1503 for (i = 0; i < NCPU; i++) {
1497 1504 dcpu = &dstate->dtds_percpu[i];
1498 1505
1499 1506 if (dcpu->dtdsc_rinsing == NULL)
1500 1507 continue;
1501 1508
1502 1509 /*
1503 1510 * We are now guaranteed that no hash chain contains a pointer
1504 1511 * into this dirty list; we can make it clean.
1505 1512 */
1506 1513 ASSERT(dcpu->dtdsc_clean == NULL);
1507 1514 dcpu->dtdsc_clean = dcpu->dtdsc_rinsing;
1508 1515 dcpu->dtdsc_rinsing = NULL;
1509 1516 }
1510 1517
1511 1518 /*
1512 1519 * Before we actually set the state to be DTRACE_DSTATE_CLEAN, make
1513 1520 * sure that all CPUs have seen all of the dtdsc_clean pointers.
1514 1521 * This prevents a race whereby a CPU incorrectly decides that
1515 1522 * the state should be something other than DTRACE_DSTATE_CLEAN
1516 1523 * after dtrace_dynvar_clean() has completed.
1517 1524 */
1518 1525 dtrace_sync();
1519 1526
1520 1527 dstate->dtds_state = DTRACE_DSTATE_CLEAN;
1521 1528 }
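
/*
 * A minimal sketch of the detach step above, assuming GCC __atomic
 * builtins in place of dtrace_casptr() and dtrace_membar_producer();
 * example_node_t and both list heads are hypothetical stand-ins, and
 * the function is illustrative rather than part of the build. The list
 * is published on the rinsing side first, then the dirty head is CAS'd
 * to NULL, retrying if a probe freed another variable in the interim.
 */
typedef struct example_node { struct example_node *next; } example_node_t;

static example_node_t *
example_detach_dirty(example_node_t *volatile *dirtyp,
    example_node_t *volatile *rinsep)
{
	example_node_t *dirty;

	do {
		dirty = *dirtyp;
		*rinsep = dirty;	/* publish before detaching */
		__atomic_thread_fence(__ATOMIC_RELEASE);
	} while (!__atomic_compare_exchange_n(dirtyp, &dirty, NULL,
	    0, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE));

	return (dirty);
}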
1522 1529
1523 1530 /*
1524 1531  * Depending on the value of the op parameter, this function looks up,
1525 1532 * allocates or deallocates an arbitrarily-keyed dynamic variable. If an
1526 1533 * allocation is requested, this function will return a pointer to a
1527 1534 * dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no
1528 1535 * variable can be allocated. If NULL is returned, the appropriate counter
1529 1536 * will be incremented.
1530 1537 */
1531 1538 dtrace_dynvar_t *
1532 1539 dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys,
1533 1540 dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op,
1534 1541 dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1535 1542 {
1536 1543 uint64_t hashval = DTRACE_DYNHASH_VALID;
1537 1544 dtrace_dynhash_t *hash = dstate->dtds_hash;
1538 1545 dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL;
1539 1546 processorid_t me = CPU->cpu_id, cpu = me;
1540 1547 dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[me];
1541 1548 size_t bucket, ksize;
1542 1549 size_t chunksize = dstate->dtds_chunksize;
1543 1550 uintptr_t kdata, lock, nstate;
1544 1551 uint_t i;
1545 1552
1546 1553 ASSERT(nkeys != 0);
1547 1554
1548 1555 /*
1549 1556 * Hash the key. As with aggregations, we use Jenkins' "One-at-a-time"
1550 1557 * algorithm. For the by-value portions, we perform the algorithm in
1551 1558 * 16-bit chunks (as opposed to 8-bit chunks). This speeds things up a
1552 1559 * bit, and seems to have only a minute effect on distribution. For
1553 1560 * the by-reference data, we perform "One-at-a-time" iterating (safely)
1554 1561 * over each referenced byte. It's painful to do this, but it's much
1555 1562 * better than pathological hash distribution. The efficacy of the
1556 1563 * hashing algorithm (and a comparison with other algorithms) may be
1557 1564 * found by running the ::dtrace_dynstat MDB dcmd.
1558 1565 */
1559 1566 for (i = 0; i < nkeys; i++) {
1560 1567 if (key[i].dttk_size == 0) {
1561 1568 uint64_t val = key[i].dttk_value;
1562 1569
1563 1570 hashval += (val >> 48) & 0xffff;
1564 1571 hashval += (hashval << 10);
1565 1572 hashval ^= (hashval >> 6);
1566 1573
1567 1574 hashval += (val >> 32) & 0xffff;
1568 1575 hashval += (hashval << 10);
1569 1576 hashval ^= (hashval >> 6);
1570 1577
1571 1578 hashval += (val >> 16) & 0xffff;
1572 1579 hashval += (hashval << 10);
1573 1580 hashval ^= (hashval >> 6);
1574 1581
1575 1582 hashval += val & 0xffff;
1576 1583 hashval += (hashval << 10);
1577 1584 hashval ^= (hashval >> 6);
1578 1585 } else {
1579 1586 /*
1580 1587 * This is incredibly painful, but it beats the hell
1581 1588 * out of the alternative.
1582 1589 */
1583 1590 uint64_t j, size = key[i].dttk_size;
1584 1591 uintptr_t base = (uintptr_t)key[i].dttk_value;
1585 1592
1586 1593 if (!dtrace_canload(base, size, mstate, vstate))
1587 1594 break;
1588 1595
1589 1596 for (j = 0; j < size; j++) {
1590 1597 hashval += dtrace_load8(base + j);
1591 1598 hashval += (hashval << 10);
1592 1599 hashval ^= (hashval >> 6);
1593 1600 }
1594 1601 }
1595 1602 }
1596 1603
1597 1604 if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT))
1598 1605 return (NULL);
1599 1606
1600 1607 hashval += (hashval << 3);
1601 1608 hashval ^= (hashval >> 11);
1602 1609 hashval += (hashval << 15);
1603 1610
1604 1611 /*
1605 1612 * There is a remote chance (ideally, 1 in 2^31) that our hashval
1606 1613 * comes out to be one of our two sentinel hash values. If this
1607 1614 * actually happens, we set the hashval to be a value known to be a
1608 1615 * non-sentinel value.
1609 1616 */
1610 1617 if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK)
1611 1618 hashval = DTRACE_DYNHASH_VALID;
1612 1619
1613 1620 /*
1614 1621 * Yes, it's painful to do a divide here. If the cycle count becomes
1615 1622 * important here, tricks can be pulled to reduce it. (However, it's
1616 1623 * critical that hash collisions be kept to an absolute minimum;
1617 1624 * they're much more painful than a divide.) It's better to have a
1618 1625 * solution that generates few collisions and still keeps things
1619 1626 * relatively simple.
1620 1627 */
1621 1628 bucket = hashval % dstate->dtds_hashsize;
1622 1629
1623 1630 if (op == DTRACE_DYNVAR_DEALLOC) {
1624 1631 volatile uintptr_t *lockp = &hash[bucket].dtdh_lock;
1625 1632
1626 1633 for (;;) {
1627 1634 while ((lock = *lockp) & 1)
1628 1635 continue;
1629 1636
1630 1637 if (dtrace_casptr((void *)lockp,
1631 1638 (void *)lock, (void *)(lock + 1)) == (void *)lock)
1632 1639 break;
1633 1640 }
1634 1641
1635 1642 dtrace_membar_producer();
1636 1643 }
1637 1644
1638 1645 top:
1639 1646 prev = NULL;
1640 1647 lock = hash[bucket].dtdh_lock;
1641 1648
1642 1649 dtrace_membar_consumer();
1643 1650
1644 1651 start = hash[bucket].dtdh_chain;
1645 1652 ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK ||
1646 1653 start->dtdv_hashval != DTRACE_DYNHASH_FREE ||
1647 1654 op != DTRACE_DYNVAR_DEALLOC));
1648 1655
1649 1656 for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) {
1650 1657 dtrace_tuple_t *dtuple = &dvar->dtdv_tuple;
1651 1658 dtrace_key_t *dkey = &dtuple->dtt_key[0];
1652 1659
1653 1660 if (dvar->dtdv_hashval != hashval) {
1654 1661 if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) {
1655 1662 /*
1656 1663 * We've reached the sink, and therefore the
1657 1664 * end of the hash chain; we can kick out of
1658 1665 * the loop knowing that we have seen a valid
1659 1666 * snapshot of state.
1660 1667 */
1661 1668 ASSERT(dvar->dtdv_next == NULL);
1662 1669 ASSERT(dvar == &dtrace_dynhash_sink);
1663 1670 break;
1664 1671 }
1665 1672
1666 1673 if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) {
1667 1674 /*
1668 1675 * We've gone off the rails: somewhere along
1669 1676 * the line, one of the members of this hash
1670 1677 * chain was deleted. Note that we could also
1671 1678 * detect this by simply letting this loop run
1672 1679 * to completion, as we would eventually hit
1673 1680 * the end of the dirty list. However, we
1674 1681 * want to avoid running the length of the
1675 1682 * dirty list unnecessarily (it might be quite
1676 1683 * long), so we catch this as early as
1677 1684 * possible by detecting the hash marker. In
1678 1685 * this case, we simply set dvar to NULL and
1679 1686 * break; the conditional after the loop will
1680 1687 * send us back to top.
1681 1688 */
1682 1689 dvar = NULL;
1683 1690 break;
1684 1691 }
1685 1692
1686 1693 goto next;
1687 1694 }
1688 1695
1689 1696 if (dtuple->dtt_nkeys != nkeys)
1690 1697 goto next;
1691 1698
1692 1699 for (i = 0; i < nkeys; i++, dkey++) {
1693 1700 if (dkey->dttk_size != key[i].dttk_size)
1694 1701 goto next; /* size or type mismatch */
1695 1702
1696 1703 if (dkey->dttk_size != 0) {
1697 1704 if (dtrace_bcmp(
1698 1705 (void *)(uintptr_t)key[i].dttk_value,
1699 1706 (void *)(uintptr_t)dkey->dttk_value,
1700 1707 dkey->dttk_size))
1701 1708 goto next;
1702 1709 } else {
1703 1710 if (dkey->dttk_value != key[i].dttk_value)
1704 1711 goto next;
1705 1712 }
1706 1713 }
1707 1714
1708 1715 if (op != DTRACE_DYNVAR_DEALLOC)
1709 1716 return (dvar);
1710 1717
1711 1718 ASSERT(dvar->dtdv_next == NULL ||
1712 1719 dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE);
1713 1720
1714 1721 if (prev != NULL) {
1715 1722 ASSERT(hash[bucket].dtdh_chain != dvar);
1716 1723 ASSERT(start != dvar);
1717 1724 ASSERT(prev->dtdv_next == dvar);
1718 1725 prev->dtdv_next = dvar->dtdv_next;
1719 1726 } else {
1720 1727 if (dtrace_casptr(&hash[bucket].dtdh_chain,
1721 1728 start, dvar->dtdv_next) != start) {
1722 1729 /*
1723 1730 * We have failed to atomically swing the
1724 1731 * hash table head pointer, presumably because
1725 1732 * of a conflicting allocation on another CPU.
1726 1733 * We need to reread the hash chain and try
1727 1734 * again.
1728 1735 */
1729 1736 goto top;
1730 1737 }
1731 1738 }
1732 1739
1733 1740 dtrace_membar_producer();
1734 1741
1735 1742 /*
1736 1743 * Now set the hash value to indicate that it's free.
1737 1744 */
1738 1745 ASSERT(hash[bucket].dtdh_chain != dvar);
1739 1746 dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
1740 1747
1741 1748 dtrace_membar_producer();
1742 1749
1743 1750 /*
1744 1751 * Set the next pointer to point at the dirty list, and
1745 1752 * atomically swing the dirty pointer to the newly freed dvar.
1746 1753 */
1747 1754 do {
1748 1755 next = dcpu->dtdsc_dirty;
1749 1756 dvar->dtdv_next = next;
1750 1757 } while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next);
1751 1758
1752 1759 /*
1753 1760 * Finally, unlock this hash bucket.
1754 1761 */
1755 1762 ASSERT(hash[bucket].dtdh_lock == lock);
1756 1763 ASSERT(lock & 1);
1757 1764 hash[bucket].dtdh_lock++;
1758 1765
1759 1766 return (NULL);
1760 1767 next:
1761 1768 prev = dvar;
1762 1769 continue;
1763 1770 }
1764 1771
1765 1772 if (dvar == NULL) {
1766 1773 /*
1767 1774 * If dvar is NULL, it is because we went off the rails:
1768 1775 * one of the elements that we traversed in the hash chain
1769 1776 * was deleted while we were traversing it. In this case,
1770 1777 * we assert that we aren't doing a dealloc (deallocs lock
1771 1778 * the hash bucket to prevent themselves from racing with
1772 1779 * one another), and retry the hash chain traversal.
1773 1780 */
1774 1781 ASSERT(op != DTRACE_DYNVAR_DEALLOC);
1775 1782 goto top;
1776 1783 }
1777 1784
1778 1785 if (op != DTRACE_DYNVAR_ALLOC) {
1779 1786 /*
1780 1787 * If we are not to allocate a new variable, we want to
1781 1788 * return NULL now. Before we return, check that the value
1782 1789 * of the lock word hasn't changed. If it has, we may have
1783 1790 * seen an inconsistent snapshot.
1784 1791 */
1785 1792 if (op == DTRACE_DYNVAR_NOALLOC) {
1786 1793 if (hash[bucket].dtdh_lock != lock)
1787 1794 goto top;
1788 1795 } else {
1789 1796 ASSERT(op == DTRACE_DYNVAR_DEALLOC);
1790 1797 ASSERT(hash[bucket].dtdh_lock == lock);
1791 1798 ASSERT(lock & 1);
1792 1799 hash[bucket].dtdh_lock++;
1793 1800 }
1794 1801
1795 1802 return (NULL);
1796 1803 }
1797 1804
1798 1805 /*
1799 1806 * We need to allocate a new dynamic variable. The size we need is the
1800 1807 * size of dtrace_dynvar plus the size of nkeys dtrace_key_t's plus the
1801 1808 * size of any auxiliary key data (rounded up to 8-byte alignment) plus
1802 1809 * the size of any referred-to data (dsize). We then round the final
1803 1810 * size up to the chunksize for allocation.
1804 1811 */
1805 1812 for (ksize = 0, i = 0; i < nkeys; i++)
1806 1813 ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
1807 1814
1808 1815 /*
1809 1816 * This should be pretty much impossible, but could happen if, say,
1810 1817 * strange DIF specified the tuple. Ideally, this should be an
1811 1818 * assertion and not an error condition -- but that requires that the
1812 1819 * chunksize calculation in dtrace_difo_chunksize() be absolutely
1813 1820 * bullet-proof. (That is, it must not be able to be fooled by
1814 1821 * malicious DIF.) Given the lack of backwards branches in DIF,
1815 1822 * solving this would presumably not amount to solving the Halting
1816 1823 * Problem -- but it still seems awfully hard.
1817 1824 */
1818 1825 if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) +
1819 1826 ksize + dsize > chunksize) {
1820 1827 dcpu->dtdsc_drops++;
1821 1828 return (NULL);
1822 1829 }
1823 1830
1824 1831 nstate = DTRACE_DSTATE_EMPTY;
1825 1832
1826 1833 do {
1827 1834 retry:
1828 1835 free = dcpu->dtdsc_free;
1829 1836
1830 1837 if (free == NULL) {
1831 1838 dtrace_dynvar_t *clean = dcpu->dtdsc_clean;
1832 1839 void *rval;
1833 1840
1834 1841 if (clean == NULL) {
1835 1842 /*
1836 1843 * We're out of dynamic variable space on
1837 1844 * this CPU. Unless we have tried all CPUs,
1838 1845 * we'll try to allocate from a different
1839 1846 * CPU.
1840 1847 */
1841 1848 switch (dstate->dtds_state) {
1842 1849 case DTRACE_DSTATE_CLEAN: {
1843 1850 void *sp = &dstate->dtds_state;
1844 1851
1845 1852 if (++cpu >= NCPU)
1846 1853 cpu = 0;
1847 1854
1848 1855 if (dcpu->dtdsc_dirty != NULL &&
1849 1856 nstate == DTRACE_DSTATE_EMPTY)
1850 1857 nstate = DTRACE_DSTATE_DIRTY;
1851 1858
1852 1859 if (dcpu->dtdsc_rinsing != NULL)
1853 1860 nstate = DTRACE_DSTATE_RINSING;
1854 1861
1855 1862 dcpu = &dstate->dtds_percpu[cpu];
1856 1863
1857 1864 if (cpu != me)
1858 1865 goto retry;
1859 1866
1860 1867 (void) dtrace_cas32(sp,
1861 1868 DTRACE_DSTATE_CLEAN, nstate);
1862 1869
1863 1870 /*
1864 1871 * To increment the correct bean
1865 1872 * counter, take another lap.
1866 1873 */
1867 1874 goto retry;
1868 1875 }
1869 1876
1870 1877 case DTRACE_DSTATE_DIRTY:
1871 1878 dcpu->dtdsc_dirty_drops++;
1872 1879 break;
1873 1880
1874 1881 case DTRACE_DSTATE_RINSING:
1875 1882 dcpu->dtdsc_rinsing_drops++;
1876 1883 break;
1877 1884
1878 1885 case DTRACE_DSTATE_EMPTY:
1879 1886 dcpu->dtdsc_drops++;
1880 1887 break;
1881 1888 }
1882 1889
1883 1890 DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP);
1884 1891 return (NULL);
1885 1892 }
1886 1893
1887 1894 /*
1888 1895 * The clean list appears to be non-empty. We want to
1889 1896 * move the clean list to the free list; we start by
1890 1897 * moving the clean pointer aside.
1891 1898 */
1892 1899 if (dtrace_casptr(&dcpu->dtdsc_clean,
1893 1900 clean, NULL) != clean) {
1894 1901 /*
1895 1902 * We are in one of two situations:
1896 1903 *
1897 1904 * (a) The clean list was switched to the
1898 1905 * free list by another CPU.
1899 1906 *
1900 1907 * (b) The clean list was added to by the
1901 1908 * cleansing cyclic.
1902 1909 *
1903 1910 * In either of these situations, we can
1904 1911 * just reattempt the free list allocation.
1905 1912 */
1906 1913 goto retry;
1907 1914 }
1908 1915
1909 1916 ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);
1910 1917
1911 1918 /*
1912 1919 * Now we'll move the clean list to our free list.
1913 1920 * It's impossible for this to fail: the only way
1914 1921 * the free list can be updated is through this
1915 1922 * code path, and only one CPU can own the clean list.
1916 1923 * Thus, it would only be possible for this to fail if
1917 1924 * this code were racing with dtrace_dynvar_clean().
1918 1925 * (That is, if dtrace_dynvar_clean() updated the clean
1919 1926 * list, and we ended up racing to update the free
1920 1927 * list.) This race is prevented by the dtrace_sync()
1921 1928 * in dtrace_dynvar_clean() -- which flushes the
1922 1929 * owners of the clean lists out before resetting
1923 1930 * the clean lists.
1924 1931 */
1925 1932 dcpu = &dstate->dtds_percpu[me];
1926 1933 rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
1927 1934 ASSERT(rval == NULL);
1928 1935 goto retry;
1929 1936 }
1930 1937
1931 1938 dvar = free;
1932 1939 new_free = dvar->dtdv_next;
1933 1940 } while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free);
1934 1941
1935 1942 /*
1936 1943 * We have now allocated a new chunk. We copy the tuple keys into the
1937 1944 * tuple array and copy any referenced key data into the data space
1938 1945 * following the tuple array. As we do this, we relocate dttk_value
1939 1946 * in the final tuple to point to the key data address in the chunk.
1940 1947 */
1941 1948 kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys];
1942 1949 dvar->dtdv_data = (void *)(kdata + ksize);
1943 1950 dvar->dtdv_tuple.dtt_nkeys = nkeys;
1944 1951
1945 1952 for (i = 0; i < nkeys; i++) {
1946 1953 dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i];
1947 1954 size_t kesize = key[i].dttk_size;
1948 1955
1949 1956 if (kesize != 0) {
1950 1957 dtrace_bcopy(
1951 1958 (const void *)(uintptr_t)key[i].dttk_value,
1952 1959 (void *)kdata, kesize);
1953 1960 dkey->dttk_value = kdata;
1954 1961 kdata += P2ROUNDUP(kesize, sizeof (uint64_t));
1955 1962 } else {
1956 1963 dkey->dttk_value = key[i].dttk_value;
1957 1964 }
1958 1965
1959 1966 dkey->dttk_size = kesize;
1960 1967 }
1961 1968
1962 1969 ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE);
1963 1970 dvar->dtdv_hashval = hashval;
1964 1971 dvar->dtdv_next = start;
1965 1972
1966 1973 if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start)
1967 1974 return (dvar);
1968 1975
1969 1976 /*
1970 1977 * The cas has failed. Either another CPU is adding an element to
1971 1978 * this hash chain, or another CPU is deleting an element from this
1972 1979 * hash chain. The simplest way to deal with both of these cases
1973 1980 * (though not necessarily the most efficient) is to free our
1974 1981 * allocated block and tail-call ourselves. Note that the free is
1975 1982 * to the dirty list and _not_ to the free list. This is to prevent
1976 1983 * races with allocators, above.
1977 1984 */
1978 1985 dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
1979 1986
1980 1987 dtrace_membar_producer();
1981 1988
1982 1989 do {
1983 1990 free = dcpu->dtdsc_dirty;
1984 1991 dvar->dtdv_next = free;
1985 1992 } while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free);
1986 1993
1987 1994 return (dtrace_dynvar(dstate, nkeys, key, dsize, op, mstate, vstate));
1988 1995 }
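
/*
 * The hashing above is Bob Jenkins' "one-at-a-time" hash, with a
 * 16-bit-chunk variant for by-value keys. A self-contained sketch of
 * the canonical byte-at-a-time form follows (illustrative, not part of
 * the build); the in-kernel code differs in that it seeds the hash
 * with DTRACE_DYNHASH_VALID and loads bytes via dtrace_load8() so that
 * faults on untrusted addresses are caught rather than taken.
 */
static uint64_t
example_oat_hash(const unsigned char *data, size_t len)
{
	uint64_t hashval = 0;
	size_t i;

	for (i = 0; i < len; i++) {
		hashval += data[i];
		hashval += (hashval << 10);
		hashval ^= (hashval >> 6);
	}

	hashval += (hashval << 3);
	hashval ^= (hashval >> 11);
	hashval += (hashval << 15);

	return (hashval);
}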
1989 1996
1990 1997 /*ARGSUSED*/
1991 1998 static void
1992 1999 dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)
1993 2000 {
1994 2001 if ((int64_t)nval < (int64_t)*oval)
1995 2002 *oval = nval;
1996 2003 }
1997 2004
1998 2005 /*ARGSUSED*/
1999 2006 static void
2000 2007 dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg)
2001 2008 {
2002 2009 if ((int64_t)nval > (int64_t)*oval)
2003 2010 *oval = nval;
2004 2011 }
2005 2012
2006 2013 static void
2007 2014 dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr)
2008 2015 {
2009 2016 int i, zero = DTRACE_QUANTIZE_ZEROBUCKET;
2010 2017 int64_t val = (int64_t)nval;
2011 2018
2012 2019 if (val < 0) {
2013 2020 for (i = 0; i < zero; i++) {
2014 2021 if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) {
2015 2022 quanta[i] += incr;
2016 2023 return;
2017 2024 }
2018 2025 }
2019 2026 } else {
2020 2027 for (i = zero + 1; i < DTRACE_QUANTIZE_NBUCKETS; i++) {
2021 2028 if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) {
2022 2029 quanta[i - 1] += incr;
2023 2030 return;
2024 2031 }
2025 2032 }
2026 2033
2027 2034 quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr;
2028 2035 return;
2029 2036 }
2030 2037
2031 2038 ASSERT(0);
2032 2039 }
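
/*
 * For a non-negative value, the scan above files the value under the
 * largest power of two that does not exceed it (with zero in its own
 * bucket); negative values are mirrored on the other side of the zero
 * bucket. A sketch of that bucket computation for the non-negative
 * side (illustrative, not part of the build):
 */
static uint64_t
example_quantize_bucketval(uint64_t val)
{
	uint64_t label = 1;

	if (val == 0)
		return (0);

	/*
	 * Double the label until the next doubling would overflow or
	 * exceed the value; e.g. values 8 through 15 all map to 8.
	 */
	while ((label << 1) != 0 && (label << 1) <= val)
		label <<= 1;

	return (label);
}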
2033 2040
2034 2041 static void
2035 2042 dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr)
2036 2043 {
2037 2044 uint64_t arg = *lquanta++;
2038 2045 int32_t base = DTRACE_LQUANTIZE_BASE(arg);
2039 2046 uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
2040 2047 uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
2041 2048 int32_t val = (int32_t)nval, level;
2042 2049
2043 2050 ASSERT(step != 0);
2044 2051 ASSERT(levels != 0);
2045 2052
2046 2053 if (val < base) {
2047 2054 /*
2048 2055 * This is an underflow.
2049 2056 */
2050 2057 lquanta[0] += incr;
2051 2058 return;
2052 2059 }
2053 2060
2054 2061 level = (val - base) / step;
2055 2062
2056 2063 if (level < levels) {
2057 2064 lquanta[level + 1] += incr;
2058 2065 return;
2059 2066 }
2060 2067
2061 2068 /*
2062 2069 * This is an overflow.
2063 2070 */
2064 2071 lquanta[levels + 1] += incr;
2065 2072 }
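
/*
 * The packed argument word aside, the bucket math above is: one
 * underflow bucket, 'levels' linear buckets of width 'step' starting
 * at 'base', and one overflow bucket -- levels + 2 buckets in all.
 * For example, with base = 0, step = 10 and levels = 10, the value 37
 * maps to level (37 - 0) / 10 = 3 and thus to lquanta[4], the [30, 40)
 * bucket. The same math with the argument word already decoded
 * (illustrative, not part of the build):
 */
static void
example_lquantize(uint64_t *buckets /* levels + 2 entries */,
    int32_t base, uint16_t step, uint16_t levels, int32_t val)
{
	int32_t level;

	if (val < base) {
		buckets[0]++;		/* underflow */
		return;
	}

	level = (val - base) / step;
	buckets[level < levels ? level + 1 : levels + 1]++;
}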
2066 2073
2067 2074 static int
2068 2075 dtrace_aggregate_llquantize_bucket(uint16_t factor, uint16_t low,
2069 2076 uint16_t high, uint16_t nsteps, int64_t value)
2070 2077 {
2071 2078 int64_t this = 1, last, next;
2072 2079 int base = 1, order;
2073 2080
2074 2081 ASSERT(factor <= nsteps);
2075 2082 ASSERT(nsteps % factor == 0);
2076 2083
2077 2084 for (order = 0; order < low; order++)
2078 2085 this *= factor;
2079 2086
2080 2087 /*
2081 2088 * If our value is less than our factor taken to the power of the
2082 2089 * low order of magnitude, it goes into the zeroth bucket.
2083 2090 */
2084 2091 if (value < (last = this))
2085 2092 return (0);
2086 2093
2087 2094 for (this *= factor; order <= high; order++) {
2088 2095 int nbuckets = this > nsteps ? nsteps : this;
2089 2096
2090 2097 if ((next = this * factor) < this) {
2091 2098 /*
2092 2099 * We should not generally get log/linear quantizations
2093 2100 		 * with a high magnitude that allows 64 bits to
2094 2101 * overflow, but we nonetheless protect against this
2095 2102 * by explicitly checking for overflow, and clamping
2096 2103 * our value accordingly.
2097 2104 */
2098 2105 value = this - 1;
2099 2106 }
2100 2107
2101 2108 if (value < this) {
2102 2109 /*
2103 2110 * If our value lies within this order of magnitude,
2104 2111 * determine its position by taking the offset within
2105 2112 * the order of magnitude, dividing by the bucket
2106 2113 * width, and adding to our (accumulated) base.
2107 2114 */
2108 2115 return (base + (value - last) / (this / nbuckets));
2109 2116 }
2110 2117
2111 2118 base += nbuckets - (nbuckets / factor);
2112 2119 last = this;
2113 2120 this = next;
2114 2121 }
2115 2122
2116 2123 /*
2117 2124 * Our value is greater than or equal to our factor taken to the
2118 2125 * power of one plus the high magnitude -- return the top bucket.
2119 2126 */
2120 2127 return (base);
2121 2128 }
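
/*
 * A worked example with hypothetical parameters: factor = 10, low = 0,
 * high = 2 and nsteps = 10 yield the following bucket map from the
 * function above:
 *
 *	bucket 0	value < 1
 *	buckets 1-9	[1, 10), width 1
 *	buckets 10-18	[10, 100), width 10
 *	buckets 19-27	[100, 1000), width 100
 *	bucket 28	value >= 1000
 *
 * For instance, the value 250 lies in [100, 1000) at offset
 * (250 - 100) / 100 = 1 past bucket 19, so it maps to bucket 20.
 */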
2122 2129
2123 2130 static void
2124 2131 dtrace_aggregate_llquantize(uint64_t *llquanta, uint64_t nval, uint64_t incr)
2125 2132 {
2126 2133 uint64_t arg = *llquanta++;
2127 2134 uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(arg);
2128 2135 uint16_t low = DTRACE_LLQUANTIZE_LOW(arg);
2129 2136 uint16_t high = DTRACE_LLQUANTIZE_HIGH(arg);
2130 2137 uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(arg);
2131 2138
2132 2139 llquanta[dtrace_aggregate_llquantize_bucket(factor,
2133 2140 low, high, nsteps, nval)] += incr;
2134 2141 }
2135 2142
2136 2143 /*ARGSUSED*/
2137 2144 static void
2138 2145 dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
2139 2146 {
2140 2147 data[0]++;
2141 2148 data[1] += nval;
2142 2149 }
2143 2150
2144 2151 /*ARGSUSED*/
2145 2152 static void
2146 2153 dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg)
2147 2154 {
2148 2155 int64_t snval = (int64_t)nval;
2149 2156 uint64_t tmp[2];
2150 2157
2151 2158 data[0]++;
2152 2159 data[1] += nval;
2153 2160
2154 2161 /*
2155 2162 * What we want to say here is:
2156 2163 *
2157 2164 * data[2] += nval * nval;
2158 2165 *
2159 2166 * But given that nval is 64-bit, we could easily overflow, so
2160 2167 * we do this as 128-bit arithmetic.
2161 2168 */
2162 2169 if (snval < 0)
2163 2170 snval = -snval;
2164 2171
2165 2172 dtrace_multiply_128((uint64_t)snval, (uint64_t)snval, tmp);
2166 2173 dtrace_add_128(data + 2, tmp, data + 2);
2167 2174 }
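
/*
 * Here data[0] is the count, data[1] the running sum, and data[2..3] a
 * 128-bit running sum of squares (low word first), from which the
 * consumer can later derive the standard deviation. A sketch of the
 * 128-bit helpers in that low-word-first representation (illustrative,
 * not part of the build; the representation and the use of GCC's
 * __uint128_t are assumptions here):
 */
static void
example_add_128(const uint64_t *a, const uint64_t *b, uint64_t *sum)
{
	uint64_t lo = a[0] + b[0];

	sum[1] = a[1] + b[1] + (lo < a[0]);	/* carry out of low word */
	sum[0] = lo;
}

static void
example_multiply_128(uint64_t x, uint64_t y, uint64_t *prod)
{
	__uint128_t p = (__uint128_t)x * y;

	prod[0] = (uint64_t)p;
	prod[1] = (uint64_t)(p >> 64);
}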
2168 2175
2169 2176 /*ARGSUSED*/
2170 2177 static void
2171 2178 dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
2172 2179 {
2173 2180 *oval = *oval + 1;
2174 2181 }
2175 2182
2176 2183 /*ARGSUSED*/
2177 2184 static void
2178 2185 dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg)
2179 2186 {
2180 2187 *oval += nval;
2181 2188 }
2182 2189
2183 2190 /*
2184 2191 * Aggregate given the tuple in the principal data buffer, and the aggregating
2185 2192 * action denoted by the specified dtrace_aggregation_t. The aggregation
2186 2193 * buffer is specified as the buf parameter. This routine does not return
2187 2194 * failure; if there is no space in the aggregation buffer, the data will be
2188 2195 * dropped, and a corresponding counter incremented.
2189 2196 */
2190 2197 static void
2191 2198 dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf,
2192 2199 intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg)
2193 2200 {
2194 2201 dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec;
2195 2202 uint32_t i, ndx, size, fsize;
2196 2203 uint32_t align = sizeof (uint64_t) - 1;
2197 2204 dtrace_aggbuffer_t *agb;
2198 2205 dtrace_aggkey_t *key;
2199 2206 uint32_t hashval = 0, limit, isstr;
2200 2207 caddr_t tomax, data, kdata;
2201 2208 dtrace_actkind_t action;
2202 2209 dtrace_action_t *act;
2203 2210 uintptr_t offs;
2204 2211
2205 2212 if (buf == NULL)
2206 2213 return;
2207 2214
2208 2215 if (!agg->dtag_hasarg) {
2209 2216 /*
2210 2217 * Currently, only quantize() and lquantize() take additional
2211 2218 * arguments, and they have the same semantics: an increment
2212 2219 * value that defaults to 1 when not present. If additional
2213 2220 * aggregating actions take arguments, the setting of the
2214 2221 * default argument value will presumably have to become more
2215 2222 * sophisticated...
2216 2223 */
2217 2224 arg = 1;
2218 2225 }
2219 2226
2220 2227 action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION;
2221 2228 size = rec->dtrd_offset - agg->dtag_base;
2222 2229 fsize = size + rec->dtrd_size;
2223 2230
2224 2231 ASSERT(dbuf->dtb_tomax != NULL);
2225 2232 data = dbuf->dtb_tomax + offset + agg->dtag_base;
2226 2233
2227 2234 if ((tomax = buf->dtb_tomax) == NULL) {
2228 2235 dtrace_buffer_drop(buf);
2229 2236 return;
2230 2237 }
2231 2238
2232 2239 /*
2233 2240 * The metastructure is always at the bottom of the buffer.
2234 2241 */
2235 2242 agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size -
2236 2243 sizeof (dtrace_aggbuffer_t));
2237 2244
2238 2245 if (buf->dtb_offset == 0) {
2239 2246 /*
2240 2247 * We just kludge up approximately 1/8th of the size to be
2241 2248 * buckets. If this guess ends up being routinely
2242 2249 * off-the-mark, we may need to dynamically readjust this
2243 2250 * based on past performance.
2244 2251 */
2245 2252 uintptr_t hashsize = (buf->dtb_size >> 3) / sizeof (uintptr_t);
2246 2253
2247 2254 if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) <
2248 2255 (uintptr_t)tomax || hashsize == 0) {
2249 2256 /*
2250 2257 * We've been given a ludicrously small buffer;
2251 2258 * increment our drop count and leave.
2252 2259 */
2253 2260 dtrace_buffer_drop(buf);
2254 2261 return;
2255 2262 }
2256 2263
2257 2264 /*
2258 2265 		 * And now, a pathetic attempt to get an odd (or
2259 2266 * perchance, a prime) hash size for better hash distribution.
2260 2267 */
2261 2268 if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3))
2262 2269 hashsize -= DTRACE_AGGHASHSIZE_SLEW;
2263 2270
2264 2271 agb->dtagb_hashsize = hashsize;
2265 2272 agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb -
2266 2273 agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *));
2267 2274 agb->dtagb_free = (uintptr_t)agb->dtagb_hash;
2268 2275
2269 2276 for (i = 0; i < agb->dtagb_hashsize; i++)
2270 2277 agb->dtagb_hash[i] = NULL;
2271 2278 }
2272 2279
2273 2280 ASSERT(agg->dtag_first != NULL);
2274 2281 ASSERT(agg->dtag_first->dta_intuple);
2275 2282
2276 2283 /*
2277 2284 * Calculate the hash value based on the key. Note that we _don't_
2278 2285 * include the aggid in the hashing (but we will store it as part of
2279 2286 * the key). The hashing algorithm is Bob Jenkins' "One-at-a-time"
2280 2287 * algorithm: a simple, quick algorithm that has no known funnels, and
2281 2288 * gets good distribution in practice. The efficacy of the hashing
2282 2289 * algorithm (and a comparison with other algorithms) may be found by
2283 2290 * running the ::dtrace_aggstat MDB dcmd.
2284 2291 */
2285 2292 for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2286 2293 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2287 2294 limit = i + act->dta_rec.dtrd_size;
2288 2295 ASSERT(limit <= size);
2289 2296 isstr = DTRACEACT_ISSTRING(act);
2290 2297
2291 2298 for (; i < limit; i++) {
2292 2299 hashval += data[i];
2293 2300 hashval += (hashval << 10);
2294 2301 hashval ^= (hashval >> 6);
2295 2302
2296 2303 if (isstr && data[i] == '\0')
2297 2304 break;
2298 2305 }
2299 2306 }
2300 2307
2301 2308 hashval += (hashval << 3);
2302 2309 hashval ^= (hashval >> 11);
2303 2310 hashval += (hashval << 15);
2304 2311
2305 2312 /*
2306 2313 * Yes, the divide here is expensive -- but it's generally the least
2307 2314 * of the performance issues given the amount of data that we iterate
2308 2315 * over to compute hash values, compare data, etc.
2309 2316 */
2310 2317 ndx = hashval % agb->dtagb_hashsize;
2311 2318
2312 2319 for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) {
2313 2320 ASSERT((caddr_t)key >= tomax);
2314 2321 ASSERT((caddr_t)key < tomax + buf->dtb_size);
2315 2322
2316 2323 if (hashval != key->dtak_hashval || key->dtak_size != size)
2317 2324 continue;
2318 2325
2319 2326 kdata = key->dtak_data;
2320 2327 ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size);
2321 2328
2322 2329 for (act = agg->dtag_first; act->dta_intuple;
2323 2330 act = act->dta_next) {
2324 2331 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2325 2332 limit = i + act->dta_rec.dtrd_size;
2326 2333 ASSERT(limit <= size);
2327 2334 isstr = DTRACEACT_ISSTRING(act);
2328 2335
2329 2336 for (; i < limit; i++) {
2330 2337 if (kdata[i] != data[i])
2331 2338 goto next;
2332 2339
2333 2340 if (isstr && data[i] == '\0')
2334 2341 break;
2335 2342 }
2336 2343 }
2337 2344
2338 2345 if (action != key->dtak_action) {
2339 2346 /*
2340 2347 * We are aggregating on the same value in the same
2341 2348 * aggregation with two different aggregating actions.
2342 2349 * (This should have been picked up in the compiler,
2343 2350 * so we may be dealing with errant or devious DIF.)
2344 2351 * This is an error condition; we indicate as much,
2345 2352 * and return.
2346 2353 */
2347 2354 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
2348 2355 return;
2349 2356 }
2350 2357
2351 2358 /*
2352 2359 * This is a hit: we need to apply the aggregator to
2353 2360 * the value at this key.
2354 2361 */
2355 2362 agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg);
2356 2363 return;
2357 2364 next:
2358 2365 continue;
2359 2366 }
2360 2367
2361 2368 /*
2362 2369 * We didn't find it. We need to allocate some zero-filled space,
2363 2370 * link it into the hash table appropriately, and apply the aggregator
2364 2371 * to the (zero-filled) value.
2365 2372 */
2366 2373 offs = buf->dtb_offset;
2367 2374 while (offs & (align - 1))
2368 2375 offs += sizeof (uint32_t);
2369 2376
2370 2377 /*
2371 2378 * If we don't have enough room to both allocate a new key _and_
2372 2379 * its associated data, increment the drop count and return.
2373 2380 */
2374 2381 if ((uintptr_t)tomax + offs + fsize >
2375 2382 agb->dtagb_free - sizeof (dtrace_aggkey_t)) {
2376 2383 dtrace_buffer_drop(buf);
2377 2384 return;
2378 2385 }
2379 2386
2380 2387 /*CONSTCOND*/
2381 2388 ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - 1)));
2382 2389 key = (dtrace_aggkey_t *)(agb->dtagb_free - sizeof (dtrace_aggkey_t));
2383 2390 agb->dtagb_free -= sizeof (dtrace_aggkey_t);
2384 2391
2385 2392 key->dtak_data = kdata = tomax + offs;
2386 2393 buf->dtb_offset = offs + fsize;
2387 2394
2388 2395 /*
2389 2396 * Now copy the data across.
2390 2397 */
2391 2398 *((dtrace_aggid_t *)kdata) = agg->dtag_id;
2392 2399
2393 2400 for (i = sizeof (dtrace_aggid_t); i < size; i++)
2394 2401 kdata[i] = data[i];
2395 2402
2396 2403 /*
2397 2404 * Because strings are not zeroed out by default, we need to iterate
2398 2405 * looking for actions that store strings, and we need to explicitly
2399 2406 * pad these strings out with zeroes.
2400 2407 */
2401 2408 for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2402 2409 int nul;
2403 2410
2404 2411 if (!DTRACEACT_ISSTRING(act))
2405 2412 continue;
2406 2413
2407 2414 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2408 2415 limit = i + act->dta_rec.dtrd_size;
2409 2416 ASSERT(limit <= size);
2410 2417
2411 2418 for (nul = 0; i < limit; i++) {
2412 2419 if (nul) {
2413 2420 kdata[i] = '\0';
2414 2421 continue;
2415 2422 }
2416 2423
2417 2424 if (data[i] != '\0')
2418 2425 continue;
2419 2426
2420 2427 nul = 1;
2421 2428 }
2422 2429 }
2423 2430
2424 2431 for (i = size; i < fsize; i++)
2425 2432 kdata[i] = 0;
2426 2433
2427 2434 key->dtak_hashval = hashval;
2428 2435 key->dtak_size = size;
2429 2436 key->dtak_action = action;
2430 2437 key->dtak_next = agb->dtagb_hash[ndx];
2431 2438 agb->dtagb_hash[ndx] = key;
2432 2439
2433 2440 /*
2434 2441 * Finally, apply the aggregator.
2435 2442 */
2436 2443 *((uint64_t *)(key->dtak_data + size)) = agg->dtag_initial;
2437 2444 agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg);
2438 2445 }
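
/*
 * The resulting buffer layout, sketched informally (not normative):
 * key/data records are bump-allocated upward from dtb_tomax, while
 * dtrace_aggkey_t headers are carved downward from dtagb_free, below
 * the hash chains and the dtrace_aggbuffer_t metastructure at the top;
 * when the two regions would meet, the enabling takes a drop.
 *
 *	dtb_tomax                                             + dtb_size
 *	+--------------------+...........+----------+--------+--------+
 *	| records (grow ->)  |   free    | (<- grow)|  hash  |  agg   |
 *	| aggid, keys, value |           |  aggkeys | chains | buffer |
 *	+--------------------+...........+----------+--------+--------+
 */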
2439 2446
2440 2447 /*
2441 2448 * Given consumer state, this routine finds a speculation in the INACTIVE
2442 2449 * state and transitions it into the ACTIVE state. If there is no speculation
2443 2450  * in the INACTIVE state, 0 is returned and the busy or unavail counter
2444 2451  * is bumped; it is up to the caller to take any further action.
2445 2452 */
2446 2453 static int
2447 2454 dtrace_speculation(dtrace_state_t *state)
2448 2455 {
2449 2456 int i = 0;
2450 2457 dtrace_speculation_state_t current;
2451 2458 uint32_t *stat = &state->dts_speculations_unavail, count;
2452 2459
2453 2460 while (i < state->dts_nspeculations) {
2454 2461 dtrace_speculation_t *spec = &state->dts_speculations[i];
2455 2462
2456 2463 current = spec->dtsp_state;
2457 2464
2458 2465 if (current != DTRACESPEC_INACTIVE) {
2459 2466 if (current == DTRACESPEC_COMMITTINGMANY ||
2460 2467 current == DTRACESPEC_COMMITTING ||
2461 2468 current == DTRACESPEC_DISCARDING)
2462 2469 stat = &state->dts_speculations_busy;
2463 2470 i++;
2464 2471 continue;
2465 2472 }
2466 2473
2467 2474 if (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2468 2475 current, DTRACESPEC_ACTIVE) == current)
2469 2476 return (i + 1);
2470 2477 }
2471 2478
2472 2479 /*
2473 2480 * We couldn't find a speculation. If we found as much as a single
2474 2481 	 * busy speculation buffer, we'll count this failure as "busy"
2475 2482 * instead of "unavail".
2476 2483 */
2477 2484 do {
2478 2485 count = *stat;
2479 2486 } while (dtrace_cas32(stat, count, count + 1) != count);
2480 2487
2481 2488 return (0);
2482 2489 }
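
/*
 * Because this runs in probe context, the failure counter cannot be
 * protected by a lock; the bump is instead built from a 32-bit CAS. A
 * sketch of that idiom, assuming GCC builtins in place of
 * dtrace_cas32() (illustrative, not part of the build):
 */
static void
example_cas_increment(volatile uint32_t *counter)
{
	uint32_t old;

	do {
		old = *counter;
	} while (!__atomic_compare_exchange_n(counter, &old, old + 1,
	    0, __ATOMIC_RELAXED, __ATOMIC_RELAXED));
}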
2483 2490
2484 2491 /*
2485 2492 * This routine commits an active speculation. If the specified speculation
2486 2493 * is not in a valid state to perform a commit(), this routine will silently do
2487 2494 * nothing. The state of the specified speculation is transitioned according
2488 2495  * to the state transition diagram outlined in <sys/dtrace_impl.h>.
2489 2496 */
2490 2497 static void
2491 2498 dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
2492 2499 dtrace_specid_t which)
2493 2500 {
2494 2501 dtrace_speculation_t *spec;
2495 2502 dtrace_buffer_t *src, *dest;
2496 2503 uintptr_t daddr, saddr, dlimit;
2497 2504 dtrace_speculation_state_t current, new;
2498 2505 intptr_t offs;
2499 2506
2500 2507 if (which == 0)
2501 2508 return;
2502 2509
2503 2510 if (which > state->dts_nspeculations) {
2504 2511 cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2505 2512 return;
2506 2513 }
2507 2514
2508 2515 spec = &state->dts_speculations[which - 1];
2509 2516 src = &spec->dtsp_buffer[cpu];
2510 2517 dest = &state->dts_buffer[cpu];
2511 2518
2512 2519 do {
2513 2520 current = spec->dtsp_state;
2514 2521
2515 2522 if (current == DTRACESPEC_COMMITTINGMANY)
2516 2523 break;
2517 2524
2518 2525 switch (current) {
2519 2526 case DTRACESPEC_INACTIVE:
2520 2527 case DTRACESPEC_DISCARDING:
2521 2528 return;
2522 2529
2523 2530 case DTRACESPEC_COMMITTING:
2524 2531 /*
2525 2532 * This is only possible if we are (a) commit()'ing
2526 2533 * without having done a prior speculate() on this CPU
2527 2534 * and (b) racing with another commit() on a different
2528 2535 * CPU. There's nothing to do -- we just assert that
2529 2536 * our offset is 0.
2530 2537 */
2531 2538 ASSERT(src->dtb_offset == 0);
2532 2539 return;
2533 2540
2534 2541 case DTRACESPEC_ACTIVE:
2535 2542 new = DTRACESPEC_COMMITTING;
2536 2543 break;
2537 2544
2538 2545 case DTRACESPEC_ACTIVEONE:
2539 2546 /*
2540 2547 * This speculation is active on one CPU. If our
2541 2548 * buffer offset is non-zero, we know that the one CPU
2542 2549 * must be us. Otherwise, we are committing on a
2543 2550 * different CPU from the speculate(), and we must
2544 2551 * rely on being asynchronously cleaned.
2545 2552 */
2546 2553 if (src->dtb_offset != 0) {
2547 2554 new = DTRACESPEC_COMMITTING;
2548 2555 break;
2549 2556 }
2550 2557 /*FALLTHROUGH*/
2551 2558
2552 2559 case DTRACESPEC_ACTIVEMANY:
2553 2560 new = DTRACESPEC_COMMITTINGMANY;
2554 2561 break;
2555 2562
2556 2563 default:
2557 2564 ASSERT(0);
2558 2565 }
2559 2566 } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2560 2567 current, new) != current);
2561 2568
2562 2569 /*
2563 2570 * We have set the state to indicate that we are committing this
2564 2571 * speculation. Now reserve the necessary space in the destination
2565 2572 * buffer.
2566 2573 */
2567 2574 if ((offs = dtrace_buffer_reserve(dest, src->dtb_offset,
2568 2575 sizeof (uint64_t), state, NULL)) < 0) {
2569 2576 dtrace_buffer_drop(dest);
2570 2577 goto out;
2571 2578 }
2572 2579
2573 2580 /*
2574 2581 * We have the space; copy the buffer across. (Note that this is a
2575 2582 	 * highly suboptimal bcopy(); in the unlikely event that this becomes
2576 2583 * a serious performance issue, a high-performance DTrace-specific
2577 2584 * bcopy() should obviously be invented.)
2578 2585 */
2579 2586 daddr = (uintptr_t)dest->dtb_tomax + offs;
2580 2587 dlimit = daddr + src->dtb_offset;
2581 2588 saddr = (uintptr_t)src->dtb_tomax;
2582 2589
2583 2590 /*
2584 2591 * First, the aligned portion.
2585 2592 */
2586 2593 while (dlimit - daddr >= sizeof (uint64_t)) {
2587 2594 *((uint64_t *)daddr) = *((uint64_t *)saddr);
2588 2595
2589 2596 daddr += sizeof (uint64_t);
2590 2597 saddr += sizeof (uint64_t);
2591 2598 }
2592 2599
2593 2600 /*
2594 2601 * Now any left-over bit...
2595 2602 */
2596 2603 while (dlimit - daddr)
2597 2604 *((uint8_t *)daddr++) = *((uint8_t *)saddr++);
2598 2605
2599 2606 /*
2600 2607 * Finally, commit the reserved space in the destination buffer.
2601 2608 */
2602 2609 dest->dtb_offset = offs + src->dtb_offset;
2603 2610
2604 2611 out:
2605 2612 /*
2606 2613 * If we're lucky enough to be the only active CPU on this speculation
2607 2614 * buffer, we can just set the state back to DTRACESPEC_INACTIVE.
2608 2615 */
2609 2616 if (current == DTRACESPEC_ACTIVE ||
2610 2617 (current == DTRACESPEC_ACTIVEONE && new == DTRACESPEC_COMMITTING)) {
2611 2618 uint32_t rval = dtrace_cas32((uint32_t *)&spec->dtsp_state,
2612 2619 DTRACESPEC_COMMITTING, DTRACESPEC_INACTIVE);
2613 2620
2614 2621 ASSERT(rval == DTRACESPEC_COMMITTING);
2615 2622 }
2616 2623
2617 2624 src->dtb_offset = 0;
2618 2625 src->dtb_xamot_drops += src->dtb_drops;
2619 2626 src->dtb_drops = 0;
2620 2627 }
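
/*
 * The copy loop above -- 64-bit words for the aligned bulk, bytes for
 * the tail -- as a standalone sketch (illustrative, not part of the
 * build). It assumes, as the caller above arranges, that both source
 * and destination start 8-byte aligned:
 */
static void
example_word_copy(uintptr_t daddr, uintptr_t saddr, size_t len)
{
	uintptr_t dlimit = daddr + len;

	while (dlimit - daddr >= sizeof (uint64_t)) {
		*((uint64_t *)daddr) = *((uint64_t *)saddr);
		daddr += sizeof (uint64_t);
		saddr += sizeof (uint64_t);
	}

	while (dlimit - daddr)
		*((uint8_t *)daddr++) = *((uint8_t *)saddr++);
}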
2621 2628
2622 2629 /*
2623 2630 * This routine discards an active speculation. If the specified speculation
2624 2631 * is not in a valid state to perform a discard(), this routine will silently
2625 2632 * do nothing. The state of the specified speculation is transitioned
2626 2633  * according to the state transition diagram outlined in <sys/dtrace_impl.h>.
2627 2634 */
2628 2635 static void
2629 2636 dtrace_speculation_discard(dtrace_state_t *state, processorid_t cpu,
2630 2637 dtrace_specid_t which)
2631 2638 {
2632 2639 dtrace_speculation_t *spec;
2633 2640 dtrace_speculation_state_t current, new;
2634 2641 dtrace_buffer_t *buf;
2635 2642
2636 2643 if (which == 0)
2637 2644 return;
2638 2645
2639 2646 if (which > state->dts_nspeculations) {
2640 2647 cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2641 2648 return;
2642 2649 }
2643 2650
2644 2651 spec = &state->dts_speculations[which - 1];
2645 2652 buf = &spec->dtsp_buffer[cpu];
2646 2653
2647 2654 do {
2648 2655 current = spec->dtsp_state;
2649 2656
2650 2657 switch (current) {
2651 2658 case DTRACESPEC_INACTIVE:
2652 2659 case DTRACESPEC_COMMITTINGMANY:
2653 2660 case DTRACESPEC_COMMITTING:
2654 2661 case DTRACESPEC_DISCARDING:
2655 2662 return;
2656 2663
2657 2664 case DTRACESPEC_ACTIVE:
2658 2665 case DTRACESPEC_ACTIVEMANY:
2659 2666 new = DTRACESPEC_DISCARDING;
2660 2667 break;
2661 2668
2662 2669 case DTRACESPEC_ACTIVEONE:
2663 2670 if (buf->dtb_offset != 0) {
2664 2671 new = DTRACESPEC_INACTIVE;
2665 2672 } else {
2666 2673 new = DTRACESPEC_DISCARDING;
2667 2674 }
2668 2675 break;
2669 2676
2670 2677 default:
2671 2678 ASSERT(0);
2672 2679 }
2673 2680 } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2674 2681 current, new) != current);
2675 2682
2676 2683 buf->dtb_offset = 0;
2677 2684 buf->dtb_drops = 0;
2678 2685 }
2679 2686
2680 2687 /*
2681 2688 * Note: not called from probe context. This function is called
2682 2689 * asynchronously from cross call context to clean any speculations that are
2683 2690 * in the COMMITTINGMANY or DISCARDING states. These speculations may not be
2684 2691 * transitioned back to the INACTIVE state until all CPUs have cleaned the
2685 2692 * speculation.
2686 2693 */
2687 2694 static void
2688 2695 dtrace_speculation_clean_here(dtrace_state_t *state)
2689 2696 {
2690 2697 dtrace_icookie_t cookie;
2691 2698 processorid_t cpu = CPU->cpu_id;
2692 2699 dtrace_buffer_t *dest = &state->dts_buffer[cpu];
2693 2700 dtrace_specid_t i;
2694 2701
2695 2702 cookie = dtrace_interrupt_disable();
2696 2703
2697 2704 if (dest->dtb_tomax == NULL) {
2698 2705 dtrace_interrupt_enable(cookie);
2699 2706 return;
2700 2707 }
2701 2708
2702 2709 for (i = 0; i < state->dts_nspeculations; i++) {
2703 2710 dtrace_speculation_t *spec = &state->dts_speculations[i];
2704 2711 dtrace_buffer_t *src = &spec->dtsp_buffer[cpu];
2705 2712
2706 2713 if (src->dtb_tomax == NULL)
2707 2714 continue;
2708 2715
2709 2716 if (spec->dtsp_state == DTRACESPEC_DISCARDING) {
2710 2717 src->dtb_offset = 0;
2711 2718 continue;
2712 2719 }
2713 2720
2714 2721 if (spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2715 2722 continue;
2716 2723
2717 2724 if (src->dtb_offset == 0)
2718 2725 continue;
2719 2726
2720 2727 dtrace_speculation_commit(state, cpu, i + 1);
2721 2728 }
2722 2729
2723 2730 dtrace_interrupt_enable(cookie);
2724 2731 }
2725 2732
2726 2733 /*
2727 2734 * Note: not called from probe context. This function is called
2728 2735 * asynchronously (and at a regular interval) to clean any speculations that
2729 2736 * are in the COMMITTINGMANY or DISCARDING states. If it discovers that there
2730 2737 * is work to be done, it cross calls all CPUs to perform that work;
2731 2738 * COMMITMANY and DISCARDING speculations may not be transitioned back to the
2732 2739  * COMMITTINGMANY and DISCARDING speculations may not be transitioned back to
2733 2740  * the INACTIVE state until they have been cleaned by all CPUs.
2733 2740 */
2734 2741 static void
2735 2742 dtrace_speculation_clean(dtrace_state_t *state)
2736 2743 {
2737 2744 int work = 0, rv;
2738 2745 dtrace_specid_t i;
2739 2746
2740 2747 for (i = 0; i < state->dts_nspeculations; i++) {
2741 2748 dtrace_speculation_t *spec = &state->dts_speculations[i];
2742 2749
2743 2750 ASSERT(!spec->dtsp_cleaning);
2744 2751
2745 2752 if (spec->dtsp_state != DTRACESPEC_DISCARDING &&
2746 2753 spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2747 2754 continue;
2748 2755
2749 2756 work++;
2750 2757 spec->dtsp_cleaning = 1;
2751 2758 }
2752 2759
2753 2760 if (!work)
2754 2761 return;
2755 2762
2756 2763 dtrace_xcall(DTRACE_CPUALL,
2757 2764 (dtrace_xcall_t)dtrace_speculation_clean_here, state);
2758 2765
2759 2766 /*
2760 2767 * We now know that all CPUs have committed or discarded their
2761 2768 * speculation buffers, as appropriate. We can now set the state
2762 2769 * to inactive.
2763 2770 */
2764 2771 for (i = 0; i < state->dts_nspeculations; i++) {
2765 2772 dtrace_speculation_t *spec = &state->dts_speculations[i];
2766 2773 dtrace_speculation_state_t current, new;
2767 2774
2768 2775 if (!spec->dtsp_cleaning)
2769 2776 continue;
2770 2777
2771 2778 current = spec->dtsp_state;
2772 2779 ASSERT(current == DTRACESPEC_DISCARDING ||
2773 2780 current == DTRACESPEC_COMMITTINGMANY);
2774 2781
2775 2782 new = DTRACESPEC_INACTIVE;
2776 2783
2777 2784 rv = dtrace_cas32((uint32_t *)&spec->dtsp_state, current, new);
2778 2785 ASSERT(rv == current);
2779 2786 spec->dtsp_cleaning = 0;
2780 2787 }
2781 2788 }
2782 2789
2783 2790 /*
2784 2791 * Called as part of a speculate() to get the speculative buffer associated
2785 2792 * with a given speculation. Returns NULL if the specified speculation is not
2786 2793 * in an ACTIVE state. If the speculation is in the ACTIVEONE state -- and
2787 2794 * the active CPU is not the specified CPU -- the speculation will be
2788 2795 * atomically transitioned into the ACTIVEMANY state.
2789 2796 */
2790 2797 static dtrace_buffer_t *
2791 2798 dtrace_speculation_buffer(dtrace_state_t *state, processorid_t cpuid,
2792 2799 dtrace_specid_t which)
2793 2800 {
2794 2801 dtrace_speculation_t *spec;
2795 2802 dtrace_speculation_state_t current, new;
2796 2803 dtrace_buffer_t *buf;
2797 2804
2798 2805 if (which == 0)
2799 2806 return (NULL);
2800 2807
2801 2808 if (which > state->dts_nspeculations) {
2802 2809 cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2803 2810 return (NULL);
2804 2811 }
2805 2812
2806 2813 spec = &state->dts_speculations[which - 1];
2807 2814 buf = &spec->dtsp_buffer[cpuid];
2808 2815
2809 2816 do {
2810 2817 current = spec->dtsp_state;
2811 2818
2812 2819 switch (current) {
2813 2820 case DTRACESPEC_INACTIVE:
2814 2821 case DTRACESPEC_COMMITTINGMANY:
2815 2822 case DTRACESPEC_DISCARDING:
2816 2823 return (NULL);
2817 2824
2818 2825 case DTRACESPEC_COMMITTING:
2819 2826 ASSERT(buf->dtb_offset == 0);
2820 2827 return (NULL);
2821 2828
2822 2829 case DTRACESPEC_ACTIVEONE:
2823 2830 /*
2824 2831 * This speculation is currently active on one CPU.
2825 2832 * Check the offset in the buffer; if it's non-zero,
2826 2833 * that CPU must be us (and we leave the state alone).
2827 2834 * If it's zero, assume that we're starting on a new
2828 2835 * CPU -- and change the state to indicate that the
2829 2836 * speculation is active on more than one CPU.
2830 2837 */
2831 2838 if (buf->dtb_offset != 0)
2832 2839 return (buf);
2833 2840
2834 2841 new = DTRACESPEC_ACTIVEMANY;
2835 2842 break;
2836 2843
2837 2844 case DTRACESPEC_ACTIVEMANY:
2838 2845 return (buf);
2839 2846
2840 2847 case DTRACESPEC_ACTIVE:
2841 2848 new = DTRACESPEC_ACTIVEONE;
2842 2849 break;
2843 2850
2844 2851 default:
2845 2852 ASSERT(0);
2846 2853 }
2847 2854 } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2848 2855 current, new) != current);
2849 2856
2850 2857 ASSERT(new == DTRACESPEC_ACTIVEONE || new == DTRACESPEC_ACTIVEMANY);
2851 2858 return (buf);
2852 2859 }
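
/*
 * Taken together, these routines walk the speculation state machine.
 * In outline (informal; see <sys/dtrace_impl.h> for the authoritative
 * transition diagram):
 *
 *	INACTIVE    --speculation()-->           ACTIVE
 *	ACTIVE      --speculate()-->             ACTIVEONE
 *	ACTIVEONE   --speculate(), new CPU-->    ACTIVEMANY
 *	ACTIVE      --commit()-->                COMMITTING --> INACTIVE
 *	ACTIVEONE   --commit(), same CPU-->      COMMITTING --> INACTIVE
 *	ACTIVEONE   --commit(), other CPU-->     COMMITTINGMANY
 *	ACTIVEMANY  --commit()-->                COMMITTINGMANY
 *	ACTIVEONE   --discard(), same CPU-->     INACTIVE
 *	ACTIVE, ACTIVEONE (other CPU), ACTIVEMANY --discard()--> DISCARDING
 *	COMMITTINGMANY, DISCARDING --cleaned on all CPUs--> INACTIVE
 */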
2853 2860
2854 2861 /*
2855 2862 * Return a string. In the event that the user lacks the privilege to access
2856 2863 * arbitrary kernel memory, we copy the string out to scratch memory so that we
2857 2864 * don't fail access checking.
2858 2865 *
2859 2866 * dtrace_dif_variable() uses this routine as a helper for various
2860 2867 * builtin values such as 'execname' and 'probefunc.'
2861 2868 */
2862 2869 uintptr_t
2863 2870 dtrace_dif_varstr(uintptr_t addr, dtrace_state_t *state,
2864 2871 dtrace_mstate_t *mstate)
2865 2872 {
2866 2873 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
2867 2874 uintptr_t ret;
2868 2875 size_t strsz;
2869 2876
2870 2877 /*
2871 2878 * The easy case: this probe is allowed to read all of memory, so
2872 2879 * we can just return this as a vanilla pointer.
2873 2880 */
2874 2881 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
2875 2882 return (addr);
2876 2883
2877 2884 /*
2878 2885 * This is the tougher case: we copy the string in question from
2879 2886 * kernel memory into scratch memory and return it that way: this
2880 2887 * ensures that we won't trip up when access checking tests the
2881 2888 * BYREF return value.
2882 2889 */
2883 2890 strsz = dtrace_strlen((char *)addr, size) + 1;
2884 2891
2885 2892 if (mstate->dtms_scratch_ptr + strsz >
2886 2893 mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
2887 2894 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
2888 2895 return (NULL);
2889 2896 }
2890 2897
2891 2898 dtrace_strcpy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
2892 2899 strsz);
2893 2900 ret = mstate->dtms_scratch_ptr;
2894 2901 mstate->dtms_scratch_ptr += strsz;
2895 2902 return (ret);
2896 2903 }
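
/*
 * The scratch copy above is a simple bump allocation: place the string
 * at the current scratch pointer, bounded by the end of the scratch
 * region, and advance the pointer past it. The same pattern in
 * user-land terms (illustrative, not part of the build; the
 * example_scratch_t descriptor is a hypothetical stand-in for the
 * mstate fields, and <string.h> strnlen()/memcpy() stand in for the
 * DTrace-safe routines):
 */
typedef struct example_scratch {
	char *base;	/* start of scratch region */
	char *ptr;	/* current allocation point */
	size_t size;	/* total region size */
} example_scratch_t;

static char *
example_scratch_strcpy(example_scratch_t *s, const char *str, size_t maxlen)
{
	size_t len = strnlen(str, maxlen);
	char *ret;

	if (s->ptr + len + 1 > s->base + s->size)
		return (NULL);		/* out of scratch: caller drops */

	(void) memcpy(s->ptr, str, len);
	s->ptr[len] = '\0';		/* terminate even if truncated */
	ret = s->ptr;
	s->ptr += len + 1;
	return (ret);
}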
2897 2904
2898 2905 /*
2899 2906 * This function implements the DIF emulator's variable lookups. The emulator
2900 2907  * passes a reserved variable identifier and an optional built-in array index.
2901 2908 */
2902 2909 static uint64_t
2903 2910 dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v,
2904 2911 uint64_t ndx)
2905 2912 {
2906 2913 /*
2907 2914 * If we're accessing one of the uncached arguments, we'll turn this
2908 2915 * into a reference in the args array.
2909 2916 */
2910 2917 if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9) {
2911 2918 ndx = v - DIF_VAR_ARG0;
2912 2919 v = DIF_VAR_ARGS;
2913 2920 }
2914 2921
2915 2922 switch (v) {
2916 2923 case DIF_VAR_ARGS:
2917 2924 if (!(mstate->dtms_access & DTRACE_ACCESS_ARGS)) {
2918 2925 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |=
2919 2926 CPU_DTRACE_KPRIV;
2920 2927 return (0);
2921 2928 }
2922 2929
2923 2930 ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS);
2924 2931 if (ndx >= sizeof (mstate->dtms_arg) /
2925 2932 sizeof (mstate->dtms_arg[0])) {
2926 2933 int aframes = mstate->dtms_probe->dtpr_aframes + 2;
2927 2934 dtrace_provider_t *pv;
2928 2935 uint64_t val;
2929 2936
2930 2937 pv = mstate->dtms_probe->dtpr_provider;
2931 2938 if (pv->dtpv_pops.dtps_getargval != NULL)
2932 2939 val = pv->dtpv_pops.dtps_getargval(pv->dtpv_arg,
2933 2940 mstate->dtms_probe->dtpr_id,
2934 2941 mstate->dtms_probe->dtpr_arg, ndx, aframes);
2935 2942 else
2936 2943 val = dtrace_getarg(ndx, aframes);
2937 2944
2938 2945 /*
2939 2946 * This is regrettably required to keep the compiler
2940 2947 * from tail-optimizing the call to dtrace_getarg().
2941 2948 * The condition always evaluates to true, but the
2942 2949 * compiler has no way of figuring that out a priori.
2943 2950 * (None of this would be necessary if the compiler
2944 2951 * could be relied upon to _always_ tail-optimize
2945 2952 * the call to dtrace_getarg() -- but it can't.)
2946 2953 */
2947 2954 if (mstate->dtms_probe != NULL)
2948 2955 return (val);
2949 2956
2950 2957 ASSERT(0);
2951 2958 }
2952 2959
2953 2960 return (mstate->dtms_arg[ndx]);
2954 2961
2955 2962 case DIF_VAR_UREGS: {
2956 2963 klwp_t *lwp;
2957 2964
2958 2965 if (!dtrace_priv_proc(state, mstate))
2959 2966 return (0);
2960 2967
2961 2968 if ((lwp = curthread->t_lwp) == NULL) {
2962 2969 DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
2963 2970 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = NULL;
2964 2971 return (0);
2965 2972 }
2966 2973
2967 2974 return (dtrace_getreg(lwp->lwp_regs, ndx));
2968 2975 }
2969 2976
2970 2977 case DIF_VAR_VMREGS: {
2971 2978 uint64_t rval;
2972 2979
2973 2980 if (!dtrace_priv_kernel(state))
2974 2981 return (0);
2975 2982
2976 2983 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
2977 2984
2978 2985 rval = dtrace_getvmreg(ndx,
2979 2986 &cpu_core[CPU->cpu_id].cpuc_dtrace_flags);
2980 2987
2981 2988 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
2982 2989
2983 2990 return (rval);
2984 2991 }
2985 2992
2986 2993 case DIF_VAR_CURTHREAD:
2987 2994 if (!dtrace_priv_proc(state, mstate))
2988 2995 return (0);
2989 2996 return ((uint64_t)(uintptr_t)curthread);
2990 2997
2991 2998 case DIF_VAR_TIMESTAMP:
2992 2999 if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
2993 3000 mstate->dtms_timestamp = dtrace_gethrtime();
2994 3001 mstate->dtms_present |= DTRACE_MSTATE_TIMESTAMP;
2995 3002 }
2996 3003 return (mstate->dtms_timestamp);
2997 3004
2998 3005 case DIF_VAR_VTIMESTAMP:
2999 3006 ASSERT(dtrace_vtime_references != 0);
3000 3007 return (curthread->t_dtrace_vtime);
3001 3008
3002 3009 case DIF_VAR_WALLTIMESTAMP:
3003 3010 if (!(mstate->dtms_present & DTRACE_MSTATE_WALLTIMESTAMP)) {
3004 3011 mstate->dtms_walltimestamp = dtrace_gethrestime();
3005 3012 mstate->dtms_present |= DTRACE_MSTATE_WALLTIMESTAMP;
3006 3013 }
3007 3014 return (mstate->dtms_walltimestamp);
3008 3015
3009 3016 case DIF_VAR_IPL:
3010 3017 if (!dtrace_priv_kernel(state))
3011 3018 return (0);
3012 3019 if (!(mstate->dtms_present & DTRACE_MSTATE_IPL)) {
3013 3020 mstate->dtms_ipl = dtrace_getipl();
3014 3021 mstate->dtms_present |= DTRACE_MSTATE_IPL;
3015 3022 }
3016 3023 return (mstate->dtms_ipl);
3017 3024
3018 3025 case DIF_VAR_EPID:
3019 3026 ASSERT(mstate->dtms_present & DTRACE_MSTATE_EPID);
3020 3027 return (mstate->dtms_epid);
3021 3028
3022 3029 case DIF_VAR_ID:
3023 3030 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3024 3031 return (mstate->dtms_probe->dtpr_id);
3025 3032
3026 3033 case DIF_VAR_STACKDEPTH:
3027 3034 if (!dtrace_priv_kernel(state))
3028 3035 return (0);
3029 3036 if (!(mstate->dtms_present & DTRACE_MSTATE_STACKDEPTH)) {
3030 3037 int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3031 3038
3032 3039 mstate->dtms_stackdepth = dtrace_getstackdepth(aframes);
3033 3040 mstate->dtms_present |= DTRACE_MSTATE_STACKDEPTH;
3034 3041 }
3035 3042 return (mstate->dtms_stackdepth);
3036 3043
3037 3044 case DIF_VAR_USTACKDEPTH:
3038 3045 if (!dtrace_priv_proc(state, mstate))
3039 3046 return (0);
3040 3047 if (!(mstate->dtms_present & DTRACE_MSTATE_USTACKDEPTH)) {
3041 3048 /*
3042 3049 * See comment in DIF_VAR_PID.
3043 3050 */
3044 3051 if (DTRACE_ANCHORED(mstate->dtms_probe) &&
3045 3052 CPU_ON_INTR(CPU)) {
3046 3053 mstate->dtms_ustackdepth = 0;
3047 3054 } else {
3048 3055 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3049 3056 mstate->dtms_ustackdepth =
3050 3057 dtrace_getustackdepth();
3051 3058 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3052 3059 }
3053 3060 mstate->dtms_present |= DTRACE_MSTATE_USTACKDEPTH;
3054 3061 }
3055 3062 return (mstate->dtms_ustackdepth);
3056 3063
3057 3064 case DIF_VAR_CALLER:
3058 3065 if (!dtrace_priv_kernel(state))
3059 3066 return (0);
3060 3067 if (!(mstate->dtms_present & DTRACE_MSTATE_CALLER)) {
3061 3068 int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3062 3069
3063 3070 if (!DTRACE_ANCHORED(mstate->dtms_probe)) {
3064 3071 /*
3065 3072 * If this is an unanchored probe, we are
3066 3073 * required to go through the slow path:
3067 3074 * dtrace_caller() only guarantees correct
3068 3075 * results for anchored probes.
3069 3076 */
3070 3077 pc_t caller[2];
3071 3078
3072 3079 dtrace_getpcstack(caller, 2, aframes,
3073 3080 (uint32_t *)(uintptr_t)mstate->dtms_arg[0]);
3074 3081 mstate->dtms_caller = caller[1];
3075 3082 } else if ((mstate->dtms_caller =
3076 3083 dtrace_caller(aframes)) == -1) {
3077 3084 /*
3078 3085 * We have failed to do this the quick way;
3079 3086 * we must resort to the slower approach of
3080 3087 * calling dtrace_getpcstack().
3081 3088 */
3082 3089 pc_t caller;
3083 3090
3084 3091 dtrace_getpcstack(&caller, 1, aframes, NULL);
3085 3092 mstate->dtms_caller = caller;
3086 3093 }
3087 3094
3088 3095 mstate->dtms_present |= DTRACE_MSTATE_CALLER;
3089 3096 }
3090 3097 return (mstate->dtms_caller);
3091 3098
3092 3099 case DIF_VAR_UCALLER:
3093 3100 if (!dtrace_priv_proc(state, mstate))
3094 3101 return (0);
3095 3102
3096 3103 if (!(mstate->dtms_present & DTRACE_MSTATE_UCALLER)) {
3097 3104 uint64_t ustack[3];
3098 3105
3099 3106 /*
3100 3107 * dtrace_getupcstack() fills in the first uint64_t
3101 3108 * with the current PID. The second uint64_t will
3102 3109 * be the program counter at user-level. The third
3103 3110 * uint64_t will contain the caller, which is what
3104 3111 * we're after.
3105 3112 */
3106 3113 ustack[2] = NULL;
3107 3114 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3108 3115 dtrace_getupcstack(ustack, 3);
3109 3116 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3110 3117 mstate->dtms_ucaller = ustack[2];
3111 3118 mstate->dtms_present |= DTRACE_MSTATE_UCALLER;
3112 3119 }
3113 3120
3114 3121 return (mstate->dtms_ucaller);
3115 3122
3116 3123 case DIF_VAR_PROBEPROV:
3117 3124 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3118 3125 return (dtrace_dif_varstr(
3119 3126 (uintptr_t)mstate->dtms_probe->dtpr_provider->dtpv_name,
3120 3127 state, mstate));
3121 3128
3122 3129 case DIF_VAR_PROBEMOD:
3123 3130 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3124 3131 return (dtrace_dif_varstr(
3125 3132 (uintptr_t)mstate->dtms_probe->dtpr_mod,
3126 3133 state, mstate));
3127 3134
3128 3135 case DIF_VAR_PROBEFUNC:
3129 3136 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3130 3137 return (dtrace_dif_varstr(
3131 3138 (uintptr_t)mstate->dtms_probe->dtpr_func,
3132 3139 state, mstate));
3133 3140
3134 3141 case DIF_VAR_PROBENAME:
3135 3142 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3136 3143 return (dtrace_dif_varstr(
3137 3144 (uintptr_t)mstate->dtms_probe->dtpr_name,
3138 3145 state, mstate));
3139 3146
3140 3147 case DIF_VAR_PID:
3141 3148 if (!dtrace_priv_proc(state, mstate))
3142 3149 return (0);
3143 3150
3144 3151 /*
3145 3152 * Note that we are assuming that an unanchored probe is
3146 3153 * always due to a high-level interrupt. (And we're assuming
3147 3154	 * that there is only a single high-level interrupt.)
3148 3155 */
3149 3156 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3150 3157 return (pid0.pid_id);
3151 3158
3152 3159 /*
3153 3160 * It is always safe to dereference one's own t_procp pointer:
3154 3161 * it always points to a valid, allocated proc structure.
3155 3162 * Further, it is always safe to dereference the p_pidp member
3156 3163	 * of one's own proc structure. (These are truisms because
3157 3164 * threads and processes don't clean up their own state --
3158 3165 * they leave that task to whomever reaps them.)
3159 3166 */
3160 3167 return ((uint64_t)curthread->t_procp->p_pidp->pid_id);
3161 3168
3162 3169 case DIF_VAR_PPID:
3163 3170 if (!dtrace_priv_proc(state, mstate))
3164 3171 return (0);
3165 3172
3166 3173 /*
3167 3174 * See comment in DIF_VAR_PID.
3168 3175 */
3169 3176 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3170 3177 return (pid0.pid_id);
3171 3178
3172 3179 /*
3173 3180 * It is always safe to dereference one's own t_procp pointer:
3174 3181 * it always points to a valid, allocated proc structure.
3175 3182 * (This is true because threads don't clean up their own
3176 3183 * state -- they leave that task to whomever reaps them.)
3177 3184 */
3178 3185 return ((uint64_t)curthread->t_procp->p_ppid);
3179 3186
3180 3187 case DIF_VAR_TID:
3181 3188 /*
3182 3189 * See comment in DIF_VAR_PID.
3183 3190 */
3184 3191 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3185 3192 return (0);
3186 3193
3187 3194 return ((uint64_t)curthread->t_tid);
3188 3195
3189 3196 case DIF_VAR_EXECNAME:
3190 3197 if (!dtrace_priv_proc(state, mstate))
3191 3198 return (0);
3192 3199
3193 3200 /*
3194 3201 * See comment in DIF_VAR_PID.
3195 3202 */
3196 3203 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3197 3204 return ((uint64_t)(uintptr_t)p0.p_user.u_comm);
3198 3205
3199 3206 /*
3200 3207 * It is always safe to dereference one's own t_procp pointer:
3201 3208 * it always points to a valid, allocated proc structure.
3202 3209 * (This is true because threads don't clean up their own
3203 3210 * state -- they leave that task to whomever reaps them.)
3204 3211 */
3205 3212 return (dtrace_dif_varstr(
3206 3213 (uintptr_t)curthread->t_procp->p_user.u_comm,
3207 3214 state, mstate));
3208 3215
3209 3216 case DIF_VAR_ZONENAME:
3210 3217 if (!dtrace_priv_proc(state, mstate))
3211 3218 return (0);
3212 3219
3213 3220 /*
3214 3221 * See comment in DIF_VAR_PID.
3215 3222 */
3216 3223 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3217 3224 return ((uint64_t)(uintptr_t)p0.p_zone->zone_name);
3218 3225
3219 3226 /*
3220 3227 * It is always safe to dereference one's own t_procp pointer:
3221 3228 * it always points to a valid, allocated proc structure.
3222 3229 * (This is true because threads don't clean up their own
3223 3230 * state -- they leave that task to whomever reaps them.)
3224 3231 */
3225 3232 return (dtrace_dif_varstr(
3226 3233 (uintptr_t)curthread->t_procp->p_zone->zone_name,
3227 3234 state, mstate));
3228 3235
3229 3236 case DIF_VAR_UID:
3230 3237 if (!dtrace_priv_proc(state, mstate))
3231 3238 return (0);
3232 3239
3233 3240 /*
3234 3241 * See comment in DIF_VAR_PID.
3235 3242 */
3236 3243 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3237 3244 return ((uint64_t)p0.p_cred->cr_uid);
3238 3245
3239 3246 /*
3240 3247 * It is always safe to dereference one's own t_procp pointer:
3241 3248 * it always points to a valid, allocated proc structure.
3242 3249 * (This is true because threads don't clean up their own
3243 3250 * state -- they leave that task to whomever reaps them.)
3244 3251 *
3245 3252 * Additionally, it is safe to dereference one's own process
3246 3253 * credential, since this is never NULL after process birth.
3247 3254 */
3248 3255 return ((uint64_t)curthread->t_procp->p_cred->cr_uid);
3249 3256
3250 3257 case DIF_VAR_GID:
3251 3258 if (!dtrace_priv_proc(state, mstate))
3252 3259 return (0);
3253 3260
3254 3261 /*
3255 3262 * See comment in DIF_VAR_PID.
3256 3263 */
3257 3264 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3258 3265 return ((uint64_t)p0.p_cred->cr_gid);
3259 3266
3260 3267 /*
3261 3268 * It is always safe to dereference one's own t_procp pointer:
3262 3269 * it always points to a valid, allocated proc structure.
3263 3270 * (This is true because threads don't clean up their own
3264 3271 * state -- they leave that task to whomever reaps them.)
3265 3272 *
3266 3273 * Additionally, it is safe to dereference one's own process
3267 3274 * credential, since this is never NULL after process birth.
3268 3275 */
3269 3276 return ((uint64_t)curthread->t_procp->p_cred->cr_gid);
3270 3277
3271 3278 case DIF_VAR_ERRNO: {
3272 3279 klwp_t *lwp;
3273 3280 if (!dtrace_priv_proc(state, mstate))
3274 3281 return (0);
3275 3282
3276 3283 /*
3277 3284 * See comment in DIF_VAR_PID.
3278 3285 */
3279 3286 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3280 3287 return (0);
3281 3288
3282 3289 /*
3283 3290 * It is always safe to dereference one's own t_lwp pointer in
3284 3291 * the event that this pointer is non-NULL. (This is true
3285 3292 * because threads and lwps don't clean up their own state --
3286 3293 * they leave that task to whomever reaps them.)
3287 3294 */
3288 3295 if ((lwp = curthread->t_lwp) == NULL)
3289 3296 return (0);
3290 3297
3291 3298 return ((uint64_t)lwp->lwp_errno);
3292 3299 }
3293 3300 default:
3294 3301 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3295 3302 return (0);
3296 3303 }
3297 3304 }
3298 3305
3299 3306 /*
3300 3307 * Emulate the execution of DTrace ID subroutines invoked by the call opcode.
3301 3308 * Notice that we don't bother validating the proper number of arguments or
3302 3309  * their types in the tuple stack.  This isn't needed: all argument
3303 3310  * interpretation is safe thanks to our load safety -- the worst that can
3304 3311 * happen is that a bogus program can obtain bogus results.
3305 3312 */
3306 3313 static void
3307 3314 dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs,
3308 3315 dtrace_key_t *tupregs, int nargs,
3309 3316 dtrace_mstate_t *mstate, dtrace_state_t *state)
3310 3317 {
3311 3318 volatile uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
3312 3319 volatile uintptr_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
3313 3320 dtrace_vstate_t *vstate = &state->dts_vstate;
3314 3321
3315 3322 union {
3316 3323 mutex_impl_t mi;
3317 3324 uint64_t mx;
3318 3325 } m;
3319 3326
3320 3327 union {
3321 3328 krwlock_t ri;
3322 3329 uintptr_t rw;
3323 3330 } r;
3324 3331
3325 3332 switch (subr) {
3326 3333 case DIF_SUBR_RAND:
3327 3334 regs[rd] = (dtrace_gethrtime() * 2416 + 374441) % 1771875;
3328 3335 break;
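/*
 * [Editor's sketch, not part of this change] rand() above is a single
 * linear-congruential step seeded by the current hrtime: x' = (x * a + c)
 * mod m, with a = 2416, c = 374441 and m = 1771875 taken from the case
 * above.  A minimal user-level rendering of the same recurrence (the seed
 * parameter here is hypothetical):
 */
#include <stdint.h>

static uint64_t
lcg_next(uint64_t seed)
{
	/* one LCG step -- statistically weak randomness, by design */
	return ((seed * 2416 + 374441) % 1771875);
}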
3329 3336
3330 3337 case DIF_SUBR_MUTEX_OWNED:
3331 3338 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3332 3339 mstate, vstate)) {
3333 3340 regs[rd] = NULL;
3334 3341 break;
3335 3342 }
3336 3343
3337 3344 m.mx = dtrace_load64(tupregs[0].dttk_value);
3338 3345 if (MUTEX_TYPE_ADAPTIVE(&m.mi))
3339 3346 regs[rd] = MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER;
3340 3347 else
3341 3348 regs[rd] = LOCK_HELD(&m.mi.m_spin.m_spinlock);
3342 3349 break;
3343 3350
3344 3351 case DIF_SUBR_MUTEX_OWNER:
3345 3352 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3346 3353 mstate, vstate)) {
3347 3354 regs[rd] = NULL;
3348 3355 break;
3349 3356 }
3350 3357
3351 3358 m.mx = dtrace_load64(tupregs[0].dttk_value);
3352 3359 if (MUTEX_TYPE_ADAPTIVE(&m.mi) &&
3353 3360 MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER)
3354 3361 regs[rd] = (uintptr_t)MUTEX_OWNER(&m.mi);
3355 3362 else
3356 3363 regs[rd] = 0;
3357 3364 break;
3358 3365
3359 3366 case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
3360 3367 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3361 3368 mstate, vstate)) {
3362 3369 regs[rd] = NULL;
3363 3370 break;
3364 3371 }
3365 3372
3366 3373 m.mx = dtrace_load64(tupregs[0].dttk_value);
3367 3374 regs[rd] = MUTEX_TYPE_ADAPTIVE(&m.mi);
3368 3375 break;
3369 3376
3370 3377 case DIF_SUBR_MUTEX_TYPE_SPIN:
3371 3378 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3372 3379 mstate, vstate)) {
3373 3380 regs[rd] = NULL;
3374 3381 break;
3375 3382 }
3376 3383
3377 3384 m.mx = dtrace_load64(tupregs[0].dttk_value);
3378 3385 regs[rd] = MUTEX_TYPE_SPIN(&m.mi);
3379 3386 break;
3380 3387
3381 3388 case DIF_SUBR_RW_READ_HELD: {
3382 3389 uintptr_t tmp;
3383 3390
3384 3391 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
3385 3392 mstate, vstate)) {
3386 3393 regs[rd] = NULL;
3387 3394 break;
3388 3395 }
3389 3396
3390 3397 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3391 3398 regs[rd] = _RW_READ_HELD(&r.ri, tmp);
3392 3399 break;
3393 3400 }
3394 3401
3395 3402 case DIF_SUBR_RW_WRITE_HELD:
3396 3403 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
3397 3404 mstate, vstate)) {
3398 3405 regs[rd] = NULL;
3399 3406 break;
3400 3407 }
3401 3408
3402 3409 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3403 3410 regs[rd] = _RW_WRITE_HELD(&r.ri);
3404 3411 break;
3405 3412
3406 3413 case DIF_SUBR_RW_ISWRITER:
3407 3414 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
3408 3415 mstate, vstate)) {
3409 3416 regs[rd] = NULL;
3410 3417 break;
3411 3418 }
3412 3419
3413 3420 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3414 3421 regs[rd] = _RW_ISWRITER(&r.ri);
3415 3422 break;
3416 3423
3417 3424 case DIF_SUBR_BCOPY: {
3418 3425 /*
3419 3426 * We need to be sure that the destination is in the scratch
3420 3427 * region -- no other region is allowed.
3421 3428 */
3422 3429 uintptr_t src = tupregs[0].dttk_value;
3423 3430 uintptr_t dest = tupregs[1].dttk_value;
3424 3431 size_t size = tupregs[2].dttk_value;
3425 3432
3426 3433 if (!dtrace_inscratch(dest, size, mstate)) {
3427 3434 *flags |= CPU_DTRACE_BADADDR;
3428 3435 *illval = regs[rd];
3429 3436 break;
3430 3437 }
3431 3438
3432 3439 if (!dtrace_canload(src, size, mstate, vstate)) {
3433 3440 regs[rd] = NULL;
3434 3441 break;
3435 3442 }
3436 3443
3437 3444 dtrace_bcopy((void *)src, (void *)dest, size);
3438 3445 break;
3439 3446 }
3440 3447
3441 3448 case DIF_SUBR_ALLOCA:
3442 3449 case DIF_SUBR_COPYIN: {
3443 3450 uintptr_t dest = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
3444 3451 uint64_t size =
3445 3452 tupregs[subr == DIF_SUBR_ALLOCA ? 0 : 1].dttk_value;
3446 3453 size_t scratch_size = (dest - mstate->dtms_scratch_ptr) + size;
3447 3454
3448 3455 /*
3449 3456 * This action doesn't require any credential checks since
3450 3457 * probes will not activate in user contexts to which the
3451 3458 * enabling user does not have permissions.
3452 3459 */
3453 3460
3454 3461 /*
3455 3462 * Rounding up the user allocation size could have overflowed
3456 3463 * a large, bogus allocation (like -1ULL) to 0.
3457 3464 */
3458 3465 if (scratch_size < size ||
3459 3466 !DTRACE_INSCRATCH(mstate, scratch_size)) {
3460 3467 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3461 3468 regs[rd] = NULL;
3462 3469 break;
3463 3470 }
3464 3471
3465 3472 if (subr == DIF_SUBR_COPYIN) {
3466 3473 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3467 3474 dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
3468 3475 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3469 3476 }
3470 3477
3471 3478 mstate->dtms_scratch_ptr += scratch_size;
3472 3479 regs[rd] = dest;
3473 3480 break;
3474 3481 }
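/*
 * [Editor's sketch, not part of this change] The scratch_size < size test
 * above is the entire overflow defense: if rounding wrapped a huge request
 * (such as -1ULL) around, the computed total comes out smaller than the
 * request itself.  A standalone rendering of the check (align is assumed
 * to be a power of two, as P2ROUNDUP() requires):
 */
#include <stdint.h>

static int
scratch_alloc_overflowed(uint64_t scratch_ptr, uint64_t size, uint64_t align)
{
	uint64_t dest = (scratch_ptr + (align - 1)) & ~(align - 1);
	uint64_t scratch_size = (dest - scratch_ptr) + size;

	/* wrap-around makes the sum smaller than one of its addends */
	return (scratch_size < size);
}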
3475 3482
3476 3483 case DIF_SUBR_COPYINTO: {
3477 3484 uint64_t size = tupregs[1].dttk_value;
3478 3485 uintptr_t dest = tupregs[2].dttk_value;
3479 3486
3480 3487 /*
3481 3488 * This action doesn't require any credential checks since
3482 3489 * probes will not activate in user contexts to which the
3483 3490 * enabling user does not have permissions.
3484 3491 */
3485 3492 if (!dtrace_inscratch(dest, size, mstate)) {
3486 3493 *flags |= CPU_DTRACE_BADADDR;
3487 3494 *illval = regs[rd];
3488 3495 break;
3489 3496 }
3490 3497
3491 3498 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3492 3499 dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
3493 3500 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3494 3501 break;
3495 3502 }
3496 3503
3497 3504 case DIF_SUBR_COPYINSTR: {
3498 3505 uintptr_t dest = mstate->dtms_scratch_ptr;
3499 3506 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3500 3507
3501 3508 if (nargs > 1 && tupregs[1].dttk_value < size)
3502 3509 size = tupregs[1].dttk_value + 1;
3503 3510
3504 3511 /*
3505 3512 * This action doesn't require any credential checks since
3506 3513 * probes will not activate in user contexts to which the
3507 3514 * enabling user does not have permissions.
3508 3515 */
3509 3516 if (!DTRACE_INSCRATCH(mstate, size)) {
3510 3517 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3511 3518 regs[rd] = NULL;
3512 3519 break;
3513 3520 }
3514 3521
3515 3522 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3516 3523 dtrace_copyinstr(tupregs[0].dttk_value, dest, size, flags);
3517 3524 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3518 3525
3519 3526 ((char *)dest)[size - 1] = '\0';
3520 3527 mstate->dtms_scratch_ptr += size;
3521 3528 regs[rd] = dest;
3522 3529 break;
3523 3530 }
3524 3531
3525 3532 case DIF_SUBR_MSGSIZE:
3526 3533 case DIF_SUBR_MSGDSIZE: {
3527 3534 uintptr_t baddr = tupregs[0].dttk_value, daddr;
3528 3535 uintptr_t wptr, rptr;
3529 3536 size_t count = 0;
3530 3537 int cont = 0;
3531 3538
3532 3539 while (baddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
3533 3540
3534 3541 if (!dtrace_canload(baddr, sizeof (mblk_t), mstate,
3535 3542 vstate)) {
3536 3543 regs[rd] = NULL;
3537 3544 break;
3538 3545 }
3539 3546
3540 3547 wptr = dtrace_loadptr(baddr +
3541 3548 offsetof(mblk_t, b_wptr));
3542 3549
3543 3550 rptr = dtrace_loadptr(baddr +
3544 3551 offsetof(mblk_t, b_rptr));
3545 3552
3546 3553 if (wptr < rptr) {
3547 3554 *flags |= CPU_DTRACE_BADADDR;
3548 3555 *illval = tupregs[0].dttk_value;
3549 3556 break;
3550 3557 }
3551 3558
3552 3559 daddr = dtrace_loadptr(baddr +
3553 3560 offsetof(mblk_t, b_datap));
3554 3561
3555 3562 baddr = dtrace_loadptr(baddr +
3556 3563 offsetof(mblk_t, b_cont));
3557 3564
3558 3565 /*
3559 3566			 * We want to guard against denial-of-service here,
3560 3567 * so we're only going to search the list for
3561 3568 * dtrace_msgdsize_max mblks.
3562 3569 */
3563 3570 if (cont++ > dtrace_msgdsize_max) {
3564 3571 *flags |= CPU_DTRACE_ILLOP;
3565 3572 break;
3566 3573 }
3567 3574
3568 3575 if (subr == DIF_SUBR_MSGDSIZE) {
3569 3576 if (dtrace_load8(daddr +
3570 3577 offsetof(dblk_t, db_type)) != M_DATA)
3571 3578 continue;
3572 3579 }
3573 3580
3574 3581 count += wptr - rptr;
3575 3582 }
3576 3583
3577 3584 if (!(*flags & CPU_DTRACE_FAULT))
3578 3585 regs[rd] = count;
3579 3586
3580 3587 break;
3581 3588 }
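/*
 * [Editor's sketch, not part of this change] The loop above sums
 * b_wptr - b_rptr across the b_cont chain but refuses to follow more than
 * dtrace_msgdsize_max links, so a corrupt or circular chain cannot be
 * walked forever.  The same bounded walk on a stand-in message type
 * (struct mb and max_hops are hypothetical, not the kernel's):
 */
#include <stddef.h>

struct mb {
	unsigned char *rptr, *wptr;	/* read and write pointers */
	struct mb *cont;		/* next block in the message */
};

static size_t
chain_size(const struct mb *m, int max_hops)
{
	size_t count = 0;
	int hops = 0;

	for (; m != NULL; m = m->cont) {
		if (hops++ > max_hops)
			return (0);	/* give up, as the ILLOP above does */

		count += (size_t)(m->wptr - m->rptr);
	}

	return (count);
}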
3582 3589
3583 3590 case DIF_SUBR_PROGENYOF: {
3584 3591 pid_t pid = tupregs[0].dttk_value;
3585 3592 proc_t *p;
3586 3593 int rval = 0;
3587 3594
3588 3595 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3589 3596
3590 3597 for (p = curthread->t_procp; p != NULL; p = p->p_parent) {
3591 3598 if (p->p_pidp->pid_id == pid) {
3592 3599 rval = 1;
3593 3600 break;
3594 3601 }
3595 3602 }
3596 3603
3597 3604 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3598 3605
3599 3606 regs[rd] = rval;
3600 3607 break;
3601 3608 }
3602 3609
3603 3610 case DIF_SUBR_SPECULATION:
3604 3611 regs[rd] = dtrace_speculation(state);
3605 3612 break;
3606 3613
3607 3614 case DIF_SUBR_COPYOUT: {
3608 3615 uintptr_t kaddr = tupregs[0].dttk_value;
3609 3616 uintptr_t uaddr = tupregs[1].dttk_value;
3610 3617 uint64_t size = tupregs[2].dttk_value;
3611 3618
3612 3619 if (!dtrace_destructive_disallow &&
3613 3620 dtrace_priv_proc_control(state, mstate) &&
3614 3621 !dtrace_istoxic(kaddr, size)) {
3615 3622 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3616 3623 dtrace_copyout(kaddr, uaddr, size, flags);
3617 3624 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3618 3625 }
3619 3626 break;
3620 3627 }
3621 3628
3622 3629 case DIF_SUBR_COPYOUTSTR: {
3623 3630 uintptr_t kaddr = tupregs[0].dttk_value;
3624 3631 uintptr_t uaddr = tupregs[1].dttk_value;
3625 3632 uint64_t size = tupregs[2].dttk_value;
3626 3633
3627 3634 if (!dtrace_destructive_disallow &&
3628 3635 dtrace_priv_proc_control(state, mstate) &&
3629 3636 !dtrace_istoxic(kaddr, size)) {
3630 3637 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3631 3638 dtrace_copyoutstr(kaddr, uaddr, size, flags);
3632 3639 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3633 3640 }
3634 3641 break;
3635 3642 }
3636 3643
3637 3644 case DIF_SUBR_STRLEN: {
3638 3645 size_t sz;
3639 3646 uintptr_t addr = (uintptr_t)tupregs[0].dttk_value;
3640 3647 sz = dtrace_strlen((char *)addr,
3641 3648 state->dts_options[DTRACEOPT_STRSIZE]);
3642 3649
3643 3650 if (!dtrace_canload(addr, sz + 1, mstate, vstate)) {
3644 3651 regs[rd] = NULL;
3645 3652 break;
3646 3653 }
3647 3654
3648 3655 regs[rd] = sz;
3649 3656
3650 3657 break;
3651 3658 }
3652 3659
3653 3660 case DIF_SUBR_STRCHR:
3654 3661 case DIF_SUBR_STRRCHR: {
3655 3662 /*
3656 3663 * We're going to iterate over the string looking for the
3657 3664 * specified character. We will iterate until we have reached
3658 3665 * the string length or we have found the character. If this
3659 3666 * is DIF_SUBR_STRRCHR, we will look for the last occurrence
3660 3667 * of the specified character instead of the first.
3661 3668 */
3662 3669 uintptr_t saddr = tupregs[0].dttk_value;
3663 3670 uintptr_t addr = tupregs[0].dttk_value;
3664 3671 uintptr_t limit = addr + state->dts_options[DTRACEOPT_STRSIZE];
3665 3672 char c, target = (char)tupregs[1].dttk_value;
3666 3673
3667 3674 for (regs[rd] = NULL; addr < limit; addr++) {
3668 3675 if ((c = dtrace_load8(addr)) == target) {
3669 3676 regs[rd] = addr;
3670 3677
3671 3678 if (subr == DIF_SUBR_STRCHR)
3672 3679 break;
3673 3680 }
3674 3681
3675 3682 if (c == '\0')
3676 3683 break;
3677 3684 }
3678 3685
3679 3686 if (!dtrace_canload(saddr, addr - saddr, mstate, vstate)) {
3680 3687 regs[rd] = NULL;
3681 3688 break;
3682 3689 }
3683 3690
3684 3691 break;
3685 3692 }
3686 3693
3687 3694 case DIF_SUBR_STRSTR:
3688 3695 case DIF_SUBR_INDEX:
3689 3696 case DIF_SUBR_RINDEX: {
3690 3697 /*
3691 3698 * We're going to iterate over the string looking for the
3692 3699 * specified string. We will iterate until we have reached
3693 3700 * the string length or we have found the string. (Yes, this
3694 3701 * is done in the most naive way possible -- but considering
3695 3702 * that the string we're searching for is likely to be
3696 3703 * relatively short, the complexity of Rabin-Karp or similar
3697 3704 * hardly seems merited.)
3698 3705 */
3699 3706 char *addr = (char *)(uintptr_t)tupregs[0].dttk_value;
3700 3707 char *substr = (char *)(uintptr_t)tupregs[1].dttk_value;
3701 3708 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3702 3709 size_t len = dtrace_strlen(addr, size);
3703 3710 size_t sublen = dtrace_strlen(substr, size);
3704 3711 char *limit = addr + len, *orig = addr;
3705 3712 int notfound = subr == DIF_SUBR_STRSTR ? 0 : -1;
3706 3713 int inc = 1;
3707 3714
3708 3715 regs[rd] = notfound;
3709 3716
3710 3717 if (!dtrace_canload((uintptr_t)addr, len + 1, mstate, vstate)) {
3711 3718 regs[rd] = NULL;
3712 3719 break;
3713 3720 }
3714 3721
3715 3722 if (!dtrace_canload((uintptr_t)substr, sublen + 1, mstate,
3716 3723 vstate)) {
3717 3724 regs[rd] = NULL;
3718 3725 break;
3719 3726 }
3720 3727
3721 3728 /*
3722 3729 * strstr() and index()/rindex() have similar semantics if
3723 3730 * both strings are the empty string: strstr() returns a
3724 3731 * pointer to the (empty) string, and index() and rindex()
3725 3732 * both return index 0 (regardless of any position argument).
3726 3733 */
3727 3734 if (sublen == 0 && len == 0) {
3728 3735 if (subr == DIF_SUBR_STRSTR)
3729 3736 regs[rd] = (uintptr_t)addr;
3730 3737 else
3731 3738 regs[rd] = 0;
3732 3739 break;
3733 3740 }
3734 3741
3735 3742 if (subr != DIF_SUBR_STRSTR) {
3736 3743 if (subr == DIF_SUBR_RINDEX) {
3737 3744 limit = orig - 1;
3738 3745 addr += len;
3739 3746 inc = -1;
3740 3747 }
3741 3748
3742 3749 /*
3743 3750 * Both index() and rindex() take an optional position
3744 3751 * argument that denotes the starting position.
3745 3752 */
3746 3753 if (nargs == 3) {
3747 3754 int64_t pos = (int64_t)tupregs[2].dttk_value;
3748 3755
3749 3756 /*
3750 3757 * If the position argument to index() is
3751 3758 * negative, Perl implicitly clamps it at
3752 3759 * zero. This semantic is a little surprising
3753 3760 * given the special meaning of negative
3754 3761 * positions to similar Perl functions like
3755 3762 * substr(), but it appears to reflect a
3756 3763 * notion that index() can start from a
3757 3764 * negative index and increment its way up to
3758 3765 * the string. Given this notion, Perl's
3759 3766 * rindex() is at least self-consistent in
3760 3767 * that it implicitly clamps positions greater
3761 3768 * than the string length to be the string
3762 3769 * length. Where Perl completely loses
3763 3770 * coherence, however, is when the specified
3764 3771 * substring is the empty string (""). In
3765 3772 * this case, even if the position is
3766 3773 * negative, rindex() returns 0 -- and even if
3767 3774 * the position is greater than the length,
3768 3775 * index() returns the string length. These
3769 3776 * semantics violate the notion that index()
3770 3777 * should never return a value less than the
3771 3778 * specified position and that rindex() should
3772 3779 * never return a value greater than the
3773 3780 * specified position. (One assumes that
3774 3781 * these semantics are artifacts of Perl's
3775 3782 * implementation and not the results of
3776 3783 * deliberate design -- it beggars belief that
3777 3784 * even Larry Wall could desire such oddness.)
3778 3785 * While in the abstract one would wish for
3779 3786 * consistent position semantics across
3780 3787 * substr(), index() and rindex() -- or at the
3781 3788 * very least self-consistent position
3782 3789 * semantics for index() and rindex() -- we
3783 3790 * instead opt to keep with the extant Perl
3784 3791 * semantics, in all their broken glory. (Do
3785 3792 * we have more desire to maintain Perl's
3786 3793 * semantics than Perl does? Probably.)
3787 3794 */
3788 3795 if (subr == DIF_SUBR_RINDEX) {
3789 3796 if (pos < 0) {
3790 3797 if (sublen == 0)
3791 3798 regs[rd] = 0;
3792 3799 break;
3793 3800 }
3794 3801
3795 3802 if (pos > len)
3796 3803 pos = len;
3797 3804 } else {
3798 3805 if (pos < 0)
3799 3806 pos = 0;
3800 3807
3801 3808 if (pos >= len) {
3802 3809 if (sublen == 0)
3803 3810 regs[rd] = len;
3804 3811 break;
3805 3812 }
3806 3813 }
3807 3814
3808 3815 addr = orig + pos;
3809 3816 }
3810 3817 }
3811 3818
3812 3819 for (regs[rd] = notfound; addr != limit; addr += inc) {
3813 3820 if (dtrace_strncmp(addr, substr, sublen) == 0) {
3814 3821 if (subr != DIF_SUBR_STRSTR) {
3815 3822 /*
3816 3823 * As D index() and rindex() are
3817 3824 * modeled on Perl (and not on awk),
3818 3825 * we return a zero-based (and not a
3819 3826 * one-based) index. (For you Perl
3820 3827 * weenies: no, we're not going to add
3821 3828 * $[ -- and shouldn't you be at a con
3822 3829 * or something?)
3823 3830 */
3824 3831 regs[rd] = (uintptr_t)(addr - orig);
3825 3832 break;
3826 3833 }
3827 3834
3828 3835 ASSERT(subr == DIF_SUBR_STRSTR);
3829 3836 regs[rd] = (uintptr_t)addr;
3830 3837 break;
3831 3838 }
3832 3839 }
3833 3840
3834 3841 break;
3835 3842 }
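/*
 * [Editor's sketch, not part of this change] The Perl-compatible position
 * clamping described above, reduced to a standalone index() on plain C
 * strings (d_index() is a hypothetical name; rindex() mirrors it with the
 * scan limits reversed):
 */
#include <string.h>

static long
d_index(const char *s, const char *sub, long pos)
{
	size_t len = strlen(s), sublen = strlen(sub);

	if (pos < 0)
		pos = 0;	/* negative positions clamp to zero */

	if ((size_t)pos >= len)
		return (sublen == 0 ? (long)len : -1);

	for (; (size_t)pos + sublen <= len; pos++) {
		if (strncmp(s + pos, sub, sublen) == 0)
			return (pos);	/* zero-based, as in D */
	}

	return (-1);	/* the emulator's "notfound" */
}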
3836 3843
3837 3844 case DIF_SUBR_STRTOK: {
3838 3845 uintptr_t addr = tupregs[0].dttk_value;
3839 3846 uintptr_t tokaddr = tupregs[1].dttk_value;
3840 3847 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3841 3848 uintptr_t limit, toklimit = tokaddr + size;
3842 3849 uint8_t c, tokmap[32]; /* 256 / 8 */
3843 3850 char *dest = (char *)mstate->dtms_scratch_ptr;
3844 3851 int i;
3845 3852
3846 3853 /*
3847 3854 * Check both the token buffer and (later) the input buffer,
3848 3855 * since both could be non-scratch addresses.
3849 3856 */
3850 3857 if (!dtrace_strcanload(tokaddr, size, mstate, vstate)) {
3851 3858 regs[rd] = NULL;
3852 3859 break;
3853 3860 }
3854 3861
3855 3862 if (!DTRACE_INSCRATCH(mstate, size)) {
3856 3863 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3857 3864 regs[rd] = NULL;
3858 3865 break;
3859 3866 }
3860 3867
3861 3868 if (addr == NULL) {
3862 3869 /*
3863 3870 * If the address specified is NULL, we use our saved
3864 3871 * strtok pointer from the mstate. Note that this
3865 3872 * means that the saved strtok pointer is _only_
3866 3873 * valid within multiple enablings of the same probe --
3867 3874 * it behaves like an implicit clause-local variable.
3868 3875 */
3869 3876 addr = mstate->dtms_strtok;
3870 3877 } else {
3871 3878 /*
3872 3879			 * If the user-specified address is non-NULL we must
3873 3880			 * access-check it.  This is the only time we have
3874 3881			 * a chance to do so, since this address may reside
3875 3882			 * in the string table of this clause -- future calls
3876 3883 * (when we fetch addr from mstate->dtms_strtok)
3877 3884 * would fail this access check.
3878 3885 */
3879 3886 if (!dtrace_strcanload(addr, size, mstate, vstate)) {
3880 3887 regs[rd] = NULL;
3881 3888 break;
3882 3889 }
3883 3890 }
3884 3891
3885 3892 /*
3886 3893 * First, zero the token map, and then process the token
3887 3894 * string -- setting a bit in the map for every character
3888 3895 * found in the token string.
3889 3896 */
3890 3897 for (i = 0; i < sizeof (tokmap); i++)
3891 3898 tokmap[i] = 0;
3892 3899
3893 3900 for (; tokaddr < toklimit; tokaddr++) {
3894 3901 if ((c = dtrace_load8(tokaddr)) == '\0')
3895 3902 break;
3896 3903
3897 3904 ASSERT((c >> 3) < sizeof (tokmap));
3898 3905 tokmap[c >> 3] |= (1 << (c & 0x7));
3899 3906 }
3900 3907
3901 3908 for (limit = addr + size; addr < limit; addr++) {
3902 3909 /*
3903 3910 * We're looking for a character that is _not_ contained
3904 3911 * in the token string.
3905 3912 */
3906 3913 if ((c = dtrace_load8(addr)) == '\0')
3907 3914 break;
3908 3915
3909 3916 if (!(tokmap[c >> 3] & (1 << (c & 0x7))))
3910 3917 break;
3911 3918 }
3912 3919
3913 3920 if (c == '\0') {
3914 3921 /*
3915 3922 * We reached the end of the string without finding
3916 3923 * any character that was not in the token string.
3917 3924 * We return NULL in this case, and we set the saved
3918 3925 * address to NULL as well.
3919 3926 */
3920 3927 regs[rd] = NULL;
3921 3928 mstate->dtms_strtok = NULL;
3922 3929 break;
3923 3930 }
3924 3931
3925 3932 /*
3926 3933 * From here on, we're copying into the destination string.
3927 3934 */
3928 3935 for (i = 0; addr < limit && i < size - 1; addr++) {
3929 3936 if ((c = dtrace_load8(addr)) == '\0')
3930 3937 break;
3931 3938
3932 3939 if (tokmap[c >> 3] & (1 << (c & 0x7)))
3933 3940 break;
3934 3941
3935 3942 ASSERT(i < size);
3936 3943 dest[i++] = c;
3937 3944 }
3938 3945
3939 3946 ASSERT(i < size);
3940 3947 dest[i] = '\0';
3941 3948 regs[rd] = (uintptr_t)dest;
3942 3949 mstate->dtms_scratch_ptr += size;
3943 3950 mstate->dtms_strtok = addr;
3944 3951 break;
3945 3952 }
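/*
 * [Editor's sketch, not part of this change] The token map above is a
 * 256-bit bitmap -- one bit per possible byte value -- so each delimiter
 * test costs one shift and one mask.  The same set-membership trick in
 * isolation:
 */
#include <stdint.h>

static uint8_t tokmap[32];	/* 256 bits / 8 */

static void
tokmap_set(uint8_t c)
{
	tokmap[c >> 3] |= (uint8_t)(1 << (c & 0x7));
}

static int
tokmap_test(uint8_t c)
{
	return ((tokmap[c >> 3] >> (c & 0x7)) & 1);
}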
3946 3953
3947 3954 case DIF_SUBR_SUBSTR: {
3948 3955 uintptr_t s = tupregs[0].dttk_value;
3949 3956 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3950 3957 char *d = (char *)mstate->dtms_scratch_ptr;
3951 3958 int64_t index = (int64_t)tupregs[1].dttk_value;
3952 3959 int64_t remaining = (int64_t)tupregs[2].dttk_value;
3953 3960 size_t len = dtrace_strlen((char *)s, size);
3954 3961 int64_t i;
3955 3962
3956 3963 if (!dtrace_canload(s, len + 1, mstate, vstate)) {
3957 3964 regs[rd] = NULL;
3958 3965 break;
3959 3966 }
3960 3967
3961 3968 if (!DTRACE_INSCRATCH(mstate, size)) {
3962 3969 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3963 3970 regs[rd] = NULL;
3964 3971 break;
3965 3972 }
3966 3973
3967 3974 if (nargs <= 2)
3968 3975 remaining = (int64_t)size;
3969 3976
3970 3977 if (index < 0) {
3971 3978 index += len;
3972 3979
3973 3980 if (index < 0 && index + remaining > 0) {
3974 3981 remaining += index;
3975 3982 index = 0;
3976 3983 }
3977 3984 }
3978 3985
3979 3986 if (index >= len || index < 0) {
3980 3987 remaining = 0;
3981 3988 } else if (remaining < 0) {
3982 3989 remaining += len - index;
3983 3990 } else if (index + remaining > size) {
3984 3991 remaining = size - index;
3985 3992 }
3986 3993
3987 3994 for (i = 0; i < remaining; i++) {
3988 3995 if ((d[i] = dtrace_load8(s + index + i)) == '\0')
3989 3996 break;
3990 3997 }
3991 3998
3992 3999 d[i] = '\0';
3993 4000
3994 4001 mstate->dtms_scratch_ptr += size;
3995 4002 regs[rd] = (uintptr_t)d;
3996 4003 break;
3997 4004 }
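/*
 * [Editor's sketch, not part of this change] The index/remaining
 * normalization above, isolated: a negative index counts back from the end
 * of the string, a negative remaining means "through the end", and both
 * are clamped so the copy loop can never run past the string or the
 * scratch buffer.  len and size are plain parameters here, not emulator
 * state:
 */
#include <stdint.h>

static void
substr_clamp(int64_t len, int64_t size, int64_t *index, int64_t *remaining)
{
	if (*index < 0) {
		*index += len;

		if (*index < 0 && *index + *remaining > 0) {
			*remaining += *index;
			*index = 0;
		}
	}

	if (*index >= len || *index < 0)
		*remaining = 0;
	else if (*remaining < 0)
		*remaining += len - *index;
	else if (*index + *remaining > size)
		*remaining = size - *index;
}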
3998 4005
3999 4006 case DIF_SUBR_TOUPPER:
4000 4007 case DIF_SUBR_TOLOWER: {
4001 4008 uintptr_t s = tupregs[0].dttk_value;
4002 4009 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4003 4010 char *dest = (char *)mstate->dtms_scratch_ptr, c;
4004 4011 size_t len = dtrace_strlen((char *)s, size);
4005 4012 char lower, upper, convert;
4006 4013 int64_t i;
4007 4014
4008 4015 if (subr == DIF_SUBR_TOUPPER) {
4009 4016 lower = 'a';
4010 4017 upper = 'z';
4011 4018 convert = 'A';
4012 4019 } else {
4013 4020 lower = 'A';
4014 4021 upper = 'Z';
4015 4022 convert = 'a';
4016 4023 }
4017 4024
4018 4025 if (!dtrace_canload(s, len + 1, mstate, vstate)) {
4019 4026 regs[rd] = NULL;
4020 4027 break;
4021 4028 }
4022 4029
4023 4030 if (!DTRACE_INSCRATCH(mstate, size)) {
4024 4031 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4025 4032 regs[rd] = NULL;
4026 4033 break;
4027 4034 }
4028 4035
4029 4036 for (i = 0; i < size - 1; i++) {
4030 4037 if ((c = dtrace_load8(s + i)) == '\0')
4031 4038 break;
4032 4039
4033 4040 if (c >= lower && c <= upper)
4034 4041 c = convert + (c - lower);
4035 4042
4036 4043 dest[i] = c;
4037 4044 }
4038 4045
4039 4046 ASSERT(i < size);
4040 4047 dest[i] = '\0';
4041 4048 regs[rd] = (uintptr_t)dest;
4042 4049 mstate->dtms_scratch_ptr += size;
4043 4050 break;
4044 4051 }
4045 4052
4046 4053 case DIF_SUBR_GETMAJOR:
4047 4054 #ifdef _LP64
4048 4055 regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR64) & MAXMAJ64;
4049 4056 #else
4050 4057 regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR) & MAXMAJ;
4051 4058 #endif
4052 4059 break;
4053 4060
4054 4061 case DIF_SUBR_GETMINOR:
4055 4062 #ifdef _LP64
4056 4063 regs[rd] = tupregs[0].dttk_value & MAXMIN64;
4057 4064 #else
4058 4065 regs[rd] = tupregs[0].dttk_value & MAXMIN;
4059 4066 #endif
4060 4067 break;
4061 4068
4062 4069 case DIF_SUBR_DDI_PATHNAME: {
4063 4070 /*
4064 4071 * This one is a galactic mess. We are going to roughly
4065 4072 * emulate ddi_pathname(), but it's made more complicated
4066 4073 * by the fact that we (a) want to include the minor name and
4067 4074 * (b) must proceed iteratively instead of recursively.
4068 4075 */
4069 4076 uintptr_t dest = mstate->dtms_scratch_ptr;
4070 4077 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4071 4078 char *start = (char *)dest, *end = start + size - 1;
4072 4079 uintptr_t daddr = tupregs[0].dttk_value;
4073 4080 int64_t minor = (int64_t)tupregs[1].dttk_value;
4074 4081 char *s;
4075 4082 int i, len, depth = 0;
4076 4083
4077 4084 /*
4078 4085 * Due to all the pointer jumping we do and context we must
4079 4086 * rely upon, we just mandate that the user must have kernel
4080 4087 * read privileges to use this routine.
4081 4088 */
4082 4089 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) == 0) {
4083 4090 *flags |= CPU_DTRACE_KPRIV;
4084 4091 *illval = daddr;
4085 4092 regs[rd] = NULL;
4086 4093 }
4087 4094
4088 4095 if (!DTRACE_INSCRATCH(mstate, size)) {
4089 4096 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4090 4097 regs[rd] = NULL;
4091 4098 break;
4092 4099 }
4093 4100
4094 4101 *end = '\0';
4095 4102
4096 4103 /*
4097 4104 * We want to have a name for the minor. In order to do this,
4098 4105 * we need to walk the minor list from the devinfo. We want
4099 4106 * to be sure that we don't infinitely walk a circular list,
4100 4107 * so we check for circularity by sending a scout pointer
4101 4108 * ahead two elements for every element that we iterate over;
4102 4109 * if the list is circular, these will ultimately point to the
4103 4110 * same element. You may recognize this little trick as the
4104 4111 * answer to a stupid interview question -- one that always
4105 4112 * seems to be asked by those who had to have it laboriously
4106 4113 * explained to them, and who can't even concisely describe
4107 4114 * the conditions under which one would be forced to resort to
4108 4115 * this technique. Needless to say, those conditions are
4109 4116 * found here -- and probably only here. Is this the only use
4110 4117 * of this infamous trick in shipping, production code? If it
4111 4118 * isn't, it probably should be...
4112 4119 */
4113 4120 if (minor != -1) {
4114 4121 uintptr_t maddr = dtrace_loadptr(daddr +
4115 4122 offsetof(struct dev_info, devi_minor));
4116 4123
4117 4124 uintptr_t next = offsetof(struct ddi_minor_data, next);
4118 4125 uintptr_t name = offsetof(struct ddi_minor_data,
4119 4126 d_minor) + offsetof(struct ddi_minor, name);
4120 4127 uintptr_t dev = offsetof(struct ddi_minor_data,
4121 4128 d_minor) + offsetof(struct ddi_minor, dev);
4122 4129 uintptr_t scout;
4123 4130
4124 4131 if (maddr != NULL)
4125 4132 scout = dtrace_loadptr(maddr + next);
4126 4133
4127 4134 while (maddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
4128 4135 uint64_t m;
4129 4136 #ifdef _LP64
4130 4137 m = dtrace_load64(maddr + dev) & MAXMIN64;
4131 4138 #else
4132 4139 m = dtrace_load32(maddr + dev) & MAXMIN;
4133 4140 #endif
4134 4141 if (m != minor) {
4135 4142 maddr = dtrace_loadptr(maddr + next);
4136 4143
4137 4144 if (scout == NULL)
4138 4145 continue;
4139 4146
4140 4147 scout = dtrace_loadptr(scout + next);
4141 4148
4142 4149 if (scout == NULL)
4143 4150 continue;
4144 4151
4145 4152 scout = dtrace_loadptr(scout + next);
4146 4153
4147 4154 if (scout == NULL)
4148 4155 continue;
4149 4156
4150 4157 if (scout == maddr) {
4151 4158 *flags |= CPU_DTRACE_ILLOP;
4152 4159 break;
4153 4160 }
4154 4161
4155 4162 continue;
4156 4163 }
4157 4164
4158 4165 /*
4159 4166 * We have the minor data. Now we need to
4160 4167 * copy the minor's name into the end of the
4161 4168 * pathname.
4162 4169 */
4163 4170 s = (char *)dtrace_loadptr(maddr + name);
4164 4171 len = dtrace_strlen(s, size);
4165 4172
4166 4173 if (*flags & CPU_DTRACE_FAULT)
4167 4174 break;
4168 4175
4169 4176 if (len != 0) {
4170 4177 if ((end -= (len + 1)) < start)
4171 4178 break;
4172 4179
4173 4180 *end = ':';
4174 4181 }
4175 4182
4176 4183 for (i = 1; i <= len; i++)
4177 4184 end[i] = dtrace_load8((uintptr_t)s++);
4178 4185 break;
4179 4186 }
4180 4187 }
4181 4188
4182 4189 while (daddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
4183 4190 ddi_node_state_t devi_state;
4184 4191
4185 4192 devi_state = dtrace_load32(daddr +
4186 4193 offsetof(struct dev_info, devi_node_state));
4187 4194
4188 4195 if (*flags & CPU_DTRACE_FAULT)
4189 4196 break;
4190 4197
4191 4198 if (devi_state >= DS_INITIALIZED) {
4192 4199 s = (char *)dtrace_loadptr(daddr +
4193 4200 offsetof(struct dev_info, devi_addr));
4194 4201 len = dtrace_strlen(s, size);
4195 4202
4196 4203 if (*flags & CPU_DTRACE_FAULT)
4197 4204 break;
4198 4205
4199 4206 if (len != 0) {
4200 4207 if ((end -= (len + 1)) < start)
4201 4208 break;
4202 4209
4203 4210 *end = '@';
4204 4211 }
4205 4212
4206 4213 for (i = 1; i <= len; i++)
4207 4214 end[i] = dtrace_load8((uintptr_t)s++);
4208 4215 }
4209 4216
4210 4217 /*
4211 4218 * Now for the node name...
4212 4219 */
4213 4220 s = (char *)dtrace_loadptr(daddr +
4214 4221 offsetof(struct dev_info, devi_node_name));
4215 4222
4216 4223 daddr = dtrace_loadptr(daddr +
4217 4224 offsetof(struct dev_info, devi_parent));
4218 4225
4219 4226 /*
4220 4227 * If our parent is NULL (that is, if we're the root
4221 4228 * node), we're going to use the special path
4222 4229 * "devices".
4223 4230 */
4224 4231 if (daddr == NULL)
4225 4232 s = "devices";
4226 4233
4227 4234 len = dtrace_strlen(s, size);
4228 4235 if (*flags & CPU_DTRACE_FAULT)
4229 4236 break;
4230 4237
4231 4238 if ((end -= (len + 1)) < start)
4232 4239 break;
4233 4240
4234 4241 for (i = 1; i <= len; i++)
4235 4242 end[i] = dtrace_load8((uintptr_t)s++);
4236 4243 *end = '/';
4237 4244
4238 4245 if (depth++ > dtrace_devdepth_max) {
4239 4246 *flags |= CPU_DTRACE_ILLOP;
4240 4247 break;
4241 4248 }
4242 4249 }
4243 4250
4244 4251 if (end < start)
4245 4252 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4246 4253
4247 4254 if (daddr == NULL) {
4248 4255 regs[rd] = (uintptr_t)end;
4249 4256 mstate->dtms_scratch_ptr += size;
4250 4257 }
4251 4258
4252 4259 break;
4253 4260 }
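/*
 * [Editor's sketch, not part of this change] The "scout" above is the
 * classic two-speed cycle check (Floyd's tortoise and hare): one pointer
 * advances a single link per element visited, the other two; on a circular
 * list the two must eventually coincide.  On a stand-in node type:
 */
struct node {
	struct node *next;
};

static int
list_is_circular(const struct node *head)
{
	const struct node *slow = head, *fast = head;

	while (fast != NULL && fast->next != NULL) {
		slow = slow->next;		/* one step */
		fast = fast->next->next;	/* two steps */

		if (slow == fast)
			return (1);	/* the pointers met: a cycle */
	}

	return (0);	/* ran off the end: no cycle */
}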
4254 4261
4255 4262 case DIF_SUBR_STRJOIN: {
4256 4263 char *d = (char *)mstate->dtms_scratch_ptr;
4257 4264 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4258 4265 uintptr_t s1 = tupregs[0].dttk_value;
4259 4266 uintptr_t s2 = tupregs[1].dttk_value;
4260 4267 int i = 0;
4261 4268
4262 4269 if (!dtrace_strcanload(s1, size, mstate, vstate) ||
4263 4270 !dtrace_strcanload(s2, size, mstate, vstate)) {
4264 4271 regs[rd] = NULL;
4265 4272 break;
4266 4273 }
4267 4274
4268 4275 if (!DTRACE_INSCRATCH(mstate, size)) {
4269 4276 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4270 4277 regs[rd] = NULL;
4271 4278 break;
4272 4279 }
4273 4280
4274 4281 for (;;) {
4275 4282 if (i >= size) {
4276 4283 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4277 4284 regs[rd] = NULL;
4278 4285 break;
4279 4286 }
4280 4287
4281 4288 if ((d[i++] = dtrace_load8(s1++)) == '\0') {
4282 4289 i--;
4283 4290 break;
4284 4291 }
4285 4292 }
4286 4293
4287 4294 for (;;) {
4288 4295 if (i >= size) {
4289 4296 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4290 4297 regs[rd] = NULL;
4291 4298 break;
4292 4299 }
4293 4300
4294 4301 if ((d[i++] = dtrace_load8(s2++)) == '\0')
4295 4302 break;
4296 4303 }
4297 4304
4298 4305 if (i < size) {
4299 4306 mstate->dtms_scratch_ptr += i;
4300 4307 regs[rd] = (uintptr_t)d;
4301 4308 }
4302 4309
4303 4310 break;
4304 4311 }
4305 4312
4306 4313 case DIF_SUBR_LLTOSTR: {
4307 4314 int64_t i = (int64_t)tupregs[0].dttk_value;
4308 4315 uint64_t val, digit;
4309 4316 uint64_t size = 65; /* enough room for 2^64 in binary */
4310 4317 char *end = (char *)mstate->dtms_scratch_ptr + size - 1;
4311 4318 int base = 10;
4312 4319
4313 4320 if (nargs > 1) {
4314 4321 if ((base = tupregs[1].dttk_value) <= 1 ||
4315 4322 base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
4316 4323 *flags |= CPU_DTRACE_ILLOP;
4317 4324 break;
4318 4325 }
4319 4326 }
4320 4327
4321 4328 val = (base == 10 && i < 0) ? i * -1 : i;
4322 4329
4323 4330 if (!DTRACE_INSCRATCH(mstate, size)) {
4324 4331 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4325 4332 regs[rd] = NULL;
4326 4333 break;
4327 4334 }
4328 4335
4329 4336 for (*end-- = '\0'; val; val /= base) {
4330 4337 if ((digit = val % base) <= '9' - '0') {
4331 4338 *end-- = '0' + digit;
4332 4339 } else {
4333 4340 *end-- = 'a' + (digit - ('9' - '0') - 1);
4334 4341 }
4335 4342 }
4336 4343
4337 4344 if (i == 0 && base == 16)
4338 4345 *end-- = '0';
4339 4346
4340 4347 if (base == 16)
4341 4348 *end-- = 'x';
4342 4349
4343 4350 if (i == 0 || base == 8 || base == 16)
4344 4351 *end-- = '0';
4345 4352
4346 4353 if (i < 0 && base == 10)
4347 4354 *end-- = '-';
4348 4355
4349 4356 regs[rd] = (uintptr_t)end + 1;
4350 4357 mstate->dtms_scratch_ptr += size;
4351 4358 break;
4352 4359 }
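/*
 * [Editor's sketch, not part of this change] lltostr() above fills scratch
 * from the end, peeling one digit per division, which is why it returns
 * end + 1 rather than the start of the buffer.  The same backward fill for
 * base 10 (buf/bufsz are hypothetical; a bufsz of 21 covers any 64-bit
 * value plus the terminator):
 */
#include <stdint.h>
#include <stddef.h>

static char *
u64_to_dec(uint64_t val, char *buf, size_t bufsz)
{
	char *end = buf + bufsz - 1;

	*end = '\0';

	if (val == 0)
		*--end = '0';

	for (; val != 0; val /= 10)
		*--end = '0' + (char)(val % 10);

	return (end);	/* first digit, somewhere inside buf */
}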
4353 4360
4354 4361 case DIF_SUBR_HTONS:
4355 4362 case DIF_SUBR_NTOHS:
4356 4363 #ifdef _BIG_ENDIAN
4357 4364 regs[rd] = (uint16_t)tupregs[0].dttk_value;
4358 4365 #else
4359 4366 regs[rd] = DT_BSWAP_16((uint16_t)tupregs[0].dttk_value);
4360 4367 #endif
4361 4368 break;
4362 4369
4363 4370
4364 4371 case DIF_SUBR_HTONL:
4365 4372 case DIF_SUBR_NTOHL:
4366 4373 #ifdef _BIG_ENDIAN
4367 4374 regs[rd] = (uint32_t)tupregs[0].dttk_value;
4368 4375 #else
4369 4376 regs[rd] = DT_BSWAP_32((uint32_t)tupregs[0].dttk_value);
4370 4377 #endif
4371 4378 break;
4372 4379
4373 4380
4374 4381 case DIF_SUBR_HTONLL:
4375 4382 case DIF_SUBR_NTOHLL:
4376 4383 #ifdef _BIG_ENDIAN
4377 4384 regs[rd] = (uint64_t)tupregs[0].dttk_value;
4378 4385 #else
4379 4386 regs[rd] = DT_BSWAP_64((uint64_t)tupregs[0].dttk_value);
4380 4387 #endif
4381 4388 break;
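/*
 * [Editor's sketch, not part of this change] On big-endian hosts the hton
 * and ntoh families above are the identity, which is why each pair shares
 * one case; on little-endian hosts they reduce to a byte swap like the one
 * DT_BSWAP_16() is used for here.  A 16-bit rendering:
 */
#include <stdint.h>

static uint16_t
bswap16(uint16_t x)
{
	/* exchange the high and low bytes */
	return ((uint16_t)((x << 8) | (x >> 8)));
}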
4382 4389
4383 4390
4384 4391 case DIF_SUBR_DIRNAME:
4385 4392 case DIF_SUBR_BASENAME: {
4386 4393 char *dest = (char *)mstate->dtms_scratch_ptr;
4387 4394 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4388 4395 uintptr_t src = tupregs[0].dttk_value;
4389 4396 int i, j, len = dtrace_strlen((char *)src, size);
4390 4397 int lastbase = -1, firstbase = -1, lastdir = -1;
4391 4398 int start, end;
4392 4399
4393 4400 if (!dtrace_canload(src, len + 1, mstate, vstate)) {
4394 4401 regs[rd] = NULL;
4395 4402 break;
4396 4403 }
4397 4404
4398 4405 if (!DTRACE_INSCRATCH(mstate, size)) {
4399 4406 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4400 4407 regs[rd] = NULL;
4401 4408 break;
4402 4409 }
4403 4410
4404 4411 /*
4405 4412		 * The basename and dirname of a zero-length string are
4406 4413		 * defined to be ".".
4407 4414 */
4408 4415 if (len == 0) {
4409 4416 len = 1;
4410 4417 src = (uintptr_t)".";
4411 4418 }
4412 4419
4413 4420 /*
4414 4421 * Start from the back of the string, moving back toward the
4415 4422 * front until we see a character that isn't a slash. That
4416 4423 * character is the last character in the basename.
4417 4424 */
4418 4425 for (i = len - 1; i >= 0; i--) {
4419 4426 if (dtrace_load8(src + i) != '/')
4420 4427 break;
4421 4428 }
4422 4429
4423 4430 if (i >= 0)
4424 4431 lastbase = i;
4425 4432
4426 4433 /*
4427 4434 * Starting from the last character in the basename, move
4428 4435 * towards the front until we find a slash. The character
4429 4436 * that we processed immediately before that is the first
4430 4437 * character in the basename.
4431 4438 */
4432 4439 for (; i >= 0; i--) {
4433 4440 if (dtrace_load8(src + i) == '/')
4434 4441 break;
4435 4442 }
4436 4443
4437 4444 if (i >= 0)
4438 4445 firstbase = i + 1;
4439 4446
4440 4447 /*
4441 4448 * Now keep going until we find a non-slash character. That
4442 4449 * character is the last character in the dirname.
4443 4450 */
4444 4451 for (; i >= 0; i--) {
4445 4452 if (dtrace_load8(src + i) != '/')
4446 4453 break;
4447 4454 }
4448 4455
4449 4456 if (i >= 0)
4450 4457 lastdir = i;
4451 4458
4452 4459 ASSERT(!(lastbase == -1 && firstbase != -1));
4453 4460 ASSERT(!(firstbase == -1 && lastdir != -1));
4454 4461
4455 4462 if (lastbase == -1) {
4456 4463 /*
4457 4464 * We didn't find a non-slash character. We know that
4458 4465 * the length is non-zero, so the whole string must be
4459 4466 * slashes. In either the dirname or the basename
4460 4467 * case, we return '/'.
4461 4468 */
4462 4469 ASSERT(firstbase == -1);
4463 4470 firstbase = lastbase = lastdir = 0;
4464 4471 }
4465 4472
4466 4473 if (firstbase == -1) {
4467 4474 /*
4468 4475 * The entire string consists only of a basename
4469 4476 * component. If we're looking for dirname, we need
4470 4477 * to change our string to be just "."; if we're
4471 4478 * looking for a basename, we'll just set the first
4472 4479 * character of the basename to be 0.
4473 4480 */
4474 4481 if (subr == DIF_SUBR_DIRNAME) {
4475 4482 ASSERT(lastdir == -1);
4476 4483 src = (uintptr_t)".";
4477 4484 lastdir = 0;
4478 4485 } else {
4479 4486 firstbase = 0;
4480 4487 }
4481 4488 }
4482 4489
4483 4490 if (subr == DIF_SUBR_DIRNAME) {
4484 4491 if (lastdir == -1) {
4485 4492 /*
4486 4493 * We know that we have a slash in the name --
4487 4494 * or lastdir would be set to 0, above. And
4488 4495 * because lastdir is -1, we know that this
4489 4496 * slash must be the first character. (That
4490 4497 * is, the full string must be of the form
4491 4498 * "/basename".) In this case, the last
4492 4499 * character of the directory name is 0.
4493 4500 */
4494 4501 lastdir = 0;
4495 4502 }
4496 4503
4497 4504 start = 0;
4498 4505 end = lastdir;
4499 4506 } else {
4500 4507 ASSERT(subr == DIF_SUBR_BASENAME);
4501 4508 ASSERT(firstbase != -1 && lastbase != -1);
4502 4509 start = firstbase;
4503 4510 end = lastbase;
4504 4511 }
4505 4512
4506 4513 for (i = start, j = 0; i <= end && j < size - 1; i++, j++)
4507 4514 dest[j] = dtrace_load8(src + i);
4508 4515
4509 4516 dest[j] = '\0';
4510 4517 regs[rd] = (uintptr_t)dest;
4511 4518 mstate->dtms_scratch_ptr += size;
4512 4519 break;
4513 4520 }
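/*
 * [Editor's sketch, not part of this change] The three backward scans
 * above locate, in order: the last non-slash (the end of the basename),
 * the slash before it (so the first basename character), and the last
 * non-slash before that (the end of the dirname).  The same index
 * arithmetic on an ordinary C string (the all-slash and no-slash special
 * cases are patched up afterward, as above):
 */
#include <string.h>

static void
split_path(const char *path, int *firstbase, int *lastbase, int *lastdir)
{
	int i = (int)strlen(path) - 1;

	*firstbase = *lastbase = *lastdir = -1;

	while (i >= 0 && path[i] == '/')
		i--;			/* skip trailing slashes */
	if (i >= 0)
		*lastbase = i;

	while (i >= 0 && path[i] != '/')
		i--;			/* walk back over the basename */
	if (i >= 0)
		*firstbase = i + 1;

	while (i >= 0 && path[i] == '/')
		i--;			/* skip the slashes before it */
	if (i >= 0)
		*lastdir = i;
}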
4514 4521
4515 4522 case DIF_SUBR_GETF: {
4516 4523 uintptr_t fd = tupregs[0].dttk_value;
4517 4524 uf_info_t *finfo = &curthread->t_procp->p_user.u_finfo;
4518 4525 file_t *fp;
4519 4526
4520 4527 if (!dtrace_priv_proc(state, mstate)) {
4521 4528 regs[rd] = NULL;
4522 4529 break;
4523 4530 }
4524 4531
4525 4532 /*
4526 4533 * This is safe because fi_nfiles only increases, and the
4527 4534 * fi_list array is not freed when the array size doubles.
4528 4535 * (See the comment in flist_grow() for details on the
4529 4536 * management of the u_finfo structure.)
4530 4537 */
4531 4538 fp = fd < finfo->fi_nfiles ? finfo->fi_list[fd].uf_file : NULL;
4532 4539
4533 4540 mstate->dtms_getf = fp;
4534 4541 regs[rd] = (uintptr_t)fp;
4535 4542 break;
4536 4543 }
4537 4544
4538 4545 case DIF_SUBR_CLEANPATH: {
4539 4546 char *dest = (char *)mstate->dtms_scratch_ptr, c;
4540 4547 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4541 4548 uintptr_t src = tupregs[0].dttk_value;
4542 4549 int i = 0, j = 0;
4543 4550 zone_t *z;
4544 4551
4545 4552 if (!dtrace_strcanload(src, size, mstate, vstate)) {
4546 4553 regs[rd] = NULL;
4547 4554 break;
4548 4555 }
4549 4556
4550 4557 if (!DTRACE_INSCRATCH(mstate, size)) {
4551 4558 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4552 4559 regs[rd] = NULL;
4553 4560 break;
4554 4561 }
4555 4562
4556 4563 /*
4557 4564 * Move forward, loading each character.
4558 4565 */
4559 4566 do {
4560 4567 c = dtrace_load8(src + i++);
4561 4568 next:
4562 4569 if (j + 5 >= size) /* 5 = strlen("/..c\0") */
4563 4570 break;
4564 4571
4565 4572 if (c != '/') {
4566 4573 dest[j++] = c;
4567 4574 continue;
4568 4575 }
4569 4576
4570 4577 c = dtrace_load8(src + i++);
4571 4578
4572 4579 if (c == '/') {
4573 4580 /*
4574 4581 * We have two slashes -- we can just advance
4575 4582 * to the next character.
4576 4583 */
4577 4584 goto next;
4578 4585 }
4579 4586
4580 4587 if (c != '.') {
4581 4588 /*
4582 4589 * This is not "." and it's not ".." -- we can
4583 4590 * just store the "/" and this character and
4584 4591 * drive on.
4585 4592 */
4586 4593 dest[j++] = '/';
4587 4594 dest[j++] = c;
4588 4595 continue;
4589 4596 }
4590 4597
4591 4598 c = dtrace_load8(src + i++);
4592 4599
4593 4600 if (c == '/') {
4594 4601 /*
4595 4602 * This is a "/./" component. We're not going
4596 4603 * to store anything in the destination buffer;
4597 4604 * we're just going to go to the next component.
4598 4605 */
4599 4606 goto next;
4600 4607 }
4601 4608
4602 4609 if (c != '.') {
4603 4610 /*
4604 4611 * This is not ".." -- we can just store the
4605 4612 * "/." and this character and continue
4606 4613 * processing.
4607 4614 */
4608 4615 dest[j++] = '/';
4609 4616 dest[j++] = '.';
4610 4617 dest[j++] = c;
4611 4618 continue;
4612 4619 }
4613 4620
4614 4621 c = dtrace_load8(src + i++);
4615 4622
4616 4623 if (c != '/' && c != '\0') {
4617 4624 /*
4618 4625 * This is not ".." -- it's "..[mumble]".
4619 4626 * We'll store the "/.." and this character
4620 4627 * and continue processing.
4621 4628 */
4622 4629 dest[j++] = '/';
4623 4630 dest[j++] = '.';
4624 4631 dest[j++] = '.';
4625 4632 dest[j++] = c;
4626 4633 continue;
4627 4634 }
4628 4635
4629 4636 /*
4630 4637 * This is "/../" or "/..\0". We need to back up
4631 4638 * our destination pointer until we find a "/".
4632 4639 */
4633 4640 i--;
4634 4641 while (j != 0 && dest[--j] != '/')
4635 4642 continue;
4636 4643
4637 4644 if (c == '\0')
4638 4645 dest[++j] = '/';
4639 4646 } while (c != '\0');
4640 4647
4641 4648 dest[j] = '\0';
4642 4649
4643 4650 if (mstate->dtms_getf != NULL &&
4644 4651 !(mstate->dtms_access & DTRACE_ACCESS_KERNEL) &&
4645 4652 (z = state->dts_cred.dcr_cred->cr_zone) != kcred->cr_zone) {
4646 4653 /*
4647 4654 * If we've done a getf() as a part of this ECB and we
4648 4655 * don't have kernel access (and we're not in the global
4649 4656 * zone), check if the path we cleaned up begins with
4650 4657 * the zone's root path, and trim it off if so. Note
4651 4658 * that this is an output cleanliness issue, not a
4652 4659 * security issue: knowing one's zone root path does
4653 4660 * not enable privilege escalation.
4654 4661 */
4655 4662 if (strstr(dest, z->zone_rootpath) == dest)
4656 4663 dest += strlen(z->zone_rootpath) - 1;
4657 4664 }
4658 4665
4659 4666 regs[rd] = (uintptr_t)dest;
4660 4667 mstate->dtms_scratch_ptr += size;
4661 4668 break;
4662 4669 }
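/*
 * [Editor's sketch, not part of this change] The only state the scan above
 * keeps is the output index j, so handling "/../" amounts to rewinding j
 * to the previous '/' in what has already been emitted.  That backup step
 * in isolation (dest and j as in the loop above):
 */
static int
backup_component(char *dest, int j)
{
	/* step back until j rests on the previous '/' (or on index 0) */
	while (j != 0 && dest[--j] != '/')
		continue;

	return (j);
}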
4663 4670
4664 4671 case DIF_SUBR_INET_NTOA:
4665 4672 case DIF_SUBR_INET_NTOA6:
4666 4673 case DIF_SUBR_INET_NTOP: {
4667 4674 size_t size;
4668 4675 int af, argi, i;
4669 4676 char *base, *end;
4670 4677
4671 4678 if (subr == DIF_SUBR_INET_NTOP) {
4672 4679 af = (int)tupregs[0].dttk_value;
4673 4680 argi = 1;
4674 4681 } else {
4675 4682 af = subr == DIF_SUBR_INET_NTOA ? AF_INET: AF_INET6;
4676 4683 argi = 0;
4677 4684 }
4678 4685
4679 4686 if (af == AF_INET) {
4680 4687 ipaddr_t ip4;
4681 4688 uint8_t *ptr8, val;
4682 4689
4683 4690 /*
4684 4691 * Safely load the IPv4 address.
4685 4692 */
4686 4693 ip4 = dtrace_load32(tupregs[argi].dttk_value);
4687 4694
4688 4695 /*
4689 4696			 * Check that an IPv4 string will fit in scratch.
4690 4697 */
4691 4698 size = INET_ADDRSTRLEN;
4692 4699 if (!DTRACE_INSCRATCH(mstate, size)) {
4693 4700 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4694 4701 regs[rd] = NULL;
4695 4702 break;
4696 4703 }
4697 4704 base = (char *)mstate->dtms_scratch_ptr;
4698 4705 end = (char *)mstate->dtms_scratch_ptr + size - 1;
4699 4706
4700 4707 /*
4701 4708 * Stringify as a dotted decimal quad.
4702 4709 */
4703 4710 *end-- = '\0';
4704 4711 ptr8 = (uint8_t *)&ip4;
4705 4712 for (i = 3; i >= 0; i--) {
4706 4713 val = ptr8[i];
4707 4714
4708 4715 if (val == 0) {
4709 4716 *end-- = '0';
4710 4717 } else {
4711 4718 for (; val; val /= 10) {
4712 4719 *end-- = '0' + (val % 10);
4713 4720 }
4714 4721 }
4715 4722
4716 4723 if (i > 0)
4717 4724 *end-- = '.';
4718 4725 }
4719 4726 ASSERT(end + 1 >= base);
4720 4727
4721 4728 } else if (af == AF_INET6) {
4722 4729 struct in6_addr ip6;
4723 4730 int firstzero, tryzero, numzero, v6end;
4724 4731 uint16_t val;
4725 4732 const char digits[] = "0123456789abcdef";
4726 4733
4727 4734 /*
4728 4735			 * Stringify using RFC 1884 convention 2: 16-bit
4729 4736			 * hexadecimal values with zero-run compression.
4730 4737			 * Lowercase hexadecimal digits are used,
4731 4738			 * e.g. fe80::214:4fff:fe0b:76c8.
4732 4739 * The IPv4 embedded form is returned for inet_ntop,
4733 4740 * just the IPv4 string is returned for inet_ntoa6.
4734 4741 */
4735 4742
4736 4743 /*
4737 4744 * Safely load the IPv6 address.
4738 4745 */
4739 4746 dtrace_bcopy(
4740 4747 (void *)(uintptr_t)tupregs[argi].dttk_value,
4741 4748 (void *)(uintptr_t)&ip6, sizeof (struct in6_addr));
4742 4749
4743 4750 /*
4744 4751			 * Check that an IPv6 string will fit in scratch.
4745 4752 */
4746 4753 size = INET6_ADDRSTRLEN;
4747 4754 if (!DTRACE_INSCRATCH(mstate, size)) {
4748 4755 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4749 4756 regs[rd] = NULL;
4750 4757 break;
4751 4758 }
4752 4759 base = (char *)mstate->dtms_scratch_ptr;
4753 4760 end = (char *)mstate->dtms_scratch_ptr + size - 1;
4754 4761 *end-- = '\0';
4755 4762
4756 4763 /*
4757 4764 * Find the longest run of 16 bit zero values
4758 4765 * for the single allowed zero compression - "::".
4759 4766 */
4760 4767 firstzero = -1;
4761 4768 tryzero = -1;
4762 4769 numzero = 1;
4763 4770 for (i = 0; i < sizeof (struct in6_addr); i++) {
4764 4771 if (ip6._S6_un._S6_u8[i] == 0 &&
4765 4772 tryzero == -1 && i % 2 == 0) {
4766 4773 tryzero = i;
4767 4774 continue;
4768 4775 }
4769 4776
4770 4777 if (tryzero != -1 &&
4771 4778 (ip6._S6_un._S6_u8[i] != 0 ||
4772 4779 i == sizeof (struct in6_addr) - 1)) {
4773 4780
4774 4781 if (i - tryzero <= numzero) {
4775 4782 tryzero = -1;
4776 4783 continue;
4777 4784 }
4778 4785
4779 4786 firstzero = tryzero;
4780 4787 numzero = i - i % 2 - tryzero;
4781 4788 tryzero = -1;
4782 4789
4783 4790 if (ip6._S6_un._S6_u8[i] == 0 &&
4784 4791 i == sizeof (struct in6_addr) - 1)
4785 4792 numzero += 2;
4786 4793 }
4787 4794 }
4788 4795 ASSERT(firstzero + numzero <= sizeof (struct in6_addr));
4789 4796
4790 4797 /*
4791 4798 * Check for an IPv4 embedded address.
4792 4799 */
4793 4800 v6end = sizeof (struct in6_addr) - 2;
4794 4801 if (IN6_IS_ADDR_V4MAPPED(&ip6) ||
4795 4802 IN6_IS_ADDR_V4COMPAT(&ip6)) {
4796 4803 for (i = sizeof (struct in6_addr) - 1;
4797 4804 i >= DTRACE_V4MAPPED_OFFSET; i--) {
4798 4805 ASSERT(end >= base);
4799 4806
4800 4807 val = ip6._S6_un._S6_u8[i];
4801 4808
4802 4809 if (val == 0) {
4803 4810 *end-- = '0';
4804 4811 } else {
4805 4812 for (; val; val /= 10) {
4806 4813 *end-- = '0' + val % 10;
4807 4814 }
4808 4815 }
4809 4816
4810 4817 if (i > DTRACE_V4MAPPED_OFFSET)
4811 4818 *end-- = '.';
4812 4819 }
4813 4820
4814 4821 if (subr == DIF_SUBR_INET_NTOA6)
4815 4822 goto inetout;
4816 4823
4817 4824 /*
4818 4825 * Set v6end to skip the IPv4 address that
4819 4826 * we have already stringified.
4820 4827 */
4821 4828 v6end = 10;
4822 4829 }
4823 4830
4824 4831 /*
4825 4832 * Build the IPv6 string by working through the
4826 4833 * address in reverse.
4827 4834 */
4828 4835 for (i = v6end; i >= 0; i -= 2) {
4829 4836 ASSERT(end >= base);
4830 4837
4831 4838 if (i == firstzero + numzero - 2) {
4832 4839 *end-- = ':';
4833 4840 *end-- = ':';
4834 4841 i -= numzero - 2;
4835 4842 continue;
4836 4843 }
4837 4844
4838 4845 if (i < 14 && i != firstzero - 2)
4839 4846 *end-- = ':';
4840 4847
4841 4848 val = (ip6._S6_un._S6_u8[i] << 8) +
4842 4849 ip6._S6_un._S6_u8[i + 1];
4843 4850
4844 4851 if (val == 0) {
4845 4852 *end-- = '0';
4846 4853 } else {
4847 4854 for (; val; val /= 16) {
4848 4855 *end-- = digits[val % 16];
4849 4856 }
4850 4857 }
4851 4858 }
4852 4859 ASSERT(end + 1 >= base);
4853 4860
4854 4861 } else {
4855 4862 /*
4856 4863			 * The user didn't use AF_INET or AF_INET6.
4857 4864 */
4858 4865 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
4859 4866 regs[rd] = NULL;
4860 4867 break;
4861 4868 }
4862 4869
4863 4870 inetout: regs[rd] = (uintptr_t)end + 1;
4864 4871 mstate->dtms_scratch_ptr += size;
4865 4872 break;
4866 4873 }
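/*
 * [Editor's sketch, not part of this change] The "::" placement above
 * comes from one linear pass that remembers the longest even-aligned run
 * of zero bytes (firstzero/numzero).  The same pass over a bare 16-byte
 * array standing in for struct in6_addr:
 */
#include <stdint.h>

static void
longest_zero_run(const uint8_t a[16], int *firstzero, int *numzero)
{
	int i, tryzero = -1;

	*firstzero = -1;
	*numzero = 1;	/* a run must beat a lone zero group */

	for (i = 0; i < 16; i++) {
		if (a[i] == 0 && tryzero == -1 && i % 2 == 0) {
			tryzero = i;	/* candidate run starts here */
			continue;
		}

		if (tryzero != -1 && (a[i] != 0 || i == 15)) {
			if (i - tryzero <= *numzero) {
				tryzero = -1;	/* shorter than the best */
				continue;
			}

			*firstzero = tryzero;
			*numzero = i - i % 2 - tryzero;
			tryzero = -1;

			if (a[i] == 0 && i == 15)
				*numzero += 2;	/* run reaches the end */
		}
	}
}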
4867 4874
4868 4875 }
4869 4876 }
4870 4877
4871 4878 /*
4872 4879 * Emulate the execution of DTrace IR instructions specified by the given
4873 4880 * DIF object. This function is deliberately void of assertions as all of
4874 4881 * the necessary checks are handled by a call to dtrace_difo_validate().
4875 4882 */
4876 4883 static uint64_t
4877 4884 dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
4878 4885 dtrace_vstate_t *vstate, dtrace_state_t *state)
4879 4886 {
4880 4887 const dif_instr_t *text = difo->dtdo_buf;
4881 4888 const uint_t textlen = difo->dtdo_len;
4882 4889 const char *strtab = difo->dtdo_strtab;
4883 4890 const uint64_t *inttab = difo->dtdo_inttab;
4884 4891
4885 4892 uint64_t rval = 0;
4886 4893 dtrace_statvar_t *svar;
4887 4894 dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
4888 4895 dtrace_difv_t *v;
4889 4896 volatile uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
4890 4897 volatile uintptr_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
4891 4898
4892 4899 dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
4893 4900 uint64_t regs[DIF_DIR_NREGS];
4894 4901 uint64_t *tmp;
4895 4902
4896 4903 uint8_t cc_n = 0, cc_z = 0, cc_v = 0, cc_c = 0;
4897 4904 int64_t cc_r;
4898 4905 uint_t pc = 0, id, opc;
4899 4906 uint8_t ttop = 0;
4900 4907 dif_instr_t instr;
4901 4908 uint_t r1, r2, rd;
4902 4909
4903 4910 /*
4904 4911 * We stash the current DIF object into the machine state: we need it
4905 4912 * for subsequent access checking.
4906 4913 */
4907 4914 mstate->dtms_difo = difo;
4908 4915
4909 4916 regs[DIF_REG_R0] = 0; /* %r0 is fixed at zero */
4910 4917
4911 4918 while (pc < textlen && !(*flags & CPU_DTRACE_FAULT)) {
4912 4919 opc = pc;
4913 4920
4914 4921 instr = text[pc++];
4915 4922 r1 = DIF_INSTR_R1(instr);
4916 4923 r2 = DIF_INSTR_R2(instr);
4917 4924 rd = DIF_INSTR_RD(instr);
4918 4925
4919 4926 switch (DIF_INSTR_OP(instr)) {
4920 4927 case DIF_OP_OR:
4921 4928 regs[rd] = regs[r1] | regs[r2];
4922 4929 break;
4923 4930 case DIF_OP_XOR:
4924 4931 regs[rd] = regs[r1] ^ regs[r2];
4925 4932 break;
4926 4933 case DIF_OP_AND:
4927 4934 regs[rd] = regs[r1] & regs[r2];
4928 4935 break;
4929 4936 case DIF_OP_SLL:
4930 4937 regs[rd] = regs[r1] << regs[r2];
4931 4938 break;
4932 4939 case DIF_OP_SRL:
4933 4940 regs[rd] = regs[r1] >> regs[r2];
4934 4941 break;
4935 4942 case DIF_OP_SUB:
4936 4943 regs[rd] = regs[r1] - regs[r2];
4937 4944 break;
4938 4945 case DIF_OP_ADD:
4939 4946 regs[rd] = regs[r1] + regs[r2];
4940 4947 break;
4941 4948 case DIF_OP_MUL:
4942 4949 regs[rd] = regs[r1] * regs[r2];
4943 4950 break;
4944 4951 case DIF_OP_SDIV:
4945 4952 if (regs[r2] == 0) {
4946 4953 regs[rd] = 0;
4947 4954 *flags |= CPU_DTRACE_DIVZERO;
4948 4955 } else {
4949 4956 regs[rd] = (int64_t)regs[r1] /
4950 4957 (int64_t)regs[r2];
4951 4958 }
4952 4959 break;
4953 4960
4954 4961 case DIF_OP_UDIV:
4955 4962 if (regs[r2] == 0) {
4956 4963 regs[rd] = 0;
4957 4964 *flags |= CPU_DTRACE_DIVZERO;
4958 4965 } else {
4959 4966 regs[rd] = regs[r1] / regs[r2];
4960 4967 }
4961 4968 break;
4962 4969
4963 4970 case DIF_OP_SREM:
4964 4971 if (regs[r2] == 0) {
4965 4972 regs[rd] = 0;
4966 4973 *flags |= CPU_DTRACE_DIVZERO;
4967 4974 } else {
4968 4975 regs[rd] = (int64_t)regs[r1] %
4969 4976 (int64_t)regs[r2];
4970 4977 }
4971 4978 break;
4972 4979
4973 4980 case DIF_OP_UREM:
4974 4981 if (regs[r2] == 0) {
4975 4982 regs[rd] = 0;
4976 4983 *flags |= CPU_DTRACE_DIVZERO;
4977 4984 } else {
4978 4985 regs[rd] = regs[r1] % regs[r2];
4979 4986 }
4980 4987 break;
4981 4988
4982 4989 case DIF_OP_NOT:
4983 4990 regs[rd] = ~regs[r1];
4984 4991 break;
4985 4992 case DIF_OP_MOV:
4986 4993 regs[rd] = regs[r1];
4987 4994 break;
4988 4995 case DIF_OP_CMP:
4989 4996 cc_r = regs[r1] - regs[r2];
4990 4997 cc_n = cc_r < 0;
4991 4998 cc_z = cc_r == 0;
4992 4999 cc_v = 0;
4993 5000 cc_c = regs[r1] < regs[r2];
4994 5001 break;
4995 5002 case DIF_OP_TST:
4996 5003 cc_n = cc_v = cc_c = 0;
4997 5004 cc_z = regs[r1] == 0;
4998 5005 break;
4999 5006 case DIF_OP_BA:
5000 5007 pc = DIF_INSTR_LABEL(instr);
5001 5008 break;
5002 5009 case DIF_OP_BE:
5003 5010 if (cc_z)
5004 5011 pc = DIF_INSTR_LABEL(instr);
5005 5012 break;
5006 5013 case DIF_OP_BNE:
5007 5014 if (cc_z == 0)
5008 5015 pc = DIF_INSTR_LABEL(instr);
5009 5016 break;
5010 5017 case DIF_OP_BG:
5011 5018 if ((cc_z | (cc_n ^ cc_v)) == 0)
5012 5019 pc = DIF_INSTR_LABEL(instr);
5013 5020 break;
5014 5021 case DIF_OP_BGU:
5015 5022 if ((cc_c | cc_z) == 0)
5016 5023 pc = DIF_INSTR_LABEL(instr);
5017 5024 break;
5018 5025 case DIF_OP_BGE:
5019 5026 if ((cc_n ^ cc_v) == 0)
5020 5027 pc = DIF_INSTR_LABEL(instr);
5021 5028 break;
5022 5029 case DIF_OP_BGEU:
5023 5030 if (cc_c == 0)
5024 5031 pc = DIF_INSTR_LABEL(instr);
5025 5032 break;
5026 5033 case DIF_OP_BL:
5027 5034 if (cc_n ^ cc_v)
5028 5035 pc = DIF_INSTR_LABEL(instr);
5029 5036 break;
5030 5037 case DIF_OP_BLU:
5031 5038 if (cc_c)
5032 5039 pc = DIF_INSTR_LABEL(instr);
5033 5040 break;
5034 5041 case DIF_OP_BLE:
5035 5042 if (cc_z | (cc_n ^ cc_v))
5036 5043 pc = DIF_INSTR_LABEL(instr);
5037 5044 break;
5038 5045 case DIF_OP_BLEU:
5039 5046 if (cc_c | cc_z)
5040 5047 pc = DIF_INSTR_LABEL(instr);
5041 5048 break;
5042 5049 case DIF_OP_RLDSB:
5043 5050 if (!dtrace_canload(regs[r1], 1, mstate, vstate))
5044 5051 break;
5045 5052 /*FALLTHROUGH*/
5046 5053 case DIF_OP_LDSB:
5047 5054 regs[rd] = (int8_t)dtrace_load8(regs[r1]);
5048 5055 break;
5049 5056 case DIF_OP_RLDSH:
5050 5057 if (!dtrace_canload(regs[r1], 2, mstate, vstate))
5051 5058 break;
5052 5059 /*FALLTHROUGH*/
5053 5060 case DIF_OP_LDSH:
5054 5061 regs[rd] = (int16_t)dtrace_load16(regs[r1]);
5055 5062 break;
5056 5063 case DIF_OP_RLDSW:
5057 5064 if (!dtrace_canload(regs[r1], 4, mstate, vstate))
5058 5065 break;
5059 5066 /*FALLTHROUGH*/
5060 5067 case DIF_OP_LDSW:
5061 5068 regs[rd] = (int32_t)dtrace_load32(regs[r1]);
5062 5069 break;
5063 5070 case DIF_OP_RLDUB:
5064 5071 if (!dtrace_canload(regs[r1], 1, mstate, vstate))
5065 5072 break;
5066 5073 /*FALLTHROUGH*/
5067 5074 case DIF_OP_LDUB:
5068 5075 regs[rd] = dtrace_load8(regs[r1]);
5069 5076 break;
5070 5077 case DIF_OP_RLDUH:
5071 5078 if (!dtrace_canload(regs[r1], 2, mstate, vstate))
5072 5079 break;
5073 5080 /*FALLTHROUGH*/
5074 5081 case DIF_OP_LDUH:
5075 5082 regs[rd] = dtrace_load16(regs[r1]);
5076 5083 break;
5077 5084 case DIF_OP_RLDUW:
5078 5085 if (!dtrace_canload(regs[r1], 4, mstate, vstate))
5079 5086 break;
5080 5087 /*FALLTHROUGH*/
5081 5088 case DIF_OP_LDUW:
5082 5089 regs[rd] = dtrace_load32(regs[r1]);
5083 5090 break;
5084 5091 case DIF_OP_RLDX:
5085 5092 if (!dtrace_canload(regs[r1], 8, mstate, vstate))
5086 5093 break;
5087 5094 /*FALLTHROUGH*/
5088 5095 case DIF_OP_LDX:
5089 5096 regs[rd] = dtrace_load64(regs[r1]);
5090 5097 break;
5091 5098 case DIF_OP_ULDSB:
5092 5099 regs[rd] = (int8_t)
5093 5100 dtrace_fuword8((void *)(uintptr_t)regs[r1]);
5094 5101 break;
5095 5102 case DIF_OP_ULDSH:
5096 5103 regs[rd] = (int16_t)
5097 5104 dtrace_fuword16((void *)(uintptr_t)regs[r1]);
5098 5105 break;
5099 5106 case DIF_OP_ULDSW:
5100 5107 regs[rd] = (int32_t)
5101 5108 dtrace_fuword32((void *)(uintptr_t)regs[r1]);
5102 5109 break;
5103 5110 case DIF_OP_ULDUB:
5104 5111 regs[rd] =
5105 5112 dtrace_fuword8((void *)(uintptr_t)regs[r1]);
5106 5113 break;
5107 5114 case DIF_OP_ULDUH:
5108 5115 regs[rd] =
5109 5116 dtrace_fuword16((void *)(uintptr_t)regs[r1]);
5110 5117 break;
5111 5118 case DIF_OP_ULDUW:
5112 5119 regs[rd] =
5113 5120 dtrace_fuword32((void *)(uintptr_t)regs[r1]);
5114 5121 break;
5115 5122 case DIF_OP_ULDX:
5116 5123 regs[rd] =
5117 5124 dtrace_fuword64((void *)(uintptr_t)regs[r1]);
5118 5125 break;
5119 5126 case DIF_OP_RET:
5120 5127 rval = regs[rd];
5121 5128 pc = textlen;
5122 5129 break;
5123 5130 case DIF_OP_NOP:
5124 5131 break;
5125 5132 case DIF_OP_SETX:
5126 5133 regs[rd] = inttab[DIF_INSTR_INTEGER(instr)];
5127 5134 break;
5128 5135 case DIF_OP_SETS:
5129 5136 regs[rd] = (uint64_t)(uintptr_t)
5130 5137 (strtab + DIF_INSTR_STRING(instr));
5131 5138 break;
5132 5139 case DIF_OP_SCMP: {
5133 5140 size_t sz = state->dts_options[DTRACEOPT_STRSIZE];
5134 5141 uintptr_t s1 = regs[r1];
5135 5142 uintptr_t s2 = regs[r2];
5136 5143
5137 5144 if (s1 != NULL &&
5138 5145 !dtrace_strcanload(s1, sz, mstate, vstate))
5139 5146 break;
5140 5147 if (s2 != NULL &&
5141 5148 !dtrace_strcanload(s2, sz, mstate, vstate))
5142 5149 break;
5143 5150
5144 5151 cc_r = dtrace_strncmp((char *)s1, (char *)s2, sz);
5145 5152
5146 5153 cc_n = cc_r < 0;
5147 5154 cc_z = cc_r == 0;
5148 5155 cc_v = cc_c = 0;
5149 5156 break;
5150 5157 }
5151 5158 case DIF_OP_LDGA:
5152 5159 regs[rd] = dtrace_dif_variable(mstate, state,
5153 5160 r1, regs[r2]);
5154 5161 break;
5155 5162 case DIF_OP_LDGS:
5156 5163 id = DIF_INSTR_VAR(instr);
5157 5164
5158 5165 if (id >= DIF_VAR_OTHER_UBASE) {
5159 5166 uintptr_t a;
5160 5167
5161 5168 id -= DIF_VAR_OTHER_UBASE;
5162 5169 svar = vstate->dtvs_globals[id];
5163 5170 ASSERT(svar != NULL);
5164 5171 v = &svar->dtsv_var;
5165 5172
5166 5173 if (!(v->dtdv_type.dtdt_flags & DIF_TF_BYREF)) {
5167 5174 regs[rd] = svar->dtsv_data;
5168 5175 break;
5169 5176 }
5170 5177
5171 5178 a = (uintptr_t)svar->dtsv_data;
5172 5179
5173 5180 if (*(uint8_t *)a == UINT8_MAX) {
5174 5181 /*
5175 5182 * If the 0th byte is set to UINT8_MAX
5176 5183 * then this is to be treated as a
5177 5184 * reference to a NULL variable.
5178 5185 */
5179 5186 regs[rd] = NULL;
5180 5187 } else {
5181 5188 regs[rd] = a + sizeof (uint64_t);
5182 5189 }
5183 5190
5184 5191 break;
5185 5192 }
5186 5193
5187 5194 regs[rd] = dtrace_dif_variable(mstate, state, id, 0);
5188 5195 break;
5189 5196
5190 5197 case DIF_OP_STGS:
5191 5198 id = DIF_INSTR_VAR(instr);
5192 5199
5193 5200 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5194 5201 id -= DIF_VAR_OTHER_UBASE;
5195 5202
5196 5203 svar = vstate->dtvs_globals[id];
5197 5204 ASSERT(svar != NULL);
5198 5205 v = &svar->dtsv_var;
5199 5206
5200 5207 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5201 5208 uintptr_t a = (uintptr_t)svar->dtsv_data;
5202 5209
5203 5210 ASSERT(a != NULL);
5204 5211 ASSERT(svar->dtsv_size != 0);
5205 5212
5206 5213 if (regs[rd] == NULL) {
5207 5214 *(uint8_t *)a = UINT8_MAX;
5208 5215 break;
5209 5216 } else {
5210 5217 *(uint8_t *)a = 0;
5211 5218 a += sizeof (uint64_t);
5212 5219 }
5213 5220 if (!dtrace_vcanload(
5214 5221 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5215 5222 mstate, vstate))
5216 5223 break;
5217 5224
5218 5225 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5219 5226 (void *)a, &v->dtdv_type);
5220 5227 break;
5221 5228 }
5222 5229
5223 5230 svar->dtsv_data = regs[rd];
5224 5231 break;
5225 5232
5226 5233 case DIF_OP_LDTA:
5227 5234 /*
5228 5235 * There are no DTrace built-in thread-local arrays at
5229 5236 * present. This opcode is saved for future work.
5230 5237 */
5231 5238 *flags |= CPU_DTRACE_ILLOP;
5232 5239 regs[rd] = 0;
5233 5240 break;
5234 5241
5235 5242 case DIF_OP_LDLS:
5236 5243 id = DIF_INSTR_VAR(instr);
5237 5244
5238 5245 if (id < DIF_VAR_OTHER_UBASE) {
5239 5246 /*
5240 5247 * For now, this has no meaning.
5241 5248 */
5242 5249 regs[rd] = 0;
5243 5250 break;
5244 5251 }
5245 5252
5246 5253 id -= DIF_VAR_OTHER_UBASE;
5247 5254
5248 5255 ASSERT(id < vstate->dtvs_nlocals);
5249 5256 ASSERT(vstate->dtvs_locals != NULL);
5250 5257
5251 5258 svar = vstate->dtvs_locals[id];
5252 5259 ASSERT(svar != NULL);
5253 5260 v = &svar->dtsv_var;
5254 5261
5255 5262 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5256 5263 uintptr_t a = (uintptr_t)svar->dtsv_data;
5257 5264 size_t sz = v->dtdv_type.dtdt_size;
5258 5265
5259 5266 sz += sizeof (uint64_t);
5260 5267 ASSERT(svar->dtsv_size == NCPU * sz);
5261 5268 a += CPU->cpu_id * sz;
5262 5269
5263 5270 if (*(uint8_t *)a == UINT8_MAX) {
5264 5271 /*
5265 5272 * If the 0th byte is set to UINT8_MAX
5266 5273 * then this is to be treated as a
5267 5274 * reference to a NULL variable.
5268 5275 */
5269 5276 regs[rd] = NULL;
5270 5277 } else {
5271 5278 regs[rd] = a + sizeof (uint64_t);
5272 5279 }
5273 5280
5274 5281 break;
5275 5282 }
5276 5283
5277 5284 ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
5278 5285 tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
5279 5286 regs[rd] = tmp[CPU->cpu_id];
5280 5287 break;
5281 5288
5282 5289 case DIF_OP_STLS:
5283 5290 id = DIF_INSTR_VAR(instr);
5284 5291
5285 5292 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5286 5293 id -= DIF_VAR_OTHER_UBASE;
5287 5294 ASSERT(id < vstate->dtvs_nlocals);
5288 5295
5289 5296 ASSERT(vstate->dtvs_locals != NULL);
5290 5297 svar = vstate->dtvs_locals[id];
5291 5298 ASSERT(svar != NULL);
5292 5299 v = &svar->dtsv_var;
5293 5300
5294 5301 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5295 5302 uintptr_t a = (uintptr_t)svar->dtsv_data;
5296 5303 size_t sz = v->dtdv_type.dtdt_size;
5297 5304
5298 5305 sz += sizeof (uint64_t);
5299 5306 ASSERT(svar->dtsv_size == NCPU * sz);
5300 5307 a += CPU->cpu_id * sz;
5301 5308
5302 5309 if (regs[rd] == NULL) {
5303 5310 *(uint8_t *)a = UINT8_MAX;
5304 5311 break;
5305 5312 } else {
5306 5313 *(uint8_t *)a = 0;
5307 5314 a += sizeof (uint64_t);
5308 5315 }
5309 5316
5310 5317 if (!dtrace_vcanload(
5311 5318 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5312 5319 mstate, vstate))
5313 5320 break;
5314 5321
5315 5322 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5316 5323 (void *)a, &v->dtdv_type);
5317 5324 break;
5318 5325 }
5319 5326
5320 5327 ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
5321 5328 tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
5322 5329 tmp[CPU->cpu_id] = regs[rd];
5323 5330 break;
5324 5331
5325 5332 case DIF_OP_LDTS: {
5326 5333 dtrace_dynvar_t *dvar;
5327 5334 dtrace_key_t *key;
5328 5335
5329 5336 id = DIF_INSTR_VAR(instr);
5330 5337 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5331 5338 id -= DIF_VAR_OTHER_UBASE;
5332 5339 v = &vstate->dtvs_tlocals[id];
5333 5340
5334 5341 key = &tupregs[DIF_DTR_NREGS];
5335 5342 key[0].dttk_value = (uint64_t)id;
5336 5343 key[0].dttk_size = 0;
5337 5344 DTRACE_TLS_THRKEY(key[1].dttk_value);
5338 5345 key[1].dttk_size = 0;
5339 5346
5340 5347 dvar = dtrace_dynvar(dstate, 2, key,
5341 5348 sizeof (uint64_t), DTRACE_DYNVAR_NOALLOC,
5342 5349 mstate, vstate);
5343 5350
5344 5351 if (dvar == NULL) {
5345 5352 regs[rd] = 0;
5346 5353 break;
5347 5354 }
5348 5355
5349 5356 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5350 5357 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
5351 5358 } else {
5352 5359 regs[rd] = *((uint64_t *)dvar->dtdv_data);
5353 5360 }
5354 5361
5355 5362 break;
5356 5363 }
5357 5364
5358 5365 case DIF_OP_STTS: {
5359 5366 dtrace_dynvar_t *dvar;
5360 5367 dtrace_key_t *key;
5361 5368
5362 5369 id = DIF_INSTR_VAR(instr);
5363 5370 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5364 5371 id -= DIF_VAR_OTHER_UBASE;
5365 5372
5366 5373 key = &tupregs[DIF_DTR_NREGS];
5367 5374 key[0].dttk_value = (uint64_t)id;
5368 5375 key[0].dttk_size = 0;
5369 5376 DTRACE_TLS_THRKEY(key[1].dttk_value);
5370 5377 key[1].dttk_size = 0;
5371 5378 v = &vstate->dtvs_tlocals[id];
5372 5379
5373 5380 dvar = dtrace_dynvar(dstate, 2, key,
5374 5381 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5375 5382 v->dtdv_type.dtdt_size : sizeof (uint64_t),
5376 5383 regs[rd] ? DTRACE_DYNVAR_ALLOC :
5377 5384 DTRACE_DYNVAR_DEALLOC, mstate, vstate);
5378 5385
5379 5386 /*
5380 5387 * Given that we're storing to thread-local data,
5381 5388 * we need to flush our predicate cache.
5382 5389 */
5383 5390 curthread->t_predcache = NULL;
5384 5391
5385 5392 if (dvar == NULL)
5386 5393 break;
5387 5394
5388 5395 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5389 5396 if (!dtrace_vcanload(
5390 5397 (void *)(uintptr_t)regs[rd],
5391 5398 &v->dtdv_type, mstate, vstate))
5392 5399 break;
5393 5400
5394 5401 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5395 5402 dvar->dtdv_data, &v->dtdv_type);
5396 5403 } else {
5397 5404 *((uint64_t *)dvar->dtdv_data) = regs[rd];
5398 5405 }
5399 5406
5400 5407 break;
5401 5408 }
5402 5409
5403 5410 case DIF_OP_SRA:
5404 5411 regs[rd] = (int64_t)regs[r1] >> regs[r2];
5405 5412 break;
5406 5413
5407 5414 case DIF_OP_CALL:
5408 5415 dtrace_dif_subr(DIF_INSTR_SUBR(instr), rd,
5409 5416 regs, tupregs, ttop, mstate, state);
5410 5417 break;
5411 5418
5412 5419 case DIF_OP_PUSHTR:
5413 5420 if (ttop == DIF_DTR_NREGS) {
5414 5421 *flags |= CPU_DTRACE_TUPOFLOW;
5415 5422 break;
5416 5423 }
5417 5424
5418 5425 if (r1 == DIF_TYPE_STRING) {
5419 5426 /*
5420 5427 * If this is a string type and the size is 0,
5421 5428 * we'll use the system-wide default string
5422 5429 * size. Note that we are _not_ looking at
5423 5430 * the value of the DTRACEOPT_STRSIZE option;
5424 5431 * had this been set, we would expect to have
5425 5432 * a non-zero size value in the "pushtr".
5426 5433 */
5427 5434 tupregs[ttop].dttk_size =
5428 5435 dtrace_strlen((char *)(uintptr_t)regs[rd],
5429 5436 regs[r2] ? regs[r2] :
5430 5437 dtrace_strsize_default) + 1;
5431 5438 } else {
5432 5439 tupregs[ttop].dttk_size = regs[r2];
5433 5440 }
5434 5441
5435 5442 tupregs[ttop++].dttk_value = regs[rd];
5436 5443 break;
5437 5444
5438 5445 case DIF_OP_PUSHTV:
5439 5446 if (ttop == DIF_DTR_NREGS) {
5440 5447 *flags |= CPU_DTRACE_TUPOFLOW;
5441 5448 break;
5442 5449 }
5443 5450
5444 5451 tupregs[ttop].dttk_value = regs[rd];
5445 5452 tupregs[ttop++].dttk_size = 0;
5446 5453 break;
5447 5454
5448 5455 case DIF_OP_POPTS:
5449 5456 if (ttop != 0)
5450 5457 ttop--;
5451 5458 break;
5452 5459
5453 5460 case DIF_OP_FLUSHTS:
5454 5461 ttop = 0;
5455 5462 break;
5456 5463
5457 5464 case DIF_OP_LDGAA:
5458 5465 case DIF_OP_LDTAA: {
5459 5466 dtrace_dynvar_t *dvar;
5460 5467 dtrace_key_t *key = tupregs;
5461 5468 uint_t nkeys = ttop;
5462 5469
5463 5470 id = DIF_INSTR_VAR(instr);
5464 5471 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5465 5472 id -= DIF_VAR_OTHER_UBASE;
5466 5473
5467 5474 key[nkeys].dttk_value = (uint64_t)id;
5468 5475 key[nkeys++].dttk_size = 0;
5469 5476
5470 5477 if (DIF_INSTR_OP(instr) == DIF_OP_LDTAA) {
5471 5478 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
5472 5479 key[nkeys++].dttk_size = 0;
5473 5480 v = &vstate->dtvs_tlocals[id];
5474 5481 } else {
5475 5482 v = &vstate->dtvs_globals[id]->dtsv_var;
5476 5483 }
5477 5484
5478 5485 dvar = dtrace_dynvar(dstate, nkeys, key,
5479 5486 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5480 5487 v->dtdv_type.dtdt_size : sizeof (uint64_t),
5481 5488 DTRACE_DYNVAR_NOALLOC, mstate, vstate);
5482 5489
5483 5490 if (dvar == NULL) {
5484 5491 regs[rd] = 0;
5485 5492 break;
5486 5493 }
5487 5494
5488 5495 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5489 5496 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
5490 5497 } else {
5491 5498 regs[rd] = *((uint64_t *)dvar->dtdv_data);
5492 5499 }
5493 5500
5494 5501 break;
5495 5502 }
5496 5503
5497 5504 case DIF_OP_STGAA:
5498 5505 case DIF_OP_STTAA: {
5499 5506 dtrace_dynvar_t *dvar;
5500 5507 dtrace_key_t *key = tupregs;
5501 5508 uint_t nkeys = ttop;
5502 5509
5503 5510 id = DIF_INSTR_VAR(instr);
5504 5511 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5505 5512 id -= DIF_VAR_OTHER_UBASE;
5506 5513
5507 5514 key[nkeys].dttk_value = (uint64_t)id;
5508 5515 key[nkeys++].dttk_size = 0;
5509 5516
5510 5517 if (DIF_INSTR_OP(instr) == DIF_OP_STTAA) {
5511 5518 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
5512 5519 key[nkeys++].dttk_size = 0;
5513 5520 v = &vstate->dtvs_tlocals[id];
5514 5521 } else {
5515 5522 v = &vstate->dtvs_globals[id]->dtsv_var;
5516 5523 }
5517 5524
5518 5525 dvar = dtrace_dynvar(dstate, nkeys, key,
5519 5526 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5520 5527 v->dtdv_type.dtdt_size : sizeof (uint64_t),
5521 5528 regs[rd] ? DTRACE_DYNVAR_ALLOC :
5522 5529 DTRACE_DYNVAR_DEALLOC, mstate, vstate);
5523 5530
5524 5531 if (dvar == NULL)
5525 5532 break;
5526 5533
5527 5534 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5528 5535 if (!dtrace_vcanload(
5529 5536 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5530 5537 mstate, vstate))
5531 5538 break;
5532 5539
5533 5540 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5534 5541 dvar->dtdv_data, &v->dtdv_type);
5535 5542 } else {
5536 5543 *((uint64_t *)dvar->dtdv_data) = regs[rd];
5537 5544 }
5538 5545
5539 5546 break;
5540 5547 }
5541 5548
5542 5549 case DIF_OP_ALLOCS: {
5543 5550 uintptr_t ptr = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
5544 5551 size_t size = ptr - mstate->dtms_scratch_ptr + regs[r1];
5545 5552
5546 5553 /*
5547 5554 * Rounding up the user allocation size could have
5548 5555 * overflowed large, bogus allocations (like -1ULL) to
5549 5556 * 0.
5550 5557 */
5551 5558 if (size < regs[r1] ||
5552 5559 !DTRACE_INSCRATCH(mstate, size)) {
5553 5560 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5554 5561 regs[rd] = NULL;
5555 5562 break;
5556 5563 }
5557 5564
5558 5565 dtrace_bzero((void *) mstate->dtms_scratch_ptr, size);
5559 5566 mstate->dtms_scratch_ptr += size;
5560 5567 regs[rd] = ptr;
5561 5568 break;
5562 5569 }
5563 5570
5564 5571 case DIF_OP_COPYS:
5565 5572 if (!dtrace_canstore(regs[rd], regs[r2],
5566 5573 mstate, vstate)) {
5567 5574 *flags |= CPU_DTRACE_BADADDR;
5568 5575 *illval = regs[rd];
5569 5576 break;
5570 5577 }
5571 5578
5572 5579 if (!dtrace_canload(regs[r1], regs[r2], mstate, vstate))
5573 5580 break;
5574 5581
5575 5582 dtrace_bcopy((void *)(uintptr_t)regs[r1],
5576 5583 (void *)(uintptr_t)regs[rd], (size_t)regs[r2]);
5577 5584 break;
5578 5585
5579 5586 case DIF_OP_STB:
5580 5587 if (!dtrace_canstore(regs[rd], 1, mstate, vstate)) {
5581 5588 *flags |= CPU_DTRACE_BADADDR;
5582 5589 *illval = regs[rd];
5583 5590 break;
5584 5591 }
5585 5592 *((uint8_t *)(uintptr_t)regs[rd]) = (uint8_t)regs[r1];
5586 5593 break;
5587 5594
5588 5595 case DIF_OP_STH:
5589 5596 if (!dtrace_canstore(regs[rd], 2, mstate, vstate)) {
5590 5597 *flags |= CPU_DTRACE_BADADDR;
5591 5598 *illval = regs[rd];
5592 5599 break;
5593 5600 }
5594 5601 if (regs[rd] & 1) {
5595 5602 *flags |= CPU_DTRACE_BADALIGN;
5596 5603 *illval = regs[rd];
5597 5604 break;
5598 5605 }
5599 5606 *((uint16_t *)(uintptr_t)regs[rd]) = (uint16_t)regs[r1];
5600 5607 break;
5601 5608
5602 5609 case DIF_OP_STW:
5603 5610 if (!dtrace_canstore(regs[rd], 4, mstate, vstate)) {
5604 5611 *flags |= CPU_DTRACE_BADADDR;
5605 5612 *illval = regs[rd];
5606 5613 break;
5607 5614 }
5608 5615 if (regs[rd] & 3) {
5609 5616 *flags |= CPU_DTRACE_BADALIGN;
5610 5617 *illval = regs[rd];
5611 5618 break;
5612 5619 }
5613 5620 *((uint32_t *)(uintptr_t)regs[rd]) = (uint32_t)regs[r1];
5614 5621 break;
5615 5622
5616 5623 case DIF_OP_STX:
5617 5624 if (!dtrace_canstore(regs[rd], 8, mstate, vstate)) {
5618 5625 *flags |= CPU_DTRACE_BADADDR;
5619 5626 *illval = regs[rd];
5620 5627 break;
5621 5628 }
5622 5629 if (regs[rd] & 7) {
5623 5630 *flags |= CPU_DTRACE_BADALIGN;
5624 5631 *illval = regs[rd];
5625 5632 break;
5626 5633 }
5627 5634 *((uint64_t *)(uintptr_t)regs[rd]) = regs[r1];
5628 5635 break;
5629 5636 }
5630 5637 }
5631 5638
5632 5639 if (!(*flags & CPU_DTRACE_FAULT))
5633 5640 return (rval);
5634 5641
5635 5642 mstate->dtms_fltoffs = opc * sizeof (dif_instr_t);
5636 5643 mstate->dtms_present |= DTRACE_MSTATE_FLTOFFS;
5637 5644
5638 5645 return (0);
5639 5646 }
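
As an editorial aside for readers new to this style of emulator: the stand-alone sketch below shows the same fetch/decode/dispatch shape -- a pc-driven loop, a compare that latches condition codes, and branches that test them, with a RET that ends the loop. The opcode names, the encoding, and the eight-register file here are invented for illustration; the real DIF encoding is described in <sys/dtrace.h>.

	#include <stdint.h>
	#include <stdio.h>

	/* Invented opcodes for illustration; the real DIF encoding differs. */
	enum { OP_SET, OP_ADD, OP_CMP, OP_BNE, OP_RET };

	typedef struct {
		uint8_t op, r1, r2, rd;		/* rd doubles as branch target */
	} instr_t;

	static uint64_t
	emulate(const instr_t *text, uint32_t textlen)
	{
		uint64_t regs[8] = { 0 };	/* regs[0] plays the role of %r0 */
		uint8_t cc_z = 0;		/* the real emulator also keeps */
		uint32_t pc = 0;		/* cc_n, cc_v, and cc_c */

		while (pc < textlen) {
			instr_t in = text[pc++];

			switch (in.op) {
			case OP_SET:		/* rd <- small immediate in r1 */
				regs[in.rd] = in.r1;
				break;
			case OP_ADD:		/* rd <- r1 + r2 */
				regs[in.rd] = regs[in.r1] + regs[in.r2];
				break;
			case OP_CMP:		/* latch condition codes */
				cc_z = regs[in.r1] == regs[in.r2];
				break;
			case OP_BNE:		/* branch to rd if last CMP != */
				if (!cc_z)
					pc = in.rd;
				break;
			case OP_RET:		/* like DIF_OP_RET: end the loop */
				return (regs[in.rd]);
			}
		}
		return (0);
	}

	int
	main(void)
	{
		instr_t prog[] = {
			{ OP_SET, 1, 0, 2 },	/* r2 <- 1 (increment) */
			{ OP_SET, 5, 0, 3 },	/* r3 <- 5 (loop limit) */
			{ OP_ADD, 1, 2, 1 },	/* r1 <- r1 + r2 */
			{ OP_CMP, 1, 3, 0 },	/* compare r1 with r3 */
			{ OP_BNE, 0, 0, 2 },	/* not equal: back to the ADD */
			{ OP_RET, 0, 0, 1 },	/* return r1 (prints 5) */
		};

		(void) printf("%llu\n", (unsigned long long)emulate(prog,
		    sizeof (prog) / sizeof (prog[0])));
		return (0);
	}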
5640 5647
5641 5648 static void
5642 5649 dtrace_action_breakpoint(dtrace_ecb_t *ecb)
5643 5650 {
5644 5651 dtrace_probe_t *probe = ecb->dte_probe;
5645 5652 dtrace_provider_t *prov = probe->dtpr_provider;
5646 5653 char c[DTRACE_FULLNAMELEN + 80], *str;
5647 5654 char *msg = "dtrace: breakpoint action at probe ";
5648 5655 char *ecbmsg = " (ecb ";
5649 5656 uintptr_t mask = (0xf << (sizeof (uintptr_t) * NBBY / 4));
5650 5657 uintptr_t val = (uintptr_t)ecb;
5651 5658 int shift = (sizeof (uintptr_t) * NBBY) - 4, i = 0;
5652 5659
5653 5660 if (dtrace_destructive_disallow)
5654 5661 return;
5655 5662
5656 5663 /*
5657 5664 * It's impossible to be taking action on the NULL probe.
5658 5665 */
5659 5666 ASSERT(probe != NULL);
5660 5667
5661 5668 /*
5662 5669 * This is a poor man's (destitute man's?) sprintf(): we want to
5663 5670 * print the provider name, module name, function name and name of
5664 5671 * the probe, along with the hex address of the ECB with the breakpoint
5665 5672 * action -- all of which we must place in the character buffer by
5666 5673 * hand.
5667 5674 */
5668 5675 while (*msg != '\0')
5669 5676 c[i++] = *msg++;
5670 5677
5671 5678 for (str = prov->dtpv_name; *str != '\0'; str++)
5672 5679 c[i++] = *str;
5673 5680 c[i++] = ':';
5674 5681
5675 5682 for (str = probe->dtpr_mod; *str != '\0'; str++)
5676 5683 c[i++] = *str;
5677 5684 c[i++] = ':';
5678 5685
5679 5686 for (str = probe->dtpr_func; *str != '\0'; str++)
5680 5687 c[i++] = *str;
5681 5688 c[i++] = ':';
5682 5689
5683 5690 for (str = probe->dtpr_name; *str != '\0'; str++)
5684 5691 c[i++] = *str;
5685 5692
5686 5693 while (*ecbmsg != '\0')
5687 5694 c[i++] = *ecbmsg++;
5688 5695
5689 5696 while (shift >= 0) {
5690 5697 mask = (uintptr_t)0xf << shift;
5691 5698
5692 5699 if (val >= ((uintptr_t)1 << shift))
5693 5700 c[i++] = "0123456789abcdef"[(val & mask) >> shift];
5694 5701 shift -= 4;
5695 5702 }
5696 5703
5697 5704 c[i++] = ')';
5698 5705 c[i] = '\0';
5699 5706
5700 5707 debug_enter(c);
5701 5708 }
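
The hex-rendering loop above is a reusable trick: walk the value a nibble at a time from the top, and emit nothing until the first nonzero digit. A minimal user-level sketch of the same technique follows; hexfmt is an invented name, and the zero-value fallback is an addition (the original loop can omit it because an ECB address is never zero).

	#include <stdio.h>
	#include <stdint.h>

	/* Render val in hex into buf, suppressing leading zeros. */
	static void
	hexfmt(uintptr_t val, char *buf)
	{
		int shift = (sizeof (uintptr_t) * 8) - 4, i = 0;

		while (shift >= 0) {
			uintptr_t mask = (uintptr_t)0xf << shift;

			if (val >= ((uintptr_t)1 << shift))
				buf[i++] = "0123456789abcdef"[(val & mask) >> shift];
			shift -= 4;
		}

		if (i == 0)		/* val was zero: emit one digit */
			buf[i++] = '0';
		buf[i] = '\0';
	}

	int
	main(void)
	{
		char buf[2 * sizeof (uintptr_t) + 1];

		hexfmt((uintptr_t)0xdeadbeef, buf);
		(void) printf("%s\n", buf);	/* prints deadbeef */
		return (0);
	}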
5702 5709
5703 5710 static void
5704 5711 dtrace_action_panic(dtrace_ecb_t *ecb)
5705 5712 {
5706 5713 dtrace_probe_t *probe = ecb->dte_probe;
5707 5714
5708 5715 /*
5709 5716 * It's impossible to be taking action on the NULL probe.
5710 5717 */
5711 5718 ASSERT(probe != NULL);
5712 5719
5713 5720 if (dtrace_destructive_disallow)
5714 5721 return;
5715 5722
5716 5723 if (dtrace_panicked != NULL)
5717 5724 return;
5718 5725
5719 5726 if (dtrace_casptr(&dtrace_panicked, NULL, curthread) != NULL)
5720 5727 return;
5721 5728
5722 5729 /*
5723 5730 * We won the right to panic. (We want to be sure that only one
5724 5731 * thread calls panic() from dtrace_probe(), and that panic() is
5725 5732 * called exactly once.)
5726 5733 */
5727 5734 dtrace_panic("dtrace: panic action at probe %s:%s:%s:%s (ecb %p)",
5728 5735 probe->dtpr_provider->dtpv_name, probe->dtpr_mod,
5729 5736 probe->dtpr_func, probe->dtpr_name, (void *)ecb);
5730 5737 }
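
The dtrace_casptr() call is a single-winner election: every racing thread proposes to swap NULL for itself, and exactly one succeeds. A hedged user-level sketch of the same pattern using C11 atomics (claim_panic is an invented name; the kernel uses its own dtrace_casptr primitive):

	#include <stdatomic.h>
	#include <stdio.h>

	static _Atomic(void *) panicked = NULL;

	/*
	 * Returns 1 for exactly one caller -- the one whose compare-and-swap
	 * observed NULL -- no matter how many threads race in here.
	 */
	static int
	claim_panic(void *self)
	{
		void *expected = NULL;

		return (atomic_compare_exchange_strong(&panicked, &expected,
		    self));
	}

	int
	main(void)
	{
		int a = claim_panic(&a);	/* wins: panicked was NULL */
		int b = claim_panic(&b);	/* loses: already claimed */

		(void) printf("%d %d\n", a, b);	/* prints 1 0 */
		return (0);
	}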
5731 5738
5732 5739 static void
5733 5740 dtrace_action_raise(uint64_t sig)
5734 5741 {
5735 5742 if (dtrace_destructive_disallow)
5736 5743 return;
5737 5744
5738 5745 if (sig >= NSIG) {
5739 5746 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5740 5747 return;
5741 5748 }
5742 5749
5743 5750 /*
5744 5751 * raise() has a queue depth of 1 -- we ignore all subsequent
5745 5752 * invocations of the raise() action.
5746 5753 */
5747 5754 if (curthread->t_dtrace_sig == 0)
5748 5755 curthread->t_dtrace_sig = (uint8_t)sig;
5749 5756
5750 5757 curthread->t_sig_check = 1;
5751 5758 aston(curthread);
5752 5759 }
5753 5760
5754 5761 static void
5755 5762 dtrace_action_stop(void)
5756 5763 {
5757 5764 if (dtrace_destructive_disallow)
5758 5765 return;
5759 5766
5760 5767 if (!curthread->t_dtrace_stop) {
5761 5768 curthread->t_dtrace_stop = 1;
5762 5769 curthread->t_sig_check = 1;
5763 5770 aston(curthread);
5764 5771 }
5765 5772 }
5766 5773
5767 5774 static void
5768 5775 dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val)
5769 5776 {
5770 5777 hrtime_t now;
5771 5778 volatile uint16_t *flags;
5772 5779 cpu_t *cpu = CPU;
5773 5780
5774 5781 if (dtrace_destructive_disallow)
5775 5782 return;
5776 5783
5777 5784 flags = (volatile uint16_t *)&cpu_core[cpu->cpu_id].cpuc_dtrace_flags;
5778 5785
5779 5786 now = dtrace_gethrtime();
5780 5787
5781 5788 if (now - cpu->cpu_dtrace_chillmark > dtrace_chill_interval) {
5782 5789 /*
5783 5790 * We need to advance the mark to the current time.
5784 5791 */
5785 5792 cpu->cpu_dtrace_chillmark = now;
5786 5793 cpu->cpu_dtrace_chilled = 0;
5787 5794 }
5788 5795
5789 5796 /*
5790 5797 * Now check to see if the requested chill time would take us over
5791 5798 * the maximum amount of time allowed in the chill interval. (Or
5792 5799 * worse, if the calculation itself induces overflow.)
5793 5800 */
5794 5801 if (cpu->cpu_dtrace_chilled + val > dtrace_chill_max ||
5795 5802 cpu->cpu_dtrace_chilled + val < cpu->cpu_dtrace_chilled) {
5796 5803 *flags |= CPU_DTRACE_ILLOP;
5797 5804 return;
5798 5805 }
5799 5806
5800 5807 while (dtrace_gethrtime() - now < val)
5801 5808 continue;
5802 5809
5803 5810 /*
5804 5811 * Normally, we assure that the value of the variable "timestamp" does
5805 5812 * not change within an ECB. The presence of chill() represents an
5806 5813 * exception to this rule, however.
5807 5814 */
5808 5815 mstate->dtms_present &= ~DTRACE_MSTATE_TIMESTAMP;
5809 5816 cpu->cpu_dtrace_chilled += val;
5810 5817 }
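
The two-sided test above guards both the policy limit and the arithmetic itself: for unsigned values, a + b < a is true exactly when the addition wrapped. The same idiom in isolation, with charge() and BUDGET_MAX as invented stand-ins for the chill accounting:

	#include <stdint.h>
	#include <stdio.h>

	#define	BUDGET_MAX	500000000ULL	/* stand-in for dtrace_chill_max */

	/*
	 * Add val to *spent unless it would exceed BUDGET_MAX or wrap the
	 * counter; rejection corresponds to setting CPU_DTRACE_ILLOP above.
	 */
	static int
	charge(uint64_t *spent, uint64_t val)
	{
		if (*spent + val > BUDGET_MAX || *spent + val < *spent)
			return (-1);
		*spent += val;
		return (0);
	}

	int
	main(void)
	{
		uint64_t spent = 0;

		(void) printf("%d\n", charge(&spent, 400000000ULL));	/* 0 */
		(void) printf("%d\n", charge(&spent, 200000000ULL));	/* -1: over budget */
		(void) printf("%d\n", charge(&spent, UINT64_MAX));	/* -1: would wrap */
		return (0);
	}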
5811 5818
5812 5819 static void
5813 5820 dtrace_action_ustack(dtrace_mstate_t *mstate, dtrace_state_t *state,
5814 5821 uint64_t *buf, uint64_t arg)
5815 5822 {
5816 5823 int nframes = DTRACE_USTACK_NFRAMES(arg);
5817 5824 int strsize = DTRACE_USTACK_STRSIZE(arg);
5818 5825 uint64_t *pcs = &buf[1], *fps;
5819 5826 char *str = (char *)&pcs[nframes];
5820 5827 int size, offs = 0, i, j;
5821 5828 uintptr_t old = mstate->dtms_scratch_ptr, saved;
5822 5829 uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
5823 5830 char *sym;
5824 5831
5825 5832 /*
5826 5833 * Should be taking a faster path if string space has not been
5827 5834 * allocated.
5828 5835 */
5829 5836 ASSERT(strsize != 0);
5830 5837
5831 5838 /*
5832 5839 * We will first allocate some temporary space for the frame pointers.
5833 5840 */
5834 5841 fps = (uint64_t *)P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
5835 5842 size = (uintptr_t)fps - mstate->dtms_scratch_ptr +
5836 5843 (nframes * sizeof (uint64_t));
5837 5844
5838 5845 if (!DTRACE_INSCRATCH(mstate, size)) {
5839 5846 /*
5840 5847 * Not enough room for our frame pointers -- need to indicate
5841 5848 * that we ran out of scratch space.
5842 5849 */
5843 5850 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5844 5851 return;
5845 5852 }
5846 5853
5847 5854 mstate->dtms_scratch_ptr += size;
5848 5855 saved = mstate->dtms_scratch_ptr;
5849 5856
5850 5857 /*
5851 5858 * Now get a stack with both program counters and frame pointers.
5852 5859 */
5853 5860 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
5854 5861 dtrace_getufpstack(buf, fps, nframes + 1);
5855 5862 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
5856 5863
5857 5864 /*
5858 5865 * If that faulted, we're cooked.
5859 5866 */
5860 5867 if (*flags & CPU_DTRACE_FAULT)
5861 5868 goto out;
5862 5869
5863 5870 /*
5864 5871 * Now we want to walk up the stack, calling the USTACK helper. For
5865 5872 * each iteration, we restore the scratch pointer.
5866 5873 */
5867 5874 for (i = 0; i < nframes; i++) {
5868 5875 mstate->dtms_scratch_ptr = saved;
5869 5876
5870 5877 if (offs >= strsize)
5871 5878 break;
5872 5879
5873 5880 sym = (char *)(uintptr_t)dtrace_helper(
5874 5881 DTRACE_HELPER_ACTION_USTACK,
5875 5882 mstate, state, pcs[i], fps[i]);
5876 5883
5877 5884 /*
5878 5885 * If we faulted while running the helper, we're going to
5879 5886 * clear the fault and null out the corresponding string.
5880 5887 */
5881 5888 if (*flags & CPU_DTRACE_FAULT) {
5882 5889 *flags &= ~CPU_DTRACE_FAULT;
5883 5890 str[offs++] = '\0';
5884 5891 continue;
5885 5892 }
5886 5893
5887 5894 if (sym == NULL) {
5888 5895 str[offs++] = '\0';
5889 5896 continue;
5890 5897 }
5891 5898
5892 5899 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
5893 5900
5894 5901 /*
5895 5902 * Now copy in the string that the helper returned to us.
5896 5903 */
5897 5904 for (j = 0; offs + j < strsize; j++) {
5898 5905 if ((str[offs + j] = sym[j]) == '\0')
5899 5906 break;
5900 5907 }
5901 5908
5902 5909 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
5903 5910
5904 5911 offs += j + 1;
5905 5912 }
5906 5913
5907 5914 if (offs >= strsize) {
5908 5915 /*
5909 5916 * If we didn't have room for all of the strings, we don't
5910 5917 * abort processing -- this needn't be a fatal error -- but we
5911 5918 * still want to increment a counter (dts_stkstroverflows) to
5912 5919 * allow this condition to be warned about. (If this is from
5913 5920 * a jstack() action, it is easily tuned via jstackstrsize.)
5914 5921 */
5915 5922 dtrace_error(&state->dts_stkstroverflows);
5916 5923 }
5917 5924
5918 5925 while (offs < strsize)
5919 5926 str[offs++] = '\0';
5920 5927
5921 5928 out:
5922 5929 mstate->dtms_scratch_ptr = old;
5923 5930 }
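
The scratch discipline used here -- round the cursor up to an 8-byte boundary, charge the rounded size against the region, and restore a saved cursor per iteration -- is the same one DIF_OP_ALLOCS uses. A self-contained sketch of the carve-out step; scratch_t and scratch_alloc are invented names, and P2ROUNDUP is spelled out as a macro:

	#include <stdint.h>
	#include <stdio.h>

	/* Round x up to the next multiple of align (a power of two). */
	#define	ROUNDUP(x, align) \
		(((x) + ((align) - 1)) & ~((uintptr_t)(align) - 1))

	typedef struct {
		uintptr_t ptr;		/* cursor, like dtms_scratch_ptr */
		uintptr_t base;
		size_t size;
	} scratch_t;

	/* Carve nbytes of 8-byte-aligned space, or NULL if it won't fit. */
	static void *
	scratch_alloc(scratch_t *s, size_t nbytes)
	{
		uintptr_t aligned = ROUNDUP(s->ptr, 8);
		size_t need = (aligned - s->ptr) + nbytes;

		if (need < nbytes || s->ptr + need > s->base + s->size)
			return (NULL);	/* overflow or out of scratch */
		s->ptr += need;
		return ((void *)aligned);
	}

	int
	main(void)
	{
		static char space[64];
		scratch_t s = { (uintptr_t)space + 1, (uintptr_t)space,
		    sizeof (space) };
		uint64_t *fps = scratch_alloc(&s, 3 * sizeof (uint64_t));

		(void) printf("aligned: %d\n", fps != NULL &&
		    ((uintptr_t)fps & 7) == 0);	/* prints aligned: 1 */
		return (0);
	}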
5924 5931
5925 5932 /*
5926 5933 * If you're looking for the epicenter of DTrace, you just found it. This
5927 5934 * is the function called by the provider to fire a probe -- from which all
5928 5935 * subsequent probe-context DTrace activity emanates.
5929 5936 */
5930 5937 void
5931 5938 dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1,
5932 5939 uintptr_t arg2, uintptr_t arg3, uintptr_t arg4)
5933 5940 {
5934 5941 processorid_t cpuid;
5935 5942 dtrace_icookie_t cookie;
5936 5943 dtrace_probe_t *probe;
5937 5944 dtrace_mstate_t mstate;
5938 5945 dtrace_ecb_t *ecb;
5939 5946 dtrace_action_t *act;
5940 5947 intptr_t offs;
5941 5948 size_t size;
5942 5949 int vtime, onintr;
5943 5950 volatile uint16_t *flags;
5944 5951 hrtime_t now;
5945 5952
5946 5953 /*
5947 5954 * Kick out immediately if this CPU is still being born (in which case
5948 5955 * curthread will be set to -1) or the current thread can't allow
5949 5956 * probes in its current context.
5950 5957 */
5951 5958 if (((uintptr_t)curthread & 1) || (curthread->t_flag & T_DONTDTRACE))
5952 5959 return;
5953 5960
5954 5961 cookie = dtrace_interrupt_disable();
5955 5962 probe = dtrace_probes[id - 1];
5956 5963 cpuid = CPU->cpu_id;
5957 5964 onintr = CPU_ON_INTR(CPU);
5958 5965
5959 5966 if (!onintr && probe->dtpr_predcache != DTRACE_CACHEIDNONE &&
5960 5967 probe->dtpr_predcache == curthread->t_predcache) {
5961 5968 /*
5962 5969 * We have hit in the predicate cache; we know that
5963 5970 * this predicate would evaluate to be false.
5964 5971 */
5965 5972 dtrace_interrupt_enable(cookie);
5966 5973 return;
5967 5974 }
5968 5975
5969 5976 if (panic_quiesce) {
5970 5977 /*
5971 5978 * We don't trace anything if we're panicking.
5972 5979 */
5973 5980 dtrace_interrupt_enable(cookie);
5974 5981 return;
5975 5982 }
5976 5983
5977 5984 now = dtrace_gethrtime();
5978 5985 vtime = dtrace_vtime_references != 0;
5979 5986
5980 5987 if (vtime && curthread->t_dtrace_start)
5981 5988 curthread->t_dtrace_vtime += now - curthread->t_dtrace_start;
5982 5989
5983 5990 mstate.dtms_difo = NULL;
5984 5991 mstate.dtms_probe = probe;
5985 5992 mstate.dtms_strtok = NULL;
5986 5993 mstate.dtms_arg[0] = arg0;
5987 5994 mstate.dtms_arg[1] = arg1;
5988 5995 mstate.dtms_arg[2] = arg2;
5989 5996 mstate.dtms_arg[3] = arg3;
5990 5997 mstate.dtms_arg[4] = arg4;
5991 5998
5992 5999 flags = (volatile uint16_t *)&cpu_core[cpuid].cpuc_dtrace_flags;
5993 6000
5994 6001 for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
5995 6002 dtrace_predicate_t *pred = ecb->dte_predicate;
5996 6003 dtrace_state_t *state = ecb->dte_state;
5997 6004 dtrace_buffer_t *buf = &state->dts_buffer[cpuid];
5998 6005 dtrace_buffer_t *aggbuf = &state->dts_aggbuffer[cpuid];
5999 6006 dtrace_vstate_t *vstate = &state->dts_vstate;
6000 6007 dtrace_provider_t *prov = probe->dtpr_provider;
6001 6008 uint64_t tracememsize = 0;
6002 6009 int committed = 0;
6003 6010 caddr_t tomax;
6004 6011
6005 6012 /*
6006 6013 * A little subtlety with the following (seemingly innocuous)
6007 6014 * declaration of the automatic 'val': by looking at the
6008 6015 * code, you might think that it could be declared in the
6009 6016 * action processing loop, below. (That is, it's only used in
6010 6017 * the action processing loop.) However, it must be declared
6011 6018 * out of that scope because in the case of DIF expression
6012 6019 * arguments to aggregating actions, one iteration of the
6013 6020 * action loop will use the last iteration's value.
6014 6021 */
6015 6022 #ifdef lint
6016 6023 uint64_t val = 0;
6017 6024 #else
6018 6025 uint64_t val;
6019 6026 #endif
6020 6027
6021 6028 mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE;
6022 6029 mstate.dtms_access = DTRACE_ACCESS_ARGS | DTRACE_ACCESS_PROC;
6023 6030 mstate.dtms_getf = NULL;
6024 6031
6025 6032 *flags &= ~CPU_DTRACE_ERROR;
6026 6033
6027 6034 if (prov == dtrace_provider) {
6028 6035 /*
6029 6036 * If dtrace itself is the provider of this probe,
6030 6037 * we're only going to continue processing the ECB if
6031 6038 * arg0 (the dtrace_state_t) is equal to the ECB's
6032 6039 * creating state. (This prevents disjoint consumers
6033 6040 * from seeing one another's metaprobes.)
6034 6041 */
6035 6042 if (arg0 != (uint64_t)(uintptr_t)state)
6036 6043 continue;
6037 6044 }
6038 6045
6039 6046 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE) {
6040 6047 /*
6041 6048 * We're not currently active. If our provider isn't
6042 6049 * the dtrace pseudo provider, we're not interested.
6043 6050 */
6044 6051 if (prov != dtrace_provider)
6045 6052 continue;
6046 6053
6047 6054 /*
6048 6055 * Now we must further check if we are in the BEGIN
6049 6056 * probe. If we are, we will only continue processing
6050 6057 * if we're still in WARMUP -- if one BEGIN enabling
6051 6058 * has invoked the exit() action, we don't want to
6052 6059 * evaluate subsequent BEGIN enablings.
6053 6060 */
6054 6061 if (probe->dtpr_id == dtrace_probeid_begin &&
6055 6062 state->dts_activity != DTRACE_ACTIVITY_WARMUP) {
6056 6063 ASSERT(state->dts_activity ==
6057 6064 DTRACE_ACTIVITY_DRAINING);
6058 6065 continue;
6059 6066 }
6060 6067 }
6061 6068
6062 6069 if (ecb->dte_cond && !dtrace_priv_probe(state, &mstate, ecb))
6063 6070 continue;
6064 6071
6065 6072 if (now - state->dts_alive > dtrace_deadman_timeout) {
6066 6073 /*
6067 6074 * We seem to be dead. Unless we (a) have kernel
6068 6075 			 * destructive permissions, (b) have explicitly enabled
6069 6076 			 * destructive actions, and (c) destructive actions have
6070 6077 * not been disabled, we're going to transition into
6071 6078 * the KILLED state, from which no further processing
6072 6079 * on this state will be performed.
6073 6080 */
6074 6081 if (!dtrace_priv_kernel_destructive(state) ||
6075 6082 !state->dts_cred.dcr_destructive ||
6076 6083 dtrace_destructive_disallow) {
6077 6084 void *activity = &state->dts_activity;
6078 6085 dtrace_activity_t current;
6079 6086
6080 6087 do {
6081 6088 current = state->dts_activity;
6082 6089 } while (dtrace_cas32(activity, current,
6083 6090 DTRACE_ACTIVITY_KILLED) != current);
6084 6091
6085 6092 continue;
6086 6093 }
6087 6094 }
6088 6095
6089 6096 if ((offs = dtrace_buffer_reserve(buf, ecb->dte_needed,
6090 6097 ecb->dte_alignment, state, &mstate)) < 0)
6091 6098 continue;
6092 6099
6093 6100 tomax = buf->dtb_tomax;
6094 6101 ASSERT(tomax != NULL);
6095 6102
6096 6103 if (ecb->dte_size != 0)
6097 6104 DTRACE_STORE(uint32_t, tomax, offs, ecb->dte_epid);
6098 6105
6099 6106 mstate.dtms_epid = ecb->dte_epid;
6100 6107 mstate.dtms_present |= DTRACE_MSTATE_EPID;
6101 6108
6102 6109 if (state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)
6103 6110 mstate.dtms_access |= DTRACE_ACCESS_KERNEL;
6104 6111
6105 6112 if (pred != NULL) {
6106 6113 dtrace_difo_t *dp = pred->dtp_difo;
6107 6114 int rval;
6108 6115
6109 6116 rval = dtrace_dif_emulate(dp, &mstate, vstate, state);
6110 6117
6111 6118 if (!(*flags & CPU_DTRACE_ERROR) && !rval) {
6112 6119 dtrace_cacheid_t cid = probe->dtpr_predcache;
6113 6120
6114 6121 if (cid != DTRACE_CACHEIDNONE && !onintr) {
6115 6122 /*
6116 6123 * Update the predicate cache...
6117 6124 */
6118 6125 ASSERT(cid == pred->dtp_cacheid);
6119 6126 curthread->t_predcache = cid;
6120 6127 }
6121 6128
6122 6129 continue;
6123 6130 }
6124 6131 }
6125 6132
6126 6133 for (act = ecb->dte_action; !(*flags & CPU_DTRACE_ERROR) &&
6127 6134 act != NULL; act = act->dta_next) {
6128 6135 size_t valoffs;
6129 6136 dtrace_difo_t *dp;
6130 6137 dtrace_recdesc_t *rec = &act->dta_rec;
6131 6138
6132 6139 size = rec->dtrd_size;
6133 6140 valoffs = offs + rec->dtrd_offset;
6134 6141
6135 6142 if (DTRACEACT_ISAGG(act->dta_kind)) {
6136 6143 uint64_t v = 0xbad;
6137 6144 dtrace_aggregation_t *agg;
6138 6145
6139 6146 agg = (dtrace_aggregation_t *)act;
6140 6147
6141 6148 if ((dp = act->dta_difo) != NULL)
6142 6149 v = dtrace_dif_emulate(dp,
6143 6150 &mstate, vstate, state);
6144 6151
6145 6152 if (*flags & CPU_DTRACE_ERROR)
6146 6153 continue;
6147 6154
6148 6155 /*
6149 6156 * Note that we always pass the expression
6150 6157 * value from the previous iteration of the
6151 6158 * action loop. This value will only be used
6152 6159 * if there is an expression argument to the
6153 6160 * aggregating action, denoted by the
6154 6161 * dtag_hasarg field.
6155 6162 */
6156 6163 dtrace_aggregate(agg, buf,
6157 6164 offs, aggbuf, v, val);
6158 6165 continue;
6159 6166 }
6160 6167
6161 6168 switch (act->dta_kind) {
6162 6169 case DTRACEACT_STOP:
6163 6170 if (dtrace_priv_proc_destructive(state,
6164 6171 &mstate))
6165 6172 dtrace_action_stop();
6166 6173 continue;
6167 6174
6168 6175 case DTRACEACT_BREAKPOINT:
6169 6176 if (dtrace_priv_kernel_destructive(state))
6170 6177 dtrace_action_breakpoint(ecb);
6171 6178 continue;
6172 6179
6173 6180 case DTRACEACT_PANIC:
6174 6181 if (dtrace_priv_kernel_destructive(state))
6175 6182 dtrace_action_panic(ecb);
6176 6183 continue;
6177 6184
6178 6185 case DTRACEACT_STACK:
6179 6186 if (!dtrace_priv_kernel(state))
6180 6187 continue;
6181 6188
6182 6189 dtrace_getpcstack((pc_t *)(tomax + valoffs),
6183 6190 size / sizeof (pc_t), probe->dtpr_aframes,
6184 6191 DTRACE_ANCHORED(probe) ? NULL :
6185 6192 (uint32_t *)arg0);
6186 6193
6187 6194 continue;
6188 6195
6189 6196 case DTRACEACT_JSTACK:
6190 6197 case DTRACEACT_USTACK:
6191 6198 if (!dtrace_priv_proc(state, &mstate))
6192 6199 continue;
6193 6200
6194 6201 /*
6195 6202 * See comment in DIF_VAR_PID.
6196 6203 */
6197 6204 if (DTRACE_ANCHORED(mstate.dtms_probe) &&
6198 6205 CPU_ON_INTR(CPU)) {
6199 6206 int depth = DTRACE_USTACK_NFRAMES(
6200 6207 rec->dtrd_arg) + 1;
6201 6208
6202 6209 dtrace_bzero((void *)(tomax + valoffs),
6203 6210 DTRACE_USTACK_STRSIZE(rec->dtrd_arg)
6204 6211 + depth * sizeof (uint64_t));
6205 6212
6206 6213 continue;
6207 6214 }
6208 6215
6209 6216 if (DTRACE_USTACK_STRSIZE(rec->dtrd_arg) != 0 &&
6210 6217 curproc->p_dtrace_helpers != NULL) {
6211 6218 /*
6212 6219 * This is the slow path -- we have
6213 6220 * allocated string space, and we're
6214 6221 * getting the stack of a process that
6215 6222 * has helpers. Call into a separate
6216 6223 * routine to perform this processing.
6217 6224 */
6218 6225 dtrace_action_ustack(&mstate, state,
6219 6226 (uint64_t *)(tomax + valoffs),
6220 6227 rec->dtrd_arg);
6221 6228 continue;
6222 6229 }
6223 6230
6224 6231 /*
6225 6232 * Clear the string space, since there's no
6226 6233 * helper to do it for us.
6227 6234 */
6228 6235 if (DTRACE_USTACK_STRSIZE(rec->dtrd_arg) != 0) {
6229 6236 int depth = DTRACE_USTACK_NFRAMES(
6230 6237 rec->dtrd_arg);
6231 6238 size_t strsize = DTRACE_USTACK_STRSIZE(
6232 6239 rec->dtrd_arg);
6233 6240 uint64_t *buf = (uint64_t *)(tomax +
6234 6241 valoffs);
6235 6242 void *strspace = &buf[depth + 1];
6236 6243
6237 6244 dtrace_bzero(strspace,
6238 6245 MIN(depth, strsize));
6239 6246 }
6240 6247
6241 6248 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6242 6249 dtrace_getupcstack((uint64_t *)
6243 6250 (tomax + valoffs),
6244 6251 DTRACE_USTACK_NFRAMES(rec->dtrd_arg) + 1);
6245 6252 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6246 6253 continue;
6247 6254
6248 6255 default:
6249 6256 break;
6250 6257 }
6251 6258
6252 6259 dp = act->dta_difo;
6253 6260 ASSERT(dp != NULL);
6254 6261
6255 6262 val = dtrace_dif_emulate(dp, &mstate, vstate, state);
6256 6263
6257 6264 if (*flags & CPU_DTRACE_ERROR)
6258 6265 continue;
6259 6266
6260 6267 switch (act->dta_kind) {
6261 6268 case DTRACEACT_SPECULATE:
6262 6269 ASSERT(buf == &state->dts_buffer[cpuid]);
6263 6270 buf = dtrace_speculation_buffer(state,
6264 6271 cpuid, val);
6265 6272
6266 6273 if (buf == NULL) {
6267 6274 *flags |= CPU_DTRACE_DROP;
6268 6275 continue;
6269 6276 }
6270 6277
6271 6278 offs = dtrace_buffer_reserve(buf,
6272 6279 ecb->dte_needed, ecb->dte_alignment,
6273 6280 state, NULL);
6274 6281
6275 6282 if (offs < 0) {
6276 6283 *flags |= CPU_DTRACE_DROP;
6277 6284 continue;
6278 6285 }
6279 6286
6280 6287 tomax = buf->dtb_tomax;
6281 6288 ASSERT(tomax != NULL);
6282 6289
6283 6290 if (ecb->dte_size != 0)
6284 6291 DTRACE_STORE(uint32_t, tomax, offs,
6285 6292 ecb->dte_epid);
6286 6293 continue;
6287 6294
6288 6295 case DTRACEACT_CHILL:
6289 6296 if (dtrace_priv_kernel_destructive(state))
6290 6297 dtrace_action_chill(&mstate, val);
6291 6298 continue;
6292 6299
6293 6300 case DTRACEACT_RAISE:
6294 6301 if (dtrace_priv_proc_destructive(state,
6295 6302 &mstate))
6296 6303 dtrace_action_raise(val);
6297 6304 continue;
6298 6305
6299 6306 case DTRACEACT_COMMIT:
6300 6307 ASSERT(!committed);
6301 6308
6302 6309 /*
6303 6310 * We need to commit our buffer state.
6304 6311 */
6305 6312 if (ecb->dte_size)
6306 6313 buf->dtb_offset = offs + ecb->dte_size;
6307 6314 buf = &state->dts_buffer[cpuid];
6308 6315 dtrace_speculation_commit(state, cpuid, val);
6309 6316 committed = 1;
6310 6317 continue;
6311 6318
6312 6319 case DTRACEACT_DISCARD:
6313 6320 dtrace_speculation_discard(state, cpuid, val);
6314 6321 continue;
6315 6322
6316 6323 case DTRACEACT_DIFEXPR:
6317 6324 case DTRACEACT_LIBACT:
6318 6325 case DTRACEACT_PRINTF:
6319 6326 case DTRACEACT_PRINTA:
6320 6327 case DTRACEACT_SYSTEM:
6321 6328 case DTRACEACT_FREOPEN:
6322 6329 case DTRACEACT_TRACEMEM:
6323 6330 break;
6324 6331
6325 6332 case DTRACEACT_TRACEMEM_DYNSIZE:
6326 6333 tracememsize = val;
6327 6334 break;
6328 6335
6329 6336 case DTRACEACT_SYM:
6330 6337 case DTRACEACT_MOD:
6331 6338 if (!dtrace_priv_kernel(state))
6332 6339 continue;
6333 6340 break;
6334 6341
6335 6342 case DTRACEACT_USYM:
6336 6343 case DTRACEACT_UMOD:
6337 6344 case DTRACEACT_UADDR: {
6338 6345 struct pid *pid = curthread->t_procp->p_pidp;
6339 6346
6340 6347 if (!dtrace_priv_proc(state, &mstate))
6341 6348 continue;
6342 6349
6343 6350 DTRACE_STORE(uint64_t, tomax,
6344 6351 valoffs, (uint64_t)pid->pid_id);
6345 6352 DTRACE_STORE(uint64_t, tomax,
6346 6353 valoffs + sizeof (uint64_t), val);
6347 6354
6348 6355 continue;
6349 6356 }
6350 6357
6351 6358 case DTRACEACT_EXIT: {
6352 6359 /*
6353 6360 * For the exit action, we are going to attempt
6354 6361 * to atomically set our activity to be
6355 6362 * draining. If this fails (either because
6356 6363 * another CPU has beat us to the exit action,
6357 6364 * or because our current activity is something
6358 6365 * other than ACTIVE or WARMUP), we will
6359 6366 * continue. This assures that the exit action
6360 6367 * can be successfully recorded at most once
6361 6368 * when we're in the ACTIVE state. If we're
6362 6369 * encountering the exit() action while in
6363 6370 * COOLDOWN, however, we want to honor the new
6364 6371 * status code. (We know that we're the only
6365 6372 * thread in COOLDOWN, so there is no race.)
6366 6373 */
6367 6374 void *activity = &state->dts_activity;
6368 6375 dtrace_activity_t current = state->dts_activity;
6369 6376
6370 6377 if (current == DTRACE_ACTIVITY_COOLDOWN)
6371 6378 break;
6372 6379
6373 6380 if (current != DTRACE_ACTIVITY_WARMUP)
6374 6381 current = DTRACE_ACTIVITY_ACTIVE;
6375 6382
6376 6383 if (dtrace_cas32(activity, current,
6377 6384 DTRACE_ACTIVITY_DRAINING) != current) {
6378 6385 *flags |= CPU_DTRACE_DROP;
6379 6386 continue;
6380 6387 }
6381 6388
6382 6389 break;
6383 6390 }
6384 6391
6385 6392 default:
6386 6393 ASSERT(0);
6387 6394 }
6388 6395
6389 6396 if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF) {
6390 6397 uintptr_t end = valoffs + size;
6391 6398
6392 6399 if (tracememsize != 0 &&
6393 6400 valoffs + tracememsize < end) {
6394 6401 end = valoffs + tracememsize;
6395 6402 tracememsize = 0;
6396 6403 }
6397 6404
6398 6405 if (!dtrace_vcanload((void *)(uintptr_t)val,
6399 6406 &dp->dtdo_rtype, &mstate, vstate))
6400 6407 continue;
6401 6408
6402 6409 /*
6403 6410 * If this is a string, we're going to only
6404 6411 * load until we find the zero byte -- after
6405 6412 * which we'll store zero bytes.
6406 6413 */
6407 6414 if (dp->dtdo_rtype.dtdt_kind ==
6408 6415 DIF_TYPE_STRING) {
6409 6416 char c = '\0' + 1;
6410 6417 int intuple = act->dta_intuple;
6411 6418 size_t s;
6412 6419
6413 6420 for (s = 0; s < size; s++) {
6414 6421 if (c != '\0')
6415 6422 c = dtrace_load8(val++);
6416 6423
6417 6424 DTRACE_STORE(uint8_t, tomax,
6418 6425 valoffs++, c);
6419 6426
6420 6427 if (c == '\0' && intuple)
6421 6428 break;
6422 6429 }
6423 6430
6424 6431 continue;
6425 6432 }
6426 6433
6427 6434 while (valoffs < end) {
6428 6435 DTRACE_STORE(uint8_t, tomax, valoffs++,
6429 6436 dtrace_load8(val++));
6430 6437 }
6431 6438
6432 6439 continue;
6433 6440 }
6434 6441
6435 6442 switch (size) {
6436 6443 case 0:
6437 6444 break;
6438 6445
6439 6446 case sizeof (uint8_t):
6440 6447 DTRACE_STORE(uint8_t, tomax, valoffs, val);
6441 6448 break;
6442 6449 case sizeof (uint16_t):
6443 6450 DTRACE_STORE(uint16_t, tomax, valoffs, val);
6444 6451 break;
6445 6452 case sizeof (uint32_t):
6446 6453 DTRACE_STORE(uint32_t, tomax, valoffs, val);
6447 6454 break;
6448 6455 case sizeof (uint64_t):
6449 6456 DTRACE_STORE(uint64_t, tomax, valoffs, val);
6450 6457 break;
6451 6458 default:
6452 6459 /*
6453 6460 * Any other size should have been returned by
6454 6461 * reference, not by value.
6455 6462 */
6456 6463 ASSERT(0);
6457 6464 break;
6458 6465 }
6459 6466 }
6460 6467
6461 6468 if (*flags & CPU_DTRACE_DROP)
6462 6469 continue;
6463 6470
6464 6471 if (*flags & CPU_DTRACE_FAULT) {
6465 6472 int ndx;
6466 6473 dtrace_action_t *err;
6467 6474
6468 6475 buf->dtb_errors++;
6469 6476
6470 6477 if (probe->dtpr_id == dtrace_probeid_error) {
6471 6478 /*
6472 6479 * There's nothing we can do -- we had an
6473 6480 * error on the error probe. We bump an
6474 6481 * error counter to at least indicate that
6475 6482 * this condition happened.
6476 6483 */
6477 6484 dtrace_error(&state->dts_dblerrors);
6478 6485 continue;
6479 6486 }
6480 6487
6481 6488 if (vtime) {
6482 6489 /*
6483 6490 * Before recursing on dtrace_probe(), we
6484 6491 * need to explicitly clear out our start
6485 6492 * time to prevent it from being accumulated
6486 6493 * into t_dtrace_vtime.
6487 6494 */
6488 6495 curthread->t_dtrace_start = 0;
6489 6496 }
6490 6497
6491 6498 /*
6492 6499 * Iterate over the actions to figure out which action
6493 6500 * we were processing when we experienced the error.
6494 6501 * Note that act points _past_ the faulting action; if
6495 6502 * act is ecb->dte_action, the fault was in the
6496 6503 			 * predicate; if it's ecb->dte_action->dta_next, it's
6497 6504 * in action #1, and so on.
6498 6505 */
6499 6506 for (err = ecb->dte_action, ndx = 0;
6500 6507 err != act; err = err->dta_next, ndx++)
6501 6508 continue;
6502 6509
6503 6510 dtrace_probe_error(state, ecb->dte_epid, ndx,
6504 6511 (mstate.dtms_present & DTRACE_MSTATE_FLTOFFS) ?
6505 6512 mstate.dtms_fltoffs : -1, DTRACE_FLAGS2FLT(*flags),
6506 6513 cpu_core[cpuid].cpuc_dtrace_illval);
6507 6514
6508 6515 continue;
6509 6516 }
6510 6517
6511 6518 if (!committed)
6512 6519 buf->dtb_offset = offs + ecb->dte_size;
6513 6520 }
6514 6521
6515 6522 if (vtime)
6516 6523 curthread->t_dtrace_start = dtrace_gethrtime();
6517 6524
6518 6525 dtrace_interrupt_enable(cookie);
6519 6526 }
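
The predicate cache consulted near the top of dtrace_probe() is per-thread memoization of a thread-invariant predicate that last evaluated false. A toy single-threaded sketch of the idea (all names here are invented; real cache IDs are assigned when predicates are created):

	#include <stdio.h>

	typedef unsigned int cacheid_t;
	#define	CACHEIDNONE	0

	static cacheid_t t_predcache = CACHEIDNONE;	/* per-thread slot */

	static int
	eval_slow(cacheid_t id)
	{
		(void) printf("emulating predicate %u\n", id);
		return (0);		/* pretend it evaluated false */
	}

	/*
	 * Returns 1 if the enabling should fire.  A predicate that evaluates
	 * false gets its id cached, so the next hit on this thread skips the
	 * emulator entirely -- the fast path taken above.
	 */
	static int
	predicate_passes(cacheid_t id)
	{
		if (id != CACHEIDNONE && id == t_predcache)
			return (0);	/* known false: skip evaluation */
		if (!eval_slow(id)) {
			t_predcache = id;	/* remember the false result */
			return (0);
		}
		return (1);
	}

	int
	main(void)
	{
		(void) predicate_passes(7);	/* emulates, then caches */
		(void) predicate_passes(7);	/* silent: cache hit */
		return (0);
	}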
6520 6527
6521 6528 /*
6522 6529 * DTrace Probe Hashing Functions
6523 6530 *
6524 6531 * The functions in this section (and indeed, the functions in remaining
6525 6532 * sections) are not _called_ from probe context. (Any exceptions to this are
6526 6533 * marked with a "Note:".) Rather, they are called from elsewhere in the
6527 6534  * DTrace framework to look up probes in, add probes to, and remove probes from
6528 6535 * the DTrace probe hashes. (Each probe is hashed by each element of the
6529 6536 * probe tuple -- allowing for fast lookups, regardless of what was
6530 6537 * specified.)
6531 6538 */
6532 6539 static uint_t
6533 6540 dtrace_hash_str(char *p)
6534 6541 {
6535 6542 unsigned int g;
6536 6543 uint_t hval = 0;
6537 6544
6538 6545 while (*p) {
6539 6546 hval = (hval << 4) + *p++;
6540 6547 if ((g = (hval & 0xf0000000)) != 0)
6541 6548 hval ^= g >> 24;
6542 6549 hval &= ~g;
6543 6550 }
6544 6551 return (hval);
6545 6552 }
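
This is the classic ELF/PJW string hash: fold the top nibble back into the low bits so that short, similar strings don't cluster. Because the probe hash tables are kept at power-of-two sizes, a bucket index is hash & mask rather than a division. A quick user-level check (the table size of 8 is chosen arbitrarily):

	#include <stdio.h>

	static unsigned int
	hash_str(const char *p)
	{
		unsigned int hval = 0, g;

		while (*p) {
			hval = (hval << 4) + *p++;
			if ((g = (hval & 0xf0000000)) != 0)
				hval ^= g >> 24;
			hval &= ~g;
		}
		return (hval);
	}

	int
	main(void)
	{
		const char *names[] = { "syscall", "fbt", "sdt", "profile" };
		unsigned int mask = 8 - 1;	/* power of two: & replaces % */
		int i;

		for (i = 0; i < 4; i++)
			(void) printf("%-8s -> bucket %u\n", names[i],
			    hash_str(names[i]) & mask);
		return (0);
	}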
6546 6553
6547 6554 static dtrace_hash_t *
6548 6555 dtrace_hash_create(uintptr_t stroffs, uintptr_t nextoffs, uintptr_t prevoffs)
6549 6556 {
6550 6557 dtrace_hash_t *hash = kmem_zalloc(sizeof (dtrace_hash_t), KM_SLEEP);
6551 6558
6552 6559 hash->dth_stroffs = stroffs;
6553 6560 hash->dth_nextoffs = nextoffs;
6554 6561 hash->dth_prevoffs = prevoffs;
6555 6562
6556 6563 hash->dth_size = 1;
6557 6564 hash->dth_mask = hash->dth_size - 1;
6558 6565
6559 6566 hash->dth_tab = kmem_zalloc(hash->dth_size *
6560 6567 sizeof (dtrace_hashbucket_t *), KM_SLEEP);
6561 6568
6562 6569 return (hash);
6563 6570 }
6564 6571
6565 6572 static void
6566 6573 dtrace_hash_destroy(dtrace_hash_t *hash)
6567 6574 {
6568 6575 #ifdef DEBUG
6569 6576 int i;
6570 6577
6571 6578 for (i = 0; i < hash->dth_size; i++)
6572 6579 ASSERT(hash->dth_tab[i] == NULL);
6573 6580 #endif
6574 6581
6575 6582 kmem_free(hash->dth_tab,
6576 6583 hash->dth_size * sizeof (dtrace_hashbucket_t *));
6577 6584 kmem_free(hash, sizeof (dtrace_hash_t));
6578 6585 }
6579 6586
6580 6587 static void
6581 6588 dtrace_hash_resize(dtrace_hash_t *hash)
6582 6589 {
6583 6590 int size = hash->dth_size, i, ndx;
6584 6591 int new_size = hash->dth_size << 1;
6585 6592 int new_mask = new_size - 1;
6586 6593 dtrace_hashbucket_t **new_tab, *bucket, *next;
6587 6594
6588 6595 ASSERT((new_size & new_mask) == 0);
6589 6596
6590 6597 new_tab = kmem_zalloc(new_size * sizeof (void *), KM_SLEEP);
6591 6598
6592 6599 for (i = 0; i < size; i++) {
6593 6600 for (bucket = hash->dth_tab[i]; bucket != NULL; bucket = next) {
6594 6601 dtrace_probe_t *probe = bucket->dthb_chain;
6595 6602
6596 6603 ASSERT(probe != NULL);
6597 6604 ndx = DTRACE_HASHSTR(hash, probe) & new_mask;
6598 6605
6599 6606 next = bucket->dthb_next;
6600 6607 bucket->dthb_next = new_tab[ndx];
6601 6608 new_tab[ndx] = bucket;
6602 6609 }
6603 6610 }
6604 6611
6605 6612 kmem_free(hash->dth_tab, hash->dth_size * sizeof (void *));
6606 6613 hash->dth_tab = new_tab;
6607 6614 hash->dth_size = new_size;
6608 6615 hash->dth_mask = new_mask;
6609 6616 }
6610 6617
6611 6618 static void
6612 6619 dtrace_hash_add(dtrace_hash_t *hash, dtrace_probe_t *new)
6613 6620 {
6614 6621 int hashval = DTRACE_HASHSTR(hash, new);
6615 6622 int ndx = hashval & hash->dth_mask;
6616 6623 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6617 6624 dtrace_probe_t **nextp, **prevp;
6618 6625
6619 6626 for (; bucket != NULL; bucket = bucket->dthb_next) {
6620 6627 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, new))
6621 6628 goto add;
6622 6629 }
6623 6630
6624 6631 if ((hash->dth_nbuckets >> 1) > hash->dth_size) {
6625 6632 dtrace_hash_resize(hash);
6626 6633 dtrace_hash_add(hash, new);
6627 6634 return;
6628 6635 }
6629 6636
6630 6637 bucket = kmem_zalloc(sizeof (dtrace_hashbucket_t), KM_SLEEP);
6631 6638 bucket->dthb_next = hash->dth_tab[ndx];
6632 6639 hash->dth_tab[ndx] = bucket;
6633 6640 hash->dth_nbuckets++;
6634 6641
6635 6642 add:
6636 6643 nextp = DTRACE_HASHNEXT(hash, new);
6637 6644 ASSERT(*nextp == NULL && *(DTRACE_HASHPREV(hash, new)) == NULL);
6638 6645 *nextp = bucket->dthb_chain;
6639 6646
6640 6647 if (bucket->dthb_chain != NULL) {
6641 6648 prevp = DTRACE_HASHPREV(hash, bucket->dthb_chain);
6642 6649 ASSERT(*prevp == NULL);
6643 6650 *prevp = new;
6644 6651 }
6645 6652
6646 6653 bucket->dthb_chain = new;
6647 6654 bucket->dthb_len++;
6648 6655 }
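
What lets this one hash implementation serve all four probe-tuple elements is that the string, next, and prev fields are located by byte offsets stored in the hash at creation time -- that is what the DTRACE_HASHSTR, DTRACE_HASHNEXT, and DTRACE_HASHPREV macros hide. A sketch of the offset trick with invented types (the real chains link dtrace_probe_t by provider, module, function, or name):

	#include <stddef.h>
	#include <stdio.h>

	typedef struct probe {
		const char *func;
		const char *name;
		struct probe *nextbyfunc;	/* probes with equal func */
	} probe_t;

	/* Resolve a field of p given only its byte offset, as the hash does. */
	#define	FIELD(p, offs, type)	(*(type *)((char *)(p) + (offs)))

	int
	main(void)
	{
		probe_t a = { "read", "entry", NULL };
		probe_t b = { "read", "return", &a };
		size_t stroffs = offsetof(probe_t, func);
		size_t nextoffs = offsetof(probe_t, nextbyfunc);
		probe_t *p;

		/* Walk the equal-func chain without naming its fields. */
		for (p = &b; p != NULL; p = FIELD(p, nextoffs, probe_t *))
			(void) printf("%s:%s\n",
			    FIELD(p, stroffs, const char *), p->name);
		return (0);
	}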
6649 6656
6650 6657 static dtrace_probe_t *
6651 6658 dtrace_hash_lookup(dtrace_hash_t *hash, dtrace_probe_t *template)
6652 6659 {
6653 6660 int hashval = DTRACE_HASHSTR(hash, template);
6654 6661 int ndx = hashval & hash->dth_mask;
6655 6662 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6656 6663
6657 6664 for (; bucket != NULL; bucket = bucket->dthb_next) {
6658 6665 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
6659 6666 return (bucket->dthb_chain);
6660 6667 }
6661 6668
6662 6669 return (NULL);
6663 6670 }
6664 6671
6665 6672 static int
6666 6673 dtrace_hash_collisions(dtrace_hash_t *hash, dtrace_probe_t *template)
6667 6674 {
6668 6675 int hashval = DTRACE_HASHSTR(hash, template);
6669 6676 int ndx = hashval & hash->dth_mask;
6670 6677 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6671 6678
6672 6679 for (; bucket != NULL; bucket = bucket->dthb_next) {
6673 6680 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
6674 6681 return (bucket->dthb_len);
6675 6682 }
6676 6683
6677 6684 	return (0);
6678 6685 }
6679 6686
6680 6687 static void
6681 6688 dtrace_hash_remove(dtrace_hash_t *hash, dtrace_probe_t *probe)
6682 6689 {
6683 6690 int ndx = DTRACE_HASHSTR(hash, probe) & hash->dth_mask;
6684 6691 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6685 6692
6686 6693 dtrace_probe_t **prevp = DTRACE_HASHPREV(hash, probe);
6687 6694 dtrace_probe_t **nextp = DTRACE_HASHNEXT(hash, probe);
6688 6695
6689 6696 /*
6690 6697 * Find the bucket that we're removing this probe from.
6691 6698 */
6692 6699 for (; bucket != NULL; bucket = bucket->dthb_next) {
6693 6700 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, probe))
6694 6701 break;
6695 6702 }
6696 6703
6697 6704 ASSERT(bucket != NULL);
6698 6705
6699 6706 if (*prevp == NULL) {
6700 6707 if (*nextp == NULL) {
6701 6708 /*
6702 6709 * The removed probe was the only probe on this
6703 6710 * bucket; we need to remove the bucket.
6704 6711 */
6705 6712 dtrace_hashbucket_t *b = hash->dth_tab[ndx];
6706 6713
6707 6714 ASSERT(bucket->dthb_chain == probe);
6708 6715 ASSERT(b != NULL);
6709 6716
6710 6717 if (b == bucket) {
6711 6718 hash->dth_tab[ndx] = bucket->dthb_next;
6712 6719 } else {
6713 6720 while (b->dthb_next != bucket)
6714 6721 b = b->dthb_next;
6715 6722 b->dthb_next = bucket->dthb_next;
6716 6723 }
6717 6724
6718 6725 ASSERT(hash->dth_nbuckets > 0);
6719 6726 hash->dth_nbuckets--;
6720 6727 kmem_free(bucket, sizeof (dtrace_hashbucket_t));
6721 6728 return;
6722 6729 }
6723 6730
6724 6731 bucket->dthb_chain = *nextp;
6725 6732 } else {
6726 6733 *(DTRACE_HASHNEXT(hash, *prevp)) = *nextp;
6727 6734 }
6728 6735
6729 6736 if (*nextp != NULL)
6730 6737 *(DTRACE_HASHPREV(hash, *nextp)) = *prevp;
6731 6738 }
6732 6739
6733 6740 /*
6734 6741 * DTrace Utility Functions
6735 6742 *
6736 6743 * These are random utility functions that are _not_ called from probe context.
6737 6744 */
6738 6745 static int
6739 6746 dtrace_badattr(const dtrace_attribute_t *a)
6740 6747 {
6741 6748 return (a->dtat_name > DTRACE_STABILITY_MAX ||
6742 6749 a->dtat_data > DTRACE_STABILITY_MAX ||
6743 6750 a->dtat_class > DTRACE_CLASS_MAX);
6744 6751 }
6745 6752
6746 6753 /*
6747 6754 * Return a duplicate copy of a string. If the specified string is NULL,
6748 6755 * this function returns a zero-length string.
6749 6756 */
6750 6757 static char *
6751 6758 dtrace_strdup(const char *str)
6752 6759 {
6753 6760 char *new = kmem_zalloc((str != NULL ? strlen(str) : 0) + 1, KM_SLEEP);
6754 6761
6755 6762 if (str != NULL)
6756 6763 (void) strcpy(new, str);
6757 6764
6758 6765 return (new);
6759 6766 }
6760 6767
6761 6768 #define DTRACE_ISALPHA(c) \
6762 6769 (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
6763 6770
6764 6771 static int
6765 6772 dtrace_badname(const char *s)
6766 6773 {
6767 6774 char c;
6768 6775
6769 6776 if (s == NULL || (c = *s++) == '\0')
6770 6777 return (0);
6771 6778
6772 6779 if (!DTRACE_ISALPHA(c) && c != '-' && c != '_' && c != '.')
6773 6780 return (1);
6774 6781
6775 6782 while ((c = *s++) != '\0') {
6776 6783 if (!DTRACE_ISALPHA(c) && (c < '0' || c > '9') &&
6777 6784 c != '-' && c != '_' && c != '.' && c != '`')
6778 6785 return (1);
6779 6786 }
6780 6787
6781 6788 return (0);
6782 6789 }
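/*
 * By way of example: under the rules above, "fasttrap", "sched",
 * "ip-rcv" and "vm_fault" are acceptable names, while "123abc" (leading
 * digit) and "foo/bar" ('/' is not a permitted character) are rejected.
 * Note that dtrace_badname() deems NULL and empty strings acceptable;
 * callers that require a non-empty name -- dtrace_register(), below --
 * must check for that case separately.
 */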
6783 6790
6784 6791 static void
6785 6792 dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp)
6786 6793 {
6787 6794 uint32_t priv;
6788 6795
6789 6796 if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
6790 6797 /*
6791 6798 * For DTRACE_PRIV_ALL, the uid and zoneid don't matter.
6792 6799 */
6793 6800 priv = DTRACE_PRIV_ALL;
6794 6801 } else {
6795 6802 *uidp = crgetuid(cr);
6796 6803 *zoneidp = crgetzoneid(cr);
6797 6804
6798 6805 priv = 0;
6799 6806 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE))
6800 6807 priv |= DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER;
6801 6808 else if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE))
6802 6809 priv |= DTRACE_PRIV_USER;
6803 6810 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE))
6804 6811 priv |= DTRACE_PRIV_PROC;
6805 6812 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
6806 6813 priv |= DTRACE_PRIV_OWNER;
6807 6814 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
6808 6815 priv |= DTRACE_PRIV_ZONEOWNER;
6809 6816 }
6810 6817
6811 6818 *privp = priv;
6812 6819 }
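/*
 * To make the mapping above concrete: a credential holding only
 * dtrace_proc and proc_owner yields
 * DTRACE_PRIV_PROC | DTRACE_PRIV_OWNER, with *uidp and *zoneidp filled
 * in from the credential, while an all-privileged (or absent)
 * credential yields DTRACE_PRIV_ALL and leaves *uidp and *zoneidp
 * untouched; consumers of this interface, such as dtrace_match_priv()
 * below, accordingly consult the uid and zoneid only when specific
 * privilege bits are at issue.
 */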
6813 6820
6814 6821 #ifdef DTRACE_ERRDEBUG
6815 6822 static void
6816 6823 dtrace_errdebug(const char *str)
6817 6824 {
6818 6825 int hval = dtrace_hash_str((char *)str) % DTRACE_ERRHASHSZ;
6819 6826 int occupied = 0;
6820 6827
6821 6828 mutex_enter(&dtrace_errlock);
6822 6829 dtrace_errlast = str;
6823 6830 dtrace_errthread = curthread;
6824 6831
6825 6832 while (occupied++ < DTRACE_ERRHASHSZ) {
6826 6833 if (dtrace_errhash[hval].dter_msg == str) {
6827 6834 dtrace_errhash[hval].dter_count++;
6828 6835 goto out;
6829 6836 }
6830 6837
6831 6838 if (dtrace_errhash[hval].dter_msg != NULL) {
6832 6839 hval = (hval + 1) % DTRACE_ERRHASHSZ;
6833 6840 continue;
6834 6841 }
6835 6842
6836 6843 dtrace_errhash[hval].dter_msg = str;
6837 6844 dtrace_errhash[hval].dter_count = 1;
6838 6845 goto out;
6839 6846 }
6840 6847
6841 6848 panic("dtrace: undersized error hash");
6842 6849 out:
6843 6850 mutex_exit(&dtrace_errlock);
6844 6851 }
6845 6852 #endif
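/*
 * The error hash above is open-addressed with linear probing: on a
 * collision the scan advances (modulo DTRACE_ERRHASHSZ) until it finds
 * either the matching message or an empty slot.  Because dter_msg is
 * compared by pointer rather than by content, two distinct string
 * constants with identical text occupy separate slots -- acceptable
 * here, since the messages of interest are compile-time literals.
 */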
6846 6853
6847 6854 /*
6848 6855 * DTrace Matching Functions
6849 6856 *
6850 6857 * These functions are used to match groups of probes, given some elements of
6851 6858 * a probe tuple, or some globbed expressions for elements of a probe tuple.
6852 6859 */
6853 6860 static int
6854 6861 dtrace_match_priv(const dtrace_probe_t *prp, uint32_t priv, uid_t uid,
6855 6862 zoneid_t zoneid)
6856 6863 {
6857 6864 if (priv != DTRACE_PRIV_ALL) {
6858 6865 uint32_t ppriv = prp->dtpr_provider->dtpv_priv.dtpp_flags;
6859 6866 uint32_t match = priv & ppriv;
6860 6867
6861 6868 /*
6862 6869 * No PRIV_DTRACE_* privileges...
6863 6870 */
6864 6871 if ((priv & (DTRACE_PRIV_PROC | DTRACE_PRIV_USER |
6865 6872 DTRACE_PRIV_KERNEL)) == 0)
6866 6873 return (0);
6867 6874
6868 6875 /*
6869 6876 * No matching bits, but there were bits to match...
6870 6877 */
6871 6878 if (match == 0 && ppriv != 0)
6872 6879 return (0);
6873 6880
6874 6881 /*
6875 6882 * Need to have permissions to the process, but don't...
6876 6883 */
6877 6884 if (((ppriv & ~match) & DTRACE_PRIV_OWNER) != 0 &&
6878 6885 uid != prp->dtpr_provider->dtpv_priv.dtpp_uid) {
6879 6886 return (0);
6880 6887 }
6881 6888
6882 6889 /*
6883 6890 * Need to be in the same zone unless we possess the
6884 6891 * privilege to examine all zones.
6885 6892 */
6886 6893 if (((ppriv & ~match) & DTRACE_PRIV_ZONEOWNER) != 0 &&
6887 6894 zoneid != prp->dtpr_provider->dtpv_priv.dtpp_zoneid) {
6888 6895 return (0);
6889 6896 }
6890 6897 }
6891 6898
6892 6899 return (1);
6893 6900 }
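/*
 * A worked example of the checks above: suppose a consumer holds
 * DTRACE_PRIV_PROC | DTRACE_PRIV_OWNER and a provider was registered
 * with DTRACE_PRIV_PROC | DTRACE_PRIV_OWNER | DTRACE_PRIV_ZONEOWNER.
 * DTRACE_PRIV_OWNER is present in the match, so the uid comparison is
 * skipped; DTRACE_PRIV_ZONEOWNER is not, so the consumer sees the
 * provider's probes only from within the provider's zone.
 */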
6894 6901
6895 6902 /*
6896 6903 * dtrace_match_probe compares a dtrace_probe_t to a pre-compiled key, which
6897 6904 * consists of input pattern strings and an ops-vector to evaluate them.
6898 6905 * This function returns >0 for match, 0 for no match, and <0 for error.
6899 6906 */
6900 6907 static int
6901 6908 dtrace_match_probe(const dtrace_probe_t *prp, const dtrace_probekey_t *pkp,
6902 6909 uint32_t priv, uid_t uid, zoneid_t zoneid)
6903 6910 {
6904 6911 dtrace_provider_t *pvp = prp->dtpr_provider;
6905 6912 int rv;
6906 6913
6907 6914 if (pvp->dtpv_defunct)
6908 6915 return (0);
6909 6916
6910 6917 if ((rv = pkp->dtpk_pmatch(pvp->dtpv_name, pkp->dtpk_prov, 0)) <= 0)
6911 6918 return (rv);
6912 6919
6913 6920 if ((rv = pkp->dtpk_mmatch(prp->dtpr_mod, pkp->dtpk_mod, 0)) <= 0)
6914 6921 return (rv);
6915 6922
6916 6923 if ((rv = pkp->dtpk_fmatch(prp->dtpr_func, pkp->dtpk_func, 0)) <= 0)
6917 6924 return (rv);
6918 6925
6919 6926 if ((rv = pkp->dtpk_nmatch(prp->dtpr_name, pkp->dtpk_name, 0)) <= 0)
6920 6927 return (rv);
6921 6928
6922 6929 if (dtrace_match_priv(prp, priv, uid, zoneid) == 0)
6923 6930 return (0);
6924 6931
6925 6932 return (rv);
6926 6933 }
6927 6934
6928 6935 /*
6929 6936 * dtrace_match_glob() is a safe kernel implementation of the gmatch(3GEN)
6930 6937 * interface for matching a glob pattern 'p' to an input string 's'. Unlike
6931 6938 * libc's version, the kernel version only applies to 8-bit ASCII strings.
6932 6939 * In addition, all of the recursion cases except for '*' matching have been
6933 6940 * unwound. For '*', we still implement recursive evaluation, but a depth
6934 6941 * counter is maintained and matching is aborted if we recurse too deep.
6935 6942 * The function returns 0 if no match, >0 if match, and <0 if recursion error.
6936 6943 */
6937 6944 static int
6938 6945 dtrace_match_glob(const char *s, const char *p, int depth)
6939 6946 {
6940 6947 const char *olds;
6941 6948 char s1, c;
6942 6949 int gs;
6943 6950
6944 6951 if (depth > DTRACE_PROBEKEY_MAXDEPTH)
6945 6952 return (-1);
6946 6953
6947 6954 if (s == NULL)
6948 6955 s = ""; /* treat NULL as empty string */
6949 6956
6950 6957 top:
6951 6958 olds = s;
6952 6959 s1 = *s++;
6953 6960
6954 6961 if (p == NULL)
6955 6962 return (0);
6956 6963
6957 6964 if ((c = *p++) == '\0')
6958 6965 return (s1 == '\0');
6959 6966
6960 6967 switch (c) {
6961 6968 case '[': {
6962 6969 int ok = 0, notflag = 0;
6963 6970 char lc = '\0';
6964 6971
6965 6972 if (s1 == '\0')
6966 6973 return (0);
6967 6974
6968 6975 if (*p == '!') {
6969 6976 notflag = 1;
6970 6977 p++;
6971 6978 }
6972 6979
6973 6980 if ((c = *p++) == '\0')
6974 6981 return (0);
6975 6982
6976 6983 do {
6977 6984 if (c == '-' && lc != '\0' && *p != ']') {
6978 6985 if ((c = *p++) == '\0')
6979 6986 return (0);
6980 6987 if (c == '\\' && (c = *p++) == '\0')
6981 6988 return (0);
6982 6989
6983 6990 if (notflag) {
6984 6991 if (s1 < lc || s1 > c)
6985 6992 ok++;
6986 6993 else
6987 6994 return (0);
6988 6995 } else if (lc <= s1 && s1 <= c)
6989 6996 ok++;
6990 6997
6991 6998 } else if (c == '\\' && (c = *p++) == '\0')
6992 6999 return (0);
6993 7000
6994 7001 lc = c; /* save left-hand 'c' for next iteration */
6995 7002
6996 7003 if (notflag) {
6997 7004 if (s1 != c)
6998 7005 ok++;
6999 7006 else
7000 7007 return (0);
7001 7008 } else if (s1 == c)
7002 7009 ok++;
7003 7010
7004 7011 if ((c = *p++) == '\0')
7005 7012 return (0);
7006 7013
7007 7014 } while (c != ']');
7008 7015
7009 7016 if (ok)
7010 7017 goto top;
7011 7018
7012 7019 return (0);
7013 7020 }
7014 7021
7015 7022 case '\\':
7016 7023 if ((c = *p++) == '\0')
7017 7024 return (0);
7018 7025 /*FALLTHRU*/
7019 7026
7020 7027 default:
7021 7028 if (c != s1)
7022 7029 return (0);
7023 7030 /*FALLTHRU*/
7024 7031
7025 7032 case '?':
7026 7033 if (s1 != '\0')
7027 7034 goto top;
7028 7035 return (0);
7029 7036
7030 7037 case '*':
7031 7038 while (*p == '*')
7032 7039 p++; /* consecutive *'s are identical to a single one */
7033 7040
7034 7041 if (*p == '\0')
7035 7042 return (1);
7036 7043
7037 7044 for (s = olds; *s != '\0'; s++) {
7038 7045 if ((gs = dtrace_match_glob(s, p, depth + 1)) != 0)
7039 7046 return (gs);
7040 7047 }
7041 7048
7042 7049 return (0);
7043 7050 }
7044 7051 }
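/*
 * To illustrate the semantics, a few sample evaluations:
 *
 *	dtrace_match_glob("read", "re*", 0)	=> 1
 *	dtrace_match_glob("read", "r?a[cd]", 0)	=> 1
 *	dtrace_match_glob("read", "[!r]*", 0)	=> 0
 *	dtrace_match_glob(NULL, "*", 0)		=> 1 (NULL is the empty
 *						   string, which '*' matches)
 *
 * A pattern that forces deep '*' recursion -- e.g. many '*'s separated
 * by literal characters against an almost-matching string -- is bounded
 * by DTRACE_PROBEKEY_MAXDEPTH, at which point -1 is returned,
 * distinguishing recursion failure (<0) from a simple non-match (0).
 */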
7045 7052
7046 7053 /*ARGSUSED*/
7047 7054 static int
7048 7055 dtrace_match_string(const char *s, const char *p, int depth)
7049 7056 {
7050 7057 return (s != NULL && strcmp(s, p) == 0);
7051 7058 }
7052 7059
7053 7060 /*ARGSUSED*/
7054 7061 static int
7055 7062 dtrace_match_nul(const char *s, const char *p, int depth)
7056 7063 {
7057 7064 return (1); /* always match the empty pattern */
7058 7065 }
7059 7066
7060 7067 /*ARGSUSED*/
7061 7068 static int
7062 7069 dtrace_match_nonzero(const char *s, const char *p, int depth)
7063 7070 {
7064 7071 return (s != NULL && s[0] != '\0');
7065 7072 }
7066 7073
7067 7074 static int
7068 7075 dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid,
7069 7076 zoneid_t zoneid, int (*matched)(dtrace_probe_t *, void *), void *arg)
7070 7077 {
7071 7078 dtrace_probe_t template, *probe;
7072 7079 dtrace_hash_t *hash = NULL;
7073 7080 int len, rc, best = INT_MAX, nmatched = 0;
7074 7081 dtrace_id_t i;
7075 7082
7076 7083 ASSERT(MUTEX_HELD(&dtrace_lock));
7077 7084
7078 7085 /*
7079 7086 * If the probe ID is specified in the key, just lookup by ID and
7080 7087 * invoke the match callback once if a matching probe is found.
7081 7088 */
7082 7089 if (pkp->dtpk_id != DTRACE_IDNONE) {
7083 7090 if ((probe = dtrace_probe_lookup_id(pkp->dtpk_id)) != NULL &&
7084 7091 dtrace_match_probe(probe, pkp, priv, uid, zoneid) > 0) {
7085 7092 if ((*matched)(probe, arg) == DTRACE_MATCH_FAIL)
7086 7093 return (DTRACE_MATCH_FAIL);
7087 7094 nmatched++;
7088 7095 }
7089 7096 return (nmatched);
7090 7097 }
7091 7098
7092 7099 template.dtpr_mod = (char *)pkp->dtpk_mod;
7093 7100 template.dtpr_func = (char *)pkp->dtpk_func;
7094 7101 template.dtpr_name = (char *)pkp->dtpk_name;
7095 7102
7096 7103 /*
7097 7104 * We want to find the most distinctive of the module name, function
7098 7105 * name, and probe name. So for each one that is not a glob pattern or
7099 7106 * empty string, we perform a lookup in the corresponding hash and
7100 7107 * use the hash table with the fewest collisions to do our search.
7101 7108 */
7102 7109 if (pkp->dtpk_mmatch == &dtrace_match_string &&
7103 7110 (len = dtrace_hash_collisions(dtrace_bymod, &template)) < best) {
7104 7111 best = len;
7105 7112 hash = dtrace_bymod;
7106 7113 }
7107 7114
7108 7115 if (pkp->dtpk_fmatch == &dtrace_match_string &&
7109 7116 (len = dtrace_hash_collisions(dtrace_byfunc, &template)) < best) {
7110 7117 best = len;
7111 7118 hash = dtrace_byfunc;
7112 7119 }
7113 7120
7114 7121 if (pkp->dtpk_nmatch == &dtrace_match_string &&
7115 7122 (len = dtrace_hash_collisions(dtrace_byname, &template)) < best) {
7116 7123 best = len;
7117 7124 hash = dtrace_byname;
7118 7125 }
7119 7126
7120 7127 /*
7121 7128 * If we did not select a hash table, iterate over every probe and
7122 7129 * invoke our callback for each one that matches our input probe key.
7123 7130 */
7124 7131 if (hash == NULL) {
7125 7132 for (i = 0; i < dtrace_nprobes; i++) {
7126 7133 if ((probe = dtrace_probes[i]) == NULL ||
7127 7134 dtrace_match_probe(probe, pkp, priv, uid,
7128 7135 zoneid) <= 0)
7129 7136 continue;
7130 7137
7131 7138 nmatched++;
7132 7139
7133 7140 if ((rc = (*matched)(probe, arg)) !=
7134 7141 DTRACE_MATCH_NEXT) {
7135 7142 if (rc == DTRACE_MATCH_FAIL)
7136 7143 return (DTRACE_MATCH_FAIL);
7137 7144 break;
7138 7145 }
7139 7146 }
7140 7147
7141 7148 return (nmatched);
7142 7149 }
7143 7150
7144 7151 /*
7145 7152 * If we selected a hash table, iterate over each probe of the same key
7146 7153 * name and invoke the callback for every probe that matches the other
7147 7154 * attributes of our input probe key.
7148 7155 */
7149 7156 for (probe = dtrace_hash_lookup(hash, &template); probe != NULL;
7150 7157 probe = *(DTRACE_HASHNEXT(hash, probe))) {
7151 7158
7152 7159 if (dtrace_match_probe(probe, pkp, priv, uid, zoneid) <= 0)
7153 7160 continue;
7154 7161
7155 7162 nmatched++;
7156 7163
7157 7164 if ((rc = (*matched)(probe, arg)) != DTRACE_MATCH_NEXT) {
7158 7165 if (rc == DTRACE_MATCH_FAIL)
7159 7166 return (DTRACE_MATCH_FAIL);
7160 7167 break;
7161 7168 }
7162 7169 }
7163 7170
7164 7171 return (nmatched);
7165 7172 }
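/*
 * To make the hash-selection heuristic concrete: for the description
 * syscall::read:entry, the module component is empty (dtrace_match_nul)
 * while the function ("read") and name ("entry") components are exact
 * strings, so dtrace_hash_collisions() is consulted for dtrace_byfunc
 * and dtrace_byname and the shorter chain -- typically the probes with
 * function "read" rather than the many probes named "entry" -- is the
 * one walked.  Only a description with no exact component at all (e.g.
 * syscall:::) falls back to the linear scan of dtrace_probes[].
 */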
7166 7173
7167 7174 /*
7168 7175 * Return the match function that dtrace_match_probe() should use to compare the
7169 7176 * specified pattern with a string. For NULL or empty patterns, we select
7170 7177 * dtrace_match_nul(). For glob pattern strings, we use dtrace_match_glob().
7171 7178 * For non-empty non-glob strings, we use dtrace_match_string().
7172 7179 */
7173 7180 static dtrace_probekey_f *
7174 7181 dtrace_probekey_func(const char *p)
7175 7182 {
7176 7183 char c;
7177 7184
7178 7185 if (p == NULL || *p == '\0')
7179 7186 return (&dtrace_match_nul);
7180 7187
7181 7188 while ((c = *p++) != '\0') {
7182 7189 if (c == '[' || c == '?' || c == '*' || c == '\\')
7183 7190 return (&dtrace_match_glob);
7184 7191 }
7185 7192
7186 7193 return (&dtrace_match_string);
7187 7194 }
7188 7195
7189 7196 /*
7190 7197 * Build a probe comparison key for use with dtrace_match_probe() from the
7191 7198 * given probe description. By convention, a null key only matches anchored
7192 7199 * probes: if each field is the empty string, reset dtpk_fmatch to
7193 7200 * dtrace_match_nonzero().
7194 7201 */
7195 7202 static void
7196 7203 dtrace_probekey(const dtrace_probedesc_t *pdp, dtrace_probekey_t *pkp)
7197 7204 {
7198 7205 pkp->dtpk_prov = pdp->dtpd_provider;
7199 7206 pkp->dtpk_pmatch = dtrace_probekey_func(pdp->dtpd_provider);
7200 7207
7201 7208 pkp->dtpk_mod = pdp->dtpd_mod;
7202 7209 pkp->dtpk_mmatch = dtrace_probekey_func(pdp->dtpd_mod);
7203 7210
7204 7211 pkp->dtpk_func = pdp->dtpd_func;
7205 7212 pkp->dtpk_fmatch = dtrace_probekey_func(pdp->dtpd_func);
7206 7213
7207 7214 pkp->dtpk_name = pdp->dtpd_name;
7208 7215 pkp->dtpk_nmatch = dtrace_probekey_func(pdp->dtpd_name);
7209 7216
7210 7217 pkp->dtpk_id = pdp->dtpd_id;
7211 7218
7212 7219 if (pkp->dtpk_id == DTRACE_IDNONE &&
7213 7220 pkp->dtpk_pmatch == &dtrace_match_nul &&
7214 7221 pkp->dtpk_mmatch == &dtrace_match_nul &&
7215 7222 pkp->dtpk_fmatch == &dtrace_match_nul &&
7216 7223 pkp->dtpk_nmatch == &dtrace_match_nul)
7217 7224 pkp->dtpk_fmatch = &dtrace_match_nonzero;
7218 7225 }
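/*
 * For instance, the empty description (":::", with no probe ID) yields
 * a key whose four match functions are all dtrace_match_nul(); the
 * convention above then substitutes dtrace_match_nonzero() for
 * dtpk_fmatch, so such a key matches exactly the anchored probes --
 * those with a non-empty function name -- and no others.
 */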
7219 7226
7220 7227 /*
7221 7228 * DTrace Provider-to-Framework API Functions
7222 7229 *
7223 7230 * These functions implement much of the Provider-to-Framework API, as
7224 7231 * described in <sys/dtrace.h>. The parts of the API not in this section are
7225 7232 * the functions in the API for probe management (found below), and
7226 7233 * dtrace_probe() itself (found above).
7227 7234 */
7228 7235
7229 7236 /*
7230 7237 * Register the calling provider with the DTrace framework. This should
7231 7238 * generally be called by DTrace providers in their attach(9E) entry point.
7232 7239 */
7233 7240 int
7234 7241 dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv,
7235 7242 cred_t *cr, const dtrace_pops_t *pops, void *arg, dtrace_provider_id_t *idp)
7236 7243 {
7237 7244 dtrace_provider_t *provider;
7238 7245
7239 7246 if (name == NULL || pap == NULL || pops == NULL || idp == NULL) {
7240 7247 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7241 7248 "arguments", name ? name : "<NULL>");
7242 7249 return (EINVAL);
7243 7250 }
7244 7251
7245 7252 if (name[0] == '\0' || dtrace_badname(name)) {
7246 7253 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7247 7254 "provider name", name);
7248 7255 return (EINVAL);
7249 7256 }
7250 7257
7251 7258 if ((pops->dtps_provide == NULL && pops->dtps_provide_module == NULL) ||
7252 7259 pops->dtps_enable == NULL || pops->dtps_disable == NULL ||
7253 7260 pops->dtps_destroy == NULL ||
7254 7261 ((pops->dtps_resume == NULL) != (pops->dtps_suspend == NULL))) {
7255 7262 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7256 7263 "provider ops", name);
7257 7264 return (EINVAL);
7258 7265 }
7259 7266
7260 7267 if (dtrace_badattr(&pap->dtpa_provider) ||
7261 7268 dtrace_badattr(&pap->dtpa_mod) ||
7262 7269 dtrace_badattr(&pap->dtpa_func) ||
7263 7270 dtrace_badattr(&pap->dtpa_name) ||
7264 7271 dtrace_badattr(&pap->dtpa_args)) {
7265 7272 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7266 7273 "provider attributes", name);
7267 7274 return (EINVAL);
7268 7275 }
7269 7276
7270 7277 if (priv & ~DTRACE_PRIV_ALL) {
7271 7278 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7272 7279 "privilege attributes", name);
7273 7280 return (EINVAL);
7274 7281 }
7275 7282
7276 7283 if ((priv & DTRACE_PRIV_KERNEL) &&
7277 7284 (priv & (DTRACE_PRIV_USER | DTRACE_PRIV_OWNER)) &&
7278 7285 pops->dtps_mode == NULL) {
7279 7286 cmn_err(CE_WARN, "failed to register provider '%s': need "
7280 7287 "dtps_mode() op for given privilege attributes", name);
7281 7288 return (EINVAL);
7282 7289 }
7283 7290
7284 7291 provider = kmem_zalloc(sizeof (dtrace_provider_t), KM_SLEEP);
7285 7292 provider->dtpv_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
7286 7293 (void) strcpy(provider->dtpv_name, name);
7287 7294
7288 7295 provider->dtpv_attr = *pap;
7289 7296 provider->dtpv_priv.dtpp_flags = priv;
7290 7297 if (cr != NULL) {
7291 7298 provider->dtpv_priv.dtpp_uid = crgetuid(cr);
7292 7299 provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr);
7293 7300 }
7294 7301 provider->dtpv_pops = *pops;
7295 7302
7296 7303 if (pops->dtps_provide == NULL) {
7297 7304 ASSERT(pops->dtps_provide_module != NULL);
7298 7305 provider->dtpv_pops.dtps_provide =
7299 7306 (void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop;
7300 7307 }
7301 7308
7302 7309 if (pops->dtps_provide_module == NULL) {
7303 7310 ASSERT(pops->dtps_provide != NULL);
7304 7311 provider->dtpv_pops.dtps_provide_module =
7305 7312 (void (*)(void *, struct modctl *))dtrace_nullop;
7306 7313 }
7307 7314
7308 7315 if (pops->dtps_suspend == NULL) {
7309 7316 ASSERT(pops->dtps_resume == NULL);
7310 7317 provider->dtpv_pops.dtps_suspend =
7311 7318 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
7312 7319 provider->dtpv_pops.dtps_resume =
7313 7320 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
7314 7321 }
7315 7322
7316 7323 provider->dtpv_arg = arg;
7317 7324 *idp = (dtrace_provider_id_t)provider;
7318 7325
7319 7326 if (pops == &dtrace_provider_ops) {
7320 7327 ASSERT(MUTEX_HELD(&dtrace_provider_lock));
7321 7328 ASSERT(MUTEX_HELD(&dtrace_lock));
7322 7329 ASSERT(dtrace_anon.dta_enabling == NULL);
7323 7330
7324 7331 /*
7325 7332 * We make sure that the DTrace provider is at the head of
7326 7333 * the provider chain.
7327 7334 */
7328 7335 provider->dtpv_next = dtrace_provider;
7329 7336 dtrace_provider = provider;
7330 7337 return (0);
7331 7338 }
7332 7339
7333 7340 mutex_enter(&dtrace_provider_lock);
7334 7341 mutex_enter(&dtrace_lock);
7335 7342
7336 7343 /*
7337 7344 * If there is at least one provider registered, we'll add this
7338 7345 * provider after the first provider.
7339 7346 */
7340 7347 if (dtrace_provider != NULL) {
7341 7348 provider->dtpv_next = dtrace_provider->dtpv_next;
7342 7349 dtrace_provider->dtpv_next = provider;
7343 7350 } else {
7344 7351 dtrace_provider = provider;
7345 7352 }
7346 7353
7347 7354 if (dtrace_retained != NULL) {
7348 7355 dtrace_enabling_provide(provider);
7349 7356
7350 7357 /*
7351 7358 * Now we need to call dtrace_enabling_matchall() -- which
7352 7359 * will acquire cpu_lock and dtrace_lock. We therefore need
7353 7360 * to drop all of our locks before calling into it...
7354 7361 */
7355 7362 mutex_exit(&dtrace_lock);
7356 7363 mutex_exit(&dtrace_provider_lock);
7357 7364 dtrace_enabling_matchall();
7358 7365
7359 7366 return (0);
7360 7367 }
7361 7368
7362 7369 mutex_exit(&dtrace_lock);
7363 7370 mutex_exit(&dtrace_provider_lock);
7364 7371
7365 7372 return (0);
7366 7373 }
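/*
 * A minimal sketch of the registration sequence as seen from a
 * provider's attach(9E) entry point.  (The "foo" names are
 * hypothetical; field order follows dtrace_pops_t in <sys/dtrace.h>,
 * and the checks above require at least one provide op plus enable,
 * disable and destroy.)
 *
 *	static dtrace_pattr_t foo_attr = {
 *	{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING,
 *	    DTRACE_CLASS_COMMON },
 *	(...four more rows: module, function, name and args attributes)
 *	};
 *
 *	static dtrace_pops_t foo_pops = {
 *		foo_provide, NULL, foo_enable, foo_disable,
 *		NULL, NULL, NULL, NULL, NULL, foo_destroy
 *	};
 *
 *	static dtrace_provider_id_t foo_id;
 *
 *	if (dtrace_register("foo", &foo_attr, DTRACE_PRIV_KERNEL, NULL,
 *	    &foo_pops, NULL, &foo_id) != 0)
 *		return (DDI_FAILURE);
 *
 * The matching detach(9E) path calls dtrace_unregister(foo_id) and must
 * be prepared for EBUSY -- or EAGAIN for defunct providers -- as
 * described below.
 */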
7367 7374
7368 7375 /*
7369 7376 * Unregister the specified provider from the DTrace framework. This should
7370 7377 * generally be called by DTrace providers in their detach(9E) entry point.
7371 7378 */
7372 7379 int
7373 7380 dtrace_unregister(dtrace_provider_id_t id)
7374 7381 {
7375 7382 dtrace_provider_t *old = (dtrace_provider_t *)id;
7376 7383 dtrace_provider_t *prev = NULL;
7377 7384 int i, self = 0, noreap = 0;
7378 7385 dtrace_probe_t *probe, *first = NULL;
7379 7386
7380 7387 if (old->dtpv_pops.dtps_enable ==
7381 7388 (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop) {
7382 7389 /*
7383 7390 * If DTrace itself is the provider, we're called with locks
7384 7391 * already held.
7385 7392 */
7386 7393 ASSERT(old == dtrace_provider);
7387 7394 ASSERT(dtrace_devi != NULL);
7388 7395 ASSERT(MUTEX_HELD(&dtrace_provider_lock));
7389 7396 ASSERT(MUTEX_HELD(&dtrace_lock));
7390 7397 self = 1;
7391 7398
7392 7399 if (dtrace_provider->dtpv_next != NULL) {
7393 7400 /*
7394 7401 * There's another provider here; return failure.
7395 7402 */
7396 7403 return (EBUSY);
7397 7404 }
7398 7405 } else {
7399 7406 mutex_enter(&dtrace_provider_lock);
7400 7407 mutex_enter(&mod_lock);
7401 7408 mutex_enter(&dtrace_lock);
7402 7409 }
7403 7410
7404 7411 /*
7405 7412 * If anyone has /dev/dtrace open, or if there are anonymous enabled
7406 7413 * probes, we refuse to let providers slither away, unless this
7407 7414 * provider has already been explicitly invalidated.
7408 7415 */
7409 7416 if (!old->dtpv_defunct &&
7410 7417 (dtrace_opens || (dtrace_anon.dta_state != NULL &&
7411 7418 dtrace_anon.dta_state->dts_necbs > 0))) {
7412 7419 if (!self) {
7413 7420 mutex_exit(&dtrace_lock);
7414 7421 mutex_exit(&mod_lock);
7415 7422 mutex_exit(&dtrace_provider_lock);
7416 7423 }
7417 7424 return (EBUSY);
7418 7425 }
7419 7426
7420 7427 /*
7421 7428 * Attempt to destroy the probes associated with this provider.
7422 7429 */
7423 7430 for (i = 0; i < dtrace_nprobes; i++) {
7424 7431 if ((probe = dtrace_probes[i]) == NULL)
7425 7432 continue;
7426 7433
7427 7434 if (probe->dtpr_provider != old)
7428 7435 continue;
7429 7436
7430 7437 if (probe->dtpr_ecb == NULL)
7431 7438 continue;
7432 7439
7433 7440 /*
7434 7441 * If we are trying to unregister a defunct provider, and the
7435 7442 * provider was made defunct within the interval dictated by
7436 7443 * dtrace_unregister_defunct_reap, we'll (asynchronously)
7437 7444 * attempt to reap our enablings. To denote that the provider
7438 7445 * should reattempt to unregister itself at some point in the
7439 7446 * future, we will return a differentiable error code (EAGAIN
7440 7447 * instead of EBUSY) in this case.
7441 7448 */
7442 7449 if (dtrace_gethrtime() - old->dtpv_defunct >
7443 7450 dtrace_unregister_defunct_reap)
7444 7451 noreap = 1;
7445 7452
7446 7453 if (!self) {
7447 7454 mutex_exit(&dtrace_lock);
7448 7455 mutex_exit(&mod_lock);
7449 7456 mutex_exit(&dtrace_provider_lock);
7450 7457 }
7451 7458
7452 7459 if (noreap)
7453 7460 return (EBUSY);
7454 7461
7455 7462 (void) taskq_dispatch(dtrace_taskq,
7456 7463 (task_func_t *)dtrace_enabling_reap, NULL, TQ_SLEEP);
7457 7464
7458 7465 return (EAGAIN);
7459 7466 }
7460 7467
7461 7468 /*
7462 7469 * All of the probes for this provider are disabled; we can safely
7463 7470 * remove all of them from their hash chains and from the probe array.
7464 7471 */
7465 7472 for (i = 0; i < dtrace_nprobes; i++) {
7466 7473 if ((probe = dtrace_probes[i]) == NULL)
7467 7474 continue;
7468 7475
7469 7476 if (probe->dtpr_provider != old)
7470 7477 continue;
7471 7478
7472 7479 dtrace_probes[i] = NULL;
7473 7480
7474 7481 dtrace_hash_remove(dtrace_bymod, probe);
7475 7482 dtrace_hash_remove(dtrace_byfunc, probe);
7476 7483 dtrace_hash_remove(dtrace_byname, probe);
7477 7484
7478 7485 if (first == NULL) {
7479 7486 first = probe;
7480 7487 probe->dtpr_nextmod = NULL;
7481 7488 } else {
7482 7489 probe->dtpr_nextmod = first;
7483 7490 first = probe;
7484 7491 }
7485 7492 }
7486 7493
7487 7494 /*
7488 7495 * The provider's probes have been removed from the hash chains and
7489 7496 * from the probe array. Now issue a dtrace_sync() to be sure that
7490 7497 * everyone has cleared out from any probe array processing.
7491 7498 */
7492 7499 dtrace_sync();
7493 7500
7494 7501 for (probe = first; probe != NULL; probe = first) {
7495 7502 first = probe->dtpr_nextmod;
7496 7503
7497 7504 old->dtpv_pops.dtps_destroy(old->dtpv_arg, probe->dtpr_id,
7498 7505 probe->dtpr_arg);
7499 7506 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
7500 7507 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
7501 7508 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
7502 7509 vmem_free(dtrace_arena, (void *)(uintptr_t)(probe->dtpr_id), 1);
7503 7510 kmem_free(probe, sizeof (dtrace_probe_t));
7504 7511 }
7505 7512
7506 7513 if ((prev = dtrace_provider) == old) {
7507 7514 ASSERT(self || dtrace_devi == NULL);
7508 7515 ASSERT(old->dtpv_next == NULL || dtrace_devi == NULL);
7509 7516 dtrace_provider = old->dtpv_next;
7510 7517 } else {
7511 7518 while (prev != NULL && prev->dtpv_next != old)
7512 7519 prev = prev->dtpv_next;
7513 7520
7514 7521 if (prev == NULL) {
7515 7522 panic("attempt to unregister non-existent "
7516 7523 "dtrace provider %p\n", (void *)id);
7517 7524 }
7518 7525
7519 7526 prev->dtpv_next = old->dtpv_next;
7520 7527 }
7521 7528
7522 7529 if (!self) {
7523 7530 mutex_exit(&dtrace_lock);
7524 7531 mutex_exit(&mod_lock);
7525 7532 mutex_exit(&dtrace_provider_lock);
7526 7533 }
7527 7534
7528 7535 kmem_free(old->dtpv_name, strlen(old->dtpv_name) + 1);
7529 7536 kmem_free(old, sizeof (dtrace_provider_t));
7530 7537
7531 7538 return (0);
7532 7539 }
7533 7540
7534 7541 /*
7535 7542 * Invalidate the specified provider. All subsequent probe lookups for the
7536 7543 * specified provider will fail, but its probes will not be removed.
7537 7544 */
7538 7545 void
7539 7546 dtrace_invalidate(dtrace_provider_id_t id)
7540 7547 {
7541 7548 dtrace_provider_t *pvp = (dtrace_provider_t *)id;
7542 7549
7543 7550 ASSERT(pvp->dtpv_pops.dtps_enable !=
7544 7551 (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop);
7545 7552
7546 7553 mutex_enter(&dtrace_provider_lock);
7547 7554 mutex_enter(&dtrace_lock);
7548 7555
7549 7556 pvp->dtpv_defunct = dtrace_gethrtime();
7550 7557
7551 7558 mutex_exit(&dtrace_lock);
7552 7559 mutex_exit(&dtrace_provider_lock);
7553 7560 }
7554 7561
7555 7562 /*
7556 7563 * Indicate whether or not DTrace has attached.
7557 7564 */
7558 7565 int
7559 7566 dtrace_attached(void)
7560 7567 {
7561 7568 /*
7562 7569 * dtrace_provider will be non-NULL iff the DTrace driver has
7563 7570 * attached. (It's non-NULL because DTrace is always itself a
7564 7571 * provider.)
7565 7572 */
7566 7573 return (dtrace_provider != NULL);
7567 7574 }
7568 7575
7569 7576 /*
7570 7577 * Remove all the unenabled probes for the given provider. This function is
7571 7578 * not unlike dtrace_unregister(), except that it doesn't remove the provider
7572 7579 * -- just as many of its associated probes as it can.
7573 7580 */
7574 7581 int
7575 7582 dtrace_condense(dtrace_provider_id_t id)
7576 7583 {
7577 7584 dtrace_provider_t *prov = (dtrace_provider_t *)id;
7578 7585 int i;
7579 7586 dtrace_probe_t *probe;
7580 7587
7581 7588 /*
7582 7589 * Make sure this isn't the dtrace provider itself.
7583 7590 */
7584 7591 ASSERT(prov->dtpv_pops.dtps_enable !=
7585 7592 (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop);
7586 7593
7587 7594 mutex_enter(&dtrace_provider_lock);
7588 7595 mutex_enter(&dtrace_lock);
7589 7596
7590 7597 /*
7591 7598 * Attempt to destroy the probes associated with this provider.
7592 7599 */
7593 7600 for (i = 0; i < dtrace_nprobes; i++) {
7594 7601 if ((probe = dtrace_probes[i]) == NULL)
7595 7602 continue;
7596 7603
7597 7604 if (probe->dtpr_provider != prov)
7598 7605 continue;
7599 7606
7600 7607 if (probe->dtpr_ecb != NULL)
7601 7608 continue;
7602 7609
7603 7610 dtrace_probes[i] = NULL;
7604 7611
7605 7612 dtrace_hash_remove(dtrace_bymod, probe);
7606 7613 dtrace_hash_remove(dtrace_byfunc, probe);
7607 7614 dtrace_hash_remove(dtrace_byname, probe);
7608 7615
7609 7616 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, i + 1,
7610 7617 probe->dtpr_arg);
7611 7618 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
7612 7619 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
7613 7620 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
7614 7621 kmem_free(probe, sizeof (dtrace_probe_t));
7615 7622 vmem_free(dtrace_arena, (void *)((uintptr_t)i + 1), 1);
7616 7623 }
7617 7624
7618 7625 mutex_exit(&dtrace_lock);
7619 7626 mutex_exit(&dtrace_provider_lock);
7620 7627
7621 7628 return (0);
7622 7629 }
7623 7630
7624 7631 /*
7625 7632 * DTrace Probe Management Functions
7626 7633 *
7627 7634 * The functions in this section perform the DTrace probe management,
7628 7635 * including functions to create probes, look-up probes, and call into the
7629 7636 * providers to request that probes be provided. Some of these functions are
7630 7637 * in the Provider-to-Framework API; these functions can be identified by the
7631 7638 * fact that they are not declared "static".
7632 7639 */
7633 7640
7634 7641 /*
7635 7642 * Create a probe with the specified module name, function name, and name.
7636 7643 */
7637 7644 dtrace_id_t
7638 7645 dtrace_probe_create(dtrace_provider_id_t prov, const char *mod,
7639 7646 const char *func, const char *name, int aframes, void *arg)
7640 7647 {
7641 7648 dtrace_probe_t *probe, **probes;
7642 7649 dtrace_provider_t *provider = (dtrace_provider_t *)prov;
7643 7650 dtrace_id_t id;
7644 7651
7645 7652 if (provider == dtrace_provider) {
7646 7653 ASSERT(MUTEX_HELD(&dtrace_lock));
7647 7654 } else {
7648 7655 mutex_enter(&dtrace_lock);
7649 7656 }
7650 7657
7651 7658 id = (dtrace_id_t)(uintptr_t)vmem_alloc(dtrace_arena, 1,
7652 7659 VM_BESTFIT | VM_SLEEP);
7653 7660 probe = kmem_zalloc(sizeof (dtrace_probe_t), KM_SLEEP);
7654 7661
7655 7662 probe->dtpr_id = id;
7656 7663 probe->dtpr_gen = dtrace_probegen++;
7657 7664 probe->dtpr_mod = dtrace_strdup(mod);
7658 7665 probe->dtpr_func = dtrace_strdup(func);
7659 7666 probe->dtpr_name = dtrace_strdup(name);
7660 7667 probe->dtpr_arg = arg;
7661 7668 probe->dtpr_aframes = aframes;
7662 7669 probe->dtpr_provider = provider;
7663 7670
7664 7671 dtrace_hash_add(dtrace_bymod, probe);
7665 7672 dtrace_hash_add(dtrace_byfunc, probe);
7666 7673 dtrace_hash_add(dtrace_byname, probe);
7667 7674
7668 7675 if (id - 1 >= dtrace_nprobes) {
7669 7676 size_t osize = dtrace_nprobes * sizeof (dtrace_probe_t *);
7670 7677 size_t nsize = osize << 1;
7671 7678
7672 7679 if (nsize == 0) {
7673 7680 ASSERT(osize == 0);
7674 7681 ASSERT(dtrace_probes == NULL);
7675 7682 nsize = sizeof (dtrace_probe_t *);
7676 7683 }
7677 7684
7678 7685 probes = kmem_zalloc(nsize, KM_SLEEP);
7679 7686
7680 7687 if (dtrace_probes == NULL) {
7681 7688 ASSERT(osize == 0);
7682 7689 dtrace_probes = probes;
7683 7690 dtrace_nprobes = 1;
7684 7691 } else {
7685 7692 dtrace_probe_t **oprobes = dtrace_probes;
7686 7693
7687 7694 bcopy(oprobes, probes, osize);
7688 7695 dtrace_membar_producer();
7689 7696 dtrace_probes = probes;
7690 7697
7691 7698 dtrace_sync();
7692 7699
7693 7700 /*
7694 7701 * All CPUs are now seeing the new probes array; we can
7695 7702 * safely free the old array.
7696 7703 */
7697 7704 kmem_free(oprobes, osize);
7698 7705 dtrace_nprobes <<= 1;
7699 7706 }
7700 7707
7701 7708 ASSERT(id - 1 < dtrace_nprobes);
7702 7709 }
7703 7710
7704 7711 ASSERT(dtrace_probes[id - 1] == NULL);
7705 7712 dtrace_probes[id - 1] = probe;
7706 7713
7707 7714 if (provider != dtrace_provider)
7708 7715 mutex_exit(&dtrace_lock);
7709 7716
7710 7717 return (id);
7711 7718 }
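/*
 * A provider's dtps_provide() implementation conventionally guards
 * against re-creating its probes by consulting dtrace_probe_lookup()
 * first.  A sketch, continuing the hypothetical "foo" provider:
 *
 *	static void
 *	foo_provide(void *arg, const dtrace_probedesc_t *desc)
 *	{
 *		if (dtrace_probe_lookup(foo_id, NULL, "foo_func",
 *		    "entry") != 0)
 *			return;
 *
 *		(void) dtrace_probe_create(foo_id, NULL, "foo_func",
 *		    "entry", 0, NULL);
 *	}
 *
 * Note that dtrace_probe_lookup() returns 0 ("not found") rather than
 * an ID on failure; valid probe IDs begin at 1, as the id - 1 indexing
 * above reflects.
 */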
7712 7719
7713 7720 static dtrace_probe_t *
7714 7721 dtrace_probe_lookup_id(dtrace_id_t id)
7715 7722 {
7716 7723 ASSERT(MUTEX_HELD(&dtrace_lock));
7717 7724
7718 7725 if (id == 0 || id > dtrace_nprobes)
7719 7726 return (NULL);
7720 7727
7721 7728 return (dtrace_probes[id - 1]);
7722 7729 }
7723 7730
7724 7731 static int
7725 7732 dtrace_probe_lookup_match(dtrace_probe_t *probe, void *arg)
7726 7733 {
7727 7734 *((dtrace_id_t *)arg) = probe->dtpr_id;
7728 7735
7729 7736 return (DTRACE_MATCH_DONE);
7730 7737 }
7731 7738
7732 7739 /*
7733 7740 * Look up a probe based on provider and one or more of module name, function
7734 7741 * name and probe name.
7735 7742 */
7736 7743 dtrace_id_t
7737 7744 dtrace_probe_lookup(dtrace_provider_id_t prid, const char *mod,
7738 7745 const char *func, const char *name)
7739 7746 {
7740 7747 dtrace_probekey_t pkey;
7741 7748 dtrace_id_t id;
7742 7749 int match;
7743 7750
7744 7751 pkey.dtpk_prov = ((dtrace_provider_t *)prid)->dtpv_name;
7745 7752 pkey.dtpk_pmatch = &dtrace_match_string;
7746 7753 pkey.dtpk_mod = mod;
7747 7754 pkey.dtpk_mmatch = mod ? &dtrace_match_string : &dtrace_match_nul;
7748 7755 pkey.dtpk_func = func;
7749 7756 pkey.dtpk_fmatch = func ? &dtrace_match_string : &dtrace_match_nul;
7750 7757 pkey.dtpk_name = name;
7751 7758 pkey.dtpk_nmatch = name ? &dtrace_match_string : &dtrace_match_nul;
7752 7759 pkey.dtpk_id = DTRACE_IDNONE;
7753 7760
7754 7761 mutex_enter(&dtrace_lock);
7755 7762 match = dtrace_match(&pkey, DTRACE_PRIV_ALL, 0, 0,
7756 7763 dtrace_probe_lookup_match, &id);
7757 7764 mutex_exit(&dtrace_lock);
7758 7765
7759 7766 ASSERT(match == 1 || match == 0);
7760 7767 return (match ? id : 0);
7761 7768 }
7762 7769
7763 7770 /*
7764 7771 * Returns the probe argument associated with the specified probe.
7765 7772 */
7766 7773 void *
7767 7774 dtrace_probe_arg(dtrace_provider_id_t id, dtrace_id_t pid)
7768 7775 {
7769 7776 dtrace_probe_t *probe;
7770 7777 void *rval = NULL;
7771 7778
7772 7779 mutex_enter(&dtrace_lock);
7773 7780
7774 7781 if ((probe = dtrace_probe_lookup_id(pid)) != NULL &&
7775 7782 probe->dtpr_provider == (dtrace_provider_t *)id)
7776 7783 rval = probe->dtpr_arg;
7777 7784
7778 7785 mutex_exit(&dtrace_lock);
7779 7786
7780 7787 return (rval);
7781 7788 }
7782 7789
7783 7790 /*
7784 7791 * Copy a probe into a probe description.
7785 7792 */
7786 7793 static void
7787 7794 dtrace_probe_description(const dtrace_probe_t *prp, dtrace_probedesc_t *pdp)
7788 7795 {
7789 7796 bzero(pdp, sizeof (dtrace_probedesc_t));
7790 7797 pdp->dtpd_id = prp->dtpr_id;
7791 7798
7792 7799 (void) strncpy(pdp->dtpd_provider,
7793 7800 prp->dtpr_provider->dtpv_name, DTRACE_PROVNAMELEN - 1);
7794 7801
7795 7802 (void) strncpy(pdp->dtpd_mod, prp->dtpr_mod, DTRACE_MODNAMELEN - 1);
7796 7803 (void) strncpy(pdp->dtpd_func, prp->dtpr_func, DTRACE_FUNCNAMELEN - 1);
7797 7804 (void) strncpy(pdp->dtpd_name, prp->dtpr_name, DTRACE_NAMELEN - 1);
7798 7805 }
7799 7806
7800 7807 /*
7801 7808 * Called to indicate that a probe -- or probes -- should be provided by a
7802 7809 * specified provider. If the specified description is NULL, the provider will
7803 7810 * be told to provide all of its probes. (This is done whenever a new
7804 7811 * consumer comes along, or whenever a retained enabling is to be matched.) If
7805 7812 * the specified description is non-NULL, the provider is given the
7806 7813 * opportunity to dynamically provide the specified probe, allowing providers
7807 7814 * to support the creation of probes on-the-fly. (So-called _autocreated_
7808 7815 * probes.) If the provider is NULL, the operations will be applied to all
7809 7816 * providers; if the provider is non-NULL the operations will only be applied
7810 7817 * to the specified provider. The dtrace_provider_lock must be held, and the
7811 7818 * dtrace_lock must _not_ be held -- the provider's dtps_provide() operation
7812 7819 * will need to grab the dtrace_lock when it reenters the framework through
7813 7820 * dtrace_probe_lookup(), dtrace_probe_create(), etc.
7814 7821 */
7815 7822 static void
7816 7823 dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv)
7817 7824 {
7818 7825 struct modctl *ctl;
7819 7826 int all = 0;
7820 7827
7821 7828 ASSERT(MUTEX_HELD(&dtrace_provider_lock));
7822 7829
7823 7830 if (prv == NULL) {
7824 7831 all = 1;
7825 7832 prv = dtrace_provider;
7826 7833 }
7827 7834
7828 7835 do {
7829 7836 /*
7830 7837 * First, call the blanket provide operation.
7831 7838 */
7832 7839 prv->dtpv_pops.dtps_provide(prv->dtpv_arg, desc);
7833 7840
7834 7841 /*
7835 7842 * Now call the per-module provide operation. We will grab
7836 7843 * mod_lock to prevent the list from being modified. Note
7837 7844 * that this also prevents the mod_busy bits from changing.
7838 7845 * (mod_busy can only be changed with mod_lock held.)
7839 7846 */
7840 7847 mutex_enter(&mod_lock);
7841 7848
7842 7849 ctl = &modules;
7843 7850 do {
7844 7851 if (ctl->mod_busy || ctl->mod_mp == NULL)
7845 7852 continue;
7846 7853
7847 7854 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
7848 7855
7849 7856 } while ((ctl = ctl->mod_next) != &modules);
7850 7857
7851 7858 mutex_exit(&mod_lock);
7852 7859 } while (all && (prv = prv->dtpv_next) != NULL);
7853 7860 }
7854 7861
7855 7862 /*
7856 7863 * Iterate over each probe, and call the Framework-to-Provider API function
7857 7864 * denoted by offs.
7858 7865 */
7859 7866 static void
7860 7867 dtrace_probe_foreach(uintptr_t offs)
7861 7868 {
7862 7869 dtrace_provider_t *prov;
7863 7870 void (*func)(void *, dtrace_id_t, void *);
7864 7871 dtrace_probe_t *probe;
7865 7872 dtrace_icookie_t cookie;
7866 7873 int i;
7867 7874
7868 7875 /*
7869 7876 * We disable interrupts to walk through the probe array. This is
7870 7877 * safe -- the dtrace_sync() in dtrace_unregister() assures that we
7871 7878 * won't see stale data.
7872 7879 */
7873 7880 cookie = dtrace_interrupt_disable();
7874 7881
7875 7882 for (i = 0; i < dtrace_nprobes; i++) {
7876 7883 if ((probe = dtrace_probes[i]) == NULL)
7877 7884 continue;
7878 7885
7879 7886 if (probe->dtpr_ecb == NULL) {
7880 7887 /*
7881 7888 * This probe isn't enabled -- don't call the function.
7882 7889 */
7883 7890 continue;
7884 7891 }
7885 7892
7886 7893 prov = probe->dtpr_provider;
7887 7894 func = *((void(**)(void *, dtrace_id_t, void *))
7888 7895 ((uintptr_t)&prov->dtpv_pops + offs));
7889 7896
7890 7897 func(prov->dtpv_arg, i + 1, probe->dtpr_arg);
7891 7898 }
7892 7899
7893 7900 dtrace_interrupt_enable(cookie);
7894 7901 }
7895 7902
7896 7903 static int
7897 7904 dtrace_probe_enable(const dtrace_probedesc_t *desc, dtrace_enabling_t *enab)
7898 7905 {
7899 7906 dtrace_probekey_t pkey;
7900 7907 uint32_t priv;
7901 7908 uid_t uid;
7902 7909 zoneid_t zoneid;
7903 7910
7904 7911 ASSERT(MUTEX_HELD(&dtrace_lock));
7905 7912 dtrace_ecb_create_cache = NULL;
7906 7913
7907 7914 if (desc == NULL) {
7908 7915 /*
7909 7916 * If we're passed a NULL description, we're being asked to
7910 7917 * create an ECB with a NULL probe.
7911 7918 */
7912 7919 (void) dtrace_ecb_create_enable(NULL, enab);
7913 7920 return (0);
7914 7921 }
7915 7922
7916 7923 dtrace_probekey(desc, &pkey);
7917 7924 dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred,
7918 7925 &priv, &uid, &zoneid);
7919 7926
7920 7927 return (dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable,
7921 7928 enab));
7922 7929 }
7923 7930
7924 7931 /*
7925 7932 * DTrace Helper Provider Functions
7926 7933 */
7927 7934 static void
7928 7935 dtrace_dofattr2attr(dtrace_attribute_t *attr, const dof_attr_t dofattr)
7929 7936 {
7930 7937 attr->dtat_name = DOF_ATTR_NAME(dofattr);
7931 7938 attr->dtat_data = DOF_ATTR_DATA(dofattr);
7932 7939 attr->dtat_class = DOF_ATTR_CLASS(dofattr);
7933 7940 }
7934 7941
7935 7942 static void
7936 7943 dtrace_dofprov2hprov(dtrace_helper_provdesc_t *hprov,
7937 7944 const dof_provider_t *dofprov, char *strtab)
7938 7945 {
7939 7946 hprov->dthpv_provname = strtab + dofprov->dofpv_name;
7940 7947 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_provider,
7941 7948 dofprov->dofpv_provattr);
7942 7949 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_mod,
7943 7950 dofprov->dofpv_modattr);
7944 7951 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_func,
7945 7952 dofprov->dofpv_funcattr);
7946 7953 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_name,
7947 7954 dofprov->dofpv_nameattr);
7948 7955 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_args,
7949 7956 dofprov->dofpv_argsattr);
7950 7957 }
7951 7958
7952 7959 static void
7953 7960 dtrace_helper_provide_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
7954 7961 {
7955 7962 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
7956 7963 dof_hdr_t *dof = (dof_hdr_t *)daddr;
7957 7964 dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
7958 7965 dof_provider_t *provider;
7959 7966 dof_probe_t *probe;
7960 7967 uint32_t *off, *enoff;
7961 7968 uint8_t *arg;
7962 7969 char *strtab;
7963 7970 uint_t i, nprobes;
7964 7971 dtrace_helper_provdesc_t dhpv;
7965 7972 dtrace_helper_probedesc_t dhpb;
7966 7973 dtrace_meta_t *meta = dtrace_meta_pid;
7967 7974 dtrace_mops_t *mops = &meta->dtm_mops;
7968 7975 void *parg;
7969 7976
7970 7977 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
7971 7978 str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
7972 7979 provider->dofpv_strtab * dof->dofh_secsize);
7973 7980 prb_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
7974 7981 provider->dofpv_probes * dof->dofh_secsize);
7975 7982 arg_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
7976 7983 provider->dofpv_prargs * dof->dofh_secsize);
7977 7984 off_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
7978 7985 provider->dofpv_proffs * dof->dofh_secsize);
7979 7986
7980 7987 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
7981 7988 off = (uint32_t *)(uintptr_t)(daddr + off_sec->dofs_offset);
7982 7989 arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
7983 7990 enoff = NULL;
7984 7991
7985 7992 /*
7986 7993 * See dtrace_helper_provider_validate().
7987 7994 */
7988 7995 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
7989 7996 provider->dofpv_prenoffs != DOF_SECT_NONE) {
7990 7997 enoff_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
7991 7998 provider->dofpv_prenoffs * dof->dofh_secsize);
7992 7999 enoff = (uint32_t *)(uintptr_t)(daddr + enoff_sec->dofs_offset);
7993 8000 }
7994 8001
7995 8002 nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
7996 8003
7997 8004 /*
7998 8005 * Create the provider.
7999 8006 */
8000 8007 dtrace_dofprov2hprov(&dhpv, provider, strtab);
8001 8008
8002 8009 if ((parg = mops->dtms_provide_pid(meta->dtm_arg, &dhpv, pid)) == NULL)
8003 8010 return;
8004 8011
8005 8012 meta->dtm_count++;
8006 8013
8007 8014 /*
8008 8015 * Create the probes.
8009 8016 */
8010 8017 for (i = 0; i < nprobes; i++) {
8011 8018 probe = (dof_probe_t *)(uintptr_t)(daddr +
8012 8019 prb_sec->dofs_offset + i * prb_sec->dofs_entsize);
8013 8020
8014 8021 dhpb.dthpb_mod = dhp->dofhp_mod;
8015 8022 dhpb.dthpb_func = strtab + probe->dofpr_func;
8016 8023 dhpb.dthpb_name = strtab + probe->dofpr_name;
8017 8024 dhpb.dthpb_base = probe->dofpr_addr;
8018 8025 dhpb.dthpb_offs = off + probe->dofpr_offidx;
8019 8026 dhpb.dthpb_noffs = probe->dofpr_noffs;
8020 8027 if (enoff != NULL) {
8021 8028 dhpb.dthpb_enoffs = enoff + probe->dofpr_enoffidx;
8022 8029 dhpb.dthpb_nenoffs = probe->dofpr_nenoffs;
8023 8030 } else {
8024 8031 dhpb.dthpb_enoffs = NULL;
8025 8032 dhpb.dthpb_nenoffs = 0;
8026 8033 }
8027 8034 dhpb.dthpb_args = arg + probe->dofpr_argidx;
8028 8035 dhpb.dthpb_nargc = probe->dofpr_nargc;
8029 8036 dhpb.dthpb_xargc = probe->dofpr_xargc;
8030 8037 dhpb.dthpb_ntypes = strtab + probe->dofpr_nargv;
8031 8038 dhpb.dthpb_xtypes = strtab + probe->dofpr_xargv;
8032 8039
8033 8040 mops->dtms_create_probe(meta->dtm_arg, parg, &dhpb);
8034 8041 }
8035 8042 }
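/*
 * Put another way: a DOF provider section is a small graph of section
 * indices.  dofpv_strtab, dofpv_probes, dofpv_prargs, dofpv_proffs (and
 * optionally dofpv_prenoffs) each name another section -- string table,
 * probe array, argument mappings, offsets and is-enabled offsets,
 * respectively -- located as daddr + dofh_secoff + index * dofh_secsize.
 * These references are sanity-checked in advance by
 * dtrace_helper_provider_validate(), which is why they can be
 * dereferenced here without further checks.
 */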
8036 8043
8037 8044 static void
8038 8045 dtrace_helper_provide(dof_helper_t *dhp, pid_t pid)
8039 8046 {
8040 8047 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
8041 8048 dof_hdr_t *dof = (dof_hdr_t *)daddr;
8042 8049 int i;
8043 8050
8044 8051 ASSERT(MUTEX_HELD(&dtrace_meta_lock));
8045 8052
8046 8053 for (i = 0; i < dof->dofh_secnum; i++) {
8047 8054 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
8048 8055 dof->dofh_secoff + i * dof->dofh_secsize);
8049 8056
8050 8057 if (sec->dofs_type != DOF_SECT_PROVIDER)
8051 8058 continue;
8052 8059
8053 8060 dtrace_helper_provide_one(dhp, sec, pid);
8054 8061 }
8055 8062
8056 8063 /*
8057 8064 * We may have just created probes, so we must now rematch against
8058 8065 * any retained enablings. Note that this call will acquire both
8059 8066 * cpu_lock and dtrace_lock; the fact that we are holding
8060 8067 * dtrace_meta_lock now is what defines the ordering with respect to
8061 8068 * these three locks.
8062 8069 */
8063 8070 dtrace_enabling_matchall();
8064 8071 }
8065 8072
8066 8073 static void
8067 8074 dtrace_helper_provider_remove_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
8068 8075 {
8069 8076 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
8070 8077 dof_hdr_t *dof = (dof_hdr_t *)daddr;
8071 8078 dof_sec_t *str_sec;
8072 8079 dof_provider_t *provider;
8073 8080 char *strtab;
8074 8081 dtrace_helper_provdesc_t dhpv;
8075 8082 dtrace_meta_t *meta = dtrace_meta_pid;
8076 8083 dtrace_mops_t *mops = &meta->dtm_mops;
8077 8084
8078 8085 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
8079 8086 str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8080 8087 provider->dofpv_strtab * dof->dofh_secsize);
8081 8088
8082 8089 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
8083 8090
8084 8091 /*
8085 8092 * Create the provider.
8086 8093 */
8087 8094 dtrace_dofprov2hprov(&dhpv, provider, strtab);
8088 8095
8089 8096 mops->dtms_remove_pid(meta->dtm_arg, &dhpv, pid);
8090 8097
8091 8098 meta->dtm_count--;
8092 8099 }
8093 8100
8094 8101 static void
8095 8102 dtrace_helper_provider_remove(dof_helper_t *dhp, pid_t pid)
8096 8103 {
8097 8104 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
8098 8105 dof_hdr_t *dof = (dof_hdr_t *)daddr;
8099 8106 int i;
8100 8107
8101 8108 ASSERT(MUTEX_HELD(&dtrace_meta_lock));
8102 8109
8103 8110 for (i = 0; i < dof->dofh_secnum; i++) {
8104 8111 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
8105 8112 dof->dofh_secoff + i * dof->dofh_secsize);
8106 8113
8107 8114 if (sec->dofs_type != DOF_SECT_PROVIDER)
8108 8115 continue;
8109 8116
8110 8117 dtrace_helper_provider_remove_one(dhp, sec, pid);
8111 8118 }
8112 8119 }
8113 8120
8114 8121 /*
8115 8122 * DTrace Meta Provider-to-Framework API Functions
8116 8123 *
8117 8124 * These functions implement the Meta Provider-to-Framework API, as described
8118 8125 * in <sys/dtrace.h>.
8119 8126 */
8120 8127 int
8121 8128 dtrace_meta_register(const char *name, const dtrace_mops_t *mops, void *arg,
8122 8129 dtrace_meta_provider_id_t *idp)
8123 8130 {
8124 8131 dtrace_meta_t *meta;
8125 8132 dtrace_helpers_t *help, *next;
8126 8133 int i;
8127 8134
8128 8135 *idp = DTRACE_METAPROVNONE;
8129 8136
8130 8137 /*
8131 8138 * We strictly don't need the name, but we hold onto it for
8132 8139 * debuggability. All hail error queues!
8133 8140 */
8134 8141 if (name == NULL) {
8135 8142 cmn_err(CE_WARN, "failed to register meta-provider: "
8136 8143 "invalid name");
8137 8144 return (EINVAL);
8138 8145 }
8139 8146
8140 8147 if (mops == NULL ||
8141 8148 mops->dtms_create_probe == NULL ||
8142 8149 mops->dtms_provide_pid == NULL ||
8143 8150 mops->dtms_remove_pid == NULL) {
8144 8151 cmn_err(CE_WARN, "failed to register meta-provider %s: "
8145 8152 "invalid ops", name);
8146 8153 return (EINVAL);
8147 8154 }
8148 8155
8149 8156 meta = kmem_zalloc(sizeof (dtrace_meta_t), KM_SLEEP);
8150 8157 meta->dtm_mops = *mops;
8151 8158 meta->dtm_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
8152 8159 (void) strcpy(meta->dtm_name, name);
8153 8160 meta->dtm_arg = arg;
8154 8161
8155 8162 mutex_enter(&dtrace_meta_lock);
8156 8163 mutex_enter(&dtrace_lock);
8157 8164
8158 8165 if (dtrace_meta_pid != NULL) {
8159 8166 mutex_exit(&dtrace_lock);
8160 8167 mutex_exit(&dtrace_meta_lock);
8161 8168 cmn_err(CE_WARN, "failed to register meta-provider %s: "
8162 8169 "user-land meta-provider exists", name);
8163 8170 kmem_free(meta->dtm_name, strlen(meta->dtm_name) + 1);
8164 8171 kmem_free(meta, sizeof (dtrace_meta_t));
8165 8172 return (EINVAL);
8166 8173 }
8167 8174
8168 8175 dtrace_meta_pid = meta;
8169 8176 *idp = (dtrace_meta_provider_id_t)meta;
8170 8177
8171 8178 /*
8172 8179 * If there are providers and probes ready to go, pass them
8173 8180 * off to the new meta provider now.
8174 8181 */
8175 8182
8176 8183 help = dtrace_deferred_pid;
8177 8184 dtrace_deferred_pid = NULL;
8178 8185
8179 8186 mutex_exit(&dtrace_lock);
8180 8187
8181 8188 while (help != NULL) {
8182 8189 for (i = 0; i < help->dthps_nprovs; i++) {
8183 8190 dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
8184 8191 help->dthps_pid);
8185 8192 }
8186 8193
8187 8194 next = help->dthps_next;
8188 8195 help->dthps_next = NULL;
8189 8196 help->dthps_prev = NULL;
8190 8197 help->dthps_deferred = 0;
8191 8198 help = next;
8192 8199 }
8193 8200
8194 8201 mutex_exit(&dtrace_meta_lock);
8195 8202
8196 8203 return (0);
8197 8204 }
8198 8205
8199 8206 int
8200 8207 dtrace_meta_unregister(dtrace_meta_provider_id_t id)
8201 8208 {
8202 8209 dtrace_meta_t **pp, *old = (dtrace_meta_t *)id;
8203 8210
8204 8211 mutex_enter(&dtrace_meta_lock);
8205 8212 mutex_enter(&dtrace_lock);
8206 8213
8207 8214 if (old == dtrace_meta_pid) {
8208 8215 pp = &dtrace_meta_pid;
8209 8216 } else {
8210 8217 panic("attempt to unregister non-existent "
8211 8218 "dtrace meta-provider %p\n", (void *)old);
8212 8219 }
8213 8220
8214 8221 if (old->dtm_count != 0) {
8215 8222 mutex_exit(&dtrace_lock);
8216 8223 mutex_exit(&dtrace_meta_lock);
8217 8224 return (EBUSY);
8218 8225 }
8219 8226
8220 8227 *pp = NULL;
8221 8228
8222 8229 mutex_exit(&dtrace_lock);
8223 8230 mutex_exit(&dtrace_meta_lock);
8224 8231
8225 8232 kmem_free(old->dtm_name, strlen(old->dtm_name) + 1);
8226 8233 kmem_free(old, sizeof (dtrace_meta_t));
8227 8234
8228 8235 return (0);
8229 8236 }
8230 8237
8231 8238
8232 8239 /*
8233 8240 * DTrace DIF Object Functions
8234 8241 */
8235 8242 static int
8236 8243 dtrace_difo_err(uint_t pc, const char *format, ...)
8237 8244 {
8238 8245 if (dtrace_err_verbose) {
8239 8246 va_list alist;
8240 8247
8241 8248 (void) uprintf("dtrace DIF object error: [%u]: ", pc);
8242 8249 va_start(alist, format);
8243 8250 (void) vuprintf(format, alist);
8244 8251 va_end(alist);
8245 8252 }
8246 8253
8247 8254 #ifdef DTRACE_ERRDEBUG
8248 8255 dtrace_errdebug(format);
8249 8256 #endif
8250 8257 return (1);
8251 8258 }
8252 8259
8253 8260 /*
8254 8261 * Validate a DTrace DIF object by checking the IR instructions. The following
8255 8262 * rules are currently enforced by dtrace_difo_validate():
8256 8263 *
8257 8264 * 1. Each instruction must have a valid opcode
8258 8265 * 2. Each register, string, variable, or subroutine reference must be valid
8259 8266 * 3. No instruction can modify register %r0 (must be zero)
8260 8267 * 4. All instruction reserved bits must be set to zero
8261 8268 * 5. The last instruction must be a "ret" instruction
8262 8269 * 6. All branch targets must reference a valid instruction _after_ the branch
8263 8270 */
8264 8271 static int
8265 8272 dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
8266 8273 cred_t *cr)
8267 8274 {
8268 8275 int err = 0, i;
8269 8276 int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
8270 8277 int kcheckload;
8271 8278 uint_t pc;
8272 8279
8273 8280 kcheckload = cr == NULL ||
8274 8281 (vstate->dtvs_state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) == 0;
8275 8282
8276 8283 dp->dtdo_destructive = 0;
8277 8284
8278 8285 for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
8279 8286 dif_instr_t instr = dp->dtdo_buf[pc];
8280 8287
8281 8288 uint_t r1 = DIF_INSTR_R1(instr);
8282 8289 uint_t r2 = DIF_INSTR_R2(instr);
8283 8290 uint_t rd = DIF_INSTR_RD(instr);
8284 8291 uint_t rs = DIF_INSTR_RS(instr);
8285 8292 uint_t label = DIF_INSTR_LABEL(instr);
8286 8293 uint_t v = DIF_INSTR_VAR(instr);
8287 8294 uint_t subr = DIF_INSTR_SUBR(instr);
8288 8295 uint_t type = DIF_INSTR_TYPE(instr);
8289 8296 uint_t op = DIF_INSTR_OP(instr);
8290 8297
8291 8298 switch (op) {
8292 8299 case DIF_OP_OR:
8293 8300 case DIF_OP_XOR:
8294 8301 case DIF_OP_AND:
8295 8302 case DIF_OP_SLL:
8296 8303 case DIF_OP_SRL:
8297 8304 case DIF_OP_SRA:
8298 8305 case DIF_OP_SUB:
8299 8306 case DIF_OP_ADD:
8300 8307 case DIF_OP_MUL:
8301 8308 case DIF_OP_SDIV:
8302 8309 case DIF_OP_UDIV:
8303 8310 case DIF_OP_SREM:
8304 8311 case DIF_OP_UREM:
8305 8312 case DIF_OP_COPYS:
8306 8313 if (r1 >= nregs)
8307 8314 err += efunc(pc, "invalid register %u\n", r1);
8308 8315 if (r2 >= nregs)
8309 8316 err += efunc(pc, "invalid register %u\n", r2);
8310 8317 if (rd >= nregs)
8311 8318 err += efunc(pc, "invalid register %u\n", rd);
8312 8319 if (rd == 0)
8313 8320 err += efunc(pc, "cannot write to %r0\n");
8314 8321 break;
8315 8322 case DIF_OP_NOT:
8316 8323 case DIF_OP_MOV:
8317 8324 case DIF_OP_ALLOCS:
8318 8325 if (r1 >= nregs)
8319 8326 err += efunc(pc, "invalid register %u\n", r1);
8320 8327 if (r2 != 0)
8321 8328 err += efunc(pc, "non-zero reserved bits\n");
8322 8329 if (rd >= nregs)
8323 8330 err += efunc(pc, "invalid register %u\n", rd);
8324 8331 if (rd == 0)
8325 8332 err += efunc(pc, "cannot write to %r0\n");
8326 8333 break;
8327 8334 case DIF_OP_LDSB:
8328 8335 case DIF_OP_LDSH:
8329 8336 case DIF_OP_LDSW:
8330 8337 case DIF_OP_LDUB:
8331 8338 case DIF_OP_LDUH:
8332 8339 case DIF_OP_LDUW:
8333 8340 case DIF_OP_LDX:
8334 8341 if (r1 >= nregs)
8335 8342 err += efunc(pc, "invalid register %u\n", r1);
8336 8343 if (r2 != 0)
8337 8344 err += efunc(pc, "non-zero reserved bits\n");
8338 8345 if (rd >= nregs)
8339 8346 err += efunc(pc, "invalid register %u\n", rd);
8340 8347 if (rd == 0)
8341 8348 err += efunc(pc, "cannot write to %r0\n");
8342 8349 if (kcheckload)
8343 8350 dp->dtdo_buf[pc] = DIF_INSTR_LOAD(op +
8344 8351 DIF_OP_RLDSB - DIF_OP_LDSB, r1, rd);
8345 8352 break;
8346 8353 case DIF_OP_RLDSB:
8347 8354 case DIF_OP_RLDSH:
8348 8355 case DIF_OP_RLDSW:
8349 8356 case DIF_OP_RLDUB:
8350 8357 case DIF_OP_RLDUH:
8351 8358 case DIF_OP_RLDUW:
8352 8359 case DIF_OP_RLDX:
8353 8360 if (r1 >= nregs)
8354 8361 err += efunc(pc, "invalid register %u\n", r1);
8355 8362 if (r2 != 0)
8356 8363 err += efunc(pc, "non-zero reserved bits\n");
8357 8364 if (rd >= nregs)
8358 8365 err += efunc(pc, "invalid register %u\n", rd);
8359 8366 if (rd == 0)
8360 8367 err += efunc(pc, "cannot write to %r0\n");
8361 8368 break;
8362 8369 case DIF_OP_ULDSB:
8363 8370 case DIF_OP_ULDSH:
8364 8371 case DIF_OP_ULDSW:
8365 8372 case DIF_OP_ULDUB:
8366 8373 case DIF_OP_ULDUH:
8367 8374 case DIF_OP_ULDUW:
8368 8375 case DIF_OP_ULDX:
8369 8376 if (r1 >= nregs)
8370 8377 err += efunc(pc, "invalid register %u\n", r1);
8371 8378 if (r2 != 0)
8372 8379 err += efunc(pc, "non-zero reserved bits\n");
8373 8380 if (rd >= nregs)
8374 8381 err += efunc(pc, "invalid register %u\n", rd);
8375 8382 if (rd == 0)
8376 8383 err += efunc(pc, "cannot write to %r0\n");
8377 8384 break;
8378 8385 case DIF_OP_STB:
8379 8386 case DIF_OP_STH:
8380 8387 case DIF_OP_STW:
8381 8388 case DIF_OP_STX:
8382 8389 if (r1 >= nregs)
8383 8390 err += efunc(pc, "invalid register %u\n", r1);
8384 8391 if (r2 != 0)
8385 8392 err += efunc(pc, "non-zero reserved bits\n");
8386 8393 if (rd >= nregs)
8387 8394 err += efunc(pc, "invalid register %u\n", rd);
8388 8395 if (rd == 0)
8389 8396 err += efunc(pc, "cannot write to 0 address\n");
8390 8397 break;
8391 8398 case DIF_OP_CMP:
8392 8399 case DIF_OP_SCMP:
8393 8400 if (r1 >= nregs)
8394 8401 err += efunc(pc, "invalid register %u\n", r1);
8395 8402 if (r2 >= nregs)
8396 8403 err += efunc(pc, "invalid register %u\n", r2);
8397 8404 if (rd != 0)
8398 8405 err += efunc(pc, "non-zero reserved bits\n");
8399 8406 break;
8400 8407 case DIF_OP_TST:
8401 8408 if (r1 >= nregs)
8402 8409 err += efunc(pc, "invalid register %u\n", r1);
8403 8410 if (r2 != 0 || rd != 0)
8404 8411 err += efunc(pc, "non-zero reserved bits\n");
8405 8412 break;
8406 8413 case DIF_OP_BA:
8407 8414 case DIF_OP_BE:
8408 8415 case DIF_OP_BNE:
8409 8416 case DIF_OP_BG:
8410 8417 case DIF_OP_BGU:
8411 8418 case DIF_OP_BGE:
8412 8419 case DIF_OP_BGEU:
8413 8420 case DIF_OP_BL:
8414 8421 case DIF_OP_BLU:
8415 8422 case DIF_OP_BLE:
8416 8423 case DIF_OP_BLEU:
8417 8424 if (label >= dp->dtdo_len) {
8418 8425 err += efunc(pc, "invalid branch target %u\n",
8419 8426 label);
8420 8427 }
8421 8428 if (label <= pc) {
8422 8429 err += efunc(pc, "backward branch to %u\n",
8423 8430 label);
8424 8431 }
8425 8432 break;
8426 8433 case DIF_OP_RET:
8427 8434 if (r1 != 0 || r2 != 0)
8428 8435 err += efunc(pc, "non-zero reserved bits\n");
8429 8436 if (rd >= nregs)
8430 8437 err += efunc(pc, "invalid register %u\n", rd);
8431 8438 break;
8432 8439 case DIF_OP_NOP:
8433 8440 case DIF_OP_POPTS:
8434 8441 case DIF_OP_FLUSHTS:
8435 8442 if (r1 != 0 || r2 != 0 || rd != 0)
8436 8443 err += efunc(pc, "non-zero reserved bits\n");
8437 8444 break;
8438 8445 case DIF_OP_SETX:
8439 8446 if (DIF_INSTR_INTEGER(instr) >= dp->dtdo_intlen) {
8440 8447 err += efunc(pc, "invalid integer ref %u\n",
8441 8448 DIF_INSTR_INTEGER(instr));
8442 8449 }
8443 8450 if (rd >= nregs)
8444 8451 err += efunc(pc, "invalid register %u\n", rd);
8445 8452 if (rd == 0)
8446 8453 err += efunc(pc, "cannot write to %r0\n");
8447 8454 break;
8448 8455 case DIF_OP_SETS:
8449 8456 if (DIF_INSTR_STRING(instr) >= dp->dtdo_strlen) {
8450 8457 err += efunc(pc, "invalid string ref %u\n",
8451 8458 DIF_INSTR_STRING(instr));
8452 8459 }
8453 8460 if (rd >= nregs)
8454 8461 err += efunc(pc, "invalid register %u\n", rd);
8455 8462 if (rd == 0)
8456 8463 err += efunc(pc, "cannot write to %r0\n");
8457 8464 break;
8458 8465 case DIF_OP_LDGA:
8459 8466 case DIF_OP_LDTA:
8460 8467 if (r1 > DIF_VAR_ARRAY_MAX)
8461 8468 err += efunc(pc, "invalid array %u\n", r1);
8462 8469 if (r2 >= nregs)
8463 8470 err += efunc(pc, "invalid register %u\n", r2);
8464 8471 if (rd >= nregs)
8465 8472 err += efunc(pc, "invalid register %u\n", rd);
8466 8473 if (rd == 0)
8467 8474 err += efunc(pc, "cannot write to %r0\n");
8468 8475 break;
8469 8476 case DIF_OP_LDGS:
8470 8477 case DIF_OP_LDTS:
8471 8478 case DIF_OP_LDLS:
8472 8479 case DIF_OP_LDGAA:
8473 8480 case DIF_OP_LDTAA:
8474 8481 if (v < DIF_VAR_OTHER_MIN || v > DIF_VAR_OTHER_MAX)
8475 8482 err += efunc(pc, "invalid variable %u\n", v);
8476 8483 if (rd >= nregs)
8477 8484 err += efunc(pc, "invalid register %u\n", rd);
8478 8485 if (rd == 0)
8479 8486 err += efunc(pc, "cannot write to %r0\n");
8480 8487 break;
8481 8488 case DIF_OP_STGS:
8482 8489 case DIF_OP_STTS:
8483 8490 case DIF_OP_STLS:
8484 8491 case DIF_OP_STGAA:
8485 8492 case DIF_OP_STTAA:
8486 8493 if (v < DIF_VAR_OTHER_UBASE || v > DIF_VAR_OTHER_MAX)
8487 8494 err += efunc(pc, "invalid variable %u\n", v);
8488 8495 if (rs >= nregs)
8489 8496 				err += efunc(pc, "invalid register %u\n", rs);
8490 8497 break;
8491 8498 case DIF_OP_CALL:
8492 8499 if (subr > DIF_SUBR_MAX)
8493 8500 err += efunc(pc, "invalid subr %u\n", subr);
8494 8501 if (rd >= nregs)
8495 8502 err += efunc(pc, "invalid register %u\n", rd);
8496 8503 if (rd == 0)
8497 8504 err += efunc(pc, "cannot write to %r0\n");
8498 8505
8499 8506 if (subr == DIF_SUBR_COPYOUT ||
8500 8507 subr == DIF_SUBR_COPYOUTSTR) {
8501 8508 dp->dtdo_destructive = 1;
8502 8509 }
8503 8510
8504 8511 if (subr == DIF_SUBR_GETF) {
8505 8512 /*
8506 8513 * If we have a getf() we need to record that
8507 8514 * in our state. Note that our state can be
8508 8515 * NULL if this is a helper -- but in that
8509 8516 * case, the call to getf() is itself illegal,
8510 8517 * and will be caught (slightly later) when
8511 8518 * the helper is validated.
8512 8519 */
8513 8520 if (vstate->dtvs_state != NULL)
8514 8521 vstate->dtvs_state->dts_getf++;
8515 8522 }
8516 8523
8517 8524 break;
8518 8525 case DIF_OP_PUSHTR:
8519 8526 if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF)
8520 8527 err += efunc(pc, "invalid ref type %u\n", type);
8521 8528 if (r2 >= nregs)
8522 8529 err += efunc(pc, "invalid register %u\n", r2);
8523 8530 if (rs >= nregs)
8524 8531 err += efunc(pc, "invalid register %u\n", rs);
8525 8532 break;
8526 8533 case DIF_OP_PUSHTV:
8527 8534 if (type != DIF_TYPE_CTF)
8528 8535 err += efunc(pc, "invalid val type %u\n", type);
8529 8536 if (r2 >= nregs)
8530 8537 err += efunc(pc, "invalid register %u\n", r2);
8531 8538 if (rs >= nregs)
8532 8539 err += efunc(pc, "invalid register %u\n", rs);
8533 8540 break;
8534 8541 default:
8535 8542 err += efunc(pc, "invalid opcode %u\n",
8536 8543 DIF_INSTR_OP(instr));
8537 8544 }
8538 8545 }
8539 8546
8540 8547 if (dp->dtdo_len != 0 &&
8541 8548 DIF_INSTR_OP(dp->dtdo_buf[dp->dtdo_len - 1]) != DIF_OP_RET) {
8542 8549 err += efunc(dp->dtdo_len - 1,
8543 8550 "expected 'ret' as last DIF instruction\n");
8544 8551 }
8545 8552
8546 8553 if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF)) {
8547 8554 /*
8548 8555 * If we're not returning by reference, the size must be either
8549 8556 * 0 or the size of one of the base types.
8550 8557 */
8551 8558 switch (dp->dtdo_rtype.dtdt_size) {
8552 8559 case 0:
8553 8560 case sizeof (uint8_t):
8554 8561 case sizeof (uint16_t):
8555 8562 case sizeof (uint32_t):
8556 8563 case sizeof (uint64_t):
8557 8564 break;
8558 8565
8559 8566 default:
8560 8567 err += efunc(dp->dtdo_len - 1, "bad return size\n");
8561 8568 }
8562 8569 }
8563 8570
8564 8571 for (i = 0; i < dp->dtdo_varlen && err == 0; i++) {
8565 8572 dtrace_difv_t *v = &dp->dtdo_vartab[i], *existing = NULL;
8566 8573 dtrace_diftype_t *vt, *et;
8567 8574 uint_t id, ndx;
8568 8575
8569 8576 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL &&
8570 8577 v->dtdv_scope != DIFV_SCOPE_THREAD &&
8571 8578 v->dtdv_scope != DIFV_SCOPE_LOCAL) {
8572 8579 err += efunc(i, "unrecognized variable scope %d\n",
8573 8580 v->dtdv_scope);
8574 8581 break;
8575 8582 }
8576 8583
8577 8584 if (v->dtdv_kind != DIFV_KIND_ARRAY &&
8578 8585 v->dtdv_kind != DIFV_KIND_SCALAR) {
8579 8586 err += efunc(i, "unrecognized variable type %d\n",
8580 8587 v->dtdv_kind);
8581 8588 break;
8582 8589 }
8583 8590
8584 8591 if ((id = v->dtdv_id) > DIF_VARIABLE_MAX) {
8585 8592 err += efunc(i, "%d exceeds variable id limit\n", id);
8586 8593 break;
8587 8594 }
8588 8595
8589 8596 if (id < DIF_VAR_OTHER_UBASE)
8590 8597 continue;
8591 8598
8592 8599 /*
8593 8600 * For user-defined variables, we need to check that this
8594 8601 * definition is identical to any previous definition that we
8595 8602 * encountered.
8596 8603 */
8597 8604 ndx = id - DIF_VAR_OTHER_UBASE;
8598 8605
8599 8606 switch (v->dtdv_scope) {
8600 8607 case DIFV_SCOPE_GLOBAL:
8601 8608 if (ndx < vstate->dtvs_nglobals) {
8602 8609 dtrace_statvar_t *svar;
8603 8610
8604 8611 if ((svar = vstate->dtvs_globals[ndx]) != NULL)
8605 8612 existing = &svar->dtsv_var;
8606 8613 }
8607 8614
8608 8615 break;
8609 8616
8610 8617 case DIFV_SCOPE_THREAD:
8611 8618 if (ndx < vstate->dtvs_ntlocals)
8612 8619 existing = &vstate->dtvs_tlocals[ndx];
8613 8620 break;
8614 8621
8615 8622 case DIFV_SCOPE_LOCAL:
8616 8623 if (ndx < vstate->dtvs_nlocals) {
8617 8624 dtrace_statvar_t *svar;
8618 8625
8619 8626 if ((svar = vstate->dtvs_locals[ndx]) != NULL)
8620 8627 existing = &svar->dtsv_var;
8621 8628 }
8622 8629
8623 8630 break;
8624 8631 }
8625 8632
8626 8633 vt = &v->dtdv_type;
8627 8634
8628 8635 if (vt->dtdt_flags & DIF_TF_BYREF) {
8629 8636 if (vt->dtdt_size == 0) {
8630 8637 err += efunc(i, "zero-sized variable\n");
8631 8638 break;
8632 8639 }
8633 8640
8634 8641 if (v->dtdv_scope == DIFV_SCOPE_GLOBAL &&
8635 8642 vt->dtdt_size > dtrace_global_maxsize) {
8636 8643 err += efunc(i, "oversized by-ref global\n");
8637 8644 break;
8638 8645 }
8639 8646 }
8640 8647
8641 8648 if (existing == NULL || existing->dtdv_id == 0)
8642 8649 continue;
8643 8650
8644 8651 ASSERT(existing->dtdv_id == v->dtdv_id);
8645 8652 ASSERT(existing->dtdv_scope == v->dtdv_scope);
8646 8653
8647 8654 if (existing->dtdv_kind != v->dtdv_kind)
8648 8655 err += efunc(i, "%d changed variable kind\n", id);
8649 8656
8650 8657 et = &existing->dtdv_type;
8651 8658
8652 8659 if (vt->dtdt_flags != et->dtdt_flags) {
8653 8660 err += efunc(i, "%d changed variable type flags\n", id);
8654 8661 break;
8655 8662 }
8656 8663
8657 8664 if (vt->dtdt_size != 0 && vt->dtdt_size != et->dtdt_size) {
8658 8665 err += efunc(i, "%d changed variable type size\n", id);
8659 8666 break;
8660 8667 }
8661 8668 }
8662 8669
8663 8670 return (err);
8664 8671 }
8665 8672
8666 8673 /*
8667 8674  * Validate a DTrace DIF object that is to be used as a helper. Helpers
8668 8675 * are much more constrained than normal DIFOs. Specifically, they may
8669 8676 * not:
8670 8677 *
8671 8678 * 1. Make calls to subroutines other than copyin(), copyinstr() or
8672 8679 * miscellaneous string routines
8673 8680 * 2. Access DTrace variables other than the args[] array, and the
8674 8681 * curthread, pid, ppid, tid, execname, zonename, uid and gid variables.
8675 8682 * 3. Have thread-local variables.
8676 8683 * 4. Have dynamic variables.
8677 8684 */
8678 8685 static int
8679 8686 dtrace_difo_validate_helper(dtrace_difo_t *dp)
8680 8687 {
8681 8688 int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
8682 8689 int err = 0;
8683 8690 uint_t pc;
8684 8691
8685 8692 for (pc = 0; pc < dp->dtdo_len; pc++) {
8686 8693 dif_instr_t instr = dp->dtdo_buf[pc];
8687 8694
8688 8695 uint_t v = DIF_INSTR_VAR(instr);
8689 8696 uint_t subr = DIF_INSTR_SUBR(instr);
8690 8697 uint_t op = DIF_INSTR_OP(instr);
8691 8698
8692 8699 switch (op) {
8693 8700 case DIF_OP_OR:
8694 8701 case DIF_OP_XOR:
8695 8702 case DIF_OP_AND:
8696 8703 case DIF_OP_SLL:
8697 8704 case DIF_OP_SRL:
8698 8705 case DIF_OP_SRA:
8699 8706 case DIF_OP_SUB:
8700 8707 case DIF_OP_ADD:
8701 8708 case DIF_OP_MUL:
8702 8709 case DIF_OP_SDIV:
8703 8710 case DIF_OP_UDIV:
8704 8711 case DIF_OP_SREM:
8705 8712 case DIF_OP_UREM:
8706 8713 case DIF_OP_COPYS:
8707 8714 case DIF_OP_NOT:
8708 8715 case DIF_OP_MOV:
8709 8716 case DIF_OP_RLDSB:
8710 8717 case DIF_OP_RLDSH:
8711 8718 case DIF_OP_RLDSW:
8712 8719 case DIF_OP_RLDUB:
8713 8720 case DIF_OP_RLDUH:
8714 8721 case DIF_OP_RLDUW:
8715 8722 case DIF_OP_RLDX:
8716 8723 case DIF_OP_ULDSB:
8717 8724 case DIF_OP_ULDSH:
8718 8725 case DIF_OP_ULDSW:
8719 8726 case DIF_OP_ULDUB:
8720 8727 case DIF_OP_ULDUH:
8721 8728 case DIF_OP_ULDUW:
8722 8729 case DIF_OP_ULDX:
8723 8730 case DIF_OP_STB:
8724 8731 case DIF_OP_STH:
8725 8732 case DIF_OP_STW:
8726 8733 case DIF_OP_STX:
8727 8734 case DIF_OP_ALLOCS:
8728 8735 case DIF_OP_CMP:
8729 8736 case DIF_OP_SCMP:
8730 8737 case DIF_OP_TST:
8731 8738 case DIF_OP_BA:
8732 8739 case DIF_OP_BE:
8733 8740 case DIF_OP_BNE:
8734 8741 case DIF_OP_BG:
8735 8742 case DIF_OP_BGU:
8736 8743 case DIF_OP_BGE:
8737 8744 case DIF_OP_BGEU:
8738 8745 case DIF_OP_BL:
8739 8746 case DIF_OP_BLU:
8740 8747 case DIF_OP_BLE:
8741 8748 case DIF_OP_BLEU:
8742 8749 case DIF_OP_RET:
8743 8750 case DIF_OP_NOP:
8744 8751 case DIF_OP_POPTS:
8745 8752 case DIF_OP_FLUSHTS:
8746 8753 case DIF_OP_SETX:
8747 8754 case DIF_OP_SETS:
8748 8755 case DIF_OP_LDGA:
8749 8756 case DIF_OP_LDLS:
8750 8757 case DIF_OP_STGS:
8751 8758 case DIF_OP_STLS:
8752 8759 case DIF_OP_PUSHTR:
8753 8760 case DIF_OP_PUSHTV:
8754 8761 break;
8755 8762
8756 8763 case DIF_OP_LDGS:
8757 8764 if (v >= DIF_VAR_OTHER_UBASE)
8758 8765 break;
8759 8766
8760 8767 if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9)
8761 8768 break;
8762 8769
8763 8770 if (v == DIF_VAR_CURTHREAD || v == DIF_VAR_PID ||
8764 8771 v == DIF_VAR_PPID || v == DIF_VAR_TID ||
8765 8772 v == DIF_VAR_EXECNAME || v == DIF_VAR_ZONENAME ||
8766 8773 v == DIF_VAR_UID || v == DIF_VAR_GID)
8767 8774 break;
8768 8775
8769 8776 err += efunc(pc, "illegal variable %u\n", v);
8770 8777 break;
8771 8778
8772 8779 case DIF_OP_LDTA:
8773 8780 case DIF_OP_LDTS:
8774 8781 case DIF_OP_LDGAA:
8775 8782 case DIF_OP_LDTAA:
8776 8783 err += efunc(pc, "illegal dynamic variable load\n");
8777 8784 break;
8778 8785
8779 8786 case DIF_OP_STTS:
8780 8787 case DIF_OP_STGAA:
8781 8788 case DIF_OP_STTAA:
8782 8789 err += efunc(pc, "illegal dynamic variable store\n");
8783 8790 break;
8784 8791
8785 8792 case DIF_OP_CALL:
8786 8793 if (subr == DIF_SUBR_ALLOCA ||
8787 8794 subr == DIF_SUBR_BCOPY ||
8788 8795 subr == DIF_SUBR_COPYIN ||
8789 8796 subr == DIF_SUBR_COPYINTO ||
8790 8797 subr == DIF_SUBR_COPYINSTR ||
8791 8798 subr == DIF_SUBR_INDEX ||
8792 8799 subr == DIF_SUBR_INET_NTOA ||
8793 8800 subr == DIF_SUBR_INET_NTOA6 ||
8794 8801 subr == DIF_SUBR_INET_NTOP ||
8795 8802 subr == DIF_SUBR_LLTOSTR ||
8796 8803 subr == DIF_SUBR_RINDEX ||
8797 8804 subr == DIF_SUBR_STRCHR ||
8798 8805 subr == DIF_SUBR_STRJOIN ||
8799 8806 subr == DIF_SUBR_STRRCHR ||
8800 8807 subr == DIF_SUBR_STRSTR ||
8801 8808 subr == DIF_SUBR_HTONS ||
8802 8809 subr == DIF_SUBR_HTONL ||
8803 8810 subr == DIF_SUBR_HTONLL ||
8804 8811 subr == DIF_SUBR_NTOHS ||
8805 8812 subr == DIF_SUBR_NTOHL ||
8806 8813 subr == DIF_SUBR_NTOHLL)
8807 8814 break;
8808 8815
8809 8816 err += efunc(pc, "invalid subr %u\n", subr);
8810 8817 break;
8811 8818
8812 8819 default:
8813 8820 err += efunc(pc, "invalid opcode %u\n",
8814 8821 DIF_INSTR_OP(instr));
8815 8822 }
8816 8823 }
8817 8824
8818 8825 return (err);
8819 8826 }
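
/*
 * A minimal user-space sketch of the allow-list pattern that
 * dtrace_difo_validate_helper() applies above: walk an instruction
 * stream and reject anything not explicitly permitted.  The opcode
 * names and the report() callback are illustrative stand-ins, not
 * DTrace definitions.
 */
#include <stdio.h>

enum { OP_ADD, OP_LOAD_DYNVAR, OP_CALL_COPYIN, OP_CALL_COPYOUT };

static int
report(unsigned pc, const char *msg)
{
	(void) fprintf(stderr, "pc %u: %s", pc, msg);
	return (1);
}

static int
validate(const unsigned char *prog, unsigned len)
{
	int err = 0;
	unsigned pc;

	for (pc = 0; pc < len; pc++) {
		switch (prog[pc]) {
		case OP_ADD:		/* pure arithmetic: always safe */
		case OP_CALL_COPYIN:	/* reads user memory: permitted */
			break;
		case OP_LOAD_DYNVAR:	/* dynamic variables: forbidden */
			err += report(pc, "illegal dynamic variable load\n");
			break;
		case OP_CALL_COPYOUT:	/* destructive: forbidden */
			err += report(pc, "invalid subr\n");
			break;
		default:
			err += report(pc, "invalid opcode\n");
		}
	}
	return (err);	/* 0 means the program validated cleanly */
}

int
main(void)
{
	unsigned char prog[] = { OP_ADD, OP_CALL_COPYIN, OP_LOAD_DYNVAR };
	return (validate(prog, sizeof (prog)) != 0);
}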
8820 8827
8821 8828 /*
8822 8829 * Returns 1 if the expression in the DIF object can be cached on a per-thread
8823 8830 * basis; 0 if not.
8824 8831 */
8825 8832 static int
8826 8833 dtrace_difo_cacheable(dtrace_difo_t *dp)
8827 8834 {
8828 8835 int i;
8829 8836
8830 8837 if (dp == NULL)
8831 8838 return (0);
8832 8839
8833 8840 for (i = 0; i < dp->dtdo_varlen; i++) {
8834 8841 dtrace_difv_t *v = &dp->dtdo_vartab[i];
8835 8842
8836 8843 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL)
8837 8844 continue;
8838 8845
8839 8846 switch (v->dtdv_id) {
8840 8847 case DIF_VAR_CURTHREAD:
8841 8848 case DIF_VAR_PID:
8842 8849 case DIF_VAR_TID:
8843 8850 case DIF_VAR_EXECNAME:
8844 8851 case DIF_VAR_ZONENAME:
8845 8852 break;
8846 8853
8847 8854 default:
8848 8855 return (0);
8849 8856 }
8850 8857 }
8851 8858
8852 8859 /*
8853 8860 * This DIF object may be cacheable. Now we need to look for any
8854 8861 * array loading instructions, any memory loading instructions, or
8855 8862 * any stores to thread-local variables.
8856 8863 */
8857 8864 for (i = 0; i < dp->dtdo_len; i++) {
8858 8865 uint_t op = DIF_INSTR_OP(dp->dtdo_buf[i]);
8859 8866
8860 8867 if ((op >= DIF_OP_LDSB && op <= DIF_OP_LDX) ||
8861 8868 (op >= DIF_OP_ULDSB && op <= DIF_OP_ULDX) ||
8862 8869 (op >= DIF_OP_RLDSB && op <= DIF_OP_RLDX) ||
8863 8870 op == DIF_OP_LDGA || op == DIF_OP_STTS)
8864 8871 return (0);
8865 8872 }
8866 8873
8867 8874 return (1);
8868 8875 }
8869 8876
8870 8877 static void
8871 8878 dtrace_difo_hold(dtrace_difo_t *dp)
8872 8879 {
8873 8880 int i;
8874 8881
8875 8882 ASSERT(MUTEX_HELD(&dtrace_lock));
8876 8883
8877 8884 dp->dtdo_refcnt++;
8878 8885 ASSERT(dp->dtdo_refcnt != 0);
8879 8886
8880 8887 /*
8881 8888 * We need to check this DIF object for references to the variable
8882 8889 * DIF_VAR_VTIMESTAMP.
8883 8890 */
8884 8891 for (i = 0; i < dp->dtdo_varlen; i++) {
8885 8892 dtrace_difv_t *v = &dp->dtdo_vartab[i];
8886 8893
8887 8894 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
8888 8895 continue;
8889 8896
8890 8897 if (dtrace_vtime_references++ == 0)
8891 8898 dtrace_vtime_enable();
8892 8899 }
8893 8900 }
8894 8901
8895 8902 /*
8896 8903 * This routine calculates the dynamic variable chunksize for a given DIF
8897 8904 * object. The calculation is not fool-proof, and can probably be tricked by
8898 8905 * malicious DIF -- but it works for all compiler-generated DIF. Because this
8899 8906 * calculation is likely imperfect, dtrace_dynvar() is able to gracefully fail
8900 8907 * if a dynamic variable size exceeds the chunksize.
8901 8908 */
8902 8909 static void
8903 8910 dtrace_difo_chunksize(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
8904 8911 {
8905 8912 uint64_t sval;
8906 8913 dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
8907 8914 const dif_instr_t *text = dp->dtdo_buf;
8908 8915 uint_t pc, srd = 0;
8909 8916 uint_t ttop = 0;
8910 8917 size_t size, ksize;
8911 8918 uint_t id, i;
8912 8919
8913 8920 for (pc = 0; pc < dp->dtdo_len; pc++) {
8914 8921 dif_instr_t instr = text[pc];
8915 8922 uint_t op = DIF_INSTR_OP(instr);
8916 8923 uint_t rd = DIF_INSTR_RD(instr);
8917 8924 uint_t r1 = DIF_INSTR_R1(instr);
8918 8925 uint_t nkeys = 0;
8919 8926 uchar_t scope;
8920 8927
8921 8928 dtrace_key_t *key = tupregs;
8922 8929
8923 8930 switch (op) {
8924 8931 case DIF_OP_SETX:
8925 8932 sval = dp->dtdo_inttab[DIF_INSTR_INTEGER(instr)];
8926 8933 srd = rd;
8927 8934 continue;
8928 8935
8929 8936 case DIF_OP_STTS:
8930 8937 key = &tupregs[DIF_DTR_NREGS];
8931 8938 key[0].dttk_size = 0;
8932 8939 key[1].dttk_size = 0;
8933 8940 nkeys = 2;
8934 8941 scope = DIFV_SCOPE_THREAD;
8935 8942 break;
8936 8943
8937 8944 case DIF_OP_STGAA:
8938 8945 case DIF_OP_STTAA:
8939 8946 nkeys = ttop;
8940 8947
8941 8948 if (DIF_INSTR_OP(instr) == DIF_OP_STTAA)
8942 8949 key[nkeys++].dttk_size = 0;
8943 8950
8944 8951 key[nkeys++].dttk_size = 0;
8945 8952
8946 8953 if (op == DIF_OP_STTAA) {
8947 8954 scope = DIFV_SCOPE_THREAD;
8948 8955 } else {
8949 8956 scope = DIFV_SCOPE_GLOBAL;
8950 8957 }
8951 8958
8952 8959 break;
8953 8960
8954 8961 case DIF_OP_PUSHTR:
8955 8962 if (ttop == DIF_DTR_NREGS)
8956 8963 return;
8957 8964
8958 8965 if ((srd == 0 || sval == 0) && r1 == DIF_TYPE_STRING) {
8959 8966 /*
8960 8967 * If the register for the size of the "pushtr"
8961 8968 * is %r0 (or the value is 0) and the type is
8962 8969 * a string, we'll use the system-wide default
8963 8970 * string size.
8964 8971 */
8965 8972 tupregs[ttop++].dttk_size =
8966 8973 dtrace_strsize_default;
8967 8974 } else {
8968 8975 if (srd == 0)
8969 8976 return;
8970 8977
8971 8978 tupregs[ttop++].dttk_size = sval;
8972 8979 }
8973 8980
8974 8981 break;
8975 8982
8976 8983 case DIF_OP_PUSHTV:
8977 8984 if (ttop == DIF_DTR_NREGS)
8978 8985 return;
8979 8986
8980 8987 tupregs[ttop++].dttk_size = 0;
8981 8988 break;
8982 8989
8983 8990 case DIF_OP_FLUSHTS:
8984 8991 ttop = 0;
8985 8992 break;
8986 8993
8987 8994 case DIF_OP_POPTS:
8988 8995 if (ttop != 0)
8989 8996 ttop--;
8990 8997 break;
8991 8998 }
8992 8999
8993 9000 sval = 0;
8994 9001 srd = 0;
8995 9002
8996 9003 if (nkeys == 0)
8997 9004 continue;
8998 9005
8999 9006 /*
9000 9007 * We have a dynamic variable allocation; calculate its size.
9001 9008 */
9002 9009 for (ksize = 0, i = 0; i < nkeys; i++)
9003 9010 ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
9004 9011
9005 9012 size = sizeof (dtrace_dynvar_t);
9006 9013 size += sizeof (dtrace_key_t) * (nkeys - 1);
9007 9014 size += ksize;
9008 9015
9009 9016 /*
9010 9017 * Now we need to determine the size of the stored data.
9011 9018 */
9012 9019 id = DIF_INSTR_VAR(instr);
9013 9020
9014 9021 for (i = 0; i < dp->dtdo_varlen; i++) {
9015 9022 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9016 9023
9017 9024 if (v->dtdv_id == id && v->dtdv_scope == scope) {
9018 9025 size += v->dtdv_type.dtdt_size;
9019 9026 break;
9020 9027 }
9021 9028 }
9022 9029
9023 9030 if (i == dp->dtdo_varlen)
9024 9031 return;
9025 9032
9026 9033 /*
9027 9034 * We have the size. If this is larger than the chunk size
9028 9035 * for our dynamic variable state, reset the chunk size.
9029 9036 */
9030 9037 size = P2ROUNDUP(size, sizeof (uint64_t));
9031 9038
9032 9039 if (size > vstate->dtvs_dynvars.dtds_chunksize)
9033 9040 vstate->dtvs_dynvars.dtds_chunksize = size;
9034 9041 }
9035 9042 }
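
/*
 * A stand-alone sketch of the chunk-size arithmetic above.  P2ROUNDUP
 * is reproduced from <sys/sysmacros.h>; DYNVAR_HDR and KEY_SLOT are
 * assumed placeholder values for sizeof (dtrace_dynvar_t) and
 * sizeof (dtrace_key_t), which vary by build.
 */
#include <stdio.h>
#include <stdint.h>

#define	P2ROUNDUP(x, align)	(-(-(x) & -(align)))

#define	DYNVAR_HDR	64	/* stand-in for sizeof (dtrace_dynvar_t) */
#define	KEY_SLOT	16	/* stand-in for sizeof (dtrace_key_t) */

int
main(void)
{
	uint64_t keys[] = { 13, 256 };	/* tuple key sizes, in bytes */
	unsigned i, nkeys = sizeof (keys) / sizeof (keys[0]);
	uint64_t ksize = 0, size, datasize = 24;

	/* round each key up to an 8-byte boundary, as above */
	for (i = 0; i < nkeys; i++)
		ksize += P2ROUNDUP(keys[i], sizeof (uint64_t));

	/* header + (nkeys - 1) extra key slots + key data + stored data */
	size = DYNVAR_HDR + KEY_SLOT * (uint64_t)(nkeys - 1) +
	    ksize + datasize;
	size = P2ROUNDUP(size, sizeof (uint64_t));

	(void) printf("chunksize must be at least %llu bytes\n",
	    (unsigned long long)size);
	return (0);
}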
9036 9043
9037 9044 static void
9038 9045 dtrace_difo_init(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9039 9046 {
9040 9047 int i, oldsvars, osz, nsz, otlocals, ntlocals;
9041 9048 uint_t id;
9042 9049
9043 9050 ASSERT(MUTEX_HELD(&dtrace_lock));
9044 9051 ASSERT(dp->dtdo_buf != NULL && dp->dtdo_len != 0);
9045 9052
9046 9053 for (i = 0; i < dp->dtdo_varlen; i++) {
9047 9054 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9048 9055 dtrace_statvar_t *svar, ***svarp;
9049 9056 size_t dsize = 0;
9050 9057 uint8_t scope = v->dtdv_scope;
9051 9058 int *np;
9052 9059
9053 9060 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
9054 9061 continue;
9055 9062
9056 9063 id -= DIF_VAR_OTHER_UBASE;
9057 9064
9058 9065 switch (scope) {
9059 9066 case DIFV_SCOPE_THREAD:
9060 9067 while (id >= (otlocals = vstate->dtvs_ntlocals)) {
9061 9068 dtrace_difv_t *tlocals;
9062 9069
9063 9070 if ((ntlocals = (otlocals << 1)) == 0)
9064 9071 ntlocals = 1;
9065 9072
9066 9073 osz = otlocals * sizeof (dtrace_difv_t);
9067 9074 nsz = ntlocals * sizeof (dtrace_difv_t);
9068 9075
9069 9076 tlocals = kmem_zalloc(nsz, KM_SLEEP);
9070 9077
9071 9078 if (osz != 0) {
9072 9079 bcopy(vstate->dtvs_tlocals,
9073 9080 tlocals, osz);
9074 9081 kmem_free(vstate->dtvs_tlocals, osz);
9075 9082 }
9076 9083
9077 9084 vstate->dtvs_tlocals = tlocals;
9078 9085 vstate->dtvs_ntlocals = ntlocals;
9079 9086 }
9080 9087
9081 9088 vstate->dtvs_tlocals[id] = *v;
9082 9089 continue;
9083 9090
9084 9091 case DIFV_SCOPE_LOCAL:
9085 9092 np = &vstate->dtvs_nlocals;
9086 9093 svarp = &vstate->dtvs_locals;
9087 9094
9088 9095 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
9089 9096 dsize = NCPU * (v->dtdv_type.dtdt_size +
9090 9097 sizeof (uint64_t));
9091 9098 else
9092 9099 dsize = NCPU * sizeof (uint64_t);
9093 9100
9094 9101 break;
9095 9102
9096 9103 case DIFV_SCOPE_GLOBAL:
9097 9104 np = &vstate->dtvs_nglobals;
9098 9105 svarp = &vstate->dtvs_globals;
9099 9106
9100 9107 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
9101 9108 dsize = v->dtdv_type.dtdt_size +
9102 9109 sizeof (uint64_t);
9103 9110
9104 9111 break;
9105 9112
9106 9113 default:
9107 9114 ASSERT(0);
9108 9115 }
9109 9116
9110 9117 while (id >= (oldsvars = *np)) {
9111 9118 dtrace_statvar_t **statics;
9112 9119 int newsvars, oldsize, newsize;
9113 9120
9114 9121 if ((newsvars = (oldsvars << 1)) == 0)
9115 9122 newsvars = 1;
9116 9123
9117 9124 oldsize = oldsvars * sizeof (dtrace_statvar_t *);
9118 9125 newsize = newsvars * sizeof (dtrace_statvar_t *);
9119 9126
9120 9127 statics = kmem_zalloc(newsize, KM_SLEEP);
9121 9128
9122 9129 if (oldsize != 0) {
9123 9130 bcopy(*svarp, statics, oldsize);
9124 9131 kmem_free(*svarp, oldsize);
9125 9132 }
9126 9133
9127 9134 *svarp = statics;
9128 9135 *np = newsvars;
9129 9136 }
9130 9137
9131 9138 if ((svar = (*svarp)[id]) == NULL) {
9132 9139 svar = kmem_zalloc(sizeof (dtrace_statvar_t), KM_SLEEP);
9133 9140 svar->dtsv_var = *v;
9134 9141
9135 9142 if ((svar->dtsv_size = dsize) != 0) {
9136 9143 svar->dtsv_data = (uint64_t)(uintptr_t)
9137 9144 kmem_zalloc(dsize, KM_SLEEP);
9138 9145 }
9139 9146
9140 9147 (*svarp)[id] = svar;
9141 9148 }
9142 9149
9143 9150 svar->dtsv_refcnt++;
9144 9151 }
9145 9152
9146 9153 dtrace_difo_chunksize(dp, vstate);
9147 9154 dtrace_difo_hold(dp);
9148 9155 }
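
/*
 * dtrace_difo_init() grows its per-scope variable arrays by doubling
 * until the requested index fits, copying and freeing the old
 * allocation each time.  A minimal sketch of that idiom, with calloc()
 * standing in for kmem_zalloc() and error handling elided:
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int *
grow_until(int *arr, int *np, int id)
{
	while (id >= *np) {
		int oldn = *np, newn = oldn << 1;
		int *new;

		if (newn == 0)
			newn = 1;	/* first allocation */

		new = calloc(newn, sizeof (int));
		if (oldn != 0) {
			memcpy(new, arr, oldn * sizeof (int));
			free(arr);	/* the kernel frees the old copy too */
		}
		arr = new;
		*np = newn;
	}
	return (arr);
}

int
main(void)
{
	int *arr = NULL, n = 0;

	arr = grow_until(arr, &n, 9);	/* doubles 0->1->2->4->8->16 */
	(void) printf("capacity is now %d\n", n);
	free(arr);
	return (0);
}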
9149 9156
9150 9157 static dtrace_difo_t *
9151 9158 dtrace_difo_duplicate(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9152 9159 {
9153 9160 dtrace_difo_t *new;
9154 9161 size_t sz;
9155 9162
9156 9163 ASSERT(dp->dtdo_buf != NULL);
9157 9164 ASSERT(dp->dtdo_refcnt != 0);
9158 9165
9159 9166 new = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
9160 9167
9161 9168 ASSERT(dp->dtdo_buf != NULL);
9162 9169 sz = dp->dtdo_len * sizeof (dif_instr_t);
9163 9170 new->dtdo_buf = kmem_alloc(sz, KM_SLEEP);
9164 9171 bcopy(dp->dtdo_buf, new->dtdo_buf, sz);
9165 9172 new->dtdo_len = dp->dtdo_len;
9166 9173
9167 9174 if (dp->dtdo_strtab != NULL) {
9168 9175 ASSERT(dp->dtdo_strlen != 0);
9169 9176 new->dtdo_strtab = kmem_alloc(dp->dtdo_strlen, KM_SLEEP);
9170 9177 bcopy(dp->dtdo_strtab, new->dtdo_strtab, dp->dtdo_strlen);
9171 9178 new->dtdo_strlen = dp->dtdo_strlen;
9172 9179 }
9173 9180
9174 9181 if (dp->dtdo_inttab != NULL) {
9175 9182 ASSERT(dp->dtdo_intlen != 0);
9176 9183 sz = dp->dtdo_intlen * sizeof (uint64_t);
9177 9184 new->dtdo_inttab = kmem_alloc(sz, KM_SLEEP);
9178 9185 bcopy(dp->dtdo_inttab, new->dtdo_inttab, sz);
9179 9186 new->dtdo_intlen = dp->dtdo_intlen;
9180 9187 }
9181 9188
9182 9189 if (dp->dtdo_vartab != NULL) {
9183 9190 ASSERT(dp->dtdo_varlen != 0);
9184 9191 sz = dp->dtdo_varlen * sizeof (dtrace_difv_t);
9185 9192 new->dtdo_vartab = kmem_alloc(sz, KM_SLEEP);
9186 9193 bcopy(dp->dtdo_vartab, new->dtdo_vartab, sz);
9187 9194 new->dtdo_varlen = dp->dtdo_varlen;
9188 9195 }
9189 9196
9190 9197 dtrace_difo_init(new, vstate);
9191 9198 return (new);
9192 9199 }
9193 9200
9194 9201 static void
9195 9202 dtrace_difo_destroy(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9196 9203 {
9197 9204 int i;
9198 9205
9199 9206 ASSERT(dp->dtdo_refcnt == 0);
9200 9207
9201 9208 for (i = 0; i < dp->dtdo_varlen; i++) {
9202 9209 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9203 9210 dtrace_statvar_t *svar, **svarp;
9204 9211 uint_t id;
9205 9212 uint8_t scope = v->dtdv_scope;
9206 9213 int *np;
9207 9214
9208 9215 switch (scope) {
9209 9216 case DIFV_SCOPE_THREAD:
9210 9217 continue;
9211 9218
9212 9219 case DIFV_SCOPE_LOCAL:
9213 9220 np = &vstate->dtvs_nlocals;
9214 9221 svarp = vstate->dtvs_locals;
9215 9222 break;
9216 9223
9217 9224 case DIFV_SCOPE_GLOBAL:
9218 9225 np = &vstate->dtvs_nglobals;
9219 9226 svarp = vstate->dtvs_globals;
9220 9227 break;
9221 9228
9222 9229 default:
9223 9230 ASSERT(0);
9224 9231 }
9225 9232
9226 9233 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
9227 9234 continue;
9228 9235
9229 9236 id -= DIF_VAR_OTHER_UBASE;
9230 9237 ASSERT(id < *np);
9231 9238
9232 9239 svar = svarp[id];
9233 9240 ASSERT(svar != NULL);
9234 9241 ASSERT(svar->dtsv_refcnt > 0);
9235 9242
9236 9243 if (--svar->dtsv_refcnt > 0)
9237 9244 continue;
9238 9245
9239 9246 if (svar->dtsv_size != 0) {
9240 9247 ASSERT(svar->dtsv_data != NULL);
9241 9248 kmem_free((void *)(uintptr_t)svar->dtsv_data,
9242 9249 svar->dtsv_size);
9243 9250 }
9244 9251
9245 9252 kmem_free(svar, sizeof (dtrace_statvar_t));
9246 9253 svarp[id] = NULL;
9247 9254 }
9248 9255
9249 9256 kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
9250 9257 kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
9251 9258 kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
9252 9259 kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
9253 9260
9254 9261 kmem_free(dp, sizeof (dtrace_difo_t));
9255 9262 }
9256 9263
9257 9264 static void
9258 9265 dtrace_difo_release(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9259 9266 {
9260 9267 int i;
9261 9268
9262 9269 ASSERT(MUTEX_HELD(&dtrace_lock));
9263 9270 ASSERT(dp->dtdo_refcnt != 0);
9264 9271
9265 9272 for (i = 0; i < dp->dtdo_varlen; i++) {
9266 9273 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9267 9274
9268 9275 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
9269 9276 continue;
9270 9277
9271 9278 ASSERT(dtrace_vtime_references > 0);
9272 9279 if (--dtrace_vtime_references == 0)
9273 9280 dtrace_vtime_disable();
9274 9281 }
9275 9282
9276 9283 if (--dp->dtdo_refcnt == 0)
9277 9284 dtrace_difo_destroy(dp, vstate);
9278 9285 }
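
/*
 * Beyond the plain reference count, dtrace_difo_hold() and
 * dtrace_difo_release() keep a global count of DIFOs that reference
 * the vtimestamp variable, so the costly virtual-time accounting runs
 * only while someone needs it.  The enable-on-first-hold /
 * disable-on-last-release idiom, stand-alone (printf() stands in for
 * the enable/disable calls):
 */
#include <stdio.h>

static int vtime_references;

static void
vtime_hold(void)
{
	if (vtime_references++ == 0)
		(void) printf("enable\n");	/* dtrace_vtime_enable() */
}

static void
vtime_release(void)
{
	if (--vtime_references == 0)
		(void) printf("disable\n");	/* dtrace_vtime_disable() */
}

int
main(void)
{
	vtime_hold();
	vtime_hold();		/* second reference: no transition */
	vtime_release();
	vtime_release();	/* last reference: disable */
	return (0);
}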
9279 9286
9280 9287 /*
9281 9288 * DTrace Format Functions
9282 9289 */
9283 9290 static uint16_t
9284 9291 dtrace_format_add(dtrace_state_t *state, char *str)
9285 9292 {
9286 9293 char *fmt, **new;
9287 9294 uint16_t ndx, len = strlen(str) + 1;
9288 9295
9289 9296 fmt = kmem_zalloc(len, KM_SLEEP);
9290 9297 bcopy(str, fmt, len);
9291 9298
9292 9299 for (ndx = 0; ndx < state->dts_nformats; ndx++) {
9293 9300 if (state->dts_formats[ndx] == NULL) {
9294 9301 state->dts_formats[ndx] = fmt;
9295 9302 return (ndx + 1);
9296 9303 }
9297 9304 }
9298 9305
9299 9306 if (state->dts_nformats == USHRT_MAX) {
9300 9307 /*
9301 9308 * This is only likely if a denial-of-service attack is being
9302 9309 * attempted. As such, it's okay to fail silently here.
9303 9310 */
9304 9311 kmem_free(fmt, len);
9305 9312 return (0);
9306 9313 }
9307 9314
9308 9315 /*
9309 9316 * For simplicity, we always resize the formats array to be exactly the
9310 9317 * number of formats.
9311 9318 */
9312 9319 ndx = state->dts_nformats++;
9313 9320 new = kmem_alloc((ndx + 1) * sizeof (char *), KM_SLEEP);
9314 9321
9315 9322 if (state->dts_formats != NULL) {
9316 9323 ASSERT(ndx != 0);
9317 9324 bcopy(state->dts_formats, new, ndx * sizeof (char *));
9318 9325 kmem_free(state->dts_formats, ndx * sizeof (char *));
9319 9326 }
9320 9327
9321 9328 state->dts_formats = new;
9322 9329 state->dts_formats[ndx] = fmt;
9323 9330
9324 9331 return (ndx + 1);
9325 9332 }
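
/*
 * dtrace_format_add() hands out 1-based handles so that 0 can mean
 * "no format": it reuses the first NULL slot and otherwise grows the
 * table by exactly one entry.  A user-space sketch of the same scheme
 * (strdup()/realloc() stand in for the kmem copies; error handling
 * elided):
 */
#include <stdlib.h>
#include <string.h>

static char **slots;
static unsigned nslots;

static unsigned
handle_add(const char *str)		/* returns the 1-based handle */
{
	unsigned ndx;

	for (ndx = 0; ndx < nslots; ndx++) {
		if (slots[ndx] == NULL) {
			slots[ndx] = strdup(str);
			return (ndx + 1);
		}
	}

	slots = realloc(slots, (nslots + 1) * sizeof (char *));
	slots[nslots] = strdup(str);
	return (++nslots);
}

static void
handle_remove(unsigned h)
{
	free(slots[h - 1]);
	slots[h - 1] = NULL;	/* slot becomes reusable */
}

int
main(void)
{
	unsigned a = handle_add("%d bytes\n");

	(void) handle_add("%s\n");
	handle_remove(a);
	return (handle_add("%x\n") == a ? 0 : 1);	/* reuses handle 1 */
}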
9326 9333
9327 9334 static void
9328 9335 dtrace_format_remove(dtrace_state_t *state, uint16_t format)
9329 9336 {
9330 9337 char *fmt;
9331 9338
9332 9339 ASSERT(state->dts_formats != NULL);
9333 9340 ASSERT(format <= state->dts_nformats);
9334 9341 ASSERT(state->dts_formats[format - 1] != NULL);
9335 9342
9336 9343 fmt = state->dts_formats[format - 1];
9337 9344 kmem_free(fmt, strlen(fmt) + 1);
9338 9345 state->dts_formats[format - 1] = NULL;
9339 9346 }
9340 9347
9341 9348 static void
9342 9349 dtrace_format_destroy(dtrace_state_t *state)
9343 9350 {
9344 9351 int i;
9345 9352
9346 9353 if (state->dts_nformats == 0) {
9347 9354 ASSERT(state->dts_formats == NULL);
9348 9355 return;
9349 9356 }
9350 9357
9351 9358 ASSERT(state->dts_formats != NULL);
9352 9359
9353 9360 for (i = 0; i < state->dts_nformats; i++) {
9354 9361 char *fmt = state->dts_formats[i];
9355 9362
9356 9363 if (fmt == NULL)
9357 9364 continue;
9358 9365
9359 9366 kmem_free(fmt, strlen(fmt) + 1);
9360 9367 }
9361 9368
9362 9369 kmem_free(state->dts_formats, state->dts_nformats * sizeof (char *));
9363 9370 state->dts_nformats = 0;
9364 9371 state->dts_formats = NULL;
9365 9372 }
9366 9373
9367 9374 /*
9368 9375 * DTrace Predicate Functions
9369 9376 */
9370 9377 static dtrace_predicate_t *
9371 9378 dtrace_predicate_create(dtrace_difo_t *dp)
9372 9379 {
9373 9380 dtrace_predicate_t *pred;
9374 9381
9375 9382 ASSERT(MUTEX_HELD(&dtrace_lock));
9376 9383 ASSERT(dp->dtdo_refcnt != 0);
9377 9384
9378 9385 pred = kmem_zalloc(sizeof (dtrace_predicate_t), KM_SLEEP);
9379 9386 pred->dtp_difo = dp;
9380 9387 pred->dtp_refcnt = 1;
9381 9388
9382 9389 if (!dtrace_difo_cacheable(dp))
9383 9390 return (pred);
9384 9391
9385 9392 if (dtrace_predcache_id == DTRACE_CACHEIDNONE) {
9386 9393 /*
9387 9394 * This is only theoretically possible -- we have had 2^32
9388 9395 * cacheable predicates on this machine. We cannot allow any
9389 9396 * more predicates to become cacheable: as unlikely as it is,
9390 9397 * there may be a thread caching a (now stale) predicate cache
9391 9398 * ID. (N.B.: the temptation is being successfully resisted to
9392 9399 * have this cmn_err() "Holy shit -- we executed this code!")
9393 9400 */
9394 9401 return (pred);
9395 9402 }
9396 9403
9397 9404 pred->dtp_cacheid = dtrace_predcache_id++;
9398 9405
9399 9406 return (pred);
9400 9407 }
9401 9408
9402 9409 static void
9403 9410 dtrace_predicate_hold(dtrace_predicate_t *pred)
9404 9411 {
9405 9412 ASSERT(MUTEX_HELD(&dtrace_lock));
9406 9413 ASSERT(pred->dtp_difo != NULL && pred->dtp_difo->dtdo_refcnt != 0);
9407 9414 ASSERT(pred->dtp_refcnt > 0);
9408 9415
9409 9416 pred->dtp_refcnt++;
9410 9417 }
9411 9418
9412 9419 static void
9413 9420 dtrace_predicate_release(dtrace_predicate_t *pred, dtrace_vstate_t *vstate)
9414 9421 {
9415 9422 dtrace_difo_t *dp = pred->dtp_difo;
9416 9423
9417 9424 ASSERT(MUTEX_HELD(&dtrace_lock));
9418 9425 ASSERT(dp != NULL && dp->dtdo_refcnt != 0);
9419 9426 ASSERT(pred->dtp_refcnt > 0);
9420 9427
9421 9428 if (--pred->dtp_refcnt == 0) {
9422 9429 dtrace_difo_release(pred->dtp_difo, vstate);
9423 9430 kmem_free(pred, sizeof (dtrace_predicate_t));
9424 9431 }
9425 9432 }
9426 9433
9427 9434 /*
9428 9435 * DTrace Action Description Functions
9429 9436 */
9430 9437 static dtrace_actdesc_t *
9431 9438 dtrace_actdesc_create(dtrace_actkind_t kind, uint32_t ntuple,
9432 9439 uint64_t uarg, uint64_t arg)
9433 9440 {
9434 9441 dtrace_actdesc_t *act;
9435 9442
9436 9443 ASSERT(!DTRACEACT_ISPRINTFLIKE(kind) || (arg != NULL &&
9437 9444 arg >= KERNELBASE) || (arg == NULL && kind == DTRACEACT_PRINTA));
9438 9445
9439 9446 act = kmem_zalloc(sizeof (dtrace_actdesc_t), KM_SLEEP);
9440 9447 act->dtad_kind = kind;
9441 9448 act->dtad_ntuple = ntuple;
9442 9449 act->dtad_uarg = uarg;
9443 9450 act->dtad_arg = arg;
9444 9451 act->dtad_refcnt = 1;
9445 9452
9446 9453 return (act);
9447 9454 }
9448 9455
9449 9456 static void
9450 9457 dtrace_actdesc_hold(dtrace_actdesc_t *act)
9451 9458 {
9452 9459 ASSERT(act->dtad_refcnt >= 1);
9453 9460 act->dtad_refcnt++;
9454 9461 }
9455 9462
9456 9463 static void
9457 9464 dtrace_actdesc_release(dtrace_actdesc_t *act, dtrace_vstate_t *vstate)
9458 9465 {
9459 9466 dtrace_actkind_t kind = act->dtad_kind;
9460 9467 dtrace_difo_t *dp;
9461 9468
9462 9469 ASSERT(act->dtad_refcnt >= 1);
9463 9470
9464 9471 if (--act->dtad_refcnt != 0)
9465 9472 return;
9466 9473
9467 9474 if ((dp = act->dtad_difo) != NULL)
9468 9475 dtrace_difo_release(dp, vstate);
9469 9476
9470 9477 if (DTRACEACT_ISPRINTFLIKE(kind)) {
9471 9478 char *str = (char *)(uintptr_t)act->dtad_arg;
9472 9479
9473 9480 ASSERT((str != NULL && (uintptr_t)str >= KERNELBASE) ||
9474 9481 (str == NULL && act->dtad_kind == DTRACEACT_PRINTA));
9475 9482
9476 9483 if (str != NULL)
9477 9484 kmem_free(str, strlen(str) + 1);
9478 9485 }
9479 9486
9480 9487 kmem_free(act, sizeof (dtrace_actdesc_t));
9481 9488 }
9482 9489
9483 9490 /*
9484 9491 * DTrace ECB Functions
9485 9492 */
9486 9493 static dtrace_ecb_t *
9487 9494 dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe)
9488 9495 {
9489 9496 dtrace_ecb_t *ecb;
9490 9497 dtrace_epid_t epid;
9491 9498
9492 9499 ASSERT(MUTEX_HELD(&dtrace_lock));
9493 9500
9494 9501 ecb = kmem_zalloc(sizeof (dtrace_ecb_t), KM_SLEEP);
9495 9502 ecb->dte_predicate = NULL;
9496 9503 ecb->dte_probe = probe;
9497 9504
9498 9505 /*
9499 9506 * The default size is the size of the default action: recording
9500 9507 * the epid.
9501 9508 */
9502 9509 ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t);
9503 9510 ecb->dte_alignment = sizeof (dtrace_epid_t);
9504 9511
9505 9512 epid = state->dts_epid++;
9506 9513
9507 9514 if (epid - 1 >= state->dts_necbs) {
9508 9515 dtrace_ecb_t **oecbs = state->dts_ecbs, **ecbs;
9509 9516 int necbs = state->dts_necbs << 1;
9510 9517
9511 9518 ASSERT(epid == state->dts_necbs + 1);
9512 9519
9513 9520 if (necbs == 0) {
9514 9521 ASSERT(oecbs == NULL);
9515 9522 necbs = 1;
9516 9523 }
9517 9524
9518 9525 ecbs = kmem_zalloc(necbs * sizeof (*ecbs), KM_SLEEP);
9519 9526
9520 9527 if (oecbs != NULL)
9521 9528 bcopy(oecbs, ecbs, state->dts_necbs * sizeof (*ecbs));
9522 9529
9523 9530 dtrace_membar_producer();
9524 9531 state->dts_ecbs = ecbs;
9525 9532
9526 9533 if (oecbs != NULL) {
9527 9534 /*
9528 9535 * If this state is active, we must dtrace_sync()
9529 9536 * before we can free the old dts_ecbs array: we're
9530 9537 * coming in hot, and there may be active ring
9531 9538 * buffer processing (which indexes into the dts_ecbs
9532 9539 * array) on another CPU.
9533 9540 */
9534 9541 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
9535 9542 dtrace_sync();
9536 9543
9537 9544 kmem_free(oecbs, state->dts_necbs * sizeof (*ecbs));
9538 9545 }
9539 9546
9540 9547 dtrace_membar_producer();
9541 9548 state->dts_necbs = necbs;
9542 9549 }
9543 9550
9544 9551 ecb->dte_state = state;
9545 9552
9546 9553 ASSERT(state->dts_ecbs[epid - 1] == NULL);
9547 9554 dtrace_membar_producer();
9548 9555 state->dts_ecbs[(ecb->dte_epid = epid) - 1] = ecb;
9549 9556
9550 9557 return (ecb);
9551 9558 }
9552 9559
9553 9560 static int
9554 9561 dtrace_ecb_enable(dtrace_ecb_t *ecb)
9555 9562 {
9556 9563 dtrace_probe_t *probe = ecb->dte_probe;
9557 9564
9558 9565 ASSERT(MUTEX_HELD(&cpu_lock));
9559 9566 ASSERT(MUTEX_HELD(&dtrace_lock));
9560 9567 ASSERT(ecb->dte_next == NULL);
9561 9568
9562 9569 if (probe == NULL) {
9563 9570 /*
9564 9571 * This is the NULL probe -- there's nothing to do.
9565 9572 */
9566 9573 return (0);
9567 9574 }
9568 9575
9569 9576 if (probe->dtpr_ecb == NULL) {
9570 9577 dtrace_provider_t *prov = probe->dtpr_provider;
9571 9578
9572 9579 /*
9573 9580 * We're the first ECB on this probe.
9574 9581 */
9575 9582 probe->dtpr_ecb = probe->dtpr_ecb_last = ecb;
9576 9583
9577 9584 if (ecb->dte_predicate != NULL)
9578 9585 probe->dtpr_predcache = ecb->dte_predicate->dtp_cacheid;
9579 9586
9580 9587 return (prov->dtpv_pops.dtps_enable(prov->dtpv_arg,
9581 9588 probe->dtpr_id, probe->dtpr_arg));
9582 9589 } else {
9583 9590 /*
9584 9591 * This probe is already active. Swing the last pointer to
9585 9592 * point to the new ECB, and issue a dtrace_sync() to assure
9586 9593 * that all CPUs have seen the change.
9587 9594 */
9588 9595 ASSERT(probe->dtpr_ecb_last != NULL);
9589 9596 probe->dtpr_ecb_last->dte_next = ecb;
9590 9597 probe->dtpr_ecb_last = ecb;
9591 9598 probe->dtpr_predcache = 0;
9592 9599
9593 9600 dtrace_sync();
9594 9601 return (0);
9595 9602 }
9596 9603 }
9597 9604
9598 9605 static void
9599 9606 dtrace_ecb_resize(dtrace_ecb_t *ecb)
9600 9607 {
9601 9608 uint32_t maxalign = sizeof (dtrace_epid_t);
9602 9609 uint32_t align = sizeof (uint8_t), offs, diff;
9603 9610 dtrace_action_t *act;
9604 9611 int wastuple = 0;
9605 9612 uint32_t aggbase = UINT32_MAX;
9606 9613 dtrace_state_t *state = ecb->dte_state;
9607 9614
9608 9615 /*
9609 9616 * If we record anything, we always record the epid. (And we always
9610 9617 * record it first.)
9611 9618 */
9612 9619 offs = sizeof (dtrace_epid_t);
9613 9620 ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t);
9614 9621
9615 9622 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
9616 9623 dtrace_recdesc_t *rec = &act->dta_rec;
9617 9624
9618 9625 if ((align = rec->dtrd_alignment) > maxalign)
9619 9626 maxalign = align;
9620 9627
9621 9628 if (!wastuple && act->dta_intuple) {
9622 9629 /*
9623 9630 * This is the first record in a tuple. Align the
9624 9631 * offset to be at offset 4 in an 8-byte aligned
9625 9632 * block.
9626 9633 */
9627 9634 diff = offs + sizeof (dtrace_aggid_t);
9628 9635
9629 9636 if (diff = (diff & (sizeof (uint64_t) - 1)))
9630 9637 offs += sizeof (uint64_t) - diff;
9631 9638
9632 9639 aggbase = offs - sizeof (dtrace_aggid_t);
9633 9640 ASSERT(!(aggbase & (sizeof (uint64_t) - 1)));
9634 9641 }
9635 9642
9636 9643 /*LINTED*/
9637 9644 if (rec->dtrd_size != 0 && (diff = (offs & (align - 1)))) {
9638 9645 /*
9639 9646 * The current offset is not properly aligned; align it.
9640 9647 */
9641 9648 offs += align - diff;
9642 9649 }
9643 9650
9644 9651 rec->dtrd_offset = offs;
9645 9652
9646 9653 if (offs + rec->dtrd_size > ecb->dte_needed) {
9647 9654 ecb->dte_needed = offs + rec->dtrd_size;
9648 9655
9649 9656 if (ecb->dte_needed > state->dts_needed)
9650 9657 state->dts_needed = ecb->dte_needed;
9651 9658 }
9652 9659
9653 9660 if (DTRACEACT_ISAGG(act->dta_kind)) {
9654 9661 dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
9655 9662 dtrace_action_t *first = agg->dtag_first, *prev;
9656 9663
9657 9664 ASSERT(rec->dtrd_size != 0 && first != NULL);
9658 9665 ASSERT(wastuple);
9659 9666 ASSERT(aggbase != UINT32_MAX);
9660 9667
9661 9668 agg->dtag_base = aggbase;
9662 9669
9663 9670 while ((prev = first->dta_prev) != NULL &&
9664 9671 DTRACEACT_ISAGG(prev->dta_kind)) {
9665 9672 agg = (dtrace_aggregation_t *)prev;
9666 9673 first = agg->dtag_first;
9667 9674 }
9668 9675
9669 9676 if (prev != NULL) {
9670 9677 offs = prev->dta_rec.dtrd_offset +
9671 9678 prev->dta_rec.dtrd_size;
9672 9679 } else {
9673 9680 offs = sizeof (dtrace_epid_t);
9674 9681 }
9675 9682 wastuple = 0;
9676 9683 } else {
9677 9684 if (!act->dta_intuple)
9678 9685 ecb->dte_size = offs + rec->dtrd_size;
9679 9686
9680 9687 offs += rec->dtrd_size;
9681 9688 }
9682 9689
9683 9690 wastuple = act->dta_intuple;
9684 9691 }
9685 9692
9686 9693 if ((act = ecb->dte_action) != NULL &&
9687 9694 !(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) &&
9688 9695 ecb->dte_size == sizeof (dtrace_epid_t)) {
9689 9696 /*
9690 9697 * If the size is still sizeof (dtrace_epid_t), then all
9691 9698 * actions store no data; set the size to 0.
9692 9699 */
9693 9700 ecb->dte_alignment = maxalign;
9694 9701 ecb->dte_size = 0;
9695 9702
9696 9703 /*
9697 9704 * If the needed space is still sizeof (dtrace_epid_t), then
9698 9705 * all actions need no additional space; set the needed
9699 9706 * size to 0.
9700 9707 */
9701 9708 if (ecb->dte_needed == sizeof (dtrace_epid_t))
9702 9709 ecb->dte_needed = 0;
9703 9710
9704 9711 return;
9705 9712 }
9706 9713
9707 9714 /*
9708 9715 * Set our alignment, and make sure that the dte_size and dte_needed
9709 9716 * are aligned to the size of an EPID.
9710 9717 */
9711 9718 ecb->dte_alignment = maxalign;
9712 9719 ecb->dte_size = (ecb->dte_size + (sizeof (dtrace_epid_t) - 1)) &
9713 9720 ~(sizeof (dtrace_epid_t) - 1);
9714 9721 ecb->dte_needed = (ecb->dte_needed + (sizeof (dtrace_epid_t) - 1)) &
9715 9722 ~(sizeof (dtrace_epid_t) - 1);
9716 9723 ASSERT(ecb->dte_size <= ecb->dte_needed);
9717 9724 }
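
/*
 * The final statements of dtrace_ecb_resize() round dte_size and
 * dte_needed up to a multiple of sizeof (dtrace_epid_t) using the
 * classic power-of-two trick (x + (a - 1)) & ~(a - 1).  Demonstrated
 * stand-alone:
 */
#include <assert.h>
#include <stdint.h>

static uint32_t
roundup_pow2(uint32_t x, uint32_t a)	/* a must be a power of two */
{
	return ((x + (a - 1)) & ~(a - 1));
}

int
main(void)
{
	assert(roundup_pow2(0, 4) == 0);
	assert(roundup_pow2(1, 4) == 4);
	assert(roundup_pow2(4, 4) == 4);
	assert(roundup_pow2(13, 8) == 16);
	return (0);
}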
9718 9725
9719 9726 static dtrace_action_t *
9720 9727 dtrace_ecb_aggregation_create(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
9721 9728 {
9722 9729 dtrace_aggregation_t *agg;
9723 9730 size_t size = sizeof (uint64_t);
9724 9731 int ntuple = desc->dtad_ntuple;
9725 9732 dtrace_action_t *act;
9726 9733 dtrace_recdesc_t *frec;
9727 9734 dtrace_aggid_t aggid;
9728 9735 dtrace_state_t *state = ecb->dte_state;
9729 9736
9730 9737 agg = kmem_zalloc(sizeof (dtrace_aggregation_t), KM_SLEEP);
9731 9738 agg->dtag_ecb = ecb;
9732 9739
9733 9740 ASSERT(DTRACEACT_ISAGG(desc->dtad_kind));
9734 9741
9735 9742 switch (desc->dtad_kind) {
9736 9743 case DTRACEAGG_MIN:
9737 9744 agg->dtag_initial = INT64_MAX;
9738 9745 agg->dtag_aggregate = dtrace_aggregate_min;
9739 9746 break;
9740 9747
9741 9748 case DTRACEAGG_MAX:
9742 9749 agg->dtag_initial = INT64_MIN;
9743 9750 agg->dtag_aggregate = dtrace_aggregate_max;
9744 9751 break;
9745 9752
9746 9753 case DTRACEAGG_COUNT:
9747 9754 agg->dtag_aggregate = dtrace_aggregate_count;
9748 9755 break;
9749 9756
9750 9757 case DTRACEAGG_QUANTIZE:
9751 9758 agg->dtag_aggregate = dtrace_aggregate_quantize;
9752 9759 size = (((sizeof (uint64_t) * NBBY) - 1) * 2 + 1) *
9753 9760 sizeof (uint64_t);
9754 9761 break;
9755 9762
9756 9763 case DTRACEAGG_LQUANTIZE: {
9757 9764 uint16_t step = DTRACE_LQUANTIZE_STEP(desc->dtad_arg);
9758 9765 uint16_t levels = DTRACE_LQUANTIZE_LEVELS(desc->dtad_arg);
9759 9766
9760 9767 agg->dtag_initial = desc->dtad_arg;
9761 9768 agg->dtag_aggregate = dtrace_aggregate_lquantize;
9762 9769
9763 9770 if (step == 0 || levels == 0)
9764 9771 goto err;
9765 9772
9766 9773 size = levels * sizeof (uint64_t) + 3 * sizeof (uint64_t);
9767 9774 break;
9768 9775 }
9769 9776
9770 9777 case DTRACEAGG_LLQUANTIZE: {
9771 9778 uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(desc->dtad_arg);
9772 9779 uint16_t low = DTRACE_LLQUANTIZE_LOW(desc->dtad_arg);
9773 9780 uint16_t high = DTRACE_LLQUANTIZE_HIGH(desc->dtad_arg);
9774 9781 uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(desc->dtad_arg);
9775 9782 int64_t v;
9776 9783
9777 9784 agg->dtag_initial = desc->dtad_arg;
9778 9785 agg->dtag_aggregate = dtrace_aggregate_llquantize;
9779 9786
9780 9787 if (factor < 2 || low >= high || nsteps < factor)
9781 9788 goto err;
9782 9789
9783 9790 /*
9784 9791 * Now check that the number of steps evenly divides a power
9785 9792 * of the factor. (This assures both integer bucket size and
9786 9793 * linearity within each magnitude.)
9787 9794 */
9788 9795 for (v = factor; v < nsteps; v *= factor)
9789 9796 continue;
9790 9797
9791 9798 if ((v % nsteps) || (nsteps % factor))
9792 9799 goto err;
9793 9800
9794 9801 size = (dtrace_aggregate_llquantize_bucket(factor,
9795 9802 low, high, nsteps, INT64_MAX) + 2) * sizeof (uint64_t);
9796 9803 break;
9797 9804 }
9798 9805
9799 9806 case DTRACEAGG_AVG:
9800 9807 agg->dtag_aggregate = dtrace_aggregate_avg;
9801 9808 size = sizeof (uint64_t) * 2;
9802 9809 break;
9803 9810
9804 9811 case DTRACEAGG_STDDEV:
9805 9812 agg->dtag_aggregate = dtrace_aggregate_stddev;
9806 9813 size = sizeof (uint64_t) * 4;
9807 9814 break;
9808 9815
9809 9816 case DTRACEAGG_SUM:
9810 9817 agg->dtag_aggregate = dtrace_aggregate_sum;
9811 9818 break;
9812 9819
9813 9820 default:
9814 9821 goto err;
9815 9822 }
9816 9823
9817 9824 agg->dtag_action.dta_rec.dtrd_size = size;
9818 9825
9819 9826 if (ntuple == 0)
9820 9827 goto err;
9821 9828
9822 9829 /*
9823 9830 * We must make sure that we have enough actions for the n-tuple.
9824 9831 */
9825 9832 for (act = ecb->dte_action_last; act != NULL; act = act->dta_prev) {
9826 9833 if (DTRACEACT_ISAGG(act->dta_kind))
9827 9834 break;
9828 9835
9829 9836 if (--ntuple == 0) {
9830 9837 /*
9831 9838 * This is the action with which our n-tuple begins.
9832 9839 */
9833 9840 agg->dtag_first = act;
9834 9841 goto success;
9835 9842 }
9836 9843 }
9837 9844
9838 9845 /*
9839 9846 * This n-tuple is short by ntuple elements. Return failure.
9840 9847 */
9841 9848 ASSERT(ntuple != 0);
9842 9849 err:
9843 9850 kmem_free(agg, sizeof (dtrace_aggregation_t));
9844 9851 return (NULL);
9845 9852
9846 9853 success:
9847 9854 /*
9848 9855 * If the last action in the tuple has a size of zero, it's actually
9849 9856 * an expression argument for the aggregating action.
9850 9857 */
9851 9858 ASSERT(ecb->dte_action_last != NULL);
9852 9859 act = ecb->dte_action_last;
9853 9860
9854 9861 if (act->dta_kind == DTRACEACT_DIFEXPR) {
9855 9862 ASSERT(act->dta_difo != NULL);
9856 9863
9857 9864 if (act->dta_difo->dtdo_rtype.dtdt_size == 0)
9858 9865 agg->dtag_hasarg = 1;
9859 9866 }
9860 9867
9861 9868 /*
9862 9869 * We need to allocate an id for this aggregation.
9863 9870 */
9864 9871 aggid = (dtrace_aggid_t)(uintptr_t)vmem_alloc(state->dts_aggid_arena, 1,
9865 9872 VM_BESTFIT | VM_SLEEP);
9866 9873
9867 9874 if (aggid - 1 >= state->dts_naggregations) {
9868 9875 dtrace_aggregation_t **oaggs = state->dts_aggregations;
9869 9876 dtrace_aggregation_t **aggs;
9870 9877 int naggs = state->dts_naggregations << 1;
9871 9878 int onaggs = state->dts_naggregations;
9872 9879
9873 9880 ASSERT(aggid == state->dts_naggregations + 1);
9874 9881
9875 9882 if (naggs == 0) {
9876 9883 ASSERT(oaggs == NULL);
9877 9884 naggs = 1;
9878 9885 }
9879 9886
9880 9887 aggs = kmem_zalloc(naggs * sizeof (*aggs), KM_SLEEP);
9881 9888
9882 9889 if (oaggs != NULL) {
9883 9890 bcopy(oaggs, aggs, onaggs * sizeof (*aggs));
9884 9891 kmem_free(oaggs, onaggs * sizeof (*aggs));
9885 9892 }
9886 9893
9887 9894 state->dts_aggregations = aggs;
9888 9895 state->dts_naggregations = naggs;
9889 9896 }
9890 9897
9891 9898 ASSERT(state->dts_aggregations[aggid - 1] == NULL);
9892 9899 state->dts_aggregations[(agg->dtag_id = aggid) - 1] = agg;
9893 9900
9894 9901 frec = &agg->dtag_first->dta_rec;
9895 9902 if (frec->dtrd_alignment < sizeof (dtrace_aggid_t))
9896 9903 frec->dtrd_alignment = sizeof (dtrace_aggid_t);
9897 9904
9898 9905 for (act = agg->dtag_first; act != NULL; act = act->dta_next) {
9899 9906 ASSERT(!act->dta_intuple);
9900 9907 act->dta_intuple = 1;
9901 9908 }
9902 9909
9903 9910 return (&agg->dtag_action);
9904 9911 }
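
/*
 * A stand-alone rendering of the llquantize() parameter check above:
 * nsteps must evenly divide some power of factor, and factor must
 * divide nsteps, which together guarantee integer bucket sizes and
 * linear spacing within each order of magnitude.  The function mirrors
 * the guards in dtrace_ecb_aggregation_create(); low and high enter
 * only through the range check.
 */
#include <stdio.h>
#include <stdint.h>

static int
llquantize_params_ok(uint16_t factor, uint16_t low, uint16_t high,
    uint16_t nsteps)
{
	int64_t v;

	if (factor < 2 || low >= high || nsteps < factor)
		return (0);

	/* find the first power of factor that is >= nsteps */
	for (v = factor; v < nsteps; v *= factor)
		continue;

	return ((v % nsteps) == 0 && (nsteps % factor) == 0);
}

int
main(void)
{
	(void) printf("factor 10, nsteps 20: %s\n",
	    llquantize_params_ok(10, 0, 6, 20) ? "ok" : "rejected");
	(void) printf("factor 10, nsteps 30: %s\n",
	    llquantize_params_ok(10, 0, 6, 30) ? "ok" : "rejected");
	return (0);
}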
9905 9912
9906 9913 static void
9907 9914 dtrace_ecb_aggregation_destroy(dtrace_ecb_t *ecb, dtrace_action_t *act)
9908 9915 {
9909 9916 dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
9910 9917 dtrace_state_t *state = ecb->dte_state;
9911 9918 dtrace_aggid_t aggid = agg->dtag_id;
9912 9919
9913 9920 ASSERT(DTRACEACT_ISAGG(act->dta_kind));
9914 9921 vmem_free(state->dts_aggid_arena, (void *)(uintptr_t)aggid, 1);
9915 9922
9916 9923 ASSERT(state->dts_aggregations[aggid - 1] == agg);
9917 9924 state->dts_aggregations[aggid - 1] = NULL;
9918 9925
9919 9926 kmem_free(agg, sizeof (dtrace_aggregation_t));
9920 9927 }
9921 9928
9922 9929 static int
9923 9930 dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
9924 9931 {
9925 9932 dtrace_action_t *action, *last;
9926 9933 dtrace_difo_t *dp = desc->dtad_difo;
9927 9934 uint32_t size = 0, align = sizeof (uint8_t), mask;
9928 9935 uint16_t format = 0;
9929 9936 dtrace_recdesc_t *rec;
9930 9937 dtrace_state_t *state = ecb->dte_state;
9931 9938 dtrace_optval_t *opt = state->dts_options, nframes, strsize;
9932 9939 uint64_t arg = desc->dtad_arg;
9933 9940
9934 9941 ASSERT(MUTEX_HELD(&dtrace_lock));
9935 9942 ASSERT(ecb->dte_action == NULL || ecb->dte_action->dta_refcnt == 1);
9936 9943
9937 9944 if (DTRACEACT_ISAGG(desc->dtad_kind)) {
9938 9945 /*
9939 9946 * If this is an aggregating action, there must be neither
9940 9947 * a speculate nor a commit on the action chain.
9941 9948 */
9942 9949 dtrace_action_t *act;
9943 9950
9944 9951 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
9945 9952 if (act->dta_kind == DTRACEACT_COMMIT)
9946 9953 return (EINVAL);
9947 9954
9948 9955 if (act->dta_kind == DTRACEACT_SPECULATE)
9949 9956 return (EINVAL);
9950 9957 }
9951 9958
9952 9959 action = dtrace_ecb_aggregation_create(ecb, desc);
9953 9960
9954 9961 if (action == NULL)
9955 9962 return (EINVAL);
9956 9963 } else {
9957 9964 if (DTRACEACT_ISDESTRUCTIVE(desc->dtad_kind) ||
9958 9965 (desc->dtad_kind == DTRACEACT_DIFEXPR &&
9959 9966 dp != NULL && dp->dtdo_destructive)) {
9960 9967 state->dts_destructive = 1;
9961 9968 }
9962 9969
9963 9970 switch (desc->dtad_kind) {
9964 9971 case DTRACEACT_PRINTF:
9965 9972 case DTRACEACT_PRINTA:
9966 9973 case DTRACEACT_SYSTEM:
9967 9974 case DTRACEACT_FREOPEN:
9968 9975 case DTRACEACT_DIFEXPR:
9969 9976 /*
9970 9977 * We know that our arg is a string -- turn it into a
9971 9978 * format.
9972 9979 */
9973 9980 if (arg == NULL) {
9974 9981 ASSERT(desc->dtad_kind == DTRACEACT_PRINTA ||
9975 9982 desc->dtad_kind == DTRACEACT_DIFEXPR);
9976 9983 format = 0;
9977 9984 } else {
9978 9985 ASSERT(arg != NULL);
9979 9986 ASSERT(arg > KERNELBASE);
9980 9987 format = dtrace_format_add(state,
9981 9988 (char *)(uintptr_t)arg);
9982 9989 }
9983 9990
9984 9991 /*FALLTHROUGH*/
9985 9992 case DTRACEACT_LIBACT:
9986 9993 case DTRACEACT_TRACEMEM:
9987 9994 case DTRACEACT_TRACEMEM_DYNSIZE:
9988 9995 if (dp == NULL)
9989 9996 return (EINVAL);
9990 9997
9991 9998 if ((size = dp->dtdo_rtype.dtdt_size) != 0)
9992 9999 break;
9993 10000
9994 10001 if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
9995 10002 if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
9996 10003 return (EINVAL);
9997 10004
9998 10005 size = opt[DTRACEOPT_STRSIZE];
9999 10006 }
10000 10007
10001 10008 break;
10002 10009
10003 10010 case DTRACEACT_STACK:
10004 10011 if ((nframes = arg) == 0) {
10005 10012 nframes = opt[DTRACEOPT_STACKFRAMES];
10006 10013 ASSERT(nframes > 0);
10007 10014 arg = nframes;
10008 10015 }
10009 10016
10010 10017 size = nframes * sizeof (pc_t);
10011 10018 break;
10012 10019
10013 10020 case DTRACEACT_JSTACK:
10014 10021 if ((strsize = DTRACE_USTACK_STRSIZE(arg)) == 0)
10015 10022 strsize = opt[DTRACEOPT_JSTACKSTRSIZE];
10016 10023
10017 10024 if ((nframes = DTRACE_USTACK_NFRAMES(arg)) == 0)
10018 10025 nframes = opt[DTRACEOPT_JSTACKFRAMES];
10019 10026
10020 10027 arg = DTRACE_USTACK_ARG(nframes, strsize);
10021 10028
10022 10029 /*FALLTHROUGH*/
10023 10030 case DTRACEACT_USTACK:
10024 10031 if (desc->dtad_kind != DTRACEACT_JSTACK &&
10025 10032 (nframes = DTRACE_USTACK_NFRAMES(arg)) == 0) {
10026 10033 strsize = DTRACE_USTACK_STRSIZE(arg);
10027 10034 nframes = opt[DTRACEOPT_USTACKFRAMES];
10028 10035 ASSERT(nframes > 0);
10029 10036 arg = DTRACE_USTACK_ARG(nframes, strsize);
10030 10037 }
10031 10038
10032 10039 /*
10033 10040 * Save a slot for the pid.
10034 10041 */
10035 10042 size = (nframes + 1) * sizeof (uint64_t);
10036 10043 size += DTRACE_USTACK_STRSIZE(arg);
10037 10044 size = P2ROUNDUP(size, (uint32_t)(sizeof (uintptr_t)));
10038 10045
10039 10046 break;
10040 10047
10041 10048 case DTRACEACT_SYM:
10042 10049 case DTRACEACT_MOD:
10043 10050 if (dp == NULL || ((size = dp->dtdo_rtype.dtdt_size) !=
10044 10051 sizeof (uint64_t)) ||
10045 10052 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
10046 10053 return (EINVAL);
10047 10054 break;
10048 10055
10049 10056 case DTRACEACT_USYM:
10050 10057 case DTRACEACT_UMOD:
10051 10058 case DTRACEACT_UADDR:
10052 10059 if (dp == NULL ||
10053 10060 (dp->dtdo_rtype.dtdt_size != sizeof (uint64_t)) ||
10054 10061 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
10055 10062 return (EINVAL);
10056 10063
10057 10064 /*
10058 10065 * We have a slot for the pid, plus a slot for the
10059 10066 * argument. To keep things simple (aligned with
10060 10067 * bitness-neutral sizing), we store each as a 64-bit
10061 10068 * quantity.
10062 10069 */
10063 10070 size = 2 * sizeof (uint64_t);
10064 10071 break;
10065 10072
10066 10073 case DTRACEACT_STOP:
10067 10074 case DTRACEACT_BREAKPOINT:
10068 10075 case DTRACEACT_PANIC:
10069 10076 break;
10070 10077
10071 10078 case DTRACEACT_CHILL:
10072 10079 case DTRACEACT_DISCARD:
10073 10080 case DTRACEACT_RAISE:
10074 10081 if (dp == NULL)
10075 10082 return (EINVAL);
10076 10083 break;
10077 10084
10078 10085 case DTRACEACT_EXIT:
10079 10086 if (dp == NULL ||
10080 10087 (size = dp->dtdo_rtype.dtdt_size) != sizeof (int) ||
10081 10088 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
10082 10089 return (EINVAL);
10083 10090 break;
10084 10091
10085 10092 case DTRACEACT_SPECULATE:
10086 10093 if (ecb->dte_size > sizeof (dtrace_epid_t))
10087 10094 return (EINVAL);
10088 10095
10089 10096 if (dp == NULL)
10090 10097 return (EINVAL);
10091 10098
10092 10099 state->dts_speculates = 1;
10093 10100 break;
10094 10101
10095 10102 case DTRACEACT_COMMIT: {
10096 10103 dtrace_action_t *act = ecb->dte_action;
10097 10104
10098 10105 for (; act != NULL; act = act->dta_next) {
10099 10106 if (act->dta_kind == DTRACEACT_COMMIT)
10100 10107 return (EINVAL);
10101 10108 }
10102 10109
10103 10110 if (dp == NULL)
10104 10111 return (EINVAL);
10105 10112 break;
10106 10113 }
10107 10114
10108 10115 default:
10109 10116 return (EINVAL);
10110 10117 }
10111 10118
10112 10119 if (size != 0 || desc->dtad_kind == DTRACEACT_SPECULATE) {
10113 10120 /*
10114 10121 * If this is a data-storing action or a speculate,
10115 10122 * we must be sure that there isn't a commit on the
10116 10123 * action chain.
10117 10124 */
10118 10125 dtrace_action_t *act = ecb->dte_action;
10119 10126
10120 10127 for (; act != NULL; act = act->dta_next) {
10121 10128 if (act->dta_kind == DTRACEACT_COMMIT)
10122 10129 return (EINVAL);
10123 10130 }
10124 10131 }
10125 10132
10126 10133 action = kmem_zalloc(sizeof (dtrace_action_t), KM_SLEEP);
10127 10134 action->dta_rec.dtrd_size = size;
10128 10135 }
10129 10136
10130 10137 action->dta_refcnt = 1;
10131 10138 rec = &action->dta_rec;
10132 10139 size = rec->dtrd_size;
10133 10140
10134 10141 for (mask = sizeof (uint64_t) - 1; size != 0 && mask > 0; mask >>= 1) {
10135 10142 if (!(size & mask)) {
10136 10143 align = mask + 1;
10137 10144 break;
10138 10145 }
10139 10146 }
10140 10147
10141 10148 action->dta_kind = desc->dtad_kind;
10142 10149
10143 10150 if ((action->dta_difo = dp) != NULL)
10144 10151 dtrace_difo_hold(dp);
10145 10152
10146 10153 rec->dtrd_action = action->dta_kind;
10147 10154 rec->dtrd_arg = arg;
10148 10155 rec->dtrd_uarg = desc->dtad_uarg;
10149 10156 rec->dtrd_alignment = (uint16_t)align;
10150 10157 rec->dtrd_format = format;
10151 10158
10152 10159 if ((last = ecb->dte_action_last) != NULL) {
10153 10160 ASSERT(ecb->dte_action != NULL);
10154 10161 action->dta_prev = last;
10155 10162 last->dta_next = action;
10156 10163 } else {
10157 10164 ASSERT(ecb->dte_action == NULL);
10158 10165 ecb->dte_action = action;
10159 10166 }
10160 10167
10161 10168 ecb->dte_action_last = action;
10162 10169
10163 10170 return (0);
10164 10171 }
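
/*
 * The mask loop in dtrace_ecb_action_add() derives a record's
 * alignment from its size: the result is the largest power of two
 * (capped at sizeof (uint64_t)) that divides the size, falling back to
 * byte alignment.  Extracted and exercised stand-alone:
 */
#include <assert.h>
#include <stdint.h>

static uint32_t
align_for_size(uint32_t size)
{
	uint32_t align = sizeof (uint8_t), mask;

	for (mask = sizeof (uint64_t) - 1; size != 0 && mask > 0;
	    mask >>= 1) {
		if (!(size & mask)) {
			align = mask + 1;
			break;
		}
	}
	return (align);
}

int
main(void)
{
	assert(align_for_size(16) == 8);
	assert(align_for_size(12) == 4);
	assert(align_for_size(6) == 2);
	assert(align_for_size(5) == 1);
	assert(align_for_size(0) == 1);	/* zero-sized: stays byte-aligned */
	return (0);
}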
10165 10172
10166 10173 static void
10167 10174 dtrace_ecb_action_remove(dtrace_ecb_t *ecb)
10168 10175 {
10169 10176 dtrace_action_t *act = ecb->dte_action, *next;
10170 10177 dtrace_vstate_t *vstate = &ecb->dte_state->dts_vstate;
10171 10178 dtrace_difo_t *dp;
10172 10179 uint16_t format;
10173 10180
10174 10181 if (act != NULL && act->dta_refcnt > 1) {
10175 10182 ASSERT(act->dta_next == NULL || act->dta_next->dta_refcnt == 1);
10176 10183 act->dta_refcnt--;
10177 10184 } else {
10178 10185 for (; act != NULL; act = next) {
10179 10186 next = act->dta_next;
10180 10187 ASSERT(next != NULL || act == ecb->dte_action_last);
10181 10188 ASSERT(act->dta_refcnt == 1);
10182 10189
10183 10190 if ((format = act->dta_rec.dtrd_format) != 0)
10184 10191 dtrace_format_remove(ecb->dte_state, format);
10185 10192
10186 10193 if ((dp = act->dta_difo) != NULL)
10187 10194 dtrace_difo_release(dp, vstate);
10188 10195
10189 10196 if (DTRACEACT_ISAGG(act->dta_kind)) {
10190 10197 dtrace_ecb_aggregation_destroy(ecb, act);
10191 10198 } else {
10192 10199 kmem_free(act, sizeof (dtrace_action_t));
10193 10200 }
10194 10201 }
10195 10202 }
10196 10203
10197 10204 ecb->dte_action = NULL;
10198 10205 ecb->dte_action_last = NULL;
10199 10206 ecb->dte_size = sizeof (dtrace_epid_t);
10200 10207 }
10201 10208
10202 10209 static void
10203 10210 dtrace_ecb_disable(dtrace_ecb_t *ecb)
10204 10211 {
10205 10212 /*
10206 10213 * We disable the ECB by removing it from its probe.
10207 10214 */
10208 10215 dtrace_ecb_t *pecb, *prev = NULL;
10209 10216 dtrace_probe_t *probe = ecb->dte_probe;
10210 10217
10211 10218 ASSERT(MUTEX_HELD(&dtrace_lock));
10212 10219
10213 10220 if (probe == NULL) {
10214 10221 /*
10215 10222 * This is the NULL probe; there is nothing to disable.
10216 10223 */
10217 10224 return;
10218 10225 }
10219 10226
10220 10227 for (pecb = probe->dtpr_ecb; pecb != NULL; pecb = pecb->dte_next) {
10221 10228 if (pecb == ecb)
10222 10229 break;
10223 10230 prev = pecb;
10224 10231 }
10225 10232
10226 10233 ASSERT(pecb != NULL);
10227 10234
10228 10235 if (prev == NULL) {
10229 10236 probe->dtpr_ecb = ecb->dte_next;
10230 10237 } else {
10231 10238 prev->dte_next = ecb->dte_next;
10232 10239 }
10233 10240
10234 10241 if (ecb == probe->dtpr_ecb_last) {
10235 10242 ASSERT(ecb->dte_next == NULL);
10236 10243 probe->dtpr_ecb_last = prev;
10237 10244 }
10238 10245
10239 10246 /*
10240 10247 * The ECB has been disconnected from the probe; now sync to assure
10241 10248 * that all CPUs have seen the change before returning.
10242 10249 */
10243 10250 dtrace_sync();
10244 10251
10245 10252 if (probe->dtpr_ecb == NULL) {
10246 10253 /*
10247 10254 * That was the last ECB on the probe; clear the predicate
10248 10255 * cache ID for the probe, disable it and sync one more time
10249 10256 * to assure that we'll never hit it again.
10250 10257 */
10251 10258 dtrace_provider_t *prov = probe->dtpr_provider;
10252 10259
10253 10260 ASSERT(ecb->dte_next == NULL);
10254 10261 ASSERT(probe->dtpr_ecb_last == NULL);
10255 10262 probe->dtpr_predcache = DTRACE_CACHEIDNONE;
10256 10263 prov->dtpv_pops.dtps_disable(prov->dtpv_arg,
10257 10264 probe->dtpr_id, probe->dtpr_arg);
10258 10265 dtrace_sync();
10259 10266 } else {
10260 10267 /*
10261 10268 * There is at least one ECB remaining on the probe. If there
10262 10269 * is _exactly_ one, set the probe's predicate cache ID to be
10263 10270 * the predicate cache ID of the remaining ECB.
10264 10271 */
10265 10272 ASSERT(probe->dtpr_ecb_last != NULL);
10266 10273 ASSERT(probe->dtpr_predcache == DTRACE_CACHEIDNONE);
10267 10274
10268 10275 if (probe->dtpr_ecb == probe->dtpr_ecb_last) {
10269 10276 dtrace_predicate_t *p = probe->dtpr_ecb->dte_predicate;
10270 10277
10271 10278 ASSERT(probe->dtpr_ecb->dte_next == NULL);
10272 10279
10273 10280 if (p != NULL)
10274 10281 probe->dtpr_predcache = p->dtp_cacheid;
10275 10282 }
10276 10283
10277 10284 ecb->dte_next = NULL;
10278 10285 }
10279 10286 }
10280 10287
10281 10288 static void
10282 10289 dtrace_ecb_destroy(dtrace_ecb_t *ecb)
10283 10290 {
10284 10291 dtrace_state_t *state = ecb->dte_state;
10285 10292 dtrace_vstate_t *vstate = &state->dts_vstate;
10286 10293 dtrace_predicate_t *pred;
10287 10294 dtrace_epid_t epid = ecb->dte_epid;
10288 10295
10289 10296 ASSERT(MUTEX_HELD(&dtrace_lock));
10290 10297 ASSERT(ecb->dte_next == NULL);
10291 10298 ASSERT(ecb->dte_probe == NULL || ecb->dte_probe->dtpr_ecb != ecb);
10292 10299
10293 10300 if ((pred = ecb->dte_predicate) != NULL)
10294 10301 dtrace_predicate_release(pred, vstate);
10295 10302
10296 10303 dtrace_ecb_action_remove(ecb);
10297 10304
10298 10305 ASSERT(state->dts_ecbs[epid - 1] == ecb);
10299 10306 state->dts_ecbs[epid - 1] = NULL;
10300 10307
10301 10308 kmem_free(ecb, sizeof (dtrace_ecb_t));
10302 10309 }
10303 10310
10304 10311 static dtrace_ecb_t *
10305 10312 dtrace_ecb_create(dtrace_state_t *state, dtrace_probe_t *probe,
10306 10313 dtrace_enabling_t *enab)
10307 10314 {
10308 10315 dtrace_ecb_t *ecb;
10309 10316 dtrace_predicate_t *pred;
10310 10317 dtrace_actdesc_t *act;
10311 10318 dtrace_provider_t *prov;
10312 10319 dtrace_ecbdesc_t *desc = enab->dten_current;
10313 10320
10314 10321 ASSERT(MUTEX_HELD(&dtrace_lock));
10315 10322 ASSERT(state != NULL);
10316 10323
10317 10324 ecb = dtrace_ecb_add(state, probe);
10318 10325 ecb->dte_uarg = desc->dted_uarg;
10319 10326
10320 10327 if ((pred = desc->dted_pred.dtpdd_predicate) != NULL) {
10321 10328 dtrace_predicate_hold(pred);
10322 10329 ecb->dte_predicate = pred;
10323 10330 }
10324 10331
10325 10332 if (probe != NULL) {
10326 10333 /*
10327 10334 * If the provider shows more leg than the consumer is old
10328 10335 * enough to see, we need to enable the appropriate implicit
10329 10336 * predicate bits to prevent the ecb from activating at
10330 10337 * revealing times.
10331 10338 *
10332 10339 * Providers specifying DTRACE_PRIV_USER at register time
10333 10340 * are stating that they need the /proc-style privilege
10334 10341 * model to be enforced, and this is what DTRACE_COND_OWNER
10335 10342 * and DTRACE_COND_ZONEOWNER will then do at probe time.
10336 10343 */
10337 10344 prov = probe->dtpr_provider;
10338 10345 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLPROC) &&
10339 10346 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
10340 10347 ecb->dte_cond |= DTRACE_COND_OWNER;
10341 10348
10342 10349 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLZONE) &&
10343 10350 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
10344 10351 ecb->dte_cond |= DTRACE_COND_ZONEOWNER;
10345 10352
10346 10353 /*
10347 10354 * If the provider shows us kernel innards and the user
10348 10355 * is lacking sufficient privilege, enable the
10349 10356 * DTRACE_COND_USERMODE implicit predicate.
10350 10357 */
10351 10358 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) &&
10352 10359 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_KERNEL))
10353 10360 ecb->dte_cond |= DTRACE_COND_USERMODE;
10354 10361 }
10355 10362
10356 10363 if (dtrace_ecb_create_cache != NULL) {
10357 10364 /*
10358 10365 * If we have a cached ecb, we'll use its action list instead
10359 10366 * of creating our own (saving both time and space).
10360 10367 */
10361 10368 dtrace_ecb_t *cached = dtrace_ecb_create_cache;
10362 10369 dtrace_action_t *act = cached->dte_action;
10363 10370
10364 10371 if (act != NULL) {
10365 10372 ASSERT(act->dta_refcnt > 0);
10366 10373 act->dta_refcnt++;
10367 10374 ecb->dte_action = act;
10368 10375 ecb->dte_action_last = cached->dte_action_last;
10369 10376 ecb->dte_needed = cached->dte_needed;
10370 10377 ecb->dte_size = cached->dte_size;
10371 10378 ecb->dte_alignment = cached->dte_alignment;
10372 10379 }
10373 10380
10374 10381 return (ecb);
10375 10382 }
10376 10383
10377 10384 for (act = desc->dted_action; act != NULL; act = act->dtad_next) {
10378 10385 if ((enab->dten_error = dtrace_ecb_action_add(ecb, act)) != 0) {
10379 10386 dtrace_ecb_destroy(ecb);
10380 10387 return (NULL);
10381 10388 }
10382 10389 }
10383 10390
10384 10391 dtrace_ecb_resize(ecb);
10385 10392
10386 10393 return (dtrace_ecb_create_cache = ecb);
10387 10394 }
10388 10395
10389 10396 static int
10390 10397 dtrace_ecb_create_enable(dtrace_probe_t *probe, void *arg)
10391 10398 {
10392 10399 dtrace_ecb_t *ecb;
10393 10400 dtrace_enabling_t *enab = arg;
10394 10401 dtrace_state_t *state = enab->dten_vstate->dtvs_state;
10395 10402
10396 10403 ASSERT(state != NULL);
10397 10404
10398 10405 if (probe != NULL && probe->dtpr_gen < enab->dten_probegen) {
10399 10406 /*
10400 10407 * This probe was created in a generation for which this
10401 10408 * enabling has previously created ECBs; we don't want to
10402 10409 * enable it again, so just kick out.
10403 10410 */
10404 10411 return (DTRACE_MATCH_NEXT);
10405 10412 }
10406 10413
10407 10414 if ((ecb = dtrace_ecb_create(state, probe, enab)) == NULL)
10408 10415 return (DTRACE_MATCH_DONE);
10409 10416
10410 10417 if (dtrace_ecb_enable(ecb) < 0)
10411 10418 return (DTRACE_MATCH_FAIL);
10412 10419
10413 10420 return (DTRACE_MATCH_NEXT);
10414 10421 }
10415 10422
10416 10423 static dtrace_ecb_t *
10417 10424 dtrace_epid2ecb(dtrace_state_t *state, dtrace_epid_t id)
10418 10425 {
10419 10426 dtrace_ecb_t *ecb;
10420 10427
10421 10428 ASSERT(MUTEX_HELD(&dtrace_lock));
10422 10429
10423 10430 if (id == 0 || id > state->dts_necbs)
10424 10431 return (NULL);
10425 10432
10426 10433 ASSERT(state->dts_necbs > 0 && state->dts_ecbs != NULL);
10427 10434 ASSERT((ecb = state->dts_ecbs[id - 1]) == NULL || ecb->dte_epid == id);
10428 10435
10429 10436 return (state->dts_ecbs[id - 1]);
10430 10437 }
10431 10438
10432 10439 static dtrace_aggregation_t *
10433 10440 dtrace_aggid2agg(dtrace_state_t *state, dtrace_aggid_t id)
10434 10441 {
10435 10442 dtrace_aggregation_t *agg;
10436 10443
10437 10444 ASSERT(MUTEX_HELD(&dtrace_lock));
10438 10445
10439 10446 if (id == 0 || id > state->dts_naggregations)
10440 10447 return (NULL);
10441 10448
10442 10449 ASSERT(state->dts_naggregations > 0 && state->dts_aggregations != NULL);
10443 10450 ASSERT((agg = state->dts_aggregations[id - 1]) == NULL ||
10444 10451 agg->dtag_id == id);
10445 10452
10446 10453 return (state->dts_aggregations[id - 1]);
10447 10454 }
10448 10455
10449 10456 /*
10450 10457 * DTrace Buffer Functions
10451 10458 *
10452 10459 * The following functions manipulate DTrace buffers. Most of these functions
10453 10460 * are called in the context of establishing or processing consumer state;
10454 10461 * exceptions are explicitly noted.
10455 10462 */
10456 10463
10457 10464 /*
10458 10465 * Note: called from cross call context. This function switches the two
10459 10466 * buffers on a given CPU. The atomicity of this operation is assured by
10460 10467 * disabling interrupts while the actual switch takes place; the disabling of
10461 10468 * interrupts serializes the execution with any execution of dtrace_probe() on
10462 10469 * the same CPU.
10463 10470 */
10464 10471 static void
10465 10472 dtrace_buffer_switch(dtrace_buffer_t *buf)
10466 10473 {
10467 10474 caddr_t tomax = buf->dtb_tomax;
10468 10475 caddr_t xamot = buf->dtb_xamot;
10469 10476 dtrace_icookie_t cookie;
10470 10477 hrtime_t now = dtrace_gethrtime();
10471 10478
10472 10479 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
10473 10480 ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));
10474 10481
10475 10482 cookie = dtrace_interrupt_disable();
10476 10483 buf->dtb_tomax = xamot;
10477 10484 buf->dtb_xamot = tomax;
10478 10485 buf->dtb_xamot_drops = buf->dtb_drops;
10479 10486 buf->dtb_xamot_offset = buf->dtb_offset;
10480 10487 buf->dtb_xamot_errors = buf->dtb_errors;
10481 10488 buf->dtb_xamot_flags = buf->dtb_flags;
10482 10489 buf->dtb_offset = 0;
10483 10490 buf->dtb_drops = 0;
10484 10491 buf->dtb_errors = 0;
10485 10492 buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED);
10486 10493 buf->dtb_interval = now - buf->dtb_switched;
10487 10494 buf->dtb_switched = now;
10488 10495 dtrace_interrupt_enable(cookie);
10489 10496 }
10490 10497
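/*
 * A minimal sketch (names assumed) of how the switch above is driven
 * from non-probe context: the caller cross calls onto the target CPU
 * so that dtrace_buffer_switch() executes there, where its interrupt
 * disabling serializes it against any dtrace_probe() on that CPU.
 */
static void
dtrace_buffer_switch_sketch(dtrace_state_t *state, processorid_t cpu)
{
	dtrace_buffer_t *buf = &state->dts_buffer[cpu];

	ASSERT(MUTEX_HELD(&dtrace_lock));
	ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));

	dtrace_xcall(cpu, (dtrace_xcall_t)dtrace_buffer_switch, buf);
}
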
10491 10498 /*
10492 10499 * Note: called from cross call context. This function activates a buffer
10493 10500 * on a CPU. As with dtrace_buffer_switch(), the atomicity of the operation
10494 10501 * is guaranteed by the disabling of interrupts.
10495 10502 */
10496 10503 static void
10497 10504 dtrace_buffer_activate(dtrace_state_t *state)
10498 10505 {
10499 10506 dtrace_buffer_t *buf;
10500 10507 dtrace_icookie_t cookie = dtrace_interrupt_disable();
10501 10508
10502 10509 buf = &state->dts_buffer[CPU->cpu_id];
10503 10510
10504 10511 if (buf->dtb_tomax != NULL) {
10505 10512 /*
10506 10513 * We might like to assert that the buffer is marked inactive,
10507 10514 * but this isn't necessarily true: the buffer for the CPU
10508 10515 * that processes the BEGIN probe has its buffer activated
10509 10516 		 * manually. In this case, we take the (harmless) action
10510 10517 		 * of re-clearing the INACTIVE bit.
10511 10518 */
10512 10519 buf->dtb_flags &= ~DTRACEBUF_INACTIVE;
10513 10520 }
10514 10521
10515 10522 dtrace_interrupt_enable(cookie);
10516 10523 }
10517 10524
10518 10525 static int
10519 10526 dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t size, int flags,
10520 10527 processorid_t cpu, int *factor)
10521 10528 {
10522 10529 cpu_t *cp;
10523 10530 dtrace_buffer_t *buf;
10524 10531 int allocated = 0, desired = 0;
10525 10532
10526 10533 ASSERT(MUTEX_HELD(&cpu_lock));
10527 10534 ASSERT(MUTEX_HELD(&dtrace_lock));
10528 10535
10529 10536 *factor = 1;
10530 10537
10531 10538 if (size > dtrace_nonroot_maxsize &&
10532 10539 !PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE))
10533 10540 return (EFBIG);
10534 10541
10535 10542 cp = cpu_list;
10536 10543
10537 10544 do {
10538 10545 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
10539 10546 continue;
10540 10547
10541 10548 buf = &bufs[cp->cpu_id];
10542 10549
10543 10550 /*
10544 10551 * If there is already a buffer allocated for this CPU, it
10545 10552 * is only possible that this is a DR event. In this case,
10546 10553 * the buffer size must match our specified size.
10547 10554 */
10548 10555 if (buf->dtb_tomax != NULL) {
10549 10556 ASSERT(buf->dtb_size == size);
10550 10557 continue;
10551 10558 }
10552 10559
10553 10560 ASSERT(buf->dtb_xamot == NULL);
10554 10561
10555 10562 if ((buf->dtb_tomax = kmem_zalloc(size,
10556 10563 KM_NOSLEEP | KM_NORMALPRI)) == NULL)
10557 10564 goto err;
10558 10565
10559 10566 buf->dtb_size = size;
10560 10567 buf->dtb_flags = flags;
10561 10568 buf->dtb_offset = 0;
10562 10569 buf->dtb_drops = 0;
10563 10570
10564 10571 if (flags & DTRACEBUF_NOSWITCH)
10565 10572 continue;
10566 10573
10567 10574 if ((buf->dtb_xamot = kmem_zalloc(size,
10568 10575 KM_NOSLEEP | KM_NORMALPRI)) == NULL)
10569 10576 goto err;
10570 10577 } while ((cp = cp->cpu_next) != cpu_list);
10571 10578
10572 10579 return (0);
10573 10580
10574 10581 err:
10575 10582 cp = cpu_list;
10576 10583
10577 10584 do {
10578 10585 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
10579 10586 continue;
10580 10587
10581 10588 buf = &bufs[cp->cpu_id];
10582 10589 desired += 2;
10583 10590
10584 10591 if (buf->dtb_xamot != NULL) {
10585 10592 ASSERT(buf->dtb_tomax != NULL);
10586 10593 ASSERT(buf->dtb_size == size);
10587 10594 kmem_free(buf->dtb_xamot, size);
10588 10595 allocated++;
10589 10596 }
10590 10597
10591 10598 if (buf->dtb_tomax != NULL) {
10592 10599 ASSERT(buf->dtb_size == size);
10593 10600 kmem_free(buf->dtb_tomax, size);
10594 10601 allocated++;
10595 10602 }
10596 10603
10597 10604 buf->dtb_tomax = NULL;
10598 10605 buf->dtb_xamot = NULL;
10599 10606 buf->dtb_size = 0;
10600 10607 } while ((cp = cp->cpu_next) != cpu_list);
10601 10608
10602 10609 *factor = desired / (allocated > 0 ? allocated : 1);
10603 10610
10604 10611 return (ENOMEM);
10605 10612 }
10606 10613
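/*
 * A sketch (under assumed names and policy) of how a caller can use
 * the factor returned above on ENOMEM: shrink the per-CPU request
 * geometrically and retry until the allocation succeeds or the request
 * becomes too small to be useful.
 */
static int
dtrace_buffer_alloc_retry_sketch(dtrace_buffer_t *bufs, size_t size,
    int flags, processorid_t cpu)
{
	int err, factor;

	for (;;) {
		if ((err = dtrace_buffer_alloc(bufs, size, flags,
		    cpu, &factor)) != ENOMEM)
			return (err);

		/*
		 * Divide by the reported factor (halving at minimum);
		 * the one-page floor is an assumed cutoff.
		 */
		size /= (factor > 1 ? factor : 2);

		if (size < PAGESIZE)
			return (ENOMEM);
	}
}
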
10607 10614 /*
10608 10615 * Note: called from probe context. This function just increments the drop
10609 10616 * count on a buffer. It has been made a function to allow for the
10610 10617 * possibility of understanding the source of mysterious drop counts. (A
10611 10618 * problem for which one may be particularly disappointed that DTrace cannot
10612 10619 * be used to understand DTrace.)
10613 10620 */
10614 10621 static void
10615 10622 dtrace_buffer_drop(dtrace_buffer_t *buf)
10616 10623 {
10617 10624 buf->dtb_drops++;
10618 10625 }
10619 10626
10620 10627 /*
10621 10628 * Note: called from probe context. This function is called to reserve space
10622 10629 * in a buffer. If mstate is non-NULL, sets the scratch base and size in the
10623 10630 * mstate. Returns the new offset in the buffer, or a negative value if an
10624 10631 * error has occurred.
10625 10632 */
10626 10633 static intptr_t
10627 10634 dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
10628 10635 dtrace_state_t *state, dtrace_mstate_t *mstate)
10629 10636 {
10630 10637 intptr_t offs = buf->dtb_offset, soffs;
10631 10638 intptr_t woffs;
10632 10639 caddr_t tomax;
10633 10640 size_t total;
10634 10641
10635 10642 if (buf->dtb_flags & DTRACEBUF_INACTIVE)
10636 10643 return (-1);
10637 10644
10638 10645 if ((tomax = buf->dtb_tomax) == NULL) {
10639 10646 dtrace_buffer_drop(buf);
10640 10647 return (-1);
10641 10648 }
10642 10649
10643 10650 if (!(buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL))) {
10644 10651 while (offs & (align - 1)) {
10645 10652 /*
10646 10653 * Assert that our alignment is off by a number which
10647 10654 * is itself sizeof (uint32_t) aligned.
10648 10655 */
10649 10656 ASSERT(!((align - (offs & (align - 1))) &
10650 10657 (sizeof (uint32_t) - 1)));
10651 10658 DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
10652 10659 offs += sizeof (uint32_t);
10653 10660 }
10654 10661
10655 10662 if ((soffs = offs + needed) > buf->dtb_size) {
10656 10663 dtrace_buffer_drop(buf);
10657 10664 return (-1);
10658 10665 }
10659 10666
10660 10667 if (mstate == NULL)
10661 10668 return (offs);
10662 10669
10663 10670 mstate->dtms_scratch_base = (uintptr_t)tomax + soffs;
10664 10671 mstate->dtms_scratch_size = buf->dtb_size - soffs;
10665 10672 mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
10666 10673
10667 10674 return (offs);
10668 10675 }
10669 10676
10670 10677 if (buf->dtb_flags & DTRACEBUF_FILL) {
10671 10678 if (state->dts_activity != DTRACE_ACTIVITY_COOLDOWN &&
10672 10679 (buf->dtb_flags & DTRACEBUF_FULL))
10673 10680 return (-1);
10674 10681 goto out;
10675 10682 }
10676 10683
10677 10684 total = needed + (offs & (align - 1));
10678 10685
10679 10686 /*
10680 10687 * For a ring buffer, life is quite a bit more complicated. Before
10681 10688 * we can store any padding, we need to adjust our wrapping offset.
10682 10689 * (If we've never before wrapped or we're not about to, no adjustment
10683 10690 * is required.)
10684 10691 */
10685 10692 if ((buf->dtb_flags & DTRACEBUF_WRAPPED) ||
10686 10693 offs + total > buf->dtb_size) {
10687 10694 woffs = buf->dtb_xamot_offset;
10688 10695
10689 10696 if (offs + total > buf->dtb_size) {
10690 10697 /*
10691 10698 * We can't fit in the end of the buffer. First, a
10692 10699 * sanity check that we can fit in the buffer at all.
10693 10700 */
10694 10701 if (total > buf->dtb_size) {
10695 10702 dtrace_buffer_drop(buf);
10696 10703 return (-1);
10697 10704 }
10698 10705
10699 10706 /*
10700 10707 * We're going to be storing at the top of the buffer,
10701 10708 * so now we need to deal with the wrapped offset. We
10702 10709 * only reset our wrapped offset to 0 if it is
10703 10710 * currently greater than the current offset. If it
10704 10711 * is less than the current offset, it is because a
10705 10712 * previous allocation induced a wrap -- but the
10706 10713 * allocation didn't subsequently take the space due
10707 10714 * to an error or false predicate evaluation. In this
10708 10715 * case, we'll just leave the wrapped offset alone: if
10709 10716 * the wrapped offset hasn't been advanced far enough
10710 10717 * for this allocation, it will be adjusted in the
10711 10718 * lower loop.
10712 10719 */
10713 10720 if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
10714 10721 if (woffs >= offs)
10715 10722 woffs = 0;
10716 10723 } else {
10717 10724 woffs = 0;
10718 10725 }
10719 10726
10720 10727 /*
10721 10728 * Now we know that we're going to be storing to the
10722 10729 * top of the buffer and that there is room for us
10723 10730 * there. We need to clear the buffer from the current
10724 10731 * offset to the end (there may be old gunk there).
10725 10732 */
10726 10733 while (offs < buf->dtb_size)
10727 10734 tomax[offs++] = 0;
10728 10735
10729 10736 /*
10730 10737 * We need to set our offset to zero. And because we
10731 10738 * are wrapping, we need to set the bit indicating as
10732 10739 * much. We can also adjust our needed space back
10733 10740 * down to the space required by the ECB -- we know
10734 10741 * that the top of the buffer is aligned.
10735 10742 */
10736 10743 offs = 0;
10737 10744 total = needed;
10738 10745 buf->dtb_flags |= DTRACEBUF_WRAPPED;
10739 10746 } else {
10740 10747 /*
10741 10748 * There is room for us in the buffer, so we simply
10742 10749 * need to check the wrapped offset.
10743 10750 */
10744 10751 if (woffs < offs) {
10745 10752 /*
10746 10753 * The wrapped offset is less than the offset.
10747 10754 * This can happen if we allocated buffer space
10748 10755 * that induced a wrap, but then we didn't
10749 10756 * subsequently take the space due to an error
10750 10757 * or false predicate evaluation. This is
10751 10758 * okay; we know that _this_ allocation isn't
10752 10759 * going to induce a wrap. We still can't
10753 10760 * reset the wrapped offset to be zero,
10754 10761 * however: the space may have been trashed in
10755 10762 * the previous failed probe attempt. But at
10756 10763 * least the wrapped offset doesn't need to
10757 10764 * be adjusted at all...
10758 10765 */
10759 10766 goto out;
10760 10767 }
10761 10768 }
10762 10769
10763 10770 while (offs + total > woffs) {
10764 10771 dtrace_epid_t epid = *(uint32_t *)(tomax + woffs);
10765 10772 size_t size;
10766 10773
10767 10774 if (epid == DTRACE_EPIDNONE) {
10768 10775 size = sizeof (uint32_t);
10769 10776 } else {
10770 10777 ASSERT(epid <= state->dts_necbs);
10771 10778 ASSERT(state->dts_ecbs[epid - 1] != NULL);
10772 10779
10773 10780 size = state->dts_ecbs[epid - 1]->dte_size;
10774 10781 }
10775 10782
10776 10783 ASSERT(woffs + size <= buf->dtb_size);
10777 10784 ASSERT(size != 0);
10778 10785
10779 10786 if (woffs + size == buf->dtb_size) {
10780 10787 /*
10781 10788 * We've reached the end of the buffer; we want
10782 10789 * to set the wrapped offset to 0 and break
10783 10790 * out. However, if the offs is 0, then we're
10784 10791 * in a strange edge-condition: the amount of
10785 10792 * space that we want to reserve plus the size
10786 10793 * of the record that we're overwriting is
10787 10794 * greater than the size of the buffer. This
10788 10795 * is problematic because if we reserve the
10789 10796 * space but subsequently don't consume it (due
10790 10797 * to a failed predicate or error) the wrapped
10791 10798 * offset will be 0 -- yet the EPID at offset 0
10792 10799 * will not be committed. This situation is
10793 10800 * relatively easy to deal with: if we're in
10794 10801 * this case, the buffer is indistinguishable
10795 10802 * from one that hasn't wrapped; we need only
10796 10803 * finish the job by clearing the wrapped bit,
10797 10804 * explicitly setting the offset to be 0, and
10798 10805 * zero'ing out the old data in the buffer.
10799 10806 */
10800 10807 if (offs == 0) {
10801 10808 buf->dtb_flags &= ~DTRACEBUF_WRAPPED;
10802 10809 buf->dtb_offset = 0;
10803 10810 woffs = total;
10804 10811
10805 10812 while (woffs < buf->dtb_size)
10806 10813 tomax[woffs++] = 0;
10807 10814 }
10808 10815
10809 10816 woffs = 0;
10810 10817 break;
10811 10818 }
10812 10819
10813 10820 woffs += size;
10814 10821 }
10815 10822
10816 10823 /*
10817 10824 * We have a wrapped offset. It may be that the wrapped offset
10818 10825 * has become zero -- that's okay.
10819 10826 */
10820 10827 buf->dtb_xamot_offset = woffs;
10821 10828 }
10822 10829
10823 10830 out:
10824 10831 /*
10825 10832 * Now we can plow the buffer with any necessary padding.
10826 10833 */
10827 10834 while (offs & (align - 1)) {
10828 10835 /*
10829 10836 * Assert that our alignment is off by a number which
10830 10837 * is itself sizeof (uint32_t) aligned.
10831 10838 */
10832 10839 ASSERT(!((align - (offs & (align - 1))) &
10833 10840 (sizeof (uint32_t) - 1)));
10834 10841 DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
10835 10842 offs += sizeof (uint32_t);
10836 10843 }
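
	/*
	 * Worked example of the padding above: with offs = 20, align = 8
	 * and needed = 12, one 32-bit DTRACE_EPIDNONE pad is stored at
	 * offset 20 (bringing offs to 24) and the record then occupies
	 * [24, 36).  Consumers skip DTRACE_EPIDNONE words as they walk
	 * the buffer, so padding is never mistaken for a record.
	 */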
10837 10844
10838 10845 if (buf->dtb_flags & DTRACEBUF_FILL) {
10839 10846 if (offs + needed > buf->dtb_size - state->dts_reserve) {
10840 10847 buf->dtb_flags |= DTRACEBUF_FULL;
10841 10848 return (-1);
10842 10849 }
10843 10850 }
10844 10851
10845 10852 if (mstate == NULL)
10846 10853 return (offs);
10847 10854
10848 10855 /*
10849 10856 * For ring buffers and fill buffers, the scratch space is always
10850 10857 * the inactive buffer.
10851 10858 */
10852 10859 mstate->dtms_scratch_base = (uintptr_t)buf->dtb_xamot;
10853 10860 mstate->dtms_scratch_size = buf->dtb_size;
10854 10861 mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
10855 10862
10856 10863 return (offs);
10857 10864 }
10858 10865
10859 10866 static void
10860 10867 dtrace_buffer_polish(dtrace_buffer_t *buf)
10861 10868 {
10862 10869 ASSERT(buf->dtb_flags & DTRACEBUF_RING);
10863 10870 ASSERT(MUTEX_HELD(&dtrace_lock));
10864 10871
10865 10872 if (!(buf->dtb_flags & DTRACEBUF_WRAPPED))
10866 10873 return;
10867 10874
10868 10875 /*
10869 10876 * We need to polish the ring buffer. There are three cases:
10870 10877 *
10871 10878 * - The first (and presumably most common) is that there is no gap
10872 10879 * between the buffer offset and the wrapped offset. In this case,
10873 10880 * there is nothing in the buffer that isn't valid data; we can
10874 10881 * mark the buffer as polished and return.
10875 10882 *
10876 10883 * - The second (less common than the first but still more common
10877 10884 * than the third) is that there is a gap between the buffer offset
10878 10885 * and the wrapped offset, and the wrapped offset is larger than the
10879 10886 * buffer offset. This can happen because of an alignment issue, or
10880 10887 * can happen because of a call to dtrace_buffer_reserve() that
10881 10888 * didn't subsequently consume the buffer space. In this case,
10882 10889 * we need to zero the data from the buffer offset to the wrapped
10883 10890 * offset.
10884 10891 *
10885 10892 * - The third (and least common) is that there is a gap between the
10886 10893 * buffer offset and the wrapped offset, but the wrapped offset is
10887 10894 * _less_ than the buffer offset. This can only happen because a
10888 10895 * call to dtrace_buffer_reserve() induced a wrap, but the space
10889 10896 * was not subsequently consumed. In this case, we need to zero the
10890 10897 * space from the offset to the end of the buffer _and_ from the
10891 10898 * top of the buffer to the wrapped offset.
10892 10899 */
10893 10900 if (buf->dtb_offset < buf->dtb_xamot_offset) {
10894 10901 bzero(buf->dtb_tomax + buf->dtb_offset,
10895 10902 buf->dtb_xamot_offset - buf->dtb_offset);
10896 10903 }
10897 10904
10898 10905 if (buf->dtb_offset > buf->dtb_xamot_offset) {
10899 10906 bzero(buf->dtb_tomax + buf->dtb_offset,
10900 10907 buf->dtb_size - buf->dtb_offset);
10901 10908 bzero(buf->dtb_tomax, buf->dtb_xamot_offset);
10902 10909 }
10903 10910 }
10904 10911
10905 10912 /*
10906 10913 * This routine determines if data generated at the specified time has likely
10907 10914 * been entirely consumed at user-level. This routine is called to determine
10908 10915 * if an ECB on a defunct probe (but for an active enabling) can be safely
10909 10916 * disabled and destroyed.
10910 10917 */
10911 10918 static int
10912 10919 dtrace_buffer_consumed(dtrace_buffer_t *bufs, hrtime_t when)
10913 10920 {
10914 10921 int i;
10915 10922
10916 10923 for (i = 0; i < NCPU; i++) {
10917 10924 dtrace_buffer_t *buf = &bufs[i];
10918 10925
10919 10926 if (buf->dtb_size == 0)
10920 10927 continue;
10921 10928
10922 10929 if (buf->dtb_flags & DTRACEBUF_RING)
10923 10930 return (0);
10924 10931
10925 10932 if (!buf->dtb_switched && buf->dtb_offset != 0)
10926 10933 return (0);
10927 10934
10928 10935 if (buf->dtb_switched - buf->dtb_interval < when)
10929 10936 return (0);
10930 10937 }
10931 10938
10932 10939 return (1);
10933 10940 }
10934 10941
10935 10942 static void
10936 10943 dtrace_buffer_free(dtrace_buffer_t *bufs)
10937 10944 {
10938 10945 int i;
10939 10946
10940 10947 for (i = 0; i < NCPU; i++) {
10941 10948 dtrace_buffer_t *buf = &bufs[i];
10942 10949
10943 10950 if (buf->dtb_tomax == NULL) {
10944 10951 ASSERT(buf->dtb_xamot == NULL);
10945 10952 ASSERT(buf->dtb_size == 0);
10946 10953 continue;
10947 10954 }
10948 10955
10949 10956 if (buf->dtb_xamot != NULL) {
10950 10957 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
10951 10958 kmem_free(buf->dtb_xamot, buf->dtb_size);
10952 10959 }
10953 10960
10954 10961 kmem_free(buf->dtb_tomax, buf->dtb_size);
10955 10962 buf->dtb_size = 0;
10956 10963 buf->dtb_tomax = NULL;
10957 10964 buf->dtb_xamot = NULL;
10958 10965 }
10959 10966 }
10960 10967
10961 10968 /*
10962 10969 * DTrace Enabling Functions
10963 10970 */
10964 10971 static dtrace_enabling_t *
10965 10972 dtrace_enabling_create(dtrace_vstate_t *vstate)
10966 10973 {
10967 10974 dtrace_enabling_t *enab;
10968 10975
10969 10976 enab = kmem_zalloc(sizeof (dtrace_enabling_t), KM_SLEEP);
10970 10977 enab->dten_vstate = vstate;
10971 10978
10972 10979 return (enab);
10973 10980 }
10974 10981
10975 10982 static void
10976 10983 dtrace_enabling_add(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb)
10977 10984 {
10978 10985 dtrace_ecbdesc_t **ndesc;
10979 10986 size_t osize, nsize;
10980 10987
10981 10988 /*
10982 10989 * We can't add to enablings after we've enabled them, or after we've
10983 10990 * retained them.
10984 10991 */
10985 10992 ASSERT(enab->dten_probegen == 0);
10986 10993 ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
10987 10994
10988 10995 if (enab->dten_ndesc < enab->dten_maxdesc) {
10989 10996 enab->dten_desc[enab->dten_ndesc++] = ecb;
10990 10997 return;
10991 10998 }
10992 10999
10993 11000 	osize = enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *);
10994 11001
10995 11002 if (enab->dten_maxdesc == 0) {
10996 11003 enab->dten_maxdesc = 1;
10997 11004 } else {
10998 11005 enab->dten_maxdesc <<= 1;
10999 11006 }
11000 11007
11001 11008 ASSERT(enab->dten_ndesc < enab->dten_maxdesc);
11002 11009
11003 11010 	nsize = enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *);
11004 11011 ndesc = kmem_zalloc(nsize, KM_SLEEP);
11005 11012 bcopy(enab->dten_desc, ndesc, osize);
11006 11013 kmem_free(enab->dten_desc, osize);
11007 11014
11008 11015 enab->dten_desc = ndesc;
11009 11016 enab->dten_desc[enab->dten_ndesc++] = ecb;
11010 11017 }
11011 11018
11012 11019 static void
11013 11020 dtrace_enabling_addlike(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb,
11014 11021 dtrace_probedesc_t *pd)
11015 11022 {
11016 11023 dtrace_ecbdesc_t *new;
11017 11024 dtrace_predicate_t *pred;
11018 11025 dtrace_actdesc_t *act;
11019 11026
11020 11027 /*
11021 11028 * We're going to create a new ECB description that matches the
11022 11029 * specified ECB in every way, but has the specified probe description.
11023 11030 */
11024 11031 new = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
11025 11032
11026 11033 if ((pred = ecb->dted_pred.dtpdd_predicate) != NULL)
11027 11034 dtrace_predicate_hold(pred);
11028 11035
11029 11036 for (act = ecb->dted_action; act != NULL; act = act->dtad_next)
11030 11037 dtrace_actdesc_hold(act);
11031 11038
11032 11039 new->dted_action = ecb->dted_action;
11033 11040 new->dted_pred = ecb->dted_pred;
11034 11041 new->dted_probe = *pd;
11035 11042 new->dted_uarg = ecb->dted_uarg;
11036 11043
11037 11044 dtrace_enabling_add(enab, new);
11038 11045 }
11039 11046
11040 11047 static void
11041 11048 dtrace_enabling_dump(dtrace_enabling_t *enab)
11042 11049 {
11043 11050 int i;
11044 11051
11045 11052 for (i = 0; i < enab->dten_ndesc; i++) {
11046 11053 dtrace_probedesc_t *desc = &enab->dten_desc[i]->dted_probe;
11047 11054
11048 11055 cmn_err(CE_NOTE, "enabling probe %d (%s:%s:%s:%s)", i,
11049 11056 desc->dtpd_provider, desc->dtpd_mod,
11050 11057 desc->dtpd_func, desc->dtpd_name);
11051 11058 }
11052 11059 }
11053 11060
11054 11061 static void
11055 11062 dtrace_enabling_destroy(dtrace_enabling_t *enab)
11056 11063 {
11057 11064 int i;
11058 11065 dtrace_ecbdesc_t *ep;
11059 11066 dtrace_vstate_t *vstate = enab->dten_vstate;
11060 11067
11061 11068 ASSERT(MUTEX_HELD(&dtrace_lock));
11062 11069
11063 11070 for (i = 0; i < enab->dten_ndesc; i++) {
11064 11071 dtrace_actdesc_t *act, *next;
11065 11072 dtrace_predicate_t *pred;
11066 11073
11067 11074 ep = enab->dten_desc[i];
11068 11075
11069 11076 if ((pred = ep->dted_pred.dtpdd_predicate) != NULL)
11070 11077 dtrace_predicate_release(pred, vstate);
11071 11078
11072 11079 for (act = ep->dted_action; act != NULL; act = next) {
11073 11080 next = act->dtad_next;
11074 11081 dtrace_actdesc_release(act, vstate);
11075 11082 }
11076 11083
11077 11084 kmem_free(ep, sizeof (dtrace_ecbdesc_t));
11078 11085 }
11079 11086
11080 11087 kmem_free(enab->dten_desc,
11081 11088 	    enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *));
11082 11089
11083 11090 /*
11084 11091 * If this was a retained enabling, decrement the dts_nretained count
11085 11092 * and take it off of the dtrace_retained list.
11086 11093 */
11087 11094 if (enab->dten_prev != NULL || enab->dten_next != NULL ||
11088 11095 dtrace_retained == enab) {
11089 11096 ASSERT(enab->dten_vstate->dtvs_state != NULL);
11090 11097 ASSERT(enab->dten_vstate->dtvs_state->dts_nretained > 0);
11091 11098 enab->dten_vstate->dtvs_state->dts_nretained--;
11092 11099 dtrace_retained_gen++;
11093 11100 }
11094 11101
11095 11102 if (enab->dten_prev == NULL) {
11096 11103 if (dtrace_retained == enab) {
11097 11104 dtrace_retained = enab->dten_next;
11098 11105
11099 11106 if (dtrace_retained != NULL)
11100 11107 dtrace_retained->dten_prev = NULL;
11101 11108 }
11102 11109 } else {
11103 11110 ASSERT(enab != dtrace_retained);
11104 11111 ASSERT(dtrace_retained != NULL);
11105 11112 enab->dten_prev->dten_next = enab->dten_next;
11106 11113 }
11107 11114
11108 11115 if (enab->dten_next != NULL) {
11109 11116 ASSERT(dtrace_retained != NULL);
11110 11117 enab->dten_next->dten_prev = enab->dten_prev;
11111 11118 }
11112 11119
11113 11120 kmem_free(enab, sizeof (dtrace_enabling_t));
11114 11121 }
11115 11122
11116 11123 static int
11117 11124 dtrace_enabling_retain(dtrace_enabling_t *enab)
11118 11125 {
11119 11126 dtrace_state_t *state;
11120 11127
11121 11128 ASSERT(MUTEX_HELD(&dtrace_lock));
11122 11129 ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
11123 11130 ASSERT(enab->dten_vstate != NULL);
11124 11131
11125 11132 state = enab->dten_vstate->dtvs_state;
11126 11133 ASSERT(state != NULL);
11127 11134
11128 11135 /*
11129 11136 * We only allow each state to retain dtrace_retain_max enablings.
11130 11137 */
11131 11138 if (state->dts_nretained >= dtrace_retain_max)
11132 11139 return (ENOSPC);
11133 11140
11134 11141 state->dts_nretained++;
11135 11142 dtrace_retained_gen++;
11136 11143
11137 11144 if (dtrace_retained == NULL) {
11138 11145 dtrace_retained = enab;
11139 11146 return (0);
11140 11147 }
11141 11148
11142 11149 enab->dten_next = dtrace_retained;
11143 11150 dtrace_retained->dten_prev = enab;
11144 11151 dtrace_retained = enab;
11145 11152
11146 11153 return (0);
11147 11154 }
11148 11155
11149 11156 static int
11150 11157 dtrace_enabling_replicate(dtrace_state_t *state, dtrace_probedesc_t *match,
11151 11158 dtrace_probedesc_t *create)
11152 11159 {
11153 11160 dtrace_enabling_t *new, *enab;
11154 11161 int found = 0, err = ENOENT;
11155 11162
11156 11163 ASSERT(MUTEX_HELD(&dtrace_lock));
11157 11164 ASSERT(strlen(match->dtpd_provider) < DTRACE_PROVNAMELEN);
11158 11165 ASSERT(strlen(match->dtpd_mod) < DTRACE_MODNAMELEN);
11159 11166 ASSERT(strlen(match->dtpd_func) < DTRACE_FUNCNAMELEN);
11160 11167 ASSERT(strlen(match->dtpd_name) < DTRACE_NAMELEN);
11161 11168
11162 11169 new = dtrace_enabling_create(&state->dts_vstate);
11163 11170
11164 11171 /*
11165 11172 * Iterate over all retained enablings, looking for enablings that
11166 11173 * match the specified state.
11167 11174 */
11168 11175 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
11169 11176 int i;
11170 11177
11171 11178 /*
11172 11179 * dtvs_state can only be NULL for helper enablings -- and
11173 11180 * helper enablings can't be retained.
11174 11181 */
11175 11182 ASSERT(enab->dten_vstate->dtvs_state != NULL);
11176 11183
11177 11184 if (enab->dten_vstate->dtvs_state != state)
11178 11185 continue;
11179 11186
11180 11187 /*
11181 11188 * Now iterate over each probe description; we're looking for
11182 11189 * an exact match to the specified probe description.
11183 11190 */
11184 11191 for (i = 0; i < enab->dten_ndesc; i++) {
11185 11192 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
11186 11193 dtrace_probedesc_t *pd = &ep->dted_probe;
11187 11194
11188 11195 if (strcmp(pd->dtpd_provider, match->dtpd_provider))
11189 11196 continue;
11190 11197
11191 11198 if (strcmp(pd->dtpd_mod, match->dtpd_mod))
11192 11199 continue;
11193 11200
11194 11201 if (strcmp(pd->dtpd_func, match->dtpd_func))
11195 11202 continue;
11196 11203
11197 11204 if (strcmp(pd->dtpd_name, match->dtpd_name))
11198 11205 continue;
11199 11206
11200 11207 /*
11201 11208 * We have a winning probe! Add it to our growing
11202 11209 * enabling.
11203 11210 */
11204 11211 found = 1;
11205 11212 dtrace_enabling_addlike(new, ep, create);
11206 11213 }
11207 11214 }
11208 11215
11209 11216 if (!found || (err = dtrace_enabling_retain(new)) != 0) {
11210 11217 dtrace_enabling_destroy(new);
11211 11218 return (err);
11212 11219 }
11213 11220
11214 11221 return (0);
11215 11222 }
11216 11223
11217 11224 static void
11218 11225 dtrace_enabling_retract(dtrace_state_t *state)
11219 11226 {
11220 11227 dtrace_enabling_t *enab, *next;
11221 11228
11222 11229 ASSERT(MUTEX_HELD(&dtrace_lock));
11223 11230
11224 11231 /*
11225 11232 * Iterate over all retained enablings, destroy the enablings retained
11226 11233 * for the specified state.
11227 11234 */
11228 11235 for (enab = dtrace_retained; enab != NULL; enab = next) {
11229 11236 next = enab->dten_next;
11230 11237
11231 11238 /*
11232 11239 * dtvs_state can only be NULL for helper enablings -- and
11233 11240 * helper enablings can't be retained.
11234 11241 */
11235 11242 ASSERT(enab->dten_vstate->dtvs_state != NULL);
11236 11243
11237 11244 if (enab->dten_vstate->dtvs_state == state) {
11238 11245 ASSERT(state->dts_nretained > 0);
11239 11246 dtrace_enabling_destroy(enab);
11240 11247 }
11241 11248 }
11242 11249
11243 11250 ASSERT(state->dts_nretained == 0);
11244 11251 }
11245 11252
11246 11253 static int
11247 11254 dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched)
11248 11255 {
11249 11256 int i = 0;
11250 11257 int total_matched = 0, matched = 0;
11251 11258
11252 11259 ASSERT(MUTEX_HELD(&cpu_lock));
11253 11260 ASSERT(MUTEX_HELD(&dtrace_lock));
11254 11261
11255 11262 for (i = 0; i < enab->dten_ndesc; i++) {
11256 11263 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
11257 11264
11258 11265 enab->dten_current = ep;
11259 11266 enab->dten_error = 0;
11260 11267
11261 11268 /*
11262 11269 * If a provider failed to enable a probe then get out and
11263 11270 * let the consumer know we failed.
11264 11271 */
11265 11272 if ((matched = dtrace_probe_enable(&ep->dted_probe, enab)) < 0)
11266 11273 return (EBUSY);
11267 11274
11268 11275 total_matched += matched;
11269 11276
11270 11277 if (enab->dten_error != 0) {
11271 11278 /*
11272 11279 * If we get an error half-way through enabling the
11273 11280 * probes, we kick out -- perhaps with some number of
11274 11281 * them enabled. Leaving enabled probes enabled may
11275 11282 * be slightly confusing for user-level, but we expect
11276 11283 * that no one will attempt to actually drive on in
11277 11284 * the face of such errors. If this is an anonymous
11278 11285 * enabling (indicated with a NULL nmatched pointer),
11279 11286 * we cmn_err() a message. We aren't expecting to
11280 11287 			 * get such an error -- to the extent that such an
11281 11288 			 * error can exist at all, it would be the result of
11282 11289 			 * corrupted DOF in the driver properties.
11283 11290 */
11284 11291 if (nmatched == NULL) {
11285 11292 cmn_err(CE_WARN, "dtrace_enabling_match() "
11286 11293 "error on %p: %d", (void *)ep,
11287 11294 enab->dten_error);
11288 11295 }
11289 11296
11290 11297 return (enab->dten_error);
11291 11298 }
11292 11299 }
11293 11300
11294 11301 enab->dten_probegen = dtrace_probegen;
11295 11302 if (nmatched != NULL)
11296 11303 *nmatched = total_matched;
11297 11304
11298 11305 return (0);
11299 11306 }
11300 11307
11301 11308 static void
11302 11309 dtrace_enabling_matchall(void)
11303 11310 {
11304 11311 dtrace_enabling_t *enab;
11305 11312
11306 11313 mutex_enter(&cpu_lock);
11307 11314 mutex_enter(&dtrace_lock);
11308 11315
11309 11316 /*
11310 11317 * Iterate over all retained enablings to see if any probes match
11311 11318 * against them. We only perform this operation on enablings for which
11312 11319 * we have sufficient permissions by virtue of being in the global zone
11313 11320 * or in the same zone as the DTrace client. Because we can be called
11314 11321 * after dtrace_detach() has been called, we cannot assert that there
11315 11322 * are retained enablings. We can safely load from dtrace_retained,
11316 11323 * however: the taskq_destroy() at the end of dtrace_detach() will
11317 11324 * block pending our completion.
11318 11325 */
11319 11326 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
11320 11327 dtrace_cred_t *dcr = &enab->dten_vstate->dtvs_state->dts_cred;
11321 11328 cred_t *cr = dcr->dcr_cred;
11322 11329 zoneid_t zone = cr != NULL ? crgetzoneid(cr) : 0;
11323 11330
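		/*
		 * For example: the enablings of a client in a non-global
		 * zone (lacking DTRACE_CRV_ALLZONE) are only matched when
		 * this is called from that same zone, while a global-zone
		 * client's enablings are always matched.
		 */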
11324 11331 if ((dcr->dcr_visible & DTRACE_CRV_ALLZONE) || (cr != NULL &&
11325 11332 (zone == GLOBAL_ZONEID || getzoneid() == zone)))
11326 11333 (void) dtrace_enabling_match(enab, NULL);
11327 11334 }
11328 11335
11329 11336 mutex_exit(&dtrace_lock);
11330 11337 mutex_exit(&cpu_lock);
11331 11338 }
11332 11339
11333 11340 /*
11334 11341 * If an enabling is to be enabled without having matched probes (that is, if
11335 11342 * dtrace_state_go() is to be called on the underlying dtrace_state_t), the
11336 11343 * enabling must be _primed_ by creating an ECB for every ECB description.
11337 11344 * This must be done to assure that we know the number of speculations, the
11338 11345 * number of aggregations, the minimum buffer size needed, etc. before we
11339 11346 * transition out of DTRACE_ACTIVITY_INACTIVE. To do this without actually
11340 11347  * enabling any probes, we create ECBs for every ECB description, but with a
11341 11348 * NULL probe -- which is exactly what this function does.
11342 11349 */
11343 11350 static void
11344 11351 dtrace_enabling_prime(dtrace_state_t *state)
11345 11352 {
11346 11353 dtrace_enabling_t *enab;
11347 11354 int i;
11348 11355
11349 11356 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
11350 11357 ASSERT(enab->dten_vstate->dtvs_state != NULL);
11351 11358
11352 11359 if (enab->dten_vstate->dtvs_state != state)
11353 11360 continue;
11354 11361
11355 11362 /*
11356 11363 * We don't want to prime an enabling more than once, lest
11357 11364 * we allow a malicious user to induce resource exhaustion.
11358 11365 * (The ECBs that result from priming an enabling aren't
11359 11366 * leaked -- but they also aren't deallocated until the
11360 11367 * consumer state is destroyed.)
11361 11368 */
11362 11369 if (enab->dten_primed)
11363 11370 continue;
11364 11371
11365 11372 for (i = 0; i < enab->dten_ndesc; i++) {
11366 11373 enab->dten_current = enab->dten_desc[i];
11367 11374 (void) dtrace_probe_enable(NULL, enab);
11368 11375 }
11369 11376
11370 11377 enab->dten_primed = 1;
11371 11378 }
11372 11379 }
11373 11380
11374 11381 /*
11375 11382 * Called to indicate that probes should be provided due to retained
11376 11383 * enablings. This is implemented in terms of dtrace_probe_provide(), but it
11377 11384 * must take an initial lap through the enabling calling the dtps_provide()
11378 11385 * entry point explicitly to allow for autocreated probes.
11379 11386 */
11380 11387 static void
11381 11388 dtrace_enabling_provide(dtrace_provider_t *prv)
11382 11389 {
11383 11390 int i, all = 0;
11384 11391 dtrace_probedesc_t desc;
11385 11392 dtrace_genid_t gen;
11386 11393
11387 11394 ASSERT(MUTEX_HELD(&dtrace_lock));
11388 11395 ASSERT(MUTEX_HELD(&dtrace_provider_lock));
11389 11396
11390 11397 if (prv == NULL) {
11391 11398 all = 1;
11392 11399 prv = dtrace_provider;
11393 11400 }
11394 11401
11395 11402 do {
11396 11403 dtrace_enabling_t *enab;
11397 11404 void *parg = prv->dtpv_arg;
11398 11405
11399 11406 retry:
11400 11407 gen = dtrace_retained_gen;
11401 11408 for (enab = dtrace_retained; enab != NULL;
11402 11409 enab = enab->dten_next) {
11403 11410 for (i = 0; i < enab->dten_ndesc; i++) {
11404 11411 desc = enab->dten_desc[i]->dted_probe;
11405 11412 mutex_exit(&dtrace_lock);
11406 11413 prv->dtpv_pops.dtps_provide(parg, &desc);
11407 11414 mutex_enter(&dtrace_lock);
11408 11415 /*
11409 11416 * Process the retained enablings again if
11410 11417 * they have changed while we weren't holding
11411 11418 * dtrace_lock.
11412 11419 */
11413 11420 if (gen != dtrace_retained_gen)
11414 11421 goto retry;
11415 11422 }
11416 11423 }
11417 11424 } while (all && (prv = prv->dtpv_next) != NULL);
11418 11425
11419 11426 mutex_exit(&dtrace_lock);
11420 11427 dtrace_probe_provide(NULL, all ? NULL : prv);
11421 11428 mutex_enter(&dtrace_lock);
11422 11429 }
11423 11430
11424 11431 /*
11425 11432 * Called to reap ECBs that are attached to probes from defunct providers.
11426 11433 */
11427 11434 static void
11428 11435 dtrace_enabling_reap(void)
11429 11436 {
11430 11437 dtrace_provider_t *prov;
11431 11438 dtrace_probe_t *probe;
11432 11439 dtrace_ecb_t *ecb;
11433 11440 hrtime_t when;
11434 11441 int i;
11435 11442
11436 11443 mutex_enter(&cpu_lock);
11437 11444 mutex_enter(&dtrace_lock);
11438 11445
11439 11446 for (i = 0; i < dtrace_nprobes; i++) {
11440 11447 if ((probe = dtrace_probes[i]) == NULL)
11441 11448 continue;
11442 11449
11443 11450 if (probe->dtpr_ecb == NULL)
11444 11451 continue;
11445 11452
11446 11453 prov = probe->dtpr_provider;
11447 11454
11448 11455 if ((when = prov->dtpv_defunct) == 0)
11449 11456 continue;
11450 11457
11451 11458 /*
11452 11459 * We have ECBs on a defunct provider: we want to reap these
11453 11460 * ECBs to allow the provider to unregister. The destruction
11454 11461 * of these ECBs must be done carefully: if we destroy the ECB
11455 11462 * and the consumer later wishes to consume an EPID that
11456 11463 * corresponds to the destroyed ECB (and if the EPID metadata
11457 11464 * has not been previously consumed), the consumer will abort
11458 11465 * processing on the unknown EPID. To reduce (but not, sadly,
11459 11466 * eliminate) the possibility of this, we will only destroy an
11460 11467 * ECB for a defunct provider if, for the state that
11461 11468 * corresponds to the ECB:
11462 11469 *
11463 11470 * (a) There is no speculative tracing (which can effectively
11464 11471 * cache an EPID for an arbitrary amount of time).
11465 11472 *
11466 11473 * (b) The principal buffers have been switched twice since the
11467 11474 * provider became defunct.
11468 11475 *
11469 11476 * (c) The aggregation buffers are of zero size or have been
11470 11477 * switched twice since the provider became defunct.
11471 11478 *
11472 11479 * We use dts_speculates to determine (a) and call a function
11473 11480 * (dtrace_buffer_consumed()) to determine (b) and (c). Note
11474 11481 * that as soon as we've been unable to destroy one of the ECBs
11475 11482 * associated with the probe, we quit trying -- reaping is only
11476 11483 * fruitful in as much as we can destroy all ECBs associated
11477 11484 * with the defunct provider's probes.
11478 11485 */
11479 11486 while ((ecb = probe->dtpr_ecb) != NULL) {
11480 11487 dtrace_state_t *state = ecb->dte_state;
11481 11488 dtrace_buffer_t *buf = state->dts_buffer;
11482 11489 dtrace_buffer_t *aggbuf = state->dts_aggbuffer;
11483 11490
11484 11491 if (state->dts_speculates)
11485 11492 break;
11486 11493
11487 11494 if (!dtrace_buffer_consumed(buf, when))
11488 11495 break;
11489 11496
11490 11497 if (!dtrace_buffer_consumed(aggbuf, when))
11491 11498 break;
11492 11499
11493 11500 dtrace_ecb_disable(ecb);
11494 11501 ASSERT(probe->dtpr_ecb != ecb);
11495 11502 dtrace_ecb_destroy(ecb);
11496 11503 }
11497 11504 }
11498 11505
11499 11506 mutex_exit(&dtrace_lock);
11500 11507 mutex_exit(&cpu_lock);
11501 11508 }
11502 11509
11503 11510 /*
11504 11511 * DTrace DOF Functions
11505 11512 */
11506 11513 /*ARGSUSED*/
11507 11514 static void
11508 11515 dtrace_dof_error(dof_hdr_t *dof, const char *str)
11509 11516 {
11510 11517 if (dtrace_err_verbose)
11511 11518 cmn_err(CE_WARN, "failed to process DOF: %s", str);
11512 11519
11513 11520 #ifdef DTRACE_ERRDEBUG
11514 11521 dtrace_errdebug(str);
11515 11522 #endif
11516 11523 }
11517 11524
11518 11525 /*
11519 11526 * Create DOF out of a currently enabled state. Right now, we only create
11520 11527 * DOF containing the run-time options -- but this could be expanded to create
11521 11528 * complete DOF representing the enabled state.
11522 11529 */
11523 11530 static dof_hdr_t *
11524 11531 dtrace_dof_create(dtrace_state_t *state)
11525 11532 {
11526 11533 dof_hdr_t *dof;
11527 11534 dof_sec_t *sec;
11528 11535 dof_optdesc_t *opt;
11529 11536 int i, len = sizeof (dof_hdr_t) +
11530 11537 roundup(sizeof (dof_sec_t), sizeof (uint64_t)) +
11531 11538 sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
11532 11539
11533 11540 ASSERT(MUTEX_HELD(&dtrace_lock));
11534 11541
11535 11542 dof = kmem_zalloc(len, KM_SLEEP);
11536 11543 dof->dofh_ident[DOF_ID_MAG0] = DOF_MAG_MAG0;
11537 11544 dof->dofh_ident[DOF_ID_MAG1] = DOF_MAG_MAG1;
11538 11545 dof->dofh_ident[DOF_ID_MAG2] = DOF_MAG_MAG2;
11539 11546 dof->dofh_ident[DOF_ID_MAG3] = DOF_MAG_MAG3;
11540 11547
11541 11548 dof->dofh_ident[DOF_ID_MODEL] = DOF_MODEL_NATIVE;
11542 11549 dof->dofh_ident[DOF_ID_ENCODING] = DOF_ENCODE_NATIVE;
11543 11550 dof->dofh_ident[DOF_ID_VERSION] = DOF_VERSION;
11544 11551 dof->dofh_ident[DOF_ID_DIFVERS] = DIF_VERSION;
11545 11552 dof->dofh_ident[DOF_ID_DIFIREG] = DIF_DIR_NREGS;
11546 11553 dof->dofh_ident[DOF_ID_DIFTREG] = DIF_DTR_NREGS;
11547 11554
11548 11555 dof->dofh_flags = 0;
11549 11556 dof->dofh_hdrsize = sizeof (dof_hdr_t);
11550 11557 dof->dofh_secsize = sizeof (dof_sec_t);
11551 11558 dof->dofh_secnum = 1; /* only DOF_SECT_OPTDESC */
11552 11559 dof->dofh_secoff = sizeof (dof_hdr_t);
11553 11560 dof->dofh_loadsz = len;
11554 11561 dof->dofh_filesz = len;
11555 11562 dof->dofh_pad = 0;
11556 11563
11557 11564 /*
11558 11565 * Fill in the option section header...
11559 11566 */
11560 11567 sec = (dof_sec_t *)((uintptr_t)dof + sizeof (dof_hdr_t));
11561 11568 sec->dofs_type = DOF_SECT_OPTDESC;
11562 11569 sec->dofs_align = sizeof (uint64_t);
11563 11570 sec->dofs_flags = DOF_SECF_LOAD;
11564 11571 sec->dofs_entsize = sizeof (dof_optdesc_t);
11565 11572
11566 11573 opt = (dof_optdesc_t *)((uintptr_t)sec +
11567 11574 roundup(sizeof (dof_sec_t), sizeof (uint64_t)));
11568 11575
11569 11576 sec->dofs_offset = (uintptr_t)opt - (uintptr_t)dof;
11570 11577 sec->dofs_size = sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
11571 11578
11572 11579 for (i = 0; i < DTRACEOPT_MAX; i++) {
11573 11580 opt[i].dofo_option = i;
11574 11581 opt[i].dofo_strtab = DOF_SECIDX_NONE;
11575 11582 opt[i].dofo_value = state->dts_options[i];
11576 11583 }
11577 11584
11578 11585 return (dof);
11579 11586 }
11580 11587
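/*
 * A minimal sketch (name assumed) of walking the DOF created above:
 * locate the lone DOF_SECT_OPTDESC section via the header and scan its
 * dof_optdesc_t entries for the requested option.
 */
static uint64_t
dtrace_dof_getopt_sketch(dof_hdr_t *dof, uint32_t option)
{
	dof_sec_t *sec = (dof_sec_t *)((uintptr_t)dof + dof->dofh_secoff);
	dof_optdesc_t *opt = (dof_optdesc_t *)((uintptr_t)dof +
	    sec->dofs_offset);
	size_t i, nopts = sec->dofs_size / sec->dofs_entsize;

	ASSERT(sec->dofs_type == DOF_SECT_OPTDESC);

	for (i = 0; i < nopts; i++) {
		if (opt[i].dofo_option == option)
			return (opt[i].dofo_value);
	}

	return (DTRACEOPT_UNSET);
}
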
11581 11588 static dof_hdr_t *
11582 11589 dtrace_dof_copyin(uintptr_t uarg, int *errp)
11583 11590 {
11584 11591 dof_hdr_t hdr, *dof;
11585 11592
11586 11593 ASSERT(!MUTEX_HELD(&dtrace_lock));
11587 11594
11588 11595 /*
11589 11596 * First, we're going to copyin() the sizeof (dof_hdr_t).
11590 11597 */
11591 11598 if (copyin((void *)uarg, &hdr, sizeof (hdr)) != 0) {
11592 11599 dtrace_dof_error(NULL, "failed to copyin DOF header");
11593 11600 *errp = EFAULT;
11594 11601 return (NULL);
11595 11602 }
11596 11603
11597 11604 /*
11598 11605 * Now we'll allocate the entire DOF and copy it in -- provided
11599 11606 * that the length isn't outrageous.
11600 11607 */
11601 11608 if (hdr.dofh_loadsz >= dtrace_dof_maxsize) {
11602 11609 dtrace_dof_error(&hdr, "load size exceeds maximum");
11603 11610 *errp = E2BIG;
11604 11611 return (NULL);
11605 11612 }
11606 11613
11607 11614 if (hdr.dofh_loadsz < sizeof (hdr)) {
11608 11615 dtrace_dof_error(&hdr, "invalid load size");
11609 11616 *errp = EINVAL;
11610 11617 return (NULL);
11611 11618 }
11612 11619
11613 11620 dof = kmem_alloc(hdr.dofh_loadsz, KM_SLEEP);
11614 11621
11615 11622 if (copyin((void *)uarg, dof, hdr.dofh_loadsz) != 0 ||
11616 11623 dof->dofh_loadsz != hdr.dofh_loadsz) {
11617 11624 kmem_free(dof, hdr.dofh_loadsz);
11618 11625 *errp = EFAULT;
11619 11626 return (NULL);
11620 11627 }
11621 11628
11622 11629 return (dof);
11623 11630 }
11624 11631
11625 11632 static dof_hdr_t *
11626 11633 dtrace_dof_property(const char *name)
11627 11634 {
11628 11635 uchar_t *buf;
11629 11636 uint64_t loadsz;
11630 11637 unsigned int len, i;
11631 11638 dof_hdr_t *dof;
11632 11639
11633 11640 /*
11634 11641 	 * Unfortunately, arrays of values in .conf files are always (and
11635 11642 * only) interpreted to be integer arrays. We must read our DOF
11636 11643 * as an integer array, and then squeeze it into a byte array.
11637 11644 */
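	/*
	 * For example, a (hypothetical) .conf fragment carrying DOF would
	 * read
	 *
	 *	dof-data-0=0x7f,0x44,0x4f,0x46,...;
	 *
	 * with one byte of DOF ("\177DOF" being the magic) per integer
	 * element.
	 */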
11638 11645 if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dtrace_devi, 0,
11639 11646 (char *)name, (int **)&buf, &len) != DDI_PROP_SUCCESS)
11640 11647 return (NULL);
11641 11648
11642 11649 for (i = 0; i < len; i++)
11643 11650 buf[i] = (uchar_t)(((int *)buf)[i]);
11644 11651
11645 11652 if (len < sizeof (dof_hdr_t)) {
11646 11653 ddi_prop_free(buf);
11647 11654 dtrace_dof_error(NULL, "truncated header");
11648 11655 return (NULL);
11649 11656 }
11650 11657
11651 11658 if (len < (loadsz = ((dof_hdr_t *)buf)->dofh_loadsz)) {
11652 11659 ddi_prop_free(buf);
11653 11660 dtrace_dof_error(NULL, "truncated DOF");
11654 11661 return (NULL);
11655 11662 }
11656 11663
11657 11664 if (loadsz >= dtrace_dof_maxsize) {
11658 11665 ddi_prop_free(buf);
11659 11666 dtrace_dof_error(NULL, "oversized DOF");
11660 11667 return (NULL);
11661 11668 }
11662 11669
11663 11670 dof = kmem_alloc(loadsz, KM_SLEEP);
11664 11671 bcopy(buf, dof, loadsz);
11665 11672 ddi_prop_free(buf);
11666 11673
11667 11674 return (dof);
11668 11675 }
11669 11676
11670 11677 static void
11671 11678 dtrace_dof_destroy(dof_hdr_t *dof)
11672 11679 {
11673 11680 kmem_free(dof, dof->dofh_loadsz);
11674 11681 }
11675 11682
11676 11683 /*
11677 11684 * Return the dof_sec_t pointer corresponding to a given section index. If the
11678 11685 * index is not valid, dtrace_dof_error() is called and NULL is returned. If
11679 11686 * a type other than DOF_SECT_NONE is specified, the header is checked against
11680 11687 * this type and NULL is returned if the types do not match.
11681 11688 */
11682 11689 static dof_sec_t *
11683 11690 dtrace_dof_sect(dof_hdr_t *dof, uint32_t type, dof_secidx_t i)
11684 11691 {
11685 11692 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)
11686 11693 ((uintptr_t)dof + dof->dofh_secoff + i * dof->dofh_secsize);
11687 11694
11688 11695 if (i >= dof->dofh_secnum) {
11689 11696 dtrace_dof_error(dof, "referenced section index is invalid");
11690 11697 return (NULL);
11691 11698 }
11692 11699
11693 11700 if (!(sec->dofs_flags & DOF_SECF_LOAD)) {
11694 11701 dtrace_dof_error(dof, "referenced section is not loadable");
11695 11702 return (NULL);
11696 11703 }
11697 11704
11698 11705 if (type != DOF_SECT_NONE && type != sec->dofs_type) {
11699 11706 dtrace_dof_error(dof, "referenced section is the wrong type");
11700 11707 return (NULL);
11701 11708 }
11702 11709
11703 11710 return (sec);
11704 11711 }
11705 11712
11706 11713 static dtrace_probedesc_t *
11707 11714 dtrace_dof_probedesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_probedesc_t *desc)
11708 11715 {
11709 11716 dof_probedesc_t *probe;
11710 11717 dof_sec_t *strtab;
11711 11718 uintptr_t daddr = (uintptr_t)dof;
11712 11719 uintptr_t str;
11713 11720 size_t size;
11714 11721
11715 11722 if (sec->dofs_type != DOF_SECT_PROBEDESC) {
11716 11723 dtrace_dof_error(dof, "invalid probe section");
11717 11724 return (NULL);
11718 11725 }
11719 11726
11720 11727 if (sec->dofs_align != sizeof (dof_secidx_t)) {
11721 11728 dtrace_dof_error(dof, "bad alignment in probe description");
11722 11729 return (NULL);
11723 11730 }
11724 11731
11725 11732 if (sec->dofs_offset + sizeof (dof_probedesc_t) > dof->dofh_loadsz) {
11726 11733 dtrace_dof_error(dof, "truncated probe description");
11727 11734 return (NULL);
11728 11735 }
11729 11736
11730 11737 probe = (dof_probedesc_t *)(uintptr_t)(daddr + sec->dofs_offset);
11731 11738 strtab = dtrace_dof_sect(dof, DOF_SECT_STRTAB, probe->dofp_strtab);
11732 11739
11733 11740 if (strtab == NULL)
11734 11741 return (NULL);
11735 11742
11736 11743 str = daddr + strtab->dofs_offset;
11737 11744 size = strtab->dofs_size;
11738 11745
11739 11746 if (probe->dofp_provider >= strtab->dofs_size) {
11740 11747 dtrace_dof_error(dof, "corrupt probe provider");
11741 11748 return (NULL);
11742 11749 }
11743 11750
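/*
 * Copy each probe description string out of the string table, bounding
 * the copy by both the destination buffer and the bytes remaining in
 * the table.  The destination is zeroed by our caller (it lives in a
 * kmem_zalloc()'d ecbdesc), so a truncated copy is still NUL-terminated.
 */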
11744 11751 (void) strncpy(desc->dtpd_provider,
11745 11752 (char *)(str + probe->dofp_provider),
11746 11753 MIN(DTRACE_PROVNAMELEN - 1, size - probe->dofp_provider));
11747 11754
11748 11755 if (probe->dofp_mod >= strtab->dofs_size) {
11749 11756 dtrace_dof_error(dof, "corrupt probe module");
11750 11757 return (NULL);
11751 11758 }
11752 11759
11753 11760 (void) strncpy(desc->dtpd_mod, (char *)(str + probe->dofp_mod),
11754 11761 MIN(DTRACE_MODNAMELEN - 1, size - probe->dofp_mod));
11755 11762
11756 11763 if (probe->dofp_func >= strtab->dofs_size) {
11757 11764 dtrace_dof_error(dof, "corrupt probe function");
11758 11765 return (NULL);
11759 11766 }
11760 11767
11761 11768 (void) strncpy(desc->dtpd_func, (char *)(str + probe->dofp_func),
11762 11769 MIN(DTRACE_FUNCNAMELEN - 1, size - probe->dofp_func));
11763 11770
11764 11771 if (probe->dofp_name >= strtab->dofs_size) {
11765 11772 dtrace_dof_error(dof, "corrupt probe name");
11766 11773 return (NULL);
11767 11774 }
11768 11775
11769 11776 (void) strncpy(desc->dtpd_name, (char *)(str + probe->dofp_name),
11770 11777 MIN(DTRACE_NAMELEN - 1, size - probe->dofp_name));
11771 11778
11772 11779 return (desc);
11773 11780 }
11774 11781
11775 11782 static dtrace_difo_t *
11776 11783 dtrace_dof_difo(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
11777 11784 cred_t *cr)
11778 11785 {
11779 11786 dtrace_difo_t *dp;
11780 11787 size_t ttl = 0;
11781 11788 dof_difohdr_t *dofd;
11782 11789 uintptr_t daddr = (uintptr_t)dof;
11783 11790 size_t max = dtrace_difo_maxsize;
11784 11791 int i, l, n;
11785 11792
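/*
 * A table-driven description of the DIFO sub-sections we understand:
 * for each section type, the offsets within the dtrace_difo_t of the
 * corresponding buffer pointer and length, the expected entry size and
 * alignment, and the error to report if the section is seen more than
 * once.
 */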
11786 11793 static const struct {
11787 11794 int section;
11788 11795 int bufoffs;
11789 11796 int lenoffs;
11790 11797 int entsize;
11791 11798 int align;
11792 11799 const char *msg;
11793 11800 } difo[] = {
11794 11801 { DOF_SECT_DIF, offsetof(dtrace_difo_t, dtdo_buf),
11795 11802 offsetof(dtrace_difo_t, dtdo_len), sizeof (dif_instr_t),
11796 11803 sizeof (dif_instr_t), "multiple DIF sections" },
11797 11804
11798 11805 { DOF_SECT_INTTAB, offsetof(dtrace_difo_t, dtdo_inttab),
11799 11806 offsetof(dtrace_difo_t, dtdo_intlen), sizeof (uint64_t),
11800 11807 sizeof (uint64_t), "multiple integer tables" },
11801 11808
11802 11809 { DOF_SECT_STRTAB, offsetof(dtrace_difo_t, dtdo_strtab),
11803 11810 offsetof(dtrace_difo_t, dtdo_strlen), 0,
11804 11811 sizeof (char), "multiple string tables" },
11805 11812
11806 11813 { DOF_SECT_VARTAB, offsetof(dtrace_difo_t, dtdo_vartab),
11807 11814 offsetof(dtrace_difo_t, dtdo_varlen), sizeof (dtrace_difv_t),
11808 11815 sizeof (uint_t), "multiple variable tables" },
11809 11816
11810 11817 		{ DOF_SECT_NONE, 0, 0, 0, 0, NULL }
11811 11818 };
11812 11819
11813 11820 if (sec->dofs_type != DOF_SECT_DIFOHDR) {
11814 11821 dtrace_dof_error(dof, "invalid DIFO header section");
11815 11822 return (NULL);
11816 11823 }
11817 11824
11818 11825 if (sec->dofs_align != sizeof (dof_secidx_t)) {
11819 11826 dtrace_dof_error(dof, "bad alignment in DIFO header");
11820 11827 return (NULL);
11821 11828 }
11822 11829
11823 11830 if (sec->dofs_size < sizeof (dof_difohdr_t) ||
11824 11831 sec->dofs_size % sizeof (dof_secidx_t)) {
11825 11832 dtrace_dof_error(dof, "bad size in DIFO header");
11826 11833 return (NULL);
11827 11834 }
11828 11835
11829 11836 dofd = (dof_difohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
11830 11837 n = (sec->dofs_size - sizeof (*dofd)) / sizeof (dof_secidx_t) + 1;
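/*
 * Note that dof_difohdr_t declares dofd_links as a one-element array,
 * so sizeof (*dofd) already includes the first link -- hence the
 * trailing "+ 1" in the computation of 'n' above.
 */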
11831 11838
11832 11839 dp = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
11833 11840 dp->dtdo_rtype = dofd->dofd_rtype;
11834 11841
11835 11842 for (l = 0; l < n; l++) {
11836 11843 dof_sec_t *subsec;
11837 11844 void **bufp;
11838 11845 uint32_t *lenp;
11839 11846
11840 11847 if ((subsec = dtrace_dof_sect(dof, DOF_SECT_NONE,
11841 11848 dofd->dofd_links[l])) == NULL)
11842 11849 goto err; /* invalid section link */
11843 11850
11844 11851 if (ttl + subsec->dofs_size > max) {
11845 11852 dtrace_dof_error(dof, "exceeds maximum size");
11846 11853 goto err;
11847 11854 }
11848 11855
11849 11856 ttl += subsec->dofs_size;
11850 11857
11851 11858 for (i = 0; difo[i].section != DOF_SECT_NONE; i++) {
11852 11859 if (subsec->dofs_type != difo[i].section)
11853 11860 continue;
11854 11861
11855 11862 if (!(subsec->dofs_flags & DOF_SECF_LOAD)) {
11856 11863 dtrace_dof_error(dof, "section not loaded");
11857 11864 goto err;
11858 11865 }
11859 11866
11860 11867 if (subsec->dofs_align != difo[i].align) {
11861 11868 dtrace_dof_error(dof, "bad alignment");
11862 11869 goto err;
11863 11870 }
11864 11871
11865 11872 bufp = (void **)((uintptr_t)dp + difo[i].bufoffs);
11866 11873 lenp = (uint32_t *)((uintptr_t)dp + difo[i].lenoffs);
11867 11874
11868 11875 if (*bufp != NULL) {
11869 11876 dtrace_dof_error(dof, difo[i].msg);
11870 11877 goto err;
11871 11878 }
11872 11879
11873 11880 if (difo[i].entsize != subsec->dofs_entsize) {
11874 11881 dtrace_dof_error(dof, "entry size mismatch");
11875 11882 goto err;
11876 11883 }
11877 11884
11878 11885 if (subsec->dofs_entsize != 0 &&
11879 11886 (subsec->dofs_size % subsec->dofs_entsize) != 0) {
11880 11887 dtrace_dof_error(dof, "corrupt entry size");
11881 11888 goto err;
11882 11889 }
11883 11890
11884 11891 *lenp = subsec->dofs_size;
11885 11892 *bufp = kmem_alloc(subsec->dofs_size, KM_SLEEP);
11886 11893 bcopy((char *)(uintptr_t)(daddr + subsec->dofs_offset),
11887 11894 *bufp, subsec->dofs_size);
11888 11895
11889 11896 if (subsec->dofs_entsize != 0)
11890 11897 *lenp /= subsec->dofs_entsize;
11891 11898
11892 11899 break;
11893 11900 }
11894 11901
11895 11902 /*
11896 11903 * If we encounter a loadable DIFO sub-section that is not
11897 11904 * known to us, assume this is a broken program and fail.
11898 11905 */
11899 11906 if (difo[i].section == DOF_SECT_NONE &&
11900 11907 (subsec->dofs_flags & DOF_SECF_LOAD)) {
11901 11908 dtrace_dof_error(dof, "unrecognized DIFO subsection");
11902 11909 goto err;
11903 11910 }
11904 11911 }
11905 11912
11906 11913 if (dp->dtdo_buf == NULL) {
11907 11914 /*
11908 11915 * We can't have a DIF object without DIF text.
11909 11916 */
11910 11917 dtrace_dof_error(dof, "missing DIF text");
11911 11918 goto err;
11912 11919 }
11913 11920
11914 11921 /*
11915 11922 * Before we validate the DIF object, run through the variable table
11916 11923 	 * looking for string variables -- if any of them have a zero size, we'll
11917 11924 	 * set their size to the system-wide default string size.  Note that
11918 11925 * this should _not_ happen if the "strsize" option has been set --
11919 11926 * in this case, the compiler should have set the size to reflect the
11920 11927 * setting of the option.
11921 11928 */
11922 11929 for (i = 0; i < dp->dtdo_varlen; i++) {
11923 11930 dtrace_difv_t *v = &dp->dtdo_vartab[i];
11924 11931 dtrace_diftype_t *t = &v->dtdv_type;
11925 11932
11926 11933 if (v->dtdv_id < DIF_VAR_OTHER_UBASE)
11927 11934 continue;
11928 11935
11929 11936 if (t->dtdt_kind == DIF_TYPE_STRING && t->dtdt_size == 0)
11930 11937 t->dtdt_size = dtrace_strsize_default;
11931 11938 }
11932 11939
11933 11940 if (dtrace_difo_validate(dp, vstate, DIF_DIR_NREGS, cr) != 0)
11934 11941 goto err;
11935 11942
11936 11943 dtrace_difo_init(dp, vstate);
11937 11944 return (dp);
11938 11945
11939 11946 err:
11940 11947 kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
11941 11948 kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
11942 11949 kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
11943 11950 kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
11944 11951
11945 11952 kmem_free(dp, sizeof (dtrace_difo_t));
11946 11953 return (NULL);
11947 11954 }
11948 11955
11949 11956 static dtrace_predicate_t *
11950 11957 dtrace_dof_predicate(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
11951 11958 cred_t *cr)
11952 11959 {
11953 11960 dtrace_difo_t *dp;
11954 11961
11955 11962 if ((dp = dtrace_dof_difo(dof, sec, vstate, cr)) == NULL)
11956 11963 return (NULL);
11957 11964
11958 11965 return (dtrace_predicate_create(dp));
11959 11966 }
11960 11967
11961 11968 static dtrace_actdesc_t *
11962 11969 dtrace_dof_actdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
11963 11970 cred_t *cr)
11964 11971 {
11965 11972 dtrace_actdesc_t *act, *first = NULL, *last = NULL, *next;
11966 11973 dof_actdesc_t *desc;
11967 11974 dof_sec_t *difosec;
11968 11975 size_t offs;
11969 11976 uintptr_t daddr = (uintptr_t)dof;
11970 11977 uint64_t arg;
11971 11978 dtrace_actkind_t kind;
11972 11979
11973 11980 if (sec->dofs_type != DOF_SECT_ACTDESC) {
11974 11981 dtrace_dof_error(dof, "invalid action section");
11975 11982 return (NULL);
11976 11983 }
11977 11984
11978 11985 if (sec->dofs_offset + sizeof (dof_actdesc_t) > dof->dofh_loadsz) {
11979 11986 dtrace_dof_error(dof, "truncated action description");
11980 11987 return (NULL);
11981 11988 }
11982 11989
11983 11990 if (sec->dofs_align != sizeof (uint64_t)) {
11984 11991 dtrace_dof_error(dof, "bad alignment in action description");
11985 11992 return (NULL);
11986 11993 }
11987 11994
11988 11995 if (sec->dofs_size < sec->dofs_entsize) {
11989 11996 dtrace_dof_error(dof, "section entry size exceeds total size");
11990 11997 return (NULL);
11991 11998 }
11992 11999
11993 12000 if (sec->dofs_entsize != sizeof (dof_actdesc_t)) {
11994 12001 dtrace_dof_error(dof, "bad entry size in action description");
11995 12002 return (NULL);
11996 12003 }
11997 12004
11998 12005 if (sec->dofs_size / sec->dofs_entsize > dtrace_actions_max) {
11999 12006 dtrace_dof_error(dof, "actions exceed dtrace_actions_max");
12000 12007 return (NULL);
12001 12008 }
12002 12009
12003 12010 for (offs = 0; offs < sec->dofs_size; offs += sec->dofs_entsize) {
12004 12011 desc = (dof_actdesc_t *)(daddr +
12005 12012 (uintptr_t)sec->dofs_offset + offs);
12006 12013 kind = (dtrace_actkind_t)desc->dofa_kind;
12007 12014
12008 12015 if ((DTRACEACT_ISPRINTFLIKE(kind) &&
12009 12016 (kind != DTRACEACT_PRINTA ||
12010 12017 desc->dofa_strtab != DOF_SECIDX_NONE)) ||
12011 12018 (kind == DTRACEACT_DIFEXPR &&
12012 12019 desc->dofa_strtab != DOF_SECIDX_NONE)) {
12013 12020 dof_sec_t *strtab;
12014 12021 char *str, *fmt;
12015 12022 uint64_t i;
12016 12023
12017 12024 /*
12018 12025 * The argument to these actions is an index into the
12019 12026 * DOF string table. For printf()-like actions, this
12020 12027 * is the format string. For print(), this is the
12021 12028 * CTF type of the expression result.
12022 12029 */
12023 12030 if ((strtab = dtrace_dof_sect(dof,
12024 12031 DOF_SECT_STRTAB, desc->dofa_strtab)) == NULL)
12025 12032 goto err;
12026 12033
12027 12034 str = (char *)((uintptr_t)dof +
12028 12035 (uintptr_t)strtab->dofs_offset);
12029 12036
12030 12037 for (i = desc->dofa_arg; i < strtab->dofs_size; i++) {
12031 12038 if (str[i] == '\0')
12032 12039 break;
12033 12040 }
12034 12041
12035 12042 if (i >= strtab->dofs_size) {
12036 12043 dtrace_dof_error(dof, "bogus format string");
12037 12044 goto err;
12038 12045 }
12039 12046
12040 12047 if (i == desc->dofa_arg) {
12041 12048 dtrace_dof_error(dof, "empty format string");
12042 12049 goto err;
12043 12050 }
12044 12051
12045 12052 i -= desc->dofa_arg;
12046 12053 fmt = kmem_alloc(i + 1, KM_SLEEP);
12047 12054 bcopy(&str[desc->dofa_arg], fmt, i + 1);
12048 12055 arg = (uint64_t)(uintptr_t)fmt;
12049 12056 } else {
12050 12057 if (kind == DTRACEACT_PRINTA) {
12051 12058 ASSERT(desc->dofa_strtab == DOF_SECIDX_NONE);
12052 12059 arg = 0;
12053 12060 } else {
12054 12061 arg = desc->dofa_arg;
12055 12062 }
12056 12063 }
12057 12064
12058 12065 act = dtrace_actdesc_create(kind, desc->dofa_ntuple,
12059 12066 desc->dofa_uarg, arg);
12060 12067
12061 12068 if (last != NULL) {
12062 12069 last->dtad_next = act;
12063 12070 } else {
12064 12071 first = act;
12065 12072 }
12066 12073
12067 12074 last = act;
12068 12075
12069 12076 if (desc->dofa_difo == DOF_SECIDX_NONE)
12070 12077 continue;
12071 12078
12072 12079 if ((difosec = dtrace_dof_sect(dof,
12073 12080 DOF_SECT_DIFOHDR, desc->dofa_difo)) == NULL)
12074 12081 goto err;
12075 12082
12076 12083 act->dtad_difo = dtrace_dof_difo(dof, difosec, vstate, cr);
12077 12084
12078 12085 if (act->dtad_difo == NULL)
12079 12086 goto err;
12080 12087 }
12081 12088
12082 12089 ASSERT(first != NULL);
12083 12090 return (first);
12084 12091
12085 12092 err:
12086 12093 for (act = first; act != NULL; act = next) {
12087 12094 next = act->dtad_next;
12088 12095 dtrace_actdesc_release(act, vstate);
12089 12096 }
12090 12097
12091 12098 return (NULL);
12092 12099 }
12093 12100
12094 12101 static dtrace_ecbdesc_t *
12095 12102 dtrace_dof_ecbdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
12096 12103 cred_t *cr)
12097 12104 {
12098 12105 dtrace_ecbdesc_t *ep;
12099 12106 dof_ecbdesc_t *ecb;
12100 12107 dtrace_probedesc_t *desc;
12101 12108 dtrace_predicate_t *pred = NULL;
12102 12109
12103 12110 if (sec->dofs_size < sizeof (dof_ecbdesc_t)) {
12104 12111 dtrace_dof_error(dof, "truncated ECB description");
12105 12112 return (NULL);
12106 12113 }
12107 12114
12108 12115 if (sec->dofs_align != sizeof (uint64_t)) {
12109 12116 dtrace_dof_error(dof, "bad alignment in ECB description");
12110 12117 return (NULL);
12111 12118 }
12112 12119
12113 12120 ecb = (dof_ecbdesc_t *)((uintptr_t)dof + (uintptr_t)sec->dofs_offset);
12114 12121 sec = dtrace_dof_sect(dof, DOF_SECT_PROBEDESC, ecb->dofe_probes);
12115 12122
12116 12123 if (sec == NULL)
12117 12124 return (NULL);
12118 12125
12119 12126 ep = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
12120 12127 ep->dted_uarg = ecb->dofe_uarg;
12121 12128 desc = &ep->dted_probe;
12122 12129
12123 12130 if (dtrace_dof_probedesc(dof, sec, desc) == NULL)
12124 12131 goto err;
12125 12132
12126 12133 if (ecb->dofe_pred != DOF_SECIDX_NONE) {
12127 12134 if ((sec = dtrace_dof_sect(dof,
12128 12135 DOF_SECT_DIFOHDR, ecb->dofe_pred)) == NULL)
12129 12136 goto err;
12130 12137
12131 12138 if ((pred = dtrace_dof_predicate(dof, sec, vstate, cr)) == NULL)
12132 12139 goto err;
12133 12140
12134 12141 ep->dted_pred.dtpdd_predicate = pred;
12135 12142 }
12136 12143
12137 12144 if (ecb->dofe_actions != DOF_SECIDX_NONE) {
12138 12145 if ((sec = dtrace_dof_sect(dof,
12139 12146 DOF_SECT_ACTDESC, ecb->dofe_actions)) == NULL)
12140 12147 goto err;
12141 12148
12142 12149 ep->dted_action = dtrace_dof_actdesc(dof, sec, vstate, cr);
12143 12150
12144 12151 if (ep->dted_action == NULL)
12145 12152 goto err;
12146 12153 }
12147 12154
12148 12155 return (ep);
12149 12156
12150 12157 err:
12151 12158 if (pred != NULL)
12152 12159 dtrace_predicate_release(pred, vstate);
12153 12160 kmem_free(ep, sizeof (dtrace_ecbdesc_t));
12154 12161 return (NULL);
12155 12162 }
12156 12163
12157 12164 /*
12158 12165 * Apply the relocations from the specified 'sec' (a DOF_SECT_URELHDR) to the
12159 12166 * specified DOF. At present, this amounts to simply adding 'ubase' to the
12160 12167  * site of any user SETX relocations to account for the load object's base address.
12161 12168 * In the future, if we need other relocations, this function can be extended.
12162 12169 */
12163 12170 static int
12164 12171 dtrace_dof_relocate(dof_hdr_t *dof, dof_sec_t *sec, uint64_t ubase)
12165 12172 {
12166 12173 uintptr_t daddr = (uintptr_t)dof;
12167 12174 dof_relohdr_t *dofr =
12168 12175 (dof_relohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
12169 12176 dof_sec_t *ss, *rs, *ts;
12170 12177 dof_relodesc_t *r;
12171 12178 uint_t i, n;
12172 12179
12173 12180 if (sec->dofs_size < sizeof (dof_relohdr_t) ||
12174 12181 sec->dofs_align != sizeof (dof_secidx_t)) {
12175 12182 dtrace_dof_error(dof, "invalid relocation header");
12176 12183 return (-1);
12177 12184 }
12178 12185
12179 12186 ss = dtrace_dof_sect(dof, DOF_SECT_STRTAB, dofr->dofr_strtab);
12180 12187 rs = dtrace_dof_sect(dof, DOF_SECT_RELTAB, dofr->dofr_relsec);
12181 12188 ts = dtrace_dof_sect(dof, DOF_SECT_NONE, dofr->dofr_tgtsec);
12182 12189
12183 12190 if (ss == NULL || rs == NULL || ts == NULL)
12184 12191 return (-1); /* dtrace_dof_error() has been called already */
12185 12192
12186 12193 if (rs->dofs_entsize < sizeof (dof_relodesc_t) ||
12187 12194 rs->dofs_align != sizeof (uint64_t)) {
12188 12195 dtrace_dof_error(dof, "invalid relocation section");
12189 12196 return (-1);
12190 12197 }
12191 12198
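/*
 * Entries may legitimately be larger than dof_relodesc_t (we only
 * checked dofs_entsize for a minimum above), so we step through the
 * table by the declared entry size rather than the structure size.
 */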
12192 12199 r = (dof_relodesc_t *)(uintptr_t)(daddr + rs->dofs_offset);
12193 12200 n = rs->dofs_size / rs->dofs_entsize;
12194 12201
12195 12202 for (i = 0; i < n; i++) {
12196 12203 uintptr_t taddr = daddr + ts->dofs_offset + r->dofr_offset;
12197 12204
12198 12205 switch (r->dofr_type) {
12199 12206 case DOF_RELO_NONE:
12200 12207 break;
12201 12208 case DOF_RELO_SETX:
12202 12209 if (r->dofr_offset >= ts->dofs_size || r->dofr_offset +
12203 12210 sizeof (uint64_t) > ts->dofs_size) {
12204 12211 dtrace_dof_error(dof, "bad relocation offset");
12205 12212 return (-1);
12206 12213 }
12207 12214
12208 12215 if (!IS_P2ALIGNED(taddr, sizeof (uint64_t))) {
12209 12216 dtrace_dof_error(dof, "misaligned setx relo");
12210 12217 return (-1);
12211 12218 }
12212 12219
12213 12220 *(uint64_t *)taddr += ubase;
12214 12221 break;
12215 12222 default:
12216 12223 dtrace_dof_error(dof, "invalid relocation type");
12217 12224 return (-1);
12218 12225 }
12219 12226
12220 12227 r = (dof_relodesc_t *)((uintptr_t)r + rs->dofs_entsize);
12221 12228 }
12222 12229
12223 12230 return (0);
12224 12231 }
12225 12232
12226 12233 /*
12227 12234 * The dof_hdr_t passed to dtrace_dof_slurp() should be a partially validated
12228 12235 * header: it should be at the front of a memory region that is at least
12229 12236 * sizeof (dof_hdr_t) in size -- and then at least dof_hdr.dofh_loadsz in
12230 12237 * size. It need not be validated in any other way.
12231 12238 */
12232 12239 static int
12233 12240 dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr,
12234 12241 dtrace_enabling_t **enabp, uint64_t ubase, int noprobes)
12235 12242 {
12236 12243 uint64_t len = dof->dofh_loadsz, seclen;
12237 12244 uintptr_t daddr = (uintptr_t)dof;
12238 12245 dtrace_ecbdesc_t *ep;
12239 12246 dtrace_enabling_t *enab;
12240 12247 uint_t i;
12241 12248
12242 12249 ASSERT(MUTEX_HELD(&dtrace_lock));
12243 12250 ASSERT(dof->dofh_loadsz >= sizeof (dof_hdr_t));
12244 12251
12245 12252 /*
12246 12253 * Check the DOF header identification bytes. In addition to checking
12247 12254 * valid settings, we also verify that unused bits/bytes are zeroed so
12248 12255 * we can use them later without fear of regressing existing binaries.
12249 12256 */
12250 12257 if (bcmp(&dof->dofh_ident[DOF_ID_MAG0],
12251 12258 DOF_MAG_STRING, DOF_MAG_STRLEN) != 0) {
12252 12259 dtrace_dof_error(dof, "DOF magic string mismatch");
12253 12260 return (-1);
12254 12261 }
12255 12262
12256 12263 if (dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_ILP32 &&
12257 12264 dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_LP64) {
12258 12265 dtrace_dof_error(dof, "DOF has invalid data model");
12259 12266 return (-1);
12260 12267 }
12261 12268
12262 12269 if (dof->dofh_ident[DOF_ID_ENCODING] != DOF_ENCODE_NATIVE) {
12263 12270 dtrace_dof_error(dof, "DOF encoding mismatch");
12264 12271 return (-1);
12265 12272 }
12266 12273
12267 12274 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
12268 12275 dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_2) {
12269 12276 dtrace_dof_error(dof, "DOF version mismatch");
12270 12277 return (-1);
12271 12278 }
12272 12279
12273 12280 if (dof->dofh_ident[DOF_ID_DIFVERS] != DIF_VERSION_2) {
12274 12281 dtrace_dof_error(dof, "DOF uses unsupported instruction set");
12275 12282 return (-1);
12276 12283 }
12277 12284
12278 12285 if (dof->dofh_ident[DOF_ID_DIFIREG] > DIF_DIR_NREGS) {
12279 12286 dtrace_dof_error(dof, "DOF uses too many integer registers");
12280 12287 return (-1);
12281 12288 }
12282 12289
12283 12290 if (dof->dofh_ident[DOF_ID_DIFTREG] > DIF_DTR_NREGS) {
12284 12291 dtrace_dof_error(dof, "DOF uses too many tuple registers");
12285 12292 return (-1);
12286 12293 }
12287 12294
12288 12295 for (i = DOF_ID_PAD; i < DOF_ID_SIZE; i++) {
12289 12296 if (dof->dofh_ident[i] != 0) {
12290 12297 dtrace_dof_error(dof, "DOF has invalid ident byte set");
12291 12298 return (-1);
12292 12299 }
12293 12300 }
12294 12301
12295 12302 if (dof->dofh_flags & ~DOF_FL_VALID) {
12296 12303 dtrace_dof_error(dof, "DOF has invalid flag bits set");
12297 12304 return (-1);
12298 12305 }
12299 12306
12300 12307 if (dof->dofh_secsize == 0) {
12301 12308 dtrace_dof_error(dof, "zero section header size");
12302 12309 return (-1);
12303 12310 }
12304 12311
12305 12312 /*
12306 12313 * Check that the section headers don't exceed the amount of DOF
12307 12314 * data. Note that we cast the section size and number of sections
12308 12315 * to uint64_t's to prevent possible overflow in the multiplication.
12309 12316 */
12310 12317 seclen = (uint64_t)dof->dofh_secnum * (uint64_t)dof->dofh_secsize;
12311 12318
12312 12319 if (dof->dofh_secoff > len || seclen > len ||
12313 12320 dof->dofh_secoff + seclen > len) {
12314 12321 dtrace_dof_error(dof, "truncated section headers");
12315 12322 return (-1);
12316 12323 }
12317 12324
12318 12325 if (!IS_P2ALIGNED(dof->dofh_secoff, sizeof (uint64_t))) {
12319 12326 dtrace_dof_error(dof, "misaligned section headers");
12320 12327 return (-1);
12321 12328 }
12322 12329
12323 12330 if (!IS_P2ALIGNED(dof->dofh_secsize, sizeof (uint64_t))) {
12324 12331 dtrace_dof_error(dof, "misaligned section size");
12325 12332 return (-1);
12326 12333 }
12327 12334
12328 12335 /*
12329 12336 * Take an initial pass through the section headers to be sure that
12330 12337 * the headers don't have stray offsets. If the 'noprobes' flag is
12331 12338 * set, do not permit sections relating to providers, probes, or args.
12332 12339 */
12333 12340 for (i = 0; i < dof->dofh_secnum; i++) {
12334 12341 dof_sec_t *sec = (dof_sec_t *)(daddr +
12335 12342 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
12336 12343
12337 12344 if (noprobes) {
12338 12345 switch (sec->dofs_type) {
12339 12346 case DOF_SECT_PROVIDER:
12340 12347 case DOF_SECT_PROBES:
12341 12348 case DOF_SECT_PRARGS:
12342 12349 case DOF_SECT_PROFFS:
12343 12350 dtrace_dof_error(dof, "illegal sections "
12344 12351 "for enabling");
12345 12352 return (-1);
12346 12353 }
12347 12354 }
12348 12355
12349 12356 if (DOF_SEC_ISLOADABLE(sec->dofs_type) &&
12350 12357 !(sec->dofs_flags & DOF_SECF_LOAD)) {
12351 12358 dtrace_dof_error(dof, "loadable section with load "
12352 12359 "flag unset");
12353 12360 return (-1);
12354 12361 }
12355 12362
12356 12363 if (!(sec->dofs_flags & DOF_SECF_LOAD))
12357 12364 continue; /* just ignore non-loadable sections */
12358 12365
12359 12366 if (sec->dofs_align & (sec->dofs_align - 1)) {
12360 12367 dtrace_dof_error(dof, "bad section alignment");
12361 12368 return (-1);
12362 12369 }
12363 12370
12364 12371 if (sec->dofs_offset & (sec->dofs_align - 1)) {
12365 12372 dtrace_dof_error(dof, "misaligned section");
12366 12373 return (-1);
12367 12374 }
12368 12375
12369 12376 if (sec->dofs_offset > len || sec->dofs_size > len ||
12370 12377 sec->dofs_offset + sec->dofs_size > len) {
12371 12378 dtrace_dof_error(dof, "corrupt section header");
12372 12379 return (-1);
12373 12380 }
12374 12381
12375 12382 if (sec->dofs_type == DOF_SECT_STRTAB && *((char *)daddr +
12376 12383 sec->dofs_offset + sec->dofs_size - 1) != '\0') {
12377 12384 dtrace_dof_error(dof, "non-terminating string table");
12378 12385 return (-1);
12379 12386 }
12380 12387 }
12381 12388
12382 12389 /*
12383 12390 * Take a second pass through the sections and locate and perform any
12384 12391 * relocations that are present. We do this after the first pass to
12385 12392 * be sure that all sections have had their headers validated.
12386 12393 */
12387 12394 for (i = 0; i < dof->dofh_secnum; i++) {
12388 12395 dof_sec_t *sec = (dof_sec_t *)(daddr +
12389 12396 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
12390 12397
12391 12398 if (!(sec->dofs_flags & DOF_SECF_LOAD))
12392 12399 continue; /* skip sections that are not loadable */
12393 12400
12394 12401 switch (sec->dofs_type) {
12395 12402 case DOF_SECT_URELHDR:
12396 12403 if (dtrace_dof_relocate(dof, sec, ubase) != 0)
12397 12404 return (-1);
12398 12405 break;
12399 12406 }
12400 12407 }
12401 12408
12402 12409 if ((enab = *enabp) == NULL)
12403 12410 enab = *enabp = dtrace_enabling_create(vstate);
12404 12411
12405 12412 for (i = 0; i < dof->dofh_secnum; i++) {
12406 12413 dof_sec_t *sec = (dof_sec_t *)(daddr +
12407 12414 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
12408 12415
12409 12416 if (sec->dofs_type != DOF_SECT_ECBDESC)
12410 12417 continue;
12411 12418
12412 12419 if ((ep = dtrace_dof_ecbdesc(dof, sec, vstate, cr)) == NULL) {
12413 12420 dtrace_enabling_destroy(enab);
12414 12421 *enabp = NULL;
12415 12422 return (-1);
12416 12423 }
12417 12424
12418 12425 dtrace_enabling_add(enab, ep);
12419 12426 }
12420 12427
12421 12428 return (0);
12422 12429 }
12423 12430
12424 12431 /*
12425 12432  * Process DOF for any options.  This routine assumes that the DOF has
12426 12433  * already been processed by dtrace_dof_slurp().
12427 12434 */
12428 12435 static int
12429 12436 dtrace_dof_options(dof_hdr_t *dof, dtrace_state_t *state)
12430 12437 {
12431 12438 int i, rval;
12432 12439 uint32_t entsize;
12433 12440 size_t offs;
12434 12441 dof_optdesc_t *desc;
12435 12442
12436 12443 for (i = 0; i < dof->dofh_secnum; i++) {
12437 12444 dof_sec_t *sec = (dof_sec_t *)((uintptr_t)dof +
12438 12445 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
12439 12446
12440 12447 if (sec->dofs_type != DOF_SECT_OPTDESC)
12441 12448 continue;
12442 12449
12443 12450 if (sec->dofs_align != sizeof (uint64_t)) {
12444 12451 dtrace_dof_error(dof, "bad alignment in "
12445 12452 "option description");
12446 12453 return (EINVAL);
12447 12454 }
12448 12455
12449 12456 if ((entsize = sec->dofs_entsize) == 0) {
12450 12457 dtrace_dof_error(dof, "zeroed option entry size");
12451 12458 return (EINVAL);
12452 12459 }
12453 12460
12454 12461 if (entsize < sizeof (dof_optdesc_t)) {
12455 12462 dtrace_dof_error(dof, "bad option entry size");
12456 12463 return (EINVAL);
12457 12464 }
12458 12465
12459 12466 for (offs = 0; offs < sec->dofs_size; offs += entsize) {
12460 12467 desc = (dof_optdesc_t *)((uintptr_t)dof +
12461 12468 (uintptr_t)sec->dofs_offset + offs);
12462 12469
12463 12470 if (desc->dofo_strtab != DOF_SECIDX_NONE) {
12464 12471 dtrace_dof_error(dof, "non-zero option string");
12465 12472 return (EINVAL);
12466 12473 }
12467 12474
12468 12475 if (desc->dofo_value == DTRACEOPT_UNSET) {
12469 12476 dtrace_dof_error(dof, "unset option");
12470 12477 return (EINVAL);
12471 12478 }
12472 12479
12473 12480 if ((rval = dtrace_state_option(state,
12474 12481 desc->dofo_option, desc->dofo_value)) != 0) {
12475 12482 dtrace_dof_error(dof, "rejected option");
12476 12483 return (rval);
12477 12484 }
12478 12485 }
12479 12486 }
12480 12487
12481 12488 return (0);
12482 12489 }
12483 12490
12484 12491 /*
12485 12492 * DTrace Consumer State Functions
12486 12493 */
12487 12494 int
12488 12495 dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
12489 12496 {
12490 12497 size_t hashsize, maxper, min, chunksize = dstate->dtds_chunksize;
12491 12498 void *base;
12492 12499 uintptr_t limit;
12493 12500 dtrace_dynvar_t *dvar, *next, *start;
12494 12501 int i;
12495 12502
12496 12503 ASSERT(MUTEX_HELD(&dtrace_lock));
12497 12504 ASSERT(dstate->dtds_base == NULL && dstate->dtds_percpu == NULL);
12498 12505
12499 12506 bzero(dstate, sizeof (dtrace_dstate_t));
12500 12507
12501 12508 if ((dstate->dtds_chunksize = chunksize) == 0)
12502 12509 dstate->dtds_chunksize = DTRACE_DYNVAR_CHUNKSIZE;
12503 12510
12504 12511 if (size < (min = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)))
12505 12512 size = min;
12506 12513
12507 12514 if ((base = kmem_zalloc(size, KM_NOSLEEP | KM_NORMALPRI)) == NULL)
12508 12515 return (ENOMEM);
12509 12516
12510 12517 dstate->dtds_size = size;
12511 12518 dstate->dtds_base = base;
12512 12519 dstate->dtds_percpu = kmem_cache_alloc(dtrace_state_cache, KM_SLEEP);
12513 12520 bzero(dstate->dtds_percpu, NCPU * sizeof (dtrace_dstate_percpu_t));
12514 12521
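/*
 * The allocation is laid out with the hash table at the front and the
 * dynamic variable chunks behind it; size the hash such that each
 * bucket corresponds to roughly one chunk's worth of space.
 */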
12515 12522 hashsize = size / (dstate->dtds_chunksize + sizeof (dtrace_dynhash_t));
12516 12523
12517 12524 if (hashsize != 1 && (hashsize & 1))
12518 12525 hashsize--;
12519 12526
12520 12527 dstate->dtds_hashsize = hashsize;
12521 12528 dstate->dtds_hash = dstate->dtds_base;
12522 12529
12523 12530 /*
12524 12531 * Set all of our hash buckets to point to the single sink, and (if
12525 12532 	 * it hasn't already been set) set the sink's hash value to be the
12526 12533 * sink sentinel value. The sink is needed for dynamic variable
12527 12534 * lookups to know that they have iterated over an entire, valid hash
12528 12535 * chain.
12529 12536 */
12530 12537 for (i = 0; i < hashsize; i++)
12531 12538 dstate->dtds_hash[i].dtdh_chain = &dtrace_dynhash_sink;
12532 12539
12533 12540 if (dtrace_dynhash_sink.dtdv_hashval != DTRACE_DYNHASH_SINK)
12534 12541 dtrace_dynhash_sink.dtdv_hashval = DTRACE_DYNHASH_SINK;
12535 12542
12536 12543 /*
12537 12544 	 * Divide the remaining free space evenly among all of the NCPU
12538 12545 	 * possible CPUs.
12539 12546 */
12540 12547 start = (dtrace_dynvar_t *)
12541 12548 ((uintptr_t)base + hashsize * sizeof (dtrace_dynhash_t));
12542 12549 limit = (uintptr_t)base + size;
12543 12550
12544 12551 maxper = (limit - (uintptr_t)start) / NCPU;
12545 12552 maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize;
12546 12553
12547 12554 for (i = 0; i < NCPU; i++) {
12548 12555 dstate->dtds_percpu[i].dtdsc_free = dvar = start;
12549 12556
12550 12557 /*
12551 12558 * If we don't even have enough chunks to make it once through
12552 12559 * NCPUs, we're just going to allocate everything to the first
12553 12560 * CPU. And if we're on the last CPU, we're going to allocate
12554 12561 * whatever is left over. In either case, we set the limit to
12555 12562 * be the limit of the dynamic variable space.
12556 12563 */
12557 12564 if (maxper == 0 || i == NCPU - 1) {
12558 12565 limit = (uintptr_t)base + size;
12559 12566 start = NULL;
12560 12567 } else {
12561 12568 limit = (uintptr_t)start + maxper;
12562 12569 start = (dtrace_dynvar_t *)limit;
12563 12570 }
12564 12571
12565 12572 ASSERT(limit <= (uintptr_t)base + size);
12566 12573
12567 12574 for (;;) {
12568 12575 next = (dtrace_dynvar_t *)((uintptr_t)dvar +
12569 12576 dstate->dtds_chunksize);
12570 12577
12571 12578 if ((uintptr_t)next + dstate->dtds_chunksize >= limit)
12572 12579 break;
12573 12580
12574 12581 dvar->dtdv_next = next;
12575 12582 dvar = next;
12576 12583 }
12577 12584
12578 12585 if (maxper == 0)
12579 12586 break;
12580 12587 }
12581 12588
12582 12589 return (0);
12583 12590 }
12584 12591
12585 12592 void
12586 12593 dtrace_dstate_fini(dtrace_dstate_t *dstate)
12587 12594 {
12588 12595 ASSERT(MUTEX_HELD(&cpu_lock));
12589 12596
12590 12597 if (dstate->dtds_base == NULL)
12591 12598 return;
12592 12599
12593 12600 kmem_free(dstate->dtds_base, dstate->dtds_size);
12594 12601 kmem_cache_free(dtrace_state_cache, dstate->dtds_percpu);
12595 12602 }
12596 12603
12597 12604 static void
12598 12605 dtrace_vstate_fini(dtrace_vstate_t *vstate)
12599 12606 {
12600 12607 /*
12601 12608 * Logical XOR, where are you?
12602 12609 */
12603 12610 ASSERT((vstate->dtvs_nglobals == 0) ^ (vstate->dtvs_globals != NULL));
12604 12611
12605 12612 if (vstate->dtvs_nglobals > 0) {
12606 12613 kmem_free(vstate->dtvs_globals, vstate->dtvs_nglobals *
12607 12614 sizeof (dtrace_statvar_t *));
12608 12615 }
12609 12616
12610 12617 if (vstate->dtvs_ntlocals > 0) {
12611 12618 kmem_free(vstate->dtvs_tlocals, vstate->dtvs_ntlocals *
12612 12619 sizeof (dtrace_difv_t));
12613 12620 }
12614 12621
12615 12622 ASSERT((vstate->dtvs_nlocals == 0) ^ (vstate->dtvs_locals != NULL));
12616 12623
12617 12624 if (vstate->dtvs_nlocals > 0) {
12618 12625 kmem_free(vstate->dtvs_locals, vstate->dtvs_nlocals *
12619 12626 sizeof (dtrace_statvar_t *));
12620 12627 }
12621 12628 }
12622 12629
12623 12630 static void
12624 12631 dtrace_state_clean(dtrace_state_t *state)
12625 12632 {
12626 12633 if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
12627 12634 return;
12628 12635
12629 12636 dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
12630 12637 dtrace_speculation_clean(state);
12631 12638 }
12632 12639
12633 12640 static void
12634 12641 dtrace_state_deadman(dtrace_state_t *state)
12635 12642 {
12636 12643 hrtime_t now;
12637 12644
12638 12645 dtrace_sync();
12639 12646
12640 12647 now = dtrace_gethrtime();
12641 12648
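/*
 * Only refresh dts_alive if this is the anonymous state or if the
 * consumer has performed a status operation recently enough; an
 * inattentive consumer is deliberately allowed to time out via the
 * deadman.
 */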
12642 12649 if (state != dtrace_anon.dta_state &&
12643 12650 now - state->dts_laststatus >= dtrace_deadman_user)
12644 12651 return;
12645 12652
12646 12653 /*
12647 12654 * We must be sure that dts_alive never appears to be less than the
12648 12655 * value upon entry to dtrace_state_deadman(), and because we lack a
12649 12656 * dtrace_cas64(), we cannot store to it atomically. We thus instead
12650 12657 * store INT64_MAX to it, followed by a memory barrier, followed by
12651 12658 * the new value. This assures that dts_alive never appears to be
12652 12659 * less than its true value, regardless of the order in which the
12653 12660 * stores to the underlying storage are issued.
12654 12661 */
12655 12662 state->dts_alive = INT64_MAX;
12656 12663 dtrace_membar_producer();
12657 12664 state->dts_alive = now;
12658 12665 }
12659 12666
12660 12667 dtrace_state_t *
12661 12668 dtrace_state_create(dev_t *devp, cred_t *cr)
12662 12669 {
12663 12670 minor_t minor;
12664 12671 major_t major;
12665 12672 char c[30];
12666 12673 dtrace_state_t *state;
12667 12674 dtrace_optval_t *opt;
12668 12675 int bufsize = NCPU * sizeof (dtrace_buffer_t), i;
12669 12676
12670 12677 ASSERT(MUTEX_HELD(&dtrace_lock));
12671 12678 ASSERT(MUTEX_HELD(&cpu_lock));
12672 12679
12673 12680 minor = (minor_t)(uintptr_t)vmem_alloc(dtrace_minor, 1,
12674 12681 VM_BESTFIT | VM_SLEEP);
12675 12682
12676 12683 if (ddi_soft_state_zalloc(dtrace_softstate, minor) != DDI_SUCCESS) {
12677 12684 vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
12678 12685 return (NULL);
12679 12686 }
12680 12687
12681 12688 state = ddi_get_soft_state(dtrace_softstate, minor);
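/*
 * EPID 0 is the reserved DTRACE_EPIDNONE value, so enabled probe IDs
 * are allocated starting at one past it.
 */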
12682 12689 state->dts_epid = DTRACE_EPIDNONE + 1;
12683 12690
12684 12691 (void) snprintf(c, sizeof (c), "dtrace_aggid_%d", minor);
12685 12692 state->dts_aggid_arena = vmem_create(c, (void *)1, UINT32_MAX, 1,
12686 12693 NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
12687 12694
12688 12695 if (devp != NULL) {
12689 12696 major = getemajor(*devp);
12690 12697 } else {
12691 12698 major = ddi_driver_major(dtrace_devi);
12692 12699 }
12693 12700
12694 12701 state->dts_dev = makedevice(major, minor);
12695 12702
12696 12703 if (devp != NULL)
12697 12704 *devp = state->dts_dev;
12698 12705
12699 12706 /*
12700 12707 * We allocate NCPU buffers. On the one hand, this can be quite
12701 12708 * a bit of memory per instance (nearly 36K on a Starcat). On the
12702 12709 * other hand, it saves an additional memory reference in the probe
12703 12710 * path.
12704 12711 */
12705 12712 state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP);
12706 12713 state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP);
12707 12714 state->dts_cleaner = CYCLIC_NONE;
12708 12715 state->dts_deadman = CYCLIC_NONE;
12709 12716 state->dts_vstate.dtvs_state = state;
12710 12717
12711 12718 for (i = 0; i < DTRACEOPT_MAX; i++)
12712 12719 state->dts_options[i] = DTRACEOPT_UNSET;
12713 12720
12714 12721 /*
12715 12722 * Set the default options.
12716 12723 */
12717 12724 opt = state->dts_options;
12718 12725 opt[DTRACEOPT_BUFPOLICY] = DTRACEOPT_BUFPOLICY_SWITCH;
12719 12726 opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_AUTO;
12720 12727 opt[DTRACEOPT_NSPEC] = dtrace_nspec_default;
12721 12728 opt[DTRACEOPT_SPECSIZE] = dtrace_specsize_default;
12722 12729 opt[DTRACEOPT_CPU] = (dtrace_optval_t)DTRACE_CPUALL;
12723 12730 opt[DTRACEOPT_STRSIZE] = dtrace_strsize_default;
12724 12731 opt[DTRACEOPT_STACKFRAMES] = dtrace_stackframes_default;
12725 12732 opt[DTRACEOPT_USTACKFRAMES] = dtrace_ustackframes_default;
12726 12733 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_default;
12727 12734 opt[DTRACEOPT_AGGRATE] = dtrace_aggrate_default;
12728 12735 opt[DTRACEOPT_SWITCHRATE] = dtrace_switchrate_default;
12729 12736 opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_default;
12730 12737 opt[DTRACEOPT_JSTACKFRAMES] = dtrace_jstackframes_default;
12731 12738 opt[DTRACEOPT_JSTACKSTRSIZE] = dtrace_jstackstrsize_default;
12732 12739
12733 12740 state->dts_activity = DTRACE_ACTIVITY_INACTIVE;
12734 12741
12735 12742 /*
12736 12743 * Depending on the user credentials, we set flag bits which alter probe
12737 12744 * visibility or the amount of destructiveness allowed. In the case of
12738 12745 * actual anonymous tracing, or the possession of all privileges, all of
12739 12746 * the normal checks are bypassed.
12740 12747 */
12741 12748 if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
12742 12749 state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
12743 12750 state->dts_cred.dcr_action = DTRACE_CRA_ALL;
12744 12751 } else {
12745 12752 /*
12746 12753 * Set up the credentials for this instantiation. We take a
12747 12754 * hold on the credential to prevent it from disappearing on
12748 12755 * us; this in turn prevents the zone_t referenced by this
12749 12756 * credential from disappearing. This means that we can
12750 12757 * examine the credential and the zone from probe context.
12751 12758 */
12752 12759 crhold(cr);
12753 12760 state->dts_cred.dcr_cred = cr;
12754 12761
12755 12762 /*
12756 12763 * CRA_PROC means "we have *some* privilege for dtrace" and
12757 12764 * unlocks the use of variables like pid, zonename, etc.
12758 12765 */
12759 12766 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE) ||
12760 12767 PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
12761 12768 state->dts_cred.dcr_action |= DTRACE_CRA_PROC;
12762 12769 }
12763 12770
12764 12771 /*
12765 12772 * dtrace_user allows use of syscall and profile providers.
12766 12773 * If the user also has proc_owner and/or proc_zone, we
12767 12774 * extend the scope to include additional visibility and
12768 12775 * destructive power.
12769 12776 */
12770 12777 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE)) {
12771 12778 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE)) {
12772 12779 state->dts_cred.dcr_visible |=
12773 12780 DTRACE_CRV_ALLPROC;
12774 12781
12775 12782 state->dts_cred.dcr_action |=
12776 12783 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
12777 12784 }
12778 12785
12779 12786 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE)) {
12780 12787 state->dts_cred.dcr_visible |=
12781 12788 DTRACE_CRV_ALLZONE;
12782 12789
12783 12790 state->dts_cred.dcr_action |=
12784 12791 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
12785 12792 }
12786 12793
12787 12794 /*
12788 12795 * If we have all privs in whatever zone this is,
12789 12796 * we can do destructive things to processes which
12790 12797 * have altered credentials.
12791 12798 */
12792 12799 if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
12793 12800 cr->cr_zone->zone_privset)) {
12794 12801 state->dts_cred.dcr_action |=
12795 12802 DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
12796 12803 }
12797 12804 }
12798 12805
12799 12806 /*
12800 12807 * Holding the dtrace_kernel privilege also implies that
12801 12808 * the user has the dtrace_user privilege from a visibility
12802 12809 * perspective. But without further privileges, some
12803 12810 * destructive actions are not available.
12804 12811 */
12805 12812 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) {
12806 12813 /*
12807 12814 * Make all probes in all zones visible. However,
12808 12815 * this doesn't mean that all actions become available
12809 12816 * to all zones.
12810 12817 */
12811 12818 state->dts_cred.dcr_visible |= DTRACE_CRV_KERNEL |
12812 12819 DTRACE_CRV_ALLPROC | DTRACE_CRV_ALLZONE;
12813 12820
12814 12821 state->dts_cred.dcr_action |= DTRACE_CRA_KERNEL |
12815 12822 DTRACE_CRA_PROC;
12816 12823 /*
12817 12824 * Holding proc_owner means that destructive actions
12818 12825 * for *this* zone are allowed.
12819 12826 */
12820 12827 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
12821 12828 state->dts_cred.dcr_action |=
12822 12829 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
12823 12830
12824 12831 /*
12825 12832 * Holding proc_zone means that destructive actions
12826 12833 			 * for this user/group ID are allowed in all zones.
12827 12834 */
12828 12835 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
12829 12836 state->dts_cred.dcr_action |=
12830 12837 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
12831 12838
12832 12839 /*
12833 12840 * If we have all privs in whatever zone this is,
12834 12841 * we can do destructive things to processes which
12835 12842 * have altered credentials.
12836 12843 */
12837 12844 if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
12838 12845 cr->cr_zone->zone_privset)) {
12839 12846 state->dts_cred.dcr_action |=
12840 12847 DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
12841 12848 }
12842 12849 }
12843 12850
12844 12851 /*
12845 12852 * Holding the dtrace_proc privilege gives control over fasttrap
12846 12853 * and pid providers. We need to grant wider destructive
12847 12854 * privileges in the event that the user has proc_owner and/or
12848 12855 * proc_zone.
12849 12856 */
12850 12857 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
12851 12858 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
12852 12859 state->dts_cred.dcr_action |=
12853 12860 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
12854 12861
12855 12862 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
12856 12863 state->dts_cred.dcr_action |=
12857 12864 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
12858 12865 }
12859 12866 }
12860 12867
12861 12868 return (state);
12862 12869 }
12863 12870
12864 12871 static int
12865 12872 dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which)
12866 12873 {
12867 12874 dtrace_optval_t *opt = state->dts_options, size;
12868 12875 processorid_t cpu;
12869 12876 int flags = 0, rval, factor, divisor = 1;
12870 12877
12871 12878 ASSERT(MUTEX_HELD(&dtrace_lock));
12872 12879 ASSERT(MUTEX_HELD(&cpu_lock));
12873 12880 ASSERT(which < DTRACEOPT_MAX);
12874 12881 ASSERT(state->dts_activity == DTRACE_ACTIVITY_INACTIVE ||
12875 12882 (state == dtrace_anon.dta_state &&
12876 12883 state->dts_activity == DTRACE_ACTIVITY_ACTIVE));
12877 12884
12878 12885 if (opt[which] == DTRACEOPT_UNSET || opt[which] == 0)
12879 12886 return (0);
12880 12887
12881 12888 if (opt[DTRACEOPT_CPU] != DTRACEOPT_UNSET)
12882 12889 cpu = opt[DTRACEOPT_CPU];
12883 12890
12884 12891 if (which == DTRACEOPT_SPECSIZE)
12885 12892 flags |= DTRACEBUF_NOSWITCH;
12886 12893
12887 12894 if (which == DTRACEOPT_BUFSIZE) {
12888 12895 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_RING)
12889 12896 flags |= DTRACEBUF_RING;
12890 12897
12891 12898 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_FILL)
12892 12899 flags |= DTRACEBUF_FILL;
12893 12900
12894 12901 if (state != dtrace_anon.dta_state ||
12895 12902 state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
12896 12903 flags |= DTRACEBUF_INACTIVE;
12897 12904 }
12898 12905
12899 12906 for (size = opt[which]; size >= sizeof (uint64_t); size /= divisor) {
12900 12907 /*
12901 12908 * The size must be 8-byte aligned. If the size is not 8-byte
12902 12909 * aligned, drop it down by the difference.
12903 12910 */
12904 12911 if (size & (sizeof (uint64_t) - 1))
12905 12912 size -= size & (sizeof (uint64_t) - 1);
12906 12913
12907 12914 if (size < state->dts_reserve) {
12908 12915 /*
12909 12916 			 * Buffers must always be large enough to accommodate
12910 12917 * their prereserved space. We return E2BIG instead
12911 12918 * of ENOMEM in this case to allow for user-level
12912 12919 * software to differentiate the cases.
12913 12920 */
12914 12921 return (E2BIG);
12915 12922 }
12916 12923
12917 12924 rval = dtrace_buffer_alloc(buf, size, flags, cpu, &factor);
12918 12925
12919 12926 if (rval != ENOMEM) {
12920 12927 opt[which] = size;
12921 12928 return (rval);
12922 12929 }
12923 12930
12924 12931 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
12925 12932 return (rval);
12926 12933
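/*
 * The allocation failed with ENOMEM; 'factor' indicates roughly how
 * much smaller the request must be to have a chance of succeeding.
 * Round the divisor up to the next power of two that covers the
 * factor, and retry with the reduced size.
 */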
12927 12934 for (divisor = 2; divisor < factor; divisor <<= 1)
12928 12935 continue;
12929 12936 }
12930 12937
12931 12938 return (ENOMEM);
12932 12939 }
12933 12940
12934 12941 static int
12935 12942 dtrace_state_buffers(dtrace_state_t *state)
12936 12943 {
12937 12944 dtrace_speculation_t *spec = state->dts_speculations;
12938 12945 int rval, i;
12939 12946
12940 12947 if ((rval = dtrace_state_buffer(state, state->dts_buffer,
12941 12948 DTRACEOPT_BUFSIZE)) != 0)
12942 12949 return (rval);
12943 12950
12944 12951 if ((rval = dtrace_state_buffer(state, state->dts_aggbuffer,
12945 12952 DTRACEOPT_AGGSIZE)) != 0)
12946 12953 return (rval);
12947 12954
12948 12955 for (i = 0; i < state->dts_nspeculations; i++) {
12949 12956 if ((rval = dtrace_state_buffer(state,
12950 12957 spec[i].dtsp_buffer, DTRACEOPT_SPECSIZE)) != 0)
12951 12958 return (rval);
12952 12959 }
12953 12960
12954 12961 return (0);
12955 12962 }
12956 12963
12957 12964 static void
12958 12965 dtrace_state_prereserve(dtrace_state_t *state)
12959 12966 {
12960 12967 dtrace_ecb_t *ecb;
12961 12968 dtrace_probe_t *probe;
12962 12969
12963 12970 state->dts_reserve = 0;
12964 12971
12965 12972 if (state->dts_options[DTRACEOPT_BUFPOLICY] != DTRACEOPT_BUFPOLICY_FILL)
12966 12973 return;
12967 12974
12968 12975 /*
12969 12976 * If our buffer policy is a "fill" buffer policy, we need to set the
12970 12977 * prereserved space to be the space required by the END probes.
12971 12978 */
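/*
 * (Probe IDs are one-based indices into dtrace_probes[], so the END
 * probe is at dtrace_probeid_end - 1.)
 */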
12972 12979 probe = dtrace_probes[dtrace_probeid_end - 1];
12973 12980 ASSERT(probe != NULL);
12974 12981
12975 12982 for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
12976 12983 if (ecb->dte_state != state)
12977 12984 continue;
12978 12985
12979 12986 state->dts_reserve += ecb->dte_needed + ecb->dte_alignment;
12980 12987 }
12981 12988 }
12982 12989
12983 12990 static int
12984 12991 dtrace_state_go(dtrace_state_t *state, processorid_t *cpu)
12985 12992 {
12986 12993 dtrace_optval_t *opt = state->dts_options, sz, nspec;
12987 12994 dtrace_speculation_t *spec;
12988 12995 dtrace_buffer_t *buf;
12989 12996 cyc_handler_t hdlr;
12990 12997 cyc_time_t when;
12991 12998 int rval = 0, i, bufsize = NCPU * sizeof (dtrace_buffer_t);
12992 12999 dtrace_icookie_t cookie;
12993 13000
12994 13001 mutex_enter(&cpu_lock);
12995 13002 mutex_enter(&dtrace_lock);
12996 13003
12997 13004 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
12998 13005 rval = EBUSY;
12999 13006 goto out;
13000 13007 }
13001 13008
13002 13009 /*
13003 13010 * Before we can perform any checks, we must prime all of the
13004 13011 * retained enablings that correspond to this state.
13005 13012 */
13006 13013 dtrace_enabling_prime(state);
13007 13014
13008 13015 if (state->dts_destructive && !state->dts_cred.dcr_destructive) {
13009 13016 rval = EACCES;
13010 13017 goto out;
13011 13018 }
13012 13019
13013 13020 dtrace_state_prereserve(state);
13014 13021
13015 13022 /*
13016 13023 	 * What we want to do now is try to allocate our speculations.
13017 13024 * We do not automatically resize the number of speculations; if
13018 13025 * this fails, we will fail the operation.
13019 13026 */
13020 13027 nspec = opt[DTRACEOPT_NSPEC];
13021 13028 ASSERT(nspec != DTRACEOPT_UNSET);
13022 13029
13023 13030 if (nspec > INT_MAX) {
13024 13031 rval = ENOMEM;
13025 13032 goto out;
13026 13033 }
13027 13034
13028 13035 spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t),
13029 13036 KM_NOSLEEP | KM_NORMALPRI);
13030 13037
13031 13038 if (spec == NULL) {
13032 13039 rval = ENOMEM;
13033 13040 goto out;
13034 13041 }
13035 13042
13036 13043 state->dts_speculations = spec;
13037 13044 state->dts_nspeculations = (int)nspec;
13038 13045
13039 13046 for (i = 0; i < nspec; i++) {
13040 13047 if ((buf = kmem_zalloc(bufsize,
13041 13048 KM_NOSLEEP | KM_NORMALPRI)) == NULL) {
13042 13049 rval = ENOMEM;
13043 13050 goto err;
13044 13051 }
13045 13052
13046 13053 spec[i].dtsp_buffer = buf;
13047 13054 }
13048 13055
13049 13056 if (opt[DTRACEOPT_GRABANON] != DTRACEOPT_UNSET) {
13050 13057 if (dtrace_anon.dta_state == NULL) {
13051 13058 rval = ENOENT;
13052 13059 goto out;
13053 13060 }
13054 13061
13055 13062 if (state->dts_necbs != 0) {
13056 13063 rval = EALREADY;
13057 13064 goto out;
13058 13065 }
13059 13066
13060 13067 state->dts_anon = dtrace_anon_grab();
13061 13068 ASSERT(state->dts_anon != NULL);
13062 13069 state = state->dts_anon;
13063 13070
13064 13071 /*
13065 13072 * We want "grabanon" to be set in the grabbed state, so we'll
13066 13073 * copy that option value from the grabbing state into the
13067 13074 * grabbed state.
13068 13075 */
13069 13076 state->dts_options[DTRACEOPT_GRABANON] =
13070 13077 opt[DTRACEOPT_GRABANON];
13071 13078
13072 13079 *cpu = dtrace_anon.dta_beganon;
13073 13080
13074 13081 /*
13075 13082 * If the anonymous state is active (as it almost certainly
13076 13083 * is if the anonymous enabling ultimately matched anything),
13077 13084 * we don't allow any further option processing -- but we
13078 13085 * don't return failure.
13079 13086 */
13080 13087 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
13081 13088 goto out;
13082 13089 }
13083 13090
13084 13091 if (opt[DTRACEOPT_AGGSIZE] != DTRACEOPT_UNSET &&
13085 13092 opt[DTRACEOPT_AGGSIZE] != 0) {
13086 13093 if (state->dts_aggregations == NULL) {
13087 13094 /*
13088 13095 * We're not going to create an aggregation buffer
13089 13096 * because we don't have any ECBs that contain
13090 13097 * aggregations -- set this option to 0.
13091 13098 */
13092 13099 opt[DTRACEOPT_AGGSIZE] = 0;
13093 13100 } else {
13094 13101 /*
13095 13102 * If we have an aggregation buffer, we must also have
13096 13103 * a buffer to use as scratch.
13097 13104 */
13098 13105 if (opt[DTRACEOPT_BUFSIZE] == DTRACEOPT_UNSET ||
13099 13106 opt[DTRACEOPT_BUFSIZE] < state->dts_needed) {
13100 13107 opt[DTRACEOPT_BUFSIZE] = state->dts_needed;
13101 13108 }
13102 13109 }
13103 13110 }
13104 13111
13105 13112 if (opt[DTRACEOPT_SPECSIZE] != DTRACEOPT_UNSET &&
13106 13113 opt[DTRACEOPT_SPECSIZE] != 0) {
13107 13114 if (!state->dts_speculates) {
13108 13115 /*
13109 13116 * We're not going to create speculation buffers
13110 13117 * because we don't have any ECBs that actually
13111 13118 * speculate -- set the speculation size to 0.
13112 13119 */
13113 13120 opt[DTRACEOPT_SPECSIZE] = 0;
13114 13121 }
13115 13122 }
13116 13123
13117 13124 /*
13118 13125 * The bare minimum size for any buffer that we're actually going to
13119 13126 * do anything to is sizeof (uint64_t).
13120 13127 */
13121 13128 sz = sizeof (uint64_t);
13122 13129
13123 13130 if ((state->dts_needed != 0 && opt[DTRACEOPT_BUFSIZE] < sz) ||
13124 13131 (state->dts_speculates && opt[DTRACEOPT_SPECSIZE] < sz) ||
13125 13132 (state->dts_aggregations != NULL && opt[DTRACEOPT_AGGSIZE] < sz)) {
13126 13133 /*
13127 13134 * A buffer size has been explicitly set to 0 (or to a size
13128 13135 * that will be adjusted to 0) and we need the space -- we
13129 13136 * need to return failure. We return ENOSPC to differentiate
13130 13137 * it from failing to allocate a buffer due to failure to meet
13131 13138 * the reserve (for which we return E2BIG).
13132 13139 */
13133 13140 rval = ENOSPC;
13134 13141 goto out;
13135 13142 }
13136 13143
13137 13144 if ((rval = dtrace_state_buffers(state)) != 0)
13138 13145 goto err;
13139 13146
13140 13147 if ((sz = opt[DTRACEOPT_DYNVARSIZE]) == DTRACEOPT_UNSET)
13141 13148 sz = dtrace_dstate_defsize;
13142 13149
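/*
 * Allocate the dynamic variable space at the requested size, halving
 * the size on failure until the allocation succeeds -- unless buffer
 * resizing is manual, in which case any failure is fatal.
 */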
13143 13150 do {
13144 13151 rval = dtrace_dstate_init(&state->dts_vstate.dtvs_dynvars, sz);
13145 13152
13146 13153 if (rval == 0)
13147 13154 break;
13148 13155
13149 13156 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
13150 13157 goto err;
13151 13158 } while (sz >>= 1);
13152 13159
13153 13160 opt[DTRACEOPT_DYNVARSIZE] = sz;
13154 13161
13155 13162 if (rval != 0)
13156 13163 goto err;
13157 13164
13158 13165 if (opt[DTRACEOPT_STATUSRATE] > dtrace_statusrate_max)
13159 13166 opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_max;
13160 13167
13161 13168 if (opt[DTRACEOPT_CLEANRATE] == 0)
13162 13169 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
13163 13170
13164 13171 if (opt[DTRACEOPT_CLEANRATE] < dtrace_cleanrate_min)
13165 13172 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_min;
13166 13173
13167 13174 if (opt[DTRACEOPT_CLEANRATE] > dtrace_cleanrate_max)
13168 13175 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
13169 13176
13170 13177 hdlr.cyh_func = (cyc_func_t)dtrace_state_clean;
13171 13178 hdlr.cyh_arg = state;
13172 13179 hdlr.cyh_level = CY_LOW_LEVEL;
13173 13180
13174 13181 when.cyt_when = 0;
13175 13182 when.cyt_interval = opt[DTRACEOPT_CLEANRATE];
13176 13183
13177 13184 state->dts_cleaner = cyclic_add(&hdlr, &when);
13178 13185
13179 13186 hdlr.cyh_func = (cyc_func_t)dtrace_state_deadman;
13180 13187 hdlr.cyh_arg = state;
13181 13188 hdlr.cyh_level = CY_LOW_LEVEL;
13182 13189
13183 13190 when.cyt_when = 0;
13184 13191 when.cyt_interval = dtrace_deadman_interval;
13185 13192
13186 13193 state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
13187 13194 state->dts_deadman = cyclic_add(&hdlr, &when);
13188 13195
13189 13196 state->dts_activity = DTRACE_ACTIVITY_WARMUP;
13190 13197
13191 13198 if (state->dts_getf != 0 &&
13192 13199 !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) {
13193 13200 /*
13194 13201 * We don't have kernel privs but we have at least one call
13195 13202 * to getf(); we need to bump our zone's count, and (if
13196 13203 * this is the first enabling to have an unprivileged call
13197 13204 * to getf()) we need to hook into closef().
13198 13205 */
13199 13206 state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf++;
13200 13207
13201 13208 if (dtrace_getf++ == 0) {
13202 13209 ASSERT(dtrace_closef == NULL);
13203 13210 dtrace_closef = dtrace_getf_barrier;
13204 13211 }
13205 13212 }
13206 13213
13207 13214 /*
13208 13215 * Now it's time to actually fire the BEGIN probe. We need to disable
13209 13216 * interrupts here both to record the CPU on which we fired the BEGIN
13210 13217 * probe (the data from this CPU will be processed first at user
13211 13218 * level) and to manually activate the buffer for this CPU.
13212 13219 */
13213 13220 cookie = dtrace_interrupt_disable();
13214 13221 *cpu = CPU->cpu_id;
13215 13222 ASSERT(state->dts_buffer[*cpu].dtb_flags & DTRACEBUF_INACTIVE);
13216 13223 state->dts_buffer[*cpu].dtb_flags &= ~DTRACEBUF_INACTIVE;
13217 13224
13218 13225 dtrace_probe(dtrace_probeid_begin,
13219 13226 (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
13220 13227 dtrace_interrupt_enable(cookie);
13221 13228 /*
13222 13229 * We may have had an exit action from a BEGIN probe; only change our
13223 13230 * state to ACTIVE if we're still in WARMUP.
13224 13231 */
13225 13232 ASSERT(state->dts_activity == DTRACE_ACTIVITY_WARMUP ||
13226 13233 state->dts_activity == DTRACE_ACTIVITY_DRAINING);
13227 13234
13228 13235 if (state->dts_activity == DTRACE_ACTIVITY_WARMUP)
13229 13236 state->dts_activity = DTRACE_ACTIVITY_ACTIVE;
13230 13237
13231 13238 /*
13232 13239 	 * Regardless of whether we're now in ACTIVE or DRAINING, we
13233 13240 * want each CPU to transition its principal buffer out of the
13234 13241 * INACTIVE state. Doing this assures that no CPU will suddenly begin
13235 13242 * processing an ECB halfway down a probe's ECB chain; all CPUs will
13236 13243 * atomically transition from processing none of a state's ECBs to
13237 13244 * processing all of them.
13238 13245 */
13239 13246 dtrace_xcall(DTRACE_CPUALL,
13240 13247 (dtrace_xcall_t)dtrace_buffer_activate, state);
13241 13248 goto out;
13242 13249
13243 13250 err:
13244 13251 dtrace_buffer_free(state->dts_buffer);
13245 13252 dtrace_buffer_free(state->dts_aggbuffer);
13246 13253
13247 13254 if ((nspec = state->dts_nspeculations) == 0) {
13248 13255 ASSERT(state->dts_speculations == NULL);
13249 13256 goto out;
13250 13257 }
13251 13258
13252 13259 spec = state->dts_speculations;
13253 13260 ASSERT(spec != NULL);
13254 13261
13255 13262 for (i = 0; i < state->dts_nspeculations; i++) {
13256 13263 if ((buf = spec[i].dtsp_buffer) == NULL)
13257 13264 break;
13258 13265
13259 13266 dtrace_buffer_free(buf);
13260 13267 kmem_free(buf, bufsize);
13261 13268 }
13262 13269
13263 13270 kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
13264 13271 state->dts_nspeculations = 0;
13265 13272 state->dts_speculations = NULL;
13266 13273
13267 13274 out:
13268 13275 mutex_exit(&dtrace_lock);
13269 13276 mutex_exit(&cpu_lock);
13270 13277
13271 13278 return (rval);
13272 13279 }
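
/*
 * The getf()/closef() accounting above is a counted-hook pattern: every
 * unprivileged enabling that calls getf() bumps a per-zone count, and the
 * first such enabling system-wide installs the closef() barrier. A minimal
 * sketch of that shape follows; the names (hook_hold, hook_rele, nusers)
 * are hypothetical, and the caller is assumed to hold the lock protecting
 * both counters (dtrace_lock in the code above).
 */
typedef void (*hook_fn_t)(void);

static int nusers;		/* system-wide count, like dtrace_getf */
static hook_fn_t hook;		/* like dtrace_closef */

static void
barrier(void)			/* stands in for dtrace_getf_barrier() */
{
}

static void
hook_hold(int *zone_count)
{
	(*zone_count)++;	/* like zone_dtrace_getf++ */
	if (nusers++ == 0)
		hook = barrier;	/* first user installs the hook */
}

static void
hook_rele(int *zone_count)
{
	(*zone_count)--;
	if (--nusers == 0)
		hook = NULL;	/* last user clears it */
}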
13273 13280
13274 13281 static int
13275 13282 dtrace_state_stop(dtrace_state_t *state, processorid_t *cpu)
13276 13283 {
13277 13284 dtrace_icookie_t cookie;
13278 13285
13279 13286 ASSERT(MUTEX_HELD(&dtrace_lock));
13280 13287
13281 13288 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE &&
13282 13289 state->dts_activity != DTRACE_ACTIVITY_DRAINING)
13283 13290 return (EINVAL);
13284 13291
13285 13292 /*
13286 13293 * We'll set the activity to DTRACE_ACTIVITY_DRAINING, and issue a sync
13287 13294 * to be sure that every CPU has seen it. See below for the details
13288 13295 * on why this is done.
13289 13296 */
13290 13297 state->dts_activity = DTRACE_ACTIVITY_DRAINING;
13291 13298 dtrace_sync();
13292 13299
13293 13300 /*
13294 13301 * By this point, it is impossible for any CPU to be still processing
13295 13302 * with DTRACE_ACTIVITY_ACTIVE. We can thus set our activity to
13296 13303 * DTRACE_ACTIVITY_COOLDOWN and know that we're not racing with any
13297 13304 * other CPU in dtrace_buffer_reserve(). This allows dtrace_probe()
13298 13305 * and callees to know that the activity is DTRACE_ACTIVITY_COOLDOWN
13299 13306 * iff we're in the END probe.
13300 13307 */
13301 13308 state->dts_activity = DTRACE_ACTIVITY_COOLDOWN;
13302 13309 dtrace_sync();
13303 13310 ASSERT(state->dts_activity == DTRACE_ACTIVITY_COOLDOWN);
13304 13311
13305 13312 /*
13306 13313 * Finally, we can release the reserve and call the END probe. We
13307 13314 * disable interrupts across calling the END probe to allow us to
13308 13315 * return the CPU on which we actually called the END probe. This
13309 13316 * allows user-land to be sure that this CPU's principal buffer is
13310 13317 * processed last.
13311 13318 */
13312 13319 state->dts_reserve = 0;
13313 13320
13314 13321 cookie = dtrace_interrupt_disable();
13315 13322 *cpu = CPU->cpu_id;
13316 13323 dtrace_probe(dtrace_probeid_end,
13317 13324 (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
13318 13325 dtrace_interrupt_enable(cookie);
13319 13326
13320 13327 state->dts_activity = DTRACE_ACTIVITY_STOPPED;
13321 13328 dtrace_sync();
13322 13329
13323 13330 if (state->dts_getf != 0 &&
13324 13331 !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) {
13325 13332 /*
13326 13333 * We don't have kernel privs but we have at least one call
13327 13334 * to getf(); we need to lower our zone's count, and (if
13328 13335 * this is the last enabling to have an unprivileged call
13329 13336 * to getf()) we need to clear the closef() hook.
13330 13337 */
13331 13338 ASSERT(state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf > 0);
13332 13339 ASSERT(dtrace_closef == dtrace_getf_barrier);
13333 13340 ASSERT(dtrace_getf > 0);
13334 13341
13335 13342 state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf--;
13336 13343
13337 13344 if (--dtrace_getf == 0)
13338 13345 dtrace_closef = NULL;
13339 13346 }
13340 13347
13341 13348 return (0);
13342 13349 }
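
/*
 * dtrace_state_stop() above is built from one repeated idiom: publish a
 * new activity value, then use dtrace_sync() as a barrier so that no CPU
 * can still be running probe context against the old value. A stripped-down
 * sketch of the idiom, with wait_for_quiescence() as a hypothetical stub
 * standing in for dtrace_sync():
 */
enum { ACTIVE, DRAINING, COOLDOWN };

static volatile int activity = ACTIVE;

static void
wait_for_quiescence(void)	/* the real dtrace_sync() cross-calls CPUs */
{
}

static void
drain_then_cool(void)
{
	activity = DRAINING;
	wait_for_quiescence();	/* no CPU can still observe ACTIVE */

	activity = COOLDOWN;
	wait_for_quiescence();	/* COOLDOWN now unambiguously means END */
}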
13343 13350
13344 13351 static int
13345 13352 dtrace_state_option(dtrace_state_t *state, dtrace_optid_t option,
13346 13353 dtrace_optval_t val)
13347 13354 {
13348 13355 ASSERT(MUTEX_HELD(&dtrace_lock));
13349 13356
13350 13357 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
13351 13358 return (EBUSY);
13352 13359
13353 13360 if (option >= DTRACEOPT_MAX)
13354 13361 return (EINVAL);
13355 13362
13356 13363 if (option != DTRACEOPT_CPU && val < 0)
13357 13364 return (EINVAL);
13358 13365
13359 13366 switch (option) {
13360 13367 case DTRACEOPT_DESTRUCTIVE:
13361 13368 if (dtrace_destructive_disallow)
13362 13369 return (EACCES);
13363 13370
13364 13371 state->dts_cred.dcr_destructive = 1;
13365 13372 break;
13366 13373
13367 13374 case DTRACEOPT_BUFSIZE:
13368 13375 case DTRACEOPT_DYNVARSIZE:
13369 13376 case DTRACEOPT_AGGSIZE:
13370 13377 case DTRACEOPT_SPECSIZE:
13371 13378 case DTRACEOPT_STRSIZE:
13372 13379 if (val < 0)
13373 13380 return (EINVAL);
13374 13381
13375 13382 if (val >= LONG_MAX) {
13376 13383 /*
13377 13384 * If this is an otherwise negative value, set it to
13378 13385 * the highest multiple of 128m less than LONG_MAX.
13379 13386 * Technically, we're adjusting the size without
13380 13387 * regard to the buffer resizing policy, but in fact,
13381 13388 * this has no effect -- if we set the buffer size to
13382 13389 * ~LONG_MAX and the buffer policy is ultimately set to
13383 13390 * be "manual", the buffer allocation is guaranteed to
13384 13391 * fail, if only because the allocation requires two
13385 13392 	 * buffers. (We set the size to the highest
13386 13393 * multiple of 128m because it ensures that the size
13387 13394 * will remain a multiple of a megabyte when
13388 13395 * repeatedly halved -- all the way down to 15m.)
13389 13396 */
13390 13397 val = LONG_MAX - (1 << 27) + 1;
13391 13398 }
13392 13399 }
13393 13400
13394 13401 state->dts_options[option] = val;
13395 13402
13396 13403 return (0);
13397 13404 }
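
/*
 * The clamp in the size cases above is easy to sanity-check in isolation.
 * On LP64, LONG_MAX - (1 << 27) + 1 is 2^63 - 2^27, the highest multiple
 * of 128m representable in a long; on ILP32 the same expression is
 * 2^31 - 2^27 = 1920m, which halves cleanly all the way down to 15m. An
 * illustrative userland check, not part of the driver:
 */
#include <limits.h>
#include <assert.h>

int
main(void)
{
	long val = LONG_MAX - (1L << 27) + 1;
	int i;

	assert(val % (1L << 27) == 0);		/* a multiple of 128m */

	for (i = 0; i < 7; i++) {		/* 128m halves to 1m */
		val /= 2;
		assert(val % (1L << 20) == 0);	/* still megabyte-aligned */
	}

	return (0);
}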
13398 13405
13399 13406 static void
13400 13407 dtrace_state_destroy(dtrace_state_t *state)
13401 13408 {
13402 13409 dtrace_ecb_t *ecb;
13403 13410 dtrace_vstate_t *vstate = &state->dts_vstate;
13404 13411 minor_t minor = getminor(state->dts_dev);
13405 13412 int i, bufsize = NCPU * sizeof (dtrace_buffer_t);
13406 13413 dtrace_speculation_t *spec = state->dts_speculations;
13407 13414 int nspec = state->dts_nspeculations;
13408 13415 uint32_t match;
13409 13416
13410 13417 ASSERT(MUTEX_HELD(&dtrace_lock));
13411 13418 ASSERT(MUTEX_HELD(&cpu_lock));
13412 13419
13413 13420 /*
13414 13421 * First, retract any retained enablings for this state.
13415 13422 */
13416 13423 dtrace_enabling_retract(state);
13417 13424 ASSERT(state->dts_nretained == 0);
13418 13425
13419 13426 if (state->dts_activity == DTRACE_ACTIVITY_ACTIVE ||
13420 13427 state->dts_activity == DTRACE_ACTIVITY_DRAINING) {
13421 13428 /*
13422 13429 * We have managed to come into dtrace_state_destroy() on a
13423 13430 * hot enabling -- almost certainly because of a disorderly
13424 13431 * shutdown of a consumer. (That is, a consumer that is
13425 13432 * exiting without having called dtrace_stop().) In this case,
13426 13433 * we're going to set our activity to be KILLED, and then
13427 13434 * issue a sync to be sure that everyone is out of probe
13428 13435 * context before we start blowing away ECBs.
13429 13436 */
13430 13437 state->dts_activity = DTRACE_ACTIVITY_KILLED;
13431 13438 dtrace_sync();
13432 13439 }
13433 13440
13434 13441 /*
13435 13442 * Release the credential hold we took in dtrace_state_create().
13436 13443 */
13437 13444 if (state->dts_cred.dcr_cred != NULL)
13438 13445 crfree(state->dts_cred.dcr_cred);
13439 13446
13440 13447 /*
13441 13448 * Now we can safely disable and destroy any enabled probes. Because
13442 13449 * any DTRACE_PRIV_KERNEL probes may actually be slowing our progress
13443 13450 * (especially if they're all enabled), we take two passes through the
13444 13451 * ECBs: in the first, we disable just DTRACE_PRIV_KERNEL probes, and
13445 13452 * in the second we disable whatever is left over.
13446 13453 */
13447 13454 for (match = DTRACE_PRIV_KERNEL; ; match = 0) {
13448 13455 for (i = 0; i < state->dts_necbs; i++) {
13449 13456 if ((ecb = state->dts_ecbs[i]) == NULL)
13450 13457 continue;
13451 13458
13452 13459 if (match && ecb->dte_probe != NULL) {
13453 13460 dtrace_probe_t *probe = ecb->dte_probe;
13454 13461 dtrace_provider_t *prov = probe->dtpr_provider;
13455 13462
13456 13463 if (!(prov->dtpv_priv.dtpp_flags & match))
13457 13464 continue;
13458 13465 }
13459 13466
13460 13467 dtrace_ecb_disable(ecb);
13461 13468 dtrace_ecb_destroy(ecb);
13462 13469 }
13463 13470
13464 13471 if (!match)
13465 13472 break;
13466 13473 }
13467 13474
13468 13475 /*
13469 13476 * Before we free the buffers, perform one more sync to assure that
13470 13477 * every CPU is out of probe context.
13471 13478 */
13472 13479 dtrace_sync();
13473 13480
13474 13481 dtrace_buffer_free(state->dts_buffer);
13475 13482 dtrace_buffer_free(state->dts_aggbuffer);
13476 13483
13477 13484 for (i = 0; i < nspec; i++)
13478 13485 dtrace_buffer_free(spec[i].dtsp_buffer);
13479 13486
13480 13487 if (state->dts_cleaner != CYCLIC_NONE)
13481 13488 cyclic_remove(state->dts_cleaner);
13482 13489
13483 13490 if (state->dts_deadman != CYCLIC_NONE)
13484 13491 cyclic_remove(state->dts_deadman);
13485 13492
13486 13493 dtrace_dstate_fini(&vstate->dtvs_dynvars);
13487 13494 dtrace_vstate_fini(vstate);
13488 13495 kmem_free(state->dts_ecbs, state->dts_necbs * sizeof (dtrace_ecb_t *));
13489 13496
13490 13497 if (state->dts_aggregations != NULL) {
13491 13498 #ifdef DEBUG
13492 13499 for (i = 0; i < state->dts_naggregations; i++)
13493 13500 ASSERT(state->dts_aggregations[i] == NULL);
13494 13501 #endif
13495 13502 ASSERT(state->dts_naggregations > 0);
13496 13503 kmem_free(state->dts_aggregations,
13497 13504 state->dts_naggregations * sizeof (dtrace_aggregation_t *));
13498 13505 }
13499 13506
13500 13507 kmem_free(state->dts_buffer, bufsize);
13501 13508 kmem_free(state->dts_aggbuffer, bufsize);
13502 13509
13503 13510 for (i = 0; i < nspec; i++)
13504 13511 kmem_free(spec[i].dtsp_buffer, bufsize);
13505 13512
13506 13513 kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
13507 13514
13508 13515 dtrace_format_destroy(state);
13509 13516
13510 13517 vmem_destroy(state->dts_aggid_arena);
13511 13518 ddi_soft_state_free(dtrace_softstate, minor);
13512 13519 vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
13513 13520 }
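
/*
 * The ECB teardown above is a two-pass sweep: the first pass destroys only
 * entries in the expensive class (DTRACE_PRIV_KERNEL providers), and the
 * second destroys whatever remains. The shape, reduced to its essentials
 * with hypothetical names (item_t, destroy, teardown):
 */
#include <stdint.h>

typedef struct item {
	uint32_t flags;
} item_t;

static void
destroy(item_t *ip)
{
	(void) ip;
}

static void
teardown(item_t **items, int n, uint32_t expensive)
{
	uint32_t match;
	int i;

	for (match = expensive; ; match = 0) {
		for (i = 0; i < n; i++) {
			if (items[i] == NULL)
				continue;

			if (match && !(items[i]->flags & match))
				continue;	/* not in this pass */

			destroy(items[i]);
			items[i] = NULL;
		}

		if (!match)
			break;			/* second pass is done */
	}
}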
13514 13521
13515 13522 /*
13516 13523 * DTrace Anonymous Enabling Functions
13517 13524 */
13518 13525 static dtrace_state_t *
13519 13526 dtrace_anon_grab(void)
13520 13527 {
13521 13528 dtrace_state_t *state;
13522 13529
13523 13530 ASSERT(MUTEX_HELD(&dtrace_lock));
13524 13531
13525 13532 if ((state = dtrace_anon.dta_state) == NULL) {
13526 13533 ASSERT(dtrace_anon.dta_enabling == NULL);
13527 13534 return (NULL);
13528 13535 }
13529 13536
13530 13537 ASSERT(dtrace_anon.dta_enabling != NULL);
13531 13538 ASSERT(dtrace_retained != NULL);
13532 13539
13533 13540 dtrace_enabling_destroy(dtrace_anon.dta_enabling);
13534 13541 dtrace_anon.dta_enabling = NULL;
13535 13542 dtrace_anon.dta_state = NULL;
13536 13543
13537 13544 return (state);
13538 13545 }
13539 13546
13540 13547 static void
13541 13548 dtrace_anon_property(void)
13542 13549 {
13543 13550 int i, rv;
13544 13551 dtrace_state_t *state;
13545 13552 dof_hdr_t *dof;
13546 13553 char c[32]; /* enough for "dof-data-" + digits */
13547 13554
13548 13555 ASSERT(MUTEX_HELD(&dtrace_lock));
13549 13556 ASSERT(MUTEX_HELD(&cpu_lock));
13550 13557
13551 13558 for (i = 0; ; i++) {
13552 13559 (void) snprintf(c, sizeof (c), "dof-data-%d", i);
13553 13560
13554 13561 dtrace_err_verbose = 1;
13555 13562
13556 13563 if ((dof = dtrace_dof_property(c)) == NULL) {
13557 13564 dtrace_err_verbose = 0;
13558 13565 break;
13559 13566 }
13560 13567
13561 13568 /*
13562 13569 * We want to create anonymous state, so we need to transition
13563 13570 * the kernel debugger to indicate that DTrace is active. If
13564 13571 * this fails (e.g. because the debugger has modified text in
13565 13572 * some way), we won't continue with the processing.
13566 13573 */
13567 13574 if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
13568 13575 cmn_err(CE_NOTE, "kernel debugger active; anonymous "
13569 13576 "enabling ignored.");
13570 13577 dtrace_dof_destroy(dof);
13571 13578 break;
13572 13579 }
13573 13580
13574 13581 /*
13575 13582 * If we haven't allocated an anonymous state, we'll do so now.
13576 13583 */
13577 13584 if ((state = dtrace_anon.dta_state) == NULL) {
13578 13585 state = dtrace_state_create(NULL, NULL);
13579 13586 dtrace_anon.dta_state = state;
13580 13587
13581 13588 if (state == NULL) {
13582 13589 /*
13583 13590 * This basically shouldn't happen: the only
13584 13591 * failure mode from dtrace_state_create() is a
13585 13592 * failure of ddi_soft_state_zalloc() that
13586 13593 * itself should never happen. Still, the
13587 13594 * interface allows for a failure mode, and
13588 13595 * we want to fail as gracefully as possible:
13589 13596 * we'll emit an error message and cease
13590 13597 * processing anonymous state in this case.
13591 13598 */
13592 13599 cmn_err(CE_WARN, "failed to create "
13593 13600 "anonymous state");
13594 13601 dtrace_dof_destroy(dof);
13595 13602 break;
13596 13603 }
13597 13604 }
13598 13605
13599 13606 rv = dtrace_dof_slurp(dof, &state->dts_vstate, CRED(),
13600 13607 &dtrace_anon.dta_enabling, 0, B_TRUE);
13601 13608
13602 13609 if (rv == 0)
13603 13610 rv = dtrace_dof_options(dof, state);
13604 13611
13605 13612 dtrace_err_verbose = 0;
13606 13613 dtrace_dof_destroy(dof);
13607 13614
13608 13615 if (rv != 0) {
13609 13616 /*
13610 13617 * This is malformed DOF; chuck any anonymous state
13611 13618 * that we created.
13612 13619 */
13613 13620 ASSERT(dtrace_anon.dta_enabling == NULL);
13614 13621 dtrace_state_destroy(state);
13615 13622 dtrace_anon.dta_state = NULL;
13616 13623 break;
13617 13624 }
13618 13625
13619 13626 ASSERT(dtrace_anon.dta_enabling != NULL);
13620 13627 }
13621 13628
13622 13629 if (dtrace_anon.dta_enabling != NULL) {
13623 13630 int rval;
13624 13631
13625 13632 /*
13626 13633 * dtrace_enabling_retain() can only fail because we are
13627 13634 * trying to retain more enablings than are allowed -- but
13628 13635 * we only have one anonymous enabling, and we are guaranteed
13629 13636 * to be allowed at least one retained enabling; we assert
13630 13637 * that dtrace_enabling_retain() returns success.
13631 13638 */
13632 13639 rval = dtrace_enabling_retain(dtrace_anon.dta_enabling);
13633 13640 ASSERT(rval == 0);
13634 13641
13635 13642 dtrace_enabling_dump(dtrace_anon.dta_enabling);
13636 13643 }
13637 13644 }
13638 13645
13639 13646 /*
13640 13647 * DTrace Helper Functions
13641 13648 */
13642 13649 static void
13643 13650 dtrace_helper_trace(dtrace_helper_action_t *helper,
13644 13651 dtrace_mstate_t *mstate, dtrace_vstate_t *vstate, int where)
13645 13652 {
13646 13653 uint32_t size, next, nnext, i;
13647 13654 dtrace_helptrace_t *ent;
13648 13655 uint16_t flags = cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
13649 13656
13650 13657 if (!dtrace_helptrace_enabled)
13651 13658 return;
13652 13659
13653 13660 ASSERT(vstate->dtvs_nlocals <= dtrace_helptrace_nlocals);
13654 13661
13655 13662 /*
13656 13663 * What would a tracing framework be without its own tracing
13657 13664 * framework? (Well, a hell of a lot simpler, for starters...)
13658 13665 */
13659 13666 size = sizeof (dtrace_helptrace_t) + dtrace_helptrace_nlocals *
13660 13667 sizeof (uint64_t) - sizeof (uint64_t);
13661 13668
13662 13669 /*
13663 13670 * Iterate until we can allocate a slot in the trace buffer.
13664 13671 */
13665 13672 do {
13666 13673 next = dtrace_helptrace_next;
13667 13674
13668 13675 if (next + size < dtrace_helptrace_bufsize) {
13669 13676 nnext = next + size;
13670 13677 } else {
13671 13678 nnext = size;
13672 13679 }
13673 13680 } while (dtrace_cas32(&dtrace_helptrace_next, next, nnext) != next);
13674 13681
13675 13682 /*
13676 13683 * We have our slot; fill it in.
13677 13684 */
13678 13685 if (nnext == size)
13679 13686 next = 0;
13680 13687
13681 13688 ent = (dtrace_helptrace_t *)&dtrace_helptrace_buffer[next];
13682 13689 ent->dtht_helper = helper;
13683 13690 ent->dtht_where = where;
13684 13691 ent->dtht_nlocals = vstate->dtvs_nlocals;
13685 13692
13686 13693 ent->dtht_fltoffs = (mstate->dtms_present & DTRACE_MSTATE_FLTOFFS) ?
13687 13694 mstate->dtms_fltoffs : -1;
13688 13695 ent->dtht_fault = DTRACE_FLAGS2FLT(flags);
13689 13696 ent->dtht_illval = cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
13690 13697
13691 13698 for (i = 0; i < vstate->dtvs_nlocals; i++) {
13692 13699 dtrace_statvar_t *svar;
13693 13700
13694 13701 if ((svar = vstate->dtvs_locals[i]) == NULL)
13695 13702 continue;
13696 13703
13697 13704 ASSERT(svar->dtsv_size >= NCPU * sizeof (uint64_t));
13698 13705 ent->dtht_locals[i] =
13699 13706 ((uint64_t *)(uintptr_t)svar->dtsv_data)[CPU->cpu_id];
13700 13707 }
13701 13708 }
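
/*
 * The slot allocation in dtrace_helper_trace() above reserves space in a
 * shared ring with a compare-and-swap rather than a lock: advance a cursor,
 * wrapping to the start when the record won't fit before the end. A
 * self-contained sketch using the GCC/clang builtin in place of
 * dtrace_cas32(); ring_next, ring_size, and reserve_slot are hypothetical:
 */
#include <stdint.h>

static volatile uint32_t ring_next;
static uint32_t ring_size = 4096;

static uint32_t
reserve_slot(uint32_t size)
{
	uint32_t next, nnext;

	do {
		next = ring_next;
		nnext = (next + size < ring_size) ? next + size : size;
	} while (__sync_val_compare_and_swap(&ring_next,
	    next, nnext) != next);

	/* A wrap (nnext == size) means our record starts at offset 0. */
	return (nnext == size ? 0 : next);
}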
13702 13709
13703 13710 static uint64_t
13704 13711 dtrace_helper(int which, dtrace_mstate_t *mstate,
13705 13712 dtrace_state_t *state, uint64_t arg0, uint64_t arg1)
13706 13713 {
13707 13714 uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
13708 13715 uint64_t sarg0 = mstate->dtms_arg[0];
13709 13716 uint64_t sarg1 = mstate->dtms_arg[1];
13710 13717 uint64_t rval;
13711 13718 dtrace_helpers_t *helpers = curproc->p_dtrace_helpers;
13712 13719 dtrace_helper_action_t *helper;
13713 13720 dtrace_vstate_t *vstate;
13714 13721 dtrace_difo_t *pred;
13715 13722 int i, trace = dtrace_helptrace_enabled;
13716 13723
13717 13724 ASSERT(which >= 0 && which < DTRACE_NHELPER_ACTIONS);
13718 13725
13719 13726 if (helpers == NULL)
13720 13727 return (0);
13721 13728
13722 13729 if ((helper = helpers->dthps_actions[which]) == NULL)
13723 13730 return (0);
13724 13731
13725 13732 vstate = &helpers->dthps_vstate;
13726 13733 mstate->dtms_arg[0] = arg0;
13727 13734 mstate->dtms_arg[1] = arg1;
13728 13735
13729 13736 /*
13730 13737 * Now iterate over each helper. If its predicate evaluates to 'true',
13731 13738 * we'll call the corresponding actions. Note that the below calls
13732 13739 * to dtrace_dif_emulate() may set faults in machine state. This is
13733 13740 * okay: our caller (the outer dtrace_dif_emulate()) will simply plow
13734 13741 * the stored DIF offset with its own (which is the desired behavior).
13735 13742 * Also, note the calls to dtrace_dif_emulate() may allocate scratch
13736 13743 * from machine state; this is okay, too.
13737 13744 */
13738 13745 for (; helper != NULL; helper = helper->dtha_next) {
13739 13746 if ((pred = helper->dtha_predicate) != NULL) {
13740 13747 if (trace)
13741 13748 dtrace_helper_trace(helper, mstate, vstate, 0);
13742 13749
13743 13750 if (!dtrace_dif_emulate(pred, mstate, vstate, state))
13744 13751 goto next;
13745 13752
13746 13753 if (*flags & CPU_DTRACE_FAULT)
13747 13754 goto err;
13748 13755 }
13749 13756
13750 13757 for (i = 0; i < helper->dtha_nactions; i++) {
13751 13758 if (trace)
13752 13759 dtrace_helper_trace(helper,
13753 13760 mstate, vstate, i + 1);
13754 13761
13755 13762 rval = dtrace_dif_emulate(helper->dtha_actions[i],
13756 13763 mstate, vstate, state);
13757 13764
13758 13765 if (*flags & CPU_DTRACE_FAULT)
13759 13766 goto err;
13760 13767 }
13761 13768
13762 13769 next:
13763 13770 if (trace)
13764 13771 dtrace_helper_trace(helper, mstate, vstate,
13765 13772 DTRACE_HELPTRACE_NEXT);
13766 13773 }
13767 13774
13768 13775 if (trace)
13769 13776 dtrace_helper_trace(helper, mstate, vstate,
13770 13777 DTRACE_HELPTRACE_DONE);
13771 13778
13772 13779 /*
13773 13780 * Restore the arg0 that we saved upon entry.
13774 13781 */
13775 13782 mstate->dtms_arg[0] = sarg0;
13776 13783 mstate->dtms_arg[1] = sarg1;
13777 13784
13778 13785 return (rval);
13779 13786
13780 13787 err:
13781 13788 if (trace)
13782 13789 dtrace_helper_trace(helper, mstate, vstate,
13783 13790 DTRACE_HELPTRACE_ERR);
13784 13791
13785 13792 /*
13786 13793 * Restore the arg0 that we saved upon entry.
13787 13794 */
13788 13795 mstate->dtms_arg[0] = sarg0;
13789 13796 mstate->dtms_arg[1] = sarg1;
13790 13797
13791 13798 	return (0);
13792 13799 }
13793 13800
13794 13801 static void
13795 13802 dtrace_helper_action_destroy(dtrace_helper_action_t *helper,
13796 13803 dtrace_vstate_t *vstate)
13797 13804 {
13798 13805 int i;
13799 13806
13800 13807 if (helper->dtha_predicate != NULL)
13801 13808 dtrace_difo_release(helper->dtha_predicate, vstate);
13802 13809
13803 13810 for (i = 0; i < helper->dtha_nactions; i++) {
13804 13811 ASSERT(helper->dtha_actions[i] != NULL);
13805 13812 dtrace_difo_release(helper->dtha_actions[i], vstate);
13806 13813 }
13807 13814
13808 13815 kmem_free(helper->dtha_actions,
13809 13816 helper->dtha_nactions * sizeof (dtrace_difo_t *));
13810 13817 kmem_free(helper, sizeof (dtrace_helper_action_t));
13811 13818 }
13812 13819
13813 13820 static int
13814 13821 dtrace_helper_destroygen(int gen)
13815 13822 {
13816 13823 proc_t *p = curproc;
13817 13824 dtrace_helpers_t *help = p->p_dtrace_helpers;
13818 13825 dtrace_vstate_t *vstate;
13819 13826 int i;
13820 13827
13821 13828 ASSERT(MUTEX_HELD(&dtrace_lock));
13822 13829
13823 13830 if (help == NULL || gen > help->dthps_generation)
13824 13831 return (EINVAL);
13825 13832
13826 13833 vstate = &help->dthps_vstate;
13827 13834
13828 13835 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
13829 13836 dtrace_helper_action_t *last = NULL, *h, *next;
13830 13837
13831 13838 for (h = help->dthps_actions[i]; h != NULL; h = next) {
13832 13839 next = h->dtha_next;
13833 13840
13834 13841 if (h->dtha_generation == gen) {
13835 13842 if (last != NULL) {
13836 13843 last->dtha_next = next;
13837 13844 } else {
13838 13845 help->dthps_actions[i] = next;
13839 13846 }
13840 13847
13841 13848 dtrace_helper_action_destroy(h, vstate);
13842 13849 } else {
13843 13850 last = h;
13844 13851 }
13845 13852 }
13846 13853 }
13847 13854
13848 13855 /*
13849 13856 	 * Iterate until we've cleared out all helper providers with the
13850 13857 * given generation number.
13851 13858 */
13852 13859 for (;;) {
13853 13860 dtrace_helper_provider_t *prov;
13854 13861
13855 13862 /*
13856 13863 * Look for a helper provider with the right generation. We
13857 13864 * have to start back at the beginning of the list each time
13858 13865 * because we drop dtrace_lock. It's unlikely that we'll make
13859 13866 * more than two passes.
13860 13867 */
13861 13868 for (i = 0; i < help->dthps_nprovs; i++) {
13862 13869 prov = help->dthps_provs[i];
13863 13870
13864 13871 if (prov->dthp_generation == gen)
13865 13872 break;
13866 13873 }
13867 13874
13868 13875 /*
13869 13876 * If there were no matches, we're done.
13870 13877 */
13871 13878 if (i == help->dthps_nprovs)
13872 13879 break;
13873 13880
13874 13881 /*
13875 13882 * Move the last helper provider into this slot.
13876 13883 */
13877 13884 help->dthps_nprovs--;
13878 13885 help->dthps_provs[i] = help->dthps_provs[help->dthps_nprovs];
13879 13886 help->dthps_provs[help->dthps_nprovs] = NULL;
13880 13887
13881 13888 mutex_exit(&dtrace_lock);
13882 13889
13883 13890 /*
13884 13891 * If we have a meta provider, remove this helper provider.
13885 13892 */
13886 13893 mutex_enter(&dtrace_meta_lock);
13887 13894 if (dtrace_meta_pid != NULL) {
13888 13895 ASSERT(dtrace_deferred_pid == NULL);
13889 13896 dtrace_helper_provider_remove(&prov->dthp_prov,
13890 13897 p->p_pid);
13891 13898 }
13892 13899 mutex_exit(&dtrace_meta_lock);
13893 13900
13894 13901 dtrace_helper_provider_destroy(prov);
13895 13902
13896 13903 mutex_enter(&dtrace_lock);
13897 13904 }
13898 13905
13899 13906 return (0);
13900 13907 }
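
/*
 * The provider removal above uses the standard unordered-array trick:
 * because dthps_provs has no ordering requirement, the vacated slot is
 * filled with the last element instead of shifting the tail down. In
 * isolation (remove_unordered is a hypothetical name):
 */
static void
remove_unordered(void **array, int *countp, int i)
{
	(*countp)--;
	array[i] = array[*countp];	/* move the last element into slot i */
	array[*countp] = NULL;
}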
13901 13908
13902 13909 static int
13903 13910 dtrace_helper_validate(dtrace_helper_action_t *helper)
13904 13911 {
13905 13912 int err = 0, i;
13906 13913 dtrace_difo_t *dp;
13907 13914
13908 13915 if ((dp = helper->dtha_predicate) != NULL)
13909 13916 err += dtrace_difo_validate_helper(dp);
13910 13917
13911 13918 for (i = 0; i < helper->dtha_nactions; i++)
13912 13919 err += dtrace_difo_validate_helper(helper->dtha_actions[i]);
13913 13920
13914 13921 return (err == 0);
13915 13922 }
13916 13923
13917 13924 static int
13918 13925 dtrace_helper_action_add(int which, dtrace_ecbdesc_t *ep)
13919 13926 {
13920 13927 dtrace_helpers_t *help;
13921 13928 dtrace_helper_action_t *helper, *last;
13922 13929 dtrace_actdesc_t *act;
13923 13930 dtrace_vstate_t *vstate;
13924 13931 dtrace_predicate_t *pred;
13925 13932 int count = 0, nactions = 0, i;
13926 13933
13927 13934 if (which < 0 || which >= DTRACE_NHELPER_ACTIONS)
13928 13935 return (EINVAL);
13929 13936
13930 13937 help = curproc->p_dtrace_helpers;
13931 13938 last = help->dthps_actions[which];
13932 13939 vstate = &help->dthps_vstate;
13933 13940
13934 13941 for (count = 0; last != NULL; last = last->dtha_next) {
13935 13942 count++;
13936 13943 if (last->dtha_next == NULL)
13937 13944 break;
13938 13945 }
13939 13946
13940 13947 /*
13941 13948 * If we already have dtrace_helper_actions_max helper actions for this
13942 13949 * helper action type, we'll refuse to add a new one.
13943 13950 */
13944 13951 if (count >= dtrace_helper_actions_max)
13945 13952 return (ENOSPC);
13946 13953
13947 13954 helper = kmem_zalloc(sizeof (dtrace_helper_action_t), KM_SLEEP);
13948 13955 helper->dtha_generation = help->dthps_generation;
13949 13956
13950 13957 if ((pred = ep->dted_pred.dtpdd_predicate) != NULL) {
13951 13958 ASSERT(pred->dtp_difo != NULL);
13952 13959 dtrace_difo_hold(pred->dtp_difo);
13953 13960 helper->dtha_predicate = pred->dtp_difo;
13954 13961 }
13955 13962
13956 13963 for (act = ep->dted_action; act != NULL; act = act->dtad_next) {
13957 13964 if (act->dtad_kind != DTRACEACT_DIFEXPR)
13958 13965 goto err;
13959 13966
13960 13967 if (act->dtad_difo == NULL)
13961 13968 goto err;
13962 13969
13963 13970 nactions++;
13964 13971 }
13965 13972
13966 13973 helper->dtha_actions = kmem_zalloc(sizeof (dtrace_difo_t *) *
13967 13974 (helper->dtha_nactions = nactions), KM_SLEEP);
13968 13975
13969 13976 for (act = ep->dted_action, i = 0; act != NULL; act = act->dtad_next) {
13970 13977 dtrace_difo_hold(act->dtad_difo);
13971 13978 helper->dtha_actions[i++] = act->dtad_difo;
13972 13979 }
13973 13980
13974 13981 if (!dtrace_helper_validate(helper))
13975 13982 goto err;
13976 13983
13977 13984 if (last == NULL) {
13978 13985 help->dthps_actions[which] = helper;
13979 13986 } else {
13980 13987 last->dtha_next = helper;
13981 13988 }
13982 13989
13983 13990 if (vstate->dtvs_nlocals > dtrace_helptrace_nlocals) {
13984 13991 dtrace_helptrace_nlocals = vstate->dtvs_nlocals;
13985 13992 dtrace_helptrace_next = 0;
13986 13993 }
13987 13994
13988 13995 return (0);
13989 13996 err:
13990 13997 dtrace_helper_action_destroy(helper, vstate);
13991 13998 return (EINVAL);
13992 13999 }
13993 14000
13994 14001 static void
13995 14002 dtrace_helper_provider_register(proc_t *p, dtrace_helpers_t *help,
13996 14003 dof_helper_t *dofhp)
13997 14004 {
13998 14005 ASSERT(MUTEX_NOT_HELD(&dtrace_lock));
13999 14006
14000 14007 mutex_enter(&dtrace_meta_lock);
14001 14008 mutex_enter(&dtrace_lock);
14002 14009
14003 14010 if (!dtrace_attached() || dtrace_meta_pid == NULL) {
14004 14011 /*
14005 14012 * If the dtrace module is loaded but not attached, or if
14006 14013 		 * there isn't a meta provider registered to deal with
14007 14014 * these provider descriptions, we need to postpone creating
14008 14015 * the actual providers until later.
14009 14016 */
14010 14017
14011 14018 if (help->dthps_next == NULL && help->dthps_prev == NULL &&
14012 14019 dtrace_deferred_pid != help) {
14013 14020 help->dthps_deferred = 1;
14014 14021 help->dthps_pid = p->p_pid;
14015 14022 help->dthps_next = dtrace_deferred_pid;
14016 14023 help->dthps_prev = NULL;
14017 14024 if (dtrace_deferred_pid != NULL)
14018 14025 dtrace_deferred_pid->dthps_prev = help;
14019 14026 dtrace_deferred_pid = help;
14020 14027 }
14021 14028
14022 14029 mutex_exit(&dtrace_lock);
14023 14030
14024 14031 } else if (dofhp != NULL) {
14025 14032 /*
14026 14033 * If the dtrace module is loaded and we have a particular
14027 14034 * helper provider description, pass that off to the
14028 14035 * meta provider.
14029 14036 */
14030 14037
14031 14038 mutex_exit(&dtrace_lock);
14032 14039
14033 14040 dtrace_helper_provide(dofhp, p->p_pid);
14034 14041
14035 14042 } else {
14036 14043 /*
14037 14044 * Otherwise, just pass all the helper provider descriptions
14038 14045 * off to the meta provider.
14039 14046 */
14040 14047
14041 14048 int i;
14042 14049 mutex_exit(&dtrace_lock);
14043 14050
14044 14051 for (i = 0; i < help->dthps_nprovs; i++) {
14045 14052 dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
14046 14053 p->p_pid);
14047 14054 }
14048 14055 }
14049 14056
14050 14057 mutex_exit(&dtrace_meta_lock);
14051 14058 }
14052 14059
14053 14060 static int
14054 14061 dtrace_helper_provider_add(dof_helper_t *dofhp, int gen)
14055 14062 {
14056 14063 dtrace_helpers_t *help;
14057 14064 dtrace_helper_provider_t *hprov, **tmp_provs;
14058 14065 uint_t tmp_maxprovs, i;
14059 14066
14060 14067 ASSERT(MUTEX_HELD(&dtrace_lock));
14061 14068
14062 14069 help = curproc->p_dtrace_helpers;
14063 14070 ASSERT(help != NULL);
14064 14071
14065 14072 /*
14066 14073 * If we already have dtrace_helper_providers_max helper providers,
14067 14074 	 * we refuse to add a new one.
14068 14075 */
14069 14076 if (help->dthps_nprovs >= dtrace_helper_providers_max)
14070 14077 return (ENOSPC);
14071 14078
14072 14079 /*
14073 14080 * Check to make sure this isn't a duplicate.
14074 14081 */
14075 14082 for (i = 0; i < help->dthps_nprovs; i++) {
14076 14083 if (dofhp->dofhp_addr ==
14077 14084 help->dthps_provs[i]->dthp_prov.dofhp_addr)
14078 14085 return (EALREADY);
14079 14086 }
14080 14087
14081 14088 hprov = kmem_zalloc(sizeof (dtrace_helper_provider_t), KM_SLEEP);
14082 14089 hprov->dthp_prov = *dofhp;
14083 14090 hprov->dthp_ref = 1;
14084 14091 hprov->dthp_generation = gen;
14085 14092
14086 14093 /*
14087 14094 * Allocate a bigger table for helper providers if it's already full.
14088 14095 */
14089 14096 if (help->dthps_maxprovs == help->dthps_nprovs) {
14090 14097 tmp_maxprovs = help->dthps_maxprovs;
14091 14098 tmp_provs = help->dthps_provs;
14092 14099
14093 14100 if (help->dthps_maxprovs == 0)
14094 14101 help->dthps_maxprovs = 2;
14095 14102 else
14096 14103 help->dthps_maxprovs *= 2;
14097 14104 if (help->dthps_maxprovs > dtrace_helper_providers_max)
14098 14105 help->dthps_maxprovs = dtrace_helper_providers_max;
14099 14106
14100 14107 ASSERT(tmp_maxprovs < help->dthps_maxprovs);
14101 14108
14102 14109 help->dthps_provs = kmem_zalloc(help->dthps_maxprovs *
14103 14110 sizeof (dtrace_helper_provider_t *), KM_SLEEP);
14104 14111
14105 14112 if (tmp_provs != NULL) {
14106 14113 bcopy(tmp_provs, help->dthps_provs, tmp_maxprovs *
14107 14114 sizeof (dtrace_helper_provider_t *));
14108 14115 kmem_free(tmp_provs, tmp_maxprovs *
14109 14116 sizeof (dtrace_helper_provider_t *));
14110 14117 }
14111 14118 }
14112 14119
14113 14120 help->dthps_provs[help->dthps_nprovs] = hprov;
14114 14121 help->dthps_nprovs++;
14115 14122
14116 14123 return (0);
14117 14124 }
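
/*
 * The table growth above starts at two entries, doubles when full, and is
 * capped at dtrace_helper_providers_max. A userland sketch of the same
 * policy (grow and its parameters are hypothetical):
 */
#include <stdlib.h>
#include <string.h>

static int
grow(void ***tabp, unsigned *maxp, unsigned cap)
{
	unsigned omax = *maxp;
	unsigned nmax = (omax == 0) ? 2 : omax * 2;
	void **ntab;

	if (nmax > cap)
		nmax = cap;

	if (nmax <= omax)
		return (-1);		/* already at the cap */

	if ((ntab = calloc(nmax, sizeof (void *))) == NULL)
		return (-1);

	if (*tabp != NULL) {
		memcpy(ntab, *tabp, omax * sizeof (void *));
		free(*tabp);
	}

	*tabp = ntab;
	*maxp = nmax;
	return (0);
}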
14118 14125
14119 14126 static void
14120 14127 dtrace_helper_provider_destroy(dtrace_helper_provider_t *hprov)
14121 14128 {
14122 14129 mutex_enter(&dtrace_lock);
14123 14130
14124 14131 if (--hprov->dthp_ref == 0) {
14125 14132 dof_hdr_t *dof;
14126 14133 mutex_exit(&dtrace_lock);
14127 14134 dof = (dof_hdr_t *)(uintptr_t)hprov->dthp_prov.dofhp_dof;
14128 14135 dtrace_dof_destroy(dof);
14129 14136 kmem_free(hprov, sizeof (dtrace_helper_provider_t));
14130 14137 } else {
14131 14138 mutex_exit(&dtrace_lock);
14132 14139 }
14133 14140 }
14134 14141
14135 14142 static int
14136 14143 dtrace_helper_provider_validate(dof_hdr_t *dof, dof_sec_t *sec)
14137 14144 {
14138 14145 uintptr_t daddr = (uintptr_t)dof;
14139 14146 dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
14140 14147 dof_provider_t *provider;
14141 14148 dof_probe_t *probe;
14142 14149 uint8_t *arg;
14143 14150 char *strtab, *typestr;
14144 14151 dof_stridx_t typeidx;
14145 14152 size_t typesz;
14146 14153 uint_t nprobes, j, k;
14147 14154
14148 14155 ASSERT(sec->dofs_type == DOF_SECT_PROVIDER);
14149 14156
14150 14157 if (sec->dofs_offset & (sizeof (uint_t) - 1)) {
14151 14158 dtrace_dof_error(dof, "misaligned section offset");
14152 14159 return (-1);
14153 14160 }
14154 14161
14155 14162 /*
14156 14163 * The section needs to be large enough to contain the DOF provider
14157 14164 * structure appropriate for the given version.
14158 14165 */
14159 14166 if (sec->dofs_size <
14160 14167 ((dof->dofh_ident[DOF_ID_VERSION] == DOF_VERSION_1) ?
14161 14168 offsetof(dof_provider_t, dofpv_prenoffs) :
14162 14169 sizeof (dof_provider_t))) {
14163 14170 dtrace_dof_error(dof, "provider section too small");
14164 14171 return (-1);
14165 14172 }
14166 14173
14167 14174 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
14168 14175 str_sec = dtrace_dof_sect(dof, DOF_SECT_STRTAB, provider->dofpv_strtab);
14169 14176 prb_sec = dtrace_dof_sect(dof, DOF_SECT_PROBES, provider->dofpv_probes);
14170 14177 arg_sec = dtrace_dof_sect(dof, DOF_SECT_PRARGS, provider->dofpv_prargs);
14171 14178 off_sec = dtrace_dof_sect(dof, DOF_SECT_PROFFS, provider->dofpv_proffs);
14172 14179
14173 14180 if (str_sec == NULL || prb_sec == NULL ||
14174 14181 arg_sec == NULL || off_sec == NULL)
14175 14182 return (-1);
14176 14183
14177 14184 enoff_sec = NULL;
14178 14185
14179 14186 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
14180 14187 provider->dofpv_prenoffs != DOF_SECT_NONE &&
14181 14188 (enoff_sec = dtrace_dof_sect(dof, DOF_SECT_PRENOFFS,
14182 14189 provider->dofpv_prenoffs)) == NULL)
14183 14190 return (-1);
14184 14191
14185 14192 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
14186 14193
14187 14194 if (provider->dofpv_name >= str_sec->dofs_size ||
14188 14195 strlen(strtab + provider->dofpv_name) >= DTRACE_PROVNAMELEN) {
14189 14196 dtrace_dof_error(dof, "invalid provider name");
14190 14197 return (-1);
14191 14198 }
14192 14199
14193 14200 if (prb_sec->dofs_entsize == 0 ||
14194 14201 prb_sec->dofs_entsize > prb_sec->dofs_size) {
14195 14202 dtrace_dof_error(dof, "invalid entry size");
14196 14203 return (-1);
14197 14204 }
14198 14205
14199 14206 if (prb_sec->dofs_entsize & (sizeof (uintptr_t) - 1)) {
14200 14207 dtrace_dof_error(dof, "misaligned entry size");
14201 14208 return (-1);
14202 14209 }
14203 14210
14204 14211 if (off_sec->dofs_entsize != sizeof (uint32_t)) {
14205 14212 dtrace_dof_error(dof, "invalid entry size");
14206 14213 return (-1);
14207 14214 }
14208 14215
14209 14216 if (off_sec->dofs_offset & (sizeof (uint32_t) - 1)) {
14210 14217 dtrace_dof_error(dof, "misaligned section offset");
14211 14218 return (-1);
14212 14219 }
14213 14220
14214 14221 if (arg_sec->dofs_entsize != sizeof (uint8_t)) {
14215 14222 dtrace_dof_error(dof, "invalid entry size");
14216 14223 return (-1);
14217 14224 }
14218 14225
14219 14226 arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
14220 14227
14221 14228 nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
14222 14229
14223 14230 /*
14224 14231 * Take a pass through the probes to check for errors.
14225 14232 */
14226 14233 for (j = 0; j < nprobes; j++) {
14227 14234 probe = (dof_probe_t *)(uintptr_t)(daddr +
14228 14235 prb_sec->dofs_offset + j * prb_sec->dofs_entsize);
14229 14236
14230 14237 if (probe->dofpr_func >= str_sec->dofs_size) {
14231 14238 dtrace_dof_error(dof, "invalid function name");
14232 14239 return (-1);
14233 14240 }
14234 14241
14235 14242 if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN) {
14236 14243 dtrace_dof_error(dof, "function name too long");
14237 14244 return (-1);
14238 14245 }
14239 14246
14240 14247 if (probe->dofpr_name >= str_sec->dofs_size ||
14241 14248 strlen(strtab + probe->dofpr_name) >= DTRACE_NAMELEN) {
14242 14249 dtrace_dof_error(dof, "invalid probe name");
14243 14250 return (-1);
14244 14251 }
14245 14252
14246 14253 /*
14247 14254 * The offset count must not wrap the index, and the offsets
14248 14255 * must also not overflow the section's data.
14249 14256 */
14250 14257 if (probe->dofpr_offidx + probe->dofpr_noffs <
14251 14258 probe->dofpr_offidx ||
14252 14259 (probe->dofpr_offidx + probe->dofpr_noffs) *
14253 14260 off_sec->dofs_entsize > off_sec->dofs_size) {
14254 14261 dtrace_dof_error(dof, "invalid probe offset");
14255 14262 return (-1);
14256 14263 }
14257 14264
14258 14265 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1) {
14259 14266 /*
14260 14267 * If there's no is-enabled offset section, make sure
14261 14268 * there aren't any is-enabled offsets. Otherwise
14262 14269 * perform the same checks as for probe offsets
14263 14270 * (immediately above).
14264 14271 */
14265 14272 if (enoff_sec == NULL) {
14266 14273 if (probe->dofpr_enoffidx != 0 ||
14267 14274 probe->dofpr_nenoffs != 0) {
14268 14275 dtrace_dof_error(dof, "is-enabled "
14269 14276 "offsets with null section");
14270 14277 return (-1);
14271 14278 }
14272 14279 } else if (probe->dofpr_enoffidx +
14273 14280 probe->dofpr_nenoffs < probe->dofpr_enoffidx ||
14274 14281 (probe->dofpr_enoffidx + probe->dofpr_nenoffs) *
14275 14282 enoff_sec->dofs_entsize > enoff_sec->dofs_size) {
14276 14283 dtrace_dof_error(dof, "invalid is-enabled "
14277 14284 "offset");
14278 14285 return (-1);
14279 14286 }
14280 14287
14281 14288 if (probe->dofpr_noffs + probe->dofpr_nenoffs == 0) {
14282 14289 dtrace_dof_error(dof, "zero probe and "
14283 14290 "is-enabled offsets");
14284 14291 return (-1);
14285 14292 }
14286 14293 } else if (probe->dofpr_noffs == 0) {
14287 14294 dtrace_dof_error(dof, "zero probe offsets");
14288 14295 return (-1);
14289 14296 }
14290 14297
14291 14298 if (probe->dofpr_argidx + probe->dofpr_xargc <
14292 14299 probe->dofpr_argidx ||
14293 14300 (probe->dofpr_argidx + probe->dofpr_xargc) *
14294 14301 arg_sec->dofs_entsize > arg_sec->dofs_size) {
14295 14302 dtrace_dof_error(dof, "invalid args");
14296 14303 return (-1);
14297 14304 }
14298 14305
14299 14306 typeidx = probe->dofpr_nargv;
14300 14307 typestr = strtab + probe->dofpr_nargv;
14301 14308 for (k = 0; k < probe->dofpr_nargc; k++) {
14302 14309 if (typeidx >= str_sec->dofs_size) {
14303 14310 dtrace_dof_error(dof, "bad "
14304 14311 "native argument type");
14305 14312 return (-1);
14306 14313 }
14307 14314
14308 14315 typesz = strlen(typestr) + 1;
14309 14316 if (typesz > DTRACE_ARGTYPELEN) {
14310 14317 dtrace_dof_error(dof, "native "
14311 14318 "argument type too long");
14312 14319 return (-1);
14313 14320 }
14314 14321 typeidx += typesz;
14315 14322 typestr += typesz;
14316 14323 }
14317 14324
14318 14325 typeidx = probe->dofpr_xargv;
14319 14326 typestr = strtab + probe->dofpr_xargv;
14320 14327 for (k = 0; k < probe->dofpr_xargc; k++) {
14321 14328 if (arg[probe->dofpr_argidx + k] > probe->dofpr_nargc) {
14322 14329 dtrace_dof_error(dof, "bad "
14323 14330 "native argument index");
14324 14331 return (-1);
14325 14332 }
14326 14333
14327 14334 if (typeidx >= str_sec->dofs_size) {
14328 14335 dtrace_dof_error(dof, "bad "
14329 14336 "translated argument type");
14330 14337 return (-1);
14331 14338 }
14332 14339
14333 14340 typesz = strlen(typestr) + 1;
14334 14341 if (typesz > DTRACE_ARGTYPELEN) {
14335 14342 dtrace_dof_error(dof, "translated argument "
14336 14343 "type too long");
14337 14344 return (-1);
14338 14345 }
14339 14346
14340 14347 typeidx += typesz;
14341 14348 typestr += typesz;
14342 14349 }
14343 14350 }
14344 14351
14345 14352 return (0);
14346 14353 }
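
/*
 * Nearly every bounds test in the validation above has the same shape:
 * "idx + n < idx" catches unsigned wraparound in the index arithmetic, and
 * the scaled end offset is then compared against the section size. The test
 * in isolation (range_ok is hypothetical; the cast keeps the multiply from
 * overflowing in this sketch):
 */
#include <stdint.h>

static int
range_ok(uint32_t idx, uint32_t n, uint32_t entsize, uint64_t secsize)
{
	if (idx + n < idx)
		return (0);	/* index arithmetic wrapped */

	if ((uint64_t)(idx + n) * entsize > secsize)
		return (0);	/* entries run past the section's data */

	return (1);
}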
14347 14354
14348 14355 static int
14349 14356 dtrace_helper_slurp(dof_hdr_t *dof, dof_helper_t *dhp)
14350 14357 {
14351 14358 dtrace_helpers_t *help;
14352 14359 dtrace_vstate_t *vstate;
14353 14360 dtrace_enabling_t *enab = NULL;
14354 14361 int i, gen, rv, nhelpers = 0, nprovs = 0, destroy = 1;
14355 14362 uintptr_t daddr = (uintptr_t)dof;
14356 14363
14357 14364 ASSERT(MUTEX_HELD(&dtrace_lock));
14358 14365
14359 14366 if ((help = curproc->p_dtrace_helpers) == NULL)
14360 14367 help = dtrace_helpers_create(curproc);
14361 14368
14362 14369 vstate = &help->dthps_vstate;
14363 14370
14364 14371 if ((rv = dtrace_dof_slurp(dof, vstate, NULL, &enab,
14365 14372 dhp != NULL ? dhp->dofhp_addr : 0, B_FALSE)) != 0) {
14366 14373 dtrace_dof_destroy(dof);
14367 14374 return (rv);
14368 14375 }
14369 14376
14370 14377 /*
14371 14378 * Look for helper providers and validate their descriptions.
14372 14379 */
14373 14380 if (dhp != NULL) {
14374 14381 for (i = 0; i < dof->dofh_secnum; i++) {
14375 14382 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
14376 14383 dof->dofh_secoff + i * dof->dofh_secsize);
14377 14384
14378 14385 if (sec->dofs_type != DOF_SECT_PROVIDER)
14379 14386 continue;
14380 14387
14381 14388 if (dtrace_helper_provider_validate(dof, sec) != 0) {
14382 14389 dtrace_enabling_destroy(enab);
14383 14390 dtrace_dof_destroy(dof);
14384 14391 return (-1);
14385 14392 }
14386 14393
14387 14394 nprovs++;
14388 14395 }
14389 14396 }
14390 14397
14391 14398 /*
14392 14399 * Now we need to walk through the ECB descriptions in the enabling.
14393 14400 */
14394 14401 for (i = 0; i < enab->dten_ndesc; i++) {
14395 14402 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
14396 14403 dtrace_probedesc_t *desc = &ep->dted_probe;
14397 14404
14398 14405 if (strcmp(desc->dtpd_provider, "dtrace") != 0)
14399 14406 continue;
14400 14407
14401 14408 if (strcmp(desc->dtpd_mod, "helper") != 0)
14402 14409 continue;
14403 14410
14404 14411 if (strcmp(desc->dtpd_func, "ustack") != 0)
14405 14412 continue;
14406 14413
14407 14414 if ((rv = dtrace_helper_action_add(DTRACE_HELPER_ACTION_USTACK,
14408 14415 ep)) != 0) {
14409 14416 /*
14410 14417 * Adding this helper action failed -- we are now going
14411 14418 * to rip out the entire generation and return failure.
14412 14419 */
14413 14420 (void) dtrace_helper_destroygen(help->dthps_generation);
14414 14421 dtrace_enabling_destroy(enab);
14415 14422 dtrace_dof_destroy(dof);
14416 14423 return (-1);
14417 14424 }
14418 14425
14419 14426 nhelpers++;
14420 14427 }
14421 14428
14422 14429 if (nhelpers < enab->dten_ndesc)
14423 14430 dtrace_dof_error(dof, "unmatched helpers");
14424 14431
14425 14432 gen = help->dthps_generation++;
14426 14433 dtrace_enabling_destroy(enab);
14427 14434
14428 14435 if (dhp != NULL && nprovs > 0) {
14429 14436 dhp->dofhp_dof = (uint64_t)(uintptr_t)dof;
14430 14437 if (dtrace_helper_provider_add(dhp, gen) == 0) {
14431 14438 mutex_exit(&dtrace_lock);
14432 14439 dtrace_helper_provider_register(curproc, help, dhp);
14433 14440 mutex_enter(&dtrace_lock);
14434 14441
14435 14442 destroy = 0;
14436 14443 }
14437 14444 }
14438 14445
14439 14446 if (destroy)
14440 14447 dtrace_dof_destroy(dof);
14441 14448
14442 14449 return (gen);
14443 14450 }
14444 14451
14445 14452 static dtrace_helpers_t *
14446 14453 dtrace_helpers_create(proc_t *p)
14447 14454 {
14448 14455 dtrace_helpers_t *help;
14449 14456
14450 14457 ASSERT(MUTEX_HELD(&dtrace_lock));
14451 14458 ASSERT(p->p_dtrace_helpers == NULL);
14452 14459
14453 14460 help = kmem_zalloc(sizeof (dtrace_helpers_t), KM_SLEEP);
14454 14461 help->dthps_actions = kmem_zalloc(sizeof (dtrace_helper_action_t *) *
14455 14462 DTRACE_NHELPER_ACTIONS, KM_SLEEP);
14456 14463
14457 14464 p->p_dtrace_helpers = help;
14458 14465 dtrace_helpers++;
14459 14466
14460 14467 return (help);
14461 14468 }
14462 14469
14463 14470 static void
14464 14471 dtrace_helpers_destroy(void)
14465 14472 {
14466 14473 dtrace_helpers_t *help;
14467 14474 dtrace_vstate_t *vstate;
14468 14475 proc_t *p = curproc;
14469 14476 int i;
14470 14477
14471 14478 mutex_enter(&dtrace_lock);
14472 14479
14473 14480 ASSERT(p->p_dtrace_helpers != NULL);
14474 14481 ASSERT(dtrace_helpers > 0);
14475 14482
14476 14483 help = p->p_dtrace_helpers;
14477 14484 vstate = &help->dthps_vstate;
14478 14485
14479 14486 /*
14480 14487 * We're now going to lose the help from this process.
14481 14488 */
14482 14489 p->p_dtrace_helpers = NULL;
14483 14490 dtrace_sync();
14484 14491
14485 14492 /*
14486 14493 	 * Destroy the helper actions.
14487 14494 */
14488 14495 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
14489 14496 dtrace_helper_action_t *h, *next;
14490 14497
14491 14498 for (h = help->dthps_actions[i]; h != NULL; h = next) {
14492 14499 next = h->dtha_next;
14493 14500 dtrace_helper_action_destroy(h, vstate);
14494 14501 h = next;
14495 14502 }
14496 14503 }
14497 14504
14498 14505 mutex_exit(&dtrace_lock);
14499 14506
14500 14507 /*
14501 14508 * Destroy the helper providers.
14502 14509 */
14503 14510 if (help->dthps_maxprovs > 0) {
14504 14511 mutex_enter(&dtrace_meta_lock);
14505 14512 if (dtrace_meta_pid != NULL) {
14506 14513 ASSERT(dtrace_deferred_pid == NULL);
14507 14514
14508 14515 for (i = 0; i < help->dthps_nprovs; i++) {
14509 14516 dtrace_helper_provider_remove(
14510 14517 &help->dthps_provs[i]->dthp_prov, p->p_pid);
14511 14518 }
14512 14519 } else {
14513 14520 mutex_enter(&dtrace_lock);
14514 14521 ASSERT(help->dthps_deferred == 0 ||
14515 14522 help->dthps_next != NULL ||
14516 14523 help->dthps_prev != NULL ||
14517 14524 help == dtrace_deferred_pid);
14518 14525
14519 14526 /*
14520 14527 * Remove the helper from the deferred list.
14521 14528 */
14522 14529 if (help->dthps_next != NULL)
14523 14530 help->dthps_next->dthps_prev = help->dthps_prev;
14524 14531 if (help->dthps_prev != NULL)
14525 14532 help->dthps_prev->dthps_next = help->dthps_next;
14526 14533 if (dtrace_deferred_pid == help) {
14527 14534 dtrace_deferred_pid = help->dthps_next;
14528 14535 ASSERT(help->dthps_prev == NULL);
14529 14536 }
14530 14537
14531 14538 mutex_exit(&dtrace_lock);
14532 14539 }
14533 14540
14534 14541 mutex_exit(&dtrace_meta_lock);
14535 14542
14536 14543 for (i = 0; i < help->dthps_nprovs; i++) {
14537 14544 dtrace_helper_provider_destroy(help->dthps_provs[i]);
14538 14545 }
14539 14546
14540 14547 kmem_free(help->dthps_provs, help->dthps_maxprovs *
14541 14548 sizeof (dtrace_helper_provider_t *));
14542 14549 }
14543 14550
14544 14551 mutex_enter(&dtrace_lock);
14545 14552
14546 14553 dtrace_vstate_fini(&help->dthps_vstate);
14547 14554 kmem_free(help->dthps_actions,
14548 14555 sizeof (dtrace_helper_action_t *) * DTRACE_NHELPER_ACTIONS);
14549 14556 kmem_free(help, sizeof (dtrace_helpers_t));
14550 14557
14551 14558 --dtrace_helpers;
14552 14559 mutex_exit(&dtrace_lock);
14553 14560 }
14554 14561
14555 14562 static void
14556 14563 dtrace_helpers_duplicate(proc_t *from, proc_t *to)
14557 14564 {
14558 14565 dtrace_helpers_t *help, *newhelp;
14559 14566 dtrace_helper_action_t *helper, *new, *last;
14560 14567 dtrace_difo_t *dp;
14561 14568 dtrace_vstate_t *vstate;
14562 14569 int i, j, sz, hasprovs = 0;
14563 14570
14564 14571 mutex_enter(&dtrace_lock);
14565 14572 ASSERT(from->p_dtrace_helpers != NULL);
14566 14573 ASSERT(dtrace_helpers > 0);
14567 14574
14568 14575 help = from->p_dtrace_helpers;
14569 14576 newhelp = dtrace_helpers_create(to);
14570 14577 ASSERT(to->p_dtrace_helpers != NULL);
14571 14578
14572 14579 newhelp->dthps_generation = help->dthps_generation;
14573 14580 vstate = &newhelp->dthps_vstate;
14574 14581
14575 14582 /*
14576 14583 * Duplicate the helper actions.
14577 14584 */
14578 14585 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
14579 14586 if ((helper = help->dthps_actions[i]) == NULL)
14580 14587 continue;
14581 14588
14582 14589 for (last = NULL; helper != NULL; helper = helper->dtha_next) {
14583 14590 new = kmem_zalloc(sizeof (dtrace_helper_action_t),
14584 14591 KM_SLEEP);
14585 14592 new->dtha_generation = helper->dtha_generation;
14586 14593
14587 14594 if ((dp = helper->dtha_predicate) != NULL) {
14588 14595 dp = dtrace_difo_duplicate(dp, vstate);
14589 14596 new->dtha_predicate = dp;
14590 14597 }
14591 14598
14592 14599 new->dtha_nactions = helper->dtha_nactions;
14593 14600 sz = sizeof (dtrace_difo_t *) * new->dtha_nactions;
14594 14601 new->dtha_actions = kmem_alloc(sz, KM_SLEEP);
14595 14602
14596 14603 for (j = 0; j < new->dtha_nactions; j++) {
14597 14604 dtrace_difo_t *dp = helper->dtha_actions[j];
14598 14605
14599 14606 ASSERT(dp != NULL);
14600 14607 dp = dtrace_difo_duplicate(dp, vstate);
14601 14608 new->dtha_actions[j] = dp;
14602 14609 }
14603 14610
14604 14611 if (last != NULL) {
14605 14612 last->dtha_next = new;
14606 14613 } else {
14607 14614 newhelp->dthps_actions[i] = new;
14608 14615 }
14609 14616
14610 14617 last = new;
14611 14618 }
14612 14619 }
14613 14620
14614 14621 /*
14615 14622 * Duplicate the helper providers and register them with the
14616 14623 * DTrace framework.
14617 14624 */
14618 14625 if (help->dthps_nprovs > 0) {
14619 14626 newhelp->dthps_nprovs = help->dthps_nprovs;
14620 14627 newhelp->dthps_maxprovs = help->dthps_nprovs;
14621 14628 newhelp->dthps_provs = kmem_alloc(newhelp->dthps_nprovs *
14622 14629 sizeof (dtrace_helper_provider_t *), KM_SLEEP);
14623 14630 for (i = 0; i < newhelp->dthps_nprovs; i++) {
14624 14631 newhelp->dthps_provs[i] = help->dthps_provs[i];
14625 14632 newhelp->dthps_provs[i]->dthp_ref++;
14626 14633 }
14627 14634
14628 14635 hasprovs = 1;
14629 14636 }
14630 14637
14631 14638 mutex_exit(&dtrace_lock);
14632 14639
14633 14640 if (hasprovs)
14634 14641 dtrace_helper_provider_register(to, newhelp, NULL);
14635 14642 }
14636 14643
14637 14644 /*
14638 14645 * DTrace Hook Functions
14639 14646 */
14640 14647 static void
14641 14648 dtrace_module_loaded(struct modctl *ctl)
14642 14649 {
14643 14650 dtrace_provider_t *prv;
14644 14651
14645 14652 mutex_enter(&dtrace_provider_lock);
14646 14653 mutex_enter(&mod_lock);
14647 14654
14648 14655 ASSERT(ctl->mod_busy);
14649 14656
14650 14657 /*
14651 14658 * We're going to call each providers per-module provide operation
14652 14659 * specifying only this module.
14653 14660 */
14654 14661 for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
14655 14662 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
14656 14663
14657 14664 mutex_exit(&mod_lock);
14658 14665 mutex_exit(&dtrace_provider_lock);
14659 14666
14660 14667 /*
14661 14668 * If we have any retained enablings, we need to match against them.
14662 14669 * Enabling probes requires that cpu_lock be held, and we cannot hold
14663 14670 * cpu_lock here -- it is legal for cpu_lock to be held when loading a
14664 14671 * module. (In particular, this happens when loading scheduling
14665 14672 * classes.) So if we have any retained enablings, we need to dispatch
14666 14673 * our task queue to do the match for us.
14667 14674 */
14668 14675 mutex_enter(&dtrace_lock);
14669 14676
14670 14677 if (dtrace_retained == NULL) {
14671 14678 mutex_exit(&dtrace_lock);
14672 14679 return;
14673 14680 }
14674 14681
14675 14682 (void) taskq_dispatch(dtrace_taskq,
14676 14683 (task_func_t *)dtrace_enabling_matchall, NULL, TQ_SLEEP);
14677 14684
14678 14685 mutex_exit(&dtrace_lock);
14679 14686
14680 14687 /*
14681 14688 * And now, for a little heuristic sleaze: in general, we want to
14682 14689 * match modules as soon as they load. However, we cannot guarantee
14683 14690 * this, because it would lead us to the lock ordering violation
14684 14691 * outlined above. The common case, of course, is that cpu_lock is
14685 14692 * _not_ held -- so we delay here for a clock tick, hoping that that's
14686 14693 * long enough for the task queue to do its work. If it's not, it's
14687 14694 * not a serious problem -- it just means that the module that we
14688 14695 * just loaded may not be immediately instrumentable.
14689 14696 */
14690 14697 delay(1);
14691 14698 }
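
/*
 * The dispatch above defers the match because dtrace_enabling_matchall()
 * needs cpu_lock, which may already be held when a module loads. The
 * general pattern -- push the lock-hungry work to a task queue and let it
 * run from a context with no inherited locks -- looks like the sketch
 * below; match_cb and defer_match are hypothetical, while taskq_dispatch()
 * is the real illumos interface:
 */
#include <sys/taskq.h>

static void
match_cb(void *arg)
{
	/* Runs later from a taskq thread; free to take cpu_lock here. */
}

static void
defer_match(taskq_t *tq)
{
	(void) taskq_dispatch(tq, match_cb, NULL, TQ_SLEEP);
}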
14692 14699
14693 14700 static void
14694 14701 dtrace_module_unloaded(struct modctl *ctl)
14695 14702 {
14696 14703 dtrace_probe_t template, *probe, *first, *next;
14697 14704 dtrace_provider_t *prov;
14698 14705
14699 14706 template.dtpr_mod = ctl->mod_modname;
14700 14707
14701 14708 mutex_enter(&dtrace_provider_lock);
14702 14709 mutex_enter(&mod_lock);
14703 14710 mutex_enter(&dtrace_lock);
14704 14711
14705 14712 if (dtrace_bymod == NULL) {
14706 14713 /*
14707 14714 * The DTrace module is loaded (obviously) but not attached;
14708 14715 * we don't have any work to do.
14709 14716 */
14710 14717 mutex_exit(&dtrace_provider_lock);
14711 14718 mutex_exit(&mod_lock);
14712 14719 mutex_exit(&dtrace_lock);
14713 14720 return;
14714 14721 }
14715 14722
14716 14723 for (probe = first = dtrace_hash_lookup(dtrace_bymod, &template);
14717 14724 probe != NULL; probe = probe->dtpr_nextmod) {
14718 14725 if (probe->dtpr_ecb != NULL) {
14719 14726 mutex_exit(&dtrace_provider_lock);
14720 14727 mutex_exit(&mod_lock);
14721 14728 mutex_exit(&dtrace_lock);
14722 14729
14723 14730 /*
14724 14731 * This shouldn't _actually_ be possible -- we're
14725 14732 * unloading a module that has an enabled probe in it.
14726 14733 * (It's normally up to the provider to make sure that
14727 14734 * this can't happen.) However, because dtps_enable()
14728 14735 * doesn't have a failure mode, there can be an
14729 14736 * enable/unload race. Upshot: we don't want to
14730 14737 * assert, but we're not going to disable the
14731 14738 * probe, either.
14732 14739 */
14733 14740 if (dtrace_err_verbose) {
14734 14741 cmn_err(CE_WARN, "unloaded module '%s' had "
14735 14742 "enabled probes", ctl->mod_modname);
14736 14743 }
14737 14744
14738 14745 return;
14739 14746 }
14740 14747 }
14741 14748
14742 14749 probe = first;
14743 14750
14744 14751 for (first = NULL; probe != NULL; probe = next) {
14745 14752 ASSERT(dtrace_probes[probe->dtpr_id - 1] == probe);
14746 14753
14747 14754 dtrace_probes[probe->dtpr_id - 1] = NULL;
14748 14755
14749 14756 next = probe->dtpr_nextmod;
14750 14757 dtrace_hash_remove(dtrace_bymod, probe);
14751 14758 dtrace_hash_remove(dtrace_byfunc, probe);
14752 14759 dtrace_hash_remove(dtrace_byname, probe);
14753 14760
14754 14761 if (first == NULL) {
14755 14762 first = probe;
14756 14763 probe->dtpr_nextmod = NULL;
14757 14764 } else {
14758 14765 probe->dtpr_nextmod = first;
14759 14766 first = probe;
14760 14767 }
14761 14768 }
14762 14769
14763 14770 /*
14764 14771 * We've removed all of the module's probes from the hash chains and
14765 14772 * from the probe array. Now issue a dtrace_sync() to be sure that
14766 14773 * everyone has cleared out from any probe array processing.
14767 14774 */
14768 14775 dtrace_sync();
14769 14776
14770 14777 for (probe = first; probe != NULL; probe = first) {
14771 14778 first = probe->dtpr_nextmod;
14772 14779 prov = probe->dtpr_provider;
14773 14780 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
14774 14781 probe->dtpr_arg);
14775 14782 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
14776 14783 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
14777 14784 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
14778 14785 vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1);
14779 14786 kmem_free(probe, sizeof (dtrace_probe_t));
14780 14787 }
14781 14788
14782 14789 mutex_exit(&dtrace_lock);
14783 14790 mutex_exit(&mod_lock);
14784 14791 mutex_exit(&dtrace_provider_lock);
14785 14792 }
14786 14793
14787 14794 void
14788 14795 dtrace_suspend(void)
14789 14796 {
14790 14797 dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend));
14791 14798 }
14792 14799
14793 14800 void
14794 14801 dtrace_resume(void)
14795 14802 {
14796 14803 dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_resume));
14797 14804 }
14798 14805
14799 14806 static int
14800 14807 dtrace_cpu_setup(cpu_setup_t what, processorid_t cpu)
14801 14808 {
14802 14809 ASSERT(MUTEX_HELD(&cpu_lock));
14803 14810 mutex_enter(&dtrace_lock);
14804 14811
14805 14812 switch (what) {
14806 14813 case CPU_CONFIG: {
14807 14814 dtrace_state_t *state;
14808 14815 dtrace_optval_t *opt, rs, c;
14809 14816
14810 14817 /*
14811 14818 * For now, we only allocate a new buffer for anonymous state.
14812 14819 */
14813 14820 if ((state = dtrace_anon.dta_state) == NULL)
14814 14821 break;
14815 14822
14816 14823 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
14817 14824 break;
14818 14825
14819 14826 opt = state->dts_options;
14820 14827 c = opt[DTRACEOPT_CPU];
14821 14828
14822 14829 if (c != DTRACE_CPUALL && c != DTRACEOPT_UNSET && c != cpu)
14823 14830 break;
14824 14831
14825 14832 /*
14826 14833 * Regardless of what the actual policy is, we're going to
14827 14834 * temporarily set our resize policy to be manual. We're
14828 14835 * also going to temporarily set our CPU option to denote
14829 14836 * the newly configured CPU.
14830 14837 */
14831 14838 rs = opt[DTRACEOPT_BUFRESIZE];
14832 14839 opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_MANUAL;
14833 14840 opt[DTRACEOPT_CPU] = (dtrace_optval_t)cpu;
14834 14841
14835 14842 (void) dtrace_state_buffers(state);
14836 14843
14837 14844 opt[DTRACEOPT_BUFRESIZE] = rs;
14838 14845 opt[DTRACEOPT_CPU] = c;
14839 14846
14840 14847 break;
14841 14848 }
14842 14849
14843 14850 case CPU_UNCONFIG:
14844 14851 /*
14845 14852 * We don't free the buffer in the CPU_UNCONFIG case. (The
14846 14853 * buffer will be freed when the consumer exits.)
14847 14854 */
14848 14855 break;
14849 14856
14850 14857 default:
14851 14858 break;
14852 14859 }
14853 14860
14854 14861 mutex_exit(&dtrace_lock);
14855 14862 return (0);
14856 14863 }
14857 14864
14858 14865 static void
14859 14866 dtrace_cpu_setup_initial(processorid_t cpu)
14860 14867 {
14861 14868 (void) dtrace_cpu_setup(CPU_CONFIG, cpu);
14862 14869 }
14863 14870
14864 14871 static void
14865 14872 dtrace_toxrange_add(uintptr_t base, uintptr_t limit)
14866 14873 {
14867 14874 if (dtrace_toxranges >= dtrace_toxranges_max) {
14868 14875 int osize, nsize;
14869 14876 dtrace_toxrange_t *range;
14870 14877
14871 14878 osize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
14872 14879
14873 14880 if (osize == 0) {
14874 14881 ASSERT(dtrace_toxrange == NULL);
14875 14882 ASSERT(dtrace_toxranges_max == 0);
14876 14883 dtrace_toxranges_max = 1;
14877 14884 } else {
14878 14885 dtrace_toxranges_max <<= 1;
14879 14886 }
14880 14887
14881 14888 nsize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
14882 14889 range = kmem_zalloc(nsize, KM_SLEEP);
14883 14890
14884 14891 if (dtrace_toxrange != NULL) {
14885 14892 ASSERT(osize != 0);
14886 14893 bcopy(dtrace_toxrange, range, osize);
14887 14894 kmem_free(dtrace_toxrange, osize);
14888 14895 }
14889 14896
14890 14897 dtrace_toxrange = range;
14891 14898 }
14892 14899
14893 14900 ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_base == NULL);
14894 14901 ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_limit == NULL);
14895 14902
14896 14903 dtrace_toxrange[dtrace_toxranges].dtt_base = base;
14897 14904 dtrace_toxrange[dtrace_toxranges].dtt_limit = limit;
14898 14905 dtrace_toxranges++;
14899 14906 }
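
/*
 * EXAMPLE (editorial sketch; not part of this change): probe context
 * must refuse loads that overlap any range registered via
 * dtrace_toxrange_add(). The helper below is hypothetical -- the real
 * check lives in the probe-context load path -- but the overlap test
 * it performs is of this shape:
 */
static int
dtrace_toxrange_overlaps(uintptr_t addr, size_t sz)
{
	int i;

	for (i = 0; i < dtrace_toxranges; i++) {
		if (addr >= dtrace_toxrange[i].dtt_limit)
			continue;	/* toxic range entirely below us */

		if (addr + sz <= dtrace_toxrange[i].dtt_base)
			continue;	/* toxic range entirely above us */

		return (1);		/* [addr, addr + sz) is toxic */
	}

	return (0);
}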
14900 14907
14901 14908 static void
14902 14909 dtrace_getf_barrier()
14903 14910 {
14904 14911 /*
14905 14912 * When we have unprivileged (that is, non-DTRACE_CRV_KERNEL) enablings
14906 14913 * that contain calls to getf(), this routine will be called on every
14907 14914 * closef() before either the underlying vnode is released or the
14908 14915 * file_t itself is freed. By the time we are here, it is essential
14909 14916 * that the file_t can no longer be accessed from a call to getf()
14910 14917 * in probe context -- that assures that a dtrace_sync() can be used
14911 14918 * to clear out any enablings referring to the old structures.
14912 14919 */
14913 14920 if (curthread->t_procp->p_zone->zone_dtrace_getf != 0 ||
14914 14921 kcred->cr_zone->zone_dtrace_getf != 0)
14915 14922 dtrace_sync();
14916 14923 }
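
/*
 * EXAMPLE (editorial sketch; not part of this change): the caller's
 * side of the contract described above. The VFS layer invokes the
 * barrier through the dtrace_closef hook before the file_t is freed;
 * the function below is schematic, not the actual closef():
 */
static void
closef_sketch(file_t *fp)
{
	/*
	 * By now fp has been removed from the fd table, so no new
	 * getf() can return it -- but probe context may still hold a
	 * reference obtained before the removal.
	 */
	if (dtrace_closef != NULL)
		(*dtrace_closef)();	/* dtrace_getf_barrier() */

	/* ... only now is it safe to release the vnode and free fp ... */
}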
14917 14924
14918 14925 /*
14919 14926 * DTrace Driver Cookbook Functions
14920 14927 */
14921 14928 /*ARGSUSED*/
14922 14929 static int
14923 14930 dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
14924 14931 {
14925 14932 dtrace_provider_id_t id;
14926 14933 dtrace_state_t *state = NULL;
14927 14934 dtrace_enabling_t *enab;
14928 14935
14929 14936 mutex_enter(&cpu_lock);
14930 14937 mutex_enter(&dtrace_provider_lock);
14931 14938 mutex_enter(&dtrace_lock);
14932 14939
14933 14940 if (ddi_soft_state_init(&dtrace_softstate,
14934 14941 sizeof (dtrace_state_t), 0) != 0) {
14935 14942 cmn_err(CE_NOTE, "/dev/dtrace failed to initialize soft state");
14936 14943 mutex_exit(&cpu_lock);
14937 14944 mutex_exit(&dtrace_provider_lock);
14938 14945 mutex_exit(&dtrace_lock);
14939 14946 return (DDI_FAILURE);
14940 14947 }
14941 14948
14942 14949 if (ddi_create_minor_node(devi, DTRACEMNR_DTRACE, S_IFCHR,
14943 14950 DTRACEMNRN_DTRACE, DDI_PSEUDO, NULL) == DDI_FAILURE ||
14944 14951 ddi_create_minor_node(devi, DTRACEMNR_HELPER, S_IFCHR,
14945 14952 DTRACEMNRN_HELPER, DDI_PSEUDO, NULL) == DDI_FAILURE) {
14946 14953 cmn_err(CE_NOTE, "/dev/dtrace couldn't create minor nodes");
14947 14954 ddi_remove_minor_node(devi, NULL);
14948 14955 ddi_soft_state_fini(&dtrace_softstate);
14949 14956 mutex_exit(&cpu_lock);
14950 14957 mutex_exit(&dtrace_provider_lock);
14951 14958 mutex_exit(&dtrace_lock);
14952 14959 return (DDI_FAILURE);
14953 14960 }
14954 14961
14955 14962 ddi_report_dev(devi);
14956 14963 dtrace_devi = devi;
14957 14964
14958 14965 dtrace_modload = dtrace_module_loaded;
14959 14966 dtrace_modunload = dtrace_module_unloaded;
14960 14967 dtrace_cpu_init = dtrace_cpu_setup_initial;
14961 14968 dtrace_helpers_cleanup = dtrace_helpers_destroy;
14962 14969 dtrace_helpers_fork = dtrace_helpers_duplicate;
14963 14970 dtrace_cpustart_init = dtrace_suspend;
14964 14971 dtrace_cpustart_fini = dtrace_resume;
14965 14972 dtrace_debugger_init = dtrace_suspend;
14966 14973 dtrace_debugger_fini = dtrace_resume;
14967 14974
14968 14975 register_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
14969 14976
14970 14977 ASSERT(MUTEX_HELD(&cpu_lock));
14971 14978
14972 14979 dtrace_arena = vmem_create("dtrace", (void *)1, UINT32_MAX, 1,
14973 14980 NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
14974 14981 dtrace_minor = vmem_create("dtrace_minor", (void *)DTRACEMNRN_CLONE,
14975 14982 UINT32_MAX - DTRACEMNRN_CLONE, 1, NULL, NULL, NULL, 0,
14976 14983 VM_SLEEP | VMC_IDENTIFIER);
14977 14984 dtrace_taskq = taskq_create("dtrace_taskq", 1, maxclsyspri,
14978 14985 1, INT_MAX, 0);
14979 14986
14980 14987 dtrace_state_cache = kmem_cache_create("dtrace_state_cache",
14981 14988 sizeof (dtrace_dstate_percpu_t) * NCPU, DTRACE_STATE_ALIGN,
14982 14989 NULL, NULL, NULL, NULL, NULL, 0);
14983 14990
14984 14991 ASSERT(MUTEX_HELD(&cpu_lock));
14985 14992 dtrace_bymod = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_mod),
14986 14993 offsetof(dtrace_probe_t, dtpr_nextmod),
14987 14994 offsetof(dtrace_probe_t, dtpr_prevmod));
14988 14995
14989 14996 dtrace_byfunc = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_func),
14990 14997 offsetof(dtrace_probe_t, dtpr_nextfunc),
14991 14998 offsetof(dtrace_probe_t, dtpr_prevfunc));
14992 14999
14993 15000 dtrace_byname = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_name),
14994 15001 offsetof(dtrace_probe_t, dtpr_nextname),
14995 15002 offsetof(dtrace_probe_t, dtpr_prevname));
14996 15003
14997 15004 if (dtrace_retain_max < 1) {
14998 15005 cmn_err(CE_WARN, "illegal value (%lu) for dtrace_retain_max; "
14999 15006 "setting to 1", dtrace_retain_max);
15000 15007 dtrace_retain_max = 1;
15001 15008 }
15002 15009
15003 15010 /*
15004 15011 * Now discover our toxic ranges.
15005 15012 */
15006 15013 dtrace_toxic_ranges(dtrace_toxrange_add);
15007 15014
15008 15015 /*
15009 15016 * Before we register ourselves as a provider to our own framework,
15010 15017 * we would like to assert that dtrace_provider is NULL -- but that's
15011 15018 * not true if we were loaded as a dependency of a DTrace provider.
15012 15019 * Once we've registered, we can assert that dtrace_provider is our
15013 15020 * pseudo provider.
15014 15021 */
15015 15022 (void) dtrace_register("dtrace", &dtrace_provider_attr,
15016 15023 DTRACE_PRIV_NONE, 0, &dtrace_provider_ops, NULL, &id);
15017 15024
15018 15025 ASSERT(dtrace_provider != NULL);
15019 15026 ASSERT((dtrace_provider_id_t)dtrace_provider == id);
15020 15027
15021 15028 dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
15022 15029 dtrace_provider, NULL, NULL, "BEGIN", 0, NULL);
15023 15030 dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
15024 15031 dtrace_provider, NULL, NULL, "END", 0, NULL);
15025 15032 dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
15026 15033 dtrace_provider, NULL, NULL, "ERROR", 1, NULL);
15027 15034
15028 15035 dtrace_anon_property();
15029 15036 mutex_exit(&cpu_lock);
15030 15037
15031 15038 /*
15032 15039 * If DTrace helper tracing is enabled, we need to allocate the
15033 15040 * trace buffer and initialize the values.
15034 15041 */
15035 15042 if (dtrace_helptrace_enabled) {
15036 15043 ASSERT(dtrace_helptrace_buffer == NULL);
15037 15044 dtrace_helptrace_buffer =
15038 15045 kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
15039 15046 dtrace_helptrace_next = 0;
15040 15047 }
15041 15048
15042 15049 /*
15043 15050 * If there are already providers, we must ask them to provide their
15044 15051 * probes, and then match any anonymous enabling against them. Note
15045 15052 * that there should be no other retained enablings at this time:
15046 15053 	 * the only retained enabling should be the anonymous
15047 15054 * enabling.
15048 15055 */
15049 15056 if (dtrace_anon.dta_enabling != NULL) {
15050 15057 ASSERT(dtrace_retained == dtrace_anon.dta_enabling);
15051 15058
15052 15059 dtrace_enabling_provide(NULL);
15053 15060 state = dtrace_anon.dta_state;
15054 15061
15055 15062 /*
15056 15063 * We couldn't hold cpu_lock across the above call to
15057 15064 * dtrace_enabling_provide(), but we must hold it to actually
15058 15065 * enable the probes. We have to drop all of our locks, pick
15059 15066 * up cpu_lock, and regain our locks before matching the
15060 15067 * retained anonymous enabling.
15061 15068 */
15062 15069 mutex_exit(&dtrace_lock);
15063 15070 mutex_exit(&dtrace_provider_lock);
15064 15071
15065 15072 mutex_enter(&cpu_lock);
15066 15073 mutex_enter(&dtrace_provider_lock);
15067 15074 mutex_enter(&dtrace_lock);
15068 15075
15069 15076 if ((enab = dtrace_anon.dta_enabling) != NULL)
15070 15077 (void) dtrace_enabling_match(enab, NULL);
15071 15078
15072 15079 mutex_exit(&cpu_lock);
15073 15080 }
15074 15081
15075 15082 mutex_exit(&dtrace_lock);
15076 15083 mutex_exit(&dtrace_provider_lock);
15077 15084
15078 15085 if (state != NULL) {
15079 15086 /*
15080 15087 * If we created any anonymous state, set it going now.
15081 15088 */
15082 15089 (void) dtrace_state_go(state, &dtrace_anon.dta_beganon);
15083 15090 }
15084 15091
15085 15092 return (DDI_SUCCESS);
15086 15093 }
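
/*
 * EXAMPLE (editorial sketch; not part of this change): the lock
 * ordering that dtrace_attach() and the rest of this file observe.
 * cpu_lock is taken before dtrace_provider_lock, which is taken
 * before dtrace_lock; this ordering is why the anonymous-enabling
 * path above must drop everything and start over to pick up cpu_lock:
 */
static void
dtrace_lock_order_sketch(void)
{
	mutex_enter(&cpu_lock);
	mutex_enter(&dtrace_provider_lock);
	mutex_enter(&dtrace_lock);

	/* ... work requiring full framework exclusion ... */

	mutex_exit(&dtrace_lock);
	mutex_exit(&dtrace_provider_lock);
	mutex_exit(&cpu_lock);
}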
15087 15094
15088 15095 /*ARGSUSED*/
15089 15096 static int
15090 15097 dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
15091 15098 {
15092 15099 dtrace_state_t *state;
15093 15100 uint32_t priv;
15094 15101 uid_t uid;
15095 15102 zoneid_t zoneid;
15096 15103
15097 15104 if (getminor(*devp) == DTRACEMNRN_HELPER)
15098 15105 return (0);
15099 15106
15100 15107 /*
15101 15108 * If this wasn't an open with the "helper" minor, then it must be
15102 15109 * the "dtrace" minor.
15103 15110 */
15104 15111 if (getminor(*devp) != DTRACEMNRN_DTRACE)
15105 15112 return (ENXIO);
15106 15113
15107 15114 /*
15108 15115 * If no DTRACE_PRIV_* bits are set in the credential, then the
15109 15116 * caller lacks sufficient permission to do anything with DTrace.
15110 15117 */
15111 15118 dtrace_cred2priv(cred_p, &priv, &uid, &zoneid);
15112 15119 if (priv == DTRACE_PRIV_NONE)
15113 15120 return (EACCES);
15114 15121
15115 15122 /*
15116 15123 * Ask all providers to provide all their probes.
15117 15124 */
15118 15125 mutex_enter(&dtrace_provider_lock);
15119 15126 dtrace_probe_provide(NULL, NULL);
15120 15127 mutex_exit(&dtrace_provider_lock);
15121 15128
15122 15129 mutex_enter(&cpu_lock);
15123 15130 mutex_enter(&dtrace_lock);
15124 15131 dtrace_opens++;
15125 15132 dtrace_membar_producer();
15126 15133
15127 15134 /*
15128 15135 * If the kernel debugger is active (that is, if the kernel debugger
15129 15136 * modified text in some way), we won't allow the open.
15130 15137 */
15131 15138 if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
15132 15139 dtrace_opens--;
15133 15140 mutex_exit(&cpu_lock);
15134 15141 mutex_exit(&dtrace_lock);
15135 15142 return (EBUSY);
15136 15143 }
15137 15144
15138 15145 state = dtrace_state_create(devp, cred_p);
15139 15146 mutex_exit(&cpu_lock);
15140 15147
15141 15148 if (state == NULL) {
15142 15149 if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
15143 15150 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
15144 15151 mutex_exit(&dtrace_lock);
15145 15152 return (EAGAIN);
15146 15153 }
15147 15154
15148 15155 mutex_exit(&dtrace_lock);
15149 15156
15150 15157 return (0);
15151 15158 }
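
/*
 * EXAMPLE (editorial, illustrative userland sketch; not part of this
 * change): each open of the "dtrace" minor creates a private
 * dtrace_state_t via dtrace_state_create(). Consumers normally reach
 * the device through libdtrace rather than opening it directly:
 *
 *	int fd = open("/dev/dtrace/dtrace", O_RDWR);
 *
 * The open fails with EACCES if the credential carries no
 * DTRACE_PRIV_* bits, and with EBUSY if the kernel debugger has
 * modified kernel text.
 */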
15152 15159
15153 15160 /*ARGSUSED*/
15154 15161 static int
15155 15162 dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
15156 15163 {
15157 15164 minor_t minor = getminor(dev);
15158 15165 dtrace_state_t *state;
15159 15166
15160 15167 if (minor == DTRACEMNRN_HELPER)
15161 15168 return (0);
15162 15169
15163 15170 state = ddi_get_soft_state(dtrace_softstate, minor);
15164 15171
15165 15172 mutex_enter(&cpu_lock);
15166 15173 mutex_enter(&dtrace_lock);
15167 15174
15168 15175 if (state->dts_anon) {
15169 15176 /*
15170 15177 * There is anonymous state. Destroy that first.
15171 15178 */
15172 15179 ASSERT(dtrace_anon.dta_state == NULL);
15173 15180 dtrace_state_destroy(state->dts_anon);
15174 15181 }
15175 15182
15176 15183 dtrace_state_destroy(state);
15177 15184 ASSERT(dtrace_opens > 0);
15178 15185
15179 15186 /*
15180 15187 * Only relinquish control of the kernel debugger interface when there
15181 15188 * are no consumers and no anonymous enablings.
15182 15189 */
15183 15190 if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
15184 15191 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
15185 15192
15186 15193 mutex_exit(&dtrace_lock);
15187 15194 mutex_exit(&cpu_lock);
15188 15195
15189 15196 return (0);
15190 15197 }
15191 15198
15192 15199 /*ARGSUSED*/
15193 15200 static int
15194 15201 dtrace_ioctl_helper(int cmd, intptr_t arg, int *rv)
15195 15202 {
15196 15203 int rval;
15197 15204 dof_helper_t help, *dhp = NULL;
15198 15205
15199 15206 switch (cmd) {
15200 15207 case DTRACEHIOC_ADDDOF:
15201 15208 if (copyin((void *)arg, &help, sizeof (help)) != 0) {
15202 15209 dtrace_dof_error(NULL, "failed to copyin DOF helper");
15203 15210 return (EFAULT);
15204 15211 }
15205 15212
15206 15213 dhp = &help;
15207 15214 arg = (intptr_t)help.dofhp_dof;
15208 15215 /*FALLTHROUGH*/
15209 15216
15210 15217 case DTRACEHIOC_ADD: {
15211 15218 dof_hdr_t *dof = dtrace_dof_copyin(arg, &rval);
15212 15219
15213 15220 if (dof == NULL)
15214 15221 return (rval);
15215 15222
15216 15223 mutex_enter(&dtrace_lock);
15217 15224
15218 15225 /*
15219 15226 * dtrace_helper_slurp() takes responsibility for the dof --
15220 15227 * it may free it now or it may save it and free it later.
15221 15228 */
15222 15229 if ((rval = dtrace_helper_slurp(dof, dhp)) != -1) {
15223 15230 *rv = rval;
15224 15231 rval = 0;
15225 15232 } else {
15226 15233 rval = EINVAL;
15227 15234 }
15228 15235
15229 15236 mutex_exit(&dtrace_lock);
15230 15237 return (rval);
15231 15238 }
15232 15239
15233 15240 case DTRACEHIOC_REMOVE: {
15234 15241 mutex_enter(&dtrace_lock);
15235 15242 rval = dtrace_helper_destroygen(arg);
15236 15243 mutex_exit(&dtrace_lock);
15237 15244
15238 15245 return (rval);
15239 15246 }
15240 15247
15241 15248 default:
15242 15249 break;
15243 15250 }
15244 15251
15245 15252 return (ENOTTY);
15246 15253 }
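
/*
 * EXAMPLE (editorial, illustrative userland sketch; not part of this
 * change): a helper is registered on the "helper" minor by passing a
 * dof_helper_t whose dofhp_dof points at the DOF image. All names
 * below are hypothetical:
 *
 *	dof_helper_t dh;
 *
 *	(void) strlcpy(dh.dofhp_mod, "a.out", sizeof (dh.dofhp_mod));
 *	dh.dofhp_addr = (uint64_t)(uintptr_t)mapped_base;
 *	dh.dofhp_dof = (uint64_t)(uintptr_t)dof;
 *	gen = ioctl(helper_fd, DTRACEHIOC_ADDDOF, &dh);
 *
 * The generation number returned through *rv may later be handed to
 * DTRACEHIOC_REMOVE to unregister the helper.
 */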
15247 15254
15248 15255 /*ARGSUSED*/
15249 15256 static int
15250 15257 dtrace_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
15251 15258 {
15252 15259 minor_t minor = getminor(dev);
15253 15260 dtrace_state_t *state;
15254 15261 int rval;
15255 15262
15256 15263 if (minor == DTRACEMNRN_HELPER)
15257 15264 return (dtrace_ioctl_helper(cmd, arg, rv));
15258 15265
15259 15266 state = ddi_get_soft_state(dtrace_softstate, minor);
15260 15267
15261 15268 if (state->dts_anon) {
15262 15269 ASSERT(dtrace_anon.dta_state == NULL);
15263 15270 state = state->dts_anon;
15264 15271 }
15265 15272
15266 15273 switch (cmd) {
15267 15274 case DTRACEIOC_PROVIDER: {
15268 15275 dtrace_providerdesc_t pvd;
15269 15276 dtrace_provider_t *pvp;
15270 15277
15271 15278 if (copyin((void *)arg, &pvd, sizeof (pvd)) != 0)
15272 15279 return (EFAULT);
15273 15280
15274 15281 pvd.dtvd_name[DTRACE_PROVNAMELEN - 1] = '\0';
15275 15282 mutex_enter(&dtrace_provider_lock);
15276 15283
15277 15284 for (pvp = dtrace_provider; pvp != NULL; pvp = pvp->dtpv_next) {
15278 15285 if (strcmp(pvp->dtpv_name, pvd.dtvd_name) == 0)
15279 15286 break;
15280 15287 }
15281 15288
15282 15289 mutex_exit(&dtrace_provider_lock);
15283 15290
15284 15291 if (pvp == NULL)
15285 15292 return (ESRCH);
15286 15293
15287 15294 bcopy(&pvp->dtpv_priv, &pvd.dtvd_priv, sizeof (dtrace_ppriv_t));
15288 15295 bcopy(&pvp->dtpv_attr, &pvd.dtvd_attr, sizeof (dtrace_pattr_t));
15289 15296 if (copyout(&pvd, (void *)arg, sizeof (pvd)) != 0)
15290 15297 return (EFAULT);
15291 15298
15292 15299 return (0);
15293 15300 }
15294 15301
15295 15302 case DTRACEIOC_EPROBE: {
15296 15303 dtrace_eprobedesc_t epdesc;
15297 15304 dtrace_ecb_t *ecb;
15298 15305 dtrace_action_t *act;
15299 15306 void *buf;
15300 15307 size_t size;
15301 15308 uintptr_t dest;
15302 15309 int nrecs;
15303 15310
15304 15311 if (copyin((void *)arg, &epdesc, sizeof (epdesc)) != 0)
15305 15312 return (EFAULT);
15306 15313
15307 15314 mutex_enter(&dtrace_lock);
15308 15315
15309 15316 if ((ecb = dtrace_epid2ecb(state, epdesc.dtepd_epid)) == NULL) {
15310 15317 mutex_exit(&dtrace_lock);
15311 15318 return (EINVAL);
15312 15319 }
15313 15320
15314 15321 if (ecb->dte_probe == NULL) {
15315 15322 mutex_exit(&dtrace_lock);
15316 15323 return (EINVAL);
15317 15324 }
15318 15325
15319 15326 epdesc.dtepd_probeid = ecb->dte_probe->dtpr_id;
15320 15327 epdesc.dtepd_uarg = ecb->dte_uarg;
15321 15328 epdesc.dtepd_size = ecb->dte_size;
15322 15329
15323 15330 nrecs = epdesc.dtepd_nrecs;
15324 15331 epdesc.dtepd_nrecs = 0;
15325 15332 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
15326 15333 if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
15327 15334 continue;
15328 15335
15329 15336 epdesc.dtepd_nrecs++;
15330 15337 }
15331 15338
15332 15339 /*
15333 15340 * Now that we have the size, we need to allocate a temporary
15334 15341 * buffer in which to store the complete description. We need
15335 15342 		 * the temporary buffer to be able to drop dtrace_lock
15336 15343 * across the copyout(), below.
15337 15344 */
15338 15345 size = sizeof (dtrace_eprobedesc_t) +
15339 15346 (epdesc.dtepd_nrecs * sizeof (dtrace_recdesc_t));
15340 15347
15341 15348 buf = kmem_alloc(size, KM_SLEEP);
15342 15349 dest = (uintptr_t)buf;
15343 15350
15344 15351 bcopy(&epdesc, (void *)dest, sizeof (epdesc));
15345 15352 dest += offsetof(dtrace_eprobedesc_t, dtepd_rec[0]);
15346 15353
15347 15354 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
15348 15355 if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
15349 15356 continue;
15350 15357
15351 15358 if (nrecs-- == 0)
15352 15359 break;
15353 15360
15354 15361 bcopy(&act->dta_rec, (void *)dest,
15355 15362 sizeof (dtrace_recdesc_t));
15356 15363 dest += sizeof (dtrace_recdesc_t);
15357 15364 }
15358 15365
15359 15366 mutex_exit(&dtrace_lock);
15360 15367
15361 15368 if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
15362 15369 kmem_free(buf, size);
15363 15370 return (EFAULT);
15364 15371 }
15365 15372
15366 15373 kmem_free(buf, size);
15367 15374 return (0);
15368 15375 }
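
	/*
	 * EXAMPLE (editorial, illustrative userland sketch; not part of
	 * this change): the caller advertises its record capacity in
	 * dtepd_nrecs; the kernel rewrites that field to the true count
	 * and copies out at most the advertised number of
	 * dtrace_recdesc_t records. Names are hypothetical:
	 *
	 *	size_t sz = sizeof (dtrace_eprobedesc_t) +
	 *	    (nrecs - 1) * sizeof (dtrace_recdesc_t);
	 *	dtrace_eprobedesc_t *epd = malloc(sz);
	 *
	 *	epd->dtepd_epid = epid;
	 *	epd->dtepd_nrecs = nrecs;
	 *
	 *	if (ioctl(fd, DTRACEIOC_EPROBE, epd) == -1)
	 *		...
	 */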
15369 15376
15370 15377 case DTRACEIOC_AGGDESC: {
15371 15378 dtrace_aggdesc_t aggdesc;
15372 15379 dtrace_action_t *act;
15373 15380 dtrace_aggregation_t *agg;
15374 15381 int nrecs;
15375 15382 uint32_t offs;
15376 15383 dtrace_recdesc_t *lrec;
15377 15384 void *buf;
15378 15385 size_t size;
15379 15386 uintptr_t dest;
15380 15387
15381 15388 if (copyin((void *)arg, &aggdesc, sizeof (aggdesc)) != 0)
15382 15389 return (EFAULT);
15383 15390
15384 15391 mutex_enter(&dtrace_lock);
15385 15392
15386 15393 if ((agg = dtrace_aggid2agg(state, aggdesc.dtagd_id)) == NULL) {
15387 15394 mutex_exit(&dtrace_lock);
15388 15395 return (EINVAL);
15389 15396 }
15390 15397
15391 15398 aggdesc.dtagd_epid = agg->dtag_ecb->dte_epid;
15392 15399
15393 15400 nrecs = aggdesc.dtagd_nrecs;
15394 15401 aggdesc.dtagd_nrecs = 0;
15395 15402
15396 15403 offs = agg->dtag_base;
15397 15404 lrec = &agg->dtag_action.dta_rec;
15398 15405 aggdesc.dtagd_size = lrec->dtrd_offset + lrec->dtrd_size - offs;
15399 15406
15400 15407 for (act = agg->dtag_first; ; act = act->dta_next) {
15401 15408 ASSERT(act->dta_intuple ||
15402 15409 DTRACEACT_ISAGG(act->dta_kind));
15403 15410
15404 15411 /*
15405 15412 * If this action has a record size of zero, it
15406 15413 * denotes an argument to the aggregating action.
15407 15414 * Because the presence of this record doesn't (or
15408 15415 * shouldn't) affect the way the data is interpreted,
15409 15416 * we don't copy it out to save user-level the
15410 15417 * confusion of dealing with a zero-length record.
15411 15418 */
15412 15419 if (act->dta_rec.dtrd_size == 0) {
15413 15420 ASSERT(agg->dtag_hasarg);
15414 15421 continue;
15415 15422 }
15416 15423
15417 15424 aggdesc.dtagd_nrecs++;
15418 15425
15419 15426 if (act == &agg->dtag_action)
15420 15427 break;
15421 15428 }
15422 15429
15423 15430 /*
15424 15431 * Now that we have the size, we need to allocate a temporary
15425 15432 * buffer in which to store the complete description. We need
15426 15433 		 * the temporary buffer to be able to drop dtrace_lock
15427 15434 * across the copyout(), below.
15428 15435 */
15429 15436 size = sizeof (dtrace_aggdesc_t) +
15430 15437 (aggdesc.dtagd_nrecs * sizeof (dtrace_recdesc_t));
15431 15438
15432 15439 buf = kmem_alloc(size, KM_SLEEP);
15433 15440 dest = (uintptr_t)buf;
15434 15441
15435 15442 bcopy(&aggdesc, (void *)dest, sizeof (aggdesc));
15436 15443 dest += offsetof(dtrace_aggdesc_t, dtagd_rec[0]);
15437 15444
15438 15445 for (act = agg->dtag_first; ; act = act->dta_next) {
15439 15446 dtrace_recdesc_t rec = act->dta_rec;
15440 15447
15441 15448 /*
15442 15449 * See the comment in the above loop for why we pass
15443 15450 * over zero-length records.
15444 15451 */
15445 15452 if (rec.dtrd_size == 0) {
15446 15453 ASSERT(agg->dtag_hasarg);
15447 15454 continue;
15448 15455 }
15449 15456
15450 15457 if (nrecs-- == 0)
15451 15458 break;
15452 15459
15453 15460 rec.dtrd_offset -= offs;
15454 15461 bcopy(&rec, (void *)dest, sizeof (rec));
15455 15462 dest += sizeof (dtrace_recdesc_t);
15456 15463
15457 15464 if (act == &agg->dtag_action)
15458 15465 break;
15459 15466 }
15460 15467
15461 15468 mutex_exit(&dtrace_lock);
15462 15469
15463 15470 if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
15464 15471 kmem_free(buf, size);
15465 15472 return (EFAULT);
15466 15473 }
15467 15474
15468 15475 kmem_free(buf, size);
15469 15476 return (0);
15470 15477 }
15471 15478
15472 15479 case DTRACEIOC_ENABLE: {
15473 15480 dof_hdr_t *dof;
15474 15481 dtrace_enabling_t *enab = NULL;
15475 15482 dtrace_vstate_t *vstate;
15476 15483 int err = 0;
15477 15484
15478 15485 *rv = 0;
15479 15486
15480 15487 /*
15481 15488 * If a NULL argument has been passed, we take this as our
15482 15489 * cue to reevaluate our enablings.
15483 15490 */
15484 15491 if (arg == NULL) {
15485 15492 dtrace_enabling_matchall();
15486 15493
15487 15494 return (0);
15488 15495 }
15489 15496
15490 15497 if ((dof = dtrace_dof_copyin(arg, &rval)) == NULL)
15491 15498 return (rval);
15492 15499
15493 15500 mutex_enter(&cpu_lock);
15494 15501 mutex_enter(&dtrace_lock);
15495 15502 vstate = &state->dts_vstate;
15496 15503
15497 15504 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
15498 15505 mutex_exit(&dtrace_lock);
15499 15506 mutex_exit(&cpu_lock);
15500 15507 dtrace_dof_destroy(dof);
15501 15508 return (EBUSY);
15502 15509 }
15503 15510
15504 15511 if (dtrace_dof_slurp(dof, vstate, cr, &enab, 0, B_TRUE) != 0) {
15505 15512 mutex_exit(&dtrace_lock);
15506 15513 mutex_exit(&cpu_lock);
15507 15514 dtrace_dof_destroy(dof);
15508 15515 return (EINVAL);
15509 15516 }
15510 15517
15511 15518 if ((rval = dtrace_dof_options(dof, state)) != 0) {
15512 15519 dtrace_enabling_destroy(enab);
15513 15520 mutex_exit(&dtrace_lock);
15514 15521 mutex_exit(&cpu_lock);
15515 15522 dtrace_dof_destroy(dof);
15516 15523 return (rval);
15517 15524 }
15518 15525
15519 15526 if ((err = dtrace_enabling_match(enab, rv)) == 0) {
15520 15527 err = dtrace_enabling_retain(enab);
15521 15528 } else {
15522 15529 dtrace_enabling_destroy(enab);
15523 15530 }
15524 15531
15525 15532 mutex_exit(&cpu_lock);
15526 15533 mutex_exit(&dtrace_lock);
15527 15534 dtrace_dof_destroy(dof);
15528 15535
15529 15536 return (err);
15530 15537 }
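
	/*
	 * EXAMPLE (editorial, illustrative userland sketch; not part of
	 * this change): the two uses of DTRACEIOC_ENABLE seen above.
	 * Passing a pointer to a DOF image enables the ECBs that it
	 * describes, with the number of matched probes returned through
	 * *rv; passing NULL is the cue to reevaluate all retained
	 * enablings:
	 *
	 *	nmatched = ioctl(fd, DTRACEIOC_ENABLE, dof);
	 *	(void) ioctl(fd, DTRACEIOC_ENABLE, NULL);
	 */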
15531 15538
15532 15539 case DTRACEIOC_REPLICATE: {
15533 15540 dtrace_repldesc_t desc;
15534 15541 dtrace_probedesc_t *match = &desc.dtrpd_match;
15535 15542 dtrace_probedesc_t *create = &desc.dtrpd_create;
15536 15543 int err;
15537 15544
15538 15545 if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
15539 15546 return (EFAULT);
15540 15547
15541 15548 match->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
15542 15549 match->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
15543 15550 match->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
15544 15551 match->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
15545 15552
15546 15553 create->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
15547 15554 create->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
15548 15555 create->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
15549 15556 create->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
15550 15557
15551 15558 mutex_enter(&dtrace_lock);
15552 15559 err = dtrace_enabling_replicate(state, match, create);
15553 15560 mutex_exit(&dtrace_lock);
15554 15561
15555 15562 return (err);
15556 15563 }
15557 15564
15558 15565 case DTRACEIOC_PROBEMATCH:
15559 15566 case DTRACEIOC_PROBES: {
15560 15567 dtrace_probe_t *probe = NULL;
15561 15568 dtrace_probedesc_t desc;
15562 15569 dtrace_probekey_t pkey;
15563 15570 dtrace_id_t i;
15564 15571 int m = 0;
15565 15572 uint32_t priv;
15566 15573 uid_t uid;
15567 15574 zoneid_t zoneid;
15568 15575
15569 15576 if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
15570 15577 return (EFAULT);
15571 15578
15572 15579 desc.dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
15573 15580 desc.dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
15574 15581 desc.dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
15575 15582 desc.dtpd_name[DTRACE_NAMELEN - 1] = '\0';
15576 15583
15577 15584 /*
15578 15585 * Before we attempt to match this probe, we want to give
15579 15586 * all providers the opportunity to provide it.
15580 15587 */
15581 15588 if (desc.dtpd_id == DTRACE_IDNONE) {
15582 15589 mutex_enter(&dtrace_provider_lock);
15583 15590 dtrace_probe_provide(&desc, NULL);
15584 15591 mutex_exit(&dtrace_provider_lock);
15585 15592 desc.dtpd_id++;
15586 15593 }
15587 15594
15588 15595 if (cmd == DTRACEIOC_PROBEMATCH) {
15589 15596 dtrace_probekey(&desc, &pkey);
15590 15597 pkey.dtpk_id = DTRACE_IDNONE;
15591 15598 }
15592 15599
15593 15600 dtrace_cred2priv(cr, &priv, &uid, &zoneid);
15594 15601
15595 15602 mutex_enter(&dtrace_lock);
15596 15603
15597 15604 if (cmd == DTRACEIOC_PROBEMATCH) {
15598 15605 for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
15599 15606 if ((probe = dtrace_probes[i - 1]) != NULL &&
15600 15607 (m = dtrace_match_probe(probe, &pkey,
15601 15608 priv, uid, zoneid)) != 0)
15602 15609 break;
15603 15610 }
15604 15611
15605 15612 if (m < 0) {
15606 15613 mutex_exit(&dtrace_lock);
15607 15614 return (EINVAL);
15608 15615 }
15609 15616
15610 15617 } else {
15611 15618 for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
15612 15619 if ((probe = dtrace_probes[i - 1]) != NULL &&
15613 15620 dtrace_match_priv(probe, priv, uid, zoneid))
15614 15621 break;
15615 15622 }
15616 15623 }
15617 15624
15618 15625 if (probe == NULL) {
15619 15626 mutex_exit(&dtrace_lock);
15620 15627 return (ESRCH);
15621 15628 }
15622 15629
15623 15630 dtrace_probe_description(probe, &desc);
15624 15631 mutex_exit(&dtrace_lock);
15625 15632
15626 15633 if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
15627 15634 return (EFAULT);
15628 15635
15629 15636 return (0);
15630 15637 }
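
	/*
	 * EXAMPLE (editorial, illustrative userland sketch; not part of
	 * this change): probes are enumerated by reissuing
	 * DTRACEIOC_PROBES with dtpd_id set one past the previously
	 * returned probe, until the kernel answers ESRCH:
	 *
	 *	dtrace_probedesc_t pd;
	 *
	 *	bzero(&pd, sizeof (pd));
	 *
	 *	while (ioctl(fd, DTRACEIOC_PROBES, &pd) != -1) {
	 *		...
	 *		pd.dtpd_id++;
	 *	}
	 */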
15631 15638
15632 15639 case DTRACEIOC_PROBEARG: {
15633 15640 dtrace_argdesc_t desc;
15634 15641 dtrace_probe_t *probe;
15635 15642 dtrace_provider_t *prov;
15636 15643
15637 15644 if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
15638 15645 return (EFAULT);
15639 15646
15640 15647 if (desc.dtargd_id == DTRACE_IDNONE)
15641 15648 return (EINVAL);
15642 15649
15643 15650 if (desc.dtargd_ndx == DTRACE_ARGNONE)
15644 15651 return (EINVAL);
15645 15652
15646 15653 mutex_enter(&dtrace_provider_lock);
15647 15654 mutex_enter(&mod_lock);
15648 15655 mutex_enter(&dtrace_lock);
15649 15656
15650 15657 if (desc.dtargd_id > dtrace_nprobes) {
15651 15658 mutex_exit(&dtrace_lock);
15652 15659 mutex_exit(&mod_lock);
15653 15660 mutex_exit(&dtrace_provider_lock);
15654 15661 return (EINVAL);
15655 15662 }
15656 15663
15657 15664 if ((probe = dtrace_probes[desc.dtargd_id - 1]) == NULL) {
15658 15665 mutex_exit(&dtrace_lock);
15659 15666 mutex_exit(&mod_lock);
15660 15667 mutex_exit(&dtrace_provider_lock);
15661 15668 return (EINVAL);
15662 15669 }
15663 15670
15664 15671 mutex_exit(&dtrace_lock);
15665 15672
15666 15673 prov = probe->dtpr_provider;
15667 15674
15668 15675 if (prov->dtpv_pops.dtps_getargdesc == NULL) {
15669 15676 /*
15670 15677 * There isn't any typed information for this probe.
15671 15678 * Set the argument number to DTRACE_ARGNONE.
15672 15679 */
15673 15680 desc.dtargd_ndx = DTRACE_ARGNONE;
15674 15681 } else {
15675 15682 desc.dtargd_native[0] = '\0';
15676 15683 desc.dtargd_xlate[0] = '\0';
15677 15684 desc.dtargd_mapping = desc.dtargd_ndx;
15678 15685
15679 15686 prov->dtpv_pops.dtps_getargdesc(prov->dtpv_arg,
15680 15687 probe->dtpr_id, probe->dtpr_arg, &desc);
15681 15688 }
15682 15689
15683 15690 mutex_exit(&mod_lock);
15684 15691 mutex_exit(&dtrace_provider_lock);
15685 15692
15686 15693 if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
15687 15694 return (EFAULT);
15688 15695
15689 15696 return (0);
15690 15697 }
15691 15698
15692 15699 case DTRACEIOC_GO: {
15693 15700 processorid_t cpuid;
15694 15701 rval = dtrace_state_go(state, &cpuid);
15695 15702
15696 15703 if (rval != 0)
15697 15704 return (rval);
15698 15705
15699 15706 if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
15700 15707 return (EFAULT);
15701 15708
15702 15709 return (0);
15703 15710 }
15704 15711
15705 15712 case DTRACEIOC_STOP: {
15706 15713 processorid_t cpuid;
15707 15714
15708 15715 mutex_enter(&dtrace_lock);
15709 15716 rval = dtrace_state_stop(state, &cpuid);
15710 15717 mutex_exit(&dtrace_lock);
15711 15718
15712 15719 if (rval != 0)
15713 15720 return (rval);
15714 15721
15715 15722 if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
15716 15723 return (EFAULT);
15717 15724
15718 15725 return (0);
15719 15726 }
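
	/*
	 * EXAMPLE (editorial, illustrative userland sketch; not part of
	 * this change): tracing on a consumer's state is started and
	 * stopped with the GO/STOP pair; each copies out the CPU on
	 * which the BEGIN or END probe, respectively, was processed:
	 *
	 *	processorid_t cpu;
	 *
	 *	if (ioctl(fd, DTRACEIOC_GO, &cpu) == -1)
	 *		...
	 *	if (ioctl(fd, DTRACEIOC_STOP, &cpu) == -1)
	 *		...
	 */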
15720 15727
15721 15728 case DTRACEIOC_DOFGET: {
15722 15729 dof_hdr_t hdr, *dof;
15723 15730 uint64_t len;
15724 15731
15725 15732 if (copyin((void *)arg, &hdr, sizeof (hdr)) != 0)
15726 15733 return (EFAULT);
15727 15734
15728 15735 mutex_enter(&dtrace_lock);
15729 15736 dof = dtrace_dof_create(state);
15730 15737 mutex_exit(&dtrace_lock);
15731 15738
15732 15739 len = MIN(hdr.dofh_loadsz, dof->dofh_loadsz);
15733 15740 rval = copyout(dof, (void *)arg, len);
15734 15741 dtrace_dof_destroy(dof);
15735 15742
15736 15743 return (rval == 0 ? 0 : EFAULT);
15737 15744 }
15738 15745
15739 15746 case DTRACEIOC_AGGSNAP:
15740 15747 case DTRACEIOC_BUFSNAP: {
15741 15748 dtrace_bufdesc_t desc;
15742 15749 caddr_t cached;
15743 15750 dtrace_buffer_t *buf;
15744 15751
15745 15752 if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
15746 15753 return (EFAULT);
15747 15754
15748 15755 if (desc.dtbd_cpu < 0 || desc.dtbd_cpu >= NCPU)
15749 15756 return (EINVAL);
15750 15757
15751 15758 mutex_enter(&dtrace_lock);
15752 15759
15753 15760 if (cmd == DTRACEIOC_BUFSNAP) {
15754 15761 buf = &state->dts_buffer[desc.dtbd_cpu];
15755 15762 } else {
15756 15763 buf = &state->dts_aggbuffer[desc.dtbd_cpu];
15757 15764 }
15758 15765
15759 15766 if (buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL)) {
15760 15767 size_t sz = buf->dtb_offset;
15761 15768
15762 15769 if (state->dts_activity != DTRACE_ACTIVITY_STOPPED) {
15763 15770 mutex_exit(&dtrace_lock);
15764 15771 return (EBUSY);
15765 15772 }
15766 15773
15767 15774 /*
15768 15775 * If this buffer has already been consumed, we're
15769 15776 * going to indicate that there's nothing left here
15770 15777 * to consume.
15771 15778 */
15772 15779 if (buf->dtb_flags & DTRACEBUF_CONSUMED) {
15773 15780 mutex_exit(&dtrace_lock);
15774 15781
15775 15782 desc.dtbd_size = 0;
15776 15783 desc.dtbd_drops = 0;
15777 15784 desc.dtbd_errors = 0;
15778 15785 desc.dtbd_oldest = 0;
15779 15786 sz = sizeof (desc);
15780 15787
15781 15788 if (copyout(&desc, (void *)arg, sz) != 0)
15782 15789 return (EFAULT);
15783 15790
15784 15791 return (0);
15785 15792 }
15786 15793
15787 15794 /*
15788 15795 * If this is a ring buffer that has wrapped, we want
15789 15796 * to copy the whole thing out.
15790 15797 */
15791 15798 if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
15792 15799 dtrace_buffer_polish(buf);
15793 15800 sz = buf->dtb_size;
15794 15801 }
15795 15802
15796 15803 if (copyout(buf->dtb_tomax, desc.dtbd_data, sz) != 0) {
15797 15804 mutex_exit(&dtrace_lock);
15798 15805 return (EFAULT);
15799 15806 }
15800 15807
15801 15808 desc.dtbd_size = sz;
15802 15809 desc.dtbd_drops = buf->dtb_drops;
15803 15810 desc.dtbd_errors = buf->dtb_errors;
15804 15811 desc.dtbd_oldest = buf->dtb_xamot_offset;
15805 15812
15806 15813 mutex_exit(&dtrace_lock);
15807 15814
15808 15815 if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
15809 15816 return (EFAULT);
15810 15817
15811 15818 buf->dtb_flags |= DTRACEBUF_CONSUMED;
15812 15819
15813 15820 return (0);
15814 15821 }
15815 15822
15816 15823 if (buf->dtb_tomax == NULL) {
15817 15824 ASSERT(buf->dtb_xamot == NULL);
15818 15825 mutex_exit(&dtrace_lock);
15819 15826 return (ENOENT);
15820 15827 }
15821 15828
15822 15829 cached = buf->dtb_tomax;
15823 15830 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
15824 15831
15825 15832 dtrace_xcall(desc.dtbd_cpu,
15826 15833 (dtrace_xcall_t)dtrace_buffer_switch, buf);
15827 15834
15828 15835 state->dts_errors += buf->dtb_xamot_errors;
15829 15836
15830 15837 /*
15831 15838 * If the buffers did not actually switch, then the cross call
15832 15839 * did not take place -- presumably because the given CPU is
15833 15840 * not in the ready set. If this is the case, we'll return
15834 15841 * ENOENT.
15835 15842 */
15836 15843 if (buf->dtb_tomax == cached) {
15837 15844 ASSERT(buf->dtb_xamot != cached);
15838 15845 mutex_exit(&dtrace_lock);
15839 15846 return (ENOENT);
15840 15847 }
15841 15848
15842 15849 ASSERT(cached == buf->dtb_xamot);
15843 15850
15844 15851 /*
15845 15852 * We have our snapshot; now copy it out.
15846 15853 */
15847 15854 if (copyout(buf->dtb_xamot, desc.dtbd_data,
15848 15855 buf->dtb_xamot_offset) != 0) {
15849 15856 mutex_exit(&dtrace_lock);
15850 15857 return (EFAULT);
15851 15858 }
15852 15859
15853 15860 desc.dtbd_size = buf->dtb_xamot_offset;
15854 15861 desc.dtbd_drops = buf->dtb_xamot_drops;
15855 15862 desc.dtbd_errors = buf->dtb_xamot_errors;
15856 15863 desc.dtbd_oldest = 0;
15857 15864
15858 15865 mutex_exit(&dtrace_lock);
15859 15866
15860 15867 /*
15861 15868 * Finally, copy out the buffer description.
15862 15869 */
15863 15870 if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
15864 15871 return (EFAULT);
15865 15872
15866 15873 return (0);
15867 15874 }
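
	/*
	 * EXAMPLE (editorial, illustrative userland sketch; not part of
	 * this change): a consumer snapshots principal buffers one CPU
	 * at a time, providing a destination sized to the bufsize
	 * option; ENOENT means there is no buffer for that CPU (or the
	 * switch did not occur), and the CPU is skipped. Names are
	 * hypothetical:
	 *
	 *	dtrace_bufdesc_t db;
	 *
	 *	for (cpu = 0; cpu < ncpu; cpu++) {
	 *		db.dtbd_cpu = cpu;
	 *		db.dtbd_data = mybuf;
	 *
	 *		if (ioctl(fd, DTRACEIOC_BUFSNAP, &db) == -1) {
	 *			if (errno == ENOENT)
	 *				continue;
	 *			...
	 *		}
	 *
	 *		consume(mybuf, db.dtbd_size);
	 *	}
	 */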
15868 15875
15869 15876 case DTRACEIOC_CONF: {
15870 15877 dtrace_conf_t conf;
15871 15878
15872 15879 bzero(&conf, sizeof (conf));
15873 15880 conf.dtc_difversion = DIF_VERSION;
15874 15881 conf.dtc_difintregs = DIF_DIR_NREGS;
15875 15882 conf.dtc_diftupregs = DIF_DTR_NREGS;
15876 15883 conf.dtc_ctfmodel = CTF_MODEL_NATIVE;
15877 15884
15878 15885 if (copyout(&conf, (void *)arg, sizeof (conf)) != 0)
15879 15886 return (EFAULT);
15880 15887
15881 15888 return (0);
15882 15889 }
15883 15890
15884 15891 case DTRACEIOC_STATUS: {
15885 15892 dtrace_status_t stat;
15886 15893 dtrace_dstate_t *dstate;
15887 15894 int i, j;
15888 15895 uint64_t nerrs;
15889 15896
15890 15897 /*
15891 15898 * See the comment in dtrace_state_deadman() for the reason
15892 15899 * for setting dts_laststatus to INT64_MAX before setting
15893 15900 * it to the correct value.
15894 15901 */
15895 15902 state->dts_laststatus = INT64_MAX;
15896 15903 dtrace_membar_producer();
15897 15904 state->dts_laststatus = dtrace_gethrtime();
15898 15905
15899 15906 bzero(&stat, sizeof (stat));
15900 15907
15901 15908 mutex_enter(&dtrace_lock);
15902 15909
15903 15910 if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE) {
15904 15911 mutex_exit(&dtrace_lock);
15905 15912 return (ENOENT);
15906 15913 }
15907 15914
15908 15915 if (state->dts_activity == DTRACE_ACTIVITY_DRAINING)
15909 15916 stat.dtst_exiting = 1;
15910 15917
15911 15918 nerrs = state->dts_errors;
15912 15919 dstate = &state->dts_vstate.dtvs_dynvars;
15913 15920
15914 15921 for (i = 0; i < NCPU; i++) {
15915 15922 dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[i];
15916 15923
15917 15924 stat.dtst_dyndrops += dcpu->dtdsc_drops;
15918 15925 stat.dtst_dyndrops_dirty += dcpu->dtdsc_dirty_drops;
15919 15926 stat.dtst_dyndrops_rinsing += dcpu->dtdsc_rinsing_drops;
15920 15927
15921 15928 if (state->dts_buffer[i].dtb_flags & DTRACEBUF_FULL)
15922 15929 stat.dtst_filled++;
15923 15930
15924 15931 nerrs += state->dts_buffer[i].dtb_errors;
15925 15932
15926 15933 for (j = 0; j < state->dts_nspeculations; j++) {
15927 15934 dtrace_speculation_t *spec;
15928 15935 dtrace_buffer_t *buf;
15929 15936
15930 15937 spec = &state->dts_speculations[j];
15931 15938 buf = &spec->dtsp_buffer[i];
15932 15939 stat.dtst_specdrops += buf->dtb_xamot_drops;
15933 15940 }
15934 15941 }
15935 15942
15936 15943 stat.dtst_specdrops_busy = state->dts_speculations_busy;
15937 15944 stat.dtst_specdrops_unavail = state->dts_speculations_unavail;
15938 15945 stat.dtst_stkstroverflows = state->dts_stkstroverflows;
15939 15946 stat.dtst_dblerrors = state->dts_dblerrors;
15940 15947 stat.dtst_killed =
15941 15948 (state->dts_activity == DTRACE_ACTIVITY_KILLED);
15942 15949 stat.dtst_errors = nerrs;
15943 15950
15944 15951 mutex_exit(&dtrace_lock);
15945 15952
15946 15953 if (copyout(&stat, (void *)arg, sizeof (stat)) != 0)
15947 15954 return (EFAULT);
15948 15955
15949 15956 return (0);
15950 15957 }
15951 15958
15952 15959 case DTRACEIOC_FORMAT: {
15953 15960 dtrace_fmtdesc_t fmt;
15954 15961 char *str;
15955 15962 int len;
15956 15963
15957 15964 if (copyin((void *)arg, &fmt, sizeof (fmt)) != 0)
15958 15965 return (EFAULT);
15959 15966
15960 15967 mutex_enter(&dtrace_lock);
15961 15968
15962 15969 if (fmt.dtfd_format == 0 ||
15963 15970 fmt.dtfd_format > state->dts_nformats) {
15964 15971 mutex_exit(&dtrace_lock);
15965 15972 return (EINVAL);
15966 15973 }
15967 15974
15968 15975 /*
15969 15976 * Format strings are allocated contiguously and they are
15970 15977 * never freed; if a format index is less than the number
15971 15978 * of formats, we can assert that the format map is non-NULL
15972 15979 * and that the format for the specified index is non-NULL.
15973 15980 */
15974 15981 ASSERT(state->dts_formats != NULL);
15975 15982 str = state->dts_formats[fmt.dtfd_format - 1];
15976 15983 ASSERT(str != NULL);
15977 15984
15978 15985 len = strlen(str) + 1;
15979 15986
15980 15987 if (len > fmt.dtfd_length) {
15981 15988 fmt.dtfd_length = len;
15982 15989
15983 15990 if (copyout(&fmt, (void *)arg, sizeof (fmt)) != 0) {
15984 15991 mutex_exit(&dtrace_lock);
15985 15992 return (EINVAL);
15986 15993 }
15987 15994 } else {
15988 15995 if (copyout(str, fmt.dtfd_string, len) != 0) {
15989 15996 mutex_exit(&dtrace_lock);
15990 15997 return (EINVAL);
15991 15998 }
15992 15999 }
15993 16000
15994 16001 mutex_exit(&dtrace_lock);
15995 16002 return (0);
15996 16003 }
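
	/*
	 * EXAMPLE (editorial, illustrative userland sketch; not part of
	 * this change): fetching a format string is a two-pass protocol
	 * -- if the supplied dtfd_length is too small, the kernel copies
	 * back only the descriptor with dtfd_length set to the required
	 * size, and the caller retries with a large enough buffer:
	 *
	 *	dtrace_fmtdesc_t fmt;
	 *
	 *	bzero(&fmt, sizeof (fmt));
	 *	fmt.dtfd_format = format;
	 *
	 *	if (ioctl(fd, DTRACEIOC_FORMAT, &fmt) == -1)
	 *		...
	 *
	 *	fmt.dtfd_string = malloc(fmt.dtfd_length);
	 *
	 *	if (ioctl(fd, DTRACEIOC_FORMAT, &fmt) == -1)
	 *		...
	 */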
15997 16004
15998 16005 default:
15999 16006 break;
16000 16007 }
16001 16008
16002 16009 return (ENOTTY);
16003 16010 }
16004 16011
16005 16012 /*ARGSUSED*/
16006 16013 static int
16007 16014 dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
16008 16015 {
16009 16016 dtrace_state_t *state;
16010 16017
16011 16018 switch (cmd) {
16012 16019 case DDI_DETACH:
16013 16020 break;
16014 16021
16015 16022 case DDI_SUSPEND:
16016 16023 return (DDI_SUCCESS);
16017 16024
16018 16025 default:
16019 16026 return (DDI_FAILURE);
16020 16027 }
16021 16028
16022 16029 mutex_enter(&cpu_lock);
16023 16030 mutex_enter(&dtrace_provider_lock);
16024 16031 mutex_enter(&dtrace_lock);
16025 16032
16026 16033 ASSERT(dtrace_opens == 0);
16027 16034
16028 16035 if (dtrace_helpers > 0) {
16029 16036 mutex_exit(&dtrace_provider_lock);
16030 16037 mutex_exit(&dtrace_lock);
16031 16038 mutex_exit(&cpu_lock);
16032 16039 return (DDI_FAILURE);
16033 16040 }
16034 16041
16035 16042 if (dtrace_unregister((dtrace_provider_id_t)dtrace_provider) != 0) {
16036 16043 mutex_exit(&dtrace_provider_lock);
16037 16044 mutex_exit(&dtrace_lock);
16038 16045 mutex_exit(&cpu_lock);
16039 16046 return (DDI_FAILURE);
16040 16047 }
16041 16048
16042 16049 dtrace_provider = NULL;
16043 16050
16044 16051 if ((state = dtrace_anon_grab()) != NULL) {
16045 16052 /*
16046 16053 * If there were ECBs on this state, the provider should
16047 16054 		 * not have been allowed to detach; assert that there is
16048 16055 * none.
16049 16056 */
16050 16057 ASSERT(state->dts_necbs == 0);
16051 16058 dtrace_state_destroy(state);
16052 16059
16053 16060 /*
16054 16061 * If we're being detached with anonymous state, we need to
16055 16062 * indicate to the kernel debugger that DTrace is now inactive.
16056 16063 */
16057 16064 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
16058 16065 }
16059 16066
16060 16067 bzero(&dtrace_anon, sizeof (dtrace_anon_t));
16061 16068 unregister_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
16062 16069 dtrace_cpu_init = NULL;
16063 16070 dtrace_helpers_cleanup = NULL;
16064 16071 dtrace_helpers_fork = NULL;
16065 16072 dtrace_cpustart_init = NULL;
16066 16073 dtrace_cpustart_fini = NULL;
16067 16074 dtrace_debugger_init = NULL;
16068 16075 dtrace_debugger_fini = NULL;
16069 16076 dtrace_modload = NULL;
16070 16077 dtrace_modunload = NULL;
16071 16078
16072 16079 ASSERT(dtrace_getf == 0);
16073 16080 ASSERT(dtrace_closef == NULL);
16074 16081
16075 16082 mutex_exit(&cpu_lock);
16076 16083
16077 16084 if (dtrace_helptrace_enabled) {
16078 16085 kmem_free(dtrace_helptrace_buffer, dtrace_helptrace_bufsize);
16079 16086 dtrace_helptrace_buffer = NULL;
16080 16087 }
16081 16088
16082 16089 kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *));
16083 16090 dtrace_probes = NULL;
16084 16091 dtrace_nprobes = 0;
16085 16092
16086 16093 dtrace_hash_destroy(dtrace_bymod);
16087 16094 dtrace_hash_destroy(dtrace_byfunc);
16088 16095 dtrace_hash_destroy(dtrace_byname);
16089 16096 dtrace_bymod = NULL;
16090 16097 dtrace_byfunc = NULL;
16091 16098 dtrace_byname = NULL;
16092 16099
16093 16100 kmem_cache_destroy(dtrace_state_cache);
16094 16101 vmem_destroy(dtrace_minor);
16095 16102 vmem_destroy(dtrace_arena);
16096 16103
16097 16104 if (dtrace_toxrange != NULL) {
16098 16105 kmem_free(dtrace_toxrange,
16099 16106 dtrace_toxranges_max * sizeof (dtrace_toxrange_t));
16100 16107 dtrace_toxrange = NULL;
16101 16108 dtrace_toxranges = 0;
16102 16109 dtrace_toxranges_max = 0;
16103 16110 }
16104 16111
16105 16112 ddi_remove_minor_node(dtrace_devi, NULL);
16106 16113 dtrace_devi = NULL;
16107 16114
16108 16115 ddi_soft_state_fini(&dtrace_softstate);
16109 16116
16110 16117 ASSERT(dtrace_vtime_references == 0);
16111 16118 ASSERT(dtrace_opens == 0);
16112 16119 ASSERT(dtrace_retained == NULL);
16113 16120
16114 16121 mutex_exit(&dtrace_lock);
16115 16122 mutex_exit(&dtrace_provider_lock);
16116 16123
16117 16124 /*
16118 16125 * We don't destroy the task queue until after we have dropped our
16119 16126 * locks (taskq_destroy() may block on running tasks). To prevent
16120 16127 * attempting to do work after we have effectively detached but before
16121 16128 * the task queue has been destroyed, all tasks dispatched via the
16122 16129 * task queue must check that DTrace is still attached before
16123 16130 * performing any operation.
16124 16131 */
16125 16132 taskq_destroy(dtrace_taskq);
16126 16133 dtrace_taskq = NULL;
16127 16134
16128 16135 return (DDI_SUCCESS);
16129 16136 }
16130 16137
16131 16138 /*ARGSUSED*/
16132 16139 static int
16133 16140 dtrace_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
16134 16141 {
16135 16142 int error;
16136 16143
16137 16144 switch (infocmd) {
16138 16145 case DDI_INFO_DEVT2DEVINFO:
16139 16146 *result = (void *)dtrace_devi;
16140 16147 error = DDI_SUCCESS;
16141 16148 break;
16142 16149 case DDI_INFO_DEVT2INSTANCE:
16143 16150 *result = (void *)0;
16144 16151 error = DDI_SUCCESS;
16145 16152 break;
16146 16153 default:
16147 16154 error = DDI_FAILURE;
16148 16155 }
16149 16156 return (error);
16150 16157 }
16151 16158
16152 16159 static struct cb_ops dtrace_cb_ops = {
16153 16160 dtrace_open, /* open */
16154 16161 dtrace_close, /* close */
16155 16162 nulldev, /* strategy */
16156 16163 nulldev, /* print */
16157 16164 nodev, /* dump */
16158 16165 nodev, /* read */
16159 16166 nodev, /* write */
16160 16167 dtrace_ioctl, /* ioctl */
16161 16168 nodev, /* devmap */
16162 16169 nodev, /* mmap */
16163 16170 nodev, /* segmap */
16164 16171 nochpoll, /* poll */
16165 16172 ddi_prop_op, /* cb_prop_op */
16166 16173 0, /* streamtab */
16167 16174 D_NEW | D_MP /* Driver compatibility flag */
16168 16175 };
16169 16176
16170 16177 static struct dev_ops dtrace_ops = {
16171 16178 DEVO_REV, /* devo_rev */
16172 16179 0, /* refcnt */
16173 16180 dtrace_info, /* get_dev_info */
16174 16181 nulldev, /* identify */
16175 16182 nulldev, /* probe */
16176 16183 dtrace_attach, /* attach */
16177 16184 dtrace_detach, /* detach */
16178 16185 nodev, /* reset */
16179 16186 &dtrace_cb_ops, /* driver operations */
16180 16187 NULL, /* bus operations */
16181 16188 nodev, /* dev power */
16182 16189 ddi_quiesce_not_needed, /* quiesce */
16183 16190 };
16184 16191
16185 16192 static struct modldrv modldrv = {
16186 16193 &mod_driverops, /* module type (this is a pseudo driver) */
16187 16194 "Dynamic Tracing", /* name of module */
16188 16195 &dtrace_ops, /* driver ops */
16189 16196 };
16190 16197
16191 16198 static struct modlinkage modlinkage = {
16192 16199 MODREV_1,
16193 16200 (void *)&modldrv,
16194 16201 NULL
16195 16202 };
16196 16203
16197 16204 int
16198 16205 _init(void)
16199 16206 {
16200 16207 return (mod_install(&modlinkage));
16201 16208 }
16202 16209
16203 16210 int
16204 16211 _info(struct modinfo *modinfop)
16205 16212 {
16206 16213 return (mod_info(&modlinkage, modinfop));
16207 16214 }
16208 16215
16209 16216 int
16210 16217 _fini(void)
16211 16218 {
16212 16219 return (mod_remove(&modlinkage));
16213 16220 }