1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 /*
  28  * Xen event provider for DTrace
  29  *
 * NOTE: This provider is PRIVATE. It is intended as a short-term solution and
 * may disappear or be re-implemented at any time.
  32  *
  33  * This provider isn't suitable as a general-purpose solution for a number of
  34  * reasons. First and foremost, we rely on the Xen tracing mechanism and don't
  35  * have any way to gather data other than that collected by the Xen trace
  36  * buffers. Further, it does not fit into the DTrace model (see "Interacting
  37  * with DTrace" below.)
  38  *
  39  *
  40  * Tracing in Xen
  41  * --------------
  42  *
 * Xen implements a tracing facility for generating and collecting execution
 * event traces from the hypervisor. When tracing is enabled, compiled-in
 * probes record events in contiguous per-CPU trace buffers.
  46  *
  47  *               +---------+
  48  * +------+      |         |
  49  * | CPUn |----> | BUFFERn |
  50  * +------+      |         |
  51  *               +---------+- tbuf.va + (tbuf.size * n)
  52  *               :         :
  53  *               +---------+
  54  * +------+      |         |
  55  * | CPU1 |----> | BUFFER1 |
  56  * +------+      |         |
  57  *               +---------+- tbuf.va + tbuf.size
  58  * +------+      |         |
  59  * | CPU0 |----> | BUFFER0 |
  60  * +------+      |         |
  61  *               +---------+- tbuf.va
  62  *
  63  * Each CPU buffer consists of a metadata header followed by the trace records.
  64  * The metadata consists of a producer/consumer pair of pointers into the buffer
  65  * that point to the next record to be written and the next record to be read
  66  * respectively.
  67  *
  68  * A trace record can be in one of two forms, depending on if the TSC is
  69  * included. The record header indicates whether or not the TSC field is
  70  * present.
  71  *
  72  * 1. Trace record without TSC:
  73  * +------------------------------------------------------------+
  74  * | HEADER(uint32_t) |            DATA FIELDS                  |
  75  * +------------------------------------------------------------+
  76  *
  77  * 2. Trace record with TSC:
  78  * +--------------------------------------------------------------------------+
  79  * | HEADER(uint32_t) | TSC(uint64_t) |              DATA FIELDS              |
  80  * +--------------------------------------------------------------------------+
  81  *
  82  * Where,
  83  *
  84  * HEADER bit field:
  85  * +--------------------------------------------------------------------------+
  86  * | C |  NDATA  |                        EVENT                               |
  87  * +--------------------------------------------------------------------------+
  88  *  31  30     28 27                                                         0
  89  *
  90  * EVENT: Event ID.
  91  * NDATA: Number of populated data fields.
  92  *     C: TSC included.
  93  *
  94  * DATA FIELDS:
  95  * +--------------------------------------------------------------------------+
  96  * | D1(uint32_t) | D2(uint32_t) | D3(uint32_t) |     . . .    | D7(uint32_t) |
  97  * +--------------------------------------------------------------------------+
  98  *
  99  *
 100  * Interacting with DTrace
 101  * -----------------------
 102  *
 * Every xdt_poll_nsec nanoseconds we poll the trace buffers for data and feed
 * each entry into dtrace_probe() with the corresponding probe ID for the event.
 * As a result of this periodic collection, probe firings are asynchronous.
 * This is the only sensible way to implement this form of provider, but
 * because of its asynchronous nature, questions such as "current CPU" and,
 * more importantly, arbitrary questions about the context surrounding the
 * probe firing are not meaningful. So, consumers should not attempt to infer
 * anything beyond what is supplied via the probe arguments.
 111  */
 112 
 113 #include <sys/xpv_user.h>
 114 
 115 #include <sys/types.h>
 116 #include <sys/sysmacros.h>
 117 #include <sys/modctl.h>
 118 #include <sys/sunddi.h>
 119 #include <sys/ddi.h>
 120 #include <sys/conf.h>
 121 #include <sys/devops.h>
 122 #include <sys/stat.h>
 123 #include <sys/cmn_err.h>
 124 #include <sys/dtrace.h>
 125 #include <sys/sdt.h>
 126 #include <sys/cyclic.h>
 127 #include <vm/seg_kmem.h>
 128 #include <vm/hat_i86.h>
 129 
 130 #include <sys/hypervisor.h>
 131 #include <xen/public/trace.h>
 132 #include <xen/public/sched.h>
 133 
/* Trace buffer polling interval limits and teardown retry budget */
#define XDT_POLL_DEFAULT        100000000       /* default poll interval (ns) */
#define XDT_POLL_MIN            10000000        /* min poll interval (ns) */
#define XDT_TBUF_RETRY          50              /* tbuf disable retry count */
 137 
 138 /*
 139  * The domid must match IDLE_DOMAIN_ID in xen.hg/xen/include/xen/sched.h
 140  * in the xVM gate.
 141  */
 142 #define IS_IDLE_DOM(domid)      (domid == 0x7FFFU)
 143 
 144 /* Macros to extract the domid and cpuid from a HVM trace data field */
 145 #define HVM_DOMID(d)            (d >> 16)
 146 #define HVM_VCPUID(d)           (d & 0xFFFF)
 147 
/*
 * Flags for shadow page table events
 *
 * NOTE(review): presumably these encode the guest paging mode (32-bit, PAE,
 * 64-bit) carried in the shadow event id -- confirm against the shadow
 * record handling and xen/public/trace.h.
 */
#define SH_GUEST_32     0x000
#define SH_GUEST_PAE    0x100
#define SH_GUEST_64     0x200
 152 
 153 #define XDT_PROBE5(event, arg0, arg1, arg2, arg3, arg4) {               \
 154         dtrace_id_t id = xdt_probemap[event];                           \
 155         if (id)                                                         \
 156                 dtrace_probe(id, arg0, arg1, arg2, arg3, arg4);         \
 157 }                                                                       \
 158 
 159 #define XDT_PROBE4(event, arg0, arg1, arg2, arg3) \
 160         XDT_PROBE5(event, arg0, arg1, arg2, arg3, 0)
 161 
 162 #define XDT_PROBE3(event, arg0, arg1, arg2) \
 163         XDT_PROBE5(event, arg0, arg1, arg2, 0, 0)
 164 
 165 #define XDT_PROBE2(event, arg0, arg1) \
 166         XDT_PROBE5(event, arg0, arg1, 0, 0, 0)
 167 
 168 #define XDT_PROBE1(event, arg0) \
 169         XDT_PROBE5(event, arg0, 0, 0, 0, 0)
 170 
 171 #define XDT_PROBE0(event) \
 172         XDT_PROBE5(event, 0, 0, 0, 0, 0)
 173 
/*
 * Probe classes
 *
 * Each class value is an index into xdt_classinfo[], which is sized by
 * XDT_NCLASSES; keep XDT_NCLASSES equal to the number of classes.
 */
#define XDT_SCHED                       0
#define XDT_MEM                         1
#define XDT_HVM                         2
#define XDT_GEN                         3
#define XDT_PV                          4
#define XDT_SHADOW                      5
#define XDT_PM                          6
#define XDT_NCLASSES                    7
 183 
/*
 * Probe events
 *
 * Each event value is an index into xdt_probemap[] and xdt_prid[], both of
 * which are sized by XDT_NEVENTS; keep XDT_NEVENTS one greater than the
 * largest event value. XDT_EVT_INVALID is a sentinel and never an index.
 */
#define XDT_EVT_INVALID                 (-(int)1)
#define XDT_SCHED_OFF_CPU               0
#define XDT_SCHED_ON_CPU                1
#define XDT_SCHED_IDLE_OFF_CPU          2
#define XDT_SCHED_IDLE_ON_CPU           3
#define XDT_SCHED_BLOCK                 4
#define XDT_SCHED_SLEEP                 5
#define XDT_SCHED_WAKE                  6
#define XDT_SCHED_YIELD                 7
#define XDT_SCHED_SHUTDOWN_POWEROFF     8
#define XDT_SCHED_SHUTDOWN_REBOOT       9
#define XDT_SCHED_SHUTDOWN_SUSPEND      10
#define XDT_SCHED_SHUTDOWN_CRASH        11
#define XDT_MEM_PAGE_GRANT_MAP          12
#define XDT_MEM_PAGE_GRANT_UNMAP        13
#define XDT_MEM_PAGE_GRANT_TRANSFER     14
#define XDT_HVM_VMENTRY                 15
#define XDT_HVM_VMEXIT                  16
#define XDT_TRC_LOST_RECORDS            17
#define XDT_SCHED_ADD_VCPU              18
#define XDT_SCHED_REM_VCPU              19      /* unused */
#define XDT_SCHED_CTL                   20      /* unused */
#define XDT_SCHED_ADJDOM                21
#define XDT_SCHED_S_TIMER_FN            22      /* unused */
#define XDT_SCHED_T_TIMER_FN            23      /* unused */
#define XDT_SCHED_DOM_TIMER_FN          24      /* unused */
#define XDT_PV_HYPERCALL                25
#define XDT_PV_TRAP                     26
#define XDT_PV_PAGE_FAULT               27
#define XDT_PV_FORCED_INVALID_OP        28
#define XDT_PV_EMULATE_PRIVOP           29
#define XDT_PV_EMULATE_4GB              30      /* unused (32-bit HV only) */
#define XDT_PV_MATH_STATE_RESTORE       31
#define XDT_PV_PAGING_FIXUP             32
#define XDT_PV_DT_MAPPING_FAULT         33
#define XDT_PV_PTWR_EMULATION           34
#define XDT_HVM_PF_XEN                  35
#define XDT_HVM_PF_INJECT               36
#define XDT_HVM_EXC_INJECT              37
#define XDT_HVM_VIRQ_INJECT             38
#define XDT_HVM_VIRQ_REINJECT           39      /* NOTE(review): no xdt_probe[] entry */
#define XDT_HVM_IO_READ                 40      /* unused */
#define XDT_HVM_IO_WRITE                41      /* unused */
#define XDT_HVM_CR_READ                 42
#define XDT_HVM_CR_WRITE                43
#define XDT_HVM_DR_READ                 44      /* unused */
#define XDT_HVM_DR_WRITE                45      /* unused */
#define XDT_HVM_MSR_READ                46
#define XDT_HVM_MSR_WRITE               47
#define XDT_HVM_CPUID                   48
#define XDT_HVM_INTR                    49
#define XDT_HVM_INTR_WINDOW             50
#define XDT_HVM_NMI                     51
#define XDT_HVM_SMI                     52
#define XDT_HVM_VMMCALL                 53
#define XDT_HVM_HLT                     54
#define XDT_HVM_INVLPG                  55
#define XDT_HVM_MCE                     56
#define XDT_HVM_IOPORT_READ             57
#define XDT_HVM_IOPORT_WRITE            58
#define XDT_HVM_CLTS                    59
#define XDT_HVM_LMSW                    60
#define XDT_HVM_IOMEM_READ              61
#define XDT_HVM_IOMEM_WRITE             62
#define XDT_SHADOW_NOT_SHADOW                   63
#define XDT_SHADOW_FAST_PROPAGATE               64
#define XDT_SHADOW_FAST_MMIO                    65
#define XDT_SHADOW_FALSE_FAST_PATH              66
#define XDT_SHADOW_MMIO                         67
#define XDT_SHADOW_FIXUP                        68
#define XDT_SHADOW_DOMF_DYING                   69
#define XDT_SHADOW_EMULATE                      70
#define XDT_SHADOW_EMULATE_UNSHADOW_USER        71
#define XDT_SHADOW_EMULATE_UNSHADOW_EVTINJ      72
#define XDT_SHADOW_EMULATE_UNSHADOW_UNHANDLED   73
#define XDT_SHADOW_WRMAP_BF                     74
#define XDT_SHADOW_PREALLOC_UNPIN               75
#define XDT_SHADOW_RESYNC_FULL                  76
#define XDT_SHADOW_RESYNC_ONLY                  77
#define XDT_PM_FREQ_CHANGE              78
#define XDT_PM_IDLE_ENTRY               79
#define XDT_PM_IDLE_EXIT                80
#define XDT_SCHED_RUNSTATE_CHANGE       81
#define XDT_SCHED_CONTINUE_RUNNING      82
#define XDT_NEVENTS                     83
 270 
/*
 * Static description of one probe; the entries of xdt_probe[] are of this
 * type.
 */
typedef struct {
        const char      *pr_mod;        /* probe module */
        const char      *pr_name;       /* probe name */
        int             evt_id;         /* event id (XDT_* probe event) */
        uint_t          class;          /* probe class (XDT_* probe class) */
} xdt_probe_t;
 277 
/*
 * Per-class bookkeeping; xdt_classinfo[] holds one of these for each probe
 * class. trc_mask is filled in by xdt_init_trace_masks().
 */
typedef struct {
        uint32_t        trc_mask;       /* trace mask */
        uint32_t        cnt;            /* num enabled probes in class */
} xdt_classinfo_t;
 282 
/*
 * Scheduling state tracked per physical CPU (see xdt_cpu_schedinfo).
 *
 * The prev_ and next_ fields are accumulated by xdt_process_rec() from the
 * TRC_SCHED_SWITCH_INFPREV, TRC_SCHED_SWITCH_INFNEXT and TRC_SCHED_SWITCH
 * records. The cur_ fields and curinfo_valid are set by
 * xdt_update_sched_context().
 */
typedef struct {
        ulong_t prev_domid;             /* previous dom executed */
        ulong_t prev_vcpuid;            /* previous vcpu executed */
        ulong_t prev_ctime;             /* time spent on cpu */
        ulong_t next_domid;             /* next dom to be scheduled */
        ulong_t next_vcpuid;            /* next vcpu to be scheduled */
        ulong_t next_wtime;             /* time spent waiting to get on cpu */
        ulong_t next_ts;                /* allocated time slice */
        ulong_t cur_domid;              /* current dom */
        ulong_t cur_vcpuid;             /* current vcpuid */
        int curinfo_valid;              /* info is valid */
} xdt_schedinfo_t;
 295 
/*
 * State of the Xen trace buffers as mapped into our address space.
 *
 * The geometry fields and per-cpu pointer arrays are set up by
 * xdt_attach_trace_buffers() and torn down by xdt_detach_trace_buffers().
 * Of the statistics, only stat_dropped_recs is exported through the kstat
 * (see xdt_stats[] and xdt_kstat_update()).
 */
static struct {
        uint_t cnt;                     /* total num of trace buffers */
        size_t size;                    /* size of each cpu buffer */
        mfn_t start_mfn;                /* starting mfn of buffers */
        caddr_t va;                     /* va buffers are mapped into */

        /* per-cpu buffers */
        struct t_buf **meta;            /* buffer metadata */
        struct t_rec **data;            /* buffer data records */

        /* statistics */
        uint64_t stat_dropped_recs;     /* records dropped */
        uint64_t stat_spurious_cpu;     /* recs with garbage cpuids */
        uint64_t stat_spurious_switch;  /* inconsistent vcpu switches */
        uint64_t stat_unknown_shutdown; /* unknown shutdown code */
        uint64_t stat_unknown_recs;     /* unknown records */
} tbuf;
 313 
/*
 * Size of the record area of one per-cpu buffer: tbuf.size less the
 * struct t_buf metadata header. Set by xdt_attach_trace_buffers().
 */
static size_t tbuf_data_size;
 315 
/*
 * Names of the named kstats created by xdt_kstat_init(). The order must
 * match the assignment order in xdt_kstat_update().
 */
static char *xdt_stats[] = {
        "dropped_recs",
};
 319 
/*
 * Tunable variables
 *
 * The following may be tuned by adding a line to /etc/system that
 * includes both the name of the module ("xdt") and the name of the variable.
 * For example:
 *     set xdt:xdt_tbuf_pages = 40
 *
 * xdt_tbuf_pages is only used when Xen has not already allocated its trace
 * buffers; see xdt_attach_trace_buffers().
 */
uint_t xdt_tbuf_pages = 20;                     /* pages to alloc per-cpu buf */
 329 
/*
 * The following may be tuned by adding a line to
 * /platform/i86xpv/kernel/drv/xdt.conf.
 * For example:
 *     xdt_poll_nsec = 200000000;
 *
 * NOTE(review): presumably defaults to XDT_POLL_DEFAULT and is bounded below
 * by XDT_POLL_MIN -- confirm in the code that reads the property.
 */
static hrtime_t xdt_poll_nsec;                  /* trace buffer poll interval */
 337 
/*
 * Another tunable variable: the maximum number of records to process
 * in one scan. If it is 0 (i.e. not set in /etc/system), it will
 * be set by xdt_attach_trace_buffers() to
 * (ncpus * per-cpu record area size) / sizeof (struct t_rec).
 *
 * Having an upper limit avoids a situation where the scan would loop
 * endlessly in case the hypervisor adds records quicker than we
 * can process them. It's better to drop records than to loop, obviously.
 */
uint_t xdt_max_recs = 0;
 348 
/*
 * Internal variables
 *
 * xdt_probemap[] and xdt_prid[] are indexed by XDT_* probe event;
 * xdt_classinfo[] is indexed by XDT_* probe class.
 */
static dev_info_t *xdt_devi;                    /* NOTE(review): presumably our devinfo node -- confirm */
static dtrace_provider_id_t xdt_id;             /* DTrace provider id */
static uint_t xdt_ncpus;                        /* total number of phys CPUs */
static uint32_t cur_trace_mask;                 /* current trace mask */
static xdt_schedinfo_t *xdt_cpu_schedinfo;      /* per-cpu sched info */
dtrace_id_t xdt_probemap[XDT_NEVENTS];          /* map of enabled probes */
dtrace_id_t xdt_prid[XDT_NEVENTS];              /* IDs of registered events */
static cyclic_id_t xdt_cyclic = CYCLIC_NONE;    /* NOTE(review): presumably the polling cyclic -- confirm */
static kstat_t *xdt_kstats;                     /* kstat created by xdt_kstat_init() */
static xdt_classinfo_t xdt_classinfo[XDT_NCLASSES];     /* per-class mask and count */
 362 
/*
 * These provide context when probes fire. They can be accessed
 * from xdt DTrace probes (as `xdt_curdom, etc). It's ok for these
 * to be global, and not per-cpu, as probes are run strictly in sequence
 * as the trace buffers are processed.
 *
 * xdt_timestamp is set by xdt_process_rec() to the TSC value carried in the
 * record being processed, or to 0 if the record does not include one.
 */
uint_t xdt_curdom, xdt_curvcpu, xdt_curpcpu;
uint64_t xdt_timestamp;
 371 
/*
 * Table of all probes offered by this provider: module name, probe name,
 * XDT_* probe event and XDT_* probe class. The table ends with an entry
 * whose pr_mod is NULL.
 */
static xdt_probe_t xdt_probe[] = {
        /* Sched probes */
        { "sched", "off-cpu", XDT_SCHED_OFF_CPU, XDT_SCHED },
        { "sched", "on-cpu", XDT_SCHED_ON_CPU, XDT_SCHED },
        { "sched", "idle-off-cpu", XDT_SCHED_IDLE_OFF_CPU, XDT_SCHED },
        { "sched", "idle-on-cpu", XDT_SCHED_IDLE_ON_CPU, XDT_SCHED },
        { "sched", "block", XDT_SCHED_BLOCK, XDT_SCHED },
        { "sched", "sleep", XDT_SCHED_SLEEP, XDT_SCHED },
        { "sched", "wake", XDT_SCHED_WAKE, XDT_SCHED },
        { "sched", "yield", XDT_SCHED_YIELD, XDT_SCHED },
        { "sched", "shutdown-poweroff", XDT_SCHED_SHUTDOWN_POWEROFF,
                XDT_SCHED },
        { "sched", "shutdown-reboot", XDT_SCHED_SHUTDOWN_REBOOT, XDT_SCHED },
        { "sched", "shutdown-suspend", XDT_SCHED_SHUTDOWN_SUSPEND, XDT_SCHED },
        { "sched", "shutdown-crash", XDT_SCHED_SHUTDOWN_CRASH, XDT_SCHED },
        { "sched", "add", XDT_SCHED_ADD_VCPU, XDT_SCHED },
        { "sched", "runstate-change", XDT_SCHED_RUNSTATE_CHANGE, XDT_SCHED },
        { "sched", "continue-running", XDT_SCHED_CONTINUE_RUNNING, XDT_SCHED },

        /* Memory probes */
        { "mem", "page-grant-map", XDT_MEM_PAGE_GRANT_MAP, XDT_MEM },
        { "mem", "page-grant-unmap", XDT_MEM_PAGE_GRANT_UNMAP, XDT_MEM },
        { "mem", "page-grant-transfer", XDT_MEM_PAGE_GRANT_TRANSFER, XDT_MEM },

        /* PV probes */
        {"pv", "hypercall", XDT_PV_HYPERCALL, XDT_PV },
        {"pv", "trap", XDT_PV_TRAP, XDT_PV },
        {"pv", "page-fault", XDT_PV_PAGE_FAULT, XDT_PV },
        {"pv", "forced-invalid-op", XDT_PV_FORCED_INVALID_OP, XDT_PV },
        {"pv", "emulate-priv-op", XDT_PV_EMULATE_PRIVOP, XDT_PV },
        {"pv", "math-state-restore", XDT_PV_MATH_STATE_RESTORE, XDT_PV },
        {"pv", "paging-fixup", XDT_PV_PAGING_FIXUP, XDT_PV },
        {"pv", "dt-mapping-fault", XDT_PV_DT_MAPPING_FAULT, XDT_PV },
        {"pv", "pte-write-emul", XDT_PV_PTWR_EMULATION, XDT_PV },

        /* HVM probes */
        { "hvm", "vmentry", XDT_HVM_VMENTRY, XDT_HVM },
        { "hvm", "vmexit", XDT_HVM_VMEXIT, XDT_HVM },
        { "hvm", "pagefault-xen", XDT_HVM_PF_XEN, XDT_HVM },
        { "hvm", "pagefault-inject", XDT_HVM_PF_INJECT, XDT_HVM },
        { "hvm", "exception-inject", XDT_HVM_EXC_INJECT, XDT_HVM },
        { "hvm", "virq-inject", XDT_HVM_VIRQ_INJECT, XDT_HVM },
        { "hvm", "cr-read", XDT_HVM_CR_READ, XDT_HVM },
        { "hvm", "cr-write", XDT_HVM_CR_WRITE, XDT_HVM },
        { "hvm", "msr-read", XDT_HVM_MSR_READ, XDT_HVM },
        { "hvm", "msr-write", XDT_HVM_MSR_WRITE, XDT_HVM },
        { "hvm", "cpuid", XDT_HVM_CPUID, XDT_HVM },
        { "hvm", "intr", XDT_HVM_INTR, XDT_HVM },
        { "hvm", "intr-window", XDT_HVM_INTR_WINDOW, XDT_HVM },
        { "hvm", "nmi", XDT_HVM_NMI, XDT_HVM },
        { "hvm", "smi", XDT_HVM_SMI, XDT_HVM },
        { "hvm", "vmmcall", XDT_HVM_VMMCALL, XDT_HVM },
        { "hvm", "hlt", XDT_HVM_HLT, XDT_HVM },
        { "hvm", "invlpg", XDT_HVM_INVLPG, XDT_HVM },
        { "hvm", "mce", XDT_HVM_MCE, XDT_HVM },
        { "hvm", "pio-read", XDT_HVM_IOPORT_READ, XDT_HVM },
        { "hvm", "pio-write", XDT_HVM_IOPORT_WRITE, XDT_HVM },
        { "hvm", "mmio-read", XDT_HVM_IOMEM_READ, XDT_HVM },
        { "hvm", "mmio-write", XDT_HVM_IOMEM_WRITE, XDT_HVM },
        { "hvm", "clts", XDT_HVM_CLTS, XDT_HVM },
        { "hvm", "lmsw", XDT_HVM_LMSW, XDT_HVM },

        /* Shadow page table probes */
        { "shadow", "fault-not-shadow", XDT_SHADOW_NOT_SHADOW, XDT_SHADOW },
        { "shadow", "fast-propagate", XDT_SHADOW_FAST_PROPAGATE, XDT_SHADOW },
        { "shadow", "fast-mmio", XDT_SHADOW_FAST_MMIO, XDT_SHADOW },
        { "shadow", "false-fast-path", XDT_SHADOW_FALSE_FAST_PATH,
            XDT_SHADOW },
        { "shadow", "mmio", XDT_SHADOW_MMIO, XDT_SHADOW },
        { "shadow", "fixup", XDT_SHADOW_FIXUP, XDT_SHADOW },
        { "shadow", "domf-dying", XDT_SHADOW_DOMF_DYING, XDT_SHADOW },
        { "shadow", "emulate", XDT_SHADOW_EMULATE, XDT_SHADOW },
        { "shadow", "emulate-unshadow-user", XDT_SHADOW_EMULATE_UNSHADOW_USER,
            XDT_SHADOW },
        { "shadow", "emulate-unshadow-evtinj",
            XDT_SHADOW_EMULATE_UNSHADOW_EVTINJ, XDT_SHADOW },
        { "shadow", "emulate-unshadow-unhandled",
            XDT_SHADOW_EMULATE_UNSHADOW_UNHANDLED, XDT_SHADOW },
        { "shadow", "wrmap-bf", XDT_SHADOW_WRMAP_BF, XDT_SHADOW },
        { "shadow", "prealloc-unpin", XDT_SHADOW_PREALLOC_UNPIN, XDT_SHADOW },
        { "shadow", "resync-full", XDT_SHADOW_RESYNC_FULL, XDT_SHADOW },
        { "shadow", "resync-only", XDT_SHADOW_RESYNC_ONLY, XDT_SHADOW },

        /* PM probes */
        { "pm", "freq-change", XDT_PM_FREQ_CHANGE, XDT_PM },
        { "pm", "idle-entry", XDT_PM_IDLE_ENTRY, XDT_PM },
        { "pm", "idle-exit", XDT_PM_IDLE_EXIT, XDT_PM },

        /* Trace buffer related probes */
        { "trace", "records-lost", XDT_TRC_LOST_RECORDS, XDT_GEN },

        /* Terminating entry (pr_mod is NULL) */
        { NULL }
};
 462 
 463 static inline uint32_t
 464 xdt_nr_active_probes()
 465 {
 466         int i;
 467         uint32_t tot = 0;
 468 
 469         for (i = 0; i < XDT_NCLASSES; i++)
 470                 tot += xdt_classinfo[i].cnt;
 471 
 472         return (tot);
 473 }
 474 
 475 static void
 476 xdt_init_trace_masks(void)
 477 {
 478         xdt_classinfo[XDT_SCHED].trc_mask = TRC_SCHED;
 479         xdt_classinfo[XDT_MEM].trc_mask = TRC_MEM;
 480         xdt_classinfo[XDT_HVM].trc_mask = TRC_HVM;
 481         xdt_classinfo[XDT_GEN].trc_mask = TRC_GEN;
 482         xdt_classinfo[XDT_PV].trc_mask = TRC_PV;
 483         xdt_classinfo[XDT_SHADOW].trc_mask = TRC_SHADOW;
 484         xdt_classinfo[XDT_PM].trc_mask = TRC_PM;
 485 }
 486 
 487 static int
 488 xdt_kstat_update(kstat_t *ksp, int flag)
 489 {
 490         kstat_named_t *knp;
 491 
 492         if (flag != KSTAT_READ)
 493                 return (EACCES);
 494 
 495         knp = ksp->ks_data;
 496 
 497         /*
 498          * Assignment order should match that of the names in
 499          * xdt_stats.
 500          */
 501         (knp++)->value.ui64 = tbuf.stat_dropped_recs;
 502 
 503         return (0);
 504 }
 505 
 506 static void
 507 xdt_kstat_init(void)
 508 {
 509         int nstats = sizeof (xdt_stats) / sizeof (xdt_stats[0]);
 510         char **cp = xdt_stats;
 511         kstat_named_t *knp;
 512 
 513         if ((xdt_kstats = kstat_create("xdt", 0, "trace_statistics", "misc",
 514             KSTAT_TYPE_NAMED, nstats, 0)) == NULL)
 515                 return;
 516 
 517         xdt_kstats->ks_update = xdt_kstat_update;
 518 
 519         knp = xdt_kstats->ks_data;
 520         while (nstats > 0) {
 521                 kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
 522                 knp++;
 523                 cp++;
 524                 nstats--;
 525         }
 526 
 527         kstat_install(xdt_kstats);
 528 }
 529 
 530 static int
 531 xdt_sysctl_tbuf(xen_sysctl_tbuf_op_t *tbuf_op)
 532 {
 533         xen_sysctl_t op;
 534         int xerr;
 535 
 536         op.cmd = XEN_SYSCTL_tbuf_op;
 537         op.interface_version = XEN_SYSCTL_INTERFACE_VERSION;
 538         op.u.tbuf_op = *tbuf_op;
 539 
 540         if ((xerr = HYPERVISOR_sysctl(&op)) != 0)
 541                 return (xen_xlate_errcode(xerr));
 542 
 543         *tbuf_op = op.u.tbuf_op;
 544         return (0);
 545 }
 546 
 547 static int
 548 xdt_map_trace_buffers(mfn_t mfn, caddr_t va, size_t len)
 549 {
 550         x86pte_t pte;
 551         caddr_t const sva = va;
 552         caddr_t const eva = va + len;
 553         int xerr;
 554 
 555         ASSERT(mfn != MFN_INVALID);
 556         ASSERT(va != NULL);
 557         ASSERT(IS_PAGEALIGNED(len));
 558 
 559         for (; va < eva; va += MMU_PAGESIZE) {
 560                 /*
 561                  * Ask the HAT to load a throwaway mapping to page zero, then
 562                  * overwrite it with the hypervisor mapping. It gets removed
 563                  * later via hat_unload().
 564                  */
 565                 hat_devload(kas.a_hat, va, MMU_PAGESIZE, (pfn_t)0,
 566                     PROT_READ | HAT_UNORDERED_OK,
 567                     HAT_LOAD_NOCONSIST | HAT_LOAD);
 568 
 569                 pte = mmu_ptob((x86pte_t)mfn) | PT_VALID | PT_USER
 570                     | PT_FOREIGN | PT_WRITABLE;
 571 
 572                 xerr = HYPERVISOR_update_va_mapping_otherdomain((ulong_t)va,
 573                     pte, UVMF_INVLPG | UVMF_LOCAL, DOMID_XEN);
 574 
 575                 if (xerr != 0) {
 576                         /* unmap pages loaded so far */
 577                         size_t ulen = (uintptr_t)(va + MMU_PAGESIZE) -
 578                             (uintptr_t)sva;
 579                         hat_unload(kas.a_hat, sva, ulen, HAT_UNLOAD_UNMAP);
 580                         return (xen_xlate_errcode(xerr));
 581                 }
 582 
 583                 mfn++;
 584         }
 585 
 586         return (0);
 587 }
 588 
 589 static int
 590 xdt_attach_trace_buffers(void)
 591 {
 592         xen_sysctl_tbuf_op_t tbuf_op;
 593         size_t len;
 594         int err;
 595         uint_t i;
 596 
 597         /*
 598          * Xen does not support trace buffer re-sizing. If the buffers
 599          * have already been allocated we just use them as is.
 600          */
 601         tbuf_op.cmd  = XEN_SYSCTL_TBUFOP_get_info;
 602         if ((err = xdt_sysctl_tbuf(&tbuf_op)) != 0)
 603                 return (err);
 604 
 605         if (tbuf_op.size == 0) {
 606                 /* set trace buffer size */
 607                 tbuf_op.cmd  = XEN_SYSCTL_TBUFOP_set_size;
 608                 tbuf_op.size = xdt_tbuf_pages;
 609                 (void) xdt_sysctl_tbuf(&tbuf_op);
 610 
 611                 /* get trace buffer info */
 612                 tbuf_op.cmd  = XEN_SYSCTL_TBUFOP_get_info;
 613                 if ((err = xdt_sysctl_tbuf(&tbuf_op)) != 0)
 614                         return (err);
 615 
 616                 if (tbuf_op.size == 0) {
 617                         cmn_err(CE_NOTE, "Couldn't allocate trace buffers.");
 618                         return (ENOBUFS);
 619                 }
 620         }
 621 
 622         tbuf.size = tbuf_op.size;
 623         tbuf.start_mfn = (mfn_t)tbuf_op.buffer_mfn;
 624         tbuf.cnt = xdt_ncpus;
 625 
 626         ASSERT(tbuf.start_mfn != MFN_INVALID);
 627         ASSERT(tbuf.cnt > 0);
 628 
 629         len = tbuf.size * tbuf.cnt;
 630         tbuf.va = vmem_alloc(heap_arena, len, VM_SLEEP);
 631 
 632         if ((err = xdt_map_trace_buffers(tbuf.start_mfn, tbuf.va, len)) != 0) {
 633                 vmem_free(heap_arena, tbuf.va, len);
 634                 tbuf.va = NULL;
 635                 return (err);
 636         }
 637 
 638         tbuf.meta = (struct t_buf **)kmem_alloc(tbuf.cnt * sizeof (*tbuf.meta),
 639             KM_SLEEP);
 640         tbuf.data = (struct t_rec **)kmem_alloc(tbuf.cnt * sizeof (*tbuf.data),
 641             KM_SLEEP);
 642 
 643         for (i = 0; i < tbuf.cnt; i++) {
 644                 void *cpu_buf = (void *)(tbuf.va + (tbuf.size * i));
 645                 tbuf.meta[i] = cpu_buf;
 646                 tbuf.data[i] = (struct t_rec *)((uintptr_t)cpu_buf +
 647                     sizeof (struct t_buf));
 648 
 649                 /* throw away stale trace records */
 650                 tbuf.meta[i]->cons = tbuf.meta[i]->prod;
 651         }
 652 
 653         tbuf_data_size = tbuf.size - sizeof (struct t_buf);
 654         if (xdt_max_recs == 0)
 655                 xdt_max_recs = (xdt_ncpus * tbuf_data_size)
 656                     / sizeof (struct t_rec);
 657 
 658         return (0);
 659 }
 660 
 661 static void
 662 xdt_detach_trace_buffers(void)
 663 {
 664         size_t len = tbuf.size * tbuf.cnt;
 665 
 666         ASSERT(tbuf.va != NULL);
 667 
 668         hat_unload(kas.a_hat, tbuf.va, len,
 669             HAT_UNLOAD_UNMAP | HAT_UNLOAD_UNLOCK);
 670         vmem_free(heap_arena, tbuf.va, len);
 671         kmem_free(tbuf.meta, tbuf.cnt * sizeof (*tbuf.meta));
 672         kmem_free(tbuf.data, tbuf.cnt * sizeof (*tbuf.data));
 673 }
 674 
 675 static void
 676 xdt_update_sched_context(uint_t cpuid, uint_t dom, uint_t vcpu)
 677 {
 678         xdt_schedinfo_t *sp = &xdt_cpu_schedinfo[cpuid];
 679 
 680         sp->cur_domid = dom;
 681         sp->cur_vcpuid = vcpu;
 682         sp->curinfo_valid = 1;
 683 }
 684 
 685 static void
 686 xdt_update_domain_context(uint_t dom, uint_t vcpu)
 687 {
 688         xdt_curdom = dom;
 689         xdt_curvcpu = vcpu;
 690 }
 691 
 692 static size_t
 693 xdt_process_rec(uint_t cpuid, struct t_rec *rec)
 694 {
 695         xdt_schedinfo_t *sp = &xdt_cpu_schedinfo[cpuid];
 696         uint_t dom, vcpu;
 697         int eid;
 698         uint32_t *data;
 699         uint64_t tsc, addr64, rip64, val64, pte64;
 700         size_t rec_size;
 701 
 702         ASSERT(rec != NULL);
 703         ASSERT(xdt_ncpus == xpv_nr_phys_cpus());
 704 
 705         if (cpuid >= xdt_ncpus) {
 706                 tbuf.stat_spurious_cpu++;
 707                 goto done;
 708         }
 709 
 710         /*
 711          * If our current state isn't valid, and if this is not
 712          * an event that will update our state, skip it.
 713          */
 714 
 715         if (!sp->curinfo_valid &&
 716             rec->event != TRC_SCHED_SWITCH &&
 717             rec->event != TRC_LOST_RECORDS)
 718                 goto done;
 719 
 720         if (rec->cycles_included) {
 721                 data = rec->u.cycles.extra_u32;
 722                 tsc = (((uint64_t)rec->u.cycles.cycles_hi) << 32)
 723                     | rec->u.cycles.cycles_lo;
 724         } else {
 725                 data = rec->u.nocycles.extra_u32;
 726                 tsc = 0;
 727         }
 728 
 729         xdt_timestamp = tsc;
 730 
 731         switch (rec->event) {
 732         /*
 733          * Sched probes
 734          */
 735         case TRC_SCHED_SWITCH_INFPREV:
 736                 /*
 737                  * Info on vCPU being de-scheduled
 738                  *
 739                  * data[0] = prev domid
 740                  * data[1] = time spent on pcpu
 741                  */
 742                 sp->prev_domid = data[0];
 743                 sp->prev_ctime = data[1];
 744                 break;
 745 
 746         case TRC_SCHED_SWITCH_INFNEXT:
 747                 /*
 748                  * Info on next vCPU to be scheduled
 749                  *
 750                  * data[0] = next domid
 751                  * data[1] = time spent waiting to get on cpu
 752                  * data[2] = time slice
 753                  */
 754                 sp->next_domid = data[0];
 755                 sp->next_wtime = data[1];
 756                 sp->next_ts = data[2];
 757                 break;
 758 
 759         case TRC_SCHED_SWITCH:
 760                 /*
 761                  * vCPU switch
 762                  *
 763                  * data[0] = prev domid
 764                  * data[1] = prev vcpuid
 765                  * data[2] = next domid
 766                  * data[3] = next vcpuid
 767                  */
 768 
 769                 /*
 770                  * Provide valid context for this probe if there
 771                  * wasn't one.
 772                  */
 773                 if (!sp->curinfo_valid)
 774                         xdt_update_domain_context(data[0], data[1]);
 775 
 776                 xdt_update_sched_context(cpuid, data[0], data[1]);
 777 
 778                 if (data[0] != sp->prev_domid &&
 779                     data[2] != sp->next_domid) {
 780                         /* prev and next info don't match doms being sched'd */
 781                         tbuf.stat_spurious_switch++;
 782                         goto switchdone;
 783                 }
 784 
 785                 sp->prev_vcpuid = data[1];
 786                 sp->next_vcpuid = data[3];
 787 
 788                 XDT_PROBE3(IS_IDLE_DOM(sp->prev_domid)?
 789                     XDT_SCHED_IDLE_OFF_CPU:XDT_SCHED_OFF_CPU,
 790                     sp->prev_domid, sp->prev_vcpuid, sp->prev_ctime);
 791 
 792                 XDT_PROBE4(IS_IDLE_DOM(sp->next_domid)?
 793                     XDT_SCHED_IDLE_ON_CPU:XDT_SCHED_ON_CPU,
 794                     sp->next_domid, sp->next_vcpuid, sp->next_wtime,
 795                     sp->next_ts);
 796 switchdone:
 797                 xdt_update_sched_context(cpuid, data[2], data[3]);
 798                 xdt_update_domain_context(data[2], data[3]);
 799 
 800                 break;
 801 
 802         case TRC_SCHED_BLOCK:
 803                 /*
 804                  * vCPU blocked
 805                  *
 806                  * data[0] = domid
 807                  * data[1] = vcpuid
 808                  */
 809                 XDT_PROBE2(XDT_SCHED_BLOCK, data[0], data[1]);
 810                 break;
 811 
 812         case TRC_SCHED_SLEEP:
 813                 /*
 814                  * Put vCPU to sleep
 815                  *
 816                  * data[0] = domid
 817                  * data[1] = vcpuid
 818                  */
 819                 XDT_PROBE2(XDT_SCHED_SLEEP, data[0], data[1]);
 820                 break;
 821 
 822         case TRC_SCHED_WAKE:
 823                 /*
 824                  * Wake up vCPU
 825                  *
 826                  * data[0] = domid
 827                  * data[1] = vcpuid
 828                  */
 829                 XDT_PROBE2(XDT_SCHED_WAKE, data[0], data[1]);
 830                 break;
 831 
 832         case TRC_SCHED_YIELD:
 833                 /*
 834                  * vCPU yielded
 835                  *
 836                  * data[0] = domid
 837                  * data[1] = vcpuid
 838                  */
 839                 XDT_PROBE2(XDT_SCHED_YIELD, data[0], data[1]);
 840                 break;
 841 
 842         case TRC_SCHED_SHUTDOWN:
 843                 /*
 844                  * Guest shutting down
 845                  *
 846                  * data[0] = domid
 847                  * data[1] = initiating vcpu
 848                  * data[2] = shutdown code
 849                  */
 850                 switch (data[2]) {
 851                 case SHUTDOWN_poweroff:
 852                         eid = XDT_SCHED_SHUTDOWN_POWEROFF;
 853                         break;
 854                 case SHUTDOWN_reboot:
 855                         eid = XDT_SCHED_SHUTDOWN_REBOOT;
 856                         break;
 857                 case SHUTDOWN_suspend:
 858                         eid = XDT_SCHED_SHUTDOWN_SUSPEND;
 859                         break;
 860                 case SHUTDOWN_crash:
 861                         eid = XDT_SCHED_SHUTDOWN_CRASH;
 862                         break;
 863                 default:
 864                         tbuf.stat_unknown_shutdown++;
 865                         goto done;
 866                 }
 867 
 868                 XDT_PROBE2(eid, data[0], data[1]);
 869                 break;
 870 
 871         case TRC_SCHED_DOM_REM:
 872         case TRC_SCHED_CTL:
 873         case TRC_SCHED_S_TIMER_FN:
 874         case TRC_SCHED_T_TIMER_FN:
 875         case TRC_SCHED_DOM_TIMER_FN:
 876                 /* unused */
 877                 break;
 878         case TRC_SCHED_DOM_ADD:
 879                 /*
 880                  * Add vcpu to a guest.
 881                  *
 882                  * data[0] = domid
 883                  * data[1] = vcpu
 884                  */
 885                 XDT_PROBE2(XDT_SCHED_ADD_VCPU, data[0], data[1]);
 886                 break;
 887         case TRC_SCHED_ADJDOM:
 888                 /*
 889                  * Scheduling parameters for a guest
 890                  * were modified.
 891                  *
 892                  * data[0] = domid;
 893                  */
 894                 XDT_PROBE1(XDT_SCHED_ADJDOM, data[1]);
 895                 break;
 896         case TRC_SCHED_RUNSTATE_CHANGE:
 897                 /*
 898                  * Runstate change for a VCPU.
 899                  *
 900                  * data[0] = (domain << 16) | vcpu;
 901                  * data[1] = oldstate;
 902                  * data[2] = newstate;
 903                  */
 904                 XDT_PROBE4(XDT_SCHED_RUNSTATE_CHANGE, data[0] >> 16,
 905                     data[0] & 0xffff, data[1], data[2]);
 906                 break;
 907         case TRC_SCHED_CONTINUE_RUNNING:
 908                 /*
 909                  * VCPU continues to run on the same physical CPU
 910                  * that it was previously running on.
 911                  *
 912                  * data[0] = (domain << 16) | vcpu;
 913                  */
 914                 XDT_PROBE2(XDT_SCHED_CONTINUE_RUNNING, data[0] >> 16,
 915                     data[0] & 0xffff);
 916                 break;
 917         /*
 918          * Mem probes
 919          */
 920         case TRC_MEM_PAGE_GRANT_MAP:
 921                 /*
 922                  * Guest mapped page grant
 923                  *
 924                  * data[0] = target domid
 925                  */
 926                 XDT_PROBE1(XDT_MEM_PAGE_GRANT_MAP, data[0]);
 927                 break;
 928 
 929         case TRC_MEM_PAGE_GRANT_UNMAP:
 930                 /*
 931                  * Guest unmapped page grant
 932                  *
 933                  * data[0] = target domid
 934                  */
 935                 XDT_PROBE1(XDT_MEM_PAGE_GRANT_UNMAP, data[0]);
 936                 break;
 937 
 938         case TRC_MEM_PAGE_GRANT_TRANSFER:
 939                 /*
 940                  * Page grant is being transferred
 941                  *
 942                  * data[0] = target domid
 943                  */
 944                 XDT_PROBE1(XDT_MEM_PAGE_GRANT_TRANSFER, data[0]);
 945                 break;
 946 
 947         /*
 948          * Probes for PV domains.
 949          */
 950         case TRC_PV_HYPERCALL:
 951                 /*
 952                  * Hypercall from a 32-bit PV domain.
 953                  *
 954                  * data[0] = eip
 955                  * data[1] = eax
 956                  */
 957                 XDT_PROBE2(XDT_PV_HYPERCALL, data[0], data[1]);
 958                 break;
 959         case TRC_PV_HYPERCALL | TRC_64_FLAG:
 960                 /*
 961                  * Hypercall from a 64-bit PV domain.
 962                  *
 963                  * data[0] = rip(0:31)
 964                  * data[1] = rip(32:63)
 965                  * data[2] = eax;
 966                  */
 967                 rip64 = (((uint64_t)data[1]) << 32) | data[0];
 968                 XDT_PROBE2(XDT_PV_HYPERCALL, rip64, data[2]);
 969                 break;
 970         case TRC_PV_TRAP:
 971                 /*
 972                  * Trap in a 32-bit PV domain.
 973                  *
 974                  * data[0] = eip
 975                  * data[1] = trapnr | (error_code_valid << 15)
 976                  *      | (error_code << 16);
 977                  */
 978                 XDT_PROBE4(XDT_PV_TRAP, data[0], data[1] & 0x7fff,
 979                     (data[1] >> 15) & 1, data[1] >> 16);
 980                 break;
 981         case TRC_PV_TRAP | TRC_64_FLAG:
 982                 /*
 983                  * Trap in a 64-bit PV domain.
 984                  *
 985                  * data[0] = rip(0:31)
 986                  * data[1] = rip(32:63)
 987                  * data[2] = trapnr | (error_code_valid << 15)
 988                  *      | (error_code << 16);
 989                  */
 990                 rip64 = (((uint64_t)data[1]) << 32) | data[2];
 991                 XDT_PROBE4(XDT_PV_TRAP, rip64, data[2] & 0x7fff,
 992                     (data[2] >> 15) & 1, data[2] >> 16);
 993                 break;
 994         case TRC_PV_PAGE_FAULT:
 995                 /*
 996                  * Page fault in a 32-bit PV domain.
 997                  *
 998                  * data[0] = eip
 999                  * data[1] = vaddr
1000                  * data[2] = error code
1001                  */
1002                 XDT_PROBE3(XDT_PV_PAGE_FAULT, data[0], data[1], data[2]);
1003                 break;
1004         case TRC_PV_PAGE_FAULT | TRC_64_FLAG:
1005                 /*
1006                  * Page fault in a 64-bit PV domain.
1007                  *
1008                  * data[0] = rip(0:31)
1009                  * data[1] = rip(32:63)
1010                  * data[2] = vaddr(0:31)
1011                  * data[3] = vaddr(32:63)
1012                  * data[4] = error code
1013                  */
1014                 rip64 = (((uint64_t)data[1]) << 32) | data[0];
1015                 addr64 = (((uint64_t)data[3]) << 32) | data[2];
1016                 XDT_PROBE3(XDT_PV_PAGE_FAULT, rip64, addr64, data[4]);
1017                 break;
1018         case TRC_PV_FORCED_INVALID_OP:
1019                 /*
1020                  * Hypervisor emulated a forced invalid op (ud2)
1021                  * in a 32-bit PV domain.
1022                  *
1023                  * data[1] = eip
1024                  */
1025                 XDT_PROBE1(XDT_PV_FORCED_INVALID_OP, data[1]);
1026                 break;
1027         case TRC_PV_FORCED_INVALID_OP | TRC_64_FLAG:
1028                 /*
1029                  * Hypervisor emulated a forced invalid op (ud2)
1030                  * in a 64-bit PV domain.
1031                  *
1032                  * data[1] = rip(0:31)
1033                  * data[2] = rip(32:63)
1034                  *
1035                  */
1036                 rip64 = (((uint64_t)data[2]) << 32) | data[1];
1037                 XDT_PROBE1(XDT_PV_FORCED_INVALID_OP, rip64);
1038                 break;
1039         case TRC_PV_EMULATE_PRIVOP:
1040                 /*
1041                  * Hypervisor emulated a privileged operation
1042                  * in a 32-bit PV domain.
1043                  *
1044                  * data[0] = eip
1045                  */
1046                 XDT_PROBE1(XDT_PV_EMULATE_PRIVOP, data[0]);
1047                 break;
1048         case TRC_PV_EMULATE_PRIVOP | TRC_64_FLAG:
1049                 /*
1050                  * Hypervisor emulated a privileged operation
1051                  * in a 64-bit PV domain.
1052                  *
1053                  * data[0] = rip(0:31)
1054                  * data[1] = rip(32:63)
1055                  */
1056                 rip64 = (((uint64_t)data[1]) << 32) | data[0];
1057                 XDT_PROBE1(XDT_PV_EMULATE_PRIVOP, rip64);
1058                 break;
1059         case TRC_PV_EMULATE_4GB:
1060                 /* unused, 32-bit hypervisor only */
1061                 break;
1062         case TRC_PV_MATH_STATE_RESTORE:
1063                 /*
1064                  * Hypervisor restores math state after FP DNA trap.
1065                  *
1066                  * No arguments.
1067                  */
1068                 XDT_PROBE0(XDT_PV_MATH_STATE_RESTORE);
1069                 break;
1070         case TRC_PV_PAGING_FIXUP:
1071                 /*
1072                  * Hypervisor fixed up a page fault (e.g. it was
1073                  * a side-effect of hypervisor guest page table
1074                  * bookkeeping, and not propagated to the guest).
1075                  *
1076                  * data[0] = eip
1077                  * data[1] = vaddr
1078                  */
1079                 XDT_PROBE2(XDT_PV_PAGING_FIXUP, data[0], data[2]);
1080                 break;
1081         case TRC_PV_PAGING_FIXUP | TRC_64_FLAG:
1082                 /*
1083                  * Hypervisor fixed up a page fault (e.g. it was
1084                  * a side-effect of hypervisor guest page table
1085                  * bookkeeping, and not propagated to the guest).
1086                  *
1087                  * data[0] = eip(0:31)
1088                  * data[1] = eip(32:63)
1089                  * data[2] = vaddr(0:31)
1090                  * data[3] = vaddr(32:63)
1091                  */
1092                 rip64 = (((uint64_t)data[1]) << 32) | data[0];
1093                 addr64 = (((uint64_t)data[3]) << 32) | data[2];
1094                 XDT_PROBE2(XDT_PV_PAGING_FIXUP, rip64, addr64);
1095                 break;
1096         case TRC_PV_GDT_LDT_MAPPING_FAULT:
1097                 /*
1098                  * Descriptor table mapping fault in a 32-bit PV domain.
1099                  * data[0] = eip
1100                  * data[1] = offset
1101                  */
1102                 XDT_PROBE2(XDT_PV_DT_MAPPING_FAULT, data[0], data[1]);
1103                 break;
1104         case TRC_PV_GDT_LDT_MAPPING_FAULT | TRC_64_FLAG:
1105                 /*
1106                  * Descriptor table mapping fault in a 64-bit PV domain.
1107                  *
1108                  * data[0] = eip(0:31)
1109                  * data[1] = eip(32:63)
1110                  * data[2] = offset(0:31)
1111                  * data[3] = offset(32:63)
1112                  */
1113                 rip64 = (((uint64_t)data[1]) << 32) | data[0];
1114                 val64 = (((uint64_t)data[3]) << 32) | data[2];
1115                 XDT_PROBE2(XDT_PV_DT_MAPPING_FAULT, rip64, val64);
1116                 break;
1117         case TRC_PV_PTWR_EMULATION:
1118         case TRC_PV_PTWR_EMULATION_PAE | TRC_64_FLAG:
1119                 /*
1120                  * Should only happen on 32-bit hypervisor; unused.
1121                  */
1122                 break;
1123         case TRC_PV_PTWR_EMULATION_PAE:
1124                 /*
1125                  * PTE write emulation for a 32-bit PV domain.
1126                  *
1127                  * data[0] = pte
1128                  * data[1] = addr
1129                  * data[2] = eip
1130                  */
1131                 XDT_PROBE3(XDT_PV_PTWR_EMULATION, data[0], data[1], data[2]);
1132                 break;
1133         case TRC_PV_PTWR_EMULATION | TRC_64_FLAG:
1134                 /*
1135                  * PTE write emulation for a 64-bit PV domain.
1136                  *
1137                  * data[0] = pte(0:31)
1138                  * data[1] = pte(32:63)
1139                  * data[2] = addr(0:31)
1140                  * data[3] = addr(32:63)
1141                  * data[4] = rip(0:31)
1142                  * data[5] = rip(32:63)
1143                  */
1144                 pte64 = (((uint64_t)data[1]) << 32) | data[0];
1145                 addr64 = (((uint64_t)data[3]) << 32) | data[2];
1146                 rip64 = (((uint64_t)data[5]) << 32) | data[4];
1147                 XDT_PROBE3(XDT_PV_PTWR_EMULATION, pte64, addr64, rip64);
1148                 break;
1149 
1150         /*
1151          * HVM probes
1152          */
1153         case TRC_HVM_VMENTRY:
1154                 /*
1155                  * Return to guest via vmx_launch/vmrun
1156                  *
1157                  */
1158                 XDT_PROBE0(XDT_HVM_VMENTRY);
1159                 break;
1160 
1161         case TRC_HVM_VMEXIT:
1162                 /*
1163                  * Entry into VMEXIT handler from 32-bit HVM domain
1164                  *
1165                  * data[0] = cpu vendor specific exit code
1166                  * data[1] = guest eip
1167                  */
1168                 XDT_PROBE2(XDT_HVM_VMEXIT, data[0], data[1]);
1169                 break;
1170         case TRC_HVM_VMEXIT64:
1171                 /*
1172                  * Entry into VMEXIT handler from 64-bit HVM domain
1173                  *
1174                  * data[0] = cpu vendor specific exit code
1175                  * data[1] = guest rip(0:31)
1176                  * data[2] = guest rip(32:63)
1177                  */
1178                 rip64 = (((uint64_t)data[2]) << 32) | data[1];
1179                 XDT_PROBE2(XDT_HVM_VMEXIT, data[0], rip64);
1180                 break;
1181 
1182         case TRC_HVM_PF_XEN64:
1183                 /*
1184                  * Pagefault in a guest that is a Xen (e.g. shadow)
1185                  * artifact, and is not injected back into the guest.
1186                  *
1187                  * data[0] = error code
1188                  * data[1] = guest VA(0:31)
1189                  * data[2] = guest VA(32:63)
1190                  */
1191                 addr64 = (((uint64_t)data[2]) << 32) | data[1];
1192                 XDT_PROBE2(XDT_HVM_PF_XEN, data[0], addr64);
1193                 break;
1194 
1195         case TRC_HVM_PF_XEN:
1196                 /*
1197                  * Same as above, but for a 32-bit HVM domain.
1198                  *
1199                  * data[0] = error code
1200                  * data[1] = guest VA
1201                  */
1202                 XDT_PROBE2(XDT_HVM_PF_XEN, data[0], data[1]);
1203                 break;
1204 
1205         case TRC_HVM_PF_INJECT:
1206                 /*
1207                  * 32-bit Xen only.
1208                  */
1209                 break;
1210         case TRC_HVM_PF_INJECT64:
1211                 /*
1212                  * Pagefault injected back into a guest (e.g. the shadow
1213                  * code found no mapping).
1214                  *
1215                  * data[0] = error code
1216                  * data[1] = guest VA(0:31)
1217                  * data[2] = guest VA(32:63)
1218                  */
1219                 addr64 = (((uint64_t)data[2]) << 32) | data[1];
1220                 XDT_PROBE2(XDT_HVM_PF_INJECT, data[0], addr64);
1221                 break;
1222 
1223         case TRC_HVM_INJ_EXC:
1224                 /*
1225                  * Exception injected into an HVM guest.
1226                  *
1227                  * data[0] = trap
1228                  * data[1] = error code
1229                  */
1230                 XDT_PROBE2(XDT_HVM_EXC_INJECT, data[0], data[1]);
1231                 break;
1232         case TRC_HVM_INJ_VIRQ:
1233                 /*
1234                  * Interrupt inject into an HVM guest.
1235                  *
1236                  * data[0] = vector
1237                  */
1238                 XDT_PROBE1(XDT_HVM_VIRQ_INJECT, data[0]);
1239                 break;
1240         case TRC_HVM_REINJ_VIRQ:
1241         case TRC_HVM_IO_READ:
1242         case TRC_HVM_IO_WRITE:
1243                 /* unused */
1244                 break;
1245         case TRC_HVM_CR_READ64:
1246                 /*
1247                  * Control register read. Intel VMX only.
1248                  *
1249                  * data[0] = control register #
1250                  * data[1] = value(0:31)
1251                  * data[2] = value(32:63)
1252                  */
1253                 val64 = (((uint64_t)data[2]) << 32) | data[1];
1254                 XDT_PROBE2(XDT_HVM_CR_READ, data[0], val64);
1255                 break;
1256         case TRC_HVM_CR_READ:
1257                 /*
1258                  * unused (32-bit Xen only)
1259                  */
1260                 break;
1261         case TRC_HVM_CR_WRITE64:
1262                 /*
1263                  * Control register write. Intel VMX only.
1264                  *
1265                  * data[0] = control register #
1266                  * data[1] = value(0:31)
1267                  * data[2] = value(32:63)
1268                  */
1269                 val64 = (((uint64_t)data[2]) << 32) | data[1];
1270                 XDT_PROBE2(XDT_HVM_CR_READ, data[0], val64);
1271                 break;
1272         case TRC_HVM_CR_WRITE:
1273                 /*
1274                  * unused (32-bit Xen only)
1275                  */
1276                 break;
1277         case TRC_HVM_DR_READ:
1278                 /*
1279                  * unused.
1280                  *
1281                  * data[0] = (domid<<16 + vcpuid)
1282                  */
1283                 break;
1284         case TRC_HVM_DR_WRITE:
1285                 /*
1286                  * Debug register write. Not too useful; no values,
1287                  * so we ignore this.
1288                  *
1289                  * data[0] = (domid<<16 + vcpuid)
1290                  */
1291                 break;
1292         case TRC_HVM_MSR_READ:
1293                 /*
1294                  * MSR read.
1295                  *
1296                  * data[0] = MSR
1297                  * data[1] = value(0:31)
1298                  * data[2] = value(32:63)
1299                  */
1300                 val64 = (((uint64_t)data[3]) << 32) | data[2];
1301                 XDT_PROBE2(XDT_HVM_MSR_READ, data[0], val64);
1302                 break;
1303         case TRC_HVM_MSR_WRITE:
1304                 /*
1305                  * MSR write.
1306                  *
1307                  * data[0] = MSR;
1308                  * data[1] = value(0:31)
1309                  * data[2] = value(32:63)
1310                  */
1311                 val64 = (((uint64_t)data[2]) << 32) | data[1];
1312                 XDT_PROBE2(XDT_HVM_MSR_WRITE, data[0], val64);
1313                 break;
1314         case TRC_HVM_CPUID:
1315                 /*
1316                  * CPUID insn.
1317                  *
1318                  * data[0] = %eax (input)
1319                  * data[1] = %eax
1320                  * data[2] = %ebx
1321                  * data[3] = %ecx
1322                  * data[4] = %edx
1323                  */
1324                 XDT_PROBE5(XDT_HVM_CPUID, data[0], data[1], data[2], data[3],
1325                     data[4]);
1326                 break;
1327         case TRC_HVM_INTR:
1328                 /*
1329                  * VMEXIT because of an interrupt.
1330                  */
1331                 XDT_PROBE0(XDT_HVM_INTR);
1332                 break;
1333         case TRC_HVM_INTR_WINDOW:
1334                 /*
1335                  * VMEXIT because of an interrupt window (an interrupt
1336                  * can't be delivered immediately to a HVM guest and must
1337                  * be delayed).
1338                  *
1339                  * data[0] = vector
1340                  * data[1] = source
1341                  * data[2] = info
1342                  */
1343                 XDT_PROBE3(XDT_HVM_INTR_WINDOW, data[0], data[1], data[2]);
1344                 break;
1345         case TRC_HVM_NMI:
1346                 /*
1347                  * VMEXIT because of an NMI.
1348                  */
1349                 XDT_PROBE0(XDT_HVM_NMI);
1350                 break;
1351         case TRC_HVM_SMI:
1352                 /*
1353                  * VMEXIT because of an SMI
1354                  */
1355                 XDT_PROBE0(XDT_HVM_SMI);
1356                 break;
1357         case TRC_HVM_VMMCALL:
1358                 /*
1359                  * VMMCALL insn.
1360                  *
1361                  * data[0] = %eax
1362                  */
1363                 XDT_PROBE1(XDT_HVM_VMMCALL, data[0]);
1364                 break;
1365         case TRC_HVM_HLT:
1366                 /*
1367                  * HLT insn.
1368                  *
1369                  * data[0] = 1 if VCPU runnable, 0 if not
1370                  */
1371                 XDT_PROBE1(XDT_HVM_HLT, data[0]);
1372                 break;
1373         case TRC_HVM_INVLPG64:
1374                 /*
1375                  *
1376                  * data[0] = INVLPGA ? 1 : 0
1377                  * data[1] = vaddr(0:31)
1378                  * data[2] = vaddr(32:63)
1379                  */
1380                 addr64 = (((uint64_t)data[2]) << 32) | data[1];
1381                 XDT_PROBE2(XDT_HVM_INVLPG, data[0], addr64);
1382                 break;
1383         case TRC_HVM_INVLPG:
1384                 /*
1385                  * unused (32-bit Xen only)
1386                  *
1387                  * data[0] = (domid<<16 + vcpuid)
1388                  */
1389                 break;
1390         case TRC_HVM_MCE:
1391                 /*
1392                  * #MCE VMEXIT
1393                  *
1394                  */
1395                 XDT_PROBE0(XDT_HVM_MCE);
1396                 break;
1397         case TRC_HVM_IOPORT_READ:
1398         case TRC_HVM_IOPORT_WRITE:
1399         case TRC_HVM_IOMEM_READ:
1400         case TRC_HVM_IOMEM_WRITE:
1401                 /*
1402                  * data[0] = addr(0:31)
1403                  * data[1] = addr(32:63)
1404                  * data[2] = count
1405                  * data[3] = size
1406                  */
1407                 switch (rec->event) {
1408                 case TRC_HVM_IOPORT_READ:
1409                         eid = XDT_HVM_IOPORT_READ;
1410                         break;
1411                 case TRC_HVM_IOPORT_WRITE:
1412                         eid = XDT_HVM_IOPORT_WRITE;
1413                         break;
1414                 case TRC_HVM_IOMEM_READ:
1415                         eid = XDT_HVM_IOMEM_READ;
1416                         break;
1417                 case TRC_HVM_IOMEM_WRITE:
1418                         eid = XDT_HVM_IOMEM_WRITE;
1419                         break;
1420                 }
1421                 addr64 = (((uint64_t)data[1]) << 32) | data[0];
1422                 XDT_PROBE3(eid, addr64, data[2], data[3]);
1423                 break;
1424         case TRC_HVM_CLTS:
1425                 /*
1426                  * CLTS insn (Intel VMX only)
1427                  */
1428                 XDT_PROBE0(XDT_HVM_CLTS);
1429                 break;
1430         case TRC_HVM_LMSW64:
1431                 /*
1432                  * LMSW insn.
1433                  *
1434                  * data[0] = value(0:31)
1435                  * data[1] = value(32:63)
1436                  */
1437                 val64 = (((uint64_t)data[1]) << 32) | data[0];
1438                 XDT_PROBE1(XDT_HVM_LMSW, val64);
1439                 break;
1440         case TRC_HVM_LMSW:
1441                 /*
1442                  * unused (32-bit Xen only)
1443                  */
1444                 break;
1445 
1446         /*
1447          * Shadow page table probes (mainly used for HVM domains
1448          * without hardware paging support).
1449          */
1450         case TRC_SHADOW_NOT_SHADOW | SH_GUEST_32:
1451                 /*
1452                  * data[0] = pte(0:31)
1453                  * data[1] = pte(32:63)
1454                  * data[2] = va
1455                  * data[3] = flags
1456                  */
1457                 pte64 = ((uint64_t)data[1] << 32) | data[0];
1458                 XDT_PROBE3(XDT_SHADOW_NOT_SHADOW, pte64, data[2], data[3]);
1459                 break;
1460         case TRC_SHADOW_NOT_SHADOW | SH_GUEST_PAE:
1461         case TRC_SHADOW_NOT_SHADOW | SH_GUEST_64:
1462                 /*
1463                  * data[0] = pte(0:31)
1464                  * data[1] = pte(32:63)
1465                  * data[2] = va(0:31)
1466                  * data[3] = va(32:63)
1467                  * data[4] = flags
1468                  */
1469                 addr64 = ((uint64_t)data[2] << 32) | data[3];
1470                 pte64 = ((uint64_t)data[1] << 32) | data[0];
1471                 XDT_PROBE3(XDT_SHADOW_NOT_SHADOW, pte64, addr64, data[4]);
1472                 break;
1473         case TRC_SHADOW_FAST_PROPAGATE | SH_GUEST_32:
1474                 /*
1475                  * data[0] = va
1476                  */
1477                 XDT_PROBE1(XDT_SHADOW_FAST_PROPAGATE, data[0]);
1478                 break;
1479         case TRC_SHADOW_FAST_PROPAGATE | SH_GUEST_PAE:
1480         case TRC_SHADOW_FAST_PROPAGATE | SH_GUEST_64:
1481                 /*
1482                  * data[0] = va(0:31)
1483                  * data[1] = va(32:63)
1484                  */
1485                 addr64 = ((uint64_t)data[1] << 32) | data[0];
1486                 XDT_PROBE1(XDT_SHADOW_FAST_PROPAGATE, addr64);
1487                 break;
1488         case TRC_SHADOW_FAST_MMIO | SH_GUEST_32:
1489                 /*
1490                  * data[0] = va
1491                  */
1492                 XDT_PROBE1(XDT_SHADOW_FAST_MMIO, data[0]);
1493                 break;
1494         case TRC_SHADOW_FAST_MMIO | SH_GUEST_PAE:
1495         case TRC_SHADOW_FAST_MMIO | SH_GUEST_64:
1496                 /*
1497                  * data[0] = va(0:31)
1498                  * data[1] = va(32:63)
1499                  */
1500                 addr64 = ((uint64_t)data[1] << 32) | data[0];
1501                 XDT_PROBE1(XDT_SHADOW_FAST_MMIO, addr64);
1502                 break;
1503         case TRC_SHADOW_FALSE_FAST_PATH | SH_GUEST_32:
1504                 /*
1505                  * data[0] = va
1506                  */
1507                 XDT_PROBE1(XDT_SHADOW_FALSE_FAST_PATH, data[0]);
1508                 break;
1509         case TRC_SHADOW_FALSE_FAST_PATH | SH_GUEST_PAE:
1510         case TRC_SHADOW_FALSE_FAST_PATH | SH_GUEST_64:
1511                 /*
1512                  * data[0] = va(0:31)
1513                  * data[1] = va(32:63)
1514                  */
1515                 addr64 = ((uint64_t)data[1] << 32) | data[0];
1516                 XDT_PROBE1(XDT_SHADOW_FALSE_FAST_PATH, addr64);
1517                 break;
1518         case TRC_SHADOW_MMIO | SH_GUEST_32:
1519                 /*
1520                  * data[0] = va
1521                  */
1522                 XDT_PROBE1(XDT_SHADOW_MMIO, data[0]);
1523                 break;
1524         case TRC_SHADOW_MMIO | SH_GUEST_PAE:
1525         case TRC_SHADOW_MMIO | SH_GUEST_64:
1526                 /*
1527                  * data[0] = va(0:31)
1528                  * data[1] = va(32:63)
1529                  */
1530                 addr64 = ((uint64_t)data[1] << 32) | data[0];
1531                 XDT_PROBE1(XDT_SHADOW_MMIO, addr64);
1532                 break;
1533         case TRC_SHADOW_FIXUP | SH_GUEST_32:
1534                 /*
1535                  * data[0] = pte(0:31)
1536                  * data[1] = pte(32:63)
1537                  * data[2] = va
1538                  * data[3] = flags
1539                  */
1540                 pte64 = ((uint64_t)data[1] << 32) | data[0];
1541                 XDT_PROBE3(XDT_SHADOW_FIXUP, pte64, data[2], data[3]);
1542                 break;
1543         case TRC_SHADOW_FIXUP | SH_GUEST_64:
1544         case TRC_SHADOW_FIXUP | SH_GUEST_PAE:
1545                 /*
1546                  * data[0] = pte(0:31)
1547                  * data[1] = pte(32:63)
1548                  * data[2] = va(0:31)
1549                  * data[3] = va(32:63)
1550                  * data[4] = flags
1551                  */
1552                 addr64 = ((uint64_t)data[2] << 32) | data[3];
1553                 pte64 = ((uint64_t)data[1] << 32) | data[0];
1554                 XDT_PROBE3(XDT_SHADOW_FIXUP, pte64, addr64, data[4]);
1555                 break;
1556         case TRC_SHADOW_DOMF_DYING | SH_GUEST_32:
1557                 /*
1558                  * data[0] = va
1559                  */
1560                 XDT_PROBE1(XDT_SHADOW_DOMF_DYING, data[0]);
1561                 break;
1562         case TRC_SHADOW_DOMF_DYING | SH_GUEST_PAE:
1563         case TRC_SHADOW_DOMF_DYING | SH_GUEST_64:
1564                 /*
1565                  * data[0] = va(0:31)
1566                  * data[1] = va(32:63)
1567                  */
1568                 addr64 = ((uint64_t)data[1] << 32) | data[0];
1569                 XDT_PROBE1(XDT_SHADOW_DOMF_DYING, addr64);
1570                 break;
1571         case TRC_SHADOW_EMULATE | SH_GUEST_32:
1572                 /*
1573                  * data[0] = pte(0:31)
1574                  * data[1] = pte(32:63)
1575                  * data[2] = val(0:31)
1576                  * data[3] = val(32:63)
1577                  * data[4] = addr
1578                  * data[5] = flags
1579                  */
1580                 pte64 = ((uint64_t)data[1] << 32) | data[0];
1581                 val64 = ((uint64_t)data[3] << 32) | data[2];
1582                 XDT_PROBE5(XDT_SHADOW_EMULATE, pte64, val64, data[4],
1583                     data[5] & 0x7fffffff, data[5] >> 29);
1584                 break;
1585         case TRC_SHADOW_EMULATE | SH_GUEST_PAE:
1586         case TRC_SHADOW_EMULATE | SH_GUEST_64:
1587                 /*
1588                  * data[0] = pte(0:31)
1589                  * data[1] = pte(32:63)
1590                  * data[2] = val(0:31)
1591                  * data[3] = val(32:63)
1592                  * data[4] = addr(0:31)
1593                  * data[5] = addr(32:63)
1594                  * data[6] = flags
1595                  */
1596                 pte64 = ((uint64_t)data[1] << 32) | data[0];
1597                 val64 = ((uint64_t)data[3] << 32) | data[2];
1598                 addr64 = ((uint64_t)data[5] << 32) | data[4];
1599                 XDT_PROBE5(XDT_SHADOW_EMULATE, pte64, val64, data[4],
1600                     data[6] & 0x7fffffff, data[6] >> 29);
1601                 break;
1602         case TRC_SHADOW_EMULATE_UNSHADOW_USER | SH_GUEST_32:
1603                 /*
1604                  * data[0] = gfn
1605                  * data[1] = vaddr
1606                  */
1607                 XDT_PROBE2(XDT_SHADOW_EMULATE_UNSHADOW_USER, data[0], data[1]);
1608                 break;
1609         case TRC_SHADOW_EMULATE_UNSHADOW_USER | SH_GUEST_PAE:
1610         case TRC_SHADOW_EMULATE_UNSHADOW_USER | SH_GUEST_64:
1611                 /*
1612                  * data[0] = gfn(0:31)
1613                  * data[1] = gfn(32:63)
1614                  * data[2] = vaddr(0:31)
1615                  * data[3] = vaddr(32:63)
1616                  */
1617                 val64 = ((uint64_t)data[1] << 32) | data[0];
1618                 addr64 = ((uint64_t)data[3] << 32) | data[2];
1619                 XDT_PROBE2(XDT_SHADOW_EMULATE_UNSHADOW_USER, val64, addr64);
1620                 break;
1621         case TRC_SHADOW_EMULATE_UNSHADOW_EVTINJ | SH_GUEST_32:
1622                 /*
1623                  * data[0] = gfn
1624                  * data[1] = vaddr
1625                  */
1626                 XDT_PROBE2(XDT_SHADOW_EMULATE_UNSHADOW_EVTINJ, data[0],
1627                     data[1]);
1628                 break;
1629         case TRC_SHADOW_EMULATE_UNSHADOW_EVTINJ | SH_GUEST_PAE:
1630         case TRC_SHADOW_EMULATE_UNSHADOW_EVTINJ | SH_GUEST_64:
1631                 /*
1632                  * data[0] = gfn(0:31)
1633                  * data[1] = gfn(32:63)
1634                  * data[2] = vaddr(0:31)
1635                  * data[3] = vaddr(32:63)
1636                  */
1637                 val64 = ((uint64_t)data[1] << 32) | data[0];
1638                 addr64 = ((uint64_t)data[3] << 32) | data[2];
1639                 XDT_PROBE2(XDT_SHADOW_EMULATE_UNSHADOW_EVTINJ, val64, addr64);
1640                 break;
1641         case TRC_SHADOW_EMULATE_UNSHADOW_UNHANDLED | SH_GUEST_32:
1642                 /*
1643                  * data[0] = gfn
1644                  * data[1] = vaddr
1645                  */
1646                 XDT_PROBE2(XDT_SHADOW_EMULATE_UNSHADOW_UNHANDLED, data[0],
1647                     data[1]);
1648                 break;
1649         case TRC_SHADOW_EMULATE_UNSHADOW_UNHANDLED | SH_GUEST_PAE:
1650         case TRC_SHADOW_EMULATE_UNSHADOW_UNHANDLED | SH_GUEST_64:
1651                 /*
1652                  * data[0] = gfn(0:31)
1653                  * data[1] = gfn(32:63)
1654                  * data[2] = vaddr(0:31)
1655                  * data[3] = vaddr(32:63)
1656                  */
1657                 val64 = ((uint64_t)data[1] << 32) | data[0];
1658                 addr64 = ((uint64_t)data[3] << 32) | data[2];
1659                 XDT_PROBE2(XDT_SHADOW_EMULATE_UNSHADOW_UNHANDLED, val64,
1660                     addr64);
1661                 break;
1662         case TRC_SHADOW_WRMAP_BF:
1663                 /*
1664                  * data[0] = gfn(0:31)
1665                  * data[1] = gfn(32:63)
1666                  */
1667                 val64 = ((uint64_t)data[1] << 32) | data[0];
1668                 XDT_PROBE1(XDT_SHADOW_WRMAP_BF, val64);
1669                 break;
1670         case TRC_SHADOW_PREALLOC_UNPIN:
1671                 /*
1672                  * data[0] = gfn(0:31)
1673                  * data[1] = gfn(32:63)
1674                  */
1675                 val64 = ((uint64_t)data[1] << 32) | data[0];
1676                 XDT_PROBE1(XDT_SHADOW_PREALLOC_UNPIN, val64);
1677                 break;
1678         case TRC_SHADOW_RESYNC_FULL:
1679                 /*
1680                  * data[0] = gmfn(0:31)
1681                  * data[1] = gmfn(32:63)
1682                  */
1683                 val64 = ((uint64_t)data[1] << 32) | data[0];
1684                 XDT_PROBE1(XDT_SHADOW_RESYNC_FULL, val64);
1685                 break;
1686         case TRC_SHADOW_RESYNC_ONLY:
1687                 /*
1688                  * data[0] = gmfn(0:31)
1689                  * data[1] = gmfn(32:63)
1690                  */
1691                 val64 = ((uint64_t)data[1] << 32) | data[0];
1692                 XDT_PROBE1(XDT_SHADOW_RESYNC_ONLY, val64);
1693                 break;
1694 
1695         /*
1696          * Power management probes.
1697          */
1698         case TRC_PM_FREQ_CHANGE:
1699                 /*
1700                  * data[0] = old freq
1701                  * data[1] = new freq
1702                  */
1703                 XDT_PROBE2(XDT_PM_FREQ_CHANGE, data[0], data[1]);
1704                 break;
1705         case TRC_PM_IDLE_ENTRY:
1706                 /*
1707                  * data[0] = C-state
1708                  * data[1] = time
1709                  */
1710                 XDT_PROBE2(XDT_PM_IDLE_ENTRY, data[0], data[1]);
1711                 break;
1712         case TRC_PM_IDLE_EXIT:
1713                 /*
1714                  * data[0] = C-state
1715                  * data[1] = time
1716                  */
1717                 XDT_PROBE2(XDT_PM_IDLE_EXIT, data[0], data[1]);
1718                 break;
1719         case TRC_LOST_RECORDS:
1720                 vcpu = data[1] >> 16;
1721                 dom = data[1] & 0xffff;
1722                 xdt_update_sched_context(cpuid, dom, vcpu);
1723                 xdt_update_domain_context(dom, vcpu);
1724                 XDT_PROBE1(XDT_TRC_LOST_RECORDS, cpuid);
1725                 tbuf.stat_dropped_recs++;
1726                 break;
1727 
1728         default:
1729                 tbuf.stat_unknown_recs++;
1730                 break;
1731         }
1732 
1733 done:
1734         rec_size = 4 + (rec->cycles_included ? 8 : 0) + (rec->extra_u32 * 4);
1735         return (rec_size);
1736 }
1737 
1738 /*
1739  * Scan all CPU buffers for the record with the lowest timestamp so
1740  * that the probes will fire in order.
1741  */
1742 static int
1743 xdt_get_first_rec(uint_t *cpuidp, struct t_rec **recp, uint32_t *consp)
1744 {
1745         uint_t cpuid;
1746         uint32_t prod, cons, offset;
1747         struct t_rec *rec;
1748         uint64_t minstamp = ~0ULL, stamp;
1749         uintptr_t data;
1750 
1751         for (cpuid = 0; cpuid < tbuf.cnt; cpuid++) {
1752                 cons = tbuf.meta[cpuid]->cons;
1753                 prod = tbuf.meta[cpuid]->prod;
1754                 membar_consumer();
1755                 if (prod == cons)
1756                         continue;
1757 
1758                 offset = cons % tbuf_data_size;
1759                 data = (uintptr_t)tbuf.data[cpuid] + offset;
1760                 rec = (struct t_rec *)data;
1761                 ASSERT((caddr_t)rec < tbuf.va + (tbuf.size * (cpuid + 1)));
1762 
1763                 /*
1764                  * All records that we know about have time cycles included.
1765                  * If this record doesn't have them, assume it's a type
1766                  * that we don't handle. Use a 0 time value, which will make
1767                  * it get handled first (it will be thrown away).
1768                  */
1769                 if (rec->cycles_included)
1770                         stamp = (((uint64_t)rec->u.cycles.cycles_hi) << 32)
1771                             | rec->u.cycles.cycles_lo;
1772                 else
1773                         stamp = 0;
1774 
1775                 if (stamp < minstamp) {
1776                         minstamp = stamp;
1777                         *cpuidp = cpuid;
1778                         *recp = rec;
1779                         *consp = cons;
1780                 }
1781         }
1782 
1783         if (minstamp != ~0ULL)
1784                 return (1);
1785 
1786         return (0);
1787 }
1788 
1789 /*ARGSUSED*/
1790 static void
1791 xdt_tbuf_scan(void *arg)
1792 {
1793         uint32_t bytes_done, cons;
1794         struct t_rec *rec;
1795         xdt_schedinfo_t *sp;
1796         uint_t nrecs, cpuid;
1797 
1798         for (nrecs = 0;
1799             nrecs < xdt_max_recs && xdt_get_first_rec(&cpuid, &rec, &cons) > 0;
1800             nrecs++) {
1801                 xdt_curpcpu = cpuid;
1802                 sp = &xdt_cpu_schedinfo[cpuid];
1803                 if (sp->curinfo_valid)
1804                         xdt_update_domain_context(sp->cur_domid,
1805                             sp->cur_vcpuid);
1806 
1807                 bytes_done = xdt_process_rec(cpuid, rec);
1808                 cons += bytes_done;
1809                 /*
1810                  * cons and prod are incremented modulo (2 * tbuf_data_size).
1811                  * See <xen/public/trace.h>.
1812                  */
1813                 if (cons >= 2 * tbuf_data_size)
1814                         cons -= 2 * tbuf_data_size;
1815                 membar_exit();
1816                 tbuf.meta[cpuid]->cons = cons;
1817         }
1818 }
1819 
1820 static void
1821 xdt_cyclic_enable(void)
1822 {
1823         cyc_handler_t hdlr;
1824         cyc_time_t when;
1825 
1826         ASSERT(MUTEX_HELD(&cpu_lock));
1827 
1828         hdlr.cyh_func = xdt_tbuf_scan;
1829         hdlr.cyh_arg = NULL;
1830         hdlr.cyh_level = CY_LOW_LEVEL;
1831 
1832         when.cyt_interval = xdt_poll_nsec;
1833         when.cyt_when = dtrace_gethrtime() + when.cyt_interval;
1834 
1835         xdt_cyclic = cyclic_add(&hdlr, &when);
1836 }
1837 
1838 static void
1839 xdt_probe_create(xdt_probe_t *p)
1840 {
1841         ASSERT(p != NULL && p->pr_mod != NULL);
1842 
1843         if (dtrace_probe_lookup(xdt_id, p->pr_mod, NULL, p->pr_name) != 0)
1844                 return;
1845 
1846         xdt_prid[p->evt_id] = dtrace_probe_create(xdt_id, p->pr_mod, NULL,
1847             p->pr_name, dtrace_mach_aframes(), p);
1848 }
1849 
1850 /*ARGSUSED*/
1851 static void
1852 xdt_provide(void *arg, const dtrace_probedesc_t *desc)
1853 {
1854         const char *mod, *name;
1855         int i;
1856 
1857         if (desc == NULL) {
1858                 for (i = 0; xdt_probe[i].pr_mod != NULL; i++) {
1859                         xdt_probe_create(&xdt_probe[i]);
1860                 }
1861         } else {
1862                 mod = desc->dtpd_mod;
1863                 name = desc->dtpd_name;
1864                 for (i = 0; xdt_probe[i].pr_mod != NULL; i++) {
1865                         int l1 = strlen(xdt_probe[i].pr_name);
1866                         int l2 = strlen(xdt_probe[i].pr_mod);
1867                         if (strncmp(name, xdt_probe[i].pr_name, l1) == 0 &&
1868                             strncmp(mod, xdt_probe[i].pr_mod, l2) == 0)
1869                                 break;
1870                 }
1871 
1872                 if (xdt_probe[i].pr_mod == NULL)
1873                         return;
1874                 xdt_probe_create(&xdt_probe[i]);
1875         }
1876 
1877 }
1878 
1879 /*ARGSUSED*/
1880 static void
1881 xdt_destroy(void *arg, dtrace_id_t id, void *parg)
1882 {
1883         xdt_probe_t *p = parg;
1884         xdt_prid[p->evt_id] = 0;
1885 }
1886 
1887 static void
1888 xdt_set_trace_mask(uint32_t mask)
1889 {
1890         xen_sysctl_tbuf_op_t tbuf_op;
1891 
1892         /* Always need to trace scheduling, for context */
1893         if (mask != 0)
1894                 mask |= TRC_SCHED;
1895         tbuf_op.evt_mask = mask;
1896         tbuf_op.cmd  = XEN_SYSCTL_TBUFOP_set_evt_mask;
1897         (void) xdt_sysctl_tbuf(&tbuf_op);
1898 }
1899 
1900 /*ARGSUSED*/
1901 static int
1902 xdt_enable(void *arg, dtrace_id_t id, void *parg)
1903 {
1904         xdt_probe_t *p = parg;
1905         xen_sysctl_tbuf_op_t tbuf_op;
1906 
1907         ASSERT(MUTEX_HELD(&cpu_lock));
1908         ASSERT(xdt_prid[p->evt_id] != 0);
1909 
1910         xdt_probemap[p->evt_id] = xdt_prid[p->evt_id];
1911         xdt_classinfo[p->class].cnt++;
1912 
1913         if (xdt_classinfo[p->class].cnt == 1) {
1914                 /* set the trace mask for this class */
1915                 cur_trace_mask |= xdt_classinfo[p->class].trc_mask;
1916                 xdt_set_trace_mask(cur_trace_mask);
1917         }
1918 
1919         if (xdt_cyclic == CYCLIC_NONE) {
1920                 tbuf_op.cmd = XEN_SYSCTL_TBUFOP_enable;
1921                 if (xdt_sysctl_tbuf(&tbuf_op) != 0) {
1922                         cmn_err(CE_NOTE, "Couldn't enable hypervisor tracing.");
1923                         return (-1);
1924                 }
1925 
1926                 xdt_cyclic_enable();
1927         }
1928         return (0);
1929 }
1930 
1931 /*ARGSUSED*/
1932 static void
1933 xdt_disable(void *arg, dtrace_id_t id, void *parg)
1934 {
1935         xdt_probe_t *p = parg;
1936         xen_sysctl_tbuf_op_t tbuf_op;
1937         int i, err;
1938 
1939         ASSERT(MUTEX_HELD(&cpu_lock));
1940         ASSERT(xdt_probemap[p->evt_id] != 0);
1941         ASSERT(xdt_probemap[p->evt_id] == xdt_prid[p->evt_id]);
1942         ASSERT(xdt_classinfo[p->class].cnt > 0);
1943 
1944         /*
1945          * We could be here in the slight window between the cyclic firing and
1946          * a call to dtrace_probe() occurring. We need to be careful if we tear
1947          * down any shared state.
1948          */
1949 
1950         xdt_probemap[p->evt_id] = 0;
1951         xdt_classinfo[p->class].cnt--;
1952 
1953         if (xdt_nr_active_probes() == 0) {
1954                 cur_trace_mask = 0;
1955 
1956                 if (xdt_cyclic == CYCLIC_NONE)
1957                         return;
1958 
1959                 for (i = 0; i < xdt_ncpus; i++)
1960                         xdt_cpu_schedinfo[i].curinfo_valid = 0;
1961 
1962                 /*
1963                  * We will try to disable the trace buffers. If we fail for some
1964                  * reason we will try again, up to a count of XDT_TBUF_RETRY.
1965                  * If we still aren't successful we try to set the trace mask
1966                  * to 0 in order to prevent trace records from being written.
1967                  */
1968                 tbuf_op.cmd = XEN_SYSCTL_TBUFOP_disable;
1969                 i = 0;
1970                 do {
1971                         err = xdt_sysctl_tbuf(&tbuf_op);
1972                 } while ((err != 0) && (++i < XDT_TBUF_RETRY));
1973 
1974                 if (err != 0) {
1975                         cmn_err(CE_NOTE,
1976                             "Couldn't disable hypervisor tracing.");
1977                         xdt_set_trace_mask(0);
1978                 } else {
1979                         cyclic_remove(xdt_cyclic);
1980                         xdt_cyclic = CYCLIC_NONE;
1981                         /*
1982                          * We don't bother making the hypercall to set
1983                          * the trace mask, since it will be reset when
1984                          * tracing is re-enabled.
1985                          */
1986                 }
1987         } else if (xdt_classinfo[p->class].cnt == 0) {
1988                 cur_trace_mask ^= xdt_classinfo[p->class].trc_mask;
1989                 /* other probes are enabled, so add the sub-class mask back */
1990                 cur_trace_mask |= 0xF000;
1991                 xdt_set_trace_mask(cur_trace_mask);
1992         }
1993 }
1994 
/*
 * Stability attributes handed to dtrace_register() for the xdt provider.
 * Every row is Private/Private; the dependency class is Platform for all
 * rows except the third, which is Unknown.
 * NOTE(review): the rows presumably follow the dtrace_pattr_t member order
 * (provider, module, function, name, arguments) -- confirm against
 * <sys/dtrace.h>.
 */
static dtrace_pattr_t xdt_attr = {
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_PLATFORM },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_PLATFORM },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_PLATFORM },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_PLATFORM },
};
2002 
/*
 * Provider operations vector handed to dtrace_register() in xdt_attach().
 * Only the provide, enable, disable and destroy operations are implemented;
 * every other slot is left NULL.
 */
static dtrace_pops_t xdt_pops = {
        xdt_provide,            /* dtps_provide() */
        NULL,                   /* dtps_provide_module() */
        xdt_enable,             /* dtps_enable() */
        xdt_disable,            /* dtps_disable() */
        NULL,                   /* dtps_suspend() */
        NULL,                   /* dtps_resume() */
        NULL,                   /* dtps_getargdesc() */
        NULL,                   /* dtps_getargval() */
        NULL,                   /* dtps_usermode() */
        xdt_destroy             /* dtps_destroy() */
};
2015 
2016 static int
2017 xdt_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
2018 {
2019         int val;
2020 
2021         if (!DOMAIN_IS_INITDOMAIN(xen_info))
2022                 return (DDI_FAILURE);
2023 
2024         switch (cmd) {
2025         case DDI_ATTACH:
2026                 break;
2027 
2028         case DDI_RESUME:
2029                 /*
2030                  * We might support proper suspend/resume in the future, so,
2031                  * return DDI_FAILURE for now.
2032                  */
2033                 return (DDI_FAILURE);
2034 
2035         default:
2036                 return (DDI_FAILURE);
2037         }
2038 
2039         xdt_ncpus = xpv_nr_phys_cpus();
2040         ASSERT(xdt_ncpus > 0);
2041 
2042         if (ddi_create_minor_node(devi, "xdt", S_IFCHR, 0, DDI_PSEUDO, 0) ==
2043             DDI_FAILURE || xdt_attach_trace_buffers() != 0 ||
2044             dtrace_register("xdt", &xdt_attr, DTRACE_PRIV_KERNEL, NULL,
2045             &xdt_pops, NULL, &xdt_id) != 0) {
2046                 if (tbuf.va != NULL)
2047                         xdt_detach_trace_buffers();
2048                 ddi_remove_minor_node(devi, NULL);
2049                 return (DDI_FAILURE);
2050         }
2051 
2052         val = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
2053             "xdt_poll_nsec", XDT_POLL_DEFAULT);
2054         xdt_poll_nsec = MAX(val, XDT_POLL_MIN);
2055 
2056         xdt_cpu_schedinfo = (xdt_schedinfo_t *)kmem_zalloc(xdt_ncpus *
2057             sizeof (xdt_schedinfo_t), KM_SLEEP);
2058         xdt_init_trace_masks();
2059         xdt_kstat_init();
2060 
2061         xdt_devi = devi;
2062         ddi_report_dev(devi);
2063         return (DDI_SUCCESS);
2064 }
2065 
2066 static int
2067 xdt_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
2068 {
2069         switch (cmd) {
2070         case DDI_DETACH:
2071                 break;
2072 
2073         case DDI_SUSPEND:
2074                 /*
2075                  * We might support proper suspend/resume in the future. So
2076                  * return DDI_FAILURE for now.
2077                  */
2078                 return (DDI_FAILURE);
2079 
2080         default:
2081                 return (DDI_FAILURE);
2082         }
2083 
2084         if (dtrace_unregister(xdt_id) != 0)
2085                 return (DDI_FAILURE);
2086 
2087         xdt_detach_trace_buffers();
2088         kmem_free(xdt_cpu_schedinfo, xdt_ncpus * sizeof (xdt_schedinfo_t));
2089         if (xdt_cyclic != CYCLIC_NONE)
2090                 cyclic_remove(xdt_cyclic);
2091         if (xdt_kstats != NULL)
2092                 kstat_delete(xdt_kstats);
2093         xdt_devi = (void *)0;
2094         ddi_remove_minor_node(devi, NULL);
2095 
2096         return (DDI_SUCCESS);
2097 }
2098 
2099 /*ARGSUSED*/
2100 static int
2101 xdt_info(dev_info_t *devi, ddi_info_cmd_t infocmd, void *arg, void **result)
2102 {
2103         int error;
2104 
2105         switch (infocmd) {
2106         case DDI_INFO_DEVT2DEVINFO:
2107                 *result = xdt_devi;
2108                 error = DDI_SUCCESS;
2109                 break;
2110         case DDI_INFO_DEVT2INSTANCE:
2111                 *result = (void *)0;
2112                 error = DDI_SUCCESS;
2113                 break;
2114         default:
2115                 error = DDI_FAILURE;
2116         }
2117         return (error);
2118 }
2119 
/*
 * Character/block entry points for the xdt device, referenced from xdt_ops.
 * open(9E) is nulldev and prop_op(9E) is ddi_prop_op; all remaining entry
 * points are filled with nodev, nochpoll or NULL.
 */
static struct cb_ops xdt_cb_ops = {
        nulldev,                /* open(9E) */
        nodev,                  /* close(9E) */
        nodev,                  /* strategy(9E) */
        nodev,                  /* print(9E) */
        nodev,                  /* dump(9E) */
        nodev,                  /* read(9E) */
        nodev,                  /* write(9E) */
        nodev,                  /* ioctl(9E) */
        nodev,                  /* devmap(9E) */
        nodev,                  /* mmap(9E) */
        nodev,                  /* segmap(9E) */
        nochpoll,               /* chpoll(9E) */
        ddi_prop_op,            /* prop_op(9E) */
        NULL,                   /* streamtab(9S) */
        D_MP | D_64BIT | D_NEW  /* cb_flag */
};
2137 
/*
 * Device operations vector for the xdt driver, referenced from modldrv.
 * It wires up xdt_info, xdt_attach and xdt_detach and points at xdt_cb_ops;
 * identify, probe and reset are nulldev, and there are no bus or power
 * operations.
 */
static struct dev_ops xdt_ops = {
        DEVO_REV,               /* devo_rev */
        0,                      /* devo_refcnt */
        xdt_info,               /* getinfo(9E) */
        nulldev,                /* identify(9E) */
        nulldev,                /* probe(9E) */
        xdt_attach,             /* attach(9E) */
        xdt_detach,             /* detach(9E) */
        nulldev,                /* devo_reset */
        &xdt_cb_ops,                /* devo_cb_ops */
        NULL,                   /* devo_bus_ops */
        NULL,                   /* power(9E) */
        ddi_quiesce_not_needed, /* devo_quiesce */
};
2152 
2153 
/*
 * Driver linkage record: identifies this module as a device driver
 * (mod_driverops), gives its description string and names xdt_ops as its
 * device operations.
 */
static struct modldrv modldrv = {
        &mod_driverops,
        "Hypervisor event tracing",
        &xdt_ops
};
2159 
/*
 * Module linkage passed to mod_install(), mod_remove() and mod_info() by
 * _init(), _fini() and _info(); its only linkage entry is modldrv.
 */
static struct modlinkage modlinkage = {
        MODREV_1,
        &modldrv,
        NULL
};
2165 
2166 int
2167 _init(void)
2168 {
2169         return (mod_install(&modlinkage));
2170 }
2171 
2172 int
2173 _fini(void)
2174 {
2175         return (mod_remove(&modlinkage));
2176 }
2177 
2178 int
2179 _info(struct modinfo *modinfop)
2180 {
2181         return (mod_info(&modlinkage, modinfop));
2182 }