/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Xen event provider for DTrace
 *
 * NOTE: This provider is PRIVATE. It is intended as a short-term solution
 * and may disappear or be re-implemented at any time.
 *
 * This provider isn't suitable as a general-purpose solution for a number of
 * reasons. First and foremost, we rely on the Xen tracing mechanism and don't
 * have any way to gather data other than that collected by the Xen trace
 * buffers. Further, it does not fit into the DTrace model (see "Interacting
 * with DTrace" below.)
 *
 *
 * Tracing in Xen
 * --------------
 *
 * Xen implements a tracing facility for generating and collecting execution
 * event traces from the hypervisor. When tracing is enabled, compiled-in
 * probes record events in contiguous per-CPU trace buffers.
 *
 *                   +---------+
 *      +------+     |         |
 *      | CPUn |---->| BUFFERn |
 *      +------+     |         |
 *                   +---------+- tbuf.va + (tbuf.size * n)
 *                   :         :
 *                   +---------+
 *      +------+     |         |
 *      | CPU1 |---->| BUFFER1 |
 *      +------+     |         |
 *                   +---------+- tbuf.va + tbuf.size
 *      +------+     |         |
 *      | CPU0 |---->| BUFFER0 |
 *      +------+     |         |
 *                   +---------+- tbuf.va
 *
 * Each CPU buffer consists of a metadata header followed by the trace records.
 * The metadata consists of a producer/consumer pair of pointers into the
 * buffer that point to the next record to be written and the next record to
 * be read respectively.
 *
 * A trace record can be in one of two forms, depending on whether the TSC is
 * included. The record header indicates whether or not the TSC field is
 * present.
 *
 * 1. Trace record without TSC:
 *	+------------------------------------------------------------+
 *	| HEADER(uint32_t) | DATA FIELDS                              |
 *	+------------------------------------------------------------+
 *
 * 2. Trace record with TSC:
 *	+-------------------------------------------------------------------+
 *	| HEADER(uint32_t) | TSC(uint64_t) | DATA FIELDS                    |
 *	+-------------------------------------------------------------------+
 *
 * Where,
 *
 * HEADER bit field:
 *	+-------------------------------------------------------------------+
 *	| C |  NDATA  |                      EVENT                          |
 *	+-------------------------------------------------------------------+
 *	 31  30     28 27                                                  0
 *
 * EVENT: Event ID.
 * NDATA: Number of populated data fields.
 * C: TSC included.
 *
 * DATA FIELDS:
 *	+-------------------------------------------------------------------+
 *	| D1(uint32_t) | D2(uint32_t) | D3(uint32_t) | . . . | D7(uint32_t) |
 *	+-------------------------------------------------------------------+
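 *
 * As an illustrative sketch (the driver itself relies on the bit fields of
 * struct t_rec from <xen/public/trace.h> rather than decoding the raw word
 * by hand), the HEADER above unpacks as:
 *
 *	uint32_t header;			(first word of a record)
 *	uint_t event = header & 0x0fffffff;	(EVENT, bits 0..27)
 *	uint_t ndata = (header >> 28) & 0x7;	(NDATA, bits 28..30)
 *	int c_flag = (header >> 31) & 0x1;	(C, bit 31)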
 *
 *
 * Interacting with DTrace
 * -----------------------
 *
 * Every xdt_poll_nsec nanoseconds we poll the trace buffers for data and
 * feed each entry into dtrace_probe() with the corresponding probe ID for
 * the event. As a result of this periodic collection scheme, probe firings
 * are asynchronous. This is the only sensible way to implement this form of
 * provider, but because of its asynchronous nature, asking things like
 * "current CPU" and, more importantly, arbitrary questions about the context
 * surrounding the probe firing are not meaningful. So, consumers should not
 * attempt to infer anything beyond what is supplied via the probe arguments.
 */
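/*
 * For example (illustrative only), a consumer can safely aggregate on
 * probe arguments:
 *
 *	xdt:sched::block { @blocked[arg0] = count(); }
 *
 * whereas context-dependent D constructs (stack(), curthread, and the
 * like) would describe the dom0 thread scanning the trace buffers, not
 * the event that generated the record.
 */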

#include <sys/xpv_user.h>

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/modctl.h>
#include <sys/sunddi.h>
#include <sys/ddi.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/stat.h>
#include <sys/cmn_err.h>
#include <sys/dtrace.h>
#include <sys/sdt.h>
#include <sys/cyclic.h>
#include <vm/seg_kmem.h>
#include <vm/hat_i86.h>

#include <sys/hypervisor.h>
#include <xen/public/trace.h>
#include <xen/public/sched.h>

#define	XDT_POLL_DEFAULT	100000000	/* default poll interval (ns) */
#define	XDT_POLL_MIN		10000000	/* min poll interval (ns) */
#define	XDT_TBUF_RETRY		50		/* tbuf disable retry count */

/*
 * The domid must match IDLE_DOMAIN_ID in xen.hg/xen/include/xen/sched.h
 * in the xVM gate.
 */
#define	IS_IDLE_DOM(domid)	(domid == 0x7FFFU)

/* Macros to extract the domid and cpuid from an HVM trace data field */
#define	HVM_DOMID(d)	(d >> 16)
#define	HVM_VCPUID(d)	(d & 0xFFFF)

/* Flags for shadow page table events */
#define	SH_GUEST_32	0x000
#define	SH_GUEST_PAE	0x100
#define	SH_GUEST_64	0x200

#define	XDT_PROBE5(event, arg0, arg1, arg2, arg3, arg4) {		\
	dtrace_id_t id = xdt_probemap[event];				\
	if (id)								\
		dtrace_probe(id, arg0, arg1, arg2, arg3, arg4);		\
}									\

#define	XDT_PROBE4(event, arg0, arg1, arg2, arg3) \
	XDT_PROBE5(event, arg0, arg1, arg2, arg3, 0)

#define	XDT_PROBE3(event, arg0, arg1, arg2) \
	XDT_PROBE5(event, arg0, arg1, arg2, 0, 0)

#define	XDT_PROBE2(event, arg0, arg1) \
	XDT_PROBE5(event, arg0, arg1, 0, 0, 0)

#define	XDT_PROBE1(event, arg0) \
	XDT_PROBE5(event, arg0, 0, 0, 0, 0)

#define	XDT_PROBE0(event) \
	XDT_PROBE5(event, 0, 0, 0, 0, 0)

/* Probe classes */
#define	XDT_SCHED	0
#define	XDT_MEM		1
#define	XDT_HVM		2
#define	XDT_GEN		3
#define	XDT_PV		4
#define	XDT_SHADOW	5
#define	XDT_PM		6
#define	XDT_NCLASSES	7

/* Probe events */
#define	XDT_EVT_INVALID			(-(int)1)
#define	XDT_SCHED_OFF_CPU		0
#define	XDT_SCHED_ON_CPU		1
#define	XDT_SCHED_IDLE_OFF_CPU		2
#define	XDT_SCHED_IDLE_ON_CPU		3
#define	XDT_SCHED_BLOCK			4
#define	XDT_SCHED_SLEEP			5
#define	XDT_SCHED_WAKE			6
#define	XDT_SCHED_YIELD			7
#define	XDT_SCHED_SHUTDOWN_POWEROFF	8
#define	XDT_SCHED_SHUTDOWN_REBOOT	9
#define	XDT_SCHED_SHUTDOWN_SUSPEND	10
#define	XDT_SCHED_SHUTDOWN_CRASH	11
#define	XDT_MEM_PAGE_GRANT_MAP		12
#define	XDT_MEM_PAGE_GRANT_UNMAP	13
#define	XDT_MEM_PAGE_GRANT_TRANSFER	14
#define	XDT_HVM_VMENTRY			15
#define	XDT_HVM_VMEXIT			16
#define	XDT_TRC_LOST_RECORDS		17
#define	XDT_SCHED_ADD_VCPU		18
#define	XDT_SCHED_REM_VCPU		19	/* unused */
#define	XDT_SCHED_CTL			20	/* unused */
#define	XDT_SCHED_ADJDOM		21
#define	XDT_SCHED_S_TIMER_FN		22	/* unused */
#define	XDT_SCHED_T_TIMER_FN		23	/* unused */
#define	XDT_SCHED_DOM_TIMER_FN		24	/* unused */
#define	XDT_PV_HYPERCALL		25
#define	XDT_PV_TRAP			26
#define	XDT_PV_PAGE_FAULT		27
#define	XDT_PV_FORCED_INVALID_OP	28
#define	XDT_PV_EMULATE_PRIVOP		29
#define	XDT_PV_EMULATE_4GB		30	/* unused (32-bit HV only) */
#define	XDT_PV_MATH_STATE_RESTORE	31
#define	XDT_PV_PAGING_FIXUP		32
#define	XDT_PV_DT_MAPPING_FAULT		33
#define	XDT_PV_PTWR_EMULATION		34
#define	XDT_HVM_PF_XEN			35
#define	XDT_HVM_PF_INJECT		36
#define	XDT_HVM_EXC_INJECT		37
#define	XDT_HVM_VIRQ_INJECT		38
#define	XDT_HVM_VIRQ_REINJECT		39
#define	XDT_HVM_IO_READ			40	/* unused */
#define	XDT_HVM_IO_WRITE		41	/* unused */
#define	XDT_HVM_CR_READ			42
#define	XDT_HVM_CR_WRITE		43
#define	XDT_HVM_DR_READ			44	/* unused */
#define	XDT_HVM_DR_WRITE		45	/* unused */
#define	XDT_HVM_MSR_READ		46
#define	XDT_HVM_MSR_WRITE		47
#define	XDT_HVM_CPUID			48
#define	XDT_HVM_INTR			49
#define	XDT_HVM_INTR_WINDOW		50
#define	XDT_HVM_NMI			51
#define	XDT_HVM_SMI			52
#define	XDT_HVM_VMMCALL			53
#define	XDT_HVM_HLT			54
#define	XDT_HVM_INVLPG			55
#define	XDT_HVM_MCE			56
#define	XDT_HVM_IOPORT_READ		57
#define	XDT_HVM_IOPORT_WRITE		58
#define	XDT_HVM_CLTS			59
#define	XDT_HVM_LMSW			60
#define	XDT_HVM_IOMEM_READ		61
#define	XDT_HVM_IOMEM_WRITE		62
#define	XDT_SHADOW_NOT_SHADOW			63
#define	XDT_SHADOW_FAST_PROPAGATE		64
#define	XDT_SHADOW_FAST_MMIO			65
#define	XDT_SHADOW_FALSE_FAST_PATH		66
#define	XDT_SHADOW_MMIO				67
#define	XDT_SHADOW_FIXUP			68
#define	XDT_SHADOW_DOMF_DYING			69
#define	XDT_SHADOW_EMULATE			70
#define	XDT_SHADOW_EMULATE_UNSHADOW_USER	71
#define	XDT_SHADOW_EMULATE_UNSHADOW_EVTINJ	72
#define	XDT_SHADOW_EMULATE_UNSHADOW_UNHANDLED	73
#define	XDT_SHADOW_WRMAP_BF			74
#define	XDT_SHADOW_PREALLOC_UNPIN		75
#define	XDT_SHADOW_RESYNC_FULL			76
#define	XDT_SHADOW_RESYNC_ONLY			77
#define	XDT_PM_FREQ_CHANGE		78
#define	XDT_PM_IDLE_ENTRY		79
#define	XDT_PM_IDLE_EXIT		80
#define	XDT_SCHED_RUNSTATE_CHANGE	81
#define	XDT_SCHED_CONTINUE_RUNNING	82
#define	XDT_NEVENTS			83

typedef struct {
	const char	*pr_mod;	/* probe module */
	const char	*pr_name;	/* probe name */
	int		evt_id;		/* event id */
	uint_t		class;		/* probe class */
} xdt_probe_t;

typedef struct {
	uint32_t	trc_mask;	/* trace mask */
	uint32_t	cnt;		/* num enabled probes in class */
} xdt_classinfo_t;

typedef struct {
	ulong_t prev_domid;		/* previous dom executed */
	ulong_t prev_vcpuid;		/* previous vcpu executed */
	ulong_t prev_ctime;		/* time spent on cpu */
	ulong_t next_domid;		/* next dom to be scheduled */
	ulong_t next_vcpuid;		/* next vcpu to be scheduled */
	ulong_t next_wtime;		/* time spent waiting to get on cpu */
	ulong_t next_ts;		/* allocated time slice */
	ulong_t cur_domid;		/* current dom */
	ulong_t cur_vcpuid;		/* current vcpuid */
	int curinfo_valid;		/* info is valid */
} xdt_schedinfo_t;

static struct {
	uint_t cnt;			/* total num of trace buffers */
	size_t size;			/* size of each cpu buffer */
	mfn_t start_mfn;		/* starting mfn of buffers */
	caddr_t va;			/* va buffers are mapped into */

	/* per-cpu buffers */
	struct t_buf **meta;		/* buffer metadata */
	struct t_rec **data;		/* buffer data records */

	/* statistics */
	uint64_t stat_dropped_recs;	/* records dropped */
	uint64_t stat_spurious_cpu;	/* recs with garbage cpuids */
	uint64_t stat_spurious_switch;	/* inconsistent vcpu switches */
	uint64_t stat_unknown_shutdown;	/* unknown shutdown code */
	uint64_t stat_unknown_recs;	/* unknown records */
} tbuf;

static size_t tbuf_data_size;

static char *xdt_stats[] = {
	"dropped_recs",
};

/*
 * Tunable variables
 *
 * The following may be tuned by adding a line to /etc/system that
 * includes both the name of the module ("xdt") and the name of the variable.
 * For example:
 *	set xdt:xdt_tbuf_pages = 40
 */
uint_t xdt_tbuf_pages = 20;		/* pages to alloc per-cpu buf */

/*
 * The following may be tuned by adding a line to
 * /platform/i86xpv/kernel/drv/xdt.conf.
 * For example:
 *	xdt_poll_nsec = 200000000;
 */
static hrtime_t xdt_poll_nsec;		/* trace buffer poll interval */

/*
 * Another tunable variable: the maximum number of records to process
 * in one scan. If it is 0 (e.g. not set in /etc/system), it will
 * be set to ncpu * (bufsize / max_rec_size).
 *
 * Having an upper limit avoids a situation where the scan would loop
 * endlessly if the hypervisor adds records more quickly than we
 * can process them. It's better to drop records than to loop, obviously.
 */
uint_t xdt_max_recs = 0;
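/*
 * A rough worked example of that default (hypothetical sizes): with 4
 * physical CPUs, a 20-page (80K) data area per CPU, and a maximum record
 * size of 40 bytes, the limit works out to about (4 * 80K) / 40, i.e. on
 * the order of 8000 records per scan.
 */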

/*
 * Internal variables
 */
static dev_info_t *xdt_devi;
static dtrace_provider_id_t xdt_id;
static uint_t xdt_ncpus;		/* total number of phys CPUs */
static uint32_t cur_trace_mask;		/* current trace mask */
static xdt_schedinfo_t *xdt_cpu_schedinfo; /* per-cpu sched info */
dtrace_id_t xdt_probemap[XDT_NEVENTS];	/* map of enabled probes */
dtrace_id_t xdt_prid[XDT_NEVENTS];	/* IDs of registered events */
static cyclic_id_t xdt_cyclic = CYCLIC_NONE;
static kstat_t *xdt_kstats;
static xdt_classinfo_t xdt_classinfo[XDT_NCLASSES];

/*
 * These provide context when probes fire. They can be accessed from an
 * xdt dtrace probe (as `xdt_curdom, etc). It's ok for these to be global,
 * and not per-cpu, as probes are fired strictly in sequence as the trace
 * buffers are processed.
 */
uint_t xdt_curdom, xdt_curvcpu, xdt_curpcpu;
uint64_t xdt_timestamp;
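/*
 * For example (illustrative only), a D consumer could report the context
 * of each event via these globals:
 *
 *	xdt:sched::on-cpu
 *	{
 *		printf("dom %u vcpu %u pcpu %u", `xdt_curdom,
 *		    `xdt_curvcpu, `xdt_curpcpu);
 *	}
 */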

static xdt_probe_t xdt_probe[] = {
	/* Sched probes */
	{ "sched", "off-cpu", XDT_SCHED_OFF_CPU, XDT_SCHED },
	{ "sched", "on-cpu", XDT_SCHED_ON_CPU, XDT_SCHED },
	{ "sched", "idle-off-cpu", XDT_SCHED_IDLE_OFF_CPU, XDT_SCHED },
	{ "sched", "idle-on-cpu", XDT_SCHED_IDLE_ON_CPU, XDT_SCHED },
	{ "sched", "block", XDT_SCHED_BLOCK, XDT_SCHED },
	{ "sched", "sleep", XDT_SCHED_SLEEP, XDT_SCHED },
	{ "sched", "wake", XDT_SCHED_WAKE, XDT_SCHED },
	{ "sched", "yield", XDT_SCHED_YIELD, XDT_SCHED },
	{ "sched", "shutdown-poweroff", XDT_SCHED_SHUTDOWN_POWEROFF,
	    XDT_SCHED },
	{ "sched", "shutdown-reboot", XDT_SCHED_SHUTDOWN_REBOOT, XDT_SCHED },
	{ "sched", "shutdown-suspend", XDT_SCHED_SHUTDOWN_SUSPEND, XDT_SCHED },
	{ "sched", "shutdown-crash", XDT_SCHED_SHUTDOWN_CRASH, XDT_SCHED },
	{ "sched", "add", XDT_SCHED_ADD_VCPU, XDT_SCHED },
	{ "sched", "runstate-change", XDT_SCHED_RUNSTATE_CHANGE, XDT_SCHED },
	{ "sched", "continue-running", XDT_SCHED_CONTINUE_RUNNING, XDT_SCHED },

	/* Memory probes */
	{ "mem", "page-grant-map", XDT_MEM_PAGE_GRANT_MAP, XDT_MEM },
	{ "mem", "page-grant-unmap", XDT_MEM_PAGE_GRANT_UNMAP, XDT_MEM },
	{ "mem", "page-grant-transfer", XDT_MEM_PAGE_GRANT_TRANSFER, XDT_MEM },

	{ "pv", "hypercall", XDT_PV_HYPERCALL, XDT_PV },
	{ "pv", "trap", XDT_PV_TRAP, XDT_PV },
	{ "pv", "page-fault", XDT_PV_PAGE_FAULT, XDT_PV },
	{ "pv", "forced-invalid-op", XDT_PV_FORCED_INVALID_OP, XDT_PV },
	{ "pv", "emulate-priv-op", XDT_PV_EMULATE_PRIVOP, XDT_PV },
	{ "pv", "math-state-restore", XDT_PV_MATH_STATE_RESTORE, XDT_PV },
	{ "pv", "paging-fixup", XDT_PV_PAGING_FIXUP, XDT_PV },
	{ "pv", "dt-mapping-fault", XDT_PV_DT_MAPPING_FAULT, XDT_PV },
	{ "pv", "pte-write-emul", XDT_PV_PTWR_EMULATION, XDT_PV },

	/* HVM probes */
	{ "hvm", "vmentry", XDT_HVM_VMENTRY, XDT_HVM },
	{ "hvm", "vmexit", XDT_HVM_VMEXIT, XDT_HVM },
	{ "hvm", "pagefault-xen", XDT_HVM_PF_XEN, XDT_HVM },
	{ "hvm", "pagefault-inject", XDT_HVM_PF_INJECT, XDT_HVM },
	{ "hvm", "exception-inject", XDT_HVM_EXC_INJECT, XDT_HVM },
	{ "hvm", "virq-inject", XDT_HVM_VIRQ_INJECT, XDT_HVM },
	{ "hvm", "cr-read", XDT_HVM_CR_READ, XDT_HVM },
	{ "hvm", "cr-write", XDT_HVM_CR_WRITE, XDT_HVM },
	{ "hvm", "msr-read", XDT_HVM_MSR_READ, XDT_HVM },
	{ "hvm", "msr-write", XDT_HVM_MSR_WRITE, XDT_HVM },
	{ "hvm", "cpuid", XDT_HVM_CPUID, XDT_HVM },
	{ "hvm", "intr", XDT_HVM_INTR, XDT_HVM },
	{ "hvm", "intr-window", XDT_HVM_INTR_WINDOW, XDT_HVM },
	{ "hvm", "nmi", XDT_HVM_NMI, XDT_HVM },
	{ "hvm", "smi", XDT_HVM_SMI, XDT_HVM },
	{ "hvm", "vmmcall", XDT_HVM_VMMCALL, XDT_HVM },
	{ "hvm", "hlt", XDT_HVM_HLT, XDT_HVM },
	{ "hvm", "invlpg", XDT_HVM_INVLPG, XDT_HVM },
	{ "hvm", "mce", XDT_HVM_MCE, XDT_HVM },
	{ "hvm", "pio-read", XDT_HVM_IOPORT_READ, XDT_HVM },
	{ "hvm", "pio-write", XDT_HVM_IOPORT_WRITE, XDT_HVM },
	{ "hvm", "mmio-read", XDT_HVM_IOMEM_READ, XDT_HVM },
	{ "hvm", "mmio-write", XDT_HVM_IOMEM_WRITE, XDT_HVM },
	{ "hvm", "clts", XDT_HVM_CLTS, XDT_HVM },
	{ "hvm", "lmsw", XDT_HVM_LMSW, XDT_HVM },

	{ "shadow", "fault-not-shadow", XDT_SHADOW_NOT_SHADOW, XDT_SHADOW },
	{ "shadow", "fast-propagate", XDT_SHADOW_FAST_PROPAGATE, XDT_SHADOW },
	{ "shadow", "fast-mmio", XDT_SHADOW_FAST_MMIO, XDT_SHADOW },
	{ "shadow", "false-fast-path", XDT_SHADOW_FALSE_FAST_PATH,
	    XDT_SHADOW },
	{ "shadow", "mmio", XDT_SHADOW_MMIO, XDT_SHADOW },
	{ "shadow", "fixup", XDT_SHADOW_FIXUP, XDT_SHADOW },
	{ "shadow", "domf-dying", XDT_SHADOW_DOMF_DYING, XDT_SHADOW },
	{ "shadow", "emulate", XDT_SHADOW_EMULATE, XDT_SHADOW },
	{ "shadow", "emulate-unshadow-user", XDT_SHADOW_EMULATE_UNSHADOW_USER,
	    XDT_SHADOW },
	{ "shadow", "emulate-unshadow-evtinj",
	    XDT_SHADOW_EMULATE_UNSHADOW_EVTINJ, XDT_SHADOW },
	{ "shadow", "emulate-unshadow-unhandled",
	    XDT_SHADOW_EMULATE_UNSHADOW_UNHANDLED, XDT_SHADOW },
	{ "shadow", "wrmap-bf", XDT_SHADOW_WRMAP_BF, XDT_SHADOW },
	{ "shadow", "prealloc-unpin", XDT_SHADOW_PREALLOC_UNPIN, XDT_SHADOW },
	{ "shadow", "resync-full", XDT_SHADOW_RESYNC_FULL, XDT_SHADOW },
	{ "shadow", "resync-only", XDT_SHADOW_RESYNC_ONLY, XDT_SHADOW },

	{ "pm", "freq-change", XDT_PM_FREQ_CHANGE, XDT_PM },
	{ "pm", "idle-entry", XDT_PM_IDLE_ENTRY, XDT_PM },
	{ "pm", "idle-exit", XDT_PM_IDLE_EXIT, XDT_PM },

	/* Trace buffer related probes */
	{ "trace", "records-lost", XDT_TRC_LOST_RECORDS, XDT_GEN },

	{ NULL }
};
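/*
 * The full set of probes registered from this table can be examined on a
 * running system with, for example:
 *
 *	# dtrace -l -P xdt
 */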

static inline uint32_t
xdt_nr_active_probes()
{
	int i;
	uint32_t tot = 0;

	for (i = 0; i < XDT_NCLASSES; i++)
		tot += xdt_classinfo[i].cnt;

	return (tot);
}

static void
xdt_init_trace_masks(void)
{
	xdt_classinfo[XDT_SCHED].trc_mask = TRC_SCHED;
	xdt_classinfo[XDT_MEM].trc_mask = TRC_MEM;
	xdt_classinfo[XDT_HVM].trc_mask = TRC_HVM;
	xdt_classinfo[XDT_GEN].trc_mask = TRC_GEN;
	xdt_classinfo[XDT_PV].trc_mask = TRC_PV;
	xdt_classinfo[XDT_SHADOW].trc_mask = TRC_SHADOW;
	xdt_classinfo[XDT_PM].trc_mask = TRC_PM;
}

static int
xdt_kstat_update(kstat_t *ksp, int flag)
{
	kstat_named_t *knp;

	if (flag != KSTAT_READ)
		return (EACCES);

	knp = ksp->ks_data;

	/*
	 * Assignment order should match that of the names in
	 * xdt_stats.
	 */
	(knp++)->value.ui64 = tbuf.stat_dropped_recs;

	return (0);
}

static void
xdt_kstat_init(void)
{
	int nstats = sizeof (xdt_stats) / sizeof (xdt_stats[0]);
	char **cp = xdt_stats;
	kstat_named_t *knp;

	if ((xdt_kstats = kstat_create("xdt", 0, "trace_statistics", "misc",
	    KSTAT_TYPE_NAMED, nstats, 0)) == NULL)
		return;

	xdt_kstats->ks_update = xdt_kstat_update;

	knp = xdt_kstats->ks_data;
	while (nstats > 0) {
		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
		knp++;
		cp++;
		nstats--;
	}

	kstat_install(xdt_kstats);
}

static int
xdt_sysctl_tbuf(xen_sysctl_tbuf_op_t *tbuf_op)
{
	xen_sysctl_t op;
	int xerr;

	op.cmd = XEN_SYSCTL_tbuf_op;
	op.interface_version = XEN_SYSCTL_INTERFACE_VERSION;
	op.u.tbuf_op = *tbuf_op;

	if ((xerr = HYPERVISOR_sysctl(&op)) != 0)
		return (xen_xlate_errcode(xerr));

	*tbuf_op = op.u.tbuf_op;
	return (0);
}

static int
xdt_map_trace_buffers(mfn_t mfn, caddr_t va, size_t len)
{
	x86pte_t pte;
	caddr_t const sva = va;
	caddr_t const eva = va + len;
	int xerr;

	ASSERT(mfn != MFN_INVALID);
	ASSERT(va != NULL);
	ASSERT(IS_PAGEALIGNED(len));

	for (; va < eva; va += MMU_PAGESIZE) {
		/*
		 * Ask the HAT to load a throwaway mapping to page zero, then
		 * overwrite it with the hypervisor mapping. It gets removed
		 * later via hat_unload().
		 */
		hat_devload(kas.a_hat, va, MMU_PAGESIZE, (pfn_t)0,
		    PROT_READ | HAT_UNORDERED_OK,
		    HAT_LOAD_NOCONSIST | HAT_LOAD);

		pte = mmu_ptob((x86pte_t)mfn) | PT_VALID | PT_USER
		    | PT_FOREIGN | PT_WRITABLE;

		xerr = HYPERVISOR_update_va_mapping_otherdomain((ulong_t)va,
		    pte, UVMF_INVLPG | UVMF_LOCAL, DOMID_XEN);

		if (xerr != 0) {
			/* unmap pages loaded so far */
			size_t ulen = (uintptr_t)(va + MMU_PAGESIZE) -
			    (uintptr_t)sva;
			hat_unload(kas.a_hat, sva, ulen, HAT_UNLOAD_UNMAP);
			return (xen_xlate_errcode(xerr));
		}

		mfn++;
	}

	return (0);
}

static int
xdt_attach_trace_buffers(void)
{
	xen_sysctl_tbuf_op_t tbuf_op;
	size_t len;
	int err;
	uint_t i;

	/*
	 * Xen does not support trace buffer re-sizing. If the buffers
	 * have already been allocated we just use them as is.
	 */
	tbuf_op.cmd = XEN_SYSCTL_TBUFOP_get_info;
	if ((err = xdt_sysctl_tbuf(&tbuf_op)) != 0)
		return (err);

	if (tbuf_op.size == 0) {
		/* set trace buffer size */
		tbuf_op.cmd = XEN_SYSCTL_TBUFOP_set_size;
		tbuf_op.size = xdt_tbuf_pages;
		(void) xdt_sysctl_tbuf(&tbuf_op);

		/* get trace buffer info */
		tbuf_op.cmd = XEN_SYSCTL_TBUFOP_get_info;
		if ((err = xdt_sysctl_tbuf(&tbuf_op)) != 0)
			return (err);

		if (tbuf_op.size == 0) {
			cmn_err(CE_NOTE, "Couldn't allocate trace buffers.");
			return (ENOBUFS);
		}
	}

	tbuf.size = tbuf_op.size;
	tbuf.start_mfn = (mfn_t)tbuf_op.buffer_mfn;
	tbuf.cnt = xdt_ncpus;

	ASSERT(tbuf.start_mfn != MFN_INVALID);
	ASSERT(tbuf.cnt > 0);

	len = tbuf.size * tbuf.cnt;
	tbuf.va = vmem_alloc(heap_arena, len, VM_SLEEP);

	if ((err = xdt_map_trace_buffers(tbuf.start_mfn, tbuf.va, len)) != 0) {
		vmem_free(heap_arena, tbuf.va, len);
		tbuf.va = NULL;
		return (err);
	}

	tbuf.meta = (struct t_buf **)kmem_alloc(tbuf.cnt * sizeof (*tbuf.meta),
	    KM_SLEEP);
	tbuf.data = (struct t_rec **)kmem_alloc(tbuf.cnt * sizeof (*tbuf.data),
	    KM_SLEEP);

	for (i = 0; i < tbuf.cnt; i++) {
		void *cpu_buf = (void *)(tbuf.va + (tbuf.size * i));
		tbuf.meta[i] = cpu_buf;
		tbuf.data[i] = (struct t_rec *)((uintptr_t)cpu_buf +
		    sizeof (struct t_buf));

		/* throw away stale trace records */
		tbuf.meta[i]->cons = tbuf.meta[i]->prod;
	}

	tbuf_data_size = tbuf.size - sizeof (struct t_buf);
	if (xdt_max_recs == 0)
		xdt_max_recs = (xdt_ncpus * tbuf_data_size)
		    / sizeof (struct t_rec);

	return (0);
}

static void
xdt_detach_trace_buffers(void)
{
	size_t len = tbuf.size * tbuf.cnt;

	ASSERT(tbuf.va != NULL);

	hat_unload(kas.a_hat, tbuf.va, len,
	    HAT_UNLOAD_UNMAP | HAT_UNLOAD_UNLOCK);
	vmem_free(heap_arena, tbuf.va, len);
	kmem_free(tbuf.meta, tbuf.cnt * sizeof (*tbuf.meta));
	kmem_free(tbuf.data, tbuf.cnt * sizeof (*tbuf.data));
}

static void
xdt_update_sched_context(uint_t cpuid, uint_t dom, uint_t vcpu)
{
	xdt_schedinfo_t *sp = &xdt_cpu_schedinfo[cpuid];

	sp->cur_domid = dom;
	sp->cur_vcpuid = vcpu;
	sp->curinfo_valid = 1;
}

static void
xdt_update_domain_context(uint_t dom, uint_t vcpu)
{
	xdt_curdom = dom;
	xdt_curvcpu = vcpu;
}

static size_t
xdt_process_rec(uint_t cpuid, struct t_rec *rec)
{
	xdt_schedinfo_t *sp = &xdt_cpu_schedinfo[cpuid];
	uint_t dom, vcpu;
	int eid;
	uint32_t *data;
	uint64_t tsc, addr64, rip64, val64, pte64;
	size_t rec_size;

	ASSERT(rec != NULL);
	ASSERT(xdt_ncpus == xpv_nr_phys_cpus());

	if (cpuid >= xdt_ncpus) {
		tbuf.stat_spurious_cpu++;
		goto done;
	}

	/*
	 * If our current state isn't valid, and if this is not
	 * an event that will update our state, skip it.
	 */
	if (!sp->curinfo_valid &&
	    rec->event != TRC_SCHED_SWITCH &&
	    rec->event != TRC_LOST_RECORDS)
		goto done;

	if (rec->cycles_included) {
		data = rec->u.cycles.extra_u32;
		tsc = (((uint64_t)rec->u.cycles.cycles_hi) << 32)
		    | rec->u.cycles.cycles_lo;
	} else {
		data = rec->u.nocycles.extra_u32;
		tsc = 0;
	}

	xdt_timestamp = tsc;

	switch (rec->event) {
	/*
	 * Sched probes
	 */
	case TRC_SCHED_SWITCH_INFPREV:
		/*
		 * Info on vCPU being de-scheduled
		 *
		 * data[0] = prev domid
		 * data[1] = time spent on pcpu
		 */
		sp->prev_domid = data[0];
		sp->prev_ctime = data[1];
		break;

	case TRC_SCHED_SWITCH_INFNEXT:
		/*
		 * Info on next vCPU to be scheduled
		 *
		 * data[0] = next domid
		 * data[1] = time spent waiting to get on cpu
		 * data[2] = time slice
		 */
		sp->next_domid = data[0];
		sp->next_wtime = data[1];
		sp->next_ts = data[2];
		break;

	case TRC_SCHED_SWITCH:
		/*
		 * vCPU switch
		 *
		 * data[0] = prev domid
		 * data[1] = prev vcpuid
		 * data[2] = next domid
		 * data[3] = next vcpuid
		 */

		/*
		 * Provide valid context for this probe if there
		 * wasn't one.
		 */
		if (!sp->curinfo_valid)
			xdt_update_domain_context(data[0], data[1]);

		xdt_update_sched_context(cpuid, data[0], data[1]);

		if (data[0] != sp->prev_domid &&
		    data[2] != sp->next_domid) {
			/* prev and next info don't match doms being sched'd */
			tbuf.stat_spurious_switch++;
			goto switchdone;
		}

		sp->prev_vcpuid = data[1];
		sp->next_vcpuid = data[3];

		XDT_PROBE3(IS_IDLE_DOM(sp->prev_domid) ?
		    XDT_SCHED_IDLE_OFF_CPU : XDT_SCHED_OFF_CPU,
		    sp->prev_domid, sp->prev_vcpuid, sp->prev_ctime);

		XDT_PROBE4(IS_IDLE_DOM(sp->next_domid) ?
		    XDT_SCHED_IDLE_ON_CPU : XDT_SCHED_ON_CPU,
		    sp->next_domid, sp->next_vcpuid, sp->next_wtime,
		    sp->next_ts);
switchdone:
		xdt_update_sched_context(cpuid, data[2], data[3]);
		xdt_update_domain_context(data[2], data[3]);

		break;
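	/*
	 * Note: Xen normally emits TRC_SCHED_SWITCH_INFPREV and
	 * TRC_SCHED_SWITCH_INFNEXT just ahead of TRC_SCHED_SWITCH, which
	 * is why the two cases above only cache their data; the switch
	 * case stitches the three records together and counts any
	 * mismatch as stat_spurious_switch.
	 */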

	case TRC_SCHED_BLOCK:
		/*
		 * vCPU blocked
		 *
		 * data[0] = domid
		 * data[1] = vcpuid
		 */
		XDT_PROBE2(XDT_SCHED_BLOCK, data[0], data[1]);
		break;

	case TRC_SCHED_SLEEP:
		/*
		 * Put vCPU to sleep
		 *
		 * data[0] = domid
		 * data[1] = vcpuid
		 */
		XDT_PROBE2(XDT_SCHED_SLEEP, data[0], data[1]);
		break;

	case TRC_SCHED_WAKE:
		/*
		 * Wake up vCPU
		 *
		 * data[0] = domid
		 * data[1] = vcpuid
		 */
		XDT_PROBE2(XDT_SCHED_WAKE, data[0], data[1]);
		break;

	case TRC_SCHED_YIELD:
		/*
		 * vCPU yielded
		 *
		 * data[0] = domid
		 * data[1] = vcpuid
		 */
		XDT_PROBE2(XDT_SCHED_YIELD, data[0], data[1]);
		break;

	case TRC_SCHED_SHUTDOWN:
		/*
		 * Guest shutting down
		 *
		 * data[0] = domid
		 * data[1] = initiating vcpu
		 * data[2] = shutdown code
		 */
		switch (data[2]) {
		case SHUTDOWN_poweroff:
			eid = XDT_SCHED_SHUTDOWN_POWEROFF;
			break;
		case SHUTDOWN_reboot:
			eid = XDT_SCHED_SHUTDOWN_REBOOT;
			break;
		case SHUTDOWN_suspend:
			eid = XDT_SCHED_SHUTDOWN_SUSPEND;
			break;
		case SHUTDOWN_crash:
			eid = XDT_SCHED_SHUTDOWN_CRASH;
			break;
		default:
			tbuf.stat_unknown_shutdown++;
			goto done;
		}

		XDT_PROBE2(eid, data[0], data[1]);
		break;

	case TRC_SCHED_DOM_REM:
	case TRC_SCHED_CTL:
	case TRC_SCHED_S_TIMER_FN:
	case TRC_SCHED_T_TIMER_FN:
	case TRC_SCHED_DOM_TIMER_FN:
		/* unused */
		break;

	case TRC_SCHED_DOM_ADD:
		/*
		 * Add vcpu to a guest.
		 *
		 * data[0] = domid
		 * data[1] = vcpu
		 */
		XDT_PROBE2(XDT_SCHED_ADD_VCPU, data[0], data[1]);
		break;

	case TRC_SCHED_ADJDOM:
		/*
		 * Scheduling parameters for a guest
		 * were modified.
		 *
		 * data[0] = domid;
		 */
		XDT_PROBE1(XDT_SCHED_ADJDOM, data[0]);
		break;

	case TRC_SCHED_RUNSTATE_CHANGE:
		/*
		 * Runstate change for a VCPU.
		 *
		 * data[0] = (domain << 16) | vcpu;
		 * data[1] = oldstate;
		 * data[2] = newstate;
		 */
		XDT_PROBE4(XDT_SCHED_RUNSTATE_CHANGE, data[0] >> 16,
		    data[0] & 0xffff, data[1], data[2]);
		break;

	case TRC_SCHED_CONTINUE_RUNNING:
		/*
		 * VCPU continues to run on the same physical CPU that it
		 * was previously running on.
		 *
		 * data[0] = (domain << 16) | vcpu;
		 */
		XDT_PROBE2(XDT_SCHED_CONTINUE_RUNNING, data[0] >> 16,
		    data[0] & 0xffff);
		break;

	/*
	 * Mem probes
	 */
	case TRC_MEM_PAGE_GRANT_MAP:
		/*
		 * Guest mapped page grant
		 *
		 * data[0] = target domid
		 */
		XDT_PROBE1(XDT_MEM_PAGE_GRANT_MAP, data[0]);
		break;

	case TRC_MEM_PAGE_GRANT_UNMAP:
		/*
		 * Guest unmapped page grant
		 *
		 * data[0] = target domid
		 */
		XDT_PROBE1(XDT_MEM_PAGE_GRANT_UNMAP, data[0]);
		break;

	case TRC_MEM_PAGE_GRANT_TRANSFER:
		/*
		 * Page grant is being transferred
		 *
		 * data[0] = target domid
		 */
		XDT_PROBE1(XDT_MEM_PAGE_GRANT_TRANSFER, data[0]);
		break;

	/*
	 * Probes for PV domains.
	 */
	case TRC_PV_HYPERCALL:
		/*
		 * Hypercall from a 32-bit PV domain.
		 *
		 * data[0] = eip
		 * data[1] = eax
		 */
		XDT_PROBE2(XDT_PV_HYPERCALL, data[0], data[1]);
		break;

	case TRC_PV_HYPERCALL | TRC_64_FLAG:
		/*
		 * Hypercall from a 64-bit PV domain.
		 *
		 * data[0] = rip(0:31)
		 * data[1] = rip(32:63)
		 * data[2] = eax;
		 */
		rip64 = (((uint64_t)data[1]) << 32) | data[0];
		XDT_PROBE2(XDT_PV_HYPERCALL, rip64, data[2]);
		break;
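	/*
	 * For example (illustrative only), the hypercall probe lends
	 * itself to a frequency count by hypercall number (arg1 = %eax):
	 *
	 *	xdt:pv::hypercall { @calls[arg1] = count(); }
	 */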

	case TRC_PV_TRAP:
		/*
		 * Trap in a 32-bit PV domain.
		 *
		 * data[0] = eip
		 * data[1] = trapnr | (error_code_valid << 15)
		 *	| (error_code << 16);
		 */
		XDT_PROBE4(XDT_PV_TRAP, data[0], data[1] & 0x7fff,
		    (data[1] >> 15) & 1, data[1] >> 16);
		break;

	case TRC_PV_TRAP | TRC_64_FLAG:
		/*
		 * Trap in a 64-bit PV domain.
		 *
		 * data[0] = rip(0:31)
		 * data[1] = rip(32:63)
		 * data[2] = trapnr | (error_code_valid << 15)
		 *	| (error_code << 16);
		 */
		rip64 = (((uint64_t)data[1]) << 32) | data[0];
		XDT_PROBE4(XDT_PV_TRAP, rip64, data[2] & 0x7fff,
		    (data[2] >> 15) & 1, data[2] >> 16);
		break;

	case TRC_PV_PAGE_FAULT:
		/*
		 * Page fault in a 32-bit PV domain.
		 *
		 * data[0] = eip
		 * data[1] = vaddr
		 * data[2] = error code
		 */
		XDT_PROBE3(XDT_PV_PAGE_FAULT, data[0], data[1], data[2]);
		break;

	case TRC_PV_PAGE_FAULT | TRC_64_FLAG:
		/*
		 * Page fault in a 64-bit PV domain.
		 *
		 * data[0] = rip(0:31)
		 * data[1] = rip(32:63)
		 * data[2] = vaddr(0:31)
		 * data[3] = vaddr(32:63)
		 * data[4] = error code
		 */
		rip64 = (((uint64_t)data[1]) << 32) | data[0];
		addr64 = (((uint64_t)data[3]) << 32) | data[2];
		XDT_PROBE3(XDT_PV_PAGE_FAULT, rip64, addr64, data[4]);
		break;

	case TRC_PV_FORCED_INVALID_OP:
		/*
		 * Hypervisor emulated a forced invalid op (ud2)
		 * in a 32-bit PV domain.
		 *
		 * data[1] = eip
		 */
		XDT_PROBE1(XDT_PV_FORCED_INVALID_OP, data[1]);
		break;

	case TRC_PV_FORCED_INVALID_OP | TRC_64_FLAG:
		/*
		 * Hypervisor emulated a forced invalid op (ud2)
		 * in a 64-bit PV domain.
		 *
		 * data[1] = rip(0:31)
		 * data[2] = rip(32:63)
		 */
		rip64 = (((uint64_t)data[2]) << 32) | data[1];
		XDT_PROBE1(XDT_PV_FORCED_INVALID_OP, rip64);
		break;

	case TRC_PV_EMULATE_PRIVOP:
		/*
		 * Hypervisor emulated a privileged operation
		 * in a 32-bit PV domain.
		 *
		 * data[0] = eip
		 */
		XDT_PROBE1(XDT_PV_EMULATE_PRIVOP, data[0]);
		break;

	case TRC_PV_EMULATE_PRIVOP | TRC_64_FLAG:
		/*
		 * Hypervisor emulated a privileged operation
		 * in a 64-bit PV domain.
		 *
		 * data[0] = rip(0:31)
		 * data[1] = rip(32:63)
		 */
		rip64 = (((uint64_t)data[1]) << 32) | data[0];
		XDT_PROBE1(XDT_PV_EMULATE_PRIVOP, rip64);
		break;

	case TRC_PV_EMULATE_4GB:
		/* unused, 32-bit hypervisor only */
		break;

	case TRC_PV_MATH_STATE_RESTORE:
		/*
		 * Hypervisor restores math state after FP DNA trap.
		 *
		 * No arguments.
		 */
		XDT_PROBE0(XDT_PV_MATH_STATE_RESTORE);
		break;

	case TRC_PV_PAGING_FIXUP:
		/*
		 * Hypervisor fixed up a page fault (e.g. it was
		 * a side-effect of hypervisor guest page table
		 * bookkeeping, and not propagated to the guest).
		 *
		 * data[0] = eip
		 * data[1] = vaddr
		 */
		XDT_PROBE2(XDT_PV_PAGING_FIXUP, data[0], data[1]);
		break;

	case TRC_PV_PAGING_FIXUP | TRC_64_FLAG:
		/*
		 * Hypervisor fixed up a page fault (e.g. it was
		 * a side-effect of hypervisor guest page table
		 * bookkeeping, and not propagated to the guest).
		 *
		 * data[0] = rip(0:31)
		 * data[1] = rip(32:63)
		 * data[2] = vaddr(0:31)
		 * data[3] = vaddr(32:63)
		 */
		rip64 = (((uint64_t)data[1]) << 32) | data[0];
		addr64 = (((uint64_t)data[3]) << 32) | data[2];
		XDT_PROBE2(XDT_PV_PAGING_FIXUP, rip64, addr64);
		break;

	case TRC_PV_GDT_LDT_MAPPING_FAULT:
		/*
		 * Descriptor table mapping fault in a 32-bit PV domain.
		 *
		 * data[0] = eip
		 * data[1] = offset
		 */
		XDT_PROBE2(XDT_PV_DT_MAPPING_FAULT, data[0], data[1]);
		break;

	case TRC_PV_GDT_LDT_MAPPING_FAULT | TRC_64_FLAG:
		/*
		 * Descriptor table mapping fault in a 64-bit PV domain.
		 *
		 * data[0] = rip(0:31)
		 * data[1] = rip(32:63)
		 * data[2] = offset(0:31)
		 * data[3] = offset(32:63)
		 */
		rip64 = (((uint64_t)data[1]) << 32) | data[0];
		val64 = (((uint64_t)data[3]) << 32) | data[2];
		XDT_PROBE2(XDT_PV_DT_MAPPING_FAULT, rip64, val64);
		break;

	case TRC_PV_PTWR_EMULATION:
	case TRC_PV_PTWR_EMULATION_PAE | TRC_64_FLAG:
		/*
		 * Should only happen on a 32-bit hypervisor; unused.
		 */
		break;

	case TRC_PV_PTWR_EMULATION_PAE:
		/*
		 * PTE write emulation for a 32-bit PV domain.
		 *
		 * data[0] = pte
		 * data[1] = addr
		 * data[2] = eip
		 */
		XDT_PROBE3(XDT_PV_PTWR_EMULATION, data[0], data[1], data[2]);
		break;

	case TRC_PV_PTWR_EMULATION | TRC_64_FLAG:
		/*
		 * PTE write emulation for a 64-bit PV domain.
		 *
		 * data[0] = pte(0:31)
		 * data[1] = pte(32:63)
		 * data[2] = addr(0:31)
		 * data[3] = addr(32:63)
		 * data[4] = rip(0:31)
		 * data[5] = rip(32:63)
		 */
		pte64 = (((uint64_t)data[1]) << 32) | data[0];
		addr64 = (((uint64_t)data[3]) << 32) | data[2];
		rip64 = (((uint64_t)data[5]) << 32) | data[4];
		XDT_PROBE3(XDT_PV_PTWR_EMULATION, pte64, addr64, rip64);
		break;

	/*
	 * HVM probes
	 */
	case TRC_HVM_VMENTRY:
		/*
		 * Return to guest via vmx_launch/vmrun
		 */
		XDT_PROBE0(XDT_HVM_VMENTRY);
		break;

	case TRC_HVM_VMEXIT:
		/*
		 * Entry into VMEXIT handler from a 32-bit HVM domain
		 *
		 * data[0] = cpu vendor specific exit code
		 * data[1] = guest eip
		 */
		XDT_PROBE2(XDT_HVM_VMEXIT, data[0], data[1]);
		break;

	case TRC_HVM_VMEXIT64:
		/*
		 * Entry into VMEXIT handler from a 64-bit HVM domain
		 *
		 * data[0] = cpu vendor specific exit code
		 * data[1] = guest rip(0:31)
		 * data[2] = guest rip(32:63)
		 */
		rip64 = (((uint64_t)data[2]) << 32) | data[1];
		XDT_PROBE2(XDT_HVM_VMEXIT, data[0], rip64);
		break;
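	/*
	 * For example (illustrative only), vmexits can be summarized by
	 * their vendor-specific exit code (arg0):
	 *
	 *	xdt:hvm::vmexit { @exits[arg0] = count(); }
	 */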

	case TRC_HVM_PF_XEN64:
		/*
		 * Pagefault in a guest that is a Xen (e.g. shadow)
		 * artifact, and is not injected back into the guest.
		 *
		 * data[0] = error code
		 * data[1] = guest VA(0:31)
		 * data[2] = guest VA(32:63)
		 */
		addr64 = (((uint64_t)data[2]) << 32) | data[1];
		XDT_PROBE2(XDT_HVM_PF_XEN, data[0], addr64);
		break;

	case TRC_HVM_PF_XEN:
		/*
		 * Same as above, but for a 32-bit HVM domain.
		 *
		 * data[0] = error code
		 * data[1] = guest VA
		 */
		XDT_PROBE2(XDT_HVM_PF_XEN, data[0], data[1]);
		break;

	case TRC_HVM_PF_INJECT:
		/*
		 * 32-bit Xen only.
		 */
		break;

	case TRC_HVM_PF_INJECT64:
		/*
		 * Pagefault injected back into a guest (e.g. the shadow
		 * code found no mapping).
		 *
		 * data[0] = error code
		 * data[1] = guest VA(0:31)
		 * data[2] = guest VA(32:63)
		 */
		addr64 = (((uint64_t)data[2]) << 32) | data[1];
		XDT_PROBE2(XDT_HVM_PF_INJECT, data[0], addr64);
		break;

	case TRC_HVM_INJ_EXC:
		/*
		 * Exception injected into an HVM guest.
		 *
		 * data[0] = trap
		 * data[1] = error code
		 */
		XDT_PROBE2(XDT_HVM_EXC_INJECT, data[0], data[1]);
		break;

	case TRC_HVM_INJ_VIRQ:
		/*
		 * Interrupt injected into an HVM guest.
		 *
		 * data[0] = vector
		 */
		XDT_PROBE1(XDT_HVM_VIRQ_INJECT, data[0]);
		break;

	case TRC_HVM_REINJ_VIRQ:
	case TRC_HVM_IO_READ:
	case TRC_HVM_IO_WRITE:
		/* unused */
		break;

	case TRC_HVM_CR_READ64:
		/*
		 * Control register read. Intel VMX only.
		 *
		 * data[0] = control register #
		 * data[1] = value(0:31)
		 * data[2] = value(32:63)
		 */
		val64 = (((uint64_t)data[2]) << 32) | data[1];
		XDT_PROBE2(XDT_HVM_CR_READ, data[0], val64);
		break;

	case TRC_HVM_CR_READ:
		/*
		 * unused (32-bit Xen only)
		 */
		break;

	case TRC_HVM_CR_WRITE64:
		/*
		 * Control register write. Intel VMX only.
		 *
		 * data[0] = control register #
		 * data[1] = value(0:31)
		 * data[2] = value(32:63)
		 */
		val64 = (((uint64_t)data[2]) << 32) | data[1];
		XDT_PROBE2(XDT_HVM_CR_WRITE, data[0], val64);
		break;

	case TRC_HVM_CR_WRITE:
		/*
		 * unused (32-bit Xen only)
		 */
		break;

	case TRC_HVM_DR_READ:
		/*
		 * unused.
		 *
		 * data[0] = (domid<<16 + vcpuid)
		 */
		break;

	case TRC_HVM_DR_WRITE:
		/*
		 * Debug register write. Not too useful; no values,
		 * so we ignore this.
		 *
		 * data[0] = (domid<<16 + vcpuid)
		 */
		break;

	case TRC_HVM_MSR_READ:
		/*
		 * MSR read.
		 *
		 * data[0] = MSR
		 * data[1] = value(0:31)
		 * data[2] = value(32:63)
		 */
		val64 = (((uint64_t)data[2]) << 32) | data[1];
		XDT_PROBE2(XDT_HVM_MSR_READ, data[0], val64);
		break;

	case TRC_HVM_MSR_WRITE:
		/*
		 * MSR write.
		 *
		 * data[0] = MSR;
		 * data[1] = value(0:31)
		 * data[2] = value(32:63)
		 */
		val64 = (((uint64_t)data[2]) << 32) | data[1];
		XDT_PROBE2(XDT_HVM_MSR_WRITE, data[0], val64);
		break;

	case TRC_HVM_CPUID:
		/*
		 * CPUID insn.
		 *
		 * data[0] = %eax (input)
		 * data[1] = %eax
		 * data[2] = %ebx
		 * data[3] = %ecx
		 * data[4] = %edx
		 */
		XDT_PROBE5(XDT_HVM_CPUID, data[0], data[1], data[2], data[3],
		    data[4]);
		break;

	case TRC_HVM_INTR:
		/*
		 * VMEXIT because of an interrupt.
		 */
		XDT_PROBE0(XDT_HVM_INTR);
		break;

	case TRC_HVM_INTR_WINDOW:
		/*
		 * VMEXIT because of an interrupt window (an interrupt
		 * can't be delivered immediately to an HVM guest and must
		 * be delayed).
		 *
		 * data[0] = vector
		 * data[1] = source
		 * data[2] = info
		 */
		XDT_PROBE3(XDT_HVM_INTR_WINDOW, data[0], data[1], data[2]);
		break;

	case TRC_HVM_NMI:
		/*
		 * VMEXIT because of an NMI.
		 */
		XDT_PROBE0(XDT_HVM_NMI);
		break;

	case TRC_HVM_SMI:
		/*
		 * VMEXIT because of an SMI.
		 */
		XDT_PROBE0(XDT_HVM_SMI);
		break;

	case TRC_HVM_VMMCALL:
		/*
		 * VMMCALL insn.
		 *
		 * data[0] = %eax
		 */
		XDT_PROBE1(XDT_HVM_VMMCALL, data[0]);
		break;

	case TRC_HVM_HLT:
		/*
		 * HLT insn.
		 *
		 * data[0] = 1 if VCPU runnable, 0 if not
		 */
		XDT_PROBE1(XDT_HVM_HLT, data[0]);
		break;

	case TRC_HVM_INVLPG64:
		/*
		 * INVLPG insn.
		 *
		 * data[0] = INVLPGA ? 1 : 0
		 * data[1] = vaddr(0:31)
		 * data[2] = vaddr(32:63)
		 */
		addr64 = (((uint64_t)data[2]) << 32) | data[1];
		XDT_PROBE2(XDT_HVM_INVLPG, data[0], addr64);
		break;

	case TRC_HVM_INVLPG:
		/*
		 * unused (32-bit Xen only)
		 *
		 * data[0] = (domid<<16 + vcpuid)
		 */
		break;

	case TRC_HVM_MCE:
		/*
		 * #MCE VMEXIT
		 */
		XDT_PROBE0(XDT_HVM_MCE);
		break;

	case TRC_HVM_IOPORT_READ:
	case TRC_HVM_IOPORT_WRITE:
	case TRC_HVM_IOMEM_READ:
	case TRC_HVM_IOMEM_WRITE:
		/*
		 * data[0] = addr(0:31)
		 * data[1] = addr(32:63)
		 * data[2] = count
		 * data[3] = size
		 */
		switch (rec->event) {
		case TRC_HVM_IOPORT_READ:
			eid = XDT_HVM_IOPORT_READ;
			break;
		case TRC_HVM_IOPORT_WRITE:
			eid = XDT_HVM_IOPORT_WRITE;
			break;
		case TRC_HVM_IOMEM_READ:
			eid = XDT_HVM_IOMEM_READ;
			break;
		case TRC_HVM_IOMEM_WRITE:
			eid = XDT_HVM_IOMEM_WRITE;
			break;
		}
		addr64 = (((uint64_t)data[1]) << 32) | data[0];
		XDT_PROBE3(eid, addr64, data[2], data[3]);
		break;

	case TRC_HVM_CLTS:
		/*
		 * CLTS insn (Intel VMX only)
		 */
		XDT_PROBE0(XDT_HVM_CLTS);
		break;

	case TRC_HVM_LMSW64:
		/*
		 * LMSW insn.
		 *
		 * data[0] = value(0:31)
		 * data[1] = value(32:63)
		 */
		val64 = (((uint64_t)data[1]) << 32) | data[0];
		XDT_PROBE1(XDT_HVM_LMSW, val64);
		break;

	case TRC_HVM_LMSW:
		/*
		 * unused (32-bit Xen only)
		 */
		break;

	/*
	 * Shadow page table probes (mainly used for HVM domains
	 * without hardware paging support).
	 */
1449 */ 1450 case TRC_SHADOW_NOT_SHADOW | SH_GUEST_32: 1451 /* 1452 * data[0] = pte(0:31) 1453 * data[1] = pte(32:63) 1454 * data[2] = va 1455 * data[3] = flags 1456 */ 1457 pte64 = ((uint64_t)data[1] << 32) | data[0]; 1458 XDT_PROBE3(XDT_SHADOW_NOT_SHADOW, pte64, data[2], data[3]); 1459 break; 1460 case TRC_SHADOW_NOT_SHADOW | SH_GUEST_PAE: 1461 case TRC_SHADOW_NOT_SHADOW | SH_GUEST_64: 1462 /* 1463 * data[0] = pte(0:31) 1464 * data[1] = pte(32:63) 1465 * data[2] = va(0:31) 1466 * data[3] = va(32:63) 1467 * data[4] = flags 1468 */ 1469 addr64 = ((uint64_t)data[2] << 32) | data[3]; 1470 pte64 = ((uint64_t)data[1] << 32) | data[0]; 1471 XDT_PROBE3(XDT_SHADOW_NOT_SHADOW, pte64, addr64, data[4]); 1472 break; 1473 case TRC_SHADOW_FAST_PROPAGATE | SH_GUEST_32: 1474 /* 1475 * data[0] = va 1476 */ 1477 XDT_PROBE1(XDT_SHADOW_FAST_PROPAGATE, data[0]); 1478 break; 1479 case TRC_SHADOW_FAST_PROPAGATE | SH_GUEST_PAE: 1480 case TRC_SHADOW_FAST_PROPAGATE | SH_GUEST_64: 1481 /* 1482 * data[0] = va(0:31) 1483 * data[1] = va(32:63) 1484 */ 1485 addr64 = ((uint64_t)data[1] << 32) | data[0]; 1486 XDT_PROBE1(XDT_SHADOW_FAST_PROPAGATE, addr64); 1487 break; 1488 case TRC_SHADOW_FAST_MMIO | SH_GUEST_32: 1489 /* 1490 * data[0] = va 1491 */ 1492 XDT_PROBE1(XDT_SHADOW_FAST_MMIO, data[0]); 1493 break; 1494 case TRC_SHADOW_FAST_MMIO | SH_GUEST_PAE: 1495 case TRC_SHADOW_FAST_MMIO | SH_GUEST_64: 1496 /* 1497 * data[0] = va(0:31) 1498 * data[1] = va(32:63) 1499 */ 1500 addr64 = ((uint64_t)data[1] << 32) | data[0]; 1501 XDT_PROBE1(XDT_SHADOW_FAST_MMIO, addr64); 1502 break; 1503 case TRC_SHADOW_FALSE_FAST_PATH | SH_GUEST_32: 1504 /* 1505 * data[0] = va 1506 */ 1507 XDT_PROBE1(XDT_SHADOW_FALSE_FAST_PATH, data[0]); 1508 break; 1509 case TRC_SHADOW_FALSE_FAST_PATH | SH_GUEST_PAE: 1510 case TRC_SHADOW_FALSE_FAST_PATH | SH_GUEST_64: 1511 /* 1512 * data[0] = va(0:31) 1513 * data[1] = va(32:63) 1514 */ 1515 addr64 = ((uint64_t)data[1] << 32) | data[0]; 1516 XDT_PROBE1(XDT_SHADOW_FALSE_FAST_PATH, addr64); 1517 break; 1518 case TRC_SHADOW_MMIO | SH_GUEST_32: 1519 /* 1520 * data[0] = va 1521 */ 1522 XDT_PROBE1(XDT_SHADOW_MMIO, data[0]); 1523 break; 1524 case TRC_SHADOW_MMIO | SH_GUEST_PAE: 1525 case TRC_SHADOW_MMIO | SH_GUEST_64: 1526 /* 1527 * data[0] = va(0:31) 1528 * data[1] = va(32:63) 1529 */ 1530 addr64 = ((uint64_t)data[1] << 32) | data[0]; 1531 XDT_PROBE1(XDT_SHADOW_MMIO, addr64); 1532 break; 1533 case TRC_SHADOW_FIXUP | SH_GUEST_32: 1534 /* 1535 * data[0] = pte(0:31) 1536 * data[1] = pte(32:63) 1537 * data[2] = va 1538 * data[3] = flags 1539 */ 1540 pte64 = ((uint64_t)data[1] << 32) | data[0]; 1541 XDT_PROBE3(XDT_SHADOW_FIXUP, pte64, data[2], data[3]); 1542 break; 1543 case TRC_SHADOW_FIXUP | SH_GUEST_64: 1544 case TRC_SHADOW_FIXUP | SH_GUEST_PAE: 1545 /* 1546 * data[0] = pte(0:31) 1547 * data[1] = pte(32:63) 1548 * data[2] = va(0:31) 1549 * data[3] = va(32:63) 1550 * data[4] = flags 1551 */ 1552 addr64 = ((uint64_t)data[2] << 32) | data[3]; 1553 pte64 = ((uint64_t)data[1] << 32) | data[0]; 1554 XDT_PROBE3(XDT_SHADOW_FIXUP, pte64, addr64, data[4]); 1555 break; 1556 case TRC_SHADOW_DOMF_DYING | SH_GUEST_32: 1557 /* 1558 * data[0] = va 1559 */ 1560 XDT_PROBE1(XDT_SHADOW_DOMF_DYING, data[0]); 1561 break; 1562 case TRC_SHADOW_DOMF_DYING | SH_GUEST_PAE: 1563 case TRC_SHADOW_DOMF_DYING | SH_GUEST_64: 1564 /* 1565 * data[0] = va(0:31) 1566 * data[1] = va(32:63) 1567 */ 1568 addr64 = ((uint64_t)data[1] << 32) | data[0]; 1569 XDT_PROBE1(XDT_SHADOW_DOMF_DYING, addr64); 1570 break; 1571 case TRC_SHADOW_EMULATE | SH_GUEST_32: 1572 /* 

	case TRC_SHADOW_EMULATE | SH_GUEST_32:
		/*
		 * data[0] = pte(0:31)
		 * data[1] = pte(32:63)
		 * data[2] = val(0:31)
		 * data[3] = val(32:63)
		 * data[4] = addr
		 * data[5] = flags
		 */
		pte64 = ((uint64_t)data[1] << 32) | data[0];
		val64 = ((uint64_t)data[3] << 32) | data[2];
		XDT_PROBE5(XDT_SHADOW_EMULATE, pte64, val64, data[4],
		    data[5] & 0x7fffffff, data[5] >> 29);
		break;

	case TRC_SHADOW_EMULATE | SH_GUEST_PAE:
	case TRC_SHADOW_EMULATE | SH_GUEST_64:
		/*
		 * data[0] = pte(0:31)
		 * data[1] = pte(32:63)
		 * data[2] = val(0:31)
		 * data[3] = val(32:63)
		 * data[4] = addr(0:31)
		 * data[5] = addr(32:63)
		 * data[6] = flags
		 */
		pte64 = ((uint64_t)data[1] << 32) | data[0];
		val64 = ((uint64_t)data[3] << 32) | data[2];
		addr64 = ((uint64_t)data[5] << 32) | data[4];
		XDT_PROBE5(XDT_SHADOW_EMULATE, pte64, val64, addr64,
		    data[6] & 0x7fffffff, data[6] >> 29);
		break;

	case TRC_SHADOW_EMULATE_UNSHADOW_USER | SH_GUEST_32:
		/*
		 * data[0] = gfn
		 * data[1] = vaddr
		 */
		XDT_PROBE2(XDT_SHADOW_EMULATE_UNSHADOW_USER, data[0], data[1]);
		break;

	case TRC_SHADOW_EMULATE_UNSHADOW_USER | SH_GUEST_PAE:
	case TRC_SHADOW_EMULATE_UNSHADOW_USER | SH_GUEST_64:
		/*
		 * data[0] = gfn(0:31)
		 * data[1] = gfn(32:63)
		 * data[2] = vaddr(0:31)
		 * data[3] = vaddr(32:63)
		 */
		val64 = ((uint64_t)data[1] << 32) | data[0];
		addr64 = ((uint64_t)data[3] << 32) | data[2];
		XDT_PROBE2(XDT_SHADOW_EMULATE_UNSHADOW_USER, val64, addr64);
		break;

	case TRC_SHADOW_EMULATE_UNSHADOW_EVTINJ | SH_GUEST_32:
		/*
		 * data[0] = gfn
		 * data[1] = vaddr
		 */
		XDT_PROBE2(XDT_SHADOW_EMULATE_UNSHADOW_EVTINJ, data[0],
		    data[1]);
		break;

	case TRC_SHADOW_EMULATE_UNSHADOW_EVTINJ | SH_GUEST_PAE:
	case TRC_SHADOW_EMULATE_UNSHADOW_EVTINJ | SH_GUEST_64:
		/*
		 * data[0] = gfn(0:31)
		 * data[1] = gfn(32:63)
		 * data[2] = vaddr(0:31)
		 * data[3] = vaddr(32:63)
		 */
		val64 = ((uint64_t)data[1] << 32) | data[0];
		addr64 = ((uint64_t)data[3] << 32) | data[2];
		XDT_PROBE2(XDT_SHADOW_EMULATE_UNSHADOW_EVTINJ, val64, addr64);
		break;

	case TRC_SHADOW_EMULATE_UNSHADOW_UNHANDLED | SH_GUEST_32:
		/*
		 * data[0] = gfn
		 * data[1] = vaddr
		 */
		XDT_PROBE2(XDT_SHADOW_EMULATE_UNSHADOW_UNHANDLED, data[0],
		    data[1]);
		break;

	case TRC_SHADOW_EMULATE_UNSHADOW_UNHANDLED | SH_GUEST_PAE:
	case TRC_SHADOW_EMULATE_UNSHADOW_UNHANDLED | SH_GUEST_64:
		/*
		 * data[0] = gfn(0:31)
		 * data[1] = gfn(32:63)
		 * data[2] = vaddr(0:31)
		 * data[3] = vaddr(32:63)
		 */
		val64 = ((uint64_t)data[1] << 32) | data[0];
		addr64 = ((uint64_t)data[3] << 32) | data[2];
		XDT_PROBE2(XDT_SHADOW_EMULATE_UNSHADOW_UNHANDLED, val64,
		    addr64);
		break;

	case TRC_SHADOW_WRMAP_BF:
		/*
		 * data[0] = gfn(0:31)
		 * data[1] = gfn(32:63)
		 */
		val64 = ((uint64_t)data[1] << 32) | data[0];
		XDT_PROBE1(XDT_SHADOW_WRMAP_BF, val64);
		break;

	case TRC_SHADOW_PREALLOC_UNPIN:
		/*
		 * data[0] = gfn(0:31)
		 * data[1] = gfn(32:63)
		 */
		val64 = ((uint64_t)data[1] << 32) | data[0];
		XDT_PROBE1(XDT_SHADOW_PREALLOC_UNPIN, val64);
		break;

	case TRC_SHADOW_RESYNC_FULL:
		/*
		 * data[0] = gmfn(0:31)
		 * data[1] = gmfn(32:63)
		 */
		val64 = ((uint64_t)data[1] << 32) | data[0];
		XDT_PROBE1(XDT_SHADOW_RESYNC_FULL, val64);
		break;

	case TRC_SHADOW_RESYNC_ONLY:
		/*
		 * data[0] = gmfn(0:31)
		 * data[1] = gmfn(32:63)
		 */
		val64 = ((uint64_t)data[1] << 32) | data[0];
		XDT_PROBE1(XDT_SHADOW_RESYNC_ONLY, val64);
		break;

	/*
	 * Power management probes.
	 */
	case TRC_PM_FREQ_CHANGE:
		/*
		 * data[0] = old freq
		 * data[1] = new freq
		 */
		XDT_PROBE2(XDT_PM_FREQ_CHANGE, data[0], data[1]);
		break;

	case TRC_PM_IDLE_ENTRY:
		/*
		 * data[0] = C-state
		 * data[1] = time
		 */
		XDT_PROBE2(XDT_PM_IDLE_ENTRY, data[0], data[1]);
		break;

	case TRC_PM_IDLE_EXIT:
		/*
		 * data[0] = C-state
		 * data[1] = time
		 */
		XDT_PROBE2(XDT_PM_IDLE_EXIT, data[0], data[1]);
		break;

	case TRC_LOST_RECORDS:
		vcpu = data[1] >> 16;
		dom = data[1] & 0xffff;
		xdt_update_sched_context(cpuid, dom, vcpu);
		xdt_update_domain_context(dom, vcpu);
		XDT_PROBE1(XDT_TRC_LOST_RECORDS, cpuid);
		tbuf.stat_dropped_recs++;
		break;

	default:
		tbuf.stat_unknown_recs++;
		break;
	}

done:
	rec_size = 4 + (rec->cycles_included ? 8 : 0) + (rec->extra_u32 * 4);
	return (rec_size);
}

/*
 * Scan all CPU buffers for the record with the lowest timestamp so
 * that the probes will fire in order.
 */
static int
xdt_get_first_rec(uint_t *cpuidp, struct t_rec **recp, uint32_t *consp)
{
	uint_t cpuid;
	uint32_t prod, cons, offset;
	struct t_rec *rec;
	uint64_t minstamp = ~0ULL, stamp;
	uintptr_t data;

	for (cpuid = 0; cpuid < tbuf.cnt; cpuid++) {
		cons = tbuf.meta[cpuid]->cons;
		prod = tbuf.meta[cpuid]->prod;
		membar_consumer();
		if (prod == cons)
			continue;

		offset = cons % tbuf_data_size;
		data = (uintptr_t)tbuf.data[cpuid] + offset;
		rec = (struct t_rec *)data;
		ASSERT((caddr_t)rec < tbuf.va + (tbuf.size * (cpuid + 1)));

		/*
		 * All records that we know about have time cycles included.
		 * If this record doesn't have them, assume it's a type
		 * that we don't handle. Use a 0 time value, which will make
		 * it get handled first (it will be thrown away).
		 */
		if (rec->cycles_included)
			stamp = (((uint64_t)rec->u.cycles.cycles_hi) << 32)
			    | rec->u.cycles.cycles_lo;
		else
			stamp = 0;

		if (stamp < minstamp) {
			minstamp = stamp;
			*cpuidp = cpuid;
			*recp = rec;
			*consp = cons;
		}
	}

	if (minstamp != ~0ULL)
		return (1);

	return (0);
}

/*ARGSUSED*/
static void
xdt_tbuf_scan(void *arg)
{
	uint32_t bytes_done, cons;
	struct t_rec *rec;
	xdt_schedinfo_t *sp;
	uint_t nrecs, cpuid;

	for (nrecs = 0;
	    nrecs < xdt_max_recs && xdt_get_first_rec(&cpuid, &rec, &cons) > 0;
	    nrecs++) {
		xdt_curpcpu = cpuid;
		sp = &xdt_cpu_schedinfo[cpuid];
		if (sp->curinfo_valid)
			xdt_update_domain_context(sp->cur_domid,
			    sp->cur_vcpuid);

		bytes_done = xdt_process_rec(cpuid, rec);
		cons += bytes_done;
		/*
		 * cons and prod are incremented modulo (2 * tbuf_data_size);
		 * see <xen/public/trace.h> and the worked example after
		 * this function.
		 */
		if (cons >= 2 * tbuf_data_size)
			cons -= 2 * tbuf_data_size;
		membar_exit();
		tbuf.meta[cpuid]->cons = cons;
	}
}
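/*
 * A worked example of the wraparound above (hypothetical numbers): with
 * tbuf_data_size == 1000, a raw cons of 1600 refers to buffer offset 600
 * (1600 % 1000); consuming a 40-byte record advances cons to 1640, and
 * only once cons reaches 2000 does it wrap back to 0. Keeping the
 * counters modulo twice the buffer size presumably lets prod == cons
 * unambiguously mean "empty" while a completely full buffer remains
 * distinguishable (prod == cons + tbuf_data_size).
 */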

static void
xdt_cyclic_enable(void)
{
	cyc_handler_t hdlr;
	cyc_time_t when;

	ASSERT(MUTEX_HELD(&cpu_lock));

	hdlr.cyh_func = xdt_tbuf_scan;
	hdlr.cyh_arg = NULL;
	hdlr.cyh_level = CY_LOW_LEVEL;

	when.cyt_interval = xdt_poll_nsec;
	when.cyt_when = dtrace_gethrtime() + when.cyt_interval;

	xdt_cyclic = cyclic_add(&hdlr, &when);
}

static void
xdt_probe_create(xdt_probe_t *p)
{
	ASSERT(p != NULL && p->pr_mod != NULL);

	if (dtrace_probe_lookup(xdt_id, p->pr_mod, NULL, p->pr_name) != 0)
		return;

	xdt_prid[p->evt_id] = dtrace_probe_create(xdt_id, p->pr_mod, NULL,
	    p->pr_name, dtrace_mach_aframes(), p);
}

/*ARGSUSED*/
static void
xdt_provide(void *arg, const dtrace_probedesc_t *desc)
{
	const char *mod, *name;
	int i;

	if (desc == NULL) {
		for (i = 0; xdt_probe[i].pr_mod != NULL; i++) {
			xdt_probe_create(&xdt_probe[i]);
		}
	} else {
		mod = desc->dtpd_mod;
		name = desc->dtpd_name;
		for (i = 0; xdt_probe[i].pr_mod != NULL; i++) {
			int l1 = strlen(xdt_probe[i].pr_name);
			int l2 = strlen(xdt_probe[i].pr_mod);
			if (strncmp(name, xdt_probe[i].pr_name, l1) == 0 &&
			    strncmp(mod, xdt_probe[i].pr_mod, l2) == 0)
				break;
		}

		if (xdt_probe[i].pr_mod == NULL)
			return;
		xdt_probe_create(&xdt_probe[i]);
	}
}

/*ARGSUSED*/
static void
xdt_destroy(void *arg, dtrace_id_t id, void *parg)
{
	xdt_probe_t *p = parg;
	xdt_prid[p->evt_id] = 0;
}

static void
xdt_set_trace_mask(uint32_t mask)
{
	xen_sysctl_tbuf_op_t tbuf_op;

	/* Always need to trace scheduling, for context */
	if (mask != 0)
		mask |= TRC_SCHED;
	tbuf_op.evt_mask = mask;
	tbuf_op.cmd = XEN_SYSCTL_TBUFOP_set_evt_mask;
	(void) xdt_sysctl_tbuf(&tbuf_op);
}
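/*
 * A consequence of the above: enabling any xdt probe (say, an hvm one)
 * also enables TRC_SCHED records in the hypervisor. Those scheduling
 * records are what allow xdt_process_rec() to attribute subsequent
 * events to the right domain and vcpu, even when no sched probes are
 * enabled by the consumer.
 */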

/*ARGSUSED*/
static int
xdt_enable(void *arg, dtrace_id_t id, void *parg)
{
	xdt_probe_t *p = parg;
	xen_sysctl_tbuf_op_t tbuf_op;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(xdt_prid[p->evt_id] != 0);

	xdt_probemap[p->evt_id] = xdt_prid[p->evt_id];
	xdt_classinfo[p->class].cnt++;

	if (xdt_classinfo[p->class].cnt == 1) {
		/* set the trace mask for this class */
		cur_trace_mask |= xdt_classinfo[p->class].trc_mask;
		xdt_set_trace_mask(cur_trace_mask);
	}

	if (xdt_cyclic == CYCLIC_NONE) {
		tbuf_op.cmd = XEN_SYSCTL_TBUFOP_enable;
		if (xdt_sysctl_tbuf(&tbuf_op) != 0) {
			cmn_err(CE_NOTE, "Couldn't enable hypervisor tracing.");
			return (-1);
		}

		xdt_cyclic_enable();
	}
	return (0);
}

/*ARGSUSED*/
static void
xdt_disable(void *arg, dtrace_id_t id, void *parg)
{
	xdt_probe_t *p = parg;
	xen_sysctl_tbuf_op_t tbuf_op;
	int i, err;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(xdt_probemap[p->evt_id] != 0);
	ASSERT(xdt_probemap[p->evt_id] == xdt_prid[p->evt_id]);
	ASSERT(xdt_classinfo[p->class].cnt > 0);

	/*
	 * We could be here in the slight window between the cyclic firing and
	 * a call to dtrace_probe() occurring. We need to be careful if we tear
	 * down any shared state.
	 */

	xdt_probemap[p->evt_id] = 0;
	xdt_classinfo[p->class].cnt--;

	if (xdt_nr_active_probes() == 0) {
		cur_trace_mask = 0;

		if (xdt_cyclic == CYCLIC_NONE)
			return;

		for (i = 0; i < xdt_ncpus; i++)
			xdt_cpu_schedinfo[i].curinfo_valid = 0;

		/*
		 * We will try to disable the trace buffers. If we fail for
		 * some reason we will try again, up to a count of
		 * XDT_TBUF_RETRY. If we still aren't successful we try to
		 * set the trace mask to 0 in order to prevent trace records
		 * from being written.
		 */
		tbuf_op.cmd = XEN_SYSCTL_TBUFOP_disable;
		i = 0;
		do {
			err = xdt_sysctl_tbuf(&tbuf_op);
		} while ((err != 0) && (++i < XDT_TBUF_RETRY));

		if (err != 0) {
			cmn_err(CE_NOTE,
			    "Couldn't disable hypervisor tracing.");
			xdt_set_trace_mask(0);
		} else {
			cyclic_remove(xdt_cyclic);
			xdt_cyclic = CYCLIC_NONE;
			/*
			 * We don't bother making the hypercall to set
			 * the trace mask, since it will be reset when
			 * tracing is re-enabled.
			 */
		}
	} else if (xdt_classinfo[p->class].cnt == 0) {
		cur_trace_mask ^= xdt_classinfo[p->class].trc_mask;
		/* other probes are enabled, so add the sub-class mask back */
		cur_trace_mask |= 0xF000;
		xdt_set_trace_mask(cur_trace_mask);
	}
}

static dtrace_pattr_t xdt_attr = {
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_PLATFORM },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_PLATFORM },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_PLATFORM },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_PLATFORM },
};

static dtrace_pops_t xdt_pops = {
	xdt_provide,		/* dtps_provide() */
	NULL,			/* dtps_provide_module() */
	xdt_enable,		/* dtps_enable() */
	xdt_disable,		/* dtps_disable() */
	NULL,			/* dtps_suspend() */
	NULL,			/* dtps_resume() */
	NULL,			/* dtps_getargdesc() */
	NULL,			/* dtps_getargval() */
	NULL,			/* dtps_usermode() */
	xdt_destroy		/* dtps_destroy() */
};

static int
xdt_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
	int val;

	if (!DOMAIN_IS_INITDOMAIN(xen_info))
		return (DDI_FAILURE);

	switch (cmd) {
	case DDI_ATTACH:
		break;

	case DDI_RESUME:
		/*
		 * We might support proper suspend/resume in the future, so
		 * return DDI_FAILURE for now.
		 */
		return (DDI_FAILURE);

	default:
		return (DDI_FAILURE);
	}

	xdt_ncpus = xpv_nr_phys_cpus();
	ASSERT(xdt_ncpus > 0);

	if (ddi_create_minor_node(devi, "xdt", S_IFCHR, 0, DDI_PSEUDO, 0) ==
	    DDI_FAILURE || xdt_attach_trace_buffers() != 0 ||
	    dtrace_register("xdt", &xdt_attr, DTRACE_PRIV_KERNEL, NULL,
	    &xdt_pops, NULL, &xdt_id) != 0) {
		if (tbuf.va != NULL)
			xdt_detach_trace_buffers();
		ddi_remove_minor_node(devi, NULL);
		return (DDI_FAILURE);
	}

	val = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
	    "xdt_poll_nsec", XDT_POLL_DEFAULT);
	xdt_poll_nsec = MAX(val, XDT_POLL_MIN);

	xdt_cpu_schedinfo = (xdt_schedinfo_t *)kmem_zalloc(xdt_ncpus *
	    sizeof (xdt_schedinfo_t), KM_SLEEP);
	xdt_init_trace_masks();
	xdt_kstat_init();

	xdt_devi = devi;
	ddi_report_dev(devi);
	return (DDI_SUCCESS);
}

static int
xdt_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
{
	switch (cmd) {
	case DDI_DETACH:
		break;

	case DDI_SUSPEND:
		/*
		 * We might support proper suspend/resume in the future, so
		 * return DDI_FAILURE for now.
		 */
		return (DDI_FAILURE);

	default:
		return (DDI_FAILURE);
	}

	if (dtrace_unregister(xdt_id) != 0)
		return (DDI_FAILURE);

	xdt_detach_trace_buffers();
	kmem_free(xdt_cpu_schedinfo, xdt_ncpus * sizeof (xdt_schedinfo_t));
	if (xdt_cyclic != CYCLIC_NONE)
		cyclic_remove(xdt_cyclic);
	if (xdt_kstats != NULL)
		kstat_delete(xdt_kstats);
	xdt_devi = (void *)0;
	ddi_remove_minor_node(devi, NULL);

	return (DDI_SUCCESS);
}

/*ARGSUSED*/
static int
xdt_info(dev_info_t *devi, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	int error;

	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = xdt_devi;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
	}
	return (error);
}

static struct cb_ops xdt_cb_ops = {
	nulldev,		/* open(9E) */
	nodev,			/* close(9E) */
	nodev,			/* strategy(9E) */
	nodev,			/* print(9E) */
	nodev,			/* dump(9E) */
	nodev,			/* read(9E) */
	nodev,			/* write(9E) */
	nodev,			/* ioctl(9E) */
	nodev,			/* devmap(9E) */
	nodev,			/* mmap(9E) */
	nodev,			/* segmap(9E) */
	nochpoll,		/* chpoll(9E) */
	ddi_prop_op,		/* prop_op(9E) */
	NULL,			/* streamtab(9S) */
	D_MP | D_64BIT | D_NEW	/* cb_flag */
};

static struct dev_ops xdt_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* devo_refcnt */
	xdt_info,		/* getinfo(9E) */
	nulldev,		/* identify(9E) */
	nulldev,		/* probe(9E) */
	xdt_attach,		/* attach(9E) */
	xdt_detach,		/* detach(9E) */
	nulldev,		/* devo_reset */
	&xdt_cb_ops,		/* devo_cb_ops */
	NULL,			/* devo_bus_ops */
	NULL,			/* power(9E) */
	ddi_quiesce_not_needed,	/* devo_quiesce */
};


static struct modldrv modldrv = {
	&mod_driverops,
	"Hypervisor event tracing",
	&xdt_ops
};

static struct modlinkage modlinkage = {
	MODREV_1,
	&modldrv,
	NULL
};

int
_init(void)
{
	return (mod_install(&modlinkage));
}

int
_fini(void)
{
	return (mod_remove(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}