1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 /*
  27  * Copyright (c) 2011, Joyent, Inc. All rights reserved.
  28  */
  29 
  30 #include <sys/errno.h>
  31 #include <sys/stat.h>
  32 #include <sys/modctl.h>
  33 #include <sys/conf.h>
  34 #include <sys/systm.h>
  35 #include <sys/ddi.h>
  36 #include <sys/sunddi.h>
  37 #include <sys/cpuvar.h>
  38 #include <sys/kmem.h>
  39 #include <sys/strsubr.h>
  40 #include <sys/dtrace.h>
  41 #include <sys/cyclic.h>
  42 #include <sys/atomic.h>
  43 
  44 static dev_info_t *profile_devi;
  45 static dtrace_provider_id_t profile_id;
  46 
  47 /*
  48  * Regardless of platform, the stack frames look like this in the case of the
  49  * profile provider:
  50  *
  51  *      profile_fire
  52  *      cyclic_expire
  53  *      cyclic_fire
  54  *      [ cbe ]
  55  *      [ interrupt code ]
  56  *
  57  * On x86, there are five frames from the generic interrupt code; further, the
  58  * interrupted instruction appears as its own stack frame, giving us a total of
  59  * 10.
  60  *
  61  * On SPARC, the picture is further complicated because the compiler
  62  * optimizes away tail-calls -- so the following frames are optimized away:
  63  *
  64  *      profile_fire
  65  *      cyclic_expire
  66  *
  67  * This gives three frames.  However, on DEBUG kernels, the cyclic_expire
  68  * frame cannot be tail-call eliminated, yielding four frames in this case.
  69  *
  70  * All of the above constraints lead to the mess below.  Yes, the profile
  71  * provider should ideally figure this out on-the-fly by hitting one of its own
  72  * probes and then walking its own stack trace.  This is complicated, however,
  73  * and the static definition doesn't seem to be overly brittle.  Still, we
  74  * allow for a manual override in case we get it completely wrong.
  75  */
  76 #ifdef __x86
  77 #define PROF_ARTIFICIAL_FRAMES  10
  78 #else
  79 #ifdef __sparc
  80 #ifdef DEBUG
  81 #define PROF_ARTIFICIAL_FRAMES  4
  82 #else
  83 #define PROF_ARTIFICIAL_FRAMES  3
  84 #endif
  85 #endif
  86 #endif
  87 
  88 #define PROF_NAMELEN            15
  89 
  90 #define PROF_PROFILE            0
  91 #define PROF_TICK               1
  92 #define PROF_PREFIX_PROFILE     "profile-"
  93 #define PROF_PREFIX_TICK        "tick-"
  94 
  95 typedef struct profile_probe {
  96         char            prof_name[PROF_NAMELEN];
  97         dtrace_id_t     prof_id;
  98         int             prof_kind;
  99         hrtime_t        prof_interval;
 100         cyclic_id_t     prof_cyclic;
 101 } profile_probe_t;
 102 
 103 typedef struct profile_probe_percpu {
 104         hrtime_t        profc_expected;
 105         hrtime_t        profc_interval;
 106         profile_probe_t *profc_probe;
 107 } profile_probe_percpu_t;
 108 
 109 hrtime_t        profile_interval_min = NANOSEC / 5000;          /* 5000 hz */
 110 int             profile_aframes = 0;                            /* override */
 111 
 112 static int profile_rates[] = {
 113     97, 199, 499, 997, 1999,
 114     4001, 4999, 0, 0, 0,
 115     0, 0, 0, 0, 0,
 116     0, 0, 0, 0, 0
 117 };
 118 
 119 static int profile_ticks[] = {
 120     1, 10, 100, 500, 1000,
 121     5000, 0, 0, 0, 0,
 122     0, 0, 0, 0, 0
 123 };
 124 
 125 /*
 126  * profile_max defines the upper bound on the number of profile probes that
 * can exist (this is to prevent malicious or clumsy users from exhausting
 128  * system resources by creating a slew of profile probes). At mod load time,
 129  * this gets its value from PROFILE_MAX_DEFAULT or profile-max-probes if it's
 130  * present in the profile.conf file.
 131  */
 132 #define PROFILE_MAX_DEFAULT     1000    /* default max. number of probes */
 133 static uint32_t profile_max;            /* maximum number of profile probes */
 134 static uint32_t profile_total;  /* current number of profile probes */
 135 
 136 static void
 137 profile_fire(void *arg)
 138 {
 139         profile_probe_percpu_t *pcpu = arg;
 140         profile_probe_t *prof = pcpu->profc_probe;
 141         hrtime_t late;
 142 
 143         late = dtrace_gethrtime() - pcpu->profc_expected;
 144         pcpu->profc_expected += pcpu->profc_interval;
 145 
 146         dtrace_probe(prof->prof_id, CPU->cpu_profile_pc,
 147             CPU->cpu_profile_upc, late, 0, 0);
 148 }
 149 
 150 static void
 151 profile_tick(void *arg)
 152 {
 153         profile_probe_t *prof = arg;
 154 
 155         dtrace_probe(prof->prof_id, CPU->cpu_profile_pc,
 156             CPU->cpu_profile_upc, 0, 0, 0);
 157 }
 158 
 159 static void
 160 profile_create(hrtime_t interval, const char *name, int kind)
 161 {
 162         profile_probe_t *prof;
 163         int nr_frames = PROF_ARTIFICIAL_FRAMES + dtrace_mach_aframes();
 164 
 165         if (profile_aframes)
 166                 nr_frames = profile_aframes;
 167 
 168         if (interval < profile_interval_min)
 169                 return;
 170 
 171         if (dtrace_probe_lookup(profile_id, NULL, NULL, name) != 0)
 172                 return;
 173 
 174         atomic_inc_32(&profile_total);
 175         if (profile_total > profile_max) {
 176                 atomic_dec_32(&profile_total);
 177                 return;
 178         }
 179 
 180         prof = kmem_zalloc(sizeof (profile_probe_t), KM_SLEEP);
 181         (void) strcpy(prof->prof_name, name);
 182         prof->prof_interval = interval;
 183         prof->prof_cyclic = CYCLIC_NONE;
 184         prof->prof_kind = kind;
 185         prof->prof_id = dtrace_probe_create(profile_id,
 186             NULL, NULL, name, nr_frames, prof);
 187 }
 188 
 189 /*ARGSUSED*/
 190 static void
 191 profile_provide(void *arg, const dtrace_probedesc_t *desc)
 192 {
 193         int i, j, rate, kind;
 194         hrtime_t val = 0, mult = 1, len;
 195         const char *name, *suffix = NULL;
 196 
 197         const struct {
 198                 char *prefix;
 199                 int kind;
 200         } types[] = {
 201                 { PROF_PREFIX_PROFILE, PROF_PROFILE },
 202                 { PROF_PREFIX_TICK, PROF_TICK },
 203                 { NULL, NULL }
 204         };
 205 
 206         const struct {
 207                 char *name;
 208                 hrtime_t mult;
 209         } suffixes[] = {
 210                 { "ns",         NANOSEC / NANOSEC },
 211                 { "nsec",       NANOSEC / NANOSEC },
 212                 { "us",         NANOSEC / MICROSEC },
 213                 { "usec",       NANOSEC / MICROSEC },
 214                 { "ms",         NANOSEC / MILLISEC },
 215                 { "msec",       NANOSEC / MILLISEC },
 216                 { "s",          NANOSEC / SEC },
 217                 { "sec",        NANOSEC / SEC },
 218                 { "m",          NANOSEC * (hrtime_t)60 },
 219                 { "min",        NANOSEC * (hrtime_t)60 },
 220                 { "h",          NANOSEC * (hrtime_t)(60 * 60) },
 221                 { "hour",       NANOSEC * (hrtime_t)(60 * 60) },
 222                 { "d",          NANOSEC * (hrtime_t)(24 * 60 * 60) },
 223                 { "day",        NANOSEC * (hrtime_t)(24 * 60 * 60) },
 224                 { "hz",         0 },
 225                 { NULL }
 226         };
 227 
 228         if (desc == NULL) {
 229                 char n[PROF_NAMELEN];
 230 
 231                 /*
 232                  * If no description was provided, provide all of our probes.
 233                  */
 234                 for (i = 0; i < sizeof (profile_rates) / sizeof (int); i++) {
 235                         if ((rate = profile_rates[i]) == 0)
 236                                 continue;
 237 
 238                         (void) snprintf(n, PROF_NAMELEN, "%s%d",
 239                             PROF_PREFIX_PROFILE, rate);
 240                         profile_create(NANOSEC / rate, n, PROF_PROFILE);
 241                 }
 242 
 243                 for (i = 0; i < sizeof (profile_ticks) / sizeof (int); i++) {
 244                         if ((rate = profile_ticks[i]) == 0)
 245                                 continue;
 246 
 247                         (void) snprintf(n, PROF_NAMELEN, "%s%d",
 248                             PROF_PREFIX_TICK, rate);
 249                         profile_create(NANOSEC / rate, n, PROF_TICK);
 250                 }
 251 
 252                 return;
 253         }
 254 
 255         name = desc->dtpd_name;
 256 
 257         for (i = 0; types[i].prefix != NULL; i++) {
 258                 len = strlen(types[i].prefix);
 259 
 260                 if (strncmp(name, types[i].prefix, len) != 0)
 261                         continue;
 262                 break;
 263         }
 264 
 265         if (types[i].prefix == NULL)
 266                 return;
 267 
 268         kind = types[i].kind;
 269         j = strlen(name) - len;
 270 
 271         /*
 272          * We need to start before any time suffix.
 273          */
 274         for (j = strlen(name); j >= len; j--) {
 275                 if (name[j] >= '0' && name[j] <= '9')
 276                         break;
 277                 suffix = &name[j];
 278         }
 279 
 280         ASSERT(suffix != NULL);
 281 
 282         /*
 283          * Now determine the numerical value present in the probe name.
 284          */
 285         for (; j >= len; j--) {
 286                 if (name[j] < '0' || name[j] > '9')
 287                         return;
 288 
 289                 val += (name[j] - '0') * mult;
 290                 mult *= (hrtime_t)10;
 291         }
 292 
 293         if (val == 0)
 294                 return;
 295 
 296         /*
 297          * Look-up the suffix to determine the multiplier.
 298          */
 299         for (i = 0, mult = 0; suffixes[i].name != NULL; i++) {
 300                 if (strcasecmp(suffixes[i].name, suffix) == 0) {
 301                         mult = suffixes[i].mult;
 302                         break;
 303                 }
 304         }
 305 
 306         if (suffixes[i].name == NULL && *suffix != '\0')
 307                 return;
 308 
 309         if (mult == 0) {
 310                 /*
 311                  * The default is frequency-per-second.
 312                  */
 313                 val = NANOSEC / val;
 314         } else {
 315                 val *= mult;
 316         }
 317 
 318         profile_create(val, name, kind);
 319 }
 320 
 321 /*ARGSUSED*/
 322 static void
 323 profile_destroy(void *arg, dtrace_id_t id, void *parg)
 324 {
 325         profile_probe_t *prof = parg;
 326 
 327         ASSERT(prof->prof_cyclic == CYCLIC_NONE);
 328         kmem_free(prof, sizeof (profile_probe_t));
 329 
 330         ASSERT(profile_total >= 1);
 331         atomic_dec_32(&profile_total);
 332 }
 333 
 334 /*ARGSUSED*/
 335 static void
 336 profile_online(void *arg, cpu_t *cpu, cyc_handler_t *hdlr, cyc_time_t *when)
 337 {
 338         profile_probe_t *prof = arg;
 339         profile_probe_percpu_t *pcpu;
 340 
 341         pcpu = kmem_zalloc(sizeof (profile_probe_percpu_t), KM_SLEEP);
 342         pcpu->profc_probe = prof;
 343 
 344         hdlr->cyh_func = profile_fire;
 345         hdlr->cyh_arg = pcpu;
 346         hdlr->cyh_level = CY_HIGH_LEVEL;
 347 
 348         when->cyt_interval = prof->prof_interval;
 349         when->cyt_when = dtrace_gethrtime() + when->cyt_interval;
 350 
 351         pcpu->profc_expected = when->cyt_when;
 352         pcpu->profc_interval = when->cyt_interval;
 353 }
 354 
 355 /*ARGSUSED*/
 356 static void
 357 profile_offline(void *arg, cpu_t *cpu, void *oarg)
 358 {
 359         profile_probe_percpu_t *pcpu = oarg;
 360 
 361         ASSERT(pcpu->profc_probe == arg);
 362         kmem_free(pcpu, sizeof (profile_probe_percpu_t));
 363 }
 364 
 365 /*ARGSUSED*/
 366 static int
 367 profile_enable(void *arg, dtrace_id_t id, void *parg)
 368 {
 369         profile_probe_t *prof = parg;
 370         cyc_omni_handler_t omni;
 371         cyc_handler_t hdlr;
 372         cyc_time_t when;
 373 
 374         ASSERT(prof->prof_interval != 0);
 375         ASSERT(MUTEX_HELD(&cpu_lock));
 376 
 377         if (prof->prof_kind == PROF_TICK) {
 378                 hdlr.cyh_func = profile_tick;
 379                 hdlr.cyh_arg = prof;
 380                 hdlr.cyh_level = CY_HIGH_LEVEL;
 381 
 382                 when.cyt_interval = prof->prof_interval;
 383                 when.cyt_when = dtrace_gethrtime() + when.cyt_interval;
 384         } else {
 385                 ASSERT(prof->prof_kind == PROF_PROFILE);
 386                 omni.cyo_online = profile_online;
 387                 omni.cyo_offline = profile_offline;
 388                 omni.cyo_arg = prof;
 389         }
 390 
 391         if (prof->prof_kind == PROF_TICK) {
 392                 prof->prof_cyclic = cyclic_add(&hdlr, &when);
 393         } else {
 394                 prof->prof_cyclic = cyclic_add_omni(&omni);
 395         }
 396         return (0);
 397 }
 398 
 399 /*ARGSUSED*/
 400 static void
 401 profile_disable(void *arg, dtrace_id_t id, void *parg)
 402 {
 403         profile_probe_t *prof = parg;
 404 
 405         ASSERT(prof->prof_cyclic != CYCLIC_NONE);
 406         ASSERT(MUTEX_HELD(&cpu_lock));
 407 
 408         cyclic_remove(prof->prof_cyclic);
 409         prof->prof_cyclic = CYCLIC_NONE;
 410 }
 411 
 412 /*ARGSUSED*/
 413 static int
 414 profile_mode(void *arg, dtrace_id_t id, void *parg)
 415 {
 416         profile_probe_t *prof = parg;
 417         int mode;
 418 
 419         if (CPU->cpu_profile_pc != 0) {
 420                 mode = DTRACE_MODE_KERNEL;
 421         } else {
 422                 mode = DTRACE_MODE_USER;
 423         }
 424 
 425         if (prof->prof_kind == PROF_TICK) {
 426                 mode |= DTRACE_MODE_NOPRIV_RESTRICT;
 427         } else {
 428                 ASSERT(prof->prof_kind == PROF_PROFILE);
 429                 mode |= DTRACE_MODE_NOPRIV_DROP;
 430         }
 431 
 432         return (mode);
 433 }
 434 
 435 static dtrace_pattr_t profile_attr = {
 436 { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
 437 { DTRACE_STABILITY_UNSTABLE, DTRACE_STABILITY_UNSTABLE, DTRACE_CLASS_UNKNOWN },
 438 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
 439 { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
 440 { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
 441 };
 442 
 443 static dtrace_pops_t profile_pops = {
 444         profile_provide,
 445         NULL,
 446         profile_enable,
 447         profile_disable,
 448         NULL,
 449         NULL,
 450         NULL,
 451         NULL,
 452         profile_mode,
 453         profile_destroy
 454 };
 455 
 456 static int
 457 profile_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
 458 {
 459         switch (cmd) {
 460         case DDI_ATTACH:
 461                 break;
 462         case DDI_RESUME:
 463                 return (DDI_SUCCESS);
 464         default:
 465                 return (DDI_FAILURE);
 466         }
 467 
 468         if (ddi_create_minor_node(devi, "profile", S_IFCHR, 0,
 469             DDI_PSEUDO, NULL) == DDI_FAILURE ||
 470             dtrace_register("profile", &profile_attr,
 471             DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER, NULL,
 472             &profile_pops, NULL, &profile_id) != 0) {
 473                 ddi_remove_minor_node(devi, NULL);
 474                 return (DDI_FAILURE);
 475         }
 476 
 477         profile_max = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
 478             "profile-max-probes", PROFILE_MAX_DEFAULT);
 479 
 480         ddi_report_dev(devi);
 481         profile_devi = devi;
 482         return (DDI_SUCCESS);
 483 }
 484 
 485 static int
 486 profile_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
 487 {
 488         switch (cmd) {
 489         case DDI_DETACH:
 490                 break;
 491         case DDI_SUSPEND:
 492                 return (DDI_SUCCESS);
 493         default:
 494                 return (DDI_FAILURE);
 495         }
 496 
 497         if (dtrace_unregister(profile_id) != 0)
 498                 return (DDI_FAILURE);
 499 
 500         ddi_remove_minor_node(devi, NULL);
 501         return (DDI_SUCCESS);
 502 }
 503 
 504 /*ARGSUSED*/
 505 static int
 506 profile_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
 507 {
 508         int error;
 509 
 510         switch (infocmd) {
 511         case DDI_INFO_DEVT2DEVINFO:
 512                 *result = (void *)profile_devi;
 513                 error = DDI_SUCCESS;
 514                 break;
 515         case DDI_INFO_DEVT2INSTANCE:
 516                 *result = (void *)0;
 517                 error = DDI_SUCCESS;
 518                 break;
 519         default:
 520                 error = DDI_FAILURE;
 521         }
 522         return (error);
 523 }
 524 
 525 /*ARGSUSED*/
 526 static int
 527 profile_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
 528 {
 529         return (0);
 530 }
 531 
 532 static struct cb_ops profile_cb_ops = {
 533         profile_open,           /* open */
 534         nodev,                  /* close */
 535         nulldev,                /* strategy */
 536         nulldev,                /* print */
 537         nodev,                  /* dump */
 538         nodev,                  /* read */
 539         nodev,                  /* write */
 540         nodev,                  /* ioctl */
 541         nodev,                  /* devmap */
 542         nodev,                  /* mmap */
 543         nodev,                  /* segmap */
 544         nochpoll,               /* poll */
 545         ddi_prop_op,            /* cb_prop_op */
 546         0,                      /* streamtab  */
 547         D_NEW | D_MP            /* Driver compatibility flag */
 548 };
 549 
 550 static struct dev_ops profile_ops = {
 551         DEVO_REV,               /* devo_rev, */
 552         0,                      /* refcnt  */
 553         profile_info,           /* get_dev_info */
 554         nulldev,                /* identify */
 555         nulldev,                /* probe */
 556         profile_attach,         /* attach */
 557         profile_detach,         /* detach */
 558         nodev,                  /* reset */
 559         &profile_cb_ops,    /* driver operations */
 560         NULL,                   /* bus operations */
 561         nodev,                  /* dev power */
 562         ddi_quiesce_not_needed,         /* quiesce */
 563 };
 564 
 565 /*
 566  * Module linkage information for the kernel.
 567  */
 568 static struct modldrv modldrv = {
 569         &mod_driverops,             /* module type (this is a pseudo driver) */
 570         "Profile Interrupt Tracing",    /* name of module */
 571         &profile_ops,               /* driver ops */
 572 };
 573 
 574 static struct modlinkage modlinkage = {
 575         MODREV_1,
 576         { (void *)&modldrv, NULL }
 577 };
 578 
 579 int
 580 _init(void)
 581 {
 582         return (mod_install(&modlinkage));
 583 }
 584 
 585 int
 586 _info(struct modinfo *modinfop)
 587 {
 588         return (mod_info(&modlinkage, modinfop));
 589 }
 590 
 591 int
 592 _fini(void)
 593 {
 594         return (mod_remove(&modlinkage));
 595 }