1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 #include <sys/errno.h>
  28 #include <sys/cpuvar.h>
  29 #include <sys/stat.h>
  30 #include <sys/modctl.h>
  31 #include <sys/cmn_err.h>
  32 #include <sys/ddi.h>
  33 #include <sys/sunddi.h>
  34 #include <sys/ksynch.h>
  35 #include <sys/conf.h>
  36 #include <sys/kmem.h>
  37 #include <sys/kcpc.h>
  38 #include <sys/cap_util.h>
  39 #include <sys/cpc_pcbe.h>
  40 #include <sys/cpc_impl.h>
  41 #include <sys/dtrace_impl.h>
  42 
  43 /*
  44  * DTrace CPU Performance Counter Provider
  45  * ---------------------------------------
  46  *
  47  * The DTrace cpc provider allows DTrace consumers to access the CPU
  48  * performance counter overflow mechanism of a CPU. The configuration
  49  * presented in a probe specification is programmed into the performance
  50  * counter hardware of all available CPUs on a system. Programming the
  51  * hardware causes a counter on each CPU to begin counting events of the
  52  * given type. When the specified number of events has occurred, an overflow
  53  * interrupt is generated and the probe fires.
  54  *
  55  * The required configuration for the performance counter is encoded into
  56  * the probe specification and this includes the performance counter event
  57  * name, processor mode, overflow rate and an optional unit mask.
  58  *
  59  * Most processors provide several counters (PICs) which can count all or a
  60  * subset of the events available for a given CPU. However, when overflow
  61  * profiling is being used, not all CPUs can detect which counter generated the
  62  * overflow interrupt. In this case we cannot reliably determine which counter
  63  * overflowed and we therefore only allow such CPUs to configure one event at
  64  * a time. Processors that can determine the counter which overflowed are
  65  * allowed to program as many events at one time as possible (in theory up to
  66  * the number of instrumentation counters supported by that platform).
  67  * Therefore, multiple consumers can enable multiple probes at the same time
  68  * on such platforms. Platforms which cannot determine the source of an
  69  * overflow interrupt are only allowed to program a single event at one time.
  70  *
  71  * The performance counter hardware is made available to consumers on a
  72  * first-come, first-served basis. Only a finite amount of hardware resource
  73  * is available and, while we make every attempt to accommodate requests from
  74  * consumers, we must deny requests when hardware resources have been exhausted.
  75  * A consumer will fail to enable probes when resources are currently in use.
  76  *
  77  * The cpc provider contends for shared hardware resources along with other
  78  * consumers of the kernel CPU performance counter subsystem (e.g. cpustat(1M)).
  79  * Only one such consumer can use the performance counters at any one time and
  80  * counters are made available on a first-come, first-served basis. As with
  81  * cpustat, the cpc provider has priority over per-LWP libcpc usage (e.g.
  82  * cputrack(1)). Invoking the cpc provider will cause all existing per-LWP
  83  * counter contexts to be invalidated.
  84  */
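     /*
      * As an illustration (event names vary by platform and this one is
      * given only as an example), a consumer might enable a probe such as:
      *
      *      dtrace -n 'cpc:::PAPI_tot_ins-user-10000 { @[cpu] = count(); }'
      *
      * which asks each CPU to count user-mode instructions on an available
      * counter and to fire the probe once every 10000 such events, subject
      * to the resource constraints described above.
      */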
  85 
  86 typedef struct dcpc_probe {
  87         char            dcpc_event_name[CPC_MAX_EVENT_LEN];
  88         int             dcpc_flag;      /* flags (USER/SYS) */
  89         uint32_t        dcpc_ovfval;    /* overflow value */
  90         int64_t         dcpc_umask;     /* umask/emask for this event */
  91         int             dcpc_picno;     /* pic this event is programmed in */
  92         int             dcpc_enabled;   /* probe is actually enabled? */
  93         int             dcpc_disabling; /* probe is currently being disabled */
  94         dtrace_id_t     dcpc_id;        /* probeid this request is enabling */
  95         int             dcpc_actv_req_idx;      /* idx into dcpc_actv_reqs[] */
  96 } dcpc_probe_t;
  97 
  98 static dev_info_t                       *dcpc_devi;
  99 static dtrace_provider_id_t             dcpc_pid;
 100 static dcpc_probe_t                     **dcpc_actv_reqs;
 101 static uint32_t                         dcpc_enablings = 0;
 102 static int                              dcpc_ovf_mask = 0;
 103 static int                              dcpc_mult_ovf_cap = 0;
 104 static int                              dcpc_mask_type = 0;
 105 
 106 /*
 107  * When the dcpc provider is loaded, dcpc_min_overflow is set to either
 108  * DCPC_MIN_OVF_DEFAULT or the value that dcpc-min-overflow is set to in
 109  * the dcpc.conf file. Decrease this value to permit probes with smaller
 110  * overflow values. Remember that very small values could render a system
 111  * unusable with frequently occurring events.
 112  */
 113 #define DCPC_MIN_OVF_DEFAULT            5000
 114 static uint32_t                         dcpc_min_overflow;
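     /*
      * A hypothetical dcpc.conf entry, using the usual driver.conf(4)
      * name=value property syntax, that lowers the floor to 1000 events
      * per overflow:
      *
      *      dcpc-min-overflow=1000;
      */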
 115 
 116 static int dcpc_aframes = 0;    /* override for artificial frame setting */
 117 #if defined(__x86)
 118 #define DCPC_ARTIFICIAL_FRAMES  8
 119 #elif defined(__sparc)
 120 #define DCPC_ARTIFICIAL_FRAMES  2
 121 #endif
 122 
 123 /*
 124  * Called from the platform overflow interrupt handler. 'bitmap' is a mask
 125  * which contains the pic(s) that have overflowed.
 126  */
 127 static void
 128 dcpc_fire(uint64_t bitmap)
 129 {
 130         int i;
 131 
 132         /*
 133          * No counter was marked as overflowing. Shout about it and get out.
 134          */
 135         if ((bitmap & dcpc_ovf_mask) == 0) {
 136                 cmn_err(CE_NOTE, "dcpc_fire: no counter overflow found\n");
 137                 return;
 138         }
 139 
 140         /*
 141          * This is the common case of a processor that doesn't support
 142          * multiple overflow events. Such systems are only allowed a single
 143          * enabling and therefore we just look for the first entry in
 144          * the active request array.
 145          */
 146         if (!dcpc_mult_ovf_cap) {
 147                 for (i = 0; i < cpc_ncounters; i++) {
 148                         if (dcpc_actv_reqs[i] != NULL) {
 149                                 dtrace_probe(dcpc_actv_reqs[i]->dcpc_id,
 150                                     CPU->cpu_cpcprofile_pc,
 151                                     CPU->cpu_cpcprofile_upc, 0, 0, 0);
 152                                 return;
 153                         }
 154                 }
 155                 return;
 156         }
 157 
 158         /*
 159          * This is a processor capable of handling multiple overflow events.
 160          * Iterate over the array of active requests and locate the counters
 161          * that overflowed (note: it is possible for more than one counter to
 162          * have overflowed at the same time).
 163          */
 164         for (i = 0; i < cpc_ncounters; i++) {
 165                 if (dcpc_actv_reqs[i] != NULL &&
 166                     (bitmap & (1ULL << dcpc_actv_reqs[i]->dcpc_picno))) {
 167                         dtrace_probe(dcpc_actv_reqs[i]->dcpc_id,
 168                             CPU->cpu_cpcprofile_pc,
 169                             CPU->cpu_cpcprofile_upc, 0, 0, 0);
 170                 }
 171         }
 172 }
 173 
 174 static void
 175 dcpc_create_probe(dtrace_provider_id_t id, const char *probename,
 176     char *eventname, int64_t umask, uint32_t ovfval, char flag)
 177 {
 178         dcpc_probe_t *pp;
 179         int nr_frames = DCPC_ARTIFICIAL_FRAMES + dtrace_mach_aframes();
 180 
 181         if (dcpc_aframes)
 182                 nr_frames = dcpc_aframes;
 183 
 184         if (dtrace_probe_lookup(id, NULL, NULL, probename) != 0)
 185                 return;
 186 
 187         pp = kmem_zalloc(sizeof (dcpc_probe_t), KM_SLEEP);
 188         (void) strncpy(pp->dcpc_event_name, eventname,
 189             sizeof (pp->dcpc_event_name) - 1);
 190         pp->dcpc_event_name[sizeof (pp->dcpc_event_name) - 1] = '\0';
 191         pp->dcpc_flag = flag | CPC_OVF_NOTIFY_EMT;
 192         pp->dcpc_ovfval = ovfval;
 193         pp->dcpc_umask = umask;
 194         pp->dcpc_actv_req_idx = pp->dcpc_picno = pp->dcpc_disabling = -1;
 195 
 196         pp->dcpc_id = dtrace_probe_create(id, NULL, NULL, probename,
 197             nr_frames, pp);
 198 }
 199 
 200 /*ARGSUSED*/
 201 static void
 202 dcpc_provide(void *arg, const dtrace_probedesc_t *desc)
 203 {
 204         /*
 205          * The format of a probe is:
 206          *
 207          *      event_name-mode-{optional_umask}-overflow_rate
 208          * e.g.
 209          *      DC_refill_from_system-user-0x1e-50000, or,
 210          *      DC_refill_from_system-all-10000
 211          *
 212          */
 213         char *str, *end, *p;
 214         int i, flag = 0;
 215         char event[CPC_MAX_EVENT_LEN];
 216         long umask = -1, val = 0;
 217         size_t evlen, len;
 218 
 219         /*
 220          * The 'cpc' provider offers no probes by default.
 221          */
 222         if (desc == NULL)
 223                 return;
 224 
 225         len = strlen(desc->dtpd_name);
 226         p = str = kmem_alloc(len + 1, KM_SLEEP);
 227         (void) strcpy(str, desc->dtpd_name);
 228 
 229         /*
 230          * We have a poor man's strtok() going on here. Replace any hyphens
 231  * in the probe name with NULL characters in order to make it
 232          * easy to parse the string with regular string functions.
 233          */
 234         for (i = 0; i < len; i++) {
 235                 if (str[i] == '-')
 236                         str[i] = '\0';
 237         }
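             /*
              * At this point a probe name such as
              * "DC_refill_from_system-user-0x1e-50000" (from the example
              * above) has become the packed token sequence
              * "DC_refill_from_system\0user\00x1e\050000", which is consumed
              * below one NULL-terminated token at a time.
              */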
 238 
 239         /*
 240          * The first part of the string must be either a platform event
 241          * name or a generic event name.
 242          */
 243         evlen = strlen(p);
 244         (void) strncpy(event, p, CPC_MAX_EVENT_LEN - 1);
 245         event[CPC_MAX_EVENT_LEN - 1] = '\0';
 246 
 247         /*
 248          * The next part of the name is the mode specification. Valid
 249          * settings are "user", "kernel" or "all".
 250          */
 251         p += evlen + 1;
 252 
 253         if (strcmp(p, "user") == 0)
 254                 flag |= CPC_COUNT_USER;
 255         else if (strcmp(p, "kernel") == 0)
 256                 flag |= CPC_COUNT_SYSTEM;
 257         else if (strcmp(p, "all") == 0)
 258                 flag |= CPC_COUNT_USER | CPC_COUNT_SYSTEM;
 259         else
 260                 goto err;
 261 
 262         /*
 263          * Next we either have a mask specification followed by an overflow
 264          * rate or just an overflow rate on its own.
 265          */
 266         p += strlen(p) + 1;
 267         if (p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) {
 268                 /*
 269                  * A unit mask can only be specified if:
 270                  * 1) this performance counter back end supports masks.
 271                  * 2) the specified event is platform specific.
 272                  * 3) a valid hex number is converted.
 273                  * 4) no extraneous characters follow the mask specification.
 274                  */
 275                 if (dcpc_mask_type != 0 && strncmp(event, "PAPI", 4) != 0 &&
 276                     ddi_strtol(p, &end, 16, &umask) == 0 &&
 277                     end == p + strlen(p)) {
 278                         p += strlen(p) + 1;
 279                 } else {
 280                         goto err;
 281                 }
 282         }
 283 
 284         /*
 285          * This final part must be an overflow value which has to be at least
 286          * the minimum permissible overflow rate.
 287          */
 288         if ((ddi_strtol(p, &end, 10, &val) != 0) || end != p + strlen(p) ||
 289             val < dcpc_min_overflow)
 290                 goto err;
 291 
 292         /*
 293          * Validate the event and create the probe.
 294          */
 295         for (i = 0; i < cpc_ncounters; i++) {
 296                 char *events, *cp, *p, *end;
 297                 int found = 0, j;
 298                 size_t llen;
 299 
 300                 if ((events = kcpc_list_events(i)) == NULL)
 301                         goto err;
 302 
 303                 llen = strlen(events);
 304                 if ((p = cp = ddi_strdup(events, KM_NOSLEEP)) == NULL)
                             goto err;
 305                 end = cp + llen;
 306 
 307                 for (j = 0; j < llen; j++) {
 308                         if (cp[j] == ',')
 309                                 cp[j] = '\0';
 310                 }
 311 
 312                 while (p < end && found == 0) {
 313                         if (strcmp(p, event) == 0) {
 314                                 dcpc_create_probe(dcpc_pid, desc->dtpd_name,
 315                                     event, umask, (uint32_t)val, flag);
 316                                 found = 1;
 317                         }
 318                         p += strlen(p) + 1;
 319                 }
 320                 kmem_free(cp, llen + 1);
 321 
 322                 if (found)
 323                         break;
 324         }
 325 
 326 err:
 327         kmem_free(str, len + 1);
 328 }
 329 
 330 /*ARGSUSED*/
 331 static void
 332 dcpc_destroy(void *arg, dtrace_id_t id, void *parg)
 333 {
 334         dcpc_probe_t *pp = parg;
 335 
 336         ASSERT(pp->dcpc_enabled == 0);
 337         kmem_free(pp, sizeof (dcpc_probe_t));
 338 }
 339 
 340 /*ARGSUSED*/
 341 static int
 342 dcpc_mode(void *arg, dtrace_id_t id, void *parg)
 343 {
 344         if (CPU->cpu_cpcprofile_pc == 0) {
 345                 return (DTRACE_MODE_NOPRIV_DROP | DTRACE_MODE_USER);
 346         } else {
 347                 return (DTRACE_MODE_NOPRIV_DROP | DTRACE_MODE_KERNEL);
 348         }
 349 }
 350 
 351 static void
 352 dcpc_populate_set(cpu_t *c, dcpc_probe_t *pp, kcpc_set_t *set, int reqno)
 353 {
 354         kcpc_set_t *oset;
 355         int i;
 356 
 357         (void) strncpy(set->ks_req[reqno].kr_event, pp->dcpc_event_name,
 358             CPC_MAX_EVENT_LEN);
 359         set->ks_req[reqno].kr_config = NULL;
 360         set->ks_req[reqno].kr_index = reqno;
 361         set->ks_req[reqno].kr_picnum = -1;
 362         set->ks_req[reqno].kr_flags =  pp->dcpc_flag;
 363 
 364         /*
 365          * If a unit mask has been specified then detect which attribute
 366          * the platform needs. For now, it's either "umask" or "emask".
 367          */
 368         if (pp->dcpc_umask >= 0) {
 369                 set->ks_req[reqno].kr_attr =
 370                     kmem_zalloc(sizeof (kcpc_attr_t), KM_SLEEP);
 371                 set->ks_req[reqno].kr_nattrs = 1;
 372                 if (dcpc_mask_type & DCPC_UMASK)
 373                         (void) strncpy(set->ks_req[reqno].kr_attr->ka_name,
 374                             "umask", 5);
 375                 else
 376                         (void) strncpy(set->ks_req[reqno].kr_attr->ka_name,
 377                             "emask", 5);
 378                 set->ks_req[reqno].kr_attr->ka_val = pp->dcpc_umask;
 379         } else {
 380                 set->ks_req[reqno].kr_attr = NULL;
 381                 set->ks_req[reqno].kr_nattrs = 0;
 382         }
 383 
 384         /*
 385          * If this probe is enabled, obtain its current countdown value
 386          * and use that. The CPU's cpc context might not exist yet if we
 387          * are dealing with a CPU that is just coming online.
 388          */
 389         if (pp->dcpc_enabled && (c->cpu_cpc_ctx != NULL)) {
 390                 oset = c->cpu_cpc_ctx->kc_set;
 391 
 392                 for (i = 0; i < oset->ks_nreqs; i++) {
 393                         if (strcmp(oset->ks_req[i].kr_event,
 394                             set->ks_req[reqno].kr_event) == 0) {
 395                                 set->ks_req[reqno].kr_preset =
 396                                     *(oset->ks_req[i].kr_data);
 397                         }
 398                 }
 399         } else {
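                     /*
                      * The counter counts upward and the overflow interrupt is
                      * raised when it wraps, so presetting it to UINT64_MAX
                      * minus the requested rate should make it overflow after
                      * (approximately) dcpc_ovfval events.
                      */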
 400                 set->ks_req[reqno].kr_preset = UINT64_MAX - pp->dcpc_ovfval;
 401         }
 402 
 403         set->ks_nreqs++;
 404 }
 405 
 406 
 407 /*
 408  * Create a fresh request set for the enablings represented in the
 409  * 'dcpc_actv_reqs' array which contains the probes we want to be
 410  * in the set. This can be called for several reasons:
 411  *
 412  * 1)   We are on a single or multi overflow platform and we have no
 413  *      current events so we can just create the set and initialize it.
 414  * 2)   We are on a multi-overflow platform and we already have one or
 415  *      more existing events and we are adding a new enabling. Create a
 416  *      new set and copy old requests in and then add the new request.
 417  * 3)   We are on a multi-overflow platform and we have just removed an
 418  *      enabling but we still have enablings which are valid. Create a new
 419  *      set and copy in still valid requests.
 420  */
 421 static kcpc_set_t *
 422 dcpc_create_set(cpu_t *c)
 423 {
 424         int i, reqno = 0;
 425         int active_requests = 0;
 426         kcpc_set_t *set;
 427 
 428         /*
 429          * First get a count of the number of currently active requests.
 430          * Note that dcpc_actv_reqs[] should always reflect which requests
 431          * we want to be in the set that is to be created. It is the
 432          * responsibility of the caller of dcpc_create_set() to adjust that
 433          * array accordingly beforehand.
 434          */
 435         for (i = 0; i < cpc_ncounters; i++) {
 436                 if (dcpc_actv_reqs[i] != NULL)
 437                         active_requests++;
 438         }
 439 
 440         set = kmem_zalloc(sizeof (kcpc_set_t), KM_SLEEP);
 441 
 442         set->ks_req =
 443             kmem_zalloc(sizeof (kcpc_request_t) * active_requests, KM_SLEEP);
 444 
 445         set->ks_data =
 446             kmem_zalloc(active_requests * sizeof (uint64_t), KM_SLEEP);
 447 
 448         /*
 449          * Look for valid entries in the active requests array and populate
 450          * the request set for any entries found.
 451          */
 452         for (i = 0; i < cpc_ncounters; i++) {
 453                 if (dcpc_actv_reqs[i] != NULL) {
 454                         dcpc_populate_set(c, dcpc_actv_reqs[i], set, reqno);
 455                         reqno++;
 456                 }
 457         }
 458 
 459         return (set);
 460 }
 461 
 462 static int
 463 dcpc_program_cpu_event(cpu_t *c)
 464 {
 465         int i, j, subcode;
 466         kcpc_ctx_t *ctx, *octx;
 467         kcpc_set_t *set;
 468 
 469         set = dcpc_create_set(c);
 470 
 471         set->ks_ctx = ctx = kcpc_ctx_alloc(KM_SLEEP);
 472         ctx->kc_set = set;
 473         ctx->kc_cpuid = c->cpu_id;
 474 
 475         if (kcpc_assign_reqs(set, ctx) != 0)
 476                 goto err;
 477 
 478         if (kcpc_configure_reqs(ctx, set, &subcode) != 0)
 479                 goto err;
 480 
 481         for (i = 0; i < set->ks_nreqs; i++) {
 482                 for (j = 0; j < cpc_ncounters; j++) {
 483                         if (dcpc_actv_reqs[j] != NULL &&
 484                             strcmp(set->ks_req[i].kr_event,
 485                             dcpc_actv_reqs[j]->dcpc_event_name) == 0) {
 486                                 dcpc_actv_reqs[j]->dcpc_picno =
 487                                     set->ks_req[i].kr_picnum;
 488                         }
 489                 }
 490         }
 491 
 492         /*
 493          * If we already have an active enabling then save the current cpc
 494          * context away.
 495          */
 496         octx = c->cpu_cpc_ctx;
 497 
 498         kcpc_cpu_program(c, ctx);
 499 
 500         if (octx != NULL) {
 501                 kcpc_set_t *oset = octx->kc_set;
 502                 kmem_free(oset->ks_data, oset->ks_nreqs * sizeof (uint64_t));
 503                 kcpc_free_configs(oset);
 504                 kcpc_free_set(oset);
 505                 kcpc_ctx_free(octx);
 506         }
 507 
 508         return (0);
 509 
 510 err:
 511         /*
 512          * We failed to configure this request, so free things up and
 513          * get out.
 514          */
 515         kcpc_free_configs(set);
 516         kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
 517         kcpc_free_set(set);
 518         kcpc_ctx_free(ctx);
 519 
 520         return (-1);
 521 }
 522 
 523 static void
 524 dcpc_disable_cpu(cpu_t *c)
 525 {
 526         kcpc_ctx_t *ctx;
 527         kcpc_set_t *set;
 528 
 529         /*
 530          * Leave this CPU alone if it's already offline.
 531          */
 532         if (c->cpu_flags & CPU_OFFLINE)
 533                 return;
 534 
 535         /*
 536          * Grab the CPU's CPC context before kcpc_cpu_stop() stops counters and
 537          * changes it.
 538          */
 539         ctx = c->cpu_cpc_ctx;
 540 
 541         kcpc_cpu_stop(c, B_FALSE);
 542 
 543         set = ctx->kc_set;
 544 
 545         kcpc_free_configs(set);
 546         kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
 547         kcpc_free_set(set);
 548         kcpc_ctx_free(ctx);
 549 }
 550 
 551 /*
 552  * The dcpc_*_interrupts() routines are responsible for manipulating the
 553  * per-CPU dcpc interrupt state byte. The purpose of the state byte is to
 554  * synchronize processing of hardware overflow interrupts with configuration
 555  * changes made to the CPU performance counter subsystem by the dcpc provider.
 556  *
 557  * The dcpc provider claims ownership of the overflow interrupt mechanism
 558  * by transitioning the state byte from DCPC_INTR_INACTIVE (indicating the
 559  * dcpc provider is not in use) to DCPC_INTR_FREE (the dcpc provider owns the
 560  * overflow mechanism and interrupts may be processed). Before modifying
 561  * a CPU's configuration state, the state byte is transitioned from
 562  * DCPC_INTR_FREE to DCPC_INTR_CONFIG ("configuration in process" state).
 563  * The hardware overflow handler, kcpc_hw_overflow_intr(), will only process
 564  * an interrupt when a configuration is not in process (i.e. the state is
 565  * marked as free). During interrupt processing the state is set to
 566  * DCPC_INTR_PROCESSING by the overflow handler. When the last dcpc based
 567  * enabling is removed, the state byte is set to DCPC_INTR_INACTIVE to indicate
 568  * the dcpc provider is no longer interested in overflow interrupts.
 569  */
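     /*
      * Summarizing the description above (state names are shown without
      * their DCPC_INTR_ prefix for brevity):
      *
      *      INACTIVE -> FREE        dcpc_claim_interrupts(), first enabling
      *      FREE -> CONFIG          dcpc_block_interrupts(), configuration starting
      *      CONFIG -> FREE          dcpc_release_interrupts(), configuration complete
      *      FREE -> PROCESSING      overflow handler, while an interrupt is handled
      *      FREE -> INACTIVE        dcpc_surrender_interrupts(), last enabling removed
      */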
 570 static void
 571 dcpc_block_interrupts(void)
 572 {
 573         cpu_t *c = cpu_list;
 574         uint8_t *state;
 575 
 576         ASSERT(cpu_core[c->cpu_id].cpuc_dcpc_intr_state != DCPC_INTR_INACTIVE);
 577 
 578         do {
 579                 state = &cpu_core[c->cpu_id].cpuc_dcpc_intr_state;
 580 
 581                 while (atomic_cas_8(state, DCPC_INTR_FREE,
 582                     DCPC_INTR_CONFIG) != DCPC_INTR_FREE)
 583                         continue;
 584 
 585         } while ((c = c->cpu_next) != cpu_list);
 586 }
 587 
 588 /*
 589  * Set all CPUs' dcpc interrupt state to DCPC_INTR_FREE to indicate that
 590  * overflow interrupts can be processed safely.
 591  */
 592 static void
 593 dcpc_release_interrupts(void)
 594 {
 595         cpu_t *c = cpu_list;
 596 
 597         ASSERT(cpu_core[c->cpu_id].cpuc_dcpc_intr_state != DCPC_INTR_INACTIVE);
 598 
 599         do {
 600                 cpu_core[c->cpu_id].cpuc_dcpc_intr_state = DCPC_INTR_FREE;
 601                 membar_producer();
 602         } while ((c = c->cpu_next) != cpu_list);
 603 }
 604 
 605 /*
 606  * Transition all CPUs' dcpc interrupt state from DCPC_INTR_INACTIVE
 607  * to DCPC_INTR_FREE. This indicates that the dcpc provider is now
 608  * responsible for handling all overflow interrupt activity. Should only be
 609  * called before enabling the first dcpc based probe.
 610  */
 611 static void
 612 dcpc_claim_interrupts(void)
 613 {
 614         cpu_t *c = cpu_list;
 615 
 616         ASSERT(cpu_core[c->cpu_id].cpuc_dcpc_intr_state == DCPC_INTR_INACTIVE);
 617 
 618         do {
 619                 cpu_core[c->cpu_id].cpuc_dcpc_intr_state = DCPC_INTR_FREE;
 620                 membar_producer();
 621         } while ((c = c->cpu_next) != cpu_list);
 622 }
 623 
 624 /*
 625  * Set all CPUs' dcpc interrupt state to DCPC_INTR_INACTIVE to indicate that
 626  * the dcpc provider is no longer processing overflow interrupts. Only called
 627  * during removal of the last dcpc based enabling.
 628  */
 629 static void
 630 dcpc_surrender_interrupts(void)
 631 {
 632         cpu_t *c = cpu_list;
 633 
 634         ASSERT(cpu_core[c->cpu_id].cpuc_dcpc_intr_state != DCPC_INTR_INACTIVE);
 635 
 636         do {
 637                 cpu_core[c->cpu_id].cpuc_dcpc_intr_state = DCPC_INTR_INACTIVE;
 638                 membar_producer();
 639         } while ((c = c->cpu_next) != cpu_list);
 640 }
 641 
 642 /*
 643  * dcpc_program_event() can be called owing to a new enabling or if a multi
 644  * overflow platform has disabled a request but needs to program the requests
 645  * that are still valid.
 646  *
 647  * Every invocation of dcpc_program_event() will create a new kcpc_ctx_t
 648  * and a new request set which contains the new enabling and any old enablings
 649  * which are still valid (possible with multi-overflow platforms).
 650  */
 651 static int
 652 dcpc_program_event(dcpc_probe_t *pp)
 653 {
 654         cpu_t *c;
 655         int ret = 0;
 656 
 657         ASSERT(MUTEX_HELD(&cpu_lock));
 658 
 659         kpreempt_disable();
 660 
 661         dcpc_block_interrupts();
 662 
 663         c = cpu_list;
 664 
 665         do {
 666                 /*
 667                  * Skip CPUs that are currently offline.
 668                  */
 669                 if (c->cpu_flags & CPU_OFFLINE)
 670                         continue;
 671 
 672                 /*
 673                  * Stop counters but preserve existing DTrace CPC context
 674                  * if there is one.
 675                  *
 676                  * If we come here when the first event is programmed for a CPU,
 677                  * there should be no DTrace CPC context installed. In this
 678                  * case, kcpc_cpu_stop() will ensure that there is no other
 679                  * context on the CPU.
 680                  *
 681                  * If we add a new enabling to the original one, the CPU should
 682                  * have the old DTrace CPC context which we need to keep around
 683                  * since dcpc_program_event() will add to it.
 684                  */
 685                 if (c->cpu_cpc_ctx != NULL)
 686                         kcpc_cpu_stop(c, B_TRUE);
 687         } while ((c = c->cpu_next) != cpu_list);
 688 
 689         dcpc_release_interrupts();
 690 
 691         /*
 692          * If this enabling is being removed (in the case of a multi event
 693          * capable system with more than one active enabling), we can now
 694          * update the active request array to reflect the enablings that need
 695          * to be reprogrammed.
 696          */
 697         if (pp->dcpc_disabling == 1)
 698                 dcpc_actv_reqs[pp->dcpc_actv_req_idx] = NULL;
 699 
 700         do {
 701                 /*
 702                  * Skip CPUs that are currently offline.
 703                  */
 704                 if (c->cpu_flags & CPU_OFFLINE)
 705                         continue;
 706 
 707                 ret = dcpc_program_cpu_event(c);
 708         } while ((c = c->cpu_next) != cpu_list && ret == 0);
 709 
 710         /*
 711          * If dcpc_program_cpu_event() fails then it is because we couldn't
 712          * configure the requests in the set for the CPU and not because of
 713          * an error programming the hardware. If we have a failure here then
 714          * we assume no CPUs have been programmed in the above step as they
 715          * are all configured identically.
 716          */
 717         if (ret != 0) {
 718                 pp->dcpc_enabled = 0;
 719                 kpreempt_enable();
 720                 return (-1);
 721         }
 722 
 723         if (pp->dcpc_disabling != 1)
 724                 pp->dcpc_enabled = 1;
 725 
 726         kpreempt_enable();
 727 
 728         return (0);
 729 }
 730 
 731 /*ARGSUSED*/
 732 static int
 733 dcpc_enable(void *arg, dtrace_id_t id, void *parg)
 734 {
 735         dcpc_probe_t *pp = parg;
 736         int i, found = 0;
 737         cpu_t *c;
 738 
 739         ASSERT(MUTEX_HELD(&cpu_lock));
 740 
 741         /*
 742          * Bail out if the counters are being used by a libcpc consumer.
 743          */
 744         rw_enter(&kcpc_cpuctx_lock, RW_READER);
 745         if (kcpc_cpuctx > 0) {
 746                 rw_exit(&kcpc_cpuctx_lock);
 747                 return (-1);
 748         }
 749 
 750         dtrace_cpc_in_use++;
 751         rw_exit(&kcpc_cpuctx_lock);
 752 
 753         /*
 754          * Place this enabling in the first free entry of the active
 755          * request array.
 756          */
 757         for (i = 0; i < cpc_ncounters; i++) {
 758                 if (dcpc_actv_reqs[i] == NULL) {
 759                         dcpc_actv_reqs[i] = pp;
 760                         pp->dcpc_actv_req_idx = i;
 761                         found = 1;
 762                         break;
 763                 }
 764         }
 765 
 766         /*
 767          * If we couldn't find a slot for this probe then there is no
 768          * room at the inn.
 769          */
 770         if (!found) {
 771                 dtrace_cpc_in_use--;
 772                 return (-1);
 773         }
 774 
 775         ASSERT(pp->dcpc_actv_req_idx >= 0);
 776 
 777         /*
 778          * DTrace is taking over CPC contexts, so stop collecting
 779          * capacity/utilization data for all CPUs.
 780          */
 781         if (dtrace_cpc_in_use == 1)
 782                 cu_disable();
 783 
 784         /*
 785          * The following must hold true if we are to (attempt to) enable
 786          * this request:
 787          *
 788          * 1) No enablings currently exist. We allow all platforms to
 789          * proceed if this is true.
 790          *
 791          * OR
 792          *
 793          * 2) If the platform is multi-overflow capable and there are
 794          * fewer valid enablings than there are counters. There is no
 795          * guarantee that a platform can accommodate as many events as
 796          * it has counters for but we will at least try to program
 797          * up to that many requests.
 798          *
 799          * The 'dcpc_enablings' variable is implicitly protected by locking
 800          * provided by the DTrace framework and the cpu management framework.
 801          */
 802         if (dcpc_enablings == 0 || (dcpc_mult_ovf_cap &&
 803             dcpc_enablings < cpc_ncounters)) {
 804                 /*
 805                  * Before attempting to program the first enabling we need to
 806                  * invalidate any lwp-based contexts and lay claim to the
 807                  * overflow interrupt mechanism.
 808                  */
 809                 if (dcpc_enablings == 0) {
 810                         kcpc_invalidate_all();
 811                         dcpc_claim_interrupts();
 812                 }
 813 
 814                 if (dcpc_program_event(pp) == 0) {
 815                         dcpc_enablings++;
 816                         return (0);
 817                 }
 818         }
 819 
 820         /*
 821          * If active enablings existed before we failed to enable this probe
 822          * on a multi event capable platform then we need to restart counters
 823          * as they will have been stopped in the attempted configuration. The
 824          * context should now just contain the request prior to this failed
 825          * enabling.
 826          */
 827         if (dcpc_enablings > 0 && dcpc_mult_ovf_cap) {
 828                 c = cpu_list;
 829 
 830                 ASSERT(dcpc_mult_ovf_cap == 1);
 831                 do {
 832                         /*
 833                          * Skip CPUs that are currently offline.
 834                          */
 835                         if (c->cpu_flags & CPU_OFFLINE)
 836                                 continue;
 837 
 838                         kcpc_cpu_program(c, c->cpu_cpc_ctx);
 839                 } while ((c = c->cpu_next) != cpu_list);
 840         }
 841 
 842         /*
 843          * Give up any claim to the overflow interrupt mechanism if no
 844          * dcpc based enablings exist.
 845          */
 846         if (dcpc_enablings == 0)
 847                 dcpc_surrender_interrupts();
 848 
 849         dtrace_cpc_in_use--;
 850         dcpc_actv_reqs[pp->dcpc_actv_req_idx] = NULL;
 851         pp->dcpc_actv_req_idx = pp->dcpc_picno = -1;
 852 
 853         /*
 854          * If all probes are removed, enable capacity/utilization data
 855          * collection for every CPU.
 856          */
 857         if (dtrace_cpc_in_use == 0)
 858                 cu_enable();
 859 
 860         return (-1);
 861 }
 862 
 863 /*
 864  * If only one enabling is active then remove the context and free
 865  * everything up. If there are multiple enablings active then remove this
 866  * one, its associated meta-data and re-program the hardware.
 867  */
 868 /*ARGSUSED*/
 869 static void
 870 dcpc_disable(void *arg, dtrace_id_t id, void *parg)
 871 {
 872         cpu_t *c;
 873         dcpc_probe_t *pp = parg;
 874 
 875         ASSERT(MUTEX_HELD(&cpu_lock));
 876 
 877         kpreempt_disable();
 878 
 879         /*
 880          * This probe didn't actually make it as far as being fully enabled
 881          * so we needn't do anything with it.
 882          */
 883         if (pp->dcpc_enabled == 0) {
 884                 /*
 885                  * If we actually allocated this request a slot in the
 886                  * request array but failed to enable it, then remove the
 887                  * entry from the array.
 888                  */
 889                 if (pp->dcpc_actv_req_idx >= 0) {
 890                         dcpc_actv_reqs[pp->dcpc_actv_req_idx] = NULL;
 891                         pp->dcpc_actv_req_idx = pp->dcpc_picno =
 892                             pp->dcpc_disabling = -1;
 893                 }
 894 
 895                 kpreempt_enable();
 896                 return;
 897         }
 898 
 899         /*
 900          * If this is the only enabling then stop all the counters and
 901          * free up the meta-data.
 902          */
 903         if (dcpc_enablings == 1) {
 904                 ASSERT(dtrace_cpc_in_use == 1);
 905 
 906                 dcpc_block_interrupts();
 907 
 908                 c = cpu_list;
 909 
 910                 do {
 911                         dcpc_disable_cpu(c);
 912                 } while ((c = c->cpu_next) != cpu_list);
 913 
 914                 dcpc_actv_reqs[pp->dcpc_actv_req_idx] = NULL;
 915                 dcpc_surrender_interrupts();
 916         } else {
 917                 /*
 918                  * This platform can support multiple overflow events and
 919                  * the enabling being disabled is not the last one. Remove this
 920                  * enabling and re-program the hardware with the new config.
 921                  */
 922                 ASSERT(dcpc_mult_ovf_cap);
 923                 ASSERT(dcpc_enablings > 1);
 924 
 925                 pp->dcpc_disabling = 1;
 926                 (void) dcpc_program_event(pp);
 927         }
 928 
 929         kpreempt_enable();
 930 
 931         dcpc_enablings--;
 932         dtrace_cpc_in_use--;
 933         pp->dcpc_enabled = 0;
 934         pp->dcpc_actv_req_idx = pp->dcpc_picno = pp->dcpc_disabling = -1;
 935 
 936         /*
 937          * If all probes are removed, enable capacity/utilization data
 938          * collection for every CPU.
 939          */
 940         if (dtrace_cpc_in_use == 0)
 941                 cu_enable();
 942 }
 943 
 944 /*ARGSUSED*/
 945 static int
 946 dcpc_cpu_setup(cpu_setup_t what, processorid_t cpu, void *arg)
 947 {
 948         cpu_t *c;
 949         uint8_t *state;
 950 
 951         ASSERT(MUTEX_HELD(&cpu_lock));
 952 
 953         switch (what) {
 954         case CPU_OFF:
 955                 /*
 956                  * Offline CPUs are not allowed to take part so remove this
 957                  * CPU if we are actively tracing.
 958                  */
 959                 if (dtrace_cpc_in_use) {
 960                         c = cpu_get(cpu);
 961                         state = &cpu_core[c->cpu_id].cpuc_dcpc_intr_state;
 962 
 963                         /*
 964                          * Indicate that a configuration is in process in
 965                          * order to stop overflow interrupts being processed
 966                          * on this CPU while we disable it.
 967                          */
 968                         while (atomic_cas_8(state, DCPC_INTR_FREE,
 969                             DCPC_INTR_CONFIG) != DCPC_INTR_FREE)
 970                                 continue;
 971 
 972                         dcpc_disable_cpu(c);
 973 
 974                         /*
 975                          * Reset this CPU's interrupt state as the configuration
 976                          * has ended.
 977                          */
 978                         cpu_core[c->cpu_id].cpuc_dcpc_intr_state =
 979                             DCPC_INTR_FREE;
 980                         membar_producer();
 981                 }
 982                 break;
 983 
 984         case CPU_ON:
 985         case CPU_SETUP:
 986                 /*
 987                  * This CPU is being initialized or brought online so program
 988                  * it with the current request set if we are actively tracing.
 989                  */
 990                 if (dtrace_cpc_in_use) {
 991                         c = cpu_get(cpu);
 992                         (void) dcpc_program_cpu_event(c);
 993                 }
 994                 break;
 995 
 996         default:
 997                 break;
 998         }
 999 
1000         return (0);
1001 }
1002 
1003 static dtrace_pattr_t dcpc_attr = {
1004 { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
1005 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
1006 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
1007 { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_CPU },
1008 { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
1009 };
1010 
1011 static dtrace_pops_t dcpc_pops = {
1012     dcpc_provide,
1013     NULL,
1014     dcpc_enable,
1015     dcpc_disable,
1016     NULL,
1017     NULL,
1018     NULL,
1019     NULL,
1020     dcpc_mode,
1021     dcpc_destroy
1022 };
1023 
1024 /*ARGSUSED*/
1025 static int
1026 dcpc_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
1027 {
1028         return (0);
1029 }
1030 
1031 /*ARGSUSED*/
1032 static int
1033 dcpc_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
1034 {
1035         int error;
1036 
1037         switch (infocmd) {
1038         case DDI_INFO_DEVT2DEVINFO:
1039                 *result = (void *)dcpc_devi;
1040                 error = DDI_SUCCESS;
1041                 break;
1042         case DDI_INFO_DEVT2INSTANCE:
1043                 *result = (void *)0;
1044                 error = DDI_SUCCESS;
1045                 break;
1046         default:
1047                 error = DDI_FAILURE;
1048         }
1049         return (error);
1050 }
1051 
1052 static int
1053 dcpc_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
1054 {
1055         switch (cmd) {
1056         case DDI_DETACH:
1057                 break;
1058         case DDI_SUSPEND:
1059                 return (DDI_SUCCESS);
1060         default:
1061                 return (DDI_FAILURE);
1062         }
1063 
1064         if (dtrace_unregister(dcpc_pid) != 0)
1065                 return (DDI_FAILURE);
1066 
1067         ddi_remove_minor_node(devi, NULL);
1068 
1069         mutex_enter(&cpu_lock);
1070         unregister_cpu_setup_func(dcpc_cpu_setup, NULL);
1071         mutex_exit(&cpu_lock);
1072 
1073         kmem_free(dcpc_actv_reqs, cpc_ncounters * sizeof (dcpc_probe_t *));
1074 
1075         kcpc_unregister_dcpc();
1076 
1077         return (DDI_SUCCESS);
1078 }
1079 
1080 static int
1081 dcpc_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
1082 {
1083         uint_t caps;
1084         char *attrs;
1085 
1086         switch (cmd) {
1087         case DDI_ATTACH:
1088                 break;
1089         case DDI_RESUME:
1090                 return (DDI_SUCCESS);
1091         default:
1092                 return (DDI_FAILURE);
1093         }
1094 
1095         if (kcpc_pcbe_loaded() == -1)
1096                 return (DDI_FAILURE);
1097 
1098         caps = kcpc_pcbe_capabilities();
1099 
1100         if (!(caps & CPC_CAP_OVERFLOW_INTERRUPT)) {
1101                 cmn_err(CE_NOTE, "!dcpc: Counter Overflow not supported"
1102                     " on this processor");
1103                 return (DDI_FAILURE);
1104         }
1105 
1106         if (ddi_create_minor_node(devi, "dcpc", S_IFCHR, 0,
1107             DDI_PSEUDO, NULL) == DDI_FAILURE ||
1108             dtrace_register("cpc", &dcpc_attr, DTRACE_PRIV_KERNEL,
1109             NULL, &dcpc_pops, NULL, &dcpc_pid) != 0) {
1110                 ddi_remove_minor_node(devi, NULL);
1111                 return (DDI_FAILURE);
1112         }
1113 
1114         mutex_enter(&cpu_lock);
1115         register_cpu_setup_func(dcpc_cpu_setup, NULL);
1116         mutex_exit(&cpu_lock);
1117 
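             /*
              * Build a mask with one bit per counter; dcpc_fire() checks the
              * incoming overflow bitmap against this mask to verify that at
              * least one counter we know about actually overflowed.
              */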
1118         dcpc_ovf_mask = (1 << cpc_ncounters) - 1;
1119         ASSERT(dcpc_ovf_mask != 0);
1120 
1121         if (caps & CPC_CAP_OVERFLOW_PRECISE)
1122                 dcpc_mult_ovf_cap = 1;
1123 
1124         /*
1125          * Determine which, if any, mask attribute the back-end can use.
1126          */
1127         attrs = kcpc_list_attrs();
1128         if (strstr(attrs, "umask") != NULL)
1129                 dcpc_mask_type |= DCPC_UMASK;
1130         else if (strstr(attrs, "emask") != NULL)
1131                 dcpc_mask_type |= DCPC_EMASK;
1132 
1133         /*
1134          * The dcpc_actv_reqs array is used to store the requests that
1135          * we currently have programmed. The order of requests in this
1136          * array is not necessarily the order in which the events appear in
1137          * the kcpc_request_t array. Once entered into a slot in the array
1138          * the entry is not moved until it's removed.
1139          */
1140         dcpc_actv_reqs =
1141             kmem_zalloc(cpc_ncounters * sizeof (dcpc_probe_t *), KM_SLEEP);
1142 
1143         dcpc_min_overflow = ddi_prop_get_int(DDI_DEV_T_ANY, devi,
1144             DDI_PROP_DONTPASS, "dcpc-min-overflow", DCPC_MIN_OVF_DEFAULT);
1145 
1146         kcpc_register_dcpc(dcpc_fire);
1147 
1148         ddi_report_dev(devi);
1149         dcpc_devi = devi;
1150 
1151         return (DDI_SUCCESS);
1152 }
1153 
1154 static struct cb_ops dcpc_cb_ops = {
1155         dcpc_open,              /* open */
1156         nodev,                  /* close */
1157         nulldev,                /* strategy */
1158         nulldev,                /* print */
1159         nodev,                  /* dump */
1160         nodev,                  /* read */
1161         nodev,                  /* write */
1162         nodev,                  /* ioctl */
1163         nodev,                  /* devmap */
1164         nodev,                  /* mmap */
1165         nodev,                  /* segmap */
1166         nochpoll,               /* poll */
1167         ddi_prop_op,            /* cb_prop_op */
1168         0,                      /* streamtab  */
1169         D_NEW | D_MP            /* Driver compatibility flag */
1170 };
1171 
1172 static struct dev_ops dcpc_ops = {
1173         DEVO_REV,               /* devo_rev, */
1174         0,                      /* refcnt  */
1175         dcpc_info,              /* get_dev_info */
1176         nulldev,                /* identify */
1177         nulldev,                /* probe */
1178         dcpc_attach,            /* attach */
1179         dcpc_detach,            /* detach */
1180         nodev,                  /* reset */
1181         &dcpc_cb_ops,               /* driver operations */
1182         NULL,                   /* bus operations */
1183         nodev,                  /* dev power */
1184         ddi_quiesce_not_needed  /* quiesce */
1185 };
1186 
1187 /*
1188  * Module linkage information for the kernel.
1189  */
1190 static struct modldrv modldrv = {
1191         &mod_driverops,             /* module type */
1192         "DTrace CPC Module",    /* name of module */
1193         &dcpc_ops,          /* driver ops */
1194 };
1195 
1196 static struct modlinkage modlinkage = {
1197         MODREV_1,
1198         { (void *)&modldrv, NULL }
1199 };
1200 
1201 int
1202 _init(void)
1203 {
1204         return (mod_install(&modlinkage));
1205 }
1206 
1207 int
1208 _info(struct modinfo *modinfop)
1209 {
1210         return (mod_info(&modlinkage, modinfop));
1211 }
1212 
1213 int
1214 _fini(void)
1215 {
1216         return (mod_remove(&modlinkage));
1217 }