1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 /*
  27  * CPU Performance Counter system calls and device driver.
  28  *
  29  * This module uses a combination of thread context operators, and
  30  * thread-specific data to export CPU performance counters
  31  * via both a system call and a driver interface.
  32  *
  33  * There are three access methods exported - the 'shared' device
  34  * and the 'private' and 'agent' variants of the system call.
  35  *
  36  * The shared device treats the performance counter registers as
  37  * a processor metric, regardless of the work scheduled on them.
  38  * The private system call treats the performance counter registers
  39  * as a property of a single lwp.  This is achieved by using the
  40  * thread context operators to virtualize the contents of the
  41  * performance counter registers between lwps.
  42  *
  43  * The agent method is like the private method, except that it must
  44  * be accessed via /proc's agent lwp to allow the counter context of
  45  * other threads to be examined safely.
  46  *
  47  * The shared usage fundamentally conflicts with the agent and private usage;
  48  * almost all of the complexity of the module is needed to allow these two
  49  * models to co-exist in a reasonable way.
  50  */
  51 
  52 #include <sys/types.h>
  53 #include <sys/file.h>
  54 #include <sys/errno.h>
  55 #include <sys/open.h>
  56 #include <sys/cred.h>
  57 #include <sys/conf.h>
  58 #include <sys/stat.h>
  59 #include <sys/processor.h>
  60 #include <sys/cpuvar.h>
  61 #include <sys/disp.h>
  62 #include <sys/kmem.h>
  63 #include <sys/modctl.h>
  64 #include <sys/ddi.h>
  65 #include <sys/sunddi.h>
  66 #include <sys/nvpair.h>
  67 #include <sys/policy.h>
  68 #include <sys/machsystm.h>
  69 #include <sys/cpc_impl.h>
  70 #include <sys/cpc_pcbe.h>
  71 #include <sys/kcpc.h>
  72 
  73 static int kcpc_copyin_set(kcpc_set_t **set, void *ubuf, size_t len);
  74 static int kcpc_verify_set(kcpc_set_t *set);
  75 static uint32_t kcpc_nvlist_npairs(nvlist_t *list);
  76 
  77 /*
  78  * Generic attributes supported regardless of processor.
  79  */
  80 
  81 #define ATTRLIST "picnum"
  82 #define SEPARATOR ","
  83 
  84 /*
  85  * System call to access CPU performance counters.
  86  */
  87 static int
  88 cpc(int cmd, id_t lwpid, void *udata1, void *udata2, void *udata3)
  89 {
  90         kthread_t       *t;
  91         int             error;
  92         int             size;
  93         const char      *str;
  94         int             code;
  95 
  96         /*
  97          * This CPC syscall should only be loaded if it found a PCBE to use.
  98          */
  99         ASSERT(pcbe_ops != NULL);
 100 
 101         if (curproc->p_agenttp == curthread) {
 102                 /*
 103                  * Only if /proc is invoking this system call from
 104                  * the agent thread do we allow the caller to examine
 105                  * the contexts of other lwps in the process.  And
 106                  * because we know we're the agent, we know we don't
 107                  * have to grab p_lock because no-one else can change
 108                  * the state of the process.
 109                  */
 110                 if ((t = idtot(curproc, lwpid)) == NULL || t == curthread)
 111                         return (set_errno(ESRCH));
 112                 ASSERT(t->t_tid == lwpid && ttolwp(t) != NULL);
 113         } else
 114                 t = curthread;
 115 
 116         if (t->t_cpc_set == NULL && (cmd == CPC_SAMPLE || cmd == CPC_RELE))
 117                 return (set_errno(EINVAL));
 118 
 119         switch (cmd) {
 120         case CPC_BIND:
 121                 /*
 122                  * udata1 = pointer to packed nvlist buffer
 123                  * udata2 = size of packed nvlist buffer
 124                  * udata3 = User addr to return error subcode in.
 125                  */
 126 
 127                 rw_enter(&kcpc_cpuctx_lock, RW_READER);
 128                 if (kcpc_cpuctx || dtrace_cpc_in_use) {
 129                         rw_exit(&kcpc_cpuctx_lock);
 130                         return (set_errno(EAGAIN));
 131                 }
 132 
 133                 if (kcpc_hw_lwp_hook() != 0) {
 134                         rw_exit(&kcpc_cpuctx_lock);
 135                         return (set_errno(EACCES));
 136                 }
 137 
 138                 /*
 139                  * An LWP may only have one set bound to it at a time; if there
 140                  * is a set bound to this LWP already, we unbind it here.
 141                  */
 142                 if (t->t_cpc_set != NULL)
 143                         (void) kcpc_unbind(t->t_cpc_set);
 144                 ASSERT(t->t_cpc_set == NULL);
 145 
 146                 if ((error = kcpc_copyin_set(&t->t_cpc_set, udata1,
 147                     (size_t)udata2)) != 0) {
 148                         rw_exit(&kcpc_cpuctx_lock);
 149                         return (set_errno(error));
 150                 }
 151 
 152                 if ((error = kcpc_verify_set(t->t_cpc_set)) != 0) {
 153                         rw_exit(&kcpc_cpuctx_lock);
 154                         kcpc_free_set(t->t_cpc_set);
 155                         t->t_cpc_set = NULL;
 156                         if (copyout(&error, udata3, sizeof (error)) == -1)
 157                                 return (set_errno(EFAULT));
 158                         return (set_errno(EINVAL));
 159                 }
 160 
 161                 if ((error = kcpc_bind_thread(t->t_cpc_set, t, &code)) != 0) {
 162                         rw_exit(&kcpc_cpuctx_lock);
 163                         kcpc_free_set(t->t_cpc_set);
 164                         t->t_cpc_set = NULL;
 165                         /*
 166                          * EINVAL and EACCES are the only errors with more
 167                          * specific subcodes.
 168                          */
 169                         if ((error == EINVAL || error == EACCES) &&
 170                             copyout(&code, udata3, sizeof (code)) == -1)
 171                                 return (set_errno(EFAULT));
 172                         return (set_errno(error));
 173                 }
 174 
 175                 rw_exit(&kcpc_cpuctx_lock);
 176                 return (0);
 177         case CPC_SAMPLE:
 178                 /*
 179                  * udata1 = pointer to user's buffer
 180                  * udata2 = pointer to user's hrtime
 181                  * udata3 = pointer to user's tick
 182                  */
 183                 /*
 184                  * We only allow thread-bound sets to be sampled via the
 185                  * syscall, so if this set has a CPU-bound context, return an
 186                  * error.
 187                  */
 188                 if (t->t_cpc_set->ks_ctx->kc_cpuid != -1)
 189                         return (set_errno(EINVAL));
 190                 if ((error = kcpc_sample(t->t_cpc_set, udata1, udata2,
 191                     udata3)) != 0)
 192                         return (set_errno(error));
 193 
 194                 return (0);
 195         case CPC_PRESET:
 196         case CPC_RESTART:
 197                 /*
 198                  * These are valid only if this lwp has a bound set.
 199                  */
 200                 if (t->t_cpc_set == NULL)
 201                         return (set_errno(EINVAL));
 202                 if (cmd == CPC_PRESET) {
 203                         /*
 204                          * The preset is shipped up to us from userland in two
 205                          * parts. This lets us handle 64-bit values from 32-bit
 206                          * and 64-bit applications in the same manner.
 207                          *
 208                          * udata1 = index of request to preset
 209                          * udata2 = new 64-bit preset (most sig. 32 bits)
 210                          * udata3 = new 64-bit preset (least sig. 32 bits)
 211                          */
 212                         if ((error = kcpc_preset(t->t_cpc_set, (intptr_t)udata1,
 213                             ((uint64_t)(uintptr_t)udata2 << 32ULL) |
 214                             (uint64_t)(uintptr_t)udata3)) != 0)
 215                                 return (set_errno(error));
 216                 } else {
 217                         /*
 218                          * udata[1-3] = unused
 219                          */
 220                         if ((error = kcpc_restart(t->t_cpc_set)) != 0)
 221                                 return (set_errno(error));
 222                 }
 223                 return (0);
 224         case CPC_ENABLE:
 225         case CPC_DISABLE:
 226                 udata1 = 0;
 227                 /*FALLTHROUGH*/
 228         case CPC_USR_EVENTS:
 229         case CPC_SYS_EVENTS:
 230                 if (t != curthread || t->t_cpc_set == NULL)
 231                         return (set_errno(EINVAL));
 232                 /*
 233                  * Provided for backwards compatibility with CPCv1.
 234                  *
 235                  * Stop the counters and record the current counts. Use the
 236                  * counts as the preset to rebind a new set with the requests
 237                  * reconfigured as requested.
 238                  *
 239                  * udata1: 1 == enable; 0 == disable
 240                  * udata{2,3}: unused
 241                  */
 242                 rw_enter(&kcpc_cpuctx_lock, RW_READER);
 243                 if ((error = kcpc_enable(t,
 244                     cmd, (int)(uintptr_t)udata1)) != 0) {
 245                         rw_exit(&kcpc_cpuctx_lock);
 246                         return (set_errno(error));
 247                 }
 248                 rw_exit(&kcpc_cpuctx_lock);
 249                 return (0);
 250         case CPC_NPIC:
 251                 return (cpc_ncounters);
 252         case CPC_CAPS:
 253                 return (pcbe_ops->pcbe_caps);
 254         case CPC_EVLIST_SIZE:
 255         case CPC_LIST_EVENTS:
 256                 /*
 257                  * udata1 = pointer to user's int or buffer
 258                  * udata2 = picnum
 259                  * udata3 = unused
 260                  */
 261                 if ((uintptr_t)udata2 >= cpc_ncounters)
 262                         return (set_errno(EINVAL));
 263 
 264                 size = strlen(
 265                     pcbe_ops->pcbe_list_events((uintptr_t)udata2)) + 1;
 266 
 267                 if (cmd == CPC_EVLIST_SIZE) {
 268                         if (suword32(udata1, size) == -1)
 269                                 return (set_errno(EFAULT));
 270                 } else {
 271                         if (copyout(
 272                             pcbe_ops->pcbe_list_events((uintptr_t)udata2),
 273                             udata1, size) == -1)
 274                                 return (set_errno(EFAULT));
 275                 }
 276                 return (0);
 277         case CPC_ATTRLIST_SIZE:
 278         case CPC_LIST_ATTRS:
 279                 /*
 280                  * udata1 = pointer to user's int or buffer
 281                  * udata2 = unused
 282                  * udata3 = unused
 283                  *
 284                  * attrlist size is length of PCBE-supported attributes, plus
 285                  * room for "picnum\0" plus an optional ',' separator char.
 286                  */
 287                 str = pcbe_ops->pcbe_list_attrs();
 288                 size = strlen(str) + sizeof (SEPARATOR ATTRLIST) + 1;
 289                 if (str[0] != '\0')
 290                         /*
 291                          * A ',' separator character is necessary.
 292                          */
 293                         size += 1;
 294 
 295                 if (cmd == CPC_ATTRLIST_SIZE) {
 296                         if (suword32(udata1, size) == -1)
 297                                 return (set_errno(EFAULT));
 298                 } else {
 299                         /*
 300                          * Copyout the PCBE attributes, and then append the
 301                          * generic attribute list (with separator if necessary).
 302                          */
 303                         if (copyout(str, udata1, strlen(str)) == -1)
 304                                 return (set_errno(EFAULT));
 305                         if (str[0] != '\0') {
 306                                 if (copyout(SEPARATOR ATTRLIST,
 307                                     ((char *)udata1) + strlen(str),
 308                                     strlen(SEPARATOR ATTRLIST) + 1)
 309                                     == -1)
 310                                         return (set_errno(EFAULT));
 311                         } else
 312                                 if (copyout(ATTRLIST,
 313                                     (char *)udata1 + strlen(str),
 314                                     strlen(ATTRLIST) + 1) == -1)
 315                                         return (set_errno(EFAULT));
 316                 }
 317                 return (0);
 318         case CPC_IMPL_NAME:
 319         case CPC_CPUREF:
 320                 /*
 321                  * udata1 = pointer to user's buffer
 322                  * udata2 = unused
 323                  * udata3 = unused
 324                  */
 325                 if (cmd == CPC_IMPL_NAME) {
 326                         str = pcbe_ops->pcbe_impl_name();
 327                         ASSERT(strlen(str) < CPC_MAX_IMPL_NAME);
 328                 } else {
 329                         str = pcbe_ops->pcbe_cpuref();
 330                         ASSERT(strlen(str) < CPC_MAX_CPUREF);
 331                 }
 332 
 333                 if (copyout(str, udata1, strlen(str) + 1) != 0)
 334                         return (set_errno(EFAULT));
 335                 return (0);
 336         case CPC_INVALIDATE:
 337                 kcpc_invalidate(t);
 338                 return (0);
 339         case CPC_RELE:
 340                 if ((error = kcpc_unbind(t->t_cpc_set)) != 0)
 341                         return (set_errno(error));
 342                 return (0);
 343         default:
 344                 return (set_errno(EINVAL));
 345         }
 346 }
 347 
 348 /*
 349  * The 'shared' device allows direct access to the
 350  * performance counter control register of the current CPU.
 351  * The major difference between the contexts created here and those
 352  * above is that the context handlers are -not- installed, thus
 353  * no context switching behaviour occurs.
 354  *
 355  * Because they manipulate per-cpu state, these ioctls can
 356  * only be invoked from a bound lwp, by a caller with the cpc_cpu privilege
 357  * who can open the relevant entry in /devices (the act of holding it open
 358  * causes other uses of the counters to be suspended).
 359  *
 360  * Note that for correct results, the caller -must- ensure that
 361  * all existing per-lwp contexts are either inactive or marked invalid;
 362  * that's what the open routine does.
 363  */
 364 /*ARGSUSED*/
 365 static int
 366 kcpc_ioctl(dev_t dev, int cmd, intptr_t data, int flags, cred_t *cr, int *rvp)
 367 {
 368         kthread_t       *t = curthread;
 369         processorid_t   cpuid;
 370         void            *udata1 = NULL;
 371         void            *udata2 = NULL;
 372         void            *udata3 = NULL;
 373         int             error;
 374         int             code;
 375 
 376         STRUCT_DECL(__cpc_args, args);
 377 
 378         STRUCT_INIT(args, flags);
 379 
 380         if (curthread->t_bind_cpu != getminor(dev))
 381                 return (EAGAIN);  /* someone unbound it? */
 382 
 383         cpuid = getminor(dev);
 384 
 385         if (cmd == CPCIO_BIND || cmd == CPCIO_SAMPLE) {
 386                 if (copyin((void *)data, STRUCT_BUF(args),
 387                     STRUCT_SIZE(args)) == -1)
 388                         return (EFAULT);
 389 
 390                 udata1 = STRUCT_FGETP(args, udata1);
 391                 udata2 = STRUCT_FGETP(args, udata2);
 392                 udata3 = STRUCT_FGETP(args, udata3);
 393         }
 394 
 395         switch (cmd) {
 396         case CPCIO_BIND:
 397                 /*
 398                  * udata1 = pointer to packed nvlist buffer
 399                  * udata2 = size of packed nvlist buffer
 400                  * udata3 = User addr to return error subcode in.
 401                  */
 402                 if (t->t_cpc_set != NULL) {
 403                         (void) kcpc_unbind(t->t_cpc_set);
 404                         ASSERT(t->t_cpc_set == NULL);
 405                 }
 406 
 407                 if ((error = kcpc_copyin_set(&t->t_cpc_set, udata1,
 408                     (size_t)udata2)) != 0) {
 409                         return (error);
 410                 }
 411 
 412                 if ((error = kcpc_verify_set(t->t_cpc_set)) != 0) {
 413                         kcpc_free_set(t->t_cpc_set);
 414                         t->t_cpc_set = NULL;
 415                         if (copyout(&error, udata3, sizeof (error)) == -1)
 416                                 return (EFAULT);
 417                         return (EINVAL);
 418                 }
 419 
 420                 if ((error = kcpc_bind_cpu(t->t_cpc_set, cpuid, &code)) != 0) {
 421                         kcpc_free_set(t->t_cpc_set);
 422                         t->t_cpc_set = NULL;
 423                         /*
 424                          * Subcodes are only returned for EINVAL and EACCESS.
 425                          */
 426                         if ((error == EINVAL || error == EACCES) &&
 427                             copyout(&code, udata3, sizeof (code)) == -1)
 428                                 return (EFAULT);
 429                         return (error);
 430                 }
 431 
 432                 return (0);
 433         case CPCIO_SAMPLE:
 434                 /*
 435                  * udata1 = pointer to user's buffer
 436                  * udata2 = pointer to user's hrtime
 437                  * udata3 = pointer to user's tick
 438                  */
 439                 /*
 440                  * Only CPU-bound sets may be sampled via the ioctl(). If this
 441                  * set has no CPU-bound context, return an error.
 442                  */
 443                 if (t->t_cpc_set == NULL)
 444                         return (EINVAL);
 445                 if ((error = kcpc_sample(t->t_cpc_set, udata1, udata2,
 446                     udata3)) != 0)
 447                         return (error);
 448                 return (0);
 449         case CPCIO_RELE:
 450                 if (t->t_cpc_set == NULL)
 451                         return (EINVAL);
 452                 return (kcpc_unbind(t->t_cpc_set));
 453         default:
 454                 return (EINVAL);
 455         }
 456 }
 457 
 458 /*
 459  * The device supports multiple opens, but only one open
 460  * is allowed per processor.  This is to enable multiple
 461  * instances of tools looking at different processors.
 462  */
 463 #define KCPC_MINOR_SHARED               ((minor_t)0x3fffful)
 464 
 465 static ulong_t *kcpc_cpumap;            /* bitmap of cpus */
 466 
 467 /*ARGSUSED1*/
 468 static int
 469 kcpc_open(dev_t *dev, int flags, int otyp, cred_t *cr)
 470 {
 471         processorid_t   cpuid;
 472         int             error;
 473 
 474         ASSERT(pcbe_ops != NULL);
 475 
 476         if ((error = secpolicy_cpc_cpu(cr)) != 0)
 477                 return (error);
 478         if (getminor(*dev) != KCPC_MINOR_SHARED)
 479                 return (ENXIO);
 480         if ((cpuid = curthread->t_bind_cpu) == PBIND_NONE)
 481                 return (EINVAL);
 482         if (cpuid > max_cpuid)
 483                 return (EINVAL);
 484 
 485         rw_enter(&kcpc_cpuctx_lock, RW_WRITER);
 486         if (++kcpc_cpuctx == 1) {
 487                 ASSERT(kcpc_cpumap == NULL);
 488 
 489                 /*
 490                  * Bail out if DTrace is already using the counters.
 491                  */
 492                 if (dtrace_cpc_in_use) {
 493                         kcpc_cpuctx--;
 494                         rw_exit(&kcpc_cpuctx_lock);
 495                         return (EAGAIN);
 496                 }
 497                 kcpc_cpumap = kmem_zalloc(BT_SIZEOFMAP(max_cpuid + 1),
 498                     KM_SLEEP);
 499                 /*
 500                  * When this device is open for processor-based contexts,
 501                  * no further lwp-based contexts can be created.
 502                  *
 503                  * Since this is the first open, ensure that all existing
 504                  * contexts are invalidated.
 505                  */
 506                 kcpc_invalidate_all();
 507         } else if (BT_TEST(kcpc_cpumap, cpuid)) {
 508                 kcpc_cpuctx--;
 509                 rw_exit(&kcpc_cpuctx_lock);
 510                 return (EAGAIN);
 511         } else if (kcpc_hw_cpu_hook(cpuid, kcpc_cpumap) != 0) {
 512                 kcpc_cpuctx--;
 513                 rw_exit(&kcpc_cpuctx_lock);
 514                 return (EACCES);
 515         }
 516         BT_SET(kcpc_cpumap, cpuid);
 517         rw_exit(&kcpc_cpuctx_lock);
 518 
 519         *dev = makedevice(getmajor(*dev), (minor_t)cpuid);
 520 
 521         return (0);
 522 }
 523 
 524 /*ARGSUSED1*/
 525 static int
 526 kcpc_close(dev_t dev, int flags, int otyp, cred_t *cr)
 527 {
 528         rw_enter(&kcpc_cpuctx_lock, RW_WRITER);
 529         BT_CLEAR(kcpc_cpumap, getminor(dev));
 530         if (--kcpc_cpuctx == 0) {
 531                 kmem_free(kcpc_cpumap, BT_SIZEOFMAP(max_cpuid + 1));
 532                 kcpc_cpumap = NULL;
 533         }
 534         ASSERT(kcpc_cpuctx >= 0);
 535         rw_exit(&kcpc_cpuctx_lock);
 536 
 537         return (0);
 538 }
 539 
 540 /*
 541  * Sane boundaries on the size of packed lists. In bytes.
 542  */
 543 #define CPC_MIN_PACKSIZE 4
 544 #define CPC_MAX_PACKSIZE 10000
 545 
 546 /*
 547  * Sane boundary on the number of requests a set can contain.
 548  */
 549 #define CPC_MAX_NREQS 100
 550 
 551 /*
 552  * Sane boundary on the number of attributes a request can contain.
 553  */
 554 #define CPC_MAX_ATTRS 50
 555 
 556 /*
 557  * Copy in a packed nvlist from the user and create a request set out of it.
 558  * If successful, return 0 and store a pointer to the set we've created. Returns
 559  * error code on error.
 560  */
 561 int
 562 kcpc_copyin_set(kcpc_set_t **inset, void *ubuf, size_t len)
 563 {
 564         kcpc_set_t      *set;
 565         int             i;
 566         int             j;
 567         char            *packbuf;
 568 
 569         nvlist_t        *nvl;
 570         nvpair_t        *nvp = NULL;
 571 
 572         nvlist_t        *attrs;
 573         nvpair_t        *nvp_attr;
 574         kcpc_attr_t     *attrp;
 575 
 576         nvlist_t        **reqlist;
 577         uint_t          nreqs;
 578         uint64_t        uint64;
 579         uint32_t        uint32;
 580         uint32_t        setflags = (uint32_t)-1;
 581         char            *string;
 582         char            *name;
 583 
 584         if (len < CPC_MIN_PACKSIZE || len > CPC_MAX_PACKSIZE)
 585                 return (EINVAL);
 586 
 587         packbuf = kmem_alloc(len, KM_SLEEP);
 588 
 589         if (copyin(ubuf, packbuf, len) == -1) {
 590                 kmem_free(packbuf, len);
 591                 return (EFAULT);
 592         }
 593 
 594         if (nvlist_unpack(packbuf, len, &nvl, KM_SLEEP) != 0) {
 595                 kmem_free(packbuf, len);
 596                 return (EINVAL);
 597         }
 598 
 599         /*
 600          * The nvlist has been unpacked so there is no need for the packed
 601          * representation from this point on.
 602          */
 603         kmem_free(packbuf, len);
 604 
 605         i = 0;
 606         while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
 607                 switch (nvpair_type(nvp)) {
 608                 case DATA_TYPE_UINT32:
 609                         if (strcmp(nvpair_name(nvp), "flags") != 0 ||
 610                             nvpair_value_uint32(nvp, &setflags) != 0) {
 611                                 nvlist_free(nvl);
 612                                 return (EINVAL);
 613                         }
 614                         break;
 615                 case DATA_TYPE_NVLIST_ARRAY:
 616                         if (strcmp(nvpair_name(nvp), "reqs") != 0 ||
 617                             nvpair_value_nvlist_array(nvp, &reqlist,
 618                             &nreqs) != 0) {
 619                                 nvlist_free(nvl);
 620                                 return (EINVAL);
 621                         }
 622                         break;
 623                 default:
 624                         nvlist_free(nvl);
 625                         return (EINVAL);
 626                 }
 627                 i++;
 628         }
 629 
 630         /*
 631          * There should be two members in the top-level nvlist:
 632          * an array of nvlists consisting of the requests, and flags.
 633          * Anything else is an invalid set.
 634          */
 635         if (i != 2) {
 636                 nvlist_free(nvl);
 637                 return (EINVAL);
 638         }
 639 
 640         if (nreqs > CPC_MAX_NREQS) {
 641                 nvlist_free(nvl);
 642                 return (EINVAL);
 643         }
 644 
 645         /*
 646          * The requests are now stored in the nvlist array at reqlist.
 647          * Note that the use of kmem_zalloc() to alloc the kcpc_set_t means
 648          * we don't need to call the init routines for ks_lock and ks_condv.
 649          */
 650         set = kmem_zalloc(sizeof (kcpc_set_t), KM_SLEEP);
 651         set->ks_req = (kcpc_request_t *)kmem_zalloc(sizeof (kcpc_request_t) *
 652             nreqs, KM_SLEEP);
 653         set->ks_nreqs = nreqs;
 654         /*
 655          * If the nvlist didn't contain a flags member, setflags was initialized
 656          * with an illegal value and this set will fail sanity checks later on.
 657          */
 658         set->ks_flags = setflags;
 659         /*
 660          * Initialize bind/unbind set synchronization.
 661          */
 662         set->ks_state &= ~KCPC_SET_BOUND;
 663 
 664         /*
 665          * Build the set up one request at a time, always keeping it self-
 666          * consistent so we can give it to kcpc_free_set() if we need to back
 667          * out and return and error.
 668          */
 669         for (i = 0; i < nreqs; i++) {
 670                 nvp = NULL;
 671                 set->ks_req[i].kr_picnum = -1;
 672                 while ((nvp = nvlist_next_nvpair(reqlist[i], nvp)) != NULL) {
 673                         name = nvpair_name(nvp);
 674                         switch (nvpair_type(nvp)) {
 675                         case DATA_TYPE_UINT32:
 676                                 if (nvpair_value_uint32(nvp, &uint32) == EINVAL)
 677                                         goto inval;
 678                                 if (strcmp(name, "cr_flags") == 0)
 679                                         set->ks_req[i].kr_flags = uint32;
 680                                 if (strcmp(name, "cr_index") == 0)
 681                                         set->ks_req[i].kr_index = uint32;
 682                                 break;
 683                         case DATA_TYPE_UINT64:
 684                                 if (nvpair_value_uint64(nvp, &uint64) == EINVAL)
 685                                         goto inval;
 686                                 if (strcmp(name, "cr_preset") == 0)
 687                                         set->ks_req[i].kr_preset = uint64;
 688                                 break;
 689                         case DATA_TYPE_STRING:
 690                                 if (nvpair_value_string(nvp, &string) == EINVAL)
 691                                         goto inval;
 692                                 if (strcmp(name, "cr_event") == 0)
 693                                         (void) strncpy(set->ks_req[i].kr_event,
 694                                             string, CPC_MAX_EVENT_LEN);
 695                                 break;
 696                         case DATA_TYPE_NVLIST:
 697                                 if (strcmp(name, "cr_attr") != 0)
 698                                         goto inval;
 699                                 if (nvpair_value_nvlist(nvp, &attrs) == EINVAL)
 700                                         goto inval;
 701                                 nvp_attr = NULL;
 702                                 /*
 703                                  * If the picnum has been specified as an
 704                                  * attribute, consume that attribute here and
 705                                  * remove it from the list of attributes.
 706                                  */
 707                                 if (nvlist_lookup_uint64(attrs, "picnum",
 708                                     &uint64) == 0) {
 709                                         if (nvlist_remove(attrs, "picnum",
 710                                             DATA_TYPE_UINT64) != 0)
 711                                                 panic("nvlist %p faulty",
 712                                                     (void *)attrs);
 713                                         set->ks_req[i].kr_picnum = uint64;
 714                                 }
 715 
 716                                 if ((set->ks_req[i].kr_nattrs =
 717                                     kcpc_nvlist_npairs(attrs)) == 0)
 718                                         break;
 719 
 720                                 if (set->ks_req[i].kr_nattrs > CPC_MAX_ATTRS)
 721                                         goto inval;
 722 
 723                                 set->ks_req[i].kr_attr =
 724                                     kmem_alloc(set->ks_req[i].kr_nattrs *
 725                                     sizeof (kcpc_attr_t), KM_SLEEP);
 726                                 j = 0;
 727 
 728                                 while ((nvp_attr = nvlist_next_nvpair(attrs,
 729                                     nvp_attr)) != NULL) {
 730                                         attrp = &set->ks_req[i].kr_attr[j];
 731 
 732                                         if (nvpair_type(nvp_attr) !=
 733                                             DATA_TYPE_UINT64)
 734                                                 goto inval;
 735 
 736                                         (void) strncpy(attrp->ka_name,
 737                                             nvpair_name(nvp_attr),
 738                                             CPC_MAX_ATTR_LEN);
 739 
 740                                         if (nvpair_value_uint64(nvp_attr,
 741                                             &(attrp->ka_val)) == EINVAL)
 742                                                 goto inval;
 743                                         j++;
 744                                 }
 745                                 ASSERT(j == set->ks_req[i].kr_nattrs);
 746                         default:
 747                                 break;
 748                         }
 749                 }
 750         }
 751 
 752         nvlist_free(nvl);
 753         *inset = set;
 754         return (0);
 755 
 756 inval:
 757         nvlist_free(nvl);
 758         kcpc_free_set(set);
 759         return (EINVAL);
 760 }
 761 
 762 /*
 763  * Count the number of nvpairs in the supplied nvlist.
 764  */
 765 static uint32_t
 766 kcpc_nvlist_npairs(nvlist_t *list)
 767 {
 768         nvpair_t *nvp = NULL;
 769         uint32_t n = 0;
 770 
 771         while ((nvp = nvlist_next_nvpair(list, nvp)) != NULL)
 772                 n++;
 773 
 774         return (n);
 775 }
 776 
 777 /*
 778  * Performs sanity checks on the given set.
 779  * Returns 0 if the set checks out OK.
 780  * Returns a detailed error subcode, or -1 if there is no applicable subcode.
 781  */
 782 static int
 783 kcpc_verify_set(kcpc_set_t *set)
 784 {
 785         kcpc_request_t  *rp;
 786         int             i;
 787         uint64_t        bitmap = 0;
 788         int             n;
 789 
 790         if (set->ks_nreqs > cpc_ncounters)
 791                 return (-1);
 792 
 793         if (CPC_SET_VALID_FLAGS(set->ks_flags) == 0)
 794                 return (-1);
 795 
 796         for (i = 0; i < set->ks_nreqs; i++) {
 797                 rp = &set->ks_req[i];
 798 
 799                 /*
 800                  * The following comparison must cast cpc_ncounters to an int,
 801                  * because kr_picnum will be -1 if the request didn't explicitly
 802                  * choose a PIC.
 803                  */
 804                 if (rp->kr_picnum >= (int)cpc_ncounters)
 805                         return (CPC_INVALID_PICNUM);
 806 
 807                 /*
 808                  * Of the pics whose physical picnum has been specified, make
 809                  * sure each PIC appears only once in set.
 810                  */
 811                 if ((n = set->ks_req[i].kr_picnum) != -1) {
 812                         if ((bitmap & (1 << n)) != 0)
 813                                 return (-1);
 814                         bitmap |= (1 << n);
 815                 }
 816 
 817                 /*
 818                  * Make sure the requested index falls within the range of all
 819                  * requests.
 820                  */
 821                 if (rp->kr_index < 0 || rp->kr_index >= set->ks_nreqs)
 822                         return (-1);
 823 
 824                 /*
 825                  * Make sure there are no unknown flags.
 826                  */
 827                 if (KCPC_REQ_VALID_FLAGS(rp->kr_flags) == 0)
 828                         return (CPC_REQ_INVALID_FLAGS);
 829         }
 830 
 831         return (0);
 832 }
 833 
/*
 * Character/block entry points for the cpc driver.  Only open, close and
 * ioctl are backed by functions from this module; every other entry point
 * is filled with a nodev/nochpoll stub.  The initializer is positional, so
 * the order of the entries must not be changed.
 */
static struct cb_ops cb_ops = {
        kcpc_open,      /* open */
        kcpc_close,     /* close */
        nodev,          /* strategy */
        nodev,          /* print */
        nodev,          /* dump */
        nodev,          /* read */
        nodev,          /* write */
        kcpc_ioctl,     /* ioctl */
        nodev,          /* devmap */
        nodev,          /* mmap */
        nodev,          /* segmap */
        nochpoll,       /* poll */
        ddi_prop_op,    /* prop_op */
        NULL,           /* streamtab (presumably unused: not a STREAMS driver) */
        D_NEW | D_MP    /* driver flags */
};
 851 
 852 /*ARGSUSED*/
 853 static int
 854 kcpc_probe(dev_info_t *devi)
 855 {
 856         return (DDI_PROBE_SUCCESS);
 857 }
 858 
/*
 * dev_info pointer of the cpc device.  Recorded by kcpc_attach() and handed
 * back by kcpc_getinfo() for the KCPC_MINOR_SHARED minor.
 */
static dev_info_t *kcpc_devi;
 860 
 861 static int
 862 kcpc_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
 863 {
 864         if (cmd != DDI_ATTACH)
 865                 return (DDI_FAILURE);
 866         kcpc_devi = devi;
 867         return (ddi_create_minor_node(devi, "shared", S_IFCHR,
 868             KCPC_MINOR_SHARED, DDI_PSEUDO, 0));
 869 }
 870 
 871 /*ARGSUSED*/
 872 static int
 873 kcpc_getinfo(dev_info_t *devi, ddi_info_cmd_t cmd, void *arg, void **result)
 874 {
 875         switch (cmd) {
 876         case DDI_INFO_DEVT2DEVINFO:
 877                 switch (getminor((dev_t)arg)) {
 878                 case KCPC_MINOR_SHARED:
 879                         *result = kcpc_devi;
 880                         return (DDI_SUCCESS);
 881                 default:
 882                         break;
 883                 }
 884                 break;
 885         case DDI_INFO_DEVT2INSTANCE:
 886                 *result = 0;
 887                 return (DDI_SUCCESS);
 888         default:
 889                 break;
 890         }
 891 
 892         return (DDI_FAILURE);
 893 }
 894 
/*
 * Device operations vector for the cpc driver.  getinfo, probe and attach
 * are implemented in this file; detach and reset are nodev stubs.  The
 * initializer is positional, so the order of the entries must not be
 * changed.
 */
static struct dev_ops dev_ops = {
        DEVO_REV,               /* devo_rev */
        0,                      /* refcnt */
        kcpc_getinfo,           /* getinfo */
        nulldev,                /* identify */
        kcpc_probe,             /* probe */
        kcpc_attach,            /* attach */
        nodev,                  /* detach */
        nodev,                  /* reset */
        &cb_ops,                /* cb_ops */
        (struct bus_ops *)0,    /* bus_ops (none) */
        NULL,                   /* power (none) */
        ddi_quiesce_not_needed,         /* quiesce */
};
 909 
/*
 * Module linkage for the device driver half of this module: ties the
 * driver description string to dev_ops.
 */
static struct modldrv modldrv = {
        &mod_driverops,         /* module type: driver */
        "cpc sampling driver",  /* description string */
        &dev_ops                /* driver operations */
};
 915 
/*
 * System call table entry for cpc().  It is shared by modlsys and, when
 * _SYSCALL32_IMPL is defined, by modlsys32.
 */
static struct sysent cpc_sysent = {
        5,                                      /* number of arguments */
        SE_NOUNLOAD | SE_ARGC | SE_32RVAL1,     /* system call flags */
        cpc                                     /* handler */
};
 921 
/*
 * Module linkage for the system call half of this module: ties the
 * description string to cpc_sysent.
 */
static struct modlsys modlsys = {
        &mod_syscallops,                /* module type: system call */
        "cpc sampling system call",     /* description string */
        &cpc_sysent                     /* system call table entry */
};
 927 
#ifdef _SYSCALL32_IMPL
/*
 * Module linkage for the 32-bit system call, built only when
 * _SYSCALL32_IMPL is defined.  It reuses the same cpc_sysent entry as
 * modlsys.
 */
static struct modlsys modlsys32 = {
        &mod_syscallops32,                      /* module type: 32-bit syscall */
        "32-bit cpc sampling system call",      /* description string */
        &cpc_sysent                             /* system call table entry */
};
#endif
 935 
/*
 * Top-level module linkage passed to mod_install(), mod_remove() and
 * mod_info().  It lists the driver linkage, the system call linkage and,
 * when _SYSCALL32_IMPL is defined, the 32-bit system call linkage.  Any
 * remaining members are left to C's implicit zero initialization.
 */
static struct modlinkage modl = {
        MODREV_1,       /* linkage revision */
        &modldrv,       /* device driver */
        &modlsys,       /* system call */
#ifdef _SYSCALL32_IMPL
        &modlsys32,     /* 32-bit system call */
#endif
};
 944 
 945 int
 946 _init(void)
 947 {
 948         if (kcpc_init() != 0)
 949                 return (ENOTSUP);
 950 
 951         return (mod_install(&modl));
 952 }
 953 
 954 int
 955 _fini(void)
 956 {
 957         return (mod_remove(&modl));
 958 }
 959 
 960 int
 961 _info(struct modinfo *mi)
 962 {
 963         return (mod_info(&modl, mi));
 964 }