Print this page
2916 DTrace in a zone should be able to access fds[]

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/dtrace/dtrace.c
          +++ new/usr/src/uts/common/dtrace/dtrace.c
↓ open down ↓ 162 lines elided ↑ open up ↑
 163  163  static dev_info_t       *dtrace_devi;           /* device info */
 164  164  static vmem_t           *dtrace_arena;          /* probe ID arena */
 165  165  static vmem_t           *dtrace_minor;          /* minor number arena */
 166  166  static taskq_t          *dtrace_taskq;          /* task queue */
 167  167  static dtrace_probe_t   **dtrace_probes;        /* array of all probes */
 168  168  static int              dtrace_nprobes;         /* number of probes */
 169  169  static dtrace_provider_t *dtrace_provider;      /* provider list */
 170  170  static dtrace_meta_t    *dtrace_meta_pid;       /* user-land meta provider */
 171  171  static int              dtrace_opens;           /* number of opens */
 172  172  static int              dtrace_helpers;         /* number of helpers */
      173 +static int              dtrace_getf;            /* number of unpriv getf()s */
 173  174  static void             *dtrace_softstate;      /* softstate pointer */
 174  175  static dtrace_hash_t    *dtrace_bymod;          /* probes hashed by module */
 175  176  static dtrace_hash_t    *dtrace_byfunc;         /* probes hashed by function */
 176  177  static dtrace_hash_t    *dtrace_byname;         /* probes hashed by name */
 177  178  static dtrace_toxrange_t *dtrace_toxrange;      /* toxic range array */
 178  179  static int              dtrace_toxranges;       /* number of toxic ranges */
 179  180  static int              dtrace_toxranges_max;   /* size of toxic range array */
 180  181  static dtrace_anon_t    dtrace_anon;            /* anonymous enabling */
 181  182  static kmem_cache_t     *dtrace_state_cache;    /* cache for dynamic state */
 182  183  static uint64_t         dtrace_vtime_references; /* number of vtimestamp refs */
↓ open down ↓ 285 lines elided ↑ open up ↑
 468  469  static dtrace_helpers_t *dtrace_helpers_create(proc_t *);
 469  470  static void dtrace_buffer_drop(dtrace_buffer_t *);
 470  471  static int dtrace_buffer_consumed(dtrace_buffer_t *, hrtime_t when);
 471  472  static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,
 472  473      dtrace_state_t *, dtrace_mstate_t *);
 473  474  static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
 474  475      dtrace_optval_t);
 475  476  static int dtrace_ecb_create_enable(dtrace_probe_t *, void *);
 476  477  static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);
 477  478  static int dtrace_priv_proc(dtrace_state_t *, dtrace_mstate_t *);
      479 +static void dtrace_getf_barrier(void);
 478  480  
 479  481  /*
 480  482   * DTrace Probe Context Functions
 481  483   *
 482  484   * These functions are called from probe context.  Because probe context is
 483  485   * any context in which C may be called, arbitrary locks may be held,
 484  486   * interrupts may be disabled, we may be in arbitrary dispatched state, etc.
 485  487   * As a result, functions called from probe context may only call other DTrace
 486  488   * support functions -- they may not interact at all with the system at large.
 487  489   * (Note that the ASSERT macro is made probe-context safe by redefining it in
↓ open down ↓ 191 lines elided ↑ open up ↑
 679  681   * illegal value slot.
 680  682   *
 681  683   * DTrace subroutines (DIF_SUBR_*) should use this helper to implement
 682  684   * appropriate memory access protection.
 683  685   */
 684  686  static int
 685  687  dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
 686  688      dtrace_vstate_t *vstate)
 687  689  {
 688  690          volatile uintptr_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
      691 +        file_t *fp;
 689  692  
 690  693          /*
 691  694           * If we hold the privilege to read from kernel memory, then
 692  695           * everything is readable.
 693  696           */
 694  697          if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
 695  698                  return (1);
 696  699  
 697  700          /*
 698  701           * You can obviously read that which you can store.
↓ open down ↓ 45 lines elided ↑ open up ↑
 744  747                      &(p->p_pidp->pid_id), sizeof (pid_t))) {
 745  748                          return (1);
 746  749                  }
 747  750  
 748  751                  if (curthread->t_cpu != NULL && DTRACE_INRANGE(addr, sz,
 749  752                      curthread->t_cpu, offsetof(cpu_t, cpu_pause_thread))) {
 750  753                          return (1);
 751  754                  }
 752  755          }
 753  756  
      757 +        if ((fp = mstate->dtms_getf) != NULL) {
      758 +                uintptr_t psz = sizeof (void *);
      759 +                vnode_t *vp;
      760 +                vnodeops_t *op;
      761 +
      762 +                /*
      763 +                 * When getf() returns a file_t, the enabling is implicitly
      764 +                 * granted the (transient) right to read the returned file_t
      765 +                 * as well as the v_path and v_op->vnop_name of the underlying
      766 +                 * vnode.  These accesses are allowed after a successful
      767 +                 * getf() because the members that they refer to cannot change
      768 +                 * once set -- and the barrier logic in the kernel's closef()
      769 +                 * path assures that the file_t and its referenced vnode_t
      770 +                 * cannot themselves be stale (that is, it is impossible for
      771 +                 * either dtms_getf itself or its f_vnode member to reference
      772 +                 * freed memory).
      773 +                 */
      774 +                if (DTRACE_INRANGE(addr, sz, fp, sizeof (file_t)))
      775 +                        return (1);
      776 +
      777 +                if ((vp = fp->f_vnode) != NULL) {
      778 +                        if (DTRACE_INRANGE(addr, sz, &vp->v_path, psz))
      779 +                                return (1);
      780 +
      781 +                        if (vp->v_path != NULL && DTRACE_INRANGE(addr, sz,
      782 +                            vp->v_path, strlen(vp->v_path) + 1)) {
      783 +                                return (1);
      784 +                        }
      785 +
      786 +                        if (DTRACE_INRANGE(addr, sz, &vp->v_op, psz))
      787 +                                return (1);
      788 +
      789 +                        if ((op = vp->v_op) != NULL &&
      790 +                            DTRACE_INRANGE(addr, sz, &op->vnop_name, psz)) {
      791 +                                return (1);
      792 +                        }
      793 +
      794 +                        if (op != NULL && op->vnop_name != NULL &&
      795 +                            DTRACE_INRANGE(addr, sz, op->vnop_name,
      796 +                            strlen(op->vnop_name) + 1)) {
      797 +                                return (1);
      798 +                        }
      799 +                }
      800 +        }
      801 +
 754  802          DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
 755  803          *illval = addr;
 756  804          return (0);
 757  805  }
 758  806  
 759  807  /*
 760  808   * Convenience routine to check to see if a given string is within a memory
 761  809   * region in which a load may be issued given the user's privilege level;
 762  810   * this exists so that we don't need to issue unnecessary dtrace_strlen()
 763  811   * calls in the event that the user has all privileges.
↓ open down ↓ 358 lines elided ↑ open up ↑
1122 1170  dtrace_priv_proc_common_zone(dtrace_state_t *state)
1123 1171  {
1124 1172          cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1125 1173  
1126 1174          /*
1127 1175           * We should always have a non-NULL state cred here, since if cred
1128 1176           * is null (anonymous tracing), we fast-path bypass this routine.
1129 1177           */
1130 1178          ASSERT(s_cr != NULL);
1131 1179  
1132      -        if ((cr = CRED()) != NULL &&
1133      -            s_cr->cr_zone == cr->cr_zone)
     1180 +        if ((cr = CRED()) != NULL && s_cr->cr_zone == cr->cr_zone)
1134 1181                  return (1);
1135 1182  
1136 1183          return (0);
1137 1184  }
1138 1185  
1139 1186  /*
1140 1187   * This privilege check should be used by actions and subroutines to
1141 1188   * verify that the process has not setuid or changed credentials.
1142 1189   */
1143 1190  static int
↓ open down ↓ 3314 lines elided ↑ open up ↑
4458 4505  
4459 4506                  for (i = start, j = 0; i <= end && j < size - 1; i++, j++)
4460 4507                          dest[j] = dtrace_load8(src + i);
4461 4508  
4462 4509                  dest[j] = '\0';
4463 4510                  regs[rd] = (uintptr_t)dest;
4464 4511                  mstate->dtms_scratch_ptr += size;
4465 4512                  break;
4466 4513          }
4467 4514  
     4515 +        case DIF_SUBR_GETF: {
     4516 +                uintptr_t fd = tupregs[0].dttk_value;
     4517 +                uf_info_t *finfo = &curthread->t_procp->p_user.u_finfo;
     4518 +                file_t *fp;
     4519 +
     4520 +                if (!dtrace_priv_proc(state, mstate)) {
     4521 +                        regs[rd] = NULL;
     4522 +                        break;
     4523 +                }
     4524 +
     4525 +                /*
     4526 +                 * This is safe because fi_nfiles only increases, and the
     4527 +                 * fi_list array is not freed when the array size doubles.
     4528 +                 * (See the comment in flist_grow() for details on the
     4529 +                 * management of the u_finfo structure.)
     4530 +                 */
     4531 +                fp = fd < finfo->fi_nfiles ? finfo->fi_list[fd].uf_file : NULL;
     4532 +
     4533 +                mstate->dtms_getf = fp;
     4534 +                regs[rd] = (uintptr_t)fp;
     4535 +                break;
     4536 +        }
     4537 +
4468 4538          case DIF_SUBR_CLEANPATH: {
4469 4539                  char *dest = (char *)mstate->dtms_scratch_ptr, c;
4470 4540                  uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4471 4541                  uintptr_t src = tupregs[0].dttk_value;
4472 4542                  int i = 0, j = 0;
     4543 +                zone_t *z;
4473 4544  
4474 4545                  if (!dtrace_strcanload(src, size, mstate, vstate)) {
4475 4546                          regs[rd] = NULL;
4476 4547                          break;
4477 4548                  }
4478 4549  
4479 4550                  if (!DTRACE_INSCRATCH(mstate, size)) {
4480 4551                          DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4481 4552                          regs[rd] = NULL;
4482 4553                          break;
↓ open down ↓ 78 lines elided ↑ open up ↑
4561 4632                           */
4562 4633                          i--;
4563 4634                          while (j != 0 && dest[--j] != '/')
4564 4635                                  continue;
4565 4636  
4566 4637                          if (c == '\0')
4567 4638                                  dest[++j] = '/';
4568 4639                  } while (c != '\0');
4569 4640  
4570 4641                  dest[j] = '\0';
     4642 +
     4643 +                if (mstate->dtms_getf != NULL &&
     4644 +                    !(mstate->dtms_access & DTRACE_ACCESS_KERNEL) &&
     4645 +                    (z = state->dts_cred.dcr_cred->cr_zone) != kcred->cr_zone) {
     4646 +                        /*
     4647 +                         * If we've done a getf() as a part of this ECB and we
     4648 +                         * don't have kernel access (and we're not in the global
     4649 +                         * zone), check if the path we cleaned up begins with
     4650 +                         * the zone's root path, and trim it off if so.  Note
     4651 +                         * that this is an output cleanliness issue, not a
     4652 +                         * security issue: knowing one's zone root path does
     4653 +                         * not enable privilege escalation.
     4654 +                         */
     4655 +                        if (strstr(dest, z->zone_rootpath) == dest)
     4656 +                                dest += strlen(z->zone_rootpath) - 1;
     4657 +                }
     4658 +
4571 4659                  regs[rd] = (uintptr_t)dest;
4572 4660                  mstate->dtms_scratch_ptr += size;
4573 4661                  break;
4574 4662          }
4575 4663  
4576 4664          case DIF_SUBR_INET_NTOA:
4577 4665          case DIF_SUBR_INET_NTOA6:
4578 4666          case DIF_SUBR_INET_NTOP: {
4579 4667                  size_t size;
4580 4668                  int af, argi, i;
↓ open down ↓ 1344 lines elided ↑ open up ↑
5925 6013                   * action loop will use the last iteration's value.
5926 6014                   */
5927 6015  #ifdef lint
5928 6016                  uint64_t val = 0;
5929 6017  #else
5930 6018                  uint64_t val;
5931 6019  #endif
5932 6020  
5933 6021                  mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE;
5934 6022                  mstate.dtms_access = DTRACE_ACCESS_ARGS | DTRACE_ACCESS_PROC;
     6023 +                mstate.dtms_getf = NULL;
     6024 +
5935 6025                  *flags &= ~CPU_DTRACE_ERROR;
5936 6026  
5937 6027                  if (prov == dtrace_provider) {
5938 6028                          /*
5939 6029                           * If dtrace itself is the provider of this probe,
5940 6030                           * we're only going to continue processing the ECB if
5941 6031                           * arg0 (the dtrace_state_t) is equal to the ECB's
5942 6032                           * creating state.  (This prevents disjoint consumers
5943 6033                           * from seeing one another's metaprobes.)
5944 6034                           */
↓ open down ↓ 2458 lines elided ↑ open up ↑
8403 8493                                  err += efunc(pc, "invalid subr %u\n", subr);
8404 8494                          if (rd >= nregs)
8405 8495                                  err += efunc(pc, "invalid register %u\n", rd);
8406 8496                          if (rd == 0)
8407 8497                                  err += efunc(pc, "cannot write to %r0\n");
8408 8498  
8409 8499                          if (subr == DIF_SUBR_COPYOUT ||
8410 8500                              subr == DIF_SUBR_COPYOUTSTR) {
8411 8501                                  dp->dtdo_destructive = 1;
8412 8502                          }
     8503 +
     8504 +                        if (subr == DIF_SUBR_GETF) {
     8505 +                                /*
     8506 +                                 * If we have a getf() we need to record that
     8507 +                                 * in our state.  Note that our state can be
     8508 +                                 * NULL if this is a helper -- but in that
     8509 +                                 * case, the call to getf() is itself illegal,
     8510 +                                 * and will be caught (slightly later) when
     8511 +                                 * the helper is validated.
     8512 +                                 */
     8513 +                                if (vstate->dtvs_state != NULL)
     8514 +                                        vstate->dtvs_state->dts_getf++;
     8515 +                        }
     8516 +
8413 8517                          break;
8414 8518                  case DIF_OP_PUSHTR:
8415 8519                          if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF)
8416 8520                                  err += efunc(pc, "invalid ref type %u\n", type);
8417 8521                          if (r2 >= nregs)
8418 8522                                  err += efunc(pc, "invalid register %u\n", r2);
8419 8523                          if (rs >= nregs)
8420 8524                                  err += efunc(pc, "invalid register %u\n", rs);
8421 8525                          break;
8422 8526                  case DIF_OP_PUSHTV:
↓ open down ↓ 4654 lines elided ↑ open up ↑
13077 13181          hdlr.cyh_level = CY_LOW_LEVEL;
13078 13182  
13079 13183          when.cyt_when = 0;
13080 13184          when.cyt_interval = dtrace_deadman_interval;
13081 13185  
13082 13186          state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
13083 13187          state->dts_deadman = cyclic_add(&hdlr, &when);
13084 13188  
13085 13189          state->dts_activity = DTRACE_ACTIVITY_WARMUP;
13086 13190  
     13191 +        if (state->dts_getf != 0 &&
     13192 +            !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) {
     13193 +                /*
     13194 +                 * We don't have kernel privs but we have at least one call
     13195 +                 * to getf(); we need to bump our zone's count, and (if
     13196 +                 * this is the first enabling to have an unprivileged call
     13197 +                 * to getf()) we need to hook into closef().
     13198 +                 */
     13199 +                state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf++;
     13200 +
     13201 +                if (dtrace_getf++ == 0) {
     13202 +                        ASSERT(dtrace_closef == NULL);
     13203 +                        dtrace_closef = dtrace_getf_barrier;
     13204 +                }
     13205 +        }
     13206 +
13087 13207          /*
13088 13208           * Now it's time to actually fire the BEGIN probe.  We need to disable
13089 13209           * interrupts here both to record the CPU on which we fired the BEGIN
13090 13210           * probe (the data from this CPU will be processed first at user
13091 13211           * level) and to manually activate the buffer for this CPU.
13092 13212           */
13093 13213          cookie = dtrace_interrupt_disable();
13094 13214          *cpu = CPU->cpu_id;
13095 13215          ASSERT(state->dts_buffer[*cpu].dtb_flags & DTRACEBUF_INACTIVE);
13096 13216          state->dts_buffer[*cpu].dtb_flags &= ~DTRACEBUF_INACTIVE;
↓ open down ↓ 96 lines elided ↑ open up ↑
13193 13313  
13194 13314          cookie = dtrace_interrupt_disable();
13195 13315          *cpu = CPU->cpu_id;
13196 13316          dtrace_probe(dtrace_probeid_end,
13197 13317              (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
13198 13318          dtrace_interrupt_enable(cookie);
13199 13319  
13200 13320          state->dts_activity = DTRACE_ACTIVITY_STOPPED;
13201 13321          dtrace_sync();
13202 13322  
     13323 +        if (state->dts_getf != 0 &&
     13324 +            !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) {
     13325 +                /*
     13326 +                 * We don't have kernel privs but we have at least one call
     13327 +                 * to getf(); we need to lower our zone's count, and (if
     13328 +                 * this is the last enabling to have an unprivileged call
     13329 +                 * to getf()) we need to clear the closef() hook.
     13330 +                 */
     13331 +                ASSERT(state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf > 0);
     13332 +                ASSERT(dtrace_closef == dtrace_getf_barrier);
     13333 +                ASSERT(dtrace_getf > 0);
     13334 +
     13335 +                state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf--;
     13336 +
     13337 +                if (--dtrace_getf == 0)
     13338 +                        dtrace_closef = NULL;
     13339 +        }
     13340 +
13203 13341          return (0);
13204 13342  }
13205 13343  
13206 13344  static int
13207 13345  dtrace_state_option(dtrace_state_t *state, dtrace_optid_t option,
13208 13346      dtrace_optval_t val)
13209 13347  {
13210 13348          ASSERT(MUTEX_HELD(&dtrace_lock));
13211 13349  
13212 13350          if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
↓ open down ↓ 1540 lines elided ↑ open up ↑
14753 14891          }
14754 14892  
14755 14893          ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_base == NULL);
14756 14894          ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_limit == NULL);
14757 14895  
14758 14896          dtrace_toxrange[dtrace_toxranges].dtt_base = base;
14759 14897          dtrace_toxrange[dtrace_toxranges].dtt_limit = limit;
14760 14898          dtrace_toxranges++;
14761 14899  }
14762 14900  
     14901 +static void
     14902 +dtrace_getf_barrier()
     14903 +{
     14904 +        /*
     14905 +         * When we have unprivileged (that is, non-DTRACE_CRV_KERNEL) enablings
     14906 +         * that contain calls to getf(), this routine will be called on every
     14907 +         * closef() before either the underlying vnode is released or the
     14908 +         * file_t itself is freed.  By the time we are here, it is essential
     14909 +         * that the file_t can no longer be accessed from a call to getf()
     14910 +         * in probe context -- that assures that a dtrace_sync() can be used
     14911 +         * to clear out any enablings referring to the old structures.
     14912 +         */
     14913 +        if (curthread->t_procp->p_zone->zone_dtrace_getf != 0 ||
     14914 +            kcred->cr_zone->zone_dtrace_getf != 0)
     14915 +                dtrace_sync();
     14916 +}
     14917 +
14763 14918  /*
14764 14919   * DTrace Driver Cookbook Functions
14765 14920   */
14766 14921  /*ARGSUSED*/
14767 14922  static int
14768 14923  dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
14769 14924  {
14770 14925          dtrace_provider_id_t id;
14771 14926          dtrace_state_t *state = NULL;
14772 14927          dtrace_enabling_t *enab;
↓ open down ↓ 1134 lines elided ↑ open up ↑
15907 16062          dtrace_cpu_init = NULL;
15908 16063          dtrace_helpers_cleanup = NULL;
15909 16064          dtrace_helpers_fork = NULL;
15910 16065          dtrace_cpustart_init = NULL;
15911 16066          dtrace_cpustart_fini = NULL;
15912 16067          dtrace_debugger_init = NULL;
15913 16068          dtrace_debugger_fini = NULL;
15914 16069          dtrace_modload = NULL;
15915 16070          dtrace_modunload = NULL;
15916 16071  
     16072 +        ASSERT(dtrace_getf == 0);
     16073 +        ASSERT(dtrace_closef == NULL);
     16074 +
15917 16075          mutex_exit(&cpu_lock);
15918 16076  
15919 16077          if (dtrace_helptrace_enabled) {
15920 16078                  kmem_free(dtrace_helptrace_buffer, dtrace_helptrace_bufsize);
15921 16079                  dtrace_helptrace_buffer = NULL;
15922 16080          }
15923 16081  
15924 16082          kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *));
15925 16083          dtrace_probes = NULL;
15926 16084          dtrace_nprobes = 0;
↓ open down ↓ 129 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX