2915 DTrace in a zone should see "cpu", "curpsinfo", et al
2916 DTrace in a zone should be able to access fds[]
2917 DTrace in a zone should have limited provider access
Reviewed by: Joshua M. Clulow <josh@sysmgr.org>
Reviewed by: Adam Leventhal <ahl@delphix.com>

@@ -169,10 +169,11 @@
 static int              dtrace_nprobes;         /* number of probes */
 static dtrace_provider_t *dtrace_provider;      /* provider list */
 static dtrace_meta_t    *dtrace_meta_pid;       /* user-land meta provider */
 static int              dtrace_opens;           /* number of opens */
 static int              dtrace_helpers;         /* number of helpers */
+static int              dtrace_getf;            /* number of unpriv getf()s */
 static void             *dtrace_softstate;      /* softstate pointer */
 static dtrace_hash_t    *dtrace_bymod;          /* probes hashed by module */
 static dtrace_hash_t    *dtrace_byfunc;         /* probes hashed by function */
 static dtrace_hash_t    *dtrace_byname;         /* probes hashed by name */
 static dtrace_toxrange_t *dtrace_toxrange;      /* toxic range array */

@@ -371,12 +372,12 @@
  * within the range of memory described by addr, sz.  We take care to avoid
  * problems with overflow and underflow of the unsigned quantities, and
  * disallow all negative sizes.  Ranges of size 0 are allowed.
  */
 #define DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
-        ((testaddr) - (baseaddr) < (basesz) && \
-        (testaddr) + (testsz) - (baseaddr) <= (basesz) && \
+        ((testaddr) - (uintptr_t)(baseaddr) < (basesz) && \
+        (testaddr) + (testsz) - (uintptr_t)(baseaddr) <= (basesz) && \
         (testaddr) + (testsz) >= (testaddr))
 
 /*
  * Test whether alloc_sz bytes will fit in the scratch region.  We isolate
  * alloc_sz on the righthand side of the comparison in order to avoid overflow

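The DTRACE_INRANGE() macro above leans on unsigned wraparound: the same subtraction that positions the test address relative to the base also rejects addresses below it, because the difference wraps to a huge value. The following standalone sketch of that technique uses illustrative names and test values that are not part of this change:

#include <stdint.h>
#include <stdio.h>

/*
 * Standalone illustration of the unsigned-wrap range test used by
 * DTRACE_INRANGE(): if testaddr lies below baseaddr, the subtraction
 * wraps to a huge unsigned value and fails the basesz comparison, so a
 * single compare handles both underflow and the upper bound.
 */
static int
in_range(uintptr_t testaddr, size_t testsz, uintptr_t baseaddr, size_t basesz)
{
	return (testaddr - baseaddr < basesz &&
	    testaddr + testsz - baseaddr <= basesz &&
	    testaddr + testsz >= testaddr);	/* reject testsz overflow */
}

int
main(void)
{
	char buf[64];
	uintptr_t base = (uintptr_t)buf;

	(void) printf("%d\n", in_range(base + 8, 16, base, sizeof (buf)));  /* 1 */
	(void) printf("%d\n", in_range(base - 8, 16, base, sizeof (buf)));  /* 0: below base */
	(void) printf("%d\n", in_range(base + 60, 16, base, sizeof (buf))); /* 0: runs past end */
	return (0);
}
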
@@ -473,10 +474,12 @@
     dtrace_state_t *, dtrace_mstate_t *);
 static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
     dtrace_optval_t);
 static int dtrace_ecb_create_enable(dtrace_probe_t *, void *);
 static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);
+static int dtrace_priv_proc(dtrace_state_t *, dtrace_mstate_t *);
+static void dtrace_getf_barrier(void);
 
 /*
  * DTrace Probe Context Functions
  *
  * These functions are called from probe context.  Because probe context is

@@ -617,11 +620,11 @@
         /*
          * Now check to see if it's a dynamic variable.  This check will pick
          * up both thread-local variables and any global dynamically-allocated
          * variables.
          */
-        if (DTRACE_INRANGE(addr, sz, (uintptr_t)vstate->dtvs_dynvars.dtds_base,
+        if (DTRACE_INRANGE(addr, sz, vstate->dtvs_dynvars.dtds_base,
             vstate->dtvs_dynvars.dtds_size)) {
                 dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
                 uintptr_t base = (uintptr_t)dstate->dtds_base +
                     (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
                 uintptr_t chunkoffs;

@@ -684,10 +687,11 @@
 static int
 dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
     dtrace_vstate_t *vstate)
 {
         volatile uintptr_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
+        file_t *fp;
 
         /*
          * If we hold the privilege to read from kernel memory, then
          * everything is readable.
          */

@@ -701,14 +705,103 @@
                 return (1);
 
         /*
          * We're allowed to read from our own string table.
          */
-        if (DTRACE_INRANGE(addr, sz, (uintptr_t)mstate->dtms_difo->dtdo_strtab,
+        if (DTRACE_INRANGE(addr, sz, mstate->dtms_difo->dtdo_strtab,
             mstate->dtms_difo->dtdo_strlen))
                 return (1);
 
+        if (vstate->dtvs_state != NULL &&
+            dtrace_priv_proc(vstate->dtvs_state, mstate)) {
+                proc_t *p;
+
+                /*
+                 * When we have privileges to the current process, there are
+                 * several context-related kernel structures that are safe to
+                 * read, even absent the privilege to read from kernel memory.
+                 * These reads are safe because these structures contain only
+                 * state that (1) we're permitted to read, (2) is harmless or
+                 * (3) contains pointers to additional kernel state that we're
+                 * not permitted to read (and as such, do not present an
+                 * opportunity for privilege escalation).  Finally (and
+                 * critically), because of the nature of their relation with
+                 * the current thread context, the memory associated with these
+                 * structures cannot change over the duration of probe context,
+                 * and it is therefore impossible for this memory to be
+                 * deallocated and reallocated as something else while it's
+                 * being operated upon.
+                 */
+                if (DTRACE_INRANGE(addr, sz, curthread, sizeof (kthread_t)))
+                        return (1);
+
+                if ((p = curthread->t_procp) != NULL && DTRACE_INRANGE(addr,
+                    sz, curthread->t_procp, sizeof (proc_t))) {
+                        return (1);
+                }
+
+                if (curthread->t_cred != NULL && DTRACE_INRANGE(addr, sz,
+                    curthread->t_cred, sizeof (cred_t))) {
+                        return (1);
+                }
+
+                if (p != NULL && p->p_pidp != NULL && DTRACE_INRANGE(addr, sz,
+                    &(p->p_pidp->pid_id), sizeof (pid_t))) {
+                        return (1);
+                }
+
+                if (curthread->t_cpu != NULL && DTRACE_INRANGE(addr, sz,
+                    curthread->t_cpu, offsetof(cpu_t, cpu_pause_thread))) {
+                        return (1);
+                }
+        }
+
+        if ((fp = mstate->dtms_getf) != NULL) {
+                uintptr_t psz = sizeof (void *);
+                vnode_t *vp;
+                vnodeops_t *op;
+
+                /*
+                 * When getf() returns a file_t, the enabling is implicitly
+                 * granted the (transient) right to read the returned file_t
+                 * as well as the v_path and v_op->vnop_name of the underlying
+                 * vnode.  These accesses are allowed after a successful
+                 * getf() because the members that they refer to cannot change
+                 * once set -- and the barrier logic in the kernel's closef()
+                 * path assures that the file_t and its referenced vnode_t
+                 * cannot themselves be stale (that is, it is impossible for
+                 * either dtms_getf itself or its f_vnode member to reference
+                 * freed memory).
+                 */
+                if (DTRACE_INRANGE(addr, sz, fp, sizeof (file_t)))
+                        return (1);
+
+                if ((vp = fp->f_vnode) != NULL) {
+                        if (DTRACE_INRANGE(addr, sz, &vp->v_path, psz))
+                                return (1);
+
+                        if (vp->v_path != NULL && DTRACE_INRANGE(addr, sz,
+                            vp->v_path, strlen(vp->v_path) + 1)) {
+                                return (1);
+                        }
+
+                        if (DTRACE_INRANGE(addr, sz, &vp->v_op, psz))
+                                return (1);
+
+                        if ((op = vp->v_op) != NULL &&
+                            DTRACE_INRANGE(addr, sz, &op->vnop_name, psz)) {
+                                return (1);
+                        }
+
+                        if (op != NULL && op->vnop_name != NULL &&
+                            DTRACE_INRANGE(addr, sz, op->vnop_name,
+                            strlen(op->vnop_name) + 1)) {
+                                return (1);
+                        }
+                }
+        }
+
         DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
         *illval = addr;
         return (0);
 }
 

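One detail of the dtrace_canload() additions above is worth calling out: the cpu_t window is bounded with offsetof(cpu_t, cpu_pause_thread) rather than sizeof (cpu_t), so only the leading portion of the structure is readable. Below is a hypothetical sketch of that prefix-bounding idea; the structure and member names are made up for illustration and are not the kernel's:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Hypothetical structure standing in for cpu_t: members up to (but not
 * including) the first field that must stay unreadable are exposed, so
 * the window passed to the range check is offsetof(), not sizeof ().
 */
typedef struct fake_cpu {
	int	fc_id;
	int	fc_flags;
	void	*fc_pause_thread;	/* start of the unreadable tail */
} fake_cpu_t;

static int
in_range(uintptr_t testaddr, size_t testsz, uintptr_t baseaddr, size_t basesz)
{
	return (testaddr - baseaddr < basesz &&
	    testaddr + testsz - baseaddr <= basesz &&
	    testaddr + testsz >= testaddr);
}

int
main(void)
{
	fake_cpu_t c = { 1, 0, NULL };
	uintptr_t base = (uintptr_t)&c;
	size_t window = offsetof(fake_cpu_t, fc_pause_thread);

	/* The leading members fall inside the window ... */
	(void) printf("%d\n", in_range((uintptr_t)&c.fc_flags,
	    sizeof (c.fc_flags), base, window));		/* 1 */
	/* ... the pointer in the tail does not. */
	(void) printf("%d\n", in_range((uintptr_t)&c.fc_pause_thread,
	    sizeof (c.fc_pause_thread), base, window));		/* 0 */
	return (0);
}
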
@@ -1083,12 +1176,11 @@
          * We should always have a non-NULL state cred here, since if cred
          * is null (anonymous tracing), we fast-path bypass this routine.
          */
         ASSERT(s_cr != NULL);
 
-        if ((cr = CRED()) != NULL &&
-            s_cr->cr_zone == cr->cr_zone)
+        if ((cr = CRED()) != NULL && s_cr->cr_zone == cr->cr_zone)
                 return (1);
 
         return (0);
 }
 

@@ -1207,23 +1299,21 @@
 
         if (pops->dtps_mode != NULL) {
                 mode = pops->dtps_mode(prov->dtpv_arg,
                     probe->dtpr_id, probe->dtpr_arg);
 
-                ASSERT((mode & DTRACE_MODE_USER) ||
-                    (mode & DTRACE_MODE_KERNEL));
-                ASSERT((mode & DTRACE_MODE_NOPRIV_RESTRICT) ||
-                    (mode & DTRACE_MODE_NOPRIV_DROP));
+                ASSERT(mode & (DTRACE_MODE_USER | DTRACE_MODE_KERNEL));
+                ASSERT(mode & (DTRACE_MODE_NOPRIV_RESTRICT |
+                    DTRACE_MODE_NOPRIV_DROP));
         }
 
         /*
          * If the dte_cond bits indicate that this consumer is only allowed to
-         * see user-mode firings of this probe, call the provider's dtps_mode()
-         * entry point to check that the probe was fired while in a user
-         * context.  If that's not the case, use the policy specified by the
-         * provider to determine if we drop the probe or merely restrict
-         * operation.
+         * see user-mode firings of this probe, check that the probe was fired
+         * while in a user context.  If that's not the case, use the policy
+         * specified by the provider to determine if we drop the probe or
+         * merely restrict operation.
          */
         if (ecb->dte_cond & DTRACE_COND_USERMODE) {
                 ASSERT(mode != DTRACE_MODE_NOPRIV_DROP);
 
                 if (!(mode & DTRACE_MODE_USER)) {

@@ -1286,10 +1376,19 @@
                         mstate->dtms_access &=
                             ~(DTRACE_ACCESS_PROC | DTRACE_ACCESS_ARGS);
                 }
         }
 
+        /*
+         * By merits of being in this code path at all, we have limited
+         * privileges.  If the provider has indicated that limited privileges
+         * are to denote restricted operation, strip off the ability to access
+         * arguments.
+         */
+        if (mode & DTRACE_MODE_LIMITEDPRIV_RESTRICT)
+                mstate->dtms_access &= ~DTRACE_ACCESS_ARGS;
+
         return (1);
 }
 
 /*
  * Note:  not called from probe context.  This function is called

@@ -2922,11 +3021,11 @@
 
                 return (rval);
         }
 
         case DIF_VAR_CURTHREAD:
-                if (!dtrace_priv_kernel(state))
+                if (!dtrace_priv_proc(state, mstate))
                         return (0);
                 return ((uint64_t)(uintptr_t)curthread);
 
         case DIF_VAR_TIMESTAMP:
                 if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {

@@ -4450,15 +4549,39 @@
                 regs[rd] = (uintptr_t)dest;
                 mstate->dtms_scratch_ptr += size;
                 break;
         }
 
+        case DIF_SUBR_GETF: {
+                uintptr_t fd = tupregs[0].dttk_value;
+                uf_info_t *finfo = &curthread->t_procp->p_user.u_finfo;
+                file_t *fp;
+
+                if (!dtrace_priv_proc(state, mstate)) {
+                        regs[rd] = NULL;
+                        break;
+                }
+
+                /*
+                 * This is safe because fi_nfiles only increases, and the
+                 * fi_list array is not freed when the array size doubles.
+                 * (See the comment in flist_grow() for details on the
+                 * management of the u_finfo structure.)
+                 */
+                fp = fd < finfo->fi_nfiles ? finfo->fi_list[fd].uf_file : NULL;
+
+                mstate->dtms_getf = fp;
+                regs[rd] = (uintptr_t)fp;
+                break;
+        }
+
         case DIF_SUBR_CLEANPATH: {
                 char *dest = (char *)mstate->dtms_scratch_ptr, c;
                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
                 uintptr_t src = tupregs[0].dttk_value;
                 int i = 0, j = 0;
+                zone_t *z;
 
                 if (!dtrace_strcanload(src, size, mstate, vstate)) {
                         regs[rd] = NULL;
                         break;
                 }

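The getf() subroutine added above does its lookup without blocking in probe context: it bounds-checks the descriptor against fi_nfiles and yields NULL otherwise. Below is a minimal sketch of that lookup shape with made-up stand-in types (not the kernel's uf_info_t); as the comment above notes, the real safety argument rests on fi_nfiles only growing and fi_list never being freed when it is resized.

#include <stddef.h>
#include <stdint.h>

/*
 * Stand-in types for the probe-context fd lookup: validate the index
 * against the current table size and return NULL for anything out of
 * range or unopened, rather than faulting.
 */
typedef struct fake_file {
	int	ff_flag;
} fake_file_t;

typedef struct fake_finfo {
	size_t		fi_nfiles;	/* only ever increases */
	fake_file_t	**fi_list;	/* slot is NULL if the fd is closed */
} fake_finfo_t;

static fake_file_t *
lookup_fd(const fake_finfo_t *finfo, uint64_t fd)
{
	return (fd < finfo->fi_nfiles ? finfo->fi_list[fd] : NULL);
}

int
main(void)
{
	fake_file_t f = { 0 };
	fake_file_t *slots[4] = { &f, NULL, NULL, NULL };
	fake_finfo_t fi = { 4, slots };

	return (lookup_fd(&fi, 0) == &f && lookup_fd(&fi, 99) == NULL ? 0 : 1);
}
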
@@ -4553,10 +4676,27 @@
                         if (c == '\0')
                                 dest[++j] = '/';
                 } while (c != '\0');
 
                 dest[j] = '\0';
+
+                if (mstate->dtms_getf != NULL &&
+                    !(mstate->dtms_access & DTRACE_ACCESS_KERNEL) &&
+                    (z = state->dts_cred.dcr_cred->cr_zone) != kcred->cr_zone) {
+                        /*
+                         * If we've done a getf() as a part of this ECB and we
+                         * don't have kernel access (and we're not in the global
+                         * zone), check if the path we cleaned up begins with
+                         * the zone's root path, and trim it off if so.  Note
+                         * that this is an output cleanliness issue, not a
+                         * security issue: knowing one's zone root path does
+                         * not enable privilege escalation.
+                         */
+                        if (strstr(dest, z->zone_rootpath) == dest)
+                                dest += strlen(z->zone_rootpath) - 1;
+                }
+
                 regs[rd] = (uintptr_t)dest;
                 mstate->dtms_scratch_ptr += size;
                 break;
         }
 

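The zone-root trimming added to cleanpath() above advances past the zone's root path but backs up by one character so the result still begins with '/'. A minimal standalone sketch follows; the zone root used here is a made-up example and is assumed to end in '/':

#include <stdio.h>
#include <string.h>

/*
 * Minimal sketch of the trimming performed after cleanpath(): if the
 * cleaned path begins with the zone's root path (assumed here to end in
 * '/'), skip past it but keep that final '/' so the result remains an
 * absolute path from the zone's point of view.
 */
static const char *
trim_zone_root(const char *dest, const char *rootpath)
{
	if (strstr(dest, rootpath) == dest)
		dest += strlen(rootpath) - 1;
	return (dest);
}

int
main(void)
{
	/* prints "/etc/passwd"; the zone root is illustrative */
	(void) printf("%s\n", trim_zone_root("/zones/web01/root/etc/passwd",
	    "/zones/web01/root/"));
	return (0);
}
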
@@ -4937,75 +5077,54 @@
                 case DIF_OP_BLEU:
                         if (cc_c | cc_z)
                                 pc = DIF_INSTR_LABEL(instr);
                         break;
                 case DIF_OP_RLDSB:
-                        if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
-                                *flags |= CPU_DTRACE_KPRIV;
-                                *illval = regs[r1];
+                        if (!dtrace_canload(regs[r1], 1, mstate, vstate))
                                 break;
-                        }
                         /*FALLTHROUGH*/
                 case DIF_OP_LDSB:
                         regs[rd] = (int8_t)dtrace_load8(regs[r1]);
                         break;
                 case DIF_OP_RLDSH:
-                        if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
-                                *flags |= CPU_DTRACE_KPRIV;
-                                *illval = regs[r1];
+                        if (!dtrace_canload(regs[r1], 2, mstate, vstate))
                                 break;
-                        }
                         /*FALLTHROUGH*/
                 case DIF_OP_LDSH:
                         regs[rd] = (int16_t)dtrace_load16(regs[r1]);
                         break;
                 case DIF_OP_RLDSW:
-                        if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
-                                *flags |= CPU_DTRACE_KPRIV;
-                                *illval = regs[r1];
+                        if (!dtrace_canload(regs[r1], 4, mstate, vstate))
                                 break;
-                        }
                         /*FALLTHROUGH*/
                 case DIF_OP_LDSW:
                         regs[rd] = (int32_t)dtrace_load32(regs[r1]);
                         break;
                 case DIF_OP_RLDUB:
-                        if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
-                                *flags |= CPU_DTRACE_KPRIV;
-                                *illval = regs[r1];
+                        if (!dtrace_canload(regs[r1], 1, mstate, vstate))
                                 break;
-                        }
                         /*FALLTHROUGH*/
                 case DIF_OP_LDUB:
                         regs[rd] = dtrace_load8(regs[r1]);
                         break;
                 case DIF_OP_RLDUH:
-                        if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
-                                *flags |= CPU_DTRACE_KPRIV;
-                                *illval = regs[r1];
+                        if (!dtrace_canload(regs[r1], 2, mstate, vstate))
                                 break;
-                        }
                         /*FALLTHROUGH*/
                 case DIF_OP_LDUH:
                         regs[rd] = dtrace_load16(regs[r1]);
                         break;
                 case DIF_OP_RLDUW:
-                        if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
-                                *flags |= CPU_DTRACE_KPRIV;
-                                *illval = regs[r1];
+                        if (!dtrace_canload(regs[r1], 4, mstate, vstate))
                                 break;
-                        }
                         /*FALLTHROUGH*/
                 case DIF_OP_LDUW:
                         regs[rd] = dtrace_load32(regs[r1]);
                         break;
                 case DIF_OP_RLDX:
-                        if (!dtrace_canstore(regs[r1], 8, mstate, vstate)) {
-                                *flags |= CPU_DTRACE_KPRIV;
-                                *illval = regs[r1];
+                        if (!dtrace_canload(regs[r1], 8, mstate, vstate))
                                 break;
-                        }
                         /*FALLTHROUGH*/
                 case DIF_OP_LDX:
                         regs[rd] = dtrace_load64(regs[r1]);
                         break;
                 case DIF_OP_ULDSB:

@@ -5940,10 +6059,12 @@
                 uint64_t val;
 #endif
 
                 mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE;
                 mstate.dtms_access = DTRACE_ACCESS_ARGS | DTRACE_ACCESS_PROC;
+                mstate.dtms_getf = NULL;
+
                 *flags &= ~CPU_DTRACE_ERROR;
 
                 if (prov == dtrace_provider) {
                         /*
                          * If dtrace itself is the provider of this probe,

@@ -8446,10 +8567,24 @@
 
                         if (subr == DIF_SUBR_COPYOUT ||
                             subr == DIF_SUBR_COPYOUTSTR) {
                                 dp->dtdo_destructive = 1;
                         }
+
+                        if (subr == DIF_SUBR_GETF) {
+                                /*
+                                 * If we have a getf() we need to record that
+                                 * in our state.  Note that our state can be
+                                 * NULL if this is a helper -- but in that
+                                 * case, the call to getf() is itself illegal,
+                                 * and will be caught (slightly later) when
+                                 * the helper is validated.
+                                 */
+                                if (vstate->dtvs_state != NULL)
+                                        vstate->dtvs_state->dts_getf++;
+                        }
+
                         break;
                 case DIF_OP_PUSHTR:
                         if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF)
                                 err += efunc(pc, "invalid ref type %u\n", type);
                         if (r2 >= nregs)

@@ -13088,10 +13223,26 @@
         state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
         state->dts_deadman = cyclic_add(&hdlr, &when);
 
         state->dts_activity = DTRACE_ACTIVITY_WARMUP;
 
+        if (state->dts_getf != 0 &&
+            !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) {
+                /*
+                 * We don't have kernel privs but we have at least one call
+                 * to getf(); we need to bump our zone's count, and (if
+                 * this is the first enabling to have an unprivileged call
+                 * to getf()) we need to hook into closef().
+                 */
+                state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf++;
+
+                if (dtrace_getf++ == 0) {
+                        ASSERT(dtrace_closef == NULL);
+                        dtrace_closef = dtrace_getf_barrier;
+                }
+        }
+
         /*
          * Now it's time to actually fire the BEGIN probe.  We need to disable
          * interrupts here both to record the CPU on which we fired the BEGIN
          * probe (the data from this CPU will be processed first at user
          * level) and to manually activate the buffer for this CPU.

@@ -13204,10 +13355,28 @@
         dtrace_interrupt_enable(cookie);
 
         state->dts_activity = DTRACE_ACTIVITY_STOPPED;
         dtrace_sync();
 
+        if (state->dts_getf != 0 &&
+            !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) {
+                /*
+                 * We don't have kernel privs but we have at least one call
+                 * to getf(); we need to lower our zone's count, and (if
+                 * this is the last enabling to have an unprivileged call
+                 * to getf()) we need to clear the closef() hook.
+                 */
+                ASSERT(state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf > 0);
+                ASSERT(dtrace_closef == dtrace_getf_barrier);
+                ASSERT(dtrace_getf > 0);
+
+                state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf--;
+
+                if (--dtrace_getf == 0)
+                        dtrace_closef = NULL;
+        }
+
         return (0);
 }
 
 static int
 dtrace_state_option(dtrace_state_t *state, dtrace_optid_t option,

@@ -14764,10 +14933,27 @@
         dtrace_toxrange[dtrace_toxranges].dtt_base = base;
         dtrace_toxrange[dtrace_toxranges].dtt_limit = limit;
         dtrace_toxranges++;
 }
 
+static void
+dtrace_getf_barrier()
+{
+        /*
+         * When we have unprivileged (that is, non-DTRACE_CRV_KERNEL) enablings
+         * that contain calls to getf(), this routine will be called on every
+         * closef() before either the underlying vnode is released or the
+         * file_t itself is freed.  By the time we are here, it is essential
+         * that the file_t can no longer be accessed from a call to getf()
+         * in probe context -- that assures that a dtrace_sync() can be used
+         * to clear out any enablings referring to the old structures.
+         */
+        if (curthread->t_procp->p_zone->zone_dtrace_getf != 0 ||
+            kcred->cr_zone->zone_dtrace_getf != 0)
+                dtrace_sync();
+}
+
 /*
  * DTrace Driver Cookbook Functions
  */
 /*ARGSUSED*/
 static int

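Taken together, the dts_getf accounting in dtrace_state_go() and dtrace_state_stop() and the dtrace_getf_barrier() routine above form a counted hook: the first unprivileged enabling that uses getf() installs the closef() hook, and the last one to go away removes it. The sketch below shows that pattern in isolation with illustrative names (not the kernel's); the hook body stands in for the dtrace_sync() call in dtrace_getf_barrier().

#include <assert.h>
#include <stddef.h>

/*
 * Simplified sketch of the counted-hook pattern: install the hook when
 * the first user arrives, remove it when the last one leaves.
 */
static void (*close_hook)(void);
static int hook_users;

static void
barrier(void)
{
	/* stand-in for dtrace_sync() */
}

static void
hook_hold(void)
{
	if (hook_users++ == 0) {
		assert(close_hook == NULL);
		close_hook = barrier;
	}
}

static void
hook_rele(void)
{
	assert(hook_users > 0);
	if (--hook_users == 0)
		close_hook = NULL;
}

int
main(void)
{
	hook_hold();
	hook_hold();
	hook_rele();
	assert(close_hook == barrier);	/* still one user */
	hook_rele();
	assert(close_hook == NULL);	/* last user removed the hook */
	return (0);
}
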
@@ -15920,10 +16106,13 @@
         dtrace_debugger_init = NULL;
         dtrace_debugger_fini = NULL;
         dtrace_modload = NULL;
         dtrace_modunload = NULL;
 
+        ASSERT(dtrace_getf == 0);
+        ASSERT(dtrace_closef == NULL);
+
         mutex_exit(&cpu_lock);
 
         if (dtrace_helptrace_enabled) {
                 kmem_free(dtrace_helptrace_buffer, dtrace_helptrace_bufsize);
                 dtrace_helptrace_buffer = NULL;