Print this page
2915 DTrace in a zone should see "cpu", "curpsinfo", et al
2916 DTrace in a zone should be able to access fds[]
2917 DTrace in a zone should have limited provider access
Reviewed by: Joshua M. Clulow <josh@sysmgr.org>
Reviewed by: Adam Leventhal <ahl@delphix.com>

*** 169,178 **** --- 169,179 ---- static int dtrace_nprobes; /* number of probes */ static dtrace_provider_t *dtrace_provider; /* provider list */ static dtrace_meta_t *dtrace_meta_pid; /* user-land meta provider */ static int dtrace_opens; /* number of opens */ static int dtrace_helpers; /* number of helpers */ + static int dtrace_getf; /* number of unpriv getf()s */ static void *dtrace_softstate; /* softstate pointer */ static dtrace_hash_t *dtrace_bymod; /* probes hashed by module */ static dtrace_hash_t *dtrace_byfunc; /* probes hashed by function */ static dtrace_hash_t *dtrace_byname; /* probes hashed by name */ static dtrace_toxrange_t *dtrace_toxrange; /* toxic range array */
*** 371,382 **** * within the range of memory described by addr, sz. We take care to avoid * problems with overflow and underflow of the unsigned quantities, and * disallow all negative sizes. Ranges of size 0 are allowed. */ #define DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \ ! ((testaddr) - (baseaddr) < (basesz) && \ ! (testaddr) + (testsz) - (baseaddr) <= (basesz) && \ (testaddr) + (testsz) >= (testaddr)) /* * Test whether alloc_sz bytes will fit in the scratch region. We isolate * alloc_sz on the righthand side of the comparison in order to avoid overflow --- 372,383 ---- * within the range of memory described by addr, sz. We take care to avoid * problems with overflow and underflow of the unsigned quantities, and * disallow all negative sizes. Ranges of size 0 are allowed. */ #define DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \ ! ((testaddr) - (uintptr_t)(baseaddr) < (basesz) && \ ! (testaddr) + (testsz) - (uintptr_t)(baseaddr) <= (basesz) && \ (testaddr) + (testsz) >= (testaddr)) /* * Test whether alloc_sz bytes will fit in the scratch region. We isolate * alloc_sz on the righthand side of the comparison in order to avoid overflow
*** 473,482 **** --- 474,485 ---- dtrace_state_t *, dtrace_mstate_t *); static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t, dtrace_optval_t); static int dtrace_ecb_create_enable(dtrace_probe_t *, void *); static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *); + static int dtrace_priv_proc(dtrace_state_t *, dtrace_mstate_t *); + static void dtrace_getf_barrier(void); /* * DTrace Probe Context Functions * * These functions are called from probe context. Because probe context is
*** 617,627 **** /* * Now check to see if it's a dynamic variable. This check will pick * up both thread-local variables and any global dynamically-allocated * variables. */ ! if (DTRACE_INRANGE(addr, sz, (uintptr_t)vstate->dtvs_dynvars.dtds_base, vstate->dtvs_dynvars.dtds_size)) { dtrace_dstate_t *dstate = &vstate->dtvs_dynvars; uintptr_t base = (uintptr_t)dstate->dtds_base + (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t)); uintptr_t chunkoffs; --- 620,630 ---- /* * Now check to see if it's a dynamic variable. This check will pick * up both thread-local variables and any global dynamically-allocated * variables. */ ! if (DTRACE_INRANGE(addr, sz, vstate->dtvs_dynvars.dtds_base, vstate->dtvs_dynvars.dtds_size)) { dtrace_dstate_t *dstate = &vstate->dtvs_dynvars; uintptr_t base = (uintptr_t)dstate->dtds_base + (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t)); uintptr_t chunkoffs;
*** 684,693 **** --- 687,697 ---- static int dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate, dtrace_vstate_t *vstate) { volatile uintptr_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval; + file_t *fp; /* * If we hold the privilege to read from kernel memory, then * everything is readable. */
*** 701,714 **** return (1); /* * We're allowed to read from our own string table. */ ! if (DTRACE_INRANGE(addr, sz, (uintptr_t)mstate->dtms_difo->dtdo_strtab, mstate->dtms_difo->dtdo_strlen)) return (1); DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV); *illval = addr; return (0); } --- 705,807 ---- return (1); /* * We're allowed to read from our own string table. */ ! if (DTRACE_INRANGE(addr, sz, mstate->dtms_difo->dtdo_strtab, mstate->dtms_difo->dtdo_strlen)) return (1); + if (vstate->dtvs_state != NULL && + dtrace_priv_proc(vstate->dtvs_state, mstate)) { + proc_t *p; + + /* + * When we have privileges to the current process, there are + * several context-related kernel structures that are safe to + * read, even absent the privilege to read from kernel memory. + * These reads are safe because these structures contain only + * state that (1) we're permitted to read, (2) is harmless or + * (3) contains pointers to additional kernel state that we're + * not permitted to read (and as such, do not present an + * opportunity for privilege escalation). Finally (and + * critically), because of the nature of their relation with + * the current thread context, the memory associated with these + * structures cannot change over the duration of probe context, + * and it is therefore impossible for this memory to be + * deallocated and reallocated as something else while it's + * being operated upon. + */ + if (DTRACE_INRANGE(addr, sz, curthread, sizeof (kthread_t))) + return (1); + + if ((p = curthread->t_procp) != NULL && DTRACE_INRANGE(addr, + sz, curthread->t_procp, sizeof (proc_t))) { + return (1); + } + + if (curthread->t_cred != NULL && DTRACE_INRANGE(addr, sz, + curthread->t_cred, sizeof (cred_t))) { + return (1); + } + + if (p != NULL && p->p_pidp != NULL && DTRACE_INRANGE(addr, sz, + &(p->p_pidp->pid_id), sizeof (pid_t))) { + return (1); + } + + if (curthread->t_cpu != NULL && DTRACE_INRANGE(addr, sz, + curthread->t_cpu, offsetof(cpu_t, cpu_pause_thread))) { + return (1); + } + } + + if ((fp = mstate->dtms_getf) != NULL) { + uintptr_t psz = sizeof (void *); + vnode_t *vp; + vnodeops_t *op; + + /* + * When getf() returns a file_t, the enabling is implicitly + * granted the (transient) right to read the returned file_t + * as well as the v_path and v_op->vnop_name of the underlying + * vnode. These accesses are allowed after a successful + * getf() because the members that they refer to cannot change + * once set -- and the barrier logic in the kernel's closef() + * path assures that the file_t and its referenced vode_t + * cannot themselves be stale (that is, it impossible for + * either dtms_getf itself or its f_vnode member to reference + * freed memory). + */ + if (DTRACE_INRANGE(addr, sz, fp, sizeof (file_t))) + return (1); + + if ((vp = fp->f_vnode) != NULL) { + if (DTRACE_INRANGE(addr, sz, &vp->v_path, psz)) + return (1); + + if (vp->v_path != NULL && DTRACE_INRANGE(addr, sz, + vp->v_path, strlen(vp->v_path) + 1)) { + return (1); + } + + if (DTRACE_INRANGE(addr, sz, &vp->v_op, psz)) + return (1); + + if ((op = vp->v_op) != NULL && + DTRACE_INRANGE(addr, sz, &op->vnop_name, psz)) { + return (1); + } + + if (op != NULL && op->vnop_name != NULL && + DTRACE_INRANGE(addr, sz, op->vnop_name, + strlen(op->vnop_name) + 1)) { + return (1); + } + } + } + DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV); *illval = addr; return (0); }
*** 1083,1094 **** * We should always have a non-NULL state cred here, since if cred * is null (anonymous tracing), we fast-path bypass this routine. */ ASSERT(s_cr != NULL); ! if ((cr = CRED()) != NULL && ! s_cr->cr_zone == cr->cr_zone) return (1); return (0); } --- 1176,1186 ---- * We should always have a non-NULL state cred here, since if cred * is null (anonymous tracing), we fast-path bypass this routine. */ ASSERT(s_cr != NULL); ! if ((cr = CRED()) != NULL && s_cr->cr_zone == cr->cr_zone) return (1); return (0); }
*** 1207,1229 **** if (pops->dtps_mode != NULL) { mode = pops->dtps_mode(prov->dtpv_arg, probe->dtpr_id, probe->dtpr_arg); ! ASSERT((mode & DTRACE_MODE_USER) || ! (mode & DTRACE_MODE_KERNEL)); ! ASSERT((mode & DTRACE_MODE_NOPRIV_RESTRICT) || ! (mode & DTRACE_MODE_NOPRIV_DROP)); } /* * If the dte_cond bits indicate that this consumer is only allowed to ! * see user-mode firings of this probe, call the provider's dtps_mode() ! * entry point to check that the probe was fired while in a user ! * context. If that's not the case, use the policy specified by the ! * provider to determine if we drop the probe or merely restrict ! * operation. */ if (ecb->dte_cond & DTRACE_COND_USERMODE) { ASSERT(mode != DTRACE_MODE_NOPRIV_DROP); if (!(mode & DTRACE_MODE_USER)) { --- 1299,1319 ---- if (pops->dtps_mode != NULL) { mode = pops->dtps_mode(prov->dtpv_arg, probe->dtpr_id, probe->dtpr_arg); ! ASSERT(mode & (DTRACE_MODE_USER | DTRACE_MODE_KERNEL)); ! ASSERT(mode & (DTRACE_MODE_NOPRIV_RESTRICT | ! DTRACE_MODE_NOPRIV_DROP)); } /* * If the dte_cond bits indicate that this consumer is only allowed to ! * see user-mode firings of this probe, check that the probe was fired ! * while in a user context. If that's not the case, use the policy ! * specified by the provider to determine if we drop the probe or ! * merely restrict operation. */ if (ecb->dte_cond & DTRACE_COND_USERMODE) { ASSERT(mode != DTRACE_MODE_NOPRIV_DROP); if (!(mode & DTRACE_MODE_USER)) {
*** 1286,1295 **** --- 1376,1394 ---- mstate->dtms_access &= ~(DTRACE_ACCESS_PROC | DTRACE_ACCESS_ARGS); } } + /* + * By merits of being in this code path at all, we have limited + * privileges. If the provider has indicated that limited privileges + * are to denote restricted operation, strip off the ability to access + * arguments. + */ + if (mode & DTRACE_MODE_LIMITEDPRIV_RESTRICT) + mstate->dtms_access &= ~DTRACE_ACCESS_ARGS; + return (1); } /* * Note: not called from probe context. This function is called
*** 2922,2932 **** return (rval); } case DIF_VAR_CURTHREAD: ! if (!dtrace_priv_kernel(state)) return (0); return ((uint64_t)(uintptr_t)curthread); case DIF_VAR_TIMESTAMP: if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) { --- 3021,3031 ---- return (rval); } case DIF_VAR_CURTHREAD: ! if (!dtrace_priv_proc(state, mstate)) return (0); return ((uint64_t)(uintptr_t)curthread); case DIF_VAR_TIMESTAMP: if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
*** 4450,4464 **** --- 4549,4587 ---- regs[rd] = (uintptr_t)dest; mstate->dtms_scratch_ptr += size; break; } + case DIF_SUBR_GETF: { + uintptr_t fd = tupregs[0].dttk_value; + uf_info_t *finfo = &curthread->t_procp->p_user.u_finfo; + file_t *fp; + + if (!dtrace_priv_proc(state, mstate)) { + regs[rd] = NULL; + break; + } + + /* + * This is safe because fi_nfiles only increases, and the + * fi_list array is not freed when the array size doubles. + * (See the comment in flist_grow() for details on the + * management of the u_finfo structure.) + */ + fp = fd < finfo->fi_nfiles ? finfo->fi_list[fd].uf_file : NULL; + + mstate->dtms_getf = fp; + regs[rd] = (uintptr_t)fp; + break; + } + case DIF_SUBR_CLEANPATH: { char *dest = (char *)mstate->dtms_scratch_ptr, c; uint64_t size = state->dts_options[DTRACEOPT_STRSIZE]; uintptr_t src = tupregs[0].dttk_value; int i = 0, j = 0; + zone_t *z; if (!dtrace_strcanload(src, size, mstate, vstate)) { regs[rd] = NULL; break; }
*** 4553,4562 **** --- 4676,4702 ---- if (c == '\0') dest[++j] = '/'; } while (c != '\0'); dest[j] = '\0'; + + if (mstate->dtms_getf != NULL && + !(mstate->dtms_access & DTRACE_ACCESS_KERNEL) && + (z = state->dts_cred.dcr_cred->cr_zone) != kcred->cr_zone) { + /* + * If we've done a getf() as a part of this ECB and we + * don't have kernel access (and we're not in the global + * zone), check if the path we cleaned up begins with + * the zone's root path, and trim it off if so. Note + * that this is an output cleanliness issue, not a + * security issue: knowing one's zone root path does + * not enable privilege escalation. + */ + if (strstr(dest, z->zone_rootpath) == dest) + dest += strlen(z->zone_rootpath) - 1; + } + regs[rd] = (uintptr_t)dest; mstate->dtms_scratch_ptr += size; break; }
*** 4937,5011 **** case DIF_OP_BLEU: if (cc_c | cc_z) pc = DIF_INSTR_LABEL(instr); break; case DIF_OP_RLDSB: ! if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) { ! *flags |= CPU_DTRACE_KPRIV; ! *illval = regs[r1]; break; - } /*FALLTHROUGH*/ case DIF_OP_LDSB: regs[rd] = (int8_t)dtrace_load8(regs[r1]); break; case DIF_OP_RLDSH: ! if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) { ! *flags |= CPU_DTRACE_KPRIV; ! *illval = regs[r1]; break; - } /*FALLTHROUGH*/ case DIF_OP_LDSH: regs[rd] = (int16_t)dtrace_load16(regs[r1]); break; case DIF_OP_RLDSW: ! if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) { ! *flags |= CPU_DTRACE_KPRIV; ! *illval = regs[r1]; break; - } /*FALLTHROUGH*/ case DIF_OP_LDSW: regs[rd] = (int32_t)dtrace_load32(regs[r1]); break; case DIF_OP_RLDUB: ! if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) { ! *flags |= CPU_DTRACE_KPRIV; ! *illval = regs[r1]; break; - } /*FALLTHROUGH*/ case DIF_OP_LDUB: regs[rd] = dtrace_load8(regs[r1]); break; case DIF_OP_RLDUH: ! if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) { ! *flags |= CPU_DTRACE_KPRIV; ! *illval = regs[r1]; break; - } /*FALLTHROUGH*/ case DIF_OP_LDUH: regs[rd] = dtrace_load16(regs[r1]); break; case DIF_OP_RLDUW: ! if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) { ! *flags |= CPU_DTRACE_KPRIV; ! *illval = regs[r1]; break; - } /*FALLTHROUGH*/ case DIF_OP_LDUW: regs[rd] = dtrace_load32(regs[r1]); break; case DIF_OP_RLDX: ! if (!dtrace_canstore(regs[r1], 8, mstate, vstate)) { ! *flags |= CPU_DTRACE_KPRIV; ! *illval = regs[r1]; break; - } /*FALLTHROUGH*/ case DIF_OP_LDX: regs[rd] = dtrace_load64(regs[r1]); break; case DIF_OP_ULDSB: --- 5077,5130 ---- case DIF_OP_BLEU: if (cc_c | cc_z) pc = DIF_INSTR_LABEL(instr); break; case DIF_OP_RLDSB: ! if (!dtrace_canload(regs[r1], 1, mstate, vstate)) break; /*FALLTHROUGH*/ case DIF_OP_LDSB: regs[rd] = (int8_t)dtrace_load8(regs[r1]); break; case DIF_OP_RLDSH: ! if (!dtrace_canload(regs[r1], 2, mstate, vstate)) break; /*FALLTHROUGH*/ case DIF_OP_LDSH: regs[rd] = (int16_t)dtrace_load16(regs[r1]); break; case DIF_OP_RLDSW: ! if (!dtrace_canload(regs[r1], 4, mstate, vstate)) break; /*FALLTHROUGH*/ case DIF_OP_LDSW: regs[rd] = (int32_t)dtrace_load32(regs[r1]); break; case DIF_OP_RLDUB: ! if (!dtrace_canload(regs[r1], 1, mstate, vstate)) break; /*FALLTHROUGH*/ case DIF_OP_LDUB: regs[rd] = dtrace_load8(regs[r1]); break; case DIF_OP_RLDUH: ! if (!dtrace_canload(regs[r1], 2, mstate, vstate)) break; /*FALLTHROUGH*/ case DIF_OP_LDUH: regs[rd] = dtrace_load16(regs[r1]); break; case DIF_OP_RLDUW: ! if (!dtrace_canload(regs[r1], 4, mstate, vstate)) break; /*FALLTHROUGH*/ case DIF_OP_LDUW: regs[rd] = dtrace_load32(regs[r1]); break; case DIF_OP_RLDX: ! if (!dtrace_canload(regs[r1], 8, mstate, vstate)) break; /*FALLTHROUGH*/ case DIF_OP_LDX: regs[rd] = dtrace_load64(regs[r1]); break; case DIF_OP_ULDSB:
*** 5940,5949 **** --- 6059,6070 ---- uint64_t val; #endif mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE; mstate.dtms_access = DTRACE_ACCESS_ARGS | DTRACE_ACCESS_PROC; + mstate.dtms_getf = NULL; + *flags &= ~CPU_DTRACE_ERROR; if (prov == dtrace_provider) { /* * If dtrace itself is the provider of this probe,
*** 8446,8455 **** --- 8567,8590 ---- if (subr == DIF_SUBR_COPYOUT || subr == DIF_SUBR_COPYOUTSTR) { dp->dtdo_destructive = 1; } + + if (subr == DIF_SUBR_GETF) { + /* + * If we have a getf() we need to record that + * in our state. Note that our state can be + * NULL if this is a helper -- but in that + * case, the call to getf() is itself illegal, + * and will be caught (slightly later) when + * the helper is validated. + */ + if (vstate->dtvs_state != NULL) + vstate->dtvs_state->dts_getf++; + } + break; case DIF_OP_PUSHTR: if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF) err += efunc(pc, "invalid ref type %u\n", type); if (r2 >= nregs)
*** 13088,13097 **** --- 13223,13248 ---- state->dts_alive = state->dts_laststatus = dtrace_gethrtime(); state->dts_deadman = cyclic_add(&hdlr, &when); state->dts_activity = DTRACE_ACTIVITY_WARMUP; + if (state->dts_getf != 0 && + !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) { + /* + * We don't have kernel privs but we have at least one call + * to getf(); we need to bump our zone's count, and (if + * this is the first enabling to have an unprivileged call + * to getf()) we need to hook into closef(). + */ + state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf++; + + if (dtrace_getf++ == 0) { + ASSERT(dtrace_closef == NULL); + dtrace_closef = dtrace_getf_barrier; + } + } + /* * Now it's time to actually fire the BEGIN probe. We need to disable * interrupts here both to record the CPU on which we fired the BEGIN * probe (the data from this CPU will be processed first at user * level) and to manually activate the buffer for this CPU.
*** 13204,13213 **** --- 13355,13382 ---- dtrace_interrupt_enable(cookie); state->dts_activity = DTRACE_ACTIVITY_STOPPED; dtrace_sync(); + if (state->dts_getf != 0 && + !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) { + /* + * We don't have kernel privs but we have at least one call + * to getf(); we need to lower our zone's count, and (if + * this is the last enabling to have an unprivileged call + * to getf()) we need to clear the closef() hook. + */ + ASSERT(state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf > 0); + ASSERT(dtrace_closef == dtrace_getf_barrier); + ASSERT(dtrace_getf > 0); + + state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf--; + + if (--dtrace_getf == 0) + dtrace_closef = NULL; + } + return (0); } static int dtrace_state_option(dtrace_state_t *state, dtrace_optid_t option,
*** 14764,14773 **** --- 14933,14959 ---- dtrace_toxrange[dtrace_toxranges].dtt_base = base; dtrace_toxrange[dtrace_toxranges].dtt_limit = limit; dtrace_toxranges++; } + static void + dtrace_getf_barrier() + { + /* + * When we have unprivileged (that is, non-DTRACE_CRV_KERNEL) enablings + * that contain calls to getf(), this routine will be called on every + * closef() before either the underlying vnode is released or the + * file_t itself is freed. By the time we are here, it is essential + * that the file_t can no longer be accessed from a call to getf() + * in probe context -- that assures that a dtrace_sync() can be used + * to clear out any enablings referring to the old structures. + */ + if (curthread->t_procp->p_zone->zone_dtrace_getf != 0 || + kcred->cr_zone->zone_dtrace_getf != 0) + dtrace_sync(); + } + /* * DTrace Driver Cookbook Functions */ /*ARGSUSED*/ static int
*** 15920,15929 **** --- 16106,16118 ---- dtrace_debugger_init = NULL; dtrace_debugger_fini = NULL; dtrace_modload = NULL; dtrace_modunload = NULL; + ASSERT(dtrace_getf == 0); + ASSERT(dtrace_closef == NULL); + mutex_exit(&cpu_lock); if (dtrace_helptrace_enabled) { kmem_free(dtrace_helptrace_buffer, dtrace_helptrace_bufsize); dtrace_helptrace_buffer = NULL;