1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 /*
27 * Copyright 2018 Nexenta Systems, Inc.
28 */
29
30 /*
31 * miscellaneous routines for the devfs
32 */
33
34 #include <sys/types.h>
35 #include <sys/param.h>
36 #include <sys/t_lock.h>
37 #include <sys/systm.h>
38 #include <sys/sysmacros.h>
39 #include <sys/user.h>
40 #include <sys/time.h>
41 #include <sys/vfs.h>
42 #include <sys/vnode.h>
43 #include <sys/file.h>
44 #include <sys/fcntl.h>
45 #include <sys/flock.h>
46 #include <sys/kmem.h>
47 #include <sys/uio.h>
48 #include <sys/errno.h>
49 #include <sys/stat.h>
50 #include <sys/cred.h>
51 #include <sys/dirent.h>
52 #include <sys/pathname.h>
53 #include <sys/cmn_err.h>
54 #include <sys/debug.h>
55 #include <sys/modctl.h>
56 #include <fs/fs_subr.h>
57 #include <sys/fs/dv_node.h>
58 #include <sys/fs/snode.h>
59 #include <sys/sunndi.h>
60 #include <sys/sunmdi.h>
61 #include <sys/conf.h>
62
#ifdef DEBUG
/* Debug trace mask; presumably consulted by the dcmn_err* macros — see dv_node.h */
int devfs_debug = 0x0;
#endif

const char dvnm[] = "devfs";	/* filesystem name used in cmn_err() messages */
kmem_cache_t *dv_node_cache;	/* dv_node cache */

/*
 * The devfs_clean_key is taken during a devfs_clean operation: it is used to
 * prevent unnecessary code execution and for detection of potential deadlocks.
 */
uint_t devfs_clean_key;

struct dv_node *dvroot;		/* root of the dv_node tree; set by dv_mkroot() */
77
/*
 * Prototype memory vattrs: attribute templates used when a dv_node has no
 * shadow (persistent) attribute store.  Only the fields named in va_mask
 * carry meaning here; the remaining fields are zero and are filled in
 * later (see dv_vattr_merge() and devfs_get_defattr()).
 */

/* Defaults for directory (VDIR) dv_nodes */
vattr_t dv_vattr_dir = {
	AT_TYPE|AT_MODE|AT_UID|AT_GID, 		/* va_mask */
	VDIR, 					/* va_type */
	DV_DIRMODE_DEFAULT, 			/* va_mode */
	DV_UID_DEFAULT, 			/* va_uid */
	DV_GID_DEFAULT, 			/* va_gid */
	0, 					/* va_fsid; */
	0, 					/* va_nodeid; */
	0, 					/* va_nlink; */
	0, 					/* va_size; */
	0, 					/* va_atime; */
	0, 					/* va_mtime; */
	0, 					/* va_ctime; */
	0, 					/* va_rdev; */
	0, 					/* va_blksize; */
	0, 					/* va_nblocks; */
	0, 					/* va_seq; */
};

/* Defaults for minor (VCHR/VBLK) dv_nodes; va_type is set at use */
vattr_t dv_vattr_file = {
	AT_TYPE|AT_MODE|AT_SIZE|AT_UID|AT_GID|AT_RDEV, 	/* va_mask */
	0, 					/* va_type */
	DV_DEVMODE_DEFAULT, 			/* va_mode */
	DV_UID_DEFAULT, 			/* va_uid */
	DV_GID_DEFAULT, 			/* va_gid */
	0, 					/* va_fsid; */
	0, 					/* va_nodeid; */
	0, 					/* va_nlink; */
	0, 					/* va_size; */
	0, 					/* va_atime; */
	0, 					/* va_mtime; */
	0, 					/* va_ctime; */
	0, 					/* va_rdev; */
	0, 					/* va_blksize; */
	0, 					/* va_nblocks; */
	0, 					/* va_seq; */
};

/* Defaults for minors with DM_NO_FSPERM (see devfs_get_defattr()) */
vattr_t dv_vattr_priv = {
	AT_TYPE|AT_MODE|AT_SIZE|AT_UID|AT_GID|AT_RDEV, 	/* va_mask */
	0, 					/* va_type */
	DV_DEVMODE_PRIV, 			/* va_mode */
	DV_UID_DEFAULT, 			/* va_uid */
	DV_GID_DEFAULT, 			/* va_gid */
	0, 					/* va_fsid; */
	0, 					/* va_nodeid; */
	0, 					/* va_nlink; */
	0, 					/* va_size; */
	0, 					/* va_atime; */
	0, 					/* va_mtime; */
	0, 					/* va_ctime; */
	0, 					/* va_rdev; */
	0, 					/* va_blksize; */
	0, 					/* va_nblocks; */
	0, 					/* va_seq; */
};
135
136 extern dev_info_t *clone_dip;
137 extern major_t clone_major;
138 extern struct dev_ops *ddi_hold_driver(major_t);
139
140 /* dv_node node constructor for kmem cache */
141 static int
142 i_dv_node_ctor(void *buf, void *cfarg, int flag)
143 {
144 _NOTE(ARGUNUSED(cfarg, flag))
145 struct dv_node *dv = (struct dv_node *)buf;
146 struct vnode *vp;
147
148 bzero(buf, sizeof (struct dv_node));
149 vp = dv->dv_vnode = vn_alloc(flag);
150 if (vp == NULL) {
151 return (-1);
152 }
153 vp->v_data = dv;
154 rw_init(&dv->dv_contents, NULL, RW_DEFAULT, NULL);
155 return (0);
156 }
157
/* dv_node node destructor for kmem cache */
static void
i_dv_node_dtor(void *buf, void *arg)
{
	_NOTE(ARGUNUSED(arg))
	struct dv_node *dv = (struct dv_node *)buf;
	struct vnode *vp = DVTOV(dv);

	/* Undo i_dv_node_ctor(): release the rwlock and the embedded vnode. */
	rw_destroy(&dv->dv_contents);
	vn_invalid(vp);
	vn_free(vp);
}
170
171
/* initialize dv_node node cache */
void
dv_node_cache_init()
{
	ASSERT(dv_node_cache == NULL);
	dv_node_cache = kmem_cache_create("dv_node_cache",
	    sizeof (struct dv_node), 0, i_dv_node_ctor, i_dv_node_dtor,
	    NULL, NULL, NULL, 0);

	/*
	 * Per-thread key that marks threads running devfs_clean(); lock
	 * paths (e.g. dv_find()) test it to avoid deadlock — see the
	 * devfs_clean_key comment at the top of this file.
	 */
	tsd_create(&devfs_clean_key, NULL);
}
183
/* destroy dv_node node cache */
void
dv_node_cache_fini()
{
	ASSERT(dv_node_cache != NULL);
	kmem_cache_destroy(dv_node_cache);
	dv_node_cache = NULL;

	/* Tear down the per-thread devfs_clean marker key. */
	tsd_destroy(&devfs_clean_key);
}
194
195 /*
196 * dv_mkino - Generate a unique inode number for devfs nodes.
197 *
198 * Although ino_t is 64 bits, the inode number is truncated to 32 bits for 32
199 * bit non-LARGEFILE applications. This means that there is a requirement to
200 * maintain the inode number as a 32 bit value or applications will have
 * stat(2) calls fail with EOVERFLOW. We form a 32 bit inode number from the
 * dev_t; minor numbers larger than L_MAXMIN32 are truncated today (see the
 * note below on folding the extra minor bits for 64-bit dev_t support).
203 *
204 * To generate inode numbers for directories, we assume that we will never use
205 * more than half the major space - this allows for ~8190 drivers. We use this
206 * upper major number space to allocate inode numbers for directories by
207 * encoding the major and instance into this space.
208 *
209 * We also skew the result so that inode 2 is reserved for the root of the file
210 * system.
211 *
212 * As part of the future support for 64-bit dev_t APIs, the upper minor bits
213 * should be folded into the high inode bits by adding the following code
214 * after "ino |= 1":
215 *
216 * #if (L_BITSMINOR32 != L_BITSMINOR)
217 * |* fold overflow minor bits into high bits of inode number *|
218 * ino |= ((ino_t)(minor >> L_BITSMINOR32)) << L_BITSMINOR;
219 * #endif |* (L_BITSMINOR32 != L_BITSMINOR) *|
220 *
221 * This way only applications that use devices that overflow their minor
222 * space will have an application level impact.
223 */
static ino_t
dv_mkino(dev_info_t *devi, vtype_t typ, dev_t dev)
{
	major_t major;
	minor_t minor;
	ino_t ino;
	static int warn;	/* ensures the diagnostic below fires only once */

	if (typ == VDIR) {
		/*
		 * Directory: encode (driver major, instance) in the upper
		 * half of the 32-bit major number space, per the block
		 * comment above.
		 */
		major = ((L_MAXMAJ32 + 1) >> 1) + DEVI(devi)->devi_major;
		minor = ddi_get_instance(devi);

		/* makedevice32 in high half of major number space */
		ino = (ino_t)((major << L_BITSMINOR32) | (minor & L_MAXMIN32));

		/* restore the real major for the uniqueness check below */
		major = DEVI(devi)->devi_major;
	} else {
		major = getmajor(dev);
		minor = getminor(dev);

		/* makedevice32 */
		ino = (ino_t)((major << L_BITSMINOR32) | (minor & L_MAXMIN32));

		/* make ino for VCHR different than VBLK */
		ino <<= 1;
		if (typ == VCHR)
			ino |= 1;
	}

	ino += DV_ROOTINO + 1;		/* skew: keep inode 2 for the root */

	/*
	 * diagnose things a little early because adding the skew to a large
	 * minor number could roll over the major.
	 */
	if ((major >= (L_MAXMAJ32 >> 1)) && (warn == 0)) {
		warn = 1;
		cmn_err(CE_WARN, "%s: inode numbers are not unique", dvnm);
	}

	return (ino);
}
266
267 /*
 * Compare two nodes lexicographically to balance avl tree
269 */
270 static int
271 dv_compare_nodes(const struct dv_node *dv1, const struct dv_node *dv2)
272 {
273 int rv;
274
275 if ((rv = strcmp(dv1->dv_name, dv2->dv_name)) == 0)
276 return (0);
277 return ((rv < 0) ? -1 : 1);
278 }
279
280 /*
281 * dv_mkroot
282 *
283 * Build the first VDIR dv_node.
284 */
struct dv_node *
dv_mkroot(struct vfs *vfsp, dev_t devfsdev)
{
	struct dv_node *dv;
	struct vnode *vp;

	ASSERT(ddi_root_node() != NULL);
	ASSERT(dv_node_cache != NULL);

	dcmn_err3(("dv_mkroot\n"));
	dv = kmem_cache_alloc(dv_node_cache, KM_SLEEP);
	vp = DVTOV(dv);
	vn_reinit(vp);
	vp->v_flag = VROOT;		/* this is the filesystem root vnode */
	vp->v_vfsp = vfsp;
	vp->v_type = VDIR;
	vp->v_rdev = devfsdev;
	vn_setops(vp, dv_vnodeops);
	vn_exists(vp);

	/* Publish the root node for the rest of devfs. */
	dvroot = dv;

	dv->dv_name = NULL;		/* not needed */
	dv->dv_namelen = 0;

	/* The root dv_node corresponds to the root of the devinfo tree. */
	dv->dv_devi = ddi_root_node();

	dv->dv_ino = DV_ROOTINO;
	dv->dv_nlink = 2;		/* name + . (no dv_insert) */
	dv->dv_dotdot = dv;		/* .. == self */
	dv->dv_attrvp = NULLVP;
	dv->dv_attr = NULL;
	dv->dv_flags = DV_BUILD;	/* contents need building; see dv_find() */
	dv->dv_priv = NULL;
	dv->dv_busy = 0;
	dv->dv_dflt_mode = 0;

	/* Directory entries are kept in an AVL tree sorted by name. */
	avl_create(&dv->dv_entries,
	    (int (*)(const void *, const void *))dv_compare_nodes,
	    sizeof (struct dv_node), offsetof(struct dv_node, dv_avllink));

	return (dv);
}
328
329 /*
330 * dv_mkdir
331 *
 * Given a probed or attached nexus node, create a VDIR dv_node.
333 * No dv_attrvp is created at this point.
334 */
struct dv_node *
dv_mkdir(struct dv_node *ddv, dev_info_t *devi, char *nm)
{
	struct dv_node *dv;
	struct vnode *vp;
	size_t nmlen;

	ASSERT((devi));
	dcmn_err4(("dv_mkdir: %s\n", nm));

	dv = kmem_cache_alloc(dv_node_cache, KM_SLEEP);
	/* Copy the name, including the terminating '\0'. */
	nmlen = strlen(nm) + 1;
	dv->dv_name = kmem_alloc(nmlen, KM_SLEEP);
	bcopy(nm, dv->dv_name, nmlen);
	dv->dv_namelen = nmlen - 1;	/* '\0' not included */

	/* Embedded vnode inherits vfs, rdev and ops from the parent. */
	vp = DVTOV(dv);
	vn_reinit(vp);
	vp->v_flag = 0;
	vp->v_vfsp = DVTOV(ddv)->v_vfsp;
	vp->v_type = VDIR;
	vp->v_rdev = DVTOV(ddv)->v_rdev;
	vn_setops(vp, vn_getops(DVTOV(ddv)));
	vn_exists(vp);

	/* Hold the devinfo node for this dv_node (released in dv_destroy). */
	dv->dv_devi = devi;
	ndi_hold_devi(devi);

	dv->dv_ino = dv_mkino(devi, VDIR, NODEV);
	dv->dv_nlink = 0;		/* updated on insert */
	dv->dv_dotdot = ddv;
	dv->dv_attrvp = NULLVP;
	dv->dv_attr = NULL;
	dv->dv_flags = DV_BUILD;	/* contents need building; see dv_find() */
	dv->dv_priv = NULL;
	dv->dv_busy = 0;
	dv->dv_dflt_mode = 0;

	/* Directory entries are kept in an AVL tree sorted by name. */
	avl_create(&dv->dv_entries,
	    (int (*)(const void *, const void *))dv_compare_nodes,
	    sizeof (struct dv_node), offsetof(struct dv_node, dv_avllink));

	return (dv);
}
379
380 /*
381 * dv_mknod
382 *
383 * Given a minor node, create a VCHR or VBLK dv_node.
384 * No dv_attrvp is created at this point.
385 */
static struct dv_node *
dv_mknod(struct dv_node *ddv, dev_info_t *devi, char *nm,
    struct ddi_minor_data *dmd)
{
	struct dv_node *dv;
	struct vnode *vp;
	size_t nmlen;

	dcmn_err4(("dv_mknod: %s\n", nm));

	dv = kmem_cache_alloc(dv_node_cache, KM_SLEEP);
	/* Copy the name, including the terminating '\0'. */
	nmlen = strlen(nm) + 1;
	dv->dv_name = kmem_alloc(nmlen, KM_SLEEP);
	bcopy(nm, dv->dv_name, nmlen);
	dv->dv_namelen = nmlen - 1;	/* no '\0' */

	/* Vnode type and dev_t come from the minor node data. */
	vp = DVTOV(dv);
	vn_reinit(vp);
	vp->v_flag = 0;
	vp->v_vfsp = DVTOV(ddv)->v_vfsp;
	vp->v_type = dmd->ddm_spec_type == S_IFCHR ? VCHR : VBLK;
	vp->v_rdev = dmd->ddm_dev;
	vn_setops(vp, vn_getops(DVTOV(ddv)));
	vn_exists(vp);

	/* Caller must hold the devinfo busy; take a hold for this node. */
	ASSERT(DEVI_BUSY_OWNED(devi));
	ndi_hold_devi(devi);

	dv->dv_devi = devi;
	dv->dv_ino = dv_mkino(devi, vp->v_type, vp->v_rdev);
	dv->dv_nlink = 0;		/* updated on insert */
	dv->dv_dotdot = ddv;
	dv->dv_attrvp = NULLVP;
	dv->dv_attr = NULL;
	dv->dv_flags = 0;

	if (dmd->type == DDM_INTERNAL_PATH)
		dv->dv_flags |= DV_INTERNAL;	/* hidden from user lookups */
	if (dmd->ddm_flags & DM_NO_FSPERM)
		dv->dv_flags |= DV_NO_FSPERM;

	/* Take a hold on any node privilege data (released in dv_destroy). */
	dv->dv_priv = dmd->ddm_node_priv;
	if (dv->dv_priv)
		dphold(dv->dv_priv);

	/*
	 * Minors created with ddi_create_priv_minor_node can specify
	 * a default mode permission other than the devfs default.
	 */
	if (dv->dv_priv || dv->dv_flags & DV_NO_FSPERM) {
		dcmn_err5(("%s: dv_mknod default priv mode 0%o\n",
		    dv->dv_name, dmd->ddm_priv_mode));
		dv->dv_flags |= DV_DFLT_MODE;
		dv->dv_dflt_mode = dmd->ddm_priv_mode & S_IAMB;
	}

	return (dv);
}
444
445 /*
446 * dv_destroy
447 *
448 * Destroy what we created in dv_mkdir or dv_mknod.
449 * In the case of a *referenced* directory, do nothing.
450 */
void
dv_destroy(struct dv_node *dv, uint_t flags)
{
	vnode_t *vp = DVTOV(dv);
	ASSERT(dv->dv_nlink == 0);		/* no references */

	dcmn_err4(("dv_destroy: %s\n", dv->dv_name));

	/*
	 * We may be asked to unlink referenced directories.
	 * In this case, there is nothing to be done.
	 * The eventual memory free will be done in
	 * devfs_inactive.
	 */
	if (vp->v_count != 0) {
		ASSERT(vp->v_type == VDIR);
		ASSERT(flags & DV_CLEAN_FORCE);
		ASSERT(DV_STALE(dv));
		return;
	}

	/* A directory must already be empty; release its AVL tree. */
	if (vp->v_type == VDIR) {
		ASSERT(DV_FIRST_ENTRY(dv) == NULL);
		avl_destroy(&dv->dv_entries);
	}

	/* Drop the hold on the shadow attribute vnode, if one was set up. */
	if (dv->dv_attrvp != NULLVP)
		VN_RELE(dv->dv_attrvp);
	/* Free cached in-memory attributes, if any. */
	if (dv->dv_attr != NULL)
		kmem_free(dv->dv_attr, sizeof (struct vattr));
	if (dv->dv_name != NULL)
		kmem_free(dv->dv_name, dv->dv_namelen + 1);
	/* Release the devinfo hold taken in dv_mkdir()/dv_mknod(). */
	if (dv->dv_devi != NULL) {
		ndi_rele_devi(dv->dv_devi);
	}
	/* Release node privilege data held in dv_mknod(). */
	if (dv->dv_priv != NULL) {
		dpfree(dv->dv_priv);
	}

	kmem_cache_free(dv_node_cache, dv);
}
492
493 /*
494 * Find and hold dv_node by name
495 */
496 static struct dv_node *
497 dv_findbyname(struct dv_node *ddv, char *nm)
498 {
499 struct dv_node *dv;
500 avl_index_t where;
501 struct dv_node dvtmp;
502
503 ASSERT(RW_LOCK_HELD(&ddv->dv_contents));
504 dcmn_err3(("dv_findbyname: %s\n", nm));
505
506 dvtmp.dv_name = nm;
507 dv = avl_find(&ddv->dv_entries, &dvtmp, &where);
508 if (dv) {
509 ASSERT(dv->dv_dotdot == ddv);
510 ASSERT(strcmp(dv->dv_name, nm) == 0);
511 VN_HOLD(DVTOV(dv));
512 return (dv);
513 }
514 return (NULL);
515 }
516
517 /*
518 * Inserts a new dv_node in a parent directory
519 */
520 void
521 dv_insert(struct dv_node *ddv, struct dv_node *dv)
522 {
523 avl_index_t where;
524
525 ASSERT(RW_WRITE_HELD(&ddv->dv_contents));
526 ASSERT(DVTOV(ddv)->v_type == VDIR);
527 ASSERT(ddv->dv_nlink >= 2);
528 ASSERT(dv->dv_nlink == 0);
529
530 dcmn_err3(("dv_insert: %s\n", dv->dv_name));
531
532 dv->dv_dotdot = ddv;
533 if (DVTOV(dv)->v_type == VDIR) {
534 ddv->dv_nlink++; /* .. to containing directory */
535 dv->dv_nlink = 2; /* name + . */
536 } else {
537 dv->dv_nlink = 1; /* name */
538 }
539
540 /* enter node in the avl tree */
541 VERIFY(avl_find(&ddv->dv_entries, dv, &where) == NULL);
542 avl_insert(&ddv->dv_entries, dv, where);
543 }
544
545 /*
 * Unlink a dv_node from a parent directory
547 */
548 void
549 dv_unlink(struct dv_node *ddv, struct dv_node *dv)
550 {
551 /* verify linkage of arguments */
552 ASSERT(ddv && dv);
553 ASSERT(dv->dv_dotdot == ddv);
554 ASSERT(RW_WRITE_HELD(&ddv->dv_contents));
555 ASSERT(DVTOV(ddv)->v_type == VDIR);
556
557 dcmn_err3(("dv_unlink: %s\n", dv->dv_name));
558
559 if (DVTOV(dv)->v_type == VDIR) {
560 ddv->dv_nlink--; /* .. to containing directory */
561 dv->dv_nlink -= 2; /* name + . */
562 } else {
563 dv->dv_nlink -= 1; /* name */
564 }
565 ASSERT(ddv->dv_nlink >= 2);
566 ASSERT(dv->dv_nlink == 0);
567
568 dv->dv_dotdot = NULL;
569
570 /* remove from avl tree */
571 avl_remove(&ddv->dv_entries, dv);
572 }
573
574 /*
575 * Merge devfs node specific information into an attribute structure.
576 *
577 * NOTE: specfs provides ATIME,MTIME,CTIME,SIZE,BLKSIZE,NBLOCKS on leaf node.
578 */
579 void
580 dv_vattr_merge(struct dv_node *dv, struct vattr *vap)
581 {
582 struct vnode *vp = DVTOV(dv);
583
584 vap->va_nodeid = dv->dv_ino;
585 vap->va_nlink = dv->dv_nlink;
586
587 if (vp->v_type == VDIR) {
588 vap->va_rdev = 0;
589 vap->va_fsid = vp->v_rdev;
590 } else {
591 vap->va_rdev = vp->v_rdev;
592 vap->va_fsid = DVTOV(dv->dv_dotdot)->v_rdev;
593 vap->va_type = vp->v_type;
594 /* don't trust the shadow file type */
595 vap->va_mode &= ~S_IFMT;
596 if (vap->va_type == VCHR)
597 vap->va_mode |= S_IFCHR;
598 else
599 vap->va_mode |= S_IFBLK;
600 }
601 }
602
603 /*
604 * Get default device permission by consulting rules in
605 * privilege specification in minor node and /etc/minor_perm.
606 *
607 * This function is called from the devname filesystem to get default
608 * permissions for a device exported to a non-global zone.
609 */
610 void
611 devfs_get_defattr(struct vnode *vp, struct vattr *vap, int *no_fs_perm)
612 {
613 mperm_t mp;
614 struct dv_node *dv;
615
616 /* If vp isn't a dv_node, return something sensible */
617 if (!vn_matchops(vp, dv_vnodeops)) {
618 if (no_fs_perm)
619 *no_fs_perm = 0;
620 *vap = dv_vattr_file;
621 return;
622 }
623
624 /*
625 * For minors not created by ddi_create_priv_minor_node(),
626 * use devfs defaults.
627 */
628 dv = VTODV(vp);
629 if (vp->v_type == VDIR) {
630 *vap = dv_vattr_dir;
631 } else if (dv->dv_flags & DV_NO_FSPERM) {
632 if (no_fs_perm)
633 *no_fs_perm = 1;
634 *vap = dv_vattr_priv;
635 } else {
636 /*
637 * look up perm bits from minor_perm
638 */
639 *vap = dv_vattr_file;
640 if (dev_minorperm(dv->dv_devi, dv->dv_name, &mp) == 0) {
641 VATTR_MP_MERGE((*vap), mp);
642 dcmn_err5(("%s: minor perm mode 0%o\n",
643 dv->dv_name, vap->va_mode));
644 } else if (dv->dv_flags & DV_DFLT_MODE) {
645 ASSERT((dv->dv_dflt_mode & ~S_IAMB) == 0);
646 vap->va_mode &= ~S_IAMB;
647 vap->va_mode |= dv->dv_dflt_mode;
648 dcmn_err5(("%s: priv mode 0%o\n",
649 dv->dv_name, vap->va_mode));
650 }
651 }
652 }
653
654 /*
655 * dv_shadow_node
656 *
657 * Given a VDIR dv_node, find/create the associated VDIR
658 * node in the shadow attribute filesystem.
659 *
660 * Given a VCHR/VBLK dv_node, find the associated VREG
661 * node in the shadow attribute filesystem. These nodes
662 * are only created to persist non-default attributes.
663 * Lack of such a node implies the default permissions
664 * are sufficient.
665 *
666 * Managing the attribute file entries is slightly tricky (mostly
667 * because we can't intercept VN_HOLD and VN_RELE except on the last
668 * release).
669 *
670 * We assert that if the dv_attrvp pointer is non-NULL, it points
671 * to a singly-held (by us) vnode that represents the shadow entry
672 * in the underlying filesystem. To avoid store-ordering issues,
673 * we assert that the pointer can only be tested under the dv_contents
674 * READERS lock.
675 */
676
677 void
678 dv_shadow_node(
679 struct vnode *dvp, /* devfs parent directory vnode */
680 char *nm, /* name component */
681 struct vnode *vp, /* devfs vnode */
682 struct pathname *pnp, /* the path .. */
683 struct vnode *rdir, /* the root .. */
684 struct cred *cred, /* who's asking? */
685 int flags) /* optionally create shadow node */
686 {
687 struct dv_node *dv; /* dv_node of named directory */
688 struct vnode *rdvp; /* shadow parent directory vnode */
689 struct vnode *rvp; /* shadow vnode */
690 struct vnode *rrvp; /* realvp of shadow vnode */
691 struct vattr vattr;
692 int create_tried;
693 int error;
694
695 ASSERT(vp->v_type == VDIR || vp->v_type == VCHR || vp->v_type == VBLK);
696 dv = VTODV(vp);
697 dcmn_err3(("dv_shadow_node: name %s attr %p\n",
698 nm, (void *)dv->dv_attrvp));
699
700 if ((flags & DV_SHADOW_WRITE_HELD) == 0) {
701 ASSERT(RW_READ_HELD(&dv->dv_contents));
702 if (dv->dv_attrvp != NULLVP)
703 return;
704 if (!rw_tryupgrade(&dv->dv_contents)) {
705 rw_exit(&dv->dv_contents);
706 rw_enter(&dv->dv_contents, RW_WRITER);
707 if (dv->dv_attrvp != NULLVP) {
708 rw_downgrade(&dv->dv_contents);
709 return;
710 }
711 }
712 } else {
713 ASSERT(RW_WRITE_HELD(&dv->dv_contents));
714 if (dv->dv_attrvp != NULLVP)
715 return;
716 }
717
718 ASSERT(RW_WRITE_HELD(&dv->dv_contents) && dv->dv_attrvp == NULL);
719
720 rdvp = VTODV(dvp)->dv_attrvp;
721 create_tried = 0;
722 lookup:
723 if (rdvp && (dv->dv_flags & DV_NO_FSPERM) == 0) {
724 error = VOP_LOOKUP(rdvp, nm, &rvp, pnp, LOOKUP_DIR, rdir, cred,
725 NULL, NULL, NULL);
726
727 /* factor out the snode since we only want the attribute node */
728 if ((error == 0) && (VOP_REALVP(rvp, &rrvp, NULL) == 0)) {
729 VN_HOLD(rrvp);
730 VN_RELE(rvp);
731 rvp = rrvp;
732 }
733 } else
734 error = EROFS; /* no parent, no entry */
735
736 /*
737 * All we want is the permissions (and maybe ACLs and
738 * extended attributes), and we want to perform lookups
739 * by name. Drivers occasionally change their minor
740 * number space. If something changes, there's no
741 * much we can do about it here.
742 */
743
744 /* The shadow node checks out. We are done */
745 if (error == 0) {
746 dv->dv_attrvp = rvp; /* with one hold */
747
748 /*
749 * Determine if we have non-trivial ACLs on this node.
750 * It is not necessary to VOP_RWLOCK since fs_acl_nontrivial
751 * only does VOP_GETSECATTR.
752 */
753 dv->dv_flags &= ~DV_ACL;
754
755 if (fs_acl_nontrivial(rvp, cred))
756 dv->dv_flags |= DV_ACL;
757
758 /*
759 * If we have synced out the memory attributes, free
760 * them and switch back to using the persistent store.
761 */
762 if (rvp && dv->dv_attr) {
763 kmem_free(dv->dv_attr, sizeof (struct vattr));
764 dv->dv_attr = NULL;
765 }
766 if ((flags & DV_SHADOW_WRITE_HELD) == 0)
767 rw_downgrade(&dv->dv_contents);
768 ASSERT(RW_LOCK_HELD(&dv->dv_contents));
769 return;
770 }
771
772 /*
773 * Failed to find attribute in persistent backing store,
774 * get default permission bits.
775 */
776 devfs_get_defattr(vp, &vattr, NULL);
777
778 dv_vattr_merge(dv, &vattr);
779 gethrestime(&vattr.va_atime);
780 vattr.va_mtime = vattr.va_atime;
781 vattr.va_ctime = vattr.va_atime;
782
783 /*
784 * Try to create shadow dir. This is necessary in case
785 * we need to create a shadow leaf node later, when user
786 * executes chmod.
787 */
788 if ((error == ENOENT) && !create_tried) {
789 switch (vp->v_type) {
790 case VDIR:
791 error = VOP_MKDIR(rdvp, nm, &vattr, &rvp, kcred,
792 NULL, 0, NULL);
793 dsysdebug(error, ("vop_mkdir %s %s %d\n",
794 VTODV(dvp)->dv_name, nm, error));
795 create_tried = 1;
796 break;
797
798 case VCHR:
799 case VBLK:
800 /*
801 * Shadow nodes are only created on demand
802 */
803 if (flags & DV_SHADOW_CREATE) {
804 error = VOP_CREATE(rdvp, nm, &vattr, NONEXCL,
805 VREAD|VWRITE, &rvp, kcred, 0, NULL, NULL);
806 dsysdebug(error, ("vop_create %s %s %d\n",
807 VTODV(dvp)->dv_name, nm, error));
808 create_tried = 1;
809 }
810 break;
811
812 default:
813 cmn_err(CE_PANIC, "devfs: %s: create", dvnm);
814 /*NOTREACHED*/
815 }
816
817 if (create_tried &&
818 (error == 0) || (error == EEXIST)) {
819 VN_RELE(rvp);
820 goto lookup;
821 }
822 }
823
824 /* Store attribute in memory */
825 if (dv->dv_attr == NULL) {
826 dv->dv_attr = kmem_alloc(sizeof (struct vattr), KM_SLEEP);
827 *(dv->dv_attr) = vattr;
828 }
829
830 if ((flags & DV_SHADOW_WRITE_HELD) == 0)
831 rw_downgrade(&dv->dv_contents);
832 ASSERT(RW_LOCK_HELD(&dv->dv_contents));
833 }
834
835 /*
836 * Given a devinfo node, and a name, returns the appropriate
837 * minor information for that named node, if it exists.
838 */
839 static int
840 dv_find_leafnode(dev_info_t *devi, char *minor_nm, struct ddi_minor_data *r_mi)
841 {
842 struct ddi_minor_data *dmd;
843
844 ASSERT(i_ddi_devi_attached(devi));
845
846 dcmn_err3(("dv_find_leafnode: %s\n", minor_nm));
847 ASSERT(DEVI_BUSY_OWNED(devi));
848 for (dmd = DEVI(devi)->devi_minor; dmd; dmd = dmd->next) {
849
850 /*
851 * Skip alias nodes and nodes without a name.
852 */
853 if ((dmd->type == DDM_ALIAS) || (dmd->ddm_name == NULL))
854 continue;
855
856 dcmn_err4(("dv_find_leafnode: (%s,%s)\n",
857 minor_nm, dmd->ddm_name));
858 if (strcmp(minor_nm, dmd->ddm_name) == 0) {
859 r_mi->ddm_dev = dmd->ddm_dev;
860 r_mi->ddm_spec_type = dmd->ddm_spec_type;
861 r_mi->type = dmd->type;
862 r_mi->ddm_flags = dmd->ddm_flags;
863 r_mi->ddm_node_priv = dmd->ddm_node_priv;
864 r_mi->ddm_priv_mode = dmd->ddm_priv_mode;
865 if (r_mi->ddm_node_priv)
866 dphold(r_mi->ddm_node_priv);
867 return (0);
868 }
869 }
870
871 dcmn_err3(("dv_find_leafnode: %s: ENOENT\n", minor_nm));
872 return (ENOENT);
873 }
874
875 /*
876 * Special handling for clone node:
877 * Clone minor name is a driver name, the minor number will
878 * be the major number of the driver. There is no minor
879 * node under the clone driver, so we'll manufacture the
880 * dev_t.
881 */
static struct dv_node *
dv_clone_mknod(struct dv_node *ddv, char *drvname)
{
	major_t major;
	struct dv_node *dvp;
	char *devnm;
	struct ddi_minor_data *dmd;

	/*
	 * Make sure drvname is a STREAMS driver. We load the driver,
	 * but don't attach to any instances. This makes stat(2)
	 * relatively cheap.
	 */
	major = ddi_name_to_major(drvname);
	if (major == DDI_MAJOR_T_NONE)
		return (NULL);

	/* Hold the driver only long enough to check for a STREAMS table. */
	if (ddi_hold_driver(major) == NULL)
		return (NULL);

	if (STREAMSTAB(major) == NULL) {
		ddi_rele_driver(major);
		return (NULL);
	}

	ddi_rele_driver(major);

	/*
	 * Manufacture the minor data: the dev_t is the clone driver's
	 * major with the target driver's major as the minor number.
	 */
	devnm = kmem_alloc(MAXNAMELEN, KM_SLEEP);
	(void) snprintf(devnm, MAXNAMELEN, "clone@0:%s", drvname);
	dmd = kmem_zalloc(sizeof (*dmd), KM_SLEEP);
	dmd->ddm_dev = makedevice(clone_major, (minor_t)major);
	dmd->ddm_spec_type = S_IFCHR;
	dvp = dv_mknod(ddv, clone_dip, devnm, dmd);
	kmem_free(dmd, sizeof (*dmd));
	kmem_free(devnm, MAXNAMELEN);
	return (dvp);
}
918
919 /*
920 * Given the parent directory node, and a name in it, returns the
921 * named dv_node to the caller (as a vnode).
922 *
923 * (We need pnp and rdir for doing shadow lookups; they can be NULL)
924 */
925 int
926 dv_find(struct dv_node *ddv, char *nm, struct vnode **vpp, struct pathname *pnp,
927 struct vnode *rdir, struct cred *cred, uint_t ndi_flags)
928 {
929 extern int isminiroot; /* see modctl.c */
930
931 int circ;
932 int rv = 0, was_busy = 0, nmlen, write_held = 0;
933 struct vnode *vp;
934 struct dv_node *dv, *dup;
935 dev_info_t *pdevi, *devi = NULL;
936 char *mnm;
937 struct ddi_minor_data *dmd;
938
939 dcmn_err3(("dv_find %s\n", nm));
940
941 if (!rw_tryenter(&ddv->dv_contents, RW_READER)) {
942 if (tsd_get(devfs_clean_key))
943 return (EBUSY);
944 rw_enter(&ddv->dv_contents, RW_READER);
945 }
946 start:
947 if (DV_STALE(ddv)) {
948 rw_exit(&ddv->dv_contents);
949 return (ESTALE);
950 }
951
952 /*
953 * Empty name or ., return node itself.
954 */
955 nmlen = strlen(nm);
956 if ((nmlen == 0) || ((nmlen == 1) && (nm[0] == '.'))) {
957 *vpp = DVTOV(ddv);
958 rw_exit(&ddv->dv_contents);
959 VN_HOLD(*vpp);
960 return (0);
961 }
962
963 /*
964 * .., return the parent directory
965 */
966 if ((nmlen == 2) && (strcmp(nm, "..") == 0)) {
967 *vpp = DVTOV(ddv->dv_dotdot);
968 rw_exit(&ddv->dv_contents);
969 VN_HOLD(*vpp);
970 return (0);
971 }
972
973 /*
974 * Fail anything without a valid device name component
975 */
976 if (nm[0] == '@' || nm[0] == ':') {
977 dcmn_err3(("devfs: no driver '%s'\n", nm));
978 rw_exit(&ddv->dv_contents);
979 return (ENOENT);
980 }
981
982 /*
983 * So, now we have to deal with the trickier stuff.
984 *
985 * (a) search the existing list of dv_nodes on this directory
986 */
987 if ((dv = dv_findbyname(ddv, nm)) != NULL) {
988 founddv:
989 ASSERT(RW_LOCK_HELD(&ddv->dv_contents));
990
991 if (!rw_tryenter(&dv->dv_contents, RW_READER)) {
992 if (tsd_get(devfs_clean_key)) {
993 VN_RELE(DVTOV(dv));
994 rw_exit(&ddv->dv_contents);
995 return (EBUSY);
996 }
997 rw_enter(&dv->dv_contents, RW_READER);
998 }
999
1000 vp = DVTOV(dv);
1001 if ((dv->dv_attrvp != NULLVP) ||
1002 (vp->v_type != VDIR && dv->dv_attr != NULL)) {
1003 /*
1004 * Common case - we already have attributes
1005 */
1006 rw_exit(&dv->dv_contents);
1007 rw_exit(&ddv->dv_contents);
1008 goto found;
1009 }
1010
1011 /*
1012 * No attribute vp, try and build one.
1013 *
1014 * dv_shadow_node() can briefly drop &dv->dv_contents lock
1015 * if it is unable to upgrade it to a write lock. If the
1016 * current thread has come in through the bottom-up device
1017 * configuration devfs_clean() path, we may deadlock against
1018 * a thread performing top-down device configuration if it
1019 * grabs the contents lock. To avoid this, when we are on the
1020 * devfs_clean() path we attempt to upgrade the dv_contents
1021 * lock before we call dv_shadow_node().
1022 */
1023 if (tsd_get(devfs_clean_key)) {
1024 if (!rw_tryupgrade(&dv->dv_contents)) {
1025 VN_RELE(DVTOV(dv));
1026 rw_exit(&dv->dv_contents);
1027 rw_exit(&ddv->dv_contents);
1028 return (EBUSY);
1029 }
1030
1031 write_held = DV_SHADOW_WRITE_HELD;
1032 }
1033
1034 dv_shadow_node(DVTOV(ddv), nm, vp, pnp, rdir, cred,
1035 write_held);
1036
1037 rw_exit(&dv->dv_contents);
1038 rw_exit(&ddv->dv_contents);
1039 goto found;
1040 }
1041
1042 /*
1043 * (b) Search the child devinfo nodes of our parent directory,
1044 * looking for the named node. If we find it, build a new
1045 * node, then grab the writers lock, search the directory
1046 * if it's still not there, then insert it.
1047 *
1048 * We drop the devfs locks before accessing the device tree.
1049 * Take care to mark the node BUSY so that a forced devfs_clean
1050 * doesn't mark the directory node stale.
1051 *
1052 * Also, check if we are called as part of devfs_clean or
1053 * reset_perm. If so, simply return not found because there
1054 * is nothing to clean.
1055 */
1056 if (tsd_get(devfs_clean_key)) {
1057 rw_exit(&ddv->dv_contents);
1058 return (ENOENT);
1059 }
1060
1061 /*
1062 * We could be either READ or WRITE locked at
1063 * this point. Upgrade if we are read locked.
1064 */
1065 ASSERT(RW_LOCK_HELD(&ddv->dv_contents));
1066 if (rw_read_locked(&ddv->dv_contents) &&
1067 !rw_tryupgrade(&ddv->dv_contents)) {
1068 rw_exit(&ddv->dv_contents);
1069 rw_enter(&ddv->dv_contents, RW_WRITER);
1070 /*
1071 * Things may have changed when we dropped
1072 * the contents lock, so start from top again
1073 */
1074 goto start;
1075 }
1076 ddv->dv_busy++; /* mark busy before dropping lock */
1077 was_busy++;
1078 rw_exit(&ddv->dv_contents);
1079
1080 pdevi = ddv->dv_devi;
1081 ASSERT(pdevi != NULL);
1082
1083 mnm = strchr(nm, ':');
1084 if (mnm)
1085 *mnm = (char)0;
1086
1087 /*
1088 * Configure one nexus child, will call nexus's bus_ops
1089 * If successful, devi is held upon returning.
1090 * Note: devfs lookup should not be configuring grandchildren.
1091 */
1092 ASSERT((ndi_flags & NDI_CONFIG) == 0);
1093
1094 rv = ndi_devi_config_one(pdevi, nm, &devi, ndi_flags | NDI_NO_EVENT);
1095 if (mnm)
1096 *mnm = ':';
1097 if (rv != NDI_SUCCESS) {
1098 rv = ENOENT;
1099 goto notfound;
1100 }
1101
1102 ASSERT(devi);
1103
1104 /* Check if this is a path alias */
1105 if (ddi_aliases_present == B_TRUE && ddi_get_parent(devi) != pdevi) {
1106 char *curr = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1107
1108 (void) ddi_pathname(devi, curr);
1109
1110 vp = NULL;
1111 if (devfs_lookupname(curr, NULL, &vp) == 0 && vp) {
1112 dv = VTODV(vp);
1113 kmem_free(curr, MAXPATHLEN);
1114 goto found;
1115 }
1116 kmem_free(curr, MAXPATHLEN);
1117 }
1118
1119 /*
1120 * If we configured a hidden node, consider it notfound.
1121 */
1122 if (ndi_dev_is_hidden_node(devi)) {
1123 ndi_rele_devi(devi);
1124 rv = ENOENT;
1125 goto notfound;
1126 }
1127
1128 /*
1129 * Don't make vhci clients visible under phci, unless we
1130 * are in miniroot.
1131 */
1132 if (isminiroot == 0 && ddi_get_parent(devi) != pdevi) {
1133 ndi_rele_devi(devi);
1134 rv = ENOENT;
1135 goto notfound;
1136 }
1137
1138 ASSERT(devi && i_ddi_devi_attached(devi));
1139
1140 /*
1141 * Invalidate cache to notice newly created minor nodes.
1142 */
1143 rw_enter(&ddv->dv_contents, RW_WRITER);
1144 ddv->dv_flags |= DV_BUILD;
1145 rw_exit(&ddv->dv_contents);
1146
1147 /*
1148 * mkdir for nexus drivers and leaf nodes as well. If we are racing
1149 * and create a duplicate, the duplicate will be destroyed below.
1150 */
1151 if (mnm == NULL) {
1152 dv = dv_mkdir(ddv, devi, nm);
1153 } else {
1154 /*
1155 * Allocate dmd first to avoid KM_SLEEP with active
1156 * ndi_devi_enter.
1157 */
1158 dmd = kmem_zalloc(sizeof (*dmd), KM_SLEEP);
1159 ndi_devi_enter(devi, &circ);
1160 if (devi == clone_dip) {
1161 /*
1162 * For clone minors, load the driver indicated by
1163 * minor name.
1164 */
1165 dv = dv_clone_mknod(ddv, mnm + 1);
1166 } else {
1167 /*
1168 * Find minor node and make a dv_node
1169 */
1170 if (dv_find_leafnode(devi, mnm + 1, dmd) == 0) {
1171 dv = dv_mknod(ddv, devi, nm, dmd);
1172 if (dmd->ddm_node_priv)
1173 dpfree(dmd->ddm_node_priv);
1174 }
1175 }
1176 ndi_devi_exit(devi, circ);
1177 kmem_free(dmd, sizeof (*dmd));
1178 }
1179 /*
1180 * Release hold from ndi_devi_config_one()
1181 */
1182 ndi_rele_devi(devi);
1183
1184 if (dv == NULL) {
1185 rv = ENOENT;
1186 goto notfound;
1187 }
1188
1189 /*
1190 * We have released the dv_contents lock, need to check
1191 * if another thread already created a duplicate node
1192 */
1193 rw_enter(&ddv->dv_contents, RW_WRITER);
1194 if ((dup = dv_findbyname(ddv, nm)) == NULL) {
1195 dv_insert(ddv, dv);
1196 } else {
1197 /*
1198 * Duplicate found, use the existing node
1199 */
1200 VN_RELE(DVTOV(dv));
1201 dv_destroy(dv, 0);
1202 dv = dup;
1203 }
1204 goto founddv;
1205 /*NOTREACHED*/
1206
1207 found:
1208 /*
1209 * Fail lookup of device that has now become hidden (typically via
1210 * hot removal of open device).
1211 */
1212 if (dv->dv_devi && ndi_dev_is_hidden_node(dv->dv_devi)) {
1213 dcmn_err2(("dv_find: nm %s failed: hidden/removed\n", nm));
1214 VN_RELE(vp);
1215 rv = ENOENT;
1216 goto notfound;
1217 }
1218
1219 /*
1220 * Skip non-kernel lookups of internal nodes.
1221 * This use of kcred to distinguish between user and
1222 * internal kernel lookups is unfortunate. The information
1223 * provided by the seg argument to lookupnameat should
1224 * evolve into a lookup flag for filesystems that need
1225 * this distinction.
1226 */
1227 if ((dv->dv_flags & DV_INTERNAL) && (cred != kcred)) {
1228 dcmn_err2(("dv_find: nm %s failed: internal\n", nm));
1229 VN_RELE(vp);
1230 rv = ENOENT;
1231 goto notfound;
1232 }
1233
1234 dcmn_err2(("dv_find: returning vp for nm %s\n", nm));
1235 if (vp->v_type == VCHR || vp->v_type == VBLK) {
1236 /*
1237 * If vnode is a device, return special vnode instead
1238 * (though it knows all about -us- via sp->s_realvp,
1239 * sp->s_devvp, and sp->s_dip)
1240 */
1241 *vpp = specvp_devfs(vp, vp->v_rdev, vp->v_type, cred,
1242 dv->dv_devi);
1243 VN_RELE(vp);
1244 if (*vpp == NULLVP)
1245 rv = ENOSYS;
1246 } else
1247 *vpp = vp;
1248
1249 notfound:
1250 if (was_busy) {
1251 /*
1252 * Non-zero was_busy tells us that we are not in the
1253 * devfs_clean() path which in turn means that we can afford
1254 * to take the contents lock unconditionally.
1255 */
1256 rw_enter(&ddv->dv_contents, RW_WRITER);
1257 ddv->dv_busy--;
1258 rw_exit(&ddv->dv_contents);
1259 }
1260 return (rv);
1261 }
1262
1263 /*
1264 * The given directory node is out-of-date; that is, it has been
1265 * marked as needing to be rebuilt, possibly because some new devinfo
1266 * node has come into existence, or possibly because this is the first
1267 * time we've been here.
1268 */
1269 void
1270 dv_filldir(struct dv_node *ddv)
1271 {
1272 struct dv_node *dv;
1273 dev_info_t *devi, *pdevi;
1274 struct ddi_minor_data *dmd;
1275 char devnm[MAXNAMELEN];
1276 int circ, ccirc;
1277
1278 ASSERT(DVTOV(ddv)->v_type == VDIR);
1279 ASSERT(RW_WRITE_HELD(&ddv->dv_contents));
1280 ASSERT(ddv->dv_flags & DV_BUILD);
1281
1282 dcmn_err3(("dv_filldir: %s\n", ddv->dv_name));
1283 if (DV_STALE(ddv))
1284 return;
1285 pdevi = ddv->dv_devi;
1286
1287 if (ndi_devi_config(pdevi, NDI_NO_EVENT) != NDI_SUCCESS) {
1288 dcmn_err3(("dv_filldir: config error %s\n", ddv->dv_name));
1289 }
1290
1291 ndi_devi_enter(pdevi, &circ);
1292 for (devi = ddi_get_child(pdevi); devi;
1293 devi = ddi_get_next_sibling(devi)) {
1294 /*
1295 * While we know enough to create a directory at DS_INITIALIZED,
1296 * the directory will be empty until DS_ATTACHED. The existence
1297 * of an empty directory dv_node will cause a devi_ref, which
1298 * has caused problems for existing code paths doing offline/DR
1299 * type operations - making devfs_clean coordination even more
1300 * sensitive and error prone. Given this, the 'continue' below
1301 * is checking for DS_ATTACHED instead of DS_INITIALIZED.
1302 */
1303 if (i_ddi_node_state(devi) < DS_ATTACHED)
1304 continue;
1305
1306 /* skip hidden nodes */
1307 if (ndi_dev_is_hidden_node(devi))
1308 continue;
1309
1310 dcmn_err3(("dv_filldir: node %s\n", ddi_node_name(devi)));
1311
1312 ndi_devi_enter(devi, &ccirc);
1313 for (dmd = DEVI(devi)->devi_minor; dmd; dmd = dmd->next) {
1314 char *addr;
1315
1316 /*
1317 * Skip alias nodes, internal nodes, and nodes
1318 * without a name. We allow DDM_DEFAULT nodes
1319 * to appear in readdir.
1320 */
1321 if ((dmd->type == DDM_ALIAS) ||
1322 (dmd->type == DDM_INTERNAL_PATH) ||
1323 (dmd->ddm_name == NULL))
1324 continue;
1325
1326 addr = ddi_get_name_addr(devi);
1327 if (addr && *addr)
1328 (void) sprintf(devnm, "%s@%s:%s",
1329 ddi_node_name(devi), addr, dmd->ddm_name);
1330 else
1331 (void) sprintf(devnm, "%s:%s",
1332 ddi_node_name(devi), dmd->ddm_name);
1333
1334 if ((dv = dv_findbyname(ddv, devnm)) != NULL) {
1335 /* dv_node already exists */
1336 VN_RELE(DVTOV(dv));
1337 continue;
1338 }
1339
1340 dv = dv_mknod(ddv, devi, devnm, dmd);
1341 dv_insert(ddv, dv);
1342 VN_RELE(DVTOV(dv));
1343 }
1344 ndi_devi_exit(devi, ccirc);
1345
1346 (void) ddi_deviname(devi, devnm);
1347 if ((dv = dv_findbyname(ddv, devnm + 1)) == NULL) {
1348 /* directory doesn't exist */
1349 dv = dv_mkdir(ddv, devi, devnm + 1);
1350 dv_insert(ddv, dv);
1351 }
1352 VN_RELE(DVTOV(dv));
1353 }
1354 ndi_devi_exit(pdevi, circ);
1355
1356 ddv->dv_flags &= ~DV_BUILD;
1357 }
1358
1359 /*
1360 * Given a directory node, clean out all the nodes beneath.
1361 *
1362 * VDIR: Reinvoke to clean them, then delete the directory.
1363 * VCHR, VBLK: Just blow them away.
1364 *
1365 * Mark the directories touched as in need of a rebuild, in case
1366 * we fall over part way through. When DV_CLEAN_FORCE is specified,
1367 * we mark referenced empty directories as stale to facilitate DR.
1368 */
int
dv_cleandir(struct dv_node *ddv, char *devnm, uint_t flags)
{
	struct dv_node *dv;
	struct dv_node *next;
	struct vnode *vp;
	int busy = 0;	/* nonzero once any entry is found busy */

	/*
	 * We should always be holding the tsd_clean_key here: dv_cleandir()
	 * will be called as a result of a devfs_clean request and the
	 * tsd_clean_key will be set in either in devfs_clean() itself or in
	 * devfs_clean_vhci().
	 *
	 * Since we are on the devfs_clean path, we return EBUSY if we cannot
	 * get the contents lock: if we blocked here we might deadlock against
	 * a thread performing top-down device configuration.
	 */
	ASSERT(tsd_get(devfs_clean_key));

	dcmn_err3(("dv_cleandir: %s\n", ddv->dv_name));

	/*
	 * DV_CLEANDIR_LCK means the (recursive) caller already holds
	 * ddv->dv_contents as writer; otherwise try-acquire it here.
	 */
	if (!(flags & DV_CLEANDIR_LCK) &&
	    !rw_tryenter(&ddv->dv_contents, RW_WRITER))
		return (EBUSY);

	/* next is captured up front since dv may be unlinked below */
	for (dv = DV_FIRST_ENTRY(ddv); dv; dv = next) {
		next = DV_NEXT_ENTRY(ddv, dv);

		/*
		 * If devnm is specified, the non-minor portion of the
		 * name must match devnm.
		 */
		if (devnm &&
		    (strncmp(devnm, dv->dv_name, strlen(devnm)) ||
		    (dv->dv_name[strlen(devnm)] != ':' &&
		    dv->dv_name[strlen(devnm)] != '\0')))
			continue;

		/* check type of what we are cleaning */
		vp = DVTOV(dv);
		if (vp->v_type == VDIR) {
			/* recurse on directories */
			rw_enter(&dv->dv_contents, RW_WRITER);
			if (dv_cleandir(dv, NULL,
			    flags | DV_CLEANDIR_LCK) == EBUSY) {
				rw_exit(&dv->dv_contents);
				goto set_busy;
			}

			/* A clean directory is an empty directory... */
			ASSERT(dv->dv_nlink == 2);
			mutex_enter(&vp->v_lock);
			if (vp->v_count > 0) {
				/*
				 * ... but an empty directory can still have
				 * references to it. If we have dv_busy or
				 * DV_CLEAN_FORCE is *not* specified then a
				 * referenced directory is considered busy.
				 */
				if (dv->dv_busy || !(flags & DV_CLEAN_FORCE)) {
					mutex_exit(&vp->v_lock);
					rw_exit(&dv->dv_contents);
					goto set_busy;
				}

				/*
				 * Mark referenced directory stale so that DR
				 * will succeed even if a shell has
				 * /devices/xxx as current directory (causing
				 * VN_HOLD reference to an empty directory).
				 */
				ASSERT(!DV_STALE(dv));
				ndi_rele_devi(dv->dv_devi);
				dv->dv_devi = NULL;	/* mark DV_STALE */
			}
		} else {
			ASSERT((vp->v_type == VCHR) || (vp->v_type == VBLK));
			ASSERT(dv->dv_nlink == 1);	/* no hard links */
			mutex_enter(&vp->v_lock);
			if (vp->v_count > 0) {
				/*
				 * The file still has references to it. If
				 * DV_DEVI_GONE is *not* specified then a
				 * referenced file is considered busy.
				 */
				if (!(flags & DV_DEVI_GONE)) {
					mutex_exit(&vp->v_lock);
					goto set_busy;
				}

				/*
				 * Mark referenced file stale so that DR will
				 * succeed even if there are userland opens.
				 */
				ASSERT(!DV_STALE(dv));
				ndi_rele_devi(dv->dv_devi);
				DEVI(dv->dv_devi)->devi_gone = 1;
				dv->dv_devi = NULL;	/* mark DV_STALE */
			}
		}

		/* unlink from directory */
		dv_unlink(ddv, dv);

		/* drop locks */
		mutex_exit(&vp->v_lock);
		if (vp->v_type == VDIR)
			rw_exit(&dv->dv_contents);

		/* destroy vnode if ref count is zero */
		if (vp->v_count == 0)
			dv_destroy(dv, flags);

		continue;

		/*
		 * If devnm is not NULL we return immediately on busy,
		 * otherwise we continue destroying unused dv_node's.
		 */
set_busy:	busy++;
		if (devnm)
			break;
	}

	/*
	 * This code may be invoked to inform devfs that a new node has
	 * been created in the kernel device tree. So we always set
	 * the DV_BUILD flag to allow the next dv_filldir() to pick
	 * the new devinfo nodes.
	 */
	ddv->dv_flags |= DV_BUILD;

	/* only release the lock if we acquired it ourselves above */
	if (!(flags & DV_CLEANDIR_LCK))
		rw_exit(&ddv->dv_contents);

	return (busy ? EBUSY : 0);
}
1507
1508 /*
1509 * Walk through the devfs hierarchy, correcting the permissions of
1510 * devices with default permissions that do not match those specified
1511 * by minor perm. This can only be done for all drivers for now.
1512 */
static int
dv_reset_perm_dir(struct dv_node *ddv, uint_t flags)
{
	struct dv_node *dv;
	struct vnode *vp;
	int retval = 0;		/* sticky: last per-entry error seen */
	struct vattr *attrp;
	mperm_t mp;
	char *nm;
	uid_t old_uid;
	gid_t old_gid;
	mode_t old_mode;

	rw_enter(&ddv->dv_contents, RW_WRITER);
	for (dv = DV_FIRST_ENTRY(ddv); dv; dv = DV_NEXT_ENTRY(ddv, dv)) {
		int error = 0;
		nm = dv->dv_name;

		rw_enter(&dv->dv_contents, RW_READER);
		vp = DVTOV(dv);
		if (vp->v_type == VDIR) {
			/* recurse into subdirectories without dv's lock */
			rw_exit(&dv->dv_contents);
			if (dv_reset_perm_dir(dv, flags) != 0) {
				error = EBUSY;
			}
		} else {
			ASSERT(vp->v_type == VCHR || vp->v_type == VBLK);

			/*
			 * Check for permissions from minor_perm
			 * If there are none, we're done
			 */
			rw_exit(&dv->dv_contents);
			if (dev_minorperm(dv->dv_devi, nm, &mp) != 0)
				continue;

			/*
			 * Lock dropped across dev_minorperm() and
			 * reacquired here; dv remains valid because we
			 * still hold ddv->dv_contents as writer.
			 */
			rw_enter(&dv->dv_contents, RW_READER);

			/*
			 * Allow a node's permissions to be altered
			 * permanently from the defaults by chmod,
			 * using the shadow node as backing store.
			 * Otherwise, update node to minor_perm permissions.
			 */
			if (dv->dv_attrvp == NULLVP) {
				/*
				 * No attribute vp, try to find one.
				 */
				dv_shadow_node(DVTOV(ddv), nm, vp,
				    NULL, NULLVP, kcred, 0);
			}
			if (dv->dv_attrvp != NULLVP || dv->dv_attr == NULL) {
				/* shadow-backed (chmod'ed) or no attrs */
				rw_exit(&dv->dv_contents);
				continue;
			}

			attrp = dv->dv_attr;

			if (VATTRP_MP_CMP(attrp, mp) == 0) {
				/* already matches minor_perm; nothing to do */
				dcmn_err5(("%s: no perm change: "
				    "%d %d 0%o\n", nm, attrp->va_uid,
				    attrp->va_gid, attrp->va_mode));
				rw_exit(&dv->dv_contents);
				continue;
			}

			/* save old values for the debug message below */
			old_uid = attrp->va_uid;
			old_gid = attrp->va_gid;
			old_mode = attrp->va_mode;

			/* apply minor_perm values to the cached attributes */
			VATTRP_MP_MERGE(attrp, mp);
			mutex_enter(&vp->v_lock);
			if (vp->v_count > 0) {
				/* node is open; report busy to caller */
				error = EBUSY;
			}
			mutex_exit(&vp->v_lock);

			dcmn_err5(("%s: perm %d/%d/0%o -> %d/%d/0%o (%d)\n",
			    nm, old_uid, old_gid, old_mode, attrp->va_uid,
			    attrp->va_gid, attrp->va_mode, error));

			rw_exit(&dv->dv_contents);
		}

		if (error != 0) {
			retval = error;
		}
	}

	/* force a rebuild so new minor nodes are noticed next lookup */
	ddv->dv_flags |= DV_BUILD;

	rw_exit(&ddv->dv_contents);

	return (retval);
}
1608
1609 int
1610 devfs_reset_perm(uint_t flags)
1611 {
1612 struct dv_node *dvp;
1613 int rval;
1614
1615 if ((dvp = devfs_dip_to_dvnode(ddi_root_node())) == NULL)
1616 return (0);
1617
1618 VN_HOLD(DVTOV(dvp));
1619 rval = dv_reset_perm_dir(dvp, flags);
1620 VN_RELE(DVTOV(dvp));
1621 return (rval);
1622 }
1623
1624 /*
1625 * Clean up dangling devfs shadow nodes for removed
1626 * drivers so that, in the event the driver is re-added
1627 * to the system, newly created nodes won't incorrectly
1628 * pick up these stale shadow node permissions.
1629 *
1630 * This is accomplished by walking down the pathname
1631 * to the directory, starting at the root's attribute
1632 * node, then removing all minors matching the specified
1633 * node name. Care must be taken to remove all entries
1634 * in a directory before the directory itself, so that
1635 * the clean-up associated with rem_drv'ing a nexus driver
1636 * does not inadvertently result in an inconsistent
1637 * filesystem underlying devfs.
1638 */
1639
/*
 * Recursively empty the attribute-fs directory dirvp (removing files and
 * subdirectories beneath it) so the caller can VOP_RMDIR() dirvp itself.
 * 'dir' is used only for debug messages; 'rvp' is the attribute fs root,
 * passed through to VOP_RMDIR as the "current directory" argument.
 * Returns 0 on success or the first VOP error encountered.
 */
static int
devfs_remdrv_rmdir(vnode_t *dirvp, const char *dir, vnode_t *rvp)
{
	int error;
	vnode_t *vp;
	int eof;
	struct iovec iov;
	struct uio uio;
	struct dirent64 *dp;
	dirent64_t *dbuf;
	size_t dlen;
	size_t dbuflen;
	int ndirents = 64;	/* sizes the readdir buffer */
	char *nm;

	/* balance the VN_RELE at exit: */
	VN_HOLD(dirvp);

	dlen = ndirents * (sizeof (*dbuf));
	dbuf = kmem_alloc(dlen, KM_SLEEP);

	/* set up a kernel uio for VOP_READDIR */
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_fmode = 0;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = 0;
	uio.uio_llimit = MAXOFFSET_T;

	eof = 0;
	error = 0;
	while (!error && !eof) {
		/* reset buffer for each readdir batch */
		uio.uio_resid = dlen;
		iov.iov_base = (char *)dbuf;
		iov.iov_len = dlen;

		(void) VOP_RWLOCK(dirvp, V_WRITELOCK_FALSE, NULL);
		error = VOP_READDIR(dirvp, &uio, kcred, &eof, NULL, 0);
		VOP_RWUNLOCK(dirvp, V_WRITELOCK_FALSE, NULL);

		dbuflen = dlen - uio.uio_resid;

		if (error || dbuflen == 0)
			break;

		/* walk the variable-length dirent64 records just read */
		for (dp = dbuf; ((intptr_t)dp < (intptr_t)dbuf + dbuflen);
		    dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen)) {

			nm = dp->d_name;

			if (strcmp(nm, ".") == 0 || strcmp(nm, "..") == 0)
				continue;

			error = VOP_LOOKUP(dirvp, nm,
			    &vp, NULL, 0, NULL, kcred, NULL, NULL, NULL);

			dsysdebug(error,
			    ("rem_drv %s/%s lookup (%d)\n",
			    dir, nm, error));

			/* entry vanished or is inaccessible; skip it */
			if (error)
				continue;

			ASSERT(vp->v_type == VDIR ||
			    vp->v_type == VCHR || vp->v_type == VBLK);

			if (vp->v_type == VDIR) {
				/* depth-first: empty the subdir, then rmdir */
				error = devfs_remdrv_rmdir(vp, nm, rvp);
				if (error == 0) {
					error = VOP_RMDIR(dirvp,
					    (char *)nm, rvp, kcred, NULL, 0);
					dsysdebug(error,
					    ("rem_drv %s/%s rmdir (%d)\n",
					    dir, nm, error));
				}
			} else {
				error = VOP_REMOVE(dirvp, (char *)nm, kcred,
				    NULL, 0);
				dsysdebug(error,
				    ("rem_drv %s/%s remove (%d)\n",
				    dir, nm, error));
			}

			VN_RELE(vp);
			if (error) {
				goto exit;
			}
		}
	}

exit:
	VN_RELE(dirvp);
	kmem_free(dbuf, dlen);

	return (error);
}
1735
/*
 * Remove stale shadow (attribute fs) nodes for driver 'nodename' under
 * directory 'dir' of the attribute fs backing devfs.  The path in 'dir'
 * is walked component by component from the attribute fs root; then all
 * entries whose names start with 'nodename' are removed (directories are
 * emptied first via devfs_remdrv_rmdir()).  Cleanup is best-effort:
 * failures are logged via dsysdebug and the function always returns 0.
 */
int
devfs_remdrv_cleanup(const char *dir, const char *nodename)
{
	int error;
	vnode_t *vp;
	vnode_t *dirvp;
	int eof;
	struct iovec iov;
	struct uio uio;
	struct dirent64 *dp;
	dirent64_t *dbuf;
	size_t dlen;
	size_t dbuflen;
	int ndirents = 64;	/* sizes the readdir buffer */
	int nodenamelen = strlen(nodename);
	char *nm;
	struct pathname pn;
	vnode_t *rvp;	/* root node of the underlying attribute fs */

	dcmn_err5(("devfs_remdrv_cleanup: %s %s\n", dir, nodename));

	/* intentional assignment-in-condition; bad path is a no-op */
	if (error = pn_get((char *)dir, UIO_SYSSPACE, &pn))
		return (0);

	rvp = dvroot->dv_attrvp;
	ASSERT(rvp != NULL);
	VN_HOLD(rvp);

	/* walk pathname components down to the target directory */
	pn_skipslash(&pn);
	dirvp = rvp;
	VN_HOLD(dirvp);

	nm = kmem_alloc(MAXNAMELEN, KM_SLEEP);

	while (pn_pathleft(&pn)) {
		ASSERT(dirvp->v_type == VDIR);
		(void) pn_getcomponent(&pn, nm);
		ASSERT((strcmp(nm, ".") != 0) && (strcmp(nm, "..") != 0));
		error = VOP_LOOKUP(dirvp, nm, &vp, NULL, 0, rvp, kcred,
		    NULL, NULL, NULL);
		if (error) {
			/* nothing to clean up if the path doesn't exist */
			dcmn_err5(("remdrv_cleanup %s lookup error %d\n",
			    nm, error));
			VN_RELE(dirvp);
			if (dirvp != rvp)
				VN_RELE(rvp);
			pn_free(&pn);
			kmem_free(nm, MAXNAMELEN);
			return (0);
		}
		VN_RELE(dirvp);
		dirvp = vp;	/* descend; vp is held from the lookup */
		pn_skipslash(&pn);
	}

	ASSERT(dirvp->v_type == VDIR);
	if (dirvp != rvp)
		VN_RELE(rvp);
	pn_free(&pn);
	kmem_free(nm, MAXNAMELEN);

	dlen = ndirents * (sizeof (*dbuf));
	dbuf = kmem_alloc(dlen, KM_SLEEP);

	/* set up a kernel uio for VOP_READDIR */
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_fmode = 0;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = 0;
	uio.uio_llimit = MAXOFFSET_T;

	eof = 0;
	error = 0;
	while (!error && !eof) {
		/* reset buffer for each readdir batch */
		uio.uio_resid = dlen;
		iov.iov_base = (char *)dbuf;
		iov.iov_len = dlen;

		(void) VOP_RWLOCK(dirvp, V_WRITELOCK_FALSE, NULL);
		error = VOP_READDIR(dirvp, &uio, kcred, &eof, NULL, 0);
		VOP_RWUNLOCK(dirvp, V_WRITELOCK_FALSE, NULL);

		dbuflen = dlen - uio.uio_resid;

		if (error || dbuflen == 0)
			break;

		/* walk the variable-length dirent64 records just read */
		for (dp = dbuf; ((intptr_t)dp < (intptr_t)dbuf + dbuflen);
		    dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen)) {

			nm = dp->d_name;

			if (strcmp(nm, ".") == 0 || strcmp(nm, "..") == 0)
				continue;

			/* only entries belonging to the removed driver */
			if (strncmp(nm, nodename, nodenamelen) != 0)
				continue;

			error = VOP_LOOKUP(dirvp, nm, &vp,
			    NULL, 0, NULL, kcred, NULL, NULL, NULL);

			dsysdebug(error,
			    ("rem_drv %s/%s lookup (%d)\n",
			    dir, nm, error));

			if (error)
				continue;

			ASSERT(vp->v_type == VDIR ||
			    vp->v_type == VCHR || vp->v_type == VBLK);

			if (vp->v_type == VDIR) {
				/* empty the directory before removing it */
				error = devfs_remdrv_rmdir(vp, nm, rvp);
				if (error == 0) {
					error = VOP_RMDIR(dirvp, (char *)nm,
					    rvp, kcred, NULL, 0);
					dsysdebug(error,
					    ("rem_drv %s/%s rmdir (%d)\n",
					    dir, nm, error));
				}
			} else {
				error = VOP_REMOVE(dirvp, (char *)nm, kcred,
				    NULL, 0);
				dsysdebug(error,
				    ("rem_drv %s/%s remove (%d)\n",
				    dir, nm, error));
			}

			VN_RELE(vp);
			if (error)
				goto exit;
		}
	}

exit:
	VN_RELE(dirvp);

	kmem_free(dbuf, dlen);

	/* best-effort: errors were logged above, always report success */
	return (0);
}
1878
/*
 * Singly-linked list element used by dv_walk() to remember subdirectory
 * dv_nodes encountered during a directory scan so they can be walked
 * after the scan of the current directory completes.
 */
struct dv_list {
	struct dv_node *dv;	/* directory dv_node to walk later */
	struct dv_list *next;	/* next deferred directory, or NULL */
};
1883
/*
 * Walk the dv_node tree rooted at ddv, invoking callback(dv, arg) on
 * each entry.  If devnm is non-NULL and non-empty, only entries whose
 * non-minor name matches devnm are visited.  Subdirectories found during
 * the scan are collected on a dv_list and recursed into (with devnm
 * filtering disabled) after the scan of the current directory.
 */
void
dv_walk(
	struct dv_node *ddv,
	char *devnm,
	void (*callback)(struct dv_node *, void *),
	void *arg)
{
	struct vnode *dvp;
	struct dv_node *dv;
	struct dv_list *head, *tail, *next;
	int len;

	dcmn_err3(("dv_walk: ddv = %s, devnm = %s\n",
	    ddv->dv_name, devnm ? devnm : "<null>"));

	dvp = DVTOV(ddv);

	ASSERT(dvp->v_type == VDIR);

	head = tail = next = NULL;

	/*
	 * Hold the directory's contents lock and its vnode lock across
	 * the scan.  NOTE(review): both locks remain held while recursing
	 * into subdirectories below and while kmem_zalloc(KM_SLEEP)ing;
	 * this relies on child locks nesting below the parent's - confirm
	 * against the devfs lock ordering rules before changing.
	 */
	rw_enter(&ddv->dv_contents, RW_READER);
	mutex_enter(&dvp->v_lock);
	for (dv = DV_FIRST_ENTRY(ddv); dv; dv = DV_NEXT_ENTRY(ddv, dv)) {
		/*
		 * If devnm is not NULL and is not the empty string,
		 * select only dv_nodes with matching non-minor name
		 */
		if (devnm && (len = strlen(devnm)) &&
		    (strncmp(devnm, dv->dv_name, len) ||
		    (dv->dv_name[len] != ':' && dv->dv_name[len] != '\0')))
			continue;

		callback(dv, arg);

		if (DVTOV(dv)->v_type != VDIR)
			continue;

		/* defer recursion into this subdirectory until after scan */
		next = kmem_zalloc(sizeof (*next), KM_SLEEP);
		next->dv = dv;

		if (tail)
			tail->next = next;
		else
			head = next;

		tail = next;
	}

	/* walk the deferred subdirectories, freeing list nodes as we go */
	while (head) {
		dv_walk(head->dv, NULL, callback, arg);
		next = head->next;
		kmem_free(head, sizeof (*head));
		head = next;
	}
	rw_exit(&ddv->dv_contents);
	mutex_exit(&dvp->v_lock);
}