1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 /*
  27  * This is the device filesystem.
  28  *
  29  * It is a combination of a namer to drive autoconfiguration,
  30  * plus the access methods for the device drivers of the system.
  31  *
  32  * The prototype is fairly dependent on specfs for the latter part
  33  * of its implementation, though a final version would integrate the two.
  34  */
  35 #include <sys/types.h>
  36 #include <sys/param.h>
  37 #include <sys/sysmacros.h>
  38 #include <sys/systm.h>
  39 #include <sys/kmem.h>
  40 #include <sys/time.h>
  41 #include <sys/pathname.h>
  42 #include <sys/vfs.h>
  43 #include <sys/vfs_opreg.h>
  44 #include <sys/vnode.h>
  45 #include <sys/stat.h>
  46 #include <sys/uio.h>
  47 #include <sys/stat.h>
  48 #include <sys/errno.h>
  49 #include <sys/cmn_err.h>
  50 #include <sys/cred.h>
  51 #include <sys/statvfs.h>
  52 #include <sys/mount.h>
  53 #include <sys/debug.h>
  54 #include <sys/modctl.h>
  55 #include <fs/fs_subr.h>
  56 #include <sys/fs/dv_node.h>
  57 #include <sys/fs/snode.h>
  58 #include <sys/sunndi.h>
  59 #include <sys/policy.h>
  60 #include <sys/sunmdi.h>
  61 
  62 /*
  63  * devfs vfs operations.
  64  */
  65 static int devfs_mount(struct vfs *, struct vnode *, struct mounta *,
  66     struct cred *);
  67 static int devfs_unmount(struct vfs *, int, struct cred *);
  68 static int devfs_root(struct vfs *, struct vnode **);
  69 static int devfs_statvfs(struct vfs *, struct statvfs64 *);
  70 static int devfs_mountroot(struct vfs *, enum whymountroot);
  71 
  72 static int devfsinit(int, char *);
  73 
  74 static vfsdef_t devfs_vfssw = {
  75         VFSDEF_VERSION,
  76         "devfs",        /* type name string */
  77         devfsinit,      /* init routine */
  78         0,              /* flags */
  79         NULL            /* mount options table prototype */
  80 };
  81 
  82 static kmutex_t devfs_lock;     /* protects global data */
  83 static int devfstype;           /* fstype */
  84 static dev_t devfsdev;          /* the fictious 'device' we live on */
  85 static struct devfs_data *devfs_mntinfo;        /* linked list of instances */
  86 
  87 /*
  88  * Module linkage information
  89  */
  90 static struct modlfs modlfs = {
  91         &mod_fsops, "devices filesystem", &devfs_vfssw
  92 };
  93 
  94 static struct modlinkage modlinkage = {
  95         MODREV_1, (void *)&modlfs, NULL
  96 };
  97 
  98 int
  99 _init(void)
 100 {
 101         int e;
 102 
 103         mutex_init(&devfs_lock, "devfs lock", MUTEX_DEFAULT, NULL);
 104         dv_node_cache_init();
 105         if ((e = mod_install(&modlinkage)) != 0) {
 106                 dv_node_cache_fini();
 107                 mutex_destroy(&devfs_lock);
 108                 return (e);
 109         }
 110         dcmn_err(("devfs loaded\n"));
 111         return (0);
 112 }
 113 
 114 int
 115 _fini(void)
 116 {
 117         return (EBUSY);
 118 }
 119 
 120 int
 121 _info(struct modinfo *modinfop)
 122 {
 123         return (mod_info(&modlinkage, modinfop));
 124 }
 125 
 126 /*ARGSUSED1*/
 127 static int
 128 devfsinit(int fstype, char *name)
 129 {
 130         static const fs_operation_def_t devfs_vfsops_template[] = {
 131                 VFSNAME_MOUNT,          { .vfs_mount = devfs_mount },
 132                 VFSNAME_UNMOUNT,        { .vfs_unmount = devfs_unmount },
 133                 VFSNAME_ROOT,           { .vfs_root = devfs_root },
 134                 VFSNAME_STATVFS,        { .vfs_statvfs = devfs_statvfs },
 135                 VFSNAME_SYNC,           { .vfs_sync = fs_sync },
 136                 VFSNAME_MOUNTROOT,      { .vfs_mountroot = devfs_mountroot },
 137                 NULL,                   NULL
 138         };
 139         int error;
 140         int dev;
 141         extern major_t getudev(void);   /* gack - what a function */
 142 
 143         devfstype = fstype;
 144         /*
 145          * Associate VFS ops vector with this fstype
 146          */
 147         error = vfs_setfsops(fstype, devfs_vfsops_template, NULL);
 148         if (error != 0) {
 149                 cmn_err(CE_WARN, "devfsinit: bad vfs ops template");
 150                 return (error);
 151         }
 152 
 153         error = vn_make_ops("dev fs", dv_vnodeops_template, &dv_vnodeops);
 154         if (error != 0) {
 155                 (void) vfs_freevfsops_by_type(fstype);
 156                 cmn_err(CE_WARN, "devfsinit: bad vnode ops template");
 157                 return (error);
 158         }
 159 
 160         /*
 161          * Invent a dev_t (sigh).
 162          */
 163         if ((dev = getudev()) == DDI_MAJOR_T_NONE) {
 164                 cmn_err(CE_NOTE, "%s: can't get unique dev", devfs_vfssw.name);
 165                 dev = 0;
 166         }
 167         devfsdev = makedevice(dev, 0);
 168 
 169         return (0);
 170 }
 171 
 172 /*
 173  * The name of the mount point and the name of the attribute
 174  * filesystem are passed down from userland for now.
 175  */
 176 static int
 177 devfs_mount(struct vfs *vfsp, struct vnode *mvp, struct mounta *uap,
 178     struct cred *cr)
 179 {
 180         struct devfs_data *devfs_data;
 181         struct vnode *avp;
 182         struct dv_node *dv;
 183         struct vattr va;
 184 
 185         dcmn_err(("devfs_mount\n"));
 186 
 187         if (secpolicy_fs_mount(cr, mvp, vfsp) != 0)
 188                 return (EPERM);
 189 
 190         /*
 191          * check that the mount point is sane
 192          */
 193         if (mvp->v_type != VDIR)
 194                 return (ENOTDIR);
 195 
 196         ASSERT(uap->flags & MS_SYSSPACE);
 197         /*
 198          * Devfs can only be mounted from kernel during boot.
 199          * avp is the existing /devices, the same as the mount point.
 200          */
 201         avp = mvp;
 202 
 203         /*
 204          * Create and initialize the vfs-private data.
 205          * This includes a hand-crafted root vnode (we build
 206          * this here mostly so that traverse() doesn't sleep
 207          * in VFS_ROOT()).
 208          */
 209         mutex_enter(&devfs_lock);
 210         ASSERT(devfs_mntinfo == NULL);
 211         dv = dv_mkroot(vfsp, devfsdev);
 212         dv->dv_attrvp = avp;         /* attribute root vp */
 213 
 214         ASSERT(dv == dv->dv_dotdot);
 215 
 216         devfs_data = kmem_zalloc(sizeof (struct devfs_data), KM_SLEEP);
 217         devfs_data->devfs_vfsp = vfsp;
 218         devfs_data->devfs_root = dv;
 219 
 220         vfsp->vfs_data = (caddr_t)devfs_data;
 221         vfsp->vfs_fstype = devfstype;
 222         vfsp->vfs_dev = devfsdev;
 223         vfsp->vfs_bsize = DEV_BSIZE;
 224         vfsp->vfs_mtime = ddi_get_time();
 225         vfs_make_fsid(&vfsp->vfs_fsid, vfsp->vfs_dev, devfstype);
 226 
 227         /* We're there. */
 228         devfs_mntinfo = devfs_data;
 229         mutex_exit(&devfs_lock);
 230 
 231         va.va_mask = AT_ATIME|AT_MTIME;
 232         gethrestime(&va.va_atime);
 233         gethrestime(&va.va_mtime);
 234         (void) VOP_SETATTR(DVTOV(dv), &va, 0, cr, NULL);
 235         return (0);
 236 }
 237 
 238 
 239 /*
 240  * We never unmount devfs in a real production system.
 241  */
 242 /*ARGSUSED*/
 243 static int
 244 devfs_unmount(struct vfs *vfsp, int flag, struct cred *cr)
 245 {
 246         return (EBUSY);
 247 }
 248 
 249 /*
 250  * return root vnode for given vfs
 251  */
 252 static int
 253 devfs_root(struct vfs *vfsp, struct vnode **vpp)
 254 {
 255         dcmn_err(("devfs_root\n"));
 256         *vpp = DVTOV(VFSTODVFS(vfsp)->devfs_root);
 257         VN_HOLD(*vpp);
 258         return (0);
 259 }
 260 
 261 /*
 262  * return 'generic superblock' information to userland.
 263  *
 264  * not much that we can usefully admit to here
 265  */
 266 static int
 267 devfs_statvfs(struct vfs *vfsp, struct statvfs64 *sbp)
 268 {
 269         extern kmem_cache_t *dv_node_cache;
 270 
 271         dev32_t d32;
 272 
 273         dcmn_err(("devfs_statvfs\n"));
 274         bzero(sbp, sizeof (*sbp));
 275         sbp->f_frsize = sbp->f_bsize = vfsp->vfs_bsize;
 276         /*
 277          * We could compute the number of devfsnodes here .. but since
 278          * it's dynamic anyway, it's not clear how useful this is.
 279          */
 280         sbp->f_files = kmem_cache_stat(dv_node_cache, "alloc");
 281 
 282         /* no illusions that free/avail files is relevant to devfs */
 283         sbp->f_ffree = 0;
 284         sbp->f_favail = 0;
 285 
 286         /* no illusions that blocks are relevant to devfs */
 287         sbp->f_bfree = 0;
 288         sbp->f_bavail = 0;
 289         sbp->f_blocks = 0;
 290 
 291         (void) cmpldev(&d32, vfsp->vfs_dev);
 292         sbp->f_fsid = d32;
 293         (void) strcpy(sbp->f_basetype, vfssw[devfstype].vsw_name);
 294         sbp->f_flag = vf_to_stf(vfsp->vfs_flag);
 295         sbp->f_namemax = MAXNAMELEN - 1;
 296         (void) strcpy(sbp->f_fstr, "devices");
 297 
 298         return (0);
 299 }
 300 
 301 /*
 302  * devfs always mount after root is mounted, so this should never
 303  * be invoked.
 304  */
 305 /*ARGSUSED*/
 306 static int
 307 devfs_mountroot(struct vfs *vfsp, enum whymountroot why)
 308 {
 309         dcmn_err(("devfs_mountroot\n"));
 310 
 311         return (EINVAL);
 312 }
 313 
 314 struct dv_node *
 315 devfs_dip_to_dvnode(dev_info_t *dip)
 316 {
 317         char *dirpath;
 318         struct vnode *dirvp;
 319 
 320         ASSERT(dip != NULL);
 321 
 322         /* no-op if devfs not mounted yet */
 323         if (devfs_mntinfo == NULL)
 324                 return (NULL);
 325 
 326         /*
 327          * The lookupname below only looks up cached dv_nodes
 328          * because devfs_clean_key is set in thread specific data.
 329          */
 330         dirpath = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 331         (void) ddi_pathname(dip, dirpath);
 332         if (devfs_lookupname(dirpath, NULLVPP, &dirvp)) {
 333                 dcmn_err(("directory %s not found\n", dirpath));
 334                 kmem_free(dirpath, MAXPATHLEN);
 335                 return (NULL);
 336         }
 337 
 338         kmem_free(dirpath, MAXPATHLEN);
 339         return (VTODV(dirvp));
 340 }
 341 
 342 /*
 343  * If DV_CLEAN_FORCE devfs_clean is issued with a dip that is not the root
 344  * and not a vHCI we also need to clean any vHCI branches because they
 345  * may contain pHCI nodes. A detach_node() of a pHCI will fail if its
 346  * mdi_devi_offline() fails, and the mdi_devi_offline() of the last
 347  * pHCI will fail unless an ndi_devi_offline() of the Client nodes under
 * the vHCI is successful - which requires a clean vHCI branch to remove
 349  * the devi_refs associated with devfs vnodes.
 350  */
 351 static int
 352 devfs_clean_vhci(dev_info_t *dip, void *args)
 353 {
 354         struct dv_node  *dvp;
 355         uint_t          flags = (uint_t)(uintptr_t)args;
 356 
 357         (void) tsd_set(devfs_clean_key, (void *)1);
 358         dvp = devfs_dip_to_dvnode(dip);
 359         if (dvp) {
 360                 (void) dv_cleandir(dvp, NULL, flags);
 361                 VN_RELE(DVTOV(dvp));
 362         }
 363         (void) tsd_set(devfs_clean_key, NULL);
 364         return (DDI_WALK_CONTINUE);
 365 }
 366 
 367 /*
 368  * devfs_clean()
 369  *
 370  * Destroy unreferenced dv_node's and detach devices.
 371  *
 372  * devfs_clean will try its best to clean up unused nodes. It is
 373  * no longer valid to assume that just because devfs_clean fails,
 374  * the device is not removable. This is because device contracts
 375  * can result in userland processes releasing a device during the
 376  * device offline process in the kernel. Thus it is no longer
 377  * correct to fail an offline just because devfs_clean finds
 378  * referenced dv_nodes. To enforce this, devfs_clean() always
 379  * returns success i.e. 0.
 380  *
 381  * devfs_clean() may return before removing all possible nodes if
 382  * we cannot acquire locks in areas of the code where potential for
 383  * deadlock exists (see comments in dv_find() and dv_cleandir() for
 384  * examples of this).
 385  *
 * devfs caches unreferenced dv_nodes to improve the performance
 * of ls, find, etc. devfs_clean() is invoked to clean up cached
 * dv_nodes to reclaim memory as well as to facilitate device
 * removal (dv_nodes reference devinfo nodes, which prevents driver
 * detach).
 391  *
 * If a shell parks in a /devices directory, the dv_node will be
 * held, preventing the corresponding device from being detached.
 394  * This would be a denial of service against DR. To prevent this,
 395  * DR code calls devfs_clean() with the DV_CLEAN_FORCE flag.
 396  * The dv_cleandir() implementation does the right thing to ensure
 397  * successful DR.
 398  */
 399 int
 400 devfs_clean(dev_info_t *dip, char *devnm, uint_t flags)
 401 {
 402         struct dv_node          *dvp;
 403 
 404         dcmn_err(("devfs_unconfigure: dip = 0x%p, flags = 0x%x",
 405             (void *)dip, flags));
 406 
 407         /* avoid recursion back into the device tree */
 408         (void) tsd_set(devfs_clean_key, (void *)1);
 409         dvp = devfs_dip_to_dvnode(dip);
 410         if (dvp == NULL) {
 411                 (void) tsd_set(devfs_clean_key, NULL);
 412                 return (0);
 413         }
 414 
 415         (void) dv_cleandir(dvp, devnm, flags);
 416         (void) tsd_set(devfs_clean_key, NULL);
 417         VN_RELE(DVTOV(dvp));
 418 
 419         /*
 420          * If we are doing a DV_CLEAN_FORCE, and we did not start at the
 421          * root, and we did not start at a vHCI node then clean vHCI
 422          * branches too.  Failure to clean vHCI branch does not cause EBUSY.
 423          *
 424          * Also, to accommodate nexus callers that clean 'self' to DR 'child'
 425          * (like pcihp) we clean vHCIs even when dv_cleandir() of dip branch
 426          * above fails - this prevents a busy DR 'child' sibling from causing
 427          * the DR of 'child' to fail because a vHCI branch was not cleaned.
 428          */
 429         if ((flags & DV_CLEAN_FORCE) && (dip != ddi_root_node()) &&
 430             (mdi_component_is_vhci(dip, NULL) != MDI_SUCCESS)) {
 431                 /*
 432                  * NOTE: for backport the following is recommended
 433                  *      (void) devfs_clean_vhci(scsi_vhci_dip,
 434                  *          (void *)(uintptr_t)flags);
 435                  */
 436                 mdi_walk_vhcis(devfs_clean_vhci, (void *)(uintptr_t)flags);
 437         }
 438 
 439         return (0);
 440 }
 441 
 442 /*
 443  * lookup a devfs relative pathname, returning held vnodes for the final
 444  * component and the containing directory (if requested).
 445  *
 * NOTE: We can't use lookupname because this would use the current
 *      process's credentials (CRED) in the call to lookuppnvp instead
 *      of kcred.  It also does not give you the flexibility to
 *      specify the directory to start the resolution in (devicesdir).
 450  */
 451 int
 452 devfs_lookupname(
 453         char    *pathname,              /* user pathname */
 454         vnode_t **dirvpp,               /* ret for ptr to parent dir vnode */
 455         vnode_t **compvpp)              /* ret for ptr to component vnode */
 456 {
 457         struct pathname pn;
 458         int             error;
 459 
 460         ASSERT(devicesdir);             /* devfs must be initialized */
 461         ASSERT(pathname);               /* must have some path */
 462 
 463         if (error = pn_get(pathname, UIO_SYSSPACE, &pn))
 464                 return (error);
 465 
 466         /* make the path relative to /devices. */
 467         pn_skipslash(&pn);
 468         if (pn_pathleft(&pn) == 0) {
 469                 /* all we had was "\0" or "/" (which skipslash skiped) */
 470                 if (dirvpp)
 471                         *dirvpp = NULL;
 472                 if (compvpp) {
 473                         VN_HOLD(devicesdir);
 474                         *compvpp = devicesdir;
 475                 }
 476         } else {
 477                 /*
 478                  * Use devfs lookup to resolve pathname to the vnode for
 479                  * the device via relative lookup in devfs. Extra holds for
 480                  * using devicesdir as directory we are searching and for
 481                  * being our root without being == rootdir.
 482                  */
 483                 VN_HOLD(devicesdir);
 484                 VN_HOLD(devicesdir);
 485                 error = lookuppnvp(&pn, NULL, FOLLOW, dirvpp, compvpp,
 486                     devicesdir, devicesdir, kcred);
 487         }
 488         pn_free(&pn);
 489 
 490         return (error);
 491 }
 492 
 493 /*
 494  * Given a devfs path (without the /devices prefix), walk
 495  * the dv_node sub-tree rooted at the path.
 496  */
 497 int
 498 devfs_walk(
 499         char            *path,
 500         void            (*callback)(struct dv_node *, void *),
 501         void            *arg)
 502 {
 503         char *dirpath, *devnm;
 504         struct vnode    *dirvp;
 505 
 506         ASSERT(path && callback);
 507 
 508         if (*path != '/' || devfs_mntinfo == NULL)
 509                 return (ENXIO);
 510 
 511         dcmn_err(("devfs_walk: path = %s", path));
 512 
 513         dirpath = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 514 
 515         (void) snprintf(dirpath, MAXPATHLEN, "/devices%s", path);
 516 
 517         devnm = strrchr(dirpath, '/');
 518 
 519         ASSERT(devnm);
 520 
 521         *devnm++ = '\0';
 522 
 523         if (lookupname(dirpath, UIO_SYSSPACE, 0, NULL, &dirvp)) {
 524                 dcmn_err(("directory %s not found\n", dirpath));
 525                 kmem_free(dirpath, MAXPATHLEN);
 526                 return (ENXIO);
 527         }
 528 
 529         /*
 530          * if path == "/", visit the root dv_node
 531          */
 532         if (*devnm == '\0') {
 533                 callback(VTODV(dirvp), arg);
 534                 devnm = NULL;
 535         }
 536 
 537         dv_walk(VTODV(dirvp), devnm, callback, arg);
 538 
 539         VN_RELE(dirvp);
 540 
 541         kmem_free(dirpath, MAXPATHLEN);
 542 
 543         return (0);
 544 }
 545 
 546 int
 547 devfs_devpolicy(vnode_t *vp, devplcy_t **dpp)
 548 {
 549         struct vnode *rvp;
 550         struct dv_node *dvp;
 551         int rval = -1;
 552 
 553         /* fail if devfs not mounted yet */
 554         if (devfs_mntinfo == NULL)
 555                 return (rval);
 556 
 557         if (VOP_REALVP(vp, &rvp, NULL) == 0 && vn_matchops(rvp, dv_vnodeops)) {
 558                 dvp = VTODV(rvp);
 559                 rw_enter(&dvp->dv_contents, RW_READER);
 560                 if (dvp->dv_priv) {
 561                         dphold(dvp->dv_priv);
 562                         *dpp = dvp->dv_priv;
 563                         rval = 0;
 564                 }
 565                 rw_exit(&dvp->dv_contents);
 566         }
 567         return (rval);
 568 }