1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 
  25 /*
  26  * vnode ops for the devfs
  27  *
  28  * For leaf vnode special files (VCHR|VBLK) specfs will always see the VOP
  29  * first because dv_find always performs leaf vnode substitution, returning
  30  * a specfs vnode with an s_realvp pointing to the devfs leaf vnode. This
  31  * means that the only leaf special file VOP operations that devfs will see
  32  * after VOP_LOOKUP are the ones that specfs forwards.
  33  */
  34 
  35 #include <sys/types.h>
  36 #include <sys/param.h>
  37 #include <sys/t_lock.h>
  38 #include <sys/systm.h>
  39 #include <sys/sysmacros.h>
  40 #include <sys/user.h>
  41 #include <sys/time.h>
  42 #include <sys/vfs.h>
  43 #include <sys/vnode.h>
  44 #include <sys/vfs_opreg.h>
  45 #include <sys/file.h>
  46 #include <sys/fcntl.h>
  47 #include <sys/flock.h>
  48 #include <sys/kmem.h>
  49 #include <sys/uio.h>
  50 #include <sys/errno.h>
  51 #include <sys/stat.h>
  52 #include <sys/cred.h>
  53 #include <sys/dirent.h>
  54 #include <sys/pathname.h>
  55 #include <sys/cmn_err.h>
  56 #include <sys/debug.h>
  57 #include <sys/policy.h>
  58 #include <sys/modctl.h>
  59 #include <sys/sunndi.h>
  60 #include <fs/fs_subr.h>
  61 #include <sys/fs/dv_node.h>
  62 
  63 extern struct vattr     dv_vattr_dir, dv_vattr_file;
  64 extern dev_t rconsdev;
  65 
  66 /*
  67  * Open of devices (leaf nodes) is handled by specfs.
  68  * There is nothing to do to open a directory
  69  */
  70 /*ARGSUSED*/
  71 static int
  72 devfs_open(struct vnode **vpp, int flag, struct cred *cred,
  73     caller_context_t *ct)
  74 {
  75         struct dv_node  *dv = VTODV(*vpp);
  76 
  77         dcmn_err2(("devfs_open %s\n", dv->dv_name));
  78         ASSERT((*vpp)->v_type == VDIR);
  79         return (0);
  80 }
  81 
  82 /*
  83  * Close of devices (leaf nodes) is handled by specfs.
  84  * There is nothing much to do inorder to close a directory.
  85  */
  86 /*ARGSUSED1*/
  87 static int
  88 devfs_close(struct vnode *vp, int flag, int count,
  89     offset_t offset, struct cred *cred, caller_context_t *ct)
  90 {
  91         struct dv_node  *dv = VTODV(vp);
  92 
  93         dcmn_err2(("devfs_close %s\n", dv->dv_name));
  94         ASSERT(vp->v_type == VDIR);
  95 
  96         cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
  97         cleanshares(vp, ttoproc(curthread)->p_pid);
  98         return (0);
  99 }
 100 
 101 /*
 102  * Read of devices (leaf nodes) is handled by specfs.
 103  * Read of directories is not supported.
 104  */
 105 /*ARGSUSED*/
 106 static int
 107 devfs_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred,
 108         struct caller_context *ct)
 109 {
 110         dcmn_err2(("devfs_read %s\n", VTODV(vp)->dv_name));
 111         ASSERT(vp->v_type == VDIR);
 112         ASSERT(RW_READ_HELD(&VTODV(vp)->dv_contents));
 113         return (EISDIR);
 114 }
 115 
 116 /*
 117  * Write of devices (leaf nodes) is handled by specfs.
 118  * Write of directories is not supported.
 119  */
 120 /*ARGSUSED*/
 121 static int
 122 devfs_write(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred,
 123         struct caller_context *ct)
 124 {
 125         dcmn_err2(("devfs_write %s\n", VTODV(vp)->dv_name));
 126         ASSERT(vp->v_type == VDIR);
 127         ASSERT(RW_WRITE_HELD(&VTODV(vp)->dv_contents));
 128         return (EISDIR);
 129 }
 130 
 131 /*
 132  * Ioctls to device (leaf nodes) is handled by specfs.
 133  * Ioctl to directories is not supported.
 134  */
 135 /*ARGSUSED*/
 136 static int
 137 devfs_ioctl(struct vnode *vp, int cmd, intptr_t arg, int flag,
 138     struct cred *cred, int *rvalp, caller_context_t *ct)
 139 {
 140         dcmn_err2(("devfs_ioctl %s\n", VTODV(vp)->dv_name));
 141         ASSERT(vp->v_type == VDIR);
 142 
 143         return (ENOTTY);        /* no ioctls supported */
 144 }
 145 
 146 /*
 147  * We can be asked directly about the attributes of directories, or
 148  * (via sp->s_realvp) about the filesystem attributes of special files.
 149  *
 150  * For directories, we just believe the attribute store
 151  * though we mangle the nodeid, fsid, and rdev to convince userland we
 152  * really are a different filesystem.
 153  *
 154  * For special files, a little more fakery is required.
 155  *
 156  * If the attribute store is not there (read only root), we believe our
 157  * memory based attributes.
 158  */
 159 static int
 160 devfs_getattr(struct vnode *vp, struct vattr *vap, int flags, struct cred *cr,
 161     caller_context_t *ct)
 162 {
 163         struct dv_node  *dv = VTODV(vp);
 164         int             error = 0;
 165         uint_t          mask;
 166 
 167         /*
 168          * Message goes to console only. Otherwise, the message
 169          * causes devfs_getattr to be invoked again... infinite loop
 170          */
 171         dcmn_err2(("?devfs_getattr %s\n", dv->dv_name));
 172         ASSERT(dv->dv_attr || dv->dv_attrvp);
 173 
 174         if (!(vp->v_type == VDIR || vp->v_type == VCHR || vp->v_type == VBLK)) {
 175                 cmn_err(CE_WARN,        /* panic ? */
 176                     "?%s: getattr on vnode type %d", dvnm, vp->v_type);
 177                 return (ENOENT);
 178         }
 179 
 180         rw_enter(&dv->dv_contents, RW_READER);
 181         if (dv->dv_attr) {
 182                 /*
 183                  * obtain from the memory version of attribute.
 184                  * preserve mask for those that optimize.
 185                  * devfs specific fields are already merged on creation.
 186                  */
 187                 mask = vap->va_mask;
 188                 *vap = *dv->dv_attr;
 189                 vap->va_mask = mask;
 190         } else {
 191                 /* obtain from attribute store and merge */
 192                 error = VOP_GETATTR(dv->dv_attrvp, vap, flags, cr, ct);
 193                 dsysdebug(error, ("vop_getattr %s %d\n", dv->dv_name, error));
 194                 dv_vattr_merge(dv, vap);
 195         }
 196         rw_exit(&dv->dv_contents);
 197 
 198         /*
 199          * Restrict the permissions of the node fronting the console
 200          * to 0600 with root as the owner.  This prevents a non-root
 201          * user from gaining access to a serial terminal (like /dev/term/a)
 202          * which is in reality serving as the console device (/dev/console).
 203          */
 204         if (vp->v_rdev == rconsdev) {
 205                 mode_t  rconsmask = S_IXUSR|S_IRWXG|S_IRWXO;
 206                 vap->va_mode &= (~rconsmask);
 207                 vap->va_uid = 0;
 208         }
 209 
 210         return (error);
 211 }
 212 
 213 static int devfs_unlocked_access(void *, int, struct cred *);
 214 
 215 /*ARGSUSED4*/
 216 static int
 217 devfs_setattr_dir(
 218         struct dv_node *dv,
 219         struct vnode *vp,
 220         struct vattr *vap,
 221         int flags,
 222         struct cred *cr)
 223 {
 224         struct vattr    *map;
 225         uint_t          mask;
 226         int             error = 0;
 227         struct vattr    vattr;
 228 
 229         ASSERT(dv->dv_attr || dv->dv_attrvp);
 230 
 231         ASSERT(vp->v_type == VDIR);
 232         ASSERT((dv->dv_flags & DV_NO_FSPERM) == 0);
 233 
 234         if (vap->va_mask & AT_NOSET)
 235                 return (EINVAL);
 236 
 237         /* to ensure consistency, single thread setting of attributes */
 238         rw_enter(&dv->dv_contents, RW_WRITER);
 239 
 240 again:  if (dv->dv_attr) {
 241 
 242                 error = secpolicy_vnode_setattr(cr, vp, vap,
 243                     dv->dv_attr, flags, devfs_unlocked_access, dv);
 244 
 245                 if (error)
 246                         goto out;
 247 
 248                 /*
 249                  * Apply changes to the memory based attribute. This code
 250                  * is modeled after the tmpfs implementation of memory
 251                  * based vnodes
 252                  */
 253                 map = dv->dv_attr;
 254                 mask = vap->va_mask;
 255 
 256                 /* Change file access modes. */
 257                 if (mask & AT_MODE) {
 258                         map->va_mode &= S_IFMT;
 259                         map->va_mode |= vap->va_mode & ~S_IFMT;
 260                 }
 261                 if (mask & AT_UID)
 262                         map->va_uid = vap->va_uid;
 263                 if (mask & AT_GID)
 264                         map->va_gid = vap->va_gid;
 265                 if (mask & AT_ATIME)
 266                         map->va_atime = vap->va_atime;
 267                 if (mask & AT_MTIME)
 268                         map->va_mtime = vap->va_mtime;
 269 
 270                 if (mask & (AT_MODE | AT_UID | AT_GID | AT_MTIME))
 271                         gethrestime(&map->va_ctime);
 272         } else {
 273                 /* use the backing attribute store */
 274                 ASSERT(dv->dv_attrvp);
 275 
 276                 /*
 277                  * See if we are changing something we care about
 278                  * the persistence of - return success if we don't care.
 279                  */
 280                 if (vap->va_mask & (AT_MODE|AT_UID|AT_GID|AT_ATIME|AT_MTIME)) {
 281                         /* Set the attributes */
 282                         error = VOP_SETATTR(dv->dv_attrvp,
 283                             vap, flags, cr, NULL);
 284                         dsysdebug(error,
 285                             ("vop_setattr %s %d\n", dv->dv_name, error));
 286 
 287                         /*
 288                          * Some file systems may return EROFS for a setattr
 289                          * on a readonly file system.  In this case we create
 290                          * our own memory based attribute.
 291                          */
 292                         if (error == EROFS) {
 293                                 /*
 294                                  * obtain attributes from existing file
 295                                  * that we will modify and switch to memory
 296                                  * based attribute until attribute store is
 297                                  * read/write.
 298                                  */
 299                                 vattr = dv_vattr_dir;
 300                                 if (VOP_GETATTR(dv->dv_attrvp,
 301                                     &vattr, flags, cr, NULL) == 0) {
 302                                         dv->dv_attr = kmem_alloc(
 303                                             sizeof (struct vattr), KM_SLEEP);
 304                                         *dv->dv_attr = vattr;
 305                                         dv_vattr_merge(dv, dv->dv_attr);
 306                                         goto again;
 307                                 }
 308                         }
 309                 }
 310         }
 311 out:
 312         rw_exit(&dv->dv_contents);
 313         return (error);
 314 }
 315 
 316 
 317 /*
 318  * Compare the uid/gid/mode changes requested for a setattr
 319  * operation with the same details of a node's default minor
 320  * perm information.  Return 0 if identical.
 321  */
 322 static int
 323 dv_setattr_cmp(struct vattr *map, mperm_t *mp)
 324 {
 325         if ((map->va_mode & S_IAMB) != (mp->mp_mode & S_IAMB))
 326                 return (1);
 327         if (map->va_uid != mp->mp_uid)
 328                 return (1);
 329         if (map->va_gid != mp->mp_gid)
 330                 return (1);
 331         return (0);
 332 }
 333 
 334 
 335 /*ARGSUSED4*/
 336 static int
 337 devfs_setattr(
 338         struct vnode *vp,
 339         struct vattr *vap,
 340         int flags,
 341         struct cred *cr,
 342         caller_context_t *ct)
 343 {
 344         struct dv_node  *dv = VTODV(vp);
 345         struct dv_node  *ddv;
 346         struct vnode    *dvp;
 347         struct vattr    *map;
 348         uint_t          mask;
 349         int             error = 0;
 350         struct vattr    *free_vattr = NULL;
 351         struct vattr    *vattrp = NULL;
 352         mperm_t         mp;
 353         int             persist;
 354 
 355         /*
 356          * Message goes to console only. Otherwise, the message
 357          * causes devfs_getattr to be invoked again... infinite loop
 358          */
 359         dcmn_err2(("?devfs_setattr %s\n", dv->dv_name));
 360         ASSERT(dv->dv_attr || dv->dv_attrvp);
 361 
 362         if (!(vp->v_type == VDIR || vp->v_type == VCHR || vp->v_type == VBLK)) {
 363                 cmn_err(CE_WARN,        /* panic ? */
 364                     "?%s: getattr on vnode type %d", dvnm, vp->v_type);
 365                 return (ENOENT);
 366         }
 367 
 368         if (vap->va_mask & AT_NOSET)
 369                 return (EINVAL);
 370 
 371         /*
 372          * If we are changing something we don't care about
 373          * the persistence of, return success.
 374          */
 375         if ((vap->va_mask &
 376             (AT_MODE|AT_UID|AT_GID|AT_ATIME|AT_MTIME)) == 0)
 377                 return (0);
 378 
 379         /*
 380          * If driver overrides fs perm, disallow chmod
 381          * and do not create attribute nodes.
 382          */
 383         if (dv->dv_flags & DV_NO_FSPERM) {
 384                 ASSERT(dv->dv_attr);
 385                 if (vap->va_mask & (AT_MODE | AT_UID | AT_GID))
 386                         return (EPERM);
 387                 if ((vap->va_mask & (AT_ATIME|AT_MTIME)) == 0)
 388                         return (0);
 389                 rw_enter(&dv->dv_contents, RW_WRITER);
 390                 if (vap->va_mask & AT_ATIME)
 391                         dv->dv_attr->va_atime = vap->va_atime;
 392                 if (vap->va_mask & AT_MTIME)
 393                         dv->dv_attr->va_mtime = vap->va_mtime;
 394                 rw_exit(&dv->dv_contents);
 395                 return (0);
 396         }
 397 
 398         /*
 399          * Directories are always created but device nodes are
 400          * only used to persist non-default permissions.
 401          */
 402         if (vp->v_type == VDIR) {
 403                 ASSERT(dv->dv_attr || dv->dv_attrvp);
 404                 return (devfs_setattr_dir(dv, vp, vap, flags, cr));
 405         }
 406 
 407         /*
 408          * Allocate now before we take any locks
 409          */
 410         vattrp = kmem_zalloc(sizeof (*vattrp), KM_SLEEP);
 411 
 412         /* to ensure consistency, single thread setting of attributes */
 413         rw_enter(&dv->dv_contents, RW_WRITER);
 414 
 415         /*
 416          * We don't need to create an attribute node
 417          * to persist access or modification times.
 418          */
 419         persist = (vap->va_mask & (AT_MODE | AT_UID | AT_GID));
 420 
 421         /*
 422          * If persisting something, get the default permissions
 423          * for this minor to compare against what the attributes
 424          * are now being set to.  Default ordering is:
 425          *      - minor_perm match for this minor
 426          *      - mode supplied by ddi_create_priv_minor_node
 427          *      - devfs defaults
 428          */
 429         if (persist) {
 430                 if (dev_minorperm(dv->dv_devi, dv->dv_name, &mp) != 0) {
 431                         mp.mp_uid = dv_vattr_file.va_uid;
 432                         mp.mp_gid = dv_vattr_file.va_gid;
 433                         mp.mp_mode = dv_vattr_file.va_mode;
 434                         if (dv->dv_flags & DV_DFLT_MODE) {
 435                                 ASSERT((dv->dv_dflt_mode & ~S_IAMB) == 0);
 436                                 mp.mp_mode &= ~S_IAMB;
 437                                 mp.mp_mode |= dv->dv_dflt_mode;
 438                                 dcmn_err5(("%s: setattr priv default 0%o\n",
 439                                     dv->dv_name, mp.mp_mode));
 440                         } else {
 441                                 dcmn_err5(("%s: setattr devfs default 0%o\n",
 442                                     dv->dv_name, mp.mp_mode));
 443                         }
 444                 } else {
 445                         dcmn_err5(("%s: setattr minor perm default 0%o\n",
 446                             dv->dv_name, mp.mp_mode));
 447                 }
 448         }
 449 
 450         /*
 451          * If we don't have a vattr for this node, construct one.
 452          */
 453         if (dv->dv_attr) {
 454                 free_vattr = vattrp;
 455                 vattrp = NULL;
 456         } else {
 457                 ASSERT(dv->dv_attrvp);
 458                 ASSERT(vp->v_type != VDIR);
 459                 *vattrp = dv_vattr_file;
 460                 error = VOP_GETATTR(dv->dv_attrvp, vattrp, 0, cr, ct);
 461                 dsysdebug(error, ("vop_getattr %s %d\n", dv->dv_name, error));
 462                 if (error)
 463                         goto out;
 464                 dv->dv_attr = vattrp;
 465                 dv_vattr_merge(dv, dv->dv_attr);
 466                 vattrp = NULL;
 467         }
 468 
 469         error = secpolicy_vnode_setattr(cr, vp, vap, dv->dv_attr,
 470             flags, devfs_unlocked_access, dv);
 471         if (error) {
 472                 dsysdebug(error, ("devfs_setattr %s secpolicy error %d\n",
 473                     dv->dv_name, error));
 474                 goto out;
 475         }
 476 
 477         /*
 478          * Apply changes to the memory based attribute. This code
 479          * is modeled after the tmpfs implementation of memory
 480          * based vnodes
 481          */
 482         map = dv->dv_attr;
 483         mask = vap->va_mask;
 484 
 485         /* Change file access modes. */
 486         if (mask & AT_MODE) {
 487                 map->va_mode &= S_IFMT;
 488                 map->va_mode |= vap->va_mode & ~S_IFMT;
 489         }
 490         if (mask & AT_UID)
 491                 map->va_uid = vap->va_uid;
 492         if (mask & AT_GID)
 493                 map->va_gid = vap->va_gid;
 494         if (mask & AT_ATIME)
 495                 map->va_atime = vap->va_atime;
 496         if (mask & AT_MTIME)
 497                 map->va_mtime = vap->va_mtime;
 498 
 499         if (mask & (AT_MODE | AT_UID | AT_GID | AT_MTIME)) {
 500                 gethrestime(&map->va_ctime);
 501         }
 502 
 503         /*
 504          * A setattr to defaults means we no longer need the
 505          * shadow node as a persistent store, unless there
 506          * are ACLs.  Otherwise create a shadow node if one
 507          * doesn't exist yet.
 508          */
 509         if (persist) {
 510                 if ((dv_setattr_cmp(map, &mp) == 0) &&
 511                     ((dv->dv_flags & DV_ACL) == 0)) {
 512 
 513                         if (dv->dv_attrvp) {
 514                                 ddv = dv->dv_dotdot;
 515                                 ASSERT(ddv->dv_attrvp);
 516                                 error = VOP_REMOVE(ddv->dv_attrvp,
 517                                     dv->dv_name, cr, ct, 0);
 518                                 dsysdebug(error,
 519                                     ("vop_remove %s %s %d\n",
 520                                     ddv->dv_name, dv->dv_name, error));
 521 
 522                                 if (error == EROFS)
 523                                         error = 0;
 524                                 VN_RELE(dv->dv_attrvp);
 525                                 dv->dv_attrvp = NULL;
 526                         }
 527                         ASSERT(dv->dv_attr);
 528                 } else {
 529                         if (mask & AT_MODE)
 530                                 dcmn_err5(("%s persisting mode 0%o\n",
 531                                     dv->dv_name, vap->va_mode));
 532                         if (mask & AT_UID)
 533                                 dcmn_err5(("%s persisting uid %d\n",
 534                                     dv->dv_name, vap->va_uid));
 535                         if (mask & AT_GID)
 536                                 dcmn_err5(("%s persisting gid %d\n",
 537                                     dv->dv_name, vap->va_gid));
 538 
 539                         if (dv->dv_attrvp == NULL) {
 540                                 dvp = DVTOV(dv->dv_dotdot);
 541                                 dv_shadow_node(dvp, dv->dv_name, vp,
 542                                     NULL, NULLVP, cr,
 543                                     DV_SHADOW_CREATE | DV_SHADOW_WRITE_HELD);
 544                         }
 545                         if (dv->dv_attrvp) {
 546                                 /* If map still valid do TIME for free. */
 547                                 if (dv->dv_attr == map) {
 548                                         mask = map->va_mask;
 549                                         map->va_mask =
 550                                             vap->va_mask | AT_ATIME | AT_MTIME;
 551                                         error = VOP_SETATTR(dv->dv_attrvp, map,
 552                                             flags, cr, NULL);
 553                                         map->va_mask = mask;
 554                                 } else {
 555                                         error = VOP_SETATTR(dv->dv_attrvp,
 556                                             vap, flags, cr, NULL);
 557                                 }
 558                                 dsysdebug(error, ("vop_setattr %s %d\n",
 559                                     dv->dv_name, error));
 560                         }
 561                         /*
 562                          * Some file systems may return EROFS for a setattr
 563                          * on a readonly file system.  In this case save
 564                          * as our own memory based attribute.
 565                          * NOTE: ufs is NOT one of these (see ufs_iupdat).
 566                          */
 567                         if (dv->dv_attr && dv->dv_attrvp && error == 0) {
 568                                 vattrp = dv->dv_attr;
 569                                 dv->dv_attr = NULL;
 570                         } else if (error == EROFS)
 571                                 error = 0;
 572                 }
 573         }
 574 
 575 out:
 576         rw_exit(&dv->dv_contents);
 577 
 578         if (vattrp)
 579                 kmem_free(vattrp, sizeof (*vattrp));
 580         if (free_vattr)
 581                 kmem_free(free_vattr, sizeof (*free_vattr));
 582         return (error);
 583 }
 584 
 585 static int
 586 devfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
 587     caller_context_t *ct)
 588 {
 589         switch (cmd) {
 590         case _PC_ACL_ENABLED:
 591                 /*
 592                  * We rely on the underlying filesystem for ACLs,
 593                  * so direct the query for ACL support there.
 594                  * ACL support isn't relative to the file
 595                  * and we can't guarantee that the dv node
 596                  * has an attribute node, so any valid
 597                  * attribute node will suffice.
 598                  */
 599                 ASSERT(dvroot);
 600                 ASSERT(dvroot->dv_attrvp);
 601                 return (VOP_PATHCONF(dvroot->dv_attrvp, cmd, valp, cr, ct));
 602                 /*NOTREACHED*/
 603         }
 604 
 605         return (fs_pathconf(vp, cmd, valp, cr, ct));
 606 }
 607 
 608 /*
 609  * Let avp handle security attributes (acl's).
 610  */
 611 static int
 612 devfs_getsecattr(struct vnode *vp, struct vsecattr *vsap, int flags,
 613     struct cred *cr, caller_context_t *ct)
 614 {
 615         dvnode_t *dv = VTODV(vp);
 616         struct vnode *avp;
 617         int     error;
 618 
 619         dcmn_err2(("devfs_getsecattr %s\n", dv->dv_name));
 620         ASSERT(vp->v_type == VDIR || vp->v_type == VCHR || vp->v_type == VBLK);
 621 
 622         rw_enter(&dv->dv_contents, RW_READER);
 623 
 624         avp = dv->dv_attrvp;
 625 
 626         /* fabricate the acl */
 627         if (avp == NULL) {
 628                 error = fs_fab_acl(vp, vsap, flags, cr, ct);
 629                 rw_exit(&dv->dv_contents);
 630                 return (error);
 631         }
 632 
 633         error = VOP_GETSECATTR(avp, vsap, flags, cr, ct);
 634         dsysdebug(error, ("vop_getsecattr %s %d\n", VTODV(vp)->dv_name, error));
 635         rw_exit(&dv->dv_contents);
 636         return (error);
 637 }
 638 
 639 /*
 640  * Set security attributes (acl's)
 641  *
 642  * Note that the dv_contents lock has already been acquired
 643  * by the caller's VOP_RWLOCK.
 644  */
 645 static int
 646 devfs_setsecattr(struct vnode *vp, struct vsecattr *vsap, int flags,
 647     struct cred *cr, caller_context_t *ct)
 648 {
 649         dvnode_t *dv = VTODV(vp);
 650         struct vnode *avp;
 651         int     error;
 652 
 653         dcmn_err2(("devfs_setsecattr %s\n", dv->dv_name));
 654         ASSERT(vp->v_type == VDIR || vp->v_type == VCHR || vp->v_type == VBLK);
 655         ASSERT(RW_LOCK_HELD(&dv->dv_contents));
 656 
 657         /*
 658          * Not a supported operation on drivers not providing
 659          * file system based permissions.
 660          */
 661         if (dv->dv_flags & DV_NO_FSPERM)
 662                 return (ENOTSUP);
 663 
 664         /*
 665          * To complete, the setsecattr requires an underlying attribute node.
 666          */
 667         if (dv->dv_attrvp == NULL) {
 668                 ASSERT(vp->v_type == VCHR || vp->v_type == VBLK);
 669                 dv_shadow_node(DVTOV(dv->dv_dotdot), dv->dv_name, vp,
 670                     NULL, NULLVP, cr, DV_SHADOW_CREATE | DV_SHADOW_WRITE_HELD);
 671         }
 672 
 673         if ((avp = dv->dv_attrvp) == NULL) {
 674                 dcmn_err2(("devfs_setsecattr %s: "
 675                     "cannot construct attribute node\n", dv->dv_name));
 676                 return (fs_nosys());
 677         }
 678 
 679         /*
 680          * The acl(2) system call issues a VOP_RWLOCK before setting an ACL.
 681          * Since backing file systems expect the lock to be held before seeing
 682          * a VOP_SETSECATTR ACL, we need to issue the VOP_RWLOCK to the backing
 683          * store before forwarding the ACL.
 684          */
 685         (void) VOP_RWLOCK(avp, V_WRITELOCK_TRUE, NULL);
 686         error = VOP_SETSECATTR(avp, vsap, flags, cr, ct);
 687         dsysdebug(error, ("vop_setsecattr %s %d\n", VTODV(vp)->dv_name, error));
 688         VOP_RWUNLOCK(avp, V_WRITELOCK_TRUE, NULL);
 689 
 690         /*
 691          * Set DV_ACL if we have a non-trivial set of ACLs.  It is not
 692          * necessary to hold VOP_RWLOCK since fs_acl_nontrivial only does
 693          * VOP_GETSECATTR calls.
 694          */
 695         if (fs_acl_nontrivial(avp, cr))
 696                 dv->dv_flags |= DV_ACL;
 697         return (error);
 698 }
 699 
 700 /*
 701  * This function is used for secpolicy_setattr().  It must call an
 702  * access() like function while it is already holding the
 703  * dv_contents lock.  We only care about this when dv_attr != NULL;
 704  * so the unlocked access call only concerns itself with that
 705  * particular branch of devfs_access().
 706  */
 707 static int
 708 devfs_unlocked_access(void *vdv, int mode, struct cred *cr)
 709 {
 710         struct dv_node *dv = vdv;
 711         int shift = 0;
 712         uid_t owner = dv->dv_attr->va_uid;
 713 
 714         /* Check access based on owner, group and public permissions. */
 715         if (crgetuid(cr) != owner) {
 716                 shift += 3;
 717                 if (groupmember(dv->dv_attr->va_gid, cr) == 0)
 718                         shift += 3;
 719         }
 720 
 721         return (secpolicy_vnode_access2(cr, DVTOV(dv), owner,
 722             dv->dv_attr->va_mode << shift, mode));
 723 }
 724 
 725 static int
 726 devfs_access(struct vnode *vp, int mode, int flags, struct cred *cr,
 727     caller_context_t *ct)
 728 {
 729         struct dv_node  *dv = VTODV(vp);
 730         int             res;
 731 
 732         dcmn_err2(("devfs_access %s\n", dv->dv_name));
 733         ASSERT(dv->dv_attr || dv->dv_attrvp);
 734 
 735         /* restrict console access to privileged processes */
 736         if ((vp->v_rdev == rconsdev) && secpolicy_console(cr) != 0) {
 737                 return (EACCES);
 738         }
 739 
 740         rw_enter(&dv->dv_contents, RW_READER);
 741         if (dv->dv_attr && ((dv->dv_flags & DV_ACL) == 0)) {
 742                 res = devfs_unlocked_access(dv, mode, cr);
 743         } else {
 744                 res = VOP_ACCESS(dv->dv_attrvp, mode, flags, cr, ct);
 745         }
 746         rw_exit(&dv->dv_contents);
 747         return (res);
 748 }
 749 
 750 /*
 751  * Lookup
 752  *
 753  * Given the directory vnode and the name of the component, return
 754  * the corresponding held vnode for that component.
 755  *
 756  * Of course in these fictional filesystems, nothing's ever quite
 757  * -that- simple.
 758  *
 759  * devfs name   type            shadow (fs attributes)  type    comments
 760  * -------------------------------------------------------------------------
 761  * drv[@addr]   VDIR            drv[@addr]              VDIR    nexus driver
 762  * drv[@addr]:m VCHR/VBLK       drv[@addr]:m            VREG    leaf driver
 763  * drv[@addr]   VCHR/VBLK       drv[@addr]:.default     VREG    leaf driver
 764  * -------------------------------------------------------------------------
 765  *
 766  * The following names are reserved for the attribute filesystem (which
 767  * could easily be another layer on top of this one - we simply need to
 768  * hold the vnode of the thing we're looking at)
 769  *
 770  * attr name    type            shadow (fs attributes)  type    comments
 771  * -------------------------------------------------------------------------
 772  * drv[@addr]   VDIR            -                       -       attribute dir
 773  * minorname    VDIR            -                       -       minorname
 774  * attribute    VREG            -                       -       attribute
 775  * -------------------------------------------------------------------------
 776  *
 777  * Examples:
 778  *
 779  *      devfs:/devices/.../mm@0:zero            VCHR
 780  *      shadow:/.devices/.../mm@0:zero          VREG, fs attrs
 781  *      devfs:/devices/.../mm@0:/zero/attr      VREG, driver attribute
 782  *
 783  *      devfs:/devices/.../sd@0,0:a             VBLK
 784  *      shadow:/.devices/.../sd@0,0:a           VREG, fs attrs
 785  *      devfs:/devices/.../sd@0,0:/a/.type      VREG, "ddi_block:chan"
 786  *
 787  *      devfs:/devices/.../mm@0                 VCHR
 788  *      shadow:/.devices/.../mm@0:.default      VREG, fs attrs
 789  *      devfs:/devices/.../mm@0:/.default/attr  VREG, driver attribute
 790  *      devfs:/devices/.../mm@0:/.default/.type VREG, "ddi_pseudo"
 791  *
 792  *      devfs:/devices/.../obio                 VDIR
 793  *      shadow:/devices/.../obio                VDIR, needed for fs attrs.
 794  *      devfs:/devices/.../obio:/.default/attr  VDIR, driver attribute
 795  *
 796  * We also need to be able deal with "old" devices that have gone away,
 797  * though I think that provided we return them with readdir, they can
 798  * be removed (i.e. they don't have to respond to lookup, though it might
 799  * be weird if they didn't ;-)
 800  *
 801  * Lookup has side-effects.
 802  *
 803  * - It will create directories and fs attribute files in the shadow hierarchy.
 804  * - It should cause non-SID devices to be probed (ask the parent nexi).
 805  */
 806 /*ARGSUSED3*/
 807 static int
 808 devfs_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
 809     struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred,
 810     caller_context_t *ct, int *direntflags, pathname_t *realpnp)
 811 {
 812         ASSERT(dvp->v_type == VDIR);
 813         dcmn_err2(("devfs_lookup: %s\n", nm));
 814         return (dv_find(VTODV(dvp), nm, vpp, pnp, rdir, cred, 0));
 815 }
 816 
 817 /*
 818  * devfs nodes can't really be created directly by userland - however,
 819  * we do allow creates to find existing nodes:
 820  *
 821  * - any create fails if the node doesn't exist - EROFS.
 822  * - creating an existing directory read-only succeeds, otherwise EISDIR.
 823  * - exclusive creates fail if the node already exists - EEXIST.
 824  * - failure to create the snode for an existing device - ENOSYS.
 825  */
 826 /*ARGSUSED2*/
 827 static int
 828 devfs_create(struct vnode *dvp, char *nm, struct vattr *vap, vcexcl_t excl,
 829     int mode, struct vnode **vpp, struct cred *cred, int flag,
 830     caller_context_t *ct, vsecattr_t *vsecp)
 831 {
 832         int error;
 833         struct vnode *vp;
 834 
 835         dcmn_err2(("devfs_create %s\n", nm));
 836         error = dv_find(VTODV(dvp), nm, &vp, NULL, NULLVP, cred, 0);
 837         if (error == 0) {
 838                 if (excl == EXCL)
 839                         error = EEXIST;
 840                 else if (vp->v_type == VDIR && (mode & VWRITE))
 841                         error = EISDIR;
 842                 else
 843                         error = VOP_ACCESS(vp, mode, 0, cred, ct);
 844 
 845                 if (error) {
 846                         VN_RELE(vp);
 847                 } else
 848                         *vpp = vp;
 849         } else if (error == ENOENT)
 850                 error = EROFS;
 851 
 852         return (error);
 853 }
 854 
 855 /*
 856  * If DV_BUILD is set, we call into nexus driver to do a BUS_CONFIG_ALL.
 857  * Otherwise, simply return cached dv_node's. Hotplug code always call
 858  * devfs_clean() to invalid the dv_node cache.
 859  */
 860 /*ARGSUSED5*/
 861 static int
 862 devfs_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred, int *eofp,
 863     caller_context_t *ct, int flags)
 864 {
 865         struct dv_node *ddv, *dv;
 866         struct dirent64 *de, *bufp;
 867         offset_t diroff;
 868         offset_t        soff;
 869         size_t reclen, movesz;
 870         int error;
 871         struct vattr va;
 872         size_t bufsz;
 873 
 874         ddv = VTODV(dvp);
 875         dcmn_err2(("devfs_readdir %s: offset %lld len %ld\n",
 876             ddv->dv_name, uiop->uio_loffset, uiop->uio_iov->iov_len));
 877         ASSERT(ddv->dv_attr || ddv->dv_attrvp);
 878         ASSERT(RW_READ_HELD(&ddv->dv_contents));
 879 
 880         if (uiop->uio_loffset >= MAXOFF_T) {
 881                 if (eofp)
 882                         *eofp = 1;
 883                 return (0);
 884         }
 885 
 886         if (uiop->uio_iovcnt != 1)
 887                 return (EINVAL);
 888 
 889         if (dvp->v_type != VDIR)
 890                 return (ENOTDIR);
 891 
 892         /* Load the initial contents */
 893         if (ddv->dv_flags & DV_BUILD) {
 894                 if (!rw_tryupgrade(&ddv->dv_contents)) {
 895                         rw_exit(&ddv->dv_contents);
 896                         rw_enter(&ddv->dv_contents, RW_WRITER);
 897                 }
 898 
 899                 /* recheck and fill */
 900                 if (ddv->dv_flags & DV_BUILD)
 901                         dv_filldir(ddv);
 902 
 903                 rw_downgrade(&ddv->dv_contents);
 904         }
 905 
 906         soff = uiop->uio_loffset;
 907         bufsz = uiop->uio_iov->iov_len;
 908         de = bufp = kmem_alloc(bufsz, KM_SLEEP);
 909         movesz = 0;
 910         dv = (struct dv_node *)-1;
 911 
 912         /*
 913          * Move as many entries into the uio structure as it will take.
 914          * Special case "." and "..".
 915          */
 916         diroff = 0;
 917         if (soff == 0) {                                /* . */
 918                 reclen = DIRENT64_RECLEN(strlen("."));
 919                 if ((movesz + reclen) > bufsz)
 920                         goto full;
 921                 de->d_ino = (ino64_t)ddv->dv_ino;
 922                 de->d_off = (off64_t)diroff + 1;
 923                 de->d_reclen = (ushort_t)reclen;
 924 
 925                 /* use strncpy(9f) to zero out uninitialized bytes */
 926 
 927                 (void) strncpy(de->d_name, ".", DIRENT64_NAMELEN(reclen));
 928                 movesz += reclen;
 929                 de = (dirent64_t *)(intptr_t)((char *)de + reclen);
 930                 dcmn_err3(("devfs_readdir: A: diroff %lld, soff %lld: '%s' "
 931                     "reclen %lu\n", diroff, soff, ".", reclen));
 932         }
 933 
 934         diroff++;
 935         if (soff <= 1) {                             /* .. */
 936                 reclen = DIRENT64_RECLEN(strlen(".."));
 937                 if ((movesz + reclen) > bufsz)
 938                         goto full;
 939                 de->d_ino = (ino64_t)ddv->dv_dotdot->dv_ino;
 940                 de->d_off = (off64_t)diroff + 1;
 941                 de->d_reclen = (ushort_t)reclen;
 942 
 943                 /* use strncpy(9f) to zero out uninitialized bytes */
 944 
 945                 (void) strncpy(de->d_name, "..", DIRENT64_NAMELEN(reclen));
 946                 movesz += reclen;
 947                 de = (dirent64_t *)(intptr_t)((char *)de + reclen);
 948                 dcmn_err3(("devfs_readdir: B: diroff %lld, soff %lld: '%s' "
 949                     "reclen %lu\n", diroff, soff, "..", reclen));
 950         }
 951 
 952         diroff++;
 953         for (dv = DV_FIRST_ENTRY(ddv); dv;
 954             dv = DV_NEXT_ENTRY(ddv, dv), diroff++) {
 955                 /* skip entries until at correct directory offset */
 956                 if (diroff < soff)
 957                         continue;
 958 
 959                 /*
 960                  * hidden nodes are skipped (but they still occupy a
 961                  * directory offset).
 962                  */
 963                 if (dv->dv_devi && ndi_dev_is_hidden_node(dv->dv_devi))
 964                         continue;
 965 
 966                 /*
 967                  * DDM_INTERNAL_PATH minor nodes are skipped for readdirs
 968                  * outside the kernel (but they still occupy a directory
 969                  * offset).
 970                  */
 971                 if ((dv->dv_flags & DV_INTERNAL) && (cred != kcred))
 972                         continue;
 973 
 974                 reclen = DIRENT64_RECLEN(strlen(dv->dv_name));
 975                 if ((movesz + reclen) > bufsz) {
 976                         dcmn_err3(("devfs_readdir: C: diroff "
 977                             "%lld, soff %lld: '%s' reclen %lu\n",
 978                             diroff, soff, dv->dv_name, reclen));
 979                         goto full;
 980                 }
 981                 de->d_ino = (ino64_t)dv->dv_ino;
 982                 de->d_off = (off64_t)diroff + 1;
 983                 de->d_reclen = (ushort_t)reclen;
 984 
 985                 /* use strncpy(9f) to zero out uninitialized bytes */
 986 
 987                 ASSERT(strlen(dv->dv_name) + 1 <=
 988                     DIRENT64_NAMELEN(reclen));
 989                 (void) strncpy(de->d_name, dv->dv_name,
 990                     DIRENT64_NAMELEN(reclen));
 991 
 992                 movesz += reclen;
 993                 de = (dirent64_t *)(intptr_t)((char *)de + reclen);
 994                 dcmn_err4(("devfs_readdir: D: diroff "
 995                     "%lld, soff %lld: '%s' reclen %lu\n", diroff, soff,
 996                     dv->dv_name, reclen));
 997         }
 998 
 999         /* the buffer is full, or we exhausted everything */
1000 full:   dcmn_err3(("devfs_readdir: moving %lu bytes: "
1001             "diroff %lld, soff %lld, dv %p\n",
1002             movesz, diroff, soff, (void *)dv));
1003 
1004         if ((movesz == 0) && dv)
1005                 error = EINVAL;         /* cannot be represented */
1006         else {
1007                 error = uiomove(bufp, movesz, UIO_READ, uiop);
1008                 if (error == 0) {
1009                         if (eofp)
1010                                 *eofp = dv ? 0 : 1;
1011                         uiop->uio_loffset = diroff;
1012                 }
1013 
1014                 va.va_mask = AT_ATIME;
1015                 gethrestime(&va.va_atime);
1016                 rw_exit(&ddv->dv_contents);
1017                 (void) devfs_setattr(dvp, &va, 0, cred, ct);
1018                 rw_enter(&ddv->dv_contents, RW_READER);
1019         }
1020 
1021         kmem_free(bufp, bufsz);
1022         return (error);
1023 }
1024 
1025 /*ARGSUSED*/
1026 static int
1027 devfs_fsync(struct vnode *vp, int syncflag, struct cred *cred,
1028     caller_context_t *ct)
1029 {
1030         /*
1031          * Message goes to console only. Otherwise, the message
1032          * causes devfs_fsync to be invoked again... infinite loop
1033          */
1034         dcmn_err2(("devfs_fsync %s\n", VTODV(vp)->dv_name));
1035         return (0);
1036 }
1037 
1038 /*
1039  * Normally, we leave the dv_node here at count of 0.
1040  * The node will be destroyed when dv_cleandir() is called.
1041  *
1042  * Stale dv_node's are already unlinked from the fs tree,
1043  * so dv_cleandir() won't find them. We destroy such nodes
1044  * immediately.
1045  */
1046 /*ARGSUSED1*/
1047 static void
1048 devfs_inactive(struct vnode *vp, struct cred *cred, caller_context_t *ct)
1049 {
1050         int destroy;
1051         struct dv_node *dv = VTODV(vp);
1052 
1053         dcmn_err2(("devfs_inactive: %s\n", dv->dv_name));
1054         mutex_enter(&vp->v_lock);
1055         ASSERT(vp->v_count >= 1);
1056         --vp->v_count;
1057         destroy = (DV_STALE(dv) && vp->v_count == 0);
1058         mutex_exit(&vp->v_lock);
1059 
1060         /* stale nodes cannot be rediscovered, destroy it here */
1061         if (destroy)
1062                 dv_destroy(dv, 0);
1063 }
1064 
1065 /*
1066  * XXX Why do we need this?  NFS mounted /dev directories?
1067  * XXX Talk to peter staubach about this.
1068  */
1069 /*ARGSUSED2*/
1070 static int
1071 devfs_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)
1072 {
1073         struct dv_node  *dv = VTODV(vp);
1074         struct dv_fid   *dv_fid;
1075 
1076         if (fidp->fid_len < (sizeof (struct dv_fid) - sizeof (ushort_t))) {
1077                 fidp->fid_len = sizeof (struct dv_fid) - sizeof (ushort_t);
1078                 return (ENOSPC);
1079         }
1080 
1081         dv_fid = (struct dv_fid *)fidp;
1082         bzero(dv_fid, sizeof (struct dv_fid));
1083         dv_fid->dvfid_len = (int)sizeof (struct dv_fid) - sizeof (ushort_t);
1084         dv_fid->dvfid_ino = dv->dv_ino;
1085         /* dv_fid->dvfid_gen = dv->tn_gen; XXX ? */
1086 
1087         return (0);
1088 }
1089 
1090 /*
1091  * This pair of routines bracket all VOP_READ, VOP_WRITE
1092  * and VOP_READDIR requests.  The contents lock stops things
1093  * moving around while we're looking at them.
1094  *
1095  * Also used by file and record locking.
1096  */
1097 /*ARGSUSED2*/
1098 static int
1099 devfs_rwlock(struct vnode *vp, int write_flag, caller_context_t *ct)
1100 {
1101         dcmn_err2(("devfs_rwlock %s\n", VTODV(vp)->dv_name));
1102         rw_enter(&VTODV(vp)->dv_contents, write_flag ? RW_WRITER : RW_READER);
1103         return (write_flag);
1104 }
1105 
1106 /*ARGSUSED1*/
1107 static void
1108 devfs_rwunlock(struct vnode *vp, int write_flag, caller_context_t *ct)
1109 {
1110         dcmn_err2(("devfs_rwunlock %s\n", VTODV(vp)->dv_name));
1111         rw_exit(&VTODV(vp)->dv_contents);
1112 }
1113 
1114 /*
1115  * XXX  Should probably do a better job of computing the maximum
1116  *      offset available in the directory.
1117  */
1118 /*ARGSUSED1*/
1119 static int
1120 devfs_seek(struct vnode *vp, offset_t ooff, offset_t *noffp,
1121     caller_context_t *ct)
1122 {
1123         ASSERT(vp->v_type == VDIR);
1124         dcmn_err2(("devfs_seek %s\n", VTODV(vp)->dv_name));
1125         return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
1126 }
1127 
1128 vnodeops_t *dv_vnodeops;
1129 
1130 const fs_operation_def_t dv_vnodeops_template[] = {
1131         { VOPNAME_OPEN,         { .vop_open = devfs_open } },
1132         { VOPNAME_CLOSE,        { .vop_close = devfs_close } },
1133         { VOPNAME_READ,         { .vop_read = devfs_read } },
1134         { VOPNAME_WRITE,        { .vop_write = devfs_write } },
1135         { VOPNAME_IOCTL,        { .vop_ioctl = devfs_ioctl } },
1136         { VOPNAME_GETATTR,      { .vop_getattr = devfs_getattr } },
1137         { VOPNAME_SETATTR,      { .vop_setattr = devfs_setattr } },
1138         { VOPNAME_ACCESS,       { .vop_access = devfs_access } },
1139         { VOPNAME_LOOKUP,       { .vop_lookup = devfs_lookup } },
1140         { VOPNAME_CREATE,       { .vop_create = devfs_create } },
1141         { VOPNAME_READDIR,      { .vop_readdir = devfs_readdir } },
1142         { VOPNAME_FSYNC,        { .vop_fsync = devfs_fsync } },
1143         { VOPNAME_INACTIVE,     { .vop_inactive = devfs_inactive } },
1144         { VOPNAME_FID,          { .vop_fid = devfs_fid } },
1145         { VOPNAME_RWLOCK,       { .vop_rwlock = devfs_rwlock } },
1146         { VOPNAME_RWUNLOCK,     { .vop_rwunlock = devfs_rwunlock } },
1147         { VOPNAME_SEEK,         { .vop_seek = devfs_seek } },
1148         { VOPNAME_PATHCONF,     { .vop_pathconf = devfs_pathconf } },
1149         { VOPNAME_DISPOSE,      { .error = fs_error } },
1150         { VOPNAME_SETSECATTR,   { .vop_setsecattr = devfs_setsecattr } },
1151         { VOPNAME_GETSECATTR,   { .vop_getsecattr = devfs_getsecattr } },
1152         { NULL,                 { NULL } }
1153 };