1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 
  25 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
  26 /*        All rights reserved.          */
  27 
  28 
  29 #include <sys/types.h>
  30 #include <sys/param.h>
  31 #include <sys/cmn_err.h>
  32 #include <sys/debug.h>
  33 #include <sys/dirent.h>
  34 #include <sys/errno.h>
  35 #include <sys/file.h>
  36 #include <sys/inline.h>
  37 #include <sys/kmem.h>
  38 #include <sys/pathname.h>
  39 #include <sys/resource.h>
  40 #include <sys/statvfs.h>
  41 #include <sys/mount.h>
  42 #include <sys/sysmacros.h>
  43 #include <sys/systm.h>
  44 #include <sys/uio.h>
  45 #include <sys/vfs.h>
  46 #include <sys/vfs_opreg.h>
  47 #include <sys/vnode.h>
  48 #include <sys/cred.h>
  49 #include <sys/mntent.h>
  50 #include <sys/mount.h>
  51 #include <sys/user.h>
  52 #include <sys/t_lock.h>
  53 #include <sys/modctl.h>
  54 #include <sys/policy.h>
  55 #include <fs/fs_subr.h>
  56 #include <sys/atomic.h>
  57 #include <sys/mkdev.h>
  58 
/*
 * Round r up to the next multiple of sizeof (int).
 * NOTE(review): no use of round() is visible in this file -- confirm
 * before removing or relying on it.
 */
#define round(r)        (((r)+sizeof (int)-1)&(~(sizeof (int)-1)))
/* Map a file descriptor number to the inode number reported for its entry. */
#define fdtoi(n)        ((n)+100)

/* Size of the name field in the fixed-size record produced by fdread(). */
#define FDDIRSIZE 14
/*
 * Fixed-size directory record produced by fdread().  fdreaddir() does not
 * return this structure, but uses its size (FDSDSIZE) as the offset stride
 * between consecutive directory entries.
 */
struct fddirect {
        short   d_ino;
        char    d_name[FDDIRSIZE];
};

/* Inode number reported for the directory itself ("." and ".."). */
#define FDROOTINO       2
/* Distance, in directory-offset units, between consecutive entries. */
#define FDSDSIZE        sizeof (struct fddirect)
/* Name length used to size the dirent64 buffer and reported as f_namemax. */
#define FDNSIZE         10

/* File system type index handed to fdinit(). */
static int              fdfstype = 0;
/* Major number used to build vfs_dev in fdmount(); set in fdinit(). */
static major_t          fdfsmaj;
/* Minor number most recently tried by fdmount(). */
static minor_t          fdfsmin;
/* Major number used for the v_rdev of vnodes created by fdget(). */
static major_t          fdrmaj;
/* Held by fdmount() while it advances fdfsmin and picks vfs_dev. */
static kmutex_t         fd_minor_lock;

/* Defined after the vnode ops template; used by fdlookup() and fdcreate(). */
static int fdget(vnode_t *, char *, vnode_t **);
  79 
  80 /* ARGSUSED */
  81 static int
  82 fdopen(vnode_t **vpp, int mode, cred_t *cr, caller_context_t *ct)
  83 {
  84         if ((*vpp)->v_type != VDIR) {
  85                 mutex_enter(&(*vpp)->v_lock);
  86                 (*vpp)->v_flag |= VDUP;
  87                 mutex_exit(&(*vpp)->v_lock);
  88         }
  89         return (0);
  90 }
  91 
  92 /* ARGSUSED */
  93 static int
  94 fdclose(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
  95         caller_context_t *ct)
  96 {
  97         return (0);
  98 }
  99 
/*
 * Read entry point.  Only the directory vnode can be read; any other vnode
 * type yields ENOSYS.  The directory is presented as an array of fixed-size
 * struct fddirect records: ".", "..", then one record per descriptor slot
 * 0 .. nentries-1, where nentries is the smaller of the calling process's
 * fi_nfiles and its enforced RLIMIT_NOFILE value.
 *
 * Reads may start and end in the middle of a record.  Returns 0 (possibly
 * having transferred nothing) or the error reported by uiomove().
 */
/* ARGSUSED */
static int
fdread(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, caller_context_t *ct)
{
        static struct fddirect dotbuf[] = {
                { FDROOTINO, "."  },
                { FDROOTINO, ".." }
        };
        struct fddirect dirbuf;
        int i, n;
        int minfd, maxfd, modoff, error = 0;
        int nentries;
        rctl_qty_t fdno_ctl;
        int endoff;

        if (vp->v_type != VDIR)
                return (ENOSYS);

        /* Sample the descriptor limit and table size under p_lock. */
        mutex_enter(&curproc->p_lock);
        fdno_ctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_NOFILE],
            curproc->p_rctls, curproc);
        nentries = MIN(P_FINFO(curproc)->fi_nfiles, (int)fdno_ctl);
        mutex_exit(&curproc->p_lock);

        /* Byte offset just past the last record (the +2 is "." and ".."). */
        endoff = (nentries + 2) * FDSDSIZE;

        /*
         * Fake up ".", "..", and the /dev/fd directory entries.
         */
        /* An out-of-range offset or an empty request reads zero bytes. */
        if (uiop->uio_loffset < (offset_t)0 ||
            uiop->uio_loffset >= (offset_t)endoff ||
            uiop->uio_resid <= 0)
                return (0);
        ASSERT(uiop->uio_loffset <= MAXOFF_T);
        /* Copy whatever part of the two dot records the request covers. */
        if (uiop->uio_offset < 2*FDSDSIZE) {
                error = uiomove((caddr_t)dotbuf + uiop->uio_offset,
                    MIN(uiop->uio_resid, 2*FDSDSIZE - uiop->uio_offset),
                    UIO_READ, uiop);
                if (uiop->uio_resid <= 0 || error)
                        return (error);
        }
        /*
         * minfd is the first descriptor number to emit.  maxfd is derived
         * from the absolute end offset without subtracting the two dot
         * records, so it is only a loose upper bound; the loop normally
         * ends when uio_resid is exhausted or nentries is reached.  modoff
         * is the starting byte within the first record.
         */
        minfd = (uiop->uio_offset - 2*FDSDSIZE)/FDSDSIZE;
        maxfd = (uiop->uio_offset + uiop->uio_resid - 1)/FDSDSIZE;
        modoff = uiop->uio_offset % FDSDSIZE;

        /* Clear the name once; later names are written over earlier ones. */
        for (i = 0; i < FDDIRSIZE; i++)
                dirbuf.d_name[i] = '\0';
        for (i = minfd; i < MIN(maxfd, nentries); i++) {
                n = i;
                dirbuf.d_ino = fdtoi(n);
                numtos((ulong_t)n, dirbuf.d_name);
                error = uiomove((caddr_t)&dirbuf + modoff,
                    MIN(uiop->uio_resid, FDSDSIZE - modoff),
                    UIO_READ, uiop);
                if (uiop->uio_resid <= 0 || error)
                        return (error);
                /* Only the first record can start part-way through. */
                modoff = 0;
        }

        return (error);
}
 161 
 162 /* ARGSUSED */
 163 static int
 164 fdgetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
 165         caller_context_t *ct)
 166 {
 167         vfs_t *vfsp = vp->v_vfsp;
 168         timestruc_t now;
 169 
 170         if (vp->v_type == VDIR) {
 171                 vap->va_nlink = 2;
 172                 vap->va_size = (u_offset_t)
 173                     ((P_FINFO(curproc)->fi_nfiles + 2) * FDSDSIZE);
 174                 vap->va_mode = 0555;
 175                 vap->va_nodeid = (ino64_t)FDROOTINO;
 176         } else {
 177                 vap->va_nlink = 1;
 178                 vap->va_size = (u_offset_t)0;
 179                 vap->va_mode = 0666;
 180                 vap->va_nodeid = (ino64_t)fdtoi(getminor(vp->v_rdev));
 181         }
 182         vap->va_type = vp->v_type;
 183         vap->va_rdev = vp->v_rdev;
 184         vap->va_blksize = vfsp->vfs_bsize;
 185         vap->va_nblocks = (fsblkcnt64_t)0;
 186         gethrestime(&now);
 187         vap->va_atime = vap->va_mtime = vap->va_ctime = now;
 188         vap->va_uid = 0;
 189         vap->va_gid = 0;
 190         vap->va_fsid = vfsp->vfs_dev;
 191         vap->va_seq = 0;
 192         return (0);
 193 }
 194 
 195 /* ARGSUSED */
 196 static int
 197 fdaccess(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
 198 {
 199         return (0);
 200 }
 201 
 202 /* ARGSUSED */
 203 static int
 204 fdlookup(vnode_t *dp, char *comp, vnode_t **vpp, pathname_t *pnp,
 205         int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
 206         int *direntflags, pathname_t *realpnp)
 207 {
 208         if (comp[0] == 0 || strcmp(comp, ".") == 0 || strcmp(comp, "..") == 0) {
 209                 VN_HOLD(dp);
 210                 *vpp = dp;
 211                 return (0);
 212         }
 213         return (fdget(dp, comp, vpp));
 214 }
 215 
 216 /* ARGSUSED */
 217 static int
 218 fdcreate(vnode_t *dvp, char *comp, vattr_t *vap, enum vcexcl excl,
 219         int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct,
 220         vsecattr_t *vsecp)
 221 {
 222         return (fdget(dvp, comp, vpp));
 223 }
 224 
/*
 * Readdir entry point.  Emits struct dirent64 records for ".", ".." and
 * one entry per descriptor slot 0 .. nentries-1, where nentries is the
 * smaller of the calling process's fi_nfiles and its enforced
 * RLIMIT_NOFILE value.
 *
 * Directory offsets are logical: entry k lives at offset k * FDSDSIZE,
 * regardless of how many bytes its dirent64 record occupies.
 *
 * Returns ENOENT for a negative or misaligned offset or an empty request,
 * EINVAL if the buffer cannot hold even the first record, EFAULT if
 * uiomove() fails, and 0 otherwise.  If eofp is non-NULL it is set to
 * whether the final offset is at or past the last entry.
 */
/* ARGSUSED */
static int
fdreaddir(vnode_t *vp, uio_t *uiop, cred_t *cr, int *eofp, caller_context_t *ct,
        int flags)
{
        /* bp holds one dirent structure */
        u_offset_t bp[DIRENT64_RECLEN(FDNSIZE) / sizeof (u_offset_t)];
        struct dirent64 *dirent = (struct dirent64 *)bp;
        int reclen, nentries;
        rctl_qty_t fdno_ctl;
        int  n;
        int oresid;
        off_t off;

        if (uiop->uio_offset < 0 || uiop->uio_resid <= 0 ||
            (uiop->uio_offset % FDSDSIZE) != 0)
                return (ENOENT);

        ASSERT(uiop->uio_loffset <= MAXOFF_T);
        /* Remember the starting resid to detect "nothing returned yet". */
        oresid = uiop->uio_resid;
        bzero(bp, sizeof (bp));

        /* Sample the descriptor limit and table size under p_lock. */
        mutex_enter(&curproc->p_lock);
        fdno_ctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_NOFILE],
            curproc->p_rctls, curproc);
        nentries = MIN(P_FINFO(curproc)->fi_nfiles, (int)fdno_ctl);
        mutex_exit(&curproc->p_lock);

        while (uiop->uio_resid > 0) {
                if ((off = uiop->uio_offset) == 0) { /* "." */
                        dirent->d_ino = (ino64_t)FDROOTINO;
                        dirent->d_name[0] = '.';
                        dirent->d_name[1] = '\0';
                        reclen = DIRENT64_RECLEN(1);
                } else if (off == FDSDSIZE) {           /* ".." */
                        dirent->d_ino = (ino64_t)FDROOTINO;
                        dirent->d_name[0] = '.';
                        dirent->d_name[1] = '.';
                        dirent->d_name[2] = '\0';
                        reclen = DIRENT64_RECLEN(2);
                } else {
                        /*
                         * Return entries corresponding to the allowable
                         * number of file descriptors for this process.
                         */
                        if ((n = (off-2*FDSDSIZE)/FDSDSIZE) >= nentries)
                                break;
                        dirent->d_ino = (ino64_t)fdtoi(n);
                        numtos((ulong_t)n, dirent->d_name);
                        reclen = DIRENT64_RECLEN(strlen(dirent->d_name));
                }
                /* d_off is the logical offset of the entry after this one. */
                dirent->d_off = (offset_t)(uiop->uio_offset + FDSDSIZE);
                dirent->d_reclen = (ushort_t)reclen;

                if (reclen > uiop->uio_resid) {
                        /*
                         * Error if no entries have been returned yet.
                         */
                        if (uiop->uio_resid == oresid)
                                return (EINVAL);
                        break;
                }
                /*
                 * uiomove() updates both resid and offset by the same
                 * amount.  But we want offset to change in increments
                 * of FDSDSIZE, which is different from the number of bytes
                 * being returned to the user.  So we set uio_offset
                 * separately, ignoring what uiomove() does.
                 */
                if (uiomove((caddr_t)dirent, reclen, UIO_READ, uiop))
                        return (EFAULT);
                uiop->uio_offset = off + FDSDSIZE;
        }
        /* EOF once the logical offset has moved past the last descriptor. */
        if (eofp)
                *eofp = ((uiop->uio_offset-2*FDSDSIZE)/FDSDSIZE >= nentries);
        return (0);
}
 302 
 303 /* ARGSUSED */
 304 static void
 305 fdinactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
 306 {
 307         mutex_enter(&vp->v_lock);
 308         ASSERT(vp->v_count >= 1);
 309         if (--vp->v_count != 0) {
 310                 mutex_exit(&vp->v_lock);
 311                 return;
 312         }
 313         mutex_exit(&vp->v_lock);
 314         vn_invalid(vp);
 315         vn_free(vp);
 316 }
 317 
/* Vnode operations vector for fdfs; built by vn_make_ops() in fdinit(). */
static struct vnodeops *fd_vnodeops;

/*
 * Template from which fd_vnodeops is built.  Operations implemented in this
 * file are listed first; FRLOCK, POLL and DISPOSE are explicitly routed to
 * fs_error.  The { NULL, { NULL } } entry terminates the table.
 */
static const fs_operation_def_t fd_vnodeops_template[] = {
        { VOPNAME_OPEN,         { .vop_open = fdopen } },
        { VOPNAME_CLOSE,        { .vop_close = fdclose } },
        { VOPNAME_READ,         { .vop_read = fdread } },
        { VOPNAME_GETATTR,      { .vop_getattr = fdgetattr } },
        { VOPNAME_ACCESS,       { .vop_access = fdaccess } },
        { VOPNAME_LOOKUP,       { .vop_lookup = fdlookup } },
        { VOPNAME_CREATE,       { .vop_create = fdcreate } },
        { VOPNAME_READDIR,      { .vop_readdir = fdreaddir } },
        { VOPNAME_INACTIVE,     { .vop_inactive = fdinactive } },
        { VOPNAME_FRLOCK,       { .error = fs_error } },
        { VOPNAME_POLL,         { .error = fs_error } },
        { VOPNAME_DISPOSE,      { .error = fs_error } },
        { NULL,                 { NULL } }
};
 335 
 336 static int
 337 fdget(struct vnode *dvp, char *comp, struct vnode **vpp)
 338 {
 339         int n = 0;
 340         struct vnode *vp;
 341 
 342         while (*comp) {
 343                 if (*comp < '0' || *comp > '9')
 344                         return (ENOENT);
 345                 n = 10 * n + *comp++ - '0';
 346         }
 347         vp = vn_alloc(KM_SLEEP);
 348         vp->v_type = VCHR;
 349         vp->v_vfsp = dvp->v_vfsp;
 350         vn_setops(vp, fd_vnodeops);
 351         vp->v_data = NULL;
 352         vp->v_flag = VNOMAP;
 353         vp->v_rdev = makedevice(fdrmaj, n);
 354         vn_exists(vp);
 355         *vpp = vp;
 356         return (0);
 357 }
 358 
 359 /*
 360  * fdfs is mounted on /dev/fd, however, there are two interesting
 361  * possibilities - two threads racing to do the same mount (protected
 362  * by vfs locking), and two threads mounting fdfs in different places.
 363  */
 364 /*ARGSUSED*/
 365 static int
 366 fdmount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
 367 {
 368         struct vnode *vp;
 369 
 370         if (secpolicy_fs_mount(cr, mvp, vfsp) != 0)
 371                 return (EPERM);
 372         if (mvp->v_type != VDIR)
 373                 return (ENOTDIR);
 374 
 375         mutex_enter(&mvp->v_lock);
 376         if ((uap->flags & MS_OVERLAY) == 0 &&
 377             (mvp->v_count > 1 || (mvp->v_flag & VROOT))) {
 378                 mutex_exit(&mvp->v_lock);
 379                 return (EBUSY);
 380         }
 381         mutex_exit(&mvp->v_lock);
 382 
 383         /*
 384          * Having the resource be anything but "fd" doesn't make sense
 385          */
 386         vfs_setresource(vfsp, "fd", 0);
 387 
 388         vp = vn_alloc(KM_SLEEP);
 389         vp->v_vfsp = vfsp;
 390         vn_setops(vp, fd_vnodeops);
 391         vp->v_type = VDIR;
 392         vp->v_data = NULL;
 393         vp->v_flag |= VROOT;
 394         vfsp->vfs_fstype = fdfstype;
 395         vfsp->vfs_data = (char *)vp;
 396         mutex_enter(&fd_minor_lock);
 397         do {
 398                 fdfsmin = (fdfsmin + 1) & L_MAXMIN32;
 399                 vfsp->vfs_dev = makedevice(fdfsmaj, fdfsmin);
 400         } while (vfs_devismounted(vfsp->vfs_dev));
 401         mutex_exit(&fd_minor_lock);
 402         vfs_make_fsid(&vfsp->vfs_fsid, vfsp->vfs_dev, fdfstype);
 403         vfsp->vfs_bsize = 1024;
 404         return (0);
 405 }
 406 
 407 /* ARGSUSED */
 408 static int
 409 fdunmount(vfs_t *vfsp, int flag, cred_t *cr)
 410 {
 411         vnode_t *rvp;
 412 
 413         if (secpolicy_fs_unmount(cr, vfsp) != 0)
 414                 return (EPERM);
 415 
 416         /*
 417          * forced unmount is not supported by this file system
 418          * and thus, ENOTSUP, is being returned.
 419          */
 420         if (flag & MS_FORCE)
 421                 return (ENOTSUP);
 422 
 423         rvp = (vnode_t *)vfsp->vfs_data;
 424         if (rvp->v_count > 1)
 425                 return (EBUSY);
 426 
 427         VN_RELE(rvp);
 428         return (0);
 429 }
 430 
 431 /* ARGSUSED */
 432 static int
 433 fdroot(vfs_t *vfsp, vnode_t **vpp)
 434 {
 435         vnode_t *vp = (vnode_t *)vfsp->vfs_data;
 436 
 437         VN_HOLD(vp);
 438         *vpp = vp;
 439         return (0);
 440 }
 441 
 442 /*
 443  * No locking required because I held the root vnode before calling this
 444  * function so the vfs won't disappear on me.  To be more explicit:
 445  * fdvrootp->v_count will be greater than 1 so fdunmount will just return.
 446  */
 447 static int
 448 fdstatvfs(struct vfs *vfsp, struct statvfs64 *sp)
 449 {
 450         dev32_t d32;
 451         rctl_qty_t fdno_ctl;
 452 
 453         mutex_enter(&curproc->p_lock);
 454         fdno_ctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_NOFILE],
 455             curproc->p_rctls, curproc);
 456         mutex_exit(&curproc->p_lock);
 457 
 458         bzero(sp, sizeof (*sp));
 459         sp->f_bsize = 1024;
 460         sp->f_frsize = 1024;
 461         sp->f_blocks = (fsblkcnt64_t)0;
 462         sp->f_bfree = (fsblkcnt64_t)0;
 463         sp->f_bavail = (fsblkcnt64_t)0;
 464         sp->f_files = (fsfilcnt64_t)
 465             (MIN(P_FINFO(curproc)->fi_nfiles, fdno_ctl + 2));
 466         sp->f_ffree = (fsfilcnt64_t)0;
 467         sp->f_favail = (fsfilcnt64_t)0;
 468         (void) cmpldev(&d32, vfsp->vfs_dev);
 469         sp->f_fsid = d32;
 470         (void) strcpy(sp->f_basetype, vfssw[fdfstype].vsw_name);
 471         sp->f_flag = vf_to_stf(vfsp->vfs_flag);
 472         sp->f_namemax = FDNSIZE;
 473         (void) strcpy(sp->f_fstr, "/dev/fd");
 474         (void) strcpy(&sp->f_fstr[8], "/dev/fd");
 475         return (0);
 476 }
 477 
 478 int
 479 fdinit(int fstype, char *name)
 480 {
 481         static const fs_operation_def_t fd_vfsops_template[] = {
 482                 { VFSNAME_MOUNT,        { .vfs_mount = fdmount } },
 483                 { VFSNAME_UNMOUNT,      { .vfs_unmount = fdunmount } },
 484                 { VFSNAME_ROOT,         { .vfs_root = fdroot } },
 485                 { VFSNAME_STATVFS,      { .vfs_statvfs = fdstatvfs } },
 486                 { NULL,                 { NULL } }
 487         };
 488         int error;
 489 
 490         fdfstype = fstype;
 491         ASSERT(fdfstype != 0);
 492 
 493         /*
 494          * Associate VFS ops vector with this fstype.
 495          */
 496         error = vfs_setfsops(fstype, fd_vfsops_template, NULL);
 497         if (error != 0) {
 498                 cmn_err(CE_WARN, "fdinit: bad vnode ops template");
 499                 return (error);
 500         }
 501 
 502         error = vn_make_ops(name, fd_vnodeops_template, &fd_vnodeops);
 503         if (error != 0) {
 504                 (void) vfs_freevfsops_by_type(fstype);
 505                 cmn_err(CE_WARN, "fdinit: bad vnode ops template");
 506                 return (error);
 507         }
 508 
 509         /*
 510          * Assign unique "device" numbers (reported by stat(2)).
 511          */
 512         fdfsmaj = getudev();
 513         fdrmaj = getudev();
 514         if (fdfsmaj == (major_t)-1 || fdrmaj == (major_t)-1) {
 515                 cmn_err(CE_WARN, "fdinit: can't get unique device numbers");
 516                 if (fdfsmaj == (major_t)-1)
 517                         fdfsmaj = 0;
 518                 if (fdrmaj == (major_t)-1)
 519                         fdrmaj = 0;
 520         }
 521         mutex_init(&fd_minor_lock, NULL, MUTEX_DEFAULT, NULL);
 522         return (0);
 523 }
 524 
/*
 * FDFS Mount options table
 */
/* Options cancelled when MNTOPT_RW is set. */
static char *rw_cancel[] = { MNTOPT_RO, NULL };

static mntopt_t mntopts[] = {
/*
 *      option name             cancel option   default arg     flags
 */
        { MNTOPT_RW,            rw_cancel,      NULL,           MO_DEFAULT,
                (void *)MNTOPT_NOINTR },
        { MNTOPT_IGNORE,        NULL,           NULL,           0,
                (void *)0 },
};

/* Option count plus table, in the form the vfsdef_t below points at. */
static mntopts_t fdfs_mntopts = {
        sizeof (mntopts) / sizeof (mntopt_t),
        mntopts
};

/*
 * File system definition: the type name "fd", the initialization routine
 * fdinit(), the VSW_HASPROTO | VSW_ZMOUNT flags and the mount option
 * prototype table.
 */
static vfsdef_t vfw = {
        VFSDEF_VERSION,
        "fd",
        fdinit,
        VSW_HASPROTO | VSW_ZMOUNT,
        &fdfs_mntopts
};

/* Loadable file system module description wrapping vfw. */
static struct modlfs modlfs = {
        &mod_fsops,
        "filesystem for fd",
        &vfw
};

/* Module linkage passed to mod_install() and mod_info(). */
static struct modlinkage modlinkage = {
        MODREV_1,
        { &modlfs, NULL }
};
 563 
 564 int
 565 _init(void)
 566 {
 567         return (mod_install(&modlinkage));
 568 }
 569 
 570 int
 571 _info(struct modinfo *modinfop)
 572 {
 573         return (mod_info(&modlinkage, modinfop));
 574 }