1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 
  25 #include <sys/param.h>
  26 #include <sys/errno.h>
  27 #include <sys/vfs.h>
  28 #include <sys/vfs_opreg.h>
  29 #include <sys/vnode.h>
  30 #include <sys/uio.h>
  31 #include <sys/pathname.h>
  32 #include <sys/kmem.h>
  33 #include <sys/cred.h>
  34 #include <sys/statvfs.h>
  35 #include <sys/fs/lofs_info.h>
  36 #include <sys/fs/lofs_node.h>
  37 #include <sys/mount.h>
  38 #include <sys/mntent.h>
  39 #include <sys/mkdev.h>
  40 #include <sys/priv.h>
  41 #include <sys/sysmacros.h>
  42 #include <sys/systm.h>
  43 #include <sys/cmn_err.h>
  44 #include <sys/policy.h>
  45 #include <sys/tsol/label.h>
  46 #include "fs/fs_subr.h"
  47 
  48 /*
  49  * This is the loadable module wrapper.
  50  */
  51 #include <sys/modctl.h>
  52 
/*
 * Tentative definition of the lofs mount options table so that vfw can
 * take its address; the initialized definition appears later in this file.
 */
static mntopts_t lofs_mntopts;

/* Filesystem initialization routine, referenced by vfw. */
static int lofsinit(int, char *);

/*
 * Filesystem definition for lofs.  It ties together the definition
 * version, the name "lofs", the initialization routine, the VSW_* flags
 * and the mount options table.  It is handed to the module framework
 * through the modlfs structure.
 */
static vfsdef_t vfw = {
        VFSDEF_VERSION,
        "lofs",
        lofsinit,
        VSW_HASPROTO|VSW_STATS|VSW_ZMOUNT,
        &lofs_mntopts
};
  64 
/*
 * LOFS mount options table
 *
 * Each *_cancel array is a NULL-terminated list of option names and is
 * placed in the "cancel option" column of the table entry it is named
 * after: the xattr entry lists noxattr, the sub entry lists nosub, and
 * vice versa.
 */
static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL };
static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL };
static char *sub_cancel[] = { MNTOPT_LOFS_NOSUB, NULL };
static char *nosub_cancel[] = { MNTOPT_LOFS_SUB, NULL };

/*
 * One entry per mount option declared by lofs.  None of the entries
 * supplies a default argument, flags or private data.
 */
static mntopt_t mntopts[] = {
/*
 *      option name             cancel option   default arg     flags
 *              private data
 */
        { MNTOPT_XATTR,         xattr_cancel,   NULL,           0,
                (void *)0 },
        { MNTOPT_NOXATTR,       noxattr_cancel, NULL,           0,
                (void *)0 },
        { MNTOPT_LOFS_SUB,      sub_cancel,     NULL,           0,
                (void *)0 },
        { MNTOPT_LOFS_NOSUB,    nosub_cancel,   NULL,           0,
                (void *)0 },
};
  87 
/*
 * Descriptor for the option table above: the number of entries (computed
 * from the array size so it tracks additions to mntopts[]) followed by a
 * pointer to the table itself.  Referenced from vfw.
 */
static mntopts_t lofs_mntopts = {
        sizeof (mntopts) / sizeof (mntopt_t),
        mntopts
};
  92 
/*
 * Module linkage information for the kernel.
 */

/*
 * Filesystem-module linkage: the filesystem module operations, a
 * descriptive string for the module, and the lofs filesystem definition.
 */
static struct modlfs modlfs = {
        &mod_fsops, "filesystem for lofs", &vfw
};

/*
 * Top-level linkage structure passed to mod_install() and mod_info().
 * It carries a single linkage element (modlfs) followed by a NULL
 * terminator.
 */
static struct modlinkage modlinkage = {
        MODREV_1, (void *)&modlfs, NULL
};
 104 
 105 /*
 106  * This is the module initialization routine.
 107  */
 108 
 109 int
 110 _init(void)
 111 {
 112         int status;
 113 
 114         lofs_subrinit();
 115         status = mod_install(&modlinkage);
 116         if (status != 0) {
 117                 /*
 118                  * Cleanup previously initialized work.
 119                  */
 120                 lofs_subrfini();
 121         }
 122 
 123         return (status);
 124 }
 125 
 126 /*
 127  * Don't allow the lofs module to be unloaded for now.
 128  * There is a memory leak if it gets unloaded.
 129  */
 130 
 131 int
 132 _fini(void)
 133 {
 134         return (EBUSY);
 135 }
 136 
 137 int
 138 _info(struct modinfo *modinfop)
 139 {
 140         return (mod_info(&modlinkage, modinfop));
 141 }
 142 
 143 
/*
 * Filesystem type index for lofs.  Recorded by lofsinit() and stored in
 * vfs_fstype of every vfs mounted by lo_mount().
 */
static int lofsfstype;
/* lofs vfs operations vector; filled in by vfs_setfsops() in lofsinit(). */
vfsops_t *lo_vfsops;
 146 
 147 /*
 148  * lo mount vfsop
 149  * Set up mount info record and attach it to vfs struct.
 150  */
 151 /*ARGSUSED*/
 152 static int
 153 lo_mount(struct vfs *vfsp,
 154         struct vnode *vp,
 155         struct mounta *uap,
 156         struct cred *cr)
 157 {
 158         int error;
 159         struct vnode *srootvp = NULL;   /* the server's root */
 160         struct vnode *realrootvp;
 161         struct loinfo *li;
 162         int nodev;
 163 
 164         nodev = vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL);
 165 
 166         if ((error = secpolicy_fs_mount(cr, vp, vfsp)) != 0)
 167                 return (EPERM);
 168 
 169         /*
 170          * Loopback devices which get "nodevices" added can be done without
 171          * "nodevices" set because we cannot import devices into a zone
 172          * with loopback.  Note that we have all zone privileges when
 173          * this happens; if not, we'd have gotten "nosuid".
 174          */
 175         if (!nodev && vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
 176                 vfs_setmntopt(vfsp, MNTOPT_DEVICES, NULL, VFS_NODISPLAY);
 177 
 178         mutex_enter(&vp->v_lock);
 179         if (!(uap->flags & MS_OVERLAY) &&
 180             (vp->v_count != 1 || (vp->v_flag & VROOT))) {
 181                 mutex_exit(&vp->v_lock);
 182                 return (EBUSY);
 183         }
 184         mutex_exit(&vp->v_lock);
 185 
 186         /*
 187          * Find real root, and make vfs point to real vfs
 188          */
 189 
 190         if (error = lookupname(uap->spec, (uap->flags & MS_SYSSPACE) ?
 191             UIO_SYSSPACE : UIO_USERSPACE, FOLLOW, NULLVPP, &realrootvp))
 192                 return (error);
 193 
 194         /*
 195          * Enforce MAC policy if needed.
 196          *
 197          * Loopback mounts must not allow writing up. The dominance test
 198          * is intended to prevent a global zone caller from accidentally
 199          * creating write-up conditions between two labeled zones.
 200          * Local zones can't violate MAC on their own without help from
 201          * the global zone because they can't name a pathname that
 202          * they don't already have.
 203          *
 204          * The special case check for the NET_MAC_AWARE process flag is
 205          * to support the case of the automounter in the global zone. We
 206          * permit automounting of local zone directories such as home
 207          * directories, into the global zone as required by setlabel,
 208          * zonecopy, and saving of desktop sessions. Such mounts are
 209          * trusted not to expose the contents of one zone's directories
 210          * to another by leaking them through the global zone.
 211          */
 212         if (is_system_labeled() && crgetzoneid(cr) == GLOBAL_ZONEID) {
 213                 char    specname[MAXPATHLEN];
 214                 zone_t  *from_zptr;
 215                 zone_t  *to_zptr;
 216 
 217                 if (vnodetopath(NULL, realrootvp, specname,
 218                     sizeof (specname), CRED()) != 0) {
 219                         VN_RELE(realrootvp);
 220                         return (EACCES);
 221                 }
 222 
 223                 from_zptr = zone_find_by_path(specname);
 224                 to_zptr = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
 225 
 226                 /*
 227                  * Special case for scratch zones used for Live Upgrade:
 228                  * this is used to mount the zone's root from /root to /a in
 229                  * the scratch zone.  As with the other special case, this
 230                  * appears to be outside of the zone because it's not under
 231                  * the zone rootpath, which is $ZONEPATH/lu in the scratch
 232                  * zone case.
 233                  */
 234 
 235                 if (from_zptr != to_zptr &&
 236                     !(to_zptr->zone_flags & ZF_IS_SCRATCH)) {
 237                         /*
 238                          * We know at this point that the labels aren't equal
 239                          * because the zone pointers aren't equal, and zones
 240                          * can't share a label.
 241                          *
 242                          * If the source is the global zone then making
 243                          * it available to a local zone must be done in
 244                          * read-only mode as the label will become admin_low.
 245                          *
 246                          * If it is a mount between local zones then if
 247                          * the current process is in the global zone and has
 248                          * the NET_MAC_AWARE flag, then regular read-write
 249                          * access is allowed.  If it's in some other zone, but
 250                          * the label on the mount point dominates the original
 251                          * source, then allow the mount as read-only
 252                          * ("read-down").
 253                          */
 254                         if (from_zptr->zone_id == GLOBAL_ZONEID) {
 255                                 /* make the mount read-only */
 256                                 vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
 257                         } else { /* cross-zone mount */
 258                                 if (to_zptr->zone_id == GLOBAL_ZONEID &&
 259                                     /* LINTED: no consequent */
 260                                     getpflags(NET_MAC_AWARE, cr) != 0) {
 261                                         /* Allow the mount as read-write */
 262                                 } else if (bldominates(
 263                                     label2bslabel(to_zptr->zone_slabel),
 264                                     label2bslabel(from_zptr->zone_slabel))) {
 265                                         /* make the mount read-only */
 266                                         vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
 267                                 } else {
 268                                         VN_RELE(realrootvp);
 269                                         zone_rele(to_zptr);
 270                                         zone_rele(from_zptr);
 271                                         return (EACCES);
 272                                 }
 273                         }
 274                 }
 275                 zone_rele(to_zptr);
 276                 zone_rele(from_zptr);
 277         }
 278 
 279         /*
 280          * realrootvp may be an AUTOFS node, in which case we perform a
 281          * VOP_ACCESS() to trigger the mount of the intended filesystem.
 282          * This causes a loopback mount of the intended filesystem instead
 283          * of the AUTOFS filesystem.
 284          *
 285          * If a lofs mount creates a mount loop (such that a lofs vfs is
 286          * mounted on an autofs node and that lofs vfs points back to the
 287          * autofs node which it is mounted on) then a VOP_ACCESS call will
 288          * create a deadlock. Once this deadlock is released, VOP_ACCESS will
 289          * return EINTR. In such a case we don't want the lofs vfs to be
 290          * created as the loop could panic the system.
 291          */
 292         if ((error = VOP_ACCESS(realrootvp, 0, 0, cr, NULL)) != 0) {
 293                 VN_RELE(realrootvp);
 294                 return (error);
 295         }
 296 
 297         /*
 298          * We're interested in the top most filesystem.
 299          * This is specially important when uap->spec is a trigger
 300          * AUTOFS node, since we're really interested in mounting the
 301          * filesystem AUTOFS mounted as result of the VOP_ACCESS()
 302          * call not the AUTOFS node itself.
 303          */
 304         if (vn_mountedvfs(realrootvp) != NULL) {
 305                 if (error = traverse(&realrootvp)) {
 306                         VN_RELE(realrootvp);
 307                         return (error);
 308                 }
 309         }
 310 
 311         /*
 312          * Allocate a vfs info struct and attach it
 313          */
 314         li = kmem_zalloc(sizeof (struct loinfo), KM_SLEEP);
 315         li->li_realvfs = realrootvp->v_vfsp;
 316         li->li_mountvfs = vfsp;
 317 
 318         /*
 319          * Set mount flags to be inherited by loopback vfs's
 320          */
 321         if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
 322                 li->li_mflag |= VFS_RDONLY;
 323         }
 324         if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
 325                 li->li_mflag |= (VFS_NOSETUID|VFS_NODEVICES);
 326         }
 327         if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) {
 328                 li->li_mflag |= VFS_NODEVICES;
 329         }
 330         if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
 331                 li->li_mflag |= VFS_NOSETUID;
 332         }
 333         /*
 334          * Permissive flags are added to the "deny" bitmap.
 335          */
 336         if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
 337                 li->li_dflag |= VFS_XATTR;
 338         }
 339         if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
 340                 li->li_dflag |= VFS_NBMAND;
 341         }
 342 
 343         /*
 344          * Propagate inheritable mount flags from the real vfs.
 345          */
 346         if ((li->li_realvfs->vfs_flag & VFS_RDONLY) &&
 347             !vfs_optionisset(vfsp, MNTOPT_RO, NULL))
 348                 vfs_setmntopt(vfsp, MNTOPT_RO, NULL,
 349                     VFS_NODISPLAY);
 350         if ((li->li_realvfs->vfs_flag & VFS_NOSETUID) &&
 351             !vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
 352                 vfs_setmntopt(vfsp, MNTOPT_NOSETUID, NULL,
 353                     VFS_NODISPLAY);
 354         if ((li->li_realvfs->vfs_flag & VFS_NODEVICES) &&
 355             !vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
 356                 vfs_setmntopt(vfsp, MNTOPT_NODEVICES, NULL,
 357                     VFS_NODISPLAY);
 358         /*
 359          * Permissive flags such as VFS_XATTR, as opposed to restrictive flags
 360          * such as VFS_RDONLY, are handled differently.  An explicit
 361          * MNTOPT_NOXATTR should override the underlying filesystem's VFS_XATTR.
 362          */
 363         if ((li->li_realvfs->vfs_flag & VFS_XATTR) &&
 364             !vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL) &&
 365             !vfs_optionisset(vfsp, MNTOPT_XATTR, NULL))
 366                 vfs_setmntopt(vfsp, MNTOPT_XATTR, NULL,
 367                     VFS_NODISPLAY);
 368         if ((li->li_realvfs->vfs_flag & VFS_NBMAND) &&
 369             !vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL) &&
 370             !vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL))
 371                 vfs_setmntopt(vfsp, MNTOPT_NBMAND, NULL,
 372                     VFS_NODISPLAY);
 373 
 374         li->li_refct = 0;
 375         vfsp->vfs_data = (caddr_t)li;
 376         vfsp->vfs_bcount = 0;
 377         vfsp->vfs_fstype = lofsfstype;
 378         vfsp->vfs_bsize = li->li_realvfs->vfs_bsize;
 379 
 380         vfsp->vfs_dev = li->li_realvfs->vfs_dev;
 381         vfsp->vfs_fsid.val[0] = li->li_realvfs->vfs_fsid.val[0];
 382         vfsp->vfs_fsid.val[1] = li->li_realvfs->vfs_fsid.val[1];
 383 
 384         if (vfs_optionisset(vfsp, MNTOPT_LOFS_NOSUB, NULL)) {
 385                 li->li_flag |= LO_NOSUB;
 386         }
 387 
 388         /*
 389          * Propagate any VFS features
 390          */
 391 
 392         vfs_propagate_features(li->li_realvfs, vfsp);
 393 
 394         /*
 395          * Setup the hashtable. If the root of this mount isn't a directory,
 396          * there's no point in allocating a large hashtable. A table with one
 397          * bucket is sufficient.
 398          */
 399         if (realrootvp->v_type != VDIR)
 400                 lsetup(li, 1);
 401         else
 402                 lsetup(li, 0);
 403 
 404         /*
 405          * Make the root vnode
 406          */
 407         srootvp = makelonode(realrootvp, li, 0);
 408         srootvp->v_flag |= VROOT;
 409         li->li_rootvp = srootvp;
 410 
 411 #ifdef LODEBUG
 412         lo_dprint(4, "lo_mount: vfs %p realvfs %p root %p realroot %p li %p\n",
 413             vfsp, li->li_realvfs, srootvp, realrootvp, li);
 414 #endif
 415         return (0);
 416 }
 417 
 418 /*
 419  * Undo loopback mount
 420  */
 421 static int
 422 lo_unmount(struct vfs *vfsp, int flag, struct cred *cr)
 423 {
 424         struct loinfo *li;
 425 
 426         if (secpolicy_fs_unmount(cr, vfsp) != 0)
 427                 return (EPERM);
 428 
 429         /*
 430          * Forced unmount is not supported by this file system
 431          * and thus, ENOTSUP, is being returned.
 432          */
 433         if (flag & MS_FORCE)
 434                 return (ENOTSUP);
 435 
 436         li = vtoli(vfsp);
 437 #ifdef LODEBUG
 438         lo_dprint(4, "lo_unmount(%p) li %p\n", vfsp, li);
 439 #endif
 440         if (li->li_refct != 1 || li->li_rootvp->v_count != 1) {
 441 #ifdef LODEBUG
 442                 lo_dprint(4, "refct %d v_ct %d\n", li->li_refct,
 443                     li->li_rootvp->v_count);
 444 #endif
 445                 return (EBUSY);
 446         }
 447         VN_RELE(li->li_rootvp);
 448         return (0);
 449 }
 450 
 451 /*
 452  * Find root of lofs mount.
 453  */
 454 static int
 455 lo_root(struct vfs *vfsp, struct vnode **vpp)
 456 {
 457         *vpp = vtoli(vfsp)->li_rootvp;
 458 #ifdef LODEBUG
 459         lo_dprint(4, "lo_root(0x%p) = %p\n", vfsp, *vpp);
 460 #endif
 461         /*
 462          * If the root of the filesystem is a special file, return the specvp
 463          * version of the vnode. We don't save the specvp vnode in our
 464          * hashtable since that's exclusively for lnodes.
 465          */
 466         if (IS_DEVVP(*vpp)) {
 467                 struct vnode *svp;
 468 
 469                 svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, kcred);
 470                 if (svp == NULL)
 471                         return (ENOSYS);
 472                 *vpp = svp;
 473         } else {
 474                 VN_HOLD(*vpp);
 475         }
 476 
 477         return (0);
 478 }
 479 
 480 /*
 481  * Get file system statistics.
 482  */
 483 static int
 484 lo_statvfs(register struct vfs *vfsp, struct statvfs64 *sbp)
 485 {
 486         vnode_t *realrootvp;
 487 
 488 #ifdef LODEBUG
 489         lo_dprint(4, "lostatvfs %p\n", vfsp);
 490 #endif
 491         /*
 492          * Using realrootvp->v_vfsp (instead of the realvfsp that was
 493          * cached) is necessary to make lofs work woth forced UFS unmounts.
 494          * In the case of a forced unmount, UFS stores a set of dummy vfsops
 495          * in all the (i)vnodes in the filesystem. The dummy ops simply
 496          * returns back EIO.
 497          */
 498         (void) lo_realvfs(vfsp, &realrootvp);
 499         if (realrootvp != NULL)
 500                 return (VFS_STATVFS(realrootvp->v_vfsp, sbp));
 501         else
 502                 return (EIO);
 503 }
 504 
 505 /*
 506  * LOFS doesn't have any data or metadata to flush, pending I/O on the
 507  * underlying filesystem will be flushed when such filesystem is synched.
 508  */
 509 /* ARGSUSED */
 510 static int
 511 lo_sync(struct vfs *vfsp,
 512         short flag,
 513         struct cred *cr)
 514 {
 515 #ifdef LODEBUG
 516         lo_dprint(4, "lo_sync: %p\n", vfsp);
 517 #endif
 518         return (0);
 519 }
 520 
 521 /*
 522  * Obtain the vnode from the underlying filesystem.
 523  */
 524 static int
 525 lo_vget(struct vfs *vfsp, struct vnode **vpp, struct fid *fidp)
 526 {
 527         vnode_t *realrootvp;
 528 
 529 #ifdef LODEBUG
 530         lo_dprint(4, "lo_vget: %p\n", vfsp);
 531 #endif
 532         (void) lo_realvfs(vfsp, &realrootvp);
 533         if (realrootvp != NULL)
 534                 return (VFS_VGET(realrootvp->v_vfsp, vpp, fidp));
 535         else
 536                 return (EIO);
 537 }
 538 
 539 /*
 540  * Free mount-specific data.
 541  */
 542 static void
 543 lo_freevfs(struct vfs *vfsp)
 544 {
 545         struct loinfo *li = vtoli(vfsp);
 546 
 547         ldestroy(li);
 548         kmem_free(li, sizeof (struct loinfo));
 549 }
 550 
 551 static int
 552 lofsinit(int fstyp, char *name)
 553 {
 554         static const fs_operation_def_t lo_vfsops_template[] = {
 555                 VFSNAME_MOUNT,          { .vfs_mount = lo_mount },
 556                 VFSNAME_UNMOUNT,        { .vfs_unmount = lo_unmount },
 557                 VFSNAME_ROOT,           { .vfs_root = lo_root },
 558                 VFSNAME_STATVFS,        { .vfs_statvfs = lo_statvfs },
 559                 VFSNAME_SYNC,           { .vfs_sync = lo_sync },
 560                 VFSNAME_VGET,           { .vfs_vget = lo_vget },
 561                 VFSNAME_FREEVFS,        { .vfs_freevfs = lo_freevfs },
 562                 NULL,                   NULL
 563         };
 564         int error;
 565 
 566         error = vfs_setfsops(fstyp, lo_vfsops_template, &lo_vfsops);
 567         if (error != 0) {
 568                 cmn_err(CE_WARN, "lofsinit: bad vfs ops template");
 569                 return (error);
 570         }
 571 
 572         error = vn_make_ops(name, lo_vnodeops_template, &lo_vnodeops);
 573         if (error != 0) {
 574                 (void) vfs_freevfsops_by_type(fstyp);
 575                 cmn_err(CE_WARN, "lofsinit: bad vnode ops template");
 576                 return (error);
 577         }
 578 
 579         lofsfstype = fstyp;
 580 
 581         return (0);
 582 }