1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2013 by Delphix. All rights reserved.
  24  */
  25 
  26 #include <sys/zfs_context.h>
  27 #include <sys/spa.h>
  28 #include <sys/spa_impl.h>
  29 #include <sys/vdev_file.h>
  30 #include <sys/vdev_impl.h>
  31 #include <sys/zio.h>
  32 #include <sys/fs/zfs.h>
  33 #include <sys/fm/fs/zfs.h>
  34 
  35 /*
  36  * Virtual device vector for files.
  37  */
  38 
  39 static void
  40 vdev_file_hold(vdev_t *vd)
  41 {
  42         ASSERT(vd->vdev_path != NULL);
  43 }
  44 
  45 static void
  46 vdev_file_rele(vdev_t *vd)
  47 {
  48         ASSERT(vd->vdev_path != NULL);
  49 }
  50 
  51 static int
  52 vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
  53     uint64_t *ashift)
  54 {
  55         vdev_file_t *vf;
  56         vnode_t *vp;
  57         vattr_t vattr;
  58         int error;
  59 
  60         /*
  61          * We must have a pathname, and it must be absolute.
  62          */
  63         if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
  64                 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
  65                 return (SET_ERROR(EINVAL));
  66         }
  67 
  68         /*
  69          * Reopen the device if it's not currently open.  Otherwise,
  70          * just update the physical size of the device.
  71          */
  72         if (vd->vdev_tsd != NULL) {
  73                 ASSERT(vd->vdev_reopening);
  74                 vf = vd->vdev_tsd;
  75                 goto skip_open;
  76         }
  77 
  78         vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP);
  79 
  80         /*
  81          * We always open the files from the root of the global zone, even if
  82          * we're in a local zone.  If the user has gotten to this point, the
  83          * administrator has already decided that the pool should be available
  84          * to local zone users, so the underlying devices should be as well.
  85          */
  86         ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/');
  87         error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE,
  88             spa_mode(vd->vdev_spa) | FOFFMAX, 0, &vp, 0, 0, rootdir, -1);
  89 
  90         if (error) {
  91                 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
  92                 return (error);
  93         }
  94 
  95         vf->vf_vnode = vp;
  96 
  97 #ifdef _KERNEL
  98         /*
  99          * Make sure it's a regular file.
 100          */
 101         if (vp->v_type != VREG) {
 102                 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
 103                 return (SET_ERROR(ENODEV));
 104         }
 105 #endif
 106 
 107 skip_open:
 108         /*
 109          * Determine the physical size of the file.
 110          */
 111         vattr.va_mask = AT_SIZE;
 112         error = VOP_GETATTR(vf->vf_vnode, &vattr, 0, kcred, NULL);
 113         if (error) {
 114                 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
 115                 return (error);
 116         }
 117 
 118         *max_psize = *psize = vattr.va_size;
 119         *ashift = SPA_MINBLOCKSHIFT;
 120 
 121         return (0);
 122 }
 123 
 124 static void
 125 vdev_file_close(vdev_t *vd)
 126 {
 127         vdev_file_t *vf = vd->vdev_tsd;
 128 
 129         if (vd->vdev_reopening || vf == NULL)
 130                 return;
 131 
 132         if (vf->vf_vnode != NULL) {
 133                 (void) VOP_PUTPAGE(vf->vf_vnode, 0, 0, B_INVAL, kcred, NULL);
 134                 (void) VOP_CLOSE(vf->vf_vnode, spa_mode(vd->vdev_spa), 1, 0,
 135                     kcred, NULL);
 136                 VN_RELE(vf->vf_vnode);
 137         }
 138 
 139         vd->vdev_delayed_close = B_FALSE;
 140         kmem_free(vf, sizeof (vdev_file_t));
 141         vd->vdev_tsd = NULL;
 142 }
 143 
 144 /*
 145  * Implements the interrupt side for file vdev types. This routine will be
 146  * called when the I/O completes allowing us to transfer the I/O to the
 147  * interrupt taskqs. For consistency, the code structure mimics disk vdev
 148  * types.
 149  */
 150 static void
 151 vdev_file_io_intr(buf_t *bp)
 152 {
 153         vdev_buf_t *vb = (vdev_buf_t *)bp;
 154         zio_t *zio = vb->vb_io;
 155 
 156         zio->io_error = (geterror(bp) != 0 ? EIO : 0);
 157         if (zio->io_error == 0 && bp->b_resid != 0)
 158                 zio->io_error = SET_ERROR(ENOSPC);
 159 
 160         kmem_free(vb, sizeof (vdev_buf_t));
 161         zio_interrupt(zio);
 162 }
 163 
 164 static void
 165 vdev_file_io_strategy(void *arg)
 166 {
 167         buf_t *bp = arg;
 168         vnode_t *vp = bp->b_private;
 169         ssize_t resid;
 170         int error;
 171 
 172         error = vn_rdwr((bp->b_flags & B_READ) ? UIO_READ : UIO_WRITE,
 173             vp, bp->b_un.b_addr, bp->b_bcount, ldbtob(bp->b_lblkno),
 174             UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
 175 
 176         if (error == 0) {
 177                 bp->b_resid = resid;
 178                 biodone(bp);
 179         } else {
 180                 bioerror(bp, error);
 181                 biodone(bp);
 182         }
 183 }
 184 
 185 static int
 186 vdev_file_io_start(zio_t *zio)
 187 {
 188         vdev_t *vd = zio->io_vd;
 189         vdev_file_t *vf = vd->vdev_tsd;
 190         vdev_buf_t *vb;
 191         buf_t *bp;
 192 
 193         if (zio->io_type == ZIO_TYPE_IOCTL) {
 194                 /* XXPOLICY */
 195                 if (!vdev_readable(vd)) {
 196                         zio->io_error = SET_ERROR(ENXIO);
 197                         return (ZIO_PIPELINE_CONTINUE);
 198                 }
 199 
 200                 switch (zio->io_cmd) {
 201                 case DKIOCFLUSHWRITECACHE:
 202                         zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC,
 203                             kcred, NULL);
 204                         break;
 205                 default:
 206                         zio->io_error = SET_ERROR(ENOTSUP);
 207                 }
 208 
 209                 return (ZIO_PIPELINE_CONTINUE);
 210         }
 211 
 212         vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP);
 213 
 214         vb->vb_io = zio;
 215         bp = &vb->vb_buf;
 216 
 217         bioinit(bp);
 218         bp->b_flags = (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
 219         bp->b_bcount = zio->io_size;
 220         bp->b_un.b_addr = zio->io_data;
 221         bp->b_lblkno = lbtodb(zio->io_offset);
 222         bp->b_bufsize = zio->io_size;
 223         bp->b_private = vf->vf_vnode;
 224         bp->b_iodone = (int (*)())vdev_file_io_intr;
 225 
 226         VERIFY3U(taskq_dispatch(system_taskq, vdev_file_io_strategy, bp,
 227             TQ_SLEEP), !=, 0);
 228 
 229         return (ZIO_PIPELINE_STOP);
 230 }
 231 
 232 /* ARGSUSED */
 233 static void
 234 vdev_file_io_done(zio_t *zio)
 235 {
 236 }
 237 
 238 vdev_ops_t vdev_file_ops = {
 239         vdev_file_open,
 240         vdev_file_close,
 241         vdev_default_asize,
 242         vdev_file_io_start,
 243         vdev_file_io_done,
 244         NULL,
 245         vdev_file_hold,
 246         vdev_file_rele,
 247         VDEV_TYPE_FILE,         /* name of this vdev type */
 248         B_TRUE                  /* leaf vdev */
 249 };
 250 
 251 /*
 252  * From userland we access disks just like files.
 253  */
 254 #ifndef _KERNEL
 255 
 256 vdev_ops_t vdev_disk_ops = {
 257         vdev_file_open,
 258         vdev_file_close,
 259         vdev_default_asize,
 260         vdev_file_io_start,
 261         vdev_file_io_done,
 262         NULL,
 263         vdev_file_hold,
 264         vdev_file_rele,
 265         VDEV_TYPE_DISK,         /* name of this vdev type */
 266         B_TRUE                  /* leaf vdev */
 267 };
 268 
 269 #endif