1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  24  */
  25 
  26 #include <sys/zfs_context.h>
  27 #include <sys/spa.h>
  28 #include <sys/spa_impl.h>
  29 #include <sys/vdev_file.h>
  30 #include <sys/vdev_impl.h>
  31 #include <sys/zio.h>
  32 #include <sys/fs/zfs.h>
  33 #include <sys/fm/fs/zfs.h>
  34 
  35 /*
  36  * Virtual device vector for files.
  37  */
  38 
  39 static void
  40 vdev_file_hold(vdev_t *vd)
  41 {
  42         ASSERT(vd->vdev_path != NULL);
  43 }
  44 
  45 static void
  46 vdev_file_rele(vdev_t *vd)
  47 {
  48         ASSERT(vd->vdev_path != NULL);
  49 }
  50 
  51 static int
  52 vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
  53     uint64_t *ashift)
  54 {
  55         vdev_file_t *vf;
  56         vnode_t *vp;
  57         vattr_t vattr;
  58         int error;
  59 
  60         /*
  61          * Rotational optimizations only make sense on block devices
  62          */
  63         vd->vdev_nonrot = B_TRUE;
  64 
  65         /*
  66          * We must have a pathname, and it must be absolute.
  67          */
  68         if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
  69                 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
  70                 return (SET_ERROR(EINVAL));
  71         }
  72 
  73         /*
  74          * Reopen the device if it's not currently open.  Otherwise,
  75          * just update the physical size of the device.
  76          */
  77         if (vd->vdev_tsd != NULL) {
  78                 ASSERT(vd->vdev_reopening);
  79                 vf = vd->vdev_tsd;
  80                 goto skip_open;
  81         }
  82 
  83         vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP);
  84 
  85         /*
  86          * We always open the files from the root of the global zone, even if
  87          * we're in a local zone.  If the user has gotten to this point, the
  88          * administrator has already decided that the pool should be available
  89          * to local zone users, so the underlying devices should be as well.
  90          */
  91         ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/');
  92         error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE,
  93             spa_mode(vd->vdev_spa) | FOFFMAX, 0, &vp, 0, 0, rootdir, -1);
  94 
  95         if (error) {
  96                 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
  97                 return (error);
  98         }
  99 
 100         vf->vf_vnode = vp;
 101 
 102 #ifdef _KERNEL
 103         /*
 104          * Make sure it's a regular file.
 105          */
 106         if (vp->v_type != VREG) {
 107                 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
 108                 return (SET_ERROR(ENODEV));
 109         }
 110 #endif
 111 
 112 skip_open:
 113         /*
 114          * Determine the physical size of the file.
 115          */
 116         vattr.va_mask = AT_SIZE;
 117         error = VOP_GETATTR(vf->vf_vnode, &vattr, 0, kcred, NULL);
 118         if (error) {
 119                 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
 120                 return (error);
 121         }
 122 
 123         *max_psize = *psize = vattr.va_size;
 124         *ashift = SPA_MINBLOCKSHIFT;
 125 
 126         return (0);
 127 }
 128 
 129 static void
 130 vdev_file_close(vdev_t *vd)
 131 {
 132         vdev_file_t *vf = vd->vdev_tsd;
 133 
 134         if (vd->vdev_reopening || vf == NULL)
 135                 return;
 136 
 137         if (vf->vf_vnode != NULL) {
 138                 (void) VOP_PUTPAGE(vf->vf_vnode, 0, 0, B_INVAL, kcred, NULL);
 139                 (void) VOP_CLOSE(vf->vf_vnode, spa_mode(vd->vdev_spa), 1, 0,
 140                     kcred, NULL);
 141                 VN_RELE(vf->vf_vnode);
 142         }
 143 
 144         vd->vdev_delayed_close = B_FALSE;
 145         kmem_free(vf, sizeof (vdev_file_t));
 146         vd->vdev_tsd = NULL;
 147 }
 148 
 149 /*
 150  * Implements the interrupt side for file vdev types. This routine will be
 151  * called when the I/O completes allowing us to transfer the I/O to the
 152  * interrupt taskqs. For consistency, the code structure mimics disk vdev
 153  * types.
 154  */
 155 static void
 156 vdev_file_io_intr(buf_t *bp)
 157 {
 158         vdev_buf_t *vb = (vdev_buf_t *)bp;
 159         zio_t *zio = vb->vb_io;
 160 
 161         zio->io_error = (geterror(bp) != 0 ? EIO : 0);
 162         if (zio->io_error == 0 && bp->b_resid != 0)
 163                 zio->io_error = SET_ERROR(ENOSPC);
 164 
 165         kmem_free(vb, sizeof (vdev_buf_t));
 166         zio_delay_interrupt(zio);
 167 }
 168 
 169 static void
 170 vdev_file_io_strategy(void *arg)
 171 {
 172         buf_t *bp = arg;
 173         vnode_t *vp = bp->b_private;
 174         ssize_t resid;
 175         int error;
 176 
 177         error = vn_rdwr((bp->b_flags & B_READ) ? UIO_READ : UIO_WRITE,
 178             vp, bp->b_un.b_addr, bp->b_bcount, ldbtob(bp->b_lblkno),
 179             UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
 180 
 181         if (error == 0) {
 182                 bp->b_resid = resid;
 183                 biodone(bp);
 184         } else {
 185                 bioerror(bp, error);
 186                 biodone(bp);
 187         }
 188 }
 189 
 190 static void
 191 vdev_file_io_start(zio_t *zio)
 192 {
 193         vdev_t *vd = zio->io_vd;
 194         vdev_file_t *vf = vd->vdev_tsd;
 195         vdev_buf_t *vb;
 196         buf_t *bp;
 197 
 198         if (zio->io_type == ZIO_TYPE_IOCTL) {
 199                 /* XXPOLICY */
 200                 if (!vdev_readable(vd)) {
 201                         zio->io_error = SET_ERROR(ENXIO);
 202                         zio_interrupt(zio);
 203                         return;
 204                 }
 205 
 206                 switch (zio->io_cmd) {
 207                 case DKIOCFLUSHWRITECACHE:
 208                         zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC,
 209                             kcred, NULL);
 210                         break;
 211                 default:
 212                         zio->io_error = SET_ERROR(ENOTSUP);
 213                 }
 214 
 215                 zio_execute(zio);
 216                 return;
 217         }
 218 
 219         ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
 220         zio->io_target_timestamp = zio_handle_io_delay(zio);
 221 
 222         vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP);
 223 
 224         vb->vb_io = zio;
 225         bp = &vb->vb_buf;
 226 
 227         bioinit(bp);
 228         bp->b_flags = (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
 229         bp->b_bcount = zio->io_size;
 230         bp->b_un.b_addr = zio->io_data;
 231         bp->b_lblkno = lbtodb(zio->io_offset);
 232         bp->b_bufsize = zio->io_size;
 233         bp->b_private = vf->vf_vnode;
 234         bp->b_iodone = (int (*)())vdev_file_io_intr;
 235 
 236         VERIFY3U(taskq_dispatch(system_taskq, vdev_file_io_strategy, bp,
 237             TQ_SLEEP), !=, 0);
 238 }
 239 
 240 /* ARGSUSED */
 241 static void
 242 vdev_file_io_done(zio_t *zio)
 243 {
 244 }
 245 
 246 vdev_ops_t vdev_file_ops = {
 247         vdev_file_open,
 248         vdev_file_close,
 249         vdev_default_asize,
 250         vdev_file_io_start,
 251         vdev_file_io_done,
 252         NULL,
 253         vdev_file_hold,
 254         vdev_file_rele,
 255         VDEV_TYPE_FILE,         /* name of this vdev type */
 256         B_TRUE                  /* leaf vdev */
 257 };
 258 
 259 /*
 260  * From userland we access disks just like files.
 261  */
 262 #ifndef _KERNEL
 263 
 264 vdev_ops_t vdev_disk_ops = {
 265         vdev_file_open,
 266         vdev_file_close,
 267         vdev_default_asize,
 268         vdev_file_io_start,
 269         vdev_file_io_done,
 270         NULL,
 271         vdev_file_hold,
 272         vdev_file_rele,
 273         VDEV_TYPE_DISK,         /* name of this vdev type */
 274         B_TRUE                  /* leaf vdev */
 275 };
 276 
 277 #endif