1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  24  */
  25 
  26 #include <sys/zfs_context.h>
  27 #include <sys/spa.h>
  28 #include <sys/spa_impl.h>
  29 #include <sys/vdev_file.h>
  30 #include <sys/vdev_impl.h>
  31 #include <sys/zio.h>
  32 #include <sys/fs/zfs.h>
  33 #include <sys/fm/fs/zfs.h>
  34 #include <sys/abd.h>
  35 
  36 /*
  37  * Virtual device vector for files.
  38  */
  39 
  40 static void
  41 vdev_file_hold(vdev_t *vd)
  42 {
  43         ASSERT(vd->vdev_path != NULL);
  44 }
  45 
  46 static void
  47 vdev_file_rele(vdev_t *vd)
  48 {
  49         ASSERT(vd->vdev_path != NULL);
  50 }
  51 
  52 static int
  53 vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
  54     uint64_t *ashift)
  55 {
  56         vdev_file_t *vf;
  57         vnode_t *vp;
  58         vattr_t vattr;
  59         int error;
  60 
  61         /*
  62          * Rotational optimizations only make sense on block devices
  63          */
  64         vd->vdev_nonrot = B_TRUE;
  65 
  66         /*
  67          * We must have a pathname, and it must be absolute.
  68          */
  69         if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
  70                 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
  71                 return (SET_ERROR(EINVAL));
  72         }
  73 
  74         /*
  75          * Reopen the device if it's not currently open.  Otherwise,
  76          * just update the physical size of the device.
  77          */
  78         if (vd->vdev_tsd != NULL) {
  79                 ASSERT(vd->vdev_reopening);
  80                 vf = vd->vdev_tsd;
  81                 goto skip_open;
  82         }
  83 
  84         vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP);
  85 
  86         /*
  87          * We always open the files from the root of the global zone, even if
  88          * we're in a local zone.  If the user has gotten to this point, the
  89          * administrator has already decided that the pool should be available
  90          * to local zone users, so the underlying devices should be as well.
  91          */
  92         ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/');
  93         error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE,
  94             spa_mode(vd->vdev_spa) | FOFFMAX, 0, &vp, 0, 0, rootdir, -1);
  95 
  96         if (error) {
  97                 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
  98                 return (error);
  99         }
 100 
 101         vf->vf_vnode = vp;
 102 
 103 #ifdef _KERNEL
 104         /*
 105          * Make sure it's a regular file.
 106          */
 107         if (vp->v_type != VREG) {
 108                 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
 109                 return (SET_ERROR(ENODEV));
 110         }
 111 #endif
 112 
 113 skip_open:
 114         /*
 115          * Determine the physical size of the file.
 116          */
 117         vattr.va_mask = AT_SIZE;
 118         error = VOP_GETATTR(vf->vf_vnode, &vattr, 0, kcred, NULL);
 119         if (error) {
 120                 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
 121                 return (error);
 122         }
 123 
 124         *max_psize = *psize = vattr.va_size;
 125         *ashift = SPA_MINBLOCKSHIFT;
 126 
 127         return (0);
 128 }
 129 
 130 static void
 131 vdev_file_close(vdev_t *vd)
 132 {
 133         vdev_file_t *vf = vd->vdev_tsd;
 134 
 135         if (vd->vdev_reopening || vf == NULL)
 136                 return;
 137 
 138         if (vf->vf_vnode != NULL) {
 139                 (void) VOP_PUTPAGE(vf->vf_vnode, 0, 0, B_INVAL, kcred, NULL);
 140                 (void) VOP_CLOSE(vf->vf_vnode, spa_mode(vd->vdev_spa), 1, 0,
 141                     kcred, NULL);
 142                 VN_RELE(vf->vf_vnode);
 143         }
 144 
 145         vd->vdev_delayed_close = B_FALSE;
 146         kmem_free(vf, sizeof (vdev_file_t));
 147         vd->vdev_tsd = NULL;
 148 }
 149 
 150 /*
 151  * Implements the interrupt side for file vdev types. This routine will be
 152  * called when the I/O completes allowing us to transfer the I/O to the
 153  * interrupt taskqs. For consistency, the code structure mimics disk vdev
 154  * types.
 155  */
 156 static void
 157 vdev_file_io_intr(buf_t *bp)
 158 {
 159         vdev_buf_t *vb = (vdev_buf_t *)bp;
 160         zio_t *zio = vb->vb_io;
 161 
 162         zio->io_error = (geterror(bp) != 0 ? EIO : 0);
 163         if (zio->io_error == 0 && bp->b_resid != 0)
 164                 zio->io_error = SET_ERROR(ENOSPC);
 165 
 166         if (zio->io_type == ZIO_TYPE_READ) {
 167                 abd_return_buf_copy(zio->io_abd, bp->b_un.b_addr, zio->io_size);
 168         } else {
 169                 abd_return_buf(zio->io_abd, bp->b_un.b_addr, zio->io_size);
 170         }
 171 
 172         kmem_free(vb, sizeof (vdev_buf_t));
 173         zio_delay_interrupt(zio);
 174 }
 175 
 176 static void
 177 vdev_file_io_strategy(void *arg)
 178 {
 179         buf_t *bp = arg;
 180         vnode_t *vp = bp->b_private;
 181         ssize_t resid;
 182         int error;
 183 
 184         error = vn_rdwr((bp->b_flags & B_READ) ? UIO_READ : UIO_WRITE,
 185             vp, bp->b_un.b_addr, bp->b_bcount, ldbtob(bp->b_lblkno),
 186             UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
 187 
 188         if (error == 0) {
 189                 bp->b_resid = resid;
 190                 biodone(bp);
 191         } else {
 192                 bioerror(bp, error);
 193                 biodone(bp);
 194         }
 195 }
 196 
 197 static void
 198 vdev_file_io_start(zio_t *zio)
 199 {
 200         vdev_t *vd = zio->io_vd;
 201         vdev_file_t *vf = vd->vdev_tsd;
 202         vdev_buf_t *vb;
 203         buf_t *bp;
 204 
 205         if (zio->io_type == ZIO_TYPE_IOCTL) {
 206                 /* XXPOLICY */
 207                 if (!vdev_readable(vd)) {
 208                         zio->io_error = SET_ERROR(ENXIO);
 209                         zio_interrupt(zio);
 210                         return;
 211                 }
 212 
 213                 switch (zio->io_cmd) {
 214                 case DKIOCFLUSHWRITECACHE:
 215                         zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC,
 216                             kcred, NULL);
 217                         break;
 218                 default:
 219                         zio->io_error = SET_ERROR(ENOTSUP);
 220                 }
 221 
 222                 zio_execute(zio);
 223                 return;
 224         }
 225 
 226         ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
 227         zio->io_target_timestamp = zio_handle_io_delay(zio);
 228 
 229         vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP);
 230 
 231         vb->vb_io = zio;
 232         bp = &vb->vb_buf;
 233 
 234         bioinit(bp);
 235         bp->b_flags = (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
 236         bp->b_bcount = zio->io_size;
 237 
 238         if (zio->io_type == ZIO_TYPE_READ) {
 239                 bp->b_un.b_addr =
 240                     abd_borrow_buf(zio->io_abd, zio->io_size);
 241         } else {
 242                 bp->b_un.b_addr =
 243                     abd_borrow_buf_copy(zio->io_abd, zio->io_size);
 244         }
 245 
 246         bp->b_lblkno = lbtodb(zio->io_offset);
 247         bp->b_bufsize = zio->io_size;
 248         bp->b_private = vf->vf_vnode;
 249         bp->b_iodone = (int (*)())vdev_file_io_intr;
 250 
 251         VERIFY3U(taskq_dispatch(system_taskq, vdev_file_io_strategy, bp,
 252             TQ_SLEEP), !=, 0);
 253 }
 254 
 255 /* ARGSUSED */
 256 static void
 257 vdev_file_io_done(zio_t *zio)
 258 {
 259 }
 260 
 261 vdev_ops_t vdev_file_ops = {
 262         vdev_file_open,
 263         vdev_file_close,
 264         vdev_default_asize,
 265         vdev_file_io_start,
 266         vdev_file_io_done,
 267         NULL,
 268         vdev_file_hold,
 269         vdev_file_rele,
 270         VDEV_TYPE_FILE,         /* name of this vdev type */
 271         B_TRUE                  /* leaf vdev */
 272 };
 273 
 274 /*
 275  * From userland we access disks just like files.
 276  */
 277 #ifndef _KERNEL
 278 
 279 vdev_ops_t vdev_disk_ops = {
 280         vdev_file_open,
 281         vdev_file_close,
 282         vdev_default_asize,
 283         vdev_file_io_start,
 284         vdev_file_io_done,
 285         NULL,
 286         vdev_file_hold,
 287         vdev_file_rele,
 288         VDEV_TYPE_DISK,         /* name of this vdev type */
 289         B_TRUE                  /* leaf vdev */
 290 };
 291 
 292 #endif