1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011, 2015 by Delphix. All rights reserved. 24 */ 25 26 #include <sys/zfs_context.h> 27 #include <sys/spa.h> 28 #include <sys/spa_impl.h> 29 #include <sys/vdev_file.h> 30 #include <sys/vdev_impl.h> 31 #include <sys/zio.h> 32 #include <sys/fs/zfs.h> 33 #include <sys/fm/fs/zfs.h> 34 #include <sys/abd.h> 35 36 /* 37 * Virtual device vector for files. 38 */ 39 40 static void 41 vdev_file_hold(vdev_t *vd) 42 { 43 ASSERT(vd->vdev_path != NULL); 44 } 45 46 static void 47 vdev_file_rele(vdev_t *vd) 48 { 49 ASSERT(vd->vdev_path != NULL); 50 } 51 52 static int 53 vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, 54 uint64_t *ashift) 55 { 56 vdev_file_t *vf; 57 vnode_t *vp; 58 vattr_t vattr; 59 int error; 60 61 /* 62 * Rotational optimizations only make sense on block devices 63 */ 64 vd->vdev_nonrot = B_TRUE; 65 66 /* 67 * We must have a pathname, and it must be absolute. 68 */ 69 if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { 70 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 71 return (SET_ERROR(EINVAL)); 72 } 73 74 /* 75 * Reopen the device if it's not currently open. Otherwise, 76 * just update the physical size of the device. 77 */ 78 if (vd->vdev_tsd != NULL) { 79 ASSERT(vd->vdev_reopening); 80 vf = vd->vdev_tsd; 81 goto skip_open; 82 } 83 84 vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP); 85 86 /* 87 * We always open the files from the root of the global zone, even if 88 * we're in a local zone. If the user has gotten to this point, the 89 * administrator has already decided that the pool should be available 90 * to local zone users, so the underlying devices should be as well. 91 */ 92 ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/'); 93 error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE, 94 spa_mode(vd->vdev_spa) | FOFFMAX, 0, &vp, 0, 0, rootdir, -1); 95 96 if (error) { 97 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; 98 return (error); 99 } 100 101 vf->vf_vnode = vp; 102 103 #ifdef _KERNEL 104 /* 105 * Make sure it's a regular file. 106 */ 107 if (vp->v_type != VREG) { 108 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; 109 return (SET_ERROR(ENODEV)); 110 } 111 #endif 112 113 skip_open: 114 /* 115 * Determine the physical size of the file. 116 */ 117 vattr.va_mask = AT_SIZE; 118 error = VOP_GETATTR(vf->vf_vnode, &vattr, 0, kcred, NULL); 119 if (error) { 120 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; 121 return (error); 122 } 123 124 *max_psize = *psize = vattr.va_size; 125 *ashift = SPA_MINBLOCKSHIFT; 126 127 return (0); 128 } 129 130 static void 131 vdev_file_close(vdev_t *vd) 132 { 133 vdev_file_t *vf = vd->vdev_tsd; 134 135 if (vd->vdev_reopening || vf == NULL) 136 return; 137 138 if (vf->vf_vnode != NULL) { 139 (void) VOP_PUTPAGE(vf->vf_vnode, 0, 0, B_INVAL, kcred, NULL); 140 (void) VOP_CLOSE(vf->vf_vnode, spa_mode(vd->vdev_spa), 1, 0, 141 kcred, NULL); 142 VN_RELE(vf->vf_vnode); 143 } 144 145 vd->vdev_delayed_close = B_FALSE; 146 kmem_free(vf, sizeof (vdev_file_t)); 147 vd->vdev_tsd = NULL; 148 } 149 150 /* 151 * Implements the interrupt side for file vdev types. This routine will be 152 * called when the I/O completes allowing us to transfer the I/O to the 153 * interrupt taskqs. For consistency, the code structure mimics disk vdev 154 * types. 155 */ 156 static void 157 vdev_file_io_intr(buf_t *bp) 158 { 159 vdev_buf_t *vb = (vdev_buf_t *)bp; 160 zio_t *zio = vb->vb_io; 161 162 zio->io_error = (geterror(bp) != 0 ? EIO : 0); 163 if (zio->io_error == 0 && bp->b_resid != 0) 164 zio->io_error = SET_ERROR(ENOSPC); 165 166 if (zio->io_type == ZIO_TYPE_READ) { 167 abd_return_buf_copy(zio->io_abd, bp->b_un.b_addr, zio->io_size); 168 } else { 169 abd_return_buf(zio->io_abd, bp->b_un.b_addr, zio->io_size); 170 } 171 172 kmem_free(vb, sizeof (vdev_buf_t)); 173 zio_delay_interrupt(zio); 174 } 175 176 static void 177 vdev_file_io_strategy(void *arg) 178 { 179 buf_t *bp = arg; 180 vnode_t *vp = bp->b_private; 181 ssize_t resid; 182 int error; 183 184 error = vn_rdwr((bp->b_flags & B_READ) ? UIO_READ : UIO_WRITE, 185 vp, bp->b_un.b_addr, bp->b_bcount, ldbtob(bp->b_lblkno), 186 UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); 187 188 if (error == 0) { 189 bp->b_resid = resid; 190 biodone(bp); 191 } else { 192 bioerror(bp, error); 193 biodone(bp); 194 } 195 } 196 197 static void 198 vdev_file_io_start(zio_t *zio) 199 { 200 vdev_t *vd = zio->io_vd; 201 vdev_file_t *vf = vd->vdev_tsd; 202 vdev_buf_t *vb; 203 buf_t *bp; 204 205 if (zio->io_type == ZIO_TYPE_IOCTL) { 206 /* XXPOLICY */ 207 if (!vdev_readable(vd)) { 208 zio->io_error = SET_ERROR(ENXIO); 209 zio_interrupt(zio); 210 return; 211 } 212 213 switch (zio->io_cmd) { 214 case DKIOCFLUSHWRITECACHE: 215 zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC, 216 kcred, NULL); 217 break; 218 default: 219 zio->io_error = SET_ERROR(ENOTSUP); 220 } 221 222 zio_execute(zio); 223 return; 224 } 225 226 ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); 227 zio->io_target_timestamp = zio_handle_io_delay(zio); 228 229 vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP); 230 231 vb->vb_io = zio; 232 bp = &vb->vb_buf; 233 234 bioinit(bp); 235 bp->b_flags = (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE); 236 bp->b_bcount = zio->io_size; 237 238 if (zio->io_type == ZIO_TYPE_READ) { 239 bp->b_un.b_addr = 240 abd_borrow_buf(zio->io_abd, zio->io_size); 241 } else { 242 bp->b_un.b_addr = 243 abd_borrow_buf_copy(zio->io_abd, zio->io_size); 244 } 245 246 bp->b_lblkno = lbtodb(zio->io_offset); 247 bp->b_bufsize = zio->io_size; 248 bp->b_private = vf->vf_vnode; 249 bp->b_iodone = (int (*)())vdev_file_io_intr; 250 251 VERIFY3U(taskq_dispatch(system_taskq, vdev_file_io_strategy, bp, 252 TQ_SLEEP), !=, 0); 253 } 254 255 /* ARGSUSED */ 256 static void 257 vdev_file_io_done(zio_t *zio) 258 { 259 } 260 261 vdev_ops_t vdev_file_ops = { 262 vdev_file_open, 263 vdev_file_close, 264 vdev_default_asize, 265 vdev_file_io_start, 266 vdev_file_io_done, 267 NULL, 268 vdev_file_hold, 269 vdev_file_rele, 270 VDEV_TYPE_FILE, /* name of this vdev type */ 271 B_TRUE /* leaf vdev */ 272 }; 273 274 /* 275 * From userland we access disks just like files. 276 */ 277 #ifndef _KERNEL 278 279 vdev_ops_t vdev_disk_ops = { 280 vdev_file_open, 281 vdev_file_close, 282 vdev_default_asize, 283 vdev_file_io_start, 284 vdev_file_io_done, 285 NULL, 286 vdev_file_hold, 287 vdev_file_rele, 288 VDEV_TYPE_DISK, /* name of this vdev type */ 289 B_TRUE /* leaf vdev */ 290 }; 291 292 #endif