7127 remove -Wno-missing-braces from Makefile.uts
1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26
27 #include <sys/debug.h>
28 #include <sys/types.h>
29 #include <sys/file.h>
30 #include <sys/errno.h>
31 #include <sys/uio.h>
32 #include <sys/open.h>
33 #include <sys/cred.h>
34 #include <sys/kmem.h>
35 #include <sys/conf.h>
36 #include <sys/cmn_err.h>
37 #include <sys/modctl.h>
38 #include <sys/disp.h>
39 #include <sys/atomic.h>
40 #include <sys/filio.h>
41 #include <sys/stat.h> /* needed for S_IFBLK and S_IFCHR */
42 #include <sys/kstat.h>
43
44 #include <sys/ddi.h>
45 #include <sys/devops.h>
46 #include <sys/sunddi.h>
47 #include <sys/esunddi.h>
48 #include <sys/priv_names.h>
49
50 #include <sys/fssnap.h>
51 #include <sys/fssnap_if.h>
52
53 /*
54 * This module implements the file system snapshot code, which provides a
55 * point-in-time image of a file system for the purposes of online backup.
56 * There are essentially two parts to this project: the driver half and the
57 * file system half. The driver half is a pseudo device driver called
58 * "fssnap" that represents the snapshot. Each snapshot is assigned a
59 * number that corresponds to the minor number of the device, and a control
60 * device with a high minor number is used to initiate snapshot creation and
61 * deletion. For all practical purposes the driver half acts like a
62 * read-only disk device whose contents are exactly the same as the master
63 * file system at the time the snapshot was created.
64 *
65 * The file system half provides interfaces necessary for performing the
66 * file system dependent operations required to create and delete snapshots
67 * and a special driver strategy routine that must always be used by the file
68 * system for snapshots to work correctly.
69 *
70 * When a snapshot is to be created, the user utility will send an ioctl to
71 * the control device of the driver half specifying the file system to be
72 * snapshotted, the file descriptor of a backing-store file which is used to
73 * hold old data before it is overwritten, and other snapshot parameters.
74 * This ioctl is passed on to the file system specified in the original
75 * ioctl request. The file system is expected to be able to flush
76 * everything out to make the file system consistent and lock it to ensure
77 * no changes occur while the snapshot is being created. It then calls
78 * fssnap_create() to create state for a new snapshot, from which an opaque
79 * handle is returned with the snapshot locked. Next, the file system must
80 * populate the "candidate bitmap", which tells the snapshot code which
81 * "chunks" should be considered for copy-on-write (a chunk is the unit of
82 * granularity used for copy-on-write, which is independent of the device
83 * and file system block sizes). This is typically done by scanning the
84 * file system allocation bitmaps to determine which chunks contain
85 * allocated blocks in the file system at the time the snapshot was created.
86 * If a chunk has no allocated blocks, it does not need to be copied before
87 * being written to. Once the candidate bitmap is populated with
88 * fssnap_set_candidate(), the file system calls fssnap_create_done() to
89 * complete the snapshot creation and unlock the snapshot. The file system
90 * may now be unlocked and modifications to it resumed.
91 *
92 * Once a snapshot is created, the file system must perform all writes
93 * through a special strategy routine, fssnap_strategy(). This strategy
94 * routine determines whether the chunks contained by the write must be
95 * copied before being overwritten by consulting the candidate bitmap
96 * described above, and the "hastrans bitmap" which tells it whether the chunk
97 * has been copied already or not. If the chunk is a candidate but has not
98 * been copied, it reads the old data in and adds it to a queue. The
99 * old data can then be overwritten with the new data. An asynchronous
100 * task queue is dispatched for each old chunk read in which writes the old
101 * data to the backing file specified at snapshot creation time. The
102 * backing file is a sparse file the same size as the file system that
103 * contains the old data at the offset that data originally had in the
104 * file system. If the queue containing in-memory chunks gets too large,
105 * writes to the file system may be throttled by a semaphore until the
106 * task queues have a chance to push some of the chunks to the backing file.
107 *
108 * With the candidate bitmap, the hastrans bitmap, the data on the master
109 * file system, and the old data in memory and in the backing file, the
110 * snapshot pseudo-driver can piece together the original file system
111 * information to satisfy read requests. If the requested chunk is not a
112 * candidate, it returns a zeroed buffer. If the chunk is a candidate but
113 * has not been copied it reads it from the master file system. If it is a
114 * candidate and has been copied, it either copies the data from the
115 * in-memory queue or it reads it in from the backing file. The result is
116 * a replication of the original file system that can be backed up, mounted,
117 * or manipulated by other file system utilities that work on a read-only
118 * device.
119 *
120 * This module is divided into three roughly logical sections:
121 *
122 * - The snapshot driver, which is a character/block driver
123 * representing the snapshot itself. These routines are
124 * prefixed with "snap_".
125 *
126 * - The library routines that are defined in fssnap_if.h that
127 * are used by file systems that use this snapshot implementation.
128 * These functions are prefixed with "fssnap_" and are called through
129 * a function vector from the file system.
130 *
131 * - The helper routines used by the snapshot driver and the fssnap
132 * library routines for managing the translation table and other
133 * useful functions. These routines are all static and are
134 * prefixed with either "fssnap_" or "transtbl_" if they
135 * are specifically used for translation table activities.
136 */
137
138 static dev_info_t *fssnap_dip = NULL;
139 static struct snapshot_id *snapshot = NULL;
140 static struct snapshot_id snap_ctl;
141 static int num_snapshots = 0;
142 static kmutex_t snapshot_mutex;
143 static char snapname[] = SNAP_NAME;
144
145 /* "tunable" parameters */
146 static int fssnap_taskq_nthreads = FSSNAP_TASKQ_THREADS;
147 static uint_t fssnap_max_mem_chunks = FSSNAP_MAX_MEM_CHUNKS;
148 static int fssnap_taskq_maxtasks = FSSNAP_TASKQ_MAXTASKS;
149
150 /* static function prototypes */
151
152 /* snapshot driver */
153 static int snap_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
154 static int snap_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
155 static int snap_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
156 static int snap_open(dev_t *devp, int flag, int otyp, cred_t *cred);
157 static int snap_close(dev_t dev, int flag, int otyp, cred_t *cred);
158 static int snap_strategy(struct buf *bp);
159 static int snap_read(dev_t dev, struct uio *uiop, cred_t *credp);
160 static int snap_print(dev_t dev, char *str);
161 static int snap_ioctl(dev_t dev, int cmd, intptr_t arg, int mode,
162 cred_t *credp, int *rvalp);
163 static int snap_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op,
164 int flags, char *name, caddr_t valuep, int *lengthp);
165 static int snap_getchunk(struct snapshot_id *sidp, chunknumber_t chunk,
166 int offset, int len, char *buffer);
167
168
169 /* fssnap interface implementations (see fssnap_if.h) */
170 static void fssnap_strategy_impl(void *, struct buf *);
171 static void *fssnap_create_impl(chunknumber_t, uint_t, u_offset_t,
172 struct vnode *, int, struct vnode **, char *, u_offset_t);
173 static void fssnap_set_candidate_impl(void *, chunknumber_t);
174 static int fssnap_is_candidate_impl(void *, u_offset_t);
175 static int fssnap_create_done_impl(void *);
176 static int fssnap_delete_impl(void *);
177
178 /* fssnap interface support routines */
179 static int fssnap_translate(struct snapshot_id **, struct buf *);
180 static void fssnap_write_taskq(void *);
181 static void fssnap_create_kstats(snapshot_id_t *, int, const char *,
182 const char *);
183 static int fssnap_update_kstat_num(kstat_t *, int);
184 static void fssnap_delete_kstats(struct cow_info *);
185
186 /* translation table prototypes */
187 static cow_map_node_t *transtbl_add(cow_map_t *, chunknumber_t, caddr_t);
188 static cow_map_node_t *transtbl_get(cow_map_t *, chunknumber_t);
189 static void transtbl_delete(cow_map_t *, cow_map_node_t *);
190 static void transtbl_free(cow_map_t *);
191
192 static kstat_t *fssnap_highwater_kstat;
193
194 /* ************************************************************************ */
195
196 /* Device and Module Structures */
197
198 static struct cb_ops snap_cb_ops = {
199 snap_open,
200 snap_close,
201 snap_strategy,
202 snap_print,
203 nodev, /* no snap_dump */
204 snap_read,
205 nodev, /* no snap_write */
206 snap_ioctl,
207 nodev, /* no snap_devmap */
208 nodev, /* no snap_mmap */
209 nodev, /* no snap_segmap */
210 nochpoll,
211 snap_prop_op,
212 NULL, /* streamtab */
213 D_64BIT | D_NEW | D_MP, /* driver compatibility */
214 CB_REV,
215 nodev, /* async I/O read entry point */
216 nodev /* async I/O write entry point */
217 };
218
219 static struct dev_ops snap_ops = {
220 DEVO_REV,
221 0, /* ref count */
222 snap_getinfo,
223 nulldev, /* snap_identify obsolete */
224 nulldev, /* no snap_probe */
225 snap_attach,
226 snap_detach,
227 nodev, /* no snap_reset */
228 &snap_cb_ops,
229 (struct bus_ops *)NULL,
230 nulldev, /* no snap_power() */
231 ddi_quiesce_not_needed, /* quiesce */
232 };
233
234 extern struct mod_ops mod_driverops;
235
236 static struct modldrv md = {
237 &mod_driverops, /* Type of module. This is a driver */
238 "snapshot driver", /* Name of the module */
239 &snap_ops,
240 };
241
242 static struct modlinkage ml = {
243 MODREV_1,
244 { &md, NULL }
245 };
246
247 static void *statep;
248
249 int
250 _init(void)
251 {
252 int error;
253 kstat_t *ksp;
254 kstat_named_t *ksdata;
255
256 error = ddi_soft_state_init(&statep, sizeof (struct snapshot_id *), 1);
257 if (error) {
258 cmn_err(CE_WARN, "_init: failed to init ddi_soft_state.");
259 return (error);
260 }
261
262 error = mod_install(&ml);
263
264 if (error) {
265 cmn_err(CE_WARN, "_init: failed to mod_install.");
266 ddi_soft_state_fini(&statep);
267 return (error);
268 }
269
270 /*
271 * Fill in the snapshot operations vector for file systems
272 * (defined in fssnap_if.c)
273 */
274
275 snapops.fssnap_create = fssnap_create_impl;
276 snapops.fssnap_set_candidate = fssnap_set_candidate_impl;
277 snapops.fssnap_is_candidate = fssnap_is_candidate_impl;
278 snapops.fssnap_create_done = fssnap_create_done_impl;
279 snapops.fssnap_delete = fssnap_delete_impl;
280 snapops.fssnap_strategy = fssnap_strategy_impl;
281
282 mutex_init(&snapshot_mutex, NULL, MUTEX_DEFAULT, NULL);
283
284 /*
285 * Initialize the fssnap highwater kstat
286 */
287 ksp = kstat_create(snapname, 0, FSSNAP_KSTAT_HIGHWATER, "misc",
288 KSTAT_TYPE_NAMED, 1, 0);
289 if (ksp != NULL) {
290 ksdata = (kstat_named_t *)ksp->ks_data;
291 kstat_named_init(ksdata, FSSNAP_KSTAT_HIGHWATER,
292 KSTAT_DATA_UINT32);
293 ksdata->value.ui32 = 0;
294 kstat_install(ksp);
295 } else {
296 cmn_err(CE_WARN, "_init: failed to create highwater kstat.");
297 }
298 fssnap_highwater_kstat = ksp;
299
300 return (0);
301 }
302
303 int
304 _info(struct modinfo *modinfop)
305 {
306 return (mod_info(&ml, modinfop));
307 }
308
309 int
310 _fini(void)
311 {
312 int error;
313
314 error = mod_remove(&ml);
315 if (error)
316 return (error);
317 ddi_soft_state_fini(&statep);
318
319 /*
320 * delete the fssnap highwater kstat
321 */
322 kstat_delete(fssnap_highwater_kstat);
323
324 mutex_destroy(&snapshot_mutex);
325
326 /* Clear out the file system operations vector */
327 snapops.fssnap_create = NULL;
328 snapops.fssnap_set_candidate = NULL;
329 snapops.fssnap_create_done = NULL;
330 snapops.fssnap_delete = NULL;
331 snapops.fssnap_strategy = NULL;
332
333 return (0);
334 }
335
336 /* ************************************************************************ */
337
338 /*
339 * Snapshot Driver Routines
340 *
341 * This section implements the snapshot character and block drivers. The
342 * device will appear to be a consistent read-only file system to
343 * applications that wish to back it up or mount it. The snapshot driver
344 * communicates with the file system through the translation table, which
345 * tells the snapshot driver where to find the data necessary to piece
346 * together the frozen file system. The data may either be on the master
347 * device (no translation exists), in memory (a translation exists but has
348 * not been flushed to the backing store), or in the backing store file.
349 * The read request may require the snapshot driver to retrieve data from
350 * several different places and piece it together to look like a single
351 * contiguous read.
352 *
353 * The device minor number corresponds to the snapshot number in the list of
354 * snapshot identifiers. The soft state for each minor number is simply a
355 * pointer to the snapshot id, which holds all of the snapshot state. One
356 * minor number is designated as the control device. All snapshot create
357 * and delete requests go through the control device to ensure this module
358 * is properly loaded and attached before the file system starts calling
359 * routines defined here.
360 */
361
362
363 /*
364 * snap_getinfo() - snapshot driver getinfo(9E) routine
365 *
366 */
367 /*ARGSUSED*/
368 static int
369 snap_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
370 {
371 switch (infocmd) {
372 case DDI_INFO_DEVT2DEVINFO:
373 *result = fssnap_dip;
374 return (DDI_SUCCESS);
375 case DDI_INFO_DEVT2INSTANCE:
376 *result = 0; /* we only have one instance */
377 return (DDI_SUCCESS);
378 }
379 return (DDI_FAILURE);
380 }
381
382 /*
383 * snap_attach() - snapshot driver attach(9E) routine
384 *
385 * sets up snapshot control device and control state. The control state
386 * is a pointer to an "anonymous" snapshot_id for tracking opens and closes
387 */
388 static int
389 snap_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
390 {
391 int error;
392
393 switch (cmd) {
394 case DDI_ATTACH:
395 /* create the control device */
396 error = ddi_create_priv_minor_node(dip, SNAP_CTL_NODE, S_IFCHR,
397 SNAP_CTL_MINOR, DDI_PSEUDO, PRIVONLY_DEV,
398 PRIV_SYS_CONFIG, PRIV_SYS_CONFIG, 0666);
399 if (error == DDI_FAILURE) {
400 return (DDI_FAILURE);
401 }
402
403 rw_init(&snap_ctl.sid_rwlock, NULL, RW_DEFAULT, NULL);
404 rw_enter(&snap_ctl.sid_rwlock, RW_WRITER);
405 fssnap_dip = dip;
406 snap_ctl.sid_snapnumber = SNAP_CTL_MINOR;
407 /* the control sid is not linked into the snapshot list */
408 snap_ctl.sid_next = NULL;
409 snap_ctl.sid_cowinfo = NULL;
410 snap_ctl.sid_flags = 0;
411 rw_exit(&snap_ctl.sid_rwlock);
412 ddi_report_dev(dip);
413
414 return (DDI_SUCCESS);
415 case DDI_PM_RESUME:
416 return (DDI_SUCCESS);
417
418 case DDI_RESUME:
419 return (DDI_SUCCESS);
420
421 default:
422 return (DDI_FAILURE);
423 }
424 }
425
426 /*
427 * snap_detach() - snapshot driver detach(9E) routine
428 *
429 * destroys snapshot control device and control state. If any snapshots
430 * are active (ie. num_snapshots != 0), the device will refuse to detach.
431 */
432 static int
433 snap_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
434 {
435 struct snapshot_id *sidp, *sidnextp;
436
437 switch (cmd) {
438 case DDI_DETACH:
439 /* do not detach if the device is active */
440 mutex_enter(&snapshot_mutex);
441 if ((num_snapshots != 0) ||
442 ((snap_ctl.sid_flags & SID_CHAR_BUSY) != 0)) {
443 mutex_exit(&snapshot_mutex);
444 return (DDI_FAILURE);
445 }
446
447 /* free up the snapshot list */
448 for (sidp = snapshot; sidp != NULL; sidp = sidnextp) {
449 ASSERT(SID_AVAILABLE(sidp) &&
450 !RW_LOCK_HELD(&sidp->sid_rwlock));
451 sidnextp = sidp->sid_next;
452 rw_destroy(&sidp->sid_rwlock);
453 kmem_free(sidp, sizeof (struct snapshot_id));
454 }
455 snapshot = NULL;
456
457 /* delete the control device */
458 ddi_remove_minor_node(dip, SNAP_CTL_NODE);
459 fssnap_dip = NULL;
460
461 ASSERT((snap_ctl.sid_flags & SID_CHAR_BUSY) == 0);
462 rw_destroy(&snap_ctl.sid_rwlock);
463 mutex_exit(&snapshot_mutex);
464
465 return (DDI_SUCCESS);
466
467 default:
468 return (DDI_FAILURE);
469 }
470 }
471
472 /*
473 * snap_open() - snapshot driver open(9E) routine
474 *
475 * marks the snapshot id as busy so it will not be recycled when deleted
476 * until the snapshot is closed.
477 */
478 /* ARGSUSED */
479 static int
480 snap_open(dev_t *devp, int flag, int otyp, cred_t *cred)
481 {
482 minor_t minor;
483 struct snapshot_id **sidpp, *sidp;
484
485 /* snapshots are read-only */
486 if (flag & FWRITE)
487 return (EROFS);
488
489 minor = getminor(*devp);
490
491 if (minor == SNAP_CTL_MINOR) {
492 /* control device must be opened exclusively */
493 if (((flag & FEXCL) != FEXCL) || (otyp != OTYP_CHR))
494 return (EINVAL);
495
496 rw_enter(&snap_ctl.sid_rwlock, RW_WRITER);
497 if ((snap_ctl.sid_flags & SID_CHAR_BUSY) != 0) {
498 rw_exit(&snap_ctl.sid_rwlock);
499 return (EBUSY);
500 }
501
502 snap_ctl.sid_flags |= SID_CHAR_BUSY;
503 rw_exit(&snap_ctl.sid_rwlock);
504
505 return (0);
506 }
507
508 sidpp = ddi_get_soft_state(statep, minor);
509 if (sidpp == NULL || *sidpp == NULL)
510 return (ENXIO);
511 sidp = *sidpp;
512 rw_enter(&sidp->sid_rwlock, RW_WRITER);
513
514 if ((flag & FEXCL) && SID_BUSY(sidp)) {
515 rw_exit(&sidp->sid_rwlock);
516 return (EAGAIN);
517 }
518
519 ASSERT(sidpp != NULL && sidp != NULL);
520 /* check to see if this snapshot has been killed on us */
521 if (SID_INACTIVE(sidp)) {
522 cmn_err(CE_WARN, "snap_open: snapshot %d does not exist.",
523 minor);
524 rw_exit(&sidp->sid_rwlock);
525 return (ENXIO);
526 }
527
528 switch (otyp) {
529 case OTYP_CHR:
530 sidp->sid_flags |= SID_CHAR_BUSY;
531 break;
532 case OTYP_BLK:
533 sidp->sid_flags |= SID_BLOCK_BUSY;
534 break;
535 default:
536 rw_exit(&sidp->sid_rwlock);
537 return (EINVAL);
538 }
539
540 rw_exit(&sidp->sid_rwlock);
541
542 /*
543 * at this point if a valid snapshot was found then it has
544 * been marked busy and we can use it.
545 */
546 return (0);
547 }
548
549 /*
550 * snap_close() - snapshot driver close(9E) routine
551 *
552 * unsets the busy bits in the snapshot id. If the snapshot has been
553 * deleted while the snapshot device was open, the close call will clean
554 * up the remaining state information.
555 */
556 /* ARGSUSED */
557 static int
558 snap_close(dev_t dev, int flag, int otyp, cred_t *cred)
559 {
560 struct snapshot_id **sidpp, *sidp;
561 minor_t minor;
562 char name[20];
563
564 minor = getminor(dev);
565
566 /* if this is the control device, close it and return */
567 if (minor == SNAP_CTL_MINOR) {
568 rw_enter(&snap_ctl.sid_rwlock, RW_WRITER);
569 snap_ctl.sid_flags &= ~(SID_CHAR_BUSY);
570 rw_exit(&snap_ctl.sid_rwlock);
571 return (0);
572 }
573
574 sidpp = ddi_get_soft_state(statep, minor);
575 if (sidpp == NULL || *sidpp == NULL) {
576 cmn_err(CE_WARN, "snap_close: could not find state for "
577 "snapshot %d.", minor);
578 return (ENXIO);
579 }
580 sidp = *sidpp;
581 mutex_enter(&snapshot_mutex);
582 rw_enter(&sidp->sid_rwlock, RW_WRITER);
583
584 /* Mark the snapshot as not being busy anymore */
585 switch (otyp) {
586 case OTYP_CHR:
587 sidp->sid_flags &= ~(SID_CHAR_BUSY);
588 break;
589 case OTYP_BLK:
590 sidp->sid_flags &= ~(SID_BLOCK_BUSY);
591 break;
592 default:
593 mutex_exit(&snapshot_mutex);
594 rw_exit(&sidp->sid_rwlock);
595 return (EINVAL);
596 }
597
598 if (SID_AVAILABLE(sidp)) {
599 /*
600 * if this is the last close on a snapshot that has been
601 * deleted, then free up the soft state. The snapdelete
602 * ioctl does not free this when the device is in use so
603 * we do it here after the last reference goes away.
604 */
605
606 /* remove the device nodes */
607 ASSERT(fssnap_dip != NULL);
608 (void) snprintf(name, sizeof (name), "%d",
609 sidp->sid_snapnumber);
610 ddi_remove_minor_node(fssnap_dip, name);
611 (void) snprintf(name, sizeof (name), "%d,raw",
612 sidp->sid_snapnumber);
613 ddi_remove_minor_node(fssnap_dip, name);
614
615 /* delete the state structure */
616 ddi_soft_state_free(statep, sidp->sid_snapnumber);
617 num_snapshots--;
618 }
619
620 mutex_exit(&snapshot_mutex);
621 rw_exit(&sidp->sid_rwlock);
622
623 return (0);
624 }
625
626 /*
627 * snap_read() - snapshot driver read(9E) routine
628 *
629 * reads data from the snapshot by calling snap_strategy() through physio()
630 */
631 /* ARGSUSED */
632 static int
633 snap_read(dev_t dev, struct uio *uiop, cred_t *credp)
634 {
635 minor_t minor;
636 struct snapshot_id **sidpp;
637
638 minor = getminor(dev);
639 sidpp = ddi_get_soft_state(statep, minor);
640 if (sidpp == NULL || *sidpp == NULL) {
641 cmn_err(CE_WARN,
642 "snap_read: could not find state for snapshot %d.", minor);
643 return (ENXIO);
644 }
645 return (physio(snap_strategy, NULL, dev, B_READ, minphys, uiop));
646 }
647
648 /*
649 * snap_strategy() - snapshot driver strategy(9E) routine
650 *
651 * cycles through each chunk in the requested buffer and calls
652 * snap_getchunk() on each chunk to retrieve it from the appropriate
653 * place. Once all of the parts are put together the requested buffer
654 * is returned. The snapshot driver is read-only, so a write is invalid.
655 */
656 static int
657 snap_strategy(struct buf *bp)
658 {
659 struct snapshot_id **sidpp, *sidp;
660 minor_t minor;
661 chunknumber_t chunk;
662 int off, len;
663 u_longlong_t reqptr;
664 int error = 0;
665 size_t chunksz;
666 caddr_t buf;
667
668 /* snapshot device is read-only */
669 if (bp->b_flags & B_WRITE) {
670 bioerror(bp, EROFS);
671 bp->b_resid = bp->b_bcount;
672 biodone(bp);
673 return (0);
674 }
675
676 minor = getminor(bp->b_edev);
677 sidpp = ddi_get_soft_state(statep, minor);
678 if (sidpp == NULL || *sidpp == NULL) {
679 cmn_err(CE_WARN,
680 "snap_strategy: could not find state for snapshot %d.",
681 minor);
682 bioerror(bp, ENXIO);
683 bp->b_resid = bp->b_bcount;
684 biodone(bp);
685 return (0);
686 }
687 sidp = *sidpp;
688 ASSERT(sidp);
689 rw_enter(&sidp->sid_rwlock, RW_READER);
690
691 if (SID_INACTIVE(sidp)) {
692 bioerror(bp, ENXIO);
693 bp->b_resid = bp->b_bcount;
694 biodone(bp);
695 rw_exit(&sidp->sid_rwlock);
696 return (0);
697 }
698
699 if (bp->b_flags & (B_PAGEIO|B_PHYS))
700 bp_mapin(bp);
701
702 bp->b_resid = bp->b_bcount;
703 ASSERT(bp->b_un.b_addr);
704 buf = bp->b_un.b_addr;
705
706 chunksz = sidp->sid_cowinfo->cow_map.cmap_chunksz;
707
708 /* reqptr is the current DEV_BSIZE offset into the device */
709 /* chunk is the chunk containing reqptr */
710 /* len is the length of the request (in the current chunk) in bytes */
711 /* off is the byte offset into the current chunk */
712 reqptr = bp->b_lblkno;
713 while (bp->b_resid > 0) {
714 chunk = dbtocowchunk(&sidp->sid_cowinfo->cow_map, reqptr);
715 off = (reqptr % (chunksz >> DEV_BSHIFT)) << DEV_BSHIFT;
716 len = min(chunksz - off, bp->b_resid);
717 ASSERT((off + len) <= chunksz);
718
719 if ((error = snap_getchunk(sidp, chunk, off, len, buf)) != 0) {
720 /*
721 * EINVAL means the user tried to go out of range.
722 * Anything else means it's likely that we're
723 * confused.
724 */
725 if (error != EINVAL) {
726 cmn_err(CE_WARN, "snap_strategy: error "
727 "calling snap_getchunk, chunk = %llu, "
728 "offset = %d, len = %d, resid = %lu, "
729 "error = %d.",
730 chunk, off, len, bp->b_resid, error);
731 }
732 bioerror(bp, error);
733 biodone(bp);
734 rw_exit(&sidp->sid_rwlock);
735 return (0);
736 }
737 bp->b_resid -= len;
738 reqptr += (len >> DEV_BSHIFT);
739 buf += len;
740 }
741
742 ASSERT(bp->b_resid == 0);
743 biodone(bp);
744
745 rw_exit(&sidp->sid_rwlock);
746 return (0);
747 }
748
749 /*
750 * snap_getchunk() - helper function for snap_strategy()
751 *
752 * gets the requested data from the appropriate place and fills in the
753 * buffer. chunk is the chunk number of the request, offset is the
754 * offset into that chunk and must be less than the chunk size. len is
755 * the length of the request starting at offset, and must not exceed a
756 * chunk boundary. buffer is the address to copy the data to. len
757 * bytes are copied into the buffer starting at the location specified.
758 *
759 * A chunk is located according to the following algorithm:
760 * - If the chunk does not have a translation or is not a candidate
761 * for translation, it is read straight from the master device.
762 * - If the chunk does have a translation, then it is either on
763 * disk or in memory:
764 * o If it is in memory the requested data is simply copied out
765 * of the in-memory buffer.
766 * o If it is in the backing store, it is read from there.
767 *
768 * This function does the real work of the snapshot driver.
769 */
770 static int
771 snap_getchunk(struct snapshot_id *sidp, chunknumber_t chunk, int offset,
772 int len, char *buffer)
773 {
774 cow_map_t *cmap = &sidp->sid_cowinfo->cow_map;
775 cow_map_node_t *cmn;
776 struct buf *snapbuf;
777 int error = 0;
778 char *newbuffer;
779 int newlen = 0;
780 int partial = 0;
781
782 ASSERT(RW_READ_HELD(&sidp->sid_rwlock));
783 ASSERT(offset + len <= cmap->cmap_chunksz);
784
785 /*
786 * Check if the chunk number is out of range and if so bail out
787 */
788 if (chunk >= (cmap->cmap_bmsize * NBBY)) {
789 return (EINVAL);
790 }
791
792 /*
793 * If the chunk is not a candidate for translation, then the chunk
794 * was not allocated when the snapshot was taken. Since it does
795 * not contain data associated with this snapshot, just return a
796 * zero buffer instead.
797 */
798 if (isclr(cmap->cmap_candidate, chunk)) {
799 bzero(buffer, len);
800 return (0);
801 }
802
803 /*
804 * if the chunk is a candidate for translation but a
805 * translation does not exist, then read through to the
806 * original file system. The rwlock is held until the read
807 * completes if it hasn't been translated to make sure the
808 * file system does not translate the block before we
809 * access it. If it has already been translated we don't
810 * need the lock, because the translation will never go away.
811 */
812 rw_enter(&cmap->cmap_rwlock, RW_READER);
813 if (isclr(cmap->cmap_hastrans, chunk)) {
814 snapbuf = getrbuf(KM_SLEEP);
815 /*
816 * Reading into the buffer saves having to do a copy,
817 * but gets tricky if the request size is not a
818 * multiple of DEV_BSIZE. However, we are filling the
819 * buffer left to right, so future reads will write
820 * over any extra data we might have read.
821 */
822
823 partial = len % DEV_BSIZE;
824
825 snapbuf->b_bcount = len;
826 snapbuf->b_lblkno = lbtodb(chunk * cmap->cmap_chunksz + offset);
827 snapbuf->b_un.b_addr = buffer;
828
829 snapbuf->b_iodone = NULL;
830 snapbuf->b_proc = NULL; /* i.e. the kernel */
831 snapbuf->b_flags = B_READ | B_BUSY;
832 snapbuf->b_edev = sidp->sid_fvp->v_vfsp->vfs_dev;
833
834 if (partial) {
835 /*
836 * Partial block read in progress.
837 * This is bad as modules further down the line
838 * assume buf's are exact multiples of DEV_BSIZE
839 * and we end up with fewer, or zero, bytes read.
840 * To get round this we need to round up to the
841 * nearest full block read and then return only
842 * len bytes.
843 */
844 newlen = (len - partial) + DEV_BSIZE;
845 newbuffer = kmem_alloc(newlen, KM_SLEEP);
846
847 snapbuf->b_bcount = newlen;
848 snapbuf->b_un.b_addr = newbuffer;
849 }
850
851 (void) bdev_strategy(snapbuf);
852 (void) biowait(snapbuf);
853
854 error = geterror(snapbuf);
855
856 if (partial) {
857 /*
858 * Partial block read. Now we need to bcopy the
859 * correct number of bytes back into the
860 * supplied buffer, and tidy up our temp
861 * buffer.
862 */
863 bcopy(newbuffer, buffer, len);
864 kmem_free(newbuffer, newlen);
865 }
866
867 freerbuf(snapbuf);
868 rw_exit(&cmap->cmap_rwlock);
869
870 return (error);
871 }
872
873 /*
874 * finally, if the chunk is a candidate for translation and it
875 * has been translated, then we clone the chunk of the buffer
876 * that was copied aside by the file system.
877 * The cmap_rwlock does not need to be held after we know the
878 * data has already been copied. Once a chunk has been copied
879 * to the backing file, it is stable read only data.
880 */
881 cmn = transtbl_get(cmap, chunk);
882
883 /* check whether the data is in memory or in the backing file */
884 if (cmn != NULL) {
885 ASSERT(cmn->cmn_buf);
886 /* already in memory */
887 bcopy(cmn->cmn_buf + offset, buffer, len);
888 rw_exit(&cmap->cmap_rwlock);
889 } else {
890 ssize_t resid = len;
891 int bf_index;
892 /*
893 * can cause deadlock with writer if we don't drop the
894 * cmap_rwlock before trying to get the backing store file
895 * vnode rwlock.
896 */
897 rw_exit(&cmap->cmap_rwlock);
898
899 bf_index = chunk / cmap->cmap_chunksperbf;
900
901 /* read buffer from backing file */
902 error = vn_rdwr(UIO_READ,
903 (sidp->sid_cowinfo->cow_backfile_array)[bf_index],
904 buffer, len, ((chunk % cmap->cmap_chunksperbf) *
905 cmap->cmap_chunksz) + offset, UIO_SYSSPACE, 0,
906 RLIM64_INFINITY, kcred, &resid);
907 }
908
909 return (error);
910 }
911
912 /*
913 * snap_print() - snapshot driver print(9E) routine
914 *
915 * prints the device identification string.
916 */
917 static int
918 snap_print(dev_t dev, char *str)
919 {
920 struct snapshot_id **sidpp;
921 minor_t minor;
922
923 minor = getminor(dev);
924 sidpp = ddi_get_soft_state(statep, minor);
925 if (sidpp == NULL || *sidpp == NULL) {
926 cmn_err(CE_WARN,
927 "snap_print: could not find state for snapshot %d.", minor);
928 return (ENXIO);
929 }
930
931 cmn_err(CE_NOTE, "snap_print: snapshot %d: %s", minor, str);
932
933 return (0);
934 }
935
936 /*
937 * snap_prop_op() - snapshot driver prop_op(9E) routine
938 *
939 * get 32-bit and 64-bit values for size (character driver) and nblocks
940 * (block driver).
941 */
942 static int
943 snap_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op,
944 int flags, char *name, caddr_t valuep, int *lengthp)
945 {
946 int minor;
947 struct snapshot_id **sidpp;
948 dev_t mdev;
949 dev_info_t *mdip;
950 int error;
951
952 minor = getminor(dev);
953
954 /*
955 * If this is the control device just check for .conf properties,
956 * if the wildcard DDI_DEV_T_ANY was passed in via the dev_t
957 * just fall back to the defaults.
958 */
959 if ((minor == SNAP_CTL_MINOR) || (dev == DDI_DEV_T_ANY))
960 return (ddi_prop_op(dev, dip, prop_op, flags, name,
961 valuep, lengthp));
962
963 /* check to see if there is a master device plumbed */
964 sidpp = ddi_get_soft_state(statep, minor);
965 if (sidpp == NULL || *sidpp == NULL) {
966 cmn_err(CE_WARN,
967 "snap_prop_op: could not find state for "
968 "snapshot %d.", minor);
969 return (DDI_PROP_NOT_FOUND);
970 }
971
972 if (((*sidpp)->sid_fvp == NULL) || ((*sidpp)->sid_fvp->v_vfsp == NULL))
973 return (ddi_prop_op(dev, dip, prop_op, flags, name,
974 valuep, lengthp));
975
976 /* hold master device and pass operation down */
977 mdev = (*sidpp)->sid_fvp->v_vfsp->vfs_dev;
978 if (mdip = e_ddi_hold_devi_by_dev(mdev, 0)) {
979
980 /* get size information from the master device. */
981 error = cdev_prop_op(mdev, mdip,
982 prop_op, flags, name, valuep, lengthp);
983 ddi_release_devi(mdip);
984 if (error == DDI_PROP_SUCCESS)
985 return (error);
986 }
987
988 /* master device did not service the request, try framework */
989 return (ddi_prop_op(dev, dip, prop_op, flags, name, valuep, lengthp));
990
991 }
992
/*
 * snap_ioctl() - snapshot driver ioctl(9E) routine
 *
 * only applies to the control device.  The control device accepts three
 * ioctl requests: create a snapshot (_FIOSNAPSHOTCREATE), create a
 * snapshot with multiple backing files (_FIOSNAPSHOTCREATE_MULTI), or
 * delete a snapshot (_FIOSNAPSHOTDELETE).  In each case, the vnode for
 * the requested file system is extracted, and the request is passed on
 * to the file system via the same ioctl.  The file system is responsible
 * for doing the things necessary for creating or destroying a snapshot,
 * including any file system specific operations that must be performed
 * as well as setting up and deleting the snapshot state through the
 * fssnap interfaces.
 *
 * Arguments:
 *	dev	device the ioctl was issued on; must be the control minor
 *	cmd	ioctl command, forwarded unchanged to VOP_IOCTL()
 *	arg	user address of the command structure; it is copied in
 *		here only to read rootfiledesc and is then forwarded
 *		unchanged to VOP_IOCTL()
 *	mode	used for ddi_copyin() only; VOP_IOCTL() is passed 0
 *	credp	caller credentials, forwarded to VOP_IOCTL()
 *	rvalp	return value pointer, forwarded to VOP_IOCTL()
 *
 * Returns:
 *	EINVAL	not the control device, unknown cmd, or (delete) no
 *		snapshot could be matched to the descriptor
 *	EFAULT	the argument structure could not be copied in
 *	EBADF	rootfiledesc is not an open file descriptor
 *	EBUSY	(delete) the snapshot device is mounted or being mounted
 *	otherwise the value returned by VOP_IOCTL()
 */
static int
snap_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
    int *rvalp)
{
	minor_t	minor;
	int error = 0;

	minor = getminor(dev);

	/* every request handled here must arrive on the control device */
	if (minor != SNAP_CTL_MINOR) {
		return (EINVAL);
	}

	switch (cmd) {
	case _FIOSNAPSHOTCREATE:
	{
		struct fiosnapcreate	fc;
		struct file		*fp;
		struct vnode		*vp;

		if (ddi_copyin((void *)arg, &fc, sizeof (fc), mode))
			return (EFAULT);

		/* get vnode for file system mount point */
		if ((fp = getf(fc.rootfiledesc)) == NULL)
			return (EBADF);

		/*
		 * Take our own hold on the vnode before dropping the file
		 * descriptor reference so it stays valid across VOP_IOCTL.
		 */
		ASSERT(fp->f_vnode);
		vp = fp->f_vnode;
		VN_HOLD(vp);
		releasef(fc.rootfiledesc);

		/* pass ioctl request to file system */
		error = VOP_IOCTL(vp, cmd, arg, 0, credp, rvalp, NULL);
		VN_RELE(vp);
		break;
	}
	case _FIOSNAPSHOTCREATE_MULTI:
	{
		/* same sequence as above, with the multi-backing-file struct */
		struct fiosnapcreate_multi	fc;
		struct file		*fp;
		struct vnode		*vp;

		if (ddi_copyin((void *)arg, &fc, sizeof (fc), mode))
			return (EFAULT);

		/* get vnode for file system mount point */
		if ((fp = getf(fc.rootfiledesc)) == NULL)
			return (EBADF);

		ASSERT(fp->f_vnode);
		vp = fp->f_vnode;
		VN_HOLD(vp);
		releasef(fc.rootfiledesc);

		/* pass ioctl request to file system */
		error = VOP_IOCTL(vp, cmd, arg, 0, credp, rvalp, NULL);
		VN_RELE(vp);
		break;
	}
	case _FIOSNAPSHOTDELETE:
	{
		major_t			major;
		struct fiosnapdelete	fc;
		snapshot_id_t		*sidp = NULL;
		snapshot_id_t		*sidnextp = NULL;
		struct file		*fp = NULL;
		struct vnode		*vp = NULL;
		struct vfs		*vfsp = NULL;
		vfsops_t		*vfsops = EIO_vfsops;

		if (ddi_copyin((void *)arg, &fc, sizeof (fc), mode))
			return (EFAULT);

		/* get vnode for file system mount point */
		if ((fp = getf(fc.rootfiledesc)) == NULL)
			return (EBADF);

		ASSERT(fp->f_vnode);
		vp = fp->f_vnode;
		VN_HOLD(vp);
		releasef(fc.rootfiledesc);
		/*
		 * Test for two formats of delete and set correct minor/vp:
		 * pseudo device:
		 *	fssnap -d [/dev/fssnap/x]
		 * or
		 * mount point:
		 *	fssnap -d [/mntpt]
		 * Note that minor is verified to be equal to SNAP_CTL_MINOR
		 * at this point which is an invalid minor number.
		 */
		ASSERT(fssnap_dip != NULL);
		major = ddi_driver_major(fssnap_dip);
		mutex_enter(&snapshot_mutex);
		/*
		 * Walk the snapshot list under snapshot_mutex.  The next
		 * pointer is read while the entry's sid_rwlock is held as
		 * a reader.
		 */
		for (sidp = snapshot; sidp != NULL; sidp = sidnextp) {
			rw_enter(&sidp->sid_rwlock, RW_READER);
			sidnextp = sidp->sid_next;
			/* pseudo device: */
			if (major == getmajor(vp->v_rdev)) {
				/*
				 * NOTE(review): minor is overwritten here
				 * before the match test below.  If vp is a
				 * snapshot pseudo device other than the
				 * control minor and no list entry matches,
				 * minor no longer equals SNAP_CTL_MINOR and
				 * the "Verify minor" check after the loop
				 * will not reject the request -- confirm
				 * whether that is intended.
				 */
				minor = getminor(vp->v_rdev);
				if (sidp->sid_snapnumber == (uint_t)minor &&
				    sidp->sid_fvp) {
					/*
					 * Swap the hold from the device
					 * vnode to the snapshotted file
					 * system's vnode; the ioctl is sent
					 * to the latter.
					 */
					VN_RELE(vp);
					vp = sidp->sid_fvp;
					VN_HOLD(vp);
					rw_exit(&sidp->sid_rwlock);
					break;
				}
			/* Mount point: */
			} else {
				if (sidp->sid_fvp == vp) {
					minor = sidp->sid_snapnumber;
					rw_exit(&sidp->sid_rwlock);
					break;
				}
			}
			rw_exit(&sidp->sid_rwlock);
		}
		mutex_exit(&snapshot_mutex);
		/* Verify minor got set correctly above */
		if (minor == SNAP_CTL_MINOR) {
			VN_RELE(vp);
			return (EINVAL);
		}
		/* device number of the snapshot pseudo device being deleted */
		dev = makedevice(major, minor);
		/*
		 * Create dummy vfs entry
		 * to use as a locking semaphore across the IOCTL
		 * for mount in progress cases...
		 */
		vfsp = vfs_alloc(KM_SLEEP);
		VFS_INIT(vfsp, vfsops, NULL);
		VFS_HOLD(vfsp);
		vfs_addmip(dev, vfsp);
		/* refuse to delete a snapshot that is mounted or mounting */
		if ((vfs_devmounting(dev, vfsp)) ||
		    (vfs_devismounted(dev))) {
			vfs_delmip(vfsp);
			VFS_RELE(vfsp);
			VN_RELE(vp);
			return (EBUSY);
		}
		/*
		 * Nobody mounted but do not release mount in progress lock
		 * until IOCTL complete to prohibit a mount sneaking
		 * in
		 */
		error = VOP_IOCTL(vp, cmd, arg, 0, credp, rvalp, NULL);
		vfs_delmip(vfsp);
		VFS_RELE(vfsp);
		VN_RELE(vp);
		break;
	}
	default:
		cmn_err(CE_WARN, "snap_ioctl: Invalid ioctl cmd %d, minor %d.",
		    cmd, minor);
		return (EINVAL);
	}

	return (error);
}
1166
1167
1168 /* ************************************************************************ */
1169
1170 /*
1171 * Translation Table Routines
1172 *
1173 * These support routines implement a simple doubly linked list
1174 * to keep track of chunks that are currently in memory. The maximum
1175 * size of the list is determined by the fssnap_max_mem_chunks variable.
1176 * The cmap_rwlock is used to protect the linkage of the list.
1177 */
1178
1179 /*
1180 * transtbl_add() - add a node to the translation table
1181 *
1182 * allocates a new node and points it at the buffer passed in. The node
1183 * is added to the beginning of the doubly linked list and the head of
1184 * the list is moved. The cmap_rwlock must be held as a writer through
1185 * this operation.
1186 */
1187 static cow_map_node_t *
1188 transtbl_add(cow_map_t *cmap, chunknumber_t chunk, caddr_t buf)
1189 {
1190 cow_map_node_t *cmnode;
1191
1192 ASSERT(RW_WRITE_HELD(&cmap->cmap_rwlock));
1193
1194 cmnode = kmem_alloc(sizeof (cow_map_node_t), KM_SLEEP);
1195
1196 /*
1197 * insert new translations at the beginning so cmn_table is always
1198 * the first node.
1199 */
1200 cmnode->cmn_chunk = chunk;
1201 cmnode->cmn_buf = buf;
1202 cmnode->cmn_prev = NULL;
1203 cmnode->cmn_next = cmap->cmap_table;
1204 if (cmnode->cmn_next)
1205 cmnode->cmn_next->cmn_prev = cmnode;
1206 cmap->cmap_table = cmnode;
1207
1208 return (cmnode);
1209 }
1210
1211 /*
1212 * transtbl_get() - look up a node in the translation table
1213 *
1214 * called by the snapshot driver to find data that has been translated.
1215 * The lookup is done by the chunk number, and the node is returned.
1216 * If the node was not found, NULL is returned.
1217 */
1218 static cow_map_node_t *
1219 transtbl_get(cow_map_t *cmap, chunknumber_t chunk)
1220 {
1221 cow_map_node_t *cmn;
1222
1223 ASSERT(RW_READ_HELD(&cmap->cmap_rwlock));
1224 ASSERT(cmap);
1225
1226 /* search the translation table */
1227 for (cmn = cmap->cmap_table; cmn != NULL; cmn = cmn->cmn_next) {
1228 if (cmn->cmn_chunk == chunk)
1229 return (cmn);
1230 }
1231
1232 /* not found */
1233 return (NULL);
1234 }
1235
1236 /*
1237 * transtbl_delete() - delete a node from the translation table
1238 *
1239 * called when a node's data has been written out to disk. The
1240 * cmap_rwlock must be held as a writer for this operation. If the node
1241 * being deleted is the head of the list, then the head is moved to the
1242 * next node. Both the node's data and the node itself are freed.
1243 */
1244 static void
1245 transtbl_delete(cow_map_t *cmap, cow_map_node_t *cmn)
1246 {
1247 ASSERT(RW_WRITE_HELD(&cmap->cmap_rwlock));
1248 ASSERT(cmn);
1249 ASSERT(cmap->cmap_table);
1250
1251 /* if the head of the list is being deleted, then move the head up */
1252 if (cmap->cmap_table == cmn) {
1253 ASSERT(cmn->cmn_prev == NULL);
1254 cmap->cmap_table = cmn->cmn_next;
1255 }
1256
1257
1258 /* make previous node's next pointer skip over current node */
1259 if (cmn->cmn_prev != NULL) {
1260 ASSERT(cmn->cmn_prev->cmn_next == cmn);
1261 cmn->cmn_prev->cmn_next = cmn->cmn_next;
1262 }
1263
1264 /* make next node's previous pointer skip over current node */
1265 if (cmn->cmn_next != NULL) {
1266 ASSERT(cmn->cmn_next->cmn_prev == cmn);
1267 cmn->cmn_next->cmn_prev = cmn->cmn_prev;
1268 }
1269
1270 /* free the data and the node */
1271 ASSERT(cmn->cmn_buf);
1272 kmem_free(cmn->cmn_buf, cmap->cmap_chunksz);
1273 kmem_free(cmn, sizeof (cow_map_node_t));
1274 }
1275
1276 /*
1277 * transtbl_free() - free the entire translation table
1278 *
1279 * called when the snapshot is deleted. This frees all of the nodes in
1280 * the translation table (but not the bitmaps).
1281 */
1282 static void
1283 transtbl_free(cow_map_t *cmap)
1284 {
1285 cow_map_node_t *curnode;
1286 cow_map_node_t *tempnode;
1287
1288 for (curnode = cmap->cmap_table; curnode != NULL; curnode = tempnode) {
1289 tempnode = curnode->cmn_next;
1290
1291 kmem_free(curnode->cmn_buf, cmap->cmap_chunksz);
1292 kmem_free(curnode, sizeof (cow_map_node_t));
1293 }
1294 }
1295
1296
1297 /* ************************************************************************ */
1298
1299 /*
1300 * Interface Implementation Routines
1301 *
1302 * The following functions implement snapshot interface routines that are
1303 * called by the file system to create, delete, and use a snapshot. The
1304 * interfaces are defined in fssnap_if.c and are filled in by this driver
1305 * when it is loaded. This technique allows the file system to depend on
1306 * the interface module without having to load the full implementation and
1307 * snapshot device drivers.
1308 */
1309
/*
 * fssnap_strategy_impl() - strategy routine called by the file system
 *
 * called by the file system to handle copy-on-write when necessary.  All
 * reads and writes that the file system performs should go through this
 * function.  If the file system calls the underlying device's strategy
 * routine without going through fssnap_strategy() (eg. by calling
 * bdev_strategy()), the snapshot may not be consistent.
 *
 * This function starts by doing significant sanity checking to ensure
 * the snapshot was not deleted out from under it or deleted and then
 * recreated.  To do this, it checks the actual pointer passed into it
 * (ie. the handle held by the file system).  NOTE that the parameter is
 * a POINTER TO A POINTER to the snapshot id.  Once the snapshot id is
 * locked, it knows things are ok and that this snapshot is really for
 * this file system.
 *
 * If the request is a write, fssnap_translate() is called to determine
 * whether a copy-on-write is required.  If it is a read, the read is
 * simply passed on to the underlying device.
 *
 * Arguments:
 *	snapshot_id	address of the file system's snapshot id pointer
 *			(struct snapshot_id **)
 *	bp		the buf describing the I/O request
 *
 * There is no return value.  On every path the buf is either handed to
 * bdev_strategy() (here or inside fssnap_translate()) or completed with
 * an error via bioerror()/biodone().
 */
static void
fssnap_strategy_impl(void *snapshot_id, buf_t *bp)
{
	struct snapshot_id **sidpp;
	struct snapshot_id *sidp;
	int error;

	/* read requests are always passed through */
	if (bp->b_flags & B_READ) {
		(void) bdev_strategy(bp);
		return;
	}

	/*
	 * Because we were not able to take the snapshot read lock BEFORE
	 * checking for a snapshot back in the file system, things may have
	 * drastically changed out from under us.  For instance, the snapshot
	 * may have been deleted, deleted and recreated, or worse yet, deleted
	 * for this file system but now the snapshot number is in use by another
	 * file system.
	 *
	 * Having a pointer to the file system's snapshot id pointer allows us
	 * to sanity check most of this, though it assumes the file system is
	 * keeping track of a pointer to the snapshot_id somewhere.
	 */
	sidpp = (struct snapshot_id **)snapshot_id;
	sidp = *sidpp;

	/*
	 * if this file system's snapshot was disabled, just pass the
	 * request through.
	 */
	if (sidp == NULL) {
		(void) bdev_strategy(bp);
		return;
	}

	/*
	 * Once we have the reader lock the snapshot will not magically go
	 * away.  But things may have changed on us before this so double check.
	 */
	rw_enter(&sidp->sid_rwlock, RW_READER);

	/*
	 * if an error was found somewhere the DELETE flag will be
	 * set to indicate the snapshot should be deleted and no new
	 * translations should occur.  The reader lock is dropped before
	 * the delete is attempted, and the write is then passed straight
	 * through to the device.
	 */
	if (sidp->sid_flags & SID_DELETE) {
		rw_exit(&sidp->sid_rwlock);
		(void) fssnap_delete_impl(sidpp);
		(void) bdev_strategy(bp);
		return;
	}

	/*
	 * If the file system is no longer pointing to the snapshot we were
	 * called with, then it should not attempt to translate this buffer as
	 * it may be going to a snapshot for a different file system.
	 * Even if the file system snapshot pointer is still the same, the
	 * snapshot may have been disabled before we got the reader lock.
	 */
	if (sidp != *sidpp || SID_INACTIVE(sidp)) {
		rw_exit(&sidp->sid_rwlock);
		(void) bdev_strategy(bp);
		return;
	}

	/*
	 * At this point we're sure the snapshot will not go away while the
	 * reader lock is held, and we are reasonably certain that we are
	 * writing to the correct snapshot.  On success fssnap_translate()
	 * has already issued the write itself, so nothing more is done
	 * with bp here.
	 */
	if ((error = fssnap_translate(sidpp, bp)) != 0) {
		/*
		 * fssnap_translate can release the reader lock if it
		 * has to wait for a semaphore.  In this case it is possible
		 * for the snapshot to be deleted in this time frame.  If this
		 * happens just send the buf through to the file system's
		 * device.
		 */
		if (sidp != *sidpp || SID_INACTIVE(sidp)) {
			rw_exit(&sidp->sid_rwlock);
			(void) bdev_strategy(bp);
			return;
		}
		/* genuine failure: complete the buf with the error */
		bioerror(bp, error);
		biodone(bp);
	}
	rw_exit(&sidp->sid_rwlock);
}
1421
/*
 * fssnap_translate() - helper function for fssnap_strategy()
 *
 * performs the actual copy-on-write for write requests, if required.
 * This function does the real work of the file system side of things.
 *
 * It first checks the candidate bitmap to quickly determine whether any
 * action is necessary.  If the candidate bitmap indicates the chunk was
 * allocated when the snapshot was created, then it checks to see whether
 * a translation already exists.  If a translation already exists then no
 * action is required.  If the chunk is a candidate for copy-on-write,
 * and a translation does not already exist, then the chunk is read in
 * and a node is added to the translation table.
 *
 * Once all of the chunks in the request range have been copied (if they
 * needed to be), then the original request can be satisfied and the old
 * data can be overwritten.
 *
 * Arguments:
 *	sidpp	address of the file system's snapshot id pointer; *sidpp
 *		is re-read to detect the snapshot changing underneath us
 *	wbp	the write request
 *
 * The caller must hold the snapshot's sid_rwlock as a reader.  The lock
 * may be dropped and re-taken while waiting on the throttle semaphore,
 * but it is held as a reader again on every return path.
 *
 * Returns:
 *	0	all needed copies were made and wbp was handed to
 *		bdev_strategy()
 *	ENXIO	the snapshot changed or became inactive while this thread
 *		waited on the throttle semaphore; wbp was NOT issued
 *	other	error from reading the old data; wbp was NOT issued
 */
static int
fssnap_translate(struct snapshot_id **sidpp, struct buf *wbp)
{
	snapshot_id_t	*sidp = *sidpp;
	struct buf	*oldbp;	/* buffer to store old data in */
	struct cow_info	*cowp = sidp->sid_cowinfo;
	cow_map_t	*cmap = &cowp->cow_map;
	cow_map_node_t	*cmn;
	chunknumber_t	cowchunk, startchunk, endchunk;
	int		error;
	int		throttle_write = 0;

	/* make sure the snapshot is active */
	ASSERT(RW_READ_HELD(&sidp->sid_rwlock));

	/* first and last chunk touched by this write (inclusive range) */
	startchunk = dbtocowchunk(cmap, wbp->b_lblkno);
	endchunk   = dbtocowchunk(cmap, wbp->b_lblkno +
	    ((wbp->b_bcount-1) >> DEV_BSHIFT));

	/*
	 * Do not throttle the writes of the fssnap taskq thread and
	 * the log roll (trans_roll) thread. Furthermore the writes to
	 * the on-disk log are also not subject to throttling.
	 * The fssnap_write_taskq thread's write can block on the throttling
	 * semaphore which leads to self-deadlock as this same thread
	 * releases the throttling semaphore after completing the IO.
	 * If the trans_roll thread's write is throttled then we can deadlock
	 * because the fssnap_taskq_thread which releases the throttling
	 * semaphore can block waiting for log space which can only be
	 * released by the trans_roll thread.
	 */

	throttle_write = !(taskq_member(cowp->cow_taskq, curthread) ||
	    tsd_get(bypass_snapshot_throttle_key));

	/*
	 * Iterate through all chunks covered by this write and perform the
	 * copy-aside if necessary.  Once all chunks have been safely
	 * stowed away, the new data may be written in a single sweep.
	 *
	 * For each chunk in the range, the following sequence is performed:
	 *	- Is the chunk a candidate for translation?
	 *		o If not, then no translation is necessary, continue
	 *	- If it is a candidate, then does it already have a translation?
	 *		o If so, then no translation is necessary, continue
	 *	- If it is a candidate, but does not yet have a translation,
	 *	  then read the old data and schedule an asynchronous taskq
	 *	  to write the old data to the backing file.
	 *
	 * Once this has been performed over the entire range of chunks, then
	 * it is safe to overwrite the data that is there.
	 *
	 * Note that no lock is required to check the candidate bitmap because
	 * it never changes once the snapshot is created.  The hastrans bitmap
	 * is first tested without taking cmap_rwlock; if it turns out a copy
	 * looks necessary, cmap_rwlock is taken as a writer and the bitmap is
	 * re-checked as it may have changed in the meantime.  Finally, the
	 * write lock is held while reading the old data to make sure it is
	 * not translated out from under us.
	 *
	 * This locking mechanism should be sufficient to handle multiple
	 * threads writing to overlapping chunks simultaneously.
	 */
	for (cowchunk = startchunk; cowchunk <= endchunk; cowchunk++) {
		/*
		 * If the cowchunk is outside of the range of our
		 * candidate maps, then simply break out of the
		 * loop and pass the I/O through to bdev_strategy.
		 * This would occur if the file system has grown
		 * larger since the snapshot was taken.
		 */
		if (cowchunk >= (cmap->cmap_bmsize * NBBY))
			break;

		/*
		 * If no disk blocks were allocated in this chunk when the
		 * snapshot was created then no copy-on-write will be
		 * required.  Since this bitmap is read-only no locks are
		 * necessary.
		 */
		if (isclr(cmap->cmap_candidate, cowchunk)) {
			continue;
		}

		/*
		 * If a translation already exists, the data can be written
		 * through since the old data has already been saved off.
		 * This is an unlocked fast-path test; it is repeated under
		 * cmap_rwlock below before any copy is made.
		 */
		if (isset(cmap->cmap_hastrans, cowchunk)) {
			continue;
		}


		/*
		 * Throttle translations if there are too many outstanding
		 * chunks in memory.  The semaphore is sema_v'd by the taskq.
		 *
		 * You can't keep the sid_rwlock if you would go to sleep.
		 * This will result in deadlock when someone tries to delete
		 * the snapshot (wants the sid_rwlock as a writer, but can't
		 * get it).
		 */
		if (throttle_write) {
			if (sema_tryp(&cmap->cmap_throttle_sem) == 0) {
				/* would block: drop the lock, then wait */
				rw_exit(&sidp->sid_rwlock);
				atomic_inc_32(&cmap->cmap_waiters);
				sema_p(&cmap->cmap_throttle_sem);
				atomic_dec_32(&cmap->cmap_waiters);
				rw_enter(&sidp->sid_rwlock, RW_READER);

			/*
			 * Now since we released the sid_rwlock the state may
			 * have transitioned underneath us. so check that again.
			 * The semaphore just obtained is given back before
			 * bailing out.
			 */
				if (sidp != *sidpp || SID_INACTIVE(sidp)) {
					sema_v(&cmap->cmap_throttle_sem);
					return (ENXIO);
				}
			}
		}

		/*
		 * Acquire the lock as a writer and check to see if a
		 * translation has been added in the meantime.
		 */
		rw_enter(&cmap->cmap_rwlock, RW_WRITER);
		if (isset(cmap->cmap_hastrans, cowchunk)) {
			/* no copy will be made, so return the throttle slot */
			if (throttle_write)
				sema_v(&cmap->cmap_throttle_sem);
			rw_exit(&cmap->cmap_rwlock);
			continue; /* go to the next chunk */
		}

		/*
		 * read a full chunk of data from the requested offset rounded
		 * down to the nearest chunk size.  The read is issued to the
		 * same device as the write and waited for synchronously.
		 */
		oldbp = getrbuf(KM_SLEEP);
		oldbp->b_lblkno = cowchunktodb(cmap, cowchunk);
		oldbp->b_edev = wbp->b_edev;
		oldbp->b_bcount = cmap->cmap_chunksz;
		oldbp->b_bufsize = cmap->cmap_chunksz;
		oldbp->b_iodone = NULL;
		oldbp->b_proc = NULL;
		oldbp->b_flags = B_READ;
		oldbp->b_un.b_addr = kmem_alloc(cmap->cmap_chunksz, KM_SLEEP);

		(void) bdev_strategy(oldbp);
		(void) biowait(oldbp);

		/*
		 * It's ok to bail in the middle of translating the range
		 * because the extra copy-asides will not hurt anything
		 * (except by using extra space in the backing store).
		 */
		if ((error = geterror(oldbp)) != 0) {
			cmn_err(CE_WARN, "fssnap_translate: error reading "
			    "old data for snapshot %d, chunk %llu, disk block "
			    "%lld, size %lu, error %d.", sidp->sid_snapnumber,
			    cowchunk, oldbp->b_lblkno, oldbp->b_bcount, error);
			kmem_free(oldbp->b_un.b_addr, cmap->cmap_chunksz);
			freerbuf(oldbp);
			rw_exit(&cmap->cmap_rwlock);
			if (throttle_write)
				sema_v(&cmap->cmap_throttle_sem);
			return (error);
		}

		/*
		 * add the node to the translation table and save a reference
		 * to pass to the taskq for writing out to the backing file.
		 * The data buffer now belongs to the node; only the buf
		 * header is freed here.
		 */
		cmn = transtbl_add(cmap, cowchunk, oldbp->b_un.b_addr);
		freerbuf(oldbp);

		/*
		 * Add a reference to the snapshot id so the lower level
		 * processing (ie. the taskq) can get back to the state
		 * information.  release_sem records whether this translation
		 * holds a throttle slot that must be released later.
		 */
		cmn->cmn_sid = sidp;
		cmn->release_sem = throttle_write;
		setbit(cmap->cmap_hastrans, cowchunk);

		rw_exit(&cmap->cmap_rwlock);

		/*
		 * schedule the asynchronous write to the backing file
		 *
		 * NOTE(review): when cow_backfile_array is NULL no task is
		 * dispatched, so nothing visible here releases the throttle
		 * slot taken for this chunk or removes the node -- confirm
		 * that this is the intended behavior.
		 */
		if (cowp->cow_backfile_array != NULL)
			(void) taskq_dispatch(cowp->cow_taskq,
			    fssnap_write_taskq, cmn, TQ_SLEEP);
	}

	/*
	 * Write new data in place of the old data.  At this point all of the
	 * chunks touched by this write have been copied aside and so the new
	 * data can be written out all at once.
	 */
	(void) bdev_strategy(wbp);

	return (0);
}
1644
1645 /*
1646 * fssnap_write_taskq() - write in-memory translations to the backing file
1647 *
1648 * writes in-memory translations to the backing file asynchronously. A
1649 * task is dispatched each time a new translation is created. The task
1650 * writes the data to the backing file and removes it from the memory
1651 * list. The throttling semaphore is released only if the particular
1652 * translation was throttled in fssnap_translate.
1653 */
1654 static void
1655 fssnap_write_taskq(void *arg)
1656 {
1657 cow_map_node_t *cmn = (cow_map_node_t *)arg;
1658 snapshot_id_t *sidp = cmn->cmn_sid;
1659 cow_info_t *cowp = sidp->sid_cowinfo;
1660 cow_map_t *cmap = &cowp->cow_map;
1661 int error;
1662 int bf_index;
1663 int release_sem = cmn->release_sem;
1664
1665 /*
1666 * The sid_rwlock does not need to be held here because the taskqs
1667 * are destroyed explicitly by fssnap_delete (with the sid_rwlock
1668 * held as a writer). taskq_destroy() will flush all of the tasks
1669 * out before fssnap_delete frees up all of the structures.
1670 */
1671
1672 /* if the snapshot was disabled from under us, drop the request. */
1673 rw_enter(&sidp->sid_rwlock, RW_READER);
1674 if (SID_INACTIVE(sidp)) {
1675 rw_exit(&sidp->sid_rwlock);
1676 if (release_sem)
1677 sema_v(&cmap->cmap_throttle_sem);
1678 return;
1679 }
1680 rw_exit(&sidp->sid_rwlock);
1681
1682 atomic_inc_64((uint64_t *)&cmap->cmap_nchunks);
1683
1684 if ((cmap->cmap_maxsize != 0) &&
1685 ((cmap->cmap_nchunks * cmap->cmap_chunksz) > cmap->cmap_maxsize)) {
1686 cmn_err(CE_WARN, "fssnap_write_taskq: snapshot %d (%s) has "
1687 "reached the maximum backing file size specified (%llu "
1688 "bytes) and will be deleted.", sidp->sid_snapnumber,
1689 (char *)cowp->cow_kstat_mntpt->ks_data,
1690 cmap->cmap_maxsize);
1691 if (release_sem)
1692 sema_v(&cmap->cmap_throttle_sem);
1693 atomic_or_uint(&sidp->sid_flags, SID_DELETE);
1694 return;
1695 }
1696
1697 /* perform the write */
1698 bf_index = cmn->cmn_chunk / cmap->cmap_chunksperbf;
1699
1700 if (error = vn_rdwr(UIO_WRITE, (cowp->cow_backfile_array)[bf_index],
1701 cmn->cmn_buf, cmap->cmap_chunksz,
1702 (cmn->cmn_chunk % cmap->cmap_chunksperbf) * cmap->cmap_chunksz,
1703 UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, (ssize_t *)NULL)) {
1704 cmn_err(CE_WARN, "fssnap_write_taskq: error writing to "
1705 "backing file. DELETING SNAPSHOT %d, backing file path "
1706 "%s, offset %llu bytes, error %d.", sidp->sid_snapnumber,
1707 (char *)cowp->cow_kstat_bfname->ks_data,
1708 cmn->cmn_chunk * cmap->cmap_chunksz, error);
1709 if (release_sem)
1710 sema_v(&cmap->cmap_throttle_sem);
1711 atomic_or_uint(&sidp->sid_flags, SID_DELETE);
1712 return;
1713 }
1714
1715 /*
1716 * now remove the node and buffer from memory
1717 */
1718 rw_enter(&cmap->cmap_rwlock, RW_WRITER);
1719 transtbl_delete(cmap, cmn);
1720 rw_exit(&cmap->cmap_rwlock);
1721
1722 /* Allow more translations */
1723 if (release_sem)
1724 sema_v(&cmap->cmap_throttle_sem);
1725
1726 }
1727
1728 /*
1729 * fssnap_create_impl() - called from the file system to create a new snapshot
1730 *
1731 * allocates and initializes the structures needed for a new snapshot.
1732 * This is called by the file system when it receives an ioctl request to
1733 * create a new snapshot. An unused snapshot identifier is either found
1734 * or created, and eventually returned as the opaque handle the file
1735 * system will use to identify this snapshot. The snapshot number
1736 * associated with the snapshot identifier is the same as the minor
1737 * number for the snapshot device that is used to access that snapshot.
1738 *
1739 * The snapshot can not be used until the candidate bitmap is populated
1740 * by the file system (see fssnap_set_candidate_impl()), and the file
1741 * system finishes the setup process by calling fssnap_create_done().
1742 * Nearly all of the snapshot locks are held for the duration of the
1743 * create, and are not released until fssnap_create_done is called().
1744 */
1745 static void *
1746 fssnap_create_impl(chunknumber_t nchunks, uint_t chunksz, u_offset_t maxsize,
1747 struct vnode *fsvp, int backfilecount, struct vnode **bfvpp, char *backpath,
1748 u_offset_t max_backfile_size)
1749 {
1750 refstr_t *mountpoint;
1751 char taskqname[50];
1752 struct cow_info *cowp;
1753 struct cow_map *cmap;
1754 struct snapshot_id *sidp;
1755 int lastsnap;
1756
1757 /*
1758 * Sanity check the parameters we care about
1759 * (we don't care about the informational parameters)
1760 */
1761 if ((nchunks == 0) ||
1762 ((chunksz % DEV_BSIZE) != 0) ||
1763 (bfvpp == NULL)) {
1764 return (NULL);
1765 }
1766
1767 /*
1768 * Look for unused snapshot identifiers. Snapshot ids are never
1769 * freed, but deleted snapshot ids will be recycled as needed.
1770 */
1771 mutex_enter(&snapshot_mutex);
1772
1773 findagain:
1774 lastsnap = 0;
1775 for (sidp = snapshot; sidp != NULL; sidp = sidp->sid_next) {
1776 if (sidp->sid_snapnumber > lastsnap)
1777 lastsnap = sidp->sid_snapnumber;
1778
1779 /*
1780 * The sid_rwlock is taken as a reader initially so that
1781 * activity on each snapshot is not stalled while searching
1782 * for a free snapshot id.
1783 */
1784 rw_enter(&sidp->sid_rwlock, RW_READER);
1785
1786 /*
1787 * If the snapshot has been deleted and nobody is using the
1788 * snapshot device than we can reuse this snapshot_id. If
1789 * the snapshot is marked to be deleted (SID_DELETE), then
1790 * it hasn't been deleted yet so don't reuse it.
1791 */
1792 if (SID_AVAILABLE(sidp))
1793 break; /* This spot is unused, so take it */
1794 rw_exit(&sidp->sid_rwlock);
1795 }
1796
1797 /*
1798 * add a new snapshot identifier if there are no deleted
1799 * entries. Since it doesn't matter what order the entries
1800 * are in we can just add it to the beginning of the list.
1801 */
1802 if (sidp) {
1803 if (rw_tryupgrade(&sidp->sid_rwlock) == 0) {
1804 /* someone else grabbed it as a writer, try again */
1805 rw_exit(&sidp->sid_rwlock);
1806 goto findagain;
1807 }
1808 } else {
1809 /* Create a new node if we didn't find an unused one */
1810 sidp = kmem_alloc(sizeof (struct snapshot_id), KM_SLEEP);
1811 rw_init(&sidp->sid_rwlock, NULL, RW_DEFAULT, NULL);
1812 rw_enter(&sidp->sid_rwlock, RW_WRITER);
1813 sidp->sid_snapnumber = (snapshot == NULL) ? 0 : lastsnap + 1;
1814 sidp->sid_cowinfo = NULL;
1815 sidp->sid_flags = 0;
1816 sidp->sid_next = snapshot;
1817 snapshot = sidp;
1818 }
1819
1820 ASSERT(RW_WRITE_HELD(&sidp->sid_rwlock));
1821 ASSERT(sidp->sid_cowinfo == NULL);
1822 ASSERT(sidp->sid_snapnumber <= (lastsnap + 1));
1823
1824 sidp->sid_flags |= SID_CREATING;
1825 /* The root vnode is held until snap_delete_impl() is called */
1826 VN_HOLD(fsvp);
1827 sidp->sid_fvp = fsvp;
1828 num_snapshots++;
1829
1830 /* allocate and initialize structures */
1831
1832 cowp = kmem_zalloc(sizeof (struct cow_info), KM_SLEEP);
1833
1834 cowp->cow_backfile_array = bfvpp;
1835 cowp->cow_backcount = backfilecount;
1836 cowp->cow_backfile_sz = max_backfile_size;
1837
1838 /*
1839 * Initialize task queues for this snapshot. Only a small number
1840 * of threads are required because they will be serialized on the
1841 * backing file's reader/writer lock anyway.
1842 */
1843 (void) snprintf(taskqname, sizeof (taskqname), "%s_taskq_%d", snapname,
1844 sidp->sid_snapnumber);
1845 cowp->cow_taskq = taskq_create(taskqname, fssnap_taskq_nthreads,
1846 minclsyspri, 1, fssnap_taskq_maxtasks, 0);
1847
1848 /* don't allow tasks to start until after everything is ready */
1849 taskq_suspend(cowp->cow_taskq);
1850
1851 /* initialize translation table */
1852 cmap = &cowp->cow_map;
1853 rw_init(&cmap->cmap_rwlock, NULL, RW_DEFAULT, NULL);
1854 rw_enter(&cmap->cmap_rwlock, RW_WRITER);
1855
1856 sema_init(&cmap->cmap_throttle_sem, fssnap_max_mem_chunks, NULL,
1857 SEMA_DEFAULT, NULL);
1858
1859 cmap->cmap_chunksz = chunksz;
1860 cmap->cmap_maxsize = maxsize;
1861 cmap->cmap_chunksperbf = max_backfile_size / chunksz;
1862
1863 /*
1864 * allocate one bit per chunk for the bitmaps, round up
1865 */
1866 cmap->cmap_bmsize = (nchunks + (NBBY - 1)) / NBBY;
1867 cmap->cmap_hastrans = kmem_zalloc(cmap->cmap_bmsize, KM_SLEEP);
1868 cmap->cmap_candidate = kmem_zalloc(cmap->cmap_bmsize, KM_SLEEP);
1869
1870 sidp->sid_cowinfo = cowp;
1871
1872 /* initialize kstats for this snapshot */
1873 mountpoint = vfs_getmntpoint(fsvp->v_vfsp);
1874 fssnap_create_kstats(sidp, sidp->sid_snapnumber,
1875 refstr_value(mountpoint), backpath);
1876 refstr_rele(mountpoint);
1877
1878 mutex_exit(&snapshot_mutex);
1879
1880 /*
1881 * return with snapshot id rwlock held as a writer until
1882 * fssnap_create_done is called
1883 */
1884 return (sidp);
1885 }
1886
1887 /*
1888 * fssnap_set_candidate_impl() - mark a chunk as a candidate for copy-on-write
1889 *
1890 * sets a bit in the candidate bitmap that indicates that a chunk is a
1891 * candidate for copy-on-write. Typically, chunks that are allocated on
1892 * the file system at the time the snapshot is taken are candidates,
1893 * while chunks that have no allocated data do not need to be copied.
1894 * Chunks containing metadata must be marked as candidates as well.
1895 */
1896 static void
1897 fssnap_set_candidate_impl(void *snapshot_id, chunknumber_t chunknumber)
1898 {
1899 struct snapshot_id *sid = snapshot_id;
1900 struct cow_info *cowp = sid->sid_cowinfo;
1901 struct cow_map *cmap = &cowp->cow_map;
1902
1903 /* simple bitmap operation for now */
1904 ASSERT(chunknumber < (cmap->cmap_bmsize * NBBY));
1905 setbit(cmap->cmap_candidate, chunknumber);
1906 }
1907
1908 /*
1909 * fssnap_is_candidate_impl() - check whether a chunk is a candidate
1910 *
1911 * returns 0 if the chunk is not a candidate and 1 if the chunk is a
1912 * candidate. This can be used by the file system to change behavior for
1913 * chunks that might induce a copy-on-write. The offset is specified in
1914 * bytes since the chunk size may not be known by the file system.
1915 */
1916 static int
1917 fssnap_is_candidate_impl(void *snapshot_id, u_offset_t off)
1918 {
1919 struct snapshot_id *sid = snapshot_id;
1920 struct cow_info *cowp = sid->sid_cowinfo;
1921 struct cow_map *cmap = &cowp->cow_map;
1922 ulong_t chunknumber = off / cmap->cmap_chunksz;
1923
1924 /* simple bitmap operation for now */
1925 ASSERT(chunknumber < (cmap->cmap_bmsize * NBBY));
1926 return (isset(cmap->cmap_candidate, chunknumber));
1927 }
1928
1929 /*
1930 * fssnap_create_done_impl() - complete the snapshot setup process
1931 *
1932 * called when the file system is done populating the candidate bitmap
1933 * and it is ready to start using the snapshot. This routine releases
1934 * the snapshot locks, allows taskq tasks to start processing, and
1935 * creates the device minor nodes associated with the snapshot.
1936 */
1937 static int
1938 fssnap_create_done_impl(void *snapshot_id)
1939 {
1940 struct snapshot_id **sidpp, *sidp = snapshot_id;
1941 struct cow_info *cowp;
1942 struct cow_map *cmap;
1943 int snapnumber = -1;
1944 char name[20];
1945
1946 /* sid rwlock and cmap rwlock should be taken from fssnap_create */
1947 ASSERT(sidp);
1948 ASSERT(RW_WRITE_HELD(&sidp->sid_rwlock));
1949 ASSERT(sidp->sid_cowinfo);
1950
1951 cowp = sidp->sid_cowinfo;
1952 cmap = &cowp->cow_map;
1953
1954 ASSERT(RW_WRITE_HELD(&cmap->cmap_rwlock));
1955
1956 sidp->sid_flags &= ~(SID_CREATING | SID_DISABLED);
1957 snapnumber = sidp->sid_snapnumber;
1958
1959 /* allocate state structure and find new snapshot id */
1960 if (ddi_soft_state_zalloc(statep, snapnumber) != DDI_SUCCESS) {
1961 cmn_err(CE_WARN,
1962 "snap_ioctl: create: could not allocate "
1963 "state for snapshot %d.", snapnumber);
1964 snapnumber = -1;
1965 goto out;
1966 }
1967
1968 sidpp = ddi_get_soft_state(statep, snapnumber);
1969 *sidpp = sidp;
1970
1971 /* create minor node based on snapshot number */
1972 ASSERT(fssnap_dip != NULL);
1973 (void) snprintf(name, sizeof (name), "%d", snapnumber);
1974 if (ddi_create_minor_node(fssnap_dip, name, S_IFBLK,
1975 snapnumber, DDI_PSEUDO, 0) != DDI_SUCCESS) {
1976 cmn_err(CE_WARN, "snap_ioctl: could not create "
1977 "block minor node for snapshot %d.", snapnumber);
1978 snapnumber = -1;
1979 goto out;
1980 }
1981
1982 (void) snprintf(name, sizeof (name), "%d,raw", snapnumber);
1983 if (ddi_create_minor_node(fssnap_dip, name, S_IFCHR,
1984 snapnumber, DDI_PSEUDO, 0) != DDI_SUCCESS) {
1985 cmn_err(CE_WARN, "snap_ioctl: could not create "
1986 "character minor node for snapshot %d.", snapnumber);
1987 snapnumber = -1;
1988 }
1989
1990 out:
1991 rw_exit(&sidp->sid_rwlock);
1992 rw_exit(&cmap->cmap_rwlock);
1993
1994 /* let the taskq threads start processing */
1995 taskq_resume(cowp->cow_taskq);
1996
1997 return (snapnumber);
1998 }
1999
2000 /*
2001 * fssnap_delete_impl() - delete a snapshot
2002 *
2003 * used when a snapshot is no longer needed. This is called by the file
2004 * system when it receives an ioctl request to delete a snapshot. It is
2005 * also called internally when error conditions such as disk full, errors
2006 * writing to the backing file, or backing file maxsize exceeded occur.
2007 * If the snapshot device is busy when the delete request is received,
2008 * all state will be deleted except for the soft state and device files
2009 * associated with the snapshot; they will be deleted when the snapshot
2010 * device is closed.
2011 *
2012 * NOTE this function takes a POINTER TO A POINTER to the snapshot id,
2013 * and expects to be able to set the handle held by the file system to
2014 * NULL. This depends on the file system checking that variable for NULL
2015 * before calling fssnap_strategy().
2016 */
2017 static int
2018 fssnap_delete_impl(void *snapshot_id)
2019 {
2020 struct snapshot_id **sidpp = (struct snapshot_id **)snapshot_id;
2021 struct snapshot_id *sidp;
2022 struct snapshot_id **statesidpp;
2023 struct cow_info *cowp;
2024 struct cow_map *cmap;
2025 char name[20];
2026 int snapnumber = -1;
2027 vnode_t **vpp;
2028
2029 /*
2030 * sidp is guaranteed to be valid if sidpp is valid because
2031 * the snapshot list is append-only.
2032 */
2033 if (sidpp == NULL) {
2034 return (-1);
2035 }
2036
2037 sidp = *sidpp;
2038 rw_enter(&sidp->sid_rwlock, RW_WRITER);
2039
2040 ASSERT(RW_WRITE_HELD(&sidp->sid_rwlock));
2041
2042 /*
2043 * double check that the snapshot is still valid for THIS file system
2044 */
2045 if (*sidpp == NULL) {
2046 rw_exit(&sidp->sid_rwlock);
2047 return (-1);
2048 }
2049
2050 /*
2051 * Now we know the snapshot is still valid and will not go away
2052 * because we have the write lock. Once the state is transitioned
2053 * to "disabling", the sid_rwlock can be released. Any pending I/O
2054 * waiting for the lock as a reader will check for this state and
2055 * abort without touching data that may be getting freed.
2056 */
2057 sidp->sid_flags |= SID_DISABLING;
2058 if (sidp->sid_flags & SID_DELETE) {
2059 cmn_err(CE_WARN, "Snapshot %d automatically deleted.",
2060 sidp->sid_snapnumber);
2061 sidp->sid_flags &= ~(SID_DELETE);
2062 }
2063
2064
2065 /*
2066 * This is pointing into file system specific data! The assumption is
2067 * that fssnap_strategy() gets called from the file system based on
2068 * whether this reference to the snapshot_id is NULL or not. So
2069 * setting this to NULL should disable snapshots for the file system.
2070 */
2071 *sidpp = NULL;
2072
2073 /* remove cowinfo */
2074 cowp = sidp->sid_cowinfo;
2075 if (cowp == NULL) {
2076 rw_exit(&sidp->sid_rwlock);
2077 return (-1);
2078 }
2079 rw_exit(&sidp->sid_rwlock);
2080
2081 /* destroy task queues first so they don't reference freed data. */
2082 if (cowp->cow_taskq) {
2083 taskq_destroy(cowp->cow_taskq);
2084 cowp->cow_taskq = NULL;
2085 }
2086
2087 if (cowp->cow_backfile_array != NULL) {
2088 for (vpp = cowp->cow_backfile_array; *vpp; vpp++)
2089 VN_RELE(*vpp);
2090 kmem_free(cowp->cow_backfile_array,
2091 (cowp->cow_backcount + 1) * sizeof (vnode_t *));
2092 cowp->cow_backfile_array = NULL;
2093 }
2094
2095 sidp->sid_cowinfo = NULL;
2096
2097 /* remove cmap */
2098 cmap = &cowp->cow_map;
2099 ASSERT(cmap);
2100
2101 if (cmap->cmap_candidate)
2102 kmem_free(cmap->cmap_candidate, cmap->cmap_bmsize);
2103
2104 if (cmap->cmap_hastrans)
2105 kmem_free(cmap->cmap_hastrans, cmap->cmap_bmsize);
2106
2107 if (cmap->cmap_table)
2108 transtbl_free(&cowp->cow_map);
2109
2110 rw_destroy(&cmap->cmap_rwlock);
2111
2112 while (cmap->cmap_waiters) {
2113 sema_p(&cmap->cmap_throttle_sem);
2114 sema_v(&cmap->cmap_throttle_sem);
2115 }
2116 sema_destroy(&cmap->cmap_throttle_sem);
2117
2118 /* remove kstats */
2119 fssnap_delete_kstats(cowp);
2120
2121 kmem_free(cowp, sizeof (struct cow_info));
2122
2123 statesidpp = ddi_get_soft_state(statep, sidp->sid_snapnumber);
2124 if (statesidpp == NULL || *statesidpp == NULL) {
2125 cmn_err(CE_WARN,
2126 "fssnap_delete_impl: could not find state for snapshot %d.",
2127 sidp->sid_snapnumber);
2128 }
2129 ASSERT(*statesidpp == sidp);
2130
2131 /*
2132 * Leave the node in the list marked DISABLED so it can be reused
2133 * and avoid many race conditions. Return the snapshot number
2134 * that was deleted.
2135 */
2136 mutex_enter(&snapshot_mutex);
2137 rw_enter(&sidp->sid_rwlock, RW_WRITER);
2138 sidp->sid_flags &= ~(SID_DISABLING);
2139 sidp->sid_flags |= SID_DISABLED;
2140 VN_RELE(sidp->sid_fvp);
2141 sidp->sid_fvp = NULL;
2142 snapnumber = sidp->sid_snapnumber;
2143
2144 /*
2145 * If the snapshot is not busy, free the device info now. Otherwise
2146 * the device nodes are freed in snap_close() when the device is
2147 * closed. The sid will not be reused until the device is not busy.
2148 */
2149 if (SID_AVAILABLE(sidp)) {
2150 /* remove the device nodes */
2151 ASSERT(fssnap_dip != NULL);
2152 (void) snprintf(name, sizeof (name), "%d",
2153 sidp->sid_snapnumber);
2154 ddi_remove_minor_node(fssnap_dip, name);
2155 (void) snprintf(name, sizeof (name), "%d,raw",
2156 sidp->sid_snapnumber);
2157 ddi_remove_minor_node(fssnap_dip, name);
2158
2159 /* delete the state structure */
2160 ddi_soft_state_free(statep, sidp->sid_snapnumber);
2161 num_snapshots--;
2162 }
2163
2164 mutex_exit(&snapshot_mutex);
2165 rw_exit(&sidp->sid_rwlock);
2166
2167 return (snapnumber);
2168 }
2169
2170 /*
2171 * fssnap_create_kstats() - allocate and initialize snapshot kstats
2172 *
2173 */
2174 static void
2175 fssnap_create_kstats(snapshot_id_t *sidp, int snapnum,
2176 const char *mountpoint, const char *backfilename)
2177 {
2178 kstat_t *num, *mntpoint, *bfname;
2179 kstat_named_t *hw;
2180 struct cow_info *cowp = sidp->sid_cowinfo;
2181 struct cow_kstat_num *stats;
2182
2183 /* update the high water mark */
2184 if (fssnap_highwater_kstat == NULL) {
2185 cmn_err(CE_WARN, "fssnap_create_kstats: failed to lookup "
2186 "high water mark kstat.");
2187 return;
2188 }
2189
2190 hw = (kstat_named_t *)fssnap_highwater_kstat->ks_data;
2191 if (hw->value.ui32 < snapnum)
2192 hw->value.ui32 = snapnum;
2193
2194 /* initialize the mount point kstat */
2195 kstat_delete_byname(snapname, snapnum, FSSNAP_KSTAT_MNTPT);
2196
2197 if (mountpoint != NULL) {
2198 mntpoint = kstat_create(snapname, snapnum, FSSNAP_KSTAT_MNTPT,
2199 "misc", KSTAT_TYPE_RAW, strlen(mountpoint) + 1, 0);
2200 if (mntpoint == NULL) {
2201 cowp->cow_kstat_mntpt = NULL;
2202 cmn_err(CE_WARN, "fssnap_create_kstats: failed to "
2203 "create mount point kstat");
2204 } else {
2205 (void) strncpy(mntpoint->ks_data, mountpoint,
2206 strlen(mountpoint));
2207 cowp->cow_kstat_mntpt = mntpoint;
2208 kstat_install(mntpoint);
2209 }
2210 } else {
2211 cowp->cow_kstat_mntpt = NULL;
2212 cmn_err(CE_WARN, "fssnap_create_kstats: mount point not "
2213 "specified.");
2214 }
2215
2216 /* initialize the backing file kstat */
2217 kstat_delete_byname(snapname, snapnum, FSSNAP_KSTAT_BFNAME);
2218
2219 if (backfilename == NULL) {
2220 cowp->cow_kstat_bfname = NULL;
2221 } else {
2222 bfname = kstat_create(snapname, snapnum, FSSNAP_KSTAT_BFNAME,
2223 "misc", KSTAT_TYPE_RAW, strlen(backfilename) + 1, 0);
2224 if (bfname != NULL) {
2225 (void) strncpy(bfname->ks_data, backfilename,
2226 strlen(backfilename));
2227 cowp->cow_kstat_bfname = bfname;
2228 kstat_install(bfname);
2229 } else {
2230 cowp->cow_kstat_bfname = NULL;
2231 cmn_err(CE_WARN, "fssnap_create_kstats: failed to "
2232 "create backing file name kstat");
2233 }
2234 }
2235
2236 /* initialize numeric kstats */
2237 kstat_delete_byname(snapname, snapnum, FSSNAP_KSTAT_NUM);
2238
2239 num = kstat_create(snapname, snapnum, FSSNAP_KSTAT_NUM,
2240 "misc", KSTAT_TYPE_NAMED,
2241 sizeof (struct cow_kstat_num) / sizeof (kstat_named_t),
2242 0);
2243 if (num == NULL) {
2244 cmn_err(CE_WARN, "fssnap_create_kstats: failed to create "
2245 "numeric kstats");
2246 cowp->cow_kstat_num = NULL;
2247 return;
2248 }
2249
2250 cowp->cow_kstat_num = num;
2251 stats = num->ks_data;
2252 num->ks_update = fssnap_update_kstat_num;
2253 num->ks_private = sidp;
2254
2255 kstat_named_init(&stats->ckn_state, FSSNAP_KSTAT_NUM_STATE,
2256 KSTAT_DATA_INT32);
2257 kstat_named_init(&stats->ckn_bfsize, FSSNAP_KSTAT_NUM_BFSIZE,
2258 KSTAT_DATA_UINT64);
2259 kstat_named_init(&stats->ckn_maxsize, FSSNAP_KSTAT_NUM_MAXSIZE,
2260 KSTAT_DATA_UINT64);
2261 kstat_named_init(&stats->ckn_createtime, FSSNAP_KSTAT_NUM_CREATETIME,
2262 KSTAT_DATA_LONG);
2263 kstat_named_init(&stats->ckn_chunksize, FSSNAP_KSTAT_NUM_CHUNKSIZE,
2264 KSTAT_DATA_UINT32);
2265
2266 /* initialize the static kstats */
2267 stats->ckn_chunksize.value.ui32 = cowp->cow_map.cmap_chunksz;
2268 stats->ckn_maxsize.value.ui64 = cowp->cow_map.cmap_maxsize;
2269 stats->ckn_createtime.value.l = gethrestime_sec();
2270
2271 kstat_install(num);
2272 }
2273
2274 /*
2275 * fssnap_update_kstat_num() - update a numerical snapshot kstat value
2276 *
2277 */
2278 int
2279 fssnap_update_kstat_num(kstat_t *ksp, int rw)
2280 {
2281 snapshot_id_t *sidp = (snapshot_id_t *)ksp->ks_private;
2282 struct cow_info *cowp = sidp->sid_cowinfo;
2283 struct cow_kstat_num *stats = ksp->ks_data;
2284
2285 if (rw == KSTAT_WRITE)
2286 return (EACCES);
2287
2288 /* state */
2289 if (sidp->sid_flags & SID_CREATING)
2290 stats->ckn_state.value.i32 = COWSTATE_CREATING;
2291 else if (SID_INACTIVE(sidp))
2292 stats->ckn_state.value.i32 = COWSTATE_DISABLED;
2293 else if (SID_BUSY(sidp))
2294 stats->ckn_state.value.i32 = COWSTATE_ACTIVE;
2295 else
2296 stats->ckn_state.value.i32 = COWSTATE_IDLE;
2297
2298 /* bfsize */
2299 stats->ckn_bfsize.value.ui64 = cowp->cow_map.cmap_nchunks *
2300 cowp->cow_map.cmap_chunksz;
2301
2302 return (0);
2303 }
2304
2305 /*
2306 * fssnap_delete_kstats() - deallocate snapshot kstats
2307 *
2308 */
2309 void
2310 fssnap_delete_kstats(struct cow_info *cowp)
2311 {
2312 if (cowp->cow_kstat_num != NULL) {
2313 kstat_delete(cowp->cow_kstat_num);
2314 cowp->cow_kstat_num = NULL;
2315 }
2316 if (cowp->cow_kstat_mntpt != NULL) {
2317 kstat_delete(cowp->cow_kstat_mntpt);
2318 cowp->cow_kstat_mntpt = NULL;
2319 }
2320 if (cowp->cow_kstat_bfname != NULL) {
2321 kstat_delete(cowp->cow_kstat_bfname);
2322 cowp->cow_kstat_bfname = NULL;
2323 }
2324 }
--- EOF ---