1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 
  27 #include <sys/debug.h>
  28 #include <sys/types.h>
  29 #include <sys/file.h>
  30 #include <sys/errno.h>
  31 #include <sys/uio.h>
  32 #include <sys/open.h>
  33 #include <sys/cred.h>
  34 #include <sys/kmem.h>
  35 #include <sys/conf.h>
  36 #include <sys/cmn_err.h>
  37 #include <sys/modctl.h>
  38 #include <sys/disp.h>
  39 #include <sys/atomic.h>
  40 #include <sys/filio.h>
  41 #include <sys/stat.h> /* needed for S_IFBLK and S_IFCHR */
  42 #include <sys/kstat.h>
  43 
  44 #include <sys/ddi.h>
  45 #include <sys/devops.h>
  46 #include <sys/sunddi.h>
  47 #include <sys/esunddi.h>
  48 #include <sys/priv_names.h>
  49 
  50 #include <sys/fssnap.h>
  51 #include <sys/fssnap_if.h>
  52 
  53 /*
  54  * This module implements the file system snapshot code, which provides a
  55  * point-in-time image of a file system for the purposes of online backup.
  56  * There are essentially two parts to this project: the driver half and the
  57  * file system half.  The driver half is a pseudo device driver called
  58  * "fssnap" that represents the snapshot.  Each snapshot is assigned a
  59  * number that corresponds to the minor number of the device, and a control
  60  * device with a high minor number is used to initiate snapshot creation and
  61  * deletion.  For all practical purposes the driver half acts like a
  62  * read-only disk device whose contents are exactly the same as the master
  63  * file system at the time the snapshot was created.
  64  *
  65  * The file system half provides interfaces necessary for performing the
  66  * file system dependent operations required to create and delete snapshots
  67  * and a special driver strategy routine that must always be used by the file
  68  * system for snapshots to work correctly.
  69  *
  70  * When a snapshot is to be created, the user utility will send an ioctl to
  71  * the control device of the driver half specifying the file system to be
  72  * snapshotted, the file descriptor of a backing-store file which is used to
  73  * hold old data before it is overwritten, and other snapshot parameters.
  74  * This ioctl is passed on to the file system specified in the original
  75  * ioctl request.  The file system is expected to be able to flush
  76  * everything out to make the file system consistent and lock it to ensure
  77  * no changes occur while the snapshot is being created.  It then calls
  78  * fssnap_create() to create state for a new snapshot, from which an opaque
  79  * handle is returned with the snapshot locked.  Next, the file system must
  80  * populate the "candidate bitmap", which tells the snapshot code which
  81  * "chunks" should be considered for copy-on-write (a chunk is the unit of
  82  * granularity used for copy-on-write, which is independent of the device
  83  * and file system block sizes).  This is typically done by scanning the
  84  * file system allocation bitmaps to determine which chunks contain
  85  * allocated blocks in the file system at the time the snapshot was created.
  86  * If a chunk has no allocated blocks, it does not need to be copied before
  87  * being written to.  Once the candidate bitmap is populated with
  88  * fssnap_set_candidate(), the file system calls fssnap_create_done() to
  89  * complete the snapshot creation and unlock the snapshot.  The file system
  90  * may now be unlocked and modifications to it resumed.
  91  *
  92  * Once a snapshot is created, the file system must perform all writes
  93  * through a special strategy routine, fssnap_strategy().  This strategy
  94  * routine determines whether the chunks contained by the write must be
  95  * copied before being overwritten by consulting the candidate bitmap
  96  * described above, and the "hastrans bitmap" which tells it whether the chunk
  97  * has been copied already or not.  If the chunk is a candidate but has not
  98  * been copied, it reads the old data in and adds it to a queue.  The
 * old data can then be overwritten with the new data.  An asynchronous
 * task is dispatched for each old chunk read in; that task writes the old
 * data to the backing file specified at snapshot creation time.  The
 102  * backing file is a sparse file the same size as the file system that
 103  * contains the old data at the offset that data originally had in the
 104  * file system.  If the queue containing in-memory chunks gets too large,
 105  * writes to the file system may be throttled by a semaphore until the
 106  * task queues have a chance to push some of the chunks to the backing file.
 107  *
 108  * With the candidate bitmap, the hastrans bitmap, the data on the master
 109  * file system, and the old data in memory and in the backing file, the
 110  * snapshot pseudo-driver can piece together the original file system
 111  * information to satisfy read requests.  If the requested chunk is not a
 112  * candidate, it returns a zeroed buffer.  If the chunk is a candidate but
 113  * has not been copied it reads it from the master file system.  If it is a
 114  * candidate and has been copied, it either copies the data from the
 115  * in-memory queue or it reads it in from the backing file.  The result is
 116  * a replication of the original file system that can be backed up, mounted,
 117  * or manipulated by other file system utilities that work on a read-only
 118  * device.
 119  *
 120  * This module is divided into three roughly logical sections:
 121  *
 122  *     - The snapshot driver, which is a character/block driver
 123  *       representing the snapshot itself.  These routines are
 124  *       prefixed with "snap_".
 125  *
 126  *     - The library routines that are defined in fssnap_if.h that
 127  *       are used by file systems that use this snapshot implementation.
 128  *       These functions are prefixed with "fssnap_" and are called through
 129  *       a function vector from the file system.
 130  *
 131  *     - The helper routines used by the snapshot driver and the fssnap
 132  *       library routines for managing the translation table and other
 133  *       useful functions.  These routines are all static and are
 134  *       prefixed with either "fssnap_" or "transtbl_" if they
 135  *       are specifically used for translation table activities.
 136  */
 137 
 138 static dev_info_t               *fssnap_dip = NULL;
 139 static struct snapshot_id       *snapshot = NULL;
 140 static struct snapshot_id       snap_ctl;
 141 static int                      num_snapshots = 0;
 142 static kmutex_t                 snapshot_mutex;
 143 static char                     snapname[] = SNAP_NAME;
 144 
 145 /* "tunable" parameters */
 146 static int              fssnap_taskq_nthreads = FSSNAP_TASKQ_THREADS;
 147 static uint_t           fssnap_max_mem_chunks = FSSNAP_MAX_MEM_CHUNKS;
 148 static int              fssnap_taskq_maxtasks = FSSNAP_TASKQ_MAXTASKS;
 149 
 150 /* static function prototypes */
 151 
 152 /* snapshot driver */
 153 static int snap_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
 154 static int snap_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
 155 static int snap_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
 156 static int snap_open(dev_t *devp, int flag, int otyp, cred_t *cred);
 157 static int snap_close(dev_t dev, int flag, int otyp, cred_t *cred);
 158 static int snap_strategy(struct buf *bp);
 159 static int snap_read(dev_t dev, struct uio *uiop, cred_t *credp);
 160 static int snap_print(dev_t dev, char *str);
 161 static int snap_ioctl(dev_t dev, int cmd, intptr_t arg, int mode,
 162     cred_t *credp, int *rvalp);
 163 static int snap_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op,
 164     int flags, char *name, caddr_t valuep, int *lengthp);
 165 static int snap_getchunk(struct snapshot_id *sidp, chunknumber_t chunk,
 166     int offset, int len, char *buffer);
 167 
 168 
 169 /* fssnap interface implementations (see fssnap_if.h) */
 170 static void fssnap_strategy_impl(void *, struct buf *);
 171 static void *fssnap_create_impl(chunknumber_t, uint_t, u_offset_t,
 172     struct vnode *, int, struct vnode **, char *, u_offset_t);
 173 static void fssnap_set_candidate_impl(void *, chunknumber_t);
 174 static int fssnap_is_candidate_impl(void *, u_offset_t);
 175 static int fssnap_create_done_impl(void *);
 176 static int fssnap_delete_impl(void *);
 177 
 178 /* fssnap interface support routines */
 179 static int  fssnap_translate(struct snapshot_id **, struct buf *);
 180 static void fssnap_write_taskq(void *);
 181 static void fssnap_create_kstats(snapshot_id_t *, int, const char *,
 182     const char *);
 183 static int  fssnap_update_kstat_num(kstat_t *, int);
 184 static void fssnap_delete_kstats(struct cow_info *);
 185 
 186 /* translation table prototypes */
 187 static cow_map_node_t *transtbl_add(cow_map_t *, chunknumber_t, caddr_t);
 188 static cow_map_node_t *transtbl_get(cow_map_t *, chunknumber_t);
 189 static void transtbl_delete(cow_map_t *, cow_map_node_t *);
 190 static void transtbl_free(cow_map_t *);
 191 
 192 static kstat_t *fssnap_highwater_kstat;
 193 
 194 /* ************************************************************************ */
 195 
 196 /* Device and Module Structures */
 197 
 198 static struct cb_ops snap_cb_ops = {
 199         snap_open,
 200         snap_close,
 201         snap_strategy,
 202         snap_print,
 203         nodev,          /* no snap_dump */
 204         snap_read,
 205         nodev,          /* no snap_write */
 206         snap_ioctl,
 207         nodev,          /* no snap_devmap */
 208         nodev,          /* no snap_mmap   */
 209         nodev,          /* no snap_segmap */
 210         nochpoll,
 211         snap_prop_op,
 212         NULL,           /* streamtab */
 213         D_64BIT | D_NEW | D_MP, /* driver compatibility */
 214         CB_REV,
 215         nodev,          /* async I/O read entry point */
 216         nodev           /* async I/O write entry point */
 217 };
 218 
 219 static struct dev_ops snap_ops = {
 220         DEVO_REV,
 221         0,                      /* ref count */
 222         snap_getinfo,
 223         nulldev,                /* snap_identify obsolete */
 224         nulldev,                /* no snap_probe */
 225         snap_attach,
 226         snap_detach,
 227         nodev,                  /* no snap_reset */
 228         &snap_cb_ops,
 229         (struct bus_ops *)NULL,
 230         nulldev,                /* no snap_power() */
 231         ddi_quiesce_not_needed,         /* quiesce */
 232 };
 233 
 234 extern struct mod_ops mod_driverops;
 235 
 236 static struct modldrv md = {
 237         &mod_driverops, /* Type of module. This is a driver */
 238         "snapshot driver",      /* Name of the module */
 239         &snap_ops,
 240 };
 241 
 242 static struct modlinkage ml = {
 243         MODREV_1,
 244         { &md, NULL }
 245 };
 246 
 247 static void *statep;
 248 
 249 int
 250 _init(void)
 251 {
 252         int     error;
 253         kstat_t *ksp;
 254         kstat_named_t   *ksdata;
 255 
 256         error = ddi_soft_state_init(&statep, sizeof (struct snapshot_id *), 1);
 257         if (error) {
 258                 cmn_err(CE_WARN, "_init: failed to init ddi_soft_state.");
 259                 return (error);
 260         }
 261 
 262         error = mod_install(&ml);
 263 
 264         if (error) {
 265                 cmn_err(CE_WARN, "_init: failed to mod_install.");
 266                 ddi_soft_state_fini(&statep);
 267                 return (error);
 268         }
 269 
 270         /*
 271          * Fill in the snapshot operations vector for file systems
 272          * (defined in fssnap_if.c)
 273          */
 274 
 275         snapops.fssnap_create = fssnap_create_impl;
 276         snapops.fssnap_set_candidate = fssnap_set_candidate_impl;
 277         snapops.fssnap_is_candidate = fssnap_is_candidate_impl;
 278         snapops.fssnap_create_done = fssnap_create_done_impl;
 279         snapops.fssnap_delete = fssnap_delete_impl;
 280         snapops.fssnap_strategy = fssnap_strategy_impl;
 281 
 282         mutex_init(&snapshot_mutex, NULL, MUTEX_DEFAULT, NULL);
 283 
 284         /*
 285          * Initialize the fssnap highwater kstat
 286          */
 287         ksp = kstat_create(snapname, 0, FSSNAP_KSTAT_HIGHWATER, "misc",
 288             KSTAT_TYPE_NAMED, 1, 0);
 289         if (ksp != NULL) {
 290                 ksdata = (kstat_named_t *)ksp->ks_data;
 291                 kstat_named_init(ksdata, FSSNAP_KSTAT_HIGHWATER,
 292                     KSTAT_DATA_UINT32);
 293                 ksdata->value.ui32 = 0;
 294                 kstat_install(ksp);
 295         } else {
 296                 cmn_err(CE_WARN, "_init: failed to create highwater kstat.");
 297         }
 298         fssnap_highwater_kstat = ksp;
 299 
 300         return (0);
 301 }
 302 
 303 int
 304 _info(struct modinfo *modinfop)
 305 {
 306         return (mod_info(&ml, modinfop));
 307 }
 308 
 309 int
 310 _fini(void)
 311 {
 312         int     error;
 313 
 314         error = mod_remove(&ml);
 315         if (error)
 316                 return (error);
 317         ddi_soft_state_fini(&statep);
 318 
 319         /*
 320          * delete the fssnap highwater kstat
 321          */
 322         kstat_delete(fssnap_highwater_kstat);
 323 
 324         mutex_destroy(&snapshot_mutex);
 325 
 326         /* Clear out the file system operations vector */
 327         snapops.fssnap_create = NULL;
 328         snapops.fssnap_set_candidate = NULL;
 329         snapops.fssnap_create_done = NULL;
 330         snapops.fssnap_delete = NULL;
 331         snapops.fssnap_strategy = NULL;
 332 
 333         return (0);
 334 }
 335 
 336 /* ************************************************************************ */
 337 
 338 /*
 339  * Snapshot Driver Routines
 340  *
 341  * This section implements the snapshot character and block drivers.  The
 342  * device will appear to be a consistent read-only file system to
 343  * applications that wish to back it up or mount it.  The snapshot driver
 344  * communicates with the file system through the translation table, which
 345  * tells the snapshot driver where to find the data necessary to piece
 346  * together the frozen file system.  The data may either be on the master
 347  * device (no translation exists), in memory (a translation exists but has
 348  * not been flushed to the backing store), or in the backing store file.
 349  * The read request may require the snapshot driver to retrieve data from
 350  * several different places and piece it together to look like a single
 351  * contiguous read.
 352  *
 353  * The device minor number corresponds to the snapshot number in the list of
 354  * snapshot identifiers.  The soft state for each minor number is simply a
 355  * pointer to the snapshot id, which holds all of the snapshot state.  One
 356  * minor number is designated as the control device.  All snapshot create
 357  * and delete requests go through the control device to ensure this module
 358  * is properly loaded and attached before the file system starts calling
 359  * routines defined here.
 360  */
 361 
 362 
 363 /*
 364  * snap_getinfo() - snapshot driver getinfo(9E) routine
 365  *
 366  */
 367 /*ARGSUSED*/
 368 static int
 369 snap_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
 370 {
 371         switch (infocmd) {
 372         case DDI_INFO_DEVT2DEVINFO:
 373                 *result = fssnap_dip;
 374                 return (DDI_SUCCESS);
 375         case DDI_INFO_DEVT2INSTANCE:
 376                 *result = 0;    /* we only have one instance */
 377                 return (DDI_SUCCESS);
 378         }
 379         return (DDI_FAILURE);
 380 }
 381 
 382 /*
 383  * snap_attach() - snapshot driver attach(9E) routine
 384  *
 385  *    sets up snapshot control device and control state.  The control state
 386  *    is a pointer to an "anonymous" snapshot_id for tracking opens and closes
 387  */
 388 static int
 389 snap_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
 390 {
 391         int                     error;
 392 
 393         switch (cmd) {
 394         case DDI_ATTACH:
 395                 /* create the control device */
 396                 error = ddi_create_priv_minor_node(dip, SNAP_CTL_NODE, S_IFCHR,
 397                     SNAP_CTL_MINOR, DDI_PSEUDO, PRIVONLY_DEV,
 398                     PRIV_SYS_CONFIG, PRIV_SYS_CONFIG, 0666);
 399                 if (error == DDI_FAILURE) {
 400                         return (DDI_FAILURE);
 401                 }
 402 
 403                 rw_init(&snap_ctl.sid_rwlock, NULL, RW_DEFAULT, NULL);
 404                 rw_enter(&snap_ctl.sid_rwlock, RW_WRITER);
 405                 fssnap_dip = dip;
 406                 snap_ctl.sid_snapnumber = SNAP_CTL_MINOR;
 407                 /* the control sid is not linked into the snapshot list */
 408                 snap_ctl.sid_next = NULL;
 409                 snap_ctl.sid_cowinfo = NULL;
 410                 snap_ctl.sid_flags = 0;
 411                 rw_exit(&snap_ctl.sid_rwlock);
 412                 ddi_report_dev(dip);
 413 
 414                 return (DDI_SUCCESS);
 415         case DDI_PM_RESUME:
 416                 return (DDI_SUCCESS);
 417 
 418         case DDI_RESUME:
 419                 return (DDI_SUCCESS);
 420 
 421         default:
 422                 return (DDI_FAILURE);
 423         }
 424 }
 425 
 426 /*
 427  * snap_detach() - snapshot driver detach(9E) routine
 428  *
 429  *    destroys snapshot control device and control state.  If any snapshots
 430  *    are active (ie. num_snapshots != 0), the device will refuse to detach.
 431  */
 432 static int
 433 snap_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
 434 {
 435         struct snapshot_id *sidp, *sidnextp;
 436 
 437         switch (cmd) {
 438         case DDI_DETACH:
 439                 /* do not detach if the device is active */
 440                 mutex_enter(&snapshot_mutex);
 441                 if ((num_snapshots != 0) ||
 442                     ((snap_ctl.sid_flags & SID_CHAR_BUSY) != 0)) {
 443                         mutex_exit(&snapshot_mutex);
 444                         return (DDI_FAILURE);
 445                 }
 446 
 447                 /* free up the snapshot list */
 448                 for (sidp = snapshot; sidp != NULL; sidp = sidnextp) {
 449                         ASSERT(SID_AVAILABLE(sidp) &&
 450                             !RW_LOCK_HELD(&sidp->sid_rwlock));
 451                         sidnextp = sidp->sid_next;
 452                         rw_destroy(&sidp->sid_rwlock);
 453                         kmem_free(sidp, sizeof (struct snapshot_id));
 454                 }
 455                 snapshot = NULL;
 456 
 457                 /* delete the control device */
 458                 ddi_remove_minor_node(dip, SNAP_CTL_NODE);
 459                 fssnap_dip = NULL;
 460 
 461                 ASSERT((snap_ctl.sid_flags & SID_CHAR_BUSY) == 0);
 462                 rw_destroy(&snap_ctl.sid_rwlock);
 463                 mutex_exit(&snapshot_mutex);
 464 
 465                 return (DDI_SUCCESS);
 466 
 467         default:
 468                 return (DDI_FAILURE);
 469         }
 470 }
 471 
 472 /*
 473  * snap_open() - snapshot driver open(9E) routine
 474  *
 475  *     marks the snapshot id as busy so it will not be recycled when deleted
 476  *     until the snapshot is closed.
 477  */
 478 /* ARGSUSED */
 479 static int
 480 snap_open(dev_t *devp, int flag, int otyp, cred_t *cred)
 481 {
 482         minor_t minor;
 483         struct snapshot_id **sidpp, *sidp;
 484 
 485         /* snapshots are read-only */
 486         if (flag & FWRITE)
 487                 return (EROFS);
 488 
 489         minor = getminor(*devp);
 490 
 491         if (minor == SNAP_CTL_MINOR) {
 492                 /* control device must be opened exclusively */
 493                 if (((flag & FEXCL) != FEXCL) || (otyp != OTYP_CHR))
 494                         return (EINVAL);
 495 
 496                 rw_enter(&snap_ctl.sid_rwlock, RW_WRITER);
 497                 if ((snap_ctl.sid_flags & SID_CHAR_BUSY) != 0) {
 498                         rw_exit(&snap_ctl.sid_rwlock);
 499                         return (EBUSY);
 500                 }
 501 
 502                 snap_ctl.sid_flags |= SID_CHAR_BUSY;
 503                 rw_exit(&snap_ctl.sid_rwlock);
 504 
 505                 return (0);
 506         }
 507 
 508         sidpp = ddi_get_soft_state(statep, minor);
 509         if (sidpp == NULL || *sidpp == NULL)
 510                 return (ENXIO);
 511         sidp = *sidpp;
 512         rw_enter(&sidp->sid_rwlock, RW_WRITER);
 513 
 514         if ((flag & FEXCL) && SID_BUSY(sidp)) {
 515                 rw_exit(&sidp->sid_rwlock);
 516                 return (EAGAIN);
 517         }
 518 
 519         ASSERT(sidpp != NULL && sidp != NULL);
 520         /* check to see if this snapshot has been killed on us */
 521         if (SID_INACTIVE(sidp)) {
 522                 cmn_err(CE_WARN, "snap_open: snapshot %d does not exist.",
 523                     minor);
 524                 rw_exit(&sidp->sid_rwlock);
 525                 return (ENXIO);
 526         }
 527 
 528         switch (otyp) {
 529         case OTYP_CHR:
 530                 sidp->sid_flags |= SID_CHAR_BUSY;
 531                 break;
 532         case OTYP_BLK:
 533                 sidp->sid_flags |= SID_BLOCK_BUSY;
 534                 break;
 535         default:
 536                 rw_exit(&sidp->sid_rwlock);
 537                 return (EINVAL);
 538         }
 539 
 540         rw_exit(&sidp->sid_rwlock);
 541 
 542         /*
 543          * at this point if a valid snapshot was found then it has
 544          * been marked busy and we can use it.
 545          */
 546         return (0);
 547 }
 548 
 549 /*
 550  * snap_close() - snapshot driver close(9E) routine
 551  *
 552  *    unsets the busy bits in the snapshot id.  If the snapshot has been
 553  *    deleted while the snapshot device was open, the close call will clean
 554  *    up the remaining state information.
 555  */
 556 /* ARGSUSED */
 557 static int
 558 snap_close(dev_t dev, int flag, int otyp, cred_t *cred)
 559 {
 560         struct snapshot_id      **sidpp, *sidp;
 561         minor_t                 minor;
 562         char                    name[20];
 563 
 564         minor = getminor(dev);
 565 
 566         /* if this is the control device, close it and return */
 567         if (minor == SNAP_CTL_MINOR) {
 568                 rw_enter(&snap_ctl.sid_rwlock, RW_WRITER);
 569                 snap_ctl.sid_flags &= ~(SID_CHAR_BUSY);
 570                 rw_exit(&snap_ctl.sid_rwlock);
 571                 return (0);
 572         }
 573 
 574         sidpp = ddi_get_soft_state(statep, minor);
 575         if (sidpp == NULL || *sidpp == NULL) {
 576                 cmn_err(CE_WARN, "snap_close: could not find state for "
 577                     "snapshot %d.", minor);
 578                 return (ENXIO);
 579         }
 580         sidp = *sidpp;
 581         mutex_enter(&snapshot_mutex);
 582         rw_enter(&sidp->sid_rwlock, RW_WRITER);
 583 
 584         /* Mark the snapshot as not being busy anymore */
 585         switch (otyp) {
 586         case OTYP_CHR:
 587                 sidp->sid_flags &= ~(SID_CHAR_BUSY);
 588                 break;
 589         case OTYP_BLK:
 590                 sidp->sid_flags &= ~(SID_BLOCK_BUSY);
 591                 break;
 592         default:
 593                 mutex_exit(&snapshot_mutex);
 594                 rw_exit(&sidp->sid_rwlock);
 595                 return (EINVAL);
 596         }
 597 
 598         if (SID_AVAILABLE(sidp)) {
 599                 /*
 600                  * if this is the last close on a snapshot that has been
 601                  * deleted, then free up the soft state.  The snapdelete
 602                  * ioctl does not free this when the device is in use so
 603                  * we do it here after the last reference goes away.
 604                  */
 605 
 606                 /* remove the device nodes */
 607                 ASSERT(fssnap_dip != NULL);
 608                 (void) snprintf(name, sizeof (name), "%d",
 609                     sidp->sid_snapnumber);
 610                 ddi_remove_minor_node(fssnap_dip, name);
 611                 (void) snprintf(name, sizeof (name), "%d,raw",
 612                     sidp->sid_snapnumber);
 613                 ddi_remove_minor_node(fssnap_dip, name);
 614 
 615                 /* delete the state structure */
 616                 ddi_soft_state_free(statep, sidp->sid_snapnumber);
 617                 num_snapshots--;
 618         }
 619 
 620         mutex_exit(&snapshot_mutex);
 621         rw_exit(&sidp->sid_rwlock);
 622 
 623         return (0);
 624 }
 625 
 626 /*
 627  * snap_read() - snapshot driver read(9E) routine
 628  *
 629  *    reads data from the snapshot by calling snap_strategy() through physio()
 630  */
 631 /* ARGSUSED */
 632 static int
 633 snap_read(dev_t dev, struct uio *uiop, cred_t *credp)
 634 {
 635         minor_t         minor;
 636         struct snapshot_id **sidpp;
 637 
 638         minor = getminor(dev);
 639         sidpp = ddi_get_soft_state(statep, minor);
 640         if (sidpp == NULL || *sidpp == NULL) {
 641                 cmn_err(CE_WARN,
 642                     "snap_read: could not find state for snapshot %d.", minor);
 643                 return (ENXIO);
 644         }
 645         return (physio(snap_strategy, NULL, dev, B_READ, minphys, uiop));
 646 }
 647 
 648 /*
 649  * snap_strategy() - snapshot driver strategy(9E) routine
 650  *
 651  *    cycles through each chunk in the requested buffer and calls
 652  *    snap_getchunk() on each chunk to retrieve it from the appropriate
 653  *    place.  Once all of the parts are put together the requested buffer
 654  *    is returned.  The snapshot driver is read-only, so a write is invalid.
 655  */
 656 static int
 657 snap_strategy(struct buf *bp)
 658 {
 659         struct snapshot_id **sidpp, *sidp;
 660         minor_t         minor;
 661         chunknumber_t   chunk;
 662         int             off, len;
 663         u_longlong_t    reqptr;
 664         int             error = 0;
 665         size_t          chunksz;
 666         caddr_t         buf;
 667 
 668         /* snapshot device is read-only */
 669         if (bp->b_flags & B_WRITE) {
 670                 bioerror(bp, EROFS);
 671                 bp->b_resid = bp->b_bcount;
 672                 biodone(bp);
 673                 return (0);
 674         }
 675 
 676         minor = getminor(bp->b_edev);
 677         sidpp = ddi_get_soft_state(statep, minor);
 678         if (sidpp == NULL || *sidpp == NULL) {
 679                 cmn_err(CE_WARN,
 680                     "snap_strategy: could not find state for snapshot %d.",
 681                     minor);
 682                 bioerror(bp, ENXIO);
 683                 bp->b_resid = bp->b_bcount;
 684                 biodone(bp);
 685                 return (0);
 686         }
 687         sidp = *sidpp;
 688         ASSERT(sidp);
 689         rw_enter(&sidp->sid_rwlock, RW_READER);
 690 
 691         if (SID_INACTIVE(sidp)) {
 692                 bioerror(bp, ENXIO);
 693                 bp->b_resid = bp->b_bcount;
 694                 biodone(bp);
 695                 rw_exit(&sidp->sid_rwlock);
 696                 return (0);
 697         }
 698 
 699         if (bp->b_flags & (B_PAGEIO|B_PHYS))
 700                 bp_mapin(bp);
 701 
 702         bp->b_resid = bp->b_bcount;
 703         ASSERT(bp->b_un.b_addr);
 704         buf = bp->b_un.b_addr;
 705 
 706         chunksz = sidp->sid_cowinfo->cow_map.cmap_chunksz;
 707 
 708         /* reqptr is the current DEV_BSIZE offset into the device */
 709         /* chunk is the chunk containing reqptr */
 710         /* len is the length of the request (in the current chunk) in bytes */
 711         /* off is the byte offset into the current chunk */
 712         reqptr = bp->b_lblkno;
 713         while (bp->b_resid > 0) {
 714                 chunk = dbtocowchunk(&sidp->sid_cowinfo->cow_map, reqptr);
 715                 off = (reqptr % (chunksz >> DEV_BSHIFT)) << DEV_BSHIFT;
 716                 len = min(chunksz - off, bp->b_resid);
 717                 ASSERT((off + len) <= chunksz);
 718 
 719                 if ((error = snap_getchunk(sidp, chunk, off, len, buf)) != 0) {
 720                         /*
 721                          * EINVAL means the user tried to go out of range.
 722                          * Anything else means it's likely that we're
 723                          * confused.
 724                          */
 725                         if (error != EINVAL) {
 726                                 cmn_err(CE_WARN, "snap_strategy: error "
 727                                     "calling snap_getchunk, chunk = %llu, "
 728                                     "offset = %d, len = %d, resid = %lu, "
 729                                     "error = %d.",
 730                                     chunk, off, len, bp->b_resid, error);
 731                         }
 732                         bioerror(bp, error);
 733                         biodone(bp);
 734                         rw_exit(&sidp->sid_rwlock);
 735                         return (0);
 736                 }
 737                 bp->b_resid -= len;
 738                 reqptr += (len >> DEV_BSHIFT);
 739                 buf += len;
 740         }
 741 
 742         ASSERT(bp->b_resid == 0);
 743         biodone(bp);
 744 
 745         rw_exit(&sidp->sid_rwlock);
 746         return (0);
 747 }
 748 
 749 /*
 750  * snap_getchunk() - helper function for snap_strategy()
 751  *
 752  *    gets the requested data from the appropriate place and fills in the
 753  *    buffer.  chunk is the chunk number of the request, offset is the
 754  *    offset into that chunk and must be less than the chunk size.  len is
 755  *    the length of the request starting at offset, and must not exceed a
 756  *    chunk boundary.  buffer is the address to copy the data to.  len
 757  *    bytes are copied into the buffer starting at the location specified.
 758  *
 759  *    A chunk is located according to the following algorithm:
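 *        - If the chunk is not a candidate for translation, it was not
 *          allocated when the snapshot was taken, so a zeroed buffer is
 *          returned.
 *        - If the chunk is a candidate but does not have a translation,
 *          it is read straight from the master device.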
 762  *        - If the chunk does have a translation, then it is either on
 763  *          disk or in memory:
 764  *            o If it is in memory the requested data is simply copied out
 765  *              of the in-memory buffer.
 766  *            o If it is in the backing store, it is read from there.
 767  *
 768  *    This function does the real work of the snapshot driver.
 769  */
 770 static int
 771 snap_getchunk(struct snapshot_id *sidp, chunknumber_t chunk, int offset,
 772     int len, char *buffer)
 773 {
 774         cow_map_t       *cmap = &sidp->sid_cowinfo->cow_map;
 775         cow_map_node_t  *cmn;
 776         struct buf      *snapbuf;
 777         int             error = 0;
 778         char            *newbuffer;
 779         int             newlen = 0;
 780         int             partial = 0;
 781 
 782         ASSERT(RW_READ_HELD(&sidp->sid_rwlock));
 783         ASSERT(offset + len <= cmap->cmap_chunksz);
 784 
 785         /*
 786          * Check if the chunk number is out of range and if so bail out
 787          */
 788         if (chunk >= (cmap->cmap_bmsize * NBBY)) {
 789                 return (EINVAL);
 790         }
 791 
 792         /*
 793          * If the chunk is not a candidate for translation, then the chunk
 794          * was not allocated when the snapshot was taken.  Since it does
 795          * not contain data associated with this snapshot, just return a
 796          * zero buffer instead.
 797          */
 798         if (isclr(cmap->cmap_candidate, chunk)) {
 799                 bzero(buffer, len);
 800                 return (0);
 801         }
 802 
 803         /*
 804          * if the chunk is a candidate for translation but a
 805          * translation does not exist, then read through to the
 806          * original file system.  The rwlock is held until the read
 807          * completes if it hasn't been translated to make sure the
 808          * file system does not translate the block before we
 809          * access it. If it has already been translated we don't
 810          * need the lock, because the translation will never go away.
 811          */
 812         rw_enter(&cmap->cmap_rwlock, RW_READER);
 813         if (isclr(cmap->cmap_hastrans, chunk)) {
 814                 snapbuf = getrbuf(KM_SLEEP);
 815                 /*
 816                  * Reading into the buffer saves having to do a copy,
 817                  * but gets tricky if the request size is not a
 818                  * multiple of DEV_BSIZE.  However, we are filling the
 819                  * buffer left to right, so future reads will write
 820                  * over any extra data we might have read.
 821                  */
 822 
 823                 partial = len % DEV_BSIZE;
 824 
 825                 snapbuf->b_bcount = len;
 826                 snapbuf->b_lblkno = lbtodb(chunk * cmap->cmap_chunksz + offset);
 827                 snapbuf->b_un.b_addr = buffer;
 828 
 829                 snapbuf->b_iodone = NULL;
 830                 snapbuf->b_proc = NULL;              /* i.e. the kernel */
 831                 snapbuf->b_flags = B_READ | B_BUSY;
 832                 snapbuf->b_edev = sidp->sid_fvp->v_vfsp->vfs_dev;
 833 
 834                 if (partial) {
 835                         /*
 836                          * Partial block read in progress.
 837                          * This is bad as modules further down the line
 838                          * assume buf's are exact multiples of DEV_BSIZE
 839                          * and we end up with fewer, or zero, bytes read.
 840                          * To get round this we need to round up to the
 841                          * nearest full block read and then return only
 842                          * len bytes.
 843                          */
 844                         newlen = (len - partial) + DEV_BSIZE;
 845                         newbuffer = kmem_alloc(newlen, KM_SLEEP);
 846 
 847                         snapbuf->b_bcount = newlen;
 848                         snapbuf->b_un.b_addr = newbuffer;
 849                 }
 850 
 851                 (void) bdev_strategy(snapbuf);
 852                 (void) biowait(snapbuf);
 853 
 854                 error = geterror(snapbuf);
 855 
 856                 if (partial) {
 857                         /*
 858                          * Partial block read. Now we need to bcopy the
 859                          * correct number of bytes back into the
 860                          * supplied buffer, and tidy up our temp
 861                          * buffer.
 862                          */
 863                         bcopy(newbuffer, buffer, len);
 864                         kmem_free(newbuffer, newlen);
 865                 }
 866 
 867                 freerbuf(snapbuf);
 868                 rw_exit(&cmap->cmap_rwlock);
 869 
 870                 return (error);
 871         }
 872 
 873         /*
 874          * finally, if the chunk is a candidate for translation and it
 875          * has been translated, then we clone the chunk of the buffer
 876          * that was copied aside by the file system.
 877          * The cmap_rwlock does not need to be held after we know the
 878          * data has already been copied. Once a chunk has been copied
 879          * to the backing file, it is stable read only data.
 880          */
 881         cmn = transtbl_get(cmap, chunk);
 882 
 883         /* check whether the data is in memory or in the backing file */
 884         if (cmn != NULL) {
 885                 ASSERT(cmn->cmn_buf);
 886                 /* already in memory */
 887                 bcopy(cmn->cmn_buf + offset, buffer, len);
 888                 rw_exit(&cmap->cmap_rwlock);
 889         } else {
 890                 ssize_t resid = len;
 891                 int     bf_index;
 892                 /*
 893                  * can cause deadlock with writer if we don't drop the
 894                  * cmap_rwlock before trying to get the backing store file
 895                  * vnode rwlock.
 896                  */
 897                 rw_exit(&cmap->cmap_rwlock);
 898 
 899                 bf_index = chunk / cmap->cmap_chunksperbf;
 900 
 901                 /* read buffer from backing file */
 902                 error = vn_rdwr(UIO_READ,
 903                     (sidp->sid_cowinfo->cow_backfile_array)[bf_index],
 904                     buffer, len, ((chunk % cmap->cmap_chunksperbf) *
 905                     cmap->cmap_chunksz) + offset, UIO_SYSSPACE, 0,
 906                     RLIM64_INFINITY, kcred, &resid);
 907         }
 908 
 909         return (error);
 910 }
 911 
 912 /*
 913  * snap_print() - snapshot driver print(9E) routine
 914  *
 915  *    prints the device identification string.
 916  */
 917 static int
 918 snap_print(dev_t dev, char *str)
 919 {
 920         struct snapshot_id **sidpp;
 921         minor_t         minor;
 922 
 923         minor = getminor(dev);
 924         sidpp = ddi_get_soft_state(statep, minor);
 925         if (sidpp == NULL || *sidpp == NULL) {
 926                 cmn_err(CE_WARN,
 927                     "snap_print: could not find state for snapshot %d.", minor);
 928                 return (ENXIO);
 929         }
 930 
 931         cmn_err(CE_NOTE, "snap_print: snapshot %d: %s",  minor, str);
 932 
 933         return (0);
 934 }
 935 
 936 /*
 937  * snap_prop_op() - snapshot driver prop_op(9E) routine
 938  *
 939  *    get 32-bit and 64-bit values for size (character driver) and nblocks
 940  *    (block driver).
 941  */
 942 static int
 943 snap_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op,
 944     int flags, char *name, caddr_t valuep, int *lengthp)
 945 {
 946         int             minor;
 947         struct snapshot_id **sidpp;
 948         dev_t           mdev;
 949         dev_info_t      *mdip;
 950         int             error;
 951 
 952         minor = getminor(dev);
 953 
 954         /*
 955          * If this is the control device just check for .conf properties,
 956          * if the wildcard DDI_DEV_T_ANY was passed in via the dev_t
 957          * just fall back to the defaults.
 958          */
 959         if ((minor == SNAP_CTL_MINOR) || (dev == DDI_DEV_T_ANY))
 960                 return (ddi_prop_op(dev, dip, prop_op, flags, name,
 961                     valuep, lengthp));
 962 
 963         /* check to see if there is a master device plumbed */
 964         sidpp = ddi_get_soft_state(statep, minor);
 965         if (sidpp == NULL || *sidpp == NULL) {
 966                 cmn_err(CE_WARN,
 967                     "snap_prop_op: could not find state for "
 968                     "snapshot %d.", minor);
 969                 return (DDI_PROP_NOT_FOUND);
 970         }
 971 
 972         if (((*sidpp)->sid_fvp == NULL) || ((*sidpp)->sid_fvp->v_vfsp == NULL))
 973                 return (ddi_prop_op(dev, dip, prop_op, flags, name,
 974                     valuep, lengthp));
 975 
 976         /* hold master device and pass operation down */
 977         mdev = (*sidpp)->sid_fvp->v_vfsp->vfs_dev;
 978         if (mdip = e_ddi_hold_devi_by_dev(mdev, 0)) {
 979 
 980                 /* get size information from the master device. */
 981                 error = cdev_prop_op(mdev, mdip,
 982                     prop_op, flags, name, valuep, lengthp);
 983                 ddi_release_devi(mdip);
 984                 if (error == DDI_PROP_SUCCESS)
 985                         return (error);
 986         }
 987 
 988         /* master device did not service the request, try framework */
 989         return (ddi_prop_op(dev, dip, prop_op, flags, name, valuep, lengthp));
 990 
 991 }
 992 
 993 /*
 994  * snap_ioctl() - snapshot driver ioctl(9E) routine
 995  *
 996  *    only applies to the control device.  The control device accepts two
 997  *    ioctl requests: create a snapshot or delete a snapshot.  In either
 998  *    case, the vnode for the requested file system is extracted, and the
 999  *    request is passed on to the file system via the same ioctl.  The file
1000  *    system is responsible for doing the things necessary for creating or
1001  *    destroying a snapshot, including any file system specific operations
1002  *    that must be performed as well as setting up and deleting the snapshot
1003  *    state through the fssnap interfaces.
1004  */
1005 static int
1006 snap_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
1007 int *rvalp)
1008 {
1009         minor_t minor;
1010         int error = 0;
1011 
1012         minor = getminor(dev);
1013 
1014         if (minor != SNAP_CTL_MINOR) {
1015                 return (EINVAL);
1016         }
1017 
1018         switch (cmd) {
1019         case _FIOSNAPSHOTCREATE:
1020         {
1021                 struct fiosnapcreate    fc;
1022                 struct file             *fp;
1023                 struct vnode            *vp;
1024 
1025                 if (ddi_copyin((void *)arg, &fc, sizeof (fc), mode))
1026                         return (EFAULT);
1027 
1028                 /* get vnode for file system mount point */
1029                 if ((fp = getf(fc.rootfiledesc)) == NULL)
1030                         return (EBADF);
1031 
1032                 ASSERT(fp->f_vnode);
1033                 vp = fp->f_vnode;
1034                 VN_HOLD(vp);
1035                 releasef(fc.rootfiledesc);
1036 
1037                 /* pass ioctl request to file system */
1038                 error = VOP_IOCTL(vp, cmd, arg, 0, credp, rvalp, NULL);
1039                 VN_RELE(vp);
1040                 break;
1041         }
1042         case _FIOSNAPSHOTCREATE_MULTI:
1043         {
1044                 struct fiosnapcreate_multi      fc;
1045                 struct file             *fp;
1046                 struct vnode            *vp;
1047 
1048                 if (ddi_copyin((void *)arg, &fc, sizeof (fc), mode))
1049                         return (EFAULT);
1050 
1051                 /* get vnode for file system mount point */
1052                 if ((fp = getf(fc.rootfiledesc)) == NULL)
1053                         return (EBADF);
1054 
1055                 ASSERT(fp->f_vnode);
1056                 vp = fp->f_vnode;
1057                 VN_HOLD(vp);
1058                 releasef(fc.rootfiledesc);
1059 
1060                 /* pass ioctl request to file system */
1061                 error = VOP_IOCTL(vp, cmd, arg, 0, credp, rvalp, NULL);
1062                 VN_RELE(vp);
1063                 break;
1064         }
1065         case _FIOSNAPSHOTDELETE:
1066         {
1067                 major_t                 major;
1068                 struct fiosnapdelete    fc;
1069                 snapshot_id_t           *sidp = NULL;
1070                 snapshot_id_t           *sidnextp = NULL;
1071                 struct file             *fp = NULL;
1072                 struct vnode            *vp = NULL;
1073                 struct vfs              *vfsp = NULL;
1074                 vfsops_t                *vfsops = EIO_vfsops;
1075 
1076                 if (ddi_copyin((void *)arg, &fc, sizeof (fc), mode))
1077                         return (EFAULT);
1078 
1079                 /* get vnode for file system mount point */
1080                 if ((fp = getf(fc.rootfiledesc)) == NULL)
1081                         return (EBADF);
1082 
1083                 ASSERT(fp->f_vnode);
1084                 vp = fp->f_vnode;
1085                 VN_HOLD(vp);
1086                 releasef(fc.rootfiledesc);
1087                 /*
1088                  * Test for two formats of delete and set correct minor/vp:
1089                  * pseudo device:
1090                  * fssnap -d [/dev/fssnap/x]
1091                  * or
1092                  * mount point:
1093                  * fssnap -d [/mntpt]
1094                  * Note that minor is verified to be equal to SNAP_CTL_MINOR
1095                  * at this point which is an invalid minor number.
1096                  */
1097                 ASSERT(fssnap_dip != NULL);
1098                 major = ddi_driver_major(fssnap_dip);
1099                 mutex_enter(&snapshot_mutex);
1100                 for (sidp = snapshot; sidp != NULL; sidp = sidnextp) {
1101                         rw_enter(&sidp->sid_rwlock, RW_READER);
1102                         sidnextp = sidp->sid_next;
1103                         /* pseudo device: */
1104                         if (major == getmajor(vp->v_rdev)) {
1105                                 minor = getminor(vp->v_rdev);
1106                                 if (sidp->sid_snapnumber == (uint_t)minor &&
1107                                     sidp->sid_fvp) {
1108                                         VN_RELE(vp);
1109                                         vp = sidp->sid_fvp;
1110                                         VN_HOLD(vp);
1111                                         rw_exit(&sidp->sid_rwlock);
1112                                         break;
1113                                 }
1114                         /* Mount point: */
1115                         } else {
1116                                 if (sidp->sid_fvp == vp) {
1117                                         minor = sidp->sid_snapnumber;
1118                                         rw_exit(&sidp->sid_rwlock);
1119                                         break;
1120                                 }
1121                         }
1122                         rw_exit(&sidp->sid_rwlock);
1123                 }
1124                 mutex_exit(&snapshot_mutex);
1125                 /* Verify minor got set correctly above */
1126                 if (minor == SNAP_CTL_MINOR) {
1127                         VN_RELE(vp);
1128                         return (EINVAL);
1129                 }
1130                 dev = makedevice(major, minor);
1131                 /*
1132                  * Create dummy vfs entry
1133                  * to use as a locking semaphore across the IOCTL
1134                  * for mount in progress cases...
1135                  */
1136                 vfsp = vfs_alloc(KM_SLEEP);
1137                 VFS_INIT(vfsp, vfsops, NULL);
1138                 VFS_HOLD(vfsp);
1139                 vfs_addmip(dev, vfsp);
1140                 if ((vfs_devmounting(dev, vfsp)) ||
1141                     (vfs_devismounted(dev))) {
1142                         vfs_delmip(vfsp);
1143                         VFS_RELE(vfsp);
1144                         VN_RELE(vp);
1145                         return (EBUSY);
1146                 }
1147                 /*
1148                  * Nobody mounted but do not release mount in progress lock
1149                  * until IOCTL complete to prohibit a mount sneaking
1150                  * in
1151                  */
1152                 error = VOP_IOCTL(vp, cmd, arg, 0, credp, rvalp, NULL);
1153                 vfs_delmip(vfsp);
1154                 VFS_RELE(vfsp);
1155                 VN_RELE(vp);
1156                 break;
1157         }
1158         default:
1159                 cmn_err(CE_WARN, "snap_ioctl: Invalid ioctl cmd %d, minor %d.",
1160                     cmd, minor);
1161                 return (EINVAL);
1162         }
1163 
1164         return (error);
1165 }
1166 
1167 
1168 /* ************************************************************************ */
1169 
1170 /*
1171  * Translation Table Routines
1172  *
1173  *    These support routines implement a simple doubly linked list
1174  *    to keep track of chunks that are currently in memory.  The maximum
1175  *    size of the list is determined by the fssnap_max_mem_chunks variable.
1176  *    The cmap_rwlock is used to protect the linkage of the list.
1177  */
1178 
1179 /*
1180  * transtbl_add() - add a node to the translation table
1181  *
1182  *    allocates a new node and points it at the buffer passed in.  The node
1183  *    is added to the beginning of the doubly linked list and the head of
1184  *    the list is moved.  The cmap_rwlock must be held as a writer through
1185  *    this operation.
1186  */
1187 static cow_map_node_t *
1188 transtbl_add(cow_map_t *cmap, chunknumber_t chunk, caddr_t buf)
1189 {
1190         cow_map_node_t  *cmnode;
1191 
1192         ASSERT(RW_WRITE_HELD(&cmap->cmap_rwlock));
1193 
1194         cmnode = kmem_alloc(sizeof (cow_map_node_t), KM_SLEEP);
1195 
1196         /*
1197          * insert new translations at the beginning so cmn_table is always
1198          * the first node.
1199          */
1200         cmnode->cmn_chunk = chunk;
1201         cmnode->cmn_buf = buf;
1202         cmnode->cmn_prev = NULL;
1203         cmnode->cmn_next = cmap->cmap_table;
1204         if (cmnode->cmn_next)
1205                 cmnode->cmn_next->cmn_prev = cmnode;
1206         cmap->cmap_table = cmnode;
1207 
1208         return (cmnode);
1209 }
1210 
1211 /*
1212  * transtbl_get() - look up a node in the translation table
1213  *
1214  *    called by the snapshot driver to find data that has been translated.
1215  *    The lookup is done by the chunk number, and the node is returned.
1216  *    If the node was not found, NULL is returned.
1217  */
1218 static cow_map_node_t *
1219 transtbl_get(cow_map_t *cmap, chunknumber_t chunk)
1220 {
1221         cow_map_node_t *cmn;
1222 
1223         ASSERT(RW_READ_HELD(&cmap->cmap_rwlock));
1224         ASSERT(cmap);
1225 
1226         /* search the translation table */
1227         for (cmn = cmap->cmap_table; cmn != NULL; cmn = cmn->cmn_next) {
1228                 if (cmn->cmn_chunk == chunk)
1229                         return (cmn);
1230         }
1231 
1232         /* not found */
1233         return (NULL);
1234 }
1235 
1236 /*
1237  * transtbl_delete() - delete a node from the translation table
1238  *
1239  *    called when a node's data has been written out to disk.  The
1240  *    cmap_rwlock must be held as a writer for this operation.  If the node
1241  *    being deleted is the head of the list, then the head is moved to the
1242  *    next node.  Both the node's data and the node itself are freed.
1243  */
1244 static void
1245 transtbl_delete(cow_map_t *cmap, cow_map_node_t *cmn)
1246 {
1247         ASSERT(RW_WRITE_HELD(&cmap->cmap_rwlock));
1248         ASSERT(cmn);
1249         ASSERT(cmap->cmap_table);
1250 
1251         /* if the head of the list is being deleted, then move the head up */
1252         if (cmap->cmap_table == cmn) {
1253                 ASSERT(cmn->cmn_prev == NULL);
1254                 cmap->cmap_table = cmn->cmn_next;
1255         }
1256 
1257 
1258         /* make previous node's next pointer skip over current node */
1259         if (cmn->cmn_prev != NULL) {
1260                 ASSERT(cmn->cmn_prev->cmn_next == cmn);
1261                 cmn->cmn_prev->cmn_next = cmn->cmn_next;
1262         }
1263 
1264         /* make next node's previous pointer skip over current node */
1265         if (cmn->cmn_next != NULL) {
1266                 ASSERT(cmn->cmn_next->cmn_prev == cmn);
1267                 cmn->cmn_next->cmn_prev = cmn->cmn_prev;
1268         }
1269 
1270         /* free the data and the node */
1271         ASSERT(cmn->cmn_buf);
1272         kmem_free(cmn->cmn_buf, cmap->cmap_chunksz);
1273         kmem_free(cmn, sizeof (cow_map_node_t));
1274 }
1275 
1276 /*
1277  * transtbl_free() - free the entire translation table
1278  *
1279  *    called when the snapshot is deleted.  This frees all of the nodes in
1280  *    the translation table (but not the bitmaps).
1281  */
1282 static void
1283 transtbl_free(cow_map_t *cmap)
1284 {
1285         cow_map_node_t  *curnode;
1286         cow_map_node_t  *tempnode;
1287 
1288         for (curnode = cmap->cmap_table; curnode != NULL; curnode = tempnode) {
1289                 tempnode = curnode->cmn_next;
1290 
1291                 kmem_free(curnode->cmn_buf, cmap->cmap_chunksz);
1292                 kmem_free(curnode, sizeof (cow_map_node_t));
1293         }
1294 }
1295 
1296 
1297 /* ************************************************************************ */
1298 
1299 /*
1300  * Interface Implementation Routines
1301  *
1302  * The following functions implement snapshot interface routines that are
1303  * called by the file system to create, delete, and use a snapshot.  The
1304  * interfaces are defined in fssnap_if.c and are filled in by this driver
1305  * when it is loaded.  This technique allows the file system to depend on
1306  * the interface module without having to load the full implementation and
1307  * snapshot device drivers.
1308  */
1309 
1310 /*
1311  * fssnap_strategy_impl() - strategy routine called by the file system
1312  *
1313  *    called by the file system to handle copy-on-write when necessary.  All
1314  *    reads and writes that the file system performs should go through this
1315  *    function.  If the file system calls the underlying device's strategy
1316  *    routine without going through fssnap_strategy() (eg. by calling
1317  *    bdev_strategy()), the snapshot may not be consistent.
1318  *
1319  *    This function starts by doing significant sanity checking to insure
1320  *    the snapshot was not deleted out from under it or deleted and then
1321  *    recreated.  To do this, it checks the actual pointer passed into it
1322  *    (ie. the handle held by the file system).  NOTE that the parameter is
1323  *    a POINTER TO A POINTER to the snapshot id.  Once the snapshot id is
1324  *    locked, it knows things are ok and that this snapshot is really for
1325  *    this file system.
1326  *
1327  *    If the request is a write, fssnap_translate() is called to determine
1328  *    whether a copy-on-write is required.  If it is a read, the read is
1329  *    simply passed on to the underlying device.
1330  */
1331 static void
1332 fssnap_strategy_impl(void *snapshot_id, buf_t *bp)
1333 {
1334         struct snapshot_id **sidpp;
1335         struct snapshot_id *sidp;
1336         int error;
1337 
1338         /* read requests are always passed through */
1339         if (bp->b_flags & B_READ) {
1340                 (void) bdev_strategy(bp);
1341                 return;
1342         }
1343 
1344         /*
1345          * Because we were not able to take the snapshot read lock BEFORE
1346          * checking for a snapshot back in the file system, things may have
1347          * drastically changed out from under us.  For instance, the snapshot
1348          * may have been deleted, deleted and recreated, or worse yet, deleted
1349          * for this file system but now the snapshot number is in use by another
1350          * file system.
1351          *
1352          * Having a pointer to the file system's snapshot id pointer allows us
1353          * to sanity check most of this, though it assumes the file system is
1354          * keeping track of a pointer to the snapshot_id somewhere.
1355          */
1356         sidpp = (struct snapshot_id **)snapshot_id;
1357         sidp = *sidpp;
1358 
1359         /*
1360          * if this file system's snapshot was disabled, just pass the
1361          * request through.
1362          */
1363         if (sidp == NULL) {
1364                 (void) bdev_strategy(bp);
1365                 return;
1366         }
1367 
1368         /*
1369          * Once we have the reader lock the snapshot will not magically go
1370          * away.  But things may have changed on us before this so double check.
1371          */
1372         rw_enter(&sidp->sid_rwlock, RW_READER);
1373 
1374         /*
1375          * if an error was founds somewhere the DELETE flag will be
1376          * set to indicate the snapshot should be deleted and no new
1377          * translations should occur.
1378          */
1379         if (sidp->sid_flags & SID_DELETE) {
1380                 rw_exit(&sidp->sid_rwlock);
1381                 (void) fssnap_delete_impl(sidpp);
1382                 (void) bdev_strategy(bp);
1383                 return;
1384         }
1385 
1386         /*
1387          * If the file system is no longer pointing to the snapshot we were
1388          * called with, then it should not attempt to translate this buffer as
1389          * it may be going to a snapshot for a different file system.
1390          * Even if the file system snapshot pointer is still the same, the
1391          * snapshot may have been disabled before we got the reader lock.
1392          */
1393         if (sidp != *sidpp || SID_INACTIVE(sidp)) {
1394                 rw_exit(&sidp->sid_rwlock);
1395                 (void) bdev_strategy(bp);
1396                 return;
1397         }
1398 
1399         /*
1400          * At this point we're sure the snapshot will not go away while the
1401          * reader lock is held, and we are reasonably certain that we are
1402          * writing to the correct snapshot.
1403          */
1404         if ((error = fssnap_translate(sidpp, bp)) != 0) {
1405                 /*
1406                  * fssnap_translate can release the reader lock if it
1407                  * has to wait for a semaphore.  In this case it is possible
1408                  * for the snapshot to be deleted in this time frame.  If this
1409                  * happens just sent the buf thru to the filesystems device.
1410                  */
1411                 if (sidp != *sidpp || SID_INACTIVE(sidp)) {
1412                         rw_exit(&sidp->sid_rwlock);
1413                         (void) bdev_strategy(bp);
1414                         return;
1415                 }
1416                 bioerror(bp, error);
1417                 biodone(bp);
1418         }
1419         rw_exit(&sidp->sid_rwlock);
1420 }
1421 
1422 /*
1423  * fssnap_translate() - helper function for fssnap_strategy()
1424  *
1425  *    performs the actual copy-on-write for write requests, if required.
1426  *    This function does the real work of the file system side of things.
1427  *
1428  *    It first checks the candidate bitmap to quickly determine whether any
1429  *    action is necessary.  If the candidate bitmap indicates the chunk was
1430  *    allocated when the snapshot was created, then it checks to see whether
1431  *    a translation already exists.  If a translation already exists then no
1432  *    action is required.  If the chunk is a candidate for copy-on-write,
1433  *    and a translation does not already exist, then the chunk is read in
1434  *    and a node is added to the translation table.
1435  *
1436  *    Once all of the chunks in the request range have been copied (if they
1437  *    needed to be), then the original request can be satisfied and the old
1438  *    data can be overwritten.
1439  */
1440 static int
1441 fssnap_translate(struct snapshot_id **sidpp, struct buf *wbp)
1442 {
1443         snapshot_id_t   *sidp = *sidpp;
1444         struct buf      *oldbp; /* buffer to store old data in */
1445         struct cow_info *cowp = sidp->sid_cowinfo;
1446         cow_map_t       *cmap = &cowp->cow_map;
1447         cow_map_node_t  *cmn;
1448         chunknumber_t   cowchunk, startchunk, endchunk;
1449         int             error;
1450         int     throttle_write = 0;
1451 
1452         /* make sure the snapshot is active */
1453         ASSERT(RW_READ_HELD(&sidp->sid_rwlock));
1454 
1455         startchunk = dbtocowchunk(cmap, wbp->b_lblkno);
1456         endchunk   = dbtocowchunk(cmap, wbp->b_lblkno +
1457             ((wbp->b_bcount-1) >> DEV_BSHIFT));
1458 
1459         /*
1460          * Do not throttle the writes of the fssnap taskq thread and
1461          * the log roll (trans_roll) thread. Furthermore the writes to
1462          * the on-disk log are also not subject to throttling.
1463          * The fssnap_write_taskq thread's write can block on the throttling
1464          * semaphore which leads to self-deadlock as this same thread
1465          * releases the throttling semaphore after completing the IO.
1466          * If the trans_roll thread's write is throttled then we can deadlock
1467          * because the fssnap_taskq_thread which releases the throttling
1468          * semaphore can block waiting for log space which can only be
1469          * released by the trans_roll thread.
1470          */
1471 
1472         throttle_write = !(taskq_member(cowp->cow_taskq, curthread) ||
1473             tsd_get(bypass_snapshot_throttle_key));
1474 
1475         /*
1476          * Iterate through all chunks covered by this write and perform the
1477          * copy-aside if necessary.  Once all chunks have been safely
1478          * stowed away, the new data may be written in a single sweep.
1479          *
1480          * For each chunk in the range, the following sequence is performed:
1481          *      - Is the chunk a candidate for translation?
1482          *              o If not, then no translation is necessary, continue
1483          *      - If it is a candidate, then does it already have a translation?
1484          *              o If so, then no translation is necessary, continue
1485          *      - If it is a candidate, but does not yet have a translation,
1486          *        then read the old data and schedule an asynchronous taskq
1487          *        to write the old data to the backing file.
1488          *
1489          * Once this has been performed over the entire range of chunks, then
1490          * it is safe to overwrite the data that is there.
1491          *
1492          * Note that no lock is required to check the candidate bitmap because
1493          * it never changes once the snapshot is created.  The reader lock is
1494          * taken to check the hastrans bitmap since it may change.  If it
1495          * turns out a copy is required, then the lock is upgraded to a
1496          * writer, and the bitmap is re-checked as it may have changed while
1497          * the lock was released.  Finally, the write lock is held while
1498          * reading the old data to make sure it is not translated out from
1499          * under us.
1500          *
1501          * This locking mechanism should be sufficient to handle multiple
1502          * threads writing to overlapping chunks simultaneously.
1503          */
1504         for (cowchunk = startchunk; cowchunk <= endchunk; cowchunk++) {
1505                 /*
1506                  * If the cowchunk is outside of the range of our
1507                  * candidate maps, then simply break out of the
1508                  * loop and pass the I/O through to bdev_strategy.
1509                  * This would occur if the file system has grown
1510                  * larger since the snapshot was taken.
1511                  */
1512                 if (cowchunk >= (cmap->cmap_bmsize * NBBY))
1513                         break;
1514 
1515                 /*
1516                  * If no disk blocks were allocated in this chunk when the
1517                  * snapshot was created then no copy-on-write will be
1518                  * required.  Since this bitmap is read-only no locks are
1519                  * necessary.
1520                  */
1521                 if (isclr(cmap->cmap_candidate, cowchunk)) {
1522                         continue;
1523                 }
1524 
1525                 /*
1526                  * If a translation already exists, the data can be written
1527                  * through since the old data has already been saved off.
1528                  */
1529                 if (isset(cmap->cmap_hastrans, cowchunk)) {
1530                         continue;
1531                 }
1532 
1533 
1534                 /*
1535                  * Throttle translations if there are too many outstanding
1536                  * chunks in memory.  The semaphore is sema_v'd by the taskq.
1537                  *
1538                  * You can't keep the sid_rwlock if you would go to sleep.
1539                  * This will result in deadlock when someone tries to delete
1540                  * the snapshot (wants the sid_rwlock as a writer, but can't
1541                  * get it).
1542                  */
1543                 if (throttle_write) {
1544                         if (sema_tryp(&cmap->cmap_throttle_sem) == 0) {
1545                                 rw_exit(&sidp->sid_rwlock);
1546                                 atomic_inc_32(&cmap->cmap_waiters);
1547                                 sema_p(&cmap->cmap_throttle_sem);
1548                                 atomic_dec_32(&cmap->cmap_waiters);
1549                                 rw_enter(&sidp->sid_rwlock, RW_READER);
1550 
1551                         /*
1552                          * Now since we released the sid_rwlock the state may
1553                          * have transitioned underneath us. so check that again.
1554                          */
1555                                 if (sidp != *sidpp || SID_INACTIVE(sidp)) {
1556                                         sema_v(&cmap->cmap_throttle_sem);
1557                                         return (ENXIO);
1558                                 }
1559                         }
1560                 }
1561 
1562                 /*
1563                  * Acquire the lock as a writer and check to see if a
1564                  * translation has been added in the meantime.
1565                  */
1566                 rw_enter(&cmap->cmap_rwlock, RW_WRITER);
1567                 if (isset(cmap->cmap_hastrans, cowchunk)) {
1568                         if (throttle_write)
1569                                 sema_v(&cmap->cmap_throttle_sem);
1570                         rw_exit(&cmap->cmap_rwlock);
1571                         continue; /* go to the next chunk */
1572                 }
1573 
1574                 /*
1575                  * read a full chunk of data from the requested offset rounded
1576                  * down to the nearest chunk size.
1577                  */
1578                 oldbp = getrbuf(KM_SLEEP);
1579                 oldbp->b_lblkno = cowchunktodb(cmap, cowchunk);
1580                 oldbp->b_edev = wbp->b_edev;
1581                 oldbp->b_bcount = cmap->cmap_chunksz;
1582                 oldbp->b_bufsize = cmap->cmap_chunksz;
1583                 oldbp->b_iodone = NULL;
1584                 oldbp->b_proc = NULL;
1585                 oldbp->b_flags = B_READ;
1586                 oldbp->b_un.b_addr = kmem_alloc(cmap->cmap_chunksz, KM_SLEEP);
1587 
1588                 (void) bdev_strategy(oldbp);
1589                 (void) biowait(oldbp);
1590 
1591                 /*
1592                  * It's ok to bail in the middle of translating the range
1593                  * because the extra copy-asides will not hurt anything
1594                  * (except by using extra space in the backing store).
1595                  */
1596                 if ((error = geterror(oldbp)) != 0) {
1597                         cmn_err(CE_WARN, "fssnap_translate: error reading "
1598                             "old data for snapshot %d, chunk %llu, disk block "
1599                             "%lld, size %lu, error %d.", sidp->sid_snapnumber,
1600                             cowchunk, oldbp->b_lblkno, oldbp->b_bcount, error);
1601                         kmem_free(oldbp->b_un.b_addr, cmap->cmap_chunksz);
1602                         freerbuf(oldbp);
1603                         rw_exit(&cmap->cmap_rwlock);
1604                         if (throttle_write)
1605                                 sema_v(&cmap->cmap_throttle_sem);
1606                         return (error);
1607                 }
1608 
1609                 /*
1610                  * add the node to the translation table and save a reference
1611                  * to pass to the taskq for writing out to the backing file
1612                  */
1613                 cmn = transtbl_add(cmap, cowchunk, oldbp->b_un.b_addr);
1614                 freerbuf(oldbp);
1615 
1616                 /*
1617                  * Add a reference to the snapshot id so the lower level
1618                  * processing (ie. the taskq) can get back to the state
1619                  * information.
1620                  */
1621                 cmn->cmn_sid = sidp;
1622                 cmn->release_sem = throttle_write;
1623                 setbit(cmap->cmap_hastrans, cowchunk);
1624 
1625                 rw_exit(&cmap->cmap_rwlock);
1626 
1627                 /*
1628                  * schedule the asynchronous write to the backing file
1629                  */
1630                 if (cowp->cow_backfile_array != NULL)
1631                         (void) taskq_dispatch(cowp->cow_taskq,
1632                             fssnap_write_taskq, cmn, TQ_SLEEP);
1633         }
1634 
1635         /*
1636          * Write new data in place of the old data.  At this point all of the
1637          * chunks touched by this write have been copied aside and so the new
1638          * data can be written out all at once.
1639          */
1640         (void) bdev_strategy(wbp);
1641 
1642         return (0);
1643 }
1644 
1645 /*
1646  * fssnap_write_taskq() - write in-memory translations to the backing file
1647  *
1648  *    writes in-memory translations to the backing file asynchronously.  A
1649  *    task is dispatched each time a new translation is created.  The task
1650  *    writes the data to the backing file and removes it from the memory
1651  *    list. The throttling semaphore is released only if the particular
1652  *    translation was throttled in fssnap_translate.
1653  */
1654 static void
1655 fssnap_write_taskq(void *arg)
1656 {
1657         cow_map_node_t  *cmn = (cow_map_node_t *)arg;
1658         snapshot_id_t   *sidp = cmn->cmn_sid;
1659         cow_info_t      *cowp = sidp->sid_cowinfo;
1660         cow_map_t       *cmap = &cowp->cow_map;
1661         int             error;
1662         int             bf_index;
1663         int             release_sem = cmn->release_sem;
1664 
1665         /*
1666          * The sid_rwlock does not need to be held here because the taskqs
1667          * are destroyed explicitly by fssnap_delete (with the sid_rwlock
1668          * held as a writer).  taskq_destroy() will flush all of the tasks
1669          * out before fssnap_delete frees up all of the structures.
1670          */
1671 
1672         /* if the snapshot was disabled from under us, drop the request. */
1673         rw_enter(&sidp->sid_rwlock, RW_READER);
1674         if (SID_INACTIVE(sidp)) {
1675                 rw_exit(&sidp->sid_rwlock);
1676                 if (release_sem)
1677                         sema_v(&cmap->cmap_throttle_sem);
1678                 return;
1679         }
1680         rw_exit(&sidp->sid_rwlock);
1681 
1682         atomic_inc_64((uint64_t *)&cmap->cmap_nchunks);
1683 
1684         if ((cmap->cmap_maxsize != 0) &&
1685             ((cmap->cmap_nchunks * cmap->cmap_chunksz) > cmap->cmap_maxsize)) {
1686                 cmn_err(CE_WARN, "fssnap_write_taskq: snapshot %d (%s) has "
1687                     "reached the maximum backing file size specified (%llu "
1688                     "bytes) and will be deleted.", sidp->sid_snapnumber,
1689                     (char *)cowp->cow_kstat_mntpt->ks_data,
1690                     cmap->cmap_maxsize);
1691                 if (release_sem)
1692                         sema_v(&cmap->cmap_throttle_sem);
1693                 atomic_or_uint(&sidp->sid_flags, SID_DELETE);
1694                 return;
1695         }
1696 
1697         /* perform the write */
1698         bf_index = cmn->cmn_chunk / cmap->cmap_chunksperbf;
1699 
1700         if (error = vn_rdwr(UIO_WRITE, (cowp->cow_backfile_array)[bf_index],
1701             cmn->cmn_buf, cmap->cmap_chunksz,
1702             (cmn->cmn_chunk % cmap->cmap_chunksperbf) * cmap->cmap_chunksz,
1703             UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, (ssize_t *)NULL)) {
1704                 cmn_err(CE_WARN, "fssnap_write_taskq: error writing to "
1705                     "backing file.  DELETING SNAPSHOT %d, backing file path "
1706                     "%s, offset %llu bytes, error %d.", sidp->sid_snapnumber,
1707                     (char *)cowp->cow_kstat_bfname->ks_data,
1708                     cmn->cmn_chunk * cmap->cmap_chunksz, error);
1709                 if (release_sem)
1710                         sema_v(&cmap->cmap_throttle_sem);
1711                 atomic_or_uint(&sidp->sid_flags, SID_DELETE);
1712                 return;
1713         }
1714 
1715         /*
1716          * now remove the node and buffer from memory
1717          */
1718         rw_enter(&cmap->cmap_rwlock, RW_WRITER);
1719         transtbl_delete(cmap, cmn);
1720         rw_exit(&cmap->cmap_rwlock);
1721 
1722         /* Allow more translations */
1723         if (release_sem)
1724                 sema_v(&cmap->cmap_throttle_sem);
1725 
1726 }
1727 
1728 /*
1729  * fssnap_create_impl() - called from the file system to create a new snapshot
1730  *
1731  *    allocates and initializes the structures needed for a new snapshot.
1732  *    This is called by the file system when it receives an ioctl request to
1733  *    create a new snapshot.  An unused snapshot identifier is either found
1734  *    or created, and eventually returned as the opaque handle the file
1735  *    system will use to identify this snapshot.  The snapshot number
1736  *    associated with the snapshot identifier is the same as the minor
1737  *    number for the snapshot device that is used to access that snapshot.
1738  *
1739  *    The snapshot can not be used until the candidate bitmap is populated
1740  *    by the file system (see fssnap_set_candidate_impl()), and the file
1741  *    system finishes the setup process by calling fssnap_create_done().
1742  *    Nearly all of the snapshot locks are held for the duration of the
1743  *    create, and are not released until fssnap_create_done is called().
1744  */
1745 static void *
1746 fssnap_create_impl(chunknumber_t nchunks, uint_t chunksz, u_offset_t maxsize,
1747     struct vnode *fsvp, int backfilecount, struct vnode **bfvpp, char *backpath,
1748     u_offset_t max_backfile_size)
1749 {
1750         refstr_t *mountpoint;
1751         char taskqname[50];
1752         struct cow_info *cowp;
1753         struct cow_map  *cmap;
1754         struct snapshot_id *sidp;
1755         int lastsnap;
1756 
1757         /*
1758          * Sanity check the parameters we care about
1759          * (we don't care about the informational parameters)
1760          */
1761         if ((nchunks == 0) ||
1762             ((chunksz % DEV_BSIZE) != 0) ||
1763             (bfvpp == NULL)) {
1764                 return (NULL);
1765         }
1766 
1767         /*
1768          * Look for unused snapshot identifiers.  Snapshot ids are never
1769          * freed, but deleted snapshot ids will be recycled as needed.
1770          */
1771         mutex_enter(&snapshot_mutex);
1772 
1773 findagain:
1774         lastsnap = 0;
1775         for (sidp = snapshot; sidp != NULL; sidp = sidp->sid_next) {
1776                 if (sidp->sid_snapnumber > lastsnap)
1777                         lastsnap = sidp->sid_snapnumber;
1778 
1779                 /*
1780                  * The sid_rwlock is taken as a reader initially so that
1781                  * activity on each snapshot is not stalled while searching
1782                  * for a free snapshot id.
1783                  */
1784                 rw_enter(&sidp->sid_rwlock, RW_READER);
1785 
1786                 /*
1787                  * If the snapshot has been deleted and nobody is using the
1788                  * snapshot device than we can reuse this snapshot_id.  If
1789                  * the snapshot is marked to be deleted (SID_DELETE), then
1790                  * it hasn't been deleted yet so don't reuse it.
1791                  */
1792                 if (SID_AVAILABLE(sidp))
1793                         break; /* This spot is unused, so take it */
1794                 rw_exit(&sidp->sid_rwlock);
1795         }
1796 
1797         /*
1798          * add a new snapshot identifier if there are no deleted
1799          * entries.  Since it doesn't matter what order the entries
1800          * are in we can just add it to the beginning of the list.
1801          */
1802         if (sidp) {
1803                 if (rw_tryupgrade(&sidp->sid_rwlock) == 0) {
1804                         /* someone else grabbed it as a writer, try again */
1805                         rw_exit(&sidp->sid_rwlock);
1806                         goto findagain;
1807                 }
1808         } else {
1809                 /* Create a new node if we didn't find an unused one */
1810                 sidp = kmem_alloc(sizeof (struct snapshot_id), KM_SLEEP);
1811                 rw_init(&sidp->sid_rwlock, NULL, RW_DEFAULT, NULL);
1812                 rw_enter(&sidp->sid_rwlock, RW_WRITER);
1813                 sidp->sid_snapnumber = (snapshot == NULL) ? 0 : lastsnap + 1;
1814                 sidp->sid_cowinfo = NULL;
1815                 sidp->sid_flags = 0;
1816                 sidp->sid_next = snapshot;
1817                 snapshot = sidp;
1818         }
1819 
1820         ASSERT(RW_WRITE_HELD(&sidp->sid_rwlock));
1821         ASSERT(sidp->sid_cowinfo == NULL);
1822         ASSERT(sidp->sid_snapnumber <= (lastsnap + 1));
1823 
1824         sidp->sid_flags |= SID_CREATING;
1825         /* The root vnode is held until snap_delete_impl() is called */
1826         VN_HOLD(fsvp);
1827         sidp->sid_fvp = fsvp;
1828         num_snapshots++;
1829 
1830         /* allocate and initialize structures */
1831 
1832         cowp = kmem_zalloc(sizeof (struct cow_info), KM_SLEEP);
1833 
1834         cowp->cow_backfile_array = bfvpp;
1835         cowp->cow_backcount = backfilecount;
1836         cowp->cow_backfile_sz = max_backfile_size;
1837 
1838         /*
1839          * Initialize task queues for this snapshot.  Only a small number
1840          * of threads are required because they will be serialized on the
1841          * backing file's reader/writer lock anyway.
1842          */
1843         (void) snprintf(taskqname, sizeof (taskqname), "%s_taskq_%d", snapname,
1844             sidp->sid_snapnumber);
1845         cowp->cow_taskq = taskq_create(taskqname, fssnap_taskq_nthreads,
1846             minclsyspri, 1,  fssnap_taskq_maxtasks, 0);
1847 
1848         /* don't allow tasks to start until after everything is ready */
1849         taskq_suspend(cowp->cow_taskq);
1850 
1851         /* initialize translation table */
1852         cmap = &cowp->cow_map;
1853         rw_init(&cmap->cmap_rwlock, NULL, RW_DEFAULT, NULL);
1854         rw_enter(&cmap->cmap_rwlock, RW_WRITER);
1855 
1856         sema_init(&cmap->cmap_throttle_sem, fssnap_max_mem_chunks, NULL,
1857             SEMA_DEFAULT, NULL);
1858 
1859         cmap->cmap_chunksz = chunksz;
1860         cmap->cmap_maxsize = maxsize;
1861         cmap->cmap_chunksperbf = max_backfile_size / chunksz;
1862 
1863         /*
1864          * allocate one bit per chunk for the bitmaps, round up
1865          */
1866         cmap->cmap_bmsize = (nchunks + (NBBY - 1)) / NBBY;
1867         cmap->cmap_hastrans  = kmem_zalloc(cmap->cmap_bmsize, KM_SLEEP);
1868         cmap->cmap_candidate = kmem_zalloc(cmap->cmap_bmsize, KM_SLEEP);
1869 
1870         sidp->sid_cowinfo = cowp;
1871 
1872         /* initialize kstats for this snapshot */
1873         mountpoint = vfs_getmntpoint(fsvp->v_vfsp);
1874         fssnap_create_kstats(sidp, sidp->sid_snapnumber,
1875             refstr_value(mountpoint), backpath);
1876         refstr_rele(mountpoint);
1877 
1878         mutex_exit(&snapshot_mutex);
1879 
1880         /*
1881          * return with snapshot id rwlock held as a writer until
1882          * fssnap_create_done is called
1883          */
1884         return (sidp);
1885 }
1886 
1887 /*
1888  * fssnap_set_candidate_impl() - mark a chunk as a candidate for copy-on-write
1889  *
1890  *    sets a bit in the candidate bitmap that indicates that a chunk is a
1891  *    candidate for copy-on-write.  Typically, chunks that are allocated on
1892  *    the file system at the time the snapshot is taken are candidates,
1893  *    while chunks that have no allocated data do not need to be copied.
1894  *    Chunks containing metadata must be marked as candidates as well.
1895  */
1896 static void
1897 fssnap_set_candidate_impl(void *snapshot_id, chunknumber_t chunknumber)
1898 {
1899         struct snapshot_id      *sid = snapshot_id;
1900         struct cow_info *cowp = sid->sid_cowinfo;
1901         struct cow_map  *cmap = &cowp->cow_map;
1902 
1903         /* simple bitmap operation for now */
1904         ASSERT(chunknumber < (cmap->cmap_bmsize * NBBY));
1905         setbit(cmap->cmap_candidate, chunknumber);
1906 }
1907 
1908 /*
1909  * fssnap_is_candidate_impl() - check whether a chunk is a candidate
1910  *
1911  *    returns 0 if the chunk is not a candidate and 1 if the chunk is a
1912  *    candidate.  This can be used by the file system to change behavior for
1913  *    chunks that might induce a copy-on-write.  The offset is specified in
1914  *    bytes since the chunk size may not be known by the file system.
1915  */
1916 static int
1917 fssnap_is_candidate_impl(void *snapshot_id, u_offset_t off)
1918 {
1919         struct snapshot_id      *sid = snapshot_id;
1920         struct cow_info *cowp = sid->sid_cowinfo;
1921         struct cow_map  *cmap = &cowp->cow_map;
1922         ulong_t chunknumber = off / cmap->cmap_chunksz;
1923 
1924         /* simple bitmap operation for now */
1925         ASSERT(chunknumber < (cmap->cmap_bmsize * NBBY));
1926         return (isset(cmap->cmap_candidate, chunknumber));
1927 }
1928 
1929 /*
1930  * fssnap_create_done_impl() - complete the snapshot setup process
1931  *
1932  *    called when the file system is done populating the candidate bitmap
1933  *    and it is ready to start using the snapshot.  This routine releases
1934  *    the snapshot locks, allows taskq tasks to start processing, and
1935  *    creates the device minor nodes associated with the snapshot.
1936  */
1937 static int
1938 fssnap_create_done_impl(void *snapshot_id)
1939 {
1940         struct snapshot_id      **sidpp, *sidp = snapshot_id;
1941         struct cow_info         *cowp;
1942         struct cow_map          *cmap;
1943         int                     snapnumber = -1;
1944         char                    name[20];
1945 
1946         /* sid rwlock and cmap rwlock should be taken from fssnap_create */
1947         ASSERT(sidp);
1948         ASSERT(RW_WRITE_HELD(&sidp->sid_rwlock));
1949         ASSERT(sidp->sid_cowinfo);
1950 
1951         cowp = sidp->sid_cowinfo;
1952         cmap = &cowp->cow_map;
1953 
1954         ASSERT(RW_WRITE_HELD(&cmap->cmap_rwlock));
1955 
1956         sidp->sid_flags &= ~(SID_CREATING | SID_DISABLED);
1957         snapnumber = sidp->sid_snapnumber;
1958 
1959         /* allocate state structure and find new snapshot id */
1960         if (ddi_soft_state_zalloc(statep, snapnumber) != DDI_SUCCESS) {
1961                 cmn_err(CE_WARN,
1962                     "snap_ioctl: create: could not allocate "
1963                     "state for snapshot %d.", snapnumber);
1964                 snapnumber = -1;
1965                 goto out;
1966         }
1967 
1968         sidpp = ddi_get_soft_state(statep, snapnumber);
1969         *sidpp = sidp;
1970 
1971         /* create minor node based on snapshot number */
1972         ASSERT(fssnap_dip != NULL);
1973         (void) snprintf(name, sizeof (name), "%d", snapnumber);
1974         if (ddi_create_minor_node(fssnap_dip, name, S_IFBLK,
1975             snapnumber, DDI_PSEUDO, 0) != DDI_SUCCESS) {
1976                 cmn_err(CE_WARN, "snap_ioctl: could not create "
1977                     "block minor node for snapshot %d.", snapnumber);
1978                 snapnumber = -1;
1979                 goto out;
1980         }
1981 
1982         (void) snprintf(name, sizeof (name), "%d,raw", snapnumber);
1983         if (ddi_create_minor_node(fssnap_dip, name, S_IFCHR,
1984             snapnumber, DDI_PSEUDO, 0) != DDI_SUCCESS) {
1985                 cmn_err(CE_WARN, "snap_ioctl: could not create "
1986                     "character minor node for snapshot %d.", snapnumber);
1987                 snapnumber = -1;
1988         }
1989 
1990 out:
1991         rw_exit(&sidp->sid_rwlock);
1992         rw_exit(&cmap->cmap_rwlock);
1993 
1994         /* let the taskq threads start processing */
1995         taskq_resume(cowp->cow_taskq);
1996 
1997         return (snapnumber);
1998 }
1999 
2000 /*
2001  * fssnap_delete_impl() - delete a snapshot
2002  *
2003  *    used when a snapshot is no longer needed.  This is called by the file
2004  *    system when it receives an ioctl request to delete a snapshot.  It is
2005  *    also called internally when error conditions such as disk full, errors
2006  *    writing to the backing file, or backing file maxsize exceeded occur.
2007  *    If the snapshot device is busy when the delete request is received,
2008  *    all state will be deleted except for the soft state and device files
2009  *    associated with the snapshot; they will be deleted when the snapshot
2010  *    device is closed.
2011  *
2012  *    NOTE this function takes a POINTER TO A POINTER to the snapshot id,
2013  *    and expects to be able to set the handle held by the file system to
2014  *    NULL.  This depends on the file system checking that variable for NULL
2015  *    before calling fssnap_strategy().
2016  */
2017 static int
2018 fssnap_delete_impl(void *snapshot_id)
2019 {
2020         struct snapshot_id      **sidpp = (struct snapshot_id **)snapshot_id;
2021         struct snapshot_id      *sidp;
2022         struct snapshot_id      **statesidpp;
2023         struct cow_info         *cowp;
2024         struct cow_map          *cmap;
2025         char                    name[20];
2026         int                     snapnumber = -1;
2027         vnode_t                 **vpp;
2028 
2029         /*
2030          * sidp is guaranteed to be valid if sidpp is valid because
2031          * the snapshot list is append-only.
2032          */
2033         if (sidpp == NULL) {
2034                 return (-1);
2035         }
2036 
2037         sidp = *sidpp;
2038         rw_enter(&sidp->sid_rwlock, RW_WRITER);
2039 
2040         ASSERT(RW_WRITE_HELD(&sidp->sid_rwlock));
2041 
2042         /*
2043          * double check that the snapshot is still valid for THIS file system
2044          */
2045         if (*sidpp == NULL) {
2046                 rw_exit(&sidp->sid_rwlock);
2047                 return (-1);
2048         }
2049 
2050         /*
2051          * Now we know the snapshot is still valid and will not go away
2052          * because we have the write lock.  Once the state is transitioned
2053          * to "disabling", the sid_rwlock can be released.  Any pending I/O
2054          * waiting for the lock as a reader will check for this state and
2055          * abort without touching data that may be getting freed.
2056          */
2057         sidp->sid_flags |= SID_DISABLING;
2058         if (sidp->sid_flags & SID_DELETE) {
2059                 cmn_err(CE_WARN, "Snapshot %d automatically deleted.",
2060                     sidp->sid_snapnumber);
2061                 sidp->sid_flags &= ~(SID_DELETE);
2062         }
2063 
2064 
2065         /*
2066          * This is pointing into file system specific data!  The assumption is
2067          * that fssnap_strategy() gets called from the file system based on
2068          * whether this reference to the snapshot_id is NULL or not.  So
2069          * setting this to NULL should disable snapshots for the file system.
2070          */
2071         *sidpp = NULL;
2072 
2073         /* remove cowinfo */
2074         cowp = sidp->sid_cowinfo;
2075         if (cowp == NULL) {
2076                 rw_exit(&sidp->sid_rwlock);
2077                 return (-1);
2078         }
2079         rw_exit(&sidp->sid_rwlock);
2080 
2081         /* destroy task queues first so they don't reference freed data. */
2082         if (cowp->cow_taskq) {
2083                 taskq_destroy(cowp->cow_taskq);
2084                 cowp->cow_taskq = NULL;
2085         }
2086 
2087         if (cowp->cow_backfile_array != NULL) {
2088                 for (vpp = cowp->cow_backfile_array; *vpp; vpp++)
2089                         VN_RELE(*vpp);
2090                 kmem_free(cowp->cow_backfile_array,
2091                     (cowp->cow_backcount + 1) * sizeof (vnode_t *));
2092                 cowp->cow_backfile_array = NULL;
2093         }
2094 
2095         sidp->sid_cowinfo = NULL;
2096 
2097         /* remove cmap */
2098         cmap = &cowp->cow_map;
2099         ASSERT(cmap);
2100 
2101         if (cmap->cmap_candidate)
2102                 kmem_free(cmap->cmap_candidate, cmap->cmap_bmsize);
2103 
2104         if (cmap->cmap_hastrans)
2105                 kmem_free(cmap->cmap_hastrans, cmap->cmap_bmsize);
2106 
2107         if (cmap->cmap_table)
2108                 transtbl_free(&cowp->cow_map);
2109 
2110         rw_destroy(&cmap->cmap_rwlock);
2111 
2112         while (cmap->cmap_waiters) {
2113                 sema_p(&cmap->cmap_throttle_sem);
2114                 sema_v(&cmap->cmap_throttle_sem);
2115         }
2116         sema_destroy(&cmap->cmap_throttle_sem);
2117 
2118         /* remove kstats */
2119         fssnap_delete_kstats(cowp);
2120 
2121         kmem_free(cowp, sizeof (struct cow_info));
2122 
2123         statesidpp = ddi_get_soft_state(statep, sidp->sid_snapnumber);
2124         if (statesidpp == NULL || *statesidpp == NULL) {
2125                 cmn_err(CE_WARN,
2126                     "fssnap_delete_impl: could not find state for snapshot %d.",
2127                     sidp->sid_snapnumber);
2128         }
2129         ASSERT(*statesidpp == sidp);
2130 
2131         /*
2132          * Leave the node in the list marked DISABLED so it can be reused
2133          * and avoid many race conditions.  Return the snapshot number
2134          * that was deleted.
2135          */
2136         mutex_enter(&snapshot_mutex);
2137         rw_enter(&sidp->sid_rwlock, RW_WRITER);
2138         sidp->sid_flags &= ~(SID_DISABLING);
2139         sidp->sid_flags |= SID_DISABLED;
2140         VN_RELE(sidp->sid_fvp);
2141         sidp->sid_fvp = NULL;
2142         snapnumber = sidp->sid_snapnumber;
2143 
2144         /*
2145          * If the snapshot is not busy, free the device info now.  Otherwise
2146          * the device nodes are freed in snap_close() when the device is
2147          * closed.  The sid will not be reused until the device is not busy.
2148          */
2149         if (SID_AVAILABLE(sidp)) {
2150                 /* remove the device nodes */
2151                 ASSERT(fssnap_dip != NULL);
2152                 (void) snprintf(name, sizeof (name), "%d",
2153                     sidp->sid_snapnumber);
2154                 ddi_remove_minor_node(fssnap_dip, name);
2155                 (void) snprintf(name, sizeof (name), "%d,raw",
2156                     sidp->sid_snapnumber);
2157                 ddi_remove_minor_node(fssnap_dip, name);
2158 
2159                 /* delete the state structure */
2160                 ddi_soft_state_free(statep, sidp->sid_snapnumber);
2161                 num_snapshots--;
2162         }
2163 
2164         mutex_exit(&snapshot_mutex);
2165         rw_exit(&sidp->sid_rwlock);
2166 
2167         return (snapnumber);
2168 }
2169 
2170 /*
2171  * fssnap_create_kstats() - allocate and initialize snapshot kstats
2172  *
2173  */
2174 static void
2175 fssnap_create_kstats(snapshot_id_t *sidp, int snapnum,
2176     const char *mountpoint, const char *backfilename)
2177 {
2178         kstat_t *num, *mntpoint, *bfname;
2179         kstat_named_t *hw;
2180         struct cow_info *cowp = sidp->sid_cowinfo;
2181         struct cow_kstat_num *stats;
2182 
2183         /* update the high water mark */
2184         if (fssnap_highwater_kstat == NULL) {
2185                 cmn_err(CE_WARN, "fssnap_create_kstats: failed to lookup "
2186                     "high water mark kstat.");
2187                 return;
2188         }
2189 
2190         hw = (kstat_named_t *)fssnap_highwater_kstat->ks_data;
2191         if (hw->value.ui32 < snapnum)
2192                 hw->value.ui32 = snapnum;
2193 
2194         /* initialize the mount point kstat */
2195         kstat_delete_byname(snapname, snapnum, FSSNAP_KSTAT_MNTPT);
2196 
2197         if (mountpoint != NULL) {
2198                 mntpoint = kstat_create(snapname, snapnum, FSSNAP_KSTAT_MNTPT,
2199                     "misc", KSTAT_TYPE_RAW, strlen(mountpoint) + 1, 0);
2200                 if (mntpoint == NULL) {
2201                         cowp->cow_kstat_mntpt = NULL;
2202                         cmn_err(CE_WARN, "fssnap_create_kstats: failed to "
2203                             "create mount point kstat");
2204                 } else {
2205                         (void) strncpy(mntpoint->ks_data, mountpoint,
2206                             strlen(mountpoint));
2207                         cowp->cow_kstat_mntpt = mntpoint;
2208                         kstat_install(mntpoint);
2209                 }
2210         } else {
2211                 cowp->cow_kstat_mntpt = NULL;
2212                 cmn_err(CE_WARN, "fssnap_create_kstats: mount point not "
2213                     "specified.");
2214         }
2215 
2216         /* initialize the backing file kstat */
2217         kstat_delete_byname(snapname, snapnum, FSSNAP_KSTAT_BFNAME);
2218 
2219         if (backfilename == NULL) {
2220                 cowp->cow_kstat_bfname = NULL;
2221         } else {
2222                 bfname = kstat_create(snapname, snapnum, FSSNAP_KSTAT_BFNAME,
2223                     "misc", KSTAT_TYPE_RAW, strlen(backfilename) + 1, 0);
2224                 if (bfname != NULL) {
2225                         (void) strncpy(bfname->ks_data, backfilename,
2226                             strlen(backfilename));
2227                         cowp->cow_kstat_bfname = bfname;
2228                         kstat_install(bfname);
2229                 } else {
2230                         cowp->cow_kstat_bfname = NULL;
2231                         cmn_err(CE_WARN, "fssnap_create_kstats: failed to "
2232                             "create backing file name kstat");
2233                 }
2234         }
2235 
2236         /* initialize numeric kstats */
2237         kstat_delete_byname(snapname, snapnum, FSSNAP_KSTAT_NUM);
2238 
2239         num = kstat_create(snapname, snapnum, FSSNAP_KSTAT_NUM,
2240             "misc", KSTAT_TYPE_NAMED,
2241             sizeof (struct cow_kstat_num) / sizeof (kstat_named_t),
2242             0);
2243         if (num == NULL) {
2244                 cmn_err(CE_WARN, "fssnap_create_kstats: failed to create "
2245                     "numeric kstats");
2246                 cowp->cow_kstat_num = NULL;
2247                 return;
2248         }
2249 
2250         cowp->cow_kstat_num = num;
2251         stats = num->ks_data;
2252         num->ks_update = fssnap_update_kstat_num;
2253         num->ks_private = sidp;
2254 
2255         kstat_named_init(&stats->ckn_state, FSSNAP_KSTAT_NUM_STATE,
2256             KSTAT_DATA_INT32);
2257         kstat_named_init(&stats->ckn_bfsize, FSSNAP_KSTAT_NUM_BFSIZE,
2258             KSTAT_DATA_UINT64);
2259         kstat_named_init(&stats->ckn_maxsize, FSSNAP_KSTAT_NUM_MAXSIZE,
2260             KSTAT_DATA_UINT64);
2261         kstat_named_init(&stats->ckn_createtime, FSSNAP_KSTAT_NUM_CREATETIME,
2262             KSTAT_DATA_LONG);
2263         kstat_named_init(&stats->ckn_chunksize, FSSNAP_KSTAT_NUM_CHUNKSIZE,
2264             KSTAT_DATA_UINT32);
2265 
2266         /* initialize the static kstats */
2267         stats->ckn_chunksize.value.ui32 = cowp->cow_map.cmap_chunksz;
2268         stats->ckn_maxsize.value.ui64 = cowp->cow_map.cmap_maxsize;
2269         stats->ckn_createtime.value.l = gethrestime_sec();
2270 
2271         kstat_install(num);
2272 }
2273 
2274 /*
2275  * fssnap_update_kstat_num() - update a numerical snapshot kstat value
2276  *
2277  */
2278 int
2279 fssnap_update_kstat_num(kstat_t *ksp, int rw)
2280 {
2281         snapshot_id_t *sidp = (snapshot_id_t *)ksp->ks_private;
2282         struct cow_info *cowp = sidp->sid_cowinfo;
2283         struct cow_kstat_num *stats = ksp->ks_data;
2284 
2285         if (rw == KSTAT_WRITE)
2286                 return (EACCES);
2287 
2288         /* state */
2289         if (sidp->sid_flags & SID_CREATING)
2290                 stats->ckn_state.value.i32 = COWSTATE_CREATING;
2291         else if (SID_INACTIVE(sidp))
2292                 stats->ckn_state.value.i32 = COWSTATE_DISABLED;
2293         else if (SID_BUSY(sidp))
2294                 stats->ckn_state.value.i32 = COWSTATE_ACTIVE;
2295         else
2296                 stats->ckn_state.value.i32 = COWSTATE_IDLE;
2297 
2298         /* bfsize */
2299         stats->ckn_bfsize.value.ui64 = cowp->cow_map.cmap_nchunks *
2300             cowp->cow_map.cmap_chunksz;
2301 
2302         return (0);
2303 }
2304 
2305 /*
2306  * fssnap_delete_kstats() - deallocate snapshot kstats
2307  *
2308  */
2309 void
2310 fssnap_delete_kstats(struct cow_info *cowp)
2311 {
2312         if (cowp->cow_kstat_num != NULL) {
2313                 kstat_delete(cowp->cow_kstat_num);
2314                 cowp->cow_kstat_num = NULL;
2315         }
2316         if (cowp->cow_kstat_mntpt != NULL) {
2317                 kstat_delete(cowp->cow_kstat_mntpt);
2318                 cowp->cow_kstat_mntpt = NULL;
2319         }
2320         if (cowp->cow_kstat_bfname != NULL) {
2321                 kstat_delete(cowp->cow_kstat_bfname);
2322                 cowp->cow_kstat_bfname = NULL;
2323         }
2324 }