1 /*
   2  * This file and its contents are supplied under the terms of the
   3  * Common Development and Distribution License ("CDDL"), version 1.0.
   4  * You may only use this file in accordance with the terms of version
   5  * 1.0 of the CDDL.
   6  *
   7  * A full copy of the text of the CDDL should have accompanied this
   8  * source.  A copy of the CDDL is also available via the Internet at
   9  * http://www.illumos.org/license/CDDL.
  10  */
  11 
  12 /*
  13  * Copyright 2013 Damian Bogel.  All rights reserved.
  14  */
  15 
  16 #include <sys/debug.h>
  17 #include <sys/errno.h>
  18 #include <sys/fsh.h>
  19 #include <sys/fsh_impl.h>
  20 #include <sys/id_space.h>
  21 #include <sys/kmem.h>
  22 #include <sys/ksynch.h>
  23 #include <sys/list.h>
  24 #include <sys/sunddi.h>
  25 #include <sys/sysmacros.h>
  26 #include <sys/types.h>
  27 #include <sys/vfs.h>
  28 #include <sys/vnode.h>
  29 
  30 /*
  31  * Filesystem hook framework (fsh)
  32  *
  33  * 1. Abstract.
  34  * The main goal of the filesystem hook framework is to provide an easy way to
  35  * inject client-defined behaviour into vfs/vnode calls. fsh works on
  36  * vfs_t granularity.
  37  *
  38  *
  39  * 2. Overview.
  40  * fsh_t is the main object in the fsh. An fsh_t is a structure containing:
  41  *      - pointers to hooking functions (named after corresponding
  42  *      vnodeops/vfsops)
  43  *      - a pointer to an argument to pass (this is shared for all the
  44  *      hooks in a given fsh_t)
  45  *      - a pointer to the *hook remove callback* - it's being fired after a
  46  *      hook is removed and the hook has stopped executing. It's safe to destroy
  47  *      any data associated with this hook.
  48  *
  49  * The information from fsh_t is copied by the fsh and an fsh_handle_t
  50  * is returned. It should be used for further removing.
  51  *
  52  *
  53  * 3. Usage.
  54  * It is expected that vfs_t/vnode_t that are passed to fsh_foo() functions
  55  * are held by the caller when needed. fsh does no vfs_t/vnode_t locking.
  56  *
  57  * fsh_t is a structure filled out by the client. If a client does not want
  58  * to add/remove a hook for function foo(), he should fill the foo field of
  59  * fsh_t with NULL. Every hook has a type of corresponding vfsop/vnodeop with
  60  * two additional arguments:
  61  *      - fsh_int_t *fsh_int - this argument MUST be passed to
  62  *      fsh_next_foo(). fsh wouldn't know which hook to execute next
  63  *      without it
  64  *      - void *arg - this is the argument passed with fsh_t during
  65  *      installation
  66  *      - void (*remove_cb)(void *, fsh_handle_t) - hook remove callback
  67  *      (mentioned earlier); its first argument is arg, the second is the
  68  *      handle
  69  *
  70  * After installation, an fsh_handle_t is returned to the caller.
  71  *
  72  * Every hook function is responsible for passing the control to the next
  73  * hook associated with a particular call. In order to provide an easy way to
  74  * modify the behaviour of a function call both before and after the
  75  * underlying vfsop/vnodeop (or next hook) execution, a hook has to call
  76  * fsh_next_foo() at some point. This function does necessary internal
  77  * operations and calls the next hook, until there's no hook left, then it
  78  * calls the underlying vfsop/vnodeop.
  79  * Example:
  80  * my_freefs(fsh_int_t *fsh_int, void *arg, vfs_t *vfsp) {
  81  *      cmn_err(CE_NOTE, "freefs called!\n");
  82  *      return (fsh_next_freefs(fsh_int, vfsp));
  83  * }
  84  *
  85  *
  86  * A client might want to fire callbacks when vfs_t's are being mounted
  87  * or freed. There's an fsh_callback_t structure provided to install such
  88  * callbacks along with the API.
  89  * It is legal to call fsh_hook_{install,remove}() inside a mount callback
  90  * WITHOUT holding the vfs_t.
  91  *
  92  * After vfs_t's free callback returns, all the handles associated with the
  93  * hooks installed on this vfs_t are invalid and must not be used.
  94  *
  95  *
  96  * 4. API
  97  * None of the APIs should be called during interrupt context above lock
  98  * level. The only exceptions are fsh_next_foo() functions, which do not use
  99  * locks.
 100  *
 101  * a) fsh.h
 102  * Any of these functions could be called inside a hook or a hook remove
 103  * callback.
 104  * fsh_callback_{install,remove}() must not be called inside a {mount,free}
 105  * callback. Doing so will cause a deadlock. Other functions can be called
 106  * inside {mount,free} callbacks.
 107  *
 108  * fsh_fs_enable(vfs_t *vfsp)
 109  * fsh_fs_disable(vfs_t *vfsp)
 110  *      Enables/disables fsh for a given vfs_t.
 111  *
 112  * fsh_hook_install(vfs_t *vfsp, fsh_t *hooks)
 113  *      Installs hooks on vfsp filesystem.
 114  *      It's important that hooks are executed in LIFO installation order,
 115  *      which means that if there are hooks A and B installed in this order, B
 116  *      is going to be executed before A.
 117  *      It returns a correct handle, or (-1) if hook/callback limit exceeded.
 118  *      The handle is valid until a free callback returns or an explicit call
 119  *      to fsh_hook_remove().
 120  *
 121  * fsh_hook_remove(fsh_handle_t handle)
 122  *      Removes a hook and invalidates the handle.
 123  *      It is guaranteed that after this function returns, calls to
 124  *      vnodeops/vfsops won't go through this hook, although there might be
 125  *      some threads still executing this hook. When hook remove callback is
 126  *      fired, it is guaranteed that the hook won't be executed anymore. It is
 127  *      safe to remove all the internal data associated with this hook inside
 128  *      the hook remove callback. The hook remove callback could be called
 129  *      inside fsh_hook_remove().
 130  *
 131  * fsh_next_foo(fsh_int_t *fsh_int, void *arg, ARGUMENTS)
 132  *      This is the function which should be called once in every hook. It
 133  *      does the necessary internal operations and passes control to the
 134  *      next hook or, if there's no hook left, to the underlying
 135  *      vfsop/vnodeop.
 136  *
 137  * fsh_callback_install(fsh_callback_t *callback)
 138  * fsh_callback_remove(fsh_callback_handle_t handle)
 139  *      Installs/removes callbacks for vfs_t mount/free. The mount callback
 140  *      is executed right before domount() returns. The free callback is
 141  *      called right before VFS_FREEVFS() is called.
 142  *      The fsh_callback_install() returns a correct handle, or (-1) if
 143  *      hook/callback limit exceeded.
 144  *
 145  * b) fsh_impl.h (for vfs.c and vnode.c only)
 146  * fsh_init()
 147  *      This call has to be done in vfsinit(). It initialises the fsh. It
 148  *      is absolutely necessary that this call is made before any other fsh
 149  *      operation.
 150  *
 151  * fsh_exec_mount_callbacks(vfs_t *vfsp)
 152  * fsh_exec_free_callbacks(vfs_t *vfsp)
 153  *      Used to execute all fsh callbacks for {mount,free} of a vfs_t.
 154  *
 155  * fsh_fsrec_destroy(struct fsh_fsrecord *fsrecp)
 156  *      Destroys an fsh_fsrecord structure. All the hooks installed on this
 157  *      vfs_t are then destroyed. free callback is called before this function.
 158  *
 159  * fsh_foo(ARGUMENTS)
 160  *      Function used to start executing the hook chain for a given call.
 161  *
 162  *
 163  * 5. Internals.
 164  * fsh_int_t is an internal hook structure. It is reference counted.
 165  * fshi_hold() and fshi_rele() should be used whenever needed.
 166  * fsh_int_t entries are elements of both fsh_map (global) and fshfsr_list
 167  * (local to vfs_t). All entries are unique and are identified by fshi_handle.
 168  *
 169  * fsh_int_t properties:
 170  *      - fsh_hook_install() sets the ref. counter to 1 and adds it to both
 171  *      fsh_map and fshfsr_list
 172  *      - fsh_hook_remove() decreases the ref. counter by 1, removes the hook
 173  *      from fsh_map and marks the hook as *doomed*
 174  *      - if fsh_int_t is on the fshfsr_list, it's alive and there is a thread
 175  *      executing it
 176  *      - if fsh_int_t is marked as *doomed*, the reference counter cannot
 177  *      be increased and thus no thread can acquire this fsh_int_t
 178  *      - ref. counter can drop to 0 only after an fsh_hook_remove() call; this
 179  *      also means that the fsh_int_t is *doomed* and isn't a part of fsh_map
 180  *      - fsh_int_t could be also destroyed without fsh_hook_remove() call,
 181  *      that happens only inside fsh_fsrec_destroy() where it is guaranteed
 182  *      that there is no thread executing the hook
 183  *
 184  *
 185  * fsh_fsrecord_t is a structure which lives inside a vfs_t.
 186  * fsh_fsrecord_t contains:
 187  *      - an rw-lock that protects the structure
 188  *      - a list of hooks installed on this vfs_t
 189  *      - a flag which tells whether fsh is enabled on this vfs_t
 190  *
 191  *
 192  * fsh_prepare_fsrec rule:
 193  * Every function that needs vfsp->vfs_fshrecord has to call
 194  * fsh_prepare_fsrec() first. If and only if the call is made, it is safe to
 195  * use vfsp->vfs_fshrecord.
 196  *
 197  * Unfortunately, because of unexpected behaviour of some filesystems (no use
 198  * of vfs_alloc()/vfs_init()) there's no good place to initialise the
 199  * fsh_fsrecord_t structure. The approach being used here is to check if it's
 200  * initialised in every call. Because of the fact that no lock could be used
 201  * here (the same problem with initialisation), a spinlock is used.  This is
 202  * explained in more detail in a comment before fsh_prepare_fsrec(). After
 203  * calling fsh_prepare_fsrec() it's completely safe to keep the vfs_fshrecord
 204  * pointer locally, because it won't be changed until vfs_free() is called.
 205  *
 206  * The only exception from the fsh_prepare_fsrec() rule is vfs_free(),
 207  * where it is expected that no other fsh calls would be made for the
 208  * vfs_t that's being freed. That's why vfs_fshrecord could be only NULL or a
 209  * valid pointer and could not be concurrently accessed.
 210  *
 211  * When there are no fsh functions (that use a particular fsh_fsrecord_t)
 212  * executing, the vfs_fshrecord pointer won't be equal to fsh_res_ptr. It
 213  * would be NULL or a pointer to an initialised fsh_fsrecord_t.
 214  *
 215  *
 216  * Callbacks:
 217  * Mount callbacks are executed by a call to fsh_exec_mount_callbacks() right
 218  * before returning from domount()@vfs.c.
 219  *
 220  * Free callbacks are executed by a call to fsh_exec_free_callbacks() right
 221  * before calling VFS_FREEVFS(), after vfs_t's reference count drops to 0.
 222  *
 223  *
 224  * fsh_next_foo(fsh_int_t *fshi, ARGUMENTS)
 225  *      This function is quite simple. It takes the fsh_int_t and passes control
 226  *      to the next hook or to the underlying vnodeop/vfsop.
 227  *
 228  *
 229  * 6. Locking
 230  * a) public
 231  * fsh does no vfs_t nor vnode_t locking. It is expected that whenever it is
 232  * needed, the client does that.
 233  *
 234  * fsh_callback_{install,remove} must not be called inside a callback, because
 235  * it will cause a deadlock.
 236  *
 237  * b) internal
 238  * Locking diagram:
 239  *
 240  *     fsh_hook_install()    fsh_hook_remove()   fsh_fsrec_destroy()
 241  *           |                     |                |
 242  *           |                     |                |
 243  *           +------------------+  |   +------------+
 244  *                              |  |   |
 245  *                              V  V   V
 246  *                              fsh_lock
 247  *                                 |   |
 248  *                                 |   +----- fshfsr_lock, RW_WRITER ---+
 249  *                                 |                                    |
 250  *                                 V                                    |
 251  *               +---------------------------------------+              |
 252  *               |               fsh_map                 |              |
 253  *               |                                       |              |
 254  *          +----|-> vfsp->vfs_fshrecord->fshfsr_list <--|--------------+
 255  *          |    +------------------------------^--------+
 256  *          |                                   |
 257  *          |                                   |
 258  * fshfsr_lock, RW_READER              fshfsr_lock, RW_WRITER
 259  *          |                                   |
 260  *          |                                   |
 261  *   fsh_read(),                            fshi_rele()
 262  *   fsh_write(),
 263  *   ...,                               Might be called from:
 264  *   fsh_next_read(),                    fsh_hook_remove()
 265  *   fsh_next_write(),                   fsh_read(), fsh_write(), ...
 266  *   ...                                 fsh_next_read(), fsh_next_write(), ...
 267  *
 268  * fsh_lock is a global lock for administrative path (fsh_hook_install,
 269  * fsh_hook_remove) and fsh_fsrec_destroy() (which is semi-administrative, since
 270  * it destroys the unremoved hooks). It is used only when fsh_map needs to be
 271  * locked. The usage of this lock guarantees that the data in fsh_map and
 272  * fshfsr_lists is consistent.
 273  */
 274 
 275 
 276 /* Internals */
 277 struct fsh_int {
 278         fsh_handle_t    fshi_handle;
 279         fsh_t           fshi_hooks;
 280         vfs_t           *fshi_vfsp;
 281 
 282         kmutex_t        fshi_lock;
 283         uint64_t        fshi_ref;
 284         uint64_t        fshi_doomed;    /* changed inside fsh_lock */
 285 
 286         /* next node in fshfsr_list */
 287         list_node_t     fshi_next;
 288 
 289         /* next node in fsh_map */
 290         list_node_t     fshi_global;
 291 };
 292 
 293 typedef struct fsh_callback_int {
 294         fsh_callback_t  fshci_cb;
 295         fsh_callback_handle_t fshci_handle;
 296         list_node_t     fshci_next;
 297 } fsh_callback_int_t;
 298 
 299 
 300 static kmutex_t fsh_lock;
 301 
 302 /*
 303  * fsh_fsrecord_t is the main internal structure. It's content is protected
 304  * by fshfsr_lock. The fshfsr_list is a list of fsh_int_t hook entries for
 305  * the vfs_t that contains the fsh_fsrecord_t.
 306  */
 307 struct fsh_fsrecord {
 308         krwlock_t       fshfsr_lock;
 309         int             fshfsr_enabled;
 310         list_t          fshfsr_list;
 311 };
 312 
 313 /*
 314  * Global list of fsh_int_t. Protected by fsh_lock.
 315  */
 316 static list_t fsh_map;
 317 
 318 /*
 319  * Global list of fsh_callback_int_t.
 320  */
 321 static krwlock_t fsh_cblist_lock;
 322 static list_t fsh_cblist;
 323 
 324 /*
 325  * A reserved pointer for fsh purposes. It is used because of the method
 326  * chosen for solving concurrency issues with vfs_fshrecord. The full
 327  * explanation is in the big theory statement at the beginning of this
 328  * file and above fsh_fsrec_prepare(). It is initialised in fsh_init().
 329  */
 330 static void *fsh_res_ptr;
 331 
 332 static fsh_fsrecord_t *fsh_fsrec_create();
 333 
 334 int fsh_limit = INT_MAX;
 335 static id_space_t *fsh_idspace;
 336 
 337 /*
 338  * fsh_prepare_fsrec()
 339  *
 340  * Important note:
 341  * Before using this function, fsh_init() MUST be called. We do that in
 342  * vfsinit()@vfs.c.
 343  *
 344  * One would ask, why isn't the vfsp->vfs_fshrecord initialised when the
 345  * vfs_t is created. Unfortunately, some filesystems (e.g. fifofs) do not
 346  * call vfs_init() or even vfs_alloc(). It's possible that some unbundled
 347  * filesystems could do the same thing. That's why this solution is
 348  * introduced. It should be called before any code that needs access to
 349  * vfs_fshrecord.
 350  *
 351  * Locking:
 352  * There are no locks here, because there's no good place to initialise
 353  * the lock. Concurrency issues are solved by using atomic instructions
 354  * and a spinlock, which is spinning only once for a given vfs_t. Because
 355  * of that, the usage of the spinlock isn't bad at all.
 356  *
 357  * How it works:
 358  * a) if vfsp->vfs_fshrecord equals NULL, atomic_cas_ptr() changes it to
 359  *      fsh_res_ptr. That's a signal for other threads, that the structure
 360  *      is being initialised.
 361  * b) if vfsp->vfs_fshrecord equals fsh_res_ptr, that means we have to wait,
 362  *      because vfs_fshrecord is being initialised by another call.
 363  * c) other cases:
 364  *      vfs_fshrecord is already initialised, so we can use it. It won't change
 365  *      until vfs_free() is called. It can't happen when someone is holding
 366  *      the vfs_t, which is expected from the caller of fsh API.
 367  */
 368 static void
 369 fsh_prepare_fsrec(vfs_t *vfsp)
 370 {
 371         fsh_fsrecord_t *fsrec;
 372 
 373         while ((fsrec = atomic_cas_ptr(&vfsp->vfs_fshrecord, NULL,
 374             fsh_res_ptr)) == fsh_res_ptr)
 375                 ;
 376 
 377         if (fsrec == NULL)
 378                 atomic_swap_ptr(&vfsp->vfs_fshrecord, fsh_fsrec_create());
 379 }
 380 
 381 /*
 382  * API for enabling/disabling fsh per vfs_t.
 383  *
 384  * A newly created vfs_t has fsh enabled by default. If one would want to change
 385  * this behaviour, mount callbacks could be used.
 386  *
 387  * The caller is expected to hold the vfs_t.
 388  *
 389  * These functions must NOT be called in a hook.
 390  */
 391 void
 392 fsh_fs_enable(vfs_t *vfsp)
 393 {
 394         fsh_prepare_fsrec(vfsp);
 395 
 396         rw_enter(&vfsp->vfs_fshrecord->fshfsr_lock, RW_WRITER);
 397         vfsp->vfs_fshrecord->fshfsr_enabled = 1;
 398         rw_exit(&vfsp->vfs_fshrecord->fshfsr_lock);
 399 }
 400 
 401 void
 402 fsh_fs_disable(vfs_t *vfsp)
 403 {
 404         fsh_prepare_fsrec(vfsp);
 405 
 406         rw_enter(&vfsp->vfs_fshrecord->fshfsr_lock, RW_WRITER);
 407         vfsp->vfs_fshrecord->fshfsr_enabled = 0;
 408         rw_exit(&vfsp->vfs_fshrecord->fshfsr_lock);
 409 }
 410 
 411 /*
 412  * API used for installing hooks. fsh_handle_t is returned for further
 413  * actions (currently just removing) on this set of hooks.
 414  *
 415  * fsh_t fields:
 416  * - arg - argument passed to every hook
 417  * - remove_cb - remove callback, called after a hook is removed and all the
 418  *      threads stop executing it
 419  * - read, write, ... - pointers to hooks for corresponding vnodeops/vfsops;
 420  *      if there is no hook desired for an operation, it should be set to
 421  *      NULL
 422  *
 423  * It's important that the hooks are executed in LIFO installation order (they
 424  * are added to the head of the hook list).
 425  *
 426  * The caller is expected to hold the vfs_t.
 427  *
 428  * Returns (-1) if hook/callback limit exceeded, handle otherwise.
 429  */
 430 fsh_handle_t
 431 fsh_hook_install(vfs_t *vfsp, fsh_t *hooks)
 432 {
 433         fsh_handle_t    handle;
 434         fsh_int_t       *fshi;
 435 
 436         fsh_prepare_fsrec(vfsp);
 437 
 438         if ((handle = id_alloc(fsh_idspace)) == -1)
 439                 return (-1);
 440 
 441         fshi = kmem_alloc(sizeof (*fshi), KM_SLEEP);
 442         mutex_init(&fshi->fshi_lock, NULL, MUTEX_DRIVER, NULL);
 443         (void) memcpy(&fshi->fshi_hooks, hooks, sizeof (fshi->fshi_hooks));
 444         fshi->fshi_handle = handle;
 445         fshi->fshi_doomed = 0;
 446         fshi->fshi_ref = 1;
 447         fshi->fshi_vfsp = vfsp;
 448 
 449         mutex_enter(&fsh_lock);
 450         rw_enter(&vfsp->vfs_fshrecord->fshfsr_lock, RW_WRITER);
 451         list_insert_head(&vfsp->vfs_fshrecord->fshfsr_list, fshi);
 452         rw_exit(&vfsp->vfs_fshrecord->fshfsr_lock);
 453 
 454         list_insert_head(&fsh_map, fshi);
 455         mutex_exit(&fsh_lock);
 456 
 457         return (handle);
 458 }
 459 
 460 static int
 461 fshi_hold(fsh_int_t *fshi)
 462 {
 463         int can_hold;
 464 
 465         mutex_enter(&fshi->fshi_lock);
 466         if (fshi->fshi_doomed == 1) {
 467                 can_hold = 0;
 468         } else {
 469                 fshi->fshi_ref++;
 470                 can_hold = 1;
 471         }
 472         mutex_exit(&fshi->fshi_lock);
 473 
 474         return (can_hold);
 475 }
 476 
 477 /*
 478  * This function must not be called while fshfsr_lock is held. Doing so could
 479  * cause a deadlock.
 480  */
 481 static void
 482 fshi_rele(fsh_int_t *fshi)
 483 {
 484         int destroy;
 485 
 486         mutex_enter(&fshi->fshi_lock);
 487         ASSERT(fshi->fshi_ref > 0);
 488         fshi->fshi_ref--;
 489         if (fshi->fshi_ref == 0) {
 490                 ASSERT(fshi->fshi_doomed == 1);
 491                 destroy = 1;
 492         } else {
 493                 destroy = 0;
 494         }
 495         mutex_exit(&fshi->fshi_lock);
 496 
 497         if (destroy) {
 498                 /*
 499                  * At this point, we are sure that fsh_hook_remove() has been
 500                  * called, that's why we don't remove the fshi from fsh_map.
 501                  * fsh_hook_remove() did that already.
 502                  */
 503                 fsh_fsrecord_t *fsrecp;
 504 
 505                 if (fshi->fshi_hooks.remove_cb != NULL)
 506                         (*fshi->fshi_hooks.remove_cb)(
 507                             fshi->fshi_hooks.arg, fshi->fshi_handle);
 508                 /*
 509                  * We don't have to call fsh_prepare_fsrec() here.
 510                  * fsh_fsrecord_t is already initialised, because we've found a
 511                  * mapping for the given handle.
 512                  */
 513                 fsrecp = fshi->fshi_vfsp->vfs_fshrecord;
 514                 ASSERT(fsrecp != NULL);
 515                 ASSERT(fsrecp != fsh_res_ptr);
 516 
 517                 rw_enter(&fsrecp->fshfsr_lock, RW_WRITER);
 518                 list_remove(&fsrecp->fshfsr_list, fshi);
 519                 rw_exit(&fsrecp->fshfsr_lock);
 520 
 521                 id_free(fsh_idspace, fshi->fshi_handle);
 522                 mutex_destroy(&fshi->fshi_lock);
 523                 kmem_free(fshi, sizeof (*fshi));
 524         }
 525 }
 526 
 527 /*
 528  * Used for removing a hook set.
 529  *
 530  * fsh_hook_remove() invalidates the given handle.
 531  *
 532  * It is guaranteed, that after successful return from fsh_hook_remove(),
 533  * calls to vnodeops/vfsops, on the vfs_t on which the hook is installed, won't
 534  * go through this hook.
 535  *
 536  * There is no guarantee that after fsh_hook_remove() returns, the hook
 537  * associated with the handle won't be executing. Instead, it is guaranteed that
 538  * when remove_cb() is called, the hook finished its execution in all threads.
 539  * It is safe to destroy all internal data associated with this hook inside
 540  * remove_cb().
 541  *
 542  * It is possible that remove_cb() would be called before fsh_hook_remove()
 543  * returns.
 544  *
 545  * Returns (-1) if hook wasn't found, 0 otherwise.
 546  */
 547 int
 548 fsh_hook_remove(fsh_handle_t handle)
 549 {
 550         fsh_int_t       *fshi;
 551 
 552         mutex_enter(&fsh_lock);
 553         for (fshi = list_head(&fsh_map); fshi != NULL;
 554             fshi = list_next(&fsh_map, fshi)) {
 555                 if (fshi->fshi_handle == handle) {
 556                         list_remove(&fsh_map, fshi);
 557                         break;
 558                 }
 559         }
 560 
 561         if (fshi == NULL)
 562                 return (-1);
 563 
 564         mutex_enter(&fshi->fshi_lock);
 565         ASSERT(fshi->fshi_doomed == 0);
 566         fshi->fshi_doomed = 1;
 567         mutex_exit(&fshi->fshi_lock);
 568         mutex_exit(&fsh_lock);
 569 
 570         fshi_rele(fshi);
 571 
 572         return (0);
 573 }
 574 
 575 /*
 576  * API for installing global mount/free callbacks.
 577  *
 578  * fsh_callback_t fields:
 579  * fshc_arg - argument passed to the callbacks
 580  * fshc_free - callback fired before VFS_FREEVFS() is called, after vfs_count
 581  *      drops to 0
 582  * fshc_mount - callback fired right before returning from domount()
 583  * The first argument of these callbacks is the vfs_t that is mounted/freed.
 584  * The second one is the fshc_arg.
 585  *
 586  * fsh_callback_handle_t is filled out by this function.
 587  *
 588  * This function must NOT be called in a callback, because it will cause
 589  * a deadlock.
 590  *
 591  * Returns (-1) if hook/callback limit exceeded.
 592  */
 593 fsh_callback_handle_t
 594 fsh_callback_install(fsh_callback_t *callback)
 595 {
 596         fsh_callback_int_t *fshci;
 597         fsh_callback_handle_t handle;
 598 
 599         if ((handle = id_alloc(fsh_idspace)) == -1)
 600                 return (-1);
 601 
 602         fshci = (fsh_callback_int_t *)kmem_alloc(sizeof (*fshci), KM_SLEEP);
 603         (void) memcpy(&fshci->fshci_cb, callback, sizeof (fshci->fshci_cb));
 604         fshci->fshci_handle = handle;
 605 
 606         /* If it is called in a {mount,free} callback, causes deadlock. */
 607         rw_enter(&fsh_cblist_lock, RW_WRITER);
 608         list_insert_head(&fsh_cblist, fshci);
 609         rw_exit(&fsh_cblist_lock);
 610 
 611         return (handle);
 612 }
 613 
 614 /*
 615  * API for removing global mount/free callbacks.
 616  *
 617  * This function must NOT be called in a callback, because it will cause
 618  * a deadlock.
 619  *
 620  * Returns (-1) if callback wasn't found, 0 otherwise.
 621  */
 622 int
 623 fsh_callback_remove(fsh_callback_handle_t handle)
 624 {
 625         fsh_callback_int_t *fshci;
 626 
 627         /* If it is called in a {mount,free} callback, causes deadlock. */
 628         rw_enter(&fsh_cblist_lock, RW_WRITER);
 629         for (fshci = list_head(&fsh_cblist); fshci != NULL;
 630             fshci = list_next(&fsh_cblist, fshci)) {
 631                 if (fshci->fshci_handle == handle) {
 632                         list_remove(&fsh_cblist, fshci);
 633                         break;
 634                 }
 635         }
 636         rw_exit(&fsh_cblist_lock);
 637 
 638         if (fshci == NULL)
 639                 return (-1);
 640 
 641         kmem_free(fshci, sizeof (*fshci));
 642         id_free(fsh_idspace, handle);
 643 
 644         return (0);
 645 }
 646 
 647 /*
 648  * This function is executed right before returning from domount()@vfs.c.
 649  * We are sure that it's called only after fsh_init().
 650  * It executes all the mount callbacks installed in the fsh.
 651  *
 652  * Since fsh_exec_mount_callbacks() is called only inside domount(), it is legal
 653  * to call fsh_hook_{install,remove}() inside a mount callback WITHOUT holding
 654  * this vfs_t. This guarantee should be preserved, because it's in the "Usage"
 655  * section in the big theory statement at the top of this file.
 656  */
 657 void
 658 fsh_exec_mount_callbacks(vfs_t *vfsp)
 659 {
 660         fsh_callback_int_t *fshci;
 661         fsh_callback_t *cb;
 662 
 663         rw_enter(&fsh_cblist_lock, RW_READER);
 664         for (fshci = list_head(&fsh_cblist); fshci != NULL;
 665             fshci = list_next(&fsh_cblist, fshci)) {
 666                 cb = &fshci->fshci_cb;
 667                 if (cb->fshc_mount != NULL)
 668                         (*(cb->fshc_mount))(vfsp, cb->fshc_arg);
 669         }
 670         rw_exit(&fsh_cblist_lock);
 671 }
 672 
 673 /*
 674  * This function is executed right before VFS_FREEVFS() is called in
 675  * vfs_rele()@vfs.c. We are sure that it's called only after fsh_init().
 676  * It executes all the free callbacks installed in the fsh.
 677  *
 678  * The free callback marks the point after which the handles associated with
 679  * the hooks installed on this vfs_t become invalid.
 680  */
 681 void
 682 fsh_exec_free_callbacks(vfs_t *vfsp)
 683 {
 684         fsh_callback_int_t *fshci;
 685         fsh_callback_t *cb;
 686 
 687         rw_enter(&fsh_cblist_lock, RW_READER);
 688         for (fshci = list_head(&fsh_cblist); fshci != NULL;
 689             fshci = list_next(&fsh_cblist, fshci)) {
 690                 cb = &fshci->fshci_cb;
 691                 if (cb->fshc_free != NULL)
 692                         (*(cb->fshc_free))(vfsp, cb->fshc_arg);
 693         }
 694         rw_exit(&fsh_cblist_lock);
 695 }
 696 
 697 /*
 698  * API for vnode.c/vfs.c to start executing the fsh for a given operation.
 699  *
 700  * fsh_xxx() tries to find the first non-NULL xxx hook on the fshfsr_list. If it
 701  * does, it executes it. If not, underlying vnodeop/vfsop is called.
 702  *
 703  * These interfaces are using fsh_res_ptr (in fsh_prepare_fsrec()), so it's
 704  * absolutely necessary to call fsh_init() before using them. That's done in
 705  * vfsinit().
 706  *
 707  * While these functions are executing, it's expected that necessary vfs_t's
 708  * are held so that vfs_free() isn't called. vfs_free() expects that no one
 709  * accesses vfs_fshrecord of a given vfs_t.
 710  * It's also the caller's responsibility to keep vnode_t passed to fsh_foo()
 711  * alive and valid.
 712  * All these expectations are met because these functions are used only in
 713  * corresponding {fop,fsop}_foo() functions.
 714  */
 715 int
 716 fsh_read(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr,
 717         caller_context_t *ct)
 718 {
 719         int ret;
 720         fsh_fsrecord_t *fsrecp;
 721         fsh_int_t *fshi;
 722 
 723         fsh_prepare_fsrec(vp->v_vfsp);
 724         fsrecp = vp->v_vfsp->vfs_fshrecord;
 725 
 726         rw_enter(&fsrecp->fshfsr_lock, RW_READER);
 727         if (!(fsrecp->fshfsr_enabled)) {
 728                 rw_exit(&fsrecp->fshfsr_lock);
 729                 return ((*(vp->v_op->vop_read))(vp, uiop, ioflag, cr, ct));
 730         }
 731 
 732         for (fshi = list_head(&fsrecp->fshfsr_list); fshi != NULL;
 733             fshi = list_next(&fsrecp->fshfsr_list, fshi)) {
 734                 if (fshi->fshi_hooks.read != NULL)
 735                         if (fshi_hold(fshi))
 736                                 break;
 737         }
 738         rw_exit(&fsrecp->fshfsr_lock);
 739 
 740         if (fshi == NULL)
 741                 return ((*(vp->v_op->vop_read))(vp, uiop, ioflag, cr, ct));
 742 
 743         ret = (*fshi->fshi_hooks.read)(fshi, fshi->fshi_hooks.arg,
 744             vp, uiop, ioflag, cr, ct);
 745         fshi_rele(fshi);
 746         return (ret);
 747 }
 748 
 749 int
 750 fsh_write(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr,
 751         caller_context_t *ct)
 752 {
 753         fsh_int_t *fshi;
 754         int ret;
 755         fsh_fsrecord_t *fsrecp;
 756 
 757         fsh_prepare_fsrec(vp->v_vfsp);
 758         fsrecp = vp->v_vfsp->vfs_fshrecord;
 759 
 760         rw_enter(&fsrecp->fshfsr_lock, RW_READER);
 761         if (!(vp->v_vfsp->vfs_fshrecord->fshfsr_enabled)) {
 762                 rw_exit(&fsrecp->fshfsr_lock);
 763                 return ((*(vp->v_op->vop_write))(vp, uiop, ioflag, cr, ct));
 764         }
 765 
 766         for (fshi = list_head(&fsrecp->fshfsr_list); fshi != NULL;
 767             fshi = list_next(&fsrecp->fshfsr_list, fshi)) {
 768                 if (fshi->fshi_hooks.write != NULL)
 769                         if (fshi_hold(fshi))
 770                                 break;
 771         }
 772         rw_exit(&fsrecp->fshfsr_lock);
 773 
 774         if (fshi == NULL)
 775                 return ((*(vp->v_op->vop_write))(vp, uiop, ioflag, cr, ct));
 776 
 777         ret = (*fshi->fshi_hooks.write)(fshi, fshi->fshi_hooks.arg,
 778             vp, uiop, ioflag, cr, ct);
 779         fshi_rele(fshi);
 780         return (ret);
 781 }
 782 
 783 int
 784 fsh_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
 785 {
 786         fsh_fsrecord_t *fsrecp;
 787         fsh_int_t *fshi;
 788         int ret;
 789 
 790         fsh_prepare_fsrec(vfsp);
 791         fsrecp = vfsp->vfs_fshrecord;
 792 
 793         rw_enter(&fsrecp->fshfsr_lock, RW_READER);
 794         if (!(fsrecp->fshfsr_enabled)) {
 795                 rw_exit(&fsrecp->fshfsr_lock);
 796                 return ((*(vfsp->vfs_op->vfs_mount))(vfsp, mvp, uap, cr));
 797         }
 798 
 799         for (fshi = list_head(&fsrecp->fshfsr_list); fshi != NULL;
 800             fshi = list_next(&fsrecp->fshfsr_list, fshi)) {
 801                 if (fshi->fshi_hooks.mount != NULL)
 802                         if (fshi_hold(fshi))
 803                                 break;
 804         }
 805         rw_exit(&fsrecp->fshfsr_lock);
 806 
 807         if (fshi == NULL)
 808                 return ((*(vfsp->vfs_op->vfs_mount))(vfsp, mvp, uap, cr));
 809 
 810         ret = (*fshi->fshi_hooks.mount)(fshi, fshi->fshi_hooks.arg,
 811             vfsp, mvp, uap, cr);
 812         fshi_rele(fshi);
 813         return (ret);
 814 }
 815 
 816 int
 817 fsh_unmount(vfs_t *vfsp, int flag, cred_t *cr)
 818 {
 819         fsh_fsrecord_t *fsrecp;
 820         fsh_int_t *fshi;
 821         int ret;
 822 
 823         fsh_prepare_fsrec(vfsp);
 824         fsrecp = vfsp->vfs_fshrecord;
 825 
 826         rw_enter(&fsrecp->fshfsr_lock, RW_READER);
 827         if (!(fsrecp->fshfsr_enabled)) {
 828                 rw_exit(&fsrecp->fshfsr_lock);
 829                 return ((*(vfsp->vfs_op->vfs_unmount))(vfsp, flag, cr));
 830         }
 831 
 832         for (fshi = list_head(&fsrecp->fshfsr_list); fshi != NULL;
 833             fshi = list_next(&fsrecp->fshfsr_list, fshi)) {
 834                 if (fshi->fshi_hooks.unmount != NULL)
 835                         if (fshi_hold(fshi))
 836                                 break;
 837         }
 838         rw_exit(&fsrecp->fshfsr_lock);
 839 
 840         if (fshi == NULL)
 841                 return ((*(vfsp->vfs_op->vfs_unmount))(vfsp, flag, cr));
 842 
 843         ret = (*fshi->fshi_hooks.unmount)(fshi, fshi->fshi_hooks.arg,
 844             vfsp, flag, cr);
 845         fshi_rele(fshi);
 846         return (ret);
 847 }
 848 
 849 /*
 850  * This is the funtion used by fsh_prepare_fsrec() to allocate a new
 851  * fsh_fsrecord. This function is called by the first function which
 852  * access the vfs_fshrecord and finds out it's NULL.
 853  */
 854 static fsh_fsrecord_t *
 855 fsh_fsrec_create()
 856 {
 857         fsh_fsrecord_t *fsrecp;
 858 
 859         fsrecp = (fsh_fsrecord_t *)kmem_zalloc(sizeof (*fsrecp), KM_SLEEP);
 860         list_create(&fsrecp->fshfsr_list, sizeof (fsh_int_t),
 861             offsetof(fsh_int_t, fshi_next));
 862         rw_init(&fsrecp->fshfsr_lock, NULL, RW_DRIVER, NULL);
 863         fsrecp->fshfsr_enabled = 1;
 864         return (fsrecp);
 865 }
 866 
 867 
 868 /*
 869  * This call can be used ONLY in vfs_free(). It's assumed that no other
 870  * fsh calls using the vfs_t that owns the fsh_fsrecord to be destroyed
 871  * are executing while a call to fsh_fsrec_destroy() is made. With this
 872  * assumptions, no concurrency issues occur.
 873  *
 874  * Before calling this function outside the fsh, it's sufficient and
 875  * required to check if the passed fsh_fsrecord * is not NULL. We don't
 876  * have to check if it is not equal to fsh_res_ptr, because all the fsh API
 877  * calls involving this vfs_t should end before vfs_free() is called
 878  * (outside the fsh, fsh_fsrecord is never equal to fsh_res_ptr). That is
 879  * guaranteed by the explicit requirement that the caller of fsh API holds
 880  * the vfs_t when needed.
 881  *
 882  * All the remaining hooks are being removed.
 883  */
 884 void
 885 fsh_fsrec_destroy(struct fsh_fsrecord *volatile fsrecp)
 886 {
 887         fsh_int_t *fshi;
 888 
 889         VERIFY(fsrecp != NULL);
 890 
 891         _NOTE(CONSTCOND)
 892         while (1) {
 893                 mutex_enter(&fsh_lock);
 894                 /* No need here to hold fshfsr_lock */
 895                 fshi = list_remove_head(&fsrecp->fshfsr_list);
 896                 if (fshi == NULL) {
 897                         mutex_exit(&fsh_lock);
 898                         break;
 899                 }
 900                 ASSERT(fshi->fshi_doomed == 0);
 901                 list_remove(&fsh_map, fshi);
 902                 mutex_exit(&fsh_lock);
 903 
 904                 if (fshi->fshi_hooks.remove_cb != NULL)
 905                         (*fshi->fshi_hooks.remove_cb)(fshi->fshi_hooks.arg,
 906                             fshi->fshi_handle);
 907                 id_free(fsh_idspace, fshi->fshi_handle);
 908                 mutex_destroy(&fshi->fshi_lock);
 909                 kmem_free(fshi, sizeof (*fshi));
 910 
 911         }
 912 
 913         list_destroy(&fsrecp->fshfsr_list);
 914         rw_destroy(&fsrecp->fshfsr_lock);
 915         kmem_free(fsrecp, sizeof (*fsrecp));
 916 }
 917 
 918 /*
 919  * fsh_init() is called in vfsinit()@vfs.c. This function MUST be called
 920  * before every other fsh call.
 921  */
 922 void
 923 fsh_init(void)
 924 {
 925         rw_init(&fsh_cblist_lock, NULL, RW_DRIVER, NULL);
 926         list_create(&fsh_cblist, sizeof (fsh_callback_int_t),
 927             offsetof(fsh_callback_int_t, fshci_next));
 928 
 929         mutex_init(&fsh_lock, NULL, MUTEX_DRIVER, NULL);
 930 
 931         list_create(&fsh_map, sizeof (fsh_int_t), offsetof(fsh_int_t,
 932             fshi_global));
 933 
 934         /* See comment above fsh_prepare_fsrec() */
 935         fsh_res_ptr = (void *)-1;
 936 
 937         fsh_idspace = id_space_create("fsh", 0, fsh_limit);
 938 }
 939 
 940 /*
 941  * These functions are used to pass control to the next hook or underlying
 * vop or vfsop. The client doesn't have to worry about any locking.
 943  */
 944 int
 945 fsh_next_read(fsh_int_t *fshi, vnode_t *vp, uio_t *uiop, int ioflag,
 946         cred_t *cr, caller_context_t *ct)
 947 {
 948         int ret;
 949         fsh_fsrecord_t *fsrecp = vp->v_vfsp->vfs_fshrecord;
 950 
 951         /*
 952          * The passed fshi is the previous hook (the one from which we've been
 953          * called). We need to find the next one.
 954          */
 955         rw_enter(&fsrecp->fshfsr_lock, RW_READER);
 956         for (fshi = list_next(&fsrecp->fshfsr_list, fshi); fshi != NULL;
 957             fshi = list_next(&fsrecp->fshfsr_list, fshi)) {
 958                 if (fshi->fshi_hooks.read != NULL)
 959                         if (fshi_hold(fshi))
 960                                 break;
 961         }
 962         rw_exit(&fsrecp->fshfsr_lock);
 963 
 964         if (fshi == NULL)
 965                 return ((*vp->v_op->vop_read)(vp, uiop, ioflag, cr, ct));
 966 
 967         ret = (*fshi->fshi_hooks.read)(fshi, fshi->fshi_hooks.arg,
 968             vp, uiop, ioflag, cr, ct);
 969         fshi_rele(fshi);
 970         return (ret);
 971 }
 972 
 973 int
 974 fsh_next_write(fsh_int_t *fshi, vnode_t *vp, uio_t *uiop, int ioflag,
 975         cred_t *cr, caller_context_t *ct)
 976 {
 977         fsh_fsrecord_t *fsrecp = vp->v_vfsp->vfs_fshrecord;
 978         int ret;
 979 
 980         /*
 981          * The passed fshi is the previous hook (the one from which we've been
 982          * called). We need to find the next one.
 983          */
 984         rw_enter(&fsrecp->fshfsr_lock, RW_READER);
 985         for (fshi = list_next(&fsrecp->fshfsr_list, fshi); fshi != NULL;
 986             fshi = list_next(&fsrecp->fshfsr_list, fshi)) {
 987                 if (fshi->fshi_hooks.write != NULL)
 988                         if (fshi_hold(fshi))
 989                                 break;
 990         }
 991         rw_exit(&fsrecp->fshfsr_lock);
 992 
 993         if (fshi == NULL)
 994                 return ((*vp->v_op->vop_write)(vp, uiop, ioflag, cr, ct));
 995 
 996         ret = (*fshi->fshi_hooks.write)(fshi, fshi->fshi_hooks.arg,
 997             vp, uiop, ioflag, cr, ct);
 998         fshi_rele(fshi);
 999         return (ret);
1000 }
1001 
1002 int
1003 fsh_next_mount(fsh_int_t *fshi, vfs_t *vfsp, vnode_t *mvp, struct mounta *uap,
1004         cred_t *cr)
1005 {
1006         fsh_fsrecord_t *fsrecp = vfsp->vfs_fshrecord;
1007         int ret;
1008 
1009         /*
1010          * The passed fshi is the previous hook (the one from which we've been
1011          * called). We need to find the next one.
1012          */
1013         rw_enter(&fsrecp->fshfsr_lock, RW_READER);
1014         for (fshi = list_next(&fsrecp->fshfsr_list, fshi); fshi != NULL;
1015             fshi = list_next(&fsrecp->fshfsr_list, fshi)) {
1016                 if (fshi->fshi_hooks.mount != NULL)
1017                         if (fshi_hold(fshi))
1018                                 break;
1019         }
1020         rw_exit(&fsrecp->fshfsr_lock);
1021 
1022         if (fshi == NULL)
1023                 return ((*(vfsp->vfs_op->vfs_mount))(vfsp, mvp, uap, cr));
1024 
1025         ret = (*fshi->fshi_hooks.mount)(fshi, fshi->fshi_hooks.arg,
1026             vfsp, mvp, uap, cr);
1027         fshi_rele(fshi);
1028         return (ret);
1029 }
1030 
1031 int
1032 fsh_next_unmount(fsh_int_t *fshi, vfs_t *vfsp, int flag, cred_t *cr)
1033 {
1034         fsh_fsrecord_t *fsrecp = vfsp->vfs_fshrecord;
1035         int ret;
1036 
1037         /*
1038          * The passed fshi is the previous hook (the one from which we've been
1039          * called). We need to find the next one.
1040          */
1041         rw_enter(&fsrecp->fshfsr_lock, RW_READER);
1042         for (fshi = list_next(&fsrecp->fshfsr_list, fshi); fshi != NULL;
1043             fshi = list_next(&fsrecp->fshfsr_list, fshi)) {
1044                 if (fshi->fshi_hooks.unmount != NULL)
1045                         if (fshi_hold(fshi))
1046                                 break;
1047         }
1048         rw_exit(&fsrecp->fshfsr_lock);
1049 
1050         if (fshi == NULL)
1051                 return ((*vfsp->vfs_op->vfs_unmount)(vfsp, flag, cr));
1052 
1053         ret = (*fshi->fshi_hooks.unmount)(fshi, fshi->fshi_hooks.arg,
1054             vfsp, flag, cr);
1055         fshi_rele(fshi);
1056         return (ret);
1057 }