1 /*
   2  * This file and its contents are supplied under the terms of the
   3  * Common Development and Distribution License ("CDDL"), version 1.0.
   4  * You may only use this file in accordance with the terms of version
   5  * 1.0 of the CDDL.
   6  *
   7  * A full copy of the text of the CDDL should have accompanied this
   8  * source.  A copy of the CDDL is also available via the Internet at
   9  * http://www.illumos.org/license/CDDL.
  10  */
  11 
  12 /*
  13  * Copyright 2013 Damian Bogel.  All rights reserved.
  14  */
  15 
  16 #include <sys/debug.h>
  17 #include <sys/errno.h>
  18 #include <sys/fsh.h>
  19 #include <sys/fsh_impl.h>
  20 #include <sys/id_space.h>
  21 #include <sys/kmem.h>
  22 #include <sys/ksynch.h>
  23 #include <sys/list.h>
  24 #include <sys/sunddi.h>
  25 #include <sys/sysmacros.h>
  26 #include <sys/types.h>
  27 #include <sys/vfs.h>
  28 #include <sys/vnode.h>
  29 
  30 /*
  31  * Filesystem hook framework (fsh)
  32  *
  33  * 1. Abstract.
  34  * The main goal of the filesystem hook framework is to provide an easy way to
  35  * inject client-defined behaviour into vfs/vnode calls. fsh works on
  36  * vfs_t granularity.
  37  *
  38  *
  39  * 2. Overview.
  40  * fsh_t is the main object in the fsh. An fsh_t is a structure containing:
  41  *      - pointers to hooking functions (named after corresponding
  42  *      vnodeops/vfsops)
  43  *      - a pointer to an argument to pass (this is shared for all the
  44  *      hooks in a given fsh_t)
  45  *      - a pointer to the *hook remove callback* - it's being fired after a
  46  *      hook is removed and the hook has stopped executing. It's safe to destroy
  47  *      any data associated with this hook.
  48  *
  49  * The information from fsh_t is copied by the fsh and an fsh_handle_t
  50  * is returned. It should be used for further removing.
  51  *
  52  *
  53  * 3. Usage.
  54  * It is expected that vfs_t/vnode_t that are passed to fsh_foo() functions
  55  * are held by the caller when needed. fsh does no vfs_t/vnode_t locking.
  56  *
  57  * fsh_t is a structure filled out by the client. If a client does not want
  58  * to add/remove a hook for function foo(), he should fill the foo field of
  59  * fsh_t with NULL. Every hook has a type of corresponding vfsop/vnodeop with
  60  * two additional arguments:
  61  *      - fsh_int_t *fsh_int - this argument MUST be passed to
  62  *      hook_next_foo(). fsh wouldn't know which hook to execute next
  63  *      without it
  64  *      - void *arg - this is the argument passed with fsh_t during
  65  *      installation
  66  *      - void (*remove_cb)(void *, fsh_handle_t) - hook remove callback
 *      (mentioned earlier); its first argument is arg, the second is the
  68  *      handle
  69  *
  70  * After installation, an fsh_handle_t is returned to the caller.
  71  *
  72  * Every hook function is responsible for passing the control to the next
  73  * hook associated with a particular call. In order to provide an easy way to
  74  * modify the behaviour of a function call both before and after the
  75  * underlying vfsop/vnodeop (or next hook) execution, a hook has to call
  76  * fsh_next_foo() at some point. This function does necessary internal
  77  * operations and calls the next hook, until there's no hook left, then it
  78  * calls the underlying vfsop/vnodeop.
  79  * Example:
  80  * my_freefs(fsh_int_t *fsh_int, void *arg, vfs_t *vfsp) {
  81  *      cmn_err(CE_NOTE, "freefs called!\n");
  82  *      return (fsh_next_freefs(fsh_int, vfsp));
  83  * }
  84  *
  85  *
  86  * A client might want to fire callbacks when vfs_t's are being mounted
  87  * or freed. There's an fsh_callback_t structure provided to install such
  88  * callbacks along with the API.
  89  * It is legal to call fsh_hook_{install,remove}() inside a mount callback
  90  * WITHOUT holding the vfs_t.
  91  *
  92  * After vfs_t's free callback returns, all the handles associated with the
  93  * hooks installed on this vfs_t are invalid and must not be used.
  94  *
  95  *
  96  * 4. API
  97  * None of the APIs should be called during interrupt context above lock
  98  * level. The only exceptions are fsh_next_foo() functions, which do not use
  99  * locks.
 100  *
 101  * a) fsh.h
 102  * Any of these functions could be called inside a hook or a hook remove
 103  * callback.
 104  * fsh_callback_{install,remove}() must not be called inside a {mount,free}
 105  * callback. Doing so will cause a deadlock. Other functions can be called
 106  * inside {mount,free} callbacks.
 107  *
 108  * fsh_fs_enable(vfs_t *vfsp)
 109  * fsh_fs_disable(vfs_t *vfsp)
 110  *      Enables/disables fsh for a given vfs_t.
 111  *
 112  * fsh_hook_install(vfs_t *vfsp, fsh_t *hooks)
 113  *      Installs hooks on vfsp filesystem.
 114  *      It's important that hooks are executed in LIFO installation order,
 115  *      which means that if there are hooks A and B installed in this order, B
 116  *      is going to be executed before A.
 117  *      It returns a correct handle, or (-1) if hook/callback limit exceeded.
 118  *      The handle is valid until a free callback returns or an explicit call
 119  *      to fsh_hook_remove().
 120  *
 121  * fsh_hook_remove(fsh_handle_t handle)
 122  *      Removes a hook and invalidates the handle.
 *      It is guaranteed that after this function returns, calls to
 124  *      vnodeops/vfsops won't go through this hook, although there might be
 125  *      some threads still executing this hook. When hook remove callback is
 126  *      fired, it is guaranteed that the hook won't be executed anymore. It is
 127  *      safe to remove all the internal data associated with this hook inside
 128  *      the hook remove callback.
 129  *
 130  * fsh_next_foo(fsh_int_t *fsh_int, void *arg, ARGUMENTS)
 131  *      This is the function which should be called once in every hook. It
 132  *      does the necessary internal operations and passes control to the
 133  *      next hook or, if there's no hook left, to the underlying
 134  *      vfsop/vnodeop.
 135  *
 136  * fsh_callback_install(fsh_callback_t *callback)
 137  * fsh_callback_remove(fsh_callback_handle_t handle)
 138  *      Installs/removes callbacks for vfs_t mount/free. The mount callback
 139  *      is executed right before domount() returns. The free callback is
 140  *      called right before VFS_FREEVFS() is called.
 141  *      The fsh_callback_install() returns a correct handle, or (-1) if
 142  *      hook/callback limit exceeded.
 143  *
 144  * b) fsh_impl.h (for vfs.c and vnode.c only)
 145  * fsh_init()
 146  *      This call has to be done in vfsinit(). It initialises the fsh. It
 147  *      is absolutely necessary that this call is made before any other fsh
 148  *      operation.
 149  *
 150  * fsh_exec_mount_callbacks(vfs_t *vfsp)
 151  * fsh_exec_free_callbacks(vfs_t *vfsp)
 152  *      Used to execute all fsh callbacks for {mount,free} of a vfs_t.
 153  *
 154  * fsh_fsrec_destroy(struct fsh_fsrecord *fsrecp)
 155  *      Destroys an fsh_fsrecord structure. All the hooks installed on this
 156  *      vfs_t are then destroyed. free callback is called before this function.
 157  *
 158  * fsh_foo(ARGUMENTS)
 159  *      Function used to start executing the hook chain for a given call.
 160  *
 161  *
 162  * 5. Internals.
 163  * fsh_int_t is an internal hook structure. It is reference counted.
 164  * fshi_hold() and fshi_rele() should be used whenever needed.
 165  * fsh_int_t entries are elements of both fsh_map (global) and fshfsr_list
 166  * (local to vfs_t). All entries are unique and are identified by fshi_handle.
 167  *
 168  * fsh_int_t properties:
 169  *      - fsh_hook_install() sets the ref. counter to 1 and adds it to both
 170  *      fsh_map and fshfsr_list
 171  *      - fsh_hook_remove() decreases the ref. counter by 1, removes the hook
 172  *      from fsh_map and marks the hook as *doomed*
 173  *      - if fsh_int_t is on the fshfsr_list, it's alive and there is a thread
 174  *      executing it
 *      - if fsh_int_t is marked as *doomed*, the reference counter cannot
 176  *      be increased and thus no thread can acquire this fsh_int_t
 177  *      - ref. counter can drop to 0 only after an fsh_hook_remove() call; this
 178  *      also means that the fsh_int_t is *doomed* and isn't a part of fsh_map
 179  *      - fsh_int_t could be also destroyed without fsh_hook_remove() call,
 180  *      that happens only inside fsh_fsrec_destroy() where it is guaranteed
 181  *      that there is no thread executing the hook
 182  *
 183  *
 184  * fsh_fsrecord_t is a structure which lives inside a vfs_t.
 185  * fsh_fsrecord_t contains:
 186  *      - an rw-lock that protects the structure
 187  *      - a list of hooks installed on this vfs_t
 188  *      - a flag which tells whether fsh is enabled on this vfs_t
 189  *
 190  *
 191  * fsh_prepare_fsrec rule:
 192  * Every function that needs vfsp->vfs_fshrecord has to call
 193  * fsh_prepare_fsrec() first. If and only if the call is made, it is safe to
 194  * use vfsp->vfs_fshrecord.
 195  *
 196  * Unfortunately, because of unexpected behaviour of some filesystems (no use
 197  * of vfs_alloc()/vfs_init()) there's no good place to initialise the
 * fsh_fsrecord_t structure. The approach being used here is to check if it's
 199  * initialised in every call. Because of the fact that no lock could be used
 200  * here (the same problem with initialisation), a spinlock is used.  This is
 201  * explained in more detail in a comment before fsh_prepare_fsrec(). After
 * calling fsh_prepare_fsrec() it's completely safe to keep the vfs_fshrecord
 203  * pointer locally, because it won't be changed until vfs_free() is called.
 204  *
 205  * The only exception from the fsh_prepare_fsrec() rule is vfs_free(),
 * where it is expected that no other fsh calls would be made for the
 207  * vfs_t that's being freed. That's why vfs_fshrecord could be only NULL or a
 208  * valid pointer and could not be concurrently accessed.
 209  *
 210  * When there are no fsh functions (that use a particular fsh_fsrecord_t)
 211  * executing, the vfs_fshrecord pointer won't be equal to fsh_res_ptr. It
 212  * would be NULL or a pointer to an initialised fsh_fsrecord_t.
 213  *
 214  *
 215  * Callbacks:
 216  * Mount callbacks are executed by a call to fsh_exec_mount_callbacks() right
 217  * before returning from domount()@vfs.c.
 218  *
 219  * Free callbacks are executed by a call to fsh_exec_free_callbacks() right
 220  * before calling VFS_FREEVFS(), after vfs_t's reference count drops to 0.
 221  *
 222  *
 223  * fsh_next_foo(fsh_int_t *fshi, ARGUMENTS)
 224  *      This function is quite simple. It takes the fsh_int_t and passes control
 225  *      to the next hook or to the underlying vnodeop/vfsop.
 226  *
 227  *
 228  * 6. Locking
 229  * a) public
 230  * fsh does no vfs_t nor vnode_t locking. It is expected that whenever it is
 231  * needed, the client does that.
 232  *
 233  * fsh_callback_{install,remove} must not be called inside a callback, because
 234  * it will cause a deadlock.
 235  *
 236  * b) internal
 237  * Locking diagram:
 238  *
 239  *     fsh_hook_install()    fsh_hook_remove()   fsh_fsrec_destroy()
 240  *           |                     |                |
 241  *           |                     |                |
 242  *           +------------------+  |   +------------+
 243  *                              |  |   |
 244  *                              V  V   V
 245  *                              fsh_lock
 246  *                                 |   |
 247  *                                 |   +----- fshfsr_lock, RW_WRITER ---+
 248  *                                 |                                    |
 249  *                                 V                                    |
 250  *               +---------------------------------------+              |
 251  *               |               fsh_map                 |              |
 252  *               |                                       |              |
 253  *          +----|-> vfsp->vfs_fshrecord->fshfsr_list <--|--------------+
 254  *          |    +------------------------------^--------+
 255  *          |                                   |
 256  *          |                                   |
 257  * fshfsr_lock, RW_READER              fshfsr_lock, RW_WRITER
 258  *          |                                   |
 259  *          |                                   |
 260  *   fsh_read(),                            fshi_rele()
 261  *   fsh_write(),
 262  *   ...,                               Might be called from:
 263  *   fsh_next_read(),                    fsh_hook_remove()
 264  *   fsh_next_write(),                   fsh_read(), fsh_write(), ...
 265  *   ...                                 fsh_next_read(), fsh_next_write(), ...
 266  *
 * fsh_lock is a global lock for the administrative path (fsh_hook_install,
 268  * fsh_hook_remove) and fsh_fsrec_destroy() (which is semi-administrative, since
 269  * it destroys the unremoved hooks). It is used only when fsh_map needs to be
 270  * locked. The usage of this lock guarantees that the data in fsh_map and
 271  * fshfsr_lists is consistent.
 272  */
 273 
 274 
 275 /* Internals */
/*
 * Internal representation of one installed hook set (referred to in this
 * file as fsh_int_t).
 *
 * An entry is reference counted through fshi_hold()/fshi_rele(); fshi_ref
 * and fshi_doomed are read and written with fshi_lock held.
 */
struct fsh_int {
        fsh_handle_t    fshi_handle;    /* id taken from fsh_idspace */
        fsh_t           fshi_hooks;     /* private copy of client's fsh_t */
        vfs_t           *fshi_vfsp;     /* vfs_t the hooks are installed on */

        kmutex_t        fshi_lock;      /* guards fshi_ref and fshi_doomed */
        uint64_t        fshi_ref;       /* starts at 1 in fsh_hook_install() */
        uint64_t        fshi_doomed;    /* changed inside fsh_lock */

        /* next node in fshfsr_list */
        list_node_t     fshi_next;

        /* next node in fsh_map */
        list_node_t     fshi_global;
};
 291 
/*
 * Internal record of one installed set of mount/free callbacks. Records are
 * kept on the global fsh_cblist.
 */
typedef struct fsh_callback_int {
        fsh_callback_t  fshci_cb;       /* private copy of client's callbacks */
        fsh_callback_handle_t fshci_handle;     /* id taken from fsh_idspace */
        list_node_t     fshci_next;     /* linkage on fsh_cblist */
} fsh_callback_int_t;
 297 
 298 
/*
 * Global mutex of the administrative path. In the code below it is taken by
 * fsh_hook_install() and fsh_hook_remove() around their updates of fsh_map.
 */
static kmutex_t fsh_lock;
 300 
/*
 * fsh_fsrecord_t is the main internal structure. Its content is protected
 * by fshfsr_lock. The fshfsr_list is a list of fsh_int_t hook entries for
 * the vfs_t that contains the fsh_fsrecord_t.
 */
struct fsh_fsrecord {
        krwlock_t       fshfsr_lock;    /* reader to walk, writer to modify */
        int             fshfsr_enabled; /* nonzero: hooks are consulted */
        list_t          fshfsr_list;    /* fsh_int_t entries, newest first */
};
 311 
/*
 * Global list of fsh_int_t. Protected by fsh_lock.
 *
 * fsh_hook_install() inserts an entry at the head; fsh_hook_remove() looks an
 * entry up by its handle and unlinks it.
 */
static list_t fsh_map;
 316 
/*
 * Global list of fsh_callback_int_t.
 *
 * fsh_cblist_lock is taken as writer by fsh_callback_{install,remove}() and
 * as reader by fsh_exec_{mount,free}_callbacks() while the callbacks run.
 */
static krwlock_t fsh_cblist_lock;
static list_t fsh_cblist;
 322 
/*
 * A reserved pointer for fsh purposes. It is used because of the method
 * chosen for solving concurrency issues with vfs_fshrecord. The full
 * explanation is in the big theory statement at the beginning of this
 * file and above fsh_prepare_fsrec(). It is initialised in fsh_init().
 *
 * fsh_prepare_fsrec() stores this value in vfs_fshrecord to mark a record
 * whose initialisation is still in progress.
 */
static void *fsh_res_ptr;
 330 
 331 static fsh_fsrecord_t *fsh_fsrec_create();
 332 
/*
 * Upper bound on the number of handles. Not referenced by the code visible
 * here; presumably consumed where fsh_idspace is created - TODO confirm.
 */
int fsh_limit = INT_MAX;
/*
 * Id space that supplies both hook handles (fsh_hook_install()) and callback
 * handles (fsh_callback_install()). Ids are given back with id_free().
 */
static id_space_t *fsh_idspace;
 335 
 336 /*
 337  * fsh_prepare_fsrec()
 338  *
 339  * Important note:
 340  * Before using this function, fsh_init() MUST be called. We do that in
 341  * vfsinit()@vfs.c.
 342  *
 343  * One would ask, why isn't the vfsp->vfs_fshrecord initialised when the
 344  * vfs_t is created. Unfortunately, some filesystems (e.g. fifofs) do not
 * call vfs_init() or even vfs_alloc(). It's possible that some unbundled
 346  * filesystems could do the same thing. That's why this solution is
 347  * introduced. It should be called before any code that needs access to
 348  * vfs_fshrecord.
 349  *
 350  * Locking:
 351  * There are no locks here, because there's no good place to initialise
 352  * the lock. Concurrency issues are solved by using atomic instructions
 353  * and a spinlock, which is spinning only once for a given vfs_t. Because
 354  * of that, the usage of the spinlock isn't bad at all.
 355  *
 356  * How it works:
 357  * a) if vfsp->vfs_fshrecord equals NULL, atomic_cas_ptr() changes it to
 358  *      fsh_res_ptr. That's a signal for other threads, that the structure
 359  *      is being initialised.
 360  * b) if vfsp->vfs_fshrecord equals fsh_res_ptr, that means we have to wait,
 361  *      because vfs_fshrecord is being initialised by another call.
 362  * c) other cases:
 363  *      vfs_fshrecord is already initialised, so we can use it. It won't change
 364  *      until vfs_free() is called. It can't happen when someone is holding
 365  *      the vfs_t, which is expected from the caller of fsh API.
 366  */
 367 static void
 368 fsh_prepare_fsrec(vfs_t *vfsp)
 369 {
 370         fsh_fsrecord_t *fsrec;
 371 
 372         while ((fsrec = atomic_cas_ptr(&vfsp->vfs_fshrecord, NULL,
 373             fsh_res_ptr)) == fsh_res_ptr)
 374                 ;
 375 
 376         if (fsrec == NULL)
 377                 atomic_swap_ptr(&vfsp->vfs_fshrecord, fsh_fsrec_create());
 378 }
 379 
 380 /*
 381  * API for enabling/disabling fsh per vfs_t.
 382  *
 383  * A newly created vfs_t has fsh enabled by default. If one would want to change
 384  * this behaviour, mount callbacks could be used.
 385  *
 386  * The caller is expected to hold the vfs_t.
 387  *
 388  * These functions must NOT be called in a hook.
 389  */
 390 void
 391 fsh_fs_enable(vfs_t *vfsp)
 392 {
 393         fsh_prepare_fsrec(vfsp);
 394 
 395         rw_enter(&vfsp->vfs_fshrecord->fshfsr_lock, RW_WRITER);
 396         vfsp->vfs_fshrecord->fshfsr_enabled = 1;
 397         rw_exit(&vfsp->vfs_fshrecord->fshfsr_lock);
 398 }
 399 
 400 void
 401 fsh_fs_disable(vfs_t *vfsp)
 402 {
 403         fsh_prepare_fsrec(vfsp);
 404 
 405         rw_enter(&vfsp->vfs_fshrecord->fshfsr_lock, RW_WRITER);
 406         vfsp->vfs_fshrecord->fshfsr_enabled = 0;
 407         rw_exit(&vfsp->vfs_fshrecord->fshfsr_lock);
 408 }
 409 
 410 /*
 411  * API used for installing hooks. fsh_handle_t is returned for further
 412  * actions (currently just removing) on this set of hooks.
 413  *
 414  * fsh_t fields:
 415  * - arg - argument passed to every hook
 416  * - remove_cb - remove callback, called after a hook is removed and all the
 *      threads stop executing it
 418  * - read, write, ... - pointers to hooks for corresponding vnodeops/vfsops;
 419  *      if there is no hook desired for an operation, it should be set to
 420  *      NULL
 421  *
 422  * It's important that the hooks are executed in LIFO installation order (they
 423  * are added to the head of the hook list).
 424  *
 425  * The caller is expected to hold the vfs_t.
 426  *
 427  * Returns (-1) if hook/callback limit exceeded, handle otherwise.
 428  */
 429 fsh_handle_t
 430 fsh_hook_install(vfs_t *vfsp, fsh_t *hooks)
 431 {
 432         fsh_handle_t    handle;
 433         fsh_int_t       *fshi;
 434 
 435         fsh_prepare_fsrec(vfsp);
 436 
 437         if ((handle = id_alloc(fsh_idspace)) == -1)
 438                 return (-1);
 439 
 440         fshi = kmem_alloc(sizeof (*fshi), KM_SLEEP);
 441         mutex_init(&fshi->fshi_lock, NULL, MUTEX_DRIVER, NULL);
 442         (void) memcpy(&fshi->fshi_hooks, hooks, sizeof (fshi->fshi_hooks));
 443         fshi->fshi_handle = handle;
 444         fshi->fshi_doomed = 0;
 445         fshi->fshi_ref = 1;
 446         fshi->fshi_vfsp = vfsp;
 447 
 448         mutex_enter(&fsh_lock);
 449         rw_enter(&vfsp->vfs_fshrecord->fshfsr_lock, RW_WRITER);
 450         list_insert_head(&vfsp->vfs_fshrecord->fshfsr_list, fshi);
 451         rw_exit(&vfsp->vfs_fshrecord->fshfsr_lock);
 452 
 453         list_insert_head(&fsh_map, fshi);
 454         mutex_exit(&fsh_lock);
 455 
 456         return (handle);
 457 }
 458 
 459 static int
 460 fshi_hold(fsh_int_t *fshi)
 461 {
 462         int can_hold;
 463 
 464         mutex_enter(&fshi->fshi_lock);
 465         if (fshi->fshi_doomed == 1) {
 466                 can_hold = 0;
 467         } else {
 468                 fshi->fshi_ref++;
 469                 can_hold = 1;
 470         }
 471         mutex_exit(&fshi->fshi_lock);
 472 
 473         return (can_hold);
 474 }
 475 
 476 /*
 477  * This function must not be called while fshfsr_lock is held. Doing so could
 478  * cause a deadlock.
 479  */
 480 static void
 481 fshi_rele(fsh_int_t *fshi)
 482 {
 483         int destroy;
 484 
 485         mutex_enter(&fshi->fshi_lock);
 486         ASSERT(fshi->fshi_ref > 0);
 487         fshi->fshi_ref--;
 488         if (fshi->fshi_ref == 0) {
 489                 ASSERT(fshi->fshi_doomed == 1);
 490                 destroy = 1;
 491         } else {
 492                 destroy = 0;
 493         }
 494         mutex_exit(&fshi->fshi_lock);
 495 
 496         if (destroy) {
 497                 /*
 498                  * At this point, we are sure that fsh_hook_remove() has been
 499                  * called, that's why we don't remove the fshi from fsh_map.
 500                  * fsh_hook_remove() did that already.
 501                  */
 502                 fsh_fsrecord_t *fsrecp;
 503 
 504                 if (fshi->fshi_hooks.remove_cb != NULL)
 505                         (*fshi->fshi_hooks.remove_cb)(
 506                             fshi->fshi_hooks.arg, fshi->fshi_handle);
 507                 /*
 508                  * We don't have to call fsh_prepare_fsrec() here.
 509                  * fsh_fsrecord_t is already initialised, because we've found a
 510                  * mapping for the given handle.
 511                  */
 512                 fsrecp = fshi->fshi_vfsp->vfs_fshrecord;
 513                 ASSERT(fsrecp != NULL);
 514                 ASSERT(fsrecp != fsh_res_ptr);
 515 
 516                 rw_enter(&fsrecp->fshfsr_lock, RW_WRITER);
 517                 list_remove(&fsrecp->fshfsr_list, fshi);
 518                 rw_exit(&fsrecp->fshfsr_lock);
 519 
 520                 id_free(fsh_idspace, fshi->fshi_handle);
 521                 mutex_destroy(&fshi->fshi_lock);
 522                 kmem_free(fshi, sizeof (*fshi));
 523         }
 524 }
 525 
 526 /*
 527  * Used for removing a hook set.
 528  *
 529  * fsh_hook_remove() invalidates the given handle.
 530  *
 531  * It is guaranteed, that after successful return from fsh_hook_remove(),
 532  * calls to vnodeops/vfsops, on the vfs_t on which the hook is installed, won't
 533  * go through this hook.
 534  *
 535  * There is no guarantee that after fsh_hook_remove() returns, the hook
 536  * associated with the handle won't be executing. Instead, it is guaranteed that
 * when remove_cb() is called, the hook finished its execution in all threads.
 538  * It is safe to destroy all internal data associated with this hook inside
 539  * remove_cb().
 540  *
 541  * It is possible that remove_cb() would be called before fsh_hook_remove()
 542  * returns.
 543  *
 544  * Returns (-1) if hook wasn't found, 0 otherwise.
 545  */
 546 int
 547 fsh_hook_remove(fsh_handle_t handle)
 548 {
 549         fsh_int_t       *fshi;
 550 
 551         mutex_enter(&fsh_lock);
 552         for (fshi = list_head(&fsh_map); fshi != NULL;
 553             fshi = list_next(&fsh_map, fshi)) {
 554                 if (fshi->fshi_handle == handle) {
 555                         list_remove(&fsh_map, fshi);
 556                         break;
 557                 }
 558         }
 559 
 560         if (fshi == NULL)
 561                 return (-1);
 562 
 563         mutex_enter(&fshi->fshi_lock);
 564         ASSERT(fshi->fshi_doomed == 0);
 565         fshi->fshi_doomed = 1;
 566         mutex_exit(&fshi->fshi_lock);
 567         mutex_exit(&fsh_lock);
 568 
 569         fshi_rele(fshi);
 570 
 571         return (0);
 572 }
 573 
 574 /*
 575  * API for installing global mount/free callbacks.
 576  *
 577  * fsh_callback_t fields:
 578  * fshc_arg - argument passed to the callbacks
 579  * fshc_free - callback fired before VFS_FREEVFS() is called, after vfs_count
 580  *      drops to 0
 581  * fshc_mount - callback fired right before returning from domount()
 582  * The first argument of these callbacks is the vfs_t that is mounted/freed.
 583  * The second one is the fshc_arg.
 584  *
 585  * fsh_callback_handle_t is filled out by this function.
 586  *
 587  * This function must NOT be called in a callback, because it will cause
 588  * a deadlock.
 589  *
 590  * Returns (-1) if hook/callback limit exceeded.
 591  */
 592 fsh_callback_handle_t
 593 fsh_callback_install(fsh_callback_t *callback)
 594 {
 595         fsh_callback_int_t *fshci;
 596         fsh_callback_handle_t handle;
 597 
 598         if ((handle = id_alloc(fsh_idspace)) == -1)
 599                 return (-1);
 600 
 601         fshci = (fsh_callback_int_t *)kmem_alloc(sizeof (*fshci), KM_SLEEP);
 602         (void) memcpy(&fshci->fshci_cb, callback, sizeof (fshci->fshci_cb));
 603         fshci->fshci_handle = handle;
 604 
 605         /* If it is called in a {mount,free} callback, causes deadlock. */
 606         rw_enter(&fsh_cblist_lock, RW_WRITER);
 607         list_insert_head(&fsh_cblist, fshci);
 608         rw_exit(&fsh_cblist_lock);
 609 
 610         return (handle);
 611 }
 612 
 613 /*
 614  * API for removing global mount/free callbacks.
 615  *
 616  * This function must NOT be called in a callback, because it will cause
 617  * a deadlock.
 618  *
 619  * Returns (-1) if callback wasn't found, 0 otherwise.
 620  */
 621 int
 622 fsh_callback_remove(fsh_callback_handle_t handle)
 623 {
 624         fsh_callback_int_t *fshci;
 625 
 626         /* If it is called in a {mount,free} callback, causes deadlock. */
 627         rw_enter(&fsh_cblist_lock, RW_WRITER);
 628         for (fshci = list_head(&fsh_cblist); fshci != NULL;
 629             fshci = list_next(&fsh_cblist, fshci)) {
 630                 if (fshci->fshci_handle == handle) {
 631                         list_remove(&fsh_cblist, fshci);
 632                         break;
 633                 }
 634         }
 635         rw_exit(&fsh_cblist_lock);
 636 
 637         if (fshci == NULL)
 638                 return (-1);
 639 
 640         kmem_free(fshci, sizeof (*fshci));
 641         id_free(fsh_idspace, handle);
 642 
 643         return (0);
 644 }
 645 
 646 /*
 647  * This function is executed right before returning from domount()@vfs.c.
 648  * We are sure that it's called only after fsh_init().
 649  * It executes all the mount callbacks installed in the fsh.
 650  *
 651  * Since fsh_exec_mount_callbacks() is called only inside domount(), it is legal
 652  * to call fsh_hook_{install,remove}() inside a mount callback WITHOUT holding
 653  * this vfs_t. This guarantee should be preserved, because it's in the "Usage"
 654  * section in the big theory statement at the top of this file.
 655  */
 656 void
 657 fsh_exec_mount_callbacks(vfs_t *vfsp)
 658 {
 659         fsh_callback_int_t *fshci;
 660         fsh_callback_t *cb;
 661 
 662         rw_enter(&fsh_cblist_lock, RW_READER);
 663         for (fshci = list_head(&fsh_cblist); fshci != NULL;
 664             fshci = list_next(&fsh_cblist, fshci)) {
 665                 cb = &fshci->fshci_cb;
 666                 if (cb->fshc_mount != NULL)
 667                         (*(cb->fshc_mount))(vfsp, cb->fshc_arg);
 668         }
 669         rw_exit(&fsh_cblist_lock);
 670 }
 671 
 672 /*
 673  * This function is executed right before VFS_FREEVFS() is called in
 674  * vfs_rele()@vfs.c. We are sure that it's called only after fsh_init().
 675  * It executes all the free callbacks installed in the fsh.
 676  *
 677  * free() callback is the point after the handles associated with the hooks
 678  * installed on this vfs_t become invalid
 679  */
 680 void
 681 fsh_exec_free_callbacks(vfs_t *vfsp)
 682 {
 683         fsh_callback_int_t *fshci;
 684         fsh_callback_t *cb;
 685 
 686         rw_enter(&fsh_cblist_lock, RW_READER);
 687         for (fshci = list_head(&fsh_cblist); fshci != NULL;
 688             fshci = list_next(&fsh_cblist, fshci)) {
 689                 cb = &fshci->fshci_cb;
 690                 if (cb->fshc_free != NULL)
 691                         (*(cb->fshc_free))(vfsp, cb->fshc_arg);
 692         }
 693         rw_exit(&fsh_cblist_lock);
 694 }
 695 
 696 /*
 697  * API for vnode.c/vfs.c to start executing the fsh for a given operation.
 698  *
 699  * fsh_xxx() tries to find the first non-NULL xxx hook on the fshfsr_list. If it
 700  * does, it executes it. If not, underlying vnodeop/vfsop is called.
 701  *
 702  * These interfaces are using fsh_res_ptr (in fsh_prepare_fsrec()), so it's
 703  * absolutely necessary to call fsh_init() before using them. That's done in
 704  * vfsinit().
 705  *
 706  * While these functions are executing, it's expected that necessary vfs_t's
 * are held so that vfs_free() isn't called. vfs_free() expects that no one
 708  * accesses vfs_fshrecord of a given vfs_t.
 709  * It's also the caller's responsibility to keep vnode_t passed to fsh_foo()
 710  * alive and valid.
 711  * All these expectations are met because these functions are used only in
 * corresponding {fop,fsop}_foo() functions.
 713  */
 714 int
 715 fsh_read(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr,
 716         caller_context_t *ct)
 717 {
 718         int ret;
 719         fsh_fsrecord_t *fsrecp;
 720         fsh_int_t *fshi;
 721 
 722         fsh_prepare_fsrec(vp->v_vfsp);
 723         fsrecp = vp->v_vfsp->vfs_fshrecord;
 724 
 725         rw_enter(&fsrecp->fshfsr_lock, RW_READER);
 726         if (!(fsrecp->fshfsr_enabled)) {
 727                 rw_exit(&fsrecp->fshfsr_lock);
 728                 return ((*(vp->v_op->vop_read))(vp, uiop, ioflag, cr, ct));
 729         }
 730 
 731         for (fshi = list_head(&fsrecp->fshfsr_list); fshi != NULL;
 732             fshi = list_next(&fsrecp->fshfsr_list, fshi)) {
 733                 if (fshi->fshi_hooks.read != NULL)
 734                         if (fshi_hold(fshi))
 735                                 break;
 736         }
 737         rw_exit(&fsrecp->fshfsr_lock);
 738 
 739         if (fshi == NULL)
 740                 return ((*(vp->v_op->vop_read))(vp, uiop, ioflag, cr, ct));
 741 
 742         ret = (*fshi->fshi_hooks.read)(fshi, fshi->fshi_hooks.arg,
 743             vp, uiop, ioflag, cr, ct);
 744         fshi_rele(fshi);
 745         return (ret);
 746 }
 747 
 748 int
 749 fsh_write(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr,
 750         caller_context_t *ct)
 751 {
 752         fsh_int_t *fshi;
 753         int ret;
 754         fsh_fsrecord_t *fsrecp;
 755 
 756         fsh_prepare_fsrec(vp->v_vfsp);
 757         fsrecp = vp->v_vfsp->vfs_fshrecord;
 758 
 759         rw_enter(&fsrecp->fshfsr_lock, RW_READER);
 760         if (!(vp->v_vfsp->vfs_fshrecord->fshfsr_enabled)) {
 761                 rw_exit(&fsrecp->fshfsr_lock);
 762                 return ((*(vp->v_op->vop_write))(vp, uiop, ioflag, cr, ct));
 763         }
 764 
 765         for (fshi = list_head(&fsrecp->fshfsr_list); fshi != NULL;
 766             fshi = list_next(&fsrecp->fshfsr_list, fshi)) {
 767                 if (fshi->fshi_hooks.write != NULL)
 768                         if (fshi_hold(fshi))
 769                                 break;
 770         }
 771         rw_exit(&fsrecp->fshfsr_lock);
 772 
 773         if (fshi == NULL)
 774                 return ((*(vp->v_op->vop_write))(vp, uiop, ioflag, cr, ct));
 775 
 776         ret = (*fshi->fshi_hooks.write)(fshi, fshi->fshi_hooks.arg,
 777             vp, uiop, ioflag, cr, ct);
 778         fshi_rele(fshi);
 779         return (ret);
 780 }
 781 
 782 int
 783 fsh_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
 784 {
 785         fsh_fsrecord_t *fsrecp;
 786         fsh_int_t *fshi;
 787         int ret;
 788 
 789         fsh_prepare_fsrec(vfsp);
 790         fsrecp = vfsp->vfs_fshrecord;
 791 
 792         rw_enter(&fsrecp->fshfsr_lock, RW_READER);
 793         if (!(fsrecp->fshfsr_enabled)) {
 794                 rw_exit(&fsrecp->fshfsr_lock);
 795                 return ((*(vfsp->vfs_op->vfs_mount))(vfsp, mvp, uap, cr));
 796         }
 797 
 798         for (fshi = list_head(&fsrecp->fshfsr_list); fshi != NULL;
 799             fshi = list_next(&fsrecp->fshfsr_list, fshi)) {
 800                 if (fshi->fshi_hooks.mount != NULL)
 801                         if (fshi_hold(fshi))
 802                                 break;
 803         }
 804         rw_exit(&fsrecp->fshfsr_lock);
 805 
 806         if (fshi == NULL)
 807                 return ((*(vfsp->vfs_op->vfs_mount))(vfsp, mvp, uap, cr));
 808 
 809         ret = (*fshi->fshi_hooks.mount)(fshi, fshi->fshi_hooks.arg,
 810             vfsp, mvp, uap, cr);
 811         fshi_rele(fshi);
 812         return (ret);
 813 }
 814 
 815 int
 816 fsh_unmount(vfs_t *vfsp, int flag, cred_t *cr)
 817 {
 818         fsh_fsrecord_t *fsrecp;
 819         fsh_int_t *fshi;
 820         int ret;
 821 
 822         fsh_prepare_fsrec(vfsp);
 823         fsrecp = vfsp->vfs_fshrecord;
 824 
 825         rw_enter(&fsrecp->fshfsr_lock, RW_READER);
 826         if (!(fsrecp->fshfsr_enabled)) {
 827                 rw_exit(&fsrecp->fshfsr_lock);
 828                 return ((*(vfsp->vfs_op->vfs_unmount))(vfsp, flag, cr));
 829         }
 830 
 831         for (fshi = list_head(&fsrecp->fshfsr_list); fshi != NULL;
 832             fshi = list_next(&fsrecp->fshfsr_list, fshi)) {
 833                 if (fshi->fshi_hooks.unmount != NULL)
 834                         if (fshi_hold(fshi))
 835                                 break;
 836         }
 837         rw_exit(&fsrecp->fshfsr_lock);
 838 
 839         if (fshi == NULL)
 840                 return ((*(vfsp->vfs_op->vfs_unmount))(vfsp, flag, cr));
 841 
 842         ret = (*fshi->fshi_hooks.unmount)(fshi, fshi->fshi_hooks.arg,
 843             vfsp, flag, cr);
 844         fshi_rele(fshi);
 845         return (ret);
 846 }
 847 
 848 /*
 849  * This is the funtion used by fsh_prepare_fsrec() to allocate a new
 850  * fsh_fsrecord. This function is called by the first function which
 851  * access the vfs_fshrecord and finds out it's NULL.
 852  */
 853 static fsh_fsrecord_t *
 854 fsh_fsrec_create()
 855 {
 856         fsh_fsrecord_t *fsrecp;
 857 
 858         fsrecp = (fsh_fsrecord_t *)kmem_zalloc(sizeof (*fsrecp), KM_SLEEP);
 859         list_create(&fsrecp->fshfsr_list, sizeof (fsh_int_t),
 860             offsetof(fsh_int_t, fshi_next));
 861         rw_init(&fsrecp->fshfsr_lock, NULL, RW_DRIVER, NULL);
 862         fsrecp->fshfsr_enabled = 1;
 863         return (fsrecp);
 864 }
 865 
 866 
 867 /*
 868  * This call can be used ONLY in vfs_free(). It's assumed that no other
 869  * fsh calls using the vfs_t that owns the fsh_fsrecord to be destroyed
 * are executing while a call to fsh_fsrec_destroy() is made. With these
 * assumptions, no concurrency issues occur.
 872  *
 873  * Before calling this function outside the fsh, it's sufficient and
 874  * required to check if the passed fsh_fsrecord * is not NULL. We don't
 875  * have to check if it is not equal to fsh_res_ptr, because all the fsh API
 876  * calls involving this vfs_t should end before vfs_free() is called
 877  * (outside the fsh, fsh_fsrecord is never equal to fsh_res_ptr). That is
 878  * guaranteed by the explicit requirement that the caller of fsh API holds
 879  * the vfs_t when needed.
 880  *
 881  * All the remaining hooks are being removed.
 882  */
 883 void
 884 fsh_fsrec_destroy(struct fsh_fsrecord *volatile fsrecp)
 885 {
 886         fsh_int_t *fshi;
 887 
 888         VERIFY(fsrecp != NULL);
 889 
 890         _NOTE(CONSTCOND)
 891         while (1) {
 892                 mutex_enter(&fsh_lock);
 893                 fshi = list_remove_head(&fsrecp->fshfsr_list);
 894                 if (fshi == NULL) {
 895                         mutex_exit(&fsh_lock);
 896                         break;
 897                 }
 898                 ASSERT(fshi->fshi_doomed == 0);
 899                 list_remove(&fsh_map, fshi);
 900                 mutex_exit(&fsh_lock);
 901 
 902                 if (fshi->fshi_hooks.remove_cb != NULL)
 903                         (*fshi->fshi_hooks.remove_cb)(fshi->fshi_hooks.arg,
 904                             fshi->fshi_handle);
 905                 id_free(fsh_idspace, fshi->fshi_handle);
 906                 mutex_destroy(&fshi->fshi_lock);
 907                 kmem_free(fshi, sizeof (*fshi));
 908 
 909         }
 910 
 911         list_destroy(&fsrecp->fshfsr_list);
 912         rw_destroy(&fsrecp->fshfsr_lock);
 913         kmem_free(fsrecp, sizeof (*fsrecp));
 914 }
 915 
 916 /*
 917  * fsh_init() is called in vfsinit()@vfs.c. This function MUST be called
 918  * before every other fsh call.
 919  */
 920 void
 921 fsh_init(void)
 922 {
 923         rw_init(&fsh_cblist_lock, NULL, RW_DRIVER, NULL);
 924         list_create(&fsh_cblist, sizeof (fsh_callback_int_t),
 925             offsetof(fsh_callback_int_t, fshci_next));
 926 
 927         mutex_init(&fsh_lock, NULL, MUTEX_DRIVER, NULL);
 928 
 929         list_create(&fsh_map, sizeof (fsh_int_t), offsetof(fsh_int_t,
 930             fshi_global));
 931 
 932         /* See comment above fsh_prepare_fsrec() */
 933         fsh_res_ptr = (void *)-1;
 934 
 935         fsh_idspace = id_space_create("fsh", 0, fsh_limit);
 936 }
 937 
 938 /*
 939  * These functions are used to pass control to the next hook or underlying
 * vop or vfsop. The client doesn't have to worry about any locking.
 941  */
 942 int
 943 fsh_next_read(fsh_int_t *fshi, vnode_t *vp, uio_t *uiop, int ioflag,
 944         cred_t *cr, caller_context_t *ct)
 945 {
 946         int ret;
 947         fsh_fsrecord_t *fsrecp = vp->v_vfsp->vfs_fshrecord;
 948 
 949         /*
 950          * The passed fshi is the previous hook (the one from which we've been
 951          * called). We need to find the next one.
 952          */
 953         rw_enter(&fsrecp->fshfsr_lock, RW_READER);
 954         for (fshi = list_next(&fsrecp->fshfsr_list, fshi); fshi != NULL;
 955             fshi = list_next(&fsrecp->fshfsr_list, fshi)) {
 956                 if (fshi->fshi_hooks.read != NULL)
 957                         if (fshi_hold(fshi))
 958                                 break;
 959         }
 960         rw_exit(&fsrecp->fshfsr_lock);
 961 
 962         if (fshi == NULL)
 963                 return ((*vp->v_op->vop_read)(vp, uiop, ioflag, cr, ct));
 964 
 965         ret = (*fshi->fshi_hooks.read)(fshi, fshi->fshi_hooks.arg,
 966             vp, uiop, ioflag, cr, ct);
 967         fshi_rele(fshi);
 968         return (ret);
 969 }
 970 
 971 int
 972 fsh_next_write(fsh_int_t *fshi, vnode_t *vp, uio_t *uiop, int ioflag,
 973         cred_t *cr, caller_context_t *ct)
 974 {
 975         fsh_fsrecord_t *fsrecp = vp->v_vfsp->vfs_fshrecord;
 976         int ret;
 977 
 978         /*
 979          * The passed fshi is the previous hook (the one from which we've been
 980          * called). We need to find the next one.
 981          */
 982         rw_enter(&fsrecp->fshfsr_lock, RW_READER);
 983         for (fshi = list_next(&fsrecp->fshfsr_list, fshi); fshi != NULL;
 984             fshi = list_next(&fsrecp->fshfsr_list, fshi)) {
 985                 if (fshi->fshi_hooks.write != NULL)
 986                         if (fshi_hold(fshi))
 987                                 break;
 988         }
 989         rw_exit(&fsrecp->fshfsr_lock);
 990 
 991         if (fshi == NULL)
 992                 return ((*vp->v_op->vop_write)(vp, uiop, ioflag, cr, ct));
 993 
 994         ret = (*fshi->fshi_hooks.write)(fshi, fshi->fshi_hooks.arg,
 995             vp, uiop, ioflag, cr, ct);
 996         fshi_rele(fshi);
 997         return (ret);
 998 }
 999 
1000 int
1001 fsh_next_mount(fsh_int_t *fshi, vfs_t *vfsp, vnode_t *mvp, struct mounta *uap,
1002         cred_t *cr)
1003 {
1004         fsh_fsrecord_t *fsrecp = vfsp->vfs_fshrecord;
1005         int ret;
1006 
1007         /*
1008          * The passed fshi is the previous hook (the one from which we've been
1009          * called). We need to find the next one.
1010          */
1011         rw_enter(&fsrecp->fshfsr_lock, RW_READER);
1012         for (fshi = list_next(&fsrecp->fshfsr_list, fshi); fshi != NULL;
1013             fshi = list_next(&fsrecp->fshfsr_list, fshi)) {
1014                 if (fshi->fshi_hooks.mount != NULL)
1015                         if (fshi_hold(fshi))
1016                                 break;
1017         }
1018         rw_exit(&fsrecp->fshfsr_lock);
1019 
1020         if (fshi == NULL)
1021                 return ((*(vfsp->vfs_op->vfs_mount))(vfsp, mvp, uap, cr));
1022 
1023         ret = (*fshi->fshi_hooks.mount)(fshi, fshi->fshi_hooks.arg,
1024             vfsp, mvp, uap, cr);
1025         fshi_rele(fshi);
1026         return (ret);
1027 }
1028 
1029 int
1030 fsh_next_unmount(fsh_int_t *fshi, vfs_t *vfsp, int flag, cred_t *cr)
1031 {
1032         fsh_fsrecord_t *fsrecp = vfsp->vfs_fshrecord;
1033         int ret;
1034 
1035         /*
1036          * The passed fshi is the previous hook (the one from which we've been
1037          * called). We need to find the next one.
1038          */
1039         rw_enter(&fsrecp->fshfsr_lock, RW_READER);
1040         for (fshi = list_next(&fsrecp->fshfsr_list, fshi); fshi != NULL;
1041             fshi = list_next(&fsrecp->fshfsr_list, fshi)) {
1042                 if (fshi->fshi_hooks.unmount != NULL)
1043                         if (fshi_hold(fshi))
1044                                 break;
1045         }
1046         rw_exit(&fsrecp->fshfsr_lock);
1047 
1048         if (fshi == NULL)
1049                 return ((*vfsp->vfs_op->vfs_unmount)(vfsp, flag, cr));
1050 
1051         ret = (*fshi->fshi_hooks.unmount)(fshi, fshi->fshi_hooks.arg,
1052             vfsp, flag, cr);
1053         fshi_rele(fshi);
1054         return (ret);
1055 }