1 /*
   2  * This file and its contents are supplied under the terms of the
   3  * Common Development and Distribution License ("CDDL"), version 1.0.
   4  * You may only use this file in accordance with the terms of version
   5  * 1.0 of the CDDL.
   6  *
   7  * A full copy of the text of the CDDL should have accompanied this
   8  * source.  A copy of the CDDL is also available via the Internet at
   9  * http://www.illumos.org/license/CDDL.
  10  */
  11 
  12 /*
  13  * Copyright 2013 Damian Bogel.  All rights reserved.
  14  */
  15 
  16 #include <sys/debug.h>
  17 #include <sys/errno.h>
  18 #include <sys/fsh.h>
  19 #include <sys/fsh_impl.h>
  20 #include <sys/id_space.h>
  21 #include <sys/kmem.h>
  22 #include <sys/ksynch.h>
  23 #include <sys/list.h>
  24 #include <sys/sunddi.h>
  25 #include <sys/sysmacros.h>
  26 #include <sys/types.h>
  27 #include <sys/vfs.h>
  28 #include <sys/vnode.h>
  29 
  30 /*
  31  * Filesystem hook framework (fsh)
  32  *
  33  * 1. Abstract.
  34  * The main goal of the filesystem hook framework is to provide an easy way to
  35  * inject client-defined behaviour into vfs/vnode calls. fsh works on
  36  * vfs_t granularity.
  37  *
  38  * Note: In this document, both an fsh_t structure and hooking function for a
  39  * vnodeop/vfsop is referred to as *hook*.
  40  *
  41  *
  42  * 2. Overview.
  43  * fsh_t is the main object in the fsh. An fsh_t is a structure containing:
  44  *      - pointers to hooking functions
  45  *      - an argument to pass (this is shared for all the hooks in a given
  46  *      fsh_t)
  47  *      - a pointer to the *hook remove callback*
  48  *
  49  * The information from fsh_t is copied by the fsh and an fsh_handle_t
  50  * is returned. It should be used for further removing.
  51  *
  52  *
  53  * 3. Usage.
  54  * It is expected that vfs_t/vnode_t passed to fsh_foo() functions are held by
  55  * the caller when needed. fsh does no vfs_t/vnode_t locking.
  56  *
  57  * fsh_t is a structure filled out by the client. It contains:
  58  *      - pointers to hooking functions
  59  *      - the argument passed to the hooks
  60  *      - the *hook remove callback*
  61  *
  62  * If a client does not want to add a hook for function foo(), he should fill
  63  * corresponding fields with NULLs. For every vfsop/vnodeop there are two
  64  * fields: pre_foo() and post_foo(). These are the functions called before and
  65  * after the next hook or underlying vfsop/vnodeop.
  66  *
  67  * Pre hooks take:
  68  *      - arg
  69  *      - pointer to a field containing void* - it should be filled whenever
  70  *      the client wants to have some data shared by the pre and post hooks in
  71  *      the same syscall execution. This is called the *instance data*.
  72  *      - pointers to the arguments passed to the underlying vfsop/vnodeop
  73  * Pre hooks return void.
  74  *
  75  * Post hooks take:
  76  *      - value returned by the previous post hook or underlying vfsop/vnodeop
  77  *      - arg
  78  *      - pointer to the *instance data*
  79  *      - arguments passed to the underlying vfsop/vnodeop
  80  * Post hooks return an int, which should be treated as the vfsop/vnodeop
  81  * return value.
  82  * Memory allocated by pre hook must be deallocated by the post hook.
  83  *
  84  * Execution path of hooks A, B, C is as follows:
  85  * foo()
  86  *      preA(argA, &instancepA, ...);
  87  *      preB(argB, &instancepB, ...);
  88  *      preC(argC, &instancepC, ...);
  89  *      ret = VOP_FOO();
  90  *      ret = postC(ret, argC, instancepC, ...);
  91  *      ret = postB(ret, argB, instancepB, ...);
  92  *      ret = postA(ret, argA, instancepA, ...);
  93  *      return (ret);
  94  *
  95  * After installation, an fsh_handle_t is returned to the caller.
  96  *
  97  * Hook remove callback - it's a function being fired after a hook is removed
  98  * and no thread is going to execute it anymore. It's safe to destroy all the
  99  * data associated with this hook inside it.
 100  *
 101  * It is guaranteed, that whenever a pre_hook() is called, there will be also
 102  * post_hook() called within the same syscall.
 103  *
 104  * If a hook (HNew) is installed/removed on/from a vfs_t within execution of
 105  * another hook (HExec) installed on this vfs_t, the syscall that executes
 106  * HExec won't fire HNew.
 107  *
 108  * A client might want to fire callbacks when vfs_ts are being mounted
 109  * or freed. There's an fsh_callback_t structure provided to install such
 110  * callbacks along with the API.
 111  * It is legal to call fsh_hook_{install,remove}() inside a mount callback
 112  * WITHOUT holding the vfs_t.
 113  *
 114  * After vfs_t's free callback returns, all the handles associated with the
 115  * hooks installed on this vfs_t are invalid and must not be used.
 116  *
 117  * 4. API
 118  * None of the APIs should be called during interrupt context above lock
 119  * level.
 120  *
 121  * a) fsh.h
 122  * Any of these functions could be called in a hook or a hook remove callback.
 123  * The only functions that must not be called inside a {mount,free} callback are
 124  * fsh_callback_{install,remove}. Using them will cause a deadlock.
 125  *
 126  *
 127  * fsh_fs_enable(vfs_t *vfsp)
 128  * fsh_fs_disable(vfs_t *vfsp)
 129  *      Enables/disables fsh for a given vfs_t.
 130  *
 131  * fsh_hook_install(vfs_t *vfsp, fsh_t *hooks)
 132  *      Installs hooks on vfsp filesystem.
 133  *      It's important that hooks are executed in LIFO installation order,
 134  *      which means that if there are hooks A and B installed in this order, B
 135  *      is going to be executed before A.
 136  *      It returns a correct handle, or (-1) if hook/callback limit exceeded.
 137  *      The handle is valid until a free callback returns or an explicit call
 138  *      to fsh_hook_remove().
 139  *
 140  * fsh_hook_remove(fsh_handle_t handle)
 141  *      Removes a hook and invalidates the handle.
 142  *      It is guaranteed that after this function returns, calls to
 143  *      vnodeops/vfsops won't go through this hook, although there might be
 144  *      some threads still executing this hook. When hook remove callback is
 145  *      fired, it is guaranteed that the hook won't be executed anymore. It is
 146  *      safe to remove all the internal data associated with this hook inside
 147  *      the hook remove callback. The hook remove callback could be called
 148  *      inside fsh_hook_remove().
 149  *
 150  *
 151  * fsh_callback_install(fsh_callback_t *callback)
 152  * fsh_callback_remove(fsh_callback_handle_t handle)
 153  *      Installs/removes callbacks for vfs_t mount/free. The mount callback
 154  *      is executed right before domount() returns. The free callback is
 155  *      called right before VFS_FREEVFS() is called.
 156  *      The fsh_callback_install() returns a correct handle, or (-1) if
 157  *      hook/callback limit exceeded.
 158  *
 159  *
 160  * b) fsh_impl.h (for vfs.c and vnode.c only)
 161  * fsh_init()
 162  *      This call has to be done in vfsinit(). It initialises the fsh. It
 163  *      is absolutely necessary that this call is made before any other fsh
 164  *      operation.
 165  *
 166  * fsh_exec_mount_callbacks(vfs_t *vfsp)
 167  * fsh_exec_free_callbacks(vfs_t *vfsp)
 168  *      Used to execute all fsh callbacks for {mount,free} of a vfs_t.
 169  *
 170  * fsh_fsrec_destroy(struct fsh_fsrecord *fsrecp)
 171  *      Destroys an fsh_fsrecord structure. All the hooks installed on this
 172  *      vfs_t are then destroyed. free callback is called before this function.
 173  *
 174  * fsh_foo(ARGUMENTS)
 175  *      Function used to execute the hook chain for a given syscall.
 176  *
 177  *
 178  * 5. Internals.
 179  * fsh_int_t is an internal hook structure. It is reference counted.
 180  * fshi_hold() and fshi_rele() should be used whenever needed.
 181  * fsh_int_t entries are elements of both fsh_map (global) and fshfsr_list
 182  * (local to vfs_t). All entries are unique and are identified by fshi_handle.
 183  *
 184  * fsh_int_t properties:
 185  *      - fsh_hook_install() sets the ref. counter to 1 and adds it to both
 186  *      fsh_map and fshfsr_list
 187  *      - fsh_hook_remove() decreases the ref. counter by 1, removes the hook
 188  *      from fsh_map and marks the hook as *doomed*
 189  *      - if fsh_int_t is on the fshfsr_list, it's alive and there is a thread
 190  *      executing it
 191  *      - if fsh_int_t is marked as *doomed*, the reference counter cannot
 192  *      be increased and thus no thread can acquire this fsh_int_t
 193  *      - ref. counter can drop to 0 only after an fsh_hook_remove() call; this
 194  *      also means that the fsh_int_t is *doomed* and isn't a part of fsh_map
 195  *      - fsh_int_t could be also destroyed without fsh_hook_remove() call,
 196  *      that happens only inside fsh_fsrec_destroy() where it is guaranteed
 197  *      that there is no thread executing the hook
 198  *
 199  *
 200  * fsh_fsrecord_t is a structure which lives inside a vfs_t.
 201  * fsh_fsrecord_t contains:
 202  *      - an rw-lock that protects the structure
 203  *      - a list of hooks installed on this vfs_t
 204  *      - a flag which tells whether fsh is enabled on this vfs_t
 205  *
 206  *
 207  * fsh_fsrec_prepare rule:
 208  * Every function that needs vfsp->vfs_fshrecord has to call
 209  * fsh_fsrec_prepare() first. If and only if the call is made, it is safe to
 210  * use vfsp->vfs_fshrecord.
 211  *
 212  * Unfortunately, because of unexpected behaviour of some filesystems (no use
 213  * of vfs_alloc()/vfs_init()) there's no good place to initialise the
 214  * fsh_fsrecord_t structure. The approach being used here is to check if it's
 215  * initialised in every call. Because of the fact that no lock could be used
 216  * here (the same problem with initialisation), a spinlock is used.  This is
 217  * explained in more detail in a comment before fsh_fsrec_prepare(). After
 218  * calling fsh_fsrec_prepare() it's completely safe to keep the vfs_fshrecord
 219  * pointer locally, because it won't be changed until vfs_free() is called.
 220  *
 221  * Exceptions from this rule:
 222  * - vfs_free() - it is expected that no other fsh calls would be made for the
 223  * vfs_t that's being freed. That's why vfs_fshrecord could be only NULL or a
 224  * valid pointer and could not be concurrently accessed.
 225  * - fshi_rele() - fsh_hook_install() comes before first fshi_rele() call;
 226  * the fsh_fsrecord_t has been initialised there
 227  *
 228  *
 229  * When there are no fsh functions (that use a particular fsh_fsrecord_t)
 230  * executing, the vfs_fshrecord pointer won't be equal to fsh_res_ptr. It
 231  * would be NULL or a pointer to an initialised fsh_fsrecord_t.
 232  *
 233  * It is required and sufficient to check if fsh_fsrecord_t is not NULL before
 234  * passing it to fsh_fsrec_destroy. We don't have to check if it is not equal
 235  * to fsh_res_ptr, because all the fsh API calls involving this vfs_t should
 236  * end before vfs_free() is called (outside the fsh, fsh_fsrecord is never
 237  * equal to fsh_res_ptr). That is guaranteed by the explicit requirement that
 238  * the caller of fsh API holds the vfs_t when needed. fsh_hook_remove() must not
 239  * be called either, because the handles are invalidated after free callback has
 240  * fired.
 241  *
 242  *
 243  * Callbacks:
 244  * Mount callbacks are executed by a call to fsh_exec_mount_callbacks() right
 245  * before returning from domount()@vfs.c.
 246  *
 247  * Free callbacks are executed by a call to fsh_exec_free_callbacks() right
 248  * before calling VFS_FREEVFS(), after vfs_t's reference count drops to 0.
 249  *
 250  *
 251  * 6. Locking
 252  * a) public
 253  * fsh does no vfs_t nor vnode_t locking. It is expected that whenever it is
 254  * needed, the client does that.
 255  *
 256  * No locks are held across hooks or hook remove callbacks execution. It is
 257  * safe to use fsh API inside hooks and hook remove callbacks.
 258  *
 259  * fsh_cb_lock is held across {mount,free} callbacks. Calling
 260  * fsh_callback_{install,remove} inside of a callback will cause a deadlock.
 261  *
 262  * b) internals
 263  * Locking diagram:
 264  *
 265  *     fsh_hook_remove()          fsh_hook_install()   fsh_fsrec_destroy()
 266  *           |                            |                |
 267  *           |                            |                |
 268  *           +------------------+         |   +------------+
 269  *           |                  |         |   |
 270  *           |                  V         |   |
 271  *           V               +------------|---|-+
 272  *      fshi_rele()          |  fsh_lock  |   | |
 273  *      (sometimes)          +------------|---|-+
 274  *                                 |      |   |
 275  *                                 |      +---+-- fshfsr_lock, RW_WRITER -+
 276  *                                 |                                      |
 277  *                                 V                                      |
 278  *               +---------------------------------------+                |
 279  *               |               fsh_map                 |                |
 280  *               |                                       |                |
 281  *          +----|-> vfsp->vfs_fshrecord->fshfsr_list <--|----------------+
 282  *          |    +------------------------------^--------+
 283  *          |                                   |
 284  *          |                                   |
 285  * fshfsr_lock, RW_READER              fshfsr_lock, RW_WRITER
 286  *          |                                   |
 287  *          |                                   |
 288  *   fsh_read(),                            fshi_rele()
 289  *   fsh_write(),
 290  *   ...                                Might be called from:
 291  *                                        fsh_hook_remove()
 292  *                                        fsh_read(), fsh_write(), ...
 293  *
 294  *
 295  * fsh_lock is a global lock for administrative path (fsh_hook_install,
 296  * fsh_hook_remove) and fsh_fsrec_destroy() (which is semi-administrative, since
 297  * it destroys the unremoved hooks). It is used only when fsh_map needs to be
 298  * locked. The usage of this lock guarantees that the data in fsh_map and
 299  * fshfsr_lists is consistent.
 300  *
 301  * In order to make calling callbacks inside callbacks possible, fsh_cb_owner is
 302  * set by fsh_exec_{mount,free}_callbacks() to the thread that owns the
 303  * fsh_cb_lock.  It's always checked if we are owners of the mutex before
 304  * entering it.
 305  *
 306  */
 307 
 308 
 309 /* Internals */
 310 typedef struct fsh_int {
 311         fsh_handle_t    fshi_handle;
 312         fsh_t           fshi_hooks;
 313         vfs_t           *fshi_vfsp;
 314 
 315         kmutex_t        fshi_lock;
 316         uint64_t        fshi_ref;
 317         uint64_t        fshi_doomed;    /* changed inside fsh_lock */
 318 
 319         /* next node in fshfsr_list */
 320         list_node_t     fshi_node;
 321 
 322         /* next node in fsh_map */
 323         list_node_t     fshi_global;
 324 } fsh_int_t;
 325 
 326 typedef struct fsh_callback_int {
 327         fsh_callback_t  fshci_cb;
 328         fsh_callback_handle_t fshci_handle;
 329         list_node_t     fshci_node;
 330 } fsh_callback_int_t;
 331 
 332 
 333 typedef struct fsh_exec {
 334         fsh_int_t       *fshe_fshi;
 335         void            *fshe_instance;
 336         list_node_t     fshe_node;
 337 } fsh_exec_t;
 338 
 339 
 340 static kmutex_t fsh_lock;
 341 
 342 /*
 343  * fsh_fsrecord_t is the main internal structure. It's content is protected
 344  * by fshfsr_lock. The fshfsr_list is a list of fsh_int_t hook entries for
 345  * the vfs_t that contains the fsh_fsrecord_t.
 346  */
 347 struct fsh_fsrecord {
 348         krwlock_t       fshfsr_lock;
 349         int             fshfsr_enabled;
 350         list_t          fshfsr_list;
 351 };
 352 
 353 /*
 354  * Global list of fsh_int_t. Protected by fsh_lock.
 355  */
 356 static list_t fsh_map;
 357 
 358 /*
 359  * Global list of fsh_callback_int_t.
 360  */
 361 static kmutex_t fsh_cb_lock;
 362 static kmutex_t fsh_cb_owner_lock;
 363 static kthread_t *fsh_cb_owner;
 364 static list_t fsh_cblist;
 365 
 366 /*
 367  * A reserved pointer for fsh purposes. It is used because of the method
 368  * chosen for solving concurrency issues with vfs_fshrecord. The full
 369  * explanation is in the big theory statement at the beginning of this
 370  * file and above fsh_fsrec_prepare(). It is initialised in fsh_init().
 371  */
 372 static void *fsh_res_ptr;
 373 
 374 static fsh_fsrecord_t *fsh_fsrec_create();
 375 
 376 int fsh_limit = INT_MAX;
 377 static id_space_t *fsh_idspace;
 378 
 379 /*
 380  * fsh_fsrec_prepare()
 381  *
 382  * Important note:
 383  * Before using this function, fsh_init() MUST be called. We do that in
 384  * vfsinit()@vfs.c.
 385  *
 386  * One would ask, why isn't the vfsp->vfs_fshrecord initialised when the
 387  * vfs_t is created. Unfortunately, some filesystems (e.g. fifofs) do not
 388  * call vfs_init() or even vfs_alloc(), It's possible that some unbundled
 389  * filesystems could do the same thing. That's why this solution is
 390  * introduced. It should be called before any code that needs access to
 391  * vfs_fshrecord.
 392  *
 393  * Locking:
 394  * There are no locks here, because there's no good place to initialise
 395  * the lock. Concurrency issues are solved by using atomic instructions
 396  * and a spinlock, which is spinning only once for a given vfs_t. Because
 397  * of that, the usage of the spinlock isn't bad at all.
 398  *
 399  * How it works:
 400  * a) if vfsp->vfs_fshrecord equals NULL, atomic_cas_ptr() changes it to
 401  *      fsh_res_ptr. That's a signal for other threads, that the structure
 402  *      is being initialised.
 403  * b) if vfsp->vfs_fshrecord equals fsh_res_ptr, that means we have to wait,
 404  *      because vfs_fshrecord is being initialised by another call.
 405  * c) other cases:
 406  *      vfs_fshrecord is already initialised, so we can use it. It won't change
 407  *      until vfs_free() is called. It can't happen when someone is holding
 408  *      the vfs_t, which is expected from the caller of fsh API.
 409  */
 410 static void
 411 fsh_fsrec_prepare(vfs_t *vfsp)
 412 {
 413         fsh_fsrecord_t *fsrec;
 414 
 415         while ((fsrec = atomic_cas_ptr(&vfsp->vfs_fshrecord, NULL,
 416             fsh_res_ptr)) == fsh_res_ptr)
 417                 ;
 418 
 419         if (fsrec == NULL)
 420                 atomic_swap_ptr(&vfsp->vfs_fshrecord, fsh_fsrec_create());
 421 }
 422 
 423 /*
 424  * API for enabling/disabling fsh per vfs_t.
 425  *
 426  * A newly created vfs_t has fsh enabled by default. If one would want to change
 427  * this behaviour, mount callbacks could be used.
 428  *
 429  * The caller is expected to hold the vfs_t.
 430  *
 431  * These functions must NOT be called in a hook.
 432  */
 433 void
 434 fsh_fs_enable(vfs_t *vfsp)
 435 {
 436         fsh_fsrec_prepare(vfsp);
 437 
 438         rw_enter(&vfsp->vfs_fshrecord->fshfsr_lock, RW_WRITER);
 439         vfsp->vfs_fshrecord->fshfsr_enabled = 1;
 440         rw_exit(&vfsp->vfs_fshrecord->fshfsr_lock);
 441 }
 442 
 443 void
 444 fsh_fs_disable(vfs_t *vfsp)
 445 {
 446         fsh_fsrec_prepare(vfsp);
 447 
 448         rw_enter(&vfsp->vfs_fshrecord->fshfsr_lock, RW_WRITER);
 449         vfsp->vfs_fshrecord->fshfsr_enabled = 0;
 450         rw_exit(&vfsp->vfs_fshrecord->fshfsr_lock);
 451 }
 452 
 453 /*
 454  * API used for installing hooks. fsh_handle_t is returned for further
 455  * actions (currently just removing) on this set of hooks.
 456  *
 457  * It's important that the hooks are executed in LIFO installation order (they
 458  * are added to the head of the hook list).
 459  *
 460  * The caller is expected to hold the vfs_t.
 461  *
 462  * Returns (-1) if hook/callback limit exceeded, handle otherwise.
 463  */
 464 fsh_handle_t
 465 fsh_hook_install(vfs_t *vfsp, fsh_t *hooks)
 466 {
 467         fsh_handle_t    handle;
 468         fsh_int_t       *fshi;
 469 
 470         fsh_fsrec_prepare(vfsp);
 471 
 472         if ((handle = id_alloc(fsh_idspace)) == -1)
 473                 return (-1);
 474 
 475         fshi = kmem_alloc(sizeof (*fshi), KM_SLEEP);
 476         mutex_init(&fshi->fshi_lock, NULL, MUTEX_DRIVER, NULL);
 477         (void) memcpy(&fshi->fshi_hooks, hooks, sizeof (fshi->fshi_hooks));
 478         fshi->fshi_handle = handle;
 479         fshi->fshi_doomed = 0;
 480         fshi->fshi_ref = 1;
 481         fshi->fshi_vfsp = vfsp;
 482 
 483         mutex_enter(&fsh_lock);
 484         rw_enter(&vfsp->vfs_fshrecord->fshfsr_lock, RW_WRITER);
 485         list_insert_head(&vfsp->vfs_fshrecord->fshfsr_list, fshi);
 486         rw_exit(&vfsp->vfs_fshrecord->fshfsr_lock);
 487 
 488         list_insert_head(&fsh_map, fshi);
 489         mutex_exit(&fsh_lock);
 490 
 491         return (handle);
 492 }
 493 
 494 static int
 495 fshi_hold(fsh_int_t *fshi)
 496 {
 497         int can_hold;
 498 
 499         mutex_enter(&fshi->fshi_lock);
 500         if (fshi->fshi_doomed == 1) {
 501                 can_hold = 0;
 502         } else {
 503                 fshi->fshi_ref++;
 504                 can_hold = 1;
 505         }
 506         mutex_exit(&fshi->fshi_lock);
 507 
 508         return (can_hold);
 509 }
 510 
 511 /*
 512  * This function must not be called while fshfsr_lock is held. Doing so could
 513  * cause a deadlock.
 514  */
 515 static void
 516 fshi_rele(fsh_int_t *fshi)
 517 {
 518         int destroy;
 519 
 520         mutex_enter(&fshi->fshi_lock);
 521         ASSERT(fshi->fshi_ref > 0);
 522         fshi->fshi_ref--;
 523         if (fshi->fshi_ref == 0) {
 524                 ASSERT(fshi->fshi_doomed == 1);
 525                 destroy = 1;
 526         } else {
 527                 destroy = 0;
 528         }
 529         mutex_exit(&fshi->fshi_lock);
 530 
 531         if (destroy) {
 532                 /*
 533                  * At this point, we are sure that fsh_hook_remove() has been
 534                  * called, that's why we don't remove the fshi from fsh_map.
 535                  * fsh_hook_remove() did that already.
 536                  * There is also no need to call fsh_fsrec_prepare() here.
 537                  */
 538                 fsh_fsrecord_t *fsrecp;
 539 
 540                 /*
 541                  * We don't have to call fsh_fsrec_prepare() here.
 542                  * fsh_fsrecord_t is already initialised, because we've found a
 543                  * mapping for the given handle.
 544                  */
 545                 fsrecp = fshi->fshi_vfsp->vfs_fshrecord;
 546                 ASSERT(fsrecp != NULL);
 547                 ASSERT(fsrecp != fsh_res_ptr);
 548 
 549                 rw_enter(&fsrecp->fshfsr_lock, RW_WRITER);
 550                 list_remove(&fsrecp->fshfsr_list, fshi);
 551                 rw_exit(&fsrecp->fshfsr_lock);
 552 
 553                 if (fshi->fshi_hooks.remove_cb != NULL)
 554                         (*fshi->fshi_hooks.remove_cb)(
 555                             fshi->fshi_hooks.arg, fshi->fshi_handle);
 556 
 557                 id_free(fsh_idspace, fshi->fshi_handle);
 558                 mutex_destroy(&fshi->fshi_lock);
 559                 kmem_free(fshi, sizeof (*fshi));
 560         }
 561 }
 562 
 563 /*
 564  * Used for removing a hook set.
 565  *
 566  * fsh_hook_remove() invalidates the given handle.
 567  *
 568  * It is guaranteed, that after successful return from fsh_hook_remove(),
 569  * calls to vnodeops/vfsops, on the vfs_t on which the hook is installed, won't
 570  * go through this hook.
 571  *
 572  * There is no guarantee that after fsh_hook_remove() returns, the hook
 573  * associated with the handle won't be executing. Instead, it is guaranteed that
 574  * when remove_cb() is called, the hook finished it's execution in all threads.
 575  * It is safe to destroy all internal data associated with this hook inside
 576  * remove_cb().
 577  *
 578  * It is possible that remove_cb() would be called before fsh_hook_remove()
 579  * returns.
 580  *
 581  * Returns (-1) if hook wasn't found, 0 otherwise.
 582  */
 583 int
 584 fsh_hook_remove(fsh_handle_t handle)
 585 {
 586         fsh_int_t       *fshi;
 587 
 588         mutex_enter(&fsh_lock);
 589         for (fshi = list_head(&fsh_map); fshi != NULL;
 590             fshi = list_next(&fsh_map, fshi)) {
 591                 if (fshi->fshi_handle == handle) {
 592                         list_remove(&fsh_map, fshi);
 593                         break;
 594                 }
 595         }
 596 
 597         if (fshi == NULL)
 598                 return (-1);
 599 
 600         mutex_enter(&fshi->fshi_lock);
 601         ASSERT(fshi->fshi_doomed == 0);
 602         fshi->fshi_doomed = 1;
 603         mutex_exit(&fshi->fshi_lock);
 604         mutex_exit(&fsh_lock);
 605 
 606         fshi_rele(fshi);
 607 
 608         return (0);
 609 }
 610 
 611 /*
 612  * API for installing global mount/free callbacks.
 613  *
 614  * fsh_callback_t fields:
 615  * fshc_arg - argument passed to the callbacks
 616  * fshc_free - callback fired before VFS_FREEVFS() is called, after vfs_count
 617  *      drops to 0
 618  * fshc_mount - callback fired right before returning from domount()
 619  * The first argument of these callbacks is the vfs_t that is mounted/freed.
 620  * The second one is the fshc_arg.
 621  *
 622  * fsh_callback_handle_t is filled out by this function.
 623  *
 624  * Returns (-1) if hook/callback limit exceeded.
 625  *
 626  * Calling this function in a {mount,free} callback will cause a deadlock.
 627  */
 628 fsh_callback_handle_t
 629 fsh_callback_install(fsh_callback_t *callback)
 630 {
 631         fsh_callback_int_t *fshci;
 632         fsh_callback_handle_t handle;
 633 
 634         if ((handle = id_alloc(fsh_idspace)) == -1)
 635                 return (-1);
 636 
 637         fshci = (fsh_callback_int_t *)kmem_alloc(sizeof (*fshci), KM_SLEEP);
 638         (void) memcpy(&fshci->fshci_cb, callback, sizeof (fshci->fshci_cb));
 639         fshci->fshci_handle = handle;
 640 
 641         mutex_enter(&fsh_cb_lock);
 642         list_insert_head(&fsh_cblist, fshci);
 643         mutex_exit(&fsh_cb_lock);
 644 
 645         return (handle);
 646 }
 647 
 648 /*
 649  * API for removing global mount/free callbacks.
 650  *
 651  * Returns (-1) if callback wasn't found, 0 otherwise.
 652  *
 653  * Calling this function in a {mount,free} callback will cause a deadlock.
 654  */
 655 int
 656 fsh_callback_remove(fsh_callback_handle_t handle)
 657 {
 658         fsh_callback_int_t *fshci;
 659 
 660         mutex_enter(&fsh_cb_lock);
 661 
 662         for (fshci = list_head(&fsh_cblist); fshci != NULL;
 663             fshci = list_next(&fsh_cblist, fshci)) {
 664                 if (fshci->fshci_handle == handle) {
 665                         list_remove(&fsh_cblist, fshci);
 666                         break;
 667                 }
 668         }
 669 
 670         mutex_exit(&fsh_cb_lock);
 671 
 672         if (fshci == NULL)
 673                 return (-1);
 674 
 675         kmem_free(fshci, sizeof (*fshci));
 676         id_free(fsh_idspace, handle);
 677 
 678         return (0);
 679 }
 680 
 681 /*
 682  * This function is executed right before returning from domount()@vfs.c.
 683  * We are sure that it's called only after fsh_init().
 684  * It executes all the mount callbacks installed in the fsh.
 685  *
 686  * Since fsh_exec_mount_callbacks() is called only inside domount(), it is legal
 687  * to call fsh_hook_{install,remove}() inside a mount callback WITHOUT holding
 688  * this vfs_t. This guarantee should be preserved, because it's in the "Usage"
 689  * section in the big theory statement at the top of this file.
 690  */
 691 void
 692 fsh_exec_mount_callbacks(vfs_t *vfsp)
 693 {
 694         fsh_callback_int_t *fshci;
 695         fsh_callback_t *cb;
 696         int fsh_context;
 697 
 698         mutex_enter(&fsh_cb_owner_lock);
 699         fsh_context = fsh_cb_owner == curthread;
 700         mutex_exit(&fsh_cb_owner_lock);
 701 
 702         if (!fsh_context) {
 703                 mutex_enter(&fsh_cb_lock);
 704                 mutex_enter(&fsh_cb_owner_lock);
 705                 fsh_cb_owner = curthread;
 706                 mutex_exit(&fsh_cb_owner_lock);
 707         }
 708 
 709         ASSERT(MUTEX_HELD(&fsh_cb_lock));
 710 
 711         for (fshci = list_head(&fsh_cblist); fshci != NULL;
 712             fshci = list_next(&fsh_cblist, fshci)) {
 713                 cb = &fshci->fshci_cb;
 714                 if (cb->fshc_mount != NULL)
 715                         (*(cb->fshc_mount))(vfsp, cb->fshc_arg);
 716         }
 717 
 718         if (!fsh_context) {
 719                 mutex_enter(&fsh_cb_owner_lock);
 720                 fsh_cb_owner = NULL;
 721                 mutex_exit(&fsh_cb_owner_lock);
 722                 mutex_exit(&fsh_cb_lock);
 723         }
 724 }
 725 
 726 /*
 727  * This function is executed right before VFS_FREEVFS() is called in
 728  * vfs_rele()@vfs.c. We are sure that it's called only after fsh_init().
 729  * It executes all the free callbacks installed in the fsh.
 730  *
 731  * free() callback is the point after the handles associated with the hooks
 732  * installed on this vfs_t become invalid
 733  */
 734 void
 735 fsh_exec_free_callbacks(vfs_t *vfsp)
 736 {
 737         fsh_callback_int_t *fshci;
 738         fsh_callback_t *cb;
 739         int fsh_context;
 740 
 741         mutex_enter(&fsh_cb_owner_lock);
 742         fsh_context = fsh_cb_owner == curthread;
 743         mutex_exit(&fsh_cb_owner_lock);
 744 
 745         if (!fsh_context) {
 746                 mutex_enter(&fsh_cb_lock);
 747                 mutex_enter(&fsh_cb_owner_lock);
 748                 fsh_cb_owner = curthread;
 749                 mutex_exit(&fsh_cb_owner_lock);
 750         }
 751 
 752         ASSERT(MUTEX_HELD(&fsh_cb_lock));
 753 
 754         for (fshci = list_head(&fsh_cblist); fshci != NULL;
 755             fshci = list_next(&fsh_cblist, fshci)) {
 756                 cb = &fshci->fshci_cb;
 757                 if (cb->fshc_free != NULL)
 758                         (*(cb->fshc_free))(vfsp, cb->fshc_arg);
 759         }
 760 
 761         if (!fsh_context) {
 762                 mutex_enter(&fsh_cb_owner_lock);
 763                 fsh_cb_owner = NULL;
 764                 mutex_exit(&fsh_cb_owner_lock);
 765                 mutex_exit(&fsh_cb_lock);
 766         }
 767 }
 768 
 769 /*
 770  * API for vnode.c/vfs.c to start executing the fsh for a given operation.
 771  *
 772  * fsh_xxx() tries to find the first non-NULL xxx hook on the fshfsr_list. If it
 773  * does, it executes it. If not, underlying vnodeop/vfsop is called.
 774  *
 775  * These interfaces are using fsh_res_ptr (in fsh_fsrec_prepare()), so it's
 776  * absolutely necessary to call fsh_init() before using them. That's done in
 777  * vfsinit().
 778  *
 779  * While these functions are executing, it's expected that necessary vfs_t's
 780  * are held so that vfs_free() isn't called. vfs_free() expects that no one
 781  * accesses vfs_fshrecord of a given vfs_t.
 782  * It's also the caller's responsibility to keep vnode_t passed to fsh_foo()
 783  * alive and valid.
 784  * All these expectations are met because these functions are used only in
 785  * corresponding {fop,fsop}_foo() functions.
 786  */
 787 int
 788 fsh_read(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr,
 789         caller_context_t *ct)
 790 {
 791         int ret;
 792         fsh_fsrecord_t *fsrecp;
 793         fsh_int_t *fshi;
 794         fsh_exec_t *fshe;
 795         list_t exec_list;
 796 
 797         fsh_fsrec_prepare(vp->v_vfsp);
 798         fsrecp = vp->v_vfsp->vfs_fshrecord;
 799 
 800         rw_enter(&fsrecp->fshfsr_lock, RW_READER);
 801         if (!(fsrecp->fshfsr_enabled)) {
 802                 rw_exit(&fsrecp->fshfsr_lock);
 803                 return ((*vp->v_op->vop_read)(vp, uiop, ioflag, cr, ct));
 804         }
 805 
 806         list_create(&exec_list, sizeof (fsh_exec_t),
 807             offsetof(fsh_exec_t, fshe_node));
 808 
 809         for (fshi = list_head(&fsrecp->fshfsr_list); fshi != NULL;
 810             fshi = list_next(&fsrecp->fshfsr_list, fshi)) {
 811                 if (fshi->fshi_hooks.pre_read != NULL ||
 812                     fshi->fshi_hooks.post_read != NULL) {
 813                         if (fshi_hold(fshi)) {
 814                                 fshe = kmem_alloc(sizeof (*fshe), KM_SLEEP);
 815                                 fshe->fshe_fshi = fshi;
 816                                 list_insert_tail(&exec_list, fshe);
 817                         }
 818                 }
 819         }
 820         rw_exit(&fsrecp->fshfsr_lock);
 821 
 822         /* Execute pre hooks */
 823         for (fshe = list_head(&exec_list); fshe != NULL;
 824             fshe = list_next(&exec_list, fshe)) {
 825                 if (fshe->fshe_fshi->fshi_hooks.pre_read != NULL)
 826                         (*fshe->fshe_fshi->fshi_hooks.pre_read)(
 827                             fshe->fshe_fshi->fshi_hooks.arg,
 828                             &fshe->fshe_instance,
 829                             &vp, &uiop, &ioflag, &cr, &ct);
 830         }
 831 
 832         ret = (*vp->v_op->vop_read)(vp, uiop, ioflag, cr, ct);
 833 
 834         /* Execute post hooks */
 835         while ((fshe = list_remove_tail(&exec_list)) != NULL) {
 836                 if (fshe->fshe_fshi->fshi_hooks.post_read != NULL)
 837                         ret = (*fshe->fshe_fshi->fshi_hooks.post_read)(
 838                             ret, fshe->fshe_fshi->fshi_hooks.arg,
 839                             fshe->fshe_instance,
 840                             vp, uiop, ioflag, cr, ct);
 841                 fshi_rele(fshe->fshe_fshi);
 842                 kmem_free(fshe, sizeof (*fshe));
 843         }
 844         list_destroy(&exec_list);
 845 
 846         return (ret);
 847 }
 848 
 849 int
 850 fsh_write(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr,
 851         caller_context_t *ct)
 852 {
 853         int ret;
 854         fsh_fsrecord_t *fsrecp;
 855         fsh_int_t *fshi;
 856         fsh_exec_t *fshe;
 857         list_t exec_list;
 858 
 859         fsh_fsrec_prepare(vp->v_vfsp);
 860         fsrecp = vp->v_vfsp->vfs_fshrecord;
 861 
 862         rw_enter(&fsrecp->fshfsr_lock, RW_READER);
 863         if (!(fsrecp->fshfsr_enabled)) {
 864                 rw_exit(&fsrecp->fshfsr_lock);
 865                 return ((*vp->v_op->vop_write)(vp, uiop, ioflag, cr, ct));
 866         }
 867 
 868         list_create(&exec_list, sizeof (fsh_exec_t),
 869             offsetof(fsh_exec_t, fshe_node));
 870 
 871         for (fshi = list_head(&fsrecp->fshfsr_list); fshi != NULL;
 872             fshi = list_next(&fsrecp->fshfsr_list, fshi)) {
 873                 if (fshi->fshi_hooks.pre_write != NULL ||
 874                     fshi->fshi_hooks.post_write != NULL) {
 875                         if (fshi_hold(fshi)) {
 876                                 fshe = kmem_alloc(sizeof (*fshe), KM_SLEEP);
 877                                 fshe->fshe_fshi = fshi;
 878                                 list_insert_tail(&exec_list, fshe);
 879                         }
 880                 }
 881         }
 882         rw_exit(&fsrecp->fshfsr_lock);
 883 
 884         /* Execute pre hooks */
 885         for (fshe = list_head(&exec_list); fshe != NULL;
 886             fshe = list_next(&exec_list, fshe)) {
 887                 if (fshe->fshe_fshi->fshi_hooks.pre_write != NULL)
 888                         (*fshe->fshe_fshi->fshi_hooks.pre_write)(
 889                             fshe->fshe_fshi->fshi_hooks.arg,
 890                             &fshe->fshe_instance,
 891                             &vp, &uiop, &ioflag, &cr, &ct);
 892         }
 893 
 894         ret = (*vp->v_op->vop_write)(vp, uiop, ioflag, cr, ct);
 895 
 896         /* Execute post hooks */
 897         while ((fshe = list_remove_tail(&exec_list)) != NULL) {
 898                 if (fshe->fshe_fshi->fshi_hooks.post_write != NULL)
 899                         ret = (*fshe->fshe_fshi->fshi_hooks.post_write)(
 900                             ret, fshe->fshe_fshi->fshi_hooks.arg,
 901                             fshe->fshe_instance,
 902                             vp, uiop, ioflag, cr, ct);
 903                 fshi_rele(fshe->fshe_fshi);
 904                 kmem_free(fshe, sizeof (*fshe));
 905         }
 906         list_destroy(&exec_list);
 907 
 908         return (ret);
 909 }
 910 
 911 int
 912 fsh_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
 913 {
 914         int ret;
 915         fsh_fsrecord_t *fsrecp;
 916         fsh_int_t *fshi;
 917         fsh_exec_t *fshe;
 918         list_t exec_list;
 919 
 920         fsh_fsrec_prepare(vfsp);
 921         fsrecp = vfsp->vfs_fshrecord;
 922 
 923         rw_enter(&fsrecp->fshfsr_lock, RW_READER);
 924         if (!(fsrecp->fshfsr_enabled)) {
 925                 rw_exit(&fsrecp->fshfsr_lock);
 926                 return ((*vfsp->vfs_op->vfs_mount)(vfsp, mvp, uap, cr));
 927         }
 928 
 929         list_create(&exec_list, sizeof (fsh_exec_t),
 930             offsetof(fsh_exec_t, fshe_node));
 931 
 932         for (fshi = list_head(&fsrecp->fshfsr_list); fshi != NULL;
 933             fshi = list_next(&fsrecp->fshfsr_list, fshi)) {
 934                 if (fshi->fshi_hooks.pre_mount != NULL ||
 935                     fshi->fshi_hooks.post_mount != NULL) {
 936                         if (fshi_hold(fshi)) {
 937                                 fshe = kmem_alloc(sizeof (*fshe), KM_SLEEP);
 938                                 fshe->fshe_fshi = fshi;
 939                                 list_insert_tail(&exec_list, fshe);
 940                         }
 941                 }
 942         }
 943         rw_exit(&fsrecp->fshfsr_lock);
 944 
 945         /* Execute pre hooks */
 946         for (fshe = list_head(&exec_list); fshe != NULL;
 947             fshe = list_next(&exec_list, fshe)) {
 948                 if (fshe->fshe_fshi->fshi_hooks.pre_mount != NULL)
 949                         (*fshe->fshe_fshi->fshi_hooks.pre_mount)(
 950                             &fshe->fshe_fshi->fshi_hooks.arg,
 951                             &fshe->fshe_instance,
 952                             &vfsp, &mvp, &uap, &cr);
 953         }
 954 
 955         ret = (*vfsp->vfs_op->vfs_mount)(vfsp, mvp, uap, cr);
 956 
 957         /* Execute post hooks */
 958         while ((fshe = list_remove_tail(&exec_list)) != NULL) {
 959                 if (fshe->fshe_fshi->fshi_hooks.post_mount != NULL)
 960                         ret = (*fshe->fshe_fshi->fshi_hooks.post_mount)(
 961                             ret, fshe->fshe_fshi->fshi_hooks.arg,
 962                             fshe->fshe_instance,
 963                             vfsp, mvp, uap, cr);
 964                 fshi_rele(fshe->fshe_fshi);
 965                 kmem_free(fshe, sizeof (*fshe));
 966         }
 967         list_destroy(&exec_list);
 968 
 969         return (ret);
 970 }
 971 
 972 int
 973 fsh_unmount(vfs_t *vfsp, int flag, cred_t *cr)
 974 {
 975         int ret;
 976         fsh_fsrecord_t *fsrecp;
 977         fsh_int_t *fshi;
 978         fsh_exec_t *fshe;
 979         list_t exec_list;
 980 
 981         fsh_fsrec_prepare(vfsp);
 982         fsrecp = vfsp->vfs_fshrecord;
 983 
 984         rw_enter(&fsrecp->fshfsr_lock, RW_READER);
 985         if (!(fsrecp->fshfsr_enabled)) {
 986                 rw_exit(&fsrecp->fshfsr_lock);
 987                 return ((*vfsp->vfs_op->vfs_unmount)(vfsp, flag, cr));
 988         }
 989 
 990         list_create(&exec_list, sizeof (fsh_exec_t),
 991             offsetof(fsh_exec_t, fshe_node));
 992 
 993         for (fshi = list_head(&fsrecp->fshfsr_list); fshi != NULL;
 994             fshi = list_next(&fsrecp->fshfsr_list, fshi)) {
 995                 if (fshi->fshi_hooks.pre_unmount != NULL ||
 996                     fshi->fshi_hooks.post_unmount != NULL) {
 997                         if (fshi_hold(fshi)) {
 998                                 fshe = kmem_alloc(sizeof (*fshe), KM_SLEEP);
 999                                 fshe->fshe_fshi = fshi;
1000                                 list_insert_tail(&exec_list, fshe);
1001                         }
1002                 }
1003         }
1004         rw_exit(&fsrecp->fshfsr_lock);
1005 
1006         /* Execute pre hooks */
1007         for (fshe = list_head(&exec_list); fshe != NULL;
1008             fshe = list_next(&exec_list, fshe)) {
1009                 if (fshe->fshe_fshi->fshi_hooks.pre_unmount != NULL)
1010                         (*fshe->fshe_fshi->fshi_hooks.pre_unmount)(
1011                             fshe->fshe_fshi->fshi_hooks.arg,
1012                             &fshe->fshe_instance,
1013                             &vfsp, &flag, &cr);
1014         }
1015 
1016         ret = (*vfsp->vfs_op->vfs_unmount)(vfsp, flag, cr);
1017 
1018         /* Execute post hooks */
1019         while ((fshe = list_remove_tail(&exec_list)) != NULL) {
1020                 if (fshe->fshe_fshi->fshi_hooks.post_unmount != NULL)
1021                         ret = (*fshe->fshe_fshi->fshi_hooks.post_unmount)(
1022                             ret, fshe->fshe_fshi->fshi_hooks.arg,
1023                             fshe->fshe_instance,
1024                             vfsp, flag, cr);
1025                 fshi_rele(fshe->fshe_fshi);
1026                 kmem_free(fshe, sizeof (*fshe));
1027         }
1028         list_destroy(&exec_list);
1029 
1030         return (ret);
1031 }
1032 
1033 /*
1034  * This is the funtion used by fsh_fsrec_prepare() to allocate a new
1035  * fsh_fsrecord. This function is called by the first function which
1036  * access the vfs_fshrecord and finds out it's NULL.
1037  */
1038 static fsh_fsrecord_t *
1039 fsh_fsrec_create()
1040 {
1041         fsh_fsrecord_t *fsrecp;
1042 
1043         fsrecp = (fsh_fsrecord_t *)kmem_zalloc(sizeof (*fsrecp), KM_SLEEP);
1044         list_create(&fsrecp->fshfsr_list, sizeof (fsh_int_t),
1045             offsetof(fsh_int_t, fshi_node));
1046         rw_init(&fsrecp->fshfsr_lock, NULL, RW_DRIVER, NULL);
1047         fsrecp->fshfsr_enabled = 1;
1048         return (fsrecp);
1049 }
1050 
1051 
1052 /*
1053  * This call must be used ONLY in vfs_free().
1054  *
1055  * It is required and sufficient to check if fsh_fsrecord_t is not NULL before
1056  * passing it to fsh_fsrec_destroy.
1057  *
1058  * All the remaining hooks are being removed here.
1059  */
1060 void
1061 fsh_fsrec_destroy(struct fsh_fsrecord *volatile fsrecp)
1062 {
1063         fsh_int_t *fshi;
1064 
1065         VERIFY(fsrecp != NULL);
1066 
1067         _NOTE(CONSTCOND)
1068         while (1) {
1069                 mutex_enter(&fsh_lock);
1070                 rw_enter(&fsrecp->fshfsr_lock, RW_WRITER);
1071                 fshi = list_remove_head(&fsrecp->fshfsr_list);
1072                 rw_exit(&fsrecp->fshfsr_lock);
1073                 if (fshi == NULL) {
1074                         mutex_exit(&fsh_lock);
1075                         break;
1076                 }
1077                 ASSERT(fshi->fshi_doomed == 0);
1078                 list_remove(&fsh_map, fshi);
1079                 mutex_exit(&fsh_lock);
1080 
1081                 if (fshi->fshi_hooks.remove_cb != NULL)
1082                         (*fshi->fshi_hooks.remove_cb)(fshi->fshi_hooks.arg,
1083                             fshi->fshi_handle);
1084 
1085                 id_free(fsh_idspace, fshi->fshi_handle);
1086                 mutex_destroy(&fshi->fshi_lock);
1087                 kmem_free(fshi, sizeof (*fshi));
1088 
1089         }
1090 
1091         list_destroy(&fsrecp->fshfsr_list);
1092         rw_destroy(&fsrecp->fshfsr_lock);
1093         kmem_free(fsrecp, sizeof (*fsrecp));
1094 }
1095 
1096 /*
1097  * fsh_init() is called in vfsinit()@vfs.c. This function MUST be called
1098  * before every other fsh call.
1099  */
1100 void
1101 fsh_init(void)
1102 {
1103         mutex_init(&fsh_cb_lock, NULL, MUTEX_DRIVER, NULL);
1104         mutex_init(&fsh_cb_owner_lock, NULL, MUTEX_DRIVER, NULL);
1105         list_create(&fsh_cblist, sizeof (fsh_callback_int_t),
1106             offsetof(fsh_callback_int_t, fshci_node));
1107 
1108         mutex_init(&fsh_lock, NULL, MUTEX_DRIVER, NULL);
1109 
1110         list_create(&fsh_map, sizeof (fsh_int_t), offsetof(fsh_int_t,
1111             fshi_global));
1112 
1113         /* See comment above fsh_fsrec_prepare() */
1114         fsh_res_ptr = (void *)-1;
1115 
1116         fsh_idspace = id_space_create("fsh", 0, fsh_limit);
1117 }