5981 Deadlock in dmu_objset_find_dp
--- old/usr/src/uts/common/fs/zfs/rrwlock.c
+++ new/usr/src/uts/common/fs/zfs/rrwlock.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 */
25 25 /*
26 26 * Copyright (c) 2012 by Delphix. All rights reserved.
27 27 */
28 28
29 29 #include <sys/refcount.h>
30 30 #include <sys/rrwlock.h>
31 31
32 32 /*
33 33 * This file contains the implementation of a re-entrant-read
34 34 * reader/writer lock (aka "rrwlock").
35 35 *
36 36 * This is a normal reader/writer lock with the additional feature
37 37 * of allowing threads that have already obtained a read lock to
38 38 * acquire another read lock (a re-entrant read) - even if there are
39 39 * waiting writers.
40 40 *
41 41 * Callers who have not obtained a read lock give waiting writers priority.
42 42 *
43 43 * The rrwlock_t lock does not allow re-entrant writers, nor does it
44 44 * allow a re-entrant mix of reads and writes (that is, it does not
45 45 * allow a caller who has already obtained a read lock to be able to
46 46 * then grab a write lock without first dropping all read locks, and
47 47 * vice versa).
48 48 *
49 49 * The rrwlock_t uses tsd (thread specific data) to keep a list of
50 50 * nodes (rrw_node_t), where each node keeps track of which specific
51 51 * lock (rrw_node_t::rn_rrl) the thread has grabbed. Since re-entering
52 52 * should be rare, a thread that grabs multiple reads on the same rrwlock_t
53 53 * will store multiple rrw_node_ts with the same 'rn_rrl'. Nodes on the
54 54 * tsd list can each represent a different rrwlock_t. This allows a thread
55 55 * to hold read locks on multiple distinct rrwlock_ts at the same time.
56 56 *
57 57 * Since using tsd incurs some overhead, the rrwlock_t only needs to
58 58 * keep tsd data when writers are waiting. If no writers are waiting, then
59 59 * a reader just bumps the anonymous read count (rr_anon_rcount) - no tsd
60 60 * is needed. Once a writer attempts to grab the lock, readers then
61 61 * keep tsd data and bump the linked readers count (rr_linked_rcount).
62 62 *
63 63 * If there are waiting writers and there are anonymous readers, then a
64 64 * reader doesn't know whether its enter is re-entrant. But since it may be,
65 65 * we allow the read to proceed (otherwise it could deadlock). Since readers
66 66 * no longer bump the anonymous count once waiting writers are active, the
67 67 * anonymous readers will eventually flush themselves out. At that point,
68 68 * readers can tell whether their enter is re-entrant (they have a
69 69 * rrw_node_t entry for the lock) or not. If it is re-entrant, then we must
70 70 * let them proceed. If it is not, then the reader blocks behind the
71 71 * waiting writers. Hence, we do not starve writers.
72 72 */
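
As a reading aid (not part of this change), here is a minimal usage sketch of the rules described above; FTAG is the usual ZFS tag convention, and the lock is assumed to be embedded in some containing structure:

    rrwlock_t rrl;

    rrw_init(&rrl, B_FALSE);

    /* No writer is waiting yet, so this enter only bumps rr_anon_rcount. */
    rrw_enter_read(&rrl, FTAG);

    /*
     * Re-entrant enter by the same thread.  If a writer is waiting by now,
     * this enter is recorded in a TSD rrw_node_t and counted in
     * rr_linked_rcount rather than blocking behind the writer.
     */
    rrw_enter_read(&rrl, FTAG);

    rrw_exit(&rrl, FTAG);
    rrw_exit(&rrl, FTAG);
    rrw_destroy(&rrl);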
73 73
74 74 /* global key for TSD */
75 75 uint_t rrw_tsd_key;
76 76
77 77 typedef struct rrw_node {
78 78 struct rrw_node *rn_next;
79 79 rrwlock_t *rn_rrl;
80 80 void *rn_tag;
81 81 } rrw_node_t;
82 82
83 83 static rrw_node_t *
84 84 rrn_find(rrwlock_t *rrl)
85 85 {
86 86 rrw_node_t *rn;
87 87
88 88 if (refcount_count(&rrl->rr_linked_rcount) == 0)
89 89 return (NULL);
90 90
91 91 for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
92 92 if (rn->rn_rrl == rrl)
93 93 return (rn);
94 94 }
95 95 return (NULL);
96 96 }
97 97
98 98 /*
99 99 * Add a node to the head of the singly linked list.
100 100 */
101 101 static void
102 102 rrn_add(rrwlock_t *rrl, void *tag)
103 103 {
104 104 rrw_node_t *rn;
105 105
106 106 rn = kmem_alloc(sizeof (*rn), KM_SLEEP);
107 107 rn->rn_rrl = rrl;
108 108 rn->rn_next = tsd_get(rrw_tsd_key);
109 109 rn->rn_tag = tag;
110 110 VERIFY(tsd_set(rrw_tsd_key, rn) == 0);
111 111 }
112 112
113 113 /*
114 114 * If a node is found for 'rrl', then remove the node from this
115 115 * thread's list and return TRUE; otherwise return FALSE.
116 116 */
117 117 static boolean_t
118 118 rrn_find_and_remove(rrwlock_t *rrl, void *tag)
119 119 {
120 120 rrw_node_t *rn;
121 121 rrw_node_t *prev = NULL;
122 122
123 123 if (refcount_count(&rrl->rr_linked_rcount) == 0)
124 124 return (B_FALSE);
125 125
126 126 for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
127 127 if (rn->rn_rrl == rrl && rn->rn_tag == tag) {
128 128 if (prev)
129 129 prev->rn_next = rn->rn_next;
130 130 else
131 131 VERIFY(tsd_set(rrw_tsd_key, rn->rn_next) == 0);
132 132 kmem_free(rn, sizeof (*rn));
133 133 return (B_TRUE);
134 134 }
135 135 prev = rn;
136 136 }
137 137 return (B_FALSE);
138 138 }
139 139
140 140 void
141 141 rrw_init(rrwlock_t *rrl, boolean_t track_all)
142 142 {
143 143 mutex_init(&rrl->rr_lock, NULL, MUTEX_DEFAULT, NULL);
144 144 cv_init(&rrl->rr_cv, NULL, CV_DEFAULT, NULL);
145 145 rrl->rr_writer = NULL;
146 146 refcount_create(&rrl->rr_anon_rcount);
147 147 refcount_create(&rrl->rr_linked_rcount);
148 148 rrl->rr_writer_wanted = B_FALSE;
149 149 rrl->rr_track_all = track_all;
150 150 }
151 151
151 151
152 152 void
153 153 rrw_destroy(rrwlock_t *rrl)
154 154 {
155 155 mutex_destroy(&rrl->rr_lock);
156 156 cv_destroy(&rrl->rr_cv);
157 157 ASSERT(rrl->rr_writer == NULL);
158 158 refcount_destroy(&rrl->rr_anon_rcount);
159 159 refcount_destroy(&rrl->rr_linked_rcount);
160 160 }
161 161
162 -void
163 -rrw_enter_read(rrwlock_t *rrl, void *tag)
162 +static void
163 +rrw_enter_read_impl(rrwlock_t *rrl, boolean_t prio, void *tag)
164 164 {
165 165 mutex_enter(&rrl->rr_lock);
166 166 #if !defined(DEBUG) && defined(_KERNEL)
167 167 if (rrl->rr_writer == NULL && !rrl->rr_writer_wanted &&
168 168 !rrl->rr_track_all) {
169 169 rrl->rr_anon_rcount.rc_count++;
170 170 mutex_exit(&rrl->rr_lock);
171 171 return;
172 172 }
173 173 DTRACE_PROBE(zfs__rrwfastpath__rdmiss);
174 174 #endif
175 175 ASSERT(rrl->rr_writer != curthread);
176 176 ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0);
177 177
178 178 while (rrl->rr_writer != NULL || (rrl->rr_writer_wanted &&
179 - refcount_is_zero(&rrl->rr_anon_rcount) &&
179 + refcount_is_zero(&rrl->rr_anon_rcount) && !prio &&
180 180 rrn_find(rrl) == NULL))
181 181 cv_wait(&rrl->rr_cv, &rrl->rr_lock);
182 182
183 183 if (rrl->rr_writer_wanted || rrl->rr_track_all) {
184 184 /* may or may not be a re-entrant enter */
185 185 rrn_add(rrl, tag);
186 186 (void) refcount_add(&rrl->rr_linked_rcount, tag);
187 187 } else {
188 188 (void) refcount_add(&rrl->rr_anon_rcount, tag);
189 189 }
190 190 ASSERT(rrl->rr_writer == NULL);
191 191 mutex_exit(&rrl->rr_lock);
192 192 }
193 193
194 194 void
195 +rrw_enter_read(rrwlock_t *rrl, void *tag)
196 +{
197 + rrw_enter_read_impl(rrl, B_FALSE, tag);
198 +}
199 +
200 +/*
201 + * Take a read lock even if there are pending write lock requests. If we want
202 + * to take the lock re-entrantly, but from different threads (that have a
203 + * relationship to each other), the normal re-entrancy detection that would
204 + * overrule the pending writer does not work, so we give an explicit hint here.
205 + */
206 +void
207 +rrw_enter_read_prio(rrwlock_t *rrl, void *tag)
208 +{
209 + rrw_enter_read_impl(rrl, B_TRUE, tag);
210 +}
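
To illustrate the comment above, a hedged sketch of the intended use, assuming a dispatching thread that already holds the read lock and a helper thread working on its behalf (dispatch_to_helper() and helper_func() are illustrative names, not part of this change):

    /* Thread A: takes the read lock, then hands work to thread B. */
    rrw_enter_read(&rrl, FTAG);
    dispatch_to_helper(helper_func, &rrl);	/* hypothetical dispatch */
    /* ... thread A waits for B to finish before it calls rrw_exit() ... */

    /*
     * Thread B (helper_func): it has no rrw_node_t for rrl in its own TSD,
     * so rrn_find() cannot see that this read is logically re-entrant.
     * With a writer waiting, rrw_enter_read() would block here, while
     * thread A never drops its read lock because it waits for B: deadlock.
     * The prio variant overrules the pending writer instead.
     */
    rrw_enter_read_prio(&rrl, FTAG);
    /* ... do the work ... */
    rrw_exit(&rrl, FTAG);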
211 +
212 +
213 +void
195 214 rrw_enter_write(rrwlock_t *rrl)
196 215 {
197 216 mutex_enter(&rrl->rr_lock);
198 217 ASSERT(rrl->rr_writer != curthread);
199 218
200 219 while (refcount_count(&rrl->rr_anon_rcount) > 0 ||
201 220 refcount_count(&rrl->rr_linked_rcount) > 0 ||
202 221 rrl->rr_writer != NULL) {
203 222 rrl->rr_writer_wanted = B_TRUE;
204 223 cv_wait(&rrl->rr_cv, &rrl->rr_lock);
205 224 }
206 225 rrl->rr_writer_wanted = B_FALSE;
207 226 rrl->rr_writer = curthread;
208 227 mutex_exit(&rrl->rr_lock);
209 228 }
210 229
211 230 void
212 231 rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag)
213 232 {
214 233 if (rw == RW_READER)
215 234 rrw_enter_read(rrl, tag);
216 235 else
217 236 rrw_enter_write(rrl);
218 237 }
219 238
220 239 void
221 240 rrw_exit(rrwlock_t *rrl, void *tag)
222 241 {
223 242 mutex_enter(&rrl->rr_lock);
224 243 #if !defined(DEBUG) && defined(_KERNEL)
225 244 if (!rrl->rr_writer && rrl->rr_linked_rcount.rc_count == 0) {
226 245 rrl->rr_anon_rcount.rc_count--;
227 246 if (rrl->rr_anon_rcount.rc_count == 0)
228 247 cv_broadcast(&rrl->rr_cv);
229 248 mutex_exit(&rrl->rr_lock);
230 249 return;
231 250 }
232 251 DTRACE_PROBE(zfs__rrwfastpath__exitmiss);
233 252 #endif
234 253 ASSERT(!refcount_is_zero(&rrl->rr_anon_rcount) ||
235 254 !refcount_is_zero(&rrl->rr_linked_rcount) ||
236 255 rrl->rr_writer != NULL);
237 256
238 257 if (rrl->rr_writer == NULL) {
239 258 int64_t count;
240 259 if (rrn_find_and_remove(rrl, tag)) {
241 260 count = refcount_remove(&rrl->rr_linked_rcount, tag);
242 261 } else {
243 262 ASSERT(!rrl->rr_track_all);
244 263 count = refcount_remove(&rrl->rr_anon_rcount, tag);
245 264 }
246 265 if (count == 0)
247 266 cv_broadcast(&rrl->rr_cv);
248 267 } else {
249 268 ASSERT(rrl->rr_writer == curthread);
250 269 ASSERT(refcount_is_zero(&rrl->rr_anon_rcount) &&
251 270 refcount_is_zero(&rrl->rr_linked_rcount));
252 271 rrl->rr_writer = NULL;
253 272 cv_broadcast(&rrl->rr_cv);
254 273 }
255 274 mutex_exit(&rrl->rr_lock);
256 275 }
257 276
258 277 /*
259 278 * If the lock was created with track_all, rrw_held(RW_READER) will return
260 279 * B_TRUE iff the current thread has the lock for reader. Otherwise it may
261 280 * return B_TRUE if any thread has the lock for reader.
262 281 */
263 282 boolean_t
264 283 rrw_held(rrwlock_t *rrl, krw_t rw)
265 284 {
266 285 boolean_t held;
267 286
268 287 mutex_enter(&rrl->rr_lock);
269 288 if (rw == RW_WRITER) {
270 289 held = (rrl->rr_writer == curthread);
271 290 } else {
272 291 held = (!refcount_is_zero(&rrl->rr_anon_rcount) ||
273 292 rrn_find(rrl) != NULL);
274 293 }
275 294 mutex_exit(&rrl->rr_lock);
276 295
277 296 return (held);
278 297 }
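
A small sketch of the distinction documented above, assuming a lock created with track_all (tag usage is illustrative):

    rrw_init(&rrl, B_TRUE);		/* track_all: every reader keeps TSD */

    rrw_enter_read(&rrl, FTAG);
    ASSERT(rrw_held(&rrl, RW_READER));	/* true for this thread specifically */
    rrw_exit(&rrl, FTAG);

    /*
     * Without track_all, rrw_held(RW_READER) may also return B_TRUE when
     * some other thread holds an anonymous read lock.
     */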
279 298
280 299 void
281 300 rrw_tsd_destroy(void *arg)
282 301 {
283 302 rrw_node_t *rn = arg;
284 303 if (rn != NULL) {
285 304 panic("thread %p terminating with rrw lock %p held",
286 305 (void *)curthread, (void *)rn->rn_rrl);
287 306 }
288 307 }
289 308
290 309 /*
291 310 * A reader-mostly lock implementation, tuning the above reader/writer locks
292 311 * for highly parallel read acquisitions, while pessimizing writes.
293 312 *
294 313 * The idea is to split a single busy lock into an array of locks, so that
295 314 * each reader can lock only one of them for read, depending on the result
296 315 * of a simple hash function. That proportionally reduces lock congestion.
297 316 * A writer, in turn, has to sequentially acquire a write lock on all of them.
298 317 * That makes write acquisition proportionally slower, but in places where
299 318 * it is used (filesystem unmount), performance is not critical.
300 319 *
301 320 * All the functions below are direct wrappers around functions above.
302 321 */
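
As a usage note (hedged, not part of this change), an rrmlock_t behaves like a single rrwlock_t from the caller's point of view:

    rrmlock_t rrm;

    rrm_init(&rrm, B_FALSE);

    /*
     * Reader: locks only locks[RRM_TD_LOCK()], so readers on different
     * threads usually take different locks and rarely contend.  The exit
     * must come from the same thread, because the index is derived from
     * curthread.
     */
    rrm_enter(&rrm, RW_READER, FTAG);
    rrm_exit(&rrm, FTAG);

    /* Writer: takes all RRM_NUM_LOCKS locks, excluding every reader. */
    rrm_enter(&rrm, RW_WRITER, FTAG);
    rrm_exit(&rrm, FTAG);

    rrm_destroy(&rrm);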
303 322 void
304 323 rrm_init(rrmlock_t *rrl, boolean_t track_all)
305 324 {
306 325 int i;
307 326
308 327 for (i = 0; i < RRM_NUM_LOCKS; i++)
309 328 rrw_init(&rrl->locks[i], track_all);
310 329 }
311 330
312 331 void
313 332 rrm_destroy(rrmlock_t *rrl)
314 333 {
315 334 int i;
316 335
317 336 for (i = 0; i < RRM_NUM_LOCKS; i++)
318 337 rrw_destroy(&rrl->locks[i]);
319 338 }
320 339
321 340 void
322 341 rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag)
323 342 {
324 343 if (rw == RW_READER)
325 344 rrm_enter_read(rrl, tag);
326 345 else
327 346 rrm_enter_write(rrl);
328 347 }
329 348
330 349 /*
331 350 * This maps the current thread to a specific lock. Note that the lock
332 351 * must be released by the same thread that acquired it. We do this
333 352 * mapping by taking the thread pointer mod a prime number. We examine
334 353 * only the low 32 bits of the thread pointer, because 32-bit division
335 354 * is faster than 64-bit division, and the high 32 bits have little
336 355 * entropy anyway.
337 356 */
338 357 #define RRM_TD_LOCK() (((uint32_t)(uintptr_t)(curthread)) % RRM_NUM_LOCKS)
339 358
340 359 void
341 360 rrm_enter_read(rrmlock_t *rrl, void *tag)
342 361 {
343 362 rrw_enter_read(&rrl->locks[RRM_TD_LOCK()], tag);
344 363 }
345 364
346 365 void
347 366 rrm_enter_write(rrmlock_t *rrl)
348 367 {
349 368 int i;
350 369
351 370 for (i = 0; i < RRM_NUM_LOCKS; i++)
352 371 rrw_enter_write(&rrl->locks[i]);
353 372 }
354 373
355 374 void
356 375 rrm_exit(rrmlock_t *rrl, void *tag)
357 376 {
358 377 int i;
359 378
360 379 if (rrl->locks[0].rr_writer == curthread) {
361 380 for (i = 0; i < RRM_NUM_LOCKS; i++)
362 381 rrw_exit(&rrl->locks[i], tag);
363 382 } else {
364 383 rrw_exit(&rrl->locks[RRM_TD_LOCK()], tag);
365 384 }
366 385 }
367 386
368 387 boolean_t
369 388 rrm_held(rrmlock_t *rrl, krw_t rw)
370 389 {
371 390 if (rw == RW_WRITER) {
372 391 return (rrw_held(&rrl->locks[0], rw));
373 392 } else {
374 393 return (rrw_held(&rrl->locks[RRM_TD_LOCK()], rw));
375 394 }
376 395 }