5981 Deadlock in dmu_objset_find_dp
--- old/usr/src/uts/common/fs/zfs/rrwlock.c
+++ new/usr/src/uts/common/fs/zfs/rrwlock.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 */
25 25 /*
26 26 * Copyright (c) 2012 by Delphix. All rights reserved.
27 27 */
28 28
29 29 #include <sys/refcount.h>
30 30 #include <sys/rrwlock.h>
31 31
32 32 /*
33 33 * This file contains the implementation of a re-entrant-read
34 34 * reader/writer lock (aka "rrwlock").
35 35 *
36 36 * This is a normal reader/writer lock with the additional feature
37 37 * of allowing threads that have already obtained a read lock to
38 38 * acquire another read lock (a re-entrant read) - even if there are
39 39 * waiting writers.
40 40 *
41 41 * Callers who have not obtained a read lock give waiting writers priority.
42 42 *
43 43 * The rrwlock_t lock does not allow re-entrant writers, nor does it
44 44 * allow a re-entrant mix of reads and writes (that is, it does not
45 45 * allow a caller who has already obtained a read lock to be able to
46 46 * then grab a write lock without first dropping all read locks, and
47 47 * vice versa).
48 48 *
49 49 * The rrwlock_t uses tsd (thread specific data) to keep a list of
50 50 * nodes (rrw_node_t), where each node keeps track of which specific
51 51 * lock (rrw_node_t::rn_rrl) the thread has grabbed. Since re-entering
52 52 * should be rare, a thread that grabs multiple reads on the same rrwlock_t
53 53 * will store multiple rrw_node_ts with the same 'rn_rrl'. Nodes on the
54 54 * tsd list can each represent a different rrwlock_t. This allows a thread
55 55 * to hold read locks on multiple distinct rrwlock_ts at the same time.
56 56 *
57 57 * Since using tsd incurs some overhead, the rrwlock_t only needs to
58 58 * keep tsd data when writers are waiting. If no writers are waiting, then
59 59 * a reader just bumps the anonymous read count (rr_anon_rcount) - no tsd
60 60 * is needed. Once a writer attempts to grab the lock, readers then
61 61 * keep tsd data and bump the linked readers count (rr_linked_rcount).
62 62 *
63 63 * If there are waiting writers and there are anonymous readers, then a
64 64 * reader doesn't know whether its enter is re-entrant. But since it may be,
65 65 * we allow the read to proceed (otherwise it could deadlock). Since readers
66 66 * no longer bump the anonymous count once waiting writers are active, the
67 67 * anonymous readers will eventually flush themselves out. At that point,
68 68 * readers can tell whether their enter is re-entrant (they have a
69 69 * rrw_node_t entry for the lock) or not. If it is re-entrant, then we must
70 70 * let them proceed. If it is not, then the reader blocks behind the
71 71 * waiting writers. Hence, we do not starve writers.
72 72 */
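
As a reading aid (not part of this change), here is a minimal usage sketch of the rules described above; FTAG is the usual ZFS tag convention, and the lock is assumed to be embedded in some containing structure:

    rrwlock_t rrl;

    rrw_init(&rrl, B_FALSE);

    /* No writer is waiting yet, so this enter only bumps rr_anon_rcount. */
    rrw_enter_read(&rrl, FTAG);

    /*
     * Re-entrant enter by the same thread.  If a writer is waiting by now,
     * this enter is recorded in a TSD rrw_node_t and counted in
     * rr_linked_rcount rather than blocking behind the writer.
     */
    rrw_enter_read(&rrl, FTAG);

    rrw_exit(&rrl, FTAG);
    rrw_exit(&rrl, FTAG);
    rrw_destroy(&rrl);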
73 73
74 74 /* global key for TSD */
75 75 uint_t rrw_tsd_key;
76 76
77 77 typedef struct rrw_node {
78 78 struct rrw_node *rn_next;
79 79 rrwlock_t *rn_rrl;
80 80 void *rn_tag;
81 81 } rrw_node_t;
82 82
83 83 static rrw_node_t *
84 84 rrn_find(rrwlock_t *rrl)
85 85 {
86 86 rrw_node_t *rn;
87 87
88 88 if (refcount_count(&rrl->rr_linked_rcount) == 0)
89 89 return (NULL);
90 90
91 91 for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
92 92 if (rn->rn_rrl == rrl)
93 93 return (rn);
94 94 }
95 95 return (NULL);
96 96 }
97 97
98 98 /*
99 99 * Add a node to the head of the singly linked list.
100 100 */
101 101 static void
102 102 rrn_add(rrwlock_t *rrl, void *tag)
103 103 {
104 104 rrw_node_t *rn;
105 105
106 106 rn = kmem_alloc(sizeof (*rn), KM_SLEEP);
107 107 rn->rn_rrl = rrl;
108 108 rn->rn_next = tsd_get(rrw_tsd_key);
109 109 rn->rn_tag = tag;
110 110 VERIFY(tsd_set(rrw_tsd_key, rn) == 0);
111 111 }
112 112
113 113 /*
114 114 * If a node is found for 'rrl', then remove the node from this
115 115 * thread's list and return TRUE; otherwise return FALSE.
116 116 */
117 117 static boolean_t
118 118 rrn_find_and_remove(rrwlock_t *rrl, void *tag)
119 119 {
120 120 rrw_node_t *rn;
121 121 rrw_node_t *prev = NULL;
122 122
123 123 if (refcount_count(&rrl->rr_linked_rcount) == 0)
124 124 return (B_FALSE);
125 125
126 126 for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
127 127 if (rn->rn_rrl == rrl && rn->rn_tag == tag) {
128 128 if (prev)
129 129 prev->rn_next = rn->rn_next;
130 130 else
131 131 VERIFY(tsd_set(rrw_tsd_key, rn->rn_next) == 0);
132 132 kmem_free(rn, sizeof (*rn));
133 133 return (B_TRUE);
134 134 }
135 135 prev = rn;
136 136 }
137 137 return (B_FALSE);
138 138 }
139 139
140 140 void
141 141 rrw_init(rrwlock_t *rrl, boolean_t track_all)
142 142 {
143 143 mutex_init(&rrl->rr_lock, NULL, MUTEX_DEFAULT, NULL);
144 144 cv_init(&rrl->rr_cv, NULL, CV_DEFAULT, NULL);
145 145 rrl->rr_writer = NULL;
146 146 refcount_create(&rrl->rr_anon_rcount);
147 147 refcount_create(&rrl->rr_linked_rcount);
148 148 rrl->rr_writer_wanted = B_FALSE;
149 149 rrl->rr_track_all = track_all;
150 150 }
151 151
151 151
152 152 void
153 153 rrw_destroy(rrwlock_t *rrl)
154 154 {
155 155 mutex_destroy(&rrl->rr_lock);
156 156 cv_destroy(&rrl->rr_cv);
157 157 ASSERT(rrl->rr_writer == NULL);
158 158 refcount_destroy(&rrl->rr_anon_rcount);
159 159 refcount_destroy(&rrl->rr_linked_rcount);
160 160 }
161 161
162 -void
163 -rrw_enter_read(rrwlock_t *rrl, void *tag)
162 +static void
163 +rrw_enter_read_impl(rrwlock_t *rrl, boolean_t prio, void *tag)
164 164 {
165 165 mutex_enter(&rrl->rr_lock);
166 166 #if !defined(DEBUG) && defined(_KERNEL)
167 167 if (rrl->rr_writer == NULL && !rrl->rr_writer_wanted &&
168 168 !rrl->rr_track_all) {
169 169 rrl->rr_anon_rcount.rc_count++;
170 170 mutex_exit(&rrl->rr_lock);
171 171 return;
172 172 }
173 173 DTRACE_PROBE(zfs__rrwfastpath__rdmiss);
174 174 #endif
175 175 ASSERT(rrl->rr_writer != curthread);
176 176 ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0);
177 177
178 178 while (rrl->rr_writer != NULL || (rrl->rr_writer_wanted &&
179 - refcount_is_zero(&rrl->rr_anon_rcount) &&
179 + refcount_is_zero(&rrl->rr_anon_rcount) && !prio &&
180 180 rrn_find(rrl) == NULL))
181 181 cv_wait(&rrl->rr_cv, &rrl->rr_lock);
182 182
183 183 if (rrl->rr_writer_wanted || rrl->rr_track_all) {
184 184 /* may or may not be a re-entrant enter */
185 185 rrn_add(rrl, tag);
186 186 (void) refcount_add(&rrl->rr_linked_rcount, tag);
187 187 } else {
188 188 (void) refcount_add(&rrl->rr_anon_rcount, tag);
189 189 }
190 190 ASSERT(rrl->rr_writer == NULL);
191 191 mutex_exit(&rrl->rr_lock);
192 192 }
193 193
194 194 void
195 +rrw_enter_read(rrwlock_t *rrl, void *tag)
196 +{
197 + rrw_enter_read_impl(rrl, B_FALSE, tag);
198 +}
199 +
200 +/*
201 + * Take a read lock even if there are pending write lock requests. If we want
202 + * to take the lock re-entrantly, but from different threads (that have a
203 + * relationship to each other), the normal re-entrancy detection that would
204 + * overrule the pending writer does not work, so we give an explicit hint here.
205 + */
206 +void
207 +rrw_enter_read_prio(rrwlock_t *rrl, void *tag)
208 +{
209 + rrw_enter_read_impl(rrl, B_TRUE, tag);
210 +}
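
To illustrate the comment above, a hedged sketch of the intended use, assuming a dispatching thread that already holds the read lock and a helper thread working on its behalf (dispatch_to_helper() and helper_func() are illustrative names, not part of this change):

    /* Thread A: takes the read lock, then hands work to thread B. */
    rrw_enter_read(&rrl, FTAG);
    dispatch_to_helper(helper_func, &rrl);	/* hypothetical dispatch */
    /* ... thread A waits for B to finish before it calls rrw_exit() ... */

    /*
     * Thread B (helper_func): it has no rrw_node_t for rrl in its own TSD,
     * so rrn_find() cannot see that this read is logically re-entrant.
     * With a writer waiting, rrw_enter_read() would block here, while
     * thread A never drops its read lock because it waits for B: deadlock.
     * The prio variant overrules the pending writer instead.
     */
    rrw_enter_read_prio(&rrl, FTAG);
    /* ... do the work ... */
    rrw_exit(&rrl, FTAG);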
211 +
212 +
213 +void
195 214 rrw_enter_write(rrwlock_t *rrl)
196 215 {
197 216 mutex_enter(&rrl->rr_lock);
198 217 ASSERT(rrl->rr_writer != curthread);
199 218
200 219 while (refcount_count(&rrl->rr_anon_rcount) > 0 ||
201 220 refcount_count(&rrl->rr_linked_rcount) > 0 ||
202 221 rrl->rr_writer != NULL) {
203 222 rrl->rr_writer_wanted = B_TRUE;
204 223 cv_wait(&rrl->rr_cv, &rrl->rr_lock);
205 224 }
206 225 rrl->rr_writer_wanted = B_FALSE;
207 226 rrl->rr_writer = curthread;
208 227 mutex_exit(&rrl->rr_lock);
209 228 }
210 229
211 230 void
212 231 rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag)
213 232 {
214 233 if (rw == RW_READER)
215 234 rrw_enter_read(rrl, tag);
216 235 else
217 236 rrw_enter_write(rrl);
218 237 }
219 238
220 239 void
221 240 rrw_exit(rrwlock_t *rrl, void *tag)
222 241 {
223 242 mutex_enter(&rrl->rr_lock);
224 243 #if !defined(DEBUG) && defined(_KERNEL)
225 244 if (!rrl->rr_writer && rrl->rr_linked_rcount.rc_count == 0) {
226 245 rrl->rr_anon_rcount.rc_count--;
227 246 if (rrl->rr_anon_rcount.rc_count == 0)
228 247 cv_broadcast(&rrl->rr_cv);
229 248 mutex_exit(&rrl->rr_lock);
230 249 return;
231 250 }
232 251 DTRACE_PROBE(zfs__rrwfastpath__exitmiss);
233 252 #endif
234 253 ASSERT(!refcount_is_zero(&rrl->rr_anon_rcount) ||
235 254 !refcount_is_zero(&rrl->rr_linked_rcount) ||
236 255 rrl->rr_writer != NULL);
237 256
238 257 if (rrl->rr_writer == NULL) {
239 258 int64_t count;
240 259 if (rrn_find_and_remove(rrl, tag)) {
241 260 count = refcount_remove(&rrl->rr_linked_rcount, tag);
242 261 } else {
243 262 ASSERT(!rrl->rr_track_all);
244 263 count = refcount_remove(&rrl->rr_anon_rcount, tag);
245 264 }
246 265 if (count == 0)
247 266 cv_broadcast(&rrl->rr_cv);
248 267 } else {
249 268 ASSERT(rrl->rr_writer == curthread);
250 269 ASSERT(refcount_is_zero(&rrl->rr_anon_rcount) &&
251 270 refcount_is_zero(&rrl->rr_linked_rcount));
252 271 rrl->rr_writer = NULL;
253 272 cv_broadcast(&rrl->rr_cv);
254 273 }
255 274 mutex_exit(&rrl->rr_lock);
256 275 }
257 276
258 277 /*
259 278 * If the lock was created with track_all, rrw_held(RW_READER) will return
260 279 * B_TRUE iff the current thread has the lock for reader. Otherwise it may
261 280 * return B_TRUE if any thread has the lock for reader.
262 281 */
263 282 boolean_t
264 283 rrw_held(rrwlock_t *rrl, krw_t rw)
265 284 {
266 285 boolean_t held;
267 286
268 287 mutex_enter(&rrl->rr_lock);
269 288 if (rw == RW_WRITER) {
270 289 held = (rrl->rr_writer == curthread);
271 290 } else {
272 291 held = (!refcount_is_zero(&rrl->rr_anon_rcount) ||
273 292 rrn_find(rrl) != NULL);
274 293 }
275 294 mutex_exit(&rrl->rr_lock);
276 295
277 296 return (held);
278 297 }
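
A small sketch of the distinction documented above, assuming a lock created with track_all (tag usage is illustrative):

    rrw_init(&rrl, B_TRUE);		/* track_all: every reader keeps TSD */

    rrw_enter_read(&rrl, FTAG);
    ASSERT(rrw_held(&rrl, RW_READER));	/* true for this thread specifically */
    rrw_exit(&rrl, FTAG);

    /*
     * Without track_all, rrw_held(RW_READER) may also return B_TRUE when
     * some other thread holds an anonymous read lock.
     */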
279 298
280 299 void
281 300 rrw_tsd_destroy(void *arg)
282 301 {
283 302 rrw_node_t *rn = arg;
284 303 if (rn != NULL) {
285 304 panic("thread %p terminating with rrw lock %p held",
286 305 (void *)curthread, (void *)rn->rn_rrl);
287 306 }
288 307 }
289 308
290 309 /*
291 310 * A reader-mostly lock implementation, tuning the above reader/writer locks
292 311 * for highly parallel read acquisitions, while pessimizing writes.
293 312 *
294 313 * The idea is to split a single busy lock into an array of locks, so that
295 314 * each reader can lock only one of them for read, depending on the result
296 315 * of a simple hash function. That proportionally reduces lock congestion.
297 316 * A writer, in turn, has to sequentially acquire a write lock on all of them.
298 317 * That makes write acquisition proportionally slower, but in places where
299 318 * it is used (filesystem unmount), performance is not critical.
300 319 *
301 320 * All the functions below are direct wrappers around functions above.
302 321 */
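
As a usage note (hedged, not part of this change), an rrmlock_t behaves like a single rrwlock_t from the caller's point of view:

    rrmlock_t rrm;

    rrm_init(&rrm, B_FALSE);

    /*
     * Reader: locks only locks[RRM_TD_LOCK()], so readers on different
     * threads usually take different locks and rarely contend.  The exit
     * must come from the same thread, because the index is derived from
     * curthread.
     */
    rrm_enter(&rrm, RW_READER, FTAG);
    rrm_exit(&rrm, FTAG);

    /* Writer: takes all RRM_NUM_LOCKS locks, excluding every reader. */
    rrm_enter(&rrm, RW_WRITER, FTAG);
    rrm_exit(&rrm, FTAG);

    rrm_destroy(&rrm);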
303 322 void
304 323 rrm_init(rrmlock_t *rrl, boolean_t track_all)
305 324 {
306 325 int i;
307 326
308 327 for (i = 0; i < RRM_NUM_LOCKS; i++)
309 328 rrw_init(&rrl->locks[i], track_all);
310 329 }
311 330
312 331 void
313 332 rrm_destroy(rrmlock_t *rrl)
314 333 {
315 334 int i;
316 335
317 336 for (i = 0; i < RRM_NUM_LOCKS; i++)
318 337 rrw_destroy(&rrl->locks[i]);
319 338 }
320 339
321 340 void
322 341 rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag)
323 342 {
324 343 if (rw == RW_READER)
325 344 rrm_enter_read(rrl, tag);
326 345 else
327 346 rrm_enter_write(rrl);
328 347 }
329 348
330 349 /*
331 350 * This maps the current thread to a specific lock. Note that the lock
332 351 * must be released by the same thread that acquired it. We do this
333 352 * mapping by taking the thread pointer mod a prime number. We examine
334 353 * only the low 32 bits of the thread pointer, because 32-bit division
335 354 * is faster than 64-bit division, and the high 32 bits have little
336 355 * entropy anyway.
337 356 */
338 357 #define RRM_TD_LOCK() (((uint32_t)(uintptr_t)(curthread)) % RRM_NUM_LOCKS)
339 358
340 359 void
341 360 rrm_enter_read(rrmlock_t *rrl, void *tag)
342 361 {
343 362 rrw_enter_read(&rrl->locks[RRM_TD_LOCK()], tag);
344 363 }
345 364
346 365 void
347 366 rrm_enter_write(rrmlock_t *rrl)
348 367 {
349 368 int i;
350 369
351 370 for (i = 0; i < RRM_NUM_LOCKS; i++)
352 371 rrw_enter_write(&rrl->locks[i]);
353 372 }
354 373
355 374 void
356 375 rrm_exit(rrmlock_t *rrl, void *tag)
357 376 {
358 377 int i;
359 378
360 379 if (rrl->locks[0].rr_writer == curthread) {
361 380 for (i = 0; i < RRM_NUM_LOCKS; i++)
362 381 rrw_exit(&rrl->locks[i], tag);
363 382 } else {
364 383 rrw_exit(&rrl->locks[RRM_TD_LOCK()], tag);
365 384 }
366 385 }
367 386
368 387 boolean_t
369 388 rrm_held(rrmlock_t *rrl, krw_t rw)
370 389 {
371 390 if (rw == RW_WRITER) {
372 391 return (rrw_held(&rrl->locks[0], rw));
373 392 } else {
374 393 return (rrw_held(&rrl->locks[RRM_TD_LOCK()], rw));
375 394 }
376 395 }