4168 ztest assertion failure in dbuf_undirty
4169 verbatim import causes zdb to segfault
4170 zhack leaves pool in ACTIVE state
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Eric Schrock <eric.schrock@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
--- old/usr/src/uts/common/fs/zfs/dbuf.c
+++ new/usr/src/uts/common/fs/zfs/dbuf.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
24 24 * Copyright (c) 2013 by Delphix. All rights reserved.
25 25 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
26 26 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
27 27 */
28 28
29 29 #include <sys/zfs_context.h>
30 30 #include <sys/dmu.h>
31 31 #include <sys/dmu_send.h>
32 32 #include <sys/dmu_impl.h>
33 33 #include <sys/dbuf.h>
34 34 #include <sys/dmu_objset.h>
35 35 #include <sys/dsl_dataset.h>
36 36 #include <sys/dsl_dir.h>
37 37 #include <sys/dmu_tx.h>
38 38 #include <sys/spa.h>
39 39 #include <sys/zio.h>
40 40 #include <sys/dmu_zfetch.h>
41 41 #include <sys/sa.h>
42 42 #include <sys/sa_impl.h>
43 43
44 44 /*
45 45 * Number of times that zfs_free_range() took the slow path while doing
46 46 * a zfs receive. A nonzero value indicates a potential performance problem.
47 47 */
48 48 uint64_t zfs_free_range_recv_miss;
49 49
50 50 static void dbuf_destroy(dmu_buf_impl_t *db);
51 51 static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
52 52 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
53 53
54 54 /*
55 55 * Global data structures and functions for the dbuf cache.
56 56 */
57 57 static kmem_cache_t *dbuf_cache;
58 58
59 59 /* ARGSUSED */
60 60 static int
61 61 dbuf_cons(void *vdb, void *unused, int kmflag)
62 62 {
63 63 dmu_buf_impl_t *db = vdb;
64 64 bzero(db, sizeof (dmu_buf_impl_t));
65 65
66 66 mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
67 67 cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
68 68 refcount_create(&db->db_holds);
69 69 return (0);
70 70 }
71 71
72 72 /* ARGSUSED */
73 73 static void
74 74 dbuf_dest(void *vdb, void *unused)
75 75 {
76 76 dmu_buf_impl_t *db = vdb;
77 77 mutex_destroy(&db->db_mtx);
78 78 cv_destroy(&db->db_changed);
79 79 refcount_destroy(&db->db_holds);
80 80 }
81 81
82 82 /*
83 83 * dbuf hash table routines
84 84 */
85 85 static dbuf_hash_table_t dbuf_hash_table;
86 86
87 87 static uint64_t dbuf_hash_count;
88 88
89 89 static uint64_t
90 90 dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
91 91 {
92 92 uintptr_t osv = (uintptr_t)os;
93 93 uint64_t crc = -1ULL;
94 94
95 95 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
96 96 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
97 97 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
98 98 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
99 99 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
100 100 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
101 101 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
102 102
103 103 crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
104 104
105 105 return (crc);
106 106 }
107 107
108 108 #define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid);
109 109
110 110 #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \
111 111 ((dbuf)->db.db_object == (obj) && \
112 112 (dbuf)->db_objset == (os) && \
113 113 (dbuf)->db_level == (level) && \
114 114 (dbuf)->db_blkid == (blkid))
115 115
116 116 dmu_buf_impl_t *
117 117 dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
118 118 {
119 119 dbuf_hash_table_t *h = &dbuf_hash_table;
120 120 objset_t *os = dn->dn_objset;
121 121 uint64_t obj = dn->dn_object;
122 122 uint64_t hv = DBUF_HASH(os, obj, level, blkid);
123 123 uint64_t idx = hv & h->hash_table_mask;
124 124 dmu_buf_impl_t *db;
125 125
126 126 mutex_enter(DBUF_HASH_MUTEX(h, idx));
127 127 for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
128 128 if (DBUF_EQUAL(db, os, obj, level, blkid)) {
129 129 mutex_enter(&db->db_mtx);
130 130 if (db->db_state != DB_EVICTING) {
131 131 mutex_exit(DBUF_HASH_MUTEX(h, idx));
132 132 return (db);
133 133 }
134 134 mutex_exit(&db->db_mtx);
135 135 }
136 136 }
137 137 mutex_exit(DBUF_HASH_MUTEX(h, idx));
138 138 return (NULL);
139 139 }
140 140
141 141 /*
142 142 * Insert an entry into the hash table. If there is already an element
143 143 * equal to elem in the hash table, then the already existing element
144 144 * will be returned and the new element will not be inserted.
145 145 * Otherwise returns NULL.
146 146 */
147 147 static dmu_buf_impl_t *
148 148 dbuf_hash_insert(dmu_buf_impl_t *db)
149 149 {
150 150 dbuf_hash_table_t *h = &dbuf_hash_table;
151 151 objset_t *os = db->db_objset;
152 152 uint64_t obj = db->db.db_object;
153 153 int level = db->db_level;
154 154 uint64_t blkid = db->db_blkid;
155 155 uint64_t hv = DBUF_HASH(os, obj, level, blkid);
156 156 uint64_t idx = hv & h->hash_table_mask;
157 157 dmu_buf_impl_t *dbf;
158 158
159 159 mutex_enter(DBUF_HASH_MUTEX(h, idx));
160 160 for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
161 161 if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
162 162 mutex_enter(&dbf->db_mtx);
163 163 if (dbf->db_state != DB_EVICTING) {
164 164 mutex_exit(DBUF_HASH_MUTEX(h, idx));
165 165 return (dbf);
166 166 }
167 167 mutex_exit(&dbf->db_mtx);
168 168 }
169 169 }
170 170
171 171 mutex_enter(&db->db_mtx);
172 172 db->db_hash_next = h->hash_table[idx];
173 173 h->hash_table[idx] = db;
174 174 mutex_exit(DBUF_HASH_MUTEX(h, idx));
175 175 atomic_add_64(&dbuf_hash_count, 1);
176 176
177 177 return (NULL);
178 178 }
179 179
180 180 /*
181 181 * Remove an entry from the hash table. This operation will
182 182 * fail if there are any existing holds on the db.
183 183 */
184 184 static void
185 185 dbuf_hash_remove(dmu_buf_impl_t *db)
186 186 {
187 187 dbuf_hash_table_t *h = &dbuf_hash_table;
188 188 uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
189 189 db->db_level, db->db_blkid);
190 190 uint64_t idx = hv & h->hash_table_mask;
191 191 dmu_buf_impl_t *dbf, **dbp;
192 192
193 193 /*
194 194 * We mustn't hold db_mtx to maintain lock ordering:
195 195 * DBUF_HASH_MUTEX > db_mtx.
196 196 */
197 197 ASSERT(refcount_is_zero(&db->db_holds));
198 198 ASSERT(db->db_state == DB_EVICTING);
199 199 ASSERT(!MUTEX_HELD(&db->db_mtx));
200 200
201 201 mutex_enter(DBUF_HASH_MUTEX(h, idx));
202 202 dbp = &h->hash_table[idx];
203 203 while ((dbf = *dbp) != db) {
204 204 dbp = &dbf->db_hash_next;
205 205 ASSERT(dbf != NULL);
206 206 }
207 207 *dbp = db->db_hash_next;
208 208 db->db_hash_next = NULL;
209 209 mutex_exit(DBUF_HASH_MUTEX(h, idx));
210 210 atomic_add_64(&dbuf_hash_count, -1);
211 211 }
212 212
213 213 static arc_evict_func_t dbuf_do_evict;
214 214
215 215 static void
216 216 dbuf_evict_user(dmu_buf_impl_t *db)
217 217 {
218 218 ASSERT(MUTEX_HELD(&db->db_mtx));
219 219
220 220 if (db->db_level != 0 || db->db_evict_func == NULL)
221 221 return;
222 222
223 223 if (db->db_user_data_ptr_ptr)
224 224 *db->db_user_data_ptr_ptr = db->db.db_data;
225 225 db->db_evict_func(&db->db, db->db_user_ptr);
226 226 db->db_user_ptr = NULL;
227 227 db->db_user_data_ptr_ptr = NULL;
228 228 db->db_evict_func = NULL;
229 229 }
230 230
231 231 boolean_t
232 232 dbuf_is_metadata(dmu_buf_impl_t *db)
233 233 {
234 234 if (db->db_level > 0) {
235 235 return (B_TRUE);
236 236 } else {
237 237 boolean_t is_metadata;
238 238
239 239 DB_DNODE_ENTER(db);
240 240 is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
241 241 DB_DNODE_EXIT(db);
242 242
243 243 return (is_metadata);
244 244 }
245 245 }
246 246
247 247 void
248 248 dbuf_evict(dmu_buf_impl_t *db)
249 249 {
250 250 ASSERT(MUTEX_HELD(&db->db_mtx));
251 251 ASSERT(db->db_buf == NULL);
252 252 ASSERT(db->db_data_pending == NULL);
253 253
254 254 dbuf_clear(db);
255 255 dbuf_destroy(db);
256 256 }
257 257
258 258 void
259 259 dbuf_init(void)
260 260 {
261 261 uint64_t hsize = 1ULL << 16;
262 262 dbuf_hash_table_t *h = &dbuf_hash_table;
263 263 int i;
264 264
265 265 /*
266 266 * The hash table is big enough to fill all of physical memory
267 267 * with an average 4K block size. The table will take up
268 268 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
269 269 */
270 270 while (hsize * 4096 < physmem * PAGESIZE)
271 271 hsize <<= 1;
272 272
273 273 retry:
274 274 h->hash_table_mask = hsize - 1;
275 275 h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
276 276 if (h->hash_table == NULL) {
277 277 /* XXX - we should really return an error instead of assert */
278 278 ASSERT(hsize > (1ULL << 10));
279 279 hsize >>= 1;
280 280 goto retry;
281 281 }
282 282
283 283 dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
284 284 sizeof (dmu_buf_impl_t),
285 285 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
286 286
287 287 for (i = 0; i < DBUF_MUTEXES; i++)
288 288 mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
289 289 }
290 290
291 291 void
292 292 dbuf_fini(void)
293 293 {
294 294 dbuf_hash_table_t *h = &dbuf_hash_table;
295 295 int i;
296 296
297 297 for (i = 0; i < DBUF_MUTEXES; i++)
298 298 mutex_destroy(&h->hash_mutexes[i]);
299 299 kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
300 300 kmem_cache_destroy(dbuf_cache);
301 301 }
302 302
303 303 /*
304 304 * Other stuff.
305 305 */
306 306
307 307 #ifdef ZFS_DEBUG
308 308 static void
309 309 dbuf_verify(dmu_buf_impl_t *db)
310 310 {
311 311 dnode_t *dn;
312 312 dbuf_dirty_record_t *dr;
313 313
314 314 ASSERT(MUTEX_HELD(&db->db_mtx));
315 315
316 316 if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
317 317 return;
318 318
319 319 ASSERT(db->db_objset != NULL);
320 320 DB_DNODE_ENTER(db);
321 321 dn = DB_DNODE(db);
322 322 if (dn == NULL) {
323 323 ASSERT(db->db_parent == NULL);
324 324 ASSERT(db->db_blkptr == NULL);
325 325 } else {
326 326 ASSERT3U(db->db.db_object, ==, dn->dn_object);
327 327 ASSERT3P(db->db_objset, ==, dn->dn_objset);
328 328 ASSERT3U(db->db_level, <, dn->dn_nlevels);
329 329 ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
330 330 db->db_blkid == DMU_SPILL_BLKID ||
331 331 !list_is_empty(&dn->dn_dbufs));
332 332 }
333 333 if (db->db_blkid == DMU_BONUS_BLKID) {
334 334 ASSERT(dn != NULL);
335 335 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
336 336 ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
337 337 } else if (db->db_blkid == DMU_SPILL_BLKID) {
338 338 ASSERT(dn != NULL);
339 339 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
340 340 ASSERT0(db->db.db_offset);
341 341 } else {
342 342 ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
343 343 }
344 344
345 345 for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
346 346 ASSERT(dr->dr_dbuf == db);
347 347
348 348 for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
349 349 ASSERT(dr->dr_dbuf == db);
350 350
351 351 /*
352 352 * We can't assert that db_size matches dn_datablksz because it
353 353 * can be momentarily different when another thread is doing
354 354 * dnode_set_blksz().
355 355 */
356 356 if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
357 357 dr = db->db_data_pending;
358 358 /*
359 359 * It should only be modified in syncing context, so
360 360 * make sure we only have one copy of the data.
361 361 */
362 362 ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
363 363 }
364 364
365 365 /* verify db->db_blkptr */
366 366 if (db->db_blkptr) {
367 367 if (db->db_parent == dn->dn_dbuf) {
368 368 /* db is pointed to by the dnode */
369 369 /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
370 370 if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
371 371 ASSERT(db->db_parent == NULL);
372 372 else
373 373 ASSERT(db->db_parent != NULL);
374 374 if (db->db_blkid != DMU_SPILL_BLKID)
375 375 ASSERT3P(db->db_blkptr, ==,
376 376 &dn->dn_phys->dn_blkptr[db->db_blkid]);
377 377 } else {
378 378 /* db is pointed to by an indirect block */
379 379 int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
380 380 ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
381 381 ASSERT3U(db->db_parent->db.db_object, ==,
382 382 db->db.db_object);
383 383 /*
384 384 * dnode_grow_indblksz() can make this fail if we don't
385 385 * have the struct_rwlock. XXX indblksz no longer
386 386 * grows. safe to do this now?
387 387 */
388 388 if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
389 389 ASSERT3P(db->db_blkptr, ==,
390 390 ((blkptr_t *)db->db_parent->db.db_data +
391 391 db->db_blkid % epb));
392 392 }
393 393 }
394 394 }
395 395 if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
396 396 (db->db_buf == NULL || db->db_buf->b_data) &&
397 397 db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
398 398 db->db_state != DB_FILL && !dn->dn_free_txg) {
399 399 /*
400 400 * If the blkptr isn't set but they have nonzero data,
401 401 * it had better be dirty, otherwise we'll lose that
402 402 * data when we evict this buffer.
403 403 */
404 404 if (db->db_dirtycnt == 0) {
405 405 uint64_t *buf = db->db.db_data;
406 406 int i;
407 407
408 408 for (i = 0; i < db->db.db_size >> 3; i++) {
409 409 ASSERT(buf[i] == 0);
410 410 }
411 411 }
412 412 }
413 413 DB_DNODE_EXIT(db);
414 414 }
415 415 #endif
416 416
417 417 static void
418 418 dbuf_update_data(dmu_buf_impl_t *db)
419 419 {
420 420 ASSERT(MUTEX_HELD(&db->db_mtx));
421 421 if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
422 422 ASSERT(!refcount_is_zero(&db->db_holds));
423 423 *db->db_user_data_ptr_ptr = db->db.db_data;
424 424 }
425 425 }
426 426
427 427 static void
428 428 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
429 429 {
430 430 ASSERT(MUTEX_HELD(&db->db_mtx));
431 431 ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
432 432 db->db_buf = buf;
433 433 if (buf != NULL) {
434 434 ASSERT(buf->b_data != NULL);
435 435 db->db.db_data = buf->b_data;
436 436 if (!arc_released(buf))
437 437 arc_set_callback(buf, dbuf_do_evict, db);
438 438 dbuf_update_data(db);
439 439 } else {
440 440 dbuf_evict_user(db);
441 441 db->db.db_data = NULL;
442 442 if (db->db_state != DB_NOFILL)
443 443 db->db_state = DB_UNCACHED;
444 444 }
445 445 }
446 446
447 447 /*
448 448 * Loan out an arc_buf for read. Return the loaned arc_buf.
449 449 */
450 450 arc_buf_t *
451 451 dbuf_loan_arcbuf(dmu_buf_impl_t *db)
452 452 {
453 453 arc_buf_t *abuf;
454 454
455 455 mutex_enter(&db->db_mtx);
456 456 if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
457 457 int blksz = db->db.db_size;
458 458 spa_t *spa;
459 459
460 460 mutex_exit(&db->db_mtx);
461 461 DB_GET_SPA(&spa, db);
462 462 abuf = arc_loan_buf(spa, blksz);
463 463 bcopy(db->db.db_data, abuf->b_data, blksz);
464 464 } else {
465 465 abuf = db->db_buf;
466 466 arc_loan_inuse_buf(abuf, db);
467 467 dbuf_set_data(db, NULL);
468 468 mutex_exit(&db->db_mtx);
469 469 }
470 470 return (abuf);
471 471 }
472 472
473 473 uint64_t
474 474 dbuf_whichblock(dnode_t *dn, uint64_t offset)
475 475 {
476 476 if (dn->dn_datablkshift) {
477 477 return (offset >> dn->dn_datablkshift);
478 478 } else {
479 479 ASSERT3U(offset, <, dn->dn_datablksz);
480 480 return (0);
481 481 }
482 482 }
483 483
484 484 static void
485 485 dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
486 486 {
487 487 dmu_buf_impl_t *db = vdb;
488 488
489 489 mutex_enter(&db->db_mtx);
490 490 ASSERT3U(db->db_state, ==, DB_READ);
491 491 /*
492 492 * All reads are synchronous, so we must have a hold on the dbuf
493 493 */
494 494 ASSERT(refcount_count(&db->db_holds) > 0);
495 495 ASSERT(db->db_buf == NULL);
496 496 ASSERT(db->db.db_data == NULL);
497 497 if (db->db_level == 0 && db->db_freed_in_flight) {
498 498 /* we were freed in flight; disregard any error */
499 499 arc_release(buf, db);
500 500 bzero(buf->b_data, db->db.db_size);
501 501 arc_buf_freeze(buf);
502 502 db->db_freed_in_flight = FALSE;
503 503 dbuf_set_data(db, buf);
504 504 db->db_state = DB_CACHED;
505 505 } else if (zio == NULL || zio->io_error == 0) {
506 506 dbuf_set_data(db, buf);
507 507 db->db_state = DB_CACHED;
508 508 } else {
509 509 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
510 510 ASSERT3P(db->db_buf, ==, NULL);
511 511 VERIFY(arc_buf_remove_ref(buf, db));
512 512 db->db_state = DB_UNCACHED;
513 513 }
514 514 cv_broadcast(&db->db_changed);
515 515 dbuf_rele_and_unlock(db, NULL);
516 516 }
517 517
518 518 static void
519 519 dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
520 520 {
521 521 dnode_t *dn;
522 522 spa_t *spa;
523 523 zbookmark_t zb;
524 524 uint32_t aflags = ARC_NOWAIT;
525 525
526 526 DB_DNODE_ENTER(db);
527 527 dn = DB_DNODE(db);
528 528 ASSERT(!refcount_is_zero(&db->db_holds));
529 529 /* We need the struct_rwlock to prevent db_blkptr from changing. */
530 530 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
531 531 ASSERT(MUTEX_HELD(&db->db_mtx));
532 532 ASSERT(db->db_state == DB_UNCACHED);
533 533 ASSERT(db->db_buf == NULL);
534 534
535 535 if (db->db_blkid == DMU_BONUS_BLKID) {
536 536 int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
537 537
538 538 ASSERT3U(bonuslen, <=, db->db.db_size);
539 539 db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
540 540 arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
541 541 if (bonuslen < DN_MAX_BONUSLEN)
542 542 bzero(db->db.db_data, DN_MAX_BONUSLEN);
543 543 if (bonuslen)
544 544 bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
545 545 DB_DNODE_EXIT(db);
546 546 dbuf_update_data(db);
547 547 db->db_state = DB_CACHED;
548 548 mutex_exit(&db->db_mtx);
549 549 return;
550 550 }
551 551
552 552 /*
553 553 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
554 554 * processes the delete record and clears the bp while we are waiting
555 555 * for the dn_mtx (resulting in a "no" from block_freed).
556 556 */
557 557 if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
558 558 (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
559 559 BP_IS_HOLE(db->db_blkptr)))) {
560 560 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
561 561
562 562 dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa,
563 563 db->db.db_size, db, type));
564 564 DB_DNODE_EXIT(db);
565 565 bzero(db->db.db_data, db->db.db_size);
566 566 db->db_state = DB_CACHED;
567 567 *flags |= DB_RF_CACHED;
568 568 mutex_exit(&db->db_mtx);
569 569 return;
570 570 }
571 571
572 572 spa = dn->dn_objset->os_spa;
573 573 DB_DNODE_EXIT(db);
574 574
575 575 db->db_state = DB_READ;
576 576 mutex_exit(&db->db_mtx);
577 577
578 578 if (DBUF_IS_L2CACHEABLE(db))
579 579 aflags |= ARC_L2CACHE;
580 580 if (DBUF_IS_L2COMPRESSIBLE(db))
581 581 aflags |= ARC_L2COMPRESS;
582 582
583 583 SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
584 584 db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
585 585 db->db.db_object, db->db_level, db->db_blkid);
586 586
587 587 dbuf_add_ref(db, NULL);
588 588
589 589 (void) arc_read(zio, spa, db->db_blkptr,
590 590 dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
591 591 (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
592 592 &aflags, &zb);
593 593 if (aflags & ARC_CACHED)
594 594 *flags |= DB_RF_CACHED;
595 595 }
596 596
597 597 int
598 598 dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
599 599 {
600 600 int err = 0;
601 601 int havepzio = (zio != NULL);
602 602 int prefetch;
603 603 dnode_t *dn;
604 604
605 605 /*
606 606 * We don't have to hold the mutex to check db_state because it
607 607 * can't be freed while we have a hold on the buffer.
608 608 */
609 609 ASSERT(!refcount_is_zero(&db->db_holds));
610 610
611 611 if (db->db_state == DB_NOFILL)
612 612 return (SET_ERROR(EIO));
613 613
614 614 DB_DNODE_ENTER(db);
615 615 dn = DB_DNODE(db);
616 616 if ((flags & DB_RF_HAVESTRUCT) == 0)
617 617 rw_enter(&dn->dn_struct_rwlock, RW_READER);
618 618
619 619 prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
620 620 (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
621 621 DBUF_IS_CACHEABLE(db);
622 622
623 623 mutex_enter(&db->db_mtx);
624 624 if (db->db_state == DB_CACHED) {
625 625 mutex_exit(&db->db_mtx);
626 626 if (prefetch)
627 627 dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
628 628 db->db.db_size, TRUE);
629 629 if ((flags & DB_RF_HAVESTRUCT) == 0)
630 630 rw_exit(&dn->dn_struct_rwlock);
631 631 DB_DNODE_EXIT(db);
632 632 } else if (db->db_state == DB_UNCACHED) {
633 633 spa_t *spa = dn->dn_objset->os_spa;
634 634
635 635 if (zio == NULL)
636 636 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
637 637 dbuf_read_impl(db, zio, &flags);
638 638
639 639 /* dbuf_read_impl has dropped db_mtx for us */
640 640
641 641 if (prefetch)
642 642 dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
643 643 db->db.db_size, flags & DB_RF_CACHED);
644 644
645 645 if ((flags & DB_RF_HAVESTRUCT) == 0)
646 646 rw_exit(&dn->dn_struct_rwlock);
647 647 DB_DNODE_EXIT(db);
648 648
649 649 if (!havepzio)
650 650 err = zio_wait(zio);
651 651 } else {
652 652 /*
653 653 * Another reader came in while the dbuf was in flight
654 654 * between UNCACHED and CACHED. Either a writer will finish
655 655 * writing the buffer (sending the dbuf to CACHED) or the
656 656 * first reader's request will reach the read_done callback
657 657 * and send the dbuf to CACHED. Otherwise, a failure
658 658 * occurred and the dbuf went to UNCACHED.
659 659 */
660 660 mutex_exit(&db->db_mtx);
661 661 if (prefetch)
662 662 dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
663 663 db->db.db_size, TRUE);
664 664 if ((flags & DB_RF_HAVESTRUCT) == 0)
665 665 rw_exit(&dn->dn_struct_rwlock);
666 666 DB_DNODE_EXIT(db);
667 667
668 668 /* Skip the wait per the caller's request. */
669 669 mutex_enter(&db->db_mtx);
670 670 if ((flags & DB_RF_NEVERWAIT) == 0) {
671 671 while (db->db_state == DB_READ ||
672 672 db->db_state == DB_FILL) {
673 673 ASSERT(db->db_state == DB_READ ||
674 674 (flags & DB_RF_HAVESTRUCT) == 0);
675 675 cv_wait(&db->db_changed, &db->db_mtx);
676 676 }
677 677 if (db->db_state == DB_UNCACHED)
678 678 err = SET_ERROR(EIO);
679 679 }
680 680 mutex_exit(&db->db_mtx);
681 681 }
682 682
683 683 ASSERT(err || havepzio || db->db_state == DB_CACHED);
684 684 return (err);
685 685 }
686 686
687 687 static void
688 688 dbuf_noread(dmu_buf_impl_t *db)
689 689 {
690 690 ASSERT(!refcount_is_zero(&db->db_holds));
691 691 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
692 692 mutex_enter(&db->db_mtx);
693 693 while (db->db_state == DB_READ || db->db_state == DB_FILL)
694 694 cv_wait(&db->db_changed, &db->db_mtx);
695 695 if (db->db_state == DB_UNCACHED) {
696 696 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
697 697 spa_t *spa;
698 698
699 699 ASSERT(db->db_buf == NULL);
700 700 ASSERT(db->db.db_data == NULL);
701 701 DB_GET_SPA(&spa, db);
702 702 dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
703 703 db->db_state = DB_FILL;
704 704 } else if (db->db_state == DB_NOFILL) {
705 705 dbuf_set_data(db, NULL);
706 706 } else {
707 707 ASSERT3U(db->db_state, ==, DB_CACHED);
708 708 }
709 709 mutex_exit(&db->db_mtx);
710 710 }
711 711
712 712 /*
713 713 * This is our just-in-time copy function. It makes a copy of
714 714 * buffers, that have been modified in a previous transaction
715 715 * group, before we modify them in the current active group.
716 716 *
717 717 * This function is used in two places: when we are dirtying a
718 718 * buffer for the first time in a txg, and when we are freeing
719 719 * a range in a dnode that includes this buffer.
720 720 *
721 721 * Note that when we are called from dbuf_free_range() we do
722 722 * not put a hold on the buffer, we just traverse the active
723 723 * dbuf list for the dnode.
724 724 */
725 725 static void
726 726 dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
727 727 {
728 728 dbuf_dirty_record_t *dr = db->db_last_dirty;
729 729
730 730 ASSERT(MUTEX_HELD(&db->db_mtx));
731 731 ASSERT(db->db.db_data != NULL);
732 732 ASSERT(db->db_level == 0);
733 733 ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
734 734
735 735 if (dr == NULL ||
736 736 (dr->dt.dl.dr_data !=
737 737 ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
738 738 return;
739 739
740 740 /*
741 741 * If the last dirty record for this dbuf has not yet synced
742 742 * and it's referencing the dbuf data, either:
743 743 * reset the reference to point to a new copy,
744 744 * or (if there are no active holders)
745 745 * just null out the current db_data pointer.
746 746 */
747 747 ASSERT(dr->dr_txg >= txg - 2);
748 748 if (db->db_blkid == DMU_BONUS_BLKID) {
749 749 /* Note that the data bufs here are zio_bufs */
750 750 dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
751 751 arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
752 752 bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
753 753 } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
754 754 int size = db->db.db_size;
755 755 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
756 756 spa_t *spa;
757 757
758 758 DB_GET_SPA(&spa, db);
759 759 dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
760 760 bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
761 761 } else {
762 762 dbuf_set_data(db, NULL);
763 763 }
764 764 }
765 765
766 766 void
767 767 dbuf_unoverride(dbuf_dirty_record_t *dr)
768 768 {
769 769 dmu_buf_impl_t *db = dr->dr_dbuf;
770 770 blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
771 771 uint64_t txg = dr->dr_txg;
772 772
773 773 ASSERT(MUTEX_HELD(&db->db_mtx));
774 774 ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
775 775 ASSERT(db->db_level == 0);
776 776
777 777 if (db->db_blkid == DMU_BONUS_BLKID ||
778 778 dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
779 779 return;
780 780
781 781 ASSERT(db->db_data_pending != dr);
782 782
783 783 /* free this block */
784 784 if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) {
785 785 spa_t *spa;
786 786
787 787 DB_GET_SPA(&spa, db);
788 788 zio_free(spa, txg, bp);
789 789 }
790 790 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
791 791 dr->dt.dl.dr_nopwrite = B_FALSE;
792 792
793 793 /*
794 794 * Release the already-written buffer, so we leave it in
795 795 * a consistent dirty state. Note that all callers are
796 796 * modifying the buffer, so they will immediately do
797 797 * another (redundant) arc_release(). Therefore, leave
798 798 * the buf thawed to save the effort of freezing &
799 799 * immediately re-thawing it.
800 800 */
801 801 arc_release(dr->dt.dl.dr_data, db);
802 802 }
803 803
804 804 /*
805 805 * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
806 806 * data blocks in the free range, so that any future readers will find
807 807 * empty blocks. Also, if we happen across any level-1 dbufs in the
808 808 * range that have not already been marked dirty, mark them dirty so
809 809 * they stay in memory.
810 810 *
811 811 * This is a no-op if the dataset is in the middle of an incremental
812 812 * receive; see comment below for details.
813 813 */
814 814 void
815 815 dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
816 816 {
817 817 dmu_buf_impl_t *db, *db_next;
818 818 uint64_t txg = tx->tx_txg;
819 819 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
820 820 uint64_t first_l1 = start >> epbs;
821 821 uint64_t last_l1 = end >> epbs;
822 822
823 823 if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID)) {
824 824 end = dn->dn_maxblkid;
825 825 last_l1 = end >> epbs;
826 826 }
827 827 dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
828 828
829 829 mutex_enter(&dn->dn_dbufs_mtx);
830 830 if (start >= dn->dn_unlisted_l0_blkid * dn->dn_datablksz) {
831 831 /* There can't be any dbufs in this range; no need to search. */
832 832 mutex_exit(&dn->dn_dbufs_mtx);
833 833 return;
834 834 } else if (dmu_objset_is_receiving(dn->dn_objset)) {
835 835 /*
836 836 * If we are receiving, we expect there to be no dbufs in
837 837 * the range to be freed, because receive modifies each
838 838 * block at most once, and in offset order. If this is
839 839 * not the case, it can lead to performance problems,
840 840 * so note that we unexpectedly took the slow path.
841 841 */
842 842 atomic_inc_64(&zfs_free_range_recv_miss);
843 843 }
844 844
845 845 for (db = list_head(&dn->dn_dbufs); db != NULL; db = db_next) {
846 846 db_next = list_next(&dn->dn_dbufs, db);
847 847 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
848 848
849 849 if (db->db_level == 1 &&
850 850 db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
851 851 mutex_enter(&db->db_mtx);
852 852 if (db->db_last_dirty &&
853 853 db->db_last_dirty->dr_txg < txg) {
854 854 dbuf_add_ref(db, FTAG);
855 855 mutex_exit(&db->db_mtx);
856 856 dbuf_will_dirty(db, tx);
857 857 dbuf_rele(db, FTAG);
858 858 } else {
859 859 mutex_exit(&db->db_mtx);
860 860 }
861 861 }
862 862
863 863 if (db->db_level != 0)
864 864 continue;
865 865 dprintf_dbuf(db, "found buf %s\n", "");
866 866 if (db->db_blkid < start || db->db_blkid > end)
867 867 continue;
868 868
869 869 /* found a level 0 buffer in the range */
870 870 mutex_enter(&db->db_mtx);
871 871 if (dbuf_undirty(db, tx)) {
872 872 /* mutex has been dropped and dbuf destroyed */
873 873 continue;
874 874 }
875 875
876 876 if (db->db_state == DB_UNCACHED ||
877 877 db->db_state == DB_NOFILL ||
878 878 db->db_state == DB_EVICTING) {
879 879 ASSERT(db->db.db_data == NULL);
880 880 mutex_exit(&db->db_mtx);
881 881 continue;
882 882 }
883 883 if (db->db_state == DB_READ || db->db_state == DB_FILL) {
884 884 /* will be handled in dbuf_read_done or dbuf_rele */
885 885 db->db_freed_in_flight = TRUE;
886 886 mutex_exit(&db->db_mtx);
887 887 continue;
888 888 }
889 889 if (refcount_count(&db->db_holds) == 0) {
890 890 ASSERT(db->db_buf);
891 891 dbuf_clear(db);
892 892 continue;
893 893 }
894 894 /* The dbuf is referenced */
895 895
896 896 if (db->db_last_dirty != NULL) {
897 897 dbuf_dirty_record_t *dr = db->db_last_dirty;
898 898
899 899 if (dr->dr_txg == txg) {
900 900 /*
901 901 * This buffer is "in-use", re-adjust the file
902 902 * size to reflect that this buffer may
903 903 * contain new data when we sync.
904 904 */
905 905 if (db->db_blkid != DMU_SPILL_BLKID &&
906 906 db->db_blkid > dn->dn_maxblkid)
907 907 dn->dn_maxblkid = db->db_blkid;
908 908 dbuf_unoverride(dr);
909 909 } else {
910 910 /*
911 911 * This dbuf is not dirty in the open context.
912 912 * Either uncache it (if it's not referenced in
913 913 * the open context) or reset its contents to
914 914 * empty.
915 915 */
916 916 dbuf_fix_old_data(db, txg);
917 917 }
918 918 }
919 919 /* clear the contents if it's cached */
920 920 if (db->db_state == DB_CACHED) {
921 921 ASSERT(db->db.db_data != NULL);
922 922 arc_release(db->db_buf, db);
923 923 bzero(db->db.db_data, db->db.db_size);
924 924 arc_buf_freeze(db->db_buf);
925 925 }
926 926
927 927 mutex_exit(&db->db_mtx);
928 928 }
929 929 mutex_exit(&dn->dn_dbufs_mtx);
930 930 }
931 931
932 932 static int
933 933 dbuf_block_freeable(dmu_buf_impl_t *db)
934 934 {
935 935 dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
936 936 uint64_t birth_txg = 0;
937 937
938 938 /*
939 939 * We don't need any locking to protect db_blkptr:
940 940 * If it's syncing, then db_last_dirty will be set
941 941 * so we'll ignore db_blkptr.
942 942 */
943 943 ASSERT(MUTEX_HELD(&db->db_mtx));
944 944 if (db->db_last_dirty)
945 945 birth_txg = db->db_last_dirty->dr_txg;
946 946 else if (db->db_blkptr)
947 947 birth_txg = db->db_blkptr->blk_birth;
948 948
949 949 /*
950 950 * If we don't exist or are in a snapshot, we can't be freed.
951 951 * Don't pass the bp to dsl_dataset_block_freeable() since we
952 952 * are holding the db_mtx lock and might deadlock if we are
953 953 * prefetching a dedup-ed block.
954 954 */
955 955 if (birth_txg)
956 956 return (ds == NULL ||
957 957 dsl_dataset_block_freeable(ds, NULL, birth_txg));
958 958 else
959 959 return (FALSE);
960 960 }
961 961
962 962 void
963 963 dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
964 964 {
965 965 arc_buf_t *buf, *obuf;
966 966 int osize = db->db.db_size;
967 967 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
968 968 dnode_t *dn;
969 969
970 970 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
971 971
972 972 DB_DNODE_ENTER(db);
973 973 dn = DB_DNODE(db);
974 974
975 975 /* XXX does *this* func really need the lock? */
976 976 ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
977 977
978 978 /*
979 979 * This call to dbuf_will_dirty() with the dn_struct_rwlock held
980 980 * is OK, because there can be no other references to the db
981 981 * when we are changing its size, so no concurrent DB_FILL can
982 982 * be happening.
983 983 */
984 984 /*
985 985 * XXX we should be doing a dbuf_read, checking the return
986 986 * value and returning that up to our callers
987 987 */
988 988 dbuf_will_dirty(db, tx);
989 989
990 990 /* create the data buffer for the new block */
991 991 buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);
992 992
993 993 /* copy old block data to the new block */
994 994 obuf = db->db_buf;
995 995 bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
996 996 /* zero the remainder */
997 997 if (size > osize)
998 998 bzero((uint8_t *)buf->b_data + osize, size - osize);
999 999
1000 1000 mutex_enter(&db->db_mtx);
1001 1001 dbuf_set_data(db, buf);
1002 1002 VERIFY(arc_buf_remove_ref(obuf, db));
1003 1003 db->db.db_size = size;
1004 1004
1005 1005 if (db->db_level == 0) {
1006 1006 ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
1007 1007 db->db_last_dirty->dt.dl.dr_data = buf;
1008 1008 }
1009 1009 mutex_exit(&db->db_mtx);
1010 1010
1011 1011 dnode_willuse_space(dn, size-osize, tx);
1012 1012 DB_DNODE_EXIT(db);
1013 1013 }
1014 1014
1015 1015 void
1016 1016 dbuf_release_bp(dmu_buf_impl_t *db)
1017 1017 {
1018 1018 objset_t *os;
1019 1019
1020 1020 DB_GET_OBJSET(&os, db);
1021 1021 ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
1022 1022 ASSERT(arc_released(os->os_phys_buf) ||
1023 1023 list_link_active(&os->os_dsl_dataset->ds_synced_link));
1024 1024 ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
1025 1025
1026 1026 (void) arc_release(db->db_buf, db);
1027 1027 }
1028 1028
1029 1029 dbuf_dirty_record_t *
1030 1030 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1031 1031 {
1032 1032 dnode_t *dn;
1033 1033 objset_t *os;
1034 1034 dbuf_dirty_record_t **drp, *dr;
1035 1035 int drop_struct_lock = FALSE;
1036 1036 boolean_t do_free_accounting = B_FALSE;
1037 1037 int txgoff = tx->tx_txg & TXG_MASK;
1038 1038
1039 1039 ASSERT(tx->tx_txg != 0);
1040 1040 ASSERT(!refcount_is_zero(&db->db_holds));
1041 1041 DMU_TX_DIRTY_BUF(tx, db);
1042 1042
1043 1043 DB_DNODE_ENTER(db);
1044 1044 dn = DB_DNODE(db);
1045 1045 /*
1046 1046 * Shouldn't dirty a regular buffer in syncing context. Private
1047 1047 * objects may be dirtied in syncing context, but only if they
1048 1048 * were already pre-dirtied in open context.
1049 1049 */
1050 1050 ASSERT(!dmu_tx_is_syncing(tx) ||
1051 1051 BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
1052 1052 DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
1053 1053 dn->dn_objset->os_dsl_dataset == NULL);
1054 1054 /*
1055 1055 * We make this assert for private objects as well, but after we
1056 1056 * check if we're already dirty. They are allowed to re-dirty
1057 1057 * in syncing context.
1058 1058 */
1059 1059 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1060 1060 dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1061 1061 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1062 1062
1063 1063 mutex_enter(&db->db_mtx);
1064 1064 /*
1065 1065 * XXX make this true for indirects too? The problem is that
1066 1066 * transactions created with dmu_tx_create_assigned() from
1067 1067 * syncing context don't bother holding ahead.
1068 1068 */
1069 1069 ASSERT(db->db_level != 0 ||
1070 1070 db->db_state == DB_CACHED || db->db_state == DB_FILL ||
1071 1071 db->db_state == DB_NOFILL);
1072 1072
1073 1073 mutex_enter(&dn->dn_mtx);
1074 1074 /*
1075 1075 * Don't set dirtyctx to SYNC if we're just modifying this as we
1076 1076 * initialize the objset.
1077 1077 */
1078 1078 if (dn->dn_dirtyctx == DN_UNDIRTIED &&
1079 1079 !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
1080 1080 dn->dn_dirtyctx =
1081 1081 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
1082 1082 ASSERT(dn->dn_dirtyctx_firstset == NULL);
1083 1083 dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
1084 1084 }
1085 1085 mutex_exit(&dn->dn_mtx);
1086 1086
1087 1087 if (db->db_blkid == DMU_SPILL_BLKID)
1088 1088 dn->dn_have_spill = B_TRUE;
1089 1089
1090 1090 /*
1091 1091 * If this buffer is already dirty, we're done.
1092 1092 */
1093 1093 drp = &db->db_last_dirty;
1094 1094 ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
1095 1095 db->db.db_object == DMU_META_DNODE_OBJECT);
1096 1096 while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
1097 1097 drp = &dr->dr_next;
1098 1098 if (dr && dr->dr_txg == tx->tx_txg) {
1099 1099 DB_DNODE_EXIT(db);
1100 1100
1101 1101 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
1102 1102 /*
1103 1103 * If this buffer has already been written out,
1104 1104 * we now need to reset its state.
1105 1105 */
1106 1106 dbuf_unoverride(dr);
1107 1107 if (db->db.db_object != DMU_META_DNODE_OBJECT &&
1108 1108 db->db_state != DB_NOFILL)
1109 1109 arc_buf_thaw(db->db_buf);
1110 1110 }
1111 1111 mutex_exit(&db->db_mtx);
1112 1112 return (dr);
1113 1113 }
1114 1114
1115 1115 /*
1116 1116 * Only valid if not already dirty.
1117 1117 */
1118 1118 ASSERT(dn->dn_object == 0 ||
1119 1119 dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1120 1120 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1121 1121
1122 1122 ASSERT3U(dn->dn_nlevels, >, db->db_level);
1123 1123 ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
1124 1124 dn->dn_phys->dn_nlevels > db->db_level ||
1125 1125 dn->dn_next_nlevels[txgoff] > db->db_level ||
1126 1126 dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
1127 1127 dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
1128 1128
1129 1129 /*
1130 1130 * We should only be dirtying in syncing context if it's the
1131 1131 * mos or we're initializing the os or it's a special object.
1132 1132 * However, we are allowed to dirty in syncing context provided
1133 1133 * we already dirtied it in open context. Hence we must make
1134 1134 * this assertion only if we're not already dirty.
1135 1135 */
1136 1136 os = dn->dn_objset;
1137 1137 ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
1138 1138 os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
1139 1139 ASSERT(db->db.db_size != 0);
1140 1140
1141 1141 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1142 1142
1143 1143 if (db->db_blkid != DMU_BONUS_BLKID) {
1144 1144 /*
1145 1145 * Update the accounting.
1146 1146 * Note: we delay "free accounting" until after we drop
1147 1147 * the db_mtx. This keeps us from grabbing other locks
1148 1148 * (and possibly deadlocking) in bp_get_dsize() while
1149 1149 * also holding the db_mtx.
1150 1150 */
1151 1151 dnode_willuse_space(dn, db->db.db_size, tx);
1152 1152 do_free_accounting = dbuf_block_freeable(db);
1153 1153 }
1154 1154
1155 1155 /*
1156 1156 * If this buffer is dirty in an old transaction group we need
1157 1157 * to make a copy of it so that the changes we make in this
1158 1158 * transaction group won't leak out when we sync the older txg.
1159 1159 */
1160 1160 dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
1161 1161 if (db->db_level == 0) {
1162 1162 void *data_old = db->db_buf;
1163 1163
1164 1164 if (db->db_state != DB_NOFILL) {
1165 1165 if (db->db_blkid == DMU_BONUS_BLKID) {
1166 1166 dbuf_fix_old_data(db, tx->tx_txg);
1167 1167 data_old = db->db.db_data;
1168 1168 } else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
1169 1169 /*
1170 1170 * Release the data buffer from the cache so
1171 1171 * that we can modify it without impacting
1172 1172 * possible other users of this cached data
1173 1173 * block. Note that indirect blocks and
1174 1174 * private objects are not released until the
1175 1175 * syncing state (since they are only modified
1176 1176 * then).
1177 1177 */
1178 1178 arc_release(db->db_buf, db);
1179 1179 dbuf_fix_old_data(db, tx->tx_txg);
1180 1180 data_old = db->db_buf;
1181 1181 }
1182 1182 ASSERT(data_old != NULL);
1183 1183 }
1184 1184 dr->dt.dl.dr_data = data_old;
1185 1185 } else {
1186 1186 mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
1187 1187 list_create(&dr->dt.di.dr_children,
1188 1188 sizeof (dbuf_dirty_record_t),
1189 1189 offsetof(dbuf_dirty_record_t, dr_dirty_node));
1190 1190 }
1191 1191 if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
1192 1192 dr->dr_accounted = db->db.db_size;
1193 1193 dr->dr_dbuf = db;
1194 1194 dr->dr_txg = tx->tx_txg;
1195 1195 dr->dr_next = *drp;
1196 1196 *drp = dr;
1197 1197
1198 1198 /*
1199 1199 * We could have been freed_in_flight between the dbuf_noread
1200 1200 * and dbuf_dirty. We win, as though the dbuf_noread() had
1201 1201 * happened after the free.
1202 1202 */
1203 1203 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1204 1204 db->db_blkid != DMU_SPILL_BLKID) {
1205 1205 mutex_enter(&dn->dn_mtx);
1206 1206 dnode_clear_range(dn, db->db_blkid, 1, tx);
1207 1207 mutex_exit(&dn->dn_mtx);
1208 1208 db->db_freed_in_flight = FALSE;
1209 1209 }
1210 1210
1211 1211 /*
1212 1212 * This buffer is now part of this txg
1213 1213 */
1214 1214 dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
1215 1215 db->db_dirtycnt += 1;
1216 1216 ASSERT3U(db->db_dirtycnt, <=, 3);
1217 1217
1218 1218 mutex_exit(&db->db_mtx);
1219 1219
1220 1220 if (db->db_blkid == DMU_BONUS_BLKID ||
1221 1221 db->db_blkid == DMU_SPILL_BLKID) {
1222 1222 mutex_enter(&dn->dn_mtx);
1223 1223 ASSERT(!list_link_active(&dr->dr_dirty_node));
1224 1224 list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1225 1225 mutex_exit(&dn->dn_mtx);
1226 1226 dnode_setdirty(dn, tx);
1227 1227 DB_DNODE_EXIT(db);
1228 1228 return (dr);
1229 1229 } else if (do_free_accounting) {
1230 1230 blkptr_t *bp = db->db_blkptr;
1231 1231 int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
1232 1232 bp_get_dsize(os->os_spa, bp) : db->db.db_size;
1233 1233 /*
1234 1234 * This is only a guess -- if the dbuf is dirty
1235 1235 * in a previous txg, we don't know how much
1236 1236 * space it will use on disk yet. We should
1237 1237 * really have the struct_rwlock to access
1238 1238 * db_blkptr, but since this is just a guess,
1239 1239 * it's OK if we get an odd answer.
1240 1240 */
1241 1241 ddt_prefetch(os->os_spa, bp);
1242 1242 dnode_willuse_space(dn, -willfree, tx);
1243 1243 }
1244 1244
1245 1245 if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
1246 1246 rw_enter(&dn->dn_struct_rwlock, RW_READER);
1247 1247 drop_struct_lock = TRUE;
1248 1248 }
1249 1249
1250 1250 if (db->db_level == 0) {
1251 1251 dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
1252 1252 ASSERT(dn->dn_maxblkid >= db->db_blkid);
1253 1253 }
1254 1254
1255 1255 if (db->db_level+1 < dn->dn_nlevels) {
1256 1256 dmu_buf_impl_t *parent = db->db_parent;
1257 1257 dbuf_dirty_record_t *di;
1258 1258 int parent_held = FALSE;
1259 1259
1260 1260 if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
1261 1261 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1262 1262
1263 1263 parent = dbuf_hold_level(dn, db->db_level+1,
1264 1264 db->db_blkid >> epbs, FTAG);
1265 1265 ASSERT(parent != NULL);
1266 1266 parent_held = TRUE;
1267 1267 }
1268 1268 if (drop_struct_lock)
1269 1269 rw_exit(&dn->dn_struct_rwlock);
1270 1270 ASSERT3U(db->db_level+1, ==, parent->db_level);
1271 1271 di = dbuf_dirty(parent, tx);
1272 1272 if (parent_held)
1273 1273 dbuf_rele(parent, FTAG);
1274 1274
1275 1275 mutex_enter(&db->db_mtx);
1276 1276 /*
1277 1277 * Since we've dropped the mutex, it's possible that
1278 1278 * dbuf_undirty() might have changed this out from under us.
1279 1279 */
1280 1280 if (db->db_last_dirty == dr ||
1281 1281 dn->dn_object == DMU_META_DNODE_OBJECT) {
1282 1282 mutex_enter(&di->dt.di.dr_mtx);
1283 1283 ASSERT3U(di->dr_txg, ==, tx->tx_txg);
1284 1284 ASSERT(!list_link_active(&dr->dr_dirty_node));
1285 1285 list_insert_tail(&di->dt.di.dr_children, dr);
1286 1286 mutex_exit(&di->dt.di.dr_mtx);
1287 1287 dr->dr_parent = di;
1288 1288 }
1289 1289 mutex_exit(&db->db_mtx);
1290 1290 } else {
1291 1291 ASSERT(db->db_level+1 == dn->dn_nlevels);
1292 1292 ASSERT(db->db_blkid < dn->dn_nblkptr);
1293 1293 ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
1294 1294 mutex_enter(&dn->dn_mtx);
1295 1295 ASSERT(!list_link_active(&dr->dr_dirty_node));
1296 1296 list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1297 1297 mutex_exit(&dn->dn_mtx);
1298 1298 if (drop_struct_lock)
1299 1299 rw_exit(&dn->dn_struct_rwlock);
1300 1300 }
1301 1301
1302 1302 dnode_setdirty(dn, tx);
1303 1303 DB_DNODE_EXIT(db);
1304 1304 return (dr);
1305 1305 }
1306 1306
1307 1307 /*
1308 1308 * Undirty a buffer in the transaction group referenced by the given
1309 1309 * transaction. Return whether this evicted the dbuf.
1310 1310 */
1311 1311 static boolean_t
1312 1312 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1313 1313 {
1314 1314 dnode_t *dn;
1315 1315 uint64_t txg = tx->tx_txg;
1316 1316 dbuf_dirty_record_t *dr, **drp;
1317 1317
1318 1318 ASSERT(txg != 0);
1319 1319 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1320 1320 ASSERT0(db->db_level);
1321 1321 ASSERT(MUTEX_HELD(&db->db_mtx));
1322 1322
1323 1323 /*
1324 1324 * If this buffer is not dirty, we're done.
1325 1325 */
1326 1326 for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
1327 1327 if (dr->dr_txg <= txg)
1328 1328 break;
1329 1329 if (dr == NULL || dr->dr_txg < txg)
1330 1330 return (B_FALSE);
1331 1331 ASSERT(dr->dr_txg == txg);
1332 1332 ASSERT(dr->dr_dbuf == db);
1333 1333
1334 1334 DB_DNODE_ENTER(db);
1335 1335 dn = DB_DNODE(db);
1336 1336
1337 - /*
1338 - * Note: This code will probably work even if there are concurrent
1339 - * holders, but it is untested in that scenerio, as the ZPL and
1340 - * ztest have additional locking (the range locks) that prevents
1341 - * that type of concurrent access.
1342 - */
1343 - ASSERT3U(refcount_count(&db->db_holds), ==, db->db_dirtycnt);
1344 -
1345 1337 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1346 1338
1347 1339 ASSERT(db->db.db_size != 0);
1348 1340
1349 1341 /*
1350 1342 * Any space we accounted for in dp_dirty_* will be cleaned up by
1351 1343 * dsl_pool_sync(). This is relatively rare so the discrepancy
1352 1344 * is not a big deal.
1353 1345 */
1354 1346
1355 1347 *drp = dr->dr_next;
1356 1348
1357 1349 /*
1358 1350 * Note that there are three places in dbuf_dirty()
1359 1351 * where this dirty record may be put on a list.
1360 1352 * Make sure to do a list_remove corresponding to
1361 1353 * every one of those list_insert calls.
1362 1354 */
1363 1355 if (dr->dr_parent) {
1364 1356 mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
1365 1357 list_remove(&dr->dr_parent->dt.di.dr_children, dr);
1366 1358 mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
1367 1359 } else if (db->db_blkid == DMU_SPILL_BLKID ||
1368 1360 db->db_level+1 == dn->dn_nlevels) {
1369 1361 ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
1370 1362 mutex_enter(&dn->dn_mtx);
1371 1363 list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
1372 1364 mutex_exit(&dn->dn_mtx);
1373 1365 }
1374 1366 DB_DNODE_EXIT(db);
1375 1367
1376 1368 if (db->db_state != DB_NOFILL) {
1377 1369 dbuf_unoverride(dr);
1378 1370
1379 1371 ASSERT(db->db_buf != NULL);
1380 1372 ASSERT(dr->dt.dl.dr_data != NULL);
1381 1373 if (dr->dt.dl.dr_data != db->db_buf)
1382 1374 VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
1383 1375 }
1384 1376 kmem_free(dr, sizeof (dbuf_dirty_record_t));
1385 1377
1386 1378 ASSERT(db->db_dirtycnt > 0);
1387 1379 db->db_dirtycnt -= 1;
1388 1380
1389 1381 if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
1390 1382 arc_buf_t *buf = db->db_buf;
1391 1383
1392 1384 ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
1393 1385 dbuf_set_data(db, NULL);
1394 1386 VERIFY(arc_buf_remove_ref(buf, db));
1395 1387 dbuf_evict(db);
1396 1388 return (B_TRUE);
1397 1389 }
1398 1390
1399 1391 return (B_FALSE);
1400 1392 }
1401 1393
1402 1394 #pragma weak dmu_buf_will_dirty = dbuf_will_dirty
1403 1395 void
1404 1396 dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1405 1397 {
1406 1398 int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
1407 1399
1408 1400 ASSERT(tx->tx_txg != 0);
1409 1401 ASSERT(!refcount_is_zero(&db->db_holds));
1410 1402
1411 1403 DB_DNODE_ENTER(db);
1412 1404 if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
1413 1405 rf |= DB_RF_HAVESTRUCT;
1414 1406 DB_DNODE_EXIT(db);
1415 1407 (void) dbuf_read(db, NULL, rf);
1416 1408 (void) dbuf_dirty(db, tx);
1417 1409 }
1418 1410
1419 1411 void
1420 1412 dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1421 1413 {
1422 1414 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1423 1415
1424 1416 db->db_state = DB_NOFILL;
1425 1417
1426 1418 dmu_buf_will_fill(db_fake, tx);
1427 1419 }
1428 1420
1429 1421 void
1430 1422 dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1431 1423 {
1432 1424 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1433 1425
1434 1426 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1435 1427 ASSERT(tx->tx_txg != 0);
1436 1428 ASSERT(db->db_level == 0);
1437 1429 ASSERT(!refcount_is_zero(&db->db_holds));
1438 1430
1439 1431 ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
1440 1432 dmu_tx_private_ok(tx));
1441 1433
1442 1434 dbuf_noread(db);
1443 1435 (void) dbuf_dirty(db, tx);
1444 1436 }
1445 1437
1446 1438 #pragma weak dmu_buf_fill_done = dbuf_fill_done
1447 1439 /* ARGSUSED */
1448 1440 void
1449 1441 dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
1450 1442 {
1451 1443 mutex_enter(&db->db_mtx);
1452 1444 DBUF_VERIFY(db);
1453 1445
1454 1446 if (db->db_state == DB_FILL) {
1455 1447 if (db->db_level == 0 && db->db_freed_in_flight) {
1456 1448 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1457 1449 /* we were freed while filling */
1458 1450 /* XXX dbuf_undirty? */
1459 1451 bzero(db->db.db_data, db->db.db_size);
1460 1452 db->db_freed_in_flight = FALSE;
1461 1453 }
1462 1454 db->db_state = DB_CACHED;
1463 1455 cv_broadcast(&db->db_changed);
1464 1456 }
1465 1457 mutex_exit(&db->db_mtx);
1466 1458 }
1467 1459
1468 1460 /*
1469 1461 * Directly assign a provided arc buf to a given dbuf if it's not referenced
1470 1462 * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
1471 1463 */
1472 1464 void
1473 1465 dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
1474 1466 {
1475 1467 ASSERT(!refcount_is_zero(&db->db_holds));
1476 1468 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1477 1469 ASSERT(db->db_level == 0);
1478 1470 ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
1479 1471 ASSERT(buf != NULL);
1480 1472 ASSERT(arc_buf_size(buf) == db->db.db_size);
1481 1473 ASSERT(tx->tx_txg != 0);
1482 1474
1483 1475 arc_return_buf(buf, db);
1484 1476 ASSERT(arc_released(buf));
1485 1477
1486 1478 mutex_enter(&db->db_mtx);
1487 1479
1488 1480 while (db->db_state == DB_READ || db->db_state == DB_FILL)
1489 1481 cv_wait(&db->db_changed, &db->db_mtx);
1490 1482
1491 1483 ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
1492 1484
1493 1485 if (db->db_state == DB_CACHED &&
1494 1486 refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
1495 1487 mutex_exit(&db->db_mtx);
1496 1488 (void) dbuf_dirty(db, tx);
1497 1489 bcopy(buf->b_data, db->db.db_data, db->db.db_size);
1498 1490 VERIFY(arc_buf_remove_ref(buf, db));
1499 1491 xuio_stat_wbuf_copied();
1500 1492 return;
1501 1493 }
1502 1494
1503 1495 xuio_stat_wbuf_nocopy();
1504 1496 if (db->db_state == DB_CACHED) {
1505 1497 dbuf_dirty_record_t *dr = db->db_last_dirty;
1506 1498
1507 1499 ASSERT(db->db_buf != NULL);
1508 1500 if (dr != NULL && dr->dr_txg == tx->tx_txg) {
1509 1501 ASSERT(dr->dt.dl.dr_data == db->db_buf);
1510 1502 if (!arc_released(db->db_buf)) {
1511 1503 ASSERT(dr->dt.dl.dr_override_state ==
1512 1504 DR_OVERRIDDEN);
1513 1505 arc_release(db->db_buf, db);
1514 1506 }
1515 1507 dr->dt.dl.dr_data = buf;
1516 1508 VERIFY(arc_buf_remove_ref(db->db_buf, db));
1517 1509 } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
1518 1510 arc_release(db->db_buf, db);
1519 1511 VERIFY(arc_buf_remove_ref(db->db_buf, db));
1520 1512 }
1521 1513 db->db_buf = NULL;
1522 1514 }
1523 1515 ASSERT(db->db_buf == NULL);
1524 1516 dbuf_set_data(db, buf);
1525 1517 db->db_state = DB_FILL;
1526 1518 mutex_exit(&db->db_mtx);
1527 1519 (void) dbuf_dirty(db, tx);
1528 1520 dbuf_fill_done(db, tx);
1529 1521 }
1530 1522
1531 1523 /*
1532 1524 * "Clear" the contents of this dbuf. This will mark the dbuf
1533 1525 * EVICTING and clear *most* of its references. Unfortunately,
1534 1526 * when we are not holding the dn_dbufs_mtx, we can't clear the
1535 1527 * entry in the dn_dbufs list. We have to wait until dbuf_destroy()
1536 1528 * in this case. For callers from the DMU we will usually see:
1537 1529 * dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
1538 1530 * For the arc callback, we will usually see:
1539 1531 * dbuf_do_evict()->dbuf_clear();dbuf_destroy()
1540 1532 * Sometimes, though, we will get a mix of these two:
1541 1533 * DMU: dbuf_clear()->arc_buf_evict()
1542 1534 * ARC: dbuf_do_evict()->dbuf_destroy()
1543 1535 */
1544 1536 void
1545 1537 dbuf_clear(dmu_buf_impl_t *db)
1546 1538 {
1547 1539 dnode_t *dn;
1548 1540 dmu_buf_impl_t *parent = db->db_parent;
1549 1541 dmu_buf_impl_t *dndb;
1550 1542 int dbuf_gone = FALSE;
1551 1543
1552 1544 ASSERT(MUTEX_HELD(&db->db_mtx));
1553 1545 ASSERT(refcount_is_zero(&db->db_holds));
1554 1546
1555 1547 dbuf_evict_user(db);
1556 1548
1557 1549 if (db->db_state == DB_CACHED) {
1558 1550 ASSERT(db->db.db_data != NULL);
1559 1551 if (db->db_blkid == DMU_BONUS_BLKID) {
1560 1552 zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
1561 1553 arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
1562 1554 }
1563 1555 db->db.db_data = NULL;
1564 1556 db->db_state = DB_UNCACHED;
1565 1557 }
1566 1558
1567 1559 ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
1568 1560 ASSERT(db->db_data_pending == NULL);
1569 1561
1570 1562 db->db_state = DB_EVICTING;
1571 1563 db->db_blkptr = NULL;
1572 1564
1573 1565 DB_DNODE_ENTER(db);
1574 1566 dn = DB_DNODE(db);
1575 1567 dndb = dn->dn_dbuf;
1576 1568 if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
1577 1569 list_remove(&dn->dn_dbufs, db);
1578 1570 (void) atomic_dec_32_nv(&dn->dn_dbufs_count);
1579 1571 membar_producer();
1580 1572 DB_DNODE_EXIT(db);
1581 1573 /*
1582 1574 * Decrementing the dbuf count means that the hold corresponding
1583 1575 * to the removed dbuf is no longer discounted in dnode_move(),
1584 1576 * so the dnode cannot be moved until after we release the hold.
1585 1577 * The membar_producer() ensures visibility of the decremented
1586 1578 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
1587 1579 * release any lock.
1588 1580 */
1589 1581 dnode_rele(dn, db);
1590 1582 db->db_dnode_handle = NULL;
1591 1583 } else {
1592 1584 DB_DNODE_EXIT(db);
1593 1585 }
1594 1586
1595 1587 if (db->db_buf)
1596 1588 dbuf_gone = arc_buf_evict(db->db_buf);
1597 1589
1598 1590 if (!dbuf_gone)
1599 1591 mutex_exit(&db->db_mtx);
1600 1592
1601 1593 /*
1602 1594 * If this dbuf is referenced from an indirect dbuf,
1603 1595 * decrement the ref count on the indirect dbuf.
1604 1596 */
1605 1597 if (parent && parent != dndb)
1606 1598 dbuf_rele(parent, db);
1607 1599 }
1608 1600
1609 1601 static int
1610 1602 dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
1611 1603 dmu_buf_impl_t **parentp, blkptr_t **bpp)
1612 1604 {
1613 1605 int nlevels, epbs;
1614 1606
1615 1607 *parentp = NULL;
1616 1608 *bpp = NULL;
1617 1609
1618 1610 ASSERT(blkid != DMU_BONUS_BLKID);
1619 1611
1620 1612 if (blkid == DMU_SPILL_BLKID) {
1621 1613 mutex_enter(&dn->dn_mtx);
1622 1614 if (dn->dn_have_spill &&
1623 1615 (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
1624 1616 *bpp = &dn->dn_phys->dn_spill;
1625 1617 else
1626 1618 *bpp = NULL;
1627 1619 dbuf_add_ref(dn->dn_dbuf, NULL);
1628 1620 *parentp = dn->dn_dbuf;
1629 1621 mutex_exit(&dn->dn_mtx);
1630 1622 return (0);
1631 1623 }
1632 1624
1633 1625 if (dn->dn_phys->dn_nlevels == 0)
1634 1626 nlevels = 1;
1635 1627 else
1636 1628 nlevels = dn->dn_phys->dn_nlevels;
1637 1629
1638 1630 epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1639 1631
1640 1632 ASSERT3U(level * epbs, <, 64);
1641 1633 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1642 1634 if (level >= nlevels ||
1643 1635 (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
1644 1636 /* the buffer has no parent yet */
1645 1637 return (SET_ERROR(ENOENT));
1646 1638 } else if (level < nlevels-1) {
1647 1639 /* this block is referenced from an indirect block */
1648 1640 int err = dbuf_hold_impl(dn, level+1,
1649 1641 blkid >> epbs, fail_sparse, NULL, parentp);
1650 1642 if (err)
1651 1643 return (err);
1652 1644 err = dbuf_read(*parentp, NULL,
1653 1645 (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
1654 1646 if (err) {
1655 1647 dbuf_rele(*parentp, NULL);
1656 1648 *parentp = NULL;
1657 1649 return (err);
1658 1650 }
1659 1651 *bpp = ((blkptr_t *)(*parentp)->db.db_data) +
1660 1652 (blkid & ((1ULL << epbs) - 1));
1661 1653 return (0);
1662 1654 } else {
1663 1655 /* the block is referenced from the dnode */
1664 1656 ASSERT3U(level, ==, nlevels-1);
1665 1657 ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
1666 1658 blkid < dn->dn_phys->dn_nblkptr);
1667 1659 if (dn->dn_dbuf) {
1668 1660 dbuf_add_ref(dn->dn_dbuf, NULL);
1669 1661 *parentp = dn->dn_dbuf;
1670 1662 }
1671 1663 *bpp = &dn->dn_phys->dn_blkptr[blkid];
1672 1664 return (0);
1673 1665 }
1674 1666 }
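/*
 * Editorial note (illustrative, not part of this change): the epbs
 * arithmetic above can be read with a small worked example. With the
 * common 16K indirect block (dn_indblkshift == 14) and 128-byte block
 * pointers (SPA_BLKPTRSHIFT == 7), epbs = 14 - 7 = 7, so each indirect
 * block maps 1 << 7 = 128 children. For level-0 blkid 300, the level-1
 * parent is blkid 300 >> 7 = 2, and the child's slot within that
 * parent's db.db_data array is 300 & 127 = 44, which is exactly the
 * index computed above.
 */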
1675 1667
1676 1668 static dmu_buf_impl_t *
1677 1669 dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
1678 1670 dmu_buf_impl_t *parent, blkptr_t *blkptr)
1679 1671 {
1680 1672 objset_t *os = dn->dn_objset;
1681 1673 dmu_buf_impl_t *db, *odb;
1682 1674
1683 1675 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1684 1676 ASSERT(dn->dn_type != DMU_OT_NONE);
1685 1677
1686 1678 db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
1687 1679
1688 1680 db->db_objset = os;
1689 1681 db->db.db_object = dn->dn_object;
1690 1682 db->db_level = level;
1691 1683 db->db_blkid = blkid;
1692 1684 db->db_last_dirty = NULL;
1693 1685 db->db_dirtycnt = 0;
1694 1686 db->db_dnode_handle = dn->dn_handle;
1695 1687 db->db_parent = parent;
1696 1688 db->db_blkptr = blkptr;
1697 1689
1698 1690 db->db_user_ptr = NULL;
1699 1691 db->db_user_data_ptr_ptr = NULL;
1700 1692 db->db_evict_func = NULL;
1701 1693 db->db_immediate_evict = 0;
1702 1694 db->db_freed_in_flight = 0;
1703 1695
1704 1696 if (blkid == DMU_BONUS_BLKID) {
1705 1697 ASSERT3P(parent, ==, dn->dn_dbuf);
1706 1698 db->db.db_size = DN_MAX_BONUSLEN -
1707 1699 (dn->dn_nblkptr-1) * sizeof (blkptr_t);
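		/*
		 * Illustrative arithmetic (assumes the historical 512-byte
		 * dnode with DN_MAX_BONUSLEN == 320 and 128-byte blkptrs):
		 * an object with dn_nblkptr == 1 gets the full 320-byte
		 * bonus area, while one with dn_nblkptr == 3 gets
		 * 320 - 2 * 128 = 64 bytes.
		 */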
1708 1700 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
1709 1701 db->db.db_offset = DMU_BONUS_BLKID;
1710 1702 db->db_state = DB_UNCACHED;
1711 1703 /* the bonus dbuf is not placed in the hash table */
1712 1704 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1713 1705 return (db);
1714 1706 } else if (blkid == DMU_SPILL_BLKID) {
1715 1707 db->db.db_size = (blkptr != NULL) ?
1716 1708 BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
1717 1709 db->db.db_offset = 0;
1718 1710 } else {
1719 1711 int blocksize =
1720 1712 db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
1721 1713 db->db.db_size = blocksize;
1722 1714 db->db.db_offset = db->db_blkid * blocksize;
1723 1715 }
1724 1716
1725 1717 /*
1726 1718 * Hold the dn_dbufs_mtx while we insert the new dbuf
1727 1719 * into the hash table *and* add it to the dn_dbufs list.
1728 1720 * This prevents a possible deadlock with someone
1729 1721 * trying to look up this dbuf before it's added to the
1730 1722 * dn_dbufs list.
1731 1723 */
1732 1724 mutex_enter(&dn->dn_dbufs_mtx);
1733 1725 db->db_state = DB_EVICTING;
1734 1726 if ((odb = dbuf_hash_insert(db)) != NULL) {
1735 1727 /* someone else inserted it first */
1736 1728 kmem_cache_free(dbuf_cache, db);
1737 1729 mutex_exit(&dn->dn_dbufs_mtx);
1738 1730 return (odb);
1739 1731 }
1740 1732 list_insert_head(&dn->dn_dbufs, db);
1741 1733 if (db->db_level == 0 && db->db_blkid >=
1742 1734 dn->dn_unlisted_l0_blkid)
1743 1735 dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
1744 1736 db->db_state = DB_UNCACHED;
1745 1737 mutex_exit(&dn->dn_dbufs_mtx);
1746 1738 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1747 1739
1748 1740 if (parent && parent != dn->dn_dbuf)
1749 1741 dbuf_add_ref(parent, db);
1750 1742
1751 1743 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1752 1744 refcount_count(&dn->dn_holds) > 0);
1753 1745 (void) refcount_add(&dn->dn_holds, db);
1754 1746 (void) atomic_inc_32_nv(&dn->dn_dbufs_count);
1755 1747
1756 1748 dprintf_dbuf(db, "db=%p\n", db);
1757 1749
1758 1750 return (db);
1759 1751 }
1760 1752
1761 1753 static int
1762 1754 dbuf_do_evict(void *private)
1763 1755 {
1764 1756 arc_buf_t *buf = private;
1765 1757 dmu_buf_impl_t *db = buf->b_private;
1766 1758
1767 1759 if (!MUTEX_HELD(&db->db_mtx))
1768 1760 mutex_enter(&db->db_mtx);
1769 1761
1770 1762 ASSERT(refcount_is_zero(&db->db_holds));
1771 1763
1772 1764 if (db->db_state != DB_EVICTING) {
1773 1765 ASSERT(db->db_state == DB_CACHED);
1774 1766 DBUF_VERIFY(db);
1775 1767 db->db_buf = NULL;
1776 1768 dbuf_evict(db);
1777 1769 } else {
1778 1770 mutex_exit(&db->db_mtx);
1779 1771 dbuf_destroy(db);
1780 1772 }
1781 1773 return (0);
1782 1774 }
1783 1775
1784 1776 static void
1785 1777 dbuf_destroy(dmu_buf_impl_t *db)
1786 1778 {
1787 1779 ASSERT(refcount_is_zero(&db->db_holds));
1788 1780
1789 1781 if (db->db_blkid != DMU_BONUS_BLKID) {
1790 1782 /*
1791 1783 * If this dbuf is still on the dn_dbufs list,
1792 1784 * remove it from that list.
1793 1785 */
1794 1786 if (db->db_dnode_handle != NULL) {
1795 1787 dnode_t *dn;
1796 1788
1797 1789 DB_DNODE_ENTER(db);
1798 1790 dn = DB_DNODE(db);
1799 1791 mutex_enter(&dn->dn_dbufs_mtx);
1800 1792 list_remove(&dn->dn_dbufs, db);
1801 1793 (void) atomic_dec_32_nv(&dn->dn_dbufs_count);
1802 1794 mutex_exit(&dn->dn_dbufs_mtx);
1803 1795 DB_DNODE_EXIT(db);
1804 1796 /*
1805 1797 * Decrementing the dbuf count means that the hold
1806 1798 * corresponding to the removed dbuf is no longer
1807 1799 * discounted in dnode_move(), so the dnode cannot be
1808 1800 * moved until after we release the hold.
1809 1801 */
1810 1802 dnode_rele(dn, db);
1811 1803 db->db_dnode_handle = NULL;
1812 1804 }
1813 1805 dbuf_hash_remove(db);
1814 1806 }
1815 1807 db->db_parent = NULL;
1816 1808 db->db_buf = NULL;
1817 1809
1818 1810 ASSERT(!list_link_active(&db->db_link));
1819 1811 ASSERT(db->db.db_data == NULL);
1820 1812 ASSERT(db->db_hash_next == NULL);
1821 1813 ASSERT(db->db_blkptr == NULL);
1822 1814 ASSERT(db->db_data_pending == NULL);
1823 1815
1824 1816 kmem_cache_free(dbuf_cache, db);
1825 1817 arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1826 1818 }
1827 1819
1828 1820 void
1829 1821 dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
1830 1822 {
1831 1823 dmu_buf_impl_t *db = NULL;
1832 1824 blkptr_t *bp = NULL;
1833 1825
1834 1826 ASSERT(blkid != DMU_BONUS_BLKID);
1835 1827 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1836 1828
1837 1829 if (dnode_block_freed(dn, blkid))
1838 1830 return;
1839 1831
1840 1832 /* dbuf_find() returns with db_mtx held */
1841 1833 if (db = dbuf_find(dn, 0, blkid)) {
1842 1834 /*
1843 1835 * This dbuf is already in the cache. We assume that
1844 1836 * it is already CACHED, or else about to be either
1845 1837 * read or filled.
1846 1838 */
1847 1839 mutex_exit(&db->db_mtx);
1848 1840 return;
1849 1841 }
1850 1842
1851 1843 if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
1852 1844 if (bp && !BP_IS_HOLE(bp)) {
1853 1845 dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
1854 1846 uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
1855 1847 zbookmark_t zb;
1856 1848
1857 1849 SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
1858 1850 dn->dn_object, 0, blkid);
1859 1851
1860 1852 (void) arc_read(NULL, dn->dn_objset->os_spa,
1861 1853 bp, NULL, NULL, prio,
1862 1854 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
1863 1855 &aflags, &zb);
1864 1856 }
1865 1857 if (db)
1866 1858 dbuf_rele(db, NULL);
1867 1859 }
1868 1860 }
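/*
 * Usage sketch (an assumption about callers, not taken from this change):
 * a prefetching caller is expected to hold dn_struct_rwlock, as the
 * ASSERT above requires, and issue one call per level-0 block id, e.g.:
 *
 *	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 *	for (uint64_t blkid = start; blkid <= end; blkid++)
 *		dbuf_prefetch(dn, blkid, ZIO_PRIORITY_ASYNC_READ);
 *	rw_exit(&dn->dn_struct_rwlock);
 *
 * "start" and "end" are hypothetical bounds supplied by the caller.
 */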
1869 1861
1870 1862 /*
1871 1863 * Returns with db_holds incremented, and db_mtx not held.
1872 1864 * Note: dn_struct_rwlock must be held.
1873 1865 */
1874 1866 int
1875 1867 dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
1876 1868 void *tag, dmu_buf_impl_t **dbp)
1877 1869 {
1878 1870 dmu_buf_impl_t *db, *parent = NULL;
1879 1871
1880 1872 ASSERT(blkid != DMU_BONUS_BLKID);
1881 1873 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1882 1874 ASSERT3U(dn->dn_nlevels, >, level);
1883 1875
1884 1876 *dbp = NULL;
1885 1877 top:
1886 1878 /* dbuf_find() returns with db_mtx held */
1887 1879 db = dbuf_find(dn, level, blkid);
1888 1880
1889 1881 if (db == NULL) {
1890 1882 blkptr_t *bp = NULL;
1891 1883 int err;
1892 1884
1893 1885 ASSERT3P(parent, ==, NULL);
1894 1886 err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
1895 1887 if (fail_sparse) {
1896 1888 if (err == 0 && bp && BP_IS_HOLE(bp))
1897 1889 err = SET_ERROR(ENOENT);
1898 1890 if (err) {
1899 1891 if (parent)
1900 1892 dbuf_rele(parent, NULL);
1901 1893 return (err);
1902 1894 }
1903 1895 }
1904 1896 if (err && err != ENOENT)
1905 1897 return (err);
1906 1898 db = dbuf_create(dn, level, blkid, parent, bp);
1907 1899 }
1908 1900
1909 1901 if (db->db_buf && refcount_is_zero(&db->db_holds)) {
1910 1902 arc_buf_add_ref(db->db_buf, db);
1911 1903 if (db->db_buf->b_data == NULL) {
1912 1904 dbuf_clear(db);
1913 1905 if (parent) {
1914 1906 dbuf_rele(parent, NULL);
1915 1907 parent = NULL;
1916 1908 }
1917 1909 goto top;
1918 1910 }
1919 1911 ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
1920 1912 }
1921 1913
1922 1914 ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
1923 1915
1924 1916 /*
1925 1917 * If this buffer is currently syncing out, and we are
1926 1918 * still referencing it from db_data, we need to make a copy
1927 1919 * of it in case we decide we want to dirty it again in this txg.
1928 1920 */
1929 1921 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1930 1922 dn->dn_object != DMU_META_DNODE_OBJECT &&
1931 1923 db->db_state == DB_CACHED && db->db_data_pending) {
1932 1924 dbuf_dirty_record_t *dr = db->db_data_pending;
1933 1925
1934 1926 if (dr->dt.dl.dr_data == db->db_buf) {
1935 1927 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
1936 1928
1937 1929 dbuf_set_data(db,
1938 1930 arc_buf_alloc(dn->dn_objset->os_spa,
1939 1931 db->db.db_size, db, type));
1940 1932 bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
1941 1933 db->db.db_size);
1942 1934 }
1943 1935 }
1944 1936
1945 1937 (void) refcount_add(&db->db_holds, tag);
1946 1938 dbuf_update_data(db);
1947 1939 DBUF_VERIFY(db);
1948 1940 mutex_exit(&db->db_mtx);
1949 1941
1950 1942 /* NOTE: we can't rele the parent until after we drop the db_mtx */
1951 1943 if (parent)
1952 1944 dbuf_rele(parent, NULL);
1953 1945
1954 1946 ASSERT3P(DB_DNODE(db), ==, dn);
1955 1947 ASSERT3U(db->db_blkid, ==, blkid);
1956 1948 ASSERT3U(db->db_level, ==, level);
1957 1949 *dbp = db;
1958 1950
1959 1951 return (0);
1960 1952 }
1961 1953
1962 1954 dmu_buf_impl_t *
1963 1955 dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
1964 1956 {
1965 1957 dmu_buf_impl_t *db;
1966 1958 int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
1967 1959 return (err ? NULL : db);
1968 1960 }
1969 1961
1970 1962 dmu_buf_impl_t *
1971 1963 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
1972 1964 {
1973 1965 dmu_buf_impl_t *db;
1974 1966 int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
1975 1967 return (err ? NULL : db);
1976 1968 }
1977 1969
1978 1970 void
1979 1971 dbuf_create_bonus(dnode_t *dn)
1980 1972 {
1981 1973 ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
1982 1974
1983 1975 ASSERT(dn->dn_bonus == NULL);
1984 1976 dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
1985 1977 }
1986 1978
1987 1979 int
1988 1980 dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
1989 1981 {
1990 1982 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1991 1983 dnode_t *dn;
1992 1984
1993 1985 if (db->db_blkid != DMU_SPILL_BLKID)
1994 1986 return (SET_ERROR(ENOTSUP));
1995 1987 if (blksz == 0)
1996 1988 blksz = SPA_MINBLOCKSIZE;
1997 1989 if (blksz > SPA_MAXBLOCKSIZE)
1998 1990 blksz = SPA_MAXBLOCKSIZE;
1999 1991 else
2000 1992 blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
2001 1993
2002 1994 DB_DNODE_ENTER(db);
2003 1995 dn = DB_DNODE(db);
2004 1996 rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
2005 1997 dbuf_new_size(db, blksz, tx);
2006 1998 rw_exit(&dn->dn_struct_rwlock);
2007 1999 DB_DNODE_EXIT(db);
2008 2000
2009 2001 return (0);
2010 2002 }
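/*
 * Illustrative rounding (editorial note): a requested blksz of 3000 is
 * rounded up to P2ROUNDUP(3000, SPA_MINBLOCKSIZE) = 3072 with the
 * 512-byte SPA_MINBLOCKSIZE, a request of 0 becomes SPA_MINBLOCKSIZE,
 * and anything above SPA_MAXBLOCKSIZE is clamped to that maximum.
 */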
2011 2003
2012 2004 void
2013 2005 dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
2014 2006 {
2015 2007 dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
2016 2008 }
2017 2009
2018 2010 #pragma weak dmu_buf_add_ref = dbuf_add_ref
2019 2011 void
2020 2012 dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
2021 2013 {
2022 2014 int64_t holds = refcount_add(&db->db_holds, tag);
2023 2015 ASSERT(holds > 1);
2024 2016 }
2025 2017
2026 2018 /*
2027 2019 * If you call dbuf_rele() you had better not be referencing the dnode handle
2028 2020 * unless you have some other direct or indirect hold on the dnode. (An indirect
2029 2021 * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
2030 2022 * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
2031 2023 * dnode's parent dbuf evicting its dnode handles.
2032 2024 */
2033 2025 #pragma weak dmu_buf_rele = dbuf_rele
2034 2026 void
2035 2027 dbuf_rele(dmu_buf_impl_t *db, void *tag)
2036 2028 {
2037 2029 mutex_enter(&db->db_mtx);
2038 2030 dbuf_rele_and_unlock(db, tag);
2039 2031 }
2040 2032
2041 2033 /*
2042 2034 * dbuf_rele() for an already-locked dbuf. This is necessary to allow
2043 2035 * db_dirtycnt and db_holds to be updated atomically.
2044 2036 */
2045 2037 void
2046 2038 dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
2047 2039 {
2048 2040 int64_t holds;
2049 2041
2050 2042 ASSERT(MUTEX_HELD(&db->db_mtx));
2051 2043 DBUF_VERIFY(db);
2052 2044
2053 2045 /*
2054 2046 * Remove the reference to the dbuf before removing its hold on the
2055 2047 * dnode so we can guarantee in dnode_move() that a referenced bonus
2056 2048 * buffer has a corresponding dnode hold.
2057 2049 */
2058 2050 holds = refcount_remove(&db->db_holds, tag);
2059 2051 ASSERT(holds >= 0);
2060 2052
2061 2053 /*
2062 2054 * We can't freeze indirects if there is a possibility that they
2063 2055 * may be modified in the current syncing context.
2064 2056 */
2065 2057 if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
2066 2058 arc_buf_freeze(db->db_buf);
2067 2059
2068 2060 if (holds == db->db_dirtycnt &&
2069 2061 db->db_level == 0 && db->db_immediate_evict)
2070 2062 dbuf_evict_user(db);
2071 2063
2072 2064 if (holds == 0) {
2073 2065 if (db->db_blkid == DMU_BONUS_BLKID) {
2074 2066 mutex_exit(&db->db_mtx);
2075 2067
2076 2068 /*
2077 2069 * If the dnode moves here, we cannot cross this barrier
2078 2070 * until the move completes.
2079 2071 */
2080 2072 DB_DNODE_ENTER(db);
2081 2073 (void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count);
2082 2074 DB_DNODE_EXIT(db);
2083 2075 /*
2084 2076 * The bonus buffer's dnode hold is no longer discounted
2085 2077 * in dnode_move(). The dnode cannot move until after
2086 2078 * the dnode_rele().
2087 2079 */
2088 2080 dnode_rele(DB_DNODE(db), db);
2089 2081 } else if (db->db_buf == NULL) {
2090 2082 /*
2091 2083 * This is a special case: we never associated this
2092 2084 * dbuf with any data allocated from the ARC.
2093 2085 */
2094 2086 ASSERT(db->db_state == DB_UNCACHED ||
2095 2087 db->db_state == DB_NOFILL);
2096 2088 dbuf_evict(db);
2097 2089 } else if (arc_released(db->db_buf)) {
2098 2090 arc_buf_t *buf = db->db_buf;
2099 2091 /*
2100 2092 * This dbuf has anonymous data associated with it.
2101 2093 */
2102 2094 dbuf_set_data(db, NULL);
2103 2095 VERIFY(arc_buf_remove_ref(buf, db));
2104 2096 dbuf_evict(db);
2105 2097 } else {
2106 2098 VERIFY(!arc_buf_remove_ref(db->db_buf, db));
2107 2099
2108 2100 /*
2109 2101 * A dbuf will be eligible for eviction if either the
2110 2102 * 'primarycache' property excludes it from caching or a
2111 2103 * duplicate copy of this buffer is already cached in the arc.
2112 2104 *
2113 2105 * In the case of the 'primarycache' a buffer
2114 2106 * is considered for eviction if it matches the
2115 2107 * criteria set in the property.
2116 2108 *
2117 2109 * To decide if our buffer is considered a
2118 2110 * duplicate, we must call into the arc to determine
2119 2111 * if multiple buffers are referencing the same
2120 2112 * block on-disk. If so, then we simply evict
2121 2113 * ourselves.
2122 2114 */
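			/*
			 * Illustrative note (an assumption about the tunable,
			 * not part of this change): with
			 * "zfs set primarycache=metadata" on the dataset,
			 * DBUF_IS_CACHEABLE() treats level-0 data buffers as
			 * non-cacheable, so they take the dbuf_clear() path
			 * below on last release.
			 */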
2123 2115 if (!DBUF_IS_CACHEABLE(db) ||
2124 2116 arc_buf_eviction_needed(db->db_buf))
2125 2117 dbuf_clear(db);
2126 2118 else
2127 2119 mutex_exit(&db->db_mtx);
2128 2120 }
2129 2121 } else {
2130 2122 mutex_exit(&db->db_mtx);
2131 2123 }
2132 2124 }
2133 2125
2134 2126 #pragma weak dmu_buf_refcount = dbuf_refcount
2135 2127 uint64_t
2136 2128 dbuf_refcount(dmu_buf_impl_t *db)
2137 2129 {
2138 2130 return (refcount_count(&db->db_holds));
2139 2131 }
2140 2132
2141 2133 void *
2142 2134 dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
2143 2135 dmu_buf_evict_func_t *evict_func)
2144 2136 {
2145 2137 return (dmu_buf_update_user(db_fake, NULL, user_ptr,
2146 2138 user_data_ptr_ptr, evict_func));
2147 2139 }
2148 2140
2149 2141 void *
2150 2142 dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
2151 2143 dmu_buf_evict_func_t *evict_func)
2152 2144 {
2153 2145 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2154 2146
2155 2147 db->db_immediate_evict = TRUE;
2156 2148 return (dmu_buf_update_user(db_fake, NULL, user_ptr,
2157 2149 user_data_ptr_ptr, evict_func));
2158 2150 }
2159 2151
2160 2152 void *
2161 2153 dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
2162 2154 void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
2163 2155 {
2164 2156 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2165 2157 ASSERT(db->db_level == 0);
2166 2158
2167 2159 ASSERT((user_ptr == NULL) == (evict_func == NULL));
2168 2160
2169 2161 mutex_enter(&db->db_mtx);
2170 2162
2171 2163 if (db->db_user_ptr == old_user_ptr) {
2172 2164 db->db_user_ptr = user_ptr;
2173 2165 db->db_user_data_ptr_ptr = user_data_ptr_ptr;
2174 2166 db->db_evict_func = evict_func;
2175 2167
2176 2168 dbuf_update_data(db);
2177 2169 } else {
2178 2170 old_user_ptr = db->db_user_ptr;
2179 2171 }
2180 2172
2181 2173 mutex_exit(&db->db_mtx);
2182 2174 return (old_user_ptr);
2183 2175 }
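/*
 * Usage sketch (an assumption about consumers, not taken from this change):
 * callers typically treat a non-NULL return as losing the race to attach
 * their per-buffer state, e.g.:
 *
 *	winner = dmu_buf_set_user(db, my_state, &my_state->db_data,
 *	    my_evict_cb);
 *	if (winner != NULL)
 *		my_state = winner;
 *
 * A non-NULL "winner" means another thread attached its state first;
 * "my_state", "db_data", and "my_evict_cb" are hypothetical names.
 */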
2184 2176
2185 2177 void *
2186 2178 dmu_buf_get_user(dmu_buf_t *db_fake)
2187 2179 {
2188 2180 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2189 2181 ASSERT(!refcount_is_zero(&db->db_holds));
2190 2182
2191 2183 return (db->db_user_ptr);
2192 2184 }
2193 2185
2194 2186 boolean_t
2195 2187 dmu_buf_freeable(dmu_buf_t *dbuf)
2196 2188 {
2197 2189 boolean_t res = B_FALSE;
2198 2190 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
2199 2191
2200 2192 if (db->db_blkptr)
2201 2193 res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
2202 2194 db->db_blkptr, db->db_blkptr->blk_birth);
2203 2195
2204 2196 return (res);
2205 2197 }
2206 2198
2207 2199 blkptr_t *
2208 2200 dmu_buf_get_blkptr(dmu_buf_t *db)
2209 2201 {
2210 2202 dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
2211 2203 return (dbi->db_blkptr);
2212 2204 }
2213 2205
2214 2206 static void
2215 2207 dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
2216 2208 {
2217 2209 /* ASSERT(dmu_tx_is_syncing(tx)) */
2218 2210 ASSERT(MUTEX_HELD(&db->db_mtx));
2219 2211
2220 2212 if (db->db_blkptr != NULL)
2221 2213 return;
2222 2214
2223 2215 if (db->db_blkid == DMU_SPILL_BLKID) {
2224 2216 db->db_blkptr = &dn->dn_phys->dn_spill;
2225 2217 BP_ZERO(db->db_blkptr);
2226 2218 return;
2227 2219 }
2228 2220 if (db->db_level == dn->dn_phys->dn_nlevels-1) {
2229 2221 /*
2230 2222 * This buffer was allocated at a time when there were
2231 2223 * no available blkptrs from the dnode, or it was
2232 2224 * inappropriate to hook it in (i.e., nlevels mismatch).
2233 2225 */
2234 2226 ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
2235 2227 ASSERT(db->db_parent == NULL);
2236 2228 db->db_parent = dn->dn_dbuf;
2237 2229 db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
2238 2230 DBUF_VERIFY(db);
2239 2231 } else {
2240 2232 dmu_buf_impl_t *parent = db->db_parent;
2241 2233 int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2242 2234
2243 2235 ASSERT(dn->dn_phys->dn_nlevels > 1);
2244 2236 if (parent == NULL) {
2245 2237 mutex_exit(&db->db_mtx);
2246 2238 rw_enter(&dn->dn_struct_rwlock, RW_READER);
2247 2239 (void) dbuf_hold_impl(dn, db->db_level+1,
2248 2240 db->db_blkid >> epbs, FALSE, db, &parent);
2249 2241 rw_exit(&dn->dn_struct_rwlock);
2250 2242 mutex_enter(&db->db_mtx);
2251 2243 db->db_parent = parent;
2252 2244 }
2253 2245 db->db_blkptr = (blkptr_t *)parent->db.db_data +
2254 2246 (db->db_blkid & ((1ULL << epbs) - 1));
2255 2247 DBUF_VERIFY(db);
2256 2248 }
2257 2249 }
2258 2250
2259 2251 static void
2260 2252 dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2261 2253 {
2262 2254 dmu_buf_impl_t *db = dr->dr_dbuf;
2263 2255 dnode_t *dn;
2264 2256 zio_t *zio;
2265 2257
2266 2258 ASSERT(dmu_tx_is_syncing(tx));
2267 2259
2268 2260 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
2269 2261
2270 2262 mutex_enter(&db->db_mtx);
2271 2263
2272 2264 ASSERT(db->db_level > 0);
2273 2265 DBUF_VERIFY(db);
2274 2266
2275 2267 /* Read the block if it hasn't been read yet. */
2276 2268 if (db->db_buf == NULL) {
2277 2269 mutex_exit(&db->db_mtx);
2278 2270 (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
2279 2271 mutex_enter(&db->db_mtx);
2280 2272 }
2281 2273 ASSERT3U(db->db_state, ==, DB_CACHED);
2282 2274 ASSERT(db->db_buf != NULL);
2283 2275
2284 2276 DB_DNODE_ENTER(db);
2285 2277 dn = DB_DNODE(db);
2286 2278 /* Indirect block size must match what the dnode thinks it is. */
2287 2279 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2288 2280 dbuf_check_blkptr(dn, db);
2289 2281 DB_DNODE_EXIT(db);
2290 2282
2291 2283 /* Provide the pending dirty record to child dbufs */
2292 2284 db->db_data_pending = dr;
2293 2285
2294 2286 mutex_exit(&db->db_mtx);
2295 2287 dbuf_write(dr, db->db_buf, tx);
2296 2288
2297 2289 zio = dr->dr_zio;
2298 2290 mutex_enter(&dr->dt.di.dr_mtx);
2299 2291 dbuf_sync_list(&dr->dt.di.dr_children, tx);
2300 2292 ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
2301 2293 mutex_exit(&dr->dt.di.dr_mtx);
2302 2294 zio_nowait(zio);
2303 2295 }
2304 2296
2305 2297 static void
2306 2298 dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2307 2299 {
2308 2300 arc_buf_t **datap = &dr->dt.dl.dr_data;
2309 2301 dmu_buf_impl_t *db = dr->dr_dbuf;
2310 2302 dnode_t *dn;
2311 2303 objset_t *os;
2312 2304 uint64_t txg = tx->tx_txg;
2313 2305
2314 2306 ASSERT(dmu_tx_is_syncing(tx));
2315 2307
2316 2308 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
2317 2309
2318 2310 mutex_enter(&db->db_mtx);
2319 2311 /*
2320 2312 * To be synced, we must be dirtied. But we
2321 2313 * might have been freed after the dirty.
2322 2314 */
2323 2315 if (db->db_state == DB_UNCACHED) {
2324 2316 /* This buffer has been freed since it was dirtied */
2325 2317 ASSERT(db->db.db_data == NULL);
2326 2318 } else if (db->db_state == DB_FILL) {
2327 2319 /* This buffer was freed and is now being re-filled */
2328 2320 ASSERT(db->db.db_data != dr->dt.dl.dr_data);
2329 2321 } else {
2330 2322 ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
2331 2323 }
2332 2324 DBUF_VERIFY(db);
2333 2325
2334 2326 DB_DNODE_ENTER(db);
2335 2327 dn = DB_DNODE(db);
2336 2328
2337 2329 if (db->db_blkid == DMU_SPILL_BLKID) {
2338 2330 mutex_enter(&dn->dn_mtx);
2339 2331 dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
2340 2332 mutex_exit(&dn->dn_mtx);
2341 2333 }
2342 2334
2343 2335 /*
2344 2336 * If this is a bonus buffer, simply copy the bonus data into the
2345 2337 * dnode. It will be written out when the dnode is synced (and it
2346 2338 * will be synced, since it must have been dirty for dbuf_sync to
2347 2339 * be called).
2348 2340 */
2349 2341 if (db->db_blkid == DMU_BONUS_BLKID) {
2350 2342 dbuf_dirty_record_t **drp;
2351 2343
2352 2344 ASSERT(*datap != NULL);
2353 2345 ASSERT0(db->db_level);
2354 2346 ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
2355 2347 bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
2356 2348 DB_DNODE_EXIT(db);
2357 2349
2358 2350 if (*datap != db->db.db_data) {
2359 2351 zio_buf_free(*datap, DN_MAX_BONUSLEN);
2360 2352 arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
2361 2353 }
2362 2354 db->db_data_pending = NULL;
2363 2355 drp = &db->db_last_dirty;
2364 2356 while (*drp != dr)
2365 2357 drp = &(*drp)->dr_next;
2366 2358 ASSERT(dr->dr_next == NULL);
2367 2359 ASSERT(dr->dr_dbuf == db);
2368 2360 *drp = dr->dr_next;
2369 2361 kmem_free(dr, sizeof (dbuf_dirty_record_t));
2370 2362 ASSERT(db->db_dirtycnt > 0);
2371 2363 db->db_dirtycnt -= 1;
2372 2364 dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
2373 2365 return;
2374 2366 }
2375 2367
2376 2368 os = dn->dn_objset;
2377 2369
2378 2370 /*
2379 2371 * This function may have dropped the db_mtx lock allowing a dmu_sync
2380 2372 * operation to sneak in. As a result, we need to ensure that we
2381 2373 * don't check the dr_override_state until we have returned from
2382 2374 * dbuf_check_blkptr.
2383 2375 */
2384 2376 dbuf_check_blkptr(dn, db);
2385 2377
2386 2378 /*
2387 2379 * If this buffer is in the middle of an immediate write,
2388 2380 * wait for the synchronous IO to complete.
2389 2381 */
2390 2382 while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
2391 2383 ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
2392 2384 cv_wait(&db->db_changed, &db->db_mtx);
2393 2385 ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
2394 2386 }
2395 2387
2396 2388 if (db->db_state != DB_NOFILL &&
2397 2389 dn->dn_object != DMU_META_DNODE_OBJECT &&
2398 2390 refcount_count(&db->db_holds) > 1 &&
2399 2391 dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
2400 2392 *datap == db->db_buf) {
2401 2393 /*
2402 2394 * If this buffer is currently "in use" (i.e., there
2403 2395 * are active holds and db_data still references it),
2404 2396 * then make a copy before we start the write so that
2405 2397 * any modifications from the open txg will not leak
2406 2398 * into this write.
2407 2399 *
2408 2400 * NOTE: this copy does not need to be made for
2409 2401 * objects only modified in the syncing context (e.g.
2410 2402 * DMU_OT_DNODE blocks).
2411 2403 */
2412 2404 int blksz = arc_buf_size(*datap);
2413 2405 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
2414 2406 *datap = arc_buf_alloc(os->os_spa, blksz, db, type);
2415 2407 bcopy(db->db.db_data, (*datap)->b_data, blksz);
2416 2408 }
2417 2409 db->db_data_pending = dr;
2418 2410
2419 2411 mutex_exit(&db->db_mtx);
2420 2412
2421 2413 dbuf_write(dr, *datap, tx);
2422 2414
2423 2415 ASSERT(!list_link_active(&dr->dr_dirty_node));
2424 2416 if (dn->dn_object == DMU_META_DNODE_OBJECT) {
2425 2417 list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
2426 2418 DB_DNODE_EXIT(db);
2427 2419 } else {
2428 2420 /*
2429 2421 * Although zio_nowait() does not "wait for an IO", it does
2430 2422 * initiate the IO. If this is an empty write it seems plausible
2431 2423 * that the IO could actually be completed before the nowait
2432 2424 * returns. We need to DB_DNODE_EXIT() first in case
2433 2425 * zio_nowait() invalidates the dbuf.
2434 2426 */
2435 2427 DB_DNODE_EXIT(db);
2436 2428 zio_nowait(dr->dr_zio);
2437 2429 }
2438 2430 }
2439 2431
2440 2432 void
2441 2433 dbuf_sync_list(list_t *list, dmu_tx_t *tx)
2442 2434 {
2443 2435 dbuf_dirty_record_t *dr;
2444 2436
2445 2437 while (dr = list_head(list)) {
2446 2438 if (dr->dr_zio != NULL) {
2447 2439 /*
2448 2440 * If we find an already initialized zio then we
2449 2441 * are processing the meta-dnode, and we have finished.
2450 2442 * The dbufs for all dnodes are put back on the list
2451 2443 * during processing, so that we can zio_wait()
2452 2444 * these IOs after initiating all child IOs.
2453 2445 */
2454 2446 ASSERT3U(dr->dr_dbuf->db.db_object, ==,
2455 2447 DMU_META_DNODE_OBJECT);
2456 2448 break;
2457 2449 }
2458 2450 list_remove(list, dr);
2459 2451 if (dr->dr_dbuf->db_level > 0)
2460 2452 dbuf_sync_indirect(dr, tx);
2461 2453 else
2462 2454 dbuf_sync_leaf(dr, tx);
2463 2455 }
2464 2456 }
2465 2457
2466 2458 /* ARGSUSED */
2467 2459 static void
2468 2460 dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
2469 2461 {
2470 2462 dmu_buf_impl_t *db = vdb;
2471 2463 dnode_t *dn;
2472 2464 blkptr_t *bp = zio->io_bp;
2473 2465 blkptr_t *bp_orig = &zio->io_bp_orig;
2474 2466 spa_t *spa = zio->io_spa;
2475 2467 int64_t delta;
2476 2468 uint64_t fill = 0;
2477 2469 int i;
2478 2470
2479 2471 ASSERT(db->db_blkptr == bp);
2480 2472
2481 2473 DB_DNODE_ENTER(db);
2482 2474 dn = DB_DNODE(db);
2483 2475 delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
2484 2476 dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
2485 2477 zio->io_prev_space_delta = delta;
2486 2478
2487 2479 if (BP_IS_HOLE(bp)) {
2488 2480 ASSERT(bp->blk_fill == 0);
2489 2481 DB_DNODE_EXIT(db);
2490 2482 return;
2491 2483 }
2492 2484
2493 2485 ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
2494 2486 BP_GET_TYPE(bp) == dn->dn_type) ||
2495 2487 (db->db_blkid == DMU_SPILL_BLKID &&
2496 2488 BP_GET_TYPE(bp) == dn->dn_bonustype));
2497 2489 ASSERT(BP_GET_LEVEL(bp) == db->db_level);
2498 2490
2499 2491 mutex_enter(&db->db_mtx);
2500 2492
2501 2493 #ifdef ZFS_DEBUG
2502 2494 if (db->db_blkid == DMU_SPILL_BLKID) {
2503 2495 ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
2504 2496 ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
2505 2497 db->db_blkptr == &dn->dn_phys->dn_spill);
2506 2498 }
2507 2499 #endif
2508 2500
2509 2501 if (db->db_level == 0) {
2510 2502 mutex_enter(&dn->dn_mtx);
2511 2503 if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
2512 2504 db->db_blkid != DMU_SPILL_BLKID)
2513 2505 dn->dn_phys->dn_maxblkid = db->db_blkid;
2514 2506 mutex_exit(&dn->dn_mtx);
2515 2507
2516 2508 if (dn->dn_type == DMU_OT_DNODE) {
2517 2509 dnode_phys_t *dnp = db->db.db_data;
2518 2510 for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
2519 2511 i--, dnp++) {
2520 2512 if (dnp->dn_type != DMU_OT_NONE)
2521 2513 fill++;
2522 2514 }
2523 2515 } else {
2524 2516 fill = 1;
2525 2517 }
2526 2518 } else {
2527 2519 blkptr_t *ibp = db->db.db_data;
2528 2520 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2529 2521 for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
2530 2522 if (BP_IS_HOLE(ibp))
2531 2523 continue;
2532 2524 fill += ibp->blk_fill;
2533 2525 }
2534 2526 }
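	/*
	 * Editorial worked example: for a DMU_OT_DNODE block the loop above
	 * scans db.db_size >> DNODE_SHIFT dnode_phys_t slots; with a 16K
	 * block and 512-byte dnodes that is 16384 >> 9 = 32 slots, and only
	 * slots whose dn_type is not DMU_OT_NONE contribute to blk_fill.
	 * For an indirect block, blk_fill is instead the sum of the
	 * children's blk_fill over the non-hole block pointers.
	 */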
2535 2527 DB_DNODE_EXIT(db);
2536 2528
2537 2529 bp->blk_fill = fill;
2538 2530
2539 2531 mutex_exit(&db->db_mtx);
2540 2532 }
2541 2533
2542 2534 /*
2543 2535 * The SPA will call this callback several times for each zio - once
2544 2536 * for every physical child i/o (zio->io_phys_children times). This
2545 2537 * allows the DMU to monitor the progress of each logical i/o. For example,
2546 2538 * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
2547 2539 * block. There may be a long delay before all copies/fragments are completed,
2548 2540 * so this callback allows us to retire dirty space gradually, as the physical
2549 2541 * i/os complete.
2550 2542 */
2551 2543 /* ARGSUSED */
2552 2544 static void
2553 2545 dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
2554 2546 {
2555 2547 dmu_buf_impl_t *db = arg;
2556 2548 objset_t *os = db->db_objset;
2557 2549 dsl_pool_t *dp = dmu_objset_pool(os);
2558 2550 dbuf_dirty_record_t *dr;
2559 2551 int delta = 0;
2560 2552
2561 2553 dr = db->db_data_pending;
2562 2554 ASSERT3U(dr->dr_txg, ==, zio->io_txg);
2563 2555
2564 2556 /*
2565 2557 * The callback will be called io_phys_children times. Retire one
2566 2558 * portion of our dirty space each time we are called. Any rounding
2567 2559 * error will be cleaned up by dsl_pool_sync()'s call to
2568 2560 * dsl_pool_undirty_space().
2569 2561 */
2570 2562 delta = dr->dr_accounted / zio->io_phys_children;
2571 2563 dsl_pool_undirty_space(dp, delta, zio->io_txg);
2572 2564 }
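/*
 * Editorial worked example (illustrative numbers): if dr_accounted is
 * 131072 bytes and the zio has 3 physical children, each callback
 * undirties 131072 / 3 = 43690 bytes, and the 2 leftover bytes from
 * integer division are the rounding error that dsl_pool_sync() later
 * cleans up via dsl_pool_undirty_space().
 */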
2573 2565
2574 2566 /* ARGSUSED */
2575 2567 static void
2576 2568 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
2577 2569 {
2578 2570 dmu_buf_impl_t *db = vdb;
2579 2571 blkptr_t *bp = zio->io_bp;
2580 2572 blkptr_t *bp_orig = &zio->io_bp_orig;
2581 2573 uint64_t txg = zio->io_txg;
2582 2574 dbuf_dirty_record_t **drp, *dr;
2583 2575
2584 2576 ASSERT0(zio->io_error);
2585 2577 ASSERT(db->db_blkptr == bp);
2586 2578
2587 2579 /*
2588 2580 * For nopwrites and rewrites we ensure that the bp matches our
2589 2581 * original and bypass all the accounting.
2590 2582 */
2591 2583 if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
2592 2584 ASSERT(BP_EQUAL(bp, bp_orig));
2593 2585 } else {
2594 2586 objset_t *os;
2595 2587 dsl_dataset_t *ds;
2596 2588 dmu_tx_t *tx;
2597 2589
2598 2590 DB_GET_OBJSET(&os, db);
2599 2591 ds = os->os_dsl_dataset;
2600 2592 tx = os->os_synctx;
2601 2593
2602 2594 (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
2603 2595 dsl_dataset_block_born(ds, bp, tx);
2604 2596 }
2605 2597
2606 2598 mutex_enter(&db->db_mtx);
2607 2599
2608 2600 DBUF_VERIFY(db);
2609 2601
2610 2602 drp = &db->db_last_dirty;
2611 2603 while ((dr = *drp) != db->db_data_pending)
2612 2604 drp = &dr->dr_next;
2613 2605 ASSERT(!list_link_active(&dr->dr_dirty_node));
2614 2606 ASSERT(dr->dr_txg == txg);
2615 2607 ASSERT(dr->dr_dbuf == db);
2616 2608 ASSERT(dr->dr_next == NULL);
2617 2609 *drp = dr->dr_next;
2618 2610
2619 2611 #ifdef ZFS_DEBUG
2620 2612 if (db->db_blkid == DMU_SPILL_BLKID) {
2621 2613 dnode_t *dn;
2622 2614
2623 2615 DB_DNODE_ENTER(db);
2624 2616 dn = DB_DNODE(db);
2625 2617 ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
2626 2618 ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
2627 2619 db->db_blkptr == &dn->dn_phys->dn_spill);
2628 2620 DB_DNODE_EXIT(db);
2629 2621 }
2630 2622 #endif
2631 2623
2632 2624 if (db->db_level == 0) {
2633 2625 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
2634 2626 ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
2635 2627 if (db->db_state != DB_NOFILL) {
2636 2628 if (dr->dt.dl.dr_data != db->db_buf)
2637 2629 VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
2638 2630 db));
2639 2631 else if (!arc_released(db->db_buf))
2640 2632 arc_set_callback(db->db_buf, dbuf_do_evict, db);
2641 2633 }
2642 2634 } else {
2643 2635 dnode_t *dn;
2644 2636
2645 2637 DB_DNODE_ENTER(db);
2646 2638 dn = DB_DNODE(db);
2647 2639 ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
2648 2640 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2649 2641 if (!BP_IS_HOLE(db->db_blkptr)) {
2650 2642 int epbs =
2651 2643 dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2652 2644 ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
2653 2645 db->db.db_size);
2654 2646 ASSERT3U(dn->dn_phys->dn_maxblkid
2655 2647 >> (db->db_level * epbs), >=, db->db_blkid);
2656 2648 arc_set_callback(db->db_buf, dbuf_do_evict, db);
2657 2649 }
2658 2650 DB_DNODE_EXIT(db);
2659 2651 mutex_destroy(&dr->dt.di.dr_mtx);
2660 2652 list_destroy(&dr->dt.di.dr_children);
2661 2653 }
2662 2654 kmem_free(dr, sizeof (dbuf_dirty_record_t));
2663 2655
2664 2656 cv_broadcast(&db->db_changed);
2665 2657 ASSERT(db->db_dirtycnt > 0);
2666 2658 db->db_dirtycnt -= 1;
2667 2659 db->db_data_pending = NULL;
2668 2660
2669 2661 dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
2670 2662 }
2671 2663
2672 2664 static void
2673 2665 dbuf_write_nofill_ready(zio_t *zio)
2674 2666 {
2675 2667 dbuf_write_ready(zio, NULL, zio->io_private);
2676 2668 }
2677 2669
2678 2670 static void
2679 2671 dbuf_write_nofill_done(zio_t *zio)
2680 2672 {
2681 2673 dbuf_write_done(zio, NULL, zio->io_private);
2682 2674 }
2683 2675
2684 2676 static void
2685 2677 dbuf_write_override_ready(zio_t *zio)
2686 2678 {
2687 2679 dbuf_dirty_record_t *dr = zio->io_private;
2688 2680 dmu_buf_impl_t *db = dr->dr_dbuf;
2689 2681
2690 2682 dbuf_write_ready(zio, NULL, db);
2691 2683 }
2692 2684
2693 2685 static void
2694 2686 dbuf_write_override_done(zio_t *zio)
2695 2687 {
2696 2688 dbuf_dirty_record_t *dr = zio->io_private;
2697 2689 dmu_buf_impl_t *db = dr->dr_dbuf;
2698 2690 blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
2699 2691
2700 2692 mutex_enter(&db->db_mtx);
2701 2693 if (!BP_EQUAL(zio->io_bp, obp)) {
2702 2694 if (!BP_IS_HOLE(obp))
2703 2695 dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
2704 2696 arc_release(dr->dt.dl.dr_data, db);
2705 2697 }
2706 2698 mutex_exit(&db->db_mtx);
2707 2699
2708 2700 dbuf_write_done(zio, NULL, db);
2709 2701 }
2710 2702
2711 2703 /* Issue I/O to commit a dirty buffer to disk. */
2712 2704 static void
2713 2705 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
2714 2706 {
2715 2707 dmu_buf_impl_t *db = dr->dr_dbuf;
2716 2708 dnode_t *dn;
2717 2709 objset_t *os;
2718 2710 dmu_buf_impl_t *parent = db->db_parent;
2719 2711 uint64_t txg = tx->tx_txg;
2720 2712 zbookmark_t zb;
2721 2713 zio_prop_t zp;
2722 2714 zio_t *zio;
2723 2715 int wp_flag = 0;
2724 2716
2725 2717 DB_DNODE_ENTER(db);
2726 2718 dn = DB_DNODE(db);
2727 2719 os = dn->dn_objset;
2728 2720
2729 2721 if (db->db_state != DB_NOFILL) {
2730 2722 if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
2731 2723 /*
2732 2724 * Private object buffers are released here rather
2733 2725 * than in dbuf_dirty() since they are only modified
2734 2726 * in the syncing context and we don't want the
2735 2727 * overhead of making multiple copies of the data.
2736 2728 */
2737 2729 if (BP_IS_HOLE(db->db_blkptr)) {
2738 2730 arc_buf_thaw(data);
2739 2731 } else {
2740 2732 dbuf_release_bp(db);
2741 2733 }
2742 2734 }
2743 2735 }
2744 2736
2745 2737 if (parent != dn->dn_dbuf) {
2746 2738 /* Our parent is an indirect block. */
2747 2739 /* We have a dirty parent that has been scheduled for write. */
2748 2740 ASSERT(parent && parent->db_data_pending);
2749 2741 /* Our parent's buffer is one level closer to the dnode. */
2750 2742 ASSERT(db->db_level == parent->db_level-1);
2751 2743 /*
2752 2744 * We're about to modify our parent's db_data by modifying
2753 2745 * our block pointer, so the parent must be released.
2754 2746 */
2755 2747 ASSERT(arc_released(parent->db_buf));
2756 2748 zio = parent->db_data_pending->dr_zio;
2757 2749 } else {
2758 2750 /* Our parent is the dnode itself. */
2759 2751 ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
2760 2752 db->db_blkid != DMU_SPILL_BLKID) ||
2761 2753 (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
2762 2754 if (db->db_blkid != DMU_SPILL_BLKID)
2763 2755 ASSERT3P(db->db_blkptr, ==,
2764 2756 &dn->dn_phys->dn_blkptr[db->db_blkid]);
2765 2757 zio = dn->dn_zio;
2766 2758 }
2767 2759
2768 2760 ASSERT(db->db_level == 0 || data == db->db_buf);
2769 2761 ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
2770 2762 ASSERT(zio);
2771 2763
2772 2764 SET_BOOKMARK(&zb, os->os_dsl_dataset ?
2773 2765 os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
2774 2766 db->db.db_object, db->db_level, db->db_blkid);
2775 2767
2776 2768 if (db->db_blkid == DMU_SPILL_BLKID)
2777 2769 wp_flag = WP_SPILL;
2778 2770 wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
2779 2771
2780 2772 dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
2781 2773 DB_DNODE_EXIT(db);
2782 2774
2783 2775 if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
2784 2776 ASSERT(db->db_state != DB_NOFILL);
2785 2777 dr->dr_zio = zio_write(zio, os->os_spa, txg,
2786 2778 db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
2787 2779 dbuf_write_override_ready, NULL, dbuf_write_override_done,
2788 2780 dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
2789 2781 mutex_enter(&db->db_mtx);
2790 2782 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
2791 2783 zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
2792 2784 dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
2793 2785 mutex_exit(&db->db_mtx);
2794 2786 } else if (db->db_state == DB_NOFILL) {
2795 2787 ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
2796 2788 zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
2797 2789 dr->dr_zio = zio_write(zio, os->os_spa, txg,
2798 2790 db->db_blkptr, NULL, db->db.db_size, &zp,
2799 2791 dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db,
2800 2792 ZIO_PRIORITY_ASYNC_WRITE,
2801 2793 ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
2802 2794 } else {
2803 2795 ASSERT(arc_released(data));
2804 2796 dr->dr_zio = arc_write(zio, os->os_spa, txg,
2805 2797 db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
2806 2798 DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
2807 2799 dbuf_write_physdone, dbuf_write_done, db,
2808 2800 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
2809 2801 }
2810 2802 }
1456 lines elided