1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
24 * Copyright (c) 2013 by Delphix. All rights reserved.
25 */
26
27 #include <sys/zfs_context.h>
28 #include <sys/dmu.h>
29 #include <sys/dmu_impl.h>
30 #include <sys/dbuf.h>
31 #include <sys/dmu_objset.h>
32 #include <sys/dsl_dataset.h>
33 #include <sys/dsl_dir.h>
34 #include <sys/dmu_tx.h>
35 #include <sys/spa.h>
36 #include <sys/zio.h>
37 #include <sys/dmu_zfetch.h>
38 #include <sys/sa.h>
39 #include <sys/sa_impl.h>
40
41 static void dbuf_destroy(dmu_buf_impl_t *db);
42 static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
43 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
44
45 /*
46 * Global data structures and functions for the dbuf cache.
47 */
48 static kmem_cache_t *dbuf_cache;
49
50 /* ARGSUSED */
51 static int
52 dbuf_cons(void *vdb, void *unused, int kmflag)
53 {
54 dmu_buf_impl_t *db = vdb;
55 bzero(db, sizeof (dmu_buf_impl_t));
56
57 mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
58 cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
59 refcount_create(&db->db_holds);
60 return (0);
61 }
62
63 /* ARGSUSED */
64 static void
65 dbuf_dest(void *vdb, void *unused)
66 {
67 dmu_buf_impl_t *db = vdb;
68 mutex_destroy(&db->db_mtx);
69 cv_destroy(&db->db_changed);
70 refcount_destroy(&db->db_holds);
71 }
72
73 /*
74 * dbuf hash table routines
75 */
76 static dbuf_hash_table_t dbuf_hash_table;
77
78 static uint64_t dbuf_hash_count;
79
80 static uint64_t
81 dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
82 {
83 uintptr_t osv = (uintptr_t)os;
84 uint64_t crc = -1ULL;
85
86 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
87 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
88 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
89 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
90 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
91 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
92 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
93
94 crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
95
96 return (crc);
97 }
98
99 #define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid)
100
101 #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \
102 ((dbuf)->db.db_object == (obj) && \
103 (dbuf)->db_objset == (os) && \
104 (dbuf)->db_level == (level) && \
105 (dbuf)->db_blkid == (blkid))
106
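/*
 * Find a dbuf in the hash table for the given dnode, level, and block id.
 * If found (and not in the DB_EVICTING state), return it with db_mtx held;
 * otherwise return NULL.
 */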
107 dmu_buf_impl_t *
108 dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
109 {
110 dbuf_hash_table_t *h = &dbuf_hash_table;
111 objset_t *os = dn->dn_objset;
112 uint64_t obj = dn->dn_object;
113 uint64_t hv = DBUF_HASH(os, obj, level, blkid);
114 uint64_t idx = hv & h->hash_table_mask;
115 dmu_buf_impl_t *db;
116
117 mutex_enter(DBUF_HASH_MUTEX(h, idx));
118 for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
119 if (DBUF_EQUAL(db, os, obj, level, blkid)) {
120 mutex_enter(&db->db_mtx);
121 if (db->db_state != DB_EVICTING) {
122 mutex_exit(DBUF_HASH_MUTEX(h, idx));
123 return (db);
124 }
125 mutex_exit(&db->db_mtx);
126 }
127 }
128 mutex_exit(DBUF_HASH_MUTEX(h, idx));
129 return (NULL);
130 }
131
132 /*
133 * Insert an entry into the hash table. If there is already an element
134 * equal to elem in the hash table, then the already existing element
135 * will be returned and the new element will not be inserted.
136 * Otherwise returns NULL.
137 */
138 static dmu_buf_impl_t *
139 dbuf_hash_insert(dmu_buf_impl_t *db)
140 {
141 dbuf_hash_table_t *h = &dbuf_hash_table;
142 objset_t *os = db->db_objset;
143 uint64_t obj = db->db.db_object;
144 int level = db->db_level;
145 uint64_t blkid = db->db_blkid;
146 uint64_t hv = DBUF_HASH(os, obj, level, blkid);
147 uint64_t idx = hv & h->hash_table_mask;
148 dmu_buf_impl_t *dbf;
149
150 mutex_enter(DBUF_HASH_MUTEX(h, idx));
151 for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
152 if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
153 mutex_enter(&dbf->db_mtx);
154 if (dbf->db_state != DB_EVICTING) {
155 mutex_exit(DBUF_HASH_MUTEX(h, idx));
156 return (dbf);
157 }
158 mutex_exit(&dbf->db_mtx);
159 }
160 }
161
162 mutex_enter(&db->db_mtx);
163 db->db_hash_next = h->hash_table[idx];
164 h->hash_table[idx] = db;
165 mutex_exit(DBUF_HASH_MUTEX(h, idx));
166 atomic_add_64(&dbuf_hash_count, 1);
167
168 return (NULL);
169 }
170
171 /*
172 * Remove an entry from the hash table. This operation will
173 * fail if there are any existing holds on the db.
174 */
175 static void
176 dbuf_hash_remove(dmu_buf_impl_t *db)
177 {
178 dbuf_hash_table_t *h = &dbuf_hash_table;
179 uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
180 db->db_level, db->db_blkid);
181 uint64_t idx = hv & h->hash_table_mask;
182 dmu_buf_impl_t *dbf, **dbp;
183
184 /*
185  * We mustn't hold db_mtx to maintain lock ordering:
186 * DBUF_HASH_MUTEX > db_mtx.
187 */
188 ASSERT(refcount_is_zero(&db->db_holds));
189 ASSERT(db->db_state == DB_EVICTING);
190 ASSERT(!MUTEX_HELD(&db->db_mtx));
191
192 mutex_enter(DBUF_HASH_MUTEX(h, idx));
193 dbp = &h->hash_table[idx];
194 while ((dbf = *dbp) != db) {
195 dbp = &dbf->db_hash_next;
196 ASSERT(dbf != NULL);
197 }
198 *dbp = db->db_hash_next;
199 db->db_hash_next = NULL;
200 mutex_exit(DBUF_HASH_MUTEX(h, idx));
201 atomic_add_64(&dbuf_hash_count, -1);
202 }
203
204 static arc_evict_func_t dbuf_do_evict;
205
206 static void
207 dbuf_verify_user(dmu_buf_impl_t *db, boolean_t evicting)
208 {
209 #ifdef ZFS_DEBUG
210
211 if (db->db_level != 0)
212 ASSERT(db->db_user == NULL);
213
214 if (db->db_user == NULL)
215 return;
216
217 /* Clients must resolve a dbuf before attaching user data. */
218 ASSERT(db->db.db_data != NULL && db->db_state == DB_CACHED);
219 /*
220  * We can't check the hold count here, because holds are modified
221  * independently of the dbuf mutex. But it would be nice to ensure
222  * that the user has the appropriate number.
223 */
224 #endif
225 }
226
227 /*
228 * Evict the dbuf's user, either immediately, or use a provided queue.
229 *
230 * Call dmu_buf_process_user_evicts or dmu_buf_destroy_user_evict_list
231 * on the list when finished generating it.
232 *
233 * NOTE: If db->db_immediate_evict is FALSE, evict_list_p must be provided.
234 * NOTE: See dmu_buf_user_t about how this process works.
235 */
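/*
 * Typical caller pattern (a sketch based on callers in this file, e.g.
 * dbuf_loan_arcbuf()): build the evict list while holding db_mtx, then
 * process/destroy it after dropping the lock:
 *
 *	list_t evict_list;
 *
 *	dmu_buf_create_user_evict_list(&evict_list);
 *	mutex_enter(&db->db_mtx);
 *	dbuf_evict_user(db, &evict_list);
 *	mutex_exit(&db->db_mtx);
 *	dmu_buf_destroy_user_evict_list(&evict_list);
 */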
236 static void
237 dbuf_evict_user(dmu_buf_impl_t *db, list_t *evict_list_p)
238 {
239 ASSERT(MUTEX_HELD(&db->db_mtx));
240 ASSERT(evict_list_p != NULL);
241 dbuf_verify_user(db, /*evicting*/B_TRUE);
242
243 if (db->db_user == NULL)
244 return;
245
246 ASSERT(!list_link_active(&db->db_user->evict_queue_link));
247 list_insert_head(evict_list_p, db->db_user);
248 db->db_user = NULL;
249 }
250
251 /*
252 * Replace the current user of the dbuf. Requires that the caller knows who
253 * the old user is. Returns the old user, which may not necessarily be
254 * the same old_user provided by the caller.
255 */
256 dmu_buf_user_t *
257 dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user,
258 dmu_buf_user_t *new_user)
259 {
260 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
261
262 mutex_enter(&db->db_mtx);
263 dbuf_verify_user(db, /*evicting*/B_FALSE);
264 if (db->db_user == old_user)
265 db->db_user = new_user;
266 else
267 old_user = db->db_user;
268 dbuf_verify_user(db, /*evicting*/B_FALSE);
269 mutex_exit(&db->db_mtx);
270
271 return (old_user);
272 }
273
274 /*
275  * Set the user eviction data for the DMU buffer. Returns NULL on success,
276  * or the existing user if another user currently owns the buffer.
277 */
278 dmu_buf_user_t *
279 dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
280 {
281 return (dmu_buf_replace_user(db_fake, NULL, user));
282 }
283
284 dmu_buf_user_t *
285 dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user)
286 {
287 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
288
289 db->db_immediate_evict = TRUE;
290 return (dmu_buf_set_user(db_fake, user));
291 }
292
293 /*
294 * Remove the user eviction data for the DMU buffer.
295 */
296 dmu_buf_user_t *
297 dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
298 {
299 return (dmu_buf_replace_user(db_fake, user, NULL));
300 }
301
302 /*
303  * Returns the db_user set with dmu_buf_set_user(), or NULL if not set.
304 */
305 dmu_buf_user_t *
306 dmu_buf_get_user(dmu_buf_t *db_fake)
307 {
308 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
309
310 dbuf_verify_user(db, /*evicting*/B_FALSE);
311 return (db->db_user);
312 }
313
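/*
 * Detach the dbuf from its data buffer: queue the user for eviction,
 * NULL out db_buf and db_data, and mark the dbuf DB_UNCACHED (unless it
 * is DB_NOFILL). Caller must hold db_mtx.
 */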
314 static void
315 dbuf_clear_data(dmu_buf_impl_t *db, list_t *evict_list_p)
316 {
317 ASSERT(MUTEX_HELD(&db->db_mtx));
318 ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
319 dbuf_evict_user(db, evict_list_p);
320 db->db_buf = NULL;
321 db->db.db_data = NULL;
322 if (db->db_state != DB_NOFILL)
323 db->db_state = DB_UNCACHED;
324 }
325
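/*
 * Attach an ARC buffer to the dbuf and register dbuf_do_evict as its
 * eviction callback (unless the buffer has been released from the ARC).
 * Caller must hold db_mtx.
 */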
326 static void
327 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
328 {
329 ASSERT(MUTEX_HELD(&db->db_mtx));
330 ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
331 ASSERT(buf != NULL);
332
333 db->db_buf = buf;
334 ASSERT(buf->b_data != NULL);
335 db->db.db_data = buf->b_data;
336 if (!arc_released(buf))
337 arc_set_callback(buf, dbuf_do_evict, db);
338 }
339
340 boolean_t
341 dbuf_is_metadata(dmu_buf_impl_t *db)
342 {
343 if (db->db_level > 0) {
344 return (B_TRUE);
345 } else {
346 boolean_t is_metadata;
347
348 DB_DNODE_ENTER(db);
349 is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
350 DB_DNODE_EXIT(db);
351
352 return (is_metadata);
353 }
354 }
355
356 void
357 dbuf_evict(dmu_buf_impl_t *db, list_t *evict_list_p)
358 {
359 ASSERT(MUTEX_HELD(&db->db_mtx));
360 ASSERT(db->db_buf == NULL);
361 ASSERT(db->db_data_pending == NULL);
362
363 dbuf_clear(db, evict_list_p);
364 dbuf_destroy(db);
365 }
366
367 void
368 dbuf_init(void)
369 {
370 uint64_t hsize = 1ULL << 16;
371 dbuf_hash_table_t *h = &dbuf_hash_table;
372 int i;
373
374 /*
375 * The hash table is big enough to fill all of physical memory
376 * with an average 4K block size. The table will take up
377 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
378 */
379 while (hsize * 4096 < physmem * PAGESIZE)
380 hsize <<= 1;
381
382 retry:
383 h->hash_table_mask = hsize - 1;
384 h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
385 if (h->hash_table == NULL) {
386 /* XXX - we should really return an error instead of assert */
387 ASSERT(hsize > (1ULL << 10));
388 hsize >>= 1;
389 goto retry;
390 }
391
392 dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
393 sizeof (dmu_buf_impl_t),
394 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
395
396 for (i = 0; i < DBUF_MUTEXES; i++)
397 mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
398 }
399
400 void
401 dbuf_fini(void)
402 {
403 dbuf_hash_table_t *h = &dbuf_hash_table;
404 int i;
405
406 for (i = 0; i < DBUF_MUTEXES; i++)
407 mutex_destroy(&h->hash_mutexes[i]);
408 kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
409 kmem_cache_destroy(dbuf_cache);
410 }
411
412 /*
413 * Other stuff.
414 */
415
416 #ifdef ZFS_DEBUG
417 static void
418 dbuf_verify(dmu_buf_impl_t *db)
419 {
420 dnode_t *dn;
421 dbuf_dirty_record_t *dr;
422
423 ASSERT(MUTEX_HELD(&db->db_mtx));
424
425 if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
426 return;
427
428 ASSERT(db->db_objset != NULL);
429 DB_DNODE_ENTER(db);
430 dn = DB_DNODE(db);
431 if (dn == NULL) {
432 ASSERT(db->db_parent == NULL);
433 ASSERT(db->db_blkptr == NULL);
434 } else {
435 ASSERT3U(db->db.db_object, ==, dn->dn_object);
436 ASSERT3P(db->db_objset, ==, dn->dn_objset);
437 ASSERT3U(db->db_level, <, dn->dn_nlevels);
438 ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
439 db->db_blkid == DMU_SPILL_BLKID ||
440 !list_is_empty(&dn->dn_dbufs));
441 }
442 if (db->db_blkid == DMU_BONUS_BLKID) {
443 ASSERT(dn != NULL);
444 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
445 ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
446 } else if (db->db_blkid == DMU_SPILL_BLKID) {
447 ASSERT(dn != NULL);
448 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
449 ASSERT0(db->db.db_offset);
450 } else {
451 ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
452 }
453
454 for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
455 ASSERT(dr->dr_dbuf == db);
456
457 for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
458 ASSERT(dr->dr_dbuf == db);
459
460 /*
461 * We can't assert that db_size matches dn_datablksz because it
462 * can be momentarily different when another thread is doing
463 * dnode_set_blksz().
464 */
465 if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
466 dr = db->db_data_pending;
467 /*
468 * It should only be modified in syncing context, so
469 * make sure we only have one copy of the data.
470 */
471 ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
472 }
473
474 /* verify db->db_blkptr */
475 if (db->db_blkptr) {
476 if (db->db_parent == dn->dn_dbuf) {
477 /* db is pointed to by the dnode */
478 /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
479 if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
480 ASSERT(db->db_parent == NULL);
481 else
482 ASSERT(db->db_parent != NULL);
483 if (db->db_blkid != DMU_SPILL_BLKID)
484 ASSERT3P(db->db_blkptr, ==,
485 &dn->dn_phys->dn_blkptr[db->db_blkid]);
486 } else {
487 /* db is pointed to by an indirect block */
488 int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
489 ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
490 ASSERT3U(db->db_parent->db.db_object, ==,
491 db->db.db_object);
492 /*
493 * dnode_grow_indblksz() can make this fail if we don't
494 * have the struct_rwlock. XXX indblksz no longer
495 * grows. safe to do this now?
496 */
497 if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
498 ASSERT3P(db->db_blkptr, ==,
499 ((blkptr_t *)db->db_parent->db.db_data +
500 db->db_blkid % epb));
501 }
502 }
503 }
504 if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
505 (db->db_buf == NULL || db->db_buf->b_data) &&
506 db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
507 db->db_state != DB_FILL && !dn->dn_free_txg) {
508 /*
509 * If the blkptr isn't set but they have nonzero data,
510 * it had better be dirty, otherwise we'll lose that
511 * data when we evict this buffer.
512 */
513 if (db->db_dirtycnt == 0) {
514 uint64_t *buf = db->db.db_data;
515 int i;
516
517 for (i = 0; i < db->db.db_size >> 3; i++) {
518 ASSERT(buf[i] == 0);
519 }
520 }
521 }
522 DB_DNODE_EXIT(db);
523 }
524 #endif
525
526 /*
527 * Loan out an arc_buf for read. Return the loaned arc_buf.
528 */
529 arc_buf_t *
530 dbuf_loan_arcbuf(dmu_buf_impl_t *db)
531 {
532 arc_buf_t *abuf;
533 list_t evict_list;
534
535 dmu_buf_create_user_evict_list(&evict_list);
536
537 mutex_enter(&db->db_mtx);
538 if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
539 int blksz = db->db.db_size;
540 spa_t *spa;
541
542 mutex_exit(&db->db_mtx);
543 DB_GET_SPA(&spa, db);
544 abuf = arc_loan_buf(spa, blksz);
545 bcopy(db->db.db_data, abuf->b_data, blksz);
546 } else {
547 abuf = db->db_buf;
548 arc_loan_inuse_buf(abuf, db);
549 dbuf_clear_data(db, &evict_list);
550 mutex_exit(&db->db_mtx);
551 }
552 dmu_buf_destroy_user_evict_list(&evict_list);
553 return (abuf);
554 }
555
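/*
 * Return the block number of the data block that contains the given
 * byte offset within the object. Objects with a single (possibly
 * odd-sized) block always map to block 0.
 */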
556 uint64_t
557 dbuf_whichblock(dnode_t *dn, uint64_t offset)
558 {
559 if (dn->dn_datablkshift) {
560 return (offset >> dn->dn_datablkshift);
561 } else {
562 ASSERT3U(offset, <, dn->dn_datablksz);
563 return (0);
564 }
565 }
566
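/*
 * ARC read completion callback: attach the buffer to the dbuf and mark it
 * DB_CACHED on success (or if the block was freed in flight), or drop the
 * buffer and mark the dbuf DB_UNCACHED on error. Wakes up any waiters and
 * drops the hold taken before the read was issued.
 */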
567 static void
568 dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
569 {
570 dmu_buf_impl_t *db = vdb;
571
572 mutex_enter(&db->db_mtx);
573 ASSERT3U(db->db_state, ==, DB_READ);
574 /*
575 * All reads are synchronous, so we must have a hold on the dbuf
576 */
577 ASSERT(refcount_count(&db->db_holds) > 0);
578 ASSERT(db->db_buf == NULL);
579 ASSERT(db->db.db_data == NULL);
580 if (db->db_level == 0 && db->db_freed_in_flight) {
581 /* we were freed in flight; disregard any error */
582 arc_release(buf, db);
583 bzero(buf->b_data, db->db.db_size);
584 arc_buf_freeze(buf);
585 db->db_freed_in_flight = FALSE;
586 dbuf_set_data(db, buf);
587 db->db_state = DB_CACHED;
588 } else if (zio == NULL || zio->io_error == 0) {
589 dbuf_set_data(db, buf);
590 db->db_state = DB_CACHED;
591 } else {
592 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
593 ASSERT3P(db->db_buf, ==, NULL);
594 VERIFY(arc_buf_remove_ref(buf, db));
595 db->db_state = DB_UNCACHED;
596 }
597 cv_broadcast(&db->db_changed);
598 dbuf_rele_and_unlock(db, NULL);
599 }
600
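/*
 * Start a read of the dbuf's contents. Bonus buffers are copied from the
 * dnode, holes are materialized as zero-filled buffers, and everything
 * else is read asynchronously through the ARC. Called with db_mtx held;
 * drops it before returning.
 */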
601 static void
602 dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
603 {
604 dnode_t *dn;
605 spa_t *spa;
606 zbookmark_t zb;
607 uint32_t aflags = ARC_NOWAIT;
608
609 DB_DNODE_ENTER(db);
610 dn = DB_DNODE(db);
611 ASSERT(!refcount_is_zero(&db->db_holds));
612 /* We need the struct_rwlock to prevent db_blkptr from changing. */
613 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
614 ASSERT(MUTEX_HELD(&db->db_mtx));
615 ASSERT(db->db_state == DB_UNCACHED);
616 ASSERT(db->db_buf == NULL);
617
618 if (db->db_blkid == DMU_BONUS_BLKID) {
619 int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
620
621 ASSERT3U(bonuslen, <=, db->db.db_size);
622 db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
623 arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
624 if (bonuslen < DN_MAX_BONUSLEN)
625 bzero(db->db.db_data, DN_MAX_BONUSLEN);
626 if (bonuslen)
627 bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
628 DB_DNODE_EXIT(db);
629 db->db_state = DB_CACHED;
630 mutex_exit(&db->db_mtx);
631 return;
632 }
633
634 /*
635 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
636 * processes the delete record and clears the bp while we are waiting
637 * for the dn_mtx (resulting in a "no" from block_freed).
638 */
639 if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
640 (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
641 BP_IS_HOLE(db->db_blkptr)))) {
642 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
643
644 dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa,
645 db->db.db_size, db, type));
646 DB_DNODE_EXIT(db);
647 bzero(db->db.db_data, db->db.db_size);
648 db->db_state = DB_CACHED;
649 *flags |= DB_RF_CACHED;
650 mutex_exit(&db->db_mtx);
651 return;
652 }
653
654 spa = dn->dn_objset->os_spa;
655 DB_DNODE_EXIT(db);
656
657 db->db_state = DB_READ;
658 mutex_exit(&db->db_mtx);
659
660 if (DBUF_IS_L2CACHEABLE(db))
661 aflags |= ARC_L2CACHE;
662
663 SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
664 db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
665 db->db.db_object, db->db_level, db->db_blkid);
666
667 dbuf_add_ref(db, NULL);
668
669 (void) arc_read(zio, spa, db->db_blkptr,
670 dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
671 (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
672 &aflags, &zb);
673 if (aflags & ARC_CACHED)
674 *flags |= DB_RF_CACHED;
675 }
676
677 int
678 dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
679 {
680 int err = 0;
681 int havepzio = (zio != NULL);
682 int prefetch;
683 dnode_t *dn;
684
685 /*
686 * We don't have to hold the mutex to check db_state because it
687 * can't be freed while we have a hold on the buffer.
688 */
689 ASSERT(!refcount_is_zero(&db->db_holds));
690
691 if (db->db_state == DB_NOFILL)
692 return (SET_ERROR(EIO));
693
694 DB_DNODE_ENTER(db);
695 dn = DB_DNODE(db);
696 if ((flags & DB_RF_HAVESTRUCT) == 0)
697 rw_enter(&dn->dn_struct_rwlock, RW_READER);
698
699 prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
700 (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
701 DBUF_IS_CACHEABLE(db);
702
703 mutex_enter(&db->db_mtx);
704 if (db->db_state == DB_CACHED) {
705 mutex_exit(&db->db_mtx);
706 if (prefetch)
707 dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
708 db->db.db_size, TRUE);
709 if ((flags & DB_RF_HAVESTRUCT) == 0)
710 rw_exit(&dn->dn_struct_rwlock);
711 DB_DNODE_EXIT(db);
712 } else if (db->db_state == DB_UNCACHED) {
713 spa_t *spa = dn->dn_objset->os_spa;
714
715 if (zio == NULL)
716 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
717 dbuf_read_impl(db, zio, &flags);
718
719 /* dbuf_read_impl has dropped db_mtx for us */
720
721 if (prefetch)
722 dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
723 db->db.db_size, flags & DB_RF_CACHED);
724
725 if ((flags & DB_RF_HAVESTRUCT) == 0)
726 rw_exit(&dn->dn_struct_rwlock);
727 DB_DNODE_EXIT(db);
728
729 if (!havepzio)
730 err = zio_wait(zio);
731 } else {
732 mutex_exit(&db->db_mtx);
733 if (prefetch)
734 dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
735 db->db.db_size, TRUE);
736 if ((flags & DB_RF_HAVESTRUCT) == 0)
737 rw_exit(&dn->dn_struct_rwlock);
738 DB_DNODE_EXIT(db);
739
740 mutex_enter(&db->db_mtx);
741 if ((flags & DB_RF_NEVERWAIT) == 0) {
742 while (db->db_state == DB_READ ||
743 db->db_state == DB_FILL) {
744 ASSERT(db->db_state == DB_READ ||
745 (flags & DB_RF_HAVESTRUCT) == 0);
746 cv_wait(&db->db_changed, &db->db_mtx);
747 }
748 if (db->db_state == DB_UNCACHED)
749 err = SET_ERROR(EIO);
750 }
751 mutex_exit(&db->db_mtx);
752 }
753
754 ASSERT(err || havepzio || db->db_state == DB_CACHED);
755 return (err);
756 }
757
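/*
 * Prepare the dbuf to be completely overwritten: if it is uncached,
 * allocate a fresh buffer and enter the DB_FILL state without reading
 * the old contents from disk.
 */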
758 static void
759 dbuf_noread(dmu_buf_impl_t *db)
760 {
761 list_t evict_list;
762
763 ASSERT(!refcount_is_zero(&db->db_holds));
764 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
765 dmu_buf_create_user_evict_list(&evict_list);
766
767 mutex_enter(&db->db_mtx);
768 while (db->db_state == DB_READ || db->db_state == DB_FILL)
769 cv_wait(&db->db_changed, &db->db_mtx);
770 if (db->db_state == DB_UNCACHED) {
771 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
772 spa_t *spa;
773
774 ASSERT(db->db_buf == NULL);
775 ASSERT(db->db.db_data == NULL);
776 DB_GET_SPA(&spa, db);
777 dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
778 db->db_state = DB_FILL;
779 } else if (db->db_state == DB_NOFILL) {
780 dbuf_clear_data(db, &evict_list);
781 } else {
782 ASSERT3U(db->db_state, ==, DB_CACHED);
783 }
784 mutex_exit(&db->db_mtx);
785 dmu_buf_destroy_user_evict_list(&evict_list);
786 }
787
788 /*
789 * This is our just-in-time copy function. It makes a copy of
790  * buffers that have been modified in a previous transaction
791 * group, before we modify them in the current active group.
792 *
793 * This function is used in two places: when we are dirtying a
794 * buffer for the first time in a txg, and when we are freeing
795 * a range in a dnode that includes this buffer.
796 *
797 * Note that when we are called from dbuf_free_range() we do
798 * not put a hold on the buffer, we just traverse the active
799 * dbuf list for the dnode.
800 */
801 static void
802 dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg, list_t *evict_list_p)
803 {
804 dbuf_dirty_record_t *dr = db->db_last_dirty;
805
806 ASSERT(MUTEX_HELD(&db->db_mtx));
807 ASSERT(db->db.db_data != NULL);
808 ASSERT(db->db_level == 0);
809 ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
810
811 if (dr == NULL ||
812 (dr->dt.dl.dr_data !=
813 ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
814 return;
815
816 /*
817 * If the last dirty record for this dbuf has not yet synced
818  * and it's referencing the dbuf data, either:
819  * reset the reference to point to a new copy,
820  * or (if there are no active holders)
821 * just null out the current db_data pointer.
822 */
823 ASSERT(dr->dr_txg >= txg - 2);
824 if (db->db_blkid == DMU_BONUS_BLKID) {
825 /* Note that the data bufs here are zio_bufs */
826 dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
827 arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
828 bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
829 } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
830 int size = db->db.db_size;
831 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
832 spa_t *spa;
833
834 DB_GET_SPA(&spa, db);
835 dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
836 bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
837 } else {
838 dbuf_clear_data(db, evict_list_p);
839 }
840 }
841
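/*
 * Undo an override of this dirty record (e.g. one set up by dmu_sync()):
 * free the already-written block (unless it was a hole or a nopwrite),
 * clear the override state, and release the ARC buffer so that it is
 * writable again.
 */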
842 void
843 dbuf_unoverride(dbuf_dirty_record_t *dr)
844 {
845 dmu_buf_impl_t *db = dr->dr_dbuf;
846 blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
847 uint64_t txg = dr->dr_txg;
848
849 ASSERT(MUTEX_HELD(&db->db_mtx));
850 ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
851 ASSERT(db->db_level == 0);
852
853 if (db->db_blkid == DMU_BONUS_BLKID ||
854 dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
855 return;
856
857 ASSERT(db->db_data_pending != dr);
858
859 /* free this block */
860 if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) {
861 spa_t *spa;
862
863 DB_GET_SPA(&spa, db);
864 zio_free(spa, txg, bp);
865 }
866 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
867 dr->dt.dl.dr_nopwrite = B_FALSE;
868
869 /*
870 * Release the already-written buffer, so we leave it in
871 * a consistent dirty state. Note that all callers are
872 * modifying the buffer, so they will immediately do
873 * another (redundant) arc_release(). Therefore, leave
874 * the buf thawed to save the effort of freezing &
875 * immediately re-thawing it.
876 */
877 arc_release(dr->dt.dl.dr_data, db);
878 }
879
880 /*
881  * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
882  * data blocks in the free range, so that any future readers will find
883  * empty blocks. Also, if we happen across any level-1 dbufs in the
884 * range that have not already been marked dirty, mark them dirty so
885 * they stay in memory.
886 */
887 void
888 dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
889 {
890 dmu_buf_impl_t *db, *db_next;
891 uint64_t txg = tx->tx_txg;
892 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
893 uint64_t first_l1 = start >> epbs;
894 uint64_t last_l1 = end >> epbs;
895 list_t evict_list;
896
897 dmu_buf_create_user_evict_list(&evict_list);
898
899 if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID)) {
900 end = dn->dn_maxblkid;
901 last_l1 = end >> epbs;
902 }
903 dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
904 mutex_enter(&dn->dn_dbufs_mtx);
905 for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
906 db_next = list_next(&dn->dn_dbufs, db);
907 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
908
909 if (db->db_level == 1 &&
910 db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
911 mutex_enter(&db->db_mtx);
912 if (db->db_last_dirty &&
913 db->db_last_dirty->dr_txg < txg) {
914 dbuf_add_ref(db, FTAG);
915 mutex_exit(&db->db_mtx);
916 dbuf_will_dirty(db, tx);
917 dbuf_rele(db, FTAG);
918 } else {
919 mutex_exit(&db->db_mtx);
920 }
921 }
922
923 if (db->db_level != 0)
924 continue;
925 dprintf_dbuf(db, "found buf %s\n", "");
926 if (db->db_blkid < start || db->db_blkid > end)
927 continue;
928
929 /* found a level 0 buffer in the range */
930 mutex_enter(&db->db_mtx);
931 if (dbuf_undirty(db, tx)) {
932 /* mutex has been dropped and dbuf destroyed */
933 continue;
934 }
935
936 if (db->db_state == DB_UNCACHED ||
937 db->db_state == DB_NOFILL ||
938 db->db_state == DB_EVICTING) {
939 ASSERT(db->db.db_data == NULL);
940 mutex_exit(&db->db_mtx);
941 continue;
942 }
943 if (db->db_state == DB_READ || db->db_state == DB_FILL) {
944 /* will be handled in dbuf_read_done or dbuf_rele */
945 db->db_freed_in_flight = TRUE;
946 mutex_exit(&db->db_mtx);
947 continue;
948 }
949 if (refcount_count(&db->db_holds) == 0) {
950 ASSERT(db->db_buf);
951 dbuf_clear(db, &evict_list);
952 continue;
953 }
954 /* The dbuf is referenced */
955
956 if (db->db_last_dirty != NULL) {
957 dbuf_dirty_record_t *dr = db->db_last_dirty;
958
959 if (dr->dr_txg == txg) {
960 /*
961 * This buffer is "in-use", re-adjust the file
962 * size to reflect that this buffer may
963 * contain new data when we sync.
964 */
965 if (db->db_blkid != DMU_SPILL_BLKID &&
966 db->db_blkid > dn->dn_maxblkid)
967 dn->dn_maxblkid = db->db_blkid;
968 dbuf_unoverride(dr);
969 } else {
970 /*
971 * This dbuf is not dirty in the open context.
972  * Either uncache it (if it's not referenced in
973 * the open context) or reset its contents to
974 * empty.
975 */
976 dbuf_fix_old_data(db, txg, &evict_list);
977 }
978 }
979 /* clear the contents if its cached */
980 if (db->db_state == DB_CACHED) {
981 ASSERT(db->db.db_data != NULL);
982 arc_release(db->db_buf, db);
983 bzero(db->db.db_data, db->db.db_size);
984 arc_buf_freeze(db->db_buf);
985 }
986
987 mutex_exit(&db->db_mtx);
988 dmu_buf_process_user_evicts(&evict_list);
989 }
990 mutex_exit(&dn->dn_dbufs_mtx);
991 dmu_buf_destroy_user_evict_list(&evict_list);
992 }
993
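/*
 * Return TRUE if freeing this dbuf's current block would actually release
 * space (i.e. the block exists and is not held by an earlier snapshot),
 * FALSE if the dbuf has never been written.
 */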
994 static int
995 dbuf_block_freeable(dmu_buf_impl_t *db)
996 {
997 dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
998 uint64_t birth_txg = 0;
999
1000 /*
1001 * We don't need any locking to protect db_blkptr:
1002 * If it's syncing, then db_last_dirty will be set
1003 * so we'll ignore db_blkptr.
1004 */
1005 ASSERT(MUTEX_HELD(&db->db_mtx));
1006 if (db->db_last_dirty)
1007 birth_txg = db->db_last_dirty->dr_txg;
1008 else if (db->db_blkptr)
1009 birth_txg = db->db_blkptr->blk_birth;
1010
1011 /*
1012 * If we don't exist or are in a snapshot, we can't be freed.
1013 * Don't pass the bp to dsl_dataset_block_freeable() since we
1014 * are holding the db_mtx lock and might deadlock if we are
1015 * prefetching a dedup-ed block.
1016 */
1017 if (birth_txg)
1018 return (ds == NULL ||
1019 dsl_dataset_block_freeable(ds, NULL, birth_txg));
1020 else
1021 return (FALSE);
1022 }
1023
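/*
 * Change the dbuf's block size: dirty the dbuf, allocate a new buffer of
 * the requested size, and copy over (and zero-extend) the old contents.
 * Called with the dnode's struct_rwlock held as writer.
 */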
1024 void
1025 dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
1026 {
1027 arc_buf_t *buf, *obuf;
1028 int osize = db->db.db_size;
1029 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
1030 dnode_t *dn;
1031
1032 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1033
1034 DB_DNODE_ENTER(db);
1035 dn = DB_DNODE(db);
1036
1037 /* XXX does *this* func really need the lock? */
1038 ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
1039
1040 /*
1041 * This call to dbuf_will_dirty() with the dn_struct_rwlock held
1042 * is OK, because there can be no other references to the db
1043 * when we are changing its size, so no concurrent DB_FILL can
1044 * be happening.
1045 */
1046 /*
1047 * XXX we should be doing a dbuf_read, checking the return
1048 * value and returning that up to our callers
1049 */
1050 dbuf_will_dirty(db, tx);
1051
1052 /* create the data buffer for the new block */
1053 buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);
1054
1055 /* copy old block data to the new block */
1056 obuf = db->db_buf;
1057 bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
1058 /* zero the remainder */
1059 if (size > osize)
1060 bzero((uint8_t *)buf->b_data + osize, size - osize);
1061
1062 mutex_enter(&db->db_mtx);
1063 dbuf_set_data(db, buf);
1064 VERIFY(arc_buf_remove_ref(obuf, db));
1065 db->db.db_size = size;
1066
1067 if (db->db_level == 0) {
1068 ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
1069 db->db_last_dirty->dt.dl.dr_data = buf;
1070 }
1071 mutex_exit(&db->db_mtx);
1072
1073 dnode_willuse_space(dn, size-osize, tx);
1074 DB_DNODE_EXIT(db);
1075 }
1076
1077 void
1078 dbuf_release_bp(dmu_buf_impl_t *db)
1079 {
1080 objset_t *os;
1081
1082 DB_GET_OBJSET(&os, db);
1083 ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
1084 ASSERT(arc_released(os->os_phys_buf) ||
1085 list_link_active(&os->os_dsl_dataset->ds_synced_link));
1086 ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
1087
1088 (void) arc_release(db->db_buf, db);
1089 }
1090
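/*
 * Mark the dbuf dirty in the given transaction. Creates (or returns the
 * existing) dirty record for this txg, snapshots the data if an older txg
 * still references it, recursively dirties the parent indirect block, and
 * adds the record to the dnode's dirty list.
 */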
1091 dbuf_dirty_record_t *
1092 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1093 {
1094 dnode_t *dn;
1095 objset_t *os;
1096 dbuf_dirty_record_t **drp, *dr;
1097 int drop_struct_lock = FALSE;
1098 boolean_t do_free_accounting = B_FALSE;
1099 int txgoff = tx->tx_txg & TXG_MASK;
1100 list_t evict_list;
1101
1102 dmu_buf_create_user_evict_list(&evict_list);
1103
1104 ASSERT(tx->tx_txg != 0);
1105 ASSERT(!refcount_is_zero(&db->db_holds));
1106 DMU_TX_DIRTY_BUF(tx, db);
1107
1108 DB_DNODE_ENTER(db);
1109 dn = DB_DNODE(db);
1110 /*
1111 * Shouldn't dirty a regular buffer in syncing context. Private
1112 * objects may be dirtied in syncing context, but only if they
1113 * were already pre-dirtied in open context.
1114 */
1115 ASSERT(!dmu_tx_is_syncing(tx) ||
1116 BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
1117 DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
1118 dn->dn_objset->os_dsl_dataset == NULL);
1119 /*
1120 * We make this assert for private objects as well, but after we
1121 * check if we're already dirty. They are allowed to re-dirty
1122 * in syncing context.
1123 */
1124 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1125 dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1126 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1127
1128 mutex_enter(&db->db_mtx);
1129 /*
1130 * XXX make this true for indirects too? The problem is that
1131 * transactions created with dmu_tx_create_assigned() from
1132 * syncing context don't bother holding ahead.
1133 */
1134 ASSERT(db->db_level != 0 ||
1135 db->db_state == DB_CACHED || db->db_state == DB_FILL ||
1136 db->db_state == DB_NOFILL);
1137
1138 mutex_enter(&dn->dn_mtx);
1139 /*
1140 * Don't set dirtyctx to SYNC if we're just modifying this as we
1141 * initialize the objset.
1142 */
1143 if (dn->dn_dirtyctx == DN_UNDIRTIED &&
1144 !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
1145 dn->dn_dirtyctx =
1146 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
1147 ASSERT(dn->dn_dirtyctx_firstset == NULL);
1148 dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
1149 }
1150 mutex_exit(&dn->dn_mtx);
1151
1152 if (db->db_blkid == DMU_SPILL_BLKID)
1153 dn->dn_have_spill = B_TRUE;
1154
1155 /*
1156 * If this buffer is already dirty, we're done.
1157 */
1158 drp = &db->db_last_dirty;
1159 ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
1160 db->db.db_object == DMU_META_DNODE_OBJECT);
1161 while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
1162 drp = &dr->dr_next;
1163 if (dr && dr->dr_txg == tx->tx_txg) {
1164 DB_DNODE_EXIT(db);
1165
1166 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
1167 /*
1168 * If this buffer has already been written out,
1169 * we now need to reset its state.
1170 */
1171 dbuf_unoverride(dr);
1172 if (db->db.db_object != DMU_META_DNODE_OBJECT &&
1173 db->db_state != DB_NOFILL)
1174 arc_buf_thaw(db->db_buf);
1175 }
1176 mutex_exit(&db->db_mtx);
1177 dmu_buf_destroy_user_evict_list(&evict_list);
1178 return (dr);
1179 }
1180
1181 /*
1182 * Only valid if not already dirty.
1183 */
1184 ASSERT(dn->dn_object == 0 ||
1185 dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1186 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1187
1188 ASSERT3U(dn->dn_nlevels, >, db->db_level);
1189 ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
1190 dn->dn_phys->dn_nlevels > db->db_level ||
1191 dn->dn_next_nlevels[txgoff] > db->db_level ||
1192 dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
1193 dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
1194
1195 /*
1196 * We should only be dirtying in syncing context if it's the
1197 * mos or we're initializing the os or it's a special object.
1198 * However, we are allowed to dirty in syncing context provided
1199 * we already dirtied it in open context. Hence we must make
1200 * this assertion only if we're not already dirty.
1201 */
1202 os = dn->dn_objset;
1203 ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
1204 os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
1205 ASSERT(db->db.db_size != 0);
1206
1207 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1208
1209 if (db->db_blkid != DMU_BONUS_BLKID) {
1210 /*
1211 * Update the accounting.
1212 * Note: we delay "free accounting" until after we drop
1213 * the db_mtx. This keeps us from grabbing other locks
1214 * (and possibly deadlocking) in bp_get_dsize() while
1215 * also holding the db_mtx.
1216 */
1217 dnode_willuse_space(dn, db->db.db_size, tx);
1218 do_free_accounting = dbuf_block_freeable(db);
1219 }
1220
1221 /*
1222 * If this buffer is dirty in an old transaction group we need
1223 * to make a copy of it so that the changes we make in this
1224 * transaction group won't leak out when we sync the older txg.
1225 */
1226 dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
1227 if (db->db_level == 0) {
1228 void *data_old = db->db_buf;
1229
1230 if (db->db_state != DB_NOFILL) {
1231 if (db->db_blkid == DMU_BONUS_BLKID) {
1232 dbuf_fix_old_data(db, tx->tx_txg, &evict_list);
1233 data_old = db->db.db_data;
1234 } else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
1235 /*
1236 * Release the data buffer from the cache so
1237 * that we can modify it without impacting
1238 * possible other users of this cached data
1239 * block. Note that indirect blocks and
1240 * private objects are not released until the
1241 * syncing state (since they are only modified
1242 * then).
1243 */
1244 arc_release(db->db_buf, db);
1245 dbuf_fix_old_data(db, tx->tx_txg, &evict_list);
1246 data_old = db->db_buf;
1247 }
1248 ASSERT(data_old != NULL);
1249 }
1250 dr->dt.dl.dr_data = data_old;
1251 } else {
1252 mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
1253 list_create(&dr->dt.di.dr_children,
1254 sizeof (dbuf_dirty_record_t),
1255 offsetof(dbuf_dirty_record_t, dr_dirty_node));
1256 }
1257 dr->dr_dbuf = db;
1258 dr->dr_txg = tx->tx_txg;
1259 dr->dr_next = *drp;
1260 *drp = dr;
1261
1262 /*
1263 * We could have been freed_in_flight between the dbuf_noread
1264 * and dbuf_dirty. We win, as though the dbuf_noread() had
1265 * happened after the free.
1266 */
1267 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1268 db->db_blkid != DMU_SPILL_BLKID) {
1269 mutex_enter(&dn->dn_mtx);
1270 dnode_clear_range(dn, db->db_blkid, 1, tx);
1271 mutex_exit(&dn->dn_mtx);
1272 db->db_freed_in_flight = FALSE;
1273 }
1274
1275 /*
1276 * This buffer is now part of this txg
1277 */
1278 dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
1279 db->db_dirtycnt += 1;
1280 ASSERT3U(db->db_dirtycnt, <=, 3);
1281
1282 mutex_exit(&db->db_mtx);
1283 dmu_buf_destroy_user_evict_list(&evict_list);
1284
1285 if (db->db_blkid == DMU_BONUS_BLKID ||
1286 db->db_blkid == DMU_SPILL_BLKID) {
1287 mutex_enter(&dn->dn_mtx);
1288 ASSERT(!list_link_active(&dr->dr_dirty_node));
1289 list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1290 mutex_exit(&dn->dn_mtx);
1291 dnode_setdirty(dn, tx);
1292 DB_DNODE_EXIT(db);
1293 return (dr);
1294 } else if (do_free_accounting) {
1295 blkptr_t *bp = db->db_blkptr;
1296 int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
1297 bp_get_dsize(os->os_spa, bp) : db->db.db_size;
1298 /*
1299 * This is only a guess -- if the dbuf is dirty
1300 * in a previous txg, we don't know how much
1301 * space it will use on disk yet. We should
1302 * really have the struct_rwlock to access
1303 * db_blkptr, but since this is just a guess,
1304 * it's OK if we get an odd answer.
1305 */
1306 ddt_prefetch(os->os_spa, bp);
1307 dnode_willuse_space(dn, -willfree, tx);
1308 }
1309
1310 if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
1311 rw_enter(&dn->dn_struct_rwlock, RW_READER);
1312 drop_struct_lock = TRUE;
1313 }
1314
1315 if (db->db_level == 0) {
1316 dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
1317 ASSERT(dn->dn_maxblkid >= db->db_blkid);
1318 }
1319
1320 if (db->db_level+1 < dn->dn_nlevels) {
1321 dmu_buf_impl_t *parent = db->db_parent;
1322 dbuf_dirty_record_t *di;
1323 int parent_held = FALSE;
1324
1325 if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
1326 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1327
1328 parent = dbuf_hold_level(dn, db->db_level+1,
1329 db->db_blkid >> epbs, FTAG);
1330 ASSERT(parent != NULL);
1331 parent_held = TRUE;
1332 }
1333 if (drop_struct_lock)
1334 rw_exit(&dn->dn_struct_rwlock);
1335 ASSERT3U(db->db_level+1, ==, parent->db_level);
1336 di = dbuf_dirty(parent, tx);
1337 if (parent_held)
1338 dbuf_rele(parent, FTAG);
1339
1340 mutex_enter(&db->db_mtx);
1341 /* possible race with dbuf_undirty() */
1342 if (db->db_last_dirty == dr ||
1343 dn->dn_object == DMU_META_DNODE_OBJECT) {
1344 mutex_enter(&di->dt.di.dr_mtx);
1345 ASSERT3U(di->dr_txg, ==, tx->tx_txg);
1346 ASSERT(!list_link_active(&dr->dr_dirty_node));
1347 list_insert_tail(&di->dt.di.dr_children, dr);
1348 mutex_exit(&di->dt.di.dr_mtx);
1349 dr->dr_parent = di;
1350 }
1351 mutex_exit(&db->db_mtx);
1352 } else {
1353 ASSERT(db->db_level+1 == dn->dn_nlevels);
1354 ASSERT(db->db_blkid < dn->dn_nblkptr);
1355 ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
1356 mutex_enter(&dn->dn_mtx);
1357 ASSERT(!list_link_active(&dr->dr_dirty_node));
1358 list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1359 mutex_exit(&dn->dn_mtx);
1360 if (drop_struct_lock)
1361 rw_exit(&dn->dn_struct_rwlock);
1362 }
1363
1364 dnode_setdirty(dn, tx);
1365 DB_DNODE_EXIT(db);
1366 return (dr);
1367 }
1368
1369 /*
1370 * Return TRUE if this evicted the dbuf.
1371 */
1372 static boolean_t
1373 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1374 {
1375 dnode_t *dn;
1376 uint64_t txg = tx->tx_txg;
1377 dbuf_dirty_record_t *dr, **drp;
1378 list_t evict_list;
1379
1380 ASSERT(txg != 0);
1381 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1382 ASSERT0(db->db_level);
1383 ASSERT(MUTEX_HELD(&db->db_mtx));
1384
1385 /*
1386 * If this buffer is not dirty, we're done.
1387 */
1388 for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
1389 if (dr->dr_txg <= txg)
1390 break;
1391 if (dr == NULL || dr->dr_txg < txg)
1392 return (B_FALSE);
1393 ASSERT(dr->dr_txg == txg);
1394 ASSERT(dr->dr_dbuf == db);
1395
1396 dmu_buf_create_user_evict_list(&evict_list);
1397
1398 DB_DNODE_ENTER(db);
1399 dn = DB_DNODE(db);
1400
1401 /*
1402 * Note: This code will probably work even if there are concurrent
1403  * holders, but it is untested in that scenario, as the ZPL and
1404 * ztest have additional locking (the range locks) that prevents
1405 * that type of concurrent access.
1406 */
1407 ASSERT3U(refcount_count(&db->db_holds), ==, db->db_dirtycnt);
1408
1409 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1410
1411 ASSERT(db->db.db_size != 0);
1412
1413 /* XXX would be nice to fix up dn_towrite_space[] */
1414
1415 *drp = dr->dr_next;
1416
1417 /*
1418 * Note that there are three places in dbuf_dirty()
1419 * where this dirty record may be put on a list.
1420 * Make sure to do a list_remove corresponding to
1421 * every one of those list_insert calls.
1422 */
1423 if (dr->dr_parent) {
1424 mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
1425 list_remove(&dr->dr_parent->dt.di.dr_children, dr);
1426 mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
1427 } else if (db->db_blkid == DMU_SPILL_BLKID ||
1428 db->db_level+1 == dn->dn_nlevels) {
1429 ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
1430 mutex_enter(&dn->dn_mtx);
1431 list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
1432 mutex_exit(&dn->dn_mtx);
1433 }
1434 DB_DNODE_EXIT(db);
1435
1436 if (db->db_state != DB_NOFILL) {
1437 dbuf_unoverride(dr);
1438
1439 ASSERT(db->db_buf != NULL);
1440 ASSERT(dr->dt.dl.dr_data != NULL);
1441 if (dr->dt.dl.dr_data != db->db_buf)
1442 VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
1443 }
1444 kmem_free(dr, sizeof (dbuf_dirty_record_t));
1445
1446 ASSERT(db->db_dirtycnt > 0);
1447 db->db_dirtycnt -= 1;
1448
1449 if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
1450 arc_buf_t *buf = db->db_buf;
1451
1452 ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
1453 dbuf_clear_data(db, &evict_list);
1454 VERIFY(arc_buf_remove_ref(buf, db));
1455 dbuf_evict(db, &evict_list);
1456 dmu_buf_destroy_user_evict_list(&evict_list);
1457 return (B_TRUE);
1458 }
1459
1460 dmu_buf_destroy_user_evict_list(&evict_list);
1461 return (B_FALSE);
1462 }
1463
1464 #pragma weak dmu_buf_will_dirty = dbuf_will_dirty
1465 void
1466 dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1467 {
1468 int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
1469
1470 ASSERT(tx->tx_txg != 0);
1471 ASSERT(!refcount_is_zero(&db->db_holds));
1472
1473 DB_DNODE_ENTER(db);
1474 if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
1475 rf |= DB_RF_HAVESTRUCT;
1476 DB_DNODE_EXIT(db);
1477 (void) dbuf_read(db, NULL, rf);
1478 (void) dbuf_dirty(db, tx);
1479 }
1480
1481 void
1482 dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1483 {
1484 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1485
1486 db->db_state = DB_NOFILL;
1487
1488 dmu_buf_will_fill(db_fake, tx);
1489 }
1490
1491 void
1492 dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1493 {
1494 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1495
1496 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1497 ASSERT(tx->tx_txg != 0);
1498 ASSERT(db->db_level == 0);
1499 ASSERT(!refcount_is_zero(&db->db_holds));
1500
1501 ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
1502 dmu_tx_private_ok(tx));
1503
1504 dbuf_noread(db);
1505 (void) dbuf_dirty(db, tx);
1506 }
1507
1508 #pragma weak dmu_buf_fill_done = dbuf_fill_done
1509 /* ARGSUSED */
1510 void
1511 dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
1512 {
1513 mutex_enter(&db->db_mtx);
1514 DBUF_VERIFY(db);
1515
1516 if (db->db_state == DB_FILL) {
1517 if (db->db_level == 0 && db->db_freed_in_flight) {
1518 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1519 /* we were freed while filling */
1520 /* XXX dbuf_undirty? */
1521 bzero(db->db.db_data, db->db.db_size);
1522 db->db_freed_in_flight = FALSE;
1523 }
1524 db->db_state = DB_CACHED;
1525 cv_broadcast(&db->db_changed);
1526 }
1527 mutex_exit(&db->db_mtx);
1528 }
1529
1530 /*
1531 * Directly assign a provided arc buf to a given dbuf if it's not referenced
1532 * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
1533 */
1534 void
1535 dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
1536 {
1537 ASSERT(!refcount_is_zero(&db->db_holds));
1538 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1539 ASSERT(db->db_level == 0);
1540 ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
1541 ASSERT(buf != NULL);
1542 ASSERT(arc_buf_size(buf) == db->db.db_size);
1543 ASSERT(tx->tx_txg != 0);
1544
1545 arc_return_buf(buf, db);
1546 ASSERT(arc_released(buf));
1547
1548 mutex_enter(&db->db_mtx);
1549
1550 while (db->db_state == DB_READ || db->db_state == DB_FILL)
1551 cv_wait(&db->db_changed, &db->db_mtx);
1552
1553 ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
1554
1555 if (db->db_state == DB_CACHED &&
1556 refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
1557 mutex_exit(&db->db_mtx);
1558 (void) dbuf_dirty(db, tx);
1559 bcopy(buf->b_data, db->db.db_data, db->db.db_size);
1560 VERIFY(arc_buf_remove_ref(buf, db));
1561 xuio_stat_wbuf_copied();
1562 return;
1563 }
1564
1565 xuio_stat_wbuf_nocopy();
1566 if (db->db_state == DB_CACHED) {
1567 dbuf_dirty_record_t *dr = db->db_last_dirty;
1568
1569 ASSERT(db->db_buf != NULL);
1570 if (dr != NULL && dr->dr_txg == tx->tx_txg) {
1571 ASSERT(dr->dt.dl.dr_data == db->db_buf);
1572 if (!arc_released(db->db_buf)) {
1573 ASSERT(dr->dt.dl.dr_override_state ==
1574 DR_OVERRIDDEN);
1575 arc_release(db->db_buf, db);
1576 }
1577 dr->dt.dl.dr_data = buf;
1578 VERIFY(arc_buf_remove_ref(db->db_buf, db));
1579 } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
1580 arc_release(db->db_buf, db);
1581 VERIFY(arc_buf_remove_ref(db->db_buf, db));
1582 }
1583 db->db_buf = NULL;
1584 }
1585 ASSERT(db->db_buf == NULL);
1586 dbuf_set_data(db, buf);
1587 db->db_state = DB_FILL;
1588 mutex_exit(&db->db_mtx);
1589 (void) dbuf_dirty(db, tx);
1590 dbuf_fill_done(db, tx);
1591 }
1592
1593 /*
1594 * "Clear" the contents of this dbuf. This will mark the dbuf
1595  * EVICTING and clear *most* of its references. Unfortunately,
1596 * when we are not holding the dn_dbufs_mtx, we can't clear the
1597 * entry in the dn_dbufs list. We have to wait until dbuf_destroy()
1598 * in this case. For callers from the DMU we will usually see:
1599 * dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
1600 * For the arc callback, we will usually see:
1601 * dbuf_do_evict()->dbuf_clear();dbuf_destroy()
1602 * Sometimes, though, we will get a mix of these two:
1603 * DMU: dbuf_clear()->arc_buf_evict()
1604 * ARC: dbuf_do_evict()->dbuf_destroy()
1605 */
1606 void
1607 dbuf_clear(dmu_buf_impl_t *db, list_t *evict_list_p)
1608 {
1609 dnode_t *dn;
1610 dmu_buf_impl_t *parent = db->db_parent;
1611 dmu_buf_impl_t *dndb;
1612 int dbuf_gone = FALSE;
1613
1614 ASSERT(MUTEX_HELD(&db->db_mtx));
1615 ASSERT(refcount_is_zero(&db->db_holds));
1616
1617 dbuf_evict_user(db, evict_list_p);
1618
1619 if (db->db_state == DB_CACHED) {
1620 ASSERT(db->db.db_data != NULL);
1621 if (db->db_blkid == DMU_BONUS_BLKID) {
1622 zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
1623 arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
1624 }
1625 db->db.db_data = NULL;
1626 db->db_state = DB_UNCACHED;
1627 }
1628
1629 ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
1630 ASSERT(db->db_data_pending == NULL);
1631
1632 db->db_state = DB_EVICTING;
1633 db->db_blkptr = NULL;
1634
1635 DB_DNODE_ENTER(db);
1636 dn = DB_DNODE(db);
1637 dndb = dn->dn_dbuf;
1638 if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
1639 list_remove(&dn->dn_dbufs, db);
1640 (void) atomic_dec_32_nv(&dn->dn_dbufs_count);
1641 membar_producer();
1642 DB_DNODE_EXIT(db);
1643 /*
1644 * Decrementing the dbuf count means that the hold corresponding
1645 * to the removed dbuf is no longer discounted in dnode_move(),
1646 * so the dnode cannot be moved until after we release the hold.
1647 * The membar_producer() ensures visibility of the decremented
1648 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
1649 * release any lock.
1650 */
1651 dnode_rele(dn, db);
1652 db->db_dnode_handle = NULL;
1653 } else {
1654 DB_DNODE_EXIT(db);
1655 }
1656
1657 if (db->db_buf)
1658 dbuf_gone = arc_buf_evict(db->db_buf);
1659
1660 if (!dbuf_gone)
1661 mutex_exit(&db->db_mtx);
1662
1663 /*
1664 * If this dbuf is referenced from an indirect dbuf,
1665 * decrement the ref count on the indirect dbuf.
1666 */
1667 if (parent && parent != dndb)
1668 dbuf_rele(parent, db);
1669 }
1670
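/*
 * Locate the parent dbuf and block pointer for the given level/blkid.
 * On success a hold is taken on *parentp (which may be the dnode's own
 * dbuf) and *bpp points at the block pointer within it. Returns ENOENT
 * if the buffer has no parent yet.
 */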
1671 static int
1672 dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
1673 dmu_buf_impl_t **parentp, blkptr_t **bpp)
1674 {
1675 int nlevels, epbs;
1676
1677 *parentp = NULL;
1678 *bpp = NULL;
1679
1680 ASSERT(blkid != DMU_BONUS_BLKID);
1681
1682 if (blkid == DMU_SPILL_BLKID) {
1683 mutex_enter(&dn->dn_mtx);
1684 if (dn->dn_have_spill &&
1685 (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
1686 *bpp = &dn->dn_phys->dn_spill;
1687 else
1688 *bpp = NULL;
1689 dbuf_add_ref(dn->dn_dbuf, NULL);
1690 *parentp = dn->dn_dbuf;
1691 mutex_exit(&dn->dn_mtx);
1692 return (0);
1693 }
1694
1695 if (dn->dn_phys->dn_nlevels == 0)
1696 nlevels = 1;
1697 else
1698 nlevels = dn->dn_phys->dn_nlevels;
1699
1700 epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1701
1702 ASSERT3U(level * epbs, <, 64);
1703 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1704 if (level >= nlevels ||
1705 (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
1706 /* the buffer has no parent yet */
1707 return (SET_ERROR(ENOENT));
1708 } else if (level < nlevels-1) {
1709 /* this block is referenced from an indirect block */
1710 int err = dbuf_hold_impl(dn, level+1,
1711 blkid >> epbs, fail_sparse, NULL, parentp);
1712 if (err)
1713 return (err);
1714 err = dbuf_read(*parentp, NULL,
1715 (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
1716 if (err) {
1717 dbuf_rele(*parentp, NULL);
1718 *parentp = NULL;
1719 return (err);
1720 }
1721 *bpp = ((blkptr_t *)(*parentp)->db.db_data) +
1722 (blkid & ((1ULL << epbs) - 1));
1723 return (0);
1724 } else {
1725 /* the block is referenced from the dnode */
1726 ASSERT3U(level, ==, nlevels-1);
1727 ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
1728 blkid < dn->dn_phys->dn_nblkptr);
1729 if (dn->dn_dbuf) {
1730 dbuf_add_ref(dn->dn_dbuf, NULL);
1731 *parentp = dn->dn_dbuf;
1732 }
1733 *bpp = &dn->dn_phys->dn_blkptr[blkid];
1734 return (0);
1735 }
1736 }
1737
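/*
 * Allocate and initialize a new dbuf for the given block of the dnode and
 * insert it into the dbuf hash table and the dnode's dn_dbufs list. If
 * another thread won the race to create it, free our copy and return the
 * existing dbuf.
 */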
1738 static dmu_buf_impl_t *
1739 dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
1740 dmu_buf_impl_t *parent, blkptr_t *blkptr)
1741 {
1742 objset_t *os = dn->dn_objset;
1743 dmu_buf_impl_t *db, *odb;
1744
1745 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1746 ASSERT(dn->dn_type != DMU_OT_NONE);
1747
1748 db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
1749
1750 db->db_objset = os;
1751 db->db.db_object = dn->dn_object;
1752 db->db_level = level;
1753 db->db_blkid = blkid;
1754 db->db_last_dirty = NULL;
1755 db->db_dirtycnt = 0;
1756 db->db_dnode_handle = dn->dn_handle;
1757 db->db_parent = parent;
1758 db->db_blkptr = blkptr;
1759
1760 db->db_user = NULL;
1761 db->db_immediate_evict = 0;
1762 db->db_freed_in_flight = 0;
1763
1764 if (blkid == DMU_BONUS_BLKID) {
1765 ASSERT3P(parent, ==, dn->dn_dbuf);
1766 db->db.db_size = DN_MAX_BONUSLEN -
1767 (dn->dn_nblkptr-1) * sizeof (blkptr_t);
1768 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
1769 db->db.db_offset = DMU_BONUS_BLKID;
1770 db->db_state = DB_UNCACHED;
1771 /* the bonus dbuf is not placed in the hash table */
1772 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1773 return (db);
1774 } else if (blkid == DMU_SPILL_BLKID) {
1775 db->db.db_size = (blkptr != NULL) ?
1776 BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
1777 db->db.db_offset = 0;
1778 } else {
1779 int blocksize =
1780 db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz;
1781 db->db.db_size = blocksize;
1782 db->db.db_offset = db->db_blkid * blocksize;
1783 }
1784
1785 /*
1786 * Hold the dn_dbufs_mtx while we get the new dbuf
1787 * in the hash table *and* added to the dbufs list.
1788 * This prevents a possible deadlock with someone
1789  * trying to look up this dbuf before it's added to the
1790 * dn_dbufs list.
1791 */
1792 mutex_enter(&dn->dn_dbufs_mtx);
1793 db->db_state = DB_EVICTING;
1794 if ((odb = dbuf_hash_insert(db)) != NULL) {
1795 /* someone else inserted it first */
1796 kmem_cache_free(dbuf_cache, db);
1797 mutex_exit(&dn->dn_dbufs_mtx);
1798 return (odb);
1799 }
1800 list_insert_head(&dn->dn_dbufs, db);
1801 db->db_state = DB_UNCACHED;
1802 mutex_exit(&dn->dn_dbufs_mtx);
1803 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1804
1805 if (parent && parent != dn->dn_dbuf)
1806 dbuf_add_ref(parent, db);
1807
1808 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1809 refcount_count(&dn->dn_holds) > 0);
1810 (void) refcount_add(&dn->dn_holds, db);
1811 (void) atomic_inc_32_nv(&dn->dn_dbufs_count);
1812
1813 dprintf_dbuf(db, "db=%p\n", db);
1814
1815 return (db);
1816 }
1817
1818 static int
1819 dbuf_do_evict(void *private)
1820 {
1821 arc_buf_t *buf = private;
1822 dmu_buf_impl_t *db = buf->b_private;
1823 list_t evict_list;
1824
1825 dmu_buf_create_user_evict_list(&evict_list);
1826
1827 if (!MUTEX_HELD(&db->db_mtx))
1828 mutex_enter(&db->db_mtx);
1829
1830 ASSERT(refcount_is_zero(&db->db_holds));
1831
1832 if (db->db_state != DB_EVICTING) {
1833 ASSERT(db->db_state == DB_CACHED);
1834 DBUF_VERIFY(db);
1835 db->db_buf = NULL;
1836 dbuf_evict(db, &evict_list);
1837 } else {
1838 mutex_exit(&db->db_mtx);
1839 dbuf_destroy(db);
1840 }
1841 dmu_buf_destroy_user_evict_list(&evict_list);
1842 return (0);
1843 }
1844
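/*
 * Free a dbuf that has already been cleared: remove it from the dnode's
 * dn_dbufs list and the hash table (bonus buffers are in neither), drop
 * the corresponding dnode hold, and return the structure to the cache.
 */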
1845 static void
1846 dbuf_destroy(dmu_buf_impl_t *db)
1847 {
1848 ASSERT(refcount_is_zero(&db->db_holds));
1849
1850 if (db->db_blkid != DMU_BONUS_BLKID) {
1851 /*
1852 * If this dbuf is still on the dn_dbufs list,
1853 * remove it from that list.
1854 */
1855 if (db->db_dnode_handle != NULL) {
1856 dnode_t *dn;
1857
1858 DB_DNODE_ENTER(db);
1859 dn = DB_DNODE(db);
1860 mutex_enter(&dn->dn_dbufs_mtx);
1861 list_remove(&dn->dn_dbufs, db);
1862 (void) atomic_dec_32_nv(&dn->dn_dbufs_count);
1863 mutex_exit(&dn->dn_dbufs_mtx);
1864 DB_DNODE_EXIT(db);
1865 /*
1866 * Decrementing the dbuf count means that the hold
1867 * corresponding to the removed dbuf is no longer
1868 * discounted in dnode_move(), so the dnode cannot be
1869 * moved until after we release the hold.
1870 */
1871 dnode_rele(dn, db);
1872 db->db_dnode_handle = NULL;
1873 }
1874 dbuf_hash_remove(db);
1875 }
1876 db->db_parent = NULL;
1877 db->db_buf = NULL;
1878
1879 ASSERT(!list_link_active(&db->db_link));
1880 ASSERT(db->db.db_data == NULL);
1881 ASSERT(db->db_hash_next == NULL);
1882 ASSERT(db->db_blkptr == NULL);
1883 ASSERT(db->db_data_pending == NULL);
1884
1885 kmem_cache_free(dbuf_cache, db);
1886 arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1887 }
1888
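/*
 * Issue a speculative (prefetch) read of a level-0 block, unless the
 * block has been freed or a dbuf for it already exists.  The read goes
 * directly through the ARC; no dbuf is created for the block.
 */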
1889 void
1890 dbuf_prefetch(dnode_t *dn, uint64_t blkid)
1891 {
1892 dmu_buf_impl_t *db = NULL;
1893 blkptr_t *bp = NULL;
1894
1895 ASSERT(blkid != DMU_BONUS_BLKID);
1896 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1897
1898 if (dnode_block_freed(dn, blkid))
1899 return;
1900
1901 /* dbuf_find() returns with db_mtx held */
1902 if ((db = dbuf_find(dn, 0, blkid)) != NULL) {
1903 /*
1904 * This dbuf is already in the cache. We assume that
1905 * it is already CACHED, or else about to be either
1906 * read or filled.
1907 */
1908 mutex_exit(&db->db_mtx);
1909 return;
1910 }
1911
1912 if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
1913 if (bp && !BP_IS_HOLE(bp)) {
1914 int priority = dn->dn_type == DMU_OT_DDT_ZAP ?
1915 ZIO_PRIORITY_DDT_PREFETCH : ZIO_PRIORITY_ASYNC_READ;
1916 dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
1917 uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
1918 zbookmark_t zb;
1919
1920 SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
1921 dn->dn_object, 0, blkid);
1922
1923 (void) arc_read(NULL, dn->dn_objset->os_spa,
1924 bp, NULL, NULL, priority,
1925 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
1926 &aflags, &zb);
1927 }
1928 if (db)
1929 dbuf_rele(db, NULL);
1930 }
1931 }
1932
1933 /*
1934 * Returns with db_holds incremented, and db_mtx not held.
1935 * Note: dn_struct_rwlock must be held.
1936 */
1937 int
1938 dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
1939 void *tag, dmu_buf_impl_t **dbp)
1940 {
1941 dmu_buf_impl_t *db, *parent = NULL;
1942 list_t evict_list;
1943
1944 ASSERT(blkid != DMU_BONUS_BLKID);
1945 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1946 ASSERT3U(dn->dn_nlevels, >, level);
1947
1948 dmu_buf_create_user_evict_list(&evict_list);
1949
1950 *dbp = NULL;
1951 top:
1952 /* dbuf_find() returns with db_mtx held */
1953 db = dbuf_find(dn, level, blkid);
1954
1955 if (db == NULL) {
1956 blkptr_t *bp = NULL;
1957 int err;
1958
1959 ASSERT3P(parent, ==, NULL);
1960 err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
1961 if (fail_sparse) {
1962 if (err == 0 && bp && BP_IS_HOLE(bp))
1963 err = SET_ERROR(ENOENT);
1964 if (err) {
1965 if (parent)
1966 dbuf_rele(parent, NULL);
1967 return (err);
1968 }
1969 }
1970 if (err && err != ENOENT)
1971 return (err);
1972 db = dbuf_create(dn, level, blkid, parent, bp);
1973 }
1974
1975 if (db->db_buf && refcount_is_zero(&db->db_holds)) {
1976 arc_buf_add_ref(db->db_buf, db);
1977 if (db->db_buf->b_data == NULL) {
1978 dbuf_clear(db, &evict_list);
1979 if (parent) {
1980 dbuf_rele(parent, NULL);
1981 parent = NULL;
1982 }
1983 goto top;
1984 }
1985 ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
1986 }
1987
1988 ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
1989
1990 /*
1991 * If this buffer is currently syncing out, and we are
1992 * still referencing it from db_data, we need to make a copy
1993 * of it in case we decide we want to dirty it again in this txg.
1994 */
1995 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1996 dn->dn_object != DMU_META_DNODE_OBJECT &&
1997 db->db_state == DB_CACHED && db->db_data_pending) {
1998 dbuf_dirty_record_t *dr = db->db_data_pending;
1999
2000 if (dr->dt.dl.dr_data == db->db_buf) {
2001 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
2002
2003 dbuf_set_data(db,
2004 arc_buf_alloc(dn->dn_objset->os_spa,
2005 db->db.db_size, db, type));
2006 bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
2007 db->db.db_size);
2008 }
2009 }
2010
2011 (void) refcount_add(&db->db_holds, tag);
2012 DBUF_VERIFY(db);
2013 mutex_exit(&db->db_mtx);
2014
2015 dmu_buf_destroy_user_evict_list(&evict_list);
2016
2017 /* NOTE: we can't rele the parent until after we drop the db_mtx */
2018 if (parent)
2019 dbuf_rele(parent, NULL);
2020
2021 ASSERT3P(DB_DNODE(db), ==, dn);
2022 ASSERT3U(db->db_blkid, ==, blkid);
2023 ASSERT3U(db->db_level, ==, level);
2024 *dbp = db;
2025
2026 return (0);
2027 }
2028
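/*
 * Convenience wrappers around dbuf_hold_impl() that never fail on
 * sparse (hole) blocks and return NULL instead of an error code.
 * dbuf_hold() takes a level-0 hold; dbuf_hold_level() takes a hold at
 * an arbitrary level.
 *
 * Illustrative use, a sketch only (the caller must already hold
 * dn_struct_rwlock and must eventually release the hold):
 *
 *	dmu_buf_impl_t *db = dbuf_hold(dn, blkid, FTAG);
 *	if (db != NULL) {
 *		... read or dirty the buffer ...
 *		dbuf_rele(db, FTAG);
 *	}
 */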
2029 dmu_buf_impl_t *
2030 dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
2031 {
2032 dmu_buf_impl_t *db;
2033 int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
2034 return (err ? NULL : db);
2035 }
2036
2037 dmu_buf_impl_t *
2038 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
2039 {
2040 dmu_buf_impl_t *db;
2041 int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
2042 return (err ? NULL : db);
2043 }
2044
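/*
 * Create the bonus dbuf for a dnode.  The caller must hold
 * dn_struct_rwlock as writer, and the dnode must not already have a
 * bonus dbuf.
 */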
2045 void
2046 dbuf_create_bonus(dnode_t *dn)
2047 {
2048 ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
2049
2050 ASSERT(dn->dn_bonus == NULL);
2051 dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
2052 }
2053
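/*
 * Resize a dbuf's spill block.  A size of zero means SPA_MINBLOCKSIZE;
 * other sizes are rounded up to a multiple of SPA_MINBLOCKSIZE and
 * capped at SPA_MAXBLOCKSIZE.  Returns ENOTSUP if the dbuf is not a
 * spill block, otherwise 0.
 */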
2054 int
2055 dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
2056 {
2057 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2058 dnode_t *dn;
2059
2060 if (db->db_blkid != DMU_SPILL_BLKID)
2061 return (SET_ERROR(ENOTSUP));
2062 if (blksz == 0)
2063 blksz = SPA_MINBLOCKSIZE;
2064 if (blksz > SPA_MAXBLOCKSIZE)
2065 blksz = SPA_MAXBLOCKSIZE;
2066 else
2067 blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
2068
2069 DB_DNODE_ENTER(db);
2070 dn = DB_DNODE(db);
2071 rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
2072 dbuf_new_size(db, blksz, tx);
2073 rw_exit(&dn->dn_struct_rwlock);
2074 DB_DNODE_EXIT(db);
2075
2076 return (0);
2077 }
2078
2079 void
2080 dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
2081 {
2082 dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
2083 }
2084
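/*
 * Add a hold to an already-held dbuf.  The caller must already have a
 * hold, so the resulting hold count is always greater than one.
 */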
2085 #pragma weak dmu_buf_add_ref = dbuf_add_ref
2086 void
2087 dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
2088 {
2089 int64_t holds = refcount_add(&db->db_holds, tag);
2090 ASSERT(holds > 1);
2091 }
2092
2093 /*
2094 * If you call dbuf_rele() you had better not be referencing the dnode handle
2095 * unless you have some other direct or indirect hold on the dnode. (An indirect
2096 * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
2097 * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
2098 * dnode's parent dbuf evicting its dnode handles.
2099 */
2100 #pragma weak dmu_buf_rele = dbuf_rele
2101 void
2102 dbuf_rele(dmu_buf_impl_t *db, void *tag)
2103 {
2104 mutex_enter(&db->db_mtx);
2105 dbuf_rele_and_unlock(db, tag);
2106 }
2107
2108 /*
2109 * dbuf_rele() for an already-locked dbuf. This is necessary to allow
2110 * db_dirtycnt and db_holds to be updated atomically.
2111 */
2112 void
2113 dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
2114 {
2115 int64_t holds;
2116 list_t evict_list;
2117
2118 ASSERT(MUTEX_HELD(&db->db_mtx));
2119 DBUF_VERIFY(db);
2120
2121 dmu_buf_create_user_evict_list(&evict_list);
2122
2123 /*
2124 * Remove the reference to the dbuf before removing its hold on the
2125 * dnode so we can guarantee in dnode_move() that a referenced bonus
2126 * buffer has a corresponding dnode hold.
2127 */
2128 holds = refcount_remove(&db->db_holds, tag);
2129 ASSERT(holds >= 0);
2130
2131 /*
2132 * We can't freeze indirects if there is a possibility that they
2133 * may be modified in the current syncing context.
2134 */
2135 if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
2136 arc_buf_freeze(db->db_buf);
2137
2138 if (holds == db->db_dirtycnt &&
2139 db->db_level == 0 && db->db_immediate_evict)
2140 dbuf_evict_user(db, &evict_list);
2141
2142 if (holds == 0) {
2143 if (db->db_blkid == DMU_BONUS_BLKID) {
2144 mutex_exit(&db->db_mtx);
2145
2146 /*
2147 * If the dnode moves here, we cannot cross this barrier
2148 * until the move completes.
2149 */
2150 DB_DNODE_ENTER(db);
2151 (void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count);
2152 DB_DNODE_EXIT(db);
2153 /*
2154 * The bonus buffer's dnode hold is no longer discounted
2155 * in dnode_move(). The dnode cannot move until after
2156 * the dnode_rele().
2157 */
2158 dnode_rele(DB_DNODE(db), db);
2159 } else if (db->db_buf == NULL) {
2160 /*
2161 * This is a special case: we never associated this
2162 * dbuf with any data allocated from the ARC.
2163 */
2164 ASSERT(db->db_state == DB_UNCACHED ||
2165 db->db_state == DB_NOFILL);
2166 dbuf_evict(db, &evict_list);
2167 } else if (arc_released(db->db_buf)) {
2168 arc_buf_t *buf = db->db_buf;
2169 /*
2170 * This dbuf has anonymous data associated with it.
2171 */
2172 dbuf_clear_data(db, &evict_list);
2173 VERIFY(arc_buf_remove_ref(buf, db));
2174 dbuf_evict(db, &evict_list);
2175 } else {
2176 VERIFY(!arc_buf_remove_ref(db->db_buf, db));
2177
2178 /*
2179 * A dbuf will be eligible for eviction if either the
2180 * 'primarycache' property excludes it from the cache or a
2181 * duplicate copy of this buffer is already cached in the arc.
2182 *
2183 * In the case of the 'primarycache' property, a buffer
2184 * is considered for eviction if it does not match the
2185 * criteria set in the property.
2186 *
2187 * To decide if our buffer is considered a
2188 * duplicate, we must call into the arc to determine
2189 * if multiple buffers are referencing the same
2190 * block on-disk. If so, then we simply evict
2191 * ourselves.
2192 */
2193 if (!DBUF_IS_CACHEABLE(db) ||
2194 arc_buf_eviction_needed(db->db_buf))
2195 dbuf_clear(db, &evict_list);
2196 else
2197 mutex_exit(&db->db_mtx);
2198 }
2199 } else {
2200 mutex_exit(&db->db_mtx);
2201 }
2202 dmu_buf_destroy_user_evict_list(&evict_list);
2203 }
2204
2205 #pragma weak dmu_buf_refcount = dbuf_refcount
2206 uint64_t
2207 dbuf_refcount(dmu_buf_impl_t *db)
2208 {
2209 return (refcount_count(&db->db_holds));
2210 }
2211
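/*
 * Return whether freeing the block referenced by this dbuf would
 * actually release space in the pool (i.e. the block is not also
 * referenced by an earlier snapshot).  A dbuf with no block pointer is
 * never freeable.
 */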
2212 boolean_t
2213 dmu_buf_freeable(dmu_buf_t *dbuf)
2214 {
2215 boolean_t res = B_FALSE;
2216 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
2217
2218 if (db->db_blkptr)
2219 res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
2220 db->db_blkptr, db->db_blkptr->blk_birth);
2221
2222 return (res);
2223 }
2224
2225 blkptr_t *
2226 dmu_buf_get_blkptr(dmu_buf_t *db)
2227 {
2228 dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
2229 return (dbi->db_blkptr);
2230 }
2231
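/*
 * If this dbuf does not yet have a block pointer, point db_blkptr at
 * the right location: the dnode's spill pointer, one of the dnode's
 * embedded block pointers (for top-level dbufs), or the appropriate
 * slot in the parent indirect block.  May temporarily drop db_mtx in
 * order to take a hold on the parent.
 */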
2232 static void
2233 dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
2234 {
2235 /* ASSERT(dmu_tx_is_syncing(tx)) */
2236 ASSERT(MUTEX_HELD(&db->db_mtx));
2237
2238 if (db->db_blkptr != NULL)
2239 return;
2240
2241 if (db->db_blkid == DMU_SPILL_BLKID) {
2242 db->db_blkptr = &dn->dn_phys->dn_spill;
2243 BP_ZERO(db->db_blkptr);
2244 return;
2245 }
2246 if (db->db_level == dn->dn_phys->dn_nlevels-1) {
2247 /*
2248 * This buffer was allocated at a time when there were
2249 * no available blkptrs from the dnode, or it was
2250 * inappropriate to hook it in (i.e., nlevels mismatch).
2251 */
2252 ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
2253 ASSERT(db->db_parent == NULL);
2254 db->db_parent = dn->dn_dbuf;
2255 db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
2256 DBUF_VERIFY(db);
2257 } else {
2258 dmu_buf_impl_t *parent = db->db_parent;
2259 int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2260
2261 ASSERT(dn->dn_phys->dn_nlevels > 1);
2262 if (parent == NULL) {
2263 mutex_exit(&db->db_mtx);
2264 rw_enter(&dn->dn_struct_rwlock, RW_READER);
2265 (void) dbuf_hold_impl(dn, db->db_level+1,
2266 db->db_blkid >> epbs, FALSE, db, &parent);
2267 rw_exit(&dn->dn_struct_rwlock);
2268 mutex_enter(&db->db_mtx);
2269 db->db_parent = parent;
2270 }
2271 db->db_blkptr = (blkptr_t *)parent->db.db_data +
2272 (db->db_blkid & ((1ULL << epbs) - 1));
2273 DBUF_VERIFY(db);
2274 }
2275 }
2276
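/*
 * Sync a dirty indirect block: make sure its data is in memory and its
 * block pointer is wired up, start its write, then recursively sync the
 * dirty children below it before kicking off the zio.
 */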
2277 static void
2278 dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2279 {
2280 dmu_buf_impl_t *db = dr->dr_dbuf;
2281 dnode_t *dn;
2282 zio_t *zio;
2283
2284 ASSERT(dmu_tx_is_syncing(tx));
2285
2286 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
2287
2288 mutex_enter(&db->db_mtx);
2289
2290 ASSERT(db->db_level > 0);
2291 DBUF_VERIFY(db);
2292
2293 if (db->db_buf == NULL) {
2294 mutex_exit(&db->db_mtx);
2295 (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
2296 mutex_enter(&db->db_mtx);
2297 }
2298 ASSERT3U(db->db_state, ==, DB_CACHED);
2299 ASSERT(db->db_buf != NULL);
2300
2301 DB_DNODE_ENTER(db);
2302 dn = DB_DNODE(db);
2303 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2304 dbuf_check_blkptr(dn, db);
2305 DB_DNODE_EXIT(db);
2306
2307 db->db_data_pending = dr;
2308
2309 mutex_exit(&db->db_mtx);
2310 dbuf_write(dr, db->db_buf, tx);
2311
2312 zio = dr->dr_zio;
2313 mutex_enter(&dr->dt.di.dr_mtx);
2314 dbuf_sync_list(&dr->dt.di.dr_children, tx);
2315 ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
2316 mutex_exit(&dr->dt.di.dr_mtx);
2317 zio_nowait(zio);
2318 }
2319
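/*
 * Sync a dirty level-0 block.  Bonus buffers are simply copied into the
 * dnode phys and their dirty record retired; other blocks may first be
 * copied (if they are still actively referenced) and are then handed to
 * dbuf_write() to start the actual I/O.
 */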
2320 static void
2321 dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2322 {
2323 arc_buf_t **datap = &dr->dt.dl.dr_data;
2324 dmu_buf_impl_t *db = dr->dr_dbuf;
2325 dnode_t *dn;
2326 objset_t *os;
2327 uint64_t txg = tx->tx_txg;
2328
2329 ASSERT(dmu_tx_is_syncing(tx));
2330
2331 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
2332
2333 mutex_enter(&db->db_mtx);
2334 /*
2335 * To be synced, we must be dirtied. But we
2336 * might have been freed after the dirty.
2337 */
2338 if (db->db_state == DB_UNCACHED) {
2339 /* This buffer has been freed since it was dirtied */
2340 ASSERT(db->db.db_data == NULL);
2341 } else if (db->db_state == DB_FILL) {
2342 /* This buffer was freed and is now being re-filled */
2343 ASSERT(db->db.db_data != dr->dt.dl.dr_data);
2344 } else {
2345 ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
2346 }
2347 DBUF_VERIFY(db);
2348
2349 DB_DNODE_ENTER(db);
2350 dn = DB_DNODE(db);
2351
2352 if (db->db_blkid == DMU_SPILL_BLKID) {
2353 mutex_enter(&dn->dn_mtx);
2354 dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
2355 mutex_exit(&dn->dn_mtx);
2356 }
2357
2358 /*
2359 * If this is a bonus buffer, simply copy the bonus data into the
2360 * dnode. It will be written out when the dnode is synced (and it
2361 * will be synced, since it must have been dirty for dbuf_sync to
2362 * be called).
2363 */
2364 if (db->db_blkid == DMU_BONUS_BLKID) {
2365 dbuf_dirty_record_t **drp;
2366
2367 ASSERT(*datap != NULL);
2368 ASSERT0(db->db_level);
2369 ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
2370 bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
2371 DB_DNODE_EXIT(db);
2372
2373 if (*datap != db->db.db_data) {
2374 zio_buf_free(*datap, DN_MAX_BONUSLEN);
2375 arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
2376 }
2377 db->db_data_pending = NULL;
2378 drp = &db->db_last_dirty;
2379 while (*drp != dr)
2380 drp = &(*drp)->dr_next;
2381 ASSERT(dr->dr_next == NULL);
2382 ASSERT(dr->dr_dbuf == db);
2383 *drp = dr->dr_next;
2384 kmem_free(dr, sizeof (dbuf_dirty_record_t));
2385 ASSERT(db->db_dirtycnt > 0);
2386 db->db_dirtycnt -= 1;
2387 dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
2388 return;
2389 }
2390
2391 os = dn->dn_objset;
2392
2393 /*
2394 * This function may have dropped the db_mtx lock allowing a dmu_sync
2395 * operation to sneak in. As a result, we need to ensure that we
2396 * don't check the dr_override_state until we have returned from
2397 * dbuf_check_blkptr.
2398 */
2399 dbuf_check_blkptr(dn, db);
2400
2401 /*
2402 * If this buffer is in the middle of an immediate write,
2403 * wait for the synchronous IO to complete.
2404 */
2405 while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
2406 ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
2407 cv_wait(&db->db_changed, &db->db_mtx);
2408 ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
2409 }
2410
2411 if (db->db_state != DB_NOFILL &&
2412 dn->dn_object != DMU_META_DNODE_OBJECT &&
2413 refcount_count(&db->db_holds) > 1 &&
2414 dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
2415 *datap == db->db_buf) {
2416 /*
2417 * If this buffer is currently "in use" (i.e., there
2418 * are active holds and db_data still references it),
2419 * then make a copy before we start the write so that
2420 * any modifications from the open txg will not leak
2421 * into this write.
2422 *
2423 * NOTE: this copy does not need to be made for
2424 * objects only modified in the syncing context (e.g.
2425 * DMU_OT_DNODE blocks).
2426 */
2427 int blksz = arc_buf_size(*datap);
2428 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
2429 *datap = arc_buf_alloc(os->os_spa, blksz, db, type);
2430 bcopy(db->db.db_data, (*datap)->b_data, blksz);
2431 }
2432 db->db_data_pending = dr;
2433
2434 mutex_exit(&db->db_mtx);
2435
2436 dbuf_write(dr, *datap, tx);
2437
2438 ASSERT(!list_link_active(&dr->dr_dirty_node));
2439 if (dn->dn_object == DMU_META_DNODE_OBJECT) {
2440 list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
2441 DB_DNODE_EXIT(db);
2442 } else {
2443 /*
2444 * Although zio_nowait() does not "wait for an IO", it does
2445 * initiate the IO. If this is an empty write it seems plausible
2446 * that the IO could actually be completed before the nowait
2447 * returns. We need to DB_DNODE_EXIT() first in case
2448 * zio_nowait() invalidates the dbuf.
2449 */
2450 DB_DNODE_EXIT(db);
2451 zio_nowait(dr->dr_zio);
2452 }
2453 }
2454
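/*
 * Sync every dirty record on the given list, dispatching to
 * dbuf_sync_indirect() or dbuf_sync_leaf() as appropriate.  Records
 * that already have a zio belong to the meta-dnode and are left on the
 * list for the caller to zio_wait() on.
 */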
2455 void
2456 dbuf_sync_list(list_t *list, dmu_tx_t *tx)
2457 {
2458 dbuf_dirty_record_t *dr;
2459
2460 while ((dr = list_head(list)) != NULL) {
2461 if (dr->dr_zio != NULL) {
2462 /*
2463 * If we find an already initialized zio then we
2464 * are processing the meta-dnode, and we have finished.
2465 * The dbufs for all dnodes are put back on the list
2466 * during processing, so that we can zio_wait()
2467 * these IOs after initiating all child IOs.
2468 */
2469 ASSERT3U(dr->dr_dbuf->db.db_object, ==,
2470 DMU_META_DNODE_OBJECT);
2471 break;
2472 }
2473 list_remove(list, dr);
2474 if (dr->dr_dbuf->db_level > 0)
2475 dbuf_sync_indirect(dr, tx);
2476 else
2477 dbuf_sync_leaf(dr, tx);
2478 }
2479 }
2480
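/*
 * The "ready" callback for dbuf writes, called once the new block
 * pointer has been filled in: update the dnode's space accounting and
 * maxblkid, and compute the fill count stored in the block pointer.
 */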
2481 /* ARGSUSED */
2482 static void
2483 dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
2484 {
2485 dmu_buf_impl_t *db = vdb;
2486 dnode_t *dn;
2487 blkptr_t *bp = zio->io_bp;
2488 blkptr_t *bp_orig = &zio->io_bp_orig;
2489 spa_t *spa = zio->io_spa;
2490 int64_t delta;
2491 uint64_t fill = 0;
2492 int i;
2493
2494 ASSERT(db->db_blkptr == bp);
2495
2496 DB_DNODE_ENTER(db);
2497 dn = DB_DNODE(db);
2498 delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
2499 dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
2500 zio->io_prev_space_delta = delta;
2501
2502 if (BP_IS_HOLE(bp)) {
2503 ASSERT(bp->blk_fill == 0);
2504 DB_DNODE_EXIT(db);
2505 return;
2506 }
2507
2508 ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
2509 BP_GET_TYPE(bp) == dn->dn_type) ||
2510 (db->db_blkid == DMU_SPILL_BLKID &&
2511 BP_GET_TYPE(bp) == dn->dn_bonustype));
2512 ASSERT(BP_GET_LEVEL(bp) == db->db_level);
2513
2514 mutex_enter(&db->db_mtx);
2515
2516 #ifdef ZFS_DEBUG
2517 if (db->db_blkid == DMU_SPILL_BLKID) {
2518 ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
2519 ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
2520 db->db_blkptr == &dn->dn_phys->dn_spill);
2521 }
2522 #endif
2523
2524 if (db->db_level == 0) {
2525 mutex_enter(&dn->dn_mtx);
2526 if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
2527 db->db_blkid != DMU_SPILL_BLKID)
2528 dn->dn_phys->dn_maxblkid = db->db_blkid;
2529 mutex_exit(&dn->dn_mtx);
2530
2531 if (dn->dn_type == DMU_OT_DNODE) {
2532 dnode_phys_t *dnp = db->db.db_data;
2533 for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
2534 i--, dnp++) {
2535 if (dnp->dn_type != DMU_OT_NONE)
2536 fill++;
2537 }
2538 } else {
2539 fill = 1;
2540 }
2541 } else {
2542 blkptr_t *ibp = db->db.db_data;
2543 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2544 for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
2545 if (BP_IS_HOLE(ibp))
2546 continue;
2547 fill += ibp->blk_fill;
2548 }
2549 }
2550 DB_DNODE_EXIT(db);
2551
2552 bp->blk_fill = fill;
2553
2554 mutex_exit(&db->db_mtx);
2555 }
2556
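/*
 * The "done" callback for dbuf writes: perform birth/kill accounting
 * for the newly written block (skipped for nopwrites and rewrites),
 * retire the dirty record, and drop the hold that was taken when the
 * dbuf was dirtied.
 */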
2557 /* ARGSUSED */
2558 static void
2559 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
2560 {
2561 dmu_buf_impl_t *db = vdb;
2562 blkptr_t *bp = zio->io_bp;
2563 blkptr_t *bp_orig = &zio->io_bp_orig;
2564 uint64_t txg = zio->io_txg;
2565 dbuf_dirty_record_t **drp, *dr;
2566
2567 ASSERT0(zio->io_error);
2568 ASSERT(db->db_blkptr == bp);
2569
2570 /*
2571 * For nopwrites and rewrites we ensure that the bp matches our
2572 * original and bypass all the accounting.
2573 */
2574 if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
2575 ASSERT(BP_EQUAL(bp, bp_orig));
2576 } else {
2577 objset_t *os;
2578 dsl_dataset_t *ds;
2579 dmu_tx_t *tx;
2580
2581 DB_GET_OBJSET(&os, db);
2582 ds = os->os_dsl_dataset;
2583 tx = os->os_synctx;
2584
2585 (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
2586 dsl_dataset_block_born(ds, bp, tx);
2587 }
2588
2589 mutex_enter(&db->db_mtx);
2590
2591 DBUF_VERIFY(db);
2592
2593 drp = &db->db_last_dirty;
2594 while ((dr = *drp) != db->db_data_pending)
2595 drp = &dr->dr_next;
2596 ASSERT(!list_link_active(&dr->dr_dirty_node));
2597 ASSERT(dr->dr_txg == txg);
2598 ASSERT(dr->dr_dbuf == db);
2599 ASSERT(dr->dr_next == NULL);
2600 *drp = dr->dr_next;
2601
2602 #ifdef ZFS_DEBUG
2603 if (db->db_blkid == DMU_SPILL_BLKID) {
2604 dnode_t *dn;
2605
2606 DB_DNODE_ENTER(db);
2607 dn = DB_DNODE(db);
2608 ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
2609 ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
2610 db->db_blkptr == &dn->dn_phys->dn_spill);
2611 DB_DNODE_EXIT(db);
2612 }
2613 #endif
2614
2615 if (db->db_level == 0) {
2616 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
2617 ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
2618 if (db->db_state != DB_NOFILL) {
2619 if (dr->dt.dl.dr_data != db->db_buf)
2620 VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
2621 db));
2622 else if (!arc_released(db->db_buf))
2623 arc_set_callback(db->db_buf, dbuf_do_evict, db);
2624 }
2625 } else {
2626 dnode_t *dn;
2627
2628 DB_DNODE_ENTER(db);
2629 dn = DB_DNODE(db);
2630 ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
2631 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2632 if (!BP_IS_HOLE(db->db_blkptr)) {
2633 int epbs =
2634 dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2635 ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
2636 db->db.db_size);
2637 ASSERT3U(dn->dn_phys->dn_maxblkid
2638 >> (db->db_level * epbs), >=, db->db_blkid);
2639 arc_set_callback(db->db_buf, dbuf_do_evict, db);
2640 }
2641 DB_DNODE_EXIT(db);
2642 mutex_destroy(&dr->dt.di.dr_mtx);
2643 list_destroy(&dr->dt.di.dr_children);
2644 }
2645 kmem_free(dr, sizeof (dbuf_dirty_record_t));
2646
2647 cv_broadcast(&db->db_changed);
2648 ASSERT(db->db_dirtycnt > 0);
2649 db->db_dirtycnt -= 1;
2650 db->db_data_pending = NULL;
2651 dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
2652 }
2653
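/*
 * Thin zio callbacks for NOFILL and override (dmu_sync) writes, which
 * are issued as plain zios rather than arc writes.  They forward to the
 * common dbuf_write_ready()/dbuf_write_done() handlers; the override
 * "done" case also frees the block written by dmu_sync if it ends up
 * unused.
 */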
2654 static void
2655 dbuf_write_nofill_ready(zio_t *zio)
2656 {
2657 dbuf_write_ready(zio, NULL, zio->io_private);
2658 }
2659
2660 static void
2661 dbuf_write_nofill_done(zio_t *zio)
2662 {
2663 dbuf_write_done(zio, NULL, zio->io_private);
2664 }
2665
2666 static void
2667 dbuf_write_override_ready(zio_t *zio)
2668 {
2669 dbuf_dirty_record_t *dr = zio->io_private;
2670 dmu_buf_impl_t *db = dr->dr_dbuf;
2671
2672 dbuf_write_ready(zio, NULL, db);
2673 }
2674
2675 static void
2676 dbuf_write_override_done(zio_t *zio)
2677 {
2678 dbuf_dirty_record_t *dr = zio->io_private;
2679 dmu_buf_impl_t *db = dr->dr_dbuf;
2680 blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
2681
2682 mutex_enter(&db->db_mtx);
2683 if (!BP_EQUAL(zio->io_bp, obp)) {
2684 if (!BP_IS_HOLE(obp))
2685 dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
2686 arc_release(dr->dt.dl.dr_data, db);
2687 }
2688 mutex_exit(&db->db_mtx);
2689
2690 dbuf_write_done(zio, NULL, db);
2691 }
2692
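/*
 * Issue the write for a dirty dbuf: choose the parent zio, compute the
 * write policy and bookmark, and create the child zio or arc write that
 * will push this block out, attaching the ready/done callbacks above.
 */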
2693 static void
2694 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
2695 {
2696 dmu_buf_impl_t *db = dr->dr_dbuf;
2697 dnode_t *dn;
2698 objset_t *os;
2699 dmu_buf_impl_t *parent = db->db_parent;
2700 uint64_t txg = tx->tx_txg;
2701 zbookmark_t zb;
2702 zio_prop_t zp;
2703 zio_t *zio;
2704 int wp_flag = 0;
2705
2706 DB_DNODE_ENTER(db);
2707 dn = DB_DNODE(db);
2708 os = dn->dn_objset;
2709
2710 if (db->db_state != DB_NOFILL) {
2711 if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
2712 /*
2713 * Private object buffers are released here rather
2714 * than in dbuf_dirty() since they are only modified
2715 * in the syncing context and we don't want the
2716 * overhead of making multiple copies of the data.
2717 */
2718 if (BP_IS_HOLE(db->db_blkptr)) {
2719 arc_buf_thaw(data);
2720 } else {
2721 dbuf_release_bp(db);
2722 }
2723 }
2724 }
2725
2726 if (parent != dn->dn_dbuf) {
2727 ASSERT(parent && parent->db_data_pending);
2728 ASSERT(db->db_level == parent->db_level-1);
2729 ASSERT(arc_released(parent->db_buf));
2730 zio = parent->db_data_pending->dr_zio;
2731 } else {
2732 ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
2733 db->db_blkid != DMU_SPILL_BLKID) ||
2734 (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
2735 if (db->db_blkid != DMU_SPILL_BLKID)
2736 ASSERT3P(db->db_blkptr, ==,
2737 &dn->dn_phys->dn_blkptr[db->db_blkid]);
2738 zio = dn->dn_zio;
2739 }
2740
2741 ASSERT(db->db_level == 0 || data == db->db_buf);
2742 ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
2743 ASSERT(zio);
2744
2745 SET_BOOKMARK(&zb, os->os_dsl_dataset ?
2746 os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
2747 db->db.db_object, db->db_level, db->db_blkid);
2748
2749 if (db->db_blkid == DMU_SPILL_BLKID)
2750 wp_flag = WP_SPILL;
2751 wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
2752
2753 dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
2754 DB_DNODE_EXIT(db);
2755
2756 if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
2757 ASSERT(db->db_state != DB_NOFILL);
2758 dr->dr_zio = zio_write(zio, os->os_spa, txg,
2759 db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
2760 dbuf_write_override_ready, dbuf_write_override_done, dr,
2761 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
2762 mutex_enter(&db->db_mtx);
2763 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
2764 zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
2765 dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
2766 mutex_exit(&db->db_mtx);
2767 } else if (db->db_state == DB_NOFILL) {
2768 ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
2769 dr->dr_zio = zio_write(zio, os->os_spa, txg,
2770 db->db_blkptr, NULL, db->db.db_size, &zp,
2771 dbuf_write_nofill_ready, dbuf_write_nofill_done, db,
2772 ZIO_PRIORITY_ASYNC_WRITE,
2773 ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
2774 } else {
2775 ASSERT(arc_released(data));
2776 dr->dr_zio = arc_write(zio, os->os_spa, txg,
2777 db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), &zp,
2778 dbuf_write_ready, dbuf_write_done, db,
2779 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
2780 }
2781 }