1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
24 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
25 * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
26 * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
27 */
28
29 /*
30 * DVA-based Adjustable Replacement Cache
31 *
32 * While much of the theory of operation used here is
33 * based on the self-tuning, low overhead replacement cache
34 * presented by Megiddo and Modha at FAST 2003, there are some
35 * significant differences:
36 *
37 * 1. The Megiddo and Modha model assumes any page is evictable.
38 * Pages in its cache cannot be "locked" into memory. This makes
39 * the eviction algorithm simple: evict the last page in the list.
40 *     This also makes the performance characteristics easy to reason
41 * about. Our cache is not so simple. At any given moment, some
42 * subset of the blocks in the cache are un-evictable because we
43 * have handed out a reference to them. Blocks are only evictable
44 * when there are no external references active. This makes
45 * eviction far more problematic: we choose to evict the evictable
46 * blocks that are the "lowest" in the list.
47 *
48 * There are times when it is not possible to evict the requested
49 * space. In these circumstances we are unable to adjust the cache
50 * size. To prevent the cache growing unbounded at these times we
51 * implement a "cache throttle" that slows the flow of new data
52 * into the cache until we can make space available.
53 *
54 * 2. The Megiddo and Modha model assumes a fixed cache size.
55 * Pages are evicted when the cache is full and there is a cache
56 *     miss.  Our model has a variable-sized cache.  It grows with
57 * high use, but also tries to react to memory pressure from the
58 * operating system: decreasing its size when system memory is
59 * tight.
60 *
61 * 3. The Megiddo and Modha model assumes a fixed page size. All
62 * elements of the cache are therefore exactly the same size. So
63 *     when adjusting the cache size following a cache miss, it's simply
64 * a matter of choosing a single page to evict. In our model, we
65 *     have variable-sized cache blocks (ranging from 512 bytes to
66 * 128K bytes). We therefore choose a set of blocks to evict to make
67 * space for a cache miss that approximates as closely as possible
68 * the space used by the new block.
69 *
70 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
71 * by N. Megiddo & D. Modha, FAST 2003
72 */
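/*
 * A simplified sketch of the eviction loop implied by (1) and (3) above:
 * walk from the "lowest" end of an evictable list, skip headers that cannot
 * safely be evicted right now (e.g. I/O in progress, or the hash lock is
 * contended), and stop once enough space has been freed to cover the
 * incoming block.  The real logic, with recycling and per-spa filtering,
 * lives in arc_evict() later in this file; the loop below is illustrative
 * only.
 *
 *	for (ab = list_tail(list); ab != NULL && freed < wanted; ab = prev) {
 *		prev = list_prev(list, ab);
 *		if (HDR_IO_IN_PROGRESS(ab) || !mutex_tryenter(HDR_LOCK(ab)))
 *			continue;
 *		freed += ab->b_size;
 *		(evict ab, then drop its hash lock)
 *	}
 */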
73
74 /*
75 * The locking model:
76 *
77 * A new reference to a cache buffer can be obtained in two
78 * ways: 1) via a hash table lookup using the DVA as a key,
79 * or 2) via one of the ARC lists. The arc_read() interface
80 * uses method 1, while the internal arc algorithms for
81 * adjusting the cache use method 2. We therefore provide two
82 * types of locks: 1) the hash table lock array, and 2) the
83 * arc list locks.
84 *
85 * Buffers do not have their own mutexes, rather they rely on the
86 * hash table mutexes for the bulk of their protection (i.e. most
87 * fields in the arc_buf_hdr_t are protected by these mutexes).
88 *
89 * buf_hash_find() returns the appropriate mutex (held) when it
90 * locates the requested buffer in the hash table. It returns
91 * NULL for the mutex if the buffer was not in the table.
92 *
93 * buf_hash_remove() expects the appropriate hash mutex to be
94 * already held before it is invoked.
95 *
96 * Each arc state also has a mutex which is used to protect the
97 * buffer list associated with the state. When attempting to
98 * obtain a hash table lock while holding an arc list lock, you
99 * must use mutex_tryenter() to avoid deadlock.  Also note that
100 * the active state mutex must be held before the ghost state mutex.
101 *
102 * Arc buffers may have an associated eviction callback function.
103 * This function will be invoked prior to removing the buffer (e.g.
104 * in arc_do_user_evicts()). Note however that the data associated
105 * with the buffer may be evicted prior to the callback. The callback
106 * must be made with *no locks held* (to prevent deadlock). Additionally,
107 * the users of callbacks must ensure that their private data is
108 * protected from simultaneous callbacks from arc_clear_callback()
109 * and arc_do_user_evicts().
110 *
111 * Note that the majority of the performance stats are manipulated
112 * with atomic operations.
113 *
114 * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
115 *
116 * - L2ARC buflist creation
117 * - L2ARC buflist eviction
118 * - L2ARC write completion, which walks L2ARC buflists
119 * - ARC header destruction, as it removes from L2ARC buflists
120 * - ARC header release, as it removes from L2ARC buflists
121 */
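/*
 * A minimal sketch of the hash-lookup pattern described above (guid and bp
 * stand for the pool's load guid and the block pointer being looked up):
 * buf_hash_find(), defined later in this file, returns with the bucket's
 * hash lock held on a hit, and with no lock held (*lockp == NULL) on a miss;
 * the caller drops the lock when it is done with the header.
 *
 *	kmutex_t *hash_lock;
 *	arc_buf_hdr_t *hdr = buf_hash_find(guid, bp, &hash_lock);
 *
 *	if (hdr != NULL) {
 *		... hdr fields are stable while hash_lock is held ...
 *		mutex_exit(hash_lock);
 *	}
 */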
122
123 #include <sys/spa.h>
124 #include <sys/zio.h>
125 #include <sys/zio_compress.h>
126 #include <sys/zfs_context.h>
127 #include <sys/arc.h>
128 #include <sys/refcount.h>
129 #include <sys/vdev.h>
130 #include <sys/vdev_impl.h>
131 #include <sys/dsl_pool.h>
132 #ifdef _KERNEL
133 #include <sys/vmsystm.h>
134 #include <vm/anon.h>
135 #include <sys/fs/swapnode.h>
136 #include <sys/dnlc.h>
137 #endif
138 #include <sys/callb.h>
139 #include <sys/kstat.h>
140 #include <zfs_fletcher.h>
141
142 #ifndef _KERNEL
143 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
144 boolean_t arc_watch = B_FALSE;
145 int arc_procfd;
146 #endif
147
148 static kmutex_t arc_reclaim_thr_lock;
149 static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
150 static uint8_t arc_thread_exit;
151
152 #define ARC_REDUCE_DNLC_PERCENT 3
153 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
154
155 typedef enum arc_reclaim_strategy {
156 ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
157 ARC_RECLAIM_CONS /* Conservative reclaim strategy */
158 } arc_reclaim_strategy_t;
159
160 /*
161 * The number of iterations through arc_evict_*() before we
162 * drop & reacquire the lock.
163 */
164 int arc_evict_iterations = 100;
165
166 /* number of seconds before growing cache again */
167 static int arc_grow_retry = 60;
168
169 /* shift of arc_c for calculating both min and max arc_p */
170 static int arc_p_min_shift = 4;
171
172 /* log2(fraction of arc to reclaim) */
173 static int arc_shrink_shift = 5;
174
175 /*
176 * minimum lifespan of a prefetch block in clock ticks
177 * (initialized in arc_init())
178 */
179 static int arc_min_prefetch_lifespan;
180
181 /*
182 * If this percent of memory is free, don't throttle.
183 */
184 int arc_lotsfree_percent = 10;
185
186 static int arc_dead;
187
188 /*
189 * The arc has filled available memory and has now warmed up.
190 */
191 static boolean_t arc_warm;
192
193 /*
194 * These tunables are for performance analysis.
195 */
196 uint64_t zfs_arc_max;
197 uint64_t zfs_arc_min;
198 uint64_t zfs_arc_meta_limit = 0;
199 int zfs_arc_grow_retry = 0;
200 int zfs_arc_shrink_shift = 0;
201 int zfs_arc_p_min_shift = 0;
202 int zfs_disable_dup_eviction = 0;
203 int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
204
205 /*
206 * Note that buffers can be in one of 6 states:
207 * ARC_anon - anonymous (discussed below)
208 * ARC_mru - recently used, currently cached
209 * ARC_mru_ghost	- recently used, no longer in cache
210 * ARC_mfu - frequently used, currently cached
211 * ARC_mfu_ghost - frequently used, no longer in cache
212 * ARC_l2c_only - exists in L2ARC but not other states
213 * When there are no active references to a buffer, it is linked
214 * onto a list in one of these arc states.  These are
215 * the only buffers that can be evicted or deleted. Within each
216 * state there are multiple lists, one for meta-data and one for
217 * non-meta-data. Meta-data (indirect blocks, blocks of dnodes,
218 * etc.) is tracked separately so that it can be managed more
219 * explicitly: favored over data, limited explicitly.
220 *
221 * Anonymous buffers are buffers that are not associated with
222 * a DVA. These are buffers that hold dirty block copies
223 * before they are written to stable storage. By definition,
224 * they are "ref'd" and are considered part of arc_mru
225 * that cannot be freed.  Generally, they will acquire a DVA
226 * as they are written and migrate onto the arc_mru list.
227 *
228 * The ARC_l2c_only state is for buffers that are in the second
229 * level ARC but no longer in any of the ARC_m* lists. The second
230 * level ARC itself may also contain buffers that are in any of
231 * the ARC_m* states - meaning that a buffer can exist in two
232 * places. The reason for the ARC_l2c_only state is to keep the
233 * buffer header in the hash table, so that reads that hit the
234 * second level ARC benefit from these fast lookups.
235 */
236
237 typedef struct arc_state {
238 list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */
239 uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
240 uint64_t arcs_size; /* total amount of data in this state */
241 kmutex_t arcs_mtx;
242 } arc_state_t;
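/*
 * For example (illustrative numbers): a clean, unreferenced 16K metadata
 * buffer cached in the MRU state is linked onto
 * ARC_mru.arcs_list[ARC_BUFC_METADATA], contributes 16K to both
 * ARC_mru.arcs_lsize[ARC_BUFC_METADATA] and ARC_mru.arcs_size, and remains
 * eligible for eviction until add_reference() pulls it off that list.
 */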
243
244 /* The 6 states: */
245 static arc_state_t ARC_anon;
246 static arc_state_t ARC_mru;
247 static arc_state_t ARC_mru_ghost;
248 static arc_state_t ARC_mfu;
249 static arc_state_t ARC_mfu_ghost;
250 static arc_state_t ARC_l2c_only;
251
252 typedef struct arc_stats {
253 kstat_named_t arcstat_hits;
254 kstat_named_t arcstat_misses;
255 kstat_named_t arcstat_demand_data_hits;
256 kstat_named_t arcstat_demand_data_misses;
257 kstat_named_t arcstat_demand_metadata_hits;
258 kstat_named_t arcstat_demand_metadata_misses;
259 kstat_named_t arcstat_prefetch_data_hits;
260 kstat_named_t arcstat_prefetch_data_misses;
261 kstat_named_t arcstat_prefetch_metadata_hits;
262 kstat_named_t arcstat_prefetch_metadata_misses;
263 kstat_named_t arcstat_mru_hits;
264 kstat_named_t arcstat_mru_ghost_hits;
265 kstat_named_t arcstat_mfu_hits;
266 kstat_named_t arcstat_mfu_ghost_hits;
267 kstat_named_t arcstat_deleted;
268 kstat_named_t arcstat_recycle_miss;
269 /*
270 * Number of buffers that could not be evicted because the hash lock
271 * was held by another thread. The lock may not necessarily be held
272 * by something using the same buffer, since hash locks are shared
273 * by multiple buffers.
274 */
275 kstat_named_t arcstat_mutex_miss;
276 /*
277 * Number of buffers skipped because they have I/O in progress, are
278 * indirect prefetch buffers that have not lived long enough, or are
279 * not from the spa we're trying to evict from.
280 */
281 kstat_named_t arcstat_evict_skip;
282 kstat_named_t arcstat_evict_l2_cached;
283 kstat_named_t arcstat_evict_l2_eligible;
284 kstat_named_t arcstat_evict_l2_ineligible;
285 kstat_named_t arcstat_hash_elements;
286 kstat_named_t arcstat_hash_elements_max;
287 kstat_named_t arcstat_hash_collisions;
288 kstat_named_t arcstat_hash_chains;
289 kstat_named_t arcstat_hash_chain_max;
290 kstat_named_t arcstat_p;
291 kstat_named_t arcstat_c;
292 kstat_named_t arcstat_c_min;
293 kstat_named_t arcstat_c_max;
294 kstat_named_t arcstat_size;
295 kstat_named_t arcstat_hdr_size;
296 kstat_named_t arcstat_data_size;
297 kstat_named_t arcstat_other_size;
298 kstat_named_t arcstat_l2_hits;
299 kstat_named_t arcstat_l2_misses;
300 kstat_named_t arcstat_l2_feeds;
301 kstat_named_t arcstat_l2_rw_clash;
302 kstat_named_t arcstat_l2_read_bytes;
303 kstat_named_t arcstat_l2_write_bytes;
304 kstat_named_t arcstat_l2_writes_sent;
305 kstat_named_t arcstat_l2_writes_done;
306 kstat_named_t arcstat_l2_writes_error;
307 kstat_named_t arcstat_l2_writes_hdr_miss;
308 kstat_named_t arcstat_l2_evict_lock_retry;
309 kstat_named_t arcstat_l2_evict_reading;
310 kstat_named_t arcstat_l2_free_on_write;
311 kstat_named_t arcstat_l2_abort_lowmem;
312 kstat_named_t arcstat_l2_cksum_bad;
313 kstat_named_t arcstat_l2_io_error;
314 kstat_named_t arcstat_l2_size;
315 kstat_named_t arcstat_l2_asize;
316 kstat_named_t arcstat_l2_hdr_size;
317 kstat_named_t arcstat_l2_compress_successes;
318 kstat_named_t arcstat_l2_compress_zeros;
319 kstat_named_t arcstat_l2_compress_failures;
320 kstat_named_t arcstat_memory_throttle_count;
321 kstat_named_t arcstat_duplicate_buffers;
322 kstat_named_t arcstat_duplicate_buffers_size;
323 kstat_named_t arcstat_duplicate_reads;
324 kstat_named_t arcstat_meta_used;
325 kstat_named_t arcstat_meta_limit;
326 kstat_named_t arcstat_meta_max;
327 } arc_stats_t;
328
329 static arc_stats_t arc_stats = {
330 { "hits", KSTAT_DATA_UINT64 },
331 { "misses", KSTAT_DATA_UINT64 },
332 { "demand_data_hits", KSTAT_DATA_UINT64 },
333 { "demand_data_misses", KSTAT_DATA_UINT64 },
334 { "demand_metadata_hits", KSTAT_DATA_UINT64 },
335 { "demand_metadata_misses", KSTAT_DATA_UINT64 },
336 { "prefetch_data_hits", KSTAT_DATA_UINT64 },
337 { "prefetch_data_misses", KSTAT_DATA_UINT64 },
338 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
339 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
340 { "mru_hits", KSTAT_DATA_UINT64 },
341 { "mru_ghost_hits", KSTAT_DATA_UINT64 },
342 { "mfu_hits", KSTAT_DATA_UINT64 },
343 { "mfu_ghost_hits", KSTAT_DATA_UINT64 },
344 { "deleted", KSTAT_DATA_UINT64 },
345 { "recycle_miss", KSTAT_DATA_UINT64 },
346 { "mutex_miss", KSTAT_DATA_UINT64 },
347 { "evict_skip", KSTAT_DATA_UINT64 },
348 { "evict_l2_cached", KSTAT_DATA_UINT64 },
349 { "evict_l2_eligible", KSTAT_DATA_UINT64 },
350 { "evict_l2_ineligible", KSTAT_DATA_UINT64 },
351 { "hash_elements", KSTAT_DATA_UINT64 },
352 { "hash_elements_max", KSTAT_DATA_UINT64 },
353 { "hash_collisions", KSTAT_DATA_UINT64 },
354 { "hash_chains", KSTAT_DATA_UINT64 },
355 { "hash_chain_max", KSTAT_DATA_UINT64 },
356 { "p", KSTAT_DATA_UINT64 },
357 { "c", KSTAT_DATA_UINT64 },
358 { "c_min", KSTAT_DATA_UINT64 },
359 { "c_max", KSTAT_DATA_UINT64 },
360 { "size", KSTAT_DATA_UINT64 },
361 { "hdr_size", KSTAT_DATA_UINT64 },
362 { "data_size", KSTAT_DATA_UINT64 },
363 { "other_size", KSTAT_DATA_UINT64 },
364 { "l2_hits", KSTAT_DATA_UINT64 },
365 { "l2_misses", KSTAT_DATA_UINT64 },
366 { "l2_feeds", KSTAT_DATA_UINT64 },
367 { "l2_rw_clash", KSTAT_DATA_UINT64 },
368 { "l2_read_bytes", KSTAT_DATA_UINT64 },
369 { "l2_write_bytes", KSTAT_DATA_UINT64 },
370 { "l2_writes_sent", KSTAT_DATA_UINT64 },
371 { "l2_writes_done", KSTAT_DATA_UINT64 },
372 { "l2_writes_error", KSTAT_DATA_UINT64 },
373 { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 },
374 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
375 { "l2_evict_reading", KSTAT_DATA_UINT64 },
376 { "l2_free_on_write", KSTAT_DATA_UINT64 },
377 { "l2_abort_lowmem", KSTAT_DATA_UINT64 },
378 { "l2_cksum_bad", KSTAT_DATA_UINT64 },
379 { "l2_io_error", KSTAT_DATA_UINT64 },
380 { "l2_size", KSTAT_DATA_UINT64 },
381 { "l2_asize", KSTAT_DATA_UINT64 },
382 { "l2_hdr_size", KSTAT_DATA_UINT64 },
383 { "l2_compress_successes", KSTAT_DATA_UINT64 },
384 { "l2_compress_zeros", KSTAT_DATA_UINT64 },
385 { "l2_compress_failures", KSTAT_DATA_UINT64 },
386 { "memory_throttle_count", KSTAT_DATA_UINT64 },
387 { "duplicate_buffers", KSTAT_DATA_UINT64 },
388 { "duplicate_buffers_size", KSTAT_DATA_UINT64 },
389 { "duplicate_reads", KSTAT_DATA_UINT64 },
390 { "arc_meta_used", KSTAT_DATA_UINT64 },
391 { "arc_meta_limit", KSTAT_DATA_UINT64 },
392 { "arc_meta_max", KSTAT_DATA_UINT64 }
393 };
394
395 #define ARCSTAT(stat) (arc_stats.stat.value.ui64)
396
397 #define ARCSTAT_INCR(stat, val) \
398 atomic_add_64(&arc_stats.stat.value.ui64, (val))
399
400 #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
401 #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)
402
403 #define ARCSTAT_MAX(stat, val) { \
404 uint64_t m; \
405 while ((val) > (m = arc_stats.stat.value.ui64) && \
406 (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
407 continue; \
408 }
409
410 #define ARCSTAT_MAXSTAT(stat) \
411 ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
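/*
 * ARCSTAT_MAX() maintains a running maximum without taking a lock: it
 * retries the compare-and-swap until either the stored value is already at
 * least val or the swap succeeds.  Typical usage, mirroring
 * buf_hash_insert() below:
 *
 *	ARCSTAT_MAX(arcstat_hash_chain_max, i);
 *	ARCSTAT_MAXSTAT(arcstat_hash_elements);
 */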
412
413 /*
414 * We define a macro to allow ARC hits/misses to be easily broken down by
415 * two separate conditions, giving a total of four different subtypes for
416 * each of hits and misses (so eight statistics total).
417 */
418 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
419 if (cond1) { \
420 if (cond2) { \
421 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
422 } else { \
423 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
424 } \
425 } else { \
426 if (cond2) { \
427 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
428 } else { \
429 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
430 } \
431 }
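/*
 * For example, the hit path classifies each hit as demand vs. prefetch and
 * data vs. metadata in a single statement (see arc_buf_add_ref() later in
 * this file):
 *
 *	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
 *	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
 *	    data, metadata, hits);
 */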
432
433 kstat_t *arc_ksp;
434 static arc_state_t *arc_anon;
435 static arc_state_t *arc_mru;
436 static arc_state_t *arc_mru_ghost;
437 static arc_state_t *arc_mfu;
438 static arc_state_t *arc_mfu_ghost;
439 static arc_state_t *arc_l2c_only;
440
441 /*
442 * There are several ARC variables that are critical to export as kstats --
443 * but we don't want to have to grovel around in the kstat whenever we wish to
444 * manipulate them. For these variables, we therefore define them to be in
445 * terms of the statistic variable. This assures that we are not introducing
446 * the possibility of inconsistency by having shadow copies of the variables,
447 * while still allowing the code to be readable.
448 */
449 #define arc_size ARCSTAT(arcstat_size) /* actual total arc size */
450 #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
451 #define arc_c ARCSTAT(arcstat_c) /* target size of cache */
452 #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
453 #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
454 #define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */
455 #define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */
456 #define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */
457
458 #define L2ARC_IS_VALID_COMPRESS(_c_) \
459 ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
460
461 static int arc_no_grow; /* Don't try to grow cache size */
462 static uint64_t arc_tempreserve;
463 static uint64_t arc_loaned_bytes;
464
465 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
466
467 typedef struct arc_callback arc_callback_t;
468
469 struct arc_callback {
470 void *acb_private;
471 arc_done_func_t *acb_done;
472 arc_buf_t *acb_buf;
473 zio_t *acb_zio_dummy;
474 arc_callback_t *acb_next;
475 };
476
477 typedef struct arc_write_callback arc_write_callback_t;
478
479 struct arc_write_callback {
480 void *awcb_private;
481 arc_done_func_t *awcb_ready;
482 arc_done_func_t *awcb_physdone;
483 arc_done_func_t *awcb_done;
484 arc_buf_t *awcb_buf;
485 };
486
487 struct arc_buf_hdr {
488 /* protected by hash lock */
489 dva_t b_dva;
490 uint64_t b_birth;
491 uint64_t b_cksum0;
492
493 kmutex_t b_freeze_lock;
494 zio_cksum_t *b_freeze_cksum;
495 void *b_thawed;
496
497 arc_buf_hdr_t *b_hash_next;
498 arc_buf_t *b_buf;
499 uint32_t b_flags;
500 uint32_t b_datacnt;
501
502 arc_callback_t *b_acb;
503 kcondvar_t b_cv;
504
505 /* immutable */
506 arc_buf_contents_t b_type;
507 uint64_t b_size;
508 uint64_t b_spa;
509
510 /* protected by arc state mutex */
511 arc_state_t *b_state;
512 list_node_t b_arc_node;
513
514 /* updated atomically */
515 clock_t b_arc_access;
516
517 /* self protecting */
518 refcount_t b_refcnt;
519
520 l2arc_buf_hdr_t *b_l2hdr;
521 list_node_t b_l2node;
522 };
523
524 static arc_buf_t *arc_eviction_list;
525 static kmutex_t arc_eviction_mtx;
526 static arc_buf_hdr_t arc_eviction_hdr;
527 static void arc_get_data_buf(arc_buf_t *buf);
528 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
529 static int arc_evict_needed(arc_buf_contents_t type);
530 static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
531 static void arc_buf_watch(arc_buf_t *buf);
532
533 static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
534
535 #define GHOST_STATE(state) \
536 ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
537 (state) == arc_l2c_only)
538
539 /*
540 * Private ARC flags.  These are ARC-internal flags that show up
541 * in b_flags in the arc_buf_hdr_t.  Some flags are publicly declared, and can
542 * be passed in as arc_flags in things like arc_read. However, these flags
543 * should never be passed and should only be set by ARC code. When adding new
544 * public flags, make sure not to smash the private ones.
545 */
546
547 #define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */
548 #define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */
549 #define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */
550 #define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */
551 #define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */
552 #define ARC_INDIRECT (1 << 14) /* this is an indirect block */
553 #define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */
554 #define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */
555 #define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */
556 #define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */
557
558 #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE)
559 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
560 #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR)
561 #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH)
562 #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ)
563 #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE)
564 #define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
565 #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE)
566 #define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \
567 (hdr)->b_l2hdr != NULL)
568 #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING)
569 #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED)
570 #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD)
571
572 /*
573 * Other sizes
574 */
575
576 #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
577 #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
578
579 /*
580 * Hash table routines
581 */
582
583 struct ht_table {
584 arc_buf_hdr_t *hdr;
585 kmutex_t lock;
586 };
587
588 typedef struct buf_hash_table {
589 uint64_t ht_mask;
590 struct ht_table *ht_table;
591 } buf_hash_table_t;
592
593 #pragma align 64(buf_hash_table)
594 static buf_hash_table_t buf_hash_table;
595
596 #define BUF_HASH_INDEX(spa, dva, birth) \
597 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
598 #define BUF_HASH_LOCK(idx) (&buf_hash_table.ht_table[idx].lock)
599 #define HDR_LOCK(hdr) \
600 (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
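/*
 * A header's hash lock can be (re)derived from its identity alone, so code
 * that already holds an arc_buf_t can lock the header without walking the
 * table, e.g. (as in arc_buf_freeze() below):
 *
 *	hash_lock = HDR_LOCK(buf->b_hdr);
 *	mutex_enter(hash_lock);
 */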
601
602 uint64_t zfs_crc64_table[256];
603
604 /*
605 * Level 2 ARC
606 */
607
608 #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */
609 #define L2ARC_HEADROOM 2 /* num of writes */
610 /*
611  * If we discover during an ARC scan any buffers to be compressed, we boost
612  * our headroom for the next scanning cycle by this percentage multiple.
613 */
614 #define L2ARC_HEADROOM_BOOST 200
615 #define L2ARC_FEED_SECS 1 /* caching interval secs */
616 #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */
617
618 #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
619 #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
620
621 /* L2ARC Performance Tunables */
622 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */
623 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
624 uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */
625 uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
626 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
627 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
628 boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
629 boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */
630 boolean_t l2arc_norw = B_TRUE; /* no reads during writes */
631
632 /*
633 * L2ARC Internals
634 */
635 typedef struct l2arc_dev {
636 vdev_t *l2ad_vdev; /* vdev */
637 spa_t *l2ad_spa; /* spa */
638 uint64_t l2ad_hand; /* next write location */
639 uint64_t l2ad_start; /* first addr on device */
640 uint64_t l2ad_end; /* last addr on device */
641 uint64_t l2ad_evict; /* last addr eviction reached */
642 boolean_t l2ad_first; /* first sweep through */
643 boolean_t l2ad_writing; /* currently writing */
644 list_t *l2ad_buflist; /* buffer list */
645 list_node_t l2ad_node; /* device list node */
646 } l2arc_dev_t;
647
648 static list_t L2ARC_dev_list; /* device list */
649 static list_t *l2arc_dev_list; /* device list pointer */
650 static kmutex_t l2arc_dev_mtx; /* device list mutex */
651 static l2arc_dev_t *l2arc_dev_last; /* last device used */
652 static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */
653 static list_t L2ARC_free_on_write; /* free after write buf list */
654 static list_t *l2arc_free_on_write; /* free after write list ptr */
655 static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
656 static uint64_t l2arc_ndev; /* number of devices */
657
658 typedef struct l2arc_read_callback {
659 arc_buf_t *l2rcb_buf; /* read buffer */
660 spa_t *l2rcb_spa; /* spa */
661 blkptr_t l2rcb_bp; /* original blkptr */
662 zbookmark_phys_t l2rcb_zb; /* original bookmark */
663 int l2rcb_flags; /* original flags */
664 enum zio_compress l2rcb_compress; /* applied compress */
665 } l2arc_read_callback_t;
666
667 typedef struct l2arc_write_callback {
668 l2arc_dev_t *l2wcb_dev; /* device info */
669 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
670 } l2arc_write_callback_t;
671
672 struct l2arc_buf_hdr {
673 /* protected by arc_buf_hdr mutex */
674 l2arc_dev_t *b_dev; /* L2ARC device */
675 uint64_t b_daddr; /* disk address, offset byte */
676 /* compression applied to buffer data */
677 enum zio_compress b_compress;
678 /* real alloc'd buffer size depending on b_compress applied */
679 int b_asize;
680 /* temporary buffer holder for in-flight compressed data */
681 void *b_tmp_cdata;
682 };
683
684 typedef struct l2arc_data_free {
685 /* protected by l2arc_free_on_write_mtx */
686 void *l2df_data;
687 size_t l2df_size;
688 void (*l2df_func)(void *, size_t);
689 list_node_t l2df_list_node;
690 } l2arc_data_free_t;
691
692 static kmutex_t l2arc_feed_thr_lock;
693 static kcondvar_t l2arc_feed_thr_cv;
694 static uint8_t l2arc_thread_exit;
695
696 static void l2arc_read_done(zio_t *zio);
697 static void l2arc_hdr_stat_add(void);
698 static void l2arc_hdr_stat_remove(void);
699
700 static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
701 static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
702 enum zio_compress c);
703 static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
704
705 static uint64_t
706 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
707 {
708 uint8_t *vdva = (uint8_t *)dva;
709 uint64_t crc = -1ULL;
710 int i;
711
712 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
713
714 for (i = 0; i < sizeof (dva_t); i++)
715 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
716
717 crc ^= (spa>>8) ^ birth;
718
719 return (crc);
720 }
721
722 #define BUF_EMPTY(buf) \
723 ((buf)->b_dva.dva_word[0] == 0 && \
724 (buf)->b_dva.dva_word[1] == 0 && \
725 (buf)->b_cksum0 == 0)
726
727 #define BUF_EQUAL(spa, dva, birth, buf) \
728 ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
729 ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
730 ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
731
732 static void
733 buf_discard_identity(arc_buf_hdr_t *hdr)
734 {
735 hdr->b_dva.dva_word[0] = 0;
736 hdr->b_dva.dva_word[1] = 0;
737 hdr->b_birth = 0;
738 hdr->b_cksum0 = 0;
739 }
740
741 static arc_buf_hdr_t *
742 buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
743 {
744 const dva_t *dva = BP_IDENTITY(bp);
745 uint64_t birth = BP_PHYSICAL_BIRTH(bp);
746 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
747 kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
748 arc_buf_hdr_t *buf;
749
750 mutex_enter(hash_lock);
751 for (buf = buf_hash_table.ht_table[idx].hdr; buf != NULL;
752 buf = buf->b_hash_next) {
753 if (BUF_EQUAL(spa, dva, birth, buf)) {
754 *lockp = hash_lock;
755 return (buf);
756 }
757 }
758 mutex_exit(hash_lock);
759 *lockp = NULL;
760 return (NULL);
761 }
762
763 /*
764 * Insert an entry into the hash table. If there is already an element
765 * equal to elem in the hash table, then the already existing element
766 * will be returned and the new element will not be inserted.
767 * Otherwise returns NULL.
768 */
769 static arc_buf_hdr_t *
770 buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
771 {
772 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
773 kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
774 arc_buf_hdr_t *fbuf;
775 uint32_t i;
776
777 ASSERT(!DVA_IS_EMPTY(&buf->b_dva));
778 ASSERT(buf->b_birth != 0);
779 ASSERT(!HDR_IN_HASH_TABLE(buf));
780 *lockp = hash_lock;
781 mutex_enter(hash_lock);
782 for (fbuf = buf_hash_table.ht_table[idx].hdr, i = 0; fbuf != NULL;
783 fbuf = fbuf->b_hash_next, i++) {
784 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
785 return (fbuf);
786 }
787
788 buf->b_hash_next = buf_hash_table.ht_table[idx].hdr;
789 buf_hash_table.ht_table[idx].hdr = buf;
790 buf->b_flags |= ARC_IN_HASH_TABLE;
791
792 /* collect some hash table performance data */
793 if (i > 0) {
794 ARCSTAT_BUMP(arcstat_hash_collisions);
795 if (i == 1)
796 ARCSTAT_BUMP(arcstat_hash_chains);
797
798 ARCSTAT_MAX(arcstat_hash_chain_max, i);
799 }
800
801 ARCSTAT_BUMP(arcstat_hash_elements);
802 ARCSTAT_MAXSTAT(arcstat_hash_elements);
803
804 return (NULL);
805 }
806
807 static void
808 buf_hash_remove(arc_buf_hdr_t *buf)
809 {
810 arc_buf_hdr_t *fbuf, **bufp;
811 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
812
813 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
814 ASSERT(HDR_IN_HASH_TABLE(buf));
815
816 bufp = &buf_hash_table.ht_table[idx].hdr;
817 while ((fbuf = *bufp) != buf) {
818 ASSERT(fbuf != NULL);
819 bufp = &fbuf->b_hash_next;
820 }
821 *bufp = buf->b_hash_next;
822 buf->b_hash_next = NULL;
823 buf->b_flags &= ~ARC_IN_HASH_TABLE;
824
825 /* collect some hash table performance data */
826 ARCSTAT_BUMPDOWN(arcstat_hash_elements);
827
828 if (buf_hash_table.ht_table[idx].hdr &&
829 buf_hash_table.ht_table[idx].hdr->b_hash_next == NULL)
830 ARCSTAT_BUMPDOWN(arcstat_hash_chains);
831 }
832
833 /*
834 * Global data structures and functions for the buf kmem cache.
835 */
836 static kmem_cache_t *hdr_cache;
837 static kmem_cache_t *buf_cache;
838
839 static void
840 buf_fini(void)
841 {
842 int i;
843
844 for (i = 0; i < buf_hash_table.ht_mask + 1; i++)
845 mutex_destroy(&buf_hash_table.ht_table[i].lock);
846 kmem_free(buf_hash_table.ht_table,
847 (buf_hash_table.ht_mask + 1) * sizeof (struct ht_table));
848 kmem_cache_destroy(hdr_cache);
849 kmem_cache_destroy(buf_cache);
850 }
851
852 /*
853 * Constructor callback - called when the cache is empty
854 * and a new buf is requested.
855 */
856 /* ARGSUSED */
857 static int
858 hdr_cons(void *vbuf, void *unused, int kmflag)
859 {
860 arc_buf_hdr_t *buf = vbuf;
861
862 bzero(buf, sizeof (arc_buf_hdr_t));
863 refcount_create(&buf->b_refcnt);
864 cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
865 mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
866 arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
867
868 return (0);
869 }
870
871 /* ARGSUSED */
872 static int
873 buf_cons(void *vbuf, void *unused, int kmflag)
874 {
875 arc_buf_t *buf = vbuf;
876
877 bzero(buf, sizeof (arc_buf_t));
878 mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
879 arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
880
881 return (0);
882 }
883
884 /*
885 * Destructor callback - called when a cached buf is
886 * no longer required.
887 */
888 /* ARGSUSED */
889 static void
890 hdr_dest(void *vbuf, void *unused)
891 {
892 arc_buf_hdr_t *buf = vbuf;
893
894 ASSERT(BUF_EMPTY(buf));
895 refcount_destroy(&buf->b_refcnt);
896 cv_destroy(&buf->b_cv);
897 mutex_destroy(&buf->b_freeze_lock);
898 arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
899 }
900
901 /* ARGSUSED */
902 static void
903 buf_dest(void *vbuf, void *unused)
904 {
905 arc_buf_t *buf = vbuf;
906
907 mutex_destroy(&buf->b_evict_lock);
908 arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
909 }
910
911 /*
912 * Reclaim callback -- invoked when memory is low.
913 */
914 /* ARGSUSED */
915 static void
916 hdr_recl(void *unused)
917 {
918 dprintf("hdr_recl called\n");
919 /*
920 * umem calls the reclaim func when we destroy the buf cache,
921 * which is after we do arc_fini().
922 */
923 if (!arc_dead)
924 cv_signal(&arc_reclaim_thr_cv);
925 }
926
927 static void
928 buf_init(void)
929 {
930 uint64_t *ct;
931 uint64_t hsize = 1ULL << 12;
932 int i, j;
933
934 /*
935 * The hash table is big enough to fill all of physical memory
936 * with an average block size of zfs_arc_average_blocksize (default 8K).
937  * By default, the table will take up approximately
938  * totalmem * sizeof (struct ht_table) / 8K (one lock-protected bucket per 8K).
939 */
940 while (hsize * zfs_arc_average_blocksize < physmem * PAGESIZE)
941 hsize <<= 1;
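	/*
	 * Worked example (assuming the default 8K average block size): with
	 * 8 GB of physical memory the loop above settles on hsize = 2^20,
	 * since 2^20 * 8K = 8 GB, so the table costs
	 * 2^20 * sizeof (struct ht_table) bytes (e.g. 16 MB if a bucket
	 * works out to 16 bytes).
	 */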
942 retry:
943 buf_hash_table.ht_mask = hsize - 1;
944 buf_hash_table.ht_table =
945 kmem_zalloc(hsize * sizeof (struct ht_table), KM_NOSLEEP);
946 if (buf_hash_table.ht_table == NULL) {
947 ASSERT(hsize > (1ULL << 8));
948 hsize >>= 1;
949 goto retry;
950 }
951
952 hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
953 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
954 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
955 0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
956
957 for (i = 0; i < 256; i++)
958 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
959 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
960
961 for (i = 0; i < hsize; i++) {
962 mutex_init(&buf_hash_table.ht_table[i].lock,
963 NULL, MUTEX_DEFAULT, NULL);
964 }
965 }
966
967 #define ARC_MINTIME (hz>>4) /* 62 ms */
968
969 static void
970 arc_cksum_verify(arc_buf_t *buf)
971 {
972 zio_cksum_t zc;
973
974 if (!(zfs_flags & ZFS_DEBUG_MODIFY))
975 return;
976
977 mutex_enter(&buf->b_hdr->b_freeze_lock);
978 if (buf->b_hdr->b_freeze_cksum == NULL ||
979 (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
980 mutex_exit(&buf->b_hdr->b_freeze_lock);
981 return;
982 }
983 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
984 if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
985 panic("buffer modified while frozen!");
986 mutex_exit(&buf->b_hdr->b_freeze_lock);
987 }
988
989 static int
990 arc_cksum_equal(arc_buf_t *buf)
991 {
992 zio_cksum_t zc;
993 int equal;
994
995 mutex_enter(&buf->b_hdr->b_freeze_lock);
996 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
997 equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
998 mutex_exit(&buf->b_hdr->b_freeze_lock);
999
1000 return (equal);
1001 }
1002
1003 static void
1004 arc_cksum_compute(arc_buf_t *buf, boolean_t force)
1005 {
1006 if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
1007 return;
1008
1009 mutex_enter(&buf->b_hdr->b_freeze_lock);
1010 if (buf->b_hdr->b_freeze_cksum != NULL) {
1011 mutex_exit(&buf->b_hdr->b_freeze_lock);
1012 return;
1013 }
1014 buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
1015 fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
1016 buf->b_hdr->b_freeze_cksum);
1017 mutex_exit(&buf->b_hdr->b_freeze_lock);
1018 arc_buf_watch(buf);
1019 }
1020
1021 #ifndef _KERNEL
1022 typedef struct procctl {
1023 long cmd;
1024 prwatch_t prwatch;
1025 } procctl_t;
1026 #endif
1027
1028 /* ARGSUSED */
1029 static void
1030 arc_buf_unwatch(arc_buf_t *buf)
1031 {
1032 #ifndef _KERNEL
1033 if (arc_watch) {
1034 int result;
1035 procctl_t ctl;
1036 ctl.cmd = PCWATCH;
1037 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1038 ctl.prwatch.pr_size = 0;
1039 ctl.prwatch.pr_wflags = 0;
1040 result = write(arc_procfd, &ctl, sizeof (ctl));
1041 ASSERT3U(result, ==, sizeof (ctl));
1042 }
1043 #endif
1044 }
1045
1046 /* ARGSUSED */
1047 static void
1048 arc_buf_watch(arc_buf_t *buf)
1049 {
1050 #ifndef _KERNEL
1051 if (arc_watch) {
1052 int result;
1053 procctl_t ctl;
1054 ctl.cmd = PCWATCH;
1055 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1056 ctl.prwatch.pr_size = buf->b_hdr->b_size;
1057 ctl.prwatch.pr_wflags = WA_WRITE;
1058 result = write(arc_procfd, &ctl, sizeof (ctl));
1059 ASSERT3U(result, ==, sizeof (ctl));
1060 }
1061 #endif
1062 }
1063
1064 void
1065 arc_buf_thaw(arc_buf_t *buf)
1066 {
1067 if (zfs_flags & ZFS_DEBUG_MODIFY) {
1068 if (buf->b_hdr->b_state != arc_anon)
1069 panic("modifying non-anon buffer!");
1070 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
1071 panic("modifying buffer while i/o in progress!");
1072 arc_cksum_verify(buf);
1073 }
1074
1075 mutex_enter(&buf->b_hdr->b_freeze_lock);
1076 if (buf->b_hdr->b_freeze_cksum != NULL) {
1077 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1078 buf->b_hdr->b_freeze_cksum = NULL;
1079 }
1080
1081 if (zfs_flags & ZFS_DEBUG_MODIFY) {
1082 if (buf->b_hdr->b_thawed)
1083 kmem_free(buf->b_hdr->b_thawed, 1);
1084 buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
1085 }
1086
1087 mutex_exit(&buf->b_hdr->b_freeze_lock);
1088
1089 arc_buf_unwatch(buf);
1090 }
1091
1092 void
1093 arc_buf_freeze(arc_buf_t *buf)
1094 {
1095 kmutex_t *hash_lock;
1096
1097 if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1098 return;
1099
1100 hash_lock = HDR_LOCK(buf->b_hdr);
1101 mutex_enter(hash_lock);
1102
1103 ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1104 buf->b_hdr->b_state == arc_anon);
1105 arc_cksum_compute(buf, B_FALSE);
1106 mutex_exit(hash_lock);
1107
1108 }
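/*
 * Illustrative ZFS_DEBUG_MODIFY flow for the freeze/thaw machinery above
 * (a sketch of the intended usage, not code taken from any caller):
 *
 *	arc_buf_freeze(buf);	computes and stashes a fletcher-2 checksum
 *	...			modifying b_data is now illegal;
 *				arc_cksum_verify() panics with
 *				"buffer modified while frozen!" if it happened
 *	arc_buf_thaw(buf);	discards the checksum before legitimate writes
 */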
1109
1110 static void
1111 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1112 {
1113 ASSERT(MUTEX_HELD(hash_lock));
1114
1115 if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
1116 (ab->b_state != arc_anon)) {
1117 uint64_t delta = ab->b_size * ab->b_datacnt;
1118 list_t *list = &ab->b_state->arcs_list[ab->b_type];
1119 uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
1120
1121 ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
1122 mutex_enter(&ab->b_state->arcs_mtx);
1123 ASSERT(list_link_active(&ab->b_arc_node));
1124 list_remove(list, ab);
1125 if (GHOST_STATE(ab->b_state)) {
1126 ASSERT0(ab->b_datacnt);
1127 ASSERT3P(ab->b_buf, ==, NULL);
1128 delta = ab->b_size;
1129 }
1130 ASSERT(delta > 0);
1131 ASSERT3U(*size, >=, delta);
1132 atomic_add_64(size, -delta);
1133 mutex_exit(&ab->b_state->arcs_mtx);
1134 /* remove the prefetch flag if we get a reference */
1135 if (ab->b_flags & ARC_PREFETCH)
1136 ab->b_flags &= ~ARC_PREFETCH;
1137 }
1138 }
1139
1140 static int
1141 remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1142 {
1143 int cnt;
1144 arc_state_t *state = ab->b_state;
1145
1146 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1147 ASSERT(!GHOST_STATE(state));
1148
1149 if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
1150 (state != arc_anon)) {
1151 uint64_t *size = &state->arcs_lsize[ab->b_type];
1152
1153 ASSERT(!MUTEX_HELD(&state->arcs_mtx));
1154 mutex_enter(&state->arcs_mtx);
1155 ASSERT(!list_link_active(&ab->b_arc_node));
1156 list_insert_head(&state->arcs_list[ab->b_type], ab);
1157 ASSERT(ab->b_datacnt > 0);
1158 atomic_add_64(size, ab->b_size * ab->b_datacnt);
1159 mutex_exit(&state->arcs_mtx);
1160 }
1161 return (cnt);
1162 }
1163
1164 /*
1165 * Move the supplied buffer to the indicated state. The mutex
1166 * for the buffer must be held by the caller.
1167 */
1168 static void
1169 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
1170 {
1171 arc_state_t *old_state = ab->b_state;
1172 int64_t refcnt = refcount_count(&ab->b_refcnt);
1173 uint64_t from_delta, to_delta;
1174
1175 ASSERT(MUTEX_HELD(hash_lock));
1176 ASSERT3P(new_state, !=, old_state);
1177 ASSERT(refcnt == 0 || ab->b_datacnt > 0);
1178 ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
1179 ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
1180
1181 from_delta = to_delta = ab->b_datacnt * ab->b_size;
1182
1183 /*
1184 * If this buffer is evictable, transfer it from the
1185 * old state list to the new state list.
1186 */
1187 if (refcnt == 0) {
1188 if (old_state != arc_anon) {
1189 int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
1190 uint64_t *size = &old_state->arcs_lsize[ab->b_type];
1191
1192 if (use_mutex)
1193 mutex_enter(&old_state->arcs_mtx);
1194
1195 ASSERT(list_link_active(&ab->b_arc_node));
1196 list_remove(&old_state->arcs_list[ab->b_type], ab);
1197
1198 /*
1199 * If prefetching out of the ghost cache,
1200 * we will have a non-zero datacnt.
1201 */
1202 if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
1203 /* ghost elements have a ghost size */
1204 ASSERT(ab->b_buf == NULL);
1205 from_delta = ab->b_size;
1206 }
1207 ASSERT3U(*size, >=, from_delta);
1208 atomic_add_64(size, -from_delta);
1209
1210 if (use_mutex)
1211 mutex_exit(&old_state->arcs_mtx);
1212 }
1213 if (new_state != arc_anon) {
1214 int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
1215 uint64_t *size = &new_state->arcs_lsize[ab->b_type];
1216
1217 if (use_mutex)
1218 mutex_enter(&new_state->arcs_mtx);
1219
1220 list_insert_head(&new_state->arcs_list[ab->b_type], ab);
1221
1222 /* ghost elements have a ghost size */
1223 if (GHOST_STATE(new_state)) {
1224 ASSERT(ab->b_datacnt == 0);
1225 ASSERT(ab->b_buf == NULL);
1226 to_delta = ab->b_size;
1227 }
1228 atomic_add_64(size, to_delta);
1229
1230 if (use_mutex)
1231 mutex_exit(&new_state->arcs_mtx);
1232 }
1233 }
1234
1235 ASSERT(!BUF_EMPTY(ab));
1236 if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
1237 buf_hash_remove(ab);
1238
1239 /* adjust state sizes */
1240 if (to_delta)
1241 atomic_add_64(&new_state->arcs_size, to_delta);
1242 if (from_delta) {
1243 ASSERT3U(old_state->arcs_size, >=, from_delta);
1244 atomic_add_64(&old_state->arcs_size, -from_delta);
1245 }
1246 ab->b_state = new_state;
1247
1248 /* adjust l2arc hdr stats */
1249 if (new_state == arc_l2c_only)
1250 l2arc_hdr_stat_add();
1251 else if (old_state == arc_l2c_only)
1252 l2arc_hdr_stat_remove();
1253 }
1254
1255 void
1256 arc_space_consume(uint64_t space, arc_space_type_t type)
1257 {
1258 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1259
1260 switch (type) {
1261 case ARC_SPACE_DATA:
1262 ARCSTAT_INCR(arcstat_data_size, space);
1263 break;
1264 case ARC_SPACE_OTHER:
1265 ARCSTAT_INCR(arcstat_other_size, space);
1266 break;
1267 case ARC_SPACE_HDRS:
1268 ARCSTAT_INCR(arcstat_hdr_size, space);
1269 break;
1270 case ARC_SPACE_L2HDRS:
1271 ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1272 break;
1273 }
1274
1275 ARCSTAT_INCR(arcstat_meta_used, space);
1276 atomic_add_64(&arc_size, space);
1277 }
1278
1279 void
1280 arc_space_return(uint64_t space, arc_space_type_t type)
1281 {
1282 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1283
1284 switch (type) {
1285 case ARC_SPACE_DATA:
1286 ARCSTAT_INCR(arcstat_data_size, -space);
1287 break;
1288 case ARC_SPACE_OTHER:
1289 ARCSTAT_INCR(arcstat_other_size, -space);
1290 break;
1291 case ARC_SPACE_HDRS:
1292 ARCSTAT_INCR(arcstat_hdr_size, -space);
1293 break;
1294 case ARC_SPACE_L2HDRS:
1295 ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1296 break;
1297 }
1298
1299 ASSERT(arc_meta_used >= space);
1300 if (arc_meta_max < arc_meta_used)
1301 arc_meta_max = arc_meta_used;
1302 ARCSTAT_INCR(arcstat_meta_used, -space);
1303 ASSERT(arc_size >= space);
1304 atomic_add_64(&arc_size, -space);
1305 }
1306
1307 void *
1308 arc_data_buf_alloc(uint64_t size)
1309 {
1310 if (arc_evict_needed(ARC_BUFC_DATA))
1311 cv_signal(&arc_reclaim_thr_cv);
1312 atomic_add_64(&arc_size, size);
1313 return (zio_data_buf_alloc(size));
1314 }
1315
1316 void
1317 arc_data_buf_free(void *buf, uint64_t size)
1318 {
1319 zio_data_buf_free(buf, size);
1320 ASSERT(arc_size >= size);
1321 atomic_add_64(&arc_size, -size);
1322 }
1323
1324 arc_buf_t *
1325 arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
1326 {
1327 arc_buf_hdr_t *hdr;
1328 arc_buf_t *buf;
1329
1330 ASSERT3U(size, >, 0);
1331 hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1332 ASSERT(BUF_EMPTY(hdr));
1333 hdr->b_size = size;
1334 hdr->b_type = type;
1335 hdr->b_spa = spa_load_guid(spa);
1336 hdr->b_state = arc_anon;
1337 hdr->b_arc_access = 0;
1338 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1339 buf->b_hdr = hdr;
1340 buf->b_data = NULL;
1341 buf->b_efunc = NULL;
1342 buf->b_private = NULL;
1343 buf->b_next = NULL;
1344 hdr->b_buf = buf;
1345 arc_get_data_buf(buf);
1346 hdr->b_datacnt = 1;
1347 hdr->b_flags = 0;
1348 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1349 (void) refcount_add(&hdr->b_refcnt, tag);
1350
1351 return (buf);
1352 }
1353
1354 static char *arc_onloan_tag = "onloan";
1355
1356 /*
1357 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1358 * flight data by arc_tempreserve_space() until they are "returned". Loaned
1359 * buffers must be returned to the arc before they can be used by the DMU or
1360 * freed.
1361 */
1362 arc_buf_t *
1363 arc_loan_buf(spa_t *spa, int size)
1364 {
1365 arc_buf_t *buf;
1366
1367 buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1368
1369 atomic_add_64(&arc_loaned_bytes, size);
1370 return (buf);
1371 }
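/*
 * A minimal sketch of the loan pattern (the fill_data() helper and dmu_tag
 * are hypothetical):
 *
 *	arc_buf_t *buf = arc_loan_buf(spa, size);
 *	fill_data(buf->b_data, size);
 *	arc_return_buf(buf, dmu_tag);
 *
 * Until arc_return_buf() is called, the buffer is tracked in
 * arc_loaned_bytes and is not counted as in-flight dirty data by
 * arc_tempreserve_space().
 */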
1372
1373 /*
1374 * Return a loaned arc buffer to the arc.
1375 */
1376 void
1377 arc_return_buf(arc_buf_t *buf, void *tag)
1378 {
1379 arc_buf_hdr_t *hdr = buf->b_hdr;
1380
1381 ASSERT(buf->b_data != NULL);
1382 (void) refcount_add(&hdr->b_refcnt, tag);
1383 (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
1384
1385 atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1386 }
1387
1388 /* Detach an arc_buf from a dbuf (tag) */
1389 void
1390 arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1391 {
1392 arc_buf_hdr_t *hdr;
1393
1394 ASSERT(buf->b_data != NULL);
1395 hdr = buf->b_hdr;
1396 (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
1397 (void) refcount_remove(&hdr->b_refcnt, tag);
1398 buf->b_efunc = NULL;
1399 buf->b_private = NULL;
1400
1401 atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1402 }
1403
1404 static arc_buf_t *
1405 arc_buf_clone(arc_buf_t *from)
1406 {
1407 arc_buf_t *buf;
1408 arc_buf_hdr_t *hdr = from->b_hdr;
1409 uint64_t size = hdr->b_size;
1410
1411 ASSERT(hdr->b_state != arc_anon);
1412
1413 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1414 buf->b_hdr = hdr;
1415 buf->b_data = NULL;
1416 buf->b_efunc = NULL;
1417 buf->b_private = NULL;
1418 buf->b_next = hdr->b_buf;
1419 hdr->b_buf = buf;
1420 arc_get_data_buf(buf);
1421 bcopy(from->b_data, buf->b_data, size);
1422
1423 /*
1424 * This buffer already exists in the arc so create a duplicate
1425 * copy for the caller. If the buffer is associated with user data
1426 * then track the size and number of duplicates. These stats will be
1427 * updated as duplicate buffers are created and destroyed.
1428 */
1429 if (hdr->b_type == ARC_BUFC_DATA) {
1430 ARCSTAT_BUMP(arcstat_duplicate_buffers);
1431 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
1432 }
1433 hdr->b_datacnt += 1;
1434 return (buf);
1435 }
1436
1437 void
1438 arc_buf_add_ref(arc_buf_t *buf, void* tag)
1439 {
1440 arc_buf_hdr_t *hdr;
1441 kmutex_t *hash_lock;
1442
1443 /*
1444 * Check to see if this buffer is evicted. Callers
1445 * must verify b_data != NULL to know if the add_ref
1446 * was successful.
1447 */
1448 mutex_enter(&buf->b_evict_lock);
1449 if (buf->b_data == NULL) {
1450 mutex_exit(&buf->b_evict_lock);
1451 return;
1452 }
1453 hash_lock = HDR_LOCK(buf->b_hdr);
1454 mutex_enter(hash_lock);
1455 hdr = buf->b_hdr;
1456 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1457 mutex_exit(&buf->b_evict_lock);
1458
1459 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
1460 add_reference(hdr, hash_lock, tag);
1461 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1462 arc_access(hdr, hash_lock);
1463 mutex_exit(hash_lock);
1464 ARCSTAT_BUMP(arcstat_hits);
1465 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
1466 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
1467 data, metadata, hits);
1468 }
1469
1470 /*
1471 * Free the arc data buffer. If it is an l2arc write in progress,
1472 * the buffer is placed on l2arc_free_on_write to be freed later.
1473 */
1474 static void
1475 arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
1476 {
1477 arc_buf_hdr_t *hdr = buf->b_hdr;
1478
1479 if (HDR_L2_WRITING(hdr)) {
1480 l2arc_data_free_t *df;
1481 df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
1482 df->l2df_data = buf->b_data;
1483 df->l2df_size = hdr->b_size;
1484 df->l2df_func = free_func;
1485 mutex_enter(&l2arc_free_on_write_mtx);
1486 list_insert_head(l2arc_free_on_write, df);
1487 mutex_exit(&l2arc_free_on_write_mtx);
1488 ARCSTAT_BUMP(arcstat_l2_free_on_write);
1489 } else {
1490 free_func(buf->b_data, hdr->b_size);
1491 }
1492 }
1493
1494 /*
1495 * Free up buf->b_data and if 'remove' is set, then pull the
1496  * arc_buf_t off of the arc_buf_hdr_t's list and free it.
1497 */
1498 static void
1499 arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove)
1500 {
1501 arc_buf_t **bufp;
1502
1503 /* free up data associated with the buf */
1504 if (buf->b_data) {
1505 arc_state_t *state = buf->b_hdr->b_state;
1506 uint64_t size = buf->b_hdr->b_size;
1507 arc_buf_contents_t type = buf->b_hdr->b_type;
1508
1509 arc_cksum_verify(buf);
1510 arc_buf_unwatch(buf);
1511
1512 if (!recycle) {
1513 if (type == ARC_BUFC_METADATA) {
1514 arc_buf_data_free(buf, zio_buf_free);
1515 arc_space_return(size, ARC_SPACE_DATA);
1516 } else {
1517 ASSERT(type == ARC_BUFC_DATA);
1518 arc_buf_data_free(buf, zio_data_buf_free);
1519 ARCSTAT_INCR(arcstat_data_size, -size);
1520 atomic_add_64(&arc_size, -size);
1521 }
1522 }
1523 if (list_link_active(&buf->b_hdr->b_arc_node)) {
1524 uint64_t *cnt = &state->arcs_lsize[type];
1525
1526 ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
1527 ASSERT(state != arc_anon);
1528
1529 ASSERT3U(*cnt, >=, size);
1530 atomic_add_64(cnt, -size);
1531 }
1532 ASSERT3U(state->arcs_size, >=, size);
1533 atomic_add_64(&state->arcs_size, -size);
1534 buf->b_data = NULL;
1535
1536 /*
1537 * If we're destroying a duplicate buffer make sure
1538 * that the appropriate statistics are updated.
1539 */
1540 if (buf->b_hdr->b_datacnt > 1 &&
1541 buf->b_hdr->b_type == ARC_BUFC_DATA) {
1542 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
1543 ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
1544 }
1545 ASSERT(buf->b_hdr->b_datacnt > 0);
1546 buf->b_hdr->b_datacnt -= 1;
1547 }
1548
1549 /* only remove the buf if requested */
1550 if (!remove)
1551 return;
1552
1553 /* remove the buf from the hdr list */
1554 for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
1555 continue;
1556 *bufp = buf->b_next;
1557 buf->b_next = NULL;
1558
1559 ASSERT(buf->b_efunc == NULL);
1560
1561 /* clean up the buf */
1562 buf->b_hdr = NULL;
1563 kmem_cache_free(buf_cache, buf);
1564 }
1565
1566 static void
1567 arc_hdr_destroy(arc_buf_hdr_t *hdr)
1568 {
1569 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1570 ASSERT3P(hdr->b_state, ==, arc_anon);
1571 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1572 l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1573
1574 if (l2hdr != NULL) {
1575 boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
1576 /*
1577 * To prevent arc_free() and l2arc_evict() from
1578 * attempting to free the same buffer at the same time,
1579 * a FREE_IN_PROGRESS flag is given to arc_free() to
1580 * give it priority. l2arc_evict() can't destroy this
1581 * header while we are waiting on l2arc_buflist_mtx.
1582 *
1583 * The hdr may be removed from l2ad_buflist before we
1584 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1585 */
1586 if (!buflist_held) {
1587 mutex_enter(&l2arc_buflist_mtx);
1588 l2hdr = hdr->b_l2hdr;
1589 }
1590
1591 if (l2hdr != NULL) {
1592 list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
1593 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1594 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
1595 if (l2hdr->b_dev->l2ad_vdev)
1596 vdev_space_update(l2hdr->b_dev->l2ad_vdev,
1597 -l2hdr->b_asize, 0, 0);
1598 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
1599 if (hdr->b_state == arc_l2c_only)
1600 l2arc_hdr_stat_remove();
1601 hdr->b_l2hdr = NULL;
1602 }
1603
1604 if (!buflist_held)
1605 mutex_exit(&l2arc_buflist_mtx);
1606 }
1607
1608 if (!BUF_EMPTY(hdr)) {
1609 ASSERT(!HDR_IN_HASH_TABLE(hdr));
1610 buf_discard_identity(hdr);
1611 }
1612 while (hdr->b_buf) {
1613 arc_buf_t *buf = hdr->b_buf;
1614
1615 if (buf->b_efunc) {
1616 mutex_enter(&arc_eviction_mtx);
1617 mutex_enter(&buf->b_evict_lock);
1618 ASSERT(buf->b_hdr != NULL);
1619 arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1620 hdr->b_buf = buf->b_next;
1621 buf->b_hdr = &arc_eviction_hdr;
1622 buf->b_next = arc_eviction_list;
1623 arc_eviction_list = buf;
1624 mutex_exit(&buf->b_evict_lock);
1625 mutex_exit(&arc_eviction_mtx);
1626 } else {
1627 arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1628 }
1629 }
1630 if (hdr->b_freeze_cksum != NULL) {
1631 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1632 hdr->b_freeze_cksum = NULL;
1633 }
1634 if (hdr->b_thawed) {
1635 kmem_free(hdr->b_thawed, 1);
1636 hdr->b_thawed = NULL;
1637 }
1638
1639 ASSERT(!list_link_active(&hdr->b_arc_node));
1640 ASSERT3P(hdr->b_hash_next, ==, NULL);
1641 ASSERT3P(hdr->b_acb, ==, NULL);
1642 kmem_cache_free(hdr_cache, hdr);
1643 }
1644
1645 void
1646 arc_buf_free(arc_buf_t *buf, void *tag)
1647 {
1648 arc_buf_hdr_t *hdr = buf->b_hdr;
1649 int hashed = hdr->b_state != arc_anon;
1650
1651 ASSERT(buf->b_efunc == NULL);
1652 ASSERT(buf->b_data != NULL);
1653
1654 if (hashed) {
1655 kmutex_t *hash_lock = HDR_LOCK(hdr);
1656
1657 mutex_enter(hash_lock);
1658 hdr = buf->b_hdr;
1659 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1660
1661 (void) remove_reference(hdr, hash_lock, tag);
1662 if (hdr->b_datacnt > 1) {
1663 arc_buf_destroy(buf, FALSE, TRUE);
1664 } else {
1665 ASSERT(buf == hdr->b_buf);
1666 ASSERT(buf->b_efunc == NULL);
1667 hdr->b_flags |= ARC_BUF_AVAILABLE;
1668 }
1669 mutex_exit(hash_lock);
1670 } else if (HDR_IO_IN_PROGRESS(hdr)) {
1671 int destroy_hdr;
1672 /*
1673 * We are in the middle of an async write. Don't destroy
1674 * this buffer unless the write completes before we finish
1675 * decrementing the reference count.
1676 */
1677 mutex_enter(&arc_eviction_mtx);
1678 (void) remove_reference(hdr, NULL, tag);
1679 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1680 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
1681 mutex_exit(&arc_eviction_mtx);
1682 if (destroy_hdr)
1683 arc_hdr_destroy(hdr);
1684 } else {
1685 if (remove_reference(hdr, NULL, tag) > 0)
1686 arc_buf_destroy(buf, FALSE, TRUE);
1687 else
1688 arc_hdr_destroy(hdr);
1689 }
1690 }
1691
1692 boolean_t
1693 arc_buf_remove_ref(arc_buf_t *buf, void* tag)
1694 {
1695 arc_buf_hdr_t *hdr = buf->b_hdr;
1696 kmutex_t *hash_lock = HDR_LOCK(hdr);
1697 boolean_t no_callback = (buf->b_efunc == NULL);
1698
1699 if (hdr->b_state == arc_anon) {
1700 ASSERT(hdr->b_datacnt == 1);
1701 arc_buf_free(buf, tag);
1702 return (no_callback);
1703 }
1704
1705 mutex_enter(hash_lock);
1706 hdr = buf->b_hdr;
1707 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1708 ASSERT(hdr->b_state != arc_anon);
1709 ASSERT(buf->b_data != NULL);
1710
1711 (void) remove_reference(hdr, hash_lock, tag);
1712 if (hdr->b_datacnt > 1) {
1713 if (no_callback)
1714 arc_buf_destroy(buf, FALSE, TRUE);
1715 } else if (no_callback) {
1716 ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
1717 ASSERT(buf->b_efunc == NULL);
1718 hdr->b_flags |= ARC_BUF_AVAILABLE;
1719 }
1720 ASSERT(no_callback || hdr->b_datacnt > 1 ||
1721 refcount_is_zero(&hdr->b_refcnt));
1722 mutex_exit(hash_lock);
1723 return (no_callback);
1724 }
1725
1726 int
1727 arc_buf_size(arc_buf_t *buf)
1728 {
1729 return (buf->b_hdr->b_size);
1730 }
1731
1732 /*
1733 * Called from the DMU to determine if the current buffer should be
1734 * evicted. In order to ensure proper locking, the eviction must be initiated
1735 * from the DMU. Return true if the buffer is associated with user data and
1736 * duplicate buffers still exist.
1737 */
1738 boolean_t
1739 arc_buf_eviction_needed(arc_buf_t *buf)
1740 {
1741 arc_buf_hdr_t *hdr;
1742 boolean_t evict_needed = B_FALSE;
1743
1744 if (zfs_disable_dup_eviction)
1745 return (B_FALSE);
1746
1747 mutex_enter(&buf->b_evict_lock);
1748 hdr = buf->b_hdr;
1749 if (hdr == NULL) {
1750 /*
1751 * We are in arc_do_user_evicts(); let that function
1752 * perform the eviction.
1753 */
1754 ASSERT(buf->b_data == NULL);
1755 mutex_exit(&buf->b_evict_lock);
1756 return (B_FALSE);
1757 } else if (buf->b_data == NULL) {
1758 /*
1759 * We have already been added to the arc eviction list;
1760 * recommend eviction.
1761 */
1762 ASSERT3P(hdr, ==, &arc_eviction_hdr);
1763 mutex_exit(&buf->b_evict_lock);
1764 return (B_TRUE);
1765 }
1766
1767 if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
1768 evict_needed = B_TRUE;
1769
1770 mutex_exit(&buf->b_evict_lock);
1771 return (evict_needed);
1772 }
1773
1774 int zfs_fastflush = 1;
1775
1776 /*
1777 * Evict buffers from list until we've removed the specified number of
1778 * bytes. Move the removed buffers to the appropriate evict state.
1779 * If the recycle flag is set, then attempt to "recycle" a buffer:
1780 * - look for a buffer to evict that is `bytes' long.
1781 * - return the data block from this buffer rather than freeing it.
1782 * This flag is used by callers that are trying to make space for a
1783 * new buffer in a full arc cache.
1784 *
1785 * This function makes a "best effort". It skips over any buffers
1786 * it can't get a hash_lock on, and so may not catch all candidates.
1787 * It may also return without evicting as much space as requested.
1788 */
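/*
 * As an illustration (see the callers later in this file): arc_get_data_buf()
 * uses the recycle path to satisfy an allocation of exactly `size' bytes,
 *
 *	buf->b_data = arc_evict(state, NULL, size, TRUE, type);
 *
 * while arc_adjust() passes recycle == FALSE simply to shed `delta' bytes
 * from a list and discards the return value.
 */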
1789 static void *
1790 arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
1791 arc_buf_contents_t type)
1792 {
1793 arc_state_t *evicted_state;
1794 uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
1795 arc_buf_hdr_t *ab, *ab_prev = NULL;
1796 list_t *list = &state->arcs_list[type];
1797 kmutex_t *hash_lock;
1798 boolean_t have_lock;
1799 void *stolen = NULL;
1800 arc_buf_hdr_t marker = { 0 };
1801 int count = 0;
1802
1803 ASSERT(state == arc_mru || state == arc_mfu);
1804
1805 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
1806
1807 mutex_enter(&state->arcs_mtx);
1808 mutex_enter(&evicted_state->arcs_mtx);
1809
1810 for (ab = list_tail(list); ab; ab = ab_prev) {
1811 ab_prev = list_prev(list, ab);
1812 /* prefetch buffers have a minimum lifespan */
1813 if (HDR_IO_IN_PROGRESS(ab) ||
1814 (spa && ab->b_spa != spa) ||
1815 (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
1816 ddi_get_lbolt() - ab->b_arc_access <
1817 arc_min_prefetch_lifespan)) {
1818 skipped++;
1819 continue;
1820 }
1821 /* "lookahead" for better eviction candidate */
1822 if (recycle && ab->b_size != bytes &&
1823 ab_prev && ab_prev->b_size == bytes)
1824 continue;
1825
1826 /* ignore markers */
1827 if (ab->b_spa == 0)
1828 continue;
1829
1830 /*
1831 * It may take a long time to evict all the bufs requested.
1832 * To avoid blocking all arc activity, periodically drop
1833 * the arcs_mtx and give other threads a chance to run
1834 * before reacquiring the lock.
1835 *
1836 * If we are looking for a buffer to recycle, we are in
1837 * the hot code path, so don't sleep.
1838 */
1839 if (!recycle && count++ > arc_evict_iterations) {
1840 list_insert_after(list, ab, &marker);
1841 mutex_exit(&evicted_state->arcs_mtx);
1842 mutex_exit(&state->arcs_mtx);
1843 kpreempt(KPREEMPT_SYNC);
1844 mutex_enter(&state->arcs_mtx);
1845 mutex_enter(&evicted_state->arcs_mtx);
1846 ab_prev = list_prev(list, &marker);
1847 list_remove(list, &marker);
1848 count = 0;
1849 continue;
1850 }
1851
1852 hash_lock = HDR_LOCK(ab);
1853 have_lock = MUTEX_HELD(hash_lock);
1854 if (have_lock || mutex_tryenter(hash_lock)) {
1855 ASSERT0(refcount_count(&ab->b_refcnt));
1856 ASSERT(ab->b_datacnt > 0);
1857 while (ab->b_buf) {
1858 arc_buf_t *buf = ab->b_buf;
1859 if (!mutex_tryenter(&buf->b_evict_lock)) {
1860 missed += 1;
1861 break;
1862 }
1863 if (buf->b_data) {
1864 bytes_evicted += ab->b_size;
1865 if (recycle && ab->b_type == type &&
1866 ab->b_size == bytes &&
1867 !HDR_L2_WRITING(ab)) {
1868 stolen = buf->b_data;
1869 recycle = FALSE;
1870 }
1871 }
1872 if (buf->b_efunc) {
1873 mutex_enter(&arc_eviction_mtx);
1874 arc_buf_destroy(buf,
1875 buf->b_data == stolen, FALSE);
1876 ab->b_buf = buf->b_next;
1877 buf->b_hdr = &arc_eviction_hdr;
1878 buf->b_next = arc_eviction_list;
1879 arc_eviction_list = buf;
1880 mutex_exit(&arc_eviction_mtx);
1881 mutex_exit(&buf->b_evict_lock);
1882 } else {
1883 mutex_exit(&buf->b_evict_lock);
1884 arc_buf_destroy(buf,
1885 buf->b_data == stolen, TRUE);
1886 }
1887 }
1888
1889 if (ab->b_l2hdr) {
1890 ARCSTAT_INCR(arcstat_evict_l2_cached,
1891 ab->b_size);
1892 } else {
1893 if (l2arc_write_eligible(ab->b_spa, ab)) {
1894 ARCSTAT_INCR(arcstat_evict_l2_eligible,
1895 ab->b_size);
1896 } else {
1897 ARCSTAT_INCR(
1898 arcstat_evict_l2_ineligible,
1899 ab->b_size);
1900 }
1901 }
1902
1903 if (ab->b_datacnt == 0) {
1904 arc_change_state(evicted_state, ab, hash_lock);
1905 ASSERT(HDR_IN_HASH_TABLE(ab));
1906 ab->b_flags |= ARC_IN_HASH_TABLE;
1907 ab->b_flags &= ~ARC_BUF_AVAILABLE;
1908 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
1909 }
1910 if (!have_lock)
1911 mutex_exit(hash_lock);
1912 if (bytes >= 0 && bytes_evicted >= bytes)
1913 break;
1914 } else {
1915 missed += 1;
1916 }
1917 }
1918
1919 mutex_exit(&evicted_state->arcs_mtx);
1920 mutex_exit(&state->arcs_mtx);
1921
1922 if (bytes_evicted < bytes)
1923 dprintf("only evicted %lld bytes from %x",
1924 (longlong_t)bytes_evicted, state);
1925
1926 if (skipped)
1927 ARCSTAT_INCR(arcstat_evict_skip, skipped);
1928
1929 if (missed)
1930 ARCSTAT_INCR(arcstat_mutex_miss, missed);
1931
1932 /*
1933 * Note: we have just evicted some data into the ghost state,
1934 * potentially putting the ghost size over the desired size. Rather
1935  * than evicting from the ghost list in this hot code path, leave
1936 * this chore to the arc_reclaim_thread().
1937 */
1938
1939 return (stolen);
1940 }
1941
1942 /*
1943 * Remove buffers from list until we've removed the specified number of
1944 * bytes. Destroy the buffers that are removed.
1945 */
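/*
 * Headers on a ghost list carry no data (ab->b_buf == NULL below); the list
 * only records the identity and size of recently evicted blocks. "Removing"
 * a buffer here therefore just frees the header, or moves it to the
 * arc_l2c_only state if the block is still cached in the L2ARC.
 */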
1946 static void
1947 arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
1948 {
1949 arc_buf_hdr_t *ab, *ab_prev;
1950 arc_buf_hdr_t marker = { 0 };
1951 list_t *list = &state->arcs_list[ARC_BUFC_DATA];
1952 kmutex_t *hash_lock;
1953 uint64_t bytes_deleted = 0;
1954 uint64_t bufs_skipped = 0;
1955 int count = 0;
1956
1957 ASSERT(GHOST_STATE(state));
1958 top:
1959 mutex_enter(&state->arcs_mtx);
1960 for (ab = list_tail(list); ab; ab = ab_prev) {
1961 ab_prev = list_prev(list, ab);
1962 if (ab->b_type > ARC_BUFC_NUMTYPES)
1963 panic("invalid ab=%p", (void *)ab);
1964 if (spa && ab->b_spa != spa)
1965 continue;
1966
1967 /* ignore markers */
1968 if (ab->b_spa == 0)
1969 continue;
1970
1971 hash_lock = HDR_LOCK(ab);
1972 /* caller may be trying to modify this buffer, skip it */
1973 if (MUTEX_HELD(hash_lock))
1974 continue;
1975
1976 /*
1977 * It may take a long time to evict all the bufs requested.
1978 * To avoid blocking all arc activity, periodically drop
1979 * the arcs_mtx and give other threads a chance to run
1980 * before reacquiring the lock.
1981 */
1982 if (count++ > arc_evict_iterations) {
1983 list_insert_after(list, ab, &marker);
1984 mutex_exit(&state->arcs_mtx);
1985 kpreempt(KPREEMPT_SYNC);
1986 mutex_enter(&state->arcs_mtx);
1987 ab_prev = list_prev(list, &marker);
1988 list_remove(list, &marker);
1989 count = 0;
1990 continue;
1991 }
1992 if (mutex_tryenter(hash_lock)) {
1993 ASSERT(!HDR_IO_IN_PROGRESS(ab));
1994 ASSERT(ab->b_buf == NULL);
1995 ARCSTAT_BUMP(arcstat_deleted);
1996 bytes_deleted += ab->b_size;
1997
1998 if (ab->b_l2hdr != NULL) {
1999 /*
2000 * This buffer is cached on the 2nd Level ARC;
2001 * don't destroy the header.
2002 */
2003 arc_change_state(arc_l2c_only, ab, hash_lock);
2004 mutex_exit(hash_lock);
2005 } else {
2006 arc_change_state(arc_anon, ab, hash_lock);
2007 mutex_exit(hash_lock);
2008 arc_hdr_destroy(ab);
2009 }
2010
2011 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
2012 if (bytes >= 0 && bytes_deleted >= bytes)
2013 break;
2014 } else if (bytes < 0) {
2015 /*
2016 * Insert a list marker and then wait for the
2017  * hash lock to become available. Once it's
2018 * available, restart from where we left off.
2019 */
2020 list_insert_after(list, ab, &marker);
2021 mutex_exit(&state->arcs_mtx);
2022 mutex_enter(hash_lock);
2023 mutex_exit(hash_lock);
2024 mutex_enter(&state->arcs_mtx);
2025 ab_prev = list_prev(list, &marker);
2026 list_remove(list, &marker);
2027 } else {
2028 bufs_skipped += 1;
2029 }
2030
2031 }
2032 mutex_exit(&state->arcs_mtx);
2033
2034 if (list == &state->arcs_list[ARC_BUFC_DATA] &&
2035 (bytes < 0 || bytes_deleted < bytes)) {
2036 list = &state->arcs_list[ARC_BUFC_METADATA];
2037 goto top;
2038 }
2039
2040 if (bufs_skipped) {
2041 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
2042 ASSERT(bytes >= 0);
2043 }
2044
2045 if (bytes_deleted < bytes)
2046 dprintf("only deleted %lld bytes from %p",
2047 (longlong_t)bytes_deleted, state);
2048 }
2049
2050 static void
2051 arc_adjust(void)
2052 {
2053 int64_t adjustment, delta;
2054
2055 /*
2056 * Adjust MRU size
2057 */
2058
2059 adjustment = MIN((int64_t)(arc_size - arc_c),
2060 (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
2061 arc_p));
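/*
 * Illustrative example (arbitrary numbers): with arc_size = 10GB,
 * arc_c = 8GB, arc_p = 4GB and anon + MRU + meta = 5GB, the two terms
 * above are 2GB and 1GB, so we aim to evict 1GB from the MRU side
 * (data first, then metadata) before turning to the MFU side below.
 */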
2062
2063 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
2064 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
2065 (void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA);
2066 adjustment -= delta;
2067 }
2068
2069 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2070 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
2071 (void) arc_evict(arc_mru, NULL, delta, FALSE,
2072 ARC_BUFC_METADATA);
2073 }
2074
2075 /*
2076 * Adjust MFU size
2077 */
2078
2079 adjustment = arc_size - arc_c;
2080
2081 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
2082 delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
2083 (void) arc_evict(arc_mfu, NULL, delta, FALSE, ARC_BUFC_DATA);
2084 adjustment -= delta;
2085 }
2086
2087 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2088 int64_t delta = MIN(adjustment,
2089 arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
2090 (void) arc_evict(arc_mfu, NULL, delta, FALSE,
2091 ARC_BUFC_METADATA);
2092 }
2093
2094 /*
2095 * Adjust ghost lists
2096 */
2097
2098 adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
2099
2100 if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
2101 delta = MIN(arc_mru_ghost->arcs_size, adjustment);
2102 arc_evict_ghost(arc_mru_ghost, NULL, delta);
2103 }
2104
2105 adjustment =
2106 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2107
2108 if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
2109 delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
2110 arc_evict_ghost(arc_mfu_ghost, NULL, delta);
2111 }
2112 }
2113
2114 static void
2115 arc_do_user_evicts(void)
2116 {
2117 mutex_enter(&arc_eviction_mtx);
2118 while (arc_eviction_list != NULL) {
2119 arc_buf_t *buf = arc_eviction_list;
2120 arc_eviction_list = buf->b_next;
2121 mutex_enter(&buf->b_evict_lock);
2122 buf->b_hdr = NULL;
2123 mutex_exit(&buf->b_evict_lock);
2124 mutex_exit(&arc_eviction_mtx);
2125
2126 if (buf->b_efunc != NULL)
2127 VERIFY0(buf->b_efunc(buf->b_private));
2128
2129 buf->b_efunc = NULL;
2130 buf->b_private = NULL;
2131 kmem_cache_free(buf_cache, buf);
2132 mutex_enter(&arc_eviction_mtx);
2133 }
2134 mutex_exit(&arc_eviction_mtx);
2135 }
2136
2137 typedef struct arc_async_flush_data {
2138 uint64_t aaf_guid;
2139 } arc_async_flush_data_t;
2140
2141 static taskq_t *arc_flush_taskq;
2142
2143 static void
2144 _arc_flush(uint64_t guid)
2145 {
2146 while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
2147 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
2148 if (guid)
2149 break;
2150 }
2151 while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
2152 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
2153 if (guid)
2154 break;
2155 }
2156 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
2157 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
2158 if (guid)
2159 break;
2160 }
2161 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
2162 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
2163 if (guid)
2164 break;
2165 }
2166
2167 arc_evict_ghost(arc_mru_ghost, guid, -1);
2168 arc_evict_ghost(arc_mfu_ghost, guid, -1);
2169
2170 mutex_enter(&arc_reclaim_thr_lock);
2171 arc_do_user_evicts();
2172 mutex_exit(&arc_reclaim_thr_lock);
2173 }
2174
2175 static void
2176 arc_flush_task(void *arg)
2177 {
2178 arc_async_flush_data_t *aaf = (arc_async_flush_data_t *)arg;
2179 _arc_flush(aaf->aaf_guid);
2180 kmem_free(aaf, sizeof (arc_async_flush_data_t));
2181 }
2182
2183 /*
2184 * Flush all *evictable* data from the cache for the given spa.
2185 * NOTE: this will not touch "active" (i.e. referenced) data.
2186 */
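/*
 * Passing a NULL spa flushes every pool's evictable buffers and is always
 * synchronous; a non-NULL spa restricts the flush to buffers tagged with
 * that pool's load guid and, when zfs_fastflush is set, hands the work to
 * arc_flush_taskq instead of doing it in the caller's context.
 */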
2187 void
2188 arc_flush(spa_t *spa)
2189 {
2190 uint64_t guid = 0;
2191 boolean_t async_flush = (spa ? zfs_fastflush : FALSE);
2192 arc_async_flush_data_t *aaf = NULL;
2193
2194 if (spa) {
2195 guid = spa_load_guid(spa);
2196 if (async_flush) {
2197 aaf = kmem_alloc(sizeof (arc_async_flush_data_t),
2198 KM_SLEEP);
2199 aaf->aaf_guid = guid;
2200 }
2201 }
2202
2203 /*
2204  * Try to flush a pool's remaining ARC ghost buffers and the buffers on
2205  * arc_eviction_list asynchronously while the pool is being closed.
2206  * An ARC buffer is bound to a spa only by its guid, so a buffer can
2207  * exist even after the pool is gone. If asynchronous flushing fails
2208  * (or is disabled) we fall back to a regular (synchronous) flush.
2209  * NOTE: it is not a problem if asynchronous flushing has not finished
2210  * by the time the pool is imported again, even when the guids before
2211  * and after export/import are the same; we can evict only unreferenced
2212  * buffers, all others are skipped.
2213  */
2214 if (!async_flush || (taskq_dispatch(arc_flush_taskq, arc_flush_task,
2215 aaf, TQ_NOSLEEP) == NULL)) {
2216 _arc_flush(guid);
2217 ASSERT(spa || arc_eviction_list == NULL);
2218 if (async_flush)
2219 kmem_free(aaf, sizeof (arc_async_flush_data_t));
2220 }
2221 }
2222
2223 void
2224 arc_shrink(void)
2225 {
2226 if (arc_c > arc_c_min) {
2227 uint64_t to_free;
2228
2229 #ifdef _KERNEL
2230 to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree));
2231 #else
2232 to_free = arc_c >> arc_shrink_shift;
2233 #endif
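/*
 * As an illustration, an arc_shrink_shift of 5 frees roughly 1/32 of arc_c
 * per call (or enough to cover `needfree' pages in the kernel case,
 * whichever is larger); arc_p is scaled down below by the same shift.
 */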
2234 if (arc_c > arc_c_min + to_free)
2235 atomic_add_64(&arc_c, -to_free);
2236 else
2237 arc_c = arc_c_min;
2238
2239 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
2240 if (arc_c > arc_size)
2241 arc_c = MAX(arc_size, arc_c_min);
2242 if (arc_p > arc_c)
2243 arc_p = (arc_c >> 1);
2244 ASSERT(arc_c >= arc_c_min);
2245 ASSERT((int64_t)arc_p >= 0);
2246 }
2247
2248 if (arc_size > arc_c)
2249 arc_adjust();
2250 }
2251
2252 /*
2253 * Determine if the system is under memory pressure and is asking
2254 * to reclaim memory. A return value of 1 indicates that the system
2255 * is under memory pressure and that the arc should adjust accordingly.
2256 */
2257 static int
2258 arc_reclaim_needed(void)
2259 {
2260 uint64_t extra;
2261
2262 #ifdef _KERNEL
2263
2264 if (needfree)
2265 return (1);
2266
2267 /*
2268 * take 'desfree' extra pages, so we reclaim sooner, rather than later
2269 */
2270 extra = desfree;
2271
2272 /*
2273 * check that we're out of range of the pageout scanner. It starts to
2274  * schedule paging if freemem drops below lotsfree plus needfree.
2275 * lotsfree is the high-water mark for pageout, and needfree is the
2276 * number of needed free pages. We add extra pages here to make sure
2277 * the scanner doesn't start up while we're freeing memory.
2278 */
2279 if (freemem < lotsfree + needfree + extra)
2280 return (1);
2281
2282 /*
2283 * check to make sure that swapfs has enough space so that anon
2284  * reservations can still succeed. anon_resvmem() checks that
2285  * availrmem remains greater than swapfs_minfree plus the number of
2286  * reserved swap pages. We also add a bit of extra here just to prevent
2287 * circumstances from getting really dire.
2288 */
2289 if (availrmem < swapfs_minfree + swapfs_reserve + extra)
2290 return (1);
2291
2292 /*
2293 * Check that we have enough availrmem that memory locking (e.g., via
2294 * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum
2295 * stores the number of pages that cannot be locked; when availrmem
2296 * drops below pages_pp_maximum, page locking mechanisms such as
2297 * page_pp_lock() will fail.)
2298 */
2299 if (availrmem <= pages_pp_maximum)
2300 return (1);
2301
2302 #if defined(__i386)
2303 /*
2304 * If we're on an i386 platform, it's possible that we'll exhaust the
2305 * kernel heap space before we ever run out of available physical
2306 * memory. Most checks of the size of the heap_area compare against
2307 * tune.t_minarmem, which is the minimum available real memory that we
2308 * can have in the system. However, this is generally fixed at 25 pages
2309  * which is so low that it's effectively useless. In this comparison we
2310  * instead look at the total size of the heap and reclaim if more than
2311  * 3/4 of the heap is allocated (or, equivalently, if less than 1/4 of
2312  * the heap is free).
2313 */
2314 if (vmem_size(heap_arena, VMEM_FREE) <
2315 (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2))
2316 return (1);
2317 #endif
2318
2319 /*
2320 * If zio data pages are being allocated out of a separate heap segment,
2321 * then enforce that the size of available vmem for this arena remains
2322 * above about 1/16th free.
2323 *
2324 * Note: The 1/16th arena free requirement was put in place
2325 * to aggressively evict memory from the arc in order to avoid
2326 * memory fragmentation issues.
2327 */
2328 if (zio_arena != NULL &&
2329 vmem_size(zio_arena, VMEM_FREE) <
2330 (vmem_size(zio_arena, VMEM_ALLOC) >> 4))
2331 return (1);
2332 #else
2333 if (spa_get_random(100) == 0)
2334 return (1);
2335 #endif
2336 return (0);
2337 }
2338
2339 static void
2340 arc_kmem_reap_now(arc_reclaim_strategy_t strat)
2341 {
2342 size_t i;
2343 kmem_cache_t *prev_cache = NULL;
2344 kmem_cache_t *prev_data_cache = NULL;
2345 extern kmem_cache_t *zio_buf_cache[];
2346 extern kmem_cache_t *zio_data_buf_cache[];
2347 extern kmem_cache_t *range_seg_cache;
2348
2349 #ifdef _KERNEL
2350 if (arc_meta_used >= arc_meta_limit) {
2351 /*
2352 * We are exceeding our meta-data cache limit.
2353 * Purge some DNLC entries to release holds on meta-data.
2354 */
2355 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
2356 }
2357 #if defined(__i386)
2358 /*
2359 * Reclaim unused memory from all kmem caches.
2360 */
2361 kmem_reap();
2362 #endif
2363 #endif
2364
2365 /*
2366 * An aggressive reclamation will shrink the cache size as well as
2367 * reap free buffers from the arc kmem caches.
2368 */
2369 if (strat == ARC_RECLAIM_AGGR)
2370 arc_shrink();
2371
2372 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
2373 if (zio_buf_cache[i] != prev_cache) {
2374 prev_cache = zio_buf_cache[i];
2375 kmem_cache_reap_now(zio_buf_cache[i]);
2376 }
2377 if (zio_data_buf_cache[i] != prev_data_cache) {
2378 prev_data_cache = zio_data_buf_cache[i];
2379 kmem_cache_reap_now(zio_data_buf_cache[i]);
2380 }
2381 }
2382 kmem_cache_reap_now(buf_cache);
2383 kmem_cache_reap_now(hdr_cache);
2384 kmem_cache_reap_now(range_seg_cache);
2385
2386 /*
2387  * Ask the vmem arena to reclaim unused memory from its
2388 * quantum caches.
2389 */
2390 if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR)
2391 vmem_qcache_reap(zio_arena);
2392 }
2393
2394 static void
2395 arc_reclaim_thread(void)
2396 {
2397 clock_t growtime = 0;
2398 arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS;
2399 callb_cpr_t cpr;
2400
2401 CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2402
2403 mutex_enter(&arc_reclaim_thr_lock);
2404 while (arc_thread_exit == 0) {
2405 if (arc_reclaim_needed()) {
2406
2407 if (arc_no_grow) {
2408 if (last_reclaim == ARC_RECLAIM_CONS) {
2409 last_reclaim = ARC_RECLAIM_AGGR;
2410 } else {
2411 last_reclaim = ARC_RECLAIM_CONS;
2412 }
2413 } else {
2414 arc_no_grow = TRUE;
2415 last_reclaim = ARC_RECLAIM_AGGR;
2416 membar_producer();
2417 }
2418
2419 /* reset the growth delay for every reclaim */
2420 growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
2421
2422 arc_kmem_reap_now(last_reclaim);
2423 arc_warm = B_TRUE;
2424
2425 } else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
2426 arc_no_grow = FALSE;
2427 }
2428
2429 arc_adjust();
2430
2431 if (arc_eviction_list != NULL)
2432 arc_do_user_evicts();
2433
2434 /* block until needed, or one second, whichever is shorter */
2435 CALLB_CPR_SAFE_BEGIN(&cpr);
2436 (void) cv_timedwait(&arc_reclaim_thr_cv,
2437 &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
2438 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
2439 }
2440
2441 arc_thread_exit = 0;
2442 cv_broadcast(&arc_reclaim_thr_cv);
2443 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */
2444 thread_exit();
2445 }
2446
2447 /*
2448 * Adapt arc info given the number of bytes we are trying to add and
2449  * the state that we are coming from. This function is only called
2450 * when we are adding new content to the cache.
2451 */
2452 static void
2453 arc_adapt(int bytes, arc_state_t *state)
2454 {
2455 int mult;
2456 uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
2457
2458 if (state == arc_l2c_only)
2459 return;
2460
2461 ASSERT(bytes > 0);
2462 /*
2463 * Adapt the target size of the MRU list:
2464 * - if we just hit in the MRU ghost list, then increase
2465 * the target size of the MRU list.
2466 * - if we just hit in the MFU ghost list, then increase
2467 * the target size of the MFU list by decreasing the
2468 * target size of the MRU list.
2469 */
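/*
 * Worked example (illustrative): on an MRU ghost hit where the MFU ghost
 * list is three times the size of the MRU ghost list, mult is 3, so arc_p
 * grows by 3 * bytes (clamped to arc_c - arc_p_min). A hit in the smaller
 * ghost list therefore moves the target more aggressively.
 */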
2470 if (state == arc_mru_ghost) {
2471 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
2472 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
2473 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
2474
2475 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
2476 } else if (state == arc_mfu_ghost) {
2477 uint64_t delta;
2478
2479 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2480 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
2481 mult = MIN(mult, 10);
2482
2483 delta = MIN(bytes * mult, arc_p);
2484 arc_p = MAX(arc_p_min, arc_p - delta);
2485 }
2486 ASSERT((int64_t)arc_p >= 0);
2487
2488 if (arc_reclaim_needed()) {
2489 cv_signal(&arc_reclaim_thr_cv);
2490 return;
2491 }
2492
2493 if (arc_no_grow)
2494 return;
2495
2496 if (arc_c >= arc_c_max)
2497 return;
2498
2499 /*
2500 * If we're within (2 * maxblocksize) bytes of the target
2501 * cache size, increment the target cache size
2502  * cache size, increment the target cache size.
2503 if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2504 atomic_add_64(&arc_c, (int64_t)bytes);
2505 if (arc_c > arc_c_max)
2506 arc_c = arc_c_max;
2507 else if (state == arc_anon)
2508 atomic_add_64(&arc_p, (int64_t)bytes);
2509 if (arc_p > arc_c)
2510 arc_p = arc_c;
2511 }
2512 ASSERT((int64_t)arc_p >= 0);
2513 }
2514
2515 /*
2516 * Check if the cache has reached its limits and eviction is required
2517 * prior to insert.
2518 */
2519 static int
2520 arc_evict_needed(arc_buf_contents_t type)
2521 {
2522 if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2523 return (1);
2524
2525 if (arc_reclaim_needed())
2526 return (1);
2527
2528 return (arc_size > arc_c);
2529 }
2530
2531 /*
2532 * The buffer, supplied as the first argument, needs a data block.
2533 * So, if we are at cache max, determine which cache should be victimized.
2534 * We have the following cases:
2535 *
2536 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
2537 * In this situation if we're out of space, but the resident size of the MFU is
2538 * under the limit, victimize the MFU cache to satisfy this insertion request.
2539 *
2540 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
2541 * Here, we've used up all of the available space for the MRU, so we need to
2542 * evict from our own cache instead. Evict from the set of resident MRU
2543 * entries.
2544 *
2545 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
2546 * c minus p represents the MFU space in the cache, since p is the size of the
2547 * cache that is dedicated to the MRU. In this situation there's still space on
2548 * the MFU side, so the MRU side needs to be victimized.
2549 *
2550 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
2551 * MFU's resident set is consuming more space than it has been allotted. In
2552 * this situation, we must victimize our own cache, the MFU, for this insertion.
2553 */
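/*
 * For instance (case 3 above, with arbitrary numbers): if arc_c = 8GB and
 * arc_p = 3GB, the MFU side is entitled to 5GB; should an MFU insert arrive
 * while arc_mfu->arcs_size is only 4GB, we victimize the MRU list (provided
 * it has enough evictable bytes of the right type) rather than our own.
 */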
2554 static void
2555 arc_get_data_buf(arc_buf_t *buf)
2556 {
2557 arc_state_t *state = buf->b_hdr->b_state;
2558 uint64_t size = buf->b_hdr->b_size;
2559 arc_buf_contents_t type = buf->b_hdr->b_type;
2560
2561 arc_adapt(size, state);
2562
2563 /*
2564 * We have not yet reached cache maximum size,
2565 * just allocate a new buffer.
2566 */
2567 if (!arc_evict_needed(type)) {
2568 if (type == ARC_BUFC_METADATA) {
2569 buf->b_data = zio_buf_alloc(size);
2570 arc_space_consume(size, ARC_SPACE_DATA);
2571 } else {
2572 ASSERT(type == ARC_BUFC_DATA);
2573 buf->b_data = zio_data_buf_alloc(size);
2574 ARCSTAT_INCR(arcstat_data_size, size);
2575 atomic_add_64(&arc_size, size);
2576 }
2577 goto out;
2578 }
2579
2580 /*
2581 * If we are prefetching from the mfu ghost list, this buffer
2582 * will end up on the mru list; so steal space from there.
2583 */
2584 if (state == arc_mfu_ghost)
2585 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2586 else if (state == arc_mru_ghost)
2587 state = arc_mru;
2588
2589 if (state == arc_mru || state == arc_anon) {
2590 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2591 state = (arc_mfu->arcs_lsize[type] >= size &&
2592 arc_p > mru_used) ? arc_mfu : arc_mru;
2593 } else {
2594 /* MFU cases */
2595 uint64_t mfu_space = arc_c - arc_p;
2596 state = (arc_mru->arcs_lsize[type] >= size &&
2597 mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2598 }
2599 if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) {
2600 if (type == ARC_BUFC_METADATA) {
2601 buf->b_data = zio_buf_alloc(size);
2602 arc_space_consume(size, ARC_SPACE_DATA);
2603 } else {
2604 ASSERT(type == ARC_BUFC_DATA);
2605 buf->b_data = zio_data_buf_alloc(size);
2606 ARCSTAT_INCR(arcstat_data_size, size);
2607 atomic_add_64(&arc_size, size);
2608 }
2609 ARCSTAT_BUMP(arcstat_recycle_miss);
2610 }
2611 ASSERT(buf->b_data != NULL);
2612 out:
2613 /*
2614 * Update the state size. Note that ghost states have a
2615 * "ghost size" and so don't need to be updated.
2616 */
2617 if (!GHOST_STATE(buf->b_hdr->b_state)) {
2618 arc_buf_hdr_t *hdr = buf->b_hdr;
2619
2620 atomic_add_64(&hdr->b_state->arcs_size, size);
2621 if (list_link_active(&hdr->b_arc_node)) {
2622 ASSERT(refcount_is_zero(&hdr->b_refcnt));
2623 atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2624 }
2625 /*
2626 * If we are growing the cache, and we are adding anonymous
2627 * data, and we have outgrown arc_p, update arc_p
2628 */
2629 if (arc_size < arc_c && hdr->b_state == arc_anon &&
2630 arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
2631 arc_p = MIN(arc_c, arc_p + size);
2632 }
2633 }
2634
2635 /*
2636 * This routine is called whenever a buffer is accessed.
2637 * NOTE: the hash lock is dropped in this function.
2638 */
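/*
 * In summary, the transitions driven here are: anon -> MRU on first insert;
 * MRU -> MFU once a cached buffer is accessed again more than ARC_MINTIME
 * after it was instantiated; MRU-ghost and MFU-ghost hits pull the block
 * back in as MFU (or MRU if it was a prefetch); and an l2c_only header
 * returns as MFU. Prefetched MRU buffers just have their access time
 * refreshed.
 */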
2639 static void
2640 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
2641 {
2642 clock_t now;
2643
2644 ASSERT(MUTEX_HELD(hash_lock));
2645
2646 if (buf->b_state == arc_anon) {
2647 /*
2648 * This buffer is not in the cache, and does not
2649 * appear in our "ghost" list. Add the new buffer
2650 * to the MRU state.
2651 */
2652
2653 ASSERT(buf->b_arc_access == 0);
2654 buf->b_arc_access = ddi_get_lbolt();
2655 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2656 arc_change_state(arc_mru, buf, hash_lock);
2657
2658 } else if (buf->b_state == arc_mru) {
2659 now = ddi_get_lbolt();
2660
2661 /*
2662 * If this buffer is here because of a prefetch, then either:
2663 * - clear the flag if this is a "referencing" read
2664 * (any subsequent access will bump this into the MFU state).
2665 * or
2666 * - move the buffer to the head of the list if this is
2667 * another prefetch (to make it less likely to be evicted).
2668 */
2669 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2670 if (refcount_count(&buf->b_refcnt) == 0) {
2671 ASSERT(list_link_active(&buf->b_arc_node));
2672 } else {
2673 buf->b_flags &= ~ARC_PREFETCH;
2674 ARCSTAT_BUMP(arcstat_mru_hits);
2675 }
2676 buf->b_arc_access = now;
2677 return;
2678 }
2679
2680 /*
2681 * This buffer has been "accessed" only once so far,
2682 * but it is still in the cache. Move it to the MFU
2683 * state.
2684 */
2685 if (now > buf->b_arc_access + ARC_MINTIME) {
2686 /*
2687 * More than 125ms have passed since we
2688 * instantiated this buffer. Move it to the
2689 * most frequently used state.
2690 */
2691 buf->b_arc_access = now;
2692 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2693 arc_change_state(arc_mfu, buf, hash_lock);
2694 }
2695 ARCSTAT_BUMP(arcstat_mru_hits);
2696 } else if (buf->b_state == arc_mru_ghost) {
2697 arc_state_t *new_state;
2698 /*
2699 * This buffer has been "accessed" recently, but
2700 * was evicted from the cache. Move it to the
2701 * MFU state.
2702 */
2703
2704 if (buf->b_flags & ARC_PREFETCH) {
2705 new_state = arc_mru;
2706 if (refcount_count(&buf->b_refcnt) > 0)
2707 buf->b_flags &= ~ARC_PREFETCH;
2708 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2709 } else {
2710 new_state = arc_mfu;
2711 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2712 }
2713
2714 buf->b_arc_access = ddi_get_lbolt();
2715 arc_change_state(new_state, buf, hash_lock);
2716
2717 ARCSTAT_BUMP(arcstat_mru_ghost_hits);
2718 } else if (buf->b_state == arc_mfu) {
2719 /*
2720 * This buffer has been accessed more than once and is
2721 * still in the cache. Keep it in the MFU state.
2722 *
2723 * NOTE: an add_reference() that occurred when we did
2724 * the arc_read() will have kicked this off the list.
2725 * If it was a prefetch, we will explicitly move it to
2726 * the head of the list now.
2727 */
2728 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2729 ASSERT(refcount_count(&buf->b_refcnt) == 0);
2730 ASSERT(list_link_active(&buf->b_arc_node));
2731 }
2732 ARCSTAT_BUMP(arcstat_mfu_hits);
2733 buf->b_arc_access = ddi_get_lbolt();
2734 } else if (buf->b_state == arc_mfu_ghost) {
2735 arc_state_t *new_state = arc_mfu;
2736 /*
2737 * This buffer has been accessed more than once but has
2738 * been evicted from the cache. Move it back to the
2739 * MFU state.
2740 */
2741
2742 if (buf->b_flags & ARC_PREFETCH) {
2743 /*
2744 * This is a prefetch access...
2745 * move this block back to the MRU state.
2746 */
2747 ASSERT0(refcount_count(&buf->b_refcnt));
2748 new_state = arc_mru;
2749 }
2750
2751 buf->b_arc_access = ddi_get_lbolt();
2752 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2753 arc_change_state(new_state, buf, hash_lock);
2754
2755 ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
2756 } else if (buf->b_state == arc_l2c_only) {
2757 /*
2758 * This buffer is on the 2nd Level ARC.
2759 */
2760
2761 buf->b_arc_access = ddi_get_lbolt();
2762 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2763 arc_change_state(arc_mfu, buf, hash_lock);
2764 } else {
2765 ASSERT(!"invalid arc state");
2766 }
2767 }
2768
2769 /* a generic arc_done_func_t which you can use */
2770 /* ARGSUSED */
2771 void
2772 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
2773 {
2774 if (zio == NULL || zio->io_error == 0)
2775 bcopy(buf->b_data, arg, buf->b_hdr->b_size);
2776 VERIFY(arc_buf_remove_ref(buf, arg));
2777 }
2778
2779 /* a generic arc_done_func_t */
2780 void
2781 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
2782 {
2783 arc_buf_t **bufp = arg;
2784 if (zio && zio->io_error) {
2785 VERIFY(arc_buf_remove_ref(buf, arg));
2786 *bufp = NULL;
2787 } else {
2788 *bufp = buf;
2789 ASSERT(buf->b_data);
2790 }
2791 }
2792
2793 static void
2794 arc_read_done(zio_t *zio)
2795 {
2796 arc_buf_hdr_t *hdr;
2797 arc_buf_t *buf;
2798 arc_buf_t *abuf; /* buffer we're assigning to callback */
2799 kmutex_t *hash_lock = NULL;
2800 arc_callback_t *callback_list, *acb;
2801 int freeable = FALSE;
2802
2803 buf = zio->io_private;
2804 hdr = buf->b_hdr;
2805
2806 /*
2807 * The hdr was inserted into hash-table and removed from lists
2808 * prior to starting I/O. We should find this header, since
2809 * it's in the hash table, and it should be legit since it's
2810 * not possible to evict it during the I/O. The only possible
2811 * reason for it not to be found is if we were freed during the
2812 * read.
2813 */
2814 if (HDR_IN_HASH_TABLE(hdr)) {
2815 ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
2816 ASSERT3U(hdr->b_dva.dva_word[0], ==,
2817 BP_IDENTITY(zio->io_bp)->dva_word[0]);
2818 ASSERT3U(hdr->b_dva.dva_word[1], ==,
2819 BP_IDENTITY(zio->io_bp)->dva_word[1]);
2820
2821 arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp,
2822 &hash_lock);
2823
2824 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) &&
2825 hash_lock == NULL) ||
2826 (found == hdr &&
2827 DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
2828 (found == hdr && HDR_L2_READING(hdr)));
2829 }
2830
2831 hdr->b_flags &= ~ARC_L2_EVICTED;
2832 if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
2833 hdr->b_flags &= ~ARC_L2CACHE;
2834
2835 /* byteswap if necessary */
2836 callback_list = hdr->b_acb;
2837 ASSERT(callback_list != NULL);
2838 if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
2839 dmu_object_byteswap_t bswap =
2840 DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
2841 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
2842 byteswap_uint64_array :
2843 dmu_ot_byteswap[bswap].ob_func;
2844 func(buf->b_data, hdr->b_size);
2845 }
2846
2847 arc_cksum_compute(buf, B_FALSE);
2848 arc_buf_watch(buf);
2849
2850 if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
2851 /*
2852 * Only call arc_access on anonymous buffers. This is because
2853 * if we've issued an I/O for an evicted buffer, we've already
2854 * called arc_access (to prevent any simultaneous readers from
2855 * getting confused).
2856 */
2857 arc_access(hdr, hash_lock);
2858 }
2859
2860 /* create copies of the data buffer for the callers */
2861 abuf = buf;
2862 for (acb = callback_list; acb; acb = acb->acb_next) {
2863 if (acb->acb_done) {
2864 if (abuf == NULL) {
2865 ARCSTAT_BUMP(arcstat_duplicate_reads);
2866 abuf = arc_buf_clone(buf);
2867 }
2868 acb->acb_buf = abuf;
2869 abuf = NULL;
2870 }
2871 }
2872 hdr->b_acb = NULL;
2873 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
2874 ASSERT(!HDR_BUF_AVAILABLE(hdr));
2875 if (abuf == buf) {
2876 ASSERT(buf->b_efunc == NULL);
2877 ASSERT(hdr->b_datacnt == 1);
2878 hdr->b_flags |= ARC_BUF_AVAILABLE;
2879 }
2880
2881 ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
2882
2883 if (zio->io_error != 0) {
2884 hdr->b_flags |= ARC_IO_ERROR;
2885 if (hdr->b_state != arc_anon)
2886 arc_change_state(arc_anon, hdr, hash_lock);
2887 if (HDR_IN_HASH_TABLE(hdr))
2888 buf_hash_remove(hdr);
2889 freeable = refcount_is_zero(&hdr->b_refcnt);
2890 }
2891
2892 /*
2893 * Broadcast before we drop the hash_lock to avoid the possibility
2894 * that the hdr (and hence the cv) might be freed before we get to
2895 * the cv_broadcast().
2896 */
2897 cv_broadcast(&hdr->b_cv);
2898
2899 if (hash_lock) {
2900 mutex_exit(hash_lock);
2901 } else {
2902 /*
2903 * This block was freed while we waited for the read to
2904 * complete. It has been removed from the hash table and
2905 * moved to the anonymous state (so that it won't show up
2906 * in the cache).
2907 */
2908 ASSERT3P(hdr->b_state, ==, arc_anon);
2909 freeable = refcount_is_zero(&hdr->b_refcnt);
2910 }
2911
2912 /* execute each callback and free its structure */
2913 while ((acb = callback_list) != NULL) {
2914 if (acb->acb_done)
2915 acb->acb_done(zio, acb->acb_buf, acb->acb_private);
2916
2917 if (acb->acb_zio_dummy != NULL) {
2918 acb->acb_zio_dummy->io_error = zio->io_error;
2919 zio_nowait(acb->acb_zio_dummy);
2920 }
2921
2922 callback_list = acb->acb_next;
2923 kmem_free(acb, sizeof (arc_callback_t));
2924 }
2925
2926 if (freeable)
2927 arc_hdr_destroy(hdr);
2928 }
2929
2930 /*
2931 * "Read" the block at the specified DVA (in bp) via the
2932 * cache. If the block is found in the cache, invoke the provided
2933 * callback immediately and return. Note that the `zio' parameter
2934 * in the callback will be NULL in this case, since no IO was
2935 * required. If the block is not in the cache pass the read request
2936 * on to the spa with a substitute callback function, so that the
2937 * requested block will be added to the cache.
2938 *
2939 * If a read request arrives for a block that has a read in-progress,
2940 * either wait for the in-progress read to complete (and return the
2941 * results); or, if this is a read with a "done" func, add a record
2942 * to the read to invoke the "done" func when the read completes,
2943 * and return; or just return.
2944 *
2945 * arc_read_done() will invoke all the requested "done" functions
2946 * for readers of this block.
2947 */
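/*
 * A minimal (hypothetical) synchronous caller might look like this, using
 * the generic arc_getbuf_func() callback defined above:
 *
 *	arc_buf_t *abuf = NULL;
 *	uint32_t aflags = ARC_WAIT;
 *
 *	error = arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
 *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
 *
 * On success abuf holds a referenced buffer which the caller eventually
 * releases with arc_buf_remove_ref(abuf, &abuf).
 */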
2948 int
2949 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
2950 void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags,
2951 const zbookmark_phys_t *zb)
2952 {
2953 arc_buf_hdr_t *hdr = NULL;
2954 arc_buf_t *buf = NULL;
2955 kmutex_t *hash_lock = NULL;
2956 zio_t *rzio;
2957 uint64_t guid = spa_load_guid(spa);
2958
2959 ASSERT(!BP_IS_EMBEDDED(bp) ||
2960 BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
2961
2962 top:
2963 if (!BP_IS_EMBEDDED(bp)) {
2964 /*
2965  * Embedded BPs have no DVA and require no I/O to "read"; they are backed
2966  * by an anonymous arc buf below, so only non-embedded blocks are looked up.
2967  */
2968 hdr = buf_hash_find(guid, bp, &hash_lock);
2969 }
2970
2971 if (hdr != NULL && hdr->b_datacnt > 0) {
2972
2973 *arc_flags |= ARC_CACHED;
2974
2975 if (HDR_IO_IN_PROGRESS(hdr)) {
2976
2977 if (*arc_flags & ARC_WAIT) {
2978 cv_wait(&hdr->b_cv, hash_lock);
2979 mutex_exit(hash_lock);
2980 goto top;
2981 }
2982 ASSERT(*arc_flags & ARC_NOWAIT);
2983
2984 if (done) {
2985 arc_callback_t *acb = NULL;
2986
2987 acb = kmem_zalloc(sizeof (arc_callback_t),
2988 KM_SLEEP);
2989 acb->acb_done = done;
2990 acb->acb_private = private;
2991 if (pio != NULL)
2992 acb->acb_zio_dummy = zio_null(pio,
2993 spa, NULL, NULL, NULL, zio_flags);
2994
2995 ASSERT(acb->acb_done != NULL);
2996 acb->acb_next = hdr->b_acb;
2997 hdr->b_acb = acb;
2998 add_reference(hdr, hash_lock, private);
2999 mutex_exit(hash_lock);
3000 return (0);
3001 }
3002 mutex_exit(hash_lock);
3003 return (0);
3004 }
3005
3006 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3007
3008 if (done) {
3009 add_reference(hdr, hash_lock, private);
3010 /*
3011 * If this block is already in use, create a new
3012 * copy of the data so that we will be guaranteed
3013 * that arc_release() will always succeed.
3014 */
3015 buf = hdr->b_buf;
3016 ASSERT(buf);
3017 ASSERT(buf->b_data);
3018 if (HDR_BUF_AVAILABLE(hdr)) {
3019 ASSERT(buf->b_efunc == NULL);
3020 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3021 } else {
3022 buf = arc_buf_clone(buf);
3023 }
3024
3025 } else if (*arc_flags & ARC_PREFETCH &&
3026 refcount_count(&hdr->b_refcnt) == 0) {
3027 hdr->b_flags |= ARC_PREFETCH;
3028 }
3029 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
3030 arc_access(hdr, hash_lock);
3031 if (*arc_flags & ARC_L2CACHE)
3032 hdr->b_flags |= ARC_L2CACHE;
3033 if (*arc_flags & ARC_L2COMPRESS)
3034 hdr->b_flags |= ARC_L2COMPRESS;
3035 mutex_exit(hash_lock);
3036 ARCSTAT_BUMP(arcstat_hits);
3037 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3038 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3039 data, metadata, hits);
3040
3041 if (done)
3042 done(NULL, buf, private);
3043 } else {
3044 uint64_t size = BP_GET_LSIZE(bp);
3045 arc_callback_t *acb;
3046 vdev_t *vd = NULL;
3047 uint64_t addr = 0;
3048 boolean_t devw = B_FALSE;
3049 enum zio_compress b_compress = ZIO_COMPRESS_OFF;
3050 uint64_t b_asize = 0;
3051
3052 if (hdr == NULL) {
3053 /* this block is not in the cache */
3054 arc_buf_hdr_t *exists = NULL;
3055 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
3056 buf = arc_buf_alloc(spa, size, private, type);
3057 hdr = buf->b_hdr;
3058 if (!BP_IS_EMBEDDED(bp)) {
3059 hdr->b_dva = *BP_IDENTITY(bp);
3060 hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
3061 hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
3062 exists = buf_hash_insert(hdr, &hash_lock);
3063 }
3064 if (exists != NULL) {
3065 /* somebody beat us to the hash insert */
3066 mutex_exit(hash_lock);
3067 buf_discard_identity(hdr);
3068 (void) arc_buf_remove_ref(buf, private);
3069 goto top; /* restart the IO request */
3070 }
3071 /* if this is a prefetch, we don't have a reference */
3072 if (*arc_flags & ARC_PREFETCH) {
3073 (void) remove_reference(hdr, hash_lock,
3074 private);
3075 hdr->b_flags |= ARC_PREFETCH;
3076 }
3077 if (*arc_flags & ARC_L2CACHE)
3078 hdr->b_flags |= ARC_L2CACHE;
3079 if (*arc_flags & ARC_L2COMPRESS)
3080 hdr->b_flags |= ARC_L2COMPRESS;
3081 if (BP_GET_LEVEL(bp) > 0)
3082 hdr->b_flags |= ARC_INDIRECT;
3083 } else {
3084 /* this block is in the ghost cache */
3085 ASSERT(GHOST_STATE(hdr->b_state));
3086 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3087 ASSERT0(refcount_count(&hdr->b_refcnt));
3088 ASSERT(hdr->b_buf == NULL);
3089
3090 /* if this is a prefetch, we don't have a reference */
3091 if (*arc_flags & ARC_PREFETCH)
3092 hdr->b_flags |= ARC_PREFETCH;
3093 else
3094 add_reference(hdr, hash_lock, private);
3095 if (*arc_flags & ARC_L2CACHE)
3096 hdr->b_flags |= ARC_L2CACHE;
3097 if (*arc_flags & ARC_L2COMPRESS)
3098 hdr->b_flags |= ARC_L2COMPRESS;
3099 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
3100 buf->b_hdr = hdr;
3101 buf->b_data = NULL;
3102 buf->b_efunc = NULL;
3103 buf->b_private = NULL;
3104 buf->b_next = NULL;
3105 hdr->b_buf = buf;
3106 ASSERT(hdr->b_datacnt == 0);
3107 hdr->b_datacnt = 1;
3108 arc_get_data_buf(buf);
3109 arc_access(hdr, hash_lock);
3110 }
3111
3112 ASSERT(!GHOST_STATE(hdr->b_state));
3113
3114 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
3115 acb->acb_done = done;
3116 acb->acb_private = private;
3117
3118 ASSERT(hdr->b_acb == NULL);
3119 hdr->b_acb = acb;
3120 hdr->b_flags |= ARC_IO_IN_PROGRESS;
3121
3122 if (hdr->b_l2hdr != NULL &&
3123 (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
3124 devw = hdr->b_l2hdr->b_dev->l2ad_writing;
3125 addr = hdr->b_l2hdr->b_daddr;
3126 b_compress = hdr->b_l2hdr->b_compress;
3127 b_asize = hdr->b_l2hdr->b_asize;
3128 /*
3129 * Lock out device removal.
3130 */
3131 if (vdev_is_dead(vd) ||
3132 !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
3133 vd = NULL;
3134 }
3135
3136 if (hash_lock != NULL)
3137 mutex_exit(hash_lock);
3138
3139 /*
3140 * At this point, we have a level 1 cache miss. Try again in
3141 * L2ARC if possible.
3142 */
3143 ASSERT3U(hdr->b_size, ==, size);
3144 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
3145 uint64_t, size, zbookmark_phys_t *, zb);
3146 ARCSTAT_BUMP(arcstat_misses);
3147 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3148 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3149 data, metadata, misses);
3150
3151 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
3152 /*
3153 * Read from the L2ARC if the following are true:
3154 * 1. The L2ARC vdev was previously cached.
3155 * 2. This buffer still has L2ARC metadata.
3156 * 3. This buffer isn't currently writing to the L2ARC.
3157 * 4. The L2ARC entry wasn't evicted, which may
3158 * also have invalidated the vdev.
3159  * 5. Either this isn't a prefetch or l2arc_noprefetch isn't set.
3160 */
3161 if (hdr->b_l2hdr != NULL &&
3162 !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
3163 !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
3164 l2arc_read_callback_t *cb;
3165
3166 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
3167 ARCSTAT_BUMP(arcstat_l2_hits);
3168
3169 cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
3170 KM_SLEEP);
3171 cb->l2rcb_buf = buf;
3172 cb->l2rcb_spa = spa;
3173 cb->l2rcb_bp = *bp;
3174 cb->l2rcb_zb = *zb;
3175 cb->l2rcb_flags = zio_flags;
3176 cb->l2rcb_compress = b_compress;
3177
3178 ASSERT(addr >= VDEV_LABEL_START_SIZE &&
3179 addr + size < vd->vdev_psize -
3180 VDEV_LABEL_END_SIZE);
3181
3182 /*
3183 * l2arc read. The SCL_L2ARC lock will be
3184 * released by l2arc_read_done().
3185 * Issue a null zio if the underlying buffer
3186 * was squashed to zero size by compression.
3187 */
3188 if (b_compress == ZIO_COMPRESS_EMPTY) {
3189 rzio = zio_null(pio, spa, vd,
3190 l2arc_read_done, cb,
3191 zio_flags | ZIO_FLAG_DONT_CACHE |
3192 ZIO_FLAG_CANFAIL |
3193 ZIO_FLAG_DONT_PROPAGATE |
3194 ZIO_FLAG_DONT_RETRY);
3195 } else {
3196 rzio = zio_read_phys(pio, vd, addr,
3197 b_asize, buf->b_data,
3198 ZIO_CHECKSUM_OFF,
3199 l2arc_read_done, cb, priority,
3200 zio_flags | ZIO_FLAG_DONT_CACHE |
3201 ZIO_FLAG_CANFAIL |
3202 ZIO_FLAG_DONT_PROPAGATE |
3203 ZIO_FLAG_DONT_RETRY, B_FALSE);
3204 }
3205 DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
3206 zio_t *, rzio);
3207 ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize);
3208
3209 if (*arc_flags & ARC_NOWAIT) {
3210 zio_nowait(rzio);
3211 return (0);
3212 }
3213
3214 ASSERT(*arc_flags & ARC_WAIT);
3215 if (zio_wait(rzio) == 0)
3216 return (0);
3217
3218 /* l2arc read error; goto zio_read() */
3219 } else {
3220 DTRACE_PROBE1(l2arc__miss,
3221 arc_buf_hdr_t *, hdr);
3222 ARCSTAT_BUMP(arcstat_l2_misses);
3223 if (HDR_L2_WRITING(hdr))
3224 ARCSTAT_BUMP(arcstat_l2_rw_clash);
3225 spa_config_exit(spa, SCL_L2ARC, vd);
3226 }
3227 } else {
3228 if (vd != NULL)
3229 spa_config_exit(spa, SCL_L2ARC, vd);
3230 if (l2arc_ndev != 0) {
3231 DTRACE_PROBE1(l2arc__miss,
3232 arc_buf_hdr_t *, hdr);
3233 ARCSTAT_BUMP(arcstat_l2_misses);
3234 }
3235 }
3236
3237 rzio = zio_read(pio, spa, bp, buf->b_data, size,
3238 arc_read_done, buf, priority, zio_flags, zb);
3239
3240 if (*arc_flags & ARC_WAIT)
3241 return (zio_wait(rzio));
3242
3243 ASSERT(*arc_flags & ARC_NOWAIT);
3244 zio_nowait(rzio);
3245 }
3246 return (0);
3247 }
3248
3249 void
3250 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
3251 {
3252 ASSERT(buf->b_hdr != NULL);
3253 ASSERT(buf->b_hdr->b_state != arc_anon);
3254 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
3255 ASSERT(buf->b_efunc == NULL);
3256 ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
3257
3258 buf->b_efunc = func;
3259 buf->b_private = private;
3260 }
3261
3262 /*
3263 * Notify the arc that a block was freed, and thus will never be used again.
3264 */
3265 void
3266 arc_freed(spa_t *spa, const blkptr_t *bp)
3267 {
3268 arc_buf_hdr_t *hdr;
3269 kmutex_t *hash_lock;
3270 uint64_t guid = spa_load_guid(spa);
3271
3272 ASSERT(!BP_IS_EMBEDDED(bp));
3273
3274 hdr = buf_hash_find(guid, bp, &hash_lock);
3275 if (hdr == NULL)
3276 return;
3277 if (HDR_BUF_AVAILABLE(hdr)) {
3278 arc_buf_t *buf = hdr->b_buf;
3279 add_reference(hdr, hash_lock, FTAG);
3280 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3281 mutex_exit(hash_lock);
3282
3283 arc_release(buf, FTAG);
3284 (void) arc_buf_remove_ref(buf, FTAG);
3285 } else {
3286 mutex_exit(hash_lock);
3287 }
3288
3289 }
3290
3291 /*
3292 * Clear the user eviction callback set by arc_set_callback(), first calling
3293  * it if it exists. Because the presence of a callback keeps an arc_buf
3294  * cached, clearing the callback may result in that arc_buf being destroyed.
3295  * However, it will not destroy the *last* arc_buf, so the data remains
3296  * cached in the ARC. We save local copies of the callback and its argument
3297  * so that the callback can be invoked without holding any locks.
3298 *
3299 * It's possible that the callback is already in the process of being cleared
3300 * by another thread. In this case we can not clear the callback.
3301 *
3302 * Returns B_TRUE if the callback was successfully called and cleared.
3303 */
3304 boolean_t
3305 arc_clear_callback(arc_buf_t *buf)
3306 {
3307 arc_buf_hdr_t *hdr;
3308 kmutex_t *hash_lock;
3309 arc_evict_func_t *efunc = buf->b_efunc;
3310 void *private = buf->b_private;
3311
3312 mutex_enter(&buf->b_evict_lock);
3313 hdr = buf->b_hdr;
3314 if (hdr == NULL) {
3315 /*
3316 * We are in arc_do_user_evicts().
3317 */
3318 ASSERT(buf->b_data == NULL);
3319 mutex_exit(&buf->b_evict_lock);
3320 return (B_FALSE);
3321 } else if (buf->b_data == NULL) {
3322 /*
3323 * We are on the eviction list; process this buffer now
3324 * but let arc_do_user_evicts() do the reaping.
3325 */
3326 buf->b_efunc = NULL;
3327 mutex_exit(&buf->b_evict_lock);
3328 VERIFY0(efunc(private));
3329 return (B_TRUE);
3330 }
3331 hash_lock = HDR_LOCK(hdr);
3332 mutex_enter(hash_lock);
3333 hdr = buf->b_hdr;
3334 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3335
3336 ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
3337 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3338
3339 buf->b_efunc = NULL;
3340 buf->b_private = NULL;
3341
3342 if (hdr->b_datacnt > 1) {
3343 mutex_exit(&buf->b_evict_lock);
3344 arc_buf_destroy(buf, FALSE, TRUE);
3345 } else {
3346 ASSERT(buf == hdr->b_buf);
3347 hdr->b_flags |= ARC_BUF_AVAILABLE;
3348 mutex_exit(&buf->b_evict_lock);
3349 }
3350
3351 mutex_exit(hash_lock);
3352 VERIFY0(efunc(private));
3353 return (B_TRUE);
3354 }
3355
3356 /*
3357 * Release this buffer from the cache, making it an anonymous buffer. This
3358 * must be done after a read and prior to modifying the buffer contents.
3359 * If the buffer has more than one reference, we must make
3360 * a new hdr for the buffer.
3361 */
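/*
 * A typical (sketched, not verbatim) sequence for modifying a cached block
 * in place is:
 *
 *	arc_release(buf, tag);		(anonymize, detach any L2ARC copy)
 *	... modify buf->b_data ...
 *	zio = arc_write(pio, spa, txg, bp, buf, ...);
 *
 * so that the updated contents are written out from the now-anonymous
 * buffer.
 */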
3362 void
3363 arc_release(arc_buf_t *buf, void *tag)
3364 {
3365 arc_buf_hdr_t *hdr;
3366 kmutex_t *hash_lock = NULL;
3367 l2arc_buf_hdr_t *l2hdr;
3368 uint64_t buf_size;
3369
3370 /*
3371 * It would be nice to assert that if it's DMU metadata (level >
3372 * 0 || it's the dnode file), then it must be syncing context.
3373 * But we don't know that information at this level.
3374 */
3375
3376 mutex_enter(&buf->b_evict_lock);
3377 hdr = buf->b_hdr;
3378
3379 /* this buffer is not on any list */
3380 ASSERT(refcount_count(&hdr->b_refcnt) > 0);
3381
3382 if (hdr->b_state == arc_anon) {
3383 /* this buffer is already released */
3384 ASSERT(buf->b_efunc == NULL);
3385 } else {
3386 hash_lock = HDR_LOCK(hdr);
3387 mutex_enter(hash_lock);
3388 hdr = buf->b_hdr;
3389 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3390 }
3391
3392 l2hdr = hdr->b_l2hdr;
3393 if (l2hdr) {
3394 mutex_enter(&l2arc_buflist_mtx);
3395 hdr->b_l2hdr = NULL;
3396 list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
3397 }
3398 buf_size = hdr->b_size;
3399
3400 /*
3401 * Do we have more than one buf?
3402 */
3403 if (hdr->b_datacnt > 1) {
3404 arc_buf_hdr_t *nhdr;
3405 arc_buf_t **bufp;
3406 uint64_t blksz = hdr->b_size;
3407 uint64_t spa = hdr->b_spa;
3408 arc_buf_contents_t type = hdr->b_type;
3409 uint32_t flags = hdr->b_flags;
3410
3411 ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
3412 /*
3413 * Pull the data off of this hdr and attach it to
3414 * a new anonymous hdr.
3415 */
3416 (void) remove_reference(hdr, hash_lock, tag);
3417 bufp = &hdr->b_buf;
3418 while (*bufp != buf)
3419 bufp = &(*bufp)->b_next;
3420 *bufp = buf->b_next;
3421 buf->b_next = NULL;
3422
3423 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
3424 atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
3425 if (refcount_is_zero(&hdr->b_refcnt)) {
3426 uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
3427 ASSERT3U(*size, >=, hdr->b_size);
3428 atomic_add_64(size, -hdr->b_size);
3429 }
3430
3431 /*
3432 * We're releasing a duplicate user data buffer, update
3433 * our statistics accordingly.
3434 */
3435 if (hdr->b_type == ARC_BUFC_DATA) {
3436 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
3437 ARCSTAT_INCR(arcstat_duplicate_buffers_size,
3438 -hdr->b_size);
3439 }
3440 hdr->b_datacnt -= 1;
3441 arc_cksum_verify(buf);
3442 arc_buf_unwatch(buf);
3443
3444 mutex_exit(hash_lock);
3445
3446 nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
3447 nhdr->b_size = blksz;
3448 nhdr->b_spa = spa;
3449 nhdr->b_type = type;
3450 nhdr->b_buf = buf;
3451 nhdr->b_state = arc_anon;
3452 nhdr->b_arc_access = 0;
3453 nhdr->b_flags = flags & ARC_L2_WRITING;
3454 nhdr->b_l2hdr = NULL;
3455 nhdr->b_datacnt = 1;
3456 nhdr->b_freeze_cksum = NULL;
3457 (void) refcount_add(&nhdr->b_refcnt, tag);
3458 buf->b_hdr = nhdr;
3459 mutex_exit(&buf->b_evict_lock);
3460 atomic_add_64(&arc_anon->arcs_size, blksz);
3461 } else {
3462 mutex_exit(&buf->b_evict_lock);
3463 ASSERT(refcount_count(&hdr->b_refcnt) == 1);
3464 ASSERT(!list_link_active(&hdr->b_arc_node));
3465 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3466 if (hdr->b_state != arc_anon)
3467 arc_change_state(arc_anon, hdr, hash_lock);
3468 hdr->b_arc_access = 0;
3469 if (hash_lock)
3470 mutex_exit(hash_lock);
3471
3472 buf_discard_identity(hdr);
3473 arc_buf_thaw(buf);
3474 }
3475 buf->b_efunc = NULL;
3476 buf->b_private = NULL;
3477
3478 if (l2hdr) {
3479 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
3480 if (l2hdr->b_dev->l2ad_vdev)
3481 vdev_space_update(l2hdr->b_dev->l2ad_vdev,
3482 -l2hdr->b_asize, 0, 0);
3483 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
3484 ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3485 mutex_exit(&l2arc_buflist_mtx);
3486 }
3487 }
3488
3489 int
3490 arc_released(arc_buf_t *buf)
3491 {
3492 int released;
3493
3494 mutex_enter(&buf->b_evict_lock);
3495 released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
3496 mutex_exit(&buf->b_evict_lock);
3497 return (released);
3498 }
3499
3500 #ifdef ZFS_DEBUG
3501 int
3502 arc_referenced(arc_buf_t *buf)
3503 {
3504 int referenced;
3505
3506 mutex_enter(&buf->b_evict_lock);
3507 referenced = (refcount_count(&buf->b_hdr->b_refcnt));
3508 mutex_exit(&buf->b_evict_lock);
3509 return (referenced);
3510 }
3511 #endif
3512
3513 static void
3514 arc_write_ready(zio_t *zio)
3515 {
3516 arc_write_callback_t *callback = zio->io_private;
3517 arc_buf_t *buf = callback->awcb_buf;
3518 arc_buf_hdr_t *hdr = buf->b_hdr;
3519
3520 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
3521 callback->awcb_ready(zio, buf, callback->awcb_private);
3522
3523 /*
3524 * If the IO is already in progress, then this is a re-write
3525 * attempt, so we need to thaw and re-compute the cksum.
3526 * It is the responsibility of the callback to handle the
3527 * accounting for any re-write attempt.
3528 */
3529 if (HDR_IO_IN_PROGRESS(hdr)) {
3530 mutex_enter(&hdr->b_freeze_lock);
3531 if (hdr->b_freeze_cksum != NULL) {
3532 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3533 hdr->b_freeze_cksum = NULL;
3534 }
3535 mutex_exit(&hdr->b_freeze_lock);
3536 }
3537 arc_cksum_compute(buf, B_FALSE);
3538 hdr->b_flags |= ARC_IO_IN_PROGRESS;
3539 }
3540
3541 /*
3542 * The SPA calls this callback for each physical write that happens on behalf
3543 * of a logical write. See the comment in dbuf_write_physdone() for details.
3544 */
3545 static void
3546 arc_write_physdone(zio_t *zio)
3547 {
3548 arc_write_callback_t *cb = zio->io_private;
3549 if (cb->awcb_physdone != NULL)
3550 cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
3551 }
3552
3553 static void
3554 arc_write_done(zio_t *zio)
3555 {
3556 arc_write_callback_t *callback = zio->io_private;
3557 arc_buf_t *buf = callback->awcb_buf;
3558 arc_buf_hdr_t *hdr = buf->b_hdr;
3559
3560 ASSERT(hdr->b_acb == NULL);
3561
3562 if (zio->io_error == 0) {
3563 if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
3564 buf_discard_identity(hdr);
3565 } else {
3566 hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3567 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
3568 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3569 }
3570 } else {
3571 ASSERT(BUF_EMPTY(hdr));
3572 }
3573
3574 /*
3575 * If the block to be written was all-zero or compressed enough to be
3576 * embedded in the BP, no write was performed so there will be no
3577 * dva/birth/checksum. The buffer must therefore remain anonymous
3578 * (and uncached).
3579 */
3580 if (!BUF_EMPTY(hdr)) {
3581 arc_buf_hdr_t *exists;
3582 kmutex_t *hash_lock;
3583
3584 ASSERT(zio->io_error == 0);
3585
3586 arc_cksum_verify(buf);
3587
3588 exists = buf_hash_insert(hdr, &hash_lock);
3589 if (exists) {
3590 /*
3591 * This can only happen if we overwrite for
3592 * sync-to-convergence, because we remove
3593 * buffers from the hash table when we arc_free().
3594 */
3595 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
3596 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3597 panic("bad overwrite, hdr=%p exists=%p",
3598 (void *)hdr, (void *)exists);
3599 ASSERT(refcount_is_zero(&exists->b_refcnt));
3600 arc_change_state(arc_anon, exists, hash_lock);
3601 mutex_exit(hash_lock);
3602 arc_hdr_destroy(exists);
3603 exists = buf_hash_insert(hdr, &hash_lock);
3604 ASSERT3P(exists, ==, NULL);
3605 } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
3606 /* nopwrite */
3607 ASSERT(zio->io_prop.zp_nopwrite);
3608 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3609 panic("bad nopwrite, hdr=%p exists=%p",
3610 (void *)hdr, (void *)exists);
3611 } else {
3612 /* Dedup */
3613 ASSERT(hdr->b_datacnt == 1);
3614 ASSERT(hdr->b_state == arc_anon);
3615 ASSERT(BP_GET_DEDUP(zio->io_bp));
3616 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
3617 }
3618 }
3619 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3620 /* if it's not anon, we are doing a scrub */
3621 if (!exists && hdr->b_state == arc_anon)
3622 arc_access(hdr, hash_lock);
3623 mutex_exit(hash_lock);
3624 } else {
3625 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3626 }
3627
3628 ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3629 callback->awcb_done(zio, buf, callback->awcb_private);
3630
3631 kmem_free(callback, sizeof (arc_write_callback_t));
3632 }
3633
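/*
 * Initiate a write of an anonymous ARC buffer: package the caller's callbacks
 * into an arc_write_callback_t, issue the underlying zio_write(), and return
 * the zio to the caller.
 */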
3634 zio_t *
3635 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3636 blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
3637 const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
3638 arc_done_func_t *done, void *private, zio_priority_t priority,
3639 int zio_flags, const zbookmark_phys_t *zb)
3640 {
3641 arc_buf_hdr_t *hdr = buf->b_hdr;
3642 arc_write_callback_t *callback;
3643 zio_t *zio;
3644
3645 ASSERT(ready != NULL);
3646 ASSERT(done != NULL);
3647 ASSERT(!HDR_IO_ERROR(hdr));
3648 ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3649 ASSERT(hdr->b_acb == NULL);
3650 if (l2arc)
3651 hdr->b_flags |= ARC_L2CACHE;
3652 if (l2arc_compress)
3653 hdr->b_flags |= ARC_L2COMPRESS;
3654 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
3655 callback->awcb_ready = ready;
3656 callback->awcb_physdone = physdone;
3657 callback->awcb_done = done;
3658 callback->awcb_private = private;
3659 callback->awcb_buf = buf;
3660
3661 zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
3662 arc_write_ready, arc_write_physdone, arc_write_done, callback,
3663 priority, zio_flags, zb);
3664
3665 return (zio);
3666 }
3667
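/*
 * Throttle new dirty data when system memory is tight. Returns 0 if the
 * reserve may proceed, or ERESTART/EAGAIN to make the caller back off and
 * retry the reservation later.
 */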
3668 static int
3669 arc_memory_throttle(uint64_t reserve, uint64_t txg)
3670 {
3671 #ifdef _KERNEL
3672 uint64_t available_memory = ptob(freemem);
3673 static uint64_t page_load = 0;
3674 static uint64_t last_txg = 0;
3675
3676 #if defined(__i386)
3677 available_memory =
3678 MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
3679 #endif
3680
3681 if (freemem > physmem * arc_lotsfree_percent / 100)
3682 return (0);
3683
3684 if (txg > last_txg) {
3685 last_txg = txg;
3686 page_load = 0;
3687 }
3688 /*
3689	 * If we are in pageout, we know that memory is already tight and
3690	 * the arc is already going to be evicting, so we just want to
3691	 * continue to let page writes occur as quickly as possible.
3692 */
3693 if (curproc == proc_pageout) {
3694 if (page_load > MAX(ptob(minfree), available_memory) / 4)
3695 return (SET_ERROR(ERESTART));
3696 /* Note: reserve is inflated, so we deflate */
3697 page_load += reserve / 8;
3698 return (0);
3699 } else if (page_load > 0 && arc_reclaim_needed()) {
3700 /* memory is low, delay before restarting */
3701 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3702 return (SET_ERROR(EAGAIN));
3703 }
3704 page_load = 0;
3705 #endif
3706 return (0);
3707 }
3708
3709 void
3710 arc_tempreserve_clear(uint64_t reserve)
3711 {
3712 atomic_add_64(&arc_tempreserve, -reserve);
3713 ASSERT((int64_t)arc_tempreserve >= 0);
3714 }
3715
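/*
 * Reserve space for in-flight dirty data on behalf of the given txg,
 * throttling the caller if memory is low or if the anonymous (dirty) portion
 * of the ARC is already too large. The reservation is released by
 * arc_tempreserve_clear().
 */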
3716 int
3717 arc_tempreserve_space(uint64_t reserve, uint64_t txg)
3718 {
3719 int error;
3720 uint64_t anon_size;
3721
3722 if (reserve > arc_c/4 && !arc_no_grow)
3723 arc_c = MIN(arc_c_max, reserve * 4);
3724 if (reserve > arc_c)
3725 return (SET_ERROR(ENOMEM));
3726
3727 /*
3728 * Don't count loaned bufs as in flight dirty data to prevent long
3729 * network delays from blocking transactions that are ready to be
3730 * assigned to a txg.
3731 */
3732 anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
3733
3734 /*
3735 * Writes will, almost always, require additional memory allocations
3736 * in order to compress/encrypt/etc the data. We therefore need to
3737 * make sure that there is sufficient available memory for this.
3738 */
3739 error = arc_memory_throttle(reserve, txg);
3740 if (error != 0)
3741 return (error);
3742
3743 /*
3744 * Throttle writes when the amount of dirty data in the cache
3745 * gets too large. We try to keep the cache less than half full
3746 * of dirty blocks so that our sync times don't grow too large.
3747 * Note: if two requests come in concurrently, we might let them
3748 * both succeed, when one of them should fail. Not a huge deal.
3749 */
3750
3751 if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
3752 anon_size > arc_c / 4) {
3753 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
3754 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
3755 arc_tempreserve>>10,
3756 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
3757 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
3758 reserve>>10, arc_c>>10);
3759 return (SET_ERROR(ERESTART));
3760 }
3761 atomic_add_64(&arc_tempreserve, reserve);
3762 return (0);
3763 }
3764
3765 /* Tunable, default is 64, which is essentially arbitrary */
3766 int zfs_flush_ntasks = 64;
3767
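/*
 * Set up the ARC: size the cache from physical memory and the tunables,
 * create the per-state lists and locks, the flush taskq, the kstats and the
 * reclaim thread, and compute the per-pool dirty data limit.
 */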
3768 void
3769 arc_init(void)
3770 {
3771 mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
3772 cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
3773
3774 /* Convert seconds to clock ticks */
3775 arc_min_prefetch_lifespan = 1 * hz;
3776
3777 /* Start out with 1/8 of all memory */
3778 arc_c = physmem * PAGESIZE / 8;
3779
3780 #ifdef _KERNEL
3781 /*
3782 * On architectures where the physical memory can be larger
3783	 * than the addressable space (Intel in 32-bit mode), we may
3784 * need to limit the cache to 1/8 of VM size.
3785 */
3786 arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
3787 #endif
3788
3789 /* set min cache to 1/32 of all memory, or 64MB, whichever is more */
3790 arc_c_min = MAX(arc_c / 4, 64<<20);
3791 /* set max to 3/4 of all memory, or all but 1GB, whichever is more */
3792 if (arc_c * 8 >= 1<<30)
3793 arc_c_max = (arc_c * 8) - (1<<30);
3794 else
3795 arc_c_max = arc_c_min;
3796 arc_c_max = MAX(arc_c * 6, arc_c_max);
3797
3798 /*
3799 * Allow the tunables to override our calculations if they are
3800	 * reasonable (i.e. over 64MB)
3801 */
3802 if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
3803 arc_c_max = zfs_arc_max;
3804 if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max)
3805 arc_c_min = zfs_arc_min;
3806
3807 arc_c = arc_c_max;
3808 arc_p = (arc_c >> 1);
3809
3810 /* limit meta-data to 1/4 of the arc capacity */
3811 arc_meta_limit = arc_c_max / 4;
3812
3813 /* Allow the tunable to override if it is reasonable */
3814 if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
3815 arc_meta_limit = zfs_arc_meta_limit;
3816
3817 if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
3818 arc_c_min = arc_meta_limit / 2;
3819
3820 if (zfs_arc_grow_retry > 0)
3821 arc_grow_retry = zfs_arc_grow_retry;
3822
3823 if (zfs_arc_shrink_shift > 0)
3824 arc_shrink_shift = zfs_arc_shrink_shift;
3825
3826 if (zfs_arc_p_min_shift > 0)
3827 arc_p_min_shift = zfs_arc_p_min_shift;
3828
3829	/* if kmem_flags are set, let's try to use less memory */
3830 if (kmem_debugging())
3831 arc_c = arc_c / 2;
3832 if (arc_c < arc_c_min)
3833 arc_c = arc_c_min;
3834
3835 arc_anon = &ARC_anon;
3836 arc_mru = &ARC_mru;
3837 arc_mru_ghost = &ARC_mru_ghost;
3838 arc_mfu = &ARC_mfu;
3839 arc_mfu_ghost = &ARC_mfu_ghost;
3840 arc_l2c_only = &ARC_l2c_only;
3841 arc_size = 0;
3842
3843 mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3844 mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3845 mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3846 mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3847 mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3848 mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3849
3850 list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
3851 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3852 list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
3853 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3854 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
3855 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3856 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
3857 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3858 list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
3859 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3860 list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
3861 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3862 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
3863 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3864 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
3865 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3866 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
3867 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3868 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
3869 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3870
3871 arc_flush_taskq = taskq_create("arc_flush_tq",
3872 max_ncpus, minclsyspri, 1, zfs_flush_ntasks, TASKQ_DYNAMIC);
3873 buf_init();
3874
3875 arc_thread_exit = 0;
3876 arc_eviction_list = NULL;
3877 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
3878 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
3879
3880 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
3881 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
3882
3883 if (arc_ksp != NULL) {
3884 arc_ksp->ks_data = &arc_stats;
3885 kstat_install(arc_ksp);
3886 }
3887
3888 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
3889 TS_RUN, minclsyspri);
3890
3891 arc_dead = FALSE;
3892 arc_warm = B_FALSE;
3893
3894 /*
3895 * Calculate maximum amount of dirty data per pool.
3896 *
3897 * If it has been set by /etc/system, take that.
3898 * Otherwise, use a percentage of physical memory defined by
3899 * zfs_dirty_data_max_percent (default 10%) with a cap at
3900 * zfs_dirty_data_max_max (default 4GB).
3901 */
3902 if (zfs_dirty_data_max == 0) {
3903 zfs_dirty_data_max = physmem * PAGESIZE *
3904 zfs_dirty_data_max_percent / 100;
3905 zfs_dirty_data_max = MIN(zfs_dirty_data_max,
3906 zfs_dirty_data_max_max);
3907 }
3908 }
3909
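/*
 * Tear down the ARC: stop the reclaim thread, flush all cached buffers, and
 * destroy the kstats, lists, locks and flush taskq created by arc_init().
 */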
3910 void
3911 arc_fini(void)
3912 {
3913 mutex_enter(&arc_reclaim_thr_lock);
3914 arc_thread_exit = 1;
3915 while (arc_thread_exit != 0)
3916 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
3917 mutex_exit(&arc_reclaim_thr_lock);
3918
3919 arc_flush(NULL);
3920
3921 arc_dead = TRUE;
3922
3923 if (arc_ksp != NULL) {
3924 kstat_delete(arc_ksp);
3925 arc_ksp = NULL;
3926 }
3927
3928 mutex_destroy(&arc_eviction_mtx);
3929 mutex_destroy(&arc_reclaim_thr_lock);
3930 cv_destroy(&arc_reclaim_thr_cv);
3931
3932 list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
3933 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
3934 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
3935 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
3936 list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
3937 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
3938 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
3939 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
3940
3941 mutex_destroy(&arc_anon->arcs_mtx);
3942 mutex_destroy(&arc_mru->arcs_mtx);
3943 mutex_destroy(&arc_mru_ghost->arcs_mtx);
3944 mutex_destroy(&arc_mfu->arcs_mtx);
3945 mutex_destroy(&arc_mfu_ghost->arcs_mtx);
3946 mutex_destroy(&arc_l2c_only->arcs_mtx);
3947
3948 taskq_destroy(arc_flush_taskq);
3949 buf_fini();
3950
3951 ASSERT(arc_loaned_bytes == 0);
3952 }
3953
3954 /*
3955 * Level 2 ARC
3956 *
3957 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
3958 * It uses dedicated storage devices to hold cached data, which are populated
3959 * using large infrequent writes. The main role of this cache is to boost
3960 * the performance of random read workloads. The intended L2ARC devices
3961 * include short-stroked disks, solid state disks, and other media with
3962 * substantially faster read latency than disk.
3963 *
3964 * +-----------------------+
3965 * | ARC |
3966 * +-----------------------+
3967 * | ^ ^
3968 * | | |
3969 * l2arc_feed_thread() arc_read()
3970 * | | |
3971 * | l2arc read |
3972 * V | |
3973 * +---------------+ |
3974 * | L2ARC | |
3975 * +---------------+ |
3976 * | ^ |
3977 * l2arc_write() | |
3978 * | | |
3979 * V | |
3980 * +-------+ +-------+
3981 * | vdev | | vdev |
3982 * | cache | | cache |
3983 * +-------+ +-------+
3984 * +=========+ .-----.
3985 * : L2ARC : |-_____-|
3986 * : devices : | Disks |
3987 * +=========+ `-_____-'
3988 *
3989 * Read requests are satisfied from the following sources, in order:
3990 *
3991 * 1) ARC
3992 * 2) vdev cache of L2ARC devices
3993 * 3) L2ARC devices
3994 * 4) vdev cache of disks
3995 * 5) disks
3996 *
3997 * Some L2ARC device types exhibit extremely slow write performance.
3998 * To accommodate this, there are some significant differences between
3999 * the L2ARC and traditional cache design:
4000 *
4001 * 1. There is no eviction path from the ARC to the L2ARC. Evictions from
4002 * the ARC behave as usual, freeing buffers and placing headers on ghost
4003 * lists. The ARC does not send buffers to the L2ARC during eviction as
4004 * this would add inflated write latencies for all ARC memory pressure.
4005 *
4006 * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
4007 * It does this by periodically scanning buffers from the eviction-end of
4008 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
4009 * not already there. It scans until a headroom of buffers is satisfied,
4010 * which itself is a buffer for ARC eviction. If a compressible buffer is
4011 * found during scanning and selected for writing to an L2ARC device, we
4012 * temporarily boost scanning headroom during the next scan cycle to make
4013 * sure we adapt to compression effects (which might significantly reduce
4014 * the data volume we write to L2ARC). The thread that does this is
4015 * l2arc_feed_thread(), illustrated below; example sizes are included to
4016 * provide a better sense of ratio than this diagram:
4017 *
4018 * head --> tail
4019 * +---------------------+----------+
4020 * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC
4021 * +---------------------+----------+ | o L2ARC eligible
4022 * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer
4023 * +---------------------+----------+ |
4024 * 15.9 Gbytes ^ 32 Mbytes |
4025 * headroom |
4026 * l2arc_feed_thread()
4027 * |
4028 * l2arc write hand <--[oooo]--'
4029 * | 8 Mbyte
4030 * | write max
4031 * V
4032 * +==============================+
4033 * L2ARC dev |####|#|###|###| |####| ... |
4034 * +==============================+
4035 * 32 Gbytes
4036 *
4037 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
4038 * evicted, then the L2ARC has cached a buffer much sooner than it probably
4039 * needed to, potentially wasting L2ARC device bandwidth and storage. It is
4040 * safe to say that this is an uncommon case, since buffers at the end of
4041 * the ARC lists have moved there due to inactivity.
4042 *
4043 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
4044 * then the L2ARC simply misses copying some buffers. This serves as a
4045 * pressure valve to prevent heavy read workloads from both stalling the ARC
4046 * with waits and clogging the L2ARC with writes. This also helps prevent
4047 * the potential for the L2ARC to churn if it attempts to cache content too
4048 * quickly, such as during backups of the entire pool.
4049 *
4050 * 5. After system boot and before the ARC has filled main memory, there are
4051 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
4052 * lists can remain mostly static. Instead of searching from the tail of these
4053 * lists as pictured, the l2arc_feed_thread() will search from the list heads
4054 * for eligible buffers, greatly increasing its chance of finding them.
4055 *
4056 * The L2ARC device write speed is also boosted during this time so that
4057 * the L2ARC warms up faster. Since there have been no ARC evictions yet,
4058 * there are no L2ARC reads, and no fear of degrading read performance
4059 * through increased writes.
4060 *
4061 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
4062 * the vdev queue can aggregate them into larger and fewer writes. Each
4063 * device is written to in a rotor fashion, sweeping writes through
4064 * available space then repeating.
4065 *
4066 * 7. The L2ARC does not store dirty content. It never needs to flush
4067 * write buffers back to disk-based storage.
4068 *
4069 * 8. If an ARC buffer is written (and dirtied) which also exists in the
4070 * L2ARC, the now stale L2ARC buffer is immediately dropped.
4071 *
4072 * The performance of the L2ARC can be tweaked by a number of tunables, which
4073 * may be necessary for different workloads:
4074 *
4075 * l2arc_write_max max write bytes per interval
4076 * l2arc_write_boost extra write bytes during device warmup
4077 * l2arc_noprefetch skip caching prefetched buffers
4078 * l2arc_headroom number of max device writes to precache
4079 * l2arc_headroom_boost when we find compressed buffers during ARC
4080 * scanning, we multiply headroom by this
4081 * percentage factor for the next scan cycle,
4082 * since more compressed buffers are likely to
4083 * be present
4084 * l2arc_feed_secs seconds between L2ARC writing
4085 *
4086 * Tunables may be removed or added as future performance improvements are
4087 * integrated, and also may become zpool properties.
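 *
 * As a purely illustrative sketch (the values are arbitrary examples, not
 * recommendations), such tunables could be adjusted from /etc/system:
 *
 *	set zfs:l2arc_write_max = 0x4000000
 *	set zfs:l2arc_feed_secs = 2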
4088 *
4089 * There are three key functions that control how the L2ARC warms up:
4090 *
4091 * l2arc_write_eligible() check if a buffer is eligible to cache
4092 * l2arc_write_size() calculate how much to write
4093 * l2arc_write_interval() calculate sleep delay between writes
4094 *
4095 * These three functions determine what to write, how much, and how quickly
4096 * to send writes.
4097 */
4098
4099 static boolean_t
4100 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
4101 {
4102 /*
4103 * A buffer is *not* eligible for the L2ARC if it:
4104 * 1. belongs to a different spa.
4105 * 2. is already cached on the L2ARC.
4106 * 3. has an I/O in progress (it may be an incomplete read).
4107 * 4. is flagged not eligible (zfs property).
4108 */
4109 if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL ||
4110 HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab))
4111 return (B_FALSE);
4112
4113 return (B_TRUE);
4114 }
4115
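/*
 * Determine how many bytes the feed thread should aim to write to the L2ARC
 * device on this pass, boosting the target while the ARC is still warming up.
 */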
4116 static uint64_t
4117 l2arc_write_size(void)
4118 {
4119 uint64_t size;
4120
4121 /*
4122 * Make sure our globals have meaningful values in case the user
4123 * altered them.
4124 */
4125 size = l2arc_write_max;
4126 if (size == 0) {
4127 cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
4128 "be greater than zero, resetting it to the default (%d)",
4129 L2ARC_WRITE_SIZE);
4130 size = l2arc_write_max = L2ARC_WRITE_SIZE;
4131 }
4132
4133 if (arc_warm == B_FALSE)
4134 size += l2arc_write_boost;
4135
4136 return (size);
4138 }
4139
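/*
 * Compute the absolute time (in lbolt ticks) at which the feed thread should
 * next wake up, based on how much of the wanted write size was actually
 * written. As a rough example, assuming the defaults of l2arc_feed_secs == 1
 * and l2arc_feed_min_ms == 200: writing more than half of 'wanted' schedules
 * the next feed about 200ms after 'began', otherwise about one second after.
 */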
4140 static clock_t
4141 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
4142 {
4143 clock_t interval, next, now;
4144
4145 /*
4146 * If the ARC lists are busy, increase our write rate; if the
4147 * lists are stale, idle back. This is achieved by checking
4148 * how much we previously wrote - if it was more than half of
4149 * what we wanted, schedule the next write much sooner.
4150 */
4151 if (l2arc_feed_again && wrote > (wanted / 2))
4152 interval = (hz * l2arc_feed_min_ms) / 1000;
4153 else
4154 interval = hz * l2arc_feed_secs;
4155
4156 now = ddi_get_lbolt();
4157 next = MAX(now, MIN(now + interval, began + interval));
4158
4159 return (next);
4160 }
4161
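/*
 * Shift header size accounting between the in-core ARC header kstat and the
 * L2ARC header kstat.
 */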
4162 static void
4163 l2arc_hdr_stat_add(void)
4164 {
4165 ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
4166 ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4167 }
4168
4169 static void
4170 l2arc_hdr_stat_remove(void)
4171 {
4172 ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
4173 ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
4174 }
4175
4176 /*
4177 * Cycle through L2ARC devices. This is how L2ARC load balances.
4178 * If a device is returned, this also returns holding the spa config lock.
4179 */
4180 static l2arc_dev_t *
4181 l2arc_dev_get_next(void)
4182 {
4183 l2arc_dev_t *first, *next = NULL;
4184
4185 /*
4186 * Lock out the removal of spas (spa_namespace_lock), then removal
4187 * of cache devices (l2arc_dev_mtx). Once a device has been selected,
4188 * both locks will be dropped and a spa config lock held instead.
4189 */
4190 mutex_enter(&spa_namespace_lock);
4191 mutex_enter(&l2arc_dev_mtx);
4192
4193 /* if there are no vdevs, there is nothing to do */
4194 if (l2arc_ndev == 0)
4195 goto out;
4196
4197 first = NULL;
4198 next = l2arc_dev_last;
4199 do {
4200 /* loop around the list looking for a non-faulted vdev */
4201 if (next == NULL) {
4202 next = list_head(l2arc_dev_list);
4203 } else {
4204 next = list_next(l2arc_dev_list, next);
4205 if (next == NULL)
4206 next = list_head(l2arc_dev_list);
4207 }
4208
4209 /* if we have come back to the start, bail out */
4210 if (first == NULL)
4211 first = next;
4212 else if (next == first)
4213 break;
4214
4215 } while (vdev_is_dead(next->l2ad_vdev));
4216
4217 /* if we were unable to find any usable vdevs, return NULL */
4218 if (vdev_is_dead(next->l2ad_vdev))
4219 next = NULL;
4220
4221 l2arc_dev_last = next;
4222
4223 out:
4224 mutex_exit(&l2arc_dev_mtx);
4225
4226 /*
4227 * Grab the config lock to prevent the 'next' device from being
4228 * removed while we are writing to it.
4229 */
4230 if (next != NULL)
4231 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4232 mutex_exit(&spa_namespace_lock);
4233
4234 return (next);
4235 }
4236
4237 /*
4238 * Free buffers that were tagged for destruction.
4239 */
4240 static void
4241 l2arc_do_free_on_write()
4242 {
4243 list_t *buflist;
4244 l2arc_data_free_t *df, *df_prev;
4245
4246 mutex_enter(&l2arc_free_on_write_mtx);
4247 buflist = l2arc_free_on_write;
4248
4249 for (df = list_tail(buflist); df; df = df_prev) {
4250 df_prev = list_prev(buflist, df);
4251 ASSERT(df->l2df_data != NULL);
4252 ASSERT(df->l2df_func != NULL);
4253 df->l2df_func(df->l2df_data, df->l2df_size);
4254 list_remove(buflist, df);
4255 kmem_free(df, sizeof (l2arc_data_free_t));
4256 }
4257
4258 mutex_exit(&l2arc_free_on_write_mtx);
4259 }
4260
4261 /*
4262 * A write to a cache device has completed. Update all headers to allow
4263 * reads from these buffers to begin.
4264 */
4265 static void
4266 l2arc_write_done(zio_t *zio)
4267 {
4268 l2arc_write_callback_t *cb;
4269 l2arc_dev_t *dev;
4270 list_t *buflist;
4271 arc_buf_hdr_t *head, *ab, *ab_prev;
4272 l2arc_buf_hdr_t *abl2;
4273 kmutex_t *hash_lock;
4274 int64_t bytes_dropped = 0;
4275
4276 cb = zio->io_private;
4277 ASSERT(cb != NULL);
4278 dev = cb->l2wcb_dev;
4279 ASSERT(dev != NULL);
4280 head = cb->l2wcb_head;
4281 ASSERT(head != NULL);
4282 buflist = dev->l2ad_buflist;
4283 ASSERT(buflist != NULL);
4284 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4285 l2arc_write_callback_t *, cb);
4286
4287 if (zio->io_error != 0)
4288 ARCSTAT_BUMP(arcstat_l2_writes_error);
4289
4290 mutex_enter(&l2arc_buflist_mtx);
4291
4292 /*
4293 * All writes completed, or an error was hit.
4294 */
4295 for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4296 ab_prev = list_prev(buflist, ab);
4297 abl2 = ab->b_l2hdr;
4298
4299 /*
4300 * Release the temporary compressed buffer as soon as possible.
4301 */
4302 if (abl2->b_compress != ZIO_COMPRESS_OFF)
4303 l2arc_release_cdata_buf(ab);
4304
4305 hash_lock = HDR_LOCK(ab);
4306 if (!mutex_tryenter(hash_lock)) {
4307 /*
4308 * This buffer misses out. It may be in a stage
4309 * of eviction. Its ARC_L2_WRITING flag will be
4310 * left set, denying reads to this buffer.
4311 */
4312 ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4313 continue;
4314 }
4315
4316 if (zio->io_error != 0) {
4317 /*
4318 * Error - drop L2ARC entry.
4319 */
4320 list_remove(buflist, ab);
4321 ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4322 bytes_dropped += abl2->b_asize;
4323 ab->b_l2hdr = NULL;
4324 kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4325 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4326 }
4327
4328 /*
4329 * Allow ARC to begin reads to this L2ARC entry.
4330 */
4331 ab->b_flags &= ~ARC_L2_WRITING;
4332
4333 mutex_exit(hash_lock);
4334 }
4335
4336 atomic_inc_64(&l2arc_writes_done);
4337 list_remove(buflist, head);
4338 kmem_cache_free(hdr_cache, head);
4339 mutex_exit(&l2arc_buflist_mtx);
4340
4341 vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
4342
4343 l2arc_do_free_on_write();
4344
4345 kmem_free(cb, sizeof (l2arc_write_callback_t));
4346 }
4347
4348 /*
4349 * A read to a cache device completed. Validate buffer contents before
4350 * handing over to the regular ARC routines.
4351 */
4352 static void
4353 l2arc_read_done(zio_t *zio)
4354 {
4355 l2arc_read_callback_t *cb;
4356 arc_buf_hdr_t *hdr;
4357 arc_buf_t *buf;
4358 kmutex_t *hash_lock;
4359 int equal;
4360
4361 ASSERT(zio->io_vd != NULL);
4362 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4363
4364 spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
4365
4366 cb = zio->io_private;
4367 ASSERT(cb != NULL);
4368 buf = cb->l2rcb_buf;
4369 ASSERT(buf != NULL);
4370
4371 hash_lock = HDR_LOCK(buf->b_hdr);
4372 mutex_enter(hash_lock);
4373 hdr = buf->b_hdr;
4374 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4375
4376 /*
4377 * If the buffer was compressed, decompress it first.
4378 */
4379 if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
4380 l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
4381 ASSERT(zio->io_data != NULL);
4382
4383 /*
4384 * Check this survived the L2ARC journey.
4385 */
4386 equal = arc_cksum_equal(buf);
4387 if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
4388 mutex_exit(hash_lock);
4389 zio->io_private = buf;
4390 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
4391 zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */
4392 arc_read_done(zio);
4393 } else {
4394 mutex_exit(hash_lock);
4395 /*
4396 * Buffer didn't survive caching. Increment stats and
4397 * reissue to the original storage device.
4398 */
4399 if (zio->io_error != 0) {
4400 ARCSTAT_BUMP(arcstat_l2_io_error);
4401 } else {
4402 zio->io_error = SET_ERROR(EIO);
4403 }
4404 if (!equal)
4405 ARCSTAT_BUMP(arcstat_l2_cksum_bad);
4406
4407 /*
4408 * If there's no waiter, issue an async i/o to the primary
4409 * storage now. If there *is* a waiter, the caller must
4410 * issue the i/o in a context where it's OK to block.
4411 */
4412 if (zio->io_waiter == NULL) {
4413 zio_t *pio = zio_unique_parent(zio);
4414
4415 ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
4416
4417 zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
4418 buf->b_data, zio->io_size, arc_read_done, buf,
4419 zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
4420 }
4421 }
4422
4423 kmem_free(cb, sizeof (l2arc_read_callback_t));
4424 }
4425
4426 /*
4427 * This is the list priority from which the L2ARC will search for pages to
4428 * cache. This is used within loops (0..3) to cycle through lists in the
4429 * desired order. This order can have a significant effect on cache
4430 * performance.
4431 *
4432 * Currently the metadata lists are hit first, MFU then MRU, followed by
4433 * the data lists. This function returns a locked list, and also returns
4434 * the lock pointer.
4435 */
4436 static list_t *
4437 l2arc_list_locked(int list_num, kmutex_t **lock)
4438 {
4439 list_t *list = NULL;
4440
4441 ASSERT(list_num >= 0 && list_num <= 3);
4442
4443 switch (list_num) {
4444 case 0:
4445 list = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
4446 *lock = &arc_mfu->arcs_mtx;
4447 break;
4448 case 1:
4449 list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
4450 *lock = &arc_mru->arcs_mtx;
4451 break;
4452 case 2:
4453 list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
4454 *lock = &arc_mfu->arcs_mtx;
4455 break;
4456 case 3:
4457 list = &arc_mru->arcs_list[ARC_BUFC_DATA];
4458 *lock = &arc_mru->arcs_mtx;
4459 break;
4460 }
4461
4462 ASSERT(!(MUTEX_HELD(*lock)));
4463 mutex_enter(*lock);
4464 return (list);
4465 }
4466
4467 /*
4468 * Evict buffers from the device write hand to the distance specified in
4469 * bytes. This distance may span populated buffers, or it may span nothing.
4470 * This is clearing a region on the L2ARC device ready for writing.
4471 * If the 'all' boolean is set, every buffer is evicted.
4472 */
4473 static void
4474 _l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all,
4475 boolean_t space_update)
4476 {
4477 list_t *buflist;
4478 l2arc_buf_hdr_t *abl2;
4479 arc_buf_hdr_t *ab, *ab_prev;
4480 kmutex_t *hash_lock;
4481 uint64_t taddr;
4482 int64_t bytes_evicted = 0;
4483
4484 buflist = dev->l2ad_buflist;
4485
4486 if (buflist == NULL)
4487 return;
4488
4489 if (!all && dev->l2ad_first) {
4490 /*
4491 * This is the first sweep through the device. There is
4492 * nothing to evict.
4493 */
4494 return;
4495 }
4496
4497 if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4498 /*
4499 * When nearing the end of the device, evict to the end
4500 * before the device write hand jumps to the start.
4501 */
4502 taddr = dev->l2ad_end;
4503 } else {
4504 taddr = dev->l2ad_hand + distance;
4505 }
4506 DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4507 uint64_t, taddr, boolean_t, all);
4508
4509 top:
4510 mutex_enter(&l2arc_buflist_mtx);
4511 for (ab = list_tail(buflist); ab; ab = ab_prev) {
4512 ab_prev = list_prev(buflist, ab);
4513
4514 hash_lock = HDR_LOCK(ab);
4515 if (!mutex_tryenter(hash_lock)) {
4516 /*
4517 * Missed the hash lock. Retry.
4518 */
4519 ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
4520 mutex_exit(&l2arc_buflist_mtx);
4521 mutex_enter(hash_lock);
4522 mutex_exit(hash_lock);
4523 goto top;
4524 }
4525
4526 if (HDR_L2_WRITE_HEAD(ab)) {
4527 /*
4528 * We hit a write head node. Leave it for
4529 * l2arc_write_done().
4530 */
4531 list_remove(buflist, ab);
4532 mutex_exit(hash_lock);
4533 continue;
4534 }
4535
4536 if (!all && ab->b_l2hdr != NULL &&
4537 (ab->b_l2hdr->b_daddr > taddr ||
4538 ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
4539 /*
4540 * We've evicted to the target address,
4541 * or the end of the device.
4542 */
4543 mutex_exit(hash_lock);
4544 break;
4545 }
4546
4547 if (HDR_FREE_IN_PROGRESS(ab)) {
4548 /*
4549 * Already on the path to destruction.
4550 */
4551 mutex_exit(hash_lock);
4552 continue;
4553 }
4554
4555 if (ab->b_state == arc_l2c_only) {
4556 ASSERT(!HDR_L2_READING(ab));
4557 /*
4558 * This doesn't exist in the ARC. Destroy.
4559 * arc_hdr_destroy() will call list_remove()
4560 * and decrement arcstat_l2_size.
4561 */
4562 arc_change_state(arc_anon, ab, hash_lock);
4563 arc_hdr_destroy(ab);
4564 } else {
4565 /*
4566 * Invalidate issued or about to be issued
4567 * reads, since we may be about to write
4568 * over this location.
4569 */
4570 if (HDR_L2_READING(ab)) {
4571 ARCSTAT_BUMP(arcstat_l2_evict_reading);
4572 ab->b_flags |= ARC_L2_EVICTED;
4573 }
4574
4575 /*
4576 * Tell ARC this no longer exists in L2ARC.
4577 */
4578 if (ab->b_l2hdr != NULL) {
4579 abl2 = ab->b_l2hdr;
4580 ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4581 bytes_evicted += abl2->b_asize;
4582 ab->b_l2hdr = NULL;
4583 kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4584 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4585 }
4586 list_remove(buflist, ab);
4587
4588 /*
4589 * This may have been leftover after a
4590 * failed write.
4591 */
4592 ab->b_flags &= ~ARC_L2_WRITING;
4593 }
4594 mutex_exit(hash_lock);
4595 }
4596 mutex_exit(&l2arc_buflist_mtx);
4597
4598 /*
4599 * Note: l2ad_vdev can only be touched if space_update is set,
4600 * otherwise the vdev might have been removed by an async
4601 * spa_unload.
4602 */
4603 if (space_update) {
4604 vdev_space_update(dev->l2ad_vdev, -bytes_evicted, 0, 0);
4605 dev->l2ad_evict = taddr;
4606 }
4607 }
4608
4609 /*
4610 * Asynchronous task for eviction of all the buffers for this L2ARC device.
4611 * The task is dispatched in l2arc_evict().
4612 */
4613 typedef struct {
4614 l2arc_dev_t *dev;
4615 } l2arc_evict_data_t;
4616
4617 static void
4618 l2arc_evict_task(void *arg)
4619 {
4620 l2arc_evict_data_t *d = (l2arc_evict_data_t *)arg;
4621 ASSERT(d && d->dev);
4622
4623 /*
4624 * Evict l2arc buffers asynchronously; we need to keep the device
4625 * around until we are sure there aren't any buffers referencing it.
4626 * We do not need to hold any config locks, etc. because at this point,
4627	 * we are the only ones who know about this device (the in-core
4628 * structure), so no new buffers can be created (e.g. if the pool is
4629 * re-imported while the asynchronous eviction is in progress) that
4630 * reference this same in-core structure. Also remove the vdev link
4631 * since further use of it as l2arc device is prohibited.
4632 */
4633 d->dev->l2ad_vdev = NULL;
4634 _l2arc_evict(d->dev, 0LL, B_TRUE, B_FALSE);
4635
4636 /* Same cleanup as in the synchronous path */
4637 list_destroy(d->dev->l2ad_buflist);
4638 kmem_free(d->dev->l2ad_buflist, sizeof (list_t));
4639 kmem_free(d->dev, sizeof (l2arc_dev_t));
4640 /* Task argument cleanup */
4641 kmem_free(arg, sizeof (l2arc_evict_data_t));
4642 }
4643
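/* Tunable: if set, evict all buffers asynchronously at pool export time */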
4644 boolean_t zfs_l2arc_async_evict = B_TRUE;
4645
4646 /*
4647 * Perform l2arc eviction for buffers associated with this device.
4648 * If evicting all buffers (done at pool export time), try to evict
4649 * asynchronously, and fall back to synchronous eviction in case of error.
4650 * Tell the caller whether to clean up the device:
4651 * - B_TRUE means "asynchronous eviction, do not cleanup"
4652 * - B_FALSE means "synchronous eviction, done, please cleanup"
4653 */
4654 static boolean_t
4655 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4656 {
4657 /*
4658 * If we are evicting all the buffers for this device, which happens
4659	 * at pool export time, schedule an asynchronous task.
4660 */
4661 if (all && zfs_l2arc_async_evict) {
4662 l2arc_evict_data_t *arg =
4663 kmem_alloc(sizeof (l2arc_evict_data_t), KM_SLEEP);
4664 arg->dev = dev;
4665
4666 dev->l2ad_evict = dev->l2ad_end;
4667
4668 if ((taskq_dispatch(arc_flush_taskq, l2arc_evict_task,
4669 arg, TQ_NOSLEEP) == NULL)) {
4670 /*
4671			 * Failed to dispatch the asynchronous task;
4672			 * clean up and evict synchronously, avoiding a
4673			 * second vdev space adjustment.
4674 */
4675 kmem_free(arg, sizeof (l2arc_evict_data_t));
4676 _l2arc_evict(dev, distance, all, B_FALSE);
4677 } else {
4678 /*
4679			 * Successful dispatch, vdev space updated
4680 */
4681 return (B_TRUE);
4682 }
4683 } else {
4684 /* Evict synchronously */
4685 _l2arc_evict(dev, distance, all, B_TRUE);
4686 }
4687
4688 return (B_FALSE);
4689 }
4690
4691 /*
4692 * Find and write ARC buffers to the L2ARC device.
4693 *
4694 * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
4695 * for reading until they have completed writing.
4696 * The headroom_boost is an in-out parameter used to maintain headroom boost
4697 * state between calls to this function.
4698 *
4699 * Returns the number of bytes actually written (which may be smaller than
4700 * the delta by which the device hand has changed due to alignment).
4701 */
4702 static uint64_t
4703 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
4704 boolean_t *headroom_boost)
4705 {
4706 arc_buf_hdr_t *ab, *ab_prev, *head;
4707 list_t *list;
4708 uint64_t write_asize, write_psize, write_sz, headroom,
4709 buf_compress_minsz;
4710 void *buf_data;
4711 kmutex_t *list_lock;
4712 boolean_t full;
4713 l2arc_write_callback_t *cb;
4714 zio_t *pio, *wzio;
4715 uint64_t guid = spa_load_guid(spa);
4716 const boolean_t do_headroom_boost = *headroom_boost;
4717
4718 ASSERT(dev->l2ad_vdev != NULL);
4719
4720 /* Lower the flag now, we might want to raise it again later. */
4721 *headroom_boost = B_FALSE;
4722
4723 pio = NULL;
4724 write_sz = write_asize = write_psize = 0;
4725 full = B_FALSE;
4726 head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4727 head->b_flags |= ARC_L2_WRITE_HEAD;
4728
4729 /*
4730 * We will want to try to compress buffers that are at least 2x the
4731 * device sector size.
4732 */
4733 buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
4734
4735 /*
4736 * Copy buffers for L2ARC writing.
4737 */
4738 mutex_enter(&l2arc_buflist_mtx);
4739 for (int try = 0; try <= 3; try++) {
4740 uint64_t passed_sz = 0;
4741
4742 list = l2arc_list_locked(try, &list_lock);
4743
4744 /*
4745 * L2ARC fast warmup.
4746 *
4747 * Until the ARC is warm and starts to evict, read from the
4748 * head of the ARC lists rather than the tail.
4749 */
4750 if (arc_warm == B_FALSE)
4751 ab = list_head(list);
4752 else
4753 ab = list_tail(list);
4754
4755 headroom = target_sz * l2arc_headroom;
4756 if (do_headroom_boost)
4757 headroom = (headroom * l2arc_headroom_boost) / 100;
4758
4759 for (; ab; ab = ab_prev) {
4760 l2arc_buf_hdr_t *l2hdr;
4761 kmutex_t *hash_lock;
4762 uint64_t buf_sz;
4763
4764 if (arc_warm == B_FALSE)
4765 ab_prev = list_next(list, ab);
4766 else
4767 ab_prev = list_prev(list, ab);
4768
4769 hash_lock = HDR_LOCK(ab);
4770 if (!mutex_tryenter(hash_lock)) {
4771 /*
4772 * Skip this buffer rather than waiting.
4773 */
4774 continue;
4775 }
4776
4777 passed_sz += ab->b_size;
4778 if (passed_sz > headroom) {
4779 /*
4780 * Searched too far.
4781 */
4782 mutex_exit(hash_lock);
4783 break;
4784 }
4785
4786 if (!l2arc_write_eligible(guid, ab)) {
4787 mutex_exit(hash_lock);
4788 continue;
4789 }
4790
4791 if ((write_sz + ab->b_size) > target_sz) {
4792 full = B_TRUE;
4793 mutex_exit(hash_lock);
4794 break;
4795 }
4796
4797 if (pio == NULL) {
4798 /*
4799 * Insert a dummy header on the buflist so
4800 * l2arc_write_done() can find where the
4801 * write buffers begin without searching.
4802 */
4803 list_insert_head(dev->l2ad_buflist, head);
4804
4805 cb = kmem_alloc(
4806 sizeof (l2arc_write_callback_t), KM_SLEEP);
4807 cb->l2wcb_dev = dev;
4808 cb->l2wcb_head = head;
4809 pio = zio_root(spa, l2arc_write_done, cb,
4810 ZIO_FLAG_CANFAIL);
4811 }
4812
4813 /*
4814 * Create and add a new L2ARC header.
4815 */
4816 l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
4817 l2hdr->b_dev = dev;
4818 ab->b_flags |= ARC_L2_WRITING;
4819
4820 /*
4821 * Temporarily stash the data buffer in b_tmp_cdata.
4822 * The subsequent write step will pick it up from
4823			 * there. This is because we can't access ab->b_buf
4824 * without holding the hash_lock, which we in turn
4825 * can't access without holding the ARC list locks
4826 * (which we want to avoid during compression/writing).
4827 */
4828 l2hdr->b_compress = ZIO_COMPRESS_OFF;
4829 l2hdr->b_asize = ab->b_size;
4830 l2hdr->b_tmp_cdata = ab->b_buf->b_data;
4831
4832 buf_sz = ab->b_size;
4833 ab->b_l2hdr = l2hdr;
4834
4835 list_insert_head(dev->l2ad_buflist, ab);
4836
4837 /*
4838 * Compute and store the buffer cksum before
4839			 * writing. On debug builds the cksum is verified first.
4840 */
4841 arc_cksum_verify(ab->b_buf);
4842 arc_cksum_compute(ab->b_buf, B_TRUE);
4843
4844 mutex_exit(hash_lock);
4845
4846 write_sz += buf_sz;
4847 }
4848
4849 mutex_exit(list_lock);
4850
4851 if (full == B_TRUE)
4852 break;
4853 }
4854
4855 /* No buffers selected for writing? */
4856 if (pio == NULL) {
4857 ASSERT0(write_sz);
4858 mutex_exit(&l2arc_buflist_mtx);
4859 kmem_cache_free(hdr_cache, head);
4860 return (0);
4861 }
4862
4863 /*
4864 * Now start writing the buffers. We're starting at the write head
4865 * and work backwards, retracing the course of the buffer selector
4866 * loop above.
4867 */
4868 for (ab = list_prev(dev->l2ad_buflist, head); ab;
4869 ab = list_prev(dev->l2ad_buflist, ab)) {
4870 l2arc_buf_hdr_t *l2hdr;
4871 uint64_t buf_sz;
4872
4873 /*
4874 * We shouldn't need to lock the buffer here, since we flagged
4875 * it as ARC_L2_WRITING in the previous step, but we must take
4876 * care to only access its L2 cache parameters. In particular,
4877 * ab->b_buf may be invalid by now due to ARC eviction.
4878 */
4879 l2hdr = ab->b_l2hdr;
4880 l2hdr->b_daddr = dev->l2ad_hand;
4881
4882 if ((ab->b_flags & ARC_L2COMPRESS) &&
4883 l2hdr->b_asize >= buf_compress_minsz) {
4884 if (l2arc_compress_buf(l2hdr)) {
4885 /*
4886 * If compression succeeded, enable headroom
4887 * boost on the next scan cycle.
4888 */
4889 *headroom_boost = B_TRUE;
4890 }
4891 }
4892
4893 /*
4894 * Pick up the buffer data we had previously stashed away
4895 * (and now potentially also compressed).
4896 */
4897 buf_data = l2hdr->b_tmp_cdata;
4898 buf_sz = l2hdr->b_asize;
4899
4900 /* Compression may have squashed the buffer to zero length. */
4901 if (buf_sz != 0) {
4902 uint64_t buf_p_sz;
4903
4904 wzio = zio_write_phys(pio, dev->l2ad_vdev,
4905 dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
4906 NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
4907 ZIO_FLAG_CANFAIL, B_FALSE);
4908
4909 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
4910 zio_t *, wzio);
4911 (void) zio_nowait(wzio);
4912
4913 write_asize += buf_sz;
4914 /*
4915 * Keep the clock hand suitably device-aligned.
4916 */
4917 buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
4918 write_psize += buf_p_sz;
4919 dev->l2ad_hand += buf_p_sz;
4920 }
4921 }
4922
4923 mutex_exit(&l2arc_buflist_mtx);
4924
4925 ASSERT3U(write_asize, <=, target_sz);
4926 ARCSTAT_BUMP(arcstat_l2_writes_sent);
4927 ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
4928 ARCSTAT_INCR(arcstat_l2_size, write_sz);
4929 ARCSTAT_INCR(arcstat_l2_asize, write_asize);
4930 vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0);
4931
4932 /*
4933 * Bump device hand to the device start if it is approaching the end.
4934 * l2arc_evict() will already have evicted ahead for this case.
4935 */
4936 if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
4937 dev->l2ad_hand = dev->l2ad_start;
4938 dev->l2ad_evict = dev->l2ad_start;
4939 dev->l2ad_first = B_FALSE;
4940 }
4941
4942 dev->l2ad_writing = B_TRUE;
4943 (void) zio_wait(pio);
4944 dev->l2ad_writing = B_FALSE;
4945
4946 return (write_asize);
4947 }
4948
4949 /*
4950 * Compresses an L2ARC buffer.
4951 * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
4952 * size in l2hdr->b_asize. This routine tries to compress the data and
4953 * depending on the compression result there are three possible outcomes:
4954 * *) The buffer was incompressible. The original l2hdr contents were left
4955 * untouched and are ready for writing to an L2 device.
4956 * *) The buffer was all-zeros, so there is no need to write it to an L2
4957 * device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
4958 * set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
4959 * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
4960 * data buffer which holds the compressed data to be written, and b_asize
4961 * tells us how much data there is. b_compress is set to the appropriate
4962 * compression algorithm. Once writing is done, invoke
4963 * l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
4964 *
4965 * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
4966 * buffer was incompressible).
4967 */
4968 static boolean_t
4969 l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
4970 {
4971 void *cdata;
4972 size_t csize, len, rounded;
4973
4974 ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
4975 ASSERT(l2hdr->b_tmp_cdata != NULL);
4976
4977 len = l2hdr->b_asize;
4978 cdata = zio_data_buf_alloc(len);
4979 csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
4980 cdata, l2hdr->b_asize);
4981
4982 rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE);
4983 if (rounded > csize) {
4984 bzero((char *)cdata + csize, rounded - csize);
4985 csize = rounded;
4986 }
4987
4988 if (csize == 0) {
4989 /* zero block, indicate that there's nothing to write */
4990 zio_data_buf_free(cdata, len);
4991 l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
4992 l2hdr->b_asize = 0;
4993 l2hdr->b_tmp_cdata = NULL;
4994 ARCSTAT_BUMP(arcstat_l2_compress_zeros);
4995 return (B_TRUE);
4996 } else if (csize > 0 && csize < len) {
4997 /*
4998 * Compression succeeded, we'll keep the cdata around for
4999 * writing and release it afterwards.
5000 */
5001 l2hdr->b_compress = ZIO_COMPRESS_LZ4;
5002 l2hdr->b_asize = csize;
5003 l2hdr->b_tmp_cdata = cdata;
5004 ARCSTAT_BUMP(arcstat_l2_compress_successes);
5005 return (B_TRUE);
5006 } else {
5007 /*
5008 * Compression failed, release the compressed buffer.
5009 * l2hdr will be left unmodified.
5010 */
5011 zio_data_buf_free(cdata, len);
5012 ARCSTAT_BUMP(arcstat_l2_compress_failures);
5013 return (B_FALSE);
5014 }
5015 }
5016
5017 /*
5018 * Decompresses a zio read back from an l2arc device. On success, the
5019 * underlying zio's io_data buffer is overwritten by the uncompressed
5020 * version. On decompression error (corrupt compressed stream), the
5021 * zio->io_error value is set to signal an I/O error.
5022 *
5023 * Please note that the compressed data stream is not checksummed, so
5024 * if the underlying device is experiencing data corruption, we may feed
5025 * corrupt data to the decompressor, so the decompressor needs to be
5026 * able to handle this situation (LZ4 does).
5027 */
5028 static void
5029 l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
5030 {
5031 ASSERT(L2ARC_IS_VALID_COMPRESS(c));
5032
5033 if (zio->io_error != 0) {
5034 /*
5035		 * An I/O error has occurred; just restore the original io
5036 * size in preparation for a main pool read.
5037 */
5038 zio->io_orig_size = zio->io_size = hdr->b_size;
5039 return;
5040 }
5041
5042 if (c == ZIO_COMPRESS_EMPTY) {
5043 /*
5044 * An empty buffer results in a null zio, which means we
5045 * need to fill its io_data after we're done restoring the
5046 * buffer's contents.
5047 */
5048 ASSERT(hdr->b_buf != NULL);
5049 bzero(hdr->b_buf->b_data, hdr->b_size);
5050 zio->io_data = zio->io_orig_data = hdr->b_buf->b_data;
5051 } else {
5052 ASSERT(zio->io_data != NULL);
5053 /*
5054 * We copy the compressed data from the start of the arc buffer
5055 * (the zio_read will have pulled in only what we need, the
5056 * rest is garbage which we will overwrite at decompression)
5057 * and then decompress back to the ARC data buffer. This way we
5058 * can minimize copying by simply decompressing back over the
5059 * original compressed data (rather than decompressing to an
5060 * aux buffer and then copying back the uncompressed buffer,
5061 * which is likely to be much larger).
5062 */
5063 uint64_t csize;
5064 void *cdata;
5065
5066 csize = zio->io_size;
5067 cdata = zio_data_buf_alloc(csize);
5068 bcopy(zio->io_data, cdata, csize);
5069 if (zio_decompress_data(c, cdata, zio->io_data, csize,
5070 hdr->b_size) != 0)
5071 zio->io_error = EIO;
5072 zio_data_buf_free(cdata, csize);
5073 }
5074
5075 /* Restore the expected uncompressed IO size. */
5076 zio->io_orig_size = zio->io_size = hdr->b_size;
5077 }
5078
5079 /*
5080 * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
5081 * This buffer serves as a temporary holder of compressed data while
5082 * the buffer entry is being written to an l2arc device. Once that is
5083 * done, we can dispose of it.
5084 */
5085 static void
5086 l2arc_release_cdata_buf(arc_buf_hdr_t *ab)
5087 {
5088 l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
5089
5090 if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) {
5091 /*
5092 * If the data was compressed, then we've allocated a
5093 * temporary buffer for it, so now we need to release it.
5094 */
5095 ASSERT(l2hdr->b_tmp_cdata != NULL);
5096 zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size);
5097 }
5098 l2hdr->b_tmp_cdata = NULL;
5099 }
5100
5101 /*
5102 * This thread feeds the L2ARC at regular intervals. This is the beating
5103 * heart of the L2ARC.
5104 */
5105 static void
5106 l2arc_feed_thread(void)
5107 {
5108 callb_cpr_t cpr;
5109 l2arc_dev_t *dev;
5110 spa_t *spa;
5111 uint64_t size, wrote;
5112 clock_t begin, next = ddi_get_lbolt();
5113 boolean_t headroom_boost = B_FALSE;
5114
5115 CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
5116
5117 mutex_enter(&l2arc_feed_thr_lock);
5118
5119 while (l2arc_thread_exit == 0) {
5120 CALLB_CPR_SAFE_BEGIN(&cpr);
5121 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
5122 next);
5123 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
5124 next = ddi_get_lbolt() + hz;
5125
5126 /*
5127 * Quick check for L2ARC devices.
5128 */
5129 mutex_enter(&l2arc_dev_mtx);
5130 if (l2arc_ndev == 0) {
5131 mutex_exit(&l2arc_dev_mtx);
5132 continue;
5133 }
5134 mutex_exit(&l2arc_dev_mtx);
5135 begin = ddi_get_lbolt();
5136
5137 /*
5138 * This selects the next l2arc device to write to, and in
5139 * doing so the next spa to feed from: dev->l2ad_spa. This
5140 * will return NULL if there are now no l2arc devices or if
5141 * they are all faulted.
5142 *
5143 * If a device is returned, its spa's config lock is also
5144 * held to prevent device removal. l2arc_dev_get_next()
5145 * will grab and release l2arc_dev_mtx.
5146 */
5147 if ((dev = l2arc_dev_get_next()) == NULL)
5148 continue;
5149
5150 spa = dev->l2ad_spa;
5151 ASSERT(spa != NULL);
5152
5153 /*
5154 * If the pool is read-only then force the feed thread to
5155 * sleep a little longer.
5156 */
5157 if (!spa_writeable(spa)) {
5158 next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
5159 spa_config_exit(spa, SCL_L2ARC, dev);
5160 continue;
5161 }
5162
5163 /*
5164 * Avoid contributing to memory pressure.
5165 */
5166 if (arc_reclaim_needed()) {
5167 ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
5168 spa_config_exit(spa, SCL_L2ARC, dev);
5169 continue;
5170 }
5171
5172 ARCSTAT_BUMP(arcstat_l2_feeds);
5173
5174 size = l2arc_write_size();
5175
5176 /*
5177 * Evict L2ARC buffers that will be overwritten.
5178 * B_FALSE guarantees synchronous eviction.
5179 */
5180 (void) l2arc_evict(dev, size, B_FALSE);
5181
5182 /*
5183 * Write ARC buffers.
5184 */
5185 wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
5186
5187 /*
5188 * Calculate interval between writes.
5189 */
5190 next = l2arc_write_interval(begin, size, wrote);
5191 spa_config_exit(spa, SCL_L2ARC, dev);
5192 }
5193
5194 l2arc_thread_exit = 0;
5195 cv_broadcast(&l2arc_feed_thr_cv);
5196 CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */
5197 thread_exit();
5198 }
5199
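/*
 * Returns B_TRUE if the given vdev is currently registered as an L2ARC
 * device, B_FALSE otherwise.
 */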
5200 boolean_t
5201 l2arc_vdev_present(vdev_t *vd)
5202 {
5203 l2arc_dev_t *dev;
5204
5205 mutex_enter(&l2arc_dev_mtx);
5206 for (dev = list_head(l2arc_dev_list); dev != NULL;
5207 dev = list_next(l2arc_dev_list, dev)) {
5208 if (dev->l2ad_vdev == vd)
5209 break;
5210 }
5211 mutex_exit(&l2arc_dev_mtx);
5212
5213 return (dev != NULL);
5214 }
5215
5216 /*
5217 * Add a vdev for use by the L2ARC. By this point the spa has already
5218 * validated the vdev and opened it.
5219 */
5220 void
5221 l2arc_add_vdev(spa_t *spa, vdev_t *vd)
5222 {
5223 l2arc_dev_t *adddev;
5224
5225 ASSERT(!l2arc_vdev_present(vd));
5226
5227 /*
5228 * Create a new l2arc device entry.
5229 */
5230 adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
5231 adddev->l2ad_spa = spa;
5232 adddev->l2ad_vdev = vd;
5233 adddev->l2ad_start = VDEV_LABEL_START_SIZE;
5234 adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
5235 adddev->l2ad_hand = adddev->l2ad_start;
5236 adddev->l2ad_evict = adddev->l2ad_start;
5237 adddev->l2ad_first = B_TRUE;
5238 adddev->l2ad_writing = B_FALSE;
5239
5240 /*
5241 * This is a list of all ARC buffers that are still valid on the
5242 * device.
5243 */
5244 adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
5245 list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
5246 offsetof(arc_buf_hdr_t, b_l2node));
5247
5248 vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
5249
5250 /*
5251 * Add device to global list
5252 */
5253 mutex_enter(&l2arc_dev_mtx);
5254 list_insert_head(l2arc_dev_list, adddev);
5255 atomic_inc_64(&l2arc_ndev);
5256 mutex_exit(&l2arc_dev_mtx);
5257 }
5258
5259 /*
5260 * Remove a vdev from the L2ARC.
5261 */
5262 void
5263 l2arc_remove_vdev(vdev_t *vd)
5264 {
5265 l2arc_dev_t *dev, *nextdev, *remdev = NULL;
5266
5267 /*
5268 * Find the device by vdev
5269 */
5270 mutex_enter(&l2arc_dev_mtx);
5271 for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
5272 nextdev = list_next(l2arc_dev_list, dev);
5273 if (vd == dev->l2ad_vdev) {
5274 remdev = dev;
5275 break;
5276 }
5277 }
5278 ASSERT(remdev != NULL);
5279
5280 /*
5281 * Remove device from global list
5282 */
5283 list_remove(l2arc_dev_list, remdev);
5284 l2arc_dev_last = NULL; /* may have been invalidated */
5285 atomic_dec_64(&l2arc_ndev);
5286 mutex_exit(&l2arc_dev_mtx);
5287
5288 /*
5289 * Clear all buflists and ARC references. L2ARC device flush.
5290 */
5291 if (l2arc_evict(remdev, 0, B_TRUE) == B_FALSE) {
5292 /*
5293		 * The eviction was done synchronously, so clean up here.
5294		 * Otherwise, the asynchronous task will do the cleanup.
5295 */
5296 list_destroy(remdev->l2ad_buflist);
5297 kmem_free(remdev->l2ad_buflist, sizeof (list_t));
5298 kmem_free(remdev, sizeof (l2arc_dev_t));
5299 }
5300 }
5301
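/*
 * Initialize global L2ARC state: counters, locks, the feed thread's condition
 * variable, and the device and free-on-write lists.
 */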
5302 void
5303 l2arc_init(void)
5304 {
5305 l2arc_thread_exit = 0;
5306 l2arc_ndev = 0;
5307 l2arc_writes_sent = 0;
5308 l2arc_writes_done = 0;
5309
5310 mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
5311 cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
5312 mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
5313 mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
5314 mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
5315
5316 l2arc_dev_list = &L2ARC_dev_list;
5317 l2arc_free_on_write = &L2ARC_free_on_write;
5318 list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
5319 offsetof(l2arc_dev_t, l2ad_node));
5320 list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
5321 offsetof(l2arc_data_free_t, l2df_list_node));
5322 }
5323
5324 void
5325 l2arc_fini(void)
5326 {
5327 /*
5328	 * This is called from dmu_fini(), which is called from spa_fini().
5329 * Because of this, we can assume that all l2arc devices have
5330 * already been removed when the pools themselves were removed.
5331 */
5332
5333 l2arc_do_free_on_write();
5334
5335 mutex_destroy(&l2arc_feed_thr_lock);
5336 cv_destroy(&l2arc_feed_thr_cv);
5337 mutex_destroy(&l2arc_dev_mtx);
5338 mutex_destroy(&l2arc_buflist_mtx);
5339 mutex_destroy(&l2arc_free_on_write_mtx);
5340
5341 list_destroy(l2arc_dev_list);
5342 list_destroy(l2arc_free_on_write);
5343 }
5344
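/*
 * Start the L2ARC feed thread. This is a no-op when the pools are opened
 * read-only (no FWRITE in spa_mode_global).
 */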
5345 void
5346 l2arc_start(void)
5347 {
5348 if (!(spa_mode_global & FWRITE))
5349 return;
5350
5351 (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
5352 TS_RUN, minclsyspri);
5353 }
5354
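/*
 * Ask the L2ARC feed thread to exit and wait until it has done so. Like
 * l2arc_start(), this is a no-op for read-only operation.
 */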
5355 void
5356 l2arc_stop(void)
5357 {
5358 if (!(spa_mode_global & FWRITE))
5359 return;
5360
5361 mutex_enter(&l2arc_feed_thr_lock);
5362 cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */
5363 l2arc_thread_exit = 1;
5364 while (l2arc_thread_exit != 0)
5365 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
5366 mutex_exit(&l2arc_feed_thr_lock);
5367 }