1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
24 * Copyright (c) 2013 by Delphix. All rights reserved.
25 */
26
27 /*
28 * DVA-based Adjustable Replacement Cache
29 *
30 * While much of the theory of operation used here is
31 * based on the self-tuning, low overhead replacement cache
32 * presented by Megiddo and Modha at FAST 2003, there are some
33 * significant differences:
34 *
35 * 1. The Megiddo and Modha model assumes any page is evictable.
36 * Pages in its cache cannot be "locked" into memory. This makes
37 * the eviction algorithm simple: evict the last page in the list.
38  *    This also makes the performance characteristics easy to reason
39 * about. Our cache is not so simple. At any given moment, some
40 * subset of the blocks in the cache are un-evictable because we
41 * have handed out a reference to them. Blocks are only evictable
42 * when there are no external references active. This makes
43 * eviction far more problematic: we choose to evict the evictable
44 * blocks that are the "lowest" in the list.
45 *
46 * There are times when it is not possible to evict the requested
47 * space. In these circumstances we are unable to adjust the cache
48  *    size.  To prevent the cache from growing unbounded at these times, we
49 * implement a "cache throttle" that slows the flow of new data
50 * into the cache until we can make space available.
51 *
52 * 2. The Megiddo and Modha model assumes a fixed cache size.
53 * Pages are evicted when the cache is full and there is a cache
54 * miss. Our model has a variable sized cache. It grows with
55 * high use, but also tries to react to memory pressure from the
56 * operating system: decreasing its size when system memory is
57 * tight.
58 *
59 * 3. The Megiddo and Modha model assumes a fixed page size. All
60 * elements of the cache are therefore exactly the same size. So
61  *    when adjusting the cache size following a cache miss, it's simply
62 * a matter of choosing a single page to evict. In our model, we
63  *    have variable sized cache blocks (ranging from 512 bytes to
64 * 128K bytes). We therefore choose a set of blocks to evict to make
65 * space for a cache miss that approximates as closely as possible
66 * the space used by the new block.
67 *
68 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
69 * by N. Megiddo & D. Modha, FAST 2003
70 */
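/*
 * Point 3 above in miniature: because cache blocks vary in size, making
 * room for a miss means walking an evictable list from the tail and
 * accumulating freed bytes until the request is covered.  This is only an
 * illustrative sketch -- the authoritative logic is arc_evict() below, and
 * evictable() and evict_one() are hypothetical stand-ins:
 *
 *	uint64_t evicted = 0;
 *	for (ab = list_tail(list); ab != NULL && evicted < bytes;
 *	    ab = ab_prev) {
 *		ab_prev = list_prev(list, ab);
 *		if (!evictable(ab))
 *			continue;
 *		evicted += ab->b_size;
 *		evict_one(ab);
 *	}
 */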
71
72 /*
73 * The locking model:
74 *
75 * A new reference to a cache buffer can be obtained in two
76 * ways: 1) via a hash table lookup using the DVA as a key,
77 * or 2) via one of the ARC lists. The arc_read() interface
78 * uses method 1, while the internal arc algorithms for
79 * adjusting the cache use method 2. We therefore provide two
80 * types of locks: 1) the hash table lock array, and 2) the
81 * arc list locks.
82 *
83 * Buffers do not have their own mutexes, rather they rely on the
84 * hash table mutexes for the bulk of their protection (i.e. most
85 * fields in the arc_buf_hdr_t are protected by these mutexes).
86 *
87 * buf_hash_find() returns the appropriate mutex (held) when it
88 * locates the requested buffer in the hash table. It returns
89 * NULL for the mutex if the buffer was not in the table.
90 *
91 * buf_hash_remove() expects the appropriate hash mutex to be
92 * already held before it is invoked.
93 *
94 * Each arc state also has a mutex which is used to protect the
95 * buffer list associated with the state. When attempting to
96 * obtain a hash table lock while holding an arc list lock you
97  * must use mutex_tryenter() to avoid deadlock.  Also note that
98 * the active state mutex must be held before the ghost state mutex.
99 *
100 * Arc buffers may have an associated eviction callback function.
101 * This function will be invoked prior to removing the buffer (e.g.
102 * in arc_do_user_evicts()). Note however that the data associated
103 * with the buffer may be evicted prior to the callback. The callback
104 * must be made with *no locks held* (to prevent deadlock). Additionally,
105 * the users of callbacks must ensure that their private data is
106 * protected from simultaneous callbacks from arc_buf_evict()
107 * and arc_do_user_evicts().
108 *
109 * Note that the majority of the performance stats are manipulated
110 * with atomic operations.
111 *
112 * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
113 *
114 * - L2ARC buflist creation
115 * - L2ARC buflist eviction
116 * - L2ARC write completion, which walks L2ARC buflists
117 * - ARC header destruction, as it removes from L2ARC buflists
118 * - ARC header release, as it removes from L2ARC buflists
119 */
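/*
 * A minimal sketch of the lookup-side discipline described above (method
 * 1); the interfaces are the real ones from this file, but the body of the
 * "if" is illustrative:
 *
 *	kmutex_t *hash_lock;
 *	arc_buf_hdr_t *hdr = buf_hash_find(guid, dva, birth, &hash_lock);
 *	if (hdr != NULL) {
 *		... hash-lock-protected fields may be used here ...
 *		mutex_exit(hash_lock);
 *	}
 *
 * On a miss buf_hash_find() returns NULL and sets *lockp to NULL; on a hit
 * the hash mutex is returned held.  Conversely, code that already holds an
 * arc state (list) mutex may only take a hash lock with mutex_tryenter(),
 * skipping the buffer on failure, as arc_evict() does below.
 */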
120
121 #include <sys/spa.h>
122 #include <sys/zio.h>
123 #include <sys/zfs_context.h>
124 #include <sys/arc.h>
125 #include <sys/refcount.h>
126 #include <sys/vdev.h>
127 #include <sys/vdev_impl.h>
128 #ifdef _KERNEL
129 #include <sys/vmsystm.h>
130 #include <vm/anon.h>
131 #include <sys/fs/swapnode.h>
132 #include <sys/dnlc.h>
133 #endif
134 #include <sys/callb.h>
135 #include <sys/kstat.h>
136 #include <zfs_fletcher.h>
137
138 #ifndef _KERNEL
139 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
140 boolean_t arc_watch = B_FALSE;
141 int arc_procfd;
142 #endif
143
144 static kmutex_t arc_reclaim_thr_lock;
145 static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
146 static uint8_t arc_thread_exit;
147
148 extern int zfs_write_limit_shift;
149 extern uint64_t zfs_write_limit_max;
150 extern kmutex_t zfs_write_limit_lock;
151
152 #define ARC_REDUCE_DNLC_PERCENT 3
153 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
154
155 typedef enum arc_reclaim_strategy {
156 ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
157 ARC_RECLAIM_CONS /* Conservative reclaim strategy */
158 } arc_reclaim_strategy_t;
159
160 /* number of seconds before growing cache again */
161 static int arc_grow_retry = 60;
162
163 /* shift of arc_c for calculating both min and max arc_p */
164 static int arc_p_min_shift = 4;
165
166 /* log2(fraction of arc to reclaim) */
167 static int arc_shrink_shift = 5;
168
169 /*
170 * minimum lifespan of a prefetch block in clock ticks
171 * (initialized in arc_init())
172 */
173 static int arc_min_prefetch_lifespan;
174
175 static int arc_dead;
176
177 /*
178 * The arc has filled available memory and has now warmed up.
179 */
180 static boolean_t arc_warm;
181
182 /*
183 * These tunables are for performance analysis.
184 */
185 uint64_t zfs_arc_max;
186 uint64_t zfs_arc_min;
187 uint64_t zfs_arc_meta_limit = 0;
188 int zfs_arc_grow_retry = 0;
189 int zfs_arc_shrink_shift = 0;
190 int zfs_arc_p_min_shift = 0;
191 int zfs_disable_dup_eviction = 0;
192
193 /*
194 * Note that buffers can be in one of 6 states:
195 * ARC_anon - anonymous (discussed below)
196 * ARC_mru - recently used, currently cached
197  * ARC_mru_ghost	- recently used, no longer in cache
198 * ARC_mfu - frequently used, currently cached
199 * ARC_mfu_ghost - frequently used, no longer in cache
200 * ARC_l2c_only - exists in L2ARC but not other states
201  * When there are no active references to a buffer, it is
202  * linked onto a list in one of these arc states.  These are
203 * the only buffers that can be evicted or deleted. Within each
204 * state there are multiple lists, one for meta-data and one for
205 * non-meta-data. Meta-data (indirect blocks, blocks of dnodes,
206 * etc.) is tracked separately so that it can be managed more
207 * explicitly: favored over data, limited explicitly.
208 *
209 * Anonymous buffers are buffers that are not associated with
210 * a DVA. These are buffers that hold dirty block copies
211 * before they are written to stable storage. By definition,
212 * they are "ref'd" and are considered part of arc_mru
213  * that cannot be freed.  Generally, they will acquire a DVA
214 * as they are written and migrate onto the arc_mru list.
215 *
216 * The ARC_l2c_only state is for buffers that are in the second
217 * level ARC but no longer in any of the ARC_m* lists. The second
218 * level ARC itself may also contain buffers that are in any of
219 * the ARC_m* states - meaning that a buffer can exist in two
220 * places. The reason for the ARC_l2c_only state is to keep the
221 * buffer header in the hash table, so that reads that hit the
222 * second level ARC benefit from these fast lookups.
223 */
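/*
 * Roughly, the common transitions between these states are as follows
 * (arc_access(), arc_evict() and arc_evict_ghost() are authoritative):
 *
 *	dirty or newly allocated buffer ......... ARC_anon
 *	written out / first read access ......... ARC_mru
 *	accessed again while cached ............. ARC_mfu
 *	data evicted, header retained ........... ARC_mru_ghost, ARC_mfu_ghost
 *	ghost hit (data re-read, arc_p adapts) .. ARC_mfu
 *	header kept only for the L2ARC .......... ARC_l2c_only
 */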
224
225 typedef struct arc_state {
226 list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */
227 uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
228 uint64_t arcs_size; /* total amount of data in this state */
229 kmutex_t arcs_mtx;
230 } arc_state_t;
231
232 /* The 6 states: */
233 static arc_state_t ARC_anon;
234 static arc_state_t ARC_mru;
235 static arc_state_t ARC_mru_ghost;
236 static arc_state_t ARC_mfu;
237 static arc_state_t ARC_mfu_ghost;
238 static arc_state_t ARC_l2c_only;
239
240 typedef struct arc_stats {
241 kstat_named_t arcstat_hits;
242 kstat_named_t arcstat_misses;
243 kstat_named_t arcstat_demand_data_hits;
244 kstat_named_t arcstat_demand_data_misses;
245 kstat_named_t arcstat_demand_metadata_hits;
246 kstat_named_t arcstat_demand_metadata_misses;
247 kstat_named_t arcstat_prefetch_data_hits;
248 kstat_named_t arcstat_prefetch_data_misses;
249 kstat_named_t arcstat_prefetch_metadata_hits;
250 kstat_named_t arcstat_prefetch_metadata_misses;
251 kstat_named_t arcstat_mru_hits;
252 kstat_named_t arcstat_mru_ghost_hits;
253 kstat_named_t arcstat_mfu_hits;
254 kstat_named_t arcstat_mfu_ghost_hits;
255 kstat_named_t arcstat_deleted;
256 kstat_named_t arcstat_recycle_miss;
257 /*
258 * Number of buffers that could not be evicted because the hash lock
259 * was held by another thread. The lock may not necessarily be held
260 * by something using the same buffer, since hash locks are shared
261 * by multiple buffers.
262 */
263 kstat_named_t arcstat_mutex_miss;
264 /*
265 * Number of buffers skipped because they have I/O in progress, are
266  * indirect prefetch buffers that have not lived long enough, or are
267 * not from the spa we're trying to evict from.
268 */
269 kstat_named_t arcstat_evict_skip;
270 kstat_named_t arcstat_evict_l2_cached;
271 kstat_named_t arcstat_evict_l2_eligible;
272 kstat_named_t arcstat_evict_l2_ineligible;
273 kstat_named_t arcstat_hash_elements;
274 kstat_named_t arcstat_hash_elements_max;
275 kstat_named_t arcstat_hash_collisions;
276 kstat_named_t arcstat_hash_chains;
277 kstat_named_t arcstat_hash_chain_max;
278 kstat_named_t arcstat_p;
279 kstat_named_t arcstat_c;
280 kstat_named_t arcstat_c_min;
281 kstat_named_t arcstat_c_max;
282 kstat_named_t arcstat_size;
283 kstat_named_t arcstat_hdr_size;
284 kstat_named_t arcstat_data_size;
285 kstat_named_t arcstat_other_size;
286 kstat_named_t arcstat_l2_hits;
287 kstat_named_t arcstat_l2_misses;
288 kstat_named_t arcstat_l2_feeds;
289 kstat_named_t arcstat_l2_rw_clash;
290 kstat_named_t arcstat_l2_read_bytes;
291 kstat_named_t arcstat_l2_write_bytes;
292 kstat_named_t arcstat_l2_writes_sent;
293 kstat_named_t arcstat_l2_writes_done;
294 kstat_named_t arcstat_l2_writes_error;
295 kstat_named_t arcstat_l2_writes_hdr_miss;
296 kstat_named_t arcstat_l2_evict_lock_retry;
297 kstat_named_t arcstat_l2_evict_reading;
298 kstat_named_t arcstat_l2_free_on_write;
299 kstat_named_t arcstat_l2_abort_lowmem;
300 kstat_named_t arcstat_l2_cksum_bad;
301 kstat_named_t arcstat_l2_io_error;
302 kstat_named_t arcstat_l2_size;
303 kstat_named_t arcstat_l2_hdr_size;
304 kstat_named_t arcstat_memory_throttle_count;
305 kstat_named_t arcstat_duplicate_buffers;
306 kstat_named_t arcstat_duplicate_buffers_size;
307 kstat_named_t arcstat_duplicate_reads;
308 kstat_named_t arcstat_meta_used;
309 kstat_named_t arcstat_meta_limit;
310 kstat_named_t arcstat_meta_max;
311 } arc_stats_t;
312
313 static arc_stats_t arc_stats = {
314 { "hits", KSTAT_DATA_UINT64 },
315 { "misses", KSTAT_DATA_UINT64 },
316 { "demand_data_hits", KSTAT_DATA_UINT64 },
317 { "demand_data_misses", KSTAT_DATA_UINT64 },
318 { "demand_metadata_hits", KSTAT_DATA_UINT64 },
319 { "demand_metadata_misses", KSTAT_DATA_UINT64 },
320 { "prefetch_data_hits", KSTAT_DATA_UINT64 },
321 { "prefetch_data_misses", KSTAT_DATA_UINT64 },
322 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
323 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
324 { "mru_hits", KSTAT_DATA_UINT64 },
325 { "mru_ghost_hits", KSTAT_DATA_UINT64 },
326 { "mfu_hits", KSTAT_DATA_UINT64 },
327 { "mfu_ghost_hits", KSTAT_DATA_UINT64 },
328 { "deleted", KSTAT_DATA_UINT64 },
329 { "recycle_miss", KSTAT_DATA_UINT64 },
330 { "mutex_miss", KSTAT_DATA_UINT64 },
331 { "evict_skip", KSTAT_DATA_UINT64 },
332 { "evict_l2_cached", KSTAT_DATA_UINT64 },
333 { "evict_l2_eligible", KSTAT_DATA_UINT64 },
334 { "evict_l2_ineligible", KSTAT_DATA_UINT64 },
335 { "hash_elements", KSTAT_DATA_UINT64 },
336 { "hash_elements_max", KSTAT_DATA_UINT64 },
337 { "hash_collisions", KSTAT_DATA_UINT64 },
338 { "hash_chains", KSTAT_DATA_UINT64 },
339 { "hash_chain_max", KSTAT_DATA_UINT64 },
340 { "p", KSTAT_DATA_UINT64 },
341 { "c", KSTAT_DATA_UINT64 },
342 { "c_min", KSTAT_DATA_UINT64 },
343 { "c_max", KSTAT_DATA_UINT64 },
344 { "size", KSTAT_DATA_UINT64 },
345 { "hdr_size", KSTAT_DATA_UINT64 },
346 { "data_size", KSTAT_DATA_UINT64 },
347 { "other_size", KSTAT_DATA_UINT64 },
348 { "l2_hits", KSTAT_DATA_UINT64 },
349 { "l2_misses", KSTAT_DATA_UINT64 },
350 { "l2_feeds", KSTAT_DATA_UINT64 },
351 { "l2_rw_clash", KSTAT_DATA_UINT64 },
352 { "l2_read_bytes", KSTAT_DATA_UINT64 },
353 { "l2_write_bytes", KSTAT_DATA_UINT64 },
354 { "l2_writes_sent", KSTAT_DATA_UINT64 },
355 { "l2_writes_done", KSTAT_DATA_UINT64 },
356 { "l2_writes_error", KSTAT_DATA_UINT64 },
357 { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 },
358 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
359 { "l2_evict_reading", KSTAT_DATA_UINT64 },
360 { "l2_free_on_write", KSTAT_DATA_UINT64 },
361 { "l2_abort_lowmem", KSTAT_DATA_UINT64 },
362 { "l2_cksum_bad", KSTAT_DATA_UINT64 },
363 { "l2_io_error", KSTAT_DATA_UINT64 },
364 { "l2_size", KSTAT_DATA_UINT64 },
365 { "l2_hdr_size", KSTAT_DATA_UINT64 },
366 { "memory_throttle_count", KSTAT_DATA_UINT64 },
367 { "duplicate_buffers", KSTAT_DATA_UINT64 },
368 { "duplicate_buffers_size", KSTAT_DATA_UINT64 },
369 { "duplicate_reads", KSTAT_DATA_UINT64 },
370 { "arc_meta_used", KSTAT_DATA_UINT64 },
371 { "arc_meta_limit", KSTAT_DATA_UINT64 },
372 { "arc_meta_max", KSTAT_DATA_UINT64 }
373 };
374
375 #define ARCSTAT(stat) (arc_stats.stat.value.ui64)
376
377 #define ARCSTAT_INCR(stat, val) \
378 atomic_add_64(&arc_stats.stat.value.ui64, (val))
379
380 #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
381 #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)
382
383 #define ARCSTAT_MAX(stat, val) { \
384 uint64_t m; \
385 while ((val) > (m = arc_stats.stat.value.ui64) && \
386 (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
387 continue; \
388 }
389
390 #define ARCSTAT_MAXSTAT(stat) \
391 ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
392
393 /*
394 * We define a macro to allow ARC hits/misses to be easily broken down by
395 * two separate conditions, giving a total of four different subtypes for
396 * each of hits and misses (so eight statistics total).
397 */
398 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
399 if (cond1) { \
400 if (cond2) { \
401 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
402 } else { \
403 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
404 } \
405 } else { \
406 if (cond2) { \
407 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
408 } else { \
409 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
410 } \
411 }
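
/*
 * For example, the hit path in arc_buf_add_ref() below classifies each hit
 * by demand vs. prefetch and data vs. metadata with a single invocation:
 *
 *	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
 *	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
 *	    data, metadata, hits);
 *
 * which bumps exactly one of arcstat_demand_data_hits,
 * arcstat_demand_metadata_hits, arcstat_prefetch_data_hits or
 * arcstat_prefetch_metadata_hits.
 */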
412
413 kstat_t *arc_ksp;
414 static arc_state_t *arc_anon;
415 static arc_state_t *arc_mru;
416 static arc_state_t *arc_mru_ghost;
417 static arc_state_t *arc_mfu;
418 static arc_state_t *arc_mfu_ghost;
419 static arc_state_t *arc_l2c_only;
420
421 /*
422 * There are several ARC variables that are critical to export as kstats --
423 * but we don't want to have to grovel around in the kstat whenever we wish to
424 * manipulate them. For these variables, we therefore define them to be in
425 * terms of the statistic variable. This assures that we are not introducing
426 * the possibility of inconsistency by having shadow copies of the variables,
427 * while still allowing the code to be readable.
428 */
429 #define arc_size ARCSTAT(arcstat_size) /* actual total arc size */
430 #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
431 #define arc_c ARCSTAT(arcstat_c) /* target size of cache */
432 #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
433 #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
434 #define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */
435 #define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */
436 #define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */
437
438 static int arc_no_grow; /* Don't try to grow cache size */
439 static uint64_t arc_tempreserve;
440 static uint64_t arc_loaned_bytes;
441
442 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
443
444 typedef struct arc_callback arc_callback_t;
445
446 struct arc_callback {
447 void *acb_private;
448 arc_done_func_t *acb_done;
449 arc_buf_t *acb_buf;
450 zio_t *acb_zio_dummy;
451 arc_callback_t *acb_next;
452 };
453
454 typedef struct arc_write_callback arc_write_callback_t;
455
456 struct arc_write_callback {
457 void *awcb_private;
458 arc_done_func_t *awcb_ready;
459 arc_done_func_t *awcb_done;
460 arc_buf_t *awcb_buf;
461 };
462
463 struct arc_buf_hdr {
464 /* protected by hash lock */
465 dva_t b_dva;
466 uint64_t b_birth;
467 uint64_t b_cksum0;
468
469 kmutex_t b_freeze_lock;
470 zio_cksum_t *b_freeze_cksum;
471 void *b_thawed;
472
473 arc_buf_hdr_t *b_hash_next;
474 arc_buf_t *b_buf;
475 uint32_t b_flags;
476 uint32_t b_datacnt;
477
478 arc_callback_t *b_acb;
479 kcondvar_t b_cv;
480
481 /* immutable */
482 arc_buf_contents_t b_type;
483 uint64_t b_size;
484 uint64_t b_spa;
485
486 /* protected by arc state mutex */
487 arc_state_t *b_state;
488 list_node_t b_arc_node;
489
490 /* updated atomically */
491 clock_t b_arc_access;
492
493 /* self protecting */
494 refcount_t b_refcnt;
495
496 l2arc_buf_hdr_t *b_l2hdr;
497 list_node_t b_l2node;
498 };
499
500 static arc_buf_t *arc_eviction_list;
501 static kmutex_t arc_eviction_mtx;
502 static arc_buf_hdr_t arc_eviction_hdr;
503 static void arc_get_data_buf(arc_buf_t *buf);
504 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
505 static int arc_evict_needed(arc_buf_contents_t type);
506 static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
507 static void arc_buf_watch(arc_buf_t *buf);
508
509 static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
510
511 #define GHOST_STATE(state) \
512 ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
513 (state) == arc_l2c_only)
514
515 /*
516  * Private ARC flags.  These are internal ARC flags that show up in
517  * b_flags in the arc_buf_hdr_t.  Some flags are publicly declared and can
518  * be passed in as arc_flags to interfaces such as arc_read(); the private
519  * flags, however, should never be passed in and are only set by ARC code.
520  * When adding new public flags, make sure not to smash the private ones.
521 */
522
523 #define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */
524 #define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */
525 #define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */
526 #define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */
527 #define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */
528 #define ARC_INDIRECT (1 << 14) /* this is an indirect block */
529 #define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */
530 #define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */
531 #define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */
532 #define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */
533
534 #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE)
535 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
536 #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR)
537 #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH)
538 #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ)
539 #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE)
540 #define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
541 #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE)
542 #define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \
543 (hdr)->b_l2hdr != NULL)
544 #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING)
545 #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED)
546 #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD)
547
548 /*
549 * Other sizes
550 */
551
552 #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
553 #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
554
555 /*
556 * Hash table routines
557 */
558
559 #define HT_LOCK_PAD 64
560
561 struct ht_lock {
562 kmutex_t ht_lock;
563 #ifdef _KERNEL
564 unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
565 #endif
566 };
567
568 #define BUF_LOCKS 256
569 typedef struct buf_hash_table {
570 uint64_t ht_mask;
571 arc_buf_hdr_t **ht_table;
572 struct ht_lock ht_locks[BUF_LOCKS];
573 } buf_hash_table_t;
574
575 static buf_hash_table_t buf_hash_table;
576
577 #define BUF_HASH_INDEX(spa, dva, birth) \
578 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
579 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
580 #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
581 #define HDR_LOCK(hdr) \
582 (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
583
584 uint64_t zfs_crc64_table[256];
585
586 /*
587 * Level 2 ARC
588 */
589
590 #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */
591 #define L2ARC_HEADROOM 2 /* num of writes */
592 #define L2ARC_FEED_SECS 1 /* caching interval secs */
593 #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */
594
595 #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
596 #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
597
598 /* L2ARC Performance Tunables */
599 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */
600 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
601 uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */
602 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
603 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
604 boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
605 boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */
606 boolean_t l2arc_norw = B_TRUE; /* no reads during writes */
607
608 /*
609 * L2ARC Internals
610 */
611 typedef struct l2arc_dev {
612 vdev_t *l2ad_vdev; /* vdev */
613 spa_t *l2ad_spa; /* spa */
614 uint64_t l2ad_hand; /* next write location */
615 uint64_t l2ad_write; /* desired write size, bytes */
616 uint64_t l2ad_boost; /* warmup write boost, bytes */
617 uint64_t l2ad_start; /* first addr on device */
618 uint64_t l2ad_end; /* last addr on device */
619 uint64_t l2ad_evict; /* last addr eviction reached */
620 boolean_t l2ad_first; /* first sweep through */
621 boolean_t l2ad_writing; /* currently writing */
622 list_t *l2ad_buflist; /* buffer list */
623 list_node_t l2ad_node; /* device list node */
624 } l2arc_dev_t;
625
626 static list_t L2ARC_dev_list; /* device list */
627 static list_t *l2arc_dev_list; /* device list pointer */
628 static kmutex_t l2arc_dev_mtx; /* device list mutex */
629 static l2arc_dev_t *l2arc_dev_last; /* last device used */
630 static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */
631 static list_t L2ARC_free_on_write; /* free after write buf list */
632 static list_t *l2arc_free_on_write; /* free after write list ptr */
633 static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
634 static uint64_t l2arc_ndev; /* number of devices */
635
636 typedef struct l2arc_read_callback {
637 arc_buf_t *l2rcb_buf; /* read buffer */
638 spa_t *l2rcb_spa; /* spa */
639 blkptr_t l2rcb_bp; /* original blkptr */
640 zbookmark_t l2rcb_zb; /* original bookmark */
641 int l2rcb_flags; /* original flags */
642 } l2arc_read_callback_t;
643
644 typedef struct l2arc_write_callback {
645 l2arc_dev_t *l2wcb_dev; /* device info */
646 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
647 } l2arc_write_callback_t;
648
649 struct l2arc_buf_hdr {
650 /* protected by arc_buf_hdr mutex */
651 l2arc_dev_t *b_dev; /* L2ARC device */
652 uint64_t b_daddr; /* disk address, offset byte */
653 };
654
655 typedef struct l2arc_data_free {
656 /* protected by l2arc_free_on_write_mtx */
657 void *l2df_data;
658 size_t l2df_size;
659 void (*l2df_func)(void *, size_t);
660 list_node_t l2df_list_node;
661 } l2arc_data_free_t;
662
663 static kmutex_t l2arc_feed_thr_lock;
664 static kcondvar_t l2arc_feed_thr_cv;
665 static uint8_t l2arc_thread_exit;
666
667 static void l2arc_read_done(zio_t *zio);
668 static void l2arc_hdr_stat_add(void);
669 static void l2arc_hdr_stat_remove(void);
670
671 static uint64_t
672 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
673 {
674 uint8_t *vdva = (uint8_t *)dva;
675 uint64_t crc = -1ULL;
676 int i;
677
678 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
679
680 for (i = 0; i < sizeof (dva_t); i++)
681 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
682
683 crc ^= (spa>>8) ^ birth;
684
685 return (crc);
686 }
687
688 #define BUF_EMPTY(buf) \
689 ((buf)->b_dva.dva_word[0] == 0 && \
690 (buf)->b_dva.dva_word[1] == 0 && \
691 (buf)->b_birth == 0)
692
693 #define BUF_EQUAL(spa, dva, birth, buf) \
694 ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
695 ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
696 ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
697
698 static void
699 buf_discard_identity(arc_buf_hdr_t *hdr)
700 {
701 hdr->b_dva.dva_word[0] = 0;
702 hdr->b_dva.dva_word[1] = 0;
703 hdr->b_birth = 0;
704 hdr->b_cksum0 = 0;
705 }
706
707 static arc_buf_hdr_t *
708 buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
709 {
710 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
711 kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
712 arc_buf_hdr_t *buf;
713
714 mutex_enter(hash_lock);
715 for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
716 buf = buf->b_hash_next) {
717 if (BUF_EQUAL(spa, dva, birth, buf)) {
718 *lockp = hash_lock;
719 return (buf);
720 }
721 }
722 mutex_exit(hash_lock);
723 *lockp = NULL;
724 return (NULL);
725 }
726
727 /*
728 * Insert an entry into the hash table. If there is already an element
729 * equal to elem in the hash table, then the already existing element
730 * will be returned and the new element will not be inserted.
731 * Otherwise returns NULL.
732 */
733 static arc_buf_hdr_t *
734 buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
735 {
736 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
737 kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
738 arc_buf_hdr_t *fbuf;
739 uint32_t i;
740
741 ASSERT(!HDR_IN_HASH_TABLE(buf));
742 *lockp = hash_lock;
743 mutex_enter(hash_lock);
744 for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
745 fbuf = fbuf->b_hash_next, i++) {
746 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
747 return (fbuf);
748 }
749
750 buf->b_hash_next = buf_hash_table.ht_table[idx];
751 buf_hash_table.ht_table[idx] = buf;
752 buf->b_flags |= ARC_IN_HASH_TABLE;
753
754 /* collect some hash table performance data */
755 if (i > 0) {
756 ARCSTAT_BUMP(arcstat_hash_collisions);
757 if (i == 1)
758 ARCSTAT_BUMP(arcstat_hash_chains);
759
760 ARCSTAT_MAX(arcstat_hash_chain_max, i);
761 }
762
763 ARCSTAT_BUMP(arcstat_hash_elements);
764 ARCSTAT_MAXSTAT(arcstat_hash_elements);
765
766 return (NULL);
767 }
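
/*
 * A sketch of the insert-or-find pattern callers use (the read and write
 * paths later in this file are the real consumers); "exists" is just an
 * illustrative local name:
 *
 *	kmutex_t *hash_lock;
 *	arc_buf_hdr_t *exists = buf_hash_insert(hdr, &hash_lock);
 *	if (exists != NULL) {
 *		... an equal header was already present; operate on it
 *		    instead, still under hash_lock ...
 *	}
 *	mutex_exit(hash_lock);
 *
 * The hash lock is acquired and *lockp is set whether or not the insert
 * succeeds, so the caller always owns the bucket on return.
 */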
768
769 static void
770 buf_hash_remove(arc_buf_hdr_t *buf)
771 {
772 arc_buf_hdr_t *fbuf, **bufp;
773 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
774
775 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
776 ASSERT(HDR_IN_HASH_TABLE(buf));
777
778 bufp = &buf_hash_table.ht_table[idx];
779 while ((fbuf = *bufp) != buf) {
780 ASSERT(fbuf != NULL);
781 bufp = &fbuf->b_hash_next;
782 }
783 *bufp = buf->b_hash_next;
784 buf->b_hash_next = NULL;
785 buf->b_flags &= ~ARC_IN_HASH_TABLE;
786
787 /* collect some hash table performance data */
788 ARCSTAT_BUMPDOWN(arcstat_hash_elements);
789
790 if (buf_hash_table.ht_table[idx] &&
791 buf_hash_table.ht_table[idx]->b_hash_next == NULL)
792 ARCSTAT_BUMPDOWN(arcstat_hash_chains);
793 }
794
795 /*
796 * Global data structures and functions for the buf kmem cache.
797 */
798 static kmem_cache_t *hdr_cache;
799 static kmem_cache_t *buf_cache;
800
801 static void
802 buf_fini(void)
803 {
804 int i;
805
806 kmem_free(buf_hash_table.ht_table,
807 (buf_hash_table.ht_mask + 1) * sizeof (void *));
808 for (i = 0; i < BUF_LOCKS; i++)
809 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
810 kmem_cache_destroy(hdr_cache);
811 kmem_cache_destroy(buf_cache);
812 }
813
814 /*
815 * Constructor callback - called when the cache is empty
816 * and a new buf is requested.
817 */
818 /* ARGSUSED */
819 static int
820 hdr_cons(void *vbuf, void *unused, int kmflag)
821 {
822 arc_buf_hdr_t *buf = vbuf;
823
824 bzero(buf, sizeof (arc_buf_hdr_t));
825 refcount_create(&buf->b_refcnt);
826 cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
827 mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
828 arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
829
830 return (0);
831 }
832
833 /* ARGSUSED */
834 static int
835 buf_cons(void *vbuf, void *unused, int kmflag)
836 {
837 arc_buf_t *buf = vbuf;
838
839 bzero(buf, sizeof (arc_buf_t));
840 mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
841 arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
842
843 return (0);
844 }
845
846 /*
847 * Destructor callback - called when a cached buf is
848 * no longer required.
849 */
850 /* ARGSUSED */
851 static void
852 hdr_dest(void *vbuf, void *unused)
853 {
854 arc_buf_hdr_t *buf = vbuf;
855
856 ASSERT(BUF_EMPTY(buf));
857 refcount_destroy(&buf->b_refcnt);
858 cv_destroy(&buf->b_cv);
859 mutex_destroy(&buf->b_freeze_lock);
860 arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
861 }
862
863 /* ARGSUSED */
864 static void
865 buf_dest(void *vbuf, void *unused)
866 {
867 arc_buf_t *buf = vbuf;
868
869 mutex_destroy(&buf->b_evict_lock);
870 arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
871 }
872
873 /*
874 * Reclaim callback -- invoked when memory is low.
875 */
876 /* ARGSUSED */
877 static void
878 hdr_recl(void *unused)
879 {
880 dprintf("hdr_recl called\n");
881 /*
882 * umem calls the reclaim func when we destroy the buf cache,
883 * which is after we do arc_fini().
884 */
885 if (!arc_dead)
886 cv_signal(&arc_reclaim_thr_cv);
887 }
888
889 static void
890 buf_init(void)
891 {
892 uint64_t *ct;
893 uint64_t hsize = 1ULL << 12;
894 int i, j;
895
896 /*
897 * The hash table is big enough to fill all of physical memory
898 * with an average 64K block size. The table will take up
899  * totalmem*sizeof(void*)/64K (e.g. 128KB/GB with 8-byte pointers).
900 */
901 while (hsize * 65536 < physmem * PAGESIZE)
902 hsize <<= 1;
903 retry:
904 buf_hash_table.ht_mask = hsize - 1;
905 buf_hash_table.ht_table =
906 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
907 if (buf_hash_table.ht_table == NULL) {
908 ASSERT(hsize > (1ULL << 8));
909 hsize >>= 1;
910 goto retry;
911 }
912
913 hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
914 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
915 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
916 0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
917
918 for (i = 0; i < 256; i++)
919 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
920 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
921
922 for (i = 0; i < BUF_LOCKS; i++) {
923 mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
924 NULL, MUTEX_DEFAULT, NULL);
925 }
926 }
927
928 #define ARC_MINTIME (hz>>4) /* 62 ms */
929
930 static void
931 arc_cksum_verify(arc_buf_t *buf)
932 {
933 zio_cksum_t zc;
934
935 if (!(zfs_flags & ZFS_DEBUG_MODIFY))
936 return;
937
938 mutex_enter(&buf->b_hdr->b_freeze_lock);
939 if (buf->b_hdr->b_freeze_cksum == NULL ||
940 (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
941 mutex_exit(&buf->b_hdr->b_freeze_lock);
942 return;
943 }
944 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
945 if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
946 panic("buffer modified while frozen!");
947 mutex_exit(&buf->b_hdr->b_freeze_lock);
948 }
949
950 static int
951 arc_cksum_equal(arc_buf_t *buf)
952 {
953 zio_cksum_t zc;
954 int equal;
955
956 mutex_enter(&buf->b_hdr->b_freeze_lock);
957 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
958 equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
959 mutex_exit(&buf->b_hdr->b_freeze_lock);
960
961 return (equal);
962 }
963
964 static void
965 arc_cksum_compute(arc_buf_t *buf, boolean_t force)
966 {
967 if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
968 return;
969
970 mutex_enter(&buf->b_hdr->b_freeze_lock);
971 if (buf->b_hdr->b_freeze_cksum != NULL) {
972 mutex_exit(&buf->b_hdr->b_freeze_lock);
973 return;
974 }
975 buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
976 fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
977 buf->b_hdr->b_freeze_cksum);
978 mutex_exit(&buf->b_hdr->b_freeze_lock);
979 arc_buf_watch(buf);
980 }
981
982 #ifndef _KERNEL
983 typedef struct procctl {
984 long cmd;
985 prwatch_t prwatch;
986 } procctl_t;
987 #endif
988
989 /* ARGSUSED */
990 static void
991 arc_buf_unwatch(arc_buf_t *buf)
992 {
993 #ifndef _KERNEL
994 if (arc_watch) {
995 int result;
996 procctl_t ctl;
997 ctl.cmd = PCWATCH;
998 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
999 ctl.prwatch.pr_size = 0;
1000 ctl.prwatch.pr_wflags = 0;
1001 result = write(arc_procfd, &ctl, sizeof (ctl));
1002 ASSERT3U(result, ==, sizeof (ctl));
1003 }
1004 #endif
1005 }
1006
1007 /* ARGSUSED */
1008 static void
1009 arc_buf_watch(arc_buf_t *buf)
1010 {
1011 #ifndef _KERNEL
1012 if (arc_watch) {
1013 int result;
1014 procctl_t ctl;
1015 ctl.cmd = PCWATCH;
1016 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1017 ctl.prwatch.pr_size = buf->b_hdr->b_size;
1018 ctl.prwatch.pr_wflags = WA_WRITE;
1019 result = write(arc_procfd, &ctl, sizeof (ctl));
1020 ASSERT3U(result, ==, sizeof (ctl));
1021 }
1022 #endif
1023 }
1024
1025 void
1026 arc_buf_thaw(arc_buf_t *buf)
1027 {
1028 if (zfs_flags & ZFS_DEBUG_MODIFY) {
1029 if (buf->b_hdr->b_state != arc_anon)
1030 panic("modifying non-anon buffer!");
1031 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
1032 panic("modifying buffer while i/o in progress!");
1033 arc_cksum_verify(buf);
1034 }
1035
1036 mutex_enter(&buf->b_hdr->b_freeze_lock);
1037 if (buf->b_hdr->b_freeze_cksum != NULL) {
1038 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1039 buf->b_hdr->b_freeze_cksum = NULL;
1040 }
1041
1042 if (zfs_flags & ZFS_DEBUG_MODIFY) {
1043 if (buf->b_hdr->b_thawed)
1044 kmem_free(buf->b_hdr->b_thawed, 1);
1045 buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
1046 }
1047
1048 mutex_exit(&buf->b_hdr->b_freeze_lock);
1049
1050 arc_buf_unwatch(buf);
1051 }
1052
1053 void
1054 arc_buf_freeze(arc_buf_t *buf)
1055 {
1056 kmutex_t *hash_lock;
1057
1058 if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1059 return;
1060
1061 hash_lock = HDR_LOCK(buf->b_hdr);
1062 mutex_enter(hash_lock);
1063
1064 ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1065 buf->b_hdr->b_state == arc_anon);
1066 arc_cksum_compute(buf, B_FALSE);
1067 mutex_exit(hash_lock);
1068
1069 }
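
/*
 * A sketch of how the freeze/thaw pair is meant to be used by consumers
 * (the dbuf layer is the primary caller; "modify contents" stands in for
 * arbitrary caller work):
 *
 *	arc_buf_thaw(buf);		discard the checksum; writes allowed
 *	... modify buf->b_data contents ...
 *	arc_buf_freeze(buf);		recompute checksum (ZFS_DEBUG_MODIFY)
 *
 * With ZFS_DEBUG_MODIFY set, a store to a frozen buffer is caught either
 * by arc_cksum_verify() or, in userland with arc_watch, by the watchpoint
 * installed in arc_buf_watch().
 */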
1070
1071 static void
1072 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1073 {
1074 ASSERT(MUTEX_HELD(hash_lock));
1075
1076 if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
1077 (ab->b_state != arc_anon)) {
1078 uint64_t delta = ab->b_size * ab->b_datacnt;
1079 list_t *list = &ab->b_state->arcs_list[ab->b_type];
1080 uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
1081
1082 ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
1083 mutex_enter(&ab->b_state->arcs_mtx);
1084 ASSERT(list_link_active(&ab->b_arc_node));
1085 list_remove(list, ab);
1086 if (GHOST_STATE(ab->b_state)) {
1087 ASSERT0(ab->b_datacnt);
1088 ASSERT3P(ab->b_buf, ==, NULL);
1089 delta = ab->b_size;
1090 }
1091 ASSERT(delta > 0);
1092 ASSERT3U(*size, >=, delta);
1093 atomic_add_64(size, -delta);
1094 mutex_exit(&ab->b_state->arcs_mtx);
1095 /* remove the prefetch flag if we get a reference */
1096 if (ab->b_flags & ARC_PREFETCH)
1097 ab->b_flags &= ~ARC_PREFETCH;
1098 }
1099 }
1100
1101 static int
1102 remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1103 {
1104 int cnt;
1105 arc_state_t *state = ab->b_state;
1106
1107 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1108 ASSERT(!GHOST_STATE(state));
1109
1110 if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
1111 (state != arc_anon)) {
1112 uint64_t *size = &state->arcs_lsize[ab->b_type];
1113
1114 ASSERT(!MUTEX_HELD(&state->arcs_mtx));
1115 mutex_enter(&state->arcs_mtx);
1116 ASSERT(!list_link_active(&ab->b_arc_node));
1117 list_insert_head(&state->arcs_list[ab->b_type], ab);
1118 ASSERT(ab->b_datacnt > 0);
1119 atomic_add_64(size, ab->b_size * ab->b_datacnt);
1120 mutex_exit(&state->arcs_mtx);
1121 }
1122 return (cnt);
1123 }
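
/*
 * Together, add_reference() and remove_reference() maintain the invariant
 * described at the top of this file: a header with a non-zero refcount is
 * unlinked from its state list and so cannot be evicted, while dropping
 * the last reference re-links it, making it evictable again.  A rough
 * sketch of a holder, where "tag" is any stable pointer identifying the
 * owner:
 *
 *	add_reference(hdr, hash_lock, tag);		now un-evictable
 *	... use the buffer ...
 *	(void) remove_reference(hdr, hash_lock, tag);	evictable if last
 */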
1124
1125 /*
1126 * Move the supplied buffer to the indicated state. The mutex
1127 * for the buffer must be held by the caller.
1128 */
1129 static void
1130 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
1131 {
1132 arc_state_t *old_state = ab->b_state;
1133 int64_t refcnt = refcount_count(&ab->b_refcnt);
1134 uint64_t from_delta, to_delta;
1135
1136 ASSERT(MUTEX_HELD(hash_lock));
1137 ASSERT(new_state != old_state);
1138 ASSERT(refcnt == 0 || ab->b_datacnt > 0);
1139 ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
1140 ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
1141
1142 from_delta = to_delta = ab->b_datacnt * ab->b_size;
1143
1144 /*
1145 * If this buffer is evictable, transfer it from the
1146 * old state list to the new state list.
1147 */
1148 if (refcnt == 0) {
1149 if (old_state != arc_anon) {
1150 int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
1151 uint64_t *size = &old_state->arcs_lsize[ab->b_type];
1152
1153 if (use_mutex)
1154 mutex_enter(&old_state->arcs_mtx);
1155
1156 ASSERT(list_link_active(&ab->b_arc_node));
1157 list_remove(&old_state->arcs_list[ab->b_type], ab);
1158
1159 /*
1160 * If prefetching out of the ghost cache,
1161 * we will have a non-zero datacnt.
1162 */
1163 if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
1164 /* ghost elements have a ghost size */
1165 ASSERT(ab->b_buf == NULL);
1166 from_delta = ab->b_size;
1167 }
1168 ASSERT3U(*size, >=, from_delta);
1169 atomic_add_64(size, -from_delta);
1170
1171 if (use_mutex)
1172 mutex_exit(&old_state->arcs_mtx);
1173 }
1174 if (new_state != arc_anon) {
1175 int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
1176 uint64_t *size = &new_state->arcs_lsize[ab->b_type];
1177
1178 if (use_mutex)
1179 mutex_enter(&new_state->arcs_mtx);
1180
1181 list_insert_head(&new_state->arcs_list[ab->b_type], ab);
1182
1183 /* ghost elements have a ghost size */
1184 if (GHOST_STATE(new_state)) {
1185 ASSERT(ab->b_datacnt == 0);
1186 ASSERT(ab->b_buf == NULL);
1187 to_delta = ab->b_size;
1188 }
1189 atomic_add_64(size, to_delta);
1190
1191 if (use_mutex)
1192 mutex_exit(&new_state->arcs_mtx);
1193 }
1194 }
1195
1196 ASSERT(!BUF_EMPTY(ab));
1197 if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
1198 buf_hash_remove(ab);
1199
1200 /* adjust state sizes */
1201 if (to_delta)
1202 atomic_add_64(&new_state->arcs_size, to_delta);
1203 if (from_delta) {
1204 ASSERT3U(old_state->arcs_size, >=, from_delta);
1205 atomic_add_64(&old_state->arcs_size, -from_delta);
1206 }
1207 ab->b_state = new_state;
1208
1209 /* adjust l2arc hdr stats */
1210 if (new_state == arc_l2c_only)
1211 l2arc_hdr_stat_add();
1212 else if (old_state == arc_l2c_only)
1213 l2arc_hdr_stat_remove();
1214 }
1215
1216 void
1217 arc_space_consume(uint64_t space, arc_space_type_t type)
1218 {
1219 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1220
1221 switch (type) {
1222 case ARC_SPACE_DATA:
1223 ARCSTAT_INCR(arcstat_data_size, space);
1224 break;
1225 case ARC_SPACE_OTHER:
1226 ARCSTAT_INCR(arcstat_other_size, space);
1227 break;
1228 case ARC_SPACE_HDRS:
1229 ARCSTAT_INCR(arcstat_hdr_size, space);
1230 break;
1231 case ARC_SPACE_L2HDRS:
1232 ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1233 break;
1234 }
1235
1236 ARCSTAT_INCR(arcstat_meta_used, space);
1237 atomic_add_64(&arc_size, space);
1238 }
1239
1240 void
1241 arc_space_return(uint64_t space, arc_space_type_t type)
1242 {
1243 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1244
1245 switch (type) {
1246 case ARC_SPACE_DATA:
1247 ARCSTAT_INCR(arcstat_data_size, -space);
1248 break;
1249 case ARC_SPACE_OTHER:
1250 ARCSTAT_INCR(arcstat_other_size, -space);
1251 break;
1252 case ARC_SPACE_HDRS:
1253 ARCSTAT_INCR(arcstat_hdr_size, -space);
1254 break;
1255 case ARC_SPACE_L2HDRS:
1256 ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1257 break;
1258 }
1259
1260 ASSERT(arc_meta_used >= space);
1261 if (arc_meta_max < arc_meta_used)
1262 arc_meta_max = arc_meta_used;
1263 ARCSTAT_INCR(arcstat_meta_used, -space);
1264 ASSERT(arc_size >= space);
1265 atomic_add_64(&arc_size, -space);
1266 }
1267
1268 void *
1269 arc_data_buf_alloc(uint64_t size)
1270 {
1271 if (arc_evict_needed(ARC_BUFC_DATA))
1272 cv_signal(&arc_reclaim_thr_cv);
1273 atomic_add_64(&arc_size, size);
1274 return (zio_data_buf_alloc(size));
1275 }
1276
1277 void
1278 arc_data_buf_free(void *buf, uint64_t size)
1279 {
1280 zio_data_buf_free(buf, size);
1281 ASSERT(arc_size >= size);
1282 atomic_add_64(&arc_size, -size);
1283 }
1284
1285 arc_buf_t *
1286 arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
1287 {
1288 arc_buf_hdr_t *hdr;
1289 arc_buf_t *buf;
1290
1291 ASSERT3U(size, >, 0);
1292 hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1293 ASSERT(BUF_EMPTY(hdr));
1294 hdr->b_size = size;
1295 hdr->b_type = type;
1296 hdr->b_spa = spa_load_guid(spa);
1297 hdr->b_state = arc_anon;
1298 hdr->b_arc_access = 0;
1299 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1300 buf->b_hdr = hdr;
1301 buf->b_data = NULL;
1302 buf->b_efunc = NULL;
1303 buf->b_private = NULL;
1304 buf->b_next = NULL;
1305 hdr->b_buf = buf;
1306 arc_get_data_buf(buf);
1307 hdr->b_datacnt = 1;
1308 hdr->b_flags = 0;
1309 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1310 (void) refcount_add(&hdr->b_refcnt, tag);
1311
1312 return (buf);
1313 }
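
/*
 * A minimal usage sketch (the dbuf layer is the main caller; "db" is an
 * illustrative tag -- any stable pointer works):
 *
 *	arc_buf_t *buf = arc_buf_alloc(spa, size, db, ARC_BUFC_DATA);
 *	... fill buf->b_data ...
 *	arc_buf_free(buf, db);		or hand it to the write path
 *
 * The new buffer starts out in arc_anon with a reference held by "db" and
 * only gains a DVA-based identity (and hash table entry) once written out.
 */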
1314
1315 static char *arc_onloan_tag = "onloan";
1316
1317 /*
1318 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1319 * flight data by arc_tempreserve_space() until they are "returned". Loaned
1320 * buffers must be returned to the arc before they can be used by the DMU or
1321 * freed.
1322 */
1323 arc_buf_t *
1324 arc_loan_buf(spa_t *spa, int size)
1325 {
1326 arc_buf_t *buf;
1327
1328 buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1329
1330 atomic_add_64(&arc_loaned_bytes, size);
1331 return (buf);
1332 }
1333
1334 /*
1335 * Return a loaned arc buffer to the arc.
1336 */
1337 void
1338 arc_return_buf(arc_buf_t *buf, void *tag)
1339 {
1340 arc_buf_hdr_t *hdr = buf->b_hdr;
1341
1342 ASSERT(buf->b_data != NULL);
1343 (void) refcount_add(&hdr->b_refcnt, tag);
1344 (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
1345
1346 atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1347 }
1348
1349 /* Detach an arc_buf from a dbuf (tag) */
1350 void
1351 arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1352 {
1353 arc_buf_hdr_t *hdr;
1354
1355 ASSERT(buf->b_data != NULL);
1356 hdr = buf->b_hdr;
1357 (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
1358 (void) refcount_remove(&hdr->b_refcnt, tag);
1359 buf->b_efunc = NULL;
1360 buf->b_private = NULL;
1361
1362 atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1363 }
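
/*
 * The loan protocol, roughly (dmu_request_arcbuf() and dmu_assign_arcbuf()
 * are the expected consumers):
 *
 *	buf = arc_loan_buf(spa, size);	excluded from arc_tempreserve_space()
 *	... caller fills buf->b_data ...
 *	arc_return_buf(buf, tag);	re-attach before the DMU uses/frees it
 *
 * arc_loan_inuse_buf() goes the other direction: it detaches a buffer that
 * a dbuf (tag) currently owns and accounts for it as loaned again.
 */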
1364
1365 static arc_buf_t *
1366 arc_buf_clone(arc_buf_t *from)
1367 {
1368 arc_buf_t *buf;
1369 arc_buf_hdr_t *hdr = from->b_hdr;
1370 uint64_t size = hdr->b_size;
1371
1372 ASSERT(hdr->b_state != arc_anon);
1373
1374 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1375 buf->b_hdr = hdr;
1376 buf->b_data = NULL;
1377 buf->b_efunc = NULL;
1378 buf->b_private = NULL;
1379 buf->b_next = hdr->b_buf;
1380 hdr->b_buf = buf;
1381 arc_get_data_buf(buf);
1382 bcopy(from->b_data, buf->b_data, size);
1383
1384 /*
1385 * This buffer already exists in the arc so create a duplicate
1386 * copy for the caller. If the buffer is associated with user data
1387 * then track the size and number of duplicates. These stats will be
1388 * updated as duplicate buffers are created and destroyed.
1389 */
1390 if (hdr->b_type == ARC_BUFC_DATA) {
1391 ARCSTAT_BUMP(arcstat_duplicate_buffers);
1392 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
1393 }
1394 hdr->b_datacnt += 1;
1395 return (buf);
1396 }
1397
1398 void
1399 arc_buf_add_ref(arc_buf_t *buf, void* tag)
1400 {
1401 arc_buf_hdr_t *hdr;
1402 kmutex_t *hash_lock;
1403
1404 /*
1405 * Check to see if this buffer is evicted. Callers
1406 * must verify b_data != NULL to know if the add_ref
1407 * was successful.
1408 */
1409 mutex_enter(&buf->b_evict_lock);
1410 if (buf->b_data == NULL) {
1411 mutex_exit(&buf->b_evict_lock);
1412 return;
1413 }
1414 hash_lock = HDR_LOCK(buf->b_hdr);
1415 mutex_enter(hash_lock);
1416 hdr = buf->b_hdr;
1417 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1418 mutex_exit(&buf->b_evict_lock);
1419
1420 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
1421 add_reference(hdr, hash_lock, tag);
1422 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1423 arc_access(hdr, hash_lock);
1424 mutex_exit(hash_lock);
1425 ARCSTAT_BUMP(arcstat_hits);
1426 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
1427 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
1428 data, metadata, hits);
1429 }
1430
1431 /*
1432 * Free the arc data buffer. If it is an l2arc write in progress,
1433 * the buffer is placed on l2arc_free_on_write to be freed later.
1434 */
1435 static void
1436 arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
1437 {
1438 arc_buf_hdr_t *hdr = buf->b_hdr;
1439
1440 if (HDR_L2_WRITING(hdr)) {
1441 l2arc_data_free_t *df;
1442 df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
1443 df->l2df_data = buf->b_data;
1444 df->l2df_size = hdr->b_size;
1445 df->l2df_func = free_func;
1446 mutex_enter(&l2arc_free_on_write_mtx);
1447 list_insert_head(l2arc_free_on_write, df);
1448 mutex_exit(&l2arc_free_on_write_mtx);
1449 ARCSTAT_BUMP(arcstat_l2_free_on_write);
1450 } else {
1451 free_func(buf->b_data, hdr->b_size);
1452 }
1453 }
1454
1455 static void
1456 arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
1457 {
1458 arc_buf_t **bufp;
1459
1460 /* free up data associated with the buf */
1461 if (buf->b_data) {
1462 arc_state_t *state = buf->b_hdr->b_state;
1463 uint64_t size = buf->b_hdr->b_size;
1464 arc_buf_contents_t type = buf->b_hdr->b_type;
1465
1466 arc_cksum_verify(buf);
1467 arc_buf_unwatch(buf);
1468
1469 if (!recycle) {
1470 if (type == ARC_BUFC_METADATA) {
1471 arc_buf_data_free(buf, zio_buf_free);
1472 arc_space_return(size, ARC_SPACE_DATA);
1473 } else {
1474 ASSERT(type == ARC_BUFC_DATA);
1475 arc_buf_data_free(buf, zio_data_buf_free);
1476 ARCSTAT_INCR(arcstat_data_size, -size);
1477 atomic_add_64(&arc_size, -size);
1478 }
1479 }
1480 if (list_link_active(&buf->b_hdr->b_arc_node)) {
1481 uint64_t *cnt = &state->arcs_lsize[type];
1482
1483 ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
1484 ASSERT(state != arc_anon);
1485
1486 ASSERT3U(*cnt, >=, size);
1487 atomic_add_64(cnt, -size);
1488 }
1489 ASSERT3U(state->arcs_size, >=, size);
1490 atomic_add_64(&state->arcs_size, -size);
1491 buf->b_data = NULL;
1492
1493 /*
1494 * If we're destroying a duplicate buffer make sure
1495 * that the appropriate statistics are updated.
1496 */
1497 if (buf->b_hdr->b_datacnt > 1 &&
1498 buf->b_hdr->b_type == ARC_BUFC_DATA) {
1499 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
1500 ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
1501 }
1502 ASSERT(buf->b_hdr->b_datacnt > 0);
1503 buf->b_hdr->b_datacnt -= 1;
1504 }
1505
1506 /* only remove the buf if requested */
1507 if (!all)
1508 return;
1509
1510 /* remove the buf from the hdr list */
1511 for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
1512 continue;
1513 *bufp = buf->b_next;
1514 buf->b_next = NULL;
1515
1516 ASSERT(buf->b_efunc == NULL);
1517
1518 /* clean up the buf */
1519 buf->b_hdr = NULL;
1520 kmem_cache_free(buf_cache, buf);
1521 }
1522
1523 static void
1524 arc_hdr_destroy(arc_buf_hdr_t *hdr)
1525 {
1526 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1527 ASSERT3P(hdr->b_state, ==, arc_anon);
1528 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1529 l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1530
1531 if (l2hdr != NULL) {
1532 boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
1533 /*
1534 * To prevent arc_free() and l2arc_evict() from
1535 * attempting to free the same buffer at the same time,
1536 * a FREE_IN_PROGRESS flag is given to arc_free() to
1537 * give it priority. l2arc_evict() can't destroy this
1538 * header while we are waiting on l2arc_buflist_mtx.
1539 *
1540 * The hdr may be removed from l2ad_buflist before we
1541 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1542 */
1543 if (!buflist_held) {
1544 mutex_enter(&l2arc_buflist_mtx);
1545 l2hdr = hdr->b_l2hdr;
1546 }
1547
1548 if (l2hdr != NULL) {
1549 list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
1550 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1551 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
1552 if (hdr->b_state == arc_l2c_only)
1553 l2arc_hdr_stat_remove();
1554 hdr->b_l2hdr = NULL;
1555 }
1556
1557 if (!buflist_held)
1558 mutex_exit(&l2arc_buflist_mtx);
1559 }
1560
1561 if (!BUF_EMPTY(hdr)) {
1562 ASSERT(!HDR_IN_HASH_TABLE(hdr));
1563 buf_discard_identity(hdr);
1564 }
1565 while (hdr->b_buf) {
1566 arc_buf_t *buf = hdr->b_buf;
1567
1568 if (buf->b_efunc) {
1569 mutex_enter(&arc_eviction_mtx);
1570 mutex_enter(&buf->b_evict_lock);
1571 ASSERT(buf->b_hdr != NULL);
1572 arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1573 hdr->b_buf = buf->b_next;
1574 buf->b_hdr = &arc_eviction_hdr;
1575 buf->b_next = arc_eviction_list;
1576 arc_eviction_list = buf;
1577 mutex_exit(&buf->b_evict_lock);
1578 mutex_exit(&arc_eviction_mtx);
1579 } else {
1580 arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1581 }
1582 }
1583 if (hdr->b_freeze_cksum != NULL) {
1584 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1585 hdr->b_freeze_cksum = NULL;
1586 }
1587 if (hdr->b_thawed) {
1588 kmem_free(hdr->b_thawed, 1);
1589 hdr->b_thawed = NULL;
1590 }
1591
1592 ASSERT(!list_link_active(&hdr->b_arc_node));
1593 ASSERT3P(hdr->b_hash_next, ==, NULL);
1594 ASSERT3P(hdr->b_acb, ==, NULL);
1595 kmem_cache_free(hdr_cache, hdr);
1596 }
1597
1598 void
1599 arc_buf_free(arc_buf_t *buf, void *tag)
1600 {
1601 arc_buf_hdr_t *hdr = buf->b_hdr;
1602 int hashed = hdr->b_state != arc_anon;
1603
1604 ASSERT(buf->b_efunc == NULL);
1605 ASSERT(buf->b_data != NULL);
1606
1607 if (hashed) {
1608 kmutex_t *hash_lock = HDR_LOCK(hdr);
1609
1610 mutex_enter(hash_lock);
1611 hdr = buf->b_hdr;
1612 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1613
1614 (void) remove_reference(hdr, hash_lock, tag);
1615 if (hdr->b_datacnt > 1) {
1616 arc_buf_destroy(buf, FALSE, TRUE);
1617 } else {
1618 ASSERT(buf == hdr->b_buf);
1619 ASSERT(buf->b_efunc == NULL);
1620 hdr->b_flags |= ARC_BUF_AVAILABLE;
1621 }
1622 mutex_exit(hash_lock);
1623 } else if (HDR_IO_IN_PROGRESS(hdr)) {
1624 int destroy_hdr;
1625 /*
1626 * We are in the middle of an async write. Don't destroy
1627 * this buffer unless the write completes before we finish
1628 * decrementing the reference count.
1629 */
1630 mutex_enter(&arc_eviction_mtx);
1631 (void) remove_reference(hdr, NULL, tag);
1632 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1633 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
1634 mutex_exit(&arc_eviction_mtx);
1635 if (destroy_hdr)
1636 arc_hdr_destroy(hdr);
1637 } else {
1638 if (remove_reference(hdr, NULL, tag) > 0)
1639 arc_buf_destroy(buf, FALSE, TRUE);
1640 else
1641 arc_hdr_destroy(hdr);
1642 }
1643 }
1644
1645 boolean_t
1646 arc_buf_remove_ref(arc_buf_t *buf, void* tag)
1647 {
1648 arc_buf_hdr_t *hdr = buf->b_hdr;
1649 kmutex_t *hash_lock = HDR_LOCK(hdr);
1650 boolean_t no_callback = (buf->b_efunc == NULL);
1651
1652 if (hdr->b_state == arc_anon) {
1653 ASSERT(hdr->b_datacnt == 1);
1654 arc_buf_free(buf, tag);
1655 return (no_callback);
1656 }
1657
1658 mutex_enter(hash_lock);
1659 hdr = buf->b_hdr;
1660 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1661 ASSERT(hdr->b_state != arc_anon);
1662 ASSERT(buf->b_data != NULL);
1663
1664 (void) remove_reference(hdr, hash_lock, tag);
1665 if (hdr->b_datacnt > 1) {
1666 if (no_callback)
1667 arc_buf_destroy(buf, FALSE, TRUE);
1668 } else if (no_callback) {
1669 ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
1670 ASSERT(buf->b_efunc == NULL);
1671 hdr->b_flags |= ARC_BUF_AVAILABLE;
1672 }
1673 ASSERT(no_callback || hdr->b_datacnt > 1 ||
1674 refcount_is_zero(&hdr->b_refcnt));
1675 mutex_exit(hash_lock);
1676 return (no_callback);
1677 }
1678
1679 int
1680 arc_buf_size(arc_buf_t *buf)
1681 {
1682 return (buf->b_hdr->b_size);
1683 }
1684
1685 /*
1686 * Called from the DMU to determine if the current buffer should be
1687 * evicted. In order to ensure proper locking, the eviction must be initiated
1688 * from the DMU. Return true if the buffer is associated with user data and
1689 * duplicate buffers still exist.
1690 */
1691 boolean_t
1692 arc_buf_eviction_needed(arc_buf_t *buf)
1693 {
1694 arc_buf_hdr_t *hdr;
1695 boolean_t evict_needed = B_FALSE;
1696
1697 if (zfs_disable_dup_eviction)
1698 return (B_FALSE);
1699
1700 mutex_enter(&buf->b_evict_lock);
1701 hdr = buf->b_hdr;
1702 if (hdr == NULL) {
1703 /*
1704 * We are in arc_do_user_evicts(); let that function
1705 * perform the eviction.
1706 */
1707 ASSERT(buf->b_data == NULL);
1708 mutex_exit(&buf->b_evict_lock);
1709 return (B_FALSE);
1710 } else if (buf->b_data == NULL) {
1711 /*
1712 * We have already been added to the arc eviction list;
1713 * recommend eviction.
1714 */
1715 ASSERT3P(hdr, ==, &arc_eviction_hdr);
1716 mutex_exit(&buf->b_evict_lock);
1717 return (B_TRUE);
1718 }
1719
1720 if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
1721 evict_needed = B_TRUE;
1722
1723 mutex_exit(&buf->b_evict_lock);
1724 return (evict_needed);
1725 }
1726
1727 /*
1728 * Evict buffers from list until we've removed the specified number of
1729 * bytes. Move the removed buffers to the appropriate evict state.
1730 * If the recycle flag is set, then attempt to "recycle" a buffer:
1731 * - look for a buffer to evict that is `bytes' long.
1732 * - return the data block from this buffer rather than freeing it.
1733 * This flag is used by callers that are trying to make space for a
1734 * new buffer in a full arc cache.
1735 *
1736 * This function makes a "best effort". It skips over any buffers
1737 * it can't get a hash_lock on, and so may not catch all candidates.
1738 * It may also return without evicting as much space as requested.
1739 */
1740 static void *
1741 arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
1742 arc_buf_contents_t type)
1743 {
1744 arc_state_t *evicted_state;
1745 uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
1746 arc_buf_hdr_t *ab, *ab_prev = NULL;
1747 list_t *list = &state->arcs_list[type];
1748 kmutex_t *hash_lock;
1749 boolean_t have_lock;
1750 void *stolen = NULL;
1751
1752 ASSERT(state == arc_mru || state == arc_mfu);
1753
1754 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
1755
1756 mutex_enter(&state->arcs_mtx);
1757 mutex_enter(&evicted_state->arcs_mtx);
1758
1759 for (ab = list_tail(list); ab; ab = ab_prev) {
1760 ab_prev = list_prev(list, ab);
1761 /* prefetch buffers have a minimum lifespan */
1762 if (HDR_IO_IN_PROGRESS(ab) ||
1763 (spa && ab->b_spa != spa) ||
1764 (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
1765 ddi_get_lbolt() - ab->b_arc_access <
1766 arc_min_prefetch_lifespan)) {
1767 skipped++;
1768 continue;
1769 }
1770 /* "lookahead" for better eviction candidate */
1771 if (recycle && ab->b_size != bytes &&
1772 ab_prev && ab_prev->b_size == bytes)
1773 continue;
1774 hash_lock = HDR_LOCK(ab);
1775 have_lock = MUTEX_HELD(hash_lock);
1776 if (have_lock || mutex_tryenter(hash_lock)) {
1777 ASSERT0(refcount_count(&ab->b_refcnt));
1778 ASSERT(ab->b_datacnt > 0);
1779 while (ab->b_buf) {
1780 arc_buf_t *buf = ab->b_buf;
1781 if (!mutex_tryenter(&buf->b_evict_lock)) {
1782 missed += 1;
1783 break;
1784 }
1785 if (buf->b_data) {
1786 bytes_evicted += ab->b_size;
1787 if (recycle && ab->b_type == type &&
1788 ab->b_size == bytes &&
1789 !HDR_L2_WRITING(ab)) {
1790 stolen = buf->b_data;
1791 recycle = FALSE;
1792 }
1793 }
1794 if (buf->b_efunc) {
1795 mutex_enter(&arc_eviction_mtx);
1796 arc_buf_destroy(buf,
1797 buf->b_data == stolen, FALSE);
1798 ab->b_buf = buf->b_next;
1799 buf->b_hdr = &arc_eviction_hdr;
1800 buf->b_next = arc_eviction_list;
1801 arc_eviction_list = buf;
1802 mutex_exit(&arc_eviction_mtx);
1803 mutex_exit(&buf->b_evict_lock);
1804 } else {
1805 mutex_exit(&buf->b_evict_lock);
1806 arc_buf_destroy(buf,
1807 buf->b_data == stolen, TRUE);
1808 }
1809 }
1810
1811 if (ab->b_l2hdr) {
1812 ARCSTAT_INCR(arcstat_evict_l2_cached,
1813 ab->b_size);
1814 } else {
1815 if (l2arc_write_eligible(ab->b_spa, ab)) {
1816 ARCSTAT_INCR(arcstat_evict_l2_eligible,
1817 ab->b_size);
1818 } else {
1819 ARCSTAT_INCR(
1820 arcstat_evict_l2_ineligible,
1821 ab->b_size);
1822 }
1823 }
1824
1825 if (ab->b_datacnt == 0) {
1826 arc_change_state(evicted_state, ab, hash_lock);
1827 ASSERT(HDR_IN_HASH_TABLE(ab));
1828 ab->b_flags |= ARC_IN_HASH_TABLE;
1829 ab->b_flags &= ~ARC_BUF_AVAILABLE;
1830 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
1831 }
1832 if (!have_lock)
1833 mutex_exit(hash_lock);
1834 if (bytes >= 0 && bytes_evicted >= bytes)
1835 break;
1836 } else {
1837 missed += 1;
1838 }
1839 }
1840
1841 mutex_exit(&evicted_state->arcs_mtx);
1842 mutex_exit(&state->arcs_mtx);
1843
1844 if (bytes_evicted < bytes)
1845 dprintf("only evicted %lld bytes from %x",
1846 (longlong_t)bytes_evicted, state);
1847
1848 if (skipped)
1849 ARCSTAT_INCR(arcstat_evict_skip, skipped);
1850
1851 if (missed)
1852 ARCSTAT_INCR(arcstat_mutex_miss, missed);
1853
1854 /*
1855 * We have just evicted some data into the ghost state; make
1856 * sure we also adjust the ghost state size if necessary.
1857 */
1858 if (arc_no_grow &&
1859 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
1860 int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
1861 arc_mru_ghost->arcs_size - arc_c;
1862
1863 if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
1864 int64_t todelete =
1865 MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
1866 arc_evict_ghost(arc_mru_ghost, NULL, todelete);
1867 } else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
1868 int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
1869 arc_mru_ghost->arcs_size +
1870 arc_mfu_ghost->arcs_size - arc_c);
1871 arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
1872 }
1873 }
1874
1875 return (stolen);
1876 }
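
/*
 * Illustrative sketch (not part of the build): a caller that wants to
 * reuse an evicted data block of a known size, falling back to a fresh
 * allocation when nothing suitable could be recycled. This mirrors the
 * pattern in arc_get_data_buf() below; `size' and `data' are
 * hypothetical locals.
 *
 *	void *data = arc_evict(arc_mfu, NULL, size, TRUE, ARC_BUFC_DATA);
 *	if (data == NULL) {
 *		data = zio_data_buf_alloc(size);
 *		ARCSTAT_BUMP(arcstat_recycle_miss);
 *	}
 */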
1877
1878 /*
1879 * Remove buffers from list until we've removed the specified number of
1880 * bytes. Destroy the buffers that are removed.
1881 */
1882 static void
1883 arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
1884 {
1885 arc_buf_hdr_t *ab, *ab_prev;
1886 arc_buf_hdr_t marker = { 0 };
1887 list_t *list = &state->arcs_list[ARC_BUFC_DATA];
1888 kmutex_t *hash_lock;
1889 uint64_t bytes_deleted = 0;
1890 uint64_t bufs_skipped = 0;
1891
1892 ASSERT(GHOST_STATE(state));
1893 top:
1894 mutex_enter(&state->arcs_mtx);
1895 for (ab = list_tail(list); ab; ab = ab_prev) {
1896 ab_prev = list_prev(list, ab);
1897 if (spa && ab->b_spa != spa)
1898 continue;
1899
1900 /* ignore markers */
1901 if (ab->b_spa == 0)
1902 continue;
1903
1904 hash_lock = HDR_LOCK(ab);
1905 /* caller may be trying to modify this buffer, skip it */
1906 if (MUTEX_HELD(hash_lock))
1907 continue;
1908 if (mutex_tryenter(hash_lock)) {
1909 ASSERT(!HDR_IO_IN_PROGRESS(ab));
1910 ASSERT(ab->b_buf == NULL);
1911 ARCSTAT_BUMP(arcstat_deleted);
1912 bytes_deleted += ab->b_size;
1913
1914 if (ab->b_l2hdr != NULL) {
1915 /*
1916 * This buffer is cached on the 2nd Level ARC;
1917 * don't destroy the header.
1918 */
1919 arc_change_state(arc_l2c_only, ab, hash_lock);
1920 mutex_exit(hash_lock);
1921 } else {
1922 arc_change_state(arc_anon, ab, hash_lock);
1923 mutex_exit(hash_lock);
1924 arc_hdr_destroy(ab);
1925 }
1926
1927 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
1928 if (bytes >= 0 && bytes_deleted >= bytes)
1929 break;
1930 } else if (bytes < 0) {
1931 /*
1932 * Insert a list marker and then wait for the
1933 * hash lock to become available. Once it's
1934 * available, restart from where we left off.
1935 */
1936 list_insert_after(list, ab, &marker);
1937 mutex_exit(&state->arcs_mtx);
1938 mutex_enter(hash_lock);
1939 mutex_exit(hash_lock);
1940 mutex_enter(&state->arcs_mtx);
1941 ab_prev = list_prev(list, &marker);
1942 list_remove(list, &marker);
1943 } else
1944 bufs_skipped += 1;
1945 }
1946 mutex_exit(&state->arcs_mtx);
1947
1948 if (list == &state->arcs_list[ARC_BUFC_DATA] &&
1949 (bytes < 0 || bytes_deleted < bytes)) {
1950 list = &state->arcs_list[ARC_BUFC_METADATA];
1951 goto top;
1952 }
1953
1954 if (bufs_skipped) {
1955 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
1956 ASSERT(bytes >= 0);
1957 }
1958
1959 if (bytes_deleted < bytes)
1960 dprintf("only deleted %lld bytes from %p",
1961 (longlong_t)bytes_deleted, state);
1962 }
1963
1964 static void
1965 arc_adjust(void)
1966 {
1967 int64_t adjustment, delta;
1968
1969 /*
1970 * Adjust MRU size
1971 */
1972
1973 adjustment = MIN((int64_t)(arc_size - arc_c),
1974 (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
1975 arc_p));
1976
1977 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
1978 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
1979 (void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA);
1980 adjustment -= delta;
1981 }
1982
1983 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
1984 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
1985 (void) arc_evict(arc_mru, NULL, delta, FALSE,
1986 ARC_BUFC_METADATA);
1987 }
1988
1989 /*
1990 * Adjust MFU size
1991 */
1992
1993 adjustment = arc_size - arc_c;
1994
1995 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
1996 delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
1997 (void) arc_evict(arc_mfu, NULL, delta, FALSE, ARC_BUFC_DATA);
1998 adjustment -= delta;
1999 }
2000
2001 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2002 int64_t delta = MIN(adjustment,
2003 arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
2004 (void) arc_evict(arc_mfu, NULL, delta, FALSE,
2005 ARC_BUFC_METADATA);
2006 }
2007
2008 /*
2009 * Adjust ghost lists
2010 */
2011
2012 adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
2013
2014 if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
2015 delta = MIN(arc_mru_ghost->arcs_size, adjustment);
2016 arc_evict_ghost(arc_mru_ghost, NULL, delta);
2017 }
2018
2019 adjustment =
2020 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2021
2022 if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
2023 delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
2024 arc_evict_ghost(arc_mfu_ghost, NULL, delta);
2025 }
2026 }
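
/*
 * Worked example of the MRU adjustment above (hypothetical numbers):
 * with arc_c = 1000MB, arc_size = 1100MB, arc_p = 600MB and
 * anon + MRU + metadata in use = 700MB, the MRU overshoot is
 *
 *	adjustment = MIN(1100 - 1000, 700 - 600) = 100MB
 *
 * so up to 100MB is evicted from the MRU data list first, with any
 * remainder taken from the MRU metadata list, before the MFU and the
 * ghost lists are considered.
 */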
2027
2028 static void
2029 arc_do_user_evicts(void)
2030 {
2031 mutex_enter(&arc_eviction_mtx);
2032 while (arc_eviction_list != NULL) {
2033 arc_buf_t *buf = arc_eviction_list;
2034 arc_eviction_list = buf->b_next;
2035 mutex_enter(&buf->b_evict_lock);
2036 buf->b_hdr = NULL;
2037 mutex_exit(&buf->b_evict_lock);
2038 mutex_exit(&arc_eviction_mtx);
2039
2040 if (buf->b_efunc != NULL)
2041 VERIFY(buf->b_efunc(buf) == 0);
2042
2043 buf->b_efunc = NULL;
2044 buf->b_private = NULL;
2045 kmem_cache_free(buf_cache, buf);
2046 mutex_enter(&arc_eviction_mtx);
2047 }
2048 mutex_exit(&arc_eviction_mtx);
2049 }
2050
2051 /*
2052 * Flush all *evictable* data from the cache for the given spa.
2053 * NOTE: this will not touch "active" (i.e. referenced) data.
2054 */
2055 void
2056 arc_flush(spa_t *spa)
2057 {
2058 uint64_t guid = 0;
2059
2060 if (spa)
2061 guid = spa_load_guid(spa);
2062
2063 while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
2064 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
2065 if (spa)
2066 break;
2067 }
2068 while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
2069 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
2070 if (spa)
2071 break;
2072 }
2073 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
2074 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
2075 if (spa)
2076 break;
2077 }
2078 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
2079 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
2080 if (spa)
2081 break;
2082 }
2083
2084 arc_evict_ghost(arc_mru_ghost, guid, -1);
2085 arc_evict_ghost(arc_mfu_ghost, guid, -1);
2086
2087 mutex_enter(&arc_reclaim_thr_lock);
2088 arc_do_user_evicts();
2089 mutex_exit(&arc_reclaim_thr_lock);
2090 ASSERT(spa || arc_eviction_list == NULL);
2091 }
2092
2093 void
2094 arc_shrink(void)
2095 {
2096 if (arc_c > arc_c_min) {
2097 uint64_t to_free;
2098
2099 #ifdef _KERNEL
2100 to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree));
2101 #else
2102 to_free = arc_c >> arc_shrink_shift;
2103 #endif
2104 if (arc_c > arc_c_min + to_free)
2105 atomic_add_64(&arc_c, -to_free);
2106 else
2107 arc_c = arc_c_min;
2108
2109 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
2110 if (arc_c > arc_size)
2111 arc_c = MAX(arc_size, arc_c_min);
2112 if (arc_p > arc_c)
2113 arc_p = (arc_c >> 1);
2114 ASSERT(arc_c >= arc_c_min);
2115 ASSERT((int64_t)arc_p >= 0);
2116 }
2117
2118 if (arc_size > arc_c)
2119 arc_adjust();
2120 }
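
/*
 * Worked example (hypothetical numbers): with arc_c = 4GB,
 * arc_c_min = 1GB, arc_shrink_shift = 5 and no pages wanted by the VM
 * system (needfree == 0), a shrink frees
 *
 *	to_free = arc_c >> 5 = 128MB
 *
 * dropping arc_c to 3968MB (arc_p is reduced by the same shift), after
 * which arc_adjust() evicts buffers until arc_size fits under the new
 * target.
 */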
2121
2122 /*
2123 * Determine if the system is under memory pressure and is asking
2124 * to reclaim memory. A return value of 1 indicates that the system
2125 * is under memory pressure and that the arc should adjust accordingly.
2126 */
2127 static int
2128 arc_reclaim_needed(void)
2129 {
2130 uint64_t extra;
2131
2132 #ifdef _KERNEL
2133
2134 if (needfree)
2135 return (1);
2136
2137 /*
2138 * take 'desfree' extra pages, so we reclaim sooner, rather than later
2139 */
2140 extra = desfree;
2141
2142 /*
2143 * check that we're out of range of the pageout scanner. It starts to
2144 * schedule paging if freemem is less than lotsfree and needfree.
2145 * lotsfree is the high-water mark for pageout, and needfree is the
2146 * number of needed free pages. We add extra pages here to make sure
2147 * the scanner doesn't start up while we're freeing memory.
2148 */
2149 if (freemem < lotsfree + needfree + extra)
2150 return (1);
2151
2152 /*
2153 * check to make sure that swapfs has enough space so that anon
2154 * reservations can still succeed. anon_resvmem() checks that the
2155 * availrmem is greater than swapfs_minfree, and the number of reserved
2156 * swap pages. We also add a bit of extra here just to prevent
2157 * circumstances from getting really dire.
2158 */
2159 if (availrmem < swapfs_minfree + swapfs_reserve + extra)
2160 return (1);
2161
2162 #if defined(__i386)
2163 /*
2164 * If we're on an i386 platform, it's possible that we'll exhaust the
2165 * kernel heap space before we ever run out of available physical
2166 * memory. Most checks of the size of the heap_area compare against
2167 * tune.t_minarmem, which is the minimum available real memory that we
2168 * can have in the system. However, this is generally fixed at 25 pages
2169 * which is so low that it's useless. In this comparison, we seek to
2170 * calculate the total heap-size, and reclaim if more than 3/4ths of the
2171 * heap is allocated. (Or, in the calculation, if less than 1/4th is
2172 * free)
2173 */
2174 if (vmem_size(heap_arena, VMEM_FREE) <
2175 (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2))
2176 return (1);
2177 #endif
2178
2179 /*
2180 * If zio data pages are being allocated out of a separate heap segment,
2181 * then enforce that the size of available vmem for this arena remains
2182 * above about 1/16th free.
2183 *
2184 * Note: The 1/16th arena free requirement was put in place
2185 * to aggressively evict memory from the arc in order to avoid
2186 * memory fragmentation issues.
2187 */
2188 if (zio_arena != NULL &&
2189 vmem_size(zio_arena, VMEM_FREE) <
2190 (vmem_size(zio_arena, VMEM_ALLOC) >> 4))
2191 return (1);
2192 #else
2193 if (spa_get_random(100) == 0)
2194 return (1);
2195 #endif
2196 return (0);
2197 }
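
/*
 * Worked example (hypothetical page counts): with lotsfree = 16384,
 * needfree = 0 and desfree = 8192, reclaim is requested as soon as
 * freemem drops below 16384 + 0 + 8192 = 24576 pages, i.e. before the
 * pageout scanner itself would kick in at lotsfree.
 */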
2198
2199 static void
2200 arc_kmem_reap_now(arc_reclaim_strategy_t strat)
2201 {
2202 size_t i;
2203 kmem_cache_t *prev_cache = NULL;
2204 kmem_cache_t *prev_data_cache = NULL;
2205 extern kmem_cache_t *zio_buf_cache[];
2206 extern kmem_cache_t *zio_data_buf_cache[];
2207
2208 #ifdef _KERNEL
2209 if (arc_meta_used >= arc_meta_limit) {
2210 /*
2211 * We are exceeding our meta-data cache limit.
2212 * Purge some DNLC entries to release holds on meta-data.
2213 */
2214 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
2215 }
2216 #if defined(__i386)
2217 /*
2218 * Reclaim unused memory from all kmem caches.
2219 */
2220 kmem_reap();
2221 #endif
2222 #endif
2223
2224 /*
2225 * An aggressive reclamation will shrink the cache size as well as
2226 * reap free buffers from the arc kmem caches.
2227 */
2228 if (strat == ARC_RECLAIM_AGGR)
2229 arc_shrink();
2230
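/*
 * Adjacent zio_buf_cache[] slots can point at the same kmem cache
 * (buffer sizes are rounded up to a smaller set of caches), so
 * remember the previous cache and skip duplicates rather than reaping
 * the same cache twice.
 */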
2231 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
2232 if (zio_buf_cache[i] != prev_cache) {
2233 prev_cache = zio_buf_cache[i];
2234 kmem_cache_reap_now(zio_buf_cache[i]);
2235 }
2236 if (zio_data_buf_cache[i] != prev_data_cache) {
2237 prev_data_cache = zio_data_buf_cache[i];
2238 kmem_cache_reap_now(zio_data_buf_cache[i]);
2239 }
2240 }
2241 kmem_cache_reap_now(buf_cache);
2242 kmem_cache_reap_now(hdr_cache);
2243
2244 /*
2245 * Ask the vmem arena to reclaim unused memory from its
2246 * quantum caches.
2247 */
2248 if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR)
2249 vmem_qcache_reap(zio_arena);
2250 }
2251
2252 static void
2253 arc_reclaim_thread(void)
2254 {
2255 clock_t growtime = 0;
2256 arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS;
2257 callb_cpr_t cpr;
2258
2259 CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2260
2261 mutex_enter(&arc_reclaim_thr_lock);
2262 while (arc_thread_exit == 0) {
2263 if (arc_reclaim_needed()) {
2264
2265 if (arc_no_grow) {
2266 if (last_reclaim == ARC_RECLAIM_CONS) {
2267 last_reclaim = ARC_RECLAIM_AGGR;
2268 } else {
2269 last_reclaim = ARC_RECLAIM_CONS;
2270 }
2271 } else {
2272 arc_no_grow = TRUE;
2273 last_reclaim = ARC_RECLAIM_AGGR;
2274 membar_producer();
2275 }
2276
2277 /* reset the growth delay for every reclaim */
2278 growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
2279
2280 arc_kmem_reap_now(last_reclaim);
2281 arc_warm = B_TRUE;
2282
2283 } else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
2284 arc_no_grow = FALSE;
2285 }
2286
2287 arc_adjust();
2288
2289 if (arc_eviction_list != NULL)
2290 arc_do_user_evicts();
2291
2292 /* block until needed, or one second, whichever is shorter */
2293 CALLB_CPR_SAFE_BEGIN(&cpr);
2294 (void) cv_timedwait(&arc_reclaim_thr_cv,
2295 &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
2296 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
2297 }
2298
2299 arc_thread_exit = 0;
2300 cv_broadcast(&arc_reclaim_thr_cv);
2301 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */
2302 thread_exit();
2303 }
2304
2305 /*
2306 * Adapt arc info given the number of bytes we are trying to add and
2307 * the state that we are coming from. This function is only called
2308 * when we are adding new content to the cache.
2309 */
2310 static void
2311 arc_adapt(int bytes, arc_state_t *state)
2312 {
2313 int mult;
2314 uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
2315
2316 if (state == arc_l2c_only)
2317 return;
2318
2319 ASSERT(bytes > 0);
2320 /*
2321 * Adapt the target size of the MRU list:
2322 * - if we just hit in the MRU ghost list, then increase
2323 * the target size of the MRU list.
2324 * - if we just hit in the MFU ghost list, then increase
2325 * the target size of the MFU list by decreasing the
2326 * target size of the MRU list.
2327 */
2328 if (state == arc_mru_ghost) {
2329 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
2330 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
2331 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
2332
2333 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
2334 } else if (state == arc_mfu_ghost) {
2335 uint64_t delta;
2336
2337 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2338 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
2339 mult = MIN(mult, 10);
2340
2341 delta = MIN(bytes * mult, arc_p);
2342 arc_p = MAX(arc_p_min, arc_p - delta);
2343 }
2344 ASSERT((int64_t)arc_p >= 0);
2345
2346 if (arc_reclaim_needed()) {
2347 cv_signal(&arc_reclaim_thr_cv);
2348 return;
2349 }
2350
2351 if (arc_no_grow)
2352 return;
2353
2354 if (arc_c >= arc_c_max)
2355 return;
2356
2357 /*
2358 * If we're within (2 * maxblocksize) bytes of the target
2359 * cache size, increment the target cache size
2360 */
2361 if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2362 atomic_add_64(&arc_c, (int64_t)bytes);
2363 if (arc_c > arc_c_max)
2364 arc_c = arc_c_max;
2365 else if (state == arc_anon)
2366 atomic_add_64(&arc_p, (int64_t)bytes);
2367 if (arc_p > arc_c)
2368 arc_p = arc_c;
2369 }
2370 ASSERT((int64_t)arc_p >= 0);
2371 }
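
/*
 * Worked example of the adaptation above (hypothetical sizes): on a
 * hit in the MRU ghost list with arc_mru_ghost->arcs_size = 100MB and
 * arc_mfu_ghost->arcs_size = 300MB, the multiplier is
 *
 *	mult = MIN(300 / 100, 10) = 3
 *
 * so a 128K access grows arc_p by 384K (capped at arc_c - arc_p_min),
 * shifting the target split toward recently used data. A hit in the
 * MFU ghost list applies the symmetric decrease to arc_p.
 */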
2372
2373 /*
2374 * Check if the cache has reached its limits and eviction is required
2375 * prior to insert.
2376 */
2377 static int
2378 arc_evict_needed(arc_buf_contents_t type)
2379 {
2380 if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2381 return (1);
2382
2383 if (arc_reclaim_needed())
2384 return (1);
2385
2386 return (arc_size > arc_c);
2387 }
2388
2389 /*
2390 * The buffer, supplied as the first argument, needs a data block.
2391 * So, if we are at cache max, determine which cache should be victimized.
2392 * We have the following cases:
2393 *
2394 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
2395 * In this situation if we're out of space, but the resident size of the MFU is
2396 * under the limit, victimize the MFU cache to satisfy this insertion request.
2397 *
2398 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
2399 * Here, we've used up all of the available space for the MRU, so we need to
2400 * evict from our own cache instead. Evict from the set of resident MRU
2401 * entries.
2402 *
2403 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
2404 * c minus p represents the MFU space in the cache, since p is the size of the
2405 * cache that is dedicated to the MRU. In this situation there's still space on
2406 * the MFU side, so the MRU side needs to be victimized.
2407 *
2408 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
2409 * MFU's resident set is consuming more space than it has been allotted. In
2410 * this situation, we must victimize our own cache, the MFU, for this insertion.
2411 */
2412 static void
2413 arc_get_data_buf(arc_buf_t *buf)
2414 {
2415 arc_state_t *state = buf->b_hdr->b_state;
2416 uint64_t size = buf->b_hdr->b_size;
2417 arc_buf_contents_t type = buf->b_hdr->b_type;
2418
2419 arc_adapt(size, state);
2420
2421 /*
2422 * We have not yet reached cache maximum size,
2423 * just allocate a new buffer.
2424 */
2425 if (!arc_evict_needed(type)) {
2426 if (type == ARC_BUFC_METADATA) {
2427 buf->b_data = zio_buf_alloc(size);
2428 arc_space_consume(size, ARC_SPACE_DATA);
2429 } else {
2430 ASSERT(type == ARC_BUFC_DATA);
2431 buf->b_data = zio_data_buf_alloc(size);
2432 ARCSTAT_INCR(arcstat_data_size, size);
2433 atomic_add_64(&arc_size, size);
2434 }
2435 goto out;
2436 }
2437
2438 /*
2439 * If we are prefetching from the mfu ghost list, this buffer
2440 * will end up on the mru list; so steal space from there.
2441 */
2442 if (state == arc_mfu_ghost)
2443 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2444 else if (state == arc_mru_ghost)
2445 state = arc_mru;
2446
2447 if (state == arc_mru || state == arc_anon) {
2448 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2449 state = (arc_mfu->arcs_lsize[type] >= size &&
2450 arc_p > mru_used) ? arc_mfu : arc_mru;
2451 } else {
2452 /* MFU cases */
2453 uint64_t mfu_space = arc_c - arc_p;
2454 state = (arc_mru->arcs_lsize[type] >= size &&
2455 mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2456 }
2457 if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) {
2458 if (type == ARC_BUFC_METADATA) {
2459 buf->b_data = zio_buf_alloc(size);
2460 arc_space_consume(size, ARC_SPACE_DATA);
2461 } else {
2462 ASSERT(type == ARC_BUFC_DATA);
2463 buf->b_data = zio_data_buf_alloc(size);
2464 ARCSTAT_INCR(arcstat_data_size, size);
2465 atomic_add_64(&arc_size, size);
2466 }
2467 ARCSTAT_BUMP(arcstat_recycle_miss);
2468 }
2469 ASSERT(buf->b_data != NULL);
2470 out:
2471 /*
2472 * Update the state size. Note that ghost states have a
2473 * "ghost size" and so don't need to be updated.
2474 */
2475 if (!GHOST_STATE(buf->b_hdr->b_state)) {
2476 arc_buf_hdr_t *hdr = buf->b_hdr;
2477
2478 atomic_add_64(&hdr->b_state->arcs_size, size);
2479 if (list_link_active(&hdr->b_arc_node)) {
2480 ASSERT(refcount_is_zero(&hdr->b_refcnt));
2481 atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2482 }
2483 /*
2484 * If we are growing the cache, and we are adding anonymous
2485 * data, and we have outgrown arc_p, update arc_p
2486 */
2487 if (arc_size < arc_c && hdr->b_state == arc_anon &&
2488 arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
2489 arc_p = MIN(arc_c, arc_p + size);
2490 }
2491 }
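
/*
 * Example of the victim selection above (hypothetical sizes): an
 * insert destined for the MRU with arc_p = 600MB and
 * arc_anon->arcs_size + arc_mru->arcs_size = 650MB falls under case 2
 * of the comment above arc_get_data_buf(), so arc_evict() is asked to
 * recycle a block from the MRU itself. Had the MRU resident set been
 * under arc_p (case 1), the MFU would have been victimized instead,
 * provided it held at least `size' evictable bytes of this type.
 */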
2492
2493 /*
2494 * This routine is called whenever a buffer is accessed.
2495 * NOTE: the hash lock is dropped in this function.
2496 */
2497 static void
2498 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
2499 {
2500 clock_t now;
2501
2502 ASSERT(MUTEX_HELD(hash_lock));
2503
2504 if (buf->b_state == arc_anon) {
2505 /*
2506 * This buffer is not in the cache, and does not
2507 * appear in our "ghost" list. Add the new buffer
2508 * to the MRU state.
2509 */
2510
2511 ASSERT(buf->b_arc_access == 0);
2512 buf->b_arc_access = ddi_get_lbolt();
2513 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2514 arc_change_state(arc_mru, buf, hash_lock);
2515
2516 } else if (buf->b_state == arc_mru) {
2517 now = ddi_get_lbolt();
2518
2519 /*
2520 * If this buffer is here because of a prefetch, then either:
2521 * - clear the flag if this is a "referencing" read
2522 * (any subsequent access will bump this into the MFU state).
2523 * or
2524 * - move the buffer to the head of the list if this is
2525 * another prefetch (to make it less likely to be evicted).
2526 */
2527 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2528 if (refcount_count(&buf->b_refcnt) == 0) {
2529 ASSERT(list_link_active(&buf->b_arc_node));
2530 } else {
2531 buf->b_flags &= ~ARC_PREFETCH;
2532 ARCSTAT_BUMP(arcstat_mru_hits);
2533 }
2534 buf->b_arc_access = now;
2535 return;
2536 }
2537
2538 /*
2539 * This buffer has been "accessed" only once so far,
2540 * but it is still in the cache. Move it to the MFU
2541 * state.
2542 */
2543 if (now > buf->b_arc_access + ARC_MINTIME) {
2544 /*
2545 * More than 125ms have passed since we
2546 * instantiated this buffer. Move it to the
2547 * most frequently used state.
2548 */
2549 buf->b_arc_access = now;
2550 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2551 arc_change_state(arc_mfu, buf, hash_lock);
2552 }
2553 ARCSTAT_BUMP(arcstat_mru_hits);
2554 } else if (buf->b_state == arc_mru_ghost) {
2555 arc_state_t *new_state;
2556 /*
2557 * This buffer has been "accessed" recently, but
2558 * was evicted from the cache. Move it to the
2559 * MFU state.
2560 */
2561
2562 if (buf->b_flags & ARC_PREFETCH) {
2563 new_state = arc_mru;
2564 if (refcount_count(&buf->b_refcnt) > 0)
2565 buf->b_flags &= ~ARC_PREFETCH;
2566 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2567 } else {
2568 new_state = arc_mfu;
2569 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2570 }
2571
2572 buf->b_arc_access = ddi_get_lbolt();
2573 arc_change_state(new_state, buf, hash_lock);
2574
2575 ARCSTAT_BUMP(arcstat_mru_ghost_hits);
2576 } else if (buf->b_state == arc_mfu) {
2577 /*
2578 * This buffer has been accessed more than once and is
2579 * still in the cache. Keep it in the MFU state.
2580 *
2581 * NOTE: an add_reference() that occurred when we did
2582 * the arc_read() will have kicked this off the list.
2583 * If it was a prefetch, we will explicitly move it to
2584 * the head of the list now.
2585 */
2586 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2587 ASSERT(refcount_count(&buf->b_refcnt) == 0);
2588 ASSERT(list_link_active(&buf->b_arc_node));
2589 }
2590 ARCSTAT_BUMP(arcstat_mfu_hits);
2591 buf->b_arc_access = ddi_get_lbolt();
2592 } else if (buf->b_state == arc_mfu_ghost) {
2593 arc_state_t *new_state = arc_mfu;
2594 /*
2595 * This buffer has been accessed more than once but has
2596 * been evicted from the cache. Move it back to the
2597 * MFU state.
2598 */
2599
2600 if (buf->b_flags & ARC_PREFETCH) {
2601 /*
2602 * This is a prefetch access...
2603 * move this block back to the MRU state.
2604 */
2605 ASSERT0(refcount_count(&buf->b_refcnt));
2606 new_state = arc_mru;
2607 }
2608
2609 buf->b_arc_access = ddi_get_lbolt();
2610 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2611 arc_change_state(new_state, buf, hash_lock);
2612
2613 ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
2614 } else if (buf->b_state == arc_l2c_only) {
2615 /*
2616 * This buffer is on the 2nd Level ARC.
2617 */
2618
2619 buf->b_arc_access = ddi_get_lbolt();
2620 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2621 arc_change_state(arc_mfu, buf, hash_lock);
2622 } else {
2623 ASSERT(!"invalid arc state");
2624 }
2625 }
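
/*
 * Summary of the state transitions performed above (informational
 * only):
 *
 *	anon       -> mru		first insertion into the cache
 *	mru        -> mfu		re-accessed > ARC_MINTIME later
 *	mru ghost  -> mru | mfu		prefetch | demand read after evict
 *	mfu ghost  -> mru | mfu		prefetch | demand read after evict
 *	l2c_only   -> mfu		block faulted back in via L2ARC
 */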
2626
2627 /* a generic arc_done_func_t which you can use */
2628 /* ARGSUSED */
2629 void
2630 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
2631 {
2632 if (zio == NULL || zio->io_error == 0)
2633 bcopy(buf->b_data, arg, buf->b_hdr->b_size);
2634 VERIFY(arc_buf_remove_ref(buf, arg));
2635 }
2636
2637 /* a generic arc_done_func_t */
2638 void
2639 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
2640 {
2641 arc_buf_t **bufp = arg;
2642 if (zio && zio->io_error) {
2643 VERIFY(arc_buf_remove_ref(buf, arg));
2644 *bufp = NULL;
2645 } else {
2646 *bufp = buf;
2647 ASSERT(buf->b_data);
2648 }
2649 }
2650
2651 static void
2652 arc_read_done(zio_t *zio)
2653 {
2654 arc_buf_hdr_t *hdr, *found;
2655 arc_buf_t *buf;
2656 arc_buf_t *abuf; /* buffer we're assigning to callback */
2657 kmutex_t *hash_lock;
2658 arc_callback_t *callback_list, *acb;
2659 int freeable = FALSE;
2660
2661 buf = zio->io_private;
2662 hdr = buf->b_hdr;
2663
2664 /*
2665 * The hdr was inserted into the hash table and removed from lists
2666 * prior to starting I/O. We should find this header, since
2667 * it's in the hash table, and it should be legit since it's
2668 * not possible to evict it during the I/O. The only possible
2669 * reason for it not to be found is if we were freed during the
2670 * read.
2671 */
2672 found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
2673 &hash_lock);
2674
2675 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
2676 (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
2677 (found == hdr && HDR_L2_READING(hdr)));
2678
2679 hdr->b_flags &= ~ARC_L2_EVICTED;
2680 if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
2681 hdr->b_flags &= ~ARC_L2CACHE;
2682
2683 /* byteswap if necessary */
2684 callback_list = hdr->b_acb;
2685 ASSERT(callback_list != NULL);
2686 if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
2687 dmu_object_byteswap_t bswap =
2688 DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
2689 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
2690 byteswap_uint64_array :
2691 dmu_ot_byteswap[bswap].ob_func;
2692 func(buf->b_data, hdr->b_size);
2693 }
2694
2695 arc_cksum_compute(buf, B_FALSE);
2696 arc_buf_watch(buf);
2697
2698 if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
2699 /*
2700 * Only call arc_access on anonymous buffers. This is because
2701 * if we've issued an I/O for an evicted buffer, we've already
2702 * called arc_access (to prevent any simultaneous readers from
2703 * getting confused).
2704 */
2705 arc_access(hdr, hash_lock);
2706 }
2707
2708 /* create copies of the data buffer for the callers */
2709 abuf = buf;
2710 for (acb = callback_list; acb; acb = acb->acb_next) {
2711 if (acb->acb_done) {
2712 if (abuf == NULL) {
2713 ARCSTAT_BUMP(arcstat_duplicate_reads);
2714 abuf = arc_buf_clone(buf);
2715 }
2716 acb->acb_buf = abuf;
2717 abuf = NULL;
2718 }
2719 }
2720 hdr->b_acb = NULL;
2721 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
2722 ASSERT(!HDR_BUF_AVAILABLE(hdr));
2723 if (abuf == buf) {
2724 ASSERT(buf->b_efunc == NULL);
2725 ASSERT(hdr->b_datacnt == 1);
2726 hdr->b_flags |= ARC_BUF_AVAILABLE;
2727 }
2728
2729 ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
2730
2731 if (zio->io_error != 0) {
2732 hdr->b_flags |= ARC_IO_ERROR;
2733 if (hdr->b_state != arc_anon)
2734 arc_change_state(arc_anon, hdr, hash_lock);
2735 if (HDR_IN_HASH_TABLE(hdr))
2736 buf_hash_remove(hdr);
2737 freeable = refcount_is_zero(&hdr->b_refcnt);
2738 }
2739
2740 /*
2741 * Broadcast before we drop the hash_lock to avoid the possibility
2742 * that the hdr (and hence the cv) might be freed before we get to
2743 * the cv_broadcast().
2744 */
2745 cv_broadcast(&hdr->b_cv);
2746
2747 if (hash_lock) {
2748 mutex_exit(hash_lock);
2749 } else {
2750 /*
2751 * This block was freed while we waited for the read to
2752 * complete. It has been removed from the hash table and
2753 * moved to the anonymous state (so that it won't show up
2754 * in the cache).
2755 */
2756 ASSERT3P(hdr->b_state, ==, arc_anon);
2757 freeable = refcount_is_zero(&hdr->b_refcnt);
2758 }
2759
2760 /* execute each callback and free its structure */
2761 while ((acb = callback_list) != NULL) {
2762 if (acb->acb_done)
2763 acb->acb_done(zio, acb->acb_buf, acb->acb_private);
2764
2765 if (acb->acb_zio_dummy != NULL) {
2766 acb->acb_zio_dummy->io_error = zio->io_error;
2767 zio_nowait(acb->acb_zio_dummy);
2768 }
2769
2770 callback_list = acb->acb_next;
2771 kmem_free(acb, sizeof (arc_callback_t));
2772 }
2773
2774 if (freeable)
2775 arc_hdr_destroy(hdr);
2776 }
2777
2778 /*
2779 * "Read" the block at the specified DVA (in bp) via the
2780 * cache. If the block is found in the cache, invoke the provided
2781 * callback immediately and return. Note that the `zio' parameter
2782 * in the callback will be NULL in this case, since no IO was
2783 * required. If the block is not in the cache pass the read request
2784 * on to the spa with a substitute callback function, so that the
2785 * requested block will be added to the cache.
2786 *
2787 * If a read request arrives for a block that has a read in-progress,
2788 * either wait for the in-progress read to complete (and return the
2789 * results); or, if this is a read with a "done" func, add a record
2790 * to the read to invoke the "done" func when the read completes,
2791 * and return; or just return.
2792 *
2793 * arc_read_done() will invoke all the requested "done" functions
2794 * for readers of this block.
2795 */
2796 int
2797 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
2798 void *private, int priority, int zio_flags, uint32_t *arc_flags,
2799 const zbookmark_t *zb)
2800 {
2801 arc_buf_hdr_t *hdr;
2802 arc_buf_t *buf = NULL;
2803 kmutex_t *hash_lock;
2804 zio_t *rzio;
2805 uint64_t guid = spa_load_guid(spa);
2806
2807 top:
2808 hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
2809 &hash_lock);
2810 if (hdr && hdr->b_datacnt > 0) {
2811
2812 *arc_flags |= ARC_CACHED;
2813
2814 if (HDR_IO_IN_PROGRESS(hdr)) {
2815
2816 if (*arc_flags & ARC_WAIT) {
2817 cv_wait(&hdr->b_cv, hash_lock);
2818 mutex_exit(hash_lock);
2819 goto top;
2820 }
2821 ASSERT(*arc_flags & ARC_NOWAIT);
2822
2823 if (done) {
2824 arc_callback_t *acb = NULL;
2825
2826 acb = kmem_zalloc(sizeof (arc_callback_t),
2827 KM_SLEEP);
2828 acb->acb_done = done;
2829 acb->acb_private = private;
2830 if (pio != NULL)
2831 acb->acb_zio_dummy = zio_null(pio,
2832 spa, NULL, NULL, NULL, zio_flags);
2833
2834 ASSERT(acb->acb_done != NULL);
2835 acb->acb_next = hdr->b_acb;
2836 hdr->b_acb = acb;
2837 add_reference(hdr, hash_lock, private);
2838 mutex_exit(hash_lock);
2839 return (0);
2840 }
2841 mutex_exit(hash_lock);
2842 return (0);
2843 }
2844
2845 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
2846
2847 if (done) {
2848 add_reference(hdr, hash_lock, private);
2849 /*
2850 * If this block is already in use, create a new
2851 * copy of the data so that we will be guaranteed
2852 * that arc_release() will always succeed.
2853 */
2854 buf = hdr->b_buf;
2855 ASSERT(buf);
2856 ASSERT(buf->b_data);
2857 if (HDR_BUF_AVAILABLE(hdr)) {
2858 ASSERT(buf->b_efunc == NULL);
2859 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
2860 } else {
2861 buf = arc_buf_clone(buf);
2862 }
2863
2864 } else if (*arc_flags & ARC_PREFETCH &&
2865 refcount_count(&hdr->b_refcnt) == 0) {
2866 hdr->b_flags |= ARC_PREFETCH;
2867 }
2868 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
2869 arc_access(hdr, hash_lock);
2870 if (*arc_flags & ARC_L2CACHE)
2871 hdr->b_flags |= ARC_L2CACHE;
2872 mutex_exit(hash_lock);
2873 ARCSTAT_BUMP(arcstat_hits);
2874 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
2875 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
2876 data, metadata, hits);
2877
2878 if (done)
2879 done(NULL, buf, private);
2880 } else {
2881 uint64_t size = BP_GET_LSIZE(bp);
2882 arc_callback_t *acb;
2883 vdev_t *vd = NULL;
2884 uint64_t addr = 0;
2885 boolean_t devw = B_FALSE;
2886
2887 if (hdr == NULL) {
2888 /* this block is not in the cache */
2889 arc_buf_hdr_t *exists;
2890 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
2891 buf = arc_buf_alloc(spa, size, private, type);
2892 hdr = buf->b_hdr;
2893 hdr->b_dva = *BP_IDENTITY(bp);
2894 hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
2895 hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
2896 exists = buf_hash_insert(hdr, &hash_lock);
2897 if (exists) {
2898 /* somebody beat us to the hash insert */
2899 mutex_exit(hash_lock);
2900 buf_discard_identity(hdr);
2901 (void) arc_buf_remove_ref(buf, private);
2902 goto top; /* restart the IO request */
2903 }
2904 /* if this is a prefetch, we don't have a reference */
2905 if (*arc_flags & ARC_PREFETCH) {
2906 (void) remove_reference(hdr, hash_lock,
2907 private);
2908 hdr->b_flags |= ARC_PREFETCH;
2909 }
2910 if (*arc_flags & ARC_L2CACHE)
2911 hdr->b_flags |= ARC_L2CACHE;
2912 if (BP_GET_LEVEL(bp) > 0)
2913 hdr->b_flags |= ARC_INDIRECT;
2914 } else {
2915 /* this block is in the ghost cache */
2916 ASSERT(GHOST_STATE(hdr->b_state));
2917 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2918 ASSERT0(refcount_count(&hdr->b_refcnt));
2919 ASSERT(hdr->b_buf == NULL);
2920
2921 /* if this is a prefetch, we don't have a reference */
2922 if (*arc_flags & ARC_PREFETCH)
2923 hdr->b_flags |= ARC_PREFETCH;
2924 else
2925 add_reference(hdr, hash_lock, private);
2926 if (*arc_flags & ARC_L2CACHE)
2927 hdr->b_flags |= ARC_L2CACHE;
2928 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
2929 buf->b_hdr = hdr;
2930 buf->b_data = NULL;
2931 buf->b_efunc = NULL;
2932 buf->b_private = NULL;
2933 buf->b_next = NULL;
2934 hdr->b_buf = buf;
2935 ASSERT(hdr->b_datacnt == 0);
2936 hdr->b_datacnt = 1;
2937 arc_get_data_buf(buf);
2938 arc_access(hdr, hash_lock);
2939 }
2940
2941 ASSERT(!GHOST_STATE(hdr->b_state));
2942
2943 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
2944 acb->acb_done = done;
2945 acb->acb_private = private;
2946
2947 ASSERT(hdr->b_acb == NULL);
2948 hdr->b_acb = acb;
2949 hdr->b_flags |= ARC_IO_IN_PROGRESS;
2950
2951 if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
2952 (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
2953 devw = hdr->b_l2hdr->b_dev->l2ad_writing;
2954 addr = hdr->b_l2hdr->b_daddr;
2955 /*
2956 * Lock out device removal.
2957 */
2958 if (vdev_is_dead(vd) ||
2959 !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
2960 vd = NULL;
2961 }
2962
2963 mutex_exit(hash_lock);
2964
2965 /*
2966 * At this point, we have a level 1 cache miss. Try again in
2967 * L2ARC if possible.
2968 */
2969 ASSERT3U(hdr->b_size, ==, size);
2970 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
2971 uint64_t, size, zbookmark_t *, zb);
2972 ARCSTAT_BUMP(arcstat_misses);
2973 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
2974 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
2975 data, metadata, misses);
2976
2977 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
2978 /*
2979 * Read from the L2ARC if the following are true:
2980 * 1. The L2ARC vdev was previously cached.
2981 * 2. This buffer still has L2ARC metadata.
2982 * 3. This buffer isn't currently writing to the L2ARC.
2983 * 4. The L2ARC entry wasn't evicted, which may
2984 * also have invalidated the vdev.
2985 * 5. This isn't a prefetch, or l2arc_noprefetch is not set.
2986 */
2987 if (hdr->b_l2hdr != NULL &&
2988 !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
2989 !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
2990 l2arc_read_callback_t *cb;
2991
2992 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
2993 ARCSTAT_BUMP(arcstat_l2_hits);
2994
2995 cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
2996 KM_SLEEP);
2997 cb->l2rcb_buf = buf;
2998 cb->l2rcb_spa = spa;
2999 cb->l2rcb_bp = *bp;
3000 cb->l2rcb_zb = *zb;
3001 cb->l2rcb_flags = zio_flags;
3002
3003 ASSERT(addr >= VDEV_LABEL_START_SIZE &&
3004 addr + size < vd->vdev_psize -
3005 VDEV_LABEL_END_SIZE);
3006
3007 /*
3008 * l2arc read. The SCL_L2ARC lock will be
3009 * released by l2arc_read_done().
3010 */
3011 rzio = zio_read_phys(pio, vd, addr, size,
3012 buf->b_data, ZIO_CHECKSUM_OFF,
3013 l2arc_read_done, cb, priority, zio_flags |
3014 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
3015 ZIO_FLAG_DONT_PROPAGATE |
3016 ZIO_FLAG_DONT_RETRY, B_FALSE);
3017 DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
3018 zio_t *, rzio);
3019 ARCSTAT_INCR(arcstat_l2_read_bytes, size);
3020
3021 if (*arc_flags & ARC_NOWAIT) {
3022 zio_nowait(rzio);
3023 return (0);
3024 }
3025
3026 ASSERT(*arc_flags & ARC_WAIT);
3027 if (zio_wait(rzio) == 0)
3028 return (0);
3029
3030 /* l2arc read error; goto zio_read() */
3031 } else {
3032 DTRACE_PROBE1(l2arc__miss,
3033 arc_buf_hdr_t *, hdr);
3034 ARCSTAT_BUMP(arcstat_l2_misses);
3035 if (HDR_L2_WRITING(hdr))
3036 ARCSTAT_BUMP(arcstat_l2_rw_clash);
3037 spa_config_exit(spa, SCL_L2ARC, vd);
3038 }
3039 } else {
3040 if (vd != NULL)
3041 spa_config_exit(spa, SCL_L2ARC, vd);
3042 if (l2arc_ndev != 0) {
3043 DTRACE_PROBE1(l2arc__miss,
3044 arc_buf_hdr_t *, hdr);
3045 ARCSTAT_BUMP(arcstat_l2_misses);
3046 }
3047 }
3048
3049 rzio = zio_read(pio, spa, bp, buf->b_data, size,
3050 arc_read_done, buf, priority, zio_flags, zb);
3051
3052 if (*arc_flags & ARC_WAIT)
3053 return (zio_wait(rzio));
3054
3055 ASSERT(*arc_flags & ARC_NOWAIT);
3056 zio_nowait(rzio);
3057 }
3058 return (0);
3059 }
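
/*
 * Illustrative sketch (not part of the build): a synchronous, cacheable
 * read through the ARC. `spa', `bp' and `zb' are assumed to be supplied
 * by the caller; the done callback stores the buffer for the caller (as
 * arc_getbuf_func() above does), and the caller drops its reference
 * when finished.
 *
 *	uint32_t flags = ARC_WAIT;
 *	arc_buf_t *abuf = NULL;
 *	int error;
 *
 *	error = arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
 *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
 *	if (error == 0 && abuf != NULL) {
 *		... use abuf->b_data ...
 *		VERIFY(arc_buf_remove_ref(abuf, &abuf));
 *	}
 */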
3060
3061 void
3062 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
3063 {
3064 ASSERT(buf->b_hdr != NULL);
3065 ASSERT(buf->b_hdr->b_state != arc_anon);
3066 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
3067 ASSERT(buf->b_efunc == NULL);
3068 ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
3069
3070 buf->b_efunc = func;
3071 buf->b_private = private;
3072 }
3073
3074 /*
3075 * This is used by the DMU to let the ARC know that a buffer is
3076 * being evicted, so the ARC should clean up. If this arc buf
3077 * is not yet in the evicted state, it will be put there.
3078 */
3079 int
3080 arc_buf_evict(arc_buf_t *buf)
3081 {
3082 arc_buf_hdr_t *hdr;
3083 kmutex_t *hash_lock;
3084 arc_buf_t **bufp;
3085
3086 mutex_enter(&buf->b_evict_lock);
3087 hdr = buf->b_hdr;
3088 if (hdr == NULL) {
3089 /*
3090 * We are in arc_do_user_evicts().
3091 */
3092 ASSERT(buf->b_data == NULL);
3093 mutex_exit(&buf->b_evict_lock);
3094 return (0);
3095 } else if (buf->b_data == NULL) {
3096 arc_buf_t copy = *buf; /* structure assignment */
3097 /*
3098 * We are on the eviction list; process this buffer now
3099 * but let arc_do_user_evicts() do the reaping.
3100 */
3101 buf->b_efunc = NULL;
3102 mutex_exit(&buf->b_evict_lock);
3103 VERIFY(copy.b_efunc(&copy) == 0);
3104 return (1);
3105 }
3106 hash_lock = HDR_LOCK(hdr);
3107 mutex_enter(hash_lock);
3108 hdr = buf->b_hdr;
3109 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3110
3111 ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
3112 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3113
3114 /*
3115 * Pull this buffer off of the hdr
3116 */
3117 bufp = &hdr->b_buf;
3118 while (*bufp != buf)
3119 bufp = &(*bufp)->b_next;
3120 *bufp = buf->b_next;
3121
3122 ASSERT(buf->b_data != NULL);
3123 arc_buf_destroy(buf, FALSE, FALSE);
3124
3125 if (hdr->b_datacnt == 0) {
3126 arc_state_t *old_state = hdr->b_state;
3127 arc_state_t *evicted_state;
3128
3129 ASSERT(hdr->b_buf == NULL);
3130 ASSERT(refcount_is_zero(&hdr->b_refcnt));
3131
3132 evicted_state =
3133 (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
3134
3135 mutex_enter(&old_state->arcs_mtx);
3136 mutex_enter(&evicted_state->arcs_mtx);
3137
3138 arc_change_state(evicted_state, hdr, hash_lock);
3139 ASSERT(HDR_IN_HASH_TABLE(hdr));
3140 hdr->b_flags |= ARC_IN_HASH_TABLE;
3141 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3142
3143 mutex_exit(&evicted_state->arcs_mtx);
3144 mutex_exit(&old_state->arcs_mtx);
3145 }
3146 mutex_exit(hash_lock);
3147 mutex_exit(&buf->b_evict_lock);
3148
3149 VERIFY(buf->b_efunc(buf) == 0);
3150 buf->b_efunc = NULL;
3151 buf->b_private = NULL;
3152 buf->b_hdr = NULL;
3153 buf->b_next = NULL;
3154 kmem_cache_free(buf_cache, buf);
3155 return (1);
3156 }
3157
3158 /*
3159 * Release this buffer from the cache, making it an anonymous buffer. This
3160 * must be done after a read and prior to modifying the buffer contents.
3161 * If the buffer has more than one reference, we must make
3162 * a new hdr for the buffer.
3163 */
3164 void
3165 arc_release(arc_buf_t *buf, void *tag)
3166 {
3167 arc_buf_hdr_t *hdr;
3168 kmutex_t *hash_lock = NULL;
3169 l2arc_buf_hdr_t *l2hdr;
3170 uint64_t buf_size;
3171
3172 /*
3173 * It would be nice to assert that if it's DMU metadata (level >
3174 * 0 || it's the dnode file), then it must be syncing context.
3175 * But we don't know that information at this level.
3176 */
3177
3178 mutex_enter(&buf->b_evict_lock);
3179 hdr = buf->b_hdr;
3180
3181 /* this buffer is not on any list */
3182 ASSERT(refcount_count(&hdr->b_refcnt) > 0);
3183
3184 if (hdr->b_state == arc_anon) {
3185 /* this buffer is already released */
3186 ASSERT(buf->b_efunc == NULL);
3187 } else {
3188 hash_lock = HDR_LOCK(hdr);
3189 mutex_enter(hash_lock);
3190 hdr = buf->b_hdr;
3191 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3192 }
3193
3194 l2hdr = hdr->b_l2hdr;
3195 if (l2hdr) {
3196 mutex_enter(&l2arc_buflist_mtx);
3197 hdr->b_l2hdr = NULL;
3198 }
3199 buf_size = hdr->b_size;
3200
3201 /*
3202 * Do we have more than one buf?
3203 */
3204 if (hdr->b_datacnt > 1) {
3205 arc_buf_hdr_t *nhdr;
3206 arc_buf_t **bufp;
3207 uint64_t blksz = hdr->b_size;
3208 uint64_t spa = hdr->b_spa;
3209 arc_buf_contents_t type = hdr->b_type;
3210 uint32_t flags = hdr->b_flags;
3211
3212 ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
3213 /*
3214 * Pull the data off of this hdr and attach it to
3215 * a new anonymous hdr.
3216 */
3217 (void) remove_reference(hdr, hash_lock, tag);
3218 bufp = &hdr->b_buf;
3219 while (*bufp != buf)
3220 bufp = &(*bufp)->b_next;
3221 *bufp = buf->b_next;
3222 buf->b_next = NULL;
3223
3224 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
3225 atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
3226 if (refcount_is_zero(&hdr->b_refcnt)) {
3227 uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
3228 ASSERT3U(*size, >=, hdr->b_size);
3229 atomic_add_64(size, -hdr->b_size);
3230 }
3231
3232 /*
3233 * We're releasing a duplicate user data buffer, so update
3234 * our statistics accordingly.
3235 */
3236 if (hdr->b_type == ARC_BUFC_DATA) {
3237 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
3238 ARCSTAT_INCR(arcstat_duplicate_buffers_size,
3239 -hdr->b_size);
3240 }
3241 hdr->b_datacnt -= 1;
3242 arc_cksum_verify(buf);
3243 arc_buf_unwatch(buf);
3244
3245 mutex_exit(hash_lock);
3246
3247 nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
3248 nhdr->b_size = blksz;
3249 nhdr->b_spa = spa;
3250 nhdr->b_type = type;
3251 nhdr->b_buf = buf;
3252 nhdr->b_state = arc_anon;
3253 nhdr->b_arc_access = 0;
3254 nhdr->b_flags = flags & ARC_L2_WRITING;
3255 nhdr->b_l2hdr = NULL;
3256 nhdr->b_datacnt = 1;
3257 nhdr->b_freeze_cksum = NULL;
3258 (void) refcount_add(&nhdr->b_refcnt, tag);
3259 buf->b_hdr = nhdr;
3260 mutex_exit(&buf->b_evict_lock);
3261 atomic_add_64(&arc_anon->arcs_size, blksz);
3262 } else {
3263 mutex_exit(&buf->b_evict_lock);
3264 ASSERT(refcount_count(&hdr->b_refcnt) == 1);
3265 ASSERT(!list_link_active(&hdr->b_arc_node));
3266 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3267 if (hdr->b_state != arc_anon)
3268 arc_change_state(arc_anon, hdr, hash_lock);
3269 hdr->b_arc_access = 0;
3270 if (hash_lock)
3271 mutex_exit(hash_lock);
3272
3273 buf_discard_identity(hdr);
3274 arc_buf_thaw(buf);
3275 }
3276 buf->b_efunc = NULL;
3277 buf->b_private = NULL;
3278
3279 if (l2hdr) {
3280 list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
3281 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
3282 ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3283 mutex_exit(&l2arc_buflist_mtx);
3284 }
3285 }
3286
3287 int
3288 arc_released(arc_buf_t *buf)
3289 {
3290 int released;
3291
3292 mutex_enter(&buf->b_evict_lock);
3293 released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
3294 mutex_exit(&buf->b_evict_lock);
3295 return (released);
3296 }
3297
3298 int
3299 arc_has_callback(arc_buf_t *buf)
3300 {
3301 int callback;
3302
3303 mutex_enter(&buf->b_evict_lock);
3304 callback = (buf->b_efunc != NULL);
3305 mutex_exit(&buf->b_evict_lock);
3306 return (callback);
3307 }
3308
3309 #ifdef ZFS_DEBUG
3310 int
3311 arc_referenced(arc_buf_t *buf)
3312 {
3313 int referenced;
3314
3315 mutex_enter(&buf->b_evict_lock);
3316 referenced = (refcount_count(&buf->b_hdr->b_refcnt));
3317 mutex_exit(&buf->b_evict_lock);
3318 return (referenced);
3319 }
3320 #endif
3321
3322 static void
3323 arc_write_ready(zio_t *zio)
3324 {
3325 arc_write_callback_t *callback = zio->io_private;
3326 arc_buf_t *buf = callback->awcb_buf;
3327 arc_buf_hdr_t *hdr = buf->b_hdr;
3328
3329 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
3330 callback->awcb_ready(zio, buf, callback->awcb_private);
3331
3332 /*
3333 * If the IO is already in progress, then this is a re-write
3334 * attempt, so we need to thaw and re-compute the cksum.
3335 * It is the responsibility of the callback to handle the
3336 * accounting for any re-write attempt.
3337 */
3338 if (HDR_IO_IN_PROGRESS(hdr)) {
3339 mutex_enter(&hdr->b_freeze_lock);
3340 if (hdr->b_freeze_cksum != NULL) {
3341 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3342 hdr->b_freeze_cksum = NULL;
3343 }
3344 mutex_exit(&hdr->b_freeze_lock);
3345 }
3346 arc_cksum_compute(buf, B_FALSE);
3347 hdr->b_flags |= ARC_IO_IN_PROGRESS;
3348 }
3349
3350 static void
3351 arc_write_done(zio_t *zio)
3352 {
3353 arc_write_callback_t *callback = zio->io_private;
3354 arc_buf_t *buf = callback->awcb_buf;
3355 arc_buf_hdr_t *hdr = buf->b_hdr;
3356
3357 ASSERT(hdr->b_acb == NULL);
3358
3359 if (zio->io_error == 0) {
3360 hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3361 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
3362 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3363 } else {
3364 ASSERT(BUF_EMPTY(hdr));
3365 }
3366
3367 /*
3368 * If the block to be written was all-zero, we may have
3369 * compressed it away. In this case no write was performed
3370 * so there will be no dva/birth/checksum. The buffer must
3371 * therefore remain anonymous (and uncached).
3372 */
3373 if (!BUF_EMPTY(hdr)) {
3374 arc_buf_hdr_t *exists;
3375 kmutex_t *hash_lock;
3376
3377 ASSERT(zio->io_error == 0);
3378
3379 arc_cksum_verify(buf);
3380
3381 exists = buf_hash_insert(hdr, &hash_lock);
3382 if (exists) {
3383 /*
3384 * This can only happen if we overwrite for
3385 * sync-to-convergence, because we remove
3386 * buffers from the hash table when we arc_free().
3387 */
3388 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
3389 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3390 panic("bad overwrite, hdr=%p exists=%p",
3391 (void *)hdr, (void *)exists);
3392 ASSERT(refcount_is_zero(&exists->b_refcnt));
3393 arc_change_state(arc_anon, exists, hash_lock);
3394 mutex_exit(hash_lock);
3395 arc_hdr_destroy(exists);
3396 exists = buf_hash_insert(hdr, &hash_lock);
3397 ASSERT3P(exists, ==, NULL);
3398 } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
3399 /* nopwrite */
3400 ASSERT(zio->io_prop.zp_nopwrite);
3401 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3402 panic("bad nopwrite, hdr=%p exists=%p",
3403 (void *)hdr, (void *)exists);
3404 } else {
3405 /* Dedup */
3406 ASSERT(hdr->b_datacnt == 1);
3407 ASSERT(hdr->b_state == arc_anon);
3408 ASSERT(BP_GET_DEDUP(zio->io_bp));
3409 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
3410 }
3411 }
3412 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3413 /* if it's not anon, we are doing a scrub */
3414 if (!exists && hdr->b_state == arc_anon)
3415 arc_access(hdr, hash_lock);
3416 mutex_exit(hash_lock);
3417 } else {
3418 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3419 }
3420
3421 ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3422 callback->awcb_done(zio, buf, callback->awcb_private);
3423
3424 kmem_free(callback, sizeof (arc_write_callback_t));
3425 }
3426
3427 zio_t *
3428 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3429 blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
3430 arc_done_func_t *ready, arc_done_func_t *done, void *private,
3431 int priority, int zio_flags, const zbookmark_t *zb)
3432 {
3433 arc_buf_hdr_t *hdr = buf->b_hdr;
3434 arc_write_callback_t *callback;
3435 zio_t *zio;
3436
3437 ASSERT(ready != NULL);
3438 ASSERT(done != NULL);
3439 ASSERT(!HDR_IO_ERROR(hdr));
3440 ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3441 ASSERT(hdr->b_acb == NULL);
3442 if (l2arc)
3443 hdr->b_flags |= ARC_L2CACHE;
3444 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
3445 callback->awcb_ready = ready;
3446 callback->awcb_done = done;
3447 callback->awcb_private = private;
3448 callback->awcb_buf = buf;
3449
3450 zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
3451 arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
3452
3453 return (zio);
3454 }
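
/*
 * Illustrative sketch (not part of the build): queueing an ARC buffer
 * for write. `pio', `spa', `txg', `bp', `zb' and the zio_prop_t `zp'
 * are assumed to come from the caller (in practice the DMU supplies
 * them); `my_ready_cb', `my_done_cb' and `cb_arg' are hypothetical
 * names. The returned zio is then issued with zio_nowait().
 *
 *	zio = arc_write(pio, spa, txg, bp, buf, B_TRUE, &zp,
 *	    my_ready_cb, my_done_cb, cb_arg, ZIO_PRIORITY_ASYNC_WRITE,
 *	    ZIO_FLAG_MUSTSUCCEED, zb);
 *	zio_nowait(zio);
 */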
3455
3456 static int
3457 arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
3458 {
3459 #ifdef _KERNEL
3460 uint64_t available_memory = ptob(freemem);
3461 static uint64_t page_load = 0;
3462 static uint64_t last_txg = 0;
3463
3464 #if defined(__i386)
3465 available_memory =
3466 MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
3467 #endif
3468 if (available_memory >= zfs_write_limit_max)
3469 return (0);
3470
3471 if (txg > last_txg) {
3472 last_txg = txg;
3473 page_load = 0;
3474 }
3475 /*
3476 * If we are in pageout, we know that memory is already tight,
3477 * the arc is already going to be evicting, so we just want to
3478 * continue to let page writes occur as quickly as possible.
3479 */
3480 if (curproc == proc_pageout) {
3481 if (page_load > MAX(ptob(minfree), available_memory) / 4)
3482 return (SET_ERROR(ERESTART));
3483 /* Note: reserve is inflated, so we deflate */
3484 page_load += reserve / 8;
3485 return (0);
3486 } else if (page_load > 0 && arc_reclaim_needed()) {
3487 /* memory is low, delay before restarting */
3488 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3489 return (SET_ERROR(EAGAIN));
3490 }
3491 page_load = 0;
3492
3493 if (arc_size > arc_c_min) {
3494 uint64_t evictable_memory =
3495 arc_mru->arcs_lsize[ARC_BUFC_DATA] +
3496 arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
3497 arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
3498 arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
3499 available_memory += MIN(evictable_memory, arc_size - arc_c_min);
3500 }
3501
3502 if (inflight_data > available_memory / 4) {
3503 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3504 return (SET_ERROR(ERESTART));
3505 }
3506 #endif
3507 return (0);
3508 }
3509
3510 void
3511 arc_tempreserve_clear(uint64_t reserve)
3512 {
3513 atomic_add_64(&arc_tempreserve, -reserve);
3514 ASSERT((int64_t)arc_tempreserve >= 0);
3515 }
3516
3517 int
3518 arc_tempreserve_space(uint64_t reserve, uint64_t txg)
3519 {
3520 int error;
3521 uint64_t anon_size;
3522
3523 #ifdef ZFS_DEBUG
3524 /*
3525 * Once in a while, fail for no reason. Everything should cope.
3526 */
3527 if (spa_get_random(10000) == 0) {
3528 dprintf("forcing random failure\n");
3529 return (SET_ERROR(ERESTART));
3530 }
3531 #endif
3532 if (reserve > arc_c/4 && !arc_no_grow)
3533 arc_c = MIN(arc_c_max, reserve * 4);
3534 if (reserve > arc_c)
3535 return (SET_ERROR(ENOMEM));
3536
3537 /*
3538 * Don't count loaned bufs as in flight dirty data to prevent long
3539 * network delays from blocking transactions that are ready to be
3540 * assigned to a txg.
3541 */
3542 anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
3543
3544 /*
3545 * Writes will, almost always, require additional memory allocations
3546 * in order to compress/encrypt/etc the data. We therefore need to
3547 * make sure that there is sufficient available memory for this.
3548 */
3549 if (error = arc_memory_throttle(reserve, anon_size, txg))
3550 return (error);
3551
3552 /*
3553 * Throttle writes when the amount of dirty data in the cache
3554 * gets too large. We try to keep the cache less than half full
3555 * of dirty blocks so that our sync times don't grow too large.
3556 * Note: if two requests come in concurrently, we might let them
3557 * both succeed, when one of them should fail. Not a huge deal.
3558 */
3559
3560 if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
3561 anon_size > arc_c / 4) {
3562 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
3563 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
3564 arc_tempreserve>>10,
3565 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
3566 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
3567 reserve>>10, arc_c>>10);
3568 return (SET_ERROR(ERESTART));
3569 }
3570 atomic_add_64(&arc_tempreserve, reserve);
3571 return (0);
3572 }
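
/*
 * Worked example of the write throttle above (hypothetical numbers):
 * with arc_c = 4GB, a new reservation is refused with ERESTART once
 *
 *	reserve + arc_tempreserve + anon_size > 2GB	(arc_c / 2)
 *
 * and the anonymous (dirty) portion alone exceeds 1GB (arc_c / 4); the
 * caller is expected to retry once some of the dirty data has synced.
 */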
3573
3574 void
3575 arc_init(void)
3576 {
3577 mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
3578 cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
3579
3580 /* Convert seconds to clock ticks */
3581 arc_min_prefetch_lifespan = 1 * hz;
3582
3583 /* Start out with 1/8 of all memory */
3584 arc_c = physmem * PAGESIZE / 8;
3585
3586 #ifdef _KERNEL
3587 /*
3588 * On architectures where the physical memory can be larger
3589 * than the addressable space (intel in 32-bit mode), we may
3590 * need to limit the cache to 1/8 of VM size.
3591 */
3592 arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
3593 #endif
3594
3595 /* set min cache to 1/32 of all memory, or 64MB, whichever is more */
3596 arc_c_min = MAX(arc_c / 4, 64<<20);
3597 /* set max to 3/4 of all memory, or all but 1GB, whichever is more */
3598 if (arc_c * 8 >= 1<<30)
3599 arc_c_max = (arc_c * 8) - (1<<30);
3600 else
3601 arc_c_max = arc_c_min;
3602 arc_c_max = MAX(arc_c * 6, arc_c_max);
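
/*
 * For example (hypothetical machine), with 16GB of physical memory the
 * defaults above work out to arc_c = 2GB initially, arc_c_min = 512MB
 * and arc_c_max = 15GB (all memory less 1GB), before any tunable
 * overrides below are applied.
 */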
3603
3604 /*
3605 * Allow the tunables to override our calculations if they are
3606 * reasonable (i.e. over 64MB)
3607 */
3608 if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
3609 arc_c_max = zfs_arc_max;
3610 if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max)
3611 arc_c_min = zfs_arc_min;
3612
3613 arc_c = arc_c_max;
3614 arc_p = (arc_c >> 1);
3615
3616 /* limit meta-data to 1/4 of the arc capacity */
3617 arc_meta_limit = arc_c_max / 4;
3618
3619 /* Allow the tunable to override if it is reasonable */
3620 if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
3621 arc_meta_limit = zfs_arc_meta_limit;
3622
3623 if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
3624 arc_c_min = arc_meta_limit / 2;
3625
3626 if (zfs_arc_grow_retry > 0)
3627 arc_grow_retry = zfs_arc_grow_retry;
3628
3629 if (zfs_arc_shrink_shift > 0)
3630 arc_shrink_shift = zfs_arc_shrink_shift;
3631
3632 if (zfs_arc_p_min_shift > 0)
3633 arc_p_min_shift = zfs_arc_p_min_shift;
3634
3635 /* if kmem_flags are set, let's try to use less memory */
3636 if (kmem_debugging())
3637 arc_c = arc_c / 2;
3638 if (arc_c < arc_c_min)
3639 arc_c = arc_c_min;
3640
3641 arc_anon = &ARC_anon;
3642 arc_mru = &ARC_mru;
3643 arc_mru_ghost = &ARC_mru_ghost;
3644 arc_mfu = &ARC_mfu;
3645 arc_mfu_ghost = &ARC_mfu_ghost;
3646 arc_l2c_only = &ARC_l2c_only;
3647 arc_size = 0;
3648
3649 mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3650 mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3651 mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3652 mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3653 mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3654 mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3655
3656 list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
3657 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3658 list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
3659 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3660 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
3661 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3662 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
3663 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3664 list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
3665 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3666 list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
3667 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3668 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
3669 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3670 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
3671 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3672 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
3673 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3674 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
3675 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3676
3677 buf_init();
3678
3679 arc_thread_exit = 0;
3680 arc_eviction_list = NULL;
3681 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
3682 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
3683
3684 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
3685 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
3686
3687 if (arc_ksp != NULL) {
3688 arc_ksp->ks_data = &arc_stats;
3689 kstat_install(arc_ksp);
3690 }
3691
3692 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
3693 TS_RUN, minclsyspri);
3694
3695 arc_dead = FALSE;
3696 arc_warm = B_FALSE;
3697
3698 if (zfs_write_limit_max == 0)
3699 zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
3700 else
3701 zfs_write_limit_shift = 0;
3702 mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
3703 }
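
/*
 * A minimal sketch, for illustration only (not used by this file), of the
 * default sizing arithmetic in arc_init() above when no tunables are set,
 * kmem debugging is off, and the 32-bit kernel VA clamp is ignored. The
 * helper name and out-parameters are local to this example.
 */
static void
arc_default_limits_example(uint64_t physmem_bytes, uint64_t *minp,
    uint64_t *maxp)
{
	uint64_t c = physmem_bytes / 8;		/* start with 1/8 of memory */
	uint64_t min = MAX(c / 4, 64ULL << 20);	/* 1/32 of memory or 64MB */
	uint64_t max, meta_limit;

	/* max is all memory minus 1GB, but at least 3/4 of memory */
	if (c * 8 >= (1ULL << 30))
		max = (c * 8) - (1ULL << 30);
	else
		max = min;
	max = MAX(c * 6, max);

	/* metadata is limited to 1/4 of max; min is raised to half of that */
	meta_limit = max / 4;
	if (min < meta_limit / 2)
		min = meta_limit / 2;

	*minp = min;
	*maxp = max;
}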
3704
3705 void
3706 arc_fini(void)
3707 {
3708 mutex_enter(&arc_reclaim_thr_lock);
3709 arc_thread_exit = 1;
3710 while (arc_thread_exit != 0)
3711 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
3712 mutex_exit(&arc_reclaim_thr_lock);
3713
3714 arc_flush(NULL);
3715
3716 arc_dead = TRUE;
3717
3718 if (arc_ksp != NULL) {
3719 kstat_delete(arc_ksp);
3720 arc_ksp = NULL;
3721 }
3722
3723 mutex_destroy(&arc_eviction_mtx);
3724 mutex_destroy(&arc_reclaim_thr_lock);
3725 cv_destroy(&arc_reclaim_thr_cv);
3726
3727 list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
3728 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
3729 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
3730 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
3731 list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
3732 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
3733 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
3734 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
3735
3736 mutex_destroy(&arc_anon->arcs_mtx);
3737 mutex_destroy(&arc_mru->arcs_mtx);
3738 mutex_destroy(&arc_mru_ghost->arcs_mtx);
3739 mutex_destroy(&arc_mfu->arcs_mtx);
3740 mutex_destroy(&arc_mfu_ghost->arcs_mtx);
3741 mutex_destroy(&arc_l2c_only->arcs_mtx);
3742
3743 mutex_destroy(&zfs_write_limit_lock);
3744
3745 buf_fini();
3746
3747 ASSERT(arc_loaned_bytes == 0);
3748 }
3749
3750 /*
3751 * Level 2 ARC
3752 *
3753 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
3754 * It uses dedicated storage devices to hold cached data, which are populated
3755 * using large infrequent writes. The main role of this cache is to boost
3756 * the performance of random read workloads. The intended L2ARC devices
3757 * include short-stroked disks, solid state disks, and other media with
3758 * substantially faster read latency than disk.
3759 *
3760 * +-----------------------+
3761 * | ARC |
3762 * +-----------------------+
3763 * | ^ ^
3764 * | | |
3765 * l2arc_feed_thread() arc_read()
3766 * | | |
3767 * | l2arc read |
3768 * V | |
3769 * +---------------+ |
3770 * | L2ARC | |
3771 * +---------------+ |
3772 * | ^ |
3773 * l2arc_write() | |
3774 * | | |
3775 * V | |
3776 * +-------+ +-------+
3777 * | vdev | | vdev |
3778 * | cache | | cache |
3779 * +-------+ +-------+
3780 * +=========+ .-----.
3781 * : L2ARC : |-_____-|
3782 * : devices : | Disks |
3783 * +=========+ `-_____-'
3784 *
3785 * Read requests are satisfied from the following sources, in order:
3786 *
3787 * 1) ARC
3788 * 2) vdev cache of L2ARC devices
3789 * 3) L2ARC devices
3790 * 4) vdev cache of disks
3791 * 5) disks
3792 *
3793 * Some L2ARC device types exhibit extremely slow write performance.
3794 * To accommodate this, there are some significant differences between
3795 * the L2ARC and traditional cache design:
3796 *
3797 * 1. There is no eviction path from the ARC to the L2ARC. Evictions from
3798 * the ARC behave as usual, freeing buffers and placing headers on ghost
3799 * lists. The ARC does not send buffers to the L2ARC during eviction
3800 * because doing so would inflate write latencies under memory pressure.
3801 *
3802 * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
3803 * It does this by periodically scanning buffers from the eviction-end of
3804 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
3805 * not already there. It scans until a headroom of buffers is satisfied,
3806 * which itself acts as a cushion against ARC eviction. The thread that
3807 * does this is l2arc_feed_thread(), illustrated below; example sizes are
3808 * included to give a better sense of scale than the diagram alone:
3809 *
3810 * head --> tail
3811 * +---------------------+----------+
3812 * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC
3813 * +---------------------+----------+ | o L2ARC eligible
3814 * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer
3815 * +---------------------+----------+ |
3816 * 15.9 Gbytes ^ 32 Mbytes |
3817 * headroom |
3818 * l2arc_feed_thread()
3819 * |
3820 * l2arc write hand <--[oooo]--'
3821 * | 8 Mbyte
3822 * | write max
3823 * V
3824 * +==============================+
3825 * L2ARC dev |####|#|###|###| |####| ... |
3826 * +==============================+
3827 * 32 Gbytes
3828 *
3829 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
3830 * evicted, then the L2ARC has cached a buffer much sooner than it probably
3831 * needed to, potentially wasting L2ARC device bandwidth and storage. It is
3832 * safe to say that this is an uncommon case, since buffers at the end of
3833 * the ARC lists have moved there due to inactivity.
3834 *
3835 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
3836 * then the L2ARC simply misses copying some buffers. This serves as a
3837 * pressure valve to prevent heavy read workloads from both stalling the ARC
3838 * with waits and clogging the L2ARC with writes. This also helps prevent
3839 * the potential for the L2ARC to churn if it attempts to cache content too
3840 * quickly, such as during backups of the entire pool.
3841 *
3842 * 5. After system boot and before the ARC has filled main memory, there are
3843 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
3844 * lists can remain mostly static. Instead of searching from the tail of
3845 * these lists as pictured, l2arc_feed_thread() will search from the list
3846 * heads for eligible buffers, greatly increasing its chance of finding them.
3847 *
3848 * The L2ARC device write speed is also boosted during this time so that
3849 * the L2ARC warms up faster. Since there have been no ARC evictions yet,
3850 * there are no L2ARC reads, and no fear of degrading read performance
3851 * through increased writes.
3852 *
3853 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
3854 * the vdev queue can aggregate them into larger and fewer writes. Each
3855 * device is written to in a rotor fashion, sweeping writes through
3856 * available space then repeating.
3857 *
3858 * 7. The L2ARC does not store dirty content. It never needs to flush
3859 * write buffers back to disk based storage.
3860 *
3861 * 8. If an ARC buffer is written (and dirtied) which also exists in the
3862 * L2ARC, the now stale L2ARC buffer is immediately dropped.
3863 *
3864 * The performance of the L2ARC can be tweaked by a number of tunables, which
3865 * may be necessary for different workloads:
3866 *
3867 * l2arc_write_max max write bytes per interval
3868 * l2arc_write_boost extra write bytes during device warmup
3869 * l2arc_noprefetch skip caching prefetched buffers
3870 * l2arc_headroom number of max device writes to precache
3871 * l2arc_feed_secs seconds between L2ARC writing
3872 *
3873 * Tunables may be removed or added as future performance improvements are
3874 * integrated, and also may become zpool properties.
3875 *
3876 * There are three key functions that control how the L2ARC warms up:
3877 *
3878 * l2arc_write_eligible() check if a buffer is eligible to cache
3879 * l2arc_write_size() calculate how much to write
3880 * l2arc_write_interval() calculate sleep delay between writes
3881 *
3882 * These three functions determine what to write, how much, and how quickly
3883 * to send writes.
3884 */
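
/*
 * A small illustrative sketch (not used by this file) of how the write size
 * boost and headroom described above interact during one feed interval.
 * The 8MB write max, 8MB boost, and headroom factor of 2 used here are
 * assumptions for the example only; the real values come from the l2arc_*
 * tunables declared earlier in this file.
 */
static void
l2arc_feed_cycle_example(boolean_t warm)
{
	uint64_t write_max = 8 << 20;	/* cf. l2arc_write_max (assumed) */
	uint64_t write_boost = 8 << 20;	/* cf. l2arc_write_boost (assumed) */
	uint64_t headroom_factor = 2;	/* cf. l2arc_headroom (assumed) */
	uint64_t target, scan_limit;

	/* While the ARC is still cold, writes are boosted to warm up. */
	target = write_max + (warm ? 0 : write_boost);

	/* Each ARC list is scanned at most this many bytes per pass. */
	scan_limit = target * headroom_factor;

	(void) scan_limit;	/* 32MB per list while cold, 16MB when warm */
}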
3885
3886 static boolean_t
3887 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
3888 {
3889 /*
3890 * A buffer is *not* eligible for the L2ARC if it:
3891 * 1. belongs to a different spa.
3892 * 2. is already cached on the L2ARC.
3893 * 3. has an I/O in progress (it may be an incomplete read).
3894 * 4. is flagged not eligible (zfs property).
3895 */
3896 if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL ||
3897 HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab))
3898 return (B_FALSE);
3899
3900 return (B_TRUE);
3901 }
3902
3903 static uint64_t
3904 l2arc_write_size(l2arc_dev_t *dev)
3905 {
3906 uint64_t size;
3907
3908 size = dev->l2ad_write;
3909
3910 if (arc_warm == B_FALSE)
3911 size += dev->l2ad_boost;
3912
3913 return (size);
3915 }
3916
3917 static clock_t
3918 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
3919 {
3920 clock_t interval, next, now;
3921
3922 /*
3923 * If the ARC lists are busy, increase our write rate; if the
3924 * lists are stale, idle back. This is achieved by checking
3925 * how much we previously wrote - if it was more than half of
3926 * what we wanted, schedule the next write much sooner.
3927 */
3928 if (l2arc_feed_again && wrote > (wanted / 2))
3929 interval = (hz * l2arc_feed_min_ms) / 1000;
3930 else
3931 interval = hz * l2arc_feed_secs;
3932
3933 now = ddi_get_lbolt();
3934 next = MAX(now, MIN(now + interval, began + interval));
3935
3936 return (next);
3937 }
3938
3939 static void
3940 l2arc_hdr_stat_add(void)
3941 {
3942 ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
3943 ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
3944 }
3945
3946 static void
3947 l2arc_hdr_stat_remove(void)
3948 {
3949 ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
3950 ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
3951 }
3952
3953 /*
3954 * Cycle through L2ARC devices. This is how L2ARC load balances.
3955 * If a device is returned, it is returned with the spa config lock held.
3956 */
3957 static l2arc_dev_t *
3958 l2arc_dev_get_next(void)
3959 {
3960 l2arc_dev_t *first, *next = NULL;
3961
3962 /*
3963 * Lock out the removal of spas (spa_namespace_lock), then removal
3964 * of cache devices (l2arc_dev_mtx). Once a device has been selected,
3965 * both locks will be dropped and a spa config lock held instead.
3966 */
3967 mutex_enter(&spa_namespace_lock);
3968 mutex_enter(&l2arc_dev_mtx);
3969
3970 /* if there are no vdevs, there is nothing to do */
3971 if (l2arc_ndev == 0)
3972 goto out;
3973
3974 first = NULL;
3975 next = l2arc_dev_last;
3976 do {
3977 /* loop around the list looking for a non-faulted vdev */
3978 if (next == NULL) {
3979 next = list_head(l2arc_dev_list);
3980 } else {
3981 next = list_next(l2arc_dev_list, next);
3982 if (next == NULL)
3983 next = list_head(l2arc_dev_list);
3984 }
3985
3986 /* if we have come back to the start, bail out */
3987 if (first == NULL)
3988 first = next;
3989 else if (next == first)
3990 break;
3991
3992 } while (vdev_is_dead(next->l2ad_vdev));
3993
3994 /* if we were unable to find any usable vdevs, return NULL */
3995 if (vdev_is_dead(next->l2ad_vdev))
3996 next = NULL;
3997
3998 l2arc_dev_last = next;
3999
4000 out:
4001 mutex_exit(&l2arc_dev_mtx);
4002
4003 /*
4004 * Grab the config lock to prevent the 'next' device from being
4005 * removed while we are writing to it.
4006 */
4007 if (next != NULL)
4008 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4009 mutex_exit(&spa_namespace_lock);
4010
4011 return (next);
4012 }
4013
4014 /*
4015 * Free buffers that were tagged for destruction.
4016 */
4017 static void
4018 l2arc_do_free_on_write(void)
4019 {
4020 list_t *buflist;
4021 l2arc_data_free_t *df, *df_prev;
4022
4023 mutex_enter(&l2arc_free_on_write_mtx);
4024 buflist = l2arc_free_on_write;
4025
4026 for (df = list_tail(buflist); df; df = df_prev) {
4027 df_prev = list_prev(buflist, df);
4028 ASSERT(df->l2df_data != NULL);
4029 ASSERT(df->l2df_func != NULL);
4030 df->l2df_func(df->l2df_data, df->l2df_size);
4031 list_remove(buflist, df);
4032 kmem_free(df, sizeof (l2arc_data_free_t));
4033 }
4034
4035 mutex_exit(&l2arc_free_on_write_mtx);
4036 }
4037
4038 /*
4039 * A write to a cache device has completed. Update all headers to allow
4040 * reads from these buffers to begin.
4041 */
4042 static void
4043 l2arc_write_done(zio_t *zio)
4044 {
4045 l2arc_write_callback_t *cb;
4046 l2arc_dev_t *dev;
4047 list_t *buflist;
4048 arc_buf_hdr_t *head, *ab, *ab_prev;
4049 l2arc_buf_hdr_t *abl2;
4050 kmutex_t *hash_lock;
4051
4052 cb = zio->io_private;
4053 ASSERT(cb != NULL);
4054 dev = cb->l2wcb_dev;
4055 ASSERT(dev != NULL);
4056 head = cb->l2wcb_head;
4057 ASSERT(head != NULL);
4058 buflist = dev->l2ad_buflist;
4059 ASSERT(buflist != NULL);
4060 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4061 l2arc_write_callback_t *, cb);
4062
4063 if (zio->io_error != 0)
4064 ARCSTAT_BUMP(arcstat_l2_writes_error);
4065
4066 mutex_enter(&l2arc_buflist_mtx);
4067
4068 /*
4069 * All writes completed, or an error was hit.
4070 */
4071 for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4072 ab_prev = list_prev(buflist, ab);
4073
4074 hash_lock = HDR_LOCK(ab);
4075 if (!mutex_tryenter(hash_lock)) {
4076 /*
4077 * This buffer misses out. It may be in the process
4078 * of being evicted. Its ARC_L2_WRITING flag will be
4079 * left set, denying reads to this buffer.
4080 */
4081 ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4082 continue;
4083 }
4084
4085 if (zio->io_error != 0) {
4086 /*
4087 * Error - drop L2ARC entry.
4088 */
4089 list_remove(buflist, ab);
4090 abl2 = ab->b_l2hdr;
4091 ab->b_l2hdr = NULL;
4092 kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4093 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4094 }
4095
4096 /*
4097 * Allow ARC to begin reads to this L2ARC entry.
4098 */
4099 ab->b_flags &= ~ARC_L2_WRITING;
4100
4101 mutex_exit(hash_lock);
4102 }
4103
4104 atomic_inc_64(&l2arc_writes_done);
4105 list_remove(buflist, head);
4106 kmem_cache_free(hdr_cache, head);
4107 mutex_exit(&l2arc_buflist_mtx);
4108
4109 l2arc_do_free_on_write();
4110
4111 kmem_free(cb, sizeof (l2arc_write_callback_t));
4112 }
4113
4114 /*
4115 * A read to a cache device has completed. Validate buffer contents before
4116 * handing over to the regular ARC routines.
4117 */
4118 static void
4119 l2arc_read_done(zio_t *zio)
4120 {
4121 l2arc_read_callback_t *cb;
4122 arc_buf_hdr_t *hdr;
4123 arc_buf_t *buf;
4124 kmutex_t *hash_lock;
4125 int equal;
4126
4127 ASSERT(zio->io_vd != NULL);
4128 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4129
4130 spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
4131
4132 cb = zio->io_private;
4133 ASSERT(cb != NULL);
4134 buf = cb->l2rcb_buf;
4135 ASSERT(buf != NULL);
4136
4137 hash_lock = HDR_LOCK(buf->b_hdr);
4138 mutex_enter(hash_lock);
4139 hdr = buf->b_hdr;
4140 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4141
4142 /*
4143 * Check that this buffer survived the L2ARC journey.
4144 */
4145 equal = arc_cksum_equal(buf);
4146 if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
4147 mutex_exit(hash_lock);
4148 zio->io_private = buf;
4149 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
4150 zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */
4151 arc_read_done(zio);
4152 } else {
4153 mutex_exit(hash_lock);
4154 /*
4155 * Buffer didn't survive caching. Increment stats and
4156 * reissue to the original storage device.
4157 */
4158 if (zio->io_error != 0) {
4159 ARCSTAT_BUMP(arcstat_l2_io_error);
4160 } else {
4161 zio->io_error = SET_ERROR(EIO);
4162 }
4163 if (!equal)
4164 ARCSTAT_BUMP(arcstat_l2_cksum_bad);
4165
4166 /*
4167 * If there's no waiter, issue an async i/o to the primary
4168 * storage now. If there *is* a waiter, the caller must
4169 * issue the i/o in a context where it's OK to block.
4170 */
4171 if (zio->io_waiter == NULL) {
4172 zio_t *pio = zio_unique_parent(zio);
4173
4174 ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
4175
4176 zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
4177 buf->b_data, zio->io_size, arc_read_done, buf,
4178 zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
4179 }
4180 }
4181
4182 kmem_free(cb, sizeof (l2arc_read_callback_t));
4183 }
4184
4185 /*
4186 * This is the list priority from which the L2ARC will search for pages to
4187 * cache. This is used within loops (0..3) to cycle through lists in the
4188 * desired order. This order can have a significant effect on cache
4189 * performance.
4190 *
4191 * Currently the metadata lists are hit first (MFU, then MRU), followed
4192 * by the data lists in the same order. This function returns a locked
4193 * list and also returns the lock pointer via 'lock'.
4194 */
4195 static list_t *
4196 l2arc_list_locked(int list_num, kmutex_t **lock)
4197 {
4198 list_t *list = NULL;
4199
4200 ASSERT(list_num >= 0 && list_num <= 3);
4201
4202 switch (list_num) {
4203 case 0:
4204 list = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
4205 *lock = &arc_mfu->arcs_mtx;
4206 break;
4207 case 1:
4208 list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
4209 *lock = &arc_mru->arcs_mtx;
4210 break;
4211 case 2:
4212 list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
4213 *lock = &arc_mfu->arcs_mtx;
4214 break;
4215 case 3:
4216 list = &arc_mru->arcs_list[ARC_BUFC_DATA];
4217 *lock = &arc_mru->arcs_mtx;
4218 break;
4219 }
4220
4221 ASSERT(!(MUTEX_HELD(*lock)));
4222 mutex_enter(*lock);
4223 return (list);
4224 }
4225
4226 /*
4227 * Evict buffers from the device write hand to the distance specified in
4228 * bytes. This distance may span populated buffers, or it may span nothing.
4229 * This clears a region on the L2ARC device, making it ready for writing.
4230 * If the 'all' boolean is set, every buffer is evicted.
4231 */
4232 static void
4233 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4234 {
4235 list_t *buflist;
4236 l2arc_buf_hdr_t *abl2;
4237 arc_buf_hdr_t *ab, *ab_prev;
4238 kmutex_t *hash_lock;
4239 uint64_t taddr;
4240
4241 buflist = dev->l2ad_buflist;
4242
4243 if (buflist == NULL)
4244 return;
4245
4246 if (!all && dev->l2ad_first) {
4247 /*
4248 * This is the first sweep through the device. There is
4249 * nothing to evict.
4250 */
4251 return;
4252 }
4253
4254 if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4255 /*
4256 * When nearing the end of the device, evict to the end
4257 * before the device write hand jumps to the start.
4258 */
4259 taddr = dev->l2ad_end;
4260 } else {
4261 taddr = dev->l2ad_hand + distance;
4262 }
4263 DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4264 uint64_t, taddr, boolean_t, all);
4265
4266 top:
4267 mutex_enter(&l2arc_buflist_mtx);
4268 for (ab = list_tail(buflist); ab; ab = ab_prev) {
4269 ab_prev = list_prev(buflist, ab);
4270
4271 hash_lock = HDR_LOCK(ab);
4272 if (!mutex_tryenter(hash_lock)) {
4273 /*
4274 * Missed the hash lock. Retry.
4275 */
4276 ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
4277 mutex_exit(&l2arc_buflist_mtx);
4278 mutex_enter(hash_lock);
4279 mutex_exit(hash_lock);
4280 goto top;
4281 }
4282
4283 if (HDR_L2_WRITE_HEAD(ab)) {
4284 /*
4285 * We hit a write head node. Leave it for
4286 * l2arc_write_done().
4287 */
4288 list_remove(buflist, ab);
4289 mutex_exit(hash_lock);
4290 continue;
4291 }
4292
4293 if (!all && ab->b_l2hdr != NULL &&
4294 (ab->b_l2hdr->b_daddr > taddr ||
4295 ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
4296 /*
4297 * We've evicted to the target address,
4298 * or the end of the device.
4299 */
4300 mutex_exit(hash_lock);
4301 break;
4302 }
4303
4304 if (HDR_FREE_IN_PROGRESS(ab)) {
4305 /*
4306 * Already on the path to destruction.
4307 */
4308 mutex_exit(hash_lock);
4309 continue;
4310 }
4311
4312 if (ab->b_state == arc_l2c_only) {
4313 ASSERT(!HDR_L2_READING(ab));
4314 /*
4315 * This doesn't exist in the ARC. Destroy.
4316 * arc_hdr_destroy() will call list_remove()
4317 * and decrement arcstat_l2_size.
4318 */
4319 arc_change_state(arc_anon, ab, hash_lock);
4320 arc_hdr_destroy(ab);
4321 } else {
4322 /*
4323 * Invalidate issued or about to be issued
4324 * reads, since we may be about to write
4325 * over this location.
4326 */
4327 if (HDR_L2_READING(ab)) {
4328 ARCSTAT_BUMP(arcstat_l2_evict_reading);
4329 ab->b_flags |= ARC_L2_EVICTED;
4330 }
4331
4332 /*
4333 * Tell ARC this no longer exists in L2ARC.
4334 */
4335 if (ab->b_l2hdr != NULL) {
4336 abl2 = ab->b_l2hdr;
4337 ab->b_l2hdr = NULL;
4338 kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4339 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4340 }
4341 list_remove(buflist, ab);
4342
4343 /*
4344 * This may have been left over after a
4345 * failed write.
4346 */
4347 ab->b_flags &= ~ARC_L2_WRITING;
4348 }
4349 mutex_exit(hash_lock);
4350 }
4351 mutex_exit(&l2arc_buflist_mtx);
4352
4353 vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
4354 dev->l2ad_evict = taddr;
4355 }
4356
4357 /*
4358 * Find and write ARC buffers to the L2ARC device.
4359 *
4360 * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
4361 * for reading until they have completed writing.
4362 */
4363 static uint64_t
4364 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
4365 {
4366 arc_buf_hdr_t *ab, *ab_prev, *head;
4367 l2arc_buf_hdr_t *hdrl2;
4368 list_t *list;
4369 uint64_t passed_sz, write_sz, buf_sz, headroom;
4370 void *buf_data;
4371 kmutex_t *hash_lock, *list_lock;
4372 boolean_t have_lock, full;
4373 l2arc_write_callback_t *cb;
4374 zio_t *pio, *wzio;
4375 uint64_t guid = spa_load_guid(spa);
4376
4377 ASSERT(dev->l2ad_vdev != NULL);
4378
4379 pio = NULL;
4380 write_sz = 0;
4381 full = B_FALSE;
4382 head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4383 head->b_flags |= ARC_L2_WRITE_HEAD;
4384
4385 /*
4386 * Copy buffers for L2ARC writing.
4387 */
4388 mutex_enter(&l2arc_buflist_mtx);
4389 for (int try = 0; try <= 3; try++) {
4390 list = l2arc_list_locked(try, &list_lock);
4391 passed_sz = 0;
4392
4393 /*
4394 * L2ARC fast warmup.
4395 *
4396 * Until the ARC is warm and starts to evict, read from the
4397 * head of the ARC lists rather than the tail.
4398 */
4399 headroom = target_sz * l2arc_headroom;
4400 if (arc_warm == B_FALSE)
4401 ab = list_head(list);
4402 else
4403 ab = list_tail(list);
4404
4405 for (; ab; ab = ab_prev) {
4406 if (arc_warm == B_FALSE)
4407 ab_prev = list_next(list, ab);
4408 else
4409 ab_prev = list_prev(list, ab);
4410
4411 hash_lock = HDR_LOCK(ab);
4412 have_lock = MUTEX_HELD(hash_lock);
4413 if (!have_lock && !mutex_tryenter(hash_lock)) {
4414 /*
4415 * Skip this buffer rather than waiting.
4416 */
4417 continue;
4418 }
4419
4420 passed_sz += ab->b_size;
4421 if (passed_sz > headroom) {
4422 /*
4423 * Searched too far.
4424 */
4425 mutex_exit(hash_lock);
4426 break;
4427 }
4428
4429 if (!l2arc_write_eligible(guid, ab)) {
4430 mutex_exit(hash_lock);
4431 continue;
4432 }
4433
4434 if ((write_sz + ab->b_size) > target_sz) {
4435 full = B_TRUE;
4436 mutex_exit(hash_lock);
4437 break;
4438 }
4439
4440 if (pio == NULL) {
4441 /*
4442 * Insert a dummy header on the buflist so
4443 * l2arc_write_done() can find where the
4444 * write buffers begin without searching.
4445 */
4446 list_insert_head(dev->l2ad_buflist, head);
4447
4448 cb = kmem_alloc(
4449 sizeof (l2arc_write_callback_t), KM_SLEEP);
4450 cb->l2wcb_dev = dev;
4451 cb->l2wcb_head = head;
4452 pio = zio_root(spa, l2arc_write_done, cb,
4453 ZIO_FLAG_CANFAIL);
4454 }
4455
4456 /*
4457 * Create and add a new L2ARC header.
4458 */
4459 hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
4460 hdrl2->b_dev = dev;
4461 hdrl2->b_daddr = dev->l2ad_hand;
4462
4463 ab->b_flags |= ARC_L2_WRITING;
4464 ab->b_l2hdr = hdrl2;
4465 list_insert_head(dev->l2ad_buflist, ab);
4466 buf_data = ab->b_buf->b_data;
4467 buf_sz = ab->b_size;
4468
4469 /*
4470 * Compute and store the buffer cksum before
4471 * writing. In debug builds the cksum is verified first.
4472 */
4473 arc_cksum_verify(ab->b_buf);
4474 arc_cksum_compute(ab->b_buf, B_TRUE);
4475
4476 mutex_exit(hash_lock);
4477
4478 wzio = zio_write_phys(pio, dev->l2ad_vdev,
4479 dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
4480 NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
4481 ZIO_FLAG_CANFAIL, B_FALSE);
4482
4483 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
4484 zio_t *, wzio);
4485 (void) zio_nowait(wzio);
4486
4487 /*
4488 * Keep the clock hand suitably device-aligned.
4489 */
4490 buf_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
4491
4492 write_sz += buf_sz;
4493 dev->l2ad_hand += buf_sz;
4494 }
4495
4496 mutex_exit(list_lock);
4497
4498 if (full == B_TRUE)
4499 break;
4500 }
4501 mutex_exit(&l2arc_buflist_mtx);
4502
4503 if (pio == NULL) {
4504 ASSERT0(write_sz);
4505 kmem_cache_free(hdr_cache, head);
4506 return (0);
4507 }
4508
4509 ASSERT3U(write_sz, <=, target_sz);
4510 ARCSTAT_BUMP(arcstat_l2_writes_sent);
4511 ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz);
4512 ARCSTAT_INCR(arcstat_l2_size, write_sz);
4513 vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0);
4514
4515 /*
4516 * Bump device hand to the device start if it is approaching the end.
4517 * l2arc_evict() will already have evicted ahead for this case.
4518 */
4519 if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
4520 vdev_space_update(dev->l2ad_vdev,
4521 dev->l2ad_end - dev->l2ad_hand, 0, 0);
4522 dev->l2ad_hand = dev->l2ad_start;
4523 dev->l2ad_evict = dev->l2ad_start;
4524 dev->l2ad_first = B_FALSE;
4525 }
4526
4527 dev->l2ad_writing = B_TRUE;
4528 (void) zio_wait(pio);
4529 dev->l2ad_writing = B_FALSE;
4530
4531 return (write_sz);
4532 }
4533
4534 /*
4535 * This thread feeds the L2ARC at regular intervals. This is the beating
4536 * heart of the L2ARC.
4537 */
4538 static void
4539 l2arc_feed_thread(void)
4540 {
4541 callb_cpr_t cpr;
4542 l2arc_dev_t *dev;
4543 spa_t *spa;
4544 uint64_t size, wrote;
4545 clock_t begin, next = ddi_get_lbolt();
4546
4547 CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
4548
4549 mutex_enter(&l2arc_feed_thr_lock);
4550
4551 while (l2arc_thread_exit == 0) {
4552 CALLB_CPR_SAFE_BEGIN(&cpr);
4553 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
4554 next);
4555 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
4556 next = ddi_get_lbolt() + hz;
4557
4558 /*
4559 * Quick check for L2ARC devices.
4560 */
4561 mutex_enter(&l2arc_dev_mtx);
4562 if (l2arc_ndev == 0) {
4563 mutex_exit(&l2arc_dev_mtx);
4564 continue;
4565 }
4566 mutex_exit(&l2arc_dev_mtx);
4567 begin = ddi_get_lbolt();
4568
4569 /*
4570 * This selects the next l2arc device to write to, and in
4571 * doing so the next spa to feed from: dev->l2ad_spa. This
4572 * will return NULL if there are now no l2arc devices or if
4573 * they are all faulted.
4574 *
4575 * If a device is returned, its spa's config lock is also
4576 * held to prevent device removal. l2arc_dev_get_next()
4577 * will grab and release l2arc_dev_mtx.
4578 */
4579 if ((dev = l2arc_dev_get_next()) == NULL)
4580 continue;
4581
4582 spa = dev->l2ad_spa;
4583 ASSERT(spa != NULL);
4584
4585 /*
4586 * If the pool is read-only then force the feed thread to
4587 * sleep a little longer.
4588 */
4589 if (!spa_writeable(spa)) {
4590 next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
4591 spa_config_exit(spa, SCL_L2ARC, dev);
4592 continue;
4593 }
4594
4595 /*
4596 * Avoid contributing to memory pressure.
4597 */
4598 if (arc_reclaim_needed()) {
4599 ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
4600 spa_config_exit(spa, SCL_L2ARC, dev);
4601 continue;
4602 }
4603
4604 ARCSTAT_BUMP(arcstat_l2_feeds);
4605
4606 size = l2arc_write_size(dev);
4607
4608 /*
4609 * Evict L2ARC buffers that will be overwritten.
4610 */
4611 l2arc_evict(dev, size, B_FALSE);
4612
4613 /*
4614 * Write ARC buffers.
4615 */
4616 wrote = l2arc_write_buffers(spa, dev, size);
4617
4618 /*
4619 * Calculate interval between writes.
4620 */
4621 next = l2arc_write_interval(begin, size, wrote);
4622 spa_config_exit(spa, SCL_L2ARC, dev);
4623 }
4624
4625 l2arc_thread_exit = 0;
4626 cv_broadcast(&l2arc_feed_thr_cv);
4627 CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */
4628 thread_exit();
4629 }
4630
4631 boolean_t
4632 l2arc_vdev_present(vdev_t *vd)
4633 {
4634 l2arc_dev_t *dev;
4635
4636 mutex_enter(&l2arc_dev_mtx);
4637 for (dev = list_head(l2arc_dev_list); dev != NULL;
4638 dev = list_next(l2arc_dev_list, dev)) {
4639 if (dev->l2ad_vdev == vd)
4640 break;
4641 }
4642 mutex_exit(&l2arc_dev_mtx);
4643
4644 return (dev != NULL);
4645 }
4646
4647 /*
4648 * Add a vdev for use by the L2ARC. By this point the spa has already
4649 * validated the vdev and opened it.
4650 */
4651 void
4652 l2arc_add_vdev(spa_t *spa, vdev_t *vd)
4653 {
4654 l2arc_dev_t *adddev;
4655
4656 ASSERT(!l2arc_vdev_present(vd));
4657
4658 /*
4659 * Create a new l2arc device entry.
4660 */
4661 adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
4662 adddev->l2ad_spa = spa;
4663 adddev->l2ad_vdev = vd;
4664 adddev->l2ad_write = l2arc_write_max;
4665 adddev->l2ad_boost = l2arc_write_boost;
4666 adddev->l2ad_start = VDEV_LABEL_START_SIZE;
4667 adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
4668 adddev->l2ad_hand = adddev->l2ad_start;
4669 adddev->l2ad_evict = adddev->l2ad_start;
4670 adddev->l2ad_first = B_TRUE;
4671 adddev->l2ad_writing = B_FALSE;
4672 ASSERT3U(adddev->l2ad_write, >, 0);
4673
4674 /*
4675 * This is a list of all ARC buffers that are still valid on the
4676 * device.
4677 */
4678 adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
4679 list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
4680 offsetof(arc_buf_hdr_t, b_l2node));
4681
4682 vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
4683
4684 /*
4685 * Add device to global list
4686 */
4687 mutex_enter(&l2arc_dev_mtx);
4688 list_insert_head(l2arc_dev_list, adddev);
4689 atomic_inc_64(&l2arc_ndev);
4690 mutex_exit(&l2arc_dev_mtx);
4691 }
4692
4693 /*
4694 * Remove a vdev from the L2ARC.
4695 */
4696 void
4697 l2arc_remove_vdev(vdev_t *vd)
4698 {
4699 l2arc_dev_t *dev, *nextdev, *remdev = NULL;
4700
4701 /*
4702 * Find the device by vdev
4703 */
4704 mutex_enter(&l2arc_dev_mtx);
4705 for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
4706 nextdev = list_next(l2arc_dev_list, dev);
4707 if (vd == dev->l2ad_vdev) {
4708 remdev = dev;
4709 break;
4710 }
4711 }
4712 ASSERT(remdev != NULL);
4713
4714 /*
4715 * Remove device from global list
4716 */
4717 list_remove(l2arc_dev_list, remdev);
4718 l2arc_dev_last = NULL; /* may have been invalidated */
4719 atomic_dec_64(&l2arc_ndev);
4720 mutex_exit(&l2arc_dev_mtx);
4721
4722 /*
4723 * Clear all buflists and ARC references. L2ARC device flush.
4724 */
4725 l2arc_evict(remdev, 0, B_TRUE);
4726 list_destroy(remdev->l2ad_buflist);
4727 kmem_free(remdev->l2ad_buflist, sizeof (list_t));
4728 kmem_free(remdev, sizeof (l2arc_dev_t));
4729 }
4730
4731 void
4732 l2arc_init(void)
4733 {
4734 l2arc_thread_exit = 0;
4735 l2arc_ndev = 0;
4736 l2arc_writes_sent = 0;
4737 l2arc_writes_done = 0;
4738
4739 mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
4740 cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
4741 mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
4742 mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
4743 mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
4744
4745 l2arc_dev_list = &L2ARC_dev_list;
4746 l2arc_free_on_write = &L2ARC_free_on_write;
4747 list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
4748 offsetof(l2arc_dev_t, l2ad_node));
4749 list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
4750 offsetof(l2arc_data_free_t, l2df_list_node));
4751 }
4752
4753 void
4754 l2arc_fini(void)
4755 {
4756 /*
4757 * This is called from dmu_fini(), which is called from spa_fini();
4758 * because of this, we can assume that all L2ARC devices have
4759 * already been removed when the pools themselves were removed.
4760 */
4761
4762 l2arc_do_free_on_write();
4763
4764 mutex_destroy(&l2arc_feed_thr_lock);
4765 cv_destroy(&l2arc_feed_thr_cv);
4766 mutex_destroy(&l2arc_dev_mtx);
4767 mutex_destroy(&l2arc_buflist_mtx);
4768 mutex_destroy(&l2arc_free_on_write_mtx);
4769
4770 list_destroy(l2arc_dev_list);
4771 list_destroy(l2arc_free_on_write);
4772 }
4773
4774 void
4775 l2arc_start(void)
4776 {
4777 if (!(spa_mode_global & FWRITE))
4778 return;
4779
4780 (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
4781 TS_RUN, minclsyspri);
4782 }
4783
4784 void
4785 l2arc_stop(void)
4786 {
4787 if (!(spa_mode_global & FWRITE))
4788 return;
4789
4790 mutex_enter(&l2arc_feed_thr_lock);
4791 cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */
4792 l2arc_thread_exit = 1;
4793 while (l2arc_thread_exit != 0)
4794 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
4795 mutex_exit(&l2arc_feed_thr_lock);
4796 }