1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
24 * Copyright (c) 2013 by Delphix. All rights reserved.
25 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
26 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
27 */
28
29 /*
30 * DVA-based Adjustable Replacement Cache
31 *
32 * While much of the theory of operation used here is
33 * based on the self-tuning, low overhead replacement cache
34 * presented by Megiddo and Modha at FAST 2003, there are some
35 * significant differences:
36 *
37 * 1. The Megiddo and Modha model assumes any page is evictable.
38 * Pages in its cache cannot be "locked" into memory. This makes
39 * the eviction algorithm simple: evict the last page in the list.
40 * This also makes the performance characteristics easy to reason
41 * about. Our cache is not so simple. At any given moment, some
42 * subset of the blocks in the cache are un-evictable because we
43 * have handed out a reference to them. Blocks are only evictable
44 * when there are no external references active. This makes
45 * eviction far more problematic: we choose to evict the evictable
46 * blocks that are the "lowest" in the list.
47 *
48 * There are times when it is not possible to evict the requested
49 * space. In these circumstances we are unable to adjust the cache
50 * size. To prevent the cache from growing unbounded at these times, we
51 * implement a "cache throttle" that slows the flow of new data
52 * into the cache until we can make space available.
53 *
54 * 2. The Megiddo and Modha model assumes a fixed cache size.
55 * Pages are evicted when the cache is full and there is a cache
56 * miss. Our model has a variable sized cache. It grows with
57 * high use, but also tries to react to memory pressure from the
58 * operating system: decreasing its size when system memory is
59 * tight.
60 *
61 * 3. The Megiddo and Modha model assumes a fixed page size. All
62 * elements of the cache are therefore exactly the same size. So
63 * when adjusting the cache size following a cache miss, it's simply
64 * a matter of choosing a single page to evict. In our model, we
65 * have variable-sized cache blocks (ranging from 512 bytes to
66 * 128K bytes). We therefore choose a set of blocks to evict to make
67 * space for a cache miss that approximates as closely as possible
68 * the space used by the new block.
69 *
70 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
71 * by N. Megiddo & D. Modha, FAST 2003
72 */
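/*
 * Illustrative sketch only (not part of the implementation): the variable
 * block size handling described in point 3 above boils down to walking an
 * eviction list from its tail and accumulating unreferenced blocks until
 * the requested amount of space has been covered, roughly:
 *
 *	space_freed = 0;
 *	for (ab = list_tail(list); ab != NULL && space_freed < needed;
 *	    ab = list_prev(list, ab)) {
 *		if (refcount_count(&ab->b_refcnt) == 0) {
 *			space_freed += ab->b_size;
 *			(evict ab)
 *		}
 *	}
 *
 * The real logic (arc_evict() and friends, further down) is considerably
 * more involved: it honors spa boundaries, prefetch lifetimes, hash lock
 * contention and the meta-data/data split.
 */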
73
74 /*
75 * The locking model:
76 *
77 * A new reference to a cache buffer can be obtained in two
78 * ways: 1) via a hash table lookup using the DVA as a key,
79 * or 2) via one of the ARC lists. The arc_read() interface
80 * uses method 1, while the internal arc algorithms for
81 * adjusting the cache use method 2. We therefore provide two
82 * types of locks: 1) the hash table lock array, and 2) the
83 * arc list locks.
84 *
85 * Buffers do not have their own mutexes, rather they rely on the
86 * hash table mutexes for the bulk of their protection (i.e. most
87 * fields in the arc_buf_hdr_t are protected by these mutexes).
88 *
89 * buf_hash_find() returns the appropriate mutex (held) when it
90 * locates the requested buffer in the hash table. It returns
91 * NULL for the mutex if the buffer was not in the table.
92 *
93 * buf_hash_remove() expects the appropriate hash mutex to be
94 * already held before it is invoked.
95 *
96 * Each arc state also has a mutex which is used to protect the
97 * buffer list associated with the state. When attempting to
98 * obtain a hash table lock while holding an arc list lock, you
99 * must use mutex_tryenter() to avoid deadlock. Also note that
100 * the active state mutex must be held before the ghost state mutex.
101 *
102 * Arc buffers may have an associated eviction callback function.
103 * This function will be invoked prior to removing the buffer (e.g.
104 * in arc_do_user_evicts()). Note however that the data associated
105 * with the buffer may be evicted prior to the callback. The callback
106 * must be made with *no locks held* (to prevent deadlock). Additionally,
107 * the users of callbacks must ensure that their private data is
108 * protected from simultaneous callbacks from arc_buf_evict()
109 * and arc_do_user_evicts().
110 *
111 * Note that the majority of the performance stats are manipulated
112 * with atomic operations.
113 *
114 * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
115 *
116 * - L2ARC buflist creation
117 * - L2ARC buflist eviction
118 * - L2ARC write completion, which walks L2ARC buflists
119 * - ARC header destruction, as it removes from L2ARC buflists
120 * - ARC header release, as it removes from L2ARC buflists
121 */
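/*
 * A minimal sketch of the lock ordering rule above, assuming a caller that
 * already holds an arc list lock and wants the hash lock for a header
 * found on that list (as the eviction code does):
 *
 *	mutex_enter(&state->arcs_mtx);
 *	...
 *	hash_lock = HDR_LOCK(ab);
 *	if (!mutex_tryenter(hash_lock)) {
 *		ARCSTAT_BUMP(arcstat_mutex_miss);
 *		(skip this header and move on to the next one)
 *	} else {
 *		...
 *		mutex_exit(hash_lock);
 *	}
 *	...
 *	mutex_exit(&state->arcs_mtx);
 *
 * Blocking on the hash lock here instead could deadlock against a thread
 * that holds the hash lock and is waiting for the list lock.
 */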
122
123 #include <sys/spa.h>
124 #include <sys/zio.h>
125 #include <sys/zio_compress.h>
126 #include <sys/zfs_context.h>
127 #include <sys/arc.h>
128 #include <sys/refcount.h>
129 #include <sys/vdev.h>
130 #include <sys/vdev_impl.h>
131 #include <sys/dsl_pool.h>
132 #ifdef _KERNEL
133 #include <sys/vmsystm.h>
134 #include <vm/anon.h>
135 #include <sys/fs/swapnode.h>
136 #include <sys/dnlc.h>
137 #endif
138 #include <sys/callb.h>
139 #include <sys/kstat.h>
140 #include <zfs_fletcher.h>
141 #include <sys/byteorder.h>
142 #include <sys/spa_impl.h>
143
144 #ifndef _KERNEL
145 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
146 boolean_t arc_watch = B_FALSE;
147 int arc_procfd;
148 #endif
149
150 static kmutex_t arc_reclaim_thr_lock;
151 static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
152 static uint8_t arc_thread_exit;
153
154 #define ARC_REDUCE_DNLC_PERCENT 3
155 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
156
157 typedef enum arc_reclaim_strategy {
158 ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
159 ARC_RECLAIM_CONS /* Conservative reclaim strategy */
160 } arc_reclaim_strategy_t;
161
162 /*
163 * The number of iterations through arc_evict_*() before we
164 * drop & reacquire the lock.
165 */
166 int arc_evict_iterations = 100;
167
168 /* number of seconds before growing cache again */
169 static int arc_grow_retry = 60;
170
171 /* shift of arc_c for calculating both min and max arc_p */
172 static int arc_p_min_shift = 4;
173
174 /* log2(fraction of arc to reclaim) */
175 static int arc_shrink_shift = 5;
176
177 /*
178 * minimum lifespan of a prefetch block in clock ticks
179 * (initialized in arc_init())
180 */
181 static int arc_min_prefetch_lifespan;
182
183 /*
184 * If this percent of memory is free, don't throttle.
185 */
186 int arc_lotsfree_percent = 10;
187
188 static int arc_dead;
189
190 /*
191 * The arc has filled available memory and has now warmed up.
192 */
193 static boolean_t arc_warm;
194
195 /*
196 * These tunables are for performance analysis.
197 */
198 uint64_t zfs_arc_max;
199 uint64_t zfs_arc_min;
200 uint64_t zfs_arc_meta_limit = 0;
201 int zfs_arc_grow_retry = 0;
202 int zfs_arc_shrink_shift = 0;
203 int zfs_arc_p_min_shift = 0;
204 int zfs_disable_dup_eviction = 0;
205
206 /*
207 * Note that buffers can be in one of 6 states:
208 * ARC_anon - anonymous (discussed below)
209 * ARC_mru - recently used, currently cached
210 * ARC_mru_ghost - recently used, no longer in cache
211 * ARC_mfu - frequently used, currently cached
212 * ARC_mfu_ghost - frequently used, no longer in cache
213 * ARC_l2c_only - exists in L2ARC but not other states
214 * When there are no active references to a buffer, it is
215 * linked onto a list in one of these arc states. These are
216 * the only buffers that can be evicted or deleted. Within each
217 * state there are multiple lists, one for meta-data and one for
218 * non-meta-data. Meta-data (indirect blocks, blocks of dnodes,
219 * etc.) is tracked separately so that it can be managed more
220 * explicitly: favored over data, limited explicitly.
221 *
222 * Anonymous buffers are buffers that are not associated with
223 * a DVA. These are buffers that hold dirty block copies
224 * before they are written to stable storage. By definition,
225 * they are "ref'd" and are considered part of arc_mru
226 * that cannot be freed. Generally, they will acquire a DVA
227 * as they are written and migrate onto the arc_mru list.
228 *
229 * The ARC_l2c_only state is for buffers that are in the second
230 * level ARC but no longer in any of the ARC_m* lists. The second
231 * level ARC itself may also contain buffers that are in any of
232 * the ARC_m* states - meaning that a buffer can exist in two
233 * places. The reason for the ARC_l2c_only state is to keep the
234 * buffer header in the hash table, so that reads that hit the
235 * second level ARC benefit from these fast lookups.
236 */
237
238 typedef struct arc_state {
239 list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */
240 uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
241 uint64_t arcs_size; /* total amount of data in this state */
242 kmutex_t arcs_mtx;
243 } arc_state_t;
244
245 /* The 6 states: */
246 static arc_state_t ARC_anon;
247 static arc_state_t ARC_mru;
248 static arc_state_t ARC_mru_ghost;
249 static arc_state_t ARC_mfu;
250 static arc_state_t ARC_mfu_ghost;
251 static arc_state_t ARC_l2c_only;
252
253 typedef struct arc_stats {
254 kstat_named_t arcstat_hits;
255 kstat_named_t arcstat_misses;
256 kstat_named_t arcstat_demand_data_hits;
257 kstat_named_t arcstat_demand_data_misses;
258 kstat_named_t arcstat_demand_metadata_hits;
259 kstat_named_t arcstat_demand_metadata_misses;
260 kstat_named_t arcstat_prefetch_data_hits;
261 kstat_named_t arcstat_prefetch_data_misses;
262 kstat_named_t arcstat_prefetch_metadata_hits;
263 kstat_named_t arcstat_prefetch_metadata_misses;
264 kstat_named_t arcstat_mru_hits;
265 kstat_named_t arcstat_mru_ghost_hits;
266 kstat_named_t arcstat_mfu_hits;
267 kstat_named_t arcstat_mfu_ghost_hits;
268 kstat_named_t arcstat_deleted;
269 kstat_named_t arcstat_recycle_miss;
270 /*
271 * Number of buffers that could not be evicted because the hash lock
272 * was held by another thread. The lock may not necessarily be held
273 * by something using the same buffer, since hash locks are shared
274 * by multiple buffers.
275 */
276 kstat_named_t arcstat_mutex_miss;
277 /*
278 * Number of buffers skipped because they have I/O in progress, are
279 * indirect prefetch buffers that have not lived long enough, or are
280 * not from the spa we're trying to evict from.
281 */
282 kstat_named_t arcstat_evict_skip;
283 kstat_named_t arcstat_evict_l2_cached;
284 kstat_named_t arcstat_evict_l2_eligible;
285 kstat_named_t arcstat_evict_l2_ineligible;
286 kstat_named_t arcstat_hash_elements;
287 kstat_named_t arcstat_hash_elements_max;
288 kstat_named_t arcstat_hash_collisions;
289 kstat_named_t arcstat_hash_chains;
290 kstat_named_t arcstat_hash_chain_max;
291 kstat_named_t arcstat_p;
292 kstat_named_t arcstat_c;
293 kstat_named_t arcstat_c_min;
294 kstat_named_t arcstat_c_max;
295 kstat_named_t arcstat_size;
296 kstat_named_t arcstat_hdr_size;
297 kstat_named_t arcstat_data_size;
298 kstat_named_t arcstat_other_size;
299 kstat_named_t arcstat_l2_hits;
300 kstat_named_t arcstat_l2_misses;
301 kstat_named_t arcstat_l2_feeds;
302 kstat_named_t arcstat_l2_rw_clash;
303 kstat_named_t arcstat_l2_read_bytes;
304 kstat_named_t arcstat_l2_write_bytes;
305 kstat_named_t arcstat_l2_writes_sent;
306 kstat_named_t arcstat_l2_writes_done;
307 kstat_named_t arcstat_l2_writes_error;
308 kstat_named_t arcstat_l2_writes_hdr_miss;
309 kstat_named_t arcstat_l2_evict_lock_retry;
310 kstat_named_t arcstat_l2_evict_reading;
311 kstat_named_t arcstat_l2_free_on_write;
312 kstat_named_t arcstat_l2_abort_lowmem;
313 kstat_named_t arcstat_l2_cksum_bad;
314 kstat_named_t arcstat_l2_io_error;
315 kstat_named_t arcstat_l2_size;
316 kstat_named_t arcstat_l2_asize;
317 kstat_named_t arcstat_l2_hdr_size;
318 kstat_named_t arcstat_l2_compress_successes;
319 kstat_named_t arcstat_l2_compress_zeros;
320 kstat_named_t arcstat_l2_compress_failures;
321 kstat_named_t arcstat_l2_log_blk_writes;
322 kstat_named_t arcstat_l2_log_blk_avg_size;
323 kstat_named_t arcstat_l2_data_to_meta_ratio;
324 kstat_named_t arcstat_l2_rebuild_successes;
325 kstat_named_t arcstat_l2_rebuild_abort_unsupported;
326 kstat_named_t arcstat_l2_rebuild_abort_timeout;
327 kstat_named_t arcstat_l2_rebuild_abort_io_errors;
328 kstat_named_t arcstat_l2_rebuild_abort_cksum_errors;
329 kstat_named_t arcstat_l2_rebuild_abort_loop_errors;
330 kstat_named_t arcstat_l2_rebuild_abort_lowmem;
331 kstat_named_t arcstat_l2_rebuild_size;
332 kstat_named_t arcstat_l2_rebuild_bufs;
333 kstat_named_t arcstat_l2_rebuild_bufs_precached;
334 kstat_named_t arcstat_l2_rebuild_psize;
335 kstat_named_t arcstat_l2_rebuild_log_blks;
336 kstat_named_t arcstat_memory_throttle_count;
337 kstat_named_t arcstat_duplicate_buffers;
338 kstat_named_t arcstat_duplicate_buffers_size;
339 kstat_named_t arcstat_duplicate_reads;
340 kstat_named_t arcstat_meta_used;
341 kstat_named_t arcstat_meta_limit;
342 kstat_named_t arcstat_meta_max;
343 } arc_stats_t;
344
345 static arc_stats_t arc_stats = {
346 { "hits", KSTAT_DATA_UINT64 },
347 { "misses", KSTAT_DATA_UINT64 },
348 { "demand_data_hits", KSTAT_DATA_UINT64 },
349 { "demand_data_misses", KSTAT_DATA_UINT64 },
350 { "demand_metadata_hits", KSTAT_DATA_UINT64 },
351 { "demand_metadata_misses", KSTAT_DATA_UINT64 },
352 { "prefetch_data_hits", KSTAT_DATA_UINT64 },
353 { "prefetch_data_misses", KSTAT_DATA_UINT64 },
354 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
355 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
356 { "mru_hits", KSTAT_DATA_UINT64 },
357 { "mru_ghost_hits", KSTAT_DATA_UINT64 },
358 { "mfu_hits", KSTAT_DATA_UINT64 },
359 { "mfu_ghost_hits", KSTAT_DATA_UINT64 },
360 { "deleted", KSTAT_DATA_UINT64 },
361 { "recycle_miss", KSTAT_DATA_UINT64 },
362 { "mutex_miss", KSTAT_DATA_UINT64 },
363 { "evict_skip", KSTAT_DATA_UINT64 },
364 { "evict_l2_cached", KSTAT_DATA_UINT64 },
365 { "evict_l2_eligible", KSTAT_DATA_UINT64 },
366 { "evict_l2_ineligible", KSTAT_DATA_UINT64 },
367 { "hash_elements", KSTAT_DATA_UINT64 },
368 { "hash_elements_max", KSTAT_DATA_UINT64 },
369 { "hash_collisions", KSTAT_DATA_UINT64 },
370 { "hash_chains", KSTAT_DATA_UINT64 },
371 { "hash_chain_max", KSTAT_DATA_UINT64 },
372 { "p", KSTAT_DATA_UINT64 },
373 { "c", KSTAT_DATA_UINT64 },
374 { "c_min", KSTAT_DATA_UINT64 },
375 { "c_max", KSTAT_DATA_UINT64 },
376 { "size", KSTAT_DATA_UINT64 },
377 { "hdr_size", KSTAT_DATA_UINT64 },
378 { "data_size", KSTAT_DATA_UINT64 },
379 { "other_size", KSTAT_DATA_UINT64 },
380 { "l2_hits", KSTAT_DATA_UINT64 },
381 { "l2_misses", KSTAT_DATA_UINT64 },
382 { "l2_feeds", KSTAT_DATA_UINT64 },
383 { "l2_rw_clash", KSTAT_DATA_UINT64 },
384 { "l2_read_bytes", KSTAT_DATA_UINT64 },
385 { "l2_write_bytes", KSTAT_DATA_UINT64 },
386 { "l2_writes_sent", KSTAT_DATA_UINT64 },
387 { "l2_writes_done", KSTAT_DATA_UINT64 },
388 { "l2_writes_error", KSTAT_DATA_UINT64 },
389 { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 },
390 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
391 { "l2_evict_reading", KSTAT_DATA_UINT64 },
392 { "l2_free_on_write", KSTAT_DATA_UINT64 },
393 { "l2_abort_lowmem", KSTAT_DATA_UINT64 },
394 { "l2_cksum_bad", KSTAT_DATA_UINT64 },
395 { "l2_io_error", KSTAT_DATA_UINT64 },
396 { "l2_size", KSTAT_DATA_UINT64 },
397 { "l2_asize", KSTAT_DATA_UINT64 },
398 { "l2_hdr_size", KSTAT_DATA_UINT64 },
399 { "l2_compress_successes", KSTAT_DATA_UINT64 },
400 { "l2_compress_zeros", KSTAT_DATA_UINT64 },
401 { "l2_compress_failures", KSTAT_DATA_UINT64 },
402 { "l2_log_blk_writes", KSTAT_DATA_UINT64 },
403 { "l2_log_blk_avg_size", KSTAT_DATA_UINT64 },
404 { "l2_data_to_meta_ratio", KSTAT_DATA_UINT64 },
405 { "l2_rebuild_successes", KSTAT_DATA_UINT64 },
406 { "l2_rebuild_unsupported", KSTAT_DATA_UINT64 },
407 { "l2_rebuild_timeout", KSTAT_DATA_UINT64 },
408 { "l2_rebuild_io_errors", KSTAT_DATA_UINT64 },
409 { "l2_rebuild_cksum_errors", KSTAT_DATA_UINT64 },
410 { "l2_rebuild_loop_errors", KSTAT_DATA_UINT64 },
411 { "l2_rebuild_lowmem", KSTAT_DATA_UINT64 },
412 { "l2_rebuild_size", KSTAT_DATA_UINT64 },
413 { "l2_rebuild_bufs", KSTAT_DATA_UINT64 },
414 { "l2_rebuild_bufs_precached", KSTAT_DATA_UINT64 },
415 { "l2_rebuild_psize", KSTAT_DATA_UINT64 },
416 { "l2_rebuild_log_blks", KSTAT_DATA_UINT64 },
417 { "memory_throttle_count", KSTAT_DATA_UINT64 },
418 { "duplicate_buffers", KSTAT_DATA_UINT64 },
419 { "duplicate_buffers_size", KSTAT_DATA_UINT64 },
420 { "duplicate_reads", KSTAT_DATA_UINT64 },
421 { "arc_meta_used", KSTAT_DATA_UINT64 },
422 { "arc_meta_limit", KSTAT_DATA_UINT64 },
423 { "arc_meta_max", KSTAT_DATA_UINT64 }
424 };
425
426 #define ARCSTAT(stat) (arc_stats.stat.value.ui64)
427
428 #define ARCSTAT_INCR(stat, val) \
429 atomic_add_64(&arc_stats.stat.value.ui64, (val))
430
431 #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
432 #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)
433
434 #define ARCSTAT_MAX(stat, val) { \
435 uint64_t m; \
436 while ((val) > (m = arc_stats.stat.value.ui64) && \
437 (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
438 continue; \
439 }
440
441 #define ARCSTAT_MAXSTAT(stat) \
442 ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
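/*
 * For example, buf_hash_insert() below maintains the hash kstats with
 *
 *	ARCSTAT_MAX(arcstat_hash_chain_max, i);
 *	ARCSTAT_MAXSTAT(arcstat_hash_elements);
 *
 * The former lock-lessly raises arcstat_hash_chain_max to i (if i is
 * larger) via compare-and-swap, and the latter folds the current value of
 * arcstat_hash_elements into arcstat_hash_elements_max.
 */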
443
444 /*
445 * We define a macro to allow ARC hits/misses to be easily broken down by
446 * two separate conditions, giving a total of four different subtypes for
447 * each of hits and misses (so eight statistics total).
448 */
449 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
450 if (cond1) { \
451 if (cond2) { \
452 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
453 } else { \
454 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
455 } \
456 } else { \
457 if (cond2) { \
458 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
459 } else { \
460 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
461 } \
462 }
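/*
 * A typical invocation (this is roughly how the hit/miss accounting later
 * in this file uses it):
 *
 *	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
 *	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
 *	    data, metadata, hits);
 *
 * which bumps exactly one of arcstat_demand_data_hits,
 * arcstat_demand_metadata_hits, arcstat_prefetch_data_hits or
 * arcstat_prefetch_metadata_hits depending on the two conditions.
 */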
463
464 /*
465 * This macro allows us to use kstats as floating averages. Each time we
466 * update this kstat, we first factor it and the update value by
467 * ARCSTAT_F_AVG_FACTOR to shrink the new value's contribution to the overall
468 * average. This macro assumes that integer loads and stores are atomic, but
469 * is not safe for multiple writers updating the kstat in parallel (only the
470 * last writer's update will remain).
471 */
472 #define ARCSTAT_F_AVG_FACTOR 3
473 #define ARCSTAT_F_AVG(stat, value) \
474 do { \
475 uint64_t x = ARCSTAT(stat); \
476 x = x - x / ARCSTAT_F_AVG_FACTOR + \
477 (value) / ARCSTAT_F_AVG_FACTOR; \
478 ARCSTAT(stat) = x; \
480 _NOTE(CONSTCOND) \
481 } while (0)
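/*
 * In other words, each update computes an exponentially weighted moving
 * average with weight 1/ARCSTAT_F_AVG_FACTOR on the new sample:
 *
 *	new = old - old/3 + value/3	(~ 2/3 old + 1/3 value)
 *
 * e.g. with old == 900 and value == 300 the kstat becomes
 * 900 - 300 + 100 = 700; the integer divisions discard remainders.
 */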
482
483 kstat_t *arc_ksp;
484 static arc_state_t *arc_anon;
485 static arc_state_t *arc_mru;
486 static arc_state_t *arc_mru_ghost;
487 static arc_state_t *arc_mfu;
488 static arc_state_t *arc_mfu_ghost;
489 static arc_state_t *arc_l2c_only;
490
491 /*
492 * There are several ARC variables that are critical to export as kstats --
493 * but we don't want to have to grovel around in the kstat whenever we wish to
494 * manipulate them. For these variables, we therefore define them to be in
495 * terms of the statistic variable. This assures that we are not introducing
496 * the possibility of inconsistency by having shadow copies of the variables,
497 * while still allowing the code to be readable.
498 */
499 #define arc_size ARCSTAT(arcstat_size) /* actual total arc size */
500 #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
501 #define arc_c ARCSTAT(arcstat_c) /* target size of cache */
502 #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
503 #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
504 #define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */
505 #define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */
506 #define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */
507
508 #define L2ARC_IS_VALID_COMPRESS(_c_) \
509 ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
510
511 static int arc_no_grow; /* Don't try to grow cache size */
512 static uint64_t arc_tempreserve;
513 static uint64_t arc_loaned_bytes;
514
515 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
516
517 typedef struct arc_callback arc_callback_t;
518
519 struct arc_callback {
520 void *acb_private;
521 arc_done_func_t *acb_done;
522 arc_buf_t *acb_buf;
523 zio_t *acb_zio_dummy;
524 arc_callback_t *acb_next;
525 };
526
527 typedef struct arc_write_callback arc_write_callback_t;
528
529 struct arc_write_callback {
530 void *awcb_private;
531 arc_done_func_t *awcb_ready;
532 arc_done_func_t *awcb_physdone;
533 arc_done_func_t *awcb_done;
534 arc_buf_t *awcb_buf;
535 };
536
537 struct arc_buf_hdr {
538 /* protected by hash lock */
539 dva_t b_dva;
540 uint64_t b_birth;
541 uint64_t b_cksum0;
542
543 kmutex_t b_freeze_lock;
544 zio_cksum_t *b_freeze_cksum;
545 void *b_thawed;
546
547 arc_buf_hdr_t *b_hash_next;
548 arc_buf_t *b_buf;
549 uint32_t b_flags;
550 uint32_t b_datacnt;
551
552 arc_callback_t *b_acb;
553 kcondvar_t b_cv;
554
555 /* immutable */
556 arc_buf_contents_t b_type;
557 uint64_t b_size;
558 uint64_t b_spa;
559
560 /* protected by arc state mutex */
561 arc_state_t *b_state;
562 list_node_t b_arc_node;
563
564 /* updated atomically */
565 clock_t b_arc_access;
566
567 /* self protecting */
568 refcount_t b_refcnt;
569
570 l2arc_buf_hdr_t *b_l2hdr;
571 list_node_t b_l2node;
572 };
573
574 static arc_buf_t *arc_eviction_list;
575 static kmutex_t arc_eviction_mtx;
576 static arc_buf_hdr_t arc_eviction_hdr;
577 static void arc_get_data_buf(arc_buf_t *buf);
578 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
579 static int arc_evict_needed(arc_buf_contents_t type);
580 static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
581 static void arc_buf_watch(arc_buf_t *buf);
582
583 static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
584
585 #define GHOST_STATE(state) \
586 ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
587 (state) == arc_l2c_only)
588
589 /*
590 * Private ARC flags. These flags are private ARC only flags that will show up
591 * in b_flags in the arc_buf_hdr_t. Some flags are publicly declared, and can
592 * be passed in as arc_flags in things like arc_read. However, these flags
593 * should never be passed and should only be set by ARC code. When adding new
594 * public flags, make sure not to smash the private ones.
595 */
596
597 #define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */
598 #define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */
599 #define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */
600 #define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */
601 #define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */
602 #define ARC_INDIRECT (1 << 14) /* this is an indirect block */
603 #define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */
604 #define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */
605 #define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */
606 #define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */
607
608 #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE)
609 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
610 #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR)
611 #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH)
612 #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ)
613 #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE)
614 #define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
615 #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE)
616 #define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \
617 (hdr)->b_l2hdr != NULL)
618 #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING)
619 #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED)
620 #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD)
621
622 /*
623 * Other sizes
624 */
625
626 #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
627 #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
628
629 /*
630 * Hash table routines
631 */
632
633 #define HT_LOCK_PAD 64
634
635 struct ht_lock {
636 kmutex_t ht_lock;
637 #ifdef _KERNEL
638 unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
639 #endif
640 };
641
642 #define BUF_LOCKS 256
643 typedef struct buf_hash_table {
644 uint64_t ht_mask;
645 arc_buf_hdr_t **ht_table;
646 struct ht_lock ht_locks[BUF_LOCKS];
647 } buf_hash_table_t;
648
649 static buf_hash_table_t buf_hash_table;
650
651 #define BUF_HASH_INDEX(spa, dva, birth) \
652 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
653 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
654 #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
655 #define HDR_LOCK(hdr) \
656 (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
657
658 uint64_t zfs_crc64_table[256];
659
660 /*
661 * Level 2 ARC
662 */
663
664 #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */
665 #define L2ARC_HEADROOM 2 /* num of writes */
666 /*
667 * If we discover during ARC scan any buffers to be compressed, we boost
668 * our headroom for the next scanning cycle by this percentage multiple.
669 */
670 #define L2ARC_HEADROOM_BOOST 200
671 #define L2ARC_FEED_SECS 1 /* caching interval secs */
672 #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */
673
674 #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
675 #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
676
677 /* L2ARC Performance Tunables */
678 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */
679 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
680 uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */
681 uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
682 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
683 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
684 boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
685 boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */
686 boolean_t l2arc_norw = B_TRUE; /* no reads during writes */
687
688 /*
689 * L2ARC Internals
690 */
691 typedef struct l2arc_dev l2arc_dev_t;
692 static list_t L2ARC_dev_list; /* device list */
693 static list_t *l2arc_dev_list; /* device list pointer */
694 static kmutex_t l2arc_dev_mtx; /* device list mutex */
695 static l2arc_dev_t *l2arc_dev_last; /* last device used */
696 static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */
697 static list_t L2ARC_free_on_write; /* free after write buf list */
698 static list_t *l2arc_free_on_write; /* free after write list ptr */
699 static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
700 static uint64_t l2arc_ndev; /* number of devices */
701
702 typedef struct l2arc_read_callback {
703 arc_buf_t *l2rcb_buf; /* read buffer */
704 spa_t *l2rcb_spa; /* spa */
705 blkptr_t l2rcb_bp; /* original blkptr */
706 zbookmark_t l2rcb_zb; /* original bookmark */
707 int l2rcb_flags; /* original flags */
708 enum zio_compress l2rcb_compress; /* applied compress */
709 } l2arc_read_callback_t;
710
711 typedef struct l2arc_write_callback {
712 l2arc_dev_t *l2wcb_dev; /* device info */
713 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
714 /* list of in-flight l2arc_log_blk_buf_t's */
715 list_t l2wcb_log_blk_buf_list;
716 } l2arc_write_callback_t;
717
718 struct l2arc_buf_hdr {
719 /* protected by arc_buf_hdr mutex */
720 l2arc_dev_t *b_dev; /* L2ARC device */
721 uint64_t b_daddr; /* disk address, offset byte */
722 /* compression applied to buffer data */
723 enum zio_compress b_compress;
724 /* real alloc'd buffer size depending on b_compress applied */
725 int b_asize;
726 /* temporary buffer holder for in-flight compressed data */
727 void *b_tmp_cdata;
728 };
729
730 typedef struct l2arc_data_free {
731 /* protected by l2arc_free_on_write_mtx */
732 void *l2df_data;
733 size_t l2df_size;
734 void (*l2df_func)(void *, size_t);
735 list_node_t l2df_list_node;
736 } l2arc_data_free_t;
737
738 static kmutex_t l2arc_feed_thr_lock;
739 static kcondvar_t l2arc_feed_thr_cv;
740 static uint8_t l2arc_thread_exit;
741
742 static void l2arc_read_done(zio_t *zio);
743 static void l2arc_hdr_stat_add(boolean_t from_arc);
744 static void l2arc_hdr_stat_remove(void);
745 static l2arc_dev_t *l2arc_vdev_get(vdev_t *vd);
746
747 static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
748 static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
749 enum zio_compress c);
750 static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
751
752 enum {
753 L2ARC_DEV_HDR_EVICT_FIRST = (1 << 0) /* mirror of l2ad_first */
754 };
755
756 /*
757 * Pointer used in persistent L2ARC (for pointing to log blocks & ARC buffers).
758 */
759 typedef struct l2arc_log_blk_ptr {
760 uint64_t l2lbp_daddr; /* device address of log */
761 /*
762 * l2lbp_prop is the same format as the blk_prop in blkptr_t:
763 * * logical size (in sectors)
764 * * physical (compressed) size (in sectors)
765 * * compression algorithm (we always LZ4-compress l2arc logs)
766 * * checksum algorithm (used for l2lbp_cksum)
767 * * object type & level (unused for now)
768 */
769 uint64_t l2lbp_prop;
770 zio_cksum_t l2lbp_cksum; /* fletcher4 of log */
771 } l2arc_log_blk_ptr_t;
772
773 /*
774 * The persistent L2ARC device header.
775 */
776 typedef struct l2arc_dev_hdr_phys {
777 uint64_t l2dh_magic;
778 zio_cksum_t l2dh_self_cksum; /* fletcher4 of fields below */
779
780 /*
781 * Global L2ARC device state and metadata.
782 */
783 uint64_t l2dh_spa_guid;
784 uint64_t l2dh_evict_tail; /* current evict pointer */
785 uint64_t l2dh_alloc_space; /* vdev space alloc status */
786 uint64_t l2dh_flags; /* l2arc_dev_hdr_flags_t */
787
788 /*
789 * Start of log block chain. [0] -> newest log, [1] -> one older (used
790 * for initiating prefetch).
791 */
792 l2arc_log_blk_ptr_t l2dh_start_lbps[2];
793
794 const uint64_t l2dh_pad[43]; /* pad to 512 bytes */
795 } l2arc_dev_hdr_phys_t;
796 CTASSERT(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE);
797
798 /*
799 * A single ARC buffer header entry in a l2arc_log_blk_phys_t.
800 */
801 typedef struct l2arc_log_ent_phys {
802 dva_t l2le_dva; /* dva of buffer */
803 uint64_t l2le_birth; /* birth txg of buffer */
804 uint64_t l2le_cksum0;
805 zio_cksum_t l2le_freeze_cksum;
806 /*
807 * l2le_prop is the same format as the blk_prop in blkptr_t:
808 * * logical size (in sectors)
809 * * physical (compressed) size (in sectors)
810 * * compression algorithm
811 * * checksum algorithm (used for cksum0)
812 * * object type & level (used to restore arc_buf_contents_t)
813 */
814 uint64_t l2le_prop;
815 uint64_t l2le_daddr; /* buf location on l2dev */
816 const uint64_t l2le_pad[6]; /* resv'd for future use */
817 } l2arc_log_ent_phys_t;
818
819 /*
820 * These design limits give us the following overhead (before compression):
821 * avg_blk_sz overhead
822 * 1k 12.51 %
823 * 2k 6.26 %
824 * 4k 3.13 %
825 * 8k 1.56 %
826 * 16k 0.78 %
827 * 32k 0.39 %
828 * 64k 0.20 %
829 * 128k 0.10 %
830 * Compression should be able to squeeze these down by about a factor of 2.
831 */
832 #define L2ARC_LOG_BLK_SIZE (128 * 1024) /* 128k */
833 #define L2ARC_LOG_BLK_HEADER_LEN (128)
834 #define L2ARC_LOG_BLK_ENTRIES /* 1023 entries */ \
835 ((L2ARC_LOG_BLK_SIZE - L2ARC_LOG_BLK_HEADER_LEN) / \
836 sizeof (l2arc_log_ent_phys_t))
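/*
 * The overhead table above follows directly from these constants: one 128k
 * log block describes L2ARC_LOG_BLK_ENTRIES (1023) buffers, so for an
 * average buffer size of avg_blk_sz the metadata overhead is
 *
 *	L2ARC_LOG_BLK_SIZE / (L2ARC_LOG_BLK_ENTRIES * avg_blk_sz)
 *
 * e.g. 131072 / (1023 * 1024) ~= 12.51% for 1k buffers and
 * 131072 / (1023 * 8192) ~= 1.56% for 8k buffers.
 */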
837 /*
838 * Maximum amount of data in an l2arc log block (used to terminate rebuilding
839 * before we hit the write head and restore potentially corrupted blocks).
840 */
841 #define L2ARC_LOG_BLK_MAX_PAYLOAD_SIZE \
842 (SPA_MAXBLOCKSIZE * L2ARC_LOG_BLK_ENTRIES)
843 /*
844 * For the persistency and rebuild algorithms to operate reliably we need
845 * the L2ARC device to at least be able to hold 3 full log blocks (otherwise
846 * excessive log block looping might confuse the log chain end detection).
847 * Under normal circumstances this is not a problem, since this is somewhere
848 * around only 400 MB.
849 */
850 #define L2ARC_PERSIST_MIN_SIZE (3 * L2ARC_LOG_BLK_MAX_PAYLOAD_SIZE)
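/*
 * With SPA_MAXBLOCKSIZE of 128k this works out to
 * 3 * 1023 * 131072 bytes ~= 402 MB, which is where the "around 400 MB"
 * figure above comes from.
 */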
851
852 /*
853 * A log block of up to 1023 ARC buffer log entries, chained into the
854 * persistent L2ARC metadata linked list.
855 */
856 typedef struct l2arc_log_blk_phys {
857 /* Header - see L2ARC_LOG_BLK_HEADER_LEN above */
858 uint64_t l2lb_magic;
859 l2arc_log_blk_ptr_t l2lb_back2_lbp; /* back 2 steps in chain */
860 uint64_t l2lb_pad[9]; /* resv'd for future use */
861 /* Payload */
862 l2arc_log_ent_phys_t l2lb_entries[L2ARC_LOG_BLK_ENTRIES];
863 } l2arc_log_blk_phys_t;
864
865 CTASSERT(sizeof (l2arc_log_blk_phys_t) == L2ARC_LOG_BLK_SIZE);
866 CTASSERT(offsetof(l2arc_log_blk_phys_t, l2lb_entries) -
867 offsetof(l2arc_log_blk_phys_t, l2lb_magic) == L2ARC_LOG_BLK_HEADER_LEN);
868
869 /*
870 * These structures hold in-flight l2arc_log_blk_phys_t's as they're being
871 * written to the L2ARC device. They may be compressed, hence the uint8_t[].
872 */
873 typedef struct l2arc_log_blk_buf {
874 uint8_t l2lbb_log_blk[sizeof (l2arc_log_blk_phys_t)];
875 list_node_t l2lbb_node;
876 } l2arc_log_blk_buf_t;
877
878 /* Macros for manipulating fields in the blk_prop format of blkptr_t */
879 #define BLKPROP_GET_LSIZE(_obj, _field) \
880 BF64_GET_SB((_obj)->_field, 0, 16, SPA_MINBLOCKSHIFT, 1)
881 #define BLKPROP_SET_LSIZE(_obj, _field, x) \
882 BF64_SET_SB((_obj)->_field, 0, 16, SPA_MINBLOCKSHIFT, 1, x)
883 #define BLKPROP_GET_PSIZE(_obj, _field) \
884 BF64_GET_SB((_obj)->_field, 16, 16, SPA_MINBLOCKSHIFT, 1)
885 #define BLKPROP_SET_PSIZE(_obj, _field, x) \
886 BF64_SET_SB((_obj)->_field, 16, 16, SPA_MINBLOCKSHIFT, 1, x)
887 #define BLKPROP_GET_COMPRESS(_obj, _field) \
888 BF64_GET((_obj)->_field, 32, 8)
889 #define BLKPROP_SET_COMPRESS(_obj, _field, x) \
890 BF64_SET((_obj)->_field, 32, 8, x)
891 #define BLKPROP_GET_CHECKSUM(_obj, _field) \
892 BF64_GET((_obj)->_field, 40, 8)
893 #define BLKPROP_SET_CHECKSUM(_obj, _field, x) \
894 BF64_SET((_obj)->_field, 40, 8, x)
895 #define BLKPROP_GET_TYPE(_obj, _field) \
896 BF64_GET((_obj)->_field, 48, 8)
897 #define BLKPROP_SET_TYPE(_obj, _field, x) \
898 BF64_SET((_obj)->_field, 48, 8, x)
899
900 /* Macros for manipulating a l2arc_log_blk_ptr_t->l2lbp_prop field */
901 #define LBP_GET_LSIZE(_add) BLKPROP_GET_LSIZE(_add, l2lbp_prop)
902 #define LBP_SET_LSIZE(_add, x) BLKPROP_SET_LSIZE(_add, l2lbp_prop, x)
903 #define LBP_GET_PSIZE(_add) BLKPROP_GET_PSIZE(_add, l2lbp_prop)
904 #define LBP_SET_PSIZE(_add, x) BLKPROP_SET_PSIZE(_add, l2lbp_prop, x)
905 #define LBP_GET_COMPRESS(_add) BLKPROP_GET_COMPRESS(_add, l2lbp_prop)
906 #define LBP_SET_COMPRESS(_add, x) BLKPROP_SET_COMPRESS(_add, l2lbp_prop, \
907 x)
908 #define LBP_GET_CHECKSUM(_add) BLKPROP_GET_CHECKSUM(_add, l2lbp_prop)
909 #define LBP_SET_CHECKSUM(_add, x) BLKPROP_SET_CHECKSUM(_add, l2lbp_prop, \
910 x)
911 #define LBP_GET_TYPE(_add) BLKPROP_GET_TYPE(_add, l2lbp_prop)
912 #define LBP_SET_TYPE(_add, x) BLKPROP_SET_TYPE(_add, l2lbp_prop, x)
913
914 /* Macros for manipulating a l2arc_log_ent_phys_t->l2le_prop field */
915 #define LE_GET_LSIZE(_le) BLKPROP_GET_LSIZE(_le, l2le_prop)
916 #define LE_SET_LSIZE(_le, x) BLKPROP_SET_LSIZE(_le, l2le_prop, x)
917 #define LE_GET_PSIZE(_le) BLKPROP_GET_PSIZE(_le, l2le_prop)
918 #define LE_SET_PSIZE(_le, x) BLKPROP_SET_PSIZE(_le, l2le_prop, x)
919 #define LE_GET_COMPRESS(_le) BLKPROP_GET_COMPRESS(_le, l2le_prop)
920 #define LE_SET_COMPRESS(_le, x) BLKPROP_SET_COMPRESS(_le, l2le_prop, x)
921 #define LE_GET_CHECKSUM(_le) BLKPROP_GET_CHECKSUM(_le, l2le_prop)
922 #define LE_SET_CHECKSUM(_le, x) BLKPROP_SET_CHECKSUM(_le, l2le_prop, x)
923 #define LE_GET_TYPE(_le) BLKPROP_GET_TYPE(_le, l2le_prop)
924 #define LE_SET_TYPE(_le, x) BLKPROP_SET_TYPE(_le, l2le_prop, x)
925
926 #define PTR_SWAP(x, y) \
927 do { \
928 void *tmp = (x);\
929 (x) = (y); \
930 (y) = tmp; \
931 _NOTE(CONSTCOND)\
932 } while (0)
933
934 #define L2ARC_DEV_HDR_MAGIC 0x12bab10c00000001LLU
935 #define L2ARC_LOG_BLK_MAGIC 0x120103b10c000001LLU
936 #define L2ARC_REBUILD_TIMEOUT 300 /* a rebuild may take at most 300s */
937
938 struct l2arc_dev {
939 vdev_t *l2ad_vdev; /* vdev */
940 spa_t *l2ad_spa; /* spa */
941 uint64_t l2ad_hand; /* next write location */
942 uint64_t l2ad_start; /* first addr on device */
943 uint64_t l2ad_end; /* last addr on device */
944 uint64_t l2ad_evict; /* last addr eviction reached */
945 boolean_t l2ad_first; /* first sweep through */
946 boolean_t l2ad_writing; /* currently writing */
947 list_t *l2ad_buflist; /* buffer list */
948 list_node_t l2ad_node; /* device list node */
949 l2arc_dev_hdr_phys_t l2ad_dev_hdr; /* persistent device header */
950 l2arc_log_blk_phys_t l2ad_log_blk; /* currently open log block */
951 int l2ad_log_ent_idx; /* index into cur log blk */
952 /* number of bytes in current log block's payload */
953 uint64_t l2ad_log_blk_payload_asize;
954 /* flag indicating whether a rebuild is scheduled or is going on */
955 boolean_t l2ad_rebuild;
956 };
957
958 /*
959 * Performance tuning of L2ARC persistency:
960 *
961 * l2arc_rebuild_enabled : Controls whether L2ARC device adds (either at
962 * pool import or when adding one manually later) will attempt
963 * to rebuild L2ARC buffer contents. In special circumstances,
964 * the administrator may want to set this to B_FALSE, if they
965 * are having trouble importing a pool or attaching an L2ARC
966 * device (e.g. the L2ARC device is slow to read in stored log
967 * metadata, or the metadata has become somehow
968 * fragmented/unusable).
969 * l2arc_rebuild_timeout : A hard timeout value on L2ARC rebuilding to help
970 * avoid a slow L2ARC device from preventing pool import. If we
971 * are not done rebuilding an L2ARC device by this time, we
972 * stop the rebuild and return immediately.
973 */
974 boolean_t l2arc_rebuild_enabled = B_TRUE;
975 uint64_t l2arc_rebuild_timeout = L2ARC_REBUILD_TIMEOUT;
976
977 /*
978 * L2ARC persistency rebuild routines.
979 */
980 static void l2arc_dev_rebuild_start(l2arc_dev_t *dev);
981 static int l2arc_rebuild(l2arc_dev_t *dev);
982 static void l2arc_log_blk_restore(l2arc_dev_t *dev, uint64_t load_guid,
983 l2arc_log_blk_phys_t *lb, uint64_t lb_psize);
984 static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le,
985 l2arc_dev_t *dev, uint64_t guid);
986
987 /*
988 * L2ARC persistency read I/O routines.
989 */
990 static int l2arc_dev_hdr_read(l2arc_dev_t *dev, l2arc_dev_hdr_phys_t *hdr);
991 static int l2arc_log_blk_read(l2arc_dev_t *dev,
992 const l2arc_log_blk_ptr_t *this_lp, const l2arc_log_blk_ptr_t *next_lp,
993 l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
994 uint8_t *this_lb_buf, uint8_t *next_lb_buf,
995 zio_t *this_io, zio_t **next_io);
996 static boolean_t l2arc_log_blk_ptr_valid(l2arc_dev_t *dev,
997 const l2arc_log_blk_ptr_t *lp);
998 static zio_t *l2arc_log_blk_prefetch(vdev_t *vd,
999 const l2arc_log_blk_ptr_t *lp, uint8_t *lb_buf);
1000 static void l2arc_log_blk_prefetch_abort(zio_t *zio);
1001
1002 /*
1003 * L2ARC persistency write I/O routines.
1004 */
1005 static void l2arc_dev_hdr_update(l2arc_dev_t *dev, zio_t *pio);
1006 static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
1007 l2arc_write_callback_t *cb);
1008
1009 /*
1010 * L2ARC persistency auxiliary routines.
1011 */
1012 static void l2arc_dev_hdr_checksum(const l2arc_dev_hdr_phys_t *hdr,
1013 zio_cksum_t *cksum);
1014 static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev,
1015 const arc_buf_hdr_t *ab);
1016 static inline boolean_t l2arc_range_check_overlap(uint64_t bottom,
1017 uint64_t top, uint64_t check);
1018 static boolean_t l2arc_check_rebuild_timeout_hit(int64_t deadline);
1019
1020 static inline uint64_t
1021 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
1022 {
1023 uint8_t *vdva = (uint8_t *)dva;
1024 uint64_t crc = -1ULL;
1025 int i;
1026
1027 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
1028
1029 for (i = 0; i < sizeof (dva_t); i++)
1030 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
1031
1032 crc ^= (spa>>8) ^ birth;
1033
1034 return (crc);
1035 }
1036
1037 #define BUF_EMPTY(buf) \
1038 ((buf)->b_dva.dva_word[0] == 0 && \
1039 (buf)->b_dva.dva_word[1] == 0 && \
1040 (buf)->b_birth == 0)
1041
1042 #define BUF_EQUAL(spa, dva, birth, buf) \
1043 ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
1044 ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
1045 ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
1046
1047 static void
1048 buf_discard_identity(arc_buf_hdr_t *hdr)
1049 {
1050 hdr->b_dva.dva_word[0] = 0;
1051 hdr->b_dva.dva_word[1] = 0;
1052 hdr->b_birth = 0;
1053 hdr->b_cksum0 = 0;
1054 }
1055
1056 static arc_buf_hdr_t *
1057 buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
1058 {
1059 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
1060 kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
1061 arc_buf_hdr_t *buf;
1062
1063 mutex_enter(hash_lock);
1064 for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
1065 buf = buf->b_hash_next) {
1066 if (BUF_EQUAL(spa, dva, birth, buf)) {
1067 *lockp = hash_lock;
1068 return (buf);
1069 }
1070 }
1071 mutex_exit(hash_lock);
1072 *lockp = NULL;
1073 return (NULL);
1074 }
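/*
 * Sketch of the usual caller pattern (the real read path does quite a bit
 * more work while the hash lock is held):
 *
 *	hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
 *	    &hash_lock);
 *	if (hdr != NULL) {
 *		... inspect/modify hdr under hash_lock ...
 *		mutex_exit(hash_lock);
 *	}
 */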
1075
1076 /*
1077 * Insert an entry into the hash table. If there is already an element
1078 * equal to the new entry in the hash table, then the existing element
1079 * will be returned and the new element will not be inserted.
1080 * Otherwise returns NULL.
1081 */
1082 static arc_buf_hdr_t *
1083 buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
1084 {
1085 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
1086 kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
1087 arc_buf_hdr_t *fbuf;
1088 uint32_t i;
1089
1090 ASSERT(!HDR_IN_HASH_TABLE(buf));
1091 *lockp = hash_lock;
1092 mutex_enter(hash_lock);
1093 for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
1094 fbuf = fbuf->b_hash_next, i++) {
1095 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
1096 return (fbuf);
1097 }
1098
1099 buf->b_hash_next = buf_hash_table.ht_table[idx];
1100 buf_hash_table.ht_table[idx] = buf;
1101 buf->b_flags |= ARC_IN_HASH_TABLE;
1102
1103 /* collect some hash table performance data */
1104 if (i > 0) {
1105 ARCSTAT_BUMP(arcstat_hash_collisions);
1106 if (i == 1)
1107 ARCSTAT_BUMP(arcstat_hash_chains);
1108
1109 ARCSTAT_MAX(arcstat_hash_chain_max, i);
1110 }
1111
1112 ARCSTAT_BUMP(arcstat_hash_elements);
1113 ARCSTAT_MAXSTAT(arcstat_hash_elements);
1114
1115 return (NULL);
1116 }
1117
1118 static void
1119 buf_hash_remove(arc_buf_hdr_t *buf)
1120 {
1121 arc_buf_hdr_t *fbuf, **bufp;
1122 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
1123
1124 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
1125 ASSERT(HDR_IN_HASH_TABLE(buf));
1126
1127 bufp = &buf_hash_table.ht_table[idx];
1128 while ((fbuf = *bufp) != buf) {
1129 ASSERT(fbuf != NULL);
1130 bufp = &fbuf->b_hash_next;
1131 }
1132 *bufp = buf->b_hash_next;
1133 buf->b_hash_next = NULL;
1134 buf->b_flags &= ~ARC_IN_HASH_TABLE;
1135
1136 /* collect some hash table performance data */
1137 ARCSTAT_BUMPDOWN(arcstat_hash_elements);
1138
1139 if (buf_hash_table.ht_table[idx] &&
1140 buf_hash_table.ht_table[idx]->b_hash_next == NULL)
1141 ARCSTAT_BUMPDOWN(arcstat_hash_chains);
1142 }
1143
1144 /*
1145 * Global data structures and functions for the buf kmem cache.
1146 */
1147 static kmem_cache_t *hdr_cache;
1148 static kmem_cache_t *buf_cache;
1149
1150 static void
1151 buf_fini(void)
1152 {
1153 int i;
1154
1155 kmem_free(buf_hash_table.ht_table,
1156 (buf_hash_table.ht_mask + 1) * sizeof (void *));
1157 for (i = 0; i < BUF_LOCKS; i++)
1158 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
1159 kmem_cache_destroy(hdr_cache);
1160 kmem_cache_destroy(buf_cache);
1161 }
1162
1163 /*
1164 * Constructor callback - called when the cache is empty
1165 * and a new buf is requested.
1166 */
1167 /* ARGSUSED */
1168 static int
1169 hdr_cons(void *vbuf, void *unused, int kmflag)
1170 {
1171 arc_buf_hdr_t *buf = vbuf;
1172
1173 bzero(buf, sizeof (arc_buf_hdr_t));
1174 refcount_create(&buf->b_refcnt);
1175 cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
1176 mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
1177 arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
1178
1179 return (0);
1180 }
1181
1182 /* ARGSUSED */
1183 static int
1184 buf_cons(void *vbuf, void *unused, int kmflag)
1185 {
1186 arc_buf_t *buf = vbuf;
1187
1188 bzero(buf, sizeof (arc_buf_t));
1189 mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
1190 arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1191
1192 return (0);
1193 }
1194
1195 /*
1196 * Destructor callback - called when a cached buf is
1197 * no longer required.
1198 */
1199 /* ARGSUSED */
1200 static void
1201 hdr_dest(void *vbuf, void *unused)
1202 {
1203 arc_buf_hdr_t *buf = vbuf;
1204
1205 ASSERT(BUF_EMPTY(buf));
1206 refcount_destroy(&buf->b_refcnt);
1207 cv_destroy(&buf->b_cv);
1208 mutex_destroy(&buf->b_freeze_lock);
1209 arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
1210 }
1211
1212 /* ARGSUSED */
1213 static void
1214 buf_dest(void *vbuf, void *unused)
1215 {
1216 arc_buf_t *buf = vbuf;
1217
1218 mutex_destroy(&buf->b_evict_lock);
1219 arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1220 }
1221
1222 /*
1223 * Reclaim callback -- invoked when memory is low.
1224 */
1225 /* ARGSUSED */
1226 static void
1227 hdr_recl(void *unused)
1228 {
1229 dprintf("hdr_recl called\n");
1230 /*
1231 * umem calls the reclaim func when we destroy the buf cache,
1232 * which is after we do arc_fini().
1233 */
1234 if (!arc_dead)
1235 cv_signal(&arc_reclaim_thr_cv);
1236 }
1237
1238 static void
1239 buf_init(void)
1240 {
1241 uint64_t *ct;
1242 uint64_t hsize = 1ULL << 12;
1243 int i, j;
1244
1245 /*
1246 * The hash table is big enough to fill all of physical memory
1247 * with an average 64K block size. The table will take up
1248 * totalmem*sizeof(void*)/64K (e.g. 128KB/GB with 8-byte pointers).
1249 */
1250 while (hsize * 65536 < physmem * PAGESIZE)
1251 hsize <<= 1;
1252 retry:
1253 buf_hash_table.ht_mask = hsize - 1;
1254 buf_hash_table.ht_table =
1255 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
1256 if (buf_hash_table.ht_table == NULL) {
1257 ASSERT(hsize > (1ULL << 8));
1258 hsize >>= 1;
1259 goto retry;
1260 }
1261
1262 hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
1263 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
1264 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
1265 0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
1266
1267 for (i = 0; i < 256; i++)
1268 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
1269 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
1270
1271 for (i = 0; i < BUF_LOCKS; i++) {
1272 mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
1273 NULL, MUTEX_DEFAULT, NULL);
1274 }
1275 }
1276
1277 #define ARC_MINTIME (hz>>4) /* 62 ms */
1278
1279 static void
1280 arc_cksum_verify(arc_buf_t *buf)
1281 {
1282 zio_cksum_t zc;
1283
1284 if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1285 return;
1286
1287 mutex_enter(&buf->b_hdr->b_freeze_lock);
1288 if (buf->b_hdr->b_freeze_cksum == NULL ||
1289 (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
1290 mutex_exit(&buf->b_hdr->b_freeze_lock);
1291 return;
1292 }
1293 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1294 if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
1295 panic("buffer modified while frozen!");
1296 mutex_exit(&buf->b_hdr->b_freeze_lock);
1297 }
1298
1299 static int
1300 arc_cksum_equal(arc_buf_t *buf)
1301 {
1302 zio_cksum_t zc;
1303 int equal;
1304
1305 mutex_enter(&buf->b_hdr->b_freeze_lock);
1306 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1307 equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
1308 mutex_exit(&buf->b_hdr->b_freeze_lock);
1309
1310 return (equal);
1311 }
1312
1313 static void
1314 arc_cksum_compute(arc_buf_t *buf, boolean_t force)
1315 {
1316 if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
1317 return;
1318
1319 mutex_enter(&buf->b_hdr->b_freeze_lock);
1320 if (buf->b_hdr->b_freeze_cksum != NULL) {
1321 mutex_exit(&buf->b_hdr->b_freeze_lock);
1322 return;
1323 }
1324 buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
1325 fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
1326 buf->b_hdr->b_freeze_cksum);
1327 mutex_exit(&buf->b_hdr->b_freeze_lock);
1328 arc_buf_watch(buf);
1329 }
1330
1331 #ifndef _KERNEL
1332 typedef struct procctl {
1333 long cmd;
1334 prwatch_t prwatch;
1335 } procctl_t;
1336 #endif
1337
1338 /* ARGSUSED */
1339 static void
1340 arc_buf_unwatch(arc_buf_t *buf)
1341 {
1342 #ifndef _KERNEL
1343 if (arc_watch) {
1344 int result;
1345 procctl_t ctl;
1346 ctl.cmd = PCWATCH;
1347 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1348 ctl.prwatch.pr_size = 0;
1349 ctl.prwatch.pr_wflags = 0;
1350 result = write(arc_procfd, &ctl, sizeof (ctl));
1351 ASSERT3U(result, ==, sizeof (ctl));
1352 }
1353 #endif
1354 }
1355
1356 /* ARGSUSED */
1357 static void
1358 arc_buf_watch(arc_buf_t *buf)
1359 {
1360 #ifndef _KERNEL
1361 if (arc_watch) {
1362 int result;
1363 procctl_t ctl;
1364 ctl.cmd = PCWATCH;
1365 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1366 ctl.prwatch.pr_size = buf->b_hdr->b_size;
1367 ctl.prwatch.pr_wflags = WA_WRITE;
1368 result = write(arc_procfd, &ctl, sizeof (ctl));
1369 ASSERT3U(result, ==, sizeof (ctl));
1370 }
1371 #endif
1372 }
1373
1374 void
1375 arc_buf_thaw(arc_buf_t *buf)
1376 {
1377 if (zfs_flags & ZFS_DEBUG_MODIFY) {
1378 if (buf->b_hdr->b_state != arc_anon)
1379 panic("modifying non-anon buffer!");
1380 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
1381 panic("modifying buffer while i/o in progress!");
1382 arc_cksum_verify(buf);
1383 }
1384
1385 mutex_enter(&buf->b_hdr->b_freeze_lock);
1386 if (buf->b_hdr->b_freeze_cksum != NULL) {
1387 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1388 buf->b_hdr->b_freeze_cksum = NULL;
1389 }
1390
1391 if (zfs_flags & ZFS_DEBUG_MODIFY) {
1392 if (buf->b_hdr->b_thawed)
1393 kmem_free(buf->b_hdr->b_thawed, 1);
1394 buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
1395 }
1396
1397 mutex_exit(&buf->b_hdr->b_freeze_lock);
1398
1399 arc_buf_unwatch(buf);
1400 }
1401
1402 void
1403 arc_buf_freeze(arc_buf_t *buf)
1404 {
1405 kmutex_t *hash_lock;
1406
1407 if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1408 return;
1409
1410 hash_lock = HDR_LOCK(buf->b_hdr);
1411 mutex_enter(hash_lock);
1412
1413 ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1414 buf->b_hdr->b_state == arc_anon);
1415 arc_cksum_compute(buf, B_FALSE);
1416 mutex_exit(hash_lock);
1417
1418 }
1419
1420 static void
1421 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1422 {
1423 ASSERT(MUTEX_HELD(hash_lock));
1424
1425 if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
1426 (ab->b_state != arc_anon)) {
1427 uint64_t delta = ab->b_size * ab->b_datacnt;
1428 list_t *list = &ab->b_state->arcs_list[ab->b_type];
1429 uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
1430
1431 ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
1432 mutex_enter(&ab->b_state->arcs_mtx);
1433 ASSERT(list_link_active(&ab->b_arc_node));
1434 list_remove(list, ab);
1435 if (GHOST_STATE(ab->b_state)) {
1436 ASSERT0(ab->b_datacnt);
1437 ASSERT3P(ab->b_buf, ==, NULL);
1438 delta = ab->b_size;
1439 }
1440 ASSERT(delta > 0);
1441 ASSERT3U(*size, >=, delta);
1442 atomic_add_64(size, -delta);
1443 mutex_exit(&ab->b_state->arcs_mtx);
1444 /* remove the prefetch flag if we get a reference */
1445 if (ab->b_flags & ARC_PREFETCH)
1446 ab->b_flags &= ~ARC_PREFETCH;
1447 }
1448 }
1449
1450 static int
1451 remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1452 {
1453 int cnt;
1454 arc_state_t *state = ab->b_state;
1455
1456 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1457 ASSERT(!GHOST_STATE(state));
1458
1459 if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
1460 (state != arc_anon)) {
1461 uint64_t *size = &state->arcs_lsize[ab->b_type];
1462
1463 ASSERT(!MUTEX_HELD(&state->arcs_mtx));
1464 mutex_enter(&state->arcs_mtx);
1465 ASSERT(!list_link_active(&ab->b_arc_node));
1466 list_insert_head(&state->arcs_list[ab->b_type], ab);
1467 ASSERT(ab->b_datacnt > 0);
1468 atomic_add_64(size, ab->b_size * ab->b_datacnt);
1469 mutex_exit(&state->arcs_mtx);
1470 }
1471 return (cnt);
1472 }
1473
1474 /*
1475 * Move the supplied buffer to the indicated state. The mutex
1476 * for the buffer must be held by the caller.
1477 */
1478 static void
1479 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
1480 {
1481 arc_state_t *old_state = ab->b_state;
1482 int64_t refcnt = refcount_count(&ab->b_refcnt);
1483 uint64_t from_delta, to_delta;
1484
1485 ASSERT(MUTEX_HELD(hash_lock));
1486 ASSERT3P(new_state, !=, old_state);
1487 ASSERT(refcnt == 0 || ab->b_datacnt > 0);
1488 ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
1489 ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
1490
1491 from_delta = to_delta = ab->b_datacnt * ab->b_size;
1492
1493 /*
1494 * If this buffer is evictable, transfer it from the
1495 * old state list to the new state list.
1496 */
1497 if (refcnt == 0) {
1498 if (old_state != arc_anon) {
1499 int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
1500 uint64_t *size = &old_state->arcs_lsize[ab->b_type];
1501
1502 if (use_mutex)
1503 mutex_enter(&old_state->arcs_mtx);
1504
1505 ASSERT(list_link_active(&ab->b_arc_node));
1506 list_remove(&old_state->arcs_list[ab->b_type], ab);
1507
1508 /*
1509 * If prefetching out of the ghost cache,
1510 * we will have a non-zero datacnt.
1511 */
1512 if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
1513 /* ghost elements have a ghost size */
1514 ASSERT(ab->b_buf == NULL);
1515 from_delta = ab->b_size;
1516 }
1517 ASSERT3U(*size, >=, from_delta);
1518 atomic_add_64(size, -from_delta);
1519
1520 if (use_mutex)
1521 mutex_exit(&old_state->arcs_mtx);
1522 }
1523 if (new_state != arc_anon) {
1524 int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
1525 uint64_t *size = &new_state->arcs_lsize[ab->b_type];
1526
1527 if (use_mutex)
1528 mutex_enter(&new_state->arcs_mtx);
1529
1530 list_insert_head(&new_state->arcs_list[ab->b_type], ab);
1531
1532 /* ghost elements have a ghost size */
1533 if (GHOST_STATE(new_state)) {
1534 ASSERT(ab->b_datacnt == 0);
1535 ASSERT(ab->b_buf == NULL);
1536 to_delta = ab->b_size;
1537 }
1538 atomic_add_64(size, to_delta);
1539
1540 if (use_mutex)
1541 mutex_exit(&new_state->arcs_mtx);
1542 }
1543 }
1544
1545 ASSERT(!BUF_EMPTY(ab));
1546 if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
1547 buf_hash_remove(ab);
1548
1549 /* adjust state sizes */
1550 if (to_delta)
1551 atomic_add_64(&new_state->arcs_size, to_delta);
1552 if (from_delta) {
1553 ASSERT3U(old_state->arcs_size, >=, from_delta);
1554 atomic_add_64(&old_state->arcs_size, -from_delta);
1555 }
1556 ab->b_state = new_state;
1557
1558 /* adjust l2arc hdr stats */
1559 if (new_state == arc_l2c_only)
1560 l2arc_hdr_stat_add(old_state != arc_anon);
1561 else if (old_state == arc_l2c_only)
1562 l2arc_hdr_stat_remove();
1563 }
1564
1565 void
1566 arc_space_consume(uint64_t space, arc_space_type_t type)
1567 {
1568 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1569
1570 switch (type) {
1571 case ARC_SPACE_DATA:
1572 ARCSTAT_INCR(arcstat_data_size, space);
1573 break;
1574 case ARC_SPACE_OTHER:
1575 ARCSTAT_INCR(arcstat_other_size, space);
1576 break;
1577 case ARC_SPACE_HDRS:
1578 ARCSTAT_INCR(arcstat_hdr_size, space);
1579 break;
1580 case ARC_SPACE_L2HDRS:
1581 ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1582 break;
1583 }
1584
1585 ARCSTAT_INCR(arcstat_meta_used, space);
1586 atomic_add_64(&arc_size, space);
1587 }
1588
1589 void
1590 arc_space_return(uint64_t space, arc_space_type_t type)
1591 {
1592 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1593
1594 switch (type) {
1595 case ARC_SPACE_DATA:
1596 ARCSTAT_INCR(arcstat_data_size, -space);
1597 break;
1598 case ARC_SPACE_OTHER:
1599 ARCSTAT_INCR(arcstat_other_size, -space);
1600 break;
1601 case ARC_SPACE_HDRS:
1602 ARCSTAT_INCR(arcstat_hdr_size, -space);
1603 break;
1604 case ARC_SPACE_L2HDRS:
1605 ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1606 break;
1607 }
1608
1609 ASSERT(arc_meta_used >= space);
1610 if (arc_meta_max < arc_meta_used)
1611 arc_meta_max = arc_meta_used;
1612 ARCSTAT_INCR(arcstat_meta_used, -space);
1613 ASSERT(arc_size >= space);
1614 atomic_add_64(&arc_size, -space);
1615 }
1616
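/*
 * Allocate a raw data buffer that is charged against arc_size but is not
 * tracked by an arc_buf_hdr_t.  If the cache is already over its target,
 * poke the reclaim thread first so space can be made available.
 * arc_data_buf_free() undoes the accounting.
 */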
1617 void *
1618 arc_data_buf_alloc(uint64_t size)
1619 {
1620 if (arc_evict_needed(ARC_BUFC_DATA))
1621 cv_signal(&arc_reclaim_thr_cv);
1622 atomic_add_64(&arc_size, size);
1623 return (zio_data_buf_alloc(size));
1624 }
1625
1626 void
1627 arc_data_buf_free(void *buf, uint64_t size)
1628 {
1629 zio_data_buf_free(buf, size);
1630 ASSERT(arc_size >= size);
1631 atomic_add_64(&arc_size, -size);
1632 }
1633
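/*
 * Allocate a new anonymous ARC buffer of `size' bytes for the given pool.
 * The buffer starts life with a single reference held by `tag'; the caller
 * eventually drops that hold with arc_buf_remove_ref() (or arc_buf_free()).
 *
 * A minimal usage sketch (hypothetical caller, error handling omitted):
 *
 *	arc_buf_t *buf = arc_buf_alloc(spa, size, FTAG, ARC_BUFC_DATA);
 *	bcopy(src, buf->b_data, size);
 *	...
 *	VERIFY(arc_buf_remove_ref(buf, FTAG));
 */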
1634 arc_buf_t *
1635 arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
1636 {
1637 arc_buf_hdr_t *hdr;
1638 arc_buf_t *buf;
1639
1640 ASSERT3U(size, >, 0);
1641 hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1642 ASSERT(BUF_EMPTY(hdr));
1643 hdr->b_size = size;
1644 hdr->b_type = type;
1645 hdr->b_spa = spa_load_guid(spa);
1646 hdr->b_state = arc_anon;
1647 hdr->b_arc_access = 0;
1648 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1649 buf->b_hdr = hdr;
1650 buf->b_data = NULL;
1651 buf->b_efunc = NULL;
1652 buf->b_private = NULL;
1653 buf->b_next = NULL;
1654 hdr->b_buf = buf;
1655 arc_get_data_buf(buf);
1656 hdr->b_datacnt = 1;
1657 hdr->b_flags = 0;
1658 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1659 (void) refcount_add(&hdr->b_refcnt, tag);
1660
1661 return (buf);
1662 }
1663
1664 /*
1665 * Allocates an empty arc_buf_hdr structure (lacking any data buffer).
1666 * This is used during l2arc reconstruction to make empty ARC buffers
1667 * which circumvent the regular disk->arc->l2arc path and instead come
1668 * into being in the reverse order, i.e. l2arc->arc->(disk).
1669 */
1670 arc_buf_hdr_t *
1671 arc_buf_hdr_alloc(uint64_t guid, int size, arc_buf_contents_t type)
1672 {
1673 arc_buf_hdr_t *hdr;
1674
1675 ASSERT3U(size, >, 0);
1676 hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
1677 ASSERT(BUF_EMPTY(hdr));
1678 hdr->b_size = size;
1679 hdr->b_type = type;
1680 hdr->b_spa = guid;
1681 hdr->b_state = arc_anon;
1682 hdr->b_arc_access = 0;
1683 hdr->b_buf = NULL;
1684 hdr->b_datacnt = 0;
1685 hdr->b_flags = 0;
1686 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1687
1688 return (hdr);
1689 }
1690
1691 static char *arc_onloan_tag = "onloan";
1692
1693 /*
1694	 * Loan out an anonymous arc buffer. Loaned buffers are not counted as
1695	 * in-flight data by arc_tempreserve_space() until they are "returned". Loaned
1696 * buffers must be returned to the arc before they can be used by the DMU or
1697 * freed.
1698 */
1699 arc_buf_t *
1700 arc_loan_buf(spa_t *spa, int size)
1701 {
1702 arc_buf_t *buf;
1703
1704 buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1705
1706 atomic_add_64(&arc_loaned_bytes, size);
1707 return (buf);
1708 }
1709
1710 /*
1711 * Return a loaned arc buffer to the arc.
1712 */
1713 void
1714 arc_return_buf(arc_buf_t *buf, void *tag)
1715 {
1716 arc_buf_hdr_t *hdr = buf->b_hdr;
1717
1718 ASSERT(buf->b_data != NULL);
1719 (void) refcount_add(&hdr->b_refcnt, tag);
1720 (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
1721
1722 atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1723 }
1724
1725 /* Detach an arc_buf from a dbuf (tag) */
1726 void
1727 arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1728 {
1729 arc_buf_hdr_t *hdr;
1730
1731 ASSERT(buf->b_data != NULL);
1732 hdr = buf->b_hdr;
1733 (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
1734 (void) refcount_remove(&hdr->b_refcnt, tag);
1735 buf->b_efunc = NULL;
1736 buf->b_private = NULL;
1737
1738 atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1739 }
1740
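/*
 * Create an additional arc_buf_t that shares the header (and identity) of
 * `from' but owns a private copy of the data, so that each consumer can
 * arc_release() its copy independently.
 */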
1741 static arc_buf_t *
1742 arc_buf_clone(arc_buf_t *from)
1743 {
1744 arc_buf_t *buf;
1745 arc_buf_hdr_t *hdr = from->b_hdr;
1746 uint64_t size = hdr->b_size;
1747
1748 ASSERT(hdr->b_state != arc_anon);
1749
1750 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1751 buf->b_hdr = hdr;
1752 buf->b_data = NULL;
1753 buf->b_efunc = NULL;
1754 buf->b_private = NULL;
1755 buf->b_next = hdr->b_buf;
1756 hdr->b_buf = buf;
1757 arc_get_data_buf(buf);
1758 bcopy(from->b_data, buf->b_data, size);
1759
1760 /*
1761 * This buffer already exists in the arc so create a duplicate
1762 * copy for the caller. If the buffer is associated with user data
1763 * then track the size and number of duplicates. These stats will be
1764 * updated as duplicate buffers are created and destroyed.
1765 */
1766 if (hdr->b_type == ARC_BUFC_DATA) {
1767 ARCSTAT_BUMP(arcstat_duplicate_buffers);
1768 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
1769 }
1770 hdr->b_datacnt += 1;
1771 return (buf);
1772 }
1773
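/*
 * Take an additional hold on a cached buffer on behalf of `tag' and treat
 * the access as a cache hit.  If the buffer has already been evicted
 * (b_data == NULL) this is a no-op, so callers must re-check b_data to
 * know whether the add_ref succeeded.
 */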
1774 void
1775 arc_buf_add_ref(arc_buf_t *buf, void* tag)
1776 {
1777 arc_buf_hdr_t *hdr;
1778 kmutex_t *hash_lock;
1779
1780 /*
1781 * Check to see if this buffer is evicted. Callers
1782 * must verify b_data != NULL to know if the add_ref
1783 * was successful.
1784 */
1785 mutex_enter(&buf->b_evict_lock);
1786 if (buf->b_data == NULL) {
1787 mutex_exit(&buf->b_evict_lock);
1788 return;
1789 }
1790 hash_lock = HDR_LOCK(buf->b_hdr);
1791 mutex_enter(hash_lock);
1792 hdr = buf->b_hdr;
1793 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1794 mutex_exit(&buf->b_evict_lock);
1795
1796 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
1797 add_reference(hdr, hash_lock, tag);
1798 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1799 arc_access(hdr, hash_lock);
1800 mutex_exit(hash_lock);
1801 ARCSTAT_BUMP(arcstat_hits);
1802 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
1803 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
1804 data, metadata, hits);
1805 }
1806
1807 /*
1808 * Free the arc data buffer. If it is an l2arc write in progress,
1809 * the buffer is placed on l2arc_free_on_write to be freed later.
1810 */
1811 static void
1812 arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
1813 {
1814 arc_buf_hdr_t *hdr = buf->b_hdr;
1815
1816 if (HDR_L2_WRITING(hdr)) {
1817 l2arc_data_free_t *df;
1818 df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
1819 df->l2df_data = buf->b_data;
1820 df->l2df_size = hdr->b_size;
1821 df->l2df_func = free_func;
1822 mutex_enter(&l2arc_free_on_write_mtx);
1823 list_insert_head(l2arc_free_on_write, df);
1824 mutex_exit(&l2arc_free_on_write_mtx);
1825 ARCSTAT_BUMP(arcstat_l2_free_on_write);
1826 } else {
1827 free_func(buf->b_data, hdr->b_size);
1828 }
1829 }
1830
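/*
 * Tear down a single arc_buf_t: free (or recycle) its data and fix up the
 * state size accounting; if `all' is set, also unlink the buf from its
 * header and free the arc_buf_t itself.
 */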
1831 static void
1832 arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
1833 {
1834 arc_buf_t **bufp;
1835
1836 /* free up data associated with the buf */
1837 if (buf->b_data) {
1838 arc_state_t *state = buf->b_hdr->b_state;
1839 uint64_t size = buf->b_hdr->b_size;
1840 arc_buf_contents_t type = buf->b_hdr->b_type;
1841
1842 arc_cksum_verify(buf);
1843 arc_buf_unwatch(buf);
1844
1845 if (!recycle) {
1846 if (type == ARC_BUFC_METADATA) {
1847 arc_buf_data_free(buf, zio_buf_free);
1848 arc_space_return(size, ARC_SPACE_DATA);
1849 } else {
1850 ASSERT(type == ARC_BUFC_DATA);
1851 arc_buf_data_free(buf, zio_data_buf_free);
1852 ARCSTAT_INCR(arcstat_data_size, -size);
1853 atomic_add_64(&arc_size, -size);
1854 }
1855 }
1856 if (list_link_active(&buf->b_hdr->b_arc_node)) {
1857 uint64_t *cnt = &state->arcs_lsize[type];
1858
1859 ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
1860 ASSERT(state != arc_anon);
1861
1862 ASSERT3U(*cnt, >=, size);
1863 atomic_add_64(cnt, -size);
1864 }
1865 ASSERT3U(state->arcs_size, >=, size);
1866 atomic_add_64(&state->arcs_size, -size);
1867 buf->b_data = NULL;
1868
1869 /*
1870 * If we're destroying a duplicate buffer make sure
1871 * that the appropriate statistics are updated.
1872 */
1873 if (buf->b_hdr->b_datacnt > 1 &&
1874 buf->b_hdr->b_type == ARC_BUFC_DATA) {
1875 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
1876 ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
1877 }
1878 ASSERT(buf->b_hdr->b_datacnt > 0);
1879 buf->b_hdr->b_datacnt -= 1;
1880 }
1881
1882 /* only remove the buf if requested */
1883 if (!all)
1884 return;
1885
1886 /* remove the buf from the hdr list */
1887 for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
1888 continue;
1889 *bufp = buf->b_next;
1890 buf->b_next = NULL;
1891
1892 ASSERT(buf->b_efunc == NULL);
1893
1894 /* clean up the buf */
1895 buf->b_hdr = NULL;
1896 kmem_cache_free(buf_cache, buf);
1897 }
1898
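/*
 * Free an anonymous, unreferenced header along with its remaining bufs,
 * L2ARC metadata and freeze checksum.  Bufs that still have an eviction
 * callback registered are parked on arc_eviction_list instead of being
 * freed outright, so the callback can run later in arc_do_user_evicts().
 */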
1899 static void
1900 arc_hdr_destroy(arc_buf_hdr_t *hdr)
1901 {
1902 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1903 ASSERT3P(hdr->b_state, ==, arc_anon);
1904 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1905 l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1906
1907 if (l2hdr != NULL) {
1908 boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
1909 /*
1910 * To prevent arc_free() and l2arc_evict() from
1911 * attempting to free the same buffer at the same time,
1912 * a FREE_IN_PROGRESS flag is given to arc_free() to
1913 * give it priority. l2arc_evict() can't destroy this
1914 * header while we are waiting on l2arc_buflist_mtx.
1915 *
1916 * The hdr may be removed from l2ad_buflist before we
1917 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1918 */
1919 if (!buflist_held) {
1920 mutex_enter(&l2arc_buflist_mtx);
1921 l2hdr = hdr->b_l2hdr;
1922 }
1923
1924 if (l2hdr != NULL) {
1925 list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
1926 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1927 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
1928 kmem_free(l2hdr, sizeof (*l2hdr));
1929 if (hdr->b_state == arc_l2c_only)
1930 l2arc_hdr_stat_remove();
1931 hdr->b_l2hdr = NULL;
1932 }
1933
1934 if (!buflist_held)
1935 mutex_exit(&l2arc_buflist_mtx);
1936 }
1937
1938 if (!BUF_EMPTY(hdr)) {
1939 ASSERT(!HDR_IN_HASH_TABLE(hdr));
1940 buf_discard_identity(hdr);
1941 }
1942 while (hdr->b_buf) {
1943 arc_buf_t *buf = hdr->b_buf;
1944
1945 if (buf->b_efunc) {
1946 mutex_enter(&arc_eviction_mtx);
1947 mutex_enter(&buf->b_evict_lock);
1948 ASSERT(buf->b_hdr != NULL);
1949 arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1950 hdr->b_buf = buf->b_next;
1951 buf->b_hdr = &arc_eviction_hdr;
1952 buf->b_next = arc_eviction_list;
1953 arc_eviction_list = buf;
1954 mutex_exit(&buf->b_evict_lock);
1955 mutex_exit(&arc_eviction_mtx);
1956 } else {
1957 arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1958 }
1959 }
1960 if (hdr->b_freeze_cksum != NULL) {
1961 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1962 hdr->b_freeze_cksum = NULL;
1963 }
1964 if (hdr->b_thawed) {
1965 kmem_free(hdr->b_thawed, 1);
1966 hdr->b_thawed = NULL;
1967 }
1968
1969 ASSERT(!list_link_active(&hdr->b_arc_node));
1970 ASSERT3P(hdr->b_hash_next, ==, NULL);
1971 ASSERT3P(hdr->b_acb, ==, NULL);
1972 kmem_cache_free(hdr_cache, hdr);
1973 }
1974
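/*
 * Release the `tag' hold on `buf' and destroy it.  For hashed (cached)
 * headers the data may simply be marked ARC_BUF_AVAILABLE for later reuse;
 * for anonymous buffers the header itself is destroyed, deferred if an
 * async write is still in progress.
 */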
1975 void
1976 arc_buf_free(arc_buf_t *buf, void *tag)
1977 {
1978 arc_buf_hdr_t *hdr = buf->b_hdr;
1979 int hashed = hdr->b_state != arc_anon;
1980
1981 ASSERT(buf->b_efunc == NULL);
1982 ASSERT(buf->b_data != NULL);
1983
1984 if (hashed) {
1985 kmutex_t *hash_lock = HDR_LOCK(hdr);
1986
1987 mutex_enter(hash_lock);
1988 hdr = buf->b_hdr;
1989 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1990
1991 (void) remove_reference(hdr, hash_lock, tag);
1992 if (hdr->b_datacnt > 1) {
1993 arc_buf_destroy(buf, FALSE, TRUE);
1994 } else {
1995 ASSERT(buf == hdr->b_buf);
1996 ASSERT(buf->b_efunc == NULL);
1997 hdr->b_flags |= ARC_BUF_AVAILABLE;
1998 }
1999 mutex_exit(hash_lock);
2000 } else if (HDR_IO_IN_PROGRESS(hdr)) {
2001 int destroy_hdr;
2002 /*
2003 * We are in the middle of an async write. Don't destroy
2004 * this buffer unless the write completes before we finish
2005 * decrementing the reference count.
2006 */
2007 mutex_enter(&arc_eviction_mtx);
2008 (void) remove_reference(hdr, NULL, tag);
2009 ASSERT(refcount_is_zero(&hdr->b_refcnt));
2010 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
2011 mutex_exit(&arc_eviction_mtx);
2012 if (destroy_hdr)
2013 arc_hdr_destroy(hdr);
2014 } else {
2015 if (remove_reference(hdr, NULL, tag) > 0)
2016 arc_buf_destroy(buf, FALSE, TRUE);
2017 else
2018 arc_hdr_destroy(hdr);
2019 }
2020 }
2021
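/*
 * Drop the `tag' hold on `buf'.  Anonymous buffers are freed immediately;
 * cached buffers either lose one of their duplicate copies or are marked
 * ARC_BUF_AVAILABLE so the data can be reclaimed by eviction.  Returns
 * B_TRUE if no eviction callback was registered on the buffer.
 */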
2022 boolean_t
2023 arc_buf_remove_ref(arc_buf_t *buf, void* tag)
2024 {
2025 arc_buf_hdr_t *hdr = buf->b_hdr;
2026 kmutex_t *hash_lock = HDR_LOCK(hdr);
2027 boolean_t no_callback = (buf->b_efunc == NULL);
2028
2029 if (hdr->b_state == arc_anon) {
2030 ASSERT(hdr->b_datacnt == 1);
2031 arc_buf_free(buf, tag);
2032 return (no_callback);
2033 }
2034
2035 mutex_enter(hash_lock);
2036 hdr = buf->b_hdr;
2037 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
2038 ASSERT(hdr->b_state != arc_anon);
2039 ASSERT(buf->b_data != NULL);
2040
2041 (void) remove_reference(hdr, hash_lock, tag);
2042 if (hdr->b_datacnt > 1) {
2043 if (no_callback)
2044 arc_buf_destroy(buf, FALSE, TRUE);
2045 } else if (no_callback) {
2046 ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
2047 ASSERT(buf->b_efunc == NULL);
2048 hdr->b_flags |= ARC_BUF_AVAILABLE;
2049 }
2050 ASSERT(no_callback || hdr->b_datacnt > 1 ||
2051 refcount_is_zero(&hdr->b_refcnt));
2052 mutex_exit(hash_lock);
2053 return (no_callback);
2054 }
2055
2056 int
2057 arc_buf_size(arc_buf_t *buf)
2058 {
2059 return (buf->b_hdr->b_size);
2060 }
2061
2062 /*
2063 * Called from the DMU to determine if the current buffer should be
2064 * evicted. In order to ensure proper locking, the eviction must be initiated
2065 * from the DMU. Return true if the buffer is associated with user data and
2066 * duplicate buffers still exist.
2067 */
2068 boolean_t
2069 arc_buf_eviction_needed(arc_buf_t *buf)
2070 {
2071 arc_buf_hdr_t *hdr;
2072 boolean_t evict_needed = B_FALSE;
2073
2074 if (zfs_disable_dup_eviction)
2075 return (B_FALSE);
2076
2077 mutex_enter(&buf->b_evict_lock);
2078 hdr = buf->b_hdr;
2079 if (hdr == NULL) {
2080 /*
2081 * We are in arc_do_user_evicts(); let that function
2082 * perform the eviction.
2083 */
2084 ASSERT(buf->b_data == NULL);
2085 mutex_exit(&buf->b_evict_lock);
2086 return (B_FALSE);
2087 } else if (buf->b_data == NULL) {
2088 /*
2089 * We have already been added to the arc eviction list;
2090 * recommend eviction.
2091 */
2092 ASSERT3P(hdr, ==, &arc_eviction_hdr);
2093 mutex_exit(&buf->b_evict_lock);
2094 return (B_TRUE);
2095 }
2096
2097 if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
2098 evict_needed = B_TRUE;
2099
2100 mutex_exit(&buf->b_evict_lock);
2101 return (evict_needed);
2102 }
2103
2104 /*
2105 * Evict buffers from list until we've removed the specified number of
2106 * bytes. Move the removed buffers to the appropriate evict state.
2107 * If the recycle flag is set, then attempt to "recycle" a buffer:
2108 * - look for a buffer to evict that is `bytes' long.
2109 * - return the data block from this buffer rather than freeing it.
2110 * This flag is used by callers that are trying to make space for a
2111 * new buffer in a full arc cache.
2112 *
2113 * This function makes a "best effort". It skips over any buffers
2114 * it can't get a hash_lock on, and so may not catch all candidates.
2115 * It may also return without evicting as much space as requested.
2116 */
2117 static void *
2118 arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
2119 arc_buf_contents_t type)
2120 {
2121 arc_state_t *evicted_state;
2122 uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
2123 arc_buf_hdr_t *ab, *ab_prev = NULL;
2124 list_t *list = &state->arcs_list[type];
2125 kmutex_t *hash_lock;
2126 boolean_t have_lock;
2127 void *stolen = NULL;
2128 arc_buf_hdr_t marker = { 0 };
2129 int count = 0;
2130
2131 ASSERT(state == arc_mru || state == arc_mfu);
2132
2133 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
2134
2135 mutex_enter(&state->arcs_mtx);
2136 mutex_enter(&evicted_state->arcs_mtx);
2137
2138 for (ab = list_tail(list); ab; ab = ab_prev) {
2139 ab_prev = list_prev(list, ab);
2140 /* prefetch buffers have a minimum lifespan */
2141 if (HDR_IO_IN_PROGRESS(ab) ||
2142 (spa && ab->b_spa != spa) ||
2143 (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
2144 ddi_get_lbolt() - ab->b_arc_access <
2145 arc_min_prefetch_lifespan)) {
2146 skipped++;
2147 continue;
2148 }
2149 /* "lookahead" for better eviction candidate */
2150 if (recycle && ab->b_size != bytes &&
2151 ab_prev && ab_prev->b_size == bytes)
2152 continue;
2153
2154 /* ignore markers */
2155 if (ab->b_spa == 0)
2156 continue;
2157
2158 /*
2159 * It may take a long time to evict all the bufs requested.
2160 * To avoid blocking all arc activity, periodically drop
2161 * the arcs_mtx and give other threads a chance to run
2162 * before reacquiring the lock.
2163 *
2164 * If we are looking for a buffer to recycle, we are in
2165 * the hot code path, so don't sleep.
2166 */
2167 if (!recycle && count++ > arc_evict_iterations) {
2168 list_insert_after(list, ab, &marker);
2169 mutex_exit(&evicted_state->arcs_mtx);
2170 mutex_exit(&state->arcs_mtx);
2171 kpreempt(KPREEMPT_SYNC);
2172 mutex_enter(&state->arcs_mtx);
2173 mutex_enter(&evicted_state->arcs_mtx);
2174 ab_prev = list_prev(list, &marker);
2175 list_remove(list, &marker);
2176 count = 0;
2177 continue;
2178 }
2179
2180 hash_lock = HDR_LOCK(ab);
2181 have_lock = MUTEX_HELD(hash_lock);
2182 if (have_lock || mutex_tryenter(hash_lock)) {
2183 ASSERT0(refcount_count(&ab->b_refcnt));
2184 ASSERT(ab->b_datacnt > 0);
2185 while (ab->b_buf) {
2186 arc_buf_t *buf = ab->b_buf;
2187 if (!mutex_tryenter(&buf->b_evict_lock)) {
2188 missed += 1;
2189 break;
2190 }
2191 if (buf->b_data) {
2192 bytes_evicted += ab->b_size;
2193 if (recycle && ab->b_type == type &&
2194 ab->b_size == bytes &&
2195 !HDR_L2_WRITING(ab)) {
2196 stolen = buf->b_data;
2197 recycle = FALSE;
2198 }
2199 }
2200 if (buf->b_efunc) {
2201 mutex_enter(&arc_eviction_mtx);
2202 arc_buf_destroy(buf,
2203 buf->b_data == stolen, FALSE);
2204 ab->b_buf = buf->b_next;
2205 buf->b_hdr = &arc_eviction_hdr;
2206 buf->b_next = arc_eviction_list;
2207 arc_eviction_list = buf;
2208 mutex_exit(&arc_eviction_mtx);
2209 mutex_exit(&buf->b_evict_lock);
2210 } else {
2211 mutex_exit(&buf->b_evict_lock);
2212 arc_buf_destroy(buf,
2213 buf->b_data == stolen, TRUE);
2214 }
2215 }
2216
2217 if (ab->b_l2hdr) {
2218 ARCSTAT_INCR(arcstat_evict_l2_cached,
2219 ab->b_size);
2220 } else {
2221 if (l2arc_write_eligible(ab->b_spa, ab)) {
2222 ARCSTAT_INCR(arcstat_evict_l2_eligible,
2223 ab->b_size);
2224 } else {
2225 ARCSTAT_INCR(
2226 arcstat_evict_l2_ineligible,
2227 ab->b_size);
2228 }
2229 }
2230
2231 if (ab->b_datacnt == 0) {
2232 arc_change_state(evicted_state, ab, hash_lock);
2233 ASSERT(HDR_IN_HASH_TABLE(ab));
2234 ab->b_flags |= ARC_IN_HASH_TABLE;
2235 ab->b_flags &= ~ARC_BUF_AVAILABLE;
2236 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
2237 }
2238 if (!have_lock)
2239 mutex_exit(hash_lock);
2240 if (bytes >= 0 && bytes_evicted >= bytes)
2241 break;
2242 } else {
2243 missed += 1;
2244 }
2245 }
2246
2247 mutex_exit(&evicted_state->arcs_mtx);
2248 mutex_exit(&state->arcs_mtx);
2249
2250 if (bytes_evicted < bytes)
2251 dprintf("only evicted %lld bytes from %x",
2252 (longlong_t)bytes_evicted, state);
2253
2254 if (skipped)
2255 ARCSTAT_INCR(arcstat_evict_skip, skipped);
2256
2257 if (missed)
2258 ARCSTAT_INCR(arcstat_mutex_miss, missed);
2259
2260 /*
2261 * Note: we have just evicted some data into the ghost state,
2262 * potentially putting the ghost size over the desired size. Rather
2263	 * than evicting from the ghost list in this hot code path, leave
2264 * this chore to the arc_reclaim_thread().
2265 */
2266
2267 return (stolen);
2268 }
2269
2270 /*
2271 * Remove buffers from list until we've removed the specified number of
2272 * bytes. Destroy the buffers that are removed.
2273 */
2274 static void
2275 arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
2276 {
2277 arc_buf_hdr_t *ab, *ab_prev;
2278 arc_buf_hdr_t marker = { 0 };
2279 list_t *list = &state->arcs_list[ARC_BUFC_DATA];
2280 kmutex_t *hash_lock;
2281 uint64_t bytes_deleted = 0;
2282 uint64_t bufs_skipped = 0;
2283 int count = 0;
2284
2285 ASSERT(GHOST_STATE(state));
2286 top:
2287 mutex_enter(&state->arcs_mtx);
2288 for (ab = list_tail(list); ab; ab = ab_prev) {
2289 ab_prev = list_prev(list, ab);
2290 if (ab->b_type > ARC_BUFC_NUMTYPES)
2291 panic("invalid ab=%p", (void *)ab);
2292 if (spa && ab->b_spa != spa)
2293 continue;
2294
2295 /* ignore markers */
2296 if (ab->b_spa == 0)
2297 continue;
2298
2299 hash_lock = HDR_LOCK(ab);
2300 /* caller may be trying to modify this buffer, skip it */
2301 if (MUTEX_HELD(hash_lock))
2302 continue;
2303
2304 /*
2305 * It may take a long time to evict all the bufs requested.
2306 * To avoid blocking all arc activity, periodically drop
2307 * the arcs_mtx and give other threads a chance to run
2308 * before reacquiring the lock.
2309 */
2310 if (count++ > arc_evict_iterations) {
2311 list_insert_after(list, ab, &marker);
2312 mutex_exit(&state->arcs_mtx);
2313 kpreempt(KPREEMPT_SYNC);
2314 mutex_enter(&state->arcs_mtx);
2315 ab_prev = list_prev(list, &marker);
2316 list_remove(list, &marker);
2317 count = 0;
2318 continue;
2319 }
2320 if (mutex_tryenter(hash_lock)) {
2321 ASSERT(!HDR_IO_IN_PROGRESS(ab));
2322 ASSERT(ab->b_buf == NULL);
2323 ARCSTAT_BUMP(arcstat_deleted);
2324 bytes_deleted += ab->b_size;
2325
2326 if (ab->b_l2hdr != NULL) {
2327 /*
2328 * This buffer is cached on the 2nd Level ARC;
2329 * don't destroy the header.
2330 */
2331 arc_change_state(arc_l2c_only, ab, hash_lock);
2332 mutex_exit(hash_lock);
2333 } else {
2334 arc_change_state(arc_anon, ab, hash_lock);
2335 mutex_exit(hash_lock);
2336 arc_hdr_destroy(ab);
2337 }
2338
2339 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
2340 if (bytes >= 0 && bytes_deleted >= bytes)
2341 break;
2342 } else if (bytes < 0) {
2343 /*
2344 * Insert a list marker and then wait for the
2345			 * hash lock to become available. Once it's
2346 * available, restart from where we left off.
2347 */
2348 list_insert_after(list, ab, &marker);
2349 mutex_exit(&state->arcs_mtx);
2350 mutex_enter(hash_lock);
2351 mutex_exit(hash_lock);
2352 mutex_enter(&state->arcs_mtx);
2353 ab_prev = list_prev(list, &marker);
2354 list_remove(list, &marker);
2355 } else {
2356 bufs_skipped += 1;
2357 }
2358
2359 }
2360 mutex_exit(&state->arcs_mtx);
2361
2362 if (list == &state->arcs_list[ARC_BUFC_DATA] &&
2363 (bytes < 0 || bytes_deleted < bytes)) {
2364 list = &state->arcs_list[ARC_BUFC_METADATA];
2365 goto top;
2366 }
2367
2368 if (bufs_skipped) {
2369 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
2370 ASSERT(bytes >= 0);
2371 }
2372
2373 if (bytes_deleted < bytes)
2374 dprintf("only deleted %lld bytes from %p",
2375 (longlong_t)bytes_deleted, state);
2376 }
2377
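/*
 * Bring the cache back within its targets: trim the MRU list toward arc_p,
 * trim the MFU list until arc_size fits under arc_c, then trim the ghost
 * lists so that mru + mru_ghost and mru_ghost + mfu_ghost each stay within
 * arc_c.
 */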
2378 static void
2379 arc_adjust(void)
2380 {
2381 int64_t adjustment, delta;
2382
2383 /*
2384 * Adjust MRU size
2385 */
2386
2387 adjustment = MIN((int64_t)(arc_size - arc_c),
2388 (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
2389 arc_p));
2390
2391 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
2392 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
2393 (void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA);
2394 adjustment -= delta;
2395 }
2396
2397 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2398 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
2399 (void) arc_evict(arc_mru, NULL, delta, FALSE,
2400 ARC_BUFC_METADATA);
2401 }
2402
2403 /*
2404 * Adjust MFU size
2405 */
2406
2407 adjustment = arc_size - arc_c;
2408
2409 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
2410 delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
2411 (void) arc_evict(arc_mfu, NULL, delta, FALSE, ARC_BUFC_DATA);
2412 adjustment -= delta;
2413 }
2414
2415 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2416 int64_t delta = MIN(adjustment,
2417 arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
2418 (void) arc_evict(arc_mfu, NULL, delta, FALSE,
2419 ARC_BUFC_METADATA);
2420 }
2421
2422 /*
2423 * Adjust ghost lists
2424 */
2425
2426 adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
2427
2428 if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
2429 delta = MIN(arc_mru_ghost->arcs_size, adjustment);
2430 arc_evict_ghost(arc_mru_ghost, NULL, delta);
2431 }
2432
2433 adjustment =
2434 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2435
2436 if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
2437 delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
2438 arc_evict_ghost(arc_mfu_ghost, NULL, delta);
2439 }
2440 }
2441
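/*
 * Run the eviction callbacks for buffers that arc_evict() or
 * arc_hdr_destroy() queued on arc_eviction_list, then free the arc_buf_t
 * structures themselves.
 */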
2442 static void
2443 arc_do_user_evicts(void)
2444 {
2445 mutex_enter(&arc_eviction_mtx);
2446 while (arc_eviction_list != NULL) {
2447 arc_buf_t *buf = arc_eviction_list;
2448 arc_eviction_list = buf->b_next;
2449 mutex_enter(&buf->b_evict_lock);
2450 buf->b_hdr = NULL;
2451 mutex_exit(&buf->b_evict_lock);
2452 mutex_exit(&arc_eviction_mtx);
2453
2454 if (buf->b_efunc != NULL)
2455 VERIFY(buf->b_efunc(buf) == 0);
2456
2457 buf->b_efunc = NULL;
2458 buf->b_private = NULL;
2459 kmem_cache_free(buf_cache, buf);
2460 mutex_enter(&arc_eviction_mtx);
2461 }
2462 mutex_exit(&arc_eviction_mtx);
2463 }
2464
2465 /*
2466 * Flush all *evictable* data from the cache for the given spa.
2467 * NOTE: this will not touch "active" (i.e. referenced) data.
2468 */
2469 void
2470 arc_flush(spa_t *spa)
2471 {
2472 uint64_t guid = 0;
2473
2474 if (spa)
2475 guid = spa_load_guid(spa);
2476
2477 while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
2478 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
2479 if (spa)
2480 break;
2481 }
2482 while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
2483 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
2484 if (spa)
2485 break;
2486 }
2487 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
2488 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
2489 if (spa)
2490 break;
2491 }
2492 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
2493 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
2494 if (spa)
2495 break;
2496 }
2497
2498 arc_evict_ghost(arc_mru_ghost, guid, -1);
2499 arc_evict_ghost(arc_mfu_ghost, guid, -1);
2500
2501 mutex_enter(&arc_reclaim_thr_lock);
2502 arc_do_user_evicts();
2503 mutex_exit(&arc_reclaim_thr_lock);
2504 ASSERT(spa || arc_eviction_list == NULL);
2505 }
2506
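/*
 * Reduce the target cache size (arc_c) and the MRU target (arc_p) in
 * response to memory pressure, then evict whatever is needed to honor the
 * new targets.
 */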
2507 void
2508 arc_shrink(void)
2509 {
2510 if (arc_c > arc_c_min) {
2511 uint64_t to_free;
2512
2513 #ifdef _KERNEL
2514 to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree));
2515 #else
2516 to_free = arc_c >> arc_shrink_shift;
2517 #endif
2518 if (arc_c > arc_c_min + to_free)
2519 atomic_add_64(&arc_c, -to_free);
2520 else
2521 arc_c = arc_c_min;
2522
2523 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
2524 if (arc_c > arc_size)
2525 arc_c = MAX(arc_size, arc_c_min);
2526 if (arc_p > arc_c)
2527 arc_p = (arc_c >> 1);
2528 ASSERT(arc_c >= arc_c_min);
2529 ASSERT((int64_t)arc_p >= 0);
2530 }
2531
2532 if (arc_size > arc_c)
2533 arc_adjust();
2534 }
2535
2536 /*
2537 * Determine if the system is under memory pressure and is asking
2538 * to reclaim memory. A return value of 1 indicates that the system
2539 * is under memory pressure and that the arc should adjust accordingly.
2540 */
2541 static int
2542 arc_reclaim_needed(void)
2543 {
2544 uint64_t extra;
2545
2546 #ifdef _KERNEL
2547
2548 if (needfree)
2549 return (1);
2550
2551 /*
2552 * take 'desfree' extra pages, so we reclaim sooner, rather than later
2553 */
2554 extra = desfree;
2555
2556 /*
2557 * check that we're out of range of the pageout scanner. It starts to
2558	 * schedule paging if freemem is less than lotsfree plus needfree.
2559 * lotsfree is the high-water mark for pageout, and needfree is the
2560 * number of needed free pages. We add extra pages here to make sure
2561 * the scanner doesn't start up while we're freeing memory.
2562 */
2563 if (freemem < lotsfree + needfree + extra)
2564 return (1);
2565
2566 /*
2567 * check to make sure that swapfs has enough space so that anon
2568 * reservations can still succeed. anon_resvmem() checks that the
2569	 * availrmem is greater than swapfs_minfree plus the number of reserved
2570 * swap pages. We also add a bit of extra here just to prevent
2571 * circumstances from getting really dire.
2572 */
2573 if (availrmem < swapfs_minfree + swapfs_reserve + extra)
2574 return (1);
2575
2576 /*
2577 * Check that we have enough availrmem that memory locking (e.g., via
2578 * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum
2579 * stores the number of pages that cannot be locked; when availrmem
2580 * drops below pages_pp_maximum, page locking mechanisms such as
2581 * page_pp_lock() will fail.)
2582 */
2583 if (availrmem <= pages_pp_maximum)
2584 return (1);
2585
2586 #if defined(__i386)
2587 /*
2588 * If we're on an i386 platform, it's possible that we'll exhaust the
2589 * kernel heap space before we ever run out of available physical
2590 * memory. Most checks of the size of the heap_area compare against
2591 * tune.t_minarmem, which is the minimum available real memory that we
2592 * can have in the system. However, this is generally fixed at 25 pages
2593 * which is so low that it's useless. In this comparison, we seek to
2594 * calculate the total heap-size, and reclaim if more than 3/4ths of the
2595 * heap is allocated. (Or, in the calculation, if less than 1/4th is
2596 * free)
2597 */
2598 if (vmem_size(heap_arena, VMEM_FREE) <
2599 (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2))
2600 return (1);
2601 #endif
2602
2603 /*
2604 * If zio data pages are being allocated out of a separate heap segment,
2605 * then enforce that the size of available vmem for this arena remains
2606 * above about 1/16th free.
2607 *
2608 * Note: The 1/16th arena free requirement was put in place
2609 * to aggressively evict memory from the arc in order to avoid
2610 * memory fragmentation issues.
2611 */
2612 if (zio_arena != NULL &&
2613 vmem_size(zio_arena, VMEM_FREE) <
2614 (vmem_size(zio_arena, VMEM_ALLOC) >> 4))
2615 return (1);
2616 #else
2617 if (spa_get_random(100) == 0)
2618 return (1);
2619 #endif
2620 return (0);
2621 }
2622
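/*
 * Return memory to the system: purge DNLC entries if we are over the
 * metadata limit, shrink the cache targets for an aggressive reclaim, and
 * reap the zio buffer caches, the ARC header/buf caches, and the zio
 * arena's quantum caches.
 */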
2623 static void
2624 arc_kmem_reap_now(arc_reclaim_strategy_t strat)
2625 {
2626 size_t i;
2627 kmem_cache_t *prev_cache = NULL;
2628 kmem_cache_t *prev_data_cache = NULL;
2629 extern kmem_cache_t *zio_buf_cache[];
2630 extern kmem_cache_t *zio_data_buf_cache[];
2631
2632 #ifdef _KERNEL
2633 if (arc_meta_used >= arc_meta_limit) {
2634 /*
2635 * We are exceeding our meta-data cache limit.
2636 * Purge some DNLC entries to release holds on meta-data.
2637 */
2638 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
2639 }
2640 #if defined(__i386)
2641 /*
2642 * Reclaim unused memory from all kmem caches.
2643 */
2644 kmem_reap();
2645 #endif
2646 #endif
2647
2648 /*
2649 * An aggressive reclamation will shrink the cache size as well as
2650 * reap free buffers from the arc kmem caches.
2651 */
2652 if (strat == ARC_RECLAIM_AGGR)
2653 arc_shrink();
2654
2655 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
2656 if (zio_buf_cache[i] != prev_cache) {
2657 prev_cache = zio_buf_cache[i];
2658 kmem_cache_reap_now(zio_buf_cache[i]);
2659 }
2660 if (zio_data_buf_cache[i] != prev_data_cache) {
2661 prev_data_cache = zio_data_buf_cache[i];
2662 kmem_cache_reap_now(zio_data_buf_cache[i]);
2663 }
2664 }
2665 kmem_cache_reap_now(buf_cache);
2666 kmem_cache_reap_now(hdr_cache);
2667
2668 /*
2669	 * Ask the vmem arena to reclaim unused memory from its
2670 * quantum caches.
2671 */
2672 if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR)
2673 vmem_qcache_reap(zio_arena);
2674 }
2675
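/*
 * Body of the arc_reclaim thread: roughly once a second (or sooner when
 * signalled) decide whether the system is under memory pressure, reap the
 * kmem caches and disable cache growth if it is, re-enable growth once the
 * pressure has passed, and run arc_adjust() plus any pending user eviction
 * callbacks.
 */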
2676 static void
2677 arc_reclaim_thread(void)
2678 {
2679 clock_t growtime = 0;
2680 arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS;
2681 callb_cpr_t cpr;
2682
2683 CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2684
2685 mutex_enter(&arc_reclaim_thr_lock);
2686 while (arc_thread_exit == 0) {
2687 if (arc_reclaim_needed()) {
2688
2689 if (arc_no_grow) {
2690 if (last_reclaim == ARC_RECLAIM_CONS) {
2691 last_reclaim = ARC_RECLAIM_AGGR;
2692 } else {
2693 last_reclaim = ARC_RECLAIM_CONS;
2694 }
2695 } else {
2696 arc_no_grow = TRUE;
2697 last_reclaim = ARC_RECLAIM_AGGR;
2698 membar_producer();
2699 }
2700
2701 /* reset the growth delay for every reclaim */
2702 growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
2703
2704 arc_kmem_reap_now(last_reclaim);
2705 arc_warm = B_TRUE;
2706
2707 } else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
2708 arc_no_grow = FALSE;
2709 }
2710
2711 arc_adjust();
2712
2713 if (arc_eviction_list != NULL)
2714 arc_do_user_evicts();
2715
2716 /* block until needed, or one second, whichever is shorter */
2717 CALLB_CPR_SAFE_BEGIN(&cpr);
2718 (void) cv_timedwait(&arc_reclaim_thr_cv,
2719 &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
2720 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
2721 }
2722
2723 arc_thread_exit = 0;
2724 cv_broadcast(&arc_reclaim_thr_cv);
2725 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */
2726 thread_exit();
2727 }
2728
2729 /*
2730 * Adapt arc info given the number of bytes we are trying to add and
2731 * the state that we are coming from. This function is only called
2732 * when we are adding new content to the cache.
2733 */
2734 static void
2735 arc_adapt(int bytes, arc_state_t *state)
2736 {
2737 int mult;
2738 uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
2739
2740 if (state == arc_l2c_only)
2741 return;
2742
2743 ASSERT(bytes > 0);
2744 /*
2745 * Adapt the target size of the MRU list:
2746 * - if we just hit in the MRU ghost list, then increase
2747 * the target size of the MRU list.
2748 * - if we just hit in the MFU ghost list, then increase
2749 * the target size of the MFU list by decreasing the
2750 * target size of the MRU list.
2751 */
2752 if (state == arc_mru_ghost) {
2753 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
2754 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
2755 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
2756
2757 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
2758 } else if (state == arc_mfu_ghost) {
2759 uint64_t delta;
2760
2761 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2762 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
2763 mult = MIN(mult, 10);
2764
2765 delta = MIN(bytes * mult, arc_p);
2766 arc_p = MAX(arc_p_min, arc_p - delta);
2767 }
2768 ASSERT((int64_t)arc_p >= 0);
2769
2770 if (arc_reclaim_needed()) {
2771 cv_signal(&arc_reclaim_thr_cv);
2772 return;
2773 }
2774
2775 if (arc_no_grow)
2776 return;
2777
2778 if (arc_c >= arc_c_max)
2779 return;
2780
2781 /*
2782 * If we're within (2 * maxblocksize) bytes of the target
2783 * cache size, increment the target cache size
2784 */
2785 if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2786 atomic_add_64(&arc_c, (int64_t)bytes);
2787 if (arc_c > arc_c_max)
2788 arc_c = arc_c_max;
2789 else if (state == arc_anon)
2790 atomic_add_64(&arc_p, (int64_t)bytes);
2791 if (arc_p > arc_c)
2792 arc_p = arc_c;
2793 }
2794 ASSERT((int64_t)arc_p >= 0);
2795 }
2796
2797 /*
2798 * Check if the cache has reached its limits and eviction is required
2799 * prior to insert.
2800 */
2801 static int
2802 arc_evict_needed(arc_buf_contents_t type)
2803 {
2804 if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2805 return (1);
2806
2807 if (arc_reclaim_needed())
2808 return (1);
2809
2810 return (arc_size > arc_c);
2811 }
2812
2813 /*
2814 * The buffer, supplied as the first argument, needs a data block.
2815 * So, if we are at cache max, determine which cache should be victimized.
2816 * We have the following cases:
2817 *
2818 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
2819 * In this situation if we're out of space, but the resident size of the MFU is
2820 * under the limit, victimize the MFU cache to satisfy this insertion request.
2821 *
2822 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
2823 * Here, we've used up all of the available space for the MRU, so we need to
2824 * evict from our own cache instead. Evict from the set of resident MRU
2825 * entries.
2826 *
2827 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
2828 * c minus p represents the MFU space in the cache, since p is the size of the
2829 * cache that is dedicated to the MRU. In this situation there's still space on
2830 * the MFU side, so the MRU side needs to be victimized.
2831 *
2832 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
2833 * MFU's resident set is consuming more space than it has been allotted. In
2834 * this situation, we must victimize our own cache, the MFU, for this insertion.
2835 */
2836 static void
2837 arc_get_data_buf(arc_buf_t *buf)
2838 {
2839 arc_state_t *state = buf->b_hdr->b_state;
2840 uint64_t size = buf->b_hdr->b_size;
2841 arc_buf_contents_t type = buf->b_hdr->b_type;
2842
2843 arc_adapt(size, state);
2844
2845 /*
2846 * We have not yet reached cache maximum size,
2847 * just allocate a new buffer.
2848 */
2849 if (!arc_evict_needed(type)) {
2850 if (type == ARC_BUFC_METADATA) {
2851 buf->b_data = zio_buf_alloc(size);
2852 arc_space_consume(size, ARC_SPACE_DATA);
2853 } else {
2854 ASSERT(type == ARC_BUFC_DATA);
2855 buf->b_data = zio_data_buf_alloc(size);
2856 ARCSTAT_INCR(arcstat_data_size, size);
2857 atomic_add_64(&arc_size, size);
2858 }
2859 goto out;
2860 }
2861
2862 /*
2863 * If we are prefetching from the mfu ghost list, this buffer
2864 * will end up on the mru list; so steal space from there.
2865 */
2866 if (state == arc_mfu_ghost)
2867 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2868 else if (state == arc_mru_ghost)
2869 state = arc_mru;
2870
2871 if (state == arc_mru || state == arc_anon) {
2872 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2873 state = (arc_mfu->arcs_lsize[type] >= size &&
2874 arc_p > mru_used) ? arc_mfu : arc_mru;
2875 } else {
2876 /* MFU cases */
2877 uint64_t mfu_space = arc_c - arc_p;
2878 state = (arc_mru->arcs_lsize[type] >= size &&
2879 mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2880 }
2881 if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) {
2882 if (type == ARC_BUFC_METADATA) {
2883 buf->b_data = zio_buf_alloc(size);
2884 arc_space_consume(size, ARC_SPACE_DATA);
2885 } else {
2886 ASSERT(type == ARC_BUFC_DATA);
2887 buf->b_data = zio_data_buf_alloc(size);
2888 ARCSTAT_INCR(arcstat_data_size, size);
2889 atomic_add_64(&arc_size, size);
2890 }
2891 ARCSTAT_BUMP(arcstat_recycle_miss);
2892 }
2893 ASSERT(buf->b_data != NULL);
2894 out:
2895 /*
2896 * Update the state size. Note that ghost states have a
2897 * "ghost size" and so don't need to be updated.
2898 */
2899 if (!GHOST_STATE(buf->b_hdr->b_state)) {
2900 arc_buf_hdr_t *hdr = buf->b_hdr;
2901
2902 atomic_add_64(&hdr->b_state->arcs_size, size);
2903 if (list_link_active(&hdr->b_arc_node)) {
2904 ASSERT(refcount_is_zero(&hdr->b_refcnt));
2905 atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2906 }
2907 /*
2908 * If we are growing the cache, and we are adding anonymous
2909 * data, and we have outgrown arc_p, update arc_p
2910 */
2911 if (arc_size < arc_c && hdr->b_state == arc_anon &&
2912 arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
2913 arc_p = MIN(arc_c, arc_p + size);
2914 }
2915 }
2916
2917 /*
2918 * This routine is called whenever a buffer is accessed.
2919 * NOTE: the hash lock is dropped in this function.
2920 */
2921 static void
2922 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
2923 {
2924 clock_t now;
2925
2926 ASSERT(MUTEX_HELD(hash_lock));
2927
2928 if (buf->b_state == arc_anon) {
2929 /*
2930 * This buffer is not in the cache, and does not
2931 * appear in our "ghost" list. Add the new buffer
2932 * to the MRU state.
2933 */
2934
2935 ASSERT(buf->b_arc_access == 0);
2936 buf->b_arc_access = ddi_get_lbolt();
2937 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2938 arc_change_state(arc_mru, buf, hash_lock);
2939
2940 } else if (buf->b_state == arc_mru) {
2941 now = ddi_get_lbolt();
2942
2943 /*
2944 * If this buffer is here because of a prefetch, then either:
2945 * - clear the flag if this is a "referencing" read
2946 * (any subsequent access will bump this into the MFU state).
2947 * or
2948 * - move the buffer to the head of the list if this is
2949 * another prefetch (to make it less likely to be evicted).
2950 */
2951 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2952 if (refcount_count(&buf->b_refcnt) == 0) {
2953 ASSERT(list_link_active(&buf->b_arc_node));
2954 } else {
2955 buf->b_flags &= ~ARC_PREFETCH;
2956 ARCSTAT_BUMP(arcstat_mru_hits);
2957 }
2958 buf->b_arc_access = now;
2959 return;
2960 }
2961
2962 /*
2963 * This buffer has been "accessed" only once so far,
2964 * but it is still in the cache. Move it to the MFU
2965 * state.
2966 */
2967 if (now > buf->b_arc_access + ARC_MINTIME) {
2968 /*
2969 * More than 125ms have passed since we
2970 * instantiated this buffer. Move it to the
2971 * most frequently used state.
2972 */
2973 buf->b_arc_access = now;
2974 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2975 arc_change_state(arc_mfu, buf, hash_lock);
2976 }
2977 ARCSTAT_BUMP(arcstat_mru_hits);
2978 } else if (buf->b_state == arc_mru_ghost) {
2979 arc_state_t *new_state;
2980 /*
2981 * This buffer has been "accessed" recently, but
2982 * was evicted from the cache. Move it to the
2983 * MFU state.
2984 */
2985
2986 if (buf->b_flags & ARC_PREFETCH) {
2987 new_state = arc_mru;
2988 if (refcount_count(&buf->b_refcnt) > 0)
2989 buf->b_flags &= ~ARC_PREFETCH;
2990 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2991 } else {
2992 new_state = arc_mfu;
2993 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2994 }
2995
2996 buf->b_arc_access = ddi_get_lbolt();
2997 arc_change_state(new_state, buf, hash_lock);
2998
2999 ARCSTAT_BUMP(arcstat_mru_ghost_hits);
3000 } else if (buf->b_state == arc_mfu) {
3001 /*
3002 * This buffer has been accessed more than once and is
3003 * still in the cache. Keep it in the MFU state.
3004 *
3005 * NOTE: an add_reference() that occurred when we did
3006 * the arc_read() will have kicked this off the list.
3007 * If it was a prefetch, we will explicitly move it to
3008 * the head of the list now.
3009 */
3010 if ((buf->b_flags & ARC_PREFETCH) != 0) {
3011 ASSERT(refcount_count(&buf->b_refcnt) == 0);
3012 ASSERT(list_link_active(&buf->b_arc_node));
3013 }
3014 ARCSTAT_BUMP(arcstat_mfu_hits);
3015 buf->b_arc_access = ddi_get_lbolt();
3016 } else if (buf->b_state == arc_mfu_ghost) {
3017 arc_state_t *new_state = arc_mfu;
3018 /*
3019 * This buffer has been accessed more than once but has
3020 * been evicted from the cache. Move it back to the
3021 * MFU state.
3022 */
3023
3024 if (buf->b_flags & ARC_PREFETCH) {
3025 /*
3026 * This is a prefetch access...
3027 * move this block back to the MRU state.
3028 */
3029 ASSERT0(refcount_count(&buf->b_refcnt));
3030 new_state = arc_mru;
3031 }
3032
3033 buf->b_arc_access = ddi_get_lbolt();
3034 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
3035 arc_change_state(new_state, buf, hash_lock);
3036
3037 ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
3038 } else if (buf->b_state == arc_l2c_only) {
3039 /*
3040 * This buffer is on the 2nd Level ARC.
3041 */
3042
3043 buf->b_arc_access = ddi_get_lbolt();
3044 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
3045 arc_change_state(arc_mfu, buf, hash_lock);
3046 } else {
3047 ASSERT(!"invalid arc state");
3048 }
3049 }
3050
3051 /* a generic arc_done_func_t which you can use */
3052 /* ARGSUSED */
3053 void
3054 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
3055 {
3056 if (zio == NULL || zio->io_error == 0)
3057 bcopy(buf->b_data, arg, buf->b_hdr->b_size);
3058 VERIFY(arc_buf_remove_ref(buf, arg));
3059 }
3060
3061 /* a generic arc_done_func_t */
3062 void
3063 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
3064 {
3065 arc_buf_t **bufp = arg;
3066 if (zio && zio->io_error) {
3067 VERIFY(arc_buf_remove_ref(buf, arg));
3068 *bufp = NULL;
3069 } else {
3070 *bufp = buf;
3071 ASSERT(buf->b_data);
3072 }
3073 }
3074
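/*
 * I/O completion callback for ARC reads: byteswap the data if required,
 * record its checksum, wake any waiters blocked in arc_read(), hand a
 * buffer to each registered callback (cloning the data for all but the
 * first), and destroy the header if the block was freed while the read was
 * in flight.
 */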
3075 static void
3076 arc_read_done(zio_t *zio)
3077 {
3078 arc_buf_hdr_t *hdr, *found;
3079 arc_buf_t *buf;
3080 arc_buf_t *abuf; /* buffer we're assigning to callback */
3081 kmutex_t *hash_lock;
3082 arc_callback_t *callback_list, *acb;
3083 int freeable = FALSE;
3084
3085 buf = zio->io_private;
3086 hdr = buf->b_hdr;
3087
3088 /*
3089 * The hdr was inserted into hash-table and removed from lists
3090 * prior to starting I/O. We should find this header, since
3091 * it's in the hash table, and it should be legit since it's
3092 * not possible to evict it during the I/O. The only possible
3093 * reason for it not to be found is if we were freed during the
3094 * read.
3095 */
3096 found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
3097 &hash_lock);
3098
3099 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
3100 (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
3101 (found == hdr && HDR_L2_READING(hdr)));
3102
3103 hdr->b_flags &= ~ARC_L2_EVICTED;
3104 if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
3105 hdr->b_flags &= ~ARC_L2CACHE;
3106
3107 /* byteswap if necessary */
3108 callback_list = hdr->b_acb;
3109 ASSERT(callback_list != NULL);
3110 if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
3111 dmu_object_byteswap_t bswap =
3112 DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
3113 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
3114 byteswap_uint64_array :
3115 dmu_ot_byteswap[bswap].ob_func;
3116 func(buf->b_data, hdr->b_size);
3117 }
3118
3119 arc_cksum_compute(buf, B_FALSE);
3120 arc_buf_watch(buf);
3121
3122 if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
3123 /*
3124 * Only call arc_access on anonymous buffers. This is because
3125 * if we've issued an I/O for an evicted buffer, we've already
3126 * called arc_access (to prevent any simultaneous readers from
3127 * getting confused).
3128 */
3129 arc_access(hdr, hash_lock);
3130 }
3131
3132 /* create copies of the data buffer for the callers */
3133 abuf = buf;
3134 for (acb = callback_list; acb; acb = acb->acb_next) {
3135 if (acb->acb_done) {
3136 if (abuf == NULL) {
3137 ARCSTAT_BUMP(arcstat_duplicate_reads);
3138 abuf = arc_buf_clone(buf);
3139 }
3140 acb->acb_buf = abuf;
3141 abuf = NULL;
3142 }
3143 }
3144 hdr->b_acb = NULL;
3145 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3146 ASSERT(!HDR_BUF_AVAILABLE(hdr));
3147 if (abuf == buf) {
3148 ASSERT(buf->b_efunc == NULL);
3149 ASSERT(hdr->b_datacnt == 1);
3150 hdr->b_flags |= ARC_BUF_AVAILABLE;
3151 }
3152
3153 ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
3154
3155 if (zio->io_error != 0) {
3156 hdr->b_flags |= ARC_IO_ERROR;
3157 if (hdr->b_state != arc_anon)
3158 arc_change_state(arc_anon, hdr, hash_lock);
3159 if (HDR_IN_HASH_TABLE(hdr))
3160 buf_hash_remove(hdr);
3161 freeable = refcount_is_zero(&hdr->b_refcnt);
3162 }
3163
3164 /*
3165 * Broadcast before we drop the hash_lock to avoid the possibility
3166 * that the hdr (and hence the cv) might be freed before we get to
3167 * the cv_broadcast().
3168 */
3169 cv_broadcast(&hdr->b_cv);
3170
3171 if (hash_lock) {
3172 mutex_exit(hash_lock);
3173 } else {
3174 /*
3175 * This block was freed while we waited for the read to
3176 * complete. It has been removed from the hash table and
3177 * moved to the anonymous state (so that it won't show up
3178 * in the cache).
3179 */
3180 ASSERT3P(hdr->b_state, ==, arc_anon);
3181 freeable = refcount_is_zero(&hdr->b_refcnt);
3182 }
3183
3184 /* execute each callback and free its structure */
3185 while ((acb = callback_list) != NULL) {
3186 if (acb->acb_done)
3187 acb->acb_done(zio, acb->acb_buf, acb->acb_private);
3188
3189 if (acb->acb_zio_dummy != NULL) {
3190 acb->acb_zio_dummy->io_error = zio->io_error;
3191 zio_nowait(acb->acb_zio_dummy);
3192 }
3193
3194 callback_list = acb->acb_next;
3195 kmem_free(acb, sizeof (arc_callback_t));
3196 }
3197
3198 if (freeable)
3199 arc_hdr_destroy(hdr);
3200 }
3201
3202 /*
3203 * "Read" the block at the specified DVA (in bp) via the
3204 * cache. If the block is found in the cache, invoke the provided
3205 * callback immediately and return. Note that the `zio' parameter
3206 * in the callback will be NULL in this case, since no IO was
3207 * required. If the block is not in the cache pass the read request
3208 * on to the spa with a substitute callback function, so that the
3209 * requested block will be added to the cache.
3210 *
3211 * If a read request arrives for a block that has a read in-progress,
3212 * either wait for the in-progress read to complete (and return the
3213 * results); or, if this is a read with a "done" func, add a record
3214 * to the read to invoke the "done" func when the read completes,
3215 * and return; or just return.
3216 *
3217 * arc_read_done() will invoke all the requested "done" functions
3218 * for readers of this block.
3219 */
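/*
 * A minimal synchronous-read sketch (hypothetical caller; error paths and
 * bookmark setup omitted):
 *
 *	arc_buf_t *abuf = NULL;
 *	uint32_t aflags = ARC_WAIT;
 *
 *	if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
 *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb) == 0 &&
 *	    abuf != NULL) {
 *		... use abuf->b_data ...
 *		(void) arc_buf_remove_ref(abuf, &abuf);
 *	}
 */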
3220 int
3221 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
3222 void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags,
3223 const zbookmark_t *zb)
3224 {
3225 arc_buf_hdr_t *hdr;
3226 arc_buf_t *buf = NULL;
3227 kmutex_t *hash_lock;
3228 zio_t *rzio;
3229 uint64_t guid = spa_load_guid(spa);
3230
3231 top:
3232 hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
3233 &hash_lock);
3234 if (hdr && hdr->b_datacnt > 0) {
3235
3236 *arc_flags |= ARC_CACHED;
3237
3238 if (HDR_IO_IN_PROGRESS(hdr)) {
3239
3240 if (*arc_flags & ARC_WAIT) {
3241 cv_wait(&hdr->b_cv, hash_lock);
3242 mutex_exit(hash_lock);
3243 goto top;
3244 }
3245 ASSERT(*arc_flags & ARC_NOWAIT);
3246
3247 if (done) {
3248 arc_callback_t *acb = NULL;
3249
3250 acb = kmem_zalloc(sizeof (arc_callback_t),
3251 KM_SLEEP);
3252 acb->acb_done = done;
3253 acb->acb_private = private;
3254 if (pio != NULL)
3255 acb->acb_zio_dummy = zio_null(pio,
3256 spa, NULL, NULL, NULL, zio_flags);
3257
3258 ASSERT(acb->acb_done != NULL);
3259 acb->acb_next = hdr->b_acb;
3260 hdr->b_acb = acb;
3261 add_reference(hdr, hash_lock, private);
3262 mutex_exit(hash_lock);
3263 return (0);
3264 }
3265 mutex_exit(hash_lock);
3266 return (0);
3267 }
3268
3269 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3270
3271 if (done) {
3272 add_reference(hdr, hash_lock, private);
3273 /*
3274 * If this block is already in use, create a new
3275 * copy of the data so that we will be guaranteed
3276 * that arc_release() will always succeed.
3277 */
3278 buf = hdr->b_buf;
3279 ASSERT(buf);
3280 ASSERT(buf->b_data);
3281 if (HDR_BUF_AVAILABLE(hdr)) {
3282 ASSERT(buf->b_efunc == NULL);
3283 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3284 } else {
3285 buf = arc_buf_clone(buf);
3286 }
3287
3288 } else if (*arc_flags & ARC_PREFETCH &&
3289 refcount_count(&hdr->b_refcnt) == 0) {
3290 hdr->b_flags |= ARC_PREFETCH;
3291 }
3292 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
3293 arc_access(hdr, hash_lock);
3294 if (*arc_flags & ARC_L2CACHE)
3295 hdr->b_flags |= ARC_L2CACHE;
3296 if (*arc_flags & ARC_L2COMPRESS)
3297 hdr->b_flags |= ARC_L2COMPRESS;
3298 mutex_exit(hash_lock);
3299 ARCSTAT_BUMP(arcstat_hits);
3300 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3301 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3302 data, metadata, hits);
3303
3304 if (done)
3305 done(NULL, buf, private);
3306 } else {
3307 uint64_t size = BP_GET_LSIZE(bp);
3308 arc_callback_t *acb;
3309 vdev_t *vd = NULL;
3310 uint64_t addr = 0;
3311 boolean_t devw = B_FALSE;
3312 enum zio_compress b_compress = ZIO_COMPRESS_OFF;
3313 uint64_t b_asize = 0;
3314
3315 if (hdr == NULL) {
3316 /* this block is not in the cache */
3317 arc_buf_hdr_t *exists;
3318 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
3319 buf = arc_buf_alloc(spa, size, private, type);
3320 hdr = buf->b_hdr;
3321 hdr->b_dva = *BP_IDENTITY(bp);
3322 hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
3323 hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
3324 exists = buf_hash_insert(hdr, &hash_lock);
3325 if (exists) {
3326 /* somebody beat us to the hash insert */
3327 mutex_exit(hash_lock);
3328 buf_discard_identity(hdr);
3329 (void) arc_buf_remove_ref(buf, private);
3330 goto top; /* restart the IO request */
3331 }
3332 /* if this is a prefetch, we don't have a reference */
3333 if (*arc_flags & ARC_PREFETCH) {
3334 (void) remove_reference(hdr, hash_lock,
3335 private);
3336 hdr->b_flags |= ARC_PREFETCH;
3337 }
3338 if (*arc_flags & ARC_L2CACHE)
3339 hdr->b_flags |= ARC_L2CACHE;
3340 if (*arc_flags & ARC_L2COMPRESS)
3341 hdr->b_flags |= ARC_L2COMPRESS;
3342 if (BP_GET_LEVEL(bp) > 0)
3343 hdr->b_flags |= ARC_INDIRECT;
3344 } else {
3345 /* this block is in the ghost cache */
3346 ASSERT(GHOST_STATE(hdr->b_state));
3347 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3348 ASSERT0(refcount_count(&hdr->b_refcnt));
3349 ASSERT(hdr->b_buf == NULL);
3350
3351 /* if this is a prefetch, we don't have a reference */
3352 if (*arc_flags & ARC_PREFETCH)
3353 hdr->b_flags |= ARC_PREFETCH;
3354 else
3355 add_reference(hdr, hash_lock, private);
3356 if (*arc_flags & ARC_L2CACHE)
3357 hdr->b_flags |= ARC_L2CACHE;
3358 if (*arc_flags & ARC_L2COMPRESS)
3359 hdr->b_flags |= ARC_L2COMPRESS;
3360 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
3361 buf->b_hdr = hdr;
3362 buf->b_data = NULL;
3363 buf->b_efunc = NULL;
3364 buf->b_private = NULL;
3365 buf->b_next = NULL;
3366 hdr->b_buf = buf;
3367 ASSERT(hdr->b_datacnt == 0);
3368 hdr->b_datacnt = 1;
3369 arc_get_data_buf(buf);
3370 arc_access(hdr, hash_lock);
3371 }
3372
3373 ASSERT(!GHOST_STATE(hdr->b_state));
3374
3375 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
3376 acb->acb_done = done;
3377 acb->acb_private = private;
3378
3379 ASSERT(hdr->b_acb == NULL);
3380 hdr->b_acb = acb;
3381 hdr->b_flags |= ARC_IO_IN_PROGRESS;
3382
3383 if (hdr->b_l2hdr != NULL &&
3384 (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
3385 /*
3386 * Need to stash these before letting go of hash_lock
3387 */
3388 devw = hdr->b_l2hdr->b_dev->l2ad_writing;
3389 addr = hdr->b_l2hdr->b_daddr;
3390 b_compress = hdr->b_l2hdr->b_compress;
3391 b_asize = hdr->b_l2hdr->b_asize;
3392 /*
3393 * Lock out device removal.
3394 */
3395 if (vdev_is_dead(vd) ||
3396 !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
3397 vd = NULL;
3398 }
3399
3400 mutex_exit(hash_lock);
3401
3402 /*
3403 * At this point, we have a level 1 cache miss. Try again in
3404 * L2ARC if possible.
3405 */
3406 ASSERT3U(hdr->b_size, ==, size);
3407 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
3408 uint64_t, size, zbookmark_t *, zb);
3409 ARCSTAT_BUMP(arcstat_misses);
3410 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3411 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3412 data, metadata, misses);
3413
3414 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
3415 /*
3416 * Read from the L2ARC if the following are true:
3417 * 1. The L2ARC vdev was previously cached.
3418 * 2. This buffer still has L2ARC metadata.
3419 * 3. This buffer isn't currently writing to the L2ARC.
3420 * 4. The L2ARC entry wasn't evicted, which may
3421 * also have invalidated the vdev.
3422			 * 5. This isn't a prefetch while l2arc_noprefetch is set.
3423 */
3424 if (hdr->b_l2hdr != NULL &&
3425 !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
3426 !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
3427 l2arc_read_callback_t *cb;
3428
3429 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
3430 ARCSTAT_BUMP(arcstat_l2_hits);
3431
3432 cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
3433 KM_SLEEP);
3434 cb->l2rcb_buf = buf;
3435 cb->l2rcb_spa = spa;
3436 cb->l2rcb_bp = *bp;
3437 cb->l2rcb_zb = *zb;
3438 cb->l2rcb_flags = zio_flags;
3439 cb->l2rcb_compress = b_compress;
3440
3441 ASSERT(addr >= VDEV_LABEL_START_SIZE &&
3442 addr + size < vd->vdev_psize -
3443 VDEV_LABEL_END_SIZE);
3444
3445 /*
3446 * l2arc read. The SCL_L2ARC lock will be
3447 * released by l2arc_read_done().
3448 * Issue a null zio if the underlying buffer
3449 * was squashed to zero size by compression.
3450 */
3451 if (b_compress == ZIO_COMPRESS_EMPTY) {
3452 rzio = zio_null(pio, spa, vd,
3453 l2arc_read_done, cb,
3454 zio_flags | ZIO_FLAG_DONT_CACHE |
3455 ZIO_FLAG_CANFAIL |
3456 ZIO_FLAG_DONT_PROPAGATE |
3457 ZIO_FLAG_DONT_RETRY);
3458 } else {
3459 rzio = zio_read_phys(pio, vd, addr,
3460 b_asize, buf->b_data,
3461 ZIO_CHECKSUM_OFF,
3462 l2arc_read_done, cb, priority,
3463 zio_flags | ZIO_FLAG_DONT_CACHE |
3464 ZIO_FLAG_CANFAIL |
3465 ZIO_FLAG_DONT_PROPAGATE |
3466 ZIO_FLAG_DONT_RETRY, B_FALSE);
3467 }
3468 DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
3469 zio_t *, rzio);
3470 ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize);
3471
3472 if (*arc_flags & ARC_NOWAIT) {
3473 zio_nowait(rzio);
3474 return (0);
3475 }
3476
3477 ASSERT(*arc_flags & ARC_WAIT);
3478 if (zio_wait(rzio) == 0)
3479 return (0);
3480
3481 /* l2arc read error; fall through to zio_read() below */
3482 } else {
3483 DTRACE_PROBE1(l2arc__miss,
3484 arc_buf_hdr_t *, hdr);
3485 ARCSTAT_BUMP(arcstat_l2_misses);
3486 if (HDR_L2_WRITING(hdr))
3487 ARCSTAT_BUMP(arcstat_l2_rw_clash);
3488 spa_config_exit(spa, SCL_L2ARC, vd);
3489 }
3490 } else {
3491 if (vd != NULL)
3492 spa_config_exit(spa, SCL_L2ARC, vd);
3493 if (l2arc_ndev != 0) {
3494 DTRACE_PROBE1(l2arc__miss,
3495 arc_buf_hdr_t *, hdr);
3496 ARCSTAT_BUMP(arcstat_l2_misses);
3497 }
3498 }
3499
3500 rzio = zio_read(pio, spa, bp, buf->b_data, size,
3501 arc_read_done, buf, priority, zio_flags, zb);
3502
3503 if (*arc_flags & ARC_WAIT)
3504 return (zio_wait(rzio));
3505
3506 ASSERT(*arc_flags & ARC_NOWAIT);
3507 zio_nowait(rzio);
3508 }
3509 return (0);
3510 }
3511
3512 void
3513 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
3514 {
3515 ASSERT(buf->b_hdr != NULL);
3516 ASSERT(buf->b_hdr->b_state != arc_anon);
3517 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
3518 ASSERT(buf->b_efunc == NULL);
3519 ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
3520
3521 buf->b_efunc = func;
3522 buf->b_private = private;
3523 }
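
/*
 * Illustrative sketch (not compiled) of registering an eviction callback
 * with arc_set_callback().  The callback and cookie names below are
 * hypothetical; the only assumptions taken from this file are that
 * b_efunc is invoked with the arc_buf_t itself and must return 0 (see the
 * VERIFYs in arc_buf_evict()), and that the registered cookie is
 * reachable through buf->b_private:
 *
 *	static int
 *	my_evict_cb(void *arg)
 *	{
 *		arc_buf_t *abuf = arg;
 *		my_cookie_t *mc = abuf->b_private;
 *
 *		mc->mc_buf = NULL;	(forget the soon-to-be-freed buffer)
 *		return (0);
 *	}
 *
 *	arc_set_callback(buf, my_evict_cb, mc);
 */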
3524
3525 /*
3526 * Notify the arc that a block was freed, and thus will never be used again.
3527 */
3528 void
3529 arc_freed(spa_t *spa, const blkptr_t *bp)
3530 {
3531 arc_buf_hdr_t *hdr;
3532 kmutex_t *hash_lock;
3533 uint64_t guid = spa_load_guid(spa);
3534
3535 hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
3536 &hash_lock);
3537 if (hdr == NULL)
3538 return;
3539 if (HDR_BUF_AVAILABLE(hdr)) {
3540 arc_buf_t *buf = hdr->b_buf;
3541 add_reference(hdr, hash_lock, FTAG);
3542 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3543 mutex_exit(hash_lock);
3544
3545 arc_release(buf, FTAG);
3546 (void) arc_buf_remove_ref(buf, FTAG);
3547 } else {
3548 mutex_exit(hash_lock);
3549 }
3550
3551 }
3552
3553 /*
3554 * This is used by the DMU to let the ARC know that a buffer is
3555 * being evicted, so the ARC should clean up. If this arc buf
3556 * is not yet in the evicted state, it will be put there.
3557 */
3558 int
3559 arc_buf_evict(arc_buf_t *buf)
3560 {
3561 arc_buf_hdr_t *hdr;
3562 kmutex_t *hash_lock;
3563 arc_buf_t **bufp;
3564
3565 mutex_enter(&buf->b_evict_lock);
3566 hdr = buf->b_hdr;
3567 if (hdr == NULL) {
3568 /*
3569 * We are in arc_do_user_evicts().
3570 */
3571 ASSERT(buf->b_data == NULL);
3572 mutex_exit(&buf->b_evict_lock);
3573 return (0);
3574 } else if (buf->b_data == NULL) {
3575 arc_buf_t copy = *buf; /* structure assignment */
3576 /*
3577 * We are on the eviction list; process this buffer now
3578 * but let arc_do_user_evicts() do the reaping.
3579 */
3580 buf->b_efunc = NULL;
3581 mutex_exit(&buf->b_evict_lock);
3582 VERIFY(copy.b_efunc(&copy) == 0);
3583 return (1);
3584 }
3585 hash_lock = HDR_LOCK(hdr);
3586 mutex_enter(hash_lock);
3587 hdr = buf->b_hdr;
3588 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3589
3590 ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
3591 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3592
3593 /*
3594 * Pull this buffer off of the hdr
3595 */
3596 bufp = &hdr->b_buf;
3597 while (*bufp != buf)
3598 bufp = &(*bufp)->b_next;
3599 *bufp = buf->b_next;
3600
3601 ASSERT(buf->b_data != NULL);
3602 arc_buf_destroy(buf, FALSE, FALSE);
3603
3604 if (hdr->b_datacnt == 0) {
3605 arc_state_t *old_state = hdr->b_state;
3606 arc_state_t *evicted_state;
3607
3608 ASSERT(hdr->b_buf == NULL);
3609 ASSERT(refcount_is_zero(&hdr->b_refcnt));
3610
3611 evicted_state =
3612 (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
3613
3614 mutex_enter(&old_state->arcs_mtx);
3615 mutex_enter(&evicted_state->arcs_mtx);
3616
3617 arc_change_state(evicted_state, hdr, hash_lock);
3618 ASSERT(HDR_IN_HASH_TABLE(hdr));
3619 hdr->b_flags |= ARC_IN_HASH_TABLE;
3620 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3621
3622 mutex_exit(&evicted_state->arcs_mtx);
3623 mutex_exit(&old_state->arcs_mtx);
3624 }
3625 mutex_exit(hash_lock);
3626 mutex_exit(&buf->b_evict_lock);
3627
3628 VERIFY(buf->b_efunc(buf) == 0);
3629 buf->b_efunc = NULL;
3630 buf->b_private = NULL;
3631 buf->b_hdr = NULL;
3632 buf->b_next = NULL;
3633 kmem_cache_free(buf_cache, buf);
3634 return (1);
3635 }
3636
3637 /*
3638 * Release this buffer from the cache, making it an anonymous buffer. This
3639 * must be done after a read and prior to modifying the buffer contents.
3640 * If the buffer has more than one reference, we must make
3641 * a new hdr for the buffer.
3642 */
3643 void
3644 arc_release(arc_buf_t *buf, void *tag)
3645 {
3646 arc_buf_hdr_t *hdr;
3647 kmutex_t *hash_lock = NULL;
3648 l2arc_buf_hdr_t *l2hdr;
3649 uint64_t buf_size;
3650
3651 /*
3652 * It would be nice to assert that if it's DMU metadata (level >
3653 * 0 || it's the dnode file), then it must be syncing context.
3654 * But we don't know that information at this level.
3655 */
3656
3657 mutex_enter(&buf->b_evict_lock);
3658 hdr = buf->b_hdr;
3659
3660 /* this buffer is not on any list */
3661 ASSERT(refcount_count(&hdr->b_refcnt) > 0);
3662
3663 if (hdr->b_state == arc_anon) {
3664 /* this buffer is already released */
3665 ASSERT(buf->b_efunc == NULL);
3666 } else {
3667 hash_lock = HDR_LOCK(hdr);
3668 mutex_enter(hash_lock);
3669 hdr = buf->b_hdr;
3670 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3671 }
3672
3673 l2hdr = hdr->b_l2hdr;
3674 if (l2hdr) {
3675 mutex_enter(&l2arc_buflist_mtx);
3676 hdr->b_l2hdr = NULL;
3677 list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
3678 }
3679 buf_size = hdr->b_size;
3680
3681 /*
3682 * Do we have more than one buf?
3683 */
3684 if (hdr->b_datacnt > 1) {
3685 arc_buf_hdr_t *nhdr;
3686 arc_buf_t **bufp;
3687 uint64_t blksz = hdr->b_size;
3688 uint64_t spa = hdr->b_spa;
3689 arc_buf_contents_t type = hdr->b_type;
3690 uint32_t flags = hdr->b_flags;
3691
3692 ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
3693 /*
3694 * Pull the data off of this hdr and attach it to
3695 * a new anonymous hdr.
3696 */
3697 (void) remove_reference(hdr, hash_lock, tag);
3698 bufp = &hdr->b_buf;
3699 while (*bufp != buf)
3700 bufp = &(*bufp)->b_next;
3701 *bufp = buf->b_next;
3702 buf->b_next = NULL;
3703
3704 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
3705 atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
3706 if (refcount_is_zero(&hdr->b_refcnt)) {
3707 uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
3708 ASSERT3U(*size, >=, hdr->b_size);
3709 atomic_add_64(size, -hdr->b_size);
3710 }
3711
3712 /*
3713 * We're releasing a duplicate user data buffer, update
3714 * our statistics accordingly.
3715 */
3716 if (hdr->b_type == ARC_BUFC_DATA) {
3717 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
3718 ARCSTAT_INCR(arcstat_duplicate_buffers_size,
3719 -hdr->b_size);
3720 }
3721 hdr->b_datacnt -= 1;
3722 arc_cksum_verify(buf);
3723 arc_buf_unwatch(buf);
3724
3725 mutex_exit(hash_lock);
3726
3727 nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
3728 nhdr->b_size = blksz;
3729 nhdr->b_spa = spa;
3730 nhdr->b_type = type;
3731 nhdr->b_buf = buf;
3732 nhdr->b_state = arc_anon;
3733 nhdr->b_arc_access = 0;
3734 nhdr->b_flags = flags & ARC_L2_WRITING;
3735 nhdr->b_l2hdr = NULL;
3736 nhdr->b_datacnt = 1;
3737 nhdr->b_freeze_cksum = NULL;
3738 (void) refcount_add(&nhdr->b_refcnt, tag);
3739 buf->b_hdr = nhdr;
3740 mutex_exit(&buf->b_evict_lock);
3741 atomic_add_64(&arc_anon->arcs_size, blksz);
3742 } else {
3743 mutex_exit(&buf->b_evict_lock);
3744 ASSERT(refcount_count(&hdr->b_refcnt) == 1);
3745 ASSERT(!list_link_active(&hdr->b_arc_node));
3746 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3747 if (hdr->b_state != arc_anon)
3748 arc_change_state(arc_anon, hdr, hash_lock);
3749 hdr->b_arc_access = 0;
3750 if (hash_lock)
3751 mutex_exit(hash_lock);
3752
3753 buf_discard_identity(hdr);
3754 arc_buf_thaw(buf);
3755 }
3756 buf->b_efunc = NULL;
3757 buf->b_private = NULL;
3758
3759 if (l2hdr) {
3760 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
3761 kmem_free(l2hdr, sizeof (*l2hdr));
3762 ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3763 mutex_exit(&l2arc_buflist_mtx);
3764 }
3765 }
3766
3767 int
3768 arc_released(arc_buf_t *buf)
3769 {
3770 int released;
3771
3772 mutex_enter(&buf->b_evict_lock);
3773 released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
3774 mutex_exit(&buf->b_evict_lock);
3775 return (released);
3776 }
3777
3778 int
3779 arc_has_callback(arc_buf_t *buf)
3780 {
3781 int callback;
3782
3783 mutex_enter(&buf->b_evict_lock);
3784 callback = (buf->b_efunc != NULL);
3785 mutex_exit(&buf->b_evict_lock);
3786 return (callback);
3787 }
3788
3789 #ifdef ZFS_DEBUG
3790 int
3791 arc_referenced(arc_buf_t *buf)
3792 {
3793 int referenced;
3794
3795 mutex_enter(&buf->b_evict_lock);
3796 referenced = (refcount_count(&buf->b_hdr->b_refcnt));
3797 mutex_exit(&buf->b_evict_lock);
3798 return (referenced);
3799 }
3800 #endif
3801
3802 static void
3803 arc_write_ready(zio_t *zio)
3804 {
3805 arc_write_callback_t *callback = zio->io_private;
3806 arc_buf_t *buf = callback->awcb_buf;
3807 arc_buf_hdr_t *hdr = buf->b_hdr;
3808
3809 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
3810 callback->awcb_ready(zio, buf, callback->awcb_private);
3811
3812 /*
3813 * If the IO is already in progress, then this is a re-write
3814 * attempt, so we need to thaw and re-compute the cksum.
3815 * It is the responsibility of the callback to handle the
3816 * accounting for any re-write attempt.
3817 */
3818 if (HDR_IO_IN_PROGRESS(hdr)) {
3819 mutex_enter(&hdr->b_freeze_lock);
3820 if (hdr->b_freeze_cksum != NULL) {
3821 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3822 hdr->b_freeze_cksum = NULL;
3823 }
3824 mutex_exit(&hdr->b_freeze_lock);
3825 }
3826 arc_cksum_compute(buf, B_FALSE);
3827 hdr->b_flags |= ARC_IO_IN_PROGRESS;
3828 }
3829
3830 /*
3831 * The SPA calls this callback for each physical write that happens on behalf
3832 * of a logical write. See the comment in dbuf_write_physdone() for details.
3833 */
3834 static void
3835 arc_write_physdone(zio_t *zio)
3836 {
3837 arc_write_callback_t *cb = zio->io_private;
3838 if (cb->awcb_physdone != NULL)
3839 cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
3840 }
3841
3842 static void
3843 arc_write_done(zio_t *zio)
3844 {
3845 arc_write_callback_t *callback = zio->io_private;
3846 arc_buf_t *buf = callback->awcb_buf;
3847 arc_buf_hdr_t *hdr = buf->b_hdr;
3848
3849 ASSERT(hdr->b_acb == NULL);
3850
3851 if (zio->io_error == 0) {
3852 hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3853 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
3854 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3855 } else {
3856 ASSERT(BUF_EMPTY(hdr));
3857 }
3858
3859 /*
3860 * If the block to be written was all-zero, we may have
3861 * compressed it away. In this case no write was performed
3862 * so there will be no dva/birth/checksum. The buffer must
3863 * therefore remain anonymous (and uncached).
3864 */
3865 if (!BUF_EMPTY(hdr)) {
3866 arc_buf_hdr_t *exists;
3867 kmutex_t *hash_lock;
3868
3869 ASSERT(zio->io_error == 0);
3870
3871 arc_cksum_verify(buf);
3872
3873 exists = buf_hash_insert(hdr, &hash_lock);
3874 if (exists) {
3875 /*
3876 * This can only happen if we overwrite for
3877 * sync-to-convergence, because we remove
3878 * buffers from the hash table when we arc_free().
3879 */
3880 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
3881 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3882 panic("bad overwrite, hdr=%p exists=%p",
3883 (void *)hdr, (void *)exists);
3884 ASSERT(refcount_is_zero(&exists->b_refcnt));
3885 arc_change_state(arc_anon, exists, hash_lock);
3886 mutex_exit(hash_lock);
3887 arc_hdr_destroy(exists);
3888 exists = buf_hash_insert(hdr, &hash_lock);
3889 ASSERT3P(exists, ==, NULL);
3890 } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
3891 /* nopwrite */
3892 ASSERT(zio->io_prop.zp_nopwrite);
3893 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3894 panic("bad nopwrite, hdr=%p exists=%p",
3895 (void *)hdr, (void *)exists);
3896 } else {
3897 /* Dedup */
3898 ASSERT(hdr->b_datacnt == 1);
3899 ASSERT(hdr->b_state == arc_anon);
3900 ASSERT(BP_GET_DEDUP(zio->io_bp));
3901 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
3902 }
3903 }
3904 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3905 /* if it's not anon, we are doing a scrub */
3906 if (!exists && hdr->b_state == arc_anon)
3907 arc_access(hdr, hash_lock);
3908 mutex_exit(hash_lock);
3909 } else {
3910 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3911 }
3912
3913 ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3914 callback->awcb_done(zio, buf, callback->awcb_private);
3915
3916 kmem_free(callback, sizeof (arc_write_callback_t));
3917 }
3918
3919 zio_t *
3920 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3921 blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
3922 const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
3923 arc_done_func_t *done, void *private, zio_priority_t priority,
3924 int zio_flags, const zbookmark_t *zb)
3925 {
3926 arc_buf_hdr_t *hdr = buf->b_hdr;
3927 arc_write_callback_t *callback;
3928 zio_t *zio;
3929
3930 ASSERT(ready != NULL);
3931 ASSERT(done != NULL);
3932 ASSERT(!HDR_IO_ERROR(hdr));
3933 ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3934 ASSERT(hdr->b_acb == NULL);
3935 if (l2arc)
3936 hdr->b_flags |= ARC_L2CACHE;
3937 if (l2arc_compress)
3938 hdr->b_flags |= ARC_L2COMPRESS;
3939 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
3940 callback->awcb_ready = ready;
3941 callback->awcb_physdone = physdone;
3942 callback->awcb_done = done;
3943 callback->awcb_private = private;
3944 callback->awcb_buf = buf;
3945
3946 zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
3947 arc_write_ready, arc_write_physdone, arc_write_done, callback,
3948 priority, zio_flags, zb);
3949
3950 return (zio);
3951 }
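
/*
 * Illustrative sketch (not compiled) of the typical call shape for
 * arc_write().  The callback names, the 'db' cookie and the particular
 * priority/flag choices are hypothetical; only the argument order comes
 * from the definition above:
 *
 *	zio = arc_write(pio, spa, txg, bp, buf,
 *	    B_TRUE,		(eligible for L2ARC caching)
 *	    B_FALSE,		(no L2ARC compression)
 *	    &zp, my_write_ready, my_write_physdone, my_write_done, db,
 *	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, &zb);
 *	zio_nowait(zio);	(or zio_wait(zio) for a synchronous caller)
 */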
3952
3953 static int
3954 arc_memory_throttle(uint64_t reserve, uint64_t txg)
3955 {
3956 #ifdef _KERNEL
3957 uint64_t available_memory = ptob(freemem);
3958 static uint64_t page_load = 0;
3959 static uint64_t last_txg = 0;
3960
3961 #if defined(__i386)
3962 available_memory =
3963 MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
3964 #endif
3965
3966 if (freemem > physmem * arc_lotsfree_percent / 100)
3967 return (0);
3968
3969 if (txg > last_txg) {
3970 last_txg = txg;
3971 page_load = 0;
3972 }
3973 /*
3974 * If we are in pageout, we know that memory is already tight,
3975 * the arc is already going to be evicting, so we just want to
3976 * continue to let page writes occur as quickly as possible.
3977 */
3978 if (curproc == proc_pageout) {
3979 if (page_load > MAX(ptob(minfree), available_memory) / 4)
3980 return (SET_ERROR(ERESTART));
3981 /* Note: reserve is inflated, so we deflate */
3982 page_load += reserve / 8;
3983 return (0);
3984 } else if (page_load > 0 && arc_reclaim_needed()) {
3985 /* memory is low, delay before restarting */
3986 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3987 return (SET_ERROR(EAGAIN));
3988 }
3989 page_load = 0;
3990 #endif
3991 return (0);
3992 }
3993
3994 void
3995 arc_tempreserve_clear(uint64_t reserve)
3996 {
3997 atomic_add_64(&arc_tempreserve, -reserve);
3998 ASSERT((int64_t)arc_tempreserve >= 0);
3999 }
4000
4001 int
4002 arc_tempreserve_space(uint64_t reserve, uint64_t txg)
4003 {
4004 int error;
4005 uint64_t anon_size;
4006
4007 if (reserve > arc_c/4 && !arc_no_grow)
4008 arc_c = MIN(arc_c_max, reserve * 4);
4009 if (reserve > arc_c)
4010 return (SET_ERROR(ENOMEM));
4011
4012 /*
4013 * Don't count loaned bufs as in flight dirty data to prevent long
4014 * network delays from blocking transactions that are ready to be
4015 * assigned to a txg.
4016 */
4017 anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
4018
4019 /*
4020 * Writes will, almost always, require additional memory allocations
4021 * in order to compress/encrypt/etc the data. We therefore need to
4022 * make sure that there is sufficient available memory for this.
4023 */
4024 error = arc_memory_throttle(reserve, txg);
4025 if (error != 0)
4026 return (error);
4027
4028 /*
4029 * Throttle writes when the amount of dirty data in the cache
4030 * gets too large. We try to keep the cache less than half full
4031 * of dirty blocks so that our sync times don't grow too large.
4032 * Note: if two requests come in concurrently, we might let them
4033 * both succeed, when one of them should fail. Not a huge deal.
4034 */
4035
4036 if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
4037 anon_size > arc_c / 4) {
4038 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
4039 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
4040 arc_tempreserve>>10,
4041 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
4042 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
4043 reserve>>10, arc_c>>10);
4044 return (SET_ERROR(ERESTART));
4045 }
4046 atomic_add_64(&arc_tempreserve, reserve);
4047 return (0);
4048 }
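
/*
 * Illustrative sketch (not compiled) of the reserve/clear pairing around
 * dirtying data; the 'dirty_bytes' and 'txg' names are hypothetical:
 *
 *	error = arc_tempreserve_space(dirty_bytes, txg);
 *	if (error != 0)
 *		return (error);	(ENOMEM, ERESTART or EAGAIN: back off, retry)
 *	(generate the dirty data)
 *	arc_tempreserve_clear(dirty_bytes);
 */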
4049
4050 void
4051 arc_init(void)
4052 {
4053 mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
4054 cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
4055
4056 /* Convert seconds to clock ticks */
4057 arc_min_prefetch_lifespan = 1 * hz;
4058
4059 /* Start out with 1/8 of all memory */
4060 arc_c = physmem * PAGESIZE / 8;
4061
4062 #ifdef _KERNEL
4063 /*
4064 * On architectures where the physical memory can be larger
4065 * than the addressable space (Intel in 32-bit mode), we may
4066 * need to limit the cache to 1/8 of VM size.
4067 */
4068 arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
4069 #endif
4070
4071 /* set min cache to 1/32 of all memory, or 64MB, whichever is more */
4072 arc_c_min = MAX(arc_c / 4, 64<<20);
4073 /* set max to 3/4 of all memory, or all but 1GB, whichever is more */
4074 if (arc_c * 8 >= 1<<30)
4075 arc_c_max = (arc_c * 8) - (1<<30);
4076 else
4077 arc_c_max = arc_c_min;
4078 arc_c_max = MAX(arc_c * 6, arc_c_max);
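
/*
 * Worked example (illustrative, assuming 8 GB of physical memory and
 * ignoring the 32-bit clamp above): arc_c starts at 8 GB / 8 = 1 GB, so
 * arc_c_min = MAX(1 GB / 4, 64 MB) = 256 MB.  Since arc_c * 8 = 8 GB is
 * >= 1 GB, arc_c_max = 8 GB - 1 GB = 7 GB, and MAX(arc_c * 6, arc_c_max)
 * = MAX(6 GB, 7 GB) leaves it at 7 GB; "all but 1GB" wins over
 * "3/4 of all memory" on this machine.
 */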
4079
4080 /*
4081 * Allow the tunables to override our calculations if they are
4082 * reasonable (i.e. over 64MB)
4083 */
4084 if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
4085 arc_c_max = zfs_arc_max;
4086 if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max)
4087 arc_c_min = zfs_arc_min;
4088
4089 arc_c = arc_c_max;
4090 arc_p = (arc_c >> 1);
4091
4092 /* limit meta-data to 1/4 of the arc capacity */
4093 arc_meta_limit = arc_c_max / 4;
4094
4095 /* Allow the tunable to override if it is reasonable */
4096 if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
4097 arc_meta_limit = zfs_arc_meta_limit;
4098
4099 if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
4100 arc_c_min = arc_meta_limit / 2;
4101
4102 if (zfs_arc_grow_retry > 0)
4103 arc_grow_retry = zfs_arc_grow_retry;
4104
4105 if (zfs_arc_shrink_shift > 0)
4106 arc_shrink_shift = zfs_arc_shrink_shift;
4107
4108 if (zfs_arc_p_min_shift > 0)
4109 arc_p_min_shift = zfs_arc_p_min_shift;
4110
4111 /* if kmem_flags are set, let's try to use less memory */
4112 if (kmem_debugging())
4113 arc_c = arc_c / 2;
4114 if (arc_c < arc_c_min)
4115 arc_c = arc_c_min;
4116
4117 arc_anon = &ARC_anon;
4118 arc_mru = &ARC_mru;
4119 arc_mru_ghost = &ARC_mru_ghost;
4120 arc_mfu = &ARC_mfu;
4121 arc_mfu_ghost = &ARC_mfu_ghost;
4122 arc_l2c_only = &ARC_l2c_only;
4123 arc_size = 0;
4124
4125 mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
4126 mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
4127 mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
4128 mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
4129 mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
4130 mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
4131
4132 list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
4133 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4134 list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
4135 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4136 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
4137 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4138 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
4139 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4140 list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
4141 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4142 list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
4143 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4144 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
4145 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4146 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
4147 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4148 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
4149 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4150 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
4151 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4152
4153 buf_init();
4154
4155 arc_thread_exit = 0;
4156 arc_eviction_list = NULL;
4157 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
4158 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
4159
4160 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
4161 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
4162
4163 if (arc_ksp != NULL) {
4164 arc_ksp->ks_data = &arc_stats;
4165 kstat_install(arc_ksp);
4166 }
4167
4168 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
4169 TS_RUN, minclsyspri);
4170
4171 arc_dead = FALSE;
4172 arc_warm = B_FALSE;
4173
4174 /*
4175 * Calculate maximum amount of dirty data per pool.
4176 *
4177 * If it has been set by /etc/system, take that.
4178 * Otherwise, use a percentage of physical memory defined by
4179 * zfs_dirty_data_max_percent (default 10%) with a cap at
4180 * zfs_dirty_data_max_max (default 4GB).
4181 */
4182 if (zfs_dirty_data_max == 0) {
4183 zfs_dirty_data_max = physmem * PAGESIZE *
4184 zfs_dirty_data_max_percent / 100;
4185 zfs_dirty_data_max = MIN(zfs_dirty_data_max,
4186 zfs_dirty_data_max_max);
4187 }
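
/*
 * Worked example (illustrative): with the defaults noted above and 16 GB
 * of physical memory, 10% is ~1.6 GB, which is below the 4 GB cap, so
 * zfs_dirty_data_max ends up at ~1.6 GB.  On a 64 GB machine the 10%
 * figure (6.4 GB) would be clamped to 4 GB.
 */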
4188 }
4189
4190 void
4191 arc_fini(void)
4192 {
4193 mutex_enter(&arc_reclaim_thr_lock);
4194 arc_thread_exit = 1;
4195 while (arc_thread_exit != 0)
4196 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
4197 mutex_exit(&arc_reclaim_thr_lock);
4198
4199 arc_flush(NULL);
4200
4201 arc_dead = TRUE;
4202
4203 if (arc_ksp != NULL) {
4204 kstat_delete(arc_ksp);
4205 arc_ksp = NULL;
4206 }
4207
4208 mutex_destroy(&arc_eviction_mtx);
4209 mutex_destroy(&arc_reclaim_thr_lock);
4210 cv_destroy(&arc_reclaim_thr_cv);
4211
4212 list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
4213 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
4214 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
4215 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
4216 list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
4217 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
4218 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
4219 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
4220
4221 mutex_destroy(&arc_anon->arcs_mtx);
4222 mutex_destroy(&arc_mru->arcs_mtx);
4223 mutex_destroy(&arc_mru_ghost->arcs_mtx);
4224 mutex_destroy(&arc_mfu->arcs_mtx);
4225 mutex_destroy(&arc_mfu_ghost->arcs_mtx);
4226 mutex_destroy(&arc_l2c_only->arcs_mtx);
4227
4228 buf_fini();
4229
4230 ASSERT(arc_loaned_bytes == 0);
4231 }
4232
4233 /*
4234 * Level 2 ARC
4235 *
4236 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
4237 * It uses dedicated storage devices to hold cached data, which are populated
4238 * using large infrequent writes. The main role of this cache is to boost
4239 * the performance of random read workloads. The intended L2ARC devices
4240 * include short-stroked disks, solid state disks, and other media with
4241 * substantially faster read latency than disk.
4242 *
4243 * +-----------------------+
4244 * | ARC |
4245 * +-----------------------+
4246 * | ^ ^
4247 * | | |
4248 * l2arc_feed_thread() arc_read()
4249 * | | |
4250 * | l2arc read |
4251 * V | |
4252 * +---------------+ |
4253 * | L2ARC | |
4254 * +---------------+ |
4255 * | ^ |
4256 * l2arc_write() | |
4257 * | | |
4258 * V | |
4259 * +-------+ +-------+
4260 * | vdev | | vdev |
4261 * | cache | | cache |
4262 * +-------+ +-------+
4263 * +=========+ .-----.
4264 * : L2ARC : |-_____-|
4265 * : devices : | Disks |
4266 * +=========+ `-_____-'
4267 *
4268 * Read requests are satisfied from the following sources, in order:
4269 *
4270 * 1) ARC
4271 * 2) vdev cache of L2ARC devices
4272 * 3) L2ARC devices
4273 * 4) vdev cache of disks
4274 * 5) disks
4275 *
4276 * Some L2ARC device types exhibit extremely slow write performance.
4277 * To accommodate this, there are some significant differences between
4278 * the L2ARC and traditional cache design:
4279 *
4280 * 1. There is no eviction path from the ARC to the L2ARC. Evictions from
4281 * the ARC behave as usual, freeing buffers and placing headers on ghost
4282 * lists. The ARC does not send buffers to the L2ARC during eviction as
4283 * this would inflate write latencies under any ARC memory pressure.
4284 *
4285 * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
4286 * It does this by periodically scanning buffers from the eviction-end of
4287 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
4288 * not already there. It scans until a headroom of buffers is satisfied,
4289 * which itself acts as a cushion ahead of ARC eviction. If a compressible buffer is
4290 * found during scanning and selected for writing to an L2ARC device, we
4291 * temporarily boost scanning headroom during the next scan cycle to make
4292 * sure we adapt to compression effects (which might significantly reduce
4293 * the data volume we write to L2ARC). The thread that does this is
4294 * l2arc_feed_thread(), illustrated below; example sizes are included to
4295 * provide a better sense of ratio than this diagram:
4296 *
4297 * head --> tail
4298 * +---------------------+----------+
4299 * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC
4300 * +---------------------+----------+ | o L2ARC eligible
4301 * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer
4302 * +---------------------+----------+ |
4303 * 15.9 Gbytes ^ 32 Mbytes |
4304 * headroom |
4305 * l2arc_feed_thread()
4306 * |
4307 * l2arc write hand <--[oooo]--'
4308 * | 8 Mbyte
4309 * | write max
4310 * V
4311 * +==============================+
4312 * L2ARC dev |####|#|###|###| |####| ... |
4313 * +==============================+
4314 * 32 Gbytes
4315 *
4316 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
4317 * evicted, then the L2ARC has cached a buffer much sooner than it probably
4318 * needed to, potentially wasting L2ARC device bandwidth and storage. It is
4319 * safe to say that this is an uncommon case, since buffers at the end of
4320 * the ARC lists have moved there due to inactivity.
4321 *
4322 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
4323 * then the L2ARC simply misses copying some buffers. This serves as a
4324 * pressure valve to prevent heavy read workloads from both stalling the ARC
4325 * with waits and clogging the L2ARC with writes. This also helps prevent
4326 * the potential for the L2ARC to churn if it attempts to cache content too
4327 * quickly, such as during backups of the entire pool.
4328 *
4329 * 5. After system boot and before the ARC has filled main memory, there are
4330 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
4331 * lists can remain mostly static. Instead of searching from the tail of these
4332 * lists as pictured, the l2arc_feed_thread() will search from the list heads
4333 * for eligible buffers, greatly increasing its chance of finding them.
4334 *
4335 * The L2ARC device write speed is also boosted during this time so that
4336 * the L2ARC warms up faster. Since there have been no ARC evictions yet,
4337 * there are no L2ARC reads, and no fear of degrading read performance
4338 * through increased writes.
4339 *
4340 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
4341 * the vdev queue can aggregate them into larger and fewer writes. Each
4342 * device is written to in a rotor fashion, sweeping writes through
4343 * available space then repeating.
4344 *
4345 * 7. The L2ARC does not store dirty content. It never needs to flush
4346 * write buffers back to disk based storage.
4347 *
4348 * 8. If an ARC buffer is written (and dirtied) which also exists in the
4349 * L2ARC, the now stale L2ARC buffer is immediately dropped.
4350 *
4351 * The performance of the L2ARC can be tweaked by a number of tunables, which
4352 * may be necessary for different workloads:
4353 *
4354 * l2arc_write_max max write bytes per interval
4355 * l2arc_write_boost extra write bytes during device warmup
4356 * l2arc_noprefetch skip caching prefetched buffers
4357 * l2arc_headroom number of max device writes to precache
4358 * l2arc_headroom_boost when we find compressed buffers during ARC
4359 * scanning, we multiply headroom by this
4360 * percentage factor for the next scan cycle,
4361 * since more compressed buffers are likely to
4362 * be present
4363 * l2arc_feed_secs seconds between L2ARC writing
4364 *
4365 * Tunables may be removed or added as future performance improvements are
4366 * integrated, and also may become zpool properties.
4367 *
4368 * There are three key functions that control how the L2ARC warms up:
4369 *
4370 * l2arc_write_eligible() check if a buffer is eligible to cache
4371 * l2arc_write_size() calculate how much to write
4372 * l2arc_write_interval() calculate sleep delay between writes
4373 *
4374 * These three functions determine what to write, how much, and how quickly
4375 * to send writes.
4376 *
4377 * L2ARC persistence:
4378 *
4379 * When writing buffers to L2ARC, we periodically add some metadata to
4380 * make sure we can pick them up after reboot, thus dramatically reducing
4381 * the impact that any downtime has on the performance of storage systems
4382 * with large caches.
4383 *
4384 * The implementation works fairly simply by integrating the following two
4385 * modifications:
4386 *
4387 * *) Every now and then we mix in a piece of metadata (called a log block)
4388 * into the L2ARC write. This allows us to understand what's been written,
4389 * so that we can rebuild the arc_buf_hdr_t structures of the main ARC
4390 * buffers. The log block also includes a "back-reference" pointer to the
4391 * previous block, forming a back-linked list of blocks on the L2ARC device.
4392 *
4393 * *) We reserve SPA_MINBLOCKSIZE of space at the start of each L2ARC device
4394 * for our header bookkeeping purposes. This contains a device header, which
4395 * holds our top-level reference structures. We update it each time we
4396 * write a new log block, so that we're able to locate it in the L2ARC
4397 * device. If this write results in an inconsistent device header (e.g. due
4398 * to power failure), we detect this by verifying the header's checksum
4399 * and simply drop the entries from L2ARC.
4400 *
4401 * Implementation diagram:
4402 *
4403 * +=== L2ARC device (not to scale) ======================================+
4404 * | __________newest log block pointers_________ |
4405 * | / \1 back \latest |
4406 * | / V V |
4407 * ||L2 dev hdr |---|bufs |lb |bufs |lb |bufs |lb |bufs |lb |---(empty)---|
4408 * | ^ / ^ / ^ / |
4409 * | `-prev-' `-prev-' `-prev-' |
4410 * | lb lb lb |
4411 * +======================================================================+
4412 *
4413 * On-device data structures:
4414 *
4415 * L2ARC device header: l2arc_dev_hdr_phys_t
4416 * L2ARC log block: l2arc_log_blk_phys_t
4417 *
4418 * L2ARC reconstruction:
4419 *
4420 * When writing data, we simply write in the standard rotary fashion,
4421 * evicting buffers as we go and writing new data over them (writing
4422 * a new log block every now and then). This obviously means that once we
4423 * loop around the end of the device, we will start cutting into an already
4424 * committed log block (and its referenced data buffers), like so:
4425 *
4426 * current write head__ __old tail
4427 * \ /
4428 * V V
4429 * <--|bufs |lb |bufs |lb | |bufs |lb |bufs |lb |-->
4430 * ^ ^^^^^^^^^___________________________________
4431 * | \
4432 * <<nextwrite>> may overwrite this blk and/or its bufs --'
4433 *
4434 * When importing the pool, we detect this situation and use it to stop
4435 * our scanning process (see l2arc_rebuild).
4436 *
4437 * There is one significant caveat to consider when rebuilding ARC contents
4438 * from an L2ARC device: what about invalidated buffers? Given the above
4439 * construction, we cannot update blocks which we've already written to amend
4440 * them to remove buffers which were invalidated. Thus, during reconstruction,
4441 * we might be populating the cache with buffers for data that's not on the
4442 * main pool anymore, or may have been overwritten!
4443 *
4444 * As it turns out, this isn't a problem. Every arc_read request includes
4445 * both the DVA and, crucially, the birth TXG of the BP the caller is
4446 * looking for. So even if the cache were populated by completely rotten
4447 * blocks for data that had been long deleted and/or overwritten, we'll
4448 * never actually return bad data from the cache, since the DVA with the
4449 * birth TXG uniquely identifies a block in space and time - once created,
4450 * a block is immutable on disk. The worst we will have done is waste
4451 * some time and memory at l2arc rebuild to reconstruct outdated ARC
4452 * entries that will get dropped from the l2arc as it is being updated
4453 * with new blocks.
4454 */
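
/*
 * Worked example of a feed cycle (a sketch; the tunable values below are
 * illustrative assumptions, not guaranteed defaults): with
 * l2arc_write_max = 8 MB, l2arc_write_boost = 8 MB, l2arc_headroom = 2
 * and l2arc_headroom_boost = 200, a cycle that runs while the ARC is
 * still warming up targets 8 + 8 = 16 MB of writes (l2arc_write_size()),
 * scans up to 16 MB * 2 = 32 MB of each ARC list (l2arc_write_buffers())
 * and, if compressible buffers were found, scans 32 MB * 200 / 100 =
 * 64 MB on the next cycle.  Once the ARC is warm, the boost is dropped
 * and the write target falls back to l2arc_write_max.
 */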
4455
4456 static boolean_t
4457 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
4458 {
4459 /*
4460 * A buffer is *not* eligible for the L2ARC if it:
4461 * 1. belongs to a different spa.
4462 * 2. is already cached on the L2ARC.
4463 * 3. has an I/O in progress (it may be an incomplete read).
4464 * 4. is flagged not eligible (zfs property).
4465 */
4466 if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL ||
4467 HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab))
4468 return (B_FALSE);
4469
4470 return (B_TRUE);
4471 }
4472
4473 static uint64_t
4474 l2arc_write_size(void)
4475 {
4476 uint64_t size;
4477
4478 /*
4479 * Make sure our globals have meaningful values in case the user
4480 * altered them.
4481 */
4482 size = l2arc_write_max;
4483 if (size == 0) {
4484 cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
4485 "be greater than zero, resetting it to the default (%d)",
4486 L2ARC_WRITE_SIZE);
4487 size = l2arc_write_max = L2ARC_WRITE_SIZE;
4488 }
4489
4490 if (arc_warm == B_FALSE)
4491 size += l2arc_write_boost;
4492
4493 return (size);
4494
4495 }
4496
4497 static clock_t
4498 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
4499 {
4500 clock_t interval, next, now;
4501
4502 /*
4503 * If the ARC lists are busy, increase our write rate; if the
4504 * lists are stale, idle back. This is achieved by checking
4505 * how much we previously wrote - if it was more than half of
4506 * what we wanted, schedule the next write much sooner.
4507 */
4508 if (l2arc_feed_again && wrote > (wanted / 2))
4509 interval = (hz * l2arc_feed_min_ms) / 1000;
4510 else
4511 interval = hz * l2arc_feed_secs;
4512
4513 now = ddi_get_lbolt();
4514 next = MAX(now, MIN(now + interval, began + interval));
4515
4516 return (next);
4517 }
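
/*
 * Worked example (illustrative; assumes hz = 100 ticks/sec,
 * l2arc_feed_secs = 1 and l2arc_feed_min_ms = 200): if l2arc_feed_again
 * is set and the previous pass wrote more than half of what it wanted,
 * the interval is 100 * 200 / 1000 = 20 ticks (200 ms); otherwise it is
 * 100 ticks (1 second).  The MAX/MIN expression then schedules the next
 * write 'interval' ticks after the previous pass began, but never
 * earlier than 'now' - so a pass that already overran the interval is
 * followed immediately.
 */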
4518
4519 static void
4520 l2arc_hdr_stat_add(boolean_t from_arc)
4521 {
4522 ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
4523 if (from_arc)
4524 ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4525 }
4526
4527 static void
4528 l2arc_hdr_stat_remove(void)
4529 {
4530 ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
4531 ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
4532 }
4533
4534 /*
4535 * Cycle through L2ARC devices. This is how L2ARC load balances.
4536 * If a device is returned, this also returns holding the spa config lock.
4537 */
4538 static l2arc_dev_t *
4539 l2arc_dev_get_next(void)
4540 {
4541 l2arc_dev_t *first, *next = NULL;
4542
4543 /*
4544 * Lock out the removal of spas (spa_namespace_lock), then removal
4545 * of cache devices (l2arc_dev_mtx). Once a device has been selected,
4546 * both locks will be dropped and a spa config lock held instead.
4547 */
4548 mutex_enter(&spa_namespace_lock);
4549 mutex_enter(&l2arc_dev_mtx);
4550
4551 /* if there are no vdevs, there is nothing to do */
4552 if (l2arc_ndev == 0)
4553 goto out;
4554
4555 first = NULL;
4556 next = l2arc_dev_last;
4557 do {
4558 /*
4559 * Loop around the list looking for a non-faulted vdev
4560 * and one that isn't currently doing an L2ARC rebuild.
4561 */
4562 if (next == NULL) {
4563 next = list_head(l2arc_dev_list);
4564 } else {
4565 next = list_next(l2arc_dev_list, next);
4566 if (next == NULL)
4567 next = list_head(l2arc_dev_list);
4568 }
4569
4570 /* if we have come back to the start, bail out */
4571 if (first == NULL)
4572 first = next;
4573 else if (next == first)
4574 break;
4575
4576 } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild);
4577
4578 /* if we were unable to find any usable vdevs, return NULL */
4579 if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild)
4580 next = NULL;
4581
4582 l2arc_dev_last = next;
4583
4584 out:
4585 mutex_exit(&l2arc_dev_mtx);
4586
4587 /*
4588 * Grab the config lock to prevent the 'next' device from being
4589 * removed while we are writing to it.
4590 */
4591 if (next != NULL)
4592 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4593 mutex_exit(&spa_namespace_lock);
4594
4595 return (next);
4596 }
4597
4598 /*
4599 * Free buffers that were tagged for destruction.
4600 */
4601 static void
4602 l2arc_do_free_on_write(void)
4603 {
4604 list_t *buflist;
4605 l2arc_data_free_t *df, *df_prev;
4606
4607 mutex_enter(&l2arc_free_on_write_mtx);
4608 buflist = l2arc_free_on_write;
4609
4610 for (df = list_tail(buflist); df; df = df_prev) {
4611 df_prev = list_prev(buflist, df);
4612 ASSERT(df->l2df_data != NULL);
4613 ASSERT(df->l2df_func != NULL);
4614 df->l2df_func(df->l2df_data, df->l2df_size);
4615 list_remove(buflist, df);
4616 kmem_free(df, sizeof (l2arc_data_free_t));
4617 }
4618
4619 mutex_exit(&l2arc_free_on_write_mtx);
4620 }
4621
4622 /*
4623 * A write to a cache device has completed. Update all headers to allow
4624 * reads from these buffers to begin.
4625 */
4626 static void
4627 l2arc_write_done(zio_t *zio)
4628 {
4629 l2arc_write_callback_t *cb;
4630 l2arc_dev_t *dev;
4631 list_t *buflist;
4632 arc_buf_hdr_t *head, *ab, *ab_prev;
4633 l2arc_buf_hdr_t *l2hdr;
4634 kmutex_t *hash_lock;
4635 l2arc_log_blk_buf_t *lb_buf;
4636
4637 cb = zio->io_private;
4638 ASSERT(cb != NULL);
4639 dev = cb->l2wcb_dev;
4640 ASSERT(dev != NULL);
4641 head = cb->l2wcb_head;
4642 ASSERT(head != NULL);
4643 buflist = dev->l2ad_buflist;
4644 ASSERT(buflist != NULL);
4645 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4646 l2arc_write_callback_t *, cb);
4647
4648 if (zio->io_error != 0)
4649 ARCSTAT_BUMP(arcstat_l2_writes_error);
4650
4651 mutex_enter(&l2arc_buflist_mtx);
4652
4653 /*
4654 * All writes completed, or an error was hit.
4655 */
4656 for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4657 ab_prev = list_prev(buflist, ab);
4658 l2hdr = ab->b_l2hdr;
4659
4660 /*
4661 * Release the temporary compressed buffer as soon as possible.
4662 */
4663 if (l2hdr->b_compress != ZIO_COMPRESS_OFF)
4664 l2arc_release_cdata_buf(ab);
4665
4666 hash_lock = HDR_LOCK(ab);
4667 if (!mutex_tryenter(hash_lock)) {
4668 /*
4669 * This buffer misses out. It may be in a stage
4670 * of eviction. Its ARC_L2_WRITING flag will be
4671 * left set, denying reads to this buffer.
4672 */
4673 ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4674 continue;
4675 }
4676
4677 if (zio->io_error != 0) {
4678 /*
4679 * Error - drop L2ARC entry.
4680 */
4681 list_remove(buflist, ab);
4682 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
4683 ab->b_l2hdr = NULL;
4684 kmem_free(l2hdr, sizeof (*l2hdr));
4685 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4686 }
4687
4688 /*
4689 * Allow ARC to begin reads to this L2ARC entry.
4690 */
4691 ab->b_flags &= ~ARC_L2_WRITING;
4692
4693 mutex_exit(hash_lock);
4694 }
4695
4696 atomic_inc_64(&l2arc_writes_done);
4697 list_remove(buflist, head);
4698 kmem_cache_free(hdr_cache, head);
4699 mutex_exit(&l2arc_buflist_mtx);
4700
4701 l2arc_do_free_on_write();
4702
4703 for (lb_buf = list_tail(&cb->l2wcb_log_blk_buf_list); lb_buf != NULL;
4704 lb_buf = list_tail(&cb->l2wcb_log_blk_buf_list)) {
4705 (void) list_remove_tail(&cb->l2wcb_log_blk_buf_list);
4706 kmem_free(lb_buf, sizeof (*lb_buf));
4707 }
4708 list_destroy(&cb->l2wcb_log_blk_buf_list);
4709 kmem_free(cb, sizeof (l2arc_write_callback_t));
4710 }
4711
4712 /*
4713 * A read to a cache device completed. Validate buffer contents before
4714 * handing over to the regular ARC routines.
4715 */
4716 static void
4717 l2arc_read_done(zio_t *zio)
4718 {
4719 l2arc_read_callback_t *cb;
4720 arc_buf_hdr_t *hdr;
4721 arc_buf_t *buf;
4722 kmutex_t *hash_lock;
4723 int equal;
4724
4725 ASSERT(zio->io_vd != NULL);
4726 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4727
4728 spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
4729
4730 cb = zio->io_private;
4731 ASSERT(cb != NULL);
4732 buf = cb->l2rcb_buf;
4733 ASSERT(buf != NULL);
4734
4735 hash_lock = HDR_LOCK(buf->b_hdr);
4736 mutex_enter(hash_lock);
4737 hdr = buf->b_hdr;
4738 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4739
4740 /*
4741 * If the buffer was compressed, decompress it first.
4742 */
4743 if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
4744 l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
4745 ASSERT(zio->io_data != NULL);
4746
4747 /*
4748 * Check whether this buffer survived the L2ARC journey.
4749 */
4750 equal = arc_cksum_equal(buf);
4751 if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
4752 mutex_exit(hash_lock);
4753 zio->io_private = buf;
4754 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
4755 zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */
4756 arc_read_done(zio);
4757 } else {
4758 mutex_exit(hash_lock);
4759 /*
4760 * Buffer didn't survive caching. Increment stats and
4761 * reissue to the original storage device.
4762 */
4763 if (zio->io_error != 0) {
4764 ARCSTAT_BUMP(arcstat_l2_io_error);
4765 } else {
4766 zio->io_error = SET_ERROR(EIO);
4767 }
4768 if (!equal)
4769 ARCSTAT_BUMP(arcstat_l2_cksum_bad);
4770
4771 /*
4772 * If there's no waiter, issue an async i/o to the primary
4773 * storage now. If there *is* a waiter, the caller must
4774 * issue the i/o in a context where it's OK to block.
4775 */
4776 if (zio->io_waiter == NULL) {
4777 zio_t *pio = zio_unique_parent(zio);
4778
4779 ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
4780
4781 zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
4782 buf->b_data, zio->io_size, arc_read_done, buf,
4783 zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
4784 }
4785 }
4786
4787 kmem_free(cb, sizeof (l2arc_read_callback_t));
4788 }
4789
4790 /*
4791 * This is the list priority from which the L2ARC will search for pages to
4792 * cache. This is used within loops (0..3) to cycle through lists in the
4793 * desired order. This order can have a significant effect on cache
4794 * performance.
4795 *
4796 * Currently the metadata lists are hit first, MFU then MRU, followed by
4797 * the data lists. This function returns a locked list, and also returns
4798 * the lock pointer.
4799 */
4800 static list_t *
4801 l2arc_list_locked(int list_num, kmutex_t **lock)
4802 {
4803 list_t *list = NULL;
4804
4805 ASSERT(list_num >= 0 && list_num <= 3);
4806
4807 switch (list_num) {
4808 case 0:
4809 list = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
4810 *lock = &arc_mfu->arcs_mtx;
4811 break;
4812 case 1:
4813 list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
4814 *lock = &arc_mru->arcs_mtx;
4815 break;
4816 case 2:
4817 list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
4818 *lock = &arc_mfu->arcs_mtx;
4819 break;
4820 case 3:
4821 list = &arc_mru->arcs_list[ARC_BUFC_DATA];
4822 *lock = &arc_mru->arcs_mtx;
4823 break;
4824 }
4825
4826 ASSERT(!(MUTEX_HELD(*lock)));
4827 mutex_enter(*lock);
4828 return (list);
4829 }
4830
4831 /*
4832 * Calculates the maximum overhead of L2ARC metadata log blocks for a given
4833 * L2ARC write size. l2arc_evict and l2arc_write_buffers need to include this
4834 * overhead in processing to make sure there is enough headroom available
4835 * when writing buffers.
4836 */
4837 static inline uint64_t
4838 l2arc_log_blk_overhead(uint64_t write_sz)
4839 {
4840 return ((write_sz / SPA_MINBLOCKSIZE / L2ARC_LOG_BLK_ENTRIES) + 1) *
4841 L2ARC_LOG_BLK_SIZE;
4842 }
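
/*
 * Example of the worst-case reasoning above (the log-block constants are
 * assumed purely for illustration): each written buffer needs one log
 * entry, and in the worst case every buffer is only SPA_MINBLOCKSIZE
 * (512 bytes), so an 8 MB write can reference up to 8 MB / 512 = 16384
 * buffers.  If a log block held 1024 entries and occupied 128 KB on the
 * device, that would budget 16384 / 1024 + 1 = 17 log blocks, or about
 * 2.1 MB of overhead on top of the 8 MB of data.
 */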
4843
4844 /*
4845 * Evict buffers from the device write hand to the distance specified in
4846 * bytes. This distance may span populated buffers, or it may span nothing.
4847 * This is clearing a region on the L2ARC device ready for writing.
4848 * If the 'all' boolean is set, every buffer is evicted.
4849 */
4850 static void
4851 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4852 {
4853 list_t *buflist;
4854 l2arc_buf_hdr_t *l2hdr;
4855 arc_buf_hdr_t *ab, *ab_prev;
4856 kmutex_t *hash_lock;
4857 uint64_t taddr;
4858
4859 buflist = dev->l2ad_buflist;
4860
4861 if (buflist == NULL)
4862 return;
4863
4864 if (!all && dev->l2ad_first) {
4865 /*
4866 * This is the first sweep through the device. There is
4867 * nothing to evict.
4868 */
4869 return;
4870 }
4871
4872 /*
4873 * We need to add in the worst case scenario of log block overhead.
4874 */
4875 distance += l2arc_log_blk_overhead(distance);
4876 if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4877 /*
4878 * When nearing the end of the device, evict to the end
4879 * before the device write hand jumps to the start.
4880 */
4881 taddr = dev->l2ad_end;
4882 } else {
4883 taddr = dev->l2ad_hand + distance;
4884 }
4885 DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4886 uint64_t, taddr, boolean_t, all);
4887
4888 top:
4889 mutex_enter(&l2arc_buflist_mtx);
4890 for (ab = list_tail(buflist); ab; ab = ab_prev) {
4891 ab_prev = list_prev(buflist, ab);
4892
4893 hash_lock = HDR_LOCK(ab);
4894 if (!mutex_tryenter(hash_lock)) {
4895 /*
4896 * Missed the hash lock. Retry.
4897 */
4898 ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
4899 mutex_exit(&l2arc_buflist_mtx);
4900 mutex_enter(hash_lock);
4901 mutex_exit(hash_lock);
4902 goto top;
4903 }
4904
4905 if (HDR_L2_WRITE_HEAD(ab)) {
4906 /*
4907 * We hit a write head node. Leave it for
4908 * l2arc_write_done().
4909 */
4910 list_remove(buflist, ab);
4911 mutex_exit(hash_lock);
4912 continue;
4913 }
4914
4915 if (!all && ab->b_l2hdr != NULL &&
4916 (ab->b_l2hdr->b_daddr > taddr ||
4917 ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
4918 /*
4919 * We've evicted to the target address,
4920 * or the end of the device.
4921 */
4922 mutex_exit(hash_lock);
4923 break;
4924 }
4925
4926 if (HDR_FREE_IN_PROGRESS(ab)) {
4927 /*
4928 * Already on the path to destruction.
4929 */
4930 mutex_exit(hash_lock);
4931 continue;
4932 }
4933
4934 if (ab->b_state == arc_l2c_only) {
4935 ASSERT(!HDR_L2_READING(ab));
4936 /*
4937 * This doesn't exist in the ARC. Destroy.
4938 * arc_hdr_destroy() will call list_remove()
4939 * and decrement arcstat_l2_size.
4940 */
4941 arc_change_state(arc_anon, ab, hash_lock);
4942 arc_hdr_destroy(ab);
4943 } else {
4944 /*
4945 * Invalidate issued or about to be issued
4946 * reads, since we may be about to write
4947 * over this location.
4948 */
4949 if (HDR_L2_READING(ab)) {
4950 ARCSTAT_BUMP(arcstat_l2_evict_reading);
4951 ab->b_flags |= ARC_L2_EVICTED;
4952 }
4953
4954 /*
4955 * Tell ARC this no longer exists in L2ARC.
4956 */
4957 if (ab->b_l2hdr != NULL) {
4958 l2hdr = ab->b_l2hdr;
4959 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
4960 ab->b_l2hdr = NULL;
4961 kmem_free(l2hdr, sizeof (*l2hdr));
4962 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4963 }
4964 list_remove(buflist, ab);
4965
4966 /*
4967 * This may have been left over after a
4968 * failed write.
4969 */
4970 ab->b_flags &= ~ARC_L2_WRITING;
4971 }
4972 mutex_exit(hash_lock);
4973 }
4974 mutex_exit(&l2arc_buflist_mtx);
4975
4976 vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
4977 dev->l2ad_evict = taddr;
4978 }
4979
4980 /*
4981 * Find and write ARC buffers to the L2ARC device.
4982 *
4983 * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
4984 * for reading until they have completed writing.
4985 * The headroom_boost is an in-out parameter used to maintain headroom boost
4986 * state between calls to this function.
4987 *
4988 * Returns the number of bytes actually written (which may be smaller than
4989 * the delta by which the device hand has changed due to alignment).
4990 */
4991 static uint64_t
4992 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
4993 boolean_t *headroom_boost)
4994 {
4995 arc_buf_hdr_t *ab, *ab_prev, *head;
4996 list_t *list;
4997 /*
4998 * These variables mean:
4999 * - write_size: in-memory size of ARC buffers we've written (before
5000 * compression).
5001 * - write_asize: actual on-disk size of ARC buffers we've written
5002 * (after compression).
5003 * - write_aligned_asize: actual sum of space taken by ARC buffers
5004 * on the device (after compression and alignment, so that
5005 * every buffer starts on a multiple of the device block size).
5006 * - headroom: L2ARC scanning headroom (we won't scan beyond this
5007 * distance from the list tail).
5008 * - buf_compress_minsz: minimum in-memory ARC buffer size for us
5009 * to try compressing it.
5010 */
5011 uint64_t write_size, write_asize, write_aligned_asize, headroom,
5012 buf_compress_minsz;
5013 void *buf_data;
5014 kmutex_t *list_lock;
5015 boolean_t full;
5016 l2arc_write_callback_t *cb;
5017 zio_t *pio, *wzio;
5018 uint64_t guid = spa_load_guid(spa);
5019 const boolean_t do_headroom_boost = *headroom_boost;
5020 boolean_t dev_hdr_update = B_FALSE;
5021
5022 ASSERT(dev->l2ad_vdev != NULL);
5023
5024 /* Lower the flag now, we might want to raise it again later. */
5025 *headroom_boost = B_FALSE;
5026
5027 pio = NULL;
5028 cb = NULL;
5029 write_size = write_asize = write_aligned_asize = 0;
5030 full = B_FALSE;
5031 head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
5032 head->b_flags |= ARC_L2_WRITE_HEAD;
5033
5034 /*
5035 * We will want to try to compress buffers that are at least 2x the
5036 * device sector size.
5037 */
5038 buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
5039
5040 /*
5041 * Copy buffers for L2ARC writing.
5042 */
5043 mutex_enter(&l2arc_buflist_mtx);
5044 for (int try = 0; try <= 3; try++) {
5045 uint64_t passed_sz = 0;
5046
5047 list = l2arc_list_locked(try, &list_lock);
5048
5049 /*
5050 * L2ARC fast warmup.
5051 *
5052 * Until the ARC is warm and starts to evict, read from the
5053 * head of the ARC lists rather than the tail.
5054 */
5055 if (arc_warm == B_FALSE)
5056 ab = list_head(list);
5057 else
5058 ab = list_tail(list);
5059
5060 headroom = target_sz * l2arc_headroom;
5061 if (do_headroom_boost)
5062 headroom = (headroom * l2arc_headroom_boost) / 100;
5063
5064 for (; ab; ab = ab_prev) {
5065 l2arc_buf_hdr_t *l2hdr;
5066 kmutex_t *hash_lock;
5067 uint64_t buf_aligned_size;
5068
5069 if (arc_warm == B_FALSE)
5070 ab_prev = list_next(list, ab);
5071 else
5072 ab_prev = list_prev(list, ab);
5073
5074 hash_lock = HDR_LOCK(ab);
5075 if (!mutex_tryenter(hash_lock)) {
5076 /*
5077 * Skip this buffer rather than waiting.
5078 */
5079 continue;
5080 }
5081
5082 /*
5083 * When examining whether we've met our write target,
5084 * we must always use the aligned size of the buffer,
5085 * since that's the maximum amount of space a buffer
5086 * can take up on the L2ARC device.
5087 */
5088 buf_aligned_size = vdev_psize_to_asize(dev->l2ad_vdev,
5089 ab->b_size);
5090 passed_sz += buf_aligned_size;
5091 if (passed_sz > headroom) {
5092 /*
5093 * Searched too far.
5094 */
5095 mutex_exit(hash_lock);
5096 break;
5097 }
5098
5099 if (!l2arc_write_eligible(guid, ab)) {
5100 mutex_exit(hash_lock);
5101 continue;
5102 }
5103
5104 if ((write_size + buf_aligned_size) > target_sz) {
5105 full = B_TRUE;
5106 mutex_exit(hash_lock);
5107 break;
5108 }
5109
5110 if (pio == NULL) {
5111 /*
5112 * Insert a dummy header on the buflist so
5113 * l2arc_write_done() can find where the
5114 * write buffers begin without searching.
5115 */
5116 list_insert_head(dev->l2ad_buflist, head);
5117
5118 cb = kmem_zalloc(
5119 sizeof (l2arc_write_callback_t), KM_SLEEP);
5120 cb->l2wcb_dev = dev;
5121 cb->l2wcb_head = head;
5122 list_create(&cb->l2wcb_log_blk_buf_list,
5123 sizeof (l2arc_log_blk_buf_t),
5124 offsetof(l2arc_log_blk_buf_t, l2lbb_node));
5125 pio = zio_root(spa, l2arc_write_done, cb,
5126 ZIO_FLAG_CANFAIL);
5127 }
5128
5129 /*
5130 * Create and add a new L2ARC header.
5131 */
5132 l2hdr = kmem_zalloc(sizeof (*l2hdr), KM_SLEEP);
5133 l2hdr->b_dev = dev;
5134 ab->b_flags |= ARC_L2_WRITING;
5135
5136 /*
5137 * Temporarily stash the data buffer in b_tmp_cdata.
5138 * The subsequent write step will pick it up from
5139 * there. This is because we can't access ab->b_buf
5140 * without holding the hash_lock, which we in turn
5141 * can't access without holding the ARC list locks
5142 * (which we want to avoid during compression/writing).
5143 */
5144 l2hdr->b_compress = ZIO_COMPRESS_OFF;
5145 l2hdr->b_asize = ab->b_size;
5146 l2hdr->b_tmp_cdata = ab->b_buf->b_data;
5147
5148 ab->b_l2hdr = l2hdr;
5149
5150 list_insert_head(dev->l2ad_buflist, ab);
5151
5152 /*
5153 * Compute and store the buffer cksum before
5154 * writing. On debug the cksum is verified first.
5155 */
5156 arc_cksum_verify(ab->b_buf);
5157 arc_cksum_compute(ab->b_buf, B_TRUE);
5158
5159 mutex_exit(hash_lock);
5160
5161 write_size += buf_aligned_size;
5162 }
5163
5164 mutex_exit(list_lock);
5165
5166 if (full == B_TRUE)
5167 break;
5168 }
5169
5170 /* No buffers selected for writing? */
5171 if (pio == NULL) {
5172 ASSERT0(write_size);
5173 mutex_exit(&l2arc_buflist_mtx);
5174 kmem_cache_free(hdr_cache, head);
5175 return (0);
5176 }
5177
5178 /*
5179 * Now start writing the buffers. We're starting at the write head
5180 * and work backwards, retracing the course of the buffer selector
5181 * loop above.
5182 */
5183 for (ab = list_prev(dev->l2ad_buflist, head); ab;
5184 ab = list_prev(dev->l2ad_buflist, ab)) {
5185 l2arc_buf_hdr_t *l2hdr;
5186 uint64_t buf_sz;
5187
5188 /*
5189 * We shouldn't need to lock the buffer here, since we flagged
5190 * it as ARC_L2_WRITING in the previous step, but we must take
5191 * care to only access its L2 cache parameters. In particular,
5192 * ab->b_buf may be invalid by now due to ARC eviction.
5193 */
5194 l2hdr = ab->b_l2hdr;
5195 l2hdr->b_daddr = dev->l2ad_hand;
5196
5197 if ((ab->b_flags & ARC_L2COMPRESS) &&
5198 l2hdr->b_asize >= buf_compress_minsz) {
5199 if (l2arc_compress_buf(l2hdr)) {
5200 /*
5201 * If compression succeeded, enable headroom
5202 * boost on the next scan cycle.
5203 */
5204 *headroom_boost = B_TRUE;
5205 }
5206 }
5207
5208 /*
5209 * Pick up the buffer data we had previously stashed away
5210 * (and now potentially also compressed).
5211 */
5212 buf_data = l2hdr->b_tmp_cdata;
5213 buf_sz = l2hdr->b_asize;
5214
5215 /* Compression may have squashed the buffer to zero length. */
5216 if (buf_sz != 0) {
5217 uint64_t buf_aligned_asize;
5218
5219 wzio = zio_write_phys(pio, dev->l2ad_vdev,
5220 dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
5221 NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
5222 ZIO_FLAG_CANFAIL, B_FALSE);
5223
5224 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
5225 zio_t *, wzio);
5226 (void) zio_nowait(wzio);
5227
5228 write_asize += buf_sz;
5229 /*
5230 * Keep the clock hand suitably device-aligned.
5231 */
5232 buf_aligned_asize = vdev_psize_to_asize(dev->l2ad_vdev,
5233 buf_sz);
5234 write_aligned_asize += buf_aligned_asize;
5235 dev->l2ad_hand += buf_aligned_asize;
5236 ASSERT(dev->l2ad_hand <= dev->l2ad_evict ||
5237 dev->l2ad_first);
5238 }
5239
5240 if (l2arc_log_blk_insert(dev, ab)) {
5241 l2arc_log_blk_commit(dev, pio, cb);
5242 dev_hdr_update = B_TRUE;
5243 }
5244 }
5245 mutex_exit(&l2arc_buflist_mtx);
5246
5247 if (dev_hdr_update)
5248 l2arc_dev_hdr_update(dev, pio);
5249
5250 VERIFY3U(write_aligned_asize, <=, target_sz);
5251 ARCSTAT_BUMP(arcstat_l2_writes_sent);
5252 ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
5253 ARCSTAT_INCR(arcstat_l2_size, write_size);
5254 ARCSTAT_INCR(arcstat_l2_asize, write_aligned_asize);
5255 vdev_space_update(dev->l2ad_vdev, write_aligned_asize, 0, 0);
5256
5257 /*
5258 * Bump device hand to the device start if it is approaching the end.
5259 * l2arc_evict() will already have evicted ahead for this case.
5260 */
5261 if (dev->l2ad_hand + target_sz + l2arc_log_blk_overhead(target_sz) >=
5262 dev->l2ad_end) {
5263 vdev_space_update(dev->l2ad_vdev,
5264 dev->l2ad_end - dev->l2ad_hand, 0, 0);
5265 dev->l2ad_hand = dev->l2ad_start;
5266 dev->l2ad_evict = dev->l2ad_start;
5267 dev->l2ad_first = B_FALSE;
5268 }
5269
5270 dev->l2ad_writing = B_TRUE;
5271 (void) zio_wait(pio);
5272 dev->l2ad_writing = B_FALSE;
5273
5274 return (write_asize);
5275 }
5276
5277 /*
5278 * Compresses an L2ARC buffer.
5279 * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
5280 * size in l2hdr->b_asize. This routine tries to compress the data and
5281 * depending on the compression result there are three possible outcomes:
5282 * *) The buffer was incompressible. The original l2hdr contents were left
5283 * untouched and are ready for writing to an L2 device.
5284 * *) The buffer was all-zeros, so there is no need to write it to an L2
5285 * device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
5286 * set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
5287 * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
5288 * data buffer which holds the compressed data to be written, and b_asize
5289 * tells us how much data there is. b_compress is set to the appropriate
5290 * compression algorithm. Once writing is done, invoke
5291 * l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
5292 *
5293 * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
5294 * buffer was incompressible).
5295 */
5296 static boolean_t
5297 l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
5298 {
5299 void *cdata;
5300 size_t csize, len;
5301
5302 ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
5303 ASSERT(l2hdr->b_tmp_cdata != NULL);
5304
5305 len = l2hdr->b_asize;
5306 cdata = zio_data_buf_alloc(len);
5307 csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
5308 cdata, l2hdr->b_asize);
5309
5310 if (csize == 0) {
5311 /* zero block, indicate that there's nothing to write */
5312 zio_data_buf_free(cdata, len);
5313 l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
5314 l2hdr->b_asize = 0;
5315 l2hdr->b_tmp_cdata = NULL;
5316 ARCSTAT_BUMP(arcstat_l2_compress_zeros);
5317 return (B_TRUE);
5318 } else if (csize > 0 && csize < len) {
5319 /*
5320 * Compression succeeded, we'll keep the cdata around for
5321 * writing and release it afterwards.
5322 */
5323 l2hdr->b_compress = ZIO_COMPRESS_LZ4;
5324 l2hdr->b_asize = csize;
5325 l2hdr->b_tmp_cdata = cdata;
5326 ARCSTAT_BUMP(arcstat_l2_compress_successes);
5327 return (B_TRUE);
5328 } else {
5329 /*
5330 * Compression failed, release the compressed buffer.
5331 * l2hdr will be left unmodified.
5332 */
5333 zio_data_buf_free(cdata, len);
5334 ARCSTAT_BUMP(arcstat_l2_compress_failures);
5335 return (B_FALSE);
5336 }
5337 }
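
/*
 * Illustrative caller-side flow (a sketch only; l2arc_write_buffers above is
 * the real caller, and the l2hdr below is assumed to have been prepared as
 * described in the block comment preceding l2arc_compress_buf):
 *
 *	l2hdr->b_compress = ZIO_COMPRESS_OFF;
 *	l2hdr->b_asize = <uncompressed size>;
 *	l2hdr->b_tmp_cdata = <uncompressed data>;
 *	(void) l2arc_compress_buf(l2hdr);
 *	if (l2hdr->b_asize != 0)
 *		<write b_asize bytes from b_tmp_cdata at the device hand>;
 *	<once the write is done, call l2arc_release_cdata_buf() on the
 *	 owning ARC header to free any temporary compression buffer>;
 */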
5338
5339 /*
5340 * Decompresses a zio read back from an l2arc device. On success, the
5341 * underlying zio's io_data buffer is overwritten by the uncompressed
5342 * version. On decompression error (corrupt compressed stream), the
5343 * zio->io_error value is set to signal an I/O error.
5344 *
5345  * Please note that the compressed data stream is not checksummed, so
5346  * if the underlying device is experiencing data corruption we may feed
5347  * corrupt data to the decompressor; the decompressor therefore needs to
5348  * be able to handle such input safely (LZ4 does).
5349 */
5350 static void
5351 l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
5352 {
5353 ASSERT(L2ARC_IS_VALID_COMPRESS(c));
5354
5355 if (zio->io_error != 0) {
5356 /*
5357 		 * An io error has occurred; just restore the original io
5358 * size in preparation for a main pool read.
5359 */
5360 zio->io_orig_size = zio->io_size = hdr->b_size;
5361 return;
5362 }
5363
5364 if (c == ZIO_COMPRESS_EMPTY) {
5365 /*
5366 * An empty buffer results in a null zio, which means we
5367 * need to fill its io_data after we're done restoring the
5368 * buffer's contents.
5369 */
5370 ASSERT(hdr->b_buf != NULL);
5371 bzero(hdr->b_buf->b_data, hdr->b_size);
5372 zio->io_data = zio->io_orig_data = hdr->b_buf->b_data;
5373 } else {
5374 ASSERT(zio->io_data != NULL);
5375 /*
5376 * We copy the compressed data from the start of the arc buffer
5377 * (the zio_read will have pulled in only what we need, the
5378 * rest is garbage which we will overwrite at decompression)
5379 * and then decompress back to the ARC data buffer. This way we
5380 * can minimize copying by simply decompressing back over the
5381 * original compressed data (rather than decompressing to an
5382 * aux buffer and then copying back the uncompressed buffer,
5383 * which is likely to be much larger).
5384 */
5385 uint64_t csize;
5386 void *cdata;
5387
5388 csize = zio->io_size;
5389 cdata = zio_data_buf_alloc(csize);
5390 bcopy(zio->io_data, cdata, csize);
5391 if (zio_decompress_data(c, cdata, zio->io_data, csize,
5392 hdr->b_size) != 0)
5393 zio->io_error = EIO;
5394 zio_data_buf_free(cdata, csize);
5395 }
5396
5397 /* Restore the expected uncompressed IO size. */
5398 zio->io_orig_size = zio->io_size = hdr->b_size;
5399 }
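
/*
 * To illustrate the sizes involved (hypothetical numbers): for a 128K ARC
 * buffer stored on the L2 device as a 20K LZ4 stream, zio->io_size is 20K
 * on entry (only the compressed bytes were read into the ARC data buffer).
 * Those 20K are copied aside, decompressed back over the full ARC buffer,
 * and io_size/io_orig_size are then restored to hdr->b_size (128K).
 */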
5400
5401 /*
5402 * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
5403 * This buffer serves as a temporary holder of compressed data while
5404 * the buffer entry is being written to an l2arc device. Once that is
5405 * done, we can dispose of it.
5406 */
5407 static void
5408 l2arc_release_cdata_buf(arc_buf_hdr_t *ab)
5409 {
5410 l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
5411
5412 if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) {
5413 /*
5414 * If the data was compressed, then we've allocated a
5415 * temporary buffer for it, so now we need to release it.
5416 */
5417 ASSERT(l2hdr->b_tmp_cdata != NULL);
5418 zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size);
5419 }
5420 l2hdr->b_tmp_cdata = NULL;
5421 }
5422
5423 /*
5424 * This thread feeds the L2ARC at regular intervals. This is the beating
5425 * heart of the L2ARC.
5426 */
5427 static void
5428 l2arc_feed_thread(void)
5429 {
5430 callb_cpr_t cpr;
5431 l2arc_dev_t *dev;
5432 spa_t *spa;
5433 uint64_t size, wrote;
5434 clock_t begin, next = ddi_get_lbolt();
5435 boolean_t headroom_boost = B_FALSE;
5436
5437 CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
5438
5439 mutex_enter(&l2arc_feed_thr_lock);
5440
5441 while (l2arc_thread_exit == 0) {
5442 CALLB_CPR_SAFE_BEGIN(&cpr);
5443 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
5444 next);
5445 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
5446 next = ddi_get_lbolt() + hz;
5447
5448 /*
5449 * Quick check for L2ARC devices.
5450 */
5451 mutex_enter(&l2arc_dev_mtx);
5452 if (l2arc_ndev == 0) {
5453 mutex_exit(&l2arc_dev_mtx);
5454 continue;
5455 }
5456 mutex_exit(&l2arc_dev_mtx);
5457 begin = ddi_get_lbolt();
5458
5459 /*
5460 * This selects the next l2arc device to write to, and in
5461 * doing so the next spa to feed from: dev->l2ad_spa. This
5462 * will return NULL if there are now no l2arc devices or if
5463 * they are all faulted.
5464 *
5465 * If a device is returned, its spa's config lock is also
5466 * held to prevent device removal. l2arc_dev_get_next()
5467 * will grab and release l2arc_dev_mtx.
5468 */
5469 if ((dev = l2arc_dev_get_next()) == NULL)
5470 continue;
5471
5472 spa = dev->l2ad_spa;
5473 ASSERT(spa != NULL);
5474
5475 /*
5476 * If the pool is read-only then force the feed thread to
5477 * sleep a little longer.
5478 */
5479 if (!spa_writeable(spa)) {
5480 next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
5481 spa_config_exit(spa, SCL_L2ARC, dev);
5482 continue;
5483 }
5484
5485 /*
5486 * Avoid contributing to memory pressure.
5487 */
5488 if (arc_reclaim_needed()) {
5489 ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
5490 spa_config_exit(spa, SCL_L2ARC, dev);
5491 continue;
5492 }
5493
5494 ARCSTAT_BUMP(arcstat_l2_feeds);
5495
5496 size = l2arc_write_size();
5497
5498 /*
5499 * Evict L2ARC buffers that will be overwritten.
5500 */
5501 l2arc_evict(dev, size, B_FALSE);
5502
5503 /*
5504 * Write ARC buffers.
5505 */
5506 wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
5507
5508 /*
5509 * Calculate interval between writes.
5510 */
5511 next = l2arc_write_interval(begin, size, wrote);
5512 spa_config_exit(spa, SCL_L2ARC, dev);
5513 }
5514
5515 l2arc_thread_exit = 0;
5516 cv_broadcast(&l2arc_feed_thr_cv);
5517 CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */
5518 thread_exit();
5519 }
5520
5521 boolean_t
5522 l2arc_vdev_present(vdev_t *vd)
5523 {
5524 return (l2arc_vdev_get(vd) != NULL);
5525 }
5526
5527 static l2arc_dev_t *
5528 l2arc_vdev_get(vdev_t *vd)
5529 {
5530 l2arc_dev_t *dev;
5531 boolean_t held = MUTEX_HELD(&l2arc_dev_mtx);
5532
5533 if (!held)
5534 mutex_enter(&l2arc_dev_mtx);
5535 for (dev = list_head(l2arc_dev_list); dev != NULL;
5536 dev = list_next(l2arc_dev_list, dev)) {
5537 if (dev->l2ad_vdev == vd)
5538 break;
5539 }
5540 if (!held)
5541 mutex_exit(&l2arc_dev_mtx);
5542
5543 return (dev);
5544 }
5545
5546 /*
5547 * Add a vdev for use by the L2ARC. By this point the spa has already
5548 * validated the vdev and opened it. The `rebuild' flag indicates whether
5549  * we should attempt an L2ARC persistence rebuild.
5550 */
5551 void
5552 l2arc_add_vdev(spa_t *spa, vdev_t *vd, boolean_t rebuild)
5553 {
5554 l2arc_dev_t *adddev;
5555
5556 ASSERT(!l2arc_vdev_present(vd));
5557
5558 /*
5559 * Create a new l2arc device entry.
5560 */
5561 adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
5562 adddev->l2ad_spa = spa;
5563 adddev->l2ad_vdev = vd;
5564 /* leave an extra SPA_MINBLOCKSIZE for l2arc device header */
5565 adddev->l2ad_start = VDEV_LABEL_START_SIZE + SPA_MINBLOCKSIZE;
5566 adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
5567 adddev->l2ad_hand = adddev->l2ad_start;
5568 adddev->l2ad_evict = adddev->l2ad_start;
5569 adddev->l2ad_first = B_TRUE;
5570 adddev->l2ad_writing = B_FALSE;
5571
5572 /*
5573 * This is a list of all ARC buffers that are still valid on the
5574 * device.
5575 */
5576 adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
5577 list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
5578 offsetof(arc_buf_hdr_t, b_l2node));
5579
5580 vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
5581
5582 /*
5583 * Add device to global list
5584 */
5585 mutex_enter(&l2arc_dev_mtx);
5586 list_insert_head(l2arc_dev_list, adddev);
5587 atomic_inc_64(&l2arc_ndev);
5588 if (rebuild && l2arc_rebuild_enabled &&
5589 adddev->l2ad_end - adddev->l2ad_start > L2ARC_PERSIST_MIN_SIZE) {
5590 /*
5591 * Just mark the device as pending for a rebuild. We won't
5592 		 * be starting a rebuild inline here, as it would block pool
5593 * import. Instead spa_load_impl will hand that off to an
5594 * async task which will call l2arc_spa_rebuild_start.
5595 */
5596 adddev->l2ad_rebuild = B_TRUE;
5597 }
5598 mutex_exit(&l2arc_dev_mtx);
5599 }
5600
5601 /*
5602 * Remove a vdev from the L2ARC.
5603 */
5604 void
5605 l2arc_remove_vdev(vdev_t *vd)
5606 {
5607 l2arc_dev_t *dev, *nextdev, *remdev = NULL;
5608
5609 /*
5610 * Find the device by vdev
5611 */
5612 mutex_enter(&l2arc_dev_mtx);
5613 for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
5614 nextdev = list_next(l2arc_dev_list, dev);
5615 if (vd == dev->l2ad_vdev) {
5616 remdev = dev;
5617 break;
5618 }
5619 }
5620 ASSERT(remdev != NULL);
5621
5622 /*
5623 * Remove device from global list
5624 */
5625 list_remove(l2arc_dev_list, remdev);
5626 l2arc_dev_last = NULL; /* may have been invalidated */
5627 atomic_dec_64(&l2arc_ndev);
5628 mutex_exit(&l2arc_dev_mtx);
5629
5630 /*
5631 * Clear all buflists and ARC references. L2ARC device flush.
5632 */
5633 l2arc_evict(remdev, 0, B_TRUE);
5634 list_destroy(remdev->l2ad_buflist);
5635 kmem_free(remdev->l2ad_buflist, sizeof (list_t));
5636 kmem_free(remdev, sizeof (l2arc_dev_t));
5637 }
5638
5639 void
5640 l2arc_init(void)
5641 {
5642 l2arc_thread_exit = 0;
5643 l2arc_ndev = 0;
5644 l2arc_writes_sent = 0;
5645 l2arc_writes_done = 0;
5646
5647 mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
5648 cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
5649 mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
5650 mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
5651 mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
5652
5653 l2arc_dev_list = &L2ARC_dev_list;
5654 l2arc_free_on_write = &L2ARC_free_on_write;
5655 list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
5656 offsetof(l2arc_dev_t, l2ad_node));
5657 list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
5658 offsetof(l2arc_data_free_t, l2df_list_node));
5659 }
5660
5661 void
5662 l2arc_fini(void)
5663 {
5664 /*
5665 * This is called from dmu_fini(), which is called from spa_fini();
5666 * Because of this, we can assume that all l2arc devices have
5667 * already been removed when the pools themselves were removed.
5668 */
5669
5670 l2arc_do_free_on_write();
5671
5672 mutex_destroy(&l2arc_feed_thr_lock);
5673 cv_destroy(&l2arc_feed_thr_cv);
5674 mutex_destroy(&l2arc_dev_mtx);
5675 mutex_destroy(&l2arc_buflist_mtx);
5676 mutex_destroy(&l2arc_free_on_write_mtx);
5677
5678 list_destroy(l2arc_dev_list);
5679 list_destroy(l2arc_free_on_write);
5680 }
5681
5682 void
5683 l2arc_start(void)
5684 {
5685 if (!(spa_mode_global & FWRITE))
5686 return;
5687
5688 (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
5689 TS_RUN, minclsyspri);
5690 }
5691
5692 void
5693 l2arc_stop(void)
5694 {
5695 if (!(spa_mode_global & FWRITE))
5696 return;
5697
5698 mutex_enter(&l2arc_feed_thr_lock);
5699 cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */
5700 l2arc_thread_exit = 1;
5701 while (l2arc_thread_exit != 0)
5702 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
5703 mutex_exit(&l2arc_feed_thr_lock);
5704 }
5705
5706 /*
5707 * Punches out rebuild threads for the L2ARC devices in a spa. This should
5708 * be called as one of the final steps of a pool import.
5709 */
5710 void
5711 l2arc_spa_rebuild_start(spa_t *spa)
5712 {
5713 l2arc_dev_t *dev;
5714 /*
5715 * Locate the spa's l2arc devices and kick off rebuild threads.
5716 */
5717 mutex_enter(&l2arc_dev_mtx);
5718 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
5719 dev = l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]);
5720 ASSERT(dev != NULL);
5721 if (dev->l2ad_rebuild) {
5722 (void) thread_create(NULL, 0, l2arc_dev_rebuild_start,
5723 dev, 0, &p0, TS_RUN, minclsyspri);
5724 }
5725 }
5726 mutex_exit(&l2arc_dev_mtx);
5727 }
5728
5729 /*
5730 * Main entry point for L2ARC rebuilding.
5731 */
5732 static void
5733 l2arc_dev_rebuild_start(l2arc_dev_t *dev)
5734 {
5735 spa_t *spa = dev->l2ad_spa;
5736 vdev_t *vd = dev->l2ad_vdev;
5737
5738 /* Lock out device removal. */
5739 spa_config_enter(spa, SCL_L2ARC, vd, RW_READER);
5740 ASSERT(dev->l2ad_rebuild);
5741 (void) l2arc_rebuild(dev);
5742 dev->l2ad_rebuild = B_FALSE;
5743 spa_config_exit(spa, SCL_L2ARC, vd);
5744 thread_exit();
5745 }
5746
5747 /*
5748 * This function implements the actual L2ARC metadata rebuild. It:
5749 *
5750 * 1) reads the device's header
5751 * 2) if a good device header is found, starts reading the log block chain
5752 * 3) restores each block's contents to memory (reconstructing arc_buf_hdr_t's)
5753 *
5754 * Operation stops under any of the following conditions:
5755 *
5756 * 1) We reach the end of the log blk chain (the back-reference in the blk is
5757 * invalid or loops over our starting point).
5758 * 2) We encounter *any* error condition (cksum errors, io errors, looped
5759 * blocks, etc.).
5760  * 3) The l2arc_rebuild_timeout is hit - this is a last-resort safeguard
5761  *    to keep severely fragmented L2ARC log block chains or slow L2ARC
5762  *    devices from blocking a pool import indefinitely (the timeout lets the
5763 * administrator take corrective action, e.g. by kicking the misbehaving
5764 * L2ARC device out of the pool, or by reimporting the pool with L2ARC
5765 * rebuilding disabled).
5766 */
5767 static int
5768 l2arc_rebuild(l2arc_dev_t *dev)
5769 {
5770 int err;
5771 l2arc_log_blk_phys_t *this_lb, *next_lb;
5772 uint8_t *this_lb_buf, *next_lb_buf;
5773 zio_t *this_io = NULL, *next_io = NULL;
5774 int64_t deadline;
5775 l2arc_log_blk_ptr_t lb_ptrs[2];
5776 boolean_t first_pass;
5777 uint64_t load_guid;
5778
5779 load_guid = spa_load_guid(dev->l2ad_vdev->vdev_spa);
5780 deadline = ddi_get_lbolt64() + hz * l2arc_rebuild_timeout;
5781 /*
5782 * Device header processing phase.
5783 */
5784 if ((err = l2arc_dev_hdr_read(dev, &dev->l2ad_dev_hdr)) != 0) {
5785 /* device header corrupted, start a new one */
5786 		bzero(&dev->l2ad_dev_hdr, sizeof (dev->l2ad_dev_hdr));
5787 return (err);
5788 }
5789 if (l2arc_check_rebuild_timeout_hit(deadline))
5790 return (SET_ERROR(ETIMEDOUT));
5791
5792 /* Retrieve the persistent L2ARC device state */
5793 dev->l2ad_evict = dev->l2ad_dev_hdr.l2dh_evict_tail;
5794 dev->l2ad_hand = vdev_psize_to_asize(dev->l2ad_vdev,
5795 dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_daddr +
5796 LBP_GET_PSIZE(&dev->l2ad_dev_hdr.l2dh_start_lbps[0]));
5797 dev->l2ad_first = !!(dev->l2ad_dev_hdr.l2dh_flags &
5798 L2ARC_DEV_HDR_EVICT_FIRST);
5799
5800 /* Prepare the rebuild processing state */
5801 bcopy(dev->l2ad_dev_hdr.l2dh_start_lbps, lb_ptrs, sizeof (lb_ptrs));
5802 this_lb = kmem_zalloc(sizeof (*this_lb), KM_SLEEP);
5803 next_lb = kmem_zalloc(sizeof (*next_lb), KM_SLEEP);
5804 this_lb_buf = kmem_zalloc(sizeof (l2arc_log_blk_phys_t), KM_SLEEP);
5805 next_lb_buf = kmem_zalloc(sizeof (l2arc_log_blk_phys_t), KM_SLEEP);
5806 first_pass = B_TRUE;
5807
5808 /* Start the rebuild process */
5809 for (;;) {
5810 if (!l2arc_log_blk_ptr_valid(dev, &lb_ptrs[0]))
5811 /* We hit an invalid block address, end the rebuild. */
5812 break;
5813
5814 if ((err = l2arc_log_blk_read(dev, &lb_ptrs[0], &lb_ptrs[1],
5815 this_lb, next_lb, this_lb_buf, next_lb_buf,
5816 this_io, &next_io)) != 0)
5817 break;
5818
5819 /* Protection against infinite loops of log blocks. */
5820 if (l2arc_range_check_overlap(lb_ptrs[1].l2lbp_daddr,
5821 lb_ptrs[0].l2lbp_daddr,
5822 dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_daddr) &&
5823 !first_pass) {
5824 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_loop_errors);
5825 err = SET_ERROR(ELOOP);
5826 break;
5827 }
5828
5829 /*
5830 * Our memory pressure valve. If the system is running low
5831 * on memory, rather than swamping memory with new ARC buf
5832 * hdrs, we opt not to rebuild the L2ARC. At this point,
5833 * however, we have already set up our L2ARC dev to chain in
5834 		 * new metadata log blks, so the user may choose to re-add the
5835 * L2ARC dev at a later time to reconstruct it (when there's
5836 * less memory pressure).
5837 */
5838 if (arc_reclaim_needed()) {
5839 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem);
5840 cmn_err(CE_NOTE, "System running low on memory, "
5841 "aborting L2ARC rebuild.");
5842 err = SET_ERROR(ENOMEM);
5843 break;
5844 }
5845
5846 /*
5847 * Now that we know that the next_lb checks out alright, we
5848 * can start reconstruction from this lb - we can be sure
5849 * that the L2ARC write hand has not yet reached any of our
5850 * buffers.
5851 */
5852 l2arc_log_blk_restore(dev, load_guid, this_lb,
5853 LBP_GET_PSIZE(&lb_ptrs[0]));
5854
5855 /*
5856 * End of list detection. We can look ahead two steps in the
5857 * blk chain and if the 2nd blk from this_lb dips below the
5858 * initial chain starting point, then we know two things:
5859 * 1) it can't be valid, and
5860 * 2) the next_lb's ARC entries might have already been
5861 * partially overwritten and so we should stop before
5862 * we restore it
5863 */
5864 if (l2arc_range_check_overlap(
5865 this_lb->l2lb_back2_lbp.l2lbp_daddr, lb_ptrs[0].l2lbp_daddr,
5866 dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_daddr) &&
5867 !first_pass)
5868 break;
5869
5870 /* log blk restored, continue with next one in the list */
5871 lb_ptrs[0] = lb_ptrs[1];
5872 lb_ptrs[1] = this_lb->l2lb_back2_lbp;
5873 PTR_SWAP(this_lb, next_lb);
5874 PTR_SWAP(this_lb_buf, next_lb_buf);
5875 this_io = next_io;
5876 next_io = NULL;
5877 first_pass = B_FALSE;
5878
5879 if (l2arc_check_rebuild_timeout_hit(deadline)) {
5880 err = SET_ERROR(ETIMEDOUT);
5881 break;
5882 }
5883 }
5884 if (next_io != NULL)
5885 l2arc_log_blk_prefetch_abort(next_io);
5886 kmem_free(this_lb, sizeof (*this_lb));
5887 kmem_free(next_lb, sizeof (*next_lb));
5888 kmem_free(this_lb_buf, sizeof (l2arc_log_blk_phys_t));
5889 kmem_free(next_lb_buf, sizeof (l2arc_log_blk_phys_t));
5890 if (err == 0)
5891 ARCSTAT_BUMP(arcstat_l2_rebuild_successes);
5892
5893 return (err);
5894 }
5895
5896 /*
5897 * Restores the payload of a log blk to ARC. This creates empty ARC hdr
5898 * entries which only contain an l2arc hdr, essentially restoring the
5899 * buffers to their L2ARC evicted state. This function also updates space
5900 * usage on the L2ARC vdev to make sure it tracks restored buffers.
5901 */
5902 static void
5903 l2arc_log_blk_restore(l2arc_dev_t *dev, uint64_t load_guid,
5904 l2arc_log_blk_phys_t *lb, uint64_t lb_psize)
5905 {
5906 uint64_t size = 0, psize = 0;
5907
5908 mutex_enter(&l2arc_buflist_mtx);
5909
5910 for (int i = L2ARC_LOG_BLK_ENTRIES - 1; i >= 0; i--) {
5911 /*
5912 * Restore goes in the reverse direction to preserve correct
5913 * temporal ordering of buffers in the l2ad_buflist.
5914 */
5915 l2arc_hdr_restore(&lb->l2lb_entries[i], dev, load_guid);
5916 size += LE_GET_LSIZE(&lb->l2lb_entries[i]);
5917 psize += LE_GET_PSIZE(&lb->l2lb_entries[i]);
5918 }
5919 mutex_exit(&l2arc_buflist_mtx);
5920
5921 /*
5922 * Record rebuild stats:
5923 * size In-memory size of restored buffer data in ARC
5924 * psize Physical size of restored buffers in the L2ARC
5925 * bufs # of ARC buffer headers restored
5926 * log_blks # of L2ARC log entries processed during restore
5927 */
5928 ARCSTAT_INCR(arcstat_l2_rebuild_size, size);
5929 ARCSTAT_INCR(arcstat_l2_rebuild_psize, psize);
5930 ARCSTAT_INCR(arcstat_l2_rebuild_bufs, L2ARC_LOG_BLK_ENTRIES);
5931 ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks);
5932 ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_size, lb_psize);
5933 ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, psize / lb_psize);
5934 vdev_space_update(dev->l2ad_vdev, psize, 0, 0);
5935 }
5936
5937 /*
5938 * Restores a single ARC buf hdr from a log block. The ARC buffer is put
5939 * into a state indicating that it has been evicted to L2ARC.
5940 */
5941 static void
5942 l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev,
5943 uint64_t load_guid)
5944 {
5945 arc_buf_hdr_t *hdr, *exists;
5946 kmutex_t *hash_lock;
5947 arc_buf_contents_t type = LE_GET_TYPE(le);
5948 l2arc_buf_hdr_t *l2hdr;
5949
5950 hdr = arc_buf_hdr_alloc(load_guid, LE_GET_LSIZE(le), type);
5951 hdr->b_dva = le->l2le_dva;
5952 hdr->b_birth = le->l2le_birth;
5953 hdr->b_cksum0 = le->l2le_cksum0;
5954 hdr->b_size = LE_GET_LSIZE(le);
5955 exists = buf_hash_insert(hdr, &hash_lock);
5956 if (exists) {
5957 /* Buffer was already cached, no need to restore it. */
5958 mutex_exit(hash_lock);
5959 arc_hdr_destroy(hdr);
5960 ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached);
5961 return;
5962 }
5963 hdr->b_flags = ARC_IN_HASH_TABLE | ARC_L2CACHE;
5964 if (LE_GET_COMPRESS(le) != ZIO_COMPRESS_OFF)
5965 hdr->b_flags |= ARC_L2COMPRESS;
5966 mutex_enter(&hdr->b_freeze_lock);
5967 ASSERT(hdr->b_freeze_cksum == NULL);
5968 hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
5969 *hdr->b_freeze_cksum = le->l2le_freeze_cksum;
5970 mutex_exit(&hdr->b_freeze_lock);
5971
5972 /* now rebuild the l2arc entry */
5973 ASSERT(hdr->b_l2hdr == NULL);
5974 l2hdr = kmem_zalloc(sizeof (*l2hdr), KM_SLEEP);
5975 l2hdr->b_dev = dev;
5976 l2hdr->b_daddr = le->l2le_daddr;
5977 l2hdr->b_asize = LE_GET_PSIZE(le);
5978 l2hdr->b_compress = LE_GET_COMPRESS(le);
5979 hdr->b_l2hdr = l2hdr;
5980 list_insert_tail(dev->l2ad_buflist, hdr);
5981 ARCSTAT_INCR(arcstat_l2_size, hdr->b_size);
5982 ARCSTAT_INCR(arcstat_l2_asize, l2hdr->b_asize);
5983
5984 arc_change_state(arc_l2c_only, hdr, hash_lock);
5985 mutex_exit(hash_lock);
5986 }
5987
5988 /*
5989 * Attempts to read the device header on the provided L2ARC device and writes
5990  * it to `hdr'. On success, this function returns 0; otherwise the appropriate
5991 * error code is returned.
5992 */
5993 static int
5994 l2arc_dev_hdr_read(l2arc_dev_t *dev, l2arc_dev_hdr_phys_t *hdr)
5995 {
5996 int err;
5997 uint64_t guid;
5998 zio_cksum_t cksum;
5999
6000 guid = spa_guid(dev->l2ad_vdev->vdev_spa);
6001
6002 if ((err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev,
6003 VDEV_LABEL_START_SIZE, sizeof (*hdr), hdr,
6004 ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
6005 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
6006 ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE))) != 0) {
6007 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors);
6008 return (err);
6009 }
6010
6011 if (hdr->l2dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC))
6012 byteswap_uint64_array(hdr, sizeof (*hdr));
6013
6014 if (hdr->l2dh_magic != L2ARC_DEV_HDR_MAGIC ||
6015 hdr->l2dh_spa_guid != guid) {
6016 /*
6017 * Attempt to rebuild a device containing no actual dev hdr
6018 * or containing a header from some other pool.
6019 */
6020 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported);
6021 return (SET_ERROR(ENOTSUP));
6022 }
6023
6024 l2arc_dev_hdr_checksum(hdr, &cksum);
6025 if (!ZIO_CHECKSUM_EQUAL(hdr->l2dh_self_cksum, cksum)) {
6026 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_errors);
6027 return (SET_ERROR(EINVAL));
6028 }
6029 if (hdr->l2dh_evict_tail < dev->l2ad_start ||
6030 hdr->l2dh_evict_tail >= dev->l2ad_end) {
6031 /* Data in dev hdr is invalid for this device. */
6032 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported);
6033 return (SET_ERROR(EINVAL));
6034 }
6035
6036 return (0);
6037 }
6038
6039 /*
6040 * Reads L2ARC log blocks from storage and validates their contents.
6041 *
6042 * This function implements a simple prefetcher to make sure that while
6043 * we're processing one buffer the L2ARC is already prefetching the next
6044 * one in the chain.
6045 *
6046 * The arguments this_lp and next_lp point to the current and next log blk
6047 * address in the block chain. Similarly, this_lb and next_lb hold the
6048 * l2arc_log_blk_phys_t's of the current and next L2ARC blk. The this_lb_buf
6049  * and next_lb_buf must be buffers of appropriate size to hold a raw
6050 * l2arc_log_blk_phys_t (they are used as catch buffers for read ops prior
6051 * to buffer decompression).
6052 *
6053 * The `this_io' and `next_io' arguments are used for block prefetching.
6054 * When issuing the first blk IO during rebuild, you should pass NULL for
6055 * `this_io'. This function will then issue a sync IO to read the block and
6056 * also issue an async IO to fetch the next block in the block chain. The
6057 * prefetch IO is returned in `next_io'. On subsequent calls to this
6058 * function, pass the value returned in `next_io' from the previous call
6059 * as `this_io' and a fresh `next_io' pointer to hold the next prefetch IO.
6060 * Prior to the call, you should initialize your `next_io' pointer to be
6061 * NULL. If no prefetch IO was issued, the pointer is left set at NULL.
6062 *
6063 * On success, this function returns 0, otherwise it returns an appropriate
6064 * error code. On error the prefetching IO is aborted and cleared before
6065 * returning from this function. Therefore, if we return `success', the
6066 * caller can assume that we have taken care of cleanup of prefetch IOs.
6067 */
6068 static int
6069 l2arc_log_blk_read(l2arc_dev_t *dev,
6070 const l2arc_log_blk_ptr_t *this_lbp, const l2arc_log_blk_ptr_t *next_lbp,
6071 l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
6072 uint8_t *this_lb_buf, uint8_t *next_lb_buf,
6073 zio_t *this_io, zio_t **next_io)
6074 {
6075 int err = 0;
6076 zio_cksum_t cksum;
6077
6078 ASSERT(this_lbp != NULL && next_lbp != NULL);
6079 ASSERT(this_lb != NULL && next_lb != NULL);
6080 ASSERT(this_lb_buf != NULL && next_lb_buf != NULL);
6081 ASSERT(next_io != NULL && *next_io == NULL);
6082 ASSERT(l2arc_log_blk_ptr_valid(dev, this_lbp));
6083
6084 /*
6085 * Check to see if we have issued the IO for this log blk in a
6086 * previous run. If not, this is the first call, so issue it now.
6087 */
6088 if (this_io == NULL) {
6089 this_io = l2arc_log_blk_prefetch(dev->l2ad_vdev, this_lbp,
6090 this_lb_buf);
6091 }
6092
6093 /*
6094 * Peek to see if we can start issuing the next IO immediately.
6095 */
6096 if (l2arc_log_blk_ptr_valid(dev, next_lbp)) {
6097 /*
6098 * Start issuing IO for the next log blk early - this
6099 * should help keep the L2ARC device busy while we
6100 * decompress and restore this log blk.
6101 */
6102 *next_io = l2arc_log_blk_prefetch(dev->l2ad_vdev, next_lbp,
6103 next_lb_buf);
6104 }
6105
6106 /* Wait for the IO to read this log block to complete */
6107 if ((err = zio_wait(this_io)) != 0) {
6108 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors);
6109 goto cleanup;
6110 }
6111
6112 /* Make sure the buffer checks out */
6113 fletcher_4_native(this_lb_buf, LBP_GET_PSIZE(this_lbp), &cksum);
6114 if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->l2lbp_cksum)) {
6115 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_errors);
6116 err = SET_ERROR(EINVAL);
6117 goto cleanup;
6118 }
6119
6120 /* Now we can take our time decoding this buffer */
6121 switch (LBP_GET_COMPRESS(this_lbp)) {
6122 case ZIO_COMPRESS_OFF:
6123 bcopy(this_lb_buf, this_lb, sizeof (*this_lb));
6124 break;
6125 case ZIO_COMPRESS_LZ4:
6126 if ((err = zio_decompress_data(LBP_GET_COMPRESS(this_lbp),
6127 this_lb_buf, this_lb, LBP_GET_PSIZE(this_lbp),
6128 sizeof (*this_lb))) != 0) {
6129 err = SET_ERROR(EINVAL);
6130 goto cleanup;
6131 }
6132 break;
6133 default:
6134 err = SET_ERROR(EINVAL);
6135 goto cleanup;
6136 }
6137 if (this_lb->l2lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC))
6138 byteswap_uint64_array(this_lb, sizeof (*this_lb));
6139 if (this_lb->l2lb_magic != L2ARC_LOG_BLK_MAGIC) {
6140 err = SET_ERROR(EINVAL);
6141 goto cleanup;
6142 }
6143 cleanup:
6144 /* Abort an in-flight prefetch I/O in case of error */
6145 if (err != 0 && *next_io != NULL) {
6146 l2arc_log_blk_prefetch_abort(*next_io);
6147 *next_io = NULL;
6148 }
6149 return (err);
6150 }
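
/*
 * A minimal sketch of the calling pattern described above (l2arc_rebuild is
 * the real caller; end-of-chain and error handling are omitted here):
 *
 *	zio_t *this_io = NULL, *next_io = NULL;
 *	for (;;) {
 *		if (l2arc_log_blk_read(dev, &lb_ptrs[0], &lb_ptrs[1],
 *		    this_lb, next_lb, this_lb_buf, next_lb_buf,
 *		    this_io, &next_io) != 0)
 *			break;
 *		<restore this_lb and advance lb_ptrs>;
 *		this_io = next_io;
 *		next_io = NULL;
 *	}
 *	if (next_io != NULL)
 *		l2arc_log_blk_prefetch_abort(next_io);
 */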
6151
6152 /*
6153 * Validates an L2ARC log blk address to make sure that it can be read
6154 * from the provided L2ARC device. Returns B_TRUE if the address is
6155 * within the device's bounds, or B_FALSE if not.
6156 */
6157 static boolean_t
6158 l2arc_log_blk_ptr_valid(l2arc_dev_t *dev, const l2arc_log_blk_ptr_t *lbp)
6159 {
6160 uint64_t psize = LBP_GET_PSIZE(lbp);
6161 uint64_t end = lbp->l2lbp_daddr + psize;
6162
6163 /*
6164 * A log block is valid if all of the following conditions are true:
6165 * - it fits entirely between l2ad_start and l2ad_end
6166 * - it has a valid size
6167 * - it isn't anywhere between l2ad_hand and l2ad_evict (i.e. it
6168 * doesn't sit in the evicted region)
6169 */
6170 return (lbp->l2lbp_daddr >= dev->l2ad_start && end < dev->l2ad_end &&
6171 psize != 0 && psize <= sizeof (l2arc_log_blk_phys_t) &&
6172 lbp->l2lbp_daddr > dev->l2ad_evict && end <= dev->l2ad_hand);
6173 }
6174
6175 /*
6176 * Starts an asynchronous read IO to read a log block. This is used in log
6177 * block reconstruction to start reading the next block before we are done
6178 * decoding and reconstructing the current block, to keep the l2arc device
6179 * nice and hot with read IO to process.
6180  * The read is performed into the caller-supplied lb_buf, so the caller must
6181  * keep that buffer allocated until the zio completes and is responsible for
6182  * freeing it afterwards. If you wish to abort this zio, do so using
6183  * l2arc_log_blk_prefetch_abort, which waits for the in-flight read to
6184  * complete so that the buffer can be safely reclaimed.
6185 */
6186 static zio_t *
6187 l2arc_log_blk_prefetch(vdev_t *vd, const l2arc_log_blk_ptr_t *lbp,
6188 uint8_t *lb_buf)
6189 {
6190 uint32_t psize;
6191 zio_t *pio;
6192
6193 psize = LBP_GET_PSIZE(lbp);
6194 ASSERT(psize <= sizeof (l2arc_log_blk_phys_t));
6195 pio = zio_root(vd->vdev_spa, NULL, NULL, ZIO_FLAG_DONT_CACHE |
6196 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
6197 ZIO_FLAG_DONT_RETRY);
6198 (void) zio_nowait(zio_read_phys(pio, vd, lbp->l2lbp_daddr, psize,
6199 lb_buf, ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
6200 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
6201 ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE));
6202
6203 return (pio);
6204 }
6205
6206 /*
6207 * Aborts a zio returned from l2arc_log_blk_prefetch and frees the data
6208 * buffers allocated for it.
6209 */
6210 static void
6211 l2arc_log_blk_prefetch_abort(zio_t *zio)
6212 {
6213 (void) zio_wait(zio);
6214 }
6215
6216 /*
6217 * Creates a zio to update the device header on an l2arc device. The zio is
6218 * initiated as a child of `pio'.
6219 */
6220 static void
6221 l2arc_dev_hdr_update(l2arc_dev_t *dev, zio_t *pio)
6222 {
6223 zio_t *wzio;
6224 vdev_stat_t st;
6225 l2arc_dev_hdr_phys_t *hdr = &dev->l2ad_dev_hdr;
6226
6227 vdev_get_stats(dev->l2ad_vdev, &st);
6228
6229 hdr->l2dh_magic = L2ARC_DEV_HDR_MAGIC;
6230 hdr->l2dh_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa);
6231 hdr->l2dh_evict_tail = dev->l2ad_evict;
6232 hdr->l2dh_alloc_space = st.vs_alloc;
6233 hdr->l2dh_flags = 0;
6234 if (dev->l2ad_first)
6235 hdr->l2dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST;
6236
6237 /* checksum operation goes last */
6238 l2arc_dev_hdr_checksum(hdr, &hdr->l2dh_self_cksum);
6239
6240 CTASSERT(sizeof (*hdr) >= SPA_MINBLOCKSIZE &&
6241 sizeof (*hdr) <= SPA_MAXBLOCKSIZE);
6242 wzio = zio_write_phys(pio, dev->l2ad_vdev, VDEV_LABEL_START_SIZE,
6243 sizeof (*hdr), hdr, ZIO_CHECKSUM_OFF, NULL,
6244 NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
6245 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
6246 zio_t *, wzio);
6247 (void) zio_nowait(wzio);
6248 }
6249
6250 /*
6251 * Commits a log block to the L2ARC device. This routine is invoked from
6252 * l2arc_write_buffers when the log block fills up.
6253 * This function allocates some memory to temporarily hold the serialized
6254 * buffer to be written. This is then released in l2arc_write_done.
6255 */
6256 static void
6257 l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
6258 l2arc_write_callback_t *cb)
6259 {
6260 l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk;
6261 uint64_t psize, asize;
6262 l2arc_log_blk_buf_t *lb_buf;
6263 zio_t *wzio;
6264
6265 VERIFY(dev->l2ad_log_ent_idx == L2ARC_LOG_BLK_ENTRIES);
6266
6267 /* link the buffer into the block chain */
6268 lb->l2lb_back2_lbp = dev->l2ad_dev_hdr.l2dh_start_lbps[1];
6269 lb->l2lb_magic = L2ARC_LOG_BLK_MAGIC;
6270
6271 /* try to compress the buffer */
6272 lb_buf = kmem_zalloc(sizeof (*lb_buf), KM_SLEEP);
6273 list_insert_tail(&cb->l2wcb_log_blk_buf_list, lb_buf);
6274 VERIFY((psize = zio_compress_data(ZIO_COMPRESS_LZ4, lb,
6275 lb_buf->l2lbb_log_blk, sizeof (*lb))) != 0);
6276
6277 /*
6278 * Update the start log blk pointer in the device header to point
6279 * to the log block we're about to write.
6280 */
6281 dev->l2ad_dev_hdr.l2dh_start_lbps[1] =
6282 dev->l2ad_dev_hdr.l2dh_start_lbps[0];
6283 dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_daddr = dev->l2ad_hand;
6284 LBP_SET_LSIZE(&dev->l2ad_dev_hdr.l2dh_start_lbps[0], sizeof (*lb));
6285 LBP_SET_PSIZE(&dev->l2ad_dev_hdr.l2dh_start_lbps[0], psize);
6286 LBP_SET_CHECKSUM(&dev->l2ad_dev_hdr.l2dh_start_lbps[0],
6287 ZIO_CHECKSUM_FLETCHER_4);
6288 LBP_SET_TYPE(&dev->l2ad_dev_hdr.l2dh_start_lbps[0], 0);
6289 if (psize < sizeof (*lb)) {
6290 /* compression succeeded */
6291 LBP_SET_COMPRESS(&dev->l2ad_dev_hdr.l2dh_start_lbps[0],
6292 ZIO_COMPRESS_LZ4);
6293 } else {
6294 /* compression failed */
6295 bcopy(lb, lb_buf->l2lbb_log_blk, sizeof (*lb));
6296 LBP_SET_COMPRESS(&dev->l2ad_dev_hdr.l2dh_start_lbps[0],
6297 ZIO_COMPRESS_OFF);
6298 }
6299 /* checksum what we're about to write */
6300 fletcher_4_native(lb_buf->l2lbb_log_blk, psize,
6301 &dev->l2ad_dev_hdr.l2dh_start_lbps[0].l2lbp_cksum);
6302
6303 /* perform the write itself */
6304 CTASSERT(L2ARC_LOG_BLK_SIZE >= SPA_MINBLOCKSIZE &&
6305 L2ARC_LOG_BLK_SIZE <= SPA_MAXBLOCKSIZE);
6306 wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand,
6307 psize, lb_buf->l2lbb_log_blk, ZIO_CHECKSUM_OFF, NULL, NULL,
6308 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
6309 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio);
6310 (void) zio_nowait(wzio);
6311
6312 /* realign the device hand */
6313 asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
6314 dev->l2ad_hand += asize;
6315 VERIFY(dev->l2ad_hand <= dev->l2ad_evict || dev->l2ad_first);
6316 vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
6317
6318 /* bump the kstats */
6319 ARCSTAT_INCR(arcstat_l2_write_bytes, psize);
6320 ARCSTAT_BUMP(arcstat_l2_log_blk_writes);
6321 ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_size, asize);
6322 ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio,
6323 dev->l2ad_log_blk_payload_asize / asize);
6324
6325 dev->l2ad_log_ent_idx = dev->l2ad_log_blk_payload_asize = 0;
6326 }
6327
6328 /*
6329 * Computes the checksum of `hdr' and stores it in `cksum'.
6330 */
6331 static void
6332 l2arc_dev_hdr_checksum(const l2arc_dev_hdr_phys_t *hdr, zio_cksum_t *cksum)
6333 {
6334 fletcher_4_native((uint8_t *)hdr +
6335 offsetof(l2arc_dev_hdr_phys_t, l2dh_spa_guid),
6336 sizeof (*hdr) - offsetof(l2arc_dev_hdr_phys_t, l2dh_spa_guid),
6337 cksum);
6338 }
6339
6340 /*
6341 * Inserts ARC buffer `ab' into the current L2ARC log blk on the device.
6342 * The buffer being inserted must be present in L2ARC.
6343 * Returns B_TRUE if the L2ARC log blk is full and needs to be committed
6344 * to L2ARC, or B_FALSE if it still has room for more ARC buffers.
6345 */
6346 static boolean_t
6347 l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *ab)
6348 {
6349 l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk;
6350 l2arc_log_ent_phys_t *le;
6351 const l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
6352 int index = dev->l2ad_log_ent_idx++;
6353
6354 ASSERT(l2hdr != NULL);
6355 ASSERT(index < L2ARC_LOG_BLK_ENTRIES);
6356
6357 le = &lb->l2lb_entries[index];
6358 bzero(le, sizeof (*le));
6359 le->l2le_dva = ab->b_dva;
6360 le->l2le_birth = ab->b_birth;
6361 le->l2le_cksum0 = ab->b_cksum0;
6362 le->l2le_daddr = l2hdr->b_daddr;
6363 LE_SET_LSIZE(le, ab->b_size);
6364 LE_SET_PSIZE(le, l2hdr->b_asize);
6365 LE_SET_COMPRESS(le, l2hdr->b_compress);
6366 le->l2le_freeze_cksum = *ab->b_freeze_cksum;
6367 LE_SET_CHECKSUM(le, ZIO_CHECKSUM_FLETCHER_2);
6368 LE_SET_TYPE(le, ab->b_type);
6369 dev->l2ad_log_blk_payload_asize += l2hdr->b_asize;
6370
6371 return (dev->l2ad_log_ent_idx == L2ARC_LOG_BLK_ENTRIES);
6372 }
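
/*
 * Typical usage (this is how l2arc_write_buffers drives the log): insert each
 * buffer as it is queued for writing and commit the log block as soon as the
 * insert reports it full:
 *
 *	if (l2arc_log_blk_insert(dev, ab))
 *		l2arc_log_blk_commit(dev, pio, cb);
 */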
6373
6374 /*
6375 * Checks whether a given L2ARC device address sits in a time-sequential
6376 * range. The trick here is that the L2ARC is a rotary buffer, so we can't
6377 * just do a range comparison, we need to handle the situation in which the
6378 * range wraps around the end of the L2ARC device. Arguments:
6379 * bottom Lower end of the range to check (written to earlier).
6380 * top Upper end of the range to check (written to later).
6381 * check The address for which we want to determine if it sits in
6382 * between the top and bottom.
6383 *
6384 * The 3-way conditional below represents the following cases:
6385 *
6386 * bottom < top : Sequentially ordered case:
6387 * <check>--------+-------------------+
6388 * | (overlap here?) |
6389 * L2ARC dev V V
6390 * |---------------<bottom>============<top>--------------|
6391 *
6392 * bottom > top: Looped-around case:
6393 * <check>--------+------------------+
6394 * | (overlap here?) |
6395 * L2ARC dev V V
6396 * |===============<top>---------------<bottom>===========|
6397 * ^ ^
6398 * | (or here?) |
6399 * +---------------+---------<check>
6400 *
6401 * top == bottom : Just a single address comparison.
6402 */
6403 static inline boolean_t
6404 l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check)
6405 {
6406 if (bottom < top)
6407 return (bottom <= check && check <= top);
6408 else if (bottom > top)
6409 return (check <= top || bottom <= check);
6410 else
6411 return (check == top);
6412 }
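
/*
 * A few concrete cases (hypothetical byte offsets on the device):
 *
 *	l2arc_range_check_overlap(100, 200, 150) == B_TRUE	(ordered, inside)
 *	l2arc_range_check_overlap(100, 200, 250) == B_FALSE	(ordered, outside)
 *	l2arc_range_check_overlap(200, 100,  50) == B_TRUE	(wrapped, below top)
 *	l2arc_range_check_overlap(200, 100, 250) == B_TRUE	(wrapped, above bottom)
 *	l2arc_range_check_overlap(200, 100, 150) == B_FALSE	(wrapped, in the gap)
 */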
6413
6414 /*
6415 * Checks whether a rebuild timeout deadline has been hit and if it has,
6416 * increments the appropriate error counters.
6417 */
6418 static boolean_t
6419 l2arc_check_rebuild_timeout_hit(int64_t deadline)
6420 {
6421 if (deadline != 0 && deadline < ddi_get_lbolt64()) {
6422 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_timeout);
6423 cmn_err(CE_WARN, "L2ARC rebuild is taking too long, "
6424 "dropping remaining L2ARC metadata.");
6425 return (B_TRUE);
6426 } else {
6427 return (B_FALSE);
6428 }
6429 }