1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
24 * Copyright (c) 2013 by Delphix. All rights reserved.
25 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
26 */
27
28 /*
29 * DVA-based Adjustable Replacement Cache
30 *
31 * While much of the theory of operation used here is
32 * based on the self-tuning, low overhead replacement cache
33 * presented by Megiddo and Modha at FAST 2003, there are some
34 * significant differences:
35 *
36 * 1. The Megiddo and Modha model assumes any page is evictable.
37 * Pages in its cache cannot be "locked" into memory. This makes
38 * the eviction algorithm simple: evict the last page in the list.
39 * This also makes the performance characteristics easy to reason
40 * about. Our cache is not so simple. At any given moment, some
41 * subset of the blocks in the cache are un-evictable because we
42 * have handed out a reference to them. Blocks are only evictable
43 * when there are no external references active. This makes
44 * eviction far more problematic: we choose to evict the evictable
45 * blocks that are the "lowest" in the list.
46 *
47 * There are times when it is not possible to evict the requested
48 * space. In these circumstances we are unable to adjust the cache
49 * size. To prevent the cache from growing unbounded at these times, we
50 * implement a "cache throttle" that slows the flow of new data
51 * into the cache until we can make space available.
52 *
53 * 2. The Megiddo and Modha model assumes a fixed cache size.
54 * Pages are evicted when the cache is full and there is a cache
55 * miss. Our model has a variable sized cache. It grows with
56 * high use, but also tries to react to memory pressure from the
57 * operating system: decreasing its size when system memory is
58 * tight.
59 *
60 * 3. The Megiddo and Modha model assumes a fixed page size. All
61 * elements of the cache are therefore exactly the same size. So
62 * when adjusting the cache size following a cache miss, it's simply
63 * a matter of choosing a single page to evict. In our model, we
64 * have variable sized cache blocks (ranging from 512 bytes to
65 * 128K bytes). We therefore choose a set of blocks to evict to make
66 * space for a cache miss that approximates as closely as possible
67 * the space used by the new block.
68 *
69 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
70 * by N. Megiddo & D. Modha, FAST 2003
71 */
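/*
 * Illustrative sketch (not part of the build; next_evictable() and
 * evict_block() are placeholder names): the variable-block-size eviction
 * described in point 3 amounts to walking an eviction list tail-first and
 * accumulating evictable blocks until roughly the requested number of
 * bytes has been reclaimed:
 *
 *	int64_t freed = 0;
 *	while (freed < bytes && (ab = next_evictable(list)) != NULL)
 *		freed += evict_block(ab);
 *
 * The in-tree eviction logic lives in arc_evict(), further down this file.
 */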
72
73 /*
74 * The locking model:
75 *
76 * A new reference to a cache buffer can be obtained in two
77 * ways: 1) via a hash table lookup using the DVA as a key,
78 * or 2) via one of the ARC lists. The arc_read() interface
79 * uses method 1, while the internal arc algorithms for
80 * adjusting the cache use method 2. We therefore provide two
81 * types of locks: 1) the hash table lock array, and 2) the
82 * arc list locks.
83 *
84 * Buffers do not have their own mutexes, rather they rely on the
85 * hash table mutexes for the bulk of their protection (i.e. most
86 * fields in the arc_buf_hdr_t are protected by these mutexes).
87 *
88 * buf_hash_find() returns the appropriate mutex (held) when it
89 * locates the requested buffer in the hash table. It returns
90 * NULL for the mutex if the buffer was not in the table.
91 *
92 * buf_hash_remove() expects the appropriate hash mutex to be
93 * already held before it is invoked.
94 *
95 * Each arc state also has a mutex which is used to protect the
96 * buffer list associated with the state. When attempting to
97 * obtain a hash table lock while holding an arc list lock you
98 * must use mutex_tryenter() to avoid deadlock. Also note that
99 * the active state mutex must be held before the ghost state mutex.
100 *
101 * Arc buffers may have an associated eviction callback function.
102 * This function will be invoked prior to removing the buffer (e.g.
103 * in arc_do_user_evicts()). Note however that the data associated
104 * with the buffer may be evicted prior to the callback. The callback
105 * must be made with *no locks held* (to prevent deadlock). Additionally,
106 * the users of callbacks must ensure that their private data is
107 * protected from simultaneous callbacks from arc_buf_evict()
108 * and arc_do_user_evicts().
109 *
110 * Note that the majority of the performance stats are manipulated
111 * with atomic operations.
112 *
113 * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
114 *
115 * - L2ARC buflist creation
116 * - L2ARC buflist eviction
117 * - L2ARC write completion, which walks L2ARC buflists
118 * - ARC header destruction, as it removes from L2ARC buflists
119 * - ARC header release, as it removes from L2ARC buflists
120 */
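/*
 * Illustrative sketch (not part of the build) of the lock ordering rule
 * above: while holding an arc list (state) lock, a hash lock may only be
 * taken with mutex_tryenter(), and the buffer is skipped on failure,
 * which is roughly what arc_evict() does:
 *
 *	mutex_enter(&state->arcs_mtx);
 *	hash_lock = HDR_LOCK(ab);
 *	if (mutex_tryenter(hash_lock)) {
 *		(evict or move the buffer)
 *		mutex_exit(hash_lock);
 *	} else {
 *		ARCSTAT_BUMP(arcstat_mutex_miss);
 *	}
 *	mutex_exit(&state->arcs_mtx);
 */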
121
122 #include <sys/spa.h>
123 #include <sys/zio.h>
124 #include <sys/zio_compress.h>
125 #include <sys/zfs_context.h>
126 #include <sys/arc.h>
127 #include <sys/refcount.h>
128 #include <sys/vdev.h>
129 #include <sys/vdev_impl.h>
130 #ifdef _KERNEL
131 #include <sys/vmsystm.h>
132 #include <vm/anon.h>
133 #include <sys/fs/swapnode.h>
134 #include <sys/dnlc.h>
135 #endif
136 #include <sys/callb.h>
137 #include <sys/kstat.h>
138 #include <zfs_fletcher.h>
139 #include <sys/byteorder.h>
140
141 #ifndef _KERNEL
142 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
143 boolean_t arc_watch = B_FALSE;
144 int arc_procfd;
145 #endif
146
147 static kmutex_t arc_reclaim_thr_lock;
148 static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
149 static uint8_t arc_thread_exit;
150
151 extern int zfs_write_limit_shift;
152 extern uint64_t zfs_write_limit_max;
153 extern kmutex_t zfs_write_limit_lock;
154
155 #define ARC_REDUCE_DNLC_PERCENT 3
156 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
157
158 typedef enum arc_reclaim_strategy {
159 ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
160 ARC_RECLAIM_CONS /* Conservative reclaim strategy */
161 } arc_reclaim_strategy_t;
162
163 /* number of seconds before growing cache again */
164 static int arc_grow_retry = 60;
165
166 /* shift of arc_c for calculating both min and max arc_p */
167 static int arc_p_min_shift = 4;
168
169 /* log2(fraction of arc to reclaim) */
170 static int arc_shrink_shift = 5;
171
172 /*
173 * minimum lifespan of a prefetch block in clock ticks
174 * (initialized in arc_init())
175 */
176 static int arc_min_prefetch_lifespan;
177
178 static int arc_dead;
179
180 /*
181 * The arc has filled available memory and has now warmed up.
182 */
183 static boolean_t arc_warm;
184
185 /*
186 * These tunables are for performance analysis.
187 */
188 uint64_t zfs_arc_max;
189 uint64_t zfs_arc_min;
190 uint64_t zfs_arc_meta_limit = 0;
191 int zfs_arc_grow_retry = 0;
192 int zfs_arc_shrink_shift = 0;
193 int zfs_arc_p_min_shift = 0;
194 int zfs_disable_dup_eviction = 0;
195
196 /*
197 * Note that buffers can be in one of 6 states:
198 * ARC_anon - anonymous (discussed below)
199 * ARC_mru - recently used, currently cached
200 * ARC_mru_ghost - recently used, no longer in cache
201 * ARC_mfu - frequently used, currently cached
202 * ARC_mfu_ghost - frequently used, no longer in cache
203 * ARC_l2c_only - exists in L2ARC but not other states
204 * When there are no active references to the buffer, they are
205 * linked onto a list in one of these arc states. These are
206 * the only buffers that can be evicted or deleted. Within each
207 * state there are multiple lists, one for meta-data and one for
208 * non-meta-data. Meta-data (indirect blocks, blocks of dnodes,
209 * etc.) is tracked separately so that it can be managed more
210 * explicitly: favored over data, limited explicitly.
211 *
212 * Anonymous buffers are buffers that are not associated with
213 * a DVA. These are buffers that hold dirty block copies
214 * before they are written to stable storage. By definition,
215 * they are "ref'd" and are considered part of arc_mru
216 * that cannot be freed. Generally, they will acquire a DVA
217 * as they are written and migrate onto the arc_mru list.
218 *
219 * The ARC_l2c_only state is for buffers that are in the second
220 * level ARC but no longer in any of the ARC_m* lists. The second
221 * level ARC itself may also contain buffers that are in any of
222 * the ARC_m* states - meaning that a buffer can exist in two
223 * places. The reason for the ARC_l2c_only state is to keep the
224 * buffer header in the hash table, so that reads that hit the
225 * second level ARC benefit from these fast lookups.
226 */
227
228 typedef struct arc_state {
229 list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */
230 uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
231 uint64_t arcs_size; /* total amount of data in this state */
232 kmutex_t arcs_mtx;
233 } arc_state_t;
234
235 /* The 6 states: */
236 static arc_state_t ARC_anon;
237 static arc_state_t ARC_mru;
238 static arc_state_t ARC_mru_ghost;
239 static arc_state_t ARC_mfu;
240 static arc_state_t ARC_mfu_ghost;
241 static arc_state_t ARC_l2c_only;
242
243 typedef struct arc_stats {
244 kstat_named_t arcstat_hits;
245 kstat_named_t arcstat_misses;
246 kstat_named_t arcstat_demand_data_hits;
247 kstat_named_t arcstat_demand_data_misses;
248 kstat_named_t arcstat_demand_metadata_hits;
249 kstat_named_t arcstat_demand_metadata_misses;
250 kstat_named_t arcstat_prefetch_data_hits;
251 kstat_named_t arcstat_prefetch_data_misses;
252 kstat_named_t arcstat_prefetch_metadata_hits;
253 kstat_named_t arcstat_prefetch_metadata_misses;
254 kstat_named_t arcstat_mru_hits;
255 kstat_named_t arcstat_mru_ghost_hits;
256 kstat_named_t arcstat_mfu_hits;
257 kstat_named_t arcstat_mfu_ghost_hits;
258 kstat_named_t arcstat_deleted;
259 kstat_named_t arcstat_recycle_miss;
260 /*
261 * Number of buffers that could not be evicted because the hash lock
262 * was held by another thread. The lock may not necessarily be held
263 * by something using the same buffer, since hash locks are shared
264 * by multiple buffers.
265 */
266 kstat_named_t arcstat_mutex_miss;
267 /*
268 * Number of buffers skipped because they have I/O in progress, are
269 * indirect prefetch buffers that have not lived long enough, or are
270 * not from the spa we're trying to evict from.
271 */
272 kstat_named_t arcstat_evict_skip;
273 kstat_named_t arcstat_evict_l2_cached;
274 kstat_named_t arcstat_evict_l2_eligible;
275 kstat_named_t arcstat_evict_l2_ineligible;
276 kstat_named_t arcstat_hash_elements;
277 kstat_named_t arcstat_hash_elements_max;
278 kstat_named_t arcstat_hash_collisions;
279 kstat_named_t arcstat_hash_chains;
280 kstat_named_t arcstat_hash_chain_max;
281 kstat_named_t arcstat_p;
282 kstat_named_t arcstat_c;
283 kstat_named_t arcstat_c_min;
284 kstat_named_t arcstat_c_max;
285 kstat_named_t arcstat_size;
286 kstat_named_t arcstat_hdr_size;
287 kstat_named_t arcstat_data_size;
288 kstat_named_t arcstat_other_size;
289 kstat_named_t arcstat_l2_hits;
290 kstat_named_t arcstat_l2_misses;
291 kstat_named_t arcstat_l2_feeds;
292 kstat_named_t arcstat_l2_rw_clash;
293 kstat_named_t arcstat_l2_read_bytes;
294 kstat_named_t arcstat_l2_write_bytes;
295 kstat_named_t arcstat_l2_writes_sent;
296 kstat_named_t arcstat_l2_writes_done;
297 kstat_named_t arcstat_l2_writes_error;
298 kstat_named_t arcstat_l2_writes_hdr_miss;
299 kstat_named_t arcstat_l2_evict_lock_retry;
300 kstat_named_t arcstat_l2_evict_reading;
301 kstat_named_t arcstat_l2_free_on_write;
302 kstat_named_t arcstat_l2_abort_lowmem;
303 kstat_named_t arcstat_l2_cksum_bad;
304 kstat_named_t arcstat_l2_io_error;
305 kstat_named_t arcstat_l2_size;
306 kstat_named_t arcstat_l2_asize;
307 kstat_named_t arcstat_l2_hdr_size;
308 kstat_named_t arcstat_l2_compress_successes;
309 kstat_named_t arcstat_l2_compress_zeros;
310 kstat_named_t arcstat_l2_compress_failures;
311 kstat_named_t arcstat_l2_meta_writes;
312 kstat_named_t arcstat_l2_meta_avg_size;
313 kstat_named_t arcstat_l2_meta_avg_asize;
314 kstat_named_t arcstat_l2_asize_to_meta_ratio;
315 kstat_named_t arcstat_l2_rebuild_attempts;
316 kstat_named_t arcstat_l2_rebuild_successes;
317 kstat_named_t arcstat_l2_rebuild_unsupported;
318 kstat_named_t arcstat_l2_rebuild_timeout;
319 kstat_named_t arcstat_l2_rebuild_arc_bytes;
320 kstat_named_t arcstat_l2_rebuild_l2arc_bytes;
321 kstat_named_t arcstat_l2_rebuild_bufs;
322 kstat_named_t arcstat_l2_rebuild_bufs_precached;
323 kstat_named_t arcstat_l2_rebuild_metabufs;
324 kstat_named_t arcstat_l2_rebuild_uberblk_errors;
325 kstat_named_t arcstat_l2_rebuild_io_errors;
326 kstat_named_t arcstat_l2_rebuild_cksum_errors;
327 kstat_named_t arcstat_l2_rebuild_loop_errors;
328 kstat_named_t arcstat_l2_rebuild_abort_lowmem;
329 kstat_named_t arcstat_memory_throttle_count;
330 kstat_named_t arcstat_duplicate_buffers;
331 kstat_named_t arcstat_duplicate_buffers_size;
332 kstat_named_t arcstat_duplicate_reads;
333 kstat_named_t arcstat_meta_used;
334 kstat_named_t arcstat_meta_limit;
335 kstat_named_t arcstat_meta_max;
336 } arc_stats_t;
337
338 static arc_stats_t arc_stats = {
339 { "hits", KSTAT_DATA_UINT64 },
340 { "misses", KSTAT_DATA_UINT64 },
341 { "demand_data_hits", KSTAT_DATA_UINT64 },
342 { "demand_data_misses", KSTAT_DATA_UINT64 },
343 { "demand_metadata_hits", KSTAT_DATA_UINT64 },
344 { "demand_metadata_misses", KSTAT_DATA_UINT64 },
345 { "prefetch_data_hits", KSTAT_DATA_UINT64 },
346 { "prefetch_data_misses", KSTAT_DATA_UINT64 },
347 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
348 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
349 { "mru_hits", KSTAT_DATA_UINT64 },
350 { "mru_ghost_hits", KSTAT_DATA_UINT64 },
351 { "mfu_hits", KSTAT_DATA_UINT64 },
352 { "mfu_ghost_hits", KSTAT_DATA_UINT64 },
353 { "deleted", KSTAT_DATA_UINT64 },
354 { "recycle_miss", KSTAT_DATA_UINT64 },
355 { "mutex_miss", KSTAT_DATA_UINT64 },
356 { "evict_skip", KSTAT_DATA_UINT64 },
357 { "evict_l2_cached", KSTAT_DATA_UINT64 },
358 { "evict_l2_eligible", KSTAT_DATA_UINT64 },
359 { "evict_l2_ineligible", KSTAT_DATA_UINT64 },
360 { "hash_elements", KSTAT_DATA_UINT64 },
361 { "hash_elements_max", KSTAT_DATA_UINT64 },
362 { "hash_collisions", KSTAT_DATA_UINT64 },
363 { "hash_chains", KSTAT_DATA_UINT64 },
364 { "hash_chain_max", KSTAT_DATA_UINT64 },
365 { "p", KSTAT_DATA_UINT64 },
366 { "c", KSTAT_DATA_UINT64 },
367 { "c_min", KSTAT_DATA_UINT64 },
368 { "c_max", KSTAT_DATA_UINT64 },
369 { "size", KSTAT_DATA_UINT64 },
370 { "hdr_size", KSTAT_DATA_UINT64 },
371 { "data_size", KSTAT_DATA_UINT64 },
372 { "other_size", KSTAT_DATA_UINT64 },
373 { "l2_hits", KSTAT_DATA_UINT64 },
374 { "l2_misses", KSTAT_DATA_UINT64 },
375 { "l2_feeds", KSTAT_DATA_UINT64 },
376 { "l2_rw_clash", KSTAT_DATA_UINT64 },
377 { "l2_read_bytes", KSTAT_DATA_UINT64 },
378 { "l2_write_bytes", KSTAT_DATA_UINT64 },
379 { "l2_writes_sent", KSTAT_DATA_UINT64 },
380 { "l2_writes_done", KSTAT_DATA_UINT64 },
381 { "l2_writes_error", KSTAT_DATA_UINT64 },
382 { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 },
383 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
384 { "l2_evict_reading", KSTAT_DATA_UINT64 },
385 { "l2_free_on_write", KSTAT_DATA_UINT64 },
386 { "l2_abort_lowmem", KSTAT_DATA_UINT64 },
387 { "l2_cksum_bad", KSTAT_DATA_UINT64 },
388 { "l2_io_error", KSTAT_DATA_UINT64 },
389 { "l2_size", KSTAT_DATA_UINT64 },
390 { "l2_asize", KSTAT_DATA_UINT64 },
391 { "l2_hdr_size", KSTAT_DATA_UINT64 },
392 { "l2_compress_successes", KSTAT_DATA_UINT64 },
393 { "l2_compress_zeros", KSTAT_DATA_UINT64 },
394 { "l2_compress_failures", KSTAT_DATA_UINT64 },
395 { "l2_meta_writes", KSTAT_DATA_UINT64 },
396 { "l2_meta_avg_size", KSTAT_DATA_UINT64 },
397 { "l2_meta_avg_asize", KSTAT_DATA_UINT64 },
398 { "l2_asize_to_meta_ratio", KSTAT_DATA_UINT64 },
399 { "l2_rebuild_attempts", KSTAT_DATA_UINT64 },
400 { "l2_rebuild_successes", KSTAT_DATA_UINT64 },
401 { "l2_rebuild_unsupported", KSTAT_DATA_UINT64 },
402 { "l2_rebuild_timeout", KSTAT_DATA_UINT64 },
403 { "l2_rebuild_arc_bytes", KSTAT_DATA_UINT64 },
404 { "l2_rebuild_l2arc_bytes", KSTAT_DATA_UINT64 },
405 { "l2_rebuild_bufs", KSTAT_DATA_UINT64 },
406 { "l2_rebuild_precached", KSTAT_DATA_UINT64 },
407 { "l2_rebuild_metabufs", KSTAT_DATA_UINT64 },
408 { "l2_rebuild_uberblk_errors", KSTAT_DATA_UINT64 },
409 { "l2_rebuild_io_errors", KSTAT_DATA_UINT64 },
410 { "l2_rebuild_cksum_errors", KSTAT_DATA_UINT64 },
411 { "l2_rebuild_loop_errors", KSTAT_DATA_UINT64 },
412 { "l2_rebuild_abort_lowmem", KSTAT_DATA_UINT64 },
413 { "memory_throttle_count", KSTAT_DATA_UINT64 },
414 { "duplicate_buffers", KSTAT_DATA_UINT64 },
415 { "duplicate_buffers_size", KSTAT_DATA_UINT64 },
416 { "duplicate_reads", KSTAT_DATA_UINT64 },
417 { "arc_meta_used", KSTAT_DATA_UINT64 },
418 { "arc_meta_limit", KSTAT_DATA_UINT64 },
419 { "arc_meta_max", KSTAT_DATA_UINT64 }
420 };
421
422 #define ARCSTAT(stat) (arc_stats.stat.value.ui64)
423
424 #define ARCSTAT_INCR(stat, val) \
425 atomic_add_64(&arc_stats.stat.value.ui64, (val))
426
427 #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
428 #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)
429
430 #define ARCSTAT_MAX(stat, val) { \
431 uint64_t m; \
432 while ((val) > (m = arc_stats.stat.value.ui64) && \
433 (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
434 continue; \
435 }
436
437 #define ARCSTAT_MAXSTAT(stat) \
438 ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
439
440 /*
441 * We define a macro to allow ARC hits/misses to be easily broken down by
442 * two separate conditions, giving a total of four different subtypes for
443 * each of hits and misses (so eight statistics total).
444 */
445 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
446 if (cond1) { \
447 if (cond2) { \
448 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
449 } else { \
450 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
451 } \
452 } else { \
453 if (cond2) { \
454 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
455 } else { \
456 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
457 } \
458 }
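/*
 * For example, the read hit path further down this file classifies each hit
 * by demand vs. prefetch and data vs. metadata, bumping exactly one of the
 * four arcstat_*_hits counters:
 *
 *	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
 *	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
 *	    data, metadata, hits);
 */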
459
460 /*
461 * This macro allows us to use kstats as floating averages. Each time we
462 * update this kstat, the old value keeps a (1 - 1/ARCSTAT_F_AVG_FACTOR)
463 * weight and the new sample contributes only 1/ARCSTAT_F_AVG_FACTOR to the overall
464 * average. This macro assumes that integer loads and stores are atomic, but
465 * is not safe for multiple writers updating the kstat in parallel (only the
466 * last writer's update will remain).
467 */
468 #define ARCSTAT_F_AVG_FACTOR 3
469 #define ARCSTAT_F_AVG(stat, value) \
470 do { \
471 uint64_t x = ARCSTAT(stat); \
472 x = x - x / ARCSTAT_F_AVG_FACTOR + \
473 (value) / ARCSTAT_F_AVG_FACTOR; \
474 ARCSTAT(stat) = x; \
475 _NOTE(NOTREACHED) \
476 _NOTE(CONSTCOND) \
477 } while (0)
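/*
 * Worked example: with ARCSTAT_F_AVG_FACTOR == 3, a stored average of 900
 * updated with a new sample of 300 becomes 900 - 900/3 + 300/3 = 700, i.e.
 * each sample only moves the average a third of the way toward it.
 */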
478
479 kstat_t *arc_ksp;
480 static arc_state_t *arc_anon;
481 static arc_state_t *arc_mru;
482 static arc_state_t *arc_mru_ghost;
483 static arc_state_t *arc_mfu;
484 static arc_state_t *arc_mfu_ghost;
485 static arc_state_t *arc_l2c_only;
486
487 /*
488 * There are several ARC variables that are critical to export as kstats --
489 * but we don't want to have to grovel around in the kstat whenever we wish to
490 * manipulate them. For these variables, we therefore define them to be in
491 * terms of the statistic variable. This assures that we are not introducing
492 * the possibility of inconsistency by having shadow copies of the variables,
493 * while still allowing the code to be readable.
494 */
495 #define arc_size ARCSTAT(arcstat_size) /* actual total arc size */
496 #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
497 #define arc_c ARCSTAT(arcstat_c) /* target size of cache */
498 #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
499 #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
500 #define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */
501 #define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */
502 #define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */
503
504 #define L2ARC_IS_VALID_COMPRESS(_c_) \
505 ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
506
507 static int arc_no_grow; /* Don't try to grow cache size */
508 static uint64_t arc_tempreserve;
509 static uint64_t arc_loaned_bytes;
510
511 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
512
513 typedef struct arc_callback arc_callback_t;
514
515 struct arc_callback {
516 void *acb_private;
517 arc_done_func_t *acb_done;
518 arc_buf_t *acb_buf;
519 zio_t *acb_zio_dummy;
520 arc_callback_t *acb_next;
521 };
522
523 typedef struct arc_write_callback arc_write_callback_t;
524
525 struct arc_write_callback {
526 void *awcb_private;
527 arc_done_func_t *awcb_ready;
528 arc_done_func_t *awcb_done;
529 arc_buf_t *awcb_buf;
530 };
531
532 struct arc_buf_hdr {
533 /* protected by hash lock */
534 dva_t b_dva;
535 uint64_t b_birth;
536 uint64_t b_cksum0;
537
538 kmutex_t b_freeze_lock;
539 zio_cksum_t *b_freeze_cksum;
540 void *b_thawed;
541
542 arc_buf_hdr_t *b_hash_next;
543 arc_buf_t *b_buf;
544 uint32_t b_flags;
545 uint32_t b_datacnt;
546
547 arc_callback_t *b_acb;
548 kcondvar_t b_cv;
549
550 /* immutable */
551 arc_buf_contents_t b_type;
552 uint64_t b_size;
553 uint64_t b_spa;
554
555 /* protected by arc state mutex */
556 arc_state_t *b_state;
557 list_node_t b_arc_node;
558
559 /* updated atomically */
560 clock_t b_arc_access;
561
562 /* self protecting */
563 refcount_t b_refcnt;
564
565 l2arc_buf_hdr_t *b_l2hdr;
566 list_node_t b_l2node;
567 };
568
569 static arc_buf_t *arc_eviction_list;
570 static kmutex_t arc_eviction_mtx;
571 static arc_buf_hdr_t arc_eviction_hdr;
572 static void arc_get_data_buf(arc_buf_t *buf);
573 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
574 static int arc_evict_needed(arc_buf_contents_t type);
575 static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
576 static void arc_buf_watch(arc_buf_t *buf);
577
578 static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
579
580 #define GHOST_STATE(state) \
581 ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
582 (state) == arc_l2c_only)
583
584 /*
585 * Private ARC flags. These flags are private ARC only flags that will show up
586 * in b_flags in the arc_buf_hdr_t. Some flags are publicly declared, and can
587 * be passed in as arc_flags in things like arc_read. However, these flags
588 * should never be passed and should only be set by ARC code. When adding new
589 * public flags, make sure not to smash the private ones.
590 */
591
592 #define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */
593 #define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */
594 #define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */
595 #define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */
596 #define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */
597 #define ARC_INDIRECT (1 << 14) /* this is an indirect block */
598 #define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */
599 #define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */
600 #define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */
601 #define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */
602
603 #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE)
604 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
605 #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR)
606 #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH)
607 #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ)
608 #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE)
609 #define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
610 #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE)
611 #define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \
612 (hdr)->b_l2hdr != NULL)
613 #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING)
614 #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED)
615 #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD)
616
617 /*
618 * Other sizes
619 */
620
621 #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
622 #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
623
624 /*
625 * Hash table routines
626 */
627
628 #define HT_LOCK_PAD 64
629
630 struct ht_lock {
631 kmutex_t ht_lock;
632 #ifdef _KERNEL
633 unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
634 #endif
635 };
636
637 #define BUF_LOCKS 256
638 typedef struct buf_hash_table {
639 uint64_t ht_mask;
640 arc_buf_hdr_t **ht_table;
641 struct ht_lock ht_locks[BUF_LOCKS];
642 } buf_hash_table_t;
643
644 static buf_hash_table_t buf_hash_table;
645
646 #define BUF_HASH_INDEX(spa, dva, birth) \
647 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
648 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
649 #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
650 #define HDR_LOCK(hdr) \
651 (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
652
653 uint64_t zfs_crc64_table[256];
654
655 /*
656 * Level 2 ARC
657 */
658
659 #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */
660 #define L2ARC_HEADROOM 2 /* num of writes */
661 /*
662 * If we discover during ARC scan any buffers to be compressed, we boost
663 * our headroom for the next scanning cycle by this percentage multiple.
664 */
665 #define L2ARC_HEADROOM_BOOST 200
666 #define L2ARC_FEED_SECS 1 /* caching interval secs */
667 #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */
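/*
 * Example: with l2arc_write_max at its 8MB default, each feed cycle scans
 * roughly L2ARC_HEADROOM * 8MB = 16MB ahead of the eviction point; if
 * compressible buffers were seen, the next cycle's headroom is boosted to
 * 16MB * L2ARC_HEADROOM_BOOST / 100 = 32MB.
 */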
668
669 #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
670 #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
671
672 /* L2ARC Performance Tunables */
673 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */
674 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
675 uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */
676 uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
677 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
678 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
679 boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
680 boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */
681 boolean_t l2arc_norw = B_TRUE; /* no reads during writes */
682
683 /*
684 * L2ARC Internals
685 */
686 typedef struct l2arc_dev l2arc_dev_t;
687 static list_t L2ARC_dev_list; /* device list */
688 static list_t *l2arc_dev_list; /* device list pointer */
689 static kmutex_t l2arc_dev_mtx; /* device list mutex */
690 static l2arc_dev_t *l2arc_dev_last; /* last device used */
691 static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */
692 static list_t L2ARC_free_on_write; /* free after write buf list */
693 static list_t *l2arc_free_on_write; /* free after write list ptr */
694 static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
695 static uint64_t l2arc_ndev; /* number of devices */
696
697 typedef struct l2arc_read_callback {
698 arc_buf_t *l2rcb_buf; /* read buffer */
699 spa_t *l2rcb_spa; /* spa */
700 blkptr_t l2rcb_bp; /* original blkptr */
701 zbookmark_t l2rcb_zb; /* original bookmark */
702 int l2rcb_flags; /* original flags */
703 enum zio_compress l2rcb_compress; /* applied compress */
704 } l2arc_read_callback_t;
705
706 typedef struct l2arc_write_callback {
707 l2arc_dev_t *l2wcb_dev; /* device info */
708 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
709 uint8_t *l2wcb_pbuf; /* pbuf sent in this write */
710 uint32_t l2wcb_pbuf_size; /* size of committed pbuf */
711 uint8_t *l2wcb_ub_buf; /* uberblock in this write */
712 } l2arc_write_callback_t;
713
714 struct l2arc_buf_hdr {
715 /* protected by arc_buf_hdr mutex */
716 l2arc_dev_t *b_dev; /* L2ARC device */
717 uint64_t b_daddr; /* disk address, offset byte */
718 /* compression applied to buffer data */
719 enum zio_compress b_compress;
720 /* real alloc'd buffer size depending on b_compress applied */
721 int b_asize;
722 /* temporary buffer holder for in-flight compressed data */
723 void *b_tmp_cdata;
724 };
725
726 typedef struct l2arc_data_free {
727 /* protected by l2arc_free_on_write_mtx */
728 void *l2df_data;
729 size_t l2df_size;
730 void (*l2df_func)(void *, size_t);
731 list_node_t l2df_list_node;
732 } l2arc_data_free_t;
733
734 static kmutex_t l2arc_feed_thr_lock;
735 static kcondvar_t l2arc_feed_thr_cv;
736 static uint8_t l2arc_thread_exit;
737
738 static void l2arc_read_done(zio_t *zio);
739 static void l2arc_hdr_stat_add(boolean_t from_arc);
740 static void l2arc_hdr_stat_remove(void);
741
742 static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
743 static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
744 enum zio_compress c);
745 static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
746
747 typedef enum {
748 L2UBLK_BIG_ENDIAN = (1 << 0), /* little endian assumed otherwise */
749 L2UBLK_EVICT_FIRST = (1 << 1) /* mirror of l2ad_first in l2dev */
750 } l2uberblock_flags_t;
751
752 typedef struct l2uberblock {
753 uint32_t ub_magic;
754 uint8_t ub_version;
755 l2uberblock_flags_t ub_flags;
756
757 uint64_t ub_spa_guid;
758 uint64_t ub_birth;
759 uint64_t ub_evict_tail; /* current evict pointer */
760 uint64_t ub_alloc_space; /* vdev space alloc status */
761 uint64_t ub_pbuf_daddr; /* address of newest pbuf */
762 uint32_t ub_pbuf_asize; /* size of newest pbuf */
763 zio_cksum_t ub_pbuf_cksum; /* fletcher4 of newest pbuf */
764
765 zio_cksum_t ub_cksum; /* cksum of uberblock */
766 } l2uberblock_t;
767
768 typedef enum {
769 L2PBUF_BIG_ENDIAN = (1 << 0), /* little endian assumed otherwise */
770 L2PBUF_COMPRESSED = (1 << 1) /* pbuf data items are compressed */
771 } l2pbuf_flags_t;
772
773 typedef struct l2pbuf {
774 uint32_t pb_magic;
775 unsigned int pb_version;
776 l2pbuf_flags_t pb_flags;
777
778 uint64_t pb_prev_daddr; /* address of previous pbuf */
779 uint32_t pb_prev_asize; /* size of previous pbuf */
780 zio_cksum_t pb_prev_cksum; /* fletcher4 of prev. pbuf */
781
782 /*
783 * This is a set of item lists that are contained in this pbuf. Each
784 * L2ARC write appends a new l2pbuf_buflist_t array of l2pbuf_buf_t's.
785 * This serves as a soft timeout feature - once the limit of the
786 * number of item lists that a pbuf can hold is reached, the pbuf is
787 * flushed to stable storage, regardless of its total size.
788 */
789 list_t *pb_buflists_list;
790
791 /*
792 * Number of compressed bytes referenced by items in this pbuf and
793 * the number of lists present.
794 * This is not actually written to storage, it is only used by
795 * internal algorithms which check for when a pbuf reaches a
796 * certain size limit, after which it is flushed in a write.
797 */
798 uint64_t pb_payload_asz;
799 /* Same thing for number of buflists */
800 int pb_nbuflists;
801
802 /*
803 * Filled in by l2arc_pbuf_read to hold this pbuf's alloc'd size.
804 * This is then used by l2arc_pbuf_restore to update used space
805 * on the L2ARC vdev.
806 */
807 size_t pb_asize;
808 } l2pbuf_t;
809
810 typedef struct l2pbuf_buf l2pbuf_buf_t;
811 typedef struct l2pbuf_buflist {
812 uint32_t l2pbl_nbufs;
813 l2pbuf_buf_t *l2pbl_bufs;
814 list_node_t l2pbl_node;
815 } l2pbuf_buflist_t;
816
817 struct l2pbuf_buf {
818 dva_t b_dva; /* dva of buffer */
819 uint64_t b_birth; /* birth txg of buffer */
820 uint64_t b_cksum0;
821 zio_cksum_t b_freeze_cksum;
822 uint32_t b_size; /* uncompressed buf size */
823 uint64_t b_l2daddr; /* buf location on l2dev */
824 uint32_t b_l2asize; /* actual buf data size */
825 enum zio_compress b_l2compress; /* compression applied */
826 uint16_t b_contents_type;
827 uint32_t b_flags;
828 };
829
830 struct l2arc_dev {
831 vdev_t *l2ad_vdev; /* vdev */
832 spa_t *l2ad_spa; /* spa */
833 uint64_t l2ad_hand; /* next write location */
834 uint64_t l2ad_start; /* first addr on device */
835 uint64_t l2ad_end; /* last addr on device */
836 uint64_t l2ad_evict; /* last addr eviction reached */
837 boolean_t l2ad_first; /* first sweep through */
838 boolean_t l2ad_writing; /* currently writing */
839 list_t *l2ad_buflist; /* buffer list */
840 list_node_t l2ad_node; /* device list node */
841 l2pbuf_t l2ad_pbuf; /* currently open pbuf */
842 uint64_t l2ad_pbuf_daddr; /* prev pbuf daddr */
843 uint64_t l2ad_pbuf_asize; /* prev pbuf asize */
844 zio_cksum_t l2ad_pbuf_cksum; /* prev pbuf cksum */
845 /* uberblock birth counter - incremented for each committed uberblk */
846 uint64_t l2ad_uberblock_birth;
847 /* flag indicating whether a rebuild is currently going on */
848 boolean_t l2ad_rebuilding;
849 };
850
851 /* Stores information about an L2ARC prefetch zio */
852 typedef struct l2arc_prefetch_info {
853 uint8_t *pi_buf; /* where the zio writes to */
854 uint64_t pi_buflen; /* length of pi_buf */
855 zio_t *pi_hdr_io; /* see l2arc_pbuf_read below */
856 } l2arc_prefetch_info_t;
857
858 /* 256 x 4k of l2uberblocks */
859 #define L2UBERBLOCK_SIZE 4096
860 #define L2UBERBLOCK_MAGIC 0x12bab10c
861 #define L2UBERBLOCK_MAX_VERSION 1 /* our maximum uberblock version */
862 #define L2PBUF_MAGIC 0xdb0faba6
863 #define L2PBUF_MAX_VERSION 1 /* our maximum pbuf version */
864 #define L2PBUF_BUF_SIZE 88 /* size of one pbuf buf entry */
865 #define L2PBUF_HDR_SIZE 56 /* pbuf header excluding any payload */
866 #define L2PBUF_ENCODED_SIZE(_pb) \
867 (L2PBUF_HDR_SIZE + l2arc_pbuf_items_encoded_size(_pb))
868 /*
869 * Allocation limit for the payload of a pbuf. This also fundamentally
870 * limits the number of bufs we can reference in a pbuf.
871 */
872 #define L2PBUF_MAX_PAYLOAD_SIZE (24 * 1024 * 1024)
873 #define L2PBUF_MAX_BUFS (L2PBUF_MAX_PAYLOAD_SIZE / L2PBUF_BUF_SIZE)
874 #define L2PBUF_COMPRESS_MINSZ 8192 /* minimum size to compress a pbuf */
875 #define L2PBUF_MAXSZ (100 * 1024 * 1024) /* maximum pbuf size */
876 #define L2PBUF_MAX_BUFLISTS 128 /* max number of buflists per pbuf */
877 #define L2ARC_REBUILD_TIMEOUT 60 /* a rebuild may take at most 60s */
878 #define L2PBUF_IS_FULL(_pb) \
879 ((_pb)->pb_payload_asz > l2arc_pbuf_max_sz || \
880 (_pb)->pb_nbuflists + 1 >= l2arc_pbuf_max_buflists)
881 /*
882 * These are the flags we allow to persist in L2ARC pbufs. The other flags
883 * of an ARC buffer pertain to the buffer's runtime behavior.
884 */
885 #define L2ARC_PERSIST_FLAGS \
886 (ARC_IN_HASH_TABLE | ARC_L2CACHE | ARC_L2COMPRESS | ARC_PREFETCH)
887
888 /*
889 * Used during L2ARC rebuild after each read operation to check whether we
890 * haven't exceeded the rebuild timeout value.
891 */
892 #define L2ARC_CHK_REBUILD_TIMEOUT(_deadline_, ...) \
893 do { \
894 if ((_deadline_) != 0 && (_deadline_) < ddi_get_lbolt64()) { \
895 __VA_ARGS__; \
896 ARCSTAT_BUMP(arcstat_l2_rebuild_timeout); \
897 cmn_err(CE_WARN, "L2ARC rebuild is taking too long, " \
898 "dropping remaining L2ARC metadata."); \
899 return; \
900 } \
901 _NOTE(NOTREACHED) \
902 _NOTE(CONSTCOND) \
903 } while (0)
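/*
 * Typical use (illustrative only; the cleanup argument varies by call site):
 * the rebuild loop computes a deadline once and re-checks it after every
 * pbuf read, releasing whatever it holds before the macro returns:
 *
 *	int64_t deadline = ddi_get_lbolt64() + hz * l2arc_rebuild_timeout;
 *	...
 *	L2ARC_CHK_REBUILD_TIMEOUT(deadline, l2arc_pbuf_destroy(&pb));
 */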
904
905 /*
906 * Performance tuning of L2ARC persistency:
907 *
908 * l2arc_pbuf_compress_minsz : Minimum size of a pbuf in order to attempt
909 * compressing it.
910 * l2arc_pbuf_max_sz : Upper bound on the physical size of L2ARC buffers
911 * referenced from a pbuf. Once a pbuf reaches this size, it is
912 * committed to stable storage. Ideally, there should be approx.
913 * l2arc_dev_size / l2arc_pbuf_max_sz pbufs on an L2ARC device.
914 * l2arc_pbuf_max_buflists : Maximum number of L2ARC feed cycles that will
915 * be buffered in a pbuf before it is committed to L2ARC. This
916 * puts a soft temporal upper bound on pbuf commit intervals.
917 * l2arc_rebuild_enabled : Controls whether L2ARC device adds (either at
918 * pool import or when adding one manually later) will attempt
919 * to rebuild L2ARC buffer contents. In special circumstances,
920 * the administrator may want to set this to B_FALSE, if they
921 * are having trouble importing a pool or attaching an L2ARC
922 * device (e.g. the L2ARC device is slow to read in stored pbuf
923 * metadata, or the metadata has become somehow
924 * fragmented/unusable).
925 * l2arc_rebuild_timeout : A hard timeout value on L2ARC rebuilding to help
926 * avoid a slow L2ARC device from preventing pool import. If we
927 * are not done rebuilding an L2ARC device by this time, we
928 * stop the rebuild and return immediately.
929 */
930 uint64_t l2arc_pbuf_compress_minsz = L2PBUF_COMPRESS_MINSZ;
931 uint64_t l2arc_pbuf_max_sz = L2PBUF_MAXSZ;
932 uint64_t l2arc_pbuf_max_buflists = L2PBUF_MAX_BUFLISTS;
933 boolean_t l2arc_rebuild_enabled = B_TRUE;
934 uint64_t l2arc_rebuild_timeout = L2ARC_REBUILD_TIMEOUT;
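/*
 * On illumos these globals can be tuned by adding lines such as the
 * following to /etc/system (example values only, not a recommendation):
 *
 *	set zfs:l2arc_rebuild_enabled = 0
 *	set zfs:l2arc_rebuild_timeout = 120
 */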
935
936 static void l2arc_rebuild_start(l2arc_dev_t *dev);
937 static void l2arc_rebuild(l2arc_dev_t *dev);
938 static void l2arc_pbuf_restore(l2arc_dev_t *dev, l2pbuf_t *pb);
939 static void l2arc_hdr_restore(const l2pbuf_buf_t *buf, l2arc_dev_t *dev,
940 uint64_t guid);
941
942 static int l2arc_uberblock_find(l2arc_dev_t *dev, l2uberblock_t *ub);
943 static int l2arc_pbuf_read(l2arc_dev_t *dev, uint64_t daddr, uint32_t asize,
944 zio_cksum_t cksum, l2pbuf_t *pb, zio_t *this_io, zio_t **next_io);
945 static int l2arc_pbuf_ptr_valid(l2arc_dev_t *dev, uint64_t daddr,
946 uint32_t asize);
947 static zio_t *l2arc_pbuf_prefetch(vdev_t *vd, uint64_t daddr, uint32_t asize);
948 static void l2arc_pbuf_prefetch_abort(zio_t *zio);
949
950 static void l2arc_uberblock_encode(const l2uberblock_t *ub, uint8_t *buf);
951 static void l2arc_uberblock_decode(const uint8_t *buf, l2uberblock_t *ub);
952 static int l2arc_uberblock_verify(const uint8_t *buf, const l2uberblock_t *ub,
953 uint64_t guid);
954 static void l2arc_uberblock_update(l2arc_dev_t *dev, zio_t *pio,
955 l2arc_write_callback_t *cb);
956
957 static uint32_t l2arc_pbuf_encode(l2pbuf_t *pb, uint8_t *buf, uint32_t buflen);
958 static int l2arc_pbuf_decode(uint8_t *buf, uint32_t buflen,
959 l2pbuf_t *pbuf);
960 static int l2arc_pbuf_decode_prev_ptr(const uint8_t *buf, size_t buflen,
961 uint64_t *daddr, uint32_t *asize, zio_cksum_t *cksum);
962 static void l2arc_pbuf_init(l2pbuf_t *pb);
963 static void l2arc_pbuf_destroy(l2pbuf_t *pb);
964 static void l2arc_pbuf_commit(l2arc_dev_t *dev, zio_t *pio,
965 l2arc_write_callback_t *cb);
966 static l2pbuf_buflist_t *l2arc_pbuf_buflist_alloc(l2pbuf_t *pb, int nbufs);
967 static void l2arc_pbuflist_insert(l2pbuf_t *pb, l2pbuf_buflist_t *pbl,
968 const arc_buf_hdr_t *ab, int index);
969 static uint32_t l2arc_pbuf_items_encoded_size(l2pbuf_t *pb);
970
971 static uint64_t
972 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
973 {
974 uint8_t *vdva = (uint8_t *)dva;
975 uint64_t crc = -1ULL;
976 int i;
977
978 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
979
980 for (i = 0; i < sizeof (dva_t); i++)
981 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
982
983 crc ^= (spa>>8) ^ birth;
984
985 return (crc);
986 }
987
988 #define BUF_EMPTY(buf) \
989 ((buf)->b_dva.dva_word[0] == 0 && \
990 (buf)->b_dva.dva_word[1] == 0 && \
991 (buf)->b_birth == 0)
992
993 #define BUF_EQUAL(spa, dva, birth, buf) \
994 ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
995 ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
996 ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
997
998 static void
999 buf_discard_identity(arc_buf_hdr_t *hdr)
1000 {
1001 hdr->b_dva.dva_word[0] = 0;
1002 hdr->b_dva.dva_word[1] = 0;
1003 hdr->b_birth = 0;
1004 hdr->b_cksum0 = 0;
1005 }
1006
1007 static arc_buf_hdr_t *
1008 buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
1009 {
1010 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
1011 kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
1012 arc_buf_hdr_t *buf;
1013
1014 mutex_enter(hash_lock);
1015 for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
1016 buf = buf->b_hash_next) {
1017 if (BUF_EQUAL(spa, dva, birth, buf)) {
1018 *lockp = hash_lock;
1019 return (buf);
1020 }
1021 }
1022 mutex_exit(hash_lock);
1023 *lockp = NULL;
1024 return (NULL);
1025 }
1026
1027 /*
1028 * Insert an entry into the hash table. If there is already an element
1029 * equal to elem in the hash table, then the already existing element
1030 * will be returned and the new element will not be inserted.
1031 * Otherwise returns NULL.
1032 */
1033 static arc_buf_hdr_t *
1034 buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
1035 {
1036 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
1037 kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
1038 arc_buf_hdr_t *fbuf;
1039 uint32_t i;
1040
1041 ASSERT(!HDR_IN_HASH_TABLE(buf));
1042 *lockp = hash_lock;
1043 mutex_enter(hash_lock);
1044 for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
1045 fbuf = fbuf->b_hash_next, i++) {
1046 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
1047 return (fbuf);
1048 }
1049
1050 buf->b_hash_next = buf_hash_table.ht_table[idx];
1051 buf_hash_table.ht_table[idx] = buf;
1052 buf->b_flags |= ARC_IN_HASH_TABLE;
1053
1054 /* collect some hash table performance data */
1055 if (i > 0) {
1056 ARCSTAT_BUMP(arcstat_hash_collisions);
1057 if (i == 1)
1058 ARCSTAT_BUMP(arcstat_hash_chains);
1059
1060 ARCSTAT_MAX(arcstat_hash_chain_max, i);
1061 }
1062
1063 ARCSTAT_BUMP(arcstat_hash_elements);
1064 ARCSTAT_MAXSTAT(arcstat_hash_elements);
1065
1066 return (NULL);
1067 }
1068
1069 static void
1070 buf_hash_remove(arc_buf_hdr_t *buf)
1071 {
1072 arc_buf_hdr_t *fbuf, **bufp;
1073 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
1074
1075 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
1076 ASSERT(HDR_IN_HASH_TABLE(buf));
1077
1078 bufp = &buf_hash_table.ht_table[idx];
1079 while ((fbuf = *bufp) != buf) {
1080 ASSERT(fbuf != NULL);
1081 bufp = &fbuf->b_hash_next;
1082 }
1083 *bufp = buf->b_hash_next;
1084 buf->b_hash_next = NULL;
1085 buf->b_flags &= ~ARC_IN_HASH_TABLE;
1086
1087 /* collect some hash table performance data */
1088 ARCSTAT_BUMPDOWN(arcstat_hash_elements);
1089
1090 if (buf_hash_table.ht_table[idx] &&
1091 buf_hash_table.ht_table[idx]->b_hash_next == NULL)
1092 ARCSTAT_BUMPDOWN(arcstat_hash_chains);
1093 }
1094
1095 /*
1096 * Global data structures and functions for the buf kmem cache.
1097 */
1098 static kmem_cache_t *hdr_cache;
1099 static kmem_cache_t *buf_cache;
1100
1101 static void
1102 buf_fini(void)
1103 {
1104 int i;
1105
1106 kmem_free(buf_hash_table.ht_table,
1107 (buf_hash_table.ht_mask + 1) * sizeof (void *));
1108 for (i = 0; i < BUF_LOCKS; i++)
1109 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
1110 kmem_cache_destroy(hdr_cache);
1111 kmem_cache_destroy(buf_cache);
1112 }
1113
1114 /*
1115 * Constructor callback - called when the cache is empty
1116 * and a new buf is requested.
1117 */
1118 /* ARGSUSED */
1119 static int
1120 hdr_cons(void *vbuf, void *unused, int kmflag)
1121 {
1122 arc_buf_hdr_t *buf = vbuf;
1123
1124 bzero(buf, sizeof (arc_buf_hdr_t));
1125 refcount_create(&buf->b_refcnt);
1126 cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
1127 mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
1128 arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
1129
1130 return (0);
1131 }
1132
1133 /* ARGSUSED */
1134 static int
1135 buf_cons(void *vbuf, void *unused, int kmflag)
1136 {
1137 arc_buf_t *buf = vbuf;
1138
1139 bzero(buf, sizeof (arc_buf_t));
1140 mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
1141 arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1142
1143 return (0);
1144 }
1145
1146 /*
1147 * Destructor callback - called when a cached buf is
1148 * no longer required.
1149 */
1150 /* ARGSUSED */
1151 static void
1152 hdr_dest(void *vbuf, void *unused)
1153 {
1154 arc_buf_hdr_t *buf = vbuf;
1155
1156 ASSERT(BUF_EMPTY(buf));
1157 refcount_destroy(&buf->b_refcnt);
1158 cv_destroy(&buf->b_cv);
1159 mutex_destroy(&buf->b_freeze_lock);
1160 arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
1161 }
1162
1163 /* ARGSUSED */
1164 static void
1165 buf_dest(void *vbuf, void *unused)
1166 {
1167 arc_buf_t *buf = vbuf;
1168
1169 mutex_destroy(&buf->b_evict_lock);
1170 arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1171 }
1172
1173 /*
1174 * Reclaim callback -- invoked when memory is low.
1175 */
1176 /* ARGSUSED */
1177 static void
1178 hdr_recl(void *unused)
1179 {
1180 dprintf("hdr_recl called\n");
1181 /*
1182 * umem calls the reclaim func when we destroy the buf cache,
1183 * which is after we do arc_fini().
1184 */
1185 if (!arc_dead)
1186 cv_signal(&arc_reclaim_thr_cv);
1187 }
1188
1189 static void
1190 buf_init(void)
1191 {
1192 uint64_t *ct;
1193 uint64_t hsize = 1ULL << 12;
1194 int i, j;
1195
1196 /*
1197 * The hash table is big enough to fill all of physical memory
1198 * with an average 64K block size. The table will take up
1199 * totalmem*sizeof(void*)/64K (e.g. 128KB/GB with 8-byte pointers).
1200 */
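	/*
	 * Worked example: on a 64GB machine with 8-byte pointers the loop
	 * below settles on 2^20 buckets (1M * 64K covers 64GB), so the
	 * table itself consumes about 8MB.
	 */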
1201 while (hsize * 65536 < physmem * PAGESIZE)
1202 hsize <<= 1;
1203 retry:
1204 buf_hash_table.ht_mask = hsize - 1;
1205 buf_hash_table.ht_table =
1206 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
1207 if (buf_hash_table.ht_table == NULL) {
1208 ASSERT(hsize > (1ULL << 8));
1209 hsize >>= 1;
1210 goto retry;
1211 }
1212
1213 hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
1214 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
1215 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
1216 0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
1217
1218 for (i = 0; i < 256; i++)
1219 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
1220 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
1221
1222 for (i = 0; i < BUF_LOCKS; i++) {
1223 mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
1224 NULL, MUTEX_DEFAULT, NULL);
1225 }
1226 }
1227
1228 #define ARC_MINTIME (hz>>4) /* 62 ms */
1229
1230 static void
1231 arc_cksum_verify(arc_buf_t *buf)
1232 {
1233 zio_cksum_t zc;
1234
1235 if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1236 return;
1237
1238 mutex_enter(&buf->b_hdr->b_freeze_lock);
1239 if (buf->b_hdr->b_freeze_cksum == NULL ||
1240 (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
1241 mutex_exit(&buf->b_hdr->b_freeze_lock);
1242 return;
1243 }
1244 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1245 if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
1246 panic("buffer modified while frozen!");
1247 mutex_exit(&buf->b_hdr->b_freeze_lock);
1248 }
1249
1250 static int
1251 arc_cksum_equal(arc_buf_t *buf)
1252 {
1253 zio_cksum_t zc;
1254 int equal;
1255
1256 mutex_enter(&buf->b_hdr->b_freeze_lock);
1257 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1258 equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
1259 mutex_exit(&buf->b_hdr->b_freeze_lock);
1260
1261 return (equal);
1262 }
1263
1264 static void
1265 arc_cksum_compute(arc_buf_t *buf, boolean_t force)
1266 {
1267 if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
1268 return;
1269
1270 mutex_enter(&buf->b_hdr->b_freeze_lock);
1271 if (buf->b_hdr->b_freeze_cksum != NULL) {
1272 mutex_exit(&buf->b_hdr->b_freeze_lock);
1273 return;
1274 }
1275 buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
1276 fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
1277 buf->b_hdr->b_freeze_cksum);
1278 mutex_exit(&buf->b_hdr->b_freeze_lock);
1279 arc_buf_watch(buf);
1280 }
1281
1282 #ifndef _KERNEL
1283 typedef struct procctl {
1284 long cmd;
1285 prwatch_t prwatch;
1286 } procctl_t;
1287 #endif
1288
1289 /* ARGSUSED */
1290 static void
1291 arc_buf_unwatch(arc_buf_t *buf)
1292 {
1293 #ifndef _KERNEL
1294 if (arc_watch) {
1295 int result;
1296 procctl_t ctl;
1297 ctl.cmd = PCWATCH;
1298 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1299 ctl.prwatch.pr_size = 0;
1300 ctl.prwatch.pr_wflags = 0;
1301 result = write(arc_procfd, &ctl, sizeof (ctl));
1302 ASSERT3U(result, ==, sizeof (ctl));
1303 }
1304 #endif
1305 }
1306
1307 /* ARGSUSED */
1308 static void
1309 arc_buf_watch(arc_buf_t *buf)
1310 {
1311 #ifndef _KERNEL
1312 if (arc_watch) {
1313 int result;
1314 procctl_t ctl;
1315 ctl.cmd = PCWATCH;
1316 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1317 ctl.prwatch.pr_size = buf->b_hdr->b_size;
1318 ctl.prwatch.pr_wflags = WA_WRITE;
1319 result = write(arc_procfd, &ctl, sizeof (ctl));
1320 ASSERT3U(result, ==, sizeof (ctl));
1321 }
1322 #endif
1323 }
1324
1325 void
1326 arc_buf_thaw(arc_buf_t *buf)
1327 {
1328 if (zfs_flags & ZFS_DEBUG_MODIFY) {
1329 if (buf->b_hdr->b_state != arc_anon)
1330 panic("modifying non-anon buffer!");
1331 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
1332 panic("modifying buffer while i/o in progress!");
1333 arc_cksum_verify(buf);
1334 }
1335
1336 mutex_enter(&buf->b_hdr->b_freeze_lock);
1337 if (buf->b_hdr->b_freeze_cksum != NULL) {
1338 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1339 buf->b_hdr->b_freeze_cksum = NULL;
1340 }
1341
1342 if (zfs_flags & ZFS_DEBUG_MODIFY) {
1343 if (buf->b_hdr->b_thawed)
1344 kmem_free(buf->b_hdr->b_thawed, 1);
1345 buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
1346 }
1347
1348 mutex_exit(&buf->b_hdr->b_freeze_lock);
1349
1350 arc_buf_unwatch(buf);
1351 }
1352
1353 void
1354 arc_buf_freeze(arc_buf_t *buf)
1355 {
1356 kmutex_t *hash_lock;
1357
1358 if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1359 return;
1360
1361 hash_lock = HDR_LOCK(buf->b_hdr);
1362 mutex_enter(hash_lock);
1363
1364 ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1365 buf->b_hdr->b_state == arc_anon);
1366 arc_cksum_compute(buf, B_FALSE);
1367 mutex_exit(hash_lock);
1368
1369 }
1370
1371 static void
1372 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1373 {
1374 ASSERT(MUTEX_HELD(hash_lock));
1375
1376 if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
1377 (ab->b_state != arc_anon)) {
1378 uint64_t delta = ab->b_size * ab->b_datacnt;
1379 list_t *list = &ab->b_state->arcs_list[ab->b_type];
1380 uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
1381
1382 ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
1383 mutex_enter(&ab->b_state->arcs_mtx);
1384 ASSERT(list_link_active(&ab->b_arc_node));
1385 list_remove(list, ab);
1386 if (GHOST_STATE(ab->b_state)) {
1387 ASSERT0(ab->b_datacnt);
1388 ASSERT3P(ab->b_buf, ==, NULL);
1389 delta = ab->b_size;
1390 }
1391 ASSERT(delta > 0);
1392 ASSERT3U(*size, >=, delta);
1393 atomic_add_64(size, -delta);
1394 mutex_exit(&ab->b_state->arcs_mtx);
1395 /* remove the prefetch flag if we get a reference */
1396 if (ab->b_flags & ARC_PREFETCH)
1397 ab->b_flags &= ~ARC_PREFETCH;
1398 }
1399 }
1400
1401 static int
1402 remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1403 {
1404 int cnt;
1405 arc_state_t *state = ab->b_state;
1406
1407 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1408 ASSERT(!GHOST_STATE(state));
1409
1410 if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
1411 (state != arc_anon)) {
1412 uint64_t *size = &state->arcs_lsize[ab->b_type];
1413
1414 ASSERT(!MUTEX_HELD(&state->arcs_mtx));
1415 mutex_enter(&state->arcs_mtx);
1416 ASSERT(!list_link_active(&ab->b_arc_node));
1417 list_insert_head(&state->arcs_list[ab->b_type], ab);
1418 ASSERT(ab->b_datacnt > 0);
1419 atomic_add_64(size, ab->b_size * ab->b_datacnt);
1420 mutex_exit(&state->arcs_mtx);
1421 }
1422 return (cnt);
1423 }
1424
1425 /*
1426 * Move the supplied buffer to the indicated state. The mutex
1427 * for the buffer must be held by the caller.
1428 */
1429 static void
1430 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
1431 {
1432 arc_state_t *old_state = ab->b_state;
1433 int64_t refcnt = refcount_count(&ab->b_refcnt);
1434 uint64_t from_delta, to_delta;
1435
1436 ASSERT(MUTEX_HELD(hash_lock));
1437 ASSERT(new_state != old_state);
1438 ASSERT(refcnt == 0 || ab->b_datacnt > 0);
1439 ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
1440 ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
1441
1442 from_delta = to_delta = ab->b_datacnt * ab->b_size;
1443
1444 /*
1445 * If this buffer is evictable, transfer it from the
1446 * old state list to the new state list.
1447 */
1448 if (refcnt == 0) {
1449 if (old_state != arc_anon) {
1450 int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
1451 uint64_t *size = &old_state->arcs_lsize[ab->b_type];
1452
1453 if (use_mutex)
1454 mutex_enter(&old_state->arcs_mtx);
1455
1456 ASSERT(list_link_active(&ab->b_arc_node));
1457 list_remove(&old_state->arcs_list[ab->b_type], ab);
1458
1459 /*
1460 * If prefetching out of the ghost cache,
1461 * we will have a non-zero datacnt.
1462 */
1463 if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
1464 /* ghost elements have a ghost size */
1465 ASSERT(ab->b_buf == NULL);
1466 from_delta = ab->b_size;
1467 }
1468 ASSERT3U(*size, >=, from_delta);
1469 atomic_add_64(size, -from_delta);
1470
1471 if (use_mutex)
1472 mutex_exit(&old_state->arcs_mtx);
1473 }
1474 if (new_state != arc_anon) {
1475 int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
1476 uint64_t *size = &new_state->arcs_lsize[ab->b_type];
1477
1478 if (use_mutex)
1479 mutex_enter(&new_state->arcs_mtx);
1480
1481 list_insert_head(&new_state->arcs_list[ab->b_type], ab);
1482
1483 /* ghost elements have a ghost size */
1484 if (GHOST_STATE(new_state)) {
1485 ASSERT(ab->b_datacnt == 0);
1486 ASSERT(ab->b_buf == NULL);
1487 to_delta = ab->b_size;
1488 }
1489 atomic_add_64(size, to_delta);
1490
1491 if (use_mutex)
1492 mutex_exit(&new_state->arcs_mtx);
1493 }
1494 }
1495
1496 ASSERT(!BUF_EMPTY(ab));
1497 if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
1498 buf_hash_remove(ab);
1499
1500 /* adjust state sizes */
1501 if (to_delta)
1502 atomic_add_64(&new_state->arcs_size, to_delta);
1503 if (from_delta) {
1504 ASSERT3U(old_state->arcs_size, >=, from_delta);
1505 atomic_add_64(&old_state->arcs_size, -from_delta);
1506 }
1507 ab->b_state = new_state;
1508
1509 /* adjust l2arc hdr stats */
1510 if (new_state == arc_l2c_only)
1511 l2arc_hdr_stat_add(old_state != arc_anon);
1512 else if (old_state == arc_l2c_only)
1513 l2arc_hdr_stat_remove();
1514 }
1515
1516 void
1517 arc_space_consume(uint64_t space, arc_space_type_t type)
1518 {
1519 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1520
1521 switch (type) {
1522 case ARC_SPACE_DATA:
1523 ARCSTAT_INCR(arcstat_data_size, space);
1524 break;
1525 case ARC_SPACE_OTHER:
1526 ARCSTAT_INCR(arcstat_other_size, space);
1527 break;
1528 case ARC_SPACE_HDRS:
1529 ARCSTAT_INCR(arcstat_hdr_size, space);
1530 break;
1531 case ARC_SPACE_L2HDRS:
1532 ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1533 break;
1534 }
1535
1536 ARCSTAT_INCR(arcstat_meta_used, space);
1537 atomic_add_64(&arc_size, space);
1538 }
1539
1540 void
1541 arc_space_return(uint64_t space, arc_space_type_t type)
1542 {
1543 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1544
1545 switch (type) {
1546 case ARC_SPACE_DATA:
1547 ARCSTAT_INCR(arcstat_data_size, -space);
1548 break;
1549 case ARC_SPACE_OTHER:
1550 ARCSTAT_INCR(arcstat_other_size, -space);
1551 break;
1552 case ARC_SPACE_HDRS:
1553 ARCSTAT_INCR(arcstat_hdr_size, -space);
1554 break;
1555 case ARC_SPACE_L2HDRS:
1556 ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1557 break;
1558 }
1559
1560 ASSERT(arc_meta_used >= space);
1561 if (arc_meta_max < arc_meta_used)
1562 arc_meta_max = arc_meta_used;
1563 ARCSTAT_INCR(arcstat_meta_used, -space);
1564 ASSERT(arc_size >= space);
1565 atomic_add_64(&arc_size, -space);
1566 }
1567
1568 void *
1569 arc_data_buf_alloc(uint64_t size)
1570 {
1571 if (arc_evict_needed(ARC_BUFC_DATA))
1572 cv_signal(&arc_reclaim_thr_cv);
1573 atomic_add_64(&arc_size, size);
1574 return (zio_data_buf_alloc(size));
1575 }
1576
1577 void
1578 arc_data_buf_free(void *buf, uint64_t size)
1579 {
1580 zio_data_buf_free(buf, size);
1581 ASSERT(arc_size >= size);
1582 atomic_add_64(&arc_size, -size);
1583 }
1584
1585 arc_buf_t *
1586 arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
1587 {
1588 arc_buf_hdr_t *hdr;
1589 arc_buf_t *buf;
1590
1591 ASSERT3U(size, >, 0);
1592 hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1593 ASSERT(BUF_EMPTY(hdr));
1594 hdr->b_size = size;
1595 hdr->b_type = type;
1596 hdr->b_spa = spa_load_guid(spa);
1597 hdr->b_state = arc_anon;
1598 hdr->b_arc_access = 0;
1599 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1600 buf->b_hdr = hdr;
1601 buf->b_data = NULL;
1602 buf->b_efunc = NULL;
1603 buf->b_private = NULL;
1604 buf->b_next = NULL;
1605 hdr->b_buf = buf;
1606 arc_get_data_buf(buf);
1607 hdr->b_datacnt = 1;
1608 hdr->b_flags = 0;
1609 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1610 (void) refcount_add(&hdr->b_refcnt, tag);
1611
1612 return (buf);
1613 }
1614
1615 /*
1616 * Allocates an empty arc_buf_hdr structure (lacking any data buffer).
1617 * This is used during l2arc reconstruction to make empty ARC buffers
1618 * which circumvent the regular disk->arc->l2arc path and instead come
1619 * into being in the reverse order, i.e. l2arc->arc->(disk).
1620 */
1621 arc_buf_hdr_t *
1622 arc_buf_hdr_alloc(uint64_t guid, int size, arc_buf_contents_t type)
1623 {
1624 arc_buf_hdr_t *hdr;
1625
1626 ASSERT3U(size, >, 0);
1627 hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1628 ASSERT(BUF_EMPTY(hdr));
1629 hdr->b_size = size;
1630 hdr->b_type = type;
1631 hdr->b_spa = guid;
1632 hdr->b_state = arc_anon;
1633 hdr->b_arc_access = 0;
1634 hdr->b_buf = NULL;
1635 hdr->b_datacnt = 0;
1636 hdr->b_flags = 0;
1637 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1638
1639 return (hdr);
1640 }
1641
1642 static char *arc_onloan_tag = "onloan";
1643
1644 /*
1645 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1646 * flight data by arc_tempreserve_space() until they are "returned". Loaned
1647 * buffers must be returned to the arc before they can be used by the DMU or
1648 * freed.
1649 */
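/*
 * A rough caller-side sketch of the loan cycle (illustrative only; the
 * tag name and the layer doing the borrowing are made up, not a claim
 * about any particular DMU routine):
 *
 *	arc_buf_t *abuf = arc_loan_buf(spa, blksz);
 *	... fill abuf->b_data with blksz bytes ...
 *	arc_return_buf(abuf, my_tag);	the buffer is now held under my_tag
 *					and is again counted by the arc
 */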
1650 arc_buf_t *
1651 arc_loan_buf(spa_t *spa, int size)
1652 {
1653 arc_buf_t *buf;
1654
1655 buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1656
1657 atomic_add_64(&arc_loaned_bytes, size);
1658 return (buf);
1659 }
1660
1661 /*
1662 * Return a loaned arc buffer to the arc.
1663 */
1664 void
1665 arc_return_buf(arc_buf_t *buf, void *tag)
1666 {
1667 arc_buf_hdr_t *hdr = buf->b_hdr;
1668
1669 ASSERT(buf->b_data != NULL);
1670 (void) refcount_add(&hdr->b_refcnt, tag);
1671 (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
1672
1673 atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1674 }
1675
1676 /* Detach an arc_buf from a dbuf (tag) */
1677 void
1678 arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1679 {
1680 arc_buf_hdr_t *hdr;
1681
1682 ASSERT(buf->b_data != NULL);
1683 hdr = buf->b_hdr;
1684 (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
1685 (void) refcount_remove(&hdr->b_refcnt, tag);
1686 buf->b_efunc = NULL;
1687 buf->b_private = NULL;
1688
1689 atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1690 }
1691
1692 static arc_buf_t *
1693 arc_buf_clone(arc_buf_t *from)
1694 {
1695 arc_buf_t *buf;
1696 arc_buf_hdr_t *hdr = from->b_hdr;
1697 uint64_t size = hdr->b_size;
1698
1699 ASSERT(hdr->b_state != arc_anon);
1700
1701 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1702 buf->b_hdr = hdr;
1703 buf->b_data = NULL;
1704 buf->b_efunc = NULL;
1705 buf->b_private = NULL;
1706 buf->b_next = hdr->b_buf;
1707 hdr->b_buf = buf;
1708 arc_get_data_buf(buf);
1709 bcopy(from->b_data, buf->b_data, size);
1710
1711 /*
1712 * This buffer already exists in the arc so create a duplicate
1713 * copy for the caller. If the buffer is associated with user data
1714 * then track the size and number of duplicates. These stats will be
1715 * updated as duplicate buffers are created and destroyed.
1716 */
1717 if (hdr->b_type == ARC_BUFC_DATA) {
1718 ARCSTAT_BUMP(arcstat_duplicate_buffers);
1719 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
1720 }
1721 hdr->b_datacnt += 1;
1722 return (buf);
1723 }
1724
1725 void
1726 arc_buf_add_ref(arc_buf_t *buf, void* tag)
1727 {
1728 arc_buf_hdr_t *hdr;
1729 kmutex_t *hash_lock;
1730
1731 /*
1732 * Check to see if this buffer is evicted. Callers
1733 * must verify b_data != NULL to know if the add_ref
1734 * was successful.
1735 */
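/*
 * A caller-side sketch (illustrative; "db" stands for whatever structure
 * owns the buffer, not a specific dbuf field):
 *
 *	arc_buf_add_ref(db->db_buf, db);
 *	if (db->db_buf->b_data == NULL)
 *		... the buffer was evicted underneath us; treat this as a
 *		    miss and re-read the block ...
 */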
1736 mutex_enter(&buf->b_evict_lock);
1737 if (buf->b_data == NULL) {
1738 mutex_exit(&buf->b_evict_lock);
1739 return;
1740 }
1741 hash_lock = HDR_LOCK(buf->b_hdr);
1742 mutex_enter(hash_lock);
1743 hdr = buf->b_hdr;
1744 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1745 mutex_exit(&buf->b_evict_lock);
1746
1747 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
1748 add_reference(hdr, hash_lock, tag);
1749 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1750 arc_access(hdr, hash_lock);
1751 mutex_exit(hash_lock);
1752 ARCSTAT_BUMP(arcstat_hits);
1753 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
1754 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
1755 data, metadata, hits);
1756 }
1757
1758 /*
1759 * Free the arc data buffer. If it is an l2arc write in progress,
1760 * the buffer is placed on l2arc_free_on_write to be freed later.
1761 */
1762 static void
1763 arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
1764 {
1765 arc_buf_hdr_t *hdr = buf->b_hdr;
1766
1767 if (HDR_L2_WRITING(hdr)) {
1768 l2arc_data_free_t *df;
1769 df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
1770 df->l2df_data = buf->b_data;
1771 df->l2df_size = hdr->b_size;
1772 df->l2df_func = free_func;
1773 mutex_enter(&l2arc_free_on_write_mtx);
1774 list_insert_head(l2arc_free_on_write, df);
1775 mutex_exit(&l2arc_free_on_write_mtx);
1776 ARCSTAT_BUMP(arcstat_l2_free_on_write);
1777 } else {
1778 free_func(buf->b_data, hdr->b_size);
1779 }
1780 }
1781
1782 static void
1783 arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
1784 {
1785 arc_buf_t **bufp;
1786
1787 /* free up data associated with the buf */
1788 if (buf->b_data) {
1789 arc_state_t *state = buf->b_hdr->b_state;
1790 uint64_t size = buf->b_hdr->b_size;
1791 arc_buf_contents_t type = buf->b_hdr->b_type;
1792
1793 arc_cksum_verify(buf);
1794 arc_buf_unwatch(buf);
1795
1796 if (!recycle) {
1797 if (type == ARC_BUFC_METADATA) {
1798 arc_buf_data_free(buf, zio_buf_free);
1799 arc_space_return(size, ARC_SPACE_DATA);
1800 } else {
1801 ASSERT(type == ARC_BUFC_DATA);
1802 arc_buf_data_free(buf, zio_data_buf_free);
1803 ARCSTAT_INCR(arcstat_data_size, -size);
1804 atomic_add_64(&arc_size, -size);
1805 }
1806 }
1807 if (list_link_active(&buf->b_hdr->b_arc_node)) {
1808 uint64_t *cnt = &state->arcs_lsize[type];
1809
1810 ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
1811 ASSERT(state != arc_anon);
1812
1813 ASSERT3U(*cnt, >=, size);
1814 atomic_add_64(cnt, -size);
1815 }
1816 ASSERT3U(state->arcs_size, >=, size);
1817 atomic_add_64(&state->arcs_size, -size);
1818 buf->b_data = NULL;
1819
1820 /*
1821 * If we're destroying a duplicate buffer make sure
1822 * that the appropriate statistics are updated.
1823 */
1824 if (buf->b_hdr->b_datacnt > 1 &&
1825 buf->b_hdr->b_type == ARC_BUFC_DATA) {
1826 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
1827 ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
1828 }
1829 ASSERT(buf->b_hdr->b_datacnt > 0);
1830 buf->b_hdr->b_datacnt -= 1;
1831 }
1832
1833 /* only remove the buf if requested */
1834 if (!all)
1835 return;
1836
1837 /* remove the buf from the hdr list */
1838 for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
1839 continue;
1840 *bufp = buf->b_next;
1841 buf->b_next = NULL;
1842
1843 ASSERT(buf->b_efunc == NULL);
1844
1845 /* clean up the buf */
1846 buf->b_hdr = NULL;
1847 kmem_cache_free(buf_cache, buf);
1848 }
1849
1850 static void
1851 arc_hdr_destroy(arc_buf_hdr_t *hdr)
1852 {
1853 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1854 ASSERT3P(hdr->b_state, ==, arc_anon);
1855 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1856 l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1857
1858 if (l2hdr != NULL) {
1859 boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
1860 /*
1861 * To prevent arc_free() and l2arc_evict() from
1862 * attempting to free the same buffer at the same time,
1863 * a FREE_IN_PROGRESS flag is given to arc_free() to
1864 * give it priority. l2arc_evict() can't destroy this
1865 * header while we are waiting on l2arc_buflist_mtx.
1866 *
1867 * The hdr may be removed from l2ad_buflist before we
1868 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1869 */
1870 if (!buflist_held) {
1871 mutex_enter(&l2arc_buflist_mtx);
1872 l2hdr = hdr->b_l2hdr;
1873 }
1874
1875 if (l2hdr != NULL) {
1876 list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
1877 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1878 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
1879 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
1880 if (hdr->b_state == arc_l2c_only)
1881 l2arc_hdr_stat_remove();
1882 hdr->b_l2hdr = NULL;
1883 }
1884
1885 if (!buflist_held)
1886 mutex_exit(&l2arc_buflist_mtx);
1887 }
1888
1889 if (!BUF_EMPTY(hdr)) {
1890 ASSERT(!HDR_IN_HASH_TABLE(hdr));
1891 buf_discard_identity(hdr);
1892 }
1893 while (hdr->b_buf) {
1894 arc_buf_t *buf = hdr->b_buf;
1895
1896 if (buf->b_efunc) {
1897 mutex_enter(&arc_eviction_mtx);
1898 mutex_enter(&buf->b_evict_lock);
1899 ASSERT(buf->b_hdr != NULL);
1900 arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1901 hdr->b_buf = buf->b_next;
1902 buf->b_hdr = &arc_eviction_hdr;
1903 buf->b_next = arc_eviction_list;
1904 arc_eviction_list = buf;
1905 mutex_exit(&buf->b_evict_lock);
1906 mutex_exit(&arc_eviction_mtx);
1907 } else {
1908 arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1909 }
1910 }
1911 if (hdr->b_freeze_cksum != NULL) {
1912 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1913 hdr->b_freeze_cksum = NULL;
1914 }
1915 if (hdr->b_thawed) {
1916 kmem_free(hdr->b_thawed, 1);
1917 hdr->b_thawed = NULL;
1918 }
1919
1920 ASSERT(!list_link_active(&hdr->b_arc_node));
1921 ASSERT3P(hdr->b_hash_next, ==, NULL);
1922 ASSERT3P(hdr->b_acb, ==, NULL);
1923 kmem_cache_free(hdr_cache, hdr);
1924 }
1925
1926 void
1927 arc_buf_free(arc_buf_t *buf, void *tag)
1928 {
1929 arc_buf_hdr_t *hdr = buf->b_hdr;
1930 int hashed = hdr->b_state != arc_anon;
1931
1932 ASSERT(buf->b_efunc == NULL);
1933 ASSERT(buf->b_data != NULL);
1934
1935 if (hashed) {
1936 kmutex_t *hash_lock = HDR_LOCK(hdr);
1937
1938 mutex_enter(hash_lock);
1939 hdr = buf->b_hdr;
1940 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1941
1942 (void) remove_reference(hdr, hash_lock, tag);
1943 if (hdr->b_datacnt > 1) {
1944 arc_buf_destroy(buf, FALSE, TRUE);
1945 } else {
1946 ASSERT(buf == hdr->b_buf);
1947 ASSERT(buf->b_efunc == NULL);
1948 hdr->b_flags |= ARC_BUF_AVAILABLE;
1949 }
1950 mutex_exit(hash_lock);
1951 } else if (HDR_IO_IN_PROGRESS(hdr)) {
1952 int destroy_hdr;
1953 /*
1954 * We are in the middle of an async write. Don't destroy
1955 * this buffer unless the write completes before we finish
1956 * decrementing the reference count.
1957 */
1958 mutex_enter(&arc_eviction_mtx);
1959 (void) remove_reference(hdr, NULL, tag);
1960 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1961 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
1962 mutex_exit(&arc_eviction_mtx);
1963 if (destroy_hdr)
1964 arc_hdr_destroy(hdr);
1965 } else {
1966 if (remove_reference(hdr, NULL, tag) > 0)
1967 arc_buf_destroy(buf, FALSE, TRUE);
1968 else
1969 arc_hdr_destroy(hdr);
1970 }
1971 }
1972
1973 boolean_t
1974 arc_buf_remove_ref(arc_buf_t *buf, void* tag)
1975 {
1976 arc_buf_hdr_t *hdr = buf->b_hdr;
1977 kmutex_t *hash_lock = HDR_LOCK(hdr);
1978 boolean_t no_callback = (buf->b_efunc == NULL);
1979
1980 if (hdr->b_state == arc_anon) {
1981 ASSERT(hdr->b_datacnt == 1);
1982 arc_buf_free(buf, tag);
1983 return (no_callback);
1984 }
1985
1986 mutex_enter(hash_lock);
1987 hdr = buf->b_hdr;
1988 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1989 ASSERT(hdr->b_state != arc_anon);
1990 ASSERT(buf->b_data != NULL);
1991
1992 (void) remove_reference(hdr, hash_lock, tag);
1993 if (hdr->b_datacnt > 1) {
1994 if (no_callback)
1995 arc_buf_destroy(buf, FALSE, TRUE);
1996 } else if (no_callback) {
1997 ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
1998 ASSERT(buf->b_efunc == NULL);
1999 hdr->b_flags |= ARC_BUF_AVAILABLE;
2000 }
2001 ASSERT(no_callback || hdr->b_datacnt > 1 ||
2002 refcount_is_zero(&hdr->b_refcnt));
2003 mutex_exit(hash_lock);
2004 return (no_callback);
2005 }
2006
2007 int
2008 arc_buf_size(arc_buf_t *buf)
2009 {
2010 return (buf->b_hdr->b_size);
2011 }
2012
2013 /*
2014 * Called from the DMU to determine if the current buffer should be
2015 * evicted. In order to ensure proper locking, the eviction must be initiated
2016 * from the DMU. Return true if the buffer is associated with user data and
2017 * duplicate buffers still exist.
2018 */
2019 boolean_t
2020 arc_buf_eviction_needed(arc_buf_t *buf)
2021 {
2022 arc_buf_hdr_t *hdr;
2023 boolean_t evict_needed = B_FALSE;
2024
2025 if (zfs_disable_dup_eviction)
2026 return (B_FALSE);
2027
2028 mutex_enter(&buf->b_evict_lock);
2029 hdr = buf->b_hdr;
2030 if (hdr == NULL) {
2031 /*
2032 * We are in arc_do_user_evicts(); let that function
2033 * perform the eviction.
2034 */
2035 ASSERT(buf->b_data == NULL);
2036 mutex_exit(&buf->b_evict_lock);
2037 return (B_FALSE);
2038 } else if (buf->b_data == NULL) {
2039 /*
2040 * We have already been added to the arc eviction list;
2041 * recommend eviction.
2042 */
2043 ASSERT3P(hdr, ==, &arc_eviction_hdr);
2044 mutex_exit(&buf->b_evict_lock);
2045 return (B_TRUE);
2046 }
2047
2048 if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
2049 evict_needed = B_TRUE;
2050
2051 mutex_exit(&buf->b_evict_lock);
2052 return (evict_needed);
2053 }
2054
2055 /*
2056 * Evict buffers from list until we've removed the specified number of
2057 * bytes. Move the removed buffers to the appropriate evict state.
2058 * If the recycle flag is set, then attempt to "recycle" a buffer:
2059 * - look for a buffer to evict that is `bytes' long.
2060 * - return the data block from this buffer rather than freeing it.
2061 * This flag is used by callers that are trying to make space for a
2062 * new buffer in a full arc cache.
2063 *
2064 * This function makes a "best effort". It skips over any buffers
2065 * it can't get a hash_lock on, and so may not catch all candidates.
2066 * It may also return without evicting as much space as requested.
2067 */
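/*
 * For example (a sketch of how arc_get_data_buf() below uses the recycle
 * flag), a caller needing a "size"-byte block of the given type might try:
 *
 *	buf->b_data = arc_evict(state, NULL, size, TRUE, type);
 *	if (buf->b_data == NULL)
 *		... recycling failed; allocate a fresh block instead ...
 */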
2068 static void *
2069 arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
2070 arc_buf_contents_t type)
2071 {
2072 arc_state_t *evicted_state;
2073 uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
2074 arc_buf_hdr_t *ab, *ab_prev = NULL;
2075 list_t *list = &state->arcs_list[type];
2076 kmutex_t *hash_lock;
2077 boolean_t have_lock;
2078 void *stolen = NULL;
2079
2080 ASSERT(state == arc_mru || state == arc_mfu);
2081
2082 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
2083
2084 mutex_enter(&state->arcs_mtx);
2085 mutex_enter(&evicted_state->arcs_mtx);
2086
2087 for (ab = list_tail(list); ab; ab = ab_prev) {
2088 ab_prev = list_prev(list, ab);
2089 /* prefetch buffers have a minimum lifespan */
2090 if (HDR_IO_IN_PROGRESS(ab) ||
2091 (spa && ab->b_spa != spa) ||
2092 (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
2093 ddi_get_lbolt() - ab->b_arc_access <
2094 arc_min_prefetch_lifespan)) {
2095 skipped++;
2096 continue;
2097 }
2098 /* "lookahead" for better eviction candidate */
2099 if (recycle && ab->b_size != bytes &&
2100 ab_prev && ab_prev->b_size == bytes)
2101 continue;
2102 hash_lock = HDR_LOCK(ab);
2103 have_lock = MUTEX_HELD(hash_lock);
2104 if (have_lock || mutex_tryenter(hash_lock)) {
2105 ASSERT0(refcount_count(&ab->b_refcnt));
2106 ASSERT(ab->b_datacnt > 0);
2107 while (ab->b_buf) {
2108 arc_buf_t *buf = ab->b_buf;
2109 if (!mutex_tryenter(&buf->b_evict_lock)) {
2110 missed += 1;
2111 break;
2112 }
2113 if (buf->b_data) {
2114 bytes_evicted += ab->b_size;
2115 if (recycle && ab->b_type == type &&
2116 ab->b_size == bytes &&
2117 !HDR_L2_WRITING(ab)) {
2118 stolen = buf->b_data;
2119 recycle = FALSE;
2120 }
2121 }
2122 if (buf->b_efunc) {
2123 mutex_enter(&arc_eviction_mtx);
2124 arc_buf_destroy(buf,
2125 buf->b_data == stolen, FALSE);
2126 ab->b_buf = buf->b_next;
2127 buf->b_hdr = &arc_eviction_hdr;
2128 buf->b_next = arc_eviction_list;
2129 arc_eviction_list = buf;
2130 mutex_exit(&arc_eviction_mtx);
2131 mutex_exit(&buf->b_evict_lock);
2132 } else {
2133 mutex_exit(&buf->b_evict_lock);
2134 arc_buf_destroy(buf,
2135 buf->b_data == stolen, TRUE);
2136 }
2137 }
2138
2139 if (ab->b_l2hdr) {
2140 ARCSTAT_INCR(arcstat_evict_l2_cached,
2141 ab->b_size);
2142 } else {
2143 if (l2arc_write_eligible(ab->b_spa, ab)) {
2144 ARCSTAT_INCR(arcstat_evict_l2_eligible,
2145 ab->b_size);
2146 } else {
2147 ARCSTAT_INCR(
2148 arcstat_evict_l2_ineligible,
2149 ab->b_size);
2150 }
2151 }
2152
2153 if (ab->b_datacnt == 0) {
2154 arc_change_state(evicted_state, ab, hash_lock);
2155 ASSERT(HDR_IN_HASH_TABLE(ab));
2156 ab->b_flags |= ARC_IN_HASH_TABLE;
2157 ab->b_flags &= ~ARC_BUF_AVAILABLE;
2158 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
2159 }
2160 if (!have_lock)
2161 mutex_exit(hash_lock);
2162 if (bytes >= 0 && bytes_evicted >= bytes)
2163 break;
2164 } else {
2165 missed += 1;
2166 }
2167 }
2168
2169 mutex_exit(&evicted_state->arcs_mtx);
2170 mutex_exit(&state->arcs_mtx);
2171
2172 if (bytes_evicted < bytes)
2173 dprintf("only evicted %lld bytes from %p",
2174 (longlong_t)bytes_evicted, state);
2175
2176 if (skipped)
2177 ARCSTAT_INCR(arcstat_evict_skip, skipped);
2178
2179 if (missed)
2180 ARCSTAT_INCR(arcstat_mutex_miss, missed);
2181
2182 /*
2183 * We have just evicted some data into the ghost state, make
2184 * sure we also adjust the ghost state size if necessary.
2185 */
2186 if (arc_no_grow &&
2187 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
2188 int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
2189 arc_mru_ghost->arcs_size - arc_c;
2190
2191 if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
2192 int64_t todelete =
2193 MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
2194 arc_evict_ghost(arc_mru_ghost, NULL, todelete);
2195 } else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
2196 int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
2197 arc_mru_ghost->arcs_size +
2198 arc_mfu_ghost->arcs_size - arc_c);
2199 arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
2200 }
2201 }
2202
2203 return (stolen);
2204 }
2205
2206 /*
2207 * Remove buffers from list until we've removed the specified number of
2208 * bytes. Destroy the buffers that are removed.
2209 */
2210 static void
2211 arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
2212 {
2213 arc_buf_hdr_t *ab, *ab_prev;
2214 arc_buf_hdr_t marker = { 0 };
2215 list_t *list = &state->arcs_list[ARC_BUFC_DATA];
2216 kmutex_t *hash_lock;
2217 uint64_t bytes_deleted = 0;
2218 uint64_t bufs_skipped = 0;
2219
2220 ASSERT(GHOST_STATE(state));
2221 top:
2222 mutex_enter(&state->arcs_mtx);
2223 for (ab = list_tail(list); ab; ab = ab_prev) {
2224 ab_prev = list_prev(list, ab);
2225 if (spa && ab->b_spa != spa)
2226 continue;
2227
2228 /* ignore markers */
2229 if (ab->b_spa == 0)
2230 continue;
2231
2232 hash_lock = HDR_LOCK(ab);
2233 /* caller may be trying to modify this buffer, skip it */
2234 if (MUTEX_HELD(hash_lock))
2235 continue;
2236 if (mutex_tryenter(hash_lock)) {
2237 ASSERT(!HDR_IO_IN_PROGRESS(ab));
2238 ASSERT(ab->b_buf == NULL);
2239 ARCSTAT_BUMP(arcstat_deleted);
2240 bytes_deleted += ab->b_size;
2241
2242 if (ab->b_l2hdr != NULL) {
2243 /*
2244 * This buffer is cached on the 2nd Level ARC;
2245 * don't destroy the header.
2246 */
2247 arc_change_state(arc_l2c_only, ab, hash_lock);
2248 mutex_exit(hash_lock);
2249 } else {
2250 arc_change_state(arc_anon, ab, hash_lock);
2251 mutex_exit(hash_lock);
2252 arc_hdr_destroy(ab);
2253 }
2254
2255 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
2256 if (bytes >= 0 && bytes_deleted >= bytes)
2257 break;
2258 } else if (bytes < 0) {
2259 /*
2260 * Insert a list marker and then wait for the
2261 * hash lock to become available. Once it's
2262 * available, restart from where we left off.
2263 */
2264 list_insert_after(list, ab, &marker);
2265 mutex_exit(&state->arcs_mtx);
2266 mutex_enter(hash_lock);
2267 mutex_exit(hash_lock);
2268 mutex_enter(&state->arcs_mtx);
2269 ab_prev = list_prev(list, &marker);
2270 list_remove(list, &marker);
2271 } else
2272 bufs_skipped += 1;
2273 }
2274 mutex_exit(&state->arcs_mtx);
2275
2276 if (list == &state->arcs_list[ARC_BUFC_DATA] &&
2277 (bytes < 0 || bytes_deleted < bytes)) {
2278 list = &state->arcs_list[ARC_BUFC_METADATA];
2279 goto top;
2280 }
2281
2282 if (bufs_skipped) {
2283 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
2284 ASSERT(bytes >= 0);
2285 }
2286
2287 if (bytes_deleted < bytes)
2288 dprintf("only deleted %lld bytes from %p",
2289 (longlong_t)bytes_deleted, state);
2290 }
2291
2292 static void
2293 arc_adjust(void)
2294 {
2295 int64_t adjustment, delta;
2296
2297 /*
2298 * Adjust MRU size
2299 */
2300
2301 adjustment = MIN((int64_t)(arc_size - arc_c),
2302 (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
2303 arc_p));
2304
2305 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
2306 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
2307 (void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA);
2308 adjustment -= delta;
2309 }
2310
2311 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2312 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
2313 (void) arc_evict(arc_mru, NULL, delta, FALSE,
2314 ARC_BUFC_METADATA);
2315 }
2316
2317 /*
2318 * Adjust MFU size
2319 */
2320
2321 adjustment = arc_size - arc_c;
2322
2323 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
2324 delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
2325 (void) arc_evict(arc_mfu, NULL, delta, FALSE, ARC_BUFC_DATA);
2326 adjustment -= delta;
2327 }
2328
2329 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2330 int64_t delta = MIN(adjustment,
2331 arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
2332 (void) arc_evict(arc_mfu, NULL, delta, FALSE,
2333 ARC_BUFC_METADATA);
2334 }
2335
2336 /*
2337 * Adjust ghost lists
2338 */
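/*
 * The two passes below keep the ghost lists roughly bounded: first
 * mru + mru_ghost is trimmed back toward arc_c, then
 * mru_ghost + mfu_ghost is trimmed back toward arc_c.
 */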
2339
2340 adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
2341
2342 if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
2343 delta = MIN(arc_mru_ghost->arcs_size, adjustment);
2344 arc_evict_ghost(arc_mru_ghost, NULL, delta);
2345 }
2346
2347 adjustment =
2348 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2349
2350 if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
2351 delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
2352 arc_evict_ghost(arc_mfu_ghost, NULL, delta);
2353 }
2354 }
2355
2356 static void
2357 arc_do_user_evicts(void)
2358 {
2359 mutex_enter(&arc_eviction_mtx);
2360 while (arc_eviction_list != NULL) {
2361 arc_buf_t *buf = arc_eviction_list;
2362 arc_eviction_list = buf->b_next;
2363 mutex_enter(&buf->b_evict_lock);
2364 buf->b_hdr = NULL;
2365 mutex_exit(&buf->b_evict_lock);
2366 mutex_exit(&arc_eviction_mtx);
2367
2368 if (buf->b_efunc != NULL)
2369 VERIFY(buf->b_efunc(buf) == 0);
2370
2371 buf->b_efunc = NULL;
2372 buf->b_private = NULL;
2373 kmem_cache_free(buf_cache, buf);
2374 mutex_enter(&arc_eviction_mtx);
2375 }
2376 mutex_exit(&arc_eviction_mtx);
2377 }
2378
2379 /*
2380 * Flush all *evictable* data from the cache for the given spa.
2381 * NOTE: this will not touch "active" (i.e. referenced) data.
2382 */
2383 void
2384 arc_flush(spa_t *spa)
2385 {
2386 uint64_t guid = 0;
2387
2388 if (spa)
2389 guid = spa_load_guid(spa);
2390
2391 while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
2392 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
2393 if (spa)
2394 break;
2395 }
2396 while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
2397 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
2398 if (spa)
2399 break;
2400 }
2401 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
2402 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
2403 if (spa)
2404 break;
2405 }
2406 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
2407 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
2408 if (spa)
2409 break;
2410 }
2411
2412 arc_evict_ghost(arc_mru_ghost, guid, -1);
2413 arc_evict_ghost(arc_mfu_ghost, guid, -1);
2414
2415 mutex_enter(&arc_reclaim_thr_lock);
2416 arc_do_user_evicts();
2417 mutex_exit(&arc_reclaim_thr_lock);
2418 ASSERT(spa || arc_eviction_list == NULL);
2419 }
2420
2421 void
2422 arc_shrink(void)
2423 {
2424 if (arc_c > arc_c_min) {
2425 uint64_t to_free;
2426
2427 #ifdef _KERNEL
2428 to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree));
2429 #else
2430 to_free = arc_c >> arc_shrink_shift;
2431 #endif
2432 if (arc_c > arc_c_min + to_free)
2433 atomic_add_64(&arc_c, -to_free);
2434 else
2435 arc_c = arc_c_min;
2436
2437 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
2438 if (arc_c > arc_size)
2439 arc_c = MAX(arc_size, arc_c_min);
2440 if (arc_p > arc_c)
2441 arc_p = (arc_c >> 1);
2442 ASSERT(arc_c >= arc_c_min);
2443 ASSERT((int64_t)arc_p >= 0);
2444 }
2445
2446 if (arc_size > arc_c)
2447 arc_adjust();
2448 }
2449
2450 /*
2451 * Determine if the system is under memory pressure and is asking
2452 * to reclaim memory. A return value of 1 indicates that the system
2453 * is under memory pressure and that the arc should adjust accordingly.
2454 */
2455 static int
2456 arc_reclaim_needed(void)
2457 {
2458 uint64_t extra;
2459
2460 #ifdef _KERNEL
2461
2462 if (needfree)
2463 return (1);
2464
2465 /*
2466 * take 'desfree' extra pages, so we reclaim sooner, rather than later
2467 */
2468 extra = desfree;
2469
2470 /*
2471 * check that we're out of range of the pageout scanner. It starts to
2472 * schedule paging if freemem is less than lotsfree and needfree.
2473 * lotsfree is the high-water mark for pageout, and needfree is the
2474 * number of needed free pages. We add extra pages here to make sure
2475 * the scanner doesn't start up while we're freeing memory.
2476 */
2477 if (freemem < lotsfree + needfree + extra)
2478 return (1);
2479
2480 /*
2481 * check to make sure that swapfs has enough space so that anon
2482 * reservations can still succeed. anon_resvmem() checks that the
2483 * availrmem is greater than swapfs_minfree, and the number of reserved
2484 * swap pages. We also add a bit of extra here just to prevent
2485 * circumstances from getting really dire.
2486 */
2487 if (availrmem < swapfs_minfree + swapfs_reserve + extra)
2488 return (1);
2489
2490 #if defined(__i386)
2491 /*
2492 * If we're on an i386 platform, it's possible that we'll exhaust the
2493 * kernel heap space before we ever run out of available physical
2494 * memory. Most checks of the size of the heap_area compare against
2495 * tune.t_minarmem, which is the minimum available real memory that we
2496 * can have in the system. However, this is generally fixed at 25 pages
2497 * which is so low that it's useless. In this comparison, we seek to
2498 * calculate the total heap-size, and reclaim if more than 3/4ths of the
2499 * heap is allocated. (Or, in the calculation, if less than 1/4th is
2500 * free)
2501 */
2502 if (vmem_size(heap_arena, VMEM_FREE) <
2503 (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2))
2504 return (1);
2505 #endif
2506
2507 /*
2508 * If zio data pages are being allocated out of a separate heap segment,
2509 * then enforce that the size of available vmem for this arena remains
2510 * above about 1/16th free.
2511 *
2512 * Note: The 1/16th arena free requirement was put in place
2513 * to aggressively evict memory from the arc in order to avoid
2514 * memory fragmentation issues.
2515 */
2516 if (zio_arena != NULL &&
2517 vmem_size(zio_arena, VMEM_FREE) <
2518 (vmem_size(zio_arena, VMEM_ALLOC) >> 4))
2519 return (1);
2520 #else
2521 if (spa_get_random(100) == 0)
2522 return (1);
2523 #endif
2524 return (0);
2525 }
2526
2527 static void
2528 arc_kmem_reap_now(arc_reclaim_strategy_t strat)
2529 {
2530 size_t i;
2531 kmem_cache_t *prev_cache = NULL;
2532 kmem_cache_t *prev_data_cache = NULL;
2533 extern kmem_cache_t *zio_buf_cache[];
2534 extern kmem_cache_t *zio_data_buf_cache[];
2535
2536 #ifdef _KERNEL
2537 if (arc_meta_used >= arc_meta_limit) {
2538 /*
2539 * We are exceeding our meta-data cache limit.
2540 * Purge some DNLC entries to release holds on meta-data.
2541 */
2542 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
2543 }
2544 #if defined(__i386)
2545 /*
2546 * Reclaim unused memory from all kmem caches.
2547 */
2548 kmem_reap();
2549 #endif
2550 #endif
2551
2552 /*
2553 * An aggressive reclamation will shrink the cache size as well as
2554 * reap free buffers from the arc kmem caches.
2555 */
2556 if (strat == ARC_RECLAIM_AGGR)
2557 arc_shrink();
2558
2559 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
2560 if (zio_buf_cache[i] != prev_cache) {
2561 prev_cache = zio_buf_cache[i];
2562 kmem_cache_reap_now(zio_buf_cache[i]);
2563 }
2564 if (zio_data_buf_cache[i] != prev_data_cache) {
2565 prev_data_cache = zio_data_buf_cache[i];
2566 kmem_cache_reap_now(zio_data_buf_cache[i]);
2567 }
2568 }
2569 kmem_cache_reap_now(buf_cache);
2570 kmem_cache_reap_now(hdr_cache);
2571
2572 /*
2573 * Ask the vmem arena to reclaim unused memory from its
2574 * quantum caches.
2575 */
2576 if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR)
2577 vmem_qcache_reap(zio_arena);
2578 }
2579
2580 static void
2581 arc_reclaim_thread(void)
2582 {
2583 clock_t growtime = 0;
2584 arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS;
2585 callb_cpr_t cpr;
2586
2587 CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2588
2589 mutex_enter(&arc_reclaim_thr_lock);
2590 while (arc_thread_exit == 0) {
2591 if (arc_reclaim_needed()) {
2592
2593 if (arc_no_grow) {
2594 if (last_reclaim == ARC_RECLAIM_CONS) {
2595 last_reclaim = ARC_RECLAIM_AGGR;
2596 } else {
2597 last_reclaim = ARC_RECLAIM_CONS;
2598 }
2599 } else {
2600 arc_no_grow = TRUE;
2601 last_reclaim = ARC_RECLAIM_AGGR;
2602 membar_producer();
2603 }
2604
2605 /* reset the growth delay for every reclaim */
2606 growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
2607
2608 arc_kmem_reap_now(last_reclaim);
2609 arc_warm = B_TRUE;
2610
2611 } else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
2612 arc_no_grow = FALSE;
2613 }
2614
2615 arc_adjust();
2616
2617 if (arc_eviction_list != NULL)
2618 arc_do_user_evicts();
2619
2620 /* block until needed, or one second, whichever is shorter */
2621 CALLB_CPR_SAFE_BEGIN(&cpr);
2622 (void) cv_timedwait(&arc_reclaim_thr_cv,
2623 &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
2624 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
2625 }
2626
2627 arc_thread_exit = 0;
2628 cv_broadcast(&arc_reclaim_thr_cv);
2629 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */
2630 thread_exit();
2631 }
2632
2633 /*
2634 * Adapt arc info given the number of bytes we are trying to add and
2635 * the state that we are coming from. This function is only called
2636 * when we are adding new content to the cache.
2637 */
2638 static void
2639 arc_adapt(int bytes, arc_state_t *state)
2640 {
2641 int mult;
2642 uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
2643
2644 if (state == arc_l2c_only)
2645 return;
2646
2647 ASSERT(bytes > 0);
2648 /*
2649 * Adapt the target size of the MRU list:
2650 * - if we just hit in the MRU ghost list, then increase
2651 * the target size of the MRU list.
2652 * - if we just hit in the MFU ghost list, then increase
2653 * the target size of the MFU list by decreasing the
2654 * target size of the MRU list.
2655 */
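/*
 * Worked example (illustrative numbers): if the MFU ghost list is four
 * times the size of the MRU ghost list, a 16K hit in the MRU ghost list
 * grows arc_p by 16K * 4 = 64K, capped at arc_c - arc_p_min. The MFU
 * ghost case is symmetric: arc_p shrinks by bytes * mult, floored at
 * arc_p_min.
 */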
2656 if (state == arc_mru_ghost) {
2657 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
2658 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
2659 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
2660
2661 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
2662 } else if (state == arc_mfu_ghost) {
2663 uint64_t delta;
2664
2665 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2666 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
2667 mult = MIN(mult, 10);
2668
2669 delta = MIN(bytes * mult, arc_p);
2670 arc_p = MAX(arc_p_min, arc_p - delta);
2671 }
2672 ASSERT((int64_t)arc_p >= 0);
2673
2674 if (arc_reclaim_needed()) {
2675 cv_signal(&arc_reclaim_thr_cv);
2676 return;
2677 }
2678
2679 if (arc_no_grow)
2680 return;
2681
2682 if (arc_c >= arc_c_max)
2683 return;
2684
2685 /*
2686 * If we're within (2 * maxblocksize) bytes of the target
2687 * cache size, increment the target cache size
2688 */
2689 if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2690 atomic_add_64(&arc_c, (int64_t)bytes);
2691 if (arc_c > arc_c_max)
2692 arc_c = arc_c_max;
2693 else if (state == arc_anon)
2694 atomic_add_64(&arc_p, (int64_t)bytes);
2695 if (arc_p > arc_c)
2696 arc_p = arc_c;
2697 }
2698 ASSERT((int64_t)arc_p >= 0);
2699 }
2700
2701 /*
2702 * Check if the cache has reached its limits and eviction is required
2703 * prior to insert.
2704 */
2705 static int
2706 arc_evict_needed(arc_buf_contents_t type)
2707 {
2708 if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2709 return (1);
2710
2711 if (arc_reclaim_needed())
2712 return (1);
2713
2714 return (arc_size > arc_c);
2715 }
2716
2717 /*
2718 * The buffer, supplied as the first argument, needs a data block.
2719 * So, if we are at cache max, determine which cache should be victimized.
2720 * We have the following cases:
2721 *
2722 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
2723 * In this situation if we're out of space, but the resident size of the MFU is
2724 * under the limit, victimize the MFU cache to satisfy this insertion request.
2725 *
2726 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
2727 * Here, we've used up all of the available space for the MRU, so we need to
2728 * evict from our own cache instead. Evict from the set of resident MRU
2729 * entries.
2730 *
2731 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
2732 * c minus p represents the MFU space in the cache, since p is the size of the
2733 * cache that is dedicated to the MRU. In this situation there's still space on
2734 * the MFU side, so the MRU side needs to be victimized.
2735 *
2736 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
2737 * MFU's resident set is consuming more space than it has been allotted. In
2738 * this situation, we must victimize our own cache, the MFU, for this insertion.
2739 */
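/*
 * A concrete (made up) example: with arc_c = 1GB and arc_p = 600MB, an
 * MRU insert while arc_anon + arc_mru already hold 700MB falls under
 * case 2 above (p <= sizeof(arc_anon + arc_mru)), so the MRU list itself
 * is victimized; if they held only 400MB it would be case 1 and the MFU
 * side would be victimized instead (assuming the MFU has enough
 * evictable data of this type).
 */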
2740 static void
2741 arc_get_data_buf(arc_buf_t *buf)
2742 {
2743 arc_state_t *state = buf->b_hdr->b_state;
2744 uint64_t size = buf->b_hdr->b_size;
2745 arc_buf_contents_t type = buf->b_hdr->b_type;
2746
2747 arc_adapt(size, state);
2748
2749 /*
2750 * We have not yet reached cache maximum size,
2751 * just allocate a new buffer.
2752 */
2753 if (!arc_evict_needed(type)) {
2754 if (type == ARC_BUFC_METADATA) {
2755 buf->b_data = zio_buf_alloc(size);
2756 arc_space_consume(size, ARC_SPACE_DATA);
2757 } else {
2758 ASSERT(type == ARC_BUFC_DATA);
2759 buf->b_data = zio_data_buf_alloc(size);
2760 ARCSTAT_INCR(arcstat_data_size, size);
2761 atomic_add_64(&arc_size, size);
2762 }
2763 goto out;
2764 }
2765
2766 /*
2767 * If we are prefetching from the mfu ghost list, this buffer
2768 * will end up on the mru list; so steal space from there.
2769 */
2770 if (state == arc_mfu_ghost)
2771 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2772 else if (state == arc_mru_ghost)
2773 state = arc_mru;
2774
2775 if (state == arc_mru || state == arc_anon) {
2776 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2777 state = (arc_mfu->arcs_lsize[type] >= size &&
2778 arc_p > mru_used) ? arc_mfu : arc_mru;
2779 } else {
2780 /* MFU cases */
2781 uint64_t mfu_space = arc_c - arc_p;
2782 state = (arc_mru->arcs_lsize[type] >= size &&
2783 mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2784 }
2785 if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) {
2786 if (type == ARC_BUFC_METADATA) {
2787 buf->b_data = zio_buf_alloc(size);
2788 arc_space_consume(size, ARC_SPACE_DATA);
2789 } else {
2790 ASSERT(type == ARC_BUFC_DATA);
2791 buf->b_data = zio_data_buf_alloc(size);
2792 ARCSTAT_INCR(arcstat_data_size, size);
2793 atomic_add_64(&arc_size, size);
2794 }
2795 ARCSTAT_BUMP(arcstat_recycle_miss);
2796 }
2797 ASSERT(buf->b_data != NULL);
2798 out:
2799 /*
2800 * Update the state size. Note that ghost states have a
2801 * "ghost size" and so don't need to be updated.
2802 */
2803 if (!GHOST_STATE(buf->b_hdr->b_state)) {
2804 arc_buf_hdr_t *hdr = buf->b_hdr;
2805
2806 atomic_add_64(&hdr->b_state->arcs_size, size);
2807 if (list_link_active(&hdr->b_arc_node)) {
2808 ASSERT(refcount_is_zero(&hdr->b_refcnt));
2809 atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2810 }
2811 /*
2812 * If we are growing the cache, and we are adding anonymous
2813 * data, and we have outgrown arc_p, update arc_p
2814 */
2815 if (arc_size < arc_c && hdr->b_state == arc_anon &&
2816 arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
2817 arc_p = MIN(arc_c, arc_p + size);
2818 }
2819 }
2820
2821 /*
2822 * This routine is called whenever a buffer is accessed.
2823 * NOTE: the hash lock is dropped in this function.
2824 */
2825 static void
2826 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
2827 {
2828 clock_t now;
2829
2830 ASSERT(MUTEX_HELD(hash_lock));
2831
2832 if (buf->b_state == arc_anon) {
2833 /*
2834 * This buffer is not in the cache, and does not
2835 * appear in our "ghost" list. Add the new buffer
2836 * to the MRU state.
2837 */
2838
2839 ASSERT(buf->b_arc_access == 0);
2840 buf->b_arc_access = ddi_get_lbolt();
2841 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2842 arc_change_state(arc_mru, buf, hash_lock);
2843
2844 } else if (buf->b_state == arc_mru) {
2845 now = ddi_get_lbolt();
2846
2847 /*
2848 * If this buffer is here because of a prefetch, then either:
2849 * - clear the flag if this is a "referencing" read
2850 * (any subsequent access will bump this into the MFU state).
2851 * or
2852 * - move the buffer to the head of the list if this is
2853 * another prefetch (to make it less likely to be evicted).
2854 */
2855 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2856 if (refcount_count(&buf->b_refcnt) == 0) {
2857 ASSERT(list_link_active(&buf->b_arc_node));
2858 } else {
2859 buf->b_flags &= ~ARC_PREFETCH;
2860 ARCSTAT_BUMP(arcstat_mru_hits);
2861 }
2862 buf->b_arc_access = now;
2863 return;
2864 }
2865
2866 /*
2867 * This buffer has been "accessed" only once so far,
2868 * but it is still in the cache. Move it to the MFU
2869 * state.
2870 */
2871 if (now > buf->b_arc_access + ARC_MINTIME) {
2872 /*
2873 * More than 125ms have passed since we
2874 * instantiated this buffer. Move it to the
2875 * most frequently used state.
2876 */
2877 buf->b_arc_access = now;
2878 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2879 arc_change_state(arc_mfu, buf, hash_lock);
2880 }
2881 ARCSTAT_BUMP(arcstat_mru_hits);
2882 } else if (buf->b_state == arc_mru_ghost) {
2883 arc_state_t *new_state;
2884 /*
2885 * This buffer has been "accessed" recently, but
2886 * was evicted from the cache. Move it to the
2887 * MFU state.
2888 */
2889
2890 if (buf->b_flags & ARC_PREFETCH) {
2891 new_state = arc_mru;
2892 if (refcount_count(&buf->b_refcnt) > 0)
2893 buf->b_flags &= ~ARC_PREFETCH;
2894 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2895 } else {
2896 new_state = arc_mfu;
2897 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2898 }
2899
2900 buf->b_arc_access = ddi_get_lbolt();
2901 arc_change_state(new_state, buf, hash_lock);
2902
2903 ARCSTAT_BUMP(arcstat_mru_ghost_hits);
2904 } else if (buf->b_state == arc_mfu) {
2905 /*
2906 * This buffer has been accessed more than once and is
2907 * still in the cache. Keep it in the MFU state.
2908 *
2909 * NOTE: an add_reference() that occurred when we did
2910 * the arc_read() will have kicked this off the list.
2911 * If it was a prefetch, we will explicitly move it to
2912 * the head of the list now.
2913 */
2914 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2915 ASSERT(refcount_count(&buf->b_refcnt) == 0);
2916 ASSERT(list_link_active(&buf->b_arc_node));
2917 }
2918 ARCSTAT_BUMP(arcstat_mfu_hits);
2919 buf->b_arc_access = ddi_get_lbolt();
2920 } else if (buf->b_state == arc_mfu_ghost) {
2921 arc_state_t *new_state = arc_mfu;
2922 /*
2923 * This buffer has been accessed more than once but has
2924 * been evicted from the cache. Move it back to the
2925 * MFU state.
2926 */
2927
2928 if (buf->b_flags & ARC_PREFETCH) {
2929 /*
2930 * This is a prefetch access...
2931 * move this block back to the MRU state.
2932 */
2933 ASSERT0(refcount_count(&buf->b_refcnt));
2934 new_state = arc_mru;
2935 }
2936
2937 buf->b_arc_access = ddi_get_lbolt();
2938 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2939 arc_change_state(new_state, buf, hash_lock);
2940
2941 ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
2942 } else if (buf->b_state == arc_l2c_only) {
2943 /*
2944 * This buffer is on the 2nd Level ARC.
2945 */
2946
2947 buf->b_arc_access = ddi_get_lbolt();
2948 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2949 arc_change_state(arc_mfu, buf, hash_lock);
2950 } else {
2951 ASSERT(!"invalid arc state");
2952 }
2953 }
2954
2955 /* a generic arc_done_func_t which you can use */
2956 /* ARGSUSED */
2957 void
2958 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
2959 {
2960 if (zio == NULL || zio->io_error == 0)
2961 bcopy(buf->b_data, arg, buf->b_hdr->b_size);
2962 VERIFY(arc_buf_remove_ref(buf, arg));
2963 }
2964
2965 /* a generic arc_done_func_t */
2966 void
2967 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
2968 {
2969 arc_buf_t **bufp = arg;
2970 if (zio && zio->io_error) {
2971 VERIFY(arc_buf_remove_ref(buf, arg));
2972 *bufp = NULL;
2973 } else {
2974 *bufp = buf;
2975 ASSERT(buf->b_data);
2976 }
2977 }
2978
2979 static void
2980 arc_read_done(zio_t *zio)
2981 {
2982 arc_buf_hdr_t *hdr, *found;
2983 arc_buf_t *buf;
2984 arc_buf_t *abuf; /* buffer we're assigning to callback */
2985 kmutex_t *hash_lock;
2986 arc_callback_t *callback_list, *acb;
2987 int freeable = FALSE;
2988
2989 buf = zio->io_private;
2990 hdr = buf->b_hdr;
2991
2992 /*
2993 * The hdr was inserted into hash-table and removed from lists
2994 * prior to starting I/O. We should find this header, since
2995 * it's in the hash table, and it should be legit since it's
2996 * not possible to evict it during the I/O. The only possible
2997 * reason for it not to be found is if we were freed during the
2998 * read.
2999 */
3000 found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
3001 &hash_lock);
3002
3003 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
3004 (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
3005 (found == hdr && HDR_L2_READING(hdr)));
3006
3007 hdr->b_flags &= ~ARC_L2_EVICTED;
3008 if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
3009 hdr->b_flags &= ~ARC_L2CACHE;
3010
3011 /* byteswap if necessary */
3012 callback_list = hdr->b_acb;
3013 ASSERT(callback_list != NULL);
3014 if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
3015 dmu_object_byteswap_t bswap =
3016 DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
3017 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
3018 byteswap_uint64_array :
3019 dmu_ot_byteswap[bswap].ob_func;
3020 func(buf->b_data, hdr->b_size);
3021 }
3022
3023 arc_cksum_compute(buf, B_FALSE);
3024 arc_buf_watch(buf);
3025
3026 if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
3027 /*
3028 * Only call arc_access on anonymous buffers. This is because
3029 * if we've issued an I/O for an evicted buffer, we've already
3030 * called arc_access (to prevent any simultaneous readers from
3031 * getting confused).
3032 */
3033 arc_access(hdr, hash_lock);
3034 }
3035
3036 /* create copies of the data buffer for the callers */
3037 abuf = buf;
3038 for (acb = callback_list; acb; acb = acb->acb_next) {
3039 if (acb->acb_done) {
3040 if (abuf == NULL) {
3041 ARCSTAT_BUMP(arcstat_duplicate_reads);
3042 abuf = arc_buf_clone(buf);
3043 }
3044 acb->acb_buf = abuf;
3045 abuf = NULL;
3046 }
3047 }
3048 hdr->b_acb = NULL;
3049 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3050 ASSERT(!HDR_BUF_AVAILABLE(hdr));
3051 if (abuf == buf) {
3052 ASSERT(buf->b_efunc == NULL);
3053 ASSERT(hdr->b_datacnt == 1);
3054 hdr->b_flags |= ARC_BUF_AVAILABLE;
3055 }
3056
3057 ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
3058
3059 if (zio->io_error != 0) {
3060 hdr->b_flags |= ARC_IO_ERROR;
3061 if (hdr->b_state != arc_anon)
3062 arc_change_state(arc_anon, hdr, hash_lock);
3063 if (HDR_IN_HASH_TABLE(hdr))
3064 buf_hash_remove(hdr);
3065 freeable = refcount_is_zero(&hdr->b_refcnt);
3066 }
3067
3068 /*
3069 * Broadcast before we drop the hash_lock to avoid the possibility
3070 * that the hdr (and hence the cv) might be freed before we get to
3071 * the cv_broadcast().
3072 */
3073 cv_broadcast(&hdr->b_cv);
3074
3075 if (hash_lock) {
3076 mutex_exit(hash_lock);
3077 } else {
3078 /*
3079 * This block was freed while we waited for the read to
3080 * complete. It has been removed from the hash table and
3081 * moved to the anonymous state (so that it won't show up
3082 * in the cache).
3083 */
3084 ASSERT3P(hdr->b_state, ==, arc_anon);
3085 freeable = refcount_is_zero(&hdr->b_refcnt);
3086 }
3087
3088 /* execute each callback and free its structure */
3089 while ((acb = callback_list) != NULL) {
3090 if (acb->acb_done)
3091 acb->acb_done(zio, acb->acb_buf, acb->acb_private);
3092
3093 if (acb->acb_zio_dummy != NULL) {
3094 acb->acb_zio_dummy->io_error = zio->io_error;
3095 zio_nowait(acb->acb_zio_dummy);
3096 }
3097
3098 callback_list = acb->acb_next;
3099 kmem_free(acb, sizeof (arc_callback_t));
3100 }
3101
3102 if (freeable)
3103 arc_hdr_destroy(hdr);
3104 }
3105
3106 /*
3107 * "Read" the block at the specified DVA (in bp) via the
3108 * cache. If the block is found in the cache, invoke the provided
3109 * callback immediately and return. Note that the `zio' parameter
3110 * in the callback will be NULL in this case, since no IO was
3111 * required. If the block is not in the cache pass the read request
3112 * on to the spa with a substitute callback function, so that the
3113 * requested block will be added to the cache.
3114 *
3115 * If a read request arrives for a block that has a read in-progress,
3116 * either wait for the in-progress read to complete (and return the
3117 * results); or, if this is a read with a "done" func, add a record
3118 * to the read to invoke the "done" func when the read completes,
3119 * and return; or just return.
3120 *
3121 * arc_read_done() will invoke all the requested "done" functions
3122 * for readers of this block.
3123 */
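/*
 * A minimal caller-side sketch of a synchronous read (illustrative only;
 * error handling and bookmark setup are elided, and the priority and
 * zio flag shown are merely plausible choices, not a prescription):
 *
 *	arc_buf_t *abuf = NULL;
 *	uint32_t aflags = ARC_WAIT;
 *
 *	(void) arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
 *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
 *	if (abuf != NULL) {
 *		... consume abuf->b_data ...
 *		(void) arc_buf_remove_ref(abuf, &abuf);
 *	}
 */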
3124 int
3125 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
3126 void *private, int priority, int zio_flags, uint32_t *arc_flags,
3127 const zbookmark_t *zb)
3128 {
3129 arc_buf_hdr_t *hdr;
3130 arc_buf_t *buf = NULL;
3131 kmutex_t *hash_lock;
3132 zio_t *rzio;
3133 uint64_t guid = spa_load_guid(spa);
3134
3135 top:
3136 hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
3137 &hash_lock);
3138 if (hdr && hdr->b_datacnt > 0) {
3139
3140 *arc_flags |= ARC_CACHED;
3141
3142 if (HDR_IO_IN_PROGRESS(hdr)) {
3143
3144 if (*arc_flags & ARC_WAIT) {
3145 cv_wait(&hdr->b_cv, hash_lock);
3146 mutex_exit(hash_lock);
3147 goto top;
3148 }
3149 ASSERT(*arc_flags & ARC_NOWAIT);
3150
3151 if (done) {
3152 arc_callback_t *acb = NULL;
3153
3154 acb = kmem_zalloc(sizeof (arc_callback_t),
3155 KM_SLEEP);
3156 acb->acb_done = done;
3157 acb->acb_private = private;
3158 if (pio != NULL)
3159 acb->acb_zio_dummy = zio_null(pio,
3160 spa, NULL, NULL, NULL, zio_flags);
3161
3162 ASSERT(acb->acb_done != NULL);
3163 acb->acb_next = hdr->b_acb;
3164 hdr->b_acb = acb;
3165 add_reference(hdr, hash_lock, private);
3166 mutex_exit(hash_lock);
3167 return (0);
3168 }
3169 mutex_exit(hash_lock);
3170 return (0);
3171 }
3172
3173 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3174
3175 if (done) {
3176 add_reference(hdr, hash_lock, private);
3177 /*
3178 * If this block is already in use, create a new
3179 * copy of the data so that we will be guaranteed
3180 * that arc_release() will always succeed.
3181 */
3182 buf = hdr->b_buf;
3183 ASSERT(buf);
3184 ASSERT(buf->b_data);
3185 if (HDR_BUF_AVAILABLE(hdr)) {
3186 ASSERT(buf->b_efunc == NULL);
3187 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3188 } else {
3189 buf = arc_buf_clone(buf);
3190 }
3191
3192 } else if (*arc_flags & ARC_PREFETCH &&
3193 refcount_count(&hdr->b_refcnt) == 0) {
3194 hdr->b_flags |= ARC_PREFETCH;
3195 }
3196 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
3197 arc_access(hdr, hash_lock);
3198 if (*arc_flags & ARC_L2CACHE)
3199 hdr->b_flags |= ARC_L2CACHE;
3200 if (*arc_flags & ARC_L2COMPRESS)
3201 hdr->b_flags |= ARC_L2COMPRESS;
3202 mutex_exit(hash_lock);
3203 ARCSTAT_BUMP(arcstat_hits);
3204 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3205 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3206 data, metadata, hits);
3207
3208 if (done)
3209 done(NULL, buf, private);
3210 } else {
3211 uint64_t size = BP_GET_LSIZE(bp);
3212 arc_callback_t *acb;
3213 vdev_t *vd = NULL;
3214 uint64_t addr = 0;
3215 boolean_t devw = B_FALSE;
3216
3217 if (hdr == NULL) {
3218 /* this block is not in the cache */
3219 arc_buf_hdr_t *exists;
3220 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
3221 buf = arc_buf_alloc(spa, size, private, type);
3222 hdr = buf->b_hdr;
3223 hdr->b_dva = *BP_IDENTITY(bp);
3224 hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
3225 hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
3226 exists = buf_hash_insert(hdr, &hash_lock);
3227 if (exists) {
3228 /* somebody beat us to the hash insert */
3229 mutex_exit(hash_lock);
3230 buf_discard_identity(hdr);
3231 (void) arc_buf_remove_ref(buf, private);
3232 goto top; /* restart the IO request */
3233 }
3234 /* if this is a prefetch, we don't have a reference */
3235 if (*arc_flags & ARC_PREFETCH) {
3236 (void) remove_reference(hdr, hash_lock,
3237 private);
3238 hdr->b_flags |= ARC_PREFETCH;
3239 }
3240 if (*arc_flags & ARC_L2CACHE)
3241 hdr->b_flags |= ARC_L2CACHE;
3242 if (*arc_flags & ARC_L2COMPRESS)
3243 hdr->b_flags |= ARC_L2COMPRESS;
3244 if (BP_GET_LEVEL(bp) > 0)
3245 hdr->b_flags |= ARC_INDIRECT;
3246 } else {
3247 /* this block is in the ghost cache */
3248 ASSERT(GHOST_STATE(hdr->b_state));
3249 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3250 ASSERT0(refcount_count(&hdr->b_refcnt));
3251 ASSERT(hdr->b_buf == NULL);
3252
3253 /* if this is a prefetch, we don't have a reference */
3254 if (*arc_flags & ARC_PREFETCH)
3255 hdr->b_flags |= ARC_PREFETCH;
3256 else
3257 add_reference(hdr, hash_lock, private);
3258 if (*arc_flags & ARC_L2CACHE)
3259 hdr->b_flags |= ARC_L2CACHE;
3260 if (*arc_flags & ARC_L2COMPRESS)
3261 hdr->b_flags |= ARC_L2COMPRESS;
3262 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
3263 buf->b_hdr = hdr;
3264 buf->b_data = NULL;
3265 buf->b_efunc = NULL;
3266 buf->b_private = NULL;
3267 buf->b_next = NULL;
3268 hdr->b_buf = buf;
3269 ASSERT(hdr->b_datacnt == 0);
3270 hdr->b_datacnt = 1;
3271 arc_get_data_buf(buf);
3272 arc_access(hdr, hash_lock);
3273 }
3274
3275 ASSERT(!GHOST_STATE(hdr->b_state));
3276
3277 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
3278 acb->acb_done = done;
3279 acb->acb_private = private;
3280
3281 ASSERT(hdr->b_acb == NULL);
3282 hdr->b_acb = acb;
3283 hdr->b_flags |= ARC_IO_IN_PROGRESS;
3284
3285 if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
3286 (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
3287 devw = hdr->b_l2hdr->b_dev->l2ad_writing;
3288 addr = hdr->b_l2hdr->b_daddr;
3289 /*
3290 * Lock out device removal.
3291 */
3292 if (vdev_is_dead(vd) ||
3293 !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
3294 vd = NULL;
3295 }
3296
3297 mutex_exit(hash_lock);
3298
3299 /*
3300 * At this point, we have a level 1 cache miss. Try again in
3301 * L2ARC if possible.
3302 */
3303 ASSERT3U(hdr->b_size, ==, size);
3304 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
3305 uint64_t, size, zbookmark_t *, zb);
3306 ARCSTAT_BUMP(arcstat_misses);
3307 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3308 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3309 data, metadata, misses);
3310
3311 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
3312 /*
3313 * Read from the L2ARC if the following are true:
3314 * 1. The L2ARC vdev was previously cached.
3315 * 2. This buffer still has L2ARC metadata.
3316 * 3. This buffer isn't currently writing to the L2ARC.
3317 * 4. The L2ARC entry wasn't evicted, which may
3318 * also have invalidated the vdev.
3319 * 5. This isn't a prefetch while l2arc_noprefetch is set.
3320 */
3321 if (hdr->b_l2hdr != NULL &&
3322 !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
3323 !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
3324 l2arc_read_callback_t *cb;
3325
3326 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
3327 ARCSTAT_BUMP(arcstat_l2_hits);
3328
3329 cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
3330 KM_SLEEP);
3331 cb->l2rcb_buf = buf;
3332 cb->l2rcb_spa = spa;
3333 cb->l2rcb_bp = *bp;
3334 cb->l2rcb_zb = *zb;
3335 cb->l2rcb_flags = zio_flags;
3336 cb->l2rcb_compress = hdr->b_l2hdr->b_compress;
3337
3338 ASSERT(addr >= VDEV_LABEL_START_SIZE &&
3339 addr + size < vd->vdev_psize -
3340 VDEV_LABEL_END_SIZE);
3341
3342 /*
3343 * l2arc read. The SCL_L2ARC lock will be
3344 * released by l2arc_read_done().
3345 * Issue a null zio if the underlying buffer
3346 * was squashed to zero size by compression.
3347 */
3348 if (hdr->b_l2hdr->b_compress ==
3349 ZIO_COMPRESS_EMPTY) {
3350 rzio = zio_null(pio, spa, vd,
3351 l2arc_read_done, cb,
3352 zio_flags | ZIO_FLAG_DONT_CACHE |
3353 ZIO_FLAG_CANFAIL |
3354 ZIO_FLAG_DONT_PROPAGATE |
3355 ZIO_FLAG_DONT_RETRY);
3356 } else {
3357 rzio = zio_read_phys(pio, vd, addr,
3358 hdr->b_l2hdr->b_asize,
3359 buf->b_data, ZIO_CHECKSUM_OFF,
3360 l2arc_read_done, cb, priority,
3361 zio_flags | ZIO_FLAG_DONT_CACHE |
3362 ZIO_FLAG_CANFAIL |
3363 ZIO_FLAG_DONT_PROPAGATE |
3364 ZIO_FLAG_DONT_RETRY, B_FALSE);
3365 }
3366 DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
3367 zio_t *, rzio);
3368 ARCSTAT_INCR(arcstat_l2_read_bytes,
3369 hdr->b_l2hdr->b_asize);
3370
3371 if (*arc_flags & ARC_NOWAIT) {
3372 zio_nowait(rzio);
3373 return (0);
3374 }
3375
3376 ASSERT(*arc_flags & ARC_WAIT);
3377 if (zio_wait(rzio) == 0)
3378 return (0);
3379
3380 /* l2arc read error; goto zio_read() */
3381 } else {
3382 DTRACE_PROBE1(l2arc__miss,
3383 arc_buf_hdr_t *, hdr);
3384 ARCSTAT_BUMP(arcstat_l2_misses);
3385 if (HDR_L2_WRITING(hdr))
3386 ARCSTAT_BUMP(arcstat_l2_rw_clash);
3387 spa_config_exit(spa, SCL_L2ARC, vd);
3388 }
3389 } else {
3390 if (vd != NULL)
3391 spa_config_exit(spa, SCL_L2ARC, vd);
3392 if (l2arc_ndev != 0) {
3393 DTRACE_PROBE1(l2arc__miss,
3394 arc_buf_hdr_t *, hdr);
3395 ARCSTAT_BUMP(arcstat_l2_misses);
3396 }
3397 }
3398
3399 rzio = zio_read(pio, spa, bp, buf->b_data, size,
3400 arc_read_done, buf, priority, zio_flags, zb);
3401
3402 if (*arc_flags & ARC_WAIT)
3403 return (zio_wait(rzio));
3404
3405 ASSERT(*arc_flags & ARC_NOWAIT);
3406 zio_nowait(rzio);
3407 }
3408 return (0);
3409 }
3410
3411 void
3412 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
3413 {
3414 ASSERT(buf->b_hdr != NULL);
3415 ASSERT(buf->b_hdr->b_state != arc_anon);
3416 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
3417 ASSERT(buf->b_efunc == NULL);
3418 ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
3419
3420 buf->b_efunc = func;
3421 buf->b_private = private;
3422 }
3423
3424 /*
3425 * Notify the arc that a block was freed, and thus will never be used again.
3426 */
3427 void
3428 arc_freed(spa_t *spa, const blkptr_t *bp)
3429 {
3430 arc_buf_hdr_t *hdr;
3431 kmutex_t *hash_lock;
3432 uint64_t guid = spa_load_guid(spa);
3433
3434 hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
3435 &hash_lock);
3436 if (hdr == NULL)
3437 return;
3438 if (HDR_BUF_AVAILABLE(hdr)) {
3439 arc_buf_t *buf = hdr->b_buf;
3440 add_reference(hdr, hash_lock, FTAG);
3441 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3442 mutex_exit(hash_lock);
3443
3444 arc_release(buf, FTAG);
3445 (void) arc_buf_remove_ref(buf, FTAG);
3446 } else {
3447 mutex_exit(hash_lock);
3448 }
3449
3450 }
3451
3452 /*
3453 * This is used by the DMU to let the ARC know that a buffer is
3454 * being evicted, so the ARC should clean up. If this arc buf
3455 * is not yet in the evicted state, it will be put there.
3456 */
3457 int
3458 arc_buf_evict(arc_buf_t *buf)
3459 {
3460 arc_buf_hdr_t *hdr;
3461 kmutex_t *hash_lock;
3462 arc_buf_t **bufp;
3463
3464 mutex_enter(&buf->b_evict_lock);
3465 hdr = buf->b_hdr;
3466 if (hdr == NULL) {
3467 /*
3468 * We are in arc_do_user_evicts().
3469 */
3470 ASSERT(buf->b_data == NULL);
3471 mutex_exit(&buf->b_evict_lock);
3472 return (0);
3473 } else if (buf->b_data == NULL) {
3474 arc_buf_t copy = *buf; /* structure assignment */
3475 /*
3476 * We are on the eviction list; process this buffer now
3477 * but let arc_do_user_evicts() do the reaping.
3478 */
3479 buf->b_efunc = NULL;
3480 mutex_exit(&buf->b_evict_lock);
3481 VERIFY(copy.b_efunc(&copy) == 0);
3482 return (1);
3483 }
3484 hash_lock = HDR_LOCK(hdr);
3485 mutex_enter(hash_lock);
3486 hdr = buf->b_hdr;
3487 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3488
3489 ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
3490 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3491
3492 /*
3493 * Pull this buffer off of the hdr
3494 */
3495 bufp = &hdr->b_buf;
3496 while (*bufp != buf)
3497 bufp = &(*bufp)->b_next;
3498 *bufp = buf->b_next;
3499
3500 ASSERT(buf->b_data != NULL);
3501 arc_buf_destroy(buf, FALSE, FALSE);
3502
3503 if (hdr->b_datacnt == 0) {
3504 arc_state_t *old_state = hdr->b_state;
3505 arc_state_t *evicted_state;
3506
3507 ASSERT(hdr->b_buf == NULL);
3508 ASSERT(refcount_is_zero(&hdr->b_refcnt));
3509
3510 evicted_state =
3511 (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
3512
3513 mutex_enter(&old_state->arcs_mtx);
3514 mutex_enter(&evicted_state->arcs_mtx);
3515
3516 arc_change_state(evicted_state, hdr, hash_lock);
3517 ASSERT(HDR_IN_HASH_TABLE(hdr));
3518 hdr->b_flags |= ARC_IN_HASH_TABLE;
3519 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3520
3521 mutex_exit(&evicted_state->arcs_mtx);
3522 mutex_exit(&old_state->arcs_mtx);
3523 }
3524 mutex_exit(hash_lock);
3525 mutex_exit(&buf->b_evict_lock);
3526
3527 VERIFY(buf->b_efunc(buf) == 0);
3528 buf->b_efunc = NULL;
3529 buf->b_private = NULL;
3530 buf->b_hdr = NULL;
3531 buf->b_next = NULL;
3532 kmem_cache_free(buf_cache, buf);
3533 return (1);
3534 }
3535
3536 /*
3537 * Release this buffer from the cache, making it an anonymous buffer. This
3538 * must be done after a read and prior to modifying the buffer contents.
3539 * If the buffer has more than one reference, we must make
3540 * a new hdr for the buffer.
3541 */
3542 void
3543 arc_release(arc_buf_t *buf, void *tag)
3544 {
3545 arc_buf_hdr_t *hdr;
3546 kmutex_t *hash_lock = NULL;
3547 l2arc_buf_hdr_t *l2hdr;
3548 uint64_t buf_size;
3549
3550 /*
3551 * It would be nice to assert that if it's DMU metadata (level >
3552 * 0 || it's the dnode file), then it must be syncing context.
3553 * But we don't know that information at this level.
3554 */
3555
3556 mutex_enter(&buf->b_evict_lock);
3557 hdr = buf->b_hdr;
3558
3559 /* this buffer is not on any list */
3560 ASSERT(refcount_count(&hdr->b_refcnt) > 0);
3561
3562 if (hdr->b_state == arc_anon) {
3563 /* this buffer is already released */
3564 ASSERT(buf->b_efunc == NULL);
3565 } else {
3566 hash_lock = HDR_LOCK(hdr);
3567 mutex_enter(hash_lock);
3568 hdr = buf->b_hdr;
3569 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3570 }
3571
3572 l2hdr = hdr->b_l2hdr;
3573 if (l2hdr) {
3574 mutex_enter(&l2arc_buflist_mtx);
3575 hdr->b_l2hdr = NULL;
3576 }
3577 buf_size = hdr->b_size;
3578
3579 /*
3580 * Do we have more than one buf?
3581 */
3582 if (hdr->b_datacnt > 1) {
3583 arc_buf_hdr_t *nhdr;
3584 arc_buf_t **bufp;
3585 uint64_t blksz = hdr->b_size;
3586 uint64_t spa = hdr->b_spa;
3587 arc_buf_contents_t type = hdr->b_type;
3588 uint32_t flags = hdr->b_flags;
3589
3590 ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
3591 /*
3592 * Pull the data off of this hdr and attach it to
3593 * a new anonymous hdr.
3594 */
3595 (void) remove_reference(hdr, hash_lock, tag);
3596 bufp = &hdr->b_buf;
3597 while (*bufp != buf)
3598 bufp = &(*bufp)->b_next;
3599 *bufp = buf->b_next;
3600 buf->b_next = NULL;
3601
3602 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
3603 atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
3604 if (refcount_is_zero(&hdr->b_refcnt)) {
3605 uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
3606 ASSERT3U(*size, >=, hdr->b_size);
3607 atomic_add_64(size, -hdr->b_size);
3608 }
3609
3610 /*
3611 * We're releasing a duplicate user data buffer, so update
3612 * our statistics accordingly.
3613 */
3614 if (hdr->b_type == ARC_BUFC_DATA) {
3615 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
3616 ARCSTAT_INCR(arcstat_duplicate_buffers_size,
3617 -hdr->b_size);
3618 }
3619 hdr->b_datacnt -= 1;
3620 arc_cksum_verify(buf);
3621 arc_buf_unwatch(buf);
3622
3623 mutex_exit(hash_lock);
3624
3625 nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
3626 nhdr->b_size = blksz;
3627 nhdr->b_spa = spa;
3628 nhdr->b_type = type;
3629 nhdr->b_buf = buf;
3630 nhdr->b_state = arc_anon;
3631 nhdr->b_arc_access = 0;
3632 nhdr->b_flags = flags & ARC_L2_WRITING;
3633 nhdr->b_l2hdr = NULL;
3634 nhdr->b_datacnt = 1;
3635 nhdr->b_freeze_cksum = NULL;
3636 (void) refcount_add(&nhdr->b_refcnt, tag);
3637 buf->b_hdr = nhdr;
3638 mutex_exit(&buf->b_evict_lock);
3639 atomic_add_64(&arc_anon->arcs_size, blksz);
3640 } else {
3641 mutex_exit(&buf->b_evict_lock);
3642 ASSERT(refcount_count(&hdr->b_refcnt) == 1);
3643 ASSERT(!list_link_active(&hdr->b_arc_node));
3644 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3645 if (hdr->b_state != arc_anon)
3646 arc_change_state(arc_anon, hdr, hash_lock);
3647 hdr->b_arc_access = 0;
3648 if (hash_lock)
3649 mutex_exit(hash_lock);
3650
3651 buf_discard_identity(hdr);
3652 arc_buf_thaw(buf);
3653 }
3654 buf->b_efunc = NULL;
3655 buf->b_private = NULL;
3656
3657 if (l2hdr) {
3658 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
3659 list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
3660 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
3661 ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3662 mutex_exit(&l2arc_buflist_mtx);
3663 }
3664 }
3665
3666 int
3667 arc_released(arc_buf_t *buf)
3668 {
3669 int released;
3670
3671 mutex_enter(&buf->b_evict_lock);
3672 released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
3673 mutex_exit(&buf->b_evict_lock);
3674 return (released);
3675 }
3676
3677 int
3678 arc_has_callback(arc_buf_t *buf)
3679 {
3680 int callback;
3681
3682 mutex_enter(&buf->b_evict_lock);
3683 callback = (buf->b_efunc != NULL);
3684 mutex_exit(&buf->b_evict_lock);
3685 return (callback);
3686 }
3687
3688 #ifdef ZFS_DEBUG
3689 int
3690 arc_referenced(arc_buf_t *buf)
3691 {
3692 int referenced;
3693
3694 mutex_enter(&buf->b_evict_lock);
3695 referenced = (refcount_count(&buf->b_hdr->b_refcnt));
3696 mutex_exit(&buf->b_evict_lock);
3697 return (referenced);
3698 }
3699 #endif
3700
3701 static void
3702 arc_write_ready(zio_t *zio)
3703 {
3704 arc_write_callback_t *callback = zio->io_private;
3705 arc_buf_t *buf = callback->awcb_buf;
3706 arc_buf_hdr_t *hdr = buf->b_hdr;
3707
3708 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
3709 callback->awcb_ready(zio, buf, callback->awcb_private);
3710
3711 /*
3712 * If the IO is already in progress, then this is a re-write
3713 * attempt, so we need to thaw and re-compute the cksum.
3714 * It is the responsibility of the callback to handle the
3715 * accounting for any re-write attempt.
3716 */
3717 if (HDR_IO_IN_PROGRESS(hdr)) {
3718 mutex_enter(&hdr->b_freeze_lock);
3719 if (hdr->b_freeze_cksum != NULL) {
3720 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3721 hdr->b_freeze_cksum = NULL;
3722 }
3723 mutex_exit(&hdr->b_freeze_lock);
3724 }
3725 arc_cksum_compute(buf, B_FALSE);
3726 hdr->b_flags |= ARC_IO_IN_PROGRESS;
3727 }
3728
3729 static void
3730 arc_write_done(zio_t *zio)
3731 {
3732 arc_write_callback_t *callback = zio->io_private;
3733 arc_buf_t *buf = callback->awcb_buf;
3734 arc_buf_hdr_t *hdr = buf->b_hdr;
3735
3736 ASSERT(hdr->b_acb == NULL);
3737
3738 if (zio->io_error == 0) {
3739 hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3740 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
3741 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3742 } else {
3743 ASSERT(BUF_EMPTY(hdr));
3744 }
3745
3746 /*
3747 * If the block to be written was all-zero, we may have
3748 * compressed it away. In this case no write was performed
3749 * so there will be no dva/birth/checksum. The buffer must
3750 * therefore remain anonymous (and uncached).
3751 */
3752 if (!BUF_EMPTY(hdr)) {
3753 arc_buf_hdr_t *exists;
3754 kmutex_t *hash_lock;
3755
3756 ASSERT(zio->io_error == 0);
3757
3758 arc_cksum_verify(buf);
3759
3760 exists = buf_hash_insert(hdr, &hash_lock);
3761 if (exists) {
3762 /*
3763 * This can only happen if we overwrite for
3764 * sync-to-convergence, because we remove
3765 * buffers from the hash table when we arc_free().
3766 */
3767 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
3768 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3769 panic("bad overwrite, hdr=%p exists=%p",
3770 (void *)hdr, (void *)exists);
3771 ASSERT(refcount_is_zero(&exists->b_refcnt));
3772 arc_change_state(arc_anon, exists, hash_lock);
3773 mutex_exit(hash_lock);
3774 arc_hdr_destroy(exists);
3775 exists = buf_hash_insert(hdr, &hash_lock);
3776 ASSERT3P(exists, ==, NULL);
3777 } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
3778 /* nopwrite */
3779 ASSERT(zio->io_prop.zp_nopwrite);
3780 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3781 panic("bad nopwrite, hdr=%p exists=%p",
3782 (void *)hdr, (void *)exists);
3783 } else {
3784 /* Dedup */
3785 ASSERT(hdr->b_datacnt == 1);
3786 ASSERT(hdr->b_state == arc_anon);
3787 ASSERT(BP_GET_DEDUP(zio->io_bp));
3788 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
3789 }
3790 }
3791 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3792 /* if it's not anon, we are doing a scrub */
3793 if (!exists && hdr->b_state == arc_anon)
3794 arc_access(hdr, hash_lock);
3795 mutex_exit(hash_lock);
3796 } else {
3797 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3798 }
3799
3800 ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3801 callback->awcb_done(zio, buf, callback->awcb_private);
3802
3803 kmem_free(callback, sizeof (arc_write_callback_t));
3804 }
3805
3806 zio_t *
3807 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3808 blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
3809 const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *done,
3810 void *private, int priority, int zio_flags, const zbookmark_t *zb)
3811 {
3812 arc_buf_hdr_t *hdr = buf->b_hdr;
3813 arc_write_callback_t *callback;
3814 zio_t *zio;
3815
3816 ASSERT(ready != NULL);
3817 ASSERT(done != NULL);
3818 ASSERT(!HDR_IO_ERROR(hdr));
3819 ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3820 ASSERT(hdr->b_acb == NULL);
3821 if (l2arc)
3822 hdr->b_flags |= ARC_L2CACHE;
3823 if (l2arc_compress)
3824 hdr->b_flags |= ARC_L2COMPRESS;
3825 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
3826 callback->awcb_ready = ready;
3827 callback->awcb_done = done;
3828 callback->awcb_private = private;
3829 callback->awcb_buf = buf;
3830
3831 zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
3832 arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
3833
3834 return (zio);
3835 }
3836
3837 static int
3838 arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
3839 {
3840 #ifdef _KERNEL
3841 uint64_t available_memory = ptob(freemem);
3842 static uint64_t page_load = 0;
3843 static uint64_t last_txg = 0;
3844
3845 #if defined(__i386)
3846 available_memory =
3847 MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
3848 #endif
3849 if (available_memory >= zfs_write_limit_max)
3850 return (0);
3851
3852 if (txg > last_txg) {
3853 last_txg = txg;
3854 page_load = 0;
3855 }
3856 /*
3857 * If we are in pageout, we know that memory is already tight
3858 * and the ARC is already going to be evicting, so we just want
3859 * to continue to let page writes occur as quickly as possible.
3860 */
3861 if (curproc == proc_pageout) {
3862 if (page_load > MAX(ptob(minfree), available_memory) / 4)
3863 return (SET_ERROR(ERESTART));
3864 /* Note: reserve is inflated, so we deflate */
3865 page_load += reserve / 8;
3866 return (0);
3867 } else if (page_load > 0 && arc_reclaim_needed()) {
3868 /* memory is low, delay before restarting */
3869 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3870 return (SET_ERROR(EAGAIN));
3871 }
3872 page_load = 0;
3873
3874 if (arc_size > arc_c_min) {
3875 uint64_t evictable_memory =
3876 arc_mru->arcs_lsize[ARC_BUFC_DATA] +
3877 arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
3878 arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
3879 arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
3880 available_memory += MIN(evictable_memory, arc_size - arc_c_min);
3881 }
3882
3883 if (inflight_data > available_memory / 4) {
3884 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3885 return (SET_ERROR(ERESTART));
3886 }
3887 #endif
3888 return (0);
3889 }
3890
3891 void
3892 arc_tempreserve_clear(uint64_t reserve)
3893 {
3894 atomic_add_64(&arc_tempreserve, -reserve);
3895 ASSERT((int64_t)arc_tempreserve >= 0);
3896 }
3897
3898 int
3899 arc_tempreserve_space(uint64_t reserve, uint64_t txg)
3900 {
3901 int error;
3902 uint64_t anon_size;
3903
3904 #ifdef ZFS_DEBUG
3905 /*
3906 * Once in a while, fail for no reason. Everything should cope.
3907 */
3908 if (spa_get_random(10000) == 0) {
3909 dprintf("forcing random failure\n");
3910 return (SET_ERROR(ERESTART));
3911 }
3912 #endif
3913 if (reserve > arc_c/4 && !arc_no_grow)
3914 arc_c = MIN(arc_c_max, reserve * 4);
3915 if (reserve > arc_c)
3916 return (SET_ERROR(ENOMEM));
3917
3918 /*
3919 * Don't count loaned bufs as in flight dirty data to prevent long
3920 * network delays from blocking transactions that are ready to be
3921 * assigned to a txg.
3922 */
3923 anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
3924
3925 /*
3926 * Writes will almost always require additional memory allocations
3927 * in order to compress/encrypt/etc. the data. We therefore need to
3928 * make sure that there is sufficient available memory for this.
3929 */
3930 if (error = arc_memory_throttle(reserve, anon_size, txg))
3931 return (error);
3932
3933 /*
3934 * Throttle writes when the amount of dirty data in the cache
3935 * gets too large. We try to keep the cache less than half full
3936 * of dirty blocks so that our sync times don't grow too large.
3937 * Note: if two requests come in concurrently, we might let them
3938 * both succeed, when one of them should fail. Not a huge deal.
3939 */
3940
3941 if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
3942 anon_size > arc_c / 4) {
3943 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
3944 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
3945 arc_tempreserve>>10,
3946 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
3947 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
3948 reserve>>10, arc_c>>10);
3949 return (SET_ERROR(ERESTART));
3950 }
3951 atomic_add_64(&arc_tempreserve, reserve);
3952 return (0);
3953 }
3954
3955 void
3956 arc_init(void)
3957 {
3958 mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
3959 cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
3960
3961 /* Convert seconds to clock ticks */
3962 arc_min_prefetch_lifespan = 1 * hz;
3963
3964 /* Start out with 1/8 of all memory */
3965 arc_c = physmem * PAGESIZE / 8;
3966
3967 #ifdef _KERNEL
3968 /*
3969 * On architectures where the physical memory can be larger
3970 * than the addressable space (intel in 32-bit mode), we may
3971 * need to limit the cache to 1/8 of VM size.
3972 */
3973 arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
3974 #endif
3975
3976 /* set min cache to 1/32 of all memory, or 64MB, whichever is more */
3977 arc_c_min = MAX(arc_c / 4, 64<<20);
3978 /* set max to 3/4 of all memory, or all but 1GB, whichever is more */
3979 if (arc_c * 8 >= 1<<30)
3980 arc_c_max = (arc_c * 8) - (1<<30);
3981 else
3982 arc_c_max = arc_c_min;
3983 arc_c_max = MAX(arc_c * 6, arc_c_max);
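/*
 * Worked example (hypothetical 16 GB machine, for illustration): arc_c
 * starts at 16 GB / 8 = 2 GB, so arc_c_min = MAX(2 GB / 4, 64 MB) = 512 MB
 * and arc_c_max = MAX(2 GB * 6, 16 GB - 1 GB) = 15 GB.
 */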
3984
3985 /*
3986 * Allow the tunables to override our calculations if they are
3987 * reasonable (i.e. over 64MB)
3988 */
3989 if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
3990 arc_c_max = zfs_arc_max;
3991 if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max)
3992 arc_c_min = zfs_arc_min;
3993
3994 arc_c = arc_c_max;
3995 arc_p = (arc_c >> 1);
3996
3997 /* limit meta-data to 1/4 of the arc capacity */
3998 arc_meta_limit = arc_c_max / 4;
3999
4000 /* Allow the tunable to override if it is reasonable */
4001 if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
4002 arc_meta_limit = zfs_arc_meta_limit;
4003
4004 if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
4005 arc_c_min = arc_meta_limit / 2;
4006
4007 if (zfs_arc_grow_retry > 0)
4008 arc_grow_retry = zfs_arc_grow_retry;
4009
4010 if (zfs_arc_shrink_shift > 0)
4011 arc_shrink_shift = zfs_arc_shrink_shift;
4012
4013 if (zfs_arc_p_min_shift > 0)
4014 arc_p_min_shift = zfs_arc_p_min_shift;
4015
4016 /* if kmem_flags are set, let's try to use less memory */
4017 if (kmem_debugging())
4018 arc_c = arc_c / 2;
4019 if (arc_c < arc_c_min)
4020 arc_c = arc_c_min;
4021
4022 arc_anon = &ARC_anon;
4023 arc_mru = &ARC_mru;
4024 arc_mru_ghost = &ARC_mru_ghost;
4025 arc_mfu = &ARC_mfu;
4026 arc_mfu_ghost = &ARC_mfu_ghost;
4027 arc_l2c_only = &ARC_l2c_only;
4028 arc_size = 0;
4029
4030 mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
4031 mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
4032 mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
4033 mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
4034 mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
4035 mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
4036
4037 list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
4038 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4039 list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
4040 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4041 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
4042 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4043 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
4044 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4045 list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
4046 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4047 list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
4048 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4049 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
4050 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4051 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
4052 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4053 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
4054 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4055 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
4056 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4057
4058 buf_init();
4059
4060 arc_thread_exit = 0;
4061 arc_eviction_list = NULL;
4062 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
4063 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
4064
4065 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
4066 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
4067
4068 if (arc_ksp != NULL) {
4069 arc_ksp->ks_data = &arc_stats;
4070 kstat_install(arc_ksp);
4071 }
4072
4073 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
4074 TS_RUN, minclsyspri);
4075
4076 arc_dead = FALSE;
4077 arc_warm = B_FALSE;
4078
4079 if (zfs_write_limit_max == 0)
4080 zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
4081 else
4082 zfs_write_limit_shift = 0;
4083 mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
4084 }
4085
4086 void
4087 arc_fini(void)
4088 {
4089 mutex_enter(&arc_reclaim_thr_lock);
4090 arc_thread_exit = 1;
4091 while (arc_thread_exit != 0)
4092 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
4093 mutex_exit(&arc_reclaim_thr_lock);
4094
4095 arc_flush(NULL);
4096
4097 arc_dead = TRUE;
4098
4099 if (arc_ksp != NULL) {
4100 kstat_delete(arc_ksp);
4101 arc_ksp = NULL;
4102 }
4103
4104 mutex_destroy(&arc_eviction_mtx);
4105 mutex_destroy(&arc_reclaim_thr_lock);
4106 cv_destroy(&arc_reclaim_thr_cv);
4107
4108 list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
4109 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
4110 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
4111 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
4112 list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
4113 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
4114 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
4115 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
4116
4117 mutex_destroy(&arc_anon->arcs_mtx);
4118 mutex_destroy(&arc_mru->arcs_mtx);
4119 mutex_destroy(&arc_mru_ghost->arcs_mtx);
4120 mutex_destroy(&arc_mfu->arcs_mtx);
4121 mutex_destroy(&arc_mfu_ghost->arcs_mtx);
4122 mutex_destroy(&arc_l2c_only->arcs_mtx);
4123
4124 mutex_destroy(&zfs_write_limit_lock);
4125
4126 buf_fini();
4127
4128 ASSERT(arc_loaned_bytes == 0);
4129 }
4130
4131 /*
4132 * Level 2 ARC
4133 *
4134 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
4135 * It uses dedicated storage devices to hold cached data, which are populated
4136 * using large infrequent writes. The main role of this cache is to boost
4137 * the performance of random read workloads. The intended L2ARC devices
4138 * include short-stroked disks, solid state disks, and other media with
4139 * substantially faster read latency than disk.
4140 *
4141 * +-----------------------+
4142 * | ARC |
4143 * +-----------------------+
4144 * | ^ ^
4145 * | | |
4146 * l2arc_feed_thread() arc_read()
4147 * | | |
4148 * | l2arc read |
4149 * V | |
4150 * +---------------+ |
4151 * | L2ARC | |
4152 * +---------------+ |
4153 * | ^ |
4154 * l2arc_write() | |
4155 * | | |
4156 * V | |
4157 * +-------+ +-------+
4158 * | vdev | | vdev |
4159 * | cache | | cache |
4160 * +-------+ +-------+
4161 * +=========+ .-----.
4162 * : L2ARC : |-_____-|
4163 * : devices : | Disks |
4164 * +=========+ `-_____-'
4165 *
4166 * Read requests are satisfied from the following sources, in order:
4167 *
4168 * 1) ARC
4169 * 2) vdev cache of L2ARC devices
4170 * 3) L2ARC devices
4171 * 4) vdev cache of disks
4172 * 5) disks
4173 *
4174 * Some L2ARC device types exhibit extremely slow write performance.
4175 * To accommodate this, there are some significant differences between
4176 * the L2ARC and traditional cache design:
4177 *
4178 * 1. There is no eviction path from the ARC to the L2ARC. Evictions from
4179 * the ARC behave as usual, freeing buffers and placing headers on ghost
4180 * lists. The ARC does not send buffers to the L2ARC during eviction as
4181 * this would add inflated write latencies for all ARC memory pressure.
4182 *
4183 * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
4184 * It does this by periodically scanning buffers from the eviction-end of
4185 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
4186 * not already there. It scans until a headroom of buffers is satisfied,
4187 * which itself is a buffer for ARC eviction. If a compressible buffer is
4188 * found during scanning and selected for writing to an L2ARC device, we
4189 * temporarily boost scanning headroom during the next scan cycle to make
4190 * sure we adapt to compression effects (which might significantly reduce
4191 * the data volume we write to L2ARC). The thread that does this is
4192 * l2arc_feed_thread(), illustrated below; example sizes are included to
4193 * provide a better sense of ratio than this diagram:
4194 *
4195 * head --> tail
4196 * +---------------------+----------+
4197 * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC
4198 * +---------------------+----------+ | o L2ARC eligible
4199 * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer
4200 * +---------------------+----------+ |
4201 * 15.9 Gbytes ^ 32 Mbytes |
4202 * headroom |
4203 * l2arc_feed_thread()
4204 * |
4205 * l2arc write hand <--[oooo]--'
4206 * | 8 Mbyte
4207 * | write max
4208 * V
4209 * +==============================+
4210 * L2ARC dev |####|#|###|###| |####| ... |
4211 * +==============================+
4212 * 32 Gbytes
4213 *
4214 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
4215 * evicted, then the L2ARC has cached a buffer much sooner than it probably
4216 * needed to, potentially wasting L2ARC device bandwidth and storage. It is
4217 * safe to say that this is an uncommon case, since buffers at the end of
4218 * the ARC lists have moved there due to inactivity.
4219 *
4220 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
4221 * then the L2ARC simply misses copying some buffers. This serves as a
4222 * pressure valve to prevent heavy read workloads from both stalling the ARC
4223 * with waits and clogging the L2ARC with writes. This also helps prevent
4224 * the potential for the L2ARC to churn if it attempts to cache content too
4225 * quickly, such as during backups of the entire pool.
4226 *
4227 * 5. After system boot and before the ARC has filled main memory, there are
4228 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
4229 * lists can remain mostly static. Instead of searching from tail of these
4230 * lists as pictured, the l2arc_feed_thread() will search from the list heads
4231 * for eligible buffers, greatly increasing its chance of finding them.
4232 *
4233 * The L2ARC device write speed is also boosted during this time so that
4234 * the L2ARC warms up faster. Since there have been no ARC evictions yet,
4235 * there are no L2ARC reads, and no fear of degrading read performance
4236 * through increased writes.
4237 *
4238 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
4239 * the vdev queue can aggregate them into larger and fewer writes. Each
4240 * device is written to in a rotor fashion, sweeping writes through
4241 * available space then repeating.
4242 *
4243 * 7. The L2ARC does not store dirty content. It never needs to flush
4244 * write buffers back to disk based storage.
4245 *
4246 * 8. If an ARC buffer is written (and dirtied) which also exists in the
4247 * L2ARC, the now stale L2ARC buffer is immediately dropped.
4248 *
4249 * The performance of the L2ARC can be tweaked by a number of tunables, which
4250 * may be necessary for different workloads:
4251 *
4252 * l2arc_write_max max write bytes per interval
4253 * l2arc_write_boost extra write bytes during device warmup
4254 * l2arc_noprefetch skip caching prefetched buffers
4255 * l2arc_headroom number of max device writes to precache
4256 * l2arc_headroom_boost when we find compressed buffers during ARC
4257 * scanning, we multiply headroom by this
4258 * percentage factor for the next scan cycle,
4259 * since more compressed buffers are likely to
4260 * be present
4261 * l2arc_feed_secs seconds between L2ARC writing
4262 *
4263 * Tunables may be removed or added as future performance improvements are
4264 * integrated, and also may become zpool properties.
4265 *
4266 * There are three key functions that control how the L2ARC warms up:
4267 *
4268 * l2arc_write_eligible() check if a buffer is eligible to cache
4269 * l2arc_write_size() calculate how much to write
4270 * l2arc_write_interval() calculate sleep delay between writes
4271 *
4272 * These three functions determine what to write, how much, and how quickly
4273 * to send writes.
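 *
 * A simplified sketch of how one feed cycle ties these together (see
 * l2arc_feed_thread() later in this file; locking, CPR handling and error
 * paths are omitted, so this is illustrative rather than exact):
 *
 *	begin = ddi_get_lbolt();
 *	dev = l2arc_dev_get_next();	pick a device, config lock held
 *	spa = dev->l2ad_spa;
 *	size = l2arc_write_size();	how many bytes to try to write
 *	wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
 *	next = l2arc_write_interval(begin, size, wrote);
 *	sleep until `next', then repeat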
4274 *
4275 * L2ARC persistency:
4276 *
4277 * When writing buffers to L2ARC, we periodically add some metadata to
4278 * make sure we can pick them up after reboot, thus dramatically reducing
4279 * the impact that any downtime has on the performance of storage systems
4280 * with large caches.
4281 *
4282 * The implementation works fairly simply by integrating the following two
4283 * modifications:
4284 *
4285 * *) Every now and then, at end of an L2ARC feed cycle, we append a piece
4286 * of metadata (called a "pbuf", or "persistency buffer") to the L2ARC
4287 * write. This allows us to understand what's been written, so that
4288 * we can rebuild the arc_buf_hdr_t structures of the main ARC buffers.
4289 * The pbuf also includes a "back-reference" pointer to the previous
4290 * pbuf, forming a linked list of pbufs on the L2ARC device.
4291 *
4292 * *) We reserve 4k of space at the start of each L2ARC device for our
4293 * header bookkeeping purposes. This contains a single 4k uberblock, which
4294 * contains our top-level reference structures. We update it on each pbuf
4295 * write. If this write results in an inconsistent uberblock (e.g. due to
4296 * power failure), we detect this by verifying the uberblock's checksum
4297 * and simply drop the entries from L2ARC. Once an L2ARC pbuf update
4298 * completes, we update the uberblock to point to it.
4299 *
4300 * Implementation diagram:
4301 *
4302 * +=== L2ARC device (not to scale) ======================================+
4303 * | ____________newest pbuf pointer_____________ |
4304 * | / \ |
4305 * | / V |
4306 * ||l2uberblock|---|bufs|pbuf|bufs|pbuf|bufs|pbuf|bufs|pbuf|---(empty)---|
4307 * | ^ / ^ / ^ / |
4308 * | `-prev-' `-prev-' `-prev-' |
4309 * | pbuf pbuf pbuf |
4310 * +======================================================================+
4311 *
4312 * On-device data structures:
4313 *
4314 * (L2ARC persistent uberblock)
4315 * struct l2uberblock {
4316 * (these fields are in network byte order)
4317 * uint32_t magic = 0x12bab10c; l2-ber-block
4318 * uint8_t version = 0x1;
4319 * uint8_t reserved = 0x0;
4320 * uint16_t ublk_flags; see l2uberblock_flags_t
4321 *
4322 * (byte order of fields below determined by `ublk_flags')
4323 * uint64_t spa_guid; what pool this l2arc dev belongs to
4324 * uint64_t birth_txg; ublk with highest birth_txg is newest
4325 * uint64_t evict_tail; current evict pointer on l2arc dev
4326 * uint64_t alloc_space; how much space is alloc'd on the dev
4327 * uint64_t pbuf_daddr; dev addr of the newest l2pbuf_t
4328 * uint32_t pbuf_asize; size of newest pbuf
4329 * uint64_t pbuf_cksum[4]; fletcher4 of newest pbuf
4330 *
4331 * uint8_t reserved[3996] = {0x0, 0x0, ... 0x0};
4332 *
4333 * uint64_t ublk_cksum[4] = fletcher4(of the 4064 bytes above);
4334 * } l2dev_uberblock;
4335 *
4336 * (L2ARC persistent buffer list)
4337 * typedef struct l2pbuf_t {
4338 * (these fields are in network byte order)
4339 * uint32_t magic = 0xdb0faba6; the-buffer-bag
4340 * uint8_t version = 0x1;
4341 * uint8_t reserved = 0x0;
4342 * uint16_t pbuf_flags; see l2pbuf_flags_t
4343 *
4344 * (byte order of fields below determined by `pbuf_flags')
4345 * uint64_t prev_pbuf_daddr; previous pbuf dev addr
4346 * uint32_t prev_pbuf_asize; previous pbuf size
4347 * uint64_t prev_pbuf_cksum[4]; fletcher4(of previous pbuf)
4348 *
4349 * uint32_t items_size; uncompressed size of `items' below
4350 * (if (pbuf_flags & compress) decompress `items' prior to decoding)
4351 * struct l2pbuf_buf_item {
4352 * (these fields mirror [l2]arc_buf_hdr fields)
4353 * uint64_t dva[2]; buffer's DVA
4354 * uint64_t birth; buffer's birth TXG in ARC
4355 * uint64_t cksum0; lower 64-bits of buffer's cksum
4356 * uint64_t freeze_cksum[4]; buffer's freeze cksum
4357 * uint32_t size; uncompressed buffer data size
4358 * uint64_t l2daddr; device address (offset) of buf
4359 * uint32_t l2asize; actual space occupied by buf
4360 * uint8_t compress; compress algo used on data
4361 * uint8_t contents_type; buffer's contents type
4362 * uint16_t reserved = 0x0; for alignment and future use
4363 * uint32_t flags; buffer's persistent flags
4364 * } items[]; continues for remainder of pbuf
4365 * } l2pbuf_t;
4366 *
4367 * L2ARC reconstruction:
4368 *
4369 * When writing data, we simply write in the standard rotary fashion,
4370 * evicting buffers as we go and simply writing new data over them (appending
4371 * an updated l2pbuf_t every now and then). This obviously means that once we
4372 * loop around the end of the device, we will start cutting into an already
4373 * committed l2pbuf (and its referenced data buffers), like so:
4374 *
4375 * current write head__ __old tail
4376 * \ /
4377 * V V
4378 * <--|bufs|pbuf|bufs|pbuf| |bufs|pbuf|bufs|pbuf|-->
4379 * ^ ^^^^^^^^^_____________________________
4380 * | \
4381 * <<nextwrite>> - will overwrite this pbuf --/
4382 *
4383 * When importing the pool, we detect this situation and use it to stop
4384 * our scanning process:
4385 * 1) Let `this_pbuf' refer to the current l2pbuf_t and `prev_pbuf' to the
4386 * previous one.
4387 * 2) if (fletcher4(prev_pbuf) != this_pbuf->prev_pbuf_cksum)
4388 * then the pbuf is invalid and stop scanning (goto step 3 below).
4389 * 3) if (this is the last valid pbuf)
4390 * discard this pbuf as well (its ARC bufs may have been damaged by a
4391 * partial overwrite).
4392 * (We could potentially salvage the remaining good arc bufs above in step 3,
4393 * but the cost of doing so probably outweighs the value of the entire pbuf).
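 *
 * A sketch of this backward walk (illustrative pseudo-code only; the
 * helper and variable names, e.g. read_pbuf() and restore_arc_hdrs(),
 * are hypothetical and do not match the actual rebuild code):
 *
 *	this_pbuf = read_pbuf(ublk->pbuf_daddr, ublk->pbuf_asize);
 *	verify fletcher4(this_pbuf) == ublk->pbuf_cksum;
 *	for (;;) {
 *		prev_pbuf = read_pbuf(this_pbuf->prev_pbuf_daddr,
 *		    this_pbuf->prev_pbuf_asize);
 *		if (fletcher4(prev_pbuf) != this_pbuf->prev_pbuf_cksum)
 *			break;	chain cut by the write hand; this_pbuf
 *				is also dropped (step 3 above)
 *		restore_arc_hdrs(this_pbuf->items);
 *		this_pbuf = prev_pbuf;
 *	}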
4394 *
4395 * There is one significant caveat to consider when rebuilding ARC contents
4396 * from an L2ARC device: what about invalidated buffers? Given the above
4397 * construction, we cannot update pbufs which we've already written to amend
4398 * them to remove buffers which were invalidated. Thus, during reconstruction,
4399 * we might be populating the cache with buffers for data that's not on the
4400 * main pool anymore, or may have been overwritten!
4401 *
4402 * As it turns out, this isn't a problem. Every arc_read request includes
4403 * both the DVA and, crucially, the birth TXG of the BP the caller is
4404 * looking for. So even if the cache were populated by completely rotten
4405 * blocks for data that had been long deleted and/or overwritten, we'll
4406 * never actually return bad data from the cache, since the DVA together
4407 * with the birth TXG uniquely identifies a block in space and time - once
4408 * created, a block is immutable on disk. The worst we will have done is
4409 * waste some time and memory at l2arc rebuild to reconstruct outdated ARC
4410 * entries that will get dropped from the l2arc as it is being updated
4411 * with new blocks.
4412 */
4413
4414 static boolean_t
4415 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
4416 {
4417 /*
4418 * A buffer is *not* eligible for the L2ARC if it:
4419 * 1. belongs to a different spa.
4420 * 2. is already cached on the L2ARC.
4421 * 3. has an I/O in progress (it may be an incomplete read).
4422 * 4. is flagged not eligible (zfs property).
4423 */
4424 if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL ||
4425 HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab))
4426 return (B_FALSE);
4427
4428 return (B_TRUE);
4429 }
4430
4431 static uint64_t
4432 l2arc_write_size(void)
4433 {
4434 uint64_t size;
4435
4436 /*
4437 * Make sure our globals have meaningful values in case the user
4438 * altered them.
4439 */
4440 size = l2arc_write_max;
4441 if (size == 0) {
4442 cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
4443 "be greater than zero, resetting it to the default (%d)",
4444 L2ARC_WRITE_SIZE);
4445 size = l2arc_write_max = L2ARC_WRITE_SIZE;
4446 }
4447
4448 if (arc_warm == B_FALSE)
4449 size += l2arc_write_boost;
4450
4451 return (size);
4453 }
4454
4455 static clock_t
4456 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
4457 {
4458 clock_t interval, next, now;
4459
4460 /*
4461 * If the ARC lists are busy, increase our write rate; if the
4462 * lists are stale, idle back. This is achieved by checking
4463 * how much we previously wrote - if it was more than half of
4464 * what we wanted, schedule the next write much sooner.
4465 */
4466 if (l2arc_feed_again && wrote > (wanted / 2))
4467 interval = (hz * l2arc_feed_min_ms) / 1000;
4468 else
4469 interval = hz * l2arc_feed_secs;
4470
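/*
 * Schedule the next write `interval' ticks after the previous one began,
 * but never in the past: since began <= now, the MIN below reduces to
 * began + interval, and the MAX clamps the result to at least now.
 */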
4471 now = ddi_get_lbolt();
4472 next = MAX(now, MIN(now + interval, began + interval));
4473
4474 return (next);
4475 }
4476
4477 static void
4478 l2arc_hdr_stat_add(boolean_t from_arc)
4479 {
4480 ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
4481 if (from_arc)
4482 ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4483 }
4484
4485 static void
4486 l2arc_hdr_stat_remove(void)
4487 {
4488 ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
4489 ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
4490 }
4491
4492 /*
4493 * Cycle through L2ARC devices. This is how L2ARC load balances.
4494 * If a device is returned, this also returns holding the spa config lock.
4495 */
4496 static l2arc_dev_t *
4497 l2arc_dev_get_next(void)
4498 {
4499 l2arc_dev_t *first, *next = NULL;
4500
4501 /*
4502 * Lock out the removal of spas (spa_namespace_lock), then removal
4503 * of cache devices (l2arc_dev_mtx). Once a device has been selected,
4504 * both locks will be dropped and a spa config lock held instead.
4505 */
4506 mutex_enter(&spa_namespace_lock);
4507 mutex_enter(&l2arc_dev_mtx);
4508
4509 /* if there are no vdevs, there is nothing to do */
4510 if (l2arc_ndev == 0)
4511 goto out;
4512
4513 first = NULL;
4514 next = l2arc_dev_last;
4515 do {
4516 /*
4517 * Loop around the list looking for a non-faulted vdev
4518 * and one that isn't currently doing an L2ARC rebuild.
4519 */
4520 if (next == NULL) {
4521 next = list_head(l2arc_dev_list);
4522 } else {
4523 next = list_next(l2arc_dev_list, next);
4524 if (next == NULL)
4525 next = list_head(l2arc_dev_list);
4526 }
4527
4528 /* if we have come back to the start, bail out */
4529 if (first == NULL)
4530 first = next;
4531 else if (next == first)
4532 break;
4533
4534 } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuilding);
4535
4536 /* if we were unable to find any usable vdevs, return NULL */
4537 if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuilding)
4538 next = NULL;
4539
4540 l2arc_dev_last = next;
4541
4542 out:
4543 mutex_exit(&l2arc_dev_mtx);
4544
4545 /*
4546 * Grab the config lock to prevent the 'next' device from being
4547 * removed while we are writing to it.
4548 */
4549 if (next != NULL)
4550 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4551 mutex_exit(&spa_namespace_lock);
4552
4553 return (next);
4554 }
4555
4556 /*
4557 * Free buffers that were tagged for destruction.
4558 */
4559 static void
4560 l2arc_do_free_on_write()
4561 {
4562 list_t *buflist;
4563 l2arc_data_free_t *df, *df_prev;
4564
4565 mutex_enter(&l2arc_free_on_write_mtx);
4566 buflist = l2arc_free_on_write;
4567
4568 for (df = list_tail(buflist); df; df = df_prev) {
4569 df_prev = list_prev(buflist, df);
4570 ASSERT(df->l2df_data != NULL);
4571 ASSERT(df->l2df_func != NULL);
4572 df->l2df_func(df->l2df_data, df->l2df_size);
4573 list_remove(buflist, df);
4574 kmem_free(df, sizeof (l2arc_data_free_t));
4575 }
4576
4577 mutex_exit(&l2arc_free_on_write_mtx);
4578 }
4579
4580 /*
4581 * A write to a cache device has completed. Update all headers to allow
4582 * reads from these buffers to begin.
4583 */
4584 static void
4585 l2arc_write_done(zio_t *zio)
4586 {
4587 l2arc_write_callback_t *cb;
4588 l2arc_dev_t *dev;
4589 list_t *buflist;
4590 arc_buf_hdr_t *head, *ab, *ab_prev;
4591 l2arc_buf_hdr_t *abl2;
4592 kmutex_t *hash_lock;
4593
4594 cb = zio->io_private;
4595 ASSERT(cb != NULL);
4596 dev = cb->l2wcb_dev;
4597 ASSERT(dev != NULL);
4598 head = cb->l2wcb_head;
4599 ASSERT(head != NULL);
4600 buflist = dev->l2ad_buflist;
4601 ASSERT(buflist != NULL);
4602 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4603 l2arc_write_callback_t *, cb);
4604
4605 if (zio->io_error != 0)
4606 ARCSTAT_BUMP(arcstat_l2_writes_error);
4607
4608 mutex_enter(&l2arc_buflist_mtx);
4609
4610 /*
4611 * All writes completed, or an error was hit.
4612 */
4613 for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4614 ab_prev = list_prev(buflist, ab);
4615 abl2 = ab->b_l2hdr;
4616
4617 /*
4618 * Release the temporary compressed buffer as soon as possible.
4619 */
4620 if (abl2->b_compress != ZIO_COMPRESS_OFF)
4621 l2arc_release_cdata_buf(ab);
4622
4623 hash_lock = HDR_LOCK(ab);
4624 if (!mutex_tryenter(hash_lock)) {
4625 /*
4626 * This buffer misses out. It may be in a stage
4627 * of eviction. Its ARC_L2_WRITING flag will be
4628 * left set, denying reads to this buffer.
4629 */
4630 ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4631 continue;
4632 }
4633
4634 if (zio->io_error != 0) {
4635 /*
4636 * Error - drop L2ARC entry.
4637 */
4638 list_remove(buflist, ab);
4639 ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4640 ab->b_l2hdr = NULL;
4641 kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4642 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4643 }
4644
4645 /*
4646 * Allow ARC to begin reads to this L2ARC entry.
4647 */
4648 ab->b_flags &= ~ARC_L2_WRITING;
4649
4650 mutex_exit(hash_lock);
4651 }
4652
4653 atomic_inc_64(&l2arc_writes_done);
4654 list_remove(buflist, head);
4655 kmem_cache_free(hdr_cache, head);
4656 mutex_exit(&l2arc_buflist_mtx);
4657
4658 l2arc_do_free_on_write();
4659
4660 if (cb->l2wcb_pbuf)
4661 kmem_free(cb->l2wcb_pbuf, cb->l2wcb_pbuf_size);
4662 if (cb->l2wcb_ub_buf)
4663 kmem_free(cb->l2wcb_ub_buf, L2UBERBLOCK_SIZE);
4664 kmem_free(cb, sizeof (l2arc_write_callback_t));
4665 }
4666
4667 /*
4668 * A read to a cache device completed. Validate buffer contents before
4669 * handing over to the regular ARC routines.
4670 */
4671 static void
4672 l2arc_read_done(zio_t *zio)
4673 {
4674 l2arc_read_callback_t *cb;
4675 arc_buf_hdr_t *hdr;
4676 arc_buf_t *buf;
4677 kmutex_t *hash_lock;
4678 int equal;
4679
4680 ASSERT(zio->io_vd != NULL);
4681 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4682
4683 spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
4684
4685 cb = zio->io_private;
4686 ASSERT(cb != NULL);
4687 buf = cb->l2rcb_buf;
4688 ASSERT(buf != NULL);
4689
4690 hash_lock = HDR_LOCK(buf->b_hdr);
4691 mutex_enter(hash_lock);
4692 hdr = buf->b_hdr;
4693 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4694
4695 /*
4696 * If the buffer was compressed, decompress it first.
4697 */
4698 if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
4699 l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
4700 ASSERT(zio->io_data != NULL);
4701
4702 /*
4703 * Check this survived the L2ARC journey.
4704 */
4705 equal = arc_cksum_equal(buf);
4706 if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
4707 mutex_exit(hash_lock);
4708 zio->io_private = buf;
4709 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
4710 zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */
4711 arc_read_done(zio);
4712 } else {
4713 mutex_exit(hash_lock);
4714 /*
4715 * Buffer didn't survive caching. Increment stats and
4716 * reissue to the original storage device.
4717 */
4718 if (zio->io_error != 0) {
4719 ARCSTAT_BUMP(arcstat_l2_io_error);
4720 } else {
4721 zio->io_error = SET_ERROR(EIO);
4722 }
4723 if (!equal)
4724 ARCSTAT_BUMP(arcstat_l2_cksum_bad);
4725
4726 /*
4727 * If there's no waiter, issue an async i/o to the primary
4728 * storage now. If there *is* a waiter, the caller must
4729 * issue the i/o in a context where it's OK to block.
4730 */
4731 if (zio->io_waiter == NULL) {
4732 zio_t *pio = zio_unique_parent(zio);
4733
4734 ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
4735
4736 zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
4737 buf->b_data, zio->io_size, arc_read_done, buf,
4738 zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
4739 }
4740 }
4741
4742 kmem_free(cb, sizeof (l2arc_read_callback_t));
4743 }
4744
4745 /*
4746 * This is the list priority from which the L2ARC will search for pages to
4747 * cache. This is used within loops (0..3) to cycle through lists in the
4748 * desired order. This order can have a significant effect on cache
4749 * performance.
4750 *
4751 * Currently the metadata lists are hit first, MFU then MRU, followed by
4752 * the data lists. This function returns a locked list, and also returns
4753 * the lock pointer.
4754 */
4755 static list_t *
4756 l2arc_list_locked(int list_num, kmutex_t **lock)
4757 {
4758 list_t *list = NULL;
4759
4760 ASSERT(list_num >= 0 && list_num <= 3);
4761
4762 switch (list_num) {
4763 case 0:
4764 list = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
4765 *lock = &arc_mfu->arcs_mtx;
4766 break;
4767 case 1:
4768 list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
4769 *lock = &arc_mru->arcs_mtx;
4770 break;
4771 case 2:
4772 list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
4773 *lock = &arc_mfu->arcs_mtx;
4774 break;
4775 case 3:
4776 list = &arc_mru->arcs_list[ARC_BUFC_DATA];
4777 *lock = &arc_mru->arcs_mtx;
4778 break;
4779 }
4780
4781 ASSERT(!(MUTEX_HELD(*lock)));
4782 mutex_enter(*lock);
4783 return (list);
4784 }
4785
4786 /*
4787 * Evict buffers from the device write hand to the distance specified in
4788 * bytes. This distance may span populated buffers, it may span nothing.
4789 * This is clearing a region on the L2ARC device ready for writing.
4790 * If the 'all' boolean is set, every buffer is evicted.
4791 */
4792 static void
4793 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4794 {
4795 list_t *buflist;
4796 l2arc_buf_hdr_t *abl2;
4797 arc_buf_hdr_t *ab, *ab_prev;
4798 kmutex_t *hash_lock;
4799 uint64_t taddr;
4800
4801 buflist = dev->l2ad_buflist;
4802
4803 if (buflist == NULL)
4804 return;
4805
4806 if (!all && dev->l2ad_first) {
4807 /*
4808 * This is the first sweep through the device. There is
4809 * nothing to evict.
4810 */
4811 return;
4812 }
4813
4814 if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4815 /*
4816 * When nearing the end of the device, evict to the end
4817 * before the device write hand jumps to the start.
4818 */
4819 taddr = dev->l2ad_end;
4820 } else {
4821 taddr = dev->l2ad_hand + distance;
4822 }
4823 DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4824 uint64_t, taddr, boolean_t, all);
4825
4826 top:
4827 mutex_enter(&l2arc_buflist_mtx);
4828 for (ab = list_tail(buflist); ab; ab = ab_prev) {
4829 ab_prev = list_prev(buflist, ab);
4830
4831 hash_lock = HDR_LOCK(ab);
4832 if (!mutex_tryenter(hash_lock)) {
4833 /*
4834 * Missed the hash lock. Retry.
4835 */
4836 ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
4837 mutex_exit(&l2arc_buflist_mtx);
4838 mutex_enter(hash_lock);
4839 mutex_exit(hash_lock);
4840 goto top;
4841 }
4842
4843 if (HDR_L2_WRITE_HEAD(ab)) {
4844 /*
4845 * We hit a write head node. Leave it for
4846 * l2arc_write_done().
4847 */
4848 list_remove(buflist, ab);
4849 mutex_exit(hash_lock);
4850 continue;
4851 }
4852
4853 if (!all && ab->b_l2hdr != NULL &&
4854 (ab->b_l2hdr->b_daddr > taddr ||
4855 ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
4856 /*
4857 * We've evicted to the target address,
4858 * or the end of the device.
4859 */
4860 mutex_exit(hash_lock);
4861 break;
4862 }
4863
4864 if (HDR_FREE_IN_PROGRESS(ab)) {
4865 /*
4866 * Already on the path to destruction.
4867 */
4868 mutex_exit(hash_lock);
4869 continue;
4870 }
4871
4872 if (ab->b_state == arc_l2c_only) {
4873 ASSERT(!HDR_L2_READING(ab));
4874 /*
4875 * This doesn't exist in the ARC. Destroy.
4876 * arc_hdr_destroy() will call list_remove()
4877 * and decrement arcstat_l2_size.
4878 */
4879 arc_change_state(arc_anon, ab, hash_lock);
4880 arc_hdr_destroy(ab);
4881 } else {
4882 /*
4883 * Invalidate issued or about to be issued
4884 * reads, since we may be about to write
4885 * over this location.
4886 */
4887 if (HDR_L2_READING(ab)) {
4888 ARCSTAT_BUMP(arcstat_l2_evict_reading);
4889 ab->b_flags |= ARC_L2_EVICTED;
4890 }
4891
4892 /*
4893 * Tell ARC this no longer exists in L2ARC.
4894 */
4895 if (ab->b_l2hdr != NULL) {
4896 abl2 = ab->b_l2hdr;
4897 ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4898 ab->b_l2hdr = NULL;
4899 kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4900 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4901 }
4902 list_remove(buflist, ab);
4903
4904 /*
4905 * This may have been leftover after a
4906 * failed write.
4907 */
4908 ab->b_flags &= ~ARC_L2_WRITING;
4909 }
4910 mutex_exit(hash_lock);
4911 }
4912 mutex_exit(&l2arc_buflist_mtx);
4913
4914 vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
4915 dev->l2ad_evict = taddr;
4916 }
4917
4918 /*
4919 * Find and write ARC buffers to the L2ARC device.
4920 *
4921 * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
4922 * for reading until they have completed writing.
4923 * The headroom_boost is an in-out parameter used to maintain headroom boost
4924 * state between calls to this function.
4925 *
4926 * Returns the number of bytes actually written (which may be smaller than
4927 * the delta by which the device hand has changed due to alignment).
4928 */
4929 static uint64_t
4930 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
4931 boolean_t *headroom_boost)
4932 {
4933 arc_buf_hdr_t *ab, *ab_prev, *head;
4934 list_t *list;
4935 uint64_t write_asize, write_psize, write_sz, headroom,
4936 buf_compress_minsz;
4937 void *buf_data;
4938 kmutex_t *list_lock;
4939 boolean_t full;
4940 l2arc_write_callback_t *cb;
4941 zio_t *pio, *wzio;
4942 uint64_t guid = spa_load_guid(spa);
4943 const boolean_t do_headroom_boost = *headroom_boost;
4944
4945 /* persistency-related */
4946 l2pbuf_t *pb;
4947 l2pbuf_buflist_t *pb_buflist;
4948 int num_bufs, buf_index;
4949
4950 ASSERT(dev->l2ad_vdev != NULL);
4951
4952 /* Lower the flag now, we might want to raise it again later. */
4953 *headroom_boost = B_FALSE;
4954
4955 pio = NULL;
4956 cb = NULL;
4957 write_sz = write_asize = write_psize = 0;
4958 full = B_FALSE;
4959 head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4960 head->b_flags |= ARC_L2_WRITE_HEAD;
4961
4962 /*
4963 * We will want to try to compress buffers that are at least 2x the
4964 * device sector size.
4965 */
4966 buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
4967
4968 pb = &dev->l2ad_pbuf;
4969 num_bufs = 0;
4970
4977 /*
4978 * Copy buffers for L2ARC writing.
4979 */
4980 mutex_enter(&l2arc_buflist_mtx);
4981 for (int try = 0; try <= 3; try++) {
4982 uint64_t passed_sz = 0;
4983
4984 list = l2arc_list_locked(try, &list_lock);
4985
4986 /*
4987 * L2ARC fast warmup.
4988 *
4989 * Until the ARC is warm and starts to evict, read from the
4990 * head of the ARC lists rather than the tail.
4991 */
4992 if (arc_warm == B_FALSE)
4993 ab = list_head(list);
4994 else
4995 ab = list_tail(list);
4996
4997 headroom = target_sz * l2arc_headroom;
4998 if (do_headroom_boost)
4999 headroom = (headroom * l2arc_headroom_boost) / 100;
5000
5001 for (; ab; ab = ab_prev) {
5002 l2arc_buf_hdr_t *l2hdr;
5003 kmutex_t *hash_lock;
5004 uint64_t buf_sz;
5005
5006 if (arc_warm == B_FALSE)
5007 ab_prev = list_next(list, ab);
5008 else
5009 ab_prev = list_prev(list, ab);
5010
5011 hash_lock = HDR_LOCK(ab);
5012 if (!mutex_tryenter(hash_lock)) {
5013 /*
5014 * Skip this buffer rather than waiting.
5015 */
5016 continue;
5017 }
5018
5019 passed_sz += ab->b_size;
5020 if (passed_sz > headroom) {
5021 /*
5022 * Searched too far.
5023 */
5024 mutex_exit(hash_lock);
5025 break;
5026 }
5027
5028 if (!l2arc_write_eligible(guid, ab)) {
5029 mutex_exit(hash_lock);
5030 continue;
5031 }
5032
5033 if ((write_sz + ab->b_size) > target_sz) {
5034 full = B_TRUE;
5035 mutex_exit(hash_lock);
5036 break;
5037 }
5038
5039 if (pio == NULL) {
5040 /*
5041 * Insert a dummy header on the buflist so
5042 * l2arc_write_done() can find where the
5043 * write buffers begin without searching.
5044 */
5045 list_insert_head(dev->l2ad_buflist, head);
5046
5047 cb = kmem_zalloc(
5048 sizeof (l2arc_write_callback_t), KM_SLEEP);
5049 cb->l2wcb_dev = dev;
5050 cb->l2wcb_head = head;
5051 pio = zio_root(spa, l2arc_write_done, cb,
5052 ZIO_FLAG_CANFAIL);
5053 }
5054
5055 /*
5056 * Create and add a new L2ARC header.
5057 */
5058 l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
5059 l2hdr->b_dev = dev;
5060 ab->b_flags |= ARC_L2_WRITING;
5061
5062 /*
5063 * Temporarily stash the data buffer in b_tmp_cdata.
5064 * The subsequent write step will pick it up from
5065 * there. This is because we can't access ab->b_buf
5066 * without holding the hash_lock, which we in turn
5067 * can't access without holding the ARC list locks
5068 * (which we want to avoid during compression/writing).
5069 */
5070 l2hdr->b_compress = ZIO_COMPRESS_OFF;
5071 l2hdr->b_asize = ab->b_size;
5072 l2hdr->b_tmp_cdata = ab->b_buf->b_data;
5073
5074 buf_sz = ab->b_size;
5075 ab->b_l2hdr = l2hdr;
5076
5077 list_insert_head(dev->l2ad_buflist, ab);
5078
5079 /*
5080 * Compute and store the buffer cksum before
5081 * writing. On debug the cksum is verified first.
5082 */
5083 arc_cksum_verify(ab->b_buf);
5084 arc_cksum_compute(ab->b_buf, B_TRUE);
5085
5086 mutex_exit(hash_lock);
5087
5088 write_sz += buf_sz;
5089 num_bufs++;
5090 }
5091
5092 mutex_exit(list_lock);
5093
5094 if (full == B_TRUE)
5095 break;
5096 }
5097
5098 /* No buffers selected for writing? */
5099 if (pio == NULL) {
5100 ASSERT0(write_sz);
5101 mutex_exit(&l2arc_buflist_mtx);
5102 kmem_cache_free(hdr_cache, head);
5103 return (0);
5104 }
5105
5106 /* expand the pbuf to include a new list */
5107 pb_buflist = l2arc_pbuf_buflist_alloc(pb, num_bufs);
5108
5109 /*
5110 * Now start writing the buffers. We're starting at the write head
5111 * and work backwards, retracing the course of the buffer selector
5112 * loop above.
5113 */
5114 for (ab = list_prev(dev->l2ad_buflist, head), buf_index = 0; ab;
5115 ab = list_prev(dev->l2ad_buflist, ab), buf_index++) {
5116 l2arc_buf_hdr_t *l2hdr;
5117 uint64_t buf_sz;
5118
5119 /*
5120 * We shouldn't need to lock the buffer here, since we flagged
5121 * it as ARC_L2_WRITING in the previous step, but we must take
5122 * care to only access its L2 cache parameters. In particular,
5123 * ab->b_buf may be invalid by now due to ARC eviction.
5124 */
5125 l2hdr = ab->b_l2hdr;
5126 l2hdr->b_daddr = dev->l2ad_hand;
5127
5128 if ((ab->b_flags & ARC_L2COMPRESS) &&
5129 l2hdr->b_asize >= buf_compress_minsz) {
5130 if (l2arc_compress_buf(l2hdr)) {
5131 /*
5132 * If compression succeeded, enable headroom
5133 * boost on the next scan cycle.
5134 */
5135 *headroom_boost = B_TRUE;
5136 }
5137 }
5138
5139 /*
5140 * Pick up the buffer data we had previously stashed away
5141 * (and now potentially also compressed).
5142 */
5143 buf_data = l2hdr->b_tmp_cdata;
5144 buf_sz = l2hdr->b_asize;
5145
5146 /* Compression may have squashed the buffer to zero length. */
5147 if (buf_sz != 0) {
5148 uint64_t buf_p_sz;
5149
5150 wzio = zio_write_phys(pio, dev->l2ad_vdev,
5151 dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
5152 NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
5153 ZIO_FLAG_CANFAIL, B_FALSE);
5154
5155 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
5156 zio_t *, wzio);
5157 (void) zio_nowait(wzio);
5158
5159 write_asize += buf_sz;
5160 /*
5161 * Keep the clock hand suitably device-aligned.
5162 */
5163 buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
5164 write_psize += buf_p_sz;
5165 dev->l2ad_hand += buf_p_sz;
5166 }
5167
5168 l2arc_pbuflist_insert(pb, pb_buflist, ab, buf_index);
5169 }
5170 ASSERT(buf_index == num_bufs);
5171 mutex_exit(&l2arc_buflist_mtx);
5172
5173 ASSERT3U(write_asize, <=, target_sz);
5174 ARCSTAT_BUMP(arcstat_l2_writes_sent);
5175 ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
5176 ARCSTAT_INCR(arcstat_l2_size, write_sz);
5177 ARCSTAT_INCR(arcstat_l2_asize, write_asize);
5178 vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
5179
5180 /* Is it time to commit this pbuf? */
5181 if (L2PBUF_IS_FULL(pb) &&
5182 dev->l2ad_hand + L2PBUF_ENCODED_SIZE(pb) < dev->l2ad_end) {
5183 l2arc_pbuf_commit(dev, pio, cb);
5184 l2arc_pbuf_destroy(pb);
5185 l2arc_pbuf_init(pb);
5186 }
5187
5188 /*
5189 * Bump device hand to the device start if it is approaching the end.
5190 * l2arc_evict() will already have evicted ahead for this case.
5191 */
5192 if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
5193 vdev_space_update(dev->l2ad_vdev,
5194 dev->l2ad_end - dev->l2ad_hand, 0, 0);
5195 dev->l2ad_hand = dev->l2ad_start;
5196 dev->l2ad_evict = dev->l2ad_start;
5197 dev->l2ad_first = B_FALSE;
5198 }
5199
5200 dev->l2ad_writing = B_TRUE;
5201 (void) zio_wait(pio);
5202 dev->l2ad_writing = B_FALSE;
5203
5204 return (write_asize);
5205 }
5206
5207 /*
5208 * Compresses an L2ARC buffer.
5209 * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
5210 * size in l2hdr->b_asize. This routine tries to compress the data and
5211 * depending on the compression result there are three possible outcomes:
5212 * *) The buffer was incompressible. The original l2hdr contents were left
5213 * untouched and are ready for writing to an L2 device.
5214 * *) The buffer was all-zeros, so there is no need to write it to an L2
5215 * device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
5216 * set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
5217 * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
5218 * data buffer which holds the compressed data to be written, and b_asize
5219 * tells us how much data there is. b_compress is set to the appropriate
5220 * compression algorithm. Once writing is done, invoke
5221 * l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
5222 *
5223 * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
5224 * buffer was incompressible).
5225 */
5226 static boolean_t
5227 l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
5228 {
5229 void *cdata;
5230 size_t csize, len;
5231
5232 ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
5233 ASSERT(l2hdr->b_tmp_cdata != NULL);
5234
5235 len = l2hdr->b_asize;
5236 cdata = zio_data_buf_alloc(len);
5237 csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
5238 cdata, l2hdr->b_asize);
5239
5240 if (csize == 0) {
5241 /* zero block, indicate that there's nothing to write */
5242 zio_data_buf_free(cdata, len);
5243 l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
5244 l2hdr->b_asize = 0;
5245 l2hdr->b_tmp_cdata = NULL;
5246 ARCSTAT_BUMP(arcstat_l2_compress_zeros);
5247 return (B_TRUE);
5248 } else if (csize > 0 && csize < len) {
5249 /*
5250 * Compression succeeded, we'll keep the cdata around for
5251 * writing and release it afterwards.
5252 */
5253 l2hdr->b_compress = ZIO_COMPRESS_LZ4;
5254 l2hdr->b_asize = csize;
5255 l2hdr->b_tmp_cdata = cdata;
5256 ARCSTAT_BUMP(arcstat_l2_compress_successes);
5257 return (B_TRUE);
5258 } else {
5259 /*
5260 * Compression failed, release the compressed buffer.
5261 * l2hdr will be left unmodified.
5262 */
5263 zio_data_buf_free(cdata, len);
5264 ARCSTAT_BUMP(arcstat_l2_compress_failures);
5265 return (B_FALSE);
5266 }
5267 }
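
/*
 * For reference, a summary of the l2hdr state after l2arc_compress_buf()
 * returns, derived from the three outcomes documented above (informational
 * only, no additional behavior is implied):
 *
 *	outcome		b_compress		b_asize		b_tmp_cdata
 *	incompressible	ZIO_COMPRESS_OFF	unchanged	original data
 *	all-zeros	ZIO_COMPRESS_EMPTY	0		NULL
 *	compressed	ZIO_COMPRESS_LZ4	compressed size	temporary buffer
 */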
5268
5269 /*
5270 * Decompresses a zio read back from an l2arc device. On success, the
5271 * underlying zio's io_data buffer is overwritten by the uncompressed
5272 * version. On decompression error (corrupt compressed stream), the
5273 * zio->io_error value is set to signal an I/O error.
5274 *
5275 * Please note that the compressed data stream is not checksummed, so
5276 * if the underlying device is experiencing data corruption, we may feed
5277 * corrupt data to the decompressor, so the decompressor needs to be
5278 * able to handle this situation (LZ4 does).
5279 */
5280 static void
5281 l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
5282 {
5283 ASSERT(L2ARC_IS_VALID_COMPRESS(c));
5284
5285 if (zio->io_error != 0) {
5286 /*
		 * An io error has occurred, just restore the original io
5288 * size in preparation for a main pool read.
5289 */
5290 zio->io_orig_size = zio->io_size = hdr->b_size;
5291 return;
5292 }
5293
5294 if (c == ZIO_COMPRESS_EMPTY) {
5295 /*
5296 * An empty buffer results in a null zio, which means we
5297 * need to fill its io_data after we're done restoring the
5298 * buffer's contents.
5299 */
5300 ASSERT(hdr->b_buf != NULL);
5301 bzero(hdr->b_buf->b_data, hdr->b_size);
5302 zio->io_data = zio->io_orig_data = hdr->b_buf->b_data;
5303 } else {
5304 ASSERT(zio->io_data != NULL);
5305 /*
5306 * We copy the compressed data from the start of the arc buffer
5307 * (the zio_read will have pulled in only what we need, the
5308 * rest is garbage which we will overwrite at decompression)
5309 * and then decompress back to the ARC data buffer. This way we
5310 * can minimize copying by simply decompressing back over the
5311 * original compressed data (rather than decompressing to an
5312 * aux buffer and then copying back the uncompressed buffer,
5313 * which is likely to be much larger).
5314 */
5315 uint64_t csize;
5316 void *cdata;
5317
5318 csize = zio->io_size;
5319 cdata = zio_data_buf_alloc(csize);
5320 bcopy(zio->io_data, cdata, csize);
5321 if (zio_decompress_data(c, cdata, zio->io_data, csize,
5322 hdr->b_size) != 0)
5323 zio->io_error = EIO;
5324 zio_data_buf_free(cdata, csize);
5325 }
5326
5327 /* Restore the expected uncompressed IO size. */
5328 zio->io_orig_size = zio->io_size = hdr->b_size;
5329 }
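
/*
 * Schematic of the in-place decompression scheme used above (an
 * illustration of the comment inside the function, not additional logic):
 *
 *	io_data:  [ csize compressed bytes | garbage up to hdr->b_size ]
 *	  1) copy the first csize bytes aside into a temporary cdata buffer
 *	  2) zio_decompress_data(cdata -> io_data, csize -> hdr->b_size)
 *	  3) free cdata; io_data now holds hdr->b_size uncompressed bytes
 */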
5330
5331 /*
5332 * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
5333 * This buffer serves as a temporary holder of compressed data while
5334 * the buffer entry is being written to an l2arc device. Once that is
5335 * done, we can dispose of it.
5336 */
5337 static void
5338 l2arc_release_cdata_buf(arc_buf_hdr_t *ab)
5339 {
5340 l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
5341
5342 if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) {
5343 /*
5344 * If the data was compressed, then we've allocated a
5345 * temporary buffer for it, so now we need to release it.
5346 */
5347 ASSERT(l2hdr->b_tmp_cdata != NULL);
5348 zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size);
5349 }
5350 l2hdr->b_tmp_cdata = NULL;
5351 }
5352
5353 /*
5354 * This thread feeds the L2ARC at regular intervals. This is the beating
5355 * heart of the L2ARC.
5356 */
5357 static void
5358 l2arc_feed_thread(void)
5359 {
5360 callb_cpr_t cpr;
5361 l2arc_dev_t *dev;
5362 spa_t *spa;
5363 uint64_t size, wrote;
5364 clock_t begin, next = ddi_get_lbolt();
5365 boolean_t headroom_boost = B_FALSE;
5366
5367 CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
5368
5369 mutex_enter(&l2arc_feed_thr_lock);
5370
5371 while (l2arc_thread_exit == 0) {
5372 CALLB_CPR_SAFE_BEGIN(&cpr);
5373 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
5374 next);
5375 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
5376 next = ddi_get_lbolt() + hz;
5377
5378 /*
5379 * Quick check for L2ARC devices.
5380 */
5381 mutex_enter(&l2arc_dev_mtx);
5382 if (l2arc_ndev == 0) {
5383 mutex_exit(&l2arc_dev_mtx);
5384 continue;
5385 }
5386 mutex_exit(&l2arc_dev_mtx);
5387 begin = ddi_get_lbolt();
5388
5389 /*
5390 * This selects the next l2arc device to write to, and in
5391 * doing so the next spa to feed from: dev->l2ad_spa. This
5392 * will return NULL if there are now no l2arc devices or if
5393 * they are all faulted.
5394 *
5395 * If a device is returned, its spa's config lock is also
5396 * held to prevent device removal. l2arc_dev_get_next()
5397 * will grab and release l2arc_dev_mtx.
5398 */
5399 if ((dev = l2arc_dev_get_next()) == NULL)
5400 continue;
5401
5402 spa = dev->l2ad_spa;
5403 ASSERT(spa != NULL);
5404
5405 /*
5406 * If the pool is read-only then force the feed thread to
5407 * sleep a little longer.
5408 */
5409 if (!spa_writeable(spa)) {
5410 next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
5411 spa_config_exit(spa, SCL_L2ARC, dev);
5412 continue;
5413 }
5414
5415 /*
5416 * Avoid contributing to memory pressure.
5417 */
5418 if (arc_reclaim_needed()) {
5419 ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
5420 spa_config_exit(spa, SCL_L2ARC, dev);
5421 continue;
5422 }
5423
5424 ARCSTAT_BUMP(arcstat_l2_feeds);
5425
5426 size = l2arc_write_size();
5427
5428 /*
5429 * Evict L2ARC buffers that will be overwritten.
5430 */
5431 l2arc_evict(dev, size, B_FALSE);
5432
5433 /*
5434 * Write ARC buffers.
5435 */
5436 wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
5437
5438 /*
5439 * Calculate interval between writes.
5440 */
5441 next = l2arc_write_interval(begin, size, wrote);
5442 spa_config_exit(spa, SCL_L2ARC, dev);
5443 }
5444
5445 l2arc_thread_exit = 0;
5446 cv_broadcast(&l2arc_feed_thr_cv);
5447 CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */
5448 thread_exit();
5449 }
5450
5451 boolean_t
5452 l2arc_vdev_present(vdev_t *vd)
5453 {
5454 l2arc_dev_t *dev;
5455
5456 mutex_enter(&l2arc_dev_mtx);
5457 for (dev = list_head(l2arc_dev_list); dev != NULL;
5458 dev = list_next(l2arc_dev_list, dev)) {
5459 if (dev->l2ad_vdev == vd)
5460 break;
5461 }
5462 mutex_exit(&l2arc_dev_mtx);
5463
5464 return (dev != NULL);
5465 }
5466
5467 /*
5468 * Add a vdev for use by the L2ARC. By this point the spa has already
5469 * validated the vdev and opened it. The `rebuild' flag indicates whether
5470 * we should attempt an L2ARC persistency rebuild.
5471 */
5472 void
5473 l2arc_add_vdev(spa_t *spa, vdev_t *vd, boolean_t rebuild)
5474 {
5475 l2arc_dev_t *adddev;
5476
5477 ASSERT(!l2arc_vdev_present(vd));
5478
5479 /*
5480 * Create a new l2arc device entry.
5481 */
5482 adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
5483 adddev->l2ad_spa = spa;
5484 adddev->l2ad_vdev = vd;
5485 adddev->l2ad_start = VDEV_LABEL_START_SIZE + L2UBERBLOCK_SIZE;
5486 adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
5487 adddev->l2ad_hand = adddev->l2ad_start;
5488 adddev->l2ad_evict = adddev->l2ad_start;
5489 adddev->l2ad_first = B_TRUE;
5490 adddev->l2ad_writing = B_FALSE;
5491 l2arc_pbuf_init(&adddev->l2ad_pbuf);
5492
5493 /*
5494 * This is a list of all ARC buffers that are still valid on the
5495 * device.
5496 */
5497 adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
5498 list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
5499 offsetof(arc_buf_hdr_t, b_l2node));
5500
5501 vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
5502
5503 /*
5504 * Add device to global list
5505 */
5506 mutex_enter(&l2arc_dev_mtx);
5507 list_insert_head(l2arc_dev_list, adddev);
5508 atomic_inc_64(&l2arc_ndev);
5509 if (rebuild && l2arc_rebuild_enabled) {
5510 adddev->l2ad_rebuilding = B_TRUE;
5511 (void) thread_create(NULL, 0, l2arc_rebuild_start, adddev,
5512 0, &p0, TS_RUN, minclsyspri);
5513 }
5514 mutex_exit(&l2arc_dev_mtx);
5515 }
5516
5517 /*
5518 * Remove a vdev from the L2ARC.
5519 */
5520 void
5521 l2arc_remove_vdev(vdev_t *vd)
5522 {
5523 l2arc_dev_t *dev, *nextdev, *remdev = NULL;
5524
5525 /*
5526 * Find the device by vdev
5527 */
5528 mutex_enter(&l2arc_dev_mtx);
5529 for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
5530 nextdev = list_next(l2arc_dev_list, dev);
5531 if (vd == dev->l2ad_vdev) {
5532 remdev = dev;
5533 break;
5534 }
5535 }
5536 ASSERT(remdev != NULL);
5537
5538 /*
5539 * Remove device from global list
5540 */
5541 list_remove(l2arc_dev_list, remdev);
5542 l2arc_dev_last = NULL; /* may have been invalidated */
5543 atomic_dec_64(&l2arc_ndev);
5544 mutex_exit(&l2arc_dev_mtx);
5545
5546 /*
5547 * Clear all buflists and ARC references. L2ARC device flush.
5548 */
5549 l2arc_pbuf_destroy(&remdev->l2ad_pbuf);
5550 l2arc_evict(remdev, 0, B_TRUE);
5551 list_destroy(remdev->l2ad_buflist);
5552 kmem_free(remdev->l2ad_buflist, sizeof (list_t));
5553 kmem_free(remdev, sizeof (l2arc_dev_t));
5554 }
5555
5556 void
5557 l2arc_init(void)
5558 {
5559 l2arc_thread_exit = 0;
5560 l2arc_ndev = 0;
5561 l2arc_writes_sent = 0;
5562 l2arc_writes_done = 0;
5563
5564 mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
5565 cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
5566 mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
5567 mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
5568 mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
5569
5570 l2arc_dev_list = &L2ARC_dev_list;
5571 l2arc_free_on_write = &L2ARC_free_on_write;
5572 list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
5573 offsetof(l2arc_dev_t, l2ad_node));
5574 list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
5575 offsetof(l2arc_data_free_t, l2df_list_node));
5576 }
5577
5578 void
5579 l2arc_fini(void)
5580 {
5581 /*
5582 * This is called from dmu_fini(), which is called from spa_fini();
5583 * Because of this, we can assume that all l2arc devices have
5584 * already been removed when the pools themselves were removed.
5585 */
5586
5587 l2arc_do_free_on_write();
5588
5589 mutex_destroy(&l2arc_feed_thr_lock);
5590 cv_destroy(&l2arc_feed_thr_cv);
5591 mutex_destroy(&l2arc_dev_mtx);
5592 mutex_destroy(&l2arc_buflist_mtx);
5593 mutex_destroy(&l2arc_free_on_write_mtx);
5594
5595 list_destroy(l2arc_dev_list);
5596 list_destroy(l2arc_free_on_write);
5597 }
5598
5599 void
5600 l2arc_start(void)
5601 {
5602 if (!(spa_mode_global & FWRITE))
5603 return;
5604
5605 (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
5606 TS_RUN, minclsyspri);
5607 }
5608
5609 void
5610 l2arc_stop(void)
5611 {
5612 if (!(spa_mode_global & FWRITE))
5613 return;
5614
5615 mutex_enter(&l2arc_feed_thr_lock);
5616 cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */
5617 l2arc_thread_exit = 1;
5618 while (l2arc_thread_exit != 0)
5619 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
5620 mutex_exit(&l2arc_feed_thr_lock);
5621 }
5622
5623 /*
5624 * Main entry point for L2ARC metadata rebuilding. This function must be
5625 * called via thread_create so that the L2ARC metadata rebuild doesn't block
5626 * pool import and may proceed in parallel on all available L2ARC devices.
5627 */
5628 static void
5629 l2arc_rebuild_start(l2arc_dev_t *dev)
5630 {
5631 vdev_t *vd = dev->l2ad_vdev;
5632 spa_t *spa = dev->l2ad_spa;
5633
5634 /* Lock out device removal. */
5635 spa_config_enter(spa, SCL_L2ARC, vd, RW_READER);
5636 ASSERT(dev->l2ad_rebuilding == B_TRUE);
5637 l2arc_rebuild(dev);
5638 dev->l2ad_rebuilding = B_FALSE;
5639 spa_config_exit(spa, SCL_L2ARC, vd);
5640 thread_exit();
5641 }
5642
5643 /*
5644 * This function implements the actual L2ARC metadata rebuild. It:
5645 *
5646 * 1) scans the device for valid l2uberblocks
5647 * 2) if it finds a good uberblock, starts reading the pbuf chain
5648 * 3) restores each pbuf's contents to memory
5649 *
5650 * Operation stops under any of the following conditions:
5651 *
5652 * 1) We reach the end of the pbuf chain (the previous-buffer reference
5653 * in the pbuf is zero).
5654 * 2) We encounter *any* error condition (cksum errors, io errors, looped
5655 * pbufs, etc.).
 * 3) The l2arc_rebuild_timeout is hit - this is a last-resort protection
 *    to keep severely fragmented L2ARC pbufs or slow L2ARC devices from
 *    preventing a machine from importing the pool (and to let the
5659 * administrator take corrective action, e.g. by kicking the misbehaving
5660 * L2ARC device out of the pool, or by reimporting the pool with L2ARC
5661 * rebuilding disabled).
5662 */
5663 static void
5664 l2arc_rebuild(l2arc_dev_t *dev)
5665 {
5666 int err;
5667 l2uberblock_t ub;
5668 l2pbuf_t pb;
5669 zio_t *this_io = NULL, *next_io = NULL;
5670 int64_t deadline = ddi_get_lbolt64() + hz * l2arc_rebuild_timeout;
5671
5672 if ((err = l2arc_uberblock_find(dev, &ub)) != 0)
5673 return;
5674 L2ARC_CHK_REBUILD_TIMEOUT(deadline, /* nop */);
5675
5676 /* set up uberblock update info */
5677 dev->l2ad_uberblock_birth = ub.ub_birth + 1;
5678
5679 /* initial sanity checks */
5680 l2arc_pbuf_init(&pb);
5681 if ((err = l2arc_pbuf_read(dev, ub.ub_pbuf_daddr, ub.ub_pbuf_asize,
5682 ub.ub_pbuf_cksum, &pb, NULL, &this_io)) != 0) {
5683 /* root pbuf is bad, we can't do anything about that */
5684 if (err == EINVAL) {
5685 ARCSTAT_BUMP(arcstat_l2_rebuild_cksum_errors);
5686 } else {
5687 ARCSTAT_BUMP(arcstat_l2_rebuild_io_errors);
5688 }
5689 l2arc_pbuf_destroy(&pb);
5690 return;
5691 }
5692 L2ARC_CHK_REBUILD_TIMEOUT(deadline, l2arc_pbuf_destroy(&pb));
5693
5694 dev->l2ad_evict = ub.ub_evict_tail;
5695
5696 /* keep on chaining in new blocks */
5697 dev->l2ad_pbuf_daddr = ub.ub_pbuf_daddr;
5698 dev->l2ad_pbuf_asize = ub.ub_pbuf_asize;
5699 dev->l2ad_pbuf_cksum = ub.ub_pbuf_cksum;
5700 dev->l2ad_hand = vdev_psize_to_asize(dev->l2ad_vdev,
5701 ub.ub_pbuf_daddr + ub.ub_pbuf_asize);
5702 dev->l2ad_first = ((ub.ub_flags & L2UBLK_EVICT_FIRST) != 0);
5703
5704 /* start the rebuild process */
5705 for (;;) {
5706 l2pbuf_t pb_prev;
5707
5708 l2arc_pbuf_init(&pb_prev);
5709 if ((err = l2arc_pbuf_read(dev, pb.pb_prev_daddr,
5710 pb.pb_prev_asize, pb.pb_prev_cksum, &pb_prev, this_io,
5711 &next_io)) != 0) {
5712 /*
5713 * We are done reading, discard the last good buffer.
5714 */
5715 if (pb.pb_prev_daddr > dev->l2ad_hand &&
5716 pb.pb_prev_asize > L2PBUF_HDR_SIZE) {
5717 /* this is an error, we stopped too early */
5718 if (err == EINVAL) {
5719 ARCSTAT_BUMP(
5720 arcstat_l2_rebuild_cksum_errors);
5721 } else {
5722 ARCSTAT_BUMP(
5723 arcstat_l2_rebuild_io_errors);
5724 }
5725 }
5726 l2arc_pbuf_destroy(&pb_prev);
5727 l2arc_pbuf_destroy(&pb);
5728 break;
5729 }
5730
5731 /*
5732 * Protection against infinite loops of pbufs. This is also
5733 * our primary termination mechanism - once the buffer list
5734 * loops around our starting pbuf, we can stop.
5735 */
5736 if (pb.pb_prev_daddr >= ub.ub_pbuf_daddr &&
5737 pb_prev.pb_prev_daddr <= ub.ub_pbuf_daddr) {
5738 ARCSTAT_BUMP(arcstat_l2_rebuild_loop_errors);
5739 l2arc_pbuf_destroy(&pb);
5740 l2arc_pbuf_destroy(&pb_prev);
5741 if (next_io)
5742 l2arc_pbuf_prefetch_abort(next_io);
5743 return;
5744 }
5745
5746 /*
5747 * Our memory pressure valve. If the system is running low
5748 * on memory, rather than swamping memory with new ARC buf
5749 * hdrs, we opt not to reconstruct the L2ARC. At this point,
5750 * however, we have already set up our L2ARC dev to chain in
5751 * new metadata pbufs, so the user may choose to re-add the
5752 * L2ARC dev at a later time to reconstruct it (when there's
5753 * less memory pressure).
5754 */
5755 if (arc_reclaim_needed()) {
5756 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem);
5757 cmn_err(CE_NOTE, "System running low on memory, "
5758 "aborting L2ARC rebuild.");
5759 l2arc_pbuf_destroy(&pb);
5760 l2arc_pbuf_destroy(&pb_prev);
5761 if (next_io)
5762 l2arc_pbuf_prefetch_abort(next_io);
5763 break;
5764 }
5765
5766 /*
5767 * Now that we know that the prev_pbuf checks out alright, we
5768 * can start reconstruction from this pbuf - we can be sure
5769 * that the L2ARC write hand has not yet reached any of our
5770 * buffers.
5771 */
5772 l2arc_pbuf_restore(dev, &pb);
5773
5774 /* pbuf restored, continue with next one in the list */
5775 l2arc_pbuf_destroy(&pb);
5776 pb = pb_prev;
5777 this_io = next_io;
5778 next_io = NULL;
5779
5780 L2ARC_CHK_REBUILD_TIMEOUT(deadline, l2arc_pbuf_destroy(&pb));
5781 }
5782
5783 ARCSTAT_BUMP(arcstat_l2_rebuild_successes);
5784 }
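
/*
 * Schematic of the on-device metadata chain walked by l2arc_rebuild above
 * (illustrative only; the exact on-disk formats are defined by the encode
 * and decode routines further below):
 *
 *	uberblock (at VDEV_LABEL_START_SIZE)
 *	    ub_pbuf_daddr --> newest pbuf
 *	                          pb_prev_daddr --> older pbuf
 *	                                                pb_prev_daddr --> ...
 *
 * The walk proceeds from newest to oldest and terminates on the conditions
 * listed in the comment above l2arc_rebuild.
 */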
5785
5786 /*
5787 * Restores the payload of a pbuf to ARC. This creates empty ARC hdr entries
5788 * which only contain an l2arc hdr, essentially restoring the buffers to
5789 * their L2ARC evicted state. This function also updates space usage on the
5790 * L2ARC vdev to make sure it tracks restored buffers.
5791 */
5792 static void
5793 l2arc_pbuf_restore(l2arc_dev_t *dev, l2pbuf_t *pb)
5794 {
5795 spa_t *spa;
5796 uint64_t guid;
5797 list_t *buflists_list;
5798 l2pbuf_buflist_t *buflist;
5799
5800 mutex_enter(&l2arc_buflist_mtx);
5801 spa = dev->l2ad_vdev->vdev_spa;
5802 guid = spa_load_guid(spa);
5803 buflists_list = pb->pb_buflists_list;
5804 for (buflist = list_head(buflists_list); buflist;
5805 buflist = list_next(buflists_list, buflist)) {
5806 int i;
5807 uint64_t size, asize, psize;
5808
5809 size = asize = psize = 0;
5810 for (i = 0; i < buflist->l2pbl_nbufs; i++) {
5811 l2arc_hdr_restore(&buflist->l2pbl_bufs[i], dev,
5812 guid);
5813 size += buflist->l2pbl_bufs[i].b_size;
5814 asize += buflist->l2pbl_bufs[i].b_l2asize;
5815 psize += vdev_psize_to_asize(dev->l2ad_vdev,
5816 buflist->l2pbl_bufs[i].b_l2asize);
5817 }
5818 ARCSTAT_INCR(arcstat_l2_rebuild_arc_bytes, size);
5819 ARCSTAT_INCR(arcstat_l2_rebuild_l2arc_bytes, asize);
5820 ARCSTAT_INCR(arcstat_l2_rebuild_bufs, buflist->l2pbl_nbufs);
5821 vdev_space_update(dev->l2ad_vdev, psize, 0, 0);
5822 }
5823 mutex_exit(&l2arc_buflist_mtx);
5824 ARCSTAT_BUMP(arcstat_l2_rebuild_metabufs);
5825 vdev_space_update(dev->l2ad_vdev, vdev_psize_to_asize(dev->l2ad_vdev,
5826 pb->pb_asize), 0, 0);
5827 }
5828
5829 /*
5830 * Restores a single ARC buf hdr from a pbuf. The ARC buffer is put into
5831 * a state indicating that it has been evicted to L2ARC.
5832 * The `guid' here is the ARC-load-guid from spa_load_guid.
5833 */
5834 static void
5835 l2arc_hdr_restore(const l2pbuf_buf_t *buf, l2arc_dev_t *dev, uint64_t guid)
5836 {
5837 arc_buf_hdr_t *hdr;
5838 kmutex_t *hash_lock;
5839 dva_t dva = {buf->b_dva.dva_word[0], buf->b_dva.dva_word[1]};
5840
5841 hdr = buf_hash_find(guid, &dva, buf->b_birth, &hash_lock);
5842 if (hdr == NULL) {
5843 /* not in cache, try to insert */
5844 arc_buf_hdr_t *exists;
5845 arc_buf_contents_t type = buf->b_contents_type;
5846 l2arc_buf_hdr_t *l2hdr;
5847
5848 hdr = arc_buf_hdr_alloc(guid, buf->b_size, type);
5849 hdr->b_dva = buf->b_dva;
5850 hdr->b_birth = buf->b_birth;
5851 hdr->b_cksum0 = buf->b_cksum0;
5852 hdr->b_size = buf->b_size;
5853 exists = buf_hash_insert(hdr, &hash_lock);
5854 if (exists) {
5855 /* somebody beat us to the hash insert */
5856 mutex_exit(hash_lock);
5857 arc_hdr_destroy(hdr);
5858 ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached);
5859 return;
5860 }
5861 hdr->b_flags = buf->b_flags;
5862 mutex_enter(&hdr->b_freeze_lock);
5863 ASSERT(hdr->b_freeze_cksum == NULL);
5864 hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
5865 KM_SLEEP);
5866 *hdr->b_freeze_cksum = buf->b_freeze_cksum;
5867 mutex_exit(&hdr->b_freeze_lock);
5868
5869 /* now rebuild the l2arc entry */
5870 ASSERT(hdr->b_l2hdr == NULL);
5871 l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
5872 l2hdr->b_dev = dev;
5873 l2hdr->b_daddr = buf->b_l2daddr;
5874 l2hdr->b_asize = buf->b_l2asize;
5875 l2hdr->b_compress = buf->b_l2compress;
5876 hdr->b_l2hdr = l2hdr;
5877 list_insert_head(dev->l2ad_buflist, hdr);
5878 ARCSTAT_INCR(arcstat_l2_size, hdr->b_size);
5879 ARCSTAT_INCR(arcstat_l2_asize, l2hdr->b_asize);
5880
5881 arc_change_state(arc_l2c_only, hdr, hash_lock);
5882 }
5883 mutex_exit(hash_lock);
5884 }
5885
5886 /*
5887 * Attempts to locate and read the newest valid uberblock on the provided
5888 * L2ARC device and writes it to `ub'. On success, this function returns 0,
5889 * otherwise the appropriate error code is returned.
5890 */
5891 static int
5892 l2arc_uberblock_find(l2arc_dev_t *dev, l2uberblock_t *ub)
5893 {
5894 int err = 0;
5895 uint8_t *ub_buf;
5896 uint64_t guid;
5897
5898 ARCSTAT_BUMP(arcstat_l2_rebuild_attempts);
5899 ub_buf = kmem_alloc(L2UBERBLOCK_SIZE, KM_SLEEP);
5900 guid = spa_guid(dev->l2ad_vdev->vdev_spa);
5901
5902 if ((err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev,
5903 VDEV_LABEL_START_SIZE, L2UBERBLOCK_SIZE, ub_buf,
5904 ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
5905 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
5906 ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE))) != 0) {
5907 ARCSTAT_BUMP(arcstat_l2_rebuild_io_errors);
5908 goto cleanup;
5909 }
5910
5911 /*
5912 * Initial peek - does the device even have any usable uberblocks?
5913 * If not, don't bother continuing.
5914 */
5915 l2arc_uberblock_decode(ub_buf, ub);
5916 if (ub->ub_magic != L2UBERBLOCK_MAGIC || ub->ub_version == 0 ||
5917 ub->ub_version > L2UBERBLOCK_MAX_VERSION ||
5918 ub->ub_spa_guid != guid) {
5919 err = ENOTSUP;
5920 ARCSTAT_BUMP(arcstat_l2_rebuild_unsupported);
5921 goto cleanup;
5922 }
5923
5924 /* now check to make sure that what we selected is okay */
5925 if ((err = l2arc_uberblock_verify(ub_buf, ub, guid)) != 0) {
5926 if (err == EINVAL) {
5927 ARCSTAT_BUMP(arcstat_l2_rebuild_cksum_errors);
5928 } else {
5929 ARCSTAT_BUMP(arcstat_l2_rebuild_uberblk_errors);
5930 }
5931 goto cleanup;
5932 }
5933
5934 /* this uberblock is valid */
5935
5936 cleanup:
5937 kmem_free(ub_buf, L2UBERBLOCK_SIZE);
5938 return (err);
5939 }
5940
5941 /*
5942 * Reads a pbuf from storage, decodes it and validates its contents against
5943 * the provided checksum. The result is placed in `pb'.
5944 *
5945 * The `this_io' and `prefetch_io' arguments are used for pbuf prefetching.
5946 * When issuing the first pbuf IO during rebuild, you should pass NULL for
5947 * `this_io'. This function will then issue a sync IO to read the pbuf and
5948 * also issue an async IO to fetch the next pbuf in the pbuf chain. The
 * prefetch IO is returned in `prefetch_io'. On subsequent calls to this
5950 * function, pass the value returned in `prefetch_io' from the previous
5951 * call as `this_io' and a fresh `prefetch_io' pointer to hold the next
5952 * prefetch IO. Prior to the call, you should initialize your `prefetch_io'
5953 * pointer to be NULL. If no prefetch IO was issued, the pointer is left
5954 * set at NULL.
5955 *
5956 * Actual prefetching takes place in two steps: a header IO (pi_hdr_io)
5957 * and the main pbuf payload IO (placed in prefetch_io). The pi_hdr_io
 * IO is used internally in this function to `peek' at the next buffer's
 * header before the main IO that reads it in completely has finished.
5960 * We can then begin to issue the IO for the next buffer in the chain before
5961 * we are done reading, keeping the L2ARC device's pipeline saturated with
5962 * reads (rather than issuing an IO, waiting for it to complete, validating
5963 * the returned buffer and issuing the next one). This will make sure that
5964 * the rebuild proceeds at maximum read throughput.
5965 *
5966 * On success, this function returns 0, otherwise it returns an appropriate
5967 * error code. On error the prefetching IO is aborted and cleared before
5968 * returning from this function. Therefore, if we return `success', the
5969 * caller can assume that we have taken care of cleanup of prefetch IOs.
5970 */
5971 static int
5972 l2arc_pbuf_read(l2arc_dev_t *dev, uint64_t daddr, uint32_t asize,
5973 zio_cksum_t cksum, l2pbuf_t *pb, zio_t *this_io, zio_t **prefetch_io)
5974 {
5975 int err = 0;
5976 uint64_t prev_pb_start;
5977 uint32_t prev_pb_asize;
5978 zio_cksum_t calc_cksum, prev_pb_cksum;
5979 l2arc_prefetch_info_t *pi = NULL;
5980
5981 ASSERT(dev != NULL);
5982 ASSERT(pb != NULL);
5983 ASSERT(*prefetch_io == NULL);
5984
5985 if (!l2arc_pbuf_ptr_valid(dev, daddr, asize)) {
5986 /* We could not have issued a prefetch IO for this */
5987 ASSERT(this_io == NULL);
5988 return (EINVAL);
5989 }
5990
5991 /*
5992 * Check to see if we have issued the IO for this pbuf in a previous
5993 * run. If not, issue it now.
5994 */
5995 if (this_io == NULL)
5996 this_io = l2arc_pbuf_prefetch(dev->l2ad_vdev, daddr, asize);
5997
5998 /* Pick up the prefetch info buffer and read its contents */
5999 pi = this_io->io_private;
6000 ASSERT(pi != NULL);
6001 ASSERT(asize <= pi->pi_buflen);
6002
6003 /* Wait for the IO to read this pbuf's header to complete */
6004 if ((err = zio_wait(pi->pi_hdr_io)) != 0) {
6005 (void) zio_wait(this_io);
6006 goto cleanup;
6007 }
6008
6009 /*
6010 * Peek to see if we can start issuing the next pbuf IO immediately.
6011 * At this point, only the current pbuf's header has been read.
6012 */
6013 if (l2arc_pbuf_decode_prev_ptr(pi->pi_buf, asize, &prev_pb_start,
6014 &prev_pb_asize, &prev_pb_cksum) == 0) {
6015 uint64_t this_pb_start, this_pb_end, prev_pb_end;
6016 /* Detect malformed pbuf references and loops */
6017 this_pb_start = daddr;
6018 this_pb_end = daddr + asize;
6019 prev_pb_end = prev_pb_start + prev_pb_asize;
6020 if ((prev_pb_start >= this_pb_start && prev_pb_start <
6021 this_pb_end) ||
6022 (prev_pb_end >= this_pb_start && prev_pb_end <
6023 this_pb_end)) {
6024 ARCSTAT_BUMP(arcstat_l2_rebuild_loop_errors);
6025 cmn_err(CE_WARN, "Looping L2ARC metadata reference "
6026 "detected, aborting rebuild.");
6027 err = EINVAL;
6028 goto cleanup;
6029 }
6030 /*
6031 * Start issuing IO for the next pbuf early - this should
6032 * help keep the L2ARC device busy while we read, decode
6033 * and restore this pbuf.
6034 */
6035 if (l2arc_pbuf_ptr_valid(dev, prev_pb_start, prev_pb_asize))
6036 *prefetch_io = l2arc_pbuf_prefetch(dev->l2ad_vdev,
6037 prev_pb_start, prev_pb_asize);
6038 }
6039
6040 /* Wait for the main pbuf IO to complete */
6041 if ((err = zio_wait(this_io)) != 0)
6042 goto cleanup;
6043
6044 /* Make sure the buffer checks out ok */
6045 fletcher_4_native(pi->pi_buf, asize, &calc_cksum);
6046 if (!ZIO_CHECKSUM_EQUAL(calc_cksum, cksum)) {
6047 err = EINVAL;
6048 goto cleanup;
6049 }
6050
6051 /* Now we can take our time decoding this buffer */
6052 if ((err = l2arc_pbuf_decode(pi->pi_buf, asize, pb)) != 0)
6053 goto cleanup;
6054
6055 /* This will be used in l2arc_pbuf_restore for space accounting */
6056 pb->pb_asize = asize;
6057
6058 ARCSTAT_F_AVG(arcstat_l2_meta_avg_size, L2PBUF_ENCODED_SIZE(pb));
6059 ARCSTAT_F_AVG(arcstat_l2_meta_avg_asize, asize);
6060 ARCSTAT_F_AVG(arcstat_l2_asize_to_meta_ratio,
6061 pb->pb_payload_asz / asize);
6062
6063 cleanup:
6064 kmem_free(pi->pi_buf, pi->pi_buflen);
6065 pi->pi_buf = NULL;
6066 kmem_free(pi, sizeof (l2arc_prefetch_info_t));
6067 /* Abort an in-flight prefetch in case of error */
6068 if (err != 0 && *prefetch_io != NULL) {
6069 l2arc_pbuf_prefetch_abort(*prefetch_io);
6070 *prefetch_io = NULL;
6071 }
6072 return (err);
6073 }
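
/*
 * Sketch of the calling convention described above (l2arc_rebuild is the
 * actual caller; the variable names here are placeholders):
 *
 *	zio_t *this_io = NULL, *next_io = NULL;
 *
 *	err = l2arc_pbuf_read(dev, daddr, asize, cksum, &pb, NULL, &this_io);
 *	while (err == 0) {
 *		... restore pb, pick up pb's previous-pbuf pointer ...
 *		err = l2arc_pbuf_read(dev, prev_daddr, prev_asize,
 *		    prev_cksum, &pb_prev, this_io, &next_io);
 *		this_io = next_io;
 *		next_io = NULL;
 *	}
 */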
6074
6075 /*
6076 * Validates a pbuf device address to make sure that it can be read
6077 * from the provided L2ARC device. Returns 1 if the address is within
6078 * the device's bounds, or 0 if not.
6079 */
6080 static int
6081 l2arc_pbuf_ptr_valid(l2arc_dev_t *dev, uint64_t daddr, uint32_t asize)
6082 {
6083 uint32_t psize;
6084 uint64_t end;
6085
6086 psize = vdev_psize_to_asize(dev->l2ad_vdev, asize);
6087 end = daddr + psize;
6088
6089 if (end > dev->l2ad_end || asize < L2PBUF_HDR_SIZE ||
6090 asize > L2PBUF_MAX_PAYLOAD_SIZE || daddr < dev->l2ad_start ||
6091 /* check that the buffer address is correctly aligned */
6092 (daddr & (vdev_psize_to_asize(dev->l2ad_vdev,
6093 SPA_MINBLOCKSIZE) - 1)) != 0)
6094 return (0);
6095 else
6096 return (1);
6097 }
6098
6099 /*
6100 * Starts an asynchronous read IO to read a pbuf. This is used in pbuf
6101 * reconstruction to start reading the next pbuf before we are done
6102 * decoding and reconstructing the current pbuf, to keep the l2arc device
6103 * nice and hot with read IO to process.
 * The returned zio will contain a newly allocated memory buffer for the IO
 * data, which should then be freed by the caller once the zio is no longer
6106 * needed (i.e. due to it having completed). If you wish to abort this
6107 * zio, you should do so using l2arc_pbuf_prefetch_abort, which takes care
6108 * of disposing of the allocated buffers correctly.
6109 */
6110 static zio_t *
6111 l2arc_pbuf_prefetch(vdev_t *vd, uint64_t daddr, uint32_t asize)
6112 {
6113 uint32_t i, psize;
6114 zio_t *pio, *hdr_io;
6115 uint64_t hdr_rsize;
6116 uint8_t *buf;
6117 l2arc_prefetch_info_t *pinfo;
6118
6119 psize = vdev_psize_to_asize(vd, asize);
6120 buf = kmem_alloc(psize, KM_SLEEP);
6121 pinfo = kmem_alloc(sizeof (l2arc_prefetch_info_t), KM_SLEEP);
6122 pinfo->pi_buf = buf;
6123 pinfo->pi_buflen = psize;
6124
6125 /*
6126 * We start issuing the IO for the pbuf header early. This
6127 * allows l2arc_pbuf_read to start issuing IO for the next
6128 * buffer before the current pbuf is read in completely.
6129 */
6130
6131 hdr_rsize = vdev_psize_to_asize(vd, SPA_MINBLOCKSIZE);
6132 ASSERT(hdr_rsize <= psize);
6133 pinfo->pi_hdr_io = zio_root(vd->vdev_spa, NULL, NULL,
6134 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
6135 ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY);
6136 hdr_io = zio_read_phys(pinfo->pi_hdr_io, vd, daddr, hdr_rsize, buf,
6137 ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_SYNC_READ,
6138 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
6139 ZIO_FLAG_DONT_RETRY, B_FALSE);
6140 (void) zio_nowait(hdr_io);
6141
6142 /*
6143 * Read in the rest of the pbuf - this can take longer than just
6144 * having a peek at the header.
6145 */
6146 pio = zio_root(vd->vdev_spa, NULL, pinfo, ZIO_FLAG_DONT_CACHE |
6147 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
6148 ZIO_FLAG_DONT_RETRY);
6149 for (i = hdr_rsize; i < psize; ) {
6150 uint64_t rsize = psize - i;
6151 zio_t *rzio;
6152
6153 if (psize - i > SPA_MAXBLOCKSIZE)
6154 rsize = SPA_MAXBLOCKSIZE;
6155 ASSERT(rsize >= SPA_MINBLOCKSIZE);
6156 rzio = zio_read_phys(pio, vd, daddr + i,
6157 rsize, buf + i, ZIO_CHECKSUM_OFF, NULL, NULL,
6158 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_DONT_CACHE |
6159 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
6160 ZIO_FLAG_DONT_RETRY, B_FALSE);
6161 (void) zio_nowait(rzio);
6162 i += rsize;
6163 }
6164
6165 return (pio);
6166 }
6167
6168 /*
6169 * Aborts a zio returned from l2arc_pbuf_prefetch and frees the data
6170 * buffers allocated for it.
6171 */
6172 static void
6173 l2arc_pbuf_prefetch_abort(zio_t *zio)
6174 {
6175 l2arc_prefetch_info_t *pi;
6176
6177 pi = zio->io_private;
6178 ASSERT(pi != NULL);
6179 if (pi->pi_hdr_io != NULL)
6180 (void) zio_wait(pi->pi_hdr_io);
6181 (void) zio_wait(zio);
6182 kmem_free(pi->pi_buf, pi->pi_buflen);
6183 pi->pi_buf = NULL;
6184 kmem_free(pi, sizeof (l2arc_prefetch_info_t));
6185 }
6186
6187 /*
6188 * Encodes an l2uberblock_t structure into a destination buffer. This
6189 * buffer must be at least L2UBERBLOCK_SIZE bytes long. The resulting
6190 * uberblock is always of this constant size.
6191 */
6192 static void
6193 l2arc_uberblock_encode(const l2uberblock_t *ub, uint8_t *buf)
6194 {
6195 zio_cksum_t cksum;
6196
6197 bzero(buf, L2UBERBLOCK_SIZE);
6198
#if defined(_BIG_ENDIAN)
	*(uint32_t *)buf = L2UBERBLOCK_MAGIC;
	*(uint16_t *)(buf + 6) = ub->ub_flags | L2UBLK_BIG_ENDIAN;
#else /* !defined(_BIG_ENDIAN) */
	*(uint32_t *)buf = BSWAP_32(L2UBERBLOCK_MAGIC);
	/* persist the flags (e.g. L2UBLK_EVICT_FIRST) in big endian as well */
	*(uint16_t *)(buf + 6) = BSWAP_16(ub->ub_flags);
#endif /* !defined(_BIG_ENDIAN) */
6206 buf[4] = L2UBERBLOCK_MAX_VERSION;
6207
6208 /* rest in native byte order */
6209 *(uint64_t *)(buf + 8) = ub->ub_spa_guid;
6210 *(uint64_t *)(buf + 16) = ub->ub_birth;
6211 *(uint64_t *)(buf + 24) = ub->ub_evict_tail;
6212 *(uint64_t *)(buf + 32) = ub->ub_alloc_space;
6213 *(uint64_t *)(buf + 40) = ub->ub_pbuf_daddr;
6214 *(uint32_t *)(buf + 48) = ub->ub_pbuf_asize;
6215 bcopy(&ub->ub_pbuf_cksum, buf + 52, 32);
6216
6217 fletcher_4_native(buf, L2UBERBLOCK_SIZE - 32, &cksum);
6218 bcopy(&cksum, buf + L2UBERBLOCK_SIZE - 32, 32);
6219 }
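
/*
 * For reference, the on-disk uberblock layout as written by
 * l2arc_uberblock_encode above (byte offsets into the L2UBERBLOCK_SIZE
 * buffer; magic, version and flags are stored big-endian, the remaining
 * fields in the writer's native byte order):
 *
 *	0	uint32_t	magic
 *	4	uint8_t		version
 *	6	uint16_t	flags
 *	8	uint64_t	spa guid
 *	16	uint64_t	birth
 *	24	uint64_t	evict tail
 *	32	uint64_t	alloc space
 *	40	uint64_t	pbuf daddr
 *	48	uint32_t	pbuf asize
 *	52	zio_cksum_t	pbuf checksum (32 bytes)
 *	L2UBERBLOCK_SIZE - 32	fletcher4 checksum of the preceding bytes
 */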
6220
6221 /*
6222 * Decodes an l2uberblock_t from an on-disk representation. Please note
6223 * that this function does not perform any uberblock validation and
6224 * checksumming - call l2arc_uberblock_verify() for that.
6225 */
6226 static void
6227 l2arc_uberblock_decode(const uint8_t *buf, l2uberblock_t *ub)
6228 {
6229 boolean_t bswap_needed;
6230
6231 /* these always come in big endian */
6232 #if defined(_BIG_ENDIAN)
6233 ub->ub_magic = *(uint32_t *)buf;
6234 ub->ub_flags = *(uint16_t *)(buf + 6);
6235 bswap_needed = ((ub->ub_flags & L2UBLK_BIG_ENDIAN) != 1);
6236 #else /* !defined(_BIG_ENDIAN) */
6237 ub->ub_magic = BSWAP_32(*(uint32_t *)buf);
6238 ub->ub_flags = BSWAP_16(*(uint16_t *)(buf + 6));
6239 bswap_needed = ((ub->ub_flags & L2UBLK_BIG_ENDIAN) != 0);
6240 #endif /* !defined(_BIG_ENDIAN) */
6241 ub->ub_version = buf[4];
6242
6243 ub->ub_spa_guid = *(uint64_t *)(buf + 8);
6244 ub->ub_birth = *(uint64_t *)(buf + 16);
6245 ub->ub_evict_tail = *(uint64_t *)(buf + 24);
6246 ub->ub_alloc_space = *(uint64_t *)(buf + 32);
6247 ub->ub_pbuf_daddr = *(uint64_t *)(buf + 40);
6248 ub->ub_pbuf_asize = *(uint32_t *)(buf + 48);
	bcopy(buf + 52, &ub->ub_pbuf_cksum, 32);
6250 bcopy(buf + L2UBERBLOCK_SIZE - 32, &ub->ub_cksum, 32);
6251
6252 /* swap the rest if endianness doesn't match us */
6253 if (bswap_needed) {
6254 ub->ub_spa_guid = BSWAP_64(ub->ub_spa_guid);
6255 ub->ub_birth = BSWAP_64(ub->ub_birth);
6256 ub->ub_evict_tail = BSWAP_64(ub->ub_evict_tail);
6257 ub->ub_alloc_space = BSWAP_64(ub->ub_alloc_space);
6258 ub->ub_pbuf_daddr = BSWAP_64(ub->ub_pbuf_daddr);
6259 ub->ub_pbuf_asize = BSWAP_32(ub->ub_pbuf_asize);
6260 ZIO_CHECKSUM_BSWAP(&ub->ub_pbuf_cksum);
6261 ZIO_CHECKSUM_BSWAP(&ub->ub_cksum);
6262 }
6263 }
6264
6265 /*
6266 * Verifies whether a decoded uberblock (via l2arc_uberblock_decode()) is
6267 * valid and matches its checksum.
6268 */
6269 static int
6270 l2arc_uberblock_verify(const uint8_t *buf, const l2uberblock_t *ub,
6271 uint64_t guid)
6272 {
6273 zio_cksum_t cksum;
6274
6275 if (ub->ub_magic != L2UBERBLOCK_MAGIC ||
6276 ub->ub_version == 0 || ub->ub_version > L2UBERBLOCK_MAX_VERSION)
6277 /*
6278 * bad magic or invalid version => persistent l2arc not
6279 * supported
6280 */
6281 return (ENOTSUP);
6282
6283 if (ub->ub_spa_guid != guid)
6284 /* this l2arc dev isn't ours */
6285 return (EINVAL);
6286
6287 fletcher_4_native(buf, L2UBERBLOCK_SIZE - 32, &cksum);
6288 if (!ZIO_CHECKSUM_EQUAL(cksum, ub->ub_cksum))
6289 /* bad checksum, corrupt uberblock */
6290 return (EINVAL);
6291
6292 return (0);
6293 }
6294
6295 /*
6296 * Schedules a zio to update the uberblock on an l2arc device. The zio is
6297 * initiated as a child of `pio' and `cb' is filled with the information
6298 * needed to free the uberblock data buffer after writing.
6299 */
6300 static void
6301 l2arc_uberblock_update(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
6302 {
6303 uint8_t *ub_buf;
6304 l2uberblock_t ub;
6305 zio_t *wzio;
6306 vdev_stat_t st;
6307
6308 ASSERT(cb->l2wcb_ub_buf == NULL);
6309 vdev_get_stats(dev->l2ad_vdev, &st);
6310
6311 bzero(&ub, sizeof (ub));
6312 ub.ub_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa);
6313 ub.ub_birth = dev->l2ad_uberblock_birth++;
6314 ub.ub_evict_tail = dev->l2ad_evict;
6315 ub.ub_alloc_space = st.vs_alloc;
6316 ub.ub_pbuf_daddr = dev->l2ad_pbuf_daddr;
6317 ub.ub_pbuf_asize = dev->l2ad_pbuf_asize;
6318 ub.ub_pbuf_cksum = dev->l2ad_pbuf_cksum;
6319 if (dev->l2ad_first)
6320 ub.ub_flags |= L2UBLK_EVICT_FIRST;
6321
6322 ub_buf = kmem_alloc(L2UBERBLOCK_SIZE, KM_SLEEP);
6323 cb->l2wcb_ub_buf = ub_buf;
6324 l2arc_uberblock_encode(&ub, ub_buf);
6325 wzio = zio_write_phys(pio, dev->l2ad_vdev, VDEV_LABEL_START_SIZE,
6326 L2UBERBLOCK_SIZE, ub_buf, ZIO_CHECKSUM_OFF, NULL, NULL,
6327 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
6328 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
6329 zio_t *, wzio);
6330 (void) zio_nowait(wzio);
6331 }
6332
6333 /*
6334 * Encodes a l2pbuf_t structure into the portable on-disk format. The
6335 * `buf' buffer must be suitably sized to hold the entire uncompressed
 * structure (use L2PBUF_ENCODED_SIZE()). If the encoded structure is large
 * enough (at least l2arc_pbuf_compress_minsz bytes), this function also
 * compresses the buffer.
6338 *
6339 * The return value is the length of the resulting encoded pbuf structure.
6340 * This can be either equal to L2PBUF_ENCODED_SIZE(pb) if no compression
6341 * was applied, or smaller if compression was applied. In either case,
6342 * prior to writing to disk, the caller must suitably pad the output
6343 * buffer so that it is aligned on a multiple of the underlying storage
6344 * system's block size.
6345 */
6346 static uint32_t
6347 l2arc_pbuf_encode(l2pbuf_t *pb, uint8_t *buf, uint32_t buflen)
6348 {
6349 uint16_t flags = 0;
6350 uint8_t *dst_buf;
6351 uint32_t enclen;
6352 l2pbuf_buflist_t *buflist;
6353
6354 enclen = L2PBUF_ENCODED_SIZE(pb);
6355 ASSERT(buflen >= enclen);
6356 bzero(buf, enclen);
6357
6358 /* non-header portions of pbufs are in native byte order */
6359 *(uint64_t *)(buf + 8) = pb->pb_prev_daddr;
6360 *(uint32_t *)(buf + 16) = pb->pb_prev_asize;
6361 bcopy(&pb->pb_prev_cksum, buf + 20, 32);
6362 *(uint32_t *)(buf + 52) = enclen - L2PBUF_HDR_SIZE;
6363
6364 /* first we encode the buflists uncompressed */
6365 dst_buf = buf + L2PBUF_HDR_SIZE;
6366 for (buflist = list_head(pb->pb_buflists_list); buflist;
6367 buflist = list_next(pb->pb_buflists_list, buflist)) {
6368 int i;
6369
6370 ASSERT(buflist->l2pbl_nbufs != 0);
6371 for (i = 0; i < buflist->l2pbl_nbufs; i++) {
6372 l2pbuf_buf_t *pbl_buf = &buflist->l2pbl_bufs[i];
6373
6374 ASSERT(pbl_buf->b_size != 0);
6375 *(uint64_t *)dst_buf = pbl_buf->b_dva.dva_word[0];
6376 *(uint64_t *)(dst_buf + 8) = pbl_buf->b_dva.dva_word[1];
6377 *(uint64_t *)(dst_buf + 16) = pbl_buf->b_birth;
6378 *(uint64_t *)(dst_buf + 24) = pbl_buf->b_cksum0;
6379 bcopy(&pbl_buf->b_freeze_cksum, dst_buf + 32, 32);
6380 *(uint32_t *)(dst_buf + 64) = pbl_buf->b_size;
6381 *(uint64_t *)(dst_buf + 68) = pbl_buf->b_l2daddr;
6382 *(uint32_t *)(dst_buf + 76) = pbl_buf->b_l2asize;
6383 dst_buf[80] = pbl_buf->b_l2compress;
6384 dst_buf[81] = pbl_buf->b_contents_type;
6385 *(uint32_t *)(dst_buf + 84) = pbl_buf->b_flags;
6386 dst_buf += L2PBUF_BUF_SIZE;
6387 }
6388 }
6389 ASSERT((uint32_t)(dst_buf - buf) == enclen);
6390
6391 /* and then compress them if necessary */
6392 if (enclen >= l2arc_pbuf_compress_minsz) {
6393 uint8_t *cbuf;
6394 size_t slen, clen;
6395
6396 slen = l2arc_pbuf_items_encoded_size(pb);
6397 cbuf = kmem_alloc(slen, KM_SLEEP);
6398 clen = lz4_compress(buf + L2PBUF_HDR_SIZE, cbuf, slen, slen, 0);
6399 ASSERT(clen != 0);
6400 if (clen < slen) {
6401 bcopy(cbuf, buf + L2PBUF_HDR_SIZE, clen);
6402 flags |= L2PBUF_COMPRESSED;
6403 /* zero out the rest of the input buffer */
6404 bzero(buf + L2PBUF_HDR_SIZE + clen,
6405 buflen - (L2PBUF_HDR_SIZE + clen));
6406 /* adjust our buffer length now that it's shortened */
6407 enclen = L2PBUF_HDR_SIZE + clen;
6408 }
6409 kmem_free(cbuf, slen);
6410 }
6411
6412 /* the header goes last since `flags' may change due to compression */
6413 #if defined(_BIG_ENDIAN)
6414 *(uint32_t *)buf = L2PBUF_MAGIC;
6415 flags |= L2PBUF_BIG_ENDIAN;
6416 *(uint16_t *)(buf + 6) = flags;
6417 #else /* !defined(_BIG_ENDIAN) */
6418 *(uint32_t *)buf = BSWAP_32(L2PBUF_MAGIC);
6419 *(uint16_t *)(buf + 6) = BSWAP_16(flags);
6420 #endif /* !defined(_BIG_ENDIAN) */
6421 buf[4] = L2PBUF_MAX_VERSION;
6422
6423 return (enclen);
6424 }
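
/*
 * For reference, the encoded pbuf layout produced by l2arc_pbuf_encode
 * above. The header occupies L2PBUF_HDR_SIZE bytes; magic, version and
 * flags are stored big-endian, everything else in the writer's native
 * byte order:
 *
 *	0	uint32_t	magic
 *	4	uint8_t		version
 *	6	uint16_t	flags
 *	8	uint64_t	previous pbuf daddr
 *	16	uint32_t	previous pbuf asize
 *	20	zio_cksum_t	previous pbuf checksum (32 bytes)
 *	52	uint32_t	payload size (uncompressed)
 *
 * Each payload entry is L2PBUF_BUF_SIZE bytes:
 *
 *	0	uint64_t	b_dva word 0
 *	8	uint64_t	b_dva word 1
 *	16	uint64_t	b_birth
 *	24	uint64_t	b_cksum0
 *	32	zio_cksum_t	b_freeze_cksum (32 bytes)
 *	64	uint32_t	b_size
 *	68	uint64_t	b_l2daddr
 *	76	uint32_t	b_l2asize
 *	80	uint8_t		b_l2compress
 *	81	uint8_t		b_contents_type
 *	84	uint32_t	b_flags
 */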
6425
6426 /*
6427 * Decodes a stored l2pbuf_t structure previously encoded using
6428 * l2arc_pbuf_encode. The source buffer is not modified. The passed pbuf
6429 * must be initialized by l2arc_pbuf_init by the caller beforehand, but
6430 * must not have been used to store any buffers yet.
6431 *
6432 * Please note that we don't do checksum verification here, as we don't
 * know our own checksum (that's known by the previous block in the linked
6434 * list, or by the uberblock). This should be performed by the caller
6435 * prior to calling l2arc_pbuf_decode.
6436 */
6437 static int
6438 l2arc_pbuf_decode(uint8_t *input_buf, uint32_t buflen, l2pbuf_t *pb)
6439 {
6440 boolean_t bswap_needed;
6441 uint32_t payload_sz, payload_asz;
6442 uint8_t *src_bufs;
6443 l2pbuf_buflist_t *buflist;
6444 int i, nbufs;
6445
6446 ASSERT(input_buf != NULL);
6447 ASSERT(pb != NULL);
6448 ASSERT(pb->pb_version != 0);
6449 ASSERT(pb->pb_nbuflists == 0);
6450
6451 /* no valid buffer can be this small */
6452 if (buflen < L2PBUF_HDR_SIZE)
6453 return (EINVAL);
6454
6455 /* these always come in big endian */
6456 #if defined(_BIG_ENDIAN)
6457 pb->pb_magic = *(uint32_t *)input_buf;
6458 pb->pb_flags = *(uint16_t *)(input_buf + 6);
6459 bswap_needed = ((pb->pb_flags & L2PBUF_BIG_ENDIAN) != 1);
6460 #else /* !defined(_BIG_ENDIAN) */
6461 pb->pb_magic = BSWAP_32(*(uint32_t *)input_buf);
6462 pb->pb_flags = BSWAP_16(*(uint16_t *)(input_buf + 6));
6463 bswap_needed = ((pb->pb_flags & L2PBUF_BIG_ENDIAN) != 0);
6464 #endif /* !defined(_BIG_ENDIAN) */
6465 pb->pb_version = input_buf[4];
6466
6467 if (pb->pb_magic != L2PBUF_MAGIC || pb->pb_version == 0)
6468 return (EINVAL);
6469 if (pb->pb_version > L2PBUF_MAX_VERSION)
6470 return (ENOTSUP);
6471
6472 /* remainder of pbuf may need bswap'ping */
6473 pb->pb_prev_daddr = *(uint64_t *)(input_buf + 8);
	pb->pb_prev_asize = *(uint32_t *)(input_buf + 16);
6475 bcopy(input_buf + 20, &pb->pb_prev_cksum, 32);
6476 payload_sz = *(uint32_t *)(input_buf + 52);
6477 payload_asz = buflen - L2PBUF_HDR_SIZE;
6478
6479 if (bswap_needed) {
6480 pb->pb_prev_daddr = BSWAP_64(pb->pb_prev_daddr);
		pb->pb_prev_asize = BSWAP_32(pb->pb_prev_asize);
6482 ZIO_CHECKSUM_BSWAP(&pb->pb_prev_cksum);
6483 payload_sz = BSWAP_32(payload_sz);
6484 }
6485
6486 /* check for sensible buffer allocation limits */
6487 if (((pb->pb_flags & L2PBUF_COMPRESSED) && payload_sz <= payload_asz) ||
6488 (payload_sz > L2PBUF_MAX_PAYLOAD_SIZE) ||
6489 (payload_sz % L2PBUF_BUF_SIZE) != 0 || payload_sz == 0)
6490 return (EINVAL);
6491 nbufs = payload_sz / L2PBUF_BUF_SIZE;
6492
6493 /* decompression might be needed */
6494 if (pb->pb_flags & L2PBUF_COMPRESSED) {
6495 src_bufs = kmem_alloc(payload_sz, KM_SLEEP);
6496 if (lz4_decompress(input_buf + L2PBUF_HDR_SIZE, src_bufs,
6497 payload_asz, payload_sz, 0) != 0) {
6498 kmem_free(src_bufs, payload_sz);
6499 return (EINVAL);
6500 }
6501 } else {
6502 src_bufs = input_buf + L2PBUF_HDR_SIZE;
6503 }
6504
6505 /* Decode individual pbuf items from our source buffer. */
6506 buflist = l2arc_pbuf_buflist_alloc(pb, nbufs);
6507 for (i = 0; i < nbufs; i++) {
6508 l2pbuf_buf_t *pbl_buf = &buflist->l2pbl_bufs[i];
6509 const uint8_t *src = src_bufs + i * L2PBUF_BUF_SIZE;
6510
6511 pbl_buf->b_dva.dva_word[0] = *(uint64_t *)src;
6512 pbl_buf->b_dva.dva_word[1] = *(uint64_t *)(src + 8);
6513 pbl_buf->b_birth = *(uint64_t *)(src + 16);
6514 pbl_buf->b_cksum0 = *(uint64_t *)(src + 24);
6515 bcopy(src + 32, &pbl_buf->b_freeze_cksum, 32);
6516 pbl_buf->b_size = *(uint32_t *)(src + 64);
6517 pbl_buf->b_l2daddr = *(uint64_t *)(src + 68);
6518 pbl_buf->b_l2asize = *(uint32_t *)(src + 76);
6519 pbl_buf->b_l2compress = src[80];
6520 pbl_buf->b_contents_type = src[81];
6521 pbl_buf->b_flags = *(uint32_t *)(src + 84);
6522
6523 if (bswap_needed) {
6524 pbl_buf->b_dva.dva_word[0] =
6525 BSWAP_64(pbl_buf->b_dva.dva_word[0]);
6526 pbl_buf->b_dva.dva_word[1] =
6527 BSWAP_64(pbl_buf->b_dva.dva_word[1]);
6528 pbl_buf->b_birth = BSWAP_64(pbl_buf->b_birth);
6529 pbl_buf->b_cksum0 = BSWAP_64(pbl_buf->b_cksum0);
6530 ZIO_CHECKSUM_BSWAP(&pbl_buf->b_freeze_cksum);
6531 pbl_buf->b_size = BSWAP_32(pbl_buf->b_size);
6532 pbl_buf->b_l2daddr = BSWAP_64(pbl_buf->b_l2daddr);
6533 pbl_buf->b_l2asize = BSWAP_32(pbl_buf->b_l2asize);
6534 pbl_buf->b_flags = BSWAP_32(pbl_buf->b_flags);
6535 }
6536
6537 pb->pb_payload_asz += pbl_buf->b_l2asize;
6538 }
6539
6540 if (pb->pb_flags & L2PBUF_COMPRESSED)
6541 kmem_free(src_bufs, payload_sz);
6542
6543 return (0);
6544 }
6545
6546 /*
6547 * Decodes the previous buffer pointer encoded in a pbuf. This is used
6548 * during L2ARC reconstruction to "peek" at the next buffer and start
6549 * issuing IO to fetch it early, before decoding of the current buffer
6550 * is done (which can take time due to decompression).
6551 * Returns 0 on success (and fills in the return parameters `daddr',
6552 * `asize' and `cksum' with the info of the previous pbuf), and an errno
6553 * on error.
6554 */
6555 static int
6556 l2arc_pbuf_decode_prev_ptr(const uint8_t *buf, size_t buflen, uint64_t *daddr,
6557 uint32_t *asize, zio_cksum_t *cksum)
6558 {
6559 boolean_t bswap_needed;
6560 uint16_t version, flags;
6561 uint32_t magic;
6562
6563 ASSERT(buf != NULL);
6564
6565 /* no valid buffer can be this small */
6566 if (buflen <= L2PBUF_HDR_SIZE)
6567 return (EINVAL);
6568
6569 /* these always come in big endian */
6570 #if defined(_BIG_ENDIAN)
6571 magic = *(uint32_t *)buf;
6572 flags = *(uint16_t *)(buf + 6);
6573 bswap_needed = ((flags & L2PBUF_BIG_ENDIAN) != 1);
6574 #else /* !defined(_BIG_ENDIAN) */
6575 magic = BSWAP_32(*(uint32_t *)buf);
6576 flags = BSWAP_16(*(uint16_t *)(buf + 6));
6577 bswap_needed = ((flags & L2PBUF_BIG_ENDIAN) != 0);
6578 #endif /* !defined(_BIG_ENDIAN) */
6579 version = buf[4];
6580
6581 if (magic != L2PBUF_MAGIC || version == 0)
6582 return (EINVAL);
6583 if (version > L2PBUF_MAX_VERSION)
6584 return (ENOTSUP);
6585
	/* these offsets must mirror the layout written by l2arc_pbuf_encode */
	*daddr = *(uint64_t *)(buf + 8);
	*asize = *(uint32_t *)(buf + 16);
	bcopy(buf + 20, cksum, 32);
6589
6590 if (bswap_needed) {
6591 *daddr = BSWAP_64(*daddr);
		*asize = BSWAP_32(*asize);
6593 ZIO_CHECKSUM_BSWAP(cksum);
6594 }
6595
6596 return (0);
6597 }
6598
6599 /*
6600 * Initializes a pbuf structure into a clean state. All version and flags
6601 * fields are filled in as appropriate for this architecture.
6602 * If the structure was used before, first call l2arc_pbuf_destroy on it,
6603 * as this function assumes the structure is uninitialized.
6604 */
6605 static void
6606 l2arc_pbuf_init(l2pbuf_t *pb)
6607 {
6608 bzero(pb, sizeof (l2pbuf_t));
6609 pb->pb_version = L2PBUF_MAX_VERSION;
6610 #if defined(_BIG_ENDIAN)
	pb->pb_flags |= L2PBUF_BIG_ENDIAN;
6612 #endif
6613 pb->pb_buflists_list = kmem_zalloc(sizeof (list_t), KM_SLEEP);
6614 list_create(pb->pb_buflists_list, sizeof (l2pbuf_buflist_t),
6615 offsetof(l2pbuf_buflist_t, l2pbl_node));
6616 }
6617
6618 /*
6619 * Destroys a pbuf structure and puts it into a clean state ready to be
6620 * initialized by l2arc_pbuf_init. All buflists created by
6621 * l2arc_pbuf_buflist_alloc are released as well.
6622 */
6623 static void
6624 l2arc_pbuf_destroy(l2pbuf_t *pb)
6625 {
6626 list_t *buflist_list = pb->pb_buflists_list;
6627 l2pbuf_buflist_t *buflist;
6628
6629 while ((buflist = list_head(buflist_list)) != NULL) {
6630 ASSERT(buflist->l2pbl_nbufs > 0);
6631 kmem_free(buflist->l2pbl_bufs, sizeof (l2pbuf_buf_t) *
6632 buflist->l2pbl_nbufs);
6633 list_remove(buflist_list, buflist);
6634 kmem_free(buflist, sizeof (l2pbuf_buflist_t));
6635 }
6636 pb->pb_nbuflists = 0;
6637 list_destroy(pb->pb_buflists_list);
6638 kmem_free(pb->pb_buflists_list, sizeof (list_t));
6639 bzero(pb, sizeof (l2pbuf_t));
6640 }
6641
6642 /*
6643 * Allocates a new buflist inside of a pbuf, which can hold up to `nbufs'
6644 * buffers. This is used during the buffer write cycle - each cycle allocates
6645 * a new buflist and fills it with buffers it writes. Then, when the pbuf
 * reaches its buflist limit, it is committed to stable storage.
6647 */
6648 static l2pbuf_buflist_t *
6649 l2arc_pbuf_buflist_alloc(l2pbuf_t *pb, int nbufs)
6650 {
6651 l2pbuf_buflist_t *buflist;
6652
6653 ASSERT(pb->pb_buflists_list != NULL);
6654 buflist = kmem_zalloc(sizeof (l2pbuf_buflist_t), KM_SLEEP);
6655 buflist->l2pbl_nbufs = nbufs;
6656 buflist->l2pbl_bufs = kmem_zalloc(sizeof (l2pbuf_buf_t) * nbufs,
6657 KM_SLEEP);
6658 list_insert_tail(pb->pb_buflists_list, buflist);
6659 pb->pb_nbuflists++;
6660
6661 return (buflist);
6662 }
6663
6664 /*
6665 * Inserts ARC buffer `ab' into the pbuf `pb' buflist `pbl' at index `idx'.
6666 * The buffer being inserted must be present in L2ARC.
6667 */
6668 static void
6669 l2arc_pbuflist_insert(l2pbuf_t *pb, l2pbuf_buflist_t *pbl,
6670 const arc_buf_hdr_t *ab, int index)
6671 {
6672 l2pbuf_buf_t *pb_buf;
6673 const l2arc_buf_hdr_t *l2hdr;
6674
6675 l2hdr = ab->b_l2hdr;
6676 ASSERT(l2hdr != NULL);
6677 ASSERT(pbl->l2pbl_nbufs > index);
6678
6679 pb_buf = &pbl->l2pbl_bufs[index];
6680 pb_buf->b_dva = ab->b_dva;
6681 pb_buf->b_birth = ab->b_birth;
6682 pb_buf->b_cksum0 = ab->b_cksum0;
6683 pb_buf->b_freeze_cksum = *ab->b_freeze_cksum;
6684 pb_buf->b_size = ab->b_size;
6685 pb_buf->b_l2daddr = l2hdr->b_daddr;
6686 pb_buf->b_l2asize = l2hdr->b_asize;
6687 pb_buf->b_l2compress = l2hdr->b_compress;
6688 pb_buf->b_contents_type = ab->b_type;
6689 pb_buf->b_flags = ab->b_flags & L2ARC_PERSIST_FLAGS;
6690 pb->pb_payload_asz += l2hdr->b_asize;
6691 }
6692
6693 /*
6694 * Commits a pbuf to stable storage. This routine is invoked when writing
6695 * ARC buffers to an L2ARC device. When the pbuf associated with the device
6696 * has reached its limits (either in size or in number of writes), it is
6697 * scheduled here for writing.
6698 * This function allocates some memory to temporarily hold the serialized
6699 * buffer to be written. This is then released in l2arc_write_done.
6700 */
6701 static void
6702 l2arc_pbuf_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
6703 {
6704 l2pbuf_t *pb = &dev->l2ad_pbuf;
6705 uint64_t i, est_encsize, bufsize, encsize, io_size;
6706 uint8_t *pb_buf;
6707
6708 pb->pb_prev_daddr = dev->l2ad_pbuf_daddr;
6709 pb->pb_prev_asize = dev->l2ad_pbuf_asize;
6710 pb->pb_prev_cksum = dev->l2ad_pbuf_cksum;
6711
6712 est_encsize = L2PBUF_ENCODED_SIZE(pb);
6713 bufsize = vdev_psize_to_asize(dev->l2ad_vdev, est_encsize);
6714 pb_buf = kmem_zalloc(bufsize, KM_SLEEP);
6715 encsize = l2arc_pbuf_encode(pb, pb_buf, bufsize);
6716 cb->l2wcb_pbuf = pb_buf;
6717 cb->l2wcb_pbuf_size = bufsize;
6718
6719 dev->l2ad_pbuf_daddr = dev->l2ad_hand;
6720 dev->l2ad_pbuf_asize = encsize;
6721 fletcher_4_native(pb_buf, encsize, &dev->l2ad_pbuf_cksum);
6722
6723 io_size = vdev_psize_to_asize(dev->l2ad_vdev, encsize);
6724 for (i = 0; i < io_size; ) {
6725 zio_t *wzio;
6726 uint64_t wsize = io_size - i;
6727
6728 if (wsize > SPA_MAXBLOCKSIZE)
6729 wsize = SPA_MAXBLOCKSIZE;
6730 ASSERT(wsize >= SPA_MINBLOCKSIZE);
6731 wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand + i,
6732 wsize, pb_buf + i, ZIO_CHECKSUM_OFF, NULL, NULL,
6733 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
6734 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
6735 zio_t *, wzio);
6736 (void) zio_nowait(wzio);
6737 i += wsize;
6738 }
6739
6740 dev->l2ad_hand += io_size;
6741 vdev_space_update(dev->l2ad_vdev, io_size, 0, 0);
6742 l2arc_uberblock_update(dev, pio, cb);
6743
6744 ARCSTAT_INCR(arcstat_l2_write_bytes, io_size);
6745 ARCSTAT_BUMP(arcstat_l2_meta_writes);
6746 ARCSTAT_F_AVG(arcstat_l2_meta_avg_size, est_encsize);
6747 ARCSTAT_F_AVG(arcstat_l2_meta_avg_asize, encsize);
6748 ARCSTAT_F_AVG(arcstat_l2_asize_to_meta_ratio,
6749 pb->pb_payload_asz / encsize);
6750 }
6751
6752 /*
6753 * Returns the number of bytes occupied by the payload buffer items of
6754 * a pbuf in portable (on-disk) encoded form, i.e. the bytes following
6755 * L2PBUF_HDR_SIZE.
6756 */
6757 static uint32_t
6758 l2arc_pbuf_items_encoded_size(l2pbuf_t *pb)
6759 {
6760 uint32_t size = 0;
6761 l2pbuf_buflist_t *buflist;
6762
6763 for (buflist = list_head(pb->pb_buflists_list); buflist != NULL;
6764 buflist = list_next(pb->pb_buflists_list, buflist))
6765 size += L2PBUF_BUF_SIZE * buflist->l2pbl_nbufs;
6766
6767 return (size);
6768 }