3525 Persistent L2ARC
--- old/usr/src/uts/common/fs/zfs/arc.c
+++ new/usr/src/uts/common/fs/zfs/arc.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
24 24 * Copyright (c) 2013 by Delphix. All rights reserved.
25 25 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
26 26 */
27 27
28 28 /*
29 29 * DVA-based Adjustable Replacement Cache
30 30 *
31 31 * While much of the theory of operation used here is
32 32 * based on the self-tuning, low overhead replacement cache
33 33 * presented by Megiddo and Modha at FAST 2003, there are some
34 34 * significant differences:
35 35 *
36 36 * 1. The Megiddo and Modha model assumes any page is evictable.
37 37 * Pages in its cache cannot be "locked" into memory. This makes
38 38 * the eviction algorithm simple: evict the last page in the list.
39 39 * the eviction algorithm simple: evict the last page in the list.
40 40 * about. Our cache is not so simple. At any given moment, some
41 41 * subset of the blocks in the cache are un-evictable because we
42 42 * have handed out a reference to them. Blocks are only evictable
43 43 * when there are no external references active. This makes
44 44 * eviction far more problematic: we choose to evict the evictable
45 45 * blocks that are the "lowest" in the list.
46 46 *
47 47 * There are times when it is not possible to evict the requested
48 48 * space. In these circumstances we are unable to adjust the cache
49 49 * size. To prevent the cache growing unbounded at these times we
50 50 * implement a "cache throttle" that slows the flow of new data
51 51 * into the cache until we can make space available.
52 52 *
53 53 * 2. The Megiddo and Modha model assumes a fixed cache size.
54 54 * Pages are evicted when the cache is full and there is a cache
55 55 * miss. Our model has a variable sized cache. It grows with
56 56 * high use, but also tries to react to memory pressure from the
57 57 * operating system: decreasing its size when system memory is
58 58 * tight.
59 59 *
60 60 * 3. The Megiddo and Modha model assumes a fixed page size. All
61 61 * elements of the cache are therefore exactly the same size. So
62 62 * when adjusting the cache size following a cache miss, it's simply
63 63 * a matter of choosing a single page to evict. In our model, we
64 64 * have variable sized cache blocks (ranging from 512 bytes to
65 65 * 128K bytes). We therefore choose a set of blocks to evict to make
66 66 * space for a cache miss that approximates as closely as possible
67 67 * the space used by the new block.
68 68 *
69 69 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
70 70 * by N. Megiddo & D. Modha, FAST 2003
71 71 */
72 72
73 73 /*
74 74 * The locking model:
75 75 *
76 76 * A new reference to a cache buffer can be obtained in two
77 77 * ways: 1) via a hash table lookup using the DVA as a key,
78 78 * or 2) via one of the ARC lists. The arc_read() interface
79 79 * uses method 1, while the internal arc algorithms for
80 80 * adjusting the cache use method 2. We therefore provide two
81 81 * types of locks: 1) the hash table lock array, and 2) the
82 82 * arc list locks.
83 83 *
84 84 * Buffers do not have their own mutexes, rather they rely on the
85 85 * hash table mutexes for the bulk of their protection (i.e. most
86 86 * fields in the arc_buf_hdr_t are protected by these mutexes).
87 87 *
88 88 * buf_hash_find() returns the appropriate mutex (held) when it
89 89 * locates the requested buffer in the hash table. It returns
90 90 * NULL for the mutex if the buffer was not in the table.
91 91 *
92 92 * buf_hash_remove() expects the appropriate hash mutex to be
93 93 * already held before it is invoked.
94 94 *
95 95 * Each arc state also has a mutex which is used to protect the
96 96 * buffer list associated with the state. When attempting to
97 97 * obtain a hash table lock while holding an arc list lock you
98 98 * must use mutex_tryenter() to avoid deadlock. Also note that
99 99 * the active state mutex must be held before the ghost state mutex.
100 100 *
101 101 * Arc buffers may have an associated eviction callback function.
102 102 * This function will be invoked prior to removing the buffer (e.g.
103 103 * in arc_do_user_evicts()). Note however that the data associated
104 104 * with the buffer may be evicted prior to the callback. The callback
105 105 * must be made with *no locks held* (to prevent deadlock). Additionally,
106 106 * the users of callbacks must ensure that their private data is
107 107 * protected from simultaneous callbacks from arc_buf_evict()
108 108 * and arc_do_user_evicts().
109 109 *
110 110 * Note that the majority of the performance stats are manipulated
111 111 * with atomic operations.
112 112 *
113 113 * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
114 114 *
115 115 * - L2ARC buflist creation
116 116 * - L2ARC buflist eviction
117 117 * - L2ARC write completion, which walks L2ARC buflists
118 118 * - ARC header destruction, as it removes from L2ARC buflists
119 119 * - ARC header release, as it removes from L2ARC buflists
120 120 */
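Editorial note: a minimal sketch (not part of this webrev) of the lock-ordering rule described above, loosely modeled on the ARC eviction path. While an arc list lock is held, a hash table lock may only be taken with mutex_tryenter(); on failure the buffer is skipped rather than risking a deadlock. Names match the surrounding file, but the fragment is illustrative only.

	mutex_enter(&state->arcs_mtx);
	for (ab = list_tail(list); ab != NULL; ab = ab_prev) {
		ab_prev = list_prev(list, ab);
		hash_lock = HDR_LOCK(ab);	/* hash lock covering this header */
		if (!mutex_tryenter(hash_lock)) {
			/* never block on a hash lock while holding arcs_mtx */
			ARCSTAT_BUMP(arcstat_mutex_miss);
			continue;
		}
		/* ... evict or recycle the buffer under both locks ... */
		mutex_exit(hash_lock);
	}
	mutex_exit(&state->arcs_mtx);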
121 121
122 122 #include <sys/spa.h>
123 123 #include <sys/zio.h>
124 124 #include <sys/zio_compress.h>
125 125 #include <sys/zfs_context.h>
126 126 #include <sys/arc.h>
127 127 #include <sys/refcount.h>
128 128 #include <sys/vdev.h>
129 129 #include <sys/vdev_impl.h>
130 130 #ifdef _KERNEL
131 131 #include <sys/vmsystm.h>
132 132 #include <vm/anon.h>
133 133 #include <sys/fs/swapnode.h>
134 134 #include <sys/dnlc.h>
135 135 #endif
136 136 #include <sys/callb.h>
137 137 #include <sys/kstat.h>
138 138 #include <zfs_fletcher.h>
139 +#include <sys/byteorder.h>
139 140
140 141 #ifndef _KERNEL
141 142 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
142 143 boolean_t arc_watch = B_FALSE;
143 144 int arc_procfd;
144 145 #endif
145 146
146 147 static kmutex_t arc_reclaim_thr_lock;
147 148 static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
148 149 static uint8_t arc_thread_exit;
149 150
150 151 extern int zfs_write_limit_shift;
151 152 extern uint64_t zfs_write_limit_max;
152 153 extern kmutex_t zfs_write_limit_lock;
153 154
154 155 #define ARC_REDUCE_DNLC_PERCENT 3
155 156 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
156 157
157 158 typedef enum arc_reclaim_strategy {
158 159 ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
159 160 ARC_RECLAIM_CONS /* Conservative reclaim strategy */
160 161 } arc_reclaim_strategy_t;
161 162
162 163 /* number of seconds before growing cache again */
163 164 static int arc_grow_retry = 60;
164 165
165 166 /* shift of arc_c for calculating both min and max arc_p */
166 167 static int arc_p_min_shift = 4;
167 168
168 169 /* log2(fraction of arc to reclaim) */
169 170 static int arc_shrink_shift = 5;
170 171
171 172 /*
172 173 * minimum lifespan of a prefetch block in clock ticks
173 174 * (initialized in arc_init())
174 175 */
175 176 static int arc_min_prefetch_lifespan;
176 177
177 178 static int arc_dead;
178 179
179 180 /*
180 181 * The arc has filled available memory and has now warmed up.
181 182 */
182 183 static boolean_t arc_warm;
183 184
184 185 /*
185 186 * These tunables are for performance analysis.
186 187 */
187 188 uint64_t zfs_arc_max;
188 189 uint64_t zfs_arc_min;
189 190 uint64_t zfs_arc_meta_limit = 0;
190 191 int zfs_arc_grow_retry = 0;
191 192 int zfs_arc_shrink_shift = 0;
192 193 int zfs_arc_p_min_shift = 0;
193 194 int zfs_disable_dup_eviction = 0;
194 195
195 196 /*
196 197 * Note that buffers can be in one of 6 states:
197 198 * ARC_anon - anonymous (discussed below)
198 199 * ARC_mru - recently used, currently cached
199 200 * ARC_mru_ghost - recently used, no longer in cache
200 201 * ARC_mfu - frequently used, currently cached
201 202 * ARC_mfu_ghost - frequently used, no longer in cache
202 203 * ARC_l2c_only - exists in L2ARC but not other states
203 204 * When there are no active references to the buffer, they
204 205 * are linked onto a list in one of these arc states. These are
205 206 * the only buffers that can be evicted or deleted. Within each
206 207 * state there are multiple lists, one for meta-data and one for
207 208 * non-meta-data. Meta-data (indirect blocks, blocks of dnodes,
208 209 * etc.) is tracked separately so that it can be managed more
209 210 * explicitly: favored over data, limited explicitly.
210 211 *
211 212 * Anonymous buffers are buffers that are not associated with
212 213 * a DVA. These are buffers that hold dirty block copies
213 214 * before they are written to stable storage. By definition,
214 215 * they are "ref'd" and are considered part of arc_mru
215 216 * that cannot be freed. Generally, they will acquire a DVA
216 217 * as they are written and migrate onto the arc_mru list.
217 218 *
218 219 * The ARC_l2c_only state is for buffers that are in the second
219 220 * level ARC but no longer in any of the ARC_m* lists. The second
220 221 * level ARC itself may also contain buffers that are in any of
221 222 * the ARC_m* states - meaning that a buffer can exist in two
222 223 * places. The reason for the ARC_l2c_only state is to keep the
223 224 * buffer header in the hash table, so that reads that hit the
224 225 * second level ARC benefit from these fast lookups.
225 226 */
226 227
227 228 typedef struct arc_state {
228 229 list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */
229 230 uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
230 231 uint64_t arcs_size; /* total amount of data in this state */
231 232 kmutex_t arcs_mtx;
232 233 } arc_state_t;
233 234
234 235 /* The 6 states: */
235 236 static arc_state_t ARC_anon;
236 237 static arc_state_t ARC_mru;
237 238 static arc_state_t ARC_mru_ghost;
238 239 static arc_state_t ARC_mfu;
239 240 static arc_state_t ARC_mfu_ghost;
240 241 static arc_state_t ARC_l2c_only;
241 242
242 243 typedef struct arc_stats {
243 244 kstat_named_t arcstat_hits;
244 245 kstat_named_t arcstat_misses;
245 246 kstat_named_t arcstat_demand_data_hits;
246 247 kstat_named_t arcstat_demand_data_misses;
247 248 kstat_named_t arcstat_demand_metadata_hits;
248 249 kstat_named_t arcstat_demand_metadata_misses;
249 250 kstat_named_t arcstat_prefetch_data_hits;
250 251 kstat_named_t arcstat_prefetch_data_misses;
251 252 kstat_named_t arcstat_prefetch_metadata_hits;
252 253 kstat_named_t arcstat_prefetch_metadata_misses;
253 254 kstat_named_t arcstat_mru_hits;
254 255 kstat_named_t arcstat_mru_ghost_hits;
255 256 kstat_named_t arcstat_mfu_hits;
256 257 kstat_named_t arcstat_mfu_ghost_hits;
257 258 kstat_named_t arcstat_deleted;
258 259 kstat_named_t arcstat_recycle_miss;
259 260 /*
260 261 * Number of buffers that could not be evicted because the hash lock
261 262 * was held by another thread. The lock may not necessarily be held
262 263 * by something using the same buffer, since hash locks are shared
263 264 * by multiple buffers.
264 265 */
265 266 kstat_named_t arcstat_mutex_miss;
266 267 /*
267 268 * Number of buffers skipped because they have I/O in progress, are
268 269 * indirect prefetch buffers that have not lived long enough, or are
269 270 * not from the spa we're trying to evict from.
270 271 */
271 272 kstat_named_t arcstat_evict_skip;
272 273 kstat_named_t arcstat_evict_l2_cached;
273 274 kstat_named_t arcstat_evict_l2_eligible;
274 275 kstat_named_t arcstat_evict_l2_ineligible;
275 276 kstat_named_t arcstat_hash_elements;
276 277 kstat_named_t arcstat_hash_elements_max;
277 278 kstat_named_t arcstat_hash_collisions;
278 279 kstat_named_t arcstat_hash_chains;
279 280 kstat_named_t arcstat_hash_chain_max;
280 281 kstat_named_t arcstat_p;
281 282 kstat_named_t arcstat_c;
282 283 kstat_named_t arcstat_c_min;
283 284 kstat_named_t arcstat_c_max;
284 285 kstat_named_t arcstat_size;
285 286 kstat_named_t arcstat_hdr_size;
286 287 kstat_named_t arcstat_data_size;
287 288 kstat_named_t arcstat_other_size;
288 289 kstat_named_t arcstat_l2_hits;
289 290 kstat_named_t arcstat_l2_misses;
290 291 kstat_named_t arcstat_l2_feeds;
291 292 kstat_named_t arcstat_l2_rw_clash;
292 293 kstat_named_t arcstat_l2_read_bytes;
293 294 kstat_named_t arcstat_l2_write_bytes;
294 295 kstat_named_t arcstat_l2_writes_sent;
295 296 kstat_named_t arcstat_l2_writes_done;
296 297 kstat_named_t arcstat_l2_writes_error;
297 298 kstat_named_t arcstat_l2_writes_hdr_miss;
298 299 kstat_named_t arcstat_l2_evict_lock_retry;
299 300 kstat_named_t arcstat_l2_evict_reading;
300 301 kstat_named_t arcstat_l2_free_on_write;
301 302 kstat_named_t arcstat_l2_abort_lowmem;
302 303 kstat_named_t arcstat_l2_cksum_bad;
303 304 kstat_named_t arcstat_l2_io_error;
304 305 kstat_named_t arcstat_l2_size;
305 306 kstat_named_t arcstat_l2_asize;
306 307 kstat_named_t arcstat_l2_hdr_size;
307 308 kstat_named_t arcstat_l2_compress_successes;
308 309 kstat_named_t arcstat_l2_compress_zeros;
309 310 kstat_named_t arcstat_l2_compress_failures;
311 + kstat_named_t arcstat_l2_meta_writes;
312 + kstat_named_t arcstat_l2_meta_avg_size;
313 + kstat_named_t arcstat_l2_meta_avg_asize;
314 + kstat_named_t arcstat_l2_asize_to_meta_ratio;
315 + kstat_named_t arcstat_l2_rebuild_attempts;
316 + kstat_named_t arcstat_l2_rebuild_successes;
317 + kstat_named_t arcstat_l2_rebuild_unsupported;
318 + kstat_named_t arcstat_l2_rebuild_timeout;
319 + kstat_named_t arcstat_l2_rebuild_arc_bytes;
320 + kstat_named_t arcstat_l2_rebuild_l2arc_bytes;
321 + kstat_named_t arcstat_l2_rebuild_bufs;
322 + kstat_named_t arcstat_l2_rebuild_bufs_precached;
323 + kstat_named_t arcstat_l2_rebuild_metabufs;
324 + kstat_named_t arcstat_l2_rebuild_uberblk_errors;
325 + kstat_named_t arcstat_l2_rebuild_io_errors;
326 + kstat_named_t arcstat_l2_rebuild_cksum_errors;
327 + kstat_named_t arcstat_l2_rebuild_loop_errors;
328 + kstat_named_t arcstat_l2_rebuild_abort_lowmem;
310 329 kstat_named_t arcstat_memory_throttle_count;
311 330 kstat_named_t arcstat_duplicate_buffers;
312 331 kstat_named_t arcstat_duplicate_buffers_size;
313 332 kstat_named_t arcstat_duplicate_reads;
314 333 kstat_named_t arcstat_meta_used;
315 334 kstat_named_t arcstat_meta_limit;
316 335 kstat_named_t arcstat_meta_max;
317 336 } arc_stats_t;
318 337
319 338 static arc_stats_t arc_stats = {
320 339 { "hits", KSTAT_DATA_UINT64 },
321 340 { "misses", KSTAT_DATA_UINT64 },
322 341 { "demand_data_hits", KSTAT_DATA_UINT64 },
323 342 { "demand_data_misses", KSTAT_DATA_UINT64 },
324 343 { "demand_metadata_hits", KSTAT_DATA_UINT64 },
325 344 { "demand_metadata_misses", KSTAT_DATA_UINT64 },
326 345 { "prefetch_data_hits", KSTAT_DATA_UINT64 },
327 346 { "prefetch_data_misses", KSTAT_DATA_UINT64 },
328 347 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
329 348 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
330 349 { "mru_hits", KSTAT_DATA_UINT64 },
331 350 { "mru_ghost_hits", KSTAT_DATA_UINT64 },
332 351 { "mfu_hits", KSTAT_DATA_UINT64 },
333 352 { "mfu_ghost_hits", KSTAT_DATA_UINT64 },
334 353 { "deleted", KSTAT_DATA_UINT64 },
335 354 { "recycle_miss", KSTAT_DATA_UINT64 },
336 355 { "mutex_miss", KSTAT_DATA_UINT64 },
337 356 { "evict_skip", KSTAT_DATA_UINT64 },
338 357 { "evict_l2_cached", KSTAT_DATA_UINT64 },
339 358 { "evict_l2_eligible", KSTAT_DATA_UINT64 },
340 359 { "evict_l2_ineligible", KSTAT_DATA_UINT64 },
341 360 { "hash_elements", KSTAT_DATA_UINT64 },
342 361 { "hash_elements_max", KSTAT_DATA_UINT64 },
343 362 { "hash_collisions", KSTAT_DATA_UINT64 },
344 363 { "hash_chains", KSTAT_DATA_UINT64 },
345 364 { "hash_chain_max", KSTAT_DATA_UINT64 },
346 365 { "p", KSTAT_DATA_UINT64 },
347 366 { "c", KSTAT_DATA_UINT64 },
348 367 { "c_min", KSTAT_DATA_UINT64 },
349 368 { "c_max", KSTAT_DATA_UINT64 },
350 369 { "size", KSTAT_DATA_UINT64 },
351 370 { "hdr_size", KSTAT_DATA_UINT64 },
352 371 { "data_size", KSTAT_DATA_UINT64 },
353 372 { "other_size", KSTAT_DATA_UINT64 },
354 373 { "l2_hits", KSTAT_DATA_UINT64 },
355 374 { "l2_misses", KSTAT_DATA_UINT64 },
356 375 { "l2_feeds", KSTAT_DATA_UINT64 },
357 376 { "l2_rw_clash", KSTAT_DATA_UINT64 },
358 377 { "l2_read_bytes", KSTAT_DATA_UINT64 },
359 378 { "l2_write_bytes", KSTAT_DATA_UINT64 },
360 379 { "l2_writes_sent", KSTAT_DATA_UINT64 },
361 380 { "l2_writes_done", KSTAT_DATA_UINT64 },
362 381 { "l2_writes_error", KSTAT_DATA_UINT64 },
363 382 { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 },
364 383 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
365 384 { "l2_evict_reading", KSTAT_DATA_UINT64 },
366 385 { "l2_free_on_write", KSTAT_DATA_UINT64 },
367 386 { "l2_abort_lowmem", KSTAT_DATA_UINT64 },
368 387 { "l2_cksum_bad", KSTAT_DATA_UINT64 },
369 388 { "l2_io_error", KSTAT_DATA_UINT64 },
370 389 { "l2_size", KSTAT_DATA_UINT64 },
371 390 { "l2_asize", KSTAT_DATA_UINT64 },
372 391 { "l2_hdr_size", KSTAT_DATA_UINT64 },
373 392 { "l2_compress_successes", KSTAT_DATA_UINT64 },
374 393 { "l2_compress_zeros", KSTAT_DATA_UINT64 },
375 394 { "l2_compress_failures", KSTAT_DATA_UINT64 },
395 + { "l2_meta_writes", KSTAT_DATA_UINT64 },
396 + { "l2_meta_avg_size", KSTAT_DATA_UINT64 },
397 + { "l2_meta_avg_asize", KSTAT_DATA_UINT64 },
398 + { "l2_asize_to_meta_ratio", KSTAT_DATA_UINT64 },
399 + { "l2_rebuild_attempts", KSTAT_DATA_UINT64 },
400 + { "l2_rebuild_successes", KSTAT_DATA_UINT64 },
401 + { "l2_rebuild_unsupported", KSTAT_DATA_UINT64 },
402 + { "l2_rebuild_timeout", KSTAT_DATA_UINT64 },
403 + { "l2_rebuild_arc_bytes", KSTAT_DATA_UINT64 },
404 + { "l2_rebuild_l2arc_bytes", KSTAT_DATA_UINT64 },
405 + { "l2_rebuild_bufs", KSTAT_DATA_UINT64 },
406 + { "l2_rebuild_precached", KSTAT_DATA_UINT64 },
407 + { "l2_rebuild_metabufs", KSTAT_DATA_UINT64 },
408 + { "l2_rebuild_uberblk_errors", KSTAT_DATA_UINT64 },
409 + { "l2_rebuild_io_errors", KSTAT_DATA_UINT64 },
410 + { "l2_rebuild_cksum_errors", KSTAT_DATA_UINT64 },
411 + { "l2_rebuild_loop_errors", KSTAT_DATA_UINT64 },
412 + { "l2_rebuild_abort_lowmem", KSTAT_DATA_UINT64 },
376 413 { "memory_throttle_count", KSTAT_DATA_UINT64 },
377 414 { "duplicate_buffers", KSTAT_DATA_UINT64 },
378 415 { "duplicate_buffers_size", KSTAT_DATA_UINT64 },
379 416 { "duplicate_reads", KSTAT_DATA_UINT64 },
380 417 { "arc_meta_used", KSTAT_DATA_UINT64 },
381 418 { "arc_meta_limit", KSTAT_DATA_UINT64 },
382 419 { "arc_meta_max", KSTAT_DATA_UINT64 }
383 420 };
384 421
385 422 #define ARCSTAT(stat) (arc_stats.stat.value.ui64)
386 423
387 424 #define ARCSTAT_INCR(stat, val) \
388 425 atomic_add_64(&arc_stats.stat.value.ui64, (val))
389 426
390 427 #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
391 428 #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)
392 429
393 430 #define ARCSTAT_MAX(stat, val) { \
394 431 uint64_t m; \
395 432 while ((val) > (m = arc_stats.stat.value.ui64) && \
396 433 (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
397 434 continue; \
398 435 }
399 436
400 437 #define ARCSTAT_MAXSTAT(stat) \
401 438 ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
402 439
403 440 /*
404 441 * We define a macro to allow ARC hits/misses to be easily broken down by
405 442 * two separate conditions, giving a total of four different subtypes for
406 443 * each of hits and misses (so eight statistics total).
407 444 */
408 445 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
409 446 if (cond1) { \
410 447 if (cond2) { \
411 448 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
412 449 } else { \
413 450 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
414 451 } \
415 452 } else { \
416 453 if (cond2) { \
417 454 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
418 455 } else { \
419 456 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
420 457 } \
421 458 }
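For illustration (an editorial addition, not part of the diff): ARCSTAT_CONDSTAT is typically invoked along these lines to split a single event into its demand/prefetch and data/metadata sub-counters, bumping exactly one of the four corresponding kstats (e.g. arcstat_demand_data_hits):

	ARCSTAT_CONDSTAT(!(ab->b_flags & ARC_PREFETCH),
	    demand, prefetch, ab->b_type != ARC_BUFC_METADATA,
	    data, metadata, hits);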
422 459
460 +/*
461 + * This macro allows us to use kstats as floating averages. Each time we
462 + * update this kstat, we first factor it and the update value by
463 + * ARCSTAT_AVG_FACTOR to shrink the new value's contribution to the overall
464 + * average. This macro assumes that integer loads and stores are atomic, but
465 + * is not safe for multiple writers updating the kstat in parallel (only the
466 + * last writer's update will remain).
467 + */
468 +#define ARCSTAT_F_AVG_FACTOR 3
469 +#define ARCSTAT_F_AVG(stat, value) \
470 + do { \
471 + uint64_t x = ARCSTAT(stat); \
472 + x = x - x / ARCSTAT_F_AVG_FACTOR + \
473 + (value) / ARCSTAT_F_AVG_FACTOR; \
474 + ARCSTAT(stat) = x; \
475 + _NOTE(NOTREACHED) \
476 + _NOTE(CONSTCOND) \
477 + } while (0)
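Editorial worked example (not part of the diff): with ARCSTAT_F_AVG_FACTOR == 3 the update above computes x <- x - x/3 + value/3, an integer exponential moving average that weights the stored value by roughly 2/3 and the new sample by 1/3; a stored average of 900 and a new sample of 300 yields 900 - 300 + 100 = 700. A hypothetical call site might look like:

	/* pbuf_size is a placeholder for the metadata buffer's encoded size */
	ARCSTAT_F_AVG(arcstat_l2_meta_avg_size, pbuf_size);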
478 +
423 479 kstat_t *arc_ksp;
424 480 static arc_state_t *arc_anon;
425 481 static arc_state_t *arc_mru;
426 482 static arc_state_t *arc_mru_ghost;
427 483 static arc_state_t *arc_mfu;
428 484 static arc_state_t *arc_mfu_ghost;
429 485 static arc_state_t *arc_l2c_only;
430 486
431 487 /*
432 488 * There are several ARC variables that are critical to export as kstats --
433 489 * but we don't want to have to grovel around in the kstat whenever we wish to
434 490 * manipulate them. For these variables, we therefore define them to be in
435 491 * terms of the statistic variable. This assures that we are not introducing
436 492 * the possibility of inconsistency by having shadow copies of the variables,
437 493 * while still allowing the code to be readable.
438 494 */
439 495 #define arc_size ARCSTAT(arcstat_size) /* actual total arc size */
440 496 #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
441 497 #define arc_c ARCSTAT(arcstat_c) /* target size of cache */
442 498 #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
443 499 #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
444 500 #define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */
445 501 #define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */
446 502 #define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */
447 503
448 504 #define L2ARC_IS_VALID_COMPRESS(_c_) \
449 505 ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
450 506
451 507 static int arc_no_grow; /* Don't try to grow cache size */
452 508 static uint64_t arc_tempreserve;
453 509 static uint64_t arc_loaned_bytes;
454 510
455 511 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
456 512
457 513 typedef struct arc_callback arc_callback_t;
458 514
459 515 struct arc_callback {
460 516 void *acb_private;
461 517 arc_done_func_t *acb_done;
462 518 arc_buf_t *acb_buf;
463 519 zio_t *acb_zio_dummy;
464 520 arc_callback_t *acb_next;
465 521 };
466 522
467 523 typedef struct arc_write_callback arc_write_callback_t;
468 524
469 525 struct arc_write_callback {
470 526 void *awcb_private;
471 527 arc_done_func_t *awcb_ready;
472 528 arc_done_func_t *awcb_done;
473 529 arc_buf_t *awcb_buf;
474 530 };
475 531
476 532 struct arc_buf_hdr {
477 533 /* protected by hash lock */
478 534 dva_t b_dva;
479 535 uint64_t b_birth;
480 536 uint64_t b_cksum0;
481 537
482 538 kmutex_t b_freeze_lock;
483 539 zio_cksum_t *b_freeze_cksum;
484 540 void *b_thawed;
485 541
486 542 arc_buf_hdr_t *b_hash_next;
487 543 arc_buf_t *b_buf;
488 544 uint32_t b_flags;
489 545 uint32_t b_datacnt;
490 546
491 547 arc_callback_t *b_acb;
492 548 kcondvar_t b_cv;
493 549
494 550 /* immutable */
495 551 arc_buf_contents_t b_type;
496 552 uint64_t b_size;
497 553 uint64_t b_spa;
498 554
499 555 /* protected by arc state mutex */
500 556 arc_state_t *b_state;
501 557 list_node_t b_arc_node;
502 558
503 559 /* updated atomically */
504 560 clock_t b_arc_access;
505 561
506 562 /* self protecting */
507 563 refcount_t b_refcnt;
508 564
509 565 l2arc_buf_hdr_t *b_l2hdr;
510 566 list_node_t b_l2node;
511 567 };
512 568
513 569 static arc_buf_t *arc_eviction_list;
514 570 static kmutex_t arc_eviction_mtx;
515 571 static arc_buf_hdr_t arc_eviction_hdr;
516 572 static void arc_get_data_buf(arc_buf_t *buf);
517 573 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
518 574 static int arc_evict_needed(arc_buf_contents_t type);
519 575 static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
520 576 static void arc_buf_watch(arc_buf_t *buf);
521 577
522 578 static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
523 579
524 580 #define GHOST_STATE(state) \
525 581 ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
526 582 (state) == arc_l2c_only)
527 583
528 584 /*
529 585 * Private ARC flags. These flags are private ARC only flags that will show up
530 586 * in b_flags in the arc_hdr_buf_t. Some flags are publicly declared, and can
531 587 * be passed in as arc_flags in things like arc_read. However, these flags
532 588 * should never be passed and should only be set by ARC code. When adding new
533 589 * public flags, make sure not to smash the private ones.
534 590 */
535 591
536 592 #define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */
537 593 #define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */
538 594 #define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */
539 595 #define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */
540 596 #define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */
541 597 #define ARC_INDIRECT (1 << 14) /* this is an indirect block */
542 598 #define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */
543 599 #define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */
544 600 #define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */
545 601 #define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */
546 602
547 603 #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE)
548 604 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
549 605 #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR)
550 606 #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH)
551 607 #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ)
552 608 #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE)
553 609 #define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
554 610 #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE)
555 611 #define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \
556 612 (hdr)->b_l2hdr != NULL)
557 613 #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING)
558 614 #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED)
559 615 #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD)
560 616
561 617 /*
562 618 * Other sizes
563 619 */
564 620
565 621 #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
566 622 #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
567 623
568 624 /*
569 625 * Hash table routines
570 626 */
571 627
572 628 #define HT_LOCK_PAD 64
573 629
574 630 struct ht_lock {
575 631 kmutex_t ht_lock;
576 632 #ifdef _KERNEL
577 633 unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
578 634 #endif
579 635 };
580 636
581 637 #define BUF_LOCKS 256
582 638 typedef struct buf_hash_table {
583 639 uint64_t ht_mask;
584 640 arc_buf_hdr_t **ht_table;
585 641 struct ht_lock ht_locks[BUF_LOCKS];
586 642 } buf_hash_table_t;
587 643
588 644 static buf_hash_table_t buf_hash_table;
589 645
590 646 #define BUF_HASH_INDEX(spa, dva, birth) \
591 647 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
592 648 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
593 649 #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
594 650 #define HDR_LOCK(hdr) \
595 651 (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
596 652
597 653 uint64_t zfs_crc64_table[256];
598 654
599 655 /*
600 656 * Level 2 ARC
601 657 */
602 658
603 659 #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */
604 660 #define L2ARC_HEADROOM 2 /* num of writes */
605 661 /*
606 662 * If we discover during ARC scan any buffers to be compressed, we boost
607 663 * our headroom for the next scanning cycle by this percentage multiple.
608 664 */
609 665 #define L2ARC_HEADROOM_BOOST 200
610 666 #define L2ARC_FEED_SECS 1 /* caching interval secs */
611 667 #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */
612 668
613 669 #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
614 670 #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
615 671
616 672 /* L2ARC Performance Tunables */
617 673 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */
618 674 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
619 675 uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */
620 676 uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
621 677 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
622 678 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
623 679 boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
624 680 boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */
625 681 boolean_t l2arc_norw = B_TRUE; /* no reads during writes */
626 682
627 683 /*
628 684 * L2ARC Internals
629 685 */
630 -typedef struct l2arc_dev {
631 - vdev_t *l2ad_vdev; /* vdev */
632 - spa_t *l2ad_spa; /* spa */
633 - uint64_t l2ad_hand; /* next write location */
634 - uint64_t l2ad_start; /* first addr on device */
635 - uint64_t l2ad_end; /* last addr on device */
636 - uint64_t l2ad_evict; /* last addr eviction reached */
637 - boolean_t l2ad_first; /* first sweep through */
638 - boolean_t l2ad_writing; /* currently writing */
639 - list_t *l2ad_buflist; /* buffer list */
640 - list_node_t l2ad_node; /* device list node */
641 -} l2arc_dev_t;
642 -
686 +typedef struct l2arc_dev l2arc_dev_t;
643 687 static list_t L2ARC_dev_list; /* device list */
644 688 static list_t *l2arc_dev_list; /* device list pointer */
645 689 static kmutex_t l2arc_dev_mtx; /* device list mutex */
646 690 static l2arc_dev_t *l2arc_dev_last; /* last device used */
647 691 static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */
648 692 static list_t L2ARC_free_on_write; /* free after write buf list */
649 693 static list_t *l2arc_free_on_write; /* free after write list ptr */
650 694 static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
651 695 static uint64_t l2arc_ndev; /* number of devices */
652 696
653 697 typedef struct l2arc_read_callback {
654 698 arc_buf_t *l2rcb_buf; /* read buffer */
655 699 spa_t *l2rcb_spa; /* spa */
656 700 blkptr_t l2rcb_bp; /* original blkptr */
657 701 zbookmark_t l2rcb_zb; /* original bookmark */
658 702 int l2rcb_flags; /* original flags */
659 703 enum zio_compress l2rcb_compress; /* applied compress */
660 704 } l2arc_read_callback_t;
661 705
662 706 typedef struct l2arc_write_callback {
663 707 l2arc_dev_t *l2wcb_dev; /* device info */
664 708 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
709 + uint8_t *l2wcb_pbuf; /* pbuf sent in this write */
710 + uint32_t l2wcb_pbuf_size; /* size of committed pbuf */
711 + uint8_t *l2wcb_ub_buf; /* uberblock in this write */
665 712 } l2arc_write_callback_t;
666 713
667 714 struct l2arc_buf_hdr {
668 715 /* protected by arc_buf_hdr mutex */
669 716 l2arc_dev_t *b_dev; /* L2ARC device */
670 717 uint64_t b_daddr; /* disk address, offset byte */
671 718 /* compression applied to buffer data */
672 719 enum zio_compress b_compress;
673 720 /* real alloc'd buffer size depending on b_compress applied */
674 721 int b_asize;
675 722 /* temporary buffer holder for in-flight compressed data */
676 723 void *b_tmp_cdata;
677 724 };
678 725
679 726 typedef struct l2arc_data_free {
680 727 /* protected by l2arc_free_on_write_mtx */
681 728 void *l2df_data;
682 729 size_t l2df_size;
683 730 void (*l2df_func)(void *, size_t);
684 731 list_node_t l2df_list_node;
685 732 } l2arc_data_free_t;
686 733
687 734 static kmutex_t l2arc_feed_thr_lock;
688 735 static kcondvar_t l2arc_feed_thr_cv;
689 736 static uint8_t l2arc_thread_exit;
690 737
691 738 static void l2arc_read_done(zio_t *zio);
692 -static void l2arc_hdr_stat_add(void);
739 +static void l2arc_hdr_stat_add(boolean_t from_arc);
693 740 static void l2arc_hdr_stat_remove(void);
694 741
695 742 static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
696 743 static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
697 744 enum zio_compress c);
698 745 static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
699 746
747 +typedef enum {
748 + L2UBLK_BIG_ENDIAN = (1 << 0), /* little endian assumed otherwise */
749 + L2UBLK_EVICT_FIRST = (1 << 1) /* mirror of l2ad_first in l2dev */
750 +} l2uberblock_flags_t;
751 +
752 +typedef struct l2uberblock {
753 + uint32_t ub_magic;
754 + uint8_t ub_version;
755 + l2uberblock_flags_t ub_flags;
756 +
757 + uint64_t ub_spa_guid;
758 + uint64_t ub_birth;
759 + uint64_t ub_evict_tail; /* current evict pointer */
760 + uint64_t ub_alloc_space; /* vdev space alloc status */
761 + uint64_t ub_pbuf_daddr; /* address of newest pbuf */
762 + uint32_t ub_pbuf_asize; /* size of newest pbuf */
763 + zio_cksum_t ub_pbuf_cksum; /* fletcher4 of newest pbuf */
764 +
765 + zio_cksum_t ub_cksum; /* cksum of uberblock */
766 +} l2uberblock_t;
767 +
768 +typedef enum {
769 + L2PBUF_BIG_ENDIAN = (1 << 0), /* little endian assumed otherwise */
770 + L2PBUF_COMPRESSED = (1 << 1) /* pbuf data items are compressed */
771 +} l2pbuf_flags_t;
772 +
773 +typedef struct l2pbuf {
774 + uint32_t pb_magic;
775 + unsigned int pb_version;
776 + l2pbuf_flags_t pb_flags;
777 +
778 + uint64_t pb_prev_daddr; /* address of previous pbuf */
779 + uint32_t pb_prev_asize; /* size of previous pbuf */
780 + zio_cksum_t pb_prev_cksum; /* fletcher4 of prev. pbuf */
781 +
782 + /*
783 + * This is a set of item lists that are contained in this pbuf. Each
784 + * L2ARC write appends a new l2pbuf_buflist_t array of l2pbuf_buf_t's.
785 + * This serves as a soft timeout feature - once the limit of the
786 + * number of item lists that a pbuf can hold is reached, the pbuf is
787 + * flushed to stable storage, regardless of its total size.
788 + */
789 + list_t *pb_buflists_list;
790 +
791 + /*
792 + * Number of compressed bytes referenced by items in this pbuf and
793 + * the number of lists present.
794 + * This is not actually written to storage, it is only used by
795 + * internal algorithms which check for when a pbuf reaches a
796 + * certain size limit, after which it is flushed in a write.
797 + */
798 + uint64_t pb_payload_asz;
799 + /* Same thing for number of buflists */
800 + int pb_nbuflists;
801 +
802 + /*
803 + * Filled in by l2arc_pbuf_read to hold this pbuf's alloc'd size.
804 + * This is then used by l2arc_pbuf_restore to update used space
805 + * on the L2ARC vdev.
806 + */
807 + size_t pb_asize;
808 +} l2pbuf_t;
809 +
810 +typedef struct l2pbuf_buf l2pbuf_buf_t;
811 +typedef struct l2pbuf_buflist {
812 + uint32_t l2pbl_nbufs;
813 + l2pbuf_buf_t *l2pbl_bufs;
814 + list_node_t l2pbl_node;
815 +} l2pbuf_buflist_t;
816 +
817 +struct l2pbuf_buf {
818 + dva_t b_dva; /* dva of buffer */
819 + uint64_t b_birth; /* birth txg of buffer */
820 + uint64_t b_cksum0;
821 + zio_cksum_t b_freeze_cksum;
822 + uint32_t b_size; /* uncompressed buf size */
823 + uint64_t b_l2daddr; /* buf location on l2dev */
824 + uint32_t b_l2asize; /* actual buf data size */
825 + enum zio_compress b_l2compress; /* compression applied */
826 + uint16_t b_contents_type;
827 + uint32_t b_flags;
828 +};
829 +
830 +struct l2arc_dev {
831 + vdev_t *l2ad_vdev; /* vdev */
832 + spa_t *l2ad_spa; /* spa */
833 + uint64_t l2ad_hand; /* next write location */
834 + uint64_t l2ad_start; /* first addr on device */
835 + uint64_t l2ad_end; /* last addr on device */
836 + uint64_t l2ad_evict; /* last addr eviction reached */
837 + boolean_t l2ad_first; /* first sweep through */
838 + boolean_t l2ad_writing; /* currently writing */
839 + list_t *l2ad_buflist; /* buffer list */
840 + list_node_t l2ad_node; /* device list node */
841 + l2pbuf_t l2ad_pbuf; /* currently open pbuf */
842 + uint64_t l2ad_pbuf_daddr; /* prev pbuf daddr */
843 + uint64_t l2ad_pbuf_asize; /* prev pbuf asize */
844 + zio_cksum_t l2ad_pbuf_cksum; /* prev pbuf cksum */
845 + /* uberblock birth counter - incremented for each committed uberblk */
846 + uint64_t l2ad_uberblock_birth;
847 + /* flag indicating whether a rebuild is currently going on */
848 + boolean_t l2ad_rebuilding;
849 +};
850 +
851 +/* Stores information about an L2ARC prefetch zio */
852 +typedef struct l2arc_prefetch_info {
853 + uint8_t *pi_buf; /* where the zio writes to */
854 + uint64_t pi_buflen; /* length of `buf' */
855 + zio_t *pi_hdr_io; /* see l2arc_pbuf_read below */
856 +} l2arc_prefetch_info_t;
857 +
858 +/* 256 x 4k of l2uberblocks */
859 +#define L2UBERBLOCK_SIZE 4096
860 +#define L2UBERBLOCK_MAGIC 0x12bab10c
861 +#define L2UBERBLOCK_MAX_VERSION 1 /* our maximum uberblock version */
862 +#define L2PBUF_MAGIC 0xdb0faba6
863 +#define L2PBUF_MAX_VERSION 1 /* our maximum pbuf version */
864 +#define L2PBUF_BUF_SIZE 88 /* size of one pbuf buf entry */
865 +#define L2PBUF_HDR_SIZE 56 /* pbuf header excluding any payload */
866 +#define L2PBUF_ENCODED_SIZE(_pb) \
867 + (L2PBUF_HDR_SIZE + l2arc_pbuf_items_encoded_size(_pb))
868 +/*
869 + * Allocation limit for the payload of a pbuf. This also fundamentally
870 + * limits the number of bufs we can reference in a pbuf.
871 + */
872 +#define L2PBUF_MAX_PAYLOAD_SIZE (24 * 1024 * 1024)
873 +#define L2PBUF_MAX_BUFS (L2PBUF_MAX_PAYLOAD_SIZE / L2PBUF_BUF_SIZE)
874 +#define L2PBUF_COMPRESS_MINSZ 8192 /* minimum size to compress a pbuf */
875 +#define L2PBUF_MAXSZ (100 * 1024 * 1024) /* maximum pbuf size */
876 +#define L2PBUF_MAX_BUFLISTS 128 /* max number of buflists per pbuf */
877 +#define L2ARC_REBUILD_TIMEOUT 60 /* a rebuild may take at most 60s */
878 +#define L2PBUF_IS_FULL(_pb) \
879 + ((_pb)->pb_payload_asz > l2arc_pbuf_max_sz || \
880 + (_pb)->pb_nbuflists + 1 >= l2arc_pbuf_max_buflists)
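Editorial sketch (not in the diff) of the intended use of L2PBUF_IS_FULL in the write path, assuming the helpers declared further below: once the currently open pbuf exceeds its payload or buflist limits, it is committed to the device along with a fresh uberblock, then reinitialized for subsequent feed cycles.

	if (L2PBUF_IS_FULL(&dev->l2ad_pbuf)) {
		l2arc_pbuf_commit(dev, pio, cb);	/* write out the open pbuf */
		l2arc_uberblock_update(dev, pio, cb);	/* point the uberblock at it */
		l2arc_pbuf_destroy(&dev->l2ad_pbuf);
		l2arc_pbuf_init(&dev->l2ad_pbuf);	/* start a new open pbuf */
	}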
881 +/*
882 + * These are the flags we allow to persist in L2ARC pbufs. The other flags
883 + * of an ARC buffer pertain to the buffer's runtime behavior.
884 + */
885 +#define L2ARC_PERSIST_FLAGS \
886 + (ARC_IN_HASH_TABLE | ARC_L2CACHE | ARC_L2COMPRESS | ARC_PREFETCH)
887 +
888 +/*
889 + * Used during L2ARC rebuild after each read operation to check whether we
890 + * haven't exceeded the rebuild timeout value.
891 + */
892 +#define L2ARC_CHK_REBUILD_TIMEOUT(_deadline_, ...) \
893 + do { \
894 + if ((_deadline_) != 0 && (_deadline_) < ddi_get_lbolt64()) { \
895 + __VA_ARGS__; \
896 + ARCSTAT_BUMP(arcstat_l2_rebuild_timeout); \
897 + cmn_err(CE_WARN, "L2ARC rebuild is taking too long, " \
898 + "dropping remaining L2ARC metadata."); \
899 + return; \
900 + } \
901 + _NOTE(NOTREACHED) \
902 + _NOTE(CONSTCOND) \
903 + } while (0)
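Editorial sketch (not in the diff) of how the timeout check above might be driven from l2arc_rebuild(): a deadline is computed once from the l2arc_rebuild_timeout tunable, then re-checked after each metadata read, with cleanup actions passed as the variadic arguments.

	int64_t deadline = ddi_get_lbolt64() + l2arc_rebuild_timeout * hz;
	...
	/* after each pbuf read */
	L2ARC_CHK_REBUILD_TIMEOUT(deadline, l2arc_pbuf_destroy(&pb));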
904 +
905 +/*
906 + * Performance tuning of L2ARC persistency:
907 + *
908 + * l2arc_pbuf_compress_minsz : Minimum size of a pbuf in order to attempt
909 + * compressing it.
910 + * l2arc_pbuf_max_sz : Upper bound on the physical size of L2ARC buffers
911 + * referenced from a pbuf. Once a pbuf reaches this size, it is
912 + * committed to stable storage. Ideally, there should be approx.
913 + * l2arc_dev_size / l2arc_pbuf_max_sz pbufs on an L2ARC device.
914 + * l2arc_pbuf_max_buflists : Maximum number of L2ARC feed cycles that will
915 + * be buffered in a pbuf before it is committed to L2ARC. This
916 + * puts a soft temporal upper bound on pbuf commit intervals.
917 + * l2arc_rebuild_enabled : Controls whether L2ARC device adds (either at
918 + * pool import or when adding one manually later) will attempt
919 + * to rebuild L2ARC buffer contents. In special circumstances,
920 + * the administrator may want to set this to B_FALSE, if they
921 + * are having trouble importing a pool or attaching an L2ARC
922 + * device (e.g. the L2ARC device is slow to read in stored pbuf
923 + * metadata, or the metadata has become somehow
924 + * fragmented/unusable).
925 + * l2arc_rebuild_timeout : A hard timeout value on L2ARC rebuilding to help
926 + * avoid a slow L2ARC device from preventing pool import. If we
927 + * are not done rebuilding an L2ARC device by this time, we
928 + * stop the rebuild and return immediately.
929 + */
930 +uint64_t l2arc_pbuf_compress_minsz = L2PBUF_COMPRESS_MINSZ;
931 +uint64_t l2arc_pbuf_max_sz = L2PBUF_MAXSZ;
932 +uint64_t l2arc_pbuf_max_buflists = L2PBUF_MAX_BUFLISTS;
933 +boolean_t l2arc_rebuild_enabled = B_TRUE;
934 +uint64_t l2arc_rebuild_timeout = L2ARC_REBUILD_TIMEOUT;
935 +
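As with other ZFS kernel tunables on illumos, these variables could presumably be adjusted from /etc/system (an assumed administrative example, not documented in this webrev), e.g. to disable rebuilds while diagnosing a slow cache device:

	set zfs:l2arc_rebuild_enabled = 0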
936 +static void l2arc_rebuild_start(l2arc_dev_t *dev);
937 +static void l2arc_rebuild(l2arc_dev_t *dev);
938 +static void l2arc_pbuf_restore(l2arc_dev_t *dev, l2pbuf_t *pb);
939 +static void l2arc_hdr_restore(const l2pbuf_buf_t *buf, l2arc_dev_t *dev,
940 + uint64_t guid);
941 +
942 +static int l2arc_uberblock_find(l2arc_dev_t *dev, l2uberblock_t *ub);
943 +static int l2arc_pbuf_read(l2arc_dev_t *dev, uint64_t daddr, uint32_t asize,
944 + zio_cksum_t cksum, l2pbuf_t *pb, zio_t *this_io, zio_t **next_io);
945 +static int l2arc_pbuf_ptr_valid(l2arc_dev_t *dev, uint64_t daddr,
946 + uint32_t asize);
947 +static zio_t *l2arc_pbuf_prefetch(vdev_t *vd, uint64_t daddr, uint32_t asize);
948 +static void l2arc_pbuf_prefetch_abort(zio_t *zio);
949 +
950 +static void l2arc_uberblock_encode(const l2uberblock_t *ub, uint8_t *buf);
951 +static void l2arc_uberblock_decode(const uint8_t *buf, l2uberblock_t *ub);
952 +static int l2arc_uberblock_verify(const uint8_t *buf, const l2uberblock_t *ub,
953 + uint64_t guid);
954 +static void l2arc_uberblock_update(l2arc_dev_t *dev, zio_t *pio,
955 + l2arc_write_callback_t *cb);
956 +
957 +static uint32_t l2arc_pbuf_encode(l2pbuf_t *pb, uint8_t *buf, uint32_t buflen);
958 +static int l2arc_pbuf_decode(uint8_t *buf, uint32_t buflen,
959 + l2pbuf_t *pbuf);
960 +static int l2arc_pbuf_decode_prev_ptr(const uint8_t *buf, size_t buflen,
961 + uint64_t *daddr, uint32_t *asize, zio_cksum_t *cksum);
962 +static void l2arc_pbuf_init(l2pbuf_t *pb);
963 +static void l2arc_pbuf_destroy(l2pbuf_t *pb);
964 +static void l2arc_pbuf_commit(l2arc_dev_t *dev, zio_t *pio,
965 + l2arc_write_callback_t *cb);
966 +static l2pbuf_buflist_t *l2arc_pbuf_buflist_alloc(l2pbuf_t *pb, int nbufs);
967 +static void l2arc_pbuflist_insert(l2pbuf_t *pb, l2pbuf_buflist_t *pbl,
968 + const arc_buf_hdr_t *ab, int index);
969 +static uint32_t l2arc_pbuf_items_encoded_size(l2pbuf_t *pb);
970 +
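Editorial sketch (not part of the patch) of how the declarations above are intended to fit together at rebuild time: the device's uberblock names the newest pbuf, and each pbuf names its predecessor, so the rebuild can walk the chain backwards and re-create ARC headers until the chain ends, an error occurs, or the timeout expires.

	l2uberblock_t ub;
	l2pbuf_t pb;

	if (l2arc_uberblock_find(dev, &ub) == 0) {
		uint64_t daddr = ub.ub_pbuf_daddr;
		uint32_t asize = ub.ub_pbuf_asize;
		zio_cksum_t cksum = ub.ub_pbuf_cksum;
		zio_t *this_io = NULL, *next_io = NULL;

		while (l2arc_pbuf_read(dev, daddr, asize, cksum, &pb,
		    this_io, &next_io) == 0) {
			daddr = pb.pb_prev_daddr;	/* follow the chain backwards */
			asize = pb.pb_prev_asize;
			cksum = pb.pb_prev_cksum;
			l2arc_pbuf_restore(dev, &pb);	/* re-insert ARC headers */
			this_io = next_io;
		}
	}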
700 971 static uint64_t
701 972 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
702 973 {
703 974 uint8_t *vdva = (uint8_t *)dva;
704 975 uint64_t crc = -1ULL;
705 976 int i;
706 977
707 978 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
708 979
709 980 for (i = 0; i < sizeof (dva_t); i++)
710 981 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
711 982
712 983 crc ^= (spa>>8) ^ birth;
713 984
714 985 return (crc);
715 986 }
716 987
717 988 #define BUF_EMPTY(buf) \
718 989 ((buf)->b_dva.dva_word[0] == 0 && \
719 990 (buf)->b_dva.dva_word[1] == 0 && \
720 991 (buf)->b_birth == 0)
721 992
722 993 #define BUF_EQUAL(spa, dva, birth, buf) \
723 994 ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
724 995 ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
725 996 ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
726 997
727 998 static void
728 999 buf_discard_identity(arc_buf_hdr_t *hdr)
729 1000 {
730 1001 hdr->b_dva.dva_word[0] = 0;
731 1002 hdr->b_dva.dva_word[1] = 0;
732 1003 hdr->b_birth = 0;
733 1004 hdr->b_cksum0 = 0;
734 1005 }
735 1006
736 1007 static arc_buf_hdr_t *
737 1008 buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
738 1009 {
739 1010 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
740 1011 kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
741 1012 arc_buf_hdr_t *buf;
742 1013
743 1014 mutex_enter(hash_lock);
744 1015 for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
745 1016 buf = buf->b_hash_next) {
746 1017 if (BUF_EQUAL(spa, dva, birth, buf)) {
747 1018 *lockp = hash_lock;
748 1019 return (buf);
749 1020 }
750 1021 }
751 1022 mutex_exit(hash_lock);
752 1023 *lockp = NULL;
753 1024 return (NULL);
754 1025 }
755 1026
756 1027 /*
757 1028 * Insert an entry into the hash table. If there is already an element
758 1029 * equal to elem in the hash table, then the already existing element
759 1030 * will be returned and the new element will not be inserted.
760 1031 * Otherwise returns NULL.
761 1032 */
762 1033 static arc_buf_hdr_t *
763 1034 buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
764 1035 {
765 1036 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
766 1037 kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
767 1038 arc_buf_hdr_t *fbuf;
768 1039 uint32_t i;
769 1040
770 1041 ASSERT(!HDR_IN_HASH_TABLE(buf));
771 1042 *lockp = hash_lock;
772 1043 mutex_enter(hash_lock);
773 1044 for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
774 1045 fbuf = fbuf->b_hash_next, i++) {
775 1046 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
776 1047 return (fbuf);
777 1048 }
778 1049
779 1050 buf->b_hash_next = buf_hash_table.ht_table[idx];
780 1051 buf_hash_table.ht_table[idx] = buf;
781 1052 buf->b_flags |= ARC_IN_HASH_TABLE;
782 1053
783 1054 /* collect some hash table performance data */
784 1055 if (i > 0) {
785 1056 ARCSTAT_BUMP(arcstat_hash_collisions);
786 1057 if (i == 1)
787 1058 ARCSTAT_BUMP(arcstat_hash_chains);
788 1059
789 1060 ARCSTAT_MAX(arcstat_hash_chain_max, i);
790 1061 }
791 1062
792 1063 ARCSTAT_BUMP(arcstat_hash_elements);
793 1064 ARCSTAT_MAXSTAT(arcstat_hash_elements);
794 1065
795 1066 return (NULL);
796 1067 }
797 1068
798 1069 static void
799 1070 buf_hash_remove(arc_buf_hdr_t *buf)
800 1071 {
801 1072 arc_buf_hdr_t *fbuf, **bufp;
802 1073 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
803 1074
804 1075 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
805 1076 ASSERT(HDR_IN_HASH_TABLE(buf));
806 1077
807 1078 bufp = &buf_hash_table.ht_table[idx];
808 1079 while ((fbuf = *bufp) != buf) {
809 1080 ASSERT(fbuf != NULL);
810 1081 bufp = &fbuf->b_hash_next;
811 1082 }
812 1083 *bufp = buf->b_hash_next;
813 1084 buf->b_hash_next = NULL;
814 1085 buf->b_flags &= ~ARC_IN_HASH_TABLE;
815 1086
816 1087 /* collect some hash table performance data */
817 1088 ARCSTAT_BUMPDOWN(arcstat_hash_elements);
818 1089
819 1090 if (buf_hash_table.ht_table[idx] &&
820 1091 buf_hash_table.ht_table[idx]->b_hash_next == NULL)
821 1092 ARCSTAT_BUMPDOWN(arcstat_hash_chains);
822 1093 }
823 1094
824 1095 /*
825 1096 * Global data structures and functions for the buf kmem cache.
826 1097 */
827 1098 static kmem_cache_t *hdr_cache;
828 1099 static kmem_cache_t *buf_cache;
829 1100
830 1101 static void
831 1102 buf_fini(void)
832 1103 {
833 1104 int i;
834 1105
835 1106 kmem_free(buf_hash_table.ht_table,
836 1107 (buf_hash_table.ht_mask + 1) * sizeof (void *));
837 1108 for (i = 0; i < BUF_LOCKS; i++)
838 1109 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
839 1110 kmem_cache_destroy(hdr_cache);
840 1111 kmem_cache_destroy(buf_cache);
841 1112 }
842 1113
843 1114 /*
844 1115 * Constructor callback - called when the cache is empty
845 1116 * and a new buf is requested.
846 1117 */
847 1118 /* ARGSUSED */
848 1119 static int
849 1120 hdr_cons(void *vbuf, void *unused, int kmflag)
850 1121 {
851 1122 arc_buf_hdr_t *buf = vbuf;
852 1123
853 1124 bzero(buf, sizeof (arc_buf_hdr_t));
854 1125 refcount_create(&buf->b_refcnt);
855 1126 cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
856 1127 mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
857 1128 arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
858 1129
859 1130 return (0);
860 1131 }
861 1132
862 1133 /* ARGSUSED */
863 1134 static int
864 1135 buf_cons(void *vbuf, void *unused, int kmflag)
865 1136 {
866 1137 arc_buf_t *buf = vbuf;
867 1138
868 1139 bzero(buf, sizeof (arc_buf_t));
869 1140 mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
870 1141 arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
871 1142
872 1143 return (0);
873 1144 }
874 1145
875 1146 /*
876 1147 * Destructor callback - called when a cached buf is
877 1148 * no longer required.
878 1149 */
879 1150 /* ARGSUSED */
880 1151 static void
881 1152 hdr_dest(void *vbuf, void *unused)
882 1153 {
883 1154 arc_buf_hdr_t *buf = vbuf;
884 1155
885 1156 ASSERT(BUF_EMPTY(buf));
886 1157 refcount_destroy(&buf->b_refcnt);
887 1158 cv_destroy(&buf->b_cv);
888 1159 mutex_destroy(&buf->b_freeze_lock);
889 1160 arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
890 1161 }
891 1162
892 1163 /* ARGSUSED */
893 1164 static void
894 1165 buf_dest(void *vbuf, void *unused)
895 1166 {
896 1167 arc_buf_t *buf = vbuf;
897 1168
898 1169 mutex_destroy(&buf->b_evict_lock);
899 1170 arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
900 1171 }
901 1172
902 1173 /*
903 1174 * Reclaim callback -- invoked when memory is low.
904 1175 */
905 1176 /* ARGSUSED */
906 1177 static void
907 1178 hdr_recl(void *unused)
908 1179 {
909 1180 dprintf("hdr_recl called\n");
910 1181 /*
911 1182 * umem calls the reclaim func when we destroy the buf cache,
912 1183 * which is after we do arc_fini().
913 1184 */
914 1185 if (!arc_dead)
915 1186 cv_signal(&arc_reclaim_thr_cv);
916 1187 }
917 1188
918 1189 static void
919 1190 buf_init(void)
920 1191 {
921 1192 uint64_t *ct;
922 1193 uint64_t hsize = 1ULL << 12;
923 1194 int i, j;
924 1195
925 1196 /*
926 1197 * The hash table is big enough to fill all of physical memory
927 1198 * with an average 64K block size. The table will take up
928 1199 * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
929 1200 */
930 1201 while (hsize * 65536 < physmem * PAGESIZE)
931 1202 hsize <<= 1;
932 1203 retry:
933 1204 buf_hash_table.ht_mask = hsize - 1;
934 1205 buf_hash_table.ht_table =
935 1206 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
936 1207 if (buf_hash_table.ht_table == NULL) {
937 1208 ASSERT(hsize > (1ULL << 8));
938 1209 hsize >>= 1;
939 1210 goto retry;
940 1211 }
941 1212
942 1213 hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
943 1214 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
944 1215 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
945 1216 0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
946 1217
947 1218 for (i = 0; i < 256; i++)
948 1219 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
949 1220 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
950 1221
951 1222 for (i = 0; i < BUF_LOCKS; i++) {
952 1223 mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
953 1224 NULL, MUTEX_DEFAULT, NULL);
954 1225 }
955 1226 }
956 1227
957 1228 #define ARC_MINTIME (hz>>4) /* 62 ms */
958 1229
959 1230 static void
960 1231 arc_cksum_verify(arc_buf_t *buf)
961 1232 {
962 1233 zio_cksum_t zc;
963 1234
964 1235 if (!(zfs_flags & ZFS_DEBUG_MODIFY))
965 1236 return;
966 1237
967 1238 mutex_enter(&buf->b_hdr->b_freeze_lock);
968 1239 if (buf->b_hdr->b_freeze_cksum == NULL ||
969 1240 (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
970 1241 mutex_exit(&buf->b_hdr->b_freeze_lock);
971 1242 return;
972 1243 }
973 1244 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
974 1245 if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
975 1246 panic("buffer modified while frozen!");
976 1247 mutex_exit(&buf->b_hdr->b_freeze_lock);
977 1248 }
978 1249
979 1250 static int
980 1251 arc_cksum_equal(arc_buf_t *buf)
981 1252 {
982 1253 zio_cksum_t zc;
983 1254 int equal;
984 1255
985 1256 mutex_enter(&buf->b_hdr->b_freeze_lock);
986 1257 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
987 1258 equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
988 1259 mutex_exit(&buf->b_hdr->b_freeze_lock);
989 1260
990 1261 return (equal);
991 1262 }
992 1263
993 1264 static void
994 1265 arc_cksum_compute(arc_buf_t *buf, boolean_t force)
995 1266 {
996 1267 if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
997 1268 return;
998 1269
999 1270 mutex_enter(&buf->b_hdr->b_freeze_lock);
1000 1271 if (buf->b_hdr->b_freeze_cksum != NULL) {
1001 1272 mutex_exit(&buf->b_hdr->b_freeze_lock);
1002 1273 return;
1003 1274 }
1004 1275 buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
1005 1276 fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
1006 1277 buf->b_hdr->b_freeze_cksum);
1007 1278 mutex_exit(&buf->b_hdr->b_freeze_lock);
1008 1279 arc_buf_watch(buf);
1009 1280 }
1010 1281
1011 1282 #ifndef _KERNEL
1012 1283 typedef struct procctl {
1013 1284 long cmd;
1014 1285 prwatch_t prwatch;
1015 1286 } procctl_t;
1016 1287 #endif
1017 1288
1018 1289 /* ARGSUSED */
1019 1290 static void
1020 1291 arc_buf_unwatch(arc_buf_t *buf)
1021 1292 {
1022 1293 #ifndef _KERNEL
1023 1294 if (arc_watch) {
1024 1295 int result;
1025 1296 procctl_t ctl;
1026 1297 ctl.cmd = PCWATCH;
1027 1298 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1028 1299 ctl.prwatch.pr_size = 0;
1029 1300 ctl.prwatch.pr_wflags = 0;
1030 1301 result = write(arc_procfd, &ctl, sizeof (ctl));
1031 1302 ASSERT3U(result, ==, sizeof (ctl));
1032 1303 }
1033 1304 #endif
1034 1305 }
1035 1306
1036 1307 /* ARGSUSED */
1037 1308 static void
1038 1309 arc_buf_watch(arc_buf_t *buf)
1039 1310 {
1040 1311 #ifndef _KERNEL
1041 1312 if (arc_watch) {
1042 1313 int result;
1043 1314 procctl_t ctl;
1044 1315 ctl.cmd = PCWATCH;
1045 1316 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1046 1317 ctl.prwatch.pr_size = buf->b_hdr->b_size;
1047 1318 ctl.prwatch.pr_wflags = WA_WRITE;
1048 1319 result = write(arc_procfd, &ctl, sizeof (ctl));
1049 1320 ASSERT3U(result, ==, sizeof (ctl));
1050 1321 }
1051 1322 #endif
1052 1323 }
1053 1324
1054 1325 void
1055 1326 arc_buf_thaw(arc_buf_t *buf)
1056 1327 {
1057 1328 if (zfs_flags & ZFS_DEBUG_MODIFY) {
1058 1329 if (buf->b_hdr->b_state != arc_anon)
1059 1330 panic("modifying non-anon buffer!");
1060 1331 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
1061 1332 panic("modifying buffer while i/o in progress!");
1062 1333 arc_cksum_verify(buf);
1063 1334 }
1064 1335
1065 1336 mutex_enter(&buf->b_hdr->b_freeze_lock);
1066 1337 if (buf->b_hdr->b_freeze_cksum != NULL) {
1067 1338 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1068 1339 buf->b_hdr->b_freeze_cksum = NULL;
1069 1340 }
1070 1341
1071 1342 if (zfs_flags & ZFS_DEBUG_MODIFY) {
1072 1343 if (buf->b_hdr->b_thawed)
1073 1344 kmem_free(buf->b_hdr->b_thawed, 1);
1074 1345 buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
1075 1346 }
1076 1347
1077 1348 mutex_exit(&buf->b_hdr->b_freeze_lock);
1078 1349
1079 1350 arc_buf_unwatch(buf);
1080 1351 }
1081 1352
1082 1353 void
1083 1354 arc_buf_freeze(arc_buf_t *buf)
1084 1355 {
1085 1356 kmutex_t *hash_lock;
1086 1357
1087 1358 if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1088 1359 return;
1089 1360
1090 1361 hash_lock = HDR_LOCK(buf->b_hdr);
1091 1362 mutex_enter(hash_lock);
1092 1363
1093 1364 ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1094 1365 buf->b_hdr->b_state == arc_anon);
1095 1366 arc_cksum_compute(buf, B_FALSE);
1096 1367 mutex_exit(hash_lock);
1097 1368
1098 1369 }
1099 1370
1100 1371 static void
1101 1372 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1102 1373 {
1103 1374 ASSERT(MUTEX_HELD(hash_lock));
1104 1375
1105 1376 if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
1106 1377 (ab->b_state != arc_anon)) {
1107 1378 uint64_t delta = ab->b_size * ab->b_datacnt;
1108 1379 list_t *list = &ab->b_state->arcs_list[ab->b_type];
1109 1380 uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
1110 1381
1111 1382 ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
1112 1383 mutex_enter(&ab->b_state->arcs_mtx);
1113 1384 ASSERT(list_link_active(&ab->b_arc_node));
1114 1385 list_remove(list, ab);
1115 1386 if (GHOST_STATE(ab->b_state)) {
1116 1387 ASSERT0(ab->b_datacnt);
1117 1388 ASSERT3P(ab->b_buf, ==, NULL);
1118 1389 delta = ab->b_size;
1119 1390 }
1120 1391 ASSERT(delta > 0);
1121 1392 ASSERT3U(*size, >=, delta);
1122 1393 atomic_add_64(size, -delta);
1123 1394 mutex_exit(&ab->b_state->arcs_mtx);
1124 1395 /* remove the prefetch flag if we get a reference */
1125 1396 if (ab->b_flags & ARC_PREFETCH)
1126 1397 ab->b_flags &= ~ARC_PREFETCH;
1127 1398 }
1128 1399 }
1129 1400
1130 1401 static int
1131 1402 remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1132 1403 {
1133 1404 int cnt;
1134 1405 arc_state_t *state = ab->b_state;
1135 1406
1136 1407 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1137 1408 ASSERT(!GHOST_STATE(state));
1138 1409
1139 1410 if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
1140 1411 (state != arc_anon)) {
1141 1412 uint64_t *size = &state->arcs_lsize[ab->b_type];
1142 1413
1143 1414 ASSERT(!MUTEX_HELD(&state->arcs_mtx));
1144 1415 mutex_enter(&state->arcs_mtx);
1145 1416 ASSERT(!list_link_active(&ab->b_arc_node));
1146 1417 list_insert_head(&state->arcs_list[ab->b_type], ab);
1147 1418 ASSERT(ab->b_datacnt > 0);
1148 1419 atomic_add_64(size, ab->b_size * ab->b_datacnt);
1149 1420 mutex_exit(&state->arcs_mtx);
1150 1421 }
1151 1422 return (cnt);
1152 1423 }
1153 1424
1154 1425 /*
1155 1426 * Move the supplied buffer to the indicated state. The mutex
1156 1427 * for the buffer must be held by the caller.
1157 1428 */
1158 1429 static void
1159 1430 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
1160 1431 {
1161 1432 arc_state_t *old_state = ab->b_state;
1162 1433 int64_t refcnt = refcount_count(&ab->b_refcnt);
1163 1434 uint64_t from_delta, to_delta;
1164 1435
1165 1436 ASSERT(MUTEX_HELD(hash_lock));
1166 1437 ASSERT(new_state != old_state);
1167 1438 ASSERT(refcnt == 0 || ab->b_datacnt > 0);
1168 1439 ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
1169 1440 ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
1170 1441
1171 1442 from_delta = to_delta = ab->b_datacnt * ab->b_size;
1172 1443
1173 1444 /*
1174 1445 * If this buffer is evictable, transfer it from the
1175 1446 * old state list to the new state list.
1176 1447 */
1177 1448 if (refcnt == 0) {
1178 1449 if (old_state != arc_anon) {
1179 1450 int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
1180 1451 uint64_t *size = &old_state->arcs_lsize[ab->b_type];
1181 1452
1182 1453 if (use_mutex)
1183 1454 mutex_enter(&old_state->arcs_mtx);
1184 1455
1185 1456 ASSERT(list_link_active(&ab->b_arc_node));
1186 1457 list_remove(&old_state->arcs_list[ab->b_type], ab);
1187 1458
1188 1459 /*
1189 1460 * If prefetching out of the ghost cache,
1190 1461 * we will have a non-zero datacnt.
1191 1462 */
1192 1463 if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
1193 1464 /* ghost elements have a ghost size */
1194 1465 ASSERT(ab->b_buf == NULL);
1195 1466 from_delta = ab->b_size;
1196 1467 }
1197 1468 ASSERT3U(*size, >=, from_delta);
1198 1469 atomic_add_64(size, -from_delta);
1199 1470
1200 1471 if (use_mutex)
1201 1472 mutex_exit(&old_state->arcs_mtx);
1202 1473 }
1203 1474 if (new_state != arc_anon) {
1204 1475 int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
1205 1476 uint64_t *size = &new_state->arcs_lsize[ab->b_type];
1206 1477
1207 1478 if (use_mutex)
1208 1479 mutex_enter(&new_state->arcs_mtx);
1209 1480
1210 1481 list_insert_head(&new_state->arcs_list[ab->b_type], ab);
1211 1482
1212 1483 /* ghost elements have a ghost size */
1213 1484 if (GHOST_STATE(new_state)) {
1214 1485 ASSERT(ab->b_datacnt == 0);
1215 1486 ASSERT(ab->b_buf == NULL);
1216 1487 to_delta = ab->b_size;
1217 1488 }
1218 1489 atomic_add_64(size, to_delta);
1219 1490
1220 1491 if (use_mutex)
1221 1492 mutex_exit(&new_state->arcs_mtx);
1222 1493 }
1223 1494 }
1224 1495
1225 1496 ASSERT(!BUF_EMPTY(ab));
1226 1497 if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
1227 1498 buf_hash_remove(ab);
1228 1499
1229 1500 /* adjust state sizes */
1230 1501 if (to_delta)
1231 1502 atomic_add_64(&new_state->arcs_size, to_delta);
1232 1503 if (from_delta) {
1233 1504 ASSERT3U(old_state->arcs_size, >=, from_delta);
1234 1505 atomic_add_64(&old_state->arcs_size, -from_delta);
1235 1506 }
1236 1507 ab->b_state = new_state;
1237 1508
1238 1509 /* adjust l2arc hdr stats */
1239 1510 if (new_state == arc_l2c_only)
1240 - l2arc_hdr_stat_add();
1511 + l2arc_hdr_stat_add(old_state != arc_anon);
1241 1512 else if (old_state == arc_l2c_only)
1242 1513 l2arc_hdr_stat_remove();
1243 1514 }
1244 1515
1245 1516 void
1246 1517 arc_space_consume(uint64_t space, arc_space_type_t type)
1247 1518 {
1248 1519 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1249 1520
1250 1521 switch (type) {
1251 1522 case ARC_SPACE_DATA:
1252 1523 ARCSTAT_INCR(arcstat_data_size, space);
1253 1524 break;
1254 1525 case ARC_SPACE_OTHER:
1255 1526 ARCSTAT_INCR(arcstat_other_size, space);
1256 1527 break;
1257 1528 case ARC_SPACE_HDRS:
1258 1529 ARCSTAT_INCR(arcstat_hdr_size, space);
1259 1530 break;
1260 1531 case ARC_SPACE_L2HDRS:
1261 1532 ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1262 1533 break;
1263 1534 }
1264 1535
1265 1536 ARCSTAT_INCR(arcstat_meta_used, space);
1266 1537 atomic_add_64(&arc_size, space);
1267 1538 }
1268 1539
1269 1540 void
1270 1541 arc_space_return(uint64_t space, arc_space_type_t type)
1271 1542 {
1272 1543 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1273 1544
1274 1545 switch (type) {
1275 1546 case ARC_SPACE_DATA:
1276 1547 ARCSTAT_INCR(arcstat_data_size, -space);
1277 1548 break;
1278 1549 case ARC_SPACE_OTHER:
1279 1550 ARCSTAT_INCR(arcstat_other_size, -space);
1280 1551 break;
1281 1552 case ARC_SPACE_HDRS:
1282 1553 ARCSTAT_INCR(arcstat_hdr_size, -space);
1283 1554 break;
1284 1555 case ARC_SPACE_L2HDRS:
1285 1556 ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1286 1557 break;
1287 1558 }
1288 1559
1289 1560 ASSERT(arc_meta_used >= space);
1290 1561 if (arc_meta_max < arc_meta_used)
1291 1562 arc_meta_max = arc_meta_used;
1292 1563 ARCSTAT_INCR(arcstat_meta_used, -space);
1293 1564 ASSERT(arc_size >= space);
1294 1565 atomic_add_64(&arc_size, -space);
1295 1566 }
1296 1567
1297 1568 void *
1298 1569 arc_data_buf_alloc(uint64_t size)
1299 1570 {
1300 1571 if (arc_evict_needed(ARC_BUFC_DATA))
1301 1572 cv_signal(&arc_reclaim_thr_cv);
1302 1573 atomic_add_64(&arc_size, size);
1303 1574 return (zio_data_buf_alloc(size));
1304 1575 }
1305 1576
1306 1577 void
1307 1578 arc_data_buf_free(void *buf, uint64_t size)
1308 1579 {
1309 1580 zio_data_buf_free(buf, size);
1310 1581 ASSERT(arc_size >= size);
1311 1582 atomic_add_64(&arc_size, -size);
1312 1583 }
1313 1584
1314 1585 arc_buf_t *
1315 1586 arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
1316 1587 {
1317 1588 arc_buf_hdr_t *hdr;
1318 1589 arc_buf_t *buf;
1319 1590
1320 1591 ASSERT3U(size, >, 0);
1321 1592 hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1322 1593 ASSERT(BUF_EMPTY(hdr));
1323 1594 hdr->b_size = size;
1324 1595 hdr->b_type = type;
1325 1596 hdr->b_spa = spa_load_guid(spa);
1326 1597 hdr->b_state = arc_anon;
1327 1598 hdr->b_arc_access = 0;
1328 1599 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1329 1600 buf->b_hdr = hdr;
1330 1601 buf->b_data = NULL;
1331 1602 buf->b_efunc = NULL;
1332 1603 buf->b_private = NULL;
1333 1604 buf->b_next = NULL;
1334 1605 hdr->b_buf = buf;
1335 1606 arc_get_data_buf(buf);
1336 1607 hdr->b_datacnt = 1;
1337 1608 hdr->b_flags = 0;
1338 1609 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1339 1610 (void) refcount_add(&hdr->b_refcnt, tag);
1340 1611
1341 1612 return (buf);
1342 1613 }
1343 1614
1615 +/*
1616 + * Allocates an empty arc_buf_hdr structure (lacking any data buffer).
1617 + * This is used during l2arc reconstruction to make empty ARC buffers
1618 + * which circumvent the regular disk->arc->l2arc path and instead come
1619 + * into being in the reverse order, i.e. l2arc->arc->(disk).
1620 + */
1621 +arc_buf_hdr_t *
1622 +arc_buf_hdr_alloc(uint64_t guid, int size, arc_buf_contents_t type)
1623 +{
1624 + arc_buf_hdr_t *hdr;
1625 +
1626 + ASSERT3U(size, >, 0);
1627 + hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1628 + ASSERT(BUF_EMPTY(hdr));
1629 + hdr->b_size = size;
1630 + hdr->b_type = type;
1631 + hdr->b_spa = guid;
1632 + hdr->b_state = arc_anon;
1633 + hdr->b_arc_access = 0;
1634 + hdr->b_buf = NULL;
1635 + hdr->b_datacnt = 0;
1636 + hdr->b_flags = 0;
1637 + ASSERT(refcount_is_zero(&hdr->b_refcnt));
1638 +
1639 + return (hdr);
1640 +}
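
The following sketch is editorial and not part of the patch: it illustrates how a persistent L2ARC rebuild loop might consume the new allocator above. The l2rebuild_ent_t type, its fields, and l2rebuild_make_hdr() are hypothetical names invented for this example; only arc_buf_hdr_alloc() itself comes from the change.

/*
 * Editorial sketch (hypothetical types and helper): create an empty
 * header for one decoded log entry.  The caller would still have to
 * attach an l2arc_buf_hdr_t describing the on-device location and
 * insert the header into the hash table; those steps are omitted.
 */
typedef struct l2rebuild_ent {
	uint64_t		le_guid;	/* pool load guid */
	int			le_size;	/* logical buffer size */
	arc_buf_contents_t	le_type;	/* data vs. metadata */
} l2rebuild_ent_t;

static arc_buf_hdr_t *
l2rebuild_make_hdr(const l2rebuild_ent_t *le)
{
	/* empty header: b_buf == NULL, b_datacnt == 0, not yet hashed */
	return (arc_buf_hdr_alloc(le->le_guid, le->le_size, le->le_type));
}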
1641 +
1344 1642 static char *arc_onloan_tag = "onloan";
1345 1643
1346 1644 /*
1347 1645 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1348 1646 * flight data by arc_tempreserve_space() until they are "returned". Loaned
1349 1647 * buffers must be returned to the arc before they can be used by the DMU or
1350 1648 * freed.
1351 1649 */
1352 1650 arc_buf_t *
1353 1651 arc_loan_buf(spa_t *spa, int size)
1354 1652 {
1355 1653 arc_buf_t *buf;
1356 1654
1357 1655 buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1358 1656
1359 1657 atomic_add_64(&arc_loaned_bytes, size);
1360 1658 return (buf);
1361 1659 }
1362 1660
1363 1661 /*
1364 1662 * Return a loaned arc buffer to the arc.
1365 1663 */
1366 1664 void
1367 1665 arc_return_buf(arc_buf_t *buf, void *tag)
1368 1666 {
1369 1667 arc_buf_hdr_t *hdr = buf->b_hdr;
1370 1668
1371 1669 ASSERT(buf->b_data != NULL);
1372 1670 (void) refcount_add(&hdr->b_refcnt, tag);
1373 1671 (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
1374 1672
1375 1673 atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1376 1674 }
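
A brief hedged usage sketch, not part of the patch, showing how the loan API above pairs up; example_loan_cycle() and its arguments are placeholders, and error handling is omitted.

/*
 * Editorial sketch: borrow an anonymous buffer, fill it, and return
 * it to the ARC under the caller's tag.
 */
static void
example_loan_cycle(spa_t *spa, const void *src, int size, void *tag)
{
	arc_buf_t *buf;

	/* counted in arc_loaned_bytes while on loan */
	buf = arc_loan_buf(spa, size);

	/* the caller owns b_data until the buffer is returned */
	bcopy(src, buf->b_data, size);

	/* swaps the onloan tag for the caller's tag */
	arc_return_buf(buf, tag);
}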
1377 1675
1378 1676 /* Detach an arc_buf from a dbuf (tag) */
1379 1677 void
1380 1678 arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1381 1679 {
1382 1680 arc_buf_hdr_t *hdr;
1383 1681
1384 1682 ASSERT(buf->b_data != NULL);
1385 1683 hdr = buf->b_hdr;
1386 1684 (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
1387 1685 (void) refcount_remove(&hdr->b_refcnt, tag);
1388 1686 buf->b_efunc = NULL;
1389 1687 buf->b_private = NULL;
1390 1688
1391 1689 atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1392 1690 }
1393 1691
1394 1692 static arc_buf_t *
1395 1693 arc_buf_clone(arc_buf_t *from)
1396 1694 {
1397 1695 arc_buf_t *buf;
1398 1696 arc_buf_hdr_t *hdr = from->b_hdr;
1399 1697 uint64_t size = hdr->b_size;
1400 1698
1401 1699 ASSERT(hdr->b_state != arc_anon);
1402 1700
1403 1701 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1404 1702 buf->b_hdr = hdr;
1405 1703 buf->b_data = NULL;
1406 1704 buf->b_efunc = NULL;
1407 1705 buf->b_private = NULL;
1408 1706 buf->b_next = hdr->b_buf;
1409 1707 hdr->b_buf = buf;
1410 1708 arc_get_data_buf(buf);
1411 1709 bcopy(from->b_data, buf->b_data, size);
1412 1710
1413 1711 /*
1414 1712 * This buffer already exists in the arc so create a duplicate
1415 1713 * copy for the caller. If the buffer is associated with user data
1416 1714 * then track the size and number of duplicates. These stats will be
1417 1715 * updated as duplicate buffers are created and destroyed.
1418 1716 */
1419 1717 if (hdr->b_type == ARC_BUFC_DATA) {
1420 1718 ARCSTAT_BUMP(arcstat_duplicate_buffers);
1421 1719 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
1422 1720 }
1423 1721 hdr->b_datacnt += 1;
1424 1722 return (buf);
1425 1723 }
1426 1724
1427 1725 void
1428 1726 arc_buf_add_ref(arc_buf_t *buf, void* tag)
1429 1727 {
1430 1728 arc_buf_hdr_t *hdr;
1431 1729 kmutex_t *hash_lock;
1432 1730
1433 1731 /*
1434 1732 * Check to see if this buffer is evicted. Callers
1435 1733 * must verify b_data != NULL to know if the add_ref
1436 1734 * was successful.
1437 1735 */
1438 1736 mutex_enter(&buf->b_evict_lock);
1439 1737 if (buf->b_data == NULL) {
1440 1738 mutex_exit(&buf->b_evict_lock);
1441 1739 return;
1442 1740 }
1443 1741 hash_lock = HDR_LOCK(buf->b_hdr);
1444 1742 mutex_enter(hash_lock);
1445 1743 hdr = buf->b_hdr;
1446 1744 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1447 1745 mutex_exit(&buf->b_evict_lock);
1448 1746
1449 1747 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
1450 1748 add_reference(hdr, hash_lock, tag);
1451 1749 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1452 1750 arc_access(hdr, hash_lock);
1453 1751 mutex_exit(hash_lock);
1454 1752 ARCSTAT_BUMP(arcstat_hits);
1455 1753 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
1456 1754 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
1457 1755 data, metadata, hits);
1458 1756 }
1459 1757
1460 1758 /*
1461 1759 * Free the arc data buffer. If it is an l2arc write in progress,
1462 1760 * the buffer is placed on l2arc_free_on_write to be freed later.
1463 1761 */
1464 1762 static void
1465 1763 arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
1466 1764 {
1467 1765 arc_buf_hdr_t *hdr = buf->b_hdr;
1468 1766
1469 1767 if (HDR_L2_WRITING(hdr)) {
1470 1768 l2arc_data_free_t *df;
1471 1769 df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
1472 1770 df->l2df_data = buf->b_data;
1473 1771 df->l2df_size = hdr->b_size;
1474 1772 df->l2df_func = free_func;
1475 1773 mutex_enter(&l2arc_free_on_write_mtx);
1476 1774 list_insert_head(l2arc_free_on_write, df);
1477 1775 mutex_exit(&l2arc_free_on_write_mtx);
1478 1776 ARCSTAT_BUMP(arcstat_l2_free_on_write);
1479 1777 } else {
1480 1778 free_func(buf->b_data, hdr->b_size);
1481 1779 }
1482 1780 }
1483 1781
1484 1782 static void
1485 1783 arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
1486 1784 {
1487 1785 arc_buf_t **bufp;
1488 1786
1489 1787 /* free up data associated with the buf */
1490 1788 if (buf->b_data) {
1491 1789 arc_state_t *state = buf->b_hdr->b_state;
1492 1790 uint64_t size = buf->b_hdr->b_size;
1493 1791 arc_buf_contents_t type = buf->b_hdr->b_type;
1494 1792
1495 1793 arc_cksum_verify(buf);
1496 1794 arc_buf_unwatch(buf);
1497 1795
1498 1796 if (!recycle) {
1499 1797 if (type == ARC_BUFC_METADATA) {
1500 1798 arc_buf_data_free(buf, zio_buf_free);
1501 1799 arc_space_return(size, ARC_SPACE_DATA);
1502 1800 } else {
1503 1801 ASSERT(type == ARC_BUFC_DATA);
1504 1802 arc_buf_data_free(buf, zio_data_buf_free);
1505 1803 ARCSTAT_INCR(arcstat_data_size, -size);
1506 1804 atomic_add_64(&arc_size, -size);
1507 1805 }
1508 1806 }
1509 1807 if (list_link_active(&buf->b_hdr->b_arc_node)) {
1510 1808 uint64_t *cnt = &state->arcs_lsize[type];
1511 1809
1512 1810 ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
1513 1811 ASSERT(state != arc_anon);
1514 1812
1515 1813 ASSERT3U(*cnt, >=, size);
1516 1814 atomic_add_64(cnt, -size);
1517 1815 }
1518 1816 ASSERT3U(state->arcs_size, >=, size);
1519 1817 atomic_add_64(&state->arcs_size, -size);
1520 1818 buf->b_data = NULL;
1521 1819
1522 1820 /*
1523 1821 * If we're destroying a duplicate buffer make sure
1524 1822 * that the appropriate statistics are updated.
1525 1823 */
1526 1824 if (buf->b_hdr->b_datacnt > 1 &&
1527 1825 buf->b_hdr->b_type == ARC_BUFC_DATA) {
1528 1826 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
1529 1827 ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
1530 1828 }
1531 1829 ASSERT(buf->b_hdr->b_datacnt > 0);
1532 1830 buf->b_hdr->b_datacnt -= 1;
1533 1831 }
1534 1832
1535 1833 /* only remove the buf if requested */
1536 1834 if (!all)
1537 1835 return;
1538 1836
1539 1837 /* remove the buf from the hdr list */
1540 1838 for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
1541 1839 continue;
1542 1840 *bufp = buf->b_next;
1543 1841 buf->b_next = NULL;
1544 1842
1545 1843 ASSERT(buf->b_efunc == NULL);
1546 1844
1547 1845 /* clean up the buf */
1548 1846 buf->b_hdr = NULL;
1549 1847 kmem_cache_free(buf_cache, buf);
1550 1848 }
1551 1849
1552 1850 static void
1553 1851 arc_hdr_destroy(arc_buf_hdr_t *hdr)
1554 1852 {
1555 1853 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1556 1854 ASSERT3P(hdr->b_state, ==, arc_anon);
1557 1855 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1558 1856 l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1559 1857
1560 1858 if (l2hdr != NULL) {
1561 1859 boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
1562 1860 /*
1563 1861 * To prevent arc_free() and l2arc_evict() from
1564 1862 * attempting to free the same buffer at the same time,
1565 1863 * a FREE_IN_PROGRESS flag is given to arc_free() to
1566 1864 * give it priority. l2arc_evict() can't destroy this
1567 1865 * header while we are waiting on l2arc_buflist_mtx.
1568 1866 *
1569 1867 * The hdr may be removed from l2ad_buflist before we
1570 1868 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1571 1869 */
1572 1870 if (!buflist_held) {
1573 1871 mutex_enter(&l2arc_buflist_mtx);
1574 1872 l2hdr = hdr->b_l2hdr;
1575 1873 }
1576 1874
1577 1875 if (l2hdr != NULL) {
1578 1876 list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
1579 1877 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1580 1878 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
1581 1879 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
1582 1880 if (hdr->b_state == arc_l2c_only)
1583 1881 l2arc_hdr_stat_remove();
1584 1882 hdr->b_l2hdr = NULL;
1585 1883 }
1586 1884
1587 1885 if (!buflist_held)
1588 1886 mutex_exit(&l2arc_buflist_mtx);
1589 1887 }
1590 1888
1591 1889 if (!BUF_EMPTY(hdr)) {
1592 1890 ASSERT(!HDR_IN_HASH_TABLE(hdr));
1593 1891 buf_discard_identity(hdr);
1594 1892 }
1595 1893 while (hdr->b_buf) {
1596 1894 arc_buf_t *buf = hdr->b_buf;
1597 1895
1598 1896 if (buf->b_efunc) {
1599 1897 mutex_enter(&arc_eviction_mtx);
1600 1898 mutex_enter(&buf->b_evict_lock);
1601 1899 ASSERT(buf->b_hdr != NULL);
1602 1900 arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1603 1901 hdr->b_buf = buf->b_next;
1604 1902 buf->b_hdr = &arc_eviction_hdr;
1605 1903 buf->b_next = arc_eviction_list;
1606 1904 arc_eviction_list = buf;
1607 1905 mutex_exit(&buf->b_evict_lock);
1608 1906 mutex_exit(&arc_eviction_mtx);
1609 1907 } else {
1610 1908 arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1611 1909 }
1612 1910 }
1613 1911 if (hdr->b_freeze_cksum != NULL) {
1614 1912 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1615 1913 hdr->b_freeze_cksum = NULL;
1616 1914 }
1617 1915 if (hdr->b_thawed) {
1618 1916 kmem_free(hdr->b_thawed, 1);
1619 1917 hdr->b_thawed = NULL;
1620 1918 }
1621 1919
1622 1920 ASSERT(!list_link_active(&hdr->b_arc_node));
1623 1921 ASSERT3P(hdr->b_hash_next, ==, NULL);
1624 1922 ASSERT3P(hdr->b_acb, ==, NULL);
1625 1923 kmem_cache_free(hdr_cache, hdr);
1626 1924 }
1627 1925
1628 1926 void
1629 1927 arc_buf_free(arc_buf_t *buf, void *tag)
1630 1928 {
1631 1929 arc_buf_hdr_t *hdr = buf->b_hdr;
1632 1930 int hashed = hdr->b_state != arc_anon;
1633 1931
1634 1932 ASSERT(buf->b_efunc == NULL);
1635 1933 ASSERT(buf->b_data != NULL);
1636 1934
1637 1935 if (hashed) {
1638 1936 kmutex_t *hash_lock = HDR_LOCK(hdr);
1639 1937
1640 1938 mutex_enter(hash_lock);
1641 1939 hdr = buf->b_hdr;
1642 1940 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1643 1941
1644 1942 (void) remove_reference(hdr, hash_lock, tag);
1645 1943 if (hdr->b_datacnt > 1) {
1646 1944 arc_buf_destroy(buf, FALSE, TRUE);
1647 1945 } else {
1648 1946 ASSERT(buf == hdr->b_buf);
1649 1947 ASSERT(buf->b_efunc == NULL);
1650 1948 hdr->b_flags |= ARC_BUF_AVAILABLE;
1651 1949 }
1652 1950 mutex_exit(hash_lock);
1653 1951 } else if (HDR_IO_IN_PROGRESS(hdr)) {
1654 1952 int destroy_hdr;
1655 1953 /*
1656 1954 * We are in the middle of an async write. Don't destroy
1657 1955 * this buffer unless the write completes before we finish
1658 1956 * decrementing the reference count.
1659 1957 */
1660 1958 mutex_enter(&arc_eviction_mtx);
1661 1959 (void) remove_reference(hdr, NULL, tag);
1662 1960 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1663 1961 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
1664 1962 mutex_exit(&arc_eviction_mtx);
1665 1963 if (destroy_hdr)
1666 1964 arc_hdr_destroy(hdr);
1667 1965 } else {
1668 1966 if (remove_reference(hdr, NULL, tag) > 0)
1669 1967 arc_buf_destroy(buf, FALSE, TRUE);
1670 1968 else
1671 1969 arc_hdr_destroy(hdr);
1672 1970 }
1673 1971 }
1674 1972
1675 1973 boolean_t
1676 1974 arc_buf_remove_ref(arc_buf_t *buf, void* tag)
1677 1975 {
1678 1976 arc_buf_hdr_t *hdr = buf->b_hdr;
1679 1977 kmutex_t *hash_lock = HDR_LOCK(hdr);
1680 1978 boolean_t no_callback = (buf->b_efunc == NULL);
1681 1979
1682 1980 if (hdr->b_state == arc_anon) {
1683 1981 ASSERT(hdr->b_datacnt == 1);
1684 1982 arc_buf_free(buf, tag);
1685 1983 return (no_callback);
1686 1984 }
1687 1985
1688 1986 mutex_enter(hash_lock);
1689 1987 hdr = buf->b_hdr;
1690 1988 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1691 1989 ASSERT(hdr->b_state != arc_anon);
1692 1990 ASSERT(buf->b_data != NULL);
1693 1991
1694 1992 (void) remove_reference(hdr, hash_lock, tag);
1695 1993 if (hdr->b_datacnt > 1) {
1696 1994 if (no_callback)
1697 1995 arc_buf_destroy(buf, FALSE, TRUE);
1698 1996 } else if (no_callback) {
1699 1997 ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
1700 1998 ASSERT(buf->b_efunc == NULL);
1701 1999 hdr->b_flags |= ARC_BUF_AVAILABLE;
1702 2000 }
1703 2001 ASSERT(no_callback || hdr->b_datacnt > 1 ||
1704 2002 refcount_is_zero(&hdr->b_refcnt));
1705 2003 mutex_exit(hash_lock);
1706 2004 return (no_callback);
1707 2005 }
1708 2006
1709 2007 int
1710 2008 arc_buf_size(arc_buf_t *buf)
1711 2009 {
1712 2010 return (buf->b_hdr->b_size);
1713 2011 }
1714 2012
1715 2013 /*
1716 2014 * Called from the DMU to determine if the current buffer should be
1717 2015 * evicted. In order to ensure proper locking, the eviction must be initiated
1718 2016 * from the DMU. Return true if the buffer is associated with user data and
1719 2017 * duplicate buffers still exist.
1720 2018 */
1721 2019 boolean_t
1722 2020 arc_buf_eviction_needed(arc_buf_t *buf)
1723 2021 {
1724 2022 arc_buf_hdr_t *hdr;
1725 2023 boolean_t evict_needed = B_FALSE;
1726 2024
1727 2025 if (zfs_disable_dup_eviction)
1728 2026 return (B_FALSE);
1729 2027
1730 2028 mutex_enter(&buf->b_evict_lock);
1731 2029 hdr = buf->b_hdr;
1732 2030 if (hdr == NULL) {
1733 2031 /*
1734 2032 * We are in arc_do_user_evicts(); let that function
1735 2033 * perform the eviction.
1736 2034 */
1737 2035 ASSERT(buf->b_data == NULL);
1738 2036 mutex_exit(&buf->b_evict_lock);
1739 2037 return (B_FALSE);
1740 2038 } else if (buf->b_data == NULL) {
1741 2039 /*
1742 2040 * We have already been added to the arc eviction list;
1743 2041 * recommend eviction.
1744 2042 */
1745 2043 ASSERT3P(hdr, ==, &arc_eviction_hdr);
1746 2044 mutex_exit(&buf->b_evict_lock);
1747 2045 return (B_TRUE);
1748 2046 }
1749 2047
1750 2048 if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
1751 2049 evict_needed = B_TRUE;
1752 2050
1753 2051 mutex_exit(&buf->b_evict_lock);
1754 2052 return (evict_needed);
1755 2053 }
1756 2054
1757 2055 /*
1758 2056 * Evict buffers from list until we've removed the specified number of
1759 2057 * bytes. Move the removed buffers to the appropriate evict state.
1760 2058 * If the recycle flag is set, then attempt to "recycle" a buffer:
1761 2059 * - look for a buffer to evict that is `bytes' long.
1762 2060 * - return the data block from this buffer rather than freeing it.
1763 2061 * This flag is used by callers that are trying to make space for a
1764 2062 * new buffer in a full arc cache.
1765 2063 *
1766 2064 * This function makes a "best effort". It skips over any buffers
1767 2065 * it can't get a hash_lock on, and so may not catch all candidates.
1768 2066 * It may also return without evicting as much space as requested.
1769 2067 */
1770 2068 static void *
1771 2069 arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
1772 2070 arc_buf_contents_t type)
1773 2071 {
1774 2072 arc_state_t *evicted_state;
1775 2073 uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
1776 2074 arc_buf_hdr_t *ab, *ab_prev = NULL;
1777 2075 list_t *list = &state->arcs_list[type];
1778 2076 kmutex_t *hash_lock;
1779 2077 boolean_t have_lock;
1780 2078 void *stolen = NULL;
1781 2079
1782 2080 ASSERT(state == arc_mru || state == arc_mfu);
1783 2081
1784 2082 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
1785 2083
1786 2084 mutex_enter(&state->arcs_mtx);
1787 2085 mutex_enter(&evicted_state->arcs_mtx);
1788 2086
1789 2087 for (ab = list_tail(list); ab; ab = ab_prev) {
1790 2088 ab_prev = list_prev(list, ab);
1791 2089 /* prefetch buffers have a minimum lifespan */
1792 2090 if (HDR_IO_IN_PROGRESS(ab) ||
1793 2091 (spa && ab->b_spa != spa) ||
1794 2092 (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
1795 2093 ddi_get_lbolt() - ab->b_arc_access <
1796 2094 arc_min_prefetch_lifespan)) {
1797 2095 skipped++;
1798 2096 continue;
1799 2097 }
1800 2098 /* "lookahead" for better eviction candidate */
1801 2099 if (recycle && ab->b_size != bytes &&
1802 2100 ab_prev && ab_prev->b_size == bytes)
1803 2101 continue;
1804 2102 hash_lock = HDR_LOCK(ab);
1805 2103 have_lock = MUTEX_HELD(hash_lock);
1806 2104 if (have_lock || mutex_tryenter(hash_lock)) {
1807 2105 ASSERT0(refcount_count(&ab->b_refcnt));
1808 2106 ASSERT(ab->b_datacnt > 0);
1809 2107 while (ab->b_buf) {
1810 2108 arc_buf_t *buf = ab->b_buf;
1811 2109 if (!mutex_tryenter(&buf->b_evict_lock)) {
1812 2110 missed += 1;
1813 2111 break;
1814 2112 }
1815 2113 if (buf->b_data) {
1816 2114 bytes_evicted += ab->b_size;
1817 2115 if (recycle && ab->b_type == type &&
1818 2116 ab->b_size == bytes &&
1819 2117 !HDR_L2_WRITING(ab)) {
1820 2118 stolen = buf->b_data;
1821 2119 recycle = FALSE;
1822 2120 }
1823 2121 }
1824 2122 if (buf->b_efunc) {
1825 2123 mutex_enter(&arc_eviction_mtx);
1826 2124 arc_buf_destroy(buf,
1827 2125 buf->b_data == stolen, FALSE);
1828 2126 ab->b_buf = buf->b_next;
1829 2127 buf->b_hdr = &arc_eviction_hdr;
1830 2128 buf->b_next = arc_eviction_list;
1831 2129 arc_eviction_list = buf;
1832 2130 mutex_exit(&arc_eviction_mtx);
1833 2131 mutex_exit(&buf->b_evict_lock);
1834 2132 } else {
1835 2133 mutex_exit(&buf->b_evict_lock);
1836 2134 arc_buf_destroy(buf,
1837 2135 buf->b_data == stolen, TRUE);
1838 2136 }
1839 2137 }
1840 2138
1841 2139 if (ab->b_l2hdr) {
1842 2140 ARCSTAT_INCR(arcstat_evict_l2_cached,
1843 2141 ab->b_size);
1844 2142 } else {
1845 2143 if (l2arc_write_eligible(ab->b_spa, ab)) {
1846 2144 ARCSTAT_INCR(arcstat_evict_l2_eligible,
1847 2145 ab->b_size);
1848 2146 } else {
1849 2147 ARCSTAT_INCR(
1850 2148 arcstat_evict_l2_ineligible,
1851 2149 ab->b_size);
1852 2150 }
1853 2151 }
1854 2152
1855 2153 if (ab->b_datacnt == 0) {
1856 2154 arc_change_state(evicted_state, ab, hash_lock);
1857 2155 ASSERT(HDR_IN_HASH_TABLE(ab));
1858 2156 ab->b_flags |= ARC_IN_HASH_TABLE;
1859 2157 ab->b_flags &= ~ARC_BUF_AVAILABLE;
1860 2158 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
1861 2159 }
1862 2160 if (!have_lock)
1863 2161 mutex_exit(hash_lock);
1864 2162 if (bytes >= 0 && bytes_evicted >= bytes)
1865 2163 break;
1866 2164 } else {
1867 2165 missed += 1;
1868 2166 }
1869 2167 }
1870 2168
1871 2169 mutex_exit(&evicted_state->arcs_mtx);
1872 2170 mutex_exit(&state->arcs_mtx);
1873 2171
1874 2172 if (bytes_evicted < bytes)
1875 2173 dprintf("only evicted %lld bytes from %x",
1876 2174 (longlong_t)bytes_evicted, state);
1877 2175
1878 2176 if (skipped)
1879 2177 ARCSTAT_INCR(arcstat_evict_skip, skipped);
1880 2178
1881 2179 if (missed)
1882 2180 ARCSTAT_INCR(arcstat_mutex_miss, missed);
1883 2181
1884 2182 /*
1885 2183 * We have just evicted some data into the ghost state, make
1886 2184 * sure we also adjust the ghost state size if necessary.
1887 2185 */
1888 2186 if (arc_no_grow &&
1889 2187 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
1890 2188 int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
1891 2189 arc_mru_ghost->arcs_size - arc_c;
1892 2190
1893 2191 if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
1894 2192 int64_t todelete =
1895 2193 MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
1896 2194 arc_evict_ghost(arc_mru_ghost, NULL, todelete);
1897 2195 } else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
1898 2196 int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
1899 2197 arc_mru_ghost->arcs_size +
1900 2198 arc_mfu_ghost->arcs_size - arc_c);
1901 2199 arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
1902 2200 }
1903 2201 }
1904 2202
1905 2203 return (stolen);
1906 2204 }
1907 2205
1908 2206 /*
1909 2207 * Remove buffers from list until we've removed the specified number of
1910 2208 * bytes. Destroy the buffers that are removed.
1911 2209 */
1912 2210 static void
1913 2211 arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
1914 2212 {
1915 2213 arc_buf_hdr_t *ab, *ab_prev;
1916 2214 arc_buf_hdr_t marker = { 0 };
1917 2215 list_t *list = &state->arcs_list[ARC_BUFC_DATA];
1918 2216 kmutex_t *hash_lock;
1919 2217 uint64_t bytes_deleted = 0;
1920 2218 uint64_t bufs_skipped = 0;
1921 2219
1922 2220 ASSERT(GHOST_STATE(state));
1923 2221 top:
1924 2222 mutex_enter(&state->arcs_mtx);
1925 2223 for (ab = list_tail(list); ab; ab = ab_prev) {
1926 2224 ab_prev = list_prev(list, ab);
1927 2225 if (spa && ab->b_spa != spa)
1928 2226 continue;
1929 2227
1930 2228 /* ignore markers */
1931 2229 if (ab->b_spa == 0)
1932 2230 continue;
1933 2231
1934 2232 hash_lock = HDR_LOCK(ab);
1935 2233 /* caller may be trying to modify this buffer, skip it */
1936 2234 if (MUTEX_HELD(hash_lock))
1937 2235 continue;
1938 2236 if (mutex_tryenter(hash_lock)) {
1939 2237 ASSERT(!HDR_IO_IN_PROGRESS(ab));
1940 2238 ASSERT(ab->b_buf == NULL);
1941 2239 ARCSTAT_BUMP(arcstat_deleted);
1942 2240 bytes_deleted += ab->b_size;
1943 2241
1944 2242 if (ab->b_l2hdr != NULL) {
1945 2243 /*
1946 2244 * This buffer is cached on the 2nd Level ARC;
1947 2245 * don't destroy the header.
1948 2246 */
1949 2247 arc_change_state(arc_l2c_only, ab, hash_lock);
1950 2248 mutex_exit(hash_lock);
1951 2249 } else {
1952 2250 arc_change_state(arc_anon, ab, hash_lock);
1953 2251 mutex_exit(hash_lock);
1954 2252 arc_hdr_destroy(ab);
1955 2253 }
1956 2254
1957 2255 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
1958 2256 if (bytes >= 0 && bytes_deleted >= bytes)
1959 2257 break;
1960 2258 } else if (bytes < 0) {
1961 2259 /*
1962 2260 * Insert a list marker and then wait for the
1963 2261  * hash lock to become available. Once it's
1964 2262 * available, restart from where we left off.
1965 2263 */
1966 2264 list_insert_after(list, ab, &marker);
1967 2265 mutex_exit(&state->arcs_mtx);
1968 2266 mutex_enter(hash_lock);
1969 2267 mutex_exit(hash_lock);
1970 2268 mutex_enter(&state->arcs_mtx);
1971 2269 ab_prev = list_prev(list, &marker);
1972 2270 list_remove(list, &marker);
1973 2271 } else
1974 2272 bufs_skipped += 1;
1975 2273 }
1976 2274 mutex_exit(&state->arcs_mtx);
1977 2275
1978 2276 if (list == &state->arcs_list[ARC_BUFC_DATA] &&
1979 2277 (bytes < 0 || bytes_deleted < bytes)) {
1980 2278 list = &state->arcs_list[ARC_BUFC_METADATA];
1981 2279 goto top;
1982 2280 }
1983 2281
1984 2282 if (bufs_skipped) {
1985 2283 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
1986 2284 ASSERT(bytes >= 0);
1987 2285 }
1988 2286
1989 2287 if (bytes_deleted < bytes)
1990 2288 dprintf("only deleted %lld bytes from %p",
1991 2289 (longlong_t)bytes_deleted, state);
1992 2290 }
1993 2291
1994 2292 static void
1995 2293 arc_adjust(void)
1996 2294 {
1997 2295 int64_t adjustment, delta;
1998 2296
1999 2297 /*
2000 2298 * Adjust MRU size
2001 2299 */
2002 2300
2003 2301 adjustment = MIN((int64_t)(arc_size - arc_c),
2004 2302 (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
2005 2303 arc_p));
2006 2304
2007 2305 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
2008 2306 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
2009 2307 (void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA);
2010 2308 adjustment -= delta;
2011 2309 }
2012 2310
2013 2311 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2014 2312 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
2015 2313 (void) arc_evict(arc_mru, NULL, delta, FALSE,
2016 2314 ARC_BUFC_METADATA);
2017 2315 }
2018 2316
2019 2317 /*
2020 2318 * Adjust MFU size
2021 2319 */
2022 2320
2023 2321 adjustment = arc_size - arc_c;
2024 2322
2025 2323 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
2026 2324 delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
2027 2325 (void) arc_evict(arc_mfu, NULL, delta, FALSE, ARC_BUFC_DATA);
2028 2326 adjustment -= delta;
2029 2327 }
2030 2328
2031 2329 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2032 2330 int64_t delta = MIN(adjustment,
2033 2331 arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
2034 2332 (void) arc_evict(arc_mfu, NULL, delta, FALSE,
2035 2333 ARC_BUFC_METADATA);
2036 2334 }
2037 2335
2038 2336 /*
2039 2337 * Adjust ghost lists
2040 2338 */
2041 2339
2042 2340 adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
2043 2341
2044 2342 if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
2045 2343 delta = MIN(arc_mru_ghost->arcs_size, adjustment);
2046 2344 arc_evict_ghost(arc_mru_ghost, NULL, delta);
2047 2345 }
2048 2346
2049 2347 adjustment =
2050 2348 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2051 2349
2052 2350 if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
2053 2351 delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
2054 2352 arc_evict_ghost(arc_mfu_ghost, NULL, delta);
2055 2353 }
2056 2354 }
2057 2355
2058 2356 static void
2059 2357 arc_do_user_evicts(void)
2060 2358 {
2061 2359 mutex_enter(&arc_eviction_mtx);
2062 2360 while (arc_eviction_list != NULL) {
2063 2361 arc_buf_t *buf = arc_eviction_list;
2064 2362 arc_eviction_list = buf->b_next;
2065 2363 mutex_enter(&buf->b_evict_lock);
2066 2364 buf->b_hdr = NULL;
2067 2365 mutex_exit(&buf->b_evict_lock);
2068 2366 mutex_exit(&arc_eviction_mtx);
2069 2367
2070 2368 if (buf->b_efunc != NULL)
2071 2369 VERIFY(buf->b_efunc(buf) == 0);
2072 2370
2073 2371 buf->b_efunc = NULL;
2074 2372 buf->b_private = NULL;
2075 2373 kmem_cache_free(buf_cache, buf);
2076 2374 mutex_enter(&arc_eviction_mtx);
2077 2375 }
2078 2376 mutex_exit(&arc_eviction_mtx);
2079 2377 }
2080 2378
2081 2379 /*
2082 2380 * Flush all *evictable* data from the cache for the given spa.
2083 2381 * NOTE: this will not touch "active" (i.e. referenced) data.
2084 2382 */
2085 2383 void
2086 2384 arc_flush(spa_t *spa)
2087 2385 {
2088 2386 uint64_t guid = 0;
2089 2387
2090 2388 if (spa)
2091 2389 guid = spa_load_guid(spa);
2092 2390
2093 2391 while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
2094 2392 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
2095 2393 if (spa)
2096 2394 break;
2097 2395 }
2098 2396 while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
2099 2397 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
2100 2398 if (spa)
2101 2399 break;
2102 2400 }
2103 2401 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
2104 2402 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
2105 2403 if (spa)
2106 2404 break;
2107 2405 }
2108 2406 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
2109 2407 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
2110 2408 if (spa)
2111 2409 break;
2112 2410 }
2113 2411
2114 2412 arc_evict_ghost(arc_mru_ghost, guid, -1);
2115 2413 arc_evict_ghost(arc_mfu_ghost, guid, -1);
2116 2414
2117 2415 mutex_enter(&arc_reclaim_thr_lock);
2118 2416 arc_do_user_evicts();
2119 2417 mutex_exit(&arc_reclaim_thr_lock);
2120 2418 ASSERT(spa || arc_eviction_list == NULL);
2121 2419 }
2122 2420
2123 2421 void
2124 2422 arc_shrink(void)
2125 2423 {
2126 2424 if (arc_c > arc_c_min) {
2127 2425 uint64_t to_free;
2128 2426
2129 2427 #ifdef _KERNEL
2130 2428 to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree));
2131 2429 #else
2132 2430 to_free = arc_c >> arc_shrink_shift;
2133 2431 #endif
2134 2432 if (arc_c > arc_c_min + to_free)
2135 2433 atomic_add_64(&arc_c, -to_free);
2136 2434 else
2137 2435 arc_c = arc_c_min;
2138 2436
2139 2437 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
2140 2438 if (arc_c > arc_size)
2141 2439 arc_c = MAX(arc_size, arc_c_min);
2142 2440 if (arc_p > arc_c)
2143 2441 arc_p = (arc_c >> 1);
2144 2442 ASSERT(arc_c >= arc_c_min);
2145 2443 ASSERT((int64_t)arc_p >= 0);
2146 2444 }
2147 2445
2148 2446 if (arc_size > arc_c)
2149 2447 arc_adjust();
2150 2448 }
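
A worked example of the shrink arithmetic above, written as a standalone helper with made-up numbers; it is illustrative only and ignores the needfree term used in the kernel build.

/*
 * Editorial sketch: with c = 8G and shrink_shift = 5, to_free is
 * 256M, so the target drops to 7.75G (but never below c_min).
 */
static uint64_t
example_shrink_target(uint64_t c, uint64_t c_min, int shrink_shift)
{
	uint64_t to_free = c >> shrink_shift;

	return ((c > c_min + to_free) ? c - to_free : c_min);
}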
2151 2449
2152 2450 /*
2153 2451 * Determine if the system is under memory pressure and is asking
2154 2452 * to reclaim memory. A return value of 1 indicates that the system
2155 2453 * is under memory pressure and that the arc should adjust accordingly.
2156 2454 */
2157 2455 static int
2158 2456 arc_reclaim_needed(void)
2159 2457 {
2160 2458 uint64_t extra;
2161 2459
2162 2460 #ifdef _KERNEL
2163 2461
2164 2462 if (needfree)
2165 2463 return (1);
2166 2464
2167 2465 /*
2168 2466 * take 'desfree' extra pages, so we reclaim sooner, rather than later
2169 2467 */
2170 2468 extra = desfree;
2171 2469
2172 2470 /*
2173 2471 * check that we're out of range of the pageout scanner. It starts to
2174 2472 * schedule paging if freemem is less than lotsfree and needfree.
2175 2473 * lotsfree is the high-water mark for pageout, and needfree is the
2176 2474 * number of needed free pages. We add extra pages here to make sure
2177 2475 * the scanner doesn't start up while we're freeing memory.
2178 2476 */
2179 2477 if (freemem < lotsfree + needfree + extra)
2180 2478 return (1);
2181 2479
2182 2480 /*
2183 2481 * check to make sure that swapfs has enough space so that anon
2184 2482 * reservations can still succeed. anon_resvmem() checks that the
2185 2483 * availrmem is greater than swapfs_minfree, and the number of reserved
2186 2484 * swap pages. We also add a bit of extra here just to prevent
2187 2485 * circumstances from getting really dire.
2188 2486 */
2189 2487 if (availrmem < swapfs_minfree + swapfs_reserve + extra)
2190 2488 return (1);
2191 2489
2192 2490 #if defined(__i386)
2193 2491 /*
2194 2492 * If we're on an i386 platform, it's possible that we'll exhaust the
2195 2493 * kernel heap space before we ever run out of available physical
2196 2494 * memory. Most checks of the size of the heap_area compare against
2197 2495 * tune.t_minarmem, which is the minimum available real memory that we
2198 2496 * can have in the system. However, this is generally fixed at 25 pages
2199 2497 * which is so low that it's useless. In this comparison, we seek to
2200 2498 * calculate the total heap-size, and reclaim if more than 3/4ths of the
2201 2499 * heap is allocated. (Or, in the calculation, if less than 1/4th is
2202 2500 * free)
2203 2501 */
2204 2502 if (vmem_size(heap_arena, VMEM_FREE) <
2205 2503 (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2))
2206 2504 return (1);
2207 2505 #endif
2208 2506
2209 2507 /*
2210 2508 * If zio data pages are being allocated out of a separate heap segment,
2211 2509 * then enforce that the size of available vmem for this arena remains
2212 2510 * above about 1/16th free.
2213 2511 *
2214 2512 * Note: The 1/16th arena free requirement was put in place
2215 2513 * to aggressively evict memory from the arc in order to avoid
2216 2514 * memory fragmentation issues.
2217 2515 */
2218 2516 if (zio_arena != NULL &&
2219 2517 vmem_size(zio_arena, VMEM_FREE) <
2220 2518 (vmem_size(zio_arena, VMEM_ALLOC) >> 4))
2221 2519 return (1);
2222 2520 #else
2223 2521 if (spa_get_random(100) == 0)
2224 2522 return (1);
2225 2523 #endif
2226 2524 return (0);
2227 2525 }
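
The pageout-scanner headroom test at the top of the kernel path above can be read in isolation; the sketch below is editorial, uses plain integers, and the page counts in its comment are invented for illustration.

/*
 * Editorial sketch: with lotsfree = 4096, needfree = 0 and
 * extra (desfree) = 2048 pages, reclaim starts once freemem falls
 * below 6144 pages, i.e. before the pageout scanner would kick in.
 */
static boolean_t
example_pageout_headroom(uint64_t freemem_pages, uint64_t lotsfree_pages,
    uint64_t needfree_pages, uint64_t extra_pages)
{
	return (freemem_pages < lotsfree_pages + needfree_pages +
	    extra_pages);
}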
2228 2526
2229 2527 static void
2230 2528 arc_kmem_reap_now(arc_reclaim_strategy_t strat)
2231 2529 {
2232 2530 size_t i;
2233 2531 kmem_cache_t *prev_cache = NULL;
2234 2532 kmem_cache_t *prev_data_cache = NULL;
2235 2533 extern kmem_cache_t *zio_buf_cache[];
2236 2534 extern kmem_cache_t *zio_data_buf_cache[];
2237 2535
2238 2536 #ifdef _KERNEL
2239 2537 if (arc_meta_used >= arc_meta_limit) {
2240 2538 /*
2241 2539 * We are exceeding our meta-data cache limit.
2242 2540 * Purge some DNLC entries to release holds on meta-data.
2243 2541 */
2244 2542 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
2245 2543 }
2246 2544 #if defined(__i386)
2247 2545 /*
2248 2546 * Reclaim unused memory from all kmem caches.
2249 2547 */
2250 2548 kmem_reap();
2251 2549 #endif
2252 2550 #endif
2253 2551
2254 2552 /*
2255 2553 * An aggressive reclamation will shrink the cache size as well as
2256 2554 * reap free buffers from the arc kmem caches.
2257 2555 */
2258 2556 if (strat == ARC_RECLAIM_AGGR)
2259 2557 arc_shrink();
2260 2558
2261 2559 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
2262 2560 if (zio_buf_cache[i] != prev_cache) {
2263 2561 prev_cache = zio_buf_cache[i];
2264 2562 kmem_cache_reap_now(zio_buf_cache[i]);
2265 2563 }
2266 2564 if (zio_data_buf_cache[i] != prev_data_cache) {
2267 2565 prev_data_cache = zio_data_buf_cache[i];
2268 2566 kmem_cache_reap_now(zio_data_buf_cache[i]);
2269 2567 }
2270 2568 }
2271 2569 kmem_cache_reap_now(buf_cache);
2272 2570 kmem_cache_reap_now(hdr_cache);
2273 2571
2274 2572 /*
2275 2573  * Ask the vmem arena to reclaim unused memory from its
2276 2574 * quantum caches.
2277 2575 */
2278 2576 if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR)
2279 2577 vmem_qcache_reap(zio_arena);
2280 2578 }
2281 2579
2282 2580 static void
2283 2581 arc_reclaim_thread(void)
2284 2582 {
2285 2583 clock_t growtime = 0;
2286 2584 arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS;
2287 2585 callb_cpr_t cpr;
2288 2586
2289 2587 CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2290 2588
2291 2589 mutex_enter(&arc_reclaim_thr_lock);
2292 2590 while (arc_thread_exit == 0) {
2293 2591 if (arc_reclaim_needed()) {
2294 2592
2295 2593 if (arc_no_grow) {
2296 2594 if (last_reclaim == ARC_RECLAIM_CONS) {
2297 2595 last_reclaim = ARC_RECLAIM_AGGR;
2298 2596 } else {
2299 2597 last_reclaim = ARC_RECLAIM_CONS;
2300 2598 }
2301 2599 } else {
2302 2600 arc_no_grow = TRUE;
2303 2601 last_reclaim = ARC_RECLAIM_AGGR;
2304 2602 membar_producer();
2305 2603 }
2306 2604
2307 2605 /* reset the growth delay for every reclaim */
2308 2606 growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
2309 2607
2310 2608 arc_kmem_reap_now(last_reclaim);
2311 2609 arc_warm = B_TRUE;
2312 2610
2313 2611 } else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
2314 2612 arc_no_grow = FALSE;
2315 2613 }
2316 2614
2317 2615 arc_adjust();
2318 2616
2319 2617 if (arc_eviction_list != NULL)
2320 2618 arc_do_user_evicts();
2321 2619
2322 2620 /* block until needed, or one second, whichever is shorter */
2323 2621 CALLB_CPR_SAFE_BEGIN(&cpr);
2324 2622 (void) cv_timedwait(&arc_reclaim_thr_cv,
2325 2623 &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
2326 2624 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
2327 2625 }
2328 2626
2329 2627 arc_thread_exit = 0;
2330 2628 cv_broadcast(&arc_reclaim_thr_cv);
2331 2629 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */
2332 2630 thread_exit();
2333 2631 }
2334 2632
2335 2633 /*
2336 2634 * Adapt arc info given the number of bytes we are trying to add and
2337 2635  * the state that we are coming from. This function is only called
2338 2636 * when we are adding new content to the cache.
2339 2637 */
2340 2638 static void
2341 2639 arc_adapt(int bytes, arc_state_t *state)
2342 2640 {
2343 2641 int mult;
2344 2642 uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
2345 2643
2346 2644 if (state == arc_l2c_only)
2347 2645 return;
2348 2646
2349 2647 ASSERT(bytes > 0);
2350 2648 /*
2351 2649 * Adapt the target size of the MRU list:
2352 2650 * - if we just hit in the MRU ghost list, then increase
2353 2651 * the target size of the MRU list.
2354 2652 * - if we just hit in the MFU ghost list, then increase
2355 2653 * the target size of the MFU list by decreasing the
2356 2654 * target size of the MRU list.
2357 2655 */
2358 2656 if (state == arc_mru_ghost) {
2359 2657 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
2360 2658 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
2361 2659 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
2362 2660
2363 2661 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
2364 2662 } else if (state == arc_mfu_ghost) {
2365 2663 uint64_t delta;
2366 2664
2367 2665 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2368 2666 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
2369 2667 mult = MIN(mult, 10);
2370 2668
2371 2669 delta = MIN(bytes * mult, arc_p);
2372 2670 arc_p = MAX(arc_p_min, arc_p - delta);
2373 2671 }
2374 2672 ASSERT((int64_t)arc_p >= 0);
2375 2673
2376 2674 if (arc_reclaim_needed()) {
2377 2675 cv_signal(&arc_reclaim_thr_cv);
2378 2676 return;
2379 2677 }
2380 2678
2381 2679 if (arc_no_grow)
2382 2680 return;
2383 2681
2384 2682 if (arc_c >= arc_c_max)
2385 2683 return;
2386 2684
2387 2685 /*
2388 2686 * If we're within (2 * maxblocksize) bytes of the target
2389 2687 * cache size, increment the target cache size
2390 2688 */
2391 2689 if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2392 2690 atomic_add_64(&arc_c, (int64_t)bytes);
2393 2691 if (arc_c > arc_c_max)
2394 2692 arc_c = arc_c_max;
2395 2693 else if (state == arc_anon)
2396 2694 atomic_add_64(&arc_p, (int64_t)bytes);
2397 2695 if (arc_p > arc_c)
2398 2696 arc_p = arc_c;
2399 2697 }
2400 2698 ASSERT((int64_t)arc_p >= 0);
2401 2699 }
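
To make the ghost-hit adaptation above easier to trace, here is an editorial sketch of the MRU-ghost case over plain numbers; the inputs are snapshots rather than live arc state, mru_ghost is assumed non-zero (as it is at a ghost hit), and the figures in the comment are invented.

/*
 * Editorial sketch: with mru_ghost = 1G, mfu_ghost = 3G and a 128K
 * miss, mult = 3, so arc_p grows by 384K, clamped to c - p_min.
 */
static uint64_t
example_mru_ghost_bump(uint64_t p, uint64_t c, uint64_t p_min,
    uint64_t mru_ghost, uint64_t mfu_ghost, int bytes)
{
	int mult;

	/* favor the list whose ghost is seeing more hits */
	mult = (mru_ghost >= mfu_ghost) ? 1 : (int)(mfu_ghost / mru_ghost);
	mult = MIN(mult, 10);		/* avoid wild arc_p adjustment */

	return (MIN(c - p_min, p + (uint64_t)bytes * mult));
}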
2402 2700
2403 2701 /*
2404 2702 * Check if the cache has reached its limits and eviction is required
2405 2703 * prior to insert.
2406 2704 */
2407 2705 static int
2408 2706 arc_evict_needed(arc_buf_contents_t type)
2409 2707 {
2410 2708 if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2411 2709 return (1);
2412 2710
2413 2711 if (arc_reclaim_needed())
2414 2712 return (1);
2415 2713
2416 2714 return (arc_size > arc_c);
2417 2715 }
2418 2716
2419 2717 /*
2420 2718 * The buffer, supplied as the first argument, needs a data block.
2421 2719 * So, if we are at cache max, determine which cache should be victimized.
2422 2720 * We have the following cases:
2423 2721 *
2424 2722 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
2425 2723 * In this situation if we're out of space, but the resident size of the MFU is
2426 2724 * under the limit, victimize the MFU cache to satisfy this insertion request.
2427 2725 *
2428 2726 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
2429 2727 * Here, we've used up all of the available space for the MRU, so we need to
2430 2728 * evict from our own cache instead. Evict from the set of resident MRU
2431 2729 * entries.
2432 2730 *
2433 2731 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
2434 2732 * c minus p represents the MFU space in the cache, since p is the size of the
2435 2733 * cache that is dedicated to the MRU. In this situation there's still space on
2436 2734 * the MFU side, so the MRU side needs to be victimized.
2437 2735 *
2438 2736 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
2439 2737 * MFU's resident set is consuming more space than it has been allotted. In
2440 2738 * this situation, we must victimize our own cache, the MFU, for this insertion.
2441 2739 */
2442 2740 static void
2443 2741 arc_get_data_buf(arc_buf_t *buf)
2444 2742 {
2445 2743 arc_state_t *state = buf->b_hdr->b_state;
2446 2744 uint64_t size = buf->b_hdr->b_size;
2447 2745 arc_buf_contents_t type = buf->b_hdr->b_type;
2448 2746
2449 2747 arc_adapt(size, state);
2450 2748
2451 2749 /*
2452 2750 * We have not yet reached cache maximum size,
2453 2751 * just allocate a new buffer.
2454 2752 */
2455 2753 if (!arc_evict_needed(type)) {
2456 2754 if (type == ARC_BUFC_METADATA) {
2457 2755 buf->b_data = zio_buf_alloc(size);
2458 2756 arc_space_consume(size, ARC_SPACE_DATA);
2459 2757 } else {
2460 2758 ASSERT(type == ARC_BUFC_DATA);
2461 2759 buf->b_data = zio_data_buf_alloc(size);
2462 2760 ARCSTAT_INCR(arcstat_data_size, size);
2463 2761 atomic_add_64(&arc_size, size);
2464 2762 }
2465 2763 goto out;
2466 2764 }
2467 2765
2468 2766 /*
2469 2767 * If we are prefetching from the mfu ghost list, this buffer
2470 2768 * will end up on the mru list; so steal space from there.
2471 2769 */
2472 2770 if (state == arc_mfu_ghost)
2473 2771 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2474 2772 else if (state == arc_mru_ghost)
2475 2773 state = arc_mru;
2476 2774
2477 2775 if (state == arc_mru || state == arc_anon) {
2478 2776 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2479 2777 state = (arc_mfu->arcs_lsize[type] >= size &&
2480 2778 arc_p > mru_used) ? arc_mfu : arc_mru;
2481 2779 } else {
2482 2780 /* MFU cases */
2483 2781 uint64_t mfu_space = arc_c - arc_p;
2484 2782 state = (arc_mru->arcs_lsize[type] >= size &&
2485 2783 mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2486 2784 }
2487 2785 if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) {
2488 2786 if (type == ARC_BUFC_METADATA) {
2489 2787 buf->b_data = zio_buf_alloc(size);
2490 2788 arc_space_consume(size, ARC_SPACE_DATA);
2491 2789 } else {
2492 2790 ASSERT(type == ARC_BUFC_DATA);
2493 2791 buf->b_data = zio_data_buf_alloc(size);
2494 2792 ARCSTAT_INCR(arcstat_data_size, size);
2495 2793 atomic_add_64(&arc_size, size);
2496 2794 }
2497 2795 ARCSTAT_BUMP(arcstat_recycle_miss);
2498 2796 }
2499 2797 ASSERT(buf->b_data != NULL);
2500 2798 out:
2501 2799 /*
2502 2800 * Update the state size. Note that ghost states have a
2503 2801 * "ghost size" and so don't need to be updated.
2504 2802 */
2505 2803 if (!GHOST_STATE(buf->b_hdr->b_state)) {
2506 2804 arc_buf_hdr_t *hdr = buf->b_hdr;
2507 2805
2508 2806 atomic_add_64(&hdr->b_state->arcs_size, size);
2509 2807 if (list_link_active(&hdr->b_arc_node)) {
2510 2808 ASSERT(refcount_is_zero(&hdr->b_refcnt));
2511 2809 atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2512 2810 }
2513 2811 /*
2514 2812 * If we are growing the cache, and we are adding anonymous
2515 2813 * data, and we have outgrown arc_p, update arc_p
2516 2814 */
2517 2815 if (arc_size < arc_c && hdr->b_state == arc_anon &&
2518 2816 arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
2519 2817 arc_p = MIN(arc_c, arc_p + size);
2520 2818 }
2521 2819 }
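
The four cases in the block comment above arc_get_data_buf() boil down to one comparison per side; the sketch below is editorial and restates that policy as a pure function over hypothetical size snapshots rather than live arc state.

/*
 * Editorial sketch of victim selection: cases 1/2 apply when the
 * insert is headed for the MRU, cases 3/4 when it is headed for the
 * MFU.  All sizes are caller-supplied snapshots.
 */
static arc_state_t *
example_choose_victim(arc_state_t *state, uint64_t size, uint64_t p,
    uint64_t c, uint64_t anon_sz, uint64_t mru_sz, uint64_t mru_lsize,
    uint64_t mfu_sz, uint64_t mfu_lsize)
{
	if (state == arc_mru || state == arc_anon) {
		/* cases 1 and 2: steal from MFU only if it has room */
		return ((mfu_lsize >= size && p > anon_sz + mru_sz) ?
		    arc_mfu : arc_mru);
	}
	/* cases 3 and 4: steal from MRU only if MFU is within c - p */
	return ((mru_lsize >= size && (c - p) > mfu_sz) ?
	    arc_mru : arc_mfu);
}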
2522 2820
2523 2821 /*
2524 2822 * This routine is called whenever a buffer is accessed.
2525 2823 * NOTE: the hash lock is dropped in this function.
2526 2824 */
2527 2825 static void
2528 2826 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
2529 2827 {
2530 2828 clock_t now;
2531 2829
2532 2830 ASSERT(MUTEX_HELD(hash_lock));
2533 2831
2534 2832 if (buf->b_state == arc_anon) {
2535 2833 /*
2536 2834 * This buffer is not in the cache, and does not
2537 2835 * appear in our "ghost" list. Add the new buffer
2538 2836 * to the MRU state.
2539 2837 */
2540 2838
2541 2839 ASSERT(buf->b_arc_access == 0);
2542 2840 buf->b_arc_access = ddi_get_lbolt();
2543 2841 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2544 2842 arc_change_state(arc_mru, buf, hash_lock);
2545 2843
2546 2844 } else if (buf->b_state == arc_mru) {
2547 2845 now = ddi_get_lbolt();
2548 2846
2549 2847 /*
2550 2848 * If this buffer is here because of a prefetch, then either:
2551 2849 * - clear the flag if this is a "referencing" read
2552 2850 * (any subsequent access will bump this into the MFU state).
2553 2851 * or
2554 2852 * - move the buffer to the head of the list if this is
2555 2853 * another prefetch (to make it less likely to be evicted).
2556 2854 */
2557 2855 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2558 2856 if (refcount_count(&buf->b_refcnt) == 0) {
2559 2857 ASSERT(list_link_active(&buf->b_arc_node));
2560 2858 } else {
2561 2859 buf->b_flags &= ~ARC_PREFETCH;
2562 2860 ARCSTAT_BUMP(arcstat_mru_hits);
2563 2861 }
2564 2862 buf->b_arc_access = now;
2565 2863 return;
2566 2864 }
2567 2865
2568 2866 /*
2569 2867 * This buffer has been "accessed" only once so far,
2570 2868 * but it is still in the cache. Move it to the MFU
2571 2869 * state.
2572 2870 */
2573 2871 if (now > buf->b_arc_access + ARC_MINTIME) {
2574 2872 /*
2575 2873 * More than 125ms have passed since we
2576 2874 * instantiated this buffer. Move it to the
2577 2875 * most frequently used state.
2578 2876 */
2579 2877 buf->b_arc_access = now;
2580 2878 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2581 2879 arc_change_state(arc_mfu, buf, hash_lock);
2582 2880 }
2583 2881 ARCSTAT_BUMP(arcstat_mru_hits);
2584 2882 } else if (buf->b_state == arc_mru_ghost) {
2585 2883 arc_state_t *new_state;
2586 2884 /*
2587 2885 * This buffer has been "accessed" recently, but
2588 2886 * was evicted from the cache. Move it to the
2589 2887 * MFU state.
2590 2888 */
2591 2889
2592 2890 if (buf->b_flags & ARC_PREFETCH) {
2593 2891 new_state = arc_mru;
2594 2892 if (refcount_count(&buf->b_refcnt) > 0)
2595 2893 buf->b_flags &= ~ARC_PREFETCH;
2596 2894 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2597 2895 } else {
2598 2896 new_state = arc_mfu;
2599 2897 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2600 2898 }
2601 2899
2602 2900 buf->b_arc_access = ddi_get_lbolt();
2603 2901 arc_change_state(new_state, buf, hash_lock);
2604 2902
2605 2903 ARCSTAT_BUMP(arcstat_mru_ghost_hits);
2606 2904 } else if (buf->b_state == arc_mfu) {
2607 2905 /*
2608 2906 * This buffer has been accessed more than once and is
2609 2907 * still in the cache. Keep it in the MFU state.
2610 2908 *
2611 2909 * NOTE: an add_reference() that occurred when we did
2612 2910 * the arc_read() will have kicked this off the list.
2613 2911 * If it was a prefetch, we will explicitly move it to
2614 2912 * the head of the list now.
2615 2913 */
2616 2914 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2617 2915 ASSERT(refcount_count(&buf->b_refcnt) == 0);
2618 2916 ASSERT(list_link_active(&buf->b_arc_node));
2619 2917 }
2620 2918 ARCSTAT_BUMP(arcstat_mfu_hits);
2621 2919 buf->b_arc_access = ddi_get_lbolt();
2622 2920 } else if (buf->b_state == arc_mfu_ghost) {
2623 2921 arc_state_t *new_state = arc_mfu;
2624 2922 /*
2625 2923 * This buffer has been accessed more than once but has
2626 2924 * been evicted from the cache. Move it back to the
2627 2925 * MFU state.
2628 2926 */
2629 2927
2630 2928 if (buf->b_flags & ARC_PREFETCH) {
2631 2929 /*
2632 2930 * This is a prefetch access...
2633 2931 * move this block back to the MRU state.
2634 2932 */
2635 2933 ASSERT0(refcount_count(&buf->b_refcnt));
2636 2934 new_state = arc_mru;
2637 2935 }
2638 2936
2639 2937 buf->b_arc_access = ddi_get_lbolt();
2640 2938 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2641 2939 arc_change_state(new_state, buf, hash_lock);
2642 2940
2643 2941 ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
2644 2942 } else if (buf->b_state == arc_l2c_only) {
2645 2943 /*
2646 2944 * This buffer is on the 2nd Level ARC.
2647 2945 */
2648 2946
2649 2947 buf->b_arc_access = ddi_get_lbolt();
2650 2948 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2651 2949 arc_change_state(arc_mfu, buf, hash_lock);
2652 2950 } else {
2653 2951 ASSERT(!"invalid arc state");
2654 2952 }
2655 2953 }
2656 2954
2657 2955 /* a generic arc_done_func_t which you can use */
2658 2956 /* ARGSUSED */
2659 2957 void
2660 2958 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
2661 2959 {
2662 2960 if (zio == NULL || zio->io_error == 0)
2663 2961 bcopy(buf->b_data, arg, buf->b_hdr->b_size);
2664 2962 VERIFY(arc_buf_remove_ref(buf, arg));
2665 2963 }
2666 2964
2667 2965 /* a generic arc_done_func_t */
2668 2966 void
2669 2967 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
2670 2968 {
2671 2969 arc_buf_t **bufp = arg;
2672 2970 if (zio && zio->io_error) {
2673 2971 VERIFY(arc_buf_remove_ref(buf, arg));
2674 2972 *bufp = NULL;
2675 2973 } else {
2676 2974 *bufp = buf;
2677 2975 ASSERT(buf->b_data);
2678 2976 }
2679 2977 }
2680 2978
2681 2979 static void
2682 2980 arc_read_done(zio_t *zio)
2683 2981 {
2684 2982 arc_buf_hdr_t *hdr, *found;
2685 2983 arc_buf_t *buf;
2686 2984 arc_buf_t *abuf; /* buffer we're assigning to callback */
2687 2985 kmutex_t *hash_lock;
2688 2986 arc_callback_t *callback_list, *acb;
2689 2987 int freeable = FALSE;
2690 2988
2691 2989 buf = zio->io_private;
2692 2990 hdr = buf->b_hdr;
2693 2991
2694 2992 /*
2695 2993 * The hdr was inserted into hash-table and removed from lists
2696 2994 * prior to starting I/O. We should find this header, since
2697 2995 * it's in the hash table, and it should be legit since it's
2698 2996 * not possible to evict it during the I/O. The only possible
2699 2997 * reason for it not to be found is if we were freed during the
2700 2998 * read.
2701 2999 */
2702 3000 found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
2703 3001 &hash_lock);
2704 3002
2705 3003 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
2706 3004 (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
2707 3005 (found == hdr && HDR_L2_READING(hdr)));
2708 3006
2709 3007 hdr->b_flags &= ~ARC_L2_EVICTED;
2710 3008 if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
2711 3009 hdr->b_flags &= ~ARC_L2CACHE;
2712 3010
2713 3011 /* byteswap if necessary */
2714 3012 callback_list = hdr->b_acb;
2715 3013 ASSERT(callback_list != NULL);
2716 3014 if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
2717 3015 dmu_object_byteswap_t bswap =
2718 3016 DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
2719 3017 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
2720 3018 byteswap_uint64_array :
2721 3019 dmu_ot_byteswap[bswap].ob_func;
2722 3020 func(buf->b_data, hdr->b_size);
2723 3021 }
2724 3022
2725 3023 arc_cksum_compute(buf, B_FALSE);
2726 3024 arc_buf_watch(buf);
2727 3025
2728 3026 if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
2729 3027 /*
2730 3028 * Only call arc_access on anonymous buffers. This is because
2731 3029 * if we've issued an I/O for an evicted buffer, we've already
2732 3030 * called arc_access (to prevent any simultaneous readers from
2733 3031 * getting confused).
2734 3032 */
2735 3033 arc_access(hdr, hash_lock);
2736 3034 }
2737 3035
2738 3036 /* create copies of the data buffer for the callers */
2739 3037 abuf = buf;
2740 3038 for (acb = callback_list; acb; acb = acb->acb_next) {
2741 3039 if (acb->acb_done) {
2742 3040 if (abuf == NULL) {
2743 3041 ARCSTAT_BUMP(arcstat_duplicate_reads);
2744 3042 abuf = arc_buf_clone(buf);
2745 3043 }
2746 3044 acb->acb_buf = abuf;
2747 3045 abuf = NULL;
2748 3046 }
2749 3047 }
2750 3048 hdr->b_acb = NULL;
2751 3049 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
2752 3050 ASSERT(!HDR_BUF_AVAILABLE(hdr));
2753 3051 if (abuf == buf) {
2754 3052 ASSERT(buf->b_efunc == NULL);
2755 3053 ASSERT(hdr->b_datacnt == 1);
2756 3054 hdr->b_flags |= ARC_BUF_AVAILABLE;
2757 3055 }
2758 3056
2759 3057 ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
2760 3058
2761 3059 if (zio->io_error != 0) {
2762 3060 hdr->b_flags |= ARC_IO_ERROR;
2763 3061 if (hdr->b_state != arc_anon)
2764 3062 arc_change_state(arc_anon, hdr, hash_lock);
2765 3063 if (HDR_IN_HASH_TABLE(hdr))
2766 3064 buf_hash_remove(hdr);
2767 3065 freeable = refcount_is_zero(&hdr->b_refcnt);
2768 3066 }
2769 3067
2770 3068 /*
2771 3069 * Broadcast before we drop the hash_lock to avoid the possibility
2772 3070 * that the hdr (and hence the cv) might be freed before we get to
2773 3071 * the cv_broadcast().
2774 3072 */
2775 3073 cv_broadcast(&hdr->b_cv);
2776 3074
2777 3075 if (hash_lock) {
2778 3076 mutex_exit(hash_lock);
2779 3077 } else {
2780 3078 /*
2781 3079 * This block was freed while we waited for the read to
2782 3080 * complete. It has been removed from the hash table and
2783 3081 * moved to the anonymous state (so that it won't show up
2784 3082 * in the cache).
2785 3083 */
2786 3084 ASSERT3P(hdr->b_state, ==, arc_anon);
2787 3085 freeable = refcount_is_zero(&hdr->b_refcnt);
2788 3086 }
2789 3087
2790 3088 /* execute each callback and free its structure */
2791 3089 while ((acb = callback_list) != NULL) {
2792 3090 if (acb->acb_done)
2793 3091 acb->acb_done(zio, acb->acb_buf, acb->acb_private);
2794 3092
2795 3093 if (acb->acb_zio_dummy != NULL) {
2796 3094 acb->acb_zio_dummy->io_error = zio->io_error;
2797 3095 zio_nowait(acb->acb_zio_dummy);
2798 3096 }
2799 3097
2800 3098 callback_list = acb->acb_next;
2801 3099 kmem_free(acb, sizeof (arc_callback_t));
2802 3100 }
2803 3101
2804 3102 if (freeable)
2805 3103 arc_hdr_destroy(hdr);
2806 3104 }
2807 3105
2808 3106 /*
2809 3107 * "Read" the block at the specified DVA (in bp) via the
2810 3108 * cache. If the block is found in the cache, invoke the provided
2811 3109 * callback immediately and return. Note that the `zio' parameter
2812 3110 * in the callback will be NULL in this case, since no IO was
2813 3111 * required. If the block is not in the cache pass the read request
2814 3112 * on to the spa with a substitute callback function, so that the
2815 3113 * requested block will be added to the cache.
2816 3114 *
2817 3115 * If a read request arrives for a block that has a read in-progress,
2818 3116 * either wait for the in-progress read to complete (and return the
2819 3117 * results); or, if this is a read with a "done" func, add a record
2820 3118 * to the read to invoke the "done" func when the read completes,
2821 3119 * and return; or just return.
2822 3120 *
2823 3121 * arc_read_done() will invoke all the requested "done" functions
2824 3122 * for readers of this block.
2825 3123 */
2826 3124 int
2827 3125 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
2828 3126 void *private, int priority, int zio_flags, uint32_t *arc_flags,
2829 3127 const zbookmark_t *zb)
2830 3128 {
2831 3129 arc_buf_hdr_t *hdr;
2832 3130 arc_buf_t *buf = NULL;
2833 3131 kmutex_t *hash_lock;
2834 3132 zio_t *rzio;
2835 3133 uint64_t guid = spa_load_guid(spa);
2836 3134
2837 3135 top:
2838 3136 hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
2839 3137 &hash_lock);
2840 3138 if (hdr && hdr->b_datacnt > 0) {
2841 3139
2842 3140 *arc_flags |= ARC_CACHED;
2843 3141
2844 3142 if (HDR_IO_IN_PROGRESS(hdr)) {
2845 3143
2846 3144 if (*arc_flags & ARC_WAIT) {
2847 3145 cv_wait(&hdr->b_cv, hash_lock);
2848 3146 mutex_exit(hash_lock);
2849 3147 goto top;
2850 3148 }
2851 3149 ASSERT(*arc_flags & ARC_NOWAIT);
2852 3150
2853 3151 if (done) {
2854 3152 arc_callback_t *acb = NULL;
2855 3153
2856 3154 acb = kmem_zalloc(sizeof (arc_callback_t),
2857 3155 KM_SLEEP);
2858 3156 acb->acb_done = done;
2859 3157 acb->acb_private = private;
2860 3158 if (pio != NULL)
2861 3159 acb->acb_zio_dummy = zio_null(pio,
2862 3160 spa, NULL, NULL, NULL, zio_flags);
2863 3161
2864 3162 ASSERT(acb->acb_done != NULL);
2865 3163 acb->acb_next = hdr->b_acb;
2866 3164 hdr->b_acb = acb;
2867 3165 add_reference(hdr, hash_lock, private);
2868 3166 mutex_exit(hash_lock);
2869 3167 return (0);
2870 3168 }
2871 3169 mutex_exit(hash_lock);
2872 3170 return (0);
2873 3171 }
2874 3172
2875 3173 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
2876 3174
2877 3175 if (done) {
2878 3176 add_reference(hdr, hash_lock, private);
2879 3177 /*
2880 3178 * If this block is already in use, create a new
2881 3179 * copy of the data so that we will be guaranteed
2882 3180 * that arc_release() will always succeed.
2883 3181 */
2884 3182 buf = hdr->b_buf;
2885 3183 ASSERT(buf);
2886 3184 ASSERT(buf->b_data);
2887 3185 if (HDR_BUF_AVAILABLE(hdr)) {
2888 3186 ASSERT(buf->b_efunc == NULL);
2889 3187 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
2890 3188 } else {
2891 3189 buf = arc_buf_clone(buf);
2892 3190 }
2893 3191
2894 3192 } else if (*arc_flags & ARC_PREFETCH &&
2895 3193 refcount_count(&hdr->b_refcnt) == 0) {
2896 3194 hdr->b_flags |= ARC_PREFETCH;
2897 3195 }
2898 3196 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
2899 3197 arc_access(hdr, hash_lock);
2900 3198 if (*arc_flags & ARC_L2CACHE)
2901 3199 hdr->b_flags |= ARC_L2CACHE;
2902 3200 if (*arc_flags & ARC_L2COMPRESS)
2903 3201 hdr->b_flags |= ARC_L2COMPRESS;
2904 3202 mutex_exit(hash_lock);
2905 3203 ARCSTAT_BUMP(arcstat_hits);
2906 3204 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
2907 3205 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
2908 3206 data, metadata, hits);
2909 3207
2910 3208 if (done)
2911 3209 done(NULL, buf, private);
2912 3210 } else {
2913 3211 uint64_t size = BP_GET_LSIZE(bp);
2914 3212 arc_callback_t *acb;
2915 3213 vdev_t *vd = NULL;
2916 3214 uint64_t addr = 0;
2917 3215 boolean_t devw = B_FALSE;
2918 3216
2919 3217 if (hdr == NULL) {
2920 3218 /* this block is not in the cache */
2921 3219 arc_buf_hdr_t *exists;
2922 3220 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
2923 3221 buf = arc_buf_alloc(spa, size, private, type);
2924 3222 hdr = buf->b_hdr;
2925 3223 hdr->b_dva = *BP_IDENTITY(bp);
2926 3224 hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
2927 3225 hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
2928 3226 exists = buf_hash_insert(hdr, &hash_lock);
2929 3227 if (exists) {
2930 3228 /* somebody beat us to the hash insert */
2931 3229 mutex_exit(hash_lock);
2932 3230 buf_discard_identity(hdr);
2933 3231 (void) arc_buf_remove_ref(buf, private);
2934 3232 goto top; /* restart the IO request */
2935 3233 }
2936 3234 /* if this is a prefetch, we don't have a reference */
2937 3235 if (*arc_flags & ARC_PREFETCH) {
2938 3236 (void) remove_reference(hdr, hash_lock,
2939 3237 private);
2940 3238 hdr->b_flags |= ARC_PREFETCH;
2941 3239 }
2942 3240 if (*arc_flags & ARC_L2CACHE)
2943 3241 hdr->b_flags |= ARC_L2CACHE;
2944 3242 if (*arc_flags & ARC_L2COMPRESS)
2945 3243 hdr->b_flags |= ARC_L2COMPRESS;
2946 3244 if (BP_GET_LEVEL(bp) > 0)
2947 3245 hdr->b_flags |= ARC_INDIRECT;
2948 3246 } else {
2949 3247 /* this block is in the ghost cache */
2950 3248 ASSERT(GHOST_STATE(hdr->b_state));
2951 3249 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2952 3250 ASSERT0(refcount_count(&hdr->b_refcnt));
2953 3251 ASSERT(hdr->b_buf == NULL);
2954 3252
2955 3253 /* if this is a prefetch, we don't have a reference */
2956 3254 if (*arc_flags & ARC_PREFETCH)
2957 3255 hdr->b_flags |= ARC_PREFETCH;
2958 3256 else
2959 3257 add_reference(hdr, hash_lock, private);
2960 3258 if (*arc_flags & ARC_L2CACHE)
2961 3259 hdr->b_flags |= ARC_L2CACHE;
2962 3260 if (*arc_flags & ARC_L2COMPRESS)
2963 3261 hdr->b_flags |= ARC_L2COMPRESS;
2964 3262 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
2965 3263 buf->b_hdr = hdr;
2966 3264 buf->b_data = NULL;
2967 3265 buf->b_efunc = NULL;
2968 3266 buf->b_private = NULL;
2969 3267 buf->b_next = NULL;
2970 3268 hdr->b_buf = buf;
2971 3269 ASSERT(hdr->b_datacnt == 0);
2972 3270 hdr->b_datacnt = 1;
2973 3271 arc_get_data_buf(buf);
2974 3272 arc_access(hdr, hash_lock);
2975 3273 }
2976 3274
2977 3275 ASSERT(!GHOST_STATE(hdr->b_state));
2978 3276
2979 3277 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
2980 3278 acb->acb_done = done;
2981 3279 acb->acb_private = private;
2982 3280
2983 3281 ASSERT(hdr->b_acb == NULL);
2984 3282 hdr->b_acb = acb;
2985 3283 hdr->b_flags |= ARC_IO_IN_PROGRESS;
2986 3284
2987 3285 if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
2988 3286 (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
2989 3287 devw = hdr->b_l2hdr->b_dev->l2ad_writing;
2990 3288 addr = hdr->b_l2hdr->b_daddr;
2991 3289 /*
2992 3290 * Lock out device removal.
2993 3291 */
2994 3292 if (vdev_is_dead(vd) ||
2995 3293 !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
2996 3294 vd = NULL;
2997 3295 }
2998 3296
2999 3297 mutex_exit(hash_lock);
3000 3298
3001 3299 /*
3002 3300 * At this point, we have a level 1 cache miss. Try again in
3003 3301 * L2ARC if possible.
3004 3302 */
3005 3303 ASSERT3U(hdr->b_size, ==, size);
3006 3304 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
3007 3305 uint64_t, size, zbookmark_t *, zb);
3008 3306 ARCSTAT_BUMP(arcstat_misses);
3009 3307 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3010 3308 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3011 3309 data, metadata, misses);
3012 3310
3013 3311 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
3014 3312 /*
3015 3313 * Read from the L2ARC if the following are true:
3016 3314 * 1. The L2ARC vdev was previously cached.
3017 3315 * 2. This buffer still has L2ARC metadata.
3018 3316 * 3. This buffer isn't currently writing to the L2ARC.
3019 3317 * 4. The L2ARC entry wasn't evicted, which may
3020 3318 * also have invalidated the vdev.
3021 3319 	 *    5. This isn't a prefetch, or l2arc_noprefetch is not set.
3022 3320 */
3023 3321 if (hdr->b_l2hdr != NULL &&
3024 3322 !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
3025 3323 !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
3026 3324 l2arc_read_callback_t *cb;
3027 3325
3028 3326 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
3029 3327 ARCSTAT_BUMP(arcstat_l2_hits);
3030 3328
3031 3329 cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
3032 3330 KM_SLEEP);
3033 3331 cb->l2rcb_buf = buf;
3034 3332 cb->l2rcb_spa = spa;
3035 3333 cb->l2rcb_bp = *bp;
3036 3334 cb->l2rcb_zb = *zb;
3037 3335 cb->l2rcb_flags = zio_flags;
3038 3336 cb->l2rcb_compress = hdr->b_l2hdr->b_compress;
3039 3337
3040 3338 ASSERT(addr >= VDEV_LABEL_START_SIZE &&
3041 3339 addr + size < vd->vdev_psize -
3042 3340 VDEV_LABEL_END_SIZE);
3043 3341
3044 3342 /*
3045 3343 * l2arc read. The SCL_L2ARC lock will be
3046 3344 * released by l2arc_read_done().
3047 3345 * Issue a null zio if the underlying buffer
3048 3346 * was squashed to zero size by compression.
3049 3347 */
3050 3348 if (hdr->b_l2hdr->b_compress ==
3051 3349 ZIO_COMPRESS_EMPTY) {
3052 3350 rzio = zio_null(pio, spa, vd,
3053 3351 l2arc_read_done, cb,
3054 3352 zio_flags | ZIO_FLAG_DONT_CACHE |
3055 3353 ZIO_FLAG_CANFAIL |
3056 3354 ZIO_FLAG_DONT_PROPAGATE |
3057 3355 ZIO_FLAG_DONT_RETRY);
3058 3356 } else {
3059 3357 rzio = zio_read_phys(pio, vd, addr,
3060 3358 hdr->b_l2hdr->b_asize,
3061 3359 buf->b_data, ZIO_CHECKSUM_OFF,
3062 3360 l2arc_read_done, cb, priority,
3063 3361 zio_flags | ZIO_FLAG_DONT_CACHE |
3064 3362 ZIO_FLAG_CANFAIL |
3065 3363 ZIO_FLAG_DONT_PROPAGATE |
3066 3364 ZIO_FLAG_DONT_RETRY, B_FALSE);
3067 3365 }
3068 3366 DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
3069 3367 zio_t *, rzio);
3070 3368 ARCSTAT_INCR(arcstat_l2_read_bytes,
3071 3369 hdr->b_l2hdr->b_asize);
3072 3370
3073 3371 if (*arc_flags & ARC_NOWAIT) {
3074 3372 zio_nowait(rzio);
3075 3373 return (0);
3076 3374 }
3077 3375
3078 3376 ASSERT(*arc_flags & ARC_WAIT);
3079 3377 if (zio_wait(rzio) == 0)
3080 3378 return (0);
3081 3379
3082 3380 /* l2arc read error; goto zio_read() */
3083 3381 } else {
3084 3382 DTRACE_PROBE1(l2arc__miss,
3085 3383 arc_buf_hdr_t *, hdr);
3086 3384 ARCSTAT_BUMP(arcstat_l2_misses);
3087 3385 if (HDR_L2_WRITING(hdr))
3088 3386 ARCSTAT_BUMP(arcstat_l2_rw_clash);
3089 3387 spa_config_exit(spa, SCL_L2ARC, vd);
3090 3388 }
3091 3389 } else {
3092 3390 if (vd != NULL)
3093 3391 spa_config_exit(spa, SCL_L2ARC, vd);
3094 3392 if (l2arc_ndev != 0) {
3095 3393 DTRACE_PROBE1(l2arc__miss,
3096 3394 arc_buf_hdr_t *, hdr);
3097 3395 ARCSTAT_BUMP(arcstat_l2_misses);
3098 3396 }
3099 3397 }
3100 3398
3101 3399 rzio = zio_read(pio, spa, bp, buf->b_data, size,
3102 3400 arc_read_done, buf, priority, zio_flags, zb);
3103 3401
3104 3402 if (*arc_flags & ARC_WAIT)
3105 3403 return (zio_wait(rzio));
3106 3404
3107 3405 ASSERT(*arc_flags & ARC_NOWAIT);
3108 3406 zio_nowait(rzio);
3109 3407 }
3110 3408 return (0);
3111 3409 }
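
/*
 * Example (sketch): a typical synchronous consumer of arc_read() using the
 * generic arc_getbuf_func() callback above.  The wrapper example_read_block()
 * and its error handling are illustrative only; existing callers in the DMU
 * follow the same pattern.
 */
static int
example_read_block(spa_t *spa, const blkptr_t *bp, const zbookmark_t *zb)
{
	arc_buf_t *abuf = NULL;
	uint32_t aflags = ARC_WAIT;
	int err;

	err = arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);
	if (err == 0 && abuf != NULL) {
		/* consume BP_GET_LSIZE(bp) bytes at abuf->b_data, then: */
		(void) arc_buf_remove_ref(abuf, &abuf);
	}
	return (err);
}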
3112 3410
3113 3411 void
3114 3412 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
3115 3413 {
3116 3414 ASSERT(buf->b_hdr != NULL);
3117 3415 ASSERT(buf->b_hdr->b_state != arc_anon);
3118 3416 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
3119 3417 ASSERT(buf->b_efunc == NULL);
3120 3418 ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
3121 3419
3122 3420 buf->b_efunc = func;
3123 3421 buf->b_private = private;
3124 3422 }
3125 3423
3126 3424 /*
3127 3425 * Notify the arc that a block was freed, and thus will never be used again.
3128 3426 */
3129 3427 void
3130 3428 arc_freed(spa_t *spa, const blkptr_t *bp)
3131 3429 {
3132 3430 arc_buf_hdr_t *hdr;
3133 3431 kmutex_t *hash_lock;
3134 3432 uint64_t guid = spa_load_guid(spa);
3135 3433
3136 3434 hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
3137 3435 &hash_lock);
3138 3436 if (hdr == NULL)
3139 3437 return;
3140 3438 if (HDR_BUF_AVAILABLE(hdr)) {
3141 3439 arc_buf_t *buf = hdr->b_buf;
3142 3440 add_reference(hdr, hash_lock, FTAG);
3143 3441 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3144 3442 mutex_exit(hash_lock);
3145 3443
3146 3444 arc_release(buf, FTAG);
3147 3445 (void) arc_buf_remove_ref(buf, FTAG);
3148 3446 } else {
3149 3447 mutex_exit(hash_lock);
3150 3448 }
3151 3449
3152 3450 }
3153 3451
3154 3452 /*
3155 3453 * This is used by the DMU to let the ARC know that a buffer is
3156 3454 * being evicted, so the ARC should clean up. If this arc buf
3157 3455 * is not yet in the evicted state, it will be put there.
3158 3456 */
3159 3457 int
3160 3458 arc_buf_evict(arc_buf_t *buf)
3161 3459 {
3162 3460 arc_buf_hdr_t *hdr;
3163 3461 kmutex_t *hash_lock;
3164 3462 arc_buf_t **bufp;
3165 3463
3166 3464 mutex_enter(&buf->b_evict_lock);
3167 3465 hdr = buf->b_hdr;
3168 3466 if (hdr == NULL) {
3169 3467 /*
3170 3468 * We are in arc_do_user_evicts().
3171 3469 */
3172 3470 ASSERT(buf->b_data == NULL);
3173 3471 mutex_exit(&buf->b_evict_lock);
3174 3472 return (0);
3175 3473 } else if (buf->b_data == NULL) {
3176 3474 arc_buf_t copy = *buf; /* structure assignment */
3177 3475 /*
3178 3476 * We are on the eviction list; process this buffer now
3179 3477 * but let arc_do_user_evicts() do the reaping.
3180 3478 */
3181 3479 buf->b_efunc = NULL;
3182 3480 mutex_exit(&buf->b_evict_lock);
3183 3481 		VERIFY(copy.b_efunc(&copy) == 0);
3184 3482 return (1);
3185 3483 }
3186 3484 hash_lock = HDR_LOCK(hdr);
3187 3485 mutex_enter(hash_lock);
3188 3486 hdr = buf->b_hdr;
3189 3487 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3190 3488
3191 3489 ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
3192 3490 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3193 3491
3194 3492 /*
3195 3493 * Pull this buffer off of the hdr
3196 3494 */
3197 3495 bufp = &hdr->b_buf;
3198 3496 while (*bufp != buf)
3199 3497 bufp = &(*bufp)->b_next;
3200 3498 *bufp = buf->b_next;
3201 3499
3202 3500 ASSERT(buf->b_data != NULL);
3203 3501 arc_buf_destroy(buf, FALSE, FALSE);
3204 3502
3205 3503 if (hdr->b_datacnt == 0) {
3206 3504 arc_state_t *old_state = hdr->b_state;
3207 3505 arc_state_t *evicted_state;
3208 3506
3209 3507 ASSERT(hdr->b_buf == NULL);
3210 3508 ASSERT(refcount_is_zero(&hdr->b_refcnt));
3211 3509
3212 3510 evicted_state =
3213 3511 (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
3214 3512
3215 3513 mutex_enter(&old_state->arcs_mtx);
3216 3514 mutex_enter(&evicted_state->arcs_mtx);
3217 3515
3218 3516 arc_change_state(evicted_state, hdr, hash_lock);
3219 3517 ASSERT(HDR_IN_HASH_TABLE(hdr));
3220 3518 hdr->b_flags |= ARC_IN_HASH_TABLE;
3221 3519 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3222 3520
3223 3521 mutex_exit(&evicted_state->arcs_mtx);
3224 3522 mutex_exit(&old_state->arcs_mtx);
3225 3523 }
3226 3524 mutex_exit(hash_lock);
3227 3525 mutex_exit(&buf->b_evict_lock);
3228 3526
3229 3527 VERIFY(buf->b_efunc(buf) == 0);
3230 3528 buf->b_efunc = NULL;
3231 3529 buf->b_private = NULL;
3232 3530 buf->b_hdr = NULL;
3233 3531 buf->b_next = NULL;
3234 3532 kmem_cache_free(buf_cache, buf);
3235 3533 return (1);
3236 3534 }
3237 3535
3238 3536 /*
3239 3537 * Release this buffer from the cache, making it an anonymous buffer. This
3240 3538 * must be done after a read and prior to modifying the buffer contents.
3241 3539 * If the buffer has more than one reference, we must make
3242 3540 * a new hdr for the buffer.
3243 3541 */
3244 3542 void
3245 3543 arc_release(arc_buf_t *buf, void *tag)
3246 3544 {
3247 3545 arc_buf_hdr_t *hdr;
3248 3546 kmutex_t *hash_lock = NULL;
3249 3547 l2arc_buf_hdr_t *l2hdr;
3250 3548 uint64_t buf_size;
3251 3549
3252 3550 /*
3253 3551 * It would be nice to assert that if it's DMU metadata (level >
3254 3552 * 0 || it's the dnode file), then it must be syncing context.
3255 3553 * But we don't know that information at this level.
3256 3554 */
3257 3555
3258 3556 mutex_enter(&buf->b_evict_lock);
3259 3557 hdr = buf->b_hdr;
3260 3558
3261 3559 /* this buffer is not on any list */
3262 3560 ASSERT(refcount_count(&hdr->b_refcnt) > 0);
3263 3561
3264 3562 if (hdr->b_state == arc_anon) {
3265 3563 /* this buffer is already released */
3266 3564 ASSERT(buf->b_efunc == NULL);
3267 3565 } else {
3268 3566 hash_lock = HDR_LOCK(hdr);
3269 3567 mutex_enter(hash_lock);
3270 3568 hdr = buf->b_hdr;
3271 3569 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3272 3570 }
3273 3571
3274 3572 l2hdr = hdr->b_l2hdr;
3275 3573 if (l2hdr) {
3276 3574 mutex_enter(&l2arc_buflist_mtx);
3277 3575 hdr->b_l2hdr = NULL;
3278 3576 }
3279 3577 buf_size = hdr->b_size;
3280 3578
3281 3579 /*
3282 3580 * Do we have more than one buf?
3283 3581 */
3284 3582 if (hdr->b_datacnt > 1) {
3285 3583 arc_buf_hdr_t *nhdr;
3286 3584 arc_buf_t **bufp;
3287 3585 uint64_t blksz = hdr->b_size;
3288 3586 uint64_t spa = hdr->b_spa;
3289 3587 arc_buf_contents_t type = hdr->b_type;
3290 3588 uint32_t flags = hdr->b_flags;
3291 3589
3292 3590 ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
3293 3591 /*
3294 3592 * Pull the data off of this hdr and attach it to
3295 3593 * a new anonymous hdr.
3296 3594 */
3297 3595 (void) remove_reference(hdr, hash_lock, tag);
3298 3596 bufp = &hdr->b_buf;
3299 3597 while (*bufp != buf)
3300 3598 bufp = &(*bufp)->b_next;
3301 3599 *bufp = buf->b_next;
3302 3600 buf->b_next = NULL;
3303 3601
3304 3602 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
3305 3603 atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
3306 3604 if (refcount_is_zero(&hdr->b_refcnt)) {
3307 3605 uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
3308 3606 ASSERT3U(*size, >=, hdr->b_size);
3309 3607 atomic_add_64(size, -hdr->b_size);
3310 3608 }
3311 3609
3312 3610 /*
3313 3611 * We're releasing a duplicate user data buffer, update
3314 3612 * our statistics accordingly.
3315 3613 */
3316 3614 if (hdr->b_type == ARC_BUFC_DATA) {
3317 3615 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
3318 3616 ARCSTAT_INCR(arcstat_duplicate_buffers_size,
3319 3617 -hdr->b_size);
3320 3618 }
3321 3619 hdr->b_datacnt -= 1;
3322 3620 arc_cksum_verify(buf);
3323 3621 arc_buf_unwatch(buf);
3324 3622
3325 3623 mutex_exit(hash_lock);
3326 3624
3327 3625 nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
3328 3626 nhdr->b_size = blksz;
3329 3627 nhdr->b_spa = spa;
3330 3628 nhdr->b_type = type;
3331 3629 nhdr->b_buf = buf;
3332 3630 nhdr->b_state = arc_anon;
3333 3631 nhdr->b_arc_access = 0;
3334 3632 nhdr->b_flags = flags & ARC_L2_WRITING;
3335 3633 nhdr->b_l2hdr = NULL;
3336 3634 nhdr->b_datacnt = 1;
3337 3635 nhdr->b_freeze_cksum = NULL;
3338 3636 (void) refcount_add(&nhdr->b_refcnt, tag);
3339 3637 buf->b_hdr = nhdr;
3340 3638 mutex_exit(&buf->b_evict_lock);
3341 3639 atomic_add_64(&arc_anon->arcs_size, blksz);
3342 3640 } else {
3343 3641 mutex_exit(&buf->b_evict_lock);
3344 3642 ASSERT(refcount_count(&hdr->b_refcnt) == 1);
3345 3643 ASSERT(!list_link_active(&hdr->b_arc_node));
3346 3644 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3347 3645 if (hdr->b_state != arc_anon)
3348 3646 arc_change_state(arc_anon, hdr, hash_lock);
3349 3647 hdr->b_arc_access = 0;
3350 3648 if (hash_lock)
3351 3649 mutex_exit(hash_lock);
3352 3650
3353 3651 buf_discard_identity(hdr);
3354 3652 arc_buf_thaw(buf);
3355 3653 }
3356 3654 buf->b_efunc = NULL;
3357 3655 buf->b_private = NULL;
3358 3656
3359 3657 if (l2hdr) {
3360 3658 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
3361 3659 list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
3362 3660 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
3363 3661 ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3364 3662 mutex_exit(&l2arc_buflist_mtx);
3365 3663 }
3366 3664 }
3367 3665
3368 3666 int
3369 3667 arc_released(arc_buf_t *buf)
3370 3668 {
3371 3669 int released;
3372 3670
3373 3671 mutex_enter(&buf->b_evict_lock);
3374 3672 released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
3375 3673 mutex_exit(&buf->b_evict_lock);
3376 3674 return (released);
3377 3675 }
3378 3676
3379 3677 int
3380 3678 arc_has_callback(arc_buf_t *buf)
3381 3679 {
3382 3680 int callback;
3383 3681
3384 3682 mutex_enter(&buf->b_evict_lock);
3385 3683 callback = (buf->b_efunc != NULL);
3386 3684 mutex_exit(&buf->b_evict_lock);
3387 3685 return (callback);
3388 3686 }
3389 3687
3390 3688 #ifdef ZFS_DEBUG
3391 3689 int
3392 3690 arc_referenced(arc_buf_t *buf)
3393 3691 {
3394 3692 int referenced;
3395 3693
3396 3694 mutex_enter(&buf->b_evict_lock);
3397 3695 referenced = (refcount_count(&buf->b_hdr->b_refcnt));
3398 3696 mutex_exit(&buf->b_evict_lock);
3399 3697 return (referenced);
3400 3698 }
3401 3699 #endif
3402 3700
3403 3701 static void
3404 3702 arc_write_ready(zio_t *zio)
3405 3703 {
3406 3704 arc_write_callback_t *callback = zio->io_private;
3407 3705 arc_buf_t *buf = callback->awcb_buf;
3408 3706 arc_buf_hdr_t *hdr = buf->b_hdr;
3409 3707
3410 3708 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
3411 3709 callback->awcb_ready(zio, buf, callback->awcb_private);
3412 3710
3413 3711 /*
3414 3712 * If the IO is already in progress, then this is a re-write
3415 3713 * attempt, so we need to thaw and re-compute the cksum.
3416 3714 * It is the responsibility of the callback to handle the
3417 3715 * accounting for any re-write attempt.
3418 3716 */
3419 3717 if (HDR_IO_IN_PROGRESS(hdr)) {
3420 3718 mutex_enter(&hdr->b_freeze_lock);
3421 3719 if (hdr->b_freeze_cksum != NULL) {
3422 3720 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3423 3721 hdr->b_freeze_cksum = NULL;
3424 3722 }
3425 3723 mutex_exit(&hdr->b_freeze_lock);
3426 3724 }
3427 3725 arc_cksum_compute(buf, B_FALSE);
3428 3726 hdr->b_flags |= ARC_IO_IN_PROGRESS;
3429 3727 }
3430 3728
3431 3729 static void
3432 3730 arc_write_done(zio_t *zio)
3433 3731 {
3434 3732 arc_write_callback_t *callback = zio->io_private;
3435 3733 arc_buf_t *buf = callback->awcb_buf;
3436 3734 arc_buf_hdr_t *hdr = buf->b_hdr;
3437 3735
3438 3736 ASSERT(hdr->b_acb == NULL);
3439 3737
3440 3738 if (zio->io_error == 0) {
3441 3739 hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3442 3740 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
3443 3741 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3444 3742 } else {
3445 3743 ASSERT(BUF_EMPTY(hdr));
3446 3744 }
3447 3745
3448 3746 /*
3449 3747 * If the block to be written was all-zero, we may have
3450 3748 * compressed it away. In this case no write was performed
3451 3749 * so there will be no dva/birth/checksum. The buffer must
3452 3750 * therefore remain anonymous (and uncached).
3453 3751 */
3454 3752 if (!BUF_EMPTY(hdr)) {
3455 3753 arc_buf_hdr_t *exists;
3456 3754 kmutex_t *hash_lock;
3457 3755
3458 3756 ASSERT(zio->io_error == 0);
3459 3757
3460 3758 arc_cksum_verify(buf);
3461 3759
3462 3760 exists = buf_hash_insert(hdr, &hash_lock);
3463 3761 if (exists) {
3464 3762 /*
3465 3763 * This can only happen if we overwrite for
3466 3764 * sync-to-convergence, because we remove
3467 3765 * buffers from the hash table when we arc_free().
3468 3766 */
3469 3767 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
3470 3768 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3471 3769 panic("bad overwrite, hdr=%p exists=%p",
3472 3770 (void *)hdr, (void *)exists);
3473 3771 ASSERT(refcount_is_zero(&exists->b_refcnt));
3474 3772 arc_change_state(arc_anon, exists, hash_lock);
3475 3773 mutex_exit(hash_lock);
3476 3774 arc_hdr_destroy(exists);
3477 3775 exists = buf_hash_insert(hdr, &hash_lock);
3478 3776 ASSERT3P(exists, ==, NULL);
3479 3777 } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
3480 3778 /* nopwrite */
3481 3779 ASSERT(zio->io_prop.zp_nopwrite);
3482 3780 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3483 3781 panic("bad nopwrite, hdr=%p exists=%p",
3484 3782 (void *)hdr, (void *)exists);
3485 3783 } else {
3486 3784 /* Dedup */
3487 3785 ASSERT(hdr->b_datacnt == 1);
3488 3786 ASSERT(hdr->b_state == arc_anon);
3489 3787 ASSERT(BP_GET_DEDUP(zio->io_bp));
3490 3788 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
3491 3789 }
3492 3790 }
3493 3791 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3494 3792 /* if it's not anon, we are doing a scrub */
3495 3793 if (!exists && hdr->b_state == arc_anon)
3496 3794 arc_access(hdr, hash_lock);
3497 3795 mutex_exit(hash_lock);
3498 3796 } else {
3499 3797 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3500 3798 }
3501 3799
3502 3800 ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3503 3801 callback->awcb_done(zio, buf, callback->awcb_private);
3504 3802
3505 3803 kmem_free(callback, sizeof (arc_write_callback_t));
3506 3804 }
3507 3805
3508 3806 zio_t *
3509 3807 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3510 3808 blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
3511 3809 const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *done,
3512 3810 void *private, int priority, int zio_flags, const zbookmark_t *zb)
3513 3811 {
3514 3812 arc_buf_hdr_t *hdr = buf->b_hdr;
3515 3813 arc_write_callback_t *callback;
3516 3814 zio_t *zio;
3517 3815
3518 3816 ASSERT(ready != NULL);
3519 3817 ASSERT(done != NULL);
3520 3818 ASSERT(!HDR_IO_ERROR(hdr));
3521 3819 ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3522 3820 ASSERT(hdr->b_acb == NULL);
3523 3821 if (l2arc)
3524 3822 hdr->b_flags |= ARC_L2CACHE;
3525 3823 if (l2arc_compress)
3526 3824 hdr->b_flags |= ARC_L2COMPRESS;
3527 3825 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
3528 3826 callback->awcb_ready = ready;
3529 3827 callback->awcb_done = done;
3530 3828 callback->awcb_private = private;
3531 3829 callback->awcb_buf = buf;
3532 3830
3533 3831 zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
3534 3832 arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
3535 3833
3536 3834 return (zio);
3537 3835 }
3538 3836
3539 3837 static int
3540 3838 arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
3541 3839 {
3542 3840 #ifdef _KERNEL
3543 3841 uint64_t available_memory = ptob(freemem);
3544 3842 static uint64_t page_load = 0;
3545 3843 static uint64_t last_txg = 0;
3546 3844
3547 3845 #if defined(__i386)
3548 3846 available_memory =
3549 3847 MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
3550 3848 #endif
3551 3849 if (available_memory >= zfs_write_limit_max)
3552 3850 return (0);
3553 3851
3554 3852 if (txg > last_txg) {
3555 3853 last_txg = txg;
3556 3854 page_load = 0;
3557 3855 }
3558 3856 /*
3559 3857 * If we are in pageout, we know that memory is already tight,
3560 3858 * the arc is already going to be evicting, so we just want to
3561 3859 * continue to let page writes occur as quickly as possible.
3562 3860 */
3563 3861 if (curproc == proc_pageout) {
3564 3862 if (page_load > MAX(ptob(minfree), available_memory) / 4)
3565 3863 return (SET_ERROR(ERESTART));
3566 3864 /* Note: reserve is inflated, so we deflate */
3567 3865 page_load += reserve / 8;
3568 3866 return (0);
3569 3867 } else if (page_load > 0 && arc_reclaim_needed()) {
3570 3868 /* memory is low, delay before restarting */
3571 3869 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3572 3870 return (SET_ERROR(EAGAIN));
3573 3871 }
3574 3872 page_load = 0;
3575 3873
3576 3874 if (arc_size > arc_c_min) {
3577 3875 uint64_t evictable_memory =
3578 3876 arc_mru->arcs_lsize[ARC_BUFC_DATA] +
3579 3877 arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
3580 3878 arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
3581 3879 arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
3582 3880 available_memory += MIN(evictable_memory, arc_size - arc_c_min);
3583 3881 }
3584 3882
3585 3883 if (inflight_data > available_memory / 4) {
3586 3884 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3587 3885 return (SET_ERROR(ERESTART));
3588 3886 }
3589 3887 #endif
3590 3888 return (0);
3591 3889 }
3592 3890
3593 3891 void
3594 3892 arc_tempreserve_clear(uint64_t reserve)
3595 3893 {
3596 3894 atomic_add_64(&arc_tempreserve, -reserve);
3597 3895 ASSERT((int64_t)arc_tempreserve >= 0);
3598 3896 }
3599 3897
3600 3898 int
3601 3899 arc_tempreserve_space(uint64_t reserve, uint64_t txg)
3602 3900 {
3603 3901 int error;
3604 3902 uint64_t anon_size;
3605 3903
3606 3904 #ifdef ZFS_DEBUG
3607 3905 /*
3608 3906 * Once in a while, fail for no reason. Everything should cope.
3609 3907 */
3610 3908 if (spa_get_random(10000) == 0) {
3611 3909 dprintf("forcing random failure\n");
3612 3910 return (SET_ERROR(ERESTART));
3613 3911 }
3614 3912 #endif
3615 3913 if (reserve > arc_c/4 && !arc_no_grow)
3616 3914 arc_c = MIN(arc_c_max, reserve * 4);
3617 3915 if (reserve > arc_c)
3618 3916 return (SET_ERROR(ENOMEM));
3619 3917
3620 3918 /*
3621 3919 * Don't count loaned bufs as in flight dirty data to prevent long
3622 3920 * network delays from blocking transactions that are ready to be
3623 3921 * assigned to a txg.
3624 3922 */
3625 3923 anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
3626 3924
3627 3925 /*
3628 3926 * Writes will, almost always, require additional memory allocations
3629 3927 * in order to compress/encrypt/etc the data. We therefore need to
3630 3928 * make sure that there is sufficient available memory for this.
3631 3929 */
3632 3930 if (error = arc_memory_throttle(reserve, anon_size, txg))
3633 3931 return (error);
3634 3932
3635 3933 /*
3636 3934 * Throttle writes when the amount of dirty data in the cache
3637 3935 * gets too large. We try to keep the cache less than half full
3638 3936 * of dirty blocks so that our sync times don't grow too large.
3639 3937 * Note: if two requests come in concurrently, we might let them
3640 3938 * both succeed, when one of them should fail. Not a huge deal.
3641 3939 */
3642 3940
3643 3941 if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
3644 3942 anon_size > arc_c / 4) {
3645 3943 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
3646 3944 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
3647 3945 arc_tempreserve>>10,
3648 3946 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
3649 3947 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
3650 3948 reserve>>10, arc_c>>10);
3651 3949 return (SET_ERROR(ERESTART));
3652 3950 }
3653 3951 atomic_add_64(&arc_tempreserve, reserve);
3654 3952 return (0);
3655 3953 }
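
/*
 * Worked example for the dirty-data throttle above, assuming arc_c is 4 GB
 * and no tunable overrides: arc_tempreserve_space() starts returning ERESTART
 * (forcing callers to retry later) once anonymous dirty data exceeds
 * arc_c/4 = 1 GB and reserve + arc_tempreserve + anon_size together exceed
 * arc_c/2 = 2 GB.
 */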
3656 3954
3657 3955 void
3658 3956 arc_init(void)
3659 3957 {
3660 3958 mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
3661 3959 cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
3662 3960
3663 3961 /* Convert seconds to clock ticks */
3664 3962 arc_min_prefetch_lifespan = 1 * hz;
3665 3963
3666 3964 /* Start out with 1/8 of all memory */
3667 3965 arc_c = physmem * PAGESIZE / 8;
3668 3966
3669 3967 #ifdef _KERNEL
3670 3968 /*
3671 3969 * On architectures where the physical memory can be larger
3672 3970 * than the addressable space (intel in 32-bit mode), we may
3673 3971 * need to limit the cache to 1/8 of VM size.
3674 3972 */
3675 3973 arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
3676 3974 #endif
3677 3975
3678 3976 /* set min cache to 1/32 of all memory, or 64MB, whichever is more */
3679 3977 arc_c_min = MAX(arc_c / 4, 64<<20);
3680 3978 /* set max to 3/4 of all memory, or all but 1GB, whichever is more */
3681 3979 if (arc_c * 8 >= 1<<30)
3682 3980 arc_c_max = (arc_c * 8) - (1<<30);
3683 3981 else
3684 3982 arc_c_max = arc_c_min;
3685 3983 arc_c_max = MAX(arc_c * 6, arc_c_max);
3686 3984
3687 3985 /*
3688 3986 * Allow the tunables to override our calculations if they are
3689 3987 * reasonable (ie. over 64MB)
3690 3988 */
3691 3989 if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
3692 3990 arc_c_max = zfs_arc_max;
3693 3991 if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max)
3694 3992 arc_c_min = zfs_arc_min;
3695 3993
3696 3994 arc_c = arc_c_max;
3697 3995 arc_p = (arc_c >> 1);
3698 3996
3699 3997 /* limit meta-data to 1/4 of the arc capacity */
3700 3998 arc_meta_limit = arc_c_max / 4;
3701 3999
3702 4000 /* Allow the tunable to override if it is reasonable */
3703 4001 if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
3704 4002 arc_meta_limit = zfs_arc_meta_limit;
3705 4003
3706 4004 if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
3707 4005 arc_c_min = arc_meta_limit / 2;
3708 4006
3709 4007 if (zfs_arc_grow_retry > 0)
3710 4008 arc_grow_retry = zfs_arc_grow_retry;
3711 4009
3712 4010 if (zfs_arc_shrink_shift > 0)
3713 4011 arc_shrink_shift = zfs_arc_shrink_shift;
3714 4012
3715 4013 if (zfs_arc_p_min_shift > 0)
3716 4014 arc_p_min_shift = zfs_arc_p_min_shift;
3717 4015
3718 4016 /* if kmem_flags are set, lets try to use less memory */
3719 4017 if (kmem_debugging())
3720 4018 arc_c = arc_c / 2;
3721 4019 if (arc_c < arc_c_min)
3722 4020 arc_c = arc_c_min;
3723 4021
3724 4022 arc_anon = &ARC_anon;
3725 4023 arc_mru = &ARC_mru;
3726 4024 arc_mru_ghost = &ARC_mru_ghost;
3727 4025 arc_mfu = &ARC_mfu;
3728 4026 arc_mfu_ghost = &ARC_mfu_ghost;
3729 4027 arc_l2c_only = &ARC_l2c_only;
3730 4028 arc_size = 0;
3731 4029
3732 4030 mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3733 4031 mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3734 4032 mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3735 4033 mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3736 4034 mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3737 4035 mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3738 4036
3739 4037 list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
3740 4038 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3741 4039 list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
3742 4040 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3743 4041 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
3744 4042 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3745 4043 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
3746 4044 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3747 4045 list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
3748 4046 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3749 4047 list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
3750 4048 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3751 4049 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
3752 4050 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3753 4051 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
3754 4052 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3755 4053 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
3756 4054 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3757 4055 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
3758 4056 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3759 4057
3760 4058 buf_init();
3761 4059
3762 4060 arc_thread_exit = 0;
3763 4061 arc_eviction_list = NULL;
3764 4062 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
3765 4063 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
3766 4064
3767 4065 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
3768 4066 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
3769 4067
3770 4068 if (arc_ksp != NULL) {
3771 4069 arc_ksp->ks_data = &arc_stats;
3772 4070 kstat_install(arc_ksp);
3773 4071 }
3774 4072
3775 4073 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
3776 4074 TS_RUN, minclsyspri);
3777 4075
3778 4076 arc_dead = FALSE;
3779 4077 arc_warm = B_FALSE;
3780 4078
3781 4079 if (zfs_write_limit_max == 0)
3782 4080 zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
3783 4081 else
3784 4082 zfs_write_limit_shift = 0;
3785 4083 mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
3786 4084 }
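
/*
 * Worked example of the sizing above, assuming a 64-bit kernel with 16 GB of
 * physical memory and no tunable overrides: arc_c starts at 2 GB (1/8 of
 * memory), arc_c_max becomes MAX(6 * 2 GB, 16 GB - 1 GB) = 15 GB,
 * arc_meta_limit becomes 15 GB / 4 ~= 3.75 GB, and arc_c_min is raised from
 * 512 MB to arc_meta_limit / 2 ~= 1.9 GB.  arc_c and arc_p then start out at
 * 15 GB and 7.5 GB respectively.
 */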
3787 4085
3788 4086 void
3789 4087 arc_fini(void)
3790 4088 {
3791 4089 mutex_enter(&arc_reclaim_thr_lock);
3792 4090 arc_thread_exit = 1;
3793 4091 while (arc_thread_exit != 0)
3794 4092 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
3795 4093 mutex_exit(&arc_reclaim_thr_lock);
3796 4094
3797 4095 arc_flush(NULL);
3798 4096
3799 4097 arc_dead = TRUE;
3800 4098
3801 4099 if (arc_ksp != NULL) {
3802 4100 kstat_delete(arc_ksp);
3803 4101 arc_ksp = NULL;
3804 4102 }
3805 4103
3806 4104 mutex_destroy(&arc_eviction_mtx);
3807 4105 mutex_destroy(&arc_reclaim_thr_lock);
3808 4106 cv_destroy(&arc_reclaim_thr_cv);
3809 4107
3810 4108 list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
3811 4109 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
3812 4110 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
3813 4111 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
3814 4112 list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
3815 4113 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
3816 4114 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
3817 4115 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
3818 4116
3819 4117 mutex_destroy(&arc_anon->arcs_mtx);
3820 4118 mutex_destroy(&arc_mru->arcs_mtx);
3821 4119 mutex_destroy(&arc_mru_ghost->arcs_mtx);
3822 4120 mutex_destroy(&arc_mfu->arcs_mtx);
3823 4121 mutex_destroy(&arc_mfu_ghost->arcs_mtx);
3824 4122 mutex_destroy(&arc_l2c_only->arcs_mtx);
3825 4123
3826 4124 mutex_destroy(&zfs_write_limit_lock);
3827 4125
3828 4126 buf_fini();
3829 4127
3830 4128 ASSERT(arc_loaned_bytes == 0);
3831 4129 }
3832 4130
3833 4131 /*
3834 4132 * Level 2 ARC
3835 4133 *
3836 4134 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
3837 4135 * It uses dedicated storage devices to hold cached data, which are populated
3838 4136 * using large infrequent writes. The main role of this cache is to boost
3839 4137 * the performance of random read workloads. The intended L2ARC devices
3840 4138 * include short-stroked disks, solid state disks, and other media with
3841 4139 * substantially faster read latency than disk.
3842 4140 *
3843 4141 * +-----------------------+
3844 4142 * | ARC |
3845 4143 * +-----------------------+
3846 4144 * | ^ ^
3847 4145 * | | |
3848 4146 * l2arc_feed_thread() arc_read()
3849 4147 * | | |
3850 4148 * | l2arc read |
3851 4149 * V | |
3852 4150 * +---------------+ |
3853 4151 * | L2ARC | |
3854 4152 * +---------------+ |
3855 4153 * | ^ |
3856 4154 * l2arc_write() | |
3857 4155 * | | |
3858 4156 * V | |
3859 4157 * +-------+ +-------+
3860 4158 * | vdev | | vdev |
3861 4159 * | cache | | cache |
3862 4160 * +-------+ +-------+
3863 4161 * +=========+ .-----.
3864 4162 * : L2ARC : |-_____-|
3865 4163 * : devices : | Disks |
3866 4164 * +=========+ `-_____-'
3867 4165 *
3868 4166 * Read requests are satisfied from the following sources, in order:
3869 4167 *
3870 4168 * 1) ARC
3871 4169 * 2) vdev cache of L2ARC devices
3872 4170 * 3) L2ARC devices
3873 4171 * 4) vdev cache of disks
3874 4172 * 5) disks
3875 4173 *
3876 4174 * Some L2ARC device types exhibit extremely slow write performance.
3877 4175 * To accommodate for this there are some significant differences between
3878 4176 * the L2ARC and traditional cache design:
3879 4177 *
3880 4178 * 1. There is no eviction path from the ARC to the L2ARC. Evictions from
3881 4179 * the ARC behave as usual, freeing buffers and placing headers on ghost
3882 4180 * lists. The ARC does not send buffers to the L2ARC during eviction as
3883 4181 * this would add inflated write latencies for all ARC memory pressure.
3884 4182 *
3885 4183 * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
3886 4184 * It does this by periodically scanning buffers from the eviction-end of
3887 4185 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
3888 4186 * not already there. It scans until a headroom of buffers is satisfied,
3889 4187 * which itself is a buffer for ARC eviction. If a compressible buffer is
3890 4188 * found during scanning and selected for writing to an L2ARC device, we
3891 4189 * temporarily boost scanning headroom during the next scan cycle to make
3892 4190 * sure we adapt to compression effects (which might significantly reduce
3893 4191 * the data volume we write to L2ARC). The thread that does this is
3894 4192 * l2arc_feed_thread(), illustrated below; example sizes are included to
3895 4193 * provide a better sense of ratio than this diagram:
3896 4194 *
3897 4195 * head --> tail
3898 4196 * +---------------------+----------+
3899 4197 * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC
3900 4198 * +---------------------+----------+ | o L2ARC eligible
3901 4199 * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer
3902 4200 * +---------------------+----------+ |
3903 4201 * 15.9 Gbytes ^ 32 Mbytes |
3904 4202 * headroom |
3905 4203 * l2arc_feed_thread()
3906 4204 * |
3907 4205 * l2arc write hand <--[oooo]--'
3908 4206 * | 8 Mbyte
3909 4207 * | write max
3910 4208 * V
3911 4209 * +==============================+
3912 4210 * L2ARC dev |####|#|###|###| |####| ... |
3913 4211 * +==============================+
3914 4212 * 32 Gbytes
3915 4213 *
3916 4214 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
3917 4215 * evicted, then the L2ARC has cached a buffer much sooner than it probably
3918 4216 * needed to, potentially wasting L2ARC device bandwidth and storage. It is
3919 4217 * safe to say that this is an uncommon case, since buffers at the end of
3920 4218 * the ARC lists have moved there due to inactivity.
3921 4219 *
3922 4220 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
3923 4221 * then the L2ARC simply misses copying some buffers. This serves as a
3924 4222 * pressure valve to prevent heavy read workloads from both stalling the ARC
3925 4223 * with waits and clogging the L2ARC with writes. This also helps prevent
3926 4224 * the potential for the L2ARC to churn if it attempts to cache content too
3927 4225 * quickly, such as during backups of the entire pool.
3928 4226 *
3929 4227 * 5. After system boot and before the ARC has filled main memory, there are
3930 4228 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
3931 4229 * lists can remain mostly static. Instead of searching from tail of these
3932 4230 * lists as pictured, the l2arc_feed_thread() will search from the list heads
3933 4231 * for eligible buffers, greatly increasing its chance of finding them.
3934 4232 *
3935 4233 * The L2ARC device write speed is also boosted during this time so that
3936 4234 * the L2ARC warms up faster. Since there have been no ARC evictions yet,
3937 4235 * there are no L2ARC reads, and no fear of degrading read performance
3938 4236 * through increased writes.
3939 4237 *
3940 4238 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
3941 4239 * the vdev queue can aggregate them into larger and fewer writes. Each
3942 4240 * device is written to in a rotor fashion, sweeping writes through
3943 4241 * available space then repeating.
3944 4242 *
3945 4243 * 7. The L2ARC does not store dirty content. It never needs to flush
3946 4244 * write buffers back to disk based storage.
3947 4245 *
3948 4246 * 8. If an ARC buffer is written (and dirtied) which also exists in the
3949 4247 * L2ARC, the now stale L2ARC buffer is immediately dropped.
3950 4248 *
3951 4249 * The performance of the L2ARC can be tweaked by a number of tunables, which
3952 4250 * may be necessary for different workloads:
3953 4251 *
3954 4252 * l2arc_write_max max write bytes per interval
3955 4253 * l2arc_write_boost extra write bytes during device warmup
3956 4254 * l2arc_noprefetch skip caching prefetched buffers
3957 4255 * l2arc_headroom number of max device writes to precache
3958 4256 * l2arc_headroom_boost when we find compressed buffers during ARC
3959 4257 * scanning, we multiply headroom by this
3960 4258 * percentage factor for the next scan cycle,
3961 4259 * since more compressed buffers are likely to
3962 4260 * be present
3963 4261 * l2arc_feed_secs seconds between L2ARC writing
3964 4262 *
3965 4263 * Tunables may be removed or added as future performance improvements are
3966 4264 * integrated, and also may become zpool properties.
3967 4265 *
3968 4266 * There are three key functions that control how the L2ARC warms up:
3969 4267 *
3970 4268 * l2arc_write_eligible() check if a buffer is eligible to cache
3971 4269 * l2arc_write_size() calculate how much to write
3972 4270 * l2arc_write_interval() calculate sleep delay between writes
3973 4271 *
3974 4272 * These three functions determine what to write, how much, and how quickly
3975 4273 * to send writes.
4274 + *
4275 + * L2ARC persistency:
4276 + *
4277 + * When writing buffers to L2ARC, we periodically add some metadata to
4278 + * make sure we can pick them up after reboot, thus dramatically reducing
4279 + * the impact that any downtime has on the performance of storage systems
4280 + * with large caches.
4281 + *
4282 + * The implementation works fairly simply by integrating the following two
4283 + * modifications:
4284 + *
4285 + *  *) Every now and then, at the end of an L2ARC feed cycle, we append a piece
4286 + * of metadata (called a "pbuf", or "persistency buffer") to the L2ARC
4287 + *     write. This allows us to understand what's been written, so that
4288 + * we can rebuild the arc_buf_hdr_t structures of the main ARC buffers.
4289 + * The pbuf also includes a "back-reference" pointer to the previous
4290 + * pbuf, forming a linked list of pbufs on the L2ARC device.
4291 + *
4292 + * *) We reserve 4k of space at the start of each L2ARC device for our
4293 + * header bookkeeping purposes. This contains a single 4k uberblock, which
4294 + * contains our top-level reference structures. We update it on each pbuf
4295 + * write. If this write results in an inconsistent uberblock (e.g. due to
4296 + * power failure), we detect this by verifying the uberblock's checksum
4297 + * and simply drop the entries from L2ARC. Once an L2ARC pbuf update
4298 + * completes, we update the uberblock to point to it.
4299 + *
4300 + * Implementation diagram:
4301 + *
4302 + * +=== L2ARC device (not to scale) ======================================+
4303 + * | ____________newest pbuf pointer_____________ |
4304 + * | / \ |
4305 + * | / V |
4306 + * ||l2uberblock|---|bufs|pbuf|bufs|pbuf|bufs|pbuf|bufs|pbuf|---(empty)---|
4307 + * | ^ / ^ / ^ / |
4308 + * | `-prev-' `-prev-' `-prev-' |
4309 + * | pbuf pbuf pbuf |
4310 + * +======================================================================+
4311 + *
4312 + * On-device data structures:
4313 + *
4314 + * (L2ARC persistent uberblock)
4315 + * struct l2uberblock {
4316 + * (these fields are in network byte order)
4317 + * uint32_t magic = 0x12bab10c; l2-ber-block
4318 + * uint8_t version = 0x1;
4319 + * uint8_t reserved = 0x0;
4320 + * uint16_t ublk_flags; see l2uberblock_flags_t
4321 + *
4322 + * (byte order of fields below determined by `ublk_flags')
4323 + * uint64_t spa_guid; what pool this l2arc dev belongs to
4324 + * uint64_t birth_txg; ublk with highest birth_txg is newest
4325 + * uint64_t evict_tail; current evict pointer on l2arc dev
4326 + * uint64_t alloc_space; how much space is alloc'd on the dev
4327 + * uint64_t pbuf_daddr; dev addr of the newest l2pbuf_t
4328 + * uint32_t pbuf_asize; size of newest pbuf
4329 + * uint64_t pbuf_cksum[4]; fletcher4 of newest pbuf
4330 + *
4331 + * uint8_t reserved[3996] = {0x0, 0x0, ... 0x0};
4332 + *
4333 + * uint64_t ublk_cksum[4] = fletcher4(of the 4064 bytes above);
4334 + * } l2dev_uberblock;
4335 + *
4336 + * (L2ARC persistent buffer list)
4337 + * typedef struct l2pbuf_t {
4338 + * (these fields are in network byte order)
4339 + * uint32_t magic = 0xdb0faba6; the-buffer-bag
4340 + * uint8_t version = 0x1;
4341 + * uint8_t reserved = 0x0;
4342 + * uint16_t pbuf_flags; see l2pbuf_flags_t
4343 + *
4344 + * (byte order of fields below determined by `pbuf_flags')
4345 + * uint64_t prev_pbuf_daddr; previous pbuf dev addr
4346 + * uint32_t prev_pbuf_asize; previous pbuf size
4347 + * uint64_t prev_pbuf_cksum[4]; fletcher4(of previous pbuf)
4348 + *
4349 + * uint32_t items_size; uncompressed size of `items' below
4350 + * (if (pbuf_flags & compress) decompress `items' prior to decoding)
4351 + * struct l2pbuf_buf_item {
4352 + * (these fields mirror [l2]arc_buf_hdr fields)
4353 + * uint64_t dva[2]; buffer's DVA
4354 + * uint64_t birth; buffer's birth TXG in ARC
4355 + * uint64_t cksum0; lower 64-bits of buffer's cksum
4356 + * uint64_t freeze_cksum[4]; buffer's freeze cksum
4357 + * uint32_t size; uncompressed buffer data size
4358 + * uint64_t l2daddr; device address (offset) of buf
4359 + * uint32_t l2asize; actual space occupied by buf
4360 + * uint8_t compress; compress algo used on data
4361 + * uint8_t contents_type; buffer's contents type
4362 + * uint16_t reserved = 0x0; for alignment and future use
4363 + * uint32_t flags; buffer's persistent flags
4364 + * } items[]; continues for remainder of pbuf
4365 + * } l2pbuf_t;
4366 + *
4367 + * L2ARC reconstruction:
4368 + *
4369 + * When writing data, we simply write in the standard rotary fashion,
4370 + * evicting buffers as we go and writing new data over them (appending
4371 + * an updated l2pbuf_t every now and then). This obviously means that once we
4372 + * loop around the end of the device, we will start cutting into an already
4373 + * committed l2pbuf (and its referenced data buffers), like so:
4374 + *
4375 + * current write head__ __old tail
4376 + * \ /
4377 + * V V
4378 + * <--|bufs|pbuf|bufs|pbuf| |bufs|pbuf|bufs|pbuf|-->
4379 + * ^ ^^^^^^^^^_____________________________
4380 + * | \
4381 + * <<nextwrite>> - will overwrite this pbuf --/
4382 + *
4383 + * When importing the pool, we detect this situation and use it to stop
4384 + * our scanning process:
4385 + * 1) Let `this_pbuf' refer to the current l2pbuf_t and `prev_pbuf' to the
4386 + * previous one.
4387 + * 2) if (fletcher4(prev_pbuf) != this_pbuf->prev_pbuf_cksum)
4388 + *    then prev_pbuf is invalid, so stop scanning (go to step 3 below).
4389 + * 3) if (this is the last valid pbuf)
4390 + * discard this pbuf as well (its ARC bufs may have been damaged by a
4391 + * partial overwrite).
4392 + * (We could potentially salvage the remaining good arc bufs above in step 3,
4393 + * but the cost of doing so probably outweighs the value of the entire pbuf).
4394 + *
4395 + * There is one significant caveat to consider when rebuilding ARC contents
4396 + * from an L2ARC device: what about invalidated buffers? Given the above
4397 + * construction, we cannot update pbufs which we've already written to amend
4398 + * them to remove buffers which were invalidated. Thus, during reconstruction,
4399 + * we might be populating the cache with buffers for data that's not on the
4400 + * main pool anymore, or may have been overwritten!
4401 + *
4402 + * As it turns out, this isn't a problem. Every arc_read request includes
4403 + * both the DVA and, crucially, the birth TXG of the BP the caller is
4404 + * looking for. So even if the cache were populated by completely rotten
4405 + * blocks for data that had been long deleted and/or overwritten, we'll
4406 + * never actually return bad data from the cache, since the DVA with the
4407 + * birth TXG uniquely identifies a block in space and time - once created,
4408 + * a block is immutable on disk. The worst we will have done is waste
4409 + * some time and memory at l2arc rebuild reconstructing outdated ARC
4410 + * entries that will get dropped from the l2arc as it is being updated
4411 + * with new blocks.
3976 4412 */
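
/*
 * Illustrative sketch of the uberblock self-check described above: the final
 * 32 bytes of the 4k header hold a fletcher-4 checksum of the preceding 4064
 * bytes, and an uberblock that fails this check is simply ignored.  The
 * sketch_/SKETCH_ names are hypothetical and byte-order handling of the
 * individual fields is omitted.
 */
#define	SKETCH_UBLK_SIZE	4096
#define	SKETCH_UBLK_CKSUM_OFF	(SKETCH_UBLK_SIZE - sizeof (zio_cksum_t))

static boolean_t
sketch_l2uberblock_valid(const uint8_t *ublk)
{
	zio_cksum_t expected, actual;

	/* trailing checksum as stored on the device */
	bcopy(ublk + SKETCH_UBLK_CKSUM_OFF, &expected, sizeof (expected));
	/* checksum of everything that precedes it */
	fletcher_4_native(ublk, SKETCH_UBLK_CKSUM_OFF, &actual);
	return (ZIO_CHECKSUM_EQUAL(expected, actual) ? B_TRUE : B_FALSE);
}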
3977 4413
3978 4414 static boolean_t
3979 4415 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
3980 4416 {
3981 4417 /*
3982 4418 * A buffer is *not* eligible for the L2ARC if it:
3983 4419 * 1. belongs to a different spa.
3984 4420 * 2. is already cached on the L2ARC.
3985 4421 * 3. has an I/O in progress (it may be an incomplete read).
3986 4422 * 4. is flagged not eligible (zfs property).
3987 4423 */
3988 4424 if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL ||
3989 4425 HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab))
3990 4426 return (B_FALSE);
3991 4427
3992 4428 return (B_TRUE);
3993 4429 }
3994 4430
3995 4431 static uint64_t
3996 4432 l2arc_write_size(void)
3997 4433 {
3998 4434 uint64_t size;
3999 4435
4000 4436 /*
4001 4437 * Make sure our globals have meaningful values in case the user
4002 4438 * altered them.
4003 4439 */
4004 4440 size = l2arc_write_max;
4005 4441 if (size == 0) {
4006 4442 cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
4007 4443 "be greater than zero, resetting it to the default (%d)",
4008 4444 L2ARC_WRITE_SIZE);
4009 4445 size = l2arc_write_max = L2ARC_WRITE_SIZE;
4010 4446 }
4011 4447
4012 4448 if (arc_warm == B_FALSE)
4013 4449 size += l2arc_write_boost;
4014 4450
4015 4451 return (size);
4016 4452
4017 4453 }
4018 4454
4019 4455 static clock_t
4020 4456 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
4021 4457 {
4022 4458 clock_t interval, next, now;
4023 4459
4024 4460 /*
4025 4461 * If the ARC lists are busy, increase our write rate; if the
4026 4462 * lists are stale, idle back. This is achieved by checking
4027 4463 * how much we previously wrote - if it was more than half of
4028 4464 * what we wanted, schedule the next write much sooner.
4029 4465 */
4030 4466 if (l2arc_feed_again && wrote > (wanted / 2))
4031 4467 interval = (hz * l2arc_feed_min_ms) / 1000;
4032 4468 else
4033 4469 interval = hz * l2arc_feed_secs;
4034 4470
4035 4471 now = ddi_get_lbolt();
4036 4472 next = MAX(now, MIN(now + interval, began + interval));
4037 4473
4038 4474 return (next);
4039 4475 }
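
/*
 * Worked example, assuming the default tunables (l2arc_feed_secs = 1,
 * l2arc_feed_min_ms = 200, l2arc_feed_again enabled): if the previous cycle
 * wrote more than half of its target, the next feed is scheduled roughly
 * 200ms after the previous cycle began; otherwise it waits the full second.
 * The MAX(now, ...) clamp guarantees the next feed is never in the past.
 */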
4040 4476
4041 4477 static void
4042 -l2arc_hdr_stat_add(void)
4478 +l2arc_hdr_stat_add(boolean_t from_arc)
4043 4479 {
4044 4480 ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
4045 - ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4481 + if (from_arc)
4482 + ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4046 4483 }
4047 4484
4048 4485 static void
4049 4486 l2arc_hdr_stat_remove(void)
4050 4487 {
4051 4488 ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
4052 4489 ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
4053 4490 }
4054 4491
4055 4492 /*
4056 4493 * Cycle through L2ARC devices. This is how L2ARC load balances.
4057 4494 * If a device is returned, this also returns holding the spa config lock.
4058 4495 */
4059 4496 static l2arc_dev_t *
4060 4497 l2arc_dev_get_next(void)
4061 4498 {
4062 4499 l2arc_dev_t *first, *next = NULL;
4063 4500
4064 4501 /*
4065 4502 * Lock out the removal of spas (spa_namespace_lock), then removal
4066 4503 * of cache devices (l2arc_dev_mtx). Once a device has been selected,
4067 4504 * both locks will be dropped and a spa config lock held instead.
4068 4505 */
4069 4506 mutex_enter(&spa_namespace_lock);
4070 4507 mutex_enter(&l2arc_dev_mtx);
4071 4508
4072 4509 /* if there are no vdevs, there is nothing to do */
4073 4510 if (l2arc_ndev == 0)
4074 4511 goto out;
4075 4512
4076 4513 first = NULL;
4077 4514 next = l2arc_dev_last;
4078 4515 do {
4079 - /* loop around the list looking for a non-faulted vdev */
4516 + /*
4517 + * Loop around the list looking for a non-faulted vdev
4518 + * and one that isn't currently doing an L2ARC rebuild.
4519 + */
4080 4520 if (next == NULL) {
4081 4521 next = list_head(l2arc_dev_list);
4082 4522 } else {
4083 4523 next = list_next(l2arc_dev_list, next);
4084 4524 if (next == NULL)
4085 4525 next = list_head(l2arc_dev_list);
4086 4526 }
4087 4527
4088 4528 /* if we have come back to the start, bail out */
4089 4529 if (first == NULL)
4090 4530 first = next;
4091 4531 else if (next == first)
4092 4532 break;
4093 4533
4094 - } while (vdev_is_dead(next->l2ad_vdev));
4534 + } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuilding);
4095 4535
4096 4536 /* if we were unable to find any usable vdevs, return NULL */
4097 - if (vdev_is_dead(next->l2ad_vdev))
4537 + if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuilding)
4098 4538 next = NULL;
4099 4539
4100 4540 l2arc_dev_last = next;
4101 4541
4102 4542 out:
4103 4543 mutex_exit(&l2arc_dev_mtx);
4104 4544
4105 4545 /*
4106 4546 * Grab the config lock to prevent the 'next' device from being
4107 4547 * removed while we are writing to it.
4108 4548 */
4109 4549 if (next != NULL)
4110 4550 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4111 4551 mutex_exit(&spa_namespace_lock);
4112 4552
4113 4553 return (next);
4114 4554 }
4115 4555
4116 4556 /*
4117 4557 * Free buffers that were tagged for destruction.
4118 4558 */
4119 4559 static void
4120 4560 l2arc_do_free_on_write()
4121 4561 {
4122 4562 list_t *buflist;
4123 4563 l2arc_data_free_t *df, *df_prev;
4124 4564
4125 4565 mutex_enter(&l2arc_free_on_write_mtx);
4126 4566 buflist = l2arc_free_on_write;
4127 4567
4128 4568 for (df = list_tail(buflist); df; df = df_prev) {
4129 4569 df_prev = list_prev(buflist, df);
4130 4570 ASSERT(df->l2df_data != NULL);
4131 4571 ASSERT(df->l2df_func != NULL);
4132 4572 df->l2df_func(df->l2df_data, df->l2df_size);
4133 4573 list_remove(buflist, df);
4134 4574 kmem_free(df, sizeof (l2arc_data_free_t));
4135 4575 }
4136 4576
4137 4577 mutex_exit(&l2arc_free_on_write_mtx);
4138 4578 }
4139 4579
4140 4580 /*
4141 4581 * A write to a cache device has completed. Update all headers to allow
4142 4582 * reads from these buffers to begin.
4143 4583 */
4144 4584 static void
4145 4585 l2arc_write_done(zio_t *zio)
4146 4586 {
4147 4587 l2arc_write_callback_t *cb;
4148 4588 l2arc_dev_t *dev;
4149 4589 list_t *buflist;
4150 4590 arc_buf_hdr_t *head, *ab, *ab_prev;
4151 4591 l2arc_buf_hdr_t *abl2;
4152 4592 kmutex_t *hash_lock;
4153 4593
4154 4594 cb = zio->io_private;
4155 4595 ASSERT(cb != NULL);
4156 4596 dev = cb->l2wcb_dev;
4157 4597 ASSERT(dev != NULL);
4158 4598 head = cb->l2wcb_head;
4159 4599 ASSERT(head != NULL);
4160 4600 buflist = dev->l2ad_buflist;
4161 4601 ASSERT(buflist != NULL);
4162 4602 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4163 4603 l2arc_write_callback_t *, cb);
4164 4604
4165 4605 if (zio->io_error != 0)
4166 4606 ARCSTAT_BUMP(arcstat_l2_writes_error);
4167 4607
4168 4608 mutex_enter(&l2arc_buflist_mtx);
4169 4609
4170 4610 /*
4171 4611 * All writes completed, or an error was hit.
4172 4612 */
4173 4613 for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4174 4614 ab_prev = list_prev(buflist, ab);
4615 + abl2 = ab->b_l2hdr;
4175 4616
4617 + /*
4618 + * Release the temporary compressed buffer as soon as possible.
4619 + */
4620 + if (abl2->b_compress != ZIO_COMPRESS_OFF)
4621 + l2arc_release_cdata_buf(ab);
4622 +
4176 4623 hash_lock = HDR_LOCK(ab);
4177 4624 if (!mutex_tryenter(hash_lock)) {
4178 4625 /*
4179 4626 * This buffer misses out. It may be in a stage
4180 4627 * of eviction. Its ARC_L2_WRITING flag will be
4181 4628 * left set, denying reads to this buffer.
4182 4629 */
4183 4630 ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4184 4631 continue;
4185 4632 }
4186 4633
4187 - abl2 = ab->b_l2hdr;
4188 -
4189 - /*
4190 - * Release the temporary compressed buffer as soon as possible.
4191 - */
4192 - if (abl2->b_compress != ZIO_COMPRESS_OFF)
4193 - l2arc_release_cdata_buf(ab);
4194 -
4195 4634 if (zio->io_error != 0) {
4196 4635 /*
4197 4636 * Error - drop L2ARC entry.
4198 4637 */
4199 4638 list_remove(buflist, ab);
4200 4639 ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4201 4640 ab->b_l2hdr = NULL;
4202 4641 kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4203 4642 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4204 4643 }
4205 4644
4206 4645 /*
4207 4646 * Allow ARC to begin reads to this L2ARC entry.
4208 4647 */
4209 4648 ab->b_flags &= ~ARC_L2_WRITING;
4210 4649
4211 4650 mutex_exit(hash_lock);
4212 4651 }
4213 4652
4214 4653 atomic_inc_64(&l2arc_writes_done);
4215 4654 list_remove(buflist, head);
4216 4655 kmem_cache_free(hdr_cache, head);
4217 4656 mutex_exit(&l2arc_buflist_mtx);
4218 4657
4219 4658 l2arc_do_free_on_write();
4220 4659
4660 + if (cb->l2wcb_pbuf)
4661 + kmem_free(cb->l2wcb_pbuf, cb->l2wcb_pbuf_size);
4662 + if (cb->l2wcb_ub_buf)
4663 + kmem_free(cb->l2wcb_ub_buf, L2UBERBLOCK_SIZE);
4221 4664 kmem_free(cb, sizeof (l2arc_write_callback_t));
4222 4665 }
4223 4666
4224 4667 /*
4225 4668 * A read to a cache device completed. Validate buffer contents before
4226 4669 * handing over to the regular ARC routines.
4227 4670 */
4228 4671 static void
4229 4672 l2arc_read_done(zio_t *zio)
4230 4673 {
4231 4674 l2arc_read_callback_t *cb;
4232 4675 arc_buf_hdr_t *hdr;
4233 4676 arc_buf_t *buf;
4234 4677 kmutex_t *hash_lock;
4235 4678 int equal;
4236 4679
4237 4680 ASSERT(zio->io_vd != NULL);
4238 4681 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4239 4682
4240 4683 spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
4241 4684
4242 4685 cb = zio->io_private;
4243 4686 ASSERT(cb != NULL);
4244 4687 buf = cb->l2rcb_buf;
4245 4688 ASSERT(buf != NULL);
4246 4689
4247 4690 hash_lock = HDR_LOCK(buf->b_hdr);
4248 4691 mutex_enter(hash_lock);
4249 4692 hdr = buf->b_hdr;
4250 4693 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4251 4694
4252 4695 /*
4253 4696 * If the buffer was compressed, decompress it first.
4254 4697 */
4255 4698 if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
4256 4699 l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
4257 4700 ASSERT(zio->io_data != NULL);
4258 4701
4259 4702 /*
4260 4703 * Check this survived the L2ARC journey.
4261 4704 */
4262 4705 equal = arc_cksum_equal(buf);
4263 4706 if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
4264 4707 mutex_exit(hash_lock);
4265 4708 zio->io_private = buf;
4266 4709 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
4267 4710 zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */
4268 4711 arc_read_done(zio);
4269 4712 } else {
4270 4713 mutex_exit(hash_lock);
4271 4714 /*
4272 4715 * Buffer didn't survive caching. Increment stats and
4273 4716 * reissue to the original storage device.
4274 4717 */
4275 4718 if (zio->io_error != 0) {
4276 4719 ARCSTAT_BUMP(arcstat_l2_io_error);
4277 4720 } else {
4278 4721 zio->io_error = SET_ERROR(EIO);
4279 4722 }
4280 4723 if (!equal)
4281 4724 ARCSTAT_BUMP(arcstat_l2_cksum_bad);
4282 4725
4283 4726 /*
4284 4727 * If there's no waiter, issue an async i/o to the primary
4285 4728 * storage now. If there *is* a waiter, the caller must
4286 4729 * issue the i/o in a context where it's OK to block.
4287 4730 */
4288 4731 if (zio->io_waiter == NULL) {
4289 4732 zio_t *pio = zio_unique_parent(zio);
4290 4733
4291 4734 ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
4292 4735
4293 4736 zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
4294 4737 buf->b_data, zio->io_size, arc_read_done, buf,
4295 4738 zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
4296 4739 }
4297 4740 }
4298 4741
4299 4742 kmem_free(cb, sizeof (l2arc_read_callback_t));
4300 4743 }
4301 4744
4302 4745 /*
4303 4746 * This is the list priority from which the L2ARC will search for pages to
4304 4747 * cache. This is used within loops (0..3) to cycle through lists in the
4305 4748 * desired order. This order can have a significant effect on cache
4306 4749 * performance.
4307 4750 *
4308 4751 * Currently the metadata lists are hit first, MFU then MRU, followed by
4309 4752 * the data lists. This function returns a locked list, and also returns
4310 4753 * the lock pointer.
4311 4754 */
4312 4755 static list_t *
4313 4756 l2arc_list_locked(int list_num, kmutex_t **lock)
4314 4757 {
4315 4758 list_t *list = NULL;
4316 4759
4317 4760 ASSERT(list_num >= 0 && list_num <= 3);
4318 4761
4319 4762 switch (list_num) {
4320 4763 case 0:
4321 4764 list = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
4322 4765 *lock = &arc_mfu->arcs_mtx;
4323 4766 break;
4324 4767 case 1:
4325 4768 list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
4326 4769 *lock = &arc_mru->arcs_mtx;
4327 4770 break;
4328 4771 case 2:
4329 4772 list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
4330 4773 *lock = &arc_mfu->arcs_mtx;
4331 4774 break;
4332 4775 case 3:
4333 4776 list = &arc_mru->arcs_list[ARC_BUFC_DATA];
4334 4777 *lock = &arc_mru->arcs_mtx;
4335 4778 break;
4336 4779 }
4337 4780
4338 4781 ASSERT(!(MUTEX_HELD(*lock)));
4339 4782 mutex_enter(*lock);
4340 4783 return (list);
4341 4784 }
4342 4785
4343 4786 /*
4344 4787 * Evict buffers from the device write hand to the distance specified in
4345 4788  * bytes. This distance may span populated buffers, or it may span nothing.
4346 4789  * This clears a region on the L2ARC device, making it ready for writing.
4347 4790 * If the 'all' boolean is set, every buffer is evicted.
4348 4791 */
4349 4792 static void
4350 4793 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4351 4794 {
4352 4795 list_t *buflist;
4353 4796 l2arc_buf_hdr_t *abl2;
4354 4797 arc_buf_hdr_t *ab, *ab_prev;
4355 4798 kmutex_t *hash_lock;
4356 4799 uint64_t taddr;
4357 4800
4358 4801 buflist = dev->l2ad_buflist;
4359 4802
4360 4803 if (buflist == NULL)
4361 4804 return;
4362 4805
4363 4806 if (!all && dev->l2ad_first) {
4364 4807 /*
4365 4808 * This is the first sweep through the device. There is
4366 4809 * nothing to evict.
4367 4810 */
4368 4811 return;
4369 4812 }
4370 4813
4371 4814 if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4372 4815 /*
4373 4816 * When nearing the end of the device, evict to the end
4374 4817 * before the device write hand jumps to the start.
4375 4818 */
4376 4819 taddr = dev->l2ad_end;
4377 4820 } else {
4378 4821 taddr = dev->l2ad_hand + distance;
4379 4822 }
4380 4823 DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4381 4824 uint64_t, taddr, boolean_t, all);
4382 4825
4383 4826 top:
4384 4827 mutex_enter(&l2arc_buflist_mtx);
4385 4828 for (ab = list_tail(buflist); ab; ab = ab_prev) {
4386 4829 ab_prev = list_prev(buflist, ab);
4387 4830
4388 4831 hash_lock = HDR_LOCK(ab);
4389 4832 if (!mutex_tryenter(hash_lock)) {
4390 4833 /*
4391 4834 * Missed the hash lock. Retry.
4392 4835 */
4393 4836 ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
4394 4837 mutex_exit(&l2arc_buflist_mtx);
4395 4838 mutex_enter(hash_lock);
4396 4839 mutex_exit(hash_lock);
4397 4840 goto top;
4398 4841 }
4399 4842
4400 4843 if (HDR_L2_WRITE_HEAD(ab)) {
4401 4844 /*
4402 4845 * We hit a write head node. Leave it for
4403 4846 * l2arc_write_done().
4404 4847 */
4405 4848 list_remove(buflist, ab);
4406 4849 mutex_exit(hash_lock);
4407 4850 continue;
4408 4851 }
4409 4852
4410 4853 if (!all && ab->b_l2hdr != NULL &&
4411 4854 (ab->b_l2hdr->b_daddr > taddr ||
4412 4855 ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
4413 4856 /*
4414 4857 * We've evicted to the target address,
4415 4858 * or the end of the device.
4416 4859 */
4417 4860 mutex_exit(hash_lock);
4418 4861 break;
4419 4862 }
4420 4863
4421 4864 if (HDR_FREE_IN_PROGRESS(ab)) {
4422 4865 /*
4423 4866 * Already on the path to destruction.
4424 4867 */
4425 4868 mutex_exit(hash_lock);
4426 4869 continue;
4427 4870 }
4428 4871
4429 4872 if (ab->b_state == arc_l2c_only) {
4430 4873 ASSERT(!HDR_L2_READING(ab));
4431 4874 /*
4432 4875 * This doesn't exist in the ARC. Destroy.
4433 4876 * arc_hdr_destroy() will call list_remove()
4434 4877 * and decrement arcstat_l2_size.
4435 4878 */
4436 4879 arc_change_state(arc_anon, ab, hash_lock);
4437 4880 arc_hdr_destroy(ab);
4438 4881 } else {
4439 4882 /*
4440 4883 * Invalidate issued or about to be issued
4441 4884 * reads, since we may be about to write
4442 4885 * over this location.
4443 4886 */
4444 4887 if (HDR_L2_READING(ab)) {
4445 4888 ARCSTAT_BUMP(arcstat_l2_evict_reading);
4446 4889 ab->b_flags |= ARC_L2_EVICTED;
4447 4890 }
4448 4891
4449 4892 /*
4450 4893 * Tell ARC this no longer exists in L2ARC.
4451 4894 */
4452 4895 if (ab->b_l2hdr != NULL) {
4453 4896 abl2 = ab->b_l2hdr;
4454 4897 ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4455 4898 ab->b_l2hdr = NULL;
4456 4899 kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4457 4900 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4458 4901 }
4459 4902 list_remove(buflist, ab);
4460 4903
4461 4904 /*
4462 4905 * This may have been leftover after a
4463 4906 * failed write.
4464 4907 */
4465 4908 ab->b_flags &= ~ARC_L2_WRITING;
4466 4909 }
4467 4910 mutex_exit(hash_lock);
4468 4911 }
4469 4912 mutex_exit(&l2arc_buflist_mtx);
4470 4913
4471 4914 vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
4472 4915 dev->l2ad_evict = taddr;
4473 4916 }
4474 4917
4475 4918 /*
4476 4919 * Find and write ARC buffers to the L2ARC device.
4477 4920 *
4478 4921 * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
4479 4922 * for reading until they have completed writing.
4480 4923 * The headroom_boost is an in-out parameter used to maintain headroom boost
4481 4924 * state between calls to this function.
4482 4925 *
4483 4926 * Returns the number of bytes actually written (which may be smaller than
4484 4927 * the delta by which the device hand has changed due to alignment).
4485 4928 */
4486 4929 static uint64_t
4487 4930 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
4488 4931 boolean_t *headroom_boost)
4489 4932 {
4490 4933 arc_buf_hdr_t *ab, *ab_prev, *head;
4491 4934 list_t *list;
4492 4935 uint64_t write_asize, write_psize, write_sz, headroom,
4493 4936 buf_compress_minsz;
4494 4937 void *buf_data;
4495 4938 kmutex_t *list_lock;
4496 4939 boolean_t full;
4497 4940 l2arc_write_callback_t *cb;
4498 4941 zio_t *pio, *wzio;
4499 4942 uint64_t guid = spa_load_guid(spa);
4500 4943 const boolean_t do_headroom_boost = *headroom_boost;
4501 4944
4945 + /* persistency-related */
4946 + l2pbuf_t *pb;
4947 + l2pbuf_buflist_t *pb_buflist;
4948 + int num_bufs, buf_index;
4949 +
4502 4950 ASSERT(dev->l2ad_vdev != NULL);
4503 4951
4504 4952 /* Lower the flag now, we might want to raise it again later. */
4505 4953 *headroom_boost = B_FALSE;
4506 4954
4507 4955 pio = NULL;
4956 + cb = NULL;
4508 4957 write_sz = write_asize = write_psize = 0;
4509 4958 full = B_FALSE;
4510 4959 head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4511 4960 head->b_flags |= ARC_L2_WRITE_HEAD;
4512 4961
4513 4962 /*
4514 4963 * We will want to try to compress buffers that are at least 2x the
4515 4964 * device sector size.
4516 4965 */
4517 4966 buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
4518 4967
4968 + pb = &dev->l2ad_pbuf;
4969 + num_bufs = 0;
4970 +
4519 4971 /*
4520 4978 * Copy buffers for L2ARC writing.
4521 4979 */
4522 4980 mutex_enter(&l2arc_buflist_mtx);
4523 4981 for (int try = 0; try <= 3; try++) {
4524 4982 uint64_t passed_sz = 0;
4525 4983
4526 4984 list = l2arc_list_locked(try, &list_lock);
4527 4985
4528 4986 /*
4529 4987 * L2ARC fast warmup.
4530 4988 *
4531 4989 * Until the ARC is warm and starts to evict, read from the
4532 4990 * head of the ARC lists rather than the tail.
4533 4991 */
4534 4992 if (arc_warm == B_FALSE)
4535 4993 ab = list_head(list);
4536 4994 else
4537 4995 ab = list_tail(list);
4538 4996
4539 4997 headroom = target_sz * l2arc_headroom;
4540 4998 if (do_headroom_boost)
4541 4999 headroom = (headroom * l2arc_headroom_boost) / 100;
4542 5000
4543 5001 for (; ab; ab = ab_prev) {
4544 5002 l2arc_buf_hdr_t *l2hdr;
4545 5003 kmutex_t *hash_lock;
4546 5004 uint64_t buf_sz;
4547 5005
4548 5006 if (arc_warm == B_FALSE)
4549 5007 ab_prev = list_next(list, ab);
4550 5008 else
4551 5009 ab_prev = list_prev(list, ab);
4552 5010
4553 5011 hash_lock = HDR_LOCK(ab);
4554 5012 if (!mutex_tryenter(hash_lock)) {
4555 5013 /*
4556 5014 * Skip this buffer rather than waiting.
4557 5015 */
4558 5016 continue;
4559 5017 }
4560 5018
4561 5019 passed_sz += ab->b_size;
4562 5020 if (passed_sz > headroom) {
4563 5021 /*
4564 5022 * Searched too far.
4565 5023 */
4566 5024 mutex_exit(hash_lock);
4567 5025 break;
4568 5026 }
4569 5027
4570 5028 if (!l2arc_write_eligible(guid, ab)) {
4571 5029 mutex_exit(hash_lock);
4572 5030 continue;
4573 5031 }
4574 5032
4575 5033 if ((write_sz + ab->b_size) > target_sz) {
4576 5034 full = B_TRUE;
4577 5035 mutex_exit(hash_lock);
4578 5036 break;
4579 5037 }
4580 5038
4581 5039 if (pio == NULL) {
4582 5040 /*
4583 5041 * Insert a dummy header on the buflist so
4584 5042 * l2arc_write_done() can find where the
4585 5043 * write buffers begin without searching.
4586 5044 */
4587 5045 list_insert_head(dev->l2ad_buflist, head);
4588 5046
4589 - cb = kmem_alloc(
5047 + cb = kmem_zalloc(
4590 5048 sizeof (l2arc_write_callback_t), KM_SLEEP);
4591 5049 cb->l2wcb_dev = dev;
4592 5050 cb->l2wcb_head = head;
4593 5051 pio = zio_root(spa, l2arc_write_done, cb,
4594 5052 ZIO_FLAG_CANFAIL);
4595 5053 }
4596 5054
4597 5055 /*
4598 5056 * Create and add a new L2ARC header.
4599 5057 */
4600 5058 l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
4601 5059 l2hdr->b_dev = dev;
4602 5060 ab->b_flags |= ARC_L2_WRITING;
4603 5061
4604 5062 /*
4605 5063 * Temporarily stash the data buffer in b_tmp_cdata.
4606 5064 * The subsequent write step will pick it up from
4607 5065  * there. This is because we can't access ab->b_buf
4608 5066 * without holding the hash_lock, which we in turn
4609 5067 * can't access without holding the ARC list locks
4610 5068 * (which we want to avoid during compression/writing).
4611 5069 */
4612 5070 l2hdr->b_compress = ZIO_COMPRESS_OFF;
4613 5071 l2hdr->b_asize = ab->b_size;
4614 5072 l2hdr->b_tmp_cdata = ab->b_buf->b_data;
4615 5073
4616 5074 buf_sz = ab->b_size;
4617 5075 ab->b_l2hdr = l2hdr;
4618 5076
4619 5077 list_insert_head(dev->l2ad_buflist, ab);
4620 5078
4621 5079 /*
4622 5080 * Compute and store the buffer cksum before
4623 5081 * writing. On debug the cksum is verified first.
4624 5082 */
4625 5083 arc_cksum_verify(ab->b_buf);
4626 5084 arc_cksum_compute(ab->b_buf, B_TRUE);
4627 5085
4628 5086 mutex_exit(hash_lock);
4629 5087
4630 5088 write_sz += buf_sz;
5089 + num_bufs++;
4631 5090 }
4632 5091
4633 5092 mutex_exit(list_lock);
4634 5093
4635 5094 if (full == B_TRUE)
4636 5095 break;
4637 5096 }
4638 5097
4639 5098 /* No buffers selected for writing? */
4640 5099 if (pio == NULL) {
4641 5100 ASSERT0(write_sz);
4642 5101 mutex_exit(&l2arc_buflist_mtx);
4643 5102 kmem_cache_free(hdr_cache, head);
4644 5103 return (0);
4645 5104 }
4646 5105
5106 + /* expand the pbuf to include a new list */
5107 + pb_buflist = l2arc_pbuf_buflist_alloc(pb, num_bufs);
5108 +
4647 5109 /*
4648 5110 * Now start writing the buffers. We're starting at the write head
4649 5111 * and work backwards, retracing the course of the buffer selector
4650 5112 * loop above.
4651 5113 */
4652 - for (ab = list_prev(dev->l2ad_buflist, head); ab;
4653 - ab = list_prev(dev->l2ad_buflist, ab)) {
5114 + for (ab = list_prev(dev->l2ad_buflist, head), buf_index = 0; ab;
5115 + ab = list_prev(dev->l2ad_buflist, ab), buf_index++) {
4654 5116 l2arc_buf_hdr_t *l2hdr;
4655 5117 uint64_t buf_sz;
4656 5118
4657 5119 /*
4658 5120 * We shouldn't need to lock the buffer here, since we flagged
4659 5121 * it as ARC_L2_WRITING in the previous step, but we must take
4660 5122 * care to only access its L2 cache parameters. In particular,
4661 5123 * ab->b_buf may be invalid by now due to ARC eviction.
4662 5124 */
4663 5125 l2hdr = ab->b_l2hdr;
4664 5126 l2hdr->b_daddr = dev->l2ad_hand;
4665 5127
4666 5128 if ((ab->b_flags & ARC_L2COMPRESS) &&
4667 5129 l2hdr->b_asize >= buf_compress_minsz) {
4668 5130 if (l2arc_compress_buf(l2hdr)) {
4669 5131 /*
4670 5132 * If compression succeeded, enable headroom
4671 5133 * boost on the next scan cycle.
4672 5134 */
4673 5135 *headroom_boost = B_TRUE;
4674 5136 }
4675 5137 }
4676 5138
4677 5139 /*
4678 5140 * Pick up the buffer data we had previously stashed away
4679 5141 * (and now potentially also compressed).
4680 5142 */
4681 5143 buf_data = l2hdr->b_tmp_cdata;
4682 5144 buf_sz = l2hdr->b_asize;
4683 5145
4684 5146 /* Compression may have squashed the buffer to zero length. */
4685 5147 if (buf_sz != 0) {
4686 5148 uint64_t buf_p_sz;
4687 5149
4688 5150 wzio = zio_write_phys(pio, dev->l2ad_vdev,
4689 5151 dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
4690 5152 NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
4691 5153 ZIO_FLAG_CANFAIL, B_FALSE);
4692 5154
4693 5155 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
4694 5156 zio_t *, wzio);
4695 5157 (void) zio_nowait(wzio);
4696 5158
4697 5159 write_asize += buf_sz;
4698 5160 /*
4699 5161 * Keep the clock hand suitably device-aligned.
4700 5162 */
4701 5163 buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
4702 5164 write_psize += buf_p_sz;
4703 5165 dev->l2ad_hand += buf_p_sz;
4704 5166 }
4705 - }
4706 5167
5168 + l2arc_pbuflist_insert(pb, pb_buflist, ab, buf_index);
5169 + }
5170 + ASSERT(buf_index == num_bufs);
4707 5171 mutex_exit(&l2arc_buflist_mtx);
4708 5172
4709 5173 ASSERT3U(write_asize, <=, target_sz);
4710 5174 ARCSTAT_BUMP(arcstat_l2_writes_sent);
4711 5175 ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
4712 5176 ARCSTAT_INCR(arcstat_l2_size, write_sz);
4713 5177 ARCSTAT_INCR(arcstat_l2_asize, write_asize);
4714 5178 vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
4715 5179
5180 + /* Is it time to commit this pbuf? */
5181 + if (L2PBUF_IS_FULL(pb) &&
5182 + dev->l2ad_hand + L2PBUF_ENCODED_SIZE(pb) < dev->l2ad_end) {
5183 + l2arc_pbuf_commit(dev, pio, cb);
5184 + l2arc_pbuf_destroy(pb);
5185 + l2arc_pbuf_init(pb);
5186 + }
5187 +
4716 5188 /*
4717 5189 * Bump device hand to the device start if it is approaching the end.
4718 5190 * l2arc_evict() will already have evicted ahead for this case.
4719 5191 */
4720 5192 if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
4721 5193 vdev_space_update(dev->l2ad_vdev,
4722 5194 dev->l2ad_end - dev->l2ad_hand, 0, 0);
4723 5195 dev->l2ad_hand = dev->l2ad_start;
4724 5196 dev->l2ad_evict = dev->l2ad_start;
4725 5197 dev->l2ad_first = B_FALSE;
4726 5198 }
4727 5199
4728 5200 dev->l2ad_writing = B_TRUE;
4729 5201 (void) zio_wait(pio);
4730 5202 dev->l2ad_writing = B_FALSE;
4731 5203
4732 5204 return (write_asize);
4733 5205 }
4734 5206
4735 5207 /*
4736 5208 * Compresses an L2ARC buffer.
4737 5209 * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
4738 5210 * size in l2hdr->b_asize. This routine tries to compress the data and
4739 5211 * depending on the compression result there are three possible outcomes:
4740 5212 * *) The buffer was incompressible. The original l2hdr contents were left
4741 5213 * untouched and are ready for writing to an L2 device.
4742 5214 * *) The buffer was all-zeros, so there is no need to write it to an L2
4743 5215 * device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
4744 5216 * set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
4745 5217 * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
4746 5218 * data buffer which holds the compressed data to be written, and b_asize
4747 5219 * tells us how much data there is. b_compress is set to the appropriate
4748 5220 * compression algorithm. Once writing is done, invoke
4749 5221 * l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
4750 5222 *
4751 5223 * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
4752 5224 * buffer was incompressible).
4753 5225 */
4754 5226 static boolean_t
4755 5227 l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
4756 5228 {
4757 5229 void *cdata;
4758 5230 size_t csize, len;
4759 5231
4760 5232 ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
4761 5233 ASSERT(l2hdr->b_tmp_cdata != NULL);
4762 5234
4763 5235 len = l2hdr->b_asize;
4764 5236 cdata = zio_data_buf_alloc(len);
4765 5237 csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
4766 5238 cdata, l2hdr->b_asize);
4767 5239
4768 5240 if (csize == 0) {
4769 5241 /* zero block, indicate that there's nothing to write */
4770 5242 zio_data_buf_free(cdata, len);
4771 5243 l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
4772 5244 l2hdr->b_asize = 0;
4773 5245 l2hdr->b_tmp_cdata = NULL;
4774 5246 ARCSTAT_BUMP(arcstat_l2_compress_zeros);
4775 5247 return (B_TRUE);
4776 5248 } else if (csize > 0 && csize < len) {
4777 5249 /*
4778 5250 * Compression succeeded, we'll keep the cdata around for
4779 5251 * writing and release it afterwards.
4780 5252 */
4781 5253 l2hdr->b_compress = ZIO_COMPRESS_LZ4;
4782 5254 l2hdr->b_asize = csize;
4783 5255 l2hdr->b_tmp_cdata = cdata;
4784 5256 ARCSTAT_BUMP(arcstat_l2_compress_successes);
4785 5257 return (B_TRUE);
4786 5258 } else {
4787 5259 /*
4788 5260 * Compression failed, release the compressed buffer.
4789 5261 * l2hdr will be left unmodified.
4790 5262 */
4791 5263 zio_data_buf_free(cdata, len);
4792 5264 ARCSTAT_BUMP(arcstat_l2_compress_failures);
4793 5265 return (B_FALSE);
4794 5266 }
4795 5267 }
4796 5268
4797 5269 /*
4798 5270 * Decompresses a zio read back from an l2arc device. On success, the
4799 5271 * underlying zio's io_data buffer is overwritten by the uncompressed
4800 5272 * version. On decompression error (corrupt compressed stream), the
4801 5273 * zio->io_error value is set to signal an I/O error.
4802 5274 *
4803 5275 * Please note that the compressed data stream is not checksummed, so
4804 5276 * if the underlying device is experiencing data corruption, we may feed
4805 5277 * corrupt data to the decompressor, so the decompressor needs to be
4806 5278 * able to handle this situation (LZ4 does).
4807 5279 */
4808 5280 static void
4809 5281 l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
4810 5282 {
4811 5283 ASSERT(L2ARC_IS_VALID_COMPRESS(c));
4812 5284
4813 5285 if (zio->io_error != 0) {
4814 5286 /*
4815 5287  * An io error has occurred; just restore the original io
4816 5288 * size in preparation for a main pool read.
4817 5289 */
4818 5290 zio->io_orig_size = zio->io_size = hdr->b_size;
4819 5291 return;
4820 5292 }
4821 5293
4822 5294 if (c == ZIO_COMPRESS_EMPTY) {
4823 5295 /*
4824 5296 * An empty buffer results in a null zio, which means we
4825 5297 * need to fill its io_data after we're done restoring the
4826 5298 * buffer's contents.
4827 5299 */
4828 5300 ASSERT(hdr->b_buf != NULL);
4829 5301 bzero(hdr->b_buf->b_data, hdr->b_size);
4830 5302 zio->io_data = zio->io_orig_data = hdr->b_buf->b_data;
4831 5303 } else {
4832 5304 ASSERT(zio->io_data != NULL);
4833 5305 /*
4834 5306 * We copy the compressed data from the start of the arc buffer
4835 5307 * (the zio_read will have pulled in only what we need, the
4836 5308 * rest is garbage which we will overwrite at decompression)
4837 5309 * and then decompress back to the ARC data buffer. This way we
4838 5310 * can minimize copying by simply decompressing back over the
4839 5311 * original compressed data (rather than decompressing to an
4840 5312 * aux buffer and then copying back the uncompressed buffer,
4841 5313 * which is likely to be much larger).
4842 5314 */
4843 5315 uint64_t csize;
4844 5316 void *cdata;
4845 5317
4846 5318 csize = zio->io_size;
4847 5319 cdata = zio_data_buf_alloc(csize);
4848 5320 bcopy(zio->io_data, cdata, csize);
4849 5321 if (zio_decompress_data(c, cdata, zio->io_data, csize,
4850 5322 hdr->b_size) != 0)
4851 5323 zio->io_error = EIO;
4852 5324 zio_data_buf_free(cdata, csize);
4853 5325 }
4854 5326
4855 5327 /* Restore the expected uncompressed IO size. */
4856 5328 zio->io_orig_size = zio->io_size = hdr->b_size;
4857 5329 }
4858 5330
4859 5331 /*
4860 5332 * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
4861 5333 * This buffer serves as a temporary holder of compressed data while
4862 5334 * the buffer entry is being written to an l2arc device. Once that is
4863 5335 * done, we can dispose of it.
4864 5336 */
4865 5337 static void
4866 5338 l2arc_release_cdata_buf(arc_buf_hdr_t *ab)
4867 5339 {
4868 5340 l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
4869 5341
4870 5342 if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) {
4871 5343 /*
4872 5344 * If the data was compressed, then we've allocated a
4873 5345 * temporary buffer for it, so now we need to release it.
4874 5346 */
4875 5347 ASSERT(l2hdr->b_tmp_cdata != NULL);
4876 5348 zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size);
4877 5349 }
4878 5350 l2hdr->b_tmp_cdata = NULL;
4879 5351 }
4880 5352
4881 5353 /*
4882 5354 * This thread feeds the L2ARC at regular intervals. This is the beating
4883 5355 * heart of the L2ARC.
4884 5356 */
4885 5357 static void
4886 5358 l2arc_feed_thread(void)
4887 5359 {
4888 5360 callb_cpr_t cpr;
4889 5361 l2arc_dev_t *dev;
4890 5362 spa_t *spa;
4891 5363 uint64_t size, wrote;
4892 5364 clock_t begin, next = ddi_get_lbolt();
4893 5365 boolean_t headroom_boost = B_FALSE;
4894 5366
4895 5367 CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
4896 5368
4897 5369 mutex_enter(&l2arc_feed_thr_lock);
4898 5370
4899 5371 while (l2arc_thread_exit == 0) {
4900 5372 CALLB_CPR_SAFE_BEGIN(&cpr);
4901 5373 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
4902 5374 next);
4903 5375 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
4904 5376 next = ddi_get_lbolt() + hz;
4905 5377
4906 5378 /*
4907 5379 * Quick check for L2ARC devices.
4908 5380 */
4909 5381 mutex_enter(&l2arc_dev_mtx);
4910 5382 if (l2arc_ndev == 0) {
4911 5383 mutex_exit(&l2arc_dev_mtx);
4912 5384 continue;
4913 5385 }
4914 5386 mutex_exit(&l2arc_dev_mtx);
4915 5387 begin = ddi_get_lbolt();
4916 5388
4917 5389 /*
4918 5390 * This selects the next l2arc device to write to, and in
4919 5391 * doing so the next spa to feed from: dev->l2ad_spa. This
4920 5392 * will return NULL if there are now no l2arc devices or if
4921 5393 * they are all faulted.
4922 5394 *
4923 5395 * If a device is returned, its spa's config lock is also
4924 5396 * held to prevent device removal. l2arc_dev_get_next()
4925 5397 * will grab and release l2arc_dev_mtx.
4926 5398 */
4927 5399 if ((dev = l2arc_dev_get_next()) == NULL)
4928 5400 continue;
4929 5401
4930 5402 spa = dev->l2ad_spa;
4931 5403 ASSERT(spa != NULL);
4932 5404
4933 5405 /*
4934 5406 * If the pool is read-only then force the feed thread to
4935 5407 * sleep a little longer.
4936 5408 */
4937 5409 if (!spa_writeable(spa)) {
4938 5410 next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
4939 5411 spa_config_exit(spa, SCL_L2ARC, dev);
4940 5412 continue;
4941 5413 }
4942 5414
4943 5415 /*
4944 5416 * Avoid contributing to memory pressure.
4945 5417 */
4946 5418 if (arc_reclaim_needed()) {
4947 5419 ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
4948 5420 spa_config_exit(spa, SCL_L2ARC, dev);
4949 5421 continue;
4950 5422 }
4951 5423
4952 5424 ARCSTAT_BUMP(arcstat_l2_feeds);
4953 5425
4954 5426 size = l2arc_write_size();
4955 5427
4956 5428 /*
4957 5429 * Evict L2ARC buffers that will be overwritten.
4958 5430 */
4959 5431 l2arc_evict(dev, size, B_FALSE);
4960 5432
4961 5433 /*
4962 5434 * Write ARC buffers.
4963 5435 */
4964 5436 wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
4965 5437
4966 5438 /*
4967 5439 * Calculate interval between writes.
4968 5440 */
4969 5441 next = l2arc_write_interval(begin, size, wrote);
4970 5442 spa_config_exit(spa, SCL_L2ARC, dev);
4971 5443 }
4972 5444
4973 5445 l2arc_thread_exit = 0;
4974 5446 cv_broadcast(&l2arc_feed_thr_cv);
4975 5447 CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */
4976 5448 thread_exit();
4977 5449 }
4978 5450
4979 5451 boolean_t
4980 5452 l2arc_vdev_present(vdev_t *vd)
4981 5453 {
4982 5454 l2arc_dev_t *dev;
4983 5455
4984 5456 mutex_enter(&l2arc_dev_mtx);
4985 5457 for (dev = list_head(l2arc_dev_list); dev != NULL;
4986 5458 dev = list_next(l2arc_dev_list, dev)) {
4987 5459 if (dev->l2ad_vdev == vd)
4988 5460 break;
4989 5461 }
4990 5462 mutex_exit(&l2arc_dev_mtx);
4991 5463
4992 5464 return (dev != NULL);
4993 5465 }
4994 5466
4995 5467 /*
4996 5468 * Add a vdev for use by the L2ARC. By this point the spa has already
4997 - * validated the vdev and opened it.
5469 + * validated the vdev and opened it. The `rebuild' flag indicates whether
5470 + * we should attempt an L2ARC persistency rebuild.
4998 5471 */
4999 5472 void
5000 -l2arc_add_vdev(spa_t *spa, vdev_t *vd)
5473 +l2arc_add_vdev(spa_t *spa, vdev_t *vd, boolean_t rebuild)
5001 5474 {
5002 5475 l2arc_dev_t *adddev;
5003 5476
5004 5477 ASSERT(!l2arc_vdev_present(vd));
5005 5478
5006 5479 /*
5007 5480 * Create a new l2arc device entry.
5008 5481 */
5009 5482 adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
5010 5483 adddev->l2ad_spa = spa;
5011 5484 adddev->l2ad_vdev = vd;
5012 - adddev->l2ad_start = VDEV_LABEL_START_SIZE;
5485 + adddev->l2ad_start = VDEV_LABEL_START_SIZE + L2UBERBLOCK_SIZE;
5013 5486 adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
5014 5487 adddev->l2ad_hand = adddev->l2ad_start;
5015 5488 adddev->l2ad_evict = adddev->l2ad_start;
5016 5489 adddev->l2ad_first = B_TRUE;
5017 5490 adddev->l2ad_writing = B_FALSE;
5491 + l2arc_pbuf_init(&adddev->l2ad_pbuf);
5018 5492
5019 5493 /*
5020 5494 * This is a list of all ARC buffers that are still valid on the
5021 5495 * device.
5022 5496 */
5023 5497 adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
5024 5498 list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
5025 5499 offsetof(arc_buf_hdr_t, b_l2node));
5026 5500
5027 5501 vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
5028 5502
5029 5503 /*
5030 5504 * Add device to global list
5031 5505 */
5032 5506 mutex_enter(&l2arc_dev_mtx);
5033 5507 list_insert_head(l2arc_dev_list, adddev);
5034 5508 atomic_inc_64(&l2arc_ndev);
5509 + if (rebuild && l2arc_rebuild_enabled) {
5510 + adddev->l2ad_rebuilding = B_TRUE;
5511 + (void) thread_create(NULL, 0, l2arc_rebuild_start, adddev,
5512 + 0, &p0, TS_RUN, minclsyspri);
5513 + }
5035 5514 mutex_exit(&l2arc_dev_mtx);
5036 5515 }
5037 5516
5038 5517 /*
5039 5518 * Remove a vdev from the L2ARC.
5040 5519 */
5041 5520 void
5042 5521 l2arc_remove_vdev(vdev_t *vd)
5043 5522 {
5044 5523 l2arc_dev_t *dev, *nextdev, *remdev = NULL;
5045 5524
5046 5525 /*
5047 5526 * Find the device by vdev
5048 5527 */
5049 5528 mutex_enter(&l2arc_dev_mtx);
5050 5529 for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
5051 5530 nextdev = list_next(l2arc_dev_list, dev);
5052 5531 if (vd == dev->l2ad_vdev) {
5053 5532 remdev = dev;
5054 5533 break;
5055 5534 }
5056 5535 }
5057 5536 ASSERT(remdev != NULL);
5058 5537
5059 5538 /*
5060 5539 * Remove device from global list
5061 5540 */
5062 5541 list_remove(l2arc_dev_list, remdev);
5063 5542 l2arc_dev_last = NULL; /* may have been invalidated */
5064 5543 atomic_dec_64(&l2arc_ndev);
5065 5544 mutex_exit(&l2arc_dev_mtx);
5066 5545
5067 5546 /*
5068 5547 * Clear all buflists and ARC references. L2ARC device flush.
5069 5548 */
5549 + l2arc_pbuf_destroy(&remdev->l2ad_pbuf);
5070 5550 l2arc_evict(remdev, 0, B_TRUE);
5071 5551 list_destroy(remdev->l2ad_buflist);
5072 5552 kmem_free(remdev->l2ad_buflist, sizeof (list_t));
5073 5553 kmem_free(remdev, sizeof (l2arc_dev_t));
5074 5554 }
5075 5555
5076 5556 void
5077 5557 l2arc_init(void)
5078 5558 {
5079 5559 l2arc_thread_exit = 0;
5080 5560 l2arc_ndev = 0;
5081 5561 l2arc_writes_sent = 0;
5082 5562 l2arc_writes_done = 0;
5083 5563
5084 5564 mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
5085 5565 cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
5086 5566 mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
5087 5567 mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
5088 5568 mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
5089 5569
5090 5570 l2arc_dev_list = &L2ARC_dev_list;
5091 5571 l2arc_free_on_write = &L2ARC_free_on_write;
5092 5572 list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
5093 5573 offsetof(l2arc_dev_t, l2ad_node));
5094 5574 list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
5095 5575 offsetof(l2arc_data_free_t, l2df_list_node));
5096 5576 }
5097 5577
5098 5578 void
5099 5579 l2arc_fini(void)
5100 5580 {
5101 5581 /*
5102 5582 * This is called from dmu_fini(), which is called from spa_fini();
5103 5583 * Because of this, we can assume that all l2arc devices have
5104 5584 * already been removed when the pools themselves were removed.
5105 5585 */
5106 5586
5107 5587 l2arc_do_free_on_write();
5108 5588
5109 5589 mutex_destroy(&l2arc_feed_thr_lock);
5110 5590 cv_destroy(&l2arc_feed_thr_cv);
5111 5591 mutex_destroy(&l2arc_dev_mtx);
5112 5592 mutex_destroy(&l2arc_buflist_mtx);
5113 5593 mutex_destroy(&l2arc_free_on_write_mtx);
5114 5594
5115 5595 list_destroy(l2arc_dev_list);
5116 5596 list_destroy(l2arc_free_on_write);
5117 5597 }
5118 5598
5119 5599 void
5120 5600 l2arc_start(void)
5121 5601 {
5122 5602 if (!(spa_mode_global & FWRITE))
5123 5603 return;
5124 5604
5125 5605 (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
5126 5606 TS_RUN, minclsyspri);
5127 5607 }
5128 5608
5129 5609 void
5130 5610 l2arc_stop(void)
5131 5611 {
5132 5612 if (!(spa_mode_global & FWRITE))
5133 5613 return;
5134 5614
5135 5615 mutex_enter(&l2arc_feed_thr_lock);
5136 5616 cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */
5137 5617 l2arc_thread_exit = 1;
5138 5618 while (l2arc_thread_exit != 0)
5139 5619 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
5140 5620 mutex_exit(&l2arc_feed_thr_lock);
5621 +}
5622 +
5623 +/*
5624 + * Main entry point for L2ARC metadata rebuilding. This function must be
5625 + * called via thread_create so that the L2ARC metadata rebuild doesn't block
5626 + * pool import and may proceed in parallel on all available L2ARC devices.
5627 + */
5628 +static void
5629 +l2arc_rebuild_start(l2arc_dev_t *dev)
5630 +{
5631 + vdev_t *vd = dev->l2ad_vdev;
5632 + spa_t *spa = dev->l2ad_spa;
5633 +
5634 + /* Lock out device removal. */
5635 + spa_config_enter(spa, SCL_L2ARC, vd, RW_READER);
5636 + ASSERT(dev->l2ad_rebuilding == B_TRUE);
5637 + l2arc_rebuild(dev);
5638 + dev->l2ad_rebuilding = B_FALSE;
5639 + spa_config_exit(spa, SCL_L2ARC, vd);
5640 + thread_exit();
5641 +}
5642 +
5643 +/*
5644 + * This function implements the actual L2ARC metadata rebuild. It:
5645 + *
5646 + * 1) scans the device for valid l2uberblocks
5647 + * 2) if it finds a good uberblock, starts reading the pbuf chain
5648 + * 3) restores each pbuf's contents to memory
5649 + *
5650 + * Operation stops under any of the following conditions:
5651 + *
5652 + * 1) We reach the end of the pbuf chain (the previous-buffer reference
5653 + * in the pbuf is zero).
5654 + * 2) We encounter *any* error condition (cksum errors, io errors, looped
5655 + * pbufs, etc.).
5656 + * 3) The l2arc_rebuild_timeout is hit - this is a last resort to keep
5657 + *    severely fragmented L2ARC pbufs or slow L2ARC devices from
5658 + *    preventing a machine from importing the pool (and letting the
5659 + * administrator take corrective action, e.g. by kicking the misbehaving
5660 + * L2ARC device out of the pool, or by reimporting the pool with L2ARC
5661 + * rebuilding disabled).
5662 + */
5663 +static void
5664 +l2arc_rebuild(l2arc_dev_t *dev)
5665 +{
5666 + int err;
5667 + l2uberblock_t ub;
5668 + l2pbuf_t pb;
5669 + zio_t *this_io = NULL, *next_io = NULL;
5670 + int64_t deadline = ddi_get_lbolt64() + hz * l2arc_rebuild_timeout;
5671 +
5672 + if ((err = l2arc_uberblock_find(dev, &ub)) != 0)
5673 + return;
5674 + L2ARC_CHK_REBUILD_TIMEOUT(deadline, /* nop */);
5675 +
5676 + /* set up uberblock update info */
5677 + dev->l2ad_uberblock_birth = ub.ub_birth + 1;
5678 +
5679 + /* initial sanity checks */
5680 + l2arc_pbuf_init(&pb);
5681 + if ((err = l2arc_pbuf_read(dev, ub.ub_pbuf_daddr, ub.ub_pbuf_asize,
5682 + ub.ub_pbuf_cksum, &pb, NULL, &this_io)) != 0) {
5683 + /* root pbuf is bad, we can't do anything about that */
5684 + if (err == EINVAL) {
5685 + ARCSTAT_BUMP(arcstat_l2_rebuild_cksum_errors);
5686 + } else {
5687 + ARCSTAT_BUMP(arcstat_l2_rebuild_io_errors);
5688 + }
5689 + l2arc_pbuf_destroy(&pb);
5690 + return;
5691 + }
5692 + L2ARC_CHK_REBUILD_TIMEOUT(deadline, l2arc_pbuf_destroy(&pb));
5693 +
5694 + dev->l2ad_evict = ub.ub_evict_tail;
5695 +
5696 + /* keep on chaining in new blocks */
5697 + dev->l2ad_pbuf_daddr = ub.ub_pbuf_daddr;
5698 + dev->l2ad_pbuf_asize = ub.ub_pbuf_asize;
5699 + dev->l2ad_pbuf_cksum = ub.ub_pbuf_cksum;
5700 + dev->l2ad_hand = vdev_psize_to_asize(dev->l2ad_vdev,
5701 + ub.ub_pbuf_daddr + ub.ub_pbuf_asize);
5702 + dev->l2ad_first = ((ub.ub_flags & L2UBLK_EVICT_FIRST) != 0);
5703 +
5704 + /* start the rebuild process */
5705 + for (;;) {
5706 + l2pbuf_t pb_prev;
5707 +
5708 + l2arc_pbuf_init(&pb_prev);
5709 + if ((err = l2arc_pbuf_read(dev, pb.pb_prev_daddr,
5710 + pb.pb_prev_asize, pb.pb_prev_cksum, &pb_prev, this_io,
5711 + &next_io)) != 0) {
5712 + /*
5713 + * We are done reading, discard the last good buffer.
5714 + */
5715 + if (pb.pb_prev_daddr > dev->l2ad_hand &&
5716 + pb.pb_prev_asize > L2PBUF_HDR_SIZE) {
5717 + /* this is an error, we stopped too early */
5718 + if (err == EINVAL) {
5719 + ARCSTAT_BUMP(
5720 + arcstat_l2_rebuild_cksum_errors);
5721 + } else {
5722 + ARCSTAT_BUMP(
5723 + arcstat_l2_rebuild_io_errors);
5724 + }
5725 + }
5726 + l2arc_pbuf_destroy(&pb_prev);
5727 + l2arc_pbuf_destroy(&pb);
5728 + break;
5729 + }
5730 +
5731 + /*
5732 + * Protection against infinite loops of pbufs. This is also
5733 + * our primary termination mechanism - once the buffer list
5734 + * loops around our starting pbuf, we can stop.
5735 + */
5736 + if (pb.pb_prev_daddr >= ub.ub_pbuf_daddr &&
5737 + pb_prev.pb_prev_daddr <= ub.ub_pbuf_daddr) {
5738 + ARCSTAT_BUMP(arcstat_l2_rebuild_loop_errors);
5739 + l2arc_pbuf_destroy(&pb);
5740 + l2arc_pbuf_destroy(&pb_prev);
5741 + if (next_io)
5742 + l2arc_pbuf_prefetch_abort(next_io);
5743 + return;
5744 + }
5745 +
5746 + /*
5747 + * Our memory pressure valve. If the system is running low
5748 + * on memory, rather than swamping memory with new ARC buf
5749 + * hdrs, we opt not to reconstruct the L2ARC. At this point,
5750 + * however, we have already set up our L2ARC dev to chain in
5751 + * new metadata pbufs, so the user may choose to re-add the
5752 + * L2ARC dev at a later time to reconstruct it (when there's
5753 + * less memory pressure).
5754 + */
5755 + if (arc_reclaim_needed()) {
5756 + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem);
5757 + cmn_err(CE_NOTE, "System running low on memory, "
5758 + "aborting L2ARC rebuild.");
5759 + l2arc_pbuf_destroy(&pb);
5760 + l2arc_pbuf_destroy(&pb_prev);
5761 + if (next_io)
5762 + l2arc_pbuf_prefetch_abort(next_io);
5763 + break;
5764 + }
5765 +
5766 + /*
5767 + * Now that we know that the prev_pbuf checks out alright, we
5768 + * can start reconstruction from this pbuf - we can be sure
5769 + * that the L2ARC write hand has not yet reached any of our
5770 + * buffers.
5771 + */
5772 + l2arc_pbuf_restore(dev, &pb);
5773 +
5774 + /* pbuf restored, continue with next one in the list */
5775 + l2arc_pbuf_destroy(&pb);
5776 + pb = pb_prev;
5777 + this_io = next_io;
5778 + next_io = NULL;
5779 +
5780 + L2ARC_CHK_REBUILD_TIMEOUT(deadline, l2arc_pbuf_destroy(&pb));
5781 + }
5782 +
5783 + ARCSTAT_BUMP(arcstat_l2_rebuild_successes);
5784 +}
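A compact standalone model of the chain-walk termination logic above; the pbuf_model_t type and the device addresses are hypothetical, and the loop guard is reduced to a single comparison for brevity:

/* Standalone model of the pbuf chain walk; illustration only, not arc.c code. */
#include <stdio.h>
#include <stdint.h>

typedef struct {
	uint64_t daddr;		/* where this pbuf lives on the device */
	uint64_t prev_daddr;	/* device address of the previous pbuf */
	int	 valid;		/* does reading/cksumming it succeed? */
} pbuf_model_t;

int
main(void)
{
	/* A small chain written at increasing addresses; index 3 is newest. */
	pbuf_model_t chain[] = {
		{ 1000, 0,    1 },	/* oldest; prev_daddr 0 ends the walk */
		{ 2000, 1000, 1 },
		{ 3000, 2000, 1 },
		{ 4000, 3000, 1 },	/* pointed to by the uberblock */
	};
	uint64_t start_daddr = chain[3].daddr;
	int restored = 0;

	for (int i = 3; ; ) {
		pbuf_model_t *pb = &chain[i];
		int prev = -1;

		/* Find the previous pbuf by address; stop if unreadable. */
		for (int j = 0; j < 4; j++)
			if (chain[j].daddr == pb->prev_daddr && chain[j].valid)
				prev = j;
		if (prev == -1)
			break;			/* end of chain or bad pbuf */
		/* Loop guard: stop once the chain wraps past our start. */
		if (pb->prev_daddr >= start_daddr)
			break;
		restored++;	/* "restore" pb, then step to its predecessor */
		i = prev;
	}
	printf("restored %d pbufs\n", restored);	/* prints 3 */
	return (0);
}

As in the function above, the oldest readable pbuf - the one whose predecessor cannot be fetched - is discarded rather than restored.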
5785 +
5786 +/*
5787 + * Restores the payload of a pbuf to ARC. This creates empty ARC hdr entries
5788 + * which only contain an l2arc hdr, essentially restoring the buffers to
5789 + * their L2ARC evicted state. This function also updates space usage on the
5790 + * L2ARC vdev to make sure it tracks restored buffers.
5791 + */
5792 +static void
5793 +l2arc_pbuf_restore(l2arc_dev_t *dev, l2pbuf_t *pb)
5794 +{
5795 + spa_t *spa;
5796 + uint64_t guid;
5797 + list_t *buflists_list;
5798 + l2pbuf_buflist_t *buflist;
5799 +
5800 + mutex_enter(&l2arc_buflist_mtx);
5801 + spa = dev->l2ad_vdev->vdev_spa;
5802 + guid = spa_load_guid(spa);
5803 + buflists_list = pb->pb_buflists_list;
5804 + for (buflist = list_head(buflists_list); buflist;
5805 + buflist = list_next(buflists_list, buflist)) {
5806 + int i;
5807 + uint64_t size, asize, psize;
5808 +
5809 + size = asize = psize = 0;
5810 + for (i = 0; i < buflist->l2pbl_nbufs; i++) {
5811 + l2arc_hdr_restore(&buflist->l2pbl_bufs[i], dev,
5812 + guid);
5813 + size += buflist->l2pbl_bufs[i].b_size;
5814 + asize += buflist->l2pbl_bufs[i].b_l2asize;
5815 + psize += vdev_psize_to_asize(dev->l2ad_vdev,
5816 + buflist->l2pbl_bufs[i].b_l2asize);
5817 + }
5818 + ARCSTAT_INCR(arcstat_l2_rebuild_arc_bytes, size);
5819 + ARCSTAT_INCR(arcstat_l2_rebuild_l2arc_bytes, asize);
5820 + ARCSTAT_INCR(arcstat_l2_rebuild_bufs, buflist->l2pbl_nbufs);
5821 + vdev_space_update(dev->l2ad_vdev, psize, 0, 0);
5822 + }
5823 + mutex_exit(&l2arc_buflist_mtx);
5824 + ARCSTAT_BUMP(arcstat_l2_rebuild_metabufs);
5825 + vdev_space_update(dev->l2ad_vdev, vdev_psize_to_asize(dev->l2ad_vdev,
5826 + pb->pb_asize), 0, 0);
5827 +}
5828 +
5829 +/*
5830 + * Restores a single ARC buf hdr from a pbuf. The ARC buffer is put into
5831 + * a state indicating that it has been evicted to L2ARC.
5832 + * The `guid' here is the ARC-load-guid from spa_load_guid.
5833 + */
5834 +static void
5835 +l2arc_hdr_restore(const l2pbuf_buf_t *buf, l2arc_dev_t *dev, uint64_t guid)
5836 +{
5837 + arc_buf_hdr_t *hdr;
5838 + kmutex_t *hash_lock;
5839 + dva_t dva = {buf->b_dva.dva_word[0], buf->b_dva.dva_word[1]};
5840 +
5841 + hdr = buf_hash_find(guid, &dva, buf->b_birth, &hash_lock);
5842 + if (hdr == NULL) {
5843 + /* not in cache, try to insert */
5844 + arc_buf_hdr_t *exists;
5845 + arc_buf_contents_t type = buf->b_contents_type;
5846 + l2arc_buf_hdr_t *l2hdr;
5847 +
5848 + hdr = arc_buf_hdr_alloc(guid, buf->b_size, type);
5849 + hdr->b_dva = buf->b_dva;
5850 + hdr->b_birth = buf->b_birth;
5851 + hdr->b_cksum0 = buf->b_cksum0;
5852 + hdr->b_size = buf->b_size;
5853 + exists = buf_hash_insert(hdr, &hash_lock);
5854 + if (exists) {
5855 + /* somebody beat us to the hash insert */
5856 + mutex_exit(hash_lock);
5857 + arc_hdr_destroy(hdr);
5858 + ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached);
5859 + return;
5860 + }
5861 + hdr->b_flags = buf->b_flags;
5862 + mutex_enter(&hdr->b_freeze_lock);
5863 + ASSERT(hdr->b_freeze_cksum == NULL);
5864 + hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
5865 + KM_SLEEP);
5866 + *hdr->b_freeze_cksum = buf->b_freeze_cksum;
5867 + mutex_exit(&hdr->b_freeze_lock);
5868 +
5869 + /* now rebuild the l2arc entry */
5870 + ASSERT(hdr->b_l2hdr == NULL);
5871 + l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
5872 + l2hdr->b_dev = dev;
5873 + l2hdr->b_daddr = buf->b_l2daddr;
5874 + l2hdr->b_asize = buf->b_l2asize;
5875 + l2hdr->b_compress = buf->b_l2compress;
5876 + hdr->b_l2hdr = l2hdr;
5877 + list_insert_head(dev->l2ad_buflist, hdr);
5878 + ARCSTAT_INCR(arcstat_l2_size, hdr->b_size);
5879 + ARCSTAT_INCR(arcstat_l2_asize, l2hdr->b_asize);
5880 +
5881 + arc_change_state(arc_l2c_only, hdr, hash_lock);
5882 + }
5883 + mutex_exit(hash_lock);
5884 +}
5885 +
5886 +/*
5887 + * Attempts to locate and read the newest valid uberblock on the provided
5888 + * L2ARC device and writes it to `ub'. On success, this function returns 0,
5889 + * otherwise the appropriate error code is returned.
5890 + */
5891 +static int
5892 +l2arc_uberblock_find(l2arc_dev_t *dev, l2uberblock_t *ub)
5893 +{
5894 + int err = 0;
5895 + uint8_t *ub_buf;
5896 + uint64_t guid;
5897 +
5898 + ARCSTAT_BUMP(arcstat_l2_rebuild_attempts);
5899 + ub_buf = kmem_alloc(L2UBERBLOCK_SIZE, KM_SLEEP);
5900 + guid = spa_guid(dev->l2ad_vdev->vdev_spa);
5901 +
5902 + if ((err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev,
5903 + VDEV_LABEL_START_SIZE, L2UBERBLOCK_SIZE, ub_buf,
5904 + ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
5905 + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
5906 + ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE))) != 0) {
5907 + ARCSTAT_BUMP(arcstat_l2_rebuild_io_errors);
5908 + goto cleanup;
5909 + }
5910 +
5911 + /*
5912 + * Initial peek - does the device even have any usable uberblocks?
5913 + * If not, don't bother continuing.
5914 + */
5915 + l2arc_uberblock_decode(ub_buf, ub);
5916 + if (ub->ub_magic != L2UBERBLOCK_MAGIC || ub->ub_version == 0 ||
5917 + ub->ub_version > L2UBERBLOCK_MAX_VERSION ||
5918 + ub->ub_spa_guid != guid) {
5919 + err = ENOTSUP;
5920 + ARCSTAT_BUMP(arcstat_l2_rebuild_unsupported);
5921 + goto cleanup;
5922 + }
5923 +
5924 + /* now check to make sure that what we selected is okay */
5925 + if ((err = l2arc_uberblock_verify(ub_buf, ub, guid)) != 0) {
5926 + if (err == EINVAL) {
5927 + ARCSTAT_BUMP(arcstat_l2_rebuild_cksum_errors);
5928 + } else {
5929 + ARCSTAT_BUMP(arcstat_l2_rebuild_uberblk_errors);
5930 + }
5931 + goto cleanup;
5932 + }
5933 +
5934 + /* this uberblock is valid */
5935 +
5936 +cleanup:
5937 + kmem_free(ub_buf, L2UBERBLOCK_SIZE);
5938 + return (err);
5939 +}
5940 +
5941 +/*
5942 + * Reads a pbuf from storage, decodes it and validates its contents against
5943 + * the provided checksum. The result is placed in `pb'.
5944 + *
5945 + * The `this_io' and `prefetch_io' arguments are used for pbuf prefetching.
5946 + * When issuing the first pbuf IO during rebuild, you should pass NULL for
5947 + * `this_io'. This function will then issue a sync IO to read the pbuf and
5948 + * also issue an async IO to fetch the next pbuf in the pbuf chain. The
5949 + * prefetch IO is returned in `prefetch_io'. On subsequent calls to this
5950 + * function, pass the value returned in `prefetch_io' from the previous
5951 + * call as `this_io' and a fresh `prefetch_io' pointer to hold the next
5952 + * prefetch IO. Prior to the call, you should initialize your `prefetch_io'
5953 + * pointer to be NULL. If no prefetch IO was issued, the pointer is left
5954 + * set at NULL.
5955 + *
5956 + * Actual prefetching takes place in two steps: a header IO (pi_hdr_io)
5957 + * and the main pbuf payload IO (placed in prefetch_io). The pi_hdr_io
5958 + * IO is used internally in this function to be able to `peek' at the next
5959 + * buffer's header before the main IO to read it in completely has finished.
5960 + * We can then begin to issue the IO for the next buffer in the chain before
5961 + * we are done reading, keeping the L2ARC device's pipeline saturated with
5962 + * reads (rather than issuing an IO, waiting for it to complete, validating
5963 + * the returned buffer and issuing the next one). This will make sure that
5964 + * the rebuild proceeds at maximum read throughput.
5965 + *
5966 + * On success, this function returns 0, otherwise it returns an appropriate
5967 + * error code. On error the prefetching IO is aborted and cleared before
5968 + * returning from this function. Therefore, if we return `success', the
5969 + * caller can assume that we have taken care of cleanup of prefetch IOs.
5970 + */
5971 +static int
5972 +l2arc_pbuf_read(l2arc_dev_t *dev, uint64_t daddr, uint32_t asize,
5973 + zio_cksum_t cksum, l2pbuf_t *pb, zio_t *this_io, zio_t **prefetch_io)
5974 +{
5975 + int err = 0;
5976 + uint64_t prev_pb_start;
5977 + uint32_t prev_pb_asize;
5978 + zio_cksum_t calc_cksum, prev_pb_cksum;
5979 + l2arc_prefetch_info_t *pi = NULL;
5980 +
5981 + ASSERT(dev != NULL);
5982 + ASSERT(pb != NULL);
5983 + ASSERT(*prefetch_io == NULL);
5984 +
5985 + if (!l2arc_pbuf_ptr_valid(dev, daddr, asize)) {
5986 + /* We could not have issued a prefetch IO for this */
5987 + ASSERT(this_io == NULL);
5988 + return (EINVAL);
5989 + }
5990 +
5991 + /*
5992 + * Check to see if we have issued the IO for this pbuf in a previous
5993 + * run. If not, issue it now.
5994 + */
5995 + if (this_io == NULL)
5996 + this_io = l2arc_pbuf_prefetch(dev->l2ad_vdev, daddr, asize);
5997 +
5998 + /* Pick up the prefetch info buffer and read its contents */
5999 + pi = this_io->io_private;
6000 + ASSERT(pi != NULL);
6001 + ASSERT(asize <= pi->pi_buflen);
6002 +
6003 + /* Wait for the IO to read this pbuf's header to complete */
6004 + if ((err = zio_wait(pi->pi_hdr_io)) != 0) {
6005 + (void) zio_wait(this_io);
6006 + goto cleanup;
6007 + }
6008 +
6009 + /*
6010 + * Peek to see if we can start issuing the next pbuf IO immediately.
6011 + * At this point, only the current pbuf's header has been read.
6012 + */
6013 + if (l2arc_pbuf_decode_prev_ptr(pi->pi_buf, asize, &prev_pb_start,
6014 + &prev_pb_asize, &prev_pb_cksum) == 0) {
6015 + uint64_t this_pb_start, this_pb_end, prev_pb_end;
6016 + /* Detect malformed pbuf references and loops */
6017 + this_pb_start = daddr;
6018 + this_pb_end = daddr + asize;
6019 + prev_pb_end = prev_pb_start + prev_pb_asize;
6020 + if ((prev_pb_start >= this_pb_start && prev_pb_start <
6021 + this_pb_end) ||
6022 + (prev_pb_end >= this_pb_start && prev_pb_end <
6023 + this_pb_end)) {
6024 + ARCSTAT_BUMP(arcstat_l2_rebuild_loop_errors);
6025 + cmn_err(CE_WARN, "Looping L2ARC metadata reference "
6026 + "detected, aborting rebuild.");
6027 + err = EINVAL;
6028 + goto cleanup;
6029 + }
6030 + /*
6031 + * Start issuing IO for the next pbuf early - this should
6032 + * help keep the L2ARC device busy while we read, decode
6033 + * and restore this pbuf.
6034 + */
6035 + if (l2arc_pbuf_ptr_valid(dev, prev_pb_start, prev_pb_asize))
6036 + *prefetch_io = l2arc_pbuf_prefetch(dev->l2ad_vdev,
6037 + prev_pb_start, prev_pb_asize);
6038 + }
6039 +
6040 + /* Wait for the main pbuf IO to complete */
6041 + if ((err = zio_wait(this_io)) != 0)
6042 + goto cleanup;
6043 +
6044 + /* Make sure the buffer checks out ok */
6045 + fletcher_4_native(pi->pi_buf, asize, &calc_cksum);
6046 + if (!ZIO_CHECKSUM_EQUAL(calc_cksum, cksum)) {
6047 + err = EINVAL;
6048 + goto cleanup;
6049 + }
6050 +
6051 + /* Now we can take our time decoding this buffer */
6052 + if ((err = l2arc_pbuf_decode(pi->pi_buf, asize, pb)) != 0)
6053 + goto cleanup;
6054 +
6055 + /* This will be used in l2arc_pbuf_restore for space accounting */
6056 + pb->pb_asize = asize;
6057 +
6058 + ARCSTAT_F_AVG(arcstat_l2_meta_avg_size, L2PBUF_ENCODED_SIZE(pb));
6059 + ARCSTAT_F_AVG(arcstat_l2_meta_avg_asize, asize);
6060 + ARCSTAT_F_AVG(arcstat_l2_asize_to_meta_ratio,
6061 + pb->pb_payload_asz / asize);
6062 +
6063 +cleanup:
6064 + kmem_free(pi->pi_buf, pi->pi_buflen);
6065 + pi->pi_buf = NULL;
6066 + kmem_free(pi, sizeof (l2arc_prefetch_info_t));
6067 + /* Abort an in-flight prefetch in case of error */
6068 + if (err != 0 && *prefetch_io != NULL) {
6069 + l2arc_pbuf_prefetch_abort(*prefetch_io);
6070 + *prefetch_io = NULL;
6071 + }
6072 + return (err);
6073 +}
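
A minimal sketch of how a caller might chain these reads, assuming `daddr', `asize' and `cksum' have been seeded from a verified uberblock; the actual rebuild loop lives elsewhere in this patch and may differ in detail:

	l2pbuf_t pb;
	zio_t *this_io = NULL, *next_io = NULL;

	for (;;) {
		l2arc_pbuf_init(&pb);
		if (l2arc_pbuf_read(dev, daddr, asize, cksum, &pb,
		    this_io, &next_io) != 0) {
			l2arc_pbuf_destroy(&pb);
			break;
		}
		/* restore pb's contents into the ARC, then follow the chain */
		daddr = pb.pb_prev_daddr;
		asize = pb.pb_prev_asize;
		cksum = pb.pb_prev_cksum;
		l2arc_pbuf_destroy(&pb);
		this_io = next_io;
		next_io = NULL;
	}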
6074 +
6075 +/*
6076 + * Validates a pbuf device address to make sure that it can be read
6077 + * from the provided L2ARC device. Returns 1 if the address is within
6078 + * the device's bounds, or 0 if not.
6079 + */
6080 +static int
6081 +l2arc_pbuf_ptr_valid(l2arc_dev_t *dev, uint64_t daddr, uint32_t asize)
6082 +{
6083 + uint32_t psize;
6084 + uint64_t end;
6085 +
6086 + psize = vdev_psize_to_asize(dev->l2ad_vdev, asize);
6087 + end = daddr + psize;
6088 +
6089 + if (end > dev->l2ad_end || asize < L2PBUF_HDR_SIZE ||
6090 + asize > L2PBUF_MAX_PAYLOAD_SIZE || daddr < dev->l2ad_start ||
6091 + /* check that the buffer address is correctly aligned */
6092 + (daddr & (vdev_psize_to_asize(dev->l2ad_vdev,
6093 + SPA_MINBLOCKSIZE) - 1)) != 0)
6094 + return (0);
6095 + else
6096 + return (1);
6097 +}
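
The alignment clause above relies on the usual power-of-two mask trick; a standalone sketch, assuming a hypothetical 512-byte minimum allocation unit (the real code derives it from the vdev via vdev_psize_to_asize(..., SPA_MINBLOCKSIZE)):

	#include <stdint.h>

	/* Nonzero if daddr is a multiple of min_alloc; min_alloc must be a power of 2. */
	static int
	daddr_is_aligned(uint64_t daddr, uint64_t min_alloc)
	{
		return ((daddr & (min_alloc - 1)) == 0);
	}

	/* e.g. daddr_is_aligned(9216, 512) is 1; daddr_is_aligned(9217, 512) is 0. */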
6098 +
6099 +/*
6100 + * Starts an asynchronous read IO to read a pbuf. This is used in pbuf
6101 + * reconstruction to start reading the next pbuf before we are done
6102 + * decoding and reconstructing the current pbuf. This keeps the L2ARC device
6103 + * busy with read IO while the CPU-intensive decode work proceeds.
6104 + * The returned zio contains a newly allocated memory buffer for the IO
6105 + * data, which the caller should free once the zio is no longer needed
6106 + * (i.e. once it has completed). To abort this zio, use
6107 + * l2arc_pbuf_prefetch_abort, which takes care of disposing of the
6108 + * allocated buffer correctly.
6109 + */
6110 +static zio_t *
6111 +l2arc_pbuf_prefetch(vdev_t *vd, uint64_t daddr, uint32_t asize)
6112 +{
6113 + uint32_t i, psize;
6114 + zio_t *pio, *hdr_io;
6115 + uint64_t hdr_rsize;
6116 + uint8_t *buf;
6117 + l2arc_prefetch_info_t *pinfo;
6118 +
6119 + psize = vdev_psize_to_asize(vd, asize);
6120 + buf = kmem_alloc(psize, KM_SLEEP);
6121 + pinfo = kmem_alloc(sizeof (l2arc_prefetch_info_t), KM_SLEEP);
6122 + pinfo->pi_buf = buf;
6123 + pinfo->pi_buflen = psize;
6124 +
6125 + /*
6126 + * We start issuing the IO for the pbuf header early. This
6127 + * allows l2arc_pbuf_read to start issuing IO for the next
6128 + * buffer before the current pbuf is read in completely.
6129 + */
6130 +
6131 + hdr_rsize = vdev_psize_to_asize(vd, SPA_MINBLOCKSIZE);
6132 + ASSERT(hdr_rsize <= psize);
6133 + pinfo->pi_hdr_io = zio_root(vd->vdev_spa, NULL, NULL,
6134 + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
6135 + ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY);
6136 + hdr_io = zio_read_phys(pinfo->pi_hdr_io, vd, daddr, hdr_rsize, buf,
6137 + ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_SYNC_READ,
6138 + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
6139 + ZIO_FLAG_DONT_RETRY, B_FALSE);
6140 + (void) zio_nowait(hdr_io);
6141 +
6142 + /*
6143 + * Read in the rest of the pbuf - this can take longer than just
6144 + * having a peek at the header.
6145 + */
6146 + pio = zio_root(vd->vdev_spa, NULL, pinfo, ZIO_FLAG_DONT_CACHE |
6147 + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
6148 + ZIO_FLAG_DONT_RETRY);
6149 + for (i = hdr_rsize; i < psize; ) {
6150 + uint64_t rsize = psize - i;
6151 + zio_t *rzio;
6152 +
6153 + if (psize - i > SPA_MAXBLOCKSIZE)
6154 + rsize = SPA_MAXBLOCKSIZE;
6155 + ASSERT(rsize >= SPA_MINBLOCKSIZE);
6156 + rzio = zio_read_phys(pio, vd, daddr + i,
6157 + rsize, buf + i, ZIO_CHECKSUM_OFF, NULL, NULL,
6158 + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_DONT_CACHE |
6159 + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
6160 + ZIO_FLAG_DONT_RETRY, B_FALSE);
6161 + (void) zio_nowait(rzio);
6162 + i += rsize;
6163 + }
6164 +
6165 + return (pio);
6166 +}
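
The read loop above never issues a single physical IO larger than SPA_MAXBLOCKSIZE; a minimal userland-style sketch of the same chunking pattern, with a hypothetical callback standing in for zio_read_phys():

	#include <stdint.h>

	typedef void (*issue_read_fn)(uint64_t offset, uint64_t len, void *arg);

	/* Split the region [start, start + total) into reads of at most max_chunk bytes. */
	static void
	issue_chunked_reads(uint64_t start, uint64_t total, uint64_t max_chunk,
	    issue_read_fn issue, void *arg)
	{
		uint64_t done;

		for (done = 0; done < total; ) {
			uint64_t len = total - done;

			if (len > max_chunk)
				len = max_chunk;
			issue(start + done, len, arg);
			done += len;
		}
	}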
6167 +
6168 +/*
6169 + * Aborts a zio returned from l2arc_pbuf_prefetch and frees the data
6170 + * buffers allocated for it.
6171 + */
6172 +static void
6173 +l2arc_pbuf_prefetch_abort(zio_t *zio)
6174 +{
6175 + l2arc_prefetch_info_t *pi;
6176 +
6177 + pi = zio->io_private;
6178 + ASSERT(pi != NULL);
6179 + if (pi->pi_hdr_io != NULL)
6180 + (void) zio_wait(pi->pi_hdr_io);
6181 + (void) zio_wait(zio);
6182 + kmem_free(pi->pi_buf, pi->pi_buflen);
6183 + pi->pi_buf = NULL;
6184 + kmem_free(pi, sizeof (l2arc_prefetch_info_t));
6185 +}
6186 +
6187 +/*
6188 + * Encodes an l2uberblock_t structure into a destination buffer. This
6189 + * buffer must be at least L2UBERBLOCK_SIZE bytes long. The resulting
6190 + * uberblock is always of this constant size.
6191 + */
6192 +static void
6193 +l2arc_uberblock_encode(const l2uberblock_t *ub, uint8_t *buf)
6194 +{
6195 + zio_cksum_t cksum;
6196 +
6197 + bzero(buf, L2UBERBLOCK_SIZE);
6198 +
6199 +#if defined(_BIG_ENDIAN)
6200 + *(uint32_t *)buf = L2UBERBLOCK_MAGIC;
6201 +	*(uint16_t *)(buf + 6) = L2UBLK_BIG_ENDIAN;
6202 +#else /* !defined(_BIG_ENDIAN) */
6203 + *(uint32_t *)buf = BSWAP_32(L2UBERBLOCK_MAGIC);
6204 + /* zero flags is ok */
6205 +#endif /* !defined(_BIG_ENDIAN) */
6206 + buf[4] = L2UBERBLOCK_MAX_VERSION;
6207 +
6208 + /* rest in native byte order */
6209 + *(uint64_t *)(buf + 8) = ub->ub_spa_guid;
6210 + *(uint64_t *)(buf + 16) = ub->ub_birth;
6211 + *(uint64_t *)(buf + 24) = ub->ub_evict_tail;
6212 + *(uint64_t *)(buf + 32) = ub->ub_alloc_space;
6213 + *(uint64_t *)(buf + 40) = ub->ub_pbuf_daddr;
6214 + *(uint32_t *)(buf + 48) = ub->ub_pbuf_asize;
6215 + bcopy(&ub->ub_pbuf_cksum, buf + 52, 32);
6216 +
6217 + fletcher_4_native(buf, L2UBERBLOCK_SIZE - 32, &cksum);
6218 + bcopy(&cksum, buf + L2UBERBLOCK_SIZE - 32, 32);
6219 +}
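
For reference, the byte layout implied by the stores above; the magic and flags are stored big-endian, everything from offset 8 onward is in the writer's native order, and L2UBERBLOCK_SIZE is defined elsewhere in this patch:

	/*
	 *	offset	field
	 *	 0	uint32	ub_magic	(L2UBERBLOCK_MAGIC)
	 *	 4	uint8	ub_version	(L2UBERBLOCK_MAX_VERSION)
	 *	 6	uint16	ub_flags
	 *	 8	uint64	ub_spa_guid
	 *	16	uint64	ub_birth
	 *	24	uint64	ub_evict_tail
	 *	32	uint64	ub_alloc_space
	 *	40	uint64	ub_pbuf_daddr
	 *	48	uint32	ub_pbuf_asize
	 *	52	zio_cksum_t (32 bytes)	ub_pbuf_cksum
	 *	...	zero padding, if any
	 *	L2UBERBLOCK_SIZE - 32	zio_cksum_t	ub_cksum
	 *		(fletcher4 over the preceding L2UBERBLOCK_SIZE - 32 bytes)
	 */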
6220 +
6221 +/*
6222 + * Decodes an l2uberblock_t from an on-disk representation. Please note
6223 + * that this function does not perform any uberblock validation and
6224 + * checksumming - call l2arc_uberblock_verify() for that.
6225 + */
6226 +static void
6227 +l2arc_uberblock_decode(const uint8_t *buf, l2uberblock_t *ub)
6228 +{
6229 + boolean_t bswap_needed;
6230 +
6231 + /* these always come in big endian */
6232 +#if defined(_BIG_ENDIAN)
6233 + ub->ub_magic = *(uint32_t *)buf;
6234 + ub->ub_flags = *(uint16_t *)(buf + 6);
6235 +	bswap_needed = ((ub->ub_flags & L2UBLK_BIG_ENDIAN) == 0);
6236 +#else /* !defined(_BIG_ENDIAN) */
6237 + ub->ub_magic = BSWAP_32(*(uint32_t *)buf);
6238 + ub->ub_flags = BSWAP_16(*(uint16_t *)(buf + 6));
6239 + bswap_needed = ((ub->ub_flags & L2UBLK_BIG_ENDIAN) != 0);
6240 +#endif /* !defined(_BIG_ENDIAN) */
6241 + ub->ub_version = buf[4];
6242 +
6243 + ub->ub_spa_guid = *(uint64_t *)(buf + 8);
6244 + ub->ub_birth = *(uint64_t *)(buf + 16);
6245 + ub->ub_evict_tail = *(uint64_t *)(buf + 24);
6246 + ub->ub_alloc_space = *(uint64_t *)(buf + 32);
6247 + ub->ub_pbuf_daddr = *(uint64_t *)(buf + 40);
6248 + ub->ub_pbuf_asize = *(uint32_t *)(buf + 48);
6249 +	bcopy(buf + 52, &ub->ub_pbuf_cksum, 32);
6250 + bcopy(buf + L2UBERBLOCK_SIZE - 32, &ub->ub_cksum, 32);
6251 +
6252 + /* swap the rest if endianness doesn't match us */
6253 + if (bswap_needed) {
6254 + ub->ub_spa_guid = BSWAP_64(ub->ub_spa_guid);
6255 + ub->ub_birth = BSWAP_64(ub->ub_birth);
6256 + ub->ub_evict_tail = BSWAP_64(ub->ub_evict_tail);
6257 + ub->ub_alloc_space = BSWAP_64(ub->ub_alloc_space);
6258 + ub->ub_pbuf_daddr = BSWAP_64(ub->ub_pbuf_daddr);
6259 + ub->ub_pbuf_asize = BSWAP_32(ub->ub_pbuf_asize);
6260 + ZIO_CHECKSUM_BSWAP(&ub->ub_pbuf_cksum);
6261 + ZIO_CHECKSUM_BSWAP(&ub->ub_cksum);
6262 + }
6263 +}
6264 +
6265 +/*
6266 + * Verifies whether a decoded uberblock (via l2arc_uberblock_decode()) is
6267 + * valid and matches its checksum.
6268 + */
6269 +static int
6270 +l2arc_uberblock_verify(const uint8_t *buf, const l2uberblock_t *ub,
6271 + uint64_t guid)
6272 +{
6273 + zio_cksum_t cksum;
6274 +
6275 + if (ub->ub_magic != L2UBERBLOCK_MAGIC ||
6276 + ub->ub_version == 0 || ub->ub_version > L2UBERBLOCK_MAX_VERSION)
6277 + /*
6278 + * bad magic or invalid version => persistent l2arc not
6279 + * supported
6280 + */
6281 + return (ENOTSUP);
6282 +
6283 + if (ub->ub_spa_guid != guid)
6284 + /* this l2arc dev isn't ours */
6285 + return (EINVAL);
6286 +
6287 + fletcher_4_native(buf, L2UBERBLOCK_SIZE - 32, &cksum);
6288 + if (!ZIO_CHECKSUM_EQUAL(cksum, ub->ub_cksum))
6289 + /* bad checksum, corrupt uberblock */
6290 + return (EINVAL);
6291 +
6292 + return (0);
6293 +}
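
A minimal sketch of how the decode and verify steps pair up on the read side, assuming `buf' holds L2UBERBLOCK_SIZE bytes read from offset VDEV_LABEL_START_SIZE of the cache device (the offset used by l2arc_uberblock_update below); the real rebuild path elsewhere in this patch may differ:

	l2uberblock_t ub;

	l2arc_uberblock_decode(buf, &ub);
	if ((err = l2arc_uberblock_verify(buf, &ub,
	    spa_guid(dev->l2ad_vdev->vdev_spa))) != 0)
		return (err);	/* ENOTSUP or EINVAL: not ours or corrupt */
	/* ub.ub_pbuf_daddr, ub_pbuf_asize and ub_pbuf_cksum seed the pbuf chain */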
6294 +
6295 +/*
6296 + * Schedules a zio to update the uberblock on an l2arc device. The zio is
6297 + * initiated as a child of `pio' and `cb' is filled with the information
6298 + * needed to free the uberblock data buffer after writing.
6299 + */
6300 +static void
6301 +l2arc_uberblock_update(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
6302 +{
6303 + uint8_t *ub_buf;
6304 + l2uberblock_t ub;
6305 + zio_t *wzio;
6306 + vdev_stat_t st;
6307 +
6308 + ASSERT(cb->l2wcb_ub_buf == NULL);
6309 + vdev_get_stats(dev->l2ad_vdev, &st);
6310 +
6311 + bzero(&ub, sizeof (ub));
6312 + ub.ub_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa);
6313 + ub.ub_birth = dev->l2ad_uberblock_birth++;
6314 + ub.ub_evict_tail = dev->l2ad_evict;
6315 + ub.ub_alloc_space = st.vs_alloc;
6316 + ub.ub_pbuf_daddr = dev->l2ad_pbuf_daddr;
6317 + ub.ub_pbuf_asize = dev->l2ad_pbuf_asize;
6318 + ub.ub_pbuf_cksum = dev->l2ad_pbuf_cksum;
6319 + if (dev->l2ad_first)
6320 + ub.ub_flags |= L2UBLK_EVICT_FIRST;
6321 +
6322 + ub_buf = kmem_alloc(L2UBERBLOCK_SIZE, KM_SLEEP);
6323 + cb->l2wcb_ub_buf = ub_buf;
6324 + l2arc_uberblock_encode(&ub, ub_buf);
6325 + wzio = zio_write_phys(pio, dev->l2ad_vdev, VDEV_LABEL_START_SIZE,
6326 + L2UBERBLOCK_SIZE, ub_buf, ZIO_CHECKSUM_OFF, NULL, NULL,
6327 + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
6328 + DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
6329 + zio_t *, wzio);
6330 + (void) zio_nowait(wzio);
6331 +}
6332 +
6333 +/*
6334 + * Encodes a l2pbuf_t structure into the portable on-disk format. The
6335 + * `buf' buffer must be suitably sized to hold the entire uncompressed
6336 + * structure (use L2PBUF_ENCODED_SIZE()). If requested, this function
6337 + * also compresses the buffer.
6338 + *
6339 + * The return value is the length of the resulting encoded pbuf structure.
6340 + * This can be either equal to L2PBUF_ENCODED_SIZE(pb) if no compression
6341 + * was applied, or smaller if compression was applied. In either case,
6342 + * prior to writing to disk, the caller must suitably pad the output
6343 + * buffer so that it is aligned on a multiple of the underlying storage
6344 + * system's block size.
6345 + */
6346 +static uint32_t
6347 +l2arc_pbuf_encode(l2pbuf_t *pb, uint8_t *buf, uint32_t buflen)
6348 +{
6349 + uint16_t flags = 0;
6350 + uint8_t *dst_buf;
6351 + uint32_t enclen;
6352 + l2pbuf_buflist_t *buflist;
6353 +
6354 + enclen = L2PBUF_ENCODED_SIZE(pb);
6355 + ASSERT(buflen >= enclen);
6356 + bzero(buf, enclen);
6357 +
6358 + /* non-header portions of pbufs are in native byte order */
6359 + *(uint64_t *)(buf + 8) = pb->pb_prev_daddr;
6360 + *(uint32_t *)(buf + 16) = pb->pb_prev_asize;
6361 + bcopy(&pb->pb_prev_cksum, buf + 20, 32);
6362 + *(uint32_t *)(buf + 52) = enclen - L2PBUF_HDR_SIZE;
6363 +
6364 + /* first we encode the buflists uncompressed */
6365 + dst_buf = buf + L2PBUF_HDR_SIZE;
6366 + for (buflist = list_head(pb->pb_buflists_list); buflist;
6367 + buflist = list_next(pb->pb_buflists_list, buflist)) {
6368 + int i;
6369 +
6370 + ASSERT(buflist->l2pbl_nbufs != 0);
6371 + for (i = 0; i < buflist->l2pbl_nbufs; i++) {
6372 + l2pbuf_buf_t *pbl_buf = &buflist->l2pbl_bufs[i];
6373 +
6374 + ASSERT(pbl_buf->b_size != 0);
6375 + *(uint64_t *)dst_buf = pbl_buf->b_dva.dva_word[0];
6376 + *(uint64_t *)(dst_buf + 8) = pbl_buf->b_dva.dva_word[1];
6377 + *(uint64_t *)(dst_buf + 16) = pbl_buf->b_birth;
6378 + *(uint64_t *)(dst_buf + 24) = pbl_buf->b_cksum0;
6379 + bcopy(&pbl_buf->b_freeze_cksum, dst_buf + 32, 32);
6380 + *(uint32_t *)(dst_buf + 64) = pbl_buf->b_size;
6381 + *(uint64_t *)(dst_buf + 68) = pbl_buf->b_l2daddr;
6382 + *(uint32_t *)(dst_buf + 76) = pbl_buf->b_l2asize;
6383 + dst_buf[80] = pbl_buf->b_l2compress;
6384 + dst_buf[81] = pbl_buf->b_contents_type;
6385 + *(uint32_t *)(dst_buf + 84) = pbl_buf->b_flags;
6386 + dst_buf += L2PBUF_BUF_SIZE;
6387 + }
6388 + }
6389 + ASSERT((uint32_t)(dst_buf - buf) == enclen);
6390 +
6391 + /* and then compress them if necessary */
6392 + if (enclen >= l2arc_pbuf_compress_minsz) {
6393 + uint8_t *cbuf;
6394 + size_t slen, clen;
6395 +
6396 + slen = l2arc_pbuf_items_encoded_size(pb);
6397 + cbuf = kmem_alloc(slen, KM_SLEEP);
6398 + clen = lz4_compress(buf + L2PBUF_HDR_SIZE, cbuf, slen, slen, 0);
6399 + ASSERT(clen != 0);
6400 + if (clen < slen) {
6401 + bcopy(cbuf, buf + L2PBUF_HDR_SIZE, clen);
6402 + flags |= L2PBUF_COMPRESSED;
6403 + /* zero out the rest of the input buffer */
6404 + bzero(buf + L2PBUF_HDR_SIZE + clen,
6405 + buflen - (L2PBUF_HDR_SIZE + clen));
6406 + /* adjust our buffer length now that it's shortened */
6407 + enclen = L2PBUF_HDR_SIZE + clen;
6408 + }
6409 + kmem_free(cbuf, slen);
6410 + }
6411 +
6412 + /* the header goes last since `flags' may change due to compression */
6413 +#if defined(_BIG_ENDIAN)
6414 + *(uint32_t *)buf = L2PBUF_MAGIC;
6415 + flags |= L2PBUF_BIG_ENDIAN;
6416 + *(uint16_t *)(buf + 6) = flags;
6417 +#else /* !defined(_BIG_ENDIAN) */
6418 + *(uint32_t *)buf = BSWAP_32(L2PBUF_MAGIC);
6419 + *(uint16_t *)(buf + 6) = BSWAP_16(flags);
6420 +#endif /* !defined(_BIG_ENDIAN) */
6421 + buf[4] = L2PBUF_MAX_VERSION;
6422 +
6423 + return (enclen);
6424 +}
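
For reference, the byte offsets implied by the stores above (the payload portion may subsequently be LZ4-compressed in place, as handled at the end of the function):

	/*
	 * Header (magic and flags big-endian, the rest native order):
	 *	 0	uint32	magic		(L2PBUF_MAGIC)
	 *	 4	uint8	version		(L2PBUF_MAX_VERSION)
	 *	 6	uint16	flags
	 *	 8	uint64	pb_prev_daddr
	 *	16	uint32	pb_prev_asize
	 *	20	zio_cksum_t (32 bytes)	pb_prev_cksum
	 *	52	uint32	uncompressed payload length
	 *
	 * Payload entries, one per cached buffer, L2PBUF_BUF_SIZE bytes each,
	 * starting at L2PBUF_HDR_SIZE:
	 *	 0	uint64	b_dva.dva_word[0]
	 *	 8	uint64	b_dva.dva_word[1]
	 *	16	uint64	b_birth
	 *	24	uint64	b_cksum0
	 *	32	zio_cksum_t (32 bytes)	b_freeze_cksum
	 *	64	uint32	b_size
	 *	68	uint64	b_l2daddr
	 *	76	uint32	b_l2asize
	 *	80	uint8	b_l2compress
	 *	81	uint8	b_contents_type
	 *	84	uint32	b_flags
	 */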
6425 +
6426 +/*
6427 + * Decodes a stored l2pbuf_t structure previously encoded using
6428 + * l2arc_pbuf_encode. The source buffer is not modified. The passed pbuf
6429 + * must be initialized by l2arc_pbuf_init by the caller beforehand, but
6430 + * must not have been used to store any buffers yet.
6431 + *
6432 + * Please note that we don't do checksum verification here, as we don't
6433 + * know our own checksum (that's known by the previous block in the linked
6434 + * list, or by the uberblock). This should be performed by the caller
6435 + * prior to calling l2arc_pbuf_decode.
6436 + */
6437 +static int
6438 +l2arc_pbuf_decode(uint8_t *input_buf, uint32_t buflen, l2pbuf_t *pb)
6439 +{
6440 + boolean_t bswap_needed;
6441 + uint32_t payload_sz, payload_asz;
6442 + uint8_t *src_bufs;
6443 + l2pbuf_buflist_t *buflist;
6444 + int i, nbufs;
6445 +
6446 + ASSERT(input_buf != NULL);
6447 + ASSERT(pb != NULL);
6448 + ASSERT(pb->pb_version != 0);
6449 + ASSERT(pb->pb_nbuflists == 0);
6450 +
6451 + /* no valid buffer can be this small */
6452 + if (buflen < L2PBUF_HDR_SIZE)
6453 + return (EINVAL);
6454 +
6455 + /* these always come in big endian */
6456 +#if defined(_BIG_ENDIAN)
6457 + pb->pb_magic = *(uint32_t *)input_buf;
6458 + pb->pb_flags = *(uint16_t *)(input_buf + 6);
6459 +	bswap_needed = ((pb->pb_flags & L2PBUF_BIG_ENDIAN) == 0);
6460 +#else /* !defined(_BIG_ENDIAN) */
6461 + pb->pb_magic = BSWAP_32(*(uint32_t *)input_buf);
6462 + pb->pb_flags = BSWAP_16(*(uint16_t *)(input_buf + 6));
6463 + bswap_needed = ((pb->pb_flags & L2PBUF_BIG_ENDIAN) != 0);
6464 +#endif /* !defined(_BIG_ENDIAN) */
6465 + pb->pb_version = input_buf[4];
6466 +
6467 + if (pb->pb_magic != L2PBUF_MAGIC || pb->pb_version == 0)
6468 + return (EINVAL);
6469 + if (pb->pb_version > L2PBUF_MAX_VERSION)
6470 + return (ENOTSUP);
6471 +
6472 + /* remainder of pbuf may need bswap'ping */
6473 + pb->pb_prev_daddr = *(uint64_t *)(input_buf + 8);
6474 +	pb->pb_prev_asize = *(uint32_t *)(input_buf + 16);
6475 + bcopy(input_buf + 20, &pb->pb_prev_cksum, 32);
6476 + payload_sz = *(uint32_t *)(input_buf + 52);
6477 + payload_asz = buflen - L2PBUF_HDR_SIZE;
6478 +
6479 + if (bswap_needed) {
6480 + pb->pb_prev_daddr = BSWAP_64(pb->pb_prev_daddr);
6481 +		pb->pb_prev_asize = BSWAP_32(pb->pb_prev_asize);
6482 + ZIO_CHECKSUM_BSWAP(&pb->pb_prev_cksum);
6483 + payload_sz = BSWAP_32(payload_sz);
6484 + }
6485 +
6486 + /* check for sensible buffer allocation limits */
6487 + if (((pb->pb_flags & L2PBUF_COMPRESSED) && payload_sz <= payload_asz) ||
6488 + (payload_sz > L2PBUF_MAX_PAYLOAD_SIZE) ||
6489 + (payload_sz % L2PBUF_BUF_SIZE) != 0 || payload_sz == 0)
6490 + return (EINVAL);
6491 + nbufs = payload_sz / L2PBUF_BUF_SIZE;
6492 +
6493 + /* decompression might be needed */
6494 + if (pb->pb_flags & L2PBUF_COMPRESSED) {
6495 + src_bufs = kmem_alloc(payload_sz, KM_SLEEP);
6496 + if (lz4_decompress(input_buf + L2PBUF_HDR_SIZE, src_bufs,
6497 + payload_asz, payload_sz, 0) != 0) {
6498 + kmem_free(src_bufs, payload_sz);
6499 + return (EINVAL);
6500 + }
6501 + } else {
6502 + src_bufs = input_buf + L2PBUF_HDR_SIZE;
6503 + }
6504 +
6505 + /* Decode individual pbuf items from our source buffer. */
6506 + buflist = l2arc_pbuf_buflist_alloc(pb, nbufs);
6507 + for (i = 0; i < nbufs; i++) {
6508 + l2pbuf_buf_t *pbl_buf = &buflist->l2pbl_bufs[i];
6509 + const uint8_t *src = src_bufs + i * L2PBUF_BUF_SIZE;
6510 +
6511 + pbl_buf->b_dva.dva_word[0] = *(uint64_t *)src;
6512 + pbl_buf->b_dva.dva_word[1] = *(uint64_t *)(src + 8);
6513 + pbl_buf->b_birth = *(uint64_t *)(src + 16);
6514 + pbl_buf->b_cksum0 = *(uint64_t *)(src + 24);
6515 + bcopy(src + 32, &pbl_buf->b_freeze_cksum, 32);
6516 + pbl_buf->b_size = *(uint32_t *)(src + 64);
6517 + pbl_buf->b_l2daddr = *(uint64_t *)(src + 68);
6518 + pbl_buf->b_l2asize = *(uint32_t *)(src + 76);
6519 + pbl_buf->b_l2compress = src[80];
6520 + pbl_buf->b_contents_type = src[81];
6521 + pbl_buf->b_flags = *(uint32_t *)(src + 84);
6522 +
6523 + if (bswap_needed) {
6524 + pbl_buf->b_dva.dva_word[0] =
6525 + BSWAP_64(pbl_buf->b_dva.dva_word[0]);
6526 + pbl_buf->b_dva.dva_word[1] =
6527 + BSWAP_64(pbl_buf->b_dva.dva_word[1]);
6528 + pbl_buf->b_birth = BSWAP_64(pbl_buf->b_birth);
6529 + pbl_buf->b_cksum0 = BSWAP_64(pbl_buf->b_cksum0);
6530 + ZIO_CHECKSUM_BSWAP(&pbl_buf->b_freeze_cksum);
6531 + pbl_buf->b_size = BSWAP_32(pbl_buf->b_size);
6532 + pbl_buf->b_l2daddr = BSWAP_64(pbl_buf->b_l2daddr);
6533 + pbl_buf->b_l2asize = BSWAP_32(pbl_buf->b_l2asize);
6534 + pbl_buf->b_flags = BSWAP_32(pbl_buf->b_flags);
6535 + }
6536 +
6537 + pb->pb_payload_asz += pbl_buf->b_l2asize;
6538 + }
6539 +
6540 + if (pb->pb_flags & L2PBUF_COMPRESSED)
6541 + kmem_free(src_bufs, payload_sz);
6542 +
6543 + return (0);
6544 +}
6545 +
6546 +/*
6547 + * Decodes the previous buffer pointer encoded in a pbuf. This is used
6548 + * during L2ARC reconstruction to "peek" at the next buffer and start
6549 + * issuing IO to fetch it early, before decoding of the current buffer
6550 + * is done (which can take time due to decompression).
6551 + * Returns 0 on success (and fills in the return parameters `daddr',
6552 + * `asize' and `cksum' with the info of the previous pbuf), and an errno
6553 + * on error.
6554 + */
6555 +static int
6556 +l2arc_pbuf_decode_prev_ptr(const uint8_t *buf, size_t buflen, uint64_t *daddr,
6557 + uint32_t *asize, zio_cksum_t *cksum)
6558 +{
6559 + boolean_t bswap_needed;
6560 + uint16_t version, flags;
6561 + uint32_t magic;
6562 +
6563 + ASSERT(buf != NULL);
6564 +
6565 + /* no valid buffer can be this small */
6566 + if (buflen <= L2PBUF_HDR_SIZE)
6567 + return (EINVAL);
6568 +
6569 + /* these always come in big endian */
6570 +#if defined(_BIG_ENDIAN)
6571 + magic = *(uint32_t *)buf;
6572 + flags = *(uint16_t *)(buf + 6);
6573 +	bswap_needed = ((flags & L2PBUF_BIG_ENDIAN) == 0);
6574 +#else /* !defined(_BIG_ENDIAN) */
6575 + magic = BSWAP_32(*(uint32_t *)buf);
6576 + flags = BSWAP_16(*(uint16_t *)(buf + 6));
6577 + bswap_needed = ((flags & L2PBUF_BIG_ENDIAN) != 0);
6578 +#endif /* !defined(_BIG_ENDIAN) */
6579 + version = buf[4];
6580 +
6581 + if (magic != L2PBUF_MAGIC || version == 0)
6582 + return (EINVAL);
6583 + if (version > L2PBUF_MAX_VERSION)
6584 + return (ENOTSUP);
6585 +
6586 +	*daddr = *(uint64_t *)(buf + 8);
6587 +	*asize = *(uint32_t *)(buf + 16);
6588 +	bcopy(buf + 20, cksum, 32);
6589 +
6590 + if (bswap_needed) {
6591 + *daddr = BSWAP_64(*daddr);
6592 +		*asize = BSWAP_32(*asize);
6593 + ZIO_CHECKSUM_BSWAP(cksum);
6594 + }
6595 +
6596 + return (0);
6597 +}
6598 +
6599 +/*
6600 + * Initializes a pbuf structure into a clean state. All version and flags
6601 + * fields are filled in as appropriate for this architecture.
6602 + * If the structure was used before, first call l2arc_pbuf_destroy on it,
6603 + * as this function assumes the structure is uninitialized.
6604 + */
6605 +static void
6606 +l2arc_pbuf_init(l2pbuf_t *pb)
6607 +{
6608 + bzero(pb, sizeof (l2pbuf_t));
6609 + pb->pb_version = L2PBUF_MAX_VERSION;
6610 +#if defined(_BIG_ENDIAN)
6611 +	pb->pb_flags |= L2PBUF_BIG_ENDIAN;
6612 +#endif
6613 + pb->pb_buflists_list = kmem_zalloc(sizeof (list_t), KM_SLEEP);
6614 + list_create(pb->pb_buflists_list, sizeof (l2pbuf_buflist_t),
6615 + offsetof(l2pbuf_buflist_t, l2pbl_node));
6616 +}
6617 +
6618 +/*
6619 + * Destroys a pbuf structure and puts it into a clean state ready to be
6620 + * initialized by l2arc_pbuf_init. All buflists created by
6621 + * l2arc_pbuf_buflist_alloc are released as well.
6622 + */
6623 +static void
6624 +l2arc_pbuf_destroy(l2pbuf_t *pb)
6625 +{
6626 + list_t *buflist_list = pb->pb_buflists_list;
6627 + l2pbuf_buflist_t *buflist;
6628 +
6629 + while ((buflist = list_head(buflist_list)) != NULL) {
6630 + ASSERT(buflist->l2pbl_nbufs > 0);
6631 + kmem_free(buflist->l2pbl_bufs, sizeof (l2pbuf_buf_t) *
6632 + buflist->l2pbl_nbufs);
6633 + list_remove(buflist_list, buflist);
6634 + kmem_free(buflist, sizeof (l2pbuf_buflist_t));
6635 + }
6636 + pb->pb_nbuflists = 0;
6637 + list_destroy(pb->pb_buflists_list);
6638 + kmem_free(pb->pb_buflists_list, sizeof (list_t));
6639 + bzero(pb, sizeof (l2pbuf_t));
6640 +}
6641 +
6642 +/*
6643 + * Allocates a new buflist inside of a pbuf, which can hold up to `nbufs'
6644 + * buffers. This is used during the buffer write cycle - each cycle allocates
6645 + * a new buflist and fills it with buffers it writes. Then, when the pbuf
6646 + * reaches its buflist limit, it is committed to stable storage.
6647 + */
6648 +static l2pbuf_buflist_t *
6649 +l2arc_pbuf_buflist_alloc(l2pbuf_t *pb, int nbufs)
6650 +{
6651 + l2pbuf_buflist_t *buflist;
6652 +
6653 + ASSERT(pb->pb_buflists_list != NULL);
6654 + buflist = kmem_zalloc(sizeof (l2pbuf_buflist_t), KM_SLEEP);
6655 + buflist->l2pbl_nbufs = nbufs;
6656 + buflist->l2pbl_bufs = kmem_zalloc(sizeof (l2pbuf_buf_t) * nbufs,
6657 + KM_SLEEP);
6658 + list_insert_tail(pb->pb_buflists_list, buflist);
6659 + pb->pb_nbuflists++;
6660 +
6661 + return (buflist);
6662 +}
6663 +
6664 +/*
6665 + * Inserts ARC buffer `ab' into the pbuf `pb' buflist `pbl' at index `idx'.
6666 + * The buffer being inserted must be present in L2ARC.
6667 + */
6668 +static void
6669 +l2arc_pbuflist_insert(l2pbuf_t *pb, l2pbuf_buflist_t *pbl,
6670 + const arc_buf_hdr_t *ab, int index)
6671 +{
6672 + l2pbuf_buf_t *pb_buf;
6673 + const l2arc_buf_hdr_t *l2hdr;
6674 +
6675 + l2hdr = ab->b_l2hdr;
6676 + ASSERT(l2hdr != NULL);
6677 + ASSERT(pbl->l2pbl_nbufs > index);
6678 +
6679 + pb_buf = &pbl->l2pbl_bufs[index];
6680 + pb_buf->b_dva = ab->b_dva;
6681 + pb_buf->b_birth = ab->b_birth;
6682 + pb_buf->b_cksum0 = ab->b_cksum0;
6683 + pb_buf->b_freeze_cksum = *ab->b_freeze_cksum;
6684 + pb_buf->b_size = ab->b_size;
6685 + pb_buf->b_l2daddr = l2hdr->b_daddr;
6686 + pb_buf->b_l2asize = l2hdr->b_asize;
6687 + pb_buf->b_l2compress = l2hdr->b_compress;
6688 + pb_buf->b_contents_type = ab->b_type;
6689 + pb_buf->b_flags = ab->b_flags & L2ARC_PERSIST_FLAGS;
6690 + pb->pb_payload_asz += l2hdr->b_asize;
6691 +}
6692 +
6693 +/*
6694 + * Commits a pbuf to stable storage. This routine is invoked when writing
6695 + * ARC buffers to an L2ARC device. When the pbuf associated with the device
6696 + * has reached its limits (either in size or in number of writes), it is
6697 + * scheduled here for writing.
6698 + * This function allocates some memory to temporarily hold the serialized
6699 + * buffer to be written. This is then released in l2arc_write_done.
6700 + */
6701 +static void
6702 +l2arc_pbuf_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
6703 +{
6704 + l2pbuf_t *pb = &dev->l2ad_pbuf;
6705 + uint64_t i, est_encsize, bufsize, encsize, io_size;
6706 + uint8_t *pb_buf;
6707 +
6708 + pb->pb_prev_daddr = dev->l2ad_pbuf_daddr;
6709 + pb->pb_prev_asize = dev->l2ad_pbuf_asize;
6710 + pb->pb_prev_cksum = dev->l2ad_pbuf_cksum;
6711 +
6712 + est_encsize = L2PBUF_ENCODED_SIZE(pb);
6713 + bufsize = vdev_psize_to_asize(dev->l2ad_vdev, est_encsize);
6714 + pb_buf = kmem_zalloc(bufsize, KM_SLEEP);
6715 + encsize = l2arc_pbuf_encode(pb, pb_buf, bufsize);
6716 + cb->l2wcb_pbuf = pb_buf;
6717 + cb->l2wcb_pbuf_size = bufsize;
6718 +
6719 + dev->l2ad_pbuf_daddr = dev->l2ad_hand;
6720 + dev->l2ad_pbuf_asize = encsize;
6721 + fletcher_4_native(pb_buf, encsize, &dev->l2ad_pbuf_cksum);
6722 +
6723 + io_size = vdev_psize_to_asize(dev->l2ad_vdev, encsize);
6724 + for (i = 0; i < io_size; ) {
6725 + zio_t *wzio;
6726 + uint64_t wsize = io_size - i;
6727 +
6728 + if (wsize > SPA_MAXBLOCKSIZE)
6729 + wsize = SPA_MAXBLOCKSIZE;
6730 + ASSERT(wsize >= SPA_MINBLOCKSIZE);
6731 + wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand + i,
6732 + wsize, pb_buf + i, ZIO_CHECKSUM_OFF, NULL, NULL,
6733 + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
6734 + DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
6735 + zio_t *, wzio);
6736 + (void) zio_nowait(wzio);
6737 + i += wsize;
6738 + }
6739 +
6740 + dev->l2ad_hand += io_size;
6741 + vdev_space_update(dev->l2ad_vdev, io_size, 0, 0);
6742 + l2arc_uberblock_update(dev, pio, cb);
6743 +
6744 + ARCSTAT_INCR(arcstat_l2_write_bytes, io_size);
6745 + ARCSTAT_BUMP(arcstat_l2_meta_writes);
6746 + ARCSTAT_F_AVG(arcstat_l2_meta_avg_size, est_encsize);
6747 + ARCSTAT_F_AVG(arcstat_l2_meta_avg_asize, encsize);
6748 + ARCSTAT_F_AVG(arcstat_l2_asize_to_meta_ratio,
6749 + pb->pb_payload_asz / encsize);
6750 +}
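
Taken together with l2arc_uberblock_update, each commit extends a backward-linked chain of metadata on the cache device; a sketch of the resulting layout:

	/*
	 *	uberblock (at VDEV_LABEL_START_SIZE)
	 *	  ub_pbuf_daddr -----> newest pbuf (written at the pre-commit l2ad_hand)
	 *	                         pb_prev_daddr -----> older pbuf
	 *	                                                pb_prev_daddr -----> ...
	 *
	 * Rebuild starts at the uberblock and walks the pb_prev_* pointers
	 * backward through write time, which is why l2arc_pbuf_read() prefetches
	 * the "previous" pbuf while the current one is being decoded.
	 */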
6751 +
6752 +/*
6753 + * Returns the number of bytes occupied by the payload buffer items of
6754 + * a pbuf in portable (on-disk) encoded form, i.e. the bytes following
6755 + * L2PBUF_HDR_SIZE.
6756 + */
6757 +static uint32_t
6758 +l2arc_pbuf_items_encoded_size(l2pbuf_t *pb)
6759 +{
6760 + uint32_t size = 0;
6761 + l2pbuf_buflist_t *buflist;
6762 +
6763 + for (buflist = list_head(pb->pb_buflists_list); buflist != NULL;
6764 + buflist = list_next(pb->pb_buflists_list, buflist))
6765 + size += L2PBUF_BUF_SIZE * buflist->l2pbl_nbufs;
6766 +
6767 + return (size);
5141 6768 }