ARC pressure valve implementation
--- old/usr/src/uts/common/fs/zfs/arc.c
+++ new/usr/src/uts/common/fs/zfs/arc.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
24 24 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
25 25 * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
26 26 * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
27 27 */
28 28
29 29 /*
30 30 * DVA-based Adjustable Replacement Cache
31 31 *
32 32 * While much of the theory of operation used here is
33 33 * based on the self-tuning, low overhead replacement cache
34 34 * presented by Megiddo and Modha at FAST 2003, there are some
35 35 * significant differences:
36 36 *
37 37 * 1. The Megiddo and Modha model assumes any page is evictable.
38 38 * Pages in its cache cannot be "locked" into memory. This makes
39 39 * the eviction algorithm simple: evict the last page in the list.
40 40 * This also makes the performance characteristics easy to reason
41 41 * about. Our cache is not so simple. At any given moment, some
42 42 * subset of the blocks in the cache are un-evictable because we
43 43 * have handed out a reference to them. Blocks are only evictable
44 44 * when there are no external references active. This makes
45 45 * eviction far more problematic: we choose to evict the evictable
46 46 * blocks that are the "lowest" in the list.
47 47 *
48 48 * There are times when it is not possible to evict the requested
49 49 * space. In these circumstances we are unable to adjust the cache
50 50 * size. To prevent the cache growing unbounded at these times we
51 51 * implement a "cache throttle" that slows the flow of new data
52 52 * into the cache until we can make space available.
53 53 *
54 54 * 2. The Megiddo and Modha model assumes a fixed cache size.
55 55 * Pages are evicted when the cache is full and there is a cache
56 56 * miss. Our model has a variable sized cache. It grows with
57 57 * high use, but also tries to react to memory pressure from the
58 58 * operating system: decreasing its size when system memory is
59 59 * tight.
60 60 *
61 61 * 3. The Megiddo and Modha model assumes a fixed page size. All
62 62 * elements of the cache are therefore exactly the same size. So
63 63 * when adjusting the cache size following a cache miss, it's simply
64 64 * a matter of choosing a single page to evict. In our model, we
65 65 * have variable sized cache blocks (ranging from 512 bytes to
66 66 * 128K bytes). We therefore choose a set of blocks to evict to make
67 67 * space for a cache miss that approximates as closely as possible
68 68 * the space used by the new block.
69 69 *
70 70 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
71 71 * by N. Megiddo & D. Modha, FAST 2003
72 72 */
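As an illustration of point 3 above, here is the rough shape of evicting variable-sized buffers until the requested space has been freed. This is only an editorial sketch: evict_one() and bytes_wanted are hypothetical names, and the file's real logic lives in the arc_evict_*() routines, which are not visible in this view.

	/* Illustrative sketch only, not part of the patch. */
	static uint64_t
	evict_until(list_t *list, uint64_t bytes_wanted)
	{
		uint64_t freed = 0;
		arc_buf_hdr_t *ab;

		/* walk from the "lowest" (tail) end of the eviction list */
		while (freed < bytes_wanted && (ab = list_tail(list)) != NULL) {
			freed += ab->b_size;	/* blocks are variable-sized */
			evict_one(ab);		/* hypothetical: evicts ab and removes it from the list */
		}
		return (freed);
	}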
73 73
74 74 /*
75 75 * The locking model:
76 76 *
77 77 * A new reference to a cache buffer can be obtained in two
78 78 * ways: 1) via a hash table lookup using the DVA as a key,
79 79 * or 2) via one of the ARC lists. The arc_read() interface
80 80 * uses method 1, while the internal arc algorithms for
81 81 * adjusting the cache use method 2. We therefore provide two
82 82 * types of locks: 1) the hash table lock array, and 2) the
83 83 * arc list locks.
84 84 *
85 85 * Buffers do not have their own mutexes, rather they rely on the
86 86 * hash table mutexes for the bulk of their protection (i.e. most
87 87 * fields in the arc_buf_hdr_t are protected by these mutexes).
88 88 *
89 89 * buf_hash_find() returns the appropriate mutex (held) when it
90 90 * locates the requested buffer in the hash table. It returns
91 91 * NULL for the mutex if the buffer was not in the table.
92 92 *
93 93 * buf_hash_remove() expects the appropriate hash mutex to be
94 94 * already held before it is invoked.
95 95 *
96 96 * Each arc state also has a mutex which is used to protect the
97 97 * buffer list associated with the state. When attempting to
98 98 * obtain a hash table lock while holding an arc list lock you
99 99 * must use mutex_tryenter() to avoid deadlock. Also note that
100 100 * the active state mutex must be held before the ghost state mutex.
101 101 *
102 102 * Arc buffers may have an associated eviction callback function.
103 103 * This function will be invoked prior to removing the buffer (e.g.
104 104 * in arc_do_user_evicts()). Note however that the data associated
105 105 * with the buffer may be evicted prior to the callback. The callback
106 106 * must be made with *no locks held* (to prevent deadlock). Additionally,
107 107 * the users of callbacks must ensure that their private data is
108 108 * protected from simultaneous callbacks from arc_clear_callback()
109 109 * and arc_do_user_evicts().
110 110 *
111 111 * Note that the majority of the performance stats are manipulated
112 112 * with atomic operations.
113 113 *
114 114 * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
115 115 *
116 116 * - L2ARC buflist creation
117 117 * - L2ARC buflist eviction
118 118 * - L2ARC write completion, which walks L2ARC buflists
119 119 * - ARC header destruction, as it removes from L2ARC buflists
120 120 * - ARC header release, as it removes from L2ARC buflists
121 121 */
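A minimal fragment illustrating the try-lock rule above (an editorial sketch; the eviction paths later in this file follow this pattern, see the arcstat_mutex_miss comment below):

	/* Fragment only: assumes the arc state's arcs_mtx is already held. */
	for (ab = list_tail(list); ab != NULL; ab = list_prev(list, ab)) {
		kmutex_t *hash_lock = HDR_LOCK(ab);

		if (!mutex_tryenter(hash_lock)) {
			/* hash lock busy: skip this buffer rather than deadlock */
			ARCSTAT_BUMP(arcstat_mutex_miss);
			continue;
		}
		/* ... examine or evict ab ... */
		mutex_exit(hash_lock);
	}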
122 122
123 123 #include <sys/spa.h>
124 124 #include <sys/zio.h>
125 125 #include <sys/zio_compress.h>
126 126 #include <sys/zfs_context.h>
127 127 #include <sys/arc.h>
128 128 #include <sys/refcount.h>
129 129 #include <sys/vdev.h>
130 130 #include <sys/vdev_impl.h>
131 131 #include <sys/dsl_pool.h>
132 132 #ifdef _KERNEL
133 133 #include <sys/vmsystm.h>
134 134 #include <vm/anon.h>
135 135 #include <sys/fs/swapnode.h>
136 136 #include <sys/dnlc.h>
137 137 #endif
138 138 #include <sys/callb.h>
139 139 #include <sys/kstat.h>
140 140 #include <zfs_fletcher.h>
141 141
142 142 #ifndef _KERNEL
143 143 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
144 144 boolean_t arc_watch = B_FALSE;
145 145 int arc_procfd;
146 146 #endif
147 147
148 148 static kmutex_t arc_reclaim_thr_lock;
149 149 static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
150 150 static uint8_t arc_thread_exit;
151 151
152 +static kmutex_t arc_pressure_thr_lock;
153 +static kcondvar_t arc_pressure_thr_cv;
154 +static uint8_t arc_pressure_thread_exit;
155 +static uint64_t arc_pressure_threshold;
156 +
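The four statics added above are the plumbing for a new ARC pressure thread. Its body sits further down in the file and is not visible here; the sketch below is only an assumption about its general shape (a conventional CPR-safe kernel thread loop, similar to the existing arc_reclaim thread), not the patch's actual code.

	/* Hypothetical sketch; the name, timing and body are assumptions. */
	static void
	arc_pressure_thread_sketch(void)
	{
		callb_cpr_t cpr;

		CALLB_CPR_INIT(&cpr, &arc_pressure_thr_lock, callb_generic_cpr, FTAG);
		mutex_enter(&arc_pressure_thr_lock);
		while (arc_pressure_thread_exit == 0) {
			/*
			 * Compare recent ARC growth against arc_pressure_threshold
			 * and shrink (or signal the reclaim thread) when exceeded.
			 */
			CALLB_CPR_SAFE_BEGIN(&cpr);
			(void) cv_timedwait(&arc_pressure_thr_cv,
			    &arc_pressure_thr_lock, ddi_get_lbolt() + hz);
			CALLB_CPR_SAFE_END(&cpr, &arc_pressure_thr_lock);
		}
		arc_pressure_thread_exit = 0;
		cv_broadcast(&arc_pressure_thr_cv);
		CALLB_CPR_EXIT(&cpr);		/* drops arc_pressure_thr_lock */
		thread_exit();
	}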
152 157 #define ARC_REDUCE_DNLC_PERCENT 3
153 158 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
154 159
155 160 typedef enum arc_reclaim_strategy {
156 161 ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
157 162 ARC_RECLAIM_CONS /* Conservative reclaim strategy */
158 163 } arc_reclaim_strategy_t;
159 164
160 165 /*
161 166 * The number of iterations through arc_evict_*() before we
162 167 * drop & reacquire the lock.
163 168 */
164 169 int arc_evict_iterations = 100;
165 170
166 171 /* number of seconds before growing cache again */
167 172 static int arc_grow_retry = 60;
168 173
169 174 /* shift of arc_c for calculating both min and max arc_p */
170 175 static int arc_p_min_shift = 4;
171 176
172 177 /* log2(fraction of arc to reclaim) */
173 178 static int arc_shrink_shift = 5;
174 179
175 180 /*
176 181 * minimum lifespan of a prefetch block in clock ticks
177 182 * (initialized in arc_init())
178 183 */
179 184 static int arc_min_prefetch_lifespan;
180 185
181 186 /*
182 187 * If this percent of memory is free, don't throttle.
183 188 */
184 189 int arc_lotsfree_percent = 10;
185 190
186 191 static int arc_dead;
187 192
188 193 /*
189 194 * The arc has filled available memory and has now warmed up.
190 195 */
191 196 static boolean_t arc_warm;
192 197
193 198 /*
194 199 * These tunables are for performance analysis.
195 200 */
196 201 uint64_t zfs_arc_max;
197 202 uint64_t zfs_arc_min;
198 203 uint64_t zfs_arc_meta_limit = 0;
199 204 int zfs_arc_grow_retry = 0;
200 205 int zfs_arc_shrink_shift = 0;
201 206 int zfs_arc_p_min_shift = 0;
202 207 int zfs_disable_dup_eviction = 0;
203 208 int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
204 209
205 210 /*
206 211 * Note that buffers can be in one of 6 states:
207 212 * ARC_anon - anonymous (discussed below)
208 213 * ARC_mru - recently used, currently cached
209 214 * ARC_mru_ghost - recently used, no longer in cache
210 215 * ARC_mfu - frequently used, currently cached
211 216 * ARC_mfu_ghost - frequently used, no longer in cache
212 217 * ARC_l2c_only - exists in L2ARC but not other states
213 218 * When there are no active references to the buffer, they are
214 219 * linked onto a list in one of these arc states. These are
215 220 * the only buffers that can be evicted or deleted. Within each
216 221 * state there are multiple lists, one for meta-data and one for
217 222 * non-meta-data. Meta-data (indirect blocks, blocks of dnodes,
218 223 * etc.) is tracked separately so that it can be managed more
219 224 * explicitly: favored over data, limited explicitly.
220 225 *
221 226 * Anonymous buffers are buffers that are not associated with
222 227 * a DVA. These are buffers that hold dirty block copies
223 228 * before they are written to stable storage. By definition,
224 229 * they are "ref'd" and are considered part of arc_mru
225 230 * that cannot be freed. Generally, they will acquire a DVA
226 231 * as they are written and migrate onto the arc_mru list.
227 232 *
228 233 * The ARC_l2c_only state is for buffers that are in the second
229 234 * level ARC but no longer in any of the ARC_m* lists. The second
230 235 * level ARC itself may also contain buffers that are in any of
231 236 * the ARC_m* states - meaning that a buffer can exist in two
232 237 * places. The reason for the ARC_l2c_only state is to keep the
233 238 * buffer header in the hash table, so that reads that hit the
234 239 * second level ARC benefit from these fast lookups.
235 240 */
236 241
237 242 typedef struct arc_state {
238 243 list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */
239 244 uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
240 245 uint64_t arcs_size; /* total amount of data in this state */
241 246 kmutex_t arcs_mtx;
242 247 } arc_state_t;
243 248
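To make the per-type bookkeeping above concrete, here is a small illustrative helper (hypothetical, not part of the file) that totals a state's evictable bytes across its two list types:

	static uint64_t
	arc_state_evictable_bytes(arc_state_t *state)
	{
		/* each state keeps separate metadata and data eviction lists */
		return (state->arcs_lsize[ARC_BUFC_METADATA] +
		    state->arcs_lsize[ARC_BUFC_DATA]);
	}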
244 249 /* The 6 states: */
245 250 static arc_state_t ARC_anon;
246 251 static arc_state_t ARC_mru;
247 252 static arc_state_t ARC_mru_ghost;
248 253 static arc_state_t ARC_mfu;
249 254 static arc_state_t ARC_mfu_ghost;
250 255 static arc_state_t ARC_l2c_only;
251 256
252 257 typedef struct arc_stats {
253 258 kstat_named_t arcstat_hits;
254 259 kstat_named_t arcstat_misses;
255 260 kstat_named_t arcstat_demand_data_hits;
256 261 kstat_named_t arcstat_demand_data_misses;
257 262 kstat_named_t arcstat_demand_metadata_hits;
258 263 kstat_named_t arcstat_demand_metadata_misses;
259 264 kstat_named_t arcstat_prefetch_data_hits;
260 265 kstat_named_t arcstat_prefetch_data_misses;
261 266 kstat_named_t arcstat_prefetch_metadata_hits;
262 267 kstat_named_t arcstat_prefetch_metadata_misses;
263 268 kstat_named_t arcstat_mru_hits;
264 269 kstat_named_t arcstat_mru_ghost_hits;
265 270 kstat_named_t arcstat_mfu_hits;
266 271 kstat_named_t arcstat_mfu_ghost_hits;
267 272 kstat_named_t arcstat_deleted;
268 273 kstat_named_t arcstat_recycle_miss;
269 274 /*
270 275 * Number of buffers that could not be evicted because the hash lock
271 276 * was held by another thread. The lock may not necessarily be held
272 277 * by something using the same buffer, since hash locks are shared
273 278 * by multiple buffers.
274 279 */
275 280 kstat_named_t arcstat_mutex_miss;
276 281 /*
277 282 * Number of buffers skipped because they have I/O in progress, are
278 283 * indirect prefetch buffers that have not lived long enough, or are
279 284 * not from the spa we're trying to evict from.
280 285 */
281 286 kstat_named_t arcstat_evict_skip;
282 287 kstat_named_t arcstat_evict_l2_cached;
283 288 kstat_named_t arcstat_evict_l2_eligible;
284 289 kstat_named_t arcstat_evict_l2_ineligible;
285 290 kstat_named_t arcstat_hash_elements;
286 291 kstat_named_t arcstat_hash_elements_max;
287 292 kstat_named_t arcstat_hash_collisions;
288 293 kstat_named_t arcstat_hash_chains;
289 294 kstat_named_t arcstat_hash_chain_max;
290 295 kstat_named_t arcstat_p;
291 296 kstat_named_t arcstat_c;
292 297 kstat_named_t arcstat_c_min;
293 298 kstat_named_t arcstat_c_max;
294 299 kstat_named_t arcstat_size;
295 300 kstat_named_t arcstat_hdr_size;
296 301 kstat_named_t arcstat_data_size;
297 302 kstat_named_t arcstat_other_size;
303 + kstat_named_t arcstat_growth_rate;
298 304 kstat_named_t arcstat_l2_hits;
299 305 kstat_named_t arcstat_l2_misses;
300 306 kstat_named_t arcstat_l2_feeds;
301 307 kstat_named_t arcstat_l2_rw_clash;
302 308 kstat_named_t arcstat_l2_read_bytes;
303 309 kstat_named_t arcstat_l2_write_bytes;
304 310 kstat_named_t arcstat_l2_writes_sent;
305 311 kstat_named_t arcstat_l2_writes_done;
306 312 kstat_named_t arcstat_l2_writes_error;
307 313 kstat_named_t arcstat_l2_writes_hdr_miss;
308 314 kstat_named_t arcstat_l2_evict_lock_retry;
309 315 kstat_named_t arcstat_l2_evict_reading;
310 316 kstat_named_t arcstat_l2_free_on_write;
311 317 kstat_named_t arcstat_l2_abort_lowmem;
312 318 kstat_named_t arcstat_l2_cksum_bad;
313 319 kstat_named_t arcstat_l2_io_error;
314 320 kstat_named_t arcstat_l2_size;
315 321 kstat_named_t arcstat_l2_asize;
316 322 kstat_named_t arcstat_l2_hdr_size;
317 323 kstat_named_t arcstat_l2_compress_successes;
318 324 kstat_named_t arcstat_l2_compress_zeros;
319 325 kstat_named_t arcstat_l2_compress_failures;
320 326 kstat_named_t arcstat_memory_throttle_count;
321 327 kstat_named_t arcstat_duplicate_buffers;
322 328 kstat_named_t arcstat_duplicate_buffers_size;
323 329 kstat_named_t arcstat_duplicate_reads;
324 330 kstat_named_t arcstat_meta_used;
325 331 kstat_named_t arcstat_meta_limit;
326 332 kstat_named_t arcstat_meta_max;
327 333 } arc_stats_t;
328 334
329 335 static arc_stats_t arc_stats = {
330 336 { "hits", KSTAT_DATA_UINT64 },
331 337 { "misses", KSTAT_DATA_UINT64 },
332 338 { "demand_data_hits", KSTAT_DATA_UINT64 },
333 339 { "demand_data_misses", KSTAT_DATA_UINT64 },
334 340 { "demand_metadata_hits", KSTAT_DATA_UINT64 },
335 341 { "demand_metadata_misses", KSTAT_DATA_UINT64 },
336 342 { "prefetch_data_hits", KSTAT_DATA_UINT64 },
337 343 { "prefetch_data_misses", KSTAT_DATA_UINT64 },
338 344 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
339 345 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
340 346 { "mru_hits", KSTAT_DATA_UINT64 },
341 347 { "mru_ghost_hits", KSTAT_DATA_UINT64 },
342 348 { "mfu_hits", KSTAT_DATA_UINT64 },
343 349 { "mfu_ghost_hits", KSTAT_DATA_UINT64 },
344 350 { "deleted", KSTAT_DATA_UINT64 },
345 351 { "recycle_miss", KSTAT_DATA_UINT64 },
346 352 { "mutex_miss", KSTAT_DATA_UINT64 },
347 353 { "evict_skip", KSTAT_DATA_UINT64 },
348 354 { "evict_l2_cached", KSTAT_DATA_UINT64 },
349 355 { "evict_l2_eligible", KSTAT_DATA_UINT64 },
350 356 { "evict_l2_ineligible", KSTAT_DATA_UINT64 },
351 357 { "hash_elements", KSTAT_DATA_UINT64 },
352 358 { "hash_elements_max", KSTAT_DATA_UINT64 },
353 359 { "hash_collisions", KSTAT_DATA_UINT64 },
354 360 { "hash_chains", KSTAT_DATA_UINT64 },
355 361 { "hash_chain_max", KSTAT_DATA_UINT64 },
356 362 { "p", KSTAT_DATA_UINT64 },
357 363 { "c", KSTAT_DATA_UINT64 },
358 364 { "c_min", KSTAT_DATA_UINT64 },
359 365 { "c_max", KSTAT_DATA_UINT64 },
360 366 { "size", KSTAT_DATA_UINT64 },
361 367 { "hdr_size", KSTAT_DATA_UINT64 },
362 368 { "data_size", KSTAT_DATA_UINT64 },
363 369 { "other_size", KSTAT_DATA_UINT64 },
370 + { "growth_rate", KSTAT_DATA_UINT64 },
364 371 { "l2_hits", KSTAT_DATA_UINT64 },
365 372 { "l2_misses", KSTAT_DATA_UINT64 },
366 373 { "l2_feeds", KSTAT_DATA_UINT64 },
367 374 { "l2_rw_clash", KSTAT_DATA_UINT64 },
368 375 { "l2_read_bytes", KSTAT_DATA_UINT64 },
369 376 { "l2_write_bytes", KSTAT_DATA_UINT64 },
370 377 { "l2_writes_sent", KSTAT_DATA_UINT64 },
371 378 { "l2_writes_done", KSTAT_DATA_UINT64 },
372 379 { "l2_writes_error", KSTAT_DATA_UINT64 },
373 380 { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 },
374 381 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
375 382 { "l2_evict_reading", KSTAT_DATA_UINT64 },
376 383 { "l2_free_on_write", KSTAT_DATA_UINT64 },
377 384 { "l2_abort_lowmem", KSTAT_DATA_UINT64 },
378 385 { "l2_cksum_bad", KSTAT_DATA_UINT64 },
379 386 { "l2_io_error", KSTAT_DATA_UINT64 },
380 387 { "l2_size", KSTAT_DATA_UINT64 },
381 388 { "l2_asize", KSTAT_DATA_UINT64 },
382 389 { "l2_hdr_size", KSTAT_DATA_UINT64 },
383 390 { "l2_compress_successes", KSTAT_DATA_UINT64 },
384 391 { "l2_compress_zeros", KSTAT_DATA_UINT64 },
385 392 { "l2_compress_failures", KSTAT_DATA_UINT64 },
386 393 { "memory_throttle_count", KSTAT_DATA_UINT64 },
387 394 { "duplicate_buffers", KSTAT_DATA_UINT64 },
388 395 { "duplicate_buffers_size", KSTAT_DATA_UINT64 },
389 396 { "duplicate_reads", KSTAT_DATA_UINT64 },
390 397 { "arc_meta_used", KSTAT_DATA_UINT64 },
391 398 { "arc_meta_limit", KSTAT_DATA_UINT64 },
392 399 { "arc_meta_max", KSTAT_DATA_UINT64 }
393 400 };
394 401
395 402 #define ARCSTAT(stat) (arc_stats.stat.value.ui64)
396 403
397 404 #define ARCSTAT_INCR(stat, val) \
398 405 atomic_add_64(&arc_stats.stat.value.ui64, (val))
399 406
400 407 #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
401 408 #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)
402 409
403 410 #define ARCSTAT_MAX(stat, val) { \
404 411 uint64_t m; \
405 412 while ((val) > (m = arc_stats.stat.value.ui64) && \
406 413 (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
407 414 continue; \
408 415 }
409 416
410 417 #define ARCSTAT_MAXSTAT(stat) \
411 418 ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
412 419
413 420 /*
414 421 * We define a macro to allow ARC hits/misses to be easily broken down by
415 422 * two separate conditions, giving a total of four different subtypes for
416 423 * each of hits and misses (so eight statistics total).
417 424 */
418 425 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
419 426 if (cond1) { \
420 427 if (cond2) { \
421 428 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
422 429 } else { \
423 430 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
424 431 } \
425 432 } else { \
426 433 if (cond2) { \
427 434 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
428 435 } else { \
429 436 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
430 437 } \
431 438 }
432 439
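For reference, the call in arc_buf_add_ref() later in this file shows how the macro splits a single hit across four counters (demand vs. prefetch, data vs. metadata):

	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
	    data, metadata, hits);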
440 +/*
441 + * This macro allows us to use kstats as floating averages. Each time we
442 + * This macro allows us to maintain kstats as floating averages. Each
443 + * update subtracts 1/factor of the stored value and adds 1/factor of the
444 + * new sample, i.e. an exponentially weighted moving average. This
445 + * safe for multiple writers updating the kstat in parallel (only the
446 + * last writer's update will remain).
447 + */
448 +#define ARCSTAT_F_AVG(stat, value, factor) \
449 + do { \
450 + uint64_t x = ARCSTAT(stat); \
451 + x = x - x / factor + (value) / factor; \
452 + ARCSTAT(stat) = x; \
453 + _NOTE(NOTREACHED) \
454 + _NOTE(CONSTCOND) \
455 + } while (0)
456 +
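A worked example of the floating average (values chosen purely for illustration): with factor = 4, a stored value of 1000 and a new sample of 2000 give 1000 - 1000/4 + 2000/4 = 1250, and feeding the same sample repeatedly converges toward 2000. Presumably the new growth_rate kstat is maintained this way elsewhere in the file, perhaps fed by the arc_bytes_allocd counter declared further down; the call below is a hypothetical sketch of such a use, not the patch's actual call site or factor.

	ARCSTAT_F_AVG(arcstat_growth_rate, arc_bytes_allocd, 4);	/* hypothetical */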
433 457 kstat_t *arc_ksp;
434 458 static arc_state_t *arc_anon;
435 459 static arc_state_t *arc_mru;
436 460 static arc_state_t *arc_mru_ghost;
437 461 static arc_state_t *arc_mfu;
438 462 static arc_state_t *arc_mfu_ghost;
439 463 static arc_state_t *arc_l2c_only;
440 464
441 465 /*
442 466 * There are several ARC variables that are critical to export as kstats --
443 467 * but we don't want to have to grovel around in the kstat whenever we wish to
444 468 * manipulate them. For these variables, we therefore define them to be in
445 469 * terms of the statistic variable. This assures that we are not introducing
446 470 * the possibility of inconsistency by having shadow copies of the variables,
447 471 * while still allowing the code to be readable.
448 472 */
449 473 #define arc_size ARCSTAT(arcstat_size) /* actual total arc size */
450 474 #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
451 475 #define arc_c ARCSTAT(arcstat_c) /* target size of cache */
452 476 #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
453 477 #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
454 478 #define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */
455 479 #define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */
456 480 #define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */
457 481
458 482 #define L2ARC_IS_VALID_COMPRESS(_c_) \
459 483 ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
460 484
461 485 static int arc_no_grow; /* Don't try to grow cache size */
462 486 static uint64_t arc_tempreserve;
463 487 static uint64_t arc_loaned_bytes;
488 +static uint64_t arc_bytes_allocd = 0;
464 489
465 490 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
466 491
467 492 typedef struct arc_callback arc_callback_t;
468 493
469 494 struct arc_callback {
470 495 void *acb_private;
471 496 arc_done_func_t *acb_done;
472 497 arc_buf_t *acb_buf;
473 498 zio_t *acb_zio_dummy;
474 499 arc_callback_t *acb_next;
475 500 };
476 501
477 502 typedef struct arc_write_callback arc_write_callback_t;
478 503
479 504 struct arc_write_callback {
480 505 void *awcb_private;
481 506 arc_done_func_t *awcb_ready;
482 507 arc_done_func_t *awcb_physdone;
483 508 arc_done_func_t *awcb_done;
484 509 arc_buf_t *awcb_buf;
485 510 };
486 511
487 512 struct arc_buf_hdr {
488 513 /* protected by hash lock */
489 514 dva_t b_dva;
490 515 uint64_t b_birth;
491 516 uint64_t b_cksum0;
492 517
493 518 kmutex_t b_freeze_lock;
494 519 zio_cksum_t *b_freeze_cksum;
495 520 void *b_thawed;
496 521
497 522 arc_buf_hdr_t *b_hash_next;
498 523 arc_buf_t *b_buf;
499 524 uint32_t b_flags;
500 525 uint32_t b_datacnt;
501 526
502 527 arc_callback_t *b_acb;
503 528 kcondvar_t b_cv;
504 529
505 530 /* immutable */
506 531 arc_buf_contents_t b_type;
507 532 uint64_t b_size;
508 533 uint64_t b_spa;
509 534
510 535 /* protected by arc state mutex */
511 536 arc_state_t *b_state;
512 537 list_node_t b_arc_node;
513 538
514 539 /* updated atomically */
515 540 clock_t b_arc_access;
516 541
517 542 /* self protecting */
518 543 refcount_t b_refcnt;
519 544
520 545 l2arc_buf_hdr_t *b_l2hdr;
521 546 list_node_t b_l2node;
522 547 };
523 548
524 549 static arc_buf_t *arc_eviction_list;
525 550 static kmutex_t arc_eviction_mtx;
526 551 static arc_buf_hdr_t arc_eviction_hdr;
527 552 static void arc_get_data_buf(arc_buf_t *buf);
528 553 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
529 554 static int arc_evict_needed(arc_buf_contents_t type);
530 555 static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
531 556 static void arc_buf_watch(arc_buf_t *buf);
532 557
533 558 static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
534 559
535 560 #define GHOST_STATE(state) \
536 561 ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
537 562 (state) == arc_l2c_only)
538 563
539 564 /*
540 565 * Private ARC flags. These flags are private ARC only flags that will show up
541 566 * in b_flags in the arc_buf_hdr_t. Some flags are publicly declared, and can
542 567 * be passed in as arc_flags in things like arc_read. However, these flags
543 568 * should never be passed and should only be set by ARC code. When adding new
544 569 * public flags, make sure not to smash the private ones.
545 570 */
546 571
547 572 #define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */
548 573 #define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */
549 574 #define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */
550 575 #define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */
551 576 #define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */
552 577 #define ARC_INDIRECT (1 << 14) /* this is an indirect block */
553 578 #define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */
554 579 #define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */
555 580 #define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */
556 581 #define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */
557 582
558 583 #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE)
559 584 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
560 585 #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR)
561 586 #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH)
562 587 #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ)
563 588 #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE)
564 589 #define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
565 590 #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE)
566 591 #define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \
567 592 (hdr)->b_l2hdr != NULL)
568 593 #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING)
569 594 #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED)
570 595 #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD)
571 596
572 597 /*
573 598 * Other sizes
574 599 */
575 600
576 601 #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
577 602 #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
578 603
579 604 /*
580 605 * Hash table routines
581 606 */
582 607
583 608 struct ht_table {
584 609 arc_buf_hdr_t *hdr;
585 610 kmutex_t lock;
586 611 };
587 612
588 613 typedef struct buf_hash_table {
589 614 uint64_t ht_mask;
590 615 struct ht_table *ht_table;
591 616 } buf_hash_table_t;
592 617
593 618 #pragma align 64(buf_hash_table)
594 619 static buf_hash_table_t buf_hash_table;
595 620
596 621 #define BUF_HASH_INDEX(spa, dva, birth) \
597 622 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
598 623 #define BUF_HASH_LOCK(idx) (&buf_hash_table.ht_table[idx].lock)
599 624 #define HDR_LOCK(hdr) \
600 625 (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
601 626
602 627 uint64_t zfs_crc64_table[256];
603 628
604 629 /*
605 630 * Level 2 ARC
606 631 */
607 632
608 633 #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */
609 634 #define L2ARC_HEADROOM 2 /* num of writes */
610 635 /*
611 636 * If we discover during ARC scan any buffers to be compressed, we boost
612 637 * our headroom for the next scanning cycle by this percentage multiple.
613 638 */
614 639 #define L2ARC_HEADROOM_BOOST 200
615 640 #define L2ARC_FEED_SECS 1 /* caching interval secs */
616 641 #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */
617 642
618 643 #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
619 644 #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
620 645
621 646 /* L2ARC Performance Tunables */
622 647 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */
623 648 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
624 649 uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */
625 650 uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
626 651 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
627 652 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
628 653 boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
629 654 boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */
630 655 boolean_t l2arc_norw = B_TRUE; /* no reads during writes */
631 656
632 657 /*
633 658 * L2ARC Internals
634 659 */
635 660 typedef struct l2arc_dev {
636 661 vdev_t *l2ad_vdev; /* vdev */
637 662 spa_t *l2ad_spa; /* spa */
638 663 uint64_t l2ad_hand; /* next write location */
639 664 uint64_t l2ad_start; /* first addr on device */
640 665 uint64_t l2ad_end; /* last addr on device */
641 666 uint64_t l2ad_evict; /* last addr eviction reached */
642 667 boolean_t l2ad_first; /* first sweep through */
643 668 boolean_t l2ad_writing; /* currently writing */
644 669 list_t *l2ad_buflist; /* buffer list */
645 670 list_node_t l2ad_node; /* device list node */
646 671 } l2arc_dev_t;
647 672
648 673 static list_t L2ARC_dev_list; /* device list */
649 674 static list_t *l2arc_dev_list; /* device list pointer */
650 675 static kmutex_t l2arc_dev_mtx; /* device list mutex */
651 676 static l2arc_dev_t *l2arc_dev_last; /* last device used */
652 677 static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */
653 678 static list_t L2ARC_free_on_write; /* free after write buf list */
654 679 static list_t *l2arc_free_on_write; /* free after write list ptr */
655 680 static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
656 681 static uint64_t l2arc_ndev; /* number of devices */
657 682
658 683 typedef struct l2arc_read_callback {
659 684 arc_buf_t *l2rcb_buf; /* read buffer */
660 685 spa_t *l2rcb_spa; /* spa */
661 686 blkptr_t l2rcb_bp; /* original blkptr */
662 687 zbookmark_phys_t l2rcb_zb; /* original bookmark */
663 688 int l2rcb_flags; /* original flags */
664 689 enum zio_compress l2rcb_compress; /* applied compress */
665 690 } l2arc_read_callback_t;
666 691
667 692 typedef struct l2arc_write_callback {
668 693 l2arc_dev_t *l2wcb_dev; /* device info */
669 694 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
670 695 } l2arc_write_callback_t;
671 696
672 697 struct l2arc_buf_hdr {
673 698 /* protected by arc_buf_hdr mutex */
674 699 l2arc_dev_t *b_dev; /* L2ARC device */
675 700 uint64_t b_daddr; /* disk address, offset byte */
676 701 /* compression applied to buffer data */
677 702 enum zio_compress b_compress;
678 703 /* real alloc'd buffer size depending on b_compress applied */
679 704 int b_asize;
680 705 /* temporary buffer holder for in-flight compressed data */
681 706 void *b_tmp_cdata;
682 707 };
683 708
684 709 typedef struct l2arc_data_free {
685 710 /* protected by l2arc_free_on_write_mtx */
686 711 void *l2df_data;
687 712 size_t l2df_size;
688 713 void (*l2df_func)(void *, size_t);
689 714 list_node_t l2df_list_node;
690 715 } l2arc_data_free_t;
691 716
692 717 static kmutex_t l2arc_feed_thr_lock;
693 718 static kcondvar_t l2arc_feed_thr_cv;
694 719 static uint8_t l2arc_thread_exit;
695 720
696 721 static void l2arc_read_done(zio_t *zio);
697 722 static void l2arc_hdr_stat_add(void);
698 723 static void l2arc_hdr_stat_remove(void);
699 724
700 725 static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
701 726 static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
702 727 enum zio_compress c);
703 728 static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
704 729
705 730 static uint64_t
706 731 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
707 732 {
708 733 uint8_t *vdva = (uint8_t *)dva;
709 734 uint64_t crc = -1ULL;
710 735 int i;
711 736
712 737 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
713 738
714 739 for (i = 0; i < sizeof (dva_t); i++)
715 740 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
716 741
717 742 crc ^= (spa>>8) ^ birth;
718 743
719 744 return (crc);
720 745 }
721 746
722 747 #define BUF_EMPTY(buf) \
723 748 ((buf)->b_dva.dva_word[0] == 0 && \
724 749 (buf)->b_dva.dva_word[1] == 0 && \
725 750 (buf)->b_cksum0 == 0)
726 751
727 752 #define BUF_EQUAL(spa, dva, birth, buf) \
728 753 ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
729 754 ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
730 755 ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
731 756
732 757 static void
733 758 buf_discard_identity(arc_buf_hdr_t *hdr)
734 759 {
735 760 hdr->b_dva.dva_word[0] = 0;
736 761 hdr->b_dva.dva_word[1] = 0;
737 762 hdr->b_birth = 0;
738 763 hdr->b_cksum0 = 0;
739 764 }
740 765
741 766 static arc_buf_hdr_t *
742 767 buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
743 768 {
744 769 const dva_t *dva = BP_IDENTITY(bp);
745 770 uint64_t birth = BP_PHYSICAL_BIRTH(bp);
746 771 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
747 772 kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
748 773 arc_buf_hdr_t *buf;
749 774
750 775 mutex_enter(hash_lock);
751 776 for (buf = buf_hash_table.ht_table[idx].hdr; buf != NULL;
752 777 buf = buf->b_hash_next) {
753 778 if (BUF_EQUAL(spa, dva, birth, buf)) {
754 779 *lockp = hash_lock;
755 780 return (buf);
756 781 }
757 782 }
758 783 mutex_exit(hash_lock);
759 784 *lockp = NULL;
760 785 return (NULL);
761 786 }
762 787
763 788 /*
764 789 * Insert an entry into the hash table. If there is already an element
765 790 * equal to elem in the hash table, then the already existing element
766 791 * will be returned and the new element will not be inserted.
767 792 * Otherwise returns NULL.
768 793 */
769 794 static arc_buf_hdr_t *
770 795 buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
771 796 {
772 797 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
773 798 kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
774 799 arc_buf_hdr_t *fbuf;
775 800 uint32_t i;
776 801
777 802 ASSERT(!DVA_IS_EMPTY(&buf->b_dva));
778 803 ASSERT(buf->b_birth != 0);
779 804 ASSERT(!HDR_IN_HASH_TABLE(buf));
780 805 *lockp = hash_lock;
781 806 mutex_enter(hash_lock);
782 807 for (fbuf = buf_hash_table.ht_table[idx].hdr, i = 0; fbuf != NULL;
783 808 fbuf = fbuf->b_hash_next, i++) {
784 809 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
785 810 return (fbuf);
786 811 }
787 812
788 813 buf->b_hash_next = buf_hash_table.ht_table[idx].hdr;
789 814 buf_hash_table.ht_table[idx].hdr = buf;
790 815 buf->b_flags |= ARC_IN_HASH_TABLE;
791 816
792 817 /* collect some hash table performance data */
793 818 if (i > 0) {
794 819 ARCSTAT_BUMP(arcstat_hash_collisions);
795 820 if (i == 1)
796 821 ARCSTAT_BUMP(arcstat_hash_chains);
797 822
798 823 ARCSTAT_MAX(arcstat_hash_chain_max, i);
799 824 }
800 825
801 826 ARCSTAT_BUMP(arcstat_hash_elements);
802 827 ARCSTAT_MAXSTAT(arcstat_hash_elements);
803 828
804 829 return (NULL);
805 830 }
806 831
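A sketch of the caller pattern implied by the comment above buf_hash_insert() (editorial; the real call sites are elsewhere in this file). The function returns with the hash lock held either way, and a non-NULL return means an equal header was already present:

	arc_buf_hdr_t *exists;
	kmutex_t *hash_lock;

	exists = buf_hash_insert(hdr, &hash_lock);	/* hdr: header to insert */
	if (exists != NULL) {
		/* another thread inserted the same block first; use 'exists' */
	}
	/* ... */
	mutex_exit(hash_lock);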
807 832 static void
808 833 buf_hash_remove(arc_buf_hdr_t *buf)
809 834 {
810 835 arc_buf_hdr_t *fbuf, **bufp;
811 836 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
812 837
813 838 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
814 839 ASSERT(HDR_IN_HASH_TABLE(buf));
815 840
816 841 bufp = &buf_hash_table.ht_table[idx].hdr;
817 842 while ((fbuf = *bufp) != buf) {
818 843 ASSERT(fbuf != NULL);
819 844 bufp = &fbuf->b_hash_next;
820 845 }
821 846 *bufp = buf->b_hash_next;
822 847 buf->b_hash_next = NULL;
823 848 buf->b_flags &= ~ARC_IN_HASH_TABLE;
824 849
825 850 /* collect some hash table performance data */
826 851 ARCSTAT_BUMPDOWN(arcstat_hash_elements);
827 852
828 853 if (buf_hash_table.ht_table[idx].hdr &&
829 854 buf_hash_table.ht_table[idx].hdr->b_hash_next == NULL)
830 855 ARCSTAT_BUMPDOWN(arcstat_hash_chains);
831 856 }
832 857
833 858 /*
834 859 * Global data structures and functions for the buf kmem cache.
835 860 */
836 861 static kmem_cache_t *hdr_cache;
837 862 static kmem_cache_t *buf_cache;
838 863
839 864 static void
840 865 buf_fini(void)
841 866 {
842 867 int i;
843 868
844 869 for (i = 0; i < buf_hash_table.ht_mask + 1; i++)
845 870 mutex_destroy(&buf_hash_table.ht_table[i].lock);
846 871 kmem_free(buf_hash_table.ht_table,
847 872 (buf_hash_table.ht_mask + 1) * sizeof (struct ht_table));
848 873 kmem_cache_destroy(hdr_cache);
849 874 kmem_cache_destroy(buf_cache);
850 875 }
851 876
852 877 /*
853 878 * Constructor callback - called when the cache is empty
854 879 * and a new buf is requested.
855 880 */
856 881 /* ARGSUSED */
857 882 static int
858 883 hdr_cons(void *vbuf, void *unused, int kmflag)
859 884 {
860 885 arc_buf_hdr_t *buf = vbuf;
861 886
862 887 bzero(buf, sizeof (arc_buf_hdr_t));
863 888 refcount_create(&buf->b_refcnt);
864 889 cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
865 890 mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
866 891 arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
867 892
868 893 return (0);
869 894 }
870 895
871 896 /* ARGSUSED */
872 897 static int
873 898 buf_cons(void *vbuf, void *unused, int kmflag)
874 899 {
875 900 arc_buf_t *buf = vbuf;
876 901
877 902 bzero(buf, sizeof (arc_buf_t));
878 903 mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
879 904 arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
880 905
881 906 return (0);
882 907 }
883 908
884 909 /*
885 910 * Destructor callback - called when a cached buf is
886 911 * no longer required.
887 912 */
888 913 /* ARGSUSED */
889 914 static void
890 915 hdr_dest(void *vbuf, void *unused)
891 916 {
892 917 arc_buf_hdr_t *buf = vbuf;
893 918
894 919 ASSERT(BUF_EMPTY(buf));
895 920 refcount_destroy(&buf->b_refcnt);
896 921 cv_destroy(&buf->b_cv);
897 922 mutex_destroy(&buf->b_freeze_lock);
898 923 arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
899 924 }
900 925
901 926 /* ARGSUSED */
902 927 static void
903 928 buf_dest(void *vbuf, void *unused)
904 929 {
905 930 arc_buf_t *buf = vbuf;
906 931
907 932 mutex_destroy(&buf->b_evict_lock);
908 933 arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
909 934 }
910 935
911 936 /*
912 937 * Reclaim callback -- invoked when memory is low.
913 938 */
914 939 /* ARGSUSED */
915 940 static void
916 941 hdr_recl(void *unused)
917 942 {
918 943 dprintf("hdr_recl called\n");
919 944 /*
920 945 * umem calls the reclaim func when we destroy the buf cache,
921 946 * which is after we do arc_fini().
922 947 */
923 948 if (!arc_dead)
924 949 cv_signal(&arc_reclaim_thr_cv);
925 950 }
926 951
927 952 static void
928 953 buf_init(void)
929 954 {
930 955 uint64_t *ct;
931 956 uint64_t hsize = 1ULL << 12;
932 957 int i, j;
933 958
934 959 /*
935 960 * The hash table is big enough to fill all of physical memory
936 961 * with an average block size of zfs_arc_average_blocksize (default 8K).
937 962 * By default, the table will take up
938 963 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
939 964 */
940 965 while (hsize * zfs_arc_average_blocksize < physmem * PAGESIZE)
941 966 hsize <<= 1;
942 967 retry:
943 968 buf_hash_table.ht_mask = hsize - 1;
944 969 buf_hash_table.ht_table =
945 970 kmem_zalloc(hsize * sizeof (struct ht_table), KM_NOSLEEP);
946 971 if (buf_hash_table.ht_table == NULL) {
947 972 ASSERT(hsize > (1ULL << 8));
948 973 hsize >>= 1;
949 974 goto retry;
950 975 }
951 976
952 977 hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
953 978 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
954 979 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
955 980 0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
956 981
957 982 for (i = 0; i < 256; i++)
958 983 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
959 984 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
960 985
961 986 for (i = 0; i < hsize; i++) {
962 987 mutex_init(&buf_hash_table.ht_table[i].lock,
963 988 NULL, MUTEX_DEFAULT, NULL);
964 989 }
965 990 }
966 991
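A worked example of the table-sizing loop in buf_init() above, assuming a 64 GiB machine (an illustrative figure, not from the patch) and the default 8 KiB zfs_arc_average_blocksize:

	/* hsize starts at 1ULL << 12 and doubles while hsize * 8192 < 64 GiB, */
	/* stopping at hsize = 1ULL << 23 (8,388,608 buckets, since */
	/* 2^23 * 2^13 = 2^36 bytes = 64 GiB). The table then occupies */
	/* hsize * sizeof (struct ht_table) bytes. */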
967 992 #define ARC_MINTIME (hz>>4) /* 62 ms */
968 993
969 994 static void
970 995 arc_cksum_verify(arc_buf_t *buf)
971 996 {
972 997 zio_cksum_t zc;
973 998
974 999 if (!(zfs_flags & ZFS_DEBUG_MODIFY))
975 1000 return;
976 1001
977 1002 mutex_enter(&buf->b_hdr->b_freeze_lock);
978 1003 if (buf->b_hdr->b_freeze_cksum == NULL ||
979 1004 (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
980 1005 mutex_exit(&buf->b_hdr->b_freeze_lock);
981 1006 return;
982 1007 }
983 1008 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
984 1009 if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
985 1010 panic("buffer modified while frozen!");
986 1011 mutex_exit(&buf->b_hdr->b_freeze_lock);
987 1012 }
988 1013
989 1014 static int
990 1015 arc_cksum_equal(arc_buf_t *buf)
991 1016 {
992 1017 zio_cksum_t zc;
993 1018 int equal;
994 1019
995 1020 mutex_enter(&buf->b_hdr->b_freeze_lock);
996 1021 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
997 1022 equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
998 1023 mutex_exit(&buf->b_hdr->b_freeze_lock);
999 1024
1000 1025 return (equal);
1001 1026 }
1002 1027
1003 1028 static void
1004 1029 arc_cksum_compute(arc_buf_t *buf, boolean_t force)
1005 1030 {
1006 1031 if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
1007 1032 return;
1008 1033
1009 1034 mutex_enter(&buf->b_hdr->b_freeze_lock);
1010 1035 if (buf->b_hdr->b_freeze_cksum != NULL) {
1011 1036 mutex_exit(&buf->b_hdr->b_freeze_lock);
1012 1037 return;
1013 1038 }
1014 1039 buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
1015 1040 fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
1016 1041 buf->b_hdr->b_freeze_cksum);
1017 1042 mutex_exit(&buf->b_hdr->b_freeze_lock);
1018 1043 arc_buf_watch(buf);
1019 1044 }
1020 1045
1021 1046 #ifndef _KERNEL
1022 1047 typedef struct procctl {
1023 1048 long cmd;
1024 1049 prwatch_t prwatch;
1025 1050 } procctl_t;
1026 1051 #endif
1027 1052
1028 1053 /* ARGSUSED */
1029 1054 static void
1030 1055 arc_buf_unwatch(arc_buf_t *buf)
1031 1056 {
1032 1057 #ifndef _KERNEL
1033 1058 if (arc_watch) {
1034 1059 int result;
1035 1060 procctl_t ctl;
1036 1061 ctl.cmd = PCWATCH;
1037 1062 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1038 1063 ctl.prwatch.pr_size = 0;
1039 1064 ctl.prwatch.pr_wflags = 0;
1040 1065 result = write(arc_procfd, &ctl, sizeof (ctl));
1041 1066 ASSERT3U(result, ==, sizeof (ctl));
1042 1067 }
1043 1068 #endif
1044 1069 }
1045 1070
1046 1071 /* ARGSUSED */
1047 1072 static void
1048 1073 arc_buf_watch(arc_buf_t *buf)
1049 1074 {
1050 1075 #ifndef _KERNEL
1051 1076 if (arc_watch) {
1052 1077 int result;
1053 1078 procctl_t ctl;
1054 1079 ctl.cmd = PCWATCH;
1055 1080 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1056 1081 ctl.prwatch.pr_size = buf->b_hdr->b_size;
1057 1082 ctl.prwatch.pr_wflags = WA_WRITE;
1058 1083 result = write(arc_procfd, &ctl, sizeof (ctl));
1059 1084 ASSERT3U(result, ==, sizeof (ctl));
1060 1085 }
1061 1086 #endif
1062 1087 }
1063 1088
1064 1089 void
1065 1090 arc_buf_thaw(arc_buf_t *buf)
1066 1091 {
1067 1092 if (zfs_flags & ZFS_DEBUG_MODIFY) {
1068 1093 if (buf->b_hdr->b_state != arc_anon)
1069 1094 panic("modifying non-anon buffer!");
1070 1095 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
1071 1096 panic("modifying buffer while i/o in progress!");
1072 1097 arc_cksum_verify(buf);
1073 1098 }
1074 1099
1075 1100 mutex_enter(&buf->b_hdr->b_freeze_lock);
1076 1101 if (buf->b_hdr->b_freeze_cksum != NULL) {
1077 1102 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1078 1103 buf->b_hdr->b_freeze_cksum = NULL;
1079 1104 }
1080 1105
1081 1106 if (zfs_flags & ZFS_DEBUG_MODIFY) {
1082 1107 if (buf->b_hdr->b_thawed)
1083 1108 kmem_free(buf->b_hdr->b_thawed, 1);
1084 1109 buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
1085 1110 }
1086 1111
1087 1112 mutex_exit(&buf->b_hdr->b_freeze_lock);
1088 1113
1089 1114 arc_buf_unwatch(buf);
1090 1115 }
1091 1116
1092 1117 void
1093 1118 arc_buf_freeze(arc_buf_t *buf)
1094 1119 {
1095 1120 kmutex_t *hash_lock;
1096 1121
1097 1122 if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1098 1123 return;
1099 1124
1100 1125 hash_lock = HDR_LOCK(buf->b_hdr);
1101 1126 mutex_enter(hash_lock);
1102 1127
1103 1128 ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1104 1129 buf->b_hdr->b_state == arc_anon);
1105 1130 arc_cksum_compute(buf, B_FALSE);
1106 1131 mutex_exit(hash_lock);
1107 1132
1108 1133 }
1109 1134
1110 1135 static void
1111 1136 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1112 1137 {
1113 1138 ASSERT(MUTEX_HELD(hash_lock));
1114 1139
1115 1140 if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
1116 1141 (ab->b_state != arc_anon)) {
1117 1142 uint64_t delta = ab->b_size * ab->b_datacnt;
1118 1143 list_t *list = &ab->b_state->arcs_list[ab->b_type];
1119 1144 uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
1120 1145
1121 1146 ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
1122 1147 mutex_enter(&ab->b_state->arcs_mtx);
1123 1148 ASSERT(list_link_active(&ab->b_arc_node));
1124 1149 list_remove(list, ab);
1125 1150 if (GHOST_STATE(ab->b_state)) {
1126 1151 ASSERT0(ab->b_datacnt);
1127 1152 ASSERT3P(ab->b_buf, ==, NULL);
1128 1153 delta = ab->b_size;
1129 1154 }
1130 1155 ASSERT(delta > 0);
1131 1156 ASSERT3U(*size, >=, delta);
1132 1157 atomic_add_64(size, -delta);
1133 1158 mutex_exit(&ab->b_state->arcs_mtx);
1134 1159 /* remove the prefetch flag if we get a reference */
1135 1160 if (ab->b_flags & ARC_PREFETCH)
1136 1161 ab->b_flags &= ~ARC_PREFETCH;
1137 1162 }
1138 1163 }
1139 1164
1140 1165 static int
1141 1166 remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1142 1167 {
1143 1168 int cnt;
1144 1169 arc_state_t *state = ab->b_state;
1145 1170
1146 1171 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1147 1172 ASSERT(!GHOST_STATE(state));
1148 1173
1149 1174 if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
1150 1175 (state != arc_anon)) {
1151 1176 uint64_t *size = &state->arcs_lsize[ab->b_type];
1152 1177
1153 1178 ASSERT(!MUTEX_HELD(&state->arcs_mtx));
1154 1179 mutex_enter(&state->arcs_mtx);
1155 1180 ASSERT(!list_link_active(&ab->b_arc_node));
1156 1181 list_insert_head(&state->arcs_list[ab->b_type], ab);
1157 1182 ASSERT(ab->b_datacnt > 0);
1158 1183 atomic_add_64(size, ab->b_size * ab->b_datacnt);
1159 1184 mutex_exit(&state->arcs_mtx);
1160 1185 }
1161 1186 return (cnt);
1162 1187 }
1163 1188
1164 1189 /*
1165 1190 * Move the supplied buffer to the indicated state. The mutex
1166 1191 * for the buffer must be held by the caller.
1167 1192 */
1168 1193 static void
1169 1194 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
1170 1195 {
1171 1196 arc_state_t *old_state = ab->b_state;
1172 1197 int64_t refcnt = refcount_count(&ab->b_refcnt);
1173 1198 uint64_t from_delta, to_delta;
1174 1199
1175 1200 ASSERT(MUTEX_HELD(hash_lock));
1176 1201 ASSERT3P(new_state, !=, old_state);
1177 1202 ASSERT(refcnt == 0 || ab->b_datacnt > 0);
1178 1203 ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
1179 1204 ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
1180 1205
1181 1206 from_delta = to_delta = ab->b_datacnt * ab->b_size;
1182 1207
1183 1208 /*
1184 1209 * If this buffer is evictable, transfer it from the
1185 1210 * old state list to the new state list.
1186 1211 */
1187 1212 if (refcnt == 0) {
1188 1213 if (old_state != arc_anon) {
1189 1214 int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
1190 1215 uint64_t *size = &old_state->arcs_lsize[ab->b_type];
1191 1216
1192 1217 if (use_mutex)
1193 1218 mutex_enter(&old_state->arcs_mtx);
1194 1219
1195 1220 ASSERT(list_link_active(&ab->b_arc_node));
1196 1221 list_remove(&old_state->arcs_list[ab->b_type], ab);
1197 1222
1198 1223 /*
1199 1224 * If prefetching out of the ghost cache,
1200 1225 * we will have a non-zero datacnt.
1201 1226 */
1202 1227 if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
1203 1228 /* ghost elements have a ghost size */
1204 1229 ASSERT(ab->b_buf == NULL);
1205 1230 from_delta = ab->b_size;
1206 1231 }
1207 1232 ASSERT3U(*size, >=, from_delta);
1208 1233 atomic_add_64(size, -from_delta);
1209 1234
1210 1235 if (use_mutex)
1211 1236 mutex_exit(&old_state->arcs_mtx);
1212 1237 }
1213 1238 if (new_state != arc_anon) {
1214 1239 int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
1215 1240 uint64_t *size = &new_state->arcs_lsize[ab->b_type];
1216 1241
1217 1242 if (use_mutex)
1218 1243 mutex_enter(&new_state->arcs_mtx);
1219 1244
1220 1245 list_insert_head(&new_state->arcs_list[ab->b_type], ab);
1221 1246
1222 1247 /* ghost elements have a ghost size */
1223 1248 if (GHOST_STATE(new_state)) {
1224 1249 ASSERT(ab->b_datacnt == 0);
1225 1250 ASSERT(ab->b_buf == NULL);
1226 1251 to_delta = ab->b_size;
1227 1252 }
1228 1253 atomic_add_64(size, to_delta);
1229 1254
1230 1255 if (use_mutex)
1231 1256 mutex_exit(&new_state->arcs_mtx);
1232 1257 }
1233 1258 }
1234 1259
1235 1260 ASSERT(!BUF_EMPTY(ab));
1236 1261 if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
1237 1262 buf_hash_remove(ab);
1238 1263
1239 1264 /* adjust state sizes */
1240 1265 if (to_delta)
1241 1266 atomic_add_64(&new_state->arcs_size, to_delta);
1242 1267 if (from_delta) {
1243 1268 ASSERT3U(old_state->arcs_size, >=, from_delta);
1244 1269 atomic_add_64(&old_state->arcs_size, -from_delta);
1245 1270 }
1246 1271 ab->b_state = new_state;
1247 1272
1248 1273 /* adjust l2arc hdr stats */
1249 1274 if (new_state == arc_l2c_only)
1250 1275 l2arc_hdr_stat_add();
1251 1276 else if (old_state == arc_l2c_only)
1252 1277 l2arc_hdr_stat_remove();
1253 1278 }
1254 1279
1255 1280 void
1256 1281 arc_space_consume(uint64_t space, arc_space_type_t type)
1257 1282 {
1258 1283 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1259 1284
1260 1285 switch (type) {
1261 1286 case ARC_SPACE_DATA:
1262 1287 ARCSTAT_INCR(arcstat_data_size, space);
1263 1288 break;
1264 1289 case ARC_SPACE_OTHER:
1265 1290 ARCSTAT_INCR(arcstat_other_size, space);
1266 1291 break;
1267 1292 case ARC_SPACE_HDRS:
1268 1293 ARCSTAT_INCR(arcstat_hdr_size, space);
1269 1294 break;
1270 1295 case ARC_SPACE_L2HDRS:
1271 1296 ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1272 1297 break;
1273 1298 }
1274 1299
1275 1300 ARCSTAT_INCR(arcstat_meta_used, space);
1276 1301 atomic_add_64(&arc_size, space);
1302 + atomic_add_64(&arc_bytes_allocd, space);
1277 1303 }
1278 1304
1279 1305 void
1280 1306 arc_space_return(uint64_t space, arc_space_type_t type)
1281 1307 {
1282 1308 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1283 1309
1284 1310 switch (type) {
1285 1311 case ARC_SPACE_DATA:
1286 1312 ARCSTAT_INCR(arcstat_data_size, -space);
1287 1313 break;
1288 1314 case ARC_SPACE_OTHER:
1289 1315 ARCSTAT_INCR(arcstat_other_size, -space);
1290 1316 break;
1291 1317 case ARC_SPACE_HDRS:
1292 1318 ARCSTAT_INCR(arcstat_hdr_size, -space);
1293 1319 break;
1294 1320 case ARC_SPACE_L2HDRS:
1295 1321 ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1296 1322 break;
1297 1323 }
1298 1324
1299 1325 ASSERT(arc_meta_used >= space);
1300 1326 if (arc_meta_max < arc_meta_used)
1301 1327 arc_meta_max = arc_meta_used;
1302 1328 ARCSTAT_INCR(arcstat_meta_used, -space);
1303 1329 ASSERT(arc_size >= space);
1304 1330 atomic_add_64(&arc_size, -space);
1305 1331 }
1306 1332
1307 1333 void *
1308 1334 arc_data_buf_alloc(uint64_t size)
1309 1335 {
1310 1336 if (arc_evict_needed(ARC_BUFC_DATA))
1311 1337 cv_signal(&arc_reclaim_thr_cv);
1312 1338 atomic_add_64(&arc_size, size);
1339 + atomic_add_64(&arc_bytes_allocd, size);
1313 1340 return (zio_data_buf_alloc(size));
1314 1341 }
1315 1342
1316 1343 void
1317 1344 arc_data_buf_free(void *buf, uint64_t size)
1318 1345 {
1319 1346 zio_data_buf_free(buf, size);
1320 1347 ASSERT(arc_size >= size);
1321 1348 atomic_add_64(&arc_size, -size);
1322 1349 }
1323 1350
1324 1351 arc_buf_t *
1325 1352 arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
1326 1353 {
1327 1354 arc_buf_hdr_t *hdr;
1328 1355 arc_buf_t *buf;
1329 1356
1330 1357 ASSERT3U(size, >, 0);
1331 1358 hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1332 1359 ASSERT(BUF_EMPTY(hdr));
1333 1360 hdr->b_size = size;
1334 1361 hdr->b_type = type;
1335 1362 hdr->b_spa = spa_load_guid(spa);
1336 1363 hdr->b_state = arc_anon;
1337 1364 hdr->b_arc_access = 0;
1338 1365 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1339 1366 buf->b_hdr = hdr;
1340 1367 buf->b_data = NULL;
1341 1368 buf->b_efunc = NULL;
1342 1369 buf->b_private = NULL;
1343 1370 buf->b_next = NULL;
1344 1371 hdr->b_buf = buf;
1345 1372 arc_get_data_buf(buf);
1346 1373 hdr->b_datacnt = 1;
1347 1374 hdr->b_flags = 0;
1348 1375 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1349 1376 (void) refcount_add(&hdr->b_refcnt, tag);
1350 1377
1351 1378 return (buf);
1352 1379 }
1353 1380
1354 1381 static char *arc_onloan_tag = "onloan";
1355 1382
1356 1383 /*
1357 1384 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1358 1385 * flight data by arc_tempreserve_space() until they are "returned". Loaned
1359 1386 * buffers must be returned to the arc before they can be used by the DMU or
1360 1387 * freed.
1361 1388 */
1362 1389 arc_buf_t *
1363 1390 arc_loan_buf(spa_t *spa, int size)
1364 1391 {
1365 1392 arc_buf_t *buf;
1366 1393
1367 1394 buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1368 1395
1369 1396 atomic_add_64(&arc_loaned_bytes, size);
1370 1397 return (buf);
1371 1398 }
1372 1399
1373 1400 /*
1374 1401 * Return a loaned arc buffer to the arc.
1375 1402 */
1376 1403 void
1377 1404 arc_return_buf(arc_buf_t *buf, void *tag)
1378 1405 {
1379 1406 arc_buf_hdr_t *hdr = buf->b_hdr;
1380 1407
1381 1408 ASSERT(buf->b_data != NULL);
1382 1409 (void) refcount_add(&hdr->b_refcnt, tag);
1383 1410 (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
1384 1411
1385 1412 atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1386 1413 }
1387 1414
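A minimal sketch of the loan lifecycle described above (hypothetical caller; 'tag' stands for whatever token the caller uses to hold its reference):

	arc_buf_t *buf = arc_loan_buf(spa, size);	/* held under arc_onloan_tag */
	/* ... fill buf->b_data with in-flight data ... */
	arc_return_buf(buf, tag);	/* must be returned before the DMU may use it */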
1388 1415 /* Detach an arc_buf from a dbuf (tag) */
1389 1416 void
1390 1417 arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1391 1418 {
1392 1419 arc_buf_hdr_t *hdr;
1393 1420
1394 1421 ASSERT(buf->b_data != NULL);
1395 1422 hdr = buf->b_hdr;
1396 1423 (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
1397 1424 (void) refcount_remove(&hdr->b_refcnt, tag);
1398 1425 buf->b_efunc = NULL;
1399 1426 buf->b_private = NULL;
1400 1427
1401 1428 atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1402 1429 }
1403 1430
1404 1431 static arc_buf_t *
1405 1432 arc_buf_clone(arc_buf_t *from)
1406 1433 {
1407 1434 arc_buf_t *buf;
1408 1435 arc_buf_hdr_t *hdr = from->b_hdr;
1409 1436 uint64_t size = hdr->b_size;
1410 1437
1411 1438 ASSERT(hdr->b_state != arc_anon);
1412 1439
1413 1440 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1414 1441 buf->b_hdr = hdr;
1415 1442 buf->b_data = NULL;
1416 1443 buf->b_efunc = NULL;
1417 1444 buf->b_private = NULL;
1418 1445 buf->b_next = hdr->b_buf;
1419 1446 hdr->b_buf = buf;
1420 1447 arc_get_data_buf(buf);
1421 1448 bcopy(from->b_data, buf->b_data, size);
1422 1449
1423 1450 /*
1424 1451 * This buffer already exists in the arc so create a duplicate
1425 1452 * copy for the caller. If the buffer is associated with user data
1426 1453 * then track the size and number of duplicates. These stats will be
1427 1454 * updated as duplicate buffers are created and destroyed.
1428 1455 */
1429 1456 if (hdr->b_type == ARC_BUFC_DATA) {
1430 1457 ARCSTAT_BUMP(arcstat_duplicate_buffers);
1431 1458 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
1432 1459 }
1433 1460 hdr->b_datacnt += 1;
1434 1461 return (buf);
1435 1462 }
1436 1463
1437 1464 void
1438 1465 arc_buf_add_ref(arc_buf_t *buf, void* tag)
1439 1466 {
1440 1467 arc_buf_hdr_t *hdr;
1441 1468 kmutex_t *hash_lock;
1442 1469
1443 1470 /*
1444 1471 * Check to see if this buffer is evicted. Callers
1445 1472 * must verify b_data != NULL to know if the add_ref
1446 1473 * was successful.
1447 1474 */
1448 1475 mutex_enter(&buf->b_evict_lock);
1449 1476 if (buf->b_data == NULL) {
1450 1477 mutex_exit(&buf->b_evict_lock);
1451 1478 return;
1452 1479 }
1453 1480 hash_lock = HDR_LOCK(buf->b_hdr);
1454 1481 mutex_enter(hash_lock);
1455 1482 hdr = buf->b_hdr;
1456 1483 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1457 1484 mutex_exit(&buf->b_evict_lock);
1458 1485
1459 1486 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
1460 1487 add_reference(hdr, hash_lock, tag);
1461 1488 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1462 1489 arc_access(hdr, hash_lock);
1463 1490 mutex_exit(hash_lock);
1464 1491 ARCSTAT_BUMP(arcstat_hits);
1465 1492 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
1466 1493 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
1467 1494 data, metadata, hits);
1468 1495 }
1469 1496
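As the comment in arc_buf_add_ref() notes, a caller only learns whether the reference was taken by re-checking b_data afterwards; a hypothetical caller sketch:

	arc_buf_add_ref(buf, tag);
	if (buf->b_data == NULL) {
		/* the buffer was evicted in the meantime; re-read it via arc_read() */
	}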
1470 1497 /*
1471 1498 * Free the arc data buffer. If it is an l2arc write in progress,
1472 1499 * the buffer is placed on l2arc_free_on_write to be freed later.
1473 1500 */
1474 1501 static void
1475 1502 arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
1476 1503 {
1477 1504 arc_buf_hdr_t *hdr = buf->b_hdr;
1478 1505
1479 1506 if (HDR_L2_WRITING(hdr)) {
1480 1507 l2arc_data_free_t *df;
1481 1508 df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
1482 1509 df->l2df_data = buf->b_data;
1483 1510 df->l2df_size = hdr->b_size;
1484 1511 df->l2df_func = free_func;
1485 1512 mutex_enter(&l2arc_free_on_write_mtx);
1486 1513 list_insert_head(l2arc_free_on_write, df);
1487 1514 mutex_exit(&l2arc_free_on_write_mtx);
1488 1515 ARCSTAT_BUMP(arcstat_l2_free_on_write);
1489 1516 } else {
1490 1517 free_func(buf->b_data, hdr->b_size);
1491 1518 }
1492 1519 }
1493 1520
1494 1521 /*
1495 1522 * Free up buf->b_data and if 'remove' is set, then pull the
1496 1523  * arc_buf_t off of the arc_buf_hdr_t's list and free it.
1497 1524 */
1498 1525 static void
1499 1526 arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove)
1500 1527 {
1501 1528 arc_buf_t **bufp;
1502 1529
1503 1530 /* free up data associated with the buf */
1504 1531 if (buf->b_data) {
1505 1532 arc_state_t *state = buf->b_hdr->b_state;
1506 1533 uint64_t size = buf->b_hdr->b_size;
1507 1534 arc_buf_contents_t type = buf->b_hdr->b_type;
1508 1535
1509 1536 arc_cksum_verify(buf);
1510 1537 arc_buf_unwatch(buf);
1511 1538
1512 1539 if (!recycle) {
1513 1540 if (type == ARC_BUFC_METADATA) {
1514 1541 arc_buf_data_free(buf, zio_buf_free);
1515 1542 arc_space_return(size, ARC_SPACE_DATA);
1516 1543 } else {
1517 1544 ASSERT(type == ARC_BUFC_DATA);
1518 1545 arc_buf_data_free(buf, zio_data_buf_free);
1519 1546 ARCSTAT_INCR(arcstat_data_size, -size);
1520 1547 atomic_add_64(&arc_size, -size);
1521 1548 }
1522 1549 }
1523 1550 if (list_link_active(&buf->b_hdr->b_arc_node)) {
1524 1551 uint64_t *cnt = &state->arcs_lsize[type];
1525 1552
1526 1553 ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
1527 1554 ASSERT(state != arc_anon);
1528 1555
1529 1556 ASSERT3U(*cnt, >=, size);
1530 1557 atomic_add_64(cnt, -size);
1531 1558 }
1532 1559 ASSERT3U(state->arcs_size, >=, size);
1533 1560 atomic_add_64(&state->arcs_size, -size);
1534 1561 buf->b_data = NULL;
1535 1562
1536 1563 /*
1537 1564 * If we're destroying a duplicate buffer make sure
1538 1565 * that the appropriate statistics are updated.
1539 1566 */
1540 1567 if (buf->b_hdr->b_datacnt > 1 &&
1541 1568 buf->b_hdr->b_type == ARC_BUFC_DATA) {
1542 1569 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
1543 1570 ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
1544 1571 }
1545 1572 ASSERT(buf->b_hdr->b_datacnt > 0);
1546 1573 buf->b_hdr->b_datacnt -= 1;
1547 1574 }
1548 1575
1549 1576 /* only remove the buf if requested */
1550 1577 if (!remove)
1551 1578 return;
1552 1579
1553 1580 /* remove the buf from the hdr list */
1554 1581 for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
1555 1582 continue;
1556 1583 *bufp = buf->b_next;
1557 1584 buf->b_next = NULL;
1558 1585
1559 1586 ASSERT(buf->b_efunc == NULL);
1560 1587
1561 1588 /* clean up the buf */
1562 1589 buf->b_hdr = NULL;
1563 1590 kmem_cache_free(buf_cache, buf);
1564 1591 }
1565 1592
1566 1593 static void
1567 1594 arc_hdr_destroy(arc_buf_hdr_t *hdr)
1568 1595 {
1569 1596 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1570 1597 ASSERT3P(hdr->b_state, ==, arc_anon);
1571 1598 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1572 1599 l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1573 1600
1574 1601 if (l2hdr != NULL) {
1575 1602 boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
1576 1603 /*
1577 1604 * To prevent arc_free() and l2arc_evict() from
1578 1605 * attempting to free the same buffer at the same time,
1579 1606 * a FREE_IN_PROGRESS flag is given to arc_free() to
1580 1607 * give it priority. l2arc_evict() can't destroy this
1581 1608 * header while we are waiting on l2arc_buflist_mtx.
1582 1609 *
1583 1610 * The hdr may be removed from l2ad_buflist before we
1584 1611 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1585 1612 */
1586 1613 if (!buflist_held) {
1587 1614 mutex_enter(&l2arc_buflist_mtx);
1588 1615 l2hdr = hdr->b_l2hdr;
1589 1616 }
1590 1617
1591 1618 if (l2hdr != NULL) {
1592 1619 list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
1593 1620 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1594 1621 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
1595 1622 if (l2hdr->b_dev->l2ad_vdev)
1596 1623 vdev_space_update(l2hdr->b_dev->l2ad_vdev,
1597 1624 -l2hdr->b_asize, 0, 0);
1598 1625 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
1599 1626 if (hdr->b_state == arc_l2c_only)
1600 1627 l2arc_hdr_stat_remove();
1601 1628 hdr->b_l2hdr = NULL;
1602 1629 }
1603 1630
1604 1631 if (!buflist_held)
1605 1632 mutex_exit(&l2arc_buflist_mtx);
1606 1633 }
1607 1634
1608 1635 if (!BUF_EMPTY(hdr)) {
1609 1636 ASSERT(!HDR_IN_HASH_TABLE(hdr));
1610 1637 buf_discard_identity(hdr);
1611 1638 }
1612 1639 while (hdr->b_buf) {
1613 1640 arc_buf_t *buf = hdr->b_buf;
1614 1641
1615 1642 if (buf->b_efunc) {
1616 1643 mutex_enter(&arc_eviction_mtx);
1617 1644 mutex_enter(&buf->b_evict_lock);
1618 1645 ASSERT(buf->b_hdr != NULL);
1619 1646 arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1620 1647 hdr->b_buf = buf->b_next;
1621 1648 buf->b_hdr = &arc_eviction_hdr;
1622 1649 buf->b_next = arc_eviction_list;
1623 1650 arc_eviction_list = buf;
1624 1651 mutex_exit(&buf->b_evict_lock);
1625 1652 mutex_exit(&arc_eviction_mtx);
1626 1653 } else {
1627 1654 arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1628 1655 }
1629 1656 }
1630 1657 if (hdr->b_freeze_cksum != NULL) {
1631 1658 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1632 1659 hdr->b_freeze_cksum = NULL;
1633 1660 }
1634 1661 if (hdr->b_thawed) {
1635 1662 kmem_free(hdr->b_thawed, 1);
1636 1663 hdr->b_thawed = NULL;
1637 1664 }
1638 1665
1639 1666 ASSERT(!list_link_active(&hdr->b_arc_node));
1640 1667 ASSERT3P(hdr->b_hash_next, ==, NULL);
1641 1668 ASSERT3P(hdr->b_acb, ==, NULL);
1642 1669 kmem_cache_free(hdr_cache, hdr);
1643 1670 }
1644 1671
1645 1672 void
1646 1673 arc_buf_free(arc_buf_t *buf, void *tag)
1647 1674 {
1648 1675 arc_buf_hdr_t *hdr = buf->b_hdr;
1649 1676 int hashed = hdr->b_state != arc_anon;
1650 1677
1651 1678 ASSERT(buf->b_efunc == NULL);
1652 1679 ASSERT(buf->b_data != NULL);
1653 1680
1654 1681 if (hashed) {
1655 1682 kmutex_t *hash_lock = HDR_LOCK(hdr);
1656 1683
1657 1684 mutex_enter(hash_lock);
1658 1685 hdr = buf->b_hdr;
1659 1686 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1660 1687
1661 1688 (void) remove_reference(hdr, hash_lock, tag);
1662 1689 if (hdr->b_datacnt > 1) {
1663 1690 arc_buf_destroy(buf, FALSE, TRUE);
1664 1691 } else {
1665 1692 ASSERT(buf == hdr->b_buf);
1666 1693 ASSERT(buf->b_efunc == NULL);
1667 1694 hdr->b_flags |= ARC_BUF_AVAILABLE;
1668 1695 }
1669 1696 mutex_exit(hash_lock);
1670 1697 } else if (HDR_IO_IN_PROGRESS(hdr)) {
1671 1698 int destroy_hdr;
1672 1699 /*
1673 1700 * We are in the middle of an async write. Don't destroy
1674 1701 * this buffer unless the write completes before we finish
1675 1702 * decrementing the reference count.
1676 1703 */
1677 1704 mutex_enter(&arc_eviction_mtx);
1678 1705 (void) remove_reference(hdr, NULL, tag);
1679 1706 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1680 1707 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
1681 1708 mutex_exit(&arc_eviction_mtx);
1682 1709 if (destroy_hdr)
1683 1710 arc_hdr_destroy(hdr);
1684 1711 } else {
1685 1712 if (remove_reference(hdr, NULL, tag) > 0)
1686 1713 arc_buf_destroy(buf, FALSE, TRUE);
1687 1714 else
1688 1715 arc_hdr_destroy(hdr);
1689 1716 }
1690 1717 }
1691 1718
1692 1719 boolean_t
1693 1720 arc_buf_remove_ref(arc_buf_t *buf, void* tag)
1694 1721 {
1695 1722 arc_buf_hdr_t *hdr = buf->b_hdr;
1696 1723 kmutex_t *hash_lock = HDR_LOCK(hdr);
1697 1724 boolean_t no_callback = (buf->b_efunc == NULL);
1698 1725
1699 1726 if (hdr->b_state == arc_anon) {
1700 1727 ASSERT(hdr->b_datacnt == 1);
1701 1728 arc_buf_free(buf, tag);
1702 1729 return (no_callback);
1703 1730 }
1704 1731
1705 1732 mutex_enter(hash_lock);
1706 1733 hdr = buf->b_hdr;
1707 1734 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1708 1735 ASSERT(hdr->b_state != arc_anon);
1709 1736 ASSERT(buf->b_data != NULL);
1710 1737
1711 1738 (void) remove_reference(hdr, hash_lock, tag);
1712 1739 if (hdr->b_datacnt > 1) {
1713 1740 if (no_callback)
1714 1741 arc_buf_destroy(buf, FALSE, TRUE);
1715 1742 } else if (no_callback) {
1716 1743 ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
1717 1744 ASSERT(buf->b_efunc == NULL);
1718 1745 hdr->b_flags |= ARC_BUF_AVAILABLE;
1719 1746 }
1720 1747 ASSERT(no_callback || hdr->b_datacnt > 1 ||
1721 1748 refcount_is_zero(&hdr->b_refcnt));
1722 1749 mutex_exit(hash_lock);
1723 1750 return (no_callback);
1724 1751 }
1725 1752
1726 1753 int
1727 1754 arc_buf_size(arc_buf_t *buf)
1728 1755 {
1729 1756 return (buf->b_hdr->b_size);
1730 1757 }
1731 1758
1732 1759 /*
1733 1760 * Called from the DMU to determine if the current buffer should be
1734 1761 * evicted. In order to ensure proper locking, the eviction must be initiated
1735 1762 * from the DMU. Return true if the buffer is associated with user data and
1736 1763 * duplicate buffers still exist.
1737 1764 */
1738 1765 boolean_t
1739 1766 arc_buf_eviction_needed(arc_buf_t *buf)
1740 1767 {
1741 1768 arc_buf_hdr_t *hdr;
1742 1769 boolean_t evict_needed = B_FALSE;
1743 1770
1744 1771 if (zfs_disable_dup_eviction)
1745 1772 return (B_FALSE);
1746 1773
1747 1774 mutex_enter(&buf->b_evict_lock);
1748 1775 hdr = buf->b_hdr;
1749 1776 if (hdr == NULL) {
1750 1777 /*
1751 1778 * We are in arc_do_user_evicts(); let that function
1752 1779 * perform the eviction.
1753 1780 */
1754 1781 ASSERT(buf->b_data == NULL);
1755 1782 mutex_exit(&buf->b_evict_lock);
1756 1783 return (B_FALSE);
1757 1784 } else if (buf->b_data == NULL) {
1758 1785 /*
1759 1786 * We have already been added to the arc eviction list;
1760 1787 * recommend eviction.
1761 1788 */
1762 1789 ASSERT3P(hdr, ==, &arc_eviction_hdr);
1763 1790 mutex_exit(&buf->b_evict_lock);
1764 1791 return (B_TRUE);
1765 1792 }
1766 1793
1767 1794 if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
1768 1795 evict_needed = B_TRUE;
1769 1796
1770 1797 mutex_exit(&buf->b_evict_lock);
1771 1798 return (evict_needed);
1772 1799 }
1773 1800
1774 1801 int zfs_fastflush = 1;
1775 1802
1776 1803 /*
1777 1804 * Evict buffers from list until we've removed the specified number of
1778 1805 * bytes. Move the removed buffers to the appropriate evict state.
1779 1806 * If the recycle flag is set, then attempt to "recycle" a buffer:
1780 1807 * - look for a buffer to evict that is `bytes' long.
1781 1808 * - return the data block from this buffer rather than freeing it.
1782 1809 * This flag is used by callers that are trying to make space for a
1783 1810 * new buffer in a full arc cache.
1784 1811 *
1785 1812 * This function makes a "best effort". It skips over any buffers
1786 1813 * it can't get a hash_lock on, and so may not catch all candidates.
1787 1814 * It may also return without evicting as much space as requested.
1788 1815 */
1789 1816 static void *
1790 1817 arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
1791 1818 arc_buf_contents_t type)
1792 1819 {
1793 1820 arc_state_t *evicted_state;
1794 1821 uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
1795 1822 arc_buf_hdr_t *ab, *ab_prev = NULL;
1796 1823 list_t *list = &state->arcs_list[type];
1797 1824 kmutex_t *hash_lock;
1798 1825 boolean_t have_lock;
1799 1826 void *stolen = NULL;
1800 1827 arc_buf_hdr_t marker = { 0 };
1801 1828 int count = 0;
1802 1829
1803 1830 ASSERT(state == arc_mru || state == arc_mfu);
1804 1831
1805 1832 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
1806 1833
1807 1834 mutex_enter(&state->arcs_mtx);
1808 1835 mutex_enter(&evicted_state->arcs_mtx);
1809 1836
1810 1837 for (ab = list_tail(list); ab; ab = ab_prev) {
1811 1838 ab_prev = list_prev(list, ab);
1812 1839 /* prefetch buffers have a minimum lifespan */
1813 1840 if (HDR_IO_IN_PROGRESS(ab) ||
1814 1841 (spa && ab->b_spa != spa) ||
1815 1842 (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
1816 1843 ddi_get_lbolt() - ab->b_arc_access <
1817 1844 arc_min_prefetch_lifespan)) {
1818 1845 skipped++;
1819 1846 continue;
1820 1847 }
1821 1848 /* "lookahead" for better eviction candidate */
1822 1849 if (recycle && ab->b_size != bytes &&
1823 1850 ab_prev && ab_prev->b_size == bytes)
1824 1851 continue;
1825 1852
1826 1853 /* ignore markers */
1827 1854 if (ab->b_spa == 0)
1828 1855 continue;
1829 1856
1830 1857 /*
1831 1858 * It may take a long time to evict all the bufs requested.
1832 1859 * To avoid blocking all arc activity, periodically drop
1833 1860 * the arcs_mtx and give other threads a chance to run
1834 1861 * before reacquiring the lock.
1835 1862 *
1836 1863 * If we are looking for a buffer to recycle, we are in
1837 1864 * the hot code path, so don't sleep.
1838 1865 */
1839 1866 if (!recycle && count++ > arc_evict_iterations) {
1840 1867 list_insert_after(list, ab, &marker);
1841 1868 mutex_exit(&evicted_state->arcs_mtx);
1842 1869 mutex_exit(&state->arcs_mtx);
1843 1870 kpreempt(KPREEMPT_SYNC);
1844 1871 mutex_enter(&state->arcs_mtx);
1845 1872 mutex_enter(&evicted_state->arcs_mtx);
1846 1873 ab_prev = list_prev(list, &marker);
1847 1874 list_remove(list, &marker);
1848 1875 count = 0;
1849 1876 continue;
1850 1877 }
1851 1878
1852 1879 hash_lock = HDR_LOCK(ab);
1853 1880 have_lock = MUTEX_HELD(hash_lock);
1854 1881 if (have_lock || mutex_tryenter(hash_lock)) {
1855 1882 ASSERT0(refcount_count(&ab->b_refcnt));
1856 1883 ASSERT(ab->b_datacnt > 0);
1857 1884 while (ab->b_buf) {
1858 1885 arc_buf_t *buf = ab->b_buf;
1859 1886 if (!mutex_tryenter(&buf->b_evict_lock)) {
1860 1887 missed += 1;
1861 1888 break;
1862 1889 }
1863 1890 if (buf->b_data) {
1864 1891 bytes_evicted += ab->b_size;
1865 1892 if (recycle && ab->b_type == type &&
1866 1893 ab->b_size == bytes &&
1867 1894 !HDR_L2_WRITING(ab)) {
1868 1895 stolen = buf->b_data;
1869 1896 recycle = FALSE;
1870 1897 }
1871 1898 }
1872 1899 if (buf->b_efunc) {
1873 1900 mutex_enter(&arc_eviction_mtx);
1874 1901 arc_buf_destroy(buf,
1875 1902 buf->b_data == stolen, FALSE);
1876 1903 ab->b_buf = buf->b_next;
1877 1904 buf->b_hdr = &arc_eviction_hdr;
1878 1905 buf->b_next = arc_eviction_list;
1879 1906 arc_eviction_list = buf;
1880 1907 mutex_exit(&arc_eviction_mtx);
1881 1908 mutex_exit(&buf->b_evict_lock);
1882 1909 } else {
1883 1910 mutex_exit(&buf->b_evict_lock);
1884 1911 arc_buf_destroy(buf,
1885 1912 buf->b_data == stolen, TRUE);
1886 1913 }
1887 1914 }
1888 1915
1889 1916 if (ab->b_l2hdr) {
1890 1917 ARCSTAT_INCR(arcstat_evict_l2_cached,
1891 1918 ab->b_size);
1892 1919 } else {
1893 1920 if (l2arc_write_eligible(ab->b_spa, ab)) {
1894 1921 ARCSTAT_INCR(arcstat_evict_l2_eligible,
1895 1922 ab->b_size);
1896 1923 } else {
1897 1924 ARCSTAT_INCR(
1898 1925 arcstat_evict_l2_ineligible,
1899 1926 ab->b_size);
1900 1927 }
1901 1928 }
1902 1929
1903 1930 if (ab->b_datacnt == 0) {
1904 1931 arc_change_state(evicted_state, ab, hash_lock);
1905 1932 ASSERT(HDR_IN_HASH_TABLE(ab));
1906 1933 ab->b_flags |= ARC_IN_HASH_TABLE;
1907 1934 ab->b_flags &= ~ARC_BUF_AVAILABLE;
1908 1935 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
1909 1936 }
1910 1937 if (!have_lock)
1911 1938 mutex_exit(hash_lock);
1912 1939 if (bytes >= 0 && bytes_evicted >= bytes)
1913 1940 break;
1914 1941 } else {
1915 1942 missed += 1;
1916 1943 }
1917 1944 }
1918 1945
1919 1946 mutex_exit(&evicted_state->arcs_mtx);
1920 1947 mutex_exit(&state->arcs_mtx);
1921 1948
1922 1949 if (bytes_evicted < bytes)
1923 1950 dprintf("only evicted %lld bytes from %x",
1924 1951 (longlong_t)bytes_evicted, state);
1925 1952
1926 1953 if (skipped)
1927 1954 ARCSTAT_INCR(arcstat_evict_skip, skipped);
1928 1955
1929 1956 if (missed)
1930 1957 ARCSTAT_INCR(arcstat_mutex_miss, missed);
1931 1958
1932 1959 /*
1933 1960 * Note: we have just evicted some data into the ghost state,
1934 1961 * potentially putting the ghost size over the desired size. Rather
1935 1962  * than evicting from the ghost list in this hot code path, leave
1936 1963 * this chore to the arc_reclaim_thread().
1937 1964 */
1938 1965
1939 1966 return (stolen);
1940 1967 }
1941 1968
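The bounded-hold pattern above (park a marker header, drop arcs_mtx, let other threads run, retake the lock, resume from the marker) is what keeps a long eviction pass from stalling other ARC work; markers are recognized by b_spa == 0. A stripped-down user-space sketch of the same resume-from-marker technique, with a toy list type in place of the kernel's list_t and the lock drop reduced to a comment (the names and the three-nodes-per-pass cutoff are illustrative, not arc.c symbols):

#include <stdio.h>

/* Toy doubly linked list standing in for the kernel's list_t. */
typedef struct node {
        struct node *prev, *next;
        int is_marker;                  /* plays the role of b_spa == 0 */
        int id;
} node_t;

static void
insert_after(node_t *pos, node_t *n)
{
        n->prev = pos;
        n->next = pos->next;
        if (pos->next != NULL)
                pos->next->prev = n;
        pos->next = n;
}

static void
remove_node(node_t *n)
{
        if (n->prev != NULL)
                n->prev->next = n->next;
        if (n->next != NULL)
                n->next->prev = n->prev;
        n->prev = n->next = NULL;
}

int
main(void)
{
        node_t nodes[6] = { 0 };
        node_t marker = { 0 };
        node_t *prev;
        int visited = 0, parked = 0;

        marker.is_marker = 1;
        for (int i = 0; i < 6; i++) {
                nodes[i].id = i;
                if (i > 0)
                        insert_after(&nodes[i - 1], &nodes[i]);
        }

        /* Walk from the tail toward the head, as arc_evict() does. */
        for (node_t *n = &nodes[5]; n != NULL; n = prev) {
                prev = n->prev;
                if (n->is_marker)
                        continue;       /* skip markers parked by others */
                if (!parked && ++visited > 3) {
                        /*
                         * Enough work under the lock for one pass: park the
                         * marker next to the current node.  arc_evict() drops
                         * and retakes arcs_mtx here, so the list may change
                         * underneath us; the marker keeps our place.
                         */
                        insert_after(n, &marker);
                        parked = 1;
                        /* ... lock dropped, other threads run, lock retaken ... */
                        prev = marker.prev;     /* resume from the parking spot */
                        remove_node(&marker);
                        continue;
                }
                printf("visiting node %d\n", n->id);
        }
        return (0);
}

The node at which the marker was parked is re-examined after the resume, which matches arc_evict() continuing from list_prev(list, &marker).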
1942 1969 /*
1943 1970 * Remove buffers from list until we've removed the specified number of
1944 1971 * bytes. Destroy the buffers that are removed.
1945 1972 */
1946 1973 static void
1947 1974 arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
1948 1975 {
1949 1976 arc_buf_hdr_t *ab, *ab_prev;
1950 1977 arc_buf_hdr_t marker = { 0 };
1951 1978 list_t *list = &state->arcs_list[ARC_BUFC_DATA];
1952 1979 kmutex_t *hash_lock;
1953 1980 uint64_t bytes_deleted = 0;
1954 1981 uint64_t bufs_skipped = 0;
1955 1982 int count = 0;
1956 1983
1957 1984 ASSERT(GHOST_STATE(state));
1958 1985 top:
1959 1986 mutex_enter(&state->arcs_mtx);
1960 1987 for (ab = list_tail(list); ab; ab = ab_prev) {
1961 1988 ab_prev = list_prev(list, ab);
1962 1989 if (ab->b_type > ARC_BUFC_NUMTYPES)
1963 1990 panic("invalid ab=%p", (void *)ab);
1964 1991 if (spa && ab->b_spa != spa)
1965 1992 continue;
1966 1993
1967 1994 /* ignore markers */
1968 1995 if (ab->b_spa == 0)
1969 1996 continue;
1970 1997
1971 1998 hash_lock = HDR_LOCK(ab);
1972 1999 /* caller may be trying to modify this buffer, skip it */
1973 2000 if (MUTEX_HELD(hash_lock))
1974 2001 continue;
1975 2002
1976 2003 /*
1977 2004 * It may take a long time to evict all the bufs requested.
1978 2005 * To avoid blocking all arc activity, periodically drop
1979 2006 * the arcs_mtx and give other threads a chance to run
1980 2007 * before reacquiring the lock.
1981 2008 */
1982 2009 if (count++ > arc_evict_iterations) {
1983 2010 list_insert_after(list, ab, &marker);
1984 2011 mutex_exit(&state->arcs_mtx);
1985 2012 kpreempt(KPREEMPT_SYNC);
1986 2013 mutex_enter(&state->arcs_mtx);
1987 2014 ab_prev = list_prev(list, &marker);
1988 2015 list_remove(list, &marker);
1989 2016 count = 0;
1990 2017 continue;
1991 2018 }
1992 2019 if (mutex_tryenter(hash_lock)) {
1993 2020 ASSERT(!HDR_IO_IN_PROGRESS(ab));
1994 2021 ASSERT(ab->b_buf == NULL);
1995 2022 ARCSTAT_BUMP(arcstat_deleted);
1996 2023 bytes_deleted += ab->b_size;
1997 2024
1998 2025 if (ab->b_l2hdr != NULL) {
1999 2026 /*
2000 2027 * This buffer is cached on the 2nd Level ARC;
2001 2028 * don't destroy the header.
2002 2029 */
2003 2030 arc_change_state(arc_l2c_only, ab, hash_lock);
2004 2031 mutex_exit(hash_lock);
2005 2032 } else {
2006 2033 arc_change_state(arc_anon, ab, hash_lock);
2007 2034 mutex_exit(hash_lock);
2008 2035 arc_hdr_destroy(ab);
2009 2036 }
2010 2037
2011 2038 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
2012 2039 if (bytes >= 0 && bytes_deleted >= bytes)
2013 2040 break;
2014 2041 } else if (bytes < 0) {
2015 2042 /*
2016 2043 * Insert a list marker and then wait for the
2017 2044  * hash lock to become available. Once it's
2018 2045 * available, restart from where we left off.
2019 2046 */
2020 2047 list_insert_after(list, ab, &marker);
2021 2048 mutex_exit(&state->arcs_mtx);
2022 2049 mutex_enter(hash_lock);
2023 2050 mutex_exit(hash_lock);
2024 2051 mutex_enter(&state->arcs_mtx);
2025 2052 ab_prev = list_prev(list, &marker);
2026 2053 list_remove(list, &marker);
2027 2054 } else {
2028 2055 bufs_skipped += 1;
2029 2056 }
2030 2057
2031 2058 }
2032 2059 mutex_exit(&state->arcs_mtx);
2033 2060
2034 2061 if (list == &state->arcs_list[ARC_BUFC_DATA] &&
2035 2062 (bytes < 0 || bytes_deleted < bytes)) {
2036 2063 list = &state->arcs_list[ARC_BUFC_METADATA];
2037 2064 goto top;
2038 2065 }
2039 2066
2040 2067 if (bufs_skipped) {
2041 2068 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
2042 2069 ASSERT(bytes >= 0);
2043 2070 }
2044 2071
2045 2072 if (bytes_deleted < bytes)
2046 2073 dprintf("only deleted %lld bytes from %p",
2047 2074 (longlong_t)bytes_deleted, state);
2048 2075 }
2049 2076
2050 2077 static void
2051 2078 arc_adjust(void)
2052 2079 {
2053 2080 int64_t adjustment, delta;
2054 2081
2055 2082 /*
2056 2083 * Adjust MRU size
2057 2084 */
2058 2085
2059 2086 adjustment = MIN((int64_t)(arc_size - arc_c),
2060 2087 (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
2061 2088 arc_p));
2062 2089
2063 2090 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
2064 2091 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
2065 2092 (void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA);
2066 2093 adjustment -= delta;
2067 2094 }
2068 2095
2069 2096 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2070 2097 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
2071 2098 (void) arc_evict(arc_mru, NULL, delta, FALSE,
2072 2099 ARC_BUFC_METADATA);
2073 2100 }
2074 2101
2075 2102 /*
2076 2103 * Adjust MFU size
2077 2104 */
2078 2105
2079 2106 adjustment = arc_size - arc_c;
2080 2107
2081 2108 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
2082 2109 delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
2083 2110 (void) arc_evict(arc_mfu, NULL, delta, FALSE, ARC_BUFC_DATA);
2084 2111 adjustment -= delta;
2085 2112 }
2086 2113
2087 2114 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2088 2115 int64_t delta = MIN(adjustment,
2089 2116 arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
2090 2117 (void) arc_evict(arc_mfu, NULL, delta, FALSE,
2091 2118 ARC_BUFC_METADATA);
2092 2119 }
2093 2120
2094 2121 /*
2095 2122 * Adjust ghost lists
2096 2123 */
2097 2124
2098 2125 adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
2099 2126
2100 2127 if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
2101 2128 delta = MIN(arc_mru_ghost->arcs_size, adjustment);
2102 2129 arc_evict_ghost(arc_mru_ghost, NULL, delta);
2103 2130 }
2104 2131
2105 2132 adjustment =
2106 2133 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2107 2134
2108 2135 if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
2109 2136 delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
2110 2137 arc_evict_ghost(arc_mfu_ghost, NULL, delta);
2111 2138 }
2112 2139 }
2113 2140
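To make the MRU pass of arc_adjust() concrete, here is a small standalone example with made-up sizes (the variable names shadow the arc.c globals only for readability; none of the numbers come from a live system):

#include <stdint.h>
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int
main(void)
{
        /* Hypothetical sizes, in bytes. */
        int64_t arc_size = 900LL << 20;         /* current cache size */
        int64_t arc_c    = 800LL << 20;         /* target cache size */
        int64_t arc_p    = 400LL << 20;         /* target MRU share */
        int64_t anon = 50LL << 20, mru = 500LL << 20, meta = 30LL << 20;

        /* Same formula as the MRU pass of arc_adjust(). */
        int64_t adjustment = MIN(arc_size - arc_c, anon + mru + meta - arc_p);

        printf("total overage  %lld MiB\n", (long long)((arc_size - arc_c) >> 20));
        printf("MRU overage    %lld MiB\n",
            (long long)((anon + mru + meta - arc_p) >> 20));
        printf("MRU eviction   %lld MiB\n", (long long)(adjustment >> 20));
        return (0);
}

With these numbers the total overage (100 MiB) is the binding term, so only 100 MiB is requested from the MRU lists, data first and then metadata; the MFU pass afterwards re-derives its own adjustment from arc_size - arc_c.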
2141 +#define ACCURACY 1000
2142 +
2114 2143 static void
2144 +arc_reclaim_bytes(uint64_t to_evict)
2145 +{
2146 + uint64_t to_evict_data_mru, to_evict_data_mfu;
2147 + uint64_t to_evict_meta_mru, to_evict_meta_mfu;
2148 +
2149 + to_evict_meta_mru = (((arc_mru->arcs_lsize[ARC_BUFC_METADATA] *
2150 + ACCURACY) / (arc_mru->arcs_size + arc_mfu->arcs_size)) *
2151 + to_evict) / ACCURACY;
2152 + to_evict_data_mru = (((arc_mru->arcs_lsize[ARC_BUFC_DATA] *
2153 + ACCURACY) / (arc_mru->arcs_size + arc_mfu->arcs_size)) *
2154 + to_evict) / ACCURACY;
2155 + to_evict_meta_mfu = (((arc_mfu->arcs_lsize[ARC_BUFC_METADATA] *
2156 + ACCURACY) / (arc_mru->arcs_size + arc_mfu->arcs_size)) *
2157 + to_evict) / ACCURACY;
2158 + to_evict_data_mfu = (((arc_mfu->arcs_lsize[ARC_BUFC_DATA] *
2159 + ACCURACY) / (arc_mru->arcs_size + arc_mfu->arcs_size)) *
2160 + to_evict) / ACCURACY;
2161 +
2162 + if (to_evict_meta_mru > 0)
2163 + (void) arc_evict(arc_mru, NULL, to_evict_meta_mru, FALSE,
2164 + ARC_BUFC_METADATA);
2165 + if (to_evict_data_mru > 0)
2166 + (void) arc_evict(arc_mru, NULL, to_evict_data_mru, FALSE,
2167 + ARC_BUFC_DATA);
2168 + if (to_evict_meta_mfu > 0)
2169 + (void) arc_evict(arc_mfu, NULL, to_evict_meta_mfu, FALSE,
2170 + ARC_BUFC_METADATA);
2171 + if (to_evict_data_mfu > 0)
2172 + (void) arc_evict(arc_mfu, NULL, to_evict_data_mfu, FALSE,
2173 + ARC_BUFC_DATA);
2174 +}
2175 +
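arc_reclaim_bytes() splits to_evict across the four evictable lists in proportion to their share of the combined MRU+MFU size, using integer fixed-point arithmetic with a scale of ACCURACY. A user-space sketch of the same arithmetic (the sizes are hypothetical and the share() helper is not an arc.c symbol):

#include <stdint.h>
#include <stdio.h>

#define ACCURACY        1000    /* same fixed-point scale as the patch */

/* Proportional share of 'to_evict' for one evictable list. */
static uint64_t
share(uint64_t lsize, uint64_t total, uint64_t to_evict)
{
        return ((((lsize * ACCURACY) / total) * to_evict) / ACCURACY);
}

int
main(void)
{
        /* Hypothetical state and list sizes, in bytes. */
        uint64_t mru_size = 800ULL << 20, mfu_size = 400ULL << 20;
        uint64_t total = mru_size + mfu_size;
        uint64_t mru_data = 600ULL << 20, mru_meta = 100ULL << 20;
        uint64_t mfu_data = 250ULL << 20, mfu_meta = 50ULL << 20;
        uint64_t to_evict = 64ULL << 20;

        printf("mru data  %llu bytes\n",
            (unsigned long long)share(mru_data, total, to_evict));
        printf("mru meta  %llu bytes\n",
            (unsigned long long)share(mru_meta, total, to_evict));
        printf("mfu data  %llu bytes\n",
            (unsigned long long)share(mfu_data, total, to_evict));
        printf("mfu meta  %llu bytes\n",
            (unsigned long long)share(mfu_meta, total, to_evict));
        return (0);
}

Because the denominator is the full MRU+MFU size rather than just the evictable portions, and each division truncates, the four targets can sum to somewhat less than to_evict; the next pass of the pressure thread picks up whatever remains.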
2176 +static void
2115 2177 arc_do_user_evicts(void)
2116 2178 {
2117 2179 mutex_enter(&arc_eviction_mtx);
2118 2180 while (arc_eviction_list != NULL) {
2119 2181 arc_buf_t *buf = arc_eviction_list;
2120 2182 arc_eviction_list = buf->b_next;
2121 2183 mutex_enter(&buf->b_evict_lock);
2122 2184 buf->b_hdr = NULL;
2123 2185 mutex_exit(&buf->b_evict_lock);
2124 2186 mutex_exit(&arc_eviction_mtx);
2125 2187
2126 2188 if (buf->b_efunc != NULL)
2127 2189 VERIFY0(buf->b_efunc(buf->b_private));
2128 2190
2129 2191 buf->b_efunc = NULL;
2130 2192 buf->b_private = NULL;
2131 2193 kmem_cache_free(buf_cache, buf);
2132 2194 mutex_enter(&arc_eviction_mtx);
2133 2195 }
2134 2196 mutex_exit(&arc_eviction_mtx);
2135 2197 }
2136 2198
2137 2199 typedef struct arc_async_flush_data {
2138 2200 uint64_t aaf_guid;
2139 2201 } arc_async_flush_data_t;
2140 2202
2141 2203 static taskq_t *arc_flush_taskq;
2142 2204
2143 2205 static void
2144 2206 _arc_flush(uint64_t guid)
2145 2207 {
2146 2208 while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
2147 2209 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
2148 2210 if (guid)
2149 2211 break;
2150 2212 }
2151 2213 while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
2152 2214 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
2153 2215 if (guid)
2154 2216 break;
2155 2217 }
2156 2218 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
2157 2219 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
2158 2220 if (guid)
2159 2221 break;
2160 2222 }
2161 2223 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
2162 2224 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
2163 2225 if (guid)
2164 2226 break;
2165 2227 }
2166 2228
2167 2229 arc_evict_ghost(arc_mru_ghost, guid, -1);
2168 2230 arc_evict_ghost(arc_mfu_ghost, guid, -1);
2169 2231
2170 2232 mutex_enter(&arc_reclaim_thr_lock);
2171 2233 arc_do_user_evicts();
2172 2234 mutex_exit(&arc_reclaim_thr_lock);
2173 2235 }
2174 2236
2175 2237 static void
2176 2238 arc_flush_task(void *arg)
2177 2239 {
2178 2240 arc_async_flush_data_t *aaf = (arc_async_flush_data_t *)arg;
2179 2241 _arc_flush(aaf->aaf_guid);
2180 2242 kmem_free(aaf, sizeof (arc_async_flush_data_t));
2181 2243 }
2182 2244
2183 2245 /*
2184 2246 * Flush all *evictable* data from the cache for the given spa.
2185 2247 * NOTE: this will not touch "active" (i.e. referenced) data.
2186 2248 */
2187 2249 void
2188 2250 arc_flush(spa_t *spa)
2189 2251 {
2190 2252 uint64_t guid = 0;
2191 2253 boolean_t async_flush = (spa ? zfs_fastflush : FALSE);
2192 2254 arc_async_flush_data_t *aaf = NULL;
2193 2255
2194 2256 if (spa) {
2195 2257 guid = spa_load_guid(spa);
2196 2258 if (async_flush) {
2197 2259 aaf = kmem_alloc(sizeof (arc_async_flush_data_t),
2198 2260 KM_SLEEP);
2199 2261 aaf->aaf_guid = guid;
2200 2262 }
2201 2263 }
2202 2264
2203 2265 /*
2204 2266 * Try to flush per-spa remaining ARC ghost buffers and buffers in
2205 2267 * arc_eviction_list asynchronously while a pool is being closed.
2206 2268  * An ARC buffer is bound to a spa only by its guid, so a buffer can
2207 2269  * exist even after the pool has gone. If asynchronous flushing
2208 2270  * fails, we fall back to a regular (synchronous) flush.
2209 2271 * NOTE: If asynchronous flushing had not yet finished when the pool
2210 2272 * was imported again it wouldn't be a problem, even when guids before
2211 2273 * and after export/import are the same. We can evict only unreferenced
2212 2274  * buffers; others are skipped.
2213 2275 */
2214 2276 if (!async_flush || (taskq_dispatch(arc_flush_taskq, arc_flush_task,
2215 2277 aaf, TQ_NOSLEEP) == NULL)) {
2216 2278 _arc_flush(guid);
2217 2279 ASSERT(spa || arc_eviction_list == NULL);
2218 2280 if (async_flush)
2219 2281 kmem_free(aaf, sizeof (arc_async_flush_data_t));
2220 2282 }
2221 2283 }
2222 2284
2223 2285 void
2224 2286 arc_shrink(void)
2225 2287 {
2226 2288 if (arc_c > arc_c_min) {
2227 2289 uint64_t to_free;
2228 2290
2229 2291 #ifdef _KERNEL
2230 2292 to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree));
2231 2293 #else
2232 2294 to_free = arc_c >> arc_shrink_shift;
2233 2295 #endif
2234 2296 if (arc_c > arc_c_min + to_free)
2235 2297 atomic_add_64(&arc_c, -to_free);
2236 2298 else
2237 2299 arc_c = arc_c_min;
2238 2300
2239 2301 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
2240 2302 if (arc_c > arc_size)
2241 2303 arc_c = MAX(arc_size, arc_c_min);
2242 2304 if (arc_p > arc_c)
2243 2305 arc_p = (arc_c >> 1);
2244 2306 ASSERT(arc_c >= arc_c_min);
2245 2307 ASSERT((int64_t)arc_p >= 0);
2246 2308 }
2247 2309
2248 2310 if (arc_size > arc_c)
2249 2311 arc_adjust();
2250 2312 }
2251 2313
2314 +#define PHYSMEM_PRESSURE_FRACTION 100
2315 +
2316 +static boolean_t
2317 +arc_mem_pressure(void)
2318 +{
2319 +#ifdef _KERNEL
2320 + uint64_t extra = desfree + physmem / PHYSMEM_PRESSURE_FRACTION;
2321 +
2322 + if ((freemem < lotsfree + needfree + extra) ||
2323 + (needfree || availrmem < swapfs_minfree + swapfs_reserve + extra) ||
2324 + (zio_arena != NULL && vmem_size(zio_arena, VMEM_FREE) <
2325 + (vmem_size(zio_arena, VMEM_ALLOC) >> 4) +
2326 + physmem / PHYSMEM_PRESSURE_FRACTION))
2327 + return (B_TRUE);
2328 +
2329 + return (freemem < physmem / PHYSMEM_PRESSURE_FRACTION);
2330 +#else
2331 +	return (B_FALSE);
2332 +#endif
2333 +}
2334 +
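arc_mem_pressure() mirrors the checks in arc_reclaim_needed() below, but pads each of them with an extra physmem / PHYSMEM_PRESSURE_FRACTION (one percent of physical memory) so that pressure is reported before the reclaim thresholds are actually hit. The following standalone sketch gives a rough feel for that headroom on an assumed machine; the desfree default used here is an assumption for illustration, not taken from this file:

#include <stdint.h>
#include <stdio.h>

#define PHYSMEM_PRESSURE_FRACTION       100     /* same 1% pad as the patch */

int
main(void)
{
        /* Hypothetical machine: 32 GiB of 4 KiB pages. */
        uint64_t pagesize = 4096;
        uint64_t physmem = (32ULL << 30) / pagesize;
        uint64_t desfree = physmem / 128;       /* assumed default, illustrative */

        uint64_t extra = desfree + physmem / PHYSMEM_PRESSURE_FRACTION;

        printf("physmem %llu pages\n", (unsigned long long)physmem);
        printf("extra   %llu pages (~%llu MiB of headroom)\n",
            (unsigned long long)extra,
            (unsigned long long)((extra * pagesize) >> 20));
        return (0);
}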
2252 2335 /*
2253 2336 * Determine if the system is under memory pressure and is asking
2254 2337 * to reclaim memory. A return value of 1 indicates that the system
2255 2338 * is under memory pressure and that the arc should adjust accordingly.
2256 2339 */
2257 2340 static int
2258 2341 arc_reclaim_needed(void)
2259 2342 {
2260 2343 uint64_t extra;
2261 2344
2262 2345 #ifdef _KERNEL
2263 2346
2264 2347 if (needfree)
2265 2348 return (1);
2266 2349
2267 2350 /*
2268 2351 * take 'desfree' extra pages, so we reclaim sooner, rather than later
2269 2352 */
2270 2353 extra = desfree;
2271 2354
2272 2355 /*
2273 2356 * check that we're out of range of the pageout scanner. It starts to
2274 2357 * schedule paging if freemem is less than lotsfree and needfree.
2275 2358 * lotsfree is the high-water mark for pageout, and needfree is the
2276 2359 * number of needed free pages. We add extra pages here to make sure
2277 2360 * the scanner doesn't start up while we're freeing memory.
2278 2361 */
2279 2362 if (freemem < lotsfree + needfree + extra)
2280 2363 return (1);
2281 2364
2282 2365 /*
2283 2366 * check to make sure that swapfs has enough space so that anon
2284 2367 * reservations can still succeed. anon_resvmem() checks that the
2285 2368 * availrmem is greater than swapfs_minfree, and the number of reserved
2286 2369 * swap pages. We also add a bit of extra here just to prevent
2287 2370 * circumstances from getting really dire.
2288 2371 */
2289 2372 if (availrmem < swapfs_minfree + swapfs_reserve + extra)
2290 2373 return (1);
2291 2374
2292 2375 /*
2293 2376 * Check that we have enough availrmem that memory locking (e.g., via
2294 2377 * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum
2295 2378 * stores the number of pages that cannot be locked; when availrmem
2296 2379 * drops below pages_pp_maximum, page locking mechanisms such as
2297 2380 * page_pp_lock() will fail.)
2298 2381 */
2299 2382 if (availrmem <= pages_pp_maximum)
2300 2383 return (1);
2301 2384
2302 2385 #if defined(__i386)
2303 2386 /*
2304 2387 * If we're on an i386 platform, it's possible that we'll exhaust the
2305 2388 * kernel heap space before we ever run out of available physical
2306 2389 * memory. Most checks of the size of the heap_area compare against
2307 2390 * tune.t_minarmem, which is the minimum available real memory that we
2308 2391 * can have in the system. However, this is generally fixed at 25 pages
2309 2392 * which is so low that it's useless. In this comparison, we seek to
2310 2393 * calculate the total heap-size, and reclaim if more than 3/4ths of the
2311 2394 * heap is allocated. (Or, in the calculation, if less than 1/4th is
2312 2395 * free)
2313 2396 */
2314 2397 if (vmem_size(heap_arena, VMEM_FREE) <
2315 2398 (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2))
2316 2399 return (1);
2317 2400 #endif
2318 2401
2319 2402 /*
2320 2403 * If zio data pages are being allocated out of a separate heap segment,
2321 2404 * then enforce that the size of available vmem for this arena remains
2322 2405 * above about 1/16th free.
2323 2406 *
2324 2407 * Note: The 1/16th arena free requirement was put in place
2325 2408 * to aggressively evict memory from the arc in order to avoid
2326 2409 * memory fragmentation issues.
2327 2410 */
2328 2411 if (zio_arena != NULL &&
2329 2412 vmem_size(zio_arena, VMEM_FREE) <
2330 2413 (vmem_size(zio_arena, VMEM_ALLOC) >> 4))
2331 2414 return (1);
2332 2415 #else
2333 2416 if (spa_get_random(100) == 0)
2334 2417 return (1);
2335 2418 #endif
2336 2419 return (0);
2337 2420 }
2338 2421
2339 2422 static void
2340 2423 arc_kmem_reap_now(arc_reclaim_strategy_t strat)
2341 2424 {
2342 2425 size_t i;
2343 2426 kmem_cache_t *prev_cache = NULL;
2344 2427 kmem_cache_t *prev_data_cache = NULL;
2345 2428 extern kmem_cache_t *zio_buf_cache[];
2346 2429 extern kmem_cache_t *zio_data_buf_cache[];
2347 2430 extern kmem_cache_t *range_seg_cache;
2348 2431
2349 2432 #ifdef _KERNEL
2350 2433 if (arc_meta_used >= arc_meta_limit) {
2351 2434 /*
2352 2435 * We are exceeding our meta-data cache limit.
2353 2436 * Purge some DNLC entries to release holds on meta-data.
2354 2437 */
2355 2438 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
2356 2439 }
2357 2440 #if defined(__i386)
2358 2441 /*
2359 2442 * Reclaim unused memory from all kmem caches.
2360 2443 */
2361 2444 kmem_reap();
2362 2445 #endif
2363 2446 #endif
2364 2447
2365 2448 /*
2366 2449 * An aggressive reclamation will shrink the cache size as well as
2367 2450 * reap free buffers from the arc kmem caches.
2368 2451 */
2369 2452 if (strat == ARC_RECLAIM_AGGR)
2370 2453 arc_shrink();
2371 2454
2372 2455 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
2373 2456 if (zio_buf_cache[i] != prev_cache) {
2374 2457 prev_cache = zio_buf_cache[i];
2375 2458 kmem_cache_reap_now(zio_buf_cache[i]);
2376 2459 }
2377 2460 if (zio_data_buf_cache[i] != prev_data_cache) {
2378 2461 prev_data_cache = zio_data_buf_cache[i];
2379 2462 kmem_cache_reap_now(zio_data_buf_cache[i]);
2380 2463 }
2381 2464 }
2382 2465 kmem_cache_reap_now(buf_cache);
2383 2466 kmem_cache_reap_now(hdr_cache);
2384 2467 kmem_cache_reap_now(range_seg_cache);
2385 2468
2386 2469 /*
2387 2470  * Ask the vmem arena to reclaim unused memory from its
2388 2471 * quantum caches.
2389 2472 */
2390 2473 if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR)
2391 2474 vmem_qcache_reap(zio_arena);
2392 2475 }
2393 2476
2477 +#define RECLAIMS_PER_SEC 20
2478 +#define STAT_UPDATES_PER_SEC 5
2479 +
2480 +/*
2481 + * During heavy use, the ARC naturally wants to oscillate its arc_c around
2482 + * a maximum memory pressure point which corresponds to the arc_reclaim_needed
2483 + * function evaluating to 1. This results in the arc_size slowly growing
2484 + * towards this reclaim_needed threshold and exceeding it periodically. Once
2485 + * this happens, both arc_c and arc_size are down-adjusted by the
2486 + * arc_reclaim_thread and kmem_reap is initiated. This is problematic on
2487 + * bigmem systems with a small recordsize (4k or 8k), because reaping a kmem
2488 + * cache which contains very large numbers of objects is extremely expensive
2489 + * from an xcall perspective (several seconds of heavy CPU use):
2490 + *
2491 + * (mem)
2492 + * ^ arc_reclaim_thread reacts
2493 + * | | |
2494 + * | V V
2495 + * |
2496 + * | + +
2497 + * | /| /|
2498 + * | ......./..|................/..|.............. arc_reclaim_needed threshold
2499 + * | / \_____________/ \___________/(etc)
2500 + * | / kmem reap kmem reap
2501 + * | /
2502 + * |/
2503 + * +----------------------------------------------------------------->
2504 + * (time)
2505 + *
2506 + * To help address this stairstep pattern, the arc_pressure_thread periodically
2507 + * gauges how close the current arc_size is to the arc_reclaim_needed threshold
2508 + * (see arc_mem_pressure) and evicts ahead of time via arc_reclaim_bytes.
2509 + */
2394 2510 static void
2511 +arc_pressure_thread(void)
2512 +{
2513 + clock_t last_update = ddi_get_lbolt();
2514 + callb_cpr_t cpr;
2515 +
2516 + CALLB_CPR_INIT(&cpr, &arc_pressure_thr_lock, callb_generic_cpr, FTAG);
2517 +
2518 + mutex_enter(&arc_pressure_thr_lock);
2519 + while (arc_pressure_thread_exit == 0) {
2520 + clock_t now;
2521 +
2522 + now = ddi_get_lbolt();
2523 + if (now - last_update >= hz / STAT_UPDATES_PER_SEC) {
2524 + uint64_t new_rate;
2525 +
2526 + new_rate = (atomic_swap_64(&arc_bytes_allocd, 0) *
2527 + hz) / (now - last_update);
2528 +
2529 + if (ARCSTAT(arcstat_growth_rate) < new_rate)
2530 + ARCSTAT(arcstat_growth_rate) = new_rate;
2531 + else
2532 + ARCSTAT_F_AVG(arcstat_growth_rate, new_rate, 4);
2533 + last_update = now;
2534 + }
2535 +
2536 + arc_pressure_threshold = arc_c - ARCSTAT(arcstat_growth_rate);
2537 + if (arc_size > arc_pressure_threshold) {
2538 + arc_reclaim_bytes(arc_size - arc_pressure_threshold);
2539 + }
2540 +
2541 + CALLB_CPR_SAFE_BEGIN(&cpr);
2542 + (void) cv_timedwait(&arc_pressure_thr_cv,
2543 + &arc_pressure_thr_lock,
2544 + ddi_get_lbolt() + hz / RECLAIMS_PER_SEC);
2545 + CALLB_CPR_SAFE_END(&cpr, &arc_pressure_thr_lock);
2546 + }
2547 +
2548 + arc_pressure_thread_exit = 0;
2549 + cv_broadcast(&arc_pressure_thr_cv);
2550 + CALLB_CPR_EXIT(&cpr); /* drops arc_pressure_thr_lock */
2551 + thread_exit();
2552 +}
2553 +
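The pressure thread samples arc_bytes_allocd, converts it to a per-second allocation rate, and folds it into arcstat_growth_rate: new maxima are taken immediately, while lower samples decay the stored rate. The sketch below models that filter under the assumption that ARCSTAT_F_AVG(stat, value, 4) keeps a weighted average with a 1/4 weight per update; the macro's definition is outside the hunk shown, so treat this as an approximation:

#include <stdint.h>
#include <stdio.h>

/*
 * Model of the growth-rate filter: jump to a new maximum at once,
 * decay toward lower samples with a 1/4 weight per update (the factor
 * the patch passes to ARCSTAT_F_AVG).
 */
static uint64_t
update_rate(uint64_t rate, uint64_t sample)
{
        if (rate < sample)
                return (sample);
        return (rate - rate / 4 + sample / 4);
}

int
main(void)
{
        uint64_t rate = 0;
        uint64_t samples[] = { 100, 400, 50, 50, 50, 50 };      /* MiB/s, made up */

        for (int i = 0; i < 6; i++) {
                rate = update_rate(rate, samples[i]);
                printf("sample %3llu MiB/s -> rate %3llu MiB/s\n",
                    (unsigned long long)samples[i], (unsigned long long)rate);
        }
        return (0);
}

The resulting rate is subtracted from arc_c to form arc_pressure_threshold, so the cache keeps roughly one peak second's worth of allocation headroom below arc_c, and arc_reclaim_bytes() trims back down to that line up to RECLAIMS_PER_SEC times per second.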
2554 +static void
2395 2555 arc_reclaim_thread(void)
2396 2556 {
2397 2557 clock_t growtime = 0;
2398 2558 arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS;
2399 2559 callb_cpr_t cpr;
2400 2560
2401 2561 CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2402 2562
2403 2563 mutex_enter(&arc_reclaim_thr_lock);
2404 2564 while (arc_thread_exit == 0) {
2405 2565 if (arc_reclaim_needed()) {
2406 2566
2407 2567 if (arc_no_grow) {
2408 2568 if (last_reclaim == ARC_RECLAIM_CONS) {
2409 2569 last_reclaim = ARC_RECLAIM_AGGR;
2410 2570 } else {
2411 2571 last_reclaim = ARC_RECLAIM_CONS;
2412 2572 }
2413 2573 } else {
2414 2574 arc_no_grow = TRUE;
2415 2575 last_reclaim = ARC_RECLAIM_AGGR;
2416 2576 membar_producer();
2417 2577 }
2418 2578
2419 2579 /* reset the growth delay for every reclaim */
2420 2580 growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
2421 2581
2422 2582 arc_kmem_reap_now(last_reclaim);
2423 2583 arc_warm = B_TRUE;
2424 2584
2425 2585 } else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
2426 2586 arc_no_grow = FALSE;
2427 2587 }
2428 2588
2429 2589 arc_adjust();
2430 2590
2431 2591 if (arc_eviction_list != NULL)
2432 2592 arc_do_user_evicts();
2433 2593
2434 2594 /* block until needed, or one second, whichever is shorter */
2435 2595 CALLB_CPR_SAFE_BEGIN(&cpr);
2436 2596 (void) cv_timedwait(&arc_reclaim_thr_cv,
2437 2597 &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
2438 2598 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
2439 2599 }
2440 2600
2441 2601 arc_thread_exit = 0;
2442 2602 cv_broadcast(&arc_reclaim_thr_cv);
2443 2603 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */
2444 2604 thread_exit();
2445 2605 }
2446 2606
2447 2607 /*
2448 2608 * Adapt arc info given the number of bytes we are trying to add and
2449 2609  * the state that we are coming from. This function is only called
2450 2610 * when we are adding new content to the cache.
2451 2611 */
2452 2612 static void
2453 2613 arc_adapt(int bytes, arc_state_t *state)
2454 2614 {
2455 2615 int mult;
2456 2616 uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
2457 2617
2458 2618 if (state == arc_l2c_only)
2459 2619 return;
2460 2620
2461 2621 ASSERT(bytes > 0);
2462 2622 /*
2463 2623 * Adapt the target size of the MRU list:
2464 2624 * - if we just hit in the MRU ghost list, then increase
2465 2625 * the target size of the MRU list.
2466 2626 * - if we just hit in the MFU ghost list, then increase
2467 2627 * the target size of the MFU list by decreasing the
2468 2628 * target size of the MRU list.
2469 2629 */
2470 2630 if (state == arc_mru_ghost) {
2471 2631 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
2472 2632 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
2473 2633 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
2474 2634
2475 2635 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
2476 2636 } else if (state == arc_mfu_ghost) {
2477 2637 uint64_t delta;
2478 2638
2479 2639 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2480 2640 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
2481 2641 mult = MIN(mult, 10);
2482 2642
2483 2643 delta = MIN(bytes * mult, arc_p);
2484 2644 arc_p = MAX(arc_p_min, arc_p - delta);
2485 2645 }
2486 2646 ASSERT((int64_t)arc_p >= 0);
2487 2647
2488 2648 if (arc_reclaim_needed()) {
2489 2649 cv_signal(&arc_reclaim_thr_cv);
2490 2650 return;
2491 2651 }
2492 2652
2493 2653 if (arc_no_grow)
2494 2654 return;
2495 2655
2496 2656 if (arc_c >= arc_c_max)
2497 2657 return;
2498 2658
2499 2659 /*
2500 2660 * If we're within (2 * maxblocksize) bytes of the target
2501 2661 * cache size, increment the target cache size
2502 2662 */
2503 - if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2663 + if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT) ||
2664 + (arc_size >= arc_pressure_threshold && arc_mem_pressure() == 0)) {
2504 2665 atomic_add_64(&arc_c, (int64_t)bytes);
2505 2666 if (arc_c > arc_c_max)
2506 2667 arc_c = arc_c_max;
2507 2668 else if (state == arc_anon)
2508 2669 atomic_add_64(&arc_p, (int64_t)bytes);
2509 2670 if (arc_p > arc_c)
2510 2671 arc_p = arc_c;
2511 2672 }
2512 2673 ASSERT((int64_t)arc_p >= 0);
2513 2674 }
2514 2675
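The new growth condition in arc_adapt() lets arc_c keep growing once arc_size has crossed arc_pressure_threshold, provided arc_mem_pressure() reports no pressure; previously growth happened only within 2 * SPA_MAXBLOCKSIZE of the target. A small example with hypothetical numbers showing where the two rules differ (names shadow the arc.c globals for readability only):

#include <stdint.h>
#include <stdio.h>

#define SPA_MAXBLOCKSHIFT       17      /* 128 KiB max block, as in this code base */

int
main(void)
{
        /* Hypothetical sizes, in bytes. */
        uint64_t arc_c = 8ULL << 30;                    /* 8 GiB target */
        uint64_t growth_rate = 200ULL << 20;            /* 200 MiB/s observed */
        uint64_t pressure_threshold = arc_c - growth_rate;
        uint64_t arc_size = arc_c - (100ULL << 20);     /* 100 MiB below arc_c */
        int mem_pressure = 0;                           /* arc_mem_pressure() == 0 */

        int old_rule = arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT);
        int new_rule = old_rule ||
            (arc_size >= pressure_threshold && mem_pressure == 0);

        printf("old rule grows arc_c: %d\n", old_rule); /* 0: still 100 MiB short */
        printf("new rule grows arc_c: %d\n", new_rule); /* 1: past the threshold */
        return (0);
}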
2515 2676 /*
2516 2677 * Check if the cache has reached its limits and eviction is required
2517 2678 * prior to insert.
2518 2679 */
2519 2680 static int
2520 2681 arc_evict_needed(arc_buf_contents_t type)
2521 2682 {
2522 2683 if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2523 2684 return (1);
2524 2685
2525 2686 if (arc_reclaim_needed())
2526 2687 return (1);
2527 2688
2528 2689 return (arc_size > arc_c);
2529 2690 }
2530 2691
2531 2692 /*
2532 2693 * The buffer, supplied as the first argument, needs a data block.
2533 2694 * So, if we are at cache max, determine which cache should be victimized.
2534 2695 * We have the following cases:
2535 2696 *
2536 2697 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
2537 2698 * In this situation if we're out of space, but the resident size of the MFU is
2538 2699 * under the limit, victimize the MFU cache to satisfy this insertion request.
2539 2700 *
2540 2701 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
2541 2702 * Here, we've used up all of the available space for the MRU, so we need to
2542 2703 * evict from our own cache instead. Evict from the set of resident MRU
2543 2704 * entries.
2544 2705 *
2545 2706 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
2546 2707 * c minus p represents the MFU space in the cache, since p is the size of the
2547 2708 * cache that is dedicated to the MRU. In this situation there's still space on
2548 2709 * the MFU side, so the MRU side needs to be victimized.
2549 2710 *
2550 2711 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
2551 2712 * MFU's resident set is consuming more space than it has been allotted. In
2552 2713 * this situation, we must victimize our own cache, the MFU, for this insertion.
2553 2714 */
2554 2715 static void
2555 2716 arc_get_data_buf(arc_buf_t *buf)
2556 2717 {
2557 2718 arc_state_t *state = buf->b_hdr->b_state;
2558 2719 uint64_t size = buf->b_hdr->b_size;
2559 2720 arc_buf_contents_t type = buf->b_hdr->b_type;
2560 2721
2561 2722 arc_adapt(size, state);
2562 2723
2563 2724 /*
2564 2725 * We have not yet reached cache maximum size,
2565 2726 * just allocate a new buffer.
2566 2727 */
2567 2728 if (!arc_evict_needed(type)) {
2568 2729 if (type == ARC_BUFC_METADATA) {
2569 2730 buf->b_data = zio_buf_alloc(size);
2570 2731 arc_space_consume(size, ARC_SPACE_DATA);
2571 2732 } else {
2572 2733 ASSERT(type == ARC_BUFC_DATA);
2573 2734 buf->b_data = zio_data_buf_alloc(size);
2574 2735 ARCSTAT_INCR(arcstat_data_size, size);
2575 2736 atomic_add_64(&arc_size, size);
2737 + atomic_add_64(&arc_bytes_allocd, size);
2576 2738 }
2577 2739 goto out;
2578 2740 }
2579 2741
2580 2742 /*
2581 2743 * If we are prefetching from the mfu ghost list, this buffer
2582 2744 * will end up on the mru list; so steal space from there.
2583 2745 */
2584 2746 if (state == arc_mfu_ghost)
2585 2747 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2586 2748 else if (state == arc_mru_ghost)
2587 2749 state = arc_mru;
2588 2750
2589 2751 if (state == arc_mru || state == arc_anon) {
2590 2752 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2591 2753 state = (arc_mfu->arcs_lsize[type] >= size &&
2592 2754 arc_p > mru_used) ? arc_mfu : arc_mru;
2593 2755 } else {
2594 2756 /* MFU cases */
2595 2757 uint64_t mfu_space = arc_c - arc_p;
2596 2758 state = (arc_mru->arcs_lsize[type] >= size &&
2597 2759 mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2598 2760 }
2599 2761 if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) {
2600 2762 if (type == ARC_BUFC_METADATA) {
2601 2763 buf->b_data = zio_buf_alloc(size);
2602 2764 arc_space_consume(size, ARC_SPACE_DATA);
2603 2765 } else {
2604 2766 ASSERT(type == ARC_BUFC_DATA);
2605 2767 buf->b_data = zio_data_buf_alloc(size);
2606 2768 ARCSTAT_INCR(arcstat_data_size, size);
2607 2769 atomic_add_64(&arc_size, size);
2770 + atomic_add_64(&arc_bytes_allocd, size);
2608 2771 }
2609 2772 ARCSTAT_BUMP(arcstat_recycle_miss);
2610 2773 }
2611 2774 ASSERT(buf->b_data != NULL);
2612 2775 out:
2613 2776 /*
2614 2777 * Update the state size. Note that ghost states have a
2615 2778 * "ghost size" and so don't need to be updated.
2616 2779 */
2617 2780 if (!GHOST_STATE(buf->b_hdr->b_state)) {
2618 2781 arc_buf_hdr_t *hdr = buf->b_hdr;
2619 2782
2620 2783 atomic_add_64(&hdr->b_state->arcs_size, size);
2621 2784 if (list_link_active(&hdr->b_arc_node)) {
2622 2785 ASSERT(refcount_is_zero(&hdr->b_refcnt));
2623 2786 atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2624 2787 }
2625 2788 /*
2626 2789 * If we are growing the cache, and we are adding anonymous
2627 2790 * data, and we have outgrown arc_p, update arc_p
2628 2791 */
2629 2792 if (arc_size < arc_c && hdr->b_state == arc_anon &&
2630 2793 arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
2631 2794 arc_p = MIN(arc_c, arc_p + size);
2632 2795 }
2633 2796 }
2634 2797
2635 2798 /*
2636 2799 * This routine is called whenever a buffer is accessed.
2637 2800 * NOTE: the hash lock is dropped in this function.
2638 2801 */
2639 2802 static void
2640 2803 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
2641 2804 {
2642 2805 clock_t now;
2643 2806
2644 2807 ASSERT(MUTEX_HELD(hash_lock));
2645 2808
2646 2809 if (buf->b_state == arc_anon) {
2647 2810 /*
2648 2811 * This buffer is not in the cache, and does not
2649 2812 * appear in our "ghost" list. Add the new buffer
2650 2813 * to the MRU state.
2651 2814 */
2652 2815
2653 2816 ASSERT(buf->b_arc_access == 0);
2654 2817 buf->b_arc_access = ddi_get_lbolt();
2655 2818 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2656 2819 arc_change_state(arc_mru, buf, hash_lock);
2657 2820
2658 2821 } else if (buf->b_state == arc_mru) {
2659 2822 now = ddi_get_lbolt();
2660 2823
2661 2824 /*
2662 2825 * If this buffer is here because of a prefetch, then either:
2663 2826 * - clear the flag if this is a "referencing" read
2664 2827 * (any subsequent access will bump this into the MFU state).
2665 2828 * or
2666 2829 * - move the buffer to the head of the list if this is
2667 2830 * another prefetch (to make it less likely to be evicted).
2668 2831 */
2669 2832 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2670 2833 if (refcount_count(&buf->b_refcnt) == 0) {
2671 2834 ASSERT(list_link_active(&buf->b_arc_node));
2672 2835 } else {
2673 2836 buf->b_flags &= ~ARC_PREFETCH;
2674 2837 ARCSTAT_BUMP(arcstat_mru_hits);
2675 2838 }
2676 2839 buf->b_arc_access = now;
2677 2840 return;
2678 2841 }
2679 2842
2680 2843 /*
2681 2844 * This buffer has been "accessed" only once so far,
2682 2845 * but it is still in the cache. Move it to the MFU
2683 2846 * state.
2684 2847 */
2685 2848 if (now > buf->b_arc_access + ARC_MINTIME) {
2686 2849 /*
2687 2850 * More than 125ms have passed since we
2688 2851 * instantiated this buffer. Move it to the
2689 2852 * most frequently used state.
2690 2853 */
2691 2854 buf->b_arc_access = now;
2692 2855 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2693 2856 arc_change_state(arc_mfu, buf, hash_lock);
2694 2857 }
2695 2858 ARCSTAT_BUMP(arcstat_mru_hits);
2696 2859 } else if (buf->b_state == arc_mru_ghost) {
2697 2860 arc_state_t *new_state;
2698 2861 /*
2699 2862 * This buffer has been "accessed" recently, but
2700 2863 * was evicted from the cache. Move it to the
2701 2864 * MFU state.
2702 2865 */
2703 2866
2704 2867 if (buf->b_flags & ARC_PREFETCH) {
2705 2868 new_state = arc_mru;
2706 2869 if (refcount_count(&buf->b_refcnt) > 0)
2707 2870 buf->b_flags &= ~ARC_PREFETCH;
2708 2871 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2709 2872 } else {
2710 2873 new_state = arc_mfu;
2711 2874 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2712 2875 }
2713 2876
2714 2877 buf->b_arc_access = ddi_get_lbolt();
2715 2878 arc_change_state(new_state, buf, hash_lock);
2716 2879
2717 2880 ARCSTAT_BUMP(arcstat_mru_ghost_hits);
2718 2881 } else if (buf->b_state == arc_mfu) {
2719 2882 /*
2720 2883 * This buffer has been accessed more than once and is
2721 2884 * still in the cache. Keep it in the MFU state.
2722 2885 *
2723 2886 * NOTE: an add_reference() that occurred when we did
2724 2887 * the arc_read() will have kicked this off the list.
2725 2888 * If it was a prefetch, we will explicitly move it to
2726 2889 * the head of the list now.
2727 2890 */
2728 2891 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2729 2892 ASSERT(refcount_count(&buf->b_refcnt) == 0);
2730 2893 ASSERT(list_link_active(&buf->b_arc_node));
2731 2894 }
2732 2895 ARCSTAT_BUMP(arcstat_mfu_hits);
2733 2896 buf->b_arc_access = ddi_get_lbolt();
2734 2897 } else if (buf->b_state == arc_mfu_ghost) {
2735 2898 arc_state_t *new_state = arc_mfu;
2736 2899 /*
2737 2900 * This buffer has been accessed more than once but has
2738 2901 * been evicted from the cache. Move it back to the
2739 2902 * MFU state.
2740 2903 */
2741 2904
2742 2905 if (buf->b_flags & ARC_PREFETCH) {
2743 2906 /*
2744 2907 * This is a prefetch access...
2745 2908 * move this block back to the MRU state.
2746 2909 */
2747 2910 ASSERT0(refcount_count(&buf->b_refcnt));
2748 2911 new_state = arc_mru;
2749 2912 }
2750 2913
2751 2914 buf->b_arc_access = ddi_get_lbolt();
2752 2915 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2753 2916 arc_change_state(new_state, buf, hash_lock);
2754 2917
2755 2918 ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
2756 2919 } else if (buf->b_state == arc_l2c_only) {
2757 2920 /*
2758 2921 * This buffer is on the 2nd Level ARC.
2759 2922 */
2760 2923
2761 2924 buf->b_arc_access = ddi_get_lbolt();
2762 2925 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2763 2926 arc_change_state(arc_mfu, buf, hash_lock);
2764 2927 } else {
2765 2928 ASSERT(!"invalid arc state");
2766 2929 }
2767 2930 }
2768 2931
2769 2932 /* a generic arc_done_func_t which you can use */
2770 2933 /* ARGSUSED */
2771 2934 void
2772 2935 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
2773 2936 {
2774 2937 if (zio == NULL || zio->io_error == 0)
2775 2938 bcopy(buf->b_data, arg, buf->b_hdr->b_size);
2776 2939 VERIFY(arc_buf_remove_ref(buf, arg));
2777 2940 }
2778 2941
2779 2942 /* a generic arc_done_func_t */
2780 2943 void
2781 2944 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
2782 2945 {
2783 2946 arc_buf_t **bufp = arg;
2784 2947 if (zio && zio->io_error) {
2785 2948 VERIFY(arc_buf_remove_ref(buf, arg));
2786 2949 *bufp = NULL;
2787 2950 } else {
2788 2951 *bufp = buf;
2789 2952 ASSERT(buf->b_data);
2790 2953 }
2791 2954 }
2792 2955
2793 2956 static void
2794 2957 arc_read_done(zio_t *zio)
2795 2958 {
2796 2959 arc_buf_hdr_t *hdr;
2797 2960 arc_buf_t *buf;
2798 2961 arc_buf_t *abuf; /* buffer we're assigning to callback */
2799 2962 kmutex_t *hash_lock = NULL;
2800 2963 arc_callback_t *callback_list, *acb;
2801 2964 int freeable = FALSE;
2802 2965
2803 2966 buf = zio->io_private;
2804 2967 hdr = buf->b_hdr;
2805 2968
2806 2969 /*
2807 2970 * The hdr was inserted into hash-table and removed from lists
2808 2971 * prior to starting I/O. We should find this header, since
2809 2972 * it's in the hash table, and it should be legit since it's
2810 2973 * not possible to evict it during the I/O. The only possible
2811 2974 * reason for it not to be found is if we were freed during the
2812 2975 * read.
2813 2976 */
2814 2977 if (HDR_IN_HASH_TABLE(hdr)) {
2815 2978 ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
2816 2979 ASSERT3U(hdr->b_dva.dva_word[0], ==,
2817 2980 BP_IDENTITY(zio->io_bp)->dva_word[0]);
2818 2981 ASSERT3U(hdr->b_dva.dva_word[1], ==,
2819 2982 BP_IDENTITY(zio->io_bp)->dva_word[1]);
2820 2983
2821 2984 arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp,
2822 2985 &hash_lock);
2823 2986
2824 2987 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) &&
2825 2988 hash_lock == NULL) ||
2826 2989 (found == hdr &&
2827 2990 DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
2828 2991 (found == hdr && HDR_L2_READING(hdr)));
2829 2992 }
2830 2993
2831 2994 hdr->b_flags &= ~ARC_L2_EVICTED;
2832 2995 if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
2833 2996 hdr->b_flags &= ~ARC_L2CACHE;
2834 2997
2835 2998 /* byteswap if necessary */
2836 2999 callback_list = hdr->b_acb;
2837 3000 ASSERT(callback_list != NULL);
2838 3001 if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
2839 3002 dmu_object_byteswap_t bswap =
2840 3003 DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
2841 3004 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
2842 3005 byteswap_uint64_array :
2843 3006 dmu_ot_byteswap[bswap].ob_func;
2844 3007 func(buf->b_data, hdr->b_size);
2845 3008 }
2846 3009
2847 3010 arc_cksum_compute(buf, B_FALSE);
2848 3011 arc_buf_watch(buf);
2849 3012
2850 3013 if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
2851 3014 /*
2852 3015 * Only call arc_access on anonymous buffers. This is because
2853 3016 * if we've issued an I/O for an evicted buffer, we've already
2854 3017 * called arc_access (to prevent any simultaneous readers from
2855 3018 * getting confused).
2856 3019 */
2857 3020 arc_access(hdr, hash_lock);
2858 3021 }
2859 3022
2860 3023 /* create copies of the data buffer for the callers */
2861 3024 abuf = buf;
2862 3025 for (acb = callback_list; acb; acb = acb->acb_next) {
2863 3026 if (acb->acb_done) {
2864 3027 if (abuf == NULL) {
2865 3028 ARCSTAT_BUMP(arcstat_duplicate_reads);
2866 3029 abuf = arc_buf_clone(buf);
2867 3030 }
2868 3031 acb->acb_buf = abuf;
2869 3032 abuf = NULL;
2870 3033 }
2871 3034 }
2872 3035 hdr->b_acb = NULL;
2873 3036 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
2874 3037 ASSERT(!HDR_BUF_AVAILABLE(hdr));
2875 3038 if (abuf == buf) {
2876 3039 ASSERT(buf->b_efunc == NULL);
2877 3040 ASSERT(hdr->b_datacnt == 1);
2878 3041 hdr->b_flags |= ARC_BUF_AVAILABLE;
2879 3042 }
2880 3043
2881 3044 ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
2882 3045
2883 3046 if (zio->io_error != 0) {
2884 3047 hdr->b_flags |= ARC_IO_ERROR;
2885 3048 if (hdr->b_state != arc_anon)
2886 3049 arc_change_state(arc_anon, hdr, hash_lock);
2887 3050 if (HDR_IN_HASH_TABLE(hdr))
2888 3051 buf_hash_remove(hdr);
2889 3052 freeable = refcount_is_zero(&hdr->b_refcnt);
2890 3053 }
2891 3054
2892 3055 /*
2893 3056 * Broadcast before we drop the hash_lock to avoid the possibility
2894 3057 * that the hdr (and hence the cv) might be freed before we get to
2895 3058 * the cv_broadcast().
2896 3059 */
2897 3060 cv_broadcast(&hdr->b_cv);
2898 3061
2899 3062 if (hash_lock) {
2900 3063 mutex_exit(hash_lock);
2901 3064 } else {
2902 3065 /*
2903 3066 * This block was freed while we waited for the read to
2904 3067 * complete. It has been removed from the hash table and
2905 3068 * moved to the anonymous state (so that it won't show up
2906 3069 * in the cache).
2907 3070 */
2908 3071 ASSERT3P(hdr->b_state, ==, arc_anon);
2909 3072 freeable = refcount_is_zero(&hdr->b_refcnt);
2910 3073 }
2911 3074
2912 3075 /* execute each callback and free its structure */
2913 3076 while ((acb = callback_list) != NULL) {
2914 3077 if (acb->acb_done)
2915 3078 acb->acb_done(zio, acb->acb_buf, acb->acb_private);
2916 3079
2917 3080 if (acb->acb_zio_dummy != NULL) {
2918 3081 acb->acb_zio_dummy->io_error = zio->io_error;
2919 3082 zio_nowait(acb->acb_zio_dummy);
2920 3083 }
2921 3084
2922 3085 callback_list = acb->acb_next;
2923 3086 kmem_free(acb, sizeof (arc_callback_t));
2924 3087 }
2925 3088
2926 3089 if (freeable)
2927 3090 arc_hdr_destroy(hdr);
2928 3091 }
2929 3092
2930 3093 /*
2931 3094 * "Read" the block at the specified DVA (in bp) via the
2932 3095 * cache. If the block is found in the cache, invoke the provided
2933 3096 * callback immediately and return. Note that the `zio' parameter
2934 3097 * in the callback will be NULL in this case, since no IO was
2935 3098 * required. If the block is not in the cache, pass the read request
2936 3099 * on to the spa with a substitute callback function, so that the
2937 3100 * requested block will be added to the cache.
2938 3101 *
2939 3102 * If a read request arrives for a block that has a read in-progress,
2940 3103 * either wait for the in-progress read to complete (and return the
2941 3104 * results); or, if this is a read with a "done" func, add a record
2942 3105 * to the read to invoke the "done" func when the read completes,
2943 3106 * and return; or just return.
2944 3107 *
2945 3108 * arc_read_done() will invoke all the requested "done" functions
2946 3109 * for readers of this block.
2947 3110 */
2948 3111 int
2949 3112 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
2950 3113 void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags,
2951 3114 const zbookmark_phys_t *zb)
2952 3115 {
2953 3116 arc_buf_hdr_t *hdr = NULL;
2954 3117 arc_buf_t *buf = NULL;
2955 3118 kmutex_t *hash_lock = NULL;
2956 3119 zio_t *rzio;
2957 3120 uint64_t guid = spa_load_guid(spa);
2958 3121
2959 3122 ASSERT(!BP_IS_EMBEDDED(bp) ||
2960 3123 BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
2961 3124
2962 3125 top:
2963 3126 if (!BP_IS_EMBEDDED(bp)) {
2964 3127 /*
2965 3128 * Embedded BP's have no DVA and require no I/O to "read".
2966 3129 * Create an anonymous arc buf to back it.
2967 3130 */
2968 3131 hdr = buf_hash_find(guid, bp, &hash_lock);
2969 3132 }
2970 3133
2971 3134 if (hdr != NULL && hdr->b_datacnt > 0) {
2972 3135
2973 3136 *arc_flags |= ARC_CACHED;
2974 3137
2975 3138 if (HDR_IO_IN_PROGRESS(hdr)) {
2976 3139
2977 3140 if (*arc_flags & ARC_WAIT) {
2978 3141 cv_wait(&hdr->b_cv, hash_lock);
2979 3142 mutex_exit(hash_lock);
2980 3143 goto top;
2981 3144 }
2982 3145 ASSERT(*arc_flags & ARC_NOWAIT);
2983 3146
2984 3147 if (done) {
2985 3148 arc_callback_t *acb = NULL;
2986 3149
2987 3150 acb = kmem_zalloc(sizeof (arc_callback_t),
2988 3151 KM_SLEEP);
2989 3152 acb->acb_done = done;
2990 3153 acb->acb_private = private;
2991 3154 if (pio != NULL)
2992 3155 acb->acb_zio_dummy = zio_null(pio,
2993 3156 spa, NULL, NULL, NULL, zio_flags);
2994 3157
2995 3158 ASSERT(acb->acb_done != NULL);
2996 3159 acb->acb_next = hdr->b_acb;
2997 3160 hdr->b_acb = acb;
2998 3161 add_reference(hdr, hash_lock, private);
2999 3162 mutex_exit(hash_lock);
3000 3163 return (0);
3001 3164 }
3002 3165 mutex_exit(hash_lock);
3003 3166 return (0);
3004 3167 }
3005 3168
3006 3169 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3007 3170
3008 3171 if (done) {
3009 3172 add_reference(hdr, hash_lock, private);
3010 3173 /*
3011 3174 * If this block is already in use, create a new
3012 3175 * copy of the data so that we will be guaranteed
3013 3176 * that arc_release() will always succeed.
3014 3177 */
3015 3178 buf = hdr->b_buf;
3016 3179 ASSERT(buf);
3017 3180 ASSERT(buf->b_data);
3018 3181 if (HDR_BUF_AVAILABLE(hdr)) {
3019 3182 ASSERT(buf->b_efunc == NULL);
3020 3183 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3021 3184 } else {
3022 3185 buf = arc_buf_clone(buf);
3023 3186 }
3024 3187
3025 3188 } else if (*arc_flags & ARC_PREFETCH &&
3026 3189 refcount_count(&hdr->b_refcnt) == 0) {
3027 3190 hdr->b_flags |= ARC_PREFETCH;
3028 3191 }
3029 3192 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
3030 3193 arc_access(hdr, hash_lock);
3031 3194 if (*arc_flags & ARC_L2CACHE)
3032 3195 hdr->b_flags |= ARC_L2CACHE;
3033 3196 if (*arc_flags & ARC_L2COMPRESS)
3034 3197 hdr->b_flags |= ARC_L2COMPRESS;
3035 3198 mutex_exit(hash_lock);
3036 3199 ARCSTAT_BUMP(arcstat_hits);
3037 3200 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3038 3201 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3039 3202 data, metadata, hits);
3040 3203
3041 3204 if (done)
3042 3205 done(NULL, buf, private);
3043 3206 } else {
3044 3207 uint64_t size = BP_GET_LSIZE(bp);
3045 3208 arc_callback_t *acb;
3046 3209 vdev_t *vd = NULL;
3047 3210 uint64_t addr = 0;
3048 3211 boolean_t devw = B_FALSE;
3049 3212 enum zio_compress b_compress = ZIO_COMPRESS_OFF;
3050 3213 uint64_t b_asize = 0;
3051 3214
3052 3215 if (hdr == NULL) {
3053 3216 /* this block is not in the cache */
3054 3217 arc_buf_hdr_t *exists = NULL;
3055 3218 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
3056 3219 buf = arc_buf_alloc(spa, size, private, type);
3057 3220 hdr = buf->b_hdr;
3058 3221 if (!BP_IS_EMBEDDED(bp)) {
3059 3222 hdr->b_dva = *BP_IDENTITY(bp);
3060 3223 hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
3061 3224 hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
3062 3225 exists = buf_hash_insert(hdr, &hash_lock);
3063 3226 }
3064 3227 if (exists != NULL) {
3065 3228 /* somebody beat us to the hash insert */
3066 3229 mutex_exit(hash_lock);
3067 3230 buf_discard_identity(hdr);
3068 3231 (void) arc_buf_remove_ref(buf, private);
3069 3232 goto top; /* restart the IO request */
3070 3233 }
3071 3234 /* if this is a prefetch, we don't have a reference */
3072 3235 if (*arc_flags & ARC_PREFETCH) {
3073 3236 (void) remove_reference(hdr, hash_lock,
3074 3237 private);
3075 3238 hdr->b_flags |= ARC_PREFETCH;
3076 3239 }
3077 3240 if (*arc_flags & ARC_L2CACHE)
3078 3241 hdr->b_flags |= ARC_L2CACHE;
3079 3242 if (*arc_flags & ARC_L2COMPRESS)
3080 3243 hdr->b_flags |= ARC_L2COMPRESS;
3081 3244 if (BP_GET_LEVEL(bp) > 0)
3082 3245 hdr->b_flags |= ARC_INDIRECT;
3083 3246 } else {
3084 3247 /* this block is in the ghost cache */
3085 3248 ASSERT(GHOST_STATE(hdr->b_state));
3086 3249 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3087 3250 ASSERT0(refcount_count(&hdr->b_refcnt));
3088 3251 ASSERT(hdr->b_buf == NULL);
3089 3252
3090 3253 /* if this is a prefetch, we don't have a reference */
3091 3254 if (*arc_flags & ARC_PREFETCH)
3092 3255 hdr->b_flags |= ARC_PREFETCH;
3093 3256 else
3094 3257 add_reference(hdr, hash_lock, private);
3095 3258 if (*arc_flags & ARC_L2CACHE)
3096 3259 hdr->b_flags |= ARC_L2CACHE;
3097 3260 if (*arc_flags & ARC_L2COMPRESS)
3098 3261 hdr->b_flags |= ARC_L2COMPRESS;
3099 3262 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
3100 3263 buf->b_hdr = hdr;
3101 3264 buf->b_data = NULL;
3102 3265 buf->b_efunc = NULL;
3103 3266 buf->b_private = NULL;
3104 3267 buf->b_next = NULL;
3105 3268 hdr->b_buf = buf;
3106 3269 ASSERT(hdr->b_datacnt == 0);
3107 3270 hdr->b_datacnt = 1;
3108 3271 arc_get_data_buf(buf);
3109 3272 arc_access(hdr, hash_lock);
3110 3273 }
3111 3274
3112 3275 ASSERT(!GHOST_STATE(hdr->b_state));
3113 3276
3114 3277 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
3115 3278 acb->acb_done = done;
3116 3279 acb->acb_private = private;
3117 3280
3118 3281 ASSERT(hdr->b_acb == NULL);
3119 3282 hdr->b_acb = acb;
3120 3283 hdr->b_flags |= ARC_IO_IN_PROGRESS;
3121 3284
3122 3285 if (hdr->b_l2hdr != NULL &&
3123 3286 (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
3124 3287 devw = hdr->b_l2hdr->b_dev->l2ad_writing;
3125 3288 addr = hdr->b_l2hdr->b_daddr;
3126 3289 b_compress = hdr->b_l2hdr->b_compress;
3127 3290 b_asize = hdr->b_l2hdr->b_asize;
3128 3291 /*
3129 3292 * Lock out device removal.
3130 3293 */
3131 3294 if (vdev_is_dead(vd) ||
3132 3295 !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
3133 3296 vd = NULL;
3134 3297 }
3135 3298
3136 3299 if (hash_lock != NULL)
3137 3300 mutex_exit(hash_lock);
3138 3301
3139 3302 /*
3140 3303 * At this point, we have a level 1 cache miss. Try again in
3141 3304 * L2ARC if possible.
3142 3305 */
3143 3306 ASSERT3U(hdr->b_size, ==, size);
3144 3307 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
3145 3308 uint64_t, size, zbookmark_phys_t *, zb);
3146 3309 ARCSTAT_BUMP(arcstat_misses);
3147 3310 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3148 3311 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3149 3312 data, metadata, misses);
3150 3313
3151 3314 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
3152 3315 /*
3153 3316 * Read from the L2ARC if the following are true:
3154 3317 * 1. The L2ARC vdev was previously cached.
3155 3318 * 2. This buffer still has L2ARC metadata.
3156 3319 * 3. This buffer isn't currently writing to the L2ARC.
3157 3320 * 4. The L2ARC entry wasn't evicted, which may
3158 3321 * also have invalidated the vdev.
3159 3322 * 5. This isn't a prefetch while l2arc_noprefetch is set.
3160 3323 */
3161 3324 if (hdr->b_l2hdr != NULL &&
3162 3325 !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
3163 3326 !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
3164 3327 l2arc_read_callback_t *cb;
3165 3328
3166 3329 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
3167 3330 ARCSTAT_BUMP(arcstat_l2_hits);
3168 3331
3169 3332 cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
3170 3333 KM_SLEEP);
3171 3334 cb->l2rcb_buf = buf;
3172 3335 cb->l2rcb_spa = spa;
3173 3336 cb->l2rcb_bp = *bp;
3174 3337 cb->l2rcb_zb = *zb;
3175 3338 cb->l2rcb_flags = zio_flags;
3176 3339 cb->l2rcb_compress = b_compress;
3177 3340
3178 3341 ASSERT(addr >= VDEV_LABEL_START_SIZE &&
3179 3342 addr + size < vd->vdev_psize -
3180 3343 VDEV_LABEL_END_SIZE);
3181 3344
3182 3345 /*
3183 3346 * l2arc read. The SCL_L2ARC lock will be
3184 3347 * released by l2arc_read_done().
3185 3348 * Issue a null zio if the underlying buffer
3186 3349 * was squashed to zero size by compression.
3187 3350 */
3188 3351 if (b_compress == ZIO_COMPRESS_EMPTY) {
3189 3352 rzio = zio_null(pio, spa, vd,
3190 3353 l2arc_read_done, cb,
3191 3354 zio_flags | ZIO_FLAG_DONT_CACHE |
3192 3355 ZIO_FLAG_CANFAIL |
3193 3356 ZIO_FLAG_DONT_PROPAGATE |
3194 3357 ZIO_FLAG_DONT_RETRY);
3195 3358 } else {
3196 3359 rzio = zio_read_phys(pio, vd, addr,
3197 3360 b_asize, buf->b_data,
3198 3361 ZIO_CHECKSUM_OFF,
3199 3362 l2arc_read_done, cb, priority,
3200 3363 zio_flags | ZIO_FLAG_DONT_CACHE |
3201 3364 ZIO_FLAG_CANFAIL |
3202 3365 ZIO_FLAG_DONT_PROPAGATE |
3203 3366 ZIO_FLAG_DONT_RETRY, B_FALSE);
3204 3367 }
3205 3368 DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
3206 3369 zio_t *, rzio);
3207 3370 ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize);
3208 3371
3209 3372 if (*arc_flags & ARC_NOWAIT) {
3210 3373 zio_nowait(rzio);
3211 3374 return (0);
3212 3375 }
3213 3376
3214 3377 ASSERT(*arc_flags & ARC_WAIT);
3215 3378 if (zio_wait(rzio) == 0)
3216 3379 return (0);
3217 3380
3218 3381 /* l2arc read error; goto zio_read() */
3219 3382 } else {
3220 3383 DTRACE_PROBE1(l2arc__miss,
3221 3384 arc_buf_hdr_t *, hdr);
3222 3385 ARCSTAT_BUMP(arcstat_l2_misses);
3223 3386 if (HDR_L2_WRITING(hdr))
3224 3387 ARCSTAT_BUMP(arcstat_l2_rw_clash);
3225 3388 spa_config_exit(spa, SCL_L2ARC, vd);
3226 3389 }
3227 3390 } else {
3228 3391 if (vd != NULL)
3229 3392 spa_config_exit(spa, SCL_L2ARC, vd);
3230 3393 if (l2arc_ndev != 0) {
3231 3394 DTRACE_PROBE1(l2arc__miss,
3232 3395 arc_buf_hdr_t *, hdr);
3233 3396 ARCSTAT_BUMP(arcstat_l2_misses);
3234 3397 }
3235 3398 }
3236 3399
3237 3400 rzio = zio_read(pio, spa, bp, buf->b_data, size,
3238 3401 arc_read_done, buf, priority, zio_flags, zb);
3239 3402
3240 3403 if (*arc_flags & ARC_WAIT)
3241 3404 return (zio_wait(rzio));
3242 3405
3243 3406 ASSERT(*arc_flags & ARC_NOWAIT);
3244 3407 zio_nowait(rzio);
3245 3408 }
3246 3409 return (0);
3247 3410 }
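Editor's note: to make the calling contract described above concrete, here is a small caller-side illustration. It is not part of this webrev, uses only the interfaces visible in the listing (plus ZIO_PRIORITY_* and ZIO_FLAG_CANFAIL), compiles only inside the ZFS tree, and the names getbuf_done() and my_read_block() are hypothetical.

/*
 * Hypothetical done callback; the signature matches arc_done_func_t as
 * invoked above (done(NULL, buf, private) on a hit, acb_done(zio, ...)
 * from arc_read_done() after real I/O).
 */
static void
getbuf_done(zio_t *zio, arc_buf_t *buf, void *arg)
{
	arc_buf_t **bufp = arg;

	/* zio is NULL on a cache hit; otherwise check the I/O result */
	if (zio != NULL && zio->io_error != 0) {
		(void) arc_buf_remove_ref(buf, arg);
		*bufp = NULL;
	} else {
		*bufp = buf;
	}
}

static int
my_read_block(spa_t *spa, const blkptr_t *bp, const zbookmark_phys_t *zb)
{
	arc_buf_t *abuf = NULL;
	uint32_t aflags = ARC_WAIT;
	int err;

	/* blocking read: returns once the data is cached and verified */
	err = arc_read(NULL, spa, bp, getbuf_done, &abuf,
	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);
	if (err == 0 && abuf != NULL) {
		/* ... consume abuf->b_data ... */
		(void) arc_buf_remove_ref(abuf, &abuf);
	}

	/* fire-and-forget prefetch: no callback, no reference is kept */
	aflags = ARC_NOWAIT | ARC_PREFETCH;
	(void) arc_read(NULL, spa, bp, NULL, NULL,
	    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);

	return (err);
}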
3248 3411
3249 3412 void
3250 3413 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
3251 3414 {
3252 3415 ASSERT(buf->b_hdr != NULL);
3253 3416 ASSERT(buf->b_hdr->b_state != arc_anon);
3254 3417 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
3255 3418 ASSERT(buf->b_efunc == NULL);
3256 3419 ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
3257 3420
3258 3421 buf->b_efunc = func;
3259 3422 buf->b_private = private;
3260 3423 }
3261 3424
3262 3425 /*
3263 3426 * Notify the arc that a block was freed, and thus will never be used again.
3264 3427 */
3265 3428 void
3266 3429 arc_freed(spa_t *spa, const blkptr_t *bp)
3267 3430 {
3268 3431 arc_buf_hdr_t *hdr;
3269 3432 kmutex_t *hash_lock;
3270 3433 uint64_t guid = spa_load_guid(spa);
3271 3434
3272 3435 ASSERT(!BP_IS_EMBEDDED(bp));
3273 3436
3274 3437 hdr = buf_hash_find(guid, bp, &hash_lock);
3275 3438 if (hdr == NULL)
3276 3439 return;
3277 3440 if (HDR_BUF_AVAILABLE(hdr)) {
3278 3441 arc_buf_t *buf = hdr->b_buf;
3279 3442 add_reference(hdr, hash_lock, FTAG);
3280 3443 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3281 3444 mutex_exit(hash_lock);
3282 3445
3283 3446 arc_release(buf, FTAG);
3284 3447 (void) arc_buf_remove_ref(buf, FTAG);
3285 3448 } else {
3286 3449 mutex_exit(hash_lock);
3287 3450 }
3288 3451
3289 3452 }
3290 3453
3291 3454 /*
3292 3455 * Clear the user eviction callback set by arc_set_callback(), first calling
3293 3456 * it if it exists. Because the presence of a callback keeps an arc_buf cached,
3294 3457 * clearing the callback may result in the arc_buf being destroyed. However,
3295 3458 * it will not result in the *last* arc_buf being destroyed, hence the data
3296 3459 * will remain cached in the ARC. We take local copies of the callback and
3297 3460 * its argument here so that we can invoke it without holding any locks.
3298 3461 *
3299 3462 * It's possible that the callback is already in the process of being cleared
3300 3463 * by another thread. In this case we can not clear the callback.
3301 3464 *
3302 3465 * Returns B_TRUE if the callback was successfully called and cleared.
3303 3466 */
3304 3467 boolean_t
3305 3468 arc_clear_callback(arc_buf_t *buf)
3306 3469 {
3307 3470 arc_buf_hdr_t *hdr;
3308 3471 kmutex_t *hash_lock;
3309 3472 arc_evict_func_t *efunc = buf->b_efunc;
3310 3473 void *private = buf->b_private;
3311 3474
3312 3475 mutex_enter(&buf->b_evict_lock);
3313 3476 hdr = buf->b_hdr;
3314 3477 if (hdr == NULL) {
3315 3478 /*
3316 3479 * We are in arc_do_user_evicts().
3317 3480 */
3318 3481 ASSERT(buf->b_data == NULL);
3319 3482 mutex_exit(&buf->b_evict_lock);
3320 3483 return (B_FALSE);
3321 3484 } else if (buf->b_data == NULL) {
3322 3485 /*
3323 3486 * We are on the eviction list; process this buffer now
3324 3487 * but let arc_do_user_evicts() do the reaping.
3325 3488 */
3326 3489 buf->b_efunc = NULL;
3327 3490 mutex_exit(&buf->b_evict_lock);
3328 3491 VERIFY0(efunc(private));
3329 3492 return (B_TRUE);
3330 3493 }
3331 3494 hash_lock = HDR_LOCK(hdr);
3332 3495 mutex_enter(hash_lock);
3333 3496 hdr = buf->b_hdr;
3334 3497 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3335 3498
3336 3499 ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
3337 3500 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3338 3501
3339 3502 buf->b_efunc = NULL;
3340 3503 buf->b_private = NULL;
3341 3504
3342 3505 if (hdr->b_datacnt > 1) {
3343 3506 mutex_exit(&buf->b_evict_lock);
3344 3507 arc_buf_destroy(buf, FALSE, TRUE);
3345 3508 } else {
3346 3509 ASSERT(buf == hdr->b_buf);
3347 3510 hdr->b_flags |= ARC_BUF_AVAILABLE;
3348 3511 mutex_exit(&buf->b_evict_lock);
3349 3512 }
3350 3513
3351 3514 mutex_exit(hash_lock);
3352 3515 VERIFY0(efunc(private));
3353 3516 return (B_TRUE);
3354 3517 }
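Editor's note: the set/clear contract described above is easiest to see from the consumer's side. The fragment below is an illustration only (not part of this webrev); my_obj_t, obj and my_evict_cb are hypothetical names, and the callback signature follows the efunc(private) usage visible in arc_clear_callback() above.

/* Hypothetical eviction callback; returns 0 as VERIFY0(efunc(private)) expects. */
static int
my_evict_cb(void *private)
{
	my_obj_t *obj = private;

	/* the buffer is going away; drop our cached pointer to it */
	obj->obj_buf = NULL;
	return (0);
}

/* after a successful read (while still holding a reference on buf) ... */
arc_set_callback(buf, my_evict_cb, obj);
obj->obj_buf = buf;

/* ... and on teardown, invoke-and-clear the callback ourselves if still set */
if (obj->obj_buf != NULL)
	(void) arc_clear_callback(obj->obj_buf);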
3355 3518
3356 3519 /*
3357 3520 * Release this buffer from the cache, making it an anonymous buffer. This
3358 3521 * must be done after a read and prior to modifying the buffer contents.
3359 3522 * If the buffer has more than one reference, we must make
3360 3523 * a new hdr for the buffer.
3361 3524 */
3362 3525 void
3363 3526 arc_release(arc_buf_t *buf, void *tag)
3364 3527 {
3365 3528 arc_buf_hdr_t *hdr;
3366 3529 kmutex_t *hash_lock = NULL;
3367 3530 l2arc_buf_hdr_t *l2hdr;
3368 3531 uint64_t buf_size;
3369 3532
3370 3533 /*
3371 3534 * It would be nice to assert that if it's DMU metadata (level >
3372 3535 * 0 || it's the dnode file), then it must be syncing context.
3373 3536 * But we don't know that information at this level.
3374 3537 */
3375 3538
3376 3539 mutex_enter(&buf->b_evict_lock);
3377 3540 hdr = buf->b_hdr;
3378 3541
3379 3542 /* this buffer is not on any list */
3380 3543 ASSERT(refcount_count(&hdr->b_refcnt) > 0);
3381 3544
3382 3545 if (hdr->b_state == arc_anon) {
3383 3546 /* this buffer is already released */
3384 3547 ASSERT(buf->b_efunc == NULL);
3385 3548 } else {
3386 3549 hash_lock = HDR_LOCK(hdr);
3387 3550 mutex_enter(hash_lock);
3388 3551 hdr = buf->b_hdr;
3389 3552 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3390 3553 }
3391 3554
3392 3555 l2hdr = hdr->b_l2hdr;
3393 3556 if (l2hdr) {
3394 3557 mutex_enter(&l2arc_buflist_mtx);
3395 3558 hdr->b_l2hdr = NULL;
3396 3559 list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
3397 3560 }
3398 3561 buf_size = hdr->b_size;
3399 3562
3400 3563 /*
3401 3564 * Do we have more than one buf?
3402 3565 */
3403 3566 if (hdr->b_datacnt > 1) {
3404 3567 arc_buf_hdr_t *nhdr;
3405 3568 arc_buf_t **bufp;
3406 3569 uint64_t blksz = hdr->b_size;
3407 3570 uint64_t spa = hdr->b_spa;
3408 3571 arc_buf_contents_t type = hdr->b_type;
3409 3572 uint32_t flags = hdr->b_flags;
3410 3573
3411 3574 ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
3412 3575 /*
3413 3576 * Pull the data off of this hdr and attach it to
3414 3577 * a new anonymous hdr.
3415 3578 */
3416 3579 (void) remove_reference(hdr, hash_lock, tag);
3417 3580 bufp = &hdr->b_buf;
3418 3581 while (*bufp != buf)
3419 3582 bufp = &(*bufp)->b_next;
3420 3583 *bufp = buf->b_next;
3421 3584 buf->b_next = NULL;
3422 3585
3423 3586 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
3424 3587 atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
3425 3588 if (refcount_is_zero(&hdr->b_refcnt)) {
3426 3589 uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
3427 3590 ASSERT3U(*size, >=, hdr->b_size);
3428 3591 atomic_add_64(size, -hdr->b_size);
3429 3592 }
3430 3593
3431 3594 /*
3432 3595 * We're releasing a duplicate user data buffer, update
3433 3596 * our statistics accordingly.
3434 3597 */
3435 3598 if (hdr->b_type == ARC_BUFC_DATA) {
3436 3599 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
3437 3600 ARCSTAT_INCR(arcstat_duplicate_buffers_size,
3438 3601 -hdr->b_size);
3439 3602 }
3440 3603 hdr->b_datacnt -= 1;
3441 3604 arc_cksum_verify(buf);
3442 3605 arc_buf_unwatch(buf);
3443 3606
3444 3607 mutex_exit(hash_lock);
3445 3608
3446 3609 nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
3447 3610 nhdr->b_size = blksz;
3448 3611 nhdr->b_spa = spa;
3449 3612 nhdr->b_type = type;
3450 3613 nhdr->b_buf = buf;
3451 3614 nhdr->b_state = arc_anon;
3452 3615 nhdr->b_arc_access = 0;
3453 3616 nhdr->b_flags = flags & ARC_L2_WRITING;
3454 3617 nhdr->b_l2hdr = NULL;
3455 3618 nhdr->b_datacnt = 1;
3456 3619 nhdr->b_freeze_cksum = NULL;
3457 3620 (void) refcount_add(&nhdr->b_refcnt, tag);
3458 3621 buf->b_hdr = nhdr;
3459 3622 mutex_exit(&buf->b_evict_lock);
3460 3623 atomic_add_64(&arc_anon->arcs_size, blksz);
3461 3624 } else {
3462 3625 mutex_exit(&buf->b_evict_lock);
3463 3626 ASSERT(refcount_count(&hdr->b_refcnt) == 1);
3464 3627 ASSERT(!list_link_active(&hdr->b_arc_node));
3465 3628 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3466 3629 if (hdr->b_state != arc_anon)
3467 3630 arc_change_state(arc_anon, hdr, hash_lock);
3468 3631 hdr->b_arc_access = 0;
3469 3632 if (hash_lock)
3470 3633 mutex_exit(hash_lock);
3471 3634
3472 3635 buf_discard_identity(hdr);
3473 3636 arc_buf_thaw(buf);
3474 3637 }
3475 3638 buf->b_efunc = NULL;
3476 3639 buf->b_private = NULL;
3477 3640
3478 3641 if (l2hdr) {
3479 3642 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
3480 3643 if (l2hdr->b_dev->l2ad_vdev)
3481 3644 vdev_space_update(l2hdr->b_dev->l2ad_vdev,
3482 3645 -l2hdr->b_asize, 0, 0);
3483 3646 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
3484 3647 ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3485 3648 mutex_exit(&l2arc_buflist_mtx);
3486 3649 }
3487 3650 }
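Editor's note: the comment above states the write-path ordering: a cached buffer must be released (made anonymous) before its contents are modified. A minimal illustration of that ordering follows; it is not part of this webrev, and obj, obj_buf, obj_size and new_data are hypothetical names. The tag passed to arc_release() must be the same tag under which the reference was taken (the private argument given to arc_read()).

if (!arc_released(obj->obj_buf))
	arc_release(obj->obj_buf, obj);		/* detach from the cached hdr */

ASSERT(arc_released(obj->obj_buf));

/* the buffer is now anonymous and private to this caller; safe to dirty */
bcopy(new_data, obj->obj_buf->b_data, obj->obj_size);

/* a later arc_write() will give it a new identity and re-hash it */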
3488 3651
3489 3652 int
3490 3653 arc_released(arc_buf_t *buf)
3491 3654 {
3492 3655 int released;
3493 3656
3494 3657 mutex_enter(&buf->b_evict_lock);
3495 3658 released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
3496 3659 mutex_exit(&buf->b_evict_lock);
3497 3660 return (released);
3498 3661 }
3499 3662
3500 3663 #ifdef ZFS_DEBUG
3501 3664 int
3502 3665 arc_referenced(arc_buf_t *buf)
3503 3666 {
3504 3667 int referenced;
3505 3668
3506 3669 mutex_enter(&buf->b_evict_lock);
3507 3670 referenced = (refcount_count(&buf->b_hdr->b_refcnt));
3508 3671 mutex_exit(&buf->b_evict_lock);
3509 3672 return (referenced);
3510 3673 }
3511 3674 #endif
3512 3675
3513 3676 static void
3514 3677 arc_write_ready(zio_t *zio)
3515 3678 {
3516 3679 arc_write_callback_t *callback = zio->io_private;
3517 3680 arc_buf_t *buf = callback->awcb_buf;
3518 3681 arc_buf_hdr_t *hdr = buf->b_hdr;
3519 3682
3520 3683 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
3521 3684 callback->awcb_ready(zio, buf, callback->awcb_private);
3522 3685
3523 3686 /*
3524 3687 * If the IO is already in progress, then this is a re-write
3525 3688 * attempt, so we need to thaw and re-compute the cksum.
3526 3689 * It is the responsibility of the callback to handle the
3527 3690 * accounting for any re-write attempt.
3528 3691 */
3529 3692 if (HDR_IO_IN_PROGRESS(hdr)) {
3530 3693 mutex_enter(&hdr->b_freeze_lock);
3531 3694 if (hdr->b_freeze_cksum != NULL) {
3532 3695 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3533 3696 hdr->b_freeze_cksum = NULL;
3534 3697 }
3535 3698 mutex_exit(&hdr->b_freeze_lock);
3536 3699 }
3537 3700 arc_cksum_compute(buf, B_FALSE);
3538 3701 hdr->b_flags |= ARC_IO_IN_PROGRESS;
3539 3702 }
3540 3703
3541 3704 /*
3542 3705 * The SPA calls this callback for each physical write that happens on behalf
3543 3706 * of a logical write. See the comment in dbuf_write_physdone() for details.
3544 3707 */
3545 3708 static void
3546 3709 arc_write_physdone(zio_t *zio)
3547 3710 {
3548 3711 arc_write_callback_t *cb = zio->io_private;
3549 3712 if (cb->awcb_physdone != NULL)
3550 3713 cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
3551 3714 }
3552 3715
3553 3716 static void
3554 3717 arc_write_done(zio_t *zio)
3555 3718 {
3556 3719 arc_write_callback_t *callback = zio->io_private;
3557 3720 arc_buf_t *buf = callback->awcb_buf;
3558 3721 arc_buf_hdr_t *hdr = buf->b_hdr;
3559 3722
3560 3723 ASSERT(hdr->b_acb == NULL);
3561 3724
3562 3725 if (zio->io_error == 0) {
3563 3726 if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
3564 3727 buf_discard_identity(hdr);
3565 3728 } else {
3566 3729 hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3567 3730 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
3568 3731 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3569 3732 }
3570 3733 } else {
3571 3734 ASSERT(BUF_EMPTY(hdr));
3572 3735 }
3573 3736
3574 3737 /*
3575 3738 * If the block to be written was all-zero or compressed enough to be
3576 3739 * embedded in the BP, no write was performed so there will be no
3577 3740 * dva/birth/checksum. The buffer must therefore remain anonymous
3578 3741 * (and uncached).
3579 3742 */
3580 3743 if (!BUF_EMPTY(hdr)) {
3581 3744 arc_buf_hdr_t *exists;
3582 3745 kmutex_t *hash_lock;
3583 3746
3584 3747 ASSERT(zio->io_error == 0);
3585 3748
3586 3749 arc_cksum_verify(buf);
3587 3750
3588 3751 exists = buf_hash_insert(hdr, &hash_lock);
3589 3752 if (exists) {
3590 3753 /*
3591 3754 * This can only happen if we overwrite for
3592 3755 * sync-to-convergence, because we remove
3593 3756 * buffers from the hash table when we arc_free().
3594 3757 */
3595 3758 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
3596 3759 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3597 3760 panic("bad overwrite, hdr=%p exists=%p",
3598 3761 (void *)hdr, (void *)exists);
3599 3762 ASSERT(refcount_is_zero(&exists->b_refcnt));
3600 3763 arc_change_state(arc_anon, exists, hash_lock);
3601 3764 mutex_exit(hash_lock);
3602 3765 arc_hdr_destroy(exists);
3603 3766 exists = buf_hash_insert(hdr, &hash_lock);
3604 3767 ASSERT3P(exists, ==, NULL);
3605 3768 } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
3606 3769 /* nopwrite */
3607 3770 ASSERT(zio->io_prop.zp_nopwrite);
3608 3771 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3609 3772 panic("bad nopwrite, hdr=%p exists=%p",
3610 3773 (void *)hdr, (void *)exists);
3611 3774 } else {
3612 3775 /* Dedup */
3613 3776 ASSERT(hdr->b_datacnt == 1);
3614 3777 ASSERT(hdr->b_state == arc_anon);
3615 3778 ASSERT(BP_GET_DEDUP(zio->io_bp));
3616 3779 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
3617 3780 }
3618 3781 }
3619 3782 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3620 3783 /* if it's not anon, we are doing a scrub */
3621 3784 if (!exists && hdr->b_state == arc_anon)
3622 3785 arc_access(hdr, hash_lock);
3623 3786 mutex_exit(hash_lock);
3624 3787 } else {
3625 3788 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3626 3789 }
3627 3790
3628 3791 ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3629 3792 callback->awcb_done(zio, buf, callback->awcb_private);
3630 3793
3631 3794 kmem_free(callback, sizeof (arc_write_callback_t));
3632 3795 }
3633 3796
3634 3797 zio_t *
3635 3798 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3636 3799 blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
3637 3800 const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
3638 3801 arc_done_func_t *done, void *private, zio_priority_t priority,
3639 3802 int zio_flags, const zbookmark_phys_t *zb)
3640 3803 {
3641 3804 arc_buf_hdr_t *hdr = buf->b_hdr;
3642 3805 arc_write_callback_t *callback;
3643 3806 zio_t *zio;
3644 3807
3645 3808 ASSERT(ready != NULL);
3646 3809 ASSERT(done != NULL);
3647 3810 ASSERT(!HDR_IO_ERROR(hdr));
3648 3811 ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3649 3812 ASSERT(hdr->b_acb == NULL);
3650 3813 if (l2arc)
3651 3814 hdr->b_flags |= ARC_L2CACHE;
3652 3815 if (l2arc_compress)
3653 3816 hdr->b_flags |= ARC_L2COMPRESS;
3654 3817 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
3655 3818 callback->awcb_ready = ready;
3656 3819 callback->awcb_physdone = physdone;
3657 3820 callback->awcb_done = done;
3658 3821 callback->awcb_private = private;
3659 3822 callback->awcb_buf = buf;
3660 3823
3661 3824 zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
3662 3825 arc_write_ready, arc_write_physdone, arc_write_done, callback,
3663 3826 priority, zio_flags, zb);
3664 3827
3665 3828 return (zio);
3666 3829 }
3667 3830
3668 3831 static int
3669 3832 arc_memory_throttle(uint64_t reserve, uint64_t txg)
3670 3833 {
3671 3834 #ifdef _KERNEL
3672 3835 uint64_t available_memory = ptob(freemem);
3673 3836 static uint64_t page_load = 0;
3674 3837 static uint64_t last_txg = 0;
3675 3838
3676 3839 #if defined(__i386)
3677 3840 available_memory =
3678 3841 MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
3679 3842 #endif
3680 3843
3681 3844 if (freemem > physmem * arc_lotsfree_percent / 100)
3682 3845 return (0);
3683 3846
3684 3847 if (txg > last_txg) {
3685 3848 last_txg = txg;
3686 3849 page_load = 0;
3687 3850 }
3688 3851 /*
3689 3852 * If we are in pageout, we know that memory is already tight and
3690 3853 * the ARC is already going to be evicting, so we just want to
3691 3854 * continue to let page writes occur as quickly as possible.
3692 3855 */
3693 3856 if (curproc == proc_pageout) {
3694 3857 if (page_load > MAX(ptob(minfree), available_memory) / 4)
3695 3858 return (SET_ERROR(ERESTART));
3696 3859 /* Note: reserve is inflated, so we deflate */
3697 3860 page_load += reserve / 8;
3698 3861 return (0);
3699 3862 } else if (page_load > 0 && arc_reclaim_needed()) {
3700 3863 /* memory is low, delay before restarting */
3701 3864 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3702 3865 return (SET_ERROR(EAGAIN));
3703 3866 }
3704 3867 page_load = 0;
3705 3868 #endif
3706 3869 return (0);
3707 3870 }
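Editor's note: for a sense of the magnitudes in the two thresholds above, here is a small standalone program (plain userland C, not ZFS code) that replays them for an assumed 8 GB machine. It assumes the default arc_lotsfree_percent of 10 and ignores ptob(minfree) for brevity.

#include <stdio.h>
#include <inttypes.h>

int
main(void)
{
	uint64_t pagesize = 4096;
	uint64_t physmem = (8ULL << 30) / pagesize;	/* pages on an 8 GB box */
	uint64_t freemem = (700ULL << 20) / pagesize;	/* pages currently free */
	int arc_lotsfree_percent = 10;			/* assumed default */

	if (freemem > physmem * arc_lotsfree_percent / 100) {
		printf("plenty of free memory: no write throttling\n");
		return (0);
	}

	/*
	 * In pageout context each reservation is deflated by 8 and the
	 * accumulated load is cut off at a quarter of available memory.
	 */
	uint64_t available = freemem * pagesize;
	uint64_t cutoff = available / 4;
	printf("throttle zone: pageout gets ERESTART once deflated "
	    "reservations exceed %" PRIu64 " MB (~%" PRIu64 " MB inflated)\n",
	    cutoff >> 20, (cutoff * 8) >> 20);
	return (0);
}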
3708 3871
3709 3872 void
3710 3873 arc_tempreserve_clear(uint64_t reserve)
3711 3874 {
3712 3875 atomic_add_64(&arc_tempreserve, -reserve);
3713 3876 ASSERT((int64_t)arc_tempreserve >= 0);
3714 3877 }
3715 3878
3716 3879 int
3717 3880 arc_tempreserve_space(uint64_t reserve, uint64_t txg)
3718 3881 {
3719 3882 int error;
3720 3883 uint64_t anon_size;
3721 3884
3722 3885 if (reserve > arc_c/4 && !arc_no_grow)
3723 3886 arc_c = MIN(arc_c_max, reserve * 4);
3724 3887 if (reserve > arc_c)
3725 3888 return (SET_ERROR(ENOMEM));
3726 3889
3727 3890 /*
3728 3891 * Don't count loaned bufs as in flight dirty data to prevent long
3729 3892 * network delays from blocking transactions that are ready to be
3730 3893 * assigned to a txg.
3731 3894 */
3732 3895 anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
3733 3896
3734 3897 /*
3735 3898 * Writes will, almost always, require additional memory allocations
3736 3899 * in order to compress/encrypt/etc the data. We therefore need to
3737 3900 * make sure that there is sufficient available memory for this.
3738 3901 */
3739 3902 error = arc_memory_throttle(reserve, txg);
3740 3903 if (error != 0)
3741 3904 return (error);
3742 3905
3743 3906 /*
3744 3907 * Throttle writes when the amount of dirty data in the cache
3745 3908 * gets too large. We try to keep the cache less than half full
3746 3909 * of dirty blocks so that our sync times don't grow too large.
3747 3910 * Note: if two requests come in concurrently, we might let them
3748 3911 * both succeed, when one of them should fail. Not a huge deal.
3749 3912 */
3750 3913
3751 3914 if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
3752 3915 anon_size > arc_c / 4) {
3753 3916 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
3754 3917 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
3755 3918 arc_tempreserve>>10,
3756 3919 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
3757 3920 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
3758 3921 reserve>>10, arc_c>>10);
3759 3922 return (SET_ERROR(ERESTART));
3760 3923 }
3761 3924 atomic_add_64(&arc_tempreserve, reserve);
3762 3925 return (0);
3763 3926 }
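Editor's note: a worked numeric instance of the dirty-data throttle condition above (reserve + arc_tempreserve + anon_size > arc_c / 2 and anon_size > arc_c / 4). This is a standalone userland sketch, not ZFS code; all of the sizes are made-up example values.

#include <stdio.h>
#include <inttypes.h>

int
main(void)
{
	uint64_t arc_c = 4ULL << 30;			/* 4 GB cache target */
	uint64_t arc_tempreserve = 256ULL << 20;	/* already reserved */
	uint64_t anon_size = 1536ULL << 20;		/* dirty (anonymous) data */
	uint64_t reserve = 512ULL << 20;		/* this request */

	int throttle = (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
	    anon_size > arc_c / 4);

	printf("reserve=%" PRIu64 "M tempreserve=%" PRIu64 "M anon=%" PRIu64
	    "M arc_c=%" PRIu64 "M -> %s\n",
	    reserve >> 20, arc_tempreserve >> 20, anon_size >> 20, arc_c >> 20,
	    throttle ? "ERESTART (throttled)" : "accepted");
	return (0);
}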
3764 3927
3765 3928 /* Tunable, default is 64, which is essentially arbitrary */
3766 3929 int zfs_flush_ntasks = 64;
3767 3930
3768 3931 void
3769 3932 arc_init(void)
3770 3933 {
3771 3934 mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
3772 3935 cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
3773 3936
3937 + mutex_init(&arc_pressure_thr_lock, NULL, MUTEX_DEFAULT, NULL);
3938 + cv_init(&arc_pressure_thr_cv, NULL, CV_DEFAULT, NULL);
3939 +
3774 3940 /* Convert seconds to clock ticks */
3775 3941 arc_min_prefetch_lifespan = 1 * hz;
3776 3942
3777 3943 /* Start out with 1/8 of all memory */
3778 3944 arc_c = physmem * PAGESIZE / 8;
3779 3945
3780 3946 #ifdef _KERNEL
3781 3947 /*
3782 3948 * On architectures where the physical memory can be larger
3783 3949 * than the addressable space (intel in 32-bit mode), we may
3784 3950 * need to limit the cache to 1/8 of VM size.
3785 3951 */
3786 3952 arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
3787 3953 #endif
3788 3954
3955 + /* initial sensible value */
3956 + arc_pressure_threshold = arc_c;
3789 3957 /* set min cache to 1/32 of all memory, or 64MB, whichever is more */
3790 3958 arc_c_min = MAX(arc_c / 4, 64<<20);
3791 3959 /* set max to 3/4 of all memory, or all but 1GB, whichever is more */
3792 3960 if (arc_c * 8 >= 1<<30)
3793 3961 arc_c_max = (arc_c * 8) - (1<<30);
3794 3962 else
3795 3963 arc_c_max = arc_c_min;
3796 3964 arc_c_max = MAX(arc_c * 6, arc_c_max);
3797 3965
3798 3966 /*
3799 3967 * Allow the tunables to override our calculations if they are
3800 3968 * reasonable (i.e. over 64MB)
3801 3969 */
3802 3970 if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
3803 3971 arc_c_max = zfs_arc_max;
3804 3972 if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max)
3805 3973 arc_c_min = zfs_arc_min;
3806 3974
3807 3975 arc_c = arc_c_max;
3808 3976 arc_p = (arc_c >> 1);
3809 3977
3810 3978 /* limit meta-data to 1/4 of the arc capacity */
3811 3979 arc_meta_limit = arc_c_max / 4;
3812 3980
3813 3981 /* Allow the tunable to override if it is reasonable */
3814 3982 if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
3815 3983 arc_meta_limit = zfs_arc_meta_limit;
3816 3984
3817 3985 if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
3818 3986 arc_c_min = arc_meta_limit / 2;
3819 3987
3820 3988 if (zfs_arc_grow_retry > 0)
3821 3989 arc_grow_retry = zfs_arc_grow_retry;
3822 3990
3823 3991 if (zfs_arc_shrink_shift > 0)
3824 3992 arc_shrink_shift = zfs_arc_shrink_shift;
3825 3993
3826 3994 if (zfs_arc_p_min_shift > 0)
3827 3995 arc_p_min_shift = zfs_arc_p_min_shift;
3828 3996
3829 3997 /* if kmem_flags are set, lets try to use less memory */
3830 3998 if (kmem_debugging())
3831 3999 arc_c = arc_c / 2;
3832 4000 if (arc_c < arc_c_min)
3833 4001 arc_c = arc_c_min;
3834 4002
3835 4003 arc_anon = &ARC_anon;
3836 4004 arc_mru = &ARC_mru;
3837 4005 arc_mru_ghost = &ARC_mru_ghost;
3838 4006 arc_mfu = &ARC_mfu;
3839 4007 arc_mfu_ghost = &ARC_mfu_ghost;
3840 4008 arc_l2c_only = &ARC_l2c_only;
3841 4009 arc_size = 0;
3842 4010
3843 4011 mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3844 4012 mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3845 4013 mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3846 4014 mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3847 4015 mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3848 4016 mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3849 4017
3850 4018 list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
3851 4019 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3852 4020 list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
3853 4021 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3854 4022 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
3855 4023 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3856 4024 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
3857 4025 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3858 4026 list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
3859 4027 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3860 4028 list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
3861 4029 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3862 4030 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
3863 4031 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3864 4032 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
3865 4033 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3866 4034 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
3867 4035 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3868 4036 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
3869 4037 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3870 4038
3871 4039 arc_flush_taskq = taskq_create("arc_flush_tq",
3872 4040 max_ncpus, minclsyspri, 1, zfs_flush_ntasks, TASKQ_DYNAMIC);
3873 4041 buf_init();
3874 4042
3875 4043 arc_thread_exit = 0;
3876 4044 arc_eviction_list = NULL;
3877 4045 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
3878 4046 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
3879 4047
3880 4048 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
3881 4049 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
3882 4050
3883 4051 if (arc_ksp != NULL) {
3884 4052 arc_ksp->ks_data = &arc_stats;
3885 4053 kstat_install(arc_ksp);
3886 4054 }
3887 4055
3888 4056 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
3889 4057 TS_RUN, minclsyspri);
4058 + (void) thread_create(NULL, 0, arc_pressure_thread, NULL, 0, &p0,
4059 + TS_RUN, minclsyspri);
3890 4060
3891 4061 arc_dead = FALSE;
3892 4062 arc_warm = B_FALSE;
3893 4063
3894 4064 /*
3895 4065 * Calculate maximum amount of dirty data per pool.
3896 4066 *
3897 4067 * If it has been set by /etc/system, take that.
3898 4068 * Otherwise, use a percentage of physical memory defined by
3899 4069 * zfs_dirty_data_max_percent (default 10%) with a cap at
3900 4070 * zfs_dirty_data_max_max (default 4GB).
3901 4071 */
3902 4072 if (zfs_dirty_data_max == 0) {
3903 4073 zfs_dirty_data_max = physmem * PAGESIZE *
3904 4074 zfs_dirty_data_max_percent / 100;
3905 4075 zfs_dirty_data_max = MIN(zfs_dirty_data_max,
3906 4076 zfs_dirty_data_max_max);
3907 4077 }
3908 4078 }
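Editor's note: to see what the sizing arithmetic above works out to, here is a small standalone program (not part of the webrev) that replays it for a hypothetical 16 GB machine with no /etc/system overrides. Note that the new arc_pressure_threshold is seeded from arc_c while arc_c is still physmem/8, i.e. before arc_c is raised to arc_c_max.

#include <stdio.h>
#include <inttypes.h>

#define	MB	(1ULL << 20)
#define	GB	(1ULL << 30)
#define	MAXU(a, b)	((a) > (b) ? (a) : (b))
#define	MINU(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	uint64_t physmem_bytes = 16 * GB;	/* physmem * PAGESIZE */

	uint64_t arc_c = physmem_bytes / 8;
	uint64_t arc_pressure_threshold = arc_c;	/* seeded before arc_c grows */
	uint64_t arc_c_min = MAXU(arc_c / 4, 64 * MB);
	uint64_t arc_c_max = (arc_c * 8 >= GB) ? arc_c * 8 - GB : arc_c_min;
	arc_c_max = MAXU(arc_c * 6, arc_c_max);
	arc_c = arc_c_max;
	uint64_t arc_p = arc_c >> 1;
	uint64_t arc_meta_limit = arc_c_max / 4;
	uint64_t zfs_dirty_data_max = MINU(physmem_bytes * 10 / 100, 4 * GB);

	printf("arc_c_min=%" PRIu64 "M arc_c_max=%" PRIu64 "M "
	    "arc_c=%" PRIu64 "M arc_p=%" PRIu64 "M\n",
	    arc_c_min / MB, arc_c_max / MB, arc_c / MB, arc_p / MB);
	printf("arc_meta_limit=%" PRIu64 "M arc_pressure_threshold=%" PRIu64
	    "M zfs_dirty_data_max=%" PRIu64 "M\n",
	    arc_meta_limit / MB, arc_pressure_threshold / MB,
	    zfs_dirty_data_max / MB);
	return (0);
}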
3909 4079
3910 4080 void
3911 4081 arc_fini(void)
3912 4082 {
3913 4083 mutex_enter(&arc_reclaim_thr_lock);
3914 4084 arc_thread_exit = 1;
3915 4085 while (arc_thread_exit != 0)
3916 4086 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
3917 4087 mutex_exit(&arc_reclaim_thr_lock);
3918 4088
4089 + mutex_enter(&arc_pressure_thr_lock);
4090 + arc_pressure_thread_exit = 1;
4091 + while (arc_pressure_thread_exit != 0)
4092 + cv_wait(&arc_pressure_thr_cv, &arc_pressure_thr_lock);
4093 + mutex_exit(&arc_pressure_thr_lock);
4094 +
3919 4095 arc_flush(NULL);
3920 4096
3921 4097 arc_dead = TRUE;
3922 4098
3923 4099 if (arc_ksp != NULL) {
3924 4100 kstat_delete(arc_ksp);
3925 4101 arc_ksp = NULL;
3926 4102 }
3927 4103
3928 4104 mutex_destroy(&arc_eviction_mtx);
3929 4105 mutex_destroy(&arc_reclaim_thr_lock);
3930 4106 cv_destroy(&arc_reclaim_thr_cv);
4107 + mutex_destroy(&arc_pressure_thr_lock);
4108 + cv_destroy(&arc_pressure_thr_cv);
3931 4109
3932 4110 list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
3933 4111 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
3934 4112 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
3935 4113 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
3936 4114 list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
3937 4115 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
3938 4116 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
3939 4117 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
3940 4118
3941 4119 mutex_destroy(&arc_anon->arcs_mtx);
3942 4120 mutex_destroy(&arc_mru->arcs_mtx);
3943 4121 mutex_destroy(&arc_mru_ghost->arcs_mtx);
3944 4122 mutex_destroy(&arc_mfu->arcs_mtx);
3945 4123 mutex_destroy(&arc_mfu_ghost->arcs_mtx);
3946 4124 mutex_destroy(&arc_l2c_only->arcs_mtx);
3947 4125
3948 4126 taskq_destroy(arc_flush_taskq);
3949 4127 buf_fini();
3950 4128
3951 4129 ASSERT(arc_loaned_bytes == 0);
3952 4130 }
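Editor's note: the new shutdown handshake added to arc_fini() above (set arc_pressure_thread_exit, then cv_wait until it is cleared) implies a matching protocol on the thread side. The body of arc_pressure_thread() is outside the lines shown in this hunk, so the following is only a sketch of that protocol, modeled on the existing arc_reclaim_thread() pattern; the actual pressure-relief work is deliberately elided and not asserted here.

static void
arc_pressure_thread(void)
{
	callb_cpr_t cpr;

	CALLB_CPR_INIT(&cpr, &arc_pressure_thr_lock, callb_generic_cpr, FTAG);

	mutex_enter(&arc_pressure_thr_lock);
	while (arc_pressure_thread_exit == 0) {
		/* ... react to memory pressure here (elided) ... */

		CALLB_CPR_SAFE_BEGIN(&cpr);
		(void) cv_timedwait(&arc_pressure_thr_cv,
		    &arc_pressure_thr_lock, ddi_get_lbolt() + hz);
		CALLB_CPR_SAFE_END(&cpr, &arc_pressure_thr_lock);
	}

	/* acknowledge arc_fini() and exit */
	arc_pressure_thread_exit = 0;
	cv_broadcast(&arc_pressure_thr_cv);
	CALLB_CPR_EXIT(&cpr);		/* drops arc_pressure_thr_lock */
	thread_exit();
}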
3953 4131
3954 4132 /*
3955 4133 * Level 2 ARC
3956 4134 *
3957 4135 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
3958 4136 * It uses dedicated storage devices to hold cached data, which are populated
3959 4137 * using large infrequent writes. The main role of this cache is to boost
3960 4138 * the performance of random read workloads. The intended L2ARC devices
3961 4139 * include short-stroked disks, solid state disks, and other media with
3962 4140 * substantially faster read latency than disk.
3963 4141 *
3964 4142 * +-----------------------+
3965 4143 * | ARC |
3966 4144 * +-----------------------+
3967 4145 * | ^ ^
3968 4146 * | | |
3969 4147 * l2arc_feed_thread() arc_read()
3970 4148 * | | |
3971 4149 * | l2arc read |
3972 4150 * V | |
3973 4151 * +---------------+ |
3974 4152 * | L2ARC | |
3975 4153 * +---------------+ |
3976 4154 * | ^ |
3977 4155 * l2arc_write() | |
3978 4156 * | | |
3979 4157 * V | |
3980 4158 * +-------+ +-------+
3981 4159 * | vdev | | vdev |
3982 4160 * | cache | | cache |
3983 4161 * +-------+ +-------+
3984 4162 * +=========+ .-----.
3985 4163 * : L2ARC : |-_____-|
3986 4164 * : devices : | Disks |
3987 4165 * +=========+ `-_____-'
3988 4166 *
3989 4167 * Read requests are satisfied from the following sources, in order:
3990 4168 *
3991 4169 * 1) ARC
3992 4170 * 2) vdev cache of L2ARC devices
3993 4171 * 3) L2ARC devices
3994 4172 * 4) vdev cache of disks
3995 4173 * 5) disks
3996 4174 *
3997 4175 * Some L2ARC device types exhibit extremely slow write performance.
3998 4176 * To accommodate this, there are some significant differences between
3999 4177 * the L2ARC and traditional cache design:
4000 4178 *
4001 4179 * 1. There is no eviction path from the ARC to the L2ARC. Evictions from
4002 4180 * the ARC behave as usual, freeing buffers and placing headers on ghost
4003 4181 * lists. The ARC does not send buffers to the L2ARC during eviction as
4004 4182 * this would add inflated write latencies for all ARC memory pressure.
4005 4183 *
4006 4184 * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
4007 4185 * It does this by periodically scanning buffers from the eviction-end of
4008 4186 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
4009 4187 * not already there. It scans until a headroom of buffers is satisfied,
4010 4188 * which itself is a buffer for ARC eviction. If a compressible buffer is
4011 4189 * found during scanning and selected for writing to an L2ARC device, we
4012 4190 * temporarily boost scanning headroom during the next scan cycle to make
4013 4191 * sure we adapt to compression effects (which might significantly reduce
4014 4192 * the data volume we write to L2ARC). The thread that does this is
4015 4193 * l2arc_feed_thread(), illustrated below; example sizes are included to
4016 4194 * provide a better sense of ratio than this diagram:
4017 4195 *
4018 4196 * head --> tail
4019 4197 * +---------------------+----------+
4020 4198 * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC
4021 4199 * +---------------------+----------+ | o L2ARC eligible
4022 4200 * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer
4023 4201 * +---------------------+----------+ |
4024 4202 * 15.9 Gbytes ^ 32 Mbytes |
4025 4203 * headroom |
4026 4204 * l2arc_feed_thread()
4027 4205 * |
4028 4206 * l2arc write hand <--[oooo]--'
4029 4207 * | 8 Mbyte
4030 4208 * | write max
4031 4209 * V
4032 4210 * +==============================+
4033 4211 * L2ARC dev |####|#|###|###| |####| ... |
4034 4212 * +==============================+
4035 4213 * 32 Gbytes
4036 4214 *
4037 4215 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
4038 4216 * evicted, then the L2ARC has cached a buffer much sooner than it probably
4039 4217 * needed to, potentially wasting L2ARC device bandwidth and storage. It is
4040 4218 * safe to say that this is an uncommon case, since buffers at the end of
4041 4219 * the ARC lists have moved there due to inactivity.
4042 4220 *
4043 4221 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
4044 4222 * then the L2ARC simply misses copying some buffers. This serves as a
4045 4223 * pressure valve to prevent heavy read workloads from both stalling the ARC
4046 4224 * with waits and clogging the L2ARC with writes. This also helps prevent
4047 4225 * the potential for the L2ARC to churn if it attempts to cache content too
4048 4226 * quickly, such as during backups of the entire pool.
4049 4227 *
4050 4228 * 5. After system boot and before the ARC has filled main memory, there are
4051 4229 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
4052 4230 * lists can remain mostly static. Instead of searching from tail of these
4053 4231 * lists as pictured, the l2arc_feed_thread() will search from the list heads
4054 4232 * for eligible buffers, greatly increasing its chance of finding them.
4055 4233 *
4056 4234 * The L2ARC device write speed is also boosted during this time so that
4057 4235 * the L2ARC warms up faster. Since there have been no ARC evictions yet,
4058 4236 * there are no L2ARC reads, and no fear of degrading read performance
4059 4237 * through increased writes.
4060 4238 *
4061 4239 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
4062 4240 * the vdev queue can aggregate them into larger and fewer writes. Each
4063 4241 * device is written to in a rotor fashion, sweeping writes through
4064 4242 * available space then repeating.
4065 4243 *
4066 4244 * 7. The L2ARC does not store dirty content. It never needs to flush
4067 4245 * write buffers back to disk based storage.
4068 4246 *
4069 4247 * 8. If an ARC buffer is written (and dirtied) which also exists in the
4070 4248 * L2ARC, the now stale L2ARC buffer is immediately dropped.
4071 4249 *
4072 4250 * The performance of the L2ARC can be tweaked by a number of tunables, which
4073 4251 * may be necessary for different workloads:
4074 4252 *
4075 4253 * l2arc_write_max max write bytes per interval
4076 4254 * l2arc_write_boost extra write bytes during device warmup
4077 4255 * l2arc_noprefetch skip caching prefetched buffers
4078 4256 * l2arc_headroom number of max device writes to precache
4079 4257 * l2arc_headroom_boost when we find compressed buffers during ARC
4080 4258 * scanning, we multiply headroom by this
4081 4259 * percentage factor for the next scan cycle,
4082 4260 * since more compressed buffers are likely to
4083 4261 * be present
4084 4262 * l2arc_feed_secs seconds between L2ARC writing
4085 4263 *
4086 4264 * Tunables may be removed or added as future performance improvements are
4087 4265 * integrated, and also may become zpool properties.
4088 4266 *
4089 4267 * There are three key functions that control how the L2ARC warms up:
4090 4268 *
4091 4269 * l2arc_write_eligible() check if a buffer is eligible to cache
4092 4270 * l2arc_write_size() calculate how much to write
4093 4271 * l2arc_write_interval() calculate sleep delay between writes
4094 4272 *
4095 4273 * These three functions determine what to write, how much, and how quickly
4096 4274 * to send writes.
4097 4275 */
4098 4276
4099 4277 static boolean_t
4100 4278 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
4101 4279 {
4102 4280 /*
4103 4281 * A buffer is *not* eligible for the L2ARC if it:
4104 4282 * 1. belongs to a different spa.
4105 4283 * 2. is already cached on the L2ARC.
4106 4284 * 3. has an I/O in progress (it may be an incomplete read).
4107 4285 * 4. is flagged not eligible (zfs property).
4108 4286 */
4109 4287 if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL ||
4110 4288 HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab))
4111 4289 return (B_FALSE);
4112 4290
4113 4291 return (B_TRUE);
4114 4292 }
4115 4293
4116 4294 static uint64_t
4117 4295 l2arc_write_size(void)
4118 4296 {
4119 4297 uint64_t size;
4120 4298
4121 4299 /*
4122 4300 * Make sure our globals have meaningful values in case the user
4123 4301 * altered them.
4124 4302 */
4125 4303 size = l2arc_write_max;
4126 4304 if (size == 0) {
4127 4305 cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
4128 4306 "be greater than zero, resetting it to the default (%d)",
4129 4307 L2ARC_WRITE_SIZE);
4130 4308 size = l2arc_write_max = L2ARC_WRITE_SIZE;
4131 4309 }
4132 4310
4133 4311 if (arc_warm == B_FALSE)
4134 4312 size += l2arc_write_boost;
4135 4313
4136 4314 return (size);
4137 4315
4138 4316 }
4139 4317
4140 4318 static clock_t
4141 4319 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
4142 4320 {
4143 4321 clock_t interval, next, now;
4144 4322
4145 4323 /*
4146 4324 * If the ARC lists are busy, increase our write rate; if the
4147 4325 * lists are stale, idle back. This is achieved by checking
4148 4326 * how much we previously wrote - if it was more than half of
4149 4327 * what we wanted, schedule the next write much sooner.
4150 4328 */
4151 4329 if (l2arc_feed_again && wrote > (wanted / 2))
4152 4330 interval = (hz * l2arc_feed_min_ms) / 1000;
4153 4331 else
4154 4332 interval = hz * l2arc_feed_secs;
4155 4333
4156 4334 now = ddi_get_lbolt();
4157 4335 next = MAX(now, MIN(now + interval, began + interval));
4158 4336
4159 4337 return (next);
4160 4338 }
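Editor's note: the three helpers above are consumed by l2arc_feed_thread(), which lives later in this file, outside the shown hunk. The fragment below is only a sketch of how one pass composes them; device eviction ahead of the write hand and CPR handling are elided, and the l2arc_write_buffers() signature is assumed from this era of the code rather than taken from this webrev.

boolean_t headroom_boost = B_FALSE;
clock_t next = ddi_get_lbolt() + hz;

for (;;) {
	clock_t begin = ddi_get_lbolt();
	l2arc_dev_t *dev = l2arc_dev_get_next();	/* holds SCL_L2ARC */

	if (dev != NULL) {
		uint64_t size = l2arc_write_size();	/* how much to try */
		uint64_t wrote = l2arc_write_buffers(dev->l2ad_spa, dev,
		    size, &headroom_boost);

		/* busy lists -> write again soon; stale lists -> idle back */
		next = l2arc_write_interval(begin, size, wrote);
		spa_config_exit(dev->l2ad_spa, SCL_L2ARC, dev);
	}

	/* sleep until 'next' (cv_timedwait under the feed lock in reality) */
}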
4161 4339
4162 4340 static void
4163 4341 l2arc_hdr_stat_add(void)
4164 4342 {
4165 4343 ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
4166 4344 ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4167 4345 }
4168 4346
4169 4347 static void
4170 4348 l2arc_hdr_stat_remove(void)
4171 4349 {
4172 4350 ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
4173 4351 ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
4174 4352 }
4175 4353
4176 4354 /*
4177 4355 * Cycle through L2ARC devices. This is how L2ARC load balances.
4178 4356 * If a device is returned, this also returns holding the spa config lock.
4179 4357 */
4180 4358 static l2arc_dev_t *
4181 4359 l2arc_dev_get_next(void)
4182 4360 {
4183 4361 l2arc_dev_t *first, *next = NULL;
4184 4362
4185 4363 /*
4186 4364 * Lock out the removal of spas (spa_namespace_lock), then removal
4187 4365 * of cache devices (l2arc_dev_mtx). Once a device has been selected,
4188 4366 * both locks will be dropped and a spa config lock held instead.
4189 4367 */
4190 4368 mutex_enter(&spa_namespace_lock);
4191 4369 mutex_enter(&l2arc_dev_mtx);
4192 4370
4193 4371 /* if there are no vdevs, there is nothing to do */
4194 4372 if (l2arc_ndev == 0)
4195 4373 goto out;
4196 4374
4197 4375 first = NULL;
4198 4376 next = l2arc_dev_last;
4199 4377 do {
4200 4378 /* loop around the list looking for a non-faulted vdev */
4201 4379 if (next == NULL) {
4202 4380 next = list_head(l2arc_dev_list);
4203 4381 } else {
4204 4382 next = list_next(l2arc_dev_list, next);
4205 4383 if (next == NULL)
4206 4384 next = list_head(l2arc_dev_list);
4207 4385 }
4208 4386
4209 4387 /* if we have come back to the start, bail out */
4210 4388 if (first == NULL)
4211 4389 first = next;
4212 4390 else if (next == first)
4213 4391 break;
4214 4392
4215 4393 } while (vdev_is_dead(next->l2ad_vdev));
4216 4394
4217 4395 /* if we were unable to find any usable vdevs, return NULL */
4218 4396 if (vdev_is_dead(next->l2ad_vdev))
4219 4397 next = NULL;
4220 4398
4221 4399 l2arc_dev_last = next;
4222 4400
4223 4401 out:
4224 4402 mutex_exit(&l2arc_dev_mtx);
4225 4403
4226 4404 /*
4227 4405 * Grab the config lock to prevent the 'next' device from being
4228 4406 * removed while we are writing to it.
4229 4407 */
4230 4408 if (next != NULL)
4231 4409 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4232 4410 mutex_exit(&spa_namespace_lock);
4233 4411
4234 4412 return (next);
4235 4413 }
4236 4414
4237 4415 /*
4238 4416 * Free buffers that were tagged for destruction.
4239 4417 */
4240 4418 static void
4241 4419 l2arc_do_free_on_write()
4242 4420 {
4243 4421 list_t *buflist;
4244 4422 l2arc_data_free_t *df, *df_prev;
4245 4423
4246 4424 mutex_enter(&l2arc_free_on_write_mtx);
4247 4425 buflist = l2arc_free_on_write;
4248 4426
4249 4427 for (df = list_tail(buflist); df; df = df_prev) {
4250 4428 df_prev = list_prev(buflist, df);
4251 4429 ASSERT(df->l2df_data != NULL);
4252 4430 ASSERT(df->l2df_func != NULL);
4253 4431 df->l2df_func(df->l2df_data, df->l2df_size);
4254 4432 list_remove(buflist, df);
4255 4433 kmem_free(df, sizeof (l2arc_data_free_t));
4256 4434 }
4257 4435
4258 4436 mutex_exit(&l2arc_free_on_write_mtx);
4259 4437 }
4260 4438
4261 4439 /*
4262 4440 * A write to a cache device has completed. Update all headers to allow
4263 4441 * reads from these buffers to begin.
4264 4442 */
4265 4443 static void
4266 4444 l2arc_write_done(zio_t *zio)
4267 4445 {
4268 4446 l2arc_write_callback_t *cb;
4269 4447 l2arc_dev_t *dev;
4270 4448 list_t *buflist;
4271 4449 arc_buf_hdr_t *head, *ab, *ab_prev;
4272 4450 l2arc_buf_hdr_t *abl2;
4273 4451 kmutex_t *hash_lock;
4274 4452 int64_t bytes_dropped = 0;
4275 4453
4276 4454 cb = zio->io_private;
4277 4455 ASSERT(cb != NULL);
4278 4456 dev = cb->l2wcb_dev;
4279 4457 ASSERT(dev != NULL);
4280 4458 head = cb->l2wcb_head;
4281 4459 ASSERT(head != NULL);
4282 4460 buflist = dev->l2ad_buflist;
4283 4461 ASSERT(buflist != NULL);
4284 4462 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4285 4463 l2arc_write_callback_t *, cb);
4286 4464
4287 4465 if (zio->io_error != 0)
4288 4466 ARCSTAT_BUMP(arcstat_l2_writes_error);
4289 4467
4290 4468 mutex_enter(&l2arc_buflist_mtx);
4291 4469
4292 4470 /*
4293 4471 * All writes completed, or an error was hit.
4294 4472 */
4295 4473 for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4296 4474 ab_prev = list_prev(buflist, ab);
4297 4475 abl2 = ab->b_l2hdr;
4298 4476
4299 4477 /*
4300 4478 * Release the temporary compressed buffer as soon as possible.
4301 4479 */
4302 4480 if (abl2->b_compress != ZIO_COMPRESS_OFF)
4303 4481 l2arc_release_cdata_buf(ab);
4304 4482
4305 4483 hash_lock = HDR_LOCK(ab);
4306 4484 if (!mutex_tryenter(hash_lock)) {
4307 4485 /*
4308 4486 * This buffer misses out. It may be in a stage
4309 4487 * of eviction. Its ARC_L2_WRITING flag will be
4310 4488 * left set, denying reads to this buffer.
4311 4489 */
4312 4490 ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4313 4491 continue;
4314 4492 }
4315 4493
4316 4494 if (zio->io_error != 0) {
4317 4495 /*
4318 4496 * Error - drop L2ARC entry.
4319 4497 */
4320 4498 list_remove(buflist, ab);
4321 4499 ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4322 4500 bytes_dropped += abl2->b_asize;
4323 4501 ab->b_l2hdr = NULL;
4324 4502 kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4325 4503 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4326 4504 }
4327 4505
4328 4506 /*
4329 4507 * Allow ARC to begin reads to this L2ARC entry.
4330 4508 */
4331 4509 ab->b_flags &= ~ARC_L2_WRITING;
4332 4510
4333 4511 mutex_exit(hash_lock);
4334 4512 }
4335 4513
4336 4514 atomic_inc_64(&l2arc_writes_done);
4337 4515 list_remove(buflist, head);
4338 4516 kmem_cache_free(hdr_cache, head);
4339 4517 mutex_exit(&l2arc_buflist_mtx);
4340 4518
4341 4519 vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
4342 4520
4343 4521 l2arc_do_free_on_write();
4344 4522
4345 4523 kmem_free(cb, sizeof (l2arc_write_callback_t));
4346 4524 }
4347 4525
4348 4526 /*
4349 4527 * A read to a cache device completed. Validate buffer contents before
4350 4528 * handing over to the regular ARC routines.
4351 4529 */
4352 4530 static void
4353 4531 l2arc_read_done(zio_t *zio)
4354 4532 {
4355 4533 l2arc_read_callback_t *cb;
4356 4534 arc_buf_hdr_t *hdr;
4357 4535 arc_buf_t *buf;
4358 4536 kmutex_t *hash_lock;
4359 4537 int equal;
4360 4538
4361 4539 ASSERT(zio->io_vd != NULL);
4362 4540 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4363 4541
4364 4542 spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
4365 4543
4366 4544 cb = zio->io_private;
4367 4545 ASSERT(cb != NULL);
4368 4546 buf = cb->l2rcb_buf;
4369 4547 ASSERT(buf != NULL);
4370 4548
4371 4549 hash_lock = HDR_LOCK(buf->b_hdr);
4372 4550 mutex_enter(hash_lock);
4373 4551 hdr = buf->b_hdr;
4374 4552 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4375 4553
4376 4554 /*
4377 4555 * If the buffer was compressed, decompress it first.
4378 4556 */
4379 4557 if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
4380 4558 l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
4381 4559 ASSERT(zio->io_data != NULL);
4382 4560
4383 4561 /*
4384 4562 * Check this survived the L2ARC journey.
4385 4563 */
4386 4564 equal = arc_cksum_equal(buf);
4387 4565 if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
4388 4566 mutex_exit(hash_lock);
4389 4567 zio->io_private = buf;
4390 4568 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
4391 4569 zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */
4392 4570 arc_read_done(zio);
4393 4571 } else {
4394 4572 mutex_exit(hash_lock);
4395 4573 /*
4396 4574 * Buffer didn't survive caching. Increment stats and
4397 4575 * reissue to the original storage device.
4398 4576 */
4399 4577 if (zio->io_error != 0) {
4400 4578 ARCSTAT_BUMP(arcstat_l2_io_error);
4401 4579 } else {
4402 4580 zio->io_error = SET_ERROR(EIO);
4403 4581 }
4404 4582 if (!equal)
4405 4583 ARCSTAT_BUMP(arcstat_l2_cksum_bad);
4406 4584
4407 4585 /*
4408 4586 * If there's no waiter, issue an async i/o to the primary
4409 4587 * storage now. If there *is* a waiter, the caller must
4410 4588 * issue the i/o in a context where it's OK to block.
4411 4589 */
4412 4590 if (zio->io_waiter == NULL) {
4413 4591 zio_t *pio = zio_unique_parent(zio);
4414 4592
4415 4593 ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
4416 4594
4417 4595 zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
4418 4596 buf->b_data, zio->io_size, arc_read_done, buf,
4419 4597 zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
4420 4598 }
4421 4599 }
4422 4600
4423 4601 kmem_free(cb, sizeof (l2arc_read_callback_t));
4424 4602 }
4425 4603
4426 4604 /*
4427 4605 * This is the list priority from which the L2ARC will search for pages to
4428 4606 * cache. This is used within loops (0..3) to cycle through lists in the
4429 4607 * desired order. This order can have a significant effect on cache
4430 4608 * performance.
4431 4609 *
4432 4610 * Currently the metadata lists are hit first, MFU then MRU, followed by
4433 4611 * the data lists. This function returns a locked list, and also returns
4434 4612 * the lock pointer.
4435 4613 */
4436 4614 static list_t *
4437 4615 l2arc_list_locked(int list_num, kmutex_t **lock)
4438 4616 {
4439 4617 list_t *list = NULL;
4440 4618
4441 4619 ASSERT(list_num >= 0 && list_num <= 3);
4442 4620
4443 4621 switch (list_num) {
4444 4622 case 0:
4445 4623 list = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
4446 4624 *lock = &arc_mfu->arcs_mtx;
4447 4625 break;
4448 4626 case 1:
4449 4627 list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
4450 4628 *lock = &arc_mru->arcs_mtx;
4451 4629 break;
4452 4630 case 2:
4453 4631 list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
4454 4632 *lock = &arc_mfu->arcs_mtx;
4455 4633 break;
4456 4634 case 3:
4457 4635 list = &arc_mru->arcs_list[ARC_BUFC_DATA];
4458 4636 *lock = &arc_mru->arcs_mtx;
4459 4637 break;
4460 4638 }
4461 4639
4462 4640 ASSERT(!(MUTEX_HELD(*lock)));
4463 4641 mutex_enter(*lock);
4464 4642 return (list);
4465 4643 }
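
As an illustration (not part of arc.c), the priority order above is consumed by a simple cycling loop; the sketch below mirrors how l2arc_write_buffers() later in this file walks the four lists, holding the returned lock only while scanning each one:

	for (int try = 0; try <= 3; try++) {
		kmutex_t *list_lock;
		list_t *list = l2arc_list_locked(try, &list_lock);

		/* scan 'list' here for headers eligible for L2ARC writing */

		mutex_exit(list_lock);
	}
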
4466 4644
4467 4645 /*
4468 4646 * Evict buffers from the device write hand to the distance specified in
4469 4647  * bytes. This distance may span populated buffers, or it may span nothing.
4470 4648 * This is clearing a region on the L2ARC device ready for writing.
4471 4649 * If the 'all' boolean is set, every buffer is evicted.
4472 4650 */
4473 4651 static void
4474 4652 _l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all,
4475 4653 boolean_t space_update)
4476 4654 {
4477 4655 list_t *buflist;
4478 4656 l2arc_buf_hdr_t *abl2;
4479 4657 arc_buf_hdr_t *ab, *ab_prev;
4480 4658 kmutex_t *hash_lock;
4481 4659 uint64_t taddr;
4482 4660 int64_t bytes_evicted = 0;
4483 4661
4484 4662 buflist = dev->l2ad_buflist;
4485 4663
4486 4664 if (buflist == NULL)
4487 4665 return;
4488 4666
4489 4667 if (!all && dev->l2ad_first) {
4490 4668 /*
4491 4669 * This is the first sweep through the device. There is
4492 4670 * nothing to evict.
4493 4671 */
4494 4672 return;
4495 4673 }
4496 4674
4497 4675 if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4498 4676 /*
4499 4677 * When nearing the end of the device, evict to the end
4500 4678 * before the device write hand jumps to the start.
4501 4679 */
4502 4680 taddr = dev->l2ad_end;
4503 4681 } else {
4504 4682 taddr = dev->l2ad_hand + distance;
4505 4683 }
4506 4684 DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4507 4685 uint64_t, taddr, boolean_t, all);
4508 4686
4509 4687 top:
4510 4688 mutex_enter(&l2arc_buflist_mtx);
4511 4689 for (ab = list_tail(buflist); ab; ab = ab_prev) {
4512 4690 ab_prev = list_prev(buflist, ab);
4513 4691
4514 4692 hash_lock = HDR_LOCK(ab);
4515 4693 if (!mutex_tryenter(hash_lock)) {
4516 4694 /*
4517 4695 * Missed the hash lock. Retry.
4518 4696 */
4519 4697 ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
4520 4698 mutex_exit(&l2arc_buflist_mtx);
4521 4699 mutex_enter(hash_lock);
4522 4700 mutex_exit(hash_lock);
4523 4701 goto top;
4524 4702 }
4525 4703
4526 4704 if (HDR_L2_WRITE_HEAD(ab)) {
4527 4705 /*
4528 4706 * We hit a write head node. Leave it for
4529 4707 * l2arc_write_done().
4530 4708 */
4531 4709 list_remove(buflist, ab);
4532 4710 mutex_exit(hash_lock);
4533 4711 continue;
4534 4712 }
4535 4713
4536 4714 if (!all && ab->b_l2hdr != NULL &&
4537 4715 (ab->b_l2hdr->b_daddr > taddr ||
4538 4716 ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
4539 4717 /*
4540 4718 * We've evicted to the target address,
4541 4719 * or the end of the device.
4542 4720 */
4543 4721 mutex_exit(hash_lock);
4544 4722 break;
4545 4723 }
4546 4724
4547 4725 if (HDR_FREE_IN_PROGRESS(ab)) {
4548 4726 /*
4549 4727 * Already on the path to destruction.
4550 4728 */
4551 4729 mutex_exit(hash_lock);
4552 4730 continue;
4553 4731 }
4554 4732
4555 4733 if (ab->b_state == arc_l2c_only) {
4556 4734 ASSERT(!HDR_L2_READING(ab));
4557 4735 /*
4558 4736 * This doesn't exist in the ARC. Destroy.
4559 4737 * arc_hdr_destroy() will call list_remove()
4560 4738 * and decrement arcstat_l2_size.
4561 4739 */
4562 4740 arc_change_state(arc_anon, ab, hash_lock);
4563 4741 arc_hdr_destroy(ab);
4564 4742 } else {
4565 4743 /*
4566 4744 * Invalidate issued or about to be issued
4567 4745 * reads, since we may be about to write
4568 4746 * over this location.
4569 4747 */
4570 4748 if (HDR_L2_READING(ab)) {
4571 4749 ARCSTAT_BUMP(arcstat_l2_evict_reading);
4572 4750 ab->b_flags |= ARC_L2_EVICTED;
4573 4751 }
4574 4752
4575 4753 /*
4576 4754 * Tell ARC this no longer exists in L2ARC.
4577 4755 */
4578 4756 if (ab->b_l2hdr != NULL) {
4579 4757 abl2 = ab->b_l2hdr;
4580 4758 ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4581 4759 bytes_evicted += abl2->b_asize;
4582 4760 ab->b_l2hdr = NULL;
4583 4761 kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4584 4762 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4585 4763 }
4586 4764 list_remove(buflist, ab);
4587 4765
4588 4766 /*
4589 4767 * This may have been leftover after a
4590 4768 * failed write.
4591 4769 */
4592 4770 ab->b_flags &= ~ARC_L2_WRITING;
4593 4771 }
4594 4772 mutex_exit(hash_lock);
4595 4773 }
4596 4774 mutex_exit(&l2arc_buflist_mtx);
4597 4775
4598 4776 /*
4599 4777 * Note: l2ad_vdev can only be touched if space_update is set,
4600 4778 * otherwise the vdev might have been removed by an async
4601 4779 * spa_unload.
4602 4780 */
4603 4781 if (space_update) {
4604 4782 vdev_space_update(dev->l2ad_vdev, -bytes_evicted, 0, 0);
4605 4783 dev->l2ad_evict = taddr;
4606 4784 }
4607 4785 }
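
A brief worked example of the target-address computation above, using hypothetical numbers rather than anything from the source:

	/*
	 * Suppose l2ad_end = 100 GB, l2ad_hand = 99 GB and distance = 2 GB.
	 * The hand is within 2 * distance of the end (99 >= 100 - 4), so
	 * taddr is set to l2ad_end and the region up to the end of the
	 * device is cleared; the write hand will then wrap to l2ad_start.
	 * Otherwise taddr would simply be l2ad_hand + distance.
	 */
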
4608 4786
4609 4787 /*
4610 4788  * Asynchronous task for eviction of all the buffers for this L2ARC device.
4611 4789  * The task is dispatched in l2arc_evict().
4612 4790 */
4613 4791 typedef struct {
4614 4792 l2arc_dev_t *dev;
4615 4793 } l2arc_evict_data_t;
4616 4794
4617 4795 static void
4618 4796 l2arc_evict_task(void *arg)
4619 4797 {
4620 4798 l2arc_evict_data_t *d = (l2arc_evict_data_t *)arg;
4621 4799 ASSERT(d && d->dev);
4622 4800
4623 4801 /*
4624 4802 * Evict l2arc buffers asynchronously; we need to keep the device
4625 4803 * around until we are sure there aren't any buffers referencing it.
4626 4804 * We do not need to hold any config locks, etc. because at this point,
4627 4805  * we are the only ones who know about this device (the in-core
4628 4806 * structure), so no new buffers can be created (e.g. if the pool is
4629 4807 * re-imported while the asynchronous eviction is in progress) that
4630 4808 * reference this same in-core structure. Also remove the vdev link
4631 4809  * since further use of it as an l2arc device is prohibited.
4632 4810 */
4633 4811 d->dev->l2ad_vdev = NULL;
4634 4812 _l2arc_evict(d->dev, 0LL, B_TRUE, B_FALSE);
4635 4813
4636 4814 /* Same cleanup as in the synchronous path */
4637 4815 list_destroy(d->dev->l2ad_buflist);
4638 4816 kmem_free(d->dev->l2ad_buflist, sizeof (list_t));
4639 4817 kmem_free(d->dev, sizeof (l2arc_dev_t));
4640 4818 /* Task argument cleanup */
4641 4819 kmem_free(arg, sizeof (l2arc_evict_data_t));
4642 4820 }
4643 4821
4644 4822 boolean_t zfs_l2arc_async_evict = B_TRUE;
4645 4823
4646 4824 /*
4647 4825  * Perform l2arc eviction for buffers associated with this device.
4648 4826  * If evicting all buffers (done at pool export time), try to evict
4649 4827  * asynchronously, and fall back to synchronous eviction on error.
4650 4828  * Tell the caller whether to clean up the device:
4651 4829  * - B_TRUE means "asynchronous eviction, do not clean up"
4652 4830  * - B_FALSE means "synchronous eviction, done, please clean up"
4653 4831 */
4654 4832 static boolean_t
4655 4833 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4656 4834 {
4657 4835 /*
4658 4836 * If we are evicting all the buffers for this device, which happens
4659 4837  * at pool export time, schedule an asynchronous task.
4660 4838 */
4661 4839 if (all && zfs_l2arc_async_evict) {
4662 4840 l2arc_evict_data_t *arg =
4663 4841 kmem_alloc(sizeof (l2arc_evict_data_t), KM_SLEEP);
4664 4842 arg->dev = dev;
4665 4843
4666 4844 dev->l2ad_evict = dev->l2ad_end;
4667 4845
4668 4846 if ((taskq_dispatch(arc_flush_taskq, l2arc_evict_task,
4669 4847 arg, TQ_NOSLEEP) == NULL)) {
4670 4848 /*
4671 4849  * Failed to dispatch the asynchronous task;
4672 4850  * clean up, evict synchronously, and avoid
4673 4851  * adjusting the vdev space a second time.
4674 4852 */
4675 4853 kmem_free(arg, sizeof (l2arc_evict_data_t));
4676 4854 _l2arc_evict(dev, distance, all, B_FALSE);
4677 4855 } else {
4678 4856 /*
4679 4857  * Successful dispatch; vdev space updated.
4680 4858 */
4681 4859 return (B_TRUE);
4682 4860 }
4683 4861 } else {
4684 4862 /* Evict synchronously */
4685 4863 _l2arc_evict(dev, distance, all, B_TRUE);
4686 4864 }
4687 4865
4688 4866 return (B_FALSE);
4689 4867 }
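
A minimal sketch of the caller-side contract described in the comment above; it mirrors the cleanup that l2arc_remove_vdev() performs further down in this file:

	if (l2arc_evict(remdev, 0, B_TRUE) == B_FALSE) {
		/* synchronous eviction: the caller owns the cleanup */
		list_destroy(remdev->l2ad_buflist);
		kmem_free(remdev->l2ad_buflist, sizeof (list_t));
		kmem_free(remdev, sizeof (l2arc_dev_t));
	}
	/* on B_TRUE, l2arc_evict_task() frees the buflist and device instead */
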
4690 4868
4691 4869 /*
4692 4870 * Find and write ARC buffers to the L2ARC device.
4693 4871 *
4694 4872 * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
4695 4873 * for reading until they have completed writing.
4696 4874 * The headroom_boost is an in-out parameter used to maintain headroom boost
4697 4875 * state between calls to this function.
4698 4876 *
4699 4877 * Returns the number of bytes actually written (which may be smaller than
4700 4878 * the delta by which the device hand has changed due to alignment).
4701 4879 */
4702 4880 static uint64_t
4703 4881 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
4704 4882 boolean_t *headroom_boost)
4705 4883 {
4706 4884 arc_buf_hdr_t *ab, *ab_prev, *head;
4707 4885 list_t *list;
4708 4886 uint64_t write_asize, write_psize, write_sz, headroom,
4709 4887 buf_compress_minsz;
4710 4888 void *buf_data;
4711 4889 kmutex_t *list_lock;
4712 4890 boolean_t full;
4713 4891 l2arc_write_callback_t *cb;
4714 4892 zio_t *pio, *wzio;
4715 4893 uint64_t guid = spa_load_guid(spa);
4716 4894 const boolean_t do_headroom_boost = *headroom_boost;
4717 4895
4718 4896 ASSERT(dev->l2ad_vdev != NULL);
4719 4897
4720 4898 /* Lower the flag now, we might want to raise it again later. */
4721 4899 *headroom_boost = B_FALSE;
4722 4900
4723 4901 pio = NULL;
4724 4902 write_sz = write_asize = write_psize = 0;
4725 4903 full = B_FALSE;
4726 4904 head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4727 4905 head->b_flags |= ARC_L2_WRITE_HEAD;
4728 4906
4729 4907 /*
4730 4908 * We will want to try to compress buffers that are at least 2x the
4731 4909 * device sector size.
4732 4910 */
4733 4911 buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
4734 4912
4735 4913 /*
4736 4914 * Copy buffers for L2ARC writing.
4737 4915 */
4738 4916 mutex_enter(&l2arc_buflist_mtx);
4739 4917 for (int try = 0; try <= 3; try++) {
4740 4918 uint64_t passed_sz = 0;
4741 4919
4742 4920 list = l2arc_list_locked(try, &list_lock);
4743 4921
4744 4922 /*
4745 4923 * L2ARC fast warmup.
4746 4924 *
4747 4925 * Until the ARC is warm and starts to evict, read from the
4748 4926 * head of the ARC lists rather than the tail.
4749 4927 */
4750 4928 if (arc_warm == B_FALSE)
4751 4929 ab = list_head(list);
4752 4930 else
4753 4931 ab = list_tail(list);
4754 4932
4755 4933 headroom = target_sz * l2arc_headroom;
4756 4934 if (do_headroom_boost)
4757 4935 headroom = (headroom * l2arc_headroom_boost) / 100;
4758 4936
4759 4937 for (; ab; ab = ab_prev) {
4760 4938 l2arc_buf_hdr_t *l2hdr;
4761 4939 kmutex_t *hash_lock;
4762 4940 uint64_t buf_sz;
4763 4941
4764 4942 if (arc_warm == B_FALSE)
4765 4943 ab_prev = list_next(list, ab);
4766 4944 else
4767 4945 ab_prev = list_prev(list, ab);
4768 4946
4769 4947 hash_lock = HDR_LOCK(ab);
4770 4948 if (!mutex_tryenter(hash_lock)) {
4771 4949 /*
4772 4950 * Skip this buffer rather than waiting.
4773 4951 */
4774 4952 continue;
4775 4953 }
4776 4954
4777 4955 passed_sz += ab->b_size;
4778 4956 if (passed_sz > headroom) {
4779 4957 /*
4780 4958 * Searched too far.
4781 4959 */
4782 4960 mutex_exit(hash_lock);
4783 4961 break;
4784 4962 }
4785 4963
4786 4964 if (!l2arc_write_eligible(guid, ab)) {
4787 4965 mutex_exit(hash_lock);
4788 4966 continue;
4789 4967 }
4790 4968
4791 4969 if ((write_sz + ab->b_size) > target_sz) {
4792 4970 full = B_TRUE;
4793 4971 mutex_exit(hash_lock);
4794 4972 break;
4795 4973 }
4796 4974
4797 4975 if (pio == NULL) {
4798 4976 /*
4799 4977 * Insert a dummy header on the buflist so
4800 4978 * l2arc_write_done() can find where the
4801 4979 * write buffers begin without searching.
4802 4980 */
4803 4981 list_insert_head(dev->l2ad_buflist, head);
4804 4982
4805 4983 cb = kmem_alloc(
4806 4984 sizeof (l2arc_write_callback_t), KM_SLEEP);
4807 4985 cb->l2wcb_dev = dev;
4808 4986 cb->l2wcb_head = head;
4809 4987 pio = zio_root(spa, l2arc_write_done, cb,
4810 4988 ZIO_FLAG_CANFAIL);
4811 4989 }
4812 4990
4813 4991 /*
4814 4992 * Create and add a new L2ARC header.
4815 4993 */
4816 4994 l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
4817 4995 l2hdr->b_dev = dev;
4818 4996 ab->b_flags |= ARC_L2_WRITING;
4819 4997
4820 4998 /*
4821 4999 * Temporarily stash the data buffer in b_tmp_cdata.
4822 5000 * The subsequent write step will pick it up from
4823 5001  * there. This is because we can't access ab->b_buf
4824 5002 * without holding the hash_lock, which we in turn
4825 5003 * can't access without holding the ARC list locks
4826 5004 * (which we want to avoid during compression/writing).
4827 5005 */
4828 5006 l2hdr->b_compress = ZIO_COMPRESS_OFF;
4829 5007 l2hdr->b_asize = ab->b_size;
4830 5008 l2hdr->b_tmp_cdata = ab->b_buf->b_data;
4831 5009
4832 5010 buf_sz = ab->b_size;
4833 5011 ab->b_l2hdr = l2hdr;
4834 5012
4835 5013 list_insert_head(dev->l2ad_buflist, ab);
4836 5014
4837 5015 /*
4838 5016 * Compute and store the buffer cksum before
4839 5017  * writing. On debug builds the cksum is verified first.
4840 5018 */
4841 5019 arc_cksum_verify(ab->b_buf);
4842 5020 arc_cksum_compute(ab->b_buf, B_TRUE);
4843 5021
4844 5022 mutex_exit(hash_lock);
4845 5023
4846 5024 write_sz += buf_sz;
4847 5025 }
4848 5026
4849 5027 mutex_exit(list_lock);
4850 5028
4851 5029 if (full == B_TRUE)
4852 5030 break;
4853 5031 }
4854 5032
4855 5033 /* No buffers selected for writing? */
4856 5034 if (pio == NULL) {
4857 5035 ASSERT0(write_sz);
4858 5036 mutex_exit(&l2arc_buflist_mtx);
4859 5037 kmem_cache_free(hdr_cache, head);
4860 5038 return (0);
4861 5039 }
4862 5040
4863 5041 /*
4864 5042 * Now start writing the buffers. We're starting at the write head
4865 5043  * and working backwards, retracing the course of the buffer selector
4866 5044 * loop above.
4867 5045 */
4868 5046 for (ab = list_prev(dev->l2ad_buflist, head); ab;
4869 5047 ab = list_prev(dev->l2ad_buflist, ab)) {
4870 5048 l2arc_buf_hdr_t *l2hdr;
4871 5049 uint64_t buf_sz;
4872 5050
4873 5051 /*
4874 5052 * We shouldn't need to lock the buffer here, since we flagged
4875 5053 * it as ARC_L2_WRITING in the previous step, but we must take
4876 5054 * care to only access its L2 cache parameters. In particular,
4877 5055 * ab->b_buf may be invalid by now due to ARC eviction.
4878 5056 */
4879 5057 l2hdr = ab->b_l2hdr;
4880 5058 l2hdr->b_daddr = dev->l2ad_hand;
4881 5059
4882 5060 if ((ab->b_flags & ARC_L2COMPRESS) &&
4883 5061 l2hdr->b_asize >= buf_compress_minsz) {
4884 5062 if (l2arc_compress_buf(l2hdr)) {
4885 5063 /*
4886 5064 * If compression succeeded, enable headroom
4887 5065 * boost on the next scan cycle.
4888 5066 */
4889 5067 *headroom_boost = B_TRUE;
4890 5068 }
4891 5069 }
4892 5070
4893 5071 /*
4894 5072 * Pick up the buffer data we had previously stashed away
4895 5073 * (and now potentially also compressed).
4896 5074 */
4897 5075 buf_data = l2hdr->b_tmp_cdata;
4898 5076 buf_sz = l2hdr->b_asize;
4899 5077
4900 5078 /* Compression may have squashed the buffer to zero length. */
4901 5079 if (buf_sz != 0) {
4902 5080 uint64_t buf_p_sz;
4903 5081
4904 5082 wzio = zio_write_phys(pio, dev->l2ad_vdev,
4905 5083 dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
4906 5084 NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
4907 5085 ZIO_FLAG_CANFAIL, B_FALSE);
4908 5086
4909 5087 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
4910 5088 zio_t *, wzio);
4911 5089 (void) zio_nowait(wzio);
4912 5090
4913 5091 write_asize += buf_sz;
4914 5092 /*
4915 5093 * Keep the clock hand suitably device-aligned.
4916 5094 */
4917 5095 buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
4918 5096 write_psize += buf_p_sz;
4919 5097 dev->l2ad_hand += buf_p_sz;
4920 5098 }
4921 5099 }
4922 5100
4923 5101 mutex_exit(&l2arc_buflist_mtx);
4924 5102
4925 5103 ASSERT3U(write_asize, <=, target_sz);
4926 5104 ARCSTAT_BUMP(arcstat_l2_writes_sent);
4927 5105 ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
4928 5106 ARCSTAT_INCR(arcstat_l2_size, write_sz);
4929 5107 ARCSTAT_INCR(arcstat_l2_asize, write_asize);
4930 5108 vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0);
4931 5109
4932 5110 /*
4933 5111 * Bump device hand to the device start if it is approaching the end.
4934 5112 * l2arc_evict() will already have evicted ahead for this case.
4935 5113 */
4936 5114 if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
4937 5115 dev->l2ad_hand = dev->l2ad_start;
4938 5116 dev->l2ad_evict = dev->l2ad_start;
4939 5117 dev->l2ad_first = B_FALSE;
4940 5118 }
4941 5119
4942 5120 dev->l2ad_writing = B_TRUE;
4943 5121 (void) zio_wait(pio);
4944 5122 dev->l2ad_writing = B_FALSE;
4945 5123
4946 5124 return (write_asize);
4947 5125 }
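
For orientation, the per-device feed cycle that drives this function looks roughly as follows; this is a condensed sketch of l2arc_feed_thread() below, with declarations omitted:

	size = l2arc_write_size();
	(void) l2arc_evict(dev, size, B_FALSE);			/* clear room ahead */
	wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
	next = l2arc_write_interval(begin, size, wrote);	/* pace the next pass */
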
4948 5126
4949 5127 /*
4950 5128 * Compresses an L2ARC buffer.
4951 5129 * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
4952 5130 * size in l2hdr->b_asize. This routine tries to compress the data and
4953 5131 * depending on the compression result there are three possible outcomes:
4954 5132 * *) The buffer was incompressible. The original l2hdr contents were left
4955 5133 * untouched and are ready for writing to an L2 device.
4956 5134 * *) The buffer was all-zeros, so there is no need to write it to an L2
4957 5135 * device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
4958 5136 * set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
4959 5137 * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
4960 5138 * data buffer which holds the compressed data to be written, and b_asize
4961 5139 * tells us how much data there is. b_compress is set to the appropriate
4962 5140 * compression algorithm. Once writing is done, invoke
4963 5141 * l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
4964 5142 *
4965 5143 * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
4966 5144 * buffer was incompressible).
4967 5145 */
4968 5146 static boolean_t
4969 5147 l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
4970 5148 {
4971 5149 void *cdata;
4972 5150 size_t csize, len, rounded;
4973 5151
4974 5152 ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
4975 5153 ASSERT(l2hdr->b_tmp_cdata != NULL);
4976 5154
4977 5155 len = l2hdr->b_asize;
4978 5156 cdata = zio_data_buf_alloc(len);
4979 5157 csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
4980 5158 cdata, l2hdr->b_asize);
4981 5159
4982 5160 rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE);
4983 5161 if (rounded > csize) {
4984 5162 bzero((char *)cdata + csize, rounded - csize);
4985 5163 csize = rounded;
4986 5164 }
4987 5165
4988 5166 if (csize == 0) {
4989 5167 /* zero block, indicate that there's nothing to write */
4990 5168 zio_data_buf_free(cdata, len);
4991 5169 l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
4992 5170 l2hdr->b_asize = 0;
4993 5171 l2hdr->b_tmp_cdata = NULL;
4994 5172 ARCSTAT_BUMP(arcstat_l2_compress_zeros);
4995 5173 return (B_TRUE);
4996 5174 } else if (csize > 0 && csize < len) {
4997 5175 /*
4998 5176 * Compression succeeded, we'll keep the cdata around for
4999 5177 * writing and release it afterwards.
5000 5178 */
5001 5179 l2hdr->b_compress = ZIO_COMPRESS_LZ4;
5002 5180 l2hdr->b_asize = csize;
5003 5181 l2hdr->b_tmp_cdata = cdata;
5004 5182 ARCSTAT_BUMP(arcstat_l2_compress_successes);
5005 5183 return (B_TRUE);
5006 5184 } else {
5007 5185 /*
5008 5186 * Compression failed, release the compressed buffer.
5009 5187 * l2hdr will be left unmodified.
5010 5188 */
5011 5189 zio_data_buf_free(cdata, len);
5012 5190 ARCSTAT_BUMP(arcstat_l2_compress_failures);
5013 5191 return (B_FALSE);
5014 5192 }
5015 5193 }
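
A hedged sketch of how a caller distinguishes the three outcomes described above: the return value only gates the headroom boost, while the compression state is read back from the header (compare l2arc_write_buffers() and l2arc_release_cdata_buf()):

	if (l2arc_compress_buf(l2hdr))
		*headroom_boost = B_TRUE;	/* compressed or all-zero */

	if (l2hdr->b_compress == ZIO_COMPRESS_EMPTY) {
		/* all-zero buffer: b_asize == 0, nothing to write */
	} else if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) {
		/* write b_tmp_cdata, then call l2arc_release_cdata_buf() */
	} else {
		/* incompressible: l2hdr untouched, write the original data */
	}
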
5016 5194
5017 5195 /*
5018 5196 * Decompresses a zio read back from an l2arc device. On success, the
5019 5197 * underlying zio's io_data buffer is overwritten by the uncompressed
5020 5198 * version. On decompression error (corrupt compressed stream), the
5021 5199 * zio->io_error value is set to signal an I/O error.
5022 5200 *
5023 5201 * Please note that the compressed data stream is not checksummed, so
5024 5202 * if the underlying device is experiencing data corruption, we may feed
5025 5203  * corrupt data to the decompressor; the decompressor therefore needs
5026 5204  * to be able to handle this situation (LZ4 does).
5027 5205 */
5028 5206 static void
5029 5207 l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
5030 5208 {
5031 5209 ASSERT(L2ARC_IS_VALID_COMPRESS(c));
5032 5210
5033 5211 if (zio->io_error != 0) {
5034 5212 /*
5035 5213  * An I/O error has occurred; just restore the original I/O
5036 5214 * size in preparation for a main pool read.
5037 5215 */
5038 5216 zio->io_orig_size = zio->io_size = hdr->b_size;
5039 5217 return;
5040 5218 }
5041 5219
5042 5220 if (c == ZIO_COMPRESS_EMPTY) {
5043 5221 /*
5044 5222 * An empty buffer results in a null zio, which means we
5045 5223 * need to fill its io_data after we're done restoring the
5046 5224 * buffer's contents.
5047 5225 */
5048 5226 ASSERT(hdr->b_buf != NULL);
5049 5227 bzero(hdr->b_buf->b_data, hdr->b_size);
5050 5228 zio->io_data = zio->io_orig_data = hdr->b_buf->b_data;
5051 5229 } else {
5052 5230 ASSERT(zio->io_data != NULL);
5053 5231 /*
5054 5232 * We copy the compressed data from the start of the arc buffer
5055 5233 * (the zio_read will have pulled in only what we need, the
5056 5234 * rest is garbage which we will overwrite at decompression)
5057 5235 * and then decompress back to the ARC data buffer. This way we
5058 5236 * can minimize copying by simply decompressing back over the
5059 5237 * original compressed data (rather than decompressing to an
5060 5238 * aux buffer and then copying back the uncompressed buffer,
5061 5239 * which is likely to be much larger).
5062 5240 */
5063 5241 uint64_t csize;
5064 5242 void *cdata;
5065 5243
5066 5244 csize = zio->io_size;
5067 5245 cdata = zio_data_buf_alloc(csize);
5068 5246 bcopy(zio->io_data, cdata, csize);
5069 5247 if (zio_decompress_data(c, cdata, zio->io_data, csize,
5070 5248 hdr->b_size) != 0)
5071 5249 zio->io_error = EIO;
5072 5250 zio_data_buf_free(cdata, csize);
5073 5251 }
5074 5252
5075 5253 /* Restore the expected uncompressed IO size. */
5076 5254 zio->io_orig_size = zio->io_size = hdr->b_size;
5077 5255 }
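
To tie this back to the read path, the condensed pairing below is drawn from l2arc_read_done() earlier in this section (illustrative only): decompression happens in place before the checksum is validated on the uncompressed data.

	if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
		l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
	ASSERT(zio->io_data != NULL);
	equal = arc_cksum_equal(buf);	/* runs on the uncompressed contents */
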
5078 5256
5079 5257 /*
5080 5258 * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
5081 5259 * This buffer serves as a temporary holder of compressed data while
5082 5260 * the buffer entry is being written to an l2arc device. Once that is
5083 5261 * done, we can dispose of it.
5084 5262 */
5085 5263 static void
5086 5264 l2arc_release_cdata_buf(arc_buf_hdr_t *ab)
5087 5265 {
5088 5266 l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
5089 5267
5090 5268 if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) {
5091 5269 /*
5092 5270 * If the data was compressed, then we've allocated a
5093 5271 * temporary buffer for it, so now we need to release it.
5094 5272 */
5095 5273 ASSERT(l2hdr->b_tmp_cdata != NULL);
5096 5274 zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size);
5097 5275 }
5098 5276 l2hdr->b_tmp_cdata = NULL;
5099 5277 }
5100 5278
5101 5279 /*
5102 5280 * This thread feeds the L2ARC at regular intervals. This is the beating
5103 5281 * heart of the L2ARC.
5104 5282 */
5105 5283 static void
5106 5284 l2arc_feed_thread(void)
5107 5285 {
5108 5286 callb_cpr_t cpr;
5109 5287 l2arc_dev_t *dev;
5110 5288 spa_t *spa;
5111 5289 uint64_t size, wrote;
5112 5290 clock_t begin, next = ddi_get_lbolt();
5113 5291 boolean_t headroom_boost = B_FALSE;
5114 5292
5115 5293 CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
5116 5294
5117 5295 mutex_enter(&l2arc_feed_thr_lock);
5118 5296
5119 5297 while (l2arc_thread_exit == 0) {
5120 5298 CALLB_CPR_SAFE_BEGIN(&cpr);
5121 5299 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
5122 5300 next);
5123 5301 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
5124 5302 next = ddi_get_lbolt() + hz;
5125 5303
5126 5304 /*
5127 5305 * Quick check for L2ARC devices.
5128 5306 */
5129 5307 mutex_enter(&l2arc_dev_mtx);
5130 5308 if (l2arc_ndev == 0) {
5131 5309 mutex_exit(&l2arc_dev_mtx);
5132 5310 continue;
5133 5311 }
5134 5312 mutex_exit(&l2arc_dev_mtx);
5135 5313 begin = ddi_get_lbolt();
5136 5314
5137 5315 /*
5138 5316 * This selects the next l2arc device to write to, and in
5139 5317 * doing so the next spa to feed from: dev->l2ad_spa. This
5140 5318 * will return NULL if there are now no l2arc devices or if
5141 5319 * they are all faulted.
5142 5320 *
5143 5321 * If a device is returned, its spa's config lock is also
5144 5322 * held to prevent device removal. l2arc_dev_get_next()
5145 5323 * will grab and release l2arc_dev_mtx.
5146 5324 */
5147 5325 if ((dev = l2arc_dev_get_next()) == NULL)
5148 5326 continue;
5149 5327
5150 5328 spa = dev->l2ad_spa;
5151 5329 ASSERT(spa != NULL);
5152 5330
5153 5331 /*
5154 5332 * If the pool is read-only then force the feed thread to
5155 5333 * sleep a little longer.
5156 5334 */
5157 5335 if (!spa_writeable(spa)) {
5158 5336 next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
5159 5337 spa_config_exit(spa, SCL_L2ARC, dev);
5160 5338 continue;
5161 5339 }
5162 5340
5163 5341 /*
5164 5342 * Avoid contributing to memory pressure.
5165 5343 */
5166 5344 if (arc_reclaim_needed()) {
5167 5345 ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
5168 5346 spa_config_exit(spa, SCL_L2ARC, dev);
5169 5347 continue;
5170 5348 }
5171 5349
5172 5350 ARCSTAT_BUMP(arcstat_l2_feeds);
5173 5351
5174 5352 size = l2arc_write_size();
5175 5353
5176 5354 /*
5177 5355 * Evict L2ARC buffers that will be overwritten.
5178 5356 * B_FALSE guarantees synchronous eviction.
5179 5357 */
5180 5358 (void) l2arc_evict(dev, size, B_FALSE);
5181 5359
5182 5360 /*
5183 5361 * Write ARC buffers.
5184 5362 */
5185 5363 wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
5186 5364
5187 5365 /*
5188 5366 * Calculate interval between writes.
5189 5367 */
5190 5368 next = l2arc_write_interval(begin, size, wrote);
5191 5369 spa_config_exit(spa, SCL_L2ARC, dev);
5192 5370 }
5193 5371
5194 5372 l2arc_thread_exit = 0;
5195 5373 cv_broadcast(&l2arc_feed_thr_cv);
5196 5374 CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */
5197 5375 thread_exit();
5198 5376 }
5199 5377
5200 5378 boolean_t
5201 5379 l2arc_vdev_present(vdev_t *vd)
5202 5380 {
5203 5381 l2arc_dev_t *dev;
5204 5382
5205 5383 mutex_enter(&l2arc_dev_mtx);
5206 5384 for (dev = list_head(l2arc_dev_list); dev != NULL;
5207 5385 dev = list_next(l2arc_dev_list, dev)) {
5208 5386 if (dev->l2ad_vdev == vd)
5209 5387 break;
5210 5388 }
5211 5389 mutex_exit(&l2arc_dev_mtx);
5212 5390
5213 5391 return (dev != NULL);
5214 5392 }
5215 5393
5216 5394 /*
5217 5395 * Add a vdev for use by the L2ARC. By this point the spa has already
5218 5396 * validated the vdev and opened it.
5219 5397 */
5220 5398 void
5221 5399 l2arc_add_vdev(spa_t *spa, vdev_t *vd)
5222 5400 {
5223 5401 l2arc_dev_t *adddev;
5224 5402
5225 5403 ASSERT(!l2arc_vdev_present(vd));
5226 5404
5227 5405 /*
5228 5406 * Create a new l2arc device entry.
5229 5407 */
5230 5408 adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
5231 5409 adddev->l2ad_spa = spa;
5232 5410 adddev->l2ad_vdev = vd;
5233 5411 adddev->l2ad_start = VDEV_LABEL_START_SIZE;
5234 5412 adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
5235 5413 adddev->l2ad_hand = adddev->l2ad_start;
5236 5414 adddev->l2ad_evict = adddev->l2ad_start;
5237 5415 adddev->l2ad_first = B_TRUE;
5238 5416 adddev->l2ad_writing = B_FALSE;
5239 5417
5240 5418 /*
5241 5419 * This is a list of all ARC buffers that are still valid on the
5242 5420 * device.
5243 5421 */
5244 5422 adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
5245 5423 list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
5246 5424 offsetof(arc_buf_hdr_t, b_l2node));
5247 5425
5248 5426 vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
5249 5427
5250 5428 /*
5251 5429 * Add device to global list
5252 5430 */
5253 5431 mutex_enter(&l2arc_dev_mtx);
5254 5432 list_insert_head(l2arc_dev_list, adddev);
5255 5433 atomic_inc_64(&l2arc_ndev);
5256 5434 mutex_exit(&l2arc_dev_mtx);
5257 5435 }
5258 5436
5259 5437 /*
5260 5438 * Remove a vdev from the L2ARC.
5261 5439 */
5262 5440 void
5263 5441 l2arc_remove_vdev(vdev_t *vd)
5264 5442 {
5265 5443 l2arc_dev_t *dev, *nextdev, *remdev = NULL;
5266 5444
5267 5445 /*
5268 5446 * Find the device by vdev
5269 5447 */
5270 5448 mutex_enter(&l2arc_dev_mtx);
5271 5449 for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
5272 5450 nextdev = list_next(l2arc_dev_list, dev);
5273 5451 if (vd == dev->l2ad_vdev) {
5274 5452 remdev = dev;
5275 5453 break;
5276 5454 }
5277 5455 }
5278 5456 ASSERT(remdev != NULL);
5279 5457
5280 5458 /*
5281 5459 * Remove device from global list
5282 5460 */
5283 5461 list_remove(l2arc_dev_list, remdev);
5284 5462 l2arc_dev_last = NULL; /* may have been invalidated */
5285 5463 atomic_dec_64(&l2arc_ndev);
5286 5464 mutex_exit(&l2arc_dev_mtx);
5287 5465
5288 5466 /*
5289 5467 * Clear all buflists and ARC references. L2ARC device flush.
5290 5468 */
5291 5469 if (l2arc_evict(remdev, 0, B_TRUE) == B_FALSE) {
5292 5470 /*
5293 5471  * The eviction was done synchronously; clean up here.
5294 5472  * Otherwise, the asynchronous task will clean up.
5295 5473 */
5296 5474 list_destroy(remdev->l2ad_buflist);
5297 5475 kmem_free(remdev->l2ad_buflist, sizeof (list_t));
5298 5476 kmem_free(remdev, sizeof (l2arc_dev_t));
5299 5477 }
5300 5478 }
5301 5479
5302 5480 void
5303 5481 l2arc_init(void)
5304 5482 {
5305 5483 l2arc_thread_exit = 0;
5306 5484 l2arc_ndev = 0;
5307 5485 l2arc_writes_sent = 0;
5308 5486 l2arc_writes_done = 0;
5309 5487
5310 5488 mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
5311 5489 cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
5312 5490 mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
5313 5491 mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
5314 5492 mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
5315 5493
5316 5494 l2arc_dev_list = &L2ARC_dev_list;
5317 5495 l2arc_free_on_write = &L2ARC_free_on_write;
5318 5496 list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
5319 5497 offsetof(l2arc_dev_t, l2ad_node));
5320 5498 list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
5321 5499 offsetof(l2arc_data_free_t, l2df_list_node));
5322 5500 }
5323 5501
5324 5502 void
5325 5503 l2arc_fini(void)
5326 5504 {
5327 5505 /*
5328 5506  * This is called from dmu_fini(), which is called from spa_fini().
5329 5507 * Because of this, we can assume that all l2arc devices have
5330 5508 * already been removed when the pools themselves were removed.
5331 5509 */
5332 5510
5333 5511 l2arc_do_free_on_write();
5334 5512
5335 5513 mutex_destroy(&l2arc_feed_thr_lock);
5336 5514 cv_destroy(&l2arc_feed_thr_cv);
5337 5515 mutex_destroy(&l2arc_dev_mtx);
5338 5516 mutex_destroy(&l2arc_buflist_mtx);
5339 5517 mutex_destroy(&l2arc_free_on_write_mtx);
5340 5518
5341 5519 list_destroy(l2arc_dev_list);
5342 5520 list_destroy(l2arc_free_on_write);
5343 5521 }
5344 5522
5345 5523 void
5346 5524 l2arc_start(void)
5347 5525 {
5348 5526 if (!(spa_mode_global & FWRITE))
5349 5527 return;
5350 5528
5351 5529 (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
5352 5530 TS_RUN, minclsyspri);
5353 5531 }
5354 5532
5355 5533 void
5356 5534 l2arc_stop(void)
5357 5535 {
5358 5536 if (!(spa_mode_global & FWRITE))
5359 5537 return;
5360 5538
5361 5539 mutex_enter(&l2arc_feed_thr_lock);
5362 5540 cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */
5363 5541 l2arc_thread_exit = 1;
5364 5542 while (l2arc_thread_exit != 0)
5365 5543 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
5366 5544 mutex_exit(&l2arc_feed_thr_lock);
5367 5545 }