1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 24 * Copyright (c) 2013 by Delphix. All rights reserved. 25 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 26 */ 27 28 /* 29 * DVA-based Adjustable Replacement Cache 30 * 31 * While much of the theory of operation used here is 32 * based on the self-tuning, low overhead replacement cache 33 * presented by Megiddo and Modha at FAST 2003, there are some 34 * significant differences: 35 * 36 * 1. The Megiddo and Modha model assumes any page is evictable. 37 * Pages in its cache cannot be "locked" into memory. This makes 38 * the eviction algorithm simple: evict the last page in the list. 39 * This also makes the performance characteristics easy to reason 40 * about. Our cache is not so simple. At any given moment, some 41 * subset of the blocks in the cache are un-evictable because we 42 * have handed out a reference to them. Blocks are only evictable 43 * when there are no external references active. This makes 44 * eviction far more problematic: we choose to evict the evictable 45 * blocks that are the "lowest" in the list. 46 * 47 * There are times when it is not possible to evict the requested 48 * space. In these circumstances we are unable to adjust the cache 49 * size. To prevent the cache growing unbounded at these times we 50 * implement a "cache throttle" that slows the flow of new data 51 * into the cache until we can make space available. 52 * 53 * 2. The Megiddo and Modha model assumes a fixed cache size. 54 * Pages are evicted when the cache is full and there is a cache 55 * miss. Our model has a variable sized cache. It grows with 56 * high use, but also tries to react to memory pressure from the 57 * operating system: decreasing its size when system memory is 58 * tight. 59 * 60 * 3. The Megiddo and Modha model assumes a fixed page size. All 61 * elements of the cache are therefore exactly the same size. So 62 * when adjusting the cache size following a cache miss, it's simply 63 * a matter of choosing a single page to evict. In our model, we 64 * have variable sized cache blocks (ranging from 512 bytes to 65 * 128K bytes). We therefore choose a set of blocks to evict to make 66 * space for a cache miss that approximates as closely as possible 67 * the space used by the new block. 68 * 69 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache" 70 * by N. Megiddo & D.
Modha, FAST 2003 71 */ 72 73 /* 74 * The locking model: 75 * 76 * A new reference to a cache buffer can be obtained in two 77 * ways: 1) via a hash table lookup using the DVA as a key, 78 * or 2) via one of the ARC lists. The arc_read() interface 79 * uses method 1, while the internal arc algorithms for 80 * adjusting the cache use method 2. We therefore provide two 81 * types of locks: 1) the hash table lock array, and 2) the 82 * arc list locks. 83 * 84 * Buffers do not have their own mutexes, rather they rely on the 85 * hash table mutexes for the bulk of their protection (i.e. most 86 * fields in the arc_buf_hdr_t are protected by these mutexes). 87 * 88 * buf_hash_find() returns the appropriate mutex (held) when it 89 * locates the requested buffer in the hash table. It returns 90 * NULL for the mutex if the buffer was not in the table. 91 * 92 * buf_hash_remove() expects the appropriate hash mutex to be 93 * already held before it is invoked. 94 * 95 * Each arc state also has a mutex which is used to protect the 96 * buffer list associated with the state. When attempting to 97 * obtain a hash table lock while holding an arc list lock you 98 * must use: mutex_tryenter() to avoid deadlock. Also note that 99 * the active state mutex must be held before the ghost state mutex. 100 * 101 * Arc buffers may have an associated eviction callback function. 102 * This function will be invoked prior to removing the buffer (e.g. 103 * in arc_do_user_evicts()). Note however that the data associated 104 * with the buffer may be evicted prior to the callback. The callback 105 * must be made with *no locks held* (to prevent deadlock). Additionally, 106 * the users of callbacks must ensure that their private data is 107 * protected from simultaneous callbacks from arc_buf_evict() 108 * and arc_do_user_evicts(). 109 * 110 * Note that the majority of the performance stats are manipulated 111 * with atomic operations. 112 * 113 * The L2ARC uses the l2arc_buflist_mtx global mutex for the following: 114 * 115 * - L2ARC buflist creation 116 * - L2ARC buflist eviction 117 * - L2ARC write completion, which walks L2ARC buflists 118 * - ARC header destruction, as it removes from L2ARC buflists 119 * - ARC header release, as it removes from L2ARC buflists 120 */ 121 122 #include <sys/spa.h> 123 #include <sys/zio.h> 124 #include <sys/zio_compress.h> 125 #include <sys/zfs_context.h> 126 #include <sys/arc.h> 127 #include <sys/refcount.h> 128 #include <sys/vdev.h> 129 #include <sys/vdev_impl.h> 130 #include <sys/dsl_pool.h> 131 #ifdef _KERNEL 132 #include <sys/vmsystm.h> 133 #include <vm/anon.h> 134 #include <sys/fs/swapnode.h> 135 #include <sys/dnlc.h> 136 #endif 137 #include <sys/callb.h> 138 #include <sys/kstat.h> 139 #include <zfs_fletcher.h> 140 141 #ifndef _KERNEL 142 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ 143 boolean_t arc_watch = B_FALSE; 144 int arc_procfd; 145 #endif 146 147 static kmutex_t arc_reclaim_thr_lock; 148 static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */ 149 static uint8_t arc_thread_exit; 150 151 #define ARC_REDUCE_DNLC_PERCENT 3 152 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT; 153 154 typedef enum arc_reclaim_strategy { 155 ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */ 156 ARC_RECLAIM_CONS /* Conservative reclaim strategy */ 157 } arc_reclaim_strategy_t; 158 159 /* 160 * The number of iterations through arc_evict_*() before we 161 * drop & reacquire the lock. 
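 * (arc_evict() and arc_evict_ghost() below implement this by leaving a
 * marker header in the list, dropping the state lock after this many
 * headers, letting other threads run, and then resuming the scan from
 * the marker.)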
162 */ 163 int arc_evict_iterations = 100; 164 165 /* number of seconds before growing cache again */ 166 static int arc_grow_retry = 60; 167 168 /* shift of arc_c for calculating both min and max arc_p */ 169 static int arc_p_min_shift = 4; 170 171 /* log2(fraction of arc to reclaim) */ 172 static int arc_shrink_shift = 5; 173 174 /* 175 * minimum lifespan of a prefetch block in clock ticks 176 * (initialized in arc_init()) 177 */ 178 static int arc_min_prefetch_lifespan; 179 180 /* 181 * If this percent of memory is free, don't throttle. 182 */ 183 int arc_lotsfree_percent = 10; 184 185 static int arc_dead; 186 187 /* 188 * The arc has filled available memory and has now warmed up. 189 */ 190 static boolean_t arc_warm; 191 192 /* 193 * These tunables are for performance analysis. 194 */ 195 uint64_t zfs_arc_max; 196 uint64_t zfs_arc_min; 197 uint64_t zfs_arc_meta_limit = 0; 198 int zfs_arc_grow_retry = 0; 199 int zfs_arc_shrink_shift = 0; 200 int zfs_arc_p_min_shift = 0; 201 int zfs_disable_dup_eviction = 0; 202 203 /* 204 * Note that buffers can be in one of 6 states: 205 * ARC_anon - anonymous (discussed below) 206 * ARC_mru - recently used, currently cached 207 * ARC_mru_ghost - recently used, no longer in cache 208 * ARC_mfu - frequently used, currently cached 209 * ARC_mfu_ghost - frequently used, no longer in cache 210 * ARC_l2c_only - exists in L2ARC but not other states 211 * When there are no active references to a buffer, it is 212 * linked onto a list in one of these arc states. These are 213 * the only buffers that can be evicted or deleted. Within each 214 * state there are multiple lists, one for meta-data and one for 215 * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, 216 * etc.) is tracked separately so that it can be managed more 217 * explicitly: favored over data, limited explicitly. 218 * 219 * Anonymous buffers are buffers that are not associated with 220 * a DVA. These are buffers that hold dirty block copies 221 * before they are written to stable storage. By definition, 222 * they are "ref'd" and are considered part of arc_mru 223 * that cannot be freed. Generally, they will acquire a DVA 224 * as they are written and migrate onto the arc_mru list. 225 * 226 * The ARC_l2c_only state is for buffers that are in the second 227 * level ARC but no longer in any of the ARC_m* lists. The second 228 * level ARC itself may also contain buffers that are in any of 229 * the ARC_m* states - meaning that a buffer can exist in two 230 * places. The reason for the ARC_l2c_only state is to keep the 231 * buffer header in the hash table, so that reads that hit the 232 * second level ARC benefit from these fast lookups.
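 *
 * An illustrative (not exhaustive) life cycle for a block that is read,
 * evicted and later re-read might be:
 *
 *	arc_anon -> arc_mru -> arc_mru_ghost -> arc_mfu -> arc_mfu_ghost
 *
 * with arc_l2c_only entered only once the data survives solely on an
 * L2ARC device.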
233 */ 234 235 typedef struct arc_state { 236 list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */ 237 uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */ 238 uint64_t arcs_size; /* total amount of data in this state */ 239 kmutex_t arcs_mtx; 240 } arc_state_t; 241 242 /* The 6 states: */ 243 static arc_state_t ARC_anon; 244 static arc_state_t ARC_mru; 245 static arc_state_t ARC_mru_ghost; 246 static arc_state_t ARC_mfu; 247 static arc_state_t ARC_mfu_ghost; 248 static arc_state_t ARC_l2c_only; 249 250 typedef struct arc_stats { 251 kstat_named_t arcstat_hits; 252 kstat_named_t arcstat_misses; 253 kstat_named_t arcstat_demand_data_hits; 254 kstat_named_t arcstat_demand_data_misses; 255 kstat_named_t arcstat_demand_metadata_hits; 256 kstat_named_t arcstat_demand_metadata_misses; 257 kstat_named_t arcstat_prefetch_data_hits; 258 kstat_named_t arcstat_prefetch_data_misses; 259 kstat_named_t arcstat_prefetch_metadata_hits; 260 kstat_named_t arcstat_prefetch_metadata_misses; 261 kstat_named_t arcstat_mru_hits; 262 kstat_named_t arcstat_mru_ghost_hits; 263 kstat_named_t arcstat_mfu_hits; 264 kstat_named_t arcstat_mfu_ghost_hits; 265 kstat_named_t arcstat_deleted; 266 kstat_named_t arcstat_recycle_miss; 267 /* 268 * Number of buffers that could not be evicted because the hash lock 269 * was held by another thread. The lock may not necessarily be held 270 * by something using the same buffer, since hash locks are shared 271 * by multiple buffers. 272 */ 273 kstat_named_t arcstat_mutex_miss; 274 /* 275 * Number of buffers skipped because they have I/O in progress, are 276 * indrect prefetch buffers that have not lived long enough, or are 277 * not from the spa we're trying to evict from. 278 */ 279 kstat_named_t arcstat_evict_skip; 280 kstat_named_t arcstat_evict_l2_cached; 281 kstat_named_t arcstat_evict_l2_eligible; 282 kstat_named_t arcstat_evict_l2_ineligible; 283 kstat_named_t arcstat_hash_elements; 284 kstat_named_t arcstat_hash_elements_max; 285 kstat_named_t arcstat_hash_collisions; 286 kstat_named_t arcstat_hash_chains; 287 kstat_named_t arcstat_hash_chain_max; 288 kstat_named_t arcstat_p; 289 kstat_named_t arcstat_c; 290 kstat_named_t arcstat_c_min; 291 kstat_named_t arcstat_c_max; 292 kstat_named_t arcstat_size; 293 kstat_named_t arcstat_hdr_size; 294 kstat_named_t arcstat_data_size; 295 kstat_named_t arcstat_other_size; 296 kstat_named_t arcstat_l2_hits; 297 kstat_named_t arcstat_l2_misses; 298 kstat_named_t arcstat_l2_feeds; 299 kstat_named_t arcstat_l2_rw_clash; 300 kstat_named_t arcstat_l2_read_bytes; 301 kstat_named_t arcstat_l2_write_bytes; 302 kstat_named_t arcstat_l2_writes_sent; 303 kstat_named_t arcstat_l2_writes_done; 304 kstat_named_t arcstat_l2_writes_error; 305 kstat_named_t arcstat_l2_writes_hdr_miss; 306 kstat_named_t arcstat_l2_evict_lock_retry; 307 kstat_named_t arcstat_l2_evict_reading; 308 kstat_named_t arcstat_l2_free_on_write; 309 kstat_named_t arcstat_l2_abort_lowmem; 310 kstat_named_t arcstat_l2_cksum_bad; 311 kstat_named_t arcstat_l2_io_error; 312 kstat_named_t arcstat_l2_size; 313 kstat_named_t arcstat_l2_asize; 314 kstat_named_t arcstat_l2_hdr_size; 315 kstat_named_t arcstat_l2_compress_successes; 316 kstat_named_t arcstat_l2_compress_zeros; 317 kstat_named_t arcstat_l2_compress_failures; 318 kstat_named_t arcstat_memory_throttle_count; 319 kstat_named_t arcstat_duplicate_buffers; 320 kstat_named_t arcstat_duplicate_buffers_size; 321 kstat_named_t arcstat_duplicate_reads; 322 kstat_named_t arcstat_meta_used; 323 kstat_named_t 
arcstat_meta_limit; 324 kstat_named_t arcstat_meta_max; 325 } arc_stats_t; 326 327 static arc_stats_t arc_stats = { 328 { "hits", KSTAT_DATA_UINT64 }, 329 { "misses", KSTAT_DATA_UINT64 }, 330 { "demand_data_hits", KSTAT_DATA_UINT64 }, 331 { "demand_data_misses", KSTAT_DATA_UINT64 }, 332 { "demand_metadata_hits", KSTAT_DATA_UINT64 }, 333 { "demand_metadata_misses", KSTAT_DATA_UINT64 }, 334 { "prefetch_data_hits", KSTAT_DATA_UINT64 }, 335 { "prefetch_data_misses", KSTAT_DATA_UINT64 }, 336 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, 337 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, 338 { "mru_hits", KSTAT_DATA_UINT64 }, 339 { "mru_ghost_hits", KSTAT_DATA_UINT64 }, 340 { "mfu_hits", KSTAT_DATA_UINT64 }, 341 { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, 342 { "deleted", KSTAT_DATA_UINT64 }, 343 { "recycle_miss", KSTAT_DATA_UINT64 }, 344 { "mutex_miss", KSTAT_DATA_UINT64 }, 345 { "evict_skip", KSTAT_DATA_UINT64 }, 346 { "evict_l2_cached", KSTAT_DATA_UINT64 }, 347 { "evict_l2_eligible", KSTAT_DATA_UINT64 }, 348 { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, 349 { "hash_elements", KSTAT_DATA_UINT64 }, 350 { "hash_elements_max", KSTAT_DATA_UINT64 }, 351 { "hash_collisions", KSTAT_DATA_UINT64 }, 352 { "hash_chains", KSTAT_DATA_UINT64 }, 353 { "hash_chain_max", KSTAT_DATA_UINT64 }, 354 { "p", KSTAT_DATA_UINT64 }, 355 { "c", KSTAT_DATA_UINT64 }, 356 { "c_min", KSTAT_DATA_UINT64 }, 357 { "c_max", KSTAT_DATA_UINT64 }, 358 { "size", KSTAT_DATA_UINT64 }, 359 { "hdr_size", KSTAT_DATA_UINT64 }, 360 { "data_size", KSTAT_DATA_UINT64 }, 361 { "other_size", KSTAT_DATA_UINT64 }, 362 { "l2_hits", KSTAT_DATA_UINT64 }, 363 { "l2_misses", KSTAT_DATA_UINT64 }, 364 { "l2_feeds", KSTAT_DATA_UINT64 }, 365 { "l2_rw_clash", KSTAT_DATA_UINT64 }, 366 { "l2_read_bytes", KSTAT_DATA_UINT64 }, 367 { "l2_write_bytes", KSTAT_DATA_UINT64 }, 368 { "l2_writes_sent", KSTAT_DATA_UINT64 }, 369 { "l2_writes_done", KSTAT_DATA_UINT64 }, 370 { "l2_writes_error", KSTAT_DATA_UINT64 }, 371 { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 }, 372 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, 373 { "l2_evict_reading", KSTAT_DATA_UINT64 }, 374 { "l2_free_on_write", KSTAT_DATA_UINT64 }, 375 { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, 376 { "l2_cksum_bad", KSTAT_DATA_UINT64 }, 377 { "l2_io_error", KSTAT_DATA_UINT64 }, 378 { "l2_size", KSTAT_DATA_UINT64 }, 379 { "l2_asize", KSTAT_DATA_UINT64 }, 380 { "l2_hdr_size", KSTAT_DATA_UINT64 }, 381 { "l2_compress_successes", KSTAT_DATA_UINT64 }, 382 { "l2_compress_zeros", KSTAT_DATA_UINT64 }, 383 { "l2_compress_failures", KSTAT_DATA_UINT64 }, 384 { "memory_throttle_count", KSTAT_DATA_UINT64 }, 385 { "duplicate_buffers", KSTAT_DATA_UINT64 }, 386 { "duplicate_buffers_size", KSTAT_DATA_UINT64 }, 387 { "duplicate_reads", KSTAT_DATA_UINT64 }, 388 { "arc_meta_used", KSTAT_DATA_UINT64 }, 389 { "arc_meta_limit", KSTAT_DATA_UINT64 }, 390 { "arc_meta_max", KSTAT_DATA_UINT64 } 391 }; 392 393 #define ARCSTAT(stat) (arc_stats.stat.value.ui64) 394 395 #define ARCSTAT_INCR(stat, val) \ 396 atomic_add_64(&arc_stats.stat.value.ui64, (val)) 397 398 #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) 399 #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) 400 401 #define ARCSTAT_MAX(stat, val) { \ 402 uint64_t m; \ 403 while ((val) > (m = arc_stats.stat.value.ui64) && \ 404 (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ 405 continue; \ 406 } 407 408 #define ARCSTAT_MAXSTAT(stat) \ 409 ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) 410 411 /* 412 * We define a macro to allow ARC hits/misses to be easily broken down 
by 413 * two separate conditions, giving a total of four different subtypes for 414 * each of hits and misses (so eight statistics total). 415 */ 416 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ 417 if (cond1) { \ 418 if (cond2) { \ 419 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ 420 } else { \ 421 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ 422 } \ 423 } else { \ 424 if (cond2) { \ 425 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ 426 } else { \ 427 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ 428 } \ 429 } 430 431 kstat_t *arc_ksp; 432 static arc_state_t *arc_anon; 433 static arc_state_t *arc_mru; 434 static arc_state_t *arc_mru_ghost; 435 static arc_state_t *arc_mfu; 436 static arc_state_t *arc_mfu_ghost; 437 static arc_state_t *arc_l2c_only; 438 439 /* 440 * There are several ARC variables that are critical to export as kstats -- 441 * but we don't want to have to grovel around in the kstat whenever we wish to 442 * manipulate them. For these variables, we therefore define them to be in 443 * terms of the statistic variable. This assures that we are not introducing 444 * the possibility of inconsistency by having shadow copies of the variables, 445 * while still allowing the code to be readable. 446 */ 447 #define arc_size ARCSTAT(arcstat_size) /* actual total arc size */ 448 #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ 449 #define arc_c ARCSTAT(arcstat_c) /* target size of cache */ 450 #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ 451 #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ 452 #define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ 453 #define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */ 454 #define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ 455 456 #define L2ARC_IS_VALID_COMPRESS(_c_) \ 457 ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY) 458 459 static int arc_no_grow; /* Don't try to grow cache size */ 460 static uint64_t arc_tempreserve; 461 static uint64_t arc_loaned_bytes; 462 463 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t; 464 465 typedef struct arc_callback arc_callback_t; 466 467 struct arc_callback { 468 void *acb_private; 469 arc_done_func_t *acb_done; 470 arc_buf_t *acb_buf; 471 zio_t *acb_zio_dummy; 472 arc_callback_t *acb_next; 473 }; 474 475 typedef struct arc_write_callback arc_write_callback_t; 476 477 struct arc_write_callback { 478 void *awcb_private; 479 arc_done_func_t *awcb_ready; 480 arc_done_func_t *awcb_physdone; 481 arc_done_func_t *awcb_done; 482 arc_buf_t *awcb_buf; 483 }; 484 485 struct arc_buf_hdr { 486 /* protected by hash lock */ 487 dva_t b_dva; 488 uint64_t b_birth; 489 uint64_t b_cksum0; 490 491 kmutex_t b_freeze_lock; 492 zio_cksum_t *b_freeze_cksum; 493 void *b_thawed; 494 495 arc_buf_hdr_t *b_hash_next; 496 arc_buf_t *b_buf; 497 uint32_t b_flags; 498 uint32_t b_datacnt; 499 500 arc_callback_t *b_acb; 501 kcondvar_t b_cv; 502 503 /* immutable */ 504 arc_buf_contents_t b_type; 505 uint64_t b_size; 506 uint64_t b_spa; 507 508 /* protected by arc state mutex */ 509 arc_state_t *b_state; 510 list_node_t b_arc_node; 511 512 /* updated atomically */ 513 clock_t b_arc_access; 514 515 /* self protecting */ 516 refcount_t b_refcnt; 517 518 l2arc_buf_hdr_t *b_l2hdr; 519 list_node_t b_l2node; 520 }; 521 522 static arc_buf_t *arc_eviction_list; 523 static kmutex_t arc_eviction_mtx; 524 static arc_buf_hdr_t arc_eviction_hdr; 525 static void 
arc_get_data_buf(arc_buf_t *buf); 526 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); 527 static int arc_evict_needed(arc_buf_contents_t type); 528 static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes); 529 static void arc_buf_watch(arc_buf_t *buf); 530 531 static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab); 532 533 #define GHOST_STATE(state) \ 534 ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ 535 (state) == arc_l2c_only) 536 537 /* 538 * Private ARC flags. These flags are private ARC only flags that will show up 539 * in b_flags in the arc_hdr_buf_t. Some flags are publicly declared, and can 540 * be passed in as arc_flags in things like arc_read. However, these flags 541 * should never be passed and should only be set by ARC code. When adding new 542 * public flags, make sure not to smash the private ones. 543 */ 544 545 #define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */ 546 #define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */ 547 #define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */ 548 #define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ 549 #define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */ 550 #define ARC_INDIRECT (1 << 14) /* this is an indirect block */ 551 #define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */ 552 #define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */ 553 #define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */ 554 #define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */ 555 556 #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) 557 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) 558 #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) 559 #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH) 560 #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) 561 #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) 562 #define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS) 563 #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE) 564 #define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \ 565 (hdr)->b_l2hdr != NULL) 566 #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING) 567 #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED) 568 #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD) 569 570 /* 571 * Other sizes 572 */ 573 574 #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) 575 #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t)) 576 577 /* 578 * Hash table routines 579 */ 580 581 #define HT_LOCK_PAD 64 582 583 struct ht_lock { 584 kmutex_t ht_lock; 585 #ifdef _KERNEL 586 unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; 587 #endif 588 }; 589 590 #define BUF_LOCKS 256 591 typedef struct buf_hash_table { 592 uint64_t ht_mask; 593 arc_buf_hdr_t **ht_table; 594 struct ht_lock ht_locks[BUF_LOCKS]; 595 } buf_hash_table_t; 596 597 static buf_hash_table_t buf_hash_table; 598 599 #define BUF_HASH_INDEX(spa, dva, birth) \ 600 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) 601 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) 602 #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) 603 #define HDR_LOCK(hdr) \ 604 (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) 605 606 uint64_t zfs_crc64_table[256]; 607 608 /* 609 * Level 2 ARC 610 */ 611 612 #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ 
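/*
 * (This is the default for the l2arc_write_max tunable below; while the ARC
 * is still warming up, l2arc_write_boost may be applied on top of it (see
 * the "extra write during warmup" tunable further down).)
 */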
613 #define L2ARC_HEADROOM 2 /* num of writes */ 614 /* 615 * If we discover during ARC scan any buffers to be compressed, we boost 616 * our headroom for the next scanning cycle by this percentage multiple. 617 */ 618 #define L2ARC_HEADROOM_BOOST 200 619 #define L2ARC_FEED_SECS 1 /* caching interval secs */ 620 #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ 621 622 #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) 623 #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) 624 625 /* L2ARC Performance Tunables */ 626 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ 627 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ 628 uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ 629 uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; 630 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ 631 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */ 632 boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ 633 boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */ 634 boolean_t l2arc_norw = B_TRUE; /* no reads during writes */ 635 636 /* 637 * L2ARC Internals 638 */ 639 typedef struct l2arc_dev { 640 vdev_t *l2ad_vdev; /* vdev */ 641 spa_t *l2ad_spa; /* spa */ 642 uint64_t l2ad_hand; /* next write location */ 643 uint64_t l2ad_start; /* first addr on device */ 644 uint64_t l2ad_end; /* last addr on device */ 645 uint64_t l2ad_evict; /* last addr eviction reached */ 646 boolean_t l2ad_first; /* first sweep through */ 647 boolean_t l2ad_writing; /* currently writing */ 648 list_t *l2ad_buflist; /* buffer list */ 649 list_node_t l2ad_node; /* device list node */ 650 } l2arc_dev_t; 651 652 static list_t L2ARC_dev_list; /* device list */ 653 static list_t *l2arc_dev_list; /* device list pointer */ 654 static kmutex_t l2arc_dev_mtx; /* device list mutex */ 655 static l2arc_dev_t *l2arc_dev_last; /* last device used */ 656 static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */ 657 static list_t L2ARC_free_on_write; /* free after write buf list */ 658 static list_t *l2arc_free_on_write; /* free after write list ptr */ 659 static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ 660 static uint64_t l2arc_ndev; /* number of devices */ 661 662 typedef struct l2arc_read_callback { 663 arc_buf_t *l2rcb_buf; /* read buffer */ 664 spa_t *l2rcb_spa; /* spa */ 665 blkptr_t l2rcb_bp; /* original blkptr */ 666 zbookmark_t l2rcb_zb; /* original bookmark */ 667 int l2rcb_flags; /* original flags */ 668 enum zio_compress l2rcb_compress; /* applied compress */ 669 } l2arc_read_callback_t; 670 671 typedef struct l2arc_write_callback { 672 l2arc_dev_t *l2wcb_dev; /* device info */ 673 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ 674 } l2arc_write_callback_t; 675 676 struct l2arc_buf_hdr { 677 /* protected by arc_buf_hdr mutex */ 678 l2arc_dev_t *b_dev; /* L2ARC device */ 679 uint64_t b_daddr; /* disk address, offset byte */ 680 /* compression applied to buffer data */ 681 enum zio_compress b_compress; 682 /* real alloc'd buffer size depending on b_compress applied */ 683 int b_asize; 684 /* temporary buffer holder for in-flight compressed data */ 685 void *b_tmp_cdata; 686 }; 687 688 typedef struct l2arc_data_free { 689 /* protected by l2arc_free_on_write_mtx */ 690 void *l2df_data; 691 size_t l2df_size; 692 void (*l2df_func)(void *, size_t); 693 list_node_t l2df_list_node; 694 } l2arc_data_free_t; 695 696 static kmutex_t l2arc_feed_thr_lock; 697 
static kcondvar_t l2arc_feed_thr_cv; 698 static uint8_t l2arc_thread_exit; 699 700 static void l2arc_read_done(zio_t *zio); 701 static void l2arc_hdr_stat_add(void); 702 static void l2arc_hdr_stat_remove(void); 703 704 static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr); 705 static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, 706 enum zio_compress c); 707 static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab); 708 709 static uint64_t 710 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) 711 { 712 uint8_t *vdva = (uint8_t *)dva; 713 uint64_t crc = -1ULL; 714 int i; 715 716 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 717 718 for (i = 0; i < sizeof (dva_t); i++) 719 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; 720 721 crc ^= (spa>>8) ^ birth; 722 723 return (crc); 724 } 725 726 #define BUF_EMPTY(buf) \ 727 ((buf)->b_dva.dva_word[0] == 0 && \ 728 (buf)->b_dva.dva_word[1] == 0 && \ 729 (buf)->b_birth == 0) 730 731 #define BUF_EQUAL(spa, dva, birth, buf) \ 732 ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ 733 ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ 734 ((buf)->b_birth == birth) && ((buf)->b_spa == spa) 735 736 static void 737 buf_discard_identity(arc_buf_hdr_t *hdr) 738 { 739 hdr->b_dva.dva_word[0] = 0; 740 hdr->b_dva.dva_word[1] = 0; 741 hdr->b_birth = 0; 742 hdr->b_cksum0 = 0; 743 } 744 745 static arc_buf_hdr_t * 746 buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp) 747 { 748 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); 749 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 750 arc_buf_hdr_t *buf; 751 752 mutex_enter(hash_lock); 753 for (buf = buf_hash_table.ht_table[idx]; buf != NULL; 754 buf = buf->b_hash_next) { 755 if (BUF_EQUAL(spa, dva, birth, buf)) { 756 *lockp = hash_lock; 757 return (buf); 758 } 759 } 760 mutex_exit(hash_lock); 761 *lockp = NULL; 762 return (NULL); 763 } 764 765 /* 766 * Insert an entry into the hash table. If there is already an element 767 * equal to elem in the hash table, then the already existing element 768 * will be returned and the new element will not be inserted. 769 * Otherwise returns NULL. 
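 *
 * A sketch of the expected caller pattern (illustrative only):
 *
 *	kmutex_t *hash_lock;
 *	arc_buf_hdr_t *exists = buf_hash_insert(hdr, &hash_lock);
 *	if (exists != NULL)
 *		(an equal header is already cached; use it instead of hdr)
 *	...
 *	mutex_exit(hash_lock);
 *
 * Note that *lockp is returned held in either case.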
770 */ 771 static arc_buf_hdr_t * 772 buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp) 773 { 774 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 775 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 776 arc_buf_hdr_t *fbuf; 777 uint32_t i; 778 779 ASSERT(!HDR_IN_HASH_TABLE(buf)); 780 *lockp = hash_lock; 781 mutex_enter(hash_lock); 782 for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL; 783 fbuf = fbuf->b_hash_next, i++) { 784 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf)) 785 return (fbuf); 786 } 787 788 buf->b_hash_next = buf_hash_table.ht_table[idx]; 789 buf_hash_table.ht_table[idx] = buf; 790 buf->b_flags |= ARC_IN_HASH_TABLE; 791 792 /* collect some hash table performance data */ 793 if (i > 0) { 794 ARCSTAT_BUMP(arcstat_hash_collisions); 795 if (i == 1) 796 ARCSTAT_BUMP(arcstat_hash_chains); 797 798 ARCSTAT_MAX(arcstat_hash_chain_max, i); 799 } 800 801 ARCSTAT_BUMP(arcstat_hash_elements); 802 ARCSTAT_MAXSTAT(arcstat_hash_elements); 803 804 return (NULL); 805 } 806 807 static void 808 buf_hash_remove(arc_buf_hdr_t *buf) 809 { 810 arc_buf_hdr_t *fbuf, **bufp; 811 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 812 813 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); 814 ASSERT(HDR_IN_HASH_TABLE(buf)); 815 816 bufp = &buf_hash_table.ht_table[idx]; 817 while ((fbuf = *bufp) != buf) { 818 ASSERT(fbuf != NULL); 819 bufp = &fbuf->b_hash_next; 820 } 821 *bufp = buf->b_hash_next; 822 buf->b_hash_next = NULL; 823 buf->b_flags &= ~ARC_IN_HASH_TABLE; 824 825 /* collect some hash table performance data */ 826 ARCSTAT_BUMPDOWN(arcstat_hash_elements); 827 828 if (buf_hash_table.ht_table[idx] && 829 buf_hash_table.ht_table[idx]->b_hash_next == NULL) 830 ARCSTAT_BUMPDOWN(arcstat_hash_chains); 831 } 832 833 /* 834 * Global data structures and functions for the buf kmem cache. 835 */ 836 static kmem_cache_t *hdr_cache; 837 static kmem_cache_t *buf_cache; 838 839 static void 840 buf_fini(void) 841 { 842 int i; 843 844 kmem_free(buf_hash_table.ht_table, 845 (buf_hash_table.ht_mask + 1) * sizeof (void *)); 846 for (i = 0; i < BUF_LOCKS; i++) 847 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); 848 kmem_cache_destroy(hdr_cache); 849 kmem_cache_destroy(buf_cache); 850 } 851 852 /* 853 * Constructor callback - called when the cache is empty 854 * and a new buf is requested. 855 */ 856 /* ARGSUSED */ 857 static int 858 hdr_cons(void *vbuf, void *unused, int kmflag) 859 { 860 arc_buf_hdr_t *buf = vbuf; 861 862 bzero(buf, sizeof (arc_buf_hdr_t)); 863 refcount_create(&buf->b_refcnt); 864 cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); 865 mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); 866 arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); 867 868 return (0); 869 } 870 871 /* ARGSUSED */ 872 static int 873 buf_cons(void *vbuf, void *unused, int kmflag) 874 { 875 arc_buf_t *buf = vbuf; 876 877 bzero(buf, sizeof (arc_buf_t)); 878 mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); 879 arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); 880 881 return (0); 882 } 883 884 /* 885 * Destructor callback - called when a cached buf is 886 * no longer required. 
887 */ 888 /* ARGSUSED */ 889 static void 890 hdr_dest(void *vbuf, void *unused) 891 { 892 arc_buf_hdr_t *buf = vbuf; 893 894 ASSERT(BUF_EMPTY(buf)); 895 refcount_destroy(&buf->b_refcnt); 896 cv_destroy(&buf->b_cv); 897 mutex_destroy(&buf->b_freeze_lock); 898 arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); 899 } 900 901 /* ARGSUSED */ 902 static void 903 buf_dest(void *vbuf, void *unused) 904 { 905 arc_buf_t *buf = vbuf; 906 907 mutex_destroy(&buf->b_evict_lock); 908 arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); 909 } 910 911 /* 912 * Reclaim callback -- invoked when memory is low. 913 */ 914 /* ARGSUSED */ 915 static void 916 hdr_recl(void *unused) 917 { 918 dprintf("hdr_recl called\n"); 919 /* 920 * umem calls the reclaim func when we destroy the buf cache, 921 * which is after we do arc_fini(). 922 */ 923 if (!arc_dead) 924 cv_signal(&arc_reclaim_thr_cv); 925 } 926 927 static void 928 buf_init(void) 929 { 930 uint64_t *ct; 931 uint64_t hsize = 1ULL << 12; 932 int i, j; 933 934 /* 935 * The hash table is big enough to fill all of physical memory 936 * with an average 64K block size. The table will take up 937 * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers). 938 */ 939 while (hsize * 65536 < physmem * PAGESIZE) 940 hsize <<= 1; 941 retry: 942 buf_hash_table.ht_mask = hsize - 1; 943 buf_hash_table.ht_table = 944 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); 945 if (buf_hash_table.ht_table == NULL) { 946 ASSERT(hsize > (1ULL << 8)); 947 hsize >>= 1; 948 goto retry; 949 } 950 951 hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), 952 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0); 953 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 954 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); 955 956 for (i = 0; i < 256; i++) 957 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 958 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 959 960 for (i = 0; i < BUF_LOCKS; i++) { 961 mutex_init(&buf_hash_table.ht_locks[i].ht_lock, 962 NULL, MUTEX_DEFAULT, NULL); 963 } 964 } 965 966 #define ARC_MINTIME (hz>>4) /* 62 ms */ 967 968 static void 969 arc_cksum_verify(arc_buf_t *buf) 970 { 971 zio_cksum_t zc; 972 973 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 974 return; 975 976 mutex_enter(&buf->b_hdr->b_freeze_lock); 977 if (buf->b_hdr->b_freeze_cksum == NULL || 978 (buf->b_hdr->b_flags & ARC_IO_ERROR)) { 979 mutex_exit(&buf->b_hdr->b_freeze_lock); 980 return; 981 } 982 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 983 if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) 984 panic("buffer modified while frozen!"); 985 mutex_exit(&buf->b_hdr->b_freeze_lock); 986 } 987 988 static int 989 arc_cksum_equal(arc_buf_t *buf) 990 { 991 zio_cksum_t zc; 992 int equal; 993 994 mutex_enter(&buf->b_hdr->b_freeze_lock); 995 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 996 equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); 997 mutex_exit(&buf->b_hdr->b_freeze_lock); 998 999 return (equal); 1000 } 1001 1002 static void 1003 arc_cksum_compute(arc_buf_t *buf, boolean_t force) 1004 { 1005 if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) 1006 return; 1007 1008 mutex_enter(&buf->b_hdr->b_freeze_lock); 1009 if (buf->b_hdr->b_freeze_cksum != NULL) { 1010 mutex_exit(&buf->b_hdr->b_freeze_lock); 1011 return; 1012 } 1013 buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); 1014 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, 1015 buf->b_hdr->b_freeze_cksum); 1016 mutex_exit(&buf->b_hdr->b_freeze_lock); 1017 
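	/*
	 * The checksum is now frozen; arm a userland write watchpoint on the
	 * buffer so that any modification of the frozen data trips at once
	 * (only active with ZFS_DEBUG=watch, see arc_watch above).
	 */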
arc_buf_watch(buf); 1018 } 1019 1020 #ifndef _KERNEL 1021 typedef struct procctl { 1022 long cmd; 1023 prwatch_t prwatch; 1024 } procctl_t; 1025 #endif 1026 1027 /* ARGSUSED */ 1028 static void 1029 arc_buf_unwatch(arc_buf_t *buf) 1030 { 1031 #ifndef _KERNEL 1032 if (arc_watch) { 1033 int result; 1034 procctl_t ctl; 1035 ctl.cmd = PCWATCH; 1036 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1037 ctl.prwatch.pr_size = 0; 1038 ctl.prwatch.pr_wflags = 0; 1039 result = write(arc_procfd, &ctl, sizeof (ctl)); 1040 ASSERT3U(result, ==, sizeof (ctl)); 1041 } 1042 #endif 1043 } 1044 1045 /* ARGSUSED */ 1046 static void 1047 arc_buf_watch(arc_buf_t *buf) 1048 { 1049 #ifndef _KERNEL 1050 if (arc_watch) { 1051 int result; 1052 procctl_t ctl; 1053 ctl.cmd = PCWATCH; 1054 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1055 ctl.prwatch.pr_size = buf->b_hdr->b_size; 1056 ctl.prwatch.pr_wflags = WA_WRITE; 1057 result = write(arc_procfd, &ctl, sizeof (ctl)); 1058 ASSERT3U(result, ==, sizeof (ctl)); 1059 } 1060 #endif 1061 } 1062 1063 void 1064 arc_buf_thaw(arc_buf_t *buf) 1065 { 1066 if (zfs_flags & ZFS_DEBUG_MODIFY) { 1067 if (buf->b_hdr->b_state != arc_anon) 1068 panic("modifying non-anon buffer!"); 1069 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) 1070 panic("modifying buffer while i/o in progress!"); 1071 arc_cksum_verify(buf); 1072 } 1073 1074 mutex_enter(&buf->b_hdr->b_freeze_lock); 1075 if (buf->b_hdr->b_freeze_cksum != NULL) { 1076 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 1077 buf->b_hdr->b_freeze_cksum = NULL; 1078 } 1079 1080 if (zfs_flags & ZFS_DEBUG_MODIFY) { 1081 if (buf->b_hdr->b_thawed) 1082 kmem_free(buf->b_hdr->b_thawed, 1); 1083 buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP); 1084 } 1085 1086 mutex_exit(&buf->b_hdr->b_freeze_lock); 1087 1088 arc_buf_unwatch(buf); 1089 } 1090 1091 void 1092 arc_buf_freeze(arc_buf_t *buf) 1093 { 1094 kmutex_t *hash_lock; 1095 1096 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1097 return; 1098 1099 hash_lock = HDR_LOCK(buf->b_hdr); 1100 mutex_enter(hash_lock); 1101 1102 ASSERT(buf->b_hdr->b_freeze_cksum != NULL || 1103 buf->b_hdr->b_state == arc_anon); 1104 arc_cksum_compute(buf, B_FALSE); 1105 mutex_exit(hash_lock); 1106 1107 } 1108 1109 static void 1110 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) 1111 { 1112 ASSERT(MUTEX_HELD(hash_lock)); 1113 1114 if ((refcount_add(&ab->b_refcnt, tag) == 1) && 1115 (ab->b_state != arc_anon)) { 1116 uint64_t delta = ab->b_size * ab->b_datacnt; 1117 list_t *list = &ab->b_state->arcs_list[ab->b_type]; 1118 uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type]; 1119 1120 ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx)); 1121 mutex_enter(&ab->b_state->arcs_mtx); 1122 ASSERT(list_link_active(&ab->b_arc_node)); 1123 list_remove(list, ab); 1124 if (GHOST_STATE(ab->b_state)) { 1125 ASSERT0(ab->b_datacnt); 1126 ASSERT3P(ab->b_buf, ==, NULL); 1127 delta = ab->b_size; 1128 } 1129 ASSERT(delta > 0); 1130 ASSERT3U(*size, >=, delta); 1131 atomic_add_64(size, -delta); 1132 mutex_exit(&ab->b_state->arcs_mtx); 1133 /* remove the prefetch flag if we get a reference */ 1134 if (ab->b_flags & ARC_PREFETCH) 1135 ab->b_flags &= ~ARC_PREFETCH; 1136 } 1137 } 1138 1139 static int 1140 remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) 1141 { 1142 int cnt; 1143 arc_state_t *state = ab->b_state; 1144 1145 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); 1146 ASSERT(!GHOST_STATE(state)); 1147 1148 if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) && 1149 (state != arc_anon)) { 1150 uint64_t *size = 
&state->arcs_lsize[ab->b_type]; 1151 1152 ASSERT(!MUTEX_HELD(&state->arcs_mtx)); 1153 mutex_enter(&state->arcs_mtx); 1154 ASSERT(!list_link_active(&ab->b_arc_node)); 1155 list_insert_head(&state->arcs_list[ab->b_type], ab); 1156 ASSERT(ab->b_datacnt > 0); 1157 atomic_add_64(size, ab->b_size * ab->b_datacnt); 1158 mutex_exit(&state->arcs_mtx); 1159 } 1160 return (cnt); 1161 } 1162 1163 /* 1164 * Move the supplied buffer to the indicated state. The mutex 1165 * for the buffer must be held by the caller. 1166 */ 1167 static void 1168 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) 1169 { 1170 arc_state_t *old_state = ab->b_state; 1171 int64_t refcnt = refcount_count(&ab->b_refcnt); 1172 uint64_t from_delta, to_delta; 1173 1174 ASSERT(MUTEX_HELD(hash_lock)); 1175 ASSERT3P(new_state, !=, old_state); 1176 ASSERT(refcnt == 0 || ab->b_datacnt > 0); 1177 ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state)); 1178 ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon); 1179 1180 from_delta = to_delta = ab->b_datacnt * ab->b_size; 1181 1182 /* 1183 * If this buffer is evictable, transfer it from the 1184 * old state list to the new state list. 1185 */ 1186 if (refcnt == 0) { 1187 if (old_state != arc_anon) { 1188 int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx); 1189 uint64_t *size = &old_state->arcs_lsize[ab->b_type]; 1190 1191 if (use_mutex) 1192 mutex_enter(&old_state->arcs_mtx); 1193 1194 ASSERT(list_link_active(&ab->b_arc_node)); 1195 list_remove(&old_state->arcs_list[ab->b_type], ab); 1196 1197 /* 1198 * If prefetching out of the ghost cache, 1199 * we will have a non-zero datacnt. 1200 */ 1201 if (GHOST_STATE(old_state) && ab->b_datacnt == 0) { 1202 /* ghost elements have a ghost size */ 1203 ASSERT(ab->b_buf == NULL); 1204 from_delta = ab->b_size; 1205 } 1206 ASSERT3U(*size, >=, from_delta); 1207 atomic_add_64(size, -from_delta); 1208 1209 if (use_mutex) 1210 mutex_exit(&old_state->arcs_mtx); 1211 } 1212 if (new_state != arc_anon) { 1213 int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx); 1214 uint64_t *size = &new_state->arcs_lsize[ab->b_type]; 1215 1216 if (use_mutex) 1217 mutex_enter(&new_state->arcs_mtx); 1218 1219 list_insert_head(&new_state->arcs_list[ab->b_type], ab); 1220 1221 /* ghost elements have a ghost size */ 1222 if (GHOST_STATE(new_state)) { 1223 ASSERT(ab->b_datacnt == 0); 1224 ASSERT(ab->b_buf == NULL); 1225 to_delta = ab->b_size; 1226 } 1227 atomic_add_64(size, to_delta); 1228 1229 if (use_mutex) 1230 mutex_exit(&new_state->arcs_mtx); 1231 } 1232 } 1233 1234 ASSERT(!BUF_EMPTY(ab)); 1235 if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab)) 1236 buf_hash_remove(ab); 1237 1238 /* adjust state sizes */ 1239 if (to_delta) 1240 atomic_add_64(&new_state->arcs_size, to_delta); 1241 if (from_delta) { 1242 ASSERT3U(old_state->arcs_size, >=, from_delta); 1243 atomic_add_64(&old_state->arcs_size, -from_delta); 1244 } 1245 ab->b_state = new_state; 1246 1247 /* adjust l2arc hdr stats */ 1248 if (new_state == arc_l2c_only) 1249 l2arc_hdr_stat_add(); 1250 else if (old_state == arc_l2c_only) 1251 l2arc_hdr_stat_remove(); 1252 } 1253 1254 void 1255 arc_space_consume(uint64_t space, arc_space_type_t type) 1256 { 1257 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 1258 1259 switch (type) { 1260 case ARC_SPACE_DATA: 1261 ARCSTAT_INCR(arcstat_data_size, space); 1262 break; 1263 case ARC_SPACE_OTHER: 1264 ARCSTAT_INCR(arcstat_other_size, space); 1265 break; 1266 case ARC_SPACE_HDRS: 1267 ARCSTAT_INCR(arcstat_hdr_size, space); 1268 break; 1269 case ARC_SPACE_L2HDRS: 
1270 ARCSTAT_INCR(arcstat_l2_hdr_size, space); 1271 break; 1272 } 1273 1274 ARCSTAT_INCR(arcstat_meta_used, space); 1275 atomic_add_64(&arc_size, space); 1276 } 1277 1278 void 1279 arc_space_return(uint64_t space, arc_space_type_t type) 1280 { 1281 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 1282 1283 switch (type) { 1284 case ARC_SPACE_DATA: 1285 ARCSTAT_INCR(arcstat_data_size, -space); 1286 break; 1287 case ARC_SPACE_OTHER: 1288 ARCSTAT_INCR(arcstat_other_size, -space); 1289 break; 1290 case ARC_SPACE_HDRS: 1291 ARCSTAT_INCR(arcstat_hdr_size, -space); 1292 break; 1293 case ARC_SPACE_L2HDRS: 1294 ARCSTAT_INCR(arcstat_l2_hdr_size, -space); 1295 break; 1296 } 1297 1298 ASSERT(arc_meta_used >= space); 1299 if (arc_meta_max < arc_meta_used) 1300 arc_meta_max = arc_meta_used; 1301 ARCSTAT_INCR(arcstat_meta_used, -space); 1302 ASSERT(arc_size >= space); 1303 atomic_add_64(&arc_size, -space); 1304 } 1305 1306 void * 1307 arc_data_buf_alloc(uint64_t size) 1308 { 1309 if (arc_evict_needed(ARC_BUFC_DATA)) 1310 cv_signal(&arc_reclaim_thr_cv); 1311 atomic_add_64(&arc_size, size); 1312 return (zio_data_buf_alloc(size)); 1313 } 1314 1315 void 1316 arc_data_buf_free(void *buf, uint64_t size) 1317 { 1318 zio_data_buf_free(buf, size); 1319 ASSERT(arc_size >= size); 1320 atomic_add_64(&arc_size, -size); 1321 } 1322 1323 arc_buf_t * 1324 arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) 1325 { 1326 arc_buf_hdr_t *hdr; 1327 arc_buf_t *buf; 1328 1329 ASSERT3U(size, >, 0); 1330 hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); 1331 ASSERT(BUF_EMPTY(hdr)); 1332 hdr->b_size = size; 1333 hdr->b_type = type; 1334 hdr->b_spa = spa_load_guid(spa); 1335 hdr->b_state = arc_anon; 1336 hdr->b_arc_access = 0; 1337 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 1338 buf->b_hdr = hdr; 1339 buf->b_data = NULL; 1340 buf->b_efunc = NULL; 1341 buf->b_private = NULL; 1342 buf->b_next = NULL; 1343 hdr->b_buf = buf; 1344 arc_get_data_buf(buf); 1345 hdr->b_datacnt = 1; 1346 hdr->b_flags = 0; 1347 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1348 (void) refcount_add(&hdr->b_refcnt, tag); 1349 1350 return (buf); 1351 } 1352 1353 static char *arc_onloan_tag = "onloan"; 1354 1355 /* 1356 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in 1357 * flight data by arc_tempreserve_space() until they are "returned". Loaned 1358 * buffers must be returned to the arc before they can be used by the DMU or 1359 * freed. 1360 */ 1361 arc_buf_t * 1362 arc_loan_buf(spa_t *spa, int size) 1363 { 1364 arc_buf_t *buf; 1365 1366 buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA); 1367 1368 atomic_add_64(&arc_loaned_bytes, size); 1369 return (buf); 1370 } 1371 1372 /* 1373 * Return a loaned arc buffer to the arc. 
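 *
 * A sketch of the loan/return cycle (illustrative only; "tag" is whatever
 * reference the eventual owner uses):
 *
 *	arc_buf_t *buf = arc_loan_buf(spa, size);
 *	... fill in buf->b_data ...
 *	arc_return_buf(buf, tag);
 *
 * arc_loaned_bytes rises on loan and falls again here, so loaned buffers
 * stay out of the in-flight accounting described above arc_loan_buf().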
1374 */ 1375 void 1376 arc_return_buf(arc_buf_t *buf, void *tag) 1377 { 1378 arc_buf_hdr_t *hdr = buf->b_hdr; 1379 1380 ASSERT(buf->b_data != NULL); 1381 (void) refcount_add(&hdr->b_refcnt, tag); 1382 (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag); 1383 1384 atomic_add_64(&arc_loaned_bytes, -hdr->b_size); 1385 } 1386 1387 /* Detach an arc_buf from a dbuf (tag) */ 1388 void 1389 arc_loan_inuse_buf(arc_buf_t *buf, void *tag) 1390 { 1391 arc_buf_hdr_t *hdr; 1392 1393 ASSERT(buf->b_data != NULL); 1394 hdr = buf->b_hdr; 1395 (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag); 1396 (void) refcount_remove(&hdr->b_refcnt, tag); 1397 buf->b_efunc = NULL; 1398 buf->b_private = NULL; 1399 1400 atomic_add_64(&arc_loaned_bytes, hdr->b_size); 1401 } 1402 1403 static arc_buf_t * 1404 arc_buf_clone(arc_buf_t *from) 1405 { 1406 arc_buf_t *buf; 1407 arc_buf_hdr_t *hdr = from->b_hdr; 1408 uint64_t size = hdr->b_size; 1409 1410 ASSERT(hdr->b_state != arc_anon); 1411 1412 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 1413 buf->b_hdr = hdr; 1414 buf->b_data = NULL; 1415 buf->b_efunc = NULL; 1416 buf->b_private = NULL; 1417 buf->b_next = hdr->b_buf; 1418 hdr->b_buf = buf; 1419 arc_get_data_buf(buf); 1420 bcopy(from->b_data, buf->b_data, size); 1421 1422 /* 1423 * This buffer already exists in the arc so create a duplicate 1424 * copy for the caller. If the buffer is associated with user data 1425 * then track the size and number of duplicates. These stats will be 1426 * updated as duplicate buffers are created and destroyed. 1427 */ 1428 if (hdr->b_type == ARC_BUFC_DATA) { 1429 ARCSTAT_BUMP(arcstat_duplicate_buffers); 1430 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); 1431 } 1432 hdr->b_datacnt += 1; 1433 return (buf); 1434 } 1435 1436 void 1437 arc_buf_add_ref(arc_buf_t *buf, void* tag) 1438 { 1439 arc_buf_hdr_t *hdr; 1440 kmutex_t *hash_lock; 1441 1442 /* 1443 * Check to see if this buffer is evicted. Callers 1444 * must verify b_data != NULL to know if the add_ref 1445 * was successful. 1446 */ 1447 mutex_enter(&buf->b_evict_lock); 1448 if (buf->b_data == NULL) { 1449 mutex_exit(&buf->b_evict_lock); 1450 return; 1451 } 1452 hash_lock = HDR_LOCK(buf->b_hdr); 1453 mutex_enter(hash_lock); 1454 hdr = buf->b_hdr; 1455 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 1456 mutex_exit(&buf->b_evict_lock); 1457 1458 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 1459 add_reference(hdr, hash_lock, tag); 1460 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 1461 arc_access(hdr, hash_lock); 1462 mutex_exit(hash_lock); 1463 ARCSTAT_BUMP(arcstat_hits); 1464 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 1465 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 1466 data, metadata, hits); 1467 } 1468 1469 /* 1470 * Free the arc data buffer. If it is an l2arc write in progress, 1471 * the buffer is placed on l2arc_free_on_write to be freed later. 
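 * (The deferred entries are expected to be drained, and free_func finally
 * invoked, once the in-flight L2ARC write no longer references b_data.)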
1472 */ 1473 static void 1474 arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t)) 1475 { 1476 arc_buf_hdr_t *hdr = buf->b_hdr; 1477 1478 if (HDR_L2_WRITING(hdr)) { 1479 l2arc_data_free_t *df; 1480 df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP); 1481 df->l2df_data = buf->b_data; 1482 df->l2df_size = hdr->b_size; 1483 df->l2df_func = free_func; 1484 mutex_enter(&l2arc_free_on_write_mtx); 1485 list_insert_head(l2arc_free_on_write, df); 1486 mutex_exit(&l2arc_free_on_write_mtx); 1487 ARCSTAT_BUMP(arcstat_l2_free_on_write); 1488 } else { 1489 free_func(buf->b_data, hdr->b_size); 1490 } 1491 } 1492 1493 static void 1494 arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) 1495 { 1496 arc_buf_t **bufp; 1497 1498 /* free up data associated with the buf */ 1499 if (buf->b_data) { 1500 arc_state_t *state = buf->b_hdr->b_state; 1501 uint64_t size = buf->b_hdr->b_size; 1502 arc_buf_contents_t type = buf->b_hdr->b_type; 1503 1504 arc_cksum_verify(buf); 1505 arc_buf_unwatch(buf); 1506 1507 if (!recycle) { 1508 if (type == ARC_BUFC_METADATA) { 1509 arc_buf_data_free(buf, zio_buf_free); 1510 arc_space_return(size, ARC_SPACE_DATA); 1511 } else { 1512 ASSERT(type == ARC_BUFC_DATA); 1513 arc_buf_data_free(buf, zio_data_buf_free); 1514 ARCSTAT_INCR(arcstat_data_size, -size); 1515 atomic_add_64(&arc_size, -size); 1516 } 1517 } 1518 if (list_link_active(&buf->b_hdr->b_arc_node)) { 1519 uint64_t *cnt = &state->arcs_lsize[type]; 1520 1521 ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt)); 1522 ASSERT(state != arc_anon); 1523 1524 ASSERT3U(*cnt, >=, size); 1525 atomic_add_64(cnt, -size); 1526 } 1527 ASSERT3U(state->arcs_size, >=, size); 1528 atomic_add_64(&state->arcs_size, -size); 1529 buf->b_data = NULL; 1530 1531 /* 1532 * If we're destroying a duplicate buffer make sure 1533 * that the appropriate statistics are updated. 1534 */ 1535 if (buf->b_hdr->b_datacnt > 1 && 1536 buf->b_hdr->b_type == ARC_BUFC_DATA) { 1537 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); 1538 ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size); 1539 } 1540 ASSERT(buf->b_hdr->b_datacnt > 0); 1541 buf->b_hdr->b_datacnt -= 1; 1542 } 1543 1544 /* only remove the buf if requested */ 1545 if (!all) 1546 return; 1547 1548 /* remove the buf from the hdr list */ 1549 for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next) 1550 continue; 1551 *bufp = buf->b_next; 1552 buf->b_next = NULL; 1553 1554 ASSERT(buf->b_efunc == NULL); 1555 1556 /* clean up the buf */ 1557 buf->b_hdr = NULL; 1558 kmem_cache_free(buf_cache, buf); 1559 } 1560 1561 static void 1562 arc_hdr_destroy(arc_buf_hdr_t *hdr) 1563 { 1564 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1565 ASSERT3P(hdr->b_state, ==, arc_anon); 1566 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 1567 l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr; 1568 1569 if (l2hdr != NULL) { 1570 boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx); 1571 /* 1572 * To prevent arc_free() and l2arc_evict() from 1573 * attempting to free the same buffer at the same time, 1574 * a FREE_IN_PROGRESS flag is given to arc_free() to 1575 * give it priority. l2arc_evict() can't destroy this 1576 * header while we are waiting on l2arc_buflist_mtx. 1577 * 1578 * The hdr may be removed from l2ad_buflist before we 1579 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked. 
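 * (That is why l2hdr is reloaded and re-checked for NULL after the
 * mutex is taken below.)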
1580 */ 1581 if (!buflist_held) { 1582 mutex_enter(&l2arc_buflist_mtx); 1583 l2hdr = hdr->b_l2hdr; 1584 } 1585 1586 if (l2hdr != NULL) { 1587 list_remove(l2hdr->b_dev->l2ad_buflist, hdr); 1588 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 1589 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); 1590 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); 1591 if (hdr->b_state == arc_l2c_only) 1592 l2arc_hdr_stat_remove(); 1593 hdr->b_l2hdr = NULL; 1594 } 1595 1596 if (!buflist_held) 1597 mutex_exit(&l2arc_buflist_mtx); 1598 } 1599 1600 if (!BUF_EMPTY(hdr)) { 1601 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 1602 buf_discard_identity(hdr); 1603 } 1604 while (hdr->b_buf) { 1605 arc_buf_t *buf = hdr->b_buf; 1606 1607 if (buf->b_efunc) { 1608 mutex_enter(&arc_eviction_mtx); 1609 mutex_enter(&buf->b_evict_lock); 1610 ASSERT(buf->b_hdr != NULL); 1611 arc_buf_destroy(hdr->b_buf, FALSE, FALSE); 1612 hdr->b_buf = buf->b_next; 1613 buf->b_hdr = &arc_eviction_hdr; 1614 buf->b_next = arc_eviction_list; 1615 arc_eviction_list = buf; 1616 mutex_exit(&buf->b_evict_lock); 1617 mutex_exit(&arc_eviction_mtx); 1618 } else { 1619 arc_buf_destroy(hdr->b_buf, FALSE, TRUE); 1620 } 1621 } 1622 if (hdr->b_freeze_cksum != NULL) { 1623 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 1624 hdr->b_freeze_cksum = NULL; 1625 } 1626 if (hdr->b_thawed) { 1627 kmem_free(hdr->b_thawed, 1); 1628 hdr->b_thawed = NULL; 1629 } 1630 1631 ASSERT(!list_link_active(&hdr->b_arc_node)); 1632 ASSERT3P(hdr->b_hash_next, ==, NULL); 1633 ASSERT3P(hdr->b_acb, ==, NULL); 1634 kmem_cache_free(hdr_cache, hdr); 1635 } 1636 1637 void 1638 arc_buf_free(arc_buf_t *buf, void *tag) 1639 { 1640 arc_buf_hdr_t *hdr = buf->b_hdr; 1641 int hashed = hdr->b_state != arc_anon; 1642 1643 ASSERT(buf->b_efunc == NULL); 1644 ASSERT(buf->b_data != NULL); 1645 1646 if (hashed) { 1647 kmutex_t *hash_lock = HDR_LOCK(hdr); 1648 1649 mutex_enter(hash_lock); 1650 hdr = buf->b_hdr; 1651 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 1652 1653 (void) remove_reference(hdr, hash_lock, tag); 1654 if (hdr->b_datacnt > 1) { 1655 arc_buf_destroy(buf, FALSE, TRUE); 1656 } else { 1657 ASSERT(buf == hdr->b_buf); 1658 ASSERT(buf->b_efunc == NULL); 1659 hdr->b_flags |= ARC_BUF_AVAILABLE; 1660 } 1661 mutex_exit(hash_lock); 1662 } else if (HDR_IO_IN_PROGRESS(hdr)) { 1663 int destroy_hdr; 1664 /* 1665 * We are in the middle of an async write. Don't destroy 1666 * this buffer unless the write completes before we finish 1667 * decrementing the reference count. 
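 * If the I/O is still in progress once our reference is dropped, the
 * header is left for the write-completion path to destroy.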
1668 */ 1669 mutex_enter(&arc_eviction_mtx); 1670 (void) remove_reference(hdr, NULL, tag); 1671 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1672 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); 1673 mutex_exit(&arc_eviction_mtx); 1674 if (destroy_hdr) 1675 arc_hdr_destroy(hdr); 1676 } else { 1677 if (remove_reference(hdr, NULL, tag) > 0) 1678 arc_buf_destroy(buf, FALSE, TRUE); 1679 else 1680 arc_hdr_destroy(hdr); 1681 } 1682 } 1683 1684 boolean_t 1685 arc_buf_remove_ref(arc_buf_t *buf, void* tag) 1686 { 1687 arc_buf_hdr_t *hdr = buf->b_hdr; 1688 kmutex_t *hash_lock = HDR_LOCK(hdr); 1689 boolean_t no_callback = (buf->b_efunc == NULL); 1690 1691 if (hdr->b_state == arc_anon) { 1692 ASSERT(hdr->b_datacnt == 1); 1693 arc_buf_free(buf, tag); 1694 return (no_callback); 1695 } 1696 1697 mutex_enter(hash_lock); 1698 hdr = buf->b_hdr; 1699 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 1700 ASSERT(hdr->b_state != arc_anon); 1701 ASSERT(buf->b_data != NULL); 1702 1703 (void) remove_reference(hdr, hash_lock, tag); 1704 if (hdr->b_datacnt > 1) { 1705 if (no_callback) 1706 arc_buf_destroy(buf, FALSE, TRUE); 1707 } else if (no_callback) { 1708 ASSERT(hdr->b_buf == buf && buf->b_next == NULL); 1709 ASSERT(buf->b_efunc == NULL); 1710 hdr->b_flags |= ARC_BUF_AVAILABLE; 1711 } 1712 ASSERT(no_callback || hdr->b_datacnt > 1 || 1713 refcount_is_zero(&hdr->b_refcnt)); 1714 mutex_exit(hash_lock); 1715 return (no_callback); 1716 } 1717 1718 int 1719 arc_buf_size(arc_buf_t *buf) 1720 { 1721 return (buf->b_hdr->b_size); 1722 } 1723 1724 /* 1725 * Called from the DMU to determine if the current buffer should be 1726 * evicted. In order to ensure proper locking, the eviction must be initiated 1727 * from the DMU. Return true if the buffer is associated with user data and 1728 * duplicate buffers still exist. 1729 */ 1730 boolean_t 1731 arc_buf_eviction_needed(arc_buf_t *buf) 1732 { 1733 arc_buf_hdr_t *hdr; 1734 boolean_t evict_needed = B_FALSE; 1735 1736 if (zfs_disable_dup_eviction) 1737 return (B_FALSE); 1738 1739 mutex_enter(&buf->b_evict_lock); 1740 hdr = buf->b_hdr; 1741 if (hdr == NULL) { 1742 /* 1743 * We are in arc_do_user_evicts(); let that function 1744 * perform the eviction. 1745 */ 1746 ASSERT(buf->b_data == NULL); 1747 mutex_exit(&buf->b_evict_lock); 1748 return (B_FALSE); 1749 } else if (buf->b_data == NULL) { 1750 /* 1751 * We have already been added to the arc eviction list; 1752 * recommend eviction. 1753 */ 1754 ASSERT3P(hdr, ==, &arc_eviction_hdr); 1755 mutex_exit(&buf->b_evict_lock); 1756 return (B_TRUE); 1757 } 1758 1759 if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA) 1760 evict_needed = B_TRUE; 1761 1762 mutex_exit(&buf->b_evict_lock); 1763 return (evict_needed); 1764 } 1765 1766 /* 1767 * Evict buffers from list until we've removed the specified number of 1768 * bytes. Move the removed buffers to the appropriate evict state. 1769 * If the recycle flag is set, then attempt to "recycle" a buffer: 1770 * - look for a buffer to evict that is `bytes' long. 1771 * - return the data block from this buffer rather than freeing it. 1772 * This flag is used by callers that are trying to make space for a 1773 * new buffer in a full arc cache. 1774 * 1775 * This function makes a "best effort". It skips over any buffers 1776 * it can't get a hash_lock on, and so may not catch all candidates. 1777 * It may also return without evicting as much space as requested. 
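 *
 * For example (illustrative values only), a caller trying to recycle a
 * 16K data block from any pool might do:
 *
 *	void *data = arc_evict(arc_mru, 0, 16384, TRUE, ARC_BUFC_DATA);
 *	if (data == NULL)
 *		data = zio_data_buf_alloc(16384);
 *
 * where a spa of 0 means "do not filter by pool" and a NULL return means
 * nothing of the requested size could be recycled.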
1778 */ 1779 static void * 1780 arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, 1781 arc_buf_contents_t type) 1782 { 1783 arc_state_t *evicted_state; 1784 uint64_t bytes_evicted = 0, skipped = 0, missed = 0; 1785 arc_buf_hdr_t *ab, *ab_prev = NULL; 1786 list_t *list = &state->arcs_list[type]; 1787 kmutex_t *hash_lock; 1788 boolean_t have_lock; 1789 void *stolen = NULL; 1790 arc_buf_hdr_t marker = { 0 }; 1791 int count = 0; 1792 1793 ASSERT(state == arc_mru || state == arc_mfu); 1794 1795 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; 1796 1797 mutex_enter(&state->arcs_mtx); 1798 mutex_enter(&evicted_state->arcs_mtx); 1799 1800 for (ab = list_tail(list); ab; ab = ab_prev) { 1801 ab_prev = list_prev(list, ab); 1802 /* prefetch buffers have a minimum lifespan */ 1803 if (HDR_IO_IN_PROGRESS(ab) || 1804 (spa && ab->b_spa != spa) || 1805 (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && 1806 ddi_get_lbolt() - ab->b_arc_access < 1807 arc_min_prefetch_lifespan)) { 1808 skipped++; 1809 continue; 1810 } 1811 /* "lookahead" for better eviction candidate */ 1812 if (recycle && ab->b_size != bytes && 1813 ab_prev && ab_prev->b_size == bytes) 1814 continue; 1815 1816 /* ignore markers */ 1817 if (ab->b_spa == 0) 1818 continue; 1819 1820 /* 1821 * It may take a long time to evict all the bufs requested. 1822 * To avoid blocking all arc activity, periodically drop 1823 * the arcs_mtx and give other threads a chance to run 1824 * before reacquiring the lock. 1825 * 1826 * If we are looking for a buffer to recycle, we are in 1827 * the hot code path, so don't sleep. 1828 */ 1829 if (!recycle && count++ > arc_evict_iterations) { 1830 list_insert_after(list, ab, &marker); 1831 mutex_exit(&evicted_state->arcs_mtx); 1832 mutex_exit(&state->arcs_mtx); 1833 kpreempt(KPREEMPT_SYNC); 1834 mutex_enter(&state->arcs_mtx); 1835 mutex_enter(&evicted_state->arcs_mtx); 1836 ab_prev = list_prev(list, &marker); 1837 list_remove(list, &marker); 1838 count = 0; 1839 continue; 1840 } 1841 1842 hash_lock = HDR_LOCK(ab); 1843 have_lock = MUTEX_HELD(hash_lock); 1844 if (have_lock || mutex_tryenter(hash_lock)) { 1845 ASSERT0(refcount_count(&ab->b_refcnt)); 1846 ASSERT(ab->b_datacnt > 0); 1847 while (ab->b_buf) { 1848 arc_buf_t *buf = ab->b_buf; 1849 if (!mutex_tryenter(&buf->b_evict_lock)) { 1850 missed += 1; 1851 break; 1852 } 1853 if (buf->b_data) { 1854 bytes_evicted += ab->b_size; 1855 if (recycle && ab->b_type == type && 1856 ab->b_size == bytes && 1857 !HDR_L2_WRITING(ab)) { 1858 stolen = buf->b_data; 1859 recycle = FALSE; 1860 } 1861 } 1862 if (buf->b_efunc) { 1863 mutex_enter(&arc_eviction_mtx); 1864 arc_buf_destroy(buf, 1865 buf->b_data == stolen, FALSE); 1866 ab->b_buf = buf->b_next; 1867 buf->b_hdr = &arc_eviction_hdr; 1868 buf->b_next = arc_eviction_list; 1869 arc_eviction_list = buf; 1870 mutex_exit(&arc_eviction_mtx); 1871 mutex_exit(&buf->b_evict_lock); 1872 } else { 1873 mutex_exit(&buf->b_evict_lock); 1874 arc_buf_destroy(buf, 1875 buf->b_data == stolen, TRUE); 1876 } 1877 } 1878 1879 if (ab->b_l2hdr) { 1880 ARCSTAT_INCR(arcstat_evict_l2_cached, 1881 ab->b_size); 1882 } else { 1883 if (l2arc_write_eligible(ab->b_spa, ab)) { 1884 ARCSTAT_INCR(arcstat_evict_l2_eligible, 1885 ab->b_size); 1886 } else { 1887 ARCSTAT_INCR( 1888 arcstat_evict_l2_ineligible, 1889 ab->b_size); 1890 } 1891 } 1892 1893 if (ab->b_datacnt == 0) { 1894 arc_change_state(evicted_state, ab, hash_lock); 1895 ASSERT(HDR_IN_HASH_TABLE(ab)); 1896 ab->b_flags |= ARC_IN_HASH_TABLE; 1897 ab->b_flags &= 
~ARC_BUF_AVAILABLE; 1898 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); 1899 } 1900 if (!have_lock) 1901 mutex_exit(hash_lock); 1902 if (bytes >= 0 && bytes_evicted >= bytes) 1903 break; 1904 } else { 1905 missed += 1; 1906 } 1907 } 1908 1909 mutex_exit(&evicted_state->arcs_mtx); 1910 mutex_exit(&state->arcs_mtx); 1911 1912 if (bytes_evicted < bytes) 1913 dprintf("only evicted %lld bytes from %x", 1914 (longlong_t)bytes_evicted, state); 1915 1916 if (skipped) 1917 ARCSTAT_INCR(arcstat_evict_skip, skipped); 1918 1919 if (missed) 1920 ARCSTAT_INCR(arcstat_mutex_miss, missed); 1921 1922 /* 1923 * Note: we have just evicted some data into the ghost state, 1924 * potentially putting the ghost size over the desired size. Rather 1925 * than evicting from the ghost list in this hot code path, leave 1926 * this chore to the arc_reclaim_thread(). 1927 */ 1928 1929 return (stolen); 1930 } 1931 1932 /* 1933 * Remove buffers from list until we've removed the specified number of 1934 * bytes. Destroy the buffers that are removed. 1935 */ 1936 static void 1937 arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes) 1938 { 1939 arc_buf_hdr_t *ab, *ab_prev; 1940 arc_buf_hdr_t marker = { 0 }; 1941 list_t *list = &state->arcs_list[ARC_BUFC_DATA]; 1942 kmutex_t *hash_lock; 1943 uint64_t bytes_deleted = 0; 1944 uint64_t bufs_skipped = 0; 1945 int count = 0; 1946 1947 ASSERT(GHOST_STATE(state)); 1948 top: 1949 mutex_enter(&state->arcs_mtx); 1950 for (ab = list_tail(list); ab; ab = ab_prev) { 1951 ab_prev = list_prev(list, ab); 1952 if (ab->b_type > ARC_BUFC_NUMTYPES) 1953 panic("invalid ab=%p", (void *)ab); 1954 if (spa && ab->b_spa != spa) 1955 continue; 1956 1957 /* ignore markers */ 1958 if (ab->b_spa == 0) 1959 continue; 1960 1961 hash_lock = HDR_LOCK(ab); 1962 /* caller may be trying to modify this buffer, skip it */ 1963 if (MUTEX_HELD(hash_lock)) 1964 continue; 1965 1966 /* 1967 * It may take a long time to evict all the bufs requested. 1968 * To avoid blocking all arc activity, periodically drop 1969 * the arcs_mtx and give other threads a chance to run 1970 * before reacquiring the lock. 1971 */ 1972 if (count++ > arc_evict_iterations) { 1973 list_insert_after(list, ab, &marker); 1974 mutex_exit(&state->arcs_mtx); 1975 kpreempt(KPREEMPT_SYNC); 1976 mutex_enter(&state->arcs_mtx); 1977 ab_prev = list_prev(list, &marker); 1978 list_remove(list, &marker); 1979 count = 0; 1980 continue; 1981 } 1982 if (mutex_tryenter(hash_lock)) { 1983 ASSERT(!HDR_IO_IN_PROGRESS(ab)); 1984 ASSERT(ab->b_buf == NULL); 1985 ARCSTAT_BUMP(arcstat_deleted); 1986 bytes_deleted += ab->b_size; 1987 1988 if (ab->b_l2hdr != NULL) { 1989 /* 1990 * This buffer is cached on the 2nd Level ARC; 1991 * don't destroy the header. 1992 */ 1993 arc_change_state(arc_l2c_only, ab, hash_lock); 1994 mutex_exit(hash_lock); 1995 } else { 1996 arc_change_state(arc_anon, ab, hash_lock); 1997 mutex_exit(hash_lock); 1998 arc_hdr_destroy(ab); 1999 } 2000 2001 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); 2002 if (bytes >= 0 && bytes_deleted >= bytes) 2003 break; 2004 } else if (bytes < 0) { 2005 /* 2006 * Insert a list marker and then wait for the 2007 * hash lock to become available. Once it's 2008 * available, restart from where we left off.
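 * (This blocking wait is only done for the "evict everything"
 * case, i.e. bytes < 0; the on-stack marker, identified by
 * b_spa == 0, keeps our place in the list while arcs_mtx is
 * dropped.)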
2009 */ 2010 list_insert_after(list, ab, &marker); 2011 mutex_exit(&state->arcs_mtx); 2012 mutex_enter(hash_lock); 2013 mutex_exit(hash_lock); 2014 mutex_enter(&state->arcs_mtx); 2015 ab_prev = list_prev(list, &marker); 2016 list_remove(list, &marker); 2017 } else { 2018 bufs_skipped += 1; 2019 } 2020 2021 } 2022 mutex_exit(&state->arcs_mtx); 2023 2024 if (list == &state->arcs_list[ARC_BUFC_DATA] && 2025 (bytes < 0 || bytes_deleted < bytes)) { 2026 list = &state->arcs_list[ARC_BUFC_METADATA]; 2027 goto top; 2028 } 2029 2030 if (bufs_skipped) { 2031 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); 2032 ASSERT(bytes >= 0); 2033 } 2034 2035 if (bytes_deleted < bytes) 2036 dprintf("only deleted %lld bytes from %p", 2037 (longlong_t)bytes_deleted, state); 2038 } 2039 2040 static void 2041 arc_adjust(void) 2042 { 2043 int64_t adjustment, delta; 2044 2045 /* 2046 * Adjust MRU size 2047 */ 2048 2049 adjustment = MIN((int64_t)(arc_size - arc_c), 2050 (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - 2051 arc_p)); 2052 2053 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { 2054 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment); 2055 (void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA); 2056 adjustment -= delta; 2057 } 2058 2059 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { 2060 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment); 2061 (void) arc_evict(arc_mru, NULL, delta, FALSE, 2062 ARC_BUFC_METADATA); 2063 } 2064 2065 /* 2066 * Adjust MFU size 2067 */ 2068 2069 adjustment = arc_size - arc_c; 2070 2071 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { 2072 delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]); 2073 (void) arc_evict(arc_mfu, NULL, delta, FALSE, ARC_BUFC_DATA); 2074 adjustment -= delta; 2075 } 2076 2077 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { 2078 int64_t delta = MIN(adjustment, 2079 arc_mfu->arcs_lsize[ARC_BUFC_METADATA]); 2080 (void) arc_evict(arc_mfu, NULL, delta, FALSE, 2081 ARC_BUFC_METADATA); 2082 } 2083 2084 /* 2085 * Adjust ghost lists 2086 */ 2087 2088 adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c; 2089 2090 if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) { 2091 delta = MIN(arc_mru_ghost->arcs_size, adjustment); 2092 arc_evict_ghost(arc_mru_ghost, NULL, delta); 2093 } 2094 2095 adjustment = 2096 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c; 2097 2098 if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) { 2099 delta = MIN(arc_mfu_ghost->arcs_size, adjustment); 2100 arc_evict_ghost(arc_mfu_ghost, NULL, delta); 2101 } 2102 } 2103 2104 static void 2105 arc_do_user_evicts(void) 2106 { 2107 mutex_enter(&arc_eviction_mtx); 2108 while (arc_eviction_list != NULL) { 2109 arc_buf_t *buf = arc_eviction_list; 2110 arc_eviction_list = buf->b_next; 2111 mutex_enter(&buf->b_evict_lock); 2112 buf->b_hdr = NULL; 2113 mutex_exit(&buf->b_evict_lock); 2114 mutex_exit(&arc_eviction_mtx); 2115 2116 if (buf->b_efunc != NULL) 2117 VERIFY(buf->b_efunc(buf) == 0); 2118 2119 buf->b_efunc = NULL; 2120 buf->b_private = NULL; 2121 kmem_cache_free(buf_cache, buf); 2122 mutex_enter(&arc_eviction_mtx); 2123 } 2124 mutex_exit(&arc_eviction_mtx); 2125 } 2126 2127 /* 2128 * Flush all *evictable* data from the cache for the given spa. 2129 * NOTE: this will not touch "active" (i.e. referenced) data. 
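 * When a spa is supplied, only buffers tagged with that pool's load
 * guid are evicted; with a NULL spa the evictable data of every pool
 * is flushed. In both cases any pending user-eviction callbacks are
 * run before returning.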
2130 */ 2131 void 2132 arc_flush(spa_t *spa) 2133 { 2134 uint64_t guid = 0; 2135 2136 if (spa) 2137 guid = spa_load_guid(spa); 2138 2139 while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) { 2140 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA); 2141 if (spa) 2142 break; 2143 } 2144 while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) { 2145 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA); 2146 if (spa) 2147 break; 2148 } 2149 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) { 2150 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA); 2151 if (spa) 2152 break; 2153 } 2154 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) { 2155 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA); 2156 if (spa) 2157 break; 2158 } 2159 2160 arc_evict_ghost(arc_mru_ghost, guid, -1); 2161 arc_evict_ghost(arc_mfu_ghost, guid, -1); 2162 2163 mutex_enter(&arc_reclaim_thr_lock); 2164 arc_do_user_evicts(); 2165 mutex_exit(&arc_reclaim_thr_lock); 2166 ASSERT(spa || arc_eviction_list == NULL); 2167 } 2168 2169 void 2170 arc_shrink(void) 2171 { 2172 if (arc_c > arc_c_min) { 2173 uint64_t to_free; 2174 2175 #ifdef _KERNEL 2176 to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree)); 2177 #else 2178 to_free = arc_c >> arc_shrink_shift; 2179 #endif 2180 if (arc_c > arc_c_min + to_free) 2181 atomic_add_64(&arc_c, -to_free); 2182 else 2183 arc_c = arc_c_min; 2184 2185 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); 2186 if (arc_c > arc_size) 2187 arc_c = MAX(arc_size, arc_c_min); 2188 if (arc_p > arc_c) 2189 arc_p = (arc_c >> 1); 2190 ASSERT(arc_c >= arc_c_min); 2191 ASSERT((int64_t)arc_p >= 0); 2192 } 2193 2194 if (arc_size > arc_c) 2195 arc_adjust(); 2196 } 2197 2198 /* 2199 * Determine if the system is under memory pressure and is asking 2200 * to reclaim memory. A return value of 1 indicates that the system 2201 * is under memory pressure and that the arc should adjust accordingly. 2202 */ 2203 static int 2204 arc_reclaim_needed(void) 2205 { 2206 uint64_t extra; 2207 2208 #ifdef _KERNEL 2209 2210 if (needfree) 2211 return (1); 2212 2213 /* 2214 * take 'desfree' extra pages, so we reclaim sooner, rather than later 2215 */ 2216 extra = desfree; 2217 2218 /* 2219 * check that we're out of range of the pageout scanner. It starts to 2220 * schedule paging if freemem is less than lotsfree and needfree. 2221 * lotsfree is the high-water mark for pageout, and needfree is the 2222 * number of needed free pages. We add extra pages here to make sure 2223 * the scanner doesn't start up while we're freeing memory. 2224 */ 2225 if (freemem < lotsfree + needfree + extra) 2226 return (1); 2227 2228 /* 2229 * check to make sure that swapfs has enough space so that anon 2230 * reservations can still succeed. anon_resvmem() checks that the 2231 * availrmem is greater than swapfs_minfree, and the number of reserved 2232 * swap pages. We also add a bit of extra here just to prevent 2233 * circumstances from getting really dire. 2234 */ 2235 if (availrmem < swapfs_minfree + swapfs_reserve + extra) 2236 return (1); 2237 2238 #if defined(__i386) 2239 /* 2240 * If we're on an i386 platform, it's possible that we'll exhaust the 2241 * kernel heap space before we ever run out of available physical 2242 * memory. Most checks of the size of the heap_area compare against 2243 * tune.t_minarmem, which is the minimum available real memory that we 2244 * can have in the system. However, this is generally fixed at 25 pages 2245 * which is so low that it's useless. 
In this comparison, we seek to 2246 * calculate the total heap-size, and reclaim if more than 3/4ths of the 2247 * heap is allocated. (Or, in the calculation, if less than 1/4th is 2248 * free) 2249 */ 2250 if (vmem_size(heap_arena, VMEM_FREE) < 2251 (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2)) 2252 return (1); 2253 #endif 2254 2255 /* 2256 * If zio data pages are being allocated out of a separate heap segment, 2257 * then enforce that the size of available vmem for this arena remains 2258 * above about 1/16th free. 2259 * 2260 * Note: The 1/16th arena free requirement was put in place 2261 * to aggressively evict memory from the arc in order to avoid 2262 * memory fragmentation issues. 2263 */ 2264 if (zio_arena != NULL && 2265 vmem_size(zio_arena, VMEM_FREE) < 2266 (vmem_size(zio_arena, VMEM_ALLOC) >> 4)) 2267 return (1); 2268 #else 2269 if (spa_get_random(100) == 0) 2270 return (1); 2271 #endif 2272 return (0); 2273 } 2274 2275 static void 2276 arc_kmem_reap_now(arc_reclaim_strategy_t strat) 2277 { 2278 size_t i; 2279 kmem_cache_t *prev_cache = NULL; 2280 kmem_cache_t *prev_data_cache = NULL; 2281 extern kmem_cache_t *zio_buf_cache[]; 2282 extern kmem_cache_t *zio_data_buf_cache[]; 2283 2284 #ifdef _KERNEL 2285 if (arc_meta_used >= arc_meta_limit) { 2286 /* 2287 * We are exceeding our meta-data cache limit. 2288 * Purge some DNLC entries to release holds on meta-data. 2289 */ 2290 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); 2291 } 2292 #if defined(__i386) 2293 /* 2294 * Reclaim unused memory from all kmem caches. 2295 */ 2296 kmem_reap(); 2297 #endif 2298 #endif 2299 2300 /* 2301 * An aggressive reclamation will shrink the cache size as well as 2302 * reap free buffers from the arc kmem caches. 2303 */ 2304 if (strat == ARC_RECLAIM_AGGR) 2305 arc_shrink(); 2306 2307 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { 2308 if (zio_buf_cache[i] != prev_cache) { 2309 prev_cache = zio_buf_cache[i]; 2310 kmem_cache_reap_now(zio_buf_cache[i]); 2311 } 2312 if (zio_data_buf_cache[i] != prev_data_cache) { 2313 prev_data_cache = zio_data_buf_cache[i]; 2314 kmem_cache_reap_now(zio_data_buf_cache[i]); 2315 } 2316 } 2317 kmem_cache_reap_now(buf_cache); 2318 kmem_cache_reap_now(hdr_cache); 2319 2320 /* 2321 * Ask the vmem arena to reclaim unused memory from its 2322 * quantum caches.
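 * This is only done for an aggressive reclaim, and only when a
 * separate zio_arena exists.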
2323 */ 2324 if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR) 2325 vmem_qcache_reap(zio_arena); 2326 } 2327 2328 static void 2329 arc_reclaim_thread(void) 2330 { 2331 clock_t growtime = 0; 2332 arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; 2333 callb_cpr_t cpr; 2334 2335 CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); 2336 2337 mutex_enter(&arc_reclaim_thr_lock); 2338 while (arc_thread_exit == 0) { 2339 if (arc_reclaim_needed()) { 2340 2341 if (arc_no_grow) { 2342 if (last_reclaim == ARC_RECLAIM_CONS) { 2343 last_reclaim = ARC_RECLAIM_AGGR; 2344 } else { 2345 last_reclaim = ARC_RECLAIM_CONS; 2346 } 2347 } else { 2348 arc_no_grow = TRUE; 2349 last_reclaim = ARC_RECLAIM_AGGR; 2350 membar_producer(); 2351 } 2352 2353 /* reset the growth delay for every reclaim */ 2354 growtime = ddi_get_lbolt() + (arc_grow_retry * hz); 2355 2356 arc_kmem_reap_now(last_reclaim); 2357 arc_warm = B_TRUE; 2358 2359 } else if (arc_no_grow && ddi_get_lbolt() >= growtime) { 2360 arc_no_grow = FALSE; 2361 } 2362 2363 arc_adjust(); 2364 2365 if (arc_eviction_list != NULL) 2366 arc_do_user_evicts(); 2367 2368 /* block until needed, or one second, whichever is shorter */ 2369 CALLB_CPR_SAFE_BEGIN(&cpr); 2370 (void) cv_timedwait(&arc_reclaim_thr_cv, 2371 &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz)); 2372 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); 2373 } 2374 2375 arc_thread_exit = 0; 2376 cv_broadcast(&arc_reclaim_thr_cv); 2377 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ 2378 thread_exit(); 2379 } 2380 2381 /* 2382 * Adapt arc info given the number of bytes we are trying to add and 2383 * the state that we are coming from. This function is only called 2384 * when we are adding new content to the cache. 2385 */ 2386 static void 2387 arc_adapt(int bytes, arc_state_t *state) 2388 { 2389 int mult; 2390 uint64_t arc_p_min = (arc_c >> arc_p_min_shift); 2391 2392 if (state == arc_l2c_only) 2393 return; 2394 2395 ASSERT(bytes > 0); 2396 /* 2397 * Adapt the target size of the MRU list: 2398 * - if we just hit in the MRU ghost list, then increase 2399 * the target size of the MRU list. 2400 * - if we just hit in the MFU ghost list, then increase 2401 * the target size of the MFU list by decreasing the 2402 * target size of the MRU list. 2403 */ 2404 if (state == arc_mru_ghost) { 2405 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? 2406 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); 2407 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ 2408 2409 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); 2410 } else if (state == arc_mfu_ghost) { 2411 uint64_t delta; 2412 2413 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2414 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); 2415 mult = MIN(mult, 10); 2416 2417 delta = MIN(bytes * mult, arc_p); 2418 arc_p = MAX(arc_p_min, arc_p - delta); 2419 } 2420 ASSERT((int64_t)arc_p >= 0); 2421 2422 if (arc_reclaim_needed()) { 2423 cv_signal(&arc_reclaim_thr_cv); 2424 return; 2425 } 2426 2427 if (arc_no_grow) 2428 return; 2429 2430 if (arc_c >= arc_c_max) 2431 return; 2432 2433 /* 2434 * If we're within (2 * maxblocksize) bytes of the target 2435 * cache size, increment the target cache size 2436 */ 2437 if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { 2438 atomic_add_64(&arc_c, (int64_t)bytes); 2439 if (arc_c > arc_c_max) 2440 arc_c = arc_c_max; 2441 else if (state == arc_anon) 2442 atomic_add_64(&arc_p, (int64_t)bytes); 2443 if (arc_p > arc_c) 2444 arc_p = arc_c; 2445 } 2446 ASSERT((int64_t)arc_p >= 0); 2447 } 2448 2449 /* 2450 * Check if the cache has reached its limits and eviction is required 2451 * prior to insert. 2452 */ 2453 static int 2454 arc_evict_needed(arc_buf_contents_t type) 2455 { 2456 if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) 2457 return (1); 2458 2459 if (arc_reclaim_needed()) 2460 return (1); 2461 2462 return (arc_size > arc_c); 2463 } 2464 2465 /* 2466 * The buffer, supplied as the first argument, needs a data block. 2467 * So, if we are at cache max, determine which cache should be victimized. 2468 * We have the following cases: 2469 * 2470 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) -> 2471 * In this situation if we're out of space, but the resident size of the MFU is 2472 * under the limit, victimize the MFU cache to satisfy this insertion request. 2473 * 2474 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) -> 2475 * Here, we've used up all of the available space for the MRU, so we need to 2476 * evict from our own cache instead. Evict from the set of resident MRU 2477 * entries. 2478 * 2479 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) -> 2480 * c minus p represents the MFU space in the cache, since p is the size of the 2481 * cache that is dedicated to the MRU. In this situation there's still space on 2482 * the MFU side, so the MRU side needs to be victimized. 2483 * 2484 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) -> 2485 * MFU's resident set is consuming more space than it has been allotted. In 2486 * this situation, we must victimize our own cache, the MFU, for this insertion. 2487 */ 2488 static void 2489 arc_get_data_buf(arc_buf_t *buf) 2490 { 2491 arc_state_t *state = buf->b_hdr->b_state; 2492 uint64_t size = buf->b_hdr->b_size; 2493 arc_buf_contents_t type = buf->b_hdr->b_type; 2494 2495 arc_adapt(size, state); 2496 2497 /* 2498 * We have not yet reached cache maximum size, 2499 * just allocate a new buffer. 2500 */ 2501 if (!arc_evict_needed(type)) { 2502 if (type == ARC_BUFC_METADATA) { 2503 buf->b_data = zio_buf_alloc(size); 2504 arc_space_consume(size, ARC_SPACE_DATA); 2505 } else { 2506 ASSERT(type == ARC_BUFC_DATA); 2507 buf->b_data = zio_data_buf_alloc(size); 2508 ARCSTAT_INCR(arcstat_data_size, size); 2509 atomic_add_64(&arc_size, size); 2510 } 2511 goto out; 2512 } 2513 2514 /* 2515 * If we are prefetching from the mfu ghost list, this buffer 2516 * will end up on the mru list; so steal space from there. 2517 */ 2518 if (state == arc_mfu_ghost) 2519 state = buf->b_hdr->b_flags & ARC_PREFETCH ? 
arc_mru : arc_mfu; 2520 else if (state == arc_mru_ghost) 2521 state = arc_mru; 2522 2523 if (state == arc_mru || state == arc_anon) { 2524 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; 2525 state = (arc_mfu->arcs_lsize[type] >= size && 2526 arc_p > mru_used) ? arc_mfu : arc_mru; 2527 } else { 2528 /* MFU cases */ 2529 uint64_t mfu_space = arc_c - arc_p; 2530 state = (arc_mru->arcs_lsize[type] >= size && 2531 mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; 2532 } 2533 if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) { 2534 if (type == ARC_BUFC_METADATA) { 2535 buf->b_data = zio_buf_alloc(size); 2536 arc_space_consume(size, ARC_SPACE_DATA); 2537 } else { 2538 ASSERT(type == ARC_BUFC_DATA); 2539 buf->b_data = zio_data_buf_alloc(size); 2540 ARCSTAT_INCR(arcstat_data_size, size); 2541 atomic_add_64(&arc_size, size); 2542 } 2543 ARCSTAT_BUMP(arcstat_recycle_miss); 2544 } 2545 ASSERT(buf->b_data != NULL); 2546 out: 2547 /* 2548 * Update the state size. Note that ghost states have a 2549 * "ghost size" and so don't need to be updated. 2550 */ 2551 if (!GHOST_STATE(buf->b_hdr->b_state)) { 2552 arc_buf_hdr_t *hdr = buf->b_hdr; 2553 2554 atomic_add_64(&hdr->b_state->arcs_size, size); 2555 if (list_link_active(&hdr->b_arc_node)) { 2556 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 2557 atomic_add_64(&hdr->b_state->arcs_lsize[type], size); 2558 } 2559 /* 2560 * If we are growing the cache, and we are adding anonymous 2561 * data, and we have outgrown arc_p, update arc_p 2562 */ 2563 if (arc_size < arc_c && hdr->b_state == arc_anon && 2564 arc_anon->arcs_size + arc_mru->arcs_size > arc_p) 2565 arc_p = MIN(arc_c, arc_p + size); 2566 } 2567 } 2568 2569 /* 2570 * This routine is called whenever a buffer is accessed. 2571 * NOTE: the hash lock is dropped in this function. 2572 */ 2573 static void 2574 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) 2575 { 2576 clock_t now; 2577 2578 ASSERT(MUTEX_HELD(hash_lock)); 2579 2580 if (buf->b_state == arc_anon) { 2581 /* 2582 * This buffer is not in the cache, and does not 2583 * appear in our "ghost" list. Add the new buffer 2584 * to the MRU state. 2585 */ 2586 2587 ASSERT(buf->b_arc_access == 0); 2588 buf->b_arc_access = ddi_get_lbolt(); 2589 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 2590 arc_change_state(arc_mru, buf, hash_lock); 2591 2592 } else if (buf->b_state == arc_mru) { 2593 now = ddi_get_lbolt(); 2594 2595 /* 2596 * If this buffer is here because of a prefetch, then either: 2597 * - clear the flag if this is a "referencing" read 2598 * (any subsequent access will bump this into the MFU state). 2599 * or 2600 * - move the buffer to the head of the list if this is 2601 * another prefetch (to make it less likely to be evicted). 2602 */ 2603 if ((buf->b_flags & ARC_PREFETCH) != 0) { 2604 if (refcount_count(&buf->b_refcnt) == 0) { 2605 ASSERT(list_link_active(&buf->b_arc_node)); 2606 } else { 2607 buf->b_flags &= ~ARC_PREFETCH; 2608 ARCSTAT_BUMP(arcstat_mru_hits); 2609 } 2610 buf->b_arc_access = now; 2611 return; 2612 } 2613 2614 /* 2615 * This buffer has been "accessed" only once so far, 2616 * but it is still in the cache. Move it to the MFU 2617 * state. 2618 */ 2619 if (now > buf->b_arc_access + ARC_MINTIME) { 2620 /* 2621 * More than 125ms have passed since we 2622 * instantiated this buffer. Move it to the 2623 * most frequently used state. 
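 * (Accesses that arrive within ARC_MINTIME of the first access are
 * treated as part of the same burst and do not trigger the
 * promotion.)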
2624 */ 2625 buf->b_arc_access = now; 2626 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 2627 arc_change_state(arc_mfu, buf, hash_lock); 2628 } 2629 ARCSTAT_BUMP(arcstat_mru_hits); 2630 } else if (buf->b_state == arc_mru_ghost) { 2631 arc_state_t *new_state; 2632 /* 2633 * This buffer has been "accessed" recently, but 2634 * was evicted from the cache. Move it to the 2635 * MFU state. 2636 */ 2637 2638 if (buf->b_flags & ARC_PREFETCH) { 2639 new_state = arc_mru; 2640 if (refcount_count(&buf->b_refcnt) > 0) 2641 buf->b_flags &= ~ARC_PREFETCH; 2642 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 2643 } else { 2644 new_state = arc_mfu; 2645 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 2646 } 2647 2648 buf->b_arc_access = ddi_get_lbolt(); 2649 arc_change_state(new_state, buf, hash_lock); 2650 2651 ARCSTAT_BUMP(arcstat_mru_ghost_hits); 2652 } else if (buf->b_state == arc_mfu) { 2653 /* 2654 * This buffer has been accessed more than once and is 2655 * still in the cache. Keep it in the MFU state. 2656 * 2657 * NOTE: an add_reference() that occurred when we did 2658 * the arc_read() will have kicked this off the list. 2659 * If it was a prefetch, we will explicitly move it to 2660 * the head of the list now. 2661 */ 2662 if ((buf->b_flags & ARC_PREFETCH) != 0) { 2663 ASSERT(refcount_count(&buf->b_refcnt) == 0); 2664 ASSERT(list_link_active(&buf->b_arc_node)); 2665 } 2666 ARCSTAT_BUMP(arcstat_mfu_hits); 2667 buf->b_arc_access = ddi_get_lbolt(); 2668 } else if (buf->b_state == arc_mfu_ghost) { 2669 arc_state_t *new_state = arc_mfu; 2670 /* 2671 * This buffer has been accessed more than once but has 2672 * been evicted from the cache. Move it back to the 2673 * MFU state. 2674 */ 2675 2676 if (buf->b_flags & ARC_PREFETCH) { 2677 /* 2678 * This is a prefetch access... 2679 * move this block back to the MRU state. 2680 */ 2681 ASSERT0(refcount_count(&buf->b_refcnt)); 2682 new_state = arc_mru; 2683 } 2684 2685 buf->b_arc_access = ddi_get_lbolt(); 2686 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 2687 arc_change_state(new_state, buf, hash_lock); 2688 2689 ARCSTAT_BUMP(arcstat_mfu_ghost_hits); 2690 } else if (buf->b_state == arc_l2c_only) { 2691 /* 2692 * This buffer is on the 2nd Level ARC. 2693 */ 2694 2695 buf->b_arc_access = ddi_get_lbolt(); 2696 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 2697 arc_change_state(arc_mfu, buf, hash_lock); 2698 } else { 2699 ASSERT(!"invalid arc state"); 2700 } 2701 } 2702 2703 /* a generic arc_done_func_t which you can use */ 2704 /* ARGSUSED */ 2705 void 2706 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) 2707 { 2708 if (zio == NULL || zio->io_error == 0) 2709 bcopy(buf->b_data, arg, buf->b_hdr->b_size); 2710 VERIFY(arc_buf_remove_ref(buf, arg)); 2711 } 2712 2713 /* a generic arc_done_func_t */ 2714 void 2715 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) 2716 { 2717 arc_buf_t **bufp = arg; 2718 if (zio && zio->io_error) { 2719 VERIFY(arc_buf_remove_ref(buf, arg)); 2720 *bufp = NULL; 2721 } else { 2722 *bufp = buf; 2723 ASSERT(buf->b_data); 2724 } 2725 } 2726 2727 static void 2728 arc_read_done(zio_t *zio) 2729 { 2730 arc_buf_hdr_t *hdr, *found; 2731 arc_buf_t *buf; 2732 arc_buf_t *abuf; /* buffer we're assigning to callback */ 2733 kmutex_t *hash_lock; 2734 arc_callback_t *callback_list, *acb; 2735 int freeable = FALSE; 2736 2737 buf = zio->io_private; 2738 hdr = buf->b_hdr; 2739 2740 /* 2741 * The hdr was inserted into hash-table and removed from lists 2742 * prior to starting I/O. 
We should find this header, since 2743 * it's in the hash table, and it should be legit since it's 2744 * not possible to evict it during the I/O. The only possible 2745 * reason for it not to be found is if we were freed during the 2746 * read. 2747 */ 2748 found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth, 2749 &hash_lock); 2750 2751 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || 2752 (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || 2753 (found == hdr && HDR_L2_READING(hdr))); 2754 2755 hdr->b_flags &= ~ARC_L2_EVICTED; 2756 if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH)) 2757 hdr->b_flags &= ~ARC_L2CACHE; 2758 2759 /* byteswap if necessary */ 2760 callback_list = hdr->b_acb; 2761 ASSERT(callback_list != NULL); 2762 if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { 2763 dmu_object_byteswap_t bswap = 2764 DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); 2765 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? 2766 byteswap_uint64_array : 2767 dmu_ot_byteswap[bswap].ob_func; 2768 func(buf->b_data, hdr->b_size); 2769 } 2770 2771 arc_cksum_compute(buf, B_FALSE); 2772 arc_buf_watch(buf); 2773 2774 if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) { 2775 /* 2776 * Only call arc_access on anonymous buffers. This is because 2777 * if we've issued an I/O for an evicted buffer, we've already 2778 * called arc_access (to prevent any simultaneous readers from 2779 * getting confused). 2780 */ 2781 arc_access(hdr, hash_lock); 2782 } 2783 2784 /* create copies of the data buffer for the callers */ 2785 abuf = buf; 2786 for (acb = callback_list; acb; acb = acb->acb_next) { 2787 if (acb->acb_done) { 2788 if (abuf == NULL) { 2789 ARCSTAT_BUMP(arcstat_duplicate_reads); 2790 abuf = arc_buf_clone(buf); 2791 } 2792 acb->acb_buf = abuf; 2793 abuf = NULL; 2794 } 2795 } 2796 hdr->b_acb = NULL; 2797 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 2798 ASSERT(!HDR_BUF_AVAILABLE(hdr)); 2799 if (abuf == buf) { 2800 ASSERT(buf->b_efunc == NULL); 2801 ASSERT(hdr->b_datacnt == 1); 2802 hdr->b_flags |= ARC_BUF_AVAILABLE; 2803 } 2804 2805 ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); 2806 2807 if (zio->io_error != 0) { 2808 hdr->b_flags |= ARC_IO_ERROR; 2809 if (hdr->b_state != arc_anon) 2810 arc_change_state(arc_anon, hdr, hash_lock); 2811 if (HDR_IN_HASH_TABLE(hdr)) 2812 buf_hash_remove(hdr); 2813 freeable = refcount_is_zero(&hdr->b_refcnt); 2814 } 2815 2816 /* 2817 * Broadcast before we drop the hash_lock to avoid the possibility 2818 * that the hdr (and hence the cv) might be freed before we get to 2819 * the cv_broadcast(). 2820 */ 2821 cv_broadcast(&hdr->b_cv); 2822 2823 if (hash_lock) { 2824 mutex_exit(hash_lock); 2825 } else { 2826 /* 2827 * This block was freed while we waited for the read to 2828 * complete. It has been removed from the hash table and 2829 * moved to the anonymous state (so that it won't show up 2830 * in the cache). 
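 * With no hash lock to drop, the header is simply destroyed below,
 * once the callbacks have run, provided nothing else still holds a
 * reference to it.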
2831 */ 2832 ASSERT3P(hdr->b_state, ==, arc_anon); 2833 freeable = refcount_is_zero(&hdr->b_refcnt); 2834 } 2835 2836 /* execute each callback and free its structure */ 2837 while ((acb = callback_list) != NULL) { 2838 if (acb->acb_done) 2839 acb->acb_done(zio, acb->acb_buf, acb->acb_private); 2840 2841 if (acb->acb_zio_dummy != NULL) { 2842 acb->acb_zio_dummy->io_error = zio->io_error; 2843 zio_nowait(acb->acb_zio_dummy); 2844 } 2845 2846 callback_list = acb->acb_next; 2847 kmem_free(acb, sizeof (arc_callback_t)); 2848 } 2849 2850 if (freeable) 2851 arc_hdr_destroy(hdr); 2852 } 2853 2854 /* 2855 * "Read" the block at the specified DVA (in bp) via the 2856 * cache. If the block is found in the cache, invoke the provided 2857 * callback immediately and return. Note that the `zio' parameter 2858 * in the callback will be NULL in this case, since no IO was 2859 * required. If the block is not in the cache pass the read request 2860 * on to the spa with a substitute callback function, so that the 2861 * requested block will be added to the cache. 2862 * 2863 * If a read request arrives for a block that has a read in-progress, 2864 * either wait for the in-progress read to complete (and return the 2865 * results); or, if this is a read with a "done" func, add a record 2866 * to the read to invoke the "done" func when the read completes, 2867 * and return; or just return. 2868 * 2869 * arc_read_done() will invoke all the requested "done" functions 2870 * for readers of this block. 2871 */ 2872 int 2873 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, 2874 void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags, 2875 const zbookmark_t *zb) 2876 { 2877 arc_buf_hdr_t *hdr; 2878 arc_buf_t *buf = NULL; 2879 kmutex_t *hash_lock; 2880 zio_t *rzio; 2881 uint64_t guid = spa_load_guid(spa); 2882 2883 top: 2884 hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), 2885 &hash_lock); 2886 if (hdr && hdr->b_datacnt > 0) { 2887 2888 *arc_flags |= ARC_CACHED; 2889 2890 if (HDR_IO_IN_PROGRESS(hdr)) { 2891 2892 if (*arc_flags & ARC_WAIT) { 2893 cv_wait(&hdr->b_cv, hash_lock); 2894 mutex_exit(hash_lock); 2895 goto top; 2896 } 2897 ASSERT(*arc_flags & ARC_NOWAIT); 2898 2899 if (done) { 2900 arc_callback_t *acb = NULL; 2901 2902 acb = kmem_zalloc(sizeof (arc_callback_t), 2903 KM_SLEEP); 2904 acb->acb_done = done; 2905 acb->acb_private = private; 2906 if (pio != NULL) 2907 acb->acb_zio_dummy = zio_null(pio, 2908 spa, NULL, NULL, NULL, zio_flags); 2909 2910 ASSERT(acb->acb_done != NULL); 2911 acb->acb_next = hdr->b_acb; 2912 hdr->b_acb = acb; 2913 add_reference(hdr, hash_lock, private); 2914 mutex_exit(hash_lock); 2915 return (0); 2916 } 2917 mutex_exit(hash_lock); 2918 return (0); 2919 } 2920 2921 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 2922 2923 if (done) { 2924 add_reference(hdr, hash_lock, private); 2925 /* 2926 * If this block is already in use, create a new 2927 * copy of the data so that we will be guaranteed 2928 * that arc_release() will always succeed. 
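 * The clone shares this header but carries its own copy of the
 * data, so each consumer can later release or modify its buffer
 * independently.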
2929 */ 2930 buf = hdr->b_buf; 2931 ASSERT(buf); 2932 ASSERT(buf->b_data); 2933 if (HDR_BUF_AVAILABLE(hdr)) { 2934 ASSERT(buf->b_efunc == NULL); 2935 hdr->b_flags &= ~ARC_BUF_AVAILABLE; 2936 } else { 2937 buf = arc_buf_clone(buf); 2938 } 2939 2940 } else if (*arc_flags & ARC_PREFETCH && 2941 refcount_count(&hdr->b_refcnt) == 0) { 2942 hdr->b_flags |= ARC_PREFETCH; 2943 } 2944 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 2945 arc_access(hdr, hash_lock); 2946 if (*arc_flags & ARC_L2CACHE) 2947 hdr->b_flags |= ARC_L2CACHE; 2948 if (*arc_flags & ARC_L2COMPRESS) 2949 hdr->b_flags |= ARC_L2COMPRESS; 2950 mutex_exit(hash_lock); 2951 ARCSTAT_BUMP(arcstat_hits); 2952 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 2953 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 2954 data, metadata, hits); 2955 2956 if (done) 2957 done(NULL, buf, private); 2958 } else { 2959 uint64_t size = BP_GET_LSIZE(bp); 2960 arc_callback_t *acb; 2961 vdev_t *vd = NULL; 2962 uint64_t addr = 0; 2963 boolean_t devw = B_FALSE; 2964 2965 if (hdr == NULL) { 2966 /* this block is not in the cache */ 2967 arc_buf_hdr_t *exists; 2968 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); 2969 buf = arc_buf_alloc(spa, size, private, type); 2970 hdr = buf->b_hdr; 2971 hdr->b_dva = *BP_IDENTITY(bp); 2972 hdr->b_birth = BP_PHYSICAL_BIRTH(bp); 2973 hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; 2974 exists = buf_hash_insert(hdr, &hash_lock); 2975 if (exists) { 2976 /* somebody beat us to the hash insert */ 2977 mutex_exit(hash_lock); 2978 buf_discard_identity(hdr); 2979 (void) arc_buf_remove_ref(buf, private); 2980 goto top; /* restart the IO request */ 2981 } 2982 /* if this is a prefetch, we don't have a reference */ 2983 if (*arc_flags & ARC_PREFETCH) { 2984 (void) remove_reference(hdr, hash_lock, 2985 private); 2986 hdr->b_flags |= ARC_PREFETCH; 2987 } 2988 if (*arc_flags & ARC_L2CACHE) 2989 hdr->b_flags |= ARC_L2CACHE; 2990 if (*arc_flags & ARC_L2COMPRESS) 2991 hdr->b_flags |= ARC_L2COMPRESS; 2992 if (BP_GET_LEVEL(bp) > 0) 2993 hdr->b_flags |= ARC_INDIRECT; 2994 } else { 2995 /* this block is in the ghost cache */ 2996 ASSERT(GHOST_STATE(hdr->b_state)); 2997 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 2998 ASSERT0(refcount_count(&hdr->b_refcnt)); 2999 ASSERT(hdr->b_buf == NULL); 3000 3001 /* if this is a prefetch, we don't have a reference */ 3002 if (*arc_flags & ARC_PREFETCH) 3003 hdr->b_flags |= ARC_PREFETCH; 3004 else 3005 add_reference(hdr, hash_lock, private); 3006 if (*arc_flags & ARC_L2CACHE) 3007 hdr->b_flags |= ARC_L2CACHE; 3008 if (*arc_flags & ARC_L2COMPRESS) 3009 hdr->b_flags |= ARC_L2COMPRESS; 3010 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 3011 buf->b_hdr = hdr; 3012 buf->b_data = NULL; 3013 buf->b_efunc = NULL; 3014 buf->b_private = NULL; 3015 buf->b_next = NULL; 3016 hdr->b_buf = buf; 3017 ASSERT(hdr->b_datacnt == 0); 3018 hdr->b_datacnt = 1; 3019 arc_get_data_buf(buf); 3020 arc_access(hdr, hash_lock); 3021 } 3022 3023 ASSERT(!GHOST_STATE(hdr->b_state)); 3024 3025 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); 3026 acb->acb_done = done; 3027 acb->acb_private = private; 3028 3029 ASSERT(hdr->b_acb == NULL); 3030 hdr->b_acb = acb; 3031 hdr->b_flags |= ARC_IO_IN_PROGRESS; 3032 3033 if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL && 3034 (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) { 3035 devw = hdr->b_l2hdr->b_dev->l2ad_writing; 3036 addr = hdr->b_l2hdr->b_daddr; 3037 /* 3038 * Lock out device removal. 
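 * We take SCL_L2ARC as reader via spa_config_tryenter(); if the
 * vdev is dead or the lock is contended we fall back to the normal
 * pool read. On an L2ARC read the lock is dropped by
 * l2arc_read_done().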
3039 */ 3040 if (vdev_is_dead(vd) || 3041 !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) 3042 vd = NULL; 3043 } 3044 3045 mutex_exit(hash_lock); 3046 3047 /* 3048 * At this point, we have a level 1 cache miss. Try again in 3049 * L2ARC if possible. 3050 */ 3051 ASSERT3U(hdr->b_size, ==, size); 3052 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, 3053 uint64_t, size, zbookmark_t *, zb); 3054 ARCSTAT_BUMP(arcstat_misses); 3055 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 3056 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 3057 data, metadata, misses); 3058 3059 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { 3060 /* 3061 * Read from the L2ARC if the following are true: 3062 * 1. The L2ARC vdev was previously cached. 3063 * 2. This buffer still has L2ARC metadata. 3064 * 3. This buffer isn't currently writing to the L2ARC. 3065 * 4. The L2ARC entry wasn't evicted, which may 3066 * also have invalidated the vdev. 3067 * 5. This isn't a prefetch with l2arc_noprefetch enabled. 3068 */ 3069 if (hdr->b_l2hdr != NULL && 3070 !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && 3071 !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { 3072 l2arc_read_callback_t *cb; 3073 3074 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); 3075 ARCSTAT_BUMP(arcstat_l2_hits); 3076 3077 cb = kmem_zalloc(sizeof (l2arc_read_callback_t), 3078 KM_SLEEP); 3079 cb->l2rcb_buf = buf; 3080 cb->l2rcb_spa = spa; 3081 cb->l2rcb_bp = *bp; 3082 cb->l2rcb_zb = *zb; 3083 cb->l2rcb_flags = zio_flags; 3084 cb->l2rcb_compress = hdr->b_l2hdr->b_compress; 3085 3086 ASSERT(addr >= VDEV_LABEL_START_SIZE && 3087 addr + size < vd->vdev_psize - 3088 VDEV_LABEL_END_SIZE); 3089 3090 /* 3091 * l2arc read. The SCL_L2ARC lock will be 3092 * released by l2arc_read_done(). 3093 * Issue a null zio if the underlying buffer 3094 * was squashed to zero size by compression.
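 * (b_compress == ZIO_COMPRESS_EMPTY): there is nothing to read from
 * the device, but the null zio still drives l2arc_read_done() so the
 * request completes and the SCL_L2ARC lock is dropped.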
3095 */ 3096 if (hdr->b_l2hdr->b_compress == 3097 ZIO_COMPRESS_EMPTY) { 3098 rzio = zio_null(pio, spa, vd, 3099 l2arc_read_done, cb, 3100 zio_flags | ZIO_FLAG_DONT_CACHE | 3101 ZIO_FLAG_CANFAIL | 3102 ZIO_FLAG_DONT_PROPAGATE | 3103 ZIO_FLAG_DONT_RETRY); 3104 } else { 3105 rzio = zio_read_phys(pio, vd, addr, 3106 hdr->b_l2hdr->b_asize, 3107 buf->b_data, ZIO_CHECKSUM_OFF, 3108 l2arc_read_done, cb, priority, 3109 zio_flags | ZIO_FLAG_DONT_CACHE | 3110 ZIO_FLAG_CANFAIL | 3111 ZIO_FLAG_DONT_PROPAGATE | 3112 ZIO_FLAG_DONT_RETRY, B_FALSE); 3113 } 3114 DTRACE_PROBE2(l2arc__read, vdev_t *, vd, 3115 zio_t *, rzio); 3116 ARCSTAT_INCR(arcstat_l2_read_bytes, 3117 hdr->b_l2hdr->b_asize); 3118 3119 if (*arc_flags & ARC_NOWAIT) { 3120 zio_nowait(rzio); 3121 return (0); 3122 } 3123 3124 ASSERT(*arc_flags & ARC_WAIT); 3125 if (zio_wait(rzio) == 0) 3126 return (0); 3127 3128 /* l2arc read error; goto zio_read() */ 3129 } else { 3130 DTRACE_PROBE1(l2arc__miss, 3131 arc_buf_hdr_t *, hdr); 3132 ARCSTAT_BUMP(arcstat_l2_misses); 3133 if (HDR_L2_WRITING(hdr)) 3134 ARCSTAT_BUMP(arcstat_l2_rw_clash); 3135 spa_config_exit(spa, SCL_L2ARC, vd); 3136 } 3137 } else { 3138 if (vd != NULL) 3139 spa_config_exit(spa, SCL_L2ARC, vd); 3140 if (l2arc_ndev != 0) { 3141 DTRACE_PROBE1(l2arc__miss, 3142 arc_buf_hdr_t *, hdr); 3143 ARCSTAT_BUMP(arcstat_l2_misses); 3144 } 3145 } 3146 3147 rzio = zio_read(pio, spa, bp, buf->b_data, size, 3148 arc_read_done, buf, priority, zio_flags, zb); 3149 3150 if (*arc_flags & ARC_WAIT) 3151 return (zio_wait(rzio)); 3152 3153 ASSERT(*arc_flags & ARC_NOWAIT); 3154 zio_nowait(rzio); 3155 } 3156 return (0); 3157 } 3158 3159 void 3160 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) 3161 { 3162 ASSERT(buf->b_hdr != NULL); 3163 ASSERT(buf->b_hdr->b_state != arc_anon); 3164 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL); 3165 ASSERT(buf->b_efunc == NULL); 3166 ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr)); 3167 3168 buf->b_efunc = func; 3169 buf->b_private = private; 3170 } 3171 3172 /* 3173 * Notify the arc that a block was freed, and thus will never be used again. 3174 */ 3175 void 3176 arc_freed(spa_t *spa, const blkptr_t *bp) 3177 { 3178 arc_buf_hdr_t *hdr; 3179 kmutex_t *hash_lock; 3180 uint64_t guid = spa_load_guid(spa); 3181 3182 hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), 3183 &hash_lock); 3184 if (hdr == NULL) 3185 return; 3186 if (HDR_BUF_AVAILABLE(hdr)) { 3187 arc_buf_t *buf = hdr->b_buf; 3188 add_reference(hdr, hash_lock, FTAG); 3189 hdr->b_flags &= ~ARC_BUF_AVAILABLE; 3190 mutex_exit(hash_lock); 3191 3192 arc_release(buf, FTAG); 3193 (void) arc_buf_remove_ref(buf, FTAG); 3194 } else { 3195 mutex_exit(hash_lock); 3196 } 3197 3198 } 3199 3200 /* 3201 * This is used by the DMU to let the ARC know that a buffer is 3202 * being evicted, so the ARC should clean up. If this arc buf 3203 * is not yet in the evicted state, it will be put there. 3204 */ 3205 int 3206 arc_buf_evict(arc_buf_t *buf) 3207 { 3208 arc_buf_hdr_t *hdr; 3209 kmutex_t *hash_lock; 3210 arc_buf_t **bufp; 3211 3212 mutex_enter(&buf->b_evict_lock); 3213 hdr = buf->b_hdr; 3214 if (hdr == NULL) { 3215 /* 3216 * We are in arc_do_user_evicts(). 3217 */ 3218 ASSERT(buf->b_data == NULL); 3219 mutex_exit(&buf->b_evict_lock); 3220 return (0); 3221 } else if (buf->b_data == NULL) { 3222 arc_buf_t copy = *buf; /* structure assignment */ 3223 /* 3224 * We are on the eviction list; process this buffer now 3225 * but let arc_do_user_evicts() do the reaping. 
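 * The callback is invoked on a stack copy of the buf, since
 * arc_do_user_evicts() may free the real structure as soon as
 * b_evict_lock is dropped.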
3226 */ 3227 buf->b_efunc = NULL; 3228 mutex_exit(&buf->b_evict_lock); 3229 VERIFY(copy.b_efunc(&copy) == 0); 3230 return (1); 3231 } 3232 hash_lock = HDR_LOCK(hdr); 3233 mutex_enter(hash_lock); 3234 hdr = buf->b_hdr; 3235 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 3236 3237 ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt); 3238 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 3239 3240 /* 3241 * Pull this buffer off of the hdr 3242 */ 3243 bufp = &hdr->b_buf; 3244 while (*bufp != buf) 3245 bufp = &(*bufp)->b_next; 3246 *bufp = buf->b_next; 3247 3248 ASSERT(buf->b_data != NULL); 3249 arc_buf_destroy(buf, FALSE, FALSE); 3250 3251 if (hdr->b_datacnt == 0) { 3252 arc_state_t *old_state = hdr->b_state; 3253 arc_state_t *evicted_state; 3254 3255 ASSERT(hdr->b_buf == NULL); 3256 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 3257 3258 evicted_state = 3259 (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; 3260 3261 mutex_enter(&old_state->arcs_mtx); 3262 mutex_enter(&evicted_state->arcs_mtx); 3263 3264 arc_change_state(evicted_state, hdr, hash_lock); 3265 ASSERT(HDR_IN_HASH_TABLE(hdr)); 3266 hdr->b_flags |= ARC_IN_HASH_TABLE; 3267 hdr->b_flags &= ~ARC_BUF_AVAILABLE; 3268 3269 mutex_exit(&evicted_state->arcs_mtx); 3270 mutex_exit(&old_state->arcs_mtx); 3271 } 3272 mutex_exit(hash_lock); 3273 mutex_exit(&buf->b_evict_lock); 3274 3275 VERIFY(buf->b_efunc(buf) == 0); 3276 buf->b_efunc = NULL; 3277 buf->b_private = NULL; 3278 buf->b_hdr = NULL; 3279 buf->b_next = NULL; 3280 kmem_cache_free(buf_cache, buf); 3281 return (1); 3282 } 3283 3284 /* 3285 * Release this buffer from the cache, making it an anonymous buffer. This 3286 * must be done after a read and prior to modifying the buffer contents. 3287 * If the buffer has more than one reference, we must make 3288 * a new hdr for the buffer. 3289 */ 3290 void 3291 arc_release(arc_buf_t *buf, void *tag) 3292 { 3293 arc_buf_hdr_t *hdr; 3294 kmutex_t *hash_lock = NULL; 3295 l2arc_buf_hdr_t *l2hdr; 3296 uint64_t buf_size; 3297 3298 /* 3299 * It would be nice to assert that if it's DMU metadata (level > 3300 * 0 || it's the dnode file), then it must be syncing context. 3301 * But we don't know that information at this level. 3302 */ 3303 3304 mutex_enter(&buf->b_evict_lock); 3305 hdr = buf->b_hdr; 3306 3307 /* this buffer is not on any list */ 3308 ASSERT(refcount_count(&hdr->b_refcnt) > 0); 3309 3310 if (hdr->b_state == arc_anon) { 3311 /* this buffer is already released */ 3312 ASSERT(buf->b_efunc == NULL); 3313 } else { 3314 hash_lock = HDR_LOCK(hdr); 3315 mutex_enter(hash_lock); 3316 hdr = buf->b_hdr; 3317 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 3318 } 3319 3320 l2hdr = hdr->b_l2hdr; 3321 if (l2hdr) { 3322 mutex_enter(&l2arc_buflist_mtx); 3323 hdr->b_l2hdr = NULL; 3324 } 3325 buf_size = hdr->b_size; 3326 3327 /* 3328 * Do we have more than one buf? 3329 */ 3330 if (hdr->b_datacnt > 1) { 3331 arc_buf_hdr_t *nhdr; 3332 arc_buf_t **bufp; 3333 uint64_t blksz = hdr->b_size; 3334 uint64_t spa = hdr->b_spa; 3335 arc_buf_contents_t type = hdr->b_type; 3336 uint32_t flags = hdr->b_flags; 3337 3338 ASSERT(hdr->b_buf != buf || buf->b_next != NULL); 3339 /* 3340 * Pull the data off of this hdr and attach it to 3341 * a new anonymous hdr.
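 * The existing header keeps its identity and any remaining buffers;
 * only this buffer moves to the fresh anonymous header, so other
 * holders of the block are unaffected.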
3342 */ 3343 (void) remove_reference(hdr, hash_lock, tag); 3344 bufp = &hdr->b_buf; 3345 while (*bufp != buf) 3346 bufp = &(*bufp)->b_next; 3347 *bufp = buf->b_next; 3348 buf->b_next = NULL; 3349 3350 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size); 3351 atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size); 3352 if (refcount_is_zero(&hdr->b_refcnt)) { 3353 uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type]; 3354 ASSERT3U(*size, >=, hdr->b_size); 3355 atomic_add_64(size, -hdr->b_size); 3356 } 3357 3358 /* 3359 * We're releasing a duplicate user data buffer, update 3360 * our statistics accordingly. 3361 */ 3362 if (hdr->b_type == ARC_BUFC_DATA) { 3363 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); 3364 ARCSTAT_INCR(arcstat_duplicate_buffers_size, 3365 -hdr->b_size); 3366 } 3367 hdr->b_datacnt -= 1; 3368 arc_cksum_verify(buf); 3369 arc_buf_unwatch(buf); 3370 3371 mutex_exit(hash_lock); 3372 3373 nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); 3374 nhdr->b_size = blksz; 3375 nhdr->b_spa = spa; 3376 nhdr->b_type = type; 3377 nhdr->b_buf = buf; 3378 nhdr->b_state = arc_anon; 3379 nhdr->b_arc_access = 0; 3380 nhdr->b_flags = flags & ARC_L2_WRITING; 3381 nhdr->b_l2hdr = NULL; 3382 nhdr->b_datacnt = 1; 3383 nhdr->b_freeze_cksum = NULL; 3384 (void) refcount_add(&nhdr->b_refcnt, tag); 3385 buf->b_hdr = nhdr; 3386 mutex_exit(&buf->b_evict_lock); 3387 atomic_add_64(&arc_anon->arcs_size, blksz); 3388 } else { 3389 mutex_exit(&buf->b_evict_lock); 3390 ASSERT(refcount_count(&hdr->b_refcnt) == 1); 3391 ASSERT(!list_link_active(&hdr->b_arc_node)); 3392 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 3393 if (hdr->b_state != arc_anon) 3394 arc_change_state(arc_anon, hdr, hash_lock); 3395 hdr->b_arc_access = 0; 3396 if (hash_lock) 3397 mutex_exit(hash_lock); 3398 3399 buf_discard_identity(hdr); 3400 arc_buf_thaw(buf); 3401 } 3402 buf->b_efunc = NULL; 3403 buf->b_private = NULL; 3404 3405 if (l2hdr) { 3406 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); 3407 list_remove(l2hdr->b_dev->l2ad_buflist, hdr); 3408 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); 3409 ARCSTAT_INCR(arcstat_l2_size, -buf_size); 3410 mutex_exit(&l2arc_buflist_mtx); 3411 } 3412 } 3413 3414 int 3415 arc_released(arc_buf_t *buf) 3416 { 3417 int released; 3418 3419 mutex_enter(&buf->b_evict_lock); 3420 released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); 3421 mutex_exit(&buf->b_evict_lock); 3422 return (released); 3423 } 3424 3425 int 3426 arc_has_callback(arc_buf_t *buf) 3427 { 3428 int callback; 3429 3430 mutex_enter(&buf->b_evict_lock); 3431 callback = (buf->b_efunc != NULL); 3432 mutex_exit(&buf->b_evict_lock); 3433 return (callback); 3434 } 3435 3436 #ifdef ZFS_DEBUG 3437 int 3438 arc_referenced(arc_buf_t *buf) 3439 { 3440 int referenced; 3441 3442 mutex_enter(&buf->b_evict_lock); 3443 referenced = (refcount_count(&buf->b_hdr->b_refcnt)); 3444 mutex_exit(&buf->b_evict_lock); 3445 return (referenced); 3446 } 3447 #endif 3448 3449 static void 3450 arc_write_ready(zio_t *zio) 3451 { 3452 arc_write_callback_t *callback = zio->io_private; 3453 arc_buf_t *buf = callback->awcb_buf; 3454 arc_buf_hdr_t *hdr = buf->b_hdr; 3455 3456 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt)); 3457 callback->awcb_ready(zio, buf, callback->awcb_private); 3458 3459 /* 3460 * If the IO is already in progress, then this is a re-write 3461 * attempt, so we need to thaw and re-compute the cksum. 3462 * It is the responsibility of the callback to handle the 3463 * accounting for any re-write attempt. 
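 * The stale freeze checksum is discarded under b_freeze_lock below
 * and a fresh one is computed before the write proceeds.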
3464 */ 3465 if (HDR_IO_IN_PROGRESS(hdr)) { 3466 mutex_enter(&hdr->b_freeze_lock); 3467 if (hdr->b_freeze_cksum != NULL) { 3468 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 3469 hdr->b_freeze_cksum = NULL; 3470 } 3471 mutex_exit(&hdr->b_freeze_lock); 3472 } 3473 arc_cksum_compute(buf, B_FALSE); 3474 hdr->b_flags |= ARC_IO_IN_PROGRESS; 3475 } 3476 3477 /* 3478 * The SPA calls this callback for each physical write that happens on behalf 3479 * of a logical write. See the comment in dbuf_write_physdone() for details. 3480 */ 3481 static void 3482 arc_write_physdone(zio_t *zio) 3483 { 3484 arc_write_callback_t *cb = zio->io_private; 3485 if (cb->awcb_physdone != NULL) 3486 cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); 3487 } 3488 3489 static void 3490 arc_write_done(zio_t *zio) 3491 { 3492 arc_write_callback_t *callback = zio->io_private; 3493 arc_buf_t *buf = callback->awcb_buf; 3494 arc_buf_hdr_t *hdr = buf->b_hdr; 3495 3496 ASSERT(hdr->b_acb == NULL); 3497 3498 if (zio->io_error == 0) { 3499 hdr->b_dva = *BP_IDENTITY(zio->io_bp); 3500 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); 3501 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; 3502 } else { 3503 ASSERT(BUF_EMPTY(hdr)); 3504 } 3505 3506 /* 3507 * If the block to be written was all-zero, we may have 3508 * compressed it away. In this case no write was performed 3509 * so there will be no dva/birth/checksum. The buffer must 3510 * therefore remain anonymous (and uncached). 3511 */ 3512 if (!BUF_EMPTY(hdr)) { 3513 arc_buf_hdr_t *exists; 3514 kmutex_t *hash_lock; 3515 3516 ASSERT(zio->io_error == 0); 3517 3518 arc_cksum_verify(buf); 3519 3520 exists = buf_hash_insert(hdr, &hash_lock); 3521 if (exists) { 3522 /* 3523 * This can only happen if we overwrite for 3524 * sync-to-convergence, because we remove 3525 * buffers from the hash table when we arc_free(). 
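 * Three kinds of collision are legitimate and handled below: a
 * sync-to-convergence rewrite, a nopwrite hit and a dedup'd write;
 * mismatched block pointers in the first two cases are fatal.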
3526 */ 3527 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { 3528 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 3529 panic("bad overwrite, hdr=%p exists=%p", 3530 (void *)hdr, (void *)exists); 3531 ASSERT(refcount_is_zero(&exists->b_refcnt)); 3532 arc_change_state(arc_anon, exists, hash_lock); 3533 mutex_exit(hash_lock); 3534 arc_hdr_destroy(exists); 3535 exists = buf_hash_insert(hdr, &hash_lock); 3536 ASSERT3P(exists, ==, NULL); 3537 } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { 3538 /* nopwrite */ 3539 ASSERT(zio->io_prop.zp_nopwrite); 3540 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 3541 panic("bad nopwrite, hdr=%p exists=%p", 3542 (void *)hdr, (void *)exists); 3543 } else { 3544 /* Dedup */ 3545 ASSERT(hdr->b_datacnt == 1); 3546 ASSERT(hdr->b_state == arc_anon); 3547 ASSERT(BP_GET_DEDUP(zio->io_bp)); 3548 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); 3549 } 3550 } 3551 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 3552 /* if it's not anon, we are doing a scrub */ 3553 if (!exists && hdr->b_state == arc_anon) 3554 arc_access(hdr, hash_lock); 3555 mutex_exit(hash_lock); 3556 } else { 3557 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 3558 } 3559 3560 ASSERT(!refcount_is_zero(&hdr->b_refcnt)); 3561 callback->awcb_done(zio, buf, callback->awcb_private); 3562 3563 kmem_free(callback, sizeof (arc_write_callback_t)); 3564 } 3565 3566 zio_t * 3567 arc_write(zio_t *pio, spa_t *spa, uint64_t txg, 3568 blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, 3569 const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone, 3570 arc_done_func_t *done, void *private, zio_priority_t priority, 3571 int zio_flags, const zbookmark_t *zb) 3572 { 3573 arc_buf_hdr_t *hdr = buf->b_hdr; 3574 arc_write_callback_t *callback; 3575 zio_t *zio; 3576 3577 ASSERT(ready != NULL); 3578 ASSERT(done != NULL); 3579 ASSERT(!HDR_IO_ERROR(hdr)); 3580 ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0); 3581 ASSERT(hdr->b_acb == NULL); 3582 if (l2arc) 3583 hdr->b_flags |= ARC_L2CACHE; 3584 if (l2arc_compress) 3585 hdr->b_flags |= ARC_L2COMPRESS; 3586 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); 3587 callback->awcb_ready = ready; 3588 callback->awcb_physdone = physdone; 3589 callback->awcb_done = done; 3590 callback->awcb_private = private; 3591 callback->awcb_buf = buf; 3592 3593 zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp, 3594 arc_write_ready, arc_write_physdone, arc_write_done, callback, 3595 priority, zio_flags, zb); 3596 3597 return (zio); 3598 } 3599 3600 static int 3601 arc_memory_throttle(uint64_t reserve, uint64_t txg) 3602 { 3603 #ifdef _KERNEL 3604 uint64_t available_memory = ptob(freemem); 3605 static uint64_t page_load = 0; 3606 static uint64_t last_txg = 0; 3607 3608 #if defined(__i386) 3609 available_memory = 3610 MIN(available_memory, vmem_size(heap_arena, VMEM_FREE)); 3611 #endif 3612 3613 if (freemem > physmem * arc_lotsfree_percent / 100) 3614 return (0); 3615 3616 if (txg > last_txg) { 3617 last_txg = txg; 3618 page_load = 0; 3619 } 3620 /* 3621 * If we are in pageout, we know that memory is already tight, 3622 * the arc is already going to be evicting, so we just want to 3623 * continue to let page writes occur as quickly as possible. 
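 * Concretely, pageout is only throttled (ERESTART) once page_load
 * exceeds a quarter of MAX(ptob(minfree), available_memory); each
 * reservation adds reserve / 8 to page_load, since the reserve
 * passed in is inflated.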
3624 */ 3625 if (curproc == proc_pageout) { 3626 if (page_load > MAX(ptob(minfree), available_memory) / 4) 3627 return (SET_ERROR(ERESTART)); 3628 /* Note: reserve is inflated, so we deflate */ 3629 page_load += reserve / 8; 3630 return (0); 3631 } else if (page_load > 0 && arc_reclaim_needed()) { 3632 /* memory is low, delay before restarting */ 3633 ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 3634 return (SET_ERROR(EAGAIN)); 3635 } 3636 page_load = 0; 3637 #endif 3638 return (0); 3639 } 3640 3641 void 3642 arc_tempreserve_clear(uint64_t reserve) 3643 { 3644 atomic_add_64(&arc_tempreserve, -reserve); 3645 ASSERT((int64_t)arc_tempreserve >= 0); 3646 } 3647 3648 int 3649 arc_tempreserve_space(uint64_t reserve, uint64_t txg) 3650 { 3651 int error; 3652 uint64_t anon_size; 3653 3654 if (reserve > arc_c/4 && !arc_no_grow) 3655 arc_c = MIN(arc_c_max, reserve * 4); 3656 if (reserve > arc_c) 3657 return (SET_ERROR(ENOMEM)); 3658 3659 /* 3660 * Don't count loaned bufs as in flight dirty data to prevent long 3661 * network delays from blocking transactions that are ready to be 3662 * assigned to a txg. 3663 */ 3664 anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0); 3665 3666 /* 3667 * Writes will, almost always, require additional memory allocations 3668 * in order to compress/encrypt/etc the data. We therefore need to 3669 * make sure that there is sufficient available memory for this. 3670 */ 3671 error = arc_memory_throttle(reserve, txg); 3672 if (error != 0) 3673 return (error); 3674 3675 /* 3676 * Throttle writes when the amount of dirty data in the cache 3677 * gets too large. We try to keep the cache less than half full 3678 * of dirty blocks so that our sync times don't grow too large. 3679 * Note: if two requests come in concurrently, we might let them 3680 * both succeed, when one of them should fail. Not a huge deal. 3681 */ 3682 3683 if (reserve + arc_tempreserve + anon_size > arc_c / 2 && 3684 anon_size > arc_c / 4) { 3685 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " 3686 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", 3687 arc_tempreserve>>10, 3688 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, 3689 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, 3690 reserve>>10, arc_c>>10); 3691 return (SET_ERROR(ERESTART)); 3692 } 3693 atomic_add_64(&arc_tempreserve, reserve); 3694 return (0); 3695 } 3696 3697 void 3698 arc_init(void) 3699 { 3700 mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); 3701 cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); 3702 3703 /* Convert seconds to clock ticks */ 3704 arc_min_prefetch_lifespan = 1 * hz; 3705 3706 /* Start out with 1/8 of all memory */ 3707 arc_c = physmem * PAGESIZE / 8; 3708 3709 #ifdef _KERNEL 3710 /* 3711 * On architectures where the physical memory can be larger 3712 * than the addressable space (intel in 32-bit mode), we may 3713 * need to limit the cache to 1/8 of VM size. 3714 */ 3715 arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); 3716 #endif 3717 3718 /* set min cache to 1/32 of all memory, or 64MB, whichever is more */ 3719 arc_c_min = MAX(arc_c / 4, 64<<20); 3720 /* set max to 3/4 of all memory, or all but 1GB, whichever is more */ 3721 if (arc_c * 8 >= 1<<30) 3722 arc_c_max = (arc_c * 8) - (1<<30); 3723 else 3724 arc_c_max = arc_c_min; 3725 arc_c_max = MAX(arc_c * 6, arc_c_max); 3726 3727 /* 3728 * Allow the tunables to override our calculations if they are 3729 * reasonable (ie. 
over 64MB) 3730 */ 3731 if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE) 3732 arc_c_max = zfs_arc_max; 3733 if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max) 3734 arc_c_min = zfs_arc_min; 3735 3736 arc_c = arc_c_max; 3737 arc_p = (arc_c >> 1); 3738 3739 /* limit meta-data to 1/4 of the arc capacity */ 3740 arc_meta_limit = arc_c_max / 4; 3741 3742 /* Allow the tunable to override if it is reasonable */ 3743 if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) 3744 arc_meta_limit = zfs_arc_meta_limit; 3745 3746 if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) 3747 arc_c_min = arc_meta_limit / 2; 3748 3749 if (zfs_arc_grow_retry > 0) 3750 arc_grow_retry = zfs_arc_grow_retry; 3751 3752 if (zfs_arc_shrink_shift > 0) 3753 arc_shrink_shift = zfs_arc_shrink_shift; 3754 3755 if (zfs_arc_p_min_shift > 0) 3756 arc_p_min_shift = zfs_arc_p_min_shift; 3757 3758 /* if kmem_flags are set, lets try to use less memory */ 3759 if (kmem_debugging()) 3760 arc_c = arc_c / 2; 3761 if (arc_c < arc_c_min) 3762 arc_c = arc_c_min; 3763 3764 arc_anon = &ARC_anon; 3765 arc_mru = &ARC_mru; 3766 arc_mru_ghost = &ARC_mru_ghost; 3767 arc_mfu = &ARC_mfu; 3768 arc_mfu_ghost = &ARC_mfu_ghost; 3769 arc_l2c_only = &ARC_l2c_only; 3770 arc_size = 0; 3771 3772 mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 3773 mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 3774 mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 3775 mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 3776 mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 3777 mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 3778 3779 list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], 3780 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3781 list_create(&arc_mru->arcs_list[ARC_BUFC_DATA], 3782 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3783 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], 3784 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3785 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], 3786 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3787 list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], 3788 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3789 list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], 3790 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3791 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], 3792 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3793 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], 3794 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3795 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], 3796 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3797 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], 3798 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3799 3800 buf_init(); 3801 3802 arc_thread_exit = 0; 3803 arc_eviction_list = NULL; 3804 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); 3805 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); 3806 3807 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, 3808 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 3809 3810 if (arc_ksp != NULL) { 3811 arc_ksp->ks_data = &arc_stats; 3812 kstat_install(arc_ksp); 3813 } 3814 3815 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, 3816 TS_RUN, minclsyspri); 3817 3818 arc_dead = FALSE; 3819 arc_warm = B_FALSE; 3820 3821 
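/*
 * arc_warm starts out B_FALSE; while it remains so, the L2ARC feed
 * path treats the cache as still warming up: candidate buffers are
 * taken from the head of the ARC lists and the per-interval write
 * size is boosted (see l2arc_write_size() and l2arc_write_buffers()).
 */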
/* 3822 * Calculate maximum amount of dirty data per pool. 3823 * 3824 * If it has been set by /etc/system, take that. 3825 * Otherwise, use a percentage of physical memory defined by 3826 * zfs_dirty_data_max_percent (default 10%) with a cap at 3827 * zfs_dirty_data_max_max (default 4GB). 3828 */ 3829 if (zfs_dirty_data_max == 0) { 3830 zfs_dirty_data_max = physmem * PAGESIZE * 3831 zfs_dirty_data_max_percent / 100; 3832 zfs_dirty_data_max = MIN(zfs_dirty_data_max, 3833 zfs_dirty_data_max_max); 3834 } 3835 } 3836 3837 void 3838 arc_fini(void) 3839 { 3840 mutex_enter(&arc_reclaim_thr_lock); 3841 arc_thread_exit = 1; 3842 while (arc_thread_exit != 0) 3843 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); 3844 mutex_exit(&arc_reclaim_thr_lock); 3845 3846 arc_flush(NULL); 3847 3848 arc_dead = TRUE; 3849 3850 if (arc_ksp != NULL) { 3851 kstat_delete(arc_ksp); 3852 arc_ksp = NULL; 3853 } 3854 3855 mutex_destroy(&arc_eviction_mtx); 3856 mutex_destroy(&arc_reclaim_thr_lock); 3857 cv_destroy(&arc_reclaim_thr_cv); 3858 3859 list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); 3860 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); 3861 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); 3862 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); 3863 list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); 3864 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); 3865 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); 3866 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); 3867 3868 mutex_destroy(&arc_anon->arcs_mtx); 3869 mutex_destroy(&arc_mru->arcs_mtx); 3870 mutex_destroy(&arc_mru_ghost->arcs_mtx); 3871 mutex_destroy(&arc_mfu->arcs_mtx); 3872 mutex_destroy(&arc_mfu_ghost->arcs_mtx); 3873 mutex_destroy(&arc_l2c_only->arcs_mtx); 3874 3875 buf_fini(); 3876 3877 ASSERT(arc_loaned_bytes == 0); 3878 } 3879 3880 /* 3881 * Level 2 ARC 3882 * 3883 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. 3884 * It uses dedicated storage devices to hold cached data, which are populated 3885 * using large infrequent writes. The main role of this cache is to boost 3886 * the performance of random read workloads. The intended L2ARC devices 3887 * include short-stroked disks, solid state disks, and other media with 3888 * substantially faster read latency than disk. 3889 * 3890 * +-----------------------+ 3891 * | ARC | 3892 * +-----------------------+ 3893 * | ^ ^ 3894 * | | | 3895 * l2arc_feed_thread() arc_read() 3896 * | | | 3897 * | l2arc read | 3898 * V | | 3899 * +---------------+ | 3900 * | L2ARC | | 3901 * +---------------+ | 3902 * | ^ | 3903 * l2arc_write() | | 3904 * | | | 3905 * V | | 3906 * +-------+ +-------+ 3907 * | vdev | | vdev | 3908 * | cache | | cache | 3909 * +-------+ +-------+ 3910 * +=========+ .-----. 3911 * : L2ARC : |-_____-| 3912 * : devices : | Disks | 3913 * +=========+ `-_____-' 3914 * 3915 * Read requests are satisfied from the following sources, in order: 3916 * 3917 * 1) ARC 3918 * 2) vdev cache of L2ARC devices 3919 * 3) L2ARC devices 3920 * 4) vdev cache of disks 3921 * 5) disks 3922 * 3923 * Some L2ARC device types exhibit extremely slow write performance. 3924 * To accommodate for this there are some significant differences between 3925 * the L2ARC and traditional cache design: 3926 * 3927 * 1. There is no eviction path from the ARC to the L2ARC. Evictions from 3928 * the ARC behave as usual, freeing buffers and placing headers on ghost 3929 * lists. 
The ARC does not send buffers to the L2ARC during eviction as 3930 * this would add inflated write latencies for all ARC memory pressure. 3931 * 3932 * 2. The L2ARC attempts to cache data from the ARC before it is evicted. 3933 * It does this by periodically scanning buffers from the eviction-end of 3934 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are 3935 * not already there. It scans until a headroom of buffers is satisfied, 3936 * which itself is a buffer for ARC eviction. If a compressible buffer is 3937 * found during scanning and selected for writing to an L2ARC device, we 3938 * temporarily boost scanning headroom during the next scan cycle to make 3939 * sure we adapt to compression effects (which might significantly reduce 3940 * the data volume we write to L2ARC). The thread that does this is 3941 * l2arc_feed_thread(), illustrated below; example sizes are included to 3942 * provide a better sense of ratio than this diagram: 3943 * 3944 * head --> tail 3945 * +---------------------+----------+ 3946 * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC 3947 * +---------------------+----------+ | o L2ARC eligible 3948 * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer 3949 * +---------------------+----------+ | 3950 * 15.9 Gbytes ^ 32 Mbytes | 3951 * headroom | 3952 * l2arc_feed_thread() 3953 * | 3954 * l2arc write hand <--[oooo]--' 3955 * | 8 Mbyte 3956 * | write max 3957 * V 3958 * +==============================+ 3959 * L2ARC dev |####|#|###|###| |####| ... | 3960 * +==============================+ 3961 * 32 Gbytes 3962 * 3963 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of 3964 * evicted, then the L2ARC has cached a buffer much sooner than it probably 3965 * needed to, potentially wasting L2ARC device bandwidth and storage. It is 3966 * safe to say that this is an uncommon case, since buffers at the end of 3967 * the ARC lists have moved there due to inactivity. 3968 * 3969 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, 3970 * then the L2ARC simply misses copying some buffers. This serves as a 3971 * pressure valve to prevent heavy read workloads from both stalling the ARC 3972 * with waits and clogging the L2ARC with writes. This also helps prevent 3973 * the potential for the L2ARC to churn if it attempts to cache content too 3974 * quickly, such as during backups of the entire pool. 3975 * 3976 * 5. After system boot and before the ARC has filled main memory, there are 3977 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru 3978 * lists can remain mostly static. Instead of searching from tail of these 3979 * lists as pictured, the l2arc_feed_thread() will search from the list heads 3980 * for eligible buffers, greatly increasing its chance of finding them. 3981 * 3982 * The L2ARC device write speed is also boosted during this time so that 3983 * the L2ARC warms up faster. Since there have been no ARC evictions yet, 3984 * there are no L2ARC reads, and no fear of degrading read performance 3985 * through increased writes. 3986 * 3987 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that 3988 * the vdev queue can aggregate them into larger and fewer writes. Each 3989 * device is written to in a rotor fashion, sweeping writes through 3990 * available space then repeating. 3991 * 3992 * 7. The L2ARC does not store dirty content. It never needs to flush 3993 * write buffers back to disk based storage. 3994 * 3995 * 8. 
If an ARC buffer is written (and dirtied) which also exists in the 3996 * L2ARC, the now stale L2ARC buffer is immediately dropped. 3997 * 3998 * The performance of the L2ARC can be tweaked by a number of tunables, which 3999 * may be necessary for different workloads: 4000 * 4001 * l2arc_write_max max write bytes per interval 4002 * l2arc_write_boost extra write bytes during device warmup 4003 * l2arc_noprefetch skip caching prefetched buffers 4004 * l2arc_headroom number of max device writes to precache 4005 * l2arc_headroom_boost when we find compressed buffers during ARC 4006 * scanning, we multiply headroom by this 4007 * percentage factor for the next scan cycle, 4008 * since more compressed buffers are likely to 4009 * be present 4010 * l2arc_feed_secs seconds between L2ARC writing 4011 * 4012 * Tunables may be removed or added as future performance improvements are 4013 * integrated, and also may become zpool properties. 4014 * 4015 * There are three key functions that control how the L2ARC warms up: 4016 * 4017 * l2arc_write_eligible() check if a buffer is eligible to cache 4018 * l2arc_write_size() calculate how much to write 4019 * l2arc_write_interval() calculate sleep delay between writes 4020 * 4021 * These three functions determine what to write, how much, and how quickly 4022 * to send writes. 4023 */ 4024 4025 static boolean_t 4026 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab) 4027 { 4028 /* 4029 * A buffer is *not* eligible for the L2ARC if it: 4030 * 1. belongs to a different spa. 4031 * 2. is already cached on the L2ARC. 4032 * 3. has an I/O in progress (it may be an incomplete read). 4033 * 4. is flagged not eligible (zfs property). 4034 */ 4035 if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL || 4036 HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab)) 4037 return (B_FALSE); 4038 4039 return (B_TRUE); 4040 } 4041 4042 static uint64_t 4043 l2arc_write_size(void) 4044 { 4045 uint64_t size; 4046 4047 /* 4048 * Make sure our globals have meaningful values in case the user 4049 * altered them. 4050 */ 4051 size = l2arc_write_max; 4052 if (size == 0) { 4053 cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " 4054 "be greater than zero, resetting it to the default (%d)", 4055 L2ARC_WRITE_SIZE); 4056 size = l2arc_write_max = L2ARC_WRITE_SIZE; 4057 } 4058 4059 if (arc_warm == B_FALSE) 4060 size += l2arc_write_boost; 4061 4062 return (size); 4063 4064 } 4065 4066 static clock_t 4067 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) 4068 { 4069 clock_t interval, next, now; 4070 4071 /* 4072 * If the ARC lists are busy, increase our write rate; if the 4073 * lists are stale, idle back. This is achieved by checking 4074 * how much we previously wrote - if it was more than half of 4075 * what we wanted, schedule the next write much sooner. 4076 */ 4077 if (l2arc_feed_again && wrote > (wanted / 2)) 4078 interval = (hz * l2arc_feed_min_ms) / 1000; 4079 else 4080 interval = hz * l2arc_feed_secs; 4081 4082 now = ddi_get_lbolt(); 4083 next = MAX(now, MIN(now + interval, began + interval)); 4084 4085 return (next); 4086 } 4087 4088 static void 4089 l2arc_hdr_stat_add(void) 4090 { 4091 ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE); 4092 ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); 4093 } 4094 4095 static void 4096 l2arc_hdr_stat_remove(void) 4097 { 4098 ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE)); 4099 ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); 4100 } 4101 4102 /* 4103 * Cycle through L2ARC devices. This is how L2ARC load balances. 
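 * Devices are walked as a simple rotor: the search resumes from the
 * device used last time (l2arc_dev_last) and wraps around the list,
 * skipping faulted vdevs.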
4104 * If a device is returned, this also returns holding the spa config lock. 4105 */ 4106 static l2arc_dev_t * 4107 l2arc_dev_get_next(void) 4108 { 4109 l2arc_dev_t *first, *next = NULL; 4110 4111 /* 4112 * Lock out the removal of spas (spa_namespace_lock), then removal 4113 * of cache devices (l2arc_dev_mtx). Once a device has been selected, 4114 * both locks will be dropped and a spa config lock held instead. 4115 */ 4116 mutex_enter(&spa_namespace_lock); 4117 mutex_enter(&l2arc_dev_mtx); 4118 4119 /* if there are no vdevs, there is nothing to do */ 4120 if (l2arc_ndev == 0) 4121 goto out; 4122 4123 first = NULL; 4124 next = l2arc_dev_last; 4125 do { 4126 /* loop around the list looking for a non-faulted vdev */ 4127 if (next == NULL) { 4128 next = list_head(l2arc_dev_list); 4129 } else { 4130 next = list_next(l2arc_dev_list, next); 4131 if (next == NULL) 4132 next = list_head(l2arc_dev_list); 4133 } 4134 4135 /* if we have come back to the start, bail out */ 4136 if (first == NULL) 4137 first = next; 4138 else if (next == first) 4139 break; 4140 4141 } while (vdev_is_dead(next->l2ad_vdev)); 4142 4143 /* if we were unable to find any usable vdevs, return NULL */ 4144 if (vdev_is_dead(next->l2ad_vdev)) 4145 next = NULL; 4146 4147 l2arc_dev_last = next; 4148 4149 out: 4150 mutex_exit(&l2arc_dev_mtx); 4151 4152 /* 4153 * Grab the config lock to prevent the 'next' device from being 4154 * removed while we are writing to it. 4155 */ 4156 if (next != NULL) 4157 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); 4158 mutex_exit(&spa_namespace_lock); 4159 4160 return (next); 4161 } 4162 4163 /* 4164 * Free buffers that were tagged for destruction. 4165 */ 4166 static void 4167 l2arc_do_free_on_write() 4168 { 4169 list_t *buflist; 4170 l2arc_data_free_t *df, *df_prev; 4171 4172 mutex_enter(&l2arc_free_on_write_mtx); 4173 buflist = l2arc_free_on_write; 4174 4175 for (df = list_tail(buflist); df; df = df_prev) { 4176 df_prev = list_prev(buflist, df); 4177 ASSERT(df->l2df_data != NULL); 4178 ASSERT(df->l2df_func != NULL); 4179 df->l2df_func(df->l2df_data, df->l2df_size); 4180 list_remove(buflist, df); 4181 kmem_free(df, sizeof (l2arc_data_free_t)); 4182 } 4183 4184 mutex_exit(&l2arc_free_on_write_mtx); 4185 } 4186 4187 /* 4188 * A write to a cache device has completed. Update all headers to allow 4189 * reads from these buffers to begin. 4190 */ 4191 static void 4192 l2arc_write_done(zio_t *zio) 4193 { 4194 l2arc_write_callback_t *cb; 4195 l2arc_dev_t *dev; 4196 list_t *buflist; 4197 arc_buf_hdr_t *head, *ab, *ab_prev; 4198 l2arc_buf_hdr_t *abl2; 4199 kmutex_t *hash_lock; 4200 4201 cb = zio->io_private; 4202 ASSERT(cb != NULL); 4203 dev = cb->l2wcb_dev; 4204 ASSERT(dev != NULL); 4205 head = cb->l2wcb_head; 4206 ASSERT(head != NULL); 4207 buflist = dev->l2ad_buflist; 4208 ASSERT(buflist != NULL); 4209 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, 4210 l2arc_write_callback_t *, cb); 4211 4212 if (zio->io_error != 0) 4213 ARCSTAT_BUMP(arcstat_l2_writes_error); 4214 4215 mutex_enter(&l2arc_buflist_mtx); 4216 4217 /* 4218 * All writes completed, or an error was hit. 4219 */ 4220 for (ab = list_prev(buflist, head); ab; ab = ab_prev) { 4221 ab_prev = list_prev(buflist, ab); 4222 4223 hash_lock = HDR_LOCK(ab); 4224 if (!mutex_tryenter(hash_lock)) { 4225 /* 4226 * This buffer misses out. It may be in a stage 4227 * of eviction. Its ARC_L2_WRITING flag will be 4228 * left set, denying reads to this buffer. 
4229 */ 4230 ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss); 4231 continue; 4232 } 4233 4234 abl2 = ab->b_l2hdr; 4235 4236 /* 4237 * Release the temporary compressed buffer as soon as possible. 4238 */ 4239 if (abl2->b_compress != ZIO_COMPRESS_OFF) 4240 l2arc_release_cdata_buf(ab); 4241 4242 if (zio->io_error != 0) { 4243 /* 4244 * Error - drop L2ARC entry. 4245 */ 4246 list_remove(buflist, ab); 4247 ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize); 4248 ab->b_l2hdr = NULL; 4249 kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); 4250 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); 4251 } 4252 4253 /* 4254 * Allow ARC to begin reads to this L2ARC entry. 4255 */ 4256 ab->b_flags &= ~ARC_L2_WRITING; 4257 4258 mutex_exit(hash_lock); 4259 } 4260 4261 atomic_inc_64(&l2arc_writes_done); 4262 list_remove(buflist, head); 4263 kmem_cache_free(hdr_cache, head); 4264 mutex_exit(&l2arc_buflist_mtx); 4265 4266 l2arc_do_free_on_write(); 4267 4268 kmem_free(cb, sizeof (l2arc_write_callback_t)); 4269 } 4270 4271 /* 4272 * A read to a cache device completed. Validate buffer contents before 4273 * handing over to the regular ARC routines. 4274 */ 4275 static void 4276 l2arc_read_done(zio_t *zio) 4277 { 4278 l2arc_read_callback_t *cb; 4279 arc_buf_hdr_t *hdr; 4280 arc_buf_t *buf; 4281 kmutex_t *hash_lock; 4282 int equal; 4283 4284 ASSERT(zio->io_vd != NULL); 4285 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); 4286 4287 spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); 4288 4289 cb = zio->io_private; 4290 ASSERT(cb != NULL); 4291 buf = cb->l2rcb_buf; 4292 ASSERT(buf != NULL); 4293 4294 hash_lock = HDR_LOCK(buf->b_hdr); 4295 mutex_enter(hash_lock); 4296 hdr = buf->b_hdr; 4297 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 4298 4299 /* 4300 * If the buffer was compressed, decompress it first. 4301 */ 4302 if (cb->l2rcb_compress != ZIO_COMPRESS_OFF) 4303 l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress); 4304 ASSERT(zio->io_data != NULL); 4305 4306 /* 4307 * Check this survived the L2ARC journey. 4308 */ 4309 equal = arc_cksum_equal(buf); 4310 if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { 4311 mutex_exit(hash_lock); 4312 zio->io_private = buf; 4313 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ 4314 zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ 4315 arc_read_done(zio); 4316 } else { 4317 mutex_exit(hash_lock); 4318 /* 4319 * Buffer didn't survive caching. Increment stats and 4320 * reissue to the original storage device. 4321 */ 4322 if (zio->io_error != 0) { 4323 ARCSTAT_BUMP(arcstat_l2_io_error); 4324 } else { 4325 zio->io_error = SET_ERROR(EIO); 4326 } 4327 if (!equal) 4328 ARCSTAT_BUMP(arcstat_l2_cksum_bad); 4329 4330 /* 4331 * If there's no waiter, issue an async i/o to the primary 4332 * storage now. If there *is* a waiter, the caller must 4333 * issue the i/o in a context where it's OK to block. 4334 */ 4335 if (zio->io_waiter == NULL) { 4336 zio_t *pio = zio_unique_parent(zio); 4337 4338 ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); 4339 4340 zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp, 4341 buf->b_data, zio->io_size, arc_read_done, buf, 4342 zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); 4343 } 4344 } 4345 4346 kmem_free(cb, sizeof (l2arc_read_callback_t)); 4347 } 4348 4349 /* 4350 * This is the list priority from which the L2ARC will search for pages to 4351 * cache. This is used within loops (0..3) to cycle through lists in the 4352 * desired order. This order can have a significant effect on cache 4353 * performance. 
4354 * 4355 * Currently the metadata lists are hit first, MFU then MRU, followed by 4356 * the data lists. This function returns a locked list, and also returns 4357 * the lock pointer. 4358 */ 4359 static list_t * 4360 l2arc_list_locked(int list_num, kmutex_t **lock) 4361 { 4362 list_t *list = NULL; 4363 4364 ASSERT(list_num >= 0 && list_num <= 3); 4365 4366 switch (list_num) { 4367 case 0: 4368 list = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; 4369 *lock = &arc_mfu->arcs_mtx; 4370 break; 4371 case 1: 4372 list = &arc_mru->arcs_list[ARC_BUFC_METADATA]; 4373 *lock = &arc_mru->arcs_mtx; 4374 break; 4375 case 2: 4376 list = &arc_mfu->arcs_list[ARC_BUFC_DATA]; 4377 *lock = &arc_mfu->arcs_mtx; 4378 break; 4379 case 3: 4380 list = &arc_mru->arcs_list[ARC_BUFC_DATA]; 4381 *lock = &arc_mru->arcs_mtx; 4382 break; 4383 } 4384 4385 ASSERT(!(MUTEX_HELD(*lock))); 4386 mutex_enter(*lock); 4387 return (list); 4388 } 4389 4390 /* 4391 * Evict buffers from the device write hand to the distance specified in 4392 * bytes. This distance may span populated buffers, it may span nothing. 4393 * This is clearing a region on the L2ARC device ready for writing. 4394 * If the 'all' boolean is set, every buffer is evicted. 4395 */ 4396 static void 4397 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) 4398 { 4399 list_t *buflist; 4400 l2arc_buf_hdr_t *abl2; 4401 arc_buf_hdr_t *ab, *ab_prev; 4402 kmutex_t *hash_lock; 4403 uint64_t taddr; 4404 4405 buflist = dev->l2ad_buflist; 4406 4407 if (buflist == NULL) 4408 return; 4409 4410 if (!all && dev->l2ad_first) { 4411 /* 4412 * This is the first sweep through the device. There is 4413 * nothing to evict. 4414 */ 4415 return; 4416 } 4417 4418 if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { 4419 /* 4420 * When nearing the end of the device, evict to the end 4421 * before the device write hand jumps to the start. 4422 */ 4423 taddr = dev->l2ad_end; 4424 } else { 4425 taddr = dev->l2ad_hand + distance; 4426 } 4427 DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, 4428 uint64_t, taddr, boolean_t, all); 4429 4430 top: 4431 mutex_enter(&l2arc_buflist_mtx); 4432 for (ab = list_tail(buflist); ab; ab = ab_prev) { 4433 ab_prev = list_prev(buflist, ab); 4434 4435 hash_lock = HDR_LOCK(ab); 4436 if (!mutex_tryenter(hash_lock)) { 4437 /* 4438 * Missed the hash lock. Retry. 4439 */ 4440 ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); 4441 mutex_exit(&l2arc_buflist_mtx); 4442 mutex_enter(hash_lock); 4443 mutex_exit(hash_lock); 4444 goto top; 4445 } 4446 4447 if (HDR_L2_WRITE_HEAD(ab)) { 4448 /* 4449 * We hit a write head node. Leave it for 4450 * l2arc_write_done(). 4451 */ 4452 list_remove(buflist, ab); 4453 mutex_exit(hash_lock); 4454 continue; 4455 } 4456 4457 if (!all && ab->b_l2hdr != NULL && 4458 (ab->b_l2hdr->b_daddr > taddr || 4459 ab->b_l2hdr->b_daddr < dev->l2ad_hand)) { 4460 /* 4461 * We've evicted to the target address, 4462 * or the end of the device. 4463 */ 4464 mutex_exit(hash_lock); 4465 break; 4466 } 4467 4468 if (HDR_FREE_IN_PROGRESS(ab)) { 4469 /* 4470 * Already on the path to destruction. 4471 */ 4472 mutex_exit(hash_lock); 4473 continue; 4474 } 4475 4476 if (ab->b_state == arc_l2c_only) { 4477 ASSERT(!HDR_L2_READING(ab)); 4478 /* 4479 * This doesn't exist in the ARC. Destroy. 4480 * arc_hdr_destroy() will call list_remove() 4481 * and decrement arcstat_l2_size. 
4482 */ 4483 arc_change_state(arc_anon, ab, hash_lock); 4484 arc_hdr_destroy(ab); 4485 } else { 4486 /* 4487 * Invalidate issued or about to be issued 4488 * reads, since we may be about to write 4489 * over this location. 4490 */ 4491 if (HDR_L2_READING(ab)) { 4492 ARCSTAT_BUMP(arcstat_l2_evict_reading); 4493 ab->b_flags |= ARC_L2_EVICTED; 4494 } 4495 4496 /* 4497 * Tell ARC this no longer exists in L2ARC. 4498 */ 4499 if (ab->b_l2hdr != NULL) { 4500 abl2 = ab->b_l2hdr; 4501 ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize); 4502 ab->b_l2hdr = NULL; 4503 kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); 4504 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); 4505 } 4506 list_remove(buflist, ab); 4507 4508 /* 4509 * This may have been leftover after a 4510 * failed write. 4511 */ 4512 ab->b_flags &= ~ARC_L2_WRITING; 4513 } 4514 mutex_exit(hash_lock); 4515 } 4516 mutex_exit(&l2arc_buflist_mtx); 4517 4518 vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0); 4519 dev->l2ad_evict = taddr; 4520 } 4521 4522 /* 4523 * Find and write ARC buffers to the L2ARC device. 4524 * 4525 * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid 4526 * for reading until they have completed writing. 4527 * The headroom_boost is an in-out parameter used to maintain headroom boost 4528 * state between calls to this function. 4529 * 4530 * Returns the number of bytes actually written (which may be smaller than 4531 * the delta by which the device hand has changed due to alignment). 4532 */ 4533 static uint64_t 4534 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, 4535 boolean_t *headroom_boost) 4536 { 4537 arc_buf_hdr_t *ab, *ab_prev, *head; 4538 list_t *list; 4539 uint64_t write_asize, write_psize, write_sz, headroom, 4540 buf_compress_minsz; 4541 void *buf_data; 4542 kmutex_t *list_lock; 4543 boolean_t full; 4544 l2arc_write_callback_t *cb; 4545 zio_t *pio, *wzio; 4546 uint64_t guid = spa_load_guid(spa); 4547 const boolean_t do_headroom_boost = *headroom_boost; 4548 4549 ASSERT(dev->l2ad_vdev != NULL); 4550 4551 /* Lower the flag now, we might want to raise it again later. */ 4552 *headroom_boost = B_FALSE; 4553 4554 pio = NULL; 4555 write_sz = write_asize = write_psize = 0; 4556 full = B_FALSE; 4557 head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); 4558 head->b_flags |= ARC_L2_WRITE_HEAD; 4559 4560 /* 4561 * We will want to try to compress buffers that are at least 2x the 4562 * device sector size. 4563 */ 4564 buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift; 4565 4566 /* 4567 * Copy buffers for L2ARC writing. 4568 */ 4569 mutex_enter(&l2arc_buflist_mtx); 4570 for (int try = 0; try <= 3; try++) { 4571 uint64_t passed_sz = 0; 4572 4573 list = l2arc_list_locked(try, &list_lock); 4574 4575 /* 4576 * L2ARC fast warmup. 4577 * 4578 * Until the ARC is warm and starts to evict, read from the 4579 * head of the ARC lists rather than the tail. 4580 */ 4581 if (arc_warm == B_FALSE) 4582 ab = list_head(list); 4583 else 4584 ab = list_tail(list); 4585 4586 headroom = target_sz * l2arc_headroom; 4587 if (do_headroom_boost) 4588 headroom = (headroom * l2arc_headroom_boost) / 100; 4589 4590 for (; ab; ab = ab_prev) { 4591 l2arc_buf_hdr_t *l2hdr; 4592 kmutex_t *hash_lock; 4593 uint64_t buf_sz; 4594 4595 if (arc_warm == B_FALSE) 4596 ab_prev = list_next(list, ab); 4597 else 4598 ab_prev = list_prev(list, ab); 4599 4600 hash_lock = HDR_LOCK(ab); 4601 if (!mutex_tryenter(hash_lock)) { 4602 /* 4603 * Skip this buffer rather than waiting. 
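 * We are holding the ARC list lock here, so blocking on the hash
 * lock could deadlock; mutex_tryenter() lets us simply move on to
 * the next candidate.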
*/ 4605 continue; 4606 } 4607 4608 passed_sz += ab->b_size; 4609 if (passed_sz > headroom) { 4610 /* 4611 * Searched too far. 4612 */ 4613 mutex_exit(hash_lock); 4614 break; 4615 } 4616 4617 if (!l2arc_write_eligible(guid, ab)) { 4618 mutex_exit(hash_lock); 4619 continue; 4620 } 4621 4622 if ((write_sz + ab->b_size) > target_sz) { 4623 full = B_TRUE; 4624 mutex_exit(hash_lock); 4625 break; 4626 } 4627 4628 if (pio == NULL) { 4629 /* 4630 * Insert a dummy header on the buflist so 4631 * l2arc_write_done() can find where the 4632 * write buffers begin without searching. 4633 */ 4634 list_insert_head(dev->l2ad_buflist, head); 4635 4636 cb = kmem_alloc( 4637 sizeof (l2arc_write_callback_t), KM_SLEEP); 4638 cb->l2wcb_dev = dev; 4639 cb->l2wcb_head = head; 4640 pio = zio_root(spa, l2arc_write_done, cb, 4641 ZIO_FLAG_CANFAIL); 4642 } 4643 4644 /* 4645 * Create and add a new L2ARC header. 4646 */ 4647 l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP); 4648 l2hdr->b_dev = dev; 4649 ab->b_flags |= ARC_L2_WRITING; 4650 4651 /* 4652 * Temporarily stash the data buffer in b_tmp_cdata. 4653 * The subsequent write step will pick it up from 4654 * there. This is because we can't access ab->b_buf 4655 * without holding the hash_lock, which we in turn 4656 * can't access without holding the ARC list locks 4657 * (which we want to avoid during compression/writing). 4658 */ 4659 l2hdr->b_compress = ZIO_COMPRESS_OFF; 4660 l2hdr->b_asize = ab->b_size; 4661 l2hdr->b_tmp_cdata = ab->b_buf->b_data; 4662 4663 buf_sz = ab->b_size; 4664 ab->b_l2hdr = l2hdr; 4665 4666 list_insert_head(dev->l2ad_buflist, ab); 4667 4668 /* 4669 * Compute and store the buffer cksum before 4670 * writing. On debug the cksum is verified first. 4671 */ 4672 arc_cksum_verify(ab->b_buf); 4673 arc_cksum_compute(ab->b_buf, B_TRUE); 4674 4675 mutex_exit(hash_lock); 4676 4677 write_sz += buf_sz; 4678 } 4679 4680 mutex_exit(list_lock); 4681 4682 if (full == B_TRUE) 4683 break; 4684 } 4685 4686 /* No buffers selected for writing? */ 4687 if (pio == NULL) { 4688 ASSERT0(write_sz); 4689 mutex_exit(&l2arc_buflist_mtx); 4690 kmem_cache_free(hdr_cache, head); 4691 return (0); 4692 } 4693 4694 /* 4695 * Now start writing the buffers. We start at the write head 4696 * and work backwards, retracing the course of the buffer selector 4697 * loop above. 4698 */ 4699 for (ab = list_prev(dev->l2ad_buflist, head); ab; 4700 ab = list_prev(dev->l2ad_buflist, ab)) { 4701 l2arc_buf_hdr_t *l2hdr; 4702 uint64_t buf_sz; 4703 4704 /* 4705 * We shouldn't need to lock the buffer here, since we flagged 4706 * it as ARC_L2_WRITING in the previous step, but we must take 4707 * care to only access its L2 cache parameters. In particular, 4708 * ab->b_buf may be invalid by now due to ARC eviction. 4709 */ 4710 l2hdr = ab->b_l2hdr; 4711 l2hdr->b_daddr = dev->l2ad_hand; 4712 4713 if ((ab->b_flags & ARC_L2COMPRESS) && 4714 l2hdr->b_asize >= buf_compress_minsz) { 4715 if (l2arc_compress_buf(l2hdr)) { 4716 /* 4717 * If compression succeeded, enable headroom 4718 * boost on the next scan cycle. 4719 */ 4720 *headroom_boost = B_TRUE; 4721 } 4722 } 4723 4724 /* 4725 * Pick up the buffer data we had previously stashed away 4726 * (and now potentially also compressed). 4727 */ 4728 buf_data = l2hdr->b_tmp_cdata; 4729 buf_sz = l2hdr->b_asize; 4730 4731 /* Compression may have squashed the buffer to zero length.
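 * That is the all-zero (ZIO_COMPRESS_EMPTY) case from
 * l2arc_compress_buf(): nothing is written and the device write
 * hand does not advance for this buffer.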
*/ 4732 if (buf_sz != 0) { 4733 uint64_t buf_p_sz; 4734 4735 wzio = zio_write_phys(pio, dev->l2ad_vdev, 4736 dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF, 4737 NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, 4738 ZIO_FLAG_CANFAIL, B_FALSE); 4739 4740 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, 4741 zio_t *, wzio); 4742 (void) zio_nowait(wzio); 4743 4744 write_asize += buf_sz; 4745 /* 4746 * Keep the clock hand suitably device-aligned. 4747 */ 4748 buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); 4749 write_psize += buf_p_sz; 4750 dev->l2ad_hand += buf_p_sz; 4751 } 4752 } 4753 4754 mutex_exit(&l2arc_buflist_mtx); 4755 4756 ASSERT3U(write_asize, <=, target_sz); 4757 ARCSTAT_BUMP(arcstat_l2_writes_sent); 4758 ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize); 4759 ARCSTAT_INCR(arcstat_l2_size, write_sz); 4760 ARCSTAT_INCR(arcstat_l2_asize, write_asize); 4761 vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0); 4762 4763 /* 4764 * Bump device hand to the device start if it is approaching the end. 4765 * l2arc_evict() will already have evicted ahead for this case. 4766 */ 4767 if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { 4768 vdev_space_update(dev->l2ad_vdev, 4769 dev->l2ad_end - dev->l2ad_hand, 0, 0); 4770 dev->l2ad_hand = dev->l2ad_start; 4771 dev->l2ad_evict = dev->l2ad_start; 4772 dev->l2ad_first = B_FALSE; 4773 } 4774 4775 dev->l2ad_writing = B_TRUE; 4776 (void) zio_wait(pio); 4777 dev->l2ad_writing = B_FALSE; 4778 4779 return (write_asize); 4780 } 4781 4782 /* 4783 * Compresses an L2ARC buffer. 4784 * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its 4785 * size in l2hdr->b_asize. This routine tries to compress the data and 4786 * depending on the compression result there are three possible outcomes: 4787 * *) The buffer was incompressible. The original l2hdr contents were left 4788 * untouched and are ready for writing to an L2 device. 4789 * *) The buffer was all-zeros, so there is no need to write it to an L2 4790 * device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is 4791 * set to zero and b_compress is set to ZIO_COMPRESS_EMPTY. 4792 * *) Compression succeeded and b_tmp_cdata was replaced with a temporary 4793 * data buffer which holds the compressed data to be written, and b_asize 4794 * tells us how much data there is. b_compress is set to the appropriate 4795 * compression algorithm. Once writing is done, invoke 4796 * l2arc_release_cdata_buf on this l2hdr to free this temporary buffer. 4797 * 4798 * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the 4799 * buffer was incompressible). 4800 */ 4801 static boolean_t 4802 l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr) 4803 { 4804 void *cdata; 4805 size_t csize, len; 4806 4807 ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF); 4808 ASSERT(l2hdr->b_tmp_cdata != NULL); 4809 4810 len = l2hdr->b_asize; 4811 cdata = zio_data_buf_alloc(len); 4812 csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata, 4813 cdata, l2hdr->b_asize); 4814 4815 if (csize == 0) { 4816 /* zero block, indicate that there's nothing to write */ 4817 zio_data_buf_free(cdata, len); 4818 l2hdr->b_compress = ZIO_COMPRESS_EMPTY; 4819 l2hdr->b_asize = 0; 4820 l2hdr->b_tmp_cdata = NULL; 4821 ARCSTAT_BUMP(arcstat_l2_compress_zeros); 4822 return (B_TRUE); 4823 } else if (csize > 0 && csize < len) { 4824 /* 4825 * Compression succeeded, we'll keep the cdata around for 4826 * writing and release it afterwards. 
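 * (the release happens in l2arc_release_cdata_buf(), called from
 * l2arc_write_done() once the write has completed)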
*/ 4828 l2hdr->b_compress = ZIO_COMPRESS_LZ4; 4829 l2hdr->b_asize = csize; 4830 l2hdr->b_tmp_cdata = cdata; 4831 ARCSTAT_BUMP(arcstat_l2_compress_successes); 4832 return (B_TRUE); 4833 } else { 4834 /* 4835 * Compression failed, release the compressed buffer. 4836 * l2hdr will be left unmodified. 4837 */ 4838 zio_data_buf_free(cdata, len); 4839 ARCSTAT_BUMP(arcstat_l2_compress_failures); 4840 return (B_FALSE); 4841 } 4842 } 4843 4844 /* 4845 * Decompresses a zio read back from an l2arc device. On success, the 4846 * underlying zio's io_data buffer is overwritten by the uncompressed 4847 * version. On decompression error (corrupt compressed stream), the 4848 * zio->io_error value is set to signal an I/O error. 4849 * 4850 * Please note that the compressed data stream is not checksummed, so 4851 * if the underlying device is experiencing data corruption, we may feed 4852 * corrupt data to the decompressor; the decompressor therefore needs to 4853 * be able to handle this situation (LZ4 does). 4854 */ 4855 static void 4856 l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c) 4857 { 4858 ASSERT(L2ARC_IS_VALID_COMPRESS(c)); 4859 4860 if (zio->io_error != 0) { 4861 /* 4862 * An I/O error has occurred; just restore the original I/O 4863 * size in preparation for a main pool read. 4864 */ 4865 zio->io_orig_size = zio->io_size = hdr->b_size; 4866 return; 4867 } 4868 4869 if (c == ZIO_COMPRESS_EMPTY) { 4870 /* 4871 * An empty buffer results in a null zio, which means we 4872 * need to fill its io_data after we're done restoring the 4873 * buffer's contents. 4874 */ 4875 ASSERT(hdr->b_buf != NULL); 4876 bzero(hdr->b_buf->b_data, hdr->b_size); 4877 zio->io_data = zio->io_orig_data = hdr->b_buf->b_data; 4878 } else { 4879 ASSERT(zio->io_data != NULL); 4880 /* 4881 * We copy the compressed data from the start of the arc buffer 4882 * (the zio_read will have pulled in only what we need, the 4883 * rest is garbage which we will overwrite at decompression) 4884 * and then decompress back to the ARC data buffer. This way we 4885 * can minimize copying by simply decompressing back over the 4886 * original compressed data (rather than decompressing to an 4887 * aux buffer and then copying back the uncompressed buffer, 4888 * which is likely to be much larger). 4889 */ 4890 uint64_t csize; 4891 void *cdata; 4892 4893 csize = zio->io_size; 4894 cdata = zio_data_buf_alloc(csize); 4895 bcopy(zio->io_data, cdata, csize); 4896 if (zio_decompress_data(c, cdata, zio->io_data, csize, 4897 hdr->b_size) != 0) 4898 zio->io_error = EIO; 4899 zio_data_buf_free(cdata, csize); 4900 } 4901 4902 /* Restore the expected uncompressed IO size. */ 4903 zio->io_orig_size = zio->io_size = hdr->b_size; 4904 } 4905 4906 /* 4907 * Releases the temporary b_tmp_cdata buffer in an l2arc header structure. 4908 * This buffer serves as a temporary holder of compressed data while 4909 * the buffer entry is being written to an l2arc device. Once that is 4910 * done, we can dispose of it. 4911 */ 4912 static void 4913 l2arc_release_cdata_buf(arc_buf_hdr_t *ab) 4914 { 4915 l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr; 4916 4917 if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) { 4918 /* 4919 * If the data was compressed, then we've allocated a 4920 * temporary buffer for it, so now we need to release it. 4921 */ 4922 ASSERT(l2hdr->b_tmp_cdata != NULL); 4923 zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size); 4924 } 4925 l2hdr->b_tmp_cdata = NULL; 4926 } 4927 4928 /* 4929 * This thread feeds the L2ARC at regular intervals.
This is the beating 4930 * heart of the L2ARC. 4931 */ 4932 static void 4933 l2arc_feed_thread(void) 4934 { 4935 callb_cpr_t cpr; 4936 l2arc_dev_t *dev; 4937 spa_t *spa; 4938 uint64_t size, wrote; 4939 clock_t begin, next = ddi_get_lbolt(); 4940 boolean_t headroom_boost = B_FALSE; 4941 4942 CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); 4943 4944 mutex_enter(&l2arc_feed_thr_lock); 4945 4946 while (l2arc_thread_exit == 0) { 4947 CALLB_CPR_SAFE_BEGIN(&cpr); 4948 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, 4949 next); 4950 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); 4951 next = ddi_get_lbolt() + hz; 4952 4953 /* 4954 * Quick check for L2ARC devices. 4955 */ 4956 mutex_enter(&l2arc_dev_mtx); 4957 if (l2arc_ndev == 0) { 4958 mutex_exit(&l2arc_dev_mtx); 4959 continue; 4960 } 4961 mutex_exit(&l2arc_dev_mtx); 4962 begin = ddi_get_lbolt(); 4963 4964 /* 4965 * This selects the next l2arc device to write to, and in 4966 * doing so the next spa to feed from: dev->l2ad_spa. This 4967 * will return NULL if there are now no l2arc devices or if 4968 * they are all faulted. 4969 * 4970 * If a device is returned, its spa's config lock is also 4971 * held to prevent device removal. l2arc_dev_get_next() 4972 * will grab and release l2arc_dev_mtx. 4973 */ 4974 if ((dev = l2arc_dev_get_next()) == NULL) 4975 continue; 4976 4977 spa = dev->l2ad_spa; 4978 ASSERT(spa != NULL); 4979 4980 /* 4981 * If the pool is read-only then force the feed thread to 4982 * sleep a little longer. 4983 */ 4984 if (!spa_writeable(spa)) { 4985 next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; 4986 spa_config_exit(spa, SCL_L2ARC, dev); 4987 continue; 4988 } 4989 4990 /* 4991 * Avoid contributing to memory pressure. 4992 */ 4993 if (arc_reclaim_needed()) { 4994 ARCSTAT_BUMP(arcstat_l2_abort_lowmem); 4995 spa_config_exit(spa, SCL_L2ARC, dev); 4996 continue; 4997 } 4998 4999 ARCSTAT_BUMP(arcstat_l2_feeds); 5000 5001 size = l2arc_write_size(); 5002 5003 /* 5004 * Evict L2ARC buffers that will be overwritten. 5005 */ 5006 l2arc_evict(dev, size, B_FALSE); 5007 5008 /* 5009 * Write ARC buffers. 5010 */ 5011 wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost); 5012 5013 /* 5014 * Calculate interval between writes. 5015 */ 5016 next = l2arc_write_interval(begin, size, wrote); 5017 spa_config_exit(spa, SCL_L2ARC, dev); 5018 } 5019 5020 l2arc_thread_exit = 0; 5021 cv_broadcast(&l2arc_feed_thr_cv); 5022 CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ 5023 thread_exit(); 5024 } 5025 5026 boolean_t 5027 l2arc_vdev_present(vdev_t *vd) 5028 { 5029 l2arc_dev_t *dev; 5030 5031 mutex_enter(&l2arc_dev_mtx); 5032 for (dev = list_head(l2arc_dev_list); dev != NULL; 5033 dev = list_next(l2arc_dev_list, dev)) { 5034 if (dev->l2ad_vdev == vd) 5035 break; 5036 } 5037 mutex_exit(&l2arc_dev_mtx); 5038 5039 return (dev != NULL); 5040 } 5041 5042 /* 5043 * Add a vdev for use by the L2ARC. By this point the spa has already 5044 * validated the vdev and opened it. 5045 */ 5046 void 5047 l2arc_add_vdev(spa_t *spa, vdev_t *vd) 5048 { 5049 l2arc_dev_t *adddev; 5050 5051 ASSERT(!l2arc_vdev_present(vd)); 5052 5053 /* 5054 * Create a new l2arc device entry. 
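 * The usable region begins past the vdev labels (l2ad_start is
 * VDEV_LABEL_START_SIZE) and the write hand starts at the beginning
 * of that region.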
5055 */ 5056 adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); 5057 adddev->l2ad_spa = spa; 5058 adddev->l2ad_vdev = vd; 5059 adddev->l2ad_start = VDEV_LABEL_START_SIZE; 5060 adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); 5061 adddev->l2ad_hand = adddev->l2ad_start; 5062 adddev->l2ad_evict = adddev->l2ad_start; 5063 adddev->l2ad_first = B_TRUE; 5064 adddev->l2ad_writing = B_FALSE; 5065 5066 /* 5067 * This is a list of all ARC buffers that are still valid on the 5068 * device. 5069 */ 5070 adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP); 5071 list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), 5072 offsetof(arc_buf_hdr_t, b_l2node)); 5073 5074 vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); 5075 5076 /* 5077 * Add device to global list 5078 */ 5079 mutex_enter(&l2arc_dev_mtx); 5080 list_insert_head(l2arc_dev_list, adddev); 5081 atomic_inc_64(&l2arc_ndev); 5082 mutex_exit(&l2arc_dev_mtx); 5083 } 5084 5085 /* 5086 * Remove a vdev from the L2ARC. 5087 */ 5088 void 5089 l2arc_remove_vdev(vdev_t *vd) 5090 { 5091 l2arc_dev_t *dev, *nextdev, *remdev = NULL; 5092 5093 /* 5094 * Find the device by vdev 5095 */ 5096 mutex_enter(&l2arc_dev_mtx); 5097 for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { 5098 nextdev = list_next(l2arc_dev_list, dev); 5099 if (vd == dev->l2ad_vdev) { 5100 remdev = dev; 5101 break; 5102 } 5103 } 5104 ASSERT(remdev != NULL); 5105 5106 /* 5107 * Remove device from global list 5108 */ 5109 list_remove(l2arc_dev_list, remdev); 5110 l2arc_dev_last = NULL; /* may have been invalidated */ 5111 atomic_dec_64(&l2arc_ndev); 5112 mutex_exit(&l2arc_dev_mtx); 5113 5114 /* 5115 * Clear all buflists and ARC references. L2ARC device flush. 5116 */ 5117 l2arc_evict(remdev, 0, B_TRUE); 5118 list_destroy(remdev->l2ad_buflist); 5119 kmem_free(remdev->l2ad_buflist, sizeof (list_t)); 5120 kmem_free(remdev, sizeof (l2arc_dev_t)); 5121 } 5122 5123 void 5124 l2arc_init(void) 5125 { 5126 l2arc_thread_exit = 0; 5127 l2arc_ndev = 0; 5128 l2arc_writes_sent = 0; 5129 l2arc_writes_done = 0; 5130 5131 mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); 5132 cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); 5133 mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); 5134 mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL); 5135 mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); 5136 5137 l2arc_dev_list = &L2ARC_dev_list; 5138 l2arc_free_on_write = &L2ARC_free_on_write; 5139 list_create(l2arc_dev_list, sizeof (l2arc_dev_t), 5140 offsetof(l2arc_dev_t, l2ad_node)); 5141 list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), 5142 offsetof(l2arc_data_free_t, l2df_list_node)); 5143 } 5144 5145 void 5146 l2arc_fini(void) 5147 { 5148 /* 5149 * This is called from dmu_fini(), which is called from spa_fini(); 5150 * Because of this, we can assume that all l2arc devices have 5151 * already been removed when the pools themselves were removed. 
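 * That leaves only the free-on-write list to drain and the global
 * locks and lists set up in l2arc_init() to tear down.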
5152 */ 5153 5154 l2arc_do_free_on_write(); 5155 5156 mutex_destroy(&l2arc_feed_thr_lock); 5157 cv_destroy(&l2arc_feed_thr_cv); 5158 mutex_destroy(&l2arc_dev_mtx); 5159 mutex_destroy(&l2arc_buflist_mtx); 5160 mutex_destroy(&l2arc_free_on_write_mtx); 5161 5162 list_destroy(l2arc_dev_list); 5163 list_destroy(l2arc_free_on_write); 5164 } 5165 5166 void 5167 l2arc_start(void) 5168 { 5169 if (!(spa_mode_global & FWRITE)) 5170 return; 5171 5172 (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, 5173 TS_RUN, minclsyspri); 5174 } 5175 5176 void 5177 l2arc_stop(void) 5178 { 5179 if (!(spa_mode_global & FWRITE)) 5180 return; 5181 5182 mutex_enter(&l2arc_feed_thr_lock); 5183 cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ 5184 l2arc_thread_exit = 1; 5185 while (l2arc_thread_exit != 0) 5186 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); 5187 mutex_exit(&l2arc_feed_thr_lock); 5188 }