/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 * Copyright (c) 2013 by Delphix. All rights reserved.
 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 */

/*
 * DVA-based Adjustable Replacement Cache
 *
 * While much of the theory of operation used here is
 * based on the self-tuning, low overhead replacement cache
 * presented by Megiddo and Modha at FAST 2003, there are some
 * significant differences:
 *
 * 1. The Megiddo and Modha model assumes any page is evictable.
 * Pages in its cache cannot be "locked" into memory.  This makes
 * the eviction algorithm simple: evict the last page in the list.
 * This also makes the performance characteristics easy to reason
 * about.  Our cache is not so simple.  At any given moment, some
 * subset of the blocks in the cache are un-evictable because we
 * have handed out a reference to them.  Blocks are only evictable
 * when there are no external references active.  This makes
 * eviction far more problematic: we choose to evict the evictable
 * blocks that are the "lowest" in the list.
 *
 * There are times when it is not possible to evict the requested
 * space.  In these circumstances we are unable to adjust the cache
 * size.  To prevent the cache from growing unbounded at these times
 * we implement a "cache throttle" that slows the flow of new data
 * into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 * Pages are evicted when the cache is full and there is a cache
 * miss.  Our model has a variable sized cache.  It grows with
 * high use, but also tries to react to memory pressure from the
 * operating system: decreasing its size when system memory is
 * tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size.  All
 * elements of the cache are therefore exactly the same size.  So
 * when adjusting the cache size following a cache miss, it's simply
 * a matter of choosing a single page to evict.  In our model, we
 * have variable sized cache blocks (ranging from 512 bytes to
 * 128K bytes).  We therefore choose a set of blocks to evict to make
 * space for a cache miss that approximates as closely as possible
 * the space used by the new block.
 *
 * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
 * by N. Megiddo & D. Modha, FAST 2003
 */

/*
 * The locking model:
 *
 * A new reference to a cache buffer can be obtained in two
 * ways: 1) via a hash table lookup using the DVA as a key,
 * or 2) via one of the ARC lists.  The arc_read() interface
 * uses method 1, while the internal arc algorithms for
 * adjusting the cache use method 2.  We therefore provide two
 * types of locks: 1) the hash table lock array, and 2) the
 * arc list locks.
 *
 * Buffers do not have their own mutexes, rather they rely on the
 * hash table mutexes for the bulk of their protection (i.e. most
 * fields in the arc_buf_hdr_t are protected by these mutexes).
 *
 * buf_hash_find() returns the appropriate mutex (held) when it
 * locates the requested buffer in the hash table.  It returns
 * NULL for the mutex if the buffer was not in the table.
 *
 * buf_hash_remove() expects the appropriate hash mutex to be
 * already held before it is invoked.
 *
 * Each arc state also has a mutex which is used to protect the
 * buffer list associated with the state.  When attempting to
 * obtain a hash table lock while holding an arc list lock you
 * must use mutex_tryenter() to avoid deadlock.  Also note that
 * the active state mutex must be held before the ghost state mutex.
 *
 * Arc buffers may have an associated eviction callback function.
 * This function will be invoked prior to removing the buffer (e.g.
 * in arc_do_user_evicts()).  Note however that the data associated
 * with the buffer may be evicted prior to the callback.  The callback
 * must be made with *no locks held* (to prevent deadlock).  Additionally,
 * the users of callbacks must ensure that their private data is
 * protected from simultaneous callbacks from arc_buf_evict()
 * and arc_do_user_evicts().
 *
 * Note that the majority of the performance stats are manipulated
 * with atomic operations.
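 *
 * As an illustration only (a sketch, not a verbatim excerpt of any
 * caller), taking a new reference via method 1 typically looks like:
 *
 *	kmutex_t *hash_lock;
 *	arc_buf_hdr_t *hdr = buf_hash_find(guid, dva, birth, &hash_lock);
 *	if (hdr != NULL) {
 *		add_reference(hdr, hash_lock, tag);
 *		mutex_exit(hash_lock);
 *	}
 *
 * where buf_hash_find() returns with the hash mutex held on a hit, and
 * the held reference keeps the header off the evictable lists.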
 *
 * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
 *
 *	- L2ARC buflist creation
 *	- L2ARC buflist eviction
 *	- L2ARC write completion, which walks L2ARC buflists
 *	- ARC header destruction, as it removes from L2ARC buflists
 *	- ARC header release, as it removes from L2ARC buflists
 */

#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/zio_compress.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/refcount.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#ifdef _KERNEL
#include <sys/vmsystm.h>
#include <vm/anon.h>
#include <sys/fs/swapnode.h>
#include <sys/dnlc.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
#include <zfs_fletcher.h>

#ifndef _KERNEL
/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
boolean_t arc_watch = B_FALSE;
int arc_procfd;
#endif

static kmutex_t		arc_reclaim_thr_lock;
static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
static uint8_t		arc_thread_exit;

extern int zfs_write_limit_shift;
extern uint64_t zfs_write_limit_max;
extern kmutex_t zfs_write_limit_lock;

#define	ARC_REDUCE_DNLC_PERCENT	3
uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;

typedef enum arc_reclaim_strategy {
	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
} arc_reclaim_strategy_t;

/* number of seconds before growing cache again */
static int		arc_grow_retry = 60;

/* shift of arc_c for calculating both min and max arc_p */
static int		arc_p_min_shift = 4;

/* log2(fraction of arc to reclaim) */
static int		arc_shrink_shift = 5;

/*
 * minimum lifespan of a prefetch block in clock ticks
 * (initialized in arc_init())
 */
static int		arc_min_prefetch_lifespan;

static int arc_dead;

/*
 * The arc has filled available memory and has now warmed up.
 */
static boolean_t arc_warm;

/*
 * These tunables are for performance analysis.
 */
uint64_t zfs_arc_max;
uint64_t zfs_arc_min;
uint64_t zfs_arc_meta_limit = 0;
int zfs_arc_grow_retry = 0;
int zfs_arc_shrink_shift = 0;
int zfs_arc_p_min_shift = 0;
int zfs_disable_dup_eviction = 0;

/*
 * Note that buffers can be in one of 6 states:
 *	ARC_anon	- anonymous (discussed below)
 *	ARC_mru		- recently used, currently cached
 *	ARC_mru_ghost	- recently used, no longer in cache
 *	ARC_mfu		- frequently used, currently cached
 *	ARC_mfu_ghost	- frequently used, no longer in cache
 *	ARC_l2c_only	- exists in L2ARC but not other states
 * When there are no active references to the buffer, they are
 * linked onto a list in one of these arc states.  These are
 * the only buffers that can be evicted or deleted.  Within each
 * state there are multiple lists, one for meta-data and one for
 * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
 * etc.) is tracked separately so that it can be managed more
 * explicitly: favored over data, limited explicitly.
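 *
 * For example (an illustrative walk-through, not a verbatim rule): a
 * block read on demand is first linked into ARC_mru; if it is accessed
 * again it migrates to ARC_mfu; once evicted, its header may linger in
 * ARC_mru_ghost or ARC_mfu_ghost so that a later hit on a ghost list
 * can be used to re-balance the target sizes of the lists.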
 *
 * Anonymous buffers are buffers that are not associated with
 * a DVA.  These are buffers that hold dirty block copies
 * before they are written to stable storage.  By definition,
 * they are "ref'd" and are considered part of arc_mru
 * that cannot be freed.  Generally, they will acquire a DVA
 * as they are written and migrate onto the arc_mru list.
 *
 * The ARC_l2c_only state is for buffers that are in the second
 * level ARC but no longer in any of the ARC_m* lists.  The second
 * level ARC itself may also contain buffers that are in any of
 * the ARC_m* states - meaning that a buffer can exist in two
 * places.  The reason for the ARC_l2c_only state is to keep the
 * buffer header in the hash table, so that reads that hit the
 * second level ARC benefit from these fast lookups.
 */

typedef struct arc_state {
	list_t	arcs_list[ARC_BUFC_NUMTYPES];	/* list of evictable buffers */
	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];	/* amount of evictable data */
	uint64_t arcs_size;	/* total amount of data in this state */
	kmutex_t arcs_mtx;
} arc_state_t;

/* The 6 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru;
static arc_state_t ARC_mru_ghost;
static arc_state_t ARC_mfu;
static arc_state_t ARC_mfu_ghost;
static arc_state_t ARC_l2c_only;

typedef struct arc_stats {
	kstat_named_t	arcstat_hits;
	kstat_named_t	arcstat_misses;
	kstat_named_t	arcstat_demand_data_hits;
	kstat_named_t	arcstat_demand_data_misses;
	kstat_named_t	arcstat_demand_metadata_hits;
	kstat_named_t	arcstat_demand_metadata_misses;
	kstat_named_t	arcstat_prefetch_data_hits;
	kstat_named_t	arcstat_prefetch_data_misses;
	kstat_named_t	arcstat_prefetch_metadata_hits;
	kstat_named_t	arcstat_prefetch_metadata_misses;
	kstat_named_t	arcstat_mru_hits;
	kstat_named_t	arcstat_mru_ghost_hits;
	kstat_named_t	arcstat_mfu_hits;
	kstat_named_t	arcstat_mfu_ghost_hits;
	kstat_named_t	arcstat_deleted;
	kstat_named_t	arcstat_recycle_miss;
	/*
	 * Number of buffers that could not be evicted because the hash lock
	 * was held by another thread.  The lock may not necessarily be held
	 * by something using the same buffer, since hash locks are shared
	 * by multiple buffers.
	 */
	kstat_named_t	arcstat_mutex_miss;
	/*
	 * Number of buffers skipped because they have I/O in progress, are
	 * indirect prefetch buffers that have not lived long enough, or are
	 * not from the spa we're trying to evict from.
270 */ 271 kstat_named_t arcstat_evict_skip; 272 kstat_named_t arcstat_evict_l2_cached; 273 kstat_named_t arcstat_evict_l2_eligible; 274 kstat_named_t arcstat_evict_l2_ineligible; 275 kstat_named_t arcstat_hash_elements; 276 kstat_named_t arcstat_hash_elements_max; 277 kstat_named_t arcstat_hash_collisions; 278 kstat_named_t arcstat_hash_chains; 279 kstat_named_t arcstat_hash_chain_max; 280 kstat_named_t arcstat_p; 281 kstat_named_t arcstat_c; 282 kstat_named_t arcstat_c_min; 283 kstat_named_t arcstat_c_max; 284 kstat_named_t arcstat_size; 285 kstat_named_t arcstat_hdr_size; 286 kstat_named_t arcstat_data_size; 287 kstat_named_t arcstat_other_size; 288 kstat_named_t arcstat_l2_hits; 289 kstat_named_t arcstat_l2_misses; 290 kstat_named_t arcstat_l2_feeds; 291 kstat_named_t arcstat_l2_rw_clash; 292 kstat_named_t arcstat_l2_read_bytes; 293 kstat_named_t arcstat_l2_write_bytes; 294 kstat_named_t arcstat_l2_writes_sent; 295 kstat_named_t arcstat_l2_writes_done; 296 kstat_named_t arcstat_l2_writes_error; 297 kstat_named_t arcstat_l2_evict_lock_retry; 298 kstat_named_t arcstat_l2_evict_reading; 299 kstat_named_t arcstat_l2_free_on_write; 300 kstat_named_t arcstat_l2_abort_lowmem; 301 kstat_named_t arcstat_l2_cksum_bad; 302 kstat_named_t arcstat_l2_io_error; 303 kstat_named_t arcstat_l2_size; 304 kstat_named_t arcstat_l2_asize; 305 kstat_named_t arcstat_l2_hdr_size; 306 kstat_named_t arcstat_l2_compress_successes; 307 kstat_named_t arcstat_l2_compress_zeros; 308 kstat_named_t arcstat_l2_compress_failures; 309 kstat_named_t arcstat_memory_throttle_count; 310 kstat_named_t arcstat_duplicate_buffers; 311 kstat_named_t arcstat_duplicate_buffers_size; 312 kstat_named_t arcstat_duplicate_reads; 313 kstat_named_t arcstat_meta_used; 314 kstat_named_t arcstat_meta_limit; 315 kstat_named_t arcstat_meta_max; 316 } arc_stats_t; 317 318 static arc_stats_t arc_stats = { 319 { "hits", KSTAT_DATA_UINT64 }, 320 { "misses", KSTAT_DATA_UINT64 }, 321 { "demand_data_hits", KSTAT_DATA_UINT64 }, 322 { "demand_data_misses", KSTAT_DATA_UINT64 }, 323 { "demand_metadata_hits", KSTAT_DATA_UINT64 }, 324 { "demand_metadata_misses", KSTAT_DATA_UINT64 }, 325 { "prefetch_data_hits", KSTAT_DATA_UINT64 }, 326 { "prefetch_data_misses", KSTAT_DATA_UINT64 }, 327 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, 328 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, 329 { "mru_hits", KSTAT_DATA_UINT64 }, 330 { "mru_ghost_hits", KSTAT_DATA_UINT64 }, 331 { "mfu_hits", KSTAT_DATA_UINT64 }, 332 { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, 333 { "deleted", KSTAT_DATA_UINT64 }, 334 { "recycle_miss", KSTAT_DATA_UINT64 }, 335 { "mutex_miss", KSTAT_DATA_UINT64 }, 336 { "evict_skip", KSTAT_DATA_UINT64 }, 337 { "evict_l2_cached", KSTAT_DATA_UINT64 }, 338 { "evict_l2_eligible", KSTAT_DATA_UINT64 }, 339 { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, 340 { "hash_elements", KSTAT_DATA_UINT64 }, 341 { "hash_elements_max", KSTAT_DATA_UINT64 }, 342 { "hash_collisions", KSTAT_DATA_UINT64 }, 343 { "hash_chains", KSTAT_DATA_UINT64 }, 344 { "hash_chain_max", KSTAT_DATA_UINT64 }, 345 { "p", KSTAT_DATA_UINT64 }, 346 { "c", KSTAT_DATA_UINT64 }, 347 { "c_min", KSTAT_DATA_UINT64 }, 348 { "c_max", KSTAT_DATA_UINT64 }, 349 { "size", KSTAT_DATA_UINT64 }, 350 { "hdr_size", KSTAT_DATA_UINT64 }, 351 { "data_size", KSTAT_DATA_UINT64 }, 352 { "other_size", KSTAT_DATA_UINT64 }, 353 { "l2_hits", KSTAT_DATA_UINT64 }, 354 { "l2_misses", KSTAT_DATA_UINT64 }, 355 { "l2_feeds", KSTAT_DATA_UINT64 }, 356 { "l2_rw_clash", KSTAT_DATA_UINT64 }, 357 { "l2_read_bytes", KSTAT_DATA_UINT64 }, 
358 { "l2_write_bytes", KSTAT_DATA_UINT64 }, 359 { "l2_writes_sent", KSTAT_DATA_UINT64 }, 360 { "l2_writes_done", KSTAT_DATA_UINT64 }, 361 { "l2_writes_error", KSTAT_DATA_UINT64 }, 362 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, 363 { "l2_evict_reading", KSTAT_DATA_UINT64 }, 364 { "l2_free_on_write", KSTAT_DATA_UINT64 }, 365 { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, 366 { "l2_cksum_bad", KSTAT_DATA_UINT64 }, 367 { "l2_io_error", KSTAT_DATA_UINT64 }, 368 { "l2_size", KSTAT_DATA_UINT64 }, 369 { "l2_asize", KSTAT_DATA_UINT64 }, 370 { "l2_hdr_size", KSTAT_DATA_UINT64 }, 371 { "l2_compress_successes", KSTAT_DATA_UINT64 }, 372 { "l2_compress_zeros", KSTAT_DATA_UINT64 }, 373 { "l2_compress_failures", KSTAT_DATA_UINT64 }, 374 { "memory_throttle_count", KSTAT_DATA_UINT64 }, 375 { "duplicate_buffers", KSTAT_DATA_UINT64 }, 376 { "duplicate_buffers_size", KSTAT_DATA_UINT64 }, 377 { "duplicate_reads", KSTAT_DATA_UINT64 }, 378 { "arc_meta_used", KSTAT_DATA_UINT64 }, 379 { "arc_meta_limit", KSTAT_DATA_UINT64 }, 380 { "arc_meta_max", KSTAT_DATA_UINT64 } 381 }; 382 383 #define ARCSTAT(stat) (arc_stats.stat.value.ui64) 384 385 #define ARCSTAT_INCR(stat, val) \ 386 atomic_add_64(&arc_stats.stat.value.ui64, (val)) 387 388 #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) 389 #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) 390 391 #define ARCSTAT_MAX(stat, val) { \ 392 uint64_t m; \ 393 while ((val) > (m = arc_stats.stat.value.ui64) && \ 394 (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ 395 continue; \ 396 } 397 398 #define ARCSTAT_MAXSTAT(stat) \ 399 ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) 400 401 /* 402 * We define a macro to allow ARC hits/misses to be easily broken down by 403 * two separate conditions, giving a total of four different subtypes for 404 * each of hits and misses (so eight statistics total). 405 */ 406 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ 407 if (cond1) { \ 408 if (cond2) { \ 409 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ 410 } else { \ 411 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ 412 } \ 413 } else { \ 414 if (cond2) { \ 415 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ 416 } else { \ 417 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ 418 } \ 419 } 420 421 kstat_t *arc_ksp; 422 static arc_state_t *arc_anon; 423 static arc_state_t *arc_mru; 424 static arc_state_t *arc_mru_ghost; 425 static arc_state_t *arc_mfu; 426 static arc_state_t *arc_mfu_ghost; 427 static arc_state_t *arc_l2c_only; 428 429 /* 430 * There are several ARC variables that are critical to export as kstats -- 431 * but we don't want to have to grovel around in the kstat whenever we wish to 432 * manipulate them. For these variables, we therefore define them to be in 433 * terms of the statistic variable. This assures that we are not introducing 434 * the possibility of inconsistency by having shadow copies of the variables, 435 * while still allowing the code to be readable. 
436 */ 437 #define arc_size ARCSTAT(arcstat_size) /* actual total arc size */ 438 #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ 439 #define arc_c ARCSTAT(arcstat_c) /* target size of cache */ 440 #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ 441 #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ 442 #define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ 443 #define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */ 444 #define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ 445 446 #define L2ARC_IS_VALID_COMPRESS(_c_) \ 447 ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY) 448 449 static int arc_no_grow; /* Don't try to grow cache size */ 450 static uint64_t arc_tempreserve; 451 static uint64_t arc_loaned_bytes; 452 453 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t; 454 455 typedef struct arc_callback arc_callback_t; 456 457 struct arc_callback { 458 void *acb_private; 459 arc_done_func_t *acb_done; 460 arc_buf_t *acb_buf; 461 zio_t *acb_zio_dummy; 462 arc_callback_t *acb_next; 463 }; 464 465 typedef struct arc_write_callback arc_write_callback_t; 466 467 struct arc_write_callback { 468 void *awcb_private; 469 arc_done_func_t *awcb_ready; 470 arc_done_func_t *awcb_done; 471 arc_buf_t *awcb_buf; 472 }; 473 474 struct arc_buf_hdr { 475 /* protected by hash lock */ 476 dva_t b_dva; 477 uint64_t b_birth; 478 uint64_t b_cksum0; 479 480 kmutex_t b_freeze_lock; 481 zio_cksum_t *b_freeze_cksum; 482 void *b_thawed; 483 484 arc_buf_hdr_t *b_hash_next; 485 arc_buf_t *b_buf; 486 uint32_t b_flags; 487 uint32_t b_datacnt; 488 489 arc_callback_t *b_acb; 490 kcondvar_t b_cv; 491 492 /* immutable */ 493 arc_buf_contents_t b_type; 494 uint64_t b_size; 495 uint64_t b_spa; 496 497 /* protected by arc state mutex */ 498 arc_state_t *b_state; 499 list_node_t b_arc_node; 500 501 /* updated atomically */ 502 clock_t b_arc_access; 503 504 /* self protecting */ 505 refcount_t b_refcnt; 506 507 l2arc_buf_hdr_t *b_l2hdr; 508 list_node_t b_l2node; 509 }; 510 511 static arc_buf_t *arc_eviction_list; 512 static kmutex_t arc_eviction_mtx; 513 static arc_buf_hdr_t arc_eviction_hdr; 514 static void arc_get_data_buf(arc_buf_t *buf); 515 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); 516 static int arc_evict_needed(arc_buf_contents_t type); 517 static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes); 518 static void arc_buf_watch(arc_buf_t *buf); 519 520 static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab); 521 522 #define GHOST_STATE(state) \ 523 ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ 524 (state) == arc_l2c_only) 525 526 /* 527 * Private ARC flags. These flags are private ARC only flags that will show up 528 * in b_flags in the arc_hdr_buf_t. Some flags are publicly declared, and can 529 * be passed in as arc_flags in things like arc_read. However, these flags 530 * should never be passed and should only be set by ARC code. When adding new 531 * public flags, make sure not to smash the private ones. 
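 *
 * For example, a public flag such as ARC_PREFETCH (declared in arc.h and
 * passed in as arc_flags to arc_read()) ends up in the same b_flags word
 * as the private flags below; that is why the private values start at
 * bit 9 and why HDR_PREFETCH(hdr) can simply test b_flags against
 * ARC_PREFETCH.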
532 */ 533 534 #define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */ 535 #define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */ 536 #define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */ 537 #define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ 538 #define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */ 539 #define ARC_INDIRECT (1 << 14) /* this is an indirect block */ 540 #define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */ 541 #define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */ 542 #define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */ 543 #define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */ 544 545 #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) 546 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) 547 #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) 548 #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH) 549 #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) 550 #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) 551 #define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS) 552 #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE) 553 #define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \ 554 (hdr)->b_l2hdr != NULL) 555 #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING) 556 #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED) 557 #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD) 558 559 /* 560 * Other sizes 561 */ 562 563 #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) 564 #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t)) 565 566 /* 567 * Hash table routines 568 */ 569 570 #define HT_LOCK_PAD 64 571 572 struct ht_lock { 573 kmutex_t ht_lock; 574 #ifdef _KERNEL 575 unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; 576 #endif 577 }; 578 579 #define BUF_LOCKS 256 580 typedef struct buf_hash_table { 581 uint64_t ht_mask; 582 arc_buf_hdr_t **ht_table; 583 struct ht_lock ht_locks[BUF_LOCKS]; 584 } buf_hash_table_t; 585 586 static buf_hash_table_t buf_hash_table; 587 588 #define BUF_HASH_INDEX(spa, dva, birth) \ 589 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) 590 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) 591 #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) 592 #define HDR_LOCK(hdr) \ 593 (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) 594 595 uint64_t zfs_crc64_table[256]; 596 597 /* 598 * Level 2 ARC 599 */ 600 601 #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ 602 #define L2ARC_HEADROOM 2 /* num of writes */ 603 /* 604 * If we discover during ARC scan any buffers to be compressed, we boost 605 * our headroom for the next scanning cycle by this percentage multiple. 
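 *
 * For example (illustrative numbers): with l2arc_headroom == 2 and an
 * 8 MB write target, a scan normally looks 2 * 8 MB = 16 MB past the
 * tail of a list; a boost of 200 (i.e. 200%) doubles that to 32 MB on
 * the next cycle.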
606 */ 607 #define L2ARC_HEADROOM_BOOST 200 608 #define L2ARC_FEED_SECS 1 /* caching interval secs */ 609 #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ 610 611 #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) 612 #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) 613 614 /* L2ARC Performance Tunables */ 615 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ 616 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ 617 uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ 618 uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; 619 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ 620 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */ 621 boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ 622 boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */ 623 boolean_t l2arc_norw = B_FALSE; /* no reads during writes */ 624 625 /* 626 * L2ARC Internals 627 */ 628 typedef struct l2arc_dev { 629 vdev_t *l2ad_vdev; /* vdev */ 630 spa_t *l2ad_spa; /* spa */ 631 uint64_t l2ad_hand; /* next write location */ 632 uint64_t l2ad_start; /* first addr on device */ 633 uint64_t l2ad_end; /* last addr on device */ 634 uint64_t l2ad_evict; /* last addr eviction reached */ 635 boolean_t l2ad_first; /* first sweep through */ 636 boolean_t l2ad_writing; /* currently writing */ 637 list_t *l2ad_buflist; /* buffer list */ 638 list_node_t l2ad_node; /* device list node */ 639 } l2arc_dev_t; 640 641 static list_t L2ARC_dev_list; /* device list */ 642 static list_t *l2arc_dev_list; /* device list pointer */ 643 static kmutex_t l2arc_dev_mtx; /* device list mutex */ 644 static l2arc_dev_t *l2arc_dev_last; /* last device used */ 645 static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */ 646 static list_t L2ARC_free_on_write; /* free after write buf list */ 647 static list_t *l2arc_free_on_write; /* free after write list ptr */ 648 static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ 649 static uint64_t l2arc_ndev; /* number of devices */ 650 651 typedef struct l2arc_read_callback { 652 arc_buf_t *l2rcb_buf; /* read buffer */ 653 spa_t *l2rcb_spa; /* spa */ 654 blkptr_t l2rcb_bp; /* original blkptr */ 655 zbookmark_t l2rcb_zb; /* original bookmark */ 656 int l2rcb_flags; /* original flags */ 657 enum zio_compress l2rcb_compress; /* applied compress */ 658 } l2arc_read_callback_t; 659 660 typedef struct l2arc_write_callback { 661 l2arc_dev_t *l2wcb_dev; /* device info */ 662 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ 663 } l2arc_write_callback_t; 664 665 struct l2arc_buf_hdr { 666 /* protected by arc_buf_hdr mutex */ 667 l2arc_dev_t *b_dev; /* L2ARC device */ 668 uint64_t b_daddr; /* disk address, offset byte */ 669 /* compression applied to buffer data */ 670 enum zio_compress b_compress; 671 /* real alloc'd buffer size depending on b_compress applied */ 672 int b_asize; 673 /* temporary buffer holder for in-flight compressed data */ 674 void *b_tmp_cdata; 675 }; 676 677 typedef struct l2arc_data_free { 678 /* protected by l2arc_free_on_write_mtx */ 679 void *l2df_data; 680 size_t l2df_size; 681 void (*l2df_func)(void *, size_t); 682 list_node_t l2df_list_node; 683 } l2arc_data_free_t; 684 685 static kmutex_t l2arc_feed_thr_lock; 686 static kcondvar_t l2arc_feed_thr_cv; 687 static uint8_t l2arc_thread_exit; 688 689 static void l2arc_read_done(zio_t *zio); 690 static void l2arc_hdr_stat_add(void); 691 static void 
l2arc_hdr_stat_remove(void); 692 693 static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr); 694 static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, 695 enum zio_compress c); 696 static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab); 697 698 static uint64_t 699 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) 700 { 701 uint8_t *vdva = (uint8_t *)dva; 702 uint64_t crc = -1ULL; 703 int i; 704 705 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 706 707 for (i = 0; i < sizeof (dva_t); i++) 708 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; 709 710 crc ^= (spa>>8) ^ birth; 711 712 return (crc); 713 } 714 715 #define BUF_EMPTY(buf) \ 716 ((buf)->b_dva.dva_word[0] == 0 && \ 717 (buf)->b_dva.dva_word[1] == 0 && \ 718 (buf)->b_birth == 0) 719 720 #define BUF_EQUAL(spa, dva, birth, buf) \ 721 ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ 722 ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ 723 ((buf)->b_birth == birth) && ((buf)->b_spa == spa) 724 725 static void 726 buf_discard_identity(arc_buf_hdr_t *hdr) 727 { 728 hdr->b_dva.dva_word[0] = 0; 729 hdr->b_dva.dva_word[1] = 0; 730 hdr->b_birth = 0; 731 hdr->b_cksum0 = 0; 732 } 733 734 static arc_buf_hdr_t * 735 buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp) 736 { 737 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); 738 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 739 arc_buf_hdr_t *buf; 740 741 mutex_enter(hash_lock); 742 for (buf = buf_hash_table.ht_table[idx]; buf != NULL; 743 buf = buf->b_hash_next) { 744 if (BUF_EQUAL(spa, dva, birth, buf)) { 745 *lockp = hash_lock; 746 return (buf); 747 } 748 } 749 mutex_exit(hash_lock); 750 *lockp = NULL; 751 return (NULL); 752 } 753 754 /* 755 * Insert an entry into the hash table. If there is already an element 756 * equal to elem in the hash table, then the already existing element 757 * will be returned and the new element will not be inserted. 758 * Otherwise returns NULL. 
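 *
 * In either case the hash chain's mutex is returned held through lockp,
 * and it is the caller's responsibility to drop it.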
759 */ 760 static arc_buf_hdr_t * 761 buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp) 762 { 763 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 764 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 765 arc_buf_hdr_t *fbuf; 766 uint32_t i; 767 768 ASSERT(!HDR_IN_HASH_TABLE(buf)); 769 *lockp = hash_lock; 770 mutex_enter(hash_lock); 771 for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL; 772 fbuf = fbuf->b_hash_next, i++) { 773 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf)) 774 return (fbuf); 775 } 776 777 buf->b_hash_next = buf_hash_table.ht_table[idx]; 778 buf_hash_table.ht_table[idx] = buf; 779 buf->b_flags |= ARC_IN_HASH_TABLE; 780 781 /* collect some hash table performance data */ 782 if (i > 0) { 783 ARCSTAT_BUMP(arcstat_hash_collisions); 784 if (i == 1) 785 ARCSTAT_BUMP(arcstat_hash_chains); 786 787 ARCSTAT_MAX(arcstat_hash_chain_max, i); 788 } 789 790 ARCSTAT_BUMP(arcstat_hash_elements); 791 ARCSTAT_MAXSTAT(arcstat_hash_elements); 792 793 return (NULL); 794 } 795 796 static void 797 buf_hash_remove(arc_buf_hdr_t *buf) 798 { 799 arc_buf_hdr_t *fbuf, **bufp; 800 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 801 802 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); 803 ASSERT(HDR_IN_HASH_TABLE(buf)); 804 805 bufp = &buf_hash_table.ht_table[idx]; 806 while ((fbuf = *bufp) != buf) { 807 ASSERT(fbuf != NULL); 808 bufp = &fbuf->b_hash_next; 809 } 810 *bufp = buf->b_hash_next; 811 buf->b_hash_next = NULL; 812 buf->b_flags &= ~ARC_IN_HASH_TABLE; 813 814 /* collect some hash table performance data */ 815 ARCSTAT_BUMPDOWN(arcstat_hash_elements); 816 817 if (buf_hash_table.ht_table[idx] && 818 buf_hash_table.ht_table[idx]->b_hash_next == NULL) 819 ARCSTAT_BUMPDOWN(arcstat_hash_chains); 820 } 821 822 /* 823 * Global data structures and functions for the buf kmem cache. 824 */ 825 static kmem_cache_t *hdr_cache; 826 static kmem_cache_t *buf_cache; 827 828 static void 829 buf_fini(void) 830 { 831 int i; 832 833 kmem_free(buf_hash_table.ht_table, 834 (buf_hash_table.ht_mask + 1) * sizeof (void *)); 835 for (i = 0; i < BUF_LOCKS; i++) 836 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); 837 kmem_cache_destroy(hdr_cache); 838 kmem_cache_destroy(buf_cache); 839 } 840 841 /* 842 * Constructor callback - called when the cache is empty 843 * and a new buf is requested. 844 */ 845 /* ARGSUSED */ 846 static int 847 hdr_cons(void *vbuf, void *unused, int kmflag) 848 { 849 arc_buf_hdr_t *buf = vbuf; 850 851 bzero(buf, sizeof (arc_buf_hdr_t)); 852 refcount_create(&buf->b_refcnt); 853 cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); 854 mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); 855 arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); 856 857 return (0); 858 } 859 860 /* ARGSUSED */ 861 static int 862 buf_cons(void *vbuf, void *unused, int kmflag) 863 { 864 arc_buf_t *buf = vbuf; 865 866 bzero(buf, sizeof (arc_buf_t)); 867 mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); 868 arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); 869 870 return (0); 871 } 872 873 /* 874 * Destructor callback - called when a cached buf is 875 * no longer required. 
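 *
 * (hdr_cons/hdr_dest and buf_cons/buf_dest are the paired constructor and
 * destructor callbacks registered with kmem_cache_create() in buf_init()
 * below.)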
876 */ 877 /* ARGSUSED */ 878 static void 879 hdr_dest(void *vbuf, void *unused) 880 { 881 arc_buf_hdr_t *buf = vbuf; 882 883 ASSERT(BUF_EMPTY(buf)); 884 refcount_destroy(&buf->b_refcnt); 885 cv_destroy(&buf->b_cv); 886 mutex_destroy(&buf->b_freeze_lock); 887 arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); 888 } 889 890 /* ARGSUSED */ 891 static void 892 buf_dest(void *vbuf, void *unused) 893 { 894 arc_buf_t *buf = vbuf; 895 896 mutex_destroy(&buf->b_evict_lock); 897 arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); 898 } 899 900 /* 901 * Reclaim callback -- invoked when memory is low. 902 */ 903 /* ARGSUSED */ 904 static void 905 hdr_recl(void *unused) 906 { 907 dprintf("hdr_recl called\n"); 908 /* 909 * umem calls the reclaim func when we destroy the buf cache, 910 * which is after we do arc_fini(). 911 */ 912 if (!arc_dead) 913 cv_signal(&arc_reclaim_thr_cv); 914 } 915 916 static void 917 buf_init(void) 918 { 919 uint64_t *ct; 920 uint64_t hsize = 1ULL << 12; 921 int i, j; 922 923 /* 924 * The hash table is big enough to fill all of physical memory 925 * with an average 64K block size. The table will take up 926 * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers). 927 */ 928 while (hsize * 65536 < physmem * PAGESIZE) 929 hsize <<= 1; 930 retry: 931 buf_hash_table.ht_mask = hsize - 1; 932 buf_hash_table.ht_table = 933 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); 934 if (buf_hash_table.ht_table == NULL) { 935 ASSERT(hsize > (1ULL << 8)); 936 hsize >>= 1; 937 goto retry; 938 } 939 940 hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), 941 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0); 942 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 943 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); 944 945 for (i = 0; i < 256; i++) 946 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 947 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 948 949 for (i = 0; i < BUF_LOCKS; i++) { 950 mutex_init(&buf_hash_table.ht_locks[i].ht_lock, 951 NULL, MUTEX_DEFAULT, NULL); 952 } 953 } 954 955 #define ARC_MINTIME (hz>>4) /* 62 ms */ 956 957 static void 958 arc_cksum_verify(arc_buf_t *buf) 959 { 960 zio_cksum_t zc; 961 962 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 963 return; 964 965 mutex_enter(&buf->b_hdr->b_freeze_lock); 966 if (buf->b_hdr->b_freeze_cksum == NULL || 967 (buf->b_hdr->b_flags & ARC_IO_ERROR)) { 968 mutex_exit(&buf->b_hdr->b_freeze_lock); 969 return; 970 } 971 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 972 if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) 973 panic("buffer modified while frozen!"); 974 mutex_exit(&buf->b_hdr->b_freeze_lock); 975 } 976 977 static int 978 arc_cksum_equal(arc_buf_t *buf) 979 { 980 zio_cksum_t zc; 981 int equal; 982 983 mutex_enter(&buf->b_hdr->b_freeze_lock); 984 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 985 equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); 986 mutex_exit(&buf->b_hdr->b_freeze_lock); 987 988 return (equal); 989 } 990 991 static void 992 arc_cksum_compute(arc_buf_t *buf, boolean_t force) 993 { 994 if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) 995 return; 996 997 mutex_enter(&buf->b_hdr->b_freeze_lock); 998 if (buf->b_hdr->b_freeze_cksum != NULL) { 999 mutex_exit(&buf->b_hdr->b_freeze_lock); 1000 return; 1001 } 1002 buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); 1003 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, 1004 buf->b_hdr->b_freeze_cksum); 1005 mutex_exit(&buf->b_hdr->b_freeze_lock); 1006 
arc_buf_watch(buf); 1007 } 1008 1009 #ifndef _KERNEL 1010 typedef struct procctl { 1011 long cmd; 1012 prwatch_t prwatch; 1013 } procctl_t; 1014 #endif 1015 1016 /* ARGSUSED */ 1017 static void 1018 arc_buf_unwatch(arc_buf_t *buf) 1019 { 1020 #ifndef _KERNEL 1021 if (arc_watch) { 1022 int result; 1023 procctl_t ctl; 1024 ctl.cmd = PCWATCH; 1025 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1026 ctl.prwatch.pr_size = 0; 1027 ctl.prwatch.pr_wflags = 0; 1028 result = write(arc_procfd, &ctl, sizeof (ctl)); 1029 ASSERT3U(result, ==, sizeof (ctl)); 1030 } 1031 #endif 1032 } 1033 1034 /* ARGSUSED */ 1035 static void 1036 arc_buf_watch(arc_buf_t *buf) 1037 { 1038 #ifndef _KERNEL 1039 if (arc_watch) { 1040 int result; 1041 procctl_t ctl; 1042 ctl.cmd = PCWATCH; 1043 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1044 ctl.prwatch.pr_size = buf->b_hdr->b_size; 1045 ctl.prwatch.pr_wflags = WA_WRITE; 1046 result = write(arc_procfd, &ctl, sizeof (ctl)); 1047 ASSERT3U(result, ==, sizeof (ctl)); 1048 } 1049 #endif 1050 } 1051 1052 void 1053 arc_buf_thaw(arc_buf_t *buf) 1054 { 1055 if (zfs_flags & ZFS_DEBUG_MODIFY) { 1056 if (buf->b_hdr->b_state != arc_anon) 1057 panic("modifying non-anon buffer!"); 1058 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) 1059 panic("modifying buffer while i/o in progress!"); 1060 arc_cksum_verify(buf); 1061 } 1062 1063 mutex_enter(&buf->b_hdr->b_freeze_lock); 1064 if (buf->b_hdr->b_freeze_cksum != NULL) { 1065 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 1066 buf->b_hdr->b_freeze_cksum = NULL; 1067 } 1068 1069 if (zfs_flags & ZFS_DEBUG_MODIFY) { 1070 if (buf->b_hdr->b_thawed) 1071 kmem_free(buf->b_hdr->b_thawed, 1); 1072 buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP); 1073 } 1074 1075 mutex_exit(&buf->b_hdr->b_freeze_lock); 1076 1077 arc_buf_unwatch(buf); 1078 } 1079 1080 void 1081 arc_buf_freeze(arc_buf_t *buf) 1082 { 1083 kmutex_t *hash_lock; 1084 1085 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1086 return; 1087 1088 hash_lock = HDR_LOCK(buf->b_hdr); 1089 mutex_enter(hash_lock); 1090 1091 ASSERT(buf->b_hdr->b_freeze_cksum != NULL || 1092 buf->b_hdr->b_state == arc_anon); 1093 arc_cksum_compute(buf, B_FALSE); 1094 mutex_exit(hash_lock); 1095 1096 } 1097 1098 static void 1099 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) 1100 { 1101 ASSERT(MUTEX_HELD(hash_lock)); 1102 1103 if ((refcount_add(&ab->b_refcnt, tag) == 1) && 1104 (ab->b_state != arc_anon)) { 1105 uint64_t delta = ab->b_size * ab->b_datacnt; 1106 list_t *list = &ab->b_state->arcs_list[ab->b_type]; 1107 uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type]; 1108 1109 ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx)); 1110 mutex_enter(&ab->b_state->arcs_mtx); 1111 ASSERT(list_link_active(&ab->b_arc_node)); 1112 list_remove(list, ab); 1113 if (GHOST_STATE(ab->b_state)) { 1114 ASSERT0(ab->b_datacnt); 1115 ASSERT3P(ab->b_buf, ==, NULL); 1116 delta = ab->b_size; 1117 } 1118 ASSERT(delta > 0); 1119 ASSERT3U(*size, >=, delta); 1120 atomic_add_64(size, -delta); 1121 mutex_exit(&ab->b_state->arcs_mtx); 1122 /* remove the prefetch flag if we get a reference */ 1123 if (ab->b_flags & ARC_PREFETCH) 1124 ab->b_flags &= ~ARC_PREFETCH; 1125 } 1126 } 1127 1128 static int 1129 remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) 1130 { 1131 int cnt; 1132 arc_state_t *state = ab->b_state; 1133 1134 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); 1135 ASSERT(!GHOST_STATE(state)); 1136 1137 if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) && 1138 (state != arc_anon)) { 1139 uint64_t *size = 
&state->arcs_lsize[ab->b_type]; 1140 1141 ASSERT(!MUTEX_HELD(&state->arcs_mtx)); 1142 mutex_enter(&state->arcs_mtx); 1143 ASSERT(!list_link_active(&ab->b_arc_node)); 1144 list_insert_head(&state->arcs_list[ab->b_type], ab); 1145 ASSERT(ab->b_datacnt > 0); 1146 atomic_add_64(size, ab->b_size * ab->b_datacnt); 1147 mutex_exit(&state->arcs_mtx); 1148 } 1149 return (cnt); 1150 } 1151 1152 /* 1153 * Move the supplied buffer to the indicated state. The mutex 1154 * for the buffer must be held by the caller. 1155 */ 1156 static void 1157 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) 1158 { 1159 arc_state_t *old_state = ab->b_state; 1160 int64_t refcnt = refcount_count(&ab->b_refcnt); 1161 uint64_t from_delta, to_delta; 1162 1163 ASSERT(MUTEX_HELD(hash_lock)); 1164 ASSERT(new_state != old_state); 1165 ASSERT(refcnt == 0 || ab->b_datacnt > 0); 1166 ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state)); 1167 ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon); 1168 1169 from_delta = to_delta = ab->b_datacnt * ab->b_size; 1170 1171 /* 1172 * If this buffer is evictable, transfer it from the 1173 * old state list to the new state list. 1174 */ 1175 if (refcnt == 0) { 1176 if (old_state != arc_anon) { 1177 int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx); 1178 uint64_t *size = &old_state->arcs_lsize[ab->b_type]; 1179 1180 if (use_mutex) 1181 mutex_enter(&old_state->arcs_mtx); 1182 1183 ASSERT(list_link_active(&ab->b_arc_node)); 1184 list_remove(&old_state->arcs_list[ab->b_type], ab); 1185 1186 /* 1187 * If prefetching out of the ghost cache, 1188 * we will have a non-zero datacnt. 1189 */ 1190 if (GHOST_STATE(old_state) && ab->b_datacnt == 0) { 1191 /* ghost elements have a ghost size */ 1192 ASSERT(ab->b_buf == NULL); 1193 from_delta = ab->b_size; 1194 } 1195 ASSERT3U(*size, >=, from_delta); 1196 atomic_add_64(size, -from_delta); 1197 1198 if (use_mutex) 1199 mutex_exit(&old_state->arcs_mtx); 1200 } 1201 if (new_state != arc_anon) { 1202 int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx); 1203 uint64_t *size = &new_state->arcs_lsize[ab->b_type]; 1204 1205 if (use_mutex) 1206 mutex_enter(&new_state->arcs_mtx); 1207 1208 list_insert_head(&new_state->arcs_list[ab->b_type], ab); 1209 1210 /* ghost elements have a ghost size */ 1211 if (GHOST_STATE(new_state)) { 1212 ASSERT(ab->b_datacnt == 0); 1213 ASSERT(ab->b_buf == NULL); 1214 to_delta = ab->b_size; 1215 } 1216 atomic_add_64(size, to_delta); 1217 1218 if (use_mutex) 1219 mutex_exit(&new_state->arcs_mtx); 1220 } 1221 } 1222 1223 ASSERT(!BUF_EMPTY(ab)); 1224 if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab)) 1225 buf_hash_remove(ab); 1226 1227 /* adjust state sizes */ 1228 if (to_delta) 1229 atomic_add_64(&new_state->arcs_size, to_delta); 1230 if (from_delta) { 1231 ASSERT3U(old_state->arcs_size, >=, from_delta); 1232 atomic_add_64(&old_state->arcs_size, -from_delta); 1233 } 1234 ab->b_state = new_state; 1235 1236 /* adjust l2arc hdr stats */ 1237 if (new_state == arc_l2c_only) 1238 l2arc_hdr_stat_add(); 1239 else if (old_state == arc_l2c_only) 1240 l2arc_hdr_stat_remove(); 1241 } 1242 1243 void 1244 arc_space_consume(uint64_t space, arc_space_type_t type) 1245 { 1246 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 1247 1248 switch (type) { 1249 case ARC_SPACE_DATA: 1250 ARCSTAT_INCR(arcstat_data_size, space); 1251 break; 1252 case ARC_SPACE_OTHER: 1253 ARCSTAT_INCR(arcstat_other_size, space); 1254 break; 1255 case ARC_SPACE_HDRS: 1256 ARCSTAT_INCR(arcstat_hdr_size, space); 1257 break; 1258 case ARC_SPACE_L2HDRS: 1259 
ARCSTAT_INCR(arcstat_l2_hdr_size, space); 1260 break; 1261 } 1262 1263 ARCSTAT_INCR(arcstat_meta_used, space); 1264 atomic_add_64(&arc_size, space); 1265 } 1266 1267 void 1268 arc_space_return(uint64_t space, arc_space_type_t type) 1269 { 1270 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 1271 1272 switch (type) { 1273 case ARC_SPACE_DATA: 1274 ARCSTAT_INCR(arcstat_data_size, -space); 1275 break; 1276 case ARC_SPACE_OTHER: 1277 ARCSTAT_INCR(arcstat_other_size, -space); 1278 break; 1279 case ARC_SPACE_HDRS: 1280 ARCSTAT_INCR(arcstat_hdr_size, -space); 1281 break; 1282 case ARC_SPACE_L2HDRS: 1283 ARCSTAT_INCR(arcstat_l2_hdr_size, -space); 1284 break; 1285 } 1286 1287 ASSERT(arc_meta_used >= space); 1288 if (arc_meta_max < arc_meta_used) 1289 arc_meta_max = arc_meta_used; 1290 ARCSTAT_INCR(arcstat_meta_used, -space); 1291 ASSERT(arc_size >= space); 1292 atomic_add_64(&arc_size, -space); 1293 } 1294 1295 void * 1296 arc_data_buf_alloc(uint64_t size) 1297 { 1298 if (arc_evict_needed(ARC_BUFC_DATA)) 1299 cv_signal(&arc_reclaim_thr_cv); 1300 atomic_add_64(&arc_size, size); 1301 return (zio_data_buf_alloc(size)); 1302 } 1303 1304 void 1305 arc_data_buf_free(void *buf, uint64_t size) 1306 { 1307 zio_data_buf_free(buf, size); 1308 ASSERT(arc_size >= size); 1309 atomic_add_64(&arc_size, -size); 1310 } 1311 1312 arc_buf_t * 1313 arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) 1314 { 1315 arc_buf_hdr_t *hdr; 1316 arc_buf_t *buf; 1317 1318 ASSERT3U(size, >, 0); 1319 hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); 1320 ASSERT(BUF_EMPTY(hdr)); 1321 hdr->b_size = size; 1322 hdr->b_type = type; 1323 hdr->b_spa = spa_load_guid(spa); 1324 hdr->b_state = arc_anon; 1325 hdr->b_arc_access = 0; 1326 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 1327 buf->b_hdr = hdr; 1328 buf->b_data = NULL; 1329 buf->b_efunc = NULL; 1330 buf->b_private = NULL; 1331 buf->b_next = NULL; 1332 hdr->b_buf = buf; 1333 arc_get_data_buf(buf); 1334 hdr->b_datacnt = 1; 1335 hdr->b_flags = 0; 1336 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1337 (void) refcount_add(&hdr->b_refcnt, tag); 1338 1339 return (buf); 1340 } 1341 1342 static char *arc_onloan_tag = "onloan"; 1343 1344 /* 1345 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in 1346 * flight data by arc_tempreserve_space() until they are "returned". Loaned 1347 * buffers must be returned to the arc before they can be used by the DMU or 1348 * freed. 1349 */ 1350 arc_buf_t * 1351 arc_loan_buf(spa_t *spa, int size) 1352 { 1353 arc_buf_t *buf; 1354 1355 buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA); 1356 1357 atomic_add_64(&arc_loaned_bytes, size); 1358 return (buf); 1359 } 1360 1361 /* 1362 * Return a loaned arc buffer to the arc. 
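 *
 * A minimal usage sketch (caller-side pseudo-code, with src, size and
 * tag standing in for the caller's own state):
 *
 *	arc_buf_t *abuf = arc_loan_buf(spa, size);
 *	bcopy(src, abuf->b_data, size);
 *	arc_return_buf(abuf, tag);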
1363 */ 1364 void 1365 arc_return_buf(arc_buf_t *buf, void *tag) 1366 { 1367 arc_buf_hdr_t *hdr = buf->b_hdr; 1368 1369 ASSERT(buf->b_data != NULL); 1370 (void) refcount_add(&hdr->b_refcnt, tag); 1371 (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag); 1372 1373 atomic_add_64(&arc_loaned_bytes, -hdr->b_size); 1374 } 1375 1376 /* Detach an arc_buf from a dbuf (tag) */ 1377 void 1378 arc_loan_inuse_buf(arc_buf_t *buf, void *tag) 1379 { 1380 arc_buf_hdr_t *hdr; 1381 1382 ASSERT(buf->b_data != NULL); 1383 hdr = buf->b_hdr; 1384 (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag); 1385 (void) refcount_remove(&hdr->b_refcnt, tag); 1386 buf->b_efunc = NULL; 1387 buf->b_private = NULL; 1388 1389 atomic_add_64(&arc_loaned_bytes, hdr->b_size); 1390 } 1391 1392 static arc_buf_t * 1393 arc_buf_clone(arc_buf_t *from) 1394 { 1395 arc_buf_t *buf; 1396 arc_buf_hdr_t *hdr = from->b_hdr; 1397 uint64_t size = hdr->b_size; 1398 1399 ASSERT(hdr->b_state != arc_anon); 1400 1401 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 1402 buf->b_hdr = hdr; 1403 buf->b_data = NULL; 1404 buf->b_efunc = NULL; 1405 buf->b_private = NULL; 1406 buf->b_next = hdr->b_buf; 1407 hdr->b_buf = buf; 1408 arc_get_data_buf(buf); 1409 bcopy(from->b_data, buf->b_data, size); 1410 1411 /* 1412 * This buffer already exists in the arc so create a duplicate 1413 * copy for the caller. If the buffer is associated with user data 1414 * then track the size and number of duplicates. These stats will be 1415 * updated as duplicate buffers are created and destroyed. 1416 */ 1417 if (hdr->b_type == ARC_BUFC_DATA) { 1418 ARCSTAT_BUMP(arcstat_duplicate_buffers); 1419 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); 1420 } 1421 hdr->b_datacnt += 1; 1422 return (buf); 1423 } 1424 1425 void 1426 arc_buf_add_ref(arc_buf_t *buf, void* tag) 1427 { 1428 arc_buf_hdr_t *hdr; 1429 kmutex_t *hash_lock; 1430 1431 /* 1432 * Check to see if this buffer is evicted. Callers 1433 * must verify b_data != NULL to know if the add_ref 1434 * was successful. 1435 */ 1436 mutex_enter(&buf->b_evict_lock); 1437 if (buf->b_data == NULL) { 1438 mutex_exit(&buf->b_evict_lock); 1439 return; 1440 } 1441 hash_lock = HDR_LOCK(buf->b_hdr); 1442 mutex_enter(hash_lock); 1443 hdr = buf->b_hdr; 1444 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 1445 mutex_exit(&buf->b_evict_lock); 1446 1447 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 1448 add_reference(hdr, hash_lock, tag); 1449 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 1450 arc_access(hdr, hash_lock); 1451 mutex_exit(hash_lock); 1452 ARCSTAT_BUMP(arcstat_hits); 1453 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 1454 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 1455 data, metadata, hits); 1456 } 1457 1458 /* 1459 * Free the arc data buffer. If it is an l2arc write in progress, 1460 * the buffer is placed on l2arc_free_on_write to be freed later. 
1461 */ 1462 static void 1463 arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t)) 1464 { 1465 arc_buf_hdr_t *hdr = buf->b_hdr; 1466 1467 if (HDR_L2_WRITING(hdr)) { 1468 l2arc_data_free_t *df; 1469 df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP); 1470 df->l2df_data = buf->b_data; 1471 df->l2df_size = hdr->b_size; 1472 df->l2df_func = free_func; 1473 mutex_enter(&l2arc_free_on_write_mtx); 1474 list_insert_head(l2arc_free_on_write, df); 1475 mutex_exit(&l2arc_free_on_write_mtx); 1476 ARCSTAT_BUMP(arcstat_l2_free_on_write); 1477 } else { 1478 free_func(buf->b_data, hdr->b_size); 1479 } 1480 } 1481 1482 static void 1483 arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) 1484 { 1485 arc_buf_t **bufp; 1486 1487 /* free up data associated with the buf */ 1488 if (buf->b_data) { 1489 arc_state_t *state = buf->b_hdr->b_state; 1490 uint64_t size = buf->b_hdr->b_size; 1491 arc_buf_contents_t type = buf->b_hdr->b_type; 1492 1493 arc_cksum_verify(buf); 1494 arc_buf_unwatch(buf); 1495 1496 if (!recycle) { 1497 if (type == ARC_BUFC_METADATA) { 1498 arc_buf_data_free(buf, zio_buf_free); 1499 arc_space_return(size, ARC_SPACE_DATA); 1500 } else { 1501 ASSERT(type == ARC_BUFC_DATA); 1502 arc_buf_data_free(buf, zio_data_buf_free); 1503 ARCSTAT_INCR(arcstat_data_size, -size); 1504 atomic_add_64(&arc_size, -size); 1505 } 1506 } 1507 if (list_link_active(&buf->b_hdr->b_arc_node)) { 1508 uint64_t *cnt = &state->arcs_lsize[type]; 1509 1510 ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt)); 1511 ASSERT(state != arc_anon); 1512 1513 ASSERT3U(*cnt, >=, size); 1514 atomic_add_64(cnt, -size); 1515 } 1516 ASSERT3U(state->arcs_size, >=, size); 1517 atomic_add_64(&state->arcs_size, -size); 1518 buf->b_data = NULL; 1519 1520 /* 1521 * If we're destroying a duplicate buffer make sure 1522 * that the appropriate statistics are updated. 1523 */ 1524 if (buf->b_hdr->b_datacnt > 1 && 1525 buf->b_hdr->b_type == ARC_BUFC_DATA) { 1526 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); 1527 ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size); 1528 } 1529 ASSERT(buf->b_hdr->b_datacnt > 0); 1530 buf->b_hdr->b_datacnt -= 1; 1531 } 1532 1533 /* only remove the buf if requested */ 1534 if (!all) 1535 return; 1536 1537 /* remove the buf from the hdr list */ 1538 for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next) 1539 continue; 1540 *bufp = buf->b_next; 1541 buf->b_next = NULL; 1542 1543 ASSERT(buf->b_efunc == NULL); 1544 1545 /* clean up the buf */ 1546 buf->b_hdr = NULL; 1547 kmem_cache_free(buf_cache, buf); 1548 } 1549 1550 static void 1551 arc_hdr_destroy(arc_buf_hdr_t *hdr) 1552 { 1553 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1554 ASSERT3P(hdr->b_state, ==, arc_anon); 1555 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 1556 l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr; 1557 1558 if (l2hdr != NULL) { 1559 boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx); 1560 /* 1561 * To prevent arc_free() and l2arc_evict() from 1562 * attempting to free the same buffer at the same time, 1563 * a FREE_IN_PROGRESS flag is given to arc_free() to 1564 * give it priority. l2arc_evict() can't destroy this 1565 * header while we are waiting on l2arc_buflist_mtx. 1566 * 1567 * The hdr may be removed from l2ad_buflist before we 1568 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked. 
1569 */ 1570 if (!buflist_held) { 1571 mutex_enter(&l2arc_buflist_mtx); 1572 l2hdr = hdr->b_l2hdr; 1573 } 1574 1575 if (l2hdr != NULL) { 1576 list_remove(l2hdr->b_dev->l2ad_buflist, hdr); 1577 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 1578 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); 1579 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); 1580 if (hdr->b_state == arc_l2c_only) 1581 l2arc_hdr_stat_remove(); 1582 hdr->b_l2hdr = NULL; 1583 } 1584 1585 if (!buflist_held) 1586 mutex_exit(&l2arc_buflist_mtx); 1587 } 1588 1589 if (!BUF_EMPTY(hdr)) { 1590 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 1591 buf_discard_identity(hdr); 1592 } 1593 while (hdr->b_buf) { 1594 arc_buf_t *buf = hdr->b_buf; 1595 1596 if (buf->b_efunc) { 1597 mutex_enter(&arc_eviction_mtx); 1598 mutex_enter(&buf->b_evict_lock); 1599 ASSERT(buf->b_hdr != NULL); 1600 arc_buf_destroy(hdr->b_buf, FALSE, FALSE); 1601 hdr->b_buf = buf->b_next; 1602 buf->b_hdr = &arc_eviction_hdr; 1603 buf->b_next = arc_eviction_list; 1604 arc_eviction_list = buf; 1605 mutex_exit(&buf->b_evict_lock); 1606 mutex_exit(&arc_eviction_mtx); 1607 } else { 1608 arc_buf_destroy(hdr->b_buf, FALSE, TRUE); 1609 } 1610 } 1611 if (hdr->b_freeze_cksum != NULL) { 1612 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 1613 hdr->b_freeze_cksum = NULL; 1614 } 1615 if (hdr->b_thawed) { 1616 kmem_free(hdr->b_thawed, 1); 1617 hdr->b_thawed = NULL; 1618 } 1619 1620 ASSERT(!list_link_active(&hdr->b_arc_node)); 1621 ASSERT3P(hdr->b_hash_next, ==, NULL); 1622 ASSERT3P(hdr->b_acb, ==, NULL); 1623 kmem_cache_free(hdr_cache, hdr); 1624 } 1625 1626 void 1627 arc_buf_free(arc_buf_t *buf, void *tag) 1628 { 1629 arc_buf_hdr_t *hdr = buf->b_hdr; 1630 int hashed = hdr->b_state != arc_anon; 1631 1632 ASSERT(buf->b_efunc == NULL); 1633 ASSERT(buf->b_data != NULL); 1634 1635 if (hashed) { 1636 kmutex_t *hash_lock = HDR_LOCK(hdr); 1637 1638 mutex_enter(hash_lock); 1639 hdr = buf->b_hdr; 1640 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 1641 1642 (void) remove_reference(hdr, hash_lock, tag); 1643 if (hdr->b_datacnt > 1) { 1644 arc_buf_destroy(buf, FALSE, TRUE); 1645 } else { 1646 ASSERT(buf == hdr->b_buf); 1647 ASSERT(buf->b_efunc == NULL); 1648 hdr->b_flags |= ARC_BUF_AVAILABLE; 1649 } 1650 mutex_exit(hash_lock); 1651 } else if (HDR_IO_IN_PROGRESS(hdr)) { 1652 int destroy_hdr; 1653 /* 1654 * We are in the middle of an async write. Don't destroy 1655 * this buffer unless the write completes before we finish 1656 * decrementing the reference count. 
1657 */ 1658 mutex_enter(&arc_eviction_mtx); 1659 (void) remove_reference(hdr, NULL, tag); 1660 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1661 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); 1662 mutex_exit(&arc_eviction_mtx); 1663 if (destroy_hdr) 1664 arc_hdr_destroy(hdr); 1665 } else { 1666 if (remove_reference(hdr, NULL, tag) > 0) 1667 arc_buf_destroy(buf, FALSE, TRUE); 1668 else 1669 arc_hdr_destroy(hdr); 1670 } 1671 } 1672 1673 boolean_t 1674 arc_buf_remove_ref(arc_buf_t *buf, void* tag) 1675 { 1676 arc_buf_hdr_t *hdr = buf->b_hdr; 1677 kmutex_t *hash_lock = HDR_LOCK(hdr); 1678 boolean_t no_callback = (buf->b_efunc == NULL); 1679 1680 if (hdr->b_state == arc_anon) { 1681 ASSERT(hdr->b_datacnt == 1); 1682 arc_buf_free(buf, tag); 1683 return (no_callback); 1684 } 1685 1686 mutex_enter(hash_lock); 1687 hdr = buf->b_hdr; 1688 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 1689 ASSERT(hdr->b_state != arc_anon); 1690 ASSERT(buf->b_data != NULL); 1691 1692 (void) remove_reference(hdr, hash_lock, tag); 1693 if (hdr->b_datacnt > 1) { 1694 if (no_callback) 1695 arc_buf_destroy(buf, FALSE, TRUE); 1696 } else if (no_callback) { 1697 ASSERT(hdr->b_buf == buf && buf->b_next == NULL); 1698 ASSERT(buf->b_efunc == NULL); 1699 hdr->b_flags |= ARC_BUF_AVAILABLE; 1700 } 1701 ASSERT(no_callback || hdr->b_datacnt > 1 || 1702 refcount_is_zero(&hdr->b_refcnt)); 1703 mutex_exit(hash_lock); 1704 return (no_callback); 1705 } 1706 1707 int 1708 arc_buf_size(arc_buf_t *buf) 1709 { 1710 return (buf->b_hdr->b_size); 1711 } 1712 1713 /* 1714 * Called from the DMU to determine if the current buffer should be 1715 * evicted. In order to ensure proper locking, the eviction must be initiated 1716 * from the DMU. Return true if the buffer is associated with user data and 1717 * duplicate buffers still exist. 1718 */ 1719 boolean_t 1720 arc_buf_eviction_needed(arc_buf_t *buf) 1721 { 1722 arc_buf_hdr_t *hdr; 1723 boolean_t evict_needed = B_FALSE; 1724 1725 if (zfs_disable_dup_eviction) 1726 return (B_FALSE); 1727 1728 mutex_enter(&buf->b_evict_lock); 1729 hdr = buf->b_hdr; 1730 if (hdr == NULL) { 1731 /* 1732 * We are in arc_do_user_evicts(); let that function 1733 * perform the eviction. 1734 */ 1735 ASSERT(buf->b_data == NULL); 1736 mutex_exit(&buf->b_evict_lock); 1737 return (B_FALSE); 1738 } else if (buf->b_data == NULL) { 1739 /* 1740 * We have already been added to the arc eviction list; 1741 * recommend eviction. 1742 */ 1743 ASSERT3P(hdr, ==, &arc_eviction_hdr); 1744 mutex_exit(&buf->b_evict_lock); 1745 return (B_TRUE); 1746 } 1747 1748 if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA) 1749 evict_needed = B_TRUE; 1750 1751 mutex_exit(&buf->b_evict_lock); 1752 return (evict_needed); 1753 } 1754 1755 /* 1756 * Evict buffers from list until we've removed the specified number of 1757 * bytes. Move the removed buffers to the appropriate evict state. 1758 * If the recycle flag is set, then attempt to "recycle" a buffer: 1759 * - look for a buffer to evict that is `bytes' long. 1760 * - return the data block from this buffer rather than freeing it. 1761 * This flag is used by callers that are trying to make space for a 1762 * new buffer in a full arc cache. 1763 * 1764 * This function makes a "best effort". It skips over any buffers 1765 * it can't get a hash_lock on, and so may not catch all candidates. 1766 * It may also return without evicting as much space as requested. 
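 *
 * An illustrative (not verbatim) recycling call from a cache-miss path
 * might look like:
 *
 *	void *data = arc_evict(arc_mfu, 0, size, TRUE, ARC_BUFC_DATA);
 *
 * where a non-NULL return is the data block of an evicted buffer of
 * exactly `size' bytes, handed back for reuse instead of being freed.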
1767 */ 1768 static void * 1769 arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, 1770 arc_buf_contents_t type) 1771 { 1772 arc_state_t *evicted_state; 1773 uint64_t bytes_evicted = 0, skipped = 0, missed = 0; 1774 arc_buf_hdr_t *ab, *ab_prev = NULL; 1775 list_t *list = &state->arcs_list[type]; 1776 kmutex_t *hash_lock; 1777 boolean_t have_lock; 1778 void *stolen = NULL; 1779 1780 ASSERT(state == arc_mru || state == arc_mfu); 1781 1782 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; 1783 1784 mutex_enter(&state->arcs_mtx); 1785 mutex_enter(&evicted_state->arcs_mtx); 1786 1787 for (ab = list_tail(list); ab; ab = ab_prev) { 1788 ab_prev = list_prev(list, ab); 1789 /* prefetch buffers have a minimum lifespan */ 1790 if (HDR_IO_IN_PROGRESS(ab) || 1791 (spa && ab->b_spa != spa) || 1792 (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && 1793 ddi_get_lbolt() - ab->b_arc_access < 1794 arc_min_prefetch_lifespan)) { 1795 skipped++; 1796 continue; 1797 } 1798 /* "lookahead" for better eviction candidate */ 1799 if (recycle && ab->b_size != bytes && 1800 ab_prev && ab_prev->b_size == bytes) 1801 continue; 1802 hash_lock = HDR_LOCK(ab); 1803 have_lock = MUTEX_HELD(hash_lock); 1804 if (have_lock || mutex_tryenter(hash_lock)) { 1805 ASSERT0(refcount_count(&ab->b_refcnt)); 1806 ASSERT(ab->b_datacnt > 0); 1807 while (ab->b_buf) { 1808 arc_buf_t *buf = ab->b_buf; 1809 if (!mutex_tryenter(&buf->b_evict_lock)) { 1810 missed += 1; 1811 break; 1812 } 1813 if (buf->b_data) { 1814 bytes_evicted += ab->b_size; 1815 if (recycle && ab->b_type == type && 1816 ab->b_size == bytes && 1817 !HDR_L2_WRITING(ab)) { 1818 stolen = buf->b_data; 1819 recycle = FALSE; 1820 } 1821 } 1822 if (buf->b_efunc) { 1823 mutex_enter(&arc_eviction_mtx); 1824 arc_buf_destroy(buf, 1825 buf->b_data == stolen, FALSE); 1826 ab->b_buf = buf->b_next; 1827 buf->b_hdr = &arc_eviction_hdr; 1828 buf->b_next = arc_eviction_list; 1829 arc_eviction_list = buf; 1830 mutex_exit(&arc_eviction_mtx); 1831 mutex_exit(&buf->b_evict_lock); 1832 } else { 1833 mutex_exit(&buf->b_evict_lock); 1834 arc_buf_destroy(buf, 1835 buf->b_data == stolen, TRUE); 1836 } 1837 } 1838 1839 if (ab->b_l2hdr) { 1840 ARCSTAT_INCR(arcstat_evict_l2_cached, 1841 ab->b_size); 1842 } else { 1843 if (l2arc_write_eligible(ab->b_spa, ab)) { 1844 ARCSTAT_INCR(arcstat_evict_l2_eligible, 1845 ab->b_size); 1846 } else { 1847 ARCSTAT_INCR( 1848 arcstat_evict_l2_ineligible, 1849 ab->b_size); 1850 } 1851 } 1852 1853 if (ab->b_datacnt == 0) { 1854 arc_change_state(evicted_state, ab, hash_lock); 1855 ASSERT(HDR_IN_HASH_TABLE(ab)); 1856 ab->b_flags |= ARC_IN_HASH_TABLE; 1857 ab->b_flags &= ~ARC_BUF_AVAILABLE; 1858 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); 1859 } 1860 if (!have_lock) 1861 mutex_exit(hash_lock); 1862 if (bytes >= 0 && bytes_evicted >= bytes) 1863 break; 1864 } else { 1865 missed += 1; 1866 } 1867 } 1868 1869 mutex_exit(&evicted_state->arcs_mtx); 1870 mutex_exit(&state->arcs_mtx); 1871 1872 if (bytes_evicted < bytes) 1873 dprintf("only evicted %lld bytes from %x", 1874 (longlong_t)bytes_evicted, state); 1875 1876 if (skipped) 1877 ARCSTAT_INCR(arcstat_evict_skip, skipped); 1878 1879 if (missed) 1880 ARCSTAT_INCR(arcstat_mutex_miss, missed); 1881 1882 /* 1883 * We have just evicted some data into the ghost state, make 1884 * sure we also adjust the ghost state size if necessary. 
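 * For example, with arc_no_grow set, arc_c at 1GB and the two ghost
 * lists together already over arc_c: if arc_anon + arc_mru +
 * arc_mru_ghost comes to 1.25GB, then up to 256MB of ghost headers
 * of this buffer type are deleted from arc_mru_ghost below (less if
 * its list for this type is smaller than that).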
1885 */ 1886 if (arc_no_grow && 1887 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) { 1888 int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size + 1889 arc_mru_ghost->arcs_size - arc_c; 1890 1891 if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) { 1892 int64_t todelete = 1893 MIN(arc_mru_ghost->arcs_lsize[type], mru_over); 1894 arc_evict_ghost(arc_mru_ghost, NULL, todelete); 1895 } else if (arc_mfu_ghost->arcs_lsize[type] > 0) { 1896 int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type], 1897 arc_mru_ghost->arcs_size + 1898 arc_mfu_ghost->arcs_size - arc_c); 1899 arc_evict_ghost(arc_mfu_ghost, NULL, todelete); 1900 } 1901 } 1902 1903 return (stolen); 1904 } 1905 1906 /* 1907 * Remove buffers from list until we've removed the specified number of 1908 * bytes. Destroy the buffers that are removed. 1909 */ 1910 static void 1911 arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes) 1912 { 1913 arc_buf_hdr_t *ab, *ab_prev; 1914 arc_buf_hdr_t marker = { 0 }; 1915 list_t *list = &state->arcs_list[ARC_BUFC_DATA]; 1916 kmutex_t *hash_lock; 1917 uint64_t bytes_deleted = 0; 1918 uint64_t bufs_skipped = 0; 1919 1920 ASSERT(GHOST_STATE(state)); 1921 top: 1922 mutex_enter(&state->arcs_mtx); 1923 for (ab = list_tail(list); ab; ab = ab_prev) { 1924 ab_prev = list_prev(list, ab); 1925 if (spa && ab->b_spa != spa) 1926 continue; 1927 1928 /* ignore markers */ 1929 if (ab->b_spa == 0) 1930 continue; 1931 1932 hash_lock = HDR_LOCK(ab); 1933 /* caller may be trying to modify this buffer, skip it */ 1934 if (MUTEX_HELD(hash_lock)) 1935 continue; 1936 if (mutex_tryenter(hash_lock)) { 1937 ASSERT(!HDR_IO_IN_PROGRESS(ab)); 1938 ASSERT(ab->b_buf == NULL); 1939 ARCSTAT_BUMP(arcstat_deleted); 1940 bytes_deleted += ab->b_size; 1941 1942 if (ab->b_l2hdr != NULL) { 1943 /* 1944 * This buffer is cached on the 2nd Level ARC; 1945 * don't destroy the header. 1946 */ 1947 arc_change_state(arc_l2c_only, ab, hash_lock); 1948 mutex_exit(hash_lock); 1949 } else { 1950 arc_change_state(arc_anon, ab, hash_lock); 1951 mutex_exit(hash_lock); 1952 arc_hdr_destroy(ab); 1953 } 1954 1955 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); 1956 if (bytes >= 0 && bytes_deleted >= bytes) 1957 break; 1958 } else if (bytes < 0) { 1959 /* 1960 * Insert a list marker and then wait for the 1961 * hash lock to become available. Once its 1962 * available, restart from where we left off. 
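 * The marker is the zeroed, on-stack header declared above; its
 * b_spa is 0, so the "ignore markers" check at the top of this loop
 * (in this thread and in any other walker of this list) simply
 * skips over it while we are blocked on the hash lock.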
1963 */ 1964 list_insert_after(list, ab, &marker); 1965 mutex_exit(&state->arcs_mtx); 1966 mutex_enter(hash_lock); 1967 mutex_exit(hash_lock); 1968 mutex_enter(&state->arcs_mtx); 1969 ab_prev = list_prev(list, &marker); 1970 list_remove(list, &marker); 1971 } else 1972 bufs_skipped += 1; 1973 } 1974 mutex_exit(&state->arcs_mtx); 1975 1976 if (list == &state->arcs_list[ARC_BUFC_DATA] && 1977 (bytes < 0 || bytes_deleted < bytes)) { 1978 list = &state->arcs_list[ARC_BUFC_METADATA]; 1979 goto top; 1980 } 1981 1982 if (bufs_skipped) { 1983 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); 1984 ASSERT(bytes >= 0); 1985 } 1986 1987 if (bytes_deleted < bytes) 1988 dprintf("only deleted %lld bytes from %p", 1989 (longlong_t)bytes_deleted, state); 1990 } 1991 1992 static void 1993 arc_adjust(void) 1994 { 1995 int64_t adjustment, delta; 1996 1997 /* 1998 * Adjust MRU size 1999 */ 2000 2001 adjustment = MIN((int64_t)(arc_size - arc_c), 2002 (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - 2003 arc_p)); 2004 2005 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { 2006 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment); 2007 (void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA); 2008 adjustment -= delta; 2009 } 2010 2011 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { 2012 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment); 2013 (void) arc_evict(arc_mru, NULL, delta, FALSE, 2014 ARC_BUFC_METADATA); 2015 } 2016 2017 /* 2018 * Adjust MFU size 2019 */ 2020 2021 adjustment = arc_size - arc_c; 2022 2023 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { 2024 delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]); 2025 (void) arc_evict(arc_mfu, NULL, delta, FALSE, ARC_BUFC_DATA); 2026 adjustment -= delta; 2027 } 2028 2029 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { 2030 int64_t delta = MIN(adjustment, 2031 arc_mfu->arcs_lsize[ARC_BUFC_METADATA]); 2032 (void) arc_evict(arc_mfu, NULL, delta, FALSE, 2033 ARC_BUFC_METADATA); 2034 } 2035 2036 /* 2037 * Adjust ghost lists 2038 */ 2039 2040 adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c; 2041 2042 if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) { 2043 delta = MIN(arc_mru_ghost->arcs_size, adjustment); 2044 arc_evict_ghost(arc_mru_ghost, NULL, delta); 2045 } 2046 2047 adjustment = 2048 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c; 2049 2050 if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) { 2051 delta = MIN(arc_mfu_ghost->arcs_size, adjustment); 2052 arc_evict_ghost(arc_mfu_ghost, NULL, delta); 2053 } 2054 } 2055 2056 static void 2057 arc_do_user_evicts(void) 2058 { 2059 mutex_enter(&arc_eviction_mtx); 2060 while (arc_eviction_list != NULL) { 2061 arc_buf_t *buf = arc_eviction_list; 2062 arc_eviction_list = buf->b_next; 2063 mutex_enter(&buf->b_evict_lock); 2064 buf->b_hdr = NULL; 2065 mutex_exit(&buf->b_evict_lock); 2066 mutex_exit(&arc_eviction_mtx); 2067 2068 if (buf->b_efunc != NULL) 2069 VERIFY(buf->b_efunc(buf) == 0); 2070 2071 buf->b_efunc = NULL; 2072 buf->b_private = NULL; 2073 kmem_cache_free(buf_cache, buf); 2074 mutex_enter(&arc_eviction_mtx); 2075 } 2076 mutex_exit(&arc_eviction_mtx); 2077 } 2078 2079 /* 2080 * Flush all *evictable* data from the cache for the given spa. 2081 * NOTE: this will not touch "active" (i.e. referenced) data. 
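 * A quick sketch of the two modes:
 *
 *	arc_flush(spa);		one pass over each list for that pool
 *	arc_flush(NULL);	loop until the lists are empty
 *				(this is what arc_fini() does)
 *
 * In both cases the ghost lists and the pending user-eviction list
 * are drained afterwards.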
2082 */ 2083 void 2084 arc_flush(spa_t *spa) 2085 { 2086 uint64_t guid = 0; 2087 2088 if (spa) 2089 guid = spa_load_guid(spa); 2090 2091 while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) { 2092 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA); 2093 if (spa) 2094 break; 2095 } 2096 while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) { 2097 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA); 2098 if (spa) 2099 break; 2100 } 2101 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) { 2102 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA); 2103 if (spa) 2104 break; 2105 } 2106 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) { 2107 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA); 2108 if (spa) 2109 break; 2110 } 2111 2112 arc_evict_ghost(arc_mru_ghost, guid, -1); 2113 arc_evict_ghost(arc_mfu_ghost, guid, -1); 2114 2115 mutex_enter(&arc_reclaim_thr_lock); 2116 arc_do_user_evicts(); 2117 mutex_exit(&arc_reclaim_thr_lock); 2118 ASSERT(spa || arc_eviction_list == NULL); 2119 } 2120 2121 void 2122 arc_shrink(void) 2123 { 2124 if (arc_c > arc_c_min) { 2125 uint64_t to_free; 2126 2127 #ifdef _KERNEL 2128 to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree)); 2129 #else 2130 to_free = arc_c >> arc_shrink_shift; 2131 #endif 2132 if (arc_c > arc_c_min + to_free) 2133 atomic_add_64(&arc_c, -to_free); 2134 else 2135 arc_c = arc_c_min; 2136 2137 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); 2138 if (arc_c > arc_size) 2139 arc_c = MAX(arc_size, arc_c_min); 2140 if (arc_p > arc_c) 2141 arc_p = (arc_c >> 1); 2142 ASSERT(arc_c >= arc_c_min); 2143 ASSERT((int64_t)arc_p >= 0); 2144 } 2145 2146 if (arc_size > arc_c) 2147 arc_adjust(); 2148 } 2149 2150 /* 2151 * Determine if the system is under memory pressure and is asking 2152 * to reclaim memory. A return value of 1 indicates that the system 2153 * is under memory pressure and that the arc should adjust accordingly. 2154 */ 2155 static int 2156 arc_reclaim_needed(void) 2157 { 2158 uint64_t extra; 2159 2160 #ifdef _KERNEL 2161 2162 if (needfree) 2163 return (1); 2164 2165 /* 2166 * take 'desfree' extra pages, so we reclaim sooner, rather than later 2167 */ 2168 extra = desfree; 2169 2170 /* 2171 * check that we're out of range of the pageout scanner. It starts to 2172 * schedule paging if freemem is less than lotsfree and needfree. 2173 * lotsfree is the high-water mark for pageout, and needfree is the 2174 * number of needed free pages. We add extra pages here to make sure 2175 * the scanner doesn't start up while we're freeing memory. 2176 */ 2177 if (freemem < lotsfree + needfree + extra) 2178 return (1); 2179 2180 /* 2181 * check to make sure that swapfs has enough space so that anon 2182 * reservations can still succeed. anon_resvmem() checks that the 2183 * availrmem is greater than swapfs_minfree, and the number of reserved 2184 * swap pages. We also add a bit of extra here just to prevent 2185 * circumstances from getting really dire. 2186 */ 2187 if (availrmem < swapfs_minfree + swapfs_reserve + extra) 2188 return (1); 2189 2190 #if defined(__i386) 2191 /* 2192 * If we're on an i386 platform, it's possible that we'll exhaust the 2193 * kernel heap space before we ever run out of available physical 2194 * memory. Most checks of the size of the heap_area compare against 2195 * tune.t_minarmem, which is the minimum available real memory that we 2196 * can have in the system. However, this is generally fixed at 25 pages 2197 * which is so low that it's useless. 
In this comparison, we seek to
2198 * calculate the total heap-size, and reclaim if more than 3/4ths of the
2199 * heap is allocated. (Or, in the calculation, if less than 1/4th is
2200 * free)
2201 */
2202 if (vmem_size(heap_arena, VMEM_FREE) <
2203 (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2))
2204 return (1);
2205 #endif
2206
2207 /*
2208 * If zio data pages are being allocated out of a separate heap segment,
2209 * then enforce that the size of available vmem for this arena remains
2210 * above about 1/16th free.
2211 *
2212 * Note: The 1/16th arena free requirement was put in place
2213 * to aggressively evict memory from the arc in order to avoid
2214 * memory fragmentation issues.
2215 */
2216 if (zio_arena != NULL &&
2217 vmem_size(zio_arena, VMEM_FREE) <
2218 (vmem_size(zio_arena, VMEM_ALLOC) >> 4))
2219 return (1);
2220 #else
2221 if (spa_get_random(100) == 0)
2222 return (1);
2223 #endif
2224 return (0);
2225 }
2226
2227 static void
2228 arc_kmem_reap_now(arc_reclaim_strategy_t strat)
2229 {
2230 size_t i;
2231 kmem_cache_t *prev_cache = NULL;
2232 kmem_cache_t *prev_data_cache = NULL;
2233 extern kmem_cache_t *zio_buf_cache[];
2234 extern kmem_cache_t *zio_data_buf_cache[];
2235
2236 #ifdef _KERNEL
2237 if (arc_meta_used >= arc_meta_limit) {
2238 /*
2239 * We are exceeding our meta-data cache limit.
2240 * Purge some DNLC entries to release holds on meta-data.
2241 */
2242 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
2243 }
2244 #if defined(__i386)
2245 /*
2246 * Reclaim unused memory from all kmem caches.
2247 */
2248 kmem_reap();
2249 #endif
2250 #endif
2251
2252 /*
2253 * An aggressive reclamation will shrink the cache size as well as
2254 * reap free buffers from the arc kmem caches.
2255 */
2256 if (strat == ARC_RECLAIM_AGGR)
2257 arc_shrink();
2258
2259 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
2260 if (zio_buf_cache[i] != prev_cache) {
2261 prev_cache = zio_buf_cache[i];
2262 kmem_cache_reap_now(zio_buf_cache[i]);
2263 }
2264 if (zio_data_buf_cache[i] != prev_data_cache) {
2265 prev_data_cache = zio_data_buf_cache[i];
2266 kmem_cache_reap_now(zio_data_buf_cache[i]);
2267 }
2268 }
2269 kmem_cache_reap_now(buf_cache);
2270 kmem_cache_reap_now(hdr_cache);
2271
2272 /*
2273 * Ask the vmem arena to reclaim unused memory from its
2274 * quantum caches.
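 * Reaping the quantum caches hands their cached segments back to
 * the arena, so the VMEM_FREE fragmentation check in
 * arc_reclaim_needed() can see that space again; presumably this is
 * why it is only done for aggressive (ARC_RECLAIM_AGGR) passes, as
 * checked below.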
2275 */
2276 if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR)
2277 vmem_qcache_reap(zio_arena);
2278 }
2279
2280 static void
2281 arc_reclaim_thread(void)
2282 {
2283 clock_t growtime = 0;
2284 arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS;
2285 callb_cpr_t cpr;
2286
2287 CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2288
2289 mutex_enter(&arc_reclaim_thr_lock);
2290 while (arc_thread_exit == 0) {
2291 if (arc_reclaim_needed()) {
2292
2293 if (arc_no_grow) {
2294 if (last_reclaim == ARC_RECLAIM_CONS) {
2295 last_reclaim = ARC_RECLAIM_AGGR;
2296 } else {
2297 last_reclaim = ARC_RECLAIM_CONS;
2298 }
2299 } else {
2300 arc_no_grow = TRUE;
2301 last_reclaim = ARC_RECLAIM_AGGR;
2302 membar_producer();
2303 }
2304
2305 /* reset the growth delay for every reclaim */
2306 growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
2307
2308 arc_kmem_reap_now(last_reclaim);
2309 arc_warm = B_TRUE;
2310
2311 } else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
2312 arc_no_grow = FALSE;
2313 }
2314
2315 arc_adjust();
2316
2317 if (arc_eviction_list != NULL)
2318 arc_do_user_evicts();
2319
2320 /* block until needed, or one second, whichever is shorter */
2321 CALLB_CPR_SAFE_BEGIN(&cpr);
2322 (void) cv_timedwait(&arc_reclaim_thr_cv,
2323 &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
2324 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
2325 }
2326
2327 arc_thread_exit = 0;
2328 cv_broadcast(&arc_reclaim_thr_cv);
2329 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */
2330 thread_exit();
2331 }
2332
2333 /*
2334 * Adapt arc info given the number of bytes we are trying to add and
2335 * the state that we are coming from. This function is only called
2336 * when we are adding new content to the cache.
2337 */
2338 static void
2339 arc_adapt(int bytes, arc_state_t *state)
2340 {
2341 int mult;
2342 uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
2343
2344 if (state == arc_l2c_only)
2345 return;
2346
2347 ASSERT(bytes > 0);
2348 /*
2349 * Adapt the target size of the MRU list:
2350 * - if we just hit in the MRU ghost list, then increase
2351 * the target size of the MRU list.
2352 * - if we just hit in the MFU ghost list, then increase
2353 * the target size of the MFU list by decreasing the
2354 * target size of the MRU list.
2355 */
2356 if (state == arc_mru_ghost) {
2357 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
2358 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
2359 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
2360
2361 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
2362 } else if (state == arc_mfu_ghost) {
2363 uint64_t delta;
2364
2365 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2366 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); 2367 mult = MIN(mult, 10); 2368 2369 delta = MIN(bytes * mult, arc_p); 2370 arc_p = MAX(arc_p_min, arc_p - delta); 2371 } 2372 ASSERT((int64_t)arc_p >= 0); 2373 2374 if (arc_reclaim_needed()) { 2375 cv_signal(&arc_reclaim_thr_cv); 2376 return; 2377 } 2378 2379 if (arc_no_grow) 2380 return; 2381 2382 if (arc_c >= arc_c_max) 2383 return; 2384 2385 /* 2386 * If we're within (2 * maxblocksize) bytes of the target 2387 * cache size, increment the target cache size 2388 */ 2389 if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { 2390 atomic_add_64(&arc_c, (int64_t)bytes); 2391 if (arc_c > arc_c_max) 2392 arc_c = arc_c_max; 2393 else if (state == arc_anon) 2394 atomic_add_64(&arc_p, (int64_t)bytes); 2395 if (arc_p > arc_c) 2396 arc_p = arc_c; 2397 } 2398 ASSERT((int64_t)arc_p >= 0); 2399 } 2400 2401 /* 2402 * Check if the cache has reached its limits and eviction is required 2403 * prior to insert. 2404 */ 2405 static int 2406 arc_evict_needed(arc_buf_contents_t type) 2407 { 2408 if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) 2409 return (1); 2410 2411 if (arc_reclaim_needed()) 2412 return (1); 2413 2414 return (arc_size > arc_c); 2415 } 2416 2417 /* 2418 * The buffer, supplied as the first argument, needs a data block. 2419 * So, if we are at cache max, determine which cache should be victimized. 2420 * We have the following cases: 2421 * 2422 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) -> 2423 * In this situation if we're out of space, but the resident size of the MFU is 2424 * under the limit, victimize the MFU cache to satisfy this insertion request. 2425 * 2426 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) -> 2427 * Here, we've used up all of the available space for the MRU, so we need to 2428 * evict from our own cache instead. Evict from the set of resident MRU 2429 * entries. 2430 * 2431 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) -> 2432 * c minus p represents the MFU space in the cache, since p is the size of the 2433 * cache that is dedicated to the MRU. In this situation there's still space on 2434 * the MFU side, so the MRU side needs to be victimized. 2435 * 2436 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) -> 2437 * MFU's resident set is consuming more space than it has been allotted. In 2438 * this situation, we must victimize our own cache, the MFU, for this insertion. 2439 */ 2440 static void 2441 arc_get_data_buf(arc_buf_t *buf) 2442 { 2443 arc_state_t *state = buf->b_hdr->b_state; 2444 uint64_t size = buf->b_hdr->b_size; 2445 arc_buf_contents_t type = buf->b_hdr->b_type; 2446 2447 arc_adapt(size, state); 2448 2449 /* 2450 * We have not yet reached cache maximum size, 2451 * just allocate a new buffer. 2452 */ 2453 if (!arc_evict_needed(type)) { 2454 if (type == ARC_BUFC_METADATA) { 2455 buf->b_data = zio_buf_alloc(size); 2456 arc_space_consume(size, ARC_SPACE_DATA); 2457 } else { 2458 ASSERT(type == ARC_BUFC_DATA); 2459 buf->b_data = zio_data_buf_alloc(size); 2460 ARCSTAT_INCR(arcstat_data_size, size); 2461 atomic_add_64(&arc_size, size); 2462 } 2463 goto out; 2464 } 2465 2466 /* 2467 * If we are prefetching from the mfu ghost list, this buffer 2468 * will end up on the mru list; so steal space from there. 2469 */ 2470 if (state == arc_mfu_ghost) 2471 state = buf->b_hdr->b_flags & ARC_PREFETCH ? 
arc_mru : arc_mfu; 2472 else if (state == arc_mru_ghost) 2473 state = arc_mru; 2474 2475 if (state == arc_mru || state == arc_anon) { 2476 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; 2477 state = (arc_mfu->arcs_lsize[type] >= size && 2478 arc_p > mru_used) ? arc_mfu : arc_mru; 2479 } else { 2480 /* MFU cases */ 2481 uint64_t mfu_space = arc_c - arc_p; 2482 state = (arc_mru->arcs_lsize[type] >= size && 2483 mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; 2484 } 2485 if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) { 2486 if (type == ARC_BUFC_METADATA) { 2487 buf->b_data = zio_buf_alloc(size); 2488 arc_space_consume(size, ARC_SPACE_DATA); 2489 } else { 2490 ASSERT(type == ARC_BUFC_DATA); 2491 buf->b_data = zio_data_buf_alloc(size); 2492 ARCSTAT_INCR(arcstat_data_size, size); 2493 atomic_add_64(&arc_size, size); 2494 } 2495 ARCSTAT_BUMP(arcstat_recycle_miss); 2496 } 2497 ASSERT(buf->b_data != NULL); 2498 out: 2499 /* 2500 * Update the state size. Note that ghost states have a 2501 * "ghost size" and so don't need to be updated. 2502 */ 2503 if (!GHOST_STATE(buf->b_hdr->b_state)) { 2504 arc_buf_hdr_t *hdr = buf->b_hdr; 2505 2506 atomic_add_64(&hdr->b_state->arcs_size, size); 2507 if (list_link_active(&hdr->b_arc_node)) { 2508 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 2509 atomic_add_64(&hdr->b_state->arcs_lsize[type], size); 2510 } 2511 /* 2512 * If we are growing the cache, and we are adding anonymous 2513 * data, and we have outgrown arc_p, update arc_p 2514 */ 2515 if (arc_size < arc_c && hdr->b_state == arc_anon && 2516 arc_anon->arcs_size + arc_mru->arcs_size > arc_p) 2517 arc_p = MIN(arc_c, arc_p + size); 2518 } 2519 } 2520 2521 /* 2522 * This routine is called whenever a buffer is accessed. 2523 * NOTE: the hash lock is dropped in this function. 2524 */ 2525 static void 2526 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) 2527 { 2528 clock_t now; 2529 2530 ASSERT(MUTEX_HELD(hash_lock)); 2531 2532 if (buf->b_state == arc_anon) { 2533 /* 2534 * This buffer is not in the cache, and does not 2535 * appear in our "ghost" list. Add the new buffer 2536 * to the MRU state. 2537 */ 2538 2539 ASSERT(buf->b_arc_access == 0); 2540 buf->b_arc_access = ddi_get_lbolt(); 2541 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 2542 arc_change_state(arc_mru, buf, hash_lock); 2543 2544 } else if (buf->b_state == arc_mru) { 2545 now = ddi_get_lbolt(); 2546 2547 /* 2548 * If this buffer is here because of a prefetch, then either: 2549 * - clear the flag if this is a "referencing" read 2550 * (any subsequent access will bump this into the MFU state). 2551 * or 2552 * - move the buffer to the head of the list if this is 2553 * another prefetch (to make it less likely to be evicted). 2554 */ 2555 if ((buf->b_flags & ARC_PREFETCH) != 0) { 2556 if (refcount_count(&buf->b_refcnt) == 0) { 2557 ASSERT(list_link_active(&buf->b_arc_node)); 2558 } else { 2559 buf->b_flags &= ~ARC_PREFETCH; 2560 ARCSTAT_BUMP(arcstat_mru_hits); 2561 } 2562 buf->b_arc_access = now; 2563 return; 2564 } 2565 2566 /* 2567 * This buffer has been "accessed" only once so far, 2568 * but it is still in the cache. Move it to the MFU 2569 * state. 2570 */ 2571 if (now > buf->b_arc_access + ARC_MINTIME) { 2572 /* 2573 * More than 125ms have passed since we 2574 * instantiated this buffer. Move it to the 2575 * most frequently used state. 
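 * (If the second access instead arrives within ARC_MINTIME of the
 * first we skip this block: the buffer stays in the MRU state and
 * b_arc_access is not refreshed, so the next access is still
 * measured against the original access time.)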
2576 */ 2577 buf->b_arc_access = now; 2578 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 2579 arc_change_state(arc_mfu, buf, hash_lock); 2580 } 2581 ARCSTAT_BUMP(arcstat_mru_hits); 2582 } else if (buf->b_state == arc_mru_ghost) { 2583 arc_state_t *new_state; 2584 /* 2585 * This buffer has been "accessed" recently, but 2586 * was evicted from the cache. Move it to the 2587 * MFU state. 2588 */ 2589 2590 if (buf->b_flags & ARC_PREFETCH) { 2591 new_state = arc_mru; 2592 if (refcount_count(&buf->b_refcnt) > 0) 2593 buf->b_flags &= ~ARC_PREFETCH; 2594 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 2595 } else { 2596 new_state = arc_mfu; 2597 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 2598 } 2599 2600 buf->b_arc_access = ddi_get_lbolt(); 2601 arc_change_state(new_state, buf, hash_lock); 2602 2603 ARCSTAT_BUMP(arcstat_mru_ghost_hits); 2604 } else if (buf->b_state == arc_mfu) { 2605 /* 2606 * This buffer has been accessed more than once and is 2607 * still in the cache. Keep it in the MFU state. 2608 * 2609 * NOTE: an add_reference() that occurred when we did 2610 * the arc_read() will have kicked this off the list. 2611 * If it was a prefetch, we will explicitly move it to 2612 * the head of the list now. 2613 */ 2614 if ((buf->b_flags & ARC_PREFETCH) != 0) { 2615 ASSERT(refcount_count(&buf->b_refcnt) == 0); 2616 ASSERT(list_link_active(&buf->b_arc_node)); 2617 } 2618 ARCSTAT_BUMP(arcstat_mfu_hits); 2619 buf->b_arc_access = ddi_get_lbolt(); 2620 } else if (buf->b_state == arc_mfu_ghost) { 2621 arc_state_t *new_state = arc_mfu; 2622 /* 2623 * This buffer has been accessed more than once but has 2624 * been evicted from the cache. Move it back to the 2625 * MFU state. 2626 */ 2627 2628 if (buf->b_flags & ARC_PREFETCH) { 2629 /* 2630 * This is a prefetch access... 2631 * move this block back to the MRU state. 2632 */ 2633 ASSERT0(refcount_count(&buf->b_refcnt)); 2634 new_state = arc_mru; 2635 } 2636 2637 buf->b_arc_access = ddi_get_lbolt(); 2638 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 2639 arc_change_state(new_state, buf, hash_lock); 2640 2641 ARCSTAT_BUMP(arcstat_mfu_ghost_hits); 2642 } else if (buf->b_state == arc_l2c_only) { 2643 /* 2644 * This buffer is on the 2nd Level ARC. 2645 */ 2646 2647 buf->b_arc_access = ddi_get_lbolt(); 2648 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 2649 arc_change_state(arc_mfu, buf, hash_lock); 2650 } else { 2651 ASSERT(!"invalid arc state"); 2652 } 2653 } 2654 2655 /* a generic arc_done_func_t which you can use */ 2656 /* ARGSUSED */ 2657 void 2658 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) 2659 { 2660 if (zio == NULL || zio->io_error == 0) 2661 bcopy(buf->b_data, arg, buf->b_hdr->b_size); 2662 VERIFY(arc_buf_remove_ref(buf, arg)); 2663 } 2664 2665 /* a generic arc_done_func_t */ 2666 void 2667 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) 2668 { 2669 arc_buf_t **bufp = arg; 2670 if (zio && zio->io_error) { 2671 VERIFY(arc_buf_remove_ref(buf, arg)); 2672 *bufp = NULL; 2673 } else { 2674 *bufp = buf; 2675 ASSERT(buf->b_data); 2676 } 2677 } 2678 2679 static void 2680 arc_read_done(zio_t *zio) 2681 { 2682 arc_buf_hdr_t *hdr, *found; 2683 arc_buf_t *buf; 2684 arc_buf_t *abuf; /* buffer we're assigning to callback */ 2685 kmutex_t *hash_lock; 2686 arc_callback_t *callback_list, *acb; 2687 int freeable = FALSE; 2688 2689 buf = zio->io_private; 2690 hdr = buf->b_hdr; 2691 2692 /* 2693 * The hdr was inserted into hash-table and removed from lists 2694 * prior to starting I/O. 
We should find this header, since 2695 * it's in the hash table, and it should be legit since it's 2696 * not possible to evict it during the I/O. The only possible 2697 * reason for it not to be found is if we were freed during the 2698 * read. 2699 */ 2700 found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth, 2701 &hash_lock); 2702 2703 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || 2704 (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || 2705 (found == hdr && HDR_L2_READING(hdr))); 2706 2707 hdr->b_flags &= ~ARC_L2_EVICTED; 2708 if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH)) 2709 hdr->b_flags &= ~ARC_L2CACHE; 2710 2711 /* byteswap if necessary */ 2712 callback_list = hdr->b_acb; 2713 ASSERT(callback_list != NULL); 2714 if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { 2715 dmu_object_byteswap_t bswap = 2716 DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); 2717 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? 2718 byteswap_uint64_array : 2719 dmu_ot_byteswap[bswap].ob_func; 2720 func(buf->b_data, hdr->b_size); 2721 } 2722 2723 arc_cksum_compute(buf, B_FALSE); 2724 arc_buf_watch(buf); 2725 2726 if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) { 2727 /* 2728 * Only call arc_access on anonymous buffers. This is because 2729 * if we've issued an I/O for an evicted buffer, we've already 2730 * called arc_access (to prevent any simultaneous readers from 2731 * getting confused). 2732 */ 2733 arc_access(hdr, hash_lock); 2734 } 2735 2736 /* create copies of the data buffer for the callers */ 2737 abuf = buf; 2738 for (acb = callback_list; acb; acb = acb->acb_next) { 2739 if (acb->acb_done) { 2740 if (abuf == NULL) { 2741 ARCSTAT_BUMP(arcstat_duplicate_reads); 2742 abuf = arc_buf_clone(buf); 2743 } 2744 acb->acb_buf = abuf; 2745 abuf = NULL; 2746 } 2747 } 2748 hdr->b_acb = NULL; 2749 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 2750 ASSERT(!HDR_BUF_AVAILABLE(hdr)); 2751 if (abuf == buf) { 2752 ASSERT(buf->b_efunc == NULL); 2753 ASSERT(hdr->b_datacnt == 1); 2754 hdr->b_flags |= ARC_BUF_AVAILABLE; 2755 } 2756 2757 ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); 2758 2759 if (zio->io_error != 0) { 2760 hdr->b_flags |= ARC_IO_ERROR; 2761 if (hdr->b_state != arc_anon) 2762 arc_change_state(arc_anon, hdr, hash_lock); 2763 if (HDR_IN_HASH_TABLE(hdr)) 2764 buf_hash_remove(hdr); 2765 freeable = refcount_is_zero(&hdr->b_refcnt); 2766 } 2767 2768 /* 2769 * Broadcast before we drop the hash_lock to avoid the possibility 2770 * that the hdr (and hence the cv) might be freed before we get to 2771 * the cv_broadcast(). 2772 */ 2773 cv_broadcast(&hdr->b_cv); 2774 2775 if (hash_lock) { 2776 mutex_exit(hash_lock); 2777 } else { 2778 /* 2779 * This block was freed while we waited for the read to 2780 * complete. It has been removed from the hash table and 2781 * moved to the anonymous state (so that it won't show up 2782 * in the cache). 
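 * If nothing holds a reference on the header at this point it is
 * destroyed at the bottom of this function, once the callbacks
 * have been run.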
2783 */ 2784 ASSERT3P(hdr->b_state, ==, arc_anon); 2785 freeable = refcount_is_zero(&hdr->b_refcnt); 2786 } 2787 2788 /* execute each callback and free its structure */ 2789 while ((acb = callback_list) != NULL) { 2790 if (acb->acb_done) 2791 acb->acb_done(zio, acb->acb_buf, acb->acb_private); 2792 2793 if (acb->acb_zio_dummy != NULL) { 2794 acb->acb_zio_dummy->io_error = zio->io_error; 2795 zio_nowait(acb->acb_zio_dummy); 2796 } 2797 2798 callback_list = acb->acb_next; 2799 kmem_free(acb, sizeof (arc_callback_t)); 2800 } 2801 2802 if (freeable) 2803 arc_hdr_destroy(hdr); 2804 } 2805 2806 /* 2807 * "Read" the block at the specified DVA (in bp) via the 2808 * cache. If the block is found in the cache, invoke the provided 2809 * callback immediately and return. Note that the `zio' parameter 2810 * in the callback will be NULL in this case, since no IO was 2811 * required. If the block is not in the cache pass the read request 2812 * on to the spa with a substitute callback function, so that the 2813 * requested block will be added to the cache. 2814 * 2815 * If a read request arrives for a block that has a read in-progress, 2816 * either wait for the in-progress read to complete (and return the 2817 * results); or, if this is a read with a "done" func, add a record 2818 * to the read to invoke the "done" func when the read completes, 2819 * and return; or just return. 2820 * 2821 * arc_read_done() will invoke all the requested "done" functions 2822 * for readers of this block. 2823 */ 2824 int 2825 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, 2826 void *private, int priority, int zio_flags, uint32_t *arc_flags, 2827 const zbookmark_t *zb) 2828 { 2829 arc_buf_hdr_t *hdr; 2830 arc_buf_t *buf = NULL; 2831 kmutex_t *hash_lock; 2832 zio_t *rzio; 2833 uint64_t guid = spa_load_guid(spa); 2834 2835 top: 2836 hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), 2837 &hash_lock); 2838 if (hdr && hdr->b_datacnt > 0) { 2839 2840 *arc_flags |= ARC_CACHED; 2841 2842 if (HDR_IO_IN_PROGRESS(hdr)) { 2843 2844 if (*arc_flags & ARC_WAIT) { 2845 cv_wait(&hdr->b_cv, hash_lock); 2846 mutex_exit(hash_lock); 2847 goto top; 2848 } 2849 ASSERT(*arc_flags & ARC_NOWAIT); 2850 2851 if (done) { 2852 arc_callback_t *acb = NULL; 2853 2854 acb = kmem_zalloc(sizeof (arc_callback_t), 2855 KM_SLEEP); 2856 acb->acb_done = done; 2857 acb->acb_private = private; 2858 if (pio != NULL) 2859 acb->acb_zio_dummy = zio_null(pio, 2860 spa, NULL, NULL, NULL, zio_flags); 2861 2862 ASSERT(acb->acb_done != NULL); 2863 acb->acb_next = hdr->b_acb; 2864 hdr->b_acb = acb; 2865 add_reference(hdr, hash_lock, private); 2866 mutex_exit(hash_lock); 2867 return (0); 2868 } 2869 mutex_exit(hash_lock); 2870 return (0); 2871 } 2872 2873 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 2874 2875 if (done) { 2876 add_reference(hdr, hash_lock, private); 2877 /* 2878 * If this block is already in use, create a new 2879 * copy of the data so that we will be guaranteed 2880 * that arc_release() will always succeed. 
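 * Concretely: the first reader may be handed hdr->b_buf itself
 * (flagged by ARC_BUF_AVAILABLE); any reader after that gets its
 * own copy via arc_buf_clone(), so each caller can arc_release()
 * and modify its buffer without disturbing the others.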
2881 */ 2882 buf = hdr->b_buf; 2883 ASSERT(buf); 2884 ASSERT(buf->b_data); 2885 if (HDR_BUF_AVAILABLE(hdr)) { 2886 ASSERT(buf->b_efunc == NULL); 2887 hdr->b_flags &= ~ARC_BUF_AVAILABLE; 2888 } else { 2889 buf = arc_buf_clone(buf); 2890 } 2891 2892 } else if (*arc_flags & ARC_PREFETCH && 2893 refcount_count(&hdr->b_refcnt) == 0) { 2894 hdr->b_flags |= ARC_PREFETCH; 2895 } 2896 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 2897 arc_access(hdr, hash_lock); 2898 if (*arc_flags & ARC_L2CACHE) 2899 hdr->b_flags |= ARC_L2CACHE; 2900 if (*arc_flags & ARC_L2COMPRESS) 2901 hdr->b_flags |= ARC_L2COMPRESS; 2902 mutex_exit(hash_lock); 2903 ARCSTAT_BUMP(arcstat_hits); 2904 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 2905 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 2906 data, metadata, hits); 2907 2908 if (done) 2909 done(NULL, buf, private); 2910 } else { 2911 uint64_t size = BP_GET_LSIZE(bp); 2912 arc_callback_t *acb; 2913 vdev_t *vd = NULL; 2914 uint64_t addr = 0; 2915 boolean_t devw = B_FALSE; 2916 2917 if (hdr == NULL) { 2918 /* this block is not in the cache */ 2919 arc_buf_hdr_t *exists; 2920 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); 2921 buf = arc_buf_alloc(spa, size, private, type); 2922 hdr = buf->b_hdr; 2923 hdr->b_dva = *BP_IDENTITY(bp); 2924 hdr->b_birth = BP_PHYSICAL_BIRTH(bp); 2925 hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; 2926 exists = buf_hash_insert(hdr, &hash_lock); 2927 if (exists) { 2928 /* somebody beat us to the hash insert */ 2929 mutex_exit(hash_lock); 2930 buf_discard_identity(hdr); 2931 (void) arc_buf_remove_ref(buf, private); 2932 goto top; /* restart the IO request */ 2933 } 2934 /* if this is a prefetch, we don't have a reference */ 2935 if (*arc_flags & ARC_PREFETCH) { 2936 (void) remove_reference(hdr, hash_lock, 2937 private); 2938 hdr->b_flags |= ARC_PREFETCH; 2939 } 2940 if (*arc_flags & ARC_L2CACHE) 2941 hdr->b_flags |= ARC_L2CACHE; 2942 if (*arc_flags & ARC_L2COMPRESS) 2943 hdr->b_flags |= ARC_L2COMPRESS; 2944 if (BP_GET_LEVEL(bp) > 0) 2945 hdr->b_flags |= ARC_INDIRECT; 2946 } else { 2947 /* this block is in the ghost cache */ 2948 ASSERT(GHOST_STATE(hdr->b_state)); 2949 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 2950 ASSERT0(refcount_count(&hdr->b_refcnt)); 2951 ASSERT(hdr->b_buf == NULL); 2952 2953 /* if this is a prefetch, we don't have a reference */ 2954 if (*arc_flags & ARC_PREFETCH) 2955 hdr->b_flags |= ARC_PREFETCH; 2956 else 2957 add_reference(hdr, hash_lock, private); 2958 if (*arc_flags & ARC_L2CACHE) 2959 hdr->b_flags |= ARC_L2CACHE; 2960 if (*arc_flags & ARC_L2COMPRESS) 2961 hdr->b_flags |= ARC_L2COMPRESS; 2962 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 2963 buf->b_hdr = hdr; 2964 buf->b_data = NULL; 2965 buf->b_efunc = NULL; 2966 buf->b_private = NULL; 2967 buf->b_next = NULL; 2968 hdr->b_buf = buf; 2969 ASSERT(hdr->b_datacnt == 0); 2970 hdr->b_datacnt = 1; 2971 arc_get_data_buf(buf); 2972 arc_access(hdr, hash_lock); 2973 } 2974 2975 ASSERT(!GHOST_STATE(hdr->b_state)); 2976 2977 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); 2978 acb->acb_done = done; 2979 acb->acb_private = private; 2980 2981 ASSERT(hdr->b_acb == NULL); 2982 hdr->b_acb = acb; 2983 hdr->b_flags |= ARC_IO_IN_PROGRESS; 2984 2985 if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL && 2986 (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) { 2987 devw = hdr->b_l2hdr->b_dev->l2ad_writing; 2988 addr = hdr->b_l2hdr->b_daddr; 2989 /* 2990 * Lock out device removal. 
2991 */ 2992 if (vdev_is_dead(vd) || 2993 !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) 2994 vd = NULL; 2995 } 2996 2997 mutex_exit(hash_lock); 2998 2999 /* 3000 * At this point, we have a level 1 cache miss. Try again in 3001 * L2ARC if possible. 3002 */ 3003 ASSERT3U(hdr->b_size, ==, size); 3004 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, 3005 uint64_t, size, zbookmark_t *, zb); 3006 ARCSTAT_BUMP(arcstat_misses); 3007 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 3008 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 3009 data, metadata, misses); 3010 3011 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { 3012 /* 3013 * Read from the L2ARC if the following are true: 3014 * 1. The L2ARC vdev was previously cached. 3015 * 2. This buffer still has L2ARC metadata. 3016 * 3. This buffer isn't currently writing to the L2ARC. 3017 * 4. The L2ARC entry wasn't evicted, which may 3018 * also have invalidated the vdev. 3019 * 5. This isn't prefetch and l2arc_noprefetch is set. 3020 */ 3021 if (hdr->b_l2hdr != NULL && 3022 !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && 3023 !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { 3024 l2arc_read_callback_t *cb; 3025 3026 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); 3027 ARCSTAT_BUMP(arcstat_l2_hits); 3028 3029 cb = kmem_zalloc(sizeof (l2arc_read_callback_t), 3030 KM_SLEEP); 3031 cb->l2rcb_buf = buf; 3032 cb->l2rcb_spa = spa; 3033 cb->l2rcb_bp = *bp; 3034 cb->l2rcb_zb = *zb; 3035 cb->l2rcb_flags = zio_flags; 3036 cb->l2rcb_compress = hdr->b_l2hdr->b_compress; 3037 3038 ASSERT(addr >= VDEV_LABEL_START_SIZE && 3039 addr + size < vd->vdev_psize - 3040 VDEV_LABEL_END_SIZE); 3041 3042 /* 3043 * l2arc read. The SCL_L2ARC lock will be 3044 * released by l2arc_read_done(). 3045 * Issue a null zio if the underlying buffer 3046 * was squashed to zero size by compression. 
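 * (Such buffers carry ZIO_COMPRESS_EMPTY in their L2ARC header, as
 * checked just below; since nothing was actually written out for
 * them, the read completion path is expected to reconstruct the
 * zero-filled contents itself rather than touch the device.)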
3047 */ 3048 if (hdr->b_l2hdr->b_compress == 3049 ZIO_COMPRESS_EMPTY) { 3050 rzio = zio_null(pio, spa, vd, 3051 l2arc_read_done, cb, 3052 zio_flags | ZIO_FLAG_DONT_CACHE | 3053 ZIO_FLAG_CANFAIL | 3054 ZIO_FLAG_DONT_PROPAGATE | 3055 ZIO_FLAG_DONT_RETRY); 3056 } else { 3057 rzio = zio_read_phys(pio, vd, addr, 3058 hdr->b_l2hdr->b_asize, 3059 buf->b_data, ZIO_CHECKSUM_OFF, 3060 l2arc_read_done, cb, priority, 3061 zio_flags | ZIO_FLAG_DONT_CACHE | 3062 ZIO_FLAG_CANFAIL | 3063 ZIO_FLAG_DONT_PROPAGATE | 3064 ZIO_FLAG_DONT_RETRY, B_FALSE); 3065 } 3066 DTRACE_PROBE2(l2arc__read, vdev_t *, vd, 3067 zio_t *, rzio); 3068 ARCSTAT_INCR(arcstat_l2_read_bytes, 3069 hdr->b_l2hdr->b_asize); 3070 3071 if (*arc_flags & ARC_NOWAIT) { 3072 zio_nowait(rzio); 3073 return (0); 3074 } 3075 3076 ASSERT(*arc_flags & ARC_WAIT); 3077 if (zio_wait(rzio) == 0) 3078 return (0); 3079 3080 /* l2arc read error; goto zio_read() */ 3081 } else { 3082 DTRACE_PROBE1(l2arc__miss, 3083 arc_buf_hdr_t *, hdr); 3084 ARCSTAT_BUMP(arcstat_l2_misses); 3085 if (HDR_L2_WRITING(hdr)) 3086 ARCSTAT_BUMP(arcstat_l2_rw_clash); 3087 spa_config_exit(spa, SCL_L2ARC, vd); 3088 } 3089 } else { 3090 if (vd != NULL) 3091 spa_config_exit(spa, SCL_L2ARC, vd); 3092 if (l2arc_ndev != 0) { 3093 DTRACE_PROBE1(l2arc__miss, 3094 arc_buf_hdr_t *, hdr); 3095 ARCSTAT_BUMP(arcstat_l2_misses); 3096 } 3097 } 3098 3099 rzio = zio_read(pio, spa, bp, buf->b_data, size, 3100 arc_read_done, buf, priority, zio_flags, zb); 3101 3102 if (*arc_flags & ARC_WAIT) 3103 return (zio_wait(rzio)); 3104 3105 ASSERT(*arc_flags & ARC_NOWAIT); 3106 zio_nowait(rzio); 3107 } 3108 return (0); 3109 } 3110 3111 void 3112 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) 3113 { 3114 ASSERT(buf->b_hdr != NULL); 3115 ASSERT(buf->b_hdr->b_state != arc_anon); 3116 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL); 3117 ASSERT(buf->b_efunc == NULL); 3118 ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr)); 3119 3120 buf->b_efunc = func; 3121 buf->b_private = private; 3122 } 3123 3124 /* 3125 * Notify the arc that a block was freed, and thus will never be used again. 3126 */ 3127 void 3128 arc_freed(spa_t *spa, const blkptr_t *bp) 3129 { 3130 arc_buf_hdr_t *hdr; 3131 kmutex_t *hash_lock; 3132 uint64_t guid = spa_load_guid(spa); 3133 3134 hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), 3135 &hash_lock); 3136 if (hdr == NULL) 3137 return; 3138 if (HDR_BUF_AVAILABLE(hdr)) { 3139 arc_buf_t *buf = hdr->b_buf; 3140 add_reference(hdr, hash_lock, FTAG); 3141 hdr->b_flags &= ~ARC_BUF_AVAILABLE; 3142 mutex_exit(hash_lock); 3143 3144 arc_release(buf, FTAG); 3145 (void) arc_buf_remove_ref(buf, FTAG); 3146 } else { 3147 mutex_exit(hash_lock); 3148 } 3149 3150 } 3151 3152 /* 3153 * This is used by the DMU to let the ARC know that a buffer is 3154 * being evicted, so the ARC should clean up. If this arc buf 3155 * is not yet in the evicted state, it will be put there. 3156 */ 3157 int 3158 arc_buf_evict(arc_buf_t *buf) 3159 { 3160 arc_buf_hdr_t *hdr; 3161 kmutex_t *hash_lock; 3162 arc_buf_t **bufp; 3163 3164 mutex_enter(&buf->b_evict_lock); 3165 hdr = buf->b_hdr; 3166 if (hdr == NULL) { 3167 /* 3168 * We are in arc_do_user_evicts(). 3169 */ 3170 ASSERT(buf->b_data == NULL); 3171 mutex_exit(&buf->b_evict_lock); 3172 return (0); 3173 } else if (buf->b_data == NULL) { 3174 arc_buf_t copy = *buf; /* structure assignment */ 3175 /* 3176 * We are on the eviction list; process this buffer now 3177 * but let arc_do_user_evicts() do the reaping. 
3178 */
3179 buf->b_efunc = NULL;
3180 mutex_exit(&buf->b_evict_lock);
3181 VERIFY(copy.b_efunc(&copy) == 0);
3182 return (1);
3183 }
3184 hash_lock = HDR_LOCK(hdr);
3185 mutex_enter(hash_lock);
3186 hdr = buf->b_hdr;
3187 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3188
3189 ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
3190 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3191
3192 /*
3193 * Pull this buffer off of the hdr
3194 */
3195 bufp = &hdr->b_buf;
3196 while (*bufp != buf)
3197 bufp = &(*bufp)->b_next;
3198 *bufp = buf->b_next;
3199
3200 ASSERT(buf->b_data != NULL);
3201 arc_buf_destroy(buf, FALSE, FALSE);
3202
3203 if (hdr->b_datacnt == 0) {
3204 arc_state_t *old_state = hdr->b_state;
3205 arc_state_t *evicted_state;
3206
3207 ASSERT(hdr->b_buf == NULL);
3208 ASSERT(refcount_is_zero(&hdr->b_refcnt));
3209
3210 evicted_state =
3211 (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
3212
3213 mutex_enter(&old_state->arcs_mtx);
3214 mutex_enter(&evicted_state->arcs_mtx);
3215
3216 arc_change_state(evicted_state, hdr, hash_lock);
3217 ASSERT(HDR_IN_HASH_TABLE(hdr));
3218 hdr->b_flags |= ARC_IN_HASH_TABLE;
3219 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3220
3221 mutex_exit(&evicted_state->arcs_mtx);
3222 mutex_exit(&old_state->arcs_mtx);
3223 }
3224 mutex_exit(hash_lock);
3225 mutex_exit(&buf->b_evict_lock);
3226
3227 VERIFY(buf->b_efunc(buf) == 0);
3228 buf->b_efunc = NULL;
3229 buf->b_private = NULL;
3230 buf->b_hdr = NULL;
3231 buf->b_next = NULL;
3232 kmem_cache_free(buf_cache, buf);
3233 return (1);
3234 }
3235
3236 /*
3237 * Release this buffer from the cache, making it an anonymous buffer. This
3238 * must be done after a read and prior to modifying the buffer contents.
3239 * If the buffer has more than one reference, we must make
3240 * a new hdr for the buffer.
3241 */
3242 void
3243 arc_release(arc_buf_t *buf, void *tag)
3244 {
3245 arc_buf_hdr_t *hdr;
3246 kmutex_t *hash_lock = NULL;
3247 l2arc_buf_hdr_t *l2hdr;
3248 uint64_t buf_size;
3249
3250 /*
3251 * It would be nice to assert that if it's DMU metadata (level >
3252 * 0 || it's the dnode file), then it must be syncing context.
3253 * But we don't know that information at this level.
3254 */
3255
3256 mutex_enter(&buf->b_evict_lock);
3257 hdr = buf->b_hdr;
3258
3259 /* this buffer is not on any list */
3260 ASSERT(refcount_count(&hdr->b_refcnt) > 0);
3261
3262 if (hdr->b_state == arc_anon) {
3263 /* this buffer is already released */
3264 ASSERT(buf->b_efunc == NULL);
3265 } else {
3266 hash_lock = HDR_LOCK(hdr);
3267 mutex_enter(hash_lock);
3268 hdr = buf->b_hdr;
3269 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3270 }
3271
3272 l2hdr = hdr->b_l2hdr;
3273 if (l2hdr) {
3274 mutex_enter(&l2arc_buflist_mtx);
3275 hdr->b_l2hdr = NULL;
3276 }
3277 buf_size = hdr->b_size;
3278
3279 /*
3280 * Do we have more than one buf?
3281 */
3282 if (hdr->b_datacnt > 1) {
3283 arc_buf_hdr_t *nhdr;
3284 arc_buf_t **bufp;
3285 uint64_t blksz = hdr->b_size;
3286 uint64_t spa = hdr->b_spa;
3287 arc_buf_contents_t type = hdr->b_type;
3288 uint32_t flags = hdr->b_flags;
3289
3290 ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
3291 /*
3292 * Pull the data off of this hdr and attach it to
3293 * a new anonymous hdr.
3294 */ 3295 (void) remove_reference(hdr, hash_lock, tag); 3296 bufp = &hdr->b_buf; 3297 while (*bufp != buf) 3298 bufp = &(*bufp)->b_next; 3299 *bufp = buf->b_next; 3300 buf->b_next = NULL; 3301 3302 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size); 3303 atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size); 3304 if (refcount_is_zero(&hdr->b_refcnt)) { 3305 uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type]; 3306 ASSERT3U(*size, >=, hdr->b_size); 3307 atomic_add_64(size, -hdr->b_size); 3308 } 3309 3310 /* 3311 * We're releasing a duplicate user data buffer, update 3312 * our statistics accordingly. 3313 */ 3314 if (hdr->b_type == ARC_BUFC_DATA) { 3315 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); 3316 ARCSTAT_INCR(arcstat_duplicate_buffers_size, 3317 -hdr->b_size); 3318 } 3319 hdr->b_datacnt -= 1; 3320 arc_cksum_verify(buf); 3321 arc_buf_unwatch(buf); 3322 3323 mutex_exit(hash_lock); 3324 3325 nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); 3326 nhdr->b_size = blksz; 3327 nhdr->b_spa = spa; 3328 nhdr->b_type = type; 3329 nhdr->b_buf = buf; 3330 nhdr->b_state = arc_anon; 3331 nhdr->b_arc_access = 0; 3332 nhdr->b_flags = flags & ARC_L2_WRITING; 3333 nhdr->b_l2hdr = NULL; 3334 nhdr->b_datacnt = 1; 3335 nhdr->b_freeze_cksum = NULL; 3336 (void) refcount_add(&nhdr->b_refcnt, tag); 3337 buf->b_hdr = nhdr; 3338 mutex_exit(&buf->b_evict_lock); 3339 atomic_add_64(&arc_anon->arcs_size, blksz); 3340 } else { 3341 mutex_exit(&buf->b_evict_lock); 3342 ASSERT(refcount_count(&hdr->b_refcnt) == 1); 3343 ASSERT(!list_link_active(&hdr->b_arc_node)); 3344 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 3345 if (hdr->b_state != arc_anon) 3346 arc_change_state(arc_anon, hdr, hash_lock); 3347 hdr->b_arc_access = 0; 3348 if (hash_lock) 3349 mutex_exit(hash_lock); 3350 3351 buf_discard_identity(hdr); 3352 arc_buf_thaw(buf); 3353 } 3354 buf->b_efunc = NULL; 3355 buf->b_private = NULL; 3356 3357 if (l2hdr) { 3358 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); 3359 list_remove(l2hdr->b_dev->l2ad_buflist, hdr); 3360 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); 3361 ARCSTAT_INCR(arcstat_l2_size, -buf_size); 3362 mutex_exit(&l2arc_buflist_mtx); 3363 } 3364 } 3365 3366 int 3367 arc_released(arc_buf_t *buf) 3368 { 3369 int released; 3370 3371 mutex_enter(&buf->b_evict_lock); 3372 released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); 3373 mutex_exit(&buf->b_evict_lock); 3374 return (released); 3375 } 3376 3377 int 3378 arc_has_callback(arc_buf_t *buf) 3379 { 3380 int callback; 3381 3382 mutex_enter(&buf->b_evict_lock); 3383 callback = (buf->b_efunc != NULL); 3384 mutex_exit(&buf->b_evict_lock); 3385 return (callback); 3386 } 3387 3388 #ifdef ZFS_DEBUG 3389 int 3390 arc_referenced(arc_buf_t *buf) 3391 { 3392 int referenced; 3393 3394 mutex_enter(&buf->b_evict_lock); 3395 referenced = (refcount_count(&buf->b_hdr->b_refcnt)); 3396 mutex_exit(&buf->b_evict_lock); 3397 return (referenced); 3398 } 3399 #endif 3400 3401 static void 3402 arc_write_ready(zio_t *zio) 3403 { 3404 arc_write_callback_t *callback = zio->io_private; 3405 arc_buf_t *buf = callback->awcb_buf; 3406 arc_buf_hdr_t *hdr = buf->b_hdr; 3407 3408 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt)); 3409 callback->awcb_ready(zio, buf, callback->awcb_private); 3410 3411 /* 3412 * If the IO is already in progress, then this is a re-write 3413 * attempt, so we need to thaw and re-compute the cksum. 3414 * It is the responsibility of the callback to handle the 3415 * accounting for any re-write attempt. 
3416 */ 3417 if (HDR_IO_IN_PROGRESS(hdr)) { 3418 mutex_enter(&hdr->b_freeze_lock); 3419 if (hdr->b_freeze_cksum != NULL) { 3420 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 3421 hdr->b_freeze_cksum = NULL; 3422 } 3423 mutex_exit(&hdr->b_freeze_lock); 3424 } 3425 arc_cksum_compute(buf, B_FALSE); 3426 hdr->b_flags |= ARC_IO_IN_PROGRESS; 3427 } 3428 3429 static void 3430 arc_write_done(zio_t *zio) 3431 { 3432 arc_write_callback_t *callback = zio->io_private; 3433 arc_buf_t *buf = callback->awcb_buf; 3434 arc_buf_hdr_t *hdr = buf->b_hdr; 3435 3436 ASSERT(hdr->b_acb == NULL); 3437 3438 if (zio->io_error == 0) { 3439 hdr->b_dva = *BP_IDENTITY(zio->io_bp); 3440 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); 3441 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; 3442 } else { 3443 ASSERT(BUF_EMPTY(hdr)); 3444 } 3445 3446 /* 3447 * If the block to be written was all-zero, we may have 3448 * compressed it away. In this case no write was performed 3449 * so there will be no dva/birth/checksum. The buffer must 3450 * therefore remain anonymous (and uncached). 3451 */ 3452 if (!BUF_EMPTY(hdr)) { 3453 arc_buf_hdr_t *exists; 3454 kmutex_t *hash_lock; 3455 3456 ASSERT(zio->io_error == 0); 3457 3458 arc_cksum_verify(buf); 3459 3460 exists = buf_hash_insert(hdr, &hash_lock); 3461 if (exists) { 3462 /* 3463 * This can only happen if we overwrite for 3464 * sync-to-convergence, because we remove 3465 * buffers from the hash table when we arc_free(). 3466 */ 3467 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { 3468 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 3469 panic("bad overwrite, hdr=%p exists=%p", 3470 (void *)hdr, (void *)exists); 3471 ASSERT(refcount_is_zero(&exists->b_refcnt)); 3472 arc_change_state(arc_anon, exists, hash_lock); 3473 mutex_exit(hash_lock); 3474 arc_hdr_destroy(exists); 3475 exists = buf_hash_insert(hdr, &hash_lock); 3476 ASSERT3P(exists, ==, NULL); 3477 } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { 3478 /* nopwrite */ 3479 ASSERT(zio->io_prop.zp_nopwrite); 3480 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 3481 panic("bad nopwrite, hdr=%p exists=%p", 3482 (void *)hdr, (void *)exists); 3483 } else { 3484 /* Dedup */ 3485 ASSERT(hdr->b_datacnt == 1); 3486 ASSERT(hdr->b_state == arc_anon); 3487 ASSERT(BP_GET_DEDUP(zio->io_bp)); 3488 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); 3489 } 3490 } 3491 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 3492 /* if it's not anon, we are doing a scrub */ 3493 if (!exists && hdr->b_state == arc_anon) 3494 arc_access(hdr, hash_lock); 3495 mutex_exit(hash_lock); 3496 } else { 3497 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 3498 } 3499 3500 ASSERT(!refcount_is_zero(&hdr->b_refcnt)); 3501 callback->awcb_done(zio, buf, callback->awcb_private); 3502 3503 kmem_free(callback, sizeof (arc_write_callback_t)); 3504 } 3505 3506 zio_t * 3507 arc_write(zio_t *pio, spa_t *spa, uint64_t txg, 3508 blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, 3509 const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *done, 3510 void *private, int priority, int zio_flags, const zbookmark_t *zb) 3511 { 3512 arc_buf_hdr_t *hdr = buf->b_hdr; 3513 arc_write_callback_t *callback; 3514 zio_t *zio; 3515 3516 ASSERT(ready != NULL); 3517 ASSERT(done != NULL); 3518 ASSERT(!HDR_IO_ERROR(hdr)); 3519 ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0); 3520 ASSERT(hdr->b_acb == NULL); 3521 if (l2arc) 3522 hdr->b_flags |= ARC_L2CACHE; 3523 if (l2arc_compress) 3524 hdr->b_flags |= ARC_L2COMPRESS; 3525 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); 3526 
callback->awcb_ready = ready; 3527 callback->awcb_done = done; 3528 callback->awcb_private = private; 3529 callback->awcb_buf = buf; 3530 3531 zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp, 3532 arc_write_ready, arc_write_done, callback, priority, zio_flags, zb); 3533 3534 return (zio); 3535 } 3536 3537 static int 3538 arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg) 3539 { 3540 #ifdef _KERNEL 3541 uint64_t available_memory = ptob(freemem); 3542 static uint64_t page_load = 0; 3543 static uint64_t last_txg = 0; 3544 3545 #if defined(__i386) 3546 available_memory = 3547 MIN(available_memory, vmem_size(heap_arena, VMEM_FREE)); 3548 #endif 3549 if (available_memory >= zfs_write_limit_max) 3550 return (0); 3551 3552 if (txg > last_txg) { 3553 last_txg = txg; 3554 page_load = 0; 3555 } 3556 /* 3557 * If we are in pageout, we know that memory is already tight, 3558 * the arc is already going to be evicting, so we just want to 3559 * continue to let page writes occur as quickly as possible. 3560 */ 3561 if (curproc == proc_pageout) { 3562 if (page_load > MAX(ptob(minfree), available_memory) / 4) 3563 return (SET_ERROR(ERESTART)); 3564 /* Note: reserve is inflated, so we deflate */ 3565 page_load += reserve / 8; 3566 return (0); 3567 } else if (page_load > 0 && arc_reclaim_needed()) { 3568 /* memory is low, delay before restarting */ 3569 ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 3570 return (SET_ERROR(EAGAIN)); 3571 } 3572 page_load = 0; 3573 3574 if (arc_size > arc_c_min) { 3575 uint64_t evictable_memory = 3576 arc_mru->arcs_lsize[ARC_BUFC_DATA] + 3577 arc_mru->arcs_lsize[ARC_BUFC_METADATA] + 3578 arc_mfu->arcs_lsize[ARC_BUFC_DATA] + 3579 arc_mfu->arcs_lsize[ARC_BUFC_METADATA]; 3580 available_memory += MIN(evictable_memory, arc_size - arc_c_min); 3581 } 3582 3583 if (inflight_data > available_memory / 4) { 3584 ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 3585 return (SET_ERROR(ERESTART)); 3586 } 3587 #endif 3588 return (0); 3589 } 3590 3591 void 3592 arc_tempreserve_clear(uint64_t reserve) 3593 { 3594 atomic_add_64(&arc_tempreserve, -reserve); 3595 ASSERT((int64_t)arc_tempreserve >= 0); 3596 } 3597 3598 int 3599 arc_tempreserve_space(uint64_t reserve, uint64_t txg) 3600 { 3601 int error; 3602 uint64_t anon_size; 3603 3604 #ifdef ZFS_DEBUG 3605 /* 3606 * Once in a while, fail for no reason. Everything should cope. 3607 */ 3608 if (spa_get_random(10000) == 0) { 3609 dprintf("forcing random failure\n"); 3610 return (SET_ERROR(ERESTART)); 3611 } 3612 #endif 3613 if (reserve > arc_c/4 && !arc_no_grow) 3614 arc_c = MIN(arc_c_max, reserve * 4); 3615 if (reserve > arc_c) 3616 return (SET_ERROR(ENOMEM)); 3617 3618 /* 3619 * Don't count loaned bufs as in flight dirty data to prevent long 3620 * network delays from blocking transactions that are ready to be 3621 * assigned to a txg. 3622 */ 3623 anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0); 3624 3625 /* 3626 * Writes will, almost always, require additional memory allocations 3627 * in order to compress/encrypt/etc the data. We therefore need to 3628 * make sure that there is sufficient available memory for this. 3629 */ 3630 if (error = arc_memory_throttle(reserve, anon_size, txg)) 3631 return (error); 3632 3633 /* 3634 * Throttle writes when the amount of dirty data in the cache 3635 * gets too large. We try to keep the cache less than half full 3636 * of dirty blocks so that our sync times don't grow too large. 
3637 * Note: if two requests come in concurrently, we might let them 3638 * both succeed, when one of them should fail. Not a huge deal. 3639 */ 3640 3641 if (reserve + arc_tempreserve + anon_size > arc_c / 2 && 3642 anon_size > arc_c / 4) { 3643 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " 3644 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", 3645 arc_tempreserve>>10, 3646 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, 3647 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, 3648 reserve>>10, arc_c>>10); 3649 return (SET_ERROR(ERESTART)); 3650 } 3651 atomic_add_64(&arc_tempreserve, reserve); 3652 return (0); 3653 } 3654 3655 void 3656 arc_init(void) 3657 { 3658 mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); 3659 cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); 3660 3661 /* Convert seconds to clock ticks */ 3662 arc_min_prefetch_lifespan = 1 * hz; 3663 3664 /* Start out with 1/8 of all memory */ 3665 arc_c = physmem * PAGESIZE / 8; 3666 3667 #ifdef _KERNEL 3668 /* 3669 * On architectures where the physical memory can be larger 3670 * than the addressable space (intel in 32-bit mode), we may 3671 * need to limit the cache to 1/8 of VM size. 3672 */ 3673 arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); 3674 #endif 3675 3676 /* set min cache to 1/32 of all memory, or 64MB, whichever is more */ 3677 arc_c_min = MAX(arc_c / 4, 64<<20); 3678 /* set max to 3/4 of all memory, or all but 1GB, whichever is more */ 3679 if (arc_c * 8 >= 1<<30) 3680 arc_c_max = (arc_c * 8) - (1<<30); 3681 else 3682 arc_c_max = arc_c_min; 3683 arc_c_max = MAX(arc_c * 6, arc_c_max); 3684 3685 /* 3686 * Allow the tunables to override our calculations if they are 3687 * reasonable (ie. over 64MB) 3688 */ 3689 if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE) 3690 arc_c_max = zfs_arc_max; 3691 if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max) 3692 arc_c_min = zfs_arc_min; 3693 3694 arc_c = arc_c_max; 3695 arc_p = (arc_c >> 1); 3696 3697 /* limit meta-data to 1/4 of the arc capacity */ 3698 arc_meta_limit = arc_c_max / 4; 3699 3700 /* Allow the tunable to override if it is reasonable */ 3701 if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) 3702 arc_meta_limit = zfs_arc_meta_limit; 3703 3704 if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) 3705 arc_c_min = arc_meta_limit / 2; 3706 3707 if (zfs_arc_grow_retry > 0) 3708 arc_grow_retry = zfs_arc_grow_retry; 3709 3710 if (zfs_arc_shrink_shift > 0) 3711 arc_shrink_shift = zfs_arc_shrink_shift; 3712 3713 if (zfs_arc_p_min_shift > 0) 3714 arc_p_min_shift = zfs_arc_p_min_shift; 3715 3716 /* if kmem_flags are set, lets try to use less memory */ 3717 if (kmem_debugging()) 3718 arc_c = arc_c / 2; 3719 if (arc_c < arc_c_min) 3720 arc_c = arc_c_min; 3721 3722 arc_anon = &ARC_anon; 3723 arc_mru = &ARC_mru; 3724 arc_mru_ghost = &ARC_mru_ghost; 3725 arc_mfu = &ARC_mfu; 3726 arc_mfu_ghost = &ARC_mfu_ghost; 3727 arc_l2c_only = &ARC_l2c_only; 3728 arc_size = 0; 3729 3730 mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 3731 mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 3732 mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 3733 mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 3734 mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 3735 mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 3736 3737 list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], 3738 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3739 
list_create(&arc_mru->arcs_list[ARC_BUFC_DATA], 3740 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3741 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], 3742 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3743 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], 3744 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3745 list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], 3746 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3747 list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], 3748 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3749 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], 3750 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3751 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], 3752 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3753 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], 3754 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3755 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], 3756 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3757 3758 buf_init(); 3759 3760 arc_thread_exit = 0; 3761 arc_eviction_list = NULL; 3762 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); 3763 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); 3764 3765 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, 3766 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 3767 3768 if (arc_ksp != NULL) { 3769 arc_ksp->ks_data = &arc_stats; 3770 kstat_install(arc_ksp); 3771 } 3772 3773 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, 3774 TS_RUN, minclsyspri); 3775 3776 arc_dead = FALSE; 3777 arc_warm = B_FALSE; 3778 3779 if (zfs_write_limit_max == 0) 3780 zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift; 3781 else 3782 zfs_write_limit_shift = 0; 3783 mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL); 3784 } 3785 3786 void 3787 arc_fini(void) 3788 { 3789 mutex_enter(&arc_reclaim_thr_lock); 3790 arc_thread_exit = 1; 3791 while (arc_thread_exit != 0) 3792 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); 3793 mutex_exit(&arc_reclaim_thr_lock); 3794 3795 arc_flush(NULL); 3796 3797 arc_dead = TRUE; 3798 3799 if (arc_ksp != NULL) { 3800 kstat_delete(arc_ksp); 3801 arc_ksp = NULL; 3802 } 3803 3804 mutex_destroy(&arc_eviction_mtx); 3805 mutex_destroy(&arc_reclaim_thr_lock); 3806 cv_destroy(&arc_reclaim_thr_cv); 3807 3808 list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); 3809 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); 3810 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); 3811 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); 3812 list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); 3813 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); 3814 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); 3815 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); 3816 3817 mutex_destroy(&arc_anon->arcs_mtx); 3818 mutex_destroy(&arc_mru->arcs_mtx); 3819 mutex_destroy(&arc_mru_ghost->arcs_mtx); 3820 mutex_destroy(&arc_mfu->arcs_mtx); 3821 mutex_destroy(&arc_mfu_ghost->arcs_mtx); 3822 mutex_destroy(&arc_l2c_only->arcs_mtx); 3823 3824 mutex_destroy(&zfs_write_limit_lock); 3825 3826 buf_fini(); 3827 3828 ASSERT(arc_loaned_bytes == 0); 3829 } 3830 3831 /* 3832 * Level 2 ARC 3833 * 3834 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. 
3835 * It uses dedicated storage devices to hold cached data, which are populated 3836 * using large infrequent writes. The main role of this cache is to boost 3837 * the performance of random read workloads. The intended L2ARC devices 3838 * include short-stroked disks, solid state disks, and other media with 3839 * substantially faster read latency than disk. 3840 * 3841 * +-----------------------+ 3842 * | ARC | 3843 * +-----------------------+ 3844 * | ^ ^ 3845 * | | | 3846 * l2arc_feed_thread() arc_read() 3847 * | | | 3848 * | l2arc read | 3849 * V | | 3850 * +---------------+ | 3851 * | L2ARC | | 3852 * +---------------+ | 3853 * | ^ | 3854 * l2arc_write() | | 3855 * | | | 3856 * V | | 3857 * +-------+ +-------+ 3858 * | vdev | | vdev | 3859 * | cache | | cache | 3860 * +-------+ +-------+ 3861 * +=========+ .-----. 3862 * : L2ARC : |-_____-| 3863 * : devices : | Disks | 3864 * +=========+ `-_____-' 3865 * 3866 * Read requests are satisfied from the following sources, in order: 3867 * 3868 * 1) ARC 3869 * 2) vdev cache of L2ARC devices 3870 * 3) L2ARC devices 3871 * 4) vdev cache of disks 3872 * 5) disks 3873 * 3874 * Some L2ARC device types exhibit extremely slow write performance. 3875 * To accommodate for this there are some significant differences between 3876 * the L2ARC and traditional cache design: 3877 * 3878 * 1. There is no eviction path from the ARC to the L2ARC. Evictions from 3879 * the ARC behave as usual, freeing buffers and placing headers on ghost 3880 * lists. The ARC does not send buffers to the L2ARC during eviction as 3881 * this would add inflated write latencies for all ARC memory pressure. 3882 * 3883 * 2. The L2ARC attempts to cache data from the ARC before it is evicted. 3884 * It does this by periodically scanning buffers from the eviction-end of 3885 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are 3886 * not already there. It scans until a headroom of buffers is satisfied, 3887 * which itself is a buffer for ARC eviction. If a compressible buffer is 3888 * found during scanning and selected for writing to an L2ARC device, we 3889 * temporarily boost scanning headroom during the next scan cycle to make 3890 * sure we adapt to compression effects (which might significantly reduce 3891 * the data volume we write to L2ARC). The thread that does this is 3892 * l2arc_feed_thread(), illustrated below; example sizes are included to 3893 * provide a better sense of ratio than this diagram: 3894 * 3895 * head --> tail 3896 * +---------------------+----------+ 3897 * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC 3898 * +---------------------+----------+ | o L2ARC eligible 3899 * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer 3900 * +---------------------+----------+ | 3901 * 15.9 Gbytes ^ 32 Mbytes | 3902 * headroom | 3903 * l2arc_feed_thread() 3904 * | 3905 * l2arc write hand <--[oooo]--' 3906 * | 8 Mbyte 3907 * | write max 3908 * V 3909 * +==============================+ 3910 * L2ARC dev |####|#|###|###| |####| ... | 3911 * +==============================+ 3912 * 32 Gbytes 3913 * 3914 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of 3915 * evicted, then the L2ARC has cached a buffer much sooner than it probably 3916 * needed to, potentially wasting L2ARC device bandwidth and storage. It is 3917 * safe to say that this is an uncommon case, since buffers at the end of 3918 * the ARC lists have moved there due to inactivity. 3919 * 3920 * 4. 
If the ARC evicts faster than the L2ARC can maintain a headroom, 3921 * then the L2ARC simply misses copying some buffers. This serves as a 3922 * pressure valve to prevent heavy read workloads from both stalling the ARC 3923 * with waits and clogging the L2ARC with writes. This also helps prevent 3924 * the potential for the L2ARC to churn if it attempts to cache content too 3925 * quickly, such as during backups of the entire pool. 3926 * 3927 * 5. After system boot and before the ARC has filled main memory, there are 3928 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru 3929 * lists can remain mostly static. Instead of searching from tail of these 3930 * lists as pictured, the l2arc_feed_thread() will search from the list heads 3931 * for eligible buffers, greatly increasing its chance of finding them. 3932 * 3933 * The L2ARC device write speed is also boosted during this time so that 3934 * the L2ARC warms up faster. Since there have been no ARC evictions yet, 3935 * there are no L2ARC reads, and no fear of degrading read performance 3936 * through increased writes. 3937 * 3938 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that 3939 * the vdev queue can aggregate them into larger and fewer writes. Each 3940 * device is written to in a rotor fashion, sweeping writes through 3941 * available space then repeating. 3942 * 3943 * 7. The L2ARC does not store dirty content. It never needs to flush 3944 * write buffers back to disk based storage. 3945 * 3946 * 8. If an ARC buffer is written (and dirtied) which also exists in the 3947 * L2ARC, the now stale L2ARC buffer is immediately dropped. 3948 * 3949 * The performance of the L2ARC can be tweaked by a number of tunables, which 3950 * may be necessary for different workloads: 3951 * 3952 * l2arc_write_max max write bytes per interval 3953 * l2arc_write_boost extra write bytes during device warmup 3954 * l2arc_noprefetch skip caching prefetched buffers 3955 * l2arc_headroom number of max device writes to precache 3956 * l2arc_headroom_boost when we find compressed buffers during ARC 3957 * scanning, we multiply headroom by this 3958 * percentage factor for the next scan cycle, 3959 * since more compressed buffers are likely to 3960 * be present 3961 * l2arc_feed_secs seconds between L2ARC writing 3962 * 3963 * Tunables may be removed or added as future performance improvements are 3964 * integrated, and also may become zpool properties. 3965 * 3966 * There are three key functions that control how the L2ARC warms up: 3967 * 3968 * l2arc_write_eligible() check if a buffer is eligible to cache 3969 * l2arc_write_size() calculate how much to write 3970 * l2arc_write_interval() calculate sleep delay between writes 3971 * 3972 * These three functions determine what to write, how much, and how quickly 3973 * to send writes. 3974 */ 3975 3976 static boolean_t 3977 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab) 3978 { 3979 /* 3980 * A buffer is *not* eligible for the L2ARC if it: 3981 * 1. belongs to a different spa. 3982 * 2. is already cached on the L2ARC. 3983 * 3. has an I/O in progress (it may be an incomplete read). 3984 * 4. is flagged not eligible (zfs property). 
3985 */ 3986 if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL || 3987 HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab)) 3988 return (B_FALSE); 3989 3990 return (B_TRUE); 3991 } 3992 3993 static uint64_t 3994 l2arc_write_size(void) 3995 { 3996 uint64_t size; 3997 3998 /* 3999 * Make sure our globals have meaningful values in case the user 4000 * altered them. 4001 */ 4002 size = l2arc_write_max; 4003 if (size == 0) { 4004 cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " 4005 "be greater than zero, resetting it to the default (%d)", 4006 L2ARC_WRITE_SIZE); 4007 size = l2arc_write_max = L2ARC_WRITE_SIZE; 4008 } 4009 4010 if (arc_warm == B_FALSE) 4011 size += l2arc_write_boost; 4012 4013 return (size); 4014 4015 } 4016 4017 static clock_t 4018 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) 4019 { 4020 clock_t interval, next, now; 4021 4022 /* 4023 * If the ARC lists are busy, increase our write rate; if the 4024 * lists are stale, idle back. This is achieved by checking 4025 * how much we previously wrote - if it was more than half of 4026 * what we wanted, schedule the next write much sooner. 4027 */ 4028 if (l2arc_feed_again && wrote > (wanted / 2)) 4029 interval = (hz * l2arc_feed_min_ms) / 1000; 4030 else 4031 interval = hz * l2arc_feed_secs; 4032 4033 now = ddi_get_lbolt(); 4034 next = MAX(now, MIN(now + interval, began + interval)); 4035 4036 return (next); 4037 } 4038 4039 static void 4040 l2arc_hdr_stat_add(void) 4041 { 4042 ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE); 4043 ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); 4044 } 4045 4046 static void 4047 l2arc_hdr_stat_remove(void) 4048 { 4049 ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE)); 4050 ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); 4051 } 4052 4053 /* 4054 * Cycle through L2ARC devices. This is how L2ARC load balances. 4055 * If a device is returned, this also returns holding the spa config lock. 4056 */ 4057 static l2arc_dev_t * 4058 l2arc_dev_get_next(void) 4059 { 4060 l2arc_dev_t *first, *next = NULL; 4061 4062 /* 4063 * Lock out the removal of spas (spa_namespace_lock), then removal 4064 * of cache devices (l2arc_dev_mtx). Once a device has been selected, 4065 * both locks will be dropped and a spa config lock held instead. 4066 */ 4067 mutex_enter(&spa_namespace_lock); 4068 mutex_enter(&l2arc_dev_mtx); 4069 4070 /* if there are no vdevs, there is nothing to do */ 4071 if (l2arc_ndev == 0) 4072 goto out; 4073 4074 first = NULL; 4075 next = l2arc_dev_last; 4076 do { 4077 /* loop around the list looking for a non-faulted vdev */ 4078 if (next == NULL) { 4079 next = list_head(l2arc_dev_list); 4080 } else { 4081 next = list_next(l2arc_dev_list, next); 4082 if (next == NULL) 4083 next = list_head(l2arc_dev_list); 4084 } 4085 4086 /* if we have come back to the start, bail out */ 4087 if (first == NULL) 4088 first = next; 4089 else if (next == first) 4090 break; 4091 4092 } while (vdev_is_dead(next->l2ad_vdev)); 4093 4094 /* if we were unable to find any usable vdevs, return NULL */ 4095 if (vdev_is_dead(next->l2ad_vdev)) 4096 next = NULL; 4097 4098 l2arc_dev_last = next; 4099 4100 out: 4101 mutex_exit(&l2arc_dev_mtx); 4102 4103 /* 4104 * Grab the config lock to prevent the 'next' device from being 4105 * removed while we are writing to it. 4106 */ 4107 if (next != NULL) 4108 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); 4109 mutex_exit(&spa_namespace_lock); 4110 4111 return (next); 4112 } 4113 4114 /* 4115 * Free buffers that were tagged for destruction. 
4116 */ 4117 static void 4118 l2arc_do_free_on_write() 4119 { 4120 list_t *buflist; 4121 l2arc_data_free_t *df, *df_prev; 4122 4123 mutex_enter(&l2arc_free_on_write_mtx); 4124 buflist = l2arc_free_on_write; 4125 4126 for (df = list_tail(buflist); df; df = df_prev) { 4127 df_prev = list_prev(buflist, df); 4128 ASSERT(df->l2df_data != NULL); 4129 ASSERT(df->l2df_func != NULL); 4130 df->l2df_func(df->l2df_data, df->l2df_size); 4131 list_remove(buflist, df); 4132 kmem_free(df, sizeof (l2arc_data_free_t)); 4133 } 4134 4135 mutex_exit(&l2arc_free_on_write_mtx); 4136 } 4137 4138 /* 4139 * A write to a cache device has completed. Update all headers to allow 4140 * reads from these buffers to begin. 4141 */ 4142 static void 4143 l2arc_write_done(zio_t *zio) 4144 { 4145 l2arc_write_callback_t *cb; 4146 l2arc_dev_t *dev; 4147 list_t *buflist; 4148 arc_buf_hdr_t *head, *ab, *ab_prev; 4149 l2arc_buf_hdr_t *l2hdr; 4150 kmutex_t *hash_lock; 4151 4152 cb = zio->io_private; 4153 ASSERT(cb != NULL); 4154 dev = cb->l2wcb_dev; 4155 ASSERT(dev != NULL); 4156 head = cb->l2wcb_head; 4157 ASSERT(head != NULL); 4158 buflist = dev->l2ad_buflist; 4159 ASSERT(buflist != NULL); 4160 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, 4161 l2arc_write_callback_t *, cb); 4162 4163 if (zio->io_error != 0) 4164 ARCSTAT_BUMP(arcstat_l2_writes_error); 4165 4166 mutex_enter(&l2arc_buflist_mtx); 4167 4168 /* 4169 * All writes completed, or an error was hit. 4170 */ 4171 dev->l2ad_writing = B_FALSE; 4172 for (ab = list_prev(buflist, head); ab; ab = ab_prev) { 4173 ab_prev = list_prev(buflist, ab); 4174 4175 hash_lock = HDR_LOCK(ab); 4176 mutex_enter(hash_lock); 4177 4178 l2hdr = ab->b_l2hdr; 4179 4180 /* 4181 * Release the temporary compressed buffer as soon as possible. 4182 */ 4183 if (l2hdr->b_compress != ZIO_COMPRESS_OFF) 4184 l2arc_release_cdata_buf(ab); 4185 4186 if (zio->io_error != 0) { 4187 /* 4188 * Error - drop L2ARC entry. 4189 */ 4190 list_remove(buflist, ab); 4191 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); 4192 ab->b_l2hdr = NULL; 4193 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); 4194 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); 4195 } 4196 4197 /* 4198 * Allow ARC to begin reads to this L2ARC entry. 4199 */ 4200 ab->b_flags &= ~ARC_L2_WRITING; 4201 4202 mutex_exit(hash_lock); 4203 } 4204 4205 atomic_inc_64(&l2arc_writes_done); 4206 list_remove(buflist, head); 4207 kmem_cache_free(hdr_cache, head); 4208 mutex_exit(&l2arc_buflist_mtx); 4209 4210 l2arc_do_free_on_write(); 4211 4212 kmem_free(cb, sizeof (l2arc_write_callback_t)); 4213 } 4214 4215 /* 4216 * A read to a cache device completed. Validate buffer contents before 4217 * handing over to the regular ARC routines. 4218 */ 4219 static void 4220 l2arc_read_done(zio_t *zio) 4221 { 4222 l2arc_read_callback_t *cb; 4223 arc_buf_hdr_t *hdr; 4224 arc_buf_t *buf; 4225 kmutex_t *hash_lock; 4226 int equal; 4227 4228 ASSERT(zio->io_vd != NULL); 4229 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); 4230 4231 spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); 4232 4233 cb = zio->io_private; 4234 ASSERT(cb != NULL); 4235 buf = cb->l2rcb_buf; 4236 ASSERT(buf != NULL); 4237 4238 hash_lock = HDR_LOCK(buf->b_hdr); 4239 mutex_enter(hash_lock); 4240 hdr = buf->b_hdr; 4241 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 4242 4243 /* 4244 * If the buffer was compressed, decompress it first. 
4245 */ 4246 if (cb->l2rcb_compress != ZIO_COMPRESS_OFF) 4247 l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress); 4248 ASSERT(zio->io_data != NULL); 4249 4250 /* 4251 * Check this survived the L2ARC journey. 4252 */ 4253 equal = arc_cksum_equal(buf); 4254 if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { 4255 mutex_exit(hash_lock); 4256 zio->io_private = buf; 4257 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ 4258 zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ 4259 arc_read_done(zio); 4260 } else { 4261 mutex_exit(hash_lock); 4262 /* 4263 * Buffer didn't survive caching. Increment stats and 4264 * reissue to the original storage device. 4265 */ 4266 if (zio->io_error != 0) { 4267 ARCSTAT_BUMP(arcstat_l2_io_error); 4268 } else { 4269 zio->io_error = SET_ERROR(EIO); 4270 } 4271 if (!equal) 4272 ARCSTAT_BUMP(arcstat_l2_cksum_bad); 4273 4274 /* 4275 * If there's no waiter, issue an async i/o to the primary 4276 * storage now. If there *is* a waiter, the caller must 4277 * issue the i/o in a context where it's OK to block. 4278 */ 4279 if (zio->io_waiter == NULL) { 4280 zio_t *pio = zio_unique_parent(zio); 4281 4282 ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); 4283 4284 zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp, 4285 buf->b_data, zio->io_size, arc_read_done, buf, 4286 zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); 4287 } 4288 } 4289 4290 kmem_free(cb, sizeof (l2arc_read_callback_t)); 4291 } 4292 4293 /* 4294 * This is the list priority from which the L2ARC will search for pages to 4295 * cache. This is used within loops (0..3) to cycle through lists in the 4296 * desired order. This order can have a significant effect on cache 4297 * performance. 4298 * 4299 * Currently the metadata lists are hit first, MFU then MRU, followed by 4300 * the data lists. This function returns a locked list, and also returns 4301 * the lock pointer. 4302 */ 4303 static list_t * 4304 l2arc_list_locked(int list_num, kmutex_t **lock) 4305 { 4306 list_t *list = NULL; 4307 4308 ASSERT(list_num >= 0 && list_num <= 3); 4309 4310 switch (list_num) { 4311 case 0: 4312 list = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; 4313 *lock = &arc_mfu->arcs_mtx; 4314 break; 4315 case 1: 4316 list = &arc_mru->arcs_list[ARC_BUFC_METADATA]; 4317 *lock = &arc_mru->arcs_mtx; 4318 break; 4319 case 2: 4320 list = &arc_mfu->arcs_list[ARC_BUFC_DATA]; 4321 *lock = &arc_mfu->arcs_mtx; 4322 break; 4323 case 3: 4324 list = &arc_mru->arcs_list[ARC_BUFC_DATA]; 4325 *lock = &arc_mru->arcs_mtx; 4326 break; 4327 } 4328 4329 ASSERT(!(MUTEX_HELD(*lock))); 4330 mutex_enter(*lock); 4331 return (list); 4332 } 4333 4334 /* 4335 * Evict buffers from the device write hand to the distance specified in 4336 * bytes. This distance may span populated buffers, it may span nothing. 4337 * This is clearing a region on the L2ARC device ready for writing. 4338 * If the 'all' boolean is set, every buffer is evicted. 4339 */ 4340 static void 4341 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) 4342 { 4343 list_t *buflist; 4344 l2arc_buf_hdr_t *l2hdr; 4345 arc_buf_hdr_t *ab, *ab_prev; 4346 kmutex_t *hash_lock; 4347 uint64_t taddr; 4348 4349 buflist = dev->l2ad_buflist; 4350 4351 if (buflist == NULL) 4352 return; 4353 4354 if (!all && dev->l2ad_first) { 4355 /* 4356 * This is the first sweep through the device. There is 4357 * nothing to evict. 
4358 */ 4359 return; 4360 } 4361 4362 if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { 4363 /* 4364 * When nearing the end of the device, evict to the end 4365 * before the device write hand jumps to the start. 4366 */ 4367 taddr = dev->l2ad_end; 4368 } else { 4369 taddr = dev->l2ad_hand + distance; 4370 } 4371 DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, 4372 uint64_t, taddr, boolean_t, all); 4373 4374 top: 4375 mutex_enter(&l2arc_buflist_mtx); 4376 for (ab = list_tail(buflist); ab; ab = ab_prev) { 4377 ab_prev = list_prev(buflist, ab); 4378 4379 hash_lock = HDR_LOCK(ab); 4380 if (!mutex_tryenter(hash_lock)) { 4381 /* 4382 * Missed the hash lock. Retry. 4383 */ 4384 ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); 4385 mutex_exit(&l2arc_buflist_mtx); 4386 mutex_enter(hash_lock); 4387 mutex_exit(hash_lock); 4388 goto top; 4389 } 4390 4391 if (HDR_L2_WRITE_HEAD(ab)) { 4392 /* 4393 * We hit a write head node. Leave it for 4394 * l2arc_write_done(). 4395 */ 4396 list_remove(buflist, ab); 4397 mutex_exit(hash_lock); 4398 continue; 4399 } 4400 4401 if (!all && ab->b_l2hdr != NULL && 4402 (ab->b_l2hdr->b_daddr > taddr || 4403 ab->b_l2hdr->b_daddr < dev->l2ad_hand)) { 4404 /* 4405 * We've evicted to the target address, 4406 * or the end of the device. 4407 */ 4408 mutex_exit(hash_lock); 4409 break; 4410 } 4411 4412 if (HDR_FREE_IN_PROGRESS(ab)) { 4413 /* 4414 * Already on the path to destruction. 4415 */ 4416 mutex_exit(hash_lock); 4417 continue; 4418 } 4419 4420 if (ab->b_state == arc_l2c_only) { 4421 ASSERT(!HDR_L2_READING(ab)); 4422 /* 4423 * This doesn't exist in the ARC. Destroy. 4424 * arc_hdr_destroy() will call list_remove() 4425 * and decrement arcstat_l2_size. 4426 */ 4427 arc_change_state(arc_anon, ab, hash_lock); 4428 arc_hdr_destroy(ab); 4429 } else { 4430 /* 4431 * Invalidate issued or about to be issued 4432 * reads, since we may be about to write 4433 * over this location. 4434 */ 4435 if (HDR_L2_READING(ab)) { 4436 ARCSTAT_BUMP(arcstat_l2_evict_reading); 4437 ab->b_flags |= ARC_L2_EVICTED; 4438 } 4439 4440 /* 4441 * Tell ARC this no longer exists in L2ARC. 4442 */ 4443 if (ab->b_l2hdr != NULL) { 4444 l2hdr = ab->b_l2hdr; 4445 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); 4446 ab->b_l2hdr = NULL; 4447 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); 4448 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); 4449 } 4450 list_remove(buflist, ab); 4451 4452 /* 4453 * This may have been leftover after a 4454 * failed write. 4455 */ 4456 ab->b_flags &= ~ARC_L2_WRITING; 4457 } 4458 mutex_exit(hash_lock); 4459 } 4460 mutex_exit(&l2arc_buflist_mtx); 4461 4462 vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0); 4463 dev->l2ad_evict = taddr; 4464 } 4465 4466 /* 4467 * Find and write ARC buffers to the L2ARC device. 4468 * 4469 * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid 4470 * for reading until they have completed writing. 4471 * The headroom_boost is an in-out parameter used to maintain headroom boost 4472 * state between calls to this function. 4473 * 4474 * Returns the number of bytes actually written (which may be smaller than 4475 * the delta by which the device hand has changed due to alignment). 
4476 */ 4477 static uint64_t 4478 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, 4479 boolean_t *headroom_boost) 4480 { 4481 arc_buf_hdr_t *ab, *ab_prev, *head; 4482 list_t *list; 4483 uint64_t write_asize, write_psize, write_sz, headroom, 4484 buf_compress_minsz; 4485 void *buf_data; 4486 kmutex_t *list_lock; 4487 boolean_t full; 4488 l2arc_write_callback_t *cb; 4489 zio_t *pio, *wzio; 4490 uint64_t guid = spa_load_guid(spa); 4491 const boolean_t do_headroom_boost = *headroom_boost; 4492 4493 ASSERT(dev->l2ad_vdev != NULL); 4494 4495 /* Lower the flag now, we might want to raise it again later. */ 4496 *headroom_boost = B_FALSE; 4497 4498 pio = NULL; 4499 write_sz = write_asize = write_psize = 0; 4500 full = B_FALSE; 4501 head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); 4502 head->b_flags |= ARC_L2_WRITE_HEAD; 4503 4504 /* 4505 * We will want to try to compress buffers that are at least 2x the 4506 * device sector size. 4507 */ 4508 buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift; 4509 4510 /* 4511 * Copy buffers for L2ARC writing. 4512 */ 4513 mutex_enter(&l2arc_buflist_mtx); 4514 for (int try = 0; try <= 3; try++) { 4515 uint64_t passed_sz = 0; 4516 4517 list = l2arc_list_locked(try, &list_lock); 4518 4519 /* 4520 * L2ARC fast warmup. 4521 * 4522 * Until the ARC is warm and starts to evict, read from the 4523 * head of the ARC lists rather than the tail. 4524 */ 4525 if (arc_warm == B_FALSE) 4526 ab = list_head(list); 4527 else 4528 ab = list_tail(list); 4529 4530 headroom = target_sz * l2arc_headroom; 4531 if (do_headroom_boost) 4532 headroom = (headroom * l2arc_headroom_boost) / 100; 4533 4534 for (; ab; ab = ab_prev) { 4535 l2arc_buf_hdr_t *l2hdr; 4536 kmutex_t *hash_lock; 4537 uint64_t buf_sz; 4538 4539 if (arc_warm == B_FALSE) 4540 ab_prev = list_next(list, ab); 4541 else 4542 ab_prev = list_prev(list, ab); 4543 4544 hash_lock = HDR_LOCK(ab); 4545 if (!mutex_tryenter(hash_lock)) { 4546 /* 4547 * Skip this buffer rather than waiting. 4548 */ 4549 continue; 4550 } 4551 4552 passed_sz += ab->b_size; 4553 if (passed_sz > headroom) { 4554 /* 4555 * Searched too far. 4556 */ 4557 mutex_exit(hash_lock); 4558 break; 4559 } 4560 4561 if (!l2arc_write_eligible(guid, ab)) { 4562 mutex_exit(hash_lock); 4563 continue; 4564 } 4565 4566 if ((write_sz + ab->b_size) > target_sz) { 4567 full = B_TRUE; 4568 mutex_exit(hash_lock); 4569 break; 4570 } 4571 4572 if (pio == NULL) { 4573 /* 4574 * Insert a dummy header on the buflist so 4575 * l2arc_write_done() can find where the 4576 * write buffers begin without searching. 4577 */ 4578 list_insert_head(dev->l2ad_buflist, head); 4579 4580 cb = kmem_alloc( 4581 sizeof (l2arc_write_callback_t), KM_SLEEP); 4582 cb->l2wcb_dev = dev; 4583 cb->l2wcb_head = head; 4584 pio = zio_root(spa, l2arc_write_done, cb, 4585 ZIO_FLAG_CANFAIL); 4586 } 4587 4588 /* 4589 * Create and add a new L2ARC header. 4590 */ 4591 l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP); 4592 l2hdr->b_dev = dev; 4593 ab->b_flags |= ARC_L2_WRITING; 4594 4595 /* 4596 * Temporarily stash the data buffer in b_tmp_cdata. 4597 * The subsequent write step will pick it up from 4598 * there. This is because can't access ab->b_buf 4599 * without holding the hash_lock, which we in turn 4600 * can't access without holding the ARC list locks 4601 * (which we want to avoid during compression/writing). 
4602 */ 4603 l2hdr->b_compress = ZIO_COMPRESS_OFF; 4604 l2hdr->b_asize = ab->b_size; 4605 l2hdr->b_tmp_cdata = ab->b_buf->b_data; 4606 4607 buf_sz = ab->b_size; 4608 ab->b_l2hdr = l2hdr; 4609 4610 list_insert_head(dev->l2ad_buflist, ab); 4611 4612 /* 4613 * Compute and store the buffer cksum before 4614 * writing. On debug the cksum is verified first. 4615 */ 4616 arc_cksum_verify(ab->b_buf); 4617 arc_cksum_compute(ab->b_buf, B_TRUE); 4618 4619 mutex_exit(hash_lock); 4620 4621 write_sz += buf_sz; 4622 } 4623 4624 mutex_exit(list_lock); 4625 4626 if (full == B_TRUE) 4627 break; 4628 } 4629 4630 /* No buffers selected for writing? */ 4631 if (pio == NULL) { 4632 ASSERT0(write_sz); 4633 mutex_exit(&l2arc_buflist_mtx); 4634 kmem_cache_free(hdr_cache, head); 4635 return (0); 4636 } 4637 4638 /* 4639 * Now start writing the buffers. We're starting at the write head 4640 * and work backwards, retracing the course of the buffer selector 4641 * loop above. 4642 */ 4643 for (ab = list_prev(dev->l2ad_buflist, head); ab; 4644 ab = list_prev(dev->l2ad_buflist, ab)) { 4645 l2arc_buf_hdr_t *l2hdr; 4646 uint64_t buf_sz; 4647 4648 /* 4649 * We shouldn't need to lock the buffer here, since we flagged 4650 * it as ARC_L2_WRITING in the previous step, but we must take 4651 * care to only access its L2 cache parameters. In particular, 4652 * ab->b_buf may be invalid by now due to ARC eviction. 4653 */ 4654 l2hdr = ab->b_l2hdr; 4655 l2hdr->b_daddr = dev->l2ad_hand; 4656 4657 if ((ab->b_flags & ARC_L2COMPRESS) && 4658 l2hdr->b_asize >= buf_compress_minsz) { 4659 if (l2arc_compress_buf(l2hdr)) { 4660 /* 4661 * If compression succeeded, enable headroom 4662 * boost on the next scan cycle. 4663 */ 4664 *headroom_boost = B_TRUE; 4665 } 4666 } 4667 4668 /* 4669 * Pick up the buffer data we had previously stashed away 4670 * (and now potentially also compressed). 4671 */ 4672 buf_data = l2hdr->b_tmp_cdata; 4673 buf_sz = l2hdr->b_asize; 4674 4675 /* Compression may have squashed the buffer to zero length. */ 4676 if (buf_sz != 0) { 4677 uint64_t buf_p_sz; 4678 4679 wzio = zio_write_phys(pio, dev->l2ad_vdev, 4680 dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF, 4681 NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, 4682 ZIO_FLAG_CANFAIL, B_FALSE); 4683 4684 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, 4685 zio_t *, wzio); 4686 (void) zio_nowait(wzio); 4687 4688 write_asize += buf_sz; 4689 /* 4690 * Keep the clock hand suitably device-aligned. 4691 */ 4692 buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); 4693 write_psize += buf_p_sz; 4694 dev->l2ad_hand += buf_p_sz; 4695 } 4696 } 4697 4698 mutex_exit(&l2arc_buflist_mtx); 4699 4700 ASSERT3U(write_asize, <=, target_sz); 4701 ARCSTAT_BUMP(arcstat_l2_writes_sent); 4702 ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize); 4703 ARCSTAT_INCR(arcstat_l2_size, write_sz); 4704 ARCSTAT_INCR(arcstat_l2_asize, write_asize); 4705 vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0); 4706 4707 /* 4708 * Bump device hand to the device start if it is approaching the end. 4709 * l2arc_evict() will already have evicted ahead for this case. 
4710 */ 4711 if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { 4712 vdev_space_update(dev->l2ad_vdev, 4713 dev->l2ad_end - dev->l2ad_hand, 0, 0); 4714 dev->l2ad_hand = dev->l2ad_start; 4715 dev->l2ad_evict = dev->l2ad_start; 4716 dev->l2ad_first = B_FALSE; 4717 } 4718 4719 /* dev->l2ad_writing will be lowered in the zio done callback */ 4720 dev->l2ad_writing = B_TRUE; 4721 (void) zio_wait(pio); 4722 ASSERT(dev->l2ad_writing == B_FALSE); 4723 4724 return (write_asize); 4725 } 4726 4727 /* 4728 * Compresses an L2ARC buffer. 4729 * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its 4730 * size in l2hdr->b_asize. This routine tries to compress the data and 4731 * depending on the compression result there are three possible outcomes: 4732 * *) The buffer was incompressible. The original l2hdr contents were left 4733 * untouched and are ready for writing to an L2 device. 4734 * *) The buffer was all-zeros, so there is no need to write it to an L2 4735 * device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is 4736 * set to zero and b_compress is set to ZIO_COMPRESS_EMPTY. 4737 * *) Compression succeeded and b_tmp_cdata was replaced with a temporary 4738 * data buffer which holds the compressed data to be written, and b_asize 4739 * tells us how much data there is. b_compress is set to the appropriate 4740 * compression algorithm. Once writing is done, invoke 4741 * l2arc_release_cdata_buf on this l2hdr to free this temporary buffer. 4742 * 4743 * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the 4744 * buffer was incompressible). 4745 */ 4746 static boolean_t 4747 l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr) 4748 { 4749 void *cdata; 4750 size_t csize, len; 4751 4752 ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF); 4753 ASSERT(l2hdr->b_tmp_cdata != NULL); 4754 4755 len = l2hdr->b_asize; 4756 cdata = zio_data_buf_alloc(len); 4757 csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata, 4758 cdata, l2hdr->b_asize); 4759 4760 if (csize == 0) { 4761 /* zero block, indicate that there's nothing to write */ 4762 zio_data_buf_free(cdata, len); 4763 l2hdr->b_compress = ZIO_COMPRESS_EMPTY; 4764 l2hdr->b_asize = 0; 4765 l2hdr->b_tmp_cdata = NULL; 4766 ARCSTAT_BUMP(arcstat_l2_compress_zeros); 4767 return (B_TRUE); 4768 } else if (csize > 0 && csize < len) { 4769 /* 4770 * Compression succeeded, we'll keep the cdata around for 4771 * writing and release it afterwards. 4772 */ 4773 l2hdr->b_compress = ZIO_COMPRESS_LZ4; 4774 l2hdr->b_asize = csize; 4775 l2hdr->b_tmp_cdata = cdata; 4776 ARCSTAT_BUMP(arcstat_l2_compress_successes); 4777 return (B_TRUE); 4778 } else { 4779 /* 4780 * Compression failed, release the compressed buffer. 4781 * l2hdr will be left unmodified. 4782 */ 4783 zio_data_buf_free(cdata, len); 4784 ARCSTAT_BUMP(arcstat_l2_compress_failures); 4785 return (B_FALSE); 4786 } 4787 } 4788 4789 /* 4790 * Decompresses a zio read back from an l2arc device. On success, the 4791 * underlying zio's io_data buffer is overwritten by the uncompressed 4792 * version. On decompression error (corrupt compressed stream), the 4793 * zio->io_error value is set to signal an I/O error. 4794 * 4795 * Please note that the compressed data stream is not checksummed, so 4796 * if the underlying device is experiencing data corruption, we may feed 4797 * corrupt data to the decompressor, so the decompressor needs to be 4798 * able to handle this situation (LZ4 does). 
4799 */ 4800 static void 4801 l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c) 4802 { 4803 ASSERT(L2ARC_IS_VALID_COMPRESS(c)); 4804 4805 if (zio->io_error != 0) { 4806 /* 4807 * An io error has occured, just restore the original io 4808 * size in preparation for a main pool read. 4809 */ 4810 zio->io_orig_size = zio->io_size = hdr->b_size; 4811 return; 4812 } 4813 4814 if (c == ZIO_COMPRESS_EMPTY) { 4815 /* 4816 * An empty buffer results in a null zio, which means we 4817 * need to fill its io_data after we're done restoring the 4818 * buffer's contents. 4819 */ 4820 ASSERT(hdr->b_buf != NULL); 4821 bzero(hdr->b_buf->b_data, hdr->b_size); 4822 zio->io_data = zio->io_orig_data = hdr->b_buf->b_data; 4823 } else { 4824 ASSERT(zio->io_data != NULL); 4825 /* 4826 * We copy the compressed data from the start of the arc buffer 4827 * (the zio_read will have pulled in only what we need, the 4828 * rest is garbage which we will overwrite at decompression) 4829 * and then decompress back to the ARC data buffer. This way we 4830 * can minimize copying by simply decompressing back over the 4831 * original compressed data (rather than decompressing to an 4832 * aux buffer and then copying back the uncompressed buffer, 4833 * which is likely to be much larger). 4834 */ 4835 uint64_t csize; 4836 void *cdata; 4837 4838 csize = zio->io_size; 4839 cdata = zio_data_buf_alloc(csize); 4840 bcopy(zio->io_data, cdata, csize); 4841 if (zio_decompress_data(c, cdata, zio->io_data, csize, 4842 hdr->b_size) != 0) 4843 zio->io_error = EIO; 4844 zio_data_buf_free(cdata, csize); 4845 } 4846 4847 /* Restore the expected uncompressed IO size. */ 4848 zio->io_orig_size = zio->io_size = hdr->b_size; 4849 } 4850 4851 /* 4852 * Releases the temporary b_tmp_cdata buffer in an l2arc header structure. 4853 * This buffer serves as a temporary holder of compressed data while 4854 * the buffer entry is being written to an l2arc device. Once that is 4855 * done, we can dispose of it. 4856 */ 4857 static void 4858 l2arc_release_cdata_buf(arc_buf_hdr_t *ab) 4859 { 4860 l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr; 4861 4862 if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) { 4863 /* 4864 * If the data was compressed, then we've allocated a 4865 * temporary buffer for it, so now we need to release it. 4866 */ 4867 ASSERT(l2hdr->b_tmp_cdata != NULL); 4868 zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size); 4869 } 4870 l2hdr->b_tmp_cdata = NULL; 4871 } 4872 4873 /* 4874 * This thread feeds the L2ARC at regular intervals. This is the beating 4875 * heart of the L2ARC. 4876 */ 4877 static void 4878 l2arc_feed_thread(void) 4879 { 4880 callb_cpr_t cpr; 4881 l2arc_dev_t *dev; 4882 spa_t *spa; 4883 uint64_t size, wrote; 4884 clock_t begin, next = ddi_get_lbolt(); 4885 boolean_t headroom_boost = B_FALSE; 4886 4887 CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); 4888 4889 mutex_enter(&l2arc_feed_thr_lock); 4890 4891 while (l2arc_thread_exit == 0) { 4892 CALLB_CPR_SAFE_BEGIN(&cpr); 4893 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, 4894 next); 4895 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); 4896 next = ddi_get_lbolt() + hz; 4897 4898 /* 4899 * Quick check for L2ARC devices. 4900 */ 4901 mutex_enter(&l2arc_dev_mtx); 4902 if (l2arc_ndev == 0) { 4903 mutex_exit(&l2arc_dev_mtx); 4904 continue; 4905 } 4906 mutex_exit(&l2arc_dev_mtx); 4907 begin = ddi_get_lbolt(); 4908 4909 /* 4910 * This selects the next l2arc device to write to, and in 4911 * doing so the next spa to feed from: dev->l2ad_spa. 
This 4912 * will return NULL if there are now no l2arc devices or if 4913 * they are all faulted. 4914 * 4915 * If a device is returned, its spa's config lock is also 4916 * held to prevent device removal. l2arc_dev_get_next() 4917 * will grab and release l2arc_dev_mtx. 4918 */ 4919 if ((dev = l2arc_dev_get_next()) == NULL) 4920 continue; 4921 4922 spa = dev->l2ad_spa; 4923 ASSERT(spa != NULL); 4924 4925 /* 4926 * If the pool is read-only then force the feed thread to 4927 * sleep a little longer. 4928 */ 4929 if (!spa_writeable(spa)) { 4930 next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; 4931 spa_config_exit(spa, SCL_L2ARC, dev); 4932 continue; 4933 } 4934 4935 /* 4936 * Avoid contributing to memory pressure. 4937 */ 4938 if (arc_reclaim_needed()) { 4939 ARCSTAT_BUMP(arcstat_l2_abort_lowmem); 4940 spa_config_exit(spa, SCL_L2ARC, dev); 4941 continue; 4942 } 4943 4944 ARCSTAT_BUMP(arcstat_l2_feeds); 4945 4946 size = l2arc_write_size(); 4947 4948 /* 4949 * Evict L2ARC buffers that will be overwritten. 4950 */ 4951 l2arc_evict(dev, size, B_FALSE); 4952 4953 /* 4954 * Write ARC buffers. 4955 */ 4956 wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost); 4957 4958 /* 4959 * Calculate interval between writes. 4960 */ 4961 next = l2arc_write_interval(begin, size, wrote); 4962 spa_config_exit(spa, SCL_L2ARC, dev); 4963 } 4964 4965 l2arc_thread_exit = 0; 4966 cv_broadcast(&l2arc_feed_thr_cv); 4967 CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ 4968 thread_exit(); 4969 } 4970 4971 boolean_t 4972 l2arc_vdev_present(vdev_t *vd) 4973 { 4974 l2arc_dev_t *dev; 4975 4976 mutex_enter(&l2arc_dev_mtx); 4977 for (dev = list_head(l2arc_dev_list); dev != NULL; 4978 dev = list_next(l2arc_dev_list, dev)) { 4979 if (dev->l2ad_vdev == vd) 4980 break; 4981 } 4982 mutex_exit(&l2arc_dev_mtx); 4983 4984 return (dev != NULL); 4985 } 4986 4987 /* 4988 * Add a vdev for use by the L2ARC. By this point the spa has already 4989 * validated the vdev and opened it. 4990 */ 4991 void 4992 l2arc_add_vdev(spa_t *spa, vdev_t *vd) 4993 { 4994 l2arc_dev_t *adddev; 4995 4996 ASSERT(!l2arc_vdev_present(vd)); 4997 4998 /* 4999 * Create a new l2arc device entry. 5000 */ 5001 adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); 5002 adddev->l2ad_spa = spa; 5003 adddev->l2ad_vdev = vd; 5004 adddev->l2ad_start = VDEV_LABEL_START_SIZE; 5005 adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); 5006 adddev->l2ad_hand = adddev->l2ad_start; 5007 adddev->l2ad_evict = adddev->l2ad_start; 5008 adddev->l2ad_first = B_TRUE; 5009 adddev->l2ad_writing = B_FALSE; 5010 5011 /* 5012 * This is a list of all ARC buffers that are still valid on the 5013 * device. 5014 */ 5015 adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP); 5016 list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), 5017 offsetof(arc_buf_hdr_t, b_l2node)); 5018 5019 vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); 5020 5021 /* 5022 * Add device to global list 5023 */ 5024 mutex_enter(&l2arc_dev_mtx); 5025 list_insert_head(l2arc_dev_list, adddev); 5026 atomic_inc_64(&l2arc_ndev); 5027 mutex_exit(&l2arc_dev_mtx); 5028 } 5029 5030 /* 5031 * Remove a vdev from the L2ARC. 
5032 */ 5033 void 5034 l2arc_remove_vdev(vdev_t *vd) 5035 { 5036 l2arc_dev_t *dev, *nextdev, *remdev = NULL; 5037 5038 /* 5039 * Find the device by vdev 5040 */ 5041 mutex_enter(&l2arc_dev_mtx); 5042 for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { 5043 nextdev = list_next(l2arc_dev_list, dev); 5044 if (vd == dev->l2ad_vdev) { 5045 remdev = dev; 5046 break; 5047 } 5048 } 5049 ASSERT(remdev != NULL); 5050 5051 /* 5052 * Remove device from global list 5053 */ 5054 list_remove(l2arc_dev_list, remdev); 5055 l2arc_dev_last = NULL; /* may have been invalidated */ 5056 atomic_dec_64(&l2arc_ndev); 5057 mutex_exit(&l2arc_dev_mtx); 5058 5059 /* 5060 * Clear all buflists and ARC references. L2ARC device flush. 5061 */ 5062 l2arc_evict(remdev, 0, B_TRUE); 5063 list_destroy(remdev->l2ad_buflist); 5064 kmem_free(remdev->l2ad_buflist, sizeof (list_t)); 5065 kmem_free(remdev, sizeof (l2arc_dev_t)); 5066 } 5067 5068 void 5069 l2arc_init(void) 5070 { 5071 l2arc_thread_exit = 0; 5072 l2arc_ndev = 0; 5073 l2arc_writes_sent = 0; 5074 l2arc_writes_done = 0; 5075 5076 mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); 5077 cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); 5078 mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); 5079 mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL); 5080 mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); 5081 5082 l2arc_dev_list = &L2ARC_dev_list; 5083 l2arc_free_on_write = &L2ARC_free_on_write; 5084 list_create(l2arc_dev_list, sizeof (l2arc_dev_t), 5085 offsetof(l2arc_dev_t, l2ad_node)); 5086 list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), 5087 offsetof(l2arc_data_free_t, l2df_list_node)); 5088 } 5089 5090 void 5091 l2arc_fini(void) 5092 { 5093 /* 5094 * This is called from dmu_fini(), which is called from spa_fini(); 5095 * Because of this, we can assume that all l2arc devices have 5096 * already been removed when the pools themselves were removed. 5097 */ 5098 5099 l2arc_do_free_on_write(); 5100 5101 mutex_destroy(&l2arc_feed_thr_lock); 5102 cv_destroy(&l2arc_feed_thr_cv); 5103 mutex_destroy(&l2arc_dev_mtx); 5104 mutex_destroy(&l2arc_buflist_mtx); 5105 mutex_destroy(&l2arc_free_on_write_mtx); 5106 5107 list_destroy(l2arc_dev_list); 5108 list_destroy(l2arc_free_on_write); 5109 } 5110 5111 void 5112 l2arc_start(void) 5113 { 5114 if (!(spa_mode_global & FWRITE)) 5115 return; 5116 5117 (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, 5118 TS_RUN, minclsyspri); 5119 } 5120 5121 void 5122 l2arc_stop(void) 5123 { 5124 if (!(spa_mode_global & FWRITE)) 5125 return; 5126 5127 mutex_enter(&l2arc_feed_thr_lock); 5128 cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ 5129 l2arc_thread_exit = 1; 5130 while (l2arc_thread_exit != 0) 5131 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); 5132 mutex_exit(&l2arc_feed_thr_lock); 5133 }