/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
 * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
 * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
 */

/*
 * DVA-based Adjustable Replacement Cache
 *
 * While much of the theory of operation used here is
 * based on the self-tuning, low overhead replacement cache
 * presented by Megiddo and Modha at FAST 2003, there are some
 * significant differences:
 *
 * 1. The Megiddo and Modha model assumes any page is evictable.
 *    Pages in its cache cannot be "locked" into memory.  This makes
 *    the eviction algorithm simple: evict the last page in the list.
 *    This also makes the performance characteristics easy to reason
 *    about.  Our cache is not so simple.  At any given moment, some
 *    subset of the blocks in the cache are un-evictable because we
 *    have handed out a reference to them.  Blocks are only evictable
 *    when there are no external references active.  This makes
 *    eviction far more problematic: we choose to evict the evictable
 *    blocks that are the "lowest" in the list.
 *
 *    There are times when it is not possible to evict the requested
 *    space.  In these circumstances we are unable to adjust the cache
 *    size.  To prevent the cache growing unbounded at these times we
 *    implement a "cache throttle" that slows the flow of new data
 *    into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 *    Pages are evicted when the cache is full and there is a cache
 *    miss.  Our model has a variable sized cache.  It grows with
 *    high use, but also tries to react to memory pressure from the
 *    operating system: decreasing its size when system memory is
 *    tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size.  All
 *    elements of the cache are therefore exactly the same size.  So
 *    when adjusting the cache size following a cache miss, it's simply
 *    a matter of choosing a single page to evict.  In our model, we
 *    have variable sized cache blocks (ranging from 512 bytes to
 *    128K bytes).  We therefore choose a set of blocks to evict to make
 *    space for a cache miss that approximates as closely as possible
 *    the space used by the new block.
 *
 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
 * by N. Megiddo & D. Modha, FAST 2003
 */

/*
 * The locking model:
 *
 * A new reference to a cache buffer can be obtained in two
 * ways: 1) via a hash table lookup using the DVA as a key,
 * or 2) via one of the ARC lists.  The arc_read() interface
 * uses method 1, while the internal arc algorithms for
 * adjusting the cache use method 2.  We therefore provide two
 * types of locks: 1) the hash table lock array, and 2) the
 * arc list locks.
 *
 * Buffers do not have their own mutexes, rather they rely on the
 * hash table mutexes for the bulk of their protection (i.e. most
 * fields in the arc_buf_hdr_t are protected by these mutexes).
 *
 * buf_hash_find() returns the appropriate mutex (held) when it
 * locates the requested buffer in the hash table.  It returns
 * NULL for the mutex if the buffer was not in the table.
 *
 * buf_hash_remove() expects the appropriate hash mutex to be
 * already held before it is invoked.
 *
 * Each arc state also has a mutex which is used to protect the
 * buffer list associated with the state.  When attempting to
 * obtain a hash table lock while holding an arc list lock you
 * must use mutex_tryenter() to avoid deadlock.  Also note that
 * the active state mutex must be held before the ghost state mutex.
 *
 * Arc buffers may have an associated eviction callback function.
 * This function will be invoked prior to removing the buffer (e.g.
 * in arc_do_user_evicts()).  Note however that the data associated
 * with the buffer may be evicted prior to the callback.  The callback
 * must be made with *no locks held* (to prevent deadlock).  Additionally,
 * the users of callbacks must ensure that their private data is
 * protected from simultaneous callbacks from arc_clear_callback()
 * and arc_do_user_evicts().
 *
 * Note that the majority of the performance stats are manipulated
 * with atomic operations.
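 *
 * As an illustrative sketch of the list-lock/hash-lock ordering rule
 * above (a simplified example, not a verbatim excerpt of the eviction
 * code), a walker that already holds an arc list lock only ever
 * *tries* a hash lock, and skips the buffer rather than block if the
 * attempt fails:
 *
 *	mutex_enter(&state->arcs_mtx);
 *	for (ab = list_tail(list); ab != NULL; ab = ab_prev) {
 *		kmutex_t *hash_lock = HDR_LOCK(ab);
 *
 *		ab_prev = list_prev(list, ab);
 *		if (!mutex_tryenter(hash_lock)) {
 *			missed++;
 *			continue;
 *		}
 *		... examine or evict the buffer ...
 *		mutex_exit(hash_lock);
 *	}
 *	mutex_exit(&state->arcs_mtx);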
 *
 * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
 *
 *	- L2ARC buflist creation
 *	- L2ARC buflist eviction
 *	- L2ARC write completion, which walks L2ARC buflists
 *	- ARC header destruction, as it removes from L2ARC buflists
 *	- ARC header release, as it removes from L2ARC buflists
 */

#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/zio_compress.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/refcount.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/dsl_pool.h>
#ifdef _KERNEL
#include <sys/vmsystm.h>
#include <vm/anon.h>
#include <sys/fs/swapnode.h>
#include <sys/dnlc.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
#include <zfs_fletcher.h>

#ifndef _KERNEL
/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
boolean_t arc_watch = B_FALSE;
int arc_procfd;
#endif

static kmutex_t		arc_reclaim_thr_lock;
static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
static uint8_t		arc_thread_exit;

static kmutex_t		arc_pressure_thr_lock;
static kcondvar_t	arc_pressure_thr_cv;
static uint8_t		arc_pressure_thread_exit;
static uint64_t		arc_pressure_threshold;

#define	ARC_REDUCE_DNLC_PERCENT	3
uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;

typedef enum arc_reclaim_strategy {
	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
} arc_reclaim_strategy_t;

/*
 * The number of iterations through arc_evict_*() before we
 * drop & reacquire the lock.
 */
int arc_evict_iterations = 100;

/* number of seconds before growing cache again */
static int		arc_grow_retry = 60;

/* shift of arc_c for calculating both min and max arc_p */
static int		arc_p_min_shift = 4;

/* log2(fraction of arc to reclaim) */
static int		arc_shrink_shift = 5;

/*
 * minimum lifespan of a prefetch block in clock ticks
 * (initialized in arc_init())
 */
static int		arc_min_prefetch_lifespan;

/*
 * If this percent of memory is free, don't throttle.
 */
int arc_lotsfree_percent = 10;

static int arc_dead;

/*
 * The arc has filled available memory and has now warmed up.
 */
static boolean_t arc_warm;

/*
 * These tunables are for performance analysis.
 */
uint64_t zfs_arc_max;
uint64_t zfs_arc_min;
uint64_t zfs_arc_meta_limit = 0;
int zfs_arc_grow_retry = 0;
int zfs_arc_shrink_shift = 0;
int zfs_arc_p_min_shift = 0;
int zfs_disable_dup_eviction = 0;
int zfs_arc_average_blocksize = 8 * 1024;	/* 8KB */

/*
 * Note that buffers can be in one of 6 states:
 *	ARC_anon	- anonymous (discussed below)
 *	ARC_mru		- recently used, currently cached
 *	ARC_mru_ghost	- recently used, no longer in cache
 *	ARC_mfu		- frequently used, currently cached
 *	ARC_mfu_ghost	- frequently used, no longer in cache
 *	ARC_l2c_only	- exists in L2ARC but not other states
 * When there are no active references to the buffer, they are
 * linked onto a list in one of these arc states.  These are
 * the only buffers that can be evicted or deleted.  Within each
 * state there are multiple lists, one for meta-data and one for
 * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
 * etc.) is tracked separately so that it can be managed more
 * explicitly: favored over data, limited explicitly.
 *
 * Anonymous buffers are buffers that are not associated with
 * a DVA.  These are buffers that hold dirty block copies
 * before they are written to stable storage.  By definition,
 * they are "ref'd" and are considered part of arc_mru
 * that cannot be freed.  Generally, they will acquire a DVA
 * as they are written and migrate onto the arc_mru list.
 *
 * The ARC_l2c_only state is for buffers that are in the second
 * level ARC but no longer in any of the ARC_m* lists.  The second
 * level ARC itself may also contain buffers that are in any of
 * the ARC_m* states - meaning that a buffer can exist in two
 * places.  The reason for the ARC_l2c_only state is to keep the
 * buffer header in the hash table, so that reads that hit the
 * second level ARC benefit from these fast lookups.
 */

typedef struct arc_state {
	list_t	arcs_list[ARC_BUFC_NUMTYPES];	/* list of evictable buffers */
	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];	/* amount of evictable data */
	uint64_t arcs_size;	/* total amount of data in this state */
	kmutex_t arcs_mtx;
} arc_state_t;

/* The 6 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru;
static arc_state_t ARC_mru_ghost;
static arc_state_t ARC_mfu;
static arc_state_t ARC_mfu_ghost;
static arc_state_t ARC_l2c_only;

typedef struct arc_stats {
	kstat_named_t	arcstat_hits;
	kstat_named_t	arcstat_misses;
	kstat_named_t	arcstat_demand_data_hits;
	kstat_named_t	arcstat_demand_data_misses;
	kstat_named_t	arcstat_demand_metadata_hits;
	kstat_named_t	arcstat_demand_metadata_misses;
	kstat_named_t	arcstat_prefetch_data_hits;
	kstat_named_t	arcstat_prefetch_data_misses;
	kstat_named_t	arcstat_prefetch_metadata_hits;
	kstat_named_t	arcstat_prefetch_metadata_misses;
	kstat_named_t	arcstat_mru_hits;
	kstat_named_t	arcstat_mru_ghost_hits;
	kstat_named_t	arcstat_mfu_hits;
	kstat_named_t	arcstat_mfu_ghost_hits;
	kstat_named_t	arcstat_deleted;
	kstat_named_t	arcstat_recycle_miss;
	/*
	 * Number of buffers that could not be evicted because the hash lock
	 * was held by another thread.  The lock may not necessarily be held
	 * by something using the same buffer, since hash locks are shared
	 * by multiple buffers.
	 */
	kstat_named_t	arcstat_mutex_miss;
	/*
	 * Number of buffers skipped because they have I/O in progress, are
	 * indirect prefetch buffers that have not lived long enough, or are
	 * not from the spa we're trying to evict from.
285 */ 286 kstat_named_t arcstat_evict_skip; 287 kstat_named_t arcstat_evict_l2_cached; 288 kstat_named_t arcstat_evict_l2_eligible; 289 kstat_named_t arcstat_evict_l2_ineligible; 290 kstat_named_t arcstat_hash_elements; 291 kstat_named_t arcstat_hash_elements_max; 292 kstat_named_t arcstat_hash_collisions; 293 kstat_named_t arcstat_hash_chains; 294 kstat_named_t arcstat_hash_chain_max; 295 kstat_named_t arcstat_p; 296 kstat_named_t arcstat_c; 297 kstat_named_t arcstat_c_min; 298 kstat_named_t arcstat_c_max; 299 kstat_named_t arcstat_size; 300 kstat_named_t arcstat_hdr_size; 301 kstat_named_t arcstat_data_size; 302 kstat_named_t arcstat_other_size; 303 kstat_named_t arcstat_growth_rate; 304 kstat_named_t arcstat_l2_hits; 305 kstat_named_t arcstat_l2_misses; 306 kstat_named_t arcstat_l2_feeds; 307 kstat_named_t arcstat_l2_rw_clash; 308 kstat_named_t arcstat_l2_read_bytes; 309 kstat_named_t arcstat_l2_write_bytes; 310 kstat_named_t arcstat_l2_writes_sent; 311 kstat_named_t arcstat_l2_writes_done; 312 kstat_named_t arcstat_l2_writes_error; 313 kstat_named_t arcstat_l2_writes_hdr_miss; 314 kstat_named_t arcstat_l2_evict_lock_retry; 315 kstat_named_t arcstat_l2_evict_reading; 316 kstat_named_t arcstat_l2_free_on_write; 317 kstat_named_t arcstat_l2_abort_lowmem; 318 kstat_named_t arcstat_l2_cksum_bad; 319 kstat_named_t arcstat_l2_io_error; 320 kstat_named_t arcstat_l2_size; 321 kstat_named_t arcstat_l2_asize; 322 kstat_named_t arcstat_l2_hdr_size; 323 kstat_named_t arcstat_l2_compress_successes; 324 kstat_named_t arcstat_l2_compress_zeros; 325 kstat_named_t arcstat_l2_compress_failures; 326 kstat_named_t arcstat_memory_throttle_count; 327 kstat_named_t arcstat_duplicate_buffers; 328 kstat_named_t arcstat_duplicate_buffers_size; 329 kstat_named_t arcstat_duplicate_reads; 330 kstat_named_t arcstat_meta_used; 331 kstat_named_t arcstat_meta_limit; 332 kstat_named_t arcstat_meta_max; 333 } arc_stats_t; 334 335 static arc_stats_t arc_stats = { 336 { "hits", KSTAT_DATA_UINT64 }, 337 { "misses", KSTAT_DATA_UINT64 }, 338 { "demand_data_hits", KSTAT_DATA_UINT64 }, 339 { "demand_data_misses", KSTAT_DATA_UINT64 }, 340 { "demand_metadata_hits", KSTAT_DATA_UINT64 }, 341 { "demand_metadata_misses", KSTAT_DATA_UINT64 }, 342 { "prefetch_data_hits", KSTAT_DATA_UINT64 }, 343 { "prefetch_data_misses", KSTAT_DATA_UINT64 }, 344 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, 345 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, 346 { "mru_hits", KSTAT_DATA_UINT64 }, 347 { "mru_ghost_hits", KSTAT_DATA_UINT64 }, 348 { "mfu_hits", KSTAT_DATA_UINT64 }, 349 { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, 350 { "deleted", KSTAT_DATA_UINT64 }, 351 { "recycle_miss", KSTAT_DATA_UINT64 }, 352 { "mutex_miss", KSTAT_DATA_UINT64 }, 353 { "evict_skip", KSTAT_DATA_UINT64 }, 354 { "evict_l2_cached", KSTAT_DATA_UINT64 }, 355 { "evict_l2_eligible", KSTAT_DATA_UINT64 }, 356 { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, 357 { "hash_elements", KSTAT_DATA_UINT64 }, 358 { "hash_elements_max", KSTAT_DATA_UINT64 }, 359 { "hash_collisions", KSTAT_DATA_UINT64 }, 360 { "hash_chains", KSTAT_DATA_UINT64 }, 361 { "hash_chain_max", KSTAT_DATA_UINT64 }, 362 { "p", KSTAT_DATA_UINT64 }, 363 { "c", KSTAT_DATA_UINT64 }, 364 { "c_min", KSTAT_DATA_UINT64 }, 365 { "c_max", KSTAT_DATA_UINT64 }, 366 { "size", KSTAT_DATA_UINT64 }, 367 { "hdr_size", KSTAT_DATA_UINT64 }, 368 { "data_size", KSTAT_DATA_UINT64 }, 369 { "other_size", KSTAT_DATA_UINT64 }, 370 { "growth_rate", KSTAT_DATA_UINT64 }, 371 { "l2_hits", KSTAT_DATA_UINT64 }, 372 { "l2_misses", KSTAT_DATA_UINT64 
}, 373 { "l2_feeds", KSTAT_DATA_UINT64 }, 374 { "l2_rw_clash", KSTAT_DATA_UINT64 }, 375 { "l2_read_bytes", KSTAT_DATA_UINT64 }, 376 { "l2_write_bytes", KSTAT_DATA_UINT64 }, 377 { "l2_writes_sent", KSTAT_DATA_UINT64 }, 378 { "l2_writes_done", KSTAT_DATA_UINT64 }, 379 { "l2_writes_error", KSTAT_DATA_UINT64 }, 380 { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 }, 381 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, 382 { "l2_evict_reading", KSTAT_DATA_UINT64 }, 383 { "l2_free_on_write", KSTAT_DATA_UINT64 }, 384 { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, 385 { "l2_cksum_bad", KSTAT_DATA_UINT64 }, 386 { "l2_io_error", KSTAT_DATA_UINT64 }, 387 { "l2_size", KSTAT_DATA_UINT64 }, 388 { "l2_asize", KSTAT_DATA_UINT64 }, 389 { "l2_hdr_size", KSTAT_DATA_UINT64 }, 390 { "l2_compress_successes", KSTAT_DATA_UINT64 }, 391 { "l2_compress_zeros", KSTAT_DATA_UINT64 }, 392 { "l2_compress_failures", KSTAT_DATA_UINT64 }, 393 { "memory_throttle_count", KSTAT_DATA_UINT64 }, 394 { "duplicate_buffers", KSTAT_DATA_UINT64 }, 395 { "duplicate_buffers_size", KSTAT_DATA_UINT64 }, 396 { "duplicate_reads", KSTAT_DATA_UINT64 }, 397 { "arc_meta_used", KSTAT_DATA_UINT64 }, 398 { "arc_meta_limit", KSTAT_DATA_UINT64 }, 399 { "arc_meta_max", KSTAT_DATA_UINT64 } 400 }; 401 402 #define ARCSTAT(stat) (arc_stats.stat.value.ui64) 403 404 #define ARCSTAT_INCR(stat, val) \ 405 atomic_add_64(&arc_stats.stat.value.ui64, (val)) 406 407 #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) 408 #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) 409 410 #define ARCSTAT_MAX(stat, val) { \ 411 uint64_t m; \ 412 while ((val) > (m = arc_stats.stat.value.ui64) && \ 413 (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ 414 continue; \ 415 } 416 417 #define ARCSTAT_MAXSTAT(stat) \ 418 ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) 419 420 /* 421 * We define a macro to allow ARC hits/misses to be easily broken down by 422 * two separate conditions, giving a total of four different subtypes for 423 * each of hits and misses (so eight statistics total). 424 */ 425 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ 426 if (cond1) { \ 427 if (cond2) { \ 428 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ 429 } else { \ 430 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ 431 } \ 432 } else { \ 433 if (cond2) { \ 434 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ 435 } else { \ 436 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ 437 } \ 438 } 439 440 /* 441 * This macro allows us to use kstats as floating averages. Each time we 442 * update this kstat, we first factor it and the update value by `factor' 443 * to shrink the new value's contribution to the overall average. This 444 * macro assumes that integer loads and stores are atomic, but is not 445 * safe for multiple writers updating the kstat in parallel (only the 446 * last writer's update will remain). 
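 *
 * As a worked illustration (numbers chosen for the example only):
 * with factor = 4, a stored average of 100 and an update value of
 * 200, the new average is 100 - 100/4 + 200/4 = 100 - 25 + 50 = 125;
 * each update contributes 1/factor of its value to the average.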
447 */ 448 #define ARCSTAT_F_AVG(stat, value, factor) \ 449 do { \ 450 uint64_t x = ARCSTAT(stat); \ 451 x = x - x / factor + (value) / factor; \ 452 ARCSTAT(stat) = x; \ 453 _NOTE(NOTREACHED) \ 454 _NOTE(CONSTCOND) \ 455 } while (0) 456 457 kstat_t *arc_ksp; 458 static arc_state_t *arc_anon; 459 static arc_state_t *arc_mru; 460 static arc_state_t *arc_mru_ghost; 461 static arc_state_t *arc_mfu; 462 static arc_state_t *arc_mfu_ghost; 463 static arc_state_t *arc_l2c_only; 464 465 /* 466 * There are several ARC variables that are critical to export as kstats -- 467 * but we don't want to have to grovel around in the kstat whenever we wish to 468 * manipulate them. For these variables, we therefore define them to be in 469 * terms of the statistic variable. This assures that we are not introducing 470 * the possibility of inconsistency by having shadow copies of the variables, 471 * while still allowing the code to be readable. 472 */ 473 #define arc_size ARCSTAT(arcstat_size) /* actual total arc size */ 474 #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ 475 #define arc_c ARCSTAT(arcstat_c) /* target size of cache */ 476 #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ 477 #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ 478 #define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ 479 #define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */ 480 #define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ 481 482 #define L2ARC_IS_VALID_COMPRESS(_c_) \ 483 ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY) 484 485 static int arc_no_grow; /* Don't try to grow cache size */ 486 static uint64_t arc_tempreserve; 487 static uint64_t arc_loaned_bytes; 488 static uint64_t arc_bytes_allocd = 0; 489 490 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t; 491 492 typedef struct arc_callback arc_callback_t; 493 494 struct arc_callback { 495 void *acb_private; 496 arc_done_func_t *acb_done; 497 arc_buf_t *acb_buf; 498 zio_t *acb_zio_dummy; 499 arc_callback_t *acb_next; 500 }; 501 502 typedef struct arc_write_callback arc_write_callback_t; 503 504 struct arc_write_callback { 505 void *awcb_private; 506 arc_done_func_t *awcb_ready; 507 arc_done_func_t *awcb_physdone; 508 arc_done_func_t *awcb_done; 509 arc_buf_t *awcb_buf; 510 }; 511 512 struct arc_buf_hdr { 513 /* protected by hash lock */ 514 dva_t b_dva; 515 uint64_t b_birth; 516 uint64_t b_cksum0; 517 518 kmutex_t b_freeze_lock; 519 zio_cksum_t *b_freeze_cksum; 520 void *b_thawed; 521 522 arc_buf_hdr_t *b_hash_next; 523 arc_buf_t *b_buf; 524 uint32_t b_flags; 525 uint32_t b_datacnt; 526 527 arc_callback_t *b_acb; 528 kcondvar_t b_cv; 529 530 /* immutable */ 531 arc_buf_contents_t b_type; 532 uint64_t b_size; 533 uint64_t b_spa; 534 535 /* protected by arc state mutex */ 536 arc_state_t *b_state; 537 list_node_t b_arc_node; 538 539 /* updated atomically */ 540 clock_t b_arc_access; 541 542 /* self protecting */ 543 refcount_t b_refcnt; 544 545 l2arc_buf_hdr_t *b_l2hdr; 546 list_node_t b_l2node; 547 }; 548 549 static arc_buf_t *arc_eviction_list; 550 static kmutex_t arc_eviction_mtx; 551 static arc_buf_hdr_t arc_eviction_hdr; 552 static void arc_get_data_buf(arc_buf_t *buf); 553 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); 554 static int arc_evict_needed(arc_buf_contents_t type); 555 static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes); 556 static void arc_buf_watch(arc_buf_t *buf); 557 558 static boolean_t 
l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab); 559 560 #define GHOST_STATE(state) \ 561 ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ 562 (state) == arc_l2c_only) 563 564 /* 565 * Private ARC flags. These flags are private ARC only flags that will show up 566 * in b_flags in the arc_hdr_buf_t. Some flags are publicly declared, and can 567 * be passed in as arc_flags in things like arc_read. However, these flags 568 * should never be passed and should only be set by ARC code. When adding new 569 * public flags, make sure not to smash the private ones. 570 */ 571 572 #define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */ 573 #define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */ 574 #define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */ 575 #define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ 576 #define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */ 577 #define ARC_INDIRECT (1 << 14) /* this is an indirect block */ 578 #define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */ 579 #define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */ 580 #define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */ 581 #define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */ 582 583 #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) 584 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) 585 #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) 586 #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH) 587 #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) 588 #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) 589 #define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS) 590 #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE) 591 #define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \ 592 (hdr)->b_l2hdr != NULL) 593 #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING) 594 #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED) 595 #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD) 596 597 /* 598 * Other sizes 599 */ 600 601 #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) 602 #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t)) 603 604 /* 605 * Hash table routines 606 */ 607 608 struct ht_table { 609 arc_buf_hdr_t *hdr; 610 kmutex_t lock; 611 }; 612 613 typedef struct buf_hash_table { 614 uint64_t ht_mask; 615 struct ht_table *ht_table; 616 } buf_hash_table_t; 617 618 #pragma align 64(buf_hash_table) 619 static buf_hash_table_t buf_hash_table; 620 621 #define BUF_HASH_INDEX(spa, dva, birth) \ 622 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) 623 #define BUF_HASH_LOCK(idx) (&buf_hash_table.ht_table[idx].lock) 624 #define HDR_LOCK(hdr) \ 625 (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) 626 627 uint64_t zfs_crc64_table[256]; 628 629 /* 630 * Level 2 ARC 631 */ 632 633 #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ 634 #define L2ARC_HEADROOM 2 /* num of writes */ 635 /* 636 * If we discover during ARC scan any buffers to be compressed, we boost 637 * our headroom for the next scanning cycle by this percentage multiple. 
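 *
 * For example (approximate, for illustration only): with the default
 * L2ARC_HEADROOM of 2 write sizes and an L2ARC_HEADROOM_BOOST of 200,
 * a boosted scan covers roughly 2 * 200 / 100 = 4 write sizes' worth
 * of the tail of the ARC lists instead of 2.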
638 */ 639 #define L2ARC_HEADROOM_BOOST 200 640 #define L2ARC_FEED_SECS 1 /* caching interval secs */ 641 #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ 642 643 #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) 644 #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) 645 646 /* L2ARC Performance Tunables */ 647 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ 648 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ 649 uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ 650 uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; 651 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ 652 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */ 653 boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ 654 boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */ 655 boolean_t l2arc_norw = B_TRUE; /* no reads during writes */ 656 657 /* 658 * L2ARC Internals 659 */ 660 typedef struct l2arc_dev { 661 vdev_t *l2ad_vdev; /* vdev */ 662 spa_t *l2ad_spa; /* spa */ 663 uint64_t l2ad_hand; /* next write location */ 664 uint64_t l2ad_start; /* first addr on device */ 665 uint64_t l2ad_end; /* last addr on device */ 666 uint64_t l2ad_evict; /* last addr eviction reached */ 667 boolean_t l2ad_first; /* first sweep through */ 668 boolean_t l2ad_writing; /* currently writing */ 669 list_t *l2ad_buflist; /* buffer list */ 670 list_node_t l2ad_node; /* device list node */ 671 } l2arc_dev_t; 672 673 static list_t L2ARC_dev_list; /* device list */ 674 static list_t *l2arc_dev_list; /* device list pointer */ 675 static kmutex_t l2arc_dev_mtx; /* device list mutex */ 676 static l2arc_dev_t *l2arc_dev_last; /* last device used */ 677 static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */ 678 static list_t L2ARC_free_on_write; /* free after write buf list */ 679 static list_t *l2arc_free_on_write; /* free after write list ptr */ 680 static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ 681 static uint64_t l2arc_ndev; /* number of devices */ 682 683 typedef struct l2arc_read_callback { 684 arc_buf_t *l2rcb_buf; /* read buffer */ 685 spa_t *l2rcb_spa; /* spa */ 686 blkptr_t l2rcb_bp; /* original blkptr */ 687 zbookmark_phys_t l2rcb_zb; /* original bookmark */ 688 int l2rcb_flags; /* original flags */ 689 enum zio_compress l2rcb_compress; /* applied compress */ 690 } l2arc_read_callback_t; 691 692 typedef struct l2arc_write_callback { 693 l2arc_dev_t *l2wcb_dev; /* device info */ 694 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ 695 } l2arc_write_callback_t; 696 697 struct l2arc_buf_hdr { 698 /* protected by arc_buf_hdr mutex */ 699 l2arc_dev_t *b_dev; /* L2ARC device */ 700 uint64_t b_daddr; /* disk address, offset byte */ 701 /* compression applied to buffer data */ 702 enum zio_compress b_compress; 703 /* real alloc'd buffer size depending on b_compress applied */ 704 int b_asize; 705 /* temporary buffer holder for in-flight compressed data */ 706 void *b_tmp_cdata; 707 }; 708 709 typedef struct l2arc_data_free { 710 /* protected by l2arc_free_on_write_mtx */ 711 void *l2df_data; 712 size_t l2df_size; 713 void (*l2df_func)(void *, size_t); 714 list_node_t l2df_list_node; 715 } l2arc_data_free_t; 716 717 static kmutex_t l2arc_feed_thr_lock; 718 static kcondvar_t l2arc_feed_thr_cv; 719 static uint8_t l2arc_thread_exit; 720 721 static void l2arc_read_done(zio_t *zio); 722 static void l2arc_hdr_stat_add(void); 723 static void 
l2arc_hdr_stat_remove(void); 724 725 static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr); 726 static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, 727 enum zio_compress c); 728 static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab); 729 730 static uint64_t 731 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) 732 { 733 uint8_t *vdva = (uint8_t *)dva; 734 uint64_t crc = -1ULL; 735 int i; 736 737 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 738 739 for (i = 0; i < sizeof (dva_t); i++) 740 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; 741 742 crc ^= (spa>>8) ^ birth; 743 744 return (crc); 745 } 746 747 #define BUF_EMPTY(buf) \ 748 ((buf)->b_dva.dva_word[0] == 0 && \ 749 (buf)->b_dva.dva_word[1] == 0 && \ 750 (buf)->b_cksum0 == 0) 751 752 #define BUF_EQUAL(spa, dva, birth, buf) \ 753 ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ 754 ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ 755 ((buf)->b_birth == birth) && ((buf)->b_spa == spa) 756 757 static void 758 buf_discard_identity(arc_buf_hdr_t *hdr) 759 { 760 hdr->b_dva.dva_word[0] = 0; 761 hdr->b_dva.dva_word[1] = 0; 762 hdr->b_birth = 0; 763 hdr->b_cksum0 = 0; 764 } 765 766 static arc_buf_hdr_t * 767 buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) 768 { 769 const dva_t *dva = BP_IDENTITY(bp); 770 uint64_t birth = BP_PHYSICAL_BIRTH(bp); 771 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); 772 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 773 arc_buf_hdr_t *buf; 774 775 mutex_enter(hash_lock); 776 for (buf = buf_hash_table.ht_table[idx].hdr; buf != NULL; 777 buf = buf->b_hash_next) { 778 if (BUF_EQUAL(spa, dva, birth, buf)) { 779 *lockp = hash_lock; 780 return (buf); 781 } 782 } 783 mutex_exit(hash_lock); 784 *lockp = NULL; 785 return (NULL); 786 } 787 788 /* 789 * Insert an entry into the hash table. If there is already an element 790 * equal to elem in the hash table, then the already existing element 791 * will be returned and the new element will not be inserted. 792 * Otherwise returns NULL. 
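 *
 * In either case the bucket's hash lock is returned held via *lockp,
 * so a caller typically handles both outcomes along these lines (an
 * illustrative sketch; the real callers also deal with I/O state and
 * reference counts):
 *
 *	kmutex_t *hash_lock;
 *	arc_buf_hdr_t *exists = buf_hash_insert(hdr, &hash_lock);
 *
 *	if (exists != NULL) {
 *		... lost the race: use 'exists', discard or retry 'hdr' ...
 *	} else {
 *		... 'hdr' is now in the hash table ...
 *	}
 *	mutex_exit(hash_lock);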
793 */ 794 static arc_buf_hdr_t * 795 buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp) 796 { 797 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 798 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 799 arc_buf_hdr_t *fbuf; 800 uint32_t i; 801 802 ASSERT(!DVA_IS_EMPTY(&buf->b_dva)); 803 ASSERT(buf->b_birth != 0); 804 ASSERT(!HDR_IN_HASH_TABLE(buf)); 805 *lockp = hash_lock; 806 mutex_enter(hash_lock); 807 for (fbuf = buf_hash_table.ht_table[idx].hdr, i = 0; fbuf != NULL; 808 fbuf = fbuf->b_hash_next, i++) { 809 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf)) 810 return (fbuf); 811 } 812 813 buf->b_hash_next = buf_hash_table.ht_table[idx].hdr; 814 buf_hash_table.ht_table[idx].hdr = buf; 815 buf->b_flags |= ARC_IN_HASH_TABLE; 816 817 /* collect some hash table performance data */ 818 if (i > 0) { 819 ARCSTAT_BUMP(arcstat_hash_collisions); 820 if (i == 1) 821 ARCSTAT_BUMP(arcstat_hash_chains); 822 823 ARCSTAT_MAX(arcstat_hash_chain_max, i); 824 } 825 826 ARCSTAT_BUMP(arcstat_hash_elements); 827 ARCSTAT_MAXSTAT(arcstat_hash_elements); 828 829 return (NULL); 830 } 831 832 static void 833 buf_hash_remove(arc_buf_hdr_t *buf) 834 { 835 arc_buf_hdr_t *fbuf, **bufp; 836 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 837 838 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); 839 ASSERT(HDR_IN_HASH_TABLE(buf)); 840 841 bufp = &buf_hash_table.ht_table[idx].hdr; 842 while ((fbuf = *bufp) != buf) { 843 ASSERT(fbuf != NULL); 844 bufp = &fbuf->b_hash_next; 845 } 846 *bufp = buf->b_hash_next; 847 buf->b_hash_next = NULL; 848 buf->b_flags &= ~ARC_IN_HASH_TABLE; 849 850 /* collect some hash table performance data */ 851 ARCSTAT_BUMPDOWN(arcstat_hash_elements); 852 853 if (buf_hash_table.ht_table[idx].hdr && 854 buf_hash_table.ht_table[idx].hdr->b_hash_next == NULL) 855 ARCSTAT_BUMPDOWN(arcstat_hash_chains); 856 } 857 858 /* 859 * Global data structures and functions for the buf kmem cache. 860 */ 861 static kmem_cache_t *hdr_cache; 862 static kmem_cache_t *buf_cache; 863 864 static void 865 buf_fini(void) 866 { 867 int i; 868 869 for (i = 0; i < buf_hash_table.ht_mask + 1; i++) 870 mutex_destroy(&buf_hash_table.ht_table[i].lock); 871 kmem_free(buf_hash_table.ht_table, 872 (buf_hash_table.ht_mask + 1) * sizeof (struct ht_table)); 873 kmem_cache_destroy(hdr_cache); 874 kmem_cache_destroy(buf_cache); 875 } 876 877 /* 878 * Constructor callback - called when the cache is empty 879 * and a new buf is requested. 880 */ 881 /* ARGSUSED */ 882 static int 883 hdr_cons(void *vbuf, void *unused, int kmflag) 884 { 885 arc_buf_hdr_t *buf = vbuf; 886 887 bzero(buf, sizeof (arc_buf_hdr_t)); 888 refcount_create(&buf->b_refcnt); 889 cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); 890 mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); 891 arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); 892 893 return (0); 894 } 895 896 /* ARGSUSED */ 897 static int 898 buf_cons(void *vbuf, void *unused, int kmflag) 899 { 900 arc_buf_t *buf = vbuf; 901 902 bzero(buf, sizeof (arc_buf_t)); 903 mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); 904 arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); 905 906 return (0); 907 } 908 909 /* 910 * Destructor callback - called when a cached buf is 911 * no longer required. 
912 */ 913 /* ARGSUSED */ 914 static void 915 hdr_dest(void *vbuf, void *unused) 916 { 917 arc_buf_hdr_t *buf = vbuf; 918 919 ASSERT(BUF_EMPTY(buf)); 920 refcount_destroy(&buf->b_refcnt); 921 cv_destroy(&buf->b_cv); 922 mutex_destroy(&buf->b_freeze_lock); 923 arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); 924 } 925 926 /* ARGSUSED */ 927 static void 928 buf_dest(void *vbuf, void *unused) 929 { 930 arc_buf_t *buf = vbuf; 931 932 mutex_destroy(&buf->b_evict_lock); 933 arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); 934 } 935 936 /* 937 * Reclaim callback -- invoked when memory is low. 938 */ 939 /* ARGSUSED */ 940 static void 941 hdr_recl(void *unused) 942 { 943 dprintf("hdr_recl called\n"); 944 /* 945 * umem calls the reclaim func when we destroy the buf cache, 946 * which is after we do arc_fini(). 947 */ 948 if (!arc_dead) 949 cv_signal(&arc_reclaim_thr_cv); 950 } 951 952 static void 953 buf_init(void) 954 { 955 uint64_t *ct; 956 uint64_t hsize = 1ULL << 12; 957 int i, j; 958 959 /* 960 * The hash table is big enough to fill all of physical memory 961 * with an average block size of zfs_arc_average_blocksize (default 8K). 962 * By default, the table will take up 963 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). 964 */ 965 while (hsize * zfs_arc_average_blocksize < physmem * PAGESIZE) 966 hsize <<= 1; 967 retry: 968 buf_hash_table.ht_mask = hsize - 1; 969 buf_hash_table.ht_table = 970 kmem_zalloc(hsize * sizeof (struct ht_table), KM_NOSLEEP); 971 if (buf_hash_table.ht_table == NULL) { 972 ASSERT(hsize > (1ULL << 8)); 973 hsize >>= 1; 974 goto retry; 975 } 976 977 hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), 978 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0); 979 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 980 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); 981 982 for (i = 0; i < 256; i++) 983 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 984 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 985 986 for (i = 0; i < hsize; i++) { 987 mutex_init(&buf_hash_table.ht_table[i].lock, 988 NULL, MUTEX_DEFAULT, NULL); 989 } 990 } 991 992 #define ARC_MINTIME (hz>>4) /* 62 ms */ 993 994 static void 995 arc_cksum_verify(arc_buf_t *buf) 996 { 997 zio_cksum_t zc; 998 999 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1000 return; 1001 1002 mutex_enter(&buf->b_hdr->b_freeze_lock); 1003 if (buf->b_hdr->b_freeze_cksum == NULL || 1004 (buf->b_hdr->b_flags & ARC_IO_ERROR)) { 1005 mutex_exit(&buf->b_hdr->b_freeze_lock); 1006 return; 1007 } 1008 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 1009 if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) 1010 panic("buffer modified while frozen!"); 1011 mutex_exit(&buf->b_hdr->b_freeze_lock); 1012 } 1013 1014 static int 1015 arc_cksum_equal(arc_buf_t *buf) 1016 { 1017 zio_cksum_t zc; 1018 int equal; 1019 1020 mutex_enter(&buf->b_hdr->b_freeze_lock); 1021 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 1022 equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); 1023 mutex_exit(&buf->b_hdr->b_freeze_lock); 1024 1025 return (equal); 1026 } 1027 1028 static void 1029 arc_cksum_compute(arc_buf_t *buf, boolean_t force) 1030 { 1031 if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) 1032 return; 1033 1034 mutex_enter(&buf->b_hdr->b_freeze_lock); 1035 if (buf->b_hdr->b_freeze_cksum != NULL) { 1036 mutex_exit(&buf->b_hdr->b_freeze_lock); 1037 return; 1038 } 1039 buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); 1040 fletcher_2_native(buf->b_data, 
buf->b_hdr->b_size, 1041 buf->b_hdr->b_freeze_cksum); 1042 mutex_exit(&buf->b_hdr->b_freeze_lock); 1043 arc_buf_watch(buf); 1044 } 1045 1046 #ifndef _KERNEL 1047 typedef struct procctl { 1048 long cmd; 1049 prwatch_t prwatch; 1050 } procctl_t; 1051 #endif 1052 1053 /* ARGSUSED */ 1054 static void 1055 arc_buf_unwatch(arc_buf_t *buf) 1056 { 1057 #ifndef _KERNEL 1058 if (arc_watch) { 1059 int result; 1060 procctl_t ctl; 1061 ctl.cmd = PCWATCH; 1062 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1063 ctl.prwatch.pr_size = 0; 1064 ctl.prwatch.pr_wflags = 0; 1065 result = write(arc_procfd, &ctl, sizeof (ctl)); 1066 ASSERT3U(result, ==, sizeof (ctl)); 1067 } 1068 #endif 1069 } 1070 1071 /* ARGSUSED */ 1072 static void 1073 arc_buf_watch(arc_buf_t *buf) 1074 { 1075 #ifndef _KERNEL 1076 if (arc_watch) { 1077 int result; 1078 procctl_t ctl; 1079 ctl.cmd = PCWATCH; 1080 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1081 ctl.prwatch.pr_size = buf->b_hdr->b_size; 1082 ctl.prwatch.pr_wflags = WA_WRITE; 1083 result = write(arc_procfd, &ctl, sizeof (ctl)); 1084 ASSERT3U(result, ==, sizeof (ctl)); 1085 } 1086 #endif 1087 } 1088 1089 void 1090 arc_buf_thaw(arc_buf_t *buf) 1091 { 1092 if (zfs_flags & ZFS_DEBUG_MODIFY) { 1093 if (buf->b_hdr->b_state != arc_anon) 1094 panic("modifying non-anon buffer!"); 1095 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) 1096 panic("modifying buffer while i/o in progress!"); 1097 arc_cksum_verify(buf); 1098 } 1099 1100 mutex_enter(&buf->b_hdr->b_freeze_lock); 1101 if (buf->b_hdr->b_freeze_cksum != NULL) { 1102 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 1103 buf->b_hdr->b_freeze_cksum = NULL; 1104 } 1105 1106 if (zfs_flags & ZFS_DEBUG_MODIFY) { 1107 if (buf->b_hdr->b_thawed) 1108 kmem_free(buf->b_hdr->b_thawed, 1); 1109 buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP); 1110 } 1111 1112 mutex_exit(&buf->b_hdr->b_freeze_lock); 1113 1114 arc_buf_unwatch(buf); 1115 } 1116 1117 void 1118 arc_buf_freeze(arc_buf_t *buf) 1119 { 1120 kmutex_t *hash_lock; 1121 1122 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1123 return; 1124 1125 hash_lock = HDR_LOCK(buf->b_hdr); 1126 mutex_enter(hash_lock); 1127 1128 ASSERT(buf->b_hdr->b_freeze_cksum != NULL || 1129 buf->b_hdr->b_state == arc_anon); 1130 arc_cksum_compute(buf, B_FALSE); 1131 mutex_exit(hash_lock); 1132 1133 } 1134 1135 static void 1136 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) 1137 { 1138 ASSERT(MUTEX_HELD(hash_lock)); 1139 1140 if ((refcount_add(&ab->b_refcnt, tag) == 1) && 1141 (ab->b_state != arc_anon)) { 1142 uint64_t delta = ab->b_size * ab->b_datacnt; 1143 list_t *list = &ab->b_state->arcs_list[ab->b_type]; 1144 uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type]; 1145 1146 ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx)); 1147 mutex_enter(&ab->b_state->arcs_mtx); 1148 ASSERT(list_link_active(&ab->b_arc_node)); 1149 list_remove(list, ab); 1150 if (GHOST_STATE(ab->b_state)) { 1151 ASSERT0(ab->b_datacnt); 1152 ASSERT3P(ab->b_buf, ==, NULL); 1153 delta = ab->b_size; 1154 } 1155 ASSERT(delta > 0); 1156 ASSERT3U(*size, >=, delta); 1157 atomic_add_64(size, -delta); 1158 mutex_exit(&ab->b_state->arcs_mtx); 1159 /* remove the prefetch flag if we get a reference */ 1160 if (ab->b_flags & ARC_PREFETCH) 1161 ab->b_flags &= ~ARC_PREFETCH; 1162 } 1163 } 1164 1165 static int 1166 remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) 1167 { 1168 int cnt; 1169 arc_state_t *state = ab->b_state; 1170 1171 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); 1172 ASSERT(!GHOST_STATE(state)); 1173 1174 if 
(((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) && 1175 (state != arc_anon)) { 1176 uint64_t *size = &state->arcs_lsize[ab->b_type]; 1177 1178 ASSERT(!MUTEX_HELD(&state->arcs_mtx)); 1179 mutex_enter(&state->arcs_mtx); 1180 ASSERT(!list_link_active(&ab->b_arc_node)); 1181 list_insert_head(&state->arcs_list[ab->b_type], ab); 1182 ASSERT(ab->b_datacnt > 0); 1183 atomic_add_64(size, ab->b_size * ab->b_datacnt); 1184 mutex_exit(&state->arcs_mtx); 1185 } 1186 return (cnt); 1187 } 1188 1189 /* 1190 * Move the supplied buffer to the indicated state. The mutex 1191 * for the buffer must be held by the caller. 1192 */ 1193 static void 1194 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) 1195 { 1196 arc_state_t *old_state = ab->b_state; 1197 int64_t refcnt = refcount_count(&ab->b_refcnt); 1198 uint64_t from_delta, to_delta; 1199 1200 ASSERT(MUTEX_HELD(hash_lock)); 1201 ASSERT3P(new_state, !=, old_state); 1202 ASSERT(refcnt == 0 || ab->b_datacnt > 0); 1203 ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state)); 1204 ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon); 1205 1206 from_delta = to_delta = ab->b_datacnt * ab->b_size; 1207 1208 /* 1209 * If this buffer is evictable, transfer it from the 1210 * old state list to the new state list. 1211 */ 1212 if (refcnt == 0) { 1213 if (old_state != arc_anon) { 1214 int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx); 1215 uint64_t *size = &old_state->arcs_lsize[ab->b_type]; 1216 1217 if (use_mutex) 1218 mutex_enter(&old_state->arcs_mtx); 1219 1220 ASSERT(list_link_active(&ab->b_arc_node)); 1221 list_remove(&old_state->arcs_list[ab->b_type], ab); 1222 1223 /* 1224 * If prefetching out of the ghost cache, 1225 * we will have a non-zero datacnt. 1226 */ 1227 if (GHOST_STATE(old_state) && ab->b_datacnt == 0) { 1228 /* ghost elements have a ghost size */ 1229 ASSERT(ab->b_buf == NULL); 1230 from_delta = ab->b_size; 1231 } 1232 ASSERT3U(*size, >=, from_delta); 1233 atomic_add_64(size, -from_delta); 1234 1235 if (use_mutex) 1236 mutex_exit(&old_state->arcs_mtx); 1237 } 1238 if (new_state != arc_anon) { 1239 int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx); 1240 uint64_t *size = &new_state->arcs_lsize[ab->b_type]; 1241 1242 if (use_mutex) 1243 mutex_enter(&new_state->arcs_mtx); 1244 1245 list_insert_head(&new_state->arcs_list[ab->b_type], ab); 1246 1247 /* ghost elements have a ghost size */ 1248 if (GHOST_STATE(new_state)) { 1249 ASSERT(ab->b_datacnt == 0); 1250 ASSERT(ab->b_buf == NULL); 1251 to_delta = ab->b_size; 1252 } 1253 atomic_add_64(size, to_delta); 1254 1255 if (use_mutex) 1256 mutex_exit(&new_state->arcs_mtx); 1257 } 1258 } 1259 1260 ASSERT(!BUF_EMPTY(ab)); 1261 if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab)) 1262 buf_hash_remove(ab); 1263 1264 /* adjust state sizes */ 1265 if (to_delta) 1266 atomic_add_64(&new_state->arcs_size, to_delta); 1267 if (from_delta) { 1268 ASSERT3U(old_state->arcs_size, >=, from_delta); 1269 atomic_add_64(&old_state->arcs_size, -from_delta); 1270 } 1271 ab->b_state = new_state; 1272 1273 /* adjust l2arc hdr stats */ 1274 if (new_state == arc_l2c_only) 1275 l2arc_hdr_stat_add(); 1276 else if (old_state == arc_l2c_only) 1277 l2arc_hdr_stat_remove(); 1278 } 1279 1280 void 1281 arc_space_consume(uint64_t space, arc_space_type_t type) 1282 { 1283 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 1284 1285 switch (type) { 1286 case ARC_SPACE_DATA: 1287 ARCSTAT_INCR(arcstat_data_size, space); 1288 break; 1289 case ARC_SPACE_OTHER: 1290 ARCSTAT_INCR(arcstat_other_size, space); 1291 break; 1292 
case ARC_SPACE_HDRS: 1293 ARCSTAT_INCR(arcstat_hdr_size, space); 1294 break; 1295 case ARC_SPACE_L2HDRS: 1296 ARCSTAT_INCR(arcstat_l2_hdr_size, space); 1297 break; 1298 } 1299 1300 ARCSTAT_INCR(arcstat_meta_used, space); 1301 atomic_add_64(&arc_size, space); 1302 atomic_add_64(&arc_bytes_allocd, space); 1303 } 1304 1305 void 1306 arc_space_return(uint64_t space, arc_space_type_t type) 1307 { 1308 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 1309 1310 switch (type) { 1311 case ARC_SPACE_DATA: 1312 ARCSTAT_INCR(arcstat_data_size, -space); 1313 break; 1314 case ARC_SPACE_OTHER: 1315 ARCSTAT_INCR(arcstat_other_size, -space); 1316 break; 1317 case ARC_SPACE_HDRS: 1318 ARCSTAT_INCR(arcstat_hdr_size, -space); 1319 break; 1320 case ARC_SPACE_L2HDRS: 1321 ARCSTAT_INCR(arcstat_l2_hdr_size, -space); 1322 break; 1323 } 1324 1325 ASSERT(arc_meta_used >= space); 1326 if (arc_meta_max < arc_meta_used) 1327 arc_meta_max = arc_meta_used; 1328 ARCSTAT_INCR(arcstat_meta_used, -space); 1329 ASSERT(arc_size >= space); 1330 atomic_add_64(&arc_size, -space); 1331 } 1332 1333 void * 1334 arc_data_buf_alloc(uint64_t size) 1335 { 1336 if (arc_evict_needed(ARC_BUFC_DATA)) 1337 cv_signal(&arc_reclaim_thr_cv); 1338 atomic_add_64(&arc_size, size); 1339 atomic_add_64(&arc_bytes_allocd, size); 1340 return (zio_data_buf_alloc(size)); 1341 } 1342 1343 void 1344 arc_data_buf_free(void *buf, uint64_t size) 1345 { 1346 zio_data_buf_free(buf, size); 1347 ASSERT(arc_size >= size); 1348 atomic_add_64(&arc_size, -size); 1349 } 1350 1351 arc_buf_t * 1352 arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) 1353 { 1354 arc_buf_hdr_t *hdr; 1355 arc_buf_t *buf; 1356 1357 ASSERT3U(size, >, 0); 1358 hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); 1359 ASSERT(BUF_EMPTY(hdr)); 1360 hdr->b_size = size; 1361 hdr->b_type = type; 1362 hdr->b_spa = spa_load_guid(spa); 1363 hdr->b_state = arc_anon; 1364 hdr->b_arc_access = 0; 1365 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 1366 buf->b_hdr = hdr; 1367 buf->b_data = NULL; 1368 buf->b_efunc = NULL; 1369 buf->b_private = NULL; 1370 buf->b_next = NULL; 1371 hdr->b_buf = buf; 1372 arc_get_data_buf(buf); 1373 hdr->b_datacnt = 1; 1374 hdr->b_flags = 0; 1375 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1376 (void) refcount_add(&hdr->b_refcnt, tag); 1377 1378 return (buf); 1379 } 1380 1381 static char *arc_onloan_tag = "onloan"; 1382 1383 /* 1384 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in 1385 * flight data by arc_tempreserve_space() until they are "returned". Loaned 1386 * buffers must be returned to the arc before they can be used by the DMU or 1387 * freed. 1388 */ 1389 arc_buf_t * 1390 arc_loan_buf(spa_t *spa, int size) 1391 { 1392 arc_buf_t *buf; 1393 1394 buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA); 1395 1396 atomic_add_64(&arc_loaned_bytes, size); 1397 return (buf); 1398 } 1399 1400 /* 1401 * Return a loaned arc buffer to the arc. 
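 *
 * In sketch form, the loan/return life cycle looks like this
 * (illustrative only):
 *
 *	arc_buf_t *abuf = arc_loan_buf(spa, size);
 *	... fill abuf->b_data; while loaned, the buffer is not counted
 *	    as in-flight data by arc_tempreserve_space() ...
 *	arc_return_buf(abuf, tag);
 *
 * after which the reference held under arc_onloan_tag has been
 * transferred to 'tag' and arc_loaned_bytes drops by the buffer size.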
1402 */ 1403 void 1404 arc_return_buf(arc_buf_t *buf, void *tag) 1405 { 1406 arc_buf_hdr_t *hdr = buf->b_hdr; 1407 1408 ASSERT(buf->b_data != NULL); 1409 (void) refcount_add(&hdr->b_refcnt, tag); 1410 (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag); 1411 1412 atomic_add_64(&arc_loaned_bytes, -hdr->b_size); 1413 } 1414 1415 /* Detach an arc_buf from a dbuf (tag) */ 1416 void 1417 arc_loan_inuse_buf(arc_buf_t *buf, void *tag) 1418 { 1419 arc_buf_hdr_t *hdr; 1420 1421 ASSERT(buf->b_data != NULL); 1422 hdr = buf->b_hdr; 1423 (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag); 1424 (void) refcount_remove(&hdr->b_refcnt, tag); 1425 buf->b_efunc = NULL; 1426 buf->b_private = NULL; 1427 1428 atomic_add_64(&arc_loaned_bytes, hdr->b_size); 1429 } 1430 1431 static arc_buf_t * 1432 arc_buf_clone(arc_buf_t *from) 1433 { 1434 arc_buf_t *buf; 1435 arc_buf_hdr_t *hdr = from->b_hdr; 1436 uint64_t size = hdr->b_size; 1437 1438 ASSERT(hdr->b_state != arc_anon); 1439 1440 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 1441 buf->b_hdr = hdr; 1442 buf->b_data = NULL; 1443 buf->b_efunc = NULL; 1444 buf->b_private = NULL; 1445 buf->b_next = hdr->b_buf; 1446 hdr->b_buf = buf; 1447 arc_get_data_buf(buf); 1448 bcopy(from->b_data, buf->b_data, size); 1449 1450 /* 1451 * This buffer already exists in the arc so create a duplicate 1452 * copy for the caller. If the buffer is associated with user data 1453 * then track the size and number of duplicates. These stats will be 1454 * updated as duplicate buffers are created and destroyed. 1455 */ 1456 if (hdr->b_type == ARC_BUFC_DATA) { 1457 ARCSTAT_BUMP(arcstat_duplicate_buffers); 1458 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); 1459 } 1460 hdr->b_datacnt += 1; 1461 return (buf); 1462 } 1463 1464 void 1465 arc_buf_add_ref(arc_buf_t *buf, void* tag) 1466 { 1467 arc_buf_hdr_t *hdr; 1468 kmutex_t *hash_lock; 1469 1470 /* 1471 * Check to see if this buffer is evicted. Callers 1472 * must verify b_data != NULL to know if the add_ref 1473 * was successful. 1474 */ 1475 mutex_enter(&buf->b_evict_lock); 1476 if (buf->b_data == NULL) { 1477 mutex_exit(&buf->b_evict_lock); 1478 return; 1479 } 1480 hash_lock = HDR_LOCK(buf->b_hdr); 1481 mutex_enter(hash_lock); 1482 hdr = buf->b_hdr; 1483 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 1484 mutex_exit(&buf->b_evict_lock); 1485 1486 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 1487 add_reference(hdr, hash_lock, tag); 1488 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 1489 arc_access(hdr, hash_lock); 1490 mutex_exit(hash_lock); 1491 ARCSTAT_BUMP(arcstat_hits); 1492 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 1493 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 1494 data, metadata, hits); 1495 } 1496 1497 /* 1498 * Free the arc data buffer. If it is an l2arc write in progress, 1499 * the buffer is placed on l2arc_free_on_write to be freed later. 
1500 */ 1501 static void 1502 arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t)) 1503 { 1504 arc_buf_hdr_t *hdr = buf->b_hdr; 1505 1506 if (HDR_L2_WRITING(hdr)) { 1507 l2arc_data_free_t *df; 1508 df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP); 1509 df->l2df_data = buf->b_data; 1510 df->l2df_size = hdr->b_size; 1511 df->l2df_func = free_func; 1512 mutex_enter(&l2arc_free_on_write_mtx); 1513 list_insert_head(l2arc_free_on_write, df); 1514 mutex_exit(&l2arc_free_on_write_mtx); 1515 ARCSTAT_BUMP(arcstat_l2_free_on_write); 1516 } else { 1517 free_func(buf->b_data, hdr->b_size); 1518 } 1519 } 1520 1521 /* 1522 * Free up buf->b_data and if 'remove' is set, then pull the 1523 * arc_buf_t off of the the arc_buf_hdr_t's list and free it. 1524 */ 1525 static void 1526 arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove) 1527 { 1528 arc_buf_t **bufp; 1529 1530 /* free up data associated with the buf */ 1531 if (buf->b_data) { 1532 arc_state_t *state = buf->b_hdr->b_state; 1533 uint64_t size = buf->b_hdr->b_size; 1534 arc_buf_contents_t type = buf->b_hdr->b_type; 1535 1536 arc_cksum_verify(buf); 1537 arc_buf_unwatch(buf); 1538 1539 if (!recycle) { 1540 if (type == ARC_BUFC_METADATA) { 1541 arc_buf_data_free(buf, zio_buf_free); 1542 arc_space_return(size, ARC_SPACE_DATA); 1543 } else { 1544 ASSERT(type == ARC_BUFC_DATA); 1545 arc_buf_data_free(buf, zio_data_buf_free); 1546 ARCSTAT_INCR(arcstat_data_size, -size); 1547 atomic_add_64(&arc_size, -size); 1548 } 1549 } 1550 if (list_link_active(&buf->b_hdr->b_arc_node)) { 1551 uint64_t *cnt = &state->arcs_lsize[type]; 1552 1553 ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt)); 1554 ASSERT(state != arc_anon); 1555 1556 ASSERT3U(*cnt, >=, size); 1557 atomic_add_64(cnt, -size); 1558 } 1559 ASSERT3U(state->arcs_size, >=, size); 1560 atomic_add_64(&state->arcs_size, -size); 1561 buf->b_data = NULL; 1562 1563 /* 1564 * If we're destroying a duplicate buffer make sure 1565 * that the appropriate statistics are updated. 1566 */ 1567 if (buf->b_hdr->b_datacnt > 1 && 1568 buf->b_hdr->b_type == ARC_BUFC_DATA) { 1569 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); 1570 ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size); 1571 } 1572 ASSERT(buf->b_hdr->b_datacnt > 0); 1573 buf->b_hdr->b_datacnt -= 1; 1574 } 1575 1576 /* only remove the buf if requested */ 1577 if (!remove) 1578 return; 1579 1580 /* remove the buf from the hdr list */ 1581 for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next) 1582 continue; 1583 *bufp = buf->b_next; 1584 buf->b_next = NULL; 1585 1586 ASSERT(buf->b_efunc == NULL); 1587 1588 /* clean up the buf */ 1589 buf->b_hdr = NULL; 1590 kmem_cache_free(buf_cache, buf); 1591 } 1592 1593 static void 1594 arc_hdr_destroy(arc_buf_hdr_t *hdr) 1595 { 1596 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1597 ASSERT3P(hdr->b_state, ==, arc_anon); 1598 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 1599 l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr; 1600 1601 if (l2hdr != NULL) { 1602 boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx); 1603 /* 1604 * To prevent arc_free() and l2arc_evict() from 1605 * attempting to free the same buffer at the same time, 1606 * a FREE_IN_PROGRESS flag is given to arc_free() to 1607 * give it priority. l2arc_evict() can't destroy this 1608 * header while we are waiting on l2arc_buflist_mtx. 1609 * 1610 * The hdr may be removed from l2ad_buflist before we 1611 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked. 
1612 */ 1613 if (!buflist_held) { 1614 mutex_enter(&l2arc_buflist_mtx); 1615 l2hdr = hdr->b_l2hdr; 1616 } 1617 1618 if (l2hdr != NULL) { 1619 list_remove(l2hdr->b_dev->l2ad_buflist, hdr); 1620 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 1621 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); 1622 if (l2hdr->b_dev->l2ad_vdev) 1623 vdev_space_update(l2hdr->b_dev->l2ad_vdev, 1624 -l2hdr->b_asize, 0, 0); 1625 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); 1626 if (hdr->b_state == arc_l2c_only) 1627 l2arc_hdr_stat_remove(); 1628 hdr->b_l2hdr = NULL; 1629 } 1630 1631 if (!buflist_held) 1632 mutex_exit(&l2arc_buflist_mtx); 1633 } 1634 1635 if (!BUF_EMPTY(hdr)) { 1636 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 1637 buf_discard_identity(hdr); 1638 } 1639 while (hdr->b_buf) { 1640 arc_buf_t *buf = hdr->b_buf; 1641 1642 if (buf->b_efunc) { 1643 mutex_enter(&arc_eviction_mtx); 1644 mutex_enter(&buf->b_evict_lock); 1645 ASSERT(buf->b_hdr != NULL); 1646 arc_buf_destroy(hdr->b_buf, FALSE, FALSE); 1647 hdr->b_buf = buf->b_next; 1648 buf->b_hdr = &arc_eviction_hdr; 1649 buf->b_next = arc_eviction_list; 1650 arc_eviction_list = buf; 1651 mutex_exit(&buf->b_evict_lock); 1652 mutex_exit(&arc_eviction_mtx); 1653 } else { 1654 arc_buf_destroy(hdr->b_buf, FALSE, TRUE); 1655 } 1656 } 1657 if (hdr->b_freeze_cksum != NULL) { 1658 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 1659 hdr->b_freeze_cksum = NULL; 1660 } 1661 if (hdr->b_thawed) { 1662 kmem_free(hdr->b_thawed, 1); 1663 hdr->b_thawed = NULL; 1664 } 1665 1666 ASSERT(!list_link_active(&hdr->b_arc_node)); 1667 ASSERT3P(hdr->b_hash_next, ==, NULL); 1668 ASSERT3P(hdr->b_acb, ==, NULL); 1669 kmem_cache_free(hdr_cache, hdr); 1670 } 1671 1672 void 1673 arc_buf_free(arc_buf_t *buf, void *tag) 1674 { 1675 arc_buf_hdr_t *hdr = buf->b_hdr; 1676 int hashed = hdr->b_state != arc_anon; 1677 1678 ASSERT(buf->b_efunc == NULL); 1679 ASSERT(buf->b_data != NULL); 1680 1681 if (hashed) { 1682 kmutex_t *hash_lock = HDR_LOCK(hdr); 1683 1684 mutex_enter(hash_lock); 1685 hdr = buf->b_hdr; 1686 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 1687 1688 (void) remove_reference(hdr, hash_lock, tag); 1689 if (hdr->b_datacnt > 1) { 1690 arc_buf_destroy(buf, FALSE, TRUE); 1691 } else { 1692 ASSERT(buf == hdr->b_buf); 1693 ASSERT(buf->b_efunc == NULL); 1694 hdr->b_flags |= ARC_BUF_AVAILABLE; 1695 } 1696 mutex_exit(hash_lock); 1697 } else if (HDR_IO_IN_PROGRESS(hdr)) { 1698 int destroy_hdr; 1699 /* 1700 * We are in the middle of an async write. Don't destroy 1701 * this buffer unless the write completes before we finish 1702 * decrementing the reference count. 
1703 */ 1704 mutex_enter(&arc_eviction_mtx); 1705 (void) remove_reference(hdr, NULL, tag); 1706 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1707 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); 1708 mutex_exit(&arc_eviction_mtx); 1709 if (destroy_hdr) 1710 arc_hdr_destroy(hdr); 1711 } else { 1712 if (remove_reference(hdr, NULL, tag) > 0) 1713 arc_buf_destroy(buf, FALSE, TRUE); 1714 else 1715 arc_hdr_destroy(hdr); 1716 } 1717 } 1718 1719 boolean_t 1720 arc_buf_remove_ref(arc_buf_t *buf, void* tag) 1721 { 1722 arc_buf_hdr_t *hdr = buf->b_hdr; 1723 kmutex_t *hash_lock = HDR_LOCK(hdr); 1724 boolean_t no_callback = (buf->b_efunc == NULL); 1725 1726 if (hdr->b_state == arc_anon) { 1727 ASSERT(hdr->b_datacnt == 1); 1728 arc_buf_free(buf, tag); 1729 return (no_callback); 1730 } 1731 1732 mutex_enter(hash_lock); 1733 hdr = buf->b_hdr; 1734 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 1735 ASSERT(hdr->b_state != arc_anon); 1736 ASSERT(buf->b_data != NULL); 1737 1738 (void) remove_reference(hdr, hash_lock, tag); 1739 if (hdr->b_datacnt > 1) { 1740 if (no_callback) 1741 arc_buf_destroy(buf, FALSE, TRUE); 1742 } else if (no_callback) { 1743 ASSERT(hdr->b_buf == buf && buf->b_next == NULL); 1744 ASSERT(buf->b_efunc == NULL); 1745 hdr->b_flags |= ARC_BUF_AVAILABLE; 1746 } 1747 ASSERT(no_callback || hdr->b_datacnt > 1 || 1748 refcount_is_zero(&hdr->b_refcnt)); 1749 mutex_exit(hash_lock); 1750 return (no_callback); 1751 } 1752 1753 int 1754 arc_buf_size(arc_buf_t *buf) 1755 { 1756 return (buf->b_hdr->b_size); 1757 } 1758 1759 /* 1760 * Called from the DMU to determine if the current buffer should be 1761 * evicted. In order to ensure proper locking, the eviction must be initiated 1762 * from the DMU. Return true if the buffer is associated with user data and 1763 * duplicate buffers still exist. 1764 */ 1765 boolean_t 1766 arc_buf_eviction_needed(arc_buf_t *buf) 1767 { 1768 arc_buf_hdr_t *hdr; 1769 boolean_t evict_needed = B_FALSE; 1770 1771 if (zfs_disable_dup_eviction) 1772 return (B_FALSE); 1773 1774 mutex_enter(&buf->b_evict_lock); 1775 hdr = buf->b_hdr; 1776 if (hdr == NULL) { 1777 /* 1778 * We are in arc_do_user_evicts(); let that function 1779 * perform the eviction. 1780 */ 1781 ASSERT(buf->b_data == NULL); 1782 mutex_exit(&buf->b_evict_lock); 1783 return (B_FALSE); 1784 } else if (buf->b_data == NULL) { 1785 /* 1786 * We have already been added to the arc eviction list; 1787 * recommend eviction. 1788 */ 1789 ASSERT3P(hdr, ==, &arc_eviction_hdr); 1790 mutex_exit(&buf->b_evict_lock); 1791 return (B_TRUE); 1792 } 1793 1794 if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA) 1795 evict_needed = B_TRUE; 1796 1797 mutex_exit(&buf->b_evict_lock); 1798 return (evict_needed); 1799 } 1800 1801 int zfs_fastflush = 1; 1802 1803 /* 1804 * Evict buffers from list until we've removed the specified number of 1805 * bytes. Move the removed buffers to the appropriate evict state. 1806 * If the recycle flag is set, then attempt to "recycle" a buffer: 1807 * - look for a buffer to evict that is `bytes' long. 1808 * - return the data block from this buffer rather than freeing it. 1809 * This flag is used by callers that are trying to make space for a 1810 * new buffer in a full arc cache. 1811 * 1812 * This function makes a "best effort". It skips over any buffers 1813 * it can't get a hash_lock on, and so may not catch all candidates. 1814 * It may also return without evicting as much space as requested. 
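 *
 * In sketch form, a caller that needs space for a new buffer of
 * 'size' bytes can ask for a recycled data block directly
 * (illustrative only; the real allocation path does additional
 * state selection and accounting):
 *
 *	void *data = arc_evict(state, 0, size, TRUE, type);
 *
 *	if (data == NULL)
 *		... fall back to a fresh allocation ...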
1815 */ 1816 static void * 1817 arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, 1818 arc_buf_contents_t type) 1819 { 1820 arc_state_t *evicted_state; 1821 uint64_t bytes_evicted = 0, skipped = 0, missed = 0; 1822 arc_buf_hdr_t *ab, *ab_prev = NULL; 1823 list_t *list = &state->arcs_list[type]; 1824 kmutex_t *hash_lock; 1825 boolean_t have_lock; 1826 void *stolen = NULL; 1827 arc_buf_hdr_t marker = { 0 }; 1828 int count = 0; 1829 1830 ASSERT(state == arc_mru || state == arc_mfu); 1831 1832 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; 1833 1834 mutex_enter(&state->arcs_mtx); 1835 mutex_enter(&evicted_state->arcs_mtx); 1836 1837 for (ab = list_tail(list); ab; ab = ab_prev) { 1838 ab_prev = list_prev(list, ab); 1839 /* prefetch buffers have a minimum lifespan */ 1840 if (HDR_IO_IN_PROGRESS(ab) || 1841 (spa && ab->b_spa != spa) || 1842 (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && 1843 ddi_get_lbolt() - ab->b_arc_access < 1844 arc_min_prefetch_lifespan)) { 1845 skipped++; 1846 continue; 1847 } 1848 /* "lookahead" for better eviction candidate */ 1849 if (recycle && ab->b_size != bytes && 1850 ab_prev && ab_prev->b_size == bytes) 1851 continue; 1852 1853 /* ignore markers */ 1854 if (ab->b_spa == 0) 1855 continue; 1856 1857 /* 1858 * It may take a long time to evict all the bufs requested. 1859 * To avoid blocking all arc activity, periodically drop 1860 * the arcs_mtx and give other threads a chance to run 1861 * before reacquiring the lock. 1862 * 1863 * If we are looking for a buffer to recycle, we are in 1864 * the hot code path, so don't sleep. 1865 */ 1866 if (!recycle && count++ > arc_evict_iterations) { 1867 list_insert_after(list, ab, &marker); 1868 mutex_exit(&evicted_state->arcs_mtx); 1869 mutex_exit(&state->arcs_mtx); 1870 kpreempt(KPREEMPT_SYNC); 1871 mutex_enter(&state->arcs_mtx); 1872 mutex_enter(&evicted_state->arcs_mtx); 1873 ab_prev = list_prev(list, &marker); 1874 list_remove(list, &marker); 1875 count = 0; 1876 continue; 1877 } 1878 1879 hash_lock = HDR_LOCK(ab); 1880 have_lock = MUTEX_HELD(hash_lock); 1881 if (have_lock || mutex_tryenter(hash_lock)) { 1882 ASSERT0(refcount_count(&ab->b_refcnt)); 1883 ASSERT(ab->b_datacnt > 0); 1884 while (ab->b_buf) { 1885 arc_buf_t *buf = ab->b_buf; 1886 if (!mutex_tryenter(&buf->b_evict_lock)) { 1887 missed += 1; 1888 break; 1889 } 1890 if (buf->b_data) { 1891 bytes_evicted += ab->b_size; 1892 if (recycle && ab->b_type == type && 1893 ab->b_size == bytes && 1894 !HDR_L2_WRITING(ab)) { 1895 stolen = buf->b_data; 1896 recycle = FALSE; 1897 } 1898 } 1899 if (buf->b_efunc) { 1900 mutex_enter(&arc_eviction_mtx); 1901 arc_buf_destroy(buf, 1902 buf->b_data == stolen, FALSE); 1903 ab->b_buf = buf->b_next; 1904 buf->b_hdr = &arc_eviction_hdr; 1905 buf->b_next = arc_eviction_list; 1906 arc_eviction_list = buf; 1907 mutex_exit(&arc_eviction_mtx); 1908 mutex_exit(&buf->b_evict_lock); 1909 } else { 1910 mutex_exit(&buf->b_evict_lock); 1911 arc_buf_destroy(buf, 1912 buf->b_data == stolen, TRUE); 1913 } 1914 } 1915 1916 if (ab->b_l2hdr) { 1917 ARCSTAT_INCR(arcstat_evict_l2_cached, 1918 ab->b_size); 1919 } else { 1920 if (l2arc_write_eligible(ab->b_spa, ab)) { 1921 ARCSTAT_INCR(arcstat_evict_l2_eligible, 1922 ab->b_size); 1923 } else { 1924 ARCSTAT_INCR( 1925 arcstat_evict_l2_ineligible, 1926 ab->b_size); 1927 } 1928 } 1929 1930 if (ab->b_datacnt == 0) { 1931 arc_change_state(evicted_state, ab, hash_lock); 1932 ASSERT(HDR_IN_HASH_TABLE(ab)); 1933 ab->b_flags |= ARC_IN_HASH_TABLE; 1934 ab->b_flags &= 
~ARC_BUF_AVAILABLE; 1935 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); 1936 } 1937 if (!have_lock) 1938 mutex_exit(hash_lock); 1939 if (bytes >= 0 && bytes_evicted >= bytes) 1940 break; 1941 } else { 1942 missed += 1; 1943 } 1944 } 1945 1946 mutex_exit(&evicted_state->arcs_mtx); 1947 mutex_exit(&state->arcs_mtx); 1948 1949 if (bytes_evicted < bytes) 1950 dprintf("only evicted %lld bytes from %x", 1951 (longlong_t)bytes_evicted, state); 1952 1953 if (skipped) 1954 ARCSTAT_INCR(arcstat_evict_skip, skipped); 1955 1956 if (missed) 1957 ARCSTAT_INCR(arcstat_mutex_miss, missed); 1958 1959 /* 1960 * Note: we have just evicted some data into the ghost state, 1961 * potentially putting the ghost size over the desired size. Rather 1962 * that evicting from the ghost list in this hot code path, leave 1963 * this chore to the arc_reclaim_thread(). 1964 */ 1965 1966 return (stolen); 1967 } 1968 1969 /* 1970 * Remove buffers from list until we've removed the specified number of 1971 * bytes. Destroy the buffers that are removed. 1972 */ 1973 static void 1974 arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes) 1975 { 1976 arc_buf_hdr_t *ab, *ab_prev; 1977 arc_buf_hdr_t marker = { 0 }; 1978 list_t *list = &state->arcs_list[ARC_BUFC_DATA]; 1979 kmutex_t *hash_lock; 1980 uint64_t bytes_deleted = 0; 1981 uint64_t bufs_skipped = 0; 1982 int count = 0; 1983 1984 ASSERT(GHOST_STATE(state)); 1985 top: 1986 mutex_enter(&state->arcs_mtx); 1987 for (ab = list_tail(list); ab; ab = ab_prev) { 1988 ab_prev = list_prev(list, ab); 1989 if (ab->b_type > ARC_BUFC_NUMTYPES) 1990 panic("invalid ab=%p", (void *)ab); 1991 if (spa && ab->b_spa != spa) 1992 continue; 1993 1994 /* ignore markers */ 1995 if (ab->b_spa == 0) 1996 continue; 1997 1998 hash_lock = HDR_LOCK(ab); 1999 /* caller may be trying to modify this buffer, skip it */ 2000 if (MUTEX_HELD(hash_lock)) 2001 continue; 2002 2003 /* 2004 * It may take a long time to evict all the bufs requested. 2005 * To avoid blocking all arc activity, periodically drop 2006 * the arcs_mtx and give other threads a chance to run 2007 * before reacquiring the lock. 2008 */ 2009 if (count++ > arc_evict_iterations) { 2010 list_insert_after(list, ab, &marker); 2011 mutex_exit(&state->arcs_mtx); 2012 kpreempt(KPREEMPT_SYNC); 2013 mutex_enter(&state->arcs_mtx); 2014 ab_prev = list_prev(list, &marker); 2015 list_remove(list, &marker); 2016 count = 0; 2017 continue; 2018 } 2019 if (mutex_tryenter(hash_lock)) { 2020 ASSERT(!HDR_IO_IN_PROGRESS(ab)); 2021 ASSERT(ab->b_buf == NULL); 2022 ARCSTAT_BUMP(arcstat_deleted); 2023 bytes_deleted += ab->b_size; 2024 2025 if (ab->b_l2hdr != NULL) { 2026 /* 2027 * This buffer is cached on the 2nd Level ARC; 2028 * don't destroy the header. 2029 */ 2030 arc_change_state(arc_l2c_only, ab, hash_lock); 2031 mutex_exit(hash_lock); 2032 } else { 2033 arc_change_state(arc_anon, ab, hash_lock); 2034 mutex_exit(hash_lock); 2035 arc_hdr_destroy(ab); 2036 } 2037 2038 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); 2039 if (bytes >= 0 && bytes_deleted >= bytes) 2040 break; 2041 } else if (bytes < 0) { 2042 /* 2043 * Insert a list marker and then wait for the 2044 * hash lock to become available. Once its 2045 * available, restart from where we left off. 
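 *
 * The marker is a stack-allocated header with b_spa == 0, which is
 * how the scan loops here and in arc_evict() recognize and skip it;
 * parking it in the list lets us drop arcs_mtx (or block on the hash
 * lock) without losing our place if the list changes underneath us.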
2046 */ 2047 list_insert_after(list, ab, &marker); 2048 mutex_exit(&state->arcs_mtx); 2049 mutex_enter(hash_lock); 2050 mutex_exit(hash_lock); 2051 mutex_enter(&state->arcs_mtx); 2052 ab_prev = list_prev(list, &marker); 2053 list_remove(list, &marker); 2054 } else { 2055 bufs_skipped += 1; 2056 } 2057 2058 } 2059 mutex_exit(&state->arcs_mtx); 2060 2061 if (list == &state->arcs_list[ARC_BUFC_DATA] && 2062 (bytes < 0 || bytes_deleted < bytes)) { 2063 list = &state->arcs_list[ARC_BUFC_METADATA]; 2064 goto top; 2065 } 2066 2067 if (bufs_skipped) { 2068 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); 2069 ASSERT(bytes >= 0); 2070 } 2071 2072 if (bytes_deleted < bytes) 2073 dprintf("only deleted %lld bytes from %p", 2074 (longlong_t)bytes_deleted, state); 2075 } 2076 2077 static void 2078 arc_adjust(void) 2079 { 2080 int64_t adjustment, delta; 2081 2082 /* 2083 * Adjust MRU size 2084 */ 2085 2086 adjustment = MIN((int64_t)(arc_size - arc_c), 2087 (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - 2088 arc_p)); 2089 2090 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { 2091 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment); 2092 (void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA); 2093 adjustment -= delta; 2094 } 2095 2096 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { 2097 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment); 2098 (void) arc_evict(arc_mru, NULL, delta, FALSE, 2099 ARC_BUFC_METADATA); 2100 } 2101 2102 /* 2103 * Adjust MFU size 2104 */ 2105 2106 adjustment = arc_size - arc_c; 2107 2108 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { 2109 delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]); 2110 (void) arc_evict(arc_mfu, NULL, delta, FALSE, ARC_BUFC_DATA); 2111 adjustment -= delta; 2112 } 2113 2114 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { 2115 int64_t delta = MIN(adjustment, 2116 arc_mfu->arcs_lsize[ARC_BUFC_METADATA]); 2117 (void) arc_evict(arc_mfu, NULL, delta, FALSE, 2118 ARC_BUFC_METADATA); 2119 } 2120 2121 /* 2122 * Adjust ghost lists 2123 */ 2124 2125 adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c; 2126 2127 if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) { 2128 delta = MIN(arc_mru_ghost->arcs_size, adjustment); 2129 arc_evict_ghost(arc_mru_ghost, NULL, delta); 2130 } 2131 2132 adjustment = 2133 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c; 2134 2135 if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) { 2136 delta = MIN(arc_mfu_ghost->arcs_size, adjustment); 2137 arc_evict_ghost(arc_mfu_ghost, NULL, delta); 2138 } 2139 } 2140 2141 #define ACCURACY 1000 2142 2143 static void 2144 arc_reclaim_bytes(uint64_t to_evict) 2145 { 2146 uint64_t to_evict_data_mru, to_evict_data_mfu; 2147 uint64_t to_evict_meta_mru, to_evict_meta_mfu; 2148 2149 to_evict_meta_mru = (((arc_mru->arcs_lsize[ARC_BUFC_METADATA] * 2150 ACCURACY) / (arc_mru->arcs_size + arc_mfu->arcs_size)) * 2151 to_evict) / ACCURACY; 2152 to_evict_data_mru = (((arc_mru->arcs_lsize[ARC_BUFC_DATA] * 2153 ACCURACY) / (arc_mru->arcs_size + arc_mfu->arcs_size)) * 2154 to_evict) / ACCURACY; 2155 to_evict_meta_mfu = (((arc_mfu->arcs_lsize[ARC_BUFC_METADATA] * 2156 ACCURACY) / (arc_mru->arcs_size + arc_mfu->arcs_size)) * 2157 to_evict) / ACCURACY; 2158 to_evict_data_mfu = (((arc_mfu->arcs_lsize[ARC_BUFC_DATA] * 2159 ACCURACY) / (arc_mru->arcs_size + arc_mfu->arcs_size)) * 2160 to_evict) / ACCURACY; 2161 2162 if (to_evict_meta_mru > 0) 2163 (void) 
arc_evict(arc_mru, NULL, to_evict_meta_mru, FALSE, 2164 ARC_BUFC_METADATA); 2165 if (to_evict_data_mru > 0) 2166 (void) arc_evict(arc_mru, NULL, to_evict_data_mru, FALSE, 2167 ARC_BUFC_DATA); 2168 if (to_evict_meta_mfu > 0) 2169 (void) arc_evict(arc_mfu, NULL, to_evict_meta_mfu, FALSE, 2170 ARC_BUFC_METADATA); 2171 if (to_evict_data_mfu > 0) 2172 (void) arc_evict(arc_mfu, NULL, to_evict_data_mfu, FALSE, 2173 ARC_BUFC_DATA); 2174 } 2175
2176 static void 2177 arc_do_user_evicts(void) 2178 { 2179 mutex_enter(&arc_eviction_mtx); 2180 while (arc_eviction_list != NULL) { 2181 arc_buf_t *buf = arc_eviction_list; 2182 arc_eviction_list = buf->b_next; 2183 mutex_enter(&buf->b_evict_lock); 2184 buf->b_hdr = NULL; 2185 mutex_exit(&buf->b_evict_lock); 2186 mutex_exit(&arc_eviction_mtx); 2187 2188 if (buf->b_efunc != NULL) 2189 VERIFY0(buf->b_efunc(buf->b_private)); 2190 2191 buf->b_efunc = NULL; 2192 buf->b_private = NULL; 2193 kmem_cache_free(buf_cache, buf); 2194 mutex_enter(&arc_eviction_mtx); 2195 } 2196 mutex_exit(&arc_eviction_mtx); 2197 } 2198
2199 typedef struct arc_async_flush_data { 2200 uint64_t aaf_guid; 2201 } arc_async_flush_data_t; 2202 2203 static taskq_t *arc_flush_taskq; 2204
2205 static void 2206 _arc_flush(uint64_t guid) 2207 { 2208 while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) { 2209 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA); 2210 if (guid) 2211 break; 2212 } 2213 while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) { 2214 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA); 2215 if (guid) 2216 break; 2217 } 2218 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) { 2219 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA); 2220 if (guid) 2221 break; 2222 } 2223 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) { 2224 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA); 2225 if (guid) 2226 break; 2227 } 2228 2229 arc_evict_ghost(arc_mru_ghost, guid, -1); 2230 arc_evict_ghost(arc_mfu_ghost, guid, -1); 2231 2232 mutex_enter(&arc_reclaim_thr_lock); 2233 arc_do_user_evicts(); 2234 mutex_exit(&arc_reclaim_thr_lock); 2235 } 2236
2237 static void 2238 arc_flush_task(void *arg) 2239 { 2240 arc_async_flush_data_t *aaf = (arc_async_flush_data_t *)arg; 2241 _arc_flush(aaf->aaf_guid); 2242 kmem_free(aaf, sizeof (arc_async_flush_data_t)); 2243 } 2244
2245 /* 2246 * Flush all *evictable* data from the cache for the given spa. 2247 * NOTE: this will not touch "active" (i.e. referenced) data. 2248 */ 2249 void 2250 arc_flush(spa_t *spa) 2251 { 2252 uint64_t guid = 0; 2253 boolean_t async_flush = (spa ? zfs_fastflush : FALSE); 2254 arc_async_flush_data_t *aaf = NULL; 2255 2256 if (spa) { 2257 guid = spa_load_guid(spa); 2258 if (async_flush) { 2259 aaf = kmem_alloc(sizeof (arc_async_flush_data_t), 2260 KM_SLEEP); 2261 aaf->aaf_guid = guid; 2262 } 2263 } 2264
2265 /* 2266 * Try to flush the spa's remaining ARC ghost buffers and the buffers on 2267 * arc_eviction_list asynchronously while the pool is being closed. 2268 * An ARC buffer is bound to its spa only by the guid, so a buffer can 2269 * still exist after the pool itself has gone. If the asynchronous 2270 * dispatch fails we fall back to a synchronous flush. 2271 * NOTE: It is not a problem if an asynchronous flush has not finished 2272 * by the time the pool is imported again, even when the guids before 2273 * and after export/import are the same. We can evict only unreferenced 2274 * buffers; all others are skipped.
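 *
 * The dispatch itself is also best-effort: taskq_dispatch() is called
 * with TQ_NOSLEEP and may fail under memory pressure, in which case we
 * free the pre-allocated argument and run _arc_flush() synchronously
 * in this thread.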
2275 */ 2276 if (!async_flush || (taskq_dispatch(arc_flush_taskq, arc_flush_task, 2277 aaf, TQ_NOSLEEP) == NULL)) { 2278 _arc_flush(guid); 2279 ASSERT(spa || arc_eviction_list == NULL); 2280 if (async_flush) 2281 kmem_free(aaf, sizeof (arc_async_flush_data_t)); 2282 } 2283 } 2284 2285 void 2286 arc_shrink(void) 2287 { 2288 if (arc_c > arc_c_min) { 2289 uint64_t to_free; 2290 2291 #ifdef _KERNEL 2292 to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree)); 2293 #else 2294 to_free = arc_c >> arc_shrink_shift; 2295 #endif 2296 if (arc_c > arc_c_min + to_free) 2297 atomic_add_64(&arc_c, -to_free); 2298 else 2299 arc_c = arc_c_min; 2300 2301 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); 2302 if (arc_c > arc_size) 2303 arc_c = MAX(arc_size, arc_c_min); 2304 if (arc_p > arc_c) 2305 arc_p = (arc_c >> 1); 2306 ASSERT(arc_c >= arc_c_min); 2307 ASSERT((int64_t)arc_p >= 0); 2308 } 2309 2310 if (arc_size > arc_c) 2311 arc_adjust(); 2312 } 2313 2314 #define PHYSMEM_PRESSURE_FRACTION 100 2315 2316 static boolean_t 2317 arc_mem_pressure(void) 2318 { 2319 #ifdef _KERNEL 2320 uint64_t extra = desfree + physmem / PHYSMEM_PRESSURE_FRACTION; 2321 2322 if ((freemem < lotsfree + needfree + extra) || 2323 (needfree || availrmem < swapfs_minfree + swapfs_reserve + extra) || 2324 (zio_arena != NULL && vmem_size(zio_arena, VMEM_FREE) < 2325 (vmem_size(zio_arena, VMEM_ALLOC) >> 4) + 2326 physmem / PHYSMEM_PRESSURE_FRACTION)) 2327 return (B_TRUE); 2328 2329 return (freemem < physmem / PHYSMEM_PRESSURE_FRACTION); 2330 #else 2331 return (0); 2332 #endif 2333 } 2334 2335 /* 2336 * Determine if the system is under memory pressure and is asking 2337 * to reclaim memory. A return value of 1 indicates that the system 2338 * is under memory pressure and that the arc should adjust accordingly. 2339 */ 2340 static int 2341 arc_reclaim_needed(void) 2342 { 2343 uint64_t extra; 2344 2345 #ifdef _KERNEL 2346 2347 if (needfree) 2348 return (1); 2349 2350 /* 2351 * take 'desfree' extra pages, so we reclaim sooner, rather than later 2352 */ 2353 extra = desfree; 2354 2355 /* 2356 * check that we're out of range of the pageout scanner. It starts to 2357 * schedule paging if freemem is less than lotsfree and needfree. 2358 * lotsfree is the high-water mark for pageout, and needfree is the 2359 * number of needed free pages. We add extra pages here to make sure 2360 * the scanner doesn't start up while we're freeing memory. 2361 */ 2362 if (freemem < lotsfree + needfree + extra) 2363 return (1); 2364 2365 /* 2366 * check to make sure that swapfs has enough space so that anon 2367 * reservations can still succeed. anon_resvmem() checks that the 2368 * availrmem is greater than swapfs_minfree, and the number of reserved 2369 * swap pages. We also add a bit of extra here just to prevent 2370 * circumstances from getting really dire. 2371 */ 2372 if (availrmem < swapfs_minfree + swapfs_reserve + extra) 2373 return (1); 2374 2375 /* 2376 * Check that we have enough availrmem that memory locking (e.g., via 2377 * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum 2378 * stores the number of pages that cannot be locked; when availrmem 2379 * drops below pages_pp_maximum, page locking mechanisms such as 2380 * page_pp_lock() will fail.) 2381 */ 2382 if (availrmem <= pages_pp_maximum) 2383 return (1); 2384 2385 #if defined(__i386) 2386 /* 2387 * If we're on an i386 platform, it's possible that we'll exhaust the 2388 * kernel heap space before we ever run out of available physical 2389 * memory. 
Most checks of the size of the heap_area compare against 2390 * tune.t_minarmem, which is the minimum available real memory that we 2391 * can have in the system. However, this is generally fixed at 25 pages 2392 * which is so low that it's useless. In this comparison, we seek to 2393 * calculate the total heap-size, and reclaim if more than 3/4ths of the 2394 * heap is allocated. (Or, in the calculation, if less than 1/4th is 2395 * free.) 2396 */ 2397 if (vmem_size(heap_arena, VMEM_FREE) < 2398 (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2)) 2399 return (1); 2400 #endif 2401
2402 /* 2403 * If zio data pages are being allocated out of a separate heap segment, 2404 * then enforce that the size of available vmem for this arena remains 2405 * above about 1/16th free. 2406 * 2407 * Note: The 1/16th arena free requirement was put in place 2408 * to aggressively evict memory from the arc in order to avoid 2409 * memory fragmentation issues. 2410 */ 2411 if (zio_arena != NULL && 2412 vmem_size(zio_arena, VMEM_FREE) < 2413 (vmem_size(zio_arena, VMEM_ALLOC) >> 4)) 2414 return (1); 2415 #else 2416 if (spa_get_random(100) == 0) 2417 return (1); 2418 #endif 2419 return (0); 2420 } 2421
2422 static void 2423 arc_kmem_reap_now(arc_reclaim_strategy_t strat) 2424 { 2425 size_t i; 2426 kmem_cache_t *prev_cache = NULL; 2427 kmem_cache_t *prev_data_cache = NULL; 2428 extern kmem_cache_t *zio_buf_cache[]; 2429 extern kmem_cache_t *zio_data_buf_cache[]; 2430 extern kmem_cache_t *range_seg_cache; 2431 2432 #ifdef _KERNEL 2433 if (arc_meta_used >= arc_meta_limit) { 2434 /* 2435 * We are exceeding our meta-data cache limit. 2436 * Purge some DNLC entries to release holds on meta-data. 2437 */ 2438 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); 2439 } 2440 #if defined(__i386) 2441 /* 2442 * Reclaim unused memory from all kmem caches. 2443 */ 2444 kmem_reap(); 2445 #endif 2446 #endif 2447 2448 /* 2449 * An aggressive reclamation will shrink the cache size as well as 2450 * reap free buffers from the arc kmem caches. 2451 */ 2452 if (strat == ARC_RECLAIM_AGGR) 2453 arc_shrink(); 2454 2455 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { 2456 if (zio_buf_cache[i] != prev_cache) { 2457 prev_cache = zio_buf_cache[i]; 2458 kmem_cache_reap_now(zio_buf_cache[i]); 2459 } 2460 if (zio_data_buf_cache[i] != prev_data_cache) { 2461 prev_data_cache = zio_data_buf_cache[i]; 2462 kmem_cache_reap_now(zio_data_buf_cache[i]); 2463 } 2464 } 2465 kmem_cache_reap_now(buf_cache); 2466 kmem_cache_reap_now(hdr_cache); 2467 kmem_cache_reap_now(range_seg_cache); 2468 2469 /* 2470 * Ask the vmem arena to reclaim unused memory from its 2471 * quantum caches. 2472 */ 2473 if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR) 2474 vmem_qcache_reap(zio_arena); 2475 } 2476
2477 #define RECLAIMS_PER_SEC 20 2478 #define STAT_UPDATES_PER_SEC 5 2479 2480 /* 2481 * During heavy use, the ARC naturally wants to oscillate its arc_c around 2482 * a maximum memory pressure point which corresponds to the arc_reclaim_needed 2483 * function evaluating to 1. This results in the arc_size slowly growing 2484 * towards this reclaim_needed threshold and exceeding it periodically. Once 2485 * this happens, both arc_c and arc_size are down-adjusted by the 2486 * arc_reclaim_thread and kmem_reap is initiated.
This is problematic on 2487 * bigmem systems with a small recordsize (4k or 8k), because reaping a kmem 2488 * cache which contains very large numbers of objects is extremely expensive 2489 * from an xcall perspective (several seconds of heavy CPU use): 2490 * 2491 * (mem) 2492 * ^ arc_reclaim_thread reacts 2493 * | | | 2494 * | V V 2495 * | 2496 * | + + 2497 * | /| /| 2498 * | ......./..|................/..|.............. arc_reclaim_needed threshold 2499 * | / \_____________/ \___________/(etc) 2500 * | / kmem reap kmem reap 2501 * | / 2502 * |/ 2503 * +-----------------------------------------------------------------> 2504 * (time) 2505 * 2506 * To help address this stairstep pattern, the arc_pressure_thread periodically 2507 * gauges the distance of the current arc_size to the arc_reclaim_needed 2508 * threshold by way of an estimation algorithm (in arc_mem_pressure). 2509 */ 2510 static void 2511 arc_pressure_thread(void) 2512 { 2513 clock_t last_update = ddi_get_lbolt(); 2514 callb_cpr_t cpr; 2515 2516 CALLB_CPR_INIT(&cpr, &arc_pressure_thr_lock, callb_generic_cpr, FTAG); 2517 2518 mutex_enter(&arc_pressure_thr_lock); 2519 while (arc_pressure_thread_exit == 0) { 2520 clock_t now; 2521 2522 now = ddi_get_lbolt(); 2523 if (now - last_update >= hz / STAT_UPDATES_PER_SEC) { 2524 uint64_t new_rate; 2525 2526 new_rate = (atomic_swap_64(&arc_bytes_allocd, 0) * 2527 hz) / (now - last_update); 2528 2529 if (ARCSTAT(arcstat_growth_rate) < new_rate) 2530 ARCSTAT(arcstat_growth_rate) = new_rate; 2531 else 2532 ARCSTAT_F_AVG(arcstat_growth_rate, new_rate, 4); 2533 last_update = now; 2534 } 2535 2536 arc_pressure_threshold = arc_c - ARCSTAT(arcstat_growth_rate); 2537 if (arc_size > arc_pressure_threshold) { 2538 arc_reclaim_bytes(arc_size - arc_pressure_threshold); 2539 } 2540 2541 CALLB_CPR_SAFE_BEGIN(&cpr); 2542 (void) cv_timedwait(&arc_pressure_thr_cv, 2543 &arc_pressure_thr_lock, 2544 ddi_get_lbolt() + hz / RECLAIMS_PER_SEC); 2545 CALLB_CPR_SAFE_END(&cpr, &arc_pressure_thr_lock); 2546 } 2547 2548 arc_pressure_thread_exit = 0; 2549 cv_broadcast(&arc_pressure_thr_cv); 2550 CALLB_CPR_EXIT(&cpr); /* drops arc_pressure_thr_lock */ 2551 thread_exit(); 2552 } 2553 2554 static void 2555 arc_reclaim_thread(void) 2556 { 2557 clock_t growtime = 0; 2558 arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; 2559 callb_cpr_t cpr; 2560 2561 CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); 2562 2563 mutex_enter(&arc_reclaim_thr_lock); 2564 while (arc_thread_exit == 0) { 2565 if (arc_reclaim_needed()) { 2566 2567 if (arc_no_grow) { 2568 if (last_reclaim == ARC_RECLAIM_CONS) { 2569 last_reclaim = ARC_RECLAIM_AGGR; 2570 } else { 2571 last_reclaim = ARC_RECLAIM_CONS; 2572 } 2573 } else { 2574 arc_no_grow = TRUE; 2575 last_reclaim = ARC_RECLAIM_AGGR; 2576 membar_producer(); 2577 } 2578 2579 /* reset the growth delay for every reclaim */ 2580 growtime = ddi_get_lbolt() + (arc_grow_retry * hz); 2581 2582 arc_kmem_reap_now(last_reclaim); 2583 arc_warm = B_TRUE; 2584 2585 } else if (arc_no_grow && ddi_get_lbolt() >= growtime) { 2586 arc_no_grow = FALSE; 2587 } 2588 2589 arc_adjust(); 2590 2591 if (arc_eviction_list != NULL) 2592 arc_do_user_evicts(); 2593 2594 /* block until needed, or one second, whichever is shorter */ 2595 CALLB_CPR_SAFE_BEGIN(&cpr); 2596 (void) cv_timedwait(&arc_reclaim_thr_cv, 2597 &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz)); 2598 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); 2599 } 2600 2601 arc_thread_exit = 0; 2602 cv_broadcast(&arc_reclaim_thr_cv); 2603 
CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ 2604 thread_exit(); 2605 } 2606 2607 /* 2608 * Adapt arc info given the number of bytes we are trying to add and 2609 * the state that we are comming from. This function is only called 2610 * when we are adding new content to the cache. 2611 */ 2612 static void 2613 arc_adapt(int bytes, arc_state_t *state) 2614 { 2615 int mult; 2616 uint64_t arc_p_min = (arc_c >> arc_p_min_shift); 2617 2618 if (state == arc_l2c_only) 2619 return; 2620 2621 ASSERT(bytes > 0); 2622 /* 2623 * Adapt the target size of the MRU list: 2624 * - if we just hit in the MRU ghost list, then increase 2625 * the target size of the MRU list. 2626 * - if we just hit in the MFU ghost list, then increase 2627 * the target size of the MFU list by decreasing the 2628 * target size of the MRU list. 2629 */ 2630 if (state == arc_mru_ghost) { 2631 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? 2632 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); 2633 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ 2634 2635 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); 2636 } else if (state == arc_mfu_ghost) { 2637 uint64_t delta; 2638 2639 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ? 2640 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); 2641 mult = MIN(mult, 10); 2642 2643 delta = MIN(bytes * mult, arc_p); 2644 arc_p = MAX(arc_p_min, arc_p - delta); 2645 } 2646 ASSERT((int64_t)arc_p >= 0); 2647 2648 if (arc_reclaim_needed()) { 2649 cv_signal(&arc_reclaim_thr_cv); 2650 return; 2651 } 2652 2653 if (arc_no_grow) 2654 return; 2655 2656 if (arc_c >= arc_c_max) 2657 return; 2658 2659 /* 2660 * If we're within (2 * maxblocksize) bytes of the target 2661 * cache size, increment the target cache size 2662 */ 2663 if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT) || 2664 (arc_size >= arc_pressure_threshold && arc_mem_pressure() == 0)) { 2665 atomic_add_64(&arc_c, (int64_t)bytes); 2666 if (arc_c > arc_c_max) 2667 arc_c = arc_c_max; 2668 else if (state == arc_anon) 2669 atomic_add_64(&arc_p, (int64_t)bytes); 2670 if (arc_p > arc_c) 2671 arc_p = arc_c; 2672 } 2673 ASSERT((int64_t)arc_p >= 0); 2674 } 2675 2676 /* 2677 * Check if the cache has reached its limits and eviction is required 2678 * prior to insert. 2679 */ 2680 static int 2681 arc_evict_needed(arc_buf_contents_t type) 2682 { 2683 if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) 2684 return (1); 2685 2686 if (arc_reclaim_needed()) 2687 return (1); 2688 2689 return (arc_size > arc_c); 2690 } 2691 2692 /* 2693 * The buffer, supplied as the first argument, needs a data block. 2694 * So, if we are at cache max, determine which cache should be victimized. 2695 * We have the following cases: 2696 * 2697 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) -> 2698 * In this situation if we're out of space, but the resident size of the MFU is 2699 * under the limit, victimize the MFU cache to satisfy this insertion request. 2700 * 2701 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) -> 2702 * Here, we've used up all of the available space for the MRU, so we need to 2703 * evict from our own cache instead. Evict from the set of resident MRU 2704 * entries. 2705 * 2706 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) -> 2707 * c minus p represents the MFU space in the cache, since p is the size of the 2708 * cache that is dedicated to the MRU. In this situation there's still space on 2709 * the MFU side, so the MRU side needs to be victimized. 2710 * 2711 * 4. 
Insert for MFU (c - p) < sizeof(arc_mfu) -> 2712 * MFU's resident set is consuming more space than it has been allotted. In 2713 * this situation, we must victimize our own cache, the MFU, for this insertion. 2714 */ 2715 static void 2716 arc_get_data_buf(arc_buf_t *buf) 2717 { 2718 arc_state_t *state = buf->b_hdr->b_state; 2719 uint64_t size = buf->b_hdr->b_size; 2720 arc_buf_contents_t type = buf->b_hdr->b_type; 2721 2722 arc_adapt(size, state); 2723 2724 /* 2725 * We have not yet reached cache maximum size, 2726 * just allocate a new buffer. 2727 */ 2728 if (!arc_evict_needed(type)) { 2729 if (type == ARC_BUFC_METADATA) { 2730 buf->b_data = zio_buf_alloc(size); 2731 arc_space_consume(size, ARC_SPACE_DATA); 2732 } else { 2733 ASSERT(type == ARC_BUFC_DATA); 2734 buf->b_data = zio_data_buf_alloc(size); 2735 ARCSTAT_INCR(arcstat_data_size, size); 2736 atomic_add_64(&arc_size, size); 2737 atomic_add_64(&arc_bytes_allocd, size); 2738 } 2739 goto out; 2740 } 2741 2742 /* 2743 * If we are prefetching from the mfu ghost list, this buffer 2744 * will end up on the mru list; so steal space from there. 2745 */ 2746 if (state == arc_mfu_ghost) 2747 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu; 2748 else if (state == arc_mru_ghost) 2749 state = arc_mru; 2750 2751 if (state == arc_mru || state == arc_anon) { 2752 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; 2753 state = (arc_mfu->arcs_lsize[type] >= size && 2754 arc_p > mru_used) ? arc_mfu : arc_mru; 2755 } else { 2756 /* MFU cases */ 2757 uint64_t mfu_space = arc_c - arc_p; 2758 state = (arc_mru->arcs_lsize[type] >= size && 2759 mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; 2760 } 2761 if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) { 2762 if (type == ARC_BUFC_METADATA) { 2763 buf->b_data = zio_buf_alloc(size); 2764 arc_space_consume(size, ARC_SPACE_DATA); 2765 } else { 2766 ASSERT(type == ARC_BUFC_DATA); 2767 buf->b_data = zio_data_buf_alloc(size); 2768 ARCSTAT_INCR(arcstat_data_size, size); 2769 atomic_add_64(&arc_size, size); 2770 atomic_add_64(&arc_bytes_allocd, size); 2771 } 2772 ARCSTAT_BUMP(arcstat_recycle_miss); 2773 } 2774 ASSERT(buf->b_data != NULL); 2775 out: 2776 /* 2777 * Update the state size. Note that ghost states have a 2778 * "ghost size" and so don't need to be updated. 2779 */ 2780 if (!GHOST_STATE(buf->b_hdr->b_state)) { 2781 arc_buf_hdr_t *hdr = buf->b_hdr; 2782 2783 atomic_add_64(&hdr->b_state->arcs_size, size); 2784 if (list_link_active(&hdr->b_arc_node)) { 2785 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 2786 atomic_add_64(&hdr->b_state->arcs_lsize[type], size); 2787 } 2788 /* 2789 * If we are growing the cache, and we are adding anonymous 2790 * data, and we have outgrown arc_p, update arc_p 2791 */ 2792 if (arc_size < arc_c && hdr->b_state == arc_anon && 2793 arc_anon->arcs_size + arc_mru->arcs_size > arc_p) 2794 arc_p = MIN(arc_c, arc_p + size); 2795 } 2796 } 2797 2798 /* 2799 * This routine is called whenever a buffer is accessed. 2800 * NOTE: the hash lock is dropped in this function. 2801 */ 2802 static void 2803 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) 2804 { 2805 clock_t now; 2806 2807 ASSERT(MUTEX_HELD(hash_lock)); 2808 2809 if (buf->b_state == arc_anon) { 2810 /* 2811 * This buffer is not in the cache, and does not 2812 * appear in our "ghost" list. Add the new buffer 2813 * to the MRU state. 
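 *
 * (For reference, the transitions handled in this function are
 * roughly: anon -> mru on first insert; mru -> mfu on a second hit
 * more than ARC_MINTIME after the first; a ghost hit pulls the header
 * back in as mfu, or as mru if the access is a prefetch; and an
 * l2c_only header rejoins the primary cache as mfu.)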
2814 */ 2815 2816 ASSERT(buf->b_arc_access == 0); 2817 buf->b_arc_access = ddi_get_lbolt(); 2818 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 2819 arc_change_state(arc_mru, buf, hash_lock); 2820 2821 } else if (buf->b_state == arc_mru) { 2822 now = ddi_get_lbolt(); 2823 2824 /* 2825 * If this buffer is here because of a prefetch, then either: 2826 * - clear the flag if this is a "referencing" read 2827 * (any subsequent access will bump this into the MFU state). 2828 * or 2829 * - move the buffer to the head of the list if this is 2830 * another prefetch (to make it less likely to be evicted). 2831 */ 2832 if ((buf->b_flags & ARC_PREFETCH) != 0) { 2833 if (refcount_count(&buf->b_refcnt) == 0) { 2834 ASSERT(list_link_active(&buf->b_arc_node)); 2835 } else { 2836 buf->b_flags &= ~ARC_PREFETCH; 2837 ARCSTAT_BUMP(arcstat_mru_hits); 2838 } 2839 buf->b_arc_access = now; 2840 return; 2841 } 2842 2843 /* 2844 * This buffer has been "accessed" only once so far, 2845 * but it is still in the cache. Move it to the MFU 2846 * state. 2847 */ 2848 if (now > buf->b_arc_access + ARC_MINTIME) { 2849 /* 2850 * More than 125ms have passed since we 2851 * instantiated this buffer. Move it to the 2852 * most frequently used state. 2853 */ 2854 buf->b_arc_access = now; 2855 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 2856 arc_change_state(arc_mfu, buf, hash_lock); 2857 } 2858 ARCSTAT_BUMP(arcstat_mru_hits); 2859 } else if (buf->b_state == arc_mru_ghost) { 2860 arc_state_t *new_state; 2861 /* 2862 * This buffer has been "accessed" recently, but 2863 * was evicted from the cache. Move it to the 2864 * MFU state. 2865 */ 2866 2867 if (buf->b_flags & ARC_PREFETCH) { 2868 new_state = arc_mru; 2869 if (refcount_count(&buf->b_refcnt) > 0) 2870 buf->b_flags &= ~ARC_PREFETCH; 2871 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 2872 } else { 2873 new_state = arc_mfu; 2874 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 2875 } 2876 2877 buf->b_arc_access = ddi_get_lbolt(); 2878 arc_change_state(new_state, buf, hash_lock); 2879 2880 ARCSTAT_BUMP(arcstat_mru_ghost_hits); 2881 } else if (buf->b_state == arc_mfu) { 2882 /* 2883 * This buffer has been accessed more than once and is 2884 * still in the cache. Keep it in the MFU state. 2885 * 2886 * NOTE: an add_reference() that occurred when we did 2887 * the arc_read() will have kicked this off the list. 2888 * If it was a prefetch, we will explicitly move it to 2889 * the head of the list now. 2890 */ 2891 if ((buf->b_flags & ARC_PREFETCH) != 0) { 2892 ASSERT(refcount_count(&buf->b_refcnt) == 0); 2893 ASSERT(list_link_active(&buf->b_arc_node)); 2894 } 2895 ARCSTAT_BUMP(arcstat_mfu_hits); 2896 buf->b_arc_access = ddi_get_lbolt(); 2897 } else if (buf->b_state == arc_mfu_ghost) { 2898 arc_state_t *new_state = arc_mfu; 2899 /* 2900 * This buffer has been accessed more than once but has 2901 * been evicted from the cache. Move it back to the 2902 * MFU state. 2903 */ 2904 2905 if (buf->b_flags & ARC_PREFETCH) { 2906 /* 2907 * This is a prefetch access... 2908 * move this block back to the MRU state. 2909 */ 2910 ASSERT0(refcount_count(&buf->b_refcnt)); 2911 new_state = arc_mru; 2912 } 2913 2914 buf->b_arc_access = ddi_get_lbolt(); 2915 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 2916 arc_change_state(new_state, buf, hash_lock); 2917 2918 ARCSTAT_BUMP(arcstat_mfu_ghost_hits); 2919 } else if (buf->b_state == arc_l2c_only) { 2920 /* 2921 * This buffer is on the 2nd Level ARC. 
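 * Only the header (identity plus L2ARC metadata) survives in this
 * state; the data itself lives on the cache device, so this access
 * simply moves the header to the MFU state below and the read path
 * repopulates the data buffer.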
2922 */ 2923 2924 buf->b_arc_access = ddi_get_lbolt(); 2925 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 2926 arc_change_state(arc_mfu, buf, hash_lock); 2927 } else { 2928 ASSERT(!"invalid arc state"); 2929 } 2930 } 2931 2932 /* a generic arc_done_func_t which you can use */ 2933 /* ARGSUSED */ 2934 void 2935 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) 2936 { 2937 if (zio == NULL || zio->io_error == 0) 2938 bcopy(buf->b_data, arg, buf->b_hdr->b_size); 2939 VERIFY(arc_buf_remove_ref(buf, arg)); 2940 } 2941 2942 /* a generic arc_done_func_t */ 2943 void 2944 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) 2945 { 2946 arc_buf_t **bufp = arg; 2947 if (zio && zio->io_error) { 2948 VERIFY(arc_buf_remove_ref(buf, arg)); 2949 *bufp = NULL; 2950 } else { 2951 *bufp = buf; 2952 ASSERT(buf->b_data); 2953 } 2954 } 2955 2956 static void 2957 arc_read_done(zio_t *zio) 2958 { 2959 arc_buf_hdr_t *hdr; 2960 arc_buf_t *buf; 2961 arc_buf_t *abuf; /* buffer we're assigning to callback */ 2962 kmutex_t *hash_lock = NULL; 2963 arc_callback_t *callback_list, *acb; 2964 int freeable = FALSE; 2965 2966 buf = zio->io_private; 2967 hdr = buf->b_hdr; 2968 2969 /* 2970 * The hdr was inserted into hash-table and removed from lists 2971 * prior to starting I/O. We should find this header, since 2972 * it's in the hash table, and it should be legit since it's 2973 * not possible to evict it during the I/O. The only possible 2974 * reason for it not to be found is if we were freed during the 2975 * read. 2976 */ 2977 if (HDR_IN_HASH_TABLE(hdr)) { 2978 ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); 2979 ASSERT3U(hdr->b_dva.dva_word[0], ==, 2980 BP_IDENTITY(zio->io_bp)->dva_word[0]); 2981 ASSERT3U(hdr->b_dva.dva_word[1], ==, 2982 BP_IDENTITY(zio->io_bp)->dva_word[1]); 2983 2984 arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp, 2985 &hash_lock); 2986 2987 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && 2988 hash_lock == NULL) || 2989 (found == hdr && 2990 DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || 2991 (found == hdr && HDR_L2_READING(hdr))); 2992 } 2993 2994 hdr->b_flags &= ~ARC_L2_EVICTED; 2995 if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH)) 2996 hdr->b_flags &= ~ARC_L2CACHE; 2997 2998 /* byteswap if necessary */ 2999 callback_list = hdr->b_acb; 3000 ASSERT(callback_list != NULL); 3001 if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { 3002 dmu_object_byteswap_t bswap = 3003 DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); 3004 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? 3005 byteswap_uint64_array : 3006 dmu_ot_byteswap[bswap].ob_func; 3007 func(buf->b_data, hdr->b_size); 3008 } 3009 3010 arc_cksum_compute(buf, B_FALSE); 3011 arc_buf_watch(buf); 3012 3013 if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) { 3014 /* 3015 * Only call arc_access on anonymous buffers. This is because 3016 * if we've issued an I/O for an evicted buffer, we've already 3017 * called arc_access (to prevent any simultaneous readers from 3018 * getting confused). 
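 * (That earlier call happens in arc_read()'s ghost-hit path, right
 * after a data buffer is reattached to the header, so repeating it
 * here would double-count the access.)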
3019 */ 3020 arc_access(hdr, hash_lock); 3021 } 3022 3023 /* create copies of the data buffer for the callers */ 3024 abuf = buf; 3025 for (acb = callback_list; acb; acb = acb->acb_next) { 3026 if (acb->acb_done) { 3027 if (abuf == NULL) { 3028 ARCSTAT_BUMP(arcstat_duplicate_reads); 3029 abuf = arc_buf_clone(buf); 3030 } 3031 acb->acb_buf = abuf; 3032 abuf = NULL; 3033 } 3034 } 3035 hdr->b_acb = NULL; 3036 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 3037 ASSERT(!HDR_BUF_AVAILABLE(hdr)); 3038 if (abuf == buf) { 3039 ASSERT(buf->b_efunc == NULL); 3040 ASSERT(hdr->b_datacnt == 1); 3041 hdr->b_flags |= ARC_BUF_AVAILABLE; 3042 } 3043 3044 ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); 3045 3046 if (zio->io_error != 0) { 3047 hdr->b_flags |= ARC_IO_ERROR; 3048 if (hdr->b_state != arc_anon) 3049 arc_change_state(arc_anon, hdr, hash_lock); 3050 if (HDR_IN_HASH_TABLE(hdr)) 3051 buf_hash_remove(hdr); 3052 freeable = refcount_is_zero(&hdr->b_refcnt); 3053 } 3054 3055 /* 3056 * Broadcast before we drop the hash_lock to avoid the possibility 3057 * that the hdr (and hence the cv) might be freed before we get to 3058 * the cv_broadcast(). 3059 */ 3060 cv_broadcast(&hdr->b_cv); 3061 3062 if (hash_lock) { 3063 mutex_exit(hash_lock); 3064 } else { 3065 /* 3066 * This block was freed while we waited for the read to 3067 * complete. It has been removed from the hash table and 3068 * moved to the anonymous state (so that it won't show up 3069 * in the cache). 3070 */ 3071 ASSERT3P(hdr->b_state, ==, arc_anon); 3072 freeable = refcount_is_zero(&hdr->b_refcnt); 3073 } 3074 3075 /* execute each callback and free its structure */ 3076 while ((acb = callback_list) != NULL) { 3077 if (acb->acb_done) 3078 acb->acb_done(zio, acb->acb_buf, acb->acb_private); 3079 3080 if (acb->acb_zio_dummy != NULL) { 3081 acb->acb_zio_dummy->io_error = zio->io_error; 3082 zio_nowait(acb->acb_zio_dummy); 3083 } 3084 3085 callback_list = acb->acb_next; 3086 kmem_free(acb, sizeof (arc_callback_t)); 3087 } 3088 3089 if (freeable) 3090 arc_hdr_destroy(hdr); 3091 } 3092 3093 /* 3094 * "Read" the block at the specified DVA (in bp) via the 3095 * cache. If the block is found in the cache, invoke the provided 3096 * callback immediately and return. Note that the `zio' parameter 3097 * in the callback will be NULL in this case, since no IO was 3098 * required. If the block is not in the cache pass the read request 3099 * on to the spa with a substitute callback function, so that the 3100 * requested block will be added to the cache. 3101 * 3102 * If a read request arrives for a block that has a read in-progress, 3103 * either wait for the in-progress read to complete (and return the 3104 * results); or, if this is a read with a "done" func, add a record 3105 * to the read to invoke the "done" func when the read completes, 3106 * and return; or just return. 3107 * 3108 * arc_read_done() will invoke all the requested "done" functions 3109 * for readers of this block. 
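 *
 * For illustration only (not a quote of any particular caller), a
 * blocking read that hands the buffer back through arc_getbuf_func()
 * looks roughly like:
 *
 *	arc_buf_t *abuf = NULL;
 *	uint32_t aflags = ARC_WAIT;
 *	int error;
 *
 *	error = arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
 *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);
 *	if (error == 0 && abuf != NULL) {
 *		... use abuf->b_data ...
 *		(void) arc_buf_remove_ref(abuf, &abuf);
 *	}
 *
 * The reference taken on behalf of the caller is tagged with the
 * "private" argument (&abuf above), so the same tag is used to drop it.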
3110 */ 3111 int 3112 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, 3113 void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags, 3114 const zbookmark_phys_t *zb) 3115 { 3116 arc_buf_hdr_t *hdr = NULL; 3117 arc_buf_t *buf = NULL; 3118 kmutex_t *hash_lock = NULL; 3119 zio_t *rzio; 3120 uint64_t guid = spa_load_guid(spa); 3121 3122 ASSERT(!BP_IS_EMBEDDED(bp) || 3123 BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); 3124 3125 top: 3126 if (!BP_IS_EMBEDDED(bp)) { 3127 /* 3128 * Embedded BP's have no DVA and require no I/O to "read". 3129 * Create an anonymous arc buf to back it. 3130 */ 3131 hdr = buf_hash_find(guid, bp, &hash_lock); 3132 } 3133 3134 if (hdr != NULL && hdr->b_datacnt > 0) { 3135 3136 *arc_flags |= ARC_CACHED; 3137 3138 if (HDR_IO_IN_PROGRESS(hdr)) { 3139 3140 if (*arc_flags & ARC_WAIT) { 3141 cv_wait(&hdr->b_cv, hash_lock); 3142 mutex_exit(hash_lock); 3143 goto top; 3144 } 3145 ASSERT(*arc_flags & ARC_NOWAIT); 3146 3147 if (done) { 3148 arc_callback_t *acb = NULL; 3149 3150 acb = kmem_zalloc(sizeof (arc_callback_t), 3151 KM_SLEEP); 3152 acb->acb_done = done; 3153 acb->acb_private = private; 3154 if (pio != NULL) 3155 acb->acb_zio_dummy = zio_null(pio, 3156 spa, NULL, NULL, NULL, zio_flags); 3157 3158 ASSERT(acb->acb_done != NULL); 3159 acb->acb_next = hdr->b_acb; 3160 hdr->b_acb = acb; 3161 add_reference(hdr, hash_lock, private); 3162 mutex_exit(hash_lock); 3163 return (0); 3164 } 3165 mutex_exit(hash_lock); 3166 return (0); 3167 } 3168 3169 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 3170 3171 if (done) { 3172 add_reference(hdr, hash_lock, private); 3173 /* 3174 * If this block is already in use, create a new 3175 * copy of the data so that we will be guaranteed 3176 * that arc_release() will always succeed. 
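 * If the header's single buffer is still marked ARC_BUF_AVAILABLE we
 * can hand that buffer out directly (clearing the flag); otherwise it
 * is already claimed by another holder and we clone the data below.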
3177 */ 3178 buf = hdr->b_buf; 3179 ASSERT(buf); 3180 ASSERT(buf->b_data); 3181 if (HDR_BUF_AVAILABLE(hdr)) { 3182 ASSERT(buf->b_efunc == NULL); 3183 hdr->b_flags &= ~ARC_BUF_AVAILABLE; 3184 } else { 3185 buf = arc_buf_clone(buf); 3186 } 3187 3188 } else if (*arc_flags & ARC_PREFETCH && 3189 refcount_count(&hdr->b_refcnt) == 0) { 3190 hdr->b_flags |= ARC_PREFETCH; 3191 } 3192 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 3193 arc_access(hdr, hash_lock); 3194 if (*arc_flags & ARC_L2CACHE) 3195 hdr->b_flags |= ARC_L2CACHE; 3196 if (*arc_flags & ARC_L2COMPRESS) 3197 hdr->b_flags |= ARC_L2COMPRESS; 3198 mutex_exit(hash_lock); 3199 ARCSTAT_BUMP(arcstat_hits); 3200 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 3201 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 3202 data, metadata, hits); 3203 3204 if (done) 3205 done(NULL, buf, private); 3206 } else { 3207 uint64_t size = BP_GET_LSIZE(bp); 3208 arc_callback_t *acb; 3209 vdev_t *vd = NULL; 3210 uint64_t addr = 0; 3211 boolean_t devw = B_FALSE; 3212 enum zio_compress b_compress = ZIO_COMPRESS_OFF; 3213 uint64_t b_asize = 0; 3214 3215 if (hdr == NULL) { 3216 /* this block is not in the cache */ 3217 arc_buf_hdr_t *exists = NULL; 3218 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); 3219 buf = arc_buf_alloc(spa, size, private, type); 3220 hdr = buf->b_hdr; 3221 if (!BP_IS_EMBEDDED(bp)) { 3222 hdr->b_dva = *BP_IDENTITY(bp); 3223 hdr->b_birth = BP_PHYSICAL_BIRTH(bp); 3224 hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; 3225 exists = buf_hash_insert(hdr, &hash_lock); 3226 } 3227 if (exists != NULL) { 3228 /* somebody beat us to the hash insert */ 3229 mutex_exit(hash_lock); 3230 buf_discard_identity(hdr); 3231 (void) arc_buf_remove_ref(buf, private); 3232 goto top; /* restart the IO request */ 3233 } 3234 /* if this is a prefetch, we don't have a reference */ 3235 if (*arc_flags & ARC_PREFETCH) { 3236 (void) remove_reference(hdr, hash_lock, 3237 private); 3238 hdr->b_flags |= ARC_PREFETCH; 3239 } 3240 if (*arc_flags & ARC_L2CACHE) 3241 hdr->b_flags |= ARC_L2CACHE; 3242 if (*arc_flags & ARC_L2COMPRESS) 3243 hdr->b_flags |= ARC_L2COMPRESS; 3244 if (BP_GET_LEVEL(bp) > 0) 3245 hdr->b_flags |= ARC_INDIRECT; 3246 } else { 3247 /* this block is in the ghost cache */ 3248 ASSERT(GHOST_STATE(hdr->b_state)); 3249 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 3250 ASSERT0(refcount_count(&hdr->b_refcnt)); 3251 ASSERT(hdr->b_buf == NULL); 3252 3253 /* if this is a prefetch, we don't have a reference */ 3254 if (*arc_flags & ARC_PREFETCH) 3255 hdr->b_flags |= ARC_PREFETCH; 3256 else 3257 add_reference(hdr, hash_lock, private); 3258 if (*arc_flags & ARC_L2CACHE) 3259 hdr->b_flags |= ARC_L2CACHE; 3260 if (*arc_flags & ARC_L2COMPRESS) 3261 hdr->b_flags |= ARC_L2COMPRESS; 3262 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 3263 buf->b_hdr = hdr; 3264 buf->b_data = NULL; 3265 buf->b_efunc = NULL; 3266 buf->b_private = NULL; 3267 buf->b_next = NULL; 3268 hdr->b_buf = buf; 3269 ASSERT(hdr->b_datacnt == 0); 3270 hdr->b_datacnt = 1; 3271 arc_get_data_buf(buf); 3272 arc_access(hdr, hash_lock); 3273 } 3274 3275 ASSERT(!GHOST_STATE(hdr->b_state)); 3276 3277 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); 3278 acb->acb_done = done; 3279 acb->acb_private = private; 3280 3281 ASSERT(hdr->b_acb == NULL); 3282 hdr->b_acb = acb; 3283 hdr->b_flags |= ARC_IO_IN_PROGRESS; 3284 3285 if (hdr->b_l2hdr != NULL && 3286 (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) { 3287 devw = hdr->b_l2hdr->b_dev->l2ad_writing; 3288 addr = hdr->b_l2hdr->b_daddr; 3289 b_compress = hdr->b_l2hdr->b_compress; 
3290 b_asize = hdr->b_l2hdr->b_asize; 3291 /* 3292 * Lock out device removal. 3293 */ 3294 if (vdev_is_dead(vd) || 3295 !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) 3296 vd = NULL; 3297 } 3298 3299 if (hash_lock != NULL) 3300 mutex_exit(hash_lock); 3301 3302 /* 3303 * At this point, we have a level 1 cache miss. Try again in 3304 * L2ARC if possible. 3305 */ 3306 ASSERT3U(hdr->b_size, ==, size); 3307 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, 3308 uint64_t, size, zbookmark_phys_t *, zb); 3309 ARCSTAT_BUMP(arcstat_misses); 3310 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 3311 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 3312 data, metadata, misses); 3313 3314 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { 3315 /* 3316 * Read from the L2ARC if the following are true: 3317 * 1. The L2ARC vdev was previously cached. 3318 * 2. This buffer still has L2ARC metadata. 3319 * 3. This buffer isn't currently writing to the L2ARC. 3320 * 4. The L2ARC entry wasn't evicted, which may 3321 * also have invalidated the vdev. 3322 * 5. This isn't prefetch and l2arc_noprefetch is set. 3323 */ 3324 if (hdr->b_l2hdr != NULL && 3325 !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && 3326 !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { 3327 l2arc_read_callback_t *cb; 3328 3329 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); 3330 ARCSTAT_BUMP(arcstat_l2_hits); 3331 3332 cb = kmem_zalloc(sizeof (l2arc_read_callback_t), 3333 KM_SLEEP); 3334 cb->l2rcb_buf = buf; 3335 cb->l2rcb_spa = spa; 3336 cb->l2rcb_bp = *bp; 3337 cb->l2rcb_zb = *zb; 3338 cb->l2rcb_flags = zio_flags; 3339 cb->l2rcb_compress = b_compress; 3340 3341 ASSERT(addr >= VDEV_LABEL_START_SIZE && 3342 addr + size < vd->vdev_psize - 3343 VDEV_LABEL_END_SIZE); 3344 3345 /* 3346 * l2arc read. The SCL_L2ARC lock will be 3347 * released by l2arc_read_done(). 3348 * Issue a null zio if the underlying buffer 3349 * was squashed to zero size by compression. 
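 * (ZIO_COMPRESS_EMPTY is how the L2ARC records a buffer that
 * compressed away to nothing, i.e. was all zeros; there is nothing on
 * the device to read, so the read-done path simply zero-fills the
 * buffer.)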
3350 */ 3351 if (b_compress == ZIO_COMPRESS_EMPTY) { 3352 rzio = zio_null(pio, spa, vd, 3353 l2arc_read_done, cb, 3354 zio_flags | ZIO_FLAG_DONT_CACHE | 3355 ZIO_FLAG_CANFAIL | 3356 ZIO_FLAG_DONT_PROPAGATE | 3357 ZIO_FLAG_DONT_RETRY); 3358 } else { 3359 rzio = zio_read_phys(pio, vd, addr, 3360 b_asize, buf->b_data, 3361 ZIO_CHECKSUM_OFF, 3362 l2arc_read_done, cb, priority, 3363 zio_flags | ZIO_FLAG_DONT_CACHE | 3364 ZIO_FLAG_CANFAIL | 3365 ZIO_FLAG_DONT_PROPAGATE | 3366 ZIO_FLAG_DONT_RETRY, B_FALSE); 3367 } 3368 DTRACE_PROBE2(l2arc__read, vdev_t *, vd, 3369 zio_t *, rzio); 3370 ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize); 3371 3372 if (*arc_flags & ARC_NOWAIT) { 3373 zio_nowait(rzio); 3374 return (0); 3375 } 3376 3377 ASSERT(*arc_flags & ARC_WAIT); 3378 if (zio_wait(rzio) == 0) 3379 return (0); 3380 3381 /* l2arc read error; goto zio_read() */ 3382 } else { 3383 DTRACE_PROBE1(l2arc__miss, 3384 arc_buf_hdr_t *, hdr); 3385 ARCSTAT_BUMP(arcstat_l2_misses); 3386 if (HDR_L2_WRITING(hdr)) 3387 ARCSTAT_BUMP(arcstat_l2_rw_clash); 3388 spa_config_exit(spa, SCL_L2ARC, vd); 3389 } 3390 } else { 3391 if (vd != NULL) 3392 spa_config_exit(spa, SCL_L2ARC, vd); 3393 if (l2arc_ndev != 0) { 3394 DTRACE_PROBE1(l2arc__miss, 3395 arc_buf_hdr_t *, hdr); 3396 ARCSTAT_BUMP(arcstat_l2_misses); 3397 } 3398 } 3399 3400 rzio = zio_read(pio, spa, bp, buf->b_data, size, 3401 arc_read_done, buf, priority, zio_flags, zb); 3402 3403 if (*arc_flags & ARC_WAIT) 3404 return (zio_wait(rzio)); 3405 3406 ASSERT(*arc_flags & ARC_NOWAIT); 3407 zio_nowait(rzio); 3408 } 3409 return (0); 3410 } 3411 3412 void 3413 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) 3414 { 3415 ASSERT(buf->b_hdr != NULL); 3416 ASSERT(buf->b_hdr->b_state != arc_anon); 3417 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL); 3418 ASSERT(buf->b_efunc == NULL); 3419 ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr)); 3420 3421 buf->b_efunc = func; 3422 buf->b_private = private; 3423 } 3424 3425 /* 3426 * Notify the arc that a block was freed, and thus will never be used again. 3427 */ 3428 void 3429 arc_freed(spa_t *spa, const blkptr_t *bp) 3430 { 3431 arc_buf_hdr_t *hdr; 3432 kmutex_t *hash_lock; 3433 uint64_t guid = spa_load_guid(spa); 3434 3435 ASSERT(!BP_IS_EMBEDDED(bp)); 3436 3437 hdr = buf_hash_find(guid, bp, &hash_lock); 3438 if (hdr == NULL) 3439 return; 3440 if (HDR_BUF_AVAILABLE(hdr)) { 3441 arc_buf_t *buf = hdr->b_buf; 3442 add_reference(hdr, hash_lock, FTAG); 3443 hdr->b_flags &= ~ARC_BUF_AVAILABLE; 3444 mutex_exit(hash_lock); 3445 3446 arc_release(buf, FTAG); 3447 (void) arc_buf_remove_ref(buf, FTAG); 3448 } else { 3449 mutex_exit(hash_lock); 3450 } 3451 3452 } 3453 3454 /* 3455 * Clear the user eviction callback set by arc_set_callback(), first calling 3456 * it if it exists. Because the presence of a callback keeps an arc_buf cached 3457 * clearing the callback may result in the arc_buf being destroyed. However, 3458 * it will not result in the *last* arc_buf being destroyed, hence the data 3459 * will remain cached in the ARC. We make a copy of the arc buffer here so 3460 * that we can process the callback without holding any locks. 3461 * 3462 * It's possible that the callback is already in the process of being cleared 3463 * by another thread. In this case we can not clear the callback. 3464 * 3465 * Returns B_TRUE if the callback was successfully called and cleared. 
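 * (Concretely, "already in the process of being cleared" shows up
 * below as a NULL b_hdr: arc_do_user_evicts() has detached the buffer
 * and will invoke the callback itself, so we return B_FALSE.)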
3466 */ 3467 boolean_t 3468 arc_clear_callback(arc_buf_t *buf) 3469 { 3470 arc_buf_hdr_t *hdr; 3471 kmutex_t *hash_lock; 3472 arc_evict_func_t *efunc = buf->b_efunc; 3473 void *private = buf->b_private; 3474 3475 mutex_enter(&buf->b_evict_lock); 3476 hdr = buf->b_hdr; 3477 if (hdr == NULL) { 3478 /* 3479 * We are in arc_do_user_evicts(). 3480 */ 3481 ASSERT(buf->b_data == NULL); 3482 mutex_exit(&buf->b_evict_lock); 3483 return (B_FALSE); 3484 } else if (buf->b_data == NULL) { 3485 /* 3486 * We are on the eviction list; process this buffer now 3487 * but let arc_do_user_evicts() do the reaping. 3488 */ 3489 buf->b_efunc = NULL; 3490 mutex_exit(&buf->b_evict_lock); 3491 VERIFY0(efunc(private)); 3492 return (B_TRUE); 3493 } 3494 hash_lock = HDR_LOCK(hdr); 3495 mutex_enter(hash_lock); 3496 hdr = buf->b_hdr; 3497 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 3498 3499 ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt); 3500 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 3501 3502 buf->b_efunc = NULL; 3503 buf->b_private = NULL; 3504 3505 if (hdr->b_datacnt > 1) { 3506 mutex_exit(&buf->b_evict_lock); 3507 arc_buf_destroy(buf, FALSE, TRUE); 3508 } else { 3509 ASSERT(buf == hdr->b_buf); 3510 hdr->b_flags |= ARC_BUF_AVAILABLE; 3511 mutex_exit(&buf->b_evict_lock); 3512 } 3513 3514 mutex_exit(hash_lock); 3515 VERIFY0(efunc(private)); 3516 return (B_TRUE); 3517 } 3518 3519 /* 3520 * Release this buffer from the cache, making it an anonymous buffer. This 3521 * must be done after a read and prior to modifying the buffer contents. 3522 * If the buffer has more than one reference, we must make 3523 * a new hdr for the buffer. 3524 */ 3525 void 3526 arc_release(arc_buf_t *buf, void *tag) 3527 { 3528 arc_buf_hdr_t *hdr; 3529 kmutex_t *hash_lock = NULL; 3530 l2arc_buf_hdr_t *l2hdr; 3531 uint64_t buf_size; 3532 3533 /* 3534 * It would be nice to assert that if it's DMU metadata (level > 3535 * 0 || it's the dnode file), then it must be syncing context. 3536 * But we don't know that information at this level. 3537 */ 3538 3539 mutex_enter(&buf->b_evict_lock); 3540 hdr = buf->b_hdr; 3541 3542 /* this buffer is not on any list */ 3543 ASSERT(refcount_count(&hdr->b_refcnt) > 0); 3544 3545 if (hdr->b_state == arc_anon) { 3546 /* this buffer is already released */ 3547 ASSERT(buf->b_efunc == NULL); 3548 } else { 3549 hash_lock = HDR_LOCK(hdr); 3550 mutex_enter(hash_lock); 3551 hdr = buf->b_hdr; 3552 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 3553 } 3554 3555 l2hdr = hdr->b_l2hdr; 3556 if (l2hdr) { 3557 mutex_enter(&l2arc_buflist_mtx); 3558 hdr->b_l2hdr = NULL; 3559 list_remove(l2hdr->b_dev->l2ad_buflist, hdr); 3560 } 3561 buf_size = hdr->b_size; 3562 3563 /* 3564 * Do we have more than one buf? 3565 */ 3566 if (hdr->b_datacnt > 1) { 3567 arc_buf_hdr_t *nhdr; 3568 arc_buf_t **bufp; 3569 uint64_t blksz = hdr->b_size; 3570 uint64_t spa = hdr->b_spa; 3571 arc_buf_contents_t type = hdr->b_type; 3572 uint32_t flags = hdr->b_flags; 3573 3574 ASSERT(hdr->b_buf != buf || buf->b_next != NULL); 3575 /* 3576 * Pull the data off of this hdr and attach it to 3577 * a new anonymous hdr. 
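 * Roughly: drop the caller's reference, unlink this buf from the
 * hdr's buffer list, adjust the old state's byte counts (and, for
 * data buffers, the duplicate-buffer stats), then hang the buf off a
 * freshly allocated anonymous header (nhdr) holding a single
 * reference for the caller's tag.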
3578 */ 3579 (void) remove_reference(hdr, hash_lock, tag); 3580 bufp = &hdr->b_buf; 3581 while (*bufp != buf) 3582 bufp = &(*bufp)->b_next; 3583 *bufp = buf->b_next; 3584 buf->b_next = NULL; 3585 3586 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size); 3587 atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size); 3588 if (refcount_is_zero(&hdr->b_refcnt)) { 3589 uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type]; 3590 ASSERT3U(*size, >=, hdr->b_size); 3591 atomic_add_64(size, -hdr->b_size); 3592 } 3593 3594 /* 3595 * We're releasing a duplicate user data buffer, update 3596 * our statistics accordingly. 3597 */ 3598 if (hdr->b_type == ARC_BUFC_DATA) { 3599 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); 3600 ARCSTAT_INCR(arcstat_duplicate_buffers_size, 3601 -hdr->b_size); 3602 } 3603 hdr->b_datacnt -= 1; 3604 arc_cksum_verify(buf); 3605 arc_buf_unwatch(buf); 3606 3607 mutex_exit(hash_lock); 3608 3609 nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); 3610 nhdr->b_size = blksz; 3611 nhdr->b_spa = spa; 3612 nhdr->b_type = type; 3613 nhdr->b_buf = buf; 3614 nhdr->b_state = arc_anon; 3615 nhdr->b_arc_access = 0; 3616 nhdr->b_flags = flags & ARC_L2_WRITING; 3617 nhdr->b_l2hdr = NULL; 3618 nhdr->b_datacnt = 1; 3619 nhdr->b_freeze_cksum = NULL; 3620 (void) refcount_add(&nhdr->b_refcnt, tag); 3621 buf->b_hdr = nhdr; 3622 mutex_exit(&buf->b_evict_lock); 3623 atomic_add_64(&arc_anon->arcs_size, blksz); 3624 } else { 3625 mutex_exit(&buf->b_evict_lock); 3626 ASSERT(refcount_count(&hdr->b_refcnt) == 1); 3627 ASSERT(!list_link_active(&hdr->b_arc_node)); 3628 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 3629 if (hdr->b_state != arc_anon) 3630 arc_change_state(arc_anon, hdr, hash_lock); 3631 hdr->b_arc_access = 0; 3632 if (hash_lock) 3633 mutex_exit(hash_lock); 3634 3635 buf_discard_identity(hdr); 3636 arc_buf_thaw(buf); 3637 } 3638 buf->b_efunc = NULL; 3639 buf->b_private = NULL; 3640 3641 if (l2hdr) { 3642 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); 3643 if (l2hdr->b_dev->l2ad_vdev) 3644 vdev_space_update(l2hdr->b_dev->l2ad_vdev, 3645 -l2hdr->b_asize, 0, 0); 3646 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); 3647 ARCSTAT_INCR(arcstat_l2_size, -buf_size); 3648 mutex_exit(&l2arc_buflist_mtx); 3649 } 3650 } 3651 3652 int 3653 arc_released(arc_buf_t *buf) 3654 { 3655 int released; 3656 3657 mutex_enter(&buf->b_evict_lock); 3658 released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); 3659 mutex_exit(&buf->b_evict_lock); 3660 return (released); 3661 } 3662 3663 #ifdef ZFS_DEBUG 3664 int 3665 arc_referenced(arc_buf_t *buf) 3666 { 3667 int referenced; 3668 3669 mutex_enter(&buf->b_evict_lock); 3670 referenced = (refcount_count(&buf->b_hdr->b_refcnt)); 3671 mutex_exit(&buf->b_evict_lock); 3672 return (referenced); 3673 } 3674 #endif 3675 3676 static void 3677 arc_write_ready(zio_t *zio) 3678 { 3679 arc_write_callback_t *callback = zio->io_private; 3680 arc_buf_t *buf = callback->awcb_buf; 3681 arc_buf_hdr_t *hdr = buf->b_hdr; 3682 3683 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt)); 3684 callback->awcb_ready(zio, buf, callback->awcb_private); 3685 3686 /* 3687 * If the IO is already in progress, then this is a re-write 3688 * attempt, so we need to thaw and re-compute the cksum. 3689 * It is the responsibility of the callback to handle the 3690 * accounting for any re-write attempt. 
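 * In practice that means: on a re-write we discard the stale
 * b_freeze_cksum under b_freeze_lock, then fall through to recompute
 * the checksum and mark the I/O in progress again.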
3691 */ 3692 if (HDR_IO_IN_PROGRESS(hdr)) { 3693 mutex_enter(&hdr->b_freeze_lock); 3694 if (hdr->b_freeze_cksum != NULL) { 3695 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 3696 hdr->b_freeze_cksum = NULL; 3697 } 3698 mutex_exit(&hdr->b_freeze_lock); 3699 } 3700 arc_cksum_compute(buf, B_FALSE); 3701 hdr->b_flags |= ARC_IO_IN_PROGRESS; 3702 } 3703 3704 /* 3705 * The SPA calls this callback for each physical write that happens on behalf 3706 * of a logical write. See the comment in dbuf_write_physdone() for details. 3707 */ 3708 static void 3709 arc_write_physdone(zio_t *zio) 3710 { 3711 arc_write_callback_t *cb = zio->io_private; 3712 if (cb->awcb_physdone != NULL) 3713 cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); 3714 } 3715 3716 static void 3717 arc_write_done(zio_t *zio) 3718 { 3719 arc_write_callback_t *callback = zio->io_private; 3720 arc_buf_t *buf = callback->awcb_buf; 3721 arc_buf_hdr_t *hdr = buf->b_hdr; 3722 3723 ASSERT(hdr->b_acb == NULL); 3724 3725 if (zio->io_error == 0) { 3726 if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { 3727 buf_discard_identity(hdr); 3728 } else { 3729 hdr->b_dva = *BP_IDENTITY(zio->io_bp); 3730 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); 3731 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; 3732 } 3733 } else { 3734 ASSERT(BUF_EMPTY(hdr)); 3735 } 3736 3737 /* 3738 * If the block to be written was all-zero or compressed enough to be 3739 * embedded in the BP, no write was performed so there will be no 3740 * dva/birth/checksum. The buffer must therefore remain anonymous 3741 * (and uncached). 3742 */ 3743 if (!BUF_EMPTY(hdr)) { 3744 arc_buf_hdr_t *exists; 3745 kmutex_t *hash_lock; 3746 3747 ASSERT(zio->io_error == 0); 3748 3749 arc_cksum_verify(buf); 3750 3751 exists = buf_hash_insert(hdr, &hash_lock); 3752 if (exists) { 3753 /* 3754 * This can only happen if we overwrite for 3755 * sync-to-convergence, because we remove 3756 * buffers from the hash table when we arc_free(). 
3757 */ 3758 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { 3759 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 3760 panic("bad overwrite, hdr=%p exists=%p", 3761 (void *)hdr, (void *)exists); 3762 ASSERT(refcount_is_zero(&exists->b_refcnt)); 3763 arc_change_state(arc_anon, exists, hash_lock); 3764 mutex_exit(hash_lock); 3765 arc_hdr_destroy(exists); 3766 exists = buf_hash_insert(hdr, &hash_lock); 3767 ASSERT3P(exists, ==, NULL); 3768 } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { 3769 /* nopwrite */ 3770 ASSERT(zio->io_prop.zp_nopwrite); 3771 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 3772 panic("bad nopwrite, hdr=%p exists=%p", 3773 (void *)hdr, (void *)exists); 3774 } else { 3775 /* Dedup */ 3776 ASSERT(hdr->b_datacnt == 1); 3777 ASSERT(hdr->b_state == arc_anon); 3778 ASSERT(BP_GET_DEDUP(zio->io_bp)); 3779 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); 3780 } 3781 } 3782 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 3783 /* if it's not anon, we are doing a scrub */ 3784 if (!exists && hdr->b_state == arc_anon) 3785 arc_access(hdr, hash_lock); 3786 mutex_exit(hash_lock); 3787 } else { 3788 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 3789 } 3790 3791 ASSERT(!refcount_is_zero(&hdr->b_refcnt)); 3792 callback->awcb_done(zio, buf, callback->awcb_private); 3793 3794 kmem_free(callback, sizeof (arc_write_callback_t)); 3795 } 3796 3797 zio_t * 3798 arc_write(zio_t *pio, spa_t *spa, uint64_t txg, 3799 blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, 3800 const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone, 3801 arc_done_func_t *done, void *private, zio_priority_t priority, 3802 int zio_flags, const zbookmark_phys_t *zb) 3803 { 3804 arc_buf_hdr_t *hdr = buf->b_hdr; 3805 arc_write_callback_t *callback; 3806 zio_t *zio; 3807 3808 ASSERT(ready != NULL); 3809 ASSERT(done != NULL); 3810 ASSERT(!HDR_IO_ERROR(hdr)); 3811 ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0); 3812 ASSERT(hdr->b_acb == NULL); 3813 if (l2arc) 3814 hdr->b_flags |= ARC_L2CACHE; 3815 if (l2arc_compress) 3816 hdr->b_flags |= ARC_L2COMPRESS; 3817 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); 3818 callback->awcb_ready = ready; 3819 callback->awcb_physdone = physdone; 3820 callback->awcb_done = done; 3821 callback->awcb_private = private; 3822 callback->awcb_buf = buf; 3823 3824 zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp, 3825 arc_write_ready, arc_write_physdone, arc_write_done, callback, 3826 priority, zio_flags, zb); 3827 3828 return (zio); 3829 } 3830 3831 static int 3832 arc_memory_throttle(uint64_t reserve, uint64_t txg) 3833 { 3834 #ifdef _KERNEL 3835 uint64_t available_memory = ptob(freemem); 3836 static uint64_t page_load = 0; 3837 static uint64_t last_txg = 0; 3838 3839 #if defined(__i386) 3840 available_memory = 3841 MIN(available_memory, vmem_size(heap_arena, VMEM_FREE)); 3842 #endif 3843 3844 if (freemem > physmem * arc_lotsfree_percent / 100) 3845 return (0); 3846 3847 if (txg > last_txg) { 3848 last_txg = txg; 3849 page_load = 0; 3850 } 3851 /* 3852 * If we are in pageout, we know that memory is already tight, 3853 * the arc is already going to be evicting, so we just want to 3854 * continue to let page writes occur as quickly as possible. 
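 *
 * Added illustration (hypothetical numbers, not defaults): with
 * available_memory at 256MB and ptob(minfree) at 16MB, the pageout
 * thread may accumulate page_load up to MAX(16MB, 256MB) / 4 = 64MB
 * before ERESTART is returned; each reservation adds reserve / 8 to
 * page_load, since the reserve passed in is inflated. Outside of
 * pageout, any prior page_load combined with arc_reclaim_needed()
 * returns EAGAIN instead.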
3855 */ 3856 if (curproc == proc_pageout) { 3857 if (page_load > MAX(ptob(minfree), available_memory) / 4) 3858 return (SET_ERROR(ERESTART)); 3859 /* Note: reserve is inflated, so we deflate */ 3860 page_load += reserve / 8; 3861 return (0); 3862 } else if (page_load > 0 && arc_reclaim_needed()) { 3863 /* memory is low, delay before restarting */ 3864 ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 3865 return (SET_ERROR(EAGAIN)); 3866 } 3867 page_load = 0; 3868 #endif 3869 return (0); 3870 } 3871 3872 void 3873 arc_tempreserve_clear(uint64_t reserve) 3874 { 3875 atomic_add_64(&arc_tempreserve, -reserve); 3876 ASSERT((int64_t)arc_tempreserve >= 0); 3877 } 3878 3879 int 3880 arc_tempreserve_space(uint64_t reserve, uint64_t txg) 3881 { 3882 int error; 3883 uint64_t anon_size; 3884 3885 if (reserve > arc_c/4 && !arc_no_grow) 3886 arc_c = MIN(arc_c_max, reserve * 4); 3887 if (reserve > arc_c) 3888 return (SET_ERROR(ENOMEM)); 3889 3890 /* 3891 * Don't count loaned bufs as in flight dirty data to prevent long 3892 * network delays from blocking transactions that are ready to be 3893 * assigned to a txg. 3894 */ 3895 anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0); 3896 3897 /* 3898 * Writes will, almost always, require additional memory allocations 3899 * in order to compress/encrypt/etc the data. We therefore need to 3900 * make sure that there is sufficient available memory for this. 3901 */ 3902 error = arc_memory_throttle(reserve, txg); 3903 if (error != 0) 3904 return (error); 3905 3906 /* 3907 * Throttle writes when the amount of dirty data in the cache 3908 * gets too large. We try to keep the cache less than half full 3909 * of dirty blocks so that our sync times don't grow too large. 3910 * Note: if two requests come in concurrently, we might let them 3911 * both succeed, when one of them should fail. Not a huge deal. 3912 */ 3913 3914 if (reserve + arc_tempreserve + anon_size > arc_c / 2 && 3915 anon_size > arc_c / 4) { 3916 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " 3917 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", 3918 arc_tempreserve>>10, 3919 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, 3920 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, 3921 reserve>>10, arc_c>>10); 3922 return (SET_ERROR(ERESTART)); 3923 } 3924 atomic_add_64(&arc_tempreserve, reserve); 3925 return (0); 3926 } 3927 3928 /* Tuneable, default is 64, which is essentially arbitrary */ 3929 int zfs_flush_ntasks = 64; 3930 3931 void 3932 arc_init(void) 3933 { 3934 mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); 3935 cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); 3936 3937 mutex_init(&arc_pressure_thr_lock, NULL, MUTEX_DEFAULT, NULL); 3938 cv_init(&arc_pressure_thr_cv, NULL, CV_DEFAULT, NULL); 3939 3940 /* Convert seconds to clock ticks */ 3941 arc_min_prefetch_lifespan = 1 * hz; 3942 3943 /* Start out with 1/8 of all memory */ 3944 arc_c = physmem * PAGESIZE / 8; 3945 3946 #ifdef _KERNEL 3947 /* 3948 * On architectures where the physical memory can be larger 3949 * than the addressable space (intel in 32-bit mode), we may 3950 * need to limit the cache to 1/8 of VM size. 
3951 */ 3952 arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); 3953 #endif 3954 3955 /* initial sensible value */ 3956 arc_pressure_threshold = arc_c; 3957 /* set min cache to 1/32 of all memory, or 64MB, whichever is more */ 3958 arc_c_min = MAX(arc_c / 4, 64<<20); 3959 /* set max to 3/4 of all memory, or all but 1GB, whichever is more */ 3960 if (arc_c * 8 >= 1<<30) 3961 arc_c_max = (arc_c * 8) - (1<<30); 3962 else 3963 arc_c_max = arc_c_min; 3964 arc_c_max = MAX(arc_c * 6, arc_c_max); 3965 3966 /* 3967 * Allow the tunables to override our calculations if they are 3968 * reasonable (ie. over 64MB) 3969 */ 3970 if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE) 3971 arc_c_max = zfs_arc_max; 3972 if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max) 3973 arc_c_min = zfs_arc_min; 3974 3975 arc_c = arc_c_max; 3976 arc_p = (arc_c >> 1); 3977 3978 /* limit meta-data to 1/4 of the arc capacity */ 3979 arc_meta_limit = arc_c_max / 4; 3980 3981 /* Allow the tunable to override if it is reasonable */ 3982 if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) 3983 arc_meta_limit = zfs_arc_meta_limit; 3984 3985 if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) 3986 arc_c_min = arc_meta_limit / 2; 3987 3988 if (zfs_arc_grow_retry > 0) 3989 arc_grow_retry = zfs_arc_grow_retry; 3990 3991 if (zfs_arc_shrink_shift > 0) 3992 arc_shrink_shift = zfs_arc_shrink_shift; 3993 3994 if (zfs_arc_p_min_shift > 0) 3995 arc_p_min_shift = zfs_arc_p_min_shift; 3996 3997 /* if kmem_flags are set, lets try to use less memory */ 3998 if (kmem_debugging()) 3999 arc_c = arc_c / 2; 4000 if (arc_c < arc_c_min) 4001 arc_c = arc_c_min; 4002 4003 arc_anon = &ARC_anon; 4004 arc_mru = &ARC_mru; 4005 arc_mru_ghost = &ARC_mru_ghost; 4006 arc_mfu = &ARC_mfu; 4007 arc_mfu_ghost = &ARC_mfu_ghost; 4008 arc_l2c_only = &ARC_l2c_only; 4009 arc_size = 0; 4010 4011 mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 4012 mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 4013 mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 4014 mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 4015 mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 4016 mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 4017 4018 list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], 4019 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 4020 list_create(&arc_mru->arcs_list[ARC_BUFC_DATA], 4021 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 4022 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], 4023 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 4024 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], 4025 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 4026 list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], 4027 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 4028 list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], 4029 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 4030 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], 4031 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 4032 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], 4033 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 4034 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], 4035 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 4036 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], 4037 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 4038 
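/*
 * Added worked example (hypothetical machine; ignores the zfs_arc_*
 * overrides and the kmem_debugging() halving above): with 8GB of
 * physical memory,
 *
 *	arc_c		starts at physmem/8			= 1GB
 *	arc_c_max	= MAX(arc_c * 6, arc_c * 8 - 1GB)	= 7GB
 *	arc_meta_limit	= arc_c_max / 4				= 1.75GB
 *	arc_c_min	= MAX(arc_c / 4, 64MB) = 256MB, then
 *			  raised to arc_meta_limit / 2		= 896MB
 *	arc_c		is then set to arc_c_max		= 7GB
 *	arc_p		= arc_c / 2				= 3.5GB
 */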
4039 arc_flush_taskq = taskq_create("arc_flush_tq", 4040 max_ncpus, minclsyspri, 1, zfs_flush_ntasks, TASKQ_DYNAMIC); 4041 buf_init(); 4042 4043 arc_thread_exit = 0; 4044 arc_eviction_list = NULL; 4045 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); 4046 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); 4047 4048 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, 4049 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 4050 4051 if (arc_ksp != NULL) { 4052 arc_ksp->ks_data = &arc_stats; 4053 kstat_install(arc_ksp); 4054 } 4055 4056 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, 4057 TS_RUN, minclsyspri); 4058 (void) thread_create(NULL, 0, arc_pressure_thread, NULL, 0, &p0, 4059 TS_RUN, minclsyspri); 4060 4061 arc_dead = FALSE; 4062 arc_warm = B_FALSE; 4063 4064 /* 4065 * Calculate maximum amount of dirty data per pool. 4066 * 4067 * If it has been set by /etc/system, take that. 4068 * Otherwise, use a percentage of physical memory defined by 4069 * zfs_dirty_data_max_percent (default 10%) with a cap at 4070 * zfs_dirty_data_max_max (default 4GB). 4071 */ 4072 if (zfs_dirty_data_max == 0) { 4073 zfs_dirty_data_max = physmem * PAGESIZE * 4074 zfs_dirty_data_max_percent / 100; 4075 zfs_dirty_data_max = MIN(zfs_dirty_data_max, 4076 zfs_dirty_data_max_max); 4077 } 4078 } 4079 4080 void 4081 arc_fini(void) 4082 { 4083 mutex_enter(&arc_reclaim_thr_lock); 4084 arc_thread_exit = 1; 4085 while (arc_thread_exit != 0) 4086 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); 4087 mutex_exit(&arc_reclaim_thr_lock); 4088 4089 mutex_enter(&arc_pressure_thr_lock); 4090 arc_pressure_thread_exit = 1; 4091 while (arc_pressure_thread_exit != 0) 4092 cv_wait(&arc_pressure_thr_cv, &arc_pressure_thr_lock); 4093 mutex_exit(&arc_pressure_thr_lock); 4094 4095 arc_flush(NULL); 4096 4097 arc_dead = TRUE; 4098 4099 if (arc_ksp != NULL) { 4100 kstat_delete(arc_ksp); 4101 arc_ksp = NULL; 4102 } 4103 4104 mutex_destroy(&arc_eviction_mtx); 4105 mutex_destroy(&arc_reclaim_thr_lock); 4106 cv_destroy(&arc_reclaim_thr_cv); 4107 mutex_destroy(&arc_pressure_thr_lock); 4108 cv_destroy(&arc_pressure_thr_cv); 4109 4110 list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); 4111 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); 4112 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); 4113 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); 4114 list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); 4115 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); 4116 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); 4117 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); 4118 4119 mutex_destroy(&arc_anon->arcs_mtx); 4120 mutex_destroy(&arc_mru->arcs_mtx); 4121 mutex_destroy(&arc_mru_ghost->arcs_mtx); 4122 mutex_destroy(&arc_mfu->arcs_mtx); 4123 mutex_destroy(&arc_mfu_ghost->arcs_mtx); 4124 mutex_destroy(&arc_l2c_only->arcs_mtx); 4125 4126 taskq_destroy(arc_flush_taskq); 4127 buf_fini(); 4128 4129 ASSERT(arc_loaned_bytes == 0); 4130 } 4131 4132 /* 4133 * Level 2 ARC 4134 * 4135 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. 4136 * It uses dedicated storage devices to hold cached data, which are populated 4137 * using large infrequent writes. The main role of this cache is to boost 4138 * the performance of random read workloads. The intended L2ARC devices 4139 * include short-stroked disks, solid state disks, and other media with 4140 * substantially faster read latency than disk. 
4141 * 4142 * +-----------------------+ 4143 * | ARC | 4144 * +-----------------------+ 4145 * | ^ ^ 4146 * | | | 4147 * l2arc_feed_thread() arc_read() 4148 * | | | 4149 * | l2arc read | 4150 * V | | 4151 * +---------------+ | 4152 * | L2ARC | | 4153 * +---------------+ | 4154 * | ^ | 4155 * l2arc_write() | | 4156 * | | | 4157 * V | | 4158 * +-------+ +-------+ 4159 * | vdev | | vdev | 4160 * | cache | | cache | 4161 * +-------+ +-------+ 4162 * +=========+ .-----. 4163 * : L2ARC : |-_____-| 4164 * : devices : | Disks | 4165 * +=========+ `-_____-' 4166 * 4167 * Read requests are satisfied from the following sources, in order: 4168 * 4169 * 1) ARC 4170 * 2) vdev cache of L2ARC devices 4171 * 3) L2ARC devices 4172 * 4) vdev cache of disks 4173 * 5) disks 4174 * 4175 * Some L2ARC device types exhibit extremely slow write performance. 4176 * To accommodate for this there are some significant differences between 4177 * the L2ARC and traditional cache design: 4178 * 4179 * 1. There is no eviction path from the ARC to the L2ARC. Evictions from 4180 * the ARC behave as usual, freeing buffers and placing headers on ghost 4181 * lists. The ARC does not send buffers to the L2ARC during eviction as 4182 * this would add inflated write latencies for all ARC memory pressure. 4183 * 4184 * 2. The L2ARC attempts to cache data from the ARC before it is evicted. 4185 * It does this by periodically scanning buffers from the eviction-end of 4186 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are 4187 * not already there. It scans until a headroom of buffers is satisfied, 4188 * which itself is a buffer for ARC eviction. If a compressible buffer is 4189 * found during scanning and selected for writing to an L2ARC device, we 4190 * temporarily boost scanning headroom during the next scan cycle to make 4191 * sure we adapt to compression effects (which might significantly reduce 4192 * the data volume we write to L2ARC). The thread that does this is 4193 * l2arc_feed_thread(), illustrated below; example sizes are included to 4194 * provide a better sense of ratio than this diagram: 4195 * 4196 * head --> tail 4197 * +---------------------+----------+ 4198 * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC 4199 * +---------------------+----------+ | o L2ARC eligible 4200 * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer 4201 * +---------------------+----------+ | 4202 * 15.9 Gbytes ^ 32 Mbytes | 4203 * headroom | 4204 * l2arc_feed_thread() 4205 * | 4206 * l2arc write hand <--[oooo]--' 4207 * | 8 Mbyte 4208 * | write max 4209 * V 4210 * +==============================+ 4211 * L2ARC dev |####|#|###|###| |####| ... | 4212 * +==============================+ 4213 * 32 Gbytes 4214 * 4215 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of 4216 * evicted, then the L2ARC has cached a buffer much sooner than it probably 4217 * needed to, potentially wasting L2ARC device bandwidth and storage. It is 4218 * safe to say that this is an uncommon case, since buffers at the end of 4219 * the ARC lists have moved there due to inactivity. 4220 * 4221 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, 4222 * then the L2ARC simply misses copying some buffers. This serves as a 4223 * pressure valve to prevent heavy read workloads from both stalling the ARC 4224 * with waits and clogging the L2ARC with writes. 
This also helps prevent 4225 * the potential for the L2ARC to churn if it attempts to cache content too 4226 * quickly, such as during backups of the entire pool. 4227 * 4228 * 5. After system boot and before the ARC has filled main memory, there are 4229 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru 4230 * lists can remain mostly static. Instead of searching from tail of these 4231 * lists as pictured, the l2arc_feed_thread() will search from the list heads 4232 * for eligible buffers, greatly increasing its chance of finding them. 4233 * 4234 * The L2ARC device write speed is also boosted during this time so that 4235 * the L2ARC warms up faster. Since there have been no ARC evictions yet, 4236 * there are no L2ARC reads, and no fear of degrading read performance 4237 * through increased writes. 4238 * 4239 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that 4240 * the vdev queue can aggregate them into larger and fewer writes. Each 4241 * device is written to in a rotor fashion, sweeping writes through 4242 * available space then repeating. 4243 * 4244 * 7. The L2ARC does not store dirty content. It never needs to flush 4245 * write buffers back to disk based storage. 4246 * 4247 * 8. If an ARC buffer is written (and dirtied) which also exists in the 4248 * L2ARC, the now stale L2ARC buffer is immediately dropped. 4249 * 4250 * The performance of the L2ARC can be tweaked by a number of tunables, which 4251 * may be necessary for different workloads: 4252 * 4253 * l2arc_write_max max write bytes per interval 4254 * l2arc_write_boost extra write bytes during device warmup 4255 * l2arc_noprefetch skip caching prefetched buffers 4256 * l2arc_headroom number of max device writes to precache 4257 * l2arc_headroom_boost when we find compressed buffers during ARC 4258 * scanning, we multiply headroom by this 4259 * percentage factor for the next scan cycle, 4260 * since more compressed buffers are likely to 4261 * be present 4262 * l2arc_feed_secs seconds between L2ARC writing 4263 * 4264 * Tunables may be removed or added as future performance improvements are 4265 * integrated, and also may become zpool properties. 4266 * 4267 * There are three key functions that control how the L2ARC warms up: 4268 * 4269 * l2arc_write_eligible() check if a buffer is eligible to cache 4270 * l2arc_write_size() calculate how much to write 4271 * l2arc_write_interval() calculate sleep delay between writes 4272 * 4273 * These three functions determine what to write, how much, and how quickly 4274 * to send writes. 4275 */ 4276 4277 static boolean_t 4278 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab) 4279 { 4280 /* 4281 * A buffer is *not* eligible for the L2ARC if it: 4282 * 1. belongs to a different spa. 4283 * 2. is already cached on the L2ARC. 4284 * 3. has an I/O in progress (it may be an incomplete read). 4285 * 4. is flagged not eligible (zfs property). 4286 */ 4287 if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL || 4288 HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab)) 4289 return (B_FALSE); 4290 4291 return (B_TRUE); 4292 } 4293 4294 static uint64_t 4295 l2arc_write_size(void) 4296 { 4297 uint64_t size; 4298 4299 /* 4300 * Make sure our globals have meaningful values in case the user 4301 * altered them. 
4302 */ 4303 size = l2arc_write_max; 4304 if (size == 0) { 4305 cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " 4306 "be greater than zero, resetting it to the default (%d)", 4307 L2ARC_WRITE_SIZE); 4308 size = l2arc_write_max = L2ARC_WRITE_SIZE; 4309 } 4310 4311 if (arc_warm == B_FALSE) 4312 size += l2arc_write_boost; 4313 4314 return (size); 4315 4316 } 4317 4318 static clock_t 4319 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) 4320 { 4321 clock_t interval, next, now; 4322 4323 /* 4324 * If the ARC lists are busy, increase our write rate; if the 4325 * lists are stale, idle back. This is achieved by checking 4326 * how much we previously wrote - if it was more than half of 4327 * what we wanted, schedule the next write much sooner. 4328 */ 4329 if (l2arc_feed_again && wrote > (wanted / 2)) 4330 interval = (hz * l2arc_feed_min_ms) / 1000; 4331 else 4332 interval = hz * l2arc_feed_secs; 4333 4334 now = ddi_get_lbolt(); 4335 next = MAX(now, MIN(now + interval, began + interval)); 4336 4337 return (next); 4338 } 4339 4340 static void 4341 l2arc_hdr_stat_add(void) 4342 { 4343 ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE); 4344 ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); 4345 } 4346 4347 static void 4348 l2arc_hdr_stat_remove(void) 4349 { 4350 ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE)); 4351 ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); 4352 } 4353 4354 /* 4355 * Cycle through L2ARC devices. This is how L2ARC load balances. 4356 * If a device is returned, this also returns holding the spa config lock. 4357 */ 4358 static l2arc_dev_t * 4359 l2arc_dev_get_next(void) 4360 { 4361 l2arc_dev_t *first, *next = NULL; 4362 4363 /* 4364 * Lock out the removal of spas (spa_namespace_lock), then removal 4365 * of cache devices (l2arc_dev_mtx). Once a device has been selected, 4366 * both locks will be dropped and a spa config lock held instead. 4367 */ 4368 mutex_enter(&spa_namespace_lock); 4369 mutex_enter(&l2arc_dev_mtx); 4370 4371 /* if there are no vdevs, there is nothing to do */ 4372 if (l2arc_ndev == 0) 4373 goto out; 4374 4375 first = NULL; 4376 next = l2arc_dev_last; 4377 do { 4378 /* loop around the list looking for a non-faulted vdev */ 4379 if (next == NULL) { 4380 next = list_head(l2arc_dev_list); 4381 } else { 4382 next = list_next(l2arc_dev_list, next); 4383 if (next == NULL) 4384 next = list_head(l2arc_dev_list); 4385 } 4386 4387 /* if we have come back to the start, bail out */ 4388 if (first == NULL) 4389 first = next; 4390 else if (next == first) 4391 break; 4392 4393 } while (vdev_is_dead(next->l2ad_vdev)); 4394 4395 /* if we were unable to find any usable vdevs, return NULL */ 4396 if (vdev_is_dead(next->l2ad_vdev)) 4397 next = NULL; 4398 4399 l2arc_dev_last = next; 4400 4401 out: 4402 mutex_exit(&l2arc_dev_mtx); 4403 4404 /* 4405 * Grab the config lock to prevent the 'next' device from being 4406 * removed while we are writing to it. 4407 */ 4408 if (next != NULL) 4409 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); 4410 mutex_exit(&spa_namespace_lock); 4411 4412 return (next); 4413 } 4414 4415 /* 4416 * Free buffers that were tagged for destruction. 
4417 */ 4418 static void 4419 l2arc_do_free_on_write() 4420 { 4421 list_t *buflist; 4422 l2arc_data_free_t *df, *df_prev; 4423 4424 mutex_enter(&l2arc_free_on_write_mtx); 4425 buflist = l2arc_free_on_write; 4426 4427 for (df = list_tail(buflist); df; df = df_prev) { 4428 df_prev = list_prev(buflist, df); 4429 ASSERT(df->l2df_data != NULL); 4430 ASSERT(df->l2df_func != NULL); 4431 df->l2df_func(df->l2df_data, df->l2df_size); 4432 list_remove(buflist, df); 4433 kmem_free(df, sizeof (l2arc_data_free_t)); 4434 } 4435 4436 mutex_exit(&l2arc_free_on_write_mtx); 4437 } 4438 4439 /* 4440 * A write to a cache device has completed. Update all headers to allow 4441 * reads from these buffers to begin. 4442 */ 4443 static void 4444 l2arc_write_done(zio_t *zio) 4445 { 4446 l2arc_write_callback_t *cb; 4447 l2arc_dev_t *dev; 4448 list_t *buflist; 4449 arc_buf_hdr_t *head, *ab, *ab_prev; 4450 l2arc_buf_hdr_t *abl2; 4451 kmutex_t *hash_lock; 4452 int64_t bytes_dropped = 0; 4453 4454 cb = zio->io_private; 4455 ASSERT(cb != NULL); 4456 dev = cb->l2wcb_dev; 4457 ASSERT(dev != NULL); 4458 head = cb->l2wcb_head; 4459 ASSERT(head != NULL); 4460 buflist = dev->l2ad_buflist; 4461 ASSERT(buflist != NULL); 4462 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, 4463 l2arc_write_callback_t *, cb); 4464 4465 if (zio->io_error != 0) 4466 ARCSTAT_BUMP(arcstat_l2_writes_error); 4467 4468 mutex_enter(&l2arc_buflist_mtx); 4469 4470 /* 4471 * All writes completed, or an error was hit. 4472 */ 4473 for (ab = list_prev(buflist, head); ab; ab = ab_prev) { 4474 ab_prev = list_prev(buflist, ab); 4475 abl2 = ab->b_l2hdr; 4476 4477 /* 4478 * Release the temporary compressed buffer as soon as possible. 4479 */ 4480 if (abl2->b_compress != ZIO_COMPRESS_OFF) 4481 l2arc_release_cdata_buf(ab); 4482 4483 hash_lock = HDR_LOCK(ab); 4484 if (!mutex_tryenter(hash_lock)) { 4485 /* 4486 * This buffer misses out. It may be in a stage 4487 * of eviction. Its ARC_L2_WRITING flag will be 4488 * left set, denying reads to this buffer. 4489 */ 4490 ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss); 4491 continue; 4492 } 4493 4494 if (zio->io_error != 0) { 4495 /* 4496 * Error - drop L2ARC entry. 4497 */ 4498 list_remove(buflist, ab); 4499 ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize); 4500 bytes_dropped += abl2->b_asize; 4501 ab->b_l2hdr = NULL; 4502 kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); 4503 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); 4504 } 4505 4506 /* 4507 * Allow ARC to begin reads to this L2ARC entry. 4508 */ 4509 ab->b_flags &= ~ARC_L2_WRITING; 4510 4511 mutex_exit(hash_lock); 4512 } 4513 4514 atomic_inc_64(&l2arc_writes_done); 4515 list_remove(buflist, head); 4516 kmem_cache_free(hdr_cache, head); 4517 mutex_exit(&l2arc_buflist_mtx); 4518 4519 vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); 4520 4521 l2arc_do_free_on_write(); 4522 4523 kmem_free(cb, sizeof (l2arc_write_callback_t)); 4524 } 4525 4526 /* 4527 * A read to a cache device completed. Validate buffer contents before 4528 * handing over to the regular ARC routines. 
4529 */ 4530 static void 4531 l2arc_read_done(zio_t *zio) 4532 { 4533 l2arc_read_callback_t *cb; 4534 arc_buf_hdr_t *hdr; 4535 arc_buf_t *buf; 4536 kmutex_t *hash_lock; 4537 int equal; 4538 4539 ASSERT(zio->io_vd != NULL); 4540 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); 4541 4542 spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); 4543 4544 cb = zio->io_private; 4545 ASSERT(cb != NULL); 4546 buf = cb->l2rcb_buf; 4547 ASSERT(buf != NULL); 4548 4549 hash_lock = HDR_LOCK(buf->b_hdr); 4550 mutex_enter(hash_lock); 4551 hdr = buf->b_hdr; 4552 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 4553 4554 /* 4555 * If the buffer was compressed, decompress it first. 4556 */ 4557 if (cb->l2rcb_compress != ZIO_COMPRESS_OFF) 4558 l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress); 4559 ASSERT(zio->io_data != NULL); 4560 4561 /* 4562 * Check this survived the L2ARC journey. 4563 */ 4564 equal = arc_cksum_equal(buf); 4565 if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { 4566 mutex_exit(hash_lock); 4567 zio->io_private = buf; 4568 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ 4569 zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ 4570 arc_read_done(zio); 4571 } else { 4572 mutex_exit(hash_lock); 4573 /* 4574 * Buffer didn't survive caching. Increment stats and 4575 * reissue to the original storage device. 4576 */ 4577 if (zio->io_error != 0) { 4578 ARCSTAT_BUMP(arcstat_l2_io_error); 4579 } else { 4580 zio->io_error = SET_ERROR(EIO); 4581 } 4582 if (!equal) 4583 ARCSTAT_BUMP(arcstat_l2_cksum_bad); 4584 4585 /* 4586 * If there's no waiter, issue an async i/o to the primary 4587 * storage now. If there *is* a waiter, the caller must 4588 * issue the i/o in a context where it's OK to block. 4589 */ 4590 if (zio->io_waiter == NULL) { 4591 zio_t *pio = zio_unique_parent(zio); 4592 4593 ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); 4594 4595 zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp, 4596 buf->b_data, zio->io_size, arc_read_done, buf, 4597 zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); 4598 } 4599 } 4600 4601 kmem_free(cb, sizeof (l2arc_read_callback_t)); 4602 } 4603 4604 /* 4605 * This is the list priority from which the L2ARC will search for pages to 4606 * cache. This is used within loops (0..3) to cycle through lists in the 4607 * desired order. This order can have a significant effect on cache 4608 * performance. 4609 * 4610 * Currently the metadata lists are hit first, MFU then MRU, followed by 4611 * the data lists. This function returns a locked list, and also returns 4612 * the lock pointer. 4613 */ 4614 static list_t * 4615 l2arc_list_locked(int list_num, kmutex_t **lock) 4616 { 4617 list_t *list = NULL; 4618 4619 ASSERT(list_num >= 0 && list_num <= 3); 4620 4621 switch (list_num) { 4622 case 0: 4623 list = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; 4624 *lock = &arc_mfu->arcs_mtx; 4625 break; 4626 case 1: 4627 list = &arc_mru->arcs_list[ARC_BUFC_METADATA]; 4628 *lock = &arc_mru->arcs_mtx; 4629 break; 4630 case 2: 4631 list = &arc_mfu->arcs_list[ARC_BUFC_DATA]; 4632 *lock = &arc_mfu->arcs_mtx; 4633 break; 4634 case 3: 4635 list = &arc_mru->arcs_list[ARC_BUFC_DATA]; 4636 *lock = &arc_mru->arcs_mtx; 4637 break; 4638 } 4639 4640 ASSERT(!(MUTEX_HELD(*lock))); 4641 mutex_enter(*lock); 4642 return (list); 4643 } 4644 4645 /* 4646 * Evict buffers from the device write hand to the distance specified in 4647 * bytes. This distance may span populated buffers, it may span nothing. 4648 * This is clearing a region on the L2ARC device ready for writing. 
4649 * If the 'all' boolean is set, every buffer is evicted. 4650 */ 4651 static void 4652 _l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all, 4653 boolean_t space_update) 4654 { 4655 list_t *buflist; 4656 l2arc_buf_hdr_t *abl2; 4657 arc_buf_hdr_t *ab, *ab_prev; 4658 kmutex_t *hash_lock; 4659 uint64_t taddr; 4660 int64_t bytes_evicted = 0; 4661 4662 buflist = dev->l2ad_buflist; 4663 4664 if (buflist == NULL) 4665 return; 4666 4667 if (!all && dev->l2ad_first) { 4668 /* 4669 * This is the first sweep through the device. There is 4670 * nothing to evict. 4671 */ 4672 return; 4673 } 4674 4675 if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { 4676 /* 4677 * When nearing the end of the device, evict to the end 4678 * before the device write hand jumps to the start. 4679 */ 4680 taddr = dev->l2ad_end; 4681 } else { 4682 taddr = dev->l2ad_hand + distance; 4683 } 4684 DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, 4685 uint64_t, taddr, boolean_t, all); 4686 4687 top: 4688 mutex_enter(&l2arc_buflist_mtx); 4689 for (ab = list_tail(buflist); ab; ab = ab_prev) { 4690 ab_prev = list_prev(buflist, ab); 4691 4692 hash_lock = HDR_LOCK(ab); 4693 if (!mutex_tryenter(hash_lock)) { 4694 /* 4695 * Missed the hash lock. Retry. 4696 */ 4697 ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); 4698 mutex_exit(&l2arc_buflist_mtx); 4699 mutex_enter(hash_lock); 4700 mutex_exit(hash_lock); 4701 goto top; 4702 } 4703 4704 if (HDR_L2_WRITE_HEAD(ab)) { 4705 /* 4706 * We hit a write head node. Leave it for 4707 * l2arc_write_done(). 4708 */ 4709 list_remove(buflist, ab); 4710 mutex_exit(hash_lock); 4711 continue; 4712 } 4713 4714 if (!all && ab->b_l2hdr != NULL && 4715 (ab->b_l2hdr->b_daddr > taddr || 4716 ab->b_l2hdr->b_daddr < dev->l2ad_hand)) { 4717 /* 4718 * We've evicted to the target address, 4719 * or the end of the device. 4720 */ 4721 mutex_exit(hash_lock); 4722 break; 4723 } 4724 4725 if (HDR_FREE_IN_PROGRESS(ab)) { 4726 /* 4727 * Already on the path to destruction. 4728 */ 4729 mutex_exit(hash_lock); 4730 continue; 4731 } 4732 4733 if (ab->b_state == arc_l2c_only) { 4734 ASSERT(!HDR_L2_READING(ab)); 4735 /* 4736 * This doesn't exist in the ARC. Destroy. 4737 * arc_hdr_destroy() will call list_remove() 4738 * and decrement arcstat_l2_size. 4739 */ 4740 arc_change_state(arc_anon, ab, hash_lock); 4741 arc_hdr_destroy(ab); 4742 } else { 4743 /* 4744 * Invalidate issued or about to be issued 4745 * reads, since we may be about to write 4746 * over this location. 4747 */ 4748 if (HDR_L2_READING(ab)) { 4749 ARCSTAT_BUMP(arcstat_l2_evict_reading); 4750 ab->b_flags |= ARC_L2_EVICTED; 4751 } 4752 4753 /* 4754 * Tell ARC this no longer exists in L2ARC. 4755 */ 4756 if (ab->b_l2hdr != NULL) { 4757 abl2 = ab->b_l2hdr; 4758 ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize); 4759 bytes_evicted += abl2->b_asize; 4760 ab->b_l2hdr = NULL; 4761 kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); 4762 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); 4763 } 4764 list_remove(buflist, ab); 4765 4766 /* 4767 * This may have been leftover after a 4768 * failed write. 4769 */ 4770 ab->b_flags &= ~ARC_L2_WRITING; 4771 } 4772 mutex_exit(hash_lock); 4773 } 4774 mutex_exit(&l2arc_buflist_mtx); 4775 4776 /* 4777 * Note: l2ad_vdev can only be touched if space_update is set, 4778 * otherwise the vdev might have been removed by an async 4779 * spa_unload. 
4780 */ 4781 if (space_update) { 4782 vdev_space_update(dev->l2ad_vdev, -bytes_evicted, 0, 0); 4783 dev->l2ad_evict = taddr; 4784 } 4785 } 4786 4787 /* 4788 * Asynchronous task for eviction of all the buffers for this L2ARC device 4789 * The task is dispatched in l2arc_evict() 4790 */ 4791 typedef struct { 4792 l2arc_dev_t *dev; 4793 } l2arc_evict_data_t; 4794 4795 static void 4796 l2arc_evict_task(void *arg) 4797 { 4798 l2arc_evict_data_t *d = (l2arc_evict_data_t *)arg; 4799 ASSERT(d && d->dev); 4800 4801 /* 4802 * Evict l2arc buffers asynchronously; we need to keep the device 4803 * around until we are sure there aren't any buffers referencing it. 4804 * We do not need to hold any config locks, etc. because at this point, 4805 * we are the only ones who knows about this device (the in-core 4806 * structure), so no new buffers can be created (e.g. if the pool is 4807 * re-imported while the asynchronous eviction is in progress) that 4808 * reference this same in-core structure. Also remove the vdev link 4809 * since further use of it as l2arc device is prohibited. 4810 */ 4811 d->dev->l2ad_vdev = NULL; 4812 _l2arc_evict(d->dev, 0LL, B_TRUE, B_FALSE); 4813 4814 /* Same cleanup as in the synchronous path */ 4815 list_destroy(d->dev->l2ad_buflist); 4816 kmem_free(d->dev->l2ad_buflist, sizeof (list_t)); 4817 kmem_free(d->dev, sizeof (l2arc_dev_t)); 4818 /* Task argument cleanup */ 4819 kmem_free(arg, sizeof (l2arc_evict_data_t)); 4820 } 4821 4822 boolean_t zfs_l2arc_async_evict = B_TRUE; 4823 4824 /* 4825 * Perform l2arc eviction for buffers associated with this device 4826 * If evicting all buffers (done at pool export time), try to evict 4827 * asynchronously, and fall back to synchronous eviction in case of error 4828 * Tell the caller whether to cleanup the device: 4829 * - B_TRUE means "asynchronous eviction, do not cleanup" 4830 * - B_FALSE means "synchronous eviction, done, please cleanup" 4831 */ 4832 static boolean_t 4833 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) 4834 { 4835 /* 4836 * If we are evicting all the buffers for this device, which happens 4837 * at pool export time, schedule asynchronous task 4838 */ 4839 if (all && zfs_l2arc_async_evict) { 4840 l2arc_evict_data_t *arg = 4841 kmem_alloc(sizeof (l2arc_evict_data_t), KM_SLEEP); 4842 arg->dev = dev; 4843 4844 dev->l2ad_evict = dev->l2ad_end; 4845 4846 if ((taskq_dispatch(arc_flush_taskq, l2arc_evict_task, 4847 arg, TQ_NOSLEEP) == NULL)) { 4848 /* 4849 * Failed to dispatch asynchronous task 4850 * cleanup, evict synchronously, avoid adjusting 4851 * vdev space second time 4852 */ 4853 kmem_free(arg, sizeof (l2arc_evict_data_t)); 4854 _l2arc_evict(dev, distance, all, B_FALSE); 4855 } else { 4856 /* 4857 * Successfull dispatch, vdev space updated 4858 */ 4859 return (B_TRUE); 4860 } 4861 } else { 4862 /* Evict synchronously */ 4863 _l2arc_evict(dev, distance, all, B_TRUE); 4864 } 4865 4866 return (B_FALSE); 4867 } 4868 4869 /* 4870 * Find and write ARC buffers to the L2ARC device. 4871 * 4872 * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid 4873 * for reading until they have completed writing. 4874 * The headroom_boost is an in-out parameter used to maintain headroom boost 4875 * state between calls to this function. 4876 * 4877 * Returns the number of bytes actually written (which may be smaller than 4878 * the delta by which the device hand has changed due to alignment). 
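 *
 * Added illustration (hypothetical values, not authoritative defaults):
 * with a target_sz of 8MB (the "write max" in the diagram above) and
 * l2arc_headroom of 2, each ARC list is scanned until roughly 16MB of
 * buffers have been passed over; when the previous cycle found
 * compressible buffers, that budget is further scaled by
 * l2arc_headroom_boost percent for this cycle.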
4879 */ 4880 static uint64_t 4881 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, 4882 boolean_t *headroom_boost) 4883 { 4884 arc_buf_hdr_t *ab, *ab_prev, *head; 4885 list_t *list; 4886 uint64_t write_asize, write_psize, write_sz, headroom, 4887 buf_compress_minsz; 4888 void *buf_data; 4889 kmutex_t *list_lock; 4890 boolean_t full; 4891 l2arc_write_callback_t *cb; 4892 zio_t *pio, *wzio; 4893 uint64_t guid = spa_load_guid(spa); 4894 const boolean_t do_headroom_boost = *headroom_boost; 4895 4896 ASSERT(dev->l2ad_vdev != NULL); 4897 4898 /* Lower the flag now, we might want to raise it again later. */ 4899 *headroom_boost = B_FALSE; 4900 4901 pio = NULL; 4902 write_sz = write_asize = write_psize = 0; 4903 full = B_FALSE; 4904 head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); 4905 head->b_flags |= ARC_L2_WRITE_HEAD; 4906 4907 /* 4908 * We will want to try to compress buffers that are at least 2x the 4909 * device sector size. 4910 */ 4911 buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift; 4912 4913 /* 4914 * Copy buffers for L2ARC writing. 4915 */ 4916 mutex_enter(&l2arc_buflist_mtx); 4917 for (int try = 0; try <= 3; try++) { 4918 uint64_t passed_sz = 0; 4919 4920 list = l2arc_list_locked(try, &list_lock); 4921 4922 /* 4923 * L2ARC fast warmup. 4924 * 4925 * Until the ARC is warm and starts to evict, read from the 4926 * head of the ARC lists rather than the tail. 4927 */ 4928 if (arc_warm == B_FALSE) 4929 ab = list_head(list); 4930 else 4931 ab = list_tail(list); 4932 4933 headroom = target_sz * l2arc_headroom; 4934 if (do_headroom_boost) 4935 headroom = (headroom * l2arc_headroom_boost) / 100; 4936 4937 for (; ab; ab = ab_prev) { 4938 l2arc_buf_hdr_t *l2hdr; 4939 kmutex_t *hash_lock; 4940 uint64_t buf_sz; 4941 4942 if (arc_warm == B_FALSE) 4943 ab_prev = list_next(list, ab); 4944 else 4945 ab_prev = list_prev(list, ab); 4946 4947 hash_lock = HDR_LOCK(ab); 4948 if (!mutex_tryenter(hash_lock)) { 4949 /* 4950 * Skip this buffer rather than waiting. 4951 */ 4952 continue; 4953 } 4954 4955 passed_sz += ab->b_size; 4956 if (passed_sz > headroom) { 4957 /* 4958 * Searched too far. 4959 */ 4960 mutex_exit(hash_lock); 4961 break; 4962 } 4963 4964 if (!l2arc_write_eligible(guid, ab)) { 4965 mutex_exit(hash_lock); 4966 continue; 4967 } 4968 4969 if ((write_sz + ab->b_size) > target_sz) { 4970 full = B_TRUE; 4971 mutex_exit(hash_lock); 4972 break; 4973 } 4974 4975 if (pio == NULL) { 4976 /* 4977 * Insert a dummy header on the buflist so 4978 * l2arc_write_done() can find where the 4979 * write buffers begin without searching. 4980 */ 4981 list_insert_head(dev->l2ad_buflist, head); 4982 4983 cb = kmem_alloc( 4984 sizeof (l2arc_write_callback_t), KM_SLEEP); 4985 cb->l2wcb_dev = dev; 4986 cb->l2wcb_head = head; 4987 pio = zio_root(spa, l2arc_write_done, cb, 4988 ZIO_FLAG_CANFAIL); 4989 } 4990 4991 /* 4992 * Create and add a new L2ARC header. 4993 */ 4994 l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP); 4995 l2hdr->b_dev = dev; 4996 ab->b_flags |= ARC_L2_WRITING; 4997 4998 /* 4999 * Temporarily stash the data buffer in b_tmp_cdata. 5000 * The subsequent write step will pick it up from 5001 * there. This is because can't access ab->b_buf 5002 * without holding the hash_lock, which we in turn 5003 * can't access without holding the ARC list locks 5004 * (which we want to avoid during compression/writing). 
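 *
 * (Added note) Lifecycle of b_tmp_cdata as implemented below: the
 * write pass may replace it with a compressed copy in
 * l2arc_compress_buf(), zio_write_phys() then consumes it, and
 * l2arc_write_done() releases any compressed copy via
 * l2arc_release_cdata_buf().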
5005 */ 5006 l2hdr->b_compress = ZIO_COMPRESS_OFF; 5007 l2hdr->b_asize = ab->b_size; 5008 l2hdr->b_tmp_cdata = ab->b_buf->b_data; 5009 5010 buf_sz = ab->b_size; 5011 ab->b_l2hdr = l2hdr; 5012 5013 list_insert_head(dev->l2ad_buflist, ab); 5014 5015 /* 5016 * Compute and store the buffer cksum before 5017 * writing. On debug the cksum is verified first. 5018 */ 5019 arc_cksum_verify(ab->b_buf); 5020 arc_cksum_compute(ab->b_buf, B_TRUE); 5021 5022 mutex_exit(hash_lock); 5023 5024 write_sz += buf_sz; 5025 } 5026 5027 mutex_exit(list_lock); 5028 5029 if (full == B_TRUE) 5030 break; 5031 } 5032 5033 /* No buffers selected for writing? */ 5034 if (pio == NULL) { 5035 ASSERT0(write_sz); 5036 mutex_exit(&l2arc_buflist_mtx); 5037 kmem_cache_free(hdr_cache, head); 5038 return (0); 5039 } 5040 5041 /* 5042 * Now start writing the buffers. We're starting at the write head 5043 * and work backwards, retracing the course of the buffer selector 5044 * loop above. 5045 */ 5046 for (ab = list_prev(dev->l2ad_buflist, head); ab; 5047 ab = list_prev(dev->l2ad_buflist, ab)) { 5048 l2arc_buf_hdr_t *l2hdr; 5049 uint64_t buf_sz; 5050 5051 /* 5052 * We shouldn't need to lock the buffer here, since we flagged 5053 * it as ARC_L2_WRITING in the previous step, but we must take 5054 * care to only access its L2 cache parameters. In particular, 5055 * ab->b_buf may be invalid by now due to ARC eviction. 5056 */ 5057 l2hdr = ab->b_l2hdr; 5058 l2hdr->b_daddr = dev->l2ad_hand; 5059 5060 if ((ab->b_flags & ARC_L2COMPRESS) && 5061 l2hdr->b_asize >= buf_compress_minsz) { 5062 if (l2arc_compress_buf(l2hdr)) { 5063 /* 5064 * If compression succeeded, enable headroom 5065 * boost on the next scan cycle. 5066 */ 5067 *headroom_boost = B_TRUE; 5068 } 5069 } 5070 5071 /* 5072 * Pick up the buffer data we had previously stashed away 5073 * (and now potentially also compressed). 5074 */ 5075 buf_data = l2hdr->b_tmp_cdata; 5076 buf_sz = l2hdr->b_asize; 5077 5078 /* Compression may have squashed the buffer to zero length. */ 5079 if (buf_sz != 0) { 5080 uint64_t buf_p_sz; 5081 5082 wzio = zio_write_phys(pio, dev->l2ad_vdev, 5083 dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF, 5084 NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, 5085 ZIO_FLAG_CANFAIL, B_FALSE); 5086 5087 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, 5088 zio_t *, wzio); 5089 (void) zio_nowait(wzio); 5090 5091 write_asize += buf_sz; 5092 /* 5093 * Keep the clock hand suitably device-aligned. 5094 */ 5095 buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); 5096 write_psize += buf_p_sz; 5097 dev->l2ad_hand += buf_p_sz; 5098 } 5099 } 5100 5101 mutex_exit(&l2arc_buflist_mtx); 5102 5103 ASSERT3U(write_asize, <=, target_sz); 5104 ARCSTAT_BUMP(arcstat_l2_writes_sent); 5105 ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize); 5106 ARCSTAT_INCR(arcstat_l2_size, write_sz); 5107 ARCSTAT_INCR(arcstat_l2_asize, write_asize); 5108 vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0); 5109 5110 /* 5111 * Bump device hand to the device start if it is approaching the end. 5112 * l2arc_evict() will already have evicted ahead for this case. 5113 */ 5114 if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { 5115 dev->l2ad_hand = dev->l2ad_start; 5116 dev->l2ad_evict = dev->l2ad_start; 5117 dev->l2ad_first = B_FALSE; 5118 } 5119 5120 dev->l2ad_writing = B_TRUE; 5121 (void) zio_wait(pio); 5122 dev->l2ad_writing = B_FALSE; 5123 5124 return (write_asize); 5125 } 5126 5127 /* 5128 * Compresses an L2ARC buffer. 
5129 * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its 5130 * size in l2hdr->b_asize. This routine tries to compress the data and 5131 * depending on the compression result there are three possible outcomes: 5132 * *) The buffer was incompressible. The original l2hdr contents were left 5133 * untouched and are ready for writing to an L2 device. 5134 * *) The buffer was all-zeros, so there is no need to write it to an L2 5135 * device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is 5136 * set to zero and b_compress is set to ZIO_COMPRESS_EMPTY. 5137 * *) Compression succeeded and b_tmp_cdata was replaced with a temporary 5138 * data buffer which holds the compressed data to be written, and b_asize 5139 * tells us how much data there is. b_compress is set to the appropriate 5140 * compression algorithm. Once writing is done, invoke 5141 * l2arc_release_cdata_buf on this l2hdr to free this temporary buffer. 5142 * 5143 * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the 5144 * buffer was incompressible). 5145 */ 5146 static boolean_t 5147 l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr) 5148 { 5149 void *cdata; 5150 size_t csize, len, rounded; 5151 5152 ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF); 5153 ASSERT(l2hdr->b_tmp_cdata != NULL); 5154 5155 len = l2hdr->b_asize; 5156 cdata = zio_data_buf_alloc(len); 5157 csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata, 5158 cdata, l2hdr->b_asize); 5159 5160 rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE); 5161 if (rounded > csize) { 5162 bzero((char *)cdata + csize, rounded - csize); 5163 csize = rounded; 5164 } 5165 5166 if (csize == 0) { 5167 /* zero block, indicate that there's nothing to write */ 5168 zio_data_buf_free(cdata, len); 5169 l2hdr->b_compress = ZIO_COMPRESS_EMPTY; 5170 l2hdr->b_asize = 0; 5171 l2hdr->b_tmp_cdata = NULL; 5172 ARCSTAT_BUMP(arcstat_l2_compress_zeros); 5173 return (B_TRUE); 5174 } else if (csize > 0 && csize < len) { 5175 /* 5176 * Compression succeeded, we'll keep the cdata around for 5177 * writing and release it afterwards. 5178 */ 5179 l2hdr->b_compress = ZIO_COMPRESS_LZ4; 5180 l2hdr->b_asize = csize; 5181 l2hdr->b_tmp_cdata = cdata; 5182 ARCSTAT_BUMP(arcstat_l2_compress_successes); 5183 return (B_TRUE); 5184 } else { 5185 /* 5186 * Compression failed, release the compressed buffer. 5187 * l2hdr will be left unmodified. 5188 */ 5189 zio_data_buf_free(cdata, len); 5190 ARCSTAT_BUMP(arcstat_l2_compress_failures); 5191 return (B_FALSE); 5192 } 5193 } 5194 5195 /* 5196 * Decompresses a zio read back from an l2arc device. On success, the 5197 * underlying zio's io_data buffer is overwritten by the uncompressed 5198 * version. On decompression error (corrupt compressed stream), the 5199 * zio->io_error value is set to signal an I/O error. 5200 * 5201 * Please note that the compressed data stream is not checksummed, so 5202 * if the underlying device is experiencing data corruption, we may feed 5203 * corrupt data to the decompressor, so the decompressor needs to be 5204 * able to handle this situation (LZ4 does). 5205 */ 5206 static void 5207 l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c) 5208 { 5209 ASSERT(L2ARC_IS_VALID_COMPRESS(c)); 5210 5211 if (zio->io_error != 0) { 5212 /* 5213 * An io error has occured, just restore the original io 5214 * size in preparation for a main pool read. 
5215 */ 5216 zio->io_orig_size = zio->io_size = hdr->b_size; 5217 return; 5218 } 5219 5220 if (c == ZIO_COMPRESS_EMPTY) { 5221 /* 5222 * An empty buffer results in a null zio, which means we 5223 * need to fill its io_data after we're done restoring the 5224 * buffer's contents. 5225 */ 5226 ASSERT(hdr->b_buf != NULL); 5227 bzero(hdr->b_buf->b_data, hdr->b_size); 5228 zio->io_data = zio->io_orig_data = hdr->b_buf->b_data; 5229 } else { 5230 ASSERT(zio->io_data != NULL); 5231 /* 5232 * We copy the compressed data from the start of the arc buffer 5233 * (the zio_read will have pulled in only what we need, the 5234 * rest is garbage which we will overwrite at decompression) 5235 * and then decompress back to the ARC data buffer. This way we 5236 * can minimize copying by simply decompressing back over the 5237 * original compressed data (rather than decompressing to an 5238 * aux buffer and then copying back the uncompressed buffer, 5239 * which is likely to be much larger). 5240 */ 5241 uint64_t csize; 5242 void *cdata; 5243 5244 csize = zio->io_size; 5245 cdata = zio_data_buf_alloc(csize); 5246 bcopy(zio->io_data, cdata, csize); 5247 if (zio_decompress_data(c, cdata, zio->io_data, csize, 5248 hdr->b_size) != 0) 5249 zio->io_error = EIO; 5250 zio_data_buf_free(cdata, csize); 5251 } 5252 5253 /* Restore the expected uncompressed IO size. */ 5254 zio->io_orig_size = zio->io_size = hdr->b_size; 5255 } 5256 5257 /* 5258 * Releases the temporary b_tmp_cdata buffer in an l2arc header structure. 5259 * This buffer serves as a temporary holder of compressed data while 5260 * the buffer entry is being written to an l2arc device. Once that is 5261 * done, we can dispose of it. 5262 */ 5263 static void 5264 l2arc_release_cdata_buf(arc_buf_hdr_t *ab) 5265 { 5266 l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr; 5267 5268 if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) { 5269 /* 5270 * If the data was compressed, then we've allocated a 5271 * temporary buffer for it, so now we need to release it. 5272 */ 5273 ASSERT(l2hdr->b_tmp_cdata != NULL); 5274 zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size); 5275 } 5276 l2hdr->b_tmp_cdata = NULL; 5277 } 5278 5279 /* 5280 * This thread feeds the L2ARC at regular intervals. This is the beating 5281 * heart of the L2ARC. 5282 */ 5283 static void 5284 l2arc_feed_thread(void) 5285 { 5286 callb_cpr_t cpr; 5287 l2arc_dev_t *dev; 5288 spa_t *spa; 5289 uint64_t size, wrote; 5290 clock_t begin, next = ddi_get_lbolt(); 5291 boolean_t headroom_boost = B_FALSE; 5292 5293 CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); 5294 5295 mutex_enter(&l2arc_feed_thr_lock); 5296 5297 while (l2arc_thread_exit == 0) { 5298 CALLB_CPR_SAFE_BEGIN(&cpr); 5299 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, 5300 next); 5301 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); 5302 next = ddi_get_lbolt() + hz; 5303 5304 /* 5305 * Quick check for L2ARC devices. 5306 */ 5307 mutex_enter(&l2arc_dev_mtx); 5308 if (l2arc_ndev == 0) { 5309 mutex_exit(&l2arc_dev_mtx); 5310 continue; 5311 } 5312 mutex_exit(&l2arc_dev_mtx); 5313 begin = ddi_get_lbolt(); 5314 5315 /* 5316 * This selects the next l2arc device to write to, and in 5317 * doing so the next spa to feed from: dev->l2ad_spa. This 5318 * will return NULL if there are now no l2arc devices or if 5319 * they are all faulted. 5320 * 5321 * If a device is returned, its spa's config lock is also 5322 * held to prevent device removal. l2arc_dev_get_next() 5323 * will grab and release l2arc_dev_mtx. 
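 *
 * Added pacing illustration (values hypothetical, not necessarily the
 * defaults): with l2arc_feed_secs of 1 second and l2arc_feed_min_ms of
 * 200, a cycle that wrote more than half of its target (and with
 * l2arc_feed_again set) schedules the next wakeup roughly 200ms after
 * the cycle began; otherwise the thread idles for the full second.
 * See l2arc_write_interval() above.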
5324 */ 5325 if ((dev = l2arc_dev_get_next()) == NULL) 5326 continue; 5327 5328 spa = dev->l2ad_spa; 5329 ASSERT(spa != NULL); 5330 5331 /* 5332 * If the pool is read-only then force the feed thread to 5333 * sleep a little longer. 5334 */ 5335 if (!spa_writeable(spa)) { 5336 next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; 5337 spa_config_exit(spa, SCL_L2ARC, dev); 5338 continue; 5339 } 5340 5341 /* 5342 * Avoid contributing to memory pressure. 5343 */ 5344 if (arc_reclaim_needed()) { 5345 ARCSTAT_BUMP(arcstat_l2_abort_lowmem); 5346 spa_config_exit(spa, SCL_L2ARC, dev); 5347 continue; 5348 } 5349 5350 ARCSTAT_BUMP(arcstat_l2_feeds); 5351 5352 size = l2arc_write_size(); 5353 5354 /* 5355 * Evict L2ARC buffers that will be overwritten. 5356 * B_FALSE guarantees synchronous eviction. 5357 */ 5358 (void) l2arc_evict(dev, size, B_FALSE); 5359 5360 /* 5361 * Write ARC buffers. 5362 */ 5363 wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost); 5364 5365 /* 5366 * Calculate interval between writes. 5367 */ 5368 next = l2arc_write_interval(begin, size, wrote); 5369 spa_config_exit(spa, SCL_L2ARC, dev); 5370 } 5371 5372 l2arc_thread_exit = 0; 5373 cv_broadcast(&l2arc_feed_thr_cv); 5374 CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ 5375 thread_exit(); 5376 } 5377 5378 boolean_t 5379 l2arc_vdev_present(vdev_t *vd) 5380 { 5381 l2arc_dev_t *dev; 5382 5383 mutex_enter(&l2arc_dev_mtx); 5384 for (dev = list_head(l2arc_dev_list); dev != NULL; 5385 dev = list_next(l2arc_dev_list, dev)) { 5386 if (dev->l2ad_vdev == vd) 5387 break; 5388 } 5389 mutex_exit(&l2arc_dev_mtx); 5390 5391 return (dev != NULL); 5392 } 5393 5394 /* 5395 * Add a vdev for use by the L2ARC. By this point the spa has already 5396 * validated the vdev and opened it. 5397 */ 5398 void 5399 l2arc_add_vdev(spa_t *spa, vdev_t *vd) 5400 { 5401 l2arc_dev_t *adddev; 5402 5403 ASSERT(!l2arc_vdev_present(vd)); 5404 5405 /* 5406 * Create a new l2arc device entry. 5407 */ 5408 adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); 5409 adddev->l2ad_spa = spa; 5410 adddev->l2ad_vdev = vd; 5411 adddev->l2ad_start = VDEV_LABEL_START_SIZE; 5412 adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); 5413 adddev->l2ad_hand = adddev->l2ad_start; 5414 adddev->l2ad_evict = adddev->l2ad_start; 5415 adddev->l2ad_first = B_TRUE; 5416 adddev->l2ad_writing = B_FALSE; 5417 5418 /* 5419 * This is a list of all ARC buffers that are still valid on the 5420 * device. 5421 */ 5422 adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP); 5423 list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), 5424 offsetof(arc_buf_hdr_t, b_l2node)); 5425 5426 vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); 5427 5428 /* 5429 * Add device to global list 5430 */ 5431 mutex_enter(&l2arc_dev_mtx); 5432 list_insert_head(l2arc_dev_list, adddev); 5433 atomic_inc_64(&l2arc_ndev); 5434 mutex_exit(&l2arc_dev_mtx); 5435 } 5436 5437 /* 5438 * Remove a vdev from the L2ARC. 
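 *
 * (Added note) The l2arc_evict(remdev, 0, B_TRUE) call below may hand
 * the flush off to arc_flush_taskq; in that case it returns B_TRUE and
 * l2arc_evict_task() frees l2ad_buflist and the l2arc_dev_t itself, so
 * the synchronous cleanup in this function runs only when B_FALSE is
 * returned.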
5439 */ 5440 void 5441 l2arc_remove_vdev(vdev_t *vd) 5442 { 5443 l2arc_dev_t *dev, *nextdev, *remdev = NULL; 5444 5445 /* 5446 * Find the device by vdev 5447 */ 5448 mutex_enter(&l2arc_dev_mtx); 5449 for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { 5450 nextdev = list_next(l2arc_dev_list, dev); 5451 if (vd == dev->l2ad_vdev) { 5452 remdev = dev; 5453 break; 5454 } 5455 } 5456 ASSERT(remdev != NULL); 5457 5458 /* 5459 * Remove device from global list 5460 */ 5461 list_remove(l2arc_dev_list, remdev); 5462 l2arc_dev_last = NULL; /* may have been invalidated */ 5463 atomic_dec_64(&l2arc_ndev); 5464 mutex_exit(&l2arc_dev_mtx); 5465 5466 /* 5467 * Clear all buflists and ARC references. L2ARC device flush. 5468 */ 5469 if (l2arc_evict(remdev, 0, B_TRUE) == B_FALSE) { 5470 /* 5471 * The eviction was done synchronously, cleanup here 5472 * Otherwise, the asynchronous task will cleanup 5473 */ 5474 list_destroy(remdev->l2ad_buflist); 5475 kmem_free(remdev->l2ad_buflist, sizeof (list_t)); 5476 kmem_free(remdev, sizeof (l2arc_dev_t)); 5477 } 5478 } 5479 5480 void 5481 l2arc_init(void) 5482 { 5483 l2arc_thread_exit = 0; 5484 l2arc_ndev = 0; 5485 l2arc_writes_sent = 0; 5486 l2arc_writes_done = 0; 5487 5488 mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); 5489 cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); 5490 mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); 5491 mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL); 5492 mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); 5493 5494 l2arc_dev_list = &L2ARC_dev_list; 5495 l2arc_free_on_write = &L2ARC_free_on_write; 5496 list_create(l2arc_dev_list, sizeof (l2arc_dev_t), 5497 offsetof(l2arc_dev_t, l2ad_node)); 5498 list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), 5499 offsetof(l2arc_data_free_t, l2df_list_node)); 5500 } 5501 5502 void 5503 l2arc_fini(void) 5504 { 5505 /* 5506 * This is called from dmu_fini(), which is called from spa_fini(); 5507 * Because of this, we can assume that all l2arc devices have 5508 * already been removed when the pools themselves were removed. 5509 */ 5510 5511 l2arc_do_free_on_write(); 5512 5513 mutex_destroy(&l2arc_feed_thr_lock); 5514 cv_destroy(&l2arc_feed_thr_cv); 5515 mutex_destroy(&l2arc_dev_mtx); 5516 mutex_destroy(&l2arc_buflist_mtx); 5517 mutex_destroy(&l2arc_free_on_write_mtx); 5518 5519 list_destroy(l2arc_dev_list); 5520 list_destroy(l2arc_free_on_write); 5521 } 5522 5523 void 5524 l2arc_start(void) 5525 { 5526 if (!(spa_mode_global & FWRITE)) 5527 return; 5528 5529 (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, 5530 TS_RUN, minclsyspri); 5531 } 5532 5533 void 5534 l2arc_stop(void) 5535 { 5536 if (!(spa_mode_global & FWRITE)) 5537 return; 5538 5539 mutex_enter(&l2arc_feed_thr_lock); 5540 cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ 5541 l2arc_thread_exit = 1; 5542 while (l2arc_thread_exit != 0) 5543 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); 5544 mutex_exit(&l2arc_feed_thr_lock); 5545 }
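/*
 * Added illustration (not part of arc.c): a standalone, userland sketch of
 * the L2ARC feed pacing described above. It mirrors the arithmetic of
 * l2arc_write_size() and l2arc_write_interval() using plain milliseconds
 * instead of lbolt ticks; the demo_* names and the tunable values below
 * are assumptions made for the example only.
 */
#include <stdio.h>
#include <stdint.h>

#define	DEMO_MIN(a, b)	((a) < (b) ? (a) : (b))
#define	DEMO_MAX(a, b)	((a) > (b) ? (a) : (b))

/* Illustrative tunable values (assumptions, not authoritative defaults). */
static uint64_t demo_write_max = 8ULL * 1024 * 1024;	/* bytes per cycle */
static uint64_t demo_write_boost = 8ULL * 1024 * 1024;	/* extra while cold */
static uint64_t demo_feed_secs = 1;			/* normal interval */
static uint64_t demo_feed_min_ms = 200;			/* busy-list interval */
static int demo_feed_again = 1;
static int demo_arc_warm = 0;

/* Target bytes for one feed cycle (cf. l2arc_write_size()). */
static uint64_t
demo_write_size(void)
{
	uint64_t size = demo_write_max;

	if (!demo_arc_warm)
		size += demo_write_boost;	/* warm the device up faster */
	return (size);
}

/*
 * Next wakeup time in ms (cf. l2arc_write_interval()): if we wrote more
 * than half of what we wanted, come back soon; otherwise idle for the
 * full interval, measured from when the cycle began.
 */
static uint64_t
demo_write_interval(uint64_t began_ms, uint64_t now_ms, uint64_t wanted,
    uint64_t wrote)
{
	uint64_t interval;

	if (demo_feed_again && wrote > (wanted / 2))
		interval = demo_feed_min_ms;
	else
		interval = demo_feed_secs * 1000;

	return (DEMO_MAX(now_ms, DEMO_MIN(now_ms + interval,
	    began_ms + interval)));
}

int
main(void)
{
	uint64_t wanted = demo_write_size();

	/* A busy cycle that began at t=0 and finished at t=50ms. */
	printf("wanted %llu bytes; busy cycle -> next wakeup at %llu ms\n",
	    (unsigned long long)wanted,
	    (unsigned long long)demo_write_interval(0, 50, wanted, wanted));

	/* An idle cycle that wrote nothing. */
	printf("idle cycle -> next wakeup at %llu ms\n",
	    (unsigned long long)demo_write_interval(0, 50, wanted, 0));

	return (0);
}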