/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2013 by Delphix. All rights reserved.
 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 */

/*
 * DVA-based Adjustable Replacement Cache
 *
 * While much of the theory of operation used here is
 * based on the self-tuning, low overhead replacement cache
 * presented by Megiddo and Modha at FAST 2003, there are some
 * significant differences:
 *
 * 1. The Megiddo and Modha model assumes any page is evictable.
 * Pages in its cache cannot be "locked" into memory.  This makes
 * the eviction algorithm simple: evict the last page in the list.
 * This also makes the performance characteristics easy to reason
 * about.  Our cache is not so simple.  At any given moment, some
 * subset of the blocks in the cache are un-evictable because we
 * have handed out a reference to them.  Blocks are only evictable
 * when there are no external references active.  This makes
 * eviction far more problematic: we choose to evict the evictable
 * blocks that are the "lowest" in the list.
 *
 * There are times when it is not possible to evict the requested
 * space.  In these circumstances we are unable to adjust the cache
 * size.  To prevent the cache growing unbounded at these times we
 * implement a "cache throttle" that slows the flow of new data
 * into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 * Pages are evicted when the cache is full and there is a cache
 * miss.  Our model has a variable sized cache.  It grows with
 * high use, but also tries to react to memory pressure from the
 * operating system: decreasing its size when system memory is
 * tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size.  All
 * elements of the cache are therefore exactly the same size.  So
 * when adjusting the cache size following a cache miss, it's simply
 * a matter of choosing a single page to evict.  In our model, we
 * have variable sized cache blocks (ranging from 512 bytes to
 * 128K bytes).  We therefore choose a set of blocks to evict to make
 * space for a cache miss that approximates as closely as possible
 * the space used by the new block.
 *
 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
 * by N. Megiddo & D. Modha, FAST 2003
 */

/*
 * The locking model:
 *
 * A new reference to a cache buffer can be obtained in two
 * ways: 1) via a hash table lookup using the DVA as a key,
 * or 2) via one of the ARC lists.  The arc_read() interface
 * uses method 1, while the internal arc algorithms for
 * adjusting the cache use method 2.  We therefore provide two
 * types of locks: 1) the hash table lock array, and 2) the
 * arc list locks.
 *
 * Buffers do not have their own mutexes, rather they rely on the
 * hash table mutexes for the bulk of their protection (i.e. most
 * fields in the arc_buf_hdr_t are protected by these mutexes).
 *
 * buf_hash_find() returns the appropriate mutex (held) when it
 * locates the requested buffer in the hash table.  It returns
 * NULL for the mutex if the buffer was not in the table.
 *
 * buf_hash_remove() expects the appropriate hash mutex to be
 * already held before it is invoked.
 *
 * Each arc state also has a mutex which is used to protect the
 * buffer list associated with the state.  When attempting to
 * obtain a hash table lock while holding an arc list lock you
 * must use mutex_tryenter() to avoid deadlock.  Also note that
 * the active state mutex must be held before the ghost state mutex.
 *
 * Arc buffers may have an associated eviction callback function.
 * This function will be invoked prior to removing the buffer (e.g.
 * in arc_do_user_evicts()).  Note however that the data associated
 * with the buffer may be evicted prior to the callback.  The callback
 * must be made with *no locks held* (to prevent deadlock).  Additionally,
 * the users of callbacks must ensure that their private data is
 * protected from simultaneous callbacks from arc_buf_evict()
 * and arc_do_user_evicts().
 *
 * Note that the majority of the performance stats are manipulated
 * with atomic operations.
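 *
 * As an illustration of the lock-ordering rule above (a sketch only, not
 * code taken from this file): a walker that already holds an arc state
 * list mutex should only try-enter the hash lock and skip the buffer on
 * contention, e.g.
 *
 *      kmutex_t *hash_lock = HDR_LOCK(ab);
 *      if (mutex_tryenter(hash_lock)) {
 *              (evict or move the buffer)
 *              mutex_exit(hash_lock);
 *      } else {
 *              ARCSTAT_BUMP(arcstat_mutex_miss);
 *      }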
 *
 * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
 *
 *      - L2ARC buflist creation
 *      - L2ARC buflist eviction
 *      - L2ARC write completion, which walks L2ARC buflists
 *      - ARC header destruction, as it removes from L2ARC buflists
 *      - ARC header release, as it removes from L2ARC buflists
 */

#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/zio_compress.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/refcount.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#ifdef _KERNEL
#include <sys/vmsystm.h>
#include <vm/anon.h>
#include <sys/fs/swapnode.h>
#include <sys/dnlc.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
#include <zfs_fletcher.h>
#include <sys/byteorder.h>

#ifndef _KERNEL
/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
boolean_t arc_watch = B_FALSE;
int arc_procfd;
#endif

static kmutex_t arc_reclaim_thr_lock;
static kcondvar_t arc_reclaim_thr_cv;   /* used to signal reclaim thr */
static uint8_t arc_thread_exit;

extern int zfs_write_limit_shift;
extern uint64_t zfs_write_limit_max;
extern kmutex_t zfs_write_limit_lock;

#define ARC_REDUCE_DNLC_PERCENT 3
uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;

typedef enum arc_reclaim_strategy {
        ARC_RECLAIM_AGGR,               /* Aggressive reclaim strategy */
        ARC_RECLAIM_CONS                /* Conservative reclaim strategy */
} arc_reclaim_strategy_t;

/* number of seconds before growing cache again */
static int arc_grow_retry = 60;

/* shift of arc_c for calculating both min and max arc_p */
static int arc_p_min_shift = 4;

/* log2(fraction of arc to reclaim) */
static int arc_shrink_shift = 5;

/*
 * minimum lifespan of a prefetch block in clock ticks
 * (initialized in arc_init())
 */
static int arc_min_prefetch_lifespan;

static int arc_dead;

/*
 * The arc has filled available memory and has now warmed up.
 */
static boolean_t arc_warm;

/*
 * These tunables are for performance analysis.
 */
uint64_t zfs_arc_max;
uint64_t zfs_arc_min;
uint64_t zfs_arc_meta_limit = 0;
int zfs_arc_grow_retry = 0;
int zfs_arc_shrink_shift = 0;
int zfs_arc_p_min_shift = 0;
int zfs_disable_dup_eviction = 0;

/*
 * Note that buffers can be in one of 6 states:
 *      ARC_anon        - anonymous (discussed below)
 *      ARC_mru         - recently used, currently cached
 *      ARC_mru_ghost   - recently used, no longer in cache
 *      ARC_mfu         - frequently used, currently cached
 *      ARC_mfu_ghost   - frequently used, no longer in cache
 *      ARC_l2c_only    - exists in L2ARC but not other states
 * When there are no active references to the buffer, they are
 * linked onto a list in one of these arc states.  These are
 * the only buffers that can be evicted or deleted.  Within each
 * state there are multiple lists, one for meta-data and one for
 * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
 * etc.) is tracked separately so that it can be managed more
 * explicitly: favored over data, limited explicitly.
 *
 * Anonymous buffers are buffers that are not associated with
 * a DVA.  These are buffers that hold dirty block copies
 * before they are written to stable storage.
 * By definition, they are "ref'd" and are considered part of arc_mru
 * that cannot be freed.  Generally, they will acquire a DVA
 * as they are written and migrate onto the arc_mru list.
 *
 * The ARC_l2c_only state is for buffers that are in the second
 * level ARC but no longer in any of the ARC_m* lists.  The second
 * level ARC itself may also contain buffers that are in any of
 * the ARC_m* states - meaning that a buffer can exist in two
 * places.  The reason for the ARC_l2c_only state is to keep the
 * buffer header in the hash table, so that reads that hit the
 * second level ARC benefit from these fast lookups.
 */

typedef struct arc_state {
        list_t arcs_list[ARC_BUFC_NUMTYPES];    /* list of evictable buffers */
        uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
        uint64_t arcs_size;     /* total amount of data in this state */
        kmutex_t arcs_mtx;
} arc_state_t;

/* The 6 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru;
static arc_state_t ARC_mru_ghost;
static arc_state_t ARC_mfu;
static arc_state_t ARC_mfu_ghost;
static arc_state_t ARC_l2c_only;

typedef struct arc_stats {
        kstat_named_t arcstat_hits;
        kstat_named_t arcstat_misses;
        kstat_named_t arcstat_demand_data_hits;
        kstat_named_t arcstat_demand_data_misses;
        kstat_named_t arcstat_demand_metadata_hits;
        kstat_named_t arcstat_demand_metadata_misses;
        kstat_named_t arcstat_prefetch_data_hits;
        kstat_named_t arcstat_prefetch_data_misses;
        kstat_named_t arcstat_prefetch_metadata_hits;
        kstat_named_t arcstat_prefetch_metadata_misses;
        kstat_named_t arcstat_mru_hits;
        kstat_named_t arcstat_mru_ghost_hits;
        kstat_named_t arcstat_mfu_hits;
        kstat_named_t arcstat_mfu_ghost_hits;
        kstat_named_t arcstat_deleted;
        kstat_named_t arcstat_recycle_miss;
        /*
         * Number of buffers that could not be evicted because the hash lock
         * was held by another thread.  The lock may not necessarily be held
         * by something using the same buffer, since hash locks are shared
         * by multiple buffers.
         */
        kstat_named_t arcstat_mutex_miss;
        /*
         * Number of buffers skipped because they have I/O in progress, are
         * indirect prefetch buffers that have not lived long enough, or are
         * not from the spa we're trying to evict from.
         */
        kstat_named_t arcstat_evict_skip;
        kstat_named_t arcstat_evict_l2_cached;
        kstat_named_t arcstat_evict_l2_eligible;
        kstat_named_t arcstat_evict_l2_ineligible;
        kstat_named_t arcstat_hash_elements;
        kstat_named_t arcstat_hash_elements_max;
        kstat_named_t arcstat_hash_collisions;
        kstat_named_t arcstat_hash_chains;
        kstat_named_t arcstat_hash_chain_max;
        kstat_named_t arcstat_p;
        kstat_named_t arcstat_c;
        kstat_named_t arcstat_c_min;
        kstat_named_t arcstat_c_max;
        kstat_named_t arcstat_size;
        kstat_named_t arcstat_hdr_size;
        kstat_named_t arcstat_data_size;
        kstat_named_t arcstat_other_size;
        kstat_named_t arcstat_l2_hits;
        kstat_named_t arcstat_l2_misses;
        kstat_named_t arcstat_l2_feeds;
        kstat_named_t arcstat_l2_rw_clash;
        kstat_named_t arcstat_l2_read_bytes;
        kstat_named_t arcstat_l2_write_bytes;
        kstat_named_t arcstat_l2_writes_sent;
        kstat_named_t arcstat_l2_writes_done;
        kstat_named_t arcstat_l2_writes_error;
        kstat_named_t arcstat_l2_writes_hdr_miss;
        kstat_named_t arcstat_l2_evict_lock_retry;
        kstat_named_t arcstat_l2_evict_reading;
        kstat_named_t arcstat_l2_free_on_write;
        kstat_named_t arcstat_l2_abort_lowmem;
        kstat_named_t arcstat_l2_cksum_bad;
        kstat_named_t arcstat_l2_io_error;
        kstat_named_t arcstat_l2_size;
        kstat_named_t arcstat_l2_asize;
        kstat_named_t arcstat_l2_hdr_size;
        kstat_named_t arcstat_l2_compress_successes;
        kstat_named_t arcstat_l2_compress_zeros;
        kstat_named_t arcstat_l2_compress_failures;
        kstat_named_t arcstat_l2_meta_writes;
        kstat_named_t arcstat_l2_meta_avg_size;
        kstat_named_t arcstat_l2_meta_avg_asize;
        kstat_named_t arcstat_l2_asize_to_meta_ratio;
        kstat_named_t arcstat_l2_rebuild_attempts;
        kstat_named_t arcstat_l2_rebuild_successes;
        kstat_named_t arcstat_l2_rebuild_unsupported;
        kstat_named_t arcstat_l2_rebuild_timeout;
        kstat_named_t arcstat_l2_rebuild_arc_bytes;
        kstat_named_t arcstat_l2_rebuild_l2arc_bytes;
        kstat_named_t arcstat_l2_rebuild_bufs;
        kstat_named_t arcstat_l2_rebuild_bufs_precached;
        kstat_named_t arcstat_l2_rebuild_metabufs;
        kstat_named_t arcstat_l2_rebuild_uberblk_errors;
        kstat_named_t arcstat_l2_rebuild_io_errors;
        kstat_named_t arcstat_l2_rebuild_cksum_errors;
        kstat_named_t arcstat_l2_rebuild_loop_errors;
        kstat_named_t arcstat_l2_rebuild_abort_lowmem;
        kstat_named_t arcstat_memory_throttle_count;
        kstat_named_t arcstat_duplicate_buffers;
        kstat_named_t arcstat_duplicate_buffers_size;
        kstat_named_t arcstat_duplicate_reads;
        kstat_named_t arcstat_meta_used;
        kstat_named_t arcstat_meta_limit;
        kstat_named_t arcstat_meta_max;
} arc_stats_t;

static arc_stats_t arc_stats = {
        { "hits", KSTAT_DATA_UINT64 },
        { "misses", KSTAT_DATA_UINT64 },
        { "demand_data_hits", KSTAT_DATA_UINT64 },
        { "demand_data_misses", KSTAT_DATA_UINT64 },
        { "demand_metadata_hits", KSTAT_DATA_UINT64 },
        { "demand_metadata_misses", KSTAT_DATA_UINT64 },
        { "prefetch_data_hits", KSTAT_DATA_UINT64 },
        { "prefetch_data_misses", KSTAT_DATA_UINT64 },
        { "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
        { "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
        { "mru_hits", KSTAT_DATA_UINT64 },
        { "mru_ghost_hits", KSTAT_DATA_UINT64 },
        { "mfu_hits", KSTAT_DATA_UINT64 },
        { "mfu_ghost_hits", KSTAT_DATA_UINT64 },
        { "deleted", KSTAT_DATA_UINT64 },
        { "recycle_miss", KSTAT_DATA_UINT64 },
        { "mutex_miss", KSTAT_DATA_UINT64 },
        { "evict_skip", KSTAT_DATA_UINT64 },
        { "evict_l2_cached", KSTAT_DATA_UINT64 },
        { "evict_l2_eligible", KSTAT_DATA_UINT64 },
        { "evict_l2_ineligible", KSTAT_DATA_UINT64 },
        { "hash_elements", KSTAT_DATA_UINT64 },
        { "hash_elements_max", KSTAT_DATA_UINT64 },
        { "hash_collisions", KSTAT_DATA_UINT64 },
        { "hash_chains", KSTAT_DATA_UINT64 },
        { "hash_chain_max", KSTAT_DATA_UINT64 },
        { "p", KSTAT_DATA_UINT64 },
        { "c", KSTAT_DATA_UINT64 },
        { "c_min", KSTAT_DATA_UINT64 },
        { "c_max", KSTAT_DATA_UINT64 },
        { "size", KSTAT_DATA_UINT64 },
        { "hdr_size", KSTAT_DATA_UINT64 },
        { "data_size", KSTAT_DATA_UINT64 },
        { "other_size", KSTAT_DATA_UINT64 },
        { "l2_hits", KSTAT_DATA_UINT64 },
        { "l2_misses", KSTAT_DATA_UINT64 },
        { "l2_feeds", KSTAT_DATA_UINT64 },
        { "l2_rw_clash", KSTAT_DATA_UINT64 },
        { "l2_read_bytes", KSTAT_DATA_UINT64 },
        { "l2_write_bytes", KSTAT_DATA_UINT64 },
        { "l2_writes_sent", KSTAT_DATA_UINT64 },
        { "l2_writes_done", KSTAT_DATA_UINT64 },
        { "l2_writes_error", KSTAT_DATA_UINT64 },
        { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 },
        { "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
        { "l2_evict_reading", KSTAT_DATA_UINT64 },
        { "l2_free_on_write", KSTAT_DATA_UINT64 },
        { "l2_abort_lowmem", KSTAT_DATA_UINT64 },
        { "l2_cksum_bad", KSTAT_DATA_UINT64 },
        { "l2_io_error", KSTAT_DATA_UINT64 },
        { "l2_size", KSTAT_DATA_UINT64 },
        { "l2_asize", KSTAT_DATA_UINT64 },
        { "l2_hdr_size", KSTAT_DATA_UINT64 },
        { "l2_compress_successes", KSTAT_DATA_UINT64 },
        { "l2_compress_zeros", KSTAT_DATA_UINT64 },
        { "l2_compress_failures", KSTAT_DATA_UINT64 },
        { "l2_meta_writes", KSTAT_DATA_UINT64 },
        { "l2_meta_avg_size", KSTAT_DATA_UINT64 },
        { "l2_meta_avg_asize", KSTAT_DATA_UINT64 },
        { "l2_asize_to_meta_ratio", KSTAT_DATA_UINT64 },
        { "l2_rebuild_attempts", KSTAT_DATA_UINT64 },
        { "l2_rebuild_successes", KSTAT_DATA_UINT64 },
        { "l2_rebuild_unsupported", KSTAT_DATA_UINT64 },
        { "l2_rebuild_timeout", KSTAT_DATA_UINT64 },
        { "l2_rebuild_arc_bytes", KSTAT_DATA_UINT64 },
        { "l2_rebuild_l2arc_bytes", KSTAT_DATA_UINT64 },
        { "l2_rebuild_bufs", KSTAT_DATA_UINT64 },
        { "l2_rebuild_precached", KSTAT_DATA_UINT64 },
        { "l2_rebuild_metabufs", KSTAT_DATA_UINT64 },
        { "l2_rebuild_uberblk_errors", KSTAT_DATA_UINT64 },
        { "l2_rebuild_io_errors", KSTAT_DATA_UINT64 },
        { "l2_rebuild_cksum_errors", KSTAT_DATA_UINT64 },
        { "l2_rebuild_loop_errors", KSTAT_DATA_UINT64 },
        { "l2_rebuild_abort_lowmem", KSTAT_DATA_UINT64 },
        { "memory_throttle_count", KSTAT_DATA_UINT64 },
        { "duplicate_buffers", KSTAT_DATA_UINT64 },
        { "duplicate_buffers_size", KSTAT_DATA_UINT64 },
        { "duplicate_reads", KSTAT_DATA_UINT64 },
        { "arc_meta_used", KSTAT_DATA_UINT64 },
        { "arc_meta_limit", KSTAT_DATA_UINT64 },
        { "arc_meta_max", KSTAT_DATA_UINT64 }
};

#define ARCSTAT(stat)   (arc_stats.stat.value.ui64)

#define ARCSTAT_INCR(stat, val) \
        atomic_add_64(&arc_stats.stat.value.ui64, (val))

#define ARCSTAT_BUMP(stat)      ARCSTAT_INCR(stat, 1)
#define ARCSTAT_BUMPDOWN(stat)  ARCSTAT_INCR(stat, -1)

#define ARCSTAT_MAX(stat, val) { \
        uint64_t m; \
        while ((val) > (m = arc_stats.stat.value.ui64) && \
            (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
                continue; \
}

#define ARCSTAT_MAXSTAT(stat) \
        ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)

/*
 * We define a macro to allow ARC hits/misses to be easily broken down by
 * two separate conditions, giving a total of four different subtypes for
 * each of hits and misses (so eight statistics total).
 */
#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
        if (cond1) { \
                if (cond2) { \
                        ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
                } else { \
                        ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
                } \
        } else { \
                if (cond2) { \
                        ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
                } else { \
                        ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
                } \
        }

/*
 * This macro allows us to use kstats as floating averages.  Each time we
 * update this kstat, we first factor it and the update value by
 * ARCSTAT_F_AVG_FACTOR to shrink the new value's contribution to the overall
 * average.  This macro assumes that integer loads and stores are atomic, but
 * is not safe for multiple writers updating the kstat in parallel (only the
 * last writer's update will remain).
 */
#define ARCSTAT_F_AVG_FACTOR    3
#define ARCSTAT_F_AVG(stat, value) \
        do { \
                uint64_t x = ARCSTAT(stat); \
                x = x - x / ARCSTAT_F_AVG_FACTOR + \
                    (value) / ARCSTAT_F_AVG_FACTOR; \
                ARCSTAT(stat) = x; \
                _NOTE(NOTREACHED) \
                _NOTE(CONSTCOND) \
        } while (0)

kstat_t *arc_ksp;
static arc_state_t *arc_anon;
static arc_state_t *arc_mru;
static arc_state_t *arc_mru_ghost;
static arc_state_t *arc_mfu;
static arc_state_t *arc_mfu_ghost;
static arc_state_t *arc_l2c_only;

/*
 * There are several ARC variables that are critical to export as kstats --
 * but we don't want to have to grovel around in the kstat whenever we wish to
 * manipulate them.  For these variables, we therefore define them to be in
 * terms of the statistic variable.  This assures that we are not introducing
 * the possibility of inconsistency by having shadow copies of the variables,
 * while still allowing the code to be readable.
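 *
 * For example (an illustrative line, not additional code), the accounting
 * in arc_space_consume() below simply does
 *
 *      atomic_add_64(&arc_size, space);
 *
 * and, because arc_size expands to ARCSTAT(arcstat_size), the exported
 * "size" kstat is updated by that same store.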
 */
#define arc_size        ARCSTAT(arcstat_size)   /* actual total arc size */
#define arc_p           ARCSTAT(arcstat_p)      /* target size of MRU */
#define arc_c           ARCSTAT(arcstat_c)      /* target size of cache */
#define arc_c_min       ARCSTAT(arcstat_c_min)  /* min target cache size */
#define arc_c_max       ARCSTAT(arcstat_c_max)  /* max target cache size */
#define arc_meta_limit  ARCSTAT(arcstat_meta_limit) /* max size for metadata */
#define arc_meta_used   ARCSTAT(arcstat_meta_used) /* size of metadata */
#define arc_meta_max    ARCSTAT(arcstat_meta_max) /* max size of metadata */

#define L2ARC_IS_VALID_COMPRESS(_c_) \
        ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)

static int arc_no_grow;         /* Don't try to grow cache size */
static uint64_t arc_tempreserve;
static uint64_t arc_loaned_bytes;

typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;

typedef struct arc_callback arc_callback_t;

struct arc_callback {
        void *acb_private;
        arc_done_func_t *acb_done;
        arc_buf_t *acb_buf;
        zio_t *acb_zio_dummy;
        arc_callback_t *acb_next;
};

typedef struct arc_write_callback arc_write_callback_t;

struct arc_write_callback {
        void *awcb_private;
        arc_done_func_t *awcb_ready;
        arc_done_func_t *awcb_done;
        arc_buf_t *awcb_buf;
};

struct arc_buf_hdr {
        /* protected by hash lock */
        dva_t b_dva;
        uint64_t b_birth;
        uint64_t b_cksum0;

        kmutex_t b_freeze_lock;
        zio_cksum_t *b_freeze_cksum;
        void *b_thawed;

        arc_buf_hdr_t *b_hash_next;
        arc_buf_t *b_buf;
        uint32_t b_flags;
        uint32_t b_datacnt;

        arc_callback_t *b_acb;
        kcondvar_t b_cv;

        /* immutable */
        arc_buf_contents_t b_type;
        uint64_t b_size;
        uint64_t b_spa;

        /* protected by arc state mutex */
        arc_state_t *b_state;
        list_node_t b_arc_node;

        /* updated atomically */
        clock_t b_arc_access;

        /* self protecting */
        refcount_t b_refcnt;

        l2arc_buf_hdr_t *b_l2hdr;
        list_node_t b_l2node;
};

static arc_buf_t *arc_eviction_list;
static kmutex_t arc_eviction_mtx;
static arc_buf_hdr_t arc_eviction_hdr;
static void arc_get_data_buf(arc_buf_t *buf);
static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
static int arc_evict_needed(arc_buf_contents_t type);
static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
static void arc_buf_watch(arc_buf_t *buf);

static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);

#define GHOST_STATE(state) \
        ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
        (state) == arc_l2c_only)

/*
 * Private ARC flags.  These flags are private ARC only flags that will show up
 * in b_flags in the arc_buf_hdr_t.  Some flags are publicly declared, and can
 * be passed in as arc_flags in things like arc_read.  However, these flags
 * should never be passed and should only be set by ARC code.  When adding new
 * public flags, make sure not to smash the private ones.
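 *
 * A minimal sketch of how these bits are used (assumed usage, for
 * illustration only; the real call sites are elsewhere in this file):
 *
 *      hdr->b_flags |= ARC_IO_IN_PROGRESS;
 *      ...
 *      if (HDR_IO_IN_PROGRESS(hdr))
 *              cv_wait(&hdr->b_cv, hash_lock);
 *      hdr->b_flags &= ~ARC_IO_IN_PROGRESS;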
590 */ 591 592 #define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */ 593 #define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */ 594 #define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */ 595 #define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ 596 #define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */ 597 #define ARC_INDIRECT (1 << 14) /* this is an indirect block */ 598 #define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */ 599 #define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */ 600 #define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */ 601 #define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */ 602 603 #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) 604 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) 605 #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) 606 #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH) 607 #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) 608 #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) 609 #define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS) 610 #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE) 611 #define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \ 612 (hdr)->b_l2hdr != NULL) 613 #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING) 614 #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED) 615 #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD) 616 617 /* 618 * Other sizes 619 */ 620 621 #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) 622 #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t)) 623 624 /* 625 * Hash table routines 626 */ 627 628 #define HT_LOCK_PAD 64 629 630 struct ht_lock { 631 kmutex_t ht_lock; 632 #ifdef _KERNEL 633 unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; 634 #endif 635 }; 636 637 #define BUF_LOCKS 256 638 typedef struct buf_hash_table { 639 uint64_t ht_mask; 640 arc_buf_hdr_t **ht_table; 641 struct ht_lock ht_locks[BUF_LOCKS]; 642 } buf_hash_table_t; 643 644 static buf_hash_table_t buf_hash_table; 645 646 #define BUF_HASH_INDEX(spa, dva, birth) \ 647 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) 648 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) 649 #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) 650 #define HDR_LOCK(hdr) \ 651 (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) 652 653 uint64_t zfs_crc64_table[256]; 654 655 /* 656 * Level 2 ARC 657 */ 658 659 #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ 660 #define L2ARC_HEADROOM 2 /* num of writes */ 661 /* 662 * If we discover during ARC scan any buffers to be compressed, we boost 663 * our headroom for the next scanning cycle by this percentage multiple. 
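 *
 * Rough arithmetic sketch (assuming the formula used by the feed thread,
 * which is not shown in this excerpt): with the default l2arc_write_max of
 * 8MB and l2arc_headroom of 2, a feed cycle scans about 16MB worth of ARC
 * lists; a 200% boost raises that to (16MB * 200) / 100 = 32MB.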
664 */ 665 #define L2ARC_HEADROOM_BOOST 200 666 #define L2ARC_FEED_SECS 1 /* caching interval secs */ 667 #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ 668 669 #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) 670 #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) 671 672 /* L2ARC Performance Tunables */ 673 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ 674 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ 675 uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ 676 uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; 677 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ 678 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */ 679 boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ 680 boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */ 681 boolean_t l2arc_norw = B_TRUE; /* no reads during writes */ 682 683 /* 684 * L2ARC Internals 685 */ 686 typedef struct l2arc_dev l2arc_dev_t; 687 static list_t L2ARC_dev_list; /* device list */ 688 static list_t *l2arc_dev_list; /* device list pointer */ 689 static kmutex_t l2arc_dev_mtx; /* device list mutex */ 690 static l2arc_dev_t *l2arc_dev_last; /* last device used */ 691 static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */ 692 static list_t L2ARC_free_on_write; /* free after write buf list */ 693 static list_t *l2arc_free_on_write; /* free after write list ptr */ 694 static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ 695 static uint64_t l2arc_ndev; /* number of devices */ 696 697 typedef struct l2arc_read_callback { 698 arc_buf_t *l2rcb_buf; /* read buffer */ 699 spa_t *l2rcb_spa; /* spa */ 700 blkptr_t l2rcb_bp; /* original blkptr */ 701 zbookmark_t l2rcb_zb; /* original bookmark */ 702 int l2rcb_flags; /* original flags */ 703 enum zio_compress l2rcb_compress; /* applied compress */ 704 } l2arc_read_callback_t; 705 706 typedef struct l2arc_write_callback { 707 l2arc_dev_t *l2wcb_dev; /* device info */ 708 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ 709 uint8_t *l2wcb_pbuf; /* pbuf sent in this write */ 710 uint32_t l2wcb_pbuf_size; /* size of committed pbuf */ 711 uint8_t *l2wcb_ub_buf; /* uberblock in this write */ 712 } l2arc_write_callback_t; 713 714 struct l2arc_buf_hdr { 715 /* protected by arc_buf_hdr mutex */ 716 l2arc_dev_t *b_dev; /* L2ARC device */ 717 uint64_t b_daddr; /* disk address, offset byte */ 718 /* compression applied to buffer data */ 719 enum zio_compress b_compress; 720 /* real alloc'd buffer size depending on b_compress applied */ 721 int b_asize; 722 /* temporary buffer holder for in-flight compressed data */ 723 void *b_tmp_cdata; 724 }; 725 726 typedef struct l2arc_data_free { 727 /* protected by l2arc_free_on_write_mtx */ 728 void *l2df_data; 729 size_t l2df_size; 730 void (*l2df_func)(void *, size_t); 731 list_node_t l2df_list_node; 732 } l2arc_data_free_t; 733 734 static kmutex_t l2arc_feed_thr_lock; 735 static kcondvar_t l2arc_feed_thr_cv; 736 static uint8_t l2arc_thread_exit; 737 738 static void l2arc_read_done(zio_t *zio); 739 static void l2arc_hdr_stat_add(boolean_t from_arc); 740 static void l2arc_hdr_stat_remove(void); 741 742 static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr); 743 static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, 744 enum zio_compress c); 745 static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab); 746 747 typedef enum { 748 L2UBLK_BIG_ENDIAN = (1 << 0), /* little endian 
assumed otherwise */ 749 L2UBLK_EVICT_FIRST = (1 << 1) /* mirror of l2ad_first in l2dev */ 750 } l2uberblock_flags_t; 751 752 typedef struct l2uberblock { 753 uint32_t ub_magic; 754 uint8_t ub_version; 755 l2uberblock_flags_t ub_flags; 756 757 uint64_t ub_spa_guid; 758 uint64_t ub_birth; 759 uint64_t ub_evict_tail; /* current evict pointer */ 760 uint64_t ub_alloc_space; /* vdev space alloc status */ 761 uint64_t ub_pbuf_daddr; /* address of newest pbuf */ 762 uint32_t ub_pbuf_asize; /* size of newest pbuf */ 763 zio_cksum_t ub_pbuf_cksum; /* fletcher4 of newest pbuf */ 764 765 zio_cksum_t ub_cksum; /* cksum of uberblock */ 766 } l2uberblock_t; 767 768 typedef enum { 769 L2PBUF_BIG_ENDIAN = (1 << 0), /* little endian assumed otherwise */ 770 L2PBUF_COMPRESSED = (1 << 1) /* pbuf data items are compressed */ 771 } l2pbuf_flags_t; 772 773 typedef struct l2pbuf { 774 uint32_t pb_magic; 775 unsigned int pb_version; 776 l2pbuf_flags_t pb_flags; 777 778 uint64_t pb_prev_daddr; /* address of previous pbuf */ 779 uint32_t pb_prev_asize; /* size of previous pbuf */ 780 zio_cksum_t pb_prev_cksum; /* fletcher4 of prev. pbuf */ 781 782 /* 783 * This is a set of item lists that are contained in this pbuf. Each 784 * L2ARC write appends a new l2pbuf_buflist_t array of l2pbuf_buf_t's. 785 * This serves as a soft timeout feature - once the limit of the 786 * number of item lists that a pbuf can hold is reached, the pbuf is 787 * flushed to stable storage, regardless of its total size. 788 */ 789 list_t *pb_buflists_list; 790 791 /* 792 * Number of compressed bytes referenced by items in this pbuf and 793 * the number of lists present. 794 * This is not actually written to storage, it is only used by 795 * internal algorithms which check for when a pbuf reaches a 796 * certain size limit, after which it is flushed in a write. 797 */ 798 uint64_t pb_payload_asz; 799 /* Same thing for number of buflists */ 800 int pb_nbuflists; 801 802 /* 803 * Filled in by l2arc_pbuf_read to hold this pbuf's alloc'd size. 804 * This is then used by l2arc_pbuf_restore to update used space 805 * on the L2ARC vdev. 
806 */ 807 size_t pb_asize; 808 } l2pbuf_t; 809 810 typedef struct l2pbuf_buf l2pbuf_buf_t; 811 typedef struct l2pbuf_buflist { 812 uint32_t l2pbl_nbufs; 813 l2pbuf_buf_t *l2pbl_bufs; 814 list_node_t l2pbl_node; 815 } l2pbuf_buflist_t; 816 817 struct l2pbuf_buf { 818 dva_t b_dva; /* dva of buffer */ 819 uint64_t b_birth; /* birth txg of buffer */ 820 uint64_t b_cksum0; 821 zio_cksum_t b_freeze_cksum; 822 uint32_t b_size; /* uncompressed buf size */ 823 uint64_t b_l2daddr; /* buf location on l2dev */ 824 uint32_t b_l2asize; /* actual buf data size */ 825 enum zio_compress b_l2compress; /* compression applied */ 826 uint16_t b_contents_type; 827 uint32_t b_flags; 828 }; 829 830 struct l2arc_dev { 831 vdev_t *l2ad_vdev; /* vdev */ 832 spa_t *l2ad_spa; /* spa */ 833 uint64_t l2ad_hand; /* next write location */ 834 uint64_t l2ad_start; /* first addr on device */ 835 uint64_t l2ad_end; /* last addr on device */ 836 uint64_t l2ad_evict; /* last addr eviction reached */ 837 boolean_t l2ad_first; /* first sweep through */ 838 boolean_t l2ad_writing; /* currently writing */ 839 list_t *l2ad_buflist; /* buffer list */ 840 list_node_t l2ad_node; /* device list node */ 841 l2pbuf_t l2ad_pbuf; /* currently open pbuf */ 842 uint64_t l2ad_pbuf_daddr; /* prev pbuf daddr */ 843 uint64_t l2ad_pbuf_asize; /* prev pbuf asize */ 844 zio_cksum_t l2ad_pbuf_cksum; /* prev pbuf cksum */ 845 /* uberblock birth counter - incremented for each committed uberblk */ 846 uint64_t l2ad_uberblock_birth; 847 /* flag indicating whether a rebuild is currently going on */ 848 boolean_t l2ad_rebuilding; 849 }; 850 851 /* Stores information about an L2ARC prefetch zio */ 852 typedef struct l2arc_prefetch_info { 853 uint8_t *pi_buf; /* where the zio writes to */ 854 uint64_t pi_buflen; /* length of `buf' */ 855 zio_t *pi_hdr_io; /* see l2arc_pbuf_read below */ 856 } l2arc_prefetch_info_t; 857 858 /* 256 x 4k of l2uberblocks */ 859 #define L2UBERBLOCK_SIZE 4096 860 #define L2UBERBLOCK_MAGIC 0x12bab10c 861 #define L2UBERBLOCK_MAX_VERSION 1 /* our maximum uberblock version */ 862 #define L2PBUF_MAGIC 0xdb0faba6 863 #define L2PBUF_MAX_VERSION 1 /* our maximum pbuf version */ 864 #define L2PBUF_BUF_SIZE 88 /* size of one pbuf buf entry */ 865 #define L2PBUF_HDR_SIZE 56 /* pbuf header excluding any payload */ 866 #define L2PBUF_ENCODED_SIZE(_pb) \ 867 (L2PBUF_HDR_SIZE + l2arc_pbuf_items_encoded_size(_pb)) 868 /* 869 * Allocation limit for the payload of a pbuf. This also fundamentally 870 * limits the number of bufs we can reference in a pbuf. 871 */ 872 #define L2PBUF_MAX_PAYLOAD_SIZE (24 * 1024 * 1024) 873 #define L2PBUF_MAX_BUFS (L2PBUF_MAX_PAYLOAD_SIZE / L2PBUF_BUF_SIZE) 874 #define L2PBUF_COMPRESS_MINSZ 8192 /* minimum size to compress a pbuf */ 875 #define L2PBUF_MAXSZ 100 * 1024 * 1024 /* maximum pbuf size */ 876 #define L2PBUF_MAX_BUFLISTS 128 /* max number of buflists per pbuf */ 877 #define L2ARC_REBUILD_TIMEOUT 60 /* a rebuild may take at most 60s */ 878 #define L2PBUF_IS_FULL(_pb) \ 879 ((_pb)->pb_payload_asz > l2arc_pbuf_max_sz || \ 880 (_pb)->pb_nbuflists + 1 >= l2arc_pbuf_max_buflists) 881 /* 882 * These are the flags we allow to persist in L2ARC pbufs. The other flags 883 * of an ARC buffer pertain to the buffer's runtime behavior. 884 */ 885 #define L2ARC_PERSIST_FLAGS \ 886 (ARC_IN_HASH_TABLE | ARC_L2CACHE | ARC_L2COMPRESS | ARC_PREFETCH) 887 888 /* 889 * Used during L2ARC rebuild after each read operation to check whether we 890 * haven't exceeded the rebuild timeout value. 
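 *
 * Typical shape of a call (illustrative; the cleanup argument here is a
 * made-up example of what a caller might pass):
 *
 *      L2ARC_CHK_REBUILD_TIMEOUT(deadline,
 *          kmem_free(buf, buflen));
 *
 * When `deadline' (an lbolt64 timestamp) has passed, the macro runs the
 * supplied cleanup, bumps arcstat_l2_rebuild_timeout, warns, and returns
 * from the enclosing (void) function; otherwise it is a no-op.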
 */
#define L2ARC_CHK_REBUILD_TIMEOUT(_deadline_, ...) \
        do { \
                if ((_deadline_) != 0 && (_deadline_) < ddi_get_lbolt64()) { \
                        __VA_ARGS__; \
                        ARCSTAT_BUMP(arcstat_l2_rebuild_timeout); \
                        cmn_err(CE_WARN, "L2ARC rebuild is taking too long, " \
                            "dropping remaining L2ARC metadata."); \
                        return; \
                } \
                _NOTE(NOTREACHED) \
                _NOTE(CONSTCOND) \
        } while (0)

/*
 * Performance tuning of L2ARC persistency:
 *
 * l2arc_pbuf_compress_minsz : Minimum size of a pbuf in order to attempt
 *      compressing it.
 * l2arc_pbuf_max_sz : Upper bound on the physical size of L2ARC buffers
 *      referenced from a pbuf.  Once a pbuf reaches this size, it is
 *      committed to stable storage.  Ideally, there should be approx.
 *      l2arc_dev_size / l2arc_pbuf_max_sz pbufs on an L2ARC device.
 * l2arc_pbuf_max_buflists : Maximum number of L2ARC feed cycles that will
 *      be buffered in a pbuf before it is committed to L2ARC.  This
 *      puts a soft temporal upper bound on pbuf commit intervals.
 * l2arc_rebuild_enabled : Controls whether L2ARC device adds (either at
 *      pool import or when adding one manually later) will attempt
 *      to rebuild L2ARC buffer contents.  In special circumstances,
 *      the administrator may want to set this to B_FALSE, if they
 *      are having trouble importing a pool or attaching an L2ARC
 *      device (e.g. the L2ARC device is slow to read in stored pbuf
 *      metadata, or the metadata has become somehow
 *      fragmented/unusable).
 * l2arc_rebuild_timeout : A hard timeout value on L2ARC rebuilding to keep
 *      a slow L2ARC device from preventing pool import.  If we
 *      are not done rebuilding an L2ARC device by this time, we
 *      stop the rebuild and return immediately.
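 *
 * These knobs are plain module globals; on illumos an administrator would
 * typically set them at boot from /etc/system (assumed example syntax, not
 * something defined in this file):
 *
 *      set zfs:l2arc_rebuild_enabled = 0
 *      set zfs:l2arc_rebuild_timeout = 120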
929 */ 930 uint64_t l2arc_pbuf_compress_minsz = L2PBUF_COMPRESS_MINSZ; 931 uint64_t l2arc_pbuf_max_sz = L2PBUF_MAXSZ; 932 uint64_t l2arc_pbuf_max_buflists = L2PBUF_MAX_BUFLISTS; 933 boolean_t l2arc_rebuild_enabled = B_TRUE; 934 uint64_t l2arc_rebuild_timeout = L2ARC_REBUILD_TIMEOUT; 935 936 static void l2arc_rebuild_start(l2arc_dev_t *dev); 937 static void l2arc_rebuild(l2arc_dev_t *dev); 938 static void l2arc_pbuf_restore(l2arc_dev_t *dev, l2pbuf_t *pb); 939 static void l2arc_hdr_restore(const l2pbuf_buf_t *buf, l2arc_dev_t *dev, 940 uint64_t guid); 941 942 static int l2arc_uberblock_find(l2arc_dev_t *dev, l2uberblock_t *ub); 943 static int l2arc_pbuf_read(l2arc_dev_t *dev, uint64_t daddr, uint32_t asize, 944 zio_cksum_t cksum, l2pbuf_t *pb, zio_t *this_io, zio_t **next_io); 945 static int l2arc_pbuf_ptr_valid(l2arc_dev_t *dev, uint64_t daddr, 946 uint32_t asize); 947 static zio_t *l2arc_pbuf_prefetch(vdev_t *vd, uint64_t daddr, uint32_t asize); 948 static void l2arc_pbuf_prefetch_abort(zio_t *zio); 949 950 static void l2arc_uberblock_encode(const l2uberblock_t *ub, uint8_t *buf); 951 static void l2arc_uberblock_decode(const uint8_t *buf, l2uberblock_t *ub); 952 static int l2arc_uberblock_verify(const uint8_t *buf, const l2uberblock_t *ub, 953 uint64_t guid); 954 static void l2arc_uberblock_update(l2arc_dev_t *dev, zio_t *pio, 955 l2arc_write_callback_t *cb); 956 957 static uint32_t l2arc_pbuf_encode(l2pbuf_t *pb, uint8_t *buf, uint32_t buflen); 958 static int l2arc_pbuf_decode(uint8_t *buf, uint32_t buflen, 959 l2pbuf_t *pbuf); 960 static int l2arc_pbuf_decode_prev_ptr(const uint8_t *buf, size_t buflen, 961 uint64_t *daddr, uint32_t *asize, zio_cksum_t *cksum); 962 static void l2arc_pbuf_init(l2pbuf_t *pb); 963 static void l2arc_pbuf_destroy(l2pbuf_t *pb); 964 static void l2arc_pbuf_commit(l2arc_dev_t *dev, zio_t *pio, 965 l2arc_write_callback_t *cb); 966 static l2pbuf_buflist_t *l2arc_pbuf_buflist_alloc(l2pbuf_t *pb, int nbufs); 967 static void l2arc_pbuflist_insert(l2pbuf_t *pb, l2pbuf_buflist_t *pbl, 968 const arc_buf_hdr_t *ab, int index); 969 static uint32_t l2arc_pbuf_items_encoded_size(l2pbuf_t *pb); 970 971 static uint64_t 972 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) 973 { 974 uint8_t *vdva = (uint8_t *)dva; 975 uint64_t crc = -1ULL; 976 int i; 977 978 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 979 980 for (i = 0; i < sizeof (dva_t); i++) 981 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; 982 983 crc ^= (spa>>8) ^ birth; 984 985 return (crc); 986 } 987 988 #define BUF_EMPTY(buf) \ 989 ((buf)->b_dva.dva_word[0] == 0 && \ 990 (buf)->b_dva.dva_word[1] == 0 && \ 991 (buf)->b_birth == 0) 992 993 #define BUF_EQUAL(spa, dva, birth, buf) \ 994 ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ 995 ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ 996 ((buf)->b_birth == birth) && ((buf)->b_spa == spa) 997 998 static void 999 buf_discard_identity(arc_buf_hdr_t *hdr) 1000 { 1001 hdr->b_dva.dva_word[0] = 0; 1002 hdr->b_dva.dva_word[1] = 0; 1003 hdr->b_birth = 0; 1004 hdr->b_cksum0 = 0; 1005 } 1006 1007 static arc_buf_hdr_t * 1008 buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp) 1009 { 1010 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); 1011 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 1012 arc_buf_hdr_t *buf; 1013 1014 mutex_enter(hash_lock); 1015 for (buf = buf_hash_table.ht_table[idx]; buf != NULL; 1016 buf = buf->b_hash_next) { 1017 if (BUF_EQUAL(spa, dva, birth, buf)) { 1018 *lockp = hash_lock; 1019 return (buf); 1020 
} 1021 } 1022 mutex_exit(hash_lock); 1023 *lockp = NULL; 1024 return (NULL); 1025 } 1026 1027 /* 1028 * Insert an entry into the hash table. If there is already an element 1029 * equal to elem in the hash table, then the already existing element 1030 * will be returned and the new element will not be inserted. 1031 * Otherwise returns NULL. 1032 */ 1033 static arc_buf_hdr_t * 1034 buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp) 1035 { 1036 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 1037 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 1038 arc_buf_hdr_t *fbuf; 1039 uint32_t i; 1040 1041 ASSERT(!HDR_IN_HASH_TABLE(buf)); 1042 *lockp = hash_lock; 1043 mutex_enter(hash_lock); 1044 for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL; 1045 fbuf = fbuf->b_hash_next, i++) { 1046 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf)) 1047 return (fbuf); 1048 } 1049 1050 buf->b_hash_next = buf_hash_table.ht_table[idx]; 1051 buf_hash_table.ht_table[idx] = buf; 1052 buf->b_flags |= ARC_IN_HASH_TABLE; 1053 1054 /* collect some hash table performance data */ 1055 if (i > 0) { 1056 ARCSTAT_BUMP(arcstat_hash_collisions); 1057 if (i == 1) 1058 ARCSTAT_BUMP(arcstat_hash_chains); 1059 1060 ARCSTAT_MAX(arcstat_hash_chain_max, i); 1061 } 1062 1063 ARCSTAT_BUMP(arcstat_hash_elements); 1064 ARCSTAT_MAXSTAT(arcstat_hash_elements); 1065 1066 return (NULL); 1067 } 1068 1069 static void 1070 buf_hash_remove(arc_buf_hdr_t *buf) 1071 { 1072 arc_buf_hdr_t *fbuf, **bufp; 1073 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 1074 1075 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); 1076 ASSERT(HDR_IN_HASH_TABLE(buf)); 1077 1078 bufp = &buf_hash_table.ht_table[idx]; 1079 while ((fbuf = *bufp) != buf) { 1080 ASSERT(fbuf != NULL); 1081 bufp = &fbuf->b_hash_next; 1082 } 1083 *bufp = buf->b_hash_next; 1084 buf->b_hash_next = NULL; 1085 buf->b_flags &= ~ARC_IN_HASH_TABLE; 1086 1087 /* collect some hash table performance data */ 1088 ARCSTAT_BUMPDOWN(arcstat_hash_elements); 1089 1090 if (buf_hash_table.ht_table[idx] && 1091 buf_hash_table.ht_table[idx]->b_hash_next == NULL) 1092 ARCSTAT_BUMPDOWN(arcstat_hash_chains); 1093 } 1094 1095 /* 1096 * Global data structures and functions for the buf kmem cache. 1097 */ 1098 static kmem_cache_t *hdr_cache; 1099 static kmem_cache_t *buf_cache; 1100 1101 static void 1102 buf_fini(void) 1103 { 1104 int i; 1105 1106 kmem_free(buf_hash_table.ht_table, 1107 (buf_hash_table.ht_mask + 1) * sizeof (void *)); 1108 for (i = 0; i < BUF_LOCKS; i++) 1109 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); 1110 kmem_cache_destroy(hdr_cache); 1111 kmem_cache_destroy(buf_cache); 1112 } 1113 1114 /* 1115 * Constructor callback - called when the cache is empty 1116 * and a new buf is requested. 
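 *
 * For reference, these callbacks are wired to the kmem caches in buf_init()
 * below, e.g.:
 *
 *      hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
 *          0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);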
1117 */ 1118 /* ARGSUSED */ 1119 static int 1120 hdr_cons(void *vbuf, void *unused, int kmflag) 1121 { 1122 arc_buf_hdr_t *buf = vbuf; 1123 1124 bzero(buf, sizeof (arc_buf_hdr_t)); 1125 refcount_create(&buf->b_refcnt); 1126 cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); 1127 mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); 1128 arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); 1129 1130 return (0); 1131 } 1132 1133 /* ARGSUSED */ 1134 static int 1135 buf_cons(void *vbuf, void *unused, int kmflag) 1136 { 1137 arc_buf_t *buf = vbuf; 1138 1139 bzero(buf, sizeof (arc_buf_t)); 1140 mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); 1141 arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); 1142 1143 return (0); 1144 } 1145 1146 /* 1147 * Destructor callback - called when a cached buf is 1148 * no longer required. 1149 */ 1150 /* ARGSUSED */ 1151 static void 1152 hdr_dest(void *vbuf, void *unused) 1153 { 1154 arc_buf_hdr_t *buf = vbuf; 1155 1156 ASSERT(BUF_EMPTY(buf)); 1157 refcount_destroy(&buf->b_refcnt); 1158 cv_destroy(&buf->b_cv); 1159 mutex_destroy(&buf->b_freeze_lock); 1160 arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); 1161 } 1162 1163 /* ARGSUSED */ 1164 static void 1165 buf_dest(void *vbuf, void *unused) 1166 { 1167 arc_buf_t *buf = vbuf; 1168 1169 mutex_destroy(&buf->b_evict_lock); 1170 arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); 1171 } 1172 1173 /* 1174 * Reclaim callback -- invoked when memory is low. 1175 */ 1176 /* ARGSUSED */ 1177 static void 1178 hdr_recl(void *unused) 1179 { 1180 dprintf("hdr_recl called\n"); 1181 /* 1182 * umem calls the reclaim func when we destroy the buf cache, 1183 * which is after we do arc_fini(). 1184 */ 1185 if (!arc_dead) 1186 cv_signal(&arc_reclaim_thr_cv); 1187 } 1188 1189 static void 1190 buf_init(void) 1191 { 1192 uint64_t *ct; 1193 uint64_t hsize = 1ULL << 12; 1194 int i, j; 1195 1196 /* 1197 * The hash table is big enough to fill all of physical memory 1198 * with an average 64K block size. The table will take up 1199 * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers). 
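 *
 * Worked example of the sizing loop below: on a system with 64GB of
 * physical memory, hsize doubles until hsize * 64K >= 64GB, giving
 * 2^20 buckets, i.e. a 2^20 * 8 byte = 8MB table of pointers (matching
 * the 128KB-per-GB figure above).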
1200 */ 1201 while (hsize * 65536 < physmem * PAGESIZE) 1202 hsize <<= 1; 1203 retry: 1204 buf_hash_table.ht_mask = hsize - 1; 1205 buf_hash_table.ht_table = 1206 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); 1207 if (buf_hash_table.ht_table == NULL) { 1208 ASSERT(hsize > (1ULL << 8)); 1209 hsize >>= 1; 1210 goto retry; 1211 } 1212 1213 hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), 1214 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0); 1215 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 1216 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); 1217 1218 for (i = 0; i < 256; i++) 1219 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 1220 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 1221 1222 for (i = 0; i < BUF_LOCKS; i++) { 1223 mutex_init(&buf_hash_table.ht_locks[i].ht_lock, 1224 NULL, MUTEX_DEFAULT, NULL); 1225 } 1226 } 1227 1228 #define ARC_MINTIME (hz>>4) /* 62 ms */ 1229 1230 static void 1231 arc_cksum_verify(arc_buf_t *buf) 1232 { 1233 zio_cksum_t zc; 1234 1235 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1236 return; 1237 1238 mutex_enter(&buf->b_hdr->b_freeze_lock); 1239 if (buf->b_hdr->b_freeze_cksum == NULL || 1240 (buf->b_hdr->b_flags & ARC_IO_ERROR)) { 1241 mutex_exit(&buf->b_hdr->b_freeze_lock); 1242 return; 1243 } 1244 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 1245 if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) 1246 panic("buffer modified while frozen!"); 1247 mutex_exit(&buf->b_hdr->b_freeze_lock); 1248 } 1249 1250 static int 1251 arc_cksum_equal(arc_buf_t *buf) 1252 { 1253 zio_cksum_t zc; 1254 int equal; 1255 1256 mutex_enter(&buf->b_hdr->b_freeze_lock); 1257 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 1258 equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); 1259 mutex_exit(&buf->b_hdr->b_freeze_lock); 1260 1261 return (equal); 1262 } 1263 1264 static void 1265 arc_cksum_compute(arc_buf_t *buf, boolean_t force) 1266 { 1267 if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) 1268 return; 1269 1270 mutex_enter(&buf->b_hdr->b_freeze_lock); 1271 if (buf->b_hdr->b_freeze_cksum != NULL) { 1272 mutex_exit(&buf->b_hdr->b_freeze_lock); 1273 return; 1274 } 1275 buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); 1276 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, 1277 buf->b_hdr->b_freeze_cksum); 1278 mutex_exit(&buf->b_hdr->b_freeze_lock); 1279 arc_buf_watch(buf); 1280 } 1281 1282 #ifndef _KERNEL 1283 typedef struct procctl { 1284 long cmd; 1285 prwatch_t prwatch; 1286 } procctl_t; 1287 #endif 1288 1289 /* ARGSUSED */ 1290 static void 1291 arc_buf_unwatch(arc_buf_t *buf) 1292 { 1293 #ifndef _KERNEL 1294 if (arc_watch) { 1295 int result; 1296 procctl_t ctl; 1297 ctl.cmd = PCWATCH; 1298 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1299 ctl.prwatch.pr_size = 0; 1300 ctl.prwatch.pr_wflags = 0; 1301 result = write(arc_procfd, &ctl, sizeof (ctl)); 1302 ASSERT3U(result, ==, sizeof (ctl)); 1303 } 1304 #endif 1305 } 1306 1307 /* ARGSUSED */ 1308 static void 1309 arc_buf_watch(arc_buf_t *buf) 1310 { 1311 #ifndef _KERNEL 1312 if (arc_watch) { 1313 int result; 1314 procctl_t ctl; 1315 ctl.cmd = PCWATCH; 1316 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1317 ctl.prwatch.pr_size = buf->b_hdr->b_size; 1318 ctl.prwatch.pr_wflags = WA_WRITE; 1319 result = write(arc_procfd, &ctl, sizeof (ctl)); 1320 ASSERT3U(result, ==, sizeof (ctl)); 1321 } 1322 #endif 1323 } 1324 1325 void 1326 arc_buf_thaw(arc_buf_t *buf) 1327 { 1328 if (zfs_flags & ZFS_DEBUG_MODIFY) { 1329 if (buf->b_hdr->b_state != 
arc_anon) 1330 panic("modifying non-anon buffer!"); 1331 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) 1332 panic("modifying buffer while i/o in progress!"); 1333 arc_cksum_verify(buf); 1334 } 1335 1336 mutex_enter(&buf->b_hdr->b_freeze_lock); 1337 if (buf->b_hdr->b_freeze_cksum != NULL) { 1338 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 1339 buf->b_hdr->b_freeze_cksum = NULL; 1340 } 1341 1342 if (zfs_flags & ZFS_DEBUG_MODIFY) { 1343 if (buf->b_hdr->b_thawed) 1344 kmem_free(buf->b_hdr->b_thawed, 1); 1345 buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP); 1346 } 1347 1348 mutex_exit(&buf->b_hdr->b_freeze_lock); 1349 1350 arc_buf_unwatch(buf); 1351 } 1352 1353 void 1354 arc_buf_freeze(arc_buf_t *buf) 1355 { 1356 kmutex_t *hash_lock; 1357 1358 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1359 return; 1360 1361 hash_lock = HDR_LOCK(buf->b_hdr); 1362 mutex_enter(hash_lock); 1363 1364 ASSERT(buf->b_hdr->b_freeze_cksum != NULL || 1365 buf->b_hdr->b_state == arc_anon); 1366 arc_cksum_compute(buf, B_FALSE); 1367 mutex_exit(hash_lock); 1368 1369 } 1370 1371 static void 1372 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) 1373 { 1374 ASSERT(MUTEX_HELD(hash_lock)); 1375 1376 if ((refcount_add(&ab->b_refcnt, tag) == 1) && 1377 (ab->b_state != arc_anon)) { 1378 uint64_t delta = ab->b_size * ab->b_datacnt; 1379 list_t *list = &ab->b_state->arcs_list[ab->b_type]; 1380 uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type]; 1381 1382 ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx)); 1383 mutex_enter(&ab->b_state->arcs_mtx); 1384 ASSERT(list_link_active(&ab->b_arc_node)); 1385 list_remove(list, ab); 1386 if (GHOST_STATE(ab->b_state)) { 1387 ASSERT0(ab->b_datacnt); 1388 ASSERT3P(ab->b_buf, ==, NULL); 1389 delta = ab->b_size; 1390 } 1391 ASSERT(delta > 0); 1392 ASSERT3U(*size, >=, delta); 1393 atomic_add_64(size, -delta); 1394 mutex_exit(&ab->b_state->arcs_mtx); 1395 /* remove the prefetch flag if we get a reference */ 1396 if (ab->b_flags & ARC_PREFETCH) 1397 ab->b_flags &= ~ARC_PREFETCH; 1398 } 1399 } 1400 1401 static int 1402 remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) 1403 { 1404 int cnt; 1405 arc_state_t *state = ab->b_state; 1406 1407 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); 1408 ASSERT(!GHOST_STATE(state)); 1409 1410 if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) && 1411 (state != arc_anon)) { 1412 uint64_t *size = &state->arcs_lsize[ab->b_type]; 1413 1414 ASSERT(!MUTEX_HELD(&state->arcs_mtx)); 1415 mutex_enter(&state->arcs_mtx); 1416 ASSERT(!list_link_active(&ab->b_arc_node)); 1417 list_insert_head(&state->arcs_list[ab->b_type], ab); 1418 ASSERT(ab->b_datacnt > 0); 1419 atomic_add_64(size, ab->b_size * ab->b_datacnt); 1420 mutex_exit(&state->arcs_mtx); 1421 } 1422 return (cnt); 1423 } 1424 1425 /* 1426 * Move the supplied buffer to the indicated state. The mutex 1427 * for the buffer must be held by the caller. 
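 *
 * Illustrative call (the caller shown here is assumed, not copied from this
 * excerpt): promoting a buffer into the frequently-used state boils down to
 *
 *      arc_change_state(arc_mfu, ab, hash_lock);
 *
 * which, for an unreferenced buffer, unlinks ab from its old state's
 * arcs_list, adjusts arcs_lsize/arcs_size on both states, and finally sets
 * ab->b_state = arc_mfu.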
1428 */ 1429 static void 1430 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) 1431 { 1432 arc_state_t *old_state = ab->b_state; 1433 int64_t refcnt = refcount_count(&ab->b_refcnt); 1434 uint64_t from_delta, to_delta; 1435 1436 ASSERT(MUTEX_HELD(hash_lock)); 1437 ASSERT(new_state != old_state); 1438 ASSERT(refcnt == 0 || ab->b_datacnt > 0); 1439 ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state)); 1440 ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon); 1441 1442 from_delta = to_delta = ab->b_datacnt * ab->b_size; 1443 1444 /* 1445 * If this buffer is evictable, transfer it from the 1446 * old state list to the new state list. 1447 */ 1448 if (refcnt == 0) { 1449 if (old_state != arc_anon) { 1450 int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx); 1451 uint64_t *size = &old_state->arcs_lsize[ab->b_type]; 1452 1453 if (use_mutex) 1454 mutex_enter(&old_state->arcs_mtx); 1455 1456 ASSERT(list_link_active(&ab->b_arc_node)); 1457 list_remove(&old_state->arcs_list[ab->b_type], ab); 1458 1459 /* 1460 * If prefetching out of the ghost cache, 1461 * we will have a non-zero datacnt. 1462 */ 1463 if (GHOST_STATE(old_state) && ab->b_datacnt == 0) { 1464 /* ghost elements have a ghost size */ 1465 ASSERT(ab->b_buf == NULL); 1466 from_delta = ab->b_size; 1467 } 1468 ASSERT3U(*size, >=, from_delta); 1469 atomic_add_64(size, -from_delta); 1470 1471 if (use_mutex) 1472 mutex_exit(&old_state->arcs_mtx); 1473 } 1474 if (new_state != arc_anon) { 1475 int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx); 1476 uint64_t *size = &new_state->arcs_lsize[ab->b_type]; 1477 1478 if (use_mutex) 1479 mutex_enter(&new_state->arcs_mtx); 1480 1481 list_insert_head(&new_state->arcs_list[ab->b_type], ab); 1482 1483 /* ghost elements have a ghost size */ 1484 if (GHOST_STATE(new_state)) { 1485 ASSERT(ab->b_datacnt == 0); 1486 ASSERT(ab->b_buf == NULL); 1487 to_delta = ab->b_size; 1488 } 1489 atomic_add_64(size, to_delta); 1490 1491 if (use_mutex) 1492 mutex_exit(&new_state->arcs_mtx); 1493 } 1494 } 1495 1496 ASSERT(!BUF_EMPTY(ab)); 1497 if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab)) 1498 buf_hash_remove(ab); 1499 1500 /* adjust state sizes */ 1501 if (to_delta) 1502 atomic_add_64(&new_state->arcs_size, to_delta); 1503 if (from_delta) { 1504 ASSERT3U(old_state->arcs_size, >=, from_delta); 1505 atomic_add_64(&old_state->arcs_size, -from_delta); 1506 } 1507 ab->b_state = new_state; 1508 1509 /* adjust l2arc hdr stats */ 1510 if (new_state == arc_l2c_only) 1511 l2arc_hdr_stat_add(old_state != arc_anon); 1512 else if (old_state == arc_l2c_only) 1513 l2arc_hdr_stat_remove(); 1514 } 1515 1516 void 1517 arc_space_consume(uint64_t space, arc_space_type_t type) 1518 { 1519 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 1520 1521 switch (type) { 1522 case ARC_SPACE_DATA: 1523 ARCSTAT_INCR(arcstat_data_size, space); 1524 break; 1525 case ARC_SPACE_OTHER: 1526 ARCSTAT_INCR(arcstat_other_size, space); 1527 break; 1528 case ARC_SPACE_HDRS: 1529 ARCSTAT_INCR(arcstat_hdr_size, space); 1530 break; 1531 case ARC_SPACE_L2HDRS: 1532 ARCSTAT_INCR(arcstat_l2_hdr_size, space); 1533 break; 1534 } 1535 1536 ARCSTAT_INCR(arcstat_meta_used, space); 1537 atomic_add_64(&arc_size, space); 1538 } 1539 1540 void 1541 arc_space_return(uint64_t space, arc_space_type_t type) 1542 { 1543 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 1544 1545 switch (type) { 1546 case ARC_SPACE_DATA: 1547 ARCSTAT_INCR(arcstat_data_size, -space); 1548 break; 1549 case ARC_SPACE_OTHER: 1550 ARCSTAT_INCR(arcstat_other_size, -space); 1551 
break; 1552 case ARC_SPACE_HDRS: 1553 ARCSTAT_INCR(arcstat_hdr_size, -space); 1554 break; 1555 case ARC_SPACE_L2HDRS: 1556 ARCSTAT_INCR(arcstat_l2_hdr_size, -space); 1557 break; 1558 } 1559 1560 ASSERT(arc_meta_used >= space); 1561 if (arc_meta_max < arc_meta_used) 1562 arc_meta_max = arc_meta_used; 1563 ARCSTAT_INCR(arcstat_meta_used, -space); 1564 ASSERT(arc_size >= space); 1565 atomic_add_64(&arc_size, -space); 1566 } 1567 1568 void * 1569 arc_data_buf_alloc(uint64_t size) 1570 { 1571 if (arc_evict_needed(ARC_BUFC_DATA)) 1572 cv_signal(&arc_reclaim_thr_cv); 1573 atomic_add_64(&arc_size, size); 1574 return (zio_data_buf_alloc(size)); 1575 } 1576 1577 void 1578 arc_data_buf_free(void *buf, uint64_t size) 1579 { 1580 zio_data_buf_free(buf, size); 1581 ASSERT(arc_size >= size); 1582 atomic_add_64(&arc_size, -size); 1583 } 1584 1585 arc_buf_t * 1586 arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) 1587 { 1588 arc_buf_hdr_t *hdr; 1589 arc_buf_t *buf; 1590 1591 ASSERT3U(size, >, 0); 1592 hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); 1593 ASSERT(BUF_EMPTY(hdr)); 1594 hdr->b_size = size; 1595 hdr->b_type = type; 1596 hdr->b_spa = spa_load_guid(spa); 1597 hdr->b_state = arc_anon; 1598 hdr->b_arc_access = 0; 1599 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 1600 buf->b_hdr = hdr; 1601 buf->b_data = NULL; 1602 buf->b_efunc = NULL; 1603 buf->b_private = NULL; 1604 buf->b_next = NULL; 1605 hdr->b_buf = buf; 1606 arc_get_data_buf(buf); 1607 hdr->b_datacnt = 1; 1608 hdr->b_flags = 0; 1609 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1610 (void) refcount_add(&hdr->b_refcnt, tag); 1611 1612 return (buf); 1613 } 1614 1615 /* 1616 * Allocates an empty arc_buf_hdr structure (lacking any data buffer). 1617 * This is used during l2arc reconstruction to make empty ARC buffers 1618 * which circumvent the regular disk->arc->l2arc path and instead come 1619 * into being in the reverse order, i.e. l2arc->arc->(disk). 1620 */ 1621 arc_buf_hdr_t * 1622 arc_buf_hdr_alloc(uint64_t guid, int size, arc_buf_contents_t type) 1623 { 1624 arc_buf_hdr_t *hdr; 1625 1626 ASSERT3U(size, >, 0); 1627 hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); 1628 ASSERT(BUF_EMPTY(hdr)); 1629 hdr->b_size = size; 1630 hdr->b_type = type; 1631 hdr->b_spa = guid; 1632 hdr->b_state = arc_anon; 1633 hdr->b_arc_access = 0; 1634 hdr->b_buf = NULL; 1635 hdr->b_datacnt = 0; 1636 hdr->b_flags = 0; 1637 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1638 1639 return (hdr); 1640 } 1641 1642 static char *arc_onloan_tag = "onloan"; 1643 1644 /* 1645 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in 1646 * flight data by arc_tempreserve_space() until they are "returned". Loaned 1647 * buffers must be returned to the arc before they can be used by the DMU or 1648 * freed. 1649 */ 1650 arc_buf_t * 1651 arc_loan_buf(spa_t *spa, int size) 1652 { 1653 arc_buf_t *buf; 1654 1655 buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA); 1656 1657 atomic_add_64(&arc_loaned_bytes, size); 1658 return (buf); 1659 } 1660 1661 /* 1662 * Return a loaned arc buffer to the arc. 
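 *
 * Illustrative lifecycle of a loaned buffer (the caller is assumed, e.g.
 * the DMU):
 *
 *      arc_buf_t *abuf = arc_loan_buf(spa, size);
 *      (fill abuf->b_data with dirty data)
 *      arc_return_buf(abuf, tag);
 *
 * arc_loan_buf() charges arc_loaned_bytes and holds the header with
 * arc_onloan_tag; arc_return_buf() swaps that hold for the caller's tag
 * and credits arc_loaned_bytes back.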
1663 */ 1664 void 1665 arc_return_buf(arc_buf_t *buf, void *tag) 1666 { 1667 arc_buf_hdr_t *hdr = buf->b_hdr; 1668 1669 ASSERT(buf->b_data != NULL); 1670 (void) refcount_add(&hdr->b_refcnt, tag); 1671 (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag); 1672 1673 atomic_add_64(&arc_loaned_bytes, -hdr->b_size); 1674 } 1675 1676 /* Detach an arc_buf from a dbuf (tag) */ 1677 void 1678 arc_loan_inuse_buf(arc_buf_t *buf, void *tag) 1679 { 1680 arc_buf_hdr_t *hdr; 1681 1682 ASSERT(buf->b_data != NULL); 1683 hdr = buf->b_hdr; 1684 (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag); 1685 (void) refcount_remove(&hdr->b_refcnt, tag); 1686 buf->b_efunc = NULL; 1687 buf->b_private = NULL; 1688 1689 atomic_add_64(&arc_loaned_bytes, hdr->b_size); 1690 } 1691 1692 static arc_buf_t * 1693 arc_buf_clone(arc_buf_t *from) 1694 { 1695 arc_buf_t *buf; 1696 arc_buf_hdr_t *hdr = from->b_hdr; 1697 uint64_t size = hdr->b_size; 1698 1699 ASSERT(hdr->b_state != arc_anon); 1700 1701 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 1702 buf->b_hdr = hdr; 1703 buf->b_data = NULL; 1704 buf->b_efunc = NULL; 1705 buf->b_private = NULL; 1706 buf->b_next = hdr->b_buf; 1707 hdr->b_buf = buf; 1708 arc_get_data_buf(buf); 1709 bcopy(from->b_data, buf->b_data, size); 1710 1711 /* 1712 * This buffer already exists in the arc so create a duplicate 1713 * copy for the caller. If the buffer is associated with user data 1714 * then track the size and number of duplicates. These stats will be 1715 * updated as duplicate buffers are created and destroyed. 1716 */ 1717 if (hdr->b_type == ARC_BUFC_DATA) { 1718 ARCSTAT_BUMP(arcstat_duplicate_buffers); 1719 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); 1720 } 1721 hdr->b_datacnt += 1; 1722 return (buf); 1723 } 1724 1725 void 1726 arc_buf_add_ref(arc_buf_t *buf, void* tag) 1727 { 1728 arc_buf_hdr_t *hdr; 1729 kmutex_t *hash_lock; 1730 1731 /* 1732 * Check to see if this buffer is evicted. Callers 1733 * must verify b_data != NULL to know if the add_ref 1734 * was successful. 1735 */ 1736 mutex_enter(&buf->b_evict_lock); 1737 if (buf->b_data == NULL) { 1738 mutex_exit(&buf->b_evict_lock); 1739 return; 1740 } 1741 hash_lock = HDR_LOCK(buf->b_hdr); 1742 mutex_enter(hash_lock); 1743 hdr = buf->b_hdr; 1744 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 1745 mutex_exit(&buf->b_evict_lock); 1746 1747 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 1748 add_reference(hdr, hash_lock, tag); 1749 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 1750 arc_access(hdr, hash_lock); 1751 mutex_exit(hash_lock); 1752 ARCSTAT_BUMP(arcstat_hits); 1753 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 1754 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 1755 data, metadata, hits); 1756 } 1757 1758 /* 1759 * Free the arc data buffer. If it is an l2arc write in progress, 1760 * the buffer is placed on l2arc_free_on_write to be freed later. 
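 *
 * The free_func supplied here is expected to match how b_data was
 * allocated; in this file that is zio_buf_free() for metadata and
 * zio_data_buf_free() for data, as used by arc_buf_destroy() below, e.g.:
 *
 *	arc_buf_data_free(buf, zio_data_buf_free);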
1761 */ 1762 static void 1763 arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t)) 1764 { 1765 arc_buf_hdr_t *hdr = buf->b_hdr; 1766 1767 if (HDR_L2_WRITING(hdr)) { 1768 l2arc_data_free_t *df; 1769 df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP); 1770 df->l2df_data = buf->b_data; 1771 df->l2df_size = hdr->b_size; 1772 df->l2df_func = free_func; 1773 mutex_enter(&l2arc_free_on_write_mtx); 1774 list_insert_head(l2arc_free_on_write, df); 1775 mutex_exit(&l2arc_free_on_write_mtx); 1776 ARCSTAT_BUMP(arcstat_l2_free_on_write); 1777 } else { 1778 free_func(buf->b_data, hdr->b_size); 1779 } 1780 } 1781 1782 static void 1783 arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) 1784 { 1785 arc_buf_t **bufp; 1786 1787 /* free up data associated with the buf */ 1788 if (buf->b_data) { 1789 arc_state_t *state = buf->b_hdr->b_state; 1790 uint64_t size = buf->b_hdr->b_size; 1791 arc_buf_contents_t type = buf->b_hdr->b_type; 1792 1793 arc_cksum_verify(buf); 1794 arc_buf_unwatch(buf); 1795 1796 if (!recycle) { 1797 if (type == ARC_BUFC_METADATA) { 1798 arc_buf_data_free(buf, zio_buf_free); 1799 arc_space_return(size, ARC_SPACE_DATA); 1800 } else { 1801 ASSERT(type == ARC_BUFC_DATA); 1802 arc_buf_data_free(buf, zio_data_buf_free); 1803 ARCSTAT_INCR(arcstat_data_size, -size); 1804 atomic_add_64(&arc_size, -size); 1805 } 1806 } 1807 if (list_link_active(&buf->b_hdr->b_arc_node)) { 1808 uint64_t *cnt = &state->arcs_lsize[type]; 1809 1810 ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt)); 1811 ASSERT(state != arc_anon); 1812 1813 ASSERT3U(*cnt, >=, size); 1814 atomic_add_64(cnt, -size); 1815 } 1816 ASSERT3U(state->arcs_size, >=, size); 1817 atomic_add_64(&state->arcs_size, -size); 1818 buf->b_data = NULL; 1819 1820 /* 1821 * If we're destroying a duplicate buffer make sure 1822 * that the appropriate statistics are updated. 1823 */ 1824 if (buf->b_hdr->b_datacnt > 1 && 1825 buf->b_hdr->b_type == ARC_BUFC_DATA) { 1826 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); 1827 ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size); 1828 } 1829 ASSERT(buf->b_hdr->b_datacnt > 0); 1830 buf->b_hdr->b_datacnt -= 1; 1831 } 1832 1833 /* only remove the buf if requested */ 1834 if (!all) 1835 return; 1836 1837 /* remove the buf from the hdr list */ 1838 for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next) 1839 continue; 1840 *bufp = buf->b_next; 1841 buf->b_next = NULL; 1842 1843 ASSERT(buf->b_efunc == NULL); 1844 1845 /* clean up the buf */ 1846 buf->b_hdr = NULL; 1847 kmem_cache_free(buf_cache, buf); 1848 } 1849 1850 static void 1851 arc_hdr_destroy(arc_buf_hdr_t *hdr) 1852 { 1853 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1854 ASSERT3P(hdr->b_state, ==, arc_anon); 1855 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 1856 l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr; 1857 1858 if (l2hdr != NULL) { 1859 boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx); 1860 /* 1861 * To prevent arc_free() and l2arc_evict() from 1862 * attempting to free the same buffer at the same time, 1863 * a FREE_IN_PROGRESS flag is given to arc_free() to 1864 * give it priority. l2arc_evict() can't destroy this 1865 * header while we are waiting on l2arc_buflist_mtx. 1866 * 1867 * The hdr may be removed from l2ad_buflist before we 1868 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked. 
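		 *
		 * (If the caller already holds l2arc_buflist_mtx, b_l2hdr
		 * cannot change underneath us and no re-read is needed;
		 * that is what the buflist_held test below checks.)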
1869 */ 1870 if (!buflist_held) { 1871 mutex_enter(&l2arc_buflist_mtx); 1872 l2hdr = hdr->b_l2hdr; 1873 } 1874 1875 if (l2hdr != NULL) { 1876 list_remove(l2hdr->b_dev->l2ad_buflist, hdr); 1877 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 1878 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); 1879 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); 1880 if (hdr->b_state == arc_l2c_only) 1881 l2arc_hdr_stat_remove(); 1882 hdr->b_l2hdr = NULL; 1883 } 1884 1885 if (!buflist_held) 1886 mutex_exit(&l2arc_buflist_mtx); 1887 } 1888 1889 if (!BUF_EMPTY(hdr)) { 1890 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 1891 buf_discard_identity(hdr); 1892 } 1893 while (hdr->b_buf) { 1894 arc_buf_t *buf = hdr->b_buf; 1895 1896 if (buf->b_efunc) { 1897 mutex_enter(&arc_eviction_mtx); 1898 mutex_enter(&buf->b_evict_lock); 1899 ASSERT(buf->b_hdr != NULL); 1900 arc_buf_destroy(hdr->b_buf, FALSE, FALSE); 1901 hdr->b_buf = buf->b_next; 1902 buf->b_hdr = &arc_eviction_hdr; 1903 buf->b_next = arc_eviction_list; 1904 arc_eviction_list = buf; 1905 mutex_exit(&buf->b_evict_lock); 1906 mutex_exit(&arc_eviction_mtx); 1907 } else { 1908 arc_buf_destroy(hdr->b_buf, FALSE, TRUE); 1909 } 1910 } 1911 if (hdr->b_freeze_cksum != NULL) { 1912 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 1913 hdr->b_freeze_cksum = NULL; 1914 } 1915 if (hdr->b_thawed) { 1916 kmem_free(hdr->b_thawed, 1); 1917 hdr->b_thawed = NULL; 1918 } 1919 1920 ASSERT(!list_link_active(&hdr->b_arc_node)); 1921 ASSERT3P(hdr->b_hash_next, ==, NULL); 1922 ASSERT3P(hdr->b_acb, ==, NULL); 1923 kmem_cache_free(hdr_cache, hdr); 1924 } 1925 1926 void 1927 arc_buf_free(arc_buf_t *buf, void *tag) 1928 { 1929 arc_buf_hdr_t *hdr = buf->b_hdr; 1930 int hashed = hdr->b_state != arc_anon; 1931 1932 ASSERT(buf->b_efunc == NULL); 1933 ASSERT(buf->b_data != NULL); 1934 1935 if (hashed) { 1936 kmutex_t *hash_lock = HDR_LOCK(hdr); 1937 1938 mutex_enter(hash_lock); 1939 hdr = buf->b_hdr; 1940 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 1941 1942 (void) remove_reference(hdr, hash_lock, tag); 1943 if (hdr->b_datacnt > 1) { 1944 arc_buf_destroy(buf, FALSE, TRUE); 1945 } else { 1946 ASSERT(buf == hdr->b_buf); 1947 ASSERT(buf->b_efunc == NULL); 1948 hdr->b_flags |= ARC_BUF_AVAILABLE; 1949 } 1950 mutex_exit(hash_lock); 1951 } else if (HDR_IO_IN_PROGRESS(hdr)) { 1952 int destroy_hdr; 1953 /* 1954 * We are in the middle of an async write. Don't destroy 1955 * this buffer unless the write completes before we finish 1956 * decrementing the reference count. 
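		 *
		 * The reference is dropped and HDR_IO_IN_PROGRESS is
		 * re-checked while arc_eviction_mtx is held; the header is
		 * only destroyed here if the write has already completed.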
1957 */ 1958 mutex_enter(&arc_eviction_mtx); 1959 (void) remove_reference(hdr, NULL, tag); 1960 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1961 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); 1962 mutex_exit(&arc_eviction_mtx); 1963 if (destroy_hdr) 1964 arc_hdr_destroy(hdr); 1965 } else { 1966 if (remove_reference(hdr, NULL, tag) > 0) 1967 arc_buf_destroy(buf, FALSE, TRUE); 1968 else 1969 arc_hdr_destroy(hdr); 1970 } 1971 } 1972 1973 boolean_t 1974 arc_buf_remove_ref(arc_buf_t *buf, void* tag) 1975 { 1976 arc_buf_hdr_t *hdr = buf->b_hdr; 1977 kmutex_t *hash_lock = HDR_LOCK(hdr); 1978 boolean_t no_callback = (buf->b_efunc == NULL); 1979 1980 if (hdr->b_state == arc_anon) { 1981 ASSERT(hdr->b_datacnt == 1); 1982 arc_buf_free(buf, tag); 1983 return (no_callback); 1984 } 1985 1986 mutex_enter(hash_lock); 1987 hdr = buf->b_hdr; 1988 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 1989 ASSERT(hdr->b_state != arc_anon); 1990 ASSERT(buf->b_data != NULL); 1991 1992 (void) remove_reference(hdr, hash_lock, tag); 1993 if (hdr->b_datacnt > 1) { 1994 if (no_callback) 1995 arc_buf_destroy(buf, FALSE, TRUE); 1996 } else if (no_callback) { 1997 ASSERT(hdr->b_buf == buf && buf->b_next == NULL); 1998 ASSERT(buf->b_efunc == NULL); 1999 hdr->b_flags |= ARC_BUF_AVAILABLE; 2000 } 2001 ASSERT(no_callback || hdr->b_datacnt > 1 || 2002 refcount_is_zero(&hdr->b_refcnt)); 2003 mutex_exit(hash_lock); 2004 return (no_callback); 2005 } 2006 2007 int 2008 arc_buf_size(arc_buf_t *buf) 2009 { 2010 return (buf->b_hdr->b_size); 2011 } 2012 2013 /* 2014 * Called from the DMU to determine if the current buffer should be 2015 * evicted. In order to ensure proper locking, the eviction must be initiated 2016 * from the DMU. Return true if the buffer is associated with user data and 2017 * duplicate buffers still exist. 2018 */ 2019 boolean_t 2020 arc_buf_eviction_needed(arc_buf_t *buf) 2021 { 2022 arc_buf_hdr_t *hdr; 2023 boolean_t evict_needed = B_FALSE; 2024 2025 if (zfs_disable_dup_eviction) 2026 return (B_FALSE); 2027 2028 mutex_enter(&buf->b_evict_lock); 2029 hdr = buf->b_hdr; 2030 if (hdr == NULL) { 2031 /* 2032 * We are in arc_do_user_evicts(); let that function 2033 * perform the eviction. 2034 */ 2035 ASSERT(buf->b_data == NULL); 2036 mutex_exit(&buf->b_evict_lock); 2037 return (B_FALSE); 2038 } else if (buf->b_data == NULL) { 2039 /* 2040 * We have already been added to the arc eviction list; 2041 * recommend eviction. 2042 */ 2043 ASSERT3P(hdr, ==, &arc_eviction_hdr); 2044 mutex_exit(&buf->b_evict_lock); 2045 return (B_TRUE); 2046 } 2047 2048 if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA) 2049 evict_needed = B_TRUE; 2050 2051 mutex_exit(&buf->b_evict_lock); 2052 return (evict_needed); 2053 } 2054 2055 /* 2056 * Evict buffers from list until we've removed the specified number of 2057 * bytes. Move the removed buffers to the appropriate evict state. 2058 * If the recycle flag is set, then attempt to "recycle" a buffer: 2059 * - look for a buffer to evict that is `bytes' long. 2060 * - return the data block from this buffer rather than freeing it. 2061 * This flag is used by callers that are trying to make space for a 2062 * new buffer in a full arc cache. 2063 * 2064 * This function makes a "best effort". It skips over any buffers 2065 * it can't get a hash_lock on, and so may not catch all candidates. 2066 * It may also return without evicting as much space as requested. 
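 *
 * For example, arc_adjust() trims the MRU data list with calls of the
 * form (existing usage from later in this file, not a new interface):
 *
 *	delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
 *	(void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA);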
2067 */ 2068 static void * 2069 arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, 2070 arc_buf_contents_t type) 2071 { 2072 arc_state_t *evicted_state; 2073 uint64_t bytes_evicted = 0, skipped = 0, missed = 0; 2074 arc_buf_hdr_t *ab, *ab_prev = NULL; 2075 list_t *list = &state->arcs_list[type]; 2076 kmutex_t *hash_lock; 2077 boolean_t have_lock; 2078 void *stolen = NULL; 2079 2080 ASSERT(state == arc_mru || state == arc_mfu); 2081 2082 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; 2083 2084 mutex_enter(&state->arcs_mtx); 2085 mutex_enter(&evicted_state->arcs_mtx); 2086 2087 for (ab = list_tail(list); ab; ab = ab_prev) { 2088 ab_prev = list_prev(list, ab); 2089 /* prefetch buffers have a minimum lifespan */ 2090 if (HDR_IO_IN_PROGRESS(ab) || 2091 (spa && ab->b_spa != spa) || 2092 (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && 2093 ddi_get_lbolt() - ab->b_arc_access < 2094 arc_min_prefetch_lifespan)) { 2095 skipped++; 2096 continue; 2097 } 2098 /* "lookahead" for better eviction candidate */ 2099 if (recycle && ab->b_size != bytes && 2100 ab_prev && ab_prev->b_size == bytes) 2101 continue; 2102 hash_lock = HDR_LOCK(ab); 2103 have_lock = MUTEX_HELD(hash_lock); 2104 if (have_lock || mutex_tryenter(hash_lock)) { 2105 ASSERT0(refcount_count(&ab->b_refcnt)); 2106 ASSERT(ab->b_datacnt > 0); 2107 while (ab->b_buf) { 2108 arc_buf_t *buf = ab->b_buf; 2109 if (!mutex_tryenter(&buf->b_evict_lock)) { 2110 missed += 1; 2111 break; 2112 } 2113 if (buf->b_data) { 2114 bytes_evicted += ab->b_size; 2115 if (recycle && ab->b_type == type && 2116 ab->b_size == bytes && 2117 !HDR_L2_WRITING(ab)) { 2118 stolen = buf->b_data; 2119 recycle = FALSE; 2120 } 2121 } 2122 if (buf->b_efunc) { 2123 mutex_enter(&arc_eviction_mtx); 2124 arc_buf_destroy(buf, 2125 buf->b_data == stolen, FALSE); 2126 ab->b_buf = buf->b_next; 2127 buf->b_hdr = &arc_eviction_hdr; 2128 buf->b_next = arc_eviction_list; 2129 arc_eviction_list = buf; 2130 mutex_exit(&arc_eviction_mtx); 2131 mutex_exit(&buf->b_evict_lock); 2132 } else { 2133 mutex_exit(&buf->b_evict_lock); 2134 arc_buf_destroy(buf, 2135 buf->b_data == stolen, TRUE); 2136 } 2137 } 2138 2139 if (ab->b_l2hdr) { 2140 ARCSTAT_INCR(arcstat_evict_l2_cached, 2141 ab->b_size); 2142 } else { 2143 if (l2arc_write_eligible(ab->b_spa, ab)) { 2144 ARCSTAT_INCR(arcstat_evict_l2_eligible, 2145 ab->b_size); 2146 } else { 2147 ARCSTAT_INCR( 2148 arcstat_evict_l2_ineligible, 2149 ab->b_size); 2150 } 2151 } 2152 2153 if (ab->b_datacnt == 0) { 2154 arc_change_state(evicted_state, ab, hash_lock); 2155 ASSERT(HDR_IN_HASH_TABLE(ab)); 2156 ab->b_flags |= ARC_IN_HASH_TABLE; 2157 ab->b_flags &= ~ARC_BUF_AVAILABLE; 2158 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); 2159 } 2160 if (!have_lock) 2161 mutex_exit(hash_lock); 2162 if (bytes >= 0 && bytes_evicted >= bytes) 2163 break; 2164 } else { 2165 missed += 1; 2166 } 2167 } 2168 2169 mutex_exit(&evicted_state->arcs_mtx); 2170 mutex_exit(&state->arcs_mtx); 2171 2172 if (bytes_evicted < bytes) 2173 dprintf("only evicted %lld bytes from %x", 2174 (longlong_t)bytes_evicted, state); 2175 2176 if (skipped) 2177 ARCSTAT_INCR(arcstat_evict_skip, skipped); 2178 2179 if (missed) 2180 ARCSTAT_INCR(arcstat_mutex_miss, missed); 2181 2182 /* 2183 * We have just evicted some data into the ghost state, make 2184 * sure we also adjust the ghost state size if necessary. 
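	 *
	 * This only happens while arc_no_grow is set; the intent of the
	 * block below is to keep the combined ghost list sizes from
	 * exceeding the target cache size (arc_c).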
2185 */ 2186 if (arc_no_grow && 2187 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) { 2188 int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size + 2189 arc_mru_ghost->arcs_size - arc_c; 2190 2191 if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) { 2192 int64_t todelete = 2193 MIN(arc_mru_ghost->arcs_lsize[type], mru_over); 2194 arc_evict_ghost(arc_mru_ghost, NULL, todelete); 2195 } else if (arc_mfu_ghost->arcs_lsize[type] > 0) { 2196 int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type], 2197 arc_mru_ghost->arcs_size + 2198 arc_mfu_ghost->arcs_size - arc_c); 2199 arc_evict_ghost(arc_mfu_ghost, NULL, todelete); 2200 } 2201 } 2202 2203 return (stolen); 2204 } 2205 2206 /* 2207 * Remove buffers from list until we've removed the specified number of 2208 * bytes. Destroy the buffers that are removed. 2209 */ 2210 static void 2211 arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes) 2212 { 2213 arc_buf_hdr_t *ab, *ab_prev; 2214 arc_buf_hdr_t marker = { 0 }; 2215 list_t *list = &state->arcs_list[ARC_BUFC_DATA]; 2216 kmutex_t *hash_lock; 2217 uint64_t bytes_deleted = 0; 2218 uint64_t bufs_skipped = 0; 2219 2220 ASSERT(GHOST_STATE(state)); 2221 top: 2222 mutex_enter(&state->arcs_mtx); 2223 for (ab = list_tail(list); ab; ab = ab_prev) { 2224 ab_prev = list_prev(list, ab); 2225 if (spa && ab->b_spa != spa) 2226 continue; 2227 2228 /* ignore markers */ 2229 if (ab->b_spa == 0) 2230 continue; 2231 2232 hash_lock = HDR_LOCK(ab); 2233 /* caller may be trying to modify this buffer, skip it */ 2234 if (MUTEX_HELD(hash_lock)) 2235 continue; 2236 if (mutex_tryenter(hash_lock)) { 2237 ASSERT(!HDR_IO_IN_PROGRESS(ab)); 2238 ASSERT(ab->b_buf == NULL); 2239 ARCSTAT_BUMP(arcstat_deleted); 2240 bytes_deleted += ab->b_size; 2241 2242 if (ab->b_l2hdr != NULL) { 2243 /* 2244 * This buffer is cached on the 2nd Level ARC; 2245 * don't destroy the header. 2246 */ 2247 arc_change_state(arc_l2c_only, ab, hash_lock); 2248 mutex_exit(hash_lock); 2249 } else { 2250 arc_change_state(arc_anon, ab, hash_lock); 2251 mutex_exit(hash_lock); 2252 arc_hdr_destroy(ab); 2253 } 2254 2255 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); 2256 if (bytes >= 0 && bytes_deleted >= bytes) 2257 break; 2258 } else if (bytes < 0) { 2259 /* 2260 * Insert a list marker and then wait for the 2261 * hash lock to become available. Once it's 2262 * available, restart from where we left off.
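			 *
			 * The marker is a zeroed header, so the b_spa == 0
			 * test at the top of this loop ("ignore markers")
			 * keeps other walkers from treating it as a real
			 * buffer.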
2263 */ 2264 list_insert_after(list, ab, &marker); 2265 mutex_exit(&state->arcs_mtx); 2266 mutex_enter(hash_lock); 2267 mutex_exit(hash_lock); 2268 mutex_enter(&state->arcs_mtx); 2269 ab_prev = list_prev(list, &marker); 2270 list_remove(list, &marker); 2271 } else 2272 bufs_skipped += 1; 2273 } 2274 mutex_exit(&state->arcs_mtx); 2275 2276 if (list == &state->arcs_list[ARC_BUFC_DATA] && 2277 (bytes < 0 || bytes_deleted < bytes)) { 2278 list = &state->arcs_list[ARC_BUFC_METADATA]; 2279 goto top; 2280 } 2281 2282 if (bufs_skipped) { 2283 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); 2284 ASSERT(bytes >= 0); 2285 } 2286 2287 if (bytes_deleted < bytes) 2288 dprintf("only deleted %lld bytes from %p", 2289 (longlong_t)bytes_deleted, state); 2290 } 2291 2292 static void 2293 arc_adjust(void) 2294 { 2295 int64_t adjustment, delta; 2296 2297 /* 2298 * Adjust MRU size 2299 */ 2300 2301 adjustment = MIN((int64_t)(arc_size - arc_c), 2302 (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - 2303 arc_p)); 2304 2305 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { 2306 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment); 2307 (void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA); 2308 adjustment -= delta; 2309 } 2310 2311 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { 2312 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment); 2313 (void) arc_evict(arc_mru, NULL, delta, FALSE, 2314 ARC_BUFC_METADATA); 2315 } 2316 2317 /* 2318 * Adjust MFU size 2319 */ 2320 2321 adjustment = arc_size - arc_c; 2322 2323 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { 2324 delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]); 2325 (void) arc_evict(arc_mfu, NULL, delta, FALSE, ARC_BUFC_DATA); 2326 adjustment -= delta; 2327 } 2328 2329 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { 2330 int64_t delta = MIN(adjustment, 2331 arc_mfu->arcs_lsize[ARC_BUFC_METADATA]); 2332 (void) arc_evict(arc_mfu, NULL, delta, FALSE, 2333 ARC_BUFC_METADATA); 2334 } 2335 2336 /* 2337 * Adjust ghost lists 2338 */ 2339 2340 adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c; 2341 2342 if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) { 2343 delta = MIN(arc_mru_ghost->arcs_size, adjustment); 2344 arc_evict_ghost(arc_mru_ghost, NULL, delta); 2345 } 2346 2347 adjustment = 2348 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c; 2349 2350 if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) { 2351 delta = MIN(arc_mfu_ghost->arcs_size, adjustment); 2352 arc_evict_ghost(arc_mfu_ghost, NULL, delta); 2353 } 2354 } 2355 2356 static void 2357 arc_do_user_evicts(void) 2358 { 2359 mutex_enter(&arc_eviction_mtx); 2360 while (arc_eviction_list != NULL) { 2361 arc_buf_t *buf = arc_eviction_list; 2362 arc_eviction_list = buf->b_next; 2363 mutex_enter(&buf->b_evict_lock); 2364 buf->b_hdr = NULL; 2365 mutex_exit(&buf->b_evict_lock); 2366 mutex_exit(&arc_eviction_mtx); 2367 2368 if (buf->b_efunc != NULL) 2369 VERIFY(buf->b_efunc(buf) == 0); 2370 2371 buf->b_efunc = NULL; 2372 buf->b_private = NULL; 2373 kmem_cache_free(buf_cache, buf); 2374 mutex_enter(&arc_eviction_mtx); 2375 } 2376 mutex_exit(&arc_eviction_mtx); 2377 } 2378 2379 /* 2380 * Flush all *evictable* data from the cache for the given spa. 2381 * NOTE: this will not touch "active" (i.e. referenced) data. 
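 *
 * Passing a NULL spa flushes every evictable buffer; passing a specific
 * spa only removes buffers whose b_spa matches that pool's load guid.
 * A caller discarding a pool's cached state would simply do (sketch):
 *
 *	arc_flush(spa);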
2382 */ 2383 void 2384 arc_flush(spa_t *spa) 2385 { 2386 uint64_t guid = 0; 2387 2388 if (spa) 2389 guid = spa_load_guid(spa); 2390 2391 while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) { 2392 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA); 2393 if (spa) 2394 break; 2395 } 2396 while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) { 2397 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA); 2398 if (spa) 2399 break; 2400 } 2401 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) { 2402 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA); 2403 if (spa) 2404 break; 2405 } 2406 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) { 2407 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA); 2408 if (spa) 2409 break; 2410 } 2411 2412 arc_evict_ghost(arc_mru_ghost, guid, -1); 2413 arc_evict_ghost(arc_mfu_ghost, guid, -1); 2414 2415 mutex_enter(&arc_reclaim_thr_lock); 2416 arc_do_user_evicts(); 2417 mutex_exit(&arc_reclaim_thr_lock); 2418 ASSERT(spa || arc_eviction_list == NULL); 2419 } 2420 2421 void 2422 arc_shrink(void) 2423 { 2424 if (arc_c > arc_c_min) { 2425 uint64_t to_free; 2426 2427 #ifdef _KERNEL 2428 to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree)); 2429 #else 2430 to_free = arc_c >> arc_shrink_shift; 2431 #endif 2432 if (arc_c > arc_c_min + to_free) 2433 atomic_add_64(&arc_c, -to_free); 2434 else 2435 arc_c = arc_c_min; 2436 2437 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); 2438 if (arc_c > arc_size) 2439 arc_c = MAX(arc_size, arc_c_min); 2440 if (arc_p > arc_c) 2441 arc_p = (arc_c >> 1); 2442 ASSERT(arc_c >= arc_c_min); 2443 ASSERT((int64_t)arc_p >= 0); 2444 } 2445 2446 if (arc_size > arc_c) 2447 arc_adjust(); 2448 } 2449 2450 /* 2451 * Determine if the system is under memory pressure and is asking 2452 * to reclaim memory. A return value of 1 indicates that the system 2453 * is under memory pressure and that the arc should adjust accordingly. 2454 */ 2455 static int 2456 arc_reclaim_needed(void) 2457 { 2458 uint64_t extra; 2459 2460 #ifdef _KERNEL 2461 2462 if (needfree) 2463 return (1); 2464 2465 /* 2466 * take 'desfree' extra pages, so we reclaim sooner, rather than later 2467 */ 2468 extra = desfree; 2469 2470 /* 2471 * check that we're out of range of the pageout scanner. It starts to 2472 * schedule paging if freemem is less than lotsfree and needfree. 2473 * lotsfree is the high-water mark for pageout, and needfree is the 2474 * number of needed free pages. We add extra pages here to make sure 2475 * the scanner doesn't start up while we're freeing memory. 2476 */ 2477 if (freemem < lotsfree + needfree + extra) 2478 return (1); 2479 2480 /* 2481 * check to make sure that swapfs has enough space so that anon 2482 * reservations can still succeed. anon_resvmem() checks that the 2483 * availrmem is greater than swapfs_minfree, and the number of reserved 2484 * swap pages. We also add a bit of extra here just to prevent 2485 * circumstances from getting really dire. 2486 */ 2487 if (availrmem < swapfs_minfree + swapfs_reserve + extra) 2488 return (1); 2489 2490 #if defined(__i386) 2491 /* 2492 * If we're on an i386 platform, it's possible that we'll exhaust the 2493 * kernel heap space before we ever run out of available physical 2494 * memory. Most checks of the size of the heap_area compare against 2495 * tune.t_minarmem, which is the minimum available real memory that we 2496 * can have in the system. However, this is generally fixed at 25 pages 2497 * which is so low that it's useless. 
In this comparison, we seek to 2498 * calculate the total heap-size, and reclaim if more than 3/4ths of the 2499 * heap is allocated. (Or, in the calculation, if less than 1/4th is 2500 * free) 2501 */ 2502 if (vmem_size(heap_arena, VMEM_FREE) < 2503 (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2)) 2504 return (1); 2505 #endif 2506 2507 /* 2508 * If zio data pages are being allocated out of a separate heap segment, 2509 * then enforce that the size of available vmem for this arena remains 2510 * above about 1/16th free. 2511 * 2512 * Note: The 1/16th arena free requirement was put in place 2513 * to aggressively evict memory from the arc in order to avoid 2514 * memory fragmentation issues. 2515 */ 2516 if (zio_arena != NULL && 2517 vmem_size(zio_arena, VMEM_FREE) < 2518 (vmem_size(zio_arena, VMEM_ALLOC) >> 4)) 2519 return (1); 2520 #else 2521 if (spa_get_random(100) == 0) 2522 return (1); 2523 #endif 2524 return (0); 2525 } 2526 2527 static void 2528 arc_kmem_reap_now(arc_reclaim_strategy_t strat) 2529 { 2530 size_t i; 2531 kmem_cache_t *prev_cache = NULL; 2532 kmem_cache_t *prev_data_cache = NULL; 2533 extern kmem_cache_t *zio_buf_cache[]; 2534 extern kmem_cache_t *zio_data_buf_cache[]; 2535 2536 #ifdef _KERNEL 2537 if (arc_meta_used >= arc_meta_limit) { 2538 /* 2539 * We are exceeding our meta-data cache limit. 2540 * Purge some DNLC entries to release holds on meta-data. 2541 */ 2542 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); 2543 } 2544 #if defined(__i386) 2545 /* 2546 * Reclaim unused memory from all kmem caches. 2547 */ 2548 kmem_reap(); 2549 #endif 2550 #endif 2551 2552 /* 2553 * An aggressive reclamation will shrink the cache size as well as 2554 * reap free buffers from the arc kmem caches. 2555 */ 2556 if (strat == ARC_RECLAIM_AGGR) 2557 arc_shrink(); 2558 2559 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { 2560 if (zio_buf_cache[i] != prev_cache) { 2561 prev_cache = zio_buf_cache[i]; 2562 kmem_cache_reap_now(zio_buf_cache[i]); 2563 } 2564 if (zio_data_buf_cache[i] != prev_data_cache) { 2565 prev_data_cache = zio_data_buf_cache[i]; 2566 kmem_cache_reap_now(zio_data_buf_cache[i]); 2567 } 2568 } 2569 kmem_cache_reap_now(buf_cache); 2570 kmem_cache_reap_now(hdr_cache); 2571 2572 /* 2573 * Ask the vmem arena to reclaim unused memory from its 2574 * quantum caches.
2575 */ 2576 if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR) 2577 vmem_qcache_reap(zio_arena); 2578 } 2579 2580 static void 2581 arc_reclaim_thread(void) 2582 { 2583 clock_t growtime = 0; 2584 arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; 2585 callb_cpr_t cpr; 2586 2587 CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); 2588 2589 mutex_enter(&arc_reclaim_thr_lock); 2590 while (arc_thread_exit == 0) { 2591 if (arc_reclaim_needed()) { 2592 2593 if (arc_no_grow) { 2594 if (last_reclaim == ARC_RECLAIM_CONS) { 2595 last_reclaim = ARC_RECLAIM_AGGR; 2596 } else { 2597 last_reclaim = ARC_RECLAIM_CONS; 2598 } 2599 } else { 2600 arc_no_grow = TRUE; 2601 last_reclaim = ARC_RECLAIM_AGGR; 2602 membar_producer(); 2603 } 2604 2605 /* reset the growth delay for every reclaim */ 2606 growtime = ddi_get_lbolt() + (arc_grow_retry * hz); 2607 2608 arc_kmem_reap_now(last_reclaim); 2609 arc_warm = B_TRUE; 2610 2611 } else if (arc_no_grow && ddi_get_lbolt() >= growtime) { 2612 arc_no_grow = FALSE; 2613 } 2614 2615 arc_adjust(); 2616 2617 if (arc_eviction_list != NULL) 2618 arc_do_user_evicts(); 2619 2620 /* block until needed, or one second, whichever is shorter */ 2621 CALLB_CPR_SAFE_BEGIN(&cpr); 2622 (void) cv_timedwait(&arc_reclaim_thr_cv, 2623 &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz)); 2624 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); 2625 } 2626 2627 arc_thread_exit = 0; 2628 cv_broadcast(&arc_reclaim_thr_cv); 2629 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ 2630 thread_exit(); 2631 } 2632 2633 /* 2634 * Adapt arc info given the number of bytes we are trying to add and 2635 * the state that we are coming from. This function is only called 2636 * when we are adding new content to the cache. 2637 */ 2638 static void 2639 arc_adapt(int bytes, arc_state_t *state) 2640 { 2641 int mult; 2642 uint64_t arc_p_min = (arc_c >> arc_p_min_shift); 2643 2644 if (state == arc_l2c_only) 2645 return; 2646 2647 ASSERT(bytes > 0); 2648 /* 2649 * Adapt the target size of the MRU list: 2650 * - if we just hit in the MRU ghost list, then increase 2651 * the target size of the MRU list. 2652 * - if we just hit in the MFU ghost list, then increase 2653 * the target size of the MFU list by decreasing the 2654 * target size of the MRU list. 2655 */ 2656 if (state == arc_mru_ghost) { 2657 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? 2658 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); 2659 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ 2660 2661 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); 2662 } else if (state == arc_mfu_ghost) { 2663 uint64_t delta; 2664 2665 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2666 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); 2667 mult = MIN(mult, 10); 2668 2669 delta = MIN(bytes * mult, arc_p); 2670 arc_p = MAX(arc_p_min, arc_p - delta); 2671 } 2672 ASSERT((int64_t)arc_p >= 0); 2673 2674 if (arc_reclaim_needed()) { 2675 cv_signal(&arc_reclaim_thr_cv); 2676 return; 2677 } 2678 2679 if (arc_no_grow) 2680 return; 2681 2682 if (arc_c >= arc_c_max) 2683 return; 2684 2685 /* 2686 * If we're within (2 * maxblocksize) bytes of the target 2687 * cache size, increment the target cache size 2688 */ 2689 if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { 2690 atomic_add_64(&arc_c, (int64_t)bytes); 2691 if (arc_c > arc_c_max) 2692 arc_c = arc_c_max; 2693 else if (state == arc_anon) 2694 atomic_add_64(&arc_p, (int64_t)bytes); 2695 if (arc_p > arc_c) 2696 arc_p = arc_c; 2697 } 2698 ASSERT((int64_t)arc_p >= 0); 2699 } 2700 2701 /* 2702 * Check if the cache has reached its limits and eviction is required 2703 * prior to insert. 2704 */ 2705 static int 2706 arc_evict_needed(arc_buf_contents_t type) 2707 { 2708 if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) 2709 return (1); 2710 2711 if (arc_reclaim_needed()) 2712 return (1); 2713 2714 return (arc_size > arc_c); 2715 } 2716 2717 /* 2718 * The buffer, supplied as the first argument, needs a data block. 2719 * So, if we are at cache max, determine which cache should be victimized. 2720 * We have the following cases: 2721 * 2722 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) -> 2723 * In this situation if we're out of space, but the resident size of the MFU is 2724 * under the limit, victimize the MFU cache to satisfy this insertion request. 2725 * 2726 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) -> 2727 * Here, we've used up all of the available space for the MRU, so we need to 2728 * evict from our own cache instead. Evict from the set of resident MRU 2729 * entries. 2730 * 2731 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) -> 2732 * c minus p represents the MFU space in the cache, since p is the size of the 2733 * cache that is dedicated to the MRU. In this situation there's still space on 2734 * the MFU side, so the MRU side needs to be victimized. 2735 * 2736 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) -> 2737 * MFU's resident set is consuming more space than it has been allotted. In 2738 * this situation, we must victimize our own cache, the MFU, for this insertion. 2739 */ 2740 static void 2741 arc_get_data_buf(arc_buf_t *buf) 2742 { 2743 arc_state_t *state = buf->b_hdr->b_state; 2744 uint64_t size = buf->b_hdr->b_size; 2745 arc_buf_contents_t type = buf->b_hdr->b_type; 2746 2747 arc_adapt(size, state); 2748 2749 /* 2750 * We have not yet reached cache maximum size, 2751 * just allocate a new buffer. 2752 */ 2753 if (!arc_evict_needed(type)) { 2754 if (type == ARC_BUFC_METADATA) { 2755 buf->b_data = zio_buf_alloc(size); 2756 arc_space_consume(size, ARC_SPACE_DATA); 2757 } else { 2758 ASSERT(type == ARC_BUFC_DATA); 2759 buf->b_data = zio_data_buf_alloc(size); 2760 ARCSTAT_INCR(arcstat_data_size, size); 2761 atomic_add_64(&arc_size, size); 2762 } 2763 goto out; 2764 } 2765 2766 /* 2767 * If we are prefetching from the mfu ghost list, this buffer 2768 * will end up on the mru list; so steal space from there. 2769 */ 2770 if (state == arc_mfu_ghost) 2771 state = buf->b_hdr->b_flags & ARC_PREFETCH ? 
arc_mru : arc_mfu; 2772 else if (state == arc_mru_ghost) 2773 state = arc_mru; 2774 2775 if (state == arc_mru || state == arc_anon) { 2776 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; 2777 state = (arc_mfu->arcs_lsize[type] >= size && 2778 arc_p > mru_used) ? arc_mfu : arc_mru; 2779 } else { 2780 /* MFU cases */ 2781 uint64_t mfu_space = arc_c - arc_p; 2782 state = (arc_mru->arcs_lsize[type] >= size && 2783 mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; 2784 } 2785 if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) { 2786 if (type == ARC_BUFC_METADATA) { 2787 buf->b_data = zio_buf_alloc(size); 2788 arc_space_consume(size, ARC_SPACE_DATA); 2789 } else { 2790 ASSERT(type == ARC_BUFC_DATA); 2791 buf->b_data = zio_data_buf_alloc(size); 2792 ARCSTAT_INCR(arcstat_data_size, size); 2793 atomic_add_64(&arc_size, size); 2794 } 2795 ARCSTAT_BUMP(arcstat_recycle_miss); 2796 } 2797 ASSERT(buf->b_data != NULL); 2798 out: 2799 /* 2800 * Update the state size. Note that ghost states have a 2801 * "ghost size" and so don't need to be updated. 2802 */ 2803 if (!GHOST_STATE(buf->b_hdr->b_state)) { 2804 arc_buf_hdr_t *hdr = buf->b_hdr; 2805 2806 atomic_add_64(&hdr->b_state->arcs_size, size); 2807 if (list_link_active(&hdr->b_arc_node)) { 2808 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 2809 atomic_add_64(&hdr->b_state->arcs_lsize[type], size); 2810 } 2811 /* 2812 * If we are growing the cache, and we are adding anonymous 2813 * data, and we have outgrown arc_p, update arc_p 2814 */ 2815 if (arc_size < arc_c && hdr->b_state == arc_anon && 2816 arc_anon->arcs_size + arc_mru->arcs_size > arc_p) 2817 arc_p = MIN(arc_c, arc_p + size); 2818 } 2819 } 2820 2821 /* 2822 * This routine is called whenever a buffer is accessed. 2823 * NOTE: the hash lock is dropped in this function. 2824 */ 2825 static void 2826 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) 2827 { 2828 clock_t now; 2829 2830 ASSERT(MUTEX_HELD(hash_lock)); 2831 2832 if (buf->b_state == arc_anon) { 2833 /* 2834 * This buffer is not in the cache, and does not 2835 * appear in our "ghost" list. Add the new buffer 2836 * to the MRU state. 2837 */ 2838 2839 ASSERT(buf->b_arc_access == 0); 2840 buf->b_arc_access = ddi_get_lbolt(); 2841 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 2842 arc_change_state(arc_mru, buf, hash_lock); 2843 2844 } else if (buf->b_state == arc_mru) { 2845 now = ddi_get_lbolt(); 2846 2847 /* 2848 * If this buffer is here because of a prefetch, then either: 2849 * - clear the flag if this is a "referencing" read 2850 * (any subsequent access will bump this into the MFU state). 2851 * or 2852 * - move the buffer to the head of the list if this is 2853 * another prefetch (to make it less likely to be evicted). 2854 */ 2855 if ((buf->b_flags & ARC_PREFETCH) != 0) { 2856 if (refcount_count(&buf->b_refcnt) == 0) { 2857 ASSERT(list_link_active(&buf->b_arc_node)); 2858 } else { 2859 buf->b_flags &= ~ARC_PREFETCH; 2860 ARCSTAT_BUMP(arcstat_mru_hits); 2861 } 2862 buf->b_arc_access = now; 2863 return; 2864 } 2865 2866 /* 2867 * This buffer has been "accessed" only once so far, 2868 * but it is still in the cache. Move it to the MFU 2869 * state. 2870 */ 2871 if (now > buf->b_arc_access + ARC_MINTIME) { 2872 /* 2873 * More than 125ms have passed since we 2874 * instantiated this buffer. Move it to the 2875 * most frequently used state. 
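			 * (The precise threshold is ARC_MINTIME ticks past
			 * b_arc_access, as tested above.)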
2876 */ 2877 buf->b_arc_access = now; 2878 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 2879 arc_change_state(arc_mfu, buf, hash_lock); 2880 } 2881 ARCSTAT_BUMP(arcstat_mru_hits); 2882 } else if (buf->b_state == arc_mru_ghost) { 2883 arc_state_t *new_state; 2884 /* 2885 * This buffer has been "accessed" recently, but 2886 * was evicted from the cache. Move it to the 2887 * MFU state. 2888 */ 2889 2890 if (buf->b_flags & ARC_PREFETCH) { 2891 new_state = arc_mru; 2892 if (refcount_count(&buf->b_refcnt) > 0) 2893 buf->b_flags &= ~ARC_PREFETCH; 2894 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 2895 } else { 2896 new_state = arc_mfu; 2897 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 2898 } 2899 2900 buf->b_arc_access = ddi_get_lbolt(); 2901 arc_change_state(new_state, buf, hash_lock); 2902 2903 ARCSTAT_BUMP(arcstat_mru_ghost_hits); 2904 } else if (buf->b_state == arc_mfu) { 2905 /* 2906 * This buffer has been accessed more than once and is 2907 * still in the cache. Keep it in the MFU state. 2908 * 2909 * NOTE: an add_reference() that occurred when we did 2910 * the arc_read() will have kicked this off the list. 2911 * If it was a prefetch, we will explicitly move it to 2912 * the head of the list now. 2913 */ 2914 if ((buf->b_flags & ARC_PREFETCH) != 0) { 2915 ASSERT(refcount_count(&buf->b_refcnt) == 0); 2916 ASSERT(list_link_active(&buf->b_arc_node)); 2917 } 2918 ARCSTAT_BUMP(arcstat_mfu_hits); 2919 buf->b_arc_access = ddi_get_lbolt(); 2920 } else if (buf->b_state == arc_mfu_ghost) { 2921 arc_state_t *new_state = arc_mfu; 2922 /* 2923 * This buffer has been accessed more than once but has 2924 * been evicted from the cache. Move it back to the 2925 * MFU state. 2926 */ 2927 2928 if (buf->b_flags & ARC_PREFETCH) { 2929 /* 2930 * This is a prefetch access... 2931 * move this block back to the MRU state. 2932 */ 2933 ASSERT0(refcount_count(&buf->b_refcnt)); 2934 new_state = arc_mru; 2935 } 2936 2937 buf->b_arc_access = ddi_get_lbolt(); 2938 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 2939 arc_change_state(new_state, buf, hash_lock); 2940 2941 ARCSTAT_BUMP(arcstat_mfu_ghost_hits); 2942 } else if (buf->b_state == arc_l2c_only) { 2943 /* 2944 * This buffer is on the 2nd Level ARC. 2945 */ 2946 2947 buf->b_arc_access = ddi_get_lbolt(); 2948 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 2949 arc_change_state(arc_mfu, buf, hash_lock); 2950 } else { 2951 ASSERT(!"invalid arc state"); 2952 } 2953 } 2954 2955 /* a generic arc_done_func_t which you can use */ 2956 /* ARGSUSED */ 2957 void 2958 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) 2959 { 2960 if (zio == NULL || zio->io_error == 0) 2961 bcopy(buf->b_data, arg, buf->b_hdr->b_size); 2962 VERIFY(arc_buf_remove_ref(buf, arg)); 2963 } 2964 2965 /* a generic arc_done_func_t */ 2966 void 2967 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) 2968 { 2969 arc_buf_t **bufp = arg; 2970 if (zio && zio->io_error) { 2971 VERIFY(arc_buf_remove_ref(buf, arg)); 2972 *bufp = NULL; 2973 } else { 2974 *bufp = buf; 2975 ASSERT(buf->b_data); 2976 } 2977 } 2978 2979 static void 2980 arc_read_done(zio_t *zio) 2981 { 2982 arc_buf_hdr_t *hdr, *found; 2983 arc_buf_t *buf; 2984 arc_buf_t *abuf; /* buffer we're assigning to callback */ 2985 kmutex_t *hash_lock; 2986 arc_callback_t *callback_list, *acb; 2987 int freeable = FALSE; 2988 2989 buf = zio->io_private; 2990 hdr = buf->b_hdr; 2991 2992 /* 2993 * The hdr was inserted into hash-table and removed from lists 2994 * prior to starting I/O. 
We should find this header, since 2995 * it's in the hash table, and it should be legit since it's 2996 * not possible to evict it during the I/O. The only possible 2997 * reason for it not to be found is if we were freed during the 2998 * read. 2999 */ 3000 found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth, 3001 &hash_lock); 3002 3003 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || 3004 (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || 3005 (found == hdr && HDR_L2_READING(hdr))); 3006 3007 hdr->b_flags &= ~ARC_L2_EVICTED; 3008 if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH)) 3009 hdr->b_flags &= ~ARC_L2CACHE; 3010 3011 /* byteswap if necessary */ 3012 callback_list = hdr->b_acb; 3013 ASSERT(callback_list != NULL); 3014 if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { 3015 dmu_object_byteswap_t bswap = 3016 DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); 3017 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? 3018 byteswap_uint64_array : 3019 dmu_ot_byteswap[bswap].ob_func; 3020 func(buf->b_data, hdr->b_size); 3021 } 3022 3023 arc_cksum_compute(buf, B_FALSE); 3024 arc_buf_watch(buf); 3025 3026 if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) { 3027 /* 3028 * Only call arc_access on anonymous buffers. This is because 3029 * if we've issued an I/O for an evicted buffer, we've already 3030 * called arc_access (to prevent any simultaneous readers from 3031 * getting confused). 3032 */ 3033 arc_access(hdr, hash_lock); 3034 } 3035 3036 /* create copies of the data buffer for the callers */ 3037 abuf = buf; 3038 for (acb = callback_list; acb; acb = acb->acb_next) { 3039 if (acb->acb_done) { 3040 if (abuf == NULL) { 3041 ARCSTAT_BUMP(arcstat_duplicate_reads); 3042 abuf = arc_buf_clone(buf); 3043 } 3044 acb->acb_buf = abuf; 3045 abuf = NULL; 3046 } 3047 } 3048 hdr->b_acb = NULL; 3049 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 3050 ASSERT(!HDR_BUF_AVAILABLE(hdr)); 3051 if (abuf == buf) { 3052 ASSERT(buf->b_efunc == NULL); 3053 ASSERT(hdr->b_datacnt == 1); 3054 hdr->b_flags |= ARC_BUF_AVAILABLE; 3055 } 3056 3057 ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); 3058 3059 if (zio->io_error != 0) { 3060 hdr->b_flags |= ARC_IO_ERROR; 3061 if (hdr->b_state != arc_anon) 3062 arc_change_state(arc_anon, hdr, hash_lock); 3063 if (HDR_IN_HASH_TABLE(hdr)) 3064 buf_hash_remove(hdr); 3065 freeable = refcount_is_zero(&hdr->b_refcnt); 3066 } 3067 3068 /* 3069 * Broadcast before we drop the hash_lock to avoid the possibility 3070 * that the hdr (and hence the cv) might be freed before we get to 3071 * the cv_broadcast(). 3072 */ 3073 cv_broadcast(&hdr->b_cv); 3074 3075 if (hash_lock) { 3076 mutex_exit(hash_lock); 3077 } else { 3078 /* 3079 * This block was freed while we waited for the read to 3080 * complete. It has been removed from the hash table and 3081 * moved to the anonymous state (so that it won't show up 3082 * in the cache). 
3083 */ 3084 ASSERT3P(hdr->b_state, ==, arc_anon); 3085 freeable = refcount_is_zero(&hdr->b_refcnt); 3086 } 3087 3088 /* execute each callback and free its structure */ 3089 while ((acb = callback_list) != NULL) { 3090 if (acb->acb_done) 3091 acb->acb_done(zio, acb->acb_buf, acb->acb_private); 3092 3093 if (acb->acb_zio_dummy != NULL) { 3094 acb->acb_zio_dummy->io_error = zio->io_error; 3095 zio_nowait(acb->acb_zio_dummy); 3096 } 3097 3098 callback_list = acb->acb_next; 3099 kmem_free(acb, sizeof (arc_callback_t)); 3100 } 3101 3102 if (freeable) 3103 arc_hdr_destroy(hdr); 3104 } 3105 3106 /* 3107 * "Read" the block at the specified DVA (in bp) via the 3108 * cache. If the block is found in the cache, invoke the provided 3109 * callback immediately and return. Note that the `zio' parameter 3110 * in the callback will be NULL in this case, since no IO was 3111 * required. If the block is not in the cache pass the read request 3112 * on to the spa with a substitute callback function, so that the 3113 * requested block will be added to the cache. 3114 * 3115 * If a read request arrives for a block that has a read in-progress, 3116 * either wait for the in-progress read to complete (and return the 3117 * results); or, if this is a read with a "done" func, add a record 3118 * to the read to invoke the "done" func when the read completes, 3119 * and return; or just return. 3120 * 3121 * arc_read_done() will invoke all the requested "done" functions 3122 * for readers of this block. 3123 */ 3124 int 3125 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, 3126 void *private, int priority, int zio_flags, uint32_t *arc_flags, 3127 const zbookmark_t *zb) 3128 { 3129 arc_buf_hdr_t *hdr; 3130 arc_buf_t *buf = NULL; 3131 kmutex_t *hash_lock; 3132 zio_t *rzio; 3133 uint64_t guid = spa_load_guid(spa); 3134 3135 top: 3136 hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), 3137 &hash_lock); 3138 if (hdr && hdr->b_datacnt > 0) { 3139 3140 *arc_flags |= ARC_CACHED; 3141 3142 if (HDR_IO_IN_PROGRESS(hdr)) { 3143 3144 if (*arc_flags & ARC_WAIT) { 3145 cv_wait(&hdr->b_cv, hash_lock); 3146 mutex_exit(hash_lock); 3147 goto top; 3148 } 3149 ASSERT(*arc_flags & ARC_NOWAIT); 3150 3151 if (done) { 3152 arc_callback_t *acb = NULL; 3153 3154 acb = kmem_zalloc(sizeof (arc_callback_t), 3155 KM_SLEEP); 3156 acb->acb_done = done; 3157 acb->acb_private = private; 3158 if (pio != NULL) 3159 acb->acb_zio_dummy = zio_null(pio, 3160 spa, NULL, NULL, NULL, zio_flags); 3161 3162 ASSERT(acb->acb_done != NULL); 3163 acb->acb_next = hdr->b_acb; 3164 hdr->b_acb = acb; 3165 add_reference(hdr, hash_lock, private); 3166 mutex_exit(hash_lock); 3167 return (0); 3168 } 3169 mutex_exit(hash_lock); 3170 return (0); 3171 } 3172 3173 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 3174 3175 if (done) { 3176 add_reference(hdr, hash_lock, private); 3177 /* 3178 * If this block is already in use, create a new 3179 * copy of the data so that we will be guaranteed 3180 * that arc_release() will always succeed. 
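			 *
			 * Concretely: if the header's buffer is still marked
			 * ARC_BUF_AVAILABLE we hand that buffer out and clear
			 * the flag; otherwise arc_buf_clone() below gives the
			 * caller a private copy.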
3181 */ 3182 buf = hdr->b_buf; 3183 ASSERT(buf); 3184 ASSERT(buf->b_data); 3185 if (HDR_BUF_AVAILABLE(hdr)) { 3186 ASSERT(buf->b_efunc == NULL); 3187 hdr->b_flags &= ~ARC_BUF_AVAILABLE; 3188 } else { 3189 buf = arc_buf_clone(buf); 3190 } 3191 3192 } else if (*arc_flags & ARC_PREFETCH && 3193 refcount_count(&hdr->b_refcnt) == 0) { 3194 hdr->b_flags |= ARC_PREFETCH; 3195 } 3196 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 3197 arc_access(hdr, hash_lock); 3198 if (*arc_flags & ARC_L2CACHE) 3199 hdr->b_flags |= ARC_L2CACHE; 3200 if (*arc_flags & ARC_L2COMPRESS) 3201 hdr->b_flags |= ARC_L2COMPRESS; 3202 mutex_exit(hash_lock); 3203 ARCSTAT_BUMP(arcstat_hits); 3204 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 3205 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 3206 data, metadata, hits); 3207 3208 if (done) 3209 done(NULL, buf, private); 3210 } else { 3211 uint64_t size = BP_GET_LSIZE(bp); 3212 arc_callback_t *acb; 3213 vdev_t *vd = NULL; 3214 uint64_t addr = 0; 3215 boolean_t devw = B_FALSE; 3216 3217 if (hdr == NULL) { 3218 /* this block is not in the cache */ 3219 arc_buf_hdr_t *exists; 3220 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); 3221 buf = arc_buf_alloc(spa, size, private, type); 3222 hdr = buf->b_hdr; 3223 hdr->b_dva = *BP_IDENTITY(bp); 3224 hdr->b_birth = BP_PHYSICAL_BIRTH(bp); 3225 hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; 3226 exists = buf_hash_insert(hdr, &hash_lock); 3227 if (exists) { 3228 /* somebody beat us to the hash insert */ 3229 mutex_exit(hash_lock); 3230 buf_discard_identity(hdr); 3231 (void) arc_buf_remove_ref(buf, private); 3232 goto top; /* restart the IO request */ 3233 } 3234 /* if this is a prefetch, we don't have a reference */ 3235 if (*arc_flags & ARC_PREFETCH) { 3236 (void) remove_reference(hdr, hash_lock, 3237 private); 3238 hdr->b_flags |= ARC_PREFETCH; 3239 } 3240 if (*arc_flags & ARC_L2CACHE) 3241 hdr->b_flags |= ARC_L2CACHE; 3242 if (*arc_flags & ARC_L2COMPRESS) 3243 hdr->b_flags |= ARC_L2COMPRESS; 3244 if (BP_GET_LEVEL(bp) > 0) 3245 hdr->b_flags |= ARC_INDIRECT; 3246 } else { 3247 /* this block is in the ghost cache */ 3248 ASSERT(GHOST_STATE(hdr->b_state)); 3249 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 3250 ASSERT0(refcount_count(&hdr->b_refcnt)); 3251 ASSERT(hdr->b_buf == NULL); 3252 3253 /* if this is a prefetch, we don't have a reference */ 3254 if (*arc_flags & ARC_PREFETCH) 3255 hdr->b_flags |= ARC_PREFETCH; 3256 else 3257 add_reference(hdr, hash_lock, private); 3258 if (*arc_flags & ARC_L2CACHE) 3259 hdr->b_flags |= ARC_L2CACHE; 3260 if (*arc_flags & ARC_L2COMPRESS) 3261 hdr->b_flags |= ARC_L2COMPRESS; 3262 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 3263 buf->b_hdr = hdr; 3264 buf->b_data = NULL; 3265 buf->b_efunc = NULL; 3266 buf->b_private = NULL; 3267 buf->b_next = NULL; 3268 hdr->b_buf = buf; 3269 ASSERT(hdr->b_datacnt == 0); 3270 hdr->b_datacnt = 1; 3271 arc_get_data_buf(buf); 3272 arc_access(hdr, hash_lock); 3273 } 3274 3275 ASSERT(!GHOST_STATE(hdr->b_state)); 3276 3277 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); 3278 acb->acb_done = done; 3279 acb->acb_private = private; 3280 3281 ASSERT(hdr->b_acb == NULL); 3282 hdr->b_acb = acb; 3283 hdr->b_flags |= ARC_IO_IN_PROGRESS; 3284 3285 if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL && 3286 (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) { 3287 devw = hdr->b_l2hdr->b_dev->l2ad_writing; 3288 addr = hdr->b_l2hdr->b_daddr; 3289 /* 3290 * Lock out device removal. 
3291 */ 3292 if (vdev_is_dead(vd) || 3293 !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) 3294 vd = NULL; 3295 } 3296 3297 mutex_exit(hash_lock); 3298 3299 /* 3300 * At this point, we have a level 1 cache miss. Try again in 3301 * L2ARC if possible. 3302 */ 3303 ASSERT3U(hdr->b_size, ==, size); 3304 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, 3305 uint64_t, size, zbookmark_t *, zb); 3306 ARCSTAT_BUMP(arcstat_misses); 3307 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 3308 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 3309 data, metadata, misses); 3310 3311 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { 3312 /* 3313 * Read from the L2ARC if the following are true: 3314 * 1. The L2ARC vdev was previously cached. 3315 * 2. This buffer still has L2ARC metadata. 3316 * 3. This buffer isn't currently writing to the L2ARC. 3317 * 4. The L2ARC entry wasn't evicted, which may 3318 * also have invalidated the vdev. 3319 * 5. This isn't prefetch and l2arc_noprefetch is set. 3320 */ 3321 if (hdr->b_l2hdr != NULL && 3322 !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && 3323 !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { 3324 l2arc_read_callback_t *cb; 3325 3326 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); 3327 ARCSTAT_BUMP(arcstat_l2_hits); 3328 3329 cb = kmem_zalloc(sizeof (l2arc_read_callback_t), 3330 KM_SLEEP); 3331 cb->l2rcb_buf = buf; 3332 cb->l2rcb_spa = spa; 3333 cb->l2rcb_bp = *bp; 3334 cb->l2rcb_zb = *zb; 3335 cb->l2rcb_flags = zio_flags; 3336 cb->l2rcb_compress = hdr->b_l2hdr->b_compress; 3337 3338 ASSERT(addr >= VDEV_LABEL_START_SIZE && 3339 addr + size < vd->vdev_psize - 3340 VDEV_LABEL_END_SIZE); 3341 3342 /* 3343 * l2arc read. The SCL_L2ARC lock will be 3344 * released by l2arc_read_done(). 3345 * Issue a null zio if the underlying buffer 3346 * was squashed to zero size by compression. 
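				 * (No device bytes back a buffer recorded as
				 * ZIO_COMPRESS_EMPTY, so there is nothing to
				 * read; the completion path, l2arc_read_done(),
				 * is still responsible for producing the
				 * buffer contents.)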
3347 */ 3348 if (hdr->b_l2hdr->b_compress == 3349 ZIO_COMPRESS_EMPTY) { 3350 rzio = zio_null(pio, spa, vd, 3351 l2arc_read_done, cb, 3352 zio_flags | ZIO_FLAG_DONT_CACHE | 3353 ZIO_FLAG_CANFAIL | 3354 ZIO_FLAG_DONT_PROPAGATE | 3355 ZIO_FLAG_DONT_RETRY); 3356 } else { 3357 rzio = zio_read_phys(pio, vd, addr, 3358 hdr->b_l2hdr->b_asize, 3359 buf->b_data, ZIO_CHECKSUM_OFF, 3360 l2arc_read_done, cb, priority, 3361 zio_flags | ZIO_FLAG_DONT_CACHE | 3362 ZIO_FLAG_CANFAIL | 3363 ZIO_FLAG_DONT_PROPAGATE | 3364 ZIO_FLAG_DONT_RETRY, B_FALSE); 3365 } 3366 DTRACE_PROBE2(l2arc__read, vdev_t *, vd, 3367 zio_t *, rzio); 3368 ARCSTAT_INCR(arcstat_l2_read_bytes, 3369 hdr->b_l2hdr->b_asize); 3370 3371 if (*arc_flags & ARC_NOWAIT) { 3372 zio_nowait(rzio); 3373 return (0); 3374 } 3375 3376 ASSERT(*arc_flags & ARC_WAIT); 3377 if (zio_wait(rzio) == 0) 3378 return (0); 3379 3380 /* l2arc read error; goto zio_read() */ 3381 } else { 3382 DTRACE_PROBE1(l2arc__miss, 3383 arc_buf_hdr_t *, hdr); 3384 ARCSTAT_BUMP(arcstat_l2_misses); 3385 if (HDR_L2_WRITING(hdr)) 3386 ARCSTAT_BUMP(arcstat_l2_rw_clash); 3387 spa_config_exit(spa, SCL_L2ARC, vd); 3388 } 3389 } else { 3390 if (vd != NULL) 3391 spa_config_exit(spa, SCL_L2ARC, vd); 3392 if (l2arc_ndev != 0) { 3393 DTRACE_PROBE1(l2arc__miss, 3394 arc_buf_hdr_t *, hdr); 3395 ARCSTAT_BUMP(arcstat_l2_misses); 3396 } 3397 } 3398 3399 rzio = zio_read(pio, spa, bp, buf->b_data, size, 3400 arc_read_done, buf, priority, zio_flags, zb); 3401 3402 if (*arc_flags & ARC_WAIT) 3403 return (zio_wait(rzio)); 3404 3405 ASSERT(*arc_flags & ARC_NOWAIT); 3406 zio_nowait(rzio); 3407 } 3408 return (0); 3409 } 3410 3411 void 3412 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) 3413 { 3414 ASSERT(buf->b_hdr != NULL); 3415 ASSERT(buf->b_hdr->b_state != arc_anon); 3416 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL); 3417 ASSERT(buf->b_efunc == NULL); 3418 ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr)); 3419 3420 buf->b_efunc = func; 3421 buf->b_private = private; 3422 } 3423 3424 /* 3425 * Notify the arc that a block was freed, and thus will never be used again. 3426 */ 3427 void 3428 arc_freed(spa_t *spa, const blkptr_t *bp) 3429 { 3430 arc_buf_hdr_t *hdr; 3431 kmutex_t *hash_lock; 3432 uint64_t guid = spa_load_guid(spa); 3433 3434 hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), 3435 &hash_lock); 3436 if (hdr == NULL) 3437 return; 3438 if (HDR_BUF_AVAILABLE(hdr)) { 3439 arc_buf_t *buf = hdr->b_buf; 3440 add_reference(hdr, hash_lock, FTAG); 3441 hdr->b_flags &= ~ARC_BUF_AVAILABLE; 3442 mutex_exit(hash_lock); 3443 3444 arc_release(buf, FTAG); 3445 (void) arc_buf_remove_ref(buf, FTAG); 3446 } else { 3447 mutex_exit(hash_lock); 3448 } 3449 3450 } 3451 3452 /* 3453 * This is used by the DMU to let the ARC know that a buffer is 3454 * being evicted, so the ARC should clean up. If this arc buf 3455 * is not yet in the evicted state, it will be put there. 3456 */ 3457 int 3458 arc_buf_evict(arc_buf_t *buf) 3459 { 3460 arc_buf_hdr_t *hdr; 3461 kmutex_t *hash_lock; 3462 arc_buf_t **bufp; 3463 3464 mutex_enter(&buf->b_evict_lock); 3465 hdr = buf->b_hdr; 3466 if (hdr == NULL) { 3467 /* 3468 * We are in arc_do_user_evicts(). 3469 */ 3470 ASSERT(buf->b_data == NULL); 3471 mutex_exit(&buf->b_evict_lock); 3472 return (0); 3473 } else if (buf->b_data == NULL) { 3474 arc_buf_t copy = *buf; /* structure assignment */ 3475 /* 3476 * We are on the eviction list; process this buffer now 3477 * but let arc_do_user_evicts() do the reaping. 
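		 *
		 * The structure assignment above snapshots the buf so that
		 * its eviction callback can be invoked through the copy
		 * after b_evict_lock has been dropped.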
3478 */ 3479 buf->b_efunc = NULL; 3480 mutex_exit(&buf->b_evict_lock); 3481 VERIFY(copy.b_efunc(&copy) == 0); 3482 return (1); 3483 } 3484 hash_lock = HDR_LOCK(hdr); 3485 mutex_enter(hash_lock); 3486 hdr = buf->b_hdr; 3487 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 3488 3489 ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt); 3490 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 3491 3492 /* 3493 * Pull this buffer off of the hdr 3494 */ 3495 bufp = &hdr->b_buf; 3496 while (*bufp != buf) 3497 bufp = &(*bufp)->b_next; 3498 *bufp = buf->b_next; 3499 3500 ASSERT(buf->b_data != NULL); 3501 arc_buf_destroy(buf, FALSE, FALSE); 3502 3503 if (hdr->b_datacnt == 0) { 3504 arc_state_t *old_state = hdr->b_state; 3505 arc_state_t *evicted_state; 3506 3507 ASSERT(hdr->b_buf == NULL); 3508 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 3509 3510 evicted_state = 3511 (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; 3512 3513 mutex_enter(&old_state->arcs_mtx); 3514 mutex_enter(&evicted_state->arcs_mtx); 3515 3516 arc_change_state(evicted_state, hdr, hash_lock); 3517 ASSERT(HDR_IN_HASH_TABLE(hdr)); 3518 hdr->b_flags |= ARC_IN_HASH_TABLE; 3519 hdr->b_flags &= ~ARC_BUF_AVAILABLE; 3520 3521 mutex_exit(&evicted_state->arcs_mtx); 3522 mutex_exit(&old_state->arcs_mtx); 3523 } 3524 mutex_exit(hash_lock); 3525 mutex_exit(&buf->b_evict_lock); 3526 3527 VERIFY(buf->b_efunc(buf) == 0); 3528 buf->b_efunc = NULL; 3529 buf->b_private = NULL; 3530 buf->b_hdr = NULL; 3531 buf->b_next = NULL; 3532 kmem_cache_free(buf_cache, buf); 3533 return (1); 3534 } 3535 3536 /* 3537 * Release this buffer from the cache, making it an anonymous buffer. This 3538 * must be done after a read and prior to modifying the buffer contents. 3539 * If the buffer has more than one reference, we must make 3540 * a new hdr for the buffer. 3541 */ 3542 void 3543 arc_release(arc_buf_t *buf, void *tag) 3544 { 3545 arc_buf_hdr_t *hdr; 3546 kmutex_t *hash_lock = NULL; 3547 l2arc_buf_hdr_t *l2hdr; 3548 uint64_t buf_size; 3549 3550 /* 3551 * It would be nice to assert that if it's DMU metadata (level > 3552 * 0 || it's the dnode file), then it must be syncing context. 3553 * But we don't know that information at this level. 3554 */ 3555 3556 mutex_enter(&buf->b_evict_lock); 3557 hdr = buf->b_hdr; 3558 3559 /* this buffer is not on any list */ 3560 ASSERT(refcount_count(&hdr->b_refcnt) > 0); 3561 3562 if (hdr->b_state == arc_anon) { 3563 /* this buffer is already released */ 3564 ASSERT(buf->b_efunc == NULL); 3565 } else { 3566 hash_lock = HDR_LOCK(hdr); 3567 mutex_enter(hash_lock); 3568 hdr = buf->b_hdr; 3569 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 3570 } 3571 3572 l2hdr = hdr->b_l2hdr; 3573 if (l2hdr) { 3574 mutex_enter(&l2arc_buflist_mtx); 3575 hdr->b_l2hdr = NULL; 3576 } 3577 buf_size = hdr->b_size; 3578 3579 /* 3580 * Do we have more than one buf? 3581 */ 3582 if (hdr->b_datacnt > 1) { 3583 arc_buf_hdr_t *nhdr; 3584 arc_buf_t **bufp; 3585 uint64_t blksz = hdr->b_size; 3586 uint64_t spa = hdr->b_spa; 3587 arc_buf_contents_t type = hdr->b_type; 3588 uint32_t flags = hdr->b_flags; 3589 3590 ASSERT(hdr->b_buf != buf || buf->b_next != NULL); 3591 /* 3592 * Pull the data off of this hdr and attach it to 3593 * a new anonymous hdr.
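		 *
		 * The released buf gets a freshly allocated anonymous hdr
		 * (nhdr below); any remaining bufs continue to share the
		 * old hdr and its on-disk identity.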
3594 */ 3595 (void) remove_reference(hdr, hash_lock, tag); 3596 bufp = &hdr->b_buf; 3597 while (*bufp != buf) 3598 bufp = &(*bufp)->b_next; 3599 *bufp = buf->b_next; 3600 buf->b_next = NULL; 3601 3602 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size); 3603 atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size); 3604 if (refcount_is_zero(&hdr->b_refcnt)) { 3605 uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type]; 3606 ASSERT3U(*size, >=, hdr->b_size); 3607 atomic_add_64(size, -hdr->b_size); 3608 } 3609 3610 /* 3611 * We're releasing a duplicate user data buffer, update 3612 * our statistics accordingly. 3613 */ 3614 if (hdr->b_type == ARC_BUFC_DATA) { 3615 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); 3616 ARCSTAT_INCR(arcstat_duplicate_buffers_size, 3617 -hdr->b_size); 3618 } 3619 hdr->b_datacnt -= 1; 3620 arc_cksum_verify(buf); 3621 arc_buf_unwatch(buf); 3622 3623 mutex_exit(hash_lock); 3624 3625 nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); 3626 nhdr->b_size = blksz; 3627 nhdr->b_spa = spa; 3628 nhdr->b_type = type; 3629 nhdr->b_buf = buf; 3630 nhdr->b_state = arc_anon; 3631 nhdr->b_arc_access = 0; 3632 nhdr->b_flags = flags & ARC_L2_WRITING; 3633 nhdr->b_l2hdr = NULL; 3634 nhdr->b_datacnt = 1; 3635 nhdr->b_freeze_cksum = NULL; 3636 (void) refcount_add(&nhdr->b_refcnt, tag); 3637 buf->b_hdr = nhdr; 3638 mutex_exit(&buf->b_evict_lock); 3639 atomic_add_64(&arc_anon->arcs_size, blksz); 3640 } else { 3641 mutex_exit(&buf->b_evict_lock); 3642 ASSERT(refcount_count(&hdr->b_refcnt) == 1); 3643 ASSERT(!list_link_active(&hdr->b_arc_node)); 3644 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 3645 if (hdr->b_state != arc_anon) 3646 arc_change_state(arc_anon, hdr, hash_lock); 3647 hdr->b_arc_access = 0; 3648 if (hash_lock) 3649 mutex_exit(hash_lock); 3650 3651 buf_discard_identity(hdr); 3652 arc_buf_thaw(buf); 3653 } 3654 buf->b_efunc = NULL; 3655 buf->b_private = NULL; 3656 3657 if (l2hdr) { 3658 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); 3659 list_remove(l2hdr->b_dev->l2ad_buflist, hdr); 3660 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); 3661 ARCSTAT_INCR(arcstat_l2_size, -buf_size); 3662 mutex_exit(&l2arc_buflist_mtx); 3663 } 3664 } 3665 3666 int 3667 arc_released(arc_buf_t *buf) 3668 { 3669 int released; 3670 3671 mutex_enter(&buf->b_evict_lock); 3672 released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); 3673 mutex_exit(&buf->b_evict_lock); 3674 return (released); 3675 } 3676 3677 int 3678 arc_has_callback(arc_buf_t *buf) 3679 { 3680 int callback; 3681 3682 mutex_enter(&buf->b_evict_lock); 3683 callback = (buf->b_efunc != NULL); 3684 mutex_exit(&buf->b_evict_lock); 3685 return (callback); 3686 } 3687 3688 #ifdef ZFS_DEBUG 3689 int 3690 arc_referenced(arc_buf_t *buf) 3691 { 3692 int referenced; 3693 3694 mutex_enter(&buf->b_evict_lock); 3695 referenced = (refcount_count(&buf->b_hdr->b_refcnt)); 3696 mutex_exit(&buf->b_evict_lock); 3697 return (referenced); 3698 } 3699 #endif 3700 3701 static void 3702 arc_write_ready(zio_t *zio) 3703 { 3704 arc_write_callback_t *callback = zio->io_private; 3705 arc_buf_t *buf = callback->awcb_buf; 3706 arc_buf_hdr_t *hdr = buf->b_hdr; 3707 3708 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt)); 3709 callback->awcb_ready(zio, buf, callback->awcb_private); 3710 3711 /* 3712 * If the IO is already in progress, then this is a re-write 3713 * attempt, so we need to thaw and re-compute the cksum. 3714 * It is the responsibility of the callback to handle the 3715 * accounting for any re-write attempt. 
3716 */ 3717 if (HDR_IO_IN_PROGRESS(hdr)) { 3718 mutex_enter(&hdr->b_freeze_lock); 3719 if (hdr->b_freeze_cksum != NULL) { 3720 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 3721 hdr->b_freeze_cksum = NULL; 3722 } 3723 mutex_exit(&hdr->b_freeze_lock); 3724 } 3725 arc_cksum_compute(buf, B_FALSE); 3726 hdr->b_flags |= ARC_IO_IN_PROGRESS; 3727 } 3728 3729 static void 3730 arc_write_done(zio_t *zio) 3731 { 3732 arc_write_callback_t *callback = zio->io_private; 3733 arc_buf_t *buf = callback->awcb_buf; 3734 arc_buf_hdr_t *hdr = buf->b_hdr; 3735 3736 ASSERT(hdr->b_acb == NULL); 3737 3738 if (zio->io_error == 0) { 3739 hdr->b_dva = *BP_IDENTITY(zio->io_bp); 3740 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); 3741 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; 3742 } else { 3743 ASSERT(BUF_EMPTY(hdr)); 3744 } 3745 3746 /* 3747 * If the block to be written was all-zero, we may have 3748 * compressed it away. In this case no write was performed 3749 * so there will be no dva/birth/checksum. The buffer must 3750 * therefore remain anonymous (and uncached). 3751 */ 3752 if (!BUF_EMPTY(hdr)) { 3753 arc_buf_hdr_t *exists; 3754 kmutex_t *hash_lock; 3755 3756 ASSERT(zio->io_error == 0); 3757 3758 arc_cksum_verify(buf); 3759 3760 exists = buf_hash_insert(hdr, &hash_lock); 3761 if (exists) { 3762 /* 3763 * This can only happen if we overwrite for 3764 * sync-to-convergence, because we remove 3765 * buffers from the hash table when we arc_free(). 3766 */ 3767 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { 3768 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 3769 panic("bad overwrite, hdr=%p exists=%p", 3770 (void *)hdr, (void *)exists); 3771 ASSERT(refcount_is_zero(&exists->b_refcnt)); 3772 arc_change_state(arc_anon, exists, hash_lock); 3773 mutex_exit(hash_lock); 3774 arc_hdr_destroy(exists); 3775 exists = buf_hash_insert(hdr, &hash_lock); 3776 ASSERT3P(exists, ==, NULL); 3777 } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { 3778 /* nopwrite */ 3779 ASSERT(zio->io_prop.zp_nopwrite); 3780 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 3781 panic("bad nopwrite, hdr=%p exists=%p", 3782 (void *)hdr, (void *)exists); 3783 } else { 3784 /* Dedup */ 3785 ASSERT(hdr->b_datacnt == 1); 3786 ASSERT(hdr->b_state == arc_anon); 3787 ASSERT(BP_GET_DEDUP(zio->io_bp)); 3788 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); 3789 } 3790 } 3791 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 3792 /* if it's not anon, we are doing a scrub */ 3793 if (!exists && hdr->b_state == arc_anon) 3794 arc_access(hdr, hash_lock); 3795 mutex_exit(hash_lock); 3796 } else { 3797 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 3798 } 3799 3800 ASSERT(!refcount_is_zero(&hdr->b_refcnt)); 3801 callback->awcb_done(zio, buf, callback->awcb_private); 3802 3803 kmem_free(callback, sizeof (arc_write_callback_t)); 3804 } 3805 3806 zio_t * 3807 arc_write(zio_t *pio, spa_t *spa, uint64_t txg, 3808 blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, 3809 const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *done, 3810 void *private, int priority, int zio_flags, const zbookmark_t *zb) 3811 { 3812 arc_buf_hdr_t *hdr = buf->b_hdr; 3813 arc_write_callback_t *callback; 3814 zio_t *zio; 3815 3816 ASSERT(ready != NULL); 3817 ASSERT(done != NULL); 3818 ASSERT(!HDR_IO_ERROR(hdr)); 3819 ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0); 3820 ASSERT(hdr->b_acb == NULL); 3821 if (l2arc) 3822 hdr->b_flags |= ARC_L2CACHE; 3823 if (l2arc_compress) 3824 hdr->b_flags |= ARC_L2COMPRESS; 3825 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); 3826 
callback->awcb_ready = ready; 3827 callback->awcb_done = done; 3828 callback->awcb_private = private; 3829 callback->awcb_buf = buf; 3830 3831 zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp, 3832 arc_write_ready, arc_write_done, callback, priority, zio_flags, zb); 3833 3834 return (zio); 3835 } 3836 3837 static int 3838 arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg) 3839 { 3840 #ifdef _KERNEL 3841 uint64_t available_memory = ptob(freemem); 3842 static uint64_t page_load = 0; 3843 static uint64_t last_txg = 0; 3844 3845 #if defined(__i386) 3846 available_memory = 3847 MIN(available_memory, vmem_size(heap_arena, VMEM_FREE)); 3848 #endif 3849 if (available_memory >= zfs_write_limit_max) 3850 return (0); 3851 3852 if (txg > last_txg) { 3853 last_txg = txg; 3854 page_load = 0; 3855 } 3856 /* 3857 * If we are in pageout, we know that memory is already tight, 3858 * the arc is already going to be evicting, so we just want to 3859 * continue to let page writes occur as quickly as possible. 3860 */ 3861 if (curproc == proc_pageout) { 3862 if (page_load > MAX(ptob(minfree), available_memory) / 4) 3863 return (SET_ERROR(ERESTART)); 3864 /* Note: reserve is inflated, so we deflate */ 3865 page_load += reserve / 8; 3866 return (0); 3867 } else if (page_load > 0 && arc_reclaim_needed()) { 3868 /* memory is low, delay before restarting */ 3869 ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 3870 return (SET_ERROR(EAGAIN)); 3871 } 3872 page_load = 0; 3873 3874 if (arc_size > arc_c_min) { 3875 uint64_t evictable_memory = 3876 arc_mru->arcs_lsize[ARC_BUFC_DATA] + 3877 arc_mru->arcs_lsize[ARC_BUFC_METADATA] + 3878 arc_mfu->arcs_lsize[ARC_BUFC_DATA] + 3879 arc_mfu->arcs_lsize[ARC_BUFC_METADATA]; 3880 available_memory += MIN(evictable_memory, arc_size - arc_c_min); 3881 } 3882 3883 if (inflight_data > available_memory / 4) { 3884 ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 3885 return (SET_ERROR(ERESTART)); 3886 } 3887 #endif 3888 return (0); 3889 } 3890 3891 void 3892 arc_tempreserve_clear(uint64_t reserve) 3893 { 3894 atomic_add_64(&arc_tempreserve, -reserve); 3895 ASSERT((int64_t)arc_tempreserve >= 0); 3896 } 3897 3898 int 3899 arc_tempreserve_space(uint64_t reserve, uint64_t txg) 3900 { 3901 int error; 3902 uint64_t anon_size; 3903 3904 #ifdef ZFS_DEBUG 3905 /* 3906 * Once in a while, fail for no reason. Everything should cope. 3907 */ 3908 if (spa_get_random(10000) == 0) { 3909 dprintf("forcing random failure\n"); 3910 return (SET_ERROR(ERESTART)); 3911 } 3912 #endif 3913 if (reserve > arc_c/4 && !arc_no_grow) 3914 arc_c = MIN(arc_c_max, reserve * 4); 3915 if (reserve > arc_c) 3916 return (SET_ERROR(ENOMEM)); 3917 3918 /* 3919 * Don't count loaned bufs as in flight dirty data to prevent long 3920 * network delays from blocking transactions that are ready to be 3921 * assigned to a txg. 3922 */ 3923 anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0); 3924 3925 /* 3926 * Writes will, almost always, require additional memory allocations 3927 * in order to compress/encrypt/etc the data. We therefore need to 3928 * make sure that there is sufficient available memory for this. 3929 */ 3930 if (error = arc_memory_throttle(reserve, anon_size, txg)) 3931 return (error); 3932 3933 /* 3934 * Throttle writes when the amount of dirty data in the cache 3935 * gets too large. We try to keep the cache less than half full 3936 * of dirty blocks so that our sync times don't grow too large. 
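 *
 * As a hedged, illustrative example (the numbers here are hypothetical
 * and not a tuning recommendation): with arc_c at 4 GB, the check below
 * fails a reservation once anon_size exceeds arc_c / 4 (1 GB) and
 * reserve + arc_tempreserve + anon_size exceeds arc_c / 2 (2 GB). So
 * reserve = 256 MB, arc_tempreserve = 512 MB and anon_size = 1.5 GB
 * gives 2.25 GB > 2 GB with 1.5 GB > 1 GB, and the caller gets
 * ERESTART and must retry the reservation later.
 *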
3937 * Note: if two requests come in concurrently, we might let them 3938 * both succeed, when one of them should fail. Not a huge deal. 3939 */ 3940 3941 if (reserve + arc_tempreserve + anon_size > arc_c / 2 && 3942 anon_size > arc_c / 4) { 3943 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " 3944 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", 3945 arc_tempreserve>>10, 3946 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, 3947 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, 3948 reserve>>10, arc_c>>10); 3949 return (SET_ERROR(ERESTART)); 3950 } 3951 atomic_add_64(&arc_tempreserve, reserve); 3952 return (0); 3953 } 3954 3955 void 3956 arc_init(void) 3957 { 3958 mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); 3959 cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); 3960 3961 /* Convert seconds to clock ticks */ 3962 arc_min_prefetch_lifespan = 1 * hz; 3963 3964 /* Start out with 1/8 of all memory */ 3965 arc_c = physmem * PAGESIZE / 8; 3966 3967 #ifdef _KERNEL 3968 /* 3969 * On architectures where the physical memory can be larger 3970 * than the addressable space (intel in 32-bit mode), we may 3971 * need to limit the cache to 1/8 of VM size. 3972 */ 3973 arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); 3974 #endif 3975 3976 /* set min cache to 1/32 of all memory, or 64MB, whichever is more */ 3977 arc_c_min = MAX(arc_c / 4, 64<<20); 3978 /* set max to 3/4 of all memory, or all but 1GB, whichever is more */ 3979 if (arc_c * 8 >= 1<<30) 3980 arc_c_max = (arc_c * 8) - (1<<30); 3981 else 3982 arc_c_max = arc_c_min; 3983 arc_c_max = MAX(arc_c * 6, arc_c_max); 3984 3985 /* 3986 * Allow the tunables to override our calculations if they are 3987 * reasonable (ie. over 64MB) 3988 */ 3989 if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE) 3990 arc_c_max = zfs_arc_max; 3991 if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max) 3992 arc_c_min = zfs_arc_min; 3993 3994 arc_c = arc_c_max; 3995 arc_p = (arc_c >> 1); 3996 3997 /* limit meta-data to 1/4 of the arc capacity */ 3998 arc_meta_limit = arc_c_max / 4; 3999 4000 /* Allow the tunable to override if it is reasonable */ 4001 if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) 4002 arc_meta_limit = zfs_arc_meta_limit; 4003 4004 if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) 4005 arc_c_min = arc_meta_limit / 2; 4006 4007 if (zfs_arc_grow_retry > 0) 4008 arc_grow_retry = zfs_arc_grow_retry; 4009 4010 if (zfs_arc_shrink_shift > 0) 4011 arc_shrink_shift = zfs_arc_shrink_shift; 4012 4013 if (zfs_arc_p_min_shift > 0) 4014 arc_p_min_shift = zfs_arc_p_min_shift; 4015 4016 /* if kmem_flags are set, lets try to use less memory */ 4017 if (kmem_debugging()) 4018 arc_c = arc_c / 2; 4019 if (arc_c < arc_c_min) 4020 arc_c = arc_c_min; 4021 4022 arc_anon = &ARC_anon; 4023 arc_mru = &ARC_mru; 4024 arc_mru_ghost = &ARC_mru_ghost; 4025 arc_mfu = &ARC_mfu; 4026 arc_mfu_ghost = &ARC_mfu_ghost; 4027 arc_l2c_only = &ARC_l2c_only; 4028 arc_size = 0; 4029 4030 mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 4031 mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 4032 mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 4033 mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 4034 mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 4035 mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 4036 4037 list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], 4038 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 4039 
list_create(&arc_mru->arcs_list[ARC_BUFC_DATA], 4040 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 4041 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], 4042 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 4043 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], 4044 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 4045 list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], 4046 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 4047 list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], 4048 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 4049 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], 4050 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 4051 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], 4052 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 4053 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], 4054 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 4055 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], 4056 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 4057 4058 buf_init(); 4059 4060 arc_thread_exit = 0; 4061 arc_eviction_list = NULL; 4062 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); 4063 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); 4064 4065 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, 4066 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 4067 4068 if (arc_ksp != NULL) { 4069 arc_ksp->ks_data = &arc_stats; 4070 kstat_install(arc_ksp); 4071 } 4072 4073 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, 4074 TS_RUN, minclsyspri); 4075 4076 arc_dead = FALSE; 4077 arc_warm = B_FALSE; 4078 4079 if (zfs_write_limit_max == 0) 4080 zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift; 4081 else 4082 zfs_write_limit_shift = 0; 4083 mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL); 4084 } 4085 4086 void 4087 arc_fini(void) 4088 { 4089 mutex_enter(&arc_reclaim_thr_lock); 4090 arc_thread_exit = 1; 4091 while (arc_thread_exit != 0) 4092 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); 4093 mutex_exit(&arc_reclaim_thr_lock); 4094 4095 arc_flush(NULL); 4096 4097 arc_dead = TRUE; 4098 4099 if (arc_ksp != NULL) { 4100 kstat_delete(arc_ksp); 4101 arc_ksp = NULL; 4102 } 4103 4104 mutex_destroy(&arc_eviction_mtx); 4105 mutex_destroy(&arc_reclaim_thr_lock); 4106 cv_destroy(&arc_reclaim_thr_cv); 4107 4108 list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); 4109 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); 4110 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); 4111 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); 4112 list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); 4113 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); 4114 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); 4115 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); 4116 4117 mutex_destroy(&arc_anon->arcs_mtx); 4118 mutex_destroy(&arc_mru->arcs_mtx); 4119 mutex_destroy(&arc_mru_ghost->arcs_mtx); 4120 mutex_destroy(&arc_mfu->arcs_mtx); 4121 mutex_destroy(&arc_mfu_ghost->arcs_mtx); 4122 mutex_destroy(&arc_l2c_only->arcs_mtx); 4123 4124 mutex_destroy(&zfs_write_limit_lock); 4125 4126 buf_fini(); 4127 4128 ASSERT(arc_loaned_bytes == 0); 4129 } 4130 4131 /* 4132 * Level 2 ARC 4133 * 4134 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. 
4135 * It uses dedicated storage devices to hold cached data, which are populated 4136 * using large infrequent writes. The main role of this cache is to boost 4137 * the performance of random read workloads. The intended L2ARC devices 4138 * include short-stroked disks, solid state disks, and other media with 4139 * substantially faster read latency than disk. 4140 * 4141 * +-----------------------+ 4142 * | ARC | 4143 * +-----------------------+ 4144 * | ^ ^ 4145 * | | | 4146 * l2arc_feed_thread() arc_read() 4147 * | | | 4148 * | l2arc read | 4149 * V | | 4150 * +---------------+ | 4151 * | L2ARC | | 4152 * +---------------+ | 4153 * | ^ | 4154 * l2arc_write() | | 4155 * | | | 4156 * V | | 4157 * +-------+ +-------+ 4158 * | vdev | | vdev | 4159 * | cache | | cache | 4160 * +-------+ +-------+ 4161 * +=========+ .-----. 4162 * : L2ARC : |-_____-| 4163 * : devices : | Disks | 4164 * +=========+ `-_____-' 4165 * 4166 * Read requests are satisfied from the following sources, in order: 4167 * 4168 * 1) ARC 4169 * 2) vdev cache of L2ARC devices 4170 * 3) L2ARC devices 4171 * 4) vdev cache of disks 4172 * 5) disks 4173 * 4174 * Some L2ARC device types exhibit extremely slow write performance. 4175 * To accommodate for this there are some significant differences between 4176 * the L2ARC and traditional cache design: 4177 * 4178 * 1. There is no eviction path from the ARC to the L2ARC. Evictions from 4179 * the ARC behave as usual, freeing buffers and placing headers on ghost 4180 * lists. The ARC does not send buffers to the L2ARC during eviction as 4181 * this would add inflated write latencies for all ARC memory pressure. 4182 * 4183 * 2. The L2ARC attempts to cache data from the ARC before it is evicted. 4184 * It does this by periodically scanning buffers from the eviction-end of 4185 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are 4186 * not already there. It scans until a headroom of buffers is satisfied, 4187 * which itself is a buffer for ARC eviction. If a compressible buffer is 4188 * found during scanning and selected for writing to an L2ARC device, we 4189 * temporarily boost scanning headroom during the next scan cycle to make 4190 * sure we adapt to compression effects (which might significantly reduce 4191 * the data volume we write to L2ARC). The thread that does this is 4192 * l2arc_feed_thread(), illustrated below; example sizes are included to 4193 * provide a better sense of ratio than this diagram: 4194 * 4195 * head --> tail 4196 * +---------------------+----------+ 4197 * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC 4198 * +---------------------+----------+ | o L2ARC eligible 4199 * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer 4200 * +---------------------+----------+ | 4201 * 15.9 Gbytes ^ 32 Mbytes | 4202 * headroom | 4203 * l2arc_feed_thread() 4204 * | 4205 * l2arc write hand <--[oooo]--' 4206 * | 8 Mbyte 4207 * | write max 4208 * V 4209 * +==============================+ 4210 * L2ARC dev |####|#|###|###| |####| ... | 4211 * +==============================+ 4212 * 32 Gbytes 4213 * 4214 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of 4215 * evicted, then the L2ARC has cached a buffer much sooner than it probably 4216 * needed to, potentially wasting L2ARC device bandwidth and storage. It is 4217 * safe to say that this is an uncommon case, since buffers at the end of 4218 * the ARC lists have moved there due to inactivity. 4219 * 4220 * 4. 
If the ARC evicts faster than the L2ARC can maintain a headroom, 4221 * then the L2ARC simply misses copying some buffers. This serves as a 4222 * pressure valve to prevent heavy read workloads from both stalling the ARC 4223 * with waits and clogging the L2ARC with writes. This also helps prevent 4224 * the potential for the L2ARC to churn if it attempts to cache content too 4225 * quickly, such as during backups of the entire pool. 4226 * 4227 * 5. After system boot and before the ARC has filled main memory, there are 4228 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru 4229 * lists can remain mostly static. Instead of searching from tail of these 4230 * lists as pictured, the l2arc_feed_thread() will search from the list heads 4231 * for eligible buffers, greatly increasing its chance of finding them. 4232 * 4233 * The L2ARC device write speed is also boosted during this time so that 4234 * the L2ARC warms up faster. Since there have been no ARC evictions yet, 4235 * there are no L2ARC reads, and no fear of degrading read performance 4236 * through increased writes. 4237 * 4238 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that 4239 * the vdev queue can aggregate them into larger and fewer writes. Each 4240 * device is written to in a rotor fashion, sweeping writes through 4241 * available space then repeating. 4242 * 4243 * 7. The L2ARC does not store dirty content. It never needs to flush 4244 * write buffers back to disk based storage. 4245 * 4246 * 8. If an ARC buffer is written (and dirtied) which also exists in the 4247 * L2ARC, the now stale L2ARC buffer is immediately dropped. 4248 * 4249 * The performance of the L2ARC can be tweaked by a number of tunables, which 4250 * may be necessary for different workloads: 4251 * 4252 * l2arc_write_max max write bytes per interval 4253 * l2arc_write_boost extra write bytes during device warmup 4254 * l2arc_noprefetch skip caching prefetched buffers 4255 * l2arc_headroom number of max device writes to precache 4256 * l2arc_headroom_boost when we find compressed buffers during ARC 4257 * scanning, we multiply headroom by this 4258 * percentage factor for the next scan cycle, 4259 * since more compressed buffers are likely to 4260 * be present 4261 * l2arc_feed_secs seconds between L2ARC writing 4262 * 4263 * Tunables may be removed or added as future performance improvements are 4264 * integrated, and also may become zpool properties. 4265 * 4266 * There are three key functions that control how the L2ARC warms up: 4267 * 4268 * l2arc_write_eligible() check if a buffer is eligible to cache 4269 * l2arc_write_size() calculate how much to write 4270 * l2arc_write_interval() calculate sleep delay between writes 4271 * 4272 * These three functions determine what to write, how much, and how quickly 4273 * to send writes. 4274 * 4275 * L2ARC persistency: 4276 * 4277 * When writing buffers to L2ARC, we periodically add some metadata to 4278 * make sure we can pick them up after reboot, thus dramatically reducing 4279 * the impact that any downtime has on the performance of storage systems 4280 * with large caches. 4281 * 4282 * The implementation works fairly simply by integrating the following two 4283 * modifications: 4284 * 4285 * *) Every now and then, at end of an L2ARC feed cycle, we append a piece 4286 * of metadata (called a "pbuf", or "persistency buffer") to the L2ARC 4287 * write. 
This allows us to understand what's been written, so that 4288 * we can rebuild the arc_buf_hdr_t structures of the main ARC buffers. 4289 * The pbuf also includes a "back-reference" pointer to the previous 4290 * pbuf, forming a linked list of pbufs on the L2ARC device. 4291 * 4292 * *) We reserve 4k of space at the start of each L2ARC device for our 4293 * header bookkeeping purposes. This contains a single 4k uberblock, which 4294 * contains our top-level reference structures. We update it on each pbuf 4295 * write. If this write results in an inconsistent uberblock (e.g. due to 4296 * power failure), we detect this by verifying the uberblock's checksum 4297 * and simply drop the entries from L2ARC. Once an L2ARC pbuf update 4298 * completes, we update the uberblock to point to it. 4299 * 4300 * Implementation diagram: 4301 * 4302 * +=== L2ARC device (not to scale) ======================================+ 4303 * | ____________newest pbuf pointer_____________ | 4304 * | / \ | 4305 * | / V | 4306 * ||l2uberblock|---|bufs|pbuf|bufs|pbuf|bufs|pbuf|bufs|pbuf|---(empty)---| 4307 * | ^ / ^ / ^ / | 4308 * | `-prev-' `-prev-' `-prev-' | 4309 * | pbuf pbuf pbuf | 4310 * +======================================================================+ 4311 * 4312 * On-device data structures: 4313 * 4314 * (L2ARC persistent uberblock) 4315 * struct l2uberblock { 4316 * (these fields are in network byte order) 4317 * uint32_t magic = 0x12bab10c; l2-ber-block 4318 * uint8_t version = 0x1; 4319 * uint8_t reserved = 0x0; 4320 * uint16_t ublk_flags; see l2uberblock_flags_t 4321 * 4322 * (byte order of fields below determined by `ublk_flags') 4323 * uint64_t spa_guid; what pool this l2arc dev belongs to 4324 * uint64_t birth_txg; ublk with highest birth_txg is newest 4325 * uint64_t evict_tail; current evict pointer on l2arc dev 4326 * uint64_t alloc_space; how much space is alloc'd on the dev 4327 * uint64_t pbuf_daddr; dev addr of the newest l2pbuf_t 4328 * uint32_t pbuf_asize; size of newest pbuf 4329 * uint64_t pbuf_cksum[4]; fletcher4 of newest pbuf 4330 * 4331 * uint8_t reserved[3996] = {0x0, 0x0, ...
0x0}; 4332 * 4333 * uint64_t ublk_cksum[4] = fletcher4(of the 4064 bytes above); 4334 * } l2dev_uberblock; 4335 * 4336 * (L2ARC persistent buffer list) 4337 * typedef struct l2pbuf_t { 4338 * (these fields are in network byte order) 4339 * uint32_t magic = 0xdb0faba6; the-buffer-bag 4340 * uint8_t version = 0x1; 4341 * uint8_t reserved = 0x0; 4342 * uint16_t pbuf_flags; see l2pbuf_flags_t 4343 * 4344 * (byte order of fields below determined by `pbuf_flags') 4345 * uint64_t prev_pbuf_daddr; previous pbuf dev addr 4346 * uint32_t prev_pbuf_asize; previous pbuf size 4347 * uint64_t prev_pbuf_cksum[4]; fletcher4(of previous pbuf) 4348 * 4349 * uint32_t items_size; uncompressed size of `items' below 4350 * (if (pbuf_flags & compress) decompress `items' prior to decoding) 4351 * struct l2pbuf_buf_item { 4352 * (these fields mirror [l2]arc_buf_hdr fields) 4353 * uint64_t dva[2]; buffer's DVA 4354 * uint64_t birth; buffer's birth TXG in ARC 4355 * uint64_t cksum0; lower 64-bits of buffer's cksum 4356 * uint64_t freeze_cksum[4]; buffer's freeze cksum 4357 * uint32_t size; uncompressed buffer data size 4358 * uint64_t l2daddr; device address (offset) of buf 4359 * uint32_t l2asize; actual space occupied by buf 4360 * uint8_t compress; compress algo used on data 4361 * uint8_t contents_type; buffer's contents type 4362 * uint16_t reserved = 0x0; for alignment and future use 4363 * uint32_t flags; buffer's persistent flags 4364 * } items[]; continues for remainder of pbuf 4365 * } l2pbuf_t; 4366 * 4367 * L2ARC reconstruction: 4368 * 4369 * When writing data, we simply write in the standard rotary fashion, 4370 * evicting buffers as we go and simply writing new data over them (appending 4371 * an updated l2pbuf_t every now and then). This obviously means that once we 4372 * loop around the end of the device, we will start cutting into an already 4373 * committed l2pbuf (and its referenced data buffers), like so: 4374 * 4375 * current write head__ __old tail 4376 * \ / 4377 * V V 4378 * <--|bufs|pbuf|bufs|pbuf| |bufs|pbuf|bufs|pbuf|--> 4379 * ^ ^^^^^^^^^_____________________________ 4380 * | \ 4381 * <<nextwrite>> - will overwrite this pbuf --/ 4382 * 4383 * When importing the pool, we detect this situation and use it to stop 4384 * our scanning process: 4385 * 1) Let `this_pbuf' refer to the current l2pbuf_t and `prev_pbuf' to the 4386 * previous one. 4387 * 2) if (fletcher4(prev_pbuf) != this_pbuf->prev_pbuf_cksum) 4388 * then `prev_pbuf' is invalid and stop scanning (goto step 3 below). 4389 * 3) if (this is the last valid pbuf) 4390 * discard this pbuf as well (its ARC bufs may have been damaged by a 4391 * partial overwrite). 4392 * (We could potentially salvage the remaining good arc bufs above in step 3, 4393 * but the cost of doing so probably outweighs the value of the entire pbuf). 4394 * 4395 * There is one significant caveat to consider when rebuilding ARC contents 4396 * from an L2ARC device: what about invalidated buffers? Given the above 4397 * construction, we cannot update pbufs which we've already written to amend 4398 * them to remove buffers which were invalidated. Thus, during reconstruction, 4399 * we might be populating the cache with buffers for data that's not on the 4400 * main pool anymore, or may have been overwritten! 4401 * 4402 * As it turns out, this isn't a problem. Every arc_read request includes 4403 * both the DVA and, crucially, the birth TXG of the BP the caller is 4404 * looking for.
So even if the cache were populated by completely rotten 4405 * blocks for data that had been long deleted and/or overwritten, we'll 4406 * never actually return bad data from the cache, since the DVA with the 4407 * birth TXG uniquely identify a block in space and time - once created, 4408 * a block is immutable on disk. The worst thing we have done is wasted 4409 * some time and memory at l2arc rebuild to reconstruct outdated ARC 4410 * entries that will get dropped from the l2arc as it is being updated 4411 * with new blocks. 4412 */ 4413 4414 static boolean_t 4415 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab) 4416 { 4417 /* 4418 * A buffer is *not* eligible for the L2ARC if it: 4419 * 1. belongs to a different spa. 4420 * 2. is already cached on the L2ARC. 4421 * 3. has an I/O in progress (it may be an incomplete read). 4422 * 4. is flagged not eligible (zfs property). 4423 */ 4424 if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL || 4425 HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab)) 4426 return (B_FALSE); 4427 4428 return (B_TRUE); 4429 } 4430 4431 static uint64_t 4432 l2arc_write_size(void) 4433 { 4434 uint64_t size; 4435 4436 /* 4437 * Make sure our globals have meaningful values in case the user 4438 * altered them. 4439 */ 4440 size = l2arc_write_max; 4441 if (size == 0) { 4442 cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " 4443 "be greater than zero, resetting it to the default (%d)", 4444 L2ARC_WRITE_SIZE); 4445 size = l2arc_write_max = L2ARC_WRITE_SIZE; 4446 } 4447 4448 if (arc_warm == B_FALSE) 4449 size += l2arc_write_boost; 4450 4451 return (size); 4452 4453 } 4454 4455 static clock_t 4456 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) 4457 { 4458 clock_t interval, next, now; 4459 4460 /* 4461 * If the ARC lists are busy, increase our write rate; if the 4462 * lists are stale, idle back. This is achieved by checking 4463 * how much we previously wrote - if it was more than half of 4464 * what we wanted, schedule the next write much sooner. 4465 */ 4466 if (l2arc_feed_again && wrote > (wanted / 2)) 4467 interval = (hz * l2arc_feed_min_ms) / 1000; 4468 else 4469 interval = hz * l2arc_feed_secs; 4470 4471 now = ddi_get_lbolt(); 4472 next = MAX(now, MIN(now + interval, began + interval)); 4473 4474 return (next); 4475 } 4476 4477 static void 4478 l2arc_hdr_stat_add(boolean_t from_arc) 4479 { 4480 ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE); 4481 if (from_arc) 4482 ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); 4483 } 4484 4485 static void 4486 l2arc_hdr_stat_remove(void) 4487 { 4488 ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE)); 4489 ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); 4490 } 4491 4492 /* 4493 * Cycle through L2ARC devices. This is how L2ARC load balances. 4494 * If a device is returned, this also returns holding the spa config lock. 4495 */ 4496 static l2arc_dev_t * 4497 l2arc_dev_get_next(void) 4498 { 4499 l2arc_dev_t *first, *next = NULL; 4500 4501 /* 4502 * Lock out the removal of spas (spa_namespace_lock), then removal 4503 * of cache devices (l2arc_dev_mtx). Once a device has been selected, 4504 * both locks will be dropped and a spa config lock held instead. 
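 *
 * A minimal caller sketch, mirroring l2arc_feed_thread() below (the
 * body of the loop is elided); the caller is responsible for dropping
 * the config lock once it has finished writing to the device:
 *
 *	if ((dev = l2arc_dev_get_next()) != NULL) {
 *		spa_t *spa = dev->l2ad_spa;
 *		... evict and write buffers to dev ...
 *		spa_config_exit(spa, SCL_L2ARC, dev);
 *	}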
4505 */ 4506 mutex_enter(&spa_namespace_lock); 4507 mutex_enter(&l2arc_dev_mtx); 4508 4509 /* if there are no vdevs, there is nothing to do */ 4510 if (l2arc_ndev == 0) 4511 goto out; 4512 4513 first = NULL; 4514 next = l2arc_dev_last; 4515 do { 4516 /* 4517 * Loop around the list looking for a non-faulted vdev 4518 * and one that isn't currently doing an L2ARC rebuild. 4519 */ 4520 if (next == NULL) { 4521 next = list_head(l2arc_dev_list); 4522 } else { 4523 next = list_next(l2arc_dev_list, next); 4524 if (next == NULL) 4525 next = list_head(l2arc_dev_list); 4526 } 4527 4528 /* if we have come back to the start, bail out */ 4529 if (first == NULL) 4530 first = next; 4531 else if (next == first) 4532 break; 4533 4534 } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuilding); 4535 4536 /* if we were unable to find any usable vdevs, return NULL */ 4537 if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuilding) 4538 next = NULL; 4539 4540 l2arc_dev_last = next; 4541 4542 out: 4543 mutex_exit(&l2arc_dev_mtx); 4544 4545 /* 4546 * Grab the config lock to prevent the 'next' device from being 4547 * removed while we are writing to it. 4548 */ 4549 if (next != NULL) 4550 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); 4551 mutex_exit(&spa_namespace_lock); 4552 4553 return (next); 4554 } 4555 4556 /* 4557 * Free buffers that were tagged for destruction. 4558 */ 4559 static void 4560 l2arc_do_free_on_write() 4561 { 4562 list_t *buflist; 4563 l2arc_data_free_t *df, *df_prev; 4564 4565 mutex_enter(&l2arc_free_on_write_mtx); 4566 buflist = l2arc_free_on_write; 4567 4568 for (df = list_tail(buflist); df; df = df_prev) { 4569 df_prev = list_prev(buflist, df); 4570 ASSERT(df->l2df_data != NULL); 4571 ASSERT(df->l2df_func != NULL); 4572 df->l2df_func(df->l2df_data, df->l2df_size); 4573 list_remove(buflist, df); 4574 kmem_free(df, sizeof (l2arc_data_free_t)); 4575 } 4576 4577 mutex_exit(&l2arc_free_on_write_mtx); 4578 } 4579 4580 /* 4581 * A write to a cache device has completed. Update all headers to allow 4582 * reads from these buffers to begin. 4583 */ 4584 static void 4585 l2arc_write_done(zio_t *zio) 4586 { 4587 l2arc_write_callback_t *cb; 4588 l2arc_dev_t *dev; 4589 list_t *buflist; 4590 arc_buf_hdr_t *head, *ab, *ab_prev; 4591 l2arc_buf_hdr_t *abl2; 4592 kmutex_t *hash_lock; 4593 4594 cb = zio->io_private; 4595 ASSERT(cb != NULL); 4596 dev = cb->l2wcb_dev; 4597 ASSERT(dev != NULL); 4598 head = cb->l2wcb_head; 4599 ASSERT(head != NULL); 4600 buflist = dev->l2ad_buflist; 4601 ASSERT(buflist != NULL); 4602 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, 4603 l2arc_write_callback_t *, cb); 4604 4605 if (zio->io_error != 0) 4606 ARCSTAT_BUMP(arcstat_l2_writes_error); 4607 4608 mutex_enter(&l2arc_buflist_mtx); 4609 4610 /* 4611 * All writes completed, or an error was hit. 4612 */ 4613 for (ab = list_prev(buflist, head); ab; ab = ab_prev) { 4614 ab_prev = list_prev(buflist, ab); 4615 abl2 = ab->b_l2hdr; 4616 4617 /* 4618 * Release the temporary compressed buffer as soon as possible. 4619 */ 4620 if (abl2->b_compress != ZIO_COMPRESS_OFF) 4621 l2arc_release_cdata_buf(ab); 4622 4623 hash_lock = HDR_LOCK(ab); 4624 if (!mutex_tryenter(hash_lock)) { 4625 /* 4626 * This buffer misses out. It may be in a stage 4627 * of eviction. Its ARC_L2_WRITING flag will be 4628 * left set, denying reads to this buffer. 4629 */ 4630 ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss); 4631 continue; 4632 } 4633 4634 if (zio->io_error != 0) { 4635 /* 4636 * Error - drop L2ARC entry. 
4637 */ 4638 list_remove(buflist, ab); 4639 ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize); 4640 ab->b_l2hdr = NULL; 4641 kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); 4642 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); 4643 } 4644 4645 /* 4646 * Allow ARC to begin reads to this L2ARC entry. 4647 */ 4648 ab->b_flags &= ~ARC_L2_WRITING; 4649 4650 mutex_exit(hash_lock); 4651 } 4652 4653 atomic_inc_64(&l2arc_writes_done); 4654 list_remove(buflist, head); 4655 kmem_cache_free(hdr_cache, head); 4656 mutex_exit(&l2arc_buflist_mtx); 4657 4658 l2arc_do_free_on_write(); 4659 4660 if (cb->l2wcb_pbuf) 4661 kmem_free(cb->l2wcb_pbuf, cb->l2wcb_pbuf_size); 4662 if (cb->l2wcb_ub_buf) 4663 kmem_free(cb->l2wcb_ub_buf, L2UBERBLOCK_SIZE); 4664 kmem_free(cb, sizeof (l2arc_write_callback_t)); 4665 } 4666 4667 /* 4668 * A read to a cache device completed. Validate buffer contents before 4669 * handing over to the regular ARC routines. 4670 */ 4671 static void 4672 l2arc_read_done(zio_t *zio) 4673 { 4674 l2arc_read_callback_t *cb; 4675 arc_buf_hdr_t *hdr; 4676 arc_buf_t *buf; 4677 kmutex_t *hash_lock; 4678 int equal; 4679 4680 ASSERT(zio->io_vd != NULL); 4681 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); 4682 4683 spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); 4684 4685 cb = zio->io_private; 4686 ASSERT(cb != NULL); 4687 buf = cb->l2rcb_buf; 4688 ASSERT(buf != NULL); 4689 4690 hash_lock = HDR_LOCK(buf->b_hdr); 4691 mutex_enter(hash_lock); 4692 hdr = buf->b_hdr; 4693 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 4694 4695 /* 4696 * If the buffer was compressed, decompress it first. 4697 */ 4698 if (cb->l2rcb_compress != ZIO_COMPRESS_OFF) 4699 l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress); 4700 ASSERT(zio->io_data != NULL); 4701 4702 /* 4703 * Check this survived the L2ARC journey. 4704 */ 4705 equal = arc_cksum_equal(buf); 4706 if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { 4707 mutex_exit(hash_lock); 4708 zio->io_private = buf; 4709 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ 4710 zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ 4711 arc_read_done(zio); 4712 } else { 4713 mutex_exit(hash_lock); 4714 /* 4715 * Buffer didn't survive caching. Increment stats and 4716 * reissue to the original storage device. 4717 */ 4718 if (zio->io_error != 0) { 4719 ARCSTAT_BUMP(arcstat_l2_io_error); 4720 } else { 4721 zio->io_error = SET_ERROR(EIO); 4722 } 4723 if (!equal) 4724 ARCSTAT_BUMP(arcstat_l2_cksum_bad); 4725 4726 /* 4727 * If there's no waiter, issue an async i/o to the primary 4728 * storage now. If there *is* a waiter, the caller must 4729 * issue the i/o in a context where it's OK to block. 4730 */ 4731 if (zio->io_waiter == NULL) { 4732 zio_t *pio = zio_unique_parent(zio); 4733 4734 ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); 4735 4736 zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp, 4737 buf->b_data, zio->io_size, arc_read_done, buf, 4738 zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); 4739 } 4740 } 4741 4742 kmem_free(cb, sizeof (l2arc_read_callback_t)); 4743 } 4744 4745 /* 4746 * This is the list priority from which the L2ARC will search for pages to 4747 * cache. This is used within loops (0..3) to cycle through lists in the 4748 * desired order. This order can have a significant effect on cache 4749 * performance. 4750 * 4751 * Currently the metadata lists are hit first, MFU then MRU, followed by 4752 * the data lists. This function returns a locked list, and also returns 4753 * the lock pointer. 
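 *
 * A minimal usage sketch (this is how l2arc_write_buffers() below
 * drives it); the caller must drop the returned lock when it is done
 * walking the list:
 *
 *	kmutex_t *list_lock;
 *	for (int try = 0; try <= 3; try++) {
 *		list_t *list = l2arc_list_locked(try, &list_lock);
 *		... scan buffers on list ...
 *		mutex_exit(list_lock);
 *	}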
4754 */ 4755 static list_t * 4756 l2arc_list_locked(int list_num, kmutex_t **lock) 4757 { 4758 list_t *list = NULL; 4759 4760 ASSERT(list_num >= 0 && list_num <= 3); 4761 4762 switch (list_num) { 4763 case 0: 4764 list = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; 4765 *lock = &arc_mfu->arcs_mtx; 4766 break; 4767 case 1: 4768 list = &arc_mru->arcs_list[ARC_BUFC_METADATA]; 4769 *lock = &arc_mru->arcs_mtx; 4770 break; 4771 case 2: 4772 list = &arc_mfu->arcs_list[ARC_BUFC_DATA]; 4773 *lock = &arc_mfu->arcs_mtx; 4774 break; 4775 case 3: 4776 list = &arc_mru->arcs_list[ARC_BUFC_DATA]; 4777 *lock = &arc_mru->arcs_mtx; 4778 break; 4779 } 4780 4781 ASSERT(!(MUTEX_HELD(*lock))); 4782 mutex_enter(*lock); 4783 return (list); 4784 } 4785 4786 /* 4787 * Evict buffers from the device write hand to the distance specified in 4788 * bytes. This distance may span populated buffers, it may span nothing. 4789 * This is clearing a region on the L2ARC device ready for writing. 4790 * If the 'all' boolean is set, every buffer is evicted. 4791 */ 4792 static void 4793 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) 4794 { 4795 list_t *buflist; 4796 l2arc_buf_hdr_t *abl2; 4797 arc_buf_hdr_t *ab, *ab_prev; 4798 kmutex_t *hash_lock; 4799 uint64_t taddr; 4800 4801 buflist = dev->l2ad_buflist; 4802 4803 if (buflist == NULL) 4804 return; 4805 4806 if (!all && dev->l2ad_first) { 4807 /* 4808 * This is the first sweep through the device. There is 4809 * nothing to evict. 4810 */ 4811 return; 4812 } 4813 4814 if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { 4815 /* 4816 * When nearing the end of the device, evict to the end 4817 * before the device write hand jumps to the start. 4818 */ 4819 taddr = dev->l2ad_end; 4820 } else { 4821 taddr = dev->l2ad_hand + distance; 4822 } 4823 DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, 4824 uint64_t, taddr, boolean_t, all); 4825 4826 top: 4827 mutex_enter(&l2arc_buflist_mtx); 4828 for (ab = list_tail(buflist); ab; ab = ab_prev) { 4829 ab_prev = list_prev(buflist, ab); 4830 4831 hash_lock = HDR_LOCK(ab); 4832 if (!mutex_tryenter(hash_lock)) { 4833 /* 4834 * Missed the hash lock. Retry. 4835 */ 4836 ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); 4837 mutex_exit(&l2arc_buflist_mtx); 4838 mutex_enter(hash_lock); 4839 mutex_exit(hash_lock); 4840 goto top; 4841 } 4842 4843 if (HDR_L2_WRITE_HEAD(ab)) { 4844 /* 4845 * We hit a write head node. Leave it for 4846 * l2arc_write_done(). 4847 */ 4848 list_remove(buflist, ab); 4849 mutex_exit(hash_lock); 4850 continue; 4851 } 4852 4853 if (!all && ab->b_l2hdr != NULL && 4854 (ab->b_l2hdr->b_daddr > taddr || 4855 ab->b_l2hdr->b_daddr < dev->l2ad_hand)) { 4856 /* 4857 * We've evicted to the target address, 4858 * or the end of the device. 4859 */ 4860 mutex_exit(hash_lock); 4861 break; 4862 } 4863 4864 if (HDR_FREE_IN_PROGRESS(ab)) { 4865 /* 4866 * Already on the path to destruction. 4867 */ 4868 mutex_exit(hash_lock); 4869 continue; 4870 } 4871 4872 if (ab->b_state == arc_l2c_only) { 4873 ASSERT(!HDR_L2_READING(ab)); 4874 /* 4875 * This doesn't exist in the ARC. Destroy. 4876 * arc_hdr_destroy() will call list_remove() 4877 * and decrement arcstat_l2_size. 4878 */ 4879 arc_change_state(arc_anon, ab, hash_lock); 4880 arc_hdr_destroy(ab); 4881 } else { 4882 /* 4883 * Invalidate issued or about to be issued 4884 * reads, since we may be about to write 4885 * over this location. 
4886 */ 4887 if (HDR_L2_READING(ab)) { 4888 ARCSTAT_BUMP(arcstat_l2_evict_reading); 4889 ab->b_flags |= ARC_L2_EVICTED; 4890 } 4891 4892 /* 4893 * Tell ARC this no longer exists in L2ARC. 4894 */ 4895 if (ab->b_l2hdr != NULL) { 4896 abl2 = ab->b_l2hdr; 4897 ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize); 4898 ab->b_l2hdr = NULL; 4899 kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); 4900 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); 4901 } 4902 list_remove(buflist, ab); 4903 4904 /* 4905 * This may have been leftover after a 4906 * failed write. 4907 */ 4908 ab->b_flags &= ~ARC_L2_WRITING; 4909 } 4910 mutex_exit(hash_lock); 4911 } 4912 mutex_exit(&l2arc_buflist_mtx); 4913 4914 vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0); 4915 dev->l2ad_evict = taddr; 4916 } 4917 4918 /* 4919 * Find and write ARC buffers to the L2ARC device. 4920 * 4921 * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid 4922 * for reading until they have completed writing. 4923 * The headroom_boost is an in-out parameter used to maintain headroom boost 4924 * state between calls to this function. 4925 * 4926 * Returns the number of bytes actually written (which may be smaller than 4927 * the delta by which the device hand has changed due to alignment). 4928 */ 4929 static uint64_t 4930 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, 4931 boolean_t *headroom_boost) 4932 { 4933 arc_buf_hdr_t *ab, *ab_prev, *head; 4934 list_t *list; 4935 uint64_t write_asize, write_psize, write_sz, headroom, 4936 buf_compress_minsz; 4937 void *buf_data; 4938 kmutex_t *list_lock; 4939 boolean_t full; 4940 l2arc_write_callback_t *cb; 4941 zio_t *pio, *wzio; 4942 uint64_t guid = spa_load_guid(spa); 4943 const boolean_t do_headroom_boost = *headroom_boost; 4944 4945 /* persistency-related */ 4946 l2pbuf_t *pb; 4947 l2pbuf_buflist_t *pb_buflist; 4948 int num_bufs, buf_index; 4949 4950 ASSERT(dev->l2ad_vdev != NULL); 4951 4952 /* Lower the flag now, we might want to raise it again later. */ 4953 *headroom_boost = B_FALSE; 4954 4955 pio = NULL; 4956 cb = NULL; 4957 write_sz = write_asize = write_psize = 0; 4958 full = B_FALSE; 4959 head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); 4960 head->b_flags |= ARC_L2_WRITE_HEAD; 4961 4962 /* 4963 * We will want to try to compress buffers that are at least 2x the 4964 * device sector size. 4965 */ 4966 buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift; 4967 4968 pb = &dev->l2ad_pbuf; 4969 num_bufs = 0; 4970 4971 /* 4972 * We will want to try to compress buffers that are at least 2x the 4973 * device sector size. 4974 */ 4975 buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift; 4976 4977 /* 4978 * Copy buffers for L2ARC writing. 4979 */ 4980 mutex_enter(&l2arc_buflist_mtx); 4981 for (int try = 0; try <= 3; try++) { 4982 uint64_t passed_sz = 0; 4983 4984 list = l2arc_list_locked(try, &list_lock); 4985 4986 /* 4987 * L2ARC fast warmup. 4988 * 4989 * Until the ARC is warm and starts to evict, read from the 4990 * head of the ARC lists rather than the tail. 
4991 */ 4992 if (arc_warm == B_FALSE) 4993 ab = list_head(list); 4994 else 4995 ab = list_tail(list); 4996 4997 headroom = target_sz * l2arc_headroom; 4998 if (do_headroom_boost) 4999 headroom = (headroom * l2arc_headroom_boost) / 100; 5000 5001 for (; ab; ab = ab_prev) { 5002 l2arc_buf_hdr_t *l2hdr; 5003 kmutex_t *hash_lock; 5004 uint64_t buf_sz; 5005 5006 if (arc_warm == B_FALSE) 5007 ab_prev = list_next(list, ab); 5008 else 5009 ab_prev = list_prev(list, ab); 5010 5011 hash_lock = HDR_LOCK(ab); 5012 if (!mutex_tryenter(hash_lock)) { 5013 /* 5014 * Skip this buffer rather than waiting. 5015 */ 5016 continue; 5017 } 5018 5019 passed_sz += ab->b_size; 5020 if (passed_sz > headroom) { 5021 /* 5022 * Searched too far. 5023 */ 5024 mutex_exit(hash_lock); 5025 break; 5026 } 5027 5028 if (!l2arc_write_eligible(guid, ab)) { 5029 mutex_exit(hash_lock); 5030 continue; 5031 } 5032 5033 if ((write_sz + ab->b_size) > target_sz) { 5034 full = B_TRUE; 5035 mutex_exit(hash_lock); 5036 break; 5037 } 5038 5039 if (pio == NULL) { 5040 /* 5041 * Insert a dummy header on the buflist so 5042 * l2arc_write_done() can find where the 5043 * write buffers begin without searching. 5044 */ 5045 list_insert_head(dev->l2ad_buflist, head); 5046 5047 cb = kmem_zalloc( 5048 sizeof (l2arc_write_callback_t), KM_SLEEP); 5049 cb->l2wcb_dev = dev; 5050 cb->l2wcb_head = head; 5051 pio = zio_root(spa, l2arc_write_done, cb, 5052 ZIO_FLAG_CANFAIL); 5053 } 5054 5055 /* 5056 * Create and add a new L2ARC header. 5057 */ 5058 l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP); 5059 l2hdr->b_dev = dev; 5060 ab->b_flags |= ARC_L2_WRITING; 5061 5062 /* 5063 * Temporarily stash the data buffer in b_tmp_cdata. 5064 * The subsequent write step will pick it up from 5065 * there. This is because can't access ab->b_buf 5066 * without holding the hash_lock, which we in turn 5067 * can't access without holding the ARC list locks 5068 * (which we want to avoid during compression/writing). 5069 */ 5070 l2hdr->b_compress = ZIO_COMPRESS_OFF; 5071 l2hdr->b_asize = ab->b_size; 5072 l2hdr->b_tmp_cdata = ab->b_buf->b_data; 5073 5074 buf_sz = ab->b_size; 5075 ab->b_l2hdr = l2hdr; 5076 5077 list_insert_head(dev->l2ad_buflist, ab); 5078 5079 /* 5080 * Compute and store the buffer cksum before 5081 * writing. On debug the cksum is verified first. 5082 */ 5083 arc_cksum_verify(ab->b_buf); 5084 arc_cksum_compute(ab->b_buf, B_TRUE); 5085 5086 mutex_exit(hash_lock); 5087 5088 write_sz += buf_sz; 5089 num_bufs++; 5090 } 5091 5092 mutex_exit(list_lock); 5093 5094 if (full == B_TRUE) 5095 break; 5096 } 5097 5098 /* No buffers selected for writing? */ 5099 if (pio == NULL) { 5100 ASSERT0(write_sz); 5101 mutex_exit(&l2arc_buflist_mtx); 5102 kmem_cache_free(hdr_cache, head); 5103 return (0); 5104 } 5105 5106 /* expand the pbuf to include a new list */ 5107 pb_buflist = l2arc_pbuf_buflist_alloc(pb, num_bufs); 5108 5109 /* 5110 * Now start writing the buffers. We're starting at the write head 5111 * and work backwards, retracing the course of the buffer selector 5112 * loop above. 5113 */ 5114 for (ab = list_prev(dev->l2ad_buflist, head), buf_index = 0; ab; 5115 ab = list_prev(dev->l2ad_buflist, ab), buf_index++) { 5116 l2arc_buf_hdr_t *l2hdr; 5117 uint64_t buf_sz; 5118 5119 /* 5120 * We shouldn't need to lock the buffer here, since we flagged 5121 * it as ARC_L2_WRITING in the previous step, but we must take 5122 * care to only access its L2 cache parameters. In particular, 5123 * ab->b_buf may be invalid by now due to ARC eviction. 
5124 */ 5125 l2hdr = ab->b_l2hdr; 5126 l2hdr->b_daddr = dev->l2ad_hand; 5127 5128 if ((ab->b_flags & ARC_L2COMPRESS) && 5129 l2hdr->b_asize >= buf_compress_minsz) { 5130 if (l2arc_compress_buf(l2hdr)) { 5131 /* 5132 * If compression succeeded, enable headroom 5133 * boost on the next scan cycle. 5134 */ 5135 *headroom_boost = B_TRUE; 5136 } 5137 } 5138 5139 /* 5140 * Pick up the buffer data we had previously stashed away 5141 * (and now potentially also compressed). 5142 */ 5143 buf_data = l2hdr->b_tmp_cdata; 5144 buf_sz = l2hdr->b_asize; 5145 5146 /* Compression may have squashed the buffer to zero length. */ 5147 if (buf_sz != 0) { 5148 uint64_t buf_p_sz; 5149 5150 wzio = zio_write_phys(pio, dev->l2ad_vdev, 5151 dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF, 5152 NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, 5153 ZIO_FLAG_CANFAIL, B_FALSE); 5154 5155 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, 5156 zio_t *, wzio); 5157 (void) zio_nowait(wzio); 5158 5159 write_asize += buf_sz; 5160 /* 5161 * Keep the clock hand suitably device-aligned. 5162 */ 5163 buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); 5164 write_psize += buf_p_sz; 5165 dev->l2ad_hand += buf_p_sz; 5166 } 5167 5168 l2arc_pbuflist_insert(pb, pb_buflist, ab, buf_index); 5169 } 5170 ASSERT(buf_index == num_bufs); 5171 mutex_exit(&l2arc_buflist_mtx); 5172 5173 ASSERT3U(write_asize, <=, target_sz); 5174 ARCSTAT_BUMP(arcstat_l2_writes_sent); 5175 ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize); 5176 ARCSTAT_INCR(arcstat_l2_size, write_sz); 5177 ARCSTAT_INCR(arcstat_l2_asize, write_asize); 5178 vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0); 5179 5180 /* Is it time to commit this pbuf? */ 5181 if (L2PBUF_IS_FULL(pb) && 5182 dev->l2ad_hand + L2PBUF_ENCODED_SIZE(pb) < dev->l2ad_end) { 5183 l2arc_pbuf_commit(dev, pio, cb); 5184 l2arc_pbuf_destroy(pb); 5185 l2arc_pbuf_init(pb); 5186 } 5187 5188 /* 5189 * Bump device hand to the device start if it is approaching the end. 5190 * l2arc_evict() will already have evicted ahead for this case. 5191 */ 5192 if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { 5193 vdev_space_update(dev->l2ad_vdev, 5194 dev->l2ad_end - dev->l2ad_hand, 0, 0); 5195 dev->l2ad_hand = dev->l2ad_start; 5196 dev->l2ad_evict = dev->l2ad_start; 5197 dev->l2ad_first = B_FALSE; 5198 } 5199 5200 dev->l2ad_writing = B_TRUE; 5201 (void) zio_wait(pio); 5202 dev->l2ad_writing = B_FALSE; 5203 5204 return (write_asize); 5205 } 5206 5207 /* 5208 * Compresses an L2ARC buffer. 5209 * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its 5210 * size in l2hdr->b_asize. This routine tries to compress the data and 5211 * depending on the compression result there are three possible outcomes: 5212 * *) The buffer was incompressible. The original l2hdr contents were left 5213 * untouched and are ready for writing to an L2 device. 5214 * *) The buffer was all-zeros, so there is no need to write it to an L2 5215 * device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is 5216 * set to zero and b_compress is set to ZIO_COMPRESS_EMPTY. 5217 * *) Compression succeeded and b_tmp_cdata was replaced with a temporary 5218 * data buffer which holds the compressed data to be written, and b_asize 5219 * tells us how much data there is. b_compress is set to the appropriate 5220 * compression algorithm. Once writing is done, invoke 5221 * l2arc_release_cdata_buf on this l2hdr to free this temporary buffer. 
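 *
 * A hedged sketch of how the write path (l2arc_write_buffers()) is
 * expected to consume the result; in all three outcomes the caller
 * only needs to look at the l2hdr fields afterwards:
 *
 *	if (l2arc_compress_buf(l2hdr))
 *		*headroom_boost = B_TRUE;	boost next scan cycle
 *	buf_data = l2hdr->b_tmp_cdata;		NULL for an all-zero buffer
 *	buf_sz = l2hdr->b_asize;		0 for an all-zero buffer
 *	if (buf_sz != 0)
 *		... zio_write_phys() of buf_data, buf_sz bytes ...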
5222 * 5223 * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the 5224 * buffer was incompressible). 5225 */ 5226 static boolean_t 5227 l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr) 5228 { 5229 void *cdata; 5230 size_t csize, len; 5231 5232 ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF); 5233 ASSERT(l2hdr->b_tmp_cdata != NULL); 5234 5235 len = l2hdr->b_asize; 5236 cdata = zio_data_buf_alloc(len); 5237 csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata, 5238 cdata, l2hdr->b_asize); 5239 5240 if (csize == 0) { 5241 /* zero block, indicate that there's nothing to write */ 5242 zio_data_buf_free(cdata, len); 5243 l2hdr->b_compress = ZIO_COMPRESS_EMPTY; 5244 l2hdr->b_asize = 0; 5245 l2hdr->b_tmp_cdata = NULL; 5246 ARCSTAT_BUMP(arcstat_l2_compress_zeros); 5247 return (B_TRUE); 5248 } else if (csize > 0 && csize < len) { 5249 /* 5250 * Compression succeeded, we'll keep the cdata around for 5251 * writing and release it afterwards. 5252 */ 5253 l2hdr->b_compress = ZIO_COMPRESS_LZ4; 5254 l2hdr->b_asize = csize; 5255 l2hdr->b_tmp_cdata = cdata; 5256 ARCSTAT_BUMP(arcstat_l2_compress_successes); 5257 return (B_TRUE); 5258 } else { 5259 /* 5260 * Compression failed, release the compressed buffer. 5261 * l2hdr will be left unmodified. 5262 */ 5263 zio_data_buf_free(cdata, len); 5264 ARCSTAT_BUMP(arcstat_l2_compress_failures); 5265 return (B_FALSE); 5266 } 5267 } 5268 5269 /* 5270 * Decompresses a zio read back from an l2arc device. On success, the 5271 * underlying zio's io_data buffer is overwritten by the uncompressed 5272 * version. On decompression error (corrupt compressed stream), the 5273 * zio->io_error value is set to signal an I/O error. 5274 * 5275 * Please note that the compressed data stream is not checksummed, so 5276 * if the underlying device is experiencing data corruption, we may feed 5277 * corrupt data to the decompressor, so the decompressor needs to be 5278 * able to handle this situation (LZ4 does). 5279 */ 5280 static void 5281 l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c) 5282 { 5283 ASSERT(L2ARC_IS_VALID_COMPRESS(c)); 5284 5285 if (zio->io_error != 0) { 5286 /* 5287 * An io error has occured, just restore the original io 5288 * size in preparation for a main pool read. 5289 */ 5290 zio->io_orig_size = zio->io_size = hdr->b_size; 5291 return; 5292 } 5293 5294 if (c == ZIO_COMPRESS_EMPTY) { 5295 /* 5296 * An empty buffer results in a null zio, which means we 5297 * need to fill its io_data after we're done restoring the 5298 * buffer's contents. 5299 */ 5300 ASSERT(hdr->b_buf != NULL); 5301 bzero(hdr->b_buf->b_data, hdr->b_size); 5302 zio->io_data = zio->io_orig_data = hdr->b_buf->b_data; 5303 } else { 5304 ASSERT(zio->io_data != NULL); 5305 /* 5306 * We copy the compressed data from the start of the arc buffer 5307 * (the zio_read will have pulled in only what we need, the 5308 * rest is garbage which we will overwrite at decompression) 5309 * and then decompress back to the ARC data buffer. This way we 5310 * can minimize copying by simply decompressing back over the 5311 * original compressed data (rather than decompressing to an 5312 * aux buffer and then copying back the uncompressed buffer, 5313 * which is likely to be much larger). 
5314 */ 5315 uint64_t csize; 5316 void *cdata; 5317 5318 csize = zio->io_size; 5319 cdata = zio_data_buf_alloc(csize); 5320 bcopy(zio->io_data, cdata, csize); 5321 if (zio_decompress_data(c, cdata, zio->io_data, csize, 5322 hdr->b_size) != 0) 5323 zio->io_error = EIO; 5324 zio_data_buf_free(cdata, csize); 5325 } 5326 5327 /* Restore the expected uncompressed IO size. */ 5328 zio->io_orig_size = zio->io_size = hdr->b_size; 5329 } 5330 5331 /* 5332 * Releases the temporary b_tmp_cdata buffer in an l2arc header structure. 5333 * This buffer serves as a temporary holder of compressed data while 5334 * the buffer entry is being written to an l2arc device. Once that is 5335 * done, we can dispose of it. 5336 */ 5337 static void 5338 l2arc_release_cdata_buf(arc_buf_hdr_t *ab) 5339 { 5340 l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr; 5341 5342 if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) { 5343 /* 5344 * If the data was compressed, then we've allocated a 5345 * temporary buffer for it, so now we need to release it. 5346 */ 5347 ASSERT(l2hdr->b_tmp_cdata != NULL); 5348 zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size); 5349 } 5350 l2hdr->b_tmp_cdata = NULL; 5351 } 5352 5353 /* 5354 * This thread feeds the L2ARC at regular intervals. This is the beating 5355 * heart of the L2ARC. 5356 */ 5357 static void 5358 l2arc_feed_thread(void) 5359 { 5360 callb_cpr_t cpr; 5361 l2arc_dev_t *dev; 5362 spa_t *spa; 5363 uint64_t size, wrote; 5364 clock_t begin, next = ddi_get_lbolt(); 5365 boolean_t headroom_boost = B_FALSE; 5366 5367 CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); 5368 5369 mutex_enter(&l2arc_feed_thr_lock); 5370 5371 while (l2arc_thread_exit == 0) { 5372 CALLB_CPR_SAFE_BEGIN(&cpr); 5373 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, 5374 next); 5375 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); 5376 next = ddi_get_lbolt() + hz; 5377 5378 /* 5379 * Quick check for L2ARC devices. 5380 */ 5381 mutex_enter(&l2arc_dev_mtx); 5382 if (l2arc_ndev == 0) { 5383 mutex_exit(&l2arc_dev_mtx); 5384 continue; 5385 } 5386 mutex_exit(&l2arc_dev_mtx); 5387 begin = ddi_get_lbolt(); 5388 5389 /* 5390 * This selects the next l2arc device to write to, and in 5391 * doing so the next spa to feed from: dev->l2ad_spa. This 5392 * will return NULL if there are now no l2arc devices or if 5393 * they are all faulted. 5394 * 5395 * If a device is returned, its spa's config lock is also 5396 * held to prevent device removal. l2arc_dev_get_next() 5397 * will grab and release l2arc_dev_mtx. 5398 */ 5399 if ((dev = l2arc_dev_get_next()) == NULL) 5400 continue; 5401 5402 spa = dev->l2ad_spa; 5403 ASSERT(spa != NULL); 5404 5405 /* 5406 * If the pool is read-only then force the feed thread to 5407 * sleep a little longer. 5408 */ 5409 if (!spa_writeable(spa)) { 5410 next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; 5411 spa_config_exit(spa, SCL_L2ARC, dev); 5412 continue; 5413 } 5414 5415 /* 5416 * Avoid contributing to memory pressure. 5417 */ 5418 if (arc_reclaim_needed()) { 5419 ARCSTAT_BUMP(arcstat_l2_abort_lowmem); 5420 spa_config_exit(spa, SCL_L2ARC, dev); 5421 continue; 5422 } 5423 5424 ARCSTAT_BUMP(arcstat_l2_feeds); 5425 5426 size = l2arc_write_size(); 5427 5428 /* 5429 * Evict L2ARC buffers that will be overwritten. 5430 */ 5431 l2arc_evict(dev, size, B_FALSE); 5432 5433 /* 5434 * Write ARC buffers. 5435 */ 5436 wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost); 5437 5438 /* 5439 * Calculate interval between writes. 
5440 */ 5441 next = l2arc_write_interval(begin, size, wrote); 5442 spa_config_exit(spa, SCL_L2ARC, dev); 5443 } 5444 5445 l2arc_thread_exit = 0; 5446 cv_broadcast(&l2arc_feed_thr_cv); 5447 CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ 5448 thread_exit(); 5449 } 5450 5451 boolean_t 5452 l2arc_vdev_present(vdev_t *vd) 5453 { 5454 l2arc_dev_t *dev; 5455 5456 mutex_enter(&l2arc_dev_mtx); 5457 for (dev = list_head(l2arc_dev_list); dev != NULL; 5458 dev = list_next(l2arc_dev_list, dev)) { 5459 if (dev->l2ad_vdev == vd) 5460 break; 5461 } 5462 mutex_exit(&l2arc_dev_mtx); 5463 5464 return (dev != NULL); 5465 } 5466 5467 /* 5468 * Add a vdev for use by the L2ARC. By this point the spa has already 5469 * validated the vdev and opened it. The `rebuild' flag indicates whether 5470 * we should attempt an L2ARC persistency rebuild. 5471 */ 5472 void 5473 l2arc_add_vdev(spa_t *spa, vdev_t *vd, boolean_t rebuild) 5474 { 5475 l2arc_dev_t *adddev; 5476 5477 ASSERT(!l2arc_vdev_present(vd)); 5478 5479 /* 5480 * Create a new l2arc device entry. 5481 */ 5482 adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); 5483 adddev->l2ad_spa = spa; 5484 adddev->l2ad_vdev = vd; 5485 adddev->l2ad_start = VDEV_LABEL_START_SIZE + L2UBERBLOCK_SIZE; 5486 adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); 5487 adddev->l2ad_hand = adddev->l2ad_start; 5488 adddev->l2ad_evict = adddev->l2ad_start; 5489 adddev->l2ad_first = B_TRUE; 5490 adddev->l2ad_writing = B_FALSE; 5491 l2arc_pbuf_init(&adddev->l2ad_pbuf); 5492 5493 /* 5494 * This is a list of all ARC buffers that are still valid on the 5495 * device. 5496 */ 5497 adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP); 5498 list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), 5499 offsetof(arc_buf_hdr_t, b_l2node)); 5500 5501 vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); 5502 5503 /* 5504 * Add device to global list 5505 */ 5506 mutex_enter(&l2arc_dev_mtx); 5507 list_insert_head(l2arc_dev_list, adddev); 5508 atomic_inc_64(&l2arc_ndev); 5509 if (rebuild && l2arc_rebuild_enabled) { 5510 adddev->l2ad_rebuilding = B_TRUE; 5511 (void) thread_create(NULL, 0, l2arc_rebuild_start, adddev, 5512 0, &p0, TS_RUN, minclsyspri); 5513 } 5514 mutex_exit(&l2arc_dev_mtx); 5515 } 5516 5517 /* 5518 * Remove a vdev from the L2ARC. 5519 */ 5520 void 5521 l2arc_remove_vdev(vdev_t *vd) 5522 { 5523 l2arc_dev_t *dev, *nextdev, *remdev = NULL; 5524 5525 /* 5526 * Find the device by vdev 5527 */ 5528 mutex_enter(&l2arc_dev_mtx); 5529 for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { 5530 nextdev = list_next(l2arc_dev_list, dev); 5531 if (vd == dev->l2ad_vdev) { 5532 remdev = dev; 5533 break; 5534 } 5535 } 5536 ASSERT(remdev != NULL); 5537 5538 /* 5539 * Remove device from global list 5540 */ 5541 list_remove(l2arc_dev_list, remdev); 5542 l2arc_dev_last = NULL; /* may have been invalidated */ 5543 atomic_dec_64(&l2arc_ndev); 5544 mutex_exit(&l2arc_dev_mtx); 5545 5546 /* 5547 * Clear all buflists and ARC references. L2ARC device flush. 
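 * The device's open pbuf is destroyed, l2arc_evict() is asked to drop
 * every remaining buffer on the device, and only then are the buflist
 * and the device entry itself freed.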
5548 */ 5549 l2arc_pbuf_destroy(&remdev->l2ad_pbuf); 5550 l2arc_evict(remdev, 0, B_TRUE); 5551 list_destroy(remdev->l2ad_buflist); 5552 kmem_free(remdev->l2ad_buflist, sizeof (list_t)); 5553 kmem_free(remdev, sizeof (l2arc_dev_t)); 5554 } 5555 5556 void 5557 l2arc_init(void) 5558 { 5559 l2arc_thread_exit = 0; 5560 l2arc_ndev = 0; 5561 l2arc_writes_sent = 0; 5562 l2arc_writes_done = 0; 5563 5564 mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); 5565 cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); 5566 mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); 5567 mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL); 5568 mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); 5569 5570 l2arc_dev_list = &L2ARC_dev_list; 5571 l2arc_free_on_write = &L2ARC_free_on_write; 5572 list_create(l2arc_dev_list, sizeof (l2arc_dev_t), 5573 offsetof(l2arc_dev_t, l2ad_node)); 5574 list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), 5575 offsetof(l2arc_data_free_t, l2df_list_node)); 5576 } 5577 5578 void 5579 l2arc_fini(void) 5580 { 5581 /* 5582 * This is called from dmu_fini(), which is called from spa_fini(); 5583 * Because of this, we can assume that all l2arc devices have 5584 * already been removed when the pools themselves were removed. 5585 */ 5586 5587 l2arc_do_free_on_write(); 5588 5589 mutex_destroy(&l2arc_feed_thr_lock); 5590 cv_destroy(&l2arc_feed_thr_cv); 5591 mutex_destroy(&l2arc_dev_mtx); 5592 mutex_destroy(&l2arc_buflist_mtx); 5593 mutex_destroy(&l2arc_free_on_write_mtx); 5594 5595 list_destroy(l2arc_dev_list); 5596 list_destroy(l2arc_free_on_write); 5597 } 5598 5599 void 5600 l2arc_start(void) 5601 { 5602 if (!(spa_mode_global & FWRITE)) 5603 return; 5604 5605 (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, 5606 TS_RUN, minclsyspri); 5607 } 5608 5609 void 5610 l2arc_stop(void) 5611 { 5612 if (!(spa_mode_global & FWRITE)) 5613 return; 5614 5615 mutex_enter(&l2arc_feed_thr_lock); 5616 cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ 5617 l2arc_thread_exit = 1; 5618 while (l2arc_thread_exit != 0) 5619 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); 5620 mutex_exit(&l2arc_feed_thr_lock); 5621 } 5622 5623 /* 5624 * Main entry point for L2ARC metadata rebuilding. This function must be 5625 * called via thread_create so that the L2ARC metadata rebuild doesn't block 5626 * pool import and may proceed in parallel on all available L2ARC devices. 5627 */ 5628 static void 5629 l2arc_rebuild_start(l2arc_dev_t *dev) 5630 { 5631 vdev_t *vd = dev->l2ad_vdev; 5632 spa_t *spa = dev->l2ad_spa; 5633 5634 /* Lock out device removal. */ 5635 spa_config_enter(spa, SCL_L2ARC, vd, RW_READER); 5636 ASSERT(dev->l2ad_rebuilding == B_TRUE); 5637 l2arc_rebuild(dev); 5638 dev->l2ad_rebuilding = B_FALSE; 5639 spa_config_exit(spa, SCL_L2ARC, vd); 5640 thread_exit(); 5641 } 5642 5643 /* 5644 * This function implements the actual L2ARC metadata rebuild. It: 5645 * 5646 * 1) scans the device for valid l2uberblocks 5647 * 2) if it finds a good uberblock, starts reading the pbuf chain 5648 * 3) restores each pbuf's contents to memory 5649 * 5650 * Operation stops under any of the following conditions: 5651 * 5652 * 1) We reach the end of the pbuf chain (the previous-buffer reference 5653 * in the pbuf is zero). 5654 * 2) We encounter *any* error condition (cksum errors, io errors, looped 5655 * pbufs, etc.). 
5656 * 3) The l2arc_rebuild_timeout is hit - this is a last-resort protection
5657 * so that severely fragmented L2ARC pbufs or slow L2ARC devices cannot
5658 * prevent a machine from importing the pool (the administrator can then
5659 * take corrective action, e.g. by kicking the misbehaving
5660 * L2ARC device out of the pool, or by reimporting the pool with L2ARC
5661 * rebuilding disabled).
5662 */
5663 static void
5664 l2arc_rebuild(l2arc_dev_t *dev)
5665 {
5666 int err;
5667 l2uberblock_t ub;
5668 l2pbuf_t pb;
5669 zio_t *this_io = NULL, *next_io = NULL;
5670 int64_t deadline = ddi_get_lbolt64() + hz * l2arc_rebuild_timeout;
5671
5672 if ((err = l2arc_uberblock_find(dev, &ub)) != 0)
5673 return;
5674 L2ARC_CHK_REBUILD_TIMEOUT(deadline, /* nop */);
5675
5676 /* set up uberblock update info */
5677 dev->l2ad_uberblock_birth = ub.ub_birth + 1;
5678
5679 /* initial sanity checks */
5680 l2arc_pbuf_init(&pb);
5681 if ((err = l2arc_pbuf_read(dev, ub.ub_pbuf_daddr, ub.ub_pbuf_asize,
5682 ub.ub_pbuf_cksum, &pb, NULL, &this_io)) != 0) {
5683 /* root pbuf is bad, we can't do anything about that */
5684 if (err == EINVAL) {
5685 ARCSTAT_BUMP(arcstat_l2_rebuild_cksum_errors);
5686 } else {
5687 ARCSTAT_BUMP(arcstat_l2_rebuild_io_errors);
5688 }
5689 l2arc_pbuf_destroy(&pb);
5690 return;
5691 }
5692 L2ARC_CHK_REBUILD_TIMEOUT(deadline, l2arc_pbuf_destroy(&pb));
5693
5694 dev->l2ad_evict = ub.ub_evict_tail;
5695
5696 /* keep on chaining in new blocks */
5697 dev->l2ad_pbuf_daddr = ub.ub_pbuf_daddr;
5698 dev->l2ad_pbuf_asize = ub.ub_pbuf_asize;
5699 dev->l2ad_pbuf_cksum = ub.ub_pbuf_cksum;
5700 dev->l2ad_hand = vdev_psize_to_asize(dev->l2ad_vdev,
5701 ub.ub_pbuf_daddr + ub.ub_pbuf_asize);
5702 dev->l2ad_first = ((ub.ub_flags & L2UBLK_EVICT_FIRST) != 0);
5703
5704 /* start the rebuild process */
5705 for (;;) {
5706 l2pbuf_t pb_prev;
5707
5708 l2arc_pbuf_init(&pb_prev);
5709 if ((err = l2arc_pbuf_read(dev, pb.pb_prev_daddr,
5710 pb.pb_prev_asize, pb.pb_prev_cksum, &pb_prev, this_io,
5711 &next_io)) != 0) {
5712 /*
5713 * We are done reading, discard the last good buffer.
5714 */
5715 if (pb.pb_prev_daddr > dev->l2ad_hand &&
5716 pb.pb_prev_asize > L2PBUF_HDR_SIZE) {
5717 /* this is an error, we stopped too early */
5718 if (err == EINVAL) {
5719 ARCSTAT_BUMP(
5720 arcstat_l2_rebuild_cksum_errors);
5721 } else {
5722 ARCSTAT_BUMP(
5723 arcstat_l2_rebuild_io_errors);
5724 }
5725 }
5726 l2arc_pbuf_destroy(&pb_prev);
5727 l2arc_pbuf_destroy(&pb);
5728 break;
5729 }
5730
5731 /*
5732 * Protection against infinite loops of pbufs. This is also
5733 * our primary termination mechanism - once the buffer list
5734 * loops around our starting pbuf, we can stop.
5735 */
5736 if (pb.pb_prev_daddr >= ub.ub_pbuf_daddr &&
5737 pb_prev.pb_prev_daddr <= ub.ub_pbuf_daddr) {
5738 ARCSTAT_BUMP(arcstat_l2_rebuild_loop_errors);
5739 l2arc_pbuf_destroy(&pb);
5740 l2arc_pbuf_destroy(&pb_prev);
5741 if (next_io)
5742 l2arc_pbuf_prefetch_abort(next_io);
5743 return;
5744 }
5745
5746 /*
5747 * Our memory pressure valve. If the system is running low
5748 * on memory, rather than swamping memory with new ARC buf
5749 * hdrs, we opt not to reconstruct the L2ARC. At this point,
5750 * however, we have already set up our L2ARC dev to chain in
5751 * new metadata pbufs, so the user may choose to re-add the
5752 * L2ARC dev at a later time to reconstruct it (when there's
5753 * less memory pressure).
5754 */ 5755 if (arc_reclaim_needed()) { 5756 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem); 5757 cmn_err(CE_NOTE, "System running low on memory, " 5758 "aborting L2ARC rebuild."); 5759 l2arc_pbuf_destroy(&pb); 5760 l2arc_pbuf_destroy(&pb_prev); 5761 if (next_io) 5762 l2arc_pbuf_prefetch_abort(next_io); 5763 break; 5764 } 5765 5766 /* 5767 * Now that we know that the prev_pbuf checks out alright, we 5768 * can start reconstruction from this pbuf - we can be sure 5769 * that the L2ARC write hand has not yet reached any of our 5770 * buffers. 5771 */ 5772 l2arc_pbuf_restore(dev, &pb); 5773 5774 /* pbuf restored, continue with next one in the list */ 5775 l2arc_pbuf_destroy(&pb); 5776 pb = pb_prev; 5777 this_io = next_io; 5778 next_io = NULL; 5779 5780 L2ARC_CHK_REBUILD_TIMEOUT(deadline, l2arc_pbuf_destroy(&pb)); 5781 } 5782 5783 ARCSTAT_BUMP(arcstat_l2_rebuild_successes); 5784 } 5785 5786 /* 5787 * Restores the payload of a pbuf to ARC. This creates empty ARC hdr entries 5788 * which only contain an l2arc hdr, essentially restoring the buffers to 5789 * their L2ARC evicted state. This function also updates space usage on the 5790 * L2ARC vdev to make sure it tracks restored buffers. 5791 */ 5792 static void 5793 l2arc_pbuf_restore(l2arc_dev_t *dev, l2pbuf_t *pb) 5794 { 5795 spa_t *spa; 5796 uint64_t guid; 5797 list_t *buflists_list; 5798 l2pbuf_buflist_t *buflist; 5799 5800 mutex_enter(&l2arc_buflist_mtx); 5801 spa = dev->l2ad_vdev->vdev_spa; 5802 guid = spa_load_guid(spa); 5803 buflists_list = pb->pb_buflists_list; 5804 for (buflist = list_head(buflists_list); buflist; 5805 buflist = list_next(buflists_list, buflist)) { 5806 int i; 5807 uint64_t size, asize, psize; 5808 5809 size = asize = psize = 0; 5810 for (i = 0; i < buflist->l2pbl_nbufs; i++) { 5811 l2arc_hdr_restore(&buflist->l2pbl_bufs[i], dev, 5812 guid); 5813 size += buflist->l2pbl_bufs[i].b_size; 5814 asize += buflist->l2pbl_bufs[i].b_l2asize; 5815 psize += vdev_psize_to_asize(dev->l2ad_vdev, 5816 buflist->l2pbl_bufs[i].b_l2asize); 5817 } 5818 ARCSTAT_INCR(arcstat_l2_rebuild_arc_bytes, size); 5819 ARCSTAT_INCR(arcstat_l2_rebuild_l2arc_bytes, asize); 5820 ARCSTAT_INCR(arcstat_l2_rebuild_bufs, buflist->l2pbl_nbufs); 5821 vdev_space_update(dev->l2ad_vdev, psize, 0, 0); 5822 } 5823 mutex_exit(&l2arc_buflist_mtx); 5824 ARCSTAT_BUMP(arcstat_l2_rebuild_metabufs); 5825 vdev_space_update(dev->l2ad_vdev, vdev_psize_to_asize(dev->l2ad_vdev, 5826 pb->pb_asize), 0, 0); 5827 } 5828 5829 /* 5830 * Restores a single ARC buf hdr from a pbuf. The ARC buffer is put into 5831 * a state indicating that it has been evicted to L2ARC. 5832 * The `guid' here is the ARC-load-guid from spa_load_guid. 
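 * If an identical header is already present in the ARC hash table the
 * restore is a no-op; losing the hash-insert race is counted in
 * arcstat_l2_rebuild_bufs_precached.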
5833 */ 5834 static void 5835 l2arc_hdr_restore(const l2pbuf_buf_t *buf, l2arc_dev_t *dev, uint64_t guid) 5836 { 5837 arc_buf_hdr_t *hdr; 5838 kmutex_t *hash_lock; 5839 dva_t dva = {buf->b_dva.dva_word[0], buf->b_dva.dva_word[1]}; 5840 5841 hdr = buf_hash_find(guid, &dva, buf->b_birth, &hash_lock); 5842 if (hdr == NULL) { 5843 /* not in cache, try to insert */ 5844 arc_buf_hdr_t *exists; 5845 arc_buf_contents_t type = buf->b_contents_type; 5846 l2arc_buf_hdr_t *l2hdr; 5847 5848 hdr = arc_buf_hdr_alloc(guid, buf->b_size, type); 5849 hdr->b_dva = buf->b_dva; 5850 hdr->b_birth = buf->b_birth; 5851 hdr->b_cksum0 = buf->b_cksum0; 5852 hdr->b_size = buf->b_size; 5853 exists = buf_hash_insert(hdr, &hash_lock); 5854 if (exists) { 5855 /* somebody beat us to the hash insert */ 5856 mutex_exit(hash_lock); 5857 arc_hdr_destroy(hdr); 5858 ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached); 5859 return; 5860 } 5861 hdr->b_flags = buf->b_flags; 5862 mutex_enter(&hdr->b_freeze_lock); 5863 ASSERT(hdr->b_freeze_cksum == NULL); 5864 hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), 5865 KM_SLEEP); 5866 *hdr->b_freeze_cksum = buf->b_freeze_cksum; 5867 mutex_exit(&hdr->b_freeze_lock); 5868 5869 /* now rebuild the l2arc entry */ 5870 ASSERT(hdr->b_l2hdr == NULL); 5871 l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP); 5872 l2hdr->b_dev = dev; 5873 l2hdr->b_daddr = buf->b_l2daddr; 5874 l2hdr->b_asize = buf->b_l2asize; 5875 l2hdr->b_compress = buf->b_l2compress; 5876 hdr->b_l2hdr = l2hdr; 5877 list_insert_head(dev->l2ad_buflist, hdr); 5878 ARCSTAT_INCR(arcstat_l2_size, hdr->b_size); 5879 ARCSTAT_INCR(arcstat_l2_asize, l2hdr->b_asize); 5880 5881 arc_change_state(arc_l2c_only, hdr, hash_lock); 5882 } 5883 mutex_exit(hash_lock); 5884 } 5885 5886 /* 5887 * Attempts to locate and read the newest valid uberblock on the provided 5888 * L2ARC device and writes it to `ub'. On success, this function returns 0, 5889 * otherwise the appropriate error code is returned. 5890 */ 5891 static int 5892 l2arc_uberblock_find(l2arc_dev_t *dev, l2uberblock_t *ub) 5893 { 5894 int err = 0; 5895 uint8_t *ub_buf; 5896 uint64_t guid; 5897 5898 ARCSTAT_BUMP(arcstat_l2_rebuild_attempts); 5899 ub_buf = kmem_alloc(L2UBERBLOCK_SIZE, KM_SLEEP); 5900 guid = spa_guid(dev->l2ad_vdev->vdev_spa); 5901 5902 if ((err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev, 5903 VDEV_LABEL_START_SIZE, L2UBERBLOCK_SIZE, ub_buf, 5904 ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, 5905 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | 5906 ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE))) != 0) { 5907 ARCSTAT_BUMP(arcstat_l2_rebuild_io_errors); 5908 goto cleanup; 5909 } 5910 5911 /* 5912 * Initial peek - does the device even have any usable uberblocks? 5913 * If not, don't bother continuing. 
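 * "Usable" here means the magic matches, the version lies within the
 * supported range and the uberblock belongs to this spa; full checksum
 * verification happens in l2arc_uberblock_verify() below.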
5914 */ 5915 l2arc_uberblock_decode(ub_buf, ub); 5916 if (ub->ub_magic != L2UBERBLOCK_MAGIC || ub->ub_version == 0 || 5917 ub->ub_version > L2UBERBLOCK_MAX_VERSION || 5918 ub->ub_spa_guid != guid) { 5919 err = ENOTSUP; 5920 ARCSTAT_BUMP(arcstat_l2_rebuild_unsupported); 5921 goto cleanup; 5922 } 5923 5924 /* now check to make sure that what we selected is okay */ 5925 if ((err = l2arc_uberblock_verify(ub_buf, ub, guid)) != 0) { 5926 if (err == EINVAL) { 5927 ARCSTAT_BUMP(arcstat_l2_rebuild_cksum_errors); 5928 } else { 5929 ARCSTAT_BUMP(arcstat_l2_rebuild_uberblk_errors); 5930 } 5931 goto cleanup; 5932 } 5933 5934 /* this uberblock is valid */ 5935 5936 cleanup: 5937 kmem_free(ub_buf, L2UBERBLOCK_SIZE); 5938 return (err); 5939 } 5940 5941 /* 5942 * Reads a pbuf from storage, decodes it and validates its contents against 5943 * the provided checksum. The result is placed in `pb'. 5944 * 5945 * The `this_io' and `prefetch_io' arguments are used for pbuf prefetching. 5946 * When issuing the first pbuf IO during rebuild, you should pass NULL for 5947 * `this_io'. This function will then issue a sync IO to read the pbuf and 5948 * also issue an async IO to fetch the next pbuf in the pbuf chain. The 5949 * prefetch IO is returned in `prefetch_io. On subsequent calls to this 5950 * function, pass the value returned in `prefetch_io' from the previous 5951 * call as `this_io' and a fresh `prefetch_io' pointer to hold the next 5952 * prefetch IO. Prior to the call, you should initialize your `prefetch_io' 5953 * pointer to be NULL. If no prefetch IO was issued, the pointer is left 5954 * set at NULL. 5955 * 5956 * Actual prefetching takes place in two steps: a header IO (pi_hdr_io) 5957 * and the main pbuf payload IO (placed in prefetch_io). The pi_hdr_io 5958 * IO is used internally in this function to be able to `peek' at the next 5959 * buffer's header before the main IO to read it in completely has finished. 5960 * We can then begin to issue the IO for the next buffer in the chain before 5961 * we are done reading, keeping the L2ARC device's pipeline saturated with 5962 * reads (rather than issuing an IO, waiting for it to complete, validating 5963 * the returned buffer and issuing the next one). This will make sure that 5964 * the rebuild proceeds at maximum read throughput. 5965 * 5966 * On success, this function returns 0, otherwise it returns an appropriate 5967 * error code. On error the prefetching IO is aborted and cleared before 5968 * returning from this function. Therefore, if we return `success', the 5969 * caller can assume that we have taken care of cleanup of prefetch IOs. 5970 */ 5971 static int 5972 l2arc_pbuf_read(l2arc_dev_t *dev, uint64_t daddr, uint32_t asize, 5973 zio_cksum_t cksum, l2pbuf_t *pb, zio_t *this_io, zio_t **prefetch_io) 5974 { 5975 int err = 0; 5976 uint64_t prev_pb_start; 5977 uint32_t prev_pb_asize; 5978 zio_cksum_t calc_cksum, prev_pb_cksum; 5979 l2arc_prefetch_info_t *pi = NULL; 5980 5981 ASSERT(dev != NULL); 5982 ASSERT(pb != NULL); 5983 ASSERT(*prefetch_io == NULL); 5984 5985 if (!l2arc_pbuf_ptr_valid(dev, daddr, asize)) { 5986 /* We could not have issued a prefetch IO for this */ 5987 ASSERT(this_io == NULL); 5988 return (EINVAL); 5989 } 5990 5991 /* 5992 * Check to see if we have issued the IO for this pbuf in a previous 5993 * run. If not, issue it now. 
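 * (On the first pbuf of a rebuild pass `this_io' is NULL, so the read is
 * issued here; on later iterations it is the prefetch zio started by the
 * previous call and handed back through `prefetch_io'.)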
5994 */ 5995 if (this_io == NULL) 5996 this_io = l2arc_pbuf_prefetch(dev->l2ad_vdev, daddr, asize); 5997 5998 /* Pick up the prefetch info buffer and read its contents */ 5999 pi = this_io->io_private; 6000 ASSERT(pi != NULL); 6001 ASSERT(asize <= pi->pi_buflen); 6002 6003 /* Wait for the IO to read this pbuf's header to complete */ 6004 if ((err = zio_wait(pi->pi_hdr_io)) != 0) { 6005 (void) zio_wait(this_io); 6006 goto cleanup; 6007 } 6008 6009 /* 6010 * Peek to see if we can start issuing the next pbuf IO immediately. 6011 * At this point, only the current pbuf's header has been read. 6012 */ 6013 if (l2arc_pbuf_decode_prev_ptr(pi->pi_buf, asize, &prev_pb_start, 6014 &prev_pb_asize, &prev_pb_cksum) == 0) { 6015 uint64_t this_pb_start, this_pb_end, prev_pb_end; 6016 /* Detect malformed pbuf references and loops */ 6017 this_pb_start = daddr; 6018 this_pb_end = daddr + asize; 6019 prev_pb_end = prev_pb_start + prev_pb_asize; 6020 if ((prev_pb_start >= this_pb_start && prev_pb_start < 6021 this_pb_end) || 6022 (prev_pb_end >= this_pb_start && prev_pb_end < 6023 this_pb_end)) { 6024 ARCSTAT_BUMP(arcstat_l2_rebuild_loop_errors); 6025 cmn_err(CE_WARN, "Looping L2ARC metadata reference " 6026 "detected, aborting rebuild."); 6027 err = EINVAL; 6028 goto cleanup; 6029 } 6030 /* 6031 * Start issuing IO for the next pbuf early - this should 6032 * help keep the L2ARC device busy while we read, decode 6033 * and restore this pbuf. 6034 */ 6035 if (l2arc_pbuf_ptr_valid(dev, prev_pb_start, prev_pb_asize)) 6036 *prefetch_io = l2arc_pbuf_prefetch(dev->l2ad_vdev, 6037 prev_pb_start, prev_pb_asize); 6038 } 6039 6040 /* Wait for the main pbuf IO to complete */ 6041 if ((err = zio_wait(this_io)) != 0) 6042 goto cleanup; 6043 6044 /* Make sure the buffer checks out ok */ 6045 fletcher_4_native(pi->pi_buf, asize, &calc_cksum); 6046 if (!ZIO_CHECKSUM_EQUAL(calc_cksum, cksum)) { 6047 err = EINVAL; 6048 goto cleanup; 6049 } 6050 6051 /* Now we can take our time decoding this buffer */ 6052 if ((err = l2arc_pbuf_decode(pi->pi_buf, asize, pb)) != 0) 6053 goto cleanup; 6054 6055 /* This will be used in l2arc_pbuf_restore for space accounting */ 6056 pb->pb_asize = asize; 6057 6058 ARCSTAT_F_AVG(arcstat_l2_meta_avg_size, L2PBUF_ENCODED_SIZE(pb)); 6059 ARCSTAT_F_AVG(arcstat_l2_meta_avg_asize, asize); 6060 ARCSTAT_F_AVG(arcstat_l2_asize_to_meta_ratio, 6061 pb->pb_payload_asz / asize); 6062 6063 cleanup: 6064 kmem_free(pi->pi_buf, pi->pi_buflen); 6065 pi->pi_buf = NULL; 6066 kmem_free(pi, sizeof (l2arc_prefetch_info_t)); 6067 /* Abort an in-flight prefetch in case of error */ 6068 if (err != 0 && *prefetch_io != NULL) { 6069 l2arc_pbuf_prefetch_abort(*prefetch_io); 6070 *prefetch_io = NULL; 6071 } 6072 return (err); 6073 } 6074 6075 /* 6076 * Validates a pbuf device address to make sure that it can be read 6077 * from the provided L2ARC device. Returns 1 if the address is within 6078 * the device's bounds, or 0 if not. 
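 * A pointer is valid when the referenced extent lies entirely within the
 * device's [l2ad_start, l2ad_end] range, its size falls between
 * L2PBUF_HDR_SIZE and L2PBUF_MAX_PAYLOAD_SIZE, and the device address is
 * aligned to the device's minimum allocation size.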
6079 */ 6080 static int 6081 l2arc_pbuf_ptr_valid(l2arc_dev_t *dev, uint64_t daddr, uint32_t asize) 6082 { 6083 uint32_t psize; 6084 uint64_t end; 6085 6086 psize = vdev_psize_to_asize(dev->l2ad_vdev, asize); 6087 end = daddr + psize; 6088 6089 if (end > dev->l2ad_end || asize < L2PBUF_HDR_SIZE || 6090 asize > L2PBUF_MAX_PAYLOAD_SIZE || daddr < dev->l2ad_start || 6091 /* check that the buffer address is correctly aligned */ 6092 (daddr & (vdev_psize_to_asize(dev->l2ad_vdev, 6093 SPA_MINBLOCKSIZE) - 1)) != 0) 6094 return (0); 6095 else 6096 return (1); 6097 } 6098 6099 /* 6100 * Starts an asynchronous read IO to read a pbuf. This is used in pbuf 6101 * reconstruction to start reading the next pbuf before we are done 6102 * decoding and reconstructing the current pbuf, to keep the l2arc device 6103 * nice and hot with read IO to process. 6104 * The returned zio will contain a newly allocated memory buffers for the IO 6105 * data which should then be freed by the caller once the zio is no longer 6106 * needed (i.e. due to it having completed). If you wish to abort this 6107 * zio, you should do so using l2arc_pbuf_prefetch_abort, which takes care 6108 * of disposing of the allocated buffers correctly. 6109 */ 6110 static zio_t * 6111 l2arc_pbuf_prefetch(vdev_t *vd, uint64_t daddr, uint32_t asize) 6112 { 6113 uint32_t i, psize; 6114 zio_t *pio, *hdr_io; 6115 uint64_t hdr_rsize; 6116 uint8_t *buf; 6117 l2arc_prefetch_info_t *pinfo; 6118 6119 psize = vdev_psize_to_asize(vd, asize); 6120 buf = kmem_alloc(psize, KM_SLEEP); 6121 pinfo = kmem_alloc(sizeof (l2arc_prefetch_info_t), KM_SLEEP); 6122 pinfo->pi_buf = buf; 6123 pinfo->pi_buflen = psize; 6124 6125 /* 6126 * We start issuing the IO for the pbuf header early. This 6127 * allows l2arc_pbuf_read to start issuing IO for the next 6128 * buffer before the current pbuf is read in completely. 6129 */ 6130 6131 hdr_rsize = vdev_psize_to_asize(vd, SPA_MINBLOCKSIZE); 6132 ASSERT(hdr_rsize <= psize); 6133 pinfo->pi_hdr_io = zio_root(vd->vdev_spa, NULL, NULL, 6134 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | 6135 ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY); 6136 hdr_io = zio_read_phys(pinfo->pi_hdr_io, vd, daddr, hdr_rsize, buf, 6137 ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_SYNC_READ, 6138 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | 6139 ZIO_FLAG_DONT_RETRY, B_FALSE); 6140 (void) zio_nowait(hdr_io); 6141 6142 /* 6143 * Read in the rest of the pbuf - this can take longer than just 6144 * having a peek at the header. 6145 */ 6146 pio = zio_root(vd->vdev_spa, NULL, pinfo, ZIO_FLAG_DONT_CACHE | 6147 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | 6148 ZIO_FLAG_DONT_RETRY); 6149 for (i = hdr_rsize; i < psize; ) { 6150 uint64_t rsize = psize - i; 6151 zio_t *rzio; 6152 6153 if (psize - i > SPA_MAXBLOCKSIZE) 6154 rsize = SPA_MAXBLOCKSIZE; 6155 ASSERT(rsize >= SPA_MINBLOCKSIZE); 6156 rzio = zio_read_phys(pio, vd, daddr + i, 6157 rsize, buf + i, ZIO_CHECKSUM_OFF, NULL, NULL, 6158 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_DONT_CACHE | 6159 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | 6160 ZIO_FLAG_DONT_RETRY, B_FALSE); 6161 (void) zio_nowait(rzio); 6162 i += rsize; 6163 } 6164 6165 return (pio); 6166 } 6167 6168 /* 6169 * Aborts a zio returned from l2arc_pbuf_prefetch and frees the data 6170 * buffers allocated for it. 
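 * Both the header zio and the main payload zio are waited on before the
 * prefetch buffer and its bookkeeping structure are freed.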
6171 */
6172 static void
6173 l2arc_pbuf_prefetch_abort(zio_t *zio)
6174 {
6175 l2arc_prefetch_info_t *pi;
6176
6177 pi = zio->io_private;
6178 ASSERT(pi != NULL);
6179 if (pi->pi_hdr_io != NULL)
6180 (void) zio_wait(pi->pi_hdr_io);
6181 (void) zio_wait(zio);
6182 kmem_free(pi->pi_buf, pi->pi_buflen);
6183 pi->pi_buf = NULL;
6184 kmem_free(pi, sizeof (l2arc_prefetch_info_t));
6185 }
6186
6187 /*
6188 * Encodes an l2uberblock_t structure into a destination buffer. This
6189 * buffer must be at least L2UBERBLOCK_SIZE bytes long. The resulting
6190 * uberblock is always of this constant size.
6191 */
6192 static void
6193 l2arc_uberblock_encode(const l2uberblock_t *ub, uint8_t *buf)
6194 {
6195 zio_cksum_t cksum;
6196
6197 bzero(buf, L2UBERBLOCK_SIZE);
6198
6199 #if defined(_BIG_ENDIAN)
6200 *(uint32_t *)buf = L2UBERBLOCK_MAGIC;
6201 *(uint16_t *)(buf + 6) = L2UBLK_BIG_ENDIAN;
6202 #else /* !defined(_BIG_ENDIAN) */
6203 *(uint32_t *)buf = BSWAP_32(L2UBERBLOCK_MAGIC);
6204 /* zero flags is ok */
6205 #endif /* !defined(_BIG_ENDIAN) */
6206 buf[4] = L2UBERBLOCK_MAX_VERSION;
6207
6208 /* rest in native byte order */
6209 *(uint64_t *)(buf + 8) = ub->ub_spa_guid;
6210 *(uint64_t *)(buf + 16) = ub->ub_birth;
6211 *(uint64_t *)(buf + 24) = ub->ub_evict_tail;
6212 *(uint64_t *)(buf + 32) = ub->ub_alloc_space;
6213 *(uint64_t *)(buf + 40) = ub->ub_pbuf_daddr;
6214 *(uint32_t *)(buf + 48) = ub->ub_pbuf_asize;
6215 bcopy(&ub->ub_pbuf_cksum, buf + 52, 32);
6216
6217 fletcher_4_native(buf, L2UBERBLOCK_SIZE - 32, &cksum);
6218 bcopy(&cksum, buf + L2UBERBLOCK_SIZE - 32, 32);
6219 }
6220
6221 /*
6222 * Decodes an l2uberblock_t from an on-disk representation. Please note
6223 * that this function does not perform any uberblock validation and
6224 * checksumming - call l2arc_uberblock_verify() for that.
6225 */
6226 static void
6227 l2arc_uberblock_decode(const uint8_t *buf, l2uberblock_t *ub)
6228 {
6229 boolean_t bswap_needed;
6230
6231 /* these always come in big endian */
6232 #if defined(_BIG_ENDIAN)
6233 ub->ub_magic = *(uint32_t *)buf;
6234 ub->ub_flags = *(uint16_t *)(buf + 6);
6235 bswap_needed = ((ub->ub_flags & L2UBLK_BIG_ENDIAN) != 1);
6236 #else /* !defined(_BIG_ENDIAN) */
6237 ub->ub_magic = BSWAP_32(*(uint32_t *)buf);
6238 ub->ub_flags = BSWAP_16(*(uint16_t *)(buf + 6));
6239 bswap_needed = ((ub->ub_flags & L2UBLK_BIG_ENDIAN) != 0);
6240 #endif /* !defined(_BIG_ENDIAN) */
6241 ub->ub_version = buf[4];
6242
6243 ub->ub_spa_guid = *(uint64_t *)(buf + 8);
6244 ub->ub_birth = *(uint64_t *)(buf + 16);
6245 ub->ub_evict_tail = *(uint64_t *)(buf + 24);
6246 ub->ub_alloc_space = *(uint64_t *)(buf + 32);
6247 ub->ub_pbuf_daddr = *(uint64_t *)(buf + 40);
6248 ub->ub_pbuf_asize = *(uint32_t *)(buf + 48);
6249 bcopy(buf + 52, &ub->ub_pbuf_cksum, 32);
6250 bcopy(buf + L2UBERBLOCK_SIZE - 32, &ub->ub_cksum, 32);
6251
6252 /* swap the rest if endianness doesn't match us */
6253 if (bswap_needed) {
6254 ub->ub_spa_guid = BSWAP_64(ub->ub_spa_guid);
6255 ub->ub_birth = BSWAP_64(ub->ub_birth);
6256 ub->ub_evict_tail = BSWAP_64(ub->ub_evict_tail);
6257 ub->ub_alloc_space = BSWAP_64(ub->ub_alloc_space);
6258 ub->ub_pbuf_daddr = BSWAP_64(ub->ub_pbuf_daddr);
6259 ub->ub_pbuf_asize = BSWAP_32(ub->ub_pbuf_asize);
6260 ZIO_CHECKSUM_BSWAP(&ub->ub_pbuf_cksum);
6261 ZIO_CHECKSUM_BSWAP(&ub->ub_cksum);
6262 }
6263 }
6264
6265 /*
6266 * Verifies whether a decoded uberblock (via l2arc_uberblock_decode()) is
6267 * valid and matches its checksum.
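 * Verification covers the magic, the supported version range and the
 * owning spa's guid; finally the fletcher-4 checksum stored in the last
 * 32 bytes of the encoded uberblock (see l2arc_uberblock_encode) is
 * recomputed over everything that precedes it and compared.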
6268 */ 6269 static int 6270 l2arc_uberblock_verify(const uint8_t *buf, const l2uberblock_t *ub, 6271 uint64_t guid) 6272 { 6273 zio_cksum_t cksum; 6274 6275 if (ub->ub_magic != L2UBERBLOCK_MAGIC || 6276 ub->ub_version == 0 || ub->ub_version > L2UBERBLOCK_MAX_VERSION) 6277 /* 6278 * bad magic or invalid version => persistent l2arc not 6279 * supported 6280 */ 6281 return (ENOTSUP); 6282 6283 if (ub->ub_spa_guid != guid) 6284 /* this l2arc dev isn't ours */ 6285 return (EINVAL); 6286 6287 fletcher_4_native(buf, L2UBERBLOCK_SIZE - 32, &cksum); 6288 if (!ZIO_CHECKSUM_EQUAL(cksum, ub->ub_cksum)) 6289 /* bad checksum, corrupt uberblock */ 6290 return (EINVAL); 6291 6292 return (0); 6293 } 6294 6295 /* 6296 * Schedules a zio to update the uberblock on an l2arc device. The zio is 6297 * initiated as a child of `pio' and `cb' is filled with the information 6298 * needed to free the uberblock data buffer after writing. 6299 */ 6300 static void 6301 l2arc_uberblock_update(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) 6302 { 6303 uint8_t *ub_buf; 6304 l2uberblock_t ub; 6305 zio_t *wzio; 6306 vdev_stat_t st; 6307 6308 ASSERT(cb->l2wcb_ub_buf == NULL); 6309 vdev_get_stats(dev->l2ad_vdev, &st); 6310 6311 bzero(&ub, sizeof (ub)); 6312 ub.ub_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa); 6313 ub.ub_birth = dev->l2ad_uberblock_birth++; 6314 ub.ub_evict_tail = dev->l2ad_evict; 6315 ub.ub_alloc_space = st.vs_alloc; 6316 ub.ub_pbuf_daddr = dev->l2ad_pbuf_daddr; 6317 ub.ub_pbuf_asize = dev->l2ad_pbuf_asize; 6318 ub.ub_pbuf_cksum = dev->l2ad_pbuf_cksum; 6319 if (dev->l2ad_first) 6320 ub.ub_flags |= L2UBLK_EVICT_FIRST; 6321 6322 ub_buf = kmem_alloc(L2UBERBLOCK_SIZE, KM_SLEEP); 6323 cb->l2wcb_ub_buf = ub_buf; 6324 l2arc_uberblock_encode(&ub, ub_buf); 6325 wzio = zio_write_phys(pio, dev->l2ad_vdev, VDEV_LABEL_START_SIZE, 6326 L2UBERBLOCK_SIZE, ub_buf, ZIO_CHECKSUM_OFF, NULL, NULL, 6327 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); 6328 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, 6329 zio_t *, wzio); 6330 (void) zio_nowait(wzio); 6331 } 6332 6333 /* 6334 * Encodes a l2pbuf_t structure into the portable on-disk format. The 6335 * `buf' buffer must be suitably sized to hold the entire uncompressed 6336 * structure (use L2PBUF_ENCODED_SIZE()). If requested, this function 6337 * also compresses the buffer. 6338 * 6339 * The return value is the length of the resulting encoded pbuf structure. 6340 * This can be either equal to L2PBUF_ENCODED_SIZE(pb) if no compression 6341 * was applied, or smaller if compression was applied. In either case, 6342 * prior to writing to disk, the caller must suitably pad the output 6343 * buffer so that it is aligned on a multiple of the underlying storage 6344 * system's block size. 
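 *
 * For reference, the header produced below uses the following layout
 * (byte offsets; the magic and flags are stored big-endian, everything
 * else in the writer's native byte order):
 *
 *	 0	uint32_t	magic (L2PBUF_MAGIC)
 *	 4	uint8_t		version
 *	 6	uint16_t	flags
 *	 8	uint64_t	previous pbuf device address
 *	16	uint32_t	previous pbuf allocated size
 *	20	zio_cksum_t	previous pbuf checksum (32 bytes)
 *	52	uint32_t	encoded payload length
 *	56	...		payload items, L2PBUF_BUF_SIZE bytes each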
6345 */ 6346 static uint32_t 6347 l2arc_pbuf_encode(l2pbuf_t *pb, uint8_t *buf, uint32_t buflen) 6348 { 6349 uint16_t flags = 0; 6350 uint8_t *dst_buf; 6351 uint32_t enclen; 6352 l2pbuf_buflist_t *buflist; 6353 6354 enclen = L2PBUF_ENCODED_SIZE(pb); 6355 ASSERT(buflen >= enclen); 6356 bzero(buf, enclen); 6357 6358 /* non-header portions of pbufs are in native byte order */ 6359 *(uint64_t *)(buf + 8) = pb->pb_prev_daddr; 6360 *(uint32_t *)(buf + 16) = pb->pb_prev_asize; 6361 bcopy(&pb->pb_prev_cksum, buf + 20, 32); 6362 *(uint32_t *)(buf + 52) = enclen - L2PBUF_HDR_SIZE; 6363 6364 /* first we encode the buflists uncompressed */ 6365 dst_buf = buf + L2PBUF_HDR_SIZE; 6366 for (buflist = list_head(pb->pb_buflists_list); buflist; 6367 buflist = list_next(pb->pb_buflists_list, buflist)) { 6368 int i; 6369 6370 ASSERT(buflist->l2pbl_nbufs != 0); 6371 for (i = 0; i < buflist->l2pbl_nbufs; i++) { 6372 l2pbuf_buf_t *pbl_buf = &buflist->l2pbl_bufs[i]; 6373 6374 ASSERT(pbl_buf->b_size != 0); 6375 *(uint64_t *)dst_buf = pbl_buf->b_dva.dva_word[0]; 6376 *(uint64_t *)(dst_buf + 8) = pbl_buf->b_dva.dva_word[1]; 6377 *(uint64_t *)(dst_buf + 16) = pbl_buf->b_birth; 6378 *(uint64_t *)(dst_buf + 24) = pbl_buf->b_cksum0; 6379 bcopy(&pbl_buf->b_freeze_cksum, dst_buf + 32, 32); 6380 *(uint32_t *)(dst_buf + 64) = pbl_buf->b_size; 6381 *(uint64_t *)(dst_buf + 68) = pbl_buf->b_l2daddr; 6382 *(uint32_t *)(dst_buf + 76) = pbl_buf->b_l2asize; 6383 dst_buf[80] = pbl_buf->b_l2compress; 6384 dst_buf[81] = pbl_buf->b_contents_type; 6385 *(uint32_t *)(dst_buf + 84) = pbl_buf->b_flags; 6386 dst_buf += L2PBUF_BUF_SIZE; 6387 } 6388 } 6389 ASSERT((uint32_t)(dst_buf - buf) == enclen); 6390 6391 /* and then compress them if necessary */ 6392 if (enclen >= l2arc_pbuf_compress_minsz) { 6393 uint8_t *cbuf; 6394 size_t slen, clen; 6395 6396 slen = l2arc_pbuf_items_encoded_size(pb); 6397 cbuf = kmem_alloc(slen, KM_SLEEP); 6398 clen = lz4_compress(buf + L2PBUF_HDR_SIZE, cbuf, slen, slen, 0); 6399 ASSERT(clen != 0); 6400 if (clen < slen) { 6401 bcopy(cbuf, buf + L2PBUF_HDR_SIZE, clen); 6402 flags |= L2PBUF_COMPRESSED; 6403 /* zero out the rest of the input buffer */ 6404 bzero(buf + L2PBUF_HDR_SIZE + clen, 6405 buflen - (L2PBUF_HDR_SIZE + clen)); 6406 /* adjust our buffer length now that it's shortened */ 6407 enclen = L2PBUF_HDR_SIZE + clen; 6408 } 6409 kmem_free(cbuf, slen); 6410 } 6411 6412 /* the header goes last since `flags' may change due to compression */ 6413 #if defined(_BIG_ENDIAN) 6414 *(uint32_t *)buf = L2PBUF_MAGIC; 6415 flags |= L2PBUF_BIG_ENDIAN; 6416 *(uint16_t *)(buf + 6) = flags; 6417 #else /* !defined(_BIG_ENDIAN) */ 6418 *(uint32_t *)buf = BSWAP_32(L2PBUF_MAGIC); 6419 *(uint16_t *)(buf + 6) = BSWAP_16(flags); 6420 #endif /* !defined(_BIG_ENDIAN) */ 6421 buf[4] = L2PBUF_MAX_VERSION; 6422 6423 return (enclen); 6424 } 6425 6426 /* 6427 * Decodes a stored l2pbuf_t structure previously encoded using 6428 * l2arc_pbuf_encode. The source buffer is not modified. The passed pbuf 6429 * must be initialized by l2arc_pbuf_init by the caller beforehand, but 6430 * must not have been used to store any buffers yet. 6431 * 6432 * Please note that we don't do checksum verification here, as we don't 6433 * know our own checksum (that's know by the previous block in the linked 6434 * list, or by the uberblock). This should be performed by the caller 6435 * prior to calling l2arc_pbuf_decode. 
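 *
 * Each payload item is a fixed-size record of L2PBUF_BUF_SIZE bytes,
 * laid out exactly as written by l2arc_pbuf_encode: the two DVA words at
 * offsets 0 and 8, the birth txg at 16, cksum0 at 24, the 32-byte freeze
 * checksum at 32, the logical size at 64, the L2ARC device address at 68,
 * the allocated (possibly compressed) size at 76, the compression and
 * contents-type bytes at 80 and 81, and the persisted flags at 84.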
6436 */
6437 static int
6438 l2arc_pbuf_decode(uint8_t *input_buf, uint32_t buflen, l2pbuf_t *pb)
6439 {
6440 boolean_t bswap_needed;
6441 uint32_t payload_sz, payload_asz;
6442 uint8_t *src_bufs;
6443 l2pbuf_buflist_t *buflist;
6444 int i, nbufs;
6445
6446 ASSERT(input_buf != NULL);
6447 ASSERT(pb != NULL);
6448 ASSERT(pb->pb_version != 0);
6449 ASSERT(pb->pb_nbuflists == 0);
6450
6451 /* no valid buffer can be this small */
6452 if (buflen < L2PBUF_HDR_SIZE)
6453 return (EINVAL);
6454
6455 /* these always come in big endian */
6456 #if defined(_BIG_ENDIAN)
6457 pb->pb_magic = *(uint32_t *)input_buf;
6458 pb->pb_flags = *(uint16_t *)(input_buf + 6);
6459 bswap_needed = ((pb->pb_flags & L2PBUF_BIG_ENDIAN) != 1);
6460 #else /* !defined(_BIG_ENDIAN) */
6461 pb->pb_magic = BSWAP_32(*(uint32_t *)input_buf);
6462 pb->pb_flags = BSWAP_16(*(uint16_t *)(input_buf + 6));
6463 bswap_needed = ((pb->pb_flags & L2PBUF_BIG_ENDIAN) != 0);
6464 #endif /* !defined(_BIG_ENDIAN) */
6465 pb->pb_version = input_buf[4];
6466
6467 if (pb->pb_magic != L2PBUF_MAGIC || pb->pb_version == 0)
6468 return (EINVAL);
6469 if (pb->pb_version > L2PBUF_MAX_VERSION)
6470 return (ENOTSUP);
6471
6472 /* remainder of pbuf may need bswap'ping */
6473 pb->pb_prev_daddr = *(uint64_t *)(input_buf + 8);
6474 pb->pb_prev_asize = *(uint32_t *)(input_buf + 16);
6475 bcopy(input_buf + 20, &pb->pb_prev_cksum, 32);
6476 payload_sz = *(uint32_t *)(input_buf + 52);
6477 payload_asz = buflen - L2PBUF_HDR_SIZE;
6478
6479 if (bswap_needed) {
6480 pb->pb_prev_daddr = BSWAP_64(pb->pb_prev_daddr);
6481 pb->pb_prev_asize = BSWAP_32(pb->pb_prev_asize);
6482 ZIO_CHECKSUM_BSWAP(&pb->pb_prev_cksum);
6483 payload_sz = BSWAP_32(payload_sz);
6484 }
6485
6486 /* check for sensible buffer allocation limits */
6487 if (((pb->pb_flags & L2PBUF_COMPRESSED) && payload_sz <= payload_asz) ||
6488 (payload_sz > L2PBUF_MAX_PAYLOAD_SIZE) ||
6489 (payload_sz % L2PBUF_BUF_SIZE) != 0 || payload_sz == 0)
6490 return (EINVAL);
6491 nbufs = payload_sz / L2PBUF_BUF_SIZE;
6492
6493 /* decompression might be needed */
6494 if (pb->pb_flags & L2PBUF_COMPRESSED) {
6495 src_bufs = kmem_alloc(payload_sz, KM_SLEEP);
6496 if (lz4_decompress(input_buf + L2PBUF_HDR_SIZE, src_bufs,
6497 payload_asz, payload_sz, 0) != 0) {
6498 kmem_free(src_bufs, payload_sz);
6499 return (EINVAL);
6500 }
6501 } else {
6502 src_bufs = input_buf + L2PBUF_HDR_SIZE;
6503 }
6504
6505 /* Decode individual pbuf items from our source buffer.
*/
6506 buflist = l2arc_pbuf_buflist_alloc(pb, nbufs);
6507 for (i = 0; i < nbufs; i++) {
6508 l2pbuf_buf_t *pbl_buf = &buflist->l2pbl_bufs[i];
6509 const uint8_t *src = src_bufs + i * L2PBUF_BUF_SIZE;
6510
6511 pbl_buf->b_dva.dva_word[0] = *(uint64_t *)src;
6512 pbl_buf->b_dva.dva_word[1] = *(uint64_t *)(src + 8);
6513 pbl_buf->b_birth = *(uint64_t *)(src + 16);
6514 pbl_buf->b_cksum0 = *(uint64_t *)(src + 24);
6515 bcopy(src + 32, &pbl_buf->b_freeze_cksum, 32);
6516 pbl_buf->b_size = *(uint32_t *)(src + 64);
6517 pbl_buf->b_l2daddr = *(uint64_t *)(src + 68);
6518 pbl_buf->b_l2asize = *(uint32_t *)(src + 76);
6519 pbl_buf->b_l2compress = src[80];
6520 pbl_buf->b_contents_type = src[81];
6521 pbl_buf->b_flags = *(uint32_t *)(src + 84);
6522
6523 if (bswap_needed) {
6524 pbl_buf->b_dva.dva_word[0] =
6525 BSWAP_64(pbl_buf->b_dva.dva_word[0]);
6526 pbl_buf->b_dva.dva_word[1] =
6527 BSWAP_64(pbl_buf->b_dva.dva_word[1]);
6528 pbl_buf->b_birth = BSWAP_64(pbl_buf->b_birth);
6529 pbl_buf->b_cksum0 = BSWAP_64(pbl_buf->b_cksum0);
6530 ZIO_CHECKSUM_BSWAP(&pbl_buf->b_freeze_cksum);
6531 pbl_buf->b_size = BSWAP_32(pbl_buf->b_size);
6532 pbl_buf->b_l2daddr = BSWAP_64(pbl_buf->b_l2daddr);
6533 pbl_buf->b_l2asize = BSWAP_32(pbl_buf->b_l2asize);
6534 pbl_buf->b_flags = BSWAP_32(pbl_buf->b_flags);
6535 }
6536
6537 pb->pb_payload_asz += pbl_buf->b_l2asize;
6538 }
6539
6540 if (pb->pb_flags & L2PBUF_COMPRESSED)
6541 kmem_free(src_bufs, payload_sz);
6542
6543 return (0);
6544 }
6545
6546 /*
6547 * Decodes the previous buffer pointer encoded in a pbuf. This is used
6548 * during L2ARC reconstruction to "peek" at the next buffer and start
6549 * issuing IO to fetch it early, before decoding of the current buffer
6550 * is done (which can take time due to decompression).
6551 * Returns 0 on success (and fills in the return parameters `daddr',
6552 * `asize' and `cksum' with the info of the previous pbuf), and an errno
6553 * on error.
6554 */
6555 static int
6556 l2arc_pbuf_decode_prev_ptr(const uint8_t *buf, size_t buflen, uint64_t *daddr,
6557 uint32_t *asize, zio_cksum_t *cksum)
6558 {
6559 boolean_t bswap_needed;
6560 uint16_t version, flags;
6561 uint32_t magic;
6562
6563 ASSERT(buf != NULL);
6564
6565 /* no valid buffer can be this small */
6566 if (buflen <= L2PBUF_HDR_SIZE)
6567 return (EINVAL);
6568
6569 /* these always come in big endian */
6570 #if defined(_BIG_ENDIAN)
6571 magic = *(uint32_t *)buf;
6572 flags = *(uint16_t *)(buf + 6);
6573 bswap_needed = ((flags & L2PBUF_BIG_ENDIAN) != 1);
6574 #else /* !defined(_BIG_ENDIAN) */
6575 magic = BSWAP_32(*(uint32_t *)buf);
6576 flags = BSWAP_16(*(uint16_t *)(buf + 6));
6577 bswap_needed = ((flags & L2PBUF_BIG_ENDIAN) != 0);
6578 #endif /* !defined(_BIG_ENDIAN) */
6579 version = buf[4];
6580
6581 if (magic != L2PBUF_MAGIC || version == 0)
6582 return (EINVAL);
6583 if (version > L2PBUF_MAX_VERSION)
6584 return (ENOTSUP);
6585
6586 *daddr = *(uint64_t *)(buf + 8);
6587 *asize = *(uint32_t *)(buf + 16);
6588 bcopy(buf + 20, cksum, 32);
6589
6590 if (bswap_needed) {
6591 *daddr = BSWAP_64(*daddr);
6592 *asize = BSWAP_32(*asize);
6593 ZIO_CHECKSUM_BSWAP(cksum);
6594 }
6595
6596 return (0);
6597 }
6598
6599 /*
6600 * Initializes a pbuf structure into a clean state. All version and flags
6601 * fields are filled in as appropriate for this architecture.
6602 * If the structure was used before, first call l2arc_pbuf_destroy on it,
6603 * as this function assumes the structure is uninitialized.
6604 */
6605 static void
6606 l2arc_pbuf_init(l2pbuf_t *pb)
6607 {
6608 bzero(pb, sizeof (l2pbuf_t));
6609 pb->pb_version = L2PBUF_MAX_VERSION;
6610 #if defined(_BIG_ENDIAN)
6611 pb->pb_flags |= L2PBUF_BIG_ENDIAN;
6612 #endif
6613 pb->pb_buflists_list = kmem_zalloc(sizeof (list_t), KM_SLEEP);
6614 list_create(pb->pb_buflists_list, sizeof (l2pbuf_buflist_t),
6615 offsetof(l2pbuf_buflist_t, l2pbl_node));
6616 }
6617
6618 /*
6619 * Destroys a pbuf structure and puts it into a clean state ready to be
6620 * initialized by l2arc_pbuf_init. All buflists created by
6621 * l2arc_pbuf_buflist_alloc are released as well.
6622 */
6623 static void
6624 l2arc_pbuf_destroy(l2pbuf_t *pb)
6625 {
6626 list_t *buflist_list = pb->pb_buflists_list;
6627 l2pbuf_buflist_t *buflist;
6628
6629 while ((buflist = list_head(buflist_list)) != NULL) {
6630 ASSERT(buflist->l2pbl_nbufs > 0);
6631 kmem_free(buflist->l2pbl_bufs, sizeof (l2pbuf_buf_t) *
6632 buflist->l2pbl_nbufs);
6633 list_remove(buflist_list, buflist);
6634 kmem_free(buflist, sizeof (l2pbuf_buflist_t));
6635 }
6636 pb->pb_nbuflists = 0;
6637 list_destroy(pb->pb_buflists_list);
6638 kmem_free(pb->pb_buflists_list, sizeof (list_t));
6639 bzero(pb, sizeof (l2pbuf_t));
6640 }
6641
6642 /*
6643 * Allocates a new buflist inside of a pbuf, which can hold up to `nbufs'
6644 * buffers. This is used during the buffer write cycle - each cycle allocates
6645 * a new buflist and fills it with buffers it writes. Then, when the pbuf
6646 * reaches its buflist limit, it is committed to stable storage.
6647 */
6648 static l2pbuf_buflist_t *
6649 l2arc_pbuf_buflist_alloc(l2pbuf_t *pb, int nbufs)
6650 {
6651 l2pbuf_buflist_t *buflist;
6652
6653 ASSERT(pb->pb_buflists_list != NULL);
6654 buflist = kmem_zalloc(sizeof (l2pbuf_buflist_t), KM_SLEEP);
6655 buflist->l2pbl_nbufs = nbufs;
6656 buflist->l2pbl_bufs = kmem_zalloc(sizeof (l2pbuf_buf_t) * nbufs,
6657 KM_SLEEP);
6658 list_insert_tail(pb->pb_buflists_list, buflist);
6659 pb->pb_nbuflists++;
6660
6661 return (buflist);
6662 }
6663
6664 /*
6665 * Inserts ARC buffer `ab' into the pbuf `pb' buflist `pbl' at index `index'.
6666 * The buffer being inserted must be present in L2ARC.
6667 */
6668 static void
6669 l2arc_pbuflist_insert(l2pbuf_t *pb, l2pbuf_buflist_t *pbl,
6670 const arc_buf_hdr_t *ab, int index)
6671 {
6672 l2pbuf_buf_t *pb_buf;
6673 const l2arc_buf_hdr_t *l2hdr;
6674
6675 l2hdr = ab->b_l2hdr;
6676 ASSERT(l2hdr != NULL);
6677 ASSERT(pbl->l2pbl_nbufs > index);
6678
6679 pb_buf = &pbl->l2pbl_bufs[index];
6680 pb_buf->b_dva = ab->b_dva;
6681 pb_buf->b_birth = ab->b_birth;
6682 pb_buf->b_cksum0 = ab->b_cksum0;
6683 pb_buf->b_freeze_cksum = *ab->b_freeze_cksum;
6684 pb_buf->b_size = ab->b_size;
6685 pb_buf->b_l2daddr = l2hdr->b_daddr;
6686 pb_buf->b_l2asize = l2hdr->b_asize;
6687 pb_buf->b_l2compress = l2hdr->b_compress;
6688 pb_buf->b_contents_type = ab->b_type;
6689 pb_buf->b_flags = ab->b_flags & L2ARC_PERSIST_FLAGS;
6690 pb->pb_payload_asz += l2hdr->b_asize;
6691 }
6692
6693 /*
6694 * Commits a pbuf to stable storage. This routine is invoked when writing
6695 * ARC buffers to an L2ARC device. When the pbuf associated with the device
6696 * has reached its limits (either in size or in number of writes), it is
6697 * scheduled here for writing.
6698 * This function allocates some memory to temporarily hold the serialized
6699 * buffer to be written. This is then released in l2arc_write_done.
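 * The sequence below is: link the pbuf to its predecessor (previous
 * daddr, asize and checksum), encode it, write it out at the current
 * hand in chunks of at most SPA_MAXBLOCKSIZE, advance the hand past the
 * padded size and finally schedule an uberblock update so the device
 * points at the freshly written pbuf.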
6700 */ 6701 static void 6702 l2arc_pbuf_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) 6703 { 6704 l2pbuf_t *pb = &dev->l2ad_pbuf; 6705 uint64_t i, est_encsize, bufsize, encsize, io_size; 6706 uint8_t *pb_buf; 6707 6708 pb->pb_prev_daddr = dev->l2ad_pbuf_daddr; 6709 pb->pb_prev_asize = dev->l2ad_pbuf_asize; 6710 pb->pb_prev_cksum = dev->l2ad_pbuf_cksum; 6711 6712 est_encsize = L2PBUF_ENCODED_SIZE(pb); 6713 bufsize = vdev_psize_to_asize(dev->l2ad_vdev, est_encsize); 6714 pb_buf = kmem_zalloc(bufsize, KM_SLEEP); 6715 encsize = l2arc_pbuf_encode(pb, pb_buf, bufsize); 6716 cb->l2wcb_pbuf = pb_buf; 6717 cb->l2wcb_pbuf_size = bufsize; 6718 6719 dev->l2ad_pbuf_daddr = dev->l2ad_hand; 6720 dev->l2ad_pbuf_asize = encsize; 6721 fletcher_4_native(pb_buf, encsize, &dev->l2ad_pbuf_cksum); 6722 6723 io_size = vdev_psize_to_asize(dev->l2ad_vdev, encsize); 6724 for (i = 0; i < io_size; ) { 6725 zio_t *wzio; 6726 uint64_t wsize = io_size - i; 6727 6728 if (wsize > SPA_MAXBLOCKSIZE) 6729 wsize = SPA_MAXBLOCKSIZE; 6730 ASSERT(wsize >= SPA_MINBLOCKSIZE); 6731 wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand + i, 6732 wsize, pb_buf + i, ZIO_CHECKSUM_OFF, NULL, NULL, 6733 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); 6734 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, 6735 zio_t *, wzio); 6736 (void) zio_nowait(wzio); 6737 i += wsize; 6738 } 6739 6740 dev->l2ad_hand += io_size; 6741 vdev_space_update(dev->l2ad_vdev, io_size, 0, 0); 6742 l2arc_uberblock_update(dev, pio, cb); 6743 6744 ARCSTAT_INCR(arcstat_l2_write_bytes, io_size); 6745 ARCSTAT_BUMP(arcstat_l2_meta_writes); 6746 ARCSTAT_F_AVG(arcstat_l2_meta_avg_size, est_encsize); 6747 ARCSTAT_F_AVG(arcstat_l2_meta_avg_asize, encsize); 6748 ARCSTAT_F_AVG(arcstat_l2_asize_to_meta_ratio, 6749 pb->pb_payload_asz / encsize); 6750 } 6751 6752 /* 6753 * Returns the number of bytes occupied by the payload buffer items of 6754 * a pbuf in portable (on-disk) encoded form, i.e. the bytes following 6755 * L2PBUF_HDR_SIZE. 6756 */ 6757 static uint32_t 6758 l2arc_pbuf_items_encoded_size(l2pbuf_t *pb) 6759 { 6760 uint32_t size = 0; 6761 l2pbuf_buflist_t *buflist; 6762 6763 for (buflist = list_head(pb->pb_buflists_list); buflist != NULL; 6764 buflist = list_next(pb->pb_buflists_list, buflist)) 6765 size += L2PBUF_BUF_SIZE * buflist->l2pbl_nbufs; 6766 6767 return (size); 6768 }
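/*
 * Illustrative sketch (not part of the ARC implementation above): a
 * minimal userland model of the pbuf header chaining fields and their
 * byte offsets as used by l2arc_pbuf_encode() and
 * l2arc_pbuf_decode_prev_ptr(). The SKETCH_* constants, the checksum
 * struct and the use of memcpy() instead of direct pointer casts are
 * assumptions made only to keep the example self-contained and portable;
 * the byte-swap path for a foreign-endian writer is omitted.
 */
#include <assert.h>
#include <stdint.h>
#include <string.h>

#define	SKETCH_PBUF_MAGIC	0x4c325042u	/* placeholder magic */
#define	SKETCH_PBUF_HDR_SIZE	56

typedef struct sketch_cksum { uint64_t sc_word[4]; } sketch_cksum_t;

/* Store the 32-bit magic in big-endian byte order, as the encoder does. */
static void
sketch_put_be32(uint8_t *p, uint32_t v)
{
	p[0] = (uint8_t)(v >> 24);
	p[1] = (uint8_t)(v >> 16);
	p[2] = (uint8_t)(v >> 8);
	p[3] = (uint8_t)v;
}

/* Encode only the chaining fields, mirroring the offsets used above. */
static void
sketch_encode_hdr(uint8_t *buf, uint64_t prev_daddr, uint32_t prev_asize,
    const sketch_cksum_t *prev_cksum)
{
	memset(buf, 0, SKETCH_PBUF_HDR_SIZE);
	sketch_put_be32(buf, SKETCH_PBUF_MAGIC);	/* magic at offset 0 */
	buf[4] = 1;					/* version */
	/* flags at offset 6 stay zero for a little-endian writer */
	memcpy(buf + 8, &prev_daddr, sizeof (prev_daddr));
	memcpy(buf + 16, &prev_asize, sizeof (prev_asize));
	memcpy(buf + 20, prev_cksum, sizeof (*prev_cksum));
}

/* Peek at the chaining fields, as l2arc_pbuf_decode_prev_ptr() does. */
static void
sketch_decode_prev(const uint8_t *buf, uint64_t *daddr, uint32_t *asize,
    sketch_cksum_t *cksum)
{
	memcpy(daddr, buf + 8, sizeof (*daddr));
	memcpy(asize, buf + 16, sizeof (*asize));
	memcpy(cksum, buf + 20, sizeof (*cksum));
}

int
main(void)
{
	uint8_t hdr[SKETCH_PBUF_HDR_SIZE];
	sketch_cksum_t in = { { 1, 2, 3, 4 } }, out;
	uint64_t daddr;
	uint32_t asize;

	sketch_encode_hdr(hdr, 0x400000, 8192, &in);
	sketch_decode_prev(hdr, &daddr, &asize, &out);
	assert(daddr == 0x400000 && asize == 8192);
	assert(memcmp(&in, &out, sizeof (in)) == 0);
	return (0);
}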