4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4104 ::spa_space no longer works
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
--- old/usr/src/uts/common/fs/zfs/metaslab.c
+++ new/usr/src/uts/common/fs/zfs/metaslab.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright (c) 2013 by Delphix. All rights reserved.
23 lines elided
24 24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
25 25 */
26 26
27 27 #include <sys/zfs_context.h>
28 28 #include <sys/dmu.h>
29 29 #include <sys/dmu_tx.h>
30 30 #include <sys/space_map.h>
31 31 #include <sys/metaslab_impl.h>
32 32 #include <sys/vdev_impl.h>
33 33 #include <sys/zio.h>
34 +#include <sys/spa_impl.h>
34 35
35 36 /*
36 37 * Allow allocations to switch to gang blocks quickly. We do this to
37 38 * avoid having to load lots of space_maps in a given txg. There are,
38 39 * however, some cases where we want to avoid "fast" ganging and instead
39 40 * we want to do an exhaustive search of all metaslabs on this device.
40 41 * Currently we don't allow any gang, zil, or dump device related allocations
41 42 * to "fast" gang.
42 43 */
43 44 #define CAN_FASTGANG(flags) \
44 45 (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
45 46 METASLAB_GANG_AVOID)))
46 47
48 +#define METASLAB_WEIGHT_PRIMARY (1ULL << 63)
49 +#define METASLAB_WEIGHT_SECONDARY (1ULL << 62)
50 +#define METASLAB_ACTIVE_MASK \
51 + (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
52 +
47 53 uint64_t metaslab_aliquot = 512ULL << 10;
48 54 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
49 55
50 56 /*
51 57 * The in-core space map representation is more compact than its on-disk form.
52 58 * The zfs_condense_pct determines how much more compact the in-core
53 59 * space_map representation must be before we compact it on-disk.
54 60 * Values should be greater than or equal to 100.
55 61 */
56 62 int zfs_condense_pct = 200;
57 63
58 64 /*
59 65 * This value defines the number of allowed allocation failures per vdev.
60 66 * If a device reaches this threshold in a given txg then we consider skipping
61 67 * allocations on that device. The value of zfs_mg_alloc_failures is computed
62 68 * in zio_init() unless it has been overridden in /etc/system.
63 69 */
64 70 int zfs_mg_alloc_failures = 0;
65 71
66 72 /*
67 73 * The zfs_mg_noalloc_threshold defines which metaslab groups should
68 74 * be eligible for allocation. The value is defined as a percentage of
69 75 * free space. Metaslab groups that have more free space than
70 76 * zfs_mg_noalloc_threshold are always eligible for allocations. Once
71 77 * a metaslab group's free space is less than or equal to the
15 lines elided
72 78 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
73 79 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
74 80 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
75 81 * groups are allowed to accept allocations. Gang blocks are always
76 82 * eligible to allocate on any metaslab group. The default value of 0 means
77 83 * no metaslab group will be excluded based on this criterion.
78 84 */
79 85 int zfs_mg_noalloc_threshold = 0;
80 86
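
To make the threshold concrete, here is a minimal standalone sketch (ordinary userland C, not kernel code; the vdev numbers and the 30% threshold are hypothetical) of the integer math that metaslab_group_alloc_update() performs further down:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
        uint64_t vs_space = 10ULL << 40;        /* hypothetical 10T vdev */
        uint64_t vs_alloc = 8ULL << 40;         /* 8T of it already allocated */
        uint64_t threshold = 30;                /* example zfs_mg_noalloc_threshold */

        /* same integer math as metaslab_group_alloc_update() */
        uint64_t free_capacity = ((vs_space - vs_alloc) * 100) / (vs_space + 1);

        /*
         * Just under 20% free capacity: with a 30% threshold this group is
         * skipped as long as some other group in the pool is still above it.
         */
        printf("free capacity = %llu%%, allocatable = %s\n",
            (unsigned long long)free_capacity,
            free_capacity > threshold ? "yes" : "no");
        return (0);
}
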
81 87 /*
82 - * Metaslab debugging: when set, keeps all space maps in core to verify frees.
88 + * When set, all metaslabs will be loaded when the pool is first opened.
83 89 */
84 -static int metaslab_debug = 0;
90 +int metaslab_debug_load = 0;
85 91
86 92 /*
93 + * When set, prevents metaslabs from being unloaded.
94 + */
95 +int metaslab_debug_unload = 0;
96 +
97 +/*
87 98 * Minimum size which forces the dynamic allocator to change
88 99 * its allocation strategy. Once the space map cannot satisfy
89 100 * an allocation of this size then it switches to using a more
90 101 * aggressive strategy (i.e. search by size rather than offset).
91 102 */
92 103 uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;
93 104
94 105 /*
95 106 * The minimum free space, in percent, which must be available
96 107 * in a space map to continue allocations in a first-fit fashion.
97 108 * Once the space_map's free space drops below this level we dynamically
98 109 * switch to using best-fit allocations.
2 lines elided
99 110 */
100 111 int metaslab_df_free_pct = 4;
101 112
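
Putting the two tunables together, the following standalone sketch (made-up metaslab numbers, not kernel code) shows the check that metaslab_df_alloc() later performs against the real range tree, using the defaults declared above:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
        uint64_t df_alloc_threshold = 128ULL << 10;     /* default: SPA_MAXBLOCKSIZE (128K) */
        int df_free_pct = 4;                            /* default metaslab_df_free_pct */

        uint64_t ms_size = 1ULL << 30;          /* hypothetical 1G metaslab */
        uint64_t free_space = 30ULL << 20;      /* 30M of it still free */
        uint64_t max_segment = 96ULL << 10;     /* largest free segment is 96K */

        int free_pct = (int)(free_space * 100 / ms_size);

        /* first-fit is abandoned once either condition trips */
        if (max_segment < df_alloc_threshold || free_pct < df_free_pct)
                printf("switch to best-fit (size-ordered tree)\n");
        else
                printf("stay with first-fit (offset-ordered tree)\n");
        return (0);
}
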
102 113 /*
103 114 * A metaslab is considered "free" if it contains a contiguous
104 115 * segment which is greater than metaslab_min_alloc_size.
105 116 */
106 117 uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
107 118
108 119 /*
109 - * Max number of space_maps to prefetch.
120 + * Percentage of all cpus that can be used by the metaslab taskq.
110 121 */
111 -int metaslab_prefetch_limit = SPA_DVAS_PER_BP;
122 +int metaslab_load_pct = 50;
112 123
113 124 /*
114 - * Percentage bonus multiplier for metaslabs that are in the bonus area.
125 + * Determines how many txgs a metaslab may remain loaded without having any
126 + * allocations from it. As long as a metaslab continues to be used we will
127 + * keep it loaded.
115 128 */
116 -int metaslab_smo_bonus_pct = 150;
129 +int metaslab_unload_delay = TXG_SIZE * 2;
117 130
118 131 /*
119 132 * Should we be willing to write data to degraded vdevs?
120 133 */
121 134 boolean_t zfs_write_to_degraded = B_FALSE;
122 135
123 136 /*
137 + * Max number of metaslabs per group to preload.
138 + */
139 +int metaslab_preload_limit = SPA_DVAS_PER_BP;
140 +
141 +/*
142 + * Enable/disable preloading of metaslabs.
143 + */
144 +boolean_t metaslab_preload_enabled = B_TRUE;
145 +
146 +/*
147 + * Enable/disable additional weight factor for each metaslab.
148 + */
149 +boolean_t metaslab_weight_factor_enable = B_FALSE;
150 +
151 +
152 +/*
124 153 * ==========================================================================
125 154 * Metaslab classes
126 155 * ==========================================================================
127 156 */
128 157 metaslab_class_t *
129 -metaslab_class_create(spa_t *spa, space_map_ops_t *ops)
158 +metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
130 159 {
131 160 metaslab_class_t *mc;
132 161
133 162 mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
134 163
135 164 mc->mc_spa = spa;
136 165 mc->mc_rotor = NULL;
137 166 mc->mc_ops = ops;
138 167
139 168 return (mc);
140 169 }
141 170
142 171 void
143 172 metaslab_class_destroy(metaslab_class_t *mc)
144 173 {
145 174 ASSERT(mc->mc_rotor == NULL);
146 175 ASSERT(mc->mc_alloc == 0);
147 176 ASSERT(mc->mc_deferred == 0);
148 177 ASSERT(mc->mc_space == 0);
149 178 ASSERT(mc->mc_dspace == 0);
150 179
151 180 kmem_free(mc, sizeof (metaslab_class_t));
152 181 }
153 182
154 183 int
155 184 metaslab_class_validate(metaslab_class_t *mc)
156 185 {
157 186 metaslab_group_t *mg;
158 187 vdev_t *vd;
159 188
160 189 /*
161 190 * Must hold one of the spa_config locks.
162 191 */
163 192 ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
164 193 spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
165 194
166 195 if ((mg = mc->mc_rotor) == NULL)
167 196 return (0);
168 197
169 198 do {
170 199 vd = mg->mg_vd;
171 200 ASSERT(vd->vdev_mg != NULL);
172 201 ASSERT3P(vd->vdev_top, ==, vd);
173 202 ASSERT3P(mg->mg_class, ==, mc);
174 203 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
175 204 } while ((mg = mg->mg_next) != mc->mc_rotor);
176 205
177 206 return (0);
178 207 }
179 208
180 209 void
181 210 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
182 211 int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
183 212 {
184 213 atomic_add_64(&mc->mc_alloc, alloc_delta);
185 214 atomic_add_64(&mc->mc_deferred, defer_delta);
186 215 atomic_add_64(&mc->mc_space, space_delta);
187 216 atomic_add_64(&mc->mc_dspace, dspace_delta);
188 217 }
189 218
190 219 uint64_t
191 220 metaslab_class_get_alloc(metaslab_class_t *mc)
192 221 {
193 222 return (mc->mc_alloc);
194 223 }
195 224
196 225 uint64_t
197 226 metaslab_class_get_deferred(metaslab_class_t *mc)
198 227 {
199 228 return (mc->mc_deferred);
200 229 }
201 230
202 231 uint64_t
203 232 metaslab_class_get_space(metaslab_class_t *mc)
204 233 {
205 234 return (mc->mc_space);
206 235 }
207 236
208 237 uint64_t
209 238 metaslab_class_get_dspace(metaslab_class_t *mc)
210 239 {
211 240 return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
212 241 }
213 242
214 243 /*
215 244 * ==========================================================================
216 245 * Metaslab groups
217 246 * ==========================================================================
218 247 */
219 248 static int
220 249 metaslab_compare(const void *x1, const void *x2)
221 250 {
222 251 const metaslab_t *m1 = x1;
83 lines elided
223 252 const metaslab_t *m2 = x2;
224 253
225 254 if (m1->ms_weight < m2->ms_weight)
226 255 return (1);
227 256 if (m1->ms_weight > m2->ms_weight)
228 257 return (-1);
229 258
230 259 /*
231 260 * If the weights are identical, use the offset to force uniqueness.
232 261 */
233 - if (m1->ms_map->sm_start < m2->ms_map->sm_start)
262 + if (m1->ms_start < m2->ms_start)
234 263 return (-1);
235 - if (m1->ms_map->sm_start > m2->ms_map->sm_start)
264 + if (m1->ms_start > m2->ms_start)
236 265 return (1);
237 266
238 267 ASSERT3P(m1, ==, m2);
239 268
240 269 return (0);
241 270 }
242 271
243 272 /*
244 273 * Update the allocatable flag and the metaslab group's capacity.
245 274 * The allocatable flag is set to true if the capacity is below
246 275 * the zfs_mg_noalloc_threshold. If a metaslab group transitions
247 276 * from allocatable to non-allocatable or vice versa then the metaslab
248 277 * group's class is updated to reflect the transition.
249 278 */
250 279 static void
251 280 metaslab_group_alloc_update(metaslab_group_t *mg)
252 281 {
253 282 vdev_t *vd = mg->mg_vd;
254 283 metaslab_class_t *mc = mg->mg_class;
255 284 vdev_stat_t *vs = &vd->vdev_stat;
256 285 boolean_t was_allocatable;
257 286
258 287 ASSERT(vd == vd->vdev_top);
259 288
260 289 mutex_enter(&mg->mg_lock);
261 290 was_allocatable = mg->mg_allocatable;
262 291
263 292 mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
264 293 (vs->vs_space + 1);
265 294
266 295 mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold);
267 296
268 297 /*
269 298 * The mc_alloc_groups maintains a count of the number of
270 299 * groups in this metaslab class that are still above the
271 300 * zfs_mg_noalloc_threshold. This is used by the allocating
272 301 * threads to determine if they should avoid allocations to
273 302 * a given group. The allocator will avoid allocations to a group
274 303 * if that group has reached or is below the zfs_mg_noalloc_threshold
275 304 * and there are still other groups that are above the threshold.
276 305 * When a group transitions from allocatable to non-allocatable or
277 306 * vice versa we update the metaslab class to reflect that change.
278 307 * When the mc_alloc_groups value drops to 0 that means that all
279 308 * groups have reached the zfs_mg_noalloc_threshold making all groups
280 309 * eligible for allocations. This effectively means that all devices
281 310 * are balanced again.
282 311 */
283 312 if (was_allocatable && !mg->mg_allocatable)
284 313 mc->mc_alloc_groups--;
285 314 else if (!was_allocatable && mg->mg_allocatable)
286 315 mc->mc_alloc_groups++;
287 316 mutex_exit(&mg->mg_lock);
288 317 }
289 318
290 319 metaslab_group_t *
291 320 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
292 321 {
47 lines elided
293 322 metaslab_group_t *mg;
294 323
295 324 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
296 325 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
297 326 avl_create(&mg->mg_metaslab_tree, metaslab_compare,
298 327 sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
299 328 mg->mg_vd = vd;
300 329 mg->mg_class = mc;
301 330 mg->mg_activation_count = 0;
302 331
332 + mg->mg_taskq = taskq_create("metaslab_group_tasksq", metaslab_load_pct,
333 + minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
334 +
303 335 return (mg);
304 336 }
305 337
306 338 void
307 339 metaslab_group_destroy(metaslab_group_t *mg)
308 340 {
309 341 ASSERT(mg->mg_prev == NULL);
310 342 ASSERT(mg->mg_next == NULL);
311 343 /*
312 344 * We may have gone below zero with the activation count
313 345 * either because we never activated in the first place or
314 346 * because we're done, and possibly removing the vdev.
315 347 */
316 348 ASSERT(mg->mg_activation_count <= 0);
317 349
318 350 avl_destroy(&mg->mg_metaslab_tree);
319 351 mutex_destroy(&mg->mg_lock);
320 352 kmem_free(mg, sizeof (metaslab_group_t));
321 353 }
322 354
323 355 void
324 356 metaslab_group_activate(metaslab_group_t *mg)
325 357 {
326 358 metaslab_class_t *mc = mg->mg_class;
327 359 metaslab_group_t *mgprev, *mgnext;
328 360
329 361 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
330 362
331 363 ASSERT(mc->mc_rotor != mg);
332 364 ASSERT(mg->mg_prev == NULL);
333 365 ASSERT(mg->mg_next == NULL);
334 366 ASSERT(mg->mg_activation_count <= 0);
335 367
336 368 if (++mg->mg_activation_count <= 0)
337 369 return;
338 370
339 371 mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
340 372 metaslab_group_alloc_update(mg);
341 373
342 374 if ((mgprev = mc->mc_rotor) == NULL) {
343 375 mg->mg_prev = mg;
344 376 mg->mg_next = mg;
345 377 } else {
346 378 mgnext = mgprev->mg_next;
347 379 mg->mg_prev = mgprev;
348 380 mg->mg_next = mgnext;
349 381 mgprev->mg_next = mg;
350 382 mgnext->mg_prev = mg;
351 383 }
352 384 mc->mc_rotor = mg;
353 385 }
354 386
355 387 void
356 388 metaslab_group_passivate(metaslab_group_t *mg)
357 389 {
358 390 metaslab_class_t *mc = mg->mg_class;
359 391 metaslab_group_t *mgprev, *mgnext;
360 392
48 lines elided
361 393 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
362 394
363 395 if (--mg->mg_activation_count != 0) {
364 396 ASSERT(mc->mc_rotor != mg);
365 397 ASSERT(mg->mg_prev == NULL);
366 398 ASSERT(mg->mg_next == NULL);
367 399 ASSERT(mg->mg_activation_count < 0);
368 400 return;
369 401 }
370 402
403 + taskq_wait(mg->mg_taskq);
404 +
371 405 mgprev = mg->mg_prev;
372 406 mgnext = mg->mg_next;
373 407
374 408 if (mg == mgnext) {
375 409 mc->mc_rotor = NULL;
376 410 } else {
377 411 mc->mc_rotor = mgnext;
378 412 mgprev->mg_next = mgnext;
379 413 mgnext->mg_prev = mgprev;
380 414 }
381 415
382 416 mg->mg_prev = NULL;
383 417 mg->mg_next = NULL;
384 418 }
385 419
386 420 static void
387 421 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
388 422 {
389 423 mutex_enter(&mg->mg_lock);
390 424 ASSERT(msp->ms_group == NULL);
391 425 msp->ms_group = mg;
392 426 msp->ms_weight = 0;
393 427 avl_add(&mg->mg_metaslab_tree, msp);
394 428 mutex_exit(&mg->mg_lock);
395 429 }
396 430
397 431 static void
398 432 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
399 433 {
400 434 mutex_enter(&mg->mg_lock);
401 435 ASSERT(msp->ms_group == mg);
402 436 avl_remove(&mg->mg_metaslab_tree, msp);
403 437 msp->ms_group = NULL;
404 438 mutex_exit(&mg->mg_lock);
405 439 }
406 440
407 441 static void
408 442 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
409 443 {
410 444 /*
411 445 * Although in principle the weight can be any value, in
412 446 * practice we do not use values in the range [1, 510].
413 447 */
414 448 ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
415 449 ASSERT(MUTEX_HELD(&msp->ms_lock));
416 450
417 451 mutex_enter(&mg->mg_lock);
418 452 ASSERT(msp->ms_group == mg);
419 453 avl_remove(&mg->mg_metaslab_tree, msp);
420 454 msp->ms_weight = weight;
421 455 avl_add(&mg->mg_metaslab_tree, msp);
422 456 mutex_exit(&mg->mg_lock);
423 457 }
424 458
425 459 /*
426 460 * Determine if a given metaslab group should skip allocations. A metaslab
427 461 * group should avoid allocations if its used capacity has crossed the
428 462 * zfs_mg_noalloc_threshold and there is at least one metaslab group
429 463 * that can still handle allocations.
430 464 */
431 465 static boolean_t
432 466 metaslab_group_allocatable(metaslab_group_t *mg)
433 467 {
434 468 vdev_t *vd = mg->mg_vd;
435 469 spa_t *spa = vd->vdev_spa;
436 470 metaslab_class_t *mc = mg->mg_class;
437 471
438 472 /*
439 473 * A metaslab group is considered allocatable if its free capacity
59 lines elided
440 474 * is greater than the set value of zfs_mg_noalloc_threshold, it's
441 475 * associated with a slog, or there are no other metaslab groups
442 476 * with free capacity greater than zfs_mg_noalloc_threshold.
443 477 */
444 478 return (mg->mg_free_capacity > zfs_mg_noalloc_threshold ||
445 479 mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0);
446 480 }
447 481
448 482 /*
449 483 * ==========================================================================
450 - * Common allocator routines
484 + * Range tree callbacks
451 485 * ==========================================================================
452 486 */
487 +
488 +/*
489 + * Comparison function for the private size-ordered tree. Tree is sorted
490 + * by size, larger sizes at the end of the tree.
491 + */
453 492 static int
454 -metaslab_segsize_compare(const void *x1, const void *x2)
493 +metaslab_rangesize_compare(const void *x1, const void *x2)
455 494 {
456 - const space_seg_t *s1 = x1;
457 - const space_seg_t *s2 = x2;
458 - uint64_t ss_size1 = s1->ss_end - s1->ss_start;
459 - uint64_t ss_size2 = s2->ss_end - s2->ss_start;
495 + const range_seg_t *r1 = x1;
496 + const range_seg_t *r2 = x2;
497 + uint64_t rs_size1 = r1->rs_end - r1->rs_start;
498 + uint64_t rs_size2 = r2->rs_end - r2->rs_start;
460 499
461 - if (ss_size1 < ss_size2)
500 + if (rs_size1 < rs_size2)
462 501 return (-1);
463 - if (ss_size1 > ss_size2)
502 + if (rs_size1 > rs_size2)
464 503 return (1);
465 504
466 - if (s1->ss_start < s2->ss_start)
505 + if (r1->rs_start < r2->rs_start)
467 506 return (-1);
468 - if (s1->ss_start > s2->ss_start)
507 +
508 + if (r1->rs_start > r2->rs_start)
469 509 return (1);
470 510
471 511 return (0);
472 512 }
473 513
474 514 /*
475 - * This is a helper function that can be used by the allocator to find
476 - * a suitable block to allocate. This will search the specified AVL
477 - * tree looking for a block that matches the specified criteria.
515 + * Create any block allocator specific components. The current allocators
516 + * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
478 517 */
479 -static uint64_t
480 -metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
481 - uint64_t align)
518 +static void
519 +metaslab_rt_create(range_tree_t *rt, void *arg)
482 520 {
483 - space_seg_t *ss, ssearch;
484 - avl_index_t where;
521 + metaslab_t *msp = arg;
485 522
486 - ssearch.ss_start = *cursor;
487 - ssearch.ss_end = *cursor + size;
523 + ASSERT3P(rt->rt_arg, ==, msp);
524 + ASSERT(msp->ms_tree == NULL);
488 525
489 - ss = avl_find(t, &ssearch, &where);
490 - if (ss == NULL)
491 - ss = avl_nearest(t, where, AVL_AFTER);
492 -
493 - while (ss != NULL) {
494 - uint64_t offset = P2ROUNDUP(ss->ss_start, align);
495 -
496 - if (offset + size <= ss->ss_end) {
497 - *cursor = offset + size;
498 - return (offset);
499 - }
500 - ss = AVL_NEXT(t, ss);
501 - }
502 -
503 - /*
504 - * If we know we've searched the whole map (*cursor == 0), give up.
505 - * Otherwise, reset the cursor to the beginning and try again.
506 - */
507 - if (*cursor == 0)
508 - return (-1ULL);
509 -
510 - *cursor = 0;
511 - return (metaslab_block_picker(t, cursor, size, align));
526 + avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
527 + sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
512 528 }
513 529
530 +/*
531 + * Destroy the block allocator specific components.
532 + */
514 533 static void
515 -metaslab_pp_load(space_map_t *sm)
534 +metaslab_rt_destroy(range_tree_t *rt, void *arg)
516 535 {
517 - space_seg_t *ss;
536 + metaslab_t *msp = arg;
518 537
519 - ASSERT(sm->sm_ppd == NULL);
520 - sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
538 + ASSERT3P(rt->rt_arg, ==, msp);
539 + ASSERT3P(msp->ms_tree, ==, rt);
540 + ASSERT0(avl_numnodes(&msp->ms_size_tree));
521 541
522 - sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
523 - avl_create(sm->sm_pp_root, metaslab_segsize_compare,
524 - sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));
525 -
526 - for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
527 - avl_add(sm->sm_pp_root, ss);
542 + avl_destroy(&msp->ms_size_tree);
528 543 }
529 544
530 545 static void
531 -metaslab_pp_unload(space_map_t *sm)
546 +metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
532 547 {
533 - void *cookie = NULL;
548 + metaslab_t *msp = arg;
534 549
535 - kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
536 - sm->sm_ppd = NULL;
537 -
538 - while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) {
539 - /* tear down the tree */
540 - }
541 -
542 - avl_destroy(sm->sm_pp_root);
543 - kmem_free(sm->sm_pp_root, sizeof (avl_tree_t));
544 - sm->sm_pp_root = NULL;
550 + ASSERT3P(rt->rt_arg, ==, msp);
551 + ASSERT3P(msp->ms_tree, ==, rt);
552 + VERIFY(!msp->ms_condensing);
553 + avl_add(&msp->ms_size_tree, rs);
545 554 }
546 555
547 -/* ARGSUSED */
548 556 static void
549 -metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size)
557 +metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
550 558 {
551 - /* No need to update cursor */
559 + metaslab_t *msp = arg;
560 +
561 + ASSERT3P(rt->rt_arg, ==, msp);
562 + ASSERT3P(msp->ms_tree, ==, rt);
563 + VERIFY(!msp->ms_condensing);
564 + avl_remove(&msp->ms_size_tree, rs);
552 565 }
553 566
554 -/* ARGSUSED */
555 567 static void
556 -metaslab_pp_free(space_map_t *sm, uint64_t start, uint64_t size)
568 +metaslab_rt_vacate(range_tree_t *rt, void *arg)
557 569 {
558 - /* No need to update cursor */
570 + metaslab_t *msp = arg;
571 +
572 + ASSERT3P(rt->rt_arg, ==, msp);
573 + ASSERT3P(msp->ms_tree, ==, rt);
574 +
575 + /*
576 + * Normally one would walk the tree freeing nodes along the way.
577 + * Since the nodes are shared with the range trees we can avoid
578 + * walking all nodes and just reinitialize the avl tree. The nodes
579 + * will be freed by the range tree, so we don't want to free them here.
580 + */
581 + avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
582 + sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
559 583 }
560 584
585 +static range_tree_ops_t metaslab_rt_ops = {
586 + metaslab_rt_create,
587 + metaslab_rt_destroy,
588 + metaslab_rt_add,
589 + metaslab_rt_remove,
590 + metaslab_rt_vacate
591 +};
592 +
561 593 /*
594 + * ==========================================================================
595 + * Metaslab block operations
596 + * ==========================================================================
597 + */
598 +
599 +/*
562 600 * Return the maximum contiguous segment within the metaslab.
563 601 */
564 602 uint64_t
565 -metaslab_pp_maxsize(space_map_t *sm)
603 +metaslab_block_maxsize(metaslab_t *msp)
566 604 {
567 - avl_tree_t *t = sm->sm_pp_root;
568 - space_seg_t *ss;
605 + avl_tree_t *t = &msp->ms_size_tree;
606 + range_seg_t *rs;
569 607
570 - if (t == NULL || (ss = avl_last(t)) == NULL)
608 + if (t == NULL || (rs = avl_last(t)) == NULL)
571 609 return (0ULL);
572 610
573 - return (ss->ss_end - ss->ss_start);
611 + return (rs->rs_end - rs->rs_start);
574 612 }
575 613
614 +uint64_t
615 +metaslab_block_alloc(metaslab_t *msp, uint64_t size)
616 +{
617 + uint64_t start;
618 + range_tree_t *rt = msp->ms_tree;
619 +
620 + VERIFY(!msp->ms_condensing);
621 +
622 + start = msp->ms_ops->msop_alloc(msp, size);
623 + if (start != -1ULL) {
624 + vdev_t *vd = msp->ms_group->mg_vd;
625 +
626 + VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
627 + VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
628 + VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
629 + range_tree_remove(rt, start, size);
630 + }
631 + return (start);
632 +}
633 +
576 634 /*
577 635 * ==========================================================================
636 + * Common allocator routines
637 + * ==========================================================================
638 + */
639 +
640 +/*
641 + * This is a helper function that can be used by the allocator to find
642 + * a suitable block to allocate. This will search the specified AVL
643 + * tree looking for a block that matches the specified criteria.
644 + */
645 +static uint64_t
646 +metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
647 + uint64_t align)
648 +{
649 + range_seg_t *rs, rsearch;
650 + avl_index_t where;
651 +
652 + rsearch.rs_start = *cursor;
653 + rsearch.rs_end = *cursor + size;
654 +
655 + rs = avl_find(t, &rsearch, &where);
656 + if (rs == NULL)
657 + rs = avl_nearest(t, where, AVL_AFTER);
658 +
659 + while (rs != NULL) {
660 + uint64_t offset = P2ROUNDUP(rs->rs_start, align);
661 +
662 + if (offset + size <= rs->rs_end) {
663 + *cursor = offset + size;
664 + return (offset);
665 + }
666 + rs = AVL_NEXT(t, rs);
667 + }
668 +
669 + /*
670 + * If we know we've searched the whole map (*cursor == 0), give up.
671 + * Otherwise, reset the cursor to the beginning and try again.
672 + */
673 + if (*cursor == 0)
674 + return (-1ULL);
675 +
676 + *cursor = 0;
677 + return (metaslab_block_picker(t, cursor, size, align));
678 +}
679 +
680 +/*
681 + * ==========================================================================
578 682 * The first-fit block allocator
579 683 * ==========================================================================
580 684 */
581 685 static uint64_t
582 -metaslab_ff_alloc(space_map_t *sm, uint64_t size)
686 +metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
583 687 {
584 - avl_tree_t *t = &sm->sm_root;
688 + /*
689 + * Find the largest power of 2 block size that evenly divides the
690 + * requested size. This is used to try to allocate blocks with similar
691 + * alignment from the same area of the metaslab (i.e. same cursor
692 + * bucket) but it does not guarantee that allocations of other sizes
693 + * will not exist in the same region.
694 + */
585 695 uint64_t align = size & -size;
586 - uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
696 + uint64_t *cursor = &msp->ms_lbas[highbit(align) - 1];
697 + avl_tree_t *t = &msp->ms_tree->rt_root;
587 698
588 699 return (metaslab_block_picker(t, cursor, size, align));
589 700 }
590 701
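
To make the cursor-bucket selection in metaslab_ff_alloc() concrete, here is a standalone sketch (the highbit() helper is a local stand-in for the kernel routine of the same name) showing which per-metaslab ms_lbas[] cursor a few request sizes map to:

#include <stdio.h>
#include <stdint.h>

/* local stand-in for the kernel's highbit(): highest set bit, counting from 1 */
static int
highbit(uint64_t i)
{
        int h = 0;

        while (i != 0) {
                h++;
                i >>= 1;
        }
        return (h);
}

int
main(void)
{
        uint64_t sizes[] = { 512, 4096, 12288, 131072 };

        for (int i = 0; i < 4; i++) {
                uint64_t size = sizes[i];
                uint64_t align = size & -size;  /* largest power of 2 dividing size */

                /* index into ms_lbas[] used by metaslab_ff_alloc()/metaslab_df_alloc() */
                printf("size %7llu -> align %7llu -> cursor bucket %d\n",
                    (unsigned long long)size, (unsigned long long)align,
                    highbit(align) - 1);
        }
        return (0);
}
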
591 702 /* ARGSUSED */
592 -boolean_t
593 -metaslab_ff_fragmented(space_map_t *sm)
703 +static boolean_t
704 +metaslab_ff_fragmented(metaslab_t *msp)
594 705 {
595 706 return (B_TRUE);
596 707 }
597 708
598 -static space_map_ops_t metaslab_ff_ops = {
599 - metaslab_pp_load,
600 - metaslab_pp_unload,
709 +static metaslab_ops_t metaslab_ff_ops = {
601 710 metaslab_ff_alloc,
602 - metaslab_pp_claim,
603 - metaslab_pp_free,
604 - metaslab_pp_maxsize,
605 711 metaslab_ff_fragmented
606 712 };
607 713
608 714 /*
609 715 * ==========================================================================
610 716 * Dynamic block allocator -
611 717 * Uses the first fit allocation scheme until space gets low and then
612 718 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
613 719 * and metaslab_df_free_pct to determine when to switch the allocation scheme.
614 720 * ==========================================================================
615 721 */
616 722 static uint64_t
617 -metaslab_df_alloc(space_map_t *sm, uint64_t size)
723 +metaslab_df_alloc(metaslab_t *msp, uint64_t size)
618 724 {
619 - avl_tree_t *t = &sm->sm_root;
725 + /*
726 + * Find the largest power of 2 block size that evenly divides the
727 + * requested size. This is used to try to allocate blocks with similar
728 + * alignment from the same area of the metaslab (i.e. same cursor
729 + * bucket) but it does not guarantee that allocations of other sizes
730 + * will not exist in the same region.
731 + */
620 732 uint64_t align = size & -size;
621 - uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
622 - uint64_t max_size = metaslab_pp_maxsize(sm);
623 - int free_pct = sm->sm_space * 100 / sm->sm_size;
733 + uint64_t *cursor = &msp->ms_lbas[highbit(align) - 1];
734 + range_tree_t *rt = msp->ms_tree;
735 + avl_tree_t *t = &rt->rt_root;
736 + uint64_t max_size = metaslab_block_maxsize(msp);
737 + int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
624 738
625 - ASSERT(MUTEX_HELD(sm->sm_lock));
626 - ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
739 + ASSERT(MUTEX_HELD(&msp->ms_lock));
740 + ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
627 741
628 742 if (max_size < size)
629 743 return (-1ULL);
630 744
631 745 /*
632 746 * If we're running low on space switch to using the size
633 747 * sorted AVL tree (best-fit).
634 748 */
635 749 if (max_size < metaslab_df_alloc_threshold ||
636 750 free_pct < metaslab_df_free_pct) {
637 - t = sm->sm_pp_root;
751 + t = &msp->ms_size_tree;
638 752 *cursor = 0;
639 753 }
640 754
641 755 return (metaslab_block_picker(t, cursor, size, 1ULL));
642 756 }
643 757
644 758 static boolean_t
645 -metaslab_df_fragmented(space_map_t *sm)
759 +metaslab_df_fragmented(metaslab_t *msp)
646 760 {
647 - uint64_t max_size = metaslab_pp_maxsize(sm);
648 - int free_pct = sm->sm_space * 100 / sm->sm_size;
761 + range_tree_t *rt = msp->ms_tree;
762 + uint64_t max_size = metaslab_block_maxsize(msp);
763 + int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
649 764
650 765 if (max_size >= metaslab_df_alloc_threshold &&
651 766 free_pct >= metaslab_df_free_pct)
652 767 return (B_FALSE);
653 768
654 769 return (B_TRUE);
655 770 }
656 771
657 -static space_map_ops_t metaslab_df_ops = {
658 - metaslab_pp_load,
659 - metaslab_pp_unload,
772 +static metaslab_ops_t metaslab_df_ops = {
660 773 metaslab_df_alloc,
661 - metaslab_pp_claim,
662 - metaslab_pp_free,
663 - metaslab_pp_maxsize,
664 774 metaslab_df_fragmented
665 775 };
666 776
667 777 /*
668 778 * ==========================================================================
669 - * Other experimental allocators
779 + * Cursor fit block allocator -
780 + * Select the largest region in the metaslab, set the cursor to the beginning
781 + * of the range and the cursor_end to the end of the range. As allocations
782 + * are made advance the cursor. Continue allocating from the cursor until
783 + * the range is exhausted and then find a new range.
670 784 * ==========================================================================
671 785 */
672 786 static uint64_t
673 -metaslab_cdf_alloc(space_map_t *sm, uint64_t size)
787 +metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
674 788 {
675 - avl_tree_t *t = &sm->sm_root;
676 - uint64_t *cursor = (uint64_t *)sm->sm_ppd;
677 - uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1;
678 - uint64_t max_size = metaslab_pp_maxsize(sm);
679 - uint64_t rsize = size;
789 + range_tree_t *rt = msp->ms_tree;
790 + avl_tree_t *t = &msp->ms_size_tree;
791 + uint64_t *cursor = &msp->ms_lbas[0];
792 + uint64_t *cursor_end = &msp->ms_lbas[1];
680 793 uint64_t offset = 0;
681 794
682 - ASSERT(MUTEX_HELD(sm->sm_lock));
683 - ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
795 + ASSERT(MUTEX_HELD(&msp->ms_lock));
796 + ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));
684 797
685 - if (max_size < size)
686 - return (-1ULL);
798 + ASSERT3U(*cursor_end, >=, *cursor);
687 799
688 - ASSERT3U(*extent_end, >=, *cursor);
800 + if ((*cursor + size) > *cursor_end) {
801 + range_seg_t *rs;
689 802
690 - /*
691 - * If we're running low on space switch to using the size
692 - * sorted AVL tree (best-fit).
693 - */
694 - if ((*cursor + size) > *extent_end) {
803 + rs = avl_last(&msp->ms_size_tree);
804 + if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
805 + return (-1ULL);
695 806
696 - t = sm->sm_pp_root;
697 - *cursor = *extent_end = 0;
698 -
699 - if (max_size > 2 * SPA_MAXBLOCKSIZE)
700 - rsize = MIN(metaslab_min_alloc_size, max_size);
701 - offset = metaslab_block_picker(t, extent_end, rsize, 1ULL);
702 - if (offset != -1)
703 - *cursor = offset + size;
704 - } else {
705 - offset = metaslab_block_picker(t, cursor, rsize, 1ULL);
807 + *cursor = rs->rs_start;
808 + *cursor_end = rs->rs_end;
706 809 }
707 - ASSERT3U(*cursor, <=, *extent_end);
810 +
811 + offset = *cursor;
812 + *cursor += size;
813 +
708 814 return (offset);
709 815 }
710 816
711 817 static boolean_t
712 -metaslab_cdf_fragmented(space_map_t *sm)
818 +metaslab_cf_fragmented(metaslab_t *msp)
713 819 {
714 - uint64_t max_size = metaslab_pp_maxsize(sm);
715 -
716 - if (max_size > (metaslab_min_alloc_size * 10))
717 - return (B_FALSE);
718 - return (B_TRUE);
820 + return (metaslab_block_maxsize(msp) < metaslab_min_alloc_size);
719 821 }
720 822
721 -static space_map_ops_t metaslab_cdf_ops = {
722 - metaslab_pp_load,
723 - metaslab_pp_unload,
724 - metaslab_cdf_alloc,
725 - metaslab_pp_claim,
726 - metaslab_pp_free,
727 - metaslab_pp_maxsize,
728 - metaslab_cdf_fragmented
823 +static metaslab_ops_t metaslab_cf_ops = {
824 + metaslab_cf_alloc,
825 + metaslab_cf_fragmented
729 826 };
730 827
828 +/*
829 + * ==========================================================================
830 + * New dynamic fit allocator -
831 + * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
832 + * contiguous blocks. If no region is found then just use the largest segment
833 + * that remains.
834 + * ==========================================================================
835 + */
836 +
837 +/*
838 + * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
839 + * to request from the allocator.
840 + */
731 841 uint64_t metaslab_ndf_clump_shift = 4;
732 842
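
For a sense of scale, the standalone sketch below (again with a local stand-in for the kernel's highbit()) computes the region size that metaslab_ndf_alloc() asks the size-ordered tree for when its cursor misses; the 8K request size is hypothetical:

#include <stdio.h>
#include <stdint.h>

/* local stand-in for the kernel's highbit(): highest set bit, counting from 1 */
static int
highbit(uint64_t i)
{
        int h = 0;

        while (i != 0) {
                h++;
                i >>= 1;
        }
        return (h);
}

int
main(void)
{
        uint64_t metaslab_ndf_clump_shift = 4;
        uint64_t size = 8192;                   /* hypothetical 8K allocation */
        int hbit = highbit(size);               /* 14 for an 8K request */

        /* lower bound handed to the size-ordered tree when falling back */
        uint64_t clump = 1ULL << (hbit + metaslab_ndf_clump_shift);

        printf("8K request -> ask for a region of at least %llu bytes "
            "(capped at the largest free segment)\n",
            (unsigned long long)clump);
        return (0);
}
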
733 843 static uint64_t
734 -metaslab_ndf_alloc(space_map_t *sm, uint64_t size)
844 +metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
735 845 {
736 - avl_tree_t *t = &sm->sm_root;
846 + avl_tree_t *t = &msp->ms_tree->rt_root;
737 847 avl_index_t where;
738 - space_seg_t *ss, ssearch;
848 + range_seg_t *rs, rsearch;
739 849 uint64_t hbit = highbit(size);
740 - uint64_t *cursor = (uint64_t *)sm->sm_ppd + hbit - 1;
741 - uint64_t max_size = metaslab_pp_maxsize(sm);
850 + uint64_t *cursor = &msp->ms_lbas[hbit - 1];
851 + uint64_t max_size = metaslab_block_maxsize(msp);
742 852
743 - ASSERT(MUTEX_HELD(sm->sm_lock));
744 - ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
853 + ASSERT(MUTEX_HELD(&msp->ms_lock));
854 + ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
745 855
746 856 if (max_size < size)
747 857 return (-1ULL);
748 858
749 - ssearch.ss_start = *cursor;
750 - ssearch.ss_end = *cursor + size;
859 + rsearch.rs_start = *cursor;
860 + rsearch.rs_end = *cursor + size;
751 861
752 - ss = avl_find(t, &ssearch, &where);
753 - if (ss == NULL || (ss->ss_start + size > ss->ss_end)) {
754 - t = sm->sm_pp_root;
862 + rs = avl_find(t, &rsearch, &where);
863 + if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
864 + t = &msp->ms_size_tree;
755 865
756 - ssearch.ss_start = 0;
757 - ssearch.ss_end = MIN(max_size,
866 + rsearch.rs_start = 0;
867 + rsearch.rs_end = MIN(max_size,
758 868 1ULL << (hbit + metaslab_ndf_clump_shift));
759 - ss = avl_find(t, &ssearch, &where);
760 - if (ss == NULL)
761 - ss = avl_nearest(t, where, AVL_AFTER);
762 - ASSERT(ss != NULL);
869 + rs = avl_find(t, &rsearch, &where);
870 + if (rs == NULL)
871 + rs = avl_nearest(t, where, AVL_AFTER);
872 + ASSERT(rs != NULL);
763 873 }
764 874
765 - if (ss != NULL) {
766 - if (ss->ss_start + size <= ss->ss_end) {
767 - *cursor = ss->ss_start + size;
768 - return (ss->ss_start);
769 - }
875 + if ((rs->rs_end - rs->rs_start) >= size) {
876 + *cursor = rs->rs_start + size;
877 + return (rs->rs_start);
770 878 }
771 879 return (-1ULL);
772 880 }
773 881
774 882 static boolean_t
775 -metaslab_ndf_fragmented(space_map_t *sm)
883 +metaslab_ndf_fragmented(metaslab_t *msp)
776 884 {
777 - uint64_t max_size = metaslab_pp_maxsize(sm);
778 -
779 - if (max_size > (metaslab_min_alloc_size << metaslab_ndf_clump_shift))
780 - return (B_FALSE);
781 - return (B_TRUE);
885 + return (metaslab_block_maxsize(msp) <=
886 + (metaslab_min_alloc_size << metaslab_ndf_clump_shift));
782 887 }
783 888
784 -
785 -static space_map_ops_t metaslab_ndf_ops = {
786 - metaslab_pp_load,
787 - metaslab_pp_unload,
889 +static metaslab_ops_t metaslab_ndf_ops = {
788 890 metaslab_ndf_alloc,
789 - metaslab_pp_claim,
790 - metaslab_pp_free,
791 - metaslab_pp_maxsize,
792 891 metaslab_ndf_fragmented
793 892 };
794 893
795 -space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
894 +metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
796 895
797 896 /*
798 897 * ==========================================================================
799 898 * Metaslabs
800 899 * ==========================================================================
801 900 */
901 +
902 +/*
903 + * Wait for any in-progress metaslab loads to complete.
904 + */
905 +void
906 +metaslab_load_wait(metaslab_t *msp)
907 +{
908 + ASSERT(MUTEX_HELD(&msp->ms_lock));
909 +
910 + while (msp->ms_loading) {
911 + ASSERT(!msp->ms_loaded);
912 + cv_wait(&msp->ms_load_cv, &msp->ms_lock);
913 + }
914 +}
915 +
916 +int
917 +metaslab_load(metaslab_t *msp)
918 +{
919 + int error = 0;
920 +
921 + ASSERT(MUTEX_HELD(&msp->ms_lock));
922 + ASSERT(!msp->ms_loaded);
923 + ASSERT(!msp->ms_loading);
924 +
925 + msp->ms_loading = B_TRUE;
926 +
927 + /*
928 + * If the space map has not been allocated yet, then treat
929 + * all the space in the metaslab as free and add it to the
930 + * ms_tree.
931 + */
932 + if (msp->ms_sm != NULL)
933 + error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
934 + else
935 + range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);
936 +
937 + msp->ms_loaded = (error == 0);
938 + msp->ms_loading = B_FALSE;
939 +
940 + if (msp->ms_loaded) {
941 + for (int t = 0; t < TXG_DEFER_SIZE; t++) {
942 + range_tree_walk(msp->ms_defertree[t],
943 + range_tree_remove, msp->ms_tree);
944 + }
945 + }
946 + cv_broadcast(&msp->ms_load_cv);
947 + return (error);
948 +}
949 +
950 +void
951 +metaslab_unload(metaslab_t *msp)
952 +{
953 + ASSERT(MUTEX_HELD(&msp->ms_lock));
954 + range_tree_vacate(msp->ms_tree, NULL, NULL);
955 + msp->ms_loaded = B_FALSE;
956 + msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
957 +}
958 +
802 959 metaslab_t *
803 -metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
804 - uint64_t start, uint64_t size, uint64_t txg)
960 +metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg)
805 961 {
806 962 vdev_t *vd = mg->mg_vd;
963 + objset_t *mos = vd->vdev_spa->spa_meta_objset;
807 964 metaslab_t *msp;
808 965
809 966 msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
810 967 mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);
968 + cv_init(&msp->ms_load_cv, NULL, CV_DEFAULT, NULL);
969 + msp->ms_id = id;
970 + msp->ms_start = id << vd->vdev_ms_shift;
971 + msp->ms_size = 1ULL << vd->vdev_ms_shift;
811 972
812 - msp->ms_smo_syncing = *smo;
973 + /*
974 + * We only open space map objects that already exist. All others
975 + * will be opened when we finally allocate an object for them.
976 + */
977 + if (object != 0) {
978 + VERIFY0(space_map_open(&msp->ms_sm, mos, object, msp->ms_start,
979 + msp->ms_size, vd->vdev_ashift, &msp->ms_lock));
980 + ASSERT(msp->ms_sm != NULL);
981 + }
813 982
814 983 /*
815 - * We create the main space map here, but we don't create the
816 - * allocmaps and freemaps until metaslab_sync_done(). This serves
984 + * We create the main range tree here, but we don't create the
985 + * alloctree and freetree until metaslab_sync_done(). This serves
817 986 * two purposes: it allows metaslab_sync_done() to detect the
818 987 * addition of new space; and for debugging, it ensures that we'd
819 988 * data fault on any attempt to use this metaslab before it's ready.
820 989 */
821 - msp->ms_map = kmem_zalloc(sizeof (space_map_t), KM_SLEEP);
822 - space_map_create(msp->ms_map, start, size,
823 - vd->vdev_ashift, &msp->ms_lock);
824 -
990 + msp->ms_tree = range_tree_create(&metaslab_rt_ops, msp, &msp->ms_lock);
825 991 metaslab_group_add(mg, msp);
826 992
827 - if (metaslab_debug && smo->smo_object != 0) {
828 - mutex_enter(&msp->ms_lock);
829 - VERIFY(space_map_load(msp->ms_map, mg->mg_class->mc_ops,
830 - SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0);
831 - mutex_exit(&msp->ms_lock);
832 - }
993 + msp->ms_ops = mg->mg_class->mc_ops;
833 994
834 995 /*
835 996 * If we're opening an existing pool (txg == 0) or creating
836 997 * a new one (txg == TXG_INITIAL), all space is available now.
837 998 * If we're adding space to an existing pool, the new space
838 999 * does not become available until after this txg has synced.
839 1000 */
840 1001 if (txg <= TXG_INITIAL)
841 1002 metaslab_sync_done(msp, 0);
842 1003
1004 + /*
1005 + * If metaslab_debug_load is set and we're initializing a metaslab
1006 + * that has an allocated space_map object then load its space
1007 + * map so that we can verify frees.
1008 + */
1009 + if (metaslab_debug_load && msp->ms_sm != NULL) {
1010 + mutex_enter(&msp->ms_lock);
1011 + VERIFY0(metaslab_load(msp));
1012 + mutex_exit(&msp->ms_lock);
1013 + }
1014 +
843 1015 if (txg != 0) {
844 1016 vdev_dirty(vd, 0, NULL, txg);
845 1017 vdev_dirty(vd, VDD_METASLAB, msp, txg);
846 1018 }
847 1019
848 1020 return (msp);
849 1021 }
850 1022
851 1023 void
852 1024 metaslab_fini(metaslab_t *msp)
853 1025 {
854 1026 metaslab_group_t *mg = msp->ms_group;
855 1027
856 - vdev_space_update(mg->mg_vd,
857 - -msp->ms_smo.smo_alloc, 0, -msp->ms_map->sm_size);
858 -
859 1028 metaslab_group_remove(mg, msp);
860 1029
861 1030 mutex_enter(&msp->ms_lock);
862 1031
863 - space_map_unload(msp->ms_map);
864 - space_map_destroy(msp->ms_map);
865 - kmem_free(msp->ms_map, sizeof (*msp->ms_map));
1032 + VERIFY(msp->ms_group == NULL);
1033 + vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
1034 + 0, -msp->ms_size);
1035 + space_map_close(msp->ms_sm);
866 1036
1037 + metaslab_unload(msp);
1038 + range_tree_destroy(msp->ms_tree);
1039 +
867 1040 for (int t = 0; t < TXG_SIZE; t++) {
868 - space_map_destroy(msp->ms_allocmap[t]);
869 - space_map_destroy(msp->ms_freemap[t]);
870 - kmem_free(msp->ms_allocmap[t], sizeof (*msp->ms_allocmap[t]));
871 - kmem_free(msp->ms_freemap[t], sizeof (*msp->ms_freemap[t]));
1041 + range_tree_destroy(msp->ms_alloctree[t]);
1042 + range_tree_destroy(msp->ms_freetree[t]);
872 1043 }
873 1044
874 1045 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
875 - space_map_destroy(msp->ms_defermap[t]);
876 - kmem_free(msp->ms_defermap[t], sizeof (*msp->ms_defermap[t]));
1046 + range_tree_destroy(msp->ms_defertree[t]);
877 1047 }
878 1048
879 1049 ASSERT0(msp->ms_deferspace);
880 1050
881 1051 mutex_exit(&msp->ms_lock);
1052 + cv_destroy(&msp->ms_load_cv);
882 1053 mutex_destroy(&msp->ms_lock);
883 1054
884 1055 kmem_free(msp, sizeof (metaslab_t));
885 1056 }
886 1057
887 -#define METASLAB_WEIGHT_PRIMARY (1ULL << 63)
888 -#define METASLAB_WEIGHT_SECONDARY (1ULL << 62)
889 -#define METASLAB_ACTIVE_MASK \
890 - (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
1058 +/*
1059 + * Apply a weighting factor based on the histogram information for this
1060 + * metaslab. The current weighting factor is somewhat arbitrary and requires
1061 + * additional investigation. The implementation provides a measure of
1062 + * "weighted" free space and gives a higher weighting for larger contiguous
1063 + * regions. The weighting factor is determined by counting the number of
1064 + * sm_shift sectors that exist in each region represented by the histogram.
1065 + * That value is then multiplied by the power of 2 exponent and the sm_shift
1066 + * value.
1067 + *
1068 + * For example, assume the 2^21 histogram bucket has 4 2MB regions and the
1069 + * metaslab has an sm_shift value of 9 (512B):
1070 + *
1071 + * 1) calculate the number of sm_shift sectors in the region:
1072 + * 2^21 / 2^9 = 2^12 = 4096 * 4 (number of regions) = 16384
1073 + * 2) multiply by the power of 2 exponent and the sm_shift value:
1074 + * 16384 * 21 * 9 = 3096576
1075 + * This value will be added to the weighting of the metaslab.
1076 + */
1077 +static uint64_t
1078 +metaslab_weight_factor(metaslab_t *msp)
1079 +{
1080 + uint64_t factor = 0;
1081 + uint64_t sectors;
1082 + int i;
891 1083
1084 + /*
1085 + * A null space map means that the entire metaslab is free, so
1086 + * calculate a weight factor that spans the entire size of the
1087 + * metaslab.
1088 + */
1089 + if (msp->ms_sm == NULL) {
1090 + vdev_t *vd = msp->ms_group->mg_vd;
1091 +
1092 + i = highbit(msp->ms_size) - 1;
1093 + sectors = msp->ms_size >> vd->vdev_ashift;
1094 + return (sectors * i * vd->vdev_ashift);
1095 + }
1096 +
1097 + if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
1098 + return (0);
1099 +
1100 + for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE(msp->ms_sm); i++) {
1101 + if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
1102 + continue;
1103 +
1104 + /*
1105 + * Determine the number of sm_shift sectors in the region
1106 + * indicated by the histogram. For example, given an
1107 + * sm_shift value of 9 (512 bytes) and i = 4 then we know
1108 + * that we're looking at an 8K region in the histogram
1109 + * (i.e. 9 + 4 = 13, 2^13 = 8192). To figure out the
1110 + * number of sm_shift sectors (512 bytes in this example),
1111 + * we would take 8192 / 512 = 16. Since the histogram
1112 + * is offset by sm_shift we can simply use the value
1113 + * of i to calculate this (i.e. 2^i = 16 where i = 4).
1114 + */
1115 + sectors = msp->ms_sm->sm_phys->smp_histogram[i] << i;
1116 + factor += (i + msp->ms_sm->sm_shift) * sectors;
1117 + }
1118 + return (factor * msp->ms_sm->sm_shift);
1119 +}
1120 +
892 1121 static uint64_t
893 1122 metaslab_weight(metaslab_t *msp)
894 1123 {
895 1124 metaslab_group_t *mg = msp->ms_group;
896 - space_map_t *sm = msp->ms_map;
897 - space_map_obj_t *smo = &msp->ms_smo;
898 1125 vdev_t *vd = mg->mg_vd;
899 1126 uint64_t weight, space;
900 1127
901 1128 ASSERT(MUTEX_HELD(&msp->ms_lock));
902 1129
903 1130 /*
904 1131 * This vdev is in the process of being removed so there is nothing
905 1132 * for us to do here.
906 1133 */
907 1134 if (vd->vdev_removing) {
908 - ASSERT0(smo->smo_alloc);
1135 + ASSERT0(space_map_allocated(msp->ms_sm));
909 1136 ASSERT0(vd->vdev_ms_shift);
910 1137 return (0);
911 1138 }
912 1139
913 1140 /*
914 1141 * The baseline weight is the metaslab's free space.
915 1142 */
916 - space = sm->sm_size - smo->smo_alloc;
1143 + space = msp->ms_size - space_map_allocated(msp->ms_sm);
917 1144 weight = space;
918 1145
919 1146 /*
920 1147 * Modern disks have uniform bit density and constant angular velocity.
921 1148 * Therefore, the outer recording zones are faster (higher bandwidth)
922 1149 * than the inner zones by the ratio of outer to inner track diameter,
923 1150 * which is typically around 2:1. We account for this by assigning
924 1151 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
925 1152 * In effect, this means that we'll select the metaslab with the most
926 1153 * free bandwidth rather than simply the one with the most free space.
927 1154 */
928 - weight = 2 * weight -
929 - ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count;
1155 + weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
930 1156 ASSERT(weight >= space && weight <= 2 * space);
931 1157
932 - /*
933 - * For locality, assign higher weight to metaslabs which have
934 - * a lower offset than what we've already activated.
935 - */
936 - if (sm->sm_start <= mg->mg_bonus_area)
937 - weight *= (metaslab_smo_bonus_pct / 100);
938 - ASSERT(weight >= space &&
939 - weight <= 2 * (metaslab_smo_bonus_pct / 100) * space);
1158 + msp->ms_factor = metaslab_weight_factor(msp);
1159 + if (metaslab_weight_factor_enable)
1160 + weight += msp->ms_factor;
940 1161
941 - if (sm->sm_loaded && !sm->sm_ops->smop_fragmented(sm)) {
1162 + if (msp->ms_loaded && !msp->ms_ops->msop_fragmented(msp)) {
942 1163 /*
943 1164 * If this metaslab is one we're actively using, adjust its
944 1165 * weight to make it preferable to any inactive metaslab so
945 1166 * we'll polish it off.
946 1167 */
947 1168 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
948 1169 }
1170 +
949 1171 return (weight);
950 1172 }
951 1173
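
A quick standalone sketch of the 2x-to-1x multiplier described above (a hypothetical vdev with 128 equally free metaslabs): the lowest metaslab ends up with roughly twice the weight of the highest one:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
        uint64_t ms_count = 128;                /* hypothetical vdev_ms_count */
        uint64_t space = 1ULL << 30;            /* 1G free in every metaslab */

        for (uint64_t ms_id = 0; ms_id < ms_count; ms_id += ms_count - 1) {
                /* same formula as metaslab_weight() above */
                uint64_t weight = 2 * space - (ms_id * space) / ms_count;

                printf("metaslab %3llu: weight %llu (%.2fx of free space)\n",
                    (unsigned long long)ms_id, (unsigned long long)weight,
                    (double)weight / (double)space);
        }
        return (0);
}
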
952 -static void
953 -metaslab_prefetch(metaslab_group_t *mg)
954 -{
955 - spa_t *spa = mg->mg_vd->vdev_spa;
956 - metaslab_t *msp;
957 - avl_tree_t *t = &mg->mg_metaslab_tree;
958 - int m;
959 -
960 - mutex_enter(&mg->mg_lock);
961 -
962 - /*
963 - * Prefetch the next potential metaslabs
964 - */
965 - for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) {
966 - space_map_t *sm = msp->ms_map;
967 - space_map_obj_t *smo = &msp->ms_smo;
968 -
969 - /* If we have reached our prefetch limit then we're done */
970 - if (m >= metaslab_prefetch_limit)
971 - break;
972 -
973 - if (!sm->sm_loaded && smo->smo_object != 0) {
974 - mutex_exit(&mg->mg_lock);
975 - dmu_prefetch(spa_meta_objset(spa), smo->smo_object,
976 - 0ULL, smo->smo_objsize);
977 - mutex_enter(&mg->mg_lock);
978 - }
979 - }
980 - mutex_exit(&mg->mg_lock);
981 -}
982 -
983 1174 static int
984 1175 metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
985 1176 {
986 - metaslab_group_t *mg = msp->ms_group;
987 - space_map_t *sm = msp->ms_map;
988 - space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;
989 -
990 1177 ASSERT(MUTEX_HELD(&msp->ms_lock));
991 1178
992 1179 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
993 - space_map_load_wait(sm);
994 - if (!sm->sm_loaded) {
995 - space_map_obj_t *smo = &msp->ms_smo;
996 -
997 - int error = space_map_load(sm, sm_ops, SM_FREE, smo,
998 - spa_meta_objset(msp->ms_group->mg_vd->vdev_spa));
999 - if (error) {
1180 + metaslab_load_wait(msp);
1181 + if (!msp->ms_loaded) {
1182 + int error = metaslab_load(msp);
1183 + if (error) {
1000 1184 metaslab_group_sort(msp->ms_group, msp, 0);
1001 1185 return (error);
1002 1186 }
1003 - for (int t = 0; t < TXG_DEFER_SIZE; t++)
1004 - space_map_walk(msp->ms_defermap[t],
1005 - space_map_claim, sm);
1006 -
1007 1187 }
1008 1188
1009 - /*
1010 - * Track the bonus area as we activate new metaslabs.
1011 - */
1012 - if (sm->sm_start > mg->mg_bonus_area) {
1013 - mutex_enter(&mg->mg_lock);
1014 - mg->mg_bonus_area = sm->sm_start;
1015 - mutex_exit(&mg->mg_lock);
1016 - }
1017 -
1018 1189 metaslab_group_sort(msp->ms_group, msp,
1019 1190 msp->ms_weight | activation_weight);
1020 1191 }
1021 - ASSERT(sm->sm_loaded);
1192 + ASSERT(msp->ms_loaded);
1022 1193 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
1023 1194
1024 1195 return (0);
1025 1196 }
1026 1197
1027 1198 static void
1028 1199 metaslab_passivate(metaslab_t *msp, uint64_t size)
1029 1200 {
1030 1201 /*
1031 1202 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
1032 1203 * this metaslab again. In that case, it had better be empty,
1033 1204 * or we would be leaving space on the table.
1034 1205 */
1035 - ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map->sm_space == 0);
1206 + ASSERT(size >= SPA_MINBLOCKSIZE || range_tree_space(msp->ms_tree) == 0);
1036 1207 metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
1037 1208 ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
1038 1209 }
1039 1210
1211 +static void
1212 +metaslab_preload(void *arg)
1213 +{
1214 + metaslab_t *msp = arg;
1215 + spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1216 +
1217 + mutex_enter(&msp->ms_lock);
1218 + metaslab_load_wait(msp);
1219 + if (!msp->ms_loaded)
1220 + (void) metaslab_load(msp);
1221 +
1222 + /*
1223 + * Set the ms_access_txg value so that we don't unload it right away.
1224 + */
1225 + msp->ms_access_txg = spa_syncing_txg(spa) + metaslab_unload_delay + 1;
1226 + mutex_exit(&msp->ms_lock);
1227 +}
1228 +
1229 +static void
1230 +metaslab_group_preload(metaslab_group_t *mg)
1231 +{
1232 + spa_t *spa = mg->mg_vd->vdev_spa;
1233 + metaslab_t *msp;
1234 + avl_tree_t *t = &mg->mg_metaslab_tree;
1235 + int m = 0;
1236 +
1237 + if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
1238 + taskq_wait(mg->mg_taskq);
1239 + return;
1240 + }
1241 + mutex_enter(&mg->mg_lock);
1242 +
1243 + /*
1244 + * Prefetch the next potential metaslabs
1245 + */
1246 + for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
1247 +
1248 + /* If we have reached our preload limit then we're done */
1249 + if (++m > metaslab_preload_limit)
1250 + break;
1251 +
1252 + VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
1253 + msp, TQ_SLEEP) != NULL);
1254 + }
1255 + mutex_exit(&mg->mg_lock);
1256 +}
1257 +
1040 1258 /*
1041 - * Determine if the in-core space map representation can be condensed on-disk.
1042 - * We would like to use the following criteria to make our decision:
1259 + * Determine if the space map's on-disk footprint is past our tolerance
1260 + * for inefficiency. We would like to use the following criteria to make
1261 + * our decision:
1043 1262 *
1044 1263 * 1. The size of the space map object should not dramatically increase as a
1045 - * result of writing out our in-core free map.
1264 + * result of writing out the free space range tree.
1046 1265 *
1047 1266 * 2. The minimal on-disk space map representation is zfs_condense_pct/100
1048 - * times the size than the in-core representation (i.e. zfs_condense_pct = 110
1049 - * and in-core = 1MB, minimal = 1.1.MB).
1267 + * times the size of the free space range tree representation
1268 + * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
1050 1269 *
1051 1270 * Checking the first condition is tricky since we don't want to walk
1052 1271 * the entire AVL tree calculating the estimated on-disk size. Instead we
1053 - * use the size-ordered AVL tree in the space map and calculate the
1054 - * size required for the largest segment in our in-core free map. If the
1272 + * use the size-ordered range tree in the metaslab and calculate the
1273 + * size required to write out the largest segment in our free tree. If the
1055 1274 * size required to represent that segment on disk is larger than the space
1056 1275 * map object then we avoid condensing this map.
1057 1276 *
1058 1277 * To determine the second criterion we use a best-case estimate and assume
1059 1278 * each segment can be represented on-disk as a single 64-bit entry. We refer
1060 1279 * to this best-case estimate as the space map's minimal form.
1061 1280 */
1062 1281 static boolean_t
1063 1282 metaslab_should_condense(metaslab_t *msp)
1064 1283 {
1065 - space_map_t *sm = msp->ms_map;
1066 - space_map_obj_t *smo = &msp->ms_smo_syncing;
1067 - space_seg_t *ss;
1284 + space_map_t *sm = msp->ms_sm;
1285 + range_seg_t *rs;
1068 1286 uint64_t size, entries, segsz;
1069 1287
1070 1288 ASSERT(MUTEX_HELD(&msp->ms_lock));
1071 - ASSERT(sm->sm_loaded);
1289 + ASSERT(msp->ms_loaded);
1072 1290
1073 1291 /*
1074 - * Use the sm_pp_root AVL tree, which is ordered by size, to obtain
1075 - * the largest segment in the in-core free map. If the tree is
1076 - * empty then we should condense the map.
1292 + * Use the ms_size_tree range tree, which is ordered by size, to
1293 + * obtain the largest segment in the free tree. If the tree is empty
1294 + * then we should condense the map.
1077 1295 */
1078 - ss = avl_last(sm->sm_pp_root);
1079 - if (ss == NULL)
1296 + rs = avl_last(&msp->ms_size_tree);
1297 + if (rs == NULL)
1080 1298 return (B_TRUE);
1081 1299
1082 1300 /*
1083 1301 * Calculate the number of 64-bit entries this segment would
1084 1302 * require when written to disk. If this single segment would be
1085 1303 * larger on-disk than the entire current on-disk structure, then
1086 1304 * clearly condensing will increase the on-disk structure size.
1087 1305 */
1088 - size = (ss->ss_end - ss->ss_start) >> sm->sm_shift;
1306 + size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
1089 1307 entries = size / (MIN(size, SM_RUN_MAX));
1090 1308 segsz = entries * sizeof (uint64_t);
1091 1309
1092 - return (segsz <= smo->smo_objsize &&
1093 - smo->smo_objsize >= (zfs_condense_pct *
1094 - sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) / 100);
1310 + return (segsz <= space_map_length(msp->ms_sm) &&
1311 + space_map_length(msp->ms_sm) >= (zfs_condense_pct *
1312 + sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root)) / 100);
1095 1313 }
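A minimal sketch of the two criteria above reduced to plain arithmetic, not part of this webrev. The parameter names and the SM_RUN_MAX value are assumptions of the sketch: largest_seg is the biggest segment in the size-ordered tree, smp_length stands in for space_map_length(), and free_segs for the node count of the free range tree.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define	SM_RUN_MAX	0x7fffULL		/* assumed max run per entry */
#define	MIN(a, b)	((a) < (b) ? (a) : (b))

static bool
should_condense(uint64_t largest_seg, int sm_shift, uint64_t smp_length,
    uint64_t free_segs, int condense_pct)
{
	/* entries needed to write the largest free segment by itself */
	uint64_t size = largest_seg >> sm_shift;
	uint64_t entries = size / MIN(size, SM_RUN_MAX);
	uint64_t segsz = entries * sizeof (uint64_t);

	/* best case: one 64-bit entry per free segment */
	uint64_t minimal = free_segs * sizeof (uint64_t);

	return (segsz <= smp_length &&
	    smp_length >= ((uint64_t)condense_pct * minimal) / 100);
}

int
main(void)
{
	/* 8M largest segment, 512-byte shift, 96K on disk, 1000 segments */
	printf("condense? %d\n", should_condense(8ULL << 20, 9,
	    96ULL << 10, 1000, 200));
	return (0);
}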
1096 1314
1097 1315 /*
1098 1316 * Condense the on-disk space map representation to its minimized form.
1099 1317 * The minimized form consists of a small number of allocations followed by
1100 - * the in-core free map.
1318 + * the entries of the free range tree.
1101 1319 */
1102 1320 static void
1103 1321 metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
1104 1322 {
1105 1323 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1106 - space_map_t *freemap = msp->ms_freemap[txg & TXG_MASK];
1107 - space_map_t condense_map;
1108 - space_map_t *sm = msp->ms_map;
1109 - objset_t *mos = spa_meta_objset(spa);
1110 - space_map_obj_t *smo = &msp->ms_smo_syncing;
1324 + range_tree_t *freetree = msp->ms_freetree[txg & TXG_MASK];
1325 + range_tree_t *condense_tree;
1326 + space_map_t *sm = msp->ms_sm;
1111 1327
1112 1328 ASSERT(MUTEX_HELD(&msp->ms_lock));
1113 1329 ASSERT3U(spa_sync_pass(spa), ==, 1);
1114 - ASSERT(sm->sm_loaded);
1330 + ASSERT(msp->ms_loaded);
1115 1331
1116 1332 spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, "
1117 - "smo size %llu, segments %lu", txg,
1118 - (msp->ms_map->sm_start / msp->ms_map->sm_size), msp,
1119 - smo->smo_objsize, avl_numnodes(&sm->sm_root));
1333 + "smp size %llu, segments %lu", txg, msp->ms_id, msp,
1334 + space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root));
1120 1335
1121 1336 /*
1122 - * Create an map that is a 100% allocated map. We remove segments
1337 + * Create a range tree that is 100% allocated. We remove segments
1123 1338 * that have been freed in this txg, any deferred frees that exist,
1124 1339 * and any allocation in the future. Removing segments should be
1125 - * a relatively inexpensive operation since we expect these maps to
1126 - * a small number of nodes.
1340 + * a relatively inexpensive operation since we expect these trees to
1341 + * have a small number of nodes.
1127 1342 */
1128 - space_map_create(&condense_map, sm->sm_start, sm->sm_size,
1129 - sm->sm_shift, sm->sm_lock);
1130 - space_map_add(&condense_map, condense_map.sm_start,
1131 - condense_map.sm_size);
1343 + condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock);
1344 + range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
1132 1345
1133 1346 /*
1134 - * Remove what's been freed in this txg from the condense_map.
1347 + * Remove what's been freed in this txg from the condense_tree.
1135 1348 * Since we're in sync_pass 1, we know that all the frees from
1136 - * this txg are in the freemap.
1349 + * this txg are in the freetree.
1137 1350 */
1138 - space_map_walk(freemap, space_map_remove, &condense_map);
1351 + range_tree_walk(freetree, range_tree_remove, condense_tree);
1139 1352
1140 - for (int t = 0; t < TXG_DEFER_SIZE; t++)
1141 - space_map_walk(msp->ms_defermap[t],
1142 - space_map_remove, &condense_map);
1353 + for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1354 + range_tree_walk(msp->ms_defertree[t],
1355 + range_tree_remove, condense_tree);
1356 + }
1143 1357
1144 - for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
1145 - space_map_walk(msp->ms_allocmap[(txg + t) & TXG_MASK],
1146 - space_map_remove, &condense_map);
1358 + for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
1359 + range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
1360 + range_tree_remove, condense_tree);
1361 + }
1147 1362
1148 1363 /*
1149 1364 * We're about to drop the metaslab's lock thus allowing
1150 1365 * other consumers to change its content. Set the
1151 - * space_map's sm_condensing flag to ensure that
1366 + * metaslab's ms_condensing flag to ensure that
1152 1367 * allocations on this metaslab do not occur while we're
1153 1368 * in the middle of committing it to disk. This is only critical
1154 - * for the ms_map as all other space_maps use per txg
1369 + * for the ms_tree as all other range trees use per txg
1155 1370 * views of their content.
1156 1371 */
1157 - sm->sm_condensing = B_TRUE;
1372 + msp->ms_condensing = B_TRUE;
1158 1373
1159 1374 mutex_exit(&msp->ms_lock);
1160 - space_map_truncate(smo, mos, tx);
1375 + space_map_truncate(sm, tx);
1161 1376 mutex_enter(&msp->ms_lock);
1162 1377
1163 1378 /*
1164 1379 * While we would ideally like to create a space_map representation
1165 1380 * that consists only of allocation records, doing so can be
1166 - * prohibitively expensive because the in-core free map can be
1381 + * prohibitively expensive because the in-core free tree can be
1167 1382 * large, and therefore computationally expensive to subtract
1168 - * from the condense_map. Instead we sync out two maps, a cheap
1169 - * allocation only map followed by the in-core free map. While not
1383 + * from the condense_tree. Instead we sync out two trees, a cheap
1384 + * allocation only tree followed by the in-core free tree. While not
1170 1385 * optimal, this is typically close to optimal, and much cheaper to
1171 1386 * compute.
1172 1387 */
1173 - space_map_sync(&condense_map, SM_ALLOC, smo, mos, tx);
1174 - space_map_vacate(&condense_map, NULL, NULL);
1175 - space_map_destroy(&condense_map);
1388 + space_map_write(sm, condense_tree, SM_ALLOC, tx);
1389 + range_tree_vacate(condense_tree, NULL, NULL);
1390 + range_tree_destroy(condense_tree);
1176 1391
1177 - space_map_sync(sm, SM_FREE, smo, mos, tx);
1178 - sm->sm_condensing = B_FALSE;
1179 -
1180 - spa_dbgmsg(spa, "condensed: txg %llu, msp[%llu] %p, "
1181 - "smo size %llu", txg,
1182 - (msp->ms_map->sm_start / msp->ms_map->sm_size), msp,
1183 - smo->smo_objsize);
1392 + space_map_write(sm, msp->ms_tree, SM_FREE, tx);
1393 + msp->ms_condensing = B_FALSE;
1184 1394 }
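A toy model of the set algebra described in the comments above, not part of this webrev: start from a fully allocated view, subtract this txg's frees, the deferred frees, and future txgs' allocations, and what remains is what gets written as ALLOC records (followed by the free tree as FREE records). A bitmap stands in here for the range trees.

#include <stdio.h>
#include <stdint.h>

#define	MS_SIZE	16	/* toy metaslab of 16 blocks */

static void
clear_bits(uint8_t *map, const uint8_t *remove)
{
	for (int i = 0; i < MS_SIZE; i++)
		if (remove[i])
			map[i] = 0;
}

int
main(void)
{
	uint8_t condense[MS_SIZE];
	uint8_t freed_this_txg[MS_SIZE] = { 0 };
	uint8_t deferred[MS_SIZE] = { 0 };
	uint8_t future_alloc[MS_SIZE] = { 0 };

	for (int i = 0; i < MS_SIZE; i++)
		condense[i] = 1;			/* 100% allocated */

	freed_this_txg[2] = freed_this_txg[3] = 1;	/* freetree */
	deferred[7] = 1;				/* defertrees */
	future_alloc[12] = 1;				/* future alloctrees */

	clear_bits(condense, freed_this_txg);
	clear_bits(condense, deferred);
	clear_bits(condense, future_alloc);

	printf("blocks written as ALLOC records:");
	for (int i = 0; i < MS_SIZE; i++)
		if (condense[i])
			printf(" %d", i);
	printf("\n");
	return (0);
}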
1185 1395
1186 1396 /*
1187 1397 * Write a metaslab to disk in the context of the specified transaction group.
1188 1398 */
1189 1399 void
1190 1400 metaslab_sync(metaslab_t *msp, uint64_t txg)
1191 1401 {
1192 - vdev_t *vd = msp->ms_group->mg_vd;
1402 + metaslab_group_t *mg = msp->ms_group;
1403 + vdev_t *vd = mg->mg_vd;
1193 1404 spa_t *spa = vd->vdev_spa;
1194 1405 objset_t *mos = spa_meta_objset(spa);
1195 - space_map_t *allocmap = msp->ms_allocmap[txg & TXG_MASK];
1196 - space_map_t **freemap = &msp->ms_freemap[txg & TXG_MASK];
1197 - space_map_t **freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
1198 - space_map_t *sm = msp->ms_map;
1199 - space_map_obj_t *smo = &msp->ms_smo_syncing;
1200 - dmu_buf_t *db;
1406 + range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK];
1407 + range_tree_t **freetree = &msp->ms_freetree[txg & TXG_MASK];
1408 + range_tree_t **freed_tree =
1409 + &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
1201 1410 dmu_tx_t *tx;
1411 + uint64_t object = space_map_object(msp->ms_sm);
1202 1412
1203 1413 ASSERT(!vd->vdev_ishole);
1204 1414
1205 1415 /*
1206 1416 * This metaslab has just been added so there's no work to do now.
1207 1417 */
1208 - if (*freemap == NULL) {
1209 - ASSERT3P(allocmap, ==, NULL);
1418 + if (*freetree == NULL) {
1419 + ASSERT3P(alloctree, ==, NULL);
1210 1420 return;
1211 1421 }
1212 1422
1213 - ASSERT3P(allocmap, !=, NULL);
1214 - ASSERT3P(*freemap, !=, NULL);
1215 - ASSERT3P(*freed_map, !=, NULL);
1423 + ASSERT3P(alloctree, !=, NULL);
1424 + ASSERT3P(*freetree, !=, NULL);
1425 + ASSERT3P(*freed_tree, !=, NULL);
1216 1426
1217 - if (allocmap->sm_space == 0 && (*freemap)->sm_space == 0)
1427 + if (range_tree_space(alloctree) == 0 &&
1428 + range_tree_space(*freetree) == 0)
1218 1429 return;
1219 1430
1220 1431 /*
1221 1432 * The only state that can actually be changing concurrently with
1222 - * metaslab_sync() is the metaslab's ms_map. No other thread can
1223 - * be modifying this txg's allocmap, freemap, freed_map, or smo.
1224 - * Therefore, we only hold ms_lock to satify space_map ASSERTs.
1225 - * We drop it whenever we call into the DMU, because the DMU
1226 - * can call down to us (e.g. via zio_free()) at any time.
1433 + * metaslab_sync() is the metaslab's ms_tree. No other thread can
1434 + * be modifying this txg's alloctree, freetree, freed_tree, or
1435 + * space_map_phys_t. Therefore, we only hold ms_lock to satisfy
1436 + * space_map ASSERTs. We drop it whenever we call into the DMU,
1437 + * because the DMU can call down to us (e.g. via zio_free()) at
1438 + * any time.
1227 1439 */
1228 1440
1229 1441 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
1230 1442
1231 - if (smo->smo_object == 0) {
1232 - ASSERT(smo->smo_objsize == 0);
1233 - ASSERT(smo->smo_alloc == 0);
1234 - smo->smo_object = dmu_object_alloc(mos,
1235 - DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
1236 - DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
1237 - ASSERT(smo->smo_object != 0);
1238 - dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
1239 - (sm->sm_start >> vd->vdev_ms_shift),
1240 - sizeof (uint64_t), &smo->smo_object, tx);
1443 + if (msp->ms_sm == NULL) {
1444 + uint64_t new_object;
1445 +
1446 + new_object = space_map_alloc(mos, tx);
1447 + VERIFY3U(new_object, !=, 0);
1448 +
1449 + VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
1450 + msp->ms_start, msp->ms_size, vd->vdev_ashift,
1451 + &msp->ms_lock));
1452 + ASSERT(msp->ms_sm != NULL);
1241 1453 }
1242 1454
1243 1455 mutex_enter(&msp->ms_lock);
1244 1456
1245 - if (sm->sm_loaded && spa_sync_pass(spa) == 1 &&
1457 + if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
1246 1458 metaslab_should_condense(msp)) {
1247 1459 metaslab_condense(msp, txg, tx);
1248 1460 } else {
1249 - space_map_sync(allocmap, SM_ALLOC, smo, mos, tx);
1250 - space_map_sync(*freemap, SM_FREE, smo, mos, tx);
1461 + space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
1462 + space_map_write(msp->ms_sm, *freetree, SM_FREE, tx);
1251 1463 }
1252 1464
1253 - space_map_vacate(allocmap, NULL, NULL);
1465 + range_tree_vacate(alloctree, NULL, NULL);
1254 1466
1467 + if (msp->ms_loaded) {
1468 + /*
1469 + * When the space map is loaded, we have an accurate
1470 + * histogram in the range tree. This gives us an opportunity
1471 + * to bring the space map's histogram up-to-date so we clear
1472 + * it first before updating it.
1473 + */
1474 + space_map_histogram_clear(msp->ms_sm);
1475 + space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
1476 + } else {
1477 + /*
1478 + * Since the space map is not loaded we simply update the
1479 + * existing histogram with what was freed in this txg. This
1480 + * means that the on-disk histogram may not have an accurate
1481 + * view of the free space but it's close enough to allow
1482 + * us to make allocation decisions.
1483 + */
1484 + space_map_histogram_add(msp->ms_sm, *freetree, tx);
1485 + }
1486 +
1255 1487 /*
1256 - * For sync pass 1, we avoid walking the entire space map and
1257 - * instead will just swap the pointers for freemap and
1258 - * freed_map. We can safely do this since the freed_map is
1488 + * For sync pass 1, we avoid traversing this txg's free range tree
1489 + * and instead will just swap the pointers for freetree and
1490 + * freed_tree. We can safely do this since the freed_tree is
1259 1491 * guaranteed to be empty on the initial pass.
1260 1492 */
1261 1493 if (spa_sync_pass(spa) == 1) {
1262 - ASSERT0((*freed_map)->sm_space);
1263 - ASSERT0(avl_numnodes(&(*freed_map)->sm_root));
1264 - space_map_swap(freemap, freed_map);
1494 + range_tree_swap(freetree, freed_tree);
1265 1495 } else {
1266 - space_map_vacate(*freemap, space_map_add, *freed_map);
1496 + range_tree_vacate(*freetree, range_tree_add, *freed_tree);
1267 1497 }
1268 1498
1269 - ASSERT0(msp->ms_allocmap[txg & TXG_MASK]->sm_space);
1270 - ASSERT0(msp->ms_freemap[txg & TXG_MASK]->sm_space);
1499 + ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
1500 + ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
1271 1501
1272 1502 mutex_exit(&msp->ms_lock);
1273 1503
1274 - VERIFY0(dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
1275 - dmu_buf_will_dirty(db, tx);
1276 - ASSERT3U(db->db_size, >=, sizeof (*smo));
1277 - bcopy(smo, db->db_data, sizeof (*smo));
1278 - dmu_buf_rele(db, FTAG);
1279 -
1504 + if (object != space_map_object(msp->ms_sm)) {
1505 + object = space_map_object(msp->ms_sm);
1506 + dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
1507 + msp->ms_id, sizeof (uint64_t), &object, tx);
1508 + }
1280 1509 dmu_tx_commit(tx);
1281 1510 }
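A small sketch, not part of this webrev, of the two histogram paths in metaslab_sync() above: when the metaslab is loaded, the on-disk histogram is cleared and rebuilt from the accurate in-core free tree; when it is not loaded, only this txg's frees are folded in. The power-of-two bucketing below is a simplification, not the real space map histogram layout.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define	HIST_BUCKETS	32

static int
bucket_of(uint64_t size)
{
	int b = 0;

	while (size >>= 1)
		b++;
	return (b < HIST_BUCKETS ? b : HIST_BUCKETS - 1);
}

static void
histogram_add(uint64_t *hist, const uint64_t *segs, int nsegs)
{
	for (int i = 0; i < nsegs; i++)
		hist[bucket_of(segs[i])]++;
}

int
main(void)
{
	uint64_t hist[HIST_BUCKETS] = { 0 };
	uint64_t free_tree[] = { 4096, 8192, 131072 };	/* loaded view */
	uint64_t freed_this_txg[] = { 8192 };
	int loaded = 1;

	if (loaded) {
		/* clear, then rebuild from the accurate in-core free tree */
		memset(hist, 0, sizeof (hist));
		histogram_add(hist, free_tree, 3);
	} else {
		/* fold in only what was freed in this txg */
		histogram_add(hist, freed_this_txg, 1);
	}

	printf("segments of ~128K: %llu\n",
	    (unsigned long long)hist[bucket_of(131072)]);
	return (0);
}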
1282 1511
1283 1512 /*
1284 1513 * Called after a transaction group has completely synced to mark
1285 1514 * all of the metaslab's free space as usable.
1286 1515 */
1287 1516 void
1288 1517 metaslab_sync_done(metaslab_t *msp, uint64_t txg)
1289 1518 {
1290 - space_map_obj_t *smo = &msp->ms_smo;
1291 - space_map_obj_t *smosync = &msp->ms_smo_syncing;
1292 - space_map_t *sm = msp->ms_map;
1293 - space_map_t **freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
1294 - space_map_t **defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE];
1295 1519 metaslab_group_t *mg = msp->ms_group;
1296 1520 vdev_t *vd = mg->mg_vd;
1521 + range_tree_t **freed_tree;
1522 + range_tree_t **defer_tree;
1297 1523 int64_t alloc_delta, defer_delta;
1298 1524
1299 1525 ASSERT(!vd->vdev_ishole);
1300 1526
1301 1527 mutex_enter(&msp->ms_lock);
1302 1528
1303 1529 /*
1304 1530 * If this metaslab is just becoming available, initialize its
1305 - * allocmaps, freemaps, and defermap and add its capacity to the vdev.
1531 + * alloctrees, freetrees, and defertree and add its capacity to
1532 + * the vdev.
1306 1533 */
1307 - if (*freed_map == NULL) {
1308 - ASSERT(*defer_map == NULL);
1534 + if (msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK] == NULL) {
1309 1535 for (int t = 0; t < TXG_SIZE; t++) {
1310 - msp->ms_allocmap[t] = kmem_zalloc(sizeof (space_map_t),
1311 - KM_SLEEP);
1312 - space_map_create(msp->ms_allocmap[t], sm->sm_start,
1313 - sm->sm_size, sm->sm_shift, sm->sm_lock);
1314 - msp->ms_freemap[t] = kmem_zalloc(sizeof (space_map_t),
1315 - KM_SLEEP);
1316 - space_map_create(msp->ms_freemap[t], sm->sm_start,
1317 - sm->sm_size, sm->sm_shift, sm->sm_lock);
1536 + ASSERT(msp->ms_alloctree[t] == NULL);
1537 + ASSERT(msp->ms_freetree[t] == NULL);
1538 +
1539 + msp->ms_alloctree[t] = range_tree_create(NULL, msp,
1540 + &msp->ms_lock);
1541 + msp->ms_freetree[t] = range_tree_create(NULL, msp,
1542 + &msp->ms_lock);
1318 1543 }
1319 1544
1320 1545 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1321 - msp->ms_defermap[t] = kmem_zalloc(sizeof (space_map_t),
1322 - KM_SLEEP);
1323 - space_map_create(msp->ms_defermap[t], sm->sm_start,
1324 - sm->sm_size, sm->sm_shift, sm->sm_lock);
1546 + ASSERT(msp->ms_defertree[t] == NULL);
1547 +
1548 + msp->ms_defertree[t] = range_tree_create(NULL, msp,
1549 + &msp->ms_lock);
1325 1550 }
1326 1551
1327 - freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
1328 - defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE];
1329 -
1330 - vdev_space_update(vd, 0, 0, sm->sm_size);
1552 + vdev_space_update(vd, 0, 0, msp->ms_size);
1331 1553 }
1332 1554
1333 - alloc_delta = smosync->smo_alloc - smo->smo_alloc;
1334 - defer_delta = (*freed_map)->sm_space - (*defer_map)->sm_space;
1555 + freed_tree = &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
1556 + defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE];
1335 1557
1558 + alloc_delta = space_map_alloc_delta(msp->ms_sm);
1559 + defer_delta = range_tree_space(*freed_tree) -
1560 + range_tree_space(*defer_tree);
1561 +
1336 1562 vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
1337 1563
1338 - ASSERT(msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0);
1339 - ASSERT(msp->ms_freemap[txg & TXG_MASK]->sm_space == 0);
1564 + ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
1565 + ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
1340 1566
1341 1567 /*
1342 - * If there's a space_map_load() in progress, wait for it to complete
1568 + * If there's a metaslab_load() in progress, wait for it to complete
1343 1569 * so that we have a consistent view of the in-core space map.
1344 1570 */
1345 - space_map_load_wait(sm);
1571 + metaslab_load_wait(msp);
1346 1572
1347 1573 /*
1348 - * Move the frees from the defer_map to this map (if it's loaded).
1349 - * Swap the freed_map and the defer_map -- this is safe to do
1350 - * because we've just emptied out the defer_map.
1574 + * Move the frees from the defer_tree back to the free
1575 + * range tree (if it's loaded). Swap the freed_tree and the
1576 + * defer_tree -- this is safe to do because we've just emptied out
1577 + * the defer_tree.
1351 1578 */
1352 - space_map_vacate(*defer_map, sm->sm_loaded ? space_map_free : NULL, sm);
1353 - ASSERT0((*defer_map)->sm_space);
1354 - ASSERT0(avl_numnodes(&(*defer_map)->sm_root));
1355 - space_map_swap(freed_map, defer_map);
1579 + range_tree_vacate(*defer_tree,
1580 + msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
1581 + range_tree_swap(freed_tree, defer_tree);
1356 1582
1357 - *smo = *smosync;
1583 + space_map_update(msp->ms_sm);
1358 1584
1359 1585 msp->ms_deferspace += defer_delta;
1360 1586 ASSERT3S(msp->ms_deferspace, >=, 0);
1361 - ASSERT3S(msp->ms_deferspace, <=, sm->sm_size);
1587 + ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
1362 1588 if (msp->ms_deferspace != 0) {
1363 1589 /*
1364 1590 * Keep syncing this metaslab until all deferred frees
1365 1591 * are back in circulation.
1366 1592 */
1367 1593 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
1368 1594 }
1369 1595
1370 - /*
1371 - * If the map is loaded but no longer active, evict it as soon as all
1372 - * future allocations have synced. (If we unloaded it now and then
1373 - * loaded a moment later, the map wouldn't reflect those allocations.)
1374 - */
1375 - if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
1376 - int evictable = 1;
1596 + if (msp->ms_loaded && msp->ms_access_txg < txg) {
1597 + for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
1598 + VERIFY0(range_tree_space(
1599 + msp->ms_alloctree[(txg + t) & TXG_MASK]));
1600 + }
1377 1601
1378 - for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
1379 - if (msp->ms_allocmap[(txg + t) & TXG_MASK]->sm_space)
1380 - evictable = 0;
1381 -
1382 - if (evictable && !metaslab_debug)
1383 - space_map_unload(sm);
1602 + if (!metaslab_debug_unload)
1603 + metaslab_unload(msp);
1384 1604 }
1385 1605
1386 1606 metaslab_group_sort(mg, msp, metaslab_weight(msp));
1387 -
1388 1607 mutex_exit(&msp->ms_lock);
1608 +
1389 1609 }
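A sketch, not part of this webrev, of how the defer slots cycle. Each txg parks its frees in defertree[txg % TXG_DEFER_SIZE]; because that slot index repeats every TXG_DEFER_SIZE txgs, what gets vacated back into the free tree in metaslab_sync_done() is exactly what was freed TXG_DEFER_SIZE txgs earlier. The constant below restates the ZFS value as an assumption of this sketch.

#include <stdio.h>
#include <stdint.h>

#define	TXG_DEFER_SIZE	2

int
main(void)
{
	for (uint64_t txg = 100; txg < 106; txg++) {
		int slot = (int)(txg % TXG_DEFER_SIZE);

		printf("txg %llu: park frees in defer slot %d; "
		    "frees parked there in txg %llu return to the free tree\n",
		    (unsigned long long)txg, slot,
		    (unsigned long long)(txg - TXG_DEFER_SIZE));
	}
	return (0);
}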
1390 1610
1391 1611 void
1392 1612 metaslab_sync_reassess(metaslab_group_t *mg)
1393 1613 {
1394 - vdev_t *vd = mg->mg_vd;
1395 1614 int64_t failures = mg->mg_alloc_failures;
1396 1615
1397 1616 metaslab_group_alloc_update(mg);
1398 -
1399 - /*
1400 - * Re-evaluate all metaslabs which have lower offsets than the
1401 - * bonus area.
1402 - */
1403 - for (int m = 0; m < vd->vdev_ms_count; m++) {
1404 - metaslab_t *msp = vd->vdev_ms[m];
1405 -
1406 - if (msp->ms_map->sm_start > mg->mg_bonus_area)
1407 - break;
1408 -
1409 - mutex_enter(&msp->ms_lock);
1410 - metaslab_group_sort(mg, msp, metaslab_weight(msp));
1411 - mutex_exit(&msp->ms_lock);
1412 - }
1413 -
1414 1617 atomic_add_64(&mg->mg_alloc_failures, -failures);
1415 1618
1416 1619 /*
1417 - * Prefetch the next potential metaslabs
1620 + * Preload the next potential metaslabs
1418 1621 */
1419 - metaslab_prefetch(mg);
1622 + metaslab_group_preload(mg);
1420 1623 }
1421 1624
1422 1625 static uint64_t
1423 1626 metaslab_distance(metaslab_t *msp, dva_t *dva)
1424 1627 {
1425 1628 uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
1426 1629 uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
1427 - uint64_t start = msp->ms_map->sm_start >> ms_shift;
1630 + uint64_t start = msp->ms_id;
1428 1631
1429 1632 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
1430 1633 return (1ULL << 63);
1431 1634
1432 1635 if (offset < start)
1433 1636 return ((start - offset) << ms_shift);
1434 1637 if (offset > start)
1435 1638 return ((offset - start) << ms_shift);
1436 1639 return (0);
1437 1640 }
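metaslab_distance() above reduced to its arithmetic, as a standalone sketch that is not part of this webrev: the distance is the byte separation between the candidate metaslab and an existing DVA on the same vdev, and a DVA on a different vdev counts as infinitely far away.

#include <stdio.h>
#include <stdint.h>

static uint64_t
distance(uint64_t ms_id, uint64_t ms_shift, uint64_t dva_vdev,
    uint64_t dva_offset, uint64_t my_vdev)
{
	uint64_t start = ms_id;
	uint64_t offset = dva_offset >> ms_shift;

	if (dva_vdev != my_vdev)
		return (1ULL << 63);
	if (offset < start)
		return ((start - offset) << ms_shift);
	if (offset > start)
		return ((offset - start) << ms_shift);
	return (0);
}

int
main(void)
{
	/* 1 GB metaslabs (ms_shift = 30); DVA in metaslab 3, candidate is 7 */
	printf("distance = %llu bytes\n", (unsigned long long)
	    distance(7, 30, 0, 3ULL << 30, 0));	/* 4 GB apart */
	return (0);
}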
1438 1641
1439 1642 static uint64_t
1440 1643 metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
1441 1644 uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags)
1442 1645 {
1443 1646 spa_t *spa = mg->mg_vd->vdev_spa;
1444 1647 metaslab_t *msp = NULL;
1445 1648 uint64_t offset = -1ULL;
1446 1649 avl_tree_t *t = &mg->mg_metaslab_tree;
1447 1650 uint64_t activation_weight;
1448 1651 uint64_t target_distance;
1449 1652 int i;
1450 1653
1451 1654 activation_weight = METASLAB_WEIGHT_PRIMARY;
1452 1655 for (i = 0; i < d; i++) {
1453 1656 if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
1454 1657 activation_weight = METASLAB_WEIGHT_SECONDARY;
1455 1658 break;
1456 1659 }
1457 1660 }
1458 1661
1459 1662 for (;;) {
1460 1663 boolean_t was_active;
1461 1664
1462 1665 mutex_enter(&mg->mg_lock);
1463 1666 for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
1464 1667 if (msp->ms_weight < asize) {
1465 1668 spa_dbgmsg(spa, "%s: failed to meet weight "
1466 1669 "requirement: vdev %llu, txg %llu, mg %p, "
1467 1670 "msp %p, psize %llu, asize %llu, "
1468 1671 "failures %llu, weight %llu",
1469 1672 spa_name(spa), mg->mg_vd->vdev_id, txg,
1470 1673 mg, msp, psize, asize,
1471 1674 mg->mg_alloc_failures, msp->ms_weight);
1472 1675 mutex_exit(&mg->mg_lock);
1473 1676 return (-1ULL);
1474 1677 }
1475 1678
1476 1679 /*
1477 1680 * If the selected metaslab is condensing, skip it.
1478 1681 */
1479 - if (msp->ms_map->sm_condensing)
1682 + if (msp->ms_condensing)
1480 1683 continue;
1481 1684
1482 1685 was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
1483 1686 if (activation_weight == METASLAB_WEIGHT_PRIMARY)
1484 1687 break;
1485 1688
1486 1689 target_distance = min_distance +
1487 - (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1);
1690 + (space_map_allocated(msp->ms_sm) != 0 ? 0 :
1691 + min_distance >> 1);
1488 1692
1489 1693 for (i = 0; i < d; i++)
1490 1694 if (metaslab_distance(msp, &dva[i]) <
1491 1695 target_distance)
1492 1696 break;
1493 1697 if (i == d)
1494 1698 break;
1495 1699 }
1496 1700 mutex_exit(&mg->mg_lock);
1497 1701 if (msp == NULL)
1498 1702 return (-1ULL);
1499 1703
1500 1704 mutex_enter(&msp->ms_lock);
1501 1705
1502 1706 /*
1503 1707 * If we've already reached the allowable number of failed
1504 1708 * allocation attempts on this metaslab group then we
1505 1709 * consider skipping it. We skip it only if we're allowed
1506 1710 * to "fast" gang, the physical size is larger than
1507 1711 * a gang block, and we're attempting to allocate from
1508 1712 * the primary metaslab.
1509 1713 */
1510 1714 if (mg->mg_alloc_failures > zfs_mg_alloc_failures &&
1511 1715 CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE &&
1512 1716 activation_weight == METASLAB_WEIGHT_PRIMARY) {
1513 1717 spa_dbgmsg(spa, "%s: skipping metaslab group: "
1514 - "vdev %llu, txg %llu, mg %p, psize %llu, "
1515 - "asize %llu, failures %llu", spa_name(spa),
1516 - mg->mg_vd->vdev_id, txg, mg, psize, asize,
1718 + "vdev %llu, txg %llu, mg %p, msp[%llu] %p, "
1719 + "psize %llu, asize %llu, failures %llu",
1720 + spa_name(spa), mg->mg_vd->vdev_id, txg, mg,
1721 + msp->ms_id, msp, psize, asize,
1517 1722 mg->mg_alloc_failures);
1518 1723 mutex_exit(&msp->ms_lock);
1519 1724 return (-1ULL);
1520 1725 }
1521 1726
1522 1727 /*
1523 1728 * Ensure that the metaslab we have selected is still
1524 1729 * capable of handling our request. It's possible that
1525 1730 * another thread may have changed the weight while we
1526 1731 * were blocked on the metaslab lock.
1527 1732 */
1528 1733 if (msp->ms_weight < asize || (was_active &&
1529 1734 !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
1530 1735 activation_weight == METASLAB_WEIGHT_PRIMARY)) {
1531 1736 mutex_exit(&msp->ms_lock);
1532 1737 continue;
1533 1738 }
1534 1739
1535 1740 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
1536 1741 activation_weight == METASLAB_WEIGHT_PRIMARY) {
1537 1742 metaslab_passivate(msp,
1538 1743 msp->ms_weight & ~METASLAB_ACTIVE_MASK);
1539 1744 mutex_exit(&msp->ms_lock);
1540 1745 continue;
1541 1746 }
1542 1747
1543 1748 if (metaslab_activate(msp, activation_weight) != 0) {
1544 1749 mutex_exit(&msp->ms_lock);
1545 1750 continue;
1546 1751 }
1547 1752
1548 1753 /*
1549 1754 * If this metaslab is currently condensing then pick again as
1550 1755 * we can't manipulate this metaslab until it's committed
1551 1756 * to disk.
1552 1757 */
1553 - if (msp->ms_map->sm_condensing) {
1758 + if (msp->ms_condensing) {
1554 1759 mutex_exit(&msp->ms_lock);
1555 1760 continue;
1556 1761 }
1557 1762
1558 - if ((offset = space_map_alloc(msp->ms_map, asize)) != -1ULL)
1763 + if ((offset = metaslab_block_alloc(msp, asize)) != -1ULL)
1559 1764 break;
1560 1765
1561 1766 atomic_inc_64(&mg->mg_alloc_failures);
1562 1767
1563 - metaslab_passivate(msp, space_map_maxsize(msp->ms_map));
1564 -
1768 + metaslab_passivate(msp, metaslab_block_maxsize(msp));
1565 1769 mutex_exit(&msp->ms_lock);
1566 1770 }
1567 1771
1568 - if (msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0)
1772 + if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
1569 1773 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
1570 1774
1571 - space_map_add(msp->ms_allocmap[txg & TXG_MASK], offset, asize);
1775 + range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, asize);
1776 + msp->ms_access_txg = txg + metaslab_unload_delay;
1572 1777
1573 1778 mutex_exit(&msp->ms_lock);
1574 1779
1575 1780 return (offset);
1576 1781 }
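A sketch, not part of this webrev, of the activation-weight choice at the top of metaslab_group_alloc(): if any DVA already placed for this block lives on the same vdev, the metaslab is activated as SECONDARY so further copies are pushed apart, otherwise as PRIMARY. The WEIGHT_* macros below are local stand-ins for the metaslab weight flags.

#include <stdio.h>
#include <stdint.h>

#define	WEIGHT_PRIMARY		(1ULL << 63)
#define	WEIGHT_SECONDARY	(1ULL << 62)

static uint64_t
activation_weight(const uint64_t *dva_vdevs, int d, uint64_t my_vdev)
{
	for (int i = 0; i < d; i++) {
		if (dva_vdevs[i] == my_vdev)
			return (WEIGHT_SECONDARY);
	}
	return (WEIGHT_PRIMARY);
}

int
main(void)
{
	uint64_t prior_dvas[] = { 2, 5 };	/* vdevs of copies placed so far */

	printf("on vdev 5: %s\n",
	    activation_weight(prior_dvas, 2, 5) == WEIGHT_PRIMARY ?
	    "primary" : "secondary");
	printf("on vdev 9: %s\n",
	    activation_weight(prior_dvas, 2, 9) == WEIGHT_PRIMARY ?
	    "primary" : "secondary");
	return (0);
}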
1577 1782
1578 1783 /*
1579 1784 * Allocate a block for the specified i/o.
1580 1785 */
1581 1786 static int
1582 1787 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
1583 1788 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
1584 1789 {
1585 1790 metaslab_group_t *mg, *rotor;
1586 1791 vdev_t *vd;
1587 1792 int dshift = 3;
1588 1793 int all_zero;
1589 1794 int zio_lock = B_FALSE;
1590 1795 boolean_t allocatable;
1591 1796 uint64_t offset = -1ULL;
1592 1797 uint64_t asize;
1593 1798 uint64_t distance;
1594 1799
1595 1800 ASSERT(!DVA_IS_VALID(&dva[d]));
1596 1801
1597 1802 /*
1598 1803 * For testing, make some blocks above a certain size be gang blocks.
1599 1804 */
1600 1805 if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0)
1601 1806 return (SET_ERROR(ENOSPC));
1602 1807
1603 1808 /*
1604 1809 * Start at the rotor and loop through all mgs until we find something.
1605 1810 * Note that there's no locking on mc_rotor or mc_aliquot because
1606 1811 * nothing actually breaks if we miss a few updates -- we just won't
1607 1812 * allocate quite as evenly. It all balances out over time.
1608 1813 *
1609 1814 * If we are doing ditto or log blocks, try to spread them across
1610 1815 * consecutive vdevs. If we're forced to reuse a vdev before we've
1611 1816 * allocated all of our ditto blocks, then try and spread them out on
1612 1817 * that vdev as much as possible. If it turns out to not be possible,
1613 1818 * gradually lower our standards until anything becomes acceptable.
1614 1819 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
1615 1820 * gives us hope of containing our fault domains to something we're
1616 1821 * able to reason about. Otherwise, any two top-level vdev failures
1617 1822 * will guarantee the loss of data. With consecutive allocation,
1618 1823 * only two adjacent top-level vdev failures will result in data loss.
1619 1824 *
1620 1825 * If we are doing gang blocks (hintdva is non-NULL), try to keep
1621 1826 * ourselves on the same vdev as our gang block header. That
1622 1827 * way, we can hope for locality in vdev_cache, plus it makes our
1623 1828 * fault domains something tractable.
1624 1829 */
1625 1830 if (hintdva) {
1626 1831 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
1627 1832
1628 1833 /*
1629 1834 * It's possible the vdev we're using as the hint no
1630 1835 * longer exists (i.e. removed). Consult the rotor when
1631 1836 * all else fails.
1632 1837 */
1633 1838 if (vd != NULL) {
1634 1839 mg = vd->vdev_mg;
1635 1840
1636 1841 if (flags & METASLAB_HINTBP_AVOID &&
1637 1842 mg->mg_next != NULL)
1638 1843 mg = mg->mg_next;
1639 1844 } else {
1640 1845 mg = mc->mc_rotor;
1641 1846 }
1642 1847 } else if (d != 0) {
1643 1848 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
1644 1849 mg = vd->vdev_mg->mg_next;
1645 1850 } else {
1646 1851 mg = mc->mc_rotor;
1647 1852 }
1648 1853
1649 1854 /*
1650 1855 * If the hint put us into the wrong metaslab class, or into a
1651 1856 * metaslab group that has been passivated, just follow the rotor.
1652 1857 */
1653 1858 if (mg->mg_class != mc || mg->mg_activation_count <= 0)
1654 1859 mg = mc->mc_rotor;
1655 1860
1656 1861 rotor = mg;
1657 1862 top:
1658 1863 all_zero = B_TRUE;
1659 1864 do {
1660 1865 ASSERT(mg->mg_activation_count == 1);
1661 1866
1662 1867 vd = mg->mg_vd;
1663 1868
1664 1869 /*
1665 1870 * Don't allocate from faulted devices.
1666 1871 */
1667 1872 if (zio_lock) {
1668 1873 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
1669 1874 allocatable = vdev_allocatable(vd);
1670 1875 spa_config_exit(spa, SCL_ZIO, FTAG);
1671 1876 } else {
1672 1877 allocatable = vdev_allocatable(vd);
1673 1878 }
1674 1879
1675 1880 /*
1676 1881 * Determine if the selected metaslab group is eligible
1677 1882 * for allocations. If we're ganging or have requested
1678 1883 * an allocation for the smallest gang block size
1679 1884 * then we don't want to avoid allocating to this
1680 1885 * metaslab group. If we're in this condition we should
1681 1886 * try to allocate from any device possible so that we
1682 1887 * don't inadvertently return ENOSPC and suspend the pool
1683 1888 * even though space is still available.
1684 1889 */
1685 1890 if (allocatable && CAN_FASTGANG(flags) &&
1686 1891 psize > SPA_GANGBLOCKSIZE)
1687 1892 allocatable = metaslab_group_allocatable(mg);
1688 1893
1689 1894 if (!allocatable)
1690 1895 goto next;
1691 1896
1692 1897 /*
1693 1898 * Avoid writing single-copy data to a failing vdev
1694 1899 * unless the user instructs us that it is okay.
1695 1900 */
1696 1901 if ((vd->vdev_stat.vs_write_errors > 0 ||
1697 1902 vd->vdev_state < VDEV_STATE_HEALTHY) &&
1698 1903 d == 0 && dshift == 3 &&
1699 1904 !(zfs_write_to_degraded && vd->vdev_state ==
1700 1905 VDEV_STATE_DEGRADED)) {
1701 1906 all_zero = B_FALSE;
1702 1907 goto next;
1703 1908 }
1704 1909
1705 1910 ASSERT(mg->mg_class == mc);
1706 1911
1707 1912 distance = vd->vdev_asize >> dshift;
1708 1913 if (distance <= (1ULL << vd->vdev_ms_shift))
1709 1914 distance = 0;
1710 1915 else
1711 1916 all_zero = B_FALSE;
1712 1917
1713 1918 asize = vdev_psize_to_asize(vd, psize);
1714 1919 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
1715 1920
1716 1921 offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
1717 1922 dva, d, flags);
1718 1923 if (offset != -1ULL) {
1719 1924 /*
1720 1925 * If we've just selected this metaslab group,
1721 1926 * figure out whether the corresponding vdev is
1722 1927 * over- or under-used relative to the pool,
1723 1928 * and set an allocation bias to even it out.
1724 1929 */
1725 1930 if (mc->mc_aliquot == 0) {
1726 1931 vdev_stat_t *vs = &vd->vdev_stat;
1727 1932 int64_t vu, cu;
1728 1933
1729 1934 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
1730 1935 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
1731 1936
1732 1937 /*
1733 1938 * Calculate how much more or less we should
1734 1939 * try to allocate from this device during
1735 1940 * this iteration around the rotor.
1736 1941 * For example, if a device is 80% full
1737 1942 * and the pool is 20% full then we should
1738 1943 * reduce allocations by 60% on this device.
1739 1944 *
1740 1945 * mg_bias = (20 - 80) * 512K / 100 = -307K
1741 1946 *
1742 1947 * This reduces allocations by 307K for this
1743 1948 * iteration.
1744 1949 */
1745 1950 mg->mg_bias = ((cu - vu) *
1746 1951 (int64_t)mg->mg_aliquot) / 100;
1747 1952 }
1748 1953
1749 1954 if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
1750 1955 mg->mg_aliquot + mg->mg_bias) {
1751 1956 mc->mc_rotor = mg->mg_next;
1752 1957 mc->mc_aliquot = 0;
1753 1958 }
1754 1959
1755 1960 DVA_SET_VDEV(&dva[d], vd->vdev_id);
1756 1961 DVA_SET_OFFSET(&dva[d], offset);
1757 1962 DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
1758 1963 DVA_SET_ASIZE(&dva[d], asize);
1759 1964
1760 1965 return (0);
1761 1966 }
1762 1967 next:
1763 1968 mc->mc_rotor = mg->mg_next;
1764 1969 mc->mc_aliquot = 0;
1765 1970 } while ((mg = mg->mg_next) != rotor);
1766 1971
1767 1972 if (!all_zero) {
1768 1973 dshift++;
1769 1974 ASSERT(dshift < 64);
1770 1975 goto top;
1771 1976 }
1772 1977
1773 1978 if (!allocatable && !zio_lock) {
1774 1979 dshift = 3;
1775 1980 zio_lock = B_TRUE;
1776 1981 goto top;
1777 1982 }
1778 1983
1779 1984 bzero(&dva[d], sizeof (dva_t));
1780 1985
1781 1986 return (SET_ERROR(ENOSPC));
1782 1987 }
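The rotor-bias arithmetic from the comment in metaslab_alloc_dva() worked out as a standalone sketch (not part of this webrev), using the same example numbers: a device 80% full in a pool 20% full with a 512K aliquot has its share reduced by roughly 307K on that rotor pass.

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	int64_t vs_alloc = 80, vs_space = 100;	/* this vdev: 80% full */
	int64_t mc_alloc = 20, mc_space = 100;	/* whole pool: 20% full */
	int64_t mg_aliquot = 512LL << 10;	/* 512K per rotor pass */

	int64_t vu = (vs_alloc * 100) / (vs_space + 1);
	int64_t cu = (mc_alloc * 100) / (mc_space + 1);
	int64_t mg_bias = ((cu - vu) * mg_aliquot) / 100;

	/* roughly -307K: allocate that much less here this rotor pass */
	printf("mg_bias = %lld bytes\n", (long long)mg_bias);
	return (0);
}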
1783 1988
1784 1989 /*
1785 1990 * Free the block represented by DVA in the context of the specified
1786 1991 * transaction group.
1787 1992 */
1788 1993 static void
1789 1994 metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
1790 1995 {
1791 1996 uint64_t vdev = DVA_GET_VDEV(dva);
1792 1997 uint64_t offset = DVA_GET_OFFSET(dva);
1793 1998 uint64_t size = DVA_GET_ASIZE(dva);
1794 1999 vdev_t *vd;
1795 2000 metaslab_t *msp;
1796 2001
1797 2002 ASSERT(DVA_IS_VALID(dva));
1798 2003
1799 2004 if (txg > spa_freeze_txg(spa))
1800 2005 return;
1801 2006
1802 2007 if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
1803 2008 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
1804 2009 cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
1805 2010 (u_longlong_t)vdev, (u_longlong_t)offset);
1806 2011 ASSERT(0);
1807 2012 return;
1808 2013 }
1809 2014
1810 2015 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
1811 2016
1812 2017 if (DVA_GET_GANG(dva))
1813 2018 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
1814 2019
1815 2020 mutex_enter(&msp->ms_lock);
1816 2021
1817 2022 if (now) {
1818 - space_map_remove(msp->ms_allocmap[txg & TXG_MASK],
2023 + range_tree_remove(msp->ms_alloctree[txg & TXG_MASK],
1819 2024 offset, size);
1820 - space_map_free(msp->ms_map, offset, size);
2025 +
2026 + VERIFY(!msp->ms_condensing);
2027 + VERIFY3U(offset, >=, msp->ms_start);
2028 + VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
2029 + VERIFY3U(range_tree_space(msp->ms_tree) + size, <=,
2030 + msp->ms_size);
2031 + VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
2032 + VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
2033 + range_tree_add(msp->ms_tree, offset, size);
1821 2034 } else {
1822 - if (msp->ms_freemap[txg & TXG_MASK]->sm_space == 0)
2035 + if (range_tree_space(msp->ms_freetree[txg & TXG_MASK]) == 0)
1823 2036 vdev_dirty(vd, VDD_METASLAB, msp, txg);
1824 - space_map_add(msp->ms_freemap[txg & TXG_MASK], offset, size);
2037 + range_tree_add(msp->ms_freetree[txg & TXG_MASK],
2038 + offset, size);
1825 2039 }
1826 2040
1827 2041 mutex_exit(&msp->ms_lock);
1828 2042 }
1829 2043
1830 2044 /*
1831 2045 * Intent log support: upon opening the pool after a crash, notify the SPA
1832 2046 * of blocks that the intent log has allocated for immediate write, but
1833 2047 * which are still considered free by the SPA because the last transaction
1834 2048 * group didn't commit yet.
1835 2049 */
1836 2050 static int
1837 2051 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
1838 2052 {
1839 2053 uint64_t vdev = DVA_GET_VDEV(dva);
1840 2054 uint64_t offset = DVA_GET_OFFSET(dva);
1841 2055 uint64_t size = DVA_GET_ASIZE(dva);
1842 2056 vdev_t *vd;
1843 2057 metaslab_t *msp;
1844 2058 int error = 0;
1845 2059
1846 2060 ASSERT(DVA_IS_VALID(dva));
1847 2061
1848 2062 if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
1849 2063 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
1850 2064 return (SET_ERROR(ENXIO));
1851 2065
1852 2066 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
1853 2067
1854 2068 if (DVA_GET_GANG(dva))
1855 2069 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
1856 2070
1857 2071 mutex_enter(&msp->ms_lock);
1858 2072
1859 - if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map->sm_loaded)
2073 + if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
1860 2074 error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
1861 2075
1862 - if (error == 0 && !space_map_contains(msp->ms_map, offset, size))
2076 + if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size))
1863 2077 error = SET_ERROR(ENOENT);
1864 2078
1865 2079 if (error || txg == 0) { /* txg == 0 indicates dry run */
1866 2080 mutex_exit(&msp->ms_lock);
1867 2081 return (error);
1868 2082 }
1869 2083
1870 - space_map_claim(msp->ms_map, offset, size);
2084 + VERIFY(!msp->ms_condensing);
2085 + VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
2086 + VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
2087 + VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size);
2088 + range_tree_remove(msp->ms_tree, offset, size);
1871 2089
1872 2090 if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */
1873 - if (msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0)
2091 + if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
1874 2092 vdev_dirty(vd, VDD_METASLAB, msp, txg);
1875 - space_map_add(msp->ms_allocmap[txg & TXG_MASK], offset, size);
2093 + range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
1876 2094 }
1877 2095
1878 2096 mutex_exit(&msp->ms_lock);
1879 2097
1880 2098 return (0);
1881 2099 }
1882 2100
1883 2101 int
1884 2102 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
1885 2103 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
1886 2104 {
1887 2105 dva_t *dva = bp->blk_dva;
1888 2106 dva_t *hintdva = hintbp->blk_dva;
1889 2107 int error = 0;
1890 2108
1891 2109 ASSERT(bp->blk_birth == 0);
1892 2110 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
1893 2111
1894 2112 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
1895 2113
1896 2114 if (mc->mc_rotor == NULL) { /* no vdevs in this class */
1897 2115 spa_config_exit(spa, SCL_ALLOC, FTAG);
1898 2116 return (SET_ERROR(ENOSPC));
1899 2117 }
1900 2118
1901 2119 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
1902 2120 ASSERT(BP_GET_NDVAS(bp) == 0);
1903 2121 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
1904 2122
1905 2123 for (int d = 0; d < ndvas; d++) {
1906 2124 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
1907 2125 txg, flags);
1908 - if (error) {
2126 + if (error != 0) {
1909 2127 for (d--; d >= 0; d--) {
1910 2128 metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
1911 2129 bzero(&dva[d], sizeof (dva_t));
1912 2130 }
1913 2131 spa_config_exit(spa, SCL_ALLOC, FTAG);
1914 2132 return (error);
1915 2133 }
1916 2134 }
1917 2135 ASSERT(error == 0);
1918 2136 ASSERT(BP_GET_NDVAS(bp) == ndvas);
1919 2137
1920 2138 spa_config_exit(spa, SCL_ALLOC, FTAG);
1921 2139
1922 2140 BP_SET_BIRTH(bp, txg, txg);
1923 2141
1924 2142 return (0);
1925 2143 }
1926 2144
1927 2145 void
1928 2146 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
1929 2147 {
1930 2148 const dva_t *dva = bp->blk_dva;
1931 2149 int ndvas = BP_GET_NDVAS(bp);
1932 2150
1933 2151 ASSERT(!BP_IS_HOLE(bp));
1934 2152 ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
1935 2153
1936 2154 spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
1937 2155
1938 2156 for (int d = 0; d < ndvas; d++)
1939 2157 metaslab_free_dva(spa, &dva[d], txg, now);
1940 2158
1941 2159 spa_config_exit(spa, SCL_FREE, FTAG);
1942 2160 }
1943 2161
1944 2162 int
1945 2163 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
1946 2164 {
1947 2165 const dva_t *dva = bp->blk_dva;
1948 2166 int ndvas = BP_GET_NDVAS(bp);
1949 2167 int error = 0;
1950 2168
1951 2169 ASSERT(!BP_IS_HOLE(bp));
1952 2170
1953 2171 if (txg != 0) {
1954 2172 /*
1955 2173 * First do a dry run to make sure all DVAs are claimable,
1956 2174 * so we don't have to unwind from partial failures below.
1957 2175 */
1958 2176 if ((error = metaslab_claim(spa, bp, 0)) != 0)
1959 2177 return (error);
1960 2178 }
1961 2179
1962 2180 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
1963 2181
1964 2182 for (int d = 0; d < ndvas; d++)
1965 2183 if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
1966 2184 break;
1967 2185
1968 2186 spa_config_exit(spa, SCL_ALLOC, FTAG);
1969 2187
1970 2188 ASSERT(error == 0 || txg == 0);
1971 2189
1972 2190 return (error);
1973 2191 }
1974 2192
1975 -static void
1976 -checkmap(space_map_t *sm, uint64_t off, uint64_t size)
1977 -{
1978 - space_seg_t *ss;
1979 - avl_index_t where;
1980 -
1981 - mutex_enter(sm->sm_lock);
1982 - ss = space_map_find(sm, off, size, &where);
1983 - if (ss != NULL)
1984 - panic("freeing free block; ss=%p", (void *)ss);
1985 - mutex_exit(sm->sm_lock);
1986 -}
1987 -
1988 2193 void
1989 2194 metaslab_check_free(spa_t *spa, const blkptr_t *bp)
1990 2195 {
1991 2196 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
1992 2197 return;
1993 2198
1994 2199 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
1995 2200 for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
1996 - uint64_t vdid = DVA_GET_VDEV(&bp->blk_dva[i]);
1997 - vdev_t *vd = vdev_lookup_top(spa, vdid);
1998 - uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[i]);
2201 + uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
2202 + vdev_t *vd = vdev_lookup_top(spa, vdev);
2203 + uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
1999 2204 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
2000 - metaslab_t *ms = vd->vdev_ms[off >> vd->vdev_ms_shift];
2205 + metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
2001 2206
2002 - if (ms->ms_map->sm_loaded)
2003 - checkmap(ms->ms_map, off, size);
2207 + if (msp->ms_loaded)
2208 + range_tree_verify(msp->ms_tree, offset, size);
2004 2209
2005 2210 for (int j = 0; j < TXG_SIZE; j++)
2006 - checkmap(ms->ms_freemap[j], off, size);
2211 + range_tree_verify(msp->ms_freetree[j], offset, size);
2007 2212 for (int j = 0; j < TXG_DEFER_SIZE; j++)
2008 - checkmap(ms->ms_defermap[j], off, size);
2213 + range_tree_verify(msp->ms_defertree[j], offset, size);
2009 2214 }
2010 2215 spa_config_exit(spa, SCL_VDEV, FTAG);
2011 2216 }