4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4104 ::spa_space no longer works
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
--- old/usr/src/uts/common/fs/zfs/space_map.c
+++ new/usr/src/uts/common/fs/zfs/space_map.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 */
25 25 /*
26 - * Copyright (c) 2012 by Delphix. All rights reserved.
26 + * Copyright (c) 2013 by Delphix. All rights reserved.
27 27 */
28 28
29 29 #include <sys/zfs_context.h>
30 30 #include <sys/spa.h>
31 31 #include <sys/dmu.h>
32 +#include <sys/dmu_tx.h>
33 +#include <sys/dnode.h>
34 +#include <sys/dsl_pool.h>
32 35 #include <sys/zio.h>
33 36 #include <sys/space_map.h>
37 +#include <sys/refcount.h>
38 +#include <sys/zfeature.h>
34 39
35 -static kmem_cache_t *space_seg_cache;
36 -
37 -void
38 -space_map_init(void)
39 -{
40 - ASSERT(space_seg_cache == NULL);
41 - space_seg_cache = kmem_cache_create("space_seg_cache",
42 - sizeof (space_seg_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
43 -}
44 -
45 -void
46 -space_map_fini(void)
47 -{
48 - kmem_cache_destroy(space_seg_cache);
49 - space_seg_cache = NULL;
50 -}
51 -
52 40 /*
53 - * Space map routines.
54 - * NOTE: caller is responsible for all locking.
41 + * This value controls how the space map's block size is allowed to grow.
42 + * If the value is set to the same size as SPACE_MAP_INITIAL_BLOCKSIZE then
43 + * the space map block size will remain fixed. Setting this value to something
44 + * greater than SPACE_MAP_INITIAL_BLOCKSIZE will allow the space map to
45 + * increase its block size as needed. To maintain backwards compatibility the
46 + * space map's block size must be a power of 2 and SPACE_MAP_INITIAL_BLOCKSIZE
47 + * or larger.
55 48 */
56 -static int
57 -space_map_seg_compare(const void *x1, const void *x2)
58 -{
59 - const space_seg_t *s1 = x1;
60 - const space_seg_t *s2 = x2;
49 +int space_map_max_blksz = (1 << 12);
61 50
62 - if (s1->ss_start < s2->ss_start) {
63 - if (s1->ss_end > s2->ss_start)
64 - return (0);
65 - return (-1);
66 - }
67 - if (s1->ss_start > s2->ss_start) {
68 - if (s1->ss_start < s2->ss_end)
69 - return (0);
70 - return (1);
71 - }
72 - return (0);
73 -}
74 -
75 -void
76 -space_map_create(space_map_t *sm, uint64_t start, uint64_t size, uint8_t shift,
77 - kmutex_t *lp)
78 -{
79 - bzero(sm, sizeof (*sm));
80 -
81 - cv_init(&sm->sm_load_cv, NULL, CV_DEFAULT, NULL);
82 -
83 - avl_create(&sm->sm_root, space_map_seg_compare,
84 - sizeof (space_seg_t), offsetof(struct space_seg, ss_node));
85 -
86 - sm->sm_start = start;
87 - sm->sm_size = size;
88 - sm->sm_shift = shift;
89 - sm->sm_lock = lp;
90 -}
91 -
92 -void
93 -space_map_destroy(space_map_t *sm)
94 -{
95 - ASSERT(!sm->sm_loaded && !sm->sm_loading);
96 - VERIFY0(sm->sm_space);
97 - avl_destroy(&sm->sm_root);
98 - cv_destroy(&sm->sm_load_cv);
99 -}
100 -
101 -void
102 -space_map_add(space_map_t *sm, uint64_t start, uint64_t size)
103 -{
104 - avl_index_t where;
105 - space_seg_t *ss_before, *ss_after, *ss;
106 - uint64_t end = start + size;
107 - int merge_before, merge_after;
108 -
109 - ASSERT(MUTEX_HELD(sm->sm_lock));
110 - VERIFY(!sm->sm_condensing);
111 - VERIFY(size != 0);
112 - VERIFY3U(start, >=, sm->sm_start);
113 - VERIFY3U(end, <=, sm->sm_start + sm->sm_size);
114 - VERIFY(sm->sm_space + size <= sm->sm_size);
115 - VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
116 - VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
117 -
118 - ss = space_map_find(sm, start, size, &where);
119 - if (ss != NULL) {
120 - zfs_panic_recover("zfs: allocating allocated segment"
121 - "(offset=%llu size=%llu)\n",
122 - (longlong_t)start, (longlong_t)size);
123 - return;
124 - }
125 -
126 - /* Make sure we don't overlap with either of our neighbors */
127 - VERIFY(ss == NULL);
128 -
129 - ss_before = avl_nearest(&sm->sm_root, where, AVL_BEFORE);
130 - ss_after = avl_nearest(&sm->sm_root, where, AVL_AFTER);
131 -
132 - merge_before = (ss_before != NULL && ss_before->ss_end == start);
133 - merge_after = (ss_after != NULL && ss_after->ss_start == end);
134 -
135 - if (merge_before && merge_after) {
136 - avl_remove(&sm->sm_root, ss_before);
137 - if (sm->sm_pp_root) {
138 - avl_remove(sm->sm_pp_root, ss_before);
139 - avl_remove(sm->sm_pp_root, ss_after);
140 - }
141 - ss_after->ss_start = ss_before->ss_start;
142 - kmem_cache_free(space_seg_cache, ss_before);
143 - ss = ss_after;
144 - } else if (merge_before) {
145 - ss_before->ss_end = end;
146 - if (sm->sm_pp_root)
147 - avl_remove(sm->sm_pp_root, ss_before);
148 - ss = ss_before;
149 - } else if (merge_after) {
150 - ss_after->ss_start = start;
151 - if (sm->sm_pp_root)
152 - avl_remove(sm->sm_pp_root, ss_after);
153 - ss = ss_after;
154 - } else {
155 - ss = kmem_cache_alloc(space_seg_cache, KM_SLEEP);
156 - ss->ss_start = start;
157 - ss->ss_end = end;
158 - avl_insert(&sm->sm_root, ss, where);
159 - }
160 -
161 - if (sm->sm_pp_root)
162 - avl_add(sm->sm_pp_root, ss);
163 -
164 - sm->sm_space += size;
165 -}
166 -
167 -void
168 -space_map_remove(space_map_t *sm, uint64_t start, uint64_t size)
169 -{
170 - avl_index_t where;
171 - space_seg_t *ss, *newseg;
172 - uint64_t end = start + size;
173 - int left_over, right_over;
174 -
175 - VERIFY(!sm->sm_condensing);
176 - ss = space_map_find(sm, start, size, &where);
177 -
178 - /* Make sure we completely overlap with someone */
179 - if (ss == NULL) {
180 - zfs_panic_recover("zfs: freeing free segment "
181 - "(offset=%llu size=%llu)",
182 - (longlong_t)start, (longlong_t)size);
183 - return;
184 - }
185 - VERIFY3U(ss->ss_start, <=, start);
186 - VERIFY3U(ss->ss_end, >=, end);
187 - VERIFY(sm->sm_space - size <= sm->sm_size);
188 -
189 - left_over = (ss->ss_start != start);
190 - right_over = (ss->ss_end != end);
191 -
192 - if (sm->sm_pp_root)
193 - avl_remove(sm->sm_pp_root, ss);
194 -
195 - if (left_over && right_over) {
196 - newseg = kmem_cache_alloc(space_seg_cache, KM_SLEEP);
197 - newseg->ss_start = end;
198 - newseg->ss_end = ss->ss_end;
199 - ss->ss_end = start;
200 - avl_insert_here(&sm->sm_root, newseg, ss, AVL_AFTER);
201 - if (sm->sm_pp_root)
202 - avl_add(sm->sm_pp_root, newseg);
203 - } else if (left_over) {
204 - ss->ss_end = start;
205 - } else if (right_over) {
206 - ss->ss_start = end;
207 - } else {
208 - avl_remove(&sm->sm_root, ss);
209 - kmem_cache_free(space_seg_cache, ss);
210 - ss = NULL;
211 - }
212 -
213 - if (sm->sm_pp_root && ss != NULL)
214 - avl_add(sm->sm_pp_root, ss);
215 -
216 - sm->sm_space -= size;
217 -}
218 -
219 -space_seg_t *
220 -space_map_find(space_map_t *sm, uint64_t start, uint64_t size,
221 - avl_index_t *wherep)
222 -{
223 - space_seg_t ssearch, *ss;
224 -
225 - ASSERT(MUTEX_HELD(sm->sm_lock));
226 - VERIFY(size != 0);
227 - VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
228 - VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
229 -
230 - ssearch.ss_start = start;
231 - ssearch.ss_end = start + size;
232 - ss = avl_find(&sm->sm_root, &ssearch, wherep);
233 -
234 - if (ss != NULL && ss->ss_start <= start && ss->ss_end >= start + size)
235 - return (ss);
236 - return (NULL);
237 -}
238 -
239 -boolean_t
240 -space_map_contains(space_map_t *sm, uint64_t start, uint64_t size)
241 -{
242 - avl_index_t where;
243 -
244 - return (space_map_find(sm, start, size, &where) != 0);
245 -}
246 -
247 -void
248 -space_map_swap(space_map_t **msrc, space_map_t **mdst)
249 -{
250 - space_map_t *sm;
251 -
252 - ASSERT(MUTEX_HELD((*msrc)->sm_lock));
253 - ASSERT0((*mdst)->sm_space);
254 - ASSERT0(avl_numnodes(&(*mdst)->sm_root));
255 -
256 - sm = *msrc;
257 - *msrc = *mdst;
258 - *mdst = sm;
259 -}
260 -
261 -void
262 -space_map_vacate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
263 -{
264 - space_seg_t *ss;
265 - void *cookie = NULL;
266 -
267 - ASSERT(MUTEX_HELD(sm->sm_lock));
268 -
269 - while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) {
270 - if (func != NULL)
271 - func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
272 - kmem_cache_free(space_seg_cache, ss);
273 - }
274 - sm->sm_space = 0;
275 -}
276 -
277 -void
278 -space_map_walk(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
279 -{
280 - space_seg_t *ss;
281 -
282 - ASSERT(MUTEX_HELD(sm->sm_lock));
283 -
284 - for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
285 - func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
286 -}
287 -
288 51 /*
289 - * Wait for any in-progress space_map_load() to complete.
290 - */
291 -void
292 -space_map_load_wait(space_map_t *sm)
293 -{
294 - ASSERT(MUTEX_HELD(sm->sm_lock));
295 -
296 - while (sm->sm_loading) {
297 - ASSERT(!sm->sm_loaded);
298 - cv_wait(&sm->sm_load_cv, sm->sm_lock);
299 - }
300 -}
301 -
302 -/*
52 + * Load the space map from disk into the specified range tree. Segments of
53 + * maptype are added to the range tree; segments of other types are removed.
54 + *
303 55 * Note: space_map_load() will drop sm_lock across dmu_read() calls.
304 56 * The caller must be OK with this.
305 57 */
306 58 int
307 -space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype,
308 - space_map_obj_t *smo, objset_t *os)
59 +space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype)
309 60 {
310 61 uint64_t *entry, *entry_map, *entry_map_end;
311 62 uint64_t bufsize, size, offset, end, space;
312 - uint64_t mapstart = sm->sm_start;
313 63 int error = 0;
314 64
315 65 ASSERT(MUTEX_HELD(sm->sm_lock));
316 - ASSERT(!sm->sm_loaded);
317 - ASSERT(!sm->sm_loading);
318 66
319 - sm->sm_loading = B_TRUE;
320 - end = smo->smo_objsize;
321 - space = smo->smo_alloc;
67 + end = space_map_length(sm);
68 + space = space_map_allocated(sm);
322 69
323 - ASSERT(sm->sm_ops == NULL);
324 - VERIFY0(sm->sm_space);
70 + VERIFY0(range_tree_space(rt));
325 71
326 72 if (maptype == SM_FREE) {
327 - space_map_add(sm, sm->sm_start, sm->sm_size);
73 + range_tree_add(rt, sm->sm_start, sm->sm_size);
328 74 space = sm->sm_size - space;
329 75 }
330 76
331 - bufsize = 1ULL << SPACE_MAP_BLOCKSHIFT;
77 + bufsize = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE);
332 78 entry_map = zio_buf_alloc(bufsize);
333 79
334 80 mutex_exit(sm->sm_lock);
335 - if (end > bufsize)
336 - dmu_prefetch(os, smo->smo_object, bufsize, end - bufsize);
81 + if (end > bufsize) {
82 + dmu_prefetch(sm->sm_os, space_map_object(sm), bufsize,
83 + end - bufsize);
84 + }
337 85 mutex_enter(sm->sm_lock);
338 86
339 87 for (offset = 0; offset < end; offset += bufsize) {
340 88 size = MIN(end - offset, bufsize);
341 89 VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0);
342 90 VERIFY(size != 0);
91 + ASSERT3U(sm->sm_blksz, !=, 0);
343 92
344 93 dprintf("object=%llu offset=%llx size=%llx\n",
345 - smo->smo_object, offset, size);
94 + space_map_object(sm), offset, size);
346 95
347 96 mutex_exit(sm->sm_lock);
348 - error = dmu_read(os, smo->smo_object, offset, size, entry_map,
349 - DMU_READ_PREFETCH);
97 + error = dmu_read(sm->sm_os, space_map_object(sm), offset, size,
98 + entry_map, DMU_READ_PREFETCH);
350 99 mutex_enter(sm->sm_lock);
351 100 if (error != 0)
352 101 break;
353 102
354 103 entry_map_end = entry_map + (size / sizeof (uint64_t));
355 104 for (entry = entry_map; entry < entry_map_end; entry++) {
356 105 uint64_t e = *entry;
106 + uint64_t offset, size;
357 107
358 108 if (SM_DEBUG_DECODE(e)) /* Skip debug entries */
359 109 continue;
360 110
361 - (SM_TYPE_DECODE(e) == maptype ?
362 - space_map_add : space_map_remove)(sm,
363 - (SM_OFFSET_DECODE(e) << sm->sm_shift) + mapstart,
364 - SM_RUN_DECODE(e) << sm->sm_shift);
111 + offset = (SM_OFFSET_DECODE(e) << sm->sm_shift) +
112 + sm->sm_start;
113 + size = SM_RUN_DECODE(e) << sm->sm_shift;
114 +
115 + VERIFY0(P2PHASE(offset, 1ULL << sm->sm_shift));
116 + VERIFY0(P2PHASE(size, 1ULL << sm->sm_shift));
117 + VERIFY3U(offset, >=, sm->sm_start);
118 + VERIFY3U(offset + size, <=, sm->sm_start + sm->sm_size);
119 + if (SM_TYPE_DECODE(e) == maptype) {
120 + VERIFY3U(range_tree_space(rt) + size, <=,
121 + sm->sm_size);
122 + range_tree_add(rt, offset, size);
123 + } else {
124 + range_tree_remove(rt, offset, size);
125 + }
365 126 }
366 127 }
367 128
368 - if (error == 0) {
369 - VERIFY3U(sm->sm_space, ==, space);
129 + if (error == 0)
130 + VERIFY3U(range_tree_space(rt), ==, space);
131 + else
132 + range_tree_vacate(rt, NULL, NULL);
370 133
371 - sm->sm_loaded = B_TRUE;
372 - sm->sm_ops = ops;
373 - if (ops != NULL)
374 - ops->smop_load(sm);
375 - } else {
376 - space_map_vacate(sm, NULL, NULL);
377 - }
378 -
379 134 zio_buf_free(entry_map, bufsize);
135 + return (error);
136 +}
380 137
381 - sm->sm_loading = B_FALSE;
138 +void
139 +space_map_histogram_clear(space_map_t *sm)
140 +{
141 + if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
142 + return;
382 143
383 - cv_broadcast(&sm->sm_load_cv);
144 + bzero(sm->sm_phys->smp_histogram, sizeof (sm->sm_phys->smp_histogram));
145 +}
384 146
385 - return (error);
147 +boolean_t
148 +space_map_histogram_verify(space_map_t *sm, range_tree_t *rt)
149 +{
150 + /*
151 + * Verify that the in-core range tree does not have any
152 + * ranges smaller than our sm_shift size.
153 + */
154 + for (int i = 0; i < sm->sm_shift; i++) {
155 + if (rt->rt_histogram[i] != 0)
156 + return (B_FALSE);
157 + }
158 + return (B_TRUE);
386 159 }
387 160
388 161 void
389 -space_map_unload(space_map_t *sm)
162 +space_map_histogram_add(space_map_t *sm, range_tree_t *rt, dmu_tx_t *tx)
390 163 {
391 - ASSERT(MUTEX_HELD(sm->sm_lock));
164 + int idx = 0;
392 165
393 - if (sm->sm_loaded && sm->sm_ops != NULL)
394 - sm->sm_ops->smop_unload(sm);
166 + ASSERT(MUTEX_HELD(rt->rt_lock));
167 + ASSERT(dmu_tx_is_syncing(tx));
168 + VERIFY3U(space_map_object(sm), !=, 0);
395 169
396 - sm->sm_loaded = B_FALSE;
397 - sm->sm_ops = NULL;
170 + if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
171 + return;
398 172
399 - space_map_vacate(sm, NULL, NULL);
400 -}
173 + dmu_buf_will_dirty(sm->sm_dbuf, tx);
401 174
402 -uint64_t
403 -space_map_maxsize(space_map_t *sm)
404 -{
405 - ASSERT(sm->sm_ops != NULL);
406 - return (sm->sm_ops->smop_max(sm));
175 + ASSERT(space_map_histogram_verify(sm, rt));
176 +
177 + /*
178 + * Transfer the content of the range tree histogram to the space
179 + * map histogram. The space map histogram contains 32 buckets ranging
180 + * between 2^sm_shift to 2^(32+sm_shift-1). The range tree,
181 + * however, can represent ranges from 2^0 to 2^63. Since the space
182 + * map only cares about allocatable blocks (minimum of sm_shift) we
183 + * can safely ignore all ranges in the range tree smaller than sm_shift.
184 + */
185 + for (int i = sm->sm_shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
186 +
187 + /*
188 + * Since the largest histogram bucket in the space map is
189 + * 2^(32+sm_shift-1), we need to normalize the values in
190 + * the range tree for any bucket larger than that size. For
191 + * example given an sm_shift of 9, ranges larger than 2^40
192 + * would get normalized as if they were 1TB ranges. Assume
193 + * the range tree had a count of 5 in the 2^44 (16TB) bucket;
194 + * the calculation below would normalize this to 5 * 2^4 = 80.
195 + */
196 + ASSERT3U(i, >=, idx + sm->sm_shift);
197 + sm->sm_phys->smp_histogram[idx] +=
198 + rt->rt_histogram[i] << (i - idx - sm->sm_shift);
199 +
200 + /*
201 + * Increment the space map's index as long as we haven't
202 + * reached the maximum bucket size. Accumulate all ranges
203 + * larger than the max bucket size into the last bucket.
204 + */
205 + if (idx < SPACE_MAP_HISTOGRAM_SIZE(sm) - 1) {
206 + ASSERT3U(idx + sm->sm_shift, ==, i);
207 + idx++;
208 + ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE(sm));
209 + }
210 + }
407 211 }
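
The bucket-folding arithmetic in space_map_histogram_add() above is easy to check in isolation. The following standalone sketch is an illustration only, not part of the changeset; SM_SHIFT, the 32-bucket space map histogram, and the 64-bucket range tree histogram are assumed constants mirroring the comment. It prints 80 for five 16TB segments, matching the normalization described above.

#include <stdio.h>
#include <stdint.h>

#define	SM_SHIFT	9	/* assumed allocatable unit of 2^9 bytes */
#define	SM_HISTO_SIZE	32	/* space map buckets: 2^9 .. 2^40 */
#define	RT_HISTO_SIZE	64	/* range tree buckets: 2^0 .. 2^63 */

int
main(void)
{
	uint64_t smp_histogram[SM_HISTO_SIZE] = { 0 };
	uint64_t rt_histogram[RT_HISTO_SIZE] = { 0 };
	int idx = 0;

	rt_histogram[44] = 5;	/* five 16TB segments */

	for (int i = SM_SHIFT; i < RT_HISTO_SIZE; i++) {
		/* Fold range tree bucket i into space map bucket idx. */
		smp_histogram[idx] +=
		    rt_histogram[i] << (i - idx - SM_SHIFT);
		if (idx < SM_HISTO_SIZE - 1)
			idx++;
	}

	/* Prints 80: each 16TB segment is counted as 2^4 1TB units. */
	printf("%llu\n",
	    (unsigned long long)smp_histogram[SM_HISTO_SIZE - 1]);
	return (0);
}
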
408 212
409 213 uint64_t
410 -space_map_alloc(space_map_t *sm, uint64_t size)
214 +space_map_entries(space_map_t *sm, range_tree_t *rt)
411 215 {
412 - uint64_t start;
216 + avl_tree_t *t = &rt->rt_root;
217 + range_seg_t *rs;
218 + uint64_t size, entries;
413 219
414 - start = sm->sm_ops->smop_alloc(sm, size);
415 - if (start != -1ULL)
416 - space_map_remove(sm, start, size);
417 - return (start);
418 -}
220 + /*
221 + * All space_maps always have a debug entry so account for it here.
222 + */
223 + entries = 1;
419 224
420 -void
421 -space_map_claim(space_map_t *sm, uint64_t start, uint64_t size)
422 -{
423 - sm->sm_ops->smop_claim(sm, start, size);
424 - space_map_remove(sm, start, size);
225 + /*
226 + * Traverse the range tree and calculate the number of space map
227 + * entries that would be required to write out the range tree.
228 + */
229 + for (rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) {
230 + size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
231 + entries += howmany(size, SM_RUN_MAX);
232 + }
233 + return (entries);
425 234 }
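
For reference, the entry estimate computed by space_map_entries() can be reproduced with a small standalone sketch. SM_SHIFT and SM_RUN_MAX below are assumed placeholders standing in for the real constants in sys/space_map.h, and howmany() is spelled out locally; this is not part of the changeset.

#include <stdio.h>
#include <stdint.h>

#define	SM_SHIFT	9		/* assumed 512-byte unit */
#define	SM_RUN_MAX	32768ULL	/* placeholder run-length limit */

/* howmany() as defined in sys/sysmacros.h */
#define	howmany(x, y)	(((x) + ((y) - 1)) / (y))

int
main(void)
{
	/* One debug entry is always written first. */
	uint64_t entries = 1;
	/* Two hypothetical segments: 1GB and 4KB. */
	uint64_t seg_bytes[2] = { 1ULL << 30, 1ULL << 12 };

	for (int i = 0; i < 2; i++) {
		uint64_t size = seg_bytes[i] >> SM_SHIFT;
		entries += howmany(size, SM_RUN_MAX);
	}

	/* 1GB -> 2^21 units -> 64 entries; 4KB -> 1 entry; +1 debug. */
	printf("%llu\n", (unsigned long long)entries);	/* 66 */
	return (0);
}

With these placeholder values, a 1GB segment costs 64 entries, a 4KB segment costs one, and the mandatory debug entry brings the total to 66.
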
426 235
427 236 void
428 -space_map_free(space_map_t *sm, uint64_t start, uint64_t size)
237 +space_map_set_blocksize(space_map_t *sm, uint64_t size, dmu_tx_t *tx)
429 238 {
430 - space_map_add(sm, start, size);
431 - sm->sm_ops->smop_free(sm, start, size);
239 + uint32_t blksz;
240 + u_longlong_t blocks;
241 +
242 + ASSERT3U(sm->sm_blksz, !=, 0);
243 + ASSERT3U(space_map_object(sm), !=, 0);
244 + ASSERT(sm->sm_dbuf != NULL);
245 + VERIFY(ISP2(space_map_max_blksz));
246 +
247 + if (sm->sm_blksz >= space_map_max_blksz)
248 + return;
249 +
250 + /*
251 + * The object contains more than one block so we can't adjust
252 + * its size.
253 + */
254 + if (sm->sm_phys->smp_objsize > sm->sm_blksz)
255 + return;
256 +
257 + if (size > sm->sm_blksz) {
258 + uint64_t newsz;
259 +
260 + /*
261 + * Older software versions treat space map blocks as fixed
262 + * entities. The DMU is capable of handling different block
263 + * sizes making it possible for us to increase the
264 + * block size and maintain backwards compatibility. The
265 + * caveat is that the new block size must be a
266 + * power of 2 so that old software can append to the file,
267 + * adding more blocks. The block size can grow until it
268 + * reaches space_map_max_blksz.
269 + */
270 + newsz = ISP2(size) ? size : 1ULL << highbit(size);
271 + if (newsz > space_map_max_blksz)
272 + newsz = space_map_max_blksz;
273 +
274 + VERIFY0(dmu_object_set_blocksize(sm->sm_os,
275 + space_map_object(sm), newsz, 0, tx));
276 + dmu_object_size_from_db(sm->sm_dbuf, &blksz, &blocks);
277 +
278 + zfs_dbgmsg("txg %llu, spa %s, increasing blksz from %d to %d",
279 + dmu_tx_get_txg(tx), spa_name(dmu_objset_spa(sm->sm_os)),
280 + sm->sm_blksz, blksz);
281 +
282 + VERIFY3U(newsz, ==, blksz);
283 + VERIFY3U(sm->sm_blksz, <, blksz);
284 + sm->sm_blksz = blksz;
285 + }
432 286 }
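
The rounding and capping performed by space_map_set_blocksize() can be sketched on its own. In the sketch below, highbit64() is a local stand-in for the kernel's highbit() (1-based position of the highest set bit) and the 4K cap mirrors the space_map_max_blksz default declared earlier; it is illustrative only.

#include <stdio.h>
#include <stdint.h>

#define	ISP2(x)		(((x) & ((x) - 1)) == 0)
#define	SM_MAX_BLKSZ	(1 << 12)	/* mirrors space_map_max_blksz */

/* 1-based position of the highest set bit, like the kernel's highbit(). */
static int
highbit64(uint64_t i)
{
	int h = 0;

	while (i != 0) {
		h++;
		i >>= 1;
	}
	return (h);
}

/* Round a requested size up to a power of 2, capped at the max block size. */
static uint64_t
new_blocksize(uint64_t size)
{
	uint64_t newsz = ISP2(size) ? size : 1ULL << highbit64(size);

	return (newsz > SM_MAX_BLKSZ ? SM_MAX_BLKSZ : newsz);
}

int
main(void)
{
	printf("%llu\n", (unsigned long long)new_blocksize(1000)); /* 1024 */
	printf("%llu\n", (unsigned long long)new_blocksize(4096)); /* 4096 */
	printf("%llu\n", (unsigned long long)new_blocksize(9000)); /* 4096, capped */
	return (0);
}
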
433 287
434 288 /*
435 - * Note: space_map_sync() will drop sm_lock across dmu_write() calls.
289 + * Note: space_map_write() will drop sm_lock across dmu_write() calls.
436 290 */
437 291 void
438 -space_map_sync(space_map_t *sm, uint8_t maptype,
439 - space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx)
292 +space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
293 + dmu_tx_t *tx)
440 294 {
295 + objset_t *os = sm->sm_os;
441 296 spa_t *spa = dmu_objset_spa(os);
442 - avl_tree_t *t = &sm->sm_root;
443 - space_seg_t *ss;
444 - uint64_t bufsize, start, size, run_len, total, sm_space, nodes;
297 + avl_tree_t *t = &rt->rt_root;
298 + range_seg_t *rs;
299 + uint64_t size, total, rt_space, nodes;
445 300 uint64_t *entry, *entry_map, *entry_map_end;
301 + uint64_t newsz, expected_entries, actual_entries = 1;
446 302
447 - ASSERT(MUTEX_HELD(sm->sm_lock));
303 + ASSERT(MUTEX_HELD(rt->rt_lock));
304 + ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
305 + VERIFY3U(space_map_object(sm), !=, 0);
306 + dmu_buf_will_dirty(sm->sm_dbuf, tx);
448 307
449 - if (sm->sm_space == 0)
308 + /*
309 + * This field is no longer necessary since the in-core space map
310 + * now contains the object number but is maintained for backwards
311 + * compatibility.
312 + */
313 + sm->sm_phys->smp_object = sm->sm_object;
314 +
315 + if (range_tree_space(rt) == 0) {
316 + VERIFY3U(sm->sm_object, ==, sm->sm_phys->smp_object);
450 317 return;
318 + }
451 319
452 - dprintf("object %4llu, txg %llu, pass %d, %c, count %lu, space %llx\n",
453 - smo->smo_object, dmu_tx_get_txg(tx), spa_sync_pass(spa),
454 - maptype == SM_ALLOC ? 'A' : 'F', avl_numnodes(&sm->sm_root),
455 - sm->sm_space);
456 -
457 320 if (maptype == SM_ALLOC)
458 - smo->smo_alloc += sm->sm_space;
321 + sm->sm_phys->smp_alloc += range_tree_space(rt);
459 322 else
460 - smo->smo_alloc -= sm->sm_space;
323 + sm->sm_phys->smp_alloc -= range_tree_space(rt);
461 324
462 - bufsize = (8 + avl_numnodes(&sm->sm_root)) * sizeof (uint64_t);
463 - bufsize = MIN(bufsize, 1ULL << SPACE_MAP_BLOCKSHIFT);
464 - entry_map = zio_buf_alloc(bufsize);
465 - entry_map_end = entry_map + (bufsize / sizeof (uint64_t));
325 + expected_entries = space_map_entries(sm, rt);
326 +
327 + /*
328 + * Calculate the new size for the space map on-disk and see if
329 + * we can grow the block size to accommodate the new size.
330 + */
331 + newsz = sm->sm_phys->smp_objsize + expected_entries * sizeof (uint64_t);
332 + space_map_set_blocksize(sm, newsz, tx);
333 +
334 + entry_map = zio_buf_alloc(sm->sm_blksz);
335 + entry_map_end = entry_map + (sm->sm_blksz / sizeof (uint64_t));
466 336 entry = entry_map;
467 337
468 338 *entry++ = SM_DEBUG_ENCODE(1) |
469 339 SM_DEBUG_ACTION_ENCODE(maptype) |
470 340 SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(spa)) |
471 341 SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));
472 342
473 343 total = 0;
474 - nodes = avl_numnodes(&sm->sm_root);
475 - sm_space = sm->sm_space;
476 - for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss)) {
477 - size = ss->ss_end - ss->ss_start;
478 - start = (ss->ss_start - sm->sm_start) >> sm->sm_shift;
344 + nodes = avl_numnodes(&rt->rt_root);
345 + rt_space = range_tree_space(rt);
346 + for (rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) {
347 + uint64_t start;
479 348
480 - total += size;
481 - size >>= sm->sm_shift;
349 + size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
350 + start = (rs->rs_start - sm->sm_start) >> sm->sm_shift;
482 351
483 - while (size) {
352 + total += size << sm->sm_shift;
353 +
354 + while (size != 0) {
355 + uint64_t run_len;
356 +
484 357 run_len = MIN(size, SM_RUN_MAX);
485 358
486 359 if (entry == entry_map_end) {
487 - mutex_exit(sm->sm_lock);
488 - dmu_write(os, smo->smo_object, smo->smo_objsize,
489 - bufsize, entry_map, tx);
490 - mutex_enter(sm->sm_lock);
491 - smo->smo_objsize += bufsize;
360 + mutex_exit(rt->rt_lock);
361 + dmu_write(os, space_map_object(sm),
362 + sm->sm_phys->smp_objsize, sm->sm_blksz,
363 + entry_map, tx);
364 + mutex_enter(rt->rt_lock);
365 + sm->sm_phys->smp_objsize += sm->sm_blksz;
492 366 entry = entry_map;
493 367 }
494 368
495 369 *entry++ = SM_OFFSET_ENCODE(start) |
496 370 SM_TYPE_ENCODE(maptype) |
497 371 SM_RUN_ENCODE(run_len);
498 372
499 373 start += run_len;
500 374 size -= run_len;
375 + actual_entries++;
501 376 }
502 377 }
503 378
504 379 if (entry != entry_map) {
505 380 size = (entry - entry_map) * sizeof (uint64_t);
506 - mutex_exit(sm->sm_lock);
507 - dmu_write(os, smo->smo_object, smo->smo_objsize,
381 + mutex_exit(rt->rt_lock);
382 + dmu_write(os, space_map_object(sm), sm->sm_phys->smp_objsize,
508 383 size, entry_map, tx);
509 - mutex_enter(sm->sm_lock);
510 - smo->smo_objsize += size;
384 + mutex_enter(rt->rt_lock);
385 + sm->sm_phys->smp_objsize += size;
511 386 }
387 + ASSERT3U(expected_entries, ==, actual_entries);
512 388
513 389 /*
514 390 * Ensure that the space_map's accounting wasn't changed
515 391 * while we were in the middle of writing it out.
516 392 */
517 - VERIFY3U(nodes, ==, avl_numnodes(&sm->sm_root));
518 - VERIFY3U(sm->sm_space, ==, sm_space);
519 - VERIFY3U(sm->sm_space, ==, total);
393 + VERIFY3U(nodes, ==, avl_numnodes(&rt->rt_root));
394 + VERIFY3U(range_tree_space(rt), ==, rt_space);
395 + VERIFY3U(range_tree_space(rt), ==, total);
520 396
521 - zio_buf_free(entry_map, bufsize);
397 + zio_buf_free(entry_map, sm->sm_blksz);
522 398 }
523 399
524 -void
525 -space_map_truncate(space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx)
400 +static int
401 +space_map_open_impl(space_map_t *sm)
526 402 {
527 - VERIFY(dmu_free_range(os, smo->smo_object, 0, -1ULL, tx) == 0);
403 + int error;
404 + u_longlong_t blocks;
528 405
529 - smo->smo_objsize = 0;
530 - smo->smo_alloc = 0;
406 + error = dmu_bonus_hold(sm->sm_os, sm->sm_object, sm, &sm->sm_dbuf);
407 + if (error)
408 + return (error);
409 +
410 + dmu_object_size_from_db(sm->sm_dbuf, &sm->sm_blksz, &blocks);
411 + sm->sm_phys = sm->sm_dbuf->db_data;
412 + return (0);
531 413 }
532 414
533 -/*
534 - * Space map reference trees.
535 - *
536 - * A space map is a collection of integers. Every integer is either
537 - * in the map, or it's not. A space map reference tree generalizes
538 - * the idea: it allows its members to have arbitrary reference counts,
539 - * as opposed to the implicit reference count of 0 or 1 in a space map.
540 - * This representation comes in handy when computing the union or
541 - * intersection of multiple space maps. For example, the union of
542 - * N space maps is the subset of the reference tree with refcnt >= 1.
543 - * The intersection of N space maps is the subset with refcnt >= N.
544 - *
545 - * [It's very much like a Fourier transform. Unions and intersections
546 - * are hard to perform in the 'space map domain', so we convert the maps
547 - * into the 'reference count domain', where it's trivial, then invert.]
548 - *
549 - * vdev_dtl_reassess() uses computations of this form to determine
550 - * DTL_MISSING and DTL_OUTAGE for interior vdevs -- e.g. a RAID-Z vdev
551 - * has an outage wherever refcnt >= vdev_nparity + 1, and a mirror vdev
552 - * has an outage wherever refcnt >= vdev_children.
553 - */
554 -static int
555 -space_map_ref_compare(const void *x1, const void *x2)
415 +int
416 +space_map_open(space_map_t **smp, objset_t *os, uint64_t object,
417 + uint64_t start, uint64_t size, uint8_t shift, kmutex_t *lp)
556 418 {
557 - const space_ref_t *sr1 = x1;
558 - const space_ref_t *sr2 = x2;
419 + space_map_t *sm;
420 + int error;
559 421
560 - if (sr1->sr_offset < sr2->sr_offset)
561 - return (-1);
562 - if (sr1->sr_offset > sr2->sr_offset)
563 - return (1);
422 + ASSERT(*smp == NULL);
423 + ASSERT(os != NULL);
424 + ASSERT(object != 0);
564 425
565 - if (sr1 < sr2)
566 - return (-1);
567 - if (sr1 > sr2)
568 - return (1);
426 + sm = kmem_zalloc(sizeof (space_map_t), KM_SLEEP);
569 427
428 + sm->sm_start = start;
429 + sm->sm_size = size;
430 + sm->sm_shift = shift;
431 + sm->sm_lock = lp;
432 + sm->sm_os = os;
433 + sm->sm_object = object;
434 +
435 + error = space_map_open_impl(sm);
436 + if (error != 0) {
437 + space_map_close(sm);
438 + return (error);
439 + }
440 +
441 + *smp = sm;
442 +
570 443 return (0);
571 444 }
572 445
573 446 void
574 -space_map_ref_create(avl_tree_t *t)
447 +space_map_close(space_map_t *sm)
575 448 {
576 - avl_create(t, space_map_ref_compare,
577 - sizeof (space_ref_t), offsetof(space_ref_t, sr_node));
578 -}
449 + if (sm == NULL)
450 + return;
579 451
580 -void
581 -space_map_ref_destroy(avl_tree_t *t)
582 -{
583 - space_ref_t *sr;
584 - void *cookie = NULL;
452 + if (sm->sm_dbuf != NULL)
453 + dmu_buf_rele(sm->sm_dbuf, sm);
454 + sm->sm_dbuf = NULL;
455 + sm->sm_phys = NULL;
585 456
586 - while ((sr = avl_destroy_nodes(t, &cookie)) != NULL)
587 - kmem_free(sr, sizeof (*sr));
588 -
589 - avl_destroy(t);
457 + kmem_free(sm, sizeof (*sm));
590 458 }
591 459
592 460 static void
593 -space_map_ref_add_node(avl_tree_t *t, uint64_t offset, int64_t refcnt)
461 +space_map_reallocate(space_map_t *sm, dmu_tx_t *tx)
594 462 {
595 - space_ref_t *sr;
463 + ASSERT(dmu_tx_is_syncing(tx));
596 464
597 - sr = kmem_alloc(sizeof (*sr), KM_SLEEP);
598 - sr->sr_offset = offset;
599 - sr->sr_refcnt = refcnt;
465 + space_map_free(sm, tx);
466 + dmu_buf_rele(sm->sm_dbuf, sm);
600 467
601 - avl_add(t, sr);
468 + sm->sm_object = space_map_alloc(sm->sm_os, tx);
469 + VERIFY0(space_map_open_impl(sm));
602 470 }
603 471
604 472 void
605 -space_map_ref_add_seg(avl_tree_t *t, uint64_t start, uint64_t end,
606 - int64_t refcnt)
473 +space_map_truncate(space_map_t *sm, dmu_tx_t *tx)
607 474 {
608 - space_map_ref_add_node(t, start, refcnt);
609 - space_map_ref_add_node(t, end, -refcnt);
475 + objset_t *os = sm->sm_os;
476 + spa_t *spa = dmu_objset_spa(os);
477 + zfeature_info_t *space_map_histogram =
478 + &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM];
479 + dmu_object_info_t doi;
480 + int bonuslen;
481 +
482 + ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
483 + ASSERT(dmu_tx_is_syncing(tx));
484 +
485 + VERIFY0(dmu_free_range(os, space_map_object(sm), 0, -1ULL, tx));
486 + dmu_object_info_from_db(sm->sm_dbuf, &doi);
487 +
488 + if (spa_feature_is_enabled(spa, space_map_histogram)) {
489 + bonuslen = sizeof (space_map_phys_t);
490 + ASSERT3U(bonuslen, <=, dmu_bonus_max());
491 + } else {
492 + bonuslen = SPACE_MAP_SIZE_V0;
493 + }
494 +
495 + if (bonuslen != doi.doi_bonus_size ||
496 + doi.doi_data_block_size != SPACE_MAP_INITIAL_BLOCKSIZE) {
497 + zfs_dbgmsg("txg %llu, spa %s, reallocating: "
498 + "old bonus %u, old blocksz %u", dmu_tx_get_txg(tx),
499 + spa_name(spa), doi.doi_bonus_size, doi.doi_data_block_size);
500 + space_map_reallocate(sm, tx);
501 + VERIFY3U(sm->sm_blksz, ==, SPACE_MAP_INITIAL_BLOCKSIZE);
502 + }
503 +
504 + dmu_buf_will_dirty(sm->sm_dbuf, tx);
505 + sm->sm_phys->smp_objsize = 0;
506 + sm->sm_phys->smp_alloc = 0;
610 507 }
611 508
612 509 /*
613 - * Convert (or add) a space map into a reference tree.
510 + * Update the in-core space_map allocation and length values.
614 511 */
615 512 void
616 -space_map_ref_add_map(avl_tree_t *t, space_map_t *sm, int64_t refcnt)
513 +space_map_update(space_map_t *sm)
617 514 {
618 - space_seg_t *ss;
515 + if (sm == NULL)
516 + return;
619 517
620 518 ASSERT(MUTEX_HELD(sm->sm_lock));
621 519
622 - for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
623 - space_map_ref_add_seg(t, ss->ss_start, ss->ss_end, refcnt);
520 + sm->sm_alloc = sm->sm_phys->smp_alloc;
521 + sm->sm_length = sm->sm_phys->smp_objsize;
624 522 }
625 523
626 -/*
627 - * Convert a reference tree into a space map. The space map will contain
628 - * all members of the reference tree for which refcnt >= minref.
629 - */
524 +uint64_t
525 +space_map_alloc(objset_t *os, dmu_tx_t *tx)
526 +{
527 + spa_t *spa = dmu_objset_spa(os);
528 + zfeature_info_t *space_map_histogram =
529 + &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM];
530 + uint64_t object;
531 + int bonuslen;
532 +
533 + if (spa_feature_is_enabled(spa, space_map_histogram)) {
534 + spa_feature_incr(spa, space_map_histogram, tx);
535 + bonuslen = sizeof (space_map_phys_t);
536 + ASSERT3U(bonuslen, <=, dmu_bonus_max());
537 + } else {
538 + bonuslen = SPACE_MAP_SIZE_V0;
539 + }
540 +
541 + object = dmu_object_alloc(os,
542 + DMU_OT_SPACE_MAP, SPACE_MAP_INITIAL_BLOCKSIZE,
543 + DMU_OT_SPACE_MAP_HEADER, bonuslen, tx);
544 +
545 + return (object);
546 +}
547 +
630 548 void
631 -space_map_ref_generate_map(avl_tree_t *t, space_map_t *sm, int64_t minref)
549 +space_map_free(space_map_t *sm, dmu_tx_t *tx)
632 550 {
633 - uint64_t start = -1ULL;
634 - int64_t refcnt = 0;
635 - space_ref_t *sr;
551 + spa_t *spa;
552 + zfeature_info_t *space_map_histogram =
553 + &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM];
636 554
637 - ASSERT(MUTEX_HELD(sm->sm_lock));
555 + if (sm == NULL)
556 + return;
638 557
639 - space_map_vacate(sm, NULL, NULL);
558 + spa = dmu_objset_spa(sm->sm_os);
559 + if (spa_feature_is_enabled(spa, space_map_histogram)) {
560 + dmu_object_info_t doi;
640 561
641 - for (sr = avl_first(t); sr != NULL; sr = AVL_NEXT(t, sr)) {
642 - refcnt += sr->sr_refcnt;
643 - if (refcnt >= minref) {
644 - if (start == -1ULL) {
645 - start = sr->sr_offset;
646 - }
647 - } else {
648 - if (start != -1ULL) {
649 - uint64_t end = sr->sr_offset;
650 - ASSERT(start <= end);
651 - if (end > start)
652 - space_map_add(sm, start, end - start);
653 - start = -1ULL;
654 - }
562 + dmu_object_info_from_db(sm->sm_dbuf, &doi);
563 + if (doi.doi_bonus_size != SPACE_MAP_SIZE_V0) {
564 + VERIFY(spa_feature_is_active(spa, space_map_histogram));
565 + spa_feature_decr(spa, space_map_histogram, tx);
655 566 }
656 567 }
657 - ASSERT(refcnt == 0);
658 - ASSERT(start == -1ULL);
568 +
569 + VERIFY3U(dmu_object_free(sm->sm_os, space_map_object(sm), tx), ==, 0);
570 + sm->sm_object = 0;
571 +}
572 +
573 +uint64_t
574 +space_map_object(space_map_t *sm)
575 +{
576 + return (sm != NULL ? sm->sm_object : 0);
577 +}
578 +
579 +/*
580 + * Returns the already synced, on-disk allocated space.
581 + */
582 +uint64_t
583 +space_map_allocated(space_map_t *sm)
584 +{
585 + return (sm != NULL ? sm->sm_alloc : 0);
586 +}
587 +
588 +/*
589 + * Returns the already synced, on-disk length.
590 + */
591 +uint64_t
592 +space_map_length(space_map_t *sm)
593 +{
594 + return (sm != NULL ? sm->sm_length : 0);
595 +}
596 +
597 +/*
598 + * Returns the allocated space that is currently syncing.
599 + */
600 +int64_t
601 +space_map_alloc_delta(space_map_t *sm)
602 +{
603 + if (sm == NULL)
604 + return (0);
605 + ASSERT(sm->sm_dbuf != NULL);
606 + return (sm->sm_phys->smp_alloc - space_map_allocated(sm));
659 607 }