1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 /* 26 * Copyright (c) 2013 by Delphix. All rights reserved. 27 */ 28 29 #include <sys/zfs_context.h> 30 #include <sys/spa.h> 31 #include <sys/dmu.h> 32 #include <sys/dnode.h> 33 #include <sys/zio.h> 34 #include <sys/range_tree.h> 35 36 static kmem_cache_t *range_seg_cache; 37 38 void 39 range_tree_init(void) 40 { 41 ASSERT(range_seg_cache == NULL); 42 range_seg_cache = kmem_cache_create("range_seg_cache", 43 sizeof (range_seg_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 44 } 45 46 void 47 range_tree_fini(void) 48 { 49 kmem_cache_destroy(range_seg_cache); 50 range_seg_cache = NULL; 51 } 52 53 void 54 range_tree_stat_verify(range_tree_t *rt) 55 { 56 range_seg_t *rs; 57 uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 }; 58 int i; 59 60 for (rs = avl_first(&rt->rt_root); rs != NULL; 61 rs = AVL_NEXT(&rt->rt_root, rs)) { 62 uint64_t size = rs->rs_end - rs->rs_start; 63 int idx = highbit(size) - 1; 64 65 hist[idx]++; 66 ASSERT3U(hist[idx], !=, 0); 67 } 68 69 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { 70 if (hist[i] != rt->rt_histogram[i]) { 71 zfs_dbgmsg("i=%d, hist=%p, hist=%llu, rt_hist=%llu", 72 i, hist, hist[i], rt->rt_histogram[i]); 73 } 74 VERIFY3U(hist[i], ==, rt->rt_histogram[i]); 75 } 76 } 77 78 static void 79 range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs) 80 { 81 uint64_t size = rs->rs_end - rs->rs_start; 82 int idx = highbit(size) - 1; 83 84 ASSERT3U(idx, <, 85 sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram)); 86 87 ASSERT(MUTEX_HELD(rt->rt_lock)); 88 rt->rt_histogram[idx]++; 89 ASSERT3U(rt->rt_histogram[idx], !=, 0); 90 } 91 92 static void 93 range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs) 94 { 95 uint64_t size = rs->rs_end - rs->rs_start; 96 int idx = highbit(size) - 1; 97 98 ASSERT3U(idx, <, 99 sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram)); 100 101 ASSERT(MUTEX_HELD(rt->rt_lock)); 102 ASSERT3U(rt->rt_histogram[idx], !=, 0); 103 rt->rt_histogram[idx]--; 104 } 105 106 /* 107 * NOTE: caller is responsible for all locking. 108 */ 109 static int 110 range_tree_seg_compare(const void *x1, const void *x2) 111 { 112 const range_seg_t *r1 = x1; 113 const range_seg_t *r2 = x2; 114 115 if (r1->rs_start < r2->rs_start) { 116 if (r1->rs_end > r2->rs_start) 117 return (0); 118 return (-1); 119 } 120 if (r1->rs_start > r2->rs_start) { 121 if (r1->rs_start < r2->rs_end) 122 return (0); 123 return (1); 124 } 125 return (0); 126 } 127 128 range_tree_t * 129 range_tree_create(range_tree_ops_t *ops, void *arg, kmutex_t *lp) 130 { 131 range_tree_t *rt; 132 133 rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP); 134 135 avl_create(&rt->rt_root, range_tree_seg_compare, 136 sizeof (range_seg_t), offsetof(range_seg_t, rs_node)); 137 138 rt->rt_lock = lp; 139 rt->rt_ops = ops; 140 rt->rt_arg = arg; 141 142 if (rt->rt_ops != NULL) 143 rt->rt_ops->rtop_create(rt, rt->rt_arg); 144 145 return (rt); 146 } 147 148 void 149 range_tree_destroy(range_tree_t *rt) 150 { 151 VERIFY0(rt->rt_space); 152 153 if (rt->rt_ops != NULL) 154 rt->rt_ops->rtop_destroy(rt, rt->rt_arg); 155 156 avl_destroy(&rt->rt_root); 157 kmem_free(rt, sizeof (*rt)); 158 } 159 160 void 161 range_tree_add(void *arg, uint64_t start, uint64_t size) 162 { 163 range_tree_t *rt = arg; 164 avl_index_t where; 165 range_seg_t rsearch, *rs_before, *rs_after, *rs; 166 uint64_t end = start + size; 167 boolean_t merge_before, merge_after; 168 169 ASSERT(MUTEX_HELD(rt->rt_lock)); 170 VERIFY(size != 0); 171 172 rsearch.rs_start = start; 173 rsearch.rs_end = end; 174 rs = avl_find(&rt->rt_root, &rsearch, &where); 175 176 if (rs != NULL && rs->rs_start <= start && rs->rs_end >= end) { 177 zfs_panic_recover("zfs: allocating allocated segment" 178 "(offset=%llu size=%llu)\n", 179 (longlong_t)start, (longlong_t)size); 180 return; 181 } 182 183 /* Make sure we don't overlap with either of our neighbors */ 184 VERIFY(rs == NULL); 185 186 rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE); 187 rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER); 188 189 merge_before = (rs_before != NULL && rs_before->rs_end == start); 190 merge_after = (rs_after != NULL && rs_after->rs_start == end); 191 192 if (merge_before && merge_after) { 193 avl_remove(&rt->rt_root, rs_before); 194 if (rt->rt_ops != NULL) { 195 rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg); 196 rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg); 197 } 198 199 range_tree_stat_decr(rt, rs_before); 200 range_tree_stat_decr(rt, rs_after); 201 202 rs_after->rs_start = rs_before->rs_start; 203 kmem_cache_free(range_seg_cache, rs_before); 204 rs = rs_after; 205 } else if (merge_before) { 206 if (rt->rt_ops != NULL) 207 rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg); 208 209 range_tree_stat_decr(rt, rs_before); 210 211 rs_before->rs_end = end; 212 rs = rs_before; 213 } else if (merge_after) { 214 if (rt->rt_ops != NULL) 215 rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg); 216 217 range_tree_stat_decr(rt, rs_after); 218 219 rs_after->rs_start = start; 220 rs = rs_after; 221 } else { 222 rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP); 223 rs->rs_start = start; 224 rs->rs_end = end; 225 avl_insert(&rt->rt_root, rs, where); 226 } 227 228 if (rt->rt_ops != NULL) 229 rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); 230 231 range_tree_stat_incr(rt, rs); 232 rt->rt_space += size; 233 } 234 235 void 236 range_tree_remove(void *arg, uint64_t start, uint64_t size) 237 { 238 range_tree_t *rt = arg; 239 avl_index_t where; 240 range_seg_t rsearch, *rs, *newseg; 241 uint64_t end = start + size; 242 boolean_t left_over, right_over; 243 244 ASSERT(MUTEX_HELD(rt->rt_lock)); 245 VERIFY3U(size, !=, 0); 246 VERIFY3U(size, <=, rt->rt_space); 247 248 rsearch.rs_start = start; 249 rsearch.rs_end = end; 250 rs = avl_find(&rt->rt_root, &rsearch, &where); 251 252 /* Make sure we completely overlap with someone */ 253 if (rs == NULL) { 254 zfs_panic_recover("zfs: freeing free segment " 255 "(offset=%llu size=%llu)", 256 (longlong_t)start, (longlong_t)size); 257 return; 258 } 259 VERIFY3U(rs->rs_start, <=, start); 260 VERIFY3U(rs->rs_end, >=, end); 261 262 left_over = (rs->rs_start != start); 263 right_over = (rs->rs_end != end); 264 265 range_tree_stat_decr(rt, rs); 266 267 if (rt->rt_ops != NULL) 268 rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); 269 270 if (left_over && right_over) { 271 newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP); 272 newseg->rs_start = end; 273 newseg->rs_end = rs->rs_end; 274 range_tree_stat_incr(rt, newseg); 275 276 rs->rs_end = start; 277 278 avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER); 279 if (rt->rt_ops != NULL) 280 rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg); 281 } else if (left_over) { 282 rs->rs_end = start; 283 } else if (right_over) { 284 rs->rs_start = end; 285 } else { 286 avl_remove(&rt->rt_root, rs); 287 kmem_cache_free(range_seg_cache, rs); 288 rs = NULL; 289 } 290 291 if (rs != NULL) { 292 range_tree_stat_incr(rt, rs); 293 294 if (rt->rt_ops != NULL) 295 rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); 296 } 297 298 rt->rt_space -= size; 299 } 300 301 static range_seg_t * 302 range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size, 303 avl_index_t *wherep) 304 { 305 range_seg_t rsearch, *rs; 306 uint64_t end = start + size; 307 308 ASSERT(MUTEX_HELD(rt->rt_lock)); 309 VERIFY(size != 0); 310 311 rsearch.rs_start = start; 312 rsearch.rs_end = end; 313 rs = avl_find(&rt->rt_root, &rsearch, wherep); 314 315 if (rs != NULL && rs->rs_start <= start && rs->rs_end >= end) 316 return (rs); 317 return (NULL); 318 } 319 320 void 321 range_tree_verify(range_tree_t *rt, uint64_t off, uint64_t size) 322 { 323 range_seg_t *rs; 324 avl_index_t where; 325 326 mutex_enter(rt->rt_lock); 327 rs = range_tree_find(rt, off, size, &where); 328 if (rs != NULL) 329 panic("freeing free block; rs=%p", (void *)rs); 330 mutex_exit(rt->rt_lock); 331 } 332 333 boolean_t 334 range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size) 335 { 336 avl_index_t where; 337 338 return (range_tree_find(rt, start, size, &where) != NULL); 339 } 340 341 void 342 range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst) 343 { 344 range_tree_t *rt; 345 346 ASSERT(MUTEX_HELD((*rtsrc)->rt_lock)); 347 ASSERT0(range_tree_space(*rtdst)); 348 ASSERT0(avl_numnodes(&(*rtdst)->rt_root)); 349 350 rt = *rtsrc; 351 *rtsrc = *rtdst; 352 *rtdst = rt; 353 } 354 355 void 356 range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg) 357 { 358 range_seg_t *rs; 359 void *cookie = NULL; 360 361 ASSERT(MUTEX_HELD(rt->rt_lock)); 362 363 if (rt->rt_ops != NULL) 364 rt->rt_ops->rtop_vacate(rt, rt->rt_arg); 365 366 while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) { 367 if (func != NULL) 368 func(arg, rs->rs_start, rs->rs_end - rs->rs_start); 369 kmem_cache_free(range_seg_cache, rs); 370 } 371 372 bzero(rt->rt_histogram, sizeof (rt->rt_histogram)); 373 rt->rt_space = 0; 374 } 375 376 void 377 range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg) 378 { 379 range_seg_t *rs; 380 381 ASSERT(MUTEX_HELD(rt->rt_lock)); 382 383 for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs)) 384 func(arg, rs->rs_start, rs->rs_end - rs->rs_start); 385 } 386 387 uint64_t 388 range_tree_space(range_tree_t *rt) 389 { 390 return (rt->rt_space); 391 }