/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2013 by Delphix. All rights reserved.
 */

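/*
 * A range tree is a collection of non-overlapping, half-open segments
 * [rs_start, rs_end), kept sorted in an AVL tree.  The tree tracks the
 * total space covered by its segments (rt_space), maintains a
 * power-of-two histogram of segment sizes (rt_histogram), and notifies
 * an optional ops vector (rt_ops) as segments are added and removed.
 */
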
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/dnode.h>
#include <sys/zio.h>
#include <sys/range_tree.h>

static kmem_cache_t *range_seg_cache;

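/*
 * One-time module setup: create the kmem cache from which all
 * range_seg_t allocations are made.
 */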
void
range_tree_init(void)
{
        ASSERT(range_seg_cache == NULL);
        range_seg_cache = kmem_cache_create("range_seg_cache",
            sizeof (range_seg_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
}

void
range_tree_fini(void)
{
        kmem_cache_destroy(range_seg_cache);
        range_seg_cache = NULL;
}

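/*
 * Recompute the segment-size histogram from the tree's contents and
 * verify that it matches the incrementally maintained rt_histogram.
 */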
void
range_tree_stat_verify(range_tree_t *rt)
{
        range_seg_t *rs;
        uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 };
        int i;

        for (rs = avl_first(&rt->rt_root); rs != NULL;
            rs = AVL_NEXT(&rt->rt_root, rs)) {
                uint64_t size = rs->rs_end - rs->rs_start;
                int idx = highbit(size) - 1;

                hist[idx]++;
                ASSERT3U(hist[idx], !=, 0);
        }

        for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
                if (hist[i] != rt->rt_histogram[i]) {
                        zfs_dbgmsg("i=%d, hist=%p, hist[i]=%llu, "
                            "rt_hist[i]=%llu",
                            i, hist, hist[i], rt->rt_histogram[i]);
                }
                VERIFY3U(hist[i], ==, rt->rt_histogram[i]);
        }
}

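/*
 * Histogram bucket i counts segments whose size falls in
 * [2^i, 2^(i+1)); highbit(size) - 1 selects the bucket.
 */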
static void
range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs)
{
        uint64_t size = rs->rs_end - rs->rs_start;
        int idx = highbit(size) - 1;

        ASSERT3U(idx, <,
            sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));

        ASSERT(MUTEX_HELD(rt->rt_lock));
        rt->rt_histogram[idx]++;
        ASSERT3U(rt->rt_histogram[idx], !=, 0);
}

static void
range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs)
{
        uint64_t size = rs->rs_end - rs->rs_start;
        int idx = highbit(size) - 1;

        ASSERT3U(idx, <,
            sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));

        ASSERT(MUTEX_HELD(rt->rt_lock));
        ASSERT3U(rt->rt_histogram[idx], !=, 0);
        rt->rt_histogram[idx]--;
}

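/*
 * Segments are ordered by start offset, but any two overlapping
 * segments compare equal; an avl_find() with a search segment
 * therefore returns an in-tree segment that overlaps it, if any.
 */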
/*
 * NOTE: caller is responsible for all locking.
 */
static int
range_tree_seg_compare(const void *x1, const void *x2)
{
        const range_seg_t *r1 = x1;
        const range_seg_t *r2 = x2;

        if (r1->rs_start < r2->rs_start) {
                if (r1->rs_end > r2->rs_start)
                        return (0);
                return (-1);
        }
        if (r1->rs_start > r2->rs_start) {
                if (r1->rs_start < r2->rs_end)
                        return (0);
                return (1);
        }
        return (0);
}

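/*
 * Create an empty range tree.  The caller provides the lock protecting
 * the tree and, optionally, an ops vector (with its argument) to be
 * notified as the tree's contents change.
 */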
range_tree_t *
range_tree_create(range_tree_ops_t *ops, void *arg, kmutex_t *lp)
{
        range_tree_t *rt;

        rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);

        avl_create(&rt->rt_root, range_tree_seg_compare,
            sizeof (range_seg_t), offsetof(range_seg_t, rs_node));

        rt->rt_lock = lp;
        rt->rt_ops = ops;
        rt->rt_arg = arg;

        if (rt->rt_ops != NULL)
                rt->rt_ops->rtop_create(rt, rt->rt_arg);

        return (rt);
}

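/*
 * Destroy a range tree, which must no longer cover any space.
 */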
void
range_tree_destroy(range_tree_t *rt)
{
        VERIFY0(rt->rt_space);

        if (rt->rt_ops != NULL)
                rt->rt_ops->rtop_destroy(rt, rt->rt_arg);

        avl_destroy(&rt->rt_root);
        kmem_free(rt, sizeof (*rt));
}

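/*
 * Add the range [start, start + size) to the tree, merging it with any
 * existing segment it abuts.  Adding a range that is already entirely
 * covered triggers zfs_panic_recover(); any other overlap fails a
 * VERIFY().
 */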
void
range_tree_add(void *arg, uint64_t start, uint64_t size)
{
        range_tree_t *rt = arg;
        avl_index_t where;
        range_seg_t rsearch, *rs_before, *rs_after, *rs;
        uint64_t end = start + size;
        boolean_t merge_before, merge_after;

        ASSERT(MUTEX_HELD(rt->rt_lock));
        VERIFY(size != 0);

        rsearch.rs_start = start;
        rsearch.rs_end = end;
        rs = avl_find(&rt->rt_root, &rsearch, &where);

        if (rs != NULL && rs->rs_start <= start && rs->rs_end >= end) {
                zfs_panic_recover("zfs: allocating allocated segment "
                    "(offset=%llu size=%llu)\n",
                    (longlong_t)start, (longlong_t)size);
                return;
        }

        /* Make sure we don't overlap with either of our neighbors */
        VERIFY(rs == NULL);

        rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE);
        rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER);

        merge_before = (rs_before != NULL && rs_before->rs_end == start);
        merge_after = (rs_after != NULL && rs_after->rs_start == end);

        if (merge_before && merge_after) {
                avl_remove(&rt->rt_root, rs_before);
                if (rt->rt_ops != NULL) {
                        rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
                        rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
                }

                range_tree_stat_decr(rt, rs_before);
                range_tree_stat_decr(rt, rs_after);

                rs_after->rs_start = rs_before->rs_start;
                kmem_cache_free(range_seg_cache, rs_before);
                rs = rs_after;
        } else if (merge_before) {
                if (rt->rt_ops != NULL)
                        rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);

                range_tree_stat_decr(rt, rs_before);

                rs_before->rs_end = end;
                rs = rs_before;
        } else if (merge_after) {
                if (rt->rt_ops != NULL)
                        rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);

                range_tree_stat_decr(rt, rs_after);

                rs_after->rs_start = start;
                rs = rs_after;
        } else {
                rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
                rs->rs_start = start;
                rs->rs_end = end;
                avl_insert(&rt->rt_root, rs, where);
        }

        if (rt->rt_ops != NULL)
                rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);

        range_tree_stat_incr(rt, rs);
        rt->rt_space += size;
}

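/*
 * Remove the range [start, start + size) from the tree.  The range
 * must lie entirely within one segment; removing it from the middle of
 * a segment splits that segment in two.  Freeing an entirely free
 * range triggers zfs_panic_recover(); a partial overlap fails a
 * VERIFY().
 */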
void
range_tree_remove(void *arg, uint64_t start, uint64_t size)
{
        range_tree_t *rt = arg;
        avl_index_t where;
        range_seg_t rsearch, *rs, *newseg;
        uint64_t end = start + size;
        boolean_t left_over, right_over;

        ASSERT(MUTEX_HELD(rt->rt_lock));
        VERIFY3U(size, !=, 0);
        VERIFY3U(size, <=, rt->rt_space);

        rsearch.rs_start = start;
        rsearch.rs_end = end;
        rs = avl_find(&rt->rt_root, &rsearch, &where);

        /* Make sure we completely overlap with someone */
        if (rs == NULL) {
                zfs_panic_recover("zfs: freeing free segment "
                    "(offset=%llu size=%llu)",
                    (longlong_t)start, (longlong_t)size);
                return;
        }
        VERIFY3U(rs->rs_start, <=, start);
        VERIFY3U(rs->rs_end, >=, end);

        left_over = (rs->rs_start != start);
        right_over = (rs->rs_end != end);

        range_tree_stat_decr(rt, rs);

        if (rt->rt_ops != NULL)
                rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);

        if (left_over && right_over) {
                newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
                newseg->rs_start = end;
                newseg->rs_end = rs->rs_end;
                range_tree_stat_incr(rt, newseg);

                rs->rs_end = start;

                avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER);
                if (rt->rt_ops != NULL)
                        rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg);
        } else if (left_over) {
                rs->rs_end = start;
        } else if (right_over) {
                rs->rs_start = end;
        } else {
                avl_remove(&rt->rt_root, rs);
                kmem_cache_free(range_seg_cache, rs);
                rs = NULL;
        }

        if (rs != NULL) {
                range_tree_stat_incr(rt, rs);

                if (rt->rt_ops != NULL)
                        rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
        }

        rt->rt_space -= size;
}

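/*
 * Return the segment that entirely contains [start, start + size),
 * or NULL if there is none.
 */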
static range_seg_t *
range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size,
    avl_index_t *wherep)
{
        range_seg_t rsearch, *rs;
        uint64_t end = start + size;

        ASSERT(MUTEX_HELD(rt->rt_lock));
        VERIFY(size != 0);

        rsearch.rs_start = start;
        rsearch.rs_end = end;
        rs = avl_find(&rt->rt_root, &rsearch, wherep);

        if (rs != NULL && rs->rs_start <= start && rs->rs_end >= end)
                return (rs);
        return (NULL);
}

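/*
 * Panic if the given range is covered by the tree; used to catch
 * frees of blocks that are already free.
 */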
void
range_tree_verify(range_tree_t *rt, uint64_t off, uint64_t size)
{
        range_seg_t *rs;
        avl_index_t where;

        mutex_enter(rt->rt_lock);
        rs = range_tree_find(rt, off, size, &where);
        if (rs != NULL)
                panic("freeing free block; rs=%p", (void *)rs);
        mutex_exit(rt->rt_lock);
}

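/*
 * Return B_TRUE if the entire range [start, start + size) lies within
 * a single segment of the tree.
 */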
boolean_t
range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size)
{
        avl_index_t where;

        return (range_tree_find(rt, start, size, &where) != NULL);
}

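/*
 * Exchange the contents of two range trees by swapping the pointers;
 * the destination tree must be empty.
 */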
void
range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst)
{
        range_tree_t *rt;

        ASSERT(MUTEX_HELD((*rtsrc)->rt_lock));
        ASSERT0(range_tree_space(*rtdst));
        ASSERT0(avl_numnodes(&(*rtdst)->rt_root));

        rt = *rtsrc;
        *rtsrc = *rtdst;
        *rtdst = rt;
}

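/*
 * Empty the tree, invoking func (if non-NULL) on each segment before
 * it is freed, and reset the space total and histogram.
 */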
void
range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg)
{
        range_seg_t *rs;
        void *cookie = NULL;

        ASSERT(MUTEX_HELD(rt->rt_lock));

        if (rt->rt_ops != NULL)
                rt->rt_ops->rtop_vacate(rt, rt->rt_arg);

        while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) {
                if (func != NULL)
                        func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
                kmem_cache_free(range_seg_cache, rs);
        }

        bzero(rt->rt_histogram, sizeof (rt->rt_histogram));
        rt->rt_space = 0;
}

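/*
 * Invoke func on each segment, in order of increasing start offset.
 */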
void
range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg)
{
        range_seg_t *rs;

        ASSERT(MUTEX_HELD(rt->rt_lock));

        for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs))
                func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
}

uint64_t
range_tree_space(range_tree_t *rt)
{
        return (rt->rt_space);
}