1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/types.h> 27 #include <sys/ksynch.h> 28 #include <sys/kmem.h> 29 #include <sys/errno.h> 30 #include <sys/cmn_err.h> 31 #include <sys/debug.h> 32 #include <sys/cred.h> 33 #include <sys/file.h> 34 #include <sys/ddi.h> 35 #include <sys/nsctl/nsctl.h> 36 #include <sys/unistat/spcs_s.h> 37 #include <sys/unistat/spcs_errors.h> 38 39 #include <sys/unistat/spcs_s_k.h> 40 #include "dsw.h" 41 #include "dsw_dev.h" 42 43 #ifdef DS_DDICT 44 #include "../contract.h" 45 #endif 46 47 #include <sys/sdt.h> /* dtrace is S10 or later */ 48 49 /* 50 * Instant Image. 51 * 52 * This file contains the chunk map lookup functions of II. 53 * 54 */ 55 #define CHUNK_FBA(chunk) DSW_CHK2FBA(chunk) 56 57 extern int ii_debug; /* debug level switch */ 58 int ii_map_debug = 0; 59 60 #ifdef II_MULTIMULTI_TERABYTE 61 typedef int64_t nodeid_t; 62 typedef int32_t nodeid32_t; 63 #else 64 typedef int32_t nodeid_t; 65 #endif 66 67 typedef struct ii_node { 68 chunkid_t vchunk_id; /* virtual chunk id */ 69 } NODE; 70 71 typedef struct ii_nodelink_s { 72 chunkid_t next_chunk; 73 } ii_nodelink_t; 74 75 static int nodes_per_fba = FBA_SIZE(1) / sizeof (NODE); 76 77 ii_header_t *_ii_bm_header_get(_ii_info_t *ip, nsc_buf_t **tmp); 78 int _ii_bm_header_put(ii_header_t *hdr, _ii_info_t *ip, 79 nsc_buf_t *tmp); 80 void _ii_rlse_devs(_ii_info_t *, int); 81 int _ii_rsrv_devs(_ii_info_t *, int, int); 82 void _ii_error(_ii_info_t *, int); 83 /* 84 * Private functions for use in this file. 85 */ 86 static void free_node(_ii_info_t *ip, NODE *np, nodeid_t ni); 87 static chunkid_t ii_alloc_overflow(_ii_info_t *ip); 88 void ii_free_overflow(_ii_info_t *, chunkid_t); 89 extern int _ii_nsc_io(_ii_info_t *, int, nsc_fd_t *, int, nsc_off_t, 90 unsigned char *, nsc_size_t); 91 92 static int 93 update_tree_header(_ii_info_t *ip) 94 { 95 ii_header_t *header; 96 nsc_buf_t *tmp = NULL; 97 98 mutex_enter(&ip->bi_mutex); 99 header = _ii_bm_header_get(ip, &tmp); 100 if (header == NULL) { 101 /* bitmap is probably offline */ 102 mutex_exit(&ip->bi_mutex); 103 DTRACE_PROBE(_iit_update_tree_header_end); 104 return (1); 105 } 106 header->ii_mstchks = ip->bi_mstchks; 107 header->ii_shdchks = ip->bi_shdchks; 108 header->ii_shdchkused = ip->bi_shdchkused; 109 header->ii_shdfchk = ip->bi_shdfchk; 110 (void) _ii_bm_header_put(header, ip, tmp); 111 mutex_exit(&ip->bi_mutex); 112 113 return (0); 114 } 115 116 static int 117 update_overflow_header(_ii_info_t *ip, _ii_overflow_t *op) 118 { 119 (void) _ii_nsc_io(ip, KS_OVR, op->ii_dev->bi_fd, NSC_WRBUF, 120 II_OHEADER_FBA, (unsigned char *)&(op->ii_do), 121 sizeof (_ii_doverflow_t)); 122 123 return (0); 124 } 125 126 static int 127 node_io(_ii_info_t *ip, NODE *np, nodeid_t node, int flag) 128 { 129 int rc; 130 int node_fba; 131 int tree_fba = ip->bi_copyfba + (ip->bi_copyfba-ip->bi_shdfba); 132 int offset; 133 nsc_buf_t *tmp = NULL; 134 135 /* 136 * Don't use _ii_nsc_io() as _ii_nsc_io() requires io to start at 137 * an fba boundary. 138 */ 139 140 /* calculate location of node on bitmap file */ 141 offset = (node % nodes_per_fba) * sizeof (NODE); 142 node_fba = tree_fba + node / nodes_per_fba; 143 144 /* read disk block containing node */ 145 rc = nsc_alloc_buf(ip->bi_bmpfd, node_fba, 1, NSC_RDBUF|flag, &tmp); 146 if (!II_SUCCESS(rc)) { 147 _ii_error(ip, DSW_BMPOFFLINE); 148 if (tmp) 149 (void) nsc_free_buf(tmp); 150 151 DTRACE_PROBE(_iit_node_io_end); 152 return (1); 153 } 154 155 /* copy node and update bitmap file if needed */ 156 rc = 0; 157 if (flag == NSC_RDBUF) 158 bcopy(tmp->sb_vec->sv_addr+offset, np, sizeof (NODE)); 159 else { 160 bcopy(np, tmp->sb_vec->sv_addr+offset, sizeof (NODE)); 161 II_NSC_WRITE(ip, bitmap, rc, tmp, node_fba, 1, 0); 162 if (!II_SUCCESS(rc)) { 163 _ii_error(ip, DSW_BMPOFFLINE); 164 rc = EIO; 165 } 166 } 167 if (tmp) 168 (void) nsc_free_buf(tmp); 169 170 return (0); 171 } 172 173 static int 174 node_fba_fill(_ii_info_t *ip, nsc_size_t nchunks, chunkid_t vchunk_id) 175 { 176 int rc; 177 nsc_off_t fba; 178 nsc_size_t fbas; 179 nsc_size_t maxfbas; 180 nsc_buf_t *bp; 181 nsc_vec_t *vp; 182 183 /* Determine maximum number of FBAs to allocate */ 184 rc = nsc_maxfbas(ip->bi_bmpfd, 0, &maxfbas); 185 if (!II_SUCCESS(rc)) 186 maxfbas = DSW_CBLK_FBA; 187 188 /* Write out blocks of initialied NODEs */ 189 fba = ip->bi_copyfba + (ip->bi_copyfba-ip->bi_shdfba); 190 fbas = FBA_LEN(nchunks * sizeof (NODE)); 191 while (fbas > 0) { 192 193 /* Determine number of FBA to allocate this time */ 194 if (fbas < maxfbas) maxfbas = fbas; 195 196 /* Allocate buffer which map to FBAs containing NODEs */ 197 bp = NULL; 198 rc = nsc_alloc_buf(ip->bi_bmpfd, fba, maxfbas, NSC_WRBUF, &bp); 199 if (!II_SUCCESS(rc)) { 200 _ii_error(ip, DSW_BMPOFFLINE); 201 DTRACE_PROBE(alloc_buf_failed); 202 return (EIO); 203 } 204 205 /* traverse vector list, filling wth initialized NODEs */ 206 for (vp = bp->sb_vec; vp->sv_addr && vp->sv_len; vp++) { 207 NODE *pnode = (NODE *)vp->sv_addr; 208 NODE *enode = (NODE *)(vp->sv_addr + vp->sv_len); 209 while (pnode < enode) { 210 pnode->vchunk_id = vchunk_id; 211 pnode++; 212 } 213 } 214 215 /* write FBAs containing initialized NODEs */ 216 II_NSC_WRITE(ip, bitmap, rc, bp, fba, maxfbas, 0); 217 if (!II_SUCCESS(rc)) { 218 _ii_error(ip, DSW_BMPOFFLINE); 219 (void) nsc_free_buf(bp); 220 DTRACE_PROBE(write_failed); 221 return (EIO); 222 } 223 224 /* free the buffer */ 225 (void) nsc_free_buf(bp); 226 227 /* Adjust nsc buffer values */ 228 fba += maxfbas; 229 fbas -= maxfbas; 230 } 231 232 return (0); 233 } 234 235 /* 236 * Reads the node into core and returns a pointer to it. 237 */ 238 239 static NODE * 240 read_node(_ii_info_t *ip, nodeid_t node) 241 { 242 NODE *new; 243 244 new = (NODE *)kmem_alloc(sizeof (NODE), KM_SLEEP); 245 246 if (node_io(ip, new, node, NSC_RDBUF)) { 247 kmem_free(new, sizeof (NODE)); 248 new = NULL; 249 } 250 251 return (new); 252 } 253 254 255 static chunkid_t 256 alloc_chunk(_ii_info_t *ip) 257 { 258 ii_nodelink_t nl; 259 int fba; 260 chunkid_t rc = II_NULLCHUNK; 261 262 mutex_enter(&ip->bi_chksmutex); 263 if (ip->bi_shdchkused < ip->bi_shdchks) { 264 rc = ip->bi_shdchkused++; 265 } else if (ip->bi_shdfchk != II_NULLCHUNK) { 266 ASSERT(ip->bi_shdfchk >= 0 && ip->bi_shdfchk < ip->bi_shdchks); 267 rc = ip->bi_shdfchk; 268 fba = CHUNK_FBA(rc); 269 (void) _ii_rsrv_devs(ip, SHDR, II_INTERNAL); 270 (void) _ii_nsc_io(ip, KS_SHD, SHDFD(ip), NSC_RDBUF, fba, 271 (unsigned char *)&nl, sizeof (nl)); 272 _ii_rlse_devs(ip, SHDR); 273 ip->bi_shdfchk = nl.next_chunk; 274 ASSERT(ip->bi_shdfchk == II_NULLCHUNK || 275 (ip->bi_shdfchk >= 0 && ip->bi_shdfchk < ip->bi_shdchks)); 276 } else { 277 278 /* into overflow */ 279 rc = ii_alloc_overflow(ip); 280 } 281 mutex_exit(&ip->bi_chksmutex); 282 (void) update_tree_header(ip); 283 284 return (rc); 285 } 286 287 /* 288 * releases memory for node 289 */ 290 static void /*ARGSUSED*/ 291 release_node(_ii_info_t *ip, NODE *np, nodeid_t ni) 292 { 293 kmem_free(np, sizeof (NODE)); 294 295 } 296 297 static void 298 write_node(_ii_info_t *ip, NODE *np, nodeid_t ni) 299 { 300 (void) node_io(ip, np, ni, NSC_WRBUF); 301 release_node(ip, np, ni); 302 303 } 304 305 static void 306 free_node(_ii_info_t *ip, NODE *np, nodeid_t ni) 307 { 308 ii_nodelink_t nl; 309 int fba; 310 311 if (np == NULL) { 312 DTRACE_PROBE(_iit_free_node_end); 313 return; 314 } 315 316 mutex_enter(&ip->bi_chksmutex); 317 if (II_ISOVERFLOW(np->vchunk_id)) { 318 /* link chunk onto overflow free list */ 319 ii_free_overflow(ip, np->vchunk_id); 320 } else { 321 /* write old free list head into chunk */ 322 nl.next_chunk = ip->bi_shdfchk; 323 ip->bi_shdfchk = np->vchunk_id; 324 ASSERT(ip->bi_shdfchk == II_NULLCHUNK || 325 (ip->bi_shdfchk >= 0 && ip->bi_shdfchk < ip->bi_shdchks)); 326 fba = CHUNK_FBA(np->vchunk_id); 327 (void) _ii_rsrv_devs(ip, SHDR, II_INTERNAL); 328 (void) _ii_nsc_io(ip, KS_SHD, SHDFD(ip), NSC_WRBUF, fba, 329 (unsigned char *)&nl, sizeof (nl)); 330 _ii_rlse_devs(ip, SHDR); 331 /* update free counts */ 332 /* ip->bi_unused++; */ 333 } 334 np->vchunk_id = II_NULLCHUNK; 335 (void) node_io(ip, np, ni, NSC_WRBUF); 336 (void) update_tree_header(ip); 337 mutex_exit(&ip->bi_chksmutex); 338 339 } 340 341 /* 342 * Public functions for dsw_dev to use. 343 */ 344 345 /* 346 * Overflow volume functions. 347 */ 348 349 /* put overflow chunk on the overflow volume free list */ 350 void 351 ii_free_overflow(_ii_info_t *ip, chunkid_t chunk) 352 { 353 ii_nodelink_t nl; 354 _ii_overflow_t *op; 355 int fba; 356 357 if (!II_ISOVERFLOW(chunk)) { 358 DTRACE_PROBE(_iit_free_overflow_end_1); 359 return; 360 } 361 chunk = II_2OVERFLOW(chunk); 362 363 op = ip->bi_overflow; 364 if (op == NULL) { 365 #ifdef DEBUG 366 cmn_err(CE_PANIC, "overflow used, but not attached ip %p", 367 (void *) ip); 368 #endif 369 DTRACE_PROBE(_iit_free_overflow_end_2); 370 return; 371 } 372 mutex_enter(&(op->ii_mutex)); 373 374 DTRACE_PROBE(_iit_free_overflow); 375 376 /* write old free list head into chunk */ 377 nl.next_chunk = op->ii_freehead; 378 fba = CHUNK_FBA(chunk); 379 (void) nsc_reserve(op->ii_dev->bi_fd, NSC_MULTI); 380 (void) _ii_nsc_io(ip, KS_OVR, op->ii_dev->bi_fd, NSC_WRBUF, fba, 381 (unsigned char *)&nl, sizeof (nl)); 382 /* update free counts */ 383 op->ii_unused++; 384 ASSERT(op->ii_used > 0); /* always use 1 for header */ 385 386 /* write chunk id into header freelist start */ 387 op->ii_freehead = chunk; 388 389 (void) update_overflow_header(ip, op); 390 nsc_release(op->ii_dev->bi_fd); 391 mutex_exit(&(op->ii_mutex)); 392 393 } 394 395 /* reclaim any overflow storage used by the volume */ 396 void 397 ii_reclaim_overflow(_ii_info_t *ip) 398 { 399 NODE *node; 400 nodeid_t node_id; 401 _ii_overflow_t *op; 402 403 if ((ip->bi_flags & (DSW_VOVERFLOW | DSW_FRECLAIM)) == 0) { 404 DTRACE_PROBE(_iit_reclaim_overflow_end); 405 return; 406 } 407 408 /* 409 * Determine whether overflow should be reclaimed: 410 * 1/ If we're not doing a group volume update 411 * OR 412 * 2/ If the number of detaches != number of attached vols 413 */ 414 op = ip->bi_overflow; 415 if (op && (((op->ii_flags & IIO_VOL_UPDATE) == 0) || 416 (op->ii_detachcnt != op->ii_drefcnt))) { 417 #ifndef II_MULTIMULTI_TERABYTE 418 /* assert volume size fits into node_id */ 419 ASSERT(ip->bi_mstchks <= INT32_MAX); 420 #endif 421 for (node_id = 0; node_id < ip->bi_mstchks; node_id++) { 422 if ((node = read_node(ip, node_id)) == NULL) { 423 DTRACE_PROBE(_iit_reclaim_overflow_end); 424 return; 425 } 426 ii_free_overflow(ip, node->vchunk_id); 427 release_node(ip, node, node_id); 428 } 429 } else { 430 /* need to reset the overflow volume header */ 431 op->ii_freehead = II_NULLNODE; 432 op->ii_used = 1; /* we have used the header */ 433 op->ii_unused = op->ii_nchunks - op->ii_used; 434 (void) update_overflow_header(ip, op); 435 } 436 437 DTRACE_PROBE(_iit_reclaim_overflow); 438 439 if ((ip->bi_flags & DSW_VOVERFLOW) == DSW_VOVERFLOW) { 440 mutex_enter(&ip->bi_mutex); 441 II_FLAG_CLR(DSW_VOVERFLOW, ip); 442 mutex_exit(&ip->bi_mutex); 443 } 444 --iigkstat.spilled_over.value.ul; 445 446 } 447 448 static chunkid_t 449 ii_alloc_overflow(_ii_info_t *ip) 450 { 451 chunkid_t chunk; 452 ii_nodelink_t nl; 453 _ii_overflow_t *op; 454 int fba; 455 456 if ((op = ip->bi_overflow) == NULL) { 457 DTRACE_PROBE(_iit_alloc_overflow_end); 458 return (II_NULLCHUNK); /* no overflow volume attached */ 459 } 460 461 mutex_enter(&(op->ii_mutex)); 462 463 DTRACE_PROBE(_iit_alloc_overflow); 464 465 if (op->ii_unused < 1) { 466 mutex_exit(&(op->ii_mutex)); 467 DTRACE_PROBE(_iit_alloc_overflow_end); 468 return (II_NULLCHUNK); 469 } 470 (void) nsc_reserve(op->ii_dev->bi_fd, NSC_MULTI); 471 if (op->ii_freehead != II_NULLCHUNK) { 472 /* pick first from free list */ 473 chunk = op->ii_freehead; 474 fba = CHUNK_FBA(chunk); 475 (void) _ii_nsc_io(ip, KS_OVR, op->ii_dev->bi_fd, NSC_RDBUF, fba, 476 (unsigned char *)&nl, sizeof (nl)); 477 op->ii_freehead = nl.next_chunk; 478 /* decrease unused count, fix bug 4419956 */ 479 op->ii_unused--; 480 } else { 481 /* otherwise pick first unused */ 482 if (op->ii_used > op->ii_nchunks) 483 chunk = II_NULLCHUNK; 484 else { 485 chunk = op->ii_used++; 486 op->ii_unused--; 487 } 488 } 489 if (chunk != II_NULLCHUNK) { 490 chunk = II_2OVERFLOW(chunk); 491 if ((ip->bi_flags&DSW_VOVERFLOW) == 0) { 492 mutex_enter(&ip->bi_mutex); 493 II_FLAG_SET(DSW_VOVERFLOW, ip); 494 mutex_exit(&ip->bi_mutex); 495 ++iigkstat.spilled_over.value.ul; 496 } 497 } 498 (void) update_overflow_header(ip, op); 499 nsc_release(op->ii_dev->bi_fd); 500 mutex_exit(&(op->ii_mutex)); 501 502 return (chunk); 503 } 504 /* 505 * Find or insert key into search tree. 506 */ 507 508 chunkid_t 509 ii_tsearch(_ii_info_t *ip, chunkid_t chunk_id) 510 /* Address of the root of the tree */ 511 { 512 NODE *rootp = NULL; 513 chunkid_t n; /* New node id if key not found */ 514 515 if ((rootp = read_node(ip, chunk_id)) == NULL) { 516 DTRACE_PROBE(_iit_tsearch_end); 517 return (II_NULLNODE); 518 } 519 n = rootp->vchunk_id; 520 if (n != II_NULLCHUNK) { /* chunk allocated, return location */ 521 release_node(ip, rootp, 0); 522 DTRACE_PROBE(_iit_tsearch_end); 523 return (n); 524 } 525 n = alloc_chunk(ip); 526 if (n != II_NULLCHUNK) { 527 rootp->vchunk_id = n; 528 write_node(ip, rootp, chunk_id); 529 } else 530 release_node(ip, rootp, 0); 531 532 return (n); 533 } 534 535 /* Delete node with key chunkid */ 536 void 537 ii_tdelete(_ii_info_t *ip, 538 chunkid_t chunkid) /* Key to be deleted */ 539 { 540 NODE *np = NULL; 541 542 if ((np = read_node(ip, chunkid)) == NULL) { 543 DTRACE_PROBE(_iit_tdelete_end); 544 return; 545 } 546 547 ASSERT(np->vchunk_id != II_NULLCHUNK); 548 free_node(ip, np, chunkid); 549 np->vchunk_id = II_NULLCHUNK; 550 write_node(ip, np, chunkid); 551 552 } 553 554 /* 555 * initialise an empty map for ip 556 */ 557 558 int 559 ii_tinit(_ii_info_t *ip) 560 { 561 int rc = 0; 562 563 /* overflow can't be attached before first call to this function */ 564 if (ip->bi_overflow) 565 ii_reclaim_overflow(ip); 566 567 mutex_enter(&ip->bi_chksmutex); 568 ip->bi_shdfchk = II_NULLCHUNK; /* set freelist to empty chain */ 569 ip->bi_shdchkused = 0; 570 571 /* fill index (bi_mstchks size) with II_NULLCHUNK */ 572 rc = node_fba_fill(ip, ip->bi_mstchks, II_NULLCHUNK); 573 if (rc == 0) 574 rc = update_tree_header(ip); 575 mutex_exit(&ip->bi_chksmutex); 576 577 return (rc); 578 } 579 580 /* 581 * Calculate the size of map space provided by a bitmap volume with 582 * tree_len fba's spare for the tree. 583 */ 584 585 nsc_size_t 586 ii_btsize(nsc_size_t tree_len) 587 { 588 nsc_size_t nchunks; 589 590 nchunks = tree_len * nodes_per_fba; 591 592 if (ii_debug > 1) 593 cmn_err(CE_NOTE, 594 "!ii_btsize: bitmap with %" NSC_SZFMT 595 " spare fba's will map %" NSC_SZFMT " chunks", 596 tree_len, nchunks); 597 598 return (nchunks); 599 }