1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 #include <sys/types.h>
  27 #include <sys/ksynch.h>
  28 #include <sys/kmem.h>
  29 #include <sys/errno.h>
  30 #include <sys/cmn_err.h>
  31 #include <sys/debug.h>
  32 #include <sys/cred.h>
  33 #include <sys/file.h>
  34 #include <sys/ddi.h>
  35 #include <sys/nsctl/nsctl.h>
  36 #include <sys/unistat/spcs_s.h>
  37 #include <sys/unistat/spcs_errors.h>
  38 
  39 #include <sys/unistat/spcs_s_k.h>
  40 #include "dsw.h"
  41 #include "dsw_dev.h"
  42 
  43 #ifdef DS_DDICT
  44 #include "../contract.h"
  45 #endif
  46 
  47 #include <sys/sdt.h>              /* dtrace is S10 or later */
  48 
  49 /*
  50  * Instant Image.
  51  *
  52  * This file contains the chunk map lookup functions of II.
  53  *
  54  */
/*
 * Convert a chunk id into the FBA that is handed to _ii_nsc_io() when a
 * free-list link is read from or written to that chunk.  Thin alias for
 * DSW_CHK2FBA().
 */
#define CHUNK_FBA(chunk) DSW_CHK2FBA(chunk)

extern int ii_debug;    /* debug level switch */
/* map-specific debug switch; not referenced by the code visible here */
int ii_map_debug = 0;
  59 
/*
 * nodeid_t is the index of a NODE within the on-disk map.  It is 64 bits
 * wide when II_MULTIMULTI_TERABYTE is defined and 32 bits otherwise.
 * nodeid32_t is only declared in the wide build and is not used by the
 * code visible in this file.
 */
#ifdef II_MULTIMULTI_TERABYTE
typedef int64_t nodeid_t;
typedef int32_t nodeid32_t;
#else
typedef int32_t nodeid_t;
#endif
  66 
/*
 * One map entry.  Entries are stored back to back on the bitmap device
 * (see node_io()) and are looked up by the chunk id passed to
 * ii_tsearch()/ii_tdelete().  A vchunk_id of II_NULLCHUNK means that no
 * chunk has been allocated for the entry yet.
 */
typedef struct  ii_node {
        chunkid_t       vchunk_id;              /* virtual chunk id */
} NODE;
  70 
/*
 * Free-list link.  It is written at CHUNK_FBA() of a chunk when the chunk
 * is freed and read back when the chunk is reused; next_chunk is the
 * previous head of the free list (II_NULLCHUNK terminates the list).
 */
typedef struct ii_nodelink_s {
        chunkid_t       next_chunk;
} ii_nodelink_t;
  74 
/*
 * Number of NODE entries held in FBA_SIZE(1) bytes.  Used by node_io() to
 * turn a node id into an FBA plus byte offset, and by ii_btsize() to size
 * the map.
 */
static  int     nodes_per_fba = FBA_SIZE(1) / sizeof (NODE);
  76 
/*
 * Routines used by this file but not defined in it.
 */
ii_header_t *_ii_bm_header_get(_ii_info_t *ip, nsc_buf_t **tmp);
int _ii_bm_header_put(ii_header_t *hdr, _ii_info_t *ip,
    nsc_buf_t *tmp);
void _ii_rlse_devs(_ii_info_t *, int);
int _ii_rsrv_devs(_ii_info_t *, int, int);
void _ii_error(_ii_info_t *, int);
/*
 * Private functions for use in this file.
 * (ii_free_overflow() is declared here for forward reference but has
 * external linkage; _ii_nsc_io() is defined outside this file.)
 */
static void free_node(_ii_info_t *ip, NODE *np, nodeid_t ni);
static chunkid_t ii_alloc_overflow(_ii_info_t *ip);
void ii_free_overflow(_ii_info_t *, chunkid_t);
extern int _ii_nsc_io(_ii_info_t *, int, nsc_fd_t *, int, nsc_off_t,
    unsigned char *, nsc_size_t);
  91 
  92 static int
  93 update_tree_header(_ii_info_t *ip)
  94 {
  95         ii_header_t *header;
  96         nsc_buf_t       *tmp = NULL;
  97 
  98         mutex_enter(&ip->bi_mutex);
  99         header = _ii_bm_header_get(ip, &tmp);
 100         if (header == NULL) {
 101                 /* bitmap is probably offline */
 102                 mutex_exit(&ip->bi_mutex);
 103                 DTRACE_PROBE(_iit_update_tree_header_end);
 104                 return (1);
 105         }
 106         header->ii_mstchks = ip->bi_mstchks;
 107         header->ii_shdchks = ip->bi_shdchks;
 108         header->ii_shdchkused = ip->bi_shdchkused;
 109         header->ii_shdfchk = ip->bi_shdfchk;
 110         (void) _ii_bm_header_put(header, ip, tmp);
 111         mutex_exit(&ip->bi_mutex);
 112 
 113         return (0);
 114 }
 115 
 116 static int
 117 update_overflow_header(_ii_info_t *ip, _ii_overflow_t *op)
 118 {
 119         (void) _ii_nsc_io(ip, KS_OVR, op->ii_dev->bi_fd, NSC_WRBUF,
 120             II_OHEADER_FBA, (unsigned char *)&(op->ii_do),
 121             sizeof (_ii_doverflow_t));
 122 
 123         return (0);
 124 }
 125 
 126 static int
 127 node_io(_ii_info_t *ip, NODE *np, nodeid_t node, int flag)
 128 {
 129         int     rc;
 130         int     node_fba;
 131         int     tree_fba = ip->bi_copyfba + (ip->bi_copyfba-ip->bi_shdfba);
 132         int     offset;
 133         nsc_buf_t *tmp = NULL;
 134 
 135         /*
 136          * Don't use _ii_nsc_io() as _ii_nsc_io() requires io to start at
 137          * an fba boundary.
 138          */
 139 
 140         /* calculate location of node on bitmap file */
 141         offset = (node % nodes_per_fba) * sizeof (NODE);
 142         node_fba = tree_fba + node / nodes_per_fba;
 143 
 144         /* read disk block containing node */
 145         rc = nsc_alloc_buf(ip->bi_bmpfd, node_fba, 1, NSC_RDBUF|flag, &tmp);
 146         if (!II_SUCCESS(rc)) {
 147                 _ii_error(ip, DSW_BMPOFFLINE);
 148                 if (tmp)
 149                         (void) nsc_free_buf(tmp);
 150 
 151                 DTRACE_PROBE(_iit_node_io_end);
 152                 return (1);
 153         }
 154 
 155         /* copy node and update bitmap file if needed */
 156         rc = 0;
 157         if (flag == NSC_RDBUF)
 158                 bcopy(tmp->sb_vec->sv_addr+offset, np, sizeof (NODE));
 159         else {
 160                 bcopy(np, tmp->sb_vec->sv_addr+offset, sizeof (NODE));
 161                 II_NSC_WRITE(ip, bitmap, rc, tmp, node_fba, 1, 0);
 162                 if (!II_SUCCESS(rc)) {
 163                         _ii_error(ip, DSW_BMPOFFLINE);
 164                         rc = EIO;
 165                 }
 166         }
 167         if (tmp)
 168                 (void) nsc_free_buf(tmp);
 169 
 170         return (0);
 171 }
 172 
 173 static int
 174 node_fba_fill(_ii_info_t *ip, nsc_size_t nchunks, chunkid_t vchunk_id)
 175 {
 176         int     rc;
 177         nsc_off_t       fba;
 178         nsc_size_t      fbas;
 179         nsc_size_t      maxfbas;
 180         nsc_buf_t *bp;
 181         nsc_vec_t *vp;
 182 
 183         /* Determine maximum number of FBAs to allocate */
 184         rc =  nsc_maxfbas(ip->bi_bmpfd, 0, &maxfbas);
 185         if (!II_SUCCESS(rc))
 186                 maxfbas = DSW_CBLK_FBA;
 187 
 188         /* Write out blocks of initialied NODEs */
 189         fba = ip->bi_copyfba + (ip->bi_copyfba-ip->bi_shdfba);
 190         fbas = FBA_LEN(nchunks * sizeof (NODE));
 191         while (fbas > 0) {
 192 
 193                 /* Determine number of FBA to allocate this time */
 194                 if (fbas < maxfbas) maxfbas = fbas;
 195 
 196                 /* Allocate buffer which map to FBAs containing NODEs */
 197                 bp = NULL;
 198                 rc = nsc_alloc_buf(ip->bi_bmpfd, fba, maxfbas, NSC_WRBUF, &bp);
 199                 if (!II_SUCCESS(rc)) {
 200                         _ii_error(ip, DSW_BMPOFFLINE);
 201                         DTRACE_PROBE(alloc_buf_failed);
 202                         return (EIO);
 203                 }
 204 
 205                 /* traverse vector list, filling wth initialized NODEs */
 206                 for (vp = bp->sb_vec; vp->sv_addr && vp->sv_len; vp++) {
 207                         NODE *pnode = (NODE *)vp->sv_addr;
 208                         NODE *enode = (NODE *)(vp->sv_addr +  vp->sv_len);
 209                         while (pnode < enode) {
 210                                 pnode->vchunk_id = vchunk_id;
 211                                 pnode++;
 212                         }
 213                 }
 214 
 215                 /* write FBAs containing initialized NODEs */
 216                 II_NSC_WRITE(ip, bitmap, rc, bp, fba, maxfbas, 0);
 217                 if (!II_SUCCESS(rc)) {
 218                         _ii_error(ip, DSW_BMPOFFLINE);
 219                         (void) nsc_free_buf(bp);
 220                         DTRACE_PROBE(write_failed);
 221                         return (EIO);
 222                 }
 223 
 224                 /* free the buffer */
 225                 (void) nsc_free_buf(bp);
 226 
 227                 /* Adjust nsc buffer values */
 228                 fba += maxfbas;
 229                 fbas -= maxfbas;
 230         }
 231 
 232         return (0);
 233 }
 234 
 235 /*
 236  * Reads the node into core and returns a pointer to it.
 237  */
 238 
 239 static NODE *
 240 read_node(_ii_info_t *ip, nodeid_t node)
 241 {
 242         NODE *new;
 243 
 244         new = (NODE *)kmem_alloc(sizeof (NODE), KM_SLEEP);
 245 
 246         if (node_io(ip, new, node, NSC_RDBUF)) {
 247                 kmem_free(new, sizeof (NODE));
 248                 new = NULL;
 249         }
 250 
 251         return (new);
 252 }
 253 
 254 
 255 static chunkid_t
 256 alloc_chunk(_ii_info_t *ip)
 257 {
 258         ii_nodelink_t nl;
 259         int fba;
 260         chunkid_t rc = II_NULLCHUNK;
 261 
 262         mutex_enter(&ip->bi_chksmutex);
 263         if (ip->bi_shdchkused < ip->bi_shdchks) {
 264                 rc = ip->bi_shdchkused++;
 265         } else if (ip->bi_shdfchk != II_NULLCHUNK) {
 266                 ASSERT(ip->bi_shdfchk >= 0 && ip->bi_shdfchk < ip->bi_shdchks);
 267                 rc = ip->bi_shdfchk;
 268                 fba = CHUNK_FBA(rc);
 269                 (void) _ii_rsrv_devs(ip, SHDR, II_INTERNAL);
 270                 (void) _ii_nsc_io(ip, KS_SHD, SHDFD(ip), NSC_RDBUF, fba,
 271                     (unsigned char *)&nl, sizeof (nl));
 272                 _ii_rlse_devs(ip, SHDR);
 273                 ip->bi_shdfchk = nl.next_chunk;
 274                 ASSERT(ip->bi_shdfchk == II_NULLCHUNK ||
 275                     (ip->bi_shdfchk >= 0 && ip->bi_shdfchk < ip->bi_shdchks));
 276         } else {
 277 
 278                 /* into overflow */
 279                 rc = ii_alloc_overflow(ip);
 280         }
 281         mutex_exit(&ip->bi_chksmutex);
 282         (void) update_tree_header(ip);
 283 
 284         return (rc);
 285 }
 286 
 287 /*
 288  * releases memory for node
 289  */
 290 static void     /*ARGSUSED*/
 291 release_node(_ii_info_t *ip, NODE *np, nodeid_t ni)
 292 {
 293         kmem_free(np, sizeof (NODE));
 294 
 295 }
 296 
 297 static void
 298 write_node(_ii_info_t *ip, NODE *np, nodeid_t ni)
 299 {
 300         (void) node_io(ip, np, ni, NSC_WRBUF);
 301         release_node(ip, np, ni);
 302 
 303 }
 304 
 305 static void
 306 free_node(_ii_info_t *ip, NODE *np, nodeid_t ni)
 307 {
 308         ii_nodelink_t nl;
 309         int     fba;
 310 
 311         if (np == NULL) {
 312                 DTRACE_PROBE(_iit_free_node_end);
 313                 return;
 314         }
 315 
 316         mutex_enter(&ip->bi_chksmutex);
 317         if (II_ISOVERFLOW(np->vchunk_id)) {
 318                 /* link chunk onto overflow free list */
 319                 ii_free_overflow(ip, np->vchunk_id);
 320         } else {
 321                 /* write old free list head into chunk */
 322                 nl.next_chunk = ip->bi_shdfchk;
 323                 ip->bi_shdfchk = np->vchunk_id;
 324                 ASSERT(ip->bi_shdfchk == II_NULLCHUNK ||
 325                     (ip->bi_shdfchk >= 0 && ip->bi_shdfchk < ip->bi_shdchks));
 326                 fba = CHUNK_FBA(np->vchunk_id);
 327                 (void) _ii_rsrv_devs(ip, SHDR, II_INTERNAL);
 328                 (void) _ii_nsc_io(ip, KS_SHD, SHDFD(ip), NSC_WRBUF, fba,
 329                     (unsigned char *)&nl, sizeof (nl));
 330                 _ii_rlse_devs(ip, SHDR);
 331                 /* update free counts */
 332                 /* ip->bi_unused++; */
 333         }
 334         np->vchunk_id = II_NULLCHUNK;
 335         (void) node_io(ip, np, ni, NSC_WRBUF);
 336         (void) update_tree_header(ip);
 337         mutex_exit(&ip->bi_chksmutex);
 338 
 339 }
 340 
 341 /*
 342  * Public functions for dsw_dev to use.
 343  */
 344 
 345 /*
 346  * Overflow volume functions.
 347  */
 348 
 349 /* put overflow chunk on the overflow volume free list */
 350 void
 351 ii_free_overflow(_ii_info_t *ip, chunkid_t chunk)
 352 {
 353         ii_nodelink_t nl;
 354         _ii_overflow_t *op;
 355         int fba;
 356 
 357         if (!II_ISOVERFLOW(chunk)) {
 358                 DTRACE_PROBE(_iit_free_overflow_end_1);
 359                 return;
 360         }
 361         chunk = II_2OVERFLOW(chunk);
 362 
 363         op = ip->bi_overflow;
 364         if (op == NULL) {
 365 #ifdef DEBUG
 366                 cmn_err(CE_PANIC, "overflow used, but not attached ip %p",
 367                     (void *) ip);
 368 #endif
 369                 DTRACE_PROBE(_iit_free_overflow_end_2);
 370                 return;
 371         }
 372         mutex_enter(&(op->ii_mutex));
 373 
 374         DTRACE_PROBE(_iit_free_overflow);
 375 
 376         /* write old free list head into chunk */
 377         nl.next_chunk = op->ii_freehead;
 378         fba = CHUNK_FBA(chunk);
 379         (void) nsc_reserve(op->ii_dev->bi_fd, NSC_MULTI);
 380         (void) _ii_nsc_io(ip, KS_OVR, op->ii_dev->bi_fd, NSC_WRBUF, fba,
 381             (unsigned char *)&nl, sizeof (nl));
 382         /* update free counts */
 383         op->ii_unused++;
 384         ASSERT(op->ii_used > 0);          /* always use 1 for header */
 385 
 386         /* write chunk id into header freelist start */
 387         op->ii_freehead =  chunk;
 388 
 389         (void) update_overflow_header(ip, op);
 390         nsc_release(op->ii_dev->bi_fd);
 391         mutex_exit(&(op->ii_mutex));
 392 
 393 }
 394 
 395 /* reclaim any overflow storage used by the volume */
 396 void
 397 ii_reclaim_overflow(_ii_info_t *ip)
 398 {
 399         NODE    *node;
 400         nodeid_t node_id;
 401         _ii_overflow_t *op;
 402 
 403         if ((ip->bi_flags & (DSW_VOVERFLOW | DSW_FRECLAIM)) == 0) {
 404                 DTRACE_PROBE(_iit_reclaim_overflow_end);
 405                 return;
 406         }
 407 
 408         /*
 409          * Determine whether overflow should be reclaimed:
 410          * 1/ If we're not doing a group volume update
 411          * OR
 412          * 2/ If the number of detaches != number of attached vols
 413          */
 414         op = ip->bi_overflow;
 415         if (op && (((op->ii_flags & IIO_VOL_UPDATE) == 0) ||
 416             (op->ii_detachcnt != op->ii_drefcnt))) {
 417 #ifndef II_MULTIMULTI_TERABYTE
 418                 /* assert volume size fits into node_id */
 419                 ASSERT(ip->bi_mstchks <= INT32_MAX);
 420 #endif
 421                 for (node_id = 0; node_id < ip->bi_mstchks; node_id++) {
 422                         if ((node = read_node(ip, node_id)) == NULL) {
 423                                 DTRACE_PROBE(_iit_reclaim_overflow_end);
 424                                 return;
 425                         }
 426                         ii_free_overflow(ip, node->vchunk_id);
 427                         release_node(ip, node, node_id);
 428                 }
 429         } else {
 430                 /* need to reset the overflow volume header */
 431                 op->ii_freehead = II_NULLNODE;
 432                 op->ii_used = 1;             /* we have used the header */
 433                 op->ii_unused = op->ii_nchunks - op->ii_used;
 434                 (void) update_overflow_header(ip, op);
 435         }
 436 
 437         DTRACE_PROBE(_iit_reclaim_overflow);
 438 
 439         if ((ip->bi_flags & DSW_VOVERFLOW) == DSW_VOVERFLOW) {
 440                 mutex_enter(&ip->bi_mutex);
 441                 II_FLAG_CLR(DSW_VOVERFLOW, ip);
 442                 mutex_exit(&ip->bi_mutex);
 443         }
 444         --iigkstat.spilled_over.value.ul;
 445 
 446 }
 447 
 448 static chunkid_t
 449 ii_alloc_overflow(_ii_info_t *ip)
 450 {
 451         chunkid_t chunk;
 452         ii_nodelink_t nl;
 453         _ii_overflow_t *op;
 454         int fba;
 455 
 456         if ((op = ip->bi_overflow) == NULL) {
 457                 DTRACE_PROBE(_iit_alloc_overflow_end);
 458                 return (II_NULLCHUNK);  /* no overflow volume attached */
 459         }
 460 
 461         mutex_enter(&(op->ii_mutex));
 462 
 463         DTRACE_PROBE(_iit_alloc_overflow);
 464 
 465         if (op->ii_unused < 1) {
 466                 mutex_exit(&(op->ii_mutex));
 467                 DTRACE_PROBE(_iit_alloc_overflow_end);
 468                 return (II_NULLCHUNK);
 469         }
 470         (void) nsc_reserve(op->ii_dev->bi_fd, NSC_MULTI);
 471         if (op->ii_freehead != II_NULLCHUNK) {
 472                 /* pick first from free list */
 473                 chunk = op->ii_freehead;
 474                 fba = CHUNK_FBA(chunk);
 475                 (void) _ii_nsc_io(ip, KS_OVR, op->ii_dev->bi_fd, NSC_RDBUF, fba,
 476                     (unsigned char *)&nl, sizeof (nl));
 477                 op->ii_freehead = nl.next_chunk;
 478                 /* decrease unused count, fix bug 4419956 */
 479                 op->ii_unused--;
 480         } else {
 481                 /* otherwise pick first unused */
 482                 if (op->ii_used > op->ii_nchunks)
 483                         chunk = II_NULLCHUNK;
 484                 else {
 485                         chunk = op->ii_used++;
 486                         op->ii_unused--;
 487                 }
 488         }
 489         if (chunk != II_NULLCHUNK) {
 490                 chunk = II_2OVERFLOW(chunk);
 491                 if ((ip->bi_flags&DSW_VOVERFLOW) == 0) {
 492                         mutex_enter(&ip->bi_mutex);
 493                         II_FLAG_SET(DSW_VOVERFLOW, ip);
 494                         mutex_exit(&ip->bi_mutex);
 495                         ++iigkstat.spilled_over.value.ul;
 496                 }
 497         }
 498         (void) update_overflow_header(ip, op);
 499         nsc_release(op->ii_dev->bi_fd);
 500         mutex_exit(&(op->ii_mutex));
 501 
 502         return (chunk);
 503 }
 504 /*
 505  * Find or insert key into search tree.
 506  */
 507 
 508 chunkid_t
 509 ii_tsearch(_ii_info_t *ip, chunkid_t chunk_id)
 510                         /* Address of the root of the tree */
 511 {
 512         NODE    *rootp = NULL;
 513         chunkid_t n;    /* New node id if key not found */
 514 
 515         if ((rootp = read_node(ip, chunk_id)) == NULL) {
 516                 DTRACE_PROBE(_iit_tsearch_end);
 517                 return (II_NULLNODE);
 518         }
 519         n = rootp->vchunk_id;
 520         if (n != II_NULLCHUNK) { /* chunk allocated, return location */
 521                 release_node(ip, rootp, 0);
 522                 DTRACE_PROBE(_iit_tsearch_end);
 523                 return (n);
 524         }
 525         n = alloc_chunk(ip);
 526         if (n != II_NULLCHUNK) {
 527                 rootp->vchunk_id = n;
 528                 write_node(ip, rootp, chunk_id);
 529         } else
 530                 release_node(ip, rootp, 0);
 531 
 532         return (n);
 533 }
 534 
 535 /* Delete node with key chunkid */
 536 void
 537 ii_tdelete(_ii_info_t *ip,
 538         chunkid_t chunkid)      /* Key to be deleted */
 539 {
 540         NODE *np = NULL;
 541 
 542         if ((np = read_node(ip, chunkid)) == NULL) {
 543                 DTRACE_PROBE(_iit_tdelete_end);
 544                 return;
 545         }
 546 
 547         ASSERT(np->vchunk_id != II_NULLCHUNK);
 548         free_node(ip, np, chunkid);
 549         np->vchunk_id = II_NULLCHUNK;
 550         write_node(ip, np, chunkid);
 551 
 552 }
 553 
 554 /*
 555  * initialise an empty map for ip
 556  */
 557 
 558 int
 559 ii_tinit(_ii_info_t *ip)
 560 {
 561         int rc = 0;
 562 
 563         /* overflow can't be attached before first call to this function */
 564         if (ip->bi_overflow)
 565                 ii_reclaim_overflow(ip);
 566 
 567         mutex_enter(&ip->bi_chksmutex);
 568         ip->bi_shdfchk = II_NULLCHUNK;       /* set freelist to empty chain */
 569         ip->bi_shdchkused = 0;
 570 
 571         /* fill index (bi_mstchks size) with II_NULLCHUNK */
 572         rc = node_fba_fill(ip, ip->bi_mstchks, II_NULLCHUNK);
 573         if (rc == 0)
 574                 rc = update_tree_header(ip);
 575         mutex_exit(&ip->bi_chksmutex);
 576 
 577         return (rc);
 578 }
 579 
 580 /*
 581  * Calculate the size of map space provided by a bitmap volume with
 582  * tree_len fba's spare for the tree.
 583  */
 584 
 585 nsc_size_t
 586 ii_btsize(nsc_size_t tree_len)
 587 {
 588         nsc_size_t nchunks;
 589 
 590         nchunks = tree_len * nodes_per_fba;
 591 
 592         if (ii_debug > 1)
 593                 cmn_err(CE_NOTE,
 594                     "!ii_btsize: bitmap with %" NSC_SZFMT
 595                     " spare fba's will map %" NSC_SZFMT " chunks",
 596                     tree_len, nchunks);
 597 
 598         return (nchunks);
 599 }