/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
/*        All Rights Reserved   */

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/fs/ufs_fs.h>
#include <sys/cmn_err.h>

#ifdef _KERNEL

#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/user.h>
#include <sys/var.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/debug.h>
#include <sys/fssnap_if.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_panic.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_log.h>
#include <sys/kmem.h>
#include <sys/policy.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/pvn.h>
#include <vm/seg_map.h>
#include <sys/swap.h>
#include <vm/seg_kmem.h>

#else  /* _KERNEL */

#define ASSERT(x)               /* don't use asserts for fsck et al */

#endif  /* _KERNEL */

#ifdef _KERNEL

/*
 * Used to verify that a given entry on the ufs_instances list (see below)
 * still refers to a mounted file system.
 *
 * XXX: This is a crock that substitutes for proper locking to coordinate
 *      updates to and uses of the entries in ufs_instances.
 */
struct check_node {
        struct vfs *vfsp;
        struct ufsvfs *ufsvfs;
        dev_t vfs_dev;
};

static vfs_t *still_mounted(struct check_node *);

/*
 * All ufs file system instances are linked together into a list starting at
 * ufs_instances.  The list is updated as part of mount and unmount.  It's
 * consulted in ufs_update, to allow syncing out all ufs file system instances
 * in a batch.
 *
 * ufsvfs_mutex guards access to this list and to the {,old}ufsvfslist
 * manipulated in ufs_funmount_cleanup.  (A given ufs instance is always on
 * exactly one of these lists except while it's being allocated or
 * deallocated.)
 */
struct ufsvfs   *ufs_instances;
extern kmutex_t         ufsvfs_mutex;   /* XXX: move this to ufs_inode.h? */

/*
 * ufsvfs list manipulation routines
 */

/*
 * Link ufsp in at the head of the list of ufs_instances.
 */
void
ufs_vfs_add(struct ufsvfs *ufsp)
{
        mutex_enter(&ufsvfs_mutex);
        ufsp->vfs_next = ufs_instances;
        ufs_instances = ufsp;
        mutex_exit(&ufsvfs_mutex);
}

/*
 * Remove ufsp from the list of ufs_instances.
 *
 * Does no error checking; ufsp is assumed to actually be on the list.
 */
void
ufs_vfs_remove(struct ufsvfs *ufsp)
{
        struct ufsvfs   **delpt = &ufs_instances;

        mutex_enter(&ufsvfs_mutex);
        for (; *delpt != NULL; delpt = &((*delpt)->vfs_next)) {
                if (*delpt == ufsp) {
                        *delpt = ufsp->vfs_next;
                        ufsp->vfs_next = NULL;
                        break;
                }
        }
        mutex_exit(&ufsvfs_mutex);
}

/*
 * Clean up state resulting from a forcible unmount that couldn't be handled
 * directly during the unmount.  (See commentary in the unmount code for more
 * info.)
 */
static void
ufs_funmount_cleanup(void)
{
        struct ufsvfs           *ufsvfsp;
        extern struct ufsvfs    *oldufsvfslist, *ufsvfslist;

        /*
         * Assumption: it's now safe to blow away the entries on
         * oldufsvfslist.
         */
        mutex_enter(&ufsvfs_mutex);
        while ((ufsvfsp = oldufsvfslist) != NULL) {
                oldufsvfslist = ufsvfsp->vfs_next;

                mutex_destroy(&ufsvfsp->vfs_lock);
                kmem_free(ufsvfsp, sizeof (struct ufsvfs));
        }
        /*
         * Rotate more recent unmount entries into place in preparation for
         * the next time around.
         */
        oldufsvfslist = ufsvfslist;
        ufsvfslist = NULL;
        mutex_exit(&ufsvfs_mutex);
}

/*
 * ufs_update performs the ufs part of `sync'.  It goes through the disk
 * queues to initiate sandbagged IO; goes through the inodes to write
 * back modified inodes; and goes through the mount table to initiate
 * the writing of the modified super blocks.
 */
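/*
 * ufs_sync_time records when the current ufs_update() pass began.  On
 * logging file systems, ufs_sync_inode() stops flushing access-time-only
 * updates once ufs_sync_time_secs have elapsed since then.
 */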
extern time_t   time;
time_t          ufs_sync_time;
time_t          ufs_sync_time_secs = 1;

extern kmutex_t ufs_scan_lock;

void
ufs_update(int flag)
{
        struct vfs *vfsp;
        struct fs *fs;
        struct ufsvfs *ufsp;
        struct ufsvfs *ufsnext;
        struct ufsvfs *update_list = NULL;
        int check_cnt = 0;
        size_t check_size;
        struct check_node *check_list, *ptr;
        int cheap = flag & SYNC_ATTR;

        /*
         * This is a hack.  A design flaw in the forced unmount protocol
         * could allow a thread to attempt to use a kmem_freed ufsvfs
         * structure in ufs_lockfs_begin/ufs_check_lockfs.  This window
         * is difficult to hit, even during the lockfs stress tests.
         * So the hacky fix is to wait awhile before kmem_free'ing the
         * ufsvfs structures for forcibly unmounted file systems.  `Awhile'
         * is defined as every other call from fsflush (~60 seconds).
         */
        if (cheap)
                ufs_funmount_cleanup();

        /*
         * Examine all ufsvfs structures and add those that we can lock to the
         * update list.  This is so that we don't hold the list lock for a
         * long time.  If vfs_lock fails for a file system instance, then skip
         * it because somebody is doing an unmount on it.
         */
        mutex_enter(&ufsvfs_mutex);
        for (ufsp = ufs_instances; ufsp != NULL; ufsp = ufsp->vfs_next) {
                vfsp = ufsp->vfs_vfs;
                if (vfs_lock(vfsp) != 0)
                        continue;
                ufsp->vfs_wnext = update_list;
                update_list = ufsp;
                check_cnt++;
        }
        mutex_exit(&ufsvfs_mutex);

        if (update_list == NULL)
                return;

        check_size = sizeof (struct check_node) * check_cnt;
        check_list = ptr = kmem_alloc(check_size, KM_NOSLEEP);
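        /*
         * Note: with KM_NOSLEEP the allocation can fail, leaving check_list
         * NULL; the STABLE checking pass below is simply skipped in that
         * case.
         */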

        /*
         * Write back modified superblocks.
         * Consistency check that the superblock of
         * each file system is still in the buffer cache.
         *
         * Note that the update_list traversal is done without the protection
         * of an overall list lock, so it's necessary to rely on the fact that
         * each entry of the list is vfs_locked when moving from one entry to
         * the next.  This works because a concurrent attempt to add an entry
         * to another thread's update_list won't find it, since it'll already
         * be locked.
         */
        check_cnt = 0;
        for (ufsp = update_list; ufsp != NULL; ufsp = ufsnext) {
                /*
                 * Need to grab the next ptr before we unlock this one so
                 * another thread doesn't grab it and change it before we move
                 * on to the next vfs.  (Once we unlock it, it's ok if another
                 * thread finds it to add it to its own update_list; we don't
                 * attempt to refer to it through our list any more.)
                 */
                ufsnext = ufsp->vfs_wnext;
                vfsp = ufsp->vfs_vfs;

                /*
                 * Seems like this can't happen, so perhaps it should become
                 * an ASSERT(vfsp->vfs_data != NULL).
                 */
                if (!vfsp->vfs_data) {
                        vfs_unlock(vfsp);
                        continue;
                }

                fs = ufsp->vfs_fs;

                /*
                 * Don't update a locked superblock during a panic; it
                 * may be in an inconsistent state.
                 */
                if (panicstr) {
                        if (!mutex_tryenter(&ufsp->vfs_lock)) {
                                vfs_unlock(vfsp);
                                continue;
                        }
                } else
                        mutex_enter(&ufsp->vfs_lock);
                /*
                 * Build up the STABLE check list, so we can unlock the vfs
                 * until we do the actual checking.
                 */
                if (check_list != NULL) {
                        if ((fs->fs_ronly == 0) &&
                            (fs->fs_clean != FSBAD) &&
                            (fs->fs_clean != FSSUSPEND)) {
                                ptr->vfsp = vfsp;
                                ptr->ufsvfs = ufsp;
                                ptr->vfs_dev = vfsp->vfs_dev;
                                ptr++;
                                check_cnt++;
                        }
                }

                /*
                 * The superblock is not modified; nothing to write back.
                 */
                if (fs->fs_fmod == 0) {
                        mutex_exit(&ufsp->vfs_lock);
                        vfs_unlock(vfsp);
                        continue;
                }
                if (fs->fs_ronly != 0) {
                        mutex_exit(&ufsp->vfs_lock);
                        vfs_unlock(vfsp);
                        (void) ufs_fault(ufsp->vfs_root,
                            "fs = %s update: ro fs mod\n", fs->fs_fsmnt);
                        /*
                         * XXX: Why is this a return instead of a continue?
                         *      This may be an attempt to replace a panic with
                         *      something less drastic, but there's cleanup we
                         *      should be doing that's not being done (e.g.,
                         *      unlocking the remaining entries on the list).
                         */
                        return;
                }
                fs->fs_fmod = 0;
                mutex_exit(&ufsp->vfs_lock);
                TRANS_SBUPDATE(ufsp, vfsp, TOP_SBUPDATE_UPDATE);
                vfs_unlock(vfsp);
        }

        ufs_sync_time = time;

        /*
         * Avoid racing with ufs_unmount() and ufs_sync().
         */
        mutex_enter(&ufs_scan_lock);

        (void) ufs_scan_inodes(1, ufs_sync_inode, (void *)(uintptr_t)cheap,
            NULL);

        mutex_exit(&ufs_scan_lock);

        /*
         * Force stale buffer cache information to be flushed,
         * for all devices.  This should cause any remaining control
         * information (e.g., cg and inode info) to be flushed back.
         */
        bflush((dev_t)NODEV);

        if (check_list == NULL)
                return;

        /*
         * For each UFS filesystem in the STABLE check_list, update
         * the clean flag if warranted.
         */
        for (ptr = check_list; check_cnt > 0; check_cnt--, ptr++) {
                int     error;

                /*
                 * still_mounted() returns with vfsp and the vfs_reflock
                 * held if ptr refers to a vfs that is still mounted.
                 */
                if ((vfsp = still_mounted(ptr)) == NULL)
                        continue;
                ufs_checkclean(vfsp);
                /*
                 * Commit any outstanding async transactions.
                 */
                ufsp = (struct ufsvfs *)vfsp->vfs_data;
                curthread->t_flag |= T_DONTBLOCK;
                TRANS_BEGIN_SYNC(ufsp, TOP_COMMIT_UPDATE, TOP_COMMIT_SIZE,
                    error);
                if (!error) {
                        TRANS_END_SYNC(ufsp, error, TOP_COMMIT_UPDATE,
                            TOP_COMMIT_SIZE);
                }
                curthread->t_flag &= ~T_DONTBLOCK;

                vfs_unlock(vfsp);
        }

        kmem_free(check_list, check_size);
}

int
ufs_sync_inode(struct inode *ip, void *arg)
{
        int cheap = (int)(uintptr_t)arg;
        struct ufsvfs *ufsvfsp;
        uint_t flag = ip->i_flag;

        if (cheap && ((flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) == 0))
                return (0);

        /*
         * If we are panicking, don't update the inode if this file
         * system is FSSTABLE: we would have to force the superblock
         * to FSACTIVE, and the superblock may not be in a good state.
         * Also, if the inode is IREF'ed then it may be in an
         * inconsistent state; don't push it.  Finally, don't push the
         * inode if the fs is logging; the transaction will be
         * discarded at boot.
         */
        if (panicstr) {
                if (flag & IREF)
                        return (0);

                if (ip->i_ufsvfs == NULL ||
                    (ip->i_fs->fs_clean == FSSTABLE ||
                    ip->i_fs->fs_clean == FSLOG))
                        return (0);
        }

        ufsvfsp = ip->i_ufsvfs;

        /*
         * Limit access-time-only updates.
         */
        if (((flag & (IMOD|IMODACC|IUPD|ICHG|IACC)) == IMODACC) && ufsvfsp) {
                /*
                 * If the file system has deferred access time turned on and
                 * there was no IO recently, don't bother flushing it.  It
                 * will be flushed when I/Os start again.
                 */
                if (cheap && (ufsvfsp->vfs_dfritime & UFS_DFRATIME) &&
                    (ufsvfsp->vfs_iotstamp + ufs_iowait < ddi_get_lbolt()))
                        return (0);
                /*
                 * An app issuing a sync() can take forever on a trans device
                 * when NetWorker or find is running, because all of the
                 * directories' access times have to be updated.  So we limit
                 * the time we spend updating access times per sync.
                 */
                if (TRANS_ISTRANS(ufsvfsp) && ((ufs_sync_time +
                    ufs_sync_time_secs) < time))
                        return (0);
        }

        /*
         * If we are running on behalf of the flush thread or this is
         * a swap file, then simply do a delayed update of the inode.
         * Otherwise, push the pages and then do a delayed inode update.
         */
        if (cheap || IS_SWAPVP(ITOV(ip))) {
                TRANS_IUPDAT(ip, 0);
        } else {
                (void) TRANS_SYNCIP(ip, B_ASYNC, I_ASYNC, TOP_SYNCIP_SYNC);
        }
        return (0);
}

/*
 * Flush all the pages associated with an inode using the given 'flags',
 * then force inode information to be written back using the given 'waitfor'.
 */
int
ufs_syncip(struct inode *ip, int flags, int waitfor, top_t topid)
{
        int     error;
        struct vnode *vp = ITOV(ip);
        struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
        int dotrans = 0;

        /*
         * Return if the file system has been forcibly unmounted.
         */
        if (ufsvfsp == NULL)
                return (EIO);
        /*
         * No need to VOP_PUTPAGE if there are no pages.
         */
        if (!vn_has_cached_data(vp) || vp->v_type == VCHR) {
                error = 0;
        } else {
                /*
                 * If the inode we're working on is a shadow inode
                 * or quota inode, we need to make sure that the
                 * ufs_putpage call is inside a transaction as this
                 * could include metadata changes.
                 */
                if ((ip->i_mode & IFMT) == IFSHAD ||
                    ufsvfsp->vfs_qinod == ip) {
                        dotrans = 1;
                        curthread->t_flag |= T_DONTBLOCK;
                        TRANS_BEGIN_ASYNC(ufsvfsp, TOP_PUTPAGE,
                            TOP_PUTPAGE_SIZE(ip));
                }
                error = VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
                    flags, CRED(), NULL);
                if (dotrans) {
                        TRANS_END_ASYNC(ufsvfsp, TOP_PUTPAGE,
                            TOP_PUTPAGE_SIZE(ip));
                        curthread->t_flag &= ~T_DONTBLOCK;
                        dotrans = 0;
                }
        }
        if (panicstr && TRANS_ISTRANS(ufsvfsp))
                goto out;
        /*
         * waitfor encodes two things:
         * 1. whether this is a data sync or a file sync;
         * 2. for a file sync, whether ufs_iupdat should 'waitfor' the disk
         *    I/O to complete.
         */
        if (waitfor == I_DSYNC) {
                /*
                 * For a data sync, only IATTCHG (a size/block change)
                 * requires an inode update; this implements
                 * fdatasync()/FDSYNC.
                 */
                if (ip->i_flag & (IBDWRITE|IATTCHG)) {
                        /*
                         * Enter a transaction to provide mutual exclusion
                         * with deltamap_push and avoid a race where
                         * the inode flush could get dropped.
                         */
                        if ((curthread->t_flag & T_DONTBLOCK) == 0) {
                                dotrans = 1;
                                curthread->t_flag |= T_DONTBLOCK;
                                TRANS_BEGIN_ASYNC(ufsvfsp, topid,
                                    TOP_SYNCIP_SIZE);
                        }
                        rw_enter(&ip->i_contents, RW_READER);
                        mutex_enter(&ip->i_tlock);
                        ip->i_flag &= ~IMODTIME;
                        mutex_exit(&ip->i_tlock);
                        ufs_iupdat(ip, 1);
                        rw_exit(&ip->i_contents);
                        if (dotrans) {
                                TRANS_END_ASYNC(ufsvfsp, topid,
                                    TOP_SYNCIP_SIZE);
                                curthread->t_flag &= ~T_DONTBLOCK;
                        }
                }
        } else {
                /* For file sync, any inode change requires inode update */
                if (ip->i_flag & (IBDWRITE|IUPD|IACC|ICHG|IMOD|IMODACC)) {
                        /*
                         * Enter a transaction to provide mutual exclusion
                         * with deltamap_push and avoid a race where
                         * the inode flush could get dropped.
                         */
                        if ((curthread->t_flag & T_DONTBLOCK) == 0) {
                                dotrans = 1;
                                curthread->t_flag |= T_DONTBLOCK;
                                TRANS_BEGIN_ASYNC(ufsvfsp, topid,
                                    TOP_SYNCIP_SIZE);
                        }
                        rw_enter(&ip->i_contents, RW_READER);
                        mutex_enter(&ip->i_tlock);
                        ip->i_flag &= ~IMODTIME;
                        mutex_exit(&ip->i_tlock);
                        ufs_iupdat(ip, waitfor);
                        rw_exit(&ip->i_contents);
                        if (dotrans) {
                                TRANS_END_ASYNC(ufsvfsp, topid,
                                    TOP_SYNCIP_SIZE);
                                curthread->t_flag &= ~T_DONTBLOCK;
                        }
                }
        }

out:
        return (error);
}

/*
 * Flush all indirect blocks related to an inode.
 * Supports triple indirect blocks also.
 */
int
ufs_sync_indir(struct inode *ip)
{
        int i;
        daddr_t blkno;
        daddr_t lbn;    /* logical blkno of last blk in file */
        daddr_t clbn;   /* current logical blk */
        daddr32_t *bap;
        struct fs *fs;
        struct buf *bp;
        int bsize;
        struct ufsvfs *ufsvfsp;
        int j;
        daddr_t indirect_blkno;
        daddr32_t *indirect_bap;
        struct buf *indirect_bp;

        ufsvfsp = ip->i_ufsvfs;
        /*
         * Unnecessary when logging; allocation blocks are kept up-to-date.
         */
        if (TRANS_ISTRANS(ufsvfsp))
                return (0);

        fs = ufsvfsp->vfs_fs;
        bsize = fs->fs_bsize;
        lbn = (daddr_t)lblkno(fs, ip->i_size - 1);
        if (lbn < NDADDR)
                return (0);     /* No indirect blocks used */
        if (lbn < NDADDR + NINDIR(fs)) {
                /* File has one indirect block. */
                blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, ip->i_ib[0]));
                return (0);
        }

        /* Write out all the first-level indirect blocks */
        for (i = 0; i < NIADDR; i++) {
                if ((blkno = ip->i_ib[i]) == 0)
                        continue;
                blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, blkno));
        }
        /* Write out the second level of indirect blocks */
        if ((blkno = ip->i_ib[1]) == 0)
                return (0);
        bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, blkno), bsize);
        if (bp->b_flags & B_ERROR) {
                brelse(bp);
                return (EIO);
        }
        bap = bp->b_un.b_daddr;
        clbn = NDADDR + NINDIR(fs);
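        /*
         * Each entry of the double indirect block maps a single indirect
         * block covering NINDIR(fs) logical blocks; stop once clbn passes
         * the last logical block of the file.
         */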
        for (i = 0; i < NINDIR(fs); i++) {
                if (clbn > lbn)
                        break;
                clbn += NINDIR(fs);
                if ((blkno = bap[i]) == 0)
                        continue;
                blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, blkno));
        }

        brelse(bp);

        /* Write out third-level indirect blocks */
        if ((blkno = ip->i_ib[2]) == 0)
                return (0);

        bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, blkno), bsize);
        if (bp->b_flags & B_ERROR) {
                brelse(bp);
                return (EIO);
        }
        bap = bp->b_un.b_daddr;
        clbn = NDADDR + NINDIR(fs) + (NINDIR(fs) * NINDIR(fs));

        for (i = 0; i < NINDIR(fs); i++) {
                if (clbn > lbn)
                        break;
                if ((indirect_blkno = bap[i]) == 0)
                        continue;
                blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, indirect_blkno));
                indirect_bp = UFS_BREAD(ufsvfsp, ip->i_dev,
                    (daddr_t)fsbtodb(fs, indirect_blkno), bsize);
                if (indirect_bp->b_flags & B_ERROR) {
                        brelse(indirect_bp);
                        brelse(bp);
                        return (EIO);
                }
                indirect_bap = indirect_bp->b_un.b_daddr;
                for (j = 0; j < NINDIR(fs); j++) {
                        if (clbn > lbn)
                                break;
                        clbn += NINDIR(fs);
                        if ((blkno = indirect_bap[j]) == 0)
                                continue;
                        blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, blkno));
                }
                brelse(indirect_bp);
        }
        brelse(bp);

        return (0);
}

/*
 * Flush all indirect blocks related to an offset of a file.
 * A read/write in sync mode may have to flush indirect blocks.
 */
int
ufs_indirblk_sync(struct inode *ip, offset_t off)
{
        daddr_t lbn;
        struct  fs *fs;
        struct  buf *bp;
        int     i, j, shft;
        daddr_t ob, nb, tbn;
        daddr32_t *bap;
        int     nindirshift, nindiroffset;
        struct ufsvfs *ufsvfsp;

        ufsvfsp = ip->i_ufsvfs;
        /*
         * Unnecessary when logging; allocation blocks are kept up-to-date.
         */
        if (TRANS_ISTRANS(ufsvfsp))
                return (0);

        fs = ufsvfsp->vfs_fs;

        lbn = (daddr_t)lblkno(fs, off);
        if (lbn < 0)
                return (EFBIG);

        /* The first NDADDR blocks are direct, so there is nothing to do */
        if (lbn < NDADDR)
                return (0);

        nindirshift = ip->i_ufsvfs->vfs_nindirshift;
        nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;

        /* Determine the level of indirect blocks */
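        /*
         * On exit, j identifies which i_ib[] entry covers lbn (j == NIADDR
         * means the single indirect block) and tbn is lbn's offset within
         * the subtree rooted at that entry; (1 << shft) is the number of
         * logical blocks that entry spans.
         */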
        shft = 0;
        tbn = lbn - NDADDR;
        for (j = NIADDR; j > 0; j--) {
                longlong_t      sh;

                shft += nindirshift;
                sh = 1LL << shft;
                if (tbn < sh)
                        break;
                tbn -= (daddr_t)sh;
        }

        if (j == 0)
                return (EFBIG);

        if ((nb = ip->i_ib[NIADDR - j]) == 0)
                return (0);             /* UFS hole */

        /* Flush the first-level indirect block */
        blkflush(ip->i_dev, fsbtodb(fs, nb));

        /* Fetch through the next levels */
        for (; j < NIADDR; j++) {
                ob = nb;
                bp = UFS_BREAD(ufsvfsp,
                    ip->i_dev, fsbtodb(fs, ob), fs->fs_bsize);
                if (bp->b_flags & B_ERROR) {
                        brelse(bp);
                        return (EIO);
                }
                bap = bp->b_un.b_daddr;
                shft -= nindirshift;            /* sh / nindir */
                i = (tbn >> shft) & nindiroffset; /* (tbn / sh) & nindir */
                nb = bap[i];
                brelse(bp);
                if (nb == 0) {
                        return (0);             /* UFS hole */
                }
                blkflush(ip->i_dev, fsbtodb(fs, nb));
        }
        return (0);
}

#ifdef DEBUG

/*
 * The bad block checking routines ufs_indir_badblock() and ufs_badblock()
 * are very expensive.  Profiling has shown that we spend 6-7% of our time
 * in ufs_badblock, and another 1-2% in ufs_indir_badblock.  They are only
 * called via ASSERTs (from debug kernels).  In addition, experience shows
 * that no failures have been found in recent years.  So checking is off
 * by default, and the following tunable can be set to enable it.
 */
int ufs_badblock_checks = 0;

/*
 * Check that a given indirect block contains blocks in range.
 */
int
ufs_indir_badblock(struct inode *ip, daddr32_t *bap)
{
        int i;
        int err = 0;

        if (ufs_badblock_checks) {
                for (i = 0; i < NINDIR(ip->i_fs) - 1; i++)
                        if (bap[i] != 0 && (err = ufs_badblock(ip, bap[i])))
                                break;
        }
        return (err);
}

/*
 * Check that a specified block number is in range.
 */
int
ufs_badblock(struct inode *ip, daddr_t bn)
{
        long    c;
        daddr_t sum;

        if (!ufs_badblock_checks)
                return (0);
        ASSERT(bn);
        if (bn <= 0 || bn > ip->i_fs->fs_size)
                return (bn);

        sum = 0;
        c = dtog(ip->i_fs, bn);
        if (c == 0) {
                sum = howmany(ip->i_fs->fs_cssize, ip->i_fs->fs_fsize);
        }
        /*
         * If the block number is below this cylinder group, falls within
         * the space reserved for the superblock, inodes, and (summary
         * data), or lies above this cylinder group, then it's invalid.
         * It's hard to see how we'd be outside this cyl, but let's be
         * careful.
         */
        if ((bn < cgbase(ip->i_fs, c)) ||
            (bn >= cgsblock(ip->i_fs, c) && bn < cgdmin(ip->i_fs, c) + sum) ||
            (bn >= (unsigned)cgbase(ip->i_fs, c + 1)))
                return (bn);

        return (0);     /* not a bad block */
}

#endif /* DEBUG */

/*
 * When i_rwlock is write-locked or has a writer pending, the inode is
 * going to change in a way that will mark the filesystem as active, so
 * there is no need to let the filesystem be marked as stable now.
 * Also, to ensure filesystem consistency during directory operations,
 * the filesystem cannot be marked as stable if the i_rwlock of a
 * directory inode is write-locked.
 */

/*
 * Check for busy inodes for this filesystem.
 * NOTE: Needs a better way to do this expensive operation in the future.
 */
static void
ufs_icheck(struct ufsvfs *ufsvfsp, int *isbusyp, int *isreclaimp)
{
        union  ihead    *ih;
        struct inode    *ip;
        int             i;
        int             isnottrans      = !TRANS_ISTRANS(ufsvfsp);
        int             isbusy          = *isbusyp;
        int             isreclaim       = *isreclaimp;

        for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
                mutex_enter(&ih_lock[i]);
                for (ip = ih->ih_chain[0];
                    ip != (struct inode *)ih;
                    ip = ip->i_forw) {
                        /*
                         * If an inode is busy/modified/deleted, the
                         * filesystem is busy.
                         */
                        if (ip->i_ufsvfs != ufsvfsp)
                                continue;
                        if ((ip->i_flag & (IMOD | IUPD | ICHG)) ||
                            (RW_ISWRITER(&ip->i_rwlock)))
                                isbusy = 1;
                        if ((ip->i_nlink <= 0) && (ip->i_flag & IREF))
                                isreclaim = 1;
                        if (isbusy && (isreclaim || isnottrans))
                                break;
                }
                mutex_exit(&ih_lock[i]);
                if (isbusy && (isreclaim || isnottrans))
                        break;
        }
        *isbusyp = isbusy;
        *isreclaimp = isreclaim;
}

/*
 * As part of the ufs 'sync' operation, this routine is called to mark
 * the filesystem as STABLE if there is no modified metadata in memory.
 */
void
ufs_checkclean(struct vfs *vfsp)
{
        struct ufsvfs   *ufsvfsp        = (struct ufsvfs *)vfsp->vfs_data;
        struct fs       *fs             = ufsvfsp->vfs_fs;
        int             isbusy;
        int             isreclaim;
        int             updatesb;

        ASSERT(vfs_lock_held(vfsp));

        /*
         * Do nothing if the filesystem is already stable or cleanflag
         * processing is disabled; no state transitions when panicking.
         */
        if (fs->fs_ronly ||
            fs->fs_clean == FSBAD ||
            fs->fs_clean == FSSUSPEND ||
            fs->fs_clean == FSSTABLE ||
            panicstr)
                return;

        /*
         * Do nothing if logging and there is nothing to reclaim.
         */
        if ((fs->fs_clean == FSLOG) &&
            (((fs->fs_reclaim & FS_RECLAIM) == 0) ||
            (fs->fs_reclaim & FS_RECLAIMING)))
                return;

        /*
         * FS_CHECKCLEAN is reset if the file system goes dirty.
         * FS_CHECKRECLAIM is reset if a file gets deleted.
         */
        mutex_enter(&ufsvfsp->vfs_lock);
        fs->fs_reclaim |= (FS_CHECKCLEAN | FS_CHECKRECLAIM);
        mutex_exit(&ufsvfsp->vfs_lock);

        updatesb = 0;

        /*
         * Consider the filesystem busy if logging or if its buffers are
         * busy.
         */
        isbusy = isreclaim = 0;
        if ((fs->fs_clean == FSLOG) ||
            (bcheck(vfsp->vfs_dev, ufsvfsp->vfs_bufp)))
                isbusy = 1;

        /*
         * isreclaim == TRUE means we can't change the state of fs_reclaim.
         */
        isreclaim =
            ((fs->fs_clean == FSLOG) &&
            (((fs->fs_reclaim & FS_RECLAIM) == 0) ||
            (fs->fs_reclaim & FS_RECLAIMING)));

        /*
         * Do nothing if the fs is busy and we can't change the state of
         * fs_reclaim.
         */
        if (isbusy && isreclaim)
                return;

        /*
         * Look for busy or deleted inodes (deleted == needs reclaim).
         */
        ufs_icheck(ufsvfsp, &isbusy, &isreclaim);

        mutex_enter(&ufsvfsp->vfs_lock);

        /*
         * IF POSSIBLE, RESET RECLAIM
         */
        /*
         * the reclaim thread is not running
         */
        if ((fs->fs_reclaim & FS_RECLAIMING) == 0)
                /*
                 * no files were deleted during the scan
                 */
                if (fs->fs_reclaim & FS_CHECKRECLAIM)
                        /*
                         * no deleted files were found in the inode cache
                         */
                        if ((isreclaim == 0) && (fs->fs_reclaim & FS_RECLAIM)) {
                                fs->fs_reclaim &= ~FS_RECLAIM;
                                updatesb = 1;
                        }
        /*
         * IF POSSIBLE, SET STABLE
         */
        /*
         * not logging
         */
        if (fs->fs_clean != FSLOG)
                /*
                 * file system has not gone dirty since the scan began
                 */
                if (fs->fs_reclaim & FS_CHECKCLEAN)
                        /*
                         * nothing dirty was found in the buffer or inode cache
                         */
                        if ((isbusy == 0) && (isreclaim == 0) &&
                            (fs->fs_clean != FSSTABLE)) {
                                fs->fs_clean = FSSTABLE;
                                updatesb = 1;
                        }

        mutex_exit(&ufsvfsp->vfs_lock);
        if (updatesb) {
                TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_STABLE);
        }
}

/*
 * Called whenever an unlink occurs.
 */
void
ufs_setreclaim(struct inode *ip)
{
        struct ufsvfs   *ufsvfsp        = ip->i_ufsvfs;
        struct fs       *fs             = ufsvfsp->vfs_fs;

        if (ip->i_nlink || fs->fs_ronly || (fs->fs_clean != FSLOG))
                return;

        /*
         * Return if the reclaim-needed bit is already set, unless we need
         * to tell ufs_checkclean that a file has been deleted.
         */
        if ((fs->fs_reclaim & (FS_RECLAIM | FS_CHECKRECLAIM)) == FS_RECLAIM)
                return;

        mutex_enter(&ufsvfsp->vfs_lock);
        /*
         * Inform ufs_checkclean that the file system has gone dirty.
         */
        fs->fs_reclaim &= ~FS_CHECKRECLAIM;

        /*
         * Set the reclaim-needed bit.
         */
        if ((fs->fs_reclaim & FS_RECLAIM) == 0) {
                fs->fs_reclaim |= FS_RECLAIM;
                ufs_sbwrite(ufsvfsp);
        }
        mutex_exit(&ufsvfsp->vfs_lock);
}

/*
 * Before any modified metadata is written back to the disk, this routine
 * is called to mark the filesystem as ACTIVE.
 */
void
ufs_notclean(struct ufsvfs *ufsvfsp)
{
        struct fs *fs = ufsvfsp->vfs_fs;

        ASSERT(MUTEX_HELD(&ufsvfsp->vfs_lock));
        ULOCKFS_SET_MOD((&ufsvfsp->vfs_ulockfs));

        /*
         * Inform ufs_checkclean that the file system has gone dirty.
         */
        fs->fs_reclaim &= ~FS_CHECKCLEAN;

        /*
         * Ignore if active or bad or suspended or readonly or logging.
         */
        if ((fs->fs_clean == FSACTIVE) || (fs->fs_clean == FSLOG) ||
            (fs->fs_clean == FSBAD) || (fs->fs_clean == FSSUSPEND) ||
            (fs->fs_ronly)) {
                mutex_exit(&ufsvfsp->vfs_lock);
                return;
        }
        fs->fs_clean = FSACTIVE;
        /*
         * Write the superblock synchronously.
         */
        ufs_sbwrite(ufsvfsp);
        mutex_exit(&ufsvfsp->vfs_lock);
}

/*
 * ufs specific fbwrite()
 */
int
ufs_fbwrite(struct fbuf *fbp, struct inode *ip)
{
        struct ufsvfs   *ufsvfsp        = ip->i_ufsvfs;

        if (TRANS_ISTRANS(ufsvfsp))
                return (fbwrite(fbp));
        mutex_enter(&ufsvfsp->vfs_lock);
        ufs_notclean(ufsvfsp);
        return ((ufsvfsp->vfs_dio) ? fbdwrite(fbp) : fbwrite(fbp));
}

/*
 * ufs specific fbiwrite()
 */
int
ufs_fbiwrite(struct fbuf *fbp, struct inode *ip, daddr_t bn, long bsize)
{
        struct ufsvfs   *ufsvfsp        = ip->i_ufsvfs;
        o_mode_t        ifmt            = ip->i_mode & IFMT;
        buf_t           *bp;
        int             error;

        mutex_enter(&ufsvfsp->vfs_lock);
        ufs_notclean(ufsvfsp);
        if (ifmt == IFDIR || ifmt == IFSHAD || ifmt == IFATTRDIR ||
            (ip->i_ufsvfs->vfs_qinod == ip)) {
                TRANS_DELTA(ufsvfsp, ldbtob(bn * (offset_t)(btod(bsize))),
                    fbp->fb_count, DT_FBI, 0, 0);
        }
        /*
         * Inlined version of fbiwrite()
         */
        bp = pageio_setup((struct page *)NULL, fbp->fb_count,
            ip->i_devvp, B_WRITE);
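        /*
         * The data lives at the fbuf's kernel-mapped address rather than in
         * a page list, so clear the B_PAGEIO flag that pageio_setup() set
         * and point the buffer at the mapped data instead.
         */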
        bp->b_flags &= ~B_PAGEIO;
        bp->b_un.b_addr = fbp->fb_addr;

        bp->b_blkno = bn * btod(bsize);
        bp->b_dev = cmpdev(ip->i_dev);    /* store in old dev format */
        bp->b_edev = ip->i_dev;
        bp->b_proc = NULL;                   /* i.e. the kernel */
        bp->b_file = ip->i_vnode;
        bp->b_offset = -1;

        if (ufsvfsp->vfs_log) {
                lufs_write_strategy(ufsvfsp->vfs_log, bp);
        } else if (ufsvfsp->vfs_snapshot) {
                fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
        } else {
                ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
                ub.ub_fbiwrites.value.ul++;
                (void) bdev_strategy(bp);
                lwp_stat_update(LWP_STAT_OUBLK, 1);
        }
        error = biowait(bp);
        pageio_done(bp);
        fbrelse(fbp, S_OTHER);
        return (error);
}

/*
 * Write the ufs superblock only.
 */
void
ufs_sbwrite(struct ufsvfs *ufsvfsp)
{
        char sav_fs_fmod;
        struct fs *fs = ufsvfsp->vfs_fs;
        struct buf *bp = ufsvfsp->vfs_bufp;

        ASSERT(MUTEX_HELD(&ufsvfsp->vfs_lock));

        /*
         * For ulockfs processing, limit the superblock writes.
         */
        if ((ufsvfsp->vfs_ulockfs.ul_sbowner) &&
            (curthread != ufsvfsp->vfs_ulockfs.ul_sbowner)) {
                /* try again later */
                fs->fs_fmod = 1;
                return;
        }

        ULOCKFS_SET_MOD((&ufsvfsp->vfs_ulockfs));
        /*
         * Update the superblock timestamp and the fs_clean checksum.
         * If marked FSBAD, we always want an erroneous checksum to
         * force repair.
         */
        fs->fs_time = gethrestime_sec();
        fs->fs_state = (fs->fs_clean != FSBAD) ?
            FSOKAY - fs->fs_time : -(FSOKAY - fs->fs_time);
        switch (fs->fs_clean) {
        case FSCLEAN:
        case FSSTABLE:
                fs->fs_reclaim &= ~FS_RECLAIM;
                break;
        case FSACTIVE:
        case FSSUSPEND:
        case FSBAD:
        case FSLOG:
                break;
        default:
                fs->fs_clean = FSACTIVE;
                break;
        }
        /*
         * Reset incore-only bits.
         */
        fs->fs_reclaim &= ~(FS_CHECKCLEAN | FS_CHECKRECLAIM);

        /*
         * Delta the whole superblock.
         */
        TRANS_DELTA(ufsvfsp, ldbtob(SBLOCK), sizeof (struct fs),
            DT_SB, NULL, 0);
        /*
         * Retain the incore state of fs_fmod; set the ondisk state to 0.
         */
        sav_fs_fmod = fs->fs_fmod;
        fs->fs_fmod = 0;

        /*
         * Don't release the buffer after it is written to the disk.
         */
        UFS_BWRITE2(ufsvfsp, bp);
        fs->fs_fmod = sav_fs_fmod;   /* reset fs_fmod's incore state */
}

/*
 * Returns the vfs pointer, with the vfs lock held, if the vfs is still
 * mounted.  Otherwise, returns NULL.
 *
 * For our purposes, "still mounted" means that the file system still appears
 * on the list of UFS file system instances.
 */
static vfs_t *
still_mounted(struct check_node *checkp)
{
        struct vfs      *vfsp;
        struct ufsvfs   *ufsp;

        mutex_enter(&ufsvfs_mutex);
        for (ufsp = ufs_instances; ufsp != NULL; ufsp = ufsp->vfs_next) {
                if (ufsp != checkp->ufsvfs)
                        continue;
                /*
                 * Tentative match:  verify it and try to lock.  (It's not at
                 * all clear how the verification could fail, given that we've
                 * gotten this far.  We would have had to reallocate the
                 * ufsvfs struct at hand for a new incarnation; is that really
                 * possible in the interval from constructing the check_node
                 * to here?)
                 */
                vfsp = ufsp->vfs_vfs;
                if (vfsp != checkp->vfsp)
                        continue;
                if (vfsp->vfs_dev != checkp->vfs_dev)
                        continue;
                if (vfs_lock(vfsp) != 0)
                        continue;

                mutex_exit(&ufsvfs_mutex);
                return (vfsp);
        }
        mutex_exit(&ufsvfs_mutex);
        return (NULL);
}

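/*
 * I/O completion routine for the summary-info buffers read and written
 * below: simply wake up the thread sleeping in sema_p(&bp->b_io).
 */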
int
ufs_si_io_done(struct buf *bp)
{
        sema_v(&bp->b_io);
        return (0);
}

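/*
 * Scratch buffers for reading cylinder group headers: each holds one cg
 * header rounded up to a device block, and up to NSIBUF of them are kept
 * in flight at once by ufs_construct_si().
 */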
#define SI_BUFSZ roundup(sizeof (struct cg), DEV_BSIZE)
#define NSIBUF 32

/*
 * ufs_construct_si()
 * Read each cylinder group in turn and construct the summary information
 */
static int
ufs_construct_si(dev_t dev, struct fs *fs, struct ufsvfs *ufsvfsp)
{
        buf_t *bps, *bp;
        char *bufs;
        struct csum *sip = fs->fs_u.fs_csp;
        struct cg *cgp;
        int i, ncg;
        int error = 0, cg = 0;

        bps = kmem_alloc(NSIBUF * sizeof (buf_t), KM_SLEEP);
        bufs = kmem_alloc(NSIBUF * SI_BUFSZ, KM_SLEEP);

        /*
         * Initialise the buffer headers
         */
        for (bp = bps, i = 0; i < NSIBUF; i++, bp++) {
                bioinit(bp);
                bp->b_iodone = ufs_si_io_done;
                bp->b_bufsize = bp->b_bcount = SI_BUFSZ;
                bp->b_flags = B_READ;
                bp->b_un.b_addr = bufs + (i * SI_BUFSZ);
                bp->b_edev = dev;
        }

        /*
         * Repeat while there are cylinder groups left to read.
         */
        do {
                /*
                 * Issue up to NSIBUF asynchronous reads.
                 */
                ncg = MIN(NSIBUF, (fs->fs_ncg - cg));
                for (bp = bps, i = 0; i < ncg; i++, bp++) {
                        bp->b_blkno = (daddr_t)fsbtodb(fs, cgtod(fs, cg + i));
                        if (ufsvfsp->vfs_log) {
                                lufs_read_strategy(ufsvfsp->vfs_log, bp);
                        } else {
                                (void) bdev_strategy(bp);
                        }
                }

                /*
                 * Wait for each read to finish;
                 * check for errors and copy the csum info.
                 */
                for (bp = bps, i = 0; i < ncg; i++, bp++) {
                        sema_p(&bp->b_io);
                        if (!error) {
                                cgp = bp->b_un.b_cg;
                                sip[cg + i] = cgp->cg_cs;
                                error = geterror(bp);
                        }
                }
                if (error) {
                        goto err;
                }
                cg += ncg;
        } while (cg < fs->fs_ncg);

err:
        kmem_free(bps, NSIBUF * sizeof (buf_t));
        kmem_free(bufs, NSIBUF * SI_BUFSZ);
        return (error);
}

/*
 * ufs_getsummaryinfo
 */
int
ufs_getsummaryinfo(dev_t dev, struct ufsvfs *ufsvfsp, struct fs *fs)
{
        int             i;              /* `for' loop counter */
        ssize_t         size;           /* bytes of summary info to read */
        daddr_t         frags;          /* frags of summary info to read */
        caddr_t         sip;            /* summary info */
        struct buf      *tp;            /* tmp buf */

        /*
         * maintain metadata map for trans device (debug only)
         */
        TRANS_MATA_SI(ufsvfsp, fs);

        /*
         * Compute #frags and allocate space for summary info
         */
        frags = howmany(fs->fs_cssize, fs->fs_fsize);
        sip = kmem_alloc((size_t)fs->fs_cssize, KM_SLEEP);
        fs->fs_u.fs_csp = (struct csum *)sip;

        if (fs->fs_si == FS_SI_BAD) {
                /*
                 * The summary information is unknown, read it in from
                 * the cylinder groups.
                 */
                if (TRANS_ISTRANS(ufsvfsp) && !TRANS_ISERROR(ufsvfsp) &&
                    ufsvfsp->vfs_log->un_logmap) {
                        logmap_roll_dev(ufsvfsp->vfs_log); /* flush the log */
                }
                bzero(sip, (size_t)fs->fs_cssize);
                if (ufs_construct_si(dev, fs, ufsvfsp)) {
                        kmem_free(fs->fs_u.fs_csp, fs->fs_cssize);
                        fs->fs_u.fs_csp = NULL;
                        return (EIO);
                }
        } else {
                /* Read summary info a fs block at a time */
                size = fs->fs_bsize;
                for (i = 0; i < frags; i += fs->fs_frag) {
                        if (i + fs->fs_frag > frags)
                                /*
                                 * This happens only on the last iteration,
                                 * so don't worry about size being reset
                                 */
                                size = (frags - i) * fs->fs_fsize;
                        tp = UFS_BREAD(ufsvfsp, dev,
                            (daddr_t)fsbtodb(fs, fs->fs_csaddr + i), size);
                        tp->b_flags |= B_STALE | B_AGE;
                        if (tp->b_flags & B_ERROR) {
                                kmem_free(fs->fs_u.fs_csp, fs->fs_cssize);
                                fs->fs_u.fs_csp = NULL;
                                brelse(tp);
                                return (EIO);
                        }
                        bcopy(tp->b_un.b_addr, sip, size);
                        sip += size;
                        brelse(tp);
                }
        }
        bzero((caddr_t)&fs->fs_cstotal, sizeof (fs->fs_cstotal));
        for (i = 0; i < fs->fs_ncg; ++i) {
                fs->fs_cstotal.cs_ndir += fs->fs_cs(fs, i).cs_ndir;
                fs->fs_cstotal.cs_nbfree += fs->fs_cs(fs, i).cs_nbfree;
                fs->fs_cstotal.cs_nifree += fs->fs_cs(fs, i).cs_nifree;
                fs->fs_cstotal.cs_nffree += fs->fs_cs(fs, i).cs_nffree;
        }
        return (0);
}

/*
 * ufs_putsummaryinfo() stores all the cylinder group summary information.
 * It is only used when logging, but the file system may not be logging at
 * the time; e.g., a read-only mount to flush the log may push the summary
 * info out.
 */
int
ufs_putsummaryinfo(dev_t dev, struct ufsvfs *ufsvfsp, struct fs *fs)
{
        struct buf      b, *bp;         /* tmp buf */
        caddr_t         sip;            /* summary info */
        ssize_t         size;           /* bytes of summary info to write */
        daddr_t         frags;          /* frags of summary info to write */
        int             i;              /* `for' loop counter */
        int             error;          /* error */

        if (TRANS_ISERROR(ufsvfsp)) {
                return (EIO);
        }

        if ((fs->fs_si != FS_SI_BAD) || !ufsvfsp->vfs_nolog_si) {
                return (0);
        }

        bp = &b;
        bioinit(bp);
        bp->b_iodone = ufs_si_io_done;
        bp->b_bufsize = size = fs->fs_bsize;
        bp->b_flags = B_WRITE;
        bp->b_un.b_addr = kmem_alloc(size, KM_SLEEP);
        bp->b_edev = dev;
        frags = howmany(fs->fs_cssize, fs->fs_fsize);
        sip = (caddr_t)fs->fs_u.fs_csp;

        /* Write summary info one fs block at a time */
        for (error = 0, i = 0; (i < frags) && (error == 0); i += fs->fs_frag) {
                if (i + fs->fs_frag > frags) {
                        /*
                         * This happens only on the last iteration, so
                         * don't worry about size being reset
                         */
                        size = (frags - i) * fs->fs_fsize;
                }
                bcopy(sip, bp->b_un.b_addr, size);
                bp->b_blkno = (daddr_t)fsbtodb(fs, fs->fs_csaddr + i);
                bp->b_bcount = size;
                (void) bdev_strategy(bp);
                sema_p(&bp->b_io); /* wait for write to complete */
                error = geterror(bp);
                sip += size;
        }
        kmem_free(bp->b_un.b_addr, fs->fs_bsize);
        if (!error) {
                fs->fs_si = FS_SI_OK;
        }
        return (error);
}

/*
 * Decide whether it is okay to remove within a sticky directory.  Write
 * access to the directory is needed, but in sticky directories write
 * access alone is not sufficient: you can remove entries from the
 * directory only if you own the directory, if you are privileged, if you
 * own the entry, or if the entry is a plain file and you have write
 * access to that file.
 * Function returns 0 if remove access is granted.
 * Note, the caller is responsible for holding the i_contents lock
 * at least as reader on the inquired inode 'ip'.
 */
int
ufs_sticky_remove_access(struct inode *dp, struct inode *ip, struct cred *cr)
{
        uid_t uid;

        ASSERT(RW_LOCK_HELD(&ip->i_contents));

        if ((dp->i_mode & ISVTX) &&
            (uid = crgetuid(cr)) != dp->i_uid &&
            uid != ip->i_uid &&
            ((ip->i_mode & IFMT) != IFREG ||
            ufs_iaccess(ip, IWRITE, cr, 0) != 0))
                return (secpolicy_vnode_remove(cr));

        return (0);
}
#endif  /* _KERNEL */

extern  int around[9];
extern  int inside[9];
extern  uchar_t *fragtbl[];

/*
 * Update the frsum fields to reflect addition or deletion
 * of some frags.
 */
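/*
 * A set bit in fragmap means the corresponding fragment is free.  The
 * around[siz]/inside[siz] masks match a window of siz free fragments
 * bounded by allocated fragments; fragmap is shifted left one bit so a
 * block boundary acts as an allocated neighbor.  For example (assuming
 * fs_frag == 8), a fragmap of 0x3c holds a single run of four free
 * fragments, so fraglist[4] is adjusted by cnt; a fully free block is
 * accounted as a free block elsewhere, not here.
 */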
void
fragacct(struct fs *fs, int fragmap, int32_t *fraglist, int cnt)
{
        int inblk;
        int field, subfield;
        int siz, pos;

        /*
         * ufsvfsp->vfs_lock is held when calling this.
         */
        inblk = (int)(fragtbl[fs->fs_frag][fragmap]) << 1;
        fragmap <<= 1;
        for (siz = 1; siz < fs->fs_frag; siz++) {
                if ((inblk & (1 << (siz + (fs->fs_frag % NBBY)))) == 0)
                        continue;
                field = around[siz];
                subfield = inside[siz];
                for (pos = siz; pos <= fs->fs_frag; pos++) {
                        if ((fragmap & field) == subfield) {
                                fraglist[siz] += cnt;
                                ASSERT(fraglist[siz] >= 0);
                                pos += siz;
                                field <<= siz;
                                subfield <<= siz;
                        }
                        field <<= 1;
                        subfield <<= 1;
                }
        }
}

/*
 * Block operations
 */
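
/*
 * The maps handled below are the cylinder group fragment bitmaps, in
 * which a set bit marks a free fragment.  'h' is a block number within
 * the cylinder group, and each block is fs_frag consecutive bits.
 */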

/*
 * Check if a block is available
 */
int
isblock(struct fs *fs, uchar_t *cp, daddr_t h)
{
        uchar_t mask;

        ASSERT(fs->fs_frag == 8 || fs->fs_frag == 4 || fs->fs_frag == 2 || \
            fs->fs_frag == 1);
        /*
         * ufsvfsp->vfs_lock is held when calling this.
         */
        switch ((int)fs->fs_frag) {
        case 8:
                return (cp[h] == 0xff);
        case 4:
                mask = 0x0f << ((h & 0x1) << 2);
                return ((cp[h >> 1] & mask) == mask);
        case 2:
                mask = 0x03 << ((h & 0x3) << 1);
                return ((cp[h >> 2] & mask) == mask);
        case 1:
                mask = 0x01 << (h & 0x7);
                return ((cp[h >> 3] & mask) == mask);
        default:
#ifndef _KERNEL
                cmn_err(CE_PANIC, "isblock: illegal fs->fs_frag value (%d)",
                    fs->fs_frag);
#endif /* _KERNEL */
                return (0);
        }
}

1541 
1542 /*
1543  * Take a block out of the map
1544  */
1545 void
1546 clrblock(struct fs *fs, uchar_t *cp, daddr_t h)
1547 {
1548         ASSERT(fs->fs_frag == 8 || fs->fs_frag == 4 || fs->fs_frag == 2 || \
1549             fs->fs_frag == 1);
1550         /*
1551          * ufsvfsp->vfs_lock is held when calling this.
1552          */
1553         switch ((int)fs->fs_frag) {
1554         case 8:
1555                 cp[h] = 0;
1556                 return;
1557         case 4:
1558                 cp[h >> 1] &= ~(0x0f << ((h & 0x1) << 2));
1559                 return;
1560         case 2:
1561                 cp[h >> 2] &= ~(0x03 << ((h & 0x3) << 1));
1562                 return;
1563         case 1:
1564                 cp[h >> 3] &= ~(0x01 << (h & 0x7));
1565                 return;
1566         default:
1567 #ifndef _KERNEL
1568                 cmn_err(CE_PANIC, "clrblock: illegal fs->fs_frag value (%d)",
1569                     fs->fs_frag);
1570 #endif /* _KERNEL */
1571                 return;
1572         }
1573 }
1574 
1575 /*
1576  * Is block allocated?
1577  */
1578 int
1579 isclrblock(struct fs *fs, uchar_t *cp, daddr_t h)
1580 {
1581         uchar_t mask;
1582         int     frag;
1583         /*
1584          * ufsvfsp->vfs_lock is held when calling this.
1585          */
1586         frag = fs->fs_frag;
1587         ASSERT(frag == 8 || frag == 4 || frag == 2 || frag == 1);
1588         switch (frag) {
1589         case 8:
1590                 return (cp[h] == 0);
1591         case 4:
1592                 mask = ~(0x0f << ((h & 0x1) << 2));
1593                 return (cp[h >> 1] == (cp[h >> 1] & mask));
1594         case 2:
1595                 mask =  ~(0x03 << ((h & 0x3) << 1));
1596                 return (cp[h >> 2] == (cp[h >> 2] & mask));
1597         case 1:
1598                 mask = ~(0x01 << (h & 0x7));
1599                 return (cp[h >> 3] == (cp[h >> 3] & mask));
1600         default:
1601 #ifndef _KERNEL
1602                 cmn_err(CE_PANIC, "isclrblock: illegal fs->fs_frag value (%d)",
1603                     fs->fs_frag);
1604 #endif /* _KERNEL */
1605                 break;
1606         }
1607         return (0);
1608 }
1609 
1610 /*
1611  * Put a block into the map
1612  */
1613 void
1614 setblock(struct fs *fs, uchar_t *cp, daddr_t h)
1615 {
1616         ASSERT(fs->fs_frag == 8 || fs->fs_frag == 4 || fs->fs_frag == 2 || \
1617             fs->fs_frag == 1);
1618         /*
1619          * ufsvfsp->vfs_lock is held when calling this.
1620          */
1621         switch ((int)fs->fs_frag) {
1622         case 8:
1623                 cp[h] = 0xff;
1624                 return;
1625         case 4:
1626                 cp[h >> 1] |= (0x0f << ((h & 0x1) << 2));
1627                 return;
1628         case 2:
1629                 cp[h >> 2] |= (0x03 << ((h & 0x3) << 1));
1630                 return;
1631         case 1:
1632                 cp[h >> 3] |= (0x01 << (h & 0x7));
1633                 return;
1634         default:
1635 #ifndef _KERNEL
1636                 cmn_err(CE_PANIC, "setblock: illegal fs->fs_frag value (%d)",
1637                     fs->fs_frag);
1638 #endif /* _KERNEL */
1639                 return;
1640         }
1641 }
1642 
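/*
 * Skip bytes equal to 'c': return the number of bytes remaining,
 * counted from the first byte that differs from 'c', or 0 if all
 * 'len' bytes match.
 */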
int
skpc(char c, uint_t len, char *cp)
{
        if (len == 0)
                return (0);
        while (*cp++ == c && --len)
                ;
        return (len);
}