6660 ufs may read too many indirect blocks, flush a random block to disk
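The change: ufs_sync_indir() flushes the inode's indirect-block array
ip->i_ib[], which has exactly NIADDR (3) entries -- single, double and
triple indirect. The old loop bound (i <= NIADDR) read one element past
the end of the array, so blkflush() could be handed whatever disk
address happened to follow i_ib in the in-core inode, flushing an
unrelated ("random") block to disk. The one-line fix at old line 605
tightens the bound to i < NIADDR.

A minimal sketch of the off-by-one; i_ib and NIADDR are as declared by
UFS, while flush() merely stands in for blkflush() here:

	daddr32_t i_ib[NIADDR];		/* NIADDR == 3 */

	for (i = 0; i <= NIADDR; i++)	/* broken: i == 3 touches */
		flush(i_ib[i]);		/* i_ib[3], past the array */

	for (i = 0; i < NIADDR; i++)	/* fixed: i_ib[0..2] only */
		flush(i_ib[i]);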
--- old/usr/src/uts/common/fs/ufs/ufs_subr.c
+++ new/usr/src/uts/common/fs/ufs/ufs_subr.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 */
25 25
26 26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
27 27 /* All Rights Reserved */
28 28
29 29 /*
30 30 * University Copyright- Copyright (c) 1982, 1986, 1988
31 31 * The Regents of the University of California
32 32 * All Rights Reserved
33 33 *
34 34 * University Acknowledgment- Portions of this document are derived from
35 35 * software developed by the University of California, Berkeley, and its
36 36 * contributors.
37 37 */
38 38
39 39 #include <sys/types.h>
40 40 #include <sys/t_lock.h>
41 41 #include <sys/param.h>
42 42 #include <sys/time.h>
43 43 #include <sys/fs/ufs_fs.h>
44 44 #include <sys/cmn_err.h>
45 45
46 46 #ifdef _KERNEL
47 47
48 48 #include <sys/systm.h>
49 49 #include <sys/sysmacros.h>
50 50 #include <sys/buf.h>
51 51 #include <sys/conf.h>
52 52 #include <sys/user.h>
53 53 #include <sys/var.h>
54 54 #include <sys/vfs.h>
55 55 #include <sys/vnode.h>
56 56 #include <sys/proc.h>
57 57 #include <sys/debug.h>
58 58 #include <sys/fssnap_if.h>
59 59 #include <sys/fs/ufs_inode.h>
60 60 #include <sys/fs/ufs_trans.h>
61 61 #include <sys/fs/ufs_panic.h>
62 62 #include <sys/fs/ufs_bio.h>
63 63 #include <sys/fs/ufs_log.h>
64 64 #include <sys/kmem.h>
65 65 #include <sys/policy.h>
66 66 #include <vm/hat.h>
67 67 #include <vm/as.h>
68 68 #include <vm/seg.h>
69 69 #include <vm/pvn.h>
70 70 #include <vm/seg_map.h>
71 71 #include <sys/swap.h>
72 72 #include <vm/seg_kmem.h>
73 73
74 74 #else /* _KERNEL */
75 75
76 76 #define ASSERT(x) /* don't use asserts for fsck et al */
77 77
78 78 #endif /* _KERNEL */
79 79
80 80 #ifdef _KERNEL
81 81
82 82 /*
83 83 * Used to verify that a given entry on the ufs_instances list (see below)
84 84 * still refers to a mounted file system.
85 85 *
86 86 * XXX: This is a crock that substitutes for proper locking to coordinate
87 87 * updates to and uses of the entries in ufs_instances.
88 88 */
89 89 struct check_node {
90 90 struct vfs *vfsp;
91 91 struct ufsvfs *ufsvfs;
92 92 dev_t vfs_dev;
93 93 };
94 94
95 95 static vfs_t *still_mounted(struct check_node *);
96 96
97 97 /*
98 98 * All ufs file system instances are linked together into a list starting at
99 99 * ufs_instances. The list is updated as part of mount and unmount. It's
100 100 * consulted in ufs_update, to allow syncing out all ufs file system instances
101 101 * in a batch.
102 102 *
103 103 * ufsvfs_mutex guards access to this list and to the {,old}ufsvfslist
104 104 * manipulated in ufs_funmount_cleanup. (A given ufs instance is always on
105 105 * exactly one of these lists except while it's being allocated or
106 106 * deallocated.)
107 107 */
108 108 struct ufsvfs *ufs_instances;
109 109 extern kmutex_t ufsvfs_mutex; /* XXX: move this to ufs_inode.h? */
110 110
111 111 /*
112 112 * ufsvfs list manipulation routines
113 113 */
114 114
115 115 /*
116 116 * Link ufsp in at the head of the list of ufs_instances.
117 117 */
118 118 void
119 119 ufs_vfs_add(struct ufsvfs *ufsp)
120 120 {
121 121 mutex_enter(&ufsvfs_mutex);
122 122 ufsp->vfs_next = ufs_instances;
123 123 ufs_instances = ufsp;
124 124 mutex_exit(&ufsvfs_mutex);
125 125 }
126 126
127 127 /*
128 128 * Remove ufsp from the list of ufs_instances.
129 129 *
130 130 * Does no error checking; ufsp is assumed to actually be on the list.
131 131 */
132 132 void
133 133 ufs_vfs_remove(struct ufsvfs *ufsp)
134 134 {
135 135 struct ufsvfs **delpt = &ufs_instances;
136 136
137 137 mutex_enter(&ufsvfs_mutex);
138 138 for (; *delpt != NULL; delpt = &((*delpt)->vfs_next)) {
139 139 if (*delpt == ufsp) {
140 140 *delpt = ufsp->vfs_next;
141 141 ufsp->vfs_next = NULL;
142 142 break;
143 143 }
144 144 }
145 145 mutex_exit(&ufsvfs_mutex);
146 146 }
147 147
148 148 /*
149 149 * Clean up state resulting from a forcible unmount that couldn't be handled
150 150 * directly during the unmount. (See commentary in the unmount code for more
151 151 * info.)
152 152 */
153 153 static void
154 154 ufs_funmount_cleanup()
155 155 {
156 156 struct ufsvfs *ufsvfsp;
157 157 extern struct ufsvfs *oldufsvfslist, *ufsvfslist;
158 158
159 159 /*
160 160 * Assumption: it's now safe to blow away the entries on
161 161 * oldufsvfslist.
162 162 */
163 163 mutex_enter(&ufsvfs_mutex);
164 164 while ((ufsvfsp = oldufsvfslist) != NULL) {
165 165 oldufsvfslist = ufsvfsp->vfs_next;
166 166
167 167 mutex_destroy(&ufsvfsp->vfs_lock);
168 168 kmem_free(ufsvfsp, sizeof (struct ufsvfs));
169 169 }
170 170 /*
171 171 * Rotate more recent unmount entries into place in preparation for
172 172 * the next time around.
173 173 */
174 174 oldufsvfslist = ufsvfslist;
175 175 ufsvfslist = NULL;
176 176 mutex_exit(&ufsvfs_mutex);
177 177 }
178 178
179 179
180 180 /*
181 181 * ufs_update performs the ufs part of `sync'. It goes through the disk
182 182 * queues to initiate sandbagged IO; goes through the inodes to write
183 183 * modified nodes; and it goes through the mount table to initiate
184 184 * the writing of the modified super blocks.
185 185 */
186 186 extern time_t time;
187 187 time_t ufs_sync_time;
188 188 time_t ufs_sync_time_secs = 1;
189 189
190 190 extern kmutex_t ufs_scan_lock;
191 191
192 192 void
193 193 ufs_update(int flag)
194 194 {
195 195 struct vfs *vfsp;
196 196 struct fs *fs;
197 197 struct ufsvfs *ufsp;
198 198 struct ufsvfs *ufsnext;
199 199 struct ufsvfs *update_list = NULL;
200 200 int check_cnt = 0;
201 201 size_t check_size;
202 202 struct check_node *check_list, *ptr;
203 203 int cheap = flag & SYNC_ATTR;
204 204
205 205 /*
206 206 * This is a hack. A design flaw in the forced unmount protocol
207 207 * could allow a thread to attempt to use a kmem_freed ufsvfs
208 208 * structure in ufs_lockfs_begin/ufs_check_lockfs. This window
209 209 * is difficult to hit, even during the lockfs stress tests.
210 210 * So the hacky fix is to wait awhile before kmem_free'ing the
211 211 * ufsvfs structures for forcibly unmounted file systems. `Awhile'
212 212 * is defined as every other call from fsflush (~60 seconds).
213 213 */
214 214 if (cheap)
215 215 ufs_funmount_cleanup();
216 216
217 217 /*
218 218 * Examine all ufsvfs structures and add those that we can lock to the
219 219 * update list. This is so that we don't hold the list lock for a
220 220 * long time. If vfs_lock fails for a file system instance, then skip
221 221 	 * it because somebody is doing an unmount on it.
222 222 */
223 223 mutex_enter(&ufsvfs_mutex);
224 224 for (ufsp = ufs_instances; ufsp != NULL; ufsp = ufsp->vfs_next) {
225 225 vfsp = ufsp->vfs_vfs;
226 226 if (vfs_lock(vfsp) != 0)
227 227 continue;
228 228 ufsp->vfs_wnext = update_list;
229 229 update_list = ufsp;
230 230 check_cnt++;
231 231 }
232 232 mutex_exit(&ufsvfs_mutex);
233 233
234 234 if (update_list == NULL)
235 235 return;
236 236
237 237 check_size = sizeof (struct check_node) * check_cnt;
238 238 check_list = ptr = kmem_alloc(check_size, KM_NOSLEEP);
239 239
240 240 /*
241 241 * Write back modified superblocks.
242 242 * Consistency check that the superblock of
243 243 * each file system is still in the buffer cache.
244 244 *
245 245 * Note that the update_list traversal is done without the protection
246 246 * of an overall list lock, so it's necessary to rely on the fact that
247 247 * each entry of the list is vfs_locked when moving from one entry to
248 248 * the next. This works because a concurrent attempt to add an entry
249 249 * to another thread's update_list won't find it, since it'll already
250 250 * be locked.
251 251 */
252 252 check_cnt = 0;
253 253 for (ufsp = update_list; ufsp != NULL; ufsp = ufsnext) {
254 254 /*
255 255 * Need to grab the next ptr before we unlock this one so
256 256 * another thread doesn't grab it and change it before we move
257 257 * on to the next vfs. (Once we unlock it, it's ok if another
258 258 * thread finds it to add it to its own update_list; we don't
259 259 * attempt to refer to it through our list any more.)
260 260 */
261 261 ufsnext = ufsp->vfs_wnext;
262 262 vfsp = ufsp->vfs_vfs;
263 263
264 264 /*
265 265 * Seems like this can't happen, so perhaps it should become
266 266 * an ASSERT(vfsp->vfs_data != NULL).
267 267 */
268 268 if (!vfsp->vfs_data) {
269 269 vfs_unlock(vfsp);
270 270 continue;
271 271 }
272 272
273 273 fs = ufsp->vfs_fs;
274 274
275 275 /*
276 276 * don't update a locked superblock during a panic; it
277 277 * may be in an inconsistent state
278 278 */
279 279 if (panicstr) {
280 280 if (!mutex_tryenter(&ufsp->vfs_lock)) {
281 281 vfs_unlock(vfsp);
282 282 continue;
283 283 }
284 284 } else
285 285 mutex_enter(&ufsp->vfs_lock);
286 286 /*
287 287 * Build up the STABLE check list, so we can unlock the vfs
288 288 * until we do the actual checking.
289 289 */
290 290 if (check_list != NULL) {
291 291 if ((fs->fs_ronly == 0) &&
292 292 (fs->fs_clean != FSBAD) &&
293 293 (fs->fs_clean != FSSUSPEND)) {
294 294 ptr->vfsp = vfsp;
295 295 ptr->ufsvfs = ufsp;
296 296 ptr->vfs_dev = vfsp->vfs_dev;
297 297 ptr++;
298 298 check_cnt++;
299 299 }
300 300 }
301 301
302 302 /*
303 303 * superblock is not modified
304 304 */
305 305 if (fs->fs_fmod == 0) {
306 306 mutex_exit(&ufsp->vfs_lock);
307 307 vfs_unlock(vfsp);
308 308 continue;
309 309 }
310 310 if (fs->fs_ronly != 0) {
311 311 mutex_exit(&ufsp->vfs_lock);
312 312 vfs_unlock(vfsp);
313 313 (void) ufs_fault(ufsp->vfs_root,
314 314 "fs = %s update: ro fs mod\n", fs->fs_fsmnt);
315 315 /*
316 316 * XXX: Why is this a return instead of a continue?
317 317 * This may be an attempt to replace a panic with
318 318 * something less drastic, but there's cleanup we
319 319 * should be doing that's not being done (e.g.,
320 320 * unlocking the remaining entries on the list).
321 321 */
322 322 return;
323 323 }
324 324 fs->fs_fmod = 0;
325 325 mutex_exit(&ufsp->vfs_lock);
326 326 TRANS_SBUPDATE(ufsp, vfsp, TOP_SBUPDATE_UPDATE);
327 327 vfs_unlock(vfsp);
328 328 }
329 329
330 330 ufs_sync_time = time;
331 331
332 332 /*
333 333 * Avoid racing with ufs_unmount() and ufs_sync().
334 334 */
335 335 mutex_enter(&ufs_scan_lock);
336 336
337 337 (void) ufs_scan_inodes(1, ufs_sync_inode, (void *)(uintptr_t)cheap,
338 338 NULL);
339 339
340 340 mutex_exit(&ufs_scan_lock);
341 341
342 342 /*
343 343 * Force stale buffer cache information to be flushed,
344 344 * for all devices. This should cause any remaining control
345 345 * information (e.g., cg and inode info) to be flushed back.
346 346 */
347 347 bflush((dev_t)NODEV);
348 348
349 349 if (check_list == NULL)
350 350 return;
351 351
352 352 /*
353 353 * For each UFS filesystem in the STABLE check_list, update
354 354 * the clean flag if warranted.
355 355 */
356 356 for (ptr = check_list; check_cnt > 0; check_cnt--, ptr++) {
357 357 int error;
358 358
359 359 /*
360 360 * still_mounted() returns with vfsp and the vfs_reflock
361 361 * held if ptr refers to a vfs that is still mounted.
362 362 */
363 363 if ((vfsp = still_mounted(ptr)) == NULL)
364 364 continue;
365 365 ufs_checkclean(vfsp);
366 366 /*
367 367 * commit any outstanding async transactions
368 368 */
369 369 ufsp = (struct ufsvfs *)vfsp->vfs_data;
370 370 curthread->t_flag |= T_DONTBLOCK;
371 371 TRANS_BEGIN_SYNC(ufsp, TOP_COMMIT_UPDATE, TOP_COMMIT_SIZE,
372 372 error);
373 373 if (!error) {
374 374 TRANS_END_SYNC(ufsp, error, TOP_COMMIT_UPDATE,
375 375 TOP_COMMIT_SIZE);
376 376 }
377 377 curthread->t_flag &= ~T_DONTBLOCK;
378 378
379 379 vfs_unlock(vfsp);
380 380 }
381 381
382 382 kmem_free(check_list, check_size);
383 383 }
384 384
385 385 int
386 386 ufs_sync_inode(struct inode *ip, void *arg)
387 387 {
388 388 int cheap = (int)(uintptr_t)arg;
389 389 struct ufsvfs *ufsvfsp;
390 390 uint_t flag = ip->i_flag;
391 391
392 392 if (cheap && ((flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) == 0))
393 393 return (0);
394 394
395 395 /*
396 396 	 * if we are panic'ing, then don't update the inode if this
397 397 * file system is FSSTABLE. Otherwise, we would have to
398 398 * force the superblock to FSACTIVE and the superblock
399 399 * may not be in a good state. Also, if the inode is
400 400 * IREF'ed then it may be in an inconsistent state. Don't
401 401 * push it. Finally, don't push the inode if the fs is
402 402 * logging; the transaction will be discarded at boot.
403 403 */
404 404 if (panicstr) {
405 405
406 406 if (flag & IREF)
407 407 return (0);
408 408
409 409 if (ip->i_ufsvfs == NULL ||
410 410 (ip->i_fs->fs_clean == FSSTABLE ||
411 411 ip->i_fs->fs_clean == FSLOG))
412 412 return (0);
413 413 }
414 414
415 415 ufsvfsp = ip->i_ufsvfs;
416 416
417 417 /*
418 418 	 * Limit access-time-only updates
419 419 */
420 420 if (((flag & (IMOD|IMODACC|IUPD|ICHG|IACC)) == IMODACC) && ufsvfsp) {
421 421 /*
422 422 * if file system has deferred access time turned on and there
423 423 * was no IO recently, don't bother flushing it. It will be
424 424 * flushed when I/Os start again.
425 425 */
426 426 if (cheap && (ufsvfsp->vfs_dfritime & UFS_DFRATIME) &&
427 427 (ufsvfsp->vfs_iotstamp + ufs_iowait < ddi_get_lbolt()))
428 428 return (0);
429 429 /*
430 430 	 * an app issuing a sync() can take forever on a trans device
431 431 	 * when NetWorker or find is running because all of the
432 432 	 * directories' access times have to be updated. So, we limit
433 433 * the time we spend updating access times per sync.
434 434 */
435 435 if (TRANS_ISTRANS(ufsvfsp) && ((ufs_sync_time +
436 436 ufs_sync_time_secs) < time))
437 437 return (0);
438 438 }
439 439
440 440 /*
441 441 * if we are running on behalf of the flush thread or this is
442 442 	 * a swap file, then simply do a delayed update of the inode.
443 443 * Otherwise, push the pages and then do a delayed inode update.
444 444 */
445 445 if (cheap || IS_SWAPVP(ITOV(ip))) {
446 446 TRANS_IUPDAT(ip, 0);
447 447 } else {
448 448 (void) TRANS_SYNCIP(ip, B_ASYNC, I_ASYNC, TOP_SYNCIP_SYNC);
449 449 }
450 450 return (0);
451 451 }
452 452
453 453 /*
454 454 * Flush all the pages associated with an inode using the given 'flags',
455 455 * then force inode information to be written back using the given 'waitfor'.
456 456 */
457 457 int
458 458 ufs_syncip(struct inode *ip, int flags, int waitfor, top_t topid)
459 459 {
460 460 int error;
461 461 struct vnode *vp = ITOV(ip);
462 462 struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
463 463 int dotrans = 0;
464 464
465 465 /*
466 466 * Return if file system has been forcibly umounted.
467 467 */
468 468 if (ufsvfsp == NULL)
469 469 return (EIO);
470 470 /*
471 471 * don't need to VOP_PUTPAGE if there are no pages
472 472 */
473 473 if (!vn_has_cached_data(vp) || vp->v_type == VCHR) {
474 474 error = 0;
475 475 } else {
476 476 /*
477 477 * if the inode we're working on is a shadow inode
478 478 * or quota inode we need to make sure that the
479 479 * ufs_putpage call is inside a transaction as this
480 480 * could include meta data changes.
481 481 */
482 482 if ((ip->i_mode & IFMT) == IFSHAD ||
483 483 ufsvfsp->vfs_qinod == ip) {
484 484 dotrans = 1;
485 485 curthread->t_flag |= T_DONTBLOCK;
486 486 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_PUTPAGE,
487 487 TOP_PUTPAGE_SIZE(ip));
488 488 }
489 489 error = VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
490 490 flags, CRED(), NULL);
491 491 if (dotrans) {
492 492 TRANS_END_ASYNC(ufsvfsp, TOP_PUTPAGE,
493 493 TOP_PUTPAGE_SIZE(ip));
494 494 curthread->t_flag &= ~T_DONTBLOCK;
495 495 dotrans = 0;
496 496 }
497 497 }
498 498 if (panicstr && TRANS_ISTRANS(ufsvfsp))
499 499 goto out;
500 500 /*
501 501 * waitfor represents two things -
502 502 * 1. whether data sync or file sync.
503 503 * 2. if file sync then ufs_iupdat should 'waitfor' disk i/o or not.
504 504 */
505 505 if (waitfor == I_DSYNC) {
506 506 /*
507 507 * If data sync, only IATTCHG (size/block change) requires
508 508 * inode update, fdatasync()/FDSYNC implementation.
509 509 */
510 510 if (ip->i_flag & (IBDWRITE|IATTCHG)) {
511 511 /*
512 512 * Enter a transaction to provide mutual exclusion
513 513 * with deltamap_push and avoid a race where
514 514 * the inode flush could get dropped.
515 515 */
516 516 if ((curthread->t_flag & T_DONTBLOCK) == 0) {
517 517 dotrans = 1;
518 518 curthread->t_flag |= T_DONTBLOCK;
519 519 TRANS_BEGIN_ASYNC(ufsvfsp, topid,
520 520 TOP_SYNCIP_SIZE);
521 521 }
522 522 rw_enter(&ip->i_contents, RW_READER);
523 523 mutex_enter(&ip->i_tlock);
524 524 ip->i_flag &= ~IMODTIME;
525 525 mutex_exit(&ip->i_tlock);
526 526 ufs_iupdat(ip, 1);
527 527 rw_exit(&ip->i_contents);
528 528 if (dotrans) {
529 529 TRANS_END_ASYNC(ufsvfsp, topid,
530 530 TOP_SYNCIP_SIZE);
531 531 curthread->t_flag &= ~T_DONTBLOCK;
532 532 }
533 533 }
534 534 } else {
535 535 /* For file sync, any inode change requires inode update */
536 536 if (ip->i_flag & (IBDWRITE|IUPD|IACC|ICHG|IMOD|IMODACC)) {
537 537 /*
538 538 * Enter a transaction to provide mutual exclusion
539 539 * with deltamap_push and avoid a race where
540 540 * the inode flush could get dropped.
541 541 */
542 542 if ((curthread->t_flag & T_DONTBLOCK) == 0) {
543 543 dotrans = 1;
544 544 curthread->t_flag |= T_DONTBLOCK;
545 545 TRANS_BEGIN_ASYNC(ufsvfsp, topid,
546 546 TOP_SYNCIP_SIZE);
547 547 }
548 548 rw_enter(&ip->i_contents, RW_READER);
549 549 mutex_enter(&ip->i_tlock);
550 550 ip->i_flag &= ~IMODTIME;
551 551 mutex_exit(&ip->i_tlock);
552 552 ufs_iupdat(ip, waitfor);
553 553 rw_exit(&ip->i_contents);
554 554 if (dotrans) {
555 555 TRANS_END_ASYNC(ufsvfsp, topid,
556 556 TOP_SYNCIP_SIZE);
557 557 curthread->t_flag &= ~T_DONTBLOCK;
558 558 }
559 559 }
560 560 }
561 561
562 562 out:
563 563 return (error);
564 564 }
565 565 /*
566 566 * Flush all indirect blocks related to an inode.
567 567 * Supports triple indirect blocks also.
568 568 */
569 569 int
570 570 ufs_sync_indir(struct inode *ip)
571 571 {
572 572 int i;
573 573 daddr_t blkno;
574 574 daddr_t lbn; /* logical blkno of last blk in file */
575 575 daddr_t clbn; /* current logical blk */
576 576 daddr32_t *bap;
577 577 struct fs *fs;
578 578 struct buf *bp;
579 579 int bsize;
580 580 struct ufsvfs *ufsvfsp;
581 581 int j;
582 582 daddr_t indirect_blkno;
583 583 daddr32_t *indirect_bap;
584 584 struct buf *indirect_bp;
585 585
586 586 ufsvfsp = ip->i_ufsvfs;
587 587 /*
588 588 * unnecessary when logging; allocation blocks are kept up-to-date
589 589 */
590 590 if (TRANS_ISTRANS(ufsvfsp))
591 591 return (0);
592 592
593 593 fs = ufsvfsp->vfs_fs;
594 594 bsize = fs->fs_bsize;
595 595 lbn = (daddr_t)lblkno(fs, ip->i_size - 1);
596 596 if (lbn < NDADDR)
597 597 return (0); /* No indirect blocks used */
598 598 if (lbn < NDADDR + NINDIR(fs)) {
599 599 /* File has one indirect block. */
600 600 blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, ip->i_ib[0]));
601 601 return (0);
602 602 }
603 603
604 604 /* Write out all the first level indirect blocks */
605 - for (i = 0; i <= NIADDR; i++) {
605 + for (i = 0; i < NIADDR; i++) {
606 606 if ((blkno = ip->i_ib[i]) == 0)
607 607 continue;
608 608 blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, blkno));
609 609 }
610 610 /* Write out second level of indirect blocks */
611 611 if ((blkno = ip->i_ib[1]) == 0)
612 612 return (0);
613 613 bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, blkno), bsize);
614 614 if (bp->b_flags & B_ERROR) {
615 615 brelse(bp);
616 616 return (EIO);
617 617 }
618 618 bap = bp->b_un.b_daddr;
619 619 clbn = NDADDR + NINDIR(fs);
620 620 for (i = 0; i < NINDIR(fs); i++) {
621 621 if (clbn > lbn)
622 622 break;
623 623 clbn += NINDIR(fs);
624 624 if ((blkno = bap[i]) == 0)
625 625 continue;
626 626 blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, blkno));
627 627 }
628 628
629 629 brelse(bp);
630 630 /* write out third level indirect blocks */
631 631
632 632 if ((blkno = ip->i_ib[2]) == 0)
633 633 return (0);
634 634
635 635 bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, blkno), bsize);
636 636 if (bp->b_flags & B_ERROR) {
637 637 brelse(bp);
638 638 return (EIO);
639 639 }
640 640 bap = bp->b_un.b_daddr;
641 641 clbn = NDADDR + NINDIR(fs) + (NINDIR(fs) * NINDIR(fs));
642 642
643 643 for (i = 0; i < NINDIR(fs); i++) {
644 644 if (clbn > lbn)
645 645 break;
646 646 if ((indirect_blkno = bap[i]) == 0)
647 647 continue;
648 648 blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, indirect_blkno));
649 649 indirect_bp = UFS_BREAD(ufsvfsp, ip->i_dev,
650 650 (daddr_t)fsbtodb(fs, indirect_blkno), bsize);
651 651 if (indirect_bp->b_flags & B_ERROR) {
652 652 brelse(indirect_bp);
653 653 brelse(bp);
654 654 return (EIO);
655 655 }
656 656 indirect_bap = indirect_bp->b_un.b_daddr;
657 657 for (j = 0; j < NINDIR(fs); j++) {
658 658 if (clbn > lbn)
659 659 break;
660 660 clbn += NINDIR(fs);
661 661 if ((blkno = indirect_bap[j]) == 0)
662 662 continue;
663 663 blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, blkno));
664 664 }
665 665 brelse(indirect_bp);
666 666 }
667 667 brelse(bp);
668 668
669 669 return (0);
670 670 }
671 671
672 672 /*
673 673 * Flush all indirect blocks related to an offset of a file.
674 674 * read/write in sync mode may have to flush indirect blocks.
675 675 */
676 676 int
677 677 ufs_indirblk_sync(struct inode *ip, offset_t off)
678 678 {
679 679 daddr_t lbn;
680 680 struct fs *fs;
681 681 struct buf *bp;
682 682 int i, j, shft;
683 683 daddr_t ob, nb, tbn;
684 684 daddr32_t *bap;
685 685 int nindirshift, nindiroffset;
686 686 struct ufsvfs *ufsvfsp;
687 687
688 688 ufsvfsp = ip->i_ufsvfs;
689 689 /*
690 690 * unnecessary when logging; allocation blocks are kept up-to-date
691 691 */
692 692 if (TRANS_ISTRANS(ufsvfsp))
693 693 return (0);
694 694
695 695 fs = ufsvfsp->vfs_fs;
696 696
697 697 lbn = (daddr_t)lblkno(fs, off);
698 698 if (lbn < 0)
699 699 return (EFBIG);
700 700
701 701 /* The first NDADDR are direct so nothing to do */
702 702 if (lbn < NDADDR)
703 703 return (0);
704 704
705 705 nindirshift = ip->i_ufsvfs->vfs_nindirshift;
706 706 nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;
707 707
708 708 /* Determine level of indirect blocks */
709 709 shft = 0;
710 710 tbn = lbn - NDADDR;
711 711 for (j = NIADDR; j > 0; j--) {
712 712 longlong_t sh;
713 713
714 714 shft += nindirshift;
715 715 sh = 1LL << shft;
716 716 if (tbn < sh)
717 717 break;
718 718 tbn -= (daddr_t)sh;
719 719 }
720 720
721 721 if (j == 0)
722 722 return (EFBIG);
723 723
724 724 if ((nb = ip->i_ib[NIADDR - j]) == 0)
725 725 return (0); /* UFS Hole */
726 726
727 727 /* Flush first level indirect block */
728 728 blkflush(ip->i_dev, fsbtodb(fs, nb));
729 729
730 730 /* Fetch through next levels */
731 731 for (; j < NIADDR; j++) {
732 732 ob = nb;
733 733 bp = UFS_BREAD(ufsvfsp,
734 734 ip->i_dev, fsbtodb(fs, ob), fs->fs_bsize);
735 735 if (bp->b_flags & B_ERROR) {
736 736 brelse(bp);
737 737 return (EIO);
738 738 }
739 739 bap = bp->b_un.b_daddr;
740 740 shft -= nindirshift; /* sh / nindir */
741 741 i = (tbn >> shft) & nindiroffset; /* (tbn /sh) & nindir */
742 742 nb = bap[i];
743 743 brelse(bp);
744 744 if (nb == 0) {
745 745 return (0); /* UFS hole */
746 746 }
747 747 blkflush(ip->i_dev, fsbtodb(fs, nb));
748 748 }
749 749 return (0);
750 750 }
751 751
752 752 #ifdef DEBUG
753 753
754 754 /*
755 755 * The bad block checking routines: ufs_indir_badblock() and ufs_badblock()
756 756 * are very expensive. It's been found from profiling that we're
757 757 * spending 6-7% of our time in ufs_badblock, and another 1-2% in
758 758 * ufs_indir_badblock. They are only called via ASSERTs (from debug kernels).
759 759 	 * In addition, from experience, no failures have been found in recent
760 760 * years. So the following tunable can be set to enable checking.
761 761 */
762 762 int ufs_badblock_checks = 0;
763 763
764 764 /*
765 765 * Check that a given indirect block contains blocks in range
766 766 */
767 767 int
768 768 ufs_indir_badblock(struct inode *ip, daddr32_t *bap)
769 769 {
770 770 int i;
771 771 int err = 0;
772 772
773 773 if (ufs_badblock_checks) {
774 774 for (i = 0; i < NINDIR(ip->i_fs) - 1; i++)
775 775 if (bap[i] != 0 && (err = ufs_badblock(ip, bap[i])))
776 776 break;
777 777 }
778 778 return (err);
779 779 }
780 780
781 781 /*
782 782 * Check that a specified block number is in range.
783 783 */
784 784 int
785 785 ufs_badblock(struct inode *ip, daddr_t bn)
786 786 {
787 787 long c;
788 788 daddr_t sum;
789 789
790 790 if (!ufs_badblock_checks)
791 791 return (0);
792 792 ASSERT(bn);
793 793 if (bn <= 0 || bn > ip->i_fs->fs_size)
794 794 return (bn);
795 795
796 796 sum = 0;
797 797 c = dtog(ip->i_fs, bn);
798 798 if (c == 0) {
799 799 sum = howmany(ip->i_fs->fs_cssize, ip->i_fs->fs_fsize);
800 800 }
801 801 /*
802 802 * if block no. is below this cylinder group,
803 803 * within the space reserved for superblock, inodes, (summary data)
804 804 * or if it is above this cylinder group
805 805 	 * then it's invalid.
806 806 * It's hard to see how we'd be outside this cyl, but let's be careful.
807 807 */
808 808 if ((bn < cgbase(ip->i_fs, c)) ||
809 809 (bn >= cgsblock(ip->i_fs, c) && bn < cgdmin(ip->i_fs, c)+sum) ||
810 810 (bn >= (unsigned)cgbase(ip->i_fs, c+1)))
811 811 return (bn);
812 812
813 813 return (0); /* not a bad block */
814 814 }
815 815
816 816 #endif /* DEBUG */
817 817
818 818 /*
819 819 	 * When i_rwlock is write-locked or has a writer pending, the inode
820 820 	 * is going to change in a way that will mark the filesystem as
821 821 	 * active, so there is no need to mark the filesystem stable now.
822 822 	 * Also, to ensure filesystem consistency during directory
823 823 	 * operations, the filesystem cannot be marked stable if the
824 824 	 * i_rwlock of a directory inode is write-locked.
825 825 */
826 826
827 827 /*
828 828 * Check for busy inodes for this filesystem.
829 829 	 * NOTE: Needs a better way to do this expensive operation in the future.
830 830 */
831 831 static void
832 832 ufs_icheck(struct ufsvfs *ufsvfsp, int *isbusyp, int *isreclaimp)
833 833 {
834 834 union ihead *ih;
835 835 struct inode *ip;
836 836 int i;
837 837 int isnottrans = !TRANS_ISTRANS(ufsvfsp);
838 838 int isbusy = *isbusyp;
839 839 int isreclaim = *isreclaimp;
840 840
841 841 for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
842 842 mutex_enter(&ih_lock[i]);
843 843 for (ip = ih->ih_chain[0];
844 844 ip != (struct inode *)ih;
845 845 ip = ip->i_forw) {
846 846 /*
847 847 * if inode is busy/modified/deleted, filesystem is busy
848 848 */
849 849 if (ip->i_ufsvfs != ufsvfsp)
850 850 continue;
851 851 if ((ip->i_flag & (IMOD | IUPD | ICHG)) ||
852 852 (RW_ISWRITER(&ip->i_rwlock)))
853 853 isbusy = 1;
854 854 if ((ip->i_nlink <= 0) && (ip->i_flag & IREF))
855 855 isreclaim = 1;
856 856 if (isbusy && (isreclaim || isnottrans))
857 857 break;
858 858 }
859 859 mutex_exit(&ih_lock[i]);
860 860 if (isbusy && (isreclaim || isnottrans))
861 861 break;
862 862 }
863 863 *isbusyp = isbusy;
864 864 *isreclaimp = isreclaim;
865 865 }
866 866
867 867 /*
868 868 * As part of the ufs 'sync' operation, this routine is called to mark
869 869 * the filesystem as STABLE if there is no modified metadata in memory.
870 870 */
871 871 void
872 872 ufs_checkclean(struct vfs *vfsp)
873 873 {
874 874 struct ufsvfs *ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
875 875 struct fs *fs = ufsvfsp->vfs_fs;
876 876 int isbusy;
877 877 int isreclaim;
878 878 int updatesb;
879 879
880 880 ASSERT(vfs_lock_held(vfsp));
881 881
882 882 /*
883 883 * filesystem is stable or cleanflag processing is disabled; do nothing
884 884 * no transitions when panic'ing
885 885 */
886 886 if (fs->fs_ronly ||
887 887 fs->fs_clean == FSBAD ||
888 888 fs->fs_clean == FSSUSPEND ||
889 889 fs->fs_clean == FSSTABLE ||
890 890 panicstr)
891 891 return;
892 892
893 893 /*
894 894 * if logging and nothing to reclaim; do nothing
895 895 */
896 896 if ((fs->fs_clean == FSLOG) &&
897 897 (((fs->fs_reclaim & FS_RECLAIM) == 0) ||
898 898 (fs->fs_reclaim & FS_RECLAIMING)))
899 899 return;
900 900
901 901 /*
902 902 * FS_CHECKCLEAN is reset if the file system goes dirty
903 903 * FS_CHECKRECLAIM is reset if a file gets deleted
904 904 */
905 905 mutex_enter(&ufsvfsp->vfs_lock);
906 906 fs->fs_reclaim |= (FS_CHECKCLEAN | FS_CHECKRECLAIM);
907 907 mutex_exit(&ufsvfsp->vfs_lock);
908 908
909 909 updatesb = 0;
910 910
911 911 /*
912 912 * if logging or buffers are busy; do nothing
913 913 */
914 914 isbusy = isreclaim = 0;
915 915 if ((fs->fs_clean == FSLOG) ||
916 916 (bcheck(vfsp->vfs_dev, ufsvfsp->vfs_bufp)))
917 917 isbusy = 1;
918 918
919 919 /*
920 920 * isreclaim == TRUE means can't change the state of fs_reclaim
921 921 */
922 922 isreclaim =
923 923 ((fs->fs_clean == FSLOG) &&
924 924 (((fs->fs_reclaim & FS_RECLAIM) == 0) ||
925 925 (fs->fs_reclaim & FS_RECLAIMING)));
926 926
927 927 /*
928 928 * if fs is busy or can't change the state of fs_reclaim; do nothing
929 929 */
930 930 if (isbusy && isreclaim)
931 931 return;
932 932
933 933 /*
934 934 * look for busy or deleted inodes; (deleted == needs reclaim)
935 935 */
936 936 ufs_icheck(ufsvfsp, &isbusy, &isreclaim);
937 937
938 938 mutex_enter(&ufsvfsp->vfs_lock);
939 939
940 940 /*
941 941 * IF POSSIBLE, RESET RECLAIM
942 942 */
943 943 /*
944 944 * the reclaim thread is not running
945 945 */
946 946 if ((fs->fs_reclaim & FS_RECLAIMING) == 0)
947 947 /*
948 948 * no files were deleted during the scan
949 949 */
950 950 if (fs->fs_reclaim & FS_CHECKRECLAIM)
951 951 /*
952 952 * no deleted files were found in the inode cache
953 953 */
954 954 if ((isreclaim == 0) && (fs->fs_reclaim & FS_RECLAIM)) {
955 955 fs->fs_reclaim &= ~FS_RECLAIM;
956 956 updatesb = 1;
957 957 }
958 958 /*
959 959 * IF POSSIBLE, SET STABLE
960 960 */
961 961 /*
962 962 * not logging
963 963 */
964 964 if (fs->fs_clean != FSLOG)
965 965 /*
966 966 * file system has not gone dirty since the scan began
967 967 */
968 968 if (fs->fs_reclaim & FS_CHECKCLEAN)
969 969 /*
970 970 * nothing dirty was found in the buffer or inode cache
971 971 */
972 972 if ((isbusy == 0) && (isreclaim == 0) &&
973 973 (fs->fs_clean != FSSTABLE)) {
974 974 fs->fs_clean = FSSTABLE;
975 975 updatesb = 1;
976 976 }
977 977
978 978 mutex_exit(&ufsvfsp->vfs_lock);
979 979 if (updatesb) {
980 980 TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_STABLE);
981 981 }
982 982 }
983 983
984 984 /*
985 985 * called whenever an unlink occurs
986 986 */
987 987 void
988 988 ufs_setreclaim(struct inode *ip)
989 989 {
990 990 struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
991 991 struct fs *fs = ufsvfsp->vfs_fs;
992 992
993 993 if (ip->i_nlink || fs->fs_ronly || (fs->fs_clean != FSLOG))
994 994 return;
995 995
996 996 /*
997 997 * reclaim-needed bit is already set or we need to tell
998 998 * ufs_checkclean that a file has been deleted
999 999 */
1000 1000 if ((fs->fs_reclaim & (FS_RECLAIM | FS_CHECKRECLAIM)) == FS_RECLAIM)
1001 1001 return;
1002 1002
1003 1003 mutex_enter(&ufsvfsp->vfs_lock);
1004 1004 /*
1005 1005 * inform ufs_checkclean that the file system has gone dirty
1006 1006 */
1007 1007 fs->fs_reclaim &= ~FS_CHECKRECLAIM;
1008 1008
1009 1009 /*
1010 1010 * set the reclaim-needed bit
1011 1011 */
1012 1012 if ((fs->fs_reclaim & FS_RECLAIM) == 0) {
1013 1013 fs->fs_reclaim |= FS_RECLAIM;
1014 1014 ufs_sbwrite(ufsvfsp);
1015 1015 }
1016 1016 mutex_exit(&ufsvfsp->vfs_lock);
1017 1017 }
1018 1018
1019 1019 /*
1020 1020 * Before any modified metadata written back to the disk, this routine
1021 1021 * is called to mark the filesystem as ACTIVE.
1022 1022 */
1023 1023 void
1024 1024 ufs_notclean(struct ufsvfs *ufsvfsp)
1025 1025 {
1026 1026 struct fs *fs = ufsvfsp->vfs_fs;
1027 1027
1028 1028 ASSERT(MUTEX_HELD(&ufsvfsp->vfs_lock));
1029 1029 ULOCKFS_SET_MOD((&ufsvfsp->vfs_ulockfs));
1030 1030
1031 1031 /*
1032 1032 * inform ufs_checkclean that the file system has gone dirty
1033 1033 */
1034 1034 fs->fs_reclaim &= ~FS_CHECKCLEAN;
1035 1035
1036 1036 /*
1037 1037 * ignore if active or bad or suspended or readonly or logging
1038 1038 */
1039 1039 if ((fs->fs_clean == FSACTIVE) || (fs->fs_clean == FSLOG) ||
1040 1040 (fs->fs_clean == FSBAD) || (fs->fs_clean == FSSUSPEND) ||
1041 1041 (fs->fs_ronly)) {
1042 1042 mutex_exit(&ufsvfsp->vfs_lock);
1043 1043 return;
1044 1044 }
1045 1045 fs->fs_clean = FSACTIVE;
1046 1046 /*
1047 1047 * write superblock synchronously
1048 1048 */
1049 1049 ufs_sbwrite(ufsvfsp);
1050 1050 mutex_exit(&ufsvfsp->vfs_lock);
1051 1051 }
1052 1052
1053 1053 /*
1054 1054 * ufs specific fbwrite()
1055 1055 */
1056 1056 int
1057 1057 ufs_fbwrite(struct fbuf *fbp, struct inode *ip)
1058 1058 {
1059 1059 struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
1060 1060
1061 1061 if (TRANS_ISTRANS(ufsvfsp))
1062 1062 return (fbwrite(fbp));
1063 1063 mutex_enter(&ufsvfsp->vfs_lock);
1064 1064 ufs_notclean(ufsvfsp);
1065 1065 return ((ufsvfsp->vfs_dio) ? fbdwrite(fbp) : fbwrite(fbp));
1066 1066 }
1067 1067
1068 1068 /*
1069 1069 * ufs specific fbiwrite()
1070 1070 */
1071 1071 int
1072 1072 ufs_fbiwrite(struct fbuf *fbp, struct inode *ip, daddr_t bn, long bsize)
1073 1073 {
1074 1074 struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
1075 1075 o_mode_t ifmt = ip->i_mode & IFMT;
1076 1076 buf_t *bp;
1077 1077 int error;
1078 1078
1079 1079 mutex_enter(&ufsvfsp->vfs_lock);
1080 1080 ufs_notclean(ufsvfsp);
1081 1081 if (ifmt == IFDIR || ifmt == IFSHAD || ifmt == IFATTRDIR ||
1082 1082 (ip->i_ufsvfs->vfs_qinod == ip)) {
1083 1083 TRANS_DELTA(ufsvfsp, ldbtob(bn * (offset_t)(btod(bsize))),
1084 1084 fbp->fb_count, DT_FBI, 0, 0);
1085 1085 }
1086 1086 /*
1087 1087 * Inlined version of fbiwrite()
1088 1088 */
1089 1089 bp = pageio_setup((struct page *)NULL, fbp->fb_count,
1090 1090 ip->i_devvp, B_WRITE);
1091 1091 bp->b_flags &= ~B_PAGEIO;
1092 1092 bp->b_un.b_addr = fbp->fb_addr;
1093 1093
1094 1094 bp->b_blkno = bn * btod(bsize);
1095 1095 bp->b_dev = cmpdev(ip->i_dev); /* store in old dev format */
1096 1096 bp->b_edev = ip->i_dev;
1097 1097 bp->b_proc = NULL; /* i.e. the kernel */
1098 1098 bp->b_file = ip->i_vnode;
1099 1099 bp->b_offset = -1;
1100 1100
1101 1101 if (ufsvfsp->vfs_log) {
1102 1102 lufs_write_strategy(ufsvfsp->vfs_log, bp);
1103 1103 } else if (ufsvfsp->vfs_snapshot) {
1104 1104 fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
1105 1105 } else {
1106 1106 ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
1107 1107 ub.ub_fbiwrites.value.ul++;
1108 1108 (void) bdev_strategy(bp);
1109 1109 lwp_stat_update(LWP_STAT_OUBLK, 1);
1110 1110 }
1111 1111 error = biowait(bp);
1112 1112 pageio_done(bp);
1113 1113 fbrelse(fbp, S_OTHER);
1114 1114 return (error);
1115 1115 }
1116 1116
1117 1117 /*
1118 1118 * Write the ufs superblock only.
1119 1119 */
1120 1120 void
1121 1121 ufs_sbwrite(struct ufsvfs *ufsvfsp)
1122 1122 {
1123 1123 char sav_fs_fmod;
1124 1124 struct fs *fs = ufsvfsp->vfs_fs;
1125 1125 struct buf *bp = ufsvfsp->vfs_bufp;
1126 1126
1127 1127 ASSERT(MUTEX_HELD(&ufsvfsp->vfs_lock));
1128 1128
1129 1129 /*
1130 1130 * for ulockfs processing, limit the superblock writes
1131 1131 */
1132 1132 if ((ufsvfsp->vfs_ulockfs.ul_sbowner) &&
1133 1133 (curthread != ufsvfsp->vfs_ulockfs.ul_sbowner)) {
1134 1134 /* try again later */
1135 1135 fs->fs_fmod = 1;
1136 1136 return;
1137 1137 }
1138 1138
1139 1139 ULOCKFS_SET_MOD((&ufsvfsp->vfs_ulockfs));
1140 1140 /*
1141 1141 * update superblock timestamp and fs_clean checksum
1142 1142 * if marked FSBAD, we always want an erroneous
1143 1143 * checksum to force repair
1144 1144 */
1145 1145 fs->fs_time = gethrestime_sec();
1146 1146 fs->fs_state = (fs->fs_clean != FSBAD) ?
1147 1147 FSOKAY - fs->fs_time : -(FSOKAY - fs->fs_time);
1148 1148 switch (fs->fs_clean) {
1149 1149 case FSCLEAN:
1150 1150 case FSSTABLE:
1151 1151 fs->fs_reclaim &= ~FS_RECLAIM;
1152 1152 break;
1153 1153 case FSACTIVE:
1154 1154 case FSSUSPEND:
1155 1155 case FSBAD:
1156 1156 case FSLOG:
1157 1157 break;
1158 1158 default:
1159 1159 fs->fs_clean = FSACTIVE;
1160 1160 break;
1161 1161 }
1162 1162 /*
1163 1163 * reset incore only bits
1164 1164 */
1165 1165 fs->fs_reclaim &= ~(FS_CHECKCLEAN | FS_CHECKRECLAIM);
1166 1166
1167 1167 /*
1168 1168 * delta the whole superblock
1169 1169 */
1170 1170 TRANS_DELTA(ufsvfsp, ldbtob(SBLOCK), sizeof (struct fs),
1171 1171 DT_SB, NULL, 0);
1172 1172 /*
1173 1173 * retain the incore state of fs_fmod; set the ondisk state to 0
1174 1174 */
1175 1175 sav_fs_fmod = fs->fs_fmod;
1176 1176 fs->fs_fmod = 0;
1177 1177
1178 1178 /*
1179 1179 * Don't release the buffer after written to the disk
1180 1180 */
1181 1181 UFS_BWRITE2(ufsvfsp, bp);
1182 1182 fs->fs_fmod = sav_fs_fmod; /* reset fs_fmod's incore state */
1183 1183 }
1184 1184
1185 1185 /*
1186 1186 	 * Returns the vfs pointer if the vfs is still mounted; the vfs lock is held.
1187 1187 * Otherwise, returns NULL.
1188 1188 *
1189 1189 * For our purposes, "still mounted" means that the file system still appears
1190 1190 * on the list of UFS file system instances.
1191 1191 */
1192 1192 static vfs_t *
1193 1193 still_mounted(struct check_node *checkp)
1194 1194 {
1195 1195 struct vfs *vfsp;
1196 1196 struct ufsvfs *ufsp;
1197 1197
1198 1198 mutex_enter(&ufsvfs_mutex);
1199 1199 for (ufsp = ufs_instances; ufsp != NULL; ufsp = ufsp->vfs_next) {
1200 1200 if (ufsp != checkp->ufsvfs)
1201 1201 continue;
1202 1202 /*
1203 1203 * Tentative match: verify it and try to lock. (It's not at
1204 1204 * all clear how the verification could fail, given that we've
1205 1205 * gotten this far. We would have had to reallocate the
1206 1206 * ufsvfs struct at hand for a new incarnation; is that really
1207 1207 * possible in the interval from constructing the check_node
1208 1208 * to here?)
1209 1209 */
1210 1210 vfsp = ufsp->vfs_vfs;
1211 1211 if (vfsp != checkp->vfsp)
1212 1212 continue;
1213 1213 if (vfsp->vfs_dev != checkp->vfs_dev)
1214 1214 continue;
1215 1215 if (vfs_lock(vfsp) != 0)
1216 1216 continue;
1217 1217
1218 1218 mutex_exit(&ufsvfs_mutex);
1219 1219 return (vfsp);
1220 1220 }
1221 1221 mutex_exit(&ufsvfs_mutex);
1222 1222 return (NULL);
1223 1223 }
1224 1224
1225 1225 int
1226 1226 ufs_si_io_done(struct buf *bp)
1227 1227 {
1228 1228 sema_v(&bp->b_io);
1229 1229 return (0);
1230 1230 }
1231 1231
1232 1232 #define SI_BUFSZ roundup(sizeof (struct cg), DEV_BSIZE)
1233 1233 #define NSIBUF 32
1234 1234
1235 1235 /*
1236 1236 * ufs_construct_si()
1237 1237 * Read each cylinder group in turn and construct the summary information
1238 1238 */
1239 1239 static int
1240 1240 ufs_construct_si(dev_t dev, struct fs *fs, struct ufsvfs *ufsvfsp)
1241 1241 {
1242 1242 buf_t *bps, *bp;
1243 1243 char *bufs;
1244 1244 struct csum *sip = fs->fs_u.fs_csp;
1245 1245 struct cg *cgp;
1246 1246 int i, ncg;
1247 1247 int error = 0, cg = 0;
1248 1248
1249 1249 bps = kmem_alloc(NSIBUF * sizeof (buf_t), KM_SLEEP);
1250 1250 bufs = kmem_alloc(NSIBUF * SI_BUFSZ, KM_SLEEP);
1251 1251
1252 1252 /*
1253 1253 * Initialise the buffer headers
1254 1254 */
1255 1255 for (bp = bps, i = 0; i < NSIBUF; i++, bp++) {
1256 1256 bioinit(bp);
1257 1257 bp->b_iodone = ufs_si_io_done;
1258 1258 bp->b_bufsize = bp->b_bcount = SI_BUFSZ;
1259 1259 bp->b_flags = B_READ;
1260 1260 bp->b_un.b_addr = bufs + (i * SI_BUFSZ);
1261 1261 bp->b_edev = dev;
1262 1262 }
1263 1263
1264 1264 /*
1265 1265 * Repeat while there are cylinder groups left to read.
1266 1266 */
1267 1267 do {
1268 1268 /*
1269 1269 		 * Issue up to NSIBUF asynchronous reads
1270 1270 */
1271 1271 ncg = MIN(NSIBUF, (fs->fs_ncg - cg));
1272 1272 for (bp = bps, i = 0; i < ncg; i++, bp++) {
1273 1273 bp->b_blkno = (daddr_t)fsbtodb(fs, cgtod(fs, cg + i));
1274 1274 if (ufsvfsp->vfs_log) {
1275 1275 lufs_read_strategy(ufsvfsp->vfs_log, bp);
1276 1276 } else {
1277 1277 (void) bdev_strategy(bp);
1278 1278 }
1279 1279 }
1280 1280
1281 1281 /*
1282 1282 * wait for each read to finish;
1283 1283 * check for errors and copy the csum info
1284 1284 */
1285 1285 for (bp = bps, i = 0; i < ncg; i++, bp++) {
1286 1286 sema_p(&bp->b_io);
1287 1287 if (!error) {
1288 1288 cgp = bp->b_un.b_cg;
1289 1289 sip[cg + i] = cgp->cg_cs;
1290 1290 error = geterror(bp);
1291 1291 }
1292 1292 }
1293 1293 if (error) {
1294 1294 goto err;
1295 1295 }
1296 1296 cg += ncg;
1297 1297 } while (cg < fs->fs_ncg);
1298 1298
1299 1299 err:
1300 1300 kmem_free(bps, NSIBUF * sizeof (buf_t));
1301 1301 kmem_free(bufs, NSIBUF * SI_BUFSZ);
1302 1302 return (error);
1303 1303 }
1304 1304
1305 1305 /*
1306 1306 * ufs_getsummaryinfo
1307 1307 */
1308 1308 int
1309 1309 ufs_getsummaryinfo(dev_t dev, struct ufsvfs *ufsvfsp, struct fs *fs)
1310 1310 {
1311 1311 int i; /* `for' loop counter */
1312 1312 ssize_t size; /* bytes of summary info to read */
1313 1313 daddr_t frags; /* frags of summary info to read */
1314 1314 caddr_t sip; /* summary info */
1315 1315 struct buf *tp; /* tmp buf */
1316 1316
1317 1317 /*
1318 1318 * maintain metadata map for trans device (debug only)
1319 1319 */
1320 1320 TRANS_MATA_SI(ufsvfsp, fs);
1321 1321
1322 1322 /*
1323 1323 * Compute #frags and allocate space for summary info
1324 1324 */
1325 1325 frags = howmany(fs->fs_cssize, fs->fs_fsize);
1326 1326 sip = kmem_alloc((size_t)fs->fs_cssize, KM_SLEEP);
1327 1327 fs->fs_u.fs_csp = (struct csum *)sip;
1328 1328
1329 1329 if (fs->fs_si == FS_SI_BAD) {
1330 1330 /*
1331 1331 * The summary information is unknown, read it in from
1332 1332 * the cylinder groups.
1333 1333 */
1334 1334 if (TRANS_ISTRANS(ufsvfsp) && !TRANS_ISERROR(ufsvfsp) &&
1335 1335 ufsvfsp->vfs_log->un_logmap) {
1336 1336 logmap_roll_dev(ufsvfsp->vfs_log); /* flush the log */
1337 1337 }
1338 1338 bzero(sip, (size_t)fs->fs_cssize);
1339 1339 if (ufs_construct_si(dev, fs, ufsvfsp)) {
1340 1340 kmem_free(fs->fs_u.fs_csp, fs->fs_cssize);
1341 1341 fs->fs_u.fs_csp = NULL;
1342 1342 return (EIO);
1343 1343 }
1344 1344 } else {
1345 1345 /* Read summary info a fs block at a time */
1346 1346 size = fs->fs_bsize;
1347 1347 for (i = 0; i < frags; i += fs->fs_frag) {
1348 1348 if (i + fs->fs_frag > frags)
1349 1349 /*
1350 1350 				 * This happens only on the last iteration, so
1351 1351 * don't worry about size being reset
1352 1352 */
1353 1353 size = (frags - i) * fs->fs_fsize;
1354 1354 tp = UFS_BREAD(ufsvfsp, dev,
1355 1355 (daddr_t)fsbtodb(fs, fs->fs_csaddr+i), size);
1356 1356 tp->b_flags |= B_STALE | B_AGE;
1357 1357 if (tp->b_flags & B_ERROR) {
1358 1358 kmem_free(fs->fs_u.fs_csp, fs->fs_cssize);
1359 1359 fs->fs_u.fs_csp = NULL;
1360 1360 brelse(tp);
1361 1361 return (EIO);
1362 1362 }
1363 1363 bcopy(tp->b_un.b_addr, sip, size);
1364 1364 sip += size;
1365 1365 brelse(tp);
1366 1366 }
1367 1367 }
1368 1368 bzero((caddr_t)&fs->fs_cstotal, sizeof (fs->fs_cstotal));
1369 1369 for (i = 0; i < fs->fs_ncg; ++i) {
1370 1370 fs->fs_cstotal.cs_ndir += fs->fs_cs(fs, i).cs_ndir;
1371 1371 fs->fs_cstotal.cs_nbfree += fs->fs_cs(fs, i).cs_nbfree;
1372 1372 fs->fs_cstotal.cs_nifree += fs->fs_cs(fs, i).cs_nifree;
1373 1373 fs->fs_cstotal.cs_nffree += fs->fs_cs(fs, i).cs_nffree;
1374 1374 }
1375 1375 return (0);
1376 1376 }
1377 1377
1378 1378 /*
1379 1379 	 * ufs_putsummaryinfo() stores all the cylinder group summary information.
1380 1380 	 * This is only used when logging, but the file system may not
1381 1381 	 * be logging at the time, e.g. a read-only mount to flush the log
1382 1382 * may push the summary info out.
1383 1383 */
1384 1384 int
1385 1385 ufs_putsummaryinfo(dev_t dev, struct ufsvfs *ufsvfsp, struct fs *fs)
1386 1386 {
1387 1387 struct buf b, *bp; /* tmp buf */
1388 1388 caddr_t sip; /* summary info */
1389 1389 ssize_t size; /* bytes of summary info to write */
1390 1390 daddr_t frags; /* frags of summary info to write */
1391 1391 int i; /* `for' loop counter */
1392 1392 int error; /* error */
1393 1393
1394 1394 if (TRANS_ISERROR(ufsvfsp)) {
1395 1395 return (EIO);
1396 1396 }
1397 1397
1398 1398 if ((fs->fs_si != FS_SI_BAD) || !ufsvfsp->vfs_nolog_si) {
1399 1399 return (0);
1400 1400 }
1401 1401
1402 1402 bp = &b;
1403 1403 bioinit(bp);
1404 1404 bp->b_iodone = ufs_si_io_done;
1405 1405 bp->b_bufsize = size = fs->fs_bsize;
1406 1406 bp->b_flags = B_WRITE;
1407 1407 bp->b_un.b_addr = kmem_alloc(size, KM_SLEEP);
1408 1408 bp->b_edev = dev;
1409 1409 frags = howmany(fs->fs_cssize, fs->fs_fsize);
1410 1410 sip = (caddr_t)fs->fs_u.fs_csp;
1411 1411
1412 1412 /* Write summary info one fs block at a time */
1413 1413 for (error = 0, i = 0; (i < frags) && (error == 0); i += fs->fs_frag) {
1414 1414 if (i + fs->fs_frag > frags) {
1415 1415 /*
1416 1416 			 * This happens only on the last iteration, so
1417 1417 * don't worry about size being reset
1418 1418 */
1419 1419 size = (frags - i) * fs->fs_fsize;
1420 1420 }
1421 1421 bcopy(sip, bp->b_un.b_addr, size);
1422 1422 bp->b_blkno = (daddr_t)fsbtodb(fs, fs->fs_csaddr+i);
1423 1423 bp->b_bcount = size;
1424 1424 (void) bdev_strategy(bp);
1425 1425 sema_p(&bp->b_io); /* wait for write to complete */
1426 1426 error = geterror(bp);
1427 1427 sip += size;
1428 1428 }
1429 1429 kmem_free(bp->b_un.b_addr, fs->fs_bsize);
1430 1430 if (!error) {
1431 1431 fs->fs_si = FS_SI_OK;
1432 1432 }
1433 1433 return (error);
1434 1434 }
1435 1435
1436 1436 /*
1437 1437 * Decide whether it is okay to remove within a sticky directory.
1438 1438 	 * Write access to the directory is required, but in sticky
1439 1439 	 * directories write access alone is not sufficient:
1440 1440 * you can remove entries from a directory only if you own the directory,
1441 1441 * if you are privileged, if you own the entry or if the entry is
1442 1442 * a plain file and you have write access to that file.
1443 1443 * Function returns 0 if remove access is granted.
1444 1444 * Note, the caller is responsible for holding the i_contents lock
1445 1445 * at least as reader on the inquired inode 'ip'.
1446 1446 */
1447 1447 int
1448 1448 ufs_sticky_remove_access(struct inode *dp, struct inode *ip, struct cred *cr)
1449 1449 {
1450 1450 uid_t uid;
1451 1451
1452 1452 ASSERT(RW_LOCK_HELD(&ip->i_contents));
1453 1453
1454 1454 if ((dp->i_mode & ISVTX) &&
1455 1455 (uid = crgetuid(cr)) != dp->i_uid &&
1456 1456 uid != ip->i_uid &&
1457 1457 ((ip->i_mode & IFMT) != IFREG ||
1458 1458 ufs_iaccess(ip, IWRITE, cr, 0) != 0))
1459 1459 return (secpolicy_vnode_remove(cr));
1460 1460
1461 1461 return (0);
1462 1462 }
1463 1463 #endif /* _KERNEL */
1464 1464
1465 1465 extern int around[9];
1466 1466 extern int inside[9];
1467 1467 extern uchar_t *fragtbl[];
1468 1468
1469 1469 /*
1470 1470 * Update the frsum fields to reflect addition or deletion
1471 1471 * of some frags.
1472 1472 */
1473 1473 void
1474 1474 fragacct(struct fs *fs, int fragmap, int32_t *fraglist, int cnt)
1475 1475 {
1476 1476 int inblk;
1477 1477 int field, subfield;
1478 1478 int siz, pos;
1479 1479
1480 1480 /*
1481 1481 * ufsvfsp->vfs_lock is held when calling this.
1482 1482 */
1483 1483 inblk = (int)(fragtbl[fs->fs_frag][fragmap]) << 1;
1484 1484 fragmap <<= 1;
1485 1485 for (siz = 1; siz < fs->fs_frag; siz++) {
1486 1486 if ((inblk & (1 << (siz + (fs->fs_frag % NBBY)))) == 0)
1487 1487 continue;
1488 1488 field = around[siz];
1489 1489 subfield = inside[siz];
1490 1490 for (pos = siz; pos <= fs->fs_frag; pos++) {
1491 1491 if ((fragmap & field) == subfield) {
1492 1492 fraglist[siz] += cnt;
1493 1493 ASSERT(fraglist[siz] >= 0);
1494 1494 pos += siz;
1495 1495 field <<= siz;
1496 1496 subfield <<= siz;
1497 1497 }
1498 1498 field <<= 1;
1499 1499 subfield <<= 1;
1500 1500 }
1501 1501 }
1502 1502 }
1503 1503
1504 1504 /*
1505 1505 * Block operations
1506 1506 */
1507 1507
1508 1508 /*
1509 1509 * Check if a block is available
1510 1510 */
1511 1511 int
1512 1512 isblock(struct fs *fs, uchar_t *cp, daddr_t h)
1513 1513 {
1514 1514 uchar_t mask;
1515 1515
1516 1516 ASSERT(fs->fs_frag == 8 || fs->fs_frag == 4 || fs->fs_frag == 2 || \
1517 1517 fs->fs_frag == 1);
1518 1518 /*
1519 1519 * ufsvfsp->vfs_lock is held when calling this.
1520 1520 */
1521 1521 switch ((int)fs->fs_frag) {
1522 1522 case 8:
1523 1523 return (cp[h] == 0xff);
1524 1524 case 4:
1525 1525 mask = 0x0f << ((h & 0x1) << 2);
1526 1526 return ((cp[h >> 1] & mask) == mask);
1527 1527 case 2:
1528 1528 mask = 0x03 << ((h & 0x3) << 1);
1529 1529 return ((cp[h >> 2] & mask) == mask);
1530 1530 case 1:
1531 1531 mask = 0x01 << (h & 0x7);
1532 1532 return ((cp[h >> 3] & mask) == mask);
1533 1533 default:
1534 1534 #ifndef _KERNEL
1535 1535 cmn_err(CE_PANIC, "isblock: illegal fs->fs_frag value (%d)",
1536 1536 fs->fs_frag);
1537 1537 #endif /* _KERNEL */
1538 1538 return (0);
1539 1539 }
1540 1540 }
1541 1541
1542 1542 /*
1543 1543 * Take a block out of the map
1544 1544 */
1545 1545 void
1546 1546 clrblock(struct fs *fs, uchar_t *cp, daddr_t h)
1547 1547 {
1548 1548 ASSERT(fs->fs_frag == 8 || fs->fs_frag == 4 || fs->fs_frag == 2 || \
1549 1549 fs->fs_frag == 1);
1550 1550 /*
1551 1551 * ufsvfsp->vfs_lock is held when calling this.
1552 1552 */
1553 1553 switch ((int)fs->fs_frag) {
1554 1554 case 8:
1555 1555 cp[h] = 0;
1556 1556 return;
1557 1557 case 4:
1558 1558 cp[h >> 1] &= ~(0x0f << ((h & 0x1) << 2));
1559 1559 return;
1560 1560 case 2:
1561 1561 cp[h >> 2] &= ~(0x03 << ((h & 0x3) << 1));
1562 1562 return;
1563 1563 case 1:
1564 1564 cp[h >> 3] &= ~(0x01 << (h & 0x7));
1565 1565 return;
1566 1566 default:
1567 1567 #ifndef _KERNEL
1568 1568 cmn_err(CE_PANIC, "clrblock: illegal fs->fs_frag value (%d)",
1569 1569 fs->fs_frag);
1570 1570 #endif /* _KERNEL */
1571 1571 return;
1572 1572 }
1573 1573 }
1574 1574
1575 1575 /*
1576 1576 * Is block allocated?
1577 1577 */
1578 1578 int
1579 1579 isclrblock(struct fs *fs, uchar_t *cp, daddr_t h)
1580 1580 {
1581 1581 uchar_t mask;
1582 1582 int frag;
1583 1583 /*
1584 1584 * ufsvfsp->vfs_lock is held when calling this.
1585 1585 */
1586 1586 frag = fs->fs_frag;
1587 1587 ASSERT(frag == 8 || frag == 4 || frag == 2 || frag == 1);
1588 1588 switch (frag) {
1589 1589 case 8:
1590 1590 return (cp[h] == 0);
1591 1591 case 4:
1592 1592 mask = ~(0x0f << ((h & 0x1) << 2));
1593 1593 return (cp[h >> 1] == (cp[h >> 1] & mask));
1594 1594 case 2:
1595 1595 mask = ~(0x03 << ((h & 0x3) << 1));
1596 1596 return (cp[h >> 2] == (cp[h >> 2] & mask));
1597 1597 case 1:
1598 1598 mask = ~(0x01 << (h & 0x7));
1599 1599 return (cp[h >> 3] == (cp[h >> 3] & mask));
1600 1600 default:
1601 1601 #ifndef _KERNEL
1602 1602 cmn_err(CE_PANIC, "isclrblock: illegal fs->fs_frag value (%d)",
1603 1603 fs->fs_frag);
1604 1604 #endif /* _KERNEL */
1605 1605 break;
1606 1606 }
1607 1607 return (0);
1608 1608 }
1609 1609
1610 1610 /*
1611 1611 * Put a block into the map
1612 1612 */
1613 1613 void
1614 1614 setblock(struct fs *fs, uchar_t *cp, daddr_t h)
1615 1615 {
1616 1616 ASSERT(fs->fs_frag == 8 || fs->fs_frag == 4 || fs->fs_frag == 2 || \
1617 1617 fs->fs_frag == 1);
1618 1618 /*
1619 1619 * ufsvfsp->vfs_lock is held when calling this.
1620 1620 */
1621 1621 switch ((int)fs->fs_frag) {
1622 1622 case 8:
1623 1623 cp[h] = 0xff;
1624 1624 return;
1625 1625 case 4:
1626 1626 cp[h >> 1] |= (0x0f << ((h & 0x1) << 2));
1627 1627 return;
1628 1628 case 2:
1629 1629 cp[h >> 2] |= (0x03 << ((h & 0x3) << 1));
1630 1630 return;
1631 1631 case 1:
1632 1632 cp[h >> 3] |= (0x01 << (h & 0x7));
1633 1633 return;
1634 1634 default:
1635 1635 #ifndef _KERNEL
1636 1636 cmn_err(CE_PANIC, "setblock: illegal fs->fs_frag value (%d)",
1637 1637 fs->fs_frag);
1638 1638 #endif /* _KERNEL */
1639 1639 return;
1640 1640 }
1641 1641 }
1642 1642
1643 1643 int
1644 1644 skpc(char c, uint_t len, char *cp)
1645 1645 {
1646 1646 if (len == 0)
1647 1647 return (0);
1648 1648 while (*cp++ == c && --len)
1649 1649 ;
1650 1650 return (len);
1651 1651 }
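For reference, a minimal, self-contained sketch of the indirect-level
arithmetic that ufs_sync_indir() and ufs_indirblk_sync() above depend
on, assuming an 8K-block file system (NINDIR(fs) == 2048, so
vfs_nindirshift == 11 and vfs_nindiroffset == 2047) and the standard
NDADDR == 12, NIADDR == 3; the program, its names and the chosen block
number are illustrative only, not part of the change:

	#include <stdio.h>

	#define NDADDR	12	/* direct blocks per inode */
	#define NIADDR	3	/* indirect blocks per inode */

	int
	main(void)
	{
		int nindirshift = 11;		/* log2(NINDIR), 8K blocks */
		long long nindiroffset = 2047;	/* NINDIR - 1 */
		long long lbn = 5000;		/* hypothetical logical block */
		long long tbn = lbn - NDADDR;
		int shft = 0, j;

		/* Same level-finding loop as ufs_indirblk_sync() */
		for (j = NIADDR; j > 0; j--) {
			long long sh;

			shft += nindirshift;
			sh = 1LL << shft;
			if (tbn < sh)
				break;
			tbn -= sh;
		}

		/*
		 * i_ib[NIADDR - j] heads the chain; the index is always
		 * in 0..NIADDR-1, which is why the flush loop fixed by
		 * this change must use i < NIADDR.
		 */
		printf("lbn %lld: chain starts at i_ib[%d]\n",
		    lbn, NIADDR - j);

		/* Descend, printing the entry index used at each level */
		for (; j < NIADDR; j++) {
			shft -= nindirshift;
			printf("  entry %lld at this level\n",
			    (tbn >> shft) & nindiroffset);
		}
		return (0);
	}

For lbn 5000 this prints i_ib[1] (double indirect) with entries 1 and
892, matching 5000 - (NDADDR + NINDIR) == 1 * 2048 + 892.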