OS-7753 THREAD_KPRI_RELEASE does nothing of the sort
Reviewed by: Bryan Cantrill <bryan@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
--- old/usr/src/uts/common/os/bio.c
+++ new/usr/src/uts/common/os/bio.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 - * Copyright 2011 Joyent, Inc. All rights reserved.
24 + * Copyright 2019 Joyent, Inc.
25 25 */
26 26
27 27 /*
28 28 * Copyright (c) 2016 by Delphix. All rights reserved.
29 29 */
30 30
31 31 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
32 32 /* All Rights Reserved */
33 33
34 34 /*
35 35 * University Copyright- Copyright (c) 1982, 1986, 1988
36 36 * The Regents of the University of California
37 37 * All Rights Reserved
38 38 *
39 39 * University Acknowledgment- Portions of this document are derived from
40 40 * software developed by the University of California, Berkeley, and its
41 41 * contributors.
42 42 */
43 43
44 44 #include <sys/types.h>
45 45 #include <sys/t_lock.h>
46 46 #include <sys/sysmacros.h>
47 47 #include <sys/conf.h>
48 48 #include <sys/cpuvar.h>
49 49 #include <sys/errno.h>
50 50 #include <sys/debug.h>
51 51 #include <sys/buf.h>
52 52 #include <sys/var.h>
53 53 #include <sys/vnode.h>
54 54 #include <sys/bitmap.h>
55 55 #include <sys/cmn_err.h>
56 56 #include <sys/kmem.h>
57 57 #include <sys/vmem.h>
58 58 #include <sys/atomic.h>
59 59 #include <vm/seg_kmem.h>
60 60 #include <vm/page.h>
61 61 #include <vm/pvn.h>
62 62 #include <sys/vtrace.h>
63 63 #include <sys/tnf_probe.h>
64 64 #include <sys/fs/ufs_inode.h>
65 65 #include <sys/fs/ufs_bio.h>
66 66 #include <sys/fs/ufs_log.h>
67 67 #include <sys/systm.h>
68 68 #include <sys/vfs.h>
69 69 #include <sys/sdt.h>
70 70
71 71 /* Locks */
72 72 static kmutex_t blist_lock; /* protects b_list */
73 73 static kmutex_t bhdr_lock; /* protects the bhdrlist */
74 74 static kmutex_t bfree_lock; /* protects the bfreelist structure */
75 75
76 76 struct hbuf *hbuf; /* Hash buckets */
77 77 struct dwbuf *dwbuf; /* Delayed write buckets */
78 78 static struct buf *bhdrlist; /* buf header free list */
79 79 static int nbuf; /* number of buffer headers allocated */
80 80
81 81 static int lastindex; /* Reference point on where to start */
82 82 /* when looking for free buffers */
83 83
84 84 #define bio_bhash(dev, bn) (hash2ints((dev), (int)(bn)) & v.v_hmask)
85 85 #define EMPTY_LIST ((struct buf *)-1)
86 86
87 87 static kcondvar_t bio_mem_cv; /* Condition variables */
88 88 static kcondvar_t bio_flushinval_cv;
89 89 static int bio_doingflush; /* flush in progress */
90 90 static int bio_doinginval; /* inval in progress */
91 91 static int bio_flinv_cv_wanted; /* someone waiting for cv */
92 92
93 93 /*
94 94 * Statistics on the buffer cache
95 95 */
96 96 struct biostats biostats = {
97 97 { "buffer_cache_lookups", KSTAT_DATA_UINT32 },
98 98 { "buffer_cache_hits", KSTAT_DATA_UINT32 },
99 99 { "new_buffer_requests", KSTAT_DATA_UINT32 },
100 100 { "waits_for_buffer_allocs", KSTAT_DATA_UINT32 },
101 101 { "buffers_locked_by_someone", KSTAT_DATA_UINT32 },
102 102 { "duplicate_buffers_found", KSTAT_DATA_UINT32 }
103 103 };
104 104
105 105 /*
106 106 * kstat data
107 107 */
108 108 kstat_named_t *biostats_ptr = (kstat_named_t *)&biostats;
109 109 uint_t biostats_ndata = (uint_t)(sizeof (biostats) /
110 110 sizeof (kstat_named_t));
111 111
112 112 /*
113 113 * Statistics on ufs buffer cache
114 114 * Not protected by locks
115 115 */
116 116 struct ufsbiostats ub = {
117 117 { "breads", KSTAT_DATA_UINT32 },
118 118 { "bwrites", KSTAT_DATA_UINT32 },
119 119 { "fbiwrites", KSTAT_DATA_UINT32 },
120 120 { "getpages", KSTAT_DATA_UINT32 },
121 121 { "getras", KSTAT_DATA_UINT32 },
122 122 { "putsyncs", KSTAT_DATA_UINT32 },
123 123 { "putasyncs", KSTAT_DATA_UINT32 },
124 124 { "putpageios", KSTAT_DATA_UINT32 },
125 125 };
126 126
127 127 /*
128 128 * more UFS Logging eccentricities...
129 129 *
130 130 * required since "#pragma weak ..." doesn't work in reverse order.
131 131 * i.e.: genunix (bio.c) is loaded before the ufs modules and pointers
132 132 * to ufs routines don't get plugged into bio.c calls so
133 133 * we initialize it when setting up the "lufsops" table
134 134 * in "lufs.c:_init()"
135 135 */
136 136 void (*bio_lufs_strategy)(void *, buf_t *);
137 137 void (*bio_snapshot_strategy)(void *, buf_t *);
138 138
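For readers new to this hand-off, the shape of the workaround is simply that the later-loaded log module stores one of its own routines into the pointer genunix declares above. A minimal sketch follows; the function names are invented for illustration and are not the actual lufs code.

	/* In the log module (hypothetical names): */
	extern void (*bio_lufs_strategy)(void *, buf_t *);	/* declared in bio.c */

	static void
	my_lufs_strategy(void *log, buf_t *bp)
	{
		/* stand-in body: a real implementation would log, then issue */
		(void) bdev_strategy(bp);
	}

	static void
	my_log_module_init(void)	/* called from the module's _init() path */
	{
		bio_lufs_strategy = my_lufs_strategy;	/* plug the genunix hook */
	}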
139 139
140 140 /* Private routines */
141 141 static struct buf *bio_getfreeblk(long);
142 142 static void bio_mem_get(long);
143 143 static void bio_bhdr_free(struct buf *);
144 144 static struct buf *bio_bhdr_alloc(void);
145 145 static void bio_recycle(int, long);
146 146 static void bio_pageio_done(struct buf *);
147 147 static int bio_incore(dev_t, daddr_t);
148 148
149 149 /*
150 150 * Buffer cache constants
151 151 */
152 152 #define BIO_BUF_PERCENT (100/2) /* default: 2% of memory */
153 153 #define BIO_MAX_PERCENT (100/20) /* max is 20% of real memory */
154 154 #define BIO_BHDR_POOL 100 /* Default bhdr pool size */
155 155 #define BIO_MIN_HDR 10 /* Minimum number of buffer headers */
156 156 #define BIO_MIN_HWM (BIO_MIN_HDR * MAXBSIZE / 1024)
157 157 #define BIO_HASHLEN 4 /* Target length of hash chains */
158 158
159 159
160 160 /* Flags for bio_recycle() */
161 161 #define BIO_HEADER 0x01
162 162 #define BIO_MEM 0x02
163 163
164 164 extern int bufhwm; /* User tunable - high water mark for mem */
165 165 extern int bufhwm_pct; /* ditto - given in % of physmem */
166 166
167 167 /*
168 168 * The following routines allocate and free
169 169 * buffers with various side effects. In general the
170 170 * arguments to an allocate routine are a device and
171 171 * a block number, and the value is a pointer to
172 172 * the buffer header; the buffer returned is locked with a
173 173 * binary semaphore so that no one else can touch it. If the block was
174 174 * already in core, no I/O need be done; if it is
175 175 * already locked, the process waits until it becomes free.
176 176 * The following routines allocate a buffer:
177 177 * getblk
178 178 * bread/BREAD
179 179 * breada
180 180 * Eventually the buffer must be released, possibly with the
181 181 * side effect of writing it out, by using one of
182 182 * bwrite/BWRITE/brwrite
183 183 * bdwrite/bdrwrite
184 184 * bawrite
185 185 * brelse
186 186 *
187 187 * The B_WANTED/B_BUSY bits are NOT used by these routines for synchronization.
188 188 * Instead, a binary semaphore, b_sem is used to gain exclusive access to
189 189 * a buffer and a binary semaphore, b_io is used for I/O synchronization.
190 190 * B_DONE is still used to denote a buffer with I/O complete on it.
191 191 *
192 192 * The bfreelist.b_bcount field is computed every time fsflush runs. It
193 193 * should not be used where a very accurate count of the free buffers is
194 194 * needed.
195 195 */
196 196
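For orientation before the individual routines, here is a minimal sketch of the common read path a caller would follow; the helper name and its arguments are hypothetical, and error handling is pared down to the essentials.

	static int
	example_read_block(dev_t dev, daddr_t blkno, long bsize)
	{
		struct buf *bp;
		int error;

		bp = bread(dev, blkno, bsize);		/* returns with b_sem held */
		if ((error = geterror(bp)) != 0) {
			brelse(bp);			/* hand the buffer back */
			return (error);
		}
		/* ... consume bp->b_bcount bytes at bp->b_un.b_addr ... */
		brelse(bp);				/* done; the block stays cached */
		return (0);
	}

The same ownership rule applies to every allocator below: whoever holds b_sem must eventually give the buffer up through brelse() or one of the write routines.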
197 197 /*
198 198 * Read in (if necessary) the block and return a buffer pointer.
199 199 *
200 200 * This interface is provided for binary compatibility. Using
201 201 * BREAD() directly avoids the extra function call overhead invoked
202 202 * by calling this routine.
203 203 */
204 204 struct buf *
205 205 bread(dev_t dev, daddr_t blkno, long bsize)
206 206 {
207 207 return (BREAD(dev, blkno, bsize));
208 208 }
209 209
210 210 /*
211 211 * Common code for reading a buffer with various options
212 212 *
213 213 * Read in (if necessary) the block and return a buffer pointer.
214 214 */
215 215 struct buf *
216 216 bread_common(void *arg, dev_t dev, daddr_t blkno, long bsize)
217 217 {
218 218 struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
219 219 struct buf *bp;
220 220 klwp_t *lwp = ttolwp(curthread);
221 221
222 222 CPU_STATS_ADD_K(sys, lread, 1);
223 223 bp = getblk_common(ufsvfsp, dev, blkno, bsize, /* errflg */ 1);
224 224 if (bp->b_flags & B_DONE)
225 225 return (bp);
226 226 bp->b_flags |= B_READ;
227 227 ASSERT(bp->b_bcount == bsize);
228 228 if (ufsvfsp == NULL) { /* !ufs */
229 229 (void) bdev_strategy(bp);
230 230 } else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
231 231 /* ufs && logging */
232 232 (*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
233 233 } else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
234 234 /* ufs && snapshots */
235 235 (*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
236 236 } else {
237 237 ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
238 238 ub.ub_breads.value.ul++; /* ufs && !logging */
239 239 (void) bdev_strategy(bp);
240 240 }
241 241 if (lwp != NULL)
242 242 lwp->lwp_ru.inblock++;
243 243 CPU_STATS_ADD_K(sys, bread, 1);
244 244 (void) biowait(bp);
245 245 return (bp);
246 246 }
247 247
248 248 /*
249 249 * Read in the block, like bread, but also start I/O on the
250 250 * read-ahead block (which is not allocated to the caller).
251 251 */
252 252 struct buf *
253 253 breada(dev_t dev, daddr_t blkno, daddr_t rablkno, long bsize)
254 254 {
255 255 struct buf *bp, *rabp;
256 256 klwp_t *lwp = ttolwp(curthread);
257 257
258 258 bp = NULL;
259 259 if (!bio_incore(dev, blkno)) {
260 260 CPU_STATS_ADD_K(sys, lread, 1);
261 261 bp = GETBLK(dev, blkno, bsize);
262 262 if ((bp->b_flags & B_DONE) == 0) {
263 263 bp->b_flags |= B_READ;
264 264 bp->b_bcount = bsize;
265 265 (void) bdev_strategy(bp);
266 266 if (lwp != NULL)
267 267 lwp->lwp_ru.inblock++;
268 268 CPU_STATS_ADD_K(sys, bread, 1);
269 269 }
270 270 }
271 271 if (rablkno && bfreelist.b_bcount > 1 &&
272 272 !bio_incore(dev, rablkno)) {
273 273 rabp = GETBLK(dev, rablkno, bsize);
274 274 if (rabp->b_flags & B_DONE)
275 275 brelse(rabp);
276 276 else {
277 277 rabp->b_flags |= B_READ|B_ASYNC;
278 278 rabp->b_bcount = bsize;
279 279 (void) bdev_strategy(rabp);
280 280 if (lwp != NULL)
281 281 lwp->lwp_ru.inblock++;
282 282 CPU_STATS_ADD_K(sys, bread, 1);
283 283 }
284 284 }
285 285 if (bp == NULL)
286 286 return (BREAD(dev, blkno, bsize));
287 287 (void) biowait(bp);
288 288 return (bp);
289 289 }
290 290
291 291 /*
292 292 * Common code for writing a buffer with various options.
293 293 *
294 294 * force_wait - wait for write completion regardless of B_ASYNC flag
295 295 * do_relse - release the buffer when we are done
296 296 * clear_flags - flags to clear from the buffer
297 297 */
298 298 void
299 299 bwrite_common(void *arg, struct buf *bp, int force_wait,
300 300 int do_relse, int clear_flags)
301 301 {
302 302 register int do_wait;
303 303 struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
304 304 int flag;
305 305 klwp_t *lwp = ttolwp(curthread);
306 306 struct cpu *cpup;
307 307
308 308 ASSERT(SEMA_HELD(&bp->b_sem));
309 309 flag = bp->b_flags;
310 310 bp->b_flags &= ~clear_flags;
311 311 if (lwp != NULL)
312 312 lwp->lwp_ru.oublock++;
313 313 CPU_STATS_ENTER_K();
314 314 cpup = CPU; /* get pointer AFTER preemption is disabled */
315 315 CPU_STATS_ADDQ(cpup, sys, lwrite, 1);
316 316 CPU_STATS_ADDQ(cpup, sys, bwrite, 1);
317 317 do_wait = ((flag & B_ASYNC) == 0 || force_wait);
318 318 if (do_wait == 0)
319 319 CPU_STATS_ADDQ(cpup, sys, bawrite, 1);
320 320 CPU_STATS_EXIT_K();
321 321 if (ufsvfsp == NULL) {
322 322 (void) bdev_strategy(bp);
323 323 } else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
324 324 /* ufs && logging */
325 325 (*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
326 326 } else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
327 327 /* ufs && snapshots */
328 328 (*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
329 329 } else {
330 330 ub.ub_bwrites.value.ul++; /* ufs && !logging */
331 331 (void) bdev_strategy(bp);
332 332 }
333 333 if (do_wait) {
334 334 (void) biowait(bp);
335 335 if (do_relse) {
336 336 brelse(bp);
337 337 }
338 338 }
339 339 }
340 340
341 341 /*
342 342 * Write the buffer, waiting for completion (unless B_ASYNC is set).
343 343 * Then release the buffer.
344 344 * This interface is provided for binary compatibility. Using
345 345 * BWRITE() directly avoids the extra function call overhead invoked
346 346 * by calling this routine.
347 347 */
348 348 void
349 349 bwrite(struct buf *bp)
350 350 {
351 351 BWRITE(bp);
352 352 }
353 353
354 354 /*
355 355 * Write the buffer, waiting for completion.
356 356 * But don't release the buffer afterwards.
357 357 * This interface is provided for binary compatibility. Using
358 358 * BWRITE2() directly avoids the extra function call overhead.
359 359 */
360 360 void
361 361 bwrite2(struct buf *bp)
362 362 {
363 363 BWRITE2(bp);
364 364 }
365 365
366 366 /*
367 367 * Release the buffer, marking it so that if it is grabbed
368 368 * for another purpose it will be written out before being
369 369 * given up (e.g. when writing a partial block where it is
370 370 * assumed that another write for the same block will soon follow).
371 371 * Also save the time that the block is first marked as delayed
372 372 * so that it will be written in a reasonable time.
373 373 */
374 374 void
375 375 bdwrite(struct buf *bp)
376 376 {
377 377 ASSERT(SEMA_HELD(&bp->b_sem));
378 378 CPU_STATS_ADD_K(sys, lwrite, 1);
379 379 if ((bp->b_flags & B_DELWRI) == 0)
380 380 bp->b_start = ddi_get_lbolt();
381 381 /*
382 382 * B_DONE allows others to use the buffer, B_DELWRI causes the
383 383 * buffer to be written before being reused, and setting b_resid
384 384 * to zero says the buffer is complete.
385 385 */
386 386 bp->b_flags |= B_DELWRI | B_DONE;
387 387 bp->b_resid = 0;
388 388 brelse(bp);
389 389 }
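The partial-block case mentioned in the comment above, reduced to a sketch: each small update re-reads the cached block, patches it in place, and uses bdwrite() so that fsflush can write the block out once, later. The helper name and its offset/length arguments are made up, and error checking is omitted for brevity.

	static void
	example_append_record(dev_t dev, daddr_t blkno, long bsize,
	    const void *rec, size_t reclen, size_t off)
	{
		struct buf *bp;

		bp = bread(dev, blkno, bsize);		/* read-modify-write */
		bcopy(rec, bp->b_un.b_addr + off, reclen);
		bdwrite(bp);				/* delay; more writes expected */
	}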
390 390
391 391 /*
392 392 * Release the buffer, start I/O on it, but don't wait for completion.
393 393 */
394 394 void
395 395 bawrite(struct buf *bp)
396 396 {
397 397 ASSERT(SEMA_HELD(&bp->b_sem));
398 398
399 399 /* Use bfreelist.b_bcount as a weird-ass heuristic */
400 400 if (bfreelist.b_bcount > 4)
401 401 bp->b_flags |= B_ASYNC;
402 402 BWRITE(bp);
403 403 }
404 404
405 405 /*
406 406 * Release the buffer, with no I/O implied.
407 407 */
408 408 void
409 409 brelse(struct buf *bp)
410 410 {
411 411 struct buf **backp;
412 412 uint_t index;
413 413 kmutex_t *hmp;
414 414 struct buf *dp;
415 415 struct hbuf *hp;
416 416
417 417
418 418 ASSERT(SEMA_HELD(&bp->b_sem));
419 419
420 420 /*
421 421 * Clear the retry write flag if the buffer was written without
422 422 * error. The presence of B_DELWRI means the buffer has not yet
423 423 * been written and the presence of B_ERROR means that an error
424 424 * is still occurring.
425 425 */
426 426 if ((bp->b_flags & (B_ERROR | B_DELWRI | B_RETRYWRI)) == B_RETRYWRI) {
427 427 bp->b_flags &= ~B_RETRYWRI;
428 428 }
429 429
430 430 /* Check for anomalous conditions */
431 431 if (bp->b_flags & (B_ERROR|B_NOCACHE)) {
432 432 if (bp->b_flags & B_NOCACHE) {
433 433 /* Don't add to the freelist. Destroy it now */
434 434 kmem_free(bp->b_un.b_addr, bp->b_bufsize);
435 435 sema_destroy(&bp->b_sem);
436 436 sema_destroy(&bp->b_io);
437 437 kmem_free(bp, sizeof (struct buf));
438 438 return;
439 439 }
440 440 /*
441 441 * If a write failed and we are supposed to retry write,
442 442 * don't toss the buffer. Keep it around and mark it
443 443 * delayed write in the hopes that it will eventually
444 444 * get flushed (and still keep the system running.)
445 445 */
446 446 if ((bp->b_flags & (B_READ | B_RETRYWRI)) == B_RETRYWRI) {
447 447 bp->b_flags |= B_DELWRI;
448 448 /* keep fsflush from trying continuously to flush */
449 449 bp->b_start = ddi_get_lbolt();
450 450 } else
451 451 bp->b_flags |= B_AGE|B_STALE;
452 452 bp->b_flags &= ~B_ERROR;
453 453 bp->b_error = 0;
454 454 }
455 455
456 456 /*
457 457 * If delayed write is set then put it on the delayed
458 458 * write list instead of the free buffer list.
459 459 */
460 460 index = bio_bhash(bp->b_edev, bp->b_blkno);
461 461 hmp = &hbuf[index].b_lock;
462 462
463 463 mutex_enter(hmp);
464 464 hp = &hbuf[index];
465 465 dp = (struct buf *)hp;
466 466
467 467 /*
468 468 * Make sure that the number of entries on this list is in the range
469 469 * Zero <= count <= total # buffers
470 470 */
471 471 ASSERT(hp->b_length >= 0);
472 472 ASSERT(hp->b_length < nbuf);
473 473
474 474 hp->b_length++; /* We are adding this buffer */
475 475
476 476 if (bp->b_flags & B_DELWRI) {
477 477 /*
478 478 * This buffer goes on the delayed write buffer list
479 479 */
480 480 dp = (struct buf *)&dwbuf[index];
481 481 }
482 482 ASSERT(bp->b_bufsize > 0);
483 483 ASSERT(bp->b_bcount > 0);
484 484 ASSERT(bp->b_un.b_addr != NULL);
485 485
486 486 if (bp->b_flags & B_AGE) {
487 487 backp = &dp->av_forw;
488 488 (*backp)->av_back = bp;
489 489 bp->av_forw = *backp;
490 490 *backp = bp;
491 491 bp->av_back = dp;
492 492 } else {
493 493 backp = &dp->av_back;
494 494 (*backp)->av_forw = bp;
495 495 bp->av_back = *backp;
496 496 *backp = bp;
497 497 bp->av_forw = dp;
498 498 }
499 499 mutex_exit(hmp);
500 500
501 501 if (bfreelist.b_flags & B_WANTED) {
502 502 /*
503 503 * Should come here very very rarely.
504 504 */
505 505 mutex_enter(&bfree_lock);
506 506 if (bfreelist.b_flags & B_WANTED) {
507 507 bfreelist.b_flags &= ~B_WANTED;
508 508 cv_broadcast(&bio_mem_cv);
509 509 }
510 510 mutex_exit(&bfree_lock);
511 511 }
512 512
513 513 bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC);
514 514 /*
515 515 * Don't let anyone get the buffer off the freelist before we
516 516 * release our hold on it.
517 517 */
518 518 sema_v(&bp->b_sem);
519 519 }
520 520
521 521 /*
522 522 * Return a count of the number of B_BUSY buffers in the system
523 523 * Can only be used as a good estimate. If 'cleanit' is set,
524 524 * try to flush all bufs.
525 525 */
526 526 int
527 527 bio_busy(int cleanit)
528 528 {
529 529 struct buf *bp, *dp;
530 530 int busy = 0;
531 531 int i;
532 532 kmutex_t *hmp;
533 533
534 534 for (i = 0; i < v.v_hbuf; i++) {
535 535 dp = (struct buf *)&hbuf[i];
536 536 hmp = &hbuf[i].b_lock;
537 537
538 538 mutex_enter(hmp);
539 539 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
540 540 if (bp->b_flags & B_BUSY)
541 541 busy++;
542 542 }
543 543 mutex_exit(hmp);
544 544 }
545 545
546 546 if (cleanit && busy != 0) {
547 547 bflush(NODEV);
548 548 }
549 549
550 550 return (busy);
551 551 }
552 552
553 553 /*
554 554 * this interface is provided for binary compatibility.
555 555 *
556 556 * Assign a buffer for the given block. If the appropriate
557 557 * block is already associated, return it; otherwise search
558 558 * for the oldest non-busy buffer and reassign it.
559 559 */
560 560 struct buf *
561 561 getblk(dev_t dev, daddr_t blkno, long bsize)
562 562 {
563 563 return (getblk_common(/* ufsvfsp */ NULL, dev,
564 564 blkno, bsize, /* errflg */ 0));
565 565 }
566 566
567 567 /*
568 568 * Assign a buffer for the given block. If the appropriate
569 569 * block is already associated, return it; otherwise search
570 570 * for the oldest non-busy buffer and reassign it.
571 571 */
572 572 struct buf *
573 573 getblk_common(void * arg, dev_t dev, daddr_t blkno, long bsize, int errflg)
574 574 {
575 575 ufsvfs_t *ufsvfsp = (struct ufsvfs *)arg;
576 576 struct buf *bp;
577 577 struct buf *dp;
578 578 struct buf *nbp = NULL;
579 579 struct buf *errbp;
580 580 uint_t index;
581 581 kmutex_t *hmp;
582 582 struct hbuf *hp;
583 583
584 584 if (getmajor(dev) >= devcnt)
585 585 cmn_err(CE_PANIC, "blkdev");
586 586
587 587 biostats.bio_lookup.value.ui32++;
588 588
589 589 index = bio_bhash(dev, blkno);
590 590 hp = &hbuf[index];
591 591 dp = (struct buf *)hp;
592 592 hmp = &hp->b_lock;
593 593
594 594 mutex_enter(hmp);
595 595 loop:
596 596 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
597 597 if (bp->b_blkno != blkno || bp->b_edev != dev ||
598 598 (bp->b_flags & B_STALE))
599 599 continue;
600 600 /*
601 601 * Avoid holding the hash lock in the event that
602 602 * the buffer is locked by someone. Since the hash chain
603 603 * may change when we drop the hash lock
604 604 * we have to start at the beginning of the chain if the
605 605 * buffer identity/contents aren't valid.
606 606 */
607 607 if (!sema_tryp(&bp->b_sem)) {
608 608 biostats.bio_bufbusy.value.ui32++;
609 609 mutex_exit(hmp);
610 610 /*
611 611 * OK, we are dealing with a busy buffer.
612 612 * In the case that we are panicking and we
613 613 * got called from bread(), we have some chance
614 614 * for error recovery. So better bail out from
615 615 * here since sema_p() won't block. If we got
616 616 * called directly from ufs routines, there is
617 617 * no way to report an error yet.
618 618 */
619 619 if (panicstr && errflg)
620 620 goto errout;
621 621 /*
622 622 * For the following line of code to work
623 623 * correctly never kmem_free the buffer "header".
624 624 */
625 625 sema_p(&bp->b_sem);
626 626 if (bp->b_blkno != blkno || bp->b_edev != dev ||
627 627 (bp->b_flags & B_STALE)) {
628 628 sema_v(&bp->b_sem);
629 629 mutex_enter(hmp);
630 630 goto loop; /* start over */
631 631 }
632 632 mutex_enter(hmp);
633 633 }
634 634 /* Found */
635 635 biostats.bio_hit.value.ui32++;
636 636 bp->b_flags &= ~B_AGE;
637 637
638 638 /*
639 639 * Yank it off the free/delayed write lists
640 640 */
641 641 hp->b_length--;
642 642 notavail(bp);
643 643 mutex_exit(hmp);
644 644
645 645 ASSERT((bp->b_flags & B_NOCACHE) == 0);
646 646
647 647 if (nbp == NULL) {
648 648 /*
649 649 * Make the common path short.
650 650 */
651 651 ASSERT(SEMA_HELD(&bp->b_sem));
652 652 return (bp);
653 653 }
654 654
655 655 biostats.bio_bufdup.value.ui32++;
656 656
657 657 /*
658 658 * The buffer must have entered during the lock upgrade
659 659 * so free the new buffer we allocated and return the
660 660 * found buffer.
661 661 */
662 662 kmem_free(nbp->b_un.b_addr, nbp->b_bufsize);
663 663 nbp->b_un.b_addr = NULL;
664 664
665 665 /*
666 666 * Account for the memory
667 667 */
668 668 mutex_enter(&bfree_lock);
669 669 bfreelist.b_bufsize += nbp->b_bufsize;
670 670 mutex_exit(&bfree_lock);
671 671
672 672 /*
673 673 * Destroy buf identity, and place on avail list
674 674 */
675 675 nbp->b_dev = (o_dev_t)NODEV;
676 676 nbp->b_edev = NODEV;
677 677 nbp->b_flags = 0;
678 678 nbp->b_file = NULL;
679 679 nbp->b_offset = -1;
680 680
681 681 sema_v(&nbp->b_sem);
682 682 bio_bhdr_free(nbp);
683 683
684 684 ASSERT(SEMA_HELD(&bp->b_sem));
685 685 return (bp);
686 686 }
687 687
688 688 /*
689 689 * bio_getfreeblk may block so check the hash chain again.
690 690 */
691 691 if (nbp == NULL) {
692 692 mutex_exit(hmp);
693 693 nbp = bio_getfreeblk(bsize);
694 694 mutex_enter(hmp);
695 695 goto loop;
696 696 }
697 697
698 698 /*
699 699 * New buffer. Assign nbp and stick it on the hash.
700 700 */
701 701 nbp->b_flags = B_BUSY;
702 702 nbp->b_edev = dev;
703 703 nbp->b_dev = (o_dev_t)cmpdev(dev);
704 704 nbp->b_blkno = blkno;
705 705 nbp->b_iodone = NULL;
706 706 nbp->b_bcount = bsize;
707 707 /*
708 708 * If we are given a ufsvfsp and the vfs_root field is NULL
709 709 * then this must be I/O for a superblock. A superblock's
710 710 * buffer is set up in mountfs() and there is no root vnode
711 711 * at that point.
712 712 */
713 713 if (ufsvfsp && ufsvfsp->vfs_root) {
714 714 nbp->b_vp = ufsvfsp->vfs_root;
715 715 } else {
716 716 nbp->b_vp = NULL;
717 717 }
718 718
719 719 ASSERT((nbp->b_flags & B_NOCACHE) == 0);
720 720
721 721 binshash(nbp, dp);
722 722 mutex_exit(hmp);
723 723
724 724 ASSERT(SEMA_HELD(&nbp->b_sem));
725 725
726 726 return (nbp);
727 727
728 728
729 729 /*
730 730 * Come here in case of an internal error. At this point we couldn't
731 731 * get a buffer, but we have to return one. Hence we allocate some
732 732 * kind of error reply buffer on the fly. This buffer is marked as
733 733 * B_NOCACHE | B_AGE | B_ERROR | B_DONE to assure the following:
734 734 * - B_ERROR will indicate error to the caller.
735 735 * - B_DONE will prevent us from reading the buffer from
736 736 * the device.
737 737 * - B_NOCACHE will cause that this buffer gets free'd in
738 738 * brelse().
739 739 */
740 740
741 741 errout:
742 742 errbp = geteblk();
743 743 sema_p(&errbp->b_sem);
744 744 errbp->b_flags &= ~B_BUSY;
745 745 errbp->b_flags |= (B_ERROR | B_DONE);
746 746 return (errbp);
747 747 }
748 748
749 749 /*
750 750 * Get an empty block, not assigned to any particular device.
751 751 * Returns a locked buffer that is not on any hash or free list.
752 752 */
753 753 struct buf *
754 754 ngeteblk(long bsize)
755 755 {
756 756 struct buf *bp;
757 757
758 758 bp = kmem_alloc(sizeof (struct buf), KM_SLEEP);
759 759 bioinit(bp);
760 760 bp->av_forw = bp->av_back = NULL;
761 761 bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
762 762 bp->b_bufsize = bsize;
763 763 bp->b_flags = B_BUSY | B_NOCACHE | B_AGE;
764 764 bp->b_dev = (o_dev_t)NODEV;
765 765 bp->b_edev = NODEV;
766 766 bp->b_lblkno = 0;
767 767 bp->b_bcount = bsize;
768 768 bp->b_iodone = NULL;
769 769 return (bp);
770 770 }
771 771
772 772 /*
773 773 * Interface of geteblk() is kept intact to maintain driver compatibility.
774 774 * Use ngeteblk() to allocate block size other than 1 KB.
775 775 */
776 776 struct buf *
777 777 geteblk(void)
778 778 {
779 779 return (ngeteblk((long)1024));
780 780 }
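A sketch of the anonymous-buffer path, assuming an arbitrary device, block and an 8K transfer; because the buffer carries B_NOCACHE, the brelse() done on write completion destroys it instead of caching it.

	static void
	example_scratch_write(dev_t dev, daddr_t blkno)
	{
		struct buf *bp;

		bp = ngeteblk(8192);		/* locked; on no hash or free list */
		bp->b_edev = dev;
		bp->b_blkno = blkno;
		bzero(bp->b_un.b_addr, bp->b_bufsize);
		/* ... fill in the payload ... */
		bwrite(bp);			/* waits; B_NOCACHE buf is then freed */
	}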
781 781
782 782 /*
783 783 * Return a buffer w/o sleeping
784 784 */
785 785 struct buf *
786 786 trygetblk(dev_t dev, daddr_t blkno)
787 787 {
788 788 struct buf *bp;
789 789 struct buf *dp;
790 790 struct hbuf *hp;
791 791 kmutex_t *hmp;
792 792 uint_t index;
793 793
794 794 index = bio_bhash(dev, blkno);
795 795 hp = &hbuf[index];
796 796 hmp = &hp->b_lock;
797 797
798 798 if (!mutex_tryenter(hmp))
799 799 return (NULL);
800 800
801 801 dp = (struct buf *)hp;
802 802 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
803 803 if (bp->b_blkno != blkno || bp->b_edev != dev ||
804 804 (bp->b_flags & B_STALE))
805 805 continue;
806 806 /*
807 807 * Get access to a valid buffer without sleeping
808 808 */
809 809 if (sema_tryp(&bp->b_sem)) {
810 810 if (bp->b_flags & B_DONE) {
811 811 hp->b_length--;
812 812 notavail(bp);
813 813 mutex_exit(hmp);
814 814 return (bp);
815 815 } else {
816 816 sema_v(&bp->b_sem);
817 817 break;
818 818 }
819 819 }
820 820 break;
821 821 }
822 822 mutex_exit(hmp);
823 823 return (NULL);
824 824 }
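Assumed usage: probe the cache with no chance of sleeping, and fall back to an ordinary bread() when nothing valid and unlocked is found; the wrapper name is hypothetical. Either way the caller ends up holding b_sem and must eventually release the buffer with brelse() or one of the write routines.

	static struct buf *
	example_lookup(dev_t dev, daddr_t blkno, long bsize)
	{
		struct buf *bp;

		if ((bp = trygetblk(dev, blkno)) != NULL)
			return (bp);			/* cached, B_DONE, b_sem held */
		return (bread(dev, blkno, bsize));	/* may sleep and do I/O */
	}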
825 825
826 826 /*
827 827 * Wait for I/O completion on the buffer; return errors
828 828 * to the user.
829 829 */
830 830 int
831 831 iowait(struct buf *bp)
832 832 {
833 833 ASSERT(SEMA_HELD(&bp->b_sem));
834 834 return (biowait(bp));
835 835 }
836 836
837 837 /*
838 838 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
839 839 * and wake up anyone waiting for it.
840 840 */
841 841 void
842 842 iodone(struct buf *bp)
843 843 {
844 844 ASSERT(SEMA_HELD(&bp->b_sem));
845 845 (void) biodone(bp);
846 846 }
847 847
848 848 /*
849 849 * Zero the core associated with a buffer.
850 850 */
851 851 void
852 852 clrbuf(struct buf *bp)
853 853 {
854 854 ASSERT(SEMA_HELD(&bp->b_sem));
855 855 bzero(bp->b_un.b_addr, bp->b_bcount);
856 856 bp->b_resid = 0;
857 857 }
858 858
859 859
860 860 /*
861 861 * Make sure all write-behind blocks on dev (or NODEV for all)
862 862 * are flushed out.
863 863 */
864 864 void
865 865 bflush(dev_t dev)
866 866 {
867 867 struct buf *bp, *dp;
868 868 struct hbuf *hp;
869 869 struct buf *delwri_list = EMPTY_LIST;
870 870 int i, index;
871 871 kmutex_t *hmp;
872 872
873 873 mutex_enter(&blist_lock);
874 874 /*
875 875 * Wait for any invalidates or flushes ahead of us to finish.
876 876 * We really could split blist_lock up per device for better
877 877 * parallelism here.
878 878 */
879 879 while (bio_doinginval || bio_doingflush) {
880 880 bio_flinv_cv_wanted = 1;
881 881 cv_wait(&bio_flushinval_cv, &blist_lock);
882 882 }
883 883 bio_doingflush++;
884 884 /*
885 885 * Gather all B_DELWRI buffer for device.
886 886 * Lock ordering is b_sem > hash lock (brelse).
887 887 * Since we are finding the buffer via the delayed write list,
888 888 * it may be busy and we would block trying to get the
889 889 * b_sem lock while holding hash lock. So transfer all the
890 890 * candidates on the delwri_list and then drop the hash locks.
891 891 */
892 892 for (i = 0; i < v.v_hbuf; i++) {
893 893 hmp = &hbuf[i].b_lock;
894 894 dp = (struct buf *)&dwbuf[i];
895 895 mutex_enter(hmp);
896 896 for (bp = dp->av_forw; bp != dp; bp = bp->av_forw) {
897 897 if (dev == NODEV || bp->b_edev == dev) {
898 898 if (bp->b_list == NULL) {
899 899 bp->b_list = delwri_list;
900 900 delwri_list = bp;
901 901 }
902 902 }
903 903 }
904 904 mutex_exit(hmp);
905 905 }
906 906 mutex_exit(&blist_lock);
907 907
908 908 /*
909 909 * Now that the hash locks have been dropped grab the semaphores
910 910 * and write back all the buffers that have B_DELWRI set.
911 911 */
912 912 while (delwri_list != EMPTY_LIST) {
913 913 bp = delwri_list;
914 914
915 915 sema_p(&bp->b_sem); /* may block */
916 916 if ((dev != bp->b_edev && dev != NODEV) ||
917 917 (panicstr && bp->b_flags & B_BUSY)) {
918 918 sema_v(&bp->b_sem);
919 919 delwri_list = bp->b_list;
920 920 bp->b_list = NULL;
921 921 continue; /* No longer a candidate */
922 922 }
923 923 if (bp->b_flags & B_DELWRI) {
924 924 index = bio_bhash(bp->b_edev, bp->b_blkno);
925 925 hp = &hbuf[index];
926 926 hmp = &hp->b_lock;
927 927 dp = (struct buf *)hp;
928 928
929 929 bp->b_flags |= B_ASYNC;
930 930 mutex_enter(hmp);
931 931 hp->b_length--;
932 932 notavail(bp);
933 933 mutex_exit(hmp);
934 934 if (bp->b_vp == NULL) { /* !ufs */
935 935 BWRITE(bp);
936 936 } else { /* ufs */
937 937 UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
938 938 }
939 939 } else {
940 940 sema_v(&bp->b_sem);
941 941 }
942 942 delwri_list = bp->b_list;
943 943 bp->b_list = NULL;
944 944 }
945 945 mutex_enter(&blist_lock);
946 946 bio_doingflush--;
947 947 if (bio_flinv_cv_wanted) {
948 948 bio_flinv_cv_wanted = 0;
949 949 cv_broadcast(&bio_flushinval_cv);
950 950 }
951 951 mutex_exit(&blist_lock);
952 952 }
953 953
954 954 /*
955 955 * Ensure that a specified block is up-to-date on disk.
956 956 */
957 957 void
958 958 blkflush(dev_t dev, daddr_t blkno)
959 959 {
960 960 struct buf *bp, *dp;
961 961 struct hbuf *hp;
962 962 struct buf *sbp = NULL;
963 963 uint_t index;
964 964 kmutex_t *hmp;
965 965
966 966 index = bio_bhash(dev, blkno);
967 967 hp = &hbuf[index];
968 968 dp = (struct buf *)hp;
969 969 hmp = &hp->b_lock;
970 970
971 971 /*
972 972 * Identify the buffer in the cache belonging to
973 973 * this device and blkno (if any).
974 974 */
975 975 mutex_enter(hmp);
976 976 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
977 977 if (bp->b_blkno != blkno || bp->b_edev != dev ||
978 978 (bp->b_flags & B_STALE))
979 979 continue;
980 980 sbp = bp;
981 981 break;
982 982 }
983 983 mutex_exit(hmp);
984 984 if (sbp == NULL)
985 985 return;
986 986 /*
987 987 * Now check the buffer we have identified and
988 988 * make sure it still belongs to the device and is B_DELWRI
989 989 */
990 990 sema_p(&sbp->b_sem);
991 991 if (sbp->b_blkno == blkno && sbp->b_edev == dev &&
992 992 (sbp->b_flags & (B_DELWRI|B_STALE)) == B_DELWRI) {
993 993 mutex_enter(hmp);
994 994 hp->b_length--;
995 995 notavail(sbp);
996 996 mutex_exit(hmp);
997 997 /*
998 998 * XXX - There is nothing to guarantee a synchronous
999 999 * write here if the B_ASYNC flag is set. This needs
1000 1000 * some investigation.
1001 1001 */
1002 1002 if (sbp->b_vp == NULL) { /* !ufs */
1003 1003 BWRITE(sbp); /* synchronous write */
1004 1004 } else { /* ufs */
1005 1005 UFS_BWRITE(VTOI(sbp->b_vp)->i_ufsvfs, sbp);
1006 1006 }
1007 1007 } else {
1008 1008 sema_v(&sbp->b_sem);
1009 1009 }
1010 1010 }
1011 1011
1012 1012 /*
1013 1013 * Same as binval, except can force-invalidate delayed-write buffers
1014 1014 * (which may not have been flushed because of device errors). Also
1015 1015 * makes sure that the retry write flag is cleared.
1016 1016 */
1017 1017 int
1018 1018 bfinval(dev_t dev, int force)
1019 1019 {
1020 1020 struct buf *dp;
1021 1021 struct buf *bp;
1022 1022 struct buf *binval_list = EMPTY_LIST;
1023 1023 int i, error = 0;
1024 1024 kmutex_t *hmp;
1025 1025 uint_t index;
1026 1026 struct buf **backp;
1027 1027
1028 1028 mutex_enter(&blist_lock);
1029 1029 /*
1030 1030 * Wait for any flushes ahead of us to finish, it's ok to
1031 1031 * do invalidates in parallel.
1032 1032 */
1033 1033 while (bio_doingflush) {
1034 1034 bio_flinv_cv_wanted = 1;
1035 1035 cv_wait(&bio_flushinval_cv, &blist_lock);
1036 1036 }
1037 1037 bio_doinginval++;
1038 1038
1039 1039 /* Gather bp's */
1040 1040 for (i = 0; i < v.v_hbuf; i++) {
1041 1041 dp = (struct buf *)&hbuf[i];
1042 1042 hmp = &hbuf[i].b_lock;
1043 1043
1044 1044 mutex_enter(hmp);
1045 1045 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1046 1046 if (bp->b_edev == dev) {
1047 1047 if (bp->b_list == NULL) {
1048 1048 bp->b_list = binval_list;
1049 1049 binval_list = bp;
1050 1050 }
1051 1051 }
1052 1052 }
1053 1053 mutex_exit(hmp);
1054 1054 }
1055 1055 mutex_exit(&blist_lock);
1056 1056
1057 1057 /* Invalidate all bp's found */
1058 1058 while (binval_list != EMPTY_LIST) {
1059 1059 bp = binval_list;
1060 1060
1061 1061 sema_p(&bp->b_sem);
1062 1062 if (bp->b_edev == dev) {
1063 1063 if (force && (bp->b_flags & B_DELWRI)) {
1064 1064 /* clear B_DELWRI, move to non-dw freelist */
1065 1065 index = bio_bhash(bp->b_edev, bp->b_blkno);
1066 1066 hmp = &hbuf[index].b_lock;
1067 1067 dp = (struct buf *)&hbuf[index];
1068 1068 mutex_enter(hmp);
1069 1069
1070 1070 /* remove from delayed write freelist */
1071 1071 notavail(bp);
1072 1072
1073 1073 /* add to B_AGE side of non-dw freelist */
1074 1074 backp = &dp->av_forw;
1075 1075 (*backp)->av_back = bp;
1076 1076 bp->av_forw = *backp;
1077 1077 *backp = bp;
1078 1078 bp->av_back = dp;
1079 1079
1080 1080 /*
1081 1081 * make sure write retries and busy are cleared
1082 1082 */
1083 1083 bp->b_flags &=
1084 1084 ~(B_BUSY | B_DELWRI | B_RETRYWRI);
1085 1085 mutex_exit(hmp);
1086 1086 }
1087 1087 if ((bp->b_flags & B_DELWRI) == 0)
1088 1088 bp->b_flags |= B_STALE|B_AGE;
1089 1089 else
1090 1090 error = EIO;
1091 1091 }
1092 1092 sema_v(&bp->b_sem);
1093 1093 binval_list = bp->b_list;
1094 1094 bp->b_list = NULL;
1095 1095 }
1096 1096 mutex_enter(&blist_lock);
1097 1097 bio_doinginval--;
1098 1098 if (bio_flinv_cv_wanted) {
1099 1099 cv_broadcast(&bio_flushinval_cv);
1100 1100 bio_flinv_cv_wanted = 0;
1101 1101 }
1102 1102 mutex_exit(&blist_lock);
1103 1103 return (error);
1104 1104 }
1105 1105
1106 1106 /*
1107 1107 * If possible, invalidate blocks for a dev on demand
1108 1108 */
1109 1109 void
1110 1110 binval(dev_t dev)
1111 1111 {
1112 1112 (void) bfinval(dev, 0);
1113 1113 }
1114 1114
1115 1115 /*
1116 1116 * Initialize the buffer I/O system by freeing
1117 1117 * all buffers and setting all device hash buffer lists to empty.
1118 1118 */
1119 1119 void
1120 1120 binit(void)
1121 1121 {
1122 1122 struct buf *bp;
1123 1123 unsigned int i, pct;
1124 1124 ulong_t bio_max_hwm, bio_default_hwm;
1125 1125
1126 1126 /*
1127 1127 * Maximum/Default values for bufhwm are set to the smallest of:
1128 1128 * - BIO_MAX_PERCENT resp. BIO_BUF_PERCENT of real memory
1129 1129 * - 1/4 of kernel virtual memory
1130 1130 * - INT32_MAX to prevent overflows of v.v_bufhwm (which is int).
1131 1131 * Additionally, in order to allow simple tuning by percentage of
1132 1132 * physical memory, bufhwm_pct is used to calculate the default if
1133 1133 * the value of this tunable is between 0 and BIO_MAX_PERCENT.
1134 1134 *
1135 1135 * Since the unit for v.v_bufhwm is kilobytes, this allows for
1136 1136 * a maximum of 1024 * 2GB == 2TB memory usage by buffer headers.
1137 1137 */
1138 1138 bio_max_hwm = MIN(physmem / BIO_MAX_PERCENT,
1139 1139 btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
1140 1140 bio_max_hwm = MIN(INT32_MAX, bio_max_hwm);
1141 1141
1142 1142 pct = BIO_BUF_PERCENT;
1143 1143 if (bufhwm_pct != 0 &&
1144 1144 ((pct = 100 / bufhwm_pct) < BIO_MAX_PERCENT)) {
1145 1145 pct = BIO_BUF_PERCENT;
1146 1146 /*
1147 1147 * Invalid user specified value, emit a warning.
1148 1148 */
1149 1149 cmn_err(CE_WARN, "binit: bufhwm_pct(%d) out of \
1150 1150 range(1..%d). Using %d as default.",
1151 1151 bufhwm_pct,
1152 1152 100 / BIO_MAX_PERCENT, 100 / BIO_BUF_PERCENT);
1153 1153 }
1154 1154
1155 1155 bio_default_hwm = MIN(physmem / pct,
1156 1156 btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
1157 1157 bio_default_hwm = MIN(INT32_MAX, bio_default_hwm);
1158 1158
1159 1159 if ((v.v_bufhwm = bufhwm) == 0)
1160 1160 v.v_bufhwm = bio_default_hwm;
1161 1161
1162 1162 if (v.v_bufhwm < BIO_MIN_HWM || v.v_bufhwm > bio_max_hwm) {
1163 1163 v.v_bufhwm = (int)bio_max_hwm;
1164 1164 /*
1165 1165 * Invalid user specified value, emit a warning.
1166 1166 */
1167 1167 cmn_err(CE_WARN,
1168 1168 "binit: bufhwm(%d) out \
1169 1169 of range(%d..%lu). Using %lu as default",
1170 1170 bufhwm,
1171 1171 BIO_MIN_HWM, bio_max_hwm, bio_max_hwm);
1172 1172 }
1173 1173
1174 1174 /*
1175 1175 * Determine the number of hash buckets. Default is to
1176 1176 * create ~BIO_HASHLEN entries per chain based on MAXBSIZE buffers.
1177 1177 * Round up number to the next power of 2.
1178 1178 */
1179 1179 v.v_hbuf = 1 << highbit((((ulong_t)v.v_bufhwm * 1024) / MAXBSIZE) /
1180 1180 BIO_HASHLEN);
1181 1181 v.v_hmask = v.v_hbuf - 1;
1182 1182 v.v_buf = BIO_BHDR_POOL;
1183 1183
1184 1184 hbuf = kmem_zalloc(v.v_hbuf * sizeof (struct hbuf), KM_SLEEP);
1185 1185
1186 1186 dwbuf = kmem_zalloc(v.v_hbuf * sizeof (struct dwbuf), KM_SLEEP);
1187 1187
1188 1188 bfreelist.b_bufsize = (size_t)v.v_bufhwm * 1024;
1189 1189 bp = &bfreelist;
1190 1190 bp->b_forw = bp->b_back = bp->av_forw = bp->av_back = bp;
1191 1191
1192 1192 for (i = 0; i < v.v_hbuf; i++) {
1193 1193 hbuf[i].b_forw = hbuf[i].b_back = (struct buf *)&hbuf[i];
1194 1194 hbuf[i].av_forw = hbuf[i].av_back = (struct buf *)&hbuf[i];
1195 1195
1196 1196 /*
1197 1197 * Initialize the delayed write buffer list.
1198 1198 */
1199 1199 dwbuf[i].b_forw = dwbuf[i].b_back = (struct buf *)&dwbuf[i];
1200 1200 dwbuf[i].av_forw = dwbuf[i].av_back = (struct buf *)&dwbuf[i];
1201 1201 }
1202 1202 }
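To make the sizing arithmetic concrete, a worked example under assumed values: 4 GB of physical memory, 4 KB pages, MAXBSIZE of 8 KB, ample free kernel heap, and both bufhwm and bufhwm_pct left at zero.

	physmem         = 4 GB / 4 KB                        = 1,048,576 pages
	bio_max_hwm     = (physmem / (100/20)) * (4096/1024) = 838,860 KB  (~20% of RAM)
	bio_default_hwm = (physmem / (100/2))  * (4096/1024) = 83,884 KB   (~2% of RAM)
	v.v_bufhwm      = bio_default_hwm                    = 83,884 KB
	v.v_hbuf        = 1 << highbit(((83884 * 1024) / 8192) / 4)
	                = 1 << highbit(2621) = 1 << 12      = 4,096 hash buckets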
1203 1203
1204 1204 /*
1205 1205 * Wait for I/O completion on the buffer; return error code.
1206 1206 * If bp was for synchronous I/O, bp is invalid and associated
1207 1207 * resources are freed on return.
1208 1208 */
1209 1209 int
1210 1210 biowait(struct buf *bp)
1211 1211 {
1212 1212 int error = 0;
1213 1213 struct cpu *cpup;
1214 1214
1215 1215 ASSERT(SEMA_HELD(&bp->b_sem));
1216 1216
1217 1217 cpup = CPU;
1218 1218 atomic_inc_64(&cpup->cpu_stats.sys.iowait);
1219 1219 DTRACE_IO1(wait__start, struct buf *, bp);
1220 1220
1221 1221 /*
1222 1222 * In case of panic, busy wait for completion
1223 1223 */
1224 1224 if (panicstr) {
1225 1225 while ((bp->b_flags & B_DONE) == 0)
1226 1226 drv_usecwait(10);
1227 1227 } else
1228 1228 sema_p(&bp->b_io);
1229 1229
1230 1230 DTRACE_IO1(wait__done, struct buf *, bp);
1231 1231 atomic_dec_64(&cpup->cpu_stats.sys.iowait);
1232 1232
1233 1233 error = geterror(bp);
1234 1234 if ((bp->b_flags & B_ASYNC) == 0) {
1235 1235 if (bp->b_flags & B_REMAPPED)
1236 1236 bp_mapout(bp);
1237 1237 }
1238 1238 return (error);
1239 1239 }
1240 1240
1241 1241 static void
1242 1242 biodone_tnf_probe(struct buf *bp)
1243 1243 {
1244 1244 /* Kernel probe */
1245 1245 TNF_PROBE_3(biodone, "io blockio", /* CSTYLED */,
1246 1246 tnf_device, device, bp->b_edev,
1247 1247 tnf_diskaddr, block, bp->b_lblkno,
1248 1248 tnf_opaque, buf, bp);
1249 1249 }
1250 1250
1251 1251 /*
1252 1252 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
1253 1253 * and wake up anyone waiting for it.
1254 1254 */
1255 1255 void
1256 1256 biodone(struct buf *bp)
1257 1257 {
1258 1258 if (bp->b_flags & B_STARTED) {
1259 1259 DTRACE_IO1(done, struct buf *, bp);
1260 1260 bp->b_flags &= ~B_STARTED;
1261 1261 }
1262 1262
1263 1263 /*
1264 1264 * Call the TNF probe here instead of the inline code
1265 1265 * to force our compiler to use the tail call optimization.
1266 1266 */
1267 1267 biodone_tnf_probe(bp);
1268 1268
1269 1269 if (bp->b_iodone != NULL) {
1270 1270 (*(bp->b_iodone))(bp);
1271 1271 return;
1272 1272 }
1273 1273 ASSERT((bp->b_flags & B_DONE) == 0);
1274 1274 ASSERT(SEMA_HELD(&bp->b_sem));
1275 1275 bp->b_flags |= B_DONE;
1276 1276 if (bp->b_flags & B_ASYNC) {
1277 1277 if (bp->b_flags & (B_PAGEIO|B_REMAPPED))
1278 1278 bio_pageio_done(bp);
1279 1279 else
1280 1280 brelse(bp); /* release bp to freelist */
1281 1281 } else {
1282 1282 sema_v(&bp->b_io);
1283 1283 }
1284 1284 }
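For context, a sketch of how a driver's completion path typically hands a buffer back; the function and its error argument are hypothetical, while bioerror() is the standard helper for flagging a failed transfer.

	static void
	example_io_complete(struct buf *bp, int err)
	{
		if (err != 0) {
			bioerror(bp, err);		/* sets b_error and B_ERROR */
			bp->b_resid = bp->b_bcount;	/* nothing was transferred */
		}
		biodone(bp);	/* posts b_io for biowait(), or releases an async buf */
	}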
1285 1285
1286 1286 /*
1287 1287 * Pick up the device's error number and pass it to the user;
1288 1288 * if there is an error but the number is 0 set a generalized code.
1289 1289 */
1290 1290 int
1291 1291 geterror(struct buf *bp)
1292 1292 {
1293 1293 int error = 0;
1294 1294
1295 1295 ASSERT(SEMA_HELD(&bp->b_sem));
1296 1296 if (bp->b_flags & B_ERROR) {
1297 1297 error = bp->b_error;
1298 1298 if (!error)
1299 1299 error = EIO;
1300 1300 }
1301 1301 return (error);
1302 1302 }
1303 1303
1304 1304 /*
1305 1305 * Support for pageio buffers.
1306 1306 *
1307 1307 * This stuff should be generalized to provide a generalized bp
1308 1308 * header facility that can be used for things other than pageio.
1309 1309 */
1310 1310
1311 1311 /*
1312 1312 * Allocate and initialize a buf struct for use with pageio.
1313 1313 */
1314 1314 struct buf *
1315 1315 pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags)
1316 1316 {
1317 1317 struct buf *bp;
1318 1318 struct cpu *cpup;
1319 1319
1320 1320 if (flags & B_READ) {
1321 1321 CPU_STATS_ENTER_K();
1322 1322 cpup = CPU; /* get pointer AFTER preemption is disabled */
1323 1323 CPU_STATS_ADDQ(cpup, vm, pgin, 1);
1324 1324 CPU_STATS_ADDQ(cpup, vm, pgpgin, btopr(len));
1325 1325
1326 1326 atomic_add_64(&curzone->zone_pgpgin, btopr(len));
1327 1327
1328 1328 if ((flags & B_ASYNC) == 0) {
1329 1329 klwp_t *lwp = ttolwp(curthread);
1330 1330 if (lwp != NULL)
1331 1331 lwp->lwp_ru.majflt++;
1332 1332 CPU_STATS_ADDQ(cpup, vm, maj_fault, 1);
1333 1333 /* Kernel probe */
1334 1334 TNF_PROBE_2(major_fault, "vm pagefault", /* CSTYLED */,
1335 1335 tnf_opaque, vnode, pp->p_vnode,
1336 1336 tnf_offset, offset, pp->p_offset);
1337 1337 }
1338 1338 /*
1339 1339 * Update statistics for pages being paged in
1340 1340 */
1341 1341 if (pp != NULL && pp->p_vnode != NULL) {
1342 1342 if (IS_SWAPFSVP(pp->p_vnode)) {
1343 1343 CPU_STATS_ADDQ(cpup, vm, anonpgin, btopr(len));
1344 1344 atomic_add_64(&curzone->zone_anonpgin,
1345 1345 btopr(len));
1346 1346 } else {
1347 1347 if (pp->p_vnode->v_flag & VVMEXEC) {
1348 1348 CPU_STATS_ADDQ(cpup, vm, execpgin,
1349 1349 btopr(len));
1350 1350 atomic_add_64(&curzone->zone_execpgin,
1351 1351 btopr(len));
1352 1352 } else {
1353 1353 CPU_STATS_ADDQ(cpup, vm, fspgin,
1354 1354 btopr(len));
1355 1355 atomic_add_64(&curzone->zone_fspgin,
1356 1356 btopr(len));
1357 1357 }
1358 1358 }
1359 1359 }
1360 1360 CPU_STATS_EXIT_K();
1361 1361 TRACE_1(TR_FAC_VM, TR_PAGE_WS_IN,
1362 1362 "page_ws_in:pp %p", pp);
1363 1363 /* Kernel probe */
1364 1364 TNF_PROBE_3(pagein, "vm pageio io", /* CSTYLED */,
1365 1365 tnf_opaque, vnode, pp->p_vnode,
1366 1366 tnf_offset, offset, pp->p_offset,
1367 1367 tnf_size, size, len);
1368 1368 }
1369 1369
1370 1370 bp = kmem_zalloc(sizeof (struct buf), KM_SLEEP);
1371 1371 bp->b_bcount = len;
1372 1372 bp->b_bufsize = len;
1373 1373 bp->b_pages = pp;
1374 1374 bp->b_flags = B_PAGEIO | B_NOCACHE | B_BUSY | flags;
1375 1375 bp->b_offset = -1;
1376 1376 sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
1377 1377
1378 1378 /* Initialize bp->b_sem in "locked" state */
1379 1379 sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
1380 1380
1381 1381 VN_HOLD(vp);
1382 1382 bp->b_vp = vp;
1383 - THREAD_KPRI_RELEASE_N(btopr(len)); /* release kpri from page_locks */
1384 1383
1385 1384 /*
1386 1385 * Caller sets dev & blkno and can adjust
1387 1386 * b_addr for page offset and can use bp_mapin
1388 1387 * to make pages kernel addressable.
1389 1388 */
1390 1389 return (bp);
1391 1390 }
1392 1391
1393 1392 void
1394 1393 pageio_done(struct buf *bp)
1395 1394 {
1396 1395 ASSERT(SEMA_HELD(&bp->b_sem));
1397 1396 if (bp->b_flags & B_REMAPPED)
1398 1397 bp_mapout(bp);
1399 1398 VN_RELE(bp->b_vp);
1400 1399 bp->b_vp = NULL;
1401 1400 ASSERT((bp->b_flags & B_NOCACHE) != 0);
1402 1401
1403 1402 /* A sema_v(bp->b_sem) is implied if we are destroying it */
1404 1403 sema_destroy(&bp->b_sem);
1405 1404 sema_destroy(&bp->b_io);
1406 1405 kmem_free(bp, sizeof (struct buf));
1407 1406 }
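Assumed usage of the pageio pair above, modeled loosely on how file systems drive page-level reads: set up the buf, fill in the device addressing, call the strategy routine, wait, and tear down. Everything other than the bio.c routines themselves is illustrative.

	static int
	example_page_read(struct page *pp, size_t len, struct vnode *vp,
	    dev_t dev, daddr_t blkno)
	{
		struct buf *bp;
		int error;

		bp = pageio_setup(pp, len, vp, B_READ);	/* B_PAGEIO|B_NOCACHE|B_BUSY */
		bp->b_edev = dev;
		bp->b_dev = (o_dev_t)cmpdev(dev);
		bp->b_blkno = blkno;
		bp->b_un.b_addr = (caddr_t)0;		/* byte offset into the first page */
		(void) bdev_strategy(bp);
		error = biowait(bp);			/* synchronous: waits on b_io */
		pageio_done(bp);			/* VN_RELE()s vp and frees bp */
		return (error);
	}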
1408 1407
1409 1408 /*
1410 1409 * Check to see whether the buffers, except the one pointed by sbp,
1411 1410 * associated with the device are busy.
1412 1411 * NOTE: This expensive operation shall be improved together with ufs_icheck().
1413 1412 */
1414 1413 int
1415 1414 bcheck(dev_t dev, struct buf *sbp)
1416 1415 {
1417 1416 struct buf *bp;
1418 1417 struct buf *dp;
1419 1418 int i;
1420 1419 kmutex_t *hmp;
1421 1420
1422 1421 /*
1423 1422 * check for busy bufs for this filesystem
1424 1423 */
1425 1424 for (i = 0; i < v.v_hbuf; i++) {
1426 1425 dp = (struct buf *)&hbuf[i];
1427 1426 hmp = &hbuf[i].b_lock;
1428 1427
1429 1428 mutex_enter(hmp);
1430 1429 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1431 1430 /*
1432 1431 * if buf is busy or dirty, then filesystem is busy
1433 1432 */
1434 1433 if ((bp->b_edev == dev) &&
1435 1434 ((bp->b_flags & B_STALE) == 0) &&
1436 1435 (bp->b_flags & (B_DELWRI|B_BUSY)) &&
1437 1436 (bp != sbp)) {
1438 1437 mutex_exit(hmp);
1439 1438 return (1);
1440 1439 }
1441 1440 }
1442 1441 mutex_exit(hmp);
1443 1442 }
1444 1443 return (0);
1445 1444 }
1446 1445
1447 1446 /*
1448 1447 * Hash two 32 bit entities.
1449 1448 */
1450 1449 int
1451 1450 hash2ints(int x, int y)
1452 1451 {
1453 1452 int hash = 0;
1454 1453
1455 1454 hash = x - 1;
1456 1455 hash = ((hash * 7) + (x >> 8)) - 1;
1457 1456 hash = ((hash * 7) + (x >> 16)) - 1;
1458 1457 hash = ((hash * 7) + (x >> 24)) - 1;
1459 1458 hash = ((hash * 7) + y) - 1;
1460 1459 hash = ((hash * 7) + (y >> 8)) - 1;
1461 1460 hash = ((hash * 7) + (y >> 16)) - 1;
1462 1461 hash = ((hash * 7) + (y >> 24)) - 1;
1463 1462
1464 1463 return (hash);
1465 1464 }
1466 1465
1467 1466
1468 1467 /*
1469 1468 * Return a new buffer struct.
1470 1469 * Create a new buffer if we haven't gone over our high water
1471 1470 * mark for memory, otherwise try to get one off the freelist.
1472 1471 *
1473 1472 * Returns a locked buf that has no id and is not on any hash or free
1474 1473 * list.
1475 1474 */
1476 1475 static struct buf *
1477 1476 bio_getfreeblk(long bsize)
1478 1477 {
1479 1478 struct buf *bp, *dp;
1480 1479 struct hbuf *hp;
1481 1480 kmutex_t *hmp;
1482 1481 uint_t start, end;
1483 1482
1484 1483 /*
1485 1484 * bfreelist.b_bufsize represents the amount of memory we are
1486 1485 * allowed to allocate in the cache before we hit our hwm;
1487 1486 * every reference to bfreelist.b_bufsize must be made with
1488 1487 * bfree_lock held (mutex_enter/mutex_exit around the access).
1489 1488 */
1490 1489 bio_mem_get(bsize); /* Account for our memory request */
1491 1490
1492 1491 again:
1493 1492 bp = bio_bhdr_alloc(); /* Get a buf hdr */
1494 1493 sema_p(&bp->b_sem); /* Should never fail */
1495 1494
1496 1495 ASSERT(bp->b_un.b_addr == NULL);
1497 1496 bp->b_un.b_addr = kmem_alloc(bsize, KM_NOSLEEP);
1498 1497 if (bp->b_un.b_addr != NULL) {
1499 1498 /*
1500 1499 * Make the common path short
1501 1500 */
1502 1501 bp->b_bufsize = bsize;
1503 1502 ASSERT(SEMA_HELD(&bp->b_sem));
1504 1503 return (bp);
1505 1504 } else {
1506 1505 struct buf *save;
1507 1506
1508 1507 save = bp; /* Save bp we allocated */
1509 1508 start = end = lastindex;
1510 1509
1511 1510 biostats.bio_bufwant.value.ui32++;
1512 1511
1513 1512 /*
1514 1513 * Memory isn't available from the system now. Scan
1515 1514 * the hash buckets till enough space is found.
1516 1515 */
1517 1516 do {
1518 1517 hp = &hbuf[start];
1519 1518 hmp = &hp->b_lock;
1520 1519 dp = (struct buf *)hp;
1521 1520
1522 1521 mutex_enter(hmp);
1523 1522 bp = dp->av_forw;
1524 1523
1525 1524 while (bp != dp) {
1526 1525
1527 1526 ASSERT(bp != NULL);
1528 1527
1529 1528 if (!sema_tryp(&bp->b_sem)) {
1530 1529 bp = bp->av_forw;
1531 1530 continue;
1532 1531 }
1533 1532
1534 1533 /*
1535 1534 * Since we are going down the freelist
1536 1535 * associated with this hash bucket the
1537 1536 * B_DELWRI flag should not be set.
1538 1537 */
1539 1538 ASSERT(!(bp->b_flags & B_DELWRI));
1540 1539
1541 1540 if (bp->b_bufsize == bsize) {
1542 1541 hp->b_length--;
1543 1542 notavail(bp);
1544 1543 bremhash(bp);
1545 1544 mutex_exit(hmp);
1546 1545
1547 1546 /*
1548 1547 * Didn't kmem_alloc any more, so don't
1549 1548 * count it twice.
1550 1549 */
1551 1550 mutex_enter(&bfree_lock);
1552 1551 bfreelist.b_bufsize += bsize;
1553 1552 mutex_exit(&bfree_lock);
1554 1553
1555 1554 /*
1556 1555 * Update the lastindex value.
1557 1556 */
1558 1557 lastindex = start;
1559 1558
1560 1559 /*
1561 1560 * Put our saved bp back on the list
1562 1561 */
1563 1562 sema_v(&save->b_sem);
1564 1563 bio_bhdr_free(save);
1565 1564 ASSERT(SEMA_HELD(&bp->b_sem));
1566 1565 return (bp);
1567 1566 }
1568 1567 sema_v(&bp->b_sem);
1569 1568 bp = bp->av_forw;
1570 1569 }
1571 1570 mutex_exit(hmp);
1572 1571 start = ((start + 1) % v.v_hbuf);
1573 1572 } while (start != end);
1574 1573
1575 1574 biostats.bio_bufwait.value.ui32++;
1576 1575 bp = save; /* Use original bp */
1577 1576 bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
1578 1577 }
1579 1578
1580 1579 bp->b_bufsize = bsize;
1581 1580 ASSERT(SEMA_HELD(&bp->b_sem));
1582 1581 return (bp);
1583 1582 }
1584 1583
1585 1584 /*
1586 1585 * Allocate a buffer header. If none currently available, allocate
1587 1586 * a new pool.
1588 1587 */
1589 1588 static struct buf *
1590 1589 bio_bhdr_alloc(void)
1591 1590 {
1592 1591 struct buf *dp, *sdp;
1593 1592 struct buf *bp;
1594 1593 int i;
1595 1594
1596 1595 for (;;) {
1597 1596 mutex_enter(&bhdr_lock);
1598 1597 if (bhdrlist != NULL) {
1599 1598 bp = bhdrlist;
1600 1599 bhdrlist = bp->av_forw;
1601 1600 mutex_exit(&bhdr_lock);
1602 1601 bp->av_forw = NULL;
1603 1602 return (bp);
1604 1603 }
1605 1604 mutex_exit(&bhdr_lock);
1606 1605
1607 1606 /*
1608 1607 * Need to allocate a new pool. If the system is currently
1609 1608 * out of memory, then try freeing things on the freelist.
1610 1609 */
1611 1610 dp = kmem_zalloc(sizeof (struct buf) * v.v_buf, KM_NOSLEEP);
1612 1611 if (dp == NULL) {
1613 1612 /*
1614 1613 * System can't give us a pool of headers, try
1615 1614 * recycling from the free lists.
1616 1615 */
1617 1616 bio_recycle(BIO_HEADER, 0);
1618 1617 } else {
1619 1618 sdp = dp;
1620 1619 for (i = 0; i < v.v_buf; i++, dp++) {
1621 1620 /*
1622 1621 * The next two lines are needed since NODEV
1623 1622 * is -1 and not NULL
1624 1623 */
1625 1624 dp->b_dev = (o_dev_t)NODEV;
1626 1625 dp->b_edev = NODEV;
1627 1626 dp->av_forw = dp + 1;
1628 1627 sema_init(&dp->b_sem, 1, NULL, SEMA_DEFAULT,
1629 1628 NULL);
1630 1629 sema_init(&dp->b_io, 0, NULL, SEMA_DEFAULT,
1631 1630 NULL);
1632 1631 dp->b_offset = -1;
1633 1632 }
1634 1633 mutex_enter(&bhdr_lock);
1635 1634 (--dp)->av_forw = bhdrlist; /* Fix last pointer */
1636 1635 bhdrlist = sdp;
1637 1636 nbuf += v.v_buf;
1638 1637 bp = bhdrlist;
1639 1638 bhdrlist = bp->av_forw;
1640 1639 mutex_exit(&bhdr_lock);
1641 1640
1642 1641 bp->av_forw = NULL;
1643 1642 return (bp);
1644 1643 }
1645 1644 }
1646 1645 }
1647 1646
1648 1647 static void
1649 1648 bio_bhdr_free(struct buf *bp)
1650 1649 {
1651 1650 ASSERT(bp->b_back == NULL);
1652 1651 ASSERT(bp->b_forw == NULL);
1653 1652 ASSERT(bp->av_back == NULL);
1654 1653 ASSERT(bp->av_forw == NULL);
1655 1654 ASSERT(bp->b_un.b_addr == NULL);
1656 1655 ASSERT(bp->b_dev == (o_dev_t)NODEV);
1657 1656 ASSERT(bp->b_edev == NODEV);
1658 1657 ASSERT(bp->b_flags == 0);
1659 1658
1660 1659 mutex_enter(&bhdr_lock);
1661 1660 bp->av_forw = bhdrlist;
1662 1661 bhdrlist = bp;
1663 1662 mutex_exit(&bhdr_lock);
1664 1663 }
1665 1664
1666 1665 /*
1667 1666 * If we haven't gone over the high water mark, it's o.k. to
1668 1667 * allocate more buffer space, otherwise recycle buffers
1669 1668 * from the freelist until enough memory is free for a bsize request.
1670 1669 *
1671 1670 * We account for this memory, even though
1672 1671 * we don't allocate it here.
1673 1672 */
1674 1673 static void
1675 1674 bio_mem_get(long bsize)
1676 1675 {
1677 1676 mutex_enter(&bfree_lock);
1678 1677 if (bfreelist.b_bufsize > bsize) {
1679 1678 bfreelist.b_bufsize -= bsize;
1680 1679 mutex_exit(&bfree_lock);
1681 1680 return;
1682 1681 }
1683 1682 mutex_exit(&bfree_lock);
1684 1683 bio_recycle(BIO_MEM, bsize);
1685 1684 }
1686 1685
1687 1686 /*
1688 1687 * flush a list of delayed write buffers.
1689 1688 * (currently used only by bio_recycle below.)
1690 1689 */
1691 1690 static void
1692 1691 bio_flushlist(struct buf *delwri_list)
1693 1692 {
1694 1693 struct buf *bp;
1695 1694
1696 1695 while (delwri_list != EMPTY_LIST) {
1697 1696 bp = delwri_list;
1698 1697 bp->b_flags |= B_AGE | B_ASYNC;
1699 1698 if (bp->b_vp == NULL) { /* !ufs */
1700 1699 BWRITE(bp);
1701 1700 } else { /* ufs */
1702 1701 UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
1703 1702 }
1704 1703 delwri_list = bp->b_list;
1705 1704 bp->b_list = NULL;
1706 1705 }
1707 1706 }
1708 1707
1709 1708 /*
1710 1709 * Start recycling buffers on the freelist for one of 2 reasons:
1711 1710 * - we need a buffer header
1712 1711 * - we need to free up memory
1713 1712 * Once started we continue to recycle buffers until the B_AGE
1714 1713 * buffers are gone.
1715 1714 */
1716 1715 static void
1717 1716 bio_recycle(int want, long bsize)
1718 1717 {
1719 1718 struct buf *bp, *dp, *dwp, *nbp;
1720 1719 struct hbuf *hp;
1721 1720 int found = 0;
1722 1721 kmutex_t *hmp;
1723 1722 int start, end;
1724 1723 struct buf *delwri_list = EMPTY_LIST;
1725 1724
1726 1725 /*
1727 1726 * Recycle buffers.
1728 1727 */
1729 1728 top:
1730 1729 start = end = lastindex;
1731 1730 do {
1732 1731 hp = &hbuf[start];
1733 1732 hmp = &hp->b_lock;
1734 1733 dp = (struct buf *)hp;
1735 1734
1736 1735 mutex_enter(hmp);
1737 1736 bp = dp->av_forw;
1738 1737
1739 1738 while (bp != dp) {
1740 1739
1741 1740 ASSERT(bp != NULL);
1742 1741
1743 1742 if (!sema_tryp(&bp->b_sem)) {
1744 1743 bp = bp->av_forw;
1745 1744 continue;
1746 1745 }
1747 1746 /*
1748 1747 * Do we really want to nuke all of the B_AGE stuff??
1749 1748 */
1750 1749 if ((bp->b_flags & B_AGE) == 0 && found) {
1751 1750 sema_v(&bp->b_sem);
1752 1751 mutex_exit(hmp);
1753 1752 lastindex = start;
1754 1753 return; /* All done */
1755 1754 }
1756 1755
1757 1756 ASSERT(MUTEX_HELD(&hp->b_lock));
1758 1757 ASSERT(!(bp->b_flags & B_DELWRI));
1759 1758 hp->b_length--;
1760 1759 notavail(bp);
1761 1760
1762 1761 /*
1763 1762 * Remove bhdr from cache, free up memory,
1764 1763 * and add the hdr to the freelist.
1765 1764 */
1766 1765 bremhash(bp);
1767 1766 mutex_exit(hmp);
1768 1767
1769 1768 if (bp->b_bufsize) {
1770 1769 kmem_free(bp->b_un.b_addr, bp->b_bufsize);
1771 1770 bp->b_un.b_addr = NULL;
1772 1771 mutex_enter(&bfree_lock);
1773 1772 bfreelist.b_bufsize += bp->b_bufsize;
1774 1773 mutex_exit(&bfree_lock);
1775 1774 }
1776 1775
1777 1776 bp->b_dev = (o_dev_t)NODEV;
1778 1777 bp->b_edev = NODEV;
1779 1778 bp->b_flags = 0;
1780 1779 sema_v(&bp->b_sem);
1781 1780 bio_bhdr_free(bp);
1782 1781 if (want == BIO_HEADER) {
1783 1782 found = 1;
1784 1783 } else {
1785 1784 ASSERT(want == BIO_MEM);
1786 1785 if (!found && bfreelist.b_bufsize >= bsize) {
1787 1786 /* Account for the memory we want */
1788 1787 mutex_enter(&bfree_lock);
1789 1788 if (bfreelist.b_bufsize >= bsize) {
1790 1789 bfreelist.b_bufsize -= bsize;
1791 1790 found = 1;
1792 1791 }
1793 1792 mutex_exit(&bfree_lock);
1794 1793 }
1795 1794 }
1796 1795
1797 1796 /*
1798 1797 * Since we dropped hmp, start from the
1799 1798 * beginning.
1800 1799 */
1801 1800 mutex_enter(hmp);
1802 1801 bp = dp->av_forw;
1803 1802 }
1804 1803 mutex_exit(hmp);
1805 1804
1806 1805 /*
1807 1806 * Look at the delayed write list.
1808 1807 		 * First gather the buffers into a private list, then write them.
1809 1808 */
1810 1809 dwp = (struct buf *)&dwbuf[start];
1811 1810 mutex_enter(&blist_lock);
1812 1811 bio_doingflush++;
1813 1812 mutex_enter(hmp);
1814 1813 for (bp = dwp->av_forw; bp != dwp; bp = nbp) {
1815 1814
1816 1815 ASSERT(bp != NULL);
1817 1816 nbp = bp->av_forw;
1818 1817
1819 1818 if (!sema_tryp(&bp->b_sem))
1820 1819 continue;
1821 1820 ASSERT(bp->b_flags & B_DELWRI);
1822 1821 /*
1823 1822 * Do we really want to nuke all of the B_AGE stuff??
1824 1823 */
1825 1824
1826 1825 if ((bp->b_flags & B_AGE) == 0 && found) {
1827 1826 sema_v(&bp->b_sem);
1828 1827 mutex_exit(hmp);
1829 1828 lastindex = start;
1830 1829 mutex_exit(&blist_lock);
1831 1830 bio_flushlist(delwri_list);
1832 1831 mutex_enter(&blist_lock);
1833 1832 bio_doingflush--;
1834 1833 if (bio_flinv_cv_wanted) {
1835 1834 bio_flinv_cv_wanted = 0;
1836 1835 cv_broadcast(&bio_flushinval_cv);
1837 1836 }
1838 1837 mutex_exit(&blist_lock);
1839 1838 return; /* All done */
1840 1839 }
1841 1840
1842 1841 /*
1843 1842 * If the buffer is already on a flush or
1844 1843 * invalidate list then just skip it.
1845 1844 */
1846 1845 if (bp->b_list != NULL) {
1847 1846 sema_v(&bp->b_sem);
1848 1847 continue;
1849 1848 }
1850 1849 /*
1851 1850 * We are still on the same bucket.
1852 1851 */
1853 1852 hp->b_length--;
1854 1853 notavail(bp);
1855 1854 bp->b_list = delwri_list;
1856 1855 delwri_list = bp;
1857 1856 }
1858 1857 mutex_exit(hmp);
1859 1858 mutex_exit(&blist_lock);
1860 1859 bio_flushlist(delwri_list);
1861 1860 delwri_list = EMPTY_LIST;
1862 1861 mutex_enter(&blist_lock);
1863 1862 bio_doingflush--;
1864 1863 if (bio_flinv_cv_wanted) {
1865 1864 bio_flinv_cv_wanted = 0;
1866 1865 cv_broadcast(&bio_flushinval_cv);
1867 1866 }
1868 1867 mutex_exit(&blist_lock);
1869 1868 start = (start + 1) % v.v_hbuf;
1870 1869
1871 1870 } while (start != end);
1872 1871
1873 1872 if (found)
1874 1873 return;
1875 1874
1876 1875 /*
1877 1876 	 * The free lists are exhausted and we haven't satisfied the request.
1878 1877 	 * Wait here for more entries to be added to the freelist.
1879 1878 	 * Because that may have just happened, make the wait timed.
1880 1879 */
1881 1880 mutex_enter(&bfree_lock);
1882 1881 bfreelist.b_flags |= B_WANTED;
1883 1882 (void) cv_reltimedwait(&bio_mem_cv, &bfree_lock, hz, TR_CLOCK_TICK);
1884 1883 mutex_exit(&bfree_lock);
1885 1884 goto top;
1886 1885 }
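
The two reasons in the head comment above correspond to bio_recycle()'s two kinds of caller: the buf-header allocator asks for a header (the size argument is not used in that case, per the BIO_HEADER branch above), while bio_mem_get() asks for memory it wants accounted back to bfreelist. A condensed sketch of the two call forms:

	/* Need a buf header: recycle until one is freed onto the header list. */
	bio_recycle(BIO_HEADER, 0);

	/* Need memory: recycle until bsize bytes are accounted to bfreelist. */
	bio_recycle(BIO_MEM, bsize);
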
1887 1886
1888 1887 /*
1889 1888 * See if the block is associated with some buffer
1890 1889 * (mainly to avoid getting hung up on a wait in breada).
1891 1890 */
1892 1891 static int
1893 1892 bio_incore(dev_t dev, daddr_t blkno)
1894 1893 {
1895 1894 struct buf *bp;
1896 1895 struct buf *dp;
1897 1896 uint_t index;
1898 1897 kmutex_t *hmp;
1899 1898
1900 1899 index = bio_bhash(dev, blkno);
1901 1900 dp = (struct buf *)&hbuf[index];
1902 1901 hmp = &hbuf[index].b_lock;
1903 1902
1904 1903 mutex_enter(hmp);
1905 1904 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1906 1905 if (bp->b_blkno == blkno && bp->b_edev == dev &&
1907 1906 (bp->b_flags & B_STALE) == 0) {
1908 1907 mutex_exit(hmp);
1909 1908 return (1);
1910 1909 }
1911 1910 }
1912 1911 mutex_exit(hmp);
1913 1912 return (0);
1914 1913 }
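
bio_incore() lets a read-ahead path test for a cached block without taking the buffer's semaphore. The sketch below shows the kind of check a caller such as breada() performs before committing to an asynchronous read; the variables (dev, rablkno, bsize, rabp) stand in for the caller's context, and the real caller's additional guards are omitted.

	if (rablkno != 0 && !bio_incore(dev, rablkno)) {
		/* Not cached, so getblk() will not stall on a busy buffer. */
		rabp = getblk(dev, rablkno, bsize);
		if (rabp->b_flags & B_DONE) {
			brelse(rabp);			/* raced: already valid */
		} else {
			rabp->b_flags |= B_READ | B_ASYNC;
			(void) bdev_strategy(rabp);	/* fire and forget */
		}
	}
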
1915 1914
1916 1915 static void
1917 1916 bio_pageio_done(struct buf *bp)
1918 1917 {
1919 1918 if (bp->b_flags & B_PAGEIO) {
1920 1919
1921 1920 if (bp->b_flags & B_REMAPPED)
1922 1921 bp_mapout(bp);
1923 1922
1924 1923 if (bp->b_flags & B_READ)
1925 1924 pvn_read_done(bp->b_pages, bp->b_flags);
1926 1925 else
1927 1926 pvn_write_done(bp->b_pages, B_WRITE | bp->b_flags);
1928 1927 pageio_done(bp);
1929 1928 } else {
1930 1929 ASSERT(bp->b_flags & B_REMAPPED);
1931 1930 bp_mapout(bp);
1932 1931 brelse(bp);
1933 1932 }
1934 1933 }
1935 1934
1936 1935 /*
1937 1936 * bioerror(9F) - indicate error in buffer header
1938 1937 * If 'error' is zero, remove the error indication.
1939 1938 */
1940 1939 void
1941 1940 bioerror(struct buf *bp, int error)
1942 1941 {
1943 1942 ASSERT(bp != NULL);
1944 1943 ASSERT(error >= 0);
1945 1944 ASSERT(SEMA_HELD(&bp->b_sem));
1946 1945
1947 1946 if (error != 0) {
1948 1947 bp->b_flags |= B_ERROR;
1949 1948 } else {
1950 1949 bp->b_flags &= ~B_ERROR;
1951 1950 }
1952 1951 bp->b_error = error;
1953 1952 }
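
A minimal usage sketch for bioerror(9F): a block driver's strategy routine rejects an out-of-range request by flagging the error and completing the buffer. The xxstrategy() routine and the xx_nblocks limit are hypothetical.

static int
xxstrategy(struct buf *bp)
{
	if (bp->b_blkno < 0 || bp->b_blkno >= xx_nblocks) {
		bioerror(bp, ENXIO);		/* sets B_ERROR and b_error */
		bp->b_resid = bp->b_bcount;	/* nothing was transferred */
		biodone(bp);
		return (0);
	}

	/* ... otherwise queue the request to the device ... */
	return (0);
}
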
1954 1953
1955 1954 /*
1956 1955 * bioreset(9F) - reuse a private buffer header after I/O is complete
1957 1956 */
1958 1957 void
1959 1958 bioreset(struct buf *bp)
1960 1959 {
1961 1960 ASSERT(bp != NULL);
1962 1961
1963 1962 biofini(bp);
1964 1963 bioinit(bp);
1965 1964 }
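
bioreset(9F) is intended for a driver-private buffer header that is reused across a series of transfers. A sketch, assuming the header comes from getrbuf(9F) and that the hypothetical xx_start_io() eventually results in biodone():

	struct buf *bp = getrbuf(KM_SLEEP);
	int i;

	for (i = 0; i < nchunks; i++) {		/* nchunks is hypothetical */
		bioreset(bp);			/* scrub state from the previous I/O */
		/* ... fill in b_edev, b_blkno, b_bcount, b_un.b_addr, b_flags ... */
		xx_start_io(bp);
		(void) biowait(bp);
	}
	freerbuf(bp);
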
1966 1965
1967 1966 /*
1968 1967 * biosize(9F) - return size of a buffer header
1969 1968 */
1970 1969 size_t
1971 1970 biosize(void)
1972 1971 {
1973 1972 return (sizeof (struct buf));
1974 1973 }
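
biosize(9F) lets a driver size its own buf allocations without compiling in sizeof (struct buf), so the driver keeps working if the structure grows in a later release. A minimal sketch pairing it with kmem_alloc(9F) and bioinit(9F)/biofini(9F):

	struct buf *bp;

	bp = kmem_alloc(biosize(), KM_SLEEP);
	bioinit(bp);
	/* ... use bp for I/O ... */
	biofini(bp);
	kmem_free(bp, biosize());
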
1975 1974
1976 1975 /*
1977 1976 * biomodified(9F) - check if buffer is modified
1978 1977 */
1979 1978 int
1980 1979 biomodified(struct buf *bp)
1981 1980 {
1982 1981 int npf;
1983 1982 int ppattr;
1984 1983 struct page *pp;
1985 1984
1986 1985 ASSERT(bp != NULL);
1987 1986
1988 1987 if ((bp->b_flags & B_PAGEIO) == 0) {
1989 1988 return (-1);
1990 1989 }
1991 1990 pp = bp->b_pages;
1992 1991 npf = btopr(bp->b_bcount + ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET));
1993 1992
1994 1993 while (npf > 0) {
1995 1994 ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
1996 1995 HAT_SYNC_STOPON_MOD);
1997 1996 if (ppattr & P_MOD)
1998 1997 return (1);
1999 1998 pp = pp->p_next;
2000 1999 npf--;
2001 2000 }
2002 2001
2003 2002 return (0);
2004 2003 }
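
biomodified(9F) is only meaningful for a paged-I/O (B_PAGEIO) buffer: it returns 1 if any underlying page has been modified, 0 if none have, and -1 otherwise. A sketch of handling the three cases; xx_handle_dirty() is hypothetical.

	switch (biomodified(bp)) {
	case 1:
		xx_handle_dirty(bp);	/* at least one underlying page is dirty */
		break;
	case 0:
		break;			/* all underlying pages are clean */
	default:
		break;			/* -1: not a B_PAGEIO buffer */
	}
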
2005 2004
2006 2005 /*
2007 2006 * bioinit(9F) - initialize a buffer structure
2008 2007 */
2009 2008 void
2010 2009 bioinit(struct buf *bp)
2011 2010 {
2012 2011 bzero(bp, sizeof (struct buf));
2013 2012 sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
2014 2013 sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
2015 2014 bp->b_offset = -1;
2016 2015 }
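
bioinit(9F) is for buf structures the driver provides itself, for example one embedded in a per-instance soft-state structure (buffers obtained with getrbuf(9F) arrive already initialized). The xx_state structure and xx_attach() routine below are hypothetical.

struct xx_state {
	struct buf	xs_buf;		/* private buf embedded in soft state */
	/* ... other per-instance data ... */
};

static int
xx_attach(struct xx_state *xsp)
{
	bioinit(&xsp->xs_buf);		/* zeroes the buf, sets up b_sem and b_io */
	return (0);
}
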
2017 2016
2018 2017 /*
2019 2018 * biofini(9F) - uninitialize a buffer structure
2020 2019 */
2021 2020 void
2022 2021 biofini(struct buf *bp)
2023 2022 {
2024 2023 sema_destroy(&bp->b_io);
2025 2024 sema_destroy(&bp->b_sem);
2026 2025 }
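
biofini(9F) is the teardown counterpart: it must run before the memory holding a bioinit()ed buf is reused or freed, so the embedded semaphores are destroyed. Continuing the hypothetical xx_state example above:

static void
xx_detach(struct xx_state *xsp)
{
	biofini(&xsp->xs_buf);	/* destroy b_sem and b_io before the memory goes away */
}
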
2027 2026
2028 2027 /*
2029 2028 * bioclone(9F) - clone a buffer
2030 2029 */
2031 2030 struct buf *
2032 2031 bioclone(struct buf *bp, off_t off, size_t len, dev_t dev, daddr_t blkno,
2033 2032 int (*iodone)(struct buf *), struct buf *bp_mem, int sleep)
2034 2033 {
2035 2034 struct buf *bufp;
2036 2035
2037 2036 ASSERT(bp);
2038 2037 if (bp_mem == NULL) {
2039 2038 bufp = kmem_alloc(sizeof (struct buf), sleep);
2040 2039 if (bufp == NULL) {
2041 2040 return (NULL);
2042 2041 }
2043 2042 bioinit(bufp);
2044 2043 } else {
2045 2044 bufp = bp_mem;
2046 2045 bioreset(bufp);
2047 2046 }
2048 2047
2049 2048 #define BUF_CLONE_FLAGS (B_READ|B_WRITE|B_SHADOW|B_PHYS|B_PAGEIO|B_FAILFAST|\
2050 2049 B_ABRWRITE)
2051 2050
2052 2051 /*
2053 2052 * The cloned buffer does not inherit the B_REMAPPED flag.
2054 2053 */
2055 2054 bufp->b_flags = (bp->b_flags & BUF_CLONE_FLAGS) | B_BUSY;
2056 2055 bufp->b_bcount = len;
2057 2056 bufp->b_blkno = blkno;
2058 2057 bufp->b_iodone = iodone;
2059 2058 bufp->b_proc = bp->b_proc;
2060 2059 bufp->b_edev = dev;
2061 2060 bufp->b_file = bp->b_file;
2062 2061 bufp->b_offset = bp->b_offset;
2063 2062
2064 2063 if (bp->b_flags & B_SHADOW) {
2065 2064 ASSERT(bp->b_shadow);
2066 2065 ASSERT(bp->b_flags & B_PHYS);
2067 2066
2068 2067 bufp->b_shadow = bp->b_shadow +
2069 2068 btop(((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off);
2070 2069 bufp->b_un.b_addr = (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
2071 2070 if (bp->b_flags & B_REMAPPED)
2072 2071 bufp->b_proc = NULL;
2073 2072 } else {
2074 2073 if (bp->b_flags & B_PAGEIO) {
2075 2074 struct page *pp;
2076 2075 off_t o;
2077 2076 int i;
2078 2077
2079 2078 pp = bp->b_pages;
2080 2079 o = ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off;
2081 2080 for (i = btop(o); i > 0; i--) {
2082 2081 pp = pp->p_next;
2083 2082 }
2084 2083 bufp->b_pages = pp;
2085 2084 bufp->b_un.b_addr = (caddr_t)(o & PAGEOFFSET);
2086 2085 } else {
2087 2086 bufp->b_un.b_addr =
2088 2087 (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
2089 2088 if (bp->b_flags & B_REMAPPED)
2090 2089 bufp->b_proc = NULL;
2091 2090 }
2092 2091 }
2093 2092 return (bufp);
2094 2093 }
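
A typical bioclone(9F) caller is a layered or striping-style driver that carves one parent request into child requests that share the parent's data pages or address. The sketch below clones the first half of a parent buffer onto a child device; xx_child_done() and the geometry are hypothetical, and passing NULL for bp_mem asks bioclone() to allocate the clone's header itself.

static int xx_child_done(struct buf *cbp);	/* completion callback, hypothetical */

static void
xx_split(struct buf *pbp, dev_t child_dev, daddr_t child_blkno)
{
	struct buf *cbp;
	size_t half = pbp->b_bcount / 2;

	/* Clone bytes [0, half) of the parent onto the child device. */
	cbp = bioclone(pbp, 0, half, child_dev, child_blkno,
	    xx_child_done, NULL, KM_SLEEP);

	(void) bdev_strategy(cbp);	/* the clone inherits the parent's pages/address */
}
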
↓ open down ↓ |
701 lines elided |
↑ open up ↑ |