/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 * Copyright 2019 Joyent, Inc.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/kmem.h>
#include <sys/uio.h>
#include <sys/dnlc.h>
#include <sys/conf.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/filio.h>
#include <sys/atomic.h>

#include <sys/fssnap_if.h>
#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_lockfs.h>
#include <sys/fs/ufs_filio.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fsdir.h>
#include <sys/fs/ufs_quota.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_panic.h>
#include <sys/dirent.h>		/* must be AFTER <sys/fs/fsdir.h>! */
#include <sys/errno.h>

#include <sys/filio.h>		/* _FIOIO */

#include <vm/hat.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_vn.h>
#include <vm/seg_kmem.h>
#include <vm/rm.h>
#include <sys/swap.h>
#include <sys/epm.h>

#include <fs/fs_subr.h>

static void	*ufs_directio_zero_buf;
static int	ufs_directio_zero_len = 8192;

int	ufs_directio_enabled = 1;	/* feature is enabled */

/*
 * for kstats reader
 */
struct ufs_directio_kstats {
	kstat_named_t	logical_reads;
	kstat_named_t	phys_reads;
	kstat_named_t	hole_reads;
	kstat_named_t	nread;
	kstat_named_t	logical_writes;
	kstat_named_t	phys_writes;
	kstat_named_t	nwritten;
	kstat_named_t	nflushes;
} ufs_directio_kstats = {
	{ "logical_reads",	KSTAT_DATA_UINT64 },
	{ "phys_reads",		KSTAT_DATA_UINT64 },
	{ "hole_reads",		KSTAT_DATA_UINT64 },
	{ "nread",		KSTAT_DATA_UINT64 },
	{ "logical_writes",	KSTAT_DATA_UINT64 },
	{ "phys_writes",	KSTAT_DATA_UINT64 },
	{ "nwritten",		KSTAT_DATA_UINT64 },
	{ "nflushes",		KSTAT_DATA_UINT64 },
};

kstat_t	*ufs_directio_kstatsp;

/*
 * use kmem_cache_create for direct-physio buffers. This has shown
 * a better cache distribution compared to buffers on the
 * stack. It also avoids semaphore construction/deconstruction
 * per request
 */
struct directio_buf {
	struct directio_buf	*next;
	char		*addr;
	size_t		nbytes;
	struct buf	buf;
};
static struct kmem_cache *directio_buf_cache;


/* ARGSUSED */
static int
directio_buf_constructor(void *dbp, void *cdrarg, int kmflags)
{
	bioinit((struct buf *)&((struct directio_buf *)dbp)->buf);
	return (0);
}

/* ARGSUSED */
static void
directio_buf_destructor(void *dbp, void *cdrarg)
{
	biofini((struct buf *)&((struct directio_buf *)dbp)->buf);
}

void
directio_bufs_init(void)
{
	directio_buf_cache = kmem_cache_create("directio_buf_cache",
	    sizeof (struct directio_buf), 0,
	    directio_buf_constructor, directio_buf_destructor,
	    NULL, NULL, NULL, 0);
}

void
ufs_directio_init(void)
{
	/*
	 * kstats
	 */
	ufs_directio_kstatsp = kstat_create("ufs", 0,
	    "directio", "ufs", KSTAT_TYPE_NAMED,
	    sizeof (ufs_directio_kstats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
	if (ufs_directio_kstatsp) {
		ufs_directio_kstatsp->ks_data = (void *)&ufs_directio_kstats;
		kstat_install(ufs_directio_kstatsp);
	}
	/*
	 * kzero is broken so we have to use a private buf of zeroes
	 */
	ufs_directio_zero_buf = kmem_zalloc(ufs_directio_zero_len, KM_SLEEP);
	directio_bufs_init();
}

/*
 * Wait for the first direct IO operation to finish
 */
static int
directio_wait_one(struct directio_buf *dbp, long *bytes_iop)
{
	buf_t	*bp;
	int	error;

	/*
	 * Wait for IO to finish
	 */
	bp = &dbp->buf;
	error = biowait(bp);

	/*
	 * bytes_io will be used to figure out a resid
	 * for the caller. The resid is approximated by reporting
	 * the bytes following the first failed IO as the residual.
	 *
	 * I am cautious about using b_resid because I
	 * am not sure how well the disk drivers maintain it.
	 */
	if (error)
		if (bp->b_resid)
			*bytes_iop = bp->b_bcount - bp->b_resid;
		else
			*bytes_iop = 0;
	else
		*bytes_iop += bp->b_bcount;
	/*
	 * Release direct IO resources
	 */
	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
	kmem_cache_free(directio_buf_cache, dbp);
	return (error);
}

/*
 * Wait for all of the direct IO operations to finish
 */

static int
directio_wait(struct directio_buf *tail, long *bytes_iop)
{
	int	error = 0, newerror;
	struct directio_buf	*dbp;

	/*
	 * The linked list of directio buf structures is maintained
	 * in reverse order (tail->last request->penultimate request->...)
	 */
	while ((dbp = tail) != NULL) {
		tail = dbp->next;
		newerror = directio_wait_one(dbp, bytes_iop);
		if (error == 0)
			error = newerror;
	}
	return (error);
}
/*
 * Initiate direct IO request
 */
static void
directio_start(struct ufsvfs *ufsvfsp, struct inode *ip, size_t nbytes,
    offset_t offset, char *addr, enum seg_rw rw, struct proc *procp,
    struct directio_buf **tailp, page_t **pplist)
{
	buf_t *bp;
	struct directio_buf *dbp;

	/*
	 * Allocate a directio buf header
	 * Note - list is maintained in reverse order.
	 * directio_wait_one() depends on this fact when
	 * adjusting the ``bytes_io'' param. bytes_io
	 * is used to compute a residual in the case of error.
	 */
	dbp = kmem_cache_alloc(directio_buf_cache, KM_SLEEP);
	dbp->next = *tailp;
	*tailp = dbp;

	/*
	 * Initialize buf header
	 */
	dbp->addr = addr;
	dbp->nbytes = nbytes;
	bp = &dbp->buf;
	bp->b_edev = ip->i_dev;
	bp->b_lblkno = btodt(offset);
	bp->b_bcount = nbytes;
	bp->b_un.b_addr = addr;
	bp->b_proc = procp;
	bp->b_file = ip->i_vnode;

	/*
	 * Note that S_WRITE implies B_READ and vice versa: a read(2)
	 * will B_READ data from the filesystem and S_WRITE it into
	 * the user's buffer; a write(2) will S_READ data from the
	 * user's buffer and B_WRITE it to the filesystem.
	 */
	if (rw == S_WRITE) {
		bp->b_flags = B_BUSY | B_PHYS | B_READ;
		ufs_directio_kstats.phys_reads.value.ui64++;
		ufs_directio_kstats.nread.value.ui64 += nbytes;
	} else {
		bp->b_flags = B_BUSY | B_PHYS | B_WRITE;
		ufs_directio_kstats.phys_writes.value.ui64++;
		ufs_directio_kstats.nwritten.value.ui64 += nbytes;
	}
	bp->b_shadow = pplist;
	if (pplist != NULL)
		bp->b_flags |= B_SHADOW;

	/*
	 * Issue I/O request.
	 */
	ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
	if (ufsvfsp->vfs_snapshot)
		fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
	else
		(void) bdev_strategy(bp);

	if (rw == S_WRITE)
		lwp_stat_update(LWP_STAT_OUBLK, 1);
	else
		lwp_stat_update(LWP_STAT_INBLK, 1);

}

uint32_t	ufs_shared_writes;	/* writes done w/ lock shared */
uint32_t	ufs_cur_writes;		/* # concurrent writes */
uint32_t	ufs_maxcur_writes;	/* high water concurrent writes */
uint32_t	ufs_posix_hits;		/* writes done w/ lock excl. */

/*
 * Force POSIX synchronous data integrity on all writes for testing.
 */
uint32_t	ufs_force_posix_sdi = 0;

/*
 * Direct Write
 */

int
ufs_directio_write(struct inode *ip, uio_t *arg_uio, int ioflag, int rewrite,
    cred_t *cr, int *statusp)
{
	long		resid, bytes_written;
	u_offset_t	size, uoff;
	uio_t		*uio = arg_uio;
	rlim64_t	limit = uio->uio_llimit;
	int		on, n, error, newerror, len, has_holes;
	daddr_t		bn;
	size_t		nbytes;
	struct fs	*fs;
	vnode_t		*vp;
	iovec_t		*iov;
	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
	struct proc	*procp;
	struct as	*as;
	struct directio_buf	*tail;
	int		exclusive, ncur, bmap_peek;
	uio_t		copy_uio;
	iovec_t		copy_iov;
	char		*copy_base;
	long		copy_resid;

	/*
	 * assume that directio isn't possible (normal case)
	 */
	*statusp = DIRECTIO_FAILURE;

	/*
	 * Don't go direct
	 */
	if (ufs_directio_enabled == 0)
		return (0);

	/*
	 * mapped file; nevermind
	 */
	if (ip->i_mapcnt)
		return (0);

	/*
	 * CAN WE DO DIRECT IO?
	 */
	uoff = uio->uio_loffset;
	resid = uio->uio_resid;

	/*
	 * beyond limit
	 */
	if (uoff + resid > limit)
		return (0);

	/*
	 * must be sector aligned
	 */
	if ((uoff & (u_offset_t)(DEV_BSIZE - 1)) || (resid & (DEV_BSIZE - 1)))
		return (0);

	/*
	 * SHOULD WE DO DIRECT IO?
	 */
	size = ip->i_size;
	has_holes = -1;

	/*
	 * only on regular files; no metadata
	 */
	if (((ip->i_mode & IFMT) != IFREG) || ip->i_ufsvfs->vfs_qinod == ip)
		return (0);

	/*
	 * Synchronous, allocating writes run very slow in Direct-Mode
	 *	XXX - can be fixed with bmap_write changes for large writes!!!
	 *	XXX - can be fixed for updates to "almost-full" files
	 *	XXX - WARNING - system hangs if bmap_write() has to
	 *			allocate lots of pages since pageout
	 *			suspends on locked inode
	 */
	if (!rewrite && (ip->i_flag & ISYNC)) {
		if ((uoff + resid) > size)
			return (0);
		has_holes = bmap_has_holes(ip);
		if (has_holes)
			return (0);
	}

	/*
	 * Each iovec must be short aligned and sector aligned. If
	 * one is not, then kmem_alloc a new buffer and copy all of
	 * the smaller buffers into the new buffer. This new
	 * buffer will be short aligned and sector aligned.
	 */
	iov = uio->uio_iov;
	nbytes = uio->uio_iovcnt;
	while (nbytes--) {
		if (((uint_t)iov->iov_len & (DEV_BSIZE - 1)) != 0 ||
		    (intptr_t)(iov->iov_base) & 1) {
			copy_resid = uio->uio_resid;
			copy_base = kmem_alloc(copy_resid, KM_NOSLEEP);
			if (copy_base == NULL)
				return (0);
			copy_iov.iov_base = copy_base;
			copy_iov.iov_len = copy_resid;
			copy_uio.uio_iov = &copy_iov;
			copy_uio.uio_iovcnt = 1;
			copy_uio.uio_segflg = UIO_SYSSPACE;
			copy_uio.uio_extflg = UIO_COPY_DEFAULT;
			copy_uio.uio_loffset = uio->uio_loffset;
			copy_uio.uio_resid = uio->uio_resid;
			copy_uio.uio_llimit = uio->uio_llimit;
			error = uiomove(copy_base, copy_resid, UIO_WRITE, uio);
			if (error) {
				kmem_free(copy_base, copy_resid);
				return (0);
			}
			uio = &copy_uio;
			break;
		}
		iov++;
	}

	/*
	 * From here on down, all error exits must go to errout and
	 * not simply return a 0.
	 */

	/*
	 * DIRECTIO
	 */

	fs = ip->i_fs;

	/*
	 * POSIX check. If attempting a concurrent re-write, make sure
	 * that this will be a single request to the driver to meet
	 * POSIX synchronous data integrity requirements.
	 */
	bmap_peek = 0;
	if (rewrite && ((ioflag & FDSYNC) || ufs_force_posix_sdi)) {
		int upgrade = 0;

		/* check easy conditions first */
		if (uio->uio_iovcnt != 1 || resid > ufsvfsp->vfs_ioclustsz) {
			upgrade = 1;
		} else {
			/* now look for contiguous allocation */
			len = (ssize_t)blkroundup(fs, resid);
			error = bmap_read(ip, uoff, &bn, &len);
			if (error || bn == UFS_HOLE || len == 0)
				goto errout;
			/* save a call to bmap_read later */
			bmap_peek = 1;
			if (len < resid)
				upgrade = 1;
		}
		if (upgrade) {
			rw_exit(&ip->i_contents);
			rw_enter(&ip->i_contents, RW_WRITER);
			ufs_posix_hits++;
		}
	}


	/*
	 * allocate space
	 */

	/*
	 * If attempting a re-write, there is no allocation to do.
	 * bmap_write would trip an ASSERT if i_contents is held shared.
	 */
	if (rewrite)
		goto skip_alloc;

	do {
		on = (int)blkoff(fs, uoff);
		n = (int)MIN(fs->fs_bsize - on, resid);
		if ((uoff + n) > ip->i_size) {
			error = bmap_write(ip, uoff, (int)(on + n),
			    (int)(uoff & (offset_t)MAXBOFFSET) == 0,
			    NULL, cr);
			/* Caller is responsible for updating i_seq if needed */
			if (error)
				break;
			ip->i_size = uoff + n;
			ip->i_flag |= IATTCHG;
		} else if (n == MAXBSIZE) {
			error = bmap_write(ip, uoff, (int)(on + n),
			    BI_ALLOC_ONLY, NULL, cr);
			/* Caller is responsible for updating i_seq if needed */
		} else {
			if (has_holes < 0)
				has_holes = bmap_has_holes(ip);
			if (has_holes) {
				uint_t	blk_size;
				u_offset_t offset;

				offset = uoff & (offset_t)fs->fs_bmask;
				blk_size = (int)blksize(fs, ip,
				    (daddr_t)lblkno(fs, offset));
				error = bmap_write(ip, uoff, blk_size,
				    BI_NORMAL, NULL, cr);
				/*
				 * Caller is responsible for updating
				 * i_seq if needed
				 */
			} else
				error = 0;
		}
		if (error)
			break;
		uoff += n;
		resid -= n;
		/*
		 * if file has grown larger than 2GB, set flag
		 * in superblock if not already set
		 */
		if ((ip->i_size > MAXOFF32_T) &&
		    !(fs->fs_flags & FSLARGEFILES)) {
			ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
			mutex_enter(&ufsvfsp->vfs_lock);
			fs->fs_flags |= FSLARGEFILES;
			ufs_sbwrite(ufsvfsp);
			mutex_exit(&ufsvfsp->vfs_lock);
		}
	} while (resid);

	if (error) {
		/*
		 * restore original state
		 */
		if (resid) {
			if (size == ip->i_size)
				goto errout;
			(void) ufs_itrunc(ip, size, 0, cr);
		}
		/*
		 * try non-directio path
		 */
		goto errout;
	}
skip_alloc:

	/*
	 * get rid of cached pages
	 */
	vp = ITOV(ip);
	exclusive = rw_write_held(&ip->i_contents);
	if (vn_has_cached_data(vp)) {
		if (!exclusive) {
			/*
			 * Still holding i_rwlock, so no allocations
			 * can happen after dropping contents.
			 */
			rw_exit(&ip->i_contents);
			rw_enter(&ip->i_contents, RW_WRITER);
		}
		(void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
		    B_INVAL, cr, NULL);
		if (vn_has_cached_data(vp))
			goto errout;
		if (!exclusive)
			rw_downgrade(&ip->i_contents);
		ufs_directio_kstats.nflushes.value.ui64++;
	}

	/*
	 * Direct Writes
	 */

	if (!exclusive) {
		ufs_shared_writes++;
		ncur = atomic_inc_32_nv(&ufs_cur_writes);
		if (ncur > ufs_maxcur_writes)
			ufs_maxcur_writes = ncur;
	}

	/*
	 * proc and as are for VM operations in directio_start()
	 */
	if (uio->uio_segflg == UIO_USERSPACE) {
		procp = ttoproc(curthread);
		as = procp->p_as;
	} else {
		procp = NULL;
		as = &kas;
	}
	*statusp = DIRECTIO_SUCCESS;
	error = 0;
	newerror = 0;
	resid = uio->uio_resid;
	bytes_written = 0;
	ufs_directio_kstats.logical_writes.value.ui64++;
	while (error == 0 && newerror == 0 && resid && uio->uio_iovcnt) {
		size_t pglck_len, pglck_size;
		caddr_t pglck_base;
		page_t **pplist, **spplist;

		tail = NULL;

		/*
		 * Adjust number of bytes
		 */
		iov = uio->uio_iov;
		pglck_len = (size_t)MIN(iov->iov_len, resid);
		pglck_base = iov->iov_base;
		if (pglck_len == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}

		/*
		 * Try to lock down the largest chunk of pages possible.
		 */
		pglck_len = (size_t)MIN(pglck_len, ufsvfsp->vfs_ioclustsz);
		error = as_pagelock(as, &pplist, pglck_base, pglck_len, S_READ);

		if (error)
			break;

		pglck_size = pglck_len;
		while (pglck_len) {

			nbytes = pglck_len;
			uoff = uio->uio_loffset;

			if (!bmap_peek) {

				/*
				 * Re-adjust number of bytes to contiguous
				 * range. May have already called bmap_read
				 * in the case of a concurrent rewrite.
				 */
				len = (ssize_t)blkroundup(fs, nbytes);
				error = bmap_read(ip, uoff, &bn, &len);
				if (error)
					break;
				if (bn == UFS_HOLE || len == 0)
					break;
			}
			nbytes = (size_t)MIN(nbytes, len);
			bmap_peek = 0;

			/*
			 * Get the pagelist pointer for this offset to be
			 * passed to directio_start.
			 */

			if (pplist != NULL)
				spplist = pplist +
				    btop((uintptr_t)iov->iov_base -
				    ((uintptr_t)pglck_base & PAGEMASK));
			else
				spplist = NULL;

			/*
			 * Kick off the direct write requests
			 */
			directio_start(ufsvfsp, ip, nbytes, ldbtob(bn),
			    iov->iov_base, S_READ, procp, &tail, spplist);

			/*
			 * Adjust pointers and counters
			 */
			iov->iov_len -= nbytes;
			iov->iov_base += nbytes;
			uio->uio_loffset += nbytes;
			resid -= nbytes;
			pglck_len -= nbytes;
		}

		/*
		 * Wait for outstanding requests
		 */
		newerror = directio_wait(tail, &bytes_written);

		/*
		 * Release VM resources
		 */
		as_pageunlock(as, pplist, pglck_base, pglck_size, S_READ);

	}

	if (!exclusive) {
		atomic_dec_32(&ufs_cur_writes);
		/*
		 * If this write was done shared, readers may
		 * have pulled in unmodified pages. Get rid of
		 * these potentially stale pages.
		 */
		if (vn_has_cached_data(vp)) {
			rw_exit(&ip->i_contents);
			rw_enter(&ip->i_contents, RW_WRITER);
			(void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
			    B_INVAL, cr, NULL);
			ufs_directio_kstats.nflushes.value.ui64++;
			rw_downgrade(&ip->i_contents);
		}
	}

	/*
	 * If error, adjust resid to begin at the first
	 * un-writable byte.
	 */
	if (error == 0)
		error = newerror;
	if (error)
		resid = uio->uio_resid - bytes_written;
	arg_uio->uio_resid = resid;

	if (!rewrite) {
		ip->i_flag |= IUPD | ICHG;
		/* Caller will update i_seq */
		TRANS_INODE(ip->i_ufsvfs, ip);
	}
	/*
	 * If there is a residual, adjust the EOF if necessary
	 */
	if (resid) {
		if (size != ip->i_size) {
			if (uio->uio_loffset > size)
				size = uio->uio_loffset;
			(void) ufs_itrunc(ip, size, 0, cr);
		}
	}

	if (uio == &copy_uio)
		kmem_free(copy_base, copy_resid);

	return (error);

errout:
	if (uio == &copy_uio)
		kmem_free(copy_base, copy_resid);

	return (0);
}
/*
 * Direct read of a hole
 */
static int
directio_hole(struct uio *uio, size_t nbytes)
{
	int		error = 0, nzero;
	uio_t		phys_uio;
	iovec_t		phys_iov;

	ufs_directio_kstats.hole_reads.value.ui64++;
	ufs_directio_kstats.nread.value.ui64 += nbytes;

	phys_iov.iov_base = uio->uio_iov->iov_base;
	phys_iov.iov_len = nbytes;

	phys_uio.uio_iov = &phys_iov;
	phys_uio.uio_iovcnt = 1;
	phys_uio.uio_resid = phys_iov.iov_len;
	phys_uio.uio_segflg = uio->uio_segflg;
	phys_uio.uio_extflg = uio->uio_extflg;
	while (error == 0 && phys_uio.uio_resid) {
		nzero = (int)MIN(phys_iov.iov_len, ufs_directio_zero_len);
		error = uiomove(ufs_directio_zero_buf, nzero, UIO_READ,
		    &phys_uio);
	}
	return (error);
}

/*
 * Direct Read
 */
int
ufs_directio_read(struct inode *ip, uio_t *uio, cred_t *cr, int *statusp)
{
	ssize_t		resid, bytes_read;
	u_offset_t	size, uoff;
	int		error, newerror, len;
	size_t		nbytes;
	struct fs	*fs;
	vnode_t		*vp;
	daddr_t		bn;
	iovec_t		*iov;
	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
	struct proc	*procp;
	struct as	*as;
	struct directio_buf	*tail;

	/*
	 * assume that directio isn't possible (normal case)
	 */
	*statusp = DIRECTIO_FAILURE;

	/*
	 * Don't go direct
	 */
	if (ufs_directio_enabled == 0)
		return (0);

	/*
	 * mapped file; nevermind
	 */
	if (ip->i_mapcnt)
		return (0);

	/*
	 * CAN WE DO DIRECT IO?
	 */
	/*
	 * must be sector aligned
	 */
	uoff = uio->uio_loffset;
	resid = uio->uio_resid;
	if ((uoff & (u_offset_t)(DEV_BSIZE - 1)) || (resid & (DEV_BSIZE - 1)))
		return (0);
	/*
	 * must be short aligned and sector aligned
	 */
	iov = uio->uio_iov;
	nbytes = uio->uio_iovcnt;
	while (nbytes--) {
		if (((size_t)iov->iov_len & (DEV_BSIZE - 1)) != 0)
			return (0);
		if ((intptr_t)(iov++->iov_base) & 1)
			return (0);
	}

	/*
	 * DIRECTIO
	 */
	fs = ip->i_fs;

	/*
	 * don't read past EOF
	 */
	size = ip->i_size;

	/*
	 * The file offset is past EOF so bail out here; we don't want
	 * to update uio_resid and make it look like we read something.
	 * We say that direct I/O was a success to avoid having rdip()
	 * go through the same "read past EOF logic".
	 */
	if (uoff >= size) {
		*statusp = DIRECTIO_SUCCESS;
		return (0);
	}

	/*
	 * The read would extend past EOF so make it smaller.
	 */
	if ((uoff + resid) > size) {
		resid = size - uoff;
		/*
		 * recheck sector alignment
		 */
		if (resid & (DEV_BSIZE - 1))
			return (0);
	}

	/*
	 * At this point, we know there is some real work to do.
	 */
	ASSERT(resid);

	/*
	 * get rid of cached pages
	 */
	vp = ITOV(ip);
	if (vn_has_cached_data(vp)) {
		rw_exit(&ip->i_contents);
		rw_enter(&ip->i_contents, RW_WRITER);
		(void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
		    B_INVAL, cr, NULL);
		if (vn_has_cached_data(vp))
			return (0);
		rw_downgrade(&ip->i_contents);
		ufs_directio_kstats.nflushes.value.ui64++;
	}
	/*
	 * Direct Reads
	 */

	/*
	 * proc and as are for VM operations in directio_start()
	 */
	if (uio->uio_segflg == UIO_USERSPACE) {
		procp = ttoproc(curthread);
		as = procp->p_as;
	} else {
		procp = NULL;
		as = &kas;
	}

	*statusp = DIRECTIO_SUCCESS;
	error = 0;
	newerror = 0;
	bytes_read = 0;
	ufs_directio_kstats.logical_reads.value.ui64++;
	while (error == 0 && newerror == 0 && resid && uio->uio_iovcnt) {
		size_t pglck_len, pglck_size;
		caddr_t pglck_base;
		page_t **pplist, **spplist;

		tail = NULL;

		/*
		 * Adjust number of bytes
		 */
		iov = uio->uio_iov;
		pglck_len = (size_t)MIN(iov->iov_len, resid);
		pglck_base = iov->iov_base;
		if (pglck_len == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}

		/*
		 * Try to lock down the largest chunk of pages possible.
		 */
		pglck_len = (size_t)MIN(pglck_len, ufsvfsp->vfs_ioclustsz);
		error = as_pagelock(as, &pplist, pglck_base,
		    pglck_len, S_WRITE);

		if (error)
			break;

		pglck_size = pglck_len;
		while (pglck_len) {

			nbytes = pglck_len;
			uoff = uio->uio_loffset;

			/*
			 * Re-adjust number of bytes to contiguous range
			 */
			len = (ssize_t)blkroundup(fs, nbytes);
			error = bmap_read(ip, uoff, &bn, &len);
			if (error)
				break;

			if (bn == UFS_HOLE) {
				nbytes = (size_t)MIN(fs->fs_bsize -
				    (long)blkoff(fs, uoff), nbytes);
				error = directio_hole(uio, nbytes);
				/*
				 * Hole reads are not added to the list
				 * processed by directio_wait() below so
				 * account for bytes read here.
				 */
				if (!error)
					bytes_read += nbytes;
			} else {
				nbytes = (size_t)MIN(nbytes, len);

				/*
				 * Get the pagelist pointer for this offset
				 * to be passed to directio_start.
				 */
				if (pplist != NULL)
					spplist = pplist +
					    btop((uintptr_t)iov->iov_base -
					    ((uintptr_t)pglck_base & PAGEMASK));
				else
					spplist = NULL;

				/*
				 * Kick off the direct read requests
				 */
				directio_start(ufsvfsp, ip, nbytes,
				    ldbtob(bn), iov->iov_base,
				    S_WRITE, procp, &tail, spplist);
			}

			if (error)
				break;

			/*
			 * Adjust pointers and counters
			 */
			iov->iov_len -= nbytes;
			iov->iov_base += nbytes;
			uio->uio_loffset += nbytes;
			resid -= nbytes;
			pglck_len -= nbytes;
		}

		/*
		 * Wait for outstanding requests
		 */
		newerror = directio_wait(tail, &bytes_read);
		/*
		 * Release VM resources
		 */
		as_pageunlock(as, pplist, pglck_base, pglck_size, S_WRITE);

	}

	/*
	 * If error, adjust resid to begin at the first
	 * un-read byte.
	 */
	if (error == 0)
		error = newerror;
	uio->uio_resid -= bytes_read;
	return (error);
}