11909 THREAD_KPRI_RELEASE does nothing of the sort
Reviewed by: Bryan Cantrill <bryan@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
--- old/usr/src/uts/common/fs/ufs/ufs_directio.c
+++ new/usr/src/uts/common/fs/ufs/ufs_directio.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 + * Copyright 2019 Joyent, Inc.
24 25 */
25 26
26 27 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
27 28 /* All Rights Reserved */
28 29
29 30 /*
30 31 * Portions of this source code were derived from Berkeley 4.3 BSD
31 32 * under license from the Regents of the University of California.
32 33 */
33 34
34 35 #include <sys/types.h>
35 36 #include <sys/t_lock.h>
36 37 #include <sys/param.h>
37 38 #include <sys/time.h>
38 39 #include <sys/systm.h>
39 40 #include <sys/sysmacros.h>
40 41 #include <sys/resource.h>
41 42 #include <sys/signal.h>
42 43 #include <sys/cred.h>
43 44 #include <sys/user.h>
44 45 #include <sys/buf.h>
45 46 #include <sys/vfs.h>
46 47 #include <sys/vnode.h>
47 48 #include <sys/proc.h>
48 49 #include <sys/disp.h>
49 50 #include <sys/file.h>
50 51 #include <sys/fcntl.h>
51 52 #include <sys/flock.h>
52 53 #include <sys/kmem.h>
53 54 #include <sys/uio.h>
54 55 #include <sys/dnlc.h>
55 56 #include <sys/conf.h>
56 57 #include <sys/mman.h>
57 58 #include <sys/pathname.h>
58 59 #include <sys/debug.h>
59 60 #include <sys/vmsystm.h>
60 61 #include <sys/cmn_err.h>
61 62 #include <sys/filio.h>
62 63 #include <sys/atomic.h>
63 64
64 65 #include <sys/fssnap_if.h>
65 66 #include <sys/fs/ufs_fs.h>
66 67 #include <sys/fs/ufs_lockfs.h>
67 68 #include <sys/fs/ufs_filio.h>
68 69 #include <sys/fs/ufs_inode.h>
69 70 #include <sys/fs/ufs_fsdir.h>
70 71 #include <sys/fs/ufs_quota.h>
71 72 #include <sys/fs/ufs_trans.h>
72 73 #include <sys/fs/ufs_panic.h>
73 74 #include <sys/dirent.h> /* must be AFTER <sys/fs/fsdir.h>! */
74 75 #include <sys/errno.h>
75 76
76 77 #include <sys/filio.h> /* _FIOIO */
77 78
78 79 #include <vm/hat.h>
79 80 #include <vm/page.h>
80 81 #include <vm/pvn.h>
81 82 #include <vm/as.h>
82 83 #include <vm/seg.h>
83 84 #include <vm/seg_map.h>
84 85 #include <vm/seg_vn.h>
85 86 #include <vm/seg_kmem.h>
86 87 #include <vm/rm.h>
87 88 #include <sys/swap.h>
88 89 #include <sys/epm.h>
89 90
90 91 #include <fs/fs_subr.h>
91 92
92 93 static void *ufs_directio_zero_buf;
93 94 static int ufs_directio_zero_len = 8192;
94 95
95 96 int ufs_directio_enabled = 1; /* feature is enabled */
96 97
97 98 /*
98 99 * for kstats reader
99 100 */
100 101 struct ufs_directio_kstats {
101 102 kstat_named_t logical_reads;
102 103 kstat_named_t phys_reads;
103 104 kstat_named_t hole_reads;
104 105 kstat_named_t nread;
105 106 kstat_named_t logical_writes;
106 107 kstat_named_t phys_writes;
107 108 kstat_named_t nwritten;
108 109 kstat_named_t nflushes;
109 110 } ufs_directio_kstats = {
110 111 { "logical_reads", KSTAT_DATA_UINT64 },
111 112 { "phys_reads", KSTAT_DATA_UINT64 },
112 113 { "hole_reads", KSTAT_DATA_UINT64 },
113 114 { "nread", KSTAT_DATA_UINT64 },
114 115 { "logical_writes", KSTAT_DATA_UINT64 },
115 116 { "phys_writes", KSTAT_DATA_UINT64 },
116 117 { "nwritten", KSTAT_DATA_UINT64 },
117 118 { "nflushes", KSTAT_DATA_UINT64 },
118 119 };
119 120
120 121 kstat_t *ufs_directio_kstatsp;
121 122
122 123 /*
123 124 * use kmem_cache_create for direct-physio buffers. This has shown
124 125 * a better cache distribution compared to buffers on the
125 126  * stack. It also avoids semaphore construction/destruction
126 127 * per request
127 128 */
128 129 struct directio_buf {
129 130 struct directio_buf *next;
130 131 char *addr;
131 132 size_t nbytes;
132 133 struct buf buf;
133 134 };
134 135 static struct kmem_cache *directio_buf_cache;
135 136
136 137
137 138 /* ARGSUSED */
138 139 static int
139 140 directio_buf_constructor(void *dbp, void *cdrarg, int kmflags)
140 141 {
141 142 bioinit((struct buf *)&((struct directio_buf *)dbp)->buf);
142 143 return (0);
143 144 }
144 145
145 146 /* ARGSUSED */
146 147 static void
147 148 directio_buf_destructor(void *dbp, void *cdrarg)
148 149 {
149 150 biofini((struct buf *)&((struct directio_buf *)dbp)->buf);
150 151 }
151 152
152 153 void
153 154 directio_bufs_init(void)
154 155 {
155 156 directio_buf_cache = kmem_cache_create("directio_buf_cache",
156 157 sizeof (struct directio_buf), 0,
157 158 directio_buf_constructor, directio_buf_destructor,
158 159 NULL, NULL, NULL, 0);
159 160 }
160 161
161 162 void
162 163 ufs_directio_init(void)
163 164 {
164 165 /*
165 166 * kstats
166 167 */
167 168 ufs_directio_kstatsp = kstat_create("ufs", 0,
168 169 "directio", "ufs", KSTAT_TYPE_NAMED,
169 170 sizeof (ufs_directio_kstats) / sizeof (kstat_named_t),
170 171 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
171 172 if (ufs_directio_kstatsp) {
172 173 ufs_directio_kstatsp->ks_data = (void *)&ufs_directio_kstats;
173 174 kstat_install(ufs_directio_kstatsp);
174 175 }
175 176 /*
176 177 * kzero is broken so we have to use a private buf of zeroes
177 178 */
178 179 ufs_directio_zero_buf = kmem_zalloc(ufs_directio_zero_len, KM_SLEEP);
179 180 directio_bufs_init();
180 181 }
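
An aside for readers: the named kstats installed above are visible from userland, either with the kstat(1M) utility (kstat -m ufs -n directio) or programmatically through libkstat. A minimal sketch using the standard libkstat calls, compiled with -lkstat; error handling is trimmed and the choice of counter is illustrative:

	#include <kstat.h>
	#include <stdio.h>

	int
	main(void)
	{
		kstat_ctl_t *kc = kstat_open();
		kstat_t *ksp;
		kstat_named_t *kn;

		if (kc == NULL)
			return (1);
		/* module "ufs", instance 0, name "directio", as created above */
		if ((ksp = kstat_lookup(kc, "ufs", 0, "directio")) == NULL ||
		    kstat_read(kc, ksp, NULL) == -1) {
			(void) kstat_close(kc);
			return (1);
		}
		if ((kn = kstat_data_lookup(ksp, "nread")) != NULL)
			(void) printf("nread = %llu\n",
			    (unsigned long long)kn->value.ui64);
		(void) kstat_close(kc);
		return (0);
	}
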
181 182
182 183 /*
183 184 * Wait for the first direct IO operation to finish
184 185 */
185 186 static int
186 187 directio_wait_one(struct directio_buf *dbp, long *bytes_iop)
187 188 {
188 189 buf_t *bp;
189 190 int error;
190 191
191 192 /*
192 193 * Wait for IO to finish
193 194 */
194 195 bp = &dbp->buf;
195 196 error = biowait(bp);
196 197
197 198 /*
198 199 * bytes_io will be used to figure out a resid
199 200 * for the caller. The resid is approximated by reporting
200 201 * the bytes following the first failed IO as the residual.
201 202 *
202 203 * I am cautious about using b_resid because I
203 204 * am not sure how well the disk drivers maintain it.
204 205 */
205 206 if (error)
206 207 if (bp->b_resid)
207 208 *bytes_iop = bp->b_bcount - bp->b_resid;
208 209 else
209 210 *bytes_iop = 0;
210 211 else
211 212 *bytes_iop += bp->b_bcount;
212 213 /*
213 214 * Release direct IO resources
214 215 */
215 216 bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
216 217 kmem_cache_free(directio_buf_cache, dbp);
217 218 return (error);
218 219 }
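
A concrete illustration of the approximation: suppose three 8192-byte requests were issued and the middle one failed with b_resid == 6144. The list is walked from the last request issued back to the first, so the walk first adds the last request's 8192 bytes, then the failing request overwrites the running count with its 8192 - 6144 == 2048 transferred bytes (discarding the later contribution), and finally the first request adds its full 8192 bytes, leaving *bytes_iop == 10240. That is everything up to and including the partial transfer, so the caller's computed residual begins, approximately, at the first byte that was not written.
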
219 220
220 221 /*
221 222 * Wait for all of the direct IO operations to finish
222 223 */
223 224
224 -uint32_t ufs_directio_drop_kpri = 0; /* enable kpri hack */
225 -
226 225 static int
227 226 directio_wait(struct directio_buf *tail, long *bytes_iop)
228 227 {
229 228 int error = 0, newerror;
230 229 struct directio_buf *dbp;
231 - uint_t kpri_req_save;
232 230
233 231 /*
234 232 * The linked list of directio buf structures is maintained
235 233 * in reverse order (tail->last request->penultimate request->...)
236 234 */
237 - /*
238 - * This is the k_pri_req hack. Large numbers of threads
239 - * sleeping with kernel priority will cause scheduler thrashing
240 - * on an MP machine. This can be seen running Oracle using
241 - * directio to ufs files. Sleep at normal priority here to
242 - * more closely mimic physio to a device partition. This
243 - * workaround is disabled by default as a niced thread could
244 - * be starved from running while holding i_rwlock and i_contents.
245 - */
246 - if (ufs_directio_drop_kpri) {
247 - kpri_req_save = curthread->t_kpri_req;
248 - curthread->t_kpri_req = 0;
249 - }
250 235 while ((dbp = tail) != NULL) {
251 236 tail = dbp->next;
252 237 newerror = directio_wait_one(dbp, bytes_iop);
253 238 if (error == 0)
254 239 error = newerror;
255 240 }
256 - if (ufs_directio_drop_kpri)
257 - curthread->t_kpri_req = kpri_req_save;
258 241 return (error);
259 242 }
260 243 /*
261 244 * Initiate direct IO request
262 245 */
263 246 static void
264 247 directio_start(struct ufsvfs *ufsvfsp, struct inode *ip, size_t nbytes,
265 - offset_t offset, char *addr, enum seg_rw rw, struct proc *procp,
266 - struct directio_buf **tailp, page_t **pplist)
248 + offset_t offset, char *addr, enum seg_rw rw, struct proc *procp,
249 + struct directio_buf **tailp, page_t **pplist)
267 250 {
268 251 buf_t *bp;
269 252 struct directio_buf *dbp;
270 253
271 254 /*
272 255 * Allocate a directio buf header
273 256 * Note - list is maintained in reverse order.
274 257 * directio_wait_one() depends on this fact when
275 258 * adjusting the ``bytes_io'' param. bytes_io
276 259 * is used to compute a residual in the case of error.
277 260 */
278 261 dbp = kmem_cache_alloc(directio_buf_cache, KM_SLEEP);
279 262 dbp->next = *tailp;
280 263 *tailp = dbp;
281 264
282 265 /*
283 266 * Initialize buf header
284 267 */
285 268 dbp->addr = addr;
286 269 dbp->nbytes = nbytes;
287 270 bp = &dbp->buf;
288 271 bp->b_edev = ip->i_dev;
289 272 bp->b_lblkno = btodt(offset);
290 273 bp->b_bcount = nbytes;
291 274 bp->b_un.b_addr = addr;
292 275 bp->b_proc = procp;
293 276 bp->b_file = ip->i_vnode;
294 277
295 278 /*
296 279 * Note that S_WRITE implies B_READ and vice versa: a read(2)
297 280 * will B_READ data from the filesystem and S_WRITE it into
298 281 * the user's buffer; a write(2) will S_READ data from the
299 282 * user's buffer and B_WRITE it to the filesystem.
300 283 */
301 284 if (rw == S_WRITE) {
302 285 bp->b_flags = B_BUSY | B_PHYS | B_READ;
303 286 ufs_directio_kstats.phys_reads.value.ui64++;
304 287 ufs_directio_kstats.nread.value.ui64 += nbytes;
305 288 } else {
306 289 bp->b_flags = B_BUSY | B_PHYS | B_WRITE;
307 290 ufs_directio_kstats.phys_writes.value.ui64++;
308 291 ufs_directio_kstats.nwritten.value.ui64 += nbytes;
309 292 }
310 293 bp->b_shadow = pplist;
311 294 if (pplist != NULL)
312 295 bp->b_flags |= B_SHADOW;
313 296
314 297 /*
315 298 * Issue I/O request.
316 299 */
317 300 ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
318 301 if (ufsvfsp->vfs_snapshot)
319 302 fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
320 303 else
321 304 (void) bdev_strategy(bp);
322 305
323 306 if (rw == S_WRITE)
324 307 lwp_stat_update(LWP_STAT_OUBLK, 1);
325 308 else
326 309 lwp_stat_update(LWP_STAT_INBLK, 1);
327 310
328 311 }
329 312
330 313 uint32_t ufs_shared_writes; /* writes done w/ lock shared */
331 314 uint32_t ufs_cur_writes; /* # concurrent writes */
332 315 uint32_t ufs_maxcur_writes; /* high water concurrent writes */
333 316 uint32_t ufs_posix_hits; /* writes done /w lock excl. */
334 317
335 318 /*
336 319  * Force POSIX synchronous data integrity on all writes for testing.
337 320 */
338 321 uint32_t ufs_force_posix_sdi = 0;
339 322
340 323 /*
341 324 * Direct Write
342 325 */
343 326
344 327 int
345 328 ufs_directio_write(struct inode *ip, uio_t *arg_uio, int ioflag, int rewrite,
346 - cred_t *cr, int *statusp)
329 + cred_t *cr, int *statusp)
347 330 {
348 331 long resid, bytes_written;
349 332 u_offset_t size, uoff;
350 333 uio_t *uio = arg_uio;
351 334 rlim64_t limit = uio->uio_llimit;
352 335 int on, n, error, newerror, len, has_holes;
353 336 daddr_t bn;
354 337 size_t nbytes;
355 338 struct fs *fs;
356 339 vnode_t *vp;
357 340 iovec_t *iov;
358 341 struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
359 342 struct proc *procp;
360 343 struct as *as;
361 344 struct directio_buf *tail;
362 345 int exclusive, ncur, bmap_peek;
363 346 uio_t copy_uio;
364 347 iovec_t copy_iov;
365 348 char *copy_base;
366 349 long copy_resid;
367 350
368 351 /*
369 352 * assume that directio isn't possible (normal case)
370 353 */
371 354 *statusp = DIRECTIO_FAILURE;
372 355
373 356 /*
374 357 * Don't go direct
375 358 */
376 359 if (ufs_directio_enabled == 0)
377 360 return (0);
378 361
379 362 /*
380 363 * mapped file; nevermind
381 364 */
382 365 if (ip->i_mapcnt)
383 366 return (0);
384 367
385 368 /*
386 369 * CAN WE DO DIRECT IO?
387 370 */
388 371 uoff = uio->uio_loffset;
389 372 resid = uio->uio_resid;
390 373
391 374 /*
392 375 * beyond limit
393 376 */
394 377 if (uoff + resid > limit)
395 378 return (0);
396 379
397 380 /*
398 381 * must be sector aligned
399 382 */
400 383 if ((uoff & (u_offset_t)(DEV_BSIZE - 1)) || (resid & (DEV_BSIZE - 1)))
401 384 return (0);
402 385
403 386 /*
404 387 * SHOULD WE DO DIRECT IO?
405 388 */
406 389 size = ip->i_size;
407 390 has_holes = -1;
408 391
409 392 /*
410 393 * only on regular files; no metadata
411 394 */
412 395 if (((ip->i_mode & IFMT) != IFREG) || ip->i_ufsvfs->vfs_qinod == ip)
413 396 return (0);
414 397
415 398 /*
416 399 * Synchronous, allocating writes run very slow in Direct-Mode
417 - * XXX - can be fixed with bmap_write changes for large writes!!!
400 + * XXX - can be fixed with bmap_write changes for large writes!!!
418 401 * XXX - can be fixed for updates to "almost-full" files
419 402 * XXX - WARNING - system hangs if bmap_write() has to
420 - * allocate lots of pages since pageout
421 - * suspends on locked inode
403 + * allocate lots of pages since pageout
404 + * suspends on locked inode
422 405 */
423 406 if (!rewrite && (ip->i_flag & ISYNC)) {
424 407 if ((uoff + resid) > size)
425 408 return (0);
426 409 has_holes = bmap_has_holes(ip);
427 410 if (has_holes)
428 411 return (0);
429 412 }
430 413
431 414 /*
432 415 * Each iovec must be short aligned and sector aligned. If
433 416 * one is not, then kmem_alloc a new buffer and copy all of
434 417 * the smaller buffers into the new buffer. This new
435 418 * buffer will be short aligned and sector aligned.
436 419 */
437 420 iov = uio->uio_iov;
438 421 nbytes = uio->uio_iovcnt;
439 422 while (nbytes--) {
440 423 if (((uint_t)iov->iov_len & (DEV_BSIZE - 1)) != 0 ||
441 424 (intptr_t)(iov->iov_base) & 1) {
442 425 copy_resid = uio->uio_resid;
443 426 copy_base = kmem_alloc(copy_resid, KM_NOSLEEP);
444 427 if (copy_base == NULL)
445 428 return (0);
446 429 copy_iov.iov_base = copy_base;
447 430 copy_iov.iov_len = copy_resid;
448 431 copy_uio.uio_iov = &copy_iov;
449 432 copy_uio.uio_iovcnt = 1;
450 433 copy_uio.uio_segflg = UIO_SYSSPACE;
451 434 copy_uio.uio_extflg = UIO_COPY_DEFAULT;
452 435 copy_uio.uio_loffset = uio->uio_loffset;
453 436 copy_uio.uio_resid = uio->uio_resid;
454 437 copy_uio.uio_llimit = uio->uio_llimit;
455 438 error = uiomove(copy_base, copy_resid, UIO_WRITE, uio);
456 439 if (error) {
457 440 kmem_free(copy_base, copy_resid);
458 441 return (0);
459 442 }
460 443 uio = &copy_uio;
461 444 break;
462 445 }
463 446 iov++;
464 447 }
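
For example, a writev(2) supplying two 512-byte iovecs whose second base address is odd fails the short-alignment test above; the entire uio is then drained by uiomove() into a single kmem_alloc'd bounce buffer that is both short and sector aligned, and the write proceeds from that kernel copy (freed again at the bottom of this function).
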
465 448
466 449 /*
467 450 * From here on down, all error exits must go to errout and
468 451 * not simply return a 0.
469 452 */
470 453
471 454 /*
472 455 * DIRECTIO
473 456 */
474 457
475 458 fs = ip->i_fs;
476 459
477 460 /*
478 461 * POSIX check. If attempting a concurrent re-write, make sure
479 462 * that this will be a single request to the driver to meet
480 463 * POSIX synchronous data integrity requirements.
481 464 */
482 465 bmap_peek = 0;
483 466 if (rewrite && ((ioflag & FDSYNC) || ufs_force_posix_sdi)) {
484 467 int upgrade = 0;
485 468
486 469 /* check easy conditions first */
487 470 if (uio->uio_iovcnt != 1 || resid > ufsvfsp->vfs_ioclustsz) {
488 471 upgrade = 1;
489 472 } else {
490 473 /* now look for contiguous allocation */
491 474 len = (ssize_t)blkroundup(fs, resid);
492 475 error = bmap_read(ip, uoff, &bn, &len);
493 476 if (error || bn == UFS_HOLE || len == 0)
494 477 goto errout;
495 478 /* save a call to bmap_read later */
496 479 bmap_peek = 1;
497 480 if (len < resid)
498 481 upgrade = 1;
499 482 }
500 483 if (upgrade) {
501 484 rw_exit(&ip->i_contents);
502 485 rw_enter(&ip->i_contents, RW_WRITER);
503 486 ufs_posix_hits++;
504 487 }
505 488 }
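
In other words, a concurrent FDSYNC rewrite stays under the shared lock only when it is a single iovec no larger than vfs_ioclustsz and backed by one contiguous extent on disk; a multi-iovec, oversized, or discontiguous rewrite upgrades i_contents to writer so it is serialized instead of split across driver requests.
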
506 489
507 490
508 491 /*
509 492 * allocate space
510 493 */
511 494
512 495 /*
513 496 * If attempting a re-write, there is no allocation to do.
514 497 * bmap_write would trip an ASSERT if i_contents is held shared.
515 498 */
516 499 if (rewrite)
517 500 goto skip_alloc;
518 501
519 502 do {
520 503 on = (int)blkoff(fs, uoff);
521 504 n = (int)MIN(fs->fs_bsize - on, resid);
522 505 if ((uoff + n) > ip->i_size) {
523 506 error = bmap_write(ip, uoff, (int)(on + n),
524 507 (int)(uoff & (offset_t)MAXBOFFSET) == 0,
525 508 NULL, cr);
526 509 /* Caller is responsible for updating i_seq if needed */
527 510 if (error)
528 511 break;
529 512 ip->i_size = uoff + n;
530 513 ip->i_flag |= IATTCHG;
531 514 } else if (n == MAXBSIZE) {
532 515 error = bmap_write(ip, uoff, (int)(on + n),
533 516 BI_ALLOC_ONLY, NULL, cr);
534 517 /* Caller is responsible for updating i_seq if needed */
535 518 } else {
536 519 if (has_holes < 0)
537 520 has_holes = bmap_has_holes(ip);
538 521 if (has_holes) {
539 522 uint_t blk_size;
540 523 u_offset_t offset;
541 524
542 525 offset = uoff & (offset_t)fs->fs_bmask;
543 526 blk_size = (int)blksize(fs, ip,
544 527 (daddr_t)lblkno(fs, offset));
545 528 error = bmap_write(ip, uoff, blk_size,
546 529 BI_NORMAL, NULL, cr);
547 530 /*
548 531 * Caller is responsible for updating
549 532 * i_seq if needed
550 533 */
551 534 } else
552 535 error = 0;
553 536 }
554 537 if (error)
555 538 break;
556 539 uoff += n;
557 540 resid -= n;
558 541 /*
559 542 * if file has grown larger than 2GB, set flag
560 543 * in superblock if not already set
561 544 */
562 545 if ((ip->i_size > MAXOFF32_T) &&
563 546 !(fs->fs_flags & FSLARGEFILES)) {
564 547 ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
565 548 mutex_enter(&ufsvfsp->vfs_lock);
566 549 fs->fs_flags |= FSLARGEFILES;
567 550 ufs_sbwrite(ufsvfsp);
568 551 mutex_exit(&ufsvfsp->vfs_lock);
569 552 }
570 553 } while (resid);
571 554
572 555 if (error) {
573 556 /*
574 557 * restore original state
575 558 */
576 559 if (resid) {
577 560 if (size == ip->i_size)
578 561 goto errout;
579 562 (void) ufs_itrunc(ip, size, 0, cr);
580 563 }
581 564 /*
582 565 * try non-directio path
583 566 */
584 567 goto errout;
585 568 }
586 569 skip_alloc:
587 570
588 571 /*
589 572 * get rid of cached pages
590 573 */
591 574 vp = ITOV(ip);
592 575 exclusive = rw_write_held(&ip->i_contents);
593 576 if (vn_has_cached_data(vp)) {
594 577 if (!exclusive) {
595 578 /*
596 579 * Still holding i_rwlock, so no allocations
597 580 * can happen after dropping contents.
598 581 */
599 582 rw_exit(&ip->i_contents);
600 583 rw_enter(&ip->i_contents, RW_WRITER);
601 584 }
602 585 (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
603 586 B_INVAL, cr, NULL);
604 587 if (vn_has_cached_data(vp))
605 588 goto errout;
606 589 if (!exclusive)
607 590 rw_downgrade(&ip->i_contents);
608 591 ufs_directio_kstats.nflushes.value.ui64++;
609 592 }
610 593
611 594 /*
612 595 * Direct Writes
613 596 */
614 597
615 598 if (!exclusive) {
616 599 ufs_shared_writes++;
617 600 ncur = atomic_inc_32_nv(&ufs_cur_writes);
618 601 if (ncur > ufs_maxcur_writes)
619 602 ufs_maxcur_writes = ncur;
620 603 }
621 604
622 605 /*
623 606 * proc and as are for VM operations in directio_start()
624 607 */
625 608 if (uio->uio_segflg == UIO_USERSPACE) {
626 609 procp = ttoproc(curthread);
627 610 as = procp->p_as;
628 611 } else {
629 612 procp = NULL;
630 613 as = &kas;
631 614 }
632 615 *statusp = DIRECTIO_SUCCESS;
633 616 error = 0;
634 617 newerror = 0;
635 618 resid = uio->uio_resid;
636 619 bytes_written = 0;
637 620 ufs_directio_kstats.logical_writes.value.ui64++;
638 621 while (error == 0 && newerror == 0 && resid && uio->uio_iovcnt) {
639 622 size_t pglck_len, pglck_size;
640 623 caddr_t pglck_base;
641 624 page_t **pplist, **spplist;
642 625
643 626 tail = NULL;
644 627
645 628 /*
646 629 * Adjust number of bytes
647 630 */
648 631 iov = uio->uio_iov;
649 632 pglck_len = (size_t)MIN(iov->iov_len, resid);
650 633 pglck_base = iov->iov_base;
651 634 if (pglck_len == 0) {
652 635 uio->uio_iov++;
653 636 uio->uio_iovcnt--;
654 637 continue;
655 638 }
656 639
657 640 /*
658 641  * Try to lock down the largest chunk of pages possible.
659 642 */
660 643 pglck_len = (size_t)MIN(pglck_len, ufsvfsp->vfs_ioclustsz);
661 644 error = as_pagelock(as, &pplist, pglck_base, pglck_len, S_READ);
662 645
663 646 if (error)
664 647 break;
665 648
666 649 pglck_size = pglck_len;
667 650 while (pglck_len) {
668 651
669 652 nbytes = pglck_len;
670 653 uoff = uio->uio_loffset;
671 654
672 655 if (!bmap_peek) {
673 656
674 657 /*
675 658 * Re-adjust number of bytes to contiguous
676 659 * range. May have already called bmap_read
677 660 * in the case of a concurrent rewrite.
678 661 */
679 662 len = (ssize_t)blkroundup(fs, nbytes);
680 663 error = bmap_read(ip, uoff, &bn, &len);
681 664 if (error)
682 665 break;
683 666 if (bn == UFS_HOLE || len == 0)
684 667 break;
685 668 }
686 669 nbytes = (size_t)MIN(nbytes, len);
687 670 bmap_peek = 0;
688 671
689 672 /*
690 673 * Get the pagelist pointer for this offset to be
691 674 * passed to directio_start.
692 675 */
693 676
694 677 if (pplist != NULL)
695 678 spplist = pplist +
696 679 btop((uintptr_t)iov->iov_base -
697 680 ((uintptr_t)pglck_base & PAGEMASK));
698 681 else
699 682 spplist = NULL;
700 683
701 684 /*
702 685 * Kick off the direct write requests
703 686 */
704 687 directio_start(ufsvfsp, ip, nbytes, ldbtob(bn),
705 688 iov->iov_base, S_READ, procp, &tail, spplist);
706 689
707 690 /*
708 691 * Adjust pointers and counters
709 692 */
710 693 iov->iov_len -= nbytes;
711 694 iov->iov_base += nbytes;
712 695 uio->uio_loffset += nbytes;
713 696 resid -= nbytes;
714 697 pglck_len -= nbytes;
715 698 }
716 699
717 700 /*
718 701 * Wait for outstanding requests
719 702 */
720 703 newerror = directio_wait(tail, &bytes_written);
721 704
722 705 /*
723 706 * Release VM resources
724 707 */
725 708 as_pageunlock(as, pplist, pglck_base, pglck_size, S_READ);
726 709
727 710 }
728 711
729 712 if (!exclusive) {
730 713 atomic_dec_32(&ufs_cur_writes);
731 714 /*
732 715 * If this write was done shared, readers may
733 716 * have pulled in unmodified pages. Get rid of
734 717 * these potentially stale pages.
735 718 */
736 719 if (vn_has_cached_data(vp)) {
737 720 rw_exit(&ip->i_contents);
738 721 rw_enter(&ip->i_contents, RW_WRITER);
739 722 (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
740 723 B_INVAL, cr, NULL);
741 724 ufs_directio_kstats.nflushes.value.ui64++;
742 725 rw_downgrade(&ip->i_contents);
743 726 }
744 727 }
745 728
746 729 /*
747 730 * If error, adjust resid to begin at the first
748 731 * un-writable byte.
749 732 */
750 733 if (error == 0)
751 734 error = newerror;
752 735 if (error)
753 736 resid = uio->uio_resid - bytes_written;
754 737 arg_uio->uio_resid = resid;
755 738
756 739 if (!rewrite) {
757 740 ip->i_flag |= IUPD | ICHG;
758 741 /* Caller will update i_seq */
759 742 TRANS_INODE(ip->i_ufsvfs, ip);
760 743 }
761 744 /*
762 745 * If there is a residual; adjust the EOF if necessary
763 746 */
764 747 if (resid) {
765 748 if (size != ip->i_size) {
766 749 if (uio->uio_loffset > size)
767 750 size = uio->uio_loffset;
768 751 (void) ufs_itrunc(ip, size, 0, cr);
769 752 }
770 753 }
771 754
772 755 if (uio == &copy_uio)
773 756 kmem_free(copy_base, copy_resid);
774 757
775 758 return (error);
776 759
777 760 errout:
778 761 if (uio == &copy_uio)
779 762 kmem_free(copy_base, copy_resid);
780 763
781 764 return (0);
782 765 }
783 766 /*
784 767 * Direct read of a hole
785 768 */
786 769 static int
787 770 directio_hole(struct uio *uio, size_t nbytes)
788 771 {
789 772 int error = 0, nzero;
790 773 uio_t phys_uio;
791 774 iovec_t phys_iov;
792 775
793 776 ufs_directio_kstats.hole_reads.value.ui64++;
794 777 ufs_directio_kstats.nread.value.ui64 += nbytes;
795 778
796 779 phys_iov.iov_base = uio->uio_iov->iov_base;
797 780 phys_iov.iov_len = nbytes;
798 781
799 782 phys_uio.uio_iov = &phys_iov;
800 783 phys_uio.uio_iovcnt = 1;
801 784 phys_uio.uio_resid = phys_iov.iov_len;
802 785 phys_uio.uio_segflg = uio->uio_segflg;
803 786 phys_uio.uio_extflg = uio->uio_extflg;
804 787 while (error == 0 && phys_uio.uio_resid) {
805 788 nzero = (int)MIN(phys_iov.iov_len, ufs_directio_zero_len);
806 789 error = uiomove(ufs_directio_zero_buf, nzero, UIO_READ,
807 790 &phys_uio);
808 791 }
809 792 return (error);
810 793 }
811 794
812 795 /*
813 796 * Direct Read
814 797 */
815 798 int
816 799 ufs_directio_read(struct inode *ip, uio_t *uio, cred_t *cr, int *statusp)
817 800 {
818 801 ssize_t resid, bytes_read;
819 802 u_offset_t size, uoff;
820 803 int error, newerror, len;
821 804 size_t nbytes;
822 805 struct fs *fs;
823 806 vnode_t *vp;
824 807 daddr_t bn;
825 808 iovec_t *iov;
826 809 struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
827 810 struct proc *procp;
828 811 struct as *as;
829 812 struct directio_buf *tail;
830 813
831 814 /*
832 815 * assume that directio isn't possible (normal case)
833 816 */
834 817 *statusp = DIRECTIO_FAILURE;
835 818
836 819 /*
837 820 * Don't go direct
838 821 */
839 822 if (ufs_directio_enabled == 0)
840 823 return (0);
841 824
842 825 /*
843 826 * mapped file; nevermind
844 827 */
845 828 if (ip->i_mapcnt)
846 829 return (0);
847 830
848 831 /*
849 832 * CAN WE DO DIRECT IO?
850 833 */
851 834 /*
852 835 * must be sector aligned
853 836 */
854 837 uoff = uio->uio_loffset;
855 838 resid = uio->uio_resid;
856 839 if ((uoff & (u_offset_t)(DEV_BSIZE - 1)) || (resid & (DEV_BSIZE - 1)))
857 840 return (0);
858 841 /*
859 842 * must be short aligned and sector aligned
860 843 */
861 844 iov = uio->uio_iov;
862 845 nbytes = uio->uio_iovcnt;
863 846 while (nbytes--) {
864 847 if (((size_t)iov->iov_len & (DEV_BSIZE - 1)) != 0)
865 848 return (0);
866 849 if ((intptr_t)(iov++->iov_base) & 1)
867 850 return (0);
868 851 }
869 852
870 853 /*
871 854 * DIRECTIO
872 855 */
873 856 fs = ip->i_fs;
874 857
875 858 /*
876 859 * don't read past EOF
877 860 */
878 861 size = ip->i_size;
879 862
880 863 /*
881 864 * The file offset is past EOF so bail out here; we don't want
882 865 * to update uio_resid and make it look like we read something.
883 866 * We say that direct I/O was a success to avoid having rdip()
884 867 * go through the same "read past EOF logic".
885 868 */
886 869 if (uoff >= size) {
887 870 *statusp = DIRECTIO_SUCCESS;
888 871 return (0);
889 872 }
890 873
891 874 /*
892 875 * The read would extend past EOF so make it smaller.
893 876 */
894 877 if ((uoff + resid) > size) {
895 878 resid = size - uoff;
896 879 /*
897 880 * recheck sector alignment
898 881 */
899 882 if (resid & (DEV_BSIZE - 1))
900 883 return (0);
901 884 }
902 885
903 886 /*
904 887 * At this point, we know there is some real work to do.
905 888 */
906 889 ASSERT(resid);
907 890
908 891 /*
909 892 * get rid of cached pages
910 893 */
911 894 vp = ITOV(ip);
912 895 if (vn_has_cached_data(vp)) {
913 896 rw_exit(&ip->i_contents);
914 897 rw_enter(&ip->i_contents, RW_WRITER);
915 898 (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
916 899 B_INVAL, cr, NULL);
917 900 if (vn_has_cached_data(vp))
918 901 return (0);
919 902 rw_downgrade(&ip->i_contents);
920 903 ufs_directio_kstats.nflushes.value.ui64++;
921 904 }
922 905 /*
923 906 * Direct Reads
924 907 */
925 908
926 909 /*
927 910 * proc and as are for VM operations in directio_start()
928 911 */
929 912 if (uio->uio_segflg == UIO_USERSPACE) {
930 913 procp = ttoproc(curthread);
931 914 as = procp->p_as;
932 915 } else {
933 916 procp = NULL;
934 917 as = &kas;
935 918 }
936 919
937 920 *statusp = DIRECTIO_SUCCESS;
938 921 error = 0;
939 922 newerror = 0;
940 923 bytes_read = 0;
941 924 ufs_directio_kstats.logical_reads.value.ui64++;
942 925 while (error == 0 && newerror == 0 && resid && uio->uio_iovcnt) {
943 926 size_t pglck_len, pglck_size;
944 927 caddr_t pglck_base;
945 928 page_t **pplist, **spplist;
946 929
947 930 tail = NULL;
948 931
949 932 /*
950 933 * Adjust number of bytes
951 934 */
952 935 iov = uio->uio_iov;
953 936 pglck_len = (size_t)MIN(iov->iov_len, resid);
954 937 pglck_base = iov->iov_base;
955 938 if (pglck_len == 0) {
956 939 uio->uio_iov++;
957 940 uio->uio_iovcnt--;
958 941 continue;
959 942 }
960 943
961 944 /*
962 945  * Try to lock down the largest chunk of pages possible.
963 946 */
964 947 pglck_len = (size_t)MIN(pglck_len, ufsvfsp->vfs_ioclustsz);
965 948 error = as_pagelock(as, &pplist, pglck_base,
966 949 pglck_len, S_WRITE);
967 950
968 951 if (error)
969 952 break;
970 953
971 954 pglck_size = pglck_len;
972 955 while (pglck_len) {
973 956
974 957 nbytes = pglck_len;
975 958 uoff = uio->uio_loffset;
976 959
977 960 /*
978 961 * Re-adjust number of bytes to contiguous range
979 962 */
980 963 len = (ssize_t)blkroundup(fs, nbytes);
981 964 error = bmap_read(ip, uoff, &bn, &len);
982 965 if (error)
983 966 break;
984 967
985 968 if (bn == UFS_HOLE) {
986 969 nbytes = (size_t)MIN(fs->fs_bsize -
987 970 (long)blkoff(fs, uoff), nbytes);
988 971 error = directio_hole(uio, nbytes);
989 972 /*
990 973 * Hole reads are not added to the list
991 974 * processed by directio_wait() below so
992 975 * account for bytes read here.
993 976 */
994 977 if (!error)
995 978 bytes_read += nbytes;
996 979 } else {
997 980 nbytes = (size_t)MIN(nbytes, len);
998 981
999 982 /*
1000 983 * Get the pagelist pointer for this offset
1001 984 * to be passed to directio_start.
1002 985 */
1003 986 if (pplist != NULL)
1004 987 spplist = pplist +
1005 988 btop((uintptr_t)iov->iov_base -
1006 989 ((uintptr_t)pglck_base & PAGEMASK));
1007 990 else
1008 991 spplist = NULL;
1009 992
1010 993 /*
1011 994 * Kick off the direct read requests
1012 995 */
1013 996 directio_start(ufsvfsp, ip, nbytes,
1014 997 ldbtob(bn), iov->iov_base,
1015 998 S_WRITE, procp, &tail, spplist);
1016 999 }
1017 1000
1018 1001 if (error)
1019 1002 break;
1020 1003
1021 1004 /*
1022 1005 * Adjust pointers and counters
1023 1006 */
1024 1007 iov->iov_len -= nbytes;
1025 1008 iov->iov_base += nbytes;
1026 1009 uio->uio_loffset += nbytes;
1027 1010 resid -= nbytes;
1028 1011 pglck_len -= nbytes;
1029 1012 }
1030 1013
1031 1014 /*
1032 1015 * Wait for outstanding requests
1033 1016 */
1034 1017 newerror = directio_wait(tail, &bytes_read);
1035 1018 /*
1036 1019 * Release VM resources
1037 1020 */
1038 1021 as_pageunlock(as, pplist, pglck_base, pglck_size, S_WRITE);
1039 1022
1040 1023 }
1041 1024
1042 1025 /*
1043 1026 * If error, adjust resid to begin at the first
1044 1027 * un-read byte.
1045 1028 */
1046 1029 if (error == 0)
1047 1030 error = newerror;
1048 1031 uio->uio_resid -= bytes_read;
1049 1032 return (error);
1050 1033 }
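
Closing context: both entry points above insist on a DEV_BSIZE-aligned offset and length and short-aligned buffer addresses. A hedged userland sketch of exercising this path through directio(3C), the documented interface for requesting direct I/O on UFS; the sizes and the lack of error reporting are illustrative only:

	#include <sys/types.h>
	#include <sys/fcntl.h>	/* directio(), DIRECTIO_ON */
	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>

	int
	main(int argc, char **argv)
	{
		int fd;
		void *buf;
		ssize_t n;

		if (argc != 2)
			return (1);
		if ((fd = open(argv[1], O_RDONLY)) == -1)
			return (1);
		/*
		 * A page-aligned 8K buffer satisfies both the sector
		 * (DEV_BSIZE) alignment of length/offset and the short
		 * alignment of the buffer address checked above.
		 */
		if ((buf = memalign(8192, 8192)) == NULL)
			return (1);
		if (directio(fd, DIRECTIO_ON) == -1)	/* bypass the page cache */
			return (1);
		n = pread(fd, buf, 8192, 0);	/* offset 0 is sector aligned */
		(void) printf("direct read returned %ld\n", (long)n);
		free(buf);
		(void) close(fd);
		return (0);
	}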