1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * Kernel asynchronous I/O. 29 * This is only for raw devices now (as of Nov. 1993). 30 */ 31 32 #include <sys/types.h> 33 #include <sys/errno.h> 34 #include <sys/conf.h> 35 #include <sys/file.h> 36 #include <sys/fs/snode.h> 37 #include <sys/unistd.h> 38 #include <sys/cmn_err.h> 39 #include <vm/as.h> 40 #include <vm/faultcode.h> 41 #include <sys/sysmacros.h> 42 #include <sys/procfs.h> 43 #include <sys/kmem.h> 44 #include <sys/autoconf.h> 45 #include <sys/ddi_impldefs.h> 46 #include <sys/sunddi.h> 47 #include <sys/aio_impl.h> 48 #include <sys/debug.h> 49 #include <sys/param.h> 50 #include <sys/systm.h> 51 #include <sys/vmsystm.h> 52 #include <sys/fs/pxfs_ki.h> 53 #include <sys/contract/process_impl.h> 54 55 /* 56 * external entry point. 57 */ 58 #ifdef _LP64 59 static int64_t kaioc(long, long, long, long, long, long); 60 #endif 61 static int kaio(ulong_t *, rval_t *); 62 63 64 #define AIO_64 0 65 #define AIO_32 1 66 #define AIO_LARGEFILE 2 67 68 /* 69 * implementation specific functions (private) 70 */ 71 #ifdef _LP64 72 static int alio(int, aiocb_t **, int, struct sigevent *); 73 #endif 74 static int aionotify(void); 75 static int aioinit(void); 76 static int aiostart(void); 77 static void alio_cleanup(aio_t *, aiocb_t **, int, int); 78 static int (*check_vp(struct vnode *, int))(vnode_t *, struct aio_req *, 79 cred_t *); 80 static void lio_set_error(aio_req_t *, int portused); 81 static aio_t *aio_aiop_alloc(); 82 static int aio_req_alloc(aio_req_t **, aio_result_t *); 83 static int aio_lio_alloc(aio_lio_t **); 84 static aio_req_t *aio_req_done(void *); 85 static aio_req_t *aio_req_remove(aio_req_t *); 86 static int aio_req_find(aio_result_t *, aio_req_t **); 87 static int aio_hash_insert(struct aio_req_t *, aio_t *); 88 static int aio_req_setup(aio_req_t **, aio_t *, aiocb_t *, 89 aio_result_t *, vnode_t *, int); 90 static int aio_cleanup_thread(aio_t *); 91 static aio_lio_t *aio_list_get(aio_result_t *); 92 static void lio_set_uerror(void *, int); 93 extern void aio_zerolen(aio_req_t *); 94 static int aiowait(struct timeval *, int, long *); 95 static int aiowaitn(void *, uint_t, uint_t *, timespec_t *); 96 static int aio_unlock_requests(caddr_t iocblist, int iocb_index, 97 aio_req_t *reqlist, aio_t *aiop, model_t model); 98 static int aio_reqlist_concat(aio_t *aiop, aio_req_t **reqlist, int max); 99 static int aiosuspend(void *, int, struct timespec *, int, 100 long *, int); 101 static int aliowait(int, void *, int, void *, int); 102 static int aioerror(void *, int); 103 static int aio_cancel(int, void *, long *, int); 104 
static int arw(int, int, char *, int, offset_t, aio_result_t *, int); 105 static int aiorw(int, void *, int, int); 106 107 static int alioLF(int, void *, int, void *); 108 static int aio_req_setupLF(aio_req_t **, aio_t *, aiocb64_32_t *, 109 aio_result_t *, vnode_t *, int); 110 static int alio32(int, void *, int, void *); 111 static int driver_aio_write(vnode_t *vp, struct aio_req *aio, cred_t *cred_p); 112 static int driver_aio_read(vnode_t *vp, struct aio_req *aio, cred_t *cred_p); 113 114 #ifdef _SYSCALL32_IMPL 115 static void aiocb_LFton(aiocb64_32_t *, aiocb_t *); 116 void aiocb_32ton(aiocb32_t *, aiocb_t *); 117 #endif /* _SYSCALL32_IMPL */ 118 119 /* 120 * implementation specific functions (external) 121 */ 122 void aio_req_free(aio_t *, aio_req_t *); 123 124 /* 125 * Event Port framework 126 */ 127 128 void aio_req_free_port(aio_t *, aio_req_t *); 129 static int aio_port_callback(void *, int *, pid_t, int, void *); 130 131 /* 132 * This is the loadable module wrapper. 133 */ 134 #include <sys/modctl.h> 135 #include <sys/syscall.h> 136 137 #ifdef _LP64 138 139 static struct sysent kaio_sysent = { 140 6, 141 SE_NOUNLOAD | SE_64RVAL | SE_ARGC, 142 (int (*)())kaioc 143 }; 144 145 #ifdef _SYSCALL32_IMPL 146 static struct sysent kaio_sysent32 = { 147 7, 148 SE_NOUNLOAD | SE_64RVAL, 149 kaio 150 }; 151 #endif /* _SYSCALL32_IMPL */ 152 153 #else /* _LP64 */ 154 155 static struct sysent kaio_sysent = { 156 7, 157 SE_NOUNLOAD | SE_32RVAL1, 158 kaio 159 }; 160 161 #endif /* _LP64 */ 162 163 /* 164 * Module linkage information for the kernel. 165 */ 166 167 static struct modlsys modlsys = { 168 &mod_syscallops, 169 "kernel Async I/O", 170 &kaio_sysent 171 }; 172 173 #ifdef _SYSCALL32_IMPL 174 static struct modlsys modlsys32 = { 175 &mod_syscallops32, 176 "kernel Async I/O for 32 bit compatibility", 177 &kaio_sysent32 178 }; 179 #endif /* _SYSCALL32_IMPL */ 180 181 182 static struct modlinkage modlinkage = { 183 MODREV_1, 184 { &modlsys, 185 #ifdef _SYSCALL32_IMPL 186 &modlsys32, 187 #endif 188 NULL 189 } 190 }; 191 192 int 193 _init(void) 194 { 195 int retval; 196 197 if ((retval = mod_install(&modlinkage)) != 0) 198 return (retval); 199 200 return (0); 201 } 202 203 int 204 _fini(void) 205 { 206 int retval; 207 208 retval = mod_remove(&modlinkage); 209 210 return (retval); 211 } 212 213 int 214 _info(struct modinfo *modinfop) 215 { 216 return (mod_info(&modlinkage, modinfop)); 217 } 218 219 #ifdef _LP64 220 static int64_t 221 kaioc( 222 long a0, 223 long a1, 224 long a2, 225 long a3, 226 long a4, 227 long a5) 228 { 229 int error; 230 long rval = 0; 231 232 switch ((int)a0 & ~AIO_POLL_BIT) { 233 case AIOREAD: 234 error = arw((int)a0, (int)a1, (char *)a2, (int)a3, 235 (offset_t)a4, (aio_result_t *)a5, FREAD); 236 break; 237 case AIOWRITE: 238 error = arw((int)a0, (int)a1, (char *)a2, (int)a3, 239 (offset_t)a4, (aio_result_t *)a5, FWRITE); 240 break; 241 case AIOWAIT: 242 error = aiowait((struct timeval *)a1, (int)a2, &rval); 243 break; 244 case AIOWAITN: 245 error = aiowaitn((void *)a1, (uint_t)a2, (uint_t *)a3, 246 (timespec_t *)a4); 247 break; 248 case AIONOTIFY: 249 error = aionotify(); 250 break; 251 case AIOINIT: 252 error = aioinit(); 253 break; 254 case AIOSTART: 255 error = aiostart(); 256 break; 257 case AIOLIO: 258 error = alio((int)a1, (aiocb_t **)a2, (int)a3, 259 (struct sigevent *)a4); 260 break; 261 case AIOLIOWAIT: 262 error = aliowait((int)a1, (void *)a2, (int)a3, 263 (struct sigevent *)a4, AIO_64); 264 break; 265 case AIOSUSPEND: 266 error = aiosuspend((void *)a1, (int)a2, 
(timespec_t *)a3, 267 (int)a4, &rval, AIO_64); 268 break; 269 case AIOERROR: 270 error = aioerror((void *)a1, AIO_64); 271 break; 272 case AIOAREAD: 273 error = aiorw((int)a0, (void *)a1, FREAD, AIO_64); 274 break; 275 case AIOAWRITE: 276 error = aiorw((int)a0, (void *)a1, FWRITE, AIO_64); 277 break; 278 case AIOCANCEL: 279 error = aio_cancel((int)a1, (void *)a2, &rval, AIO_64); 280 break; 281 282 /* 283 * The large file related stuff is valid only for 284 * 32 bit kernel and not for 64 bit kernel 285 * On 64 bit kernel we convert large file calls 286 * to regular 64bit calls. 287 */ 288 289 default: 290 error = EINVAL; 291 } 292 if (error) 293 return ((int64_t)set_errno(error)); 294 return (rval); 295 } 296 #endif 297 298 static int 299 kaio( 300 ulong_t *uap, 301 rval_t *rvp) 302 { 303 long rval = 0; 304 int error = 0; 305 offset_t off; 306 307 308 rvp->r_vals = 0; 309 #if defined(_LITTLE_ENDIAN) 310 off = ((u_offset_t)uap[5] << 32) | (u_offset_t)uap[4]; 311 #else 312 off = ((u_offset_t)uap[4] << 32) | (u_offset_t)uap[5]; 313 #endif 314 315 switch (uap[0] & ~AIO_POLL_BIT) { 316 /* 317 * It must be the 32 bit system call on 64 bit kernel 318 */ 319 case AIOREAD: 320 return (arw((int)uap[0], (int)uap[1], (char *)uap[2], 321 (int)uap[3], off, (aio_result_t *)uap[6], FREAD)); 322 case AIOWRITE: 323 return (arw((int)uap[0], (int)uap[1], (char *)uap[2], 324 (int)uap[3], off, (aio_result_t *)uap[6], FWRITE)); 325 case AIOWAIT: 326 error = aiowait((struct timeval *)uap[1], (int)uap[2], 327 &rval); 328 break; 329 case AIOWAITN: 330 error = aiowaitn((void *)uap[1], (uint_t)uap[2], 331 (uint_t *)uap[3], (timespec_t *)uap[4]); 332 break; 333 case AIONOTIFY: 334 return (aionotify()); 335 case AIOINIT: 336 return (aioinit()); 337 case AIOSTART: 338 return (aiostart()); 339 case AIOLIO: 340 return (alio32((int)uap[1], (void *)uap[2], (int)uap[3], 341 (void *)uap[4])); 342 case AIOLIOWAIT: 343 return (aliowait((int)uap[1], (void *)uap[2], 344 (int)uap[3], (struct sigevent *)uap[4], AIO_32)); 345 case AIOSUSPEND: 346 error = aiosuspend((void *)uap[1], (int)uap[2], 347 (timespec_t *)uap[3], (int)uap[4], 348 &rval, AIO_32); 349 break; 350 case AIOERROR: 351 return (aioerror((void *)uap[1], AIO_32)); 352 case AIOAREAD: 353 return (aiorw((int)uap[0], (void *)uap[1], 354 FREAD, AIO_32)); 355 case AIOAWRITE: 356 return (aiorw((int)uap[0], (void *)uap[1], 357 FWRITE, AIO_32)); 358 case AIOCANCEL: 359 error = (aio_cancel((int)uap[1], (void *)uap[2], &rval, 360 AIO_32)); 361 break; 362 case AIOLIO64: 363 return (alioLF((int)uap[1], (void *)uap[2], 364 (int)uap[3], (void *)uap[4])); 365 case AIOLIOWAIT64: 366 return (aliowait(uap[1], (void *)uap[2], 367 (int)uap[3], (void *)uap[4], AIO_LARGEFILE)); 368 case AIOSUSPEND64: 369 error = aiosuspend((void *)uap[1], (int)uap[2], 370 (timespec_t *)uap[3], (int)uap[4], &rval, 371 AIO_LARGEFILE); 372 break; 373 case AIOERROR64: 374 return (aioerror((void *)uap[1], AIO_LARGEFILE)); 375 case AIOAREAD64: 376 return (aiorw((int)uap[0], (void *)uap[1], FREAD, 377 AIO_LARGEFILE)); 378 case AIOAWRITE64: 379 return (aiorw((int)uap[0], (void *)uap[1], FWRITE, 380 AIO_LARGEFILE)); 381 case AIOCANCEL64: 382 error = (aio_cancel((int)uap[1], (void *)uap[2], 383 &rval, AIO_LARGEFILE)); 384 break; 385 default: 386 return (EINVAL); 387 } 388 389 rvp->r_val1 = rval; 390 return (error); 391 } 392 393 /* 394 * wake up LWPs in this process that are sleeping in 395 * aiowait(). 
396 */ 397 static int 398 aionotify(void) 399 { 400 aio_t *aiop; 401 402 aiop = curproc->p_aio; 403 if (aiop == NULL) 404 return (0); 405 406 mutex_enter(&aiop->aio_mutex); 407 aiop->aio_notifycnt++; 408 cv_broadcast(&aiop->aio_waitcv); 409 mutex_exit(&aiop->aio_mutex); 410 411 return (0); 412 } 413 414 static int 415 timeval2reltime(struct timeval *timout, timestruc_t *rqtime, 416 timestruc_t **rqtp, int *blocking) 417 { 418 #ifdef _SYSCALL32_IMPL 419 struct timeval32 wait_time_32; 420 #endif 421 struct timeval wait_time; 422 model_t model = get_udatamodel(); 423 424 *rqtp = NULL; 425 if (timout == NULL) { /* wait indefinitely */ 426 *blocking = 1; 427 return (0); 428 } 429 430 /* 431 * Need to correctly compare with the -1 passed in for a user 432 * address pointer, with both 32 bit and 64 bit apps. 433 */ 434 if (model == DATAMODEL_NATIVE) { 435 if ((intptr_t)timout == (intptr_t)-1) { /* don't wait */ 436 *blocking = 0; 437 return (0); 438 } 439 440 if (copyin(timout, &wait_time, sizeof (wait_time))) 441 return (EFAULT); 442 } 443 #ifdef _SYSCALL32_IMPL 444 else { 445 /* 446 * -1 from a 32bit app. It will not get sign extended. 447 * don't wait if -1. 448 */ 449 if ((intptr_t)timout == (intptr_t)((uint32_t)-1)) { 450 *blocking = 0; 451 return (0); 452 } 453 454 if (copyin(timout, &wait_time_32, sizeof (wait_time_32))) 455 return (EFAULT); 456 TIMEVAL32_TO_TIMEVAL(&wait_time, &wait_time_32); 457 } 458 #endif /* _SYSCALL32_IMPL */ 459 460 if (wait_time.tv_sec == 0 && wait_time.tv_usec == 0) { /* don't wait */ 461 *blocking = 0; 462 return (0); 463 } 464 465 if (wait_time.tv_sec < 0 || 466 wait_time.tv_usec < 0 || wait_time.tv_usec >= MICROSEC) 467 return (EINVAL); 468 469 rqtime->tv_sec = wait_time.tv_sec; 470 rqtime->tv_nsec = wait_time.tv_usec * 1000; 471 *rqtp = rqtime; 472 *blocking = 1; 473 474 return (0); 475 } 476 477 static int 478 timespec2reltime(timespec_t *timout, timestruc_t *rqtime, 479 timestruc_t **rqtp, int *blocking) 480 { 481 #ifdef _SYSCALL32_IMPL 482 timespec32_t wait_time_32; 483 #endif 484 model_t model = get_udatamodel(); 485 486 *rqtp = NULL; 487 if (timout == NULL) { 488 *blocking = 1; 489 return (0); 490 } 491 492 if (model == DATAMODEL_NATIVE) { 493 if (copyin(timout, rqtime, sizeof (*rqtime))) 494 return (EFAULT); 495 } 496 #ifdef _SYSCALL32_IMPL 497 else { 498 if (copyin(timout, &wait_time_32, sizeof (wait_time_32))) 499 return (EFAULT); 500 TIMESPEC32_TO_TIMESPEC(rqtime, &wait_time_32); 501 } 502 #endif /* _SYSCALL32_IMPL */ 503 504 if (rqtime->tv_sec == 0 && rqtime->tv_nsec == 0) { 505 *blocking = 0; 506 return (0); 507 } 508 509 if (rqtime->tv_sec < 0 || 510 rqtime->tv_nsec < 0 || rqtime->tv_nsec >= NANOSEC) 511 return (EINVAL); 512 513 *rqtp = rqtime; 514 *blocking = 1; 515 516 return (0); 517 } 518 519 /*ARGSUSED*/ 520 static int 521 aiowait( 522 struct timeval *timout, 523 int dontblockflg, 524 long *rval) 525 { 526 int error; 527 aio_t *aiop; 528 aio_req_t *reqp; 529 clock_t status; 530 int blocking; 531 int timecheck; 532 timestruc_t rqtime; 533 timestruc_t *rqtp; 534 535 aiop = curproc->p_aio; 536 if (aiop == NULL) 537 return (EINVAL); 538 539 /* 540 * Establish the absolute future time for the timeout. 
541 */ 542 error = timeval2reltime(timout, &rqtime, &rqtp, &blocking); 543 if (error) 544 return (error); 545 if (rqtp) { 546 timestruc_t now; 547 timecheck = timechanged; 548 gethrestime(&now); 549 timespecadd(rqtp, &now); 550 } 551 552 mutex_enter(&aiop->aio_mutex); 553 for (;;) { 554 /* process requests on poll queue */ 555 if (aiop->aio_pollq) { 556 mutex_exit(&aiop->aio_mutex); 557 aio_cleanup(0); 558 mutex_enter(&aiop->aio_mutex); 559 } 560 if ((reqp = aio_req_remove(NULL)) != NULL) { 561 *rval = (long)reqp->aio_req_resultp; 562 break; 563 } 564 /* user-level done queue might not be empty */ 565 if (aiop->aio_notifycnt > 0) { 566 aiop->aio_notifycnt--; 567 *rval = 1; 568 break; 569 } 570 /* don't block if no outstanding aio */ 571 if (aiop->aio_outstanding == 0 && dontblockflg) { 572 error = EINVAL; 573 break; 574 } 575 if (blocking) { 576 status = cv_waituntil_sig(&aiop->aio_waitcv, 577 &aiop->aio_mutex, rqtp, timecheck); 578 579 if (status > 0) /* check done queue again */ 580 continue; 581 if (status == 0) { /* interrupted by a signal */ 582 error = EINTR; 583 *rval = -1; 584 } else { /* timer expired */ 585 error = ETIME; 586 } 587 } 588 break; 589 } 590 mutex_exit(&aiop->aio_mutex); 591 if (reqp) { 592 aphysio_unlock(reqp); 593 aio_copyout_result(reqp); 594 mutex_enter(&aiop->aio_mutex); 595 aio_req_free(aiop, reqp); 596 mutex_exit(&aiop->aio_mutex); 597 } 598 return (error); 599 } 600 601 /* 602 * aiowaitn can be used to reap completed asynchronous requests submitted with 603 * lio_listio, aio_read or aio_write. 604 * This function only reaps asynchronous raw I/Os. 605 */ 606 607 /*ARGSUSED*/ 608 static int 609 aiowaitn(void *uiocb, uint_t nent, uint_t *nwait, timespec_t *timout) 610 { 611 int error = 0; 612 aio_t *aiop; 613 aio_req_t *reqlist = NULL; 614 caddr_t iocblist = NULL; /* array of iocb ptr's */ 615 uint_t waitcnt, cnt = 0; /* iocb cnt */ 616 size_t iocbsz; /* users iocb size */ 617 size_t riocbsz; /* returned iocb size */ 618 int iocb_index = 0; 619 model_t model = get_udatamodel(); 620 int blocking = 1; 621 int timecheck; 622 timestruc_t rqtime; 623 timestruc_t *rqtp; 624 625 aiop = curproc->p_aio; 626 if (aiop == NULL || nent == 0 || nent > _AIO_LISTIO_MAX) 627 return (EINVAL); 628 629 if (aiop->aio_outstanding == 0) 630 return (EAGAIN); 631 632 if (copyin(nwait, &waitcnt, sizeof (uint_t))) 633 return (EFAULT); 634 635 /* set *nwait to zero, if we must return prematurely */ 636 if (copyout(&cnt, nwait, sizeof (uint_t))) 637 return (EFAULT); 638 639 if (waitcnt == 0) { 640 blocking = 0; 641 rqtp = NULL; 642 waitcnt = nent; 643 } else { 644 error = timespec2reltime(timout, &rqtime, &rqtp, &blocking); 645 if (error) 646 return (error); 647 } 648 649 if (model == DATAMODEL_NATIVE) 650 iocbsz = (sizeof (aiocb_t *) * nent); 651 #ifdef _SYSCALL32_IMPL 652 else 653 iocbsz = (sizeof (caddr32_t) * nent); 654 #endif /* _SYSCALL32_IMPL */ 655 656 /* 657 * Only one aio_waitn call is allowed at a time. 658 * The active aio_waitn will collect all requests 659 * out of the "done" list and if necessary it will wait 660 * for some/all pending requests to fulfill the nwait 661 * parameter. 662 * A second or further aio_waitn calls will sleep here 663 * until the active aio_waitn finishes and leaves the kernel 664 * If the second call does not block (poll), then return 665 * immediately with the error code : EAGAIN. 666 * If the second call should block, then sleep here, but 667 * do not touch the timeout. The timeout starts when this 668 * aio_waitn-call becomes active. 
669 */ 670 671 mutex_enter(&aiop->aio_mutex); 672 673 while (aiop->aio_flags & AIO_WAITN) { 674 if (blocking == 0) { 675 mutex_exit(&aiop->aio_mutex); 676 return (EAGAIN); 677 } 678 679 /* block, no timeout */ 680 aiop->aio_flags |= AIO_WAITN_PENDING; 681 if (!cv_wait_sig(&aiop->aio_waitncv, &aiop->aio_mutex)) { 682 mutex_exit(&aiop->aio_mutex); 683 return (EINTR); 684 } 685 } 686 687 /* 688 * Establish the absolute future time for the timeout. 689 */ 690 if (rqtp) { 691 timestruc_t now; 692 timecheck = timechanged; 693 gethrestime(&now); 694 timespecadd(rqtp, &now); 695 } 696 697 if (iocbsz > aiop->aio_iocbsz && aiop->aio_iocb != NULL) { 698 kmem_free(aiop->aio_iocb, aiop->aio_iocbsz); 699 aiop->aio_iocb = NULL; 700 } 701 702 if (aiop->aio_iocb == NULL) { 703 iocblist = kmem_zalloc(iocbsz, KM_NOSLEEP); 704 if (iocblist == NULL) { 705 mutex_exit(&aiop->aio_mutex); 706 return (ENOMEM); 707 } 708 aiop->aio_iocb = (aiocb_t **)iocblist; 709 aiop->aio_iocbsz = iocbsz; 710 } else { 711 iocblist = (char *)aiop->aio_iocb; 712 } 713 714 aiop->aio_waitncnt = waitcnt; 715 aiop->aio_flags |= AIO_WAITN; 716 717 for (;;) { 718 /* push requests on poll queue to done queue */ 719 if (aiop->aio_pollq) { 720 mutex_exit(&aiop->aio_mutex); 721 aio_cleanup(0); 722 mutex_enter(&aiop->aio_mutex); 723 } 724 725 /* check for requests on done queue */ 726 if (aiop->aio_doneq) { 727 cnt += aio_reqlist_concat(aiop, &reqlist, nent - cnt); 728 aiop->aio_waitncnt = waitcnt - cnt; 729 } 730 731 /* user-level done queue might not be empty */ 732 if (aiop->aio_notifycnt > 0) { 733 aiop->aio_notifycnt--; 734 error = 0; 735 break; 736 } 737 738 /* 739 * if we are here second time as a result of timer 740 * expiration, we reset error if there are enough 741 * aiocb's to satisfy request. 742 * We return also if all requests are already done 743 * and we picked up the whole done queue. 744 */ 745 746 if ((cnt >= waitcnt) || (cnt > 0 && aiop->aio_pending == 0 && 747 aiop->aio_doneq == NULL)) { 748 error = 0; 749 break; 750 } 751 752 if ((cnt < waitcnt) && blocking) { 753 int rval = cv_waituntil_sig(&aiop->aio_waitcv, 754 &aiop->aio_mutex, rqtp, timecheck); 755 if (rval > 0) 756 continue; 757 if (rval < 0) { 758 error = ETIME; 759 blocking = 0; 760 continue; 761 } 762 error = EINTR; 763 } 764 break; 765 } 766 767 mutex_exit(&aiop->aio_mutex); 768 769 if (cnt > 0) { 770 771 iocb_index = aio_unlock_requests(iocblist, iocb_index, reqlist, 772 aiop, model); 773 774 if (model == DATAMODEL_NATIVE) 775 riocbsz = (sizeof (aiocb_t *) * cnt); 776 #ifdef _SYSCALL32_IMPL 777 else 778 riocbsz = (sizeof (caddr32_t) * cnt); 779 #endif /* _SYSCALL32_IMPL */ 780 781 if (copyout(iocblist, uiocb, riocbsz) || 782 copyout(&cnt, nwait, sizeof (uint_t))) 783 error = EFAULT; 784 } 785 786 /* check if there is another thread waiting for execution */ 787 mutex_enter(&aiop->aio_mutex); 788 aiop->aio_flags &= ~AIO_WAITN; 789 if (aiop->aio_flags & AIO_WAITN_PENDING) { 790 aiop->aio_flags &= ~AIO_WAITN_PENDING; 791 cv_signal(&aiop->aio_waitncv); 792 } 793 mutex_exit(&aiop->aio_mutex); 794 795 return (error); 796 } 797 798 /* 799 * aio_unlock_requests 800 * copyouts the result of the request as well as the return value. 801 * It builds the list of completed asynchronous requests, 802 * unlocks the allocated memory ranges and 803 * put the aio request structure back into the free list. 
804 */ 805 806 static int 807 aio_unlock_requests( 808 caddr_t iocblist, 809 int iocb_index, 810 aio_req_t *reqlist, 811 aio_t *aiop, 812 model_t model) 813 { 814 aio_req_t *reqp, *nreqp; 815 816 if (model == DATAMODEL_NATIVE) { 817 for (reqp = reqlist; reqp != NULL; reqp = nreqp) { 818 (((caddr_t *)iocblist)[iocb_index++]) = 819 reqp->aio_req_iocb.iocb; 820 nreqp = reqp->aio_req_next; 821 aphysio_unlock(reqp); 822 aio_copyout_result(reqp); 823 mutex_enter(&aiop->aio_mutex); 824 aio_req_free(aiop, reqp); 825 mutex_exit(&aiop->aio_mutex); 826 } 827 } 828 #ifdef _SYSCALL32_IMPL 829 else { 830 for (reqp = reqlist; reqp != NULL; reqp = nreqp) { 831 ((caddr32_t *)iocblist)[iocb_index++] = 832 reqp->aio_req_iocb.iocb32; 833 nreqp = reqp->aio_req_next; 834 aphysio_unlock(reqp); 835 aio_copyout_result(reqp); 836 mutex_enter(&aiop->aio_mutex); 837 aio_req_free(aiop, reqp); 838 mutex_exit(&aiop->aio_mutex); 839 } 840 } 841 #endif /* _SYSCALL32_IMPL */ 842 return (iocb_index); 843 } 844 845 /* 846 * aio_reqlist_concat 847 * moves "max" elements from the done queue to the reqlist queue and removes 848 * the AIO_DONEQ flag. 849 * - reqlist queue is a simple linked list 850 * - done queue is a double linked list 851 */ 852 853 static int 854 aio_reqlist_concat(aio_t *aiop, aio_req_t **reqlist, int max) 855 { 856 aio_req_t *q2, *q2work, *list; 857 int count = 0; 858 859 list = *reqlist; 860 q2 = aiop->aio_doneq; 861 q2work = q2; 862 while (max-- > 0) { 863 q2work->aio_req_flags &= ~AIO_DONEQ; 864 q2work = q2work->aio_req_next; 865 count++; 866 if (q2work == q2) 867 break; 868 } 869 870 if (q2work == q2) { 871 /* all elements revised */ 872 q2->aio_req_prev->aio_req_next = list; 873 list = q2; 874 aiop->aio_doneq = NULL; 875 } else { 876 /* 877 * max < elements in the doneq 878 * detach only the required amount of elements 879 * out of the doneq 880 */ 881 q2work->aio_req_prev->aio_req_next = list; 882 list = q2; 883 884 aiop->aio_doneq = q2work; 885 q2work->aio_req_prev = q2->aio_req_prev; 886 q2->aio_req_prev->aio_req_next = q2work; 887 } 888 *reqlist = list; 889 return (count); 890 } 891 892 /*ARGSUSED*/ 893 static int 894 aiosuspend( 895 void *aiocb, 896 int nent, 897 struct timespec *timout, 898 int flag, 899 long *rval, 900 int run_mode) 901 { 902 int error; 903 aio_t *aiop; 904 aio_req_t *reqp, *found, *next; 905 caddr_t cbplist = NULL; 906 aiocb_t *cbp, **ucbp; 907 #ifdef _SYSCALL32_IMPL 908 aiocb32_t *cbp32; 909 caddr32_t *ucbp32; 910 #endif /* _SYSCALL32_IMPL */ 911 aiocb64_32_t *cbp64; 912 int rv; 913 int i; 914 size_t ssize; 915 model_t model = get_udatamodel(); 916 int blocking; 917 int timecheck; 918 timestruc_t rqtime; 919 timestruc_t *rqtp; 920 921 aiop = curproc->p_aio; 922 if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX) 923 return (EINVAL); 924 925 /* 926 * Establish the absolute future time for the timeout. 927 */ 928 error = timespec2reltime(timout, &rqtime, &rqtp, &blocking); 929 if (error) 930 return (error); 931 if (rqtp) { 932 timestruc_t now; 933 timecheck = timechanged; 934 gethrestime(&now); 935 timespecadd(rqtp, &now); 936 } 937 938 /* 939 * If we are not blocking and there's no IO complete 940 * skip aiocb copyin. 
941 */ 942 if (!blocking && (aiop->aio_pollq == NULL) && 943 (aiop->aio_doneq == NULL)) { 944 return (EAGAIN); 945 } 946 947 if (model == DATAMODEL_NATIVE) 948 ssize = (sizeof (aiocb_t *) * nent); 949 #ifdef _SYSCALL32_IMPL 950 else 951 ssize = (sizeof (caddr32_t) * nent); 952 #endif /* _SYSCALL32_IMPL */ 953 954 cbplist = kmem_alloc(ssize, KM_NOSLEEP); 955 if (cbplist == NULL) 956 return (ENOMEM); 957 958 if (copyin(aiocb, cbplist, ssize)) { 959 error = EFAULT; 960 goto done; 961 } 962 963 found = NULL; 964 /* 965 * we need to get the aio_cleanupq_mutex since we call 966 * aio_req_done(). 967 */ 968 mutex_enter(&aiop->aio_cleanupq_mutex); 969 mutex_enter(&aiop->aio_mutex); 970 for (;;) { 971 /* push requests on poll queue to done queue */ 972 if (aiop->aio_pollq) { 973 mutex_exit(&aiop->aio_mutex); 974 mutex_exit(&aiop->aio_cleanupq_mutex); 975 aio_cleanup(0); 976 mutex_enter(&aiop->aio_cleanupq_mutex); 977 mutex_enter(&aiop->aio_mutex); 978 } 979 /* check for requests on done queue */ 980 if (aiop->aio_doneq) { 981 if (model == DATAMODEL_NATIVE) 982 ucbp = (aiocb_t **)cbplist; 983 #ifdef _SYSCALL32_IMPL 984 else 985 ucbp32 = (caddr32_t *)cbplist; 986 #endif /* _SYSCALL32_IMPL */ 987 for (i = 0; i < nent; i++) { 988 if (model == DATAMODEL_NATIVE) { 989 if ((cbp = *ucbp++) == NULL) 990 continue; 991 if (run_mode != AIO_LARGEFILE) 992 reqp = aio_req_done( 993 &cbp->aio_resultp); 994 else { 995 cbp64 = (aiocb64_32_t *)cbp; 996 reqp = aio_req_done( 997 &cbp64->aio_resultp); 998 } 999 } 1000 #ifdef _SYSCALL32_IMPL 1001 else { 1002 if (run_mode == AIO_32) { 1003 if ((cbp32 = 1004 (aiocb32_t *)(uintptr_t) 1005 *ucbp32++) == NULL) 1006 continue; 1007 reqp = aio_req_done( 1008 &cbp32->aio_resultp); 1009 } else if (run_mode == AIO_LARGEFILE) { 1010 if ((cbp64 = 1011 (aiocb64_32_t *)(uintptr_t) 1012 *ucbp32++) == NULL) 1013 continue; 1014 reqp = aio_req_done( 1015 &cbp64->aio_resultp); 1016 } 1017 1018 } 1019 #endif /* _SYSCALL32_IMPL */ 1020 if (reqp) { 1021 reqp->aio_req_next = found; 1022 found = reqp; 1023 } 1024 if (aiop->aio_doneq == NULL) 1025 break; 1026 } 1027 if (found) 1028 break; 1029 } 1030 if (aiop->aio_notifycnt > 0) { 1031 /* 1032 * nothing on the kernel's queue. the user 1033 * has notified the kernel that it has items 1034 * on a user-level queue. 1035 */ 1036 aiop->aio_notifycnt--; 1037 *rval = 1; 1038 error = 0; 1039 break; 1040 } 1041 /* don't block if nothing is outstanding */ 1042 if (aiop->aio_outstanding == 0) { 1043 error = EAGAIN; 1044 break; 1045 } 1046 if (blocking) { 1047 /* 1048 * drop the aio_cleanupq_mutex as we are 1049 * going to block. 1050 */ 1051 mutex_exit(&aiop->aio_cleanupq_mutex); 1052 rv = cv_waituntil_sig(&aiop->aio_waitcv, 1053 &aiop->aio_mutex, rqtp, timecheck); 1054 /* 1055 * we have to drop aio_mutex and 1056 * grab it in the right order. 
1057 */ 1058 mutex_exit(&aiop->aio_mutex); 1059 mutex_enter(&aiop->aio_cleanupq_mutex); 1060 mutex_enter(&aiop->aio_mutex); 1061 if (rv > 0) /* check done queue again */ 1062 continue; 1063 if (rv == 0) /* interrupted by a signal */ 1064 error = EINTR; 1065 else /* timer expired */ 1066 error = ETIME; 1067 } else { 1068 error = EAGAIN; 1069 } 1070 break; 1071 } 1072 mutex_exit(&aiop->aio_mutex); 1073 mutex_exit(&aiop->aio_cleanupq_mutex); 1074 for (reqp = found; reqp != NULL; reqp = next) { 1075 next = reqp->aio_req_next; 1076 aphysio_unlock(reqp); 1077 aio_copyout_result(reqp); 1078 mutex_enter(&aiop->aio_mutex); 1079 aio_req_free(aiop, reqp); 1080 mutex_exit(&aiop->aio_mutex); 1081 } 1082 done: 1083 kmem_free(cbplist, ssize); 1084 return (error); 1085 } 1086 1087 /* 1088 * initialize aio by allocating an aio_t struct for this 1089 * process. 1090 */ 1091 static int 1092 aioinit(void) 1093 { 1094 proc_t *p = curproc; 1095 aio_t *aiop; 1096 mutex_enter(&p->p_lock); 1097 if ((aiop = p->p_aio) == NULL) { 1098 aiop = aio_aiop_alloc(); 1099 p->p_aio = aiop; 1100 } 1101 mutex_exit(&p->p_lock); 1102 if (aiop == NULL) 1103 return (ENOMEM); 1104 return (0); 1105 } 1106 1107 /* 1108 * start a special thread that will cleanup after aio requests 1109 * that are preventing a segment from being unmapped. as_unmap() 1110 * blocks until all phsyio to this segment is completed. this 1111 * doesn't happen until all the pages in this segment are not 1112 * SOFTLOCKed. Some pages will be SOFTLOCKed when there are aio 1113 * requests still outstanding. this special thread will make sure 1114 * that these SOFTLOCKed pages will eventually be SOFTUNLOCKed. 1115 * 1116 * this function will return an error if the process has only 1117 * one LWP. the assumption is that the caller is a separate LWP 1118 * that remains blocked in the kernel for the life of this process. 1119 */ 1120 static int 1121 aiostart(void) 1122 { 1123 proc_t *p = curproc; 1124 aio_t *aiop; 1125 int first, error = 0; 1126 1127 if (p->p_lwpcnt == 1) 1128 return (EDEADLK); 1129 mutex_enter(&p->p_lock); 1130 if ((aiop = p->p_aio) == NULL) 1131 error = EINVAL; 1132 else { 1133 first = aiop->aio_ok; 1134 if (aiop->aio_ok == 0) 1135 aiop->aio_ok = 1; 1136 } 1137 mutex_exit(&p->p_lock); 1138 if (error == 0 && first == 0) { 1139 return (aio_cleanup_thread(aiop)); 1140 /* should return only to exit */ 1141 } 1142 return (error); 1143 } 1144 1145 /* 1146 * Associate an aiocb with a port. 1147 * This function is used by aiorw() to associate a transaction with a port. 1148 * Allocate an event port structure (port_alloc_event()) and store the 1149 * delivered user pointer (portnfy_user) in the portkev_user field of the 1150 * port_kevent_t structure.. 1151 * The aio_req_portkev pointer in the aio_req_t structure was added to identify 1152 * the port association. 
1153 */ 1154 1155 static int 1156 aio_req_assoc_port_rw(port_notify_t *pntfy, aiocb_t *cbp, 1157 aio_req_t *reqp, int event) 1158 { 1159 port_kevent_t *pkevp = NULL; 1160 int error; 1161 1162 error = port_alloc_event(pntfy->portnfy_port, PORT_ALLOC_DEFAULT, 1163 PORT_SOURCE_AIO, &pkevp); 1164 if (error) { 1165 if ((error == ENOMEM) || (error == EAGAIN)) 1166 error = EAGAIN; 1167 else 1168 error = EINVAL; 1169 } else { 1170 port_init_event(pkevp, (uintptr_t)cbp, pntfy->portnfy_user, 1171 aio_port_callback, reqp); 1172 pkevp->portkev_events = event; 1173 reqp->aio_req_portkev = pkevp; 1174 reqp->aio_req_port = pntfy->portnfy_port; 1175 } 1176 return (error); 1177 } 1178 1179 #ifdef _LP64 1180 1181 /* 1182 * Asynchronous list IO. A chain of aiocb's are copied in 1183 * one at a time. If the aiocb is invalid, it is skipped. 1184 * For each aiocb, the appropriate driver entry point is 1185 * called. Optimize for the common case where the list 1186 * of requests is to the same file descriptor. 1187 * 1188 * One possible optimization is to define a new driver entry 1189 * point that supports a list of IO requests. Whether this 1190 * improves performance depends somewhat on the driver's 1191 * locking strategy. Processing a list could adversely impact 1192 * the driver's interrupt latency. 1193 */ 1194 static int 1195 alio( 1196 int mode_arg, 1197 aiocb_t **aiocb_arg, 1198 int nent, 1199 struct sigevent *sigev) 1200 { 1201 file_t *fp; 1202 file_t *prev_fp = NULL; 1203 int prev_mode = -1; 1204 struct vnode *vp; 1205 aio_lio_t *head; 1206 aio_req_t *reqp; 1207 aio_t *aiop; 1208 caddr_t cbplist; 1209 aiocb_t cb; 1210 aiocb_t *aiocb = &cb; 1211 aiocb_t *cbp; 1212 aiocb_t **ucbp; 1213 struct sigevent sigevk; 1214 sigqueue_t *sqp; 1215 int (*aio_func)(); 1216 int mode; 1217 int error = 0; 1218 int aio_errors = 0; 1219 int i; 1220 size_t ssize; 1221 int deadhead = 0; 1222 int aio_notsupported = 0; 1223 int lio_head_port; 1224 int aio_port; 1225 int aio_thread; 1226 port_kevent_t *pkevtp = NULL; 1227 int portused = 0; 1228 port_notify_t pnotify; 1229 int event; 1230 1231 aiop = curproc->p_aio; 1232 if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX) 1233 return (EINVAL); 1234 1235 ssize = (sizeof (aiocb_t *) * nent); 1236 cbplist = kmem_alloc(ssize, KM_SLEEP); 1237 ucbp = (aiocb_t **)cbplist; 1238 1239 if (copyin(aiocb_arg, cbplist, ssize) || 1240 (sigev && copyin(sigev, &sigevk, sizeof (struct sigevent)))) { 1241 kmem_free(cbplist, ssize); 1242 return (EFAULT); 1243 } 1244 1245 /* Event Ports */ 1246 if (sigev && 1247 (sigevk.sigev_notify == SIGEV_THREAD || 1248 sigevk.sigev_notify == SIGEV_PORT)) { 1249 if (sigevk.sigev_notify == SIGEV_THREAD) { 1250 pnotify.portnfy_port = sigevk.sigev_signo; 1251 pnotify.portnfy_user = sigevk.sigev_value.sival_ptr; 1252 } else if (copyin(sigevk.sigev_value.sival_ptr, 1253 &pnotify, sizeof (pnotify))) { 1254 kmem_free(cbplist, ssize); 1255 return (EFAULT); 1256 } 1257 error = port_alloc_event(pnotify.portnfy_port, 1258 PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, &pkevtp); 1259 if (error) { 1260 if (error == ENOMEM || error == EAGAIN) 1261 error = EAGAIN; 1262 else 1263 error = EINVAL; 1264 kmem_free(cbplist, ssize); 1265 return (error); 1266 } 1267 lio_head_port = pnotify.portnfy_port; 1268 portused = 1; 1269 } 1270 1271 /* 1272 * a list head should be allocated if notification is 1273 * enabled for this list. 
1274 */ 1275 head = NULL; 1276 1277 if (mode_arg == LIO_WAIT || sigev) { 1278 mutex_enter(&aiop->aio_mutex); 1279 error = aio_lio_alloc(&head); 1280 mutex_exit(&aiop->aio_mutex); 1281 if (error) 1282 goto done; 1283 deadhead = 1; 1284 head->lio_nent = nent; 1285 head->lio_refcnt = nent; 1286 head->lio_port = -1; 1287 head->lio_portkev = NULL; 1288 if (sigev && sigevk.sigev_notify == SIGEV_SIGNAL && 1289 sigevk.sigev_signo > 0 && sigevk.sigev_signo < NSIG) { 1290 sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP); 1291 if (sqp == NULL) { 1292 error = EAGAIN; 1293 goto done; 1294 } 1295 sqp->sq_func = NULL; 1296 sqp->sq_next = NULL; 1297 sqp->sq_info.si_code = SI_ASYNCIO; 1298 sqp->sq_info.si_pid = curproc->p_pid; 1299 sqp->sq_info.si_ctid = PRCTID(curproc); 1300 sqp->sq_info.si_zoneid = getzoneid(); 1301 sqp->sq_info.si_uid = crgetuid(curproc->p_cred); 1302 sqp->sq_info.si_signo = sigevk.sigev_signo; 1303 sqp->sq_info.si_value = sigevk.sigev_value; 1304 head->lio_sigqp = sqp; 1305 } else { 1306 head->lio_sigqp = NULL; 1307 } 1308 if (pkevtp) { 1309 /* 1310 * Prepare data to send when list of aiocb's 1311 * has completed. 1312 */ 1313 port_init_event(pkevtp, (uintptr_t)sigev, 1314 (void *)(uintptr_t)pnotify.portnfy_user, 1315 NULL, head); 1316 pkevtp->portkev_events = AIOLIO; 1317 head->lio_portkev = pkevtp; 1318 head->lio_port = pnotify.portnfy_port; 1319 } 1320 } 1321 1322 for (i = 0; i < nent; i++, ucbp++) { 1323 1324 cbp = *ucbp; 1325 /* skip entry if it can't be copied. */ 1326 if (cbp == NULL || copyin(cbp, aiocb, sizeof (*aiocb))) { 1327 if (head) { 1328 mutex_enter(&aiop->aio_mutex); 1329 head->lio_nent--; 1330 head->lio_refcnt--; 1331 mutex_exit(&aiop->aio_mutex); 1332 } 1333 continue; 1334 } 1335 1336 /* skip if opcode for aiocb is LIO_NOP */ 1337 mode = aiocb->aio_lio_opcode; 1338 if (mode == LIO_NOP) { 1339 cbp = NULL; 1340 if (head) { 1341 mutex_enter(&aiop->aio_mutex); 1342 head->lio_nent--; 1343 head->lio_refcnt--; 1344 mutex_exit(&aiop->aio_mutex); 1345 } 1346 continue; 1347 } 1348 1349 /* increment file descriptor's ref count. */ 1350 if ((fp = getf(aiocb->aio_fildes)) == NULL) { 1351 lio_set_uerror(&cbp->aio_resultp, EBADF); 1352 if (head) { 1353 mutex_enter(&aiop->aio_mutex); 1354 head->lio_nent--; 1355 head->lio_refcnt--; 1356 mutex_exit(&aiop->aio_mutex); 1357 } 1358 aio_errors++; 1359 continue; 1360 } 1361 1362 /* 1363 * check the permission of the partition 1364 */ 1365 if ((fp->f_flag & mode) == 0) { 1366 releasef(aiocb->aio_fildes); 1367 lio_set_uerror(&cbp->aio_resultp, EBADF); 1368 if (head) { 1369 mutex_enter(&aiop->aio_mutex); 1370 head->lio_nent--; 1371 head->lio_refcnt--; 1372 mutex_exit(&aiop->aio_mutex); 1373 } 1374 aio_errors++; 1375 continue; 1376 } 1377 1378 /* 1379 * common case where requests are to the same fd 1380 * for the same r/w operation. 
1381 * for UFS, need to set EBADFD 1382 */ 1383 vp = fp->f_vnode; 1384 if (fp != prev_fp || mode != prev_mode) { 1385 aio_func = check_vp(vp, mode); 1386 if (aio_func == NULL) { 1387 prev_fp = NULL; 1388 releasef(aiocb->aio_fildes); 1389 lio_set_uerror(&cbp->aio_resultp, EBADFD); 1390 aio_notsupported++; 1391 if (head) { 1392 mutex_enter(&aiop->aio_mutex); 1393 head->lio_nent--; 1394 head->lio_refcnt--; 1395 mutex_exit(&aiop->aio_mutex); 1396 } 1397 continue; 1398 } else { 1399 prev_fp = fp; 1400 prev_mode = mode; 1401 } 1402 } 1403 1404 error = aio_req_setup(&reqp, aiop, aiocb, 1405 &cbp->aio_resultp, vp, 0); 1406 if (error) { 1407 releasef(aiocb->aio_fildes); 1408 lio_set_uerror(&cbp->aio_resultp, error); 1409 if (head) { 1410 mutex_enter(&aiop->aio_mutex); 1411 head->lio_nent--; 1412 head->lio_refcnt--; 1413 mutex_exit(&aiop->aio_mutex); 1414 } 1415 aio_errors++; 1416 continue; 1417 } 1418 1419 reqp->aio_req_lio = head; 1420 deadhead = 0; 1421 1422 /* 1423 * Set the errno field now before sending the request to 1424 * the driver to avoid a race condition 1425 */ 1426 (void) suword32(&cbp->aio_resultp.aio_errno, 1427 EINPROGRESS); 1428 1429 reqp->aio_req_iocb.iocb = (caddr_t)cbp; 1430 1431 event = (mode == LIO_READ)? AIOAREAD : AIOAWRITE; 1432 aio_port = (aiocb->aio_sigevent.sigev_notify == SIGEV_PORT); 1433 aio_thread = (aiocb->aio_sigevent.sigev_notify == SIGEV_THREAD); 1434 if (aio_port | aio_thread) { 1435 port_kevent_t *lpkevp; 1436 /* 1437 * Prepare data to send with each aiocb completed. 1438 */ 1439 if (aio_port) { 1440 void *paddr = 1441 aiocb->aio_sigevent.sigev_value.sival_ptr; 1442 if (copyin(paddr, &pnotify, sizeof (pnotify))) 1443 error = EFAULT; 1444 } else { /* aio_thread */ 1445 pnotify.portnfy_port = 1446 aiocb->aio_sigevent.sigev_signo; 1447 pnotify.portnfy_user = 1448 aiocb->aio_sigevent.sigev_value.sival_ptr; 1449 } 1450 if (error) 1451 /* EMPTY */; 1452 else if (pkevtp != NULL && 1453 pnotify.portnfy_port == lio_head_port) 1454 error = port_dup_event(pkevtp, &lpkevp, 1455 PORT_ALLOC_DEFAULT); 1456 else 1457 error = port_alloc_event(pnotify.portnfy_port, 1458 PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, 1459 &lpkevp); 1460 if (error == 0) { 1461 port_init_event(lpkevp, (uintptr_t)cbp, 1462 (void *)(uintptr_t)pnotify.portnfy_user, 1463 aio_port_callback, reqp); 1464 lpkevp->portkev_events = event; 1465 reqp->aio_req_portkev = lpkevp; 1466 reqp->aio_req_port = pnotify.portnfy_port; 1467 } 1468 } 1469 1470 /* 1471 * send the request to driver. 1472 */ 1473 if (error == 0) { 1474 if (aiocb->aio_nbytes == 0) { 1475 clear_active_fd(aiocb->aio_fildes); 1476 aio_zerolen(reqp); 1477 continue; 1478 } 1479 error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req, 1480 CRED()); 1481 } 1482 1483 /* 1484 * the fd's ref count is not decremented until the IO has 1485 * completed unless there was an error. 
1486 */ 1487 if (error) { 1488 releasef(aiocb->aio_fildes); 1489 lio_set_uerror(&cbp->aio_resultp, error); 1490 if (head) { 1491 mutex_enter(&aiop->aio_mutex); 1492 head->lio_nent--; 1493 head->lio_refcnt--; 1494 mutex_exit(&aiop->aio_mutex); 1495 } 1496 if (error == ENOTSUP) 1497 aio_notsupported++; 1498 else 1499 aio_errors++; 1500 lio_set_error(reqp, portused); 1501 } else { 1502 clear_active_fd(aiocb->aio_fildes); 1503 } 1504 } 1505 1506 if (aio_notsupported) { 1507 error = ENOTSUP; 1508 } else if (aio_errors) { 1509 /* 1510 * return EIO if any request failed 1511 */ 1512 error = EIO; 1513 } 1514 1515 if (mode_arg == LIO_WAIT) { 1516 mutex_enter(&aiop->aio_mutex); 1517 while (head->lio_refcnt > 0) { 1518 if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) { 1519 mutex_exit(&aiop->aio_mutex); 1520 error = EINTR; 1521 goto done; 1522 } 1523 } 1524 mutex_exit(&aiop->aio_mutex); 1525 alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_64); 1526 } 1527 1528 done: 1529 kmem_free(cbplist, ssize); 1530 if (deadhead) { 1531 if (head->lio_sigqp) 1532 kmem_free(head->lio_sigqp, sizeof (sigqueue_t)); 1533 if (head->lio_portkev) 1534 port_free_event(head->lio_portkev); 1535 kmem_free(head, sizeof (aio_lio_t)); 1536 } 1537 return (error); 1538 } 1539 1540 #endif /* _LP64 */ 1541 1542 /* 1543 * Asynchronous list IO. 1544 * If list I/O is called with LIO_WAIT it can still return 1545 * before all the I/O's are completed if a signal is caught 1546 * or if the list include UFS I/O requests. If this happens, 1547 * libaio will call aliowait() to wait for the I/O's to 1548 * complete 1549 */ 1550 /*ARGSUSED*/ 1551 static int 1552 aliowait( 1553 int mode, 1554 void *aiocb, 1555 int nent, 1556 void *sigev, 1557 int run_mode) 1558 { 1559 aio_lio_t *head; 1560 aio_t *aiop; 1561 caddr_t cbplist; 1562 aiocb_t *cbp, **ucbp; 1563 #ifdef _SYSCALL32_IMPL 1564 aiocb32_t *cbp32; 1565 caddr32_t *ucbp32; 1566 aiocb64_32_t *cbp64; 1567 #endif 1568 int error = 0; 1569 int i; 1570 size_t ssize = 0; 1571 model_t model = get_udatamodel(); 1572 1573 aiop = curproc->p_aio; 1574 if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX) 1575 return (EINVAL); 1576 1577 if (model == DATAMODEL_NATIVE) 1578 ssize = (sizeof (aiocb_t *) * nent); 1579 #ifdef _SYSCALL32_IMPL 1580 else 1581 ssize = (sizeof (caddr32_t) * nent); 1582 #endif /* _SYSCALL32_IMPL */ 1583 1584 if (ssize == 0) 1585 return (EINVAL); 1586 1587 cbplist = kmem_alloc(ssize, KM_SLEEP); 1588 1589 if (model == DATAMODEL_NATIVE) 1590 ucbp = (aiocb_t **)cbplist; 1591 #ifdef _SYSCALL32_IMPL 1592 else 1593 ucbp32 = (caddr32_t *)cbplist; 1594 #endif /* _SYSCALL32_IMPL */ 1595 1596 if (copyin(aiocb, cbplist, ssize)) { 1597 error = EFAULT; 1598 goto done; 1599 } 1600 1601 /* 1602 * To find the list head, we go through the 1603 * list of aiocb structs, find the request 1604 * its for, then get the list head that reqp 1605 * points to 1606 */ 1607 head = NULL; 1608 1609 for (i = 0; i < nent; i++) { 1610 if (model == DATAMODEL_NATIVE) { 1611 /* 1612 * Since we are only checking for a NULL pointer 1613 * Following should work on both native data sizes 1614 * as well as for largefile aiocb. 1615 */ 1616 if ((cbp = *ucbp++) == NULL) 1617 continue; 1618 if (run_mode != AIO_LARGEFILE) 1619 if (head = aio_list_get(&cbp->aio_resultp)) 1620 break; 1621 else { 1622 /* 1623 * This is a case when largefile call is 1624 * made on 32 bit kernel. 
1625 * Treat each pointer as pointer to 1626 * aiocb64_32 1627 */ 1628 if (head = aio_list_get((aio_result_t *) 1629 &(((aiocb64_32_t *)cbp)->aio_resultp))) 1630 break; 1631 } 1632 } 1633 #ifdef _SYSCALL32_IMPL 1634 else { 1635 if (run_mode == AIO_LARGEFILE) { 1636 if ((cbp64 = (aiocb64_32_t *) 1637 (uintptr_t)*ucbp32++) == NULL) 1638 continue; 1639 if (head = aio_list_get((aio_result_t *) 1640 &cbp64->aio_resultp)) 1641 break; 1642 } else if (run_mode == AIO_32) { 1643 if ((cbp32 = (aiocb32_t *) 1644 (uintptr_t)*ucbp32++) == NULL) 1645 continue; 1646 if (head = aio_list_get((aio_result_t *) 1647 &cbp32->aio_resultp)) 1648 break; 1649 } 1650 } 1651 #endif /* _SYSCALL32_IMPL */ 1652 } 1653 1654 if (head == NULL) { 1655 error = EINVAL; 1656 goto done; 1657 } 1658 1659 mutex_enter(&aiop->aio_mutex); 1660 while (head->lio_refcnt > 0) { 1661 if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) { 1662 mutex_exit(&aiop->aio_mutex); 1663 error = EINTR; 1664 goto done; 1665 } 1666 } 1667 mutex_exit(&aiop->aio_mutex); 1668 alio_cleanup(aiop, (aiocb_t **)cbplist, nent, run_mode); 1669 done: 1670 kmem_free(cbplist, ssize); 1671 return (error); 1672 } 1673 1674 aio_lio_t * 1675 aio_list_get(aio_result_t *resultp) 1676 { 1677 aio_lio_t *head = NULL; 1678 aio_t *aiop; 1679 aio_req_t **bucket; 1680 aio_req_t *reqp; 1681 long index; 1682 1683 aiop = curproc->p_aio; 1684 if (aiop == NULL) 1685 return (NULL); 1686 1687 if (resultp) { 1688 index = AIO_HASH(resultp); 1689 bucket = &aiop->aio_hash[index]; 1690 for (reqp = *bucket; reqp != NULL; 1691 reqp = reqp->aio_hash_next) { 1692 if (reqp->aio_req_resultp == resultp) { 1693 head = reqp->aio_req_lio; 1694 return (head); 1695 } 1696 } 1697 } 1698 return (NULL); 1699 } 1700 1701 1702 static void 1703 lio_set_uerror(void *resultp, int error) 1704 { 1705 /* 1706 * the resultp field is a pointer to where the 1707 * error should be written out to the user's 1708 * aiocb. 1709 * 1710 */ 1711 if (get_udatamodel() == DATAMODEL_NATIVE) { 1712 (void) sulword(&((aio_result_t *)resultp)->aio_return, 1713 (ssize_t)-1); 1714 (void) suword32(&((aio_result_t *)resultp)->aio_errno, error); 1715 } 1716 #ifdef _SYSCALL32_IMPL 1717 else { 1718 (void) suword32(&((aio_result32_t *)resultp)->aio_return, 1719 (uint_t)-1); 1720 (void) suword32(&((aio_result32_t *)resultp)->aio_errno, error); 1721 } 1722 #endif /* _SYSCALL32_IMPL */ 1723 } 1724 1725 /* 1726 * do cleanup completion for all requests in list. memory for 1727 * each request is also freed. 
1728 */ 1729 static void 1730 alio_cleanup(aio_t *aiop, aiocb_t **cbp, int nent, int run_mode) 1731 { 1732 int i; 1733 aio_req_t *reqp; 1734 aio_result_t *resultp; 1735 aiocb64_32_t *aiocb_64; 1736 1737 for (i = 0; i < nent; i++) { 1738 if (get_udatamodel() == DATAMODEL_NATIVE) { 1739 if (cbp[i] == NULL) 1740 continue; 1741 if (run_mode == AIO_LARGEFILE) { 1742 aiocb_64 = (aiocb64_32_t *)cbp[i]; 1743 resultp = (aio_result_t *) 1744 &aiocb_64->aio_resultp; 1745 } else 1746 resultp = &cbp[i]->aio_resultp; 1747 } 1748 #ifdef _SYSCALL32_IMPL 1749 else { 1750 aiocb32_t *aiocb_32; 1751 caddr32_t *cbp32; 1752 1753 cbp32 = (caddr32_t *)cbp; 1754 if (cbp32[i] == NULL) 1755 continue; 1756 if (run_mode == AIO_32) { 1757 aiocb_32 = (aiocb32_t *)(uintptr_t)cbp32[i]; 1758 resultp = (aio_result_t *)&aiocb_32-> 1759 aio_resultp; 1760 } else if (run_mode == AIO_LARGEFILE) { 1761 aiocb_64 = (aiocb64_32_t *)(uintptr_t)cbp32[i]; 1762 resultp = (aio_result_t *)&aiocb_64-> 1763 aio_resultp; 1764 } 1765 } 1766 #endif /* _SYSCALL32_IMPL */ 1767 /* 1768 * we need to get the aio_cleanupq_mutex since we call 1769 * aio_req_done(). 1770 */ 1771 mutex_enter(&aiop->aio_cleanupq_mutex); 1772 mutex_enter(&aiop->aio_mutex); 1773 reqp = aio_req_done(resultp); 1774 mutex_exit(&aiop->aio_mutex); 1775 mutex_exit(&aiop->aio_cleanupq_mutex); 1776 if (reqp != NULL) { 1777 aphysio_unlock(reqp); 1778 aio_copyout_result(reqp); 1779 mutex_enter(&aiop->aio_mutex); 1780 aio_req_free(aiop, reqp); 1781 mutex_exit(&aiop->aio_mutex); 1782 } 1783 } 1784 } 1785 1786 /* 1787 * Write out the results for an aio request that is done. 1788 */ 1789 static int 1790 aioerror(void *cb, int run_mode) 1791 { 1792 aio_result_t *resultp; 1793 aio_t *aiop; 1794 aio_req_t *reqp; 1795 int retval; 1796 1797 aiop = curproc->p_aio; 1798 if (aiop == NULL || cb == NULL) 1799 return (EINVAL); 1800 1801 if (get_udatamodel() == DATAMODEL_NATIVE) { 1802 if (run_mode == AIO_LARGEFILE) 1803 resultp = (aio_result_t *)&((aiocb64_32_t *)cb)-> 1804 aio_resultp; 1805 else 1806 resultp = &((aiocb_t *)cb)->aio_resultp; 1807 } 1808 #ifdef _SYSCALL32_IMPL 1809 else { 1810 if (run_mode == AIO_LARGEFILE) 1811 resultp = (aio_result_t *)&((aiocb64_32_t *)cb)-> 1812 aio_resultp; 1813 else if (run_mode == AIO_32) 1814 resultp = (aio_result_t *)&((aiocb32_t *)cb)-> 1815 aio_resultp; 1816 } 1817 #endif /* _SYSCALL32_IMPL */ 1818 /* 1819 * we need to get the aio_cleanupq_mutex since we call 1820 * aio_req_find(). 
1821 */ 1822 mutex_enter(&aiop->aio_cleanupq_mutex); 1823 mutex_enter(&aiop->aio_mutex); 1824 retval = aio_req_find(resultp, &reqp); 1825 mutex_exit(&aiop->aio_mutex); 1826 mutex_exit(&aiop->aio_cleanupq_mutex); 1827 if (retval == 0) { 1828 aphysio_unlock(reqp); 1829 aio_copyout_result(reqp); 1830 mutex_enter(&aiop->aio_mutex); 1831 aio_req_free(aiop, reqp); 1832 mutex_exit(&aiop->aio_mutex); 1833 return (0); 1834 } else if (retval == 1) 1835 return (EINPROGRESS); 1836 else if (retval == 2) 1837 return (EINVAL); 1838 return (0); 1839 } 1840 1841 /* 1842 * aio_cancel - if no requests outstanding, 1843 * return AIO_ALLDONE 1844 * else 1845 * return AIO_NOTCANCELED 1846 */ 1847 static int 1848 aio_cancel( 1849 int fildes, 1850 void *cb, 1851 long *rval, 1852 int run_mode) 1853 { 1854 aio_t *aiop; 1855 void *resultp; 1856 int index; 1857 aio_req_t **bucket; 1858 aio_req_t *ent; 1859 1860 1861 /* 1862 * Verify valid file descriptor 1863 */ 1864 if ((getf(fildes)) == NULL) { 1865 return (EBADF); 1866 } 1867 releasef(fildes); 1868 1869 aiop = curproc->p_aio; 1870 if (aiop == NULL) 1871 return (EINVAL); 1872 1873 if (aiop->aio_outstanding == 0) { 1874 *rval = AIO_ALLDONE; 1875 return (0); 1876 } 1877 1878 mutex_enter(&aiop->aio_mutex); 1879 if (cb != NULL) { 1880 if (get_udatamodel() == DATAMODEL_NATIVE) { 1881 if (run_mode == AIO_LARGEFILE) 1882 resultp = (aio_result_t *)&((aiocb64_32_t *)cb) 1883 ->aio_resultp; 1884 else 1885 resultp = &((aiocb_t *)cb)->aio_resultp; 1886 } 1887 #ifdef _SYSCALL32_IMPL 1888 else { 1889 if (run_mode == AIO_LARGEFILE) 1890 resultp = (aio_result_t *)&((aiocb64_32_t *)cb) 1891 ->aio_resultp; 1892 else if (run_mode == AIO_32) 1893 resultp = (aio_result_t *)&((aiocb32_t *)cb) 1894 ->aio_resultp; 1895 } 1896 #endif /* _SYSCALL32_IMPL */ 1897 index = AIO_HASH(resultp); 1898 bucket = &aiop->aio_hash[index]; 1899 for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) { 1900 if (ent->aio_req_resultp == resultp) { 1901 if ((ent->aio_req_flags & AIO_PENDING) == 0) { 1902 mutex_exit(&aiop->aio_mutex); 1903 *rval = AIO_ALLDONE; 1904 return (0); 1905 } 1906 mutex_exit(&aiop->aio_mutex); 1907 *rval = AIO_NOTCANCELED; 1908 return (0); 1909 } 1910 } 1911 mutex_exit(&aiop->aio_mutex); 1912 *rval = AIO_ALLDONE; 1913 return (0); 1914 } 1915 1916 for (index = 0; index < AIO_HASHSZ; index++) { 1917 bucket = &aiop->aio_hash[index]; 1918 for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) { 1919 if (ent->aio_req_fd == fildes) { 1920 if ((ent->aio_req_flags & AIO_PENDING) != 0) { 1921 mutex_exit(&aiop->aio_mutex); 1922 *rval = AIO_NOTCANCELED; 1923 return (0); 1924 } 1925 } 1926 } 1927 } 1928 mutex_exit(&aiop->aio_mutex); 1929 *rval = AIO_ALLDONE; 1930 return (0); 1931 } 1932 1933 /* 1934 * solaris version of asynchronous read and write 1935 */ 1936 static int 1937 arw( 1938 int opcode, 1939 int fdes, 1940 char *bufp, 1941 int bufsize, 1942 offset_t offset, 1943 aio_result_t *resultp, 1944 int mode) 1945 { 1946 file_t *fp; 1947 int error; 1948 struct vnode *vp; 1949 aio_req_t *reqp; 1950 aio_t *aiop; 1951 int (*aio_func)(); 1952 #ifdef _LP64 1953 aiocb_t aiocb; 1954 #else 1955 aiocb64_32_t aiocb64; 1956 #endif 1957 1958 aiop = curproc->p_aio; 1959 if (aiop == NULL) 1960 return (EINVAL); 1961 1962 if ((fp = getf(fdes)) == NULL) { 1963 return (EBADF); 1964 } 1965 1966 /* 1967 * check the permission of the partition 1968 */ 1969 if ((fp->f_flag & mode) == 0) { 1970 releasef(fdes); 1971 return (EBADF); 1972 } 1973 1974 vp = fp->f_vnode; 1975 aio_func = check_vp(vp, mode); 1976 if 
(aio_func == NULL) { 1977 releasef(fdes); 1978 return (EBADFD); 1979 } 1980 #ifdef _LP64 1981 aiocb.aio_fildes = fdes; 1982 aiocb.aio_buf = bufp; 1983 aiocb.aio_nbytes = bufsize; 1984 aiocb.aio_offset = offset; 1985 aiocb.aio_sigevent.sigev_notify = 0; 1986 error = aio_req_setup(&reqp, aiop, &aiocb, resultp, vp, 1); 1987 #else 1988 aiocb64.aio_fildes = fdes; 1989 aiocb64.aio_buf = (caddr32_t)bufp; 1990 aiocb64.aio_nbytes = bufsize; 1991 aiocb64.aio_offset = offset; 1992 aiocb64.aio_sigevent.sigev_notify = 0; 1993 error = aio_req_setupLF(&reqp, aiop, &aiocb64, resultp, vp, 1); 1994 #endif 1995 if (error) { 1996 releasef(fdes); 1997 return (error); 1998 } 1999 2000 /* 2001 * enable polling on this request if the opcode has 2002 * the AIO poll bit set 2003 */ 2004 if (opcode & AIO_POLL_BIT) 2005 reqp->aio_req_flags |= AIO_POLL; 2006 2007 if (bufsize == 0) { 2008 clear_active_fd(fdes); 2009 aio_zerolen(reqp); 2010 return (0); 2011 } 2012 /* 2013 * send the request to driver. 2014 */ 2015 error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req, CRED()); 2016 /* 2017 * the fd is stored in the aio_req_t by aio_req_setup(), and 2018 * is released by the aio_cleanup_thread() when the IO has 2019 * completed. 2020 */ 2021 if (error) { 2022 releasef(fdes); 2023 mutex_enter(&aiop->aio_mutex); 2024 aio_req_free(aiop, reqp); 2025 aiop->aio_pending--; 2026 if (aiop->aio_flags & AIO_REQ_BLOCK) 2027 cv_signal(&aiop->aio_cleanupcv); 2028 mutex_exit(&aiop->aio_mutex); 2029 return (error); 2030 } 2031 clear_active_fd(fdes); 2032 return (0); 2033 } 2034 2035 /* 2036 * posix version of asynchronous read and write 2037 */ 2038 static int 2039 aiorw( 2040 int opcode, 2041 void *aiocb_arg, 2042 int mode, 2043 int run_mode) 2044 { 2045 #ifdef _SYSCALL32_IMPL 2046 aiocb32_t aiocb32; 2047 struct sigevent32 *sigev32; 2048 port_notify32_t pntfy32; 2049 #endif 2050 aiocb64_32_t aiocb64; 2051 aiocb_t aiocb; 2052 file_t *fp; 2053 int error, fd; 2054 size_t bufsize; 2055 struct vnode *vp; 2056 aio_req_t *reqp; 2057 aio_t *aiop; 2058 int (*aio_func)(); 2059 aio_result_t *resultp; 2060 struct sigevent *sigev; 2061 model_t model; 2062 int aio_use_port = 0; 2063 port_notify_t pntfy; 2064 2065 model = get_udatamodel(); 2066 aiop = curproc->p_aio; 2067 if (aiop == NULL) 2068 return (EINVAL); 2069 2070 if (model == DATAMODEL_NATIVE) { 2071 if (run_mode != AIO_LARGEFILE) { 2072 if (copyin(aiocb_arg, &aiocb, sizeof (aiocb_t))) 2073 return (EFAULT); 2074 bufsize = aiocb.aio_nbytes; 2075 resultp = &(((aiocb_t *)aiocb_arg)->aio_resultp); 2076 if ((fp = getf(fd = aiocb.aio_fildes)) == NULL) { 2077 return (EBADF); 2078 } 2079 sigev = &aiocb.aio_sigevent; 2080 } else { 2081 /* 2082 * We come here only when we make largefile 2083 * call on 32 bit kernel using 32 bit library. 
2084 */ 2085 if (copyin(aiocb_arg, &aiocb64, sizeof (aiocb64_32_t))) 2086 return (EFAULT); 2087 bufsize = aiocb64.aio_nbytes; 2088 resultp = (aio_result_t *)&(((aiocb64_32_t *)aiocb_arg) 2089 ->aio_resultp); 2090 if ((fp = getf(fd = aiocb64.aio_fildes)) == NULL) 2091 return (EBADF); 2092 sigev = (struct sigevent *)&aiocb64.aio_sigevent; 2093 } 2094 2095 if (sigev->sigev_notify == SIGEV_PORT) { 2096 if (copyin((void *)sigev->sigev_value.sival_ptr, 2097 &pntfy, sizeof (port_notify_t))) { 2098 releasef(fd); 2099 return (EFAULT); 2100 } 2101 aio_use_port = 1; 2102 } else if (sigev->sigev_notify == SIGEV_THREAD) { 2103 pntfy.portnfy_port = aiocb.aio_sigevent.sigev_signo; 2104 pntfy.portnfy_user = 2105 aiocb.aio_sigevent.sigev_value.sival_ptr; 2106 aio_use_port = 1; 2107 } 2108 } 2109 #ifdef _SYSCALL32_IMPL 2110 else { 2111 if (run_mode == AIO_32) { 2112 /* 32 bit system call is being made on 64 bit kernel */ 2113 if (copyin(aiocb_arg, &aiocb32, sizeof (aiocb32_t))) 2114 return (EFAULT); 2115 2116 bufsize = aiocb32.aio_nbytes; 2117 aiocb_32ton(&aiocb32, &aiocb); 2118 resultp = (aio_result_t *)&(((aiocb32_t *)aiocb_arg)-> 2119 aio_resultp); 2120 if ((fp = getf(fd = aiocb32.aio_fildes)) == NULL) { 2121 return (EBADF); 2122 } 2123 sigev32 = &aiocb32.aio_sigevent; 2124 } else if (run_mode == AIO_LARGEFILE) { 2125 /* 2126 * We come here only when we make largefile 2127 * call on 64 bit kernel using 32 bit library. 2128 */ 2129 if (copyin(aiocb_arg, &aiocb64, sizeof (aiocb64_32_t))) 2130 return (EFAULT); 2131 bufsize = aiocb64.aio_nbytes; 2132 aiocb_LFton(&aiocb64, &aiocb); 2133 resultp = (aio_result_t *)&(((aiocb64_32_t *)aiocb_arg) 2134 ->aio_resultp); 2135 if ((fp = getf(fd = aiocb64.aio_fildes)) == NULL) 2136 return (EBADF); 2137 sigev32 = &aiocb64.aio_sigevent; 2138 } 2139 2140 if (sigev32->sigev_notify == SIGEV_PORT) { 2141 if (copyin( 2142 (void *)(uintptr_t)sigev32->sigev_value.sival_ptr, 2143 &pntfy32, sizeof (port_notify32_t))) { 2144 releasef(fd); 2145 return (EFAULT); 2146 } 2147 pntfy.portnfy_port = pntfy32.portnfy_port; 2148 pntfy.portnfy_user = (void *)(uintptr_t) 2149 pntfy32.portnfy_user; 2150 aio_use_port = 1; 2151 } else if (sigev32->sigev_notify == SIGEV_THREAD) { 2152 pntfy.portnfy_port = sigev32->sigev_signo; 2153 pntfy.portnfy_user = (void *)(uintptr_t) 2154 sigev32->sigev_value.sival_ptr; 2155 aio_use_port = 1; 2156 } 2157 } 2158 #endif /* _SYSCALL32_IMPL */ 2159 2160 /* 2161 * check the permission of the partition 2162 */ 2163 2164 if ((fp->f_flag & mode) == 0) { 2165 releasef(fd); 2166 return (EBADF); 2167 } 2168 2169 vp = fp->f_vnode; 2170 aio_func = check_vp(vp, mode); 2171 if (aio_func == NULL) { 2172 releasef(fd); 2173 return (EBADFD); 2174 } 2175 if (run_mode == AIO_LARGEFILE) 2176 error = aio_req_setupLF(&reqp, aiop, &aiocb64, resultp, vp, 0); 2177 else 2178 error = aio_req_setup(&reqp, aiop, &aiocb, resultp, vp, 0); 2179 2180 if (error) { 2181 releasef(fd); 2182 return (error); 2183 } 2184 /* 2185 * enable polling on this request if the opcode has 2186 * the AIO poll bit set 2187 */ 2188 if (opcode & AIO_POLL_BIT) 2189 reqp->aio_req_flags |= AIO_POLL; 2190 2191 if (model == DATAMODEL_NATIVE) 2192 reqp->aio_req_iocb.iocb = aiocb_arg; 2193 #ifdef _SYSCALL32_IMPL 2194 else 2195 reqp->aio_req_iocb.iocb32 = (caddr32_t)(uintptr_t)aiocb_arg; 2196 #endif 2197 2198 if (aio_use_port) { 2199 int event = (run_mode == AIO_LARGEFILE)? 2200 ((mode == FREAD)? AIOAREAD64 : AIOAWRITE64) : 2201 ((mode == FREAD)? 
AIOAREAD : AIOAWRITE); 2202 error = aio_req_assoc_port_rw(&pntfy, aiocb_arg, reqp, event); 2203 } 2204 2205 /* 2206 * send the request to driver. 2207 */ 2208 if (error == 0) { 2209 if (bufsize == 0) { 2210 clear_active_fd(fd); 2211 aio_zerolen(reqp); 2212 return (0); 2213 } 2214 error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req, CRED()); 2215 } 2216 2217 /* 2218 * the fd is stored in the aio_req_t by aio_req_setup(), and 2219 * is released by the aio_cleanup_thread() when the IO has 2220 * completed. 2221 */ 2222 if (error) { 2223 releasef(fd); 2224 mutex_enter(&aiop->aio_mutex); 2225 if (aio_use_port) 2226 aio_deq(&aiop->aio_portpending, reqp); 2227 aio_req_free(aiop, reqp); 2228 aiop->aio_pending--; 2229 if (aiop->aio_flags & AIO_REQ_BLOCK) 2230 cv_signal(&aiop->aio_cleanupcv); 2231 mutex_exit(&aiop->aio_mutex); 2232 return (error); 2233 } 2234 clear_active_fd(fd); 2235 return (0); 2236 } 2237 2238 2239 /* 2240 * set error for a list IO entry that failed. 2241 */ 2242 static void 2243 lio_set_error(aio_req_t *reqp, int portused) 2244 { 2245 aio_t *aiop = curproc->p_aio; 2246 2247 if (aiop == NULL) 2248 return; 2249 2250 mutex_enter(&aiop->aio_mutex); 2251 if (portused) 2252 aio_deq(&aiop->aio_portpending, reqp); 2253 aiop->aio_pending--; 2254 /* request failed, AIO_PHYSIODONE set to aviod physio cleanup. */ 2255 reqp->aio_req_flags |= AIO_PHYSIODONE; 2256 /* 2257 * Need to free the request now as its never 2258 * going to get on the done queue 2259 * 2260 * Note: aio_outstanding is decremented in 2261 * aio_req_free() 2262 */ 2263 aio_req_free(aiop, reqp); 2264 if (aiop->aio_flags & AIO_REQ_BLOCK) 2265 cv_signal(&aiop->aio_cleanupcv); 2266 mutex_exit(&aiop->aio_mutex); 2267 } 2268 2269 /* 2270 * check if a specified request is done, and remove it from 2271 * the done queue. otherwise remove anybody from the done queue 2272 * if NULL is specified. 2273 */ 2274 static aio_req_t * 2275 aio_req_done(void *resultp) 2276 { 2277 aio_req_t **bucket; 2278 aio_req_t *ent; 2279 aio_t *aiop = curproc->p_aio; 2280 long index; 2281 2282 ASSERT(MUTEX_HELD(&aiop->aio_cleanupq_mutex)); 2283 ASSERT(MUTEX_HELD(&aiop->aio_mutex)); 2284 2285 if (resultp) { 2286 index = AIO_HASH(resultp); 2287 bucket = &aiop->aio_hash[index]; 2288 for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) { 2289 if (ent->aio_req_resultp == (aio_result_t *)resultp) { 2290 if (ent->aio_req_flags & AIO_DONEQ) { 2291 return (aio_req_remove(ent)); 2292 } 2293 return (NULL); 2294 } 2295 } 2296 /* no match, resultp is invalid */ 2297 return (NULL); 2298 } 2299 return (aio_req_remove(NULL)); 2300 } 2301 2302 /* 2303 * determine if a user-level resultp pointer is associated with an 2304 * active IO request. Zero is returned when the request is done, 2305 * and the request is removed from the done queue. Only when the 2306 * return value is zero, is the "reqp" pointer valid. One is returned 2307 * when the request is inprogress. Two is returned when the request 2308 * is invalid. 
2309 */ 2310 static int 2311 aio_req_find(aio_result_t *resultp, aio_req_t **reqp) 2312 { 2313 aio_req_t **bucket; 2314 aio_req_t *ent; 2315 aio_t *aiop = curproc->p_aio; 2316 long index; 2317 2318 ASSERT(MUTEX_HELD(&aiop->aio_cleanupq_mutex)); 2319 ASSERT(MUTEX_HELD(&aiop->aio_mutex)); 2320 2321 index = AIO_HASH(resultp); 2322 bucket = &aiop->aio_hash[index]; 2323 for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) { 2324 if (ent->aio_req_resultp == resultp) { 2325 if (ent->aio_req_flags & AIO_DONEQ) { 2326 *reqp = aio_req_remove(ent); 2327 return (0); 2328 } 2329 return (1); 2330 } 2331 } 2332 /* no match, resultp is invalid */ 2333 return (2); 2334 } 2335 2336 /* 2337 * remove a request from the done queue. 2338 */ 2339 static aio_req_t * 2340 aio_req_remove(aio_req_t *reqp) 2341 { 2342 aio_t *aiop = curproc->p_aio; 2343 2344 ASSERT(MUTEX_HELD(&aiop->aio_mutex)); 2345 2346 if (reqp != NULL) { 2347 ASSERT(reqp->aio_req_flags & AIO_DONEQ); 2348 if (reqp->aio_req_next == reqp) { 2349 /* only one request on queue */ 2350 if (reqp == aiop->aio_doneq) { 2351 aiop->aio_doneq = NULL; 2352 } else { 2353 ASSERT(reqp == aiop->aio_cleanupq); 2354 aiop->aio_cleanupq = NULL; 2355 } 2356 } else { 2357 reqp->aio_req_next->aio_req_prev = reqp->aio_req_prev; 2358 reqp->aio_req_prev->aio_req_next = reqp->aio_req_next; 2359 /* 2360 * The request can be either on the aio_doneq or the 2361 * aio_cleanupq 2362 */ 2363 if (reqp == aiop->aio_doneq) 2364 aiop->aio_doneq = reqp->aio_req_next; 2365 2366 if (reqp == aiop->aio_cleanupq) 2367 aiop->aio_cleanupq = reqp->aio_req_next; 2368 } 2369 reqp->aio_req_flags &= ~AIO_DONEQ; 2370 reqp->aio_req_next = NULL; 2371 reqp->aio_req_prev = NULL; 2372 } else if ((reqp = aiop->aio_doneq) != NULL) { 2373 ASSERT(reqp->aio_req_flags & AIO_DONEQ); 2374 if (reqp == reqp->aio_req_next) { 2375 /* only one request on queue */ 2376 aiop->aio_doneq = NULL; 2377 } else { 2378 reqp->aio_req_prev->aio_req_next = reqp->aio_req_next; 2379 reqp->aio_req_next->aio_req_prev = reqp->aio_req_prev; 2380 aiop->aio_doneq = reqp->aio_req_next; 2381 } 2382 reqp->aio_req_flags &= ~AIO_DONEQ; 2383 reqp->aio_req_next = NULL; 2384 reqp->aio_req_prev = NULL; 2385 } 2386 if (aiop->aio_doneq == NULL && (aiop->aio_flags & AIO_WAITN)) 2387 cv_broadcast(&aiop->aio_waitcv); 2388 return (reqp); 2389 } 2390 2391 static int 2392 aio_req_setup( 2393 aio_req_t **reqpp, 2394 aio_t *aiop, 2395 aiocb_t *arg, 2396 aio_result_t *resultp, 2397 vnode_t *vp, 2398 int old_solaris_req) 2399 { 2400 sigqueue_t *sqp = NULL; 2401 aio_req_t *reqp; 2402 struct uio *uio; 2403 struct sigevent *sigev; 2404 int error; 2405 2406 sigev = &arg->aio_sigevent; 2407 if (sigev->sigev_notify == SIGEV_SIGNAL && 2408 sigev->sigev_signo > 0 && sigev->sigev_signo < NSIG) { 2409 sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP); 2410 if (sqp == NULL) 2411 return (EAGAIN); 2412 sqp->sq_func = NULL; 2413 sqp->sq_next = NULL; 2414 sqp->sq_info.si_code = SI_ASYNCIO; 2415 sqp->sq_info.si_pid = curproc->p_pid; 2416 sqp->sq_info.si_ctid = PRCTID(curproc); 2417 sqp->sq_info.si_zoneid = getzoneid(); 2418 sqp->sq_info.si_uid = crgetuid(curproc->p_cred); 2419 sqp->sq_info.si_signo = sigev->sigev_signo; 2420 sqp->sq_info.si_value = sigev->sigev_value; 2421 } 2422 2423 mutex_enter(&aiop->aio_mutex); 2424 2425 if (aiop->aio_flags & AIO_REQ_BLOCK) { 2426 mutex_exit(&aiop->aio_mutex); 2427 if (sqp) 2428 kmem_free(sqp, sizeof (sigqueue_t)); 2429 return (EIO); 2430 } 2431 /* 2432 * get an aio_reqp from the free list or allocate one 2433 * from dynamic 
memory. 2434 */ 2435 if (error = aio_req_alloc(&reqp, resultp)) { 2436 mutex_exit(&aiop->aio_mutex); 2437 if (sqp) 2438 kmem_free(sqp, sizeof (sigqueue_t)); 2439 return (error); 2440 } 2441 aiop->aio_pending++; 2442 aiop->aio_outstanding++; 2443 reqp->aio_req_flags = AIO_PENDING; 2444 if (old_solaris_req) { 2445 /* this is an old solaris aio request */ 2446 reqp->aio_req_flags |= AIO_SOLARIS; 2447 aiop->aio_flags |= AIO_SOLARIS_REQ; 2448 } 2449 if (sigev->sigev_notify == SIGEV_THREAD || 2450 sigev->sigev_notify == SIGEV_PORT) 2451 aio_enq(&aiop->aio_portpending, reqp, 0); 2452 mutex_exit(&aiop->aio_mutex); 2453 /* 2454 * initialize aio request. 2455 */ 2456 reqp->aio_req_fd = arg->aio_fildes; 2457 reqp->aio_req_sigqp = sqp; 2458 reqp->aio_req_iocb.iocb = NULL; 2459 reqp->aio_req_lio = NULL; 2460 reqp->aio_req_buf.b_file = vp; 2461 uio = reqp->aio_req.aio_uio; 2462 uio->uio_iovcnt = 1; 2463 uio->uio_iov->iov_base = (caddr_t)arg->aio_buf; 2464 uio->uio_iov->iov_len = arg->aio_nbytes; 2465 uio->uio_loffset = arg->aio_offset; 2466 *reqpp = reqp; 2467 return (0); 2468 } 2469 2470 /* 2471 * Allocate p_aio struct. 2472 */ 2473 static aio_t * 2474 aio_aiop_alloc(void) 2475 { 2476 aio_t *aiop; 2477 2478 ASSERT(MUTEX_HELD(&curproc->p_lock)); 2479 2480 aiop = kmem_zalloc(sizeof (struct aio), KM_NOSLEEP); 2481 if (aiop) { 2482 mutex_init(&aiop->aio_mutex, NULL, MUTEX_DEFAULT, NULL); 2483 mutex_init(&aiop->aio_cleanupq_mutex, NULL, MUTEX_DEFAULT, 2484 NULL); 2485 mutex_init(&aiop->aio_portq_mutex, NULL, MUTEX_DEFAULT, NULL); 2486 } 2487 return (aiop); 2488 } 2489 2490 /* 2491 * Allocate an aio_req struct. 2492 */ 2493 static int 2494 aio_req_alloc(aio_req_t **nreqp, aio_result_t *resultp) 2495 { 2496 aio_req_t *reqp; 2497 aio_t *aiop = curproc->p_aio; 2498 2499 ASSERT(MUTEX_HELD(&aiop->aio_mutex)); 2500 2501 if ((reqp = aiop->aio_free) != NULL) { 2502 aiop->aio_free = reqp->aio_req_next; 2503 bzero(reqp, sizeof (*reqp)); 2504 } else { 2505 /* 2506 * Check whether memory is getting tight. 2507 * This is a temporary mechanism to avoid memory 2508 * exhaustion by a single process until we come up 2509 * with a per process solution such as setrlimit(). 2510 */ 2511 if (freemem < desfree) 2512 return (EAGAIN); 2513 reqp = kmem_zalloc(sizeof (struct aio_req_t), KM_NOSLEEP); 2514 if (reqp == NULL) 2515 return (EAGAIN); 2516 } 2517 reqp->aio_req.aio_uio = &reqp->aio_req_uio; 2518 reqp->aio_req.aio_uio->uio_iov = &reqp->aio_req_iov; 2519 reqp->aio_req.aio_private = reqp; 2520 reqp->aio_req_buf.b_offset = -1; 2521 reqp->aio_req_resultp = resultp; 2522 if (aio_hash_insert(reqp, aiop)) { 2523 reqp->aio_req_next = aiop->aio_free; 2524 aiop->aio_free = reqp; 2525 return (EBUSY); 2526 } 2527 *nreqp = reqp; 2528 return (0); 2529 } 2530 2531 /* 2532 * Allocate an aio_lio_t struct. 2533 */ 2534 static int 2535 aio_lio_alloc(aio_lio_t **head) 2536 { 2537 aio_lio_t *liop; 2538 aio_t *aiop = curproc->p_aio; 2539 2540 ASSERT(MUTEX_HELD(&aiop->aio_mutex)); 2541 2542 if ((liop = aiop->aio_lio_free) != NULL) { 2543 aiop->aio_lio_free = liop->lio_next; 2544 } else { 2545 /* 2546 * Check whether memory is getting tight. 2547 * This is a temporary mechanism to avoid memory 2548 * exhaustion by a single process until we come up 2549 * with a per process solution such as setrlimit(). 
2550 */ 2551 if (freemem < desfree) 2552 return (EAGAIN); 2553 2554 liop = kmem_zalloc(sizeof (aio_lio_t), KM_NOSLEEP); 2555 if (liop == NULL) 2556 return (EAGAIN); 2557 } 2558 *head = liop; 2559 return (0); 2560 } 2561 2562 /* 2563 * this is a special per-process thread that is only activated if 2564 * the process is unmapping a segment with outstanding aio. Normally, 2565 * the process will have completed the aio before unmapping the 2566 * segment. If the process does unmap a segment with outstanding aio, 2567 * this special thread will guarantee that the locked pages due to 2568 * aphysio() are released, thereby permitting the segment to be 2569 * unmapped. In addition to this, the cleanup thread is woken up 2570 * during DR operations to release the locked pages. 2571 */ 2572 2573 static int 2574 aio_cleanup_thread(aio_t *aiop) 2575 { 2576 proc_t *p = curproc; 2577 struct as *as = p->p_as; 2578 int poked = 0; 2579 kcondvar_t *cvp; 2580 int exit_flag = 0; 2581 int rqclnup = 0; 2582 2583 sigfillset(&curthread->t_hold); 2584 sigdiffset(&curthread->t_hold, &cantmask); 2585 for (;;) { 2586 /* 2587 * if a segment is being unmapped, and the current 2588 * process's done queue is not empty, then every request 2589 * on the doneq with locked resources should be forced 2590 * to release its locks. By moving the doneq request 2591 * to the cleanupq, aio_cleanup() will process the cleanupq, 2592 * and place requests back onto the doneq. All requests 2593 * processed by aio_cleanup() will have their physical 2594 * resources unlocked. 2595 */ 2596 mutex_enter(&aiop->aio_mutex); 2597 if ((aiop->aio_flags & AIO_CLEANUP) == 0) { 2598 aiop->aio_flags |= AIO_CLEANUP; 2599 mutex_enter(&as->a_contents); 2600 if (aiop->aio_rqclnup) { 2601 aiop->aio_rqclnup = 0; 2602 rqclnup = 1; 2603 } 2604 mutex_exit(&as->a_contents); 2605 if (aiop->aio_doneq) { 2606 aio_req_t *doneqhead = aiop->aio_doneq; 2607 aiop->aio_doneq = NULL; 2608 aio_cleanupq_concat(aiop, doneqhead, AIO_DONEQ); 2609 } 2610 } 2611 mutex_exit(&aiop->aio_mutex); 2612 aio_cleanup(AIO_CLEANUP_THREAD); 2613 /* 2614 * thread should block on the cleanupcv while 2615 * AIO_CLEANUP is set. 2616 */ 2617 cvp = &aiop->aio_cleanupcv; 2618 mutex_enter(&aiop->aio_mutex); 2619 2620 if (aiop->aio_pollq != NULL || aiop->aio_cleanupq != NULL || 2621 aiop->aio_notifyq != NULL || 2622 aiop->aio_portcleanupq != NULL) { 2623 mutex_exit(&aiop->aio_mutex); 2624 continue; 2625 } 2626 mutex_enter(&as->a_contents); 2627 2628 /* 2629 * AIO_CLEANUP determines when the cleanup thread 2630 * should be active. This flag is set when 2631 * the cleanup thread is awakened by as_unmap() or 2632 * due to DR operations. 2633 * The flag is cleared when the blocking as_unmap() 2634 * that originally awakened us is allowed to 2635 * complete. as_unmap() blocks when trying to 2636 * unmap a segment that has SOFTLOCKed pages. When 2637 * the segment's pages are all SOFTUNLOCKed, 2638 * as->a_flags & AS_UNMAPWAIT should be zero. 2639 * 2640 * In case of cleanup request by DR, the flag is cleared 2641 * once all the pending aio requests have been processed. 2642 * 2643 * The flag shouldn't be cleared right away if the 2644 * cleanup thread was interrupted because the process 2645 * is doing forkall(). This happens when cv_wait_sig() 2646 * returns zero, because it was awakened by a pokelwps(). 2647 * If the process is not exiting, it must be doing forkall().
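 *
 * Restated (an illustrative condensation of the test that follows, not
 * additional logic): the flags are cleared only when
 *
 *	poked == 0 &&
 *	    ((!rqclnup && !AS_ISUNMAPWAIT(as)) || aiop->aio_pending == 0)
 *
 * i.e. either the unmap that woke us no longer waits on locked pages,
 * or a DR-style cleanup request has nothing left pending.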
2648 */ 2649 if ((poked == 0) && 2650 ((!rqclnup && (AS_ISUNMAPWAIT(as) == 0)) || 2651 (aiop->aio_pending == 0))) { 2652 aiop->aio_flags &= ~(AIO_CLEANUP | AIO_CLEANUP_PORT); 2653 cvp = &as->a_cv; 2654 rqclnup = 0; 2655 } 2656 mutex_exit(&aiop->aio_mutex); 2657 if (poked) { 2658 /* 2659 * If the process is exiting/killed, don't return 2660 * immediately without waiting for pending I/O's 2661 * and releasing the page locks. 2662 */ 2663 if (p->p_flag & (SEXITLWPS|SKILLED)) { 2664 /* 2665 * If exit_flag is set, then it is 2666 * safe to exit because we have released 2667 * page locks of completed I/O's. 2668 */ 2669 if (exit_flag) 2670 break; 2671 2672 mutex_exit(&as->a_contents); 2673 2674 /* 2675 * Wait for all the pending aio to complete. 2676 */ 2677 mutex_enter(&aiop->aio_mutex); 2678 aiop->aio_flags |= AIO_REQ_BLOCK; 2679 while (aiop->aio_pending != 0) 2680 cv_wait(&aiop->aio_cleanupcv, 2681 &aiop->aio_mutex); 2682 mutex_exit(&aiop->aio_mutex); 2683 exit_flag = 1; 2684 continue; 2685 } else if (p->p_flag & 2686 (SHOLDFORK|SHOLDFORK1|SHOLDWATCH)) { 2687 /* 2688 * hold LWP until it 2689 * is continued. 2690 */ 2691 mutex_exit(&as->a_contents); 2692 mutex_enter(&p->p_lock); 2693 stop(PR_SUSPENDED, SUSPEND_NORMAL); 2694 mutex_exit(&p->p_lock); 2695 poked = 0; 2696 continue; 2697 } 2698 } else { 2699 /* 2700 * When started this thread will sleep on as->a_cv. 2701 * as_unmap will awake this thread if the 2702 * segment has SOFTLOCKed pages (poked = 0). 2703 * 1. pokelwps() awakes this thread => 2704 * break the loop to check SEXITLWPS, SHOLDFORK, etc 2705 * 2. as_unmap awakes this thread => 2706 * to break the loop it is necessary that 2707 * - AS_UNMAPWAIT is set (as_unmap is waiting for 2708 * memory to be unlocked) 2709 * - AIO_CLEANUP is not set 2710 * (if AIO_CLEANUP is set we have to wait for 2711 * pending requests. aio_done will send a signal 2712 * for every request which completes to continue 2713 * unmapping the corresponding address range) 2714 * 3. A cleanup request will wake this thread up, ex. 2715 * by the DR operations. The aio_rqclnup flag will 2716 * be set. 2717 */ 2718 while (poked == 0) { 2719 /* 2720 * The clean up requests that came in 2721 * after we had just cleaned up, couldn't 2722 * be causing the unmap thread to block - as 2723 * unmap event happened first. 2724 * Let aio_done() wake us up if it sees a need. 2725 */ 2726 if (aiop->aio_rqclnup && 2727 (aiop->aio_flags & AIO_CLEANUP) == 0) 2728 break; 2729 poked = !cv_wait_sig(cvp, &as->a_contents); 2730 if (AS_ISUNMAPWAIT(as) == 0) 2731 cv_signal(cvp); 2732 if (aiop->aio_outstanding != 0) 2733 break; 2734 } 2735 } 2736 mutex_exit(&as->a_contents); 2737 } 2738 exit: 2739 mutex_exit(&as->a_contents); 2740 ASSERT((curproc->p_flag & (SEXITLWPS|SKILLED))); 2741 aston(curthread); /* make thread do post_syscall */ 2742 return (0); 2743 } 2744 2745 /* 2746 * save a reference to a user's outstanding aio in a hash list. 
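 * The list is keyed by the user-level aio_result_t address: AIO_HASH()
 * picks a bucket in aiop->aio_hash[] and the request is appended to that
 * bucket's singly-linked chain, with DUPLICATE returned if the same
 * resultp is already present. Illustrative lookup, the same walk that
 * aio_req_done() and aio_req_find() above perform:
 *
 *	for (ent = aiop->aio_hash[AIO_HASH(resultp)]; ent != NULL;
 *	    ent = ent->aio_hash_next)
 *		if (ent->aio_req_resultp == resultp)
 *			return (ent);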
2747 */ 2748 static int 2749 aio_hash_insert( 2750 aio_req_t *aio_reqp, 2751 aio_t *aiop) 2752 { 2753 long index; 2754 aio_result_t *resultp = aio_reqp->aio_req_resultp; 2755 aio_req_t *current; 2756 aio_req_t **nextp; 2757 2758 index = AIO_HASH(resultp); 2759 nextp = &aiop->aio_hash[index]; 2760 while ((current = *nextp) != NULL) { 2761 if (current->aio_req_resultp == resultp) 2762 return (DUPLICATE); 2763 nextp = ¤t->aio_hash_next; 2764 } 2765 *nextp = aio_reqp; 2766 aio_reqp->aio_hash_next = NULL; 2767 return (0); 2768 } 2769 2770 static int 2771 (*check_vp(struct vnode *vp, int mode))(vnode_t *, struct aio_req *, 2772 cred_t *) 2773 { 2774 struct snode *sp; 2775 dev_t dev; 2776 struct cb_ops *cb; 2777 major_t major; 2778 int (*aio_func)(); 2779 2780 dev = vp->v_rdev; 2781 major = getmajor(dev); 2782 2783 /* 2784 * return NULL for requests to files and STREAMs so 2785 * that libaio takes care of them. 2786 */ 2787 if (vp->v_type == VCHR) { 2788 /* no stream device for kaio */ 2789 if (STREAMSTAB(major)) { 2790 return (NULL); 2791 } 2792 } else { 2793 return (NULL); 2794 } 2795 2796 /* 2797 * Check old drivers which do not have async I/O entry points. 2798 */ 2799 if (devopsp[major]->devo_rev < 3) 2800 return (NULL); 2801 2802 cb = devopsp[major]->devo_cb_ops; 2803 2804 if (cb->cb_rev < 1) 2805 return (NULL); 2806 2807 /* 2808 * Check whether this device is a block device. 2809 * Kaio is not supported for devices like tty. 2810 */ 2811 if (cb->cb_strategy == nodev || cb->cb_strategy == NULL) 2812 return (NULL); 2813 2814 /* 2815 * Clustering: If vnode is a PXFS vnode, then the device may be remote. 2816 * We cannot call the driver directly. Instead return the 2817 * PXFS functions. 2818 */ 2819 2820 if (IS_PXFSVP(vp)) { 2821 if (mode & FREAD) 2822 return (clpxfs_aio_read); 2823 else 2824 return (clpxfs_aio_write); 2825 } 2826 if (mode & FREAD) 2827 aio_func = (cb->cb_aread == nodev) ? NULL : driver_aio_read; 2828 else 2829 aio_func = (cb->cb_awrite == nodev) ? NULL : driver_aio_write; 2830 2831 /* 2832 * Do we need this ? 2833 * nodev returns ENXIO anyway. 2834 */ 2835 if (aio_func == nodev) 2836 return (NULL); 2837 2838 sp = VTOS(vp); 2839 smark(sp, SACC); 2840 return (aio_func); 2841 } 2842 2843 /* 2844 * Clustering: We want check_vp to return a function prototyped 2845 * correctly that will be common to both PXFS and regular case. 2846 * We define this intermediate function that will do the right 2847 * thing for driver cases. 2848 */ 2849 2850 static int 2851 driver_aio_write(vnode_t *vp, struct aio_req *aio, cred_t *cred_p) 2852 { 2853 dev_t dev; 2854 struct cb_ops *cb; 2855 2856 ASSERT(vp->v_type == VCHR); 2857 ASSERT(!IS_PXFSVP(vp)); 2858 dev = VTOS(vp)->s_dev; 2859 ASSERT(STREAMSTAB(getmajor(dev)) == NULL); 2860 2861 cb = devopsp[getmajor(dev)]->devo_cb_ops; 2862 2863 ASSERT(cb->cb_awrite != nodev); 2864 return ((*cb->cb_awrite)(dev, aio, cred_p)); 2865 } 2866 2867 /* 2868 * Clustering: We want check_vp to return a function prototyped 2869 * correctly that will be common to both PXFS and regular case. 2870 * We define this intermediate function that will do the right 2871 * thing for driver cases. 
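 * Callers never reach cb_aread directly; they call through the pointer
 * that check_vp() hands back, so the PXFS and driver cases look the same
 * at the call site, as in the submission paths above:
 *
 *	error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req, CRED());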
2872 */ 2873 2874 static int 2875 driver_aio_read(vnode_t *vp, struct aio_req *aio, cred_t *cred_p) 2876 { 2877 dev_t dev; 2878 struct cb_ops *cb; 2879 2880 ASSERT(vp->v_type == VCHR); 2881 ASSERT(!IS_PXFSVP(vp)); 2882 dev = VTOS(vp)->s_dev; 2883 ASSERT(!STREAMSTAB(getmajor(dev))); 2884 2885 cb = devopsp[getmajor(dev)]->devo_cb_ops; 2886 2887 ASSERT(cb->cb_aread != nodev); 2888 return ((*cb->cb_aread)(dev, aio, cred_p)); 2889 } 2890 2891 /* 2892 * This routine is called when a largefile call is made by a 32bit 2893 * process on a ILP32 or LP64 kernel. All 64bit processes are large 2894 * file by definition and will call alio() instead. 2895 */ 2896 static int 2897 alioLF( 2898 int mode_arg, 2899 void *aiocb_arg, 2900 int nent, 2901 void *sigev) 2902 { 2903 file_t *fp; 2904 file_t *prev_fp = NULL; 2905 int prev_mode = -1; 2906 struct vnode *vp; 2907 aio_lio_t *head; 2908 aio_req_t *reqp; 2909 aio_t *aiop; 2910 caddr_t cbplist; 2911 aiocb64_32_t cb64; 2912 aiocb64_32_t *aiocb = &cb64; 2913 aiocb64_32_t *cbp; 2914 caddr32_t *ucbp; 2915 #ifdef _LP64 2916 aiocb_t aiocb_n; 2917 #endif 2918 struct sigevent32 sigevk; 2919 sigqueue_t *sqp; 2920 int (*aio_func)(); 2921 int mode; 2922 int error = 0; 2923 int aio_errors = 0; 2924 int i; 2925 size_t ssize; 2926 int deadhead = 0; 2927 int aio_notsupported = 0; 2928 int lio_head_port; 2929 int aio_port; 2930 int aio_thread; 2931 port_kevent_t *pkevtp = NULL; 2932 int portused = 0; 2933 port_notify32_t pnotify; 2934 int event; 2935 2936 aiop = curproc->p_aio; 2937 if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX) 2938 return (EINVAL); 2939 2940 ASSERT(get_udatamodel() == DATAMODEL_ILP32); 2941 2942 ssize = (sizeof (caddr32_t) * nent); 2943 cbplist = kmem_alloc(ssize, KM_SLEEP); 2944 ucbp = (caddr32_t *)cbplist; 2945 2946 if (copyin(aiocb_arg, cbplist, ssize) || 2947 (sigev && copyin(sigev, &sigevk, sizeof (sigevk)))) { 2948 kmem_free(cbplist, ssize); 2949 return (EFAULT); 2950 } 2951 2952 /* Event Ports */ 2953 if (sigev && 2954 (sigevk.sigev_notify == SIGEV_THREAD || 2955 sigevk.sigev_notify == SIGEV_PORT)) { 2956 if (sigevk.sigev_notify == SIGEV_THREAD) { 2957 pnotify.portnfy_port = sigevk.sigev_signo; 2958 pnotify.portnfy_user = sigevk.sigev_value.sival_ptr; 2959 } else if (copyin( 2960 (void *)(uintptr_t)sigevk.sigev_value.sival_ptr, 2961 &pnotify, sizeof (pnotify))) { 2962 kmem_free(cbplist, ssize); 2963 return (EFAULT); 2964 } 2965 error = port_alloc_event(pnotify.portnfy_port, 2966 PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, &pkevtp); 2967 if (error) { 2968 if (error == ENOMEM || error == EAGAIN) 2969 error = EAGAIN; 2970 else 2971 error = EINVAL; 2972 kmem_free(cbplist, ssize); 2973 return (error); 2974 } 2975 lio_head_port = pnotify.portnfy_port; 2976 portused = 1; 2977 } 2978 2979 /* 2980 * a list head should be allocated if notification is 2981 * enabled for this list. 
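 * The head starts with lio_nent and lio_refcnt equal to nent; both are
 * decremented for every aiocb that is skipped or fails, LIO_WAIT blocks
 * on lio_notify until lio_refcnt drains to zero, and the deadhead path
 * at done: frees the head if no request was ever attached to it.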
2982 */ 2983 head = NULL; 2984 2985 if (mode_arg == LIO_WAIT || sigev) { 2986 mutex_enter(&aiop->aio_mutex); 2987 error = aio_lio_alloc(&head); 2988 mutex_exit(&aiop->aio_mutex); 2989 if (error) 2990 goto done; 2991 deadhead = 1; 2992 head->lio_nent = nent; 2993 head->lio_refcnt = nent; 2994 head->lio_port = -1; 2995 head->lio_portkev = NULL; 2996 if (sigev && sigevk.sigev_notify == SIGEV_SIGNAL && 2997 sigevk.sigev_signo > 0 && sigevk.sigev_signo < NSIG) { 2998 sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP); 2999 if (sqp == NULL) { 3000 error = EAGAIN; 3001 goto done; 3002 } 3003 sqp->sq_func = NULL; 3004 sqp->sq_next = NULL; 3005 sqp->sq_info.si_code = SI_ASYNCIO; 3006 sqp->sq_info.si_pid = curproc->p_pid; 3007 sqp->sq_info.si_ctid = PRCTID(curproc); 3008 sqp->sq_info.si_zoneid = getzoneid(); 3009 sqp->sq_info.si_uid = crgetuid(curproc->p_cred); 3010 sqp->sq_info.si_signo = sigevk.sigev_signo; 3011 sqp->sq_info.si_value.sival_int = 3012 sigevk.sigev_value.sival_int; 3013 head->lio_sigqp = sqp; 3014 } else { 3015 head->lio_sigqp = NULL; 3016 } 3017 if (pkevtp) { 3018 /* 3019 * Prepare data to send when list of aiocb's 3020 * has completed. 3021 */ 3022 port_init_event(pkevtp, (uintptr_t)sigev, 3023 (void *)(uintptr_t)pnotify.portnfy_user, 3024 NULL, head); 3025 pkevtp->portkev_events = AIOLIO64; 3026 head->lio_portkev = pkevtp; 3027 head->lio_port = pnotify.portnfy_port; 3028 } 3029 } 3030 3031 for (i = 0; i < nent; i++, ucbp++) { 3032 3033 cbp = (aiocb64_32_t *)(uintptr_t)*ucbp; 3034 /* skip entry if it can't be copied. */ 3035 if (cbp == NULL || copyin(cbp, aiocb, sizeof (*aiocb))) { 3036 if (head) { 3037 mutex_enter(&aiop->aio_mutex); 3038 head->lio_nent--; 3039 head->lio_refcnt--; 3040 mutex_exit(&aiop->aio_mutex); 3041 } 3042 continue; 3043 } 3044 3045 /* skip if opcode for aiocb is LIO_NOP */ 3046 mode = aiocb->aio_lio_opcode; 3047 if (mode == LIO_NOP) { 3048 cbp = NULL; 3049 if (head) { 3050 mutex_enter(&aiop->aio_mutex); 3051 head->lio_nent--; 3052 head->lio_refcnt--; 3053 mutex_exit(&aiop->aio_mutex); 3054 } 3055 continue; 3056 } 3057 3058 /* increment file descriptor's ref count. 
*/ 3059 if ((fp = getf(aiocb->aio_fildes)) == NULL) { 3060 lio_set_uerror(&cbp->aio_resultp, EBADF); 3061 if (head) { 3062 mutex_enter(&aiop->aio_mutex); 3063 head->lio_nent--; 3064 head->lio_refcnt--; 3065 mutex_exit(&aiop->aio_mutex); 3066 } 3067 aio_errors++; 3068 continue; 3069 } 3070 3071 /* 3072 * check the permission of the partition 3073 */ 3074 if ((fp->f_flag & mode) == 0) { 3075 releasef(aiocb->aio_fildes); 3076 lio_set_uerror(&cbp->aio_resultp, EBADF); 3077 if (head) { 3078 mutex_enter(&aiop->aio_mutex); 3079 head->lio_nent--; 3080 head->lio_refcnt--; 3081 mutex_exit(&aiop->aio_mutex); 3082 } 3083 aio_errors++; 3084 continue; 3085 } 3086 3087 /* 3088 * common case where requests are to the same fd 3089 * for the same r/w operation 3090 * for UFS, need to set EBADFD 3091 */ 3092 vp = fp->f_vnode; 3093 if (fp != prev_fp || mode != prev_mode) { 3094 aio_func = check_vp(vp, mode); 3095 if (aio_func == NULL) { 3096 prev_fp = NULL; 3097 releasef(aiocb->aio_fildes); 3098 lio_set_uerror(&cbp->aio_resultp, EBADFD); 3099 aio_notsupported++; 3100 if (head) { 3101 mutex_enter(&aiop->aio_mutex); 3102 head->lio_nent--; 3103 head->lio_refcnt--; 3104 mutex_exit(&aiop->aio_mutex); 3105 } 3106 continue; 3107 } else { 3108 prev_fp = fp; 3109 prev_mode = mode; 3110 } 3111 } 3112 3113 #ifdef _LP64 3114 aiocb_LFton(aiocb, &aiocb_n); 3115 error = aio_req_setup(&reqp, aiop, &aiocb_n, 3116 (aio_result_t *)&cbp->aio_resultp, vp, 0); 3117 #else 3118 error = aio_req_setupLF(&reqp, aiop, aiocb, 3119 (aio_result_t *)&cbp->aio_resultp, vp, 0); 3120 #endif /* _LP64 */ 3121 if (error) { 3122 releasef(aiocb->aio_fildes); 3123 lio_set_uerror(&cbp->aio_resultp, error); 3124 if (head) { 3125 mutex_enter(&aiop->aio_mutex); 3126 head->lio_nent--; 3127 head->lio_refcnt--; 3128 mutex_exit(&aiop->aio_mutex); 3129 } 3130 aio_errors++; 3131 continue; 3132 } 3133 3134 reqp->aio_req_lio = head; 3135 deadhead = 0; 3136 3137 /* 3138 * Set the errno field now before sending the request to 3139 * the driver to avoid a race condition 3140 */ 3141 (void) suword32(&cbp->aio_resultp.aio_errno, 3142 EINPROGRESS); 3143 3144 reqp->aio_req_iocb.iocb32 = *ucbp; 3145 3146 event = (mode == LIO_READ)? AIOAREAD64 : AIOAWRITE64; 3147 aio_port = (aiocb->aio_sigevent.sigev_notify == SIGEV_PORT); 3148 aio_thread = (aiocb->aio_sigevent.sigev_notify == SIGEV_THREAD); 3149 if (aio_port | aio_thread) { 3150 port_kevent_t *lpkevp; 3151 /* 3152 * Prepare data to send with each aiocb completed. 3153 */ 3154 if (aio_port) { 3155 void *paddr = (void *)(uintptr_t) 3156 aiocb->aio_sigevent.sigev_value.sival_ptr; 3157 if (copyin(paddr, &pnotify, sizeof (pnotify))) 3158 error = EFAULT; 3159 } else { /* aio_thread */ 3160 pnotify.portnfy_port = 3161 aiocb->aio_sigevent.sigev_signo; 3162 pnotify.portnfy_user = 3163 aiocb->aio_sigevent.sigev_value.sival_ptr; 3164 } 3165 if (error) 3166 /* EMPTY */; 3167 else if (pkevtp != NULL && 3168 pnotify.portnfy_port == lio_head_port) 3169 error = port_dup_event(pkevtp, &lpkevp, 3170 PORT_ALLOC_DEFAULT); 3171 else 3172 error = port_alloc_event(pnotify.portnfy_port, 3173 PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, 3174 &lpkevp); 3175 if (error == 0) { 3176 port_init_event(lpkevp, (uintptr_t)*ucbp, 3177 (void *)(uintptr_t)pnotify.portnfy_user, 3178 aio_port_callback, reqp); 3179 lpkevp->portkev_events = event; 3180 reqp->aio_req_portkev = lpkevp; 3181 reqp->aio_req_port = pnotify.portnfy_port; 3182 } 3183 } 3184 3185 /* 3186 * send the request to driver. 
3187 */ 3188 if (error == 0) { 3189 if (aiocb->aio_nbytes == 0) { 3190 clear_active_fd(aiocb->aio_fildes); 3191 aio_zerolen(reqp); 3192 continue; 3193 } 3194 error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req, 3195 CRED()); 3196 } 3197 3198 /* 3199 * the fd's ref count is not decremented until the IO has 3200 * completed unless there was an error. 3201 */ 3202 if (error) { 3203 releasef(aiocb->aio_fildes); 3204 lio_set_uerror(&cbp->aio_resultp, error); 3205 if (head) { 3206 mutex_enter(&aiop->aio_mutex); 3207 head->lio_nent--; 3208 head->lio_refcnt--; 3209 mutex_exit(&aiop->aio_mutex); 3210 } 3211 if (error == ENOTSUP) 3212 aio_notsupported++; 3213 else 3214 aio_errors++; 3215 lio_set_error(reqp, portused); 3216 } else { 3217 clear_active_fd(aiocb->aio_fildes); 3218 } 3219 } 3220 3221 if (aio_notsupported) { 3222 error = ENOTSUP; 3223 } else if (aio_errors) { 3224 /* 3225 * return EIO if any request failed 3226 */ 3227 error = EIO; 3228 } 3229 3230 if (mode_arg == LIO_WAIT) { 3231 mutex_enter(&aiop->aio_mutex); 3232 while (head->lio_refcnt > 0) { 3233 if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) { 3234 mutex_exit(&aiop->aio_mutex); 3235 error = EINTR; 3236 goto done; 3237 } 3238 } 3239 mutex_exit(&aiop->aio_mutex); 3240 alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_LARGEFILE); 3241 } 3242 3243 done: 3244 kmem_free(cbplist, ssize); 3245 if (deadhead) { 3246 if (head->lio_sigqp) 3247 kmem_free(head->lio_sigqp, sizeof (sigqueue_t)); 3248 if (head->lio_portkev) 3249 port_free_event(head->lio_portkev); 3250 kmem_free(head, sizeof (aio_lio_t)); 3251 } 3252 return (error); 3253 } 3254 3255 #ifdef _SYSCALL32_IMPL 3256 static void 3257 aiocb_LFton(aiocb64_32_t *src, aiocb_t *dest) 3258 { 3259 dest->aio_fildes = src->aio_fildes; 3260 dest->aio_buf = (void *)(uintptr_t)src->aio_buf; 3261 dest->aio_nbytes = (size_t)src->aio_nbytes; 3262 dest->aio_offset = (off_t)src->aio_offset; 3263 dest->aio_reqprio = src->aio_reqprio; 3264 dest->aio_sigevent.sigev_notify = src->aio_sigevent.sigev_notify; 3265 dest->aio_sigevent.sigev_signo = src->aio_sigevent.sigev_signo; 3266 3267 /* 3268 * See comment in sigqueue32() on handling of 32-bit 3269 * sigvals in a 64-bit kernel. 3270 */ 3271 dest->aio_sigevent.sigev_value.sival_int = 3272 (int)src->aio_sigevent.sigev_value.sival_int; 3273 dest->aio_sigevent.sigev_notify_function = (void (*)(union sigval)) 3274 (uintptr_t)src->aio_sigevent.sigev_notify_function; 3275 dest->aio_sigevent.sigev_notify_attributes = (pthread_attr_t *) 3276 (uintptr_t)src->aio_sigevent.sigev_notify_attributes; 3277 dest->aio_sigevent.__sigev_pad2 = src->aio_sigevent.__sigev_pad2; 3278 dest->aio_lio_opcode = src->aio_lio_opcode; 3279 dest->aio_state = src->aio_state; 3280 dest->aio__pad[0] = src->aio__pad[0]; 3281 } 3282 #endif 3283 3284 /* 3285 * This function is used only for largefile calls made by 3286 * 32 bit applications. 
3287 */ 3288 static int 3289 aio_req_setupLF( 3290 aio_req_t **reqpp, 3291 aio_t *aiop, 3292 aiocb64_32_t *arg, 3293 aio_result_t *resultp, 3294 vnode_t *vp, 3295 int old_solaris_req) 3296 { 3297 sigqueue_t *sqp = NULL; 3298 aio_req_t *reqp; 3299 struct uio *uio; 3300 struct sigevent32 *sigev; 3301 int error; 3302 3303 sigev = &arg->aio_sigevent; 3304 if (sigev->sigev_notify == SIGEV_SIGNAL && 3305 sigev->sigev_signo > 0 && sigev->sigev_signo < NSIG) { 3306 sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP); 3307 if (sqp == NULL) 3308 return (EAGAIN); 3309 sqp->sq_func = NULL; 3310 sqp->sq_next = NULL; 3311 sqp->sq_info.si_code = SI_ASYNCIO; 3312 sqp->sq_info.si_pid = curproc->p_pid; 3313 sqp->sq_info.si_ctid = PRCTID(curproc); 3314 sqp->sq_info.si_zoneid = getzoneid(); 3315 sqp->sq_info.si_uid = crgetuid(curproc->p_cred); 3316 sqp->sq_info.si_signo = sigev->sigev_signo; 3317 sqp->sq_info.si_value.sival_int = sigev->sigev_value.sival_int; 3318 } 3319 3320 mutex_enter(&aiop->aio_mutex); 3321 3322 if (aiop->aio_flags & AIO_REQ_BLOCK) { 3323 mutex_exit(&aiop->aio_mutex); 3324 if (sqp) 3325 kmem_free(sqp, sizeof (sigqueue_t)); 3326 return (EIO); 3327 } 3328 /* 3329 * get an aio_reqp from the free list or allocate one 3330 * from dynamic memory. 3331 */ 3332 if (error = aio_req_alloc(&reqp, resultp)) { 3333 mutex_exit(&aiop->aio_mutex); 3334 if (sqp) 3335 kmem_free(sqp, sizeof (sigqueue_t)); 3336 return (error); 3337 } 3338 aiop->aio_pending++; 3339 aiop->aio_outstanding++; 3340 reqp->aio_req_flags = AIO_PENDING; 3341 if (old_solaris_req) { 3342 /* this is an old solaris aio request */ 3343 reqp->aio_req_flags |= AIO_SOLARIS; 3344 aiop->aio_flags |= AIO_SOLARIS_REQ; 3345 } 3346 if (sigev->sigev_notify == SIGEV_THREAD || 3347 sigev->sigev_notify == SIGEV_PORT) 3348 aio_enq(&aiop->aio_portpending, reqp, 0); 3349 mutex_exit(&aiop->aio_mutex); 3350 /* 3351 * initialize aio request. 3352 */ 3353 reqp->aio_req_fd = arg->aio_fildes; 3354 reqp->aio_req_sigqp = sqp; 3355 reqp->aio_req_iocb.iocb = NULL; 3356 reqp->aio_req_lio = NULL; 3357 reqp->aio_req_buf.b_file = vp; 3358 uio = reqp->aio_req.aio_uio; 3359 uio->uio_iovcnt = 1; 3360 uio->uio_iov->iov_base = (caddr_t)(uintptr_t)arg->aio_buf; 3361 uio->uio_iov->iov_len = arg->aio_nbytes; 3362 uio->uio_loffset = arg->aio_offset; 3363 *reqpp = reqp; 3364 return (0); 3365 } 3366 3367 /* 3368 * This routine is called when a non largefile call is made by a 32bit 3369 * process on a ILP32 or LP64 kernel. 
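 * On an LP64 kernel each 32-bit aiocb is copied in as an aiocb32_t and
 * widened with aiocb_32ton() before being handed to aio_req_setup(); on
 * an ILP32 kernel the aiocb is used as-is. In both cases the result
 * pointer still refers to the aio_resultp field inside the user's
 * control block.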
3370 */ 3371 static int 3372 alio32( 3373 int mode_arg, 3374 void *aiocb_arg, 3375 int nent, 3376 void *sigev) 3377 { 3378 file_t *fp; 3379 file_t *prev_fp = NULL; 3380 int prev_mode = -1; 3381 struct vnode *vp; 3382 aio_lio_t *head; 3383 aio_req_t *reqp; 3384 aio_t *aiop; 3385 caddr_t cbplist; 3386 aiocb_t cb; 3387 aiocb_t *aiocb = &cb; 3388 #ifdef _LP64 3389 aiocb32_t *cbp; 3390 caddr32_t *ucbp; 3391 aiocb32_t cb32; 3392 aiocb32_t *aiocb32 = &cb32; 3393 struct sigevent32 sigevk; 3394 #else 3395 aiocb_t *cbp, **ucbp; 3396 struct sigevent sigevk; 3397 #endif 3398 sigqueue_t *sqp; 3399 int (*aio_func)(); 3400 int mode; 3401 int error = 0; 3402 int aio_errors = 0; 3403 int i; 3404 size_t ssize; 3405 int deadhead = 0; 3406 int aio_notsupported = 0; 3407 int lio_head_port; 3408 int aio_port; 3409 int aio_thread; 3410 port_kevent_t *pkevtp = NULL; 3411 int portused = 0; 3412 #ifdef _LP64 3413 port_notify32_t pnotify; 3414 #else 3415 port_notify_t pnotify; 3416 #endif 3417 int event; 3418 3419 aiop = curproc->p_aio; 3420 if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX) 3421 return (EINVAL); 3422 3423 #ifdef _LP64 3424 ssize = (sizeof (caddr32_t) * nent); 3425 #else 3426 ssize = (sizeof (aiocb_t *) * nent); 3427 #endif 3428 cbplist = kmem_alloc(ssize, KM_SLEEP); 3429 ucbp = (void *)cbplist; 3430 3431 if (copyin(aiocb_arg, cbplist, ssize) || 3432 (sigev && copyin(sigev, &sigevk, sizeof (struct sigevent32)))) { 3433 kmem_free(cbplist, ssize); 3434 return (EFAULT); 3435 } 3436 3437 /* Event Ports */ 3438 if (sigev && 3439 (sigevk.sigev_notify == SIGEV_THREAD || 3440 sigevk.sigev_notify == SIGEV_PORT)) { 3441 if (sigevk.sigev_notify == SIGEV_THREAD) { 3442 pnotify.portnfy_port = sigevk.sigev_signo; 3443 pnotify.portnfy_user = sigevk.sigev_value.sival_ptr; 3444 } else if (copyin( 3445 (void *)(uintptr_t)sigevk.sigev_value.sival_ptr, 3446 &pnotify, sizeof (pnotify))) { 3447 kmem_free(cbplist, ssize); 3448 return (EFAULT); 3449 } 3450 error = port_alloc_event(pnotify.portnfy_port, 3451 PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, &pkevtp); 3452 if (error) { 3453 if (error == ENOMEM || error == EAGAIN) 3454 error = EAGAIN; 3455 else 3456 error = EINVAL; 3457 kmem_free(cbplist, ssize); 3458 return (error); 3459 } 3460 lio_head_port = pnotify.portnfy_port; 3461 portused = 1; 3462 } 3463 3464 /* 3465 * a list head should be allocated if notification is 3466 * enabled for this list. 
3467 */ 3468 head = NULL; 3469 3470 if (mode_arg == LIO_WAIT || sigev) { 3471 mutex_enter(&aiop->aio_mutex); 3472 error = aio_lio_alloc(&head); 3473 mutex_exit(&aiop->aio_mutex); 3474 if (error) 3475 goto done; 3476 deadhead = 1; 3477 head->lio_nent = nent; 3478 head->lio_refcnt = nent; 3479 head->lio_port = -1; 3480 head->lio_portkev = NULL; 3481 if (sigev && sigevk.sigev_notify == SIGEV_SIGNAL && 3482 sigevk.sigev_signo > 0 && sigevk.sigev_signo < NSIG) { 3483 sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP); 3484 if (sqp == NULL) { 3485 error = EAGAIN; 3486 goto done; 3487 } 3488 sqp->sq_func = NULL; 3489 sqp->sq_next = NULL; 3490 sqp->sq_info.si_code = SI_ASYNCIO; 3491 sqp->sq_info.si_pid = curproc->p_pid; 3492 sqp->sq_info.si_ctid = PRCTID(curproc); 3493 sqp->sq_info.si_zoneid = getzoneid(); 3494 sqp->sq_info.si_uid = crgetuid(curproc->p_cred); 3495 sqp->sq_info.si_signo = sigevk.sigev_signo; 3496 sqp->sq_info.si_value.sival_int = 3497 sigevk.sigev_value.sival_int; 3498 head->lio_sigqp = sqp; 3499 } else { 3500 head->lio_sigqp = NULL; 3501 } 3502 if (pkevtp) { 3503 /* 3504 * Prepare data to send when list of aiocb's has 3505 * completed. 3506 */ 3507 port_init_event(pkevtp, (uintptr_t)sigev, 3508 (void *)(uintptr_t)pnotify.portnfy_user, 3509 NULL, head); 3510 pkevtp->portkev_events = AIOLIO; 3511 head->lio_portkev = pkevtp; 3512 head->lio_port = pnotify.portnfy_port; 3513 } 3514 } 3515 3516 for (i = 0; i < nent; i++, ucbp++) { 3517 3518 /* skip entry if it can't be copied. */ 3519 #ifdef _LP64 3520 cbp = (aiocb32_t *)(uintptr_t)*ucbp; 3521 if (cbp == NULL || copyin(cbp, aiocb32, sizeof (*aiocb32))) 3522 #else 3523 cbp = (aiocb_t *)*ucbp; 3524 if (cbp == NULL || copyin(cbp, aiocb, sizeof (*aiocb))) 3525 #endif 3526 { 3527 if (head) { 3528 mutex_enter(&aiop->aio_mutex); 3529 head->lio_nent--; 3530 head->lio_refcnt--; 3531 mutex_exit(&aiop->aio_mutex); 3532 } 3533 continue; 3534 } 3535 #ifdef _LP64 3536 /* 3537 * copy 32 bit structure into 64 bit structure 3538 */ 3539 aiocb_32ton(aiocb32, aiocb); 3540 #endif /* _LP64 */ 3541 3542 /* skip if opcode for aiocb is LIO_NOP */ 3543 mode = aiocb->aio_lio_opcode; 3544 if (mode == LIO_NOP) { 3545 cbp = NULL; 3546 if (head) { 3547 mutex_enter(&aiop->aio_mutex); 3548 head->lio_nent--; 3549 head->lio_refcnt--; 3550 mutex_exit(&aiop->aio_mutex); 3551 } 3552 continue; 3553 } 3554 3555 /* increment file descriptor's ref count. 
*/ 3556 if ((fp = getf(aiocb->aio_fildes)) == NULL) { 3557 lio_set_uerror(&cbp->aio_resultp, EBADF); 3558 if (head) { 3559 mutex_enter(&aiop->aio_mutex); 3560 head->lio_nent--; 3561 head->lio_refcnt--; 3562 mutex_exit(&aiop->aio_mutex); 3563 } 3564 aio_errors++; 3565 continue; 3566 } 3567 3568 /* 3569 * check the permission of the partition 3570 */ 3571 if ((fp->f_flag & mode) == 0) { 3572 releasef(aiocb->aio_fildes); 3573 lio_set_uerror(&cbp->aio_resultp, EBADF); 3574 if (head) { 3575 mutex_enter(&aiop->aio_mutex); 3576 head->lio_nent--; 3577 head->lio_refcnt--; 3578 mutex_exit(&aiop->aio_mutex); 3579 } 3580 aio_errors++; 3581 continue; 3582 } 3583 3584 /* 3585 * common case where requests are to the same fd 3586 * for the same r/w operation 3587 * for UFS, need to set EBADFD 3588 */ 3589 vp = fp->f_vnode; 3590 if (fp != prev_fp || mode != prev_mode) { 3591 aio_func = check_vp(vp, mode); 3592 if (aio_func == NULL) { 3593 prev_fp = NULL; 3594 releasef(aiocb->aio_fildes); 3595 lio_set_uerror(&cbp->aio_resultp, EBADFD); 3596 aio_notsupported++; 3597 if (head) { 3598 mutex_enter(&aiop->aio_mutex); 3599 head->lio_nent--; 3600 head->lio_refcnt--; 3601 mutex_exit(&aiop->aio_mutex); 3602 } 3603 continue; 3604 } else { 3605 prev_fp = fp; 3606 prev_mode = mode; 3607 } 3608 } 3609 3610 error = aio_req_setup(&reqp, aiop, aiocb, 3611 (aio_result_t *)&cbp->aio_resultp, vp, 0); 3612 if (error) { 3613 releasef(aiocb->aio_fildes); 3614 lio_set_uerror(&cbp->aio_resultp, error); 3615 if (head) { 3616 mutex_enter(&aiop->aio_mutex); 3617 head->lio_nent--; 3618 head->lio_refcnt--; 3619 mutex_exit(&aiop->aio_mutex); 3620 } 3621 aio_errors++; 3622 continue; 3623 } 3624 3625 reqp->aio_req_lio = head; 3626 deadhead = 0; 3627 3628 /* 3629 * Set the errno field now before sending the request to 3630 * the driver to avoid a race condition 3631 */ 3632 (void) suword32(&cbp->aio_resultp.aio_errno, 3633 EINPROGRESS); 3634 3635 reqp->aio_req_iocb.iocb32 = (caddr32_t)(uintptr_t)cbp; 3636 3637 event = (mode == LIO_READ)? AIOAREAD : AIOAWRITE; 3638 aio_port = (aiocb->aio_sigevent.sigev_notify == SIGEV_PORT); 3639 aio_thread = (aiocb->aio_sigevent.sigev_notify == SIGEV_THREAD); 3640 if (aio_port | aio_thread) { 3641 port_kevent_t *lpkevp; 3642 /* 3643 * Prepare data to send with each aiocb completed. 
3644 */ 3645 #ifdef _LP64 3646 if (aio_port) { 3647 void *paddr = (void *)(uintptr_t) 3648 aiocb32->aio_sigevent.sigev_value.sival_ptr; 3649 if (copyin(paddr, &pnotify, sizeof (pnotify))) 3650 error = EFAULT; 3651 } else { /* aio_thread */ 3652 pnotify.portnfy_port = 3653 aiocb32->aio_sigevent.sigev_signo; 3654 pnotify.portnfy_user = 3655 aiocb32->aio_sigevent.sigev_value.sival_ptr; 3656 } 3657 #else 3658 if (aio_port) { 3659 void *paddr = 3660 aiocb->aio_sigevent.sigev_value.sival_ptr; 3661 if (copyin(paddr, &pnotify, sizeof (pnotify))) 3662 error = EFAULT; 3663 } else { /* aio_thread */ 3664 pnotify.portnfy_port = 3665 aiocb->aio_sigevent.sigev_signo; 3666 pnotify.portnfy_user = 3667 aiocb->aio_sigevent.sigev_value.sival_ptr; 3668 } 3669 #endif 3670 if (error) 3671 /* EMPTY */; 3672 else if (pkevtp != NULL && 3673 pnotify.portnfy_port == lio_head_port) 3674 error = port_dup_event(pkevtp, &lpkevp, 3675 PORT_ALLOC_DEFAULT); 3676 else 3677 error = port_alloc_event(pnotify.portnfy_port, 3678 PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, 3679 &lpkevp); 3680 if (error == 0) { 3681 port_init_event(lpkevp, (uintptr_t)cbp, 3682 (void *)(uintptr_t)pnotify.portnfy_user, 3683 aio_port_callback, reqp); 3684 lpkevp->portkev_events = event; 3685 reqp->aio_req_portkev = lpkevp; 3686 reqp->aio_req_port = pnotify.portnfy_port; 3687 } 3688 } 3689 3690 /* 3691 * send the request to driver. 3692 */ 3693 if (error == 0) { 3694 if (aiocb->aio_nbytes == 0) { 3695 clear_active_fd(aiocb->aio_fildes); 3696 aio_zerolen(reqp); 3697 continue; 3698 } 3699 error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req, 3700 CRED()); 3701 } 3702 3703 /* 3704 * the fd's ref count is not decremented until the IO has 3705 * completed unless there was an error. 3706 */ 3707 if (error) { 3708 releasef(aiocb->aio_fildes); 3709 lio_set_uerror(&cbp->aio_resultp, error); 3710 if (head) { 3711 mutex_enter(&aiop->aio_mutex); 3712 head->lio_nent--; 3713 head->lio_refcnt--; 3714 mutex_exit(&aiop->aio_mutex); 3715 } 3716 if (error == ENOTSUP) 3717 aio_notsupported++; 3718 else 3719 aio_errors++; 3720 lio_set_error(reqp, portused); 3721 } else { 3722 clear_active_fd(aiocb->aio_fildes); 3723 } 3724 } 3725 3726 if (aio_notsupported) { 3727 error = ENOTSUP; 3728 } else if (aio_errors) { 3729 /* 3730 * return EIO if any request failed 3731 */ 3732 error = EIO; 3733 } 3734 3735 if (mode_arg == LIO_WAIT) { 3736 mutex_enter(&aiop->aio_mutex); 3737 while (head->lio_refcnt > 0) { 3738 if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) { 3739 mutex_exit(&aiop->aio_mutex); 3740 error = EINTR; 3741 goto done; 3742 } 3743 } 3744 mutex_exit(&aiop->aio_mutex); 3745 alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_32); 3746 } 3747 3748 done: 3749 kmem_free(cbplist, ssize); 3750 if (deadhead) { 3751 if (head->lio_sigqp) 3752 kmem_free(head->lio_sigqp, sizeof (sigqueue_t)); 3753 if (head->lio_portkev) 3754 port_free_event(head->lio_portkev); 3755 kmem_free(head, sizeof (aio_lio_t)); 3756 } 3757 return (error); 3758 } 3759 3760 3761 #ifdef _SYSCALL32_IMPL 3762 void 3763 aiocb_32ton(aiocb32_t *src, aiocb_t *dest) 3764 { 3765 dest->aio_fildes = src->aio_fildes; 3766 dest->aio_buf = (caddr_t)(uintptr_t)src->aio_buf; 3767 dest->aio_nbytes = (size_t)src->aio_nbytes; 3768 dest->aio_offset = (off_t)src->aio_offset; 3769 dest->aio_reqprio = src->aio_reqprio; 3770 dest->aio_sigevent.sigev_notify = src->aio_sigevent.sigev_notify; 3771 dest->aio_sigevent.sigev_signo = src->aio_sigevent.sigev_signo; 3772 3773 /* 3774 * See comment in sigqueue32() on handling of 32-bit 3775 * 
sigvals in a 64-bit kernel. 3776 */ 3777 dest->aio_sigevent.sigev_value.sival_int = 3778 (int)src->aio_sigevent.sigev_value.sival_int; 3779 dest->aio_sigevent.sigev_notify_function = (void (*)(union sigval)) 3780 (uintptr_t)src->aio_sigevent.sigev_notify_function; 3781 dest->aio_sigevent.sigev_notify_attributes = (pthread_attr_t *) 3782 (uintptr_t)src->aio_sigevent.sigev_notify_attributes; 3783 dest->aio_sigevent.__sigev_pad2 = src->aio_sigevent.__sigev_pad2; 3784 dest->aio_lio_opcode = src->aio_lio_opcode; 3785 dest->aio_state = src->aio_state; 3786 dest->aio__pad[0] = src->aio__pad[0]; 3787 } 3788 #endif /* _SYSCALL32_IMPL */ 3789 3790 /* 3791 * aio_port_callback() is called just before the event is retrieved from the 3792 * port. The task of this callback function is to finish the work of the 3793 * transaction on behalf of the application, which means: 3794 * - copy the transaction data out to the application 3795 * (this thread is running in the right process context) 3796 * - keep track of the transaction (update the counters) 3797 * - free the allocated buffers 3798 * The aiocb pointer is the object element of the port_kevent_t structure. 3799 * 3800 * flag: 3801 * PORT_CALLBACK_DEFAULT : do the copyout and free the resources 3802 * PORT_CALLBACK_CLOSE : skip the copyout, just free the resources 3803 */ 3804 3805 /*ARGSUSED*/ 3806 int 3807 aio_port_callback(void *arg, int *events, pid_t pid, int flag, void *evp) 3808 { 3809 aio_t *aiop = curproc->p_aio; 3810 aio_req_t *reqp = arg; 3811 struct iovec *iov; 3812 struct buf *bp; 3813 void *resultp; 3814 3815 if (pid != curproc->p_pid) { 3816 /* wrong process, cannot deliver data here ... */ 3817 return (EACCES); 3818 } 3819 3820 mutex_enter(&aiop->aio_portq_mutex); 3821 reqp->aio_req_portkev = NULL; 3822 aio_req_remove_portq(aiop, reqp); /* remove request from portq */ 3823 mutex_exit(&aiop->aio_portq_mutex); 3824 aphysio_unlock(reqp); /* unlock used pages */ 3825 mutex_enter(&aiop->aio_mutex); 3826 if (reqp->aio_req_flags & AIO_COPYOUTDONE) { 3827 aio_req_free_port(aiop, reqp); /* back to free list */ 3828 mutex_exit(&aiop->aio_mutex); 3829 return (0); 3830 } 3831 3832 iov = reqp->aio_req_uio.uio_iov; 3833 bp = &reqp->aio_req_buf; 3834 resultp = (void *)reqp->aio_req_resultp; 3835 aio_req_free_port(aiop, reqp); /* request struct back to free list */ 3836 mutex_exit(&aiop->aio_mutex); 3837 if (flag == PORT_CALLBACK_DEFAULT) 3838 aio_copyout_result_port(iov, bp, resultp); 3839 return (0); 3840 }
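
/*
 * Illustrative userland sketch (not part of this file; kept under
 * "#if 0" so it is never compiled): one way an application might arrange
 * the SIGEV_PORT notification that aio_port_callback() above eventually
 * services. It relies on port_create(3C), port_get(3C) and the POSIX aio
 * interfaces; the wrapper name example_aio_with_port() is made up for
 * this example, so treat the whole block as a hedged sketch rather than
 * a reference.
 */
#if 0
#include <port.h>
#include <aio.h>
#include <signal.h>
#include <string.h>

static int
example_aio_with_port(int fd, char *buf, size_t len)
{
	struct aiocb cb;
	port_notify_t pn;
	port_event_t pe;
	int port;

	if ((port = port_create()) < 0)
		return (-1);

	(void) memset(&cb, 0, sizeof (cb));
	cb.aio_fildes = fd;
	cb.aio_buf = buf;
	cb.aio_nbytes = len;
	cb.aio_offset = 0;

	/* for SIGEV_PORT, sigev_value.sival_ptr points at a port_notify_t */
	pn.portnfy_port = port;
	pn.portnfy_user = &cb;			/* opaque user cookie */
	cb.aio_sigevent.sigev_notify = SIGEV_PORT;
	cb.aio_sigevent.sigev_value.sival_ptr = &pn;

	if (aio_read(&cb) != 0)
		return (-1);

	/* block until the completion event is posted to the port */
	if (port_get(port, &pe, NULL) != 0)
		return (-1);

	if (pe.portev_source == PORT_SOURCE_AIO) {
		/*
		 * portev_object carries the aiocb address, portev_user the
		 * cookie; a real program would check aio_error(3C) first.
		 */
		struct aiocb *done = (struct aiocb *)pe.portev_object;
		return ((int)aio_return(done));
	}
	return (-1);
}
#endif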