1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 /*
28 * Kernel asynchronous I/O.
29 * This is only for raw devices now (as of Nov. 1993).
30 */
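
/*
 * Roughly, the flow (as driven by the user-level aio library) is:
 * AIOINIT allocates the per-process aio_t, AIOSTART turns the calling
 * LWP into the cleanup thread, AIOREAD/AIOWRITE (or the POSIX
 * AIOAREAD/AIOAWRITE/AIOLIO entry points) submit requests to the
 * driver, and AIOWAIT/AIOWAITN/AIOSUSPEND reap the completed results.
 */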
31
32 #include <sys/types.h>
33 #include <sys/errno.h>
34 #include <sys/conf.h>
35 #include <sys/file.h>
36 #include <sys/fs/snode.h>
37 #include <sys/unistd.h>
38 #include <sys/cmn_err.h>
39 #include <vm/as.h>
40 #include <vm/faultcode.h>
41 #include <sys/sysmacros.h>
42 #include <sys/procfs.h>
43 #include <sys/kmem.h>
44 #include <sys/autoconf.h>
45 #include <sys/ddi_impldefs.h>
46 #include <sys/sunddi.h>
47 #include <sys/aio_impl.h>
48 #include <sys/debug.h>
49 #include <sys/param.h>
50 #include <sys/systm.h>
51 #include <sys/vmsystm.h>
52 #include <sys/fs/pxfs_ki.h>
53 #include <sys/contract/process_impl.h>
54
55 /*
 * external entry points.
57 */
58 #ifdef _LP64
59 static int64_t kaioc(long, long, long, long, long, long);
60 #endif
61 static int kaio(ulong_t *, rval_t *);
62
63
64 #define AIO_64 0
65 #define AIO_32 1
66 #define AIO_LARGEFILE 2
67
68 /*
69 * implementation specific functions (private)
70 */
71 #ifdef _LP64
72 static int alio(int, aiocb_t **, int, struct sigevent *);
73 #endif
74 static int aionotify(void);
75 static int aioinit(void);
76 static int aiostart(void);
77 static void alio_cleanup(aio_t *, aiocb_t **, int, int);
78 static int (*check_vp(struct vnode *, int))(vnode_t *, struct aio_req *,
79 cred_t *);
80 static void lio_set_error(aio_req_t *, int portused);
81 static aio_t *aio_aiop_alloc();
82 static int aio_req_alloc(aio_req_t **, aio_result_t *);
83 static int aio_lio_alloc(aio_lio_t **);
84 static aio_req_t *aio_req_done(void *);
85 static aio_req_t *aio_req_remove(aio_req_t *);
86 static int aio_req_find(aio_result_t *, aio_req_t **);
87 static int aio_hash_insert(struct aio_req_t *, aio_t *);
88 static int aio_req_setup(aio_req_t **, aio_t *, aiocb_t *,
89 aio_result_t *, vnode_t *, int);
90 static int aio_cleanup_thread(aio_t *);
91 static aio_lio_t *aio_list_get(aio_result_t *);
92 static void lio_set_uerror(void *, int);
93 extern void aio_zerolen(aio_req_t *);
94 static int aiowait(struct timeval *, int, long *);
95 static int aiowaitn(void *, uint_t, uint_t *, timespec_t *);
96 static int aio_unlock_requests(caddr_t iocblist, int iocb_index,
97 aio_req_t *reqlist, aio_t *aiop, model_t model);
98 static int aio_reqlist_concat(aio_t *aiop, aio_req_t **reqlist, int max);
99 static int aiosuspend(void *, int, struct timespec *, int,
100 long *, int);
101 static int aliowait(int, void *, int, void *, int);
102 static int aioerror(void *, int);
103 static int aio_cancel(int, void *, long *, int);
104 static int arw(int, int, char *, int, offset_t, aio_result_t *, int);
105 static int aiorw(int, void *, int, int);
106
107 static int alioLF(int, void *, int, void *);
108 static int aio_req_setupLF(aio_req_t **, aio_t *, aiocb64_32_t *,
109 aio_result_t *, vnode_t *, int);
110 static int alio32(int, void *, int, void *);
111 static int driver_aio_write(vnode_t *vp, struct aio_req *aio, cred_t *cred_p);
112 static int driver_aio_read(vnode_t *vp, struct aio_req *aio, cred_t *cred_p);
113
114 #ifdef _SYSCALL32_IMPL
115 static void aiocb_LFton(aiocb64_32_t *, aiocb_t *);
116 void aiocb_32ton(aiocb32_t *, aiocb_t *);
117 #endif /* _SYSCALL32_IMPL */
118
119 /*
120 * implementation specific functions (external)
121 */
122 void aio_req_free(aio_t *, aio_req_t *);
123
124 /*
125 * Event Port framework
126 */
127
128 void aio_req_free_port(aio_t *, aio_req_t *);
129 static int aio_port_callback(void *, int *, pid_t, int, void *);
130
131 /*
132 * This is the loadable module wrapper.
133 */
134 #include <sys/modctl.h>
135 #include <sys/syscall.h>
136
137 #ifdef _LP64
138
139 static struct sysent kaio_sysent = {
140 6,
141 SE_NOUNLOAD | SE_64RVAL | SE_ARGC,
142 (int (*)())kaioc
143 };
144
145 #ifdef _SYSCALL32_IMPL
146 static struct sysent kaio_sysent32 = {
147 7,
148 SE_NOUNLOAD | SE_64RVAL,
149 kaio
150 };
151 #endif /* _SYSCALL32_IMPL */
152
153 #else /* _LP64 */
154
155 static struct sysent kaio_sysent = {
156 7,
157 SE_NOUNLOAD | SE_32RVAL1,
158 kaio
159 };
160
161 #endif /* _LP64 */
162
163 /*
164 * Module linkage information for the kernel.
165 */
166
167 static struct modlsys modlsys = {
168 &mod_syscallops,
169 "kernel Async I/O",
170 &kaio_sysent
171 };
172
173 #ifdef _SYSCALL32_IMPL
174 static struct modlsys modlsys32 = {
175 &mod_syscallops32,
176 "kernel Async I/O for 32 bit compatibility",
177 &kaio_sysent32
178 };
179 #endif /* _SYSCALL32_IMPL */
180
181
182 static struct modlinkage modlinkage = {
183 MODREV_1,
184 { &modlsys,
185 #ifdef _SYSCALL32_IMPL
186 &modlsys32,
187 #endif
188 NULL
189 }
190 };
191
192 int
193 _init(void)
194 {
195 int retval;
196
197 if ((retval = mod_install(&modlinkage)) != 0)
198 return (retval);
199
200 return (0);
201 }
202
203 int
204 _fini(void)
205 {
206 int retval;
207
208 retval = mod_remove(&modlinkage);
209
210 return (retval);
211 }
212
213 int
214 _info(struct modinfo *modinfop)
215 {
216 return (mod_info(&modlinkage, modinfop));
217 }
218
219 #ifdef _LP64
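/*
 * kaioc() is the native (64-bit) entry point for the kaio system call.
 * The low-order bits of a0 select the operation; the AIO_POLL_BIT flag
 * is masked off here and handled by the individual operations (see
 * arw() and aiorw()).
 */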
220 static int64_t
221 kaioc(
222 long a0,
223 long a1,
224 long a2,
225 long a3,
226 long a4,
227 long a5)
228 {
229 int error;
230 long rval = 0;
231
232 switch ((int)a0 & ~AIO_POLL_BIT) {
233 case AIOREAD:
234 error = arw((int)a0, (int)a1, (char *)a2, (int)a3,
235 (offset_t)a4, (aio_result_t *)a5, FREAD);
236 break;
237 case AIOWRITE:
238 error = arw((int)a0, (int)a1, (char *)a2, (int)a3,
239 (offset_t)a4, (aio_result_t *)a5, FWRITE);
240 break;
241 case AIOWAIT:
242 error = aiowait((struct timeval *)a1, (int)a2, &rval);
243 break;
244 case AIOWAITN:
245 error = aiowaitn((void *)a1, (uint_t)a2, (uint_t *)a3,
246 (timespec_t *)a4);
247 break;
248 case AIONOTIFY:
249 error = aionotify();
250 break;
251 case AIOINIT:
252 error = aioinit();
253 break;
254 case AIOSTART:
255 error = aiostart();
256 break;
257 case AIOLIO:
258 error = alio((int)a1, (aiocb_t **)a2, (int)a3,
259 (struct sigevent *)a4);
260 break;
261 case AIOLIOWAIT:
262 error = aliowait((int)a1, (void *)a2, (int)a3,
263 (struct sigevent *)a4, AIO_64);
264 break;
265 case AIOSUSPEND:
266 error = aiosuspend((void *)a1, (int)a2, (timespec_t *)a3,
267 (int)a4, &rval, AIO_64);
268 break;
269 case AIOERROR:
270 error = aioerror((void *)a1, AIO_64);
271 break;
272 case AIOAREAD:
273 error = aiorw((int)a0, (void *)a1, FREAD, AIO_64);
274 break;
275 case AIOAWRITE:
276 error = aiorw((int)a0, (void *)a1, FWRITE, AIO_64);
277 break;
278 case AIOCANCEL:
279 error = aio_cancel((int)a1, (void *)a2, &rval, AIO_64);
280 break;
281
	/*
	 * The large file related calls are valid only for the
	 * 32 bit kernel and not for the 64 bit kernel.
	 * On the 64 bit kernel, large file calls are converted
	 * to regular 64 bit calls.
	 */
288
289 default:
290 error = EINVAL;
291 }
292 if (error)
293 return ((int64_t)set_errno(error));
294 return (rval);
295 }
296 #endif
297
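/*
 * kaio() is the entry point used for 32-bit callers (and is the only
 * entry point on a 32-bit kernel).  A 64-bit file offset is passed in
 * as two 32-bit words in uap[4] and uap[5] and is reassembled here
 * according to the endianness of the platform.
 */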
298 static int
299 kaio(
300 ulong_t *uap,
301 rval_t *rvp)
302 {
303 long rval = 0;
304 int error = 0;
305 offset_t off;
306
307
308 rvp->r_vals = 0;
309 #if defined(_LITTLE_ENDIAN)
310 off = ((u_offset_t)uap[5] << 32) | (u_offset_t)uap[4];
311 #else
312 off = ((u_offset_t)uap[4] << 32) | (u_offset_t)uap[5];
313 #endif
314
315 switch (uap[0] & ~AIO_POLL_BIT) {
	/*
	 * On a 64 bit kernel, these are the 32 bit system calls.
	 */
319 case AIOREAD:
320 return (arw((int)uap[0], (int)uap[1], (char *)uap[2],
321 (int)uap[3], off, (aio_result_t *)uap[6], FREAD));
322 case AIOWRITE:
323 return (arw((int)uap[0], (int)uap[1], (char *)uap[2],
324 (int)uap[3], off, (aio_result_t *)uap[6], FWRITE));
325 case AIOWAIT:
326 error = aiowait((struct timeval *)uap[1], (int)uap[2],
327 &rval);
328 break;
329 case AIOWAITN:
330 error = aiowaitn((void *)uap[1], (uint_t)uap[2],
331 (uint_t *)uap[3], (timespec_t *)uap[4]);
332 break;
333 case AIONOTIFY:
334 return (aionotify());
335 case AIOINIT:
336 return (aioinit());
337 case AIOSTART:
338 return (aiostart());
339 case AIOLIO:
340 return (alio32((int)uap[1], (void *)uap[2], (int)uap[3],
341 (void *)uap[4]));
342 case AIOLIOWAIT:
343 return (aliowait((int)uap[1], (void *)uap[2],
344 (int)uap[3], (struct sigevent *)uap[4], AIO_32));
345 case AIOSUSPEND:
346 error = aiosuspend((void *)uap[1], (int)uap[2],
347 (timespec_t *)uap[3], (int)uap[4],
348 &rval, AIO_32);
349 break;
350 case AIOERROR:
351 return (aioerror((void *)uap[1], AIO_32));
352 case AIOAREAD:
353 return (aiorw((int)uap[0], (void *)uap[1],
354 FREAD, AIO_32));
355 case AIOAWRITE:
356 return (aiorw((int)uap[0], (void *)uap[1],
357 FWRITE, AIO_32));
358 case AIOCANCEL:
359 error = (aio_cancel((int)uap[1], (void *)uap[2], &rval,
360 AIO_32));
361 break;
362 case AIOLIO64:
363 return (alioLF((int)uap[1], (void *)uap[2],
364 (int)uap[3], (void *)uap[4]));
365 case AIOLIOWAIT64:
366 return (aliowait(uap[1], (void *)uap[2],
367 (int)uap[3], (void *)uap[4], AIO_LARGEFILE));
368 case AIOSUSPEND64:
369 error = aiosuspend((void *)uap[1], (int)uap[2],
370 (timespec_t *)uap[3], (int)uap[4], &rval,
371 AIO_LARGEFILE);
372 break;
373 case AIOERROR64:
374 return (aioerror((void *)uap[1], AIO_LARGEFILE));
375 case AIOAREAD64:
376 return (aiorw((int)uap[0], (void *)uap[1], FREAD,
377 AIO_LARGEFILE));
378 case AIOAWRITE64:
379 return (aiorw((int)uap[0], (void *)uap[1], FWRITE,
380 AIO_LARGEFILE));
381 case AIOCANCEL64:
382 error = (aio_cancel((int)uap[1], (void *)uap[2],
383 &rval, AIO_LARGEFILE));
384 break;
385 default:
386 return (EINVAL);
387 }
388
389 rvp->r_val1 = rval;
390 return (error);
391 }
392
393 /*
394 * wake up LWPs in this process that are sleeping in
395 * aiowait().
396 */
397 static int
398 aionotify(void)
399 {
400 aio_t *aiop;
401
402 aiop = curproc->p_aio;
403 if (aiop == NULL)
404 return (0);
405
406 mutex_enter(&aiop->aio_mutex);
407 aiop->aio_notifycnt++;
408 cv_broadcast(&aiop->aio_waitcv);
409 mutex_exit(&aiop->aio_mutex);
410
411 return (0);
412 }
413
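/*
 * Convert a user-supplied struct timeval into a relative timestruc_t.
 * A NULL pointer means block indefinitely, a pointer value of -1 or a
 * zero timeval means don't block at all; otherwise the timeval is
 * validated and returned through rqtp as a relative time.
 */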
414 static int
415 timeval2reltime(struct timeval *timout, timestruc_t *rqtime,
416 timestruc_t **rqtp, int *blocking)
417 {
418 #ifdef _SYSCALL32_IMPL
419 struct timeval32 wait_time_32;
420 #endif
421 struct timeval wait_time;
422 model_t model = get_udatamodel();
423
424 *rqtp = NULL;
425 if (timout == NULL) { /* wait indefinitely */
426 *blocking = 1;
427 return (0);
428 }
429
430 /*
431 * Need to correctly compare with the -1 passed in for a user
432 * address pointer, with both 32 bit and 64 bit apps.
433 */
434 if (model == DATAMODEL_NATIVE) {
435 if ((intptr_t)timout == (intptr_t)-1) { /* don't wait */
436 *blocking = 0;
437 return (0);
438 }
439
440 if (copyin(timout, &wait_time, sizeof (wait_time)))
441 return (EFAULT);
442 }
443 #ifdef _SYSCALL32_IMPL
444 else {
		/*
		 * A -1 from a 32 bit app will not get sign extended;
		 * don't wait if it is -1.
		 */
449 if ((intptr_t)timout == (intptr_t)((uint32_t)-1)) {
450 *blocking = 0;
451 return (0);
452 }
453
454 if (copyin(timout, &wait_time_32, sizeof (wait_time_32)))
455 return (EFAULT);
456 TIMEVAL32_TO_TIMEVAL(&wait_time, &wait_time_32);
457 }
458 #endif /* _SYSCALL32_IMPL */
459
460 if (wait_time.tv_sec == 0 && wait_time.tv_usec == 0) { /* don't wait */
461 *blocking = 0;
462 return (0);
463 }
464
465 if (wait_time.tv_sec < 0 ||
466 wait_time.tv_usec < 0 || wait_time.tv_usec >= MICROSEC)
467 return (EINVAL);
468
469 rqtime->tv_sec = wait_time.tv_sec;
470 rqtime->tv_nsec = wait_time.tv_usec * 1000;
471 *rqtp = rqtime;
472 *blocking = 1;
473
474 return (0);
475 }
476
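/*
 * Same as timeval2reltime(), but for a user-supplied timespec_t:
 * NULL means block indefinitely, a zero timespec means don't block;
 * otherwise the timespec is validated and returned through rqtp as a
 * relative time.
 */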
477 static int
478 timespec2reltime(timespec_t *timout, timestruc_t *rqtime,
479 timestruc_t **rqtp, int *blocking)
480 {
481 #ifdef _SYSCALL32_IMPL
482 timespec32_t wait_time_32;
483 #endif
484 model_t model = get_udatamodel();
485
486 *rqtp = NULL;
487 if (timout == NULL) {
488 *blocking = 1;
489 return (0);
490 }
491
492 if (model == DATAMODEL_NATIVE) {
493 if (copyin(timout, rqtime, sizeof (*rqtime)))
494 return (EFAULT);
495 }
496 #ifdef _SYSCALL32_IMPL
497 else {
498 if (copyin(timout, &wait_time_32, sizeof (wait_time_32)))
499 return (EFAULT);
500 TIMESPEC32_TO_TIMESPEC(rqtime, &wait_time_32);
501 }
502 #endif /* _SYSCALL32_IMPL */
503
504 if (rqtime->tv_sec == 0 && rqtime->tv_nsec == 0) {
505 *blocking = 0;
506 return (0);
507 }
508
509 if (rqtime->tv_sec < 0 ||
510 rqtime->tv_nsec < 0 || rqtime->tv_nsec >= NANOSEC)
511 return (EINVAL);
512
513 *rqtp = rqtime;
514 *blocking = 1;
515
516 return (0);
517 }
518
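/*
 * aiowait() - wait for an outstanding Solaris-style (aioread/aiowrite)
 * request to complete.  The user-level aio_result_t pointer of a
 * completed request is returned through *rval; *rval is set to 1 when
 * the user has results queued at user level (see aionotify()).
 */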
519 /*ARGSUSED*/
520 static int
521 aiowait(
522 struct timeval *timout,
523 int dontblockflg,
524 long *rval)
525 {
526 int error;
527 aio_t *aiop;
528 aio_req_t *reqp;
529 clock_t status;
530 int blocking;
531 int timecheck;
532 timestruc_t rqtime;
533 timestruc_t *rqtp;
534
535 aiop = curproc->p_aio;
536 if (aiop == NULL)
537 return (EINVAL);
538
539 /*
540 * Establish the absolute future time for the timeout.
541 */
542 error = timeval2reltime(timout, &rqtime, &rqtp, &blocking);
543 if (error)
544 return (error);
545 if (rqtp) {
546 timestruc_t now;
547 timecheck = timechanged;
548 gethrestime(&now);
549 timespecadd(rqtp, &now);
550 }
551
552 mutex_enter(&aiop->aio_mutex);
553 for (;;) {
554 /* process requests on poll queue */
555 if (aiop->aio_pollq) {
556 mutex_exit(&aiop->aio_mutex);
557 aio_cleanup(0);
558 mutex_enter(&aiop->aio_mutex);
559 }
560 if ((reqp = aio_req_remove(NULL)) != NULL) {
561 *rval = (long)reqp->aio_req_resultp;
562 break;
563 }
564 /* user-level done queue might not be empty */
565 if (aiop->aio_notifycnt > 0) {
566 aiop->aio_notifycnt--;
567 *rval = 1;
568 break;
569 }
570 /* don't block if no outstanding aio */
571 if (aiop->aio_outstanding == 0 && dontblockflg) {
572 error = EINVAL;
573 break;
574 }
575 if (blocking) {
576 status = cv_waituntil_sig(&aiop->aio_waitcv,
577 &aiop->aio_mutex, rqtp, timecheck);
578
579 if (status > 0) /* check done queue again */
580 continue;
581 if (status == 0) { /* interrupted by a signal */
582 error = EINTR;
583 *rval = -1;
584 } else { /* timer expired */
585 error = ETIME;
586 }
587 }
588 break;
589 }
590 mutex_exit(&aiop->aio_mutex);
591 if (reqp) {
592 aphysio_unlock(reqp);
593 aio_copyout_result(reqp);
594 mutex_enter(&aiop->aio_mutex);
595 aio_req_free(aiop, reqp);
596 mutex_exit(&aiop->aio_mutex);
597 }
598 return (error);
599 }
600
601 /*
602 * aiowaitn can be used to reap completed asynchronous requests submitted with
603 * lio_listio, aio_read or aio_write.
604 * This function only reaps asynchronous raw I/Os.
605 */
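/*
 * The caller passes the number of requests it wants to wait for in
 * *nwait; a value of zero means just collect (up to nent) whatever is
 * already on the done queue without blocking.  On return, *nwait is
 * updated with the number of iocb pointers copied out to uiocb.
 */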
606
607 /*ARGSUSED*/
608 static int
609 aiowaitn(void *uiocb, uint_t nent, uint_t *nwait, timespec_t *timout)
610 {
611 int error = 0;
612 aio_t *aiop;
613 aio_req_t *reqlist = NULL;
614 caddr_t iocblist = NULL; /* array of iocb ptr's */
615 uint_t waitcnt, cnt = 0; /* iocb cnt */
616 size_t iocbsz; /* users iocb size */
617 size_t riocbsz; /* returned iocb size */
618 int iocb_index = 0;
619 model_t model = get_udatamodel();
620 int blocking = 1;
621 int timecheck;
622 timestruc_t rqtime;
623 timestruc_t *rqtp;
624
625 aiop = curproc->p_aio;
626 if (aiop == NULL || nent == 0 || nent > _AIO_LISTIO_MAX)
627 return (EINVAL);
628
629 if (aiop->aio_outstanding == 0)
630 return (EAGAIN);
631
632 if (copyin(nwait, &waitcnt, sizeof (uint_t)))
633 return (EFAULT);
634
635 /* set *nwait to zero, if we must return prematurely */
636 if (copyout(&cnt, nwait, sizeof (uint_t)))
637 return (EFAULT);
638
639 if (waitcnt == 0) {
640 blocking = 0;
641 rqtp = NULL;
642 waitcnt = nent;
643 } else {
644 error = timespec2reltime(timout, &rqtime, &rqtp, &blocking);
645 if (error)
646 return (error);
647 }
648
649 if (model == DATAMODEL_NATIVE)
650 iocbsz = (sizeof (aiocb_t *) * nent);
651 #ifdef _SYSCALL32_IMPL
652 else
653 iocbsz = (sizeof (caddr32_t) * nent);
654 #endif /* _SYSCALL32_IMPL */
655
	/*
	 * Only one aio_waitn call is allowed at a time.
	 * The active aio_waitn will collect all requests
	 * out of the "done" list and, if necessary, it will wait
	 * for some/all pending requests to fulfill the nwait
	 * parameter.
	 * Any second or further aio_waitn call will sleep here
	 * until the active aio_waitn finishes and leaves the kernel.
	 * If the second call does not block (poll), then return
	 * immediately with the error code EAGAIN.
	 * If the second call should block, then sleep here, but
	 * do not touch the timeout.  The timeout starts when this
	 * aio_waitn call becomes active.
	 */
670
671 mutex_enter(&aiop->aio_mutex);
672
673 while (aiop->aio_flags & AIO_WAITN) {
674 if (blocking == 0) {
675 mutex_exit(&aiop->aio_mutex);
676 return (EAGAIN);
677 }
678
679 /* block, no timeout */
680 aiop->aio_flags |= AIO_WAITN_PENDING;
681 if (!cv_wait_sig(&aiop->aio_waitncv, &aiop->aio_mutex)) {
682 mutex_exit(&aiop->aio_mutex);
683 return (EINTR);
684 }
685 }
686
687 /*
688 * Establish the absolute future time for the timeout.
689 */
690 if (rqtp) {
691 timestruc_t now;
692 timecheck = timechanged;
693 gethrestime(&now);
694 timespecadd(rqtp, &now);
695 }
696
697 if (iocbsz > aiop->aio_iocbsz && aiop->aio_iocb != NULL) {
698 kmem_free(aiop->aio_iocb, aiop->aio_iocbsz);
699 aiop->aio_iocb = NULL;
700 }
701
702 if (aiop->aio_iocb == NULL) {
703 iocblist = kmem_zalloc(iocbsz, KM_NOSLEEP);
704 if (iocblist == NULL) {
705 mutex_exit(&aiop->aio_mutex);
706 return (ENOMEM);
707 }
708 aiop->aio_iocb = (aiocb_t **)iocblist;
709 aiop->aio_iocbsz = iocbsz;
710 } else {
711 iocblist = (char *)aiop->aio_iocb;
712 }
713
714 aiop->aio_waitncnt = waitcnt;
715 aiop->aio_flags |= AIO_WAITN;
716
717 for (;;) {
718 /* push requests on poll queue to done queue */
719 if (aiop->aio_pollq) {
720 mutex_exit(&aiop->aio_mutex);
721 aio_cleanup(0);
722 mutex_enter(&aiop->aio_mutex);
723 }
724
725 /* check for requests on done queue */
726 if (aiop->aio_doneq) {
727 cnt += aio_reqlist_concat(aiop, &reqlist, nent - cnt);
728 aiop->aio_waitncnt = waitcnt - cnt;
729 }
730
731 /* user-level done queue might not be empty */
732 if (aiop->aio_notifycnt > 0) {
733 aiop->aio_notifycnt--;
734 error = 0;
735 break;
736 }
737
		/*
		 * If we get here a second time as a result of timer
		 * expiration, reset the error if there are enough
		 * aiocb's to satisfy the request.
		 * We also return if all requests are already done
		 * and we picked up the whole done queue.
		 */
745
746 if ((cnt >= waitcnt) || (cnt > 0 && aiop->aio_pending == 0 &&
747 aiop->aio_doneq == NULL)) {
748 error = 0;
749 break;
750 }
751
752 if ((cnt < waitcnt) && blocking) {
753 int rval = cv_waituntil_sig(&aiop->aio_waitcv,
754 &aiop->aio_mutex, rqtp, timecheck);
755 if (rval > 0)
756 continue;
757 if (rval < 0) {
758 error = ETIME;
759 blocking = 0;
760 continue;
761 }
762 error = EINTR;
763 }
764 break;
765 }
766
767 mutex_exit(&aiop->aio_mutex);
768
769 if (cnt > 0) {
770
771 iocb_index = aio_unlock_requests(iocblist, iocb_index, reqlist,
772 aiop, model);
773
774 if (model == DATAMODEL_NATIVE)
775 riocbsz = (sizeof (aiocb_t *) * cnt);
776 #ifdef _SYSCALL32_IMPL
777 else
778 riocbsz = (sizeof (caddr32_t) * cnt);
779 #endif /* _SYSCALL32_IMPL */
780
781 if (copyout(iocblist, uiocb, riocbsz) ||
782 copyout(&cnt, nwait, sizeof (uint_t)))
783 error = EFAULT;
784 }
785
786 /* check if there is another thread waiting for execution */
787 mutex_enter(&aiop->aio_mutex);
788 aiop->aio_flags &= ~AIO_WAITN;
789 if (aiop->aio_flags & AIO_WAITN_PENDING) {
790 aiop->aio_flags &= ~AIO_WAITN_PENDING;
791 cv_signal(&aiop->aio_waitncv);
792 }
793 mutex_exit(&aiop->aio_mutex);
794
795 return (error);
796 }
797
/*
 * aio_unlock_requests
 * copies out the result of each request as well as the return value.
 * It builds the list of completed asynchronous requests,
 * unlocks the allocated memory ranges and
 * puts the aio request structures back into the free list.
 */
805
806 static int
807 aio_unlock_requests(
808 caddr_t iocblist,
809 int iocb_index,
810 aio_req_t *reqlist,
811 aio_t *aiop,
812 model_t model)
813 {
814 aio_req_t *reqp, *nreqp;
815
816 if (model == DATAMODEL_NATIVE) {
817 for (reqp = reqlist; reqp != NULL; reqp = nreqp) {
818 (((caddr_t *)iocblist)[iocb_index++]) =
819 reqp->aio_req_iocb.iocb;
820 nreqp = reqp->aio_req_next;
821 aphysio_unlock(reqp);
822 aio_copyout_result(reqp);
823 mutex_enter(&aiop->aio_mutex);
824 aio_req_free(aiop, reqp);
825 mutex_exit(&aiop->aio_mutex);
826 }
827 }
828 #ifdef _SYSCALL32_IMPL
829 else {
830 for (reqp = reqlist; reqp != NULL; reqp = nreqp) {
831 ((caddr32_t *)iocblist)[iocb_index++] =
832 reqp->aio_req_iocb.iocb32;
833 nreqp = reqp->aio_req_next;
834 aphysio_unlock(reqp);
835 aio_copyout_result(reqp);
836 mutex_enter(&aiop->aio_mutex);
837 aio_req_free(aiop, reqp);
838 mutex_exit(&aiop->aio_mutex);
839 }
840 }
841 #endif /* _SYSCALL32_IMPL */
842 return (iocb_index);
843 }
844
/*
 * aio_reqlist_concat
 * moves up to "max" elements from the done queue to the reqlist queue and
 * clears the AIO_DONEQ flag on each of them.
 * - the reqlist queue is a simple (singly) linked list
 * - the done queue is a circular, doubly linked list
 */
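/*
 * For example, if the done queue is the circular list A <-> B <-> C
 * and max is 2, then A and B are pushed onto the front of *reqlist
 * (A, B, followed by the previous contents of *reqlist) and the done
 * queue is left as the circular list containing only C; the return
 * value is 2.
 */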
852
853 static int
854 aio_reqlist_concat(aio_t *aiop, aio_req_t **reqlist, int max)
855 {
856 aio_req_t *q2, *q2work, *list;
857 int count = 0;
858
859 list = *reqlist;
860 q2 = aiop->aio_doneq;
861 q2work = q2;
862 while (max-- > 0) {
863 q2work->aio_req_flags &= ~AIO_DONEQ;
864 q2work = q2work->aio_req_next;
865 count++;
866 if (q2work == q2)
867 break;
868 }
869
870 if (q2work == q2) {
		/* all elements of the done queue were taken */
872 q2->aio_req_prev->aio_req_next = list;
873 list = q2;
874 aiop->aio_doneq = NULL;
875 } else {
		/*
		 * max < number of elements in the doneq;
		 * detach only the required number of elements
		 * from the doneq
		 */
881 q2work->aio_req_prev->aio_req_next = list;
882 list = q2;
883
884 aiop->aio_doneq = q2work;
885 q2work->aio_req_prev = q2->aio_req_prev;
886 q2->aio_req_prev->aio_req_next = q2work;
887 }
888 *reqlist = list;
889 return (count);
890 }
891
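/*
 * aiosuspend() - suspend the calling thread until at least one of the
 * asynchronous requests named by the user's aiocb list has completed.
 * Completed requests are removed from the done queue and their results
 * copied out; EAGAIN is returned when nothing qualifies and the caller
 * does not want to block.
 */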
892 /*ARGSUSED*/
893 static int
894 aiosuspend(
895 void *aiocb,
896 int nent,
897 struct timespec *timout,
898 int flag,
899 long *rval,
900 int run_mode)
901 {
902 int error;
903 aio_t *aiop;
904 aio_req_t *reqp, *found, *next;
905 caddr_t cbplist = NULL;
906 aiocb_t *cbp, **ucbp;
907 #ifdef _SYSCALL32_IMPL
908 aiocb32_t *cbp32;
909 caddr32_t *ucbp32;
910 #endif /* _SYSCALL32_IMPL */
911 aiocb64_32_t *cbp64;
912 int rv;
913 int i;
914 size_t ssize;
915 model_t model = get_udatamodel();
916 int blocking;
917 int timecheck;
918 timestruc_t rqtime;
919 timestruc_t *rqtp;
920
921 aiop = curproc->p_aio;
922 if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
923 return (EINVAL);
924
925 /*
926 * Establish the absolute future time for the timeout.
927 */
928 error = timespec2reltime(timout, &rqtime, &rqtp, &blocking);
929 if (error)
930 return (error);
931 if (rqtp) {
932 timestruc_t now;
933 timecheck = timechanged;
934 gethrestime(&now);
935 timespecadd(rqtp, &now);
936 }
937
	/*
	 * If we are not blocking and there is no completed I/O,
	 * skip the aiocb copyin.
	 */
942 if (!blocking && (aiop->aio_pollq == NULL) &&
943 (aiop->aio_doneq == NULL)) {
944 return (EAGAIN);
945 }
946
947 if (model == DATAMODEL_NATIVE)
948 ssize = (sizeof (aiocb_t *) * nent);
949 #ifdef _SYSCALL32_IMPL
950 else
951 ssize = (sizeof (caddr32_t) * nent);
952 #endif /* _SYSCALL32_IMPL */
953
954 cbplist = kmem_alloc(ssize, KM_NOSLEEP);
955 if (cbplist == NULL)
956 return (ENOMEM);
957
958 if (copyin(aiocb, cbplist, ssize)) {
959 error = EFAULT;
960 goto done;
961 }
962
963 found = NULL;
964 /*
965 * we need to get the aio_cleanupq_mutex since we call
966 * aio_req_done().
967 */
968 mutex_enter(&aiop->aio_cleanupq_mutex);
969 mutex_enter(&aiop->aio_mutex);
970 for (;;) {
971 /* push requests on poll queue to done queue */
972 if (aiop->aio_pollq) {
973 mutex_exit(&aiop->aio_mutex);
974 mutex_exit(&aiop->aio_cleanupq_mutex);
975 aio_cleanup(0);
976 mutex_enter(&aiop->aio_cleanupq_mutex);
977 mutex_enter(&aiop->aio_mutex);
978 }
979 /* check for requests on done queue */
980 if (aiop->aio_doneq) {
981 if (model == DATAMODEL_NATIVE)
982 ucbp = (aiocb_t **)cbplist;
983 #ifdef _SYSCALL32_IMPL
984 else
985 ucbp32 = (caddr32_t *)cbplist;
986 #endif /* _SYSCALL32_IMPL */
987 for (i = 0; i < nent; i++) {
988 if (model == DATAMODEL_NATIVE) {
989 if ((cbp = *ucbp++) == NULL)
990 continue;
991 if (run_mode != AIO_LARGEFILE)
992 reqp = aio_req_done(
993 &cbp->aio_resultp);
994 else {
995 cbp64 = (aiocb64_32_t *)cbp;
996 reqp = aio_req_done(
997 &cbp64->aio_resultp);
998 }
999 }
1000 #ifdef _SYSCALL32_IMPL
1001 else {
1002 if (run_mode == AIO_32) {
1003 if ((cbp32 =
1004 (aiocb32_t *)(uintptr_t)
1005 *ucbp32++) == NULL)
1006 continue;
1007 reqp = aio_req_done(
1008 &cbp32->aio_resultp);
1009 } else if (run_mode == AIO_LARGEFILE) {
1010 if ((cbp64 =
1011 (aiocb64_32_t *)(uintptr_t)
1012 *ucbp32++) == NULL)
1013 continue;
1014 reqp = aio_req_done(
1015 &cbp64->aio_resultp);
1016 }
1017
1018 }
1019 #endif /* _SYSCALL32_IMPL */
1020 if (reqp) {
1021 reqp->aio_req_next = found;
1022 found = reqp;
1023 }
1024 if (aiop->aio_doneq == NULL)
1025 break;
1026 }
1027 if (found)
1028 break;
1029 }
1030 if (aiop->aio_notifycnt > 0) {
1031 /*
1032 * nothing on the kernel's queue. the user
1033 * has notified the kernel that it has items
1034 * on a user-level queue.
1035 */
1036 aiop->aio_notifycnt--;
1037 *rval = 1;
1038 error = 0;
1039 break;
1040 }
1041 /* don't block if nothing is outstanding */
1042 if (aiop->aio_outstanding == 0) {
1043 error = EAGAIN;
1044 break;
1045 }
1046 if (blocking) {
1047 /*
1048 * drop the aio_cleanupq_mutex as we are
1049 * going to block.
1050 */
1051 mutex_exit(&aiop->aio_cleanupq_mutex);
1052 rv = cv_waituntil_sig(&aiop->aio_waitcv,
1053 &aiop->aio_mutex, rqtp, timecheck);
1054 /*
1055 * we have to drop aio_mutex and
1056 * grab it in the right order.
1057 */
1058 mutex_exit(&aiop->aio_mutex);
1059 mutex_enter(&aiop->aio_cleanupq_mutex);
1060 mutex_enter(&aiop->aio_mutex);
1061 if (rv > 0) /* check done queue again */
1062 continue;
1063 if (rv == 0) /* interrupted by a signal */
1064 error = EINTR;
1065 else /* timer expired */
1066 error = ETIME;
1067 } else {
1068 error = EAGAIN;
1069 }
1070 break;
1071 }
1072 mutex_exit(&aiop->aio_mutex);
1073 mutex_exit(&aiop->aio_cleanupq_mutex);
1074 for (reqp = found; reqp != NULL; reqp = next) {
1075 next = reqp->aio_req_next;
1076 aphysio_unlock(reqp);
1077 aio_copyout_result(reqp);
1078 mutex_enter(&aiop->aio_mutex);
1079 aio_req_free(aiop, reqp);
1080 mutex_exit(&aiop->aio_mutex);
1081 }
1082 done:
1083 kmem_free(cbplist, ssize);
1084 return (error);
1085 }
1086
1087 /*
1088 * initialize aio by allocating an aio_t struct for this
1089 * process.
1090 */
1091 static int
1092 aioinit(void)
1093 {
1094 proc_t *p = curproc;
1095 aio_t *aiop;
1096 mutex_enter(&p->p_lock);
1097 if ((aiop = p->p_aio) == NULL) {
1098 aiop = aio_aiop_alloc();
1099 p->p_aio = aiop;
1100 }
1101 mutex_exit(&p->p_lock);
1102 if (aiop == NULL)
1103 return (ENOMEM);
1104 return (0);
1105 }
1106
/*
 * start a special thread that will clean up after aio requests
 * that are preventing a segment from being unmapped.  as_unmap()
 * blocks until all physio to this segment is completed.  this
 * doesn't happen until all the pages in this segment are no longer
 * SOFTLOCKed.  Some pages will be SOFTLOCKed when there are aio
 * requests still outstanding.  this special thread will make sure
 * that these SOFTLOCKed pages will eventually be SOFTUNLOCKed.
 *
 * this function will return an error if the process has only
 * one LWP.  the assumption is that the caller is a separate LWP
 * that remains blocked in the kernel for the life of this process.
 */
1120 static int
1121 aiostart(void)
1122 {
1123 proc_t *p = curproc;
1124 aio_t *aiop;
1125 int first, error = 0;
1126
1127 if (p->p_lwpcnt == 1)
1128 return (EDEADLK);
1129 mutex_enter(&p->p_lock);
1130 if ((aiop = p->p_aio) == NULL)
1131 error = EINVAL;
1132 else {
1133 first = aiop->aio_ok;
1134 if (aiop->aio_ok == 0)
1135 aiop->aio_ok = 1;
1136 }
1137 mutex_exit(&p->p_lock);
1138 if (error == 0 && first == 0) {
1139 return (aio_cleanup_thread(aiop));
1140 /* should return only to exit */
1141 }
1142 return (error);
1143 }
1144
1145 /*
1146 * Associate an aiocb with a port.
1147 * This function is used by aiorw() to associate a transaction with a port.
1148 * Allocate an event port structure (port_alloc_event()) and store the
1149 * delivered user pointer (portnfy_user) in the portkev_user field of the
 * port_kevent_t structure.
1151 * The aio_req_portkev pointer in the aio_req_t structure was added to identify
1152 * the port association.
1153 */
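/*
 * Purely illustrative (this sketch is not part of this file and the
 * names fd, buf, bufsize and my_cookie are made up): a user process
 * that wants completion events delivered to an event port would
 * typically do something like
 *
 *	int port = port_create();
 *	port_notify_t pn;
 *	struct aiocb cb;
 *	port_event_t pe;
 *
 *	pn.portnfy_port = port;
 *	pn.portnfy_user = my_cookie;
 *	bzero(&cb, sizeof (cb));
 *	cb.aio_fildes = fd;
 *	cb.aio_buf = buf;
 *	cb.aio_nbytes = bufsize;
 *	cb.aio_sigevent.sigev_notify = SIGEV_PORT;
 *	cb.aio_sigevent.sigev_value.sival_ptr = &pn;
 *	(void) aio_read(&cb);
 *	(void) port_get(port, &pe, NULL);
 *
 * and the retrieved event carries my_cookie in pe.portev_user and the
 * address of cb in pe.portev_object.
 */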
1154
1155 static int
1156 aio_req_assoc_port_rw(port_notify_t *pntfy, aiocb_t *cbp,
1157 aio_req_t *reqp, int event)
1158 {
1159 port_kevent_t *pkevp = NULL;
1160 int error;
1161
1162 error = port_alloc_event(pntfy->portnfy_port, PORT_ALLOC_DEFAULT,
1163 PORT_SOURCE_AIO, &pkevp);
1164 if (error) {
1165 if ((error == ENOMEM) || (error == EAGAIN))
1166 error = EAGAIN;
1167 else
1168 error = EINVAL;
1169 } else {
1170 port_init_event(pkevp, (uintptr_t)cbp, pntfy->portnfy_user,
1171 aio_port_callback, reqp);
1172 pkevp->portkev_events = event;
1173 reqp->aio_req_portkev = pkevp;
1174 reqp->aio_req_port = pntfy->portnfy_port;
1175 }
1176 return (error);
1177 }
1178
1179 #ifdef _LP64
1180
1181 /*
1182 * Asynchronous list IO. A chain of aiocb's are copied in
1183 * one at a time. If the aiocb is invalid, it is skipped.
1184 * For each aiocb, the appropriate driver entry point is
1185 * called. Optimize for the common case where the list
1186 * of requests is to the same file descriptor.
1187 *
1188 * One possible optimization is to define a new driver entry
1189 * point that supports a list of IO requests. Whether this
1190 * improves performance depends somewhat on the driver's
1191 * locking strategy. Processing a list could adversely impact
1192 * the driver's interrupt latency.
1193 */
1194 static int
1195 alio(
1196 int mode_arg,
1197 aiocb_t **aiocb_arg,
1198 int nent,
1199 struct sigevent *sigev)
1200 {
1201 file_t *fp;
1202 file_t *prev_fp = NULL;
1203 int prev_mode = -1;
1204 struct vnode *vp;
1205 aio_lio_t *head;
1206 aio_req_t *reqp;
1207 aio_t *aiop;
1208 caddr_t cbplist;
1209 aiocb_t cb;
1210 aiocb_t *aiocb = &cb;
1211 aiocb_t *cbp;
1212 aiocb_t **ucbp;
1213 struct sigevent sigevk;
1214 sigqueue_t *sqp;
1215 int (*aio_func)();
1216 int mode;
1217 int error = 0;
1218 int aio_errors = 0;
1219 int i;
1220 size_t ssize;
1221 int deadhead = 0;
1222 int aio_notsupported = 0;
1223 int lio_head_port;
1224 int aio_port;
1225 int aio_thread;
1226 port_kevent_t *pkevtp = NULL;
1227 int portused = 0;
1228 port_notify_t pnotify;
1229 int event;
1230
1231 aiop = curproc->p_aio;
1232 if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
1233 return (EINVAL);
1234
1235 ssize = (sizeof (aiocb_t *) * nent);
1236 cbplist = kmem_alloc(ssize, KM_SLEEP);
1237 ucbp = (aiocb_t **)cbplist;
1238
1239 if (copyin(aiocb_arg, cbplist, ssize) ||
1240 (sigev && copyin(sigev, &sigevk, sizeof (struct sigevent)))) {
1241 kmem_free(cbplist, ssize);
1242 return (EFAULT);
1243 }
1244
1245 /* Event Ports */
1246 if (sigev &&
1247 (sigevk.sigev_notify == SIGEV_THREAD ||
1248 sigevk.sigev_notify == SIGEV_PORT)) {
1249 if (sigevk.sigev_notify == SIGEV_THREAD) {
1250 pnotify.portnfy_port = sigevk.sigev_signo;
1251 pnotify.portnfy_user = sigevk.sigev_value.sival_ptr;
1252 } else if (copyin(sigevk.sigev_value.sival_ptr,
1253 &pnotify, sizeof (pnotify))) {
1254 kmem_free(cbplist, ssize);
1255 return (EFAULT);
1256 }
1257 error = port_alloc_event(pnotify.portnfy_port,
1258 PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, &pkevtp);
1259 if (error) {
1260 if (error == ENOMEM || error == EAGAIN)
1261 error = EAGAIN;
1262 else
1263 error = EINVAL;
1264 kmem_free(cbplist, ssize);
1265 return (error);
1266 }
1267 lio_head_port = pnotify.portnfy_port;
1268 portused = 1;
1269 }
1270
1271 /*
1272 * a list head should be allocated if notification is
1273 * enabled for this list.
1274 */
1275 head = NULL;
1276
1277 if (mode_arg == LIO_WAIT || sigev) {
1278 mutex_enter(&aiop->aio_mutex);
1279 error = aio_lio_alloc(&head);
1280 mutex_exit(&aiop->aio_mutex);
1281 if (error)
1282 goto done;
1283 deadhead = 1;
1284 head->lio_nent = nent;
1285 head->lio_refcnt = nent;
1286 head->lio_port = -1;
1287 head->lio_portkev = NULL;
1288 if (sigev && sigevk.sigev_notify == SIGEV_SIGNAL &&
1289 sigevk.sigev_signo > 0 && sigevk.sigev_signo < NSIG) {
1290 sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
1291 if (sqp == NULL) {
1292 error = EAGAIN;
1293 goto done;
1294 }
1295 sqp->sq_func = NULL;
1296 sqp->sq_next = NULL;
1297 sqp->sq_info.si_code = SI_ASYNCIO;
1298 sqp->sq_info.si_pid = curproc->p_pid;
1299 sqp->sq_info.si_ctid = PRCTID(curproc);
1300 sqp->sq_info.si_zoneid = getzoneid();
1301 sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
1302 sqp->sq_info.si_signo = sigevk.sigev_signo;
1303 sqp->sq_info.si_value = sigevk.sigev_value;
1304 head->lio_sigqp = sqp;
1305 } else {
1306 head->lio_sigqp = NULL;
1307 }
1308 if (pkevtp) {
1309 /*
1310 * Prepare data to send when list of aiocb's
1311 * has completed.
1312 */
1313 port_init_event(pkevtp, (uintptr_t)sigev,
1314 (void *)(uintptr_t)pnotify.portnfy_user,
1315 NULL, head);
1316 pkevtp->portkev_events = AIOLIO;
1317 head->lio_portkev = pkevtp;
1318 head->lio_port = pnotify.portnfy_port;
1319 }
1320 }
1321
1322 for (i = 0; i < nent; i++, ucbp++) {
1323
1324 cbp = *ucbp;
1325 /* skip entry if it can't be copied. */
1326 if (cbp == NULL || copyin(cbp, aiocb, sizeof (*aiocb))) {
1327 if (head) {
1328 mutex_enter(&aiop->aio_mutex);
1329 head->lio_nent--;
1330 head->lio_refcnt--;
1331 mutex_exit(&aiop->aio_mutex);
1332 }
1333 continue;
1334 }
1335
1336 /* skip if opcode for aiocb is LIO_NOP */
1337 mode = aiocb->aio_lio_opcode;
1338 if (mode == LIO_NOP) {
1339 cbp = NULL;
1340 if (head) {
1341 mutex_enter(&aiop->aio_mutex);
1342 head->lio_nent--;
1343 head->lio_refcnt--;
1344 mutex_exit(&aiop->aio_mutex);
1345 }
1346 continue;
1347 }
1348
1349 /* increment file descriptor's ref count. */
1350 if ((fp = getf(aiocb->aio_fildes)) == NULL) {
1351 lio_set_uerror(&cbp->aio_resultp, EBADF);
1352 if (head) {
1353 mutex_enter(&aiop->aio_mutex);
1354 head->lio_nent--;
1355 head->lio_refcnt--;
1356 mutex_exit(&aiop->aio_mutex);
1357 }
1358 aio_errors++;
1359 continue;
1360 }
1361
		/*
		 * verify that the file was opened with the
		 * required access mode
		 */
1365 if ((fp->f_flag & mode) == 0) {
1366 releasef(aiocb->aio_fildes);
1367 lio_set_uerror(&cbp->aio_resultp, EBADF);
1368 if (head) {
1369 mutex_enter(&aiop->aio_mutex);
1370 head->lio_nent--;
1371 head->lio_refcnt--;
1372 mutex_exit(&aiop->aio_mutex);
1373 }
1374 aio_errors++;
1375 continue;
1376 }
1377
		/*
		 * optimize the common case where consecutive requests
		 * are to the same fd for the same r/w operation.
		 * if the vnode has no async entry point (e.g. UFS),
		 * EBADFD is set for this entry.
		 */
1383 vp = fp->f_vnode;
1384 if (fp != prev_fp || mode != prev_mode) {
1385 aio_func = check_vp(vp, mode);
1386 if (aio_func == NULL) {
1387 prev_fp = NULL;
1388 releasef(aiocb->aio_fildes);
1389 lio_set_uerror(&cbp->aio_resultp, EBADFD);
1390 aio_notsupported++;
1391 if (head) {
1392 mutex_enter(&aiop->aio_mutex);
1393 head->lio_nent--;
1394 head->lio_refcnt--;
1395 mutex_exit(&aiop->aio_mutex);
1396 }
1397 continue;
1398 } else {
1399 prev_fp = fp;
1400 prev_mode = mode;
1401 }
1402 }
1403
1404 error = aio_req_setup(&reqp, aiop, aiocb,
1405 &cbp->aio_resultp, vp, 0);
1406 if (error) {
1407 releasef(aiocb->aio_fildes);
1408 lio_set_uerror(&cbp->aio_resultp, error);
1409 if (head) {
1410 mutex_enter(&aiop->aio_mutex);
1411 head->lio_nent--;
1412 head->lio_refcnt--;
1413 mutex_exit(&aiop->aio_mutex);
1414 }
1415 aio_errors++;
1416 continue;
1417 }
1418
1419 reqp->aio_req_lio = head;
1420 deadhead = 0;
1421
1422 /*
1423 * Set the errno field now before sending the request to
1424 * the driver to avoid a race condition
1425 */
1426 (void) suword32(&cbp->aio_resultp.aio_errno,
1427 EINPROGRESS);
1428
1429 reqp->aio_req_iocb.iocb = (caddr_t)cbp;
1430
1431 event = (mode == LIO_READ)? AIOAREAD : AIOAWRITE;
1432 aio_port = (aiocb->aio_sigevent.sigev_notify == SIGEV_PORT);
1433 aio_thread = (aiocb->aio_sigevent.sigev_notify == SIGEV_THREAD);
1434 if (aio_port | aio_thread) {
1435 port_kevent_t *lpkevp;
1436 /*
1437 * Prepare data to send with each aiocb completed.
1438 */
1439 if (aio_port) {
1440 void *paddr =
1441 aiocb->aio_sigevent.sigev_value.sival_ptr;
1442 if (copyin(paddr, &pnotify, sizeof (pnotify)))
1443 error = EFAULT;
1444 } else { /* aio_thread */
1445 pnotify.portnfy_port =
1446 aiocb->aio_sigevent.sigev_signo;
1447 pnotify.portnfy_user =
1448 aiocb->aio_sigevent.sigev_value.sival_ptr;
1449 }
1450 if (error)
1451 /* EMPTY */;
1452 else if (pkevtp != NULL &&
1453 pnotify.portnfy_port == lio_head_port)
1454 error = port_dup_event(pkevtp, &lpkevp,
1455 PORT_ALLOC_DEFAULT);
1456 else
1457 error = port_alloc_event(pnotify.portnfy_port,
1458 PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO,
1459 &lpkevp);
1460 if (error == 0) {
1461 port_init_event(lpkevp, (uintptr_t)cbp,
1462 (void *)(uintptr_t)pnotify.portnfy_user,
1463 aio_port_callback, reqp);
1464 lpkevp->portkev_events = event;
1465 reqp->aio_req_portkev = lpkevp;
1466 reqp->aio_req_port = pnotify.portnfy_port;
1467 }
1468 }
1469
1470 /*
1471 * send the request to driver.
1472 */
1473 if (error == 0) {
1474 if (aiocb->aio_nbytes == 0) {
1475 clear_active_fd(aiocb->aio_fildes);
1476 aio_zerolen(reqp);
1477 continue;
1478 }
1479 error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req,
1480 CRED());
1481 }
1482
1483 /*
1484 * the fd's ref count is not decremented until the IO has
1485 * completed unless there was an error.
1486 */
1487 if (error) {
1488 releasef(aiocb->aio_fildes);
1489 lio_set_uerror(&cbp->aio_resultp, error);
1490 if (head) {
1491 mutex_enter(&aiop->aio_mutex);
1492 head->lio_nent--;
1493 head->lio_refcnt--;
1494 mutex_exit(&aiop->aio_mutex);
1495 }
1496 if (error == ENOTSUP)
1497 aio_notsupported++;
1498 else
1499 aio_errors++;
1500 lio_set_error(reqp, portused);
1501 } else {
1502 clear_active_fd(aiocb->aio_fildes);
1503 }
1504 }
1505
1506 if (aio_notsupported) {
1507 error = ENOTSUP;
1508 } else if (aio_errors) {
1509 /*
1510 * return EIO if any request failed
1511 */
1512 error = EIO;
1513 }
1514
1515 if (mode_arg == LIO_WAIT) {
1516 mutex_enter(&aiop->aio_mutex);
1517 while (head->lio_refcnt > 0) {
1518 if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
1519 mutex_exit(&aiop->aio_mutex);
1520 error = EINTR;
1521 goto done;
1522 }
1523 }
1524 mutex_exit(&aiop->aio_mutex);
1525 alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_64);
1526 }
1527
1528 done:
1529 kmem_free(cbplist, ssize);
1530 if (deadhead) {
1531 if (head->lio_sigqp)
1532 kmem_free(head->lio_sigqp, sizeof (sigqueue_t));
1533 if (head->lio_portkev)
1534 port_free_event(head->lio_portkev);
1535 kmem_free(head, sizeof (aio_lio_t));
1536 }
1537 return (error);
1538 }
1539
1540 #endif /* _LP64 */
1541
/*
 * Asynchronous list IO.
 * If list I/O is called with LIO_WAIT it can still return
 * before all the I/Os are completed if a signal is caught
 * or if the list includes UFS I/O requests.  If this happens,
 * libaio will call aliowait() to wait for the I/Os to
 * complete.
 */
1550 /*ARGSUSED*/
1551 static int
1552 aliowait(
1553 int mode,
1554 void *aiocb,
1555 int nent,
1556 void *sigev,
1557 int run_mode)
1558 {
1559 aio_lio_t *head;
1560 aio_t *aiop;
1561 caddr_t cbplist;
1562 aiocb_t *cbp, **ucbp;
1563 #ifdef _SYSCALL32_IMPL
1564 aiocb32_t *cbp32;
1565 caddr32_t *ucbp32;
1566 aiocb64_32_t *cbp64;
1567 #endif
1568 int error = 0;
1569 int i;
1570 size_t ssize = 0;
1571 model_t model = get_udatamodel();
1572
1573 aiop = curproc->p_aio;
1574 if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
1575 return (EINVAL);
1576
1577 if (model == DATAMODEL_NATIVE)
1578 ssize = (sizeof (aiocb_t *) * nent);
1579 #ifdef _SYSCALL32_IMPL
1580 else
1581 ssize = (sizeof (caddr32_t) * nent);
1582 #endif /* _SYSCALL32_IMPL */
1583
1584 if (ssize == 0)
1585 return (EINVAL);
1586
1587 cbplist = kmem_alloc(ssize, KM_SLEEP);
1588
1589 if (model == DATAMODEL_NATIVE)
1590 ucbp = (aiocb_t **)cbplist;
1591 #ifdef _SYSCALL32_IMPL
1592 else
1593 ucbp32 = (caddr32_t *)cbplist;
1594 #endif /* _SYSCALL32_IMPL */
1595
1596 if (copyin(aiocb, cbplist, ssize)) {
1597 error = EFAULT;
1598 goto done;
1599 }
1600
	/*
	 * To find the list head, we go through the
	 * list of aiocb structs, find the request
	 * it is for, then get the list head that reqp
	 * points to.
	 */
1607 head = NULL;
1608
1609 for (i = 0; i < nent; i++) {
1610 if (model == DATAMODEL_NATIVE) {
			/*
			 * Since we are only checking for a NULL pointer,
			 * the following works for both the native aiocb
			 * and the largefile aiocb.
			 */
1616 if ((cbp = *ucbp++) == NULL)
1617 continue;
1618 if (run_mode != AIO_LARGEFILE)
1619 if (head = aio_list_get(&cbp->aio_resultp))
1620 break;
1621 else {
				/*
				 * This is the case where a largefile call
				 * is made on a 32 bit kernel.
				 * Treat each pointer as a pointer to an
				 * aiocb64_32.
				 */
1628 if (head = aio_list_get((aio_result_t *)
1629 &(((aiocb64_32_t *)cbp)->aio_resultp)))
1630 break;
1631 }
1632 }
1633 #ifdef _SYSCALL32_IMPL
1634 else {
1635 if (run_mode == AIO_LARGEFILE) {
1636 if ((cbp64 = (aiocb64_32_t *)
1637 (uintptr_t)*ucbp32++) == NULL)
1638 continue;
1639 if (head = aio_list_get((aio_result_t *)
1640 &cbp64->aio_resultp))
1641 break;
1642 } else if (run_mode == AIO_32) {
1643 if ((cbp32 = (aiocb32_t *)
1644 (uintptr_t)*ucbp32++) == NULL)
1645 continue;
1646 if (head = aio_list_get((aio_result_t *)
1647 &cbp32->aio_resultp))
1648 break;
1649 }
1650 }
1651 #endif /* _SYSCALL32_IMPL */
1652 }
1653
1654 if (head == NULL) {
1655 error = EINVAL;
1656 goto done;
1657 }
1658
1659 mutex_enter(&aiop->aio_mutex);
1660 while (head->lio_refcnt > 0) {
1661 if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
1662 mutex_exit(&aiop->aio_mutex);
1663 error = EINTR;
1664 goto done;
1665 }
1666 }
1667 mutex_exit(&aiop->aio_mutex);
1668 alio_cleanup(aiop, (aiocb_t **)cbplist, nent, run_mode);
1669 done:
1670 kmem_free(cbplist, ssize);
1671 return (error);
1672 }
1673
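/*
 * aio_list_get() - look up the request identified by a user-level
 * result pointer in the per-process hash and return the list head
 * (aio_lio_t) that it belongs to, or NULL if there is none.
 */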
1674 aio_lio_t *
1675 aio_list_get(aio_result_t *resultp)
1676 {
1677 aio_lio_t *head = NULL;
1678 aio_t *aiop;
1679 aio_req_t **bucket;
1680 aio_req_t *reqp;
1681 long index;
1682
1683 aiop = curproc->p_aio;
1684 if (aiop == NULL)
1685 return (NULL);
1686
1687 if (resultp) {
1688 index = AIO_HASH(resultp);
1689 bucket = &aiop->aio_hash[index];
1690 for (reqp = *bucket; reqp != NULL;
1691 reqp = reqp->aio_hash_next) {
1692 if (reqp->aio_req_resultp == resultp) {
1693 head = reqp->aio_req_lio;
1694 return (head);
1695 }
1696 }
1697 }
1698 return (NULL);
1699 }
1700
1701
1702 static void
1703 lio_set_uerror(void *resultp, int error)
1704 {
1705 /*
1706 * the resultp field is a pointer to where the
1707 * error should be written out to the user's
1708 * aiocb.
1709 *
1710 */
1711 if (get_udatamodel() == DATAMODEL_NATIVE) {
1712 (void) sulword(&((aio_result_t *)resultp)->aio_return,
1713 (ssize_t)-1);
1714 (void) suword32(&((aio_result_t *)resultp)->aio_errno, error);
1715 }
1716 #ifdef _SYSCALL32_IMPL
1717 else {
1718 (void) suword32(&((aio_result32_t *)resultp)->aio_return,
1719 (uint_t)-1);
1720 (void) suword32(&((aio_result32_t *)resultp)->aio_errno, error);
1721 }
1722 #endif /* _SYSCALL32_IMPL */
1723 }
1724
/*
 * do cleanup completion for all requests in the list.  memory for
 * each request is also freed.
 */
1729 static void
1730 alio_cleanup(aio_t *aiop, aiocb_t **cbp, int nent, int run_mode)
1731 {
1732 int i;
1733 aio_req_t *reqp;
1734 aio_result_t *resultp;
1735 aiocb64_32_t *aiocb_64;
1736
1737 for (i = 0; i < nent; i++) {
1738 if (get_udatamodel() == DATAMODEL_NATIVE) {
1739 if (cbp[i] == NULL)
1740 continue;
1741 if (run_mode == AIO_LARGEFILE) {
1742 aiocb_64 = (aiocb64_32_t *)cbp[i];
1743 resultp = (aio_result_t *)
1744 &aiocb_64->aio_resultp;
1745 } else
1746 resultp = &cbp[i]->aio_resultp;
1747 }
1748 #ifdef _SYSCALL32_IMPL
1749 else {
1750 aiocb32_t *aiocb_32;
1751 caddr32_t *cbp32;
1752
1753 cbp32 = (caddr32_t *)cbp;
1754 if (cbp32[i] == NULL)
1755 continue;
1756 if (run_mode == AIO_32) {
1757 aiocb_32 = (aiocb32_t *)(uintptr_t)cbp32[i];
1758 resultp = (aio_result_t *)&aiocb_32->
1759 aio_resultp;
1760 } else if (run_mode == AIO_LARGEFILE) {
1761 aiocb_64 = (aiocb64_32_t *)(uintptr_t)cbp32[i];
1762 resultp = (aio_result_t *)&aiocb_64->
1763 aio_resultp;
1764 }
1765 }
1766 #endif /* _SYSCALL32_IMPL */
1767 /*
1768 * we need to get the aio_cleanupq_mutex since we call
1769 * aio_req_done().
1770 */
1771 mutex_enter(&aiop->aio_cleanupq_mutex);
1772 mutex_enter(&aiop->aio_mutex);
1773 reqp = aio_req_done(resultp);
1774 mutex_exit(&aiop->aio_mutex);
1775 mutex_exit(&aiop->aio_cleanupq_mutex);
1776 if (reqp != NULL) {
1777 aphysio_unlock(reqp);
1778 aio_copyout_result(reqp);
1779 mutex_enter(&aiop->aio_mutex);
1780 aio_req_free(aiop, reqp);
1781 mutex_exit(&aiop->aio_mutex);
1782 }
1783 }
1784 }
1785
1786 /*
1787 * Write out the results for an aio request that is done.
1788 */
1789 static int
1790 aioerror(void *cb, int run_mode)
1791 {
1792 aio_result_t *resultp;
1793 aio_t *aiop;
1794 aio_req_t *reqp;
1795 int retval;
1796
1797 aiop = curproc->p_aio;
1798 if (aiop == NULL || cb == NULL)
1799 return (EINVAL);
1800
1801 if (get_udatamodel() == DATAMODEL_NATIVE) {
1802 if (run_mode == AIO_LARGEFILE)
1803 resultp = (aio_result_t *)&((aiocb64_32_t *)cb)->
1804 aio_resultp;
1805 else
1806 resultp = &((aiocb_t *)cb)->aio_resultp;
1807 }
1808 #ifdef _SYSCALL32_IMPL
1809 else {
1810 if (run_mode == AIO_LARGEFILE)
1811 resultp = (aio_result_t *)&((aiocb64_32_t *)cb)->
1812 aio_resultp;
1813 else if (run_mode == AIO_32)
1814 resultp = (aio_result_t *)&((aiocb32_t *)cb)->
1815 aio_resultp;
1816 }
1817 #endif /* _SYSCALL32_IMPL */
1818 /*
1819 * we need to get the aio_cleanupq_mutex since we call
1820 * aio_req_find().
1821 */
1822 mutex_enter(&aiop->aio_cleanupq_mutex);
1823 mutex_enter(&aiop->aio_mutex);
1824 retval = aio_req_find(resultp, &reqp);
1825 mutex_exit(&aiop->aio_mutex);
1826 mutex_exit(&aiop->aio_cleanupq_mutex);
1827 if (retval == 0) {
1828 aphysio_unlock(reqp);
1829 aio_copyout_result(reqp);
1830 mutex_enter(&aiop->aio_mutex);
1831 aio_req_free(aiop, reqp);
1832 mutex_exit(&aiop->aio_mutex);
1833 return (0);
1834 } else if (retval == 1)
1835 return (EINPROGRESS);
1836 else if (retval == 2)
1837 return (EINVAL);
1838 return (0);
1839 }
1840
/*
 * aio_cancel - the kernel does not actually cancel outstanding raw I/O:
 *	if no matching requests are still pending,
 *		return AIO_ALLDONE
 *	else
 *		return AIO_NOTCANCELED
 */
1847 static int
1848 aio_cancel(
1849 int fildes,
1850 void *cb,
1851 long *rval,
1852 int run_mode)
1853 {
1854 aio_t *aiop;
1855 void *resultp;
1856 int index;
1857 aio_req_t **bucket;
1858 aio_req_t *ent;
1859
1860
1861 /*
1862 * Verify valid file descriptor
1863 */
1864 if ((getf(fildes)) == NULL) {
1865 return (EBADF);
1866 }
1867 releasef(fildes);
1868
1869 aiop = curproc->p_aio;
1870 if (aiop == NULL)
1871 return (EINVAL);
1872
1873 if (aiop->aio_outstanding == 0) {
1874 *rval = AIO_ALLDONE;
1875 return (0);
1876 }
1877
1878 mutex_enter(&aiop->aio_mutex);
1879 if (cb != NULL) {
1880 if (get_udatamodel() == DATAMODEL_NATIVE) {
1881 if (run_mode == AIO_LARGEFILE)
1882 resultp = (aio_result_t *)&((aiocb64_32_t *)cb)
1883 ->aio_resultp;
1884 else
1885 resultp = &((aiocb_t *)cb)->aio_resultp;
1886 }
1887 #ifdef _SYSCALL32_IMPL
1888 else {
1889 if (run_mode == AIO_LARGEFILE)
1890 resultp = (aio_result_t *)&((aiocb64_32_t *)cb)
1891 ->aio_resultp;
1892 else if (run_mode == AIO_32)
1893 resultp = (aio_result_t *)&((aiocb32_t *)cb)
1894 ->aio_resultp;
1895 }
1896 #endif /* _SYSCALL32_IMPL */
1897 index = AIO_HASH(resultp);
1898 bucket = &aiop->aio_hash[index];
1899 for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) {
1900 if (ent->aio_req_resultp == resultp) {
1901 if ((ent->aio_req_flags & AIO_PENDING) == 0) {
1902 mutex_exit(&aiop->aio_mutex);
1903 *rval = AIO_ALLDONE;
1904 return (0);
1905 }
1906 mutex_exit(&aiop->aio_mutex);
1907 *rval = AIO_NOTCANCELED;
1908 return (0);
1909 }
1910 }
1911 mutex_exit(&aiop->aio_mutex);
1912 *rval = AIO_ALLDONE;
1913 return (0);
1914 }
1915
1916 for (index = 0; index < AIO_HASHSZ; index++) {
1917 bucket = &aiop->aio_hash[index];
1918 for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) {
1919 if (ent->aio_req_fd == fildes) {
1920 if ((ent->aio_req_flags & AIO_PENDING) != 0) {
1921 mutex_exit(&aiop->aio_mutex);
1922 *rval = AIO_NOTCANCELED;
1923 return (0);
1924 }
1925 }
1926 }
1927 }
1928 mutex_exit(&aiop->aio_mutex);
1929 *rval = AIO_ALLDONE;
1930 return (0);
1931 }
1932
1933 /*
1934 * solaris version of asynchronous read and write
1935 */
1936 static int
1937 arw(
1938 int opcode,
1939 int fdes,
1940 char *bufp,
1941 int bufsize,
1942 offset_t offset,
1943 aio_result_t *resultp,
1944 int mode)
1945 {
1946 file_t *fp;
1947 int error;
1948 struct vnode *vp;
1949 aio_req_t *reqp;
1950 aio_t *aiop;
1951 int (*aio_func)();
1952 #ifdef _LP64
1953 aiocb_t aiocb;
1954 #else
1955 aiocb64_32_t aiocb64;
1956 #endif
1957
1958 aiop = curproc->p_aio;
1959 if (aiop == NULL)
1960 return (EINVAL);
1961
1962 if ((fp = getf(fdes)) == NULL) {
1963 return (EBADF);
1964 }
1965
	/*
	 * verify that the file was opened with the required access mode
	 */
1969 if ((fp->f_flag & mode) == 0) {
1970 releasef(fdes);
1971 return (EBADF);
1972 }
1973
1974 vp = fp->f_vnode;
1975 aio_func = check_vp(vp, mode);
1976 if (aio_func == NULL) {
1977 releasef(fdes);
1978 return (EBADFD);
1979 }
1980 #ifdef _LP64
1981 aiocb.aio_fildes = fdes;
1982 aiocb.aio_buf = bufp;
1983 aiocb.aio_nbytes = bufsize;
1984 aiocb.aio_offset = offset;
1985 aiocb.aio_sigevent.sigev_notify = 0;
1986 error = aio_req_setup(&reqp, aiop, &aiocb, resultp, vp, 1);
1987 #else
1988 aiocb64.aio_fildes = fdes;
1989 aiocb64.aio_buf = (caddr32_t)bufp;
1990 aiocb64.aio_nbytes = bufsize;
1991 aiocb64.aio_offset = offset;
1992 aiocb64.aio_sigevent.sigev_notify = 0;
1993 error = aio_req_setupLF(&reqp, aiop, &aiocb64, resultp, vp, 1);
1994 #endif
1995 if (error) {
1996 releasef(fdes);
1997 return (error);
1998 }
1999
2000 /*
2001 * enable polling on this request if the opcode has
2002 * the AIO poll bit set
2003 */
2004 if (opcode & AIO_POLL_BIT)
2005 reqp->aio_req_flags |= AIO_POLL;
2006
2007 if (bufsize == 0) {
2008 clear_active_fd(fdes);
2009 aio_zerolen(reqp);
2010 return (0);
2011 }
2012 /*
2013 * send the request to driver.
2014 */
2015 error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req, CRED());
2016 /*
2017 * the fd is stored in the aio_req_t by aio_req_setup(), and
2018 * is released by the aio_cleanup_thread() when the IO has
2019 * completed.
2020 */
2021 if (error) {
2022 releasef(fdes);
2023 mutex_enter(&aiop->aio_mutex);
2024 aio_req_free(aiop, reqp);
2025 aiop->aio_pending--;
2026 if (aiop->aio_flags & AIO_REQ_BLOCK)
2027 cv_signal(&aiop->aio_cleanupcv);
2028 mutex_exit(&aiop->aio_mutex);
2029 return (error);
2030 }
2031 clear_active_fd(fdes);
2032 return (0);
2033 }
2034
2035 /*
2036 * posix version of asynchronous read and write
2037 */
2038 static int
2039 aiorw(
2040 int opcode,
2041 void *aiocb_arg,
2042 int mode,
2043 int run_mode)
2044 {
2045 #ifdef _SYSCALL32_IMPL
2046 aiocb32_t aiocb32;
2047 struct sigevent32 *sigev32;
2048 port_notify32_t pntfy32;
2049 #endif
2050 aiocb64_32_t aiocb64;
2051 aiocb_t aiocb;
2052 file_t *fp;
2053 int error, fd;
2054 size_t bufsize;
2055 struct vnode *vp;
2056 aio_req_t *reqp;
2057 aio_t *aiop;
2058 int (*aio_func)();
2059 aio_result_t *resultp;
2060 struct sigevent *sigev;
2061 model_t model;
2062 int aio_use_port = 0;
2063 port_notify_t pntfy;
2064
2065 model = get_udatamodel();
2066 aiop = curproc->p_aio;
2067 if (aiop == NULL)
2068 return (EINVAL);
2069
2070 if (model == DATAMODEL_NATIVE) {
2071 if (run_mode != AIO_LARGEFILE) {
2072 if (copyin(aiocb_arg, &aiocb, sizeof (aiocb_t)))
2073 return (EFAULT);
2074 bufsize = aiocb.aio_nbytes;
2075 resultp = &(((aiocb_t *)aiocb_arg)->aio_resultp);
2076 if ((fp = getf(fd = aiocb.aio_fildes)) == NULL) {
2077 return (EBADF);
2078 }
2079 sigev = &aiocb.aio_sigevent;
2080 } else {
			/*
			 * We get here only when a largefile call is
			 * made on a 32 bit kernel using the 32 bit
			 * library.
			 */
2085 if (copyin(aiocb_arg, &aiocb64, sizeof (aiocb64_32_t)))
2086 return (EFAULT);
2087 bufsize = aiocb64.aio_nbytes;
2088 resultp = (aio_result_t *)&(((aiocb64_32_t *)aiocb_arg)
2089 ->aio_resultp);
2090 if ((fp = getf(fd = aiocb64.aio_fildes)) == NULL)
2091 return (EBADF);
2092 sigev = (struct sigevent *)&aiocb64.aio_sigevent;
2093 }
2094
2095 if (sigev->sigev_notify == SIGEV_PORT) {
2096 if (copyin((void *)sigev->sigev_value.sival_ptr,
2097 &pntfy, sizeof (port_notify_t))) {
2098 releasef(fd);
2099 return (EFAULT);
2100 }
2101 aio_use_port = 1;
2102 } else if (sigev->sigev_notify == SIGEV_THREAD) {
2103 pntfy.portnfy_port = aiocb.aio_sigevent.sigev_signo;
2104 pntfy.portnfy_user =
2105 aiocb.aio_sigevent.sigev_value.sival_ptr;
2106 aio_use_port = 1;
2107 }
2108 }
2109 #ifdef _SYSCALL32_IMPL
2110 else {
2111 if (run_mode == AIO_32) {
2112 /* 32 bit system call is being made on 64 bit kernel */
2113 if (copyin(aiocb_arg, &aiocb32, sizeof (aiocb32_t)))
2114 return (EFAULT);
2115
2116 bufsize = aiocb32.aio_nbytes;
2117 aiocb_32ton(&aiocb32, &aiocb);
2118 resultp = (aio_result_t *)&(((aiocb32_t *)aiocb_arg)->
2119 aio_resultp);
2120 if ((fp = getf(fd = aiocb32.aio_fildes)) == NULL) {
2121 return (EBADF);
2122 }
2123 sigev32 = &aiocb32.aio_sigevent;
2124 } else if (run_mode == AIO_LARGEFILE) {
			/*
			 * We get here only when a largefile call is
			 * made on a 64 bit kernel using the 32 bit
			 * library.
			 */
2129 if (copyin(aiocb_arg, &aiocb64, sizeof (aiocb64_32_t)))
2130 return (EFAULT);
2131 bufsize = aiocb64.aio_nbytes;
2132 aiocb_LFton(&aiocb64, &aiocb);
2133 resultp = (aio_result_t *)&(((aiocb64_32_t *)aiocb_arg)
2134 ->aio_resultp);
2135 if ((fp = getf(fd = aiocb64.aio_fildes)) == NULL)
2136 return (EBADF);
2137 sigev32 = &aiocb64.aio_sigevent;
2138 }
2139
2140 if (sigev32->sigev_notify == SIGEV_PORT) {
2141 if (copyin(
2142 (void *)(uintptr_t)sigev32->sigev_value.sival_ptr,
2143 &pntfy32, sizeof (port_notify32_t))) {
2144 releasef(fd);
2145 return (EFAULT);
2146 }
2147 pntfy.portnfy_port = pntfy32.portnfy_port;
2148 pntfy.portnfy_user = (void *)(uintptr_t)
2149 pntfy32.portnfy_user;
2150 aio_use_port = 1;
2151 } else if (sigev32->sigev_notify == SIGEV_THREAD) {
2152 pntfy.portnfy_port = sigev32->sigev_signo;
2153 pntfy.portnfy_user = (void *)(uintptr_t)
2154 sigev32->sigev_value.sival_ptr;
2155 aio_use_port = 1;
2156 }
2157 }
2158 #endif /* _SYSCALL32_IMPL */
2159
	/*
	 * verify that the file was opened with the required access mode
	 */
2163
2164 if ((fp->f_flag & mode) == 0) {
2165 releasef(fd);
2166 return (EBADF);
2167 }
2168
2169 vp = fp->f_vnode;
2170 aio_func = check_vp(vp, mode);
2171 if (aio_func == NULL) {
2172 releasef(fd);
2173 return (EBADFD);
2174 }
2175 if (run_mode == AIO_LARGEFILE)
2176 error = aio_req_setupLF(&reqp, aiop, &aiocb64, resultp, vp, 0);
2177 else
2178 error = aio_req_setup(&reqp, aiop, &aiocb, resultp, vp, 0);
2179
2180 if (error) {
2181 releasef(fd);
2182 return (error);
2183 }
2184 /*
2185 * enable polling on this request if the opcode has
2186 * the AIO poll bit set
2187 */
2188 if (opcode & AIO_POLL_BIT)
2189 reqp->aio_req_flags |= AIO_POLL;
2190
2191 if (model == DATAMODEL_NATIVE)
2192 reqp->aio_req_iocb.iocb = aiocb_arg;
2193 #ifdef _SYSCALL32_IMPL
2194 else
2195 reqp->aio_req_iocb.iocb32 = (caddr32_t)(uintptr_t)aiocb_arg;
2196 #endif
2197
2198 if (aio_use_port) {
2199 int event = (run_mode == AIO_LARGEFILE)?
2200 ((mode == FREAD)? AIOAREAD64 : AIOAWRITE64) :
2201 ((mode == FREAD)? AIOAREAD : AIOAWRITE);
2202 error = aio_req_assoc_port_rw(&pntfy, aiocb_arg, reqp, event);
2203 }
2204
2205 /*
2206 * send the request to driver.
2207 */
2208 if (error == 0) {
2209 if (bufsize == 0) {
2210 clear_active_fd(fd);
2211 aio_zerolen(reqp);
2212 return (0);
2213 }
2214 error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req, CRED());
2215 }
2216
2217 /*
2218 * the fd is stored in the aio_req_t by aio_req_setup(), and
2219 * is released by the aio_cleanup_thread() when the IO has
2220 * completed.
2221 */
2222 if (error) {
2223 releasef(fd);
2224 mutex_enter(&aiop->aio_mutex);
2225 if (aio_use_port)
2226 aio_deq(&aiop->aio_portpending, reqp);
2227 aio_req_free(aiop, reqp);
2228 aiop->aio_pending--;
2229 if (aiop->aio_flags & AIO_REQ_BLOCK)
2230 cv_signal(&aiop->aio_cleanupcv);
2231 mutex_exit(&aiop->aio_mutex);
2232 return (error);
2233 }
2234 clear_active_fd(fd);
2235 return (0);
2236 }
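
/*
 * Illustrative user-level sketch (an addition for clarity, not part of the
 * kernel build): the single-request path above is what ultimately services
 * aio_read(3C)/aio_write(3C) on a raw device.  The SIGEV_PORT branch expects
 * sigev_value.sival_ptr to point at a port_notify_t, which routes the
 * completion to an event port.  The device path below is hypothetical and
 * most error checking is trimmed.
 *
 *	#include <aio.h>
 *	#include <port.h>
 *	#include <signal.h>
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		static char buf[8192];
 *		struct aiocb cb = { 0 };
 *		port_notify_t pn;
 *		port_event_t pe;
 *		ssize_t nbytes;
 *		int err;
 *		int port = port_create();
 *		int fd = open("/dev/rdsk/c0t0d0s2", O_RDONLY);
 *
 *		pn.portnfy_port = port;		// completion port
 *		pn.portnfy_user = &cb;		// cookie, returned in portev_user
 *
 *		cb.aio_fildes = fd;
 *		cb.aio_buf = buf;
 *		cb.aio_nbytes = sizeof (buf);
 *		cb.aio_offset = 0;
 *		cb.aio_sigevent.sigev_notify = SIGEV_PORT;
 *		cb.aio_sigevent.sigev_value.sival_ptr = &pn;
 *
 *		if (aio_read(&cb) != 0)
 *			return (1);
 *		(void) port_get(port, &pe, NULL);	// blocks until I/O is done
 *		err = aio_error(&cb);
 *		nbytes = aio_return(&cb);
 *		(void) printf("err %d, %zd bytes\n", err, nbytes);
 *		return (0);
 *	}
 *
 * On a regular file the same program still works, but check_vp() below
 * rejects the vnode and the request is serviced by the user-level aio
 * library rather than by kaio.
 */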
2237
2238
2239 /*
2240 * set error for a list IO entry that failed.
2241 */
2242 static void
2243 lio_set_error(aio_req_t *reqp, int portused)
2244 {
2245 aio_t *aiop = curproc->p_aio;
2246
2247 if (aiop == NULL)
2248 return;
2249
2250 mutex_enter(&aiop->aio_mutex);
2251 if (portused)
2252 aio_deq(&aiop->aio_portpending, reqp);
2253 aiop->aio_pending--;
2254 /* request failed, AIO_PHYSIODONE set to avoid physio cleanup. */
2255 reqp->aio_req_flags |= AIO_PHYSIODONE;
2256 /*
2257 * Need to free the request now as it's never
2258 * going to get on the done queue
2259 *
2260 * Note: aio_outstanding is decremented in
2261 * aio_req_free()
2262 */
2263 aio_req_free(aiop, reqp);
2264 if (aiop->aio_flags & AIO_REQ_BLOCK)
2265 cv_signal(&aiop->aio_cleanupcv);
2266 mutex_exit(&aiop->aio_mutex);
2267 }
2268
2269 /*
2270 * check if the specified request is done and, if so, remove it
2271 * from the done queue. otherwise, if NULL is specified, remove
2272 * any request from the done queue.
2273 */
2274 static aio_req_t *
2275 aio_req_done(void *resultp)
2276 {
2277 aio_req_t **bucket;
2278 aio_req_t *ent;
2279 aio_t *aiop = curproc->p_aio;
2280 long index;
2281
2282 ASSERT(MUTEX_HELD(&aiop->aio_cleanupq_mutex));
2283 ASSERT(MUTEX_HELD(&aiop->aio_mutex));
2284
2285 if (resultp) {
2286 index = AIO_HASH(resultp);
2287 bucket = &aiop->aio_hash[index];
2288 for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) {
2289 if (ent->aio_req_resultp == (aio_result_t *)resultp) {
2290 if (ent->aio_req_flags & AIO_DONEQ) {
2291 return (aio_req_remove(ent));
2292 }
2293 return (NULL);
2294 }
2295 }
2296 /* no match, resultp is invalid */
2297 return (NULL);
2298 }
2299 return (aio_req_remove(NULL));
2300 }
2301
2302 /*
2303 * determine if a user-level resultp pointer is associated with an
2304 * active IO request. Zero is returned when the request is done,
2305 * and the request is removed from the done queue. The "reqp"
2306 * pointer is valid only when the return value is zero. One is
2307 * returned when the request is in progress. Two is returned when
2308 * the request is invalid.
2309 */
2310 static int
2311 aio_req_find(aio_result_t *resultp, aio_req_t **reqp)
2312 {
2313 aio_req_t **bucket;
2314 aio_req_t *ent;
2315 aio_t *aiop = curproc->p_aio;
2316 long index;
2317
2318 ASSERT(MUTEX_HELD(&aiop->aio_cleanupq_mutex));
2319 ASSERT(MUTEX_HELD(&aiop->aio_mutex));
2320
2321 index = AIO_HASH(resultp);
2322 bucket = &aiop->aio_hash[index];
2323 for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) {
2324 if (ent->aio_req_resultp == resultp) {
2325 if (ent->aio_req_flags & AIO_DONEQ) {
2326 *reqp = aio_req_remove(ent);
2327 return (0);
2328 }
2329 return (1);
2330 }
2331 }
2332 /* no match, resultp is invalid */
2333 return (2);
2334 }
2335
2336 /*
2337 * remove a request from the done queue.
2338 */
2339 static aio_req_t *
2340 aio_req_remove(aio_req_t *reqp)
2341 {
2342 aio_t *aiop = curproc->p_aio;
2343
2344 ASSERT(MUTEX_HELD(&aiop->aio_mutex));
2345
2346 if (reqp != NULL) {
2347 ASSERT(reqp->aio_req_flags & AIO_DONEQ);
2348 if (reqp->aio_req_next == reqp) {
2349 /* only one request on queue */
2350 if (reqp == aiop->aio_doneq) {
2351 aiop->aio_doneq = NULL;
2352 } else {
2353 ASSERT(reqp == aiop->aio_cleanupq);
2354 aiop->aio_cleanupq = NULL;
2355 }
2356 } else {
2357 reqp->aio_req_next->aio_req_prev = reqp->aio_req_prev;
2358 reqp->aio_req_prev->aio_req_next = reqp->aio_req_next;
2359 /*
2360 * The request can be either on the aio_doneq or the
2361 * aio_cleanupq
2362 */
2363 if (reqp == aiop->aio_doneq)
2364 aiop->aio_doneq = reqp->aio_req_next;
2365
2366 if (reqp == aiop->aio_cleanupq)
2367 aiop->aio_cleanupq = reqp->aio_req_next;
2368 }
2369 reqp->aio_req_flags &= ~AIO_DONEQ;
2370 reqp->aio_req_next = NULL;
2371 reqp->aio_req_prev = NULL;
2372 } else if ((reqp = aiop->aio_doneq) != NULL) {
2373 ASSERT(reqp->aio_req_flags & AIO_DONEQ);
2374 if (reqp == reqp->aio_req_next) {
2375 /* only one request on queue */
2376 aiop->aio_doneq = NULL;
2377 } else {
2378 reqp->aio_req_prev->aio_req_next = reqp->aio_req_next;
2379 reqp->aio_req_next->aio_req_prev = reqp->aio_req_prev;
2380 aiop->aio_doneq = reqp->aio_req_next;
2381 }
2382 reqp->aio_req_flags &= ~AIO_DONEQ;
2383 reqp->aio_req_next = NULL;
2384 reqp->aio_req_prev = NULL;
2385 }
2386 if (aiop->aio_doneq == NULL && (aiop->aio_flags & AIO_WAITN))
2387 cv_broadcast(&aiop->aio_waitcv);
2388 return (reqp);
2389 }
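
/*
 * The done queue and cleanup queue manipulated above are circular, doubly
 * linked lists addressed through a single head pointer (aio_enq()/aio_deq()
 * maintain them elsewhere in this file).  A minimal generic sketch of that
 * list shape, using hypothetical names, is:
 *
 *	typedef struct node {
 *		struct node *next;
 *		struct node *prev;
 *	} node_t;
 *
 *	// insert n at the tail of the circular list addressed by *headp
 *	static void
 *	enq(node_t **headp, node_t *n)
 *	{
 *		node_t *head = *headp;
 *
 *		if (head == NULL) {
 *			n->next = n->prev = n;	// lone element points at itself
 *			*headp = n;
 *		} else {
 *			n->next = head;
 *			n->prev = head->prev;
 *			head->prev->next = n;
 *			head->prev = n;
 *		}
 *	}
 *
 *	// unlink n, advancing the head if n was the head
 *	static void
 *	deq(node_t **headp, node_t *n)
 *	{
 *		if (n->next == n) {
 *			*headp = NULL;		// n was the only element
 *		} else {
 *			n->prev->next = n->next;
 *			n->next->prev = n->prev;
 *			if (*headp == n)
 *				*headp = n->next;
 *		}
 *		n->next = n->prev = NULL;
 *	}
 *
 * aio_req_remove() above is the same unlink, complicated only by the fact
 * that a request may live on either aio_doneq or aio_cleanupq and by the
 * AIO_DONEQ flag bookkeeping.
 */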
2390
2391 static int
2392 aio_req_setup(
2393 aio_req_t **reqpp,
2394 aio_t *aiop,
2395 aiocb_t *arg,
2396 aio_result_t *resultp,
2397 vnode_t *vp,
2398 int old_solaris_req)
2399 {
2400 sigqueue_t *sqp = NULL;
2401 aio_req_t *reqp;
2402 struct uio *uio;
2403 struct sigevent *sigev;
2404 int error;
2405
2406 sigev = &arg->aio_sigevent;
2407 if (sigev->sigev_notify == SIGEV_SIGNAL &&
2408 sigev->sigev_signo > 0 && sigev->sigev_signo < NSIG) {
2409 sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
2410 if (sqp == NULL)
2411 return (EAGAIN);
2412 sqp->sq_func = NULL;
2413 sqp->sq_next = NULL;
2414 sqp->sq_info.si_code = SI_ASYNCIO;
2415 sqp->sq_info.si_pid = curproc->p_pid;
2416 sqp->sq_info.si_ctid = PRCTID(curproc);
2417 sqp->sq_info.si_zoneid = getzoneid();
2418 sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
2419 sqp->sq_info.si_signo = sigev->sigev_signo;
2420 sqp->sq_info.si_value = sigev->sigev_value;
2421 }
2422
2423 mutex_enter(&aiop->aio_mutex);
2424
2425 if (aiop->aio_flags & AIO_REQ_BLOCK) {
2426 mutex_exit(&aiop->aio_mutex);
2427 if (sqp)
2428 kmem_free(sqp, sizeof (sigqueue_t));
2429 return (EIO);
2430 }
2431 /*
2432 * get an aio_reqp from the free list or allocate one
2433 * from dynamic memory.
2434 */
2435 if (error = aio_req_alloc(&reqp, resultp)) {
2436 mutex_exit(&aiop->aio_mutex);
2437 if (sqp)
2438 kmem_free(sqp, sizeof (sigqueue_t));
2439 return (error);
2440 }
2441 aiop->aio_pending++;
2442 aiop->aio_outstanding++;
2443 reqp->aio_req_flags = AIO_PENDING;
2444 if (old_solaris_req) {
2445 /* this is an old solaris aio request */
2446 reqp->aio_req_flags |= AIO_SOLARIS;
2447 aiop->aio_flags |= AIO_SOLARIS_REQ;
2448 }
2449 if (sigev->sigev_notify == SIGEV_THREAD ||
2450 sigev->sigev_notify == SIGEV_PORT)
2451 aio_enq(&aiop->aio_portpending, reqp, 0);
2452 mutex_exit(&aiop->aio_mutex);
2453 /*
2454 * initialize aio request.
2455 */
2456 reqp->aio_req_fd = arg->aio_fildes;
2457 reqp->aio_req_sigqp = sqp;
2458 reqp->aio_req_iocb.iocb = NULL;
2459 reqp->aio_req_lio = NULL;
2460 reqp->aio_req_buf.b_file = vp;
2461 uio = reqp->aio_req.aio_uio;
2462 uio->uio_iovcnt = 1;
2463 uio->uio_iov->iov_base = (caddr_t)arg->aio_buf;
2464 uio->uio_iov->iov_len = arg->aio_nbytes;
2465 uio->uio_loffset = arg->aio_offset;
2466 *reqpp = reqp;
2467 return (0);
2468 }
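
/*
 * User-level sketch (not part of the kernel build) of what the sigqueue_t
 * preallocated above is for: with SIGEV_SIGNAL notification, completion of
 * the request queues an SI_ASYNCIO signal carrying sigev_value to the
 * process.  The device path is hypothetical and error checking is trimmed.
 *
 *	#include <aio.h>
 *	#include <signal.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	static volatile sig_atomic_t done;
 *
 *	static void
 *	handler(int sig, siginfo_t *si, void *ctx)
 *	{
 *		if (si->si_code == SI_ASYNCIO)
 *			done = 1;	// si->si_value identifies the request
 *	}
 *
 *	int
 *	main(void)
 *	{
 *		static char buf[512];
 *		struct sigaction sa;
 *		struct aiocb cb = { 0 };
 *		int fd = open("/dev/rdsk/c0t0d0s2", O_RDONLY);
 *
 *		sa.sa_sigaction = handler;
 *		sa.sa_flags = SA_SIGINFO;
 *		(void) sigemptyset(&sa.sa_mask);
 *		(void) sigaction(SIGUSR1, &sa, NULL);
 *
 *		cb.aio_fildes = fd;
 *		cb.aio_buf = buf;
 *		cb.aio_nbytes = sizeof (buf);
 *		cb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;
 *		cb.aio_sigevent.sigev_signo = SIGUSR1;
 *		cb.aio_sigevent.sigev_value.sival_ptr = &cb;
 *
 *		if (aio_read(&cb) != 0)
 *			return (1);
 *		while (!done)
 *			(void) pause();
 *		return (aio_error(&cb));
 *	}
 */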
2469
2470 /*
2471 * Allocate p_aio struct.
2472 */
2473 static aio_t *
2474 aio_aiop_alloc(void)
2475 {
2476 aio_t *aiop;
2477
2478 ASSERT(MUTEX_HELD(&curproc->p_lock));
2479
2480 aiop = kmem_zalloc(sizeof (struct aio), KM_NOSLEEP);
2481 if (aiop) {
2482 mutex_init(&aiop->aio_mutex, NULL, MUTEX_DEFAULT, NULL);
2483 mutex_init(&aiop->aio_cleanupq_mutex, NULL, MUTEX_DEFAULT,
2484 NULL);
2485 mutex_init(&aiop->aio_portq_mutex, NULL, MUTEX_DEFAULT, NULL);
2486 }
2487 return (aiop);
2488 }
2489
2490 /*
2491 * Allocate an aio_req struct.
2492 */
2493 static int
2494 aio_req_alloc(aio_req_t **nreqp, aio_result_t *resultp)
2495 {
2496 aio_req_t *reqp;
2497 aio_t *aiop = curproc->p_aio;
2498
2499 ASSERT(MUTEX_HELD(&aiop->aio_mutex));
2500
2501 if ((reqp = aiop->aio_free) != NULL) {
2502 aiop->aio_free = reqp->aio_req_next;
2503 bzero(reqp, sizeof (*reqp));
2504 } else {
2505 /*
2506 * Check whether memory is getting tight.
2507 * This is a temporary mechanism to avoid memory
2508 * exhaustion by a single process until we come up
2509 * with a per process solution such as setrlimit().
2510 */
2511 if (freemem < desfree)
2512 return (EAGAIN);
2513 reqp = kmem_zalloc(sizeof (struct aio_req_t), KM_NOSLEEP);
2514 if (reqp == NULL)
2515 return (EAGAIN);
2516 }
2517 reqp->aio_req.aio_uio = &reqp->aio_req_uio;
2518 reqp->aio_req.aio_uio->uio_iov = &reqp->aio_req_iov;
2519 reqp->aio_req.aio_private = reqp;
2520 reqp->aio_req_buf.b_offset = -1;
2521 reqp->aio_req_resultp = resultp;
2522 if (aio_hash_insert(reqp, aiop)) {
2523 reqp->aio_req_next = aiop->aio_free;
2524 aiop->aio_free = reqp;
2525 return (EBUSY);
2526 }
2527 *nreqp = reqp;
2528 return (0);
2529 }
2530
2531 /*
2532 * Allocate an aio_lio_t struct.
2533 */
2534 static int
2535 aio_lio_alloc(aio_lio_t **head)
2536 {
2537 aio_lio_t *liop;
2538 aio_t *aiop = curproc->p_aio;
2539
2540 ASSERT(MUTEX_HELD(&aiop->aio_mutex));
2541
2542 if ((liop = aiop->aio_lio_free) != NULL) {
2543 aiop->aio_lio_free = liop->lio_next;
2544 } else {
2545 /*
2546 * Check whether memory is getting tight.
2547 * This is a temporary mechanism to avoid memory
2548 * exhaustion by a single process until we come up
2549 * with a per process solution such as setrlimit().
2550 */
2551 if (freemem < desfree)
2552 return (EAGAIN);
2553
2554 liop = kmem_zalloc(sizeof (aio_lio_t), KM_NOSLEEP);
2555 if (liop == NULL)
2556 return (EAGAIN);
2557 }
2558 *head = liop;
2559 return (0);
2560 }
2561
2562 /*
2563 * this is a special per-process thread that is only activated if
2564 * the process is unmapping a segment with outstanding aio. normally,
2565 * the process will have completed the aio before unmapping the
2566 * segment. If the process does unmap a segment with outstanding aio,
2567 * this special thread will guarantee that the locked pages due to
2568 * aphysio() are released, thereby permitting the segment to be
2569 * unmapped. In addition to this, the cleanup thread is woken up
2570 * during DR operations to release the locked pages.
2571 */
2572
2573 static int
2574 aio_cleanup_thread(aio_t *aiop)
2575 {
2576 proc_t *p = curproc;
2577 struct as *as = p->p_as;
2578 int poked = 0;
2579 kcondvar_t *cvp;
2580 int exit_flag = 0;
2581 int rqclnup = 0;
2582
2583 sigfillset(&curthread->t_hold);
2584 sigdiffset(&curthread->t_hold, &cantmask);
2585 for (;;) {
2586 /*
2587 * if a segment is being unmapped, and the current
2588 * process's done queue is not empty, then every request
2589 * on the doneq with locked resources should be forced
2590 * to release their locks. By moving the doneq request
2591 * to the cleanupq, aio_cleanup() will process the cleanupq,
2592 * and place requests back onto the doneq. All requests
2593 * processed by aio_cleanup() will have their physical
2594 * resources unlocked.
2595 */
2596 mutex_enter(&aiop->aio_mutex);
2597 if ((aiop->aio_flags & AIO_CLEANUP) == 0) {
2598 aiop->aio_flags |= AIO_CLEANUP;
2599 mutex_enter(&as->a_contents);
2600 if (aiop->aio_rqclnup) {
2601 aiop->aio_rqclnup = 0;
2602 rqclnup = 1;
2603 }
2604 mutex_exit(&as->a_contents);
2605 if (aiop->aio_doneq) {
2606 aio_req_t *doneqhead = aiop->aio_doneq;
2607 aiop->aio_doneq = NULL;
2608 aio_cleanupq_concat(aiop, doneqhead, AIO_DONEQ);
2609 }
2610 }
2611 mutex_exit(&aiop->aio_mutex);
2612 aio_cleanup(AIO_CLEANUP_THREAD);
2613 /*
2614 * thread should block on the cleanupcv while
2615 * AIO_CLEANUP is set.
2616 */
2617 cvp = &aiop->aio_cleanupcv;
2618 mutex_enter(&aiop->aio_mutex);
2619
2620 if (aiop->aio_pollq != NULL || aiop->aio_cleanupq != NULL ||
2621 aiop->aio_notifyq != NULL ||
2622 aiop->aio_portcleanupq != NULL) {
2623 mutex_exit(&aiop->aio_mutex);
2624 continue;
2625 }
2626 mutex_enter(&as->a_contents);
2627
2628 /*
2629 * AIO_CLEANUP determines when the cleanup thread
2630 * should be active. This flag is set when
2631 * the cleanup thread is awakened by as_unmap() or
2632 * due to DR operations.
2633 * The flag is cleared when the blocking as_unmap()
2634 * that originally awakened us is allowed to
2635 * complete. as_unmap() blocks when trying to
2636 * unmap a segment that has SOFTLOCKed pages. when
2637 * the segment's pages are all SOFTUNLOCKed,
2638 * as->a_flags & AS_UNMAPWAIT should be zero.
2639 *
2640 * In case of cleanup request by DR, the flag is cleared
2641 * once all the pending aio requests have been processed.
2642 *
2643 * The flag shouldn't be cleared right away if the
2644 * cleanup thread was interrupted because the process
2645 * is doing forkall(). This happens when cv_wait_sig()
2646 * returns zero, because it was awakened by a pokelwps().
2647 * If the process is not exiting, it must be doing forkall().
2648 */
2649 if ((poked == 0) &&
2650 ((!rqclnup && (AS_ISUNMAPWAIT(as) == 0)) ||
2651 (aiop->aio_pending == 0))) {
2652 aiop->aio_flags &= ~(AIO_CLEANUP | AIO_CLEANUP_PORT);
2653 cvp = &as->a_cv;
2654 rqclnup = 0;
2655 }
2656 mutex_exit(&aiop->aio_mutex);
2657 if (poked) {
2658 /*
2659 * If the process is exiting/killed, don't return
2660 * immediately without waiting for pending I/O's
2661 * and releasing the page locks.
2662 */
2663 if (p->p_flag & (SEXITLWPS|SKILLED)) {
2664 /*
2665 * If exit_flag is set, then it is
2666 * safe to exit because we have released
2667 * page locks of completed I/O's.
2668 */
2669 if (exit_flag)
2670 break;
2671
2672 mutex_exit(&as->a_contents);
2673
2674 /*
2675 * Wait for all the pending aio to complete.
2676 */
2677 mutex_enter(&aiop->aio_mutex);
2678 aiop->aio_flags |= AIO_REQ_BLOCK;
2679 while (aiop->aio_pending != 0)
2680 cv_wait(&aiop->aio_cleanupcv,
2681 &aiop->aio_mutex);
2682 mutex_exit(&aiop->aio_mutex);
2683 exit_flag = 1;
2684 continue;
2685 } else if (p->p_flag &
2686 (SHOLDFORK|SHOLDFORK1|SHOLDWATCH)) {
2687 /*
2688 * hold LWP until it
2689 * is continued.
2690 */
2691 mutex_exit(&as->a_contents);
2692 mutex_enter(&p->p_lock);
2693 stop(PR_SUSPENDED, SUSPEND_NORMAL);
2694 mutex_exit(&p->p_lock);
2695 poked = 0;
2696 continue;
2697 }
2698 } else {
2699 /*
2700 * When started this thread will sleep on as->a_cv.
2701 * as_unmap will awake this thread if the
2702 * segment has SOFTLOCKed pages (poked = 0).
2703 * 1. pokelwps() awakes this thread =>
2704 * break the loop to check SEXITLWPS, SHOLDFORK, etc
2705 * 2. as_unmap awakes this thread =>
2706 * to break the loop it is necessary that
2707 * - AS_UNMAPWAIT is set (as_unmap is waiting for
2708 * memory to be unlocked)
2709 * - AIO_CLEANUP is not set
2710 * (if AIO_CLEANUP is set we have to wait for
2711 * pending requests. aio_done will send a signal
2712 * for every request which completes to continue
2713 * unmapping the corresponding address range)
2714 * 3. A cleanup request will wake this thread up, ex.
2715 * by the DR operations. The aio_rqclnup flag will
2716 * be set.
2717 */
2718 while (poked == 0) {
2719 /*
2720 * Cleanup requests that came in after we
2721 * had just cleaned up cannot be what is
2722 * blocking the unmap thread, since the
2723 * unmap event happened first.
2724 * Let aio_done() wake us up if it sees a need.
2725 */
2726 if (aiop->aio_rqclnup &&
2727 (aiop->aio_flags & AIO_CLEANUP) == 0)
2728 break;
2729 poked = !cv_wait_sig(cvp, &as->a_contents);
2730 if (AS_ISUNMAPWAIT(as) == 0)
2731 cv_signal(cvp);
2732 if (aiop->aio_outstanding != 0)
2733 break;
2734 }
2735 }
2736 mutex_exit(&as->a_contents);
2737 }
2738 exit:
2739 mutex_exit(&as->a_contents);
2740 ASSERT((curproc->p_flag & (SEXITLWPS|SKILLED)));
2741 aston(curthread); /* make thread do post_syscall */
2742 return (0);
2743 }
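
/*
 * A contrived user-level sketch (not something applications should do on
 * purpose) of the situation this thread exists for: kick off kaio into a
 * mapped buffer and unmap the region before the I/O completes.  The
 * munmap() blocks in as_unmap() on the SOFTLOCKed pages until the cleanup
 * thread above releases them.  Path is hypothetical, error checking trimmed.
 *
 *	#include <aio.h>
 *	#include <sys/mman.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		size_t len = 1024 * 1024;
 *		void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		    MAP_PRIVATE | MAP_ANON, -1, 0);
 *		struct aiocb cb = { 0 };
 *		int fd = open("/dev/rdsk/c0t0d0s2", O_RDONLY);
 *
 *		cb.aio_fildes = fd;
 *		cb.aio_buf = buf;
 *		cb.aio_nbytes = len;
 *		(void) aio_read(&cb);
 *
 *		// blocks until the aio cleanup thread unlocks the pages
 *		(void) munmap(buf, len);
 *		return (0);
 *	}
 */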
2744
2745 /*
2746 * save a reference to a user's outstanding aio in a hash list.
2747 */
2748 static int
2749 aio_hash_insert(
2750 aio_req_t *aio_reqp,
2751 aio_t *aiop)
2752 {
2753 long index;
2754 aio_result_t *resultp = aio_reqp->aio_req_resultp;
2755 aio_req_t *current;
2756 aio_req_t **nextp;
2757
2758 index = AIO_HASH(resultp);
2759 nextp = &aiop->aio_hash[index];
2760 while ((current = *nextp) != NULL) {
2761 if (current->aio_req_resultp == resultp)
2762 return (DUPLICATE);
2763 nextp = &current->aio_hash_next;
2764 }
2765 *nextp = aio_reqp;
2766 aio_reqp->aio_hash_next = NULL;
2767 return (0);
2768 }
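
/*
 * The hash above is keyed on the user-level aio_result_t pointer, so a
 * second request submitted against the same aiocb before the first one
 * completes is detected here and rejected (DUPLICATE).  A generic sketch
 * of the same chained-hash-with-duplicate-check pattern, using hypothetical
 * names (the real AIO_HASH() macro lives in the aio implementation header):
 *
 *	#include <stdint.h>
 *	#include <stddef.h>
 *
 *	#define	NBUCKETS	256
 *	#define	HASHPTR(p)	(((uintptr_t)(p) >> 6) % NBUCKETS)
 *
 *	typedef struct entry {
 *		struct entry	*next;
 *		void		*key;	// user-level result pointer
 *	} entry_t;
 *
 *	static entry_t *hashtab[NBUCKETS];
 *
 *	// returns 0 on success, -1 if key is already present
 *	static int
 *	insert(entry_t *e)
 *	{
 *		entry_t **nextp = &hashtab[HASHPTR(e->key)];
 *
 *		while (*nextp != NULL) {
 *			if ((*nextp)->key == e->key)
 *				return (-1);	// duplicate outstanding request
 *			nextp = &(*nextp)->next;
 *		}
 *		e->next = NULL;
 *		*nextp = e;
 *		return (0);
 *	}
 */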
2769
2770 static int
2771 (*check_vp(struct vnode *vp, int mode))(vnode_t *, struct aio_req *,
2772 cred_t *)
2773 {
2774 struct snode *sp;
2775 dev_t dev;
2776 struct cb_ops *cb;
2777 major_t major;
2778 int (*aio_func)();
2779
2780 dev = vp->v_rdev;
2781 major = getmajor(dev);
2782
2783 /*
2784 * return NULL for requests to files and STREAMs so
2785 * that libaio takes care of them.
2786 */
2787 if (vp->v_type == VCHR) {
2788 /* no stream device for kaio */
2789 if (STREAMSTAB(major)) {
2790 return (NULL);
2791 }
2792 } else {
2793 return (NULL);
2794 }
2795
2796 /*
2797 * Check old drivers which do not have async I/O entry points.
2798 */
2799 if (devopsp[major]->devo_rev < 3)
2800 return (NULL);
2801
2802 cb = devopsp[major]->devo_cb_ops;
2803
2804 if (cb->cb_rev < 1)
2805 return (NULL);
2806
2807 /*
2808 * Check whether the driver has a strategy routine (a block-capable
2809 * device). Kaio is not supported for devices like ttys.
2810 */
2811 if (cb->cb_strategy == nodev || cb->cb_strategy == NULL)
2812 return (NULL);
2813
2814 /*
2815 * Clustering: If vnode is a PXFS vnode, then the device may be remote.
2816 * We cannot call the driver directly. Instead return the
2817 * PXFS functions.
2818 */
2819
2820 if (IS_PXFSVP(vp)) {
2821 if (mode & FREAD)
2822 return (clpxfs_aio_read);
2823 else
2824 return (clpxfs_aio_write);
2825 }
2826 if (mode & FREAD)
2827 aio_func = (cb->cb_aread == nodev) ? NULL : driver_aio_read;
2828 else
2829 aio_func = (cb->cb_awrite == nodev) ? NULL : driver_aio_write;
2830
2831 /*
2832 * Do we need this?
2833 * nodev returns ENXIO anyway.
2834 */
2835 if (aio_func == nodev)
2836 return (NULL);
2837
2838 sp = VTOS(vp);
2839 smark(sp, SACC);
2840 return (aio_func);
2841 }
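
/*
 * check_vp() is the gatekeeper for kaio: the vnode must be a character
 * special device whose (non-STREAMS) driver has a strategy routine and
 * provides the aread/awrite entry points; anything else returns NULL and
 * the request is handled by the user-level aio library (or gets EBADFD
 * from the list-I/O paths).  A small user-level probe for the user-visible
 * part of that condition, offered as an illustrative sketch only:
 *
 *	#include <sys/stat.h>
 *
 *	// necessary (not sufficient) condition for the kaio fast path:
 *	// the descriptor must refer to a character-special device
 *	static int
 *	maybe_kaio(int fd)
 *	{
 *		struct stat st;
 *
 *		if (fstat(fd, &st) != 0)
 *			return (0);
 *		return (S_ISCHR(st.st_mode));
 *	}
 */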
2842
2843 /*
2844 * Clustering: We want check_vp() to return a correctly prototyped
2845 * function that is common to both the PXFS and regular cases.
2846 * We define this intermediate function to do the right
2847 * thing for the driver (non-PXFS) case.
2848 */
2849
2850 static int
2851 driver_aio_write(vnode_t *vp, struct aio_req *aio, cred_t *cred_p)
2852 {
2853 dev_t dev;
2854 struct cb_ops *cb;
2855
2856 ASSERT(vp->v_type == VCHR);
2857 ASSERT(!IS_PXFSVP(vp));
2858 dev = VTOS(vp)->s_dev;
2859 ASSERT(STREAMSTAB(getmajor(dev)) == NULL);
2860
2861 cb = devopsp[getmajor(dev)]->devo_cb_ops;
2862
2863 ASSERT(cb->cb_awrite != nodev);
2864 return ((*cb->cb_awrite)(dev, aio, cred_p));
2865 }
2866
2867 /*
2868 * Clustering: We want check_vp() to return a correctly prototyped
2869 * function that is common to both the PXFS and regular cases.
2870 * We define this intermediate function to do the right
2871 * thing for the driver (non-PXFS) case.
2872 */
2873
2874 static int
2875 driver_aio_read(vnode_t *vp, struct aio_req *aio, cred_t *cred_p)
2876 {
2877 dev_t dev;
2878 struct cb_ops *cb;
2879
2880 ASSERT(vp->v_type == VCHR);
2881 ASSERT(!IS_PXFSVP(vp));
2882 dev = VTOS(vp)->s_dev;
2883 ASSERT(!STREAMSTAB(getmajor(dev)));
2884
2885 cb = devopsp[getmajor(dev)]->devo_cb_ops;
2886
2887 ASSERT(cb->cb_aread != nodev);
2888 return ((*cb->cb_aread)(dev, aio, cred_p));
2889 }
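
/*
 * Driver-side sketch: a character driver advertises async I/O by filling
 * in cb_aread/cb_awrite in its cb_ops (cb_rev >= 1); the usual
 * implementation, in the style of aread(9E)/awrite(9E), simply hands the
 * request to aphysio(9F) together with the driver's strategy and minphys
 * routines.  The xx* names below are placeholders, not a real driver.
 *
 *	#include <sys/uio.h>
 *	#include <sys/aio_req.h>
 *	#include <sys/cred.h>
 *	#include <sys/ddi.h>
 *	#include <sys/sunddi.h>
 *
 *	static int xxstrategy(struct buf *bp);		// strategy(9E)
 *	static void xxminphys(struct buf *bp);		// clamps b_bcount
 *
 *	static int
 *	xxaread(dev_t dev, struct aio_req *aio, cred_t *cred_p)
 *	{
 *		return (aphysio(xxstrategy, anocancel, dev, B_READ,
 *		    xxminphys, aio));
 *	}
 *
 *	static int
 *	xxawrite(dev_t dev, struct aio_req *aio, cred_t *cred_p)
 *	{
 *		return (aphysio(xxstrategy, anocancel, dev, B_WRITE,
 *		    xxminphys, aio));
 *	}
 */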
2890
2891 /*
2892 * This routine is called when a largefile call is made by a 32-bit
2893 * process on an ILP32 or LP64 kernel. All 64-bit processes are
2894 * largefile by definition and will call alio() instead.
2895 */
2896 static int
2897 alioLF(
2898 int mode_arg,
2899 void *aiocb_arg,
2900 int nent,
2901 void *sigev)
2902 {
2903 file_t *fp;
2904 file_t *prev_fp = NULL;
2905 int prev_mode = -1;
2906 struct vnode *vp;
2907 aio_lio_t *head;
2908 aio_req_t *reqp;
2909 aio_t *aiop;
2910 caddr_t cbplist;
2911 aiocb64_32_t cb64;
2912 aiocb64_32_t *aiocb = &cb64;
2913 aiocb64_32_t *cbp;
2914 caddr32_t *ucbp;
2915 #ifdef _LP64
2916 aiocb_t aiocb_n;
2917 #endif
2918 struct sigevent32 sigevk;
2919 sigqueue_t *sqp;
2920 int (*aio_func)();
2921 int mode;
2922 int error = 0;
2923 int aio_errors = 0;
2924 int i;
2925 size_t ssize;
2926 int deadhead = 0;
2927 int aio_notsupported = 0;
2928 int lio_head_port;
2929 int aio_port;
2930 int aio_thread;
2931 port_kevent_t *pkevtp = NULL;
2932 int portused = 0;
2933 port_notify32_t pnotify;
2934 int event;
2935
2936 aiop = curproc->p_aio;
2937 if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
2938 return (EINVAL);
2939
2940 ASSERT(get_udatamodel() == DATAMODEL_ILP32);
2941
2942 ssize = (sizeof (caddr32_t) * nent);
2943 cbplist = kmem_alloc(ssize, KM_SLEEP);
2944 ucbp = (caddr32_t *)cbplist;
2945
2946 if (copyin(aiocb_arg, cbplist, ssize) ||
2947 (sigev && copyin(sigev, &sigevk, sizeof (sigevk)))) {
2948 kmem_free(cbplist, ssize);
2949 return (EFAULT);
2950 }
2951
2952 /* Event Ports */
2953 if (sigev &&
2954 (sigevk.sigev_notify == SIGEV_THREAD ||
2955 sigevk.sigev_notify == SIGEV_PORT)) {
2956 if (sigevk.sigev_notify == SIGEV_THREAD) {
2957 pnotify.portnfy_port = sigevk.sigev_signo;
2958 pnotify.portnfy_user = sigevk.sigev_value.sival_ptr;
2959 } else if (copyin(
2960 (void *)(uintptr_t)sigevk.sigev_value.sival_ptr,
2961 &pnotify, sizeof (pnotify))) {
2962 kmem_free(cbplist, ssize);
2963 return (EFAULT);
2964 }
2965 error = port_alloc_event(pnotify.portnfy_port,
2966 PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, &pkevtp);
2967 if (error) {
2968 if (error == ENOMEM || error == EAGAIN)
2969 error = EAGAIN;
2970 else
2971 error = EINVAL;
2972 kmem_free(cbplist, ssize);
2973 return (error);
2974 }
2975 lio_head_port = pnotify.portnfy_port;
2976 portused = 1;
2977 }
2978
2979 /*
2980 * a list head should be allocated if notification is
2981 * enabled for this list.
2982 */
2983 head = NULL;
2984
2985 if (mode_arg == LIO_WAIT || sigev) {
2986 mutex_enter(&aiop->aio_mutex);
2987 error = aio_lio_alloc(&head);
2988 mutex_exit(&aiop->aio_mutex);
2989 if (error)
2990 goto done;
2991 deadhead = 1;
2992 head->lio_nent = nent;
2993 head->lio_refcnt = nent;
2994 head->lio_port = -1;
2995 head->lio_portkev = NULL;
2996 if (sigev && sigevk.sigev_notify == SIGEV_SIGNAL &&
2997 sigevk.sigev_signo > 0 && sigevk.sigev_signo < NSIG) {
2998 sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
2999 if (sqp == NULL) {
3000 error = EAGAIN;
3001 goto done;
3002 }
3003 sqp->sq_func = NULL;
3004 sqp->sq_next = NULL;
3005 sqp->sq_info.si_code = SI_ASYNCIO;
3006 sqp->sq_info.si_pid = curproc->p_pid;
3007 sqp->sq_info.si_ctid = PRCTID(curproc);
3008 sqp->sq_info.si_zoneid = getzoneid();
3009 sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
3010 sqp->sq_info.si_signo = sigevk.sigev_signo;
3011 sqp->sq_info.si_value.sival_int =
3012 sigevk.sigev_value.sival_int;
3013 head->lio_sigqp = sqp;
3014 } else {
3015 head->lio_sigqp = NULL;
3016 }
3017 if (pkevtp) {
3018 /*
3019 * Prepare data to send when list of aiocb's
3020 * has completed.
3021 */
3022 port_init_event(pkevtp, (uintptr_t)sigev,
3023 (void *)(uintptr_t)pnotify.portnfy_user,
3024 NULL, head);
3025 pkevtp->portkev_events = AIOLIO64;
3026 head->lio_portkev = pkevtp;
3027 head->lio_port = pnotify.portnfy_port;
3028 }
3029 }
3030
3031 for (i = 0; i < nent; i++, ucbp++) {
3032
3033 cbp = (aiocb64_32_t *)(uintptr_t)*ucbp;
3034 /* skip entry if it can't be copied. */
3035 if (cbp == NULL || copyin(cbp, aiocb, sizeof (*aiocb))) {
3036 if (head) {
3037 mutex_enter(&aiop->aio_mutex);
3038 head->lio_nent--;
3039 head->lio_refcnt--;
3040 mutex_exit(&aiop->aio_mutex);
3041 }
3042 continue;
3043 }
3044
3045 /* skip if opcode for aiocb is LIO_NOP */
3046 mode = aiocb->aio_lio_opcode;
3047 if (mode == LIO_NOP) {
3048 cbp = NULL;
3049 if (head) {
3050 mutex_enter(&aiop->aio_mutex);
3051 head->lio_nent--;
3052 head->lio_refcnt--;
3053 mutex_exit(&aiop->aio_mutex);
3054 }
3055 continue;
3056 }
3057
3058 /* increment file descriptor's ref count. */
3059 if ((fp = getf(aiocb->aio_fildes)) == NULL) {
3060 lio_set_uerror(&cbp->aio_resultp, EBADF);
3061 if (head) {
3062 mutex_enter(&aiop->aio_mutex);
3063 head->lio_nent--;
3064 head->lio_refcnt--;
3065 mutex_exit(&aiop->aio_mutex);
3066 }
3067 aio_errors++;
3068 continue;
3069 }
3070
3071 /*
3072 * check that the file was opened with the required access mode
3073 */
3074 if ((fp->f_flag & mode) == 0) {
3075 releasef(aiocb->aio_fildes);
3076 lio_set_uerror(&cbp->aio_resultp, EBADF);
3077 if (head) {
3078 mutex_enter(&aiop->aio_mutex);
3079 head->lio_nent--;
3080 head->lio_refcnt--;
3081 mutex_exit(&aiop->aio_mutex);
3082 }
3083 aio_errors++;
3084 continue;
3085 }
3086
3087 /*
3088 * common case where requests are to the same fd
3089 * for the same r/w operation.
3090 * for UFS, check_vp() returns NULL, so EBADFD is set.
3091 */
3092 vp = fp->f_vnode;
3093 if (fp != prev_fp || mode != prev_mode) {
3094 aio_func = check_vp(vp, mode);
3095 if (aio_func == NULL) {
3096 prev_fp = NULL;
3097 releasef(aiocb->aio_fildes);
3098 lio_set_uerror(&cbp->aio_resultp, EBADFD);
3099 aio_notsupported++;
3100 if (head) {
3101 mutex_enter(&aiop->aio_mutex);
3102 head->lio_nent--;
3103 head->lio_refcnt--;
3104 mutex_exit(&aiop->aio_mutex);
3105 }
3106 continue;
3107 } else {
3108 prev_fp = fp;
3109 prev_mode = mode;
3110 }
3111 }
3112
3113 #ifdef _LP64
3114 aiocb_LFton(aiocb, &aiocb_n);
3115 error = aio_req_setup(&reqp, aiop, &aiocb_n,
3116 (aio_result_t *)&cbp->aio_resultp, vp, 0);
3117 #else
3118 error = aio_req_setupLF(&reqp, aiop, aiocb,
3119 (aio_result_t *)&cbp->aio_resultp, vp, 0);
3120 #endif /* _LP64 */
3121 if (error) {
3122 releasef(aiocb->aio_fildes);
3123 lio_set_uerror(&cbp->aio_resultp, error);
3124 if (head) {
3125 mutex_enter(&aiop->aio_mutex);
3126 head->lio_nent--;
3127 head->lio_refcnt--;
3128 mutex_exit(&aiop->aio_mutex);
3129 }
3130 aio_errors++;
3131 continue;
3132 }
3133
3134 reqp->aio_req_lio = head;
3135 deadhead = 0;
3136
3137 /*
3138 * Set the errno field now before sending the request to
3139 * the driver to avoid a race condition
3140 */
3141 (void) suword32(&cbp->aio_resultp.aio_errno,
3142 EINPROGRESS);
3143
3144 reqp->aio_req_iocb.iocb32 = *ucbp;
3145
3146 event = (mode == LIO_READ)? AIOAREAD64 : AIOAWRITE64;
3147 aio_port = (aiocb->aio_sigevent.sigev_notify == SIGEV_PORT);
3148 aio_thread = (aiocb->aio_sigevent.sigev_notify == SIGEV_THREAD);
3149 if (aio_port | aio_thread) {
3150 port_kevent_t *lpkevp;
3151 /*
3152 * Prepare data to send with each aiocb completed.
3153 */
3154 if (aio_port) {
3155 void *paddr = (void *)(uintptr_t)
3156 aiocb->aio_sigevent.sigev_value.sival_ptr;
3157 if (copyin(paddr, &pnotify, sizeof (pnotify)))
3158 error = EFAULT;
3159 } else { /* aio_thread */
3160 pnotify.portnfy_port =
3161 aiocb->aio_sigevent.sigev_signo;
3162 pnotify.portnfy_user =
3163 aiocb->aio_sigevent.sigev_value.sival_ptr;
3164 }
3165 if (error)
3166 /* EMPTY */;
3167 else if (pkevtp != NULL &&
3168 pnotify.portnfy_port == lio_head_port)
3169 error = port_dup_event(pkevtp, &lpkevp,
3170 PORT_ALLOC_DEFAULT);
3171 else
3172 error = port_alloc_event(pnotify.portnfy_port,
3173 PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO,
3174 &lpkevp);
3175 if (error == 0) {
3176 port_init_event(lpkevp, (uintptr_t)*ucbp,
3177 (void *)(uintptr_t)pnotify.portnfy_user,
3178 aio_port_callback, reqp);
3179 lpkevp->portkev_events = event;
3180 reqp->aio_req_portkev = lpkevp;
3181 reqp->aio_req_port = pnotify.portnfy_port;
3182 }
3183 }
3184
3185 /*
3186 * send the request to the driver.
3187 */
3188 if (error == 0) {
3189 if (aiocb->aio_nbytes == 0) {
3190 clear_active_fd(aiocb->aio_fildes);
3191 aio_zerolen(reqp);
3192 continue;
3193 }
3194 error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req,
3195 CRED());
3196 }
3197
3198 /*
3199 * the fd's ref count is not decremented until the IO has
3200 * completed unless there was an error.
3201 */
3202 if (error) {
3203 releasef(aiocb->aio_fildes);
3204 lio_set_uerror(&cbp->aio_resultp, error);
3205 if (head) {
3206 mutex_enter(&aiop->aio_mutex);
3207 head->lio_nent--;
3208 head->lio_refcnt--;
3209 mutex_exit(&aiop->aio_mutex);
3210 }
3211 if (error == ENOTSUP)
3212 aio_notsupported++;
3213 else
3214 aio_errors++;
3215 lio_set_error(reqp, portused);
3216 } else {
3217 clear_active_fd(aiocb->aio_fildes);
3218 }
3219 }
3220
3221 if (aio_notsupported) {
3222 error = ENOTSUP;
3223 } else if (aio_errors) {
3224 /*
3225 * return EIO if any request failed
3226 */
3227 error = EIO;
3228 }
3229
3230 if (mode_arg == LIO_WAIT) {
3231 mutex_enter(&aiop->aio_mutex);
3232 while (head->lio_refcnt > 0) {
3233 if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
3234 mutex_exit(&aiop->aio_mutex);
3235 error = EINTR;
3236 goto done;
3237 }
3238 }
3239 mutex_exit(&aiop->aio_mutex);
3240 alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_LARGEFILE);
3241 }
3242
3243 done:
3244 kmem_free(cbplist, ssize);
3245 if (deadhead) {
3246 if (head->lio_sigqp)
3247 kmem_free(head->lio_sigqp, sizeof (sigqueue_t));
3248 if (head->lio_portkev)
3249 port_free_event(head->lio_portkev);
3250 kmem_free(head, sizeof (aio_lio_t));
3251 }
3252 return (error);
3253 }
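
/*
 * User-level sketch of what reaches alioLF(): lio_listio(3C) issued from a
 * 32-bit program compiled in the largefile environment (e.g. with
 * -D_FILE_OFFSET_BITS=64), so the aiocb offsets are 64 bits wide.  Here a
 * batch of reads is submitted and waited for synchronously; names and
 * sizes are only illustrative.
 *
 *	// cc -D_FILE_OFFSET_BITS=64 ...   (32-bit compilation)
 *	#include <aio.h>
 *	#include <string.h>
 *
 *	enum { NREQ = 4, CHUNK = 64 * 1024 };
 *
 *	int
 *	read_batch(int fd, char buf[NREQ][CHUNK], off_t base)
 *	{
 *		struct aiocb cbs[NREQ];
 *		struct aiocb *list[NREQ];
 *		int i;
 *
 *		for (i = 0; i < NREQ; i++) {
 *			(void) memset(&cbs[i], 0, sizeof (cbs[i]));
 *			cbs[i].aio_fildes = fd;
 *			cbs[i].aio_buf = buf[i];
 *			cbs[i].aio_nbytes = CHUNK;
 *			cbs[i].aio_offset = base + (off_t)i * CHUNK;
 *			cbs[i].aio_lio_opcode = LIO_READ;
 *			list[i] = &cbs[i];
 *		}
 *		// LIO_WAIT: returns only when every request has completed
 *		return (lio_listio(LIO_WAIT, list, NREQ, NULL));
 *	}
 */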
3254
3255 #ifdef _SYSCALL32_IMPL
3256 static void
3257 aiocb_LFton(aiocb64_32_t *src, aiocb_t *dest)
3258 {
3259 dest->aio_fildes = src->aio_fildes;
3260 dest->aio_buf = (void *)(uintptr_t)src->aio_buf;
3261 dest->aio_nbytes = (size_t)src->aio_nbytes;
3262 dest->aio_offset = (off_t)src->aio_offset;
3263 dest->aio_reqprio = src->aio_reqprio;
3264 dest->aio_sigevent.sigev_notify = src->aio_sigevent.sigev_notify;
3265 dest->aio_sigevent.sigev_signo = src->aio_sigevent.sigev_signo;
3266
3267 /*
3268 * See comment in sigqueue32() on handling of 32-bit
3269 * sigvals in a 64-bit kernel.
3270 */
3271 dest->aio_sigevent.sigev_value.sival_int =
3272 (int)src->aio_sigevent.sigev_value.sival_int;
3273 dest->aio_sigevent.sigev_notify_function = (void (*)(union sigval))
3274 (uintptr_t)src->aio_sigevent.sigev_notify_function;
3275 dest->aio_sigevent.sigev_notify_attributes = (pthread_attr_t *)
3276 (uintptr_t)src->aio_sigevent.sigev_notify_attributes;
3277 dest->aio_sigevent.__sigev_pad2 = src->aio_sigevent.__sigev_pad2;
3278 dest->aio_lio_opcode = src->aio_lio_opcode;
3279 dest->aio_state = src->aio_state;
3280 dest->aio__pad[0] = src->aio__pad[0];
3281 }
3282 #endif
3283
3284 /*
3285 * This function is used only for largefile calls made by
3286 * 32-bit applications.
3287 */
3288 static int
3289 aio_req_setupLF(
3290 aio_req_t **reqpp,
3291 aio_t *aiop,
3292 aiocb64_32_t *arg,
3293 aio_result_t *resultp,
3294 vnode_t *vp,
3295 int old_solaris_req)
3296 {
3297 sigqueue_t *sqp = NULL;
3298 aio_req_t *reqp;
3299 struct uio *uio;
3300 struct sigevent32 *sigev;
3301 int error;
3302
3303 sigev = &arg->aio_sigevent;
3304 if (sigev->sigev_notify == SIGEV_SIGNAL &&
3305 sigev->sigev_signo > 0 && sigev->sigev_signo < NSIG) {
3306 sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
3307 if (sqp == NULL)
3308 return (EAGAIN);
3309 sqp->sq_func = NULL;
3310 sqp->sq_next = NULL;
3311 sqp->sq_info.si_code = SI_ASYNCIO;
3312 sqp->sq_info.si_pid = curproc->p_pid;
3313 sqp->sq_info.si_ctid = PRCTID(curproc);
3314 sqp->sq_info.si_zoneid = getzoneid();
3315 sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
3316 sqp->sq_info.si_signo = sigev->sigev_signo;
3317 sqp->sq_info.si_value.sival_int = sigev->sigev_value.sival_int;
3318 }
3319
3320 mutex_enter(&aiop->aio_mutex);
3321
3322 if (aiop->aio_flags & AIO_REQ_BLOCK) {
3323 mutex_exit(&aiop->aio_mutex);
3324 if (sqp)
3325 kmem_free(sqp, sizeof (sigqueue_t));
3326 return (EIO);
3327 }
3328 /*
3329 * get an aio_reqp from the free list or allocate one
3330 * from dynamic memory.
3331 */
3332 if (error = aio_req_alloc(&reqp, resultp)) {
3333 mutex_exit(&aiop->aio_mutex);
3334 if (sqp)
3335 kmem_free(sqp, sizeof (sigqueue_t));
3336 return (error);
3337 }
3338 aiop->aio_pending++;
3339 aiop->aio_outstanding++;
3340 reqp->aio_req_flags = AIO_PENDING;
3341 if (old_solaris_req) {
3342 /* this is an old solaris aio request */
3343 reqp->aio_req_flags |= AIO_SOLARIS;
3344 aiop->aio_flags |= AIO_SOLARIS_REQ;
3345 }
3346 if (sigev->sigev_notify == SIGEV_THREAD ||
3347 sigev->sigev_notify == SIGEV_PORT)
3348 aio_enq(&aiop->aio_portpending, reqp, 0);
3349 mutex_exit(&aiop->aio_mutex);
3350 /*
3351 * initialize aio request.
3352 */
3353 reqp->aio_req_fd = arg->aio_fildes;
3354 reqp->aio_req_sigqp = sqp;
3355 reqp->aio_req_iocb.iocb = NULL;
3356 reqp->aio_req_lio = NULL;
3357 reqp->aio_req_buf.b_file = vp;
3358 uio = reqp->aio_req.aio_uio;
3359 uio->uio_iovcnt = 1;
3360 uio->uio_iov->iov_base = (caddr_t)(uintptr_t)arg->aio_buf;
3361 uio->uio_iov->iov_len = arg->aio_nbytes;
3362 uio->uio_loffset = arg->aio_offset;
3363 *reqpp = reqp;
3364 return (0);
3365 }
3366
3367 /*
3368 * This routine is called when a non-largefile call is made by a
3369 * 32-bit process on an ILP32 or LP64 kernel.
3370 */
3371 static int
3372 alio32(
3373 int mode_arg,
3374 void *aiocb_arg,
3375 int nent,
3376 void *sigev)
3377 {
3378 file_t *fp;
3379 file_t *prev_fp = NULL;
3380 int prev_mode = -1;
3381 struct vnode *vp;
3382 aio_lio_t *head;
3383 aio_req_t *reqp;
3384 aio_t *aiop;
3385 caddr_t cbplist;
3386 aiocb_t cb;
3387 aiocb_t *aiocb = &cb;
3388 #ifdef _LP64
3389 aiocb32_t *cbp;
3390 caddr32_t *ucbp;
3391 aiocb32_t cb32;
3392 aiocb32_t *aiocb32 = &cb32;
3393 struct sigevent32 sigevk;
3394 #else
3395 aiocb_t *cbp, **ucbp;
3396 struct sigevent sigevk;
3397 #endif
3398 sigqueue_t *sqp;
3399 int (*aio_func)();
3400 int mode;
3401 int error = 0;
3402 int aio_errors = 0;
3403 int i;
3404 size_t ssize;
3405 int deadhead = 0;
3406 int aio_notsupported = 0;
3407 int lio_head_port;
3408 int aio_port;
3409 int aio_thread;
3410 port_kevent_t *pkevtp = NULL;
3411 int portused = 0;
3412 #ifdef _LP64
3413 port_notify32_t pnotify;
3414 #else
3415 port_notify_t pnotify;
3416 #endif
3417 int event;
3418
3419 aiop = curproc->p_aio;
3420 if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
3421 return (EINVAL);
3422
3423 #ifdef _LP64
3424 ssize = (sizeof (caddr32_t) * nent);
3425 #else
3426 ssize = (sizeof (aiocb_t *) * nent);
3427 #endif
3428 cbplist = kmem_alloc(ssize, KM_SLEEP);
3429 ucbp = (void *)cbplist;
3430
3431 if (copyin(aiocb_arg, cbplist, ssize) ||
3432 (sigev && copyin(sigev, &sigevk, sizeof (struct sigevent32)))) {
3433 kmem_free(cbplist, ssize);
3434 return (EFAULT);
3435 }
3436
3437 /* Event Ports */
3438 if (sigev &&
3439 (sigevk.sigev_notify == SIGEV_THREAD ||
3440 sigevk.sigev_notify == SIGEV_PORT)) {
3441 if (sigevk.sigev_notify == SIGEV_THREAD) {
3442 pnotify.portnfy_port = sigevk.sigev_signo;
3443 pnotify.portnfy_user = sigevk.sigev_value.sival_ptr;
3444 } else if (copyin(
3445 (void *)(uintptr_t)sigevk.sigev_value.sival_ptr,
3446 &pnotify, sizeof (pnotify))) {
3447 kmem_free(cbplist, ssize);
3448 return (EFAULT);
3449 }
3450 error = port_alloc_event(pnotify.portnfy_port,
3451 PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, &pkevtp);
3452 if (error) {
3453 if (error == ENOMEM || error == EAGAIN)
3454 error = EAGAIN;
3455 else
3456 error = EINVAL;
3457 kmem_free(cbplist, ssize);
3458 return (error);
3459 }
3460 lio_head_port = pnotify.portnfy_port;
3461 portused = 1;
3462 }
3463
3464 /*
3465 * a list head should be allocated if notification is
3466 * enabled for this list.
3467 */
3468 head = NULL;
3469
3470 if (mode_arg == LIO_WAIT || sigev) {
3471 mutex_enter(&aiop->aio_mutex);
3472 error = aio_lio_alloc(&head);
3473 mutex_exit(&aiop->aio_mutex);
3474 if (error)
3475 goto done;
3476 deadhead = 1;
3477 head->lio_nent = nent;
3478 head->lio_refcnt = nent;
3479 head->lio_port = -1;
3480 head->lio_portkev = NULL;
3481 if (sigev && sigevk.sigev_notify == SIGEV_SIGNAL &&
3482 sigevk.sigev_signo > 0 && sigevk.sigev_signo < NSIG) {
3483 sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
3484 if (sqp == NULL) {
3485 error = EAGAIN;
3486 goto done;
3487 }
3488 sqp->sq_func = NULL;
3489 sqp->sq_next = NULL;
3490 sqp->sq_info.si_code = SI_ASYNCIO;
3491 sqp->sq_info.si_pid = curproc->p_pid;
3492 sqp->sq_info.si_ctid = PRCTID(curproc);
3493 sqp->sq_info.si_zoneid = getzoneid();
3494 sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
3495 sqp->sq_info.si_signo = sigevk.sigev_signo;
3496 sqp->sq_info.si_value.sival_int =
3497 sigevk.sigev_value.sival_int;
3498 head->lio_sigqp = sqp;
3499 } else {
3500 head->lio_sigqp = NULL;
3501 }
3502 if (pkevtp) {
3503 /*
3504 * Prepare data to send when list of aiocb's has
3505 * completed.
3506 */
3507 port_init_event(pkevtp, (uintptr_t)sigev,
3508 (void *)(uintptr_t)pnotify.portnfy_user,
3509 NULL, head);
3510 pkevtp->portkev_events = AIOLIO;
3511 head->lio_portkev = pkevtp;
3512 head->lio_port = pnotify.portnfy_port;
3513 }
3514 }
3515
3516 for (i = 0; i < nent; i++, ucbp++) {
3517
3518 /* skip entry if it can't be copied. */
3519 #ifdef _LP64
3520 cbp = (aiocb32_t *)(uintptr_t)*ucbp;
3521 if (cbp == NULL || copyin(cbp, aiocb32, sizeof (*aiocb32)))
3522 #else
3523 cbp = (aiocb_t *)*ucbp;
3524 if (cbp == NULL || copyin(cbp, aiocb, sizeof (*aiocb)))
3525 #endif
3526 {
3527 if (head) {
3528 mutex_enter(&aiop->aio_mutex);
3529 head->lio_nent--;
3530 head->lio_refcnt--;
3531 mutex_exit(&aiop->aio_mutex);
3532 }
3533 continue;
3534 }
3535 #ifdef _LP64
3536 /*
3537 * copy 32 bit structure into 64 bit structure
3538 */
3539 aiocb_32ton(aiocb32, aiocb);
3540 #endif /* _LP64 */
3541
3542 /* skip if opcode for aiocb is LIO_NOP */
3543 mode = aiocb->aio_lio_opcode;
3544 if (mode == LIO_NOP) {
3545 cbp = NULL;
3546 if (head) {
3547 mutex_enter(&aiop->aio_mutex);
3548 head->lio_nent--;
3549 head->lio_refcnt--;
3550 mutex_exit(&aiop->aio_mutex);
3551 }
3552 continue;
3553 }
3554
3555 /* increment file descriptor's ref count. */
3556 if ((fp = getf(aiocb->aio_fildes)) == NULL) {
3557 lio_set_uerror(&cbp->aio_resultp, EBADF);
3558 if (head) {
3559 mutex_enter(&aiop->aio_mutex);
3560 head->lio_nent--;
3561 head->lio_refcnt--;
3562 mutex_exit(&aiop->aio_mutex);
3563 }
3564 aio_errors++;
3565 continue;
3566 }
3567
3568 /*
3569 * check that the file was opened with the required access mode
3570 */
3571 if ((fp->f_flag & mode) == 0) {
3572 releasef(aiocb->aio_fildes);
3573 lio_set_uerror(&cbp->aio_resultp, EBADF);
3574 if (head) {
3575 mutex_enter(&aiop->aio_mutex);
3576 head->lio_nent--;
3577 head->lio_refcnt--;
3578 mutex_exit(&aiop->aio_mutex);
3579 }
3580 aio_errors++;
3581 continue;
3582 }
3583
3584 /*
3585 * common case where requests are to the same fd
3586 * for the same r/w operation.
3587 * for UFS, check_vp() returns NULL, so EBADFD is set.
3588 */
3589 vp = fp->f_vnode;
3590 if (fp != prev_fp || mode != prev_mode) {
3591 aio_func = check_vp(vp, mode);
3592 if (aio_func == NULL) {
3593 prev_fp = NULL;
3594 releasef(aiocb->aio_fildes);
3595 lio_set_uerror(&cbp->aio_resultp, EBADFD);
3596 aio_notsupported++;
3597 if (head) {
3598 mutex_enter(&aiop->aio_mutex);
3599 head->lio_nent--;
3600 head->lio_refcnt--;
3601 mutex_exit(&aiop->aio_mutex);
3602 }
3603 continue;
3604 } else {
3605 prev_fp = fp;
3606 prev_mode = mode;
3607 }
3608 }
3609
3610 error = aio_req_setup(&reqp, aiop, aiocb,
3611 (aio_result_t *)&cbp->aio_resultp, vp, 0);
3612 if (error) {
3613 releasef(aiocb->aio_fildes);
3614 lio_set_uerror(&cbp->aio_resultp, error);
3615 if (head) {
3616 mutex_enter(&aiop->aio_mutex);
3617 head->lio_nent--;
3618 head->lio_refcnt--;
3619 mutex_exit(&aiop->aio_mutex);
3620 }
3621 aio_errors++;
3622 continue;
3623 }
3624
3625 reqp->aio_req_lio = head;
3626 deadhead = 0;
3627
3628 /*
3629 * Set the errno field now before sending the request to
3630 * the driver to avoid a race condition
3631 */
3632 (void) suword32(&cbp->aio_resultp.aio_errno,
3633 EINPROGRESS);
3634
3635 reqp->aio_req_iocb.iocb32 = (caddr32_t)(uintptr_t)cbp;
3636
3637 event = (mode == LIO_READ)? AIOAREAD : AIOAWRITE;
3638 aio_port = (aiocb->aio_sigevent.sigev_notify == SIGEV_PORT);
3639 aio_thread = (aiocb->aio_sigevent.sigev_notify == SIGEV_THREAD);
3640 if (aio_port | aio_thread) {
3641 port_kevent_t *lpkevp;
3642 /*
3643 * Prepare data to send with each aiocb completed.
3644 */
3645 #ifdef _LP64
3646 if (aio_port) {
3647 void *paddr = (void *)(uintptr_t)
3648 aiocb32->aio_sigevent.sigev_value.sival_ptr;
3649 if (copyin(paddr, &pnotify, sizeof (pnotify)))
3650 error = EFAULT;
3651 } else { /* aio_thread */
3652 pnotify.portnfy_port =
3653 aiocb32->aio_sigevent.sigev_signo;
3654 pnotify.portnfy_user =
3655 aiocb32->aio_sigevent.sigev_value.sival_ptr;
3656 }
3657 #else
3658 if (aio_port) {
3659 void *paddr =
3660 aiocb->aio_sigevent.sigev_value.sival_ptr;
3661 if (copyin(paddr, &pnotify, sizeof (pnotify)))
3662 error = EFAULT;
3663 } else { /* aio_thread */
3664 pnotify.portnfy_port =
3665 aiocb->aio_sigevent.sigev_signo;
3666 pnotify.portnfy_user =
3667 aiocb->aio_sigevent.sigev_value.sival_ptr;
3668 }
3669 #endif
3670 if (error)
3671 /* EMPTY */;
3672 else if (pkevtp != NULL &&
3673 pnotify.portnfy_port == lio_head_port)
3674 error = port_dup_event(pkevtp, &lpkevp,
3675 PORT_ALLOC_DEFAULT);
3676 else
3677 error = port_alloc_event(pnotify.portnfy_port,
3678 PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO,
3679 &lpkevp);
3680 if (error == 0) {
3681 port_init_event(lpkevp, (uintptr_t)cbp,
3682 (void *)(uintptr_t)pnotify.portnfy_user,
3683 aio_port_callback, reqp);
3684 lpkevp->portkev_events = event;
3685 reqp->aio_req_portkev = lpkevp;
3686 reqp->aio_req_port = pnotify.portnfy_port;
3687 }
3688 }
3689
3690 /*
3691 * send the request to the driver.
3692 */
3693 if (error == 0) {
3694 if (aiocb->aio_nbytes == 0) {
3695 clear_active_fd(aiocb->aio_fildes);
3696 aio_zerolen(reqp);
3697 continue;
3698 }
3699 error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req,
3700 CRED());
3701 }
3702
3703 /*
3704 * the fd's ref count is not decremented until the IO has
3705 * completed unless there was an error.
3706 */
3707 if (error) {
3708 releasef(aiocb->aio_fildes);
3709 lio_set_uerror(&cbp->aio_resultp, error);
3710 if (head) {
3711 mutex_enter(&aiop->aio_mutex);
3712 head->lio_nent--;
3713 head->lio_refcnt--;
3714 mutex_exit(&aiop->aio_mutex);
3715 }
3716 if (error == ENOTSUP)
3717 aio_notsupported++;
3718 else
3719 aio_errors++;
3720 lio_set_error(reqp, portused);
3721 } else {
3722 clear_active_fd(aiocb->aio_fildes);
3723 }
3724 }
3725
3726 if (aio_notsupported) {
3727 error = ENOTSUP;
3728 } else if (aio_errors) {
3729 /*
3730 * return EIO if any request failed
3731 */
3732 error = EIO;
3733 }
3734
3735 if (mode_arg == LIO_WAIT) {
3736 mutex_enter(&aiop->aio_mutex);
3737 while (head->lio_refcnt > 0) {
3738 if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
3739 mutex_exit(&aiop->aio_mutex);
3740 error = EINTR;
3741 goto done;
3742 }
3743 }
3744 mutex_exit(&aiop->aio_mutex);
3745 alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_32);
3746 }
3747
3748 done:
3749 kmem_free(cbplist, ssize);
3750 if (deadhead) {
3751 if (head->lio_sigqp)
3752 kmem_free(head->lio_sigqp, sizeof (sigqueue_t));
3753 if (head->lio_portkev)
3754 port_free_event(head->lio_portkev);
3755 kmem_free(head, sizeof (aio_lio_t));
3756 }
3757 return (error);
3758 }
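
/*
 * User-level sketch of the event-port list notification handled above: the
 * sigevent passed as the last lio_listio(3C) argument uses SIGEV_PORT, with
 * sigev_value.sival_ptr pointing at a port_notify_t; one AIOLIO event is
 * posted to the port once every request in the list has completed.  The
 * port_notify_t is copied in at submission time, so a stack copy suffices.
 * Names are illustrative and error checking is trimmed.
 *
 *	#include <aio.h>
 *	#include <port.h>
 *	#include <signal.h>
 *	#include <string.h>
 *
 *	int
 *	submit_batch(int port, struct aiocb **list, int nent)
 *	{
 *		port_notify_t pn;
 *		struct sigevent ev;
 *
 *		pn.portnfy_port = port;		// from port_create()
 *		pn.portnfy_user = list;		// cookie for the list event
 *
 *		(void) memset(&ev, 0, sizeof (ev));
 *		ev.sigev_notify = SIGEV_PORT;
 *		ev.sigev_value.sival_ptr = &pn;
 *
 *		// returns immediately; completion is reported via the port
 *		return (lio_listio(LIO_NOWAIT, list, nent, &ev));
 *	}
 */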
3759
3760
3761 #ifdef _SYSCALL32_IMPL
3762 void
3763 aiocb_32ton(aiocb32_t *src, aiocb_t *dest)
3764 {
3765 dest->aio_fildes = src->aio_fildes;
3766 dest->aio_buf = (caddr_t)(uintptr_t)src->aio_buf;
3767 dest->aio_nbytes = (size_t)src->aio_nbytes;
3768 dest->aio_offset = (off_t)src->aio_offset;
3769 dest->aio_reqprio = src->aio_reqprio;
3770 dest->aio_sigevent.sigev_notify = src->aio_sigevent.sigev_notify;
3771 dest->aio_sigevent.sigev_signo = src->aio_sigevent.sigev_signo;
3772
3773 /*
3774 * See comment in sigqueue32() on handling of 32-bit
3775 * sigvals in a 64-bit kernel.
3776 */
3777 dest->aio_sigevent.sigev_value.sival_int =
3778 (int)src->aio_sigevent.sigev_value.sival_int;
3779 dest->aio_sigevent.sigev_notify_function = (void (*)(union sigval))
3780 (uintptr_t)src->aio_sigevent.sigev_notify_function;
3781 dest->aio_sigevent.sigev_notify_attributes = (pthread_attr_t *)
3782 (uintptr_t)src->aio_sigevent.sigev_notify_attributes;
3783 dest->aio_sigevent.__sigev_pad2 = src->aio_sigevent.__sigev_pad2;
3784 dest->aio_lio_opcode = src->aio_lio_opcode;
3785 dest->aio_state = src->aio_state;
3786 dest->aio__pad[0] = src->aio__pad[0];
3787 }
3788 #endif /* _SYSCALL32_IMPL */
3789
3790 /*
3791 * aio_port_callback() is called just before the event is retrieved from the
3792 * port. The task of this callback function is to finish the work of the
3793 * transaction for the application, which means:
3794 * - copy out transaction data to the application
3795 * (this thread is running in the right process context)
3796 * - keep track of the transaction (update counters).
3797 * - free allocated buffers
3798 * The aiocb pointer is the object element of the port_kevent_t structure.
3799 *
3800 * flag:
3801 * PORT_CALLBACK_DEFAULT : do copyout and free resources
3802 * PORT_CALLBACK_CLOSE : don't do copyout, free resources
3803 */
3804
3805 /*ARGSUSED*/
3806 int
3807 aio_port_callback(void *arg, int *events, pid_t pid, int flag, void *evp)
3808 {
3809 aio_t *aiop = curproc->p_aio;
3810 aio_req_t *reqp = arg;
3811 struct iovec *iov;
3812 struct buf *bp;
3813 void *resultp;
3814
3815 if (pid != curproc->p_pid) {
3816 /* wrong proc!!  cannot deliver data here ... */
3817 return (EACCES);
3818 }
3819
3820 mutex_enter(&aiop->aio_portq_mutex);
3821 reqp->aio_req_portkev = NULL;
3822 aio_req_remove_portq(aiop, reqp); /* remove request from portq */
3823 mutex_exit(&aiop->aio_portq_mutex);
3824 aphysio_unlock(reqp); /* unlock used pages */
3825 mutex_enter(&aiop->aio_mutex);
3826 if (reqp->aio_req_flags & AIO_COPYOUTDONE) {
3827 aio_req_free_port(aiop, reqp); /* back to free list */
3828 mutex_exit(&aiop->aio_mutex);
3829 return (0);
3830 }
3831
3832 iov = reqp->aio_req_uio.uio_iov;
3833 bp = &reqp->aio_req_buf;
3834 resultp = (void *)reqp->aio_req_resultp;
3835 aio_req_free_port(aiop, reqp); /* request struct back to free list */
3836 mutex_exit(&aiop->aio_mutex);
3837 if (flag == PORT_CALLBACK_DEFAULT)
3838 aio_copyout_result_port(iov, bp, resultp);
3839 return (0);
3840 }
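
/*
 * On the application side, the events funneled through this callback are
 * retrieved with port_get(3C): portev_source is PORT_SOURCE_AIO,
 * portev_object is the aiocb address (or, for a list event, the sigevent
 * address passed to lio_listio()), and portev_user is the cookie from the
 * port_notify_t.  A minimal, illustrative drain loop, assuming only
 * per-request SIGEV_PORT notification (no AIOLIO list events):
 *
 *	#include <aio.h>
 *	#include <port.h>
 *
 *	void
 *	drain(int port)
 *	{
 *		port_event_t pe;
 *		struct aiocb *cb;
 *
 *		for (;;) {
 *			if (port_get(port, &pe, NULL) != 0)
 *				break;
 *			if (pe.portev_source != PORT_SOURCE_AIO)
 *				continue;
 *			cb = (struct aiocb *)pe.portev_object;
 *			(void) aio_return(cb);	// also see aio_error(3C)
 *		}
 *	}
 */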