Print this page
5880 Increase IOV_MAX to at least 1024
Portions contributed by: Jerry Jelinek <jerry.jelinek@joyent.com>


  61 #include <sys/vtrace.h>
  62 #include <sys/debug.h>
  63 #include <sys/strredir.h>
  64 #include <sys/fs/fifonode.h>
  65 #include <sys/fs/snode.h>
  66 #include <sys/strlog.h>
  67 #include <sys/strsun.h>
  68 #include <sys/project.h>
  69 #include <sys/kbio.h>
  70 #include <sys/msio.h>
  71 #include <sys/tty.h>
  72 #include <sys/ptyvar.h>
  73 #include <sys/vuid_event.h>
  74 #include <sys/modctl.h>
  75 #include <sys/sunddi.h>
  76 #include <sys/sunldi_impl.h>
  77 #include <sys/autoconf.h>
  78 #include <sys/policy.h>
  79 #include <sys/dld.h>
  80 #include <sys/zone.h>

  81 #include <c2/audit.h>
  82 
  83 /*
  84  * This define helps improve the readability of streams code while
  85  * still maintaining a very old streams performance enhancement.  The
  86  * performance enhancement basically involved having all callers
  87  * of straccess() perform the first check that straccess() will do
  88  * locally before actually calling straccess().  (Thereby reducing
  89  * the number of unnecessary calls to straccess().)
  90  */
  91 #define i_straccess(x, y)       ((stp->sd_sidp == NULL) ? 0 : \
  92                                     (stp->sd_vnode->v_type == VFIFO) ? 0 : \
  93                                     straccess((x), (y)))
  94 
  95 /*
  96  * what is mblk_pull_len?
  97  *
  98  * If a streams message consists of many short messages,
  99  * a performance degradation occurs from copyout overhead.
 100  * To decrease the per mblk overhead, messages that are


 970 strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first,
 971     int *errorp)
 972 {
 973         mblk_t *bp;
 974         int error;
 975         ssize_t rbytes = 0;
 976 
 977         /* Holding sd_lock prevents the read queue from changing  */
 978         ASSERT(MUTEX_HELD(&stp->sd_lock));
 979 
 980         if (uiop != NULL && stp->sd_struiordq != NULL &&
 981             q->q_first == NULL &&
 982             (!first || (stp->sd_wakeq & RSLEEP))) {
 983                 /*
 984                  * Stream supports rwnext() for the read side.
 985                  * If this is the first time we're called by e.g. strread
 986                  * only do the downcall if there is a deferred wakeup
 987                  * (registered in sd_wakeq).
 988                  */
 989                 struiod_t uiod;


 990 
 991                 if (first)
 992                         stp->sd_wakeq &= ~RSLEEP;
 993 
 994                 (void) uiodup(uiop, &uiod.d_uio, uiod.d_iov,
 995                     sizeof (uiod.d_iov) / sizeof (*uiod.d_iov));






 996                 uiod.d_mp = 0;
 997                 /*
 998                  * Mark that a thread is in rwnext on the read side
 999                  * to prevent strrput from nacking ioctls immediately.
1000                  * When the last concurrent rwnext returns
1001                  * the ioctls are nack'ed.
1002                  */
1003                 ASSERT(MUTEX_HELD(&stp->sd_lock));
1004                 stp->sd_struiodnak++;
1005                 /*
1006                  * Note: rwnext will drop sd_lock.
1007                  */
1008                 error = rwnext(q, &uiod);
1009                 ASSERT(MUTEX_NOT_HELD(&stp->sd_lock));
1010                 mutex_enter(&stp->sd_lock);
1011                 stp->sd_struiodnak--;
1012                 while (stp->sd_struiodnak == 0 &&
1013                     ((bp = stp->sd_struionak) != NULL)) {
1014                         stp->sd_struionak = bp->b_next;
1015                         bp->b_next = NULL;
1016                         bp->b_datap->db_type = M_IOCNAK;
1017                         /*
1018                          * Protect against the driver passing up
1019                          * messages after it has done a qprocsoff.
1020                          */
1021                         if (_OTHERQ(q)->q_next == NULL)
1022                                 freemsg(bp);
1023                         else {
1024                                 mutex_exit(&stp->sd_lock);
1025                                 qreply(q, bp);
1026                                 mutex_enter(&stp->sd_lock);
1027                         }
1028                 }
1029                 ASSERT(MUTEX_HELD(&stp->sd_lock));
1030                 if (error == 0 || error == EWOULDBLOCK) {
1031                         if ((bp = uiod.d_mp) != NULL) {
1032                                 *errorp = 0;
1033                                 ASSERT(MUTEX_HELD(&stp->sd_lock));


1034                                 return (bp);
1035                         }
1036                         error = 0;
1037                 } else if (error == EINVAL) {
1038                         /*
1039                          * The stream plumbing must have
1040                          * changed while we were away, so
1041                          * just turn off rwnext()s.
1042                          */
1043                         error = 0;
1044                 } else if (error == EBUSY) {
1045                         /*
1046                          * The module might have data in transit using putnext
1047                          * Fall back on waiting + getq.
1048                          */
1049                         error = 0;
1050                 } else {
1051                         *errorp = error;
1052                         ASSERT(MUTEX_HELD(&stp->sd_lock));


1053                         return (NULL);
1054                 }




1055                 /*
1056                  * Try a getq in case a rwnext() generated mblk
1057                  * has bubbled up via strrput().
1058                  */
1059         }
1060         *errorp = 0;
1061         ASSERT(MUTEX_HELD(&stp->sd_lock));
1062 
1063         /*
1064          * If we have a valid uio, try and use this as a guide for how
1065          * many bytes to retrieve from the queue via getq_noenab().
 1066  * Doing this can avoid unnecessary counting of overlong
1067          * messages in putback(). We currently only do this for sockets
1068          * and only if there is no sd_rputdatafunc hook.
1069          *
1070          * The sd_rputdatafunc hook transforms the entire message
1071          * before any bytes in it can be given to a client. So, rbytes
1072          * must be 0 if there is a hook.
1073          */
1074         if ((uiop != NULL) && (stp->sd_vnode->v_type == VSOCK) &&


2529  *
2530  * Caller should *not* hold sd_lock.
2531  * When EWOULDBLOCK is returned the caller has to redo the canputnext
2532  * under sd_lock in order to avoid missing a backenabling wakeup.
2533  *
2534  * Use iosize = -1 to not send any M_DATA. iosize = 0 sends zero-length M_DATA.
2535  *
2536  * Set MSG_IGNFLOW in flags to ignore flow control for hipri messages.
2537  * For sync streams we can only ignore flow control by reverting to using
2538  * putnext.
2539  *
2540  * If sd_maxblk is less than *iosize this routine might return without
2541  * transferring all of *iosize. In all cases, on return *iosize will contain
2542  * the amount of data that was transferred.
2543  */
2544 static int
2545 strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize,
2546     int b_flag, int pri, int flags)
2547 {
2548         struiod_t uiod;


2549         mblk_t *mp;
2550         queue_t *wqp = stp->sd_wrq;
2551         int error = 0;
2552         ssize_t count = *iosize;
2553 
2554         ASSERT(MUTEX_NOT_HELD(&stp->sd_lock));
2555 
2556         if (uiop != NULL && count >= 0)
2557                 flags |= stp->sd_struiowrq ? STRUIO_POSTPONE : 0;
2558 
2559         if (!(flags & STRUIO_POSTPONE)) {
2560                 /*
2561                  * Use regular canputnext, strmakedata, putnext sequence.
2562                  */
2563                 if (pri == 0) {
2564                         if (!canputnext(wqp) && !(flags & MSG_IGNFLOW)) {
2565                                 freemsg(mctl);
2566                                 return (EWOULDBLOCK);
2567                         }
2568                 } else {


2620         if ((error = strmakedata(iosize, uiop, stp, flags, &mp)) != 0) {
2621                 freemsg(mctl);
2622                 /*
2623                  * map EAGAIN to ENOMEM since EAGAIN means "flow controlled".
2624                  */
2625                 return (error == EAGAIN ? ENOMEM : error);
2626         }
2627         if (mctl != NULL) {
2628                 if (mctl->b_cont == NULL)
2629                         mctl->b_cont = mp;
2630                 else if (mp != NULL)
2631                         linkb(mctl, mp);
2632                 mp = mctl;
2633         } else if (mp == NULL) {
2634                 return (0);
2635         }
2636 
2637         mp->b_flag |= b_flag;
2638         mp->b_band = (uchar_t)pri;
2639 
2640         (void) uiodup(uiop, &uiod.d_uio, uiod.d_iov,
2641             sizeof (uiod.d_iov) / sizeof (*uiod.d_iov));






2642         uiod.d_uio.uio_offset = 0;
2643         uiod.d_mp = mp;
2644         error = rwnext(wqp, &uiod);
2645         if (! uiod.d_mp) {
2646                 uioskip(uiop, *iosize);


2647                 return (error);
2648         }
2649         ASSERT(mp == uiod.d_mp);
2650         if (error == EINVAL) {
2651                 /*
2652                  * The stream plumbing must have changed while
2653                  * we were away, so just turn off rwnext()s.
2654                  */
2655                 error = 0;
2656         } else if (error == EBUSY || error == EWOULDBLOCK) {
2657                 /*
2658                  * Couldn't enter a perimeter or took a page fault,
2659                  * so fall-back to putnext().
2660                  */
2661                 error = 0;
2662         } else {
2663                 freemsg(mp);


2664                 return (error);
2665         }
2666         /* Have to check canput before consuming data from the uio */
2667         if (pri == 0) {
2668                 if (!canputnext(wqp) && !(flags & MSG_IGNFLOW)) {
2669                         freemsg(mp);


2670                         return (EWOULDBLOCK);
2671                 }
2672         } else {
2673                 if (!bcanputnext(wqp, pri) && !(flags & MSG_IGNFLOW)) {
2674                         freemsg(mp);


2675                         return (EWOULDBLOCK);
2676                 }
2677         }
2678         ASSERT(mp == uiod.d_mp);
2679         /* Copyin data from the uio */
2680         if ((error = struioget(wqp, mp, &uiod, 0)) != 0) {
2681                 freemsg(mp);


2682                 return (error);
2683         }
2684         uioskip(uiop, *iosize);
2685         if (flags & MSG_IGNFLOW) {
2686                 /*
2687                  * XXX Hack: Don't get stuck running service procedures.
2688                  * This is needed for sockfs when sending the unbind message
2689                  * out of the rput procedure - we don't want a put procedure
2690                  * to run service procedures.
2691                  */
2692                 putnext(wqp, mp);
2693         } else {
2694                 stream_willservice(stp);
2695                 putnext(wqp, mp);
2696                 stream_runservice(stp);
2697         }


2698         return (0);
2699 }
2700 
2701 /*
2702  * Write attempts to break the write request into messages conforming
2703  * with the minimum and maximum packet sizes set downstream.
2704  *
2705  * Write will not block if downstream queue is full and
2706  * O_NDELAY is set, otherwise it will block waiting for the queue to get room.
2707  *
2708  * A write of zero bytes gets packaged into a zero length message and sent
2709  * downstream like any other message.
2710  *
2711  * If buffers of the requested sizes are not available, the write will
2712  * sleep until the buffers become available.
2713  *
2714  * Write (if specified) will supply a write offset in a message if it
2715  * makes sense. This can be specified by downstream modules as part of
2716  * a M_SETOPTS message.  Write will not supply the write offset if it
2717  * cannot supply any data in a buffer.  In other words, write will never




  61 #include <sys/vtrace.h>
  62 #include <sys/debug.h>
  63 #include <sys/strredir.h>
  64 #include <sys/fs/fifonode.h>
  65 #include <sys/fs/snode.h>
  66 #include <sys/strlog.h>
  67 #include <sys/strsun.h>
  68 #include <sys/project.h>
  69 #include <sys/kbio.h>
  70 #include <sys/msio.h>
  71 #include <sys/tty.h>
  72 #include <sys/ptyvar.h>
  73 #include <sys/vuid_event.h>
  74 #include <sys/modctl.h>
  75 #include <sys/sunddi.h>
  76 #include <sys/sunldi_impl.h>
  77 #include <sys/autoconf.h>
  78 #include <sys/policy.h>
  79 #include <sys/dld.h>
  80 #include <sys/zone.h>
  81 #include <sys/limits.h>
  82 #include <c2/audit.h>
  83 
  84 /*
  85  * This define helps improve the readability of streams code while
  86  * still maintaining a very old streams performance enhancement.  The
  87  * performance enhancement basically involved having all callers
  88  * of straccess() perform the first check that straccess() will do
  89  * locally before actually calling straccess().  (Thereby reducing
  90  * the number of unnecessary calls to straccess().)
  91  */
  92 #define i_straccess(x, y)       ((stp->sd_sidp == NULL) ? 0 : \
  93                                     (stp->sd_vnode->v_type == VFIFO) ? 0 : \
  94                                     straccess((x), (y)))
  95 
  96 /*
  97  * what is mblk_pull_len?
  98  *
  99  * If a streams message consists of many short messages,
 100  * a performance degradation occurs from copyout overhead.
 101  * To decrease the per mblk overhead, messages that are


 971 strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first,
 972     int *errorp)
 973 {
 974         mblk_t *bp;
 975         int error;
 976         ssize_t rbytes = 0;
 977 
 978         /* Holding sd_lock prevents the read queue from changing  */
 979         ASSERT(MUTEX_HELD(&stp->sd_lock));
 980 
 981         if (uiop != NULL && stp->sd_struiordq != NULL &&
 982             q->q_first == NULL &&
 983             (!first || (stp->sd_wakeq & RSLEEP))) {
 984                 /*
 985                  * Stream supports rwnext() for the read side.
 986                  * If this is the first time we're called by e.g. strread
 987                  * only do the downcall if there is a deferred wakeup
 988                  * (registered in sd_wakeq).
 989                  */
 990                 struiod_t uiod;
 991                 struct iovec buf[IOV_MAX_STACK];
 992                 int iovlen = 0;
 993 
 994                 if (first)
 995                         stp->sd_wakeq &= ~RSLEEP;
 996 
 997                 if (uiop->uio_iovcnt > IOV_MAX_STACK) {
 998                         iovlen = uiop->uio_iovcnt * sizeof (iovec_t);
 999                         uiod.d_iov = kmem_alloc(iovlen, KM_SLEEP);
1000                 } else {
1001                         uiod.d_iov = buf;
1002                 }
1003 
1004                 (void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, uiop->uio_iovcnt);
1005                 uiod.d_mp = 0;
1006                 /*
1007                  * Mark that a thread is in rwnext on the read side
1008                  * to prevent strrput from nacking ioctls immediately.
1009                  * When the last concurrent rwnext returns
1010                  * the ioctls are nack'ed.
1011                  */
1012                 ASSERT(MUTEX_HELD(&stp->sd_lock));
1013                 stp->sd_struiodnak++;
1014                 /*
1015                  * Note: rwnext will drop sd_lock.
1016                  */
1017                 error = rwnext(q, &uiod);
1018                 ASSERT(MUTEX_NOT_HELD(&stp->sd_lock));
1019                 mutex_enter(&stp->sd_lock);
1020                 stp->sd_struiodnak--;
1021                 while (stp->sd_struiodnak == 0 &&
1022                     ((bp = stp->sd_struionak) != NULL)) {
1023                         stp->sd_struionak = bp->b_next;
1024                         bp->b_next = NULL;
1025                         bp->b_datap->db_type = M_IOCNAK;
1026                         /*
1027                          * Protect against the driver passing up
1028                          * messages after it has done a qprocsoff.
1029                          */
1030                         if (_OTHERQ(q)->q_next == NULL)
1031                                 freemsg(bp);
1032                         else {
1033                                 mutex_exit(&stp->sd_lock);
1034                                 qreply(q, bp);
1035                                 mutex_enter(&stp->sd_lock);
1036                         }
1037                 }
1038                 ASSERT(MUTEX_HELD(&stp->sd_lock));
1039                 if (error == 0 || error == EWOULDBLOCK) {
1040                         if ((bp = uiod.d_mp) != NULL) {
1041                                 *errorp = 0;
1042                                 ASSERT(MUTEX_HELD(&stp->sd_lock));
1043                                 if (iovlen != 0)
1044                                         kmem_free(uiod.d_iov, iovlen);
1045                                 return (bp);
1046                         }
1047                         error = 0;
1048                 } else if (error == EINVAL) {
1049                         /*
1050                          * The stream plumbing must have
1051                          * changed while we were away, so
1052                          * just turn off rwnext()s.
1053                          */
1054                         error = 0;
1055                 } else if (error == EBUSY) {
1056                         /*
1057                          * The module might have data in transit using putnext
1058                          * Fall back on waiting + getq.
1059                          */
1060                         error = 0;
1061                 } else {
1062                         *errorp = error;
1063                         ASSERT(MUTEX_HELD(&stp->sd_lock));
1064                         if (iovlen != 0)
1065                                 kmem_free(uiod.d_iov, iovlen);
1066                         return (NULL);
1067                 }
1068 
1069                 if (iovlen != 0)
1070                         kmem_free(uiod.d_iov, iovlen);
1071 
1072                 /*
1073                  * Try a getq in case a rwnext() generated mblk
1074                  * has bubbled up via strrput().
1075                  */
1076         }
1077         *errorp = 0;
1078         ASSERT(MUTEX_HELD(&stp->sd_lock));
1079 
1080         /*
1081          * If we have a valid uio, try and use this as a guide for how
1082          * many bytes to retrieve from the queue via getq_noenab().
 1083  * Doing this can avoid unnecessary counting of overlong
1084          * messages in putback(). We currently only do this for sockets
1085          * and only if there is no sd_rputdatafunc hook.
1086          *
1087          * The sd_rputdatafunc hook transforms the entire message
1088          * before any bytes in it can be given to a client. So, rbytes
1089          * must be 0 if there is a hook.
1090          */
1091         if ((uiop != NULL) && (stp->sd_vnode->v_type == VSOCK) &&


2546  *
2547  * Caller should *not* hold sd_lock.
2548  * When EWOULDBLOCK is returned the caller has to redo the canputnext
2549  * under sd_lock in order to avoid missing a backenabling wakeup.
2550  *
2551  * Use iosize = -1 to not send any M_DATA. iosize = 0 sends zero-length M_DATA.
2552  *
2553  * Set MSG_IGNFLOW in flags to ignore flow control for hipri messages.
2554  * For sync streams we can only ignore flow control by reverting to using
2555  * putnext.
2556  *
2557  * If sd_maxblk is less than *iosize this routine might return without
2558  * transferring all of *iosize. In all cases, on return *iosize will contain
2559  * the amount of data that was transferred.
2560  */
2561 static int
2562 strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize,
2563     int b_flag, int pri, int flags)
2564 {
2565         struiod_t uiod;
2566         struct iovec buf[IOV_MAX_STACK];
2567         int iovlen = 0;
2568         mblk_t *mp;
2569         queue_t *wqp = stp->sd_wrq;
2570         int error = 0;
2571         ssize_t count = *iosize;
2572 
2573         ASSERT(MUTEX_NOT_HELD(&stp->sd_lock));
2574 
2575         if (uiop != NULL && count >= 0)
2576                 flags |= stp->sd_struiowrq ? STRUIO_POSTPONE : 0;
2577 
2578         if (!(flags & STRUIO_POSTPONE)) {
2579                 /*
2580                  * Use regular canputnext, strmakedata, putnext sequence.
2581                  */
2582                 if (pri == 0) {
2583                         if (!canputnext(wqp) && !(flags & MSG_IGNFLOW)) {
2584                                 freemsg(mctl);
2585                                 return (EWOULDBLOCK);
2586                         }
2587                 } else {


2639         if ((error = strmakedata(iosize, uiop, stp, flags, &mp)) != 0) {
2640                 freemsg(mctl);
2641                 /*
2642                  * map EAGAIN to ENOMEM since EAGAIN means "flow controlled".
2643                  */
2644                 return (error == EAGAIN ? ENOMEM : error);
2645         }
2646         if (mctl != NULL) {
2647                 if (mctl->b_cont == NULL)
2648                         mctl->b_cont = mp;
2649                 else if (mp != NULL)
2650                         linkb(mctl, mp);
2651                 mp = mctl;
2652         } else if (mp == NULL) {
2653                 return (0);
2654         }
2655 
2656         mp->b_flag |= b_flag;
2657         mp->b_band = (uchar_t)pri;
2658 
2659         if (uiop->uio_iovcnt > IOV_MAX_STACK) {
2660                 iovlen = uiop->uio_iovcnt * sizeof (iovec_t);
2661                 uiod.d_iov = (struct iovec *)kmem_alloc(iovlen, KM_SLEEP);
2662         } else {
2663                 uiod.d_iov = buf;
2664         }
2665 
2666         (void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, uiop->uio_iovcnt);
2667         uiod.d_uio.uio_offset = 0;
2668         uiod.d_mp = mp;
2669         error = rwnext(wqp, &uiod);
2670         if (! uiod.d_mp) {
2671                 uioskip(uiop, *iosize);
2672                 if (iovlen != 0)
2673                         kmem_free(uiod.d_iov, iovlen);
2674                 return (error);
2675         }
2676         ASSERT(mp == uiod.d_mp);
2677         if (error == EINVAL) {
2678                 /*
2679                  * The stream plumbing must have changed while
2680                  * we were away, so just turn off rwnext()s.
2681                  */
2682                 error = 0;
2683         } else if (error == EBUSY || error == EWOULDBLOCK) {
2684                 /*
2685                  * Couldn't enter a perimeter or took a page fault,
2686                  * so fall-back to putnext().
2687                  */
2688                 error = 0;
2689         } else {
2690                 freemsg(mp);
2691                 if (iovlen != 0)
2692                         kmem_free(uiod.d_iov, iovlen);
2693                 return (error);
2694         }
2695         /* Have to check canput before consuming data from the uio */
2696         if (pri == 0) {
2697                 if (!canputnext(wqp) && !(flags & MSG_IGNFLOW)) {
2698                         freemsg(mp);
2699                         if (iovlen != 0)
2700                                 kmem_free(uiod.d_iov, iovlen);
2701                         return (EWOULDBLOCK);
2702                 }
2703         } else {
2704                 if (!bcanputnext(wqp, pri) && !(flags & MSG_IGNFLOW)) {
2705                         freemsg(mp);
2706                         if (iovlen != 0)
2707                                 kmem_free(uiod.d_iov, iovlen);
2708                         return (EWOULDBLOCK);
2709                 }
2710         }
2711         ASSERT(mp == uiod.d_mp);
2712         /* Copyin data from the uio */
2713         if ((error = struioget(wqp, mp, &uiod, 0)) != 0) {
2714                 freemsg(mp);
2715                 if (iovlen != 0)
2716                         kmem_free(uiod.d_iov, iovlen);
2717                 return (error);
2718         }
2719         uioskip(uiop, *iosize);
2720         if (flags & MSG_IGNFLOW) {
2721                 /*
2722                  * XXX Hack: Don't get stuck running service procedures.
2723                  * This is needed for sockfs when sending the unbind message
2724                  * out of the rput procedure - we don't want a put procedure
2725                  * to run service procedures.
2726                  */
2727                 putnext(wqp, mp);
2728         } else {
2729                 stream_willservice(stp);
2730                 putnext(wqp, mp);
2731                 stream_runservice(stp);
2732         }
2733         if (iovlen != 0)
2734                 kmem_free(uiod.d_iov, iovlen);
2735         return (0);
2736 }
2737 
2738 /*
2739  * Write attempts to break the write request into messages conforming
2740  * with the minimum and maximum packet sizes set downstream.
2741  *
2742  * Write will not block if downstream queue is full and
2743  * O_NDELAY is set, otherwise it will block waiting for the queue to get room.
2744  *
2745  * A write of zero bytes gets packaged into a zero length message and sent
2746  * downstream like any other message.
2747  *
2748  * If buffers of the requested sizes are not available, the write will
2749  * sleep until the buffers become available.
2750  *
2751  * Write (if specified) will supply a write offset in a message if it
2752  * makes sense. This can be specified by downstream modules as part of
2753  * a M_SETOPTS message.  Write will not supply the write offset if it
2754  * cannot supply any data in a buffer.  In other words, write will never