Print this page
select: shortcircuit fd_sets_count if given 0 fds (such as for timing)
if nfds is low, fastpath to try to maintain throughput
libc: only have one select implementation, and move the pollfds onto the heap if they cross some threshold


  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 /*      Copyright (c) 1988 AT&T     */
  28 /*        All Rights Reserved   */
  29 
  30 #pragma ident   "%Z%%M% %I%     %E% SMI"
  31 
  32 /*
  33  * Emulation of select() system call using poll() system call.
  34  *
  35  * Assumptions:
  36  *      polling for input only is most common.
  37  *      polling for exceptional conditions is very rare.
  38  *
  39  * Note that it is not feasible to emulate all error conditions,
  40  * in particular conditions that would return EFAULT are far too
  41  * difficult to check for in a library routine.
  42  */
  43 
  44 #pragma weak _select = select
  45 
  46 #include "lint.h"
  47 #include <values.h>
  48 #include <pthread.h>
  49 #include <errno.h>

  50 #include <sys/time.h>
  51 #include <sys/types.h>
  52 #include <sys/select.h>
  53 #include <sys/poll.h>
  54 #include <alloca.h>
  55 #include "libc.h"
  56 

















































  57 int
  58 pselect(int nfds, fd_set *in0, fd_set *out0, fd_set *ex0,
  59         const timespec_t *tsp, const sigset_t *sigmask)
  60 {
  61         long *in, *out, *ex;
  62         ulong_t m;      /* bit mask */
  63         int j;          /* loop counter */
  64         ulong_t b;      /* bits to test */
  65         int n, rv;
  66         struct pollfd *pfd;
  67         struct pollfd *p;
  68         int lastj = -1;


  69 
  70         /* "zero" is read-only, it could go in the text segment */
  71         static fd_set zero = { 0 };
  72 
  73         /*
  74          * Check for invalid conditions at outset.
  75          * Required for spec1170.
  76          * SUSV3: We must behave as a cancellation point even if we fail early.
  77          */
  78         if (nfds < 0 || nfds > FD_SETSIZE) {
  79                 pthread_testcancel();
  80                 errno = EINVAL;
  81                 return (-1);
  82         }
  83         p = pfd = (struct pollfd *)alloca(nfds * sizeof (struct pollfd));
  84 
  85         if (tsp != NULL) {
  86                 /* check timespec validity */
  87                 if (tsp->tv_nsec < 0 || tsp->tv_nsec >= NANOSEC ||
  88                     tsp->tv_sec < 0) {
  89                         pthread_testcancel();
  90                         errno = EINVAL;
  91                         return (-1);
  92                 }
  93         }
  94 
  95         /*
  96          * If any input args are null, point them at the null array.
  97          */
  98         if (in0 == NULL)
  99                 in0 = &zero;
 100         if (out0 == NULL)
 101                 out0 = &zero;
 102         if (ex0 == NULL)
 103                 ex0 = &zero;
 104 















 105         /*
 106          * For each fd, if any bits are set convert them into
 107          * the appropriate pollfd struct.
 108          */
 109         in = (long *)in0->fds_bits;
 110         out = (long *)out0->fds_bits;
 111         ex = (long *)ex0->fds_bits;
 112         for (n = 0; n < nfds; n += NFDBITS) {
 113                 b = (ulong_t)(*in | *out | *ex);
 114                 for (j = 0, m = 1; b != 0; j++, b >>= 1, m <<= 1) {
 115                         if (b & 1) {
 116                                 p->fd = n + j;
 117                                 if (p->fd >= nfds)
 118                                         goto done;
 119                                 p->events = 0;
 120                                 if (*in & m)
 121                                         p->events |= POLLRDNORM;
 122                                 if (*out & m)
 123                                         p->events |= POLLWRNORM;
 124                                 if (*ex & m)
 125                                         p->events |= POLLRDBAND;
 126                                 p++;
 127                         }
 128                 }
 129                 in++;
 130                 out++;
 131                 ex++;
 132         }
 133 done:
 134         /*
 135          * Now do the poll.
 136          */
 137         n = (int)(p - pfd);             /* number of pollfd's */
 138         do {
 139                 rv = _pollsys(pfd, (nfds_t)n, tsp, sigmask);
 140         } while (rv < 0 && errno == EAGAIN);
 141 
 142         if (rv < 0)          /* no need to set bit masks */
 143                 return (rv);
 144 
 145         if (rv == 0) {
 146                 /*
 147                  * Clear out bit masks, just in case.
 148                  * On the assumption that usually only
 149                  * one bit mask is set, use three loops.
 150                  */
 151                 if (in0 != &zero) {
 152                         in = (long *)in0->fds_bits;
 153                         for (n = 0; n < nfds; n += NFDBITS)
 154                                 *in++ = 0;
 155                 }
 156                 if (out0 != &zero) {
 157                         out = (long *)out0->fds_bits;
 158                         for (n = 0; n < nfds; n += NFDBITS)
 159                                 *out++ = 0;
 160                 }
 161                 if (ex0 != &zero) {
 162                         ex = (long *)ex0->fds_bits;
 163                         for (n = 0; n < nfds; n += NFDBITS)
 164                                 *ex++ = 0;
 165                 }
 166                 return (0);

 167         }
 168 
 169         /*
 170          * Check for EBADF error case first to avoid changing any bits
 171          * if we're going to return an error.
 172          */
 173         for (p = pfd, j = n; j-- > 0; p++) {
 174                 /*
 175                  * select will return EBADF immediately if any fd's
 176                  * are bad.  poll will complete the poll on the
 177                  * rest of the fd's and include the error indication
 178                  * in the returned bits.  This is a rare case so we
 179                  * accept this difference and return the error after
 180                  * doing more work than select would've done.
 181                  */
 182                 if (p->revents & POLLNVAL) {
 183                         errno = EBADF;
 184                         return (-1);

 185                 }
 186                 /*
 187                  * We would like to make POLLHUP available to select,
 188                  * checking to see if we have pending data to be read.
 189                  * BUT until we figure out how not to break Xsun's
 190                  * dependencies on select's existing features...
 191                  * This is what we _thought_ would work ... sigh!
 192                  */
 193                 /*
 194                  * if ((p->revents & POLLHUP) &&
 195                  *      !(p->revents & (POLLRDNORM|POLLRDBAND))) {
 196                  *      errno = EINTR;
 197                  *      return (-1);

 198                  * }
 199                  */
 200         }
 201 
 202         /*
 203          * Convert results of poll back into bits
 204          * in the argument arrays.
 205          *
 206          * We assume POLLRDNORM, POLLWRNORM, and POLLRDBAND will only be set
 207          * on return from poll if they were set on input, thus we don't
 208          * worry about accidentally setting the corresponding bits in the
 209          * zero array if the input bit masks were null.
 210          *
 211          * Must return number of bits set, not number of ready descriptors
 212          * (as the man page says, and as poll() does).
 213          */
 214         rv = 0;
 215         for (p = pfd; n-- > 0; p++) {
 216                 j = (int)(p->fd / NFDBITS);
 217                 /* have we moved into another word of the bit mask yet? */
 218                 if (j != lastj) {
 219                         /* clear all output bits to start with */
 220                         in = (long *)&in0->fds_bits[j];
 221                         out = (long *)&out0->fds_bits[j];
 222                         ex = (long *)&ex0->fds_bits[j];
 223                         /*
 224                          * In case we made "zero" read-only (e.g., with
 225                          * cc -R), avoid actually storing into it.
 226                          */
 227                         if (in0 != &zero)
 228                                 *in = 0;
 229                         if (out0 != &zero)
 230                                 *out = 0;
 231                         if (ex0 != &zero)
 232                                 *ex = 0;
 233                         lastj = j;
 234                 }
 235                 if (p->revents) {


 261                          * output conditions.
 262                          */
 263                         if ((p->revents & (POLLHUP|POLLERR)) &&
 264                             (p->events & POLLWRNORM)) {
 265                                 if ((*out & m) == 0)
 266                                         rv++;   /* wasn't already set */
 267                                 *out |= m;
 268                         }
 269                         /*
 270                          * Only set this bit on return if we asked about
 271                          * exceptional conditions.
 272                          */
 273                         if ((p->revents & (POLLHUP|POLLERR)) &&
 274                             (p->events & POLLRDBAND)) {
 275                                 if ((*ex & m) == 0)
 276                                         rv++;   /* wasn't already set */
 277                                 *ex |= m;
 278                         }
 279                 }
 280         }



 281         return (rv);
 282 }
 283 
 284 int
 285 select(int nfds, fd_set *in0, fd_set *out0, fd_set *ex0, struct timeval *tv)
 286 {
 287         timespec_t ts;
 288         timespec_t *tsp;
 289 
 290         if (tv == NULL)
 291                 tsp = NULL;
 292         else {
 293                 /* check timeval validity */
 294                 if (tv->tv_usec < 0 || tv->tv_usec >= MICROSEC) {
 295                         errno = EINVAL;
 296                         return (-1);
 297                 }
 298                 /*
 299                  * Convert timeval to timespec.
 300                  * To preserve compatibility with past behavior,


  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 /*      Copyright (c) 1988 AT&T     */
  28 /*        All Rights Reserved   */
  29 


  30 /*
  31  * Emulation of select() system call using poll() system call.
  32  *
  33  * Assumptions:
  34  *      polling for input only is most common.
  35  *      polling for exceptional conditions is very rare.
  36  *
  37  * Note that it is not feasible to emulate all error conditions,
  38  * in particular conditions that would return EFAULT are far too
  39  * difficult to check for in a library routine.
  40  */
  41 
  42 #pragma weak _select = select
  43 
  44 #include "lint.h"
  45 #include <values.h>
  46 #include <pthread.h>
  47 #include <errno.h>
  48 #include <stdlib.h>
  49 #include <sys/time.h>
  50 #include <sys/types.h>
  51 #include <sys/select.h>
  52 #include <sys/poll.h>
  53 #include <alloca.h>
  54 #include "libc.h"
  55 
  56 /*
  57  * STACK_PFD_LIM
  58  *
  59  *   The limit at which pselect allocates pollfd structures in the heap,
  60  *   rather than on the stack.  These limits match the historical behaviour
  61  *   with the _large_fdset implementations.
  62  *
  63  * BULK_ALLOC_LIM
  64  *
  65  *   The limit below which we'll just allocate nfds pollfds, rather than
  66  *   counting how many we actually need.
  67  */
  68 #if defined(_LP64)
  69 #define STACK_PFD_LIM   FD_SETSIZE
  70 #define BULK_ALLOC_LIM  8192
  71 #else
  72 #define STACK_PFD_LIM   1024
  73 #define BULK_ALLOC_LIM  1024
  74 #endif
  75 
  76 /*
  77  * The previous _large_fdset implementations are, unfortunately, baked into
  78  * the ABI.
  79  */
  80 #pragma weak select_large_fdset = select
  81 #pragma weak pselect_large_fdset = pselect
  82 
  83 #define fd_set_size(nfds)       (((nfds) + (NFDBITS - 1)) / NFDBITS)
  84 
  85 static nfds_t
  86 fd_sets_count(int limit, fd_set *in, fd_set *out, fd_set *ex)
  87 {
  88         nfds_t total = 0;
  89 
  90         if (limit <= 0)
  91                 return (0);
  92 
  93         for (int i = 0; i < fd_set_size(limit); i++) {
  94                 long v = (in->fds_bits[i] | out->fds_bits[i] | ex->fds_bits[i]);
  95 
  96                 while (v != 0) {
  97                         v &= v - 1;
  98                         total++;
  99                 }
 100         }
 101 
 102         return (total);
 103 }
 104 
 105 int
 106 pselect(int nfds, fd_set *in0, fd_set *out0, fd_set *ex0,
 107         const timespec_t *tsp, const sigset_t *sigmask)
 108 {
 109         long *in, *out, *ex;
 110         ulong_t m;      /* bit mask */
 111         int j;          /* loop counter */
 112         ulong_t b;      /* bits to test */
 113         int n, rv;
 114         struct pollfd *pfd;
 115         struct pollfd *p;
 116         int lastj = -1;
 117         nfds_t npfds = 0;
 118         boolean_t heap_pfds = B_FALSE;
 119 
 120         /* "zero" is read-only, it could go in the text segment */
 121         static fd_set zero = { 0 };
 122 
 123         /*
 124          * Check for invalid conditions at outset.
 125          * Required for spec1170.
 126          * SUSV3: We must behave as a cancellation point even if we fail early.
 127          */
 128         if (nfds < 0 || nfds > FD_SETSIZE) {
 129                 pthread_testcancel();
 130                 errno = EINVAL;
 131                 return (-1);
 132         }

 133 
 134         if (tsp != NULL) {
 135                 /* check timespec validity */
 136                 if (tsp->tv_nsec < 0 || tsp->tv_nsec >= NANOSEC ||
 137                     tsp->tv_sec < 0) {
 138                         pthread_testcancel();
 139                         errno = EINVAL;
 140                         return (-1);
 141                 }
 142         }
 143 
 144         /*
 145          * If any input args are null, point them at the null array.
 146          */
 147         if (in0 == NULL)
 148                 in0 = &zero;
 149         if (out0 == NULL)
 150                 out0 = &zero;
 151         if (ex0 == NULL)
 152                 ex0 = &zero;
 153 
 154         if (nfds <= BULK_ALLOC_LIM) {
 155                 p = pfd = alloca(nfds * sizeof (struct pollfd));
 156         } else {
 157                 npfds = fd_sets_count(nfds, in0, out0, ex0);
 158 
 159                 if (npfds > STACK_PFD_LIM) {
 160                         p = pfd = malloc(npfds * sizeof (struct pollfd));
 161                         if (p == NULL)
 162                                 return (-1);
 163                         heap_pfds = B_TRUE;
 164                 } else {
 165                         p = pfd = alloca(npfds * sizeof (struct pollfd));
 166                 }
 167         }
 168 
 169         /*
 170          * For each fd, if any bits are set convert them into
 171          * the appropriate pollfd struct.
 172          */
 173         in = (long *)in0->fds_bits;
 174         out = (long *)out0->fds_bits;
 175         ex = (long *)ex0->fds_bits;
 176         for (n = 0; n < nfds; n += NFDBITS) {
 177                 b = (ulong_t)(*in | *out | *ex);
 178                 for (j = 0, m = 1; b != 0; j++, b >>= 1, m <<= 1) {
 179                         if (b & 1) {
 180                                 p->fd = n + j;
 181                                 if (p->fd >= nfds)
 182                                         goto done;
 183                                 p->events = 0;
 184                                 if (*in & m)
 185                                         p->events |= POLLRDNORM;
 186                                 if (*out & m)
 187                                         p->events |= POLLWRNORM;
 188                                 if (*ex & m)
 189                                         p->events |= POLLRDBAND;
 190                                 p++;
 191                         }
 192                 }
 193                 in++;
 194                 out++;
 195                 ex++;
 196         }
 197 done:
 198         /*
 199          * Now do the poll.
 200          */
 201         npfds = (int)(p - pfd);
 202         do {
 203                 rv = _pollsys(pfd, npfds, tsp, sigmask);
 204         } while (rv < 0 && errno == EAGAIN);
 205 
 206         if (rv < 0)          /* no need to set bit masks */
 207                 goto out;
 208 
 209         if (rv == 0) {
 210                 /*
 211                  * Clear out bit masks, just in case.
 212                  * On the assumption that usually only
 213                  * one bit mask is set, use three loops.
 214                  */
 215                 if (in0 != &zero) {
 216                         in = (long *)in0->fds_bits;
 217                         for (n = 0; n < nfds; n += NFDBITS)
 218                                 *in++ = 0;
 219                 }
 220                 if (out0 != &zero) {
 221                         out = (long *)out0->fds_bits;
 222                         for (n = 0; n < nfds; n += NFDBITS)
 223                                 *out++ = 0;
 224                 }
 225                 if (ex0 != &zero) {
 226                         ex = (long *)ex0->fds_bits;
 227                         for (n = 0; n < nfds; n += NFDBITS)
 228                                 *ex++ = 0;
 229                 }
 230                 rv = 0;
 231                 goto out;
 232         }
 233 
 234         /*
 235          * Check for EBADF error case first to avoid changing any bits
 236          * if we're going to return an error.
 237          */
 238         for (p = pfd, n = npfds; n-- > 0; p++) {
 239                 /*
 240                  * select will return EBADF immediately if any fd's
 241                  * are bad.  poll will complete the poll on the
 242                  * rest of the fd's and include the error indication
 243                  * in the returned bits.  This is a rare case so we
 244                  * accept this difference and return the error after
 245                  * doing more work than select would've done.
 246                  */
 247                 if (p->revents & POLLNVAL) {
 248                         errno = EBADF;
 249                         rv = -1;
 250                         goto out;
 251                 }
 252                 /*
 253                  * We would like to make POLLHUP available to select,
 254                  * checking to see if we have pending data to be read.
 255                  * BUT until we figure out how not to break Xsun's
 256                  * dependencies on select's existing features...
 257                  * This is what we _thought_ would work ... sigh!
 258                  */
 259                 /*
 260                  * if ((p->revents & POLLHUP) &&
 261                  *      !(p->revents & (POLLRDNORM|POLLRDBAND))) {
 262                  *      errno = EINTR;
 263                  *      rv = -1;
 264                  *      goto out;
 265                  * }
 266                  */
 267         }
 268 
 269         /*
 270          * Convert results of poll back into bits
 271          * in the argument arrays.
 272          *
 273          * We assume POLLRDNORM, POLLWRNORM, and POLLRDBAND will only be set
 274          * on return from poll if they were set on input, thus we don't
 275          * worry about accidentally setting the corresponding bits in the
 276          * zero array if the input bit masks were null.
 277          *
 278          * Must return number of bits set, not number of ready descriptors
 279          * (as the man page says, and as poll() does).
 280          */
 281         rv = 0;
 282         for (p = pfd, n = npfds; n-- > 0; p++) {
 283                 j = (int)(p->fd / NFDBITS);
 284                 /* have we moved into another word of the bit mask yet? */
 285                 if (j != lastj) {
 286                         /* clear all output bits to start with */
 287                         in = (long *)&in0->fds_bits[j];
 288                         out = (long *)&out0->fds_bits[j];
 289                         ex = (long *)&ex0->fds_bits[j];
 290                         /*
 291                          * In case we made "zero" read-only (e.g., with
 292                          * cc -R), avoid actually storing into it.
 293                          */
 294                         if (in0 != &zero)
 295                                 *in = 0;
 296                         if (out0 != &zero)
 297                                 *out = 0;
 298                         if (ex0 != &zero)
 299                                 *ex = 0;
 300                         lastj = j;
 301                 }
 302                 if (p->revents) {


 328                          * output conditions.
 329                          */
 330                         if ((p->revents & (POLLHUP|POLLERR)) &&
 331                             (p->events & POLLWRNORM)) {
 332                                 if ((*out & m) == 0)
 333                                         rv++;   /* wasn't already set */
 334                                 *out |= m;
 335                         }
 336                         /*
 337                          * Only set this bit on return if we asked about
 338                          * exceptional conditions.
 339                          */
 340                         if ((p->revents & (POLLHUP|POLLERR)) &&
 341                             (p->events & POLLRDBAND)) {
 342                                 if ((*ex & m) == 0)
 343                                         rv++;   /* wasn't already set */
 344                                 *ex |= m;
 345                         }
 346                 }
 347         }
 348 out:
 349         if (heap_pfds)
 350                 free(pfd);
 351         return (rv);
 352 }
 353 
 354 int
 355 select(int nfds, fd_set *in0, fd_set *out0, fd_set *ex0, struct timeval *tv)
 356 {
 357         timespec_t ts;
 358         timespec_t *tsp;
 359 
 360         if (tv == NULL)
 361                 tsp = NULL;
 362         else {
 363                 /* check timeval validity */
 364                 if (tv->tv_usec < 0 || tv->tv_usec >= MICROSEC) {
 365                         errno = EINVAL;
 366                         return (-1);
 367                 }
 368                 /*
 369                  * Convert timeval to timespec.
 370                  * To preserve compatibility with past behavior,