/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2012 by Delphix. All rights reserved.
 * Copyright 2016 Joyent, Inc.
 */

#include <sys/types.h>
#include <sys/devops.h>
#include <sys/conf.h>
#include <sys/modctl.h>
#include <sys/sunddi.h>
#include <sys/stat.h>
#include <sys/poll_impl.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/mkdev.h>
#include <sys/debug.h>
#include <sys/file.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/bitmap.h>
#include <sys/devpoll.h>
#include <sys/rctl.h>
#include <sys/resource.h>
#include <sys/schedctl.h>
#include <sys/epoll.h>

#define RESERVED        1

/* local data struct */
static  dp_entry_t      **devpolltbl;   /* dev poll entries */
static  size_t          dptblsize;

static  kmutex_t        devpoll_lock;   /* lock protecting dev tbl */
int                     devpoll_init;   /* is /dev/poll initialized already */

/* device local functions */

static int dpopen(dev_t *devp, int flag, int otyp, cred_t *credp);
static int dpwrite(dev_t dev, struct uio *uiop, cred_t *credp);
static int dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
    int *rvalp);
static int dppoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp);
static int dpclose(dev_t dev, int flag, int otyp, cred_t *credp);
static dev_info_t *dpdevi;


static struct cb_ops    dp_cb_ops = {
        dpopen,                 /* open */
        dpclose,                /* close */
        nodev,                  /* strategy */
        nodev,                  /* print */
        nodev,                  /* dump */
        nodev,                  /* read */
        dpwrite,                /* write */
        dpioctl,                /* ioctl */
        nodev,                  /* devmap */
        nodev,                  /* mmap */
        nodev,                  /* segmap */
        dppoll,                 /* poll */
        ddi_prop_op,            /* prop_op */
        (struct streamtab *)0,  /* streamtab */
        D_MP,                   /* flags */
        CB_REV,                 /* cb_ops revision */
        nodev,                  /* aread */
        nodev                   /* awrite */
};

static int dpattach(dev_info_t *, ddi_attach_cmd_t);
static int dpdetach(dev_info_t *, ddi_detach_cmd_t);
static int dpinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);

static struct dev_ops dp_ops = {
        DEVO_REV,               /* devo_rev */
        0,                      /* refcnt */
        dpinfo,                 /* info */
        nulldev,                /* identify */
        nulldev,                /* probe */
        dpattach,               /* attach */
        dpdetach,               /* detach */
        nodev,                  /* reset */
        &dp_cb_ops,             /* driver operations */
        (struct bus_ops *)NULL, /* bus operations */
        nulldev,                /* power */
        ddi_quiesce_not_needed, /* quiesce */
};


static struct modldrv modldrv = {
        &mod_driverops,         /* type of module - a driver */
        "/dev/poll driver",
        &dp_ops,
};

static struct modlinkage modlinkage = {
        MODREV_1,
        {   (void *)&modldrv,
            NULL }
};

static void pcachelink_assoc(pollcache_t *, pollcache_t *);
static void pcachelink_mark_stale(pollcache_t *);
static void pcachelink_purge_stale(pollcache_t *);
static void pcachelink_purge_all(pollcache_t *);


/*
 * Locking Design
 *
 * The /dev/poll driver shares most of its code with the poll(2) system
 * call, whose code lives in common/syscall/poll.c. In the poll(2) design,
 * the pollcache structure is per lwp. An implicit assumption is made there
 * that some portion of pollcache will never be touched by other lwps;
 * e.g., in the poll(2) design, no lwp ever needs to grow the bitmap of
 * another lwp. This assumption does not hold for /dev/poll; hence the need
 * for extra locking.
 *
 * To allow more parallelism, each /dev/poll file descriptor (indexed by
 * minor number) has its own lock. Since read (dpioctl) is a much more
 * frequent operation than write, we want to allow multiple reads on the
 * same /dev/poll fd. However, we prevent writes from being starved by
 * giving priority to write operations. Theoretically writes can starve
 * reads as well, but in practice this is not important because (1) writes
 * happen less often than reads, and (2) a write operation defines the
 * contents of the cached poll fd set. If writes happen so often that they
 * can starve reads, the cached set is very unstable, and it may not make
 * sense to read an unstable cached set anyway. Therefore, the case of
 * writers starving readers is not handled in this design.
 */
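
/*
 * For illustration, here is a minimal userland sketch of the model described
 * above: write(2) caches fds in the set, and the DP_POLL ioctl scans the
 * cached set.  (This is a hedged sketch, not part of the driver; "sockfd" is
 * a hypothetical descriptor and error handling is omitted.)
 *
 *      int dpfd = open("/dev/poll", O_RDWR);
 *
 *      pollfd_t pfd;
 *      pfd.fd = sockfd;
 *      pfd.events = POLLIN;
 *      (void) write(dpfd, &pfd, sizeof (pfd));
 *
 *      struct dvpoll dvp;
 *      dvp.dp_fds = &pfd;
 *      dvp.dp_nfds = 1;
 *      dvp.dp_timeout = -1;
 *      int nfds = ioctl(dpfd, DP_POLL, &dvp);
 */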

int
_init()
{
        int     error;

        dptblsize = DEVPOLLSIZE;
        devpolltbl = kmem_zalloc(sizeof (caddr_t) * dptblsize, KM_SLEEP);
        mutex_init(&devpoll_lock, NULL, MUTEX_DEFAULT, NULL);
        devpoll_init = 1;
        if ((error = mod_install(&modlinkage)) != 0) {
                kmem_free(devpolltbl, sizeof (caddr_t) * dptblsize);
                devpoll_init = 0;
        }
        return (error);
}

int
_fini()
{
        int error;

        if ((error = mod_remove(&modlinkage)) != 0) {
                return (error);
        }
        mutex_destroy(&devpoll_lock);
        kmem_free(devpolltbl, sizeof (caddr_t) * dptblsize);
        return (0);
}

int
_info(struct modinfo *modinfop)
{
        return (mod_info(&modlinkage, modinfop));
}

/*ARGSUSED*/
static int
dpattach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
        if (ddi_create_minor_node(devi, "poll", S_IFCHR, 0, DDI_PSEUDO, NULL)
            == DDI_FAILURE) {
                ddi_remove_minor_node(devi, NULL);
                return (DDI_FAILURE);
        }
        dpdevi = devi;
        return (DDI_SUCCESS);
}

static int
dpdetach(dev_info_t *devi, ddi_detach_cmd_t cmd)
{
        if (cmd != DDI_DETACH)
                return (DDI_FAILURE);

        ddi_remove_minor_node(devi, NULL);
        return (DDI_SUCCESS);
}

/* ARGSUSED */
static int
dpinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
        int error;

        switch (infocmd) {
        case DDI_INFO_DEVT2DEVINFO:
                *result = (void *)dpdevi;
                error = DDI_SUCCESS;
                break;
        case DDI_INFO_DEVT2INSTANCE:
                *result = (void *)0;
                error = DDI_SUCCESS;
                break;
        default:
                error = DDI_FAILURE;
        }
        return (error);
}

/*
 * dp_pcache_poll has similar logic to pcache_poll() in poll.c. The major
 * differences are: (1) /dev/poll requires scanning the bitmap starting at
 * where it stopped last time, instead of always starting from 0, and
 * (2) since the user may not have cleaned up cached fds when they were
 * closed, some polldats in the cache may refer to closed or reused fds.
 * We need to check for those cases.
 *
 * NOTE: Upon closing an fd, automatic poll cache cleanup is done for
 *       poll(2) caches but NOT for /dev/poll caches. So expect some
 *       stale entries!
 */
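
/*
 * For example, a careful /dev/poll consumer avoids leaving stale entries
 * behind by explicitly removing an fd from the cached set when it closes
 * it, rather than relying on the stale entry being reported as POLLNVAL
 * later (a hedged userland sketch; "dpfd" and "fd" are hypothetical):
 *
 *      pollfd_t pfd;
 *      pfd.fd = fd;
 *      pfd.events = POLLREMOVE;
 *      (void) write(dpfd, &pfd, sizeof (pfd));
 *      (void) close(fd);
 */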
static int
dp_pcache_poll(dp_entry_t *dpep, void *dpbuf,
    pollcache_t *pcp, nfds_t nfds, int *fdcntp)
{
        int             start, ostart, end;
        int             fdcnt, fd;
        boolean_t       done;
        file_t          *fp;
        short           revent;
        boolean_t       no_wrap;
        pollhead_t      *php;
        polldat_t       *pdp;
        pollfd_t        *pfdp;
        epoll_event_t   *epoll;
        int             error = 0;
        short           mask = POLLRDHUP | POLLWRBAND;
        boolean_t       is_epoll = (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0;

        ASSERT(MUTEX_HELD(&pcp->pc_lock));
        if (pcp->pc_bitmap == NULL) {
                /*
                 * No need to search because no poll fd
                 * has been cached.
                 */
                return (error);
        }

        if (is_epoll) {
                pfdp = NULL;
                epoll = (epoll_event_t *)dpbuf;
        } else {
                pfdp = (pollfd_t *)dpbuf;
                epoll = NULL;
        }
retry:
        start = ostart = pcp->pc_mapstart;
        end = pcp->pc_mapend;
        php = NULL;

        if (start == 0) {
                /*
                 * Started from the very beginning; no need to wrap around.
                 */
                no_wrap = B_TRUE;
        } else {
                no_wrap = B_FALSE;
        }
        done = B_FALSE;
        fdcnt = 0;
        while ((fdcnt < nfds) && !done) {
                php = NULL;
                revent = 0;
                /*
                 * Examine the bit map in a circular fashion
                 * to avoid starvation. Always resume from
                 * last stop. Scan till end of the map. Then
                 * wrap around.
                 */
                fd = bt_getlowbit(pcp->pc_bitmap, start, end);
                ASSERT(fd <= end);
                if (fd >= 0) {
                        if (fd == end) {
                                if (no_wrap) {
                                        done = B_TRUE;
                                } else {
                                        start = 0;
                                        end = ostart - 1;
                                        no_wrap = B_TRUE;
                                }
                        } else {
                                start = fd + 1;
                        }
                        pdp = pcache_lookup_fd(pcp, fd);
repoll:
                        ASSERT(pdp != NULL);
                        ASSERT(pdp->pd_fd == fd);
                        if (pdp->pd_fp == NULL) {
                                /*
                                 * The fd is POLLREMOVed. This fd is
                                 * logically no longer cached. So move
                                 * on to the next one.
                                 */
                                continue;
                        }
                        if ((fp = getf(fd)) == NULL) {
                                /*
                                 * The fd has been closed, but the user has
                                 * not done a POLLREMOVE on this fd yet.
                                 * Instead of cleaning it up here implicitly,
                                 * we return POLLNVAL. This is consistent
                                 * with poll(2) polling a closed fd, and will
                                 * hopefully remind the user to POLLREMOVE it.
                                 */
                                if (!is_epoll && pfdp != NULL) {
                                        pfdp[fdcnt].fd = fd;
                                        pfdp[fdcnt].revents = POLLNVAL;
                                        fdcnt++;
                                        continue;
                                }

                                /*
                                 * In the epoll compatibility case, we actually
                                 * perform the implicit removal to remain
                                 * closer to the epoll semantics.
                                 */
                                if (is_epoll) {
                                        pdp->pd_fp = NULL;
                                        pdp->pd_events = 0;

                                        if (pdp->pd_php != NULL) {
                                                pollhead_delete(pdp->pd_php,
                                                    pdp);
                                                pdp->pd_php = NULL;
                                        }

                                        BT_CLEAR(pcp->pc_bitmap, fd);
                                        continue;
                                }
                        }

                        if (fp != pdp->pd_fp) {
                                /*
                                 * The user is polling on a cached fd which
                                 * was closed and then reused. Unfortunately
                                 * there is no good way to inform the user.
                                 * If the file struct is also reused, we may
                                 * not be able to detect the fd reuse at all.
                                 * As long as this does not cause a system
                                 * failure and/or a memory leak, we will play
                                 * along. The man page states that if the
                                 * user does not clean up closed fds, the
                                 * polling results will be nondeterministic.
                                 *
                                 * XXX - perhaps log the detection of fd
                                 *       reuse?
                                 */
                                pdp->pd_fp = fp;
                        }
                        /*
                         * XXX - pollrelock() logic needs to know which
                         * pollcache lock to grab. It'd be a cleaner
                         * solution if we could pass pcp as an argument
                         * in the VOP_POLL interface instead of implicitly
                         * passing it via the thread_t struct. On the
                         * other hand, changing the VOP_POLL interface
                         * would require every driver/file system poll
                         * routine to change. May want to revisit the
                         * tradeoff later.
                         */
                        curthread->t_pollcache = pcp;
                        error = VOP_POLL(fp->f_vnode, pdp->pd_events, 0,
                            &revent, &php, NULL);
                        curthread->t_pollcache = NULL;
                        releasef(fd);
                        if (error != 0) {
                                break;
                        }

                        /*
                         * layered devices (e.g. console driver)
                         * may change the vnode and thus the pollhead
                         * pointer out from underneath us.
                         */
                        if (php != NULL && pdp->pd_php != NULL &&
                            php != pdp->pd_php) {
                                pollhead_delete(pdp->pd_php, pdp);
                                pdp->pd_php = php;
                                pollhead_insert(php, pdp);
                                /*
                                 * The bit should still be set.
                                 */
                                ASSERT(BT_TEST(pcp->pc_bitmap, fd));
                                goto retry;
                        }

                        if (revent != 0) {
                                if (pfdp != NULL) {
                                        pfdp[fdcnt].fd = fd;
                                        pfdp[fdcnt].events = pdp->pd_events;
                                        pfdp[fdcnt].revents = revent;
                                } else if (epoll != NULL) {
                                        epoll_event_t *ep = &epoll[fdcnt];

                                        ASSERT(epoll != NULL);
                                        ep->data.u64 = pdp->pd_epolldata;

                                        /*
                                         * If any of the event bits are set for
                                         * which poll and epoll representations
                                         * differ, swizzle in the native epoll
                                         * values.
                                         */
                                        if (revent & mask) {
                                                ep->events = (revent & ~mask) |
                                                    ((revent & POLLRDHUP) ?
                                                    EPOLLRDHUP : 0) |
                                                    ((revent & POLLWRBAND) ?
                                                    EPOLLWRBAND : 0);
                                        } else {
                                                ep->events = revent;
                                        }

                                        /*
                                         * We define POLLWRNORM to be POLLOUT,
                                         * but epoll has separate definitions
                                         * for them; if POLLOUT is set and the
                                         * user has asked for EPOLLWRNORM, set
                                         * that as well.
                                         */
                                        if ((revent & POLLOUT) &&
                                            (pdp->pd_events & EPOLLWRNORM)) {
                                                ep->events |= EPOLLWRNORM;
                                        }
                                } else {
                                        pollstate_t *ps =
                                            curthread->t_pollstate;
                                        /*
                                         * The devpoll handle itself is being
                                         * polled.  Notify the caller of any
                                         * readable event(s), leaving as much
                                         * state as possible untouched.
                                         */
                                        VERIFY(fdcnt == 0);
                                        VERIFY(ps != NULL);

                                        /*
                                         * If a call to pollunlock() fails
                                         * during VOP_POLL, skip over the fd
                                         * and continue polling.
                                         *
                                         * Otherwise, report that there is an
                                         * event pending.
                                         */
                                        if ((ps->ps_flags & POLLSTATE_ULFAIL)
                                            != 0) {
                                                ps->ps_flags &=
                                                    ~POLLSTATE_ULFAIL;
                                                continue;
                                        } else {
                                                fdcnt++;
                                                break;
                                        }
                                }

                                /*
                                 * If POLLET is set, clear the bit in the
                                 * bitmap -- which effectively latches the
                                 * edge on a pollwakeup() from the driver.
                                 */
                                if (pdp->pd_events & POLLET)
                                        BT_CLEAR(pcp->pc_bitmap, fd);

                                /*
                                 * If POLLONESHOT is set, perform the implicit
                                 * POLLREMOVE.
                                 */
                                if (pdp->pd_events & POLLONESHOT) {
                                        pdp->pd_fp = NULL;
                                        pdp->pd_events = 0;

                                        if (pdp->pd_php != NULL) {
                                                pollhead_delete(pdp->pd_php,
                                                    pdp);
                                                pdp->pd_php = NULL;
                                        }

                                        BT_CLEAR(pcp->pc_bitmap, fd);
                                }

                                fdcnt++;
                        } else if (php != NULL) {
                                /*
                                 * We clear a bit or cache a poll fd if
                                 * the driver returns a poll head ptr,
                                 * which is expected in the case of 0
                                 * revents. Some buggy drivers may return
                                 * a NULL php pointer with 0 revents. In
                                 * that case, we just treat the driver as
                                 * "noncachable" and do not clear the bit
                                 * in the bitmap.
                                 */
                                if ((pdp->pd_php != NULL) &&
                                    ((pcp->pc_flag & PC_POLLWAKE) == 0)) {
                                        BT_CLEAR(pcp->pc_bitmap, fd);
                                }
                                if (pdp->pd_php == NULL) {
                                        pollhead_insert(php, pdp);
                                        pdp->pd_php = php;
                                        /*
                                         * An event of interest may have
                                         * arrived between the VOP_POLL() and
                                         * the pollhead_insert(); check again.
                                         */
                                        goto repoll;
                                }
                        }
                } else {
                        /*
                         * No bit set in the range. Check for wrap around.
                         */
                        if (!no_wrap) {
                                start = 0;
                                end = ostart - 1;
                                no_wrap = B_TRUE;
                        } else {
                                done = B_TRUE;
                        }
                }
        }

        if (!done) {
                pcp->pc_mapstart = start;
        }
        ASSERT(*fdcntp == 0);
        *fdcntp = fdcnt;
        return (error);
}

/*ARGSUSED*/
static int
dpopen(dev_t *devp, int flag, int otyp, cred_t *credp)
{
        minor_t         minordev;
        dp_entry_t      *dpep;
        pollcache_t     *pcp;

        ASSERT(devpoll_init);
        ASSERT(dptblsize <= MAXMIN);
        mutex_enter(&devpoll_lock);
        for (minordev = 0; minordev < dptblsize; minordev++) {
                if (devpolltbl[minordev] == NULL) {
                        devpolltbl[minordev] = (dp_entry_t *)RESERVED;
                        break;
                }
        }
        if (minordev == dptblsize) {
                dp_entry_t      **newtbl;
                size_t          oldsize;

                /*
                 * Used up every entry in the existing devpoll table.
                 * Grow the table by DEVPOLLSIZE.
                 */
                if ((oldsize = dptblsize) >= MAXMIN) {
                        mutex_exit(&devpoll_lock);
                        return (ENXIO);
                }
                dptblsize += DEVPOLLSIZE;
                if (dptblsize > MAXMIN) {
                        dptblsize = MAXMIN;
                }
                newtbl = kmem_zalloc(sizeof (caddr_t) * dptblsize, KM_SLEEP);
                bcopy(devpolltbl, newtbl, sizeof (caddr_t) * oldsize);
                kmem_free(devpolltbl, sizeof (caddr_t) * oldsize);
                devpolltbl = newtbl;
                devpolltbl[minordev] = (dp_entry_t *)RESERVED;
        }
        mutex_exit(&devpoll_lock);

        dpep = kmem_zalloc(sizeof (dp_entry_t), KM_SLEEP);
        /*
         * Allocate a pollcache skeleton here. Delay allocating the bitmap
         * structures until dpwrite() time, since we don't know the optimal
         * size yet.  We also delay setting the pid until either dpwrite()
         * or an attempt to poll on the instance, allowing parents to create
         * instances of /dev/poll for their children.  (In the epoll
         * compatibility case, this check isn't performed to maintain
         * semantic compatibility.)
         */
        pcp = pcache_alloc();
        dpep->dpe_pcache = pcp;
        pcp->pc_pid = -1;
        *devp = makedevice(getmajor(*devp), minordev);  /* clone the driver */
        mutex_enter(&devpoll_lock);
        ASSERT(minordev < dptblsize);
        ASSERT(devpolltbl[minordev] == (dp_entry_t *)RESERVED);
        devpolltbl[minordev] = dpep;
        mutex_exit(&devpoll_lock);
        return (0);
}

/*
 * Write to /dev/poll to add/remove fds to/from a cached poll fd set,
 * or to change the poll events for a watched fd.
 */
/*ARGSUSED*/
static int
dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
{
        minor_t         minor;
        dp_entry_t      *dpep;
        pollcache_t     *pcp;
        pollfd_t        *pollfdp, *pfdp;
        dvpoll_epollfd_t *epfdp;
        uintptr_t       limit;
        int             error, size;
        ssize_t         uiosize;
        nfds_t          pollfdnum;
        struct pollhead *php = NULL;
        polldat_t       *pdp;
        int             fd;
        file_t          *fp;
        boolean_t       is_epoll, fds_added = B_FALSE;

        minor = getminor(dev);

        mutex_enter(&devpoll_lock);
        ASSERT(minor < dptblsize);
        dpep = devpolltbl[minor];
        ASSERT(dpep != NULL);
        mutex_exit(&devpoll_lock);

        mutex_enter(&dpep->dpe_lock);
        pcp = dpep->dpe_pcache;
        is_epoll = (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0;
        size = (is_epoll) ? sizeof (dvpoll_epollfd_t) : sizeof (pollfd_t);
        mutex_exit(&dpep->dpe_lock);

        if (!is_epoll && curproc->p_pid != pcp->pc_pid) {
                if (pcp->pc_pid != -1) {
                        return (EACCES);
                }

                pcp->pc_pid = curproc->p_pid;
        }

        uiosize = uiop->uio_resid;
        pollfdnum = uiosize / size;
        mutex_enter(&curproc->p_lock);
        if (pollfdnum > (uint_t)rctl_enforced_value(
            rctlproc_legacy[RLIMIT_NOFILE], curproc->p_rctls, curproc)) {
                (void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE],
                    curproc->p_rctls, curproc, RCA_SAFE);
                mutex_exit(&curproc->p_lock);
                return (EINVAL);
        }
        mutex_exit(&curproc->p_lock);
        /*
         * Copy in the pollfd array.  Walk through the array and add
         * each polled fd to the cached set.
         */
        pollfdp = kmem_alloc(uiosize, KM_SLEEP);
        limit = (uintptr_t)pollfdp + (pollfdnum * size);

        /*
         * Although /dev/poll uses the write(2) interface to cache fds, it's
         * not supposed to function as a seekable device. To prevent the
         * offset from growing and eventually exceeding the maximum, reset
         * the offset here for every call.
         */
        uiop->uio_loffset = 0;
        if ((error = uiomove((caddr_t)pollfdp, uiosize, UIO_WRITE, uiop))
            != 0) {
                kmem_free(pollfdp, uiosize);
                return (error);
        }
        /*
         * We are about to enter the core portion of dpwrite(). Make sure
         * this write has exclusive access to this portion of the code,
         * i.e., that there are no other writers in it.
         *
         * Waiting for all readers to drop their references to the dpe is
         * unnecessary since the pollcache itself is protected by pc_lock.
         */
        mutex_enter(&dpep->dpe_lock);
        dpep->dpe_writerwait++;
        while ((dpep->dpe_flag & DP_WRITER_PRESENT) != 0) {
                ASSERT(dpep->dpe_refcnt != 0);

                if (!cv_wait_sig_swap(&dpep->dpe_cv, &dpep->dpe_lock)) {
                        dpep->dpe_writerwait--;
                        mutex_exit(&dpep->dpe_lock);
                        kmem_free(pollfdp, uiosize);
                        return (EINTR);
                }
        }
        dpep->dpe_writerwait--;
        dpep->dpe_flag |= DP_WRITER_PRESENT;
        dpep->dpe_refcnt++;

        if (!is_epoll && (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0) {
                /*
                 * The epoll compat mode was enabled while we were waiting to
                 * establish write access. It is not safe to continue since
                 * state was prepared for non-epoll operation.
                 */
                error = EBUSY;
                goto bypass;
        }
        mutex_exit(&dpep->dpe_lock);

        /*
         * Since the dpwrite() may recursively walk an added /dev/poll handle,
         * pollstate_enter() deadlock and loop detection must be used.
         */
        (void) pollstate_create();
        VERIFY(pollstate_enter(pcp) == PSE_SUCCESS);

        if (pcp->pc_bitmap == NULL) {
                pcache_create(pcp, pollfdnum);
        }
        for (pfdp = pollfdp; (uintptr_t)pfdp < limit;
            pfdp = (pollfd_t *)((uintptr_t)pfdp + size)) {
                fd = pfdp->fd;
                if ((uint_t)fd >= P_FINFO(curproc)->fi_nfiles) {
                        /*
                         * epoll semantics demand that we return EBADF if our
                         * specified fd is invalid.
                         */
                        if (is_epoll) {
                                error = EBADF;
                                break;
                        }

                        continue;
                }

                pdp = pcache_lookup_fd(pcp, fd);
                if (pfdp->events != POLLREMOVE) {

                        fp = NULL;

                        if (pdp == NULL) {
                                /*
                                 * If we're in epoll compatibility mode, check
                                 * that the fd is valid before allocating
                                 * anything for it; epoll semantics demand that
                                 * we return EBADF if our specified fd is
                                 * invalid.
                                 */
                                if (is_epoll) {
                                        if ((fp = getf(fd)) == NULL) {
                                                error = EBADF;
                                                break;
                                        }
                                }

                                pdp = pcache_alloc_fd(0);
                                pdp->pd_fd = fd;
                                pdp->pd_pcache = pcp;
                                pcache_insert_fd(pcp, pdp, pollfdnum);
                        } else {
                                /*
                                 * epoll semantics demand that we error out if
                                 * a file descriptor is added twice, which we
                                 * check (imperfectly) by checking if we both
                                 * have the file descriptor cached and the
                                 * file pointer that corresponds to the file
                                 * descriptor matches our cached value.  If
                                 * there is a pointer mismatch, the file
                                 * descriptor was closed without being removed.
                                 * The converse is clearly not true, however,
                                 * so to narrow the window by which a spurious
                                 * EEXIST may be returned, we also check if
                                 * this fp has been added to an epoll control
                                 * descriptor in the past; if it hasn't, we
                                 * know that this is due to fp reuse -- it's
                                 * not a true EEXIST case.  (By performing this
                                 * additional check, we limit the window of
                                 * spurious EEXIST to situations where a single
                                 * file descriptor is being used across two or
                                 * more epoll control descriptors -- and even
                                 * then, the file descriptor must be closed and
                                 * reused in a relatively tight time span.)
                                 */
                                if (is_epoll) {
                                        if (pdp->pd_fp != NULL &&
                                            (fp = getf(fd)) != NULL &&
                                            fp == pdp->pd_fp &&
                                            (fp->f_flag2 & FEPOLLED)) {
                                                error = EEXIST;
                                                releasef(fd);
                                                break;
                                        }

                                        /*
                                         * We have decided that the cached
                                         * information was stale: it either
                                         * didn't match, or the fp had never
                                         * actually been epoll()'d on before.
                                         * We need to now clear our pd_events
                                         * to assure that we don't mistakenly
                                         * operate on cached event disposition.
                                         */
                                        pdp->pd_events = 0;
                                }
                        }

                        if (is_epoll) {
                                epfdp = (dvpoll_epollfd_t *)pfdp;
                                pdp->pd_epolldata = epfdp->dpep_data;
                        }

                        ASSERT(pdp->pd_fd == fd);
                        ASSERT(pdp->pd_pcache == pcp);
                        if (fd >= pcp->pc_mapsize) {
                                mutex_exit(&pcp->pc_lock);
                                pcache_grow_map(pcp, fd);
                                mutex_enter(&pcp->pc_lock);
                        }
                        if (fd > pcp->pc_mapend) {
                                pcp->pc_mapend = fd;
                        }
                        if (fp == NULL && (fp = getf(fd)) == NULL) {
                                /*
                                 * The fd is not valid. Since we can't pass
                                 * this error back in the write() call, set
                                 * the bit in bitmap to force DP_POLL ioctl
                                 * to examine it.
                                 */
                                BT_SET(pcp->pc_bitmap, fd);
                                pdp->pd_events |= pfdp->events;
                                continue;
                        }

                        /*
                         * To (greatly) reduce EEXIST false positives, we
                         * denote that this fp has been epoll()'d.  We do this
                         * regardless of epoll compatibility mode, as the flag
                         * is harmless if not in epoll compatibility mode.
                         */
                        fp->f_flag2 |= FEPOLLED;

                        /*
                         * Don't do VOP_POLL for an already cached fd with
                         * the same poll events.
                         */
                        if ((pdp->pd_events == pfdp->events) &&
                            (pdp->pd_fp == fp)) {
                                /*
                                 * the events are already cached
                                 */
                                releasef(fd);
                                continue;
                        }

                        /*
                         * Do VOP_POLL and cache this poll fd.
                         *
                         * XXX - pollrelock() logic needs to know which
                         * pollcache lock to grab. It'd be a cleaner
                         * solution if we could pass pcp as an argument
                         * in the VOP_POLL interface instead of implicitly
                         * passing it via the thread_t struct. On the
                         * other hand, changing the VOP_POLL interface
                         * would require every driver/file system poll
                         * routine to change. May want to revisit the
                         * tradeoff later.
                         */
                        curthread->t_pollcache = pcp;
                        error = VOP_POLL(fp->f_vnode, pfdp->events, 0,
                            &pfdp->revents, &php, NULL);
                        curthread->t_pollcache = NULL;
                        /*
                         * We always set the bit when this fd is cached;
                         * this forces the first DP_POLL to poll this fd.
                         * Real performance gain comes from subsequent
                         * DP_POLL.  We also attempt a pollhead_insert();
                         * if it's not possible, we'll do it in dpioctl().
                         */
                        BT_SET(pcp->pc_bitmap, fd);
                        if (error != 0) {
                                releasef(fd);
                                break;
                        }
                        pdp->pd_fp = fp;
                        pdp->pd_events |= pfdp->events;
                        if (php != NULL) {
                                if (pdp->pd_php == NULL) {
                                        pollhead_insert(php, pdp);
                                        pdp->pd_php = php;
                                } else {
                                        if (pdp->pd_php != php) {
                                                pollhead_delete(pdp->pd_php,
                                                    pdp);
                                                pollhead_insert(php, pdp);
                                                pdp->pd_php = php;
                                        }
                                }
                        }
                        fds_added = B_TRUE;
                        releasef(fd);
                } else {
                        if (pdp == NULL || pdp->pd_fp == NULL) {
                                if (is_epoll) {
                                        /*
                                         * As with the add case (above), epoll
                                         * semantics demand that we error out
                                         * in this case.
                                         */
                                        error = ENOENT;
                                        break;
                                }

                                continue;
                        }
                        ASSERT(pdp->pd_fd == fd);
                        pdp->pd_fp = NULL;
                        pdp->pd_events = 0;
                        ASSERT(pdp->pd_thread == NULL);
                        if (pdp->pd_php != NULL) {
                                pollhead_delete(pdp->pd_php, pdp);
                                pdp->pd_php = NULL;
                        }
                        BT_CLEAR(pcp->pc_bitmap, fd);
                }
        }
        /*
         * Wake any pollcache waiters so they can check the new descriptors.
         *
         * Any fds added to a recursion-capable pollcache could themselves be
         * /dev/poll handles. To ensure that proper event propagation occurs,
         * parent pollcaches are woken too, so that they can create any needed
         * pollcache links.
         */
        if (fds_added) {
                cv_broadcast(&pcp->pc_cv);
                pcache_wake_parents(pcp);
        }
        pollstate_exit(pcp);
        mutex_enter(&dpep->dpe_lock);
bypass:
        dpep->dpe_flag &= ~DP_WRITER_PRESENT;
        dpep->dpe_refcnt--;
        cv_broadcast(&dpep->dpe_cv);
        mutex_exit(&dpep->dpe_lock);
        kmem_free(pollfdp, uiosize);
        return (error);
}

#define DP_SIGMASK_RESTORE(ksetp) {                                     \
        if (ksetp != NULL) {                                            \
                mutex_enter(&p->p_lock);                                \
                if (lwp->lwp_cursig == 0) {                             \
                        t->t_hold = lwp->lwp_sigoldmask;                \
                        t->t_flag &= ~T_TOMASK;                         \
                }                                                       \
                mutex_exit(&p->p_lock);                                 \
        }                                                               \
}

/*ARGSUSED*/
static int
dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
{
        minor_t         minor;
        dp_entry_t      *dpep;
        pollcache_t     *pcp;
        hrtime_t        now;
        int             error = 0;
        boolean_t       is_epoll;
        STRUCT_DECL(dvpoll, dvpoll);

        if (cmd == DP_POLL || cmd == DP_PPOLL) {
                /* do this now, before we sleep on DP_WRITER_PRESENT */
                now = gethrtime();
        }

        minor = getminor(dev);
        mutex_enter(&devpoll_lock);
        ASSERT(minor < dptblsize);
        dpep = devpolltbl[minor];
        mutex_exit(&devpoll_lock);
        ASSERT(dpep != NULL);
        pcp = dpep->dpe_pcache;

        mutex_enter(&dpep->dpe_lock);
        is_epoll = (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0;

        if (cmd == DP_EPOLLCOMPAT) {
                if (dpep->dpe_refcnt != 0) {
                        /*
                         * We can't turn on epoll compatibility while there
                         * are outstanding operations.
                         */
                        mutex_exit(&dpep->dpe_lock);
                        return (EBUSY);
                }

                /*
                 * epoll compatibility is a one-way street: there's no way
                 * to turn it off for a particular open.
                 */
                dpep->dpe_flag |= DP_ISEPOLLCOMPAT;
                mutex_exit(&dpep->dpe_lock);

                return (0);
        }
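
        /*
         * As a hedged sketch of the intended consumer: an epoll emulation
         * layer would typically enable this mode immediately after opening
         * the handle, before any fds are cached through it (error handling
         * omitted; the exact userland usage here is an assumption):
         *
         *      int epfd = open("/dev/poll", O_RDWR);
         *      (void) ioctl(epfd, DP_EPOLLCOMPAT, 0);
         */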

        if (!is_epoll && curproc->p_pid != pcp->pc_pid) {
                if (pcp->pc_pid != -1) {
                        mutex_exit(&dpep->dpe_lock);
                        return (EACCES);
                }

                pcp->pc_pid = curproc->p_pid;
        }

        /* Wait until all writers have cleared the handle before continuing */
        while ((dpep->dpe_flag & DP_WRITER_PRESENT) != 0 ||
            (dpep->dpe_writerwait != 0)) {
                if (!cv_wait_sig_swap(&dpep->dpe_cv, &dpep->dpe_lock)) {
                        mutex_exit(&dpep->dpe_lock);
                        return (EINTR);
                }
        }
        dpep->dpe_refcnt++;
        mutex_exit(&dpep->dpe_lock);

        switch (cmd) {
        case    DP_POLL:
        case    DP_PPOLL:
        {
                pollstate_t     *ps;
                nfds_t          nfds;
                int             fdcnt = 0;
                size_t          size, fdsize, dpsize;
                hrtime_t        deadline = 0;
                k_sigset_t      *ksetp = NULL;
                k_sigset_t      kset;
                sigset_t        set;
                kthread_t       *t = curthread;
                klwp_t          *lwp = ttolwp(t);
                struct proc     *p = ttoproc(curthread);

                STRUCT_INIT(dvpoll, mode);

                /*
                 * The dp_setp member is only required/consumed for DP_PPOLL,
                 * which otherwise uses the same structure as DP_POLL.
                 */
                if (cmd == DP_POLL) {
                        dpsize = (uintptr_t)STRUCT_FADDR(dvpoll, dp_setp) -
                            (uintptr_t)STRUCT_FADDR(dvpoll, dp_fds);
                } else {
                        ASSERT(cmd == DP_PPOLL);
                        dpsize = STRUCT_SIZE(dvpoll);
                }

                if ((mode & FKIOCTL) != 0) {
                        /* Kernel-internal ioctl call */
                        bcopy((caddr_t)arg, STRUCT_BUF(dvpoll), dpsize);
                        error = 0;
                } else {
                        error = copyin((caddr_t)arg, STRUCT_BUF(dvpoll),
                            dpsize);
                }

                if (error) {
                        DP_REFRELE(dpep);
                        return (EFAULT);
                }

                deadline = STRUCT_FGET(dvpoll, dp_timeout);
                if (deadline > 0) {
                        /*
                         * Convert the deadline from relative milliseconds
                         * to absolute nanoseconds.  They must wait for at
                         * least a tick.
                         */
                        deadline = MSEC2NSEC(deadline);
                        deadline = MAX(deadline, nsec_per_tick);
                        deadline += now;
                }

                if (cmd == DP_PPOLL) {
                        void *setp = STRUCT_FGETP(dvpoll, dp_setp);

                        if (setp != NULL) {
                                if (copyin(setp, &set, sizeof (set))) {
                                        DP_REFRELE(dpep);
                                        return (EFAULT);
                                }

                                sigutok(&set, &kset);
                                ksetp = &kset;

                                mutex_enter(&p->p_lock);
                                schedctl_finish_sigblock(t);
                                lwp->lwp_sigoldmask = t->t_hold;
                                t->t_hold = *ksetp;
                                t->t_flag |= T_TOMASK;

                                /*
                                 * Like ppoll() with a non-NULL sigset, we'll
                                 * call cv_reltimedwait_sig() just to check for
                                 * signals.  This call will return immediately
                                 * with either 0 (signalled) or -1 (no signal).
                                 * There are some conditions whereby we can
                                 * get 0 from cv_reltimedwait_sig() without
                                 * a true signal (e.g., a directed stop), so
                                 * we restore our signal mask in the unlikely
                                 * event that lwp_cursig is 0.
                                 */
                                if (!cv_reltimedwait_sig(&t->t_delay_cv,
                                    &p->p_lock, 0, TR_CLOCK_TICK)) {
                                        if (lwp->lwp_cursig == 0) {
                                                t->t_hold = lwp->lwp_sigoldmask;
                                                t->t_flag &= ~T_TOMASK;
                                        }

                                        mutex_exit(&p->p_lock);

                                        DP_REFRELE(dpep);
                                        return (EINTR);
                                }

                                mutex_exit(&p->p_lock);
                        }
                }

                if ((nfds = STRUCT_FGET(dvpoll, dp_nfds)) == 0) {
                        /*
                         * We are just using DP_POLL to sleep, so we don't
                         * need any of the devpoll apparatus.  Do not check
                         * for signals if we have a zero timeout.
                         */
                        DP_REFRELE(dpep);
                        if (deadline == 0) {
                                DP_SIGMASK_RESTORE(ksetp);
                                return (0);
                        }

                        mutex_enter(&curthread->t_delay_lock);
                        while ((error =
                            cv_timedwait_sig_hrtime(&curthread->t_delay_cv,
                            &curthread->t_delay_lock, deadline)) > 0)
                                continue;
                        mutex_exit(&curthread->t_delay_lock);

                        DP_SIGMASK_RESTORE(ksetp);

                        return (error == 0 ? EINTR : 0);
                }
1180 
1181                 if (is_epoll) {
1182                         size = nfds * (fdsize = sizeof (epoll_event_t));
1183                 } else {
1184                         size = nfds * (fdsize = sizeof (pollfd_t));
1185                 }
1186 
1187                 /*
1188                  * XXX It would be nice not to have to alloc each time, but it
1189                  * requires another per thread structure hook. This can be
1190                  * implemented later if data suggests that it's necessary.
1191                  */
1192                 ps = pollstate_create();
1193 
1194                 if (ps->ps_dpbufsize < size) {
1195                         /*
1196                          * If nfds is larger than twice the current maximum
1197                          * open file count, we'll silently clamp it.  This
1198                          * only limits our exposure to allocating an
1199                          * inordinate amount of kernel memory; it doesn't
1200                          * otherwise affect the semantics.  (We have this
1201                          * check at twice the maximum instead of merely the
1202                          * maximum because some applications pass an nfds that
1203                          * is only slightly larger than their limit.)
1204                          */
1205                         mutex_enter(&p->p_lock);
1206                         if ((nfds >> 1) > p->p_fno_ctl) {
1207                                 nfds = p->p_fno_ctl;
1208                                 size = nfds * fdsize;
1209                         }
1210                         mutex_exit(&p->p_lock);
1211 
1212                         if (ps->ps_dpbufsize < size) {
1213                                 kmem_free(ps->ps_dpbuf, ps->ps_dpbufsize);
1214                                 ps->ps_dpbuf = kmem_zalloc(size, KM_SLEEP);
1215                                 ps->ps_dpbufsize = size;
1216                         }
1217                 }
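                     /*
                      * For example (hypothetical numbers): when the buffer
                      * must grow and p_fno_ctl is 65536, an nfds of 100000 is
                      * left alone (100000 >> 1 <= 65536), while an nfds of
                      * 200000 is clamped to 65536 and size is recomputed.
                      */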
1218 
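                     /*
                      * In outline, the retry loop below: clears the wake flag
                      * and marks all child pcachelinks stale, scans the
                      * pollcache via dp_pcache_poll(), purges links that
                      * remained stale, and re-checks the wake flag before
                      * blocking.  The final check ensures that a pollnotify()
                      * arriving during the scan triggers an immediate re-poll
                      * rather than a missed wakeup.
                      */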
1219                 VERIFY(pollstate_enter(pcp) == PSE_SUCCESS);
1220                 for (;;) {
1221                         pcp->pc_flag &= ~PC_POLLWAKE;
1222 
1223                         /*
1224                          * Mark all child pcachelinks as stale.
1225                          * Those which are still part of the tree will be
1226                          * marked as valid during the poll.
1227                          */
1228                         pcachelink_mark_stale(pcp);
1229 
1230                         error = dp_pcache_poll(dpep, ps->ps_dpbuf,
1231                             pcp, nfds, &fdcnt);
1232                         if (fdcnt > 0 || error != 0)
1233                                 break;
1234 
1235                         /* Purge still-stale child pcachelinks */
1236                         pcachelink_purge_stale(pcp);
1237 
1238                         /*
1239                          * A pollwake has happened since we last polled the cache.
1240                          */
1241                         if (pcp->pc_flag & PC_POLLWAKE)
1242                                 continue;
1243 
1244                         /*
1245                          * Sleep until we are notified, signaled, or timed out.
1246                          */
1247                         if (deadline == 0) {
1248                                 /* immediate timeout; do not check signals */
1249                                 break;
1250                         }
1251 
1252                         error = cv_timedwait_sig_hrtime(&pcp->pc_cv,
1253                             &pcp->pc_lock, deadline);
1254 
1255                         /*
1256                          * If we were awakened by a signal or timeout then
1257                          * break the loop, else poll again.
1258                          */
1259                         if (error <= 0) {
1260                                 error = (error == 0) ? EINTR : 0;
1261                                 break;
1262                         } else {
1263                                 error = 0;
1264                         }
1265                 }
1266                 pollstate_exit(pcp);
1267 
1268                 DP_SIGMASK_RESTORE(ksetp);
1269 
1270                 if (error == 0 && fdcnt > 0) {
1271                         if (copyout(ps->ps_dpbuf,
1272                             STRUCT_FGETP(dvpoll, dp_fds), fdcnt * fdsize)) {
1273                                 DP_REFRELE(dpep);
1274                                 return (EFAULT);
1275                         }
1276                         *rvalp = fdcnt;
1277                 }
1278                 break;
1279         }
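             /*
              * Illustrative sketch (not part of the driver): the typical
              * DP_POLL round trip from userland.  Descriptors are registered
              * by writing an array of pollfd structures to the /dev/poll fd;
              * DP_POLL then harvests ready events into a caller-supplied
              * buffer.  dpfd and NREADY are hypothetical:
              *
              *      pollfd_t ready[NREADY];
              *      struct dvpoll dv;
              *      int n;
              *
              *      dv.dp_fds = ready;
              *      dv.dp_nfds = NREADY;
              *      dv.dp_timeout = -1;
              *      n = ioctl(dpfd, DP_POLL, &dv);
              *
              * A dp_timeout of -1 blocks indefinitely; on success, n gives
              * the number of entries of ready[] with revents filled in.  In
              * epoll compatibility mode the same ioctl fills epoll_event_t
              * records instead, per the fdsize selection above.
              */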
1280 
1281         case    DP_ISPOLLED:
1282         {
1283                 pollfd_t        pollfd;
1284                 polldat_t       *pdp;
1285 
1286                 STRUCT_INIT(dvpoll, mode);
1287                 error = copyin((caddr_t)arg, &pollfd, sizeof (pollfd_t));
1288                 if (error) {
1289                         DP_REFRELE(dpep);
1290                         return (EFAULT);
1291                 }
1292                 mutex_enter(&pcp->pc_lock);
1293                 if (pcp->pc_hash == NULL) {
1294                         /*
1295                          * No need to search because no poll fd
1296                          * has been cached.
1297                          */
1298                         mutex_exit(&pcp->pc_lock);
1299                         DP_REFRELE(dpep);
1300                         return (0);
1301                 }
1302                 if (pollfd.fd < 0) {
1303                         mutex_exit(&pcp->pc_lock);
1304                         break;
1305                 }
1306                 pdp = pcache_lookup_fd(pcp, pollfd.fd);
1307                 if ((pdp != NULL) && (pdp->pd_fd == pollfd.fd) &&
1308                     (pdp->pd_fp != NULL)) {
1309                         pollfd.revents = pdp->pd_events;
1310                         if (copyout(&pollfd, (caddr_t)arg, sizeof (pollfd_t))) {
1311                                 mutex_exit(&pcp->pc_lock);
1312                                 DP_REFRELE(dpep);
1313                                 return (EFAULT);
1314                         }
1315                         *rvalp = 1;
1316                 }
1317                 mutex_exit(&pcp->pc_lock);
1318                 break;
1319         }
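             /*
              * Illustrative sketch (not part of the driver): DP_ISPOLLED
              * lets a caller ask whether an fd is cached by this /dev/poll
              * handle.  dpfd and sockfd are hypothetical:
              *
              *      pollfd_t pfd;
              *
              *      pfd.fd = sockfd;
              *      if (ioctl(dpfd, DP_ISPOLLED, &pfd) == 1)
              *              ...pfd.revents holds the cached events...
              *
              * Note that revents reports the events being polled for (the
              * cached pd_events above), not pending event state.
              */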
1320 
1321         default:
1322                 DP_REFRELE(dpep);
1323                 return (EINVAL);
1324         }
1325         DP_REFRELE(dpep);
1326         return (error);
1327 }
1328 
1329 /*
1330  * Overview of Recursive Polling
1331  *
1332  * It is possible for /dev/poll to poll for events on file descriptors which
1333  * themselves are /dev/poll handles.  Pending events in the child handle are
1334  * represented as readable data via the POLLIN flag.  To limit surface area,
1335  * this recursion is presently allowed only on /dev/poll handles which have
1336  * been placed in epoll mode via the DP_EPOLLCOMPAT ioctl.  Recursion depth is
1337  * limited to 5 in order to be consistent with Linux epoll.
1338  *
1339  * Extending dppoll() for VOP_POLL:
1340  *
1341  * The recursive /dev/poll implementation begins by extending dppoll() to
1342  * report when resources contained in the pollcache have relevant event state.
1343  * At the highest level, this means calling dp_pcache_poll() so that it
1344  * indicates whether fd events are present without consuming them or altering
1345  * the pollcache bitmap.  This ensures that a subsequent DP_POLL operation on
1346  * the bitmap will yield the initiating event.  Additionally, the VOP_POLL
1347  * should return in such a way that dp_pcache_poll() does not clear the parent
1348  * bitmap entry which corresponds to the child /dev/poll fd.  This means that
1349  * child pollcaches will be checked during every poll, which facilitates the
1350  * wake-up behavior detailed below.
1351  *
1352  * Pollcache Links and Wake Events:
1353  *
1354  * Recursive /dev/poll avoids complicated pollcache locking constraints during
1355  * pollwakeup events by eschewing the traditional pollhead mechanism in favor
1356  * of a different approach.  For each pollcache at the root of a recursive
1357  * /dev/poll "tree", pcachelink_t structures are established to all child
1358  * /dev/poll pollcaches.  During pollnotify() in a child pollcache, the
1359  * linked list of pcachelink_t entries is walked, and those marked as valid
1360  * trigger a cv_broadcast to their parent pollcache.  Most notably, these
1361  * pcachelink_t cv wakeups are performed without acquiring pc_lock on the
1362  * parent pollcache (which would require careful deadlock avoidance).  This
1363  * still allows the woken poll on the parent to discover the pertinent events
1364  * due to the fact that bitmap entries for the child pollcache are always
1365  * maintained by the dppoll() logic above.
1366  *
1367  * Depth Limiting and Loop Prevention:
1368  *
1369  * As each pollcache is encountered (either via DP_POLL or dppoll()), depth and
1370  * loop constraints are enforced via pollstate_enter().  The pollcache_t
1371  * pointer is compared against any existing entries in ps_pc_stack and is added
1372  * to the end if no match (i.e., no loop) is found.  Once poll operations
1373  * for a given pollcache_t are complete, pollstate_exit() clears the pointer
1374  * from the list.  The pollstate_enter() and pollstate_exit() functions are
1375  * responsible for acquiring and releasing pc_lock, respectively.
1376  *
1377  * Deadlock Safety:
1378  *
1379  * Descending through a tree of recursive /dev/poll handles involves the tricky
1380  * business of sequentially entering multiple pollcache locks.  No lock
1381  * acquisition order can be defined over such a tree topology that is immune
1382  * to deadlocks between threads.  The pollstate_enter() and
1383  * pollstate_exit() functions provide an interface for recursive /dev/poll
1384  * operations to safely lock pollcaches while failing gracefully in the face of
1385  * deadlocking topologies. (See pollstate_contend() for more detail about how
1386  * deadlocks are detected and resolved.)
1387  */
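     /*
      * Illustrative sketch (not part of the driver): under the epoll
      * compatibility layer, the recursion described above arises when one
      * epoll descriptor is added to another, as in this hypothetical
      * userland fragment:
      *
      *      int inner = epoll_create1(0);
      *      int outer = epoll_create1(0);
      *      struct epoll_event ev;
      *
      *      ev.events = EPOLLIN;
      *      ev.data.fd = inner;
      *      epoll_ctl(outer, EPOLL_CTL_ADD, inner, &ev);
      *
      * Events pending on "inner" are then reported to "outer" as POLLIN.
      * Depth and loop violations surface as EINVAL and ELOOP respectively,
      * per the dppoll() error handling below.
      */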
1388 
1389 /*ARGSUSED*/
1390 static int
1391 dppoll(dev_t dev, short events, int anyyet, short *reventsp,
1392     struct pollhead **phpp)
1393 {
1394         minor_t         minor;
1395         dp_entry_t      *dpep;
1396         pollcache_t     *pcp;
1397         int             res, rc = 0;
1398 
1399         minor = getminor(dev);
1400         mutex_enter(&devpoll_lock);
1401         ASSERT(minor < dptblsize);
1402         dpep = devpolltbl[minor];
1403         ASSERT(dpep != NULL);
1404         mutex_exit(&devpoll_lock);
1405 
1406         mutex_enter(&dpep->dpe_lock);
1407         if ((dpep->dpe_flag & DP_ISEPOLLCOMPAT) == 0) {
1408                 /* Poll recursion is not yet supported for non-epoll handles */
1409                 *reventsp = POLLERR;
1410                 mutex_exit(&dpep->dpe_lock);
1411                 return (0);
1412         } else {
1413                 dpep->dpe_refcnt++;
1414                 pcp = dpep->dpe_pcache;
1415                 mutex_exit(&dpep->dpe_lock);
1416         }
1417 
1418         res = pollstate_enter(pcp);
1419         if (res == PSE_SUCCESS) {
1420                 nfds_t          nfds = 1;
1421                 int             fdcnt = 0;
1422                 pollstate_t     *ps = curthread->t_pollstate;
1423 
1424                 rc = dp_pcache_poll(dpep, NULL, pcp, nfds, &fdcnt);
1425                 if (rc == 0) {
1426                         *reventsp = (fdcnt > 0) ? POLLIN : 0;
1427                 }
1428                 pcachelink_assoc(pcp, ps->ps_pc_stack[0]);
1429                 pollstate_exit(pcp);
1430         } else {
1431                 switch (res) {
1432                 case PSE_FAIL_DEPTH:
1433                         rc = EINVAL;
1434                         break;
1435                 case PSE_FAIL_LOOP:
1436                 case PSE_FAIL_DEADLOCK:
1437                         rc = ELOOP;
1438                         break;
1439                 default:
1440                         /*
1441                          * If anything else has gone awry, such as being polled
1442                          * from an unexpected context, fall back to the
1443                          * recursion-intolerant response.
1444                          */
1445                         *reventsp = POLLERR;
1446                         rc = 0;
1447                         break;
1448                 }
1449         }
1450 
1451         DP_REFRELE(dpep);
1452         return (rc);
1453 }
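     /*
      * To summarize the pollstate_enter() dispositions handled above:
      *
      *      PSE_SUCCESS             poll the child pollcache, associate links
      *      PSE_FAIL_DEPTH          EINVAL (nested beyond the allowed depth)
      *      PSE_FAIL_LOOP           ELOOP (cycle detected in the tree)
      *      PSE_FAIL_DEADLOCK       ELOOP (deadlock-prone topology detected)
      *      other                   POLLERR reported, no error returned
      */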
1454 
1455 /*
1456  * devpoll close should do enough cleanup before the pollcache is deleted,
1457  * i.e., it should ensure that no one still references the pollcache
1458  * afterward.  There is no "permission" check in here; any process holding
1459  * the last reference to this /dev/poll fd may close it.
1460  */
1461 /*ARGSUSED*/
1462 static int
1463 dpclose(dev_t dev, int flag, int otyp, cred_t *credp)
1464 {
1465         minor_t         minor;
1466         dp_entry_t      *dpep;
1467         pollcache_t     *pcp;
1468         int             i;
1469         polldat_t       **hashtbl;
1470         polldat_t       *pdp;
1471 
1472         minor = getminor(dev);
1473 
1474         mutex_enter(&devpoll_lock);
1475         dpep = devpolltbl[minor];
1476         ASSERT(dpep != NULL);
1477         devpolltbl[minor] = NULL;
1478         mutex_exit(&devpoll_lock);
1479         pcp = dpep->dpe_pcache;
1480         ASSERT(pcp != NULL);
1481         /*
1482          * At this point, no other lwp can access this pollcache via the
1483          * /dev/poll fd.  This pollcache is going away, so do the cleanup
1484          * without holding pc_lock.
1485          */
1486         hashtbl = pcp->pc_hash;
1487         for (i = 0; i < pcp->pc_hashsize; i++) {
1488                 for (pdp = hashtbl[i]; pdp; pdp = pdp->pd_hashnext) {
1489                         if (pdp->pd_php != NULL) {
1490                                 pollhead_delete(pdp->pd_php, pdp);
1491                                 pdp->pd_php = NULL;
1492                                 pdp->pd_fp = NULL;
1493                         }
1494                 }
1495         }
1496         /*
1497          * pollwakeup() may still interact with this pollcache. Wait until
1498          * it is done.
1499          */
1500         mutex_enter(&pcp->pc_no_exit);
1501         ASSERT(pcp->pc_busy >= 0);
1502         while (pcp->pc_busy > 0)
1503                 cv_wait(&pcp->pc_busy_cv, &pcp->pc_no_exit);
1504         mutex_exit(&pcp->pc_no_exit);
1505 
1506         /* Clean up any pollcache links created via recursive /dev/poll */
1507         if (pcp->pc_parents != NULL || pcp->pc_children != NULL) {
1508                 /*
1509                  * Because of the locking rules for pcachelink manipulation,
1510                  * acquiring pc_lock is required for this step.
1511                  */
1512                 mutex_enter(&pcp->pc_lock);
1513                 pcachelink_purge_all(pcp);
1514                 mutex_exit(&pcp->pc_lock);
1515         }
1516 
1517         pcache_destroy(pcp);
1518         ASSERT(dpep->dpe_refcnt == 0);
1519         kmem_free(dpep, sizeof (dp_entry_t));
1520         return (0);
1521 }
1522 
1523 static void
1524 pcachelink_locked_rele(pcachelink_t *pl)
1525 {
1526         ASSERT(MUTEX_HELD(&pl->pcl_lock));
1527         VERIFY(pl->pcl_refcnt >= 1);
1528 
1529         pl->pcl_refcnt--;
1530         if (pl->pcl_refcnt == 0) {
1531                 VERIFY(pl->pcl_state == PCL_INVALID);
1532                 ASSERT(pl->pcl_parent_pc == NULL);
1533                 ASSERT(pl->pcl_child_pc == NULL);
1534                 ASSERT(pl->pcl_parent_next == NULL);
1535                 ASSERT(pl->pcl_child_next == NULL);
1536 
1537                 pl->pcl_state = PCL_FREE;
1538                 mutex_destroy(&pl->pcl_lock);
1539                 kmem_free(pl, sizeof (pcachelink_t));
1540         } else {
1541                 mutex_exit(&pl->pcl_lock);
1542         }
1543 }
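     /*
      * A pcachelink_t moves through a small lifecycle, visible in the
      * routines below: PCL_VALID while both ends are live; PCL_STALE once
      * the parent begins a new poll pass (pcachelink_mark_stale()); back to
      * PCL_VALID if the child is re-encountered during that pass
      * (pcachelink_assoc()); PCL_INVALID once either end detaches it; and
      * finally PCL_FREE in pcachelink_locked_rele() when the last reference
      * is dropped.
      */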
1544 
1545 /*
1546  * Associate parent and child pollcaches via a pcachelink_t.  If an existing
1547  * link (stale or valid) between the two is found, it will be reused.  If a
1548  * suitable link is not found for reuse, a new one will be allocated.
1549  */
1550 static void
1551 pcachelink_assoc(pollcache_t *child, pollcache_t *parent)
1552 {
1553         pcachelink_t    *pl, **plpn;
1554 
1555         ASSERT(MUTEX_HELD(&child->pc_lock));
1556         ASSERT(MUTEX_HELD(&parent->pc_lock));
1557 
1558         /* Search for an existing link we can reuse. */
1559         plpn = &child->pc_parents;
1560         for (pl = child->pc_parents; pl != NULL; pl = *plpn) {
1561                 mutex_enter(&pl->pcl_lock);
1562                 if (pl->pcl_state == PCL_INVALID) {
1563                         /* Clean any invalid links while walking the list */
1564                         *plpn = pl->pcl_parent_next;
1565                         pl->pcl_child_pc = NULL;
1566                         pl->pcl_parent_next = NULL;
1567                         pcachelink_locked_rele(pl);
1568                 } else if (pl->pcl_parent_pc == parent) {
1569                         /* Successfully found parent link */
1570                         ASSERT(pl->pcl_state == PCL_VALID ||
1571                             pl->pcl_state == PCL_STALE);
1572                         pl->pcl_state = PCL_VALID;
1573                         mutex_exit(&pl->pcl_lock);
1574                         return;
1575                 } else {
1576                         plpn = &pl->pcl_parent_next;
1577                         mutex_exit(&pl->pcl_lock);
1578                 }
1579         }
1580 
1581         /* No existing link to the parent was found.  Create a fresh one. */
1582         pl = kmem_zalloc(sizeof (pcachelink_t), KM_SLEEP);
1583         mutex_init(&pl->pcl_lock, NULL, MUTEX_DEFAULT, NULL);
1584 
1585         pl->pcl_parent_pc = parent;
1586         pl->pcl_child_next = parent->pc_children;
1587         parent->pc_children = pl;
1588         pl->pcl_refcnt++;
1589 
1590         pl->pcl_child_pc = child;
1591         pl->pcl_parent_next = child->pc_parents;
1592         child->pc_parents = pl;
1593         pl->pcl_refcnt++;
1594 
1595         pl->pcl_state = PCL_VALID;
1596 }
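     /*
      * The new link is threaded onto one list in each pollcache, one list
      * per direction, which is why it is created holding a refcnt of 2:
      *
      *      parent->pc_children -> pl -> pcl_child_next  -> ...
      *      child->pc_parents   -> pl -> pcl_parent_next -> ...
      */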
1597 
1598 /*
1599  * Mark all child links in a pollcache as stale.  Any invalid child links found
1600  * during iteration are purged.
1601  */
1602 static void
1603 pcachelink_mark_stale(pollcache_t *pcp)
1604 {
1605         pcachelink_t    *pl, **plpn;
1606 
1607         ASSERT(MUTEX_HELD(&pcp->pc_lock));
1608 
1609         plpn = &pcp->pc_children;
1610         for (pl = pcp->pc_children; pl != NULL; pl = *plpn) {
1611                 mutex_enter(&pl->pcl_lock);
1612                 if (pl->pcl_state == PCL_INVALID) {
1613                         /*
1614                          * Remove any invalid links while we are going to the
1615                          * trouble of walking the list.
1616                          */
1617                         *plpn = pl->pcl_child_next;
1618                         pl->pcl_parent_pc = NULL;
1619                         pl->pcl_child_next = NULL;
1620                         pcachelink_locked_rele(pl);
1621                 } else {
1622                         pl->pcl_state = PCL_STALE;
1623                         plpn = &pl->pcl_child_next;
1624                         mutex_exit(&pl->pcl_lock);
1625                 }
1626         }
1627 }
1628 
1629 /*
1630  * Purge all stale (or invalid) child links from a pollcache.
1631  */
1632 static void
1633 pcachelink_purge_stale(pollcache_t *pcp)
1634 {
1635         pcachelink_t    *pl, **plpn;
1636 
1637         ASSERT(MUTEX_HELD(&pcp->pc_lock));
1638 
1639         plpn = &pcp->pc_children;
1640         for (pl = pcp->pc_children; pl != NULL; pl = *plpn) {
1641                 mutex_enter(&pl->pcl_lock);
1642                 switch (pl->pcl_state) {
1643                 case PCL_STALE:
1644                         pl->pcl_state = PCL_INVALID;
1645                         /* FALLTHROUGH */
1646                 case PCL_INVALID:
1647                         *plpn = pl->pcl_child_next;
1648                         pl->pcl_parent_pc = NULL;
1649                         pl->pcl_child_next = NULL;
1650                         pcachelink_locked_rele(pl);
1651                         break;
1652                 default:
1653                         plpn = &pl->pcl_child_next;
1654                         mutex_exit(&pl->pcl_lock);
1655                 }
1656         }
1657 }
1658 
1659 /*
1660  * Purge all child and parent links from a pollcache, regardless of status.
1661  */
1662 static void
1663 pcachelink_purge_all(pollcache_t *pcp)
1664 {
1665         pcachelink_t    *pl, **plpn;
1666 
1667         ASSERT(MUTEX_HELD(&pcp->pc_lock));
1668 
1669         plpn = &pcp->pc_parents;
1670         for (pl = pcp->pc_parents; pl != NULL; pl = *plpn) {
1671                 mutex_enter(&pl->pcl_lock);
1672                 pl->pcl_state = PCL_INVALID;
1673                 *plpn = pl->pcl_parent_next;
1674                 pl->pcl_child_pc = NULL;
1675                 pl->pcl_parent_next = NULL;
1676                 pcachelink_locked_rele(pl);
1677         }
1678 
1679         plpn = &pcp->pc_children;
1680         for (pl = pcp->pc_children; pl != NULL; pl = *plpn) {
1681                 mutex_enter(&pl->pcl_lock);
1682                 pl->pcl_state = PCL_INVALID;
1683                 *plpn = pl->pcl_child_next;
1684                 pl->pcl_parent_pc = NULL;
1685                 pl->pcl_child_next = NULL;
1686                 pcachelink_locked_rele(pl);
1687         }
1688 
1689         ASSERT(pcp->pc_parents == NULL);
1690         ASSERT(pcp->pc_children == NULL);
1691 }