/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
/*      All Rights Reserved */

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/conf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/pathname.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/var.h>
#include <sys/cpuvar.h>
#include <sys/open.h>
#include <sys/cmn_err.h>
#include <sys/priocntl.h>
#include <sys/procset.h>
#include <sys/prsystm.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/atomic.h>
#include <sys/fcntl.h>
#include <sys/poll.h>
#include <sys/rctl.h>
#include <sys/port_impl.h>

#include <c2/audit.h>
#include <sys/nbmlock.h>

#ifdef DEBUG

static uint32_t afd_maxfd;      /* # of entries in maximum allocated array */
static uint32_t afd_alloc;      /* count of kmem_alloc()s */
static uint32_t afd_free;       /* count of kmem_free()s */
static uint32_t afd_wait;       /* count of waits on non-zero ref count */
#define MAXFD(x)        (afd_maxfd = ((afd_maxfd >= (x))? afd_maxfd : (x)))
#define COUNT(x)        atomic_add_32(&x, 1)

#else   /* DEBUG */

#define MAXFD(x)
#define COUNT(x)

#endif  /* DEBUG */

kmem_cache_t *file_cache;

static void port_close_fd(portfd_t *);

/*
 * File descriptor allocation.
 *
 * fd_find(fip, minfd) finds the first available descriptor >= minfd.
 * The most common case is open(2), in which minfd = 0, but we must also
 * support fcntl(fd, F_DUPFD, minfd).
 *
 * The algorithm is as follows: we keep all file descriptors in an infix
 * binary tree in which each node records the number of descriptors
 * allocated in its right subtree, including itself.  Starting at minfd,
 * we ascend the tree until we find a non-fully allocated right subtree.
 * We then descend that subtree in a binary search for the smallest fd.
 * Finally, we ascend the tree again to increment the allocation count
 * of every subtree containing the newly-allocated fd.  Freeing an fd
 * requires only the last step: we ascend the tree to decrement allocation
 * counts.  Each of these three steps (ascent to find non-full subtree,
 * descent to find lowest fd, ascent to update allocation counts) is
 * O(log n), thus the algorithm as a whole is O(log n).
 *
 * We don't implement the fd tree using the customary left/right/parent
 * pointers, but instead take advantage of the glorious mathematics of
 * full infix binary trees.  For reference, here's an illustration of the
 * logical structure of such a tree, rooted at 4 (binary 100), covering
 * the range 1-7 (binary 001-111).  Our canonical trees do not include
 * fd 0; we'll deal with that later.
 *
 *            100
 *           /   \
 *          /     \
 *        010     110
 *        / \     / \
 *      001 011 101 111
 *
 * We make the following observations, all of which are easily proven by
 * induction on the depth of the tree:
 *
 * (T1) The least-significant bit (LSB) of any node is equal to its level
 *      in the tree.  In our example, nodes 001, 011, 101 and 111 are at
 *      level 0; nodes 010 and 110 are at level 1; and node 100 is at level 2.
 *
 * (T2) The child size (CSIZE) of node N -- that is, the total number of
 *      right-branch descendants in a child of node N, including itself -- is
 *      given by clearing all but the least significant bit of N.  This
 *      follows immediately from (T1).  Applying this rule to our example, we
 *      see that CSIZE(100) = 100, CSIZE(x10) = 010, and CSIZE(xx1) = 001.
 *
 * (T3) The nearest left ancestor (LPARENT) of node N -- that is, the nearest
 *      ancestor containing node N in its right child -- is given by clearing
 *      the LSB of N.  For example, LPARENT(111) = 110 and LPARENT(110) = 100.
 *      Clearing the LSB of nodes 001, 010 or 100 yields zero, reflecting
 *      the fact that these are leftmost nodes.  Note that this algorithm
 *      automatically skips generations as necessary.  For example, the parent
 *      of node 101 is 110, which is a *right* ancestor (not what we want);
 *      but its grandparent is 100, which is a left ancestor. Clearing the LSB
 *      of 101 gets us to 100 directly, skipping right past the uninteresting
 *      generation (110).
 *
 *      Note that since LPARENT clears the LSB, whereas CSIZE clears all *but*
 *      the LSB, we can express LPARENT() nicely in terms of CSIZE():
 *
 *      LPARENT(N) = N - CSIZE(N)
 *
 * (T4) The nearest right ancestor (RPARENT) of node N is given by:
 *
 *      RPARENT(N) = N + CSIZE(N)
 *
 * (T5) For every interior node, the children differ from their parent by
 *      CSIZE(parent) / 2.  In our example, CSIZE(100) / 2 = 2 = 10 binary,
 *      and indeed, the children of 100 are 100 +/- 10 = 010 and 110.
 *
 * Next, we'll need a few two's-complement math tricks.  Suppose a number,
 * N, has the following form:
 *
 *              N = xxxx10...0
 *
 * That is, the binary representation of N consists of some string of bits,
 * then a 1, then all zeroes.  This amounts to nothing more than saying that
 * N has a least-significant bit, which is true for any N != 0.  If we look
 * at N and N - 1 together, we see that we can combine them in useful ways:
 *
 *                N = xxxx10...0
 *            N - 1 = xxxx01...1
 *      ------------------------
 *      N & (N - 1) = xxxx000000
 *      N | (N - 1) = xxxx111111
 *      N ^ (N - 1) =     111111
 *
 * In particular, this suggests several easy ways to clear all but the LSB,
 * which by (T2) is exactly what we need to determine CSIZE(N) = 10...0.
 * We'll opt for this formulation:
 *
 *      (C1) CSIZE(N) = (N - 1) ^ (N | (N - 1))
 *
 * Similarly, we have an easy way to determine LPARENT(N), which requires
 * that we clear the LSB of N:
 *
 *      (L1) LPARENT(N) = N & (N - 1)
 *
 * We note in the above relations that (N | (N - 1)) - N = CSIZE(N) - 1.
 * When combined with (T4), this yields an easy way to compute RPARENT(N):
 *
 *      (R1) RPARENT(N) = (N | (N - 1)) + 1
 *
 * Finally, to accommodate fd 0 we must adjust all of our results by +/-1 to
 * move the fd range from [1, 2^n) to [0, 2^n - 1).  This is straightforward,
 * so there's no need to belabor the algebra; the revised relations become:
 *
 *      (C1a) CSIZE(N) = N ^ (N | (N + 1))
 *
 *      (L1a) LPARENT(N) = (N & (N + 1)) - 1
 *
 *      (R1a) RPARENT(N) = N | (N + 1)
 *
 * This completes the mathematical framework.  We now have all the tools
 * we need to implement fd_find() and fd_reserve().
 *
 * fd_find(fip, minfd) finds the smallest available file descriptor >= minfd.
 * It does not actually allocate the descriptor; that's done by fd_reserve().
 * fd_find() proceeds in two steps:
 *
 * (1) Find the leftmost subtree that contains a descriptor >= minfd.
 *     We start at the right subtree rooted at minfd.  If this subtree is
 *     not full -- if fip->fi_list[minfd].uf_alloc != CSIZE(minfd) -- then
 *     step 1 is done.  Otherwise, we know that all fds in this subtree
 *     are taken, so we ascend to RPARENT(minfd) using (R1a).  We repeat
 *     this process until we either find a candidate subtree or exceed
 *     fip->fi_nfiles.  We use (C1a) to compute CSIZE().
 *
 * (2) Find the smallest fd in the subtree discovered by step 1.
 *     Starting at the root of this subtree, we descend to find the
 *     smallest available fd.  Since the left children have the smaller
 *     fds, we will descend rightward only when the left child is full.
 *
 *     We begin by comparing the number of allocated fds in the root
 *     to the number of allocated fds in its right child; if they differ
 *     by exactly CSIZE(child), we know the left subtree is full, so we
 *     descend right; that is, the right child becomes the search root.
 *     Otherwise we leave the root alone and start following the right
 *     child's left children.  As fortune would have it, this is very
 *     simple computationally: by (T5), the right child of fd is just
 *     fd + size, where size = CSIZE(fd) / 2.  Applying (T5) again,
 *     we find that the right child's left child is fd + size - (size / 2) =
 *     fd + (size / 2); *its* left child is fd + (size / 2) - (size / 4) =
 *     fd + (size / 4), and so on.  In general, fd's right child's
 *     leftmost nth descendant is fd + (size >> n).  Thus, to follow
 *     the right child's left descendants, we just halve the size in
 *     each iteration of the search.
 *
 *     When we descend leftward, we must keep track of the number of fds
 *     that were allocated in all the right subtrees we rejected, so we
 *     know how many of the root fd's allocations are in the remaining
 *     (as yet unexplored) leftmost part of its right subtree.  When we
 *     encounter a fully-allocated left child -- that is, when we find
 *     that fip->fi_list[fd].uf_alloc == ralloc + size -- we descend right
 *     (as described earlier), resetting ralloc to zero.
 *
 * fd_reserve(fip, fd, incr) either allocates or frees fd, depending
 * on whether incr is 1 or -1.  Starting at fd, fd_reserve() ascends
 * the leftmost ancestors (see (T3)) and updates the allocation counts.
 * At each step we use (L1a) to compute LPARENT(), the next left ancestor.
 *
 * flist_minsize() finds the minimal tree that still covers all
 * used fds; as long as the allocation count of a root node is zero, we
 * don't need that node or its right subtree.
 *
 * flist_nalloc() counts the number of allocated fds in the tree, by starting
 * at the top of the tree and summing the right-subtree allocation counts as
 * it descends leftwards.
 *
 * Note: we assume that flist_grow() will keep fip->fi_nfiles of the form
 * 2^n - 1.  This ensures that the fd trees are always full, which saves
 * quite a bit of boundary checking.
 */
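
/*
 * To make the adjusted identities concrete, here is a small worked
 * example on the 0-based tree covering fds 0-6 (fi_nfiles = 7).  The
 * helper names below are illustrative only -- a user-level sketch, not
 * part of this file -- since the kernel computes the same expressions
 * inline in fd_find() and fd_reserve():
 *
 *      int csize(int n)   { return (n ^ (n | (n + 1))); }      (C1a)
 *      int lparent(int n) { return ((n & (n + 1)) - 1); }      (L1a)
 *      int rparent(int n) { return (n | (n + 1)); }            (R1a)
 *
 *      csize(3) == 4           0-based 3 is 1-based 100; CSIZE = 100
 *      csize(1) == 2           1-based 010; CSIZE = 010
 *      csize(4) == 1           1-based 101; CSIZE = 001
 *      lparent(4) == 3         1-based: LPARENT(101) = 100
 *      lparent(3) == -1        the root has no left ancestor
 *      rparent(1) == 3         1-based: RPARENT(010) = 100
 *      rparent(4) == 5         1-based: RPARENT(101) = 110
 */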
static int
fd_find(uf_info_t *fip, int minfd)
{
        int size, ralloc, fd;

        ASSERT(MUTEX_HELD(&fip->fi_lock));
        ASSERT((fip->fi_nfiles & (fip->fi_nfiles + 1)) == 0);

        for (fd = minfd; (uint_t)fd < fip->fi_nfiles; fd |= fd + 1) {
                size = fd ^ (fd | (fd + 1));
                if (fip->fi_list[fd].uf_alloc == size)
                        continue;
                for (ralloc = 0, size >>= 1; size != 0; size >>= 1) {
                        ralloc += fip->fi_list[fd + size].uf_alloc;
                        if (fip->fi_list[fd].uf_alloc == ralloc + size) {
                                fd += size;
                                ralloc = 0;
                        }
                }
                return (fd);
        }
        return (-1);
}
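
/*
 * A brief trace of fd_find() on the 7-entry example: suppose fds 0, 1
 * and 2 are allocated, so uf_alloc[] = { 1, 2, 1, 0, 0, 0, 0 }, and we
 * call fd_find(fip, 0).  Step 1 ascends via (R1a): fd = 0 is full
 * (uf_alloc 1 == CSIZE 1), fd = 1 is full (2 == 2), fd = 3 is not
 * (0 != 4).  Step 2 descends from fd = 3 with size = 2, then 1; neither
 * probe finds uf_alloc[3] == ralloc + size, so we never step rightward
 * and return 3 -- the smallest available descriptor.
 */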

static void
fd_reserve(uf_info_t *fip, int fd, int incr)
{
        int pfd;
        uf_entry_t *ufp = &fip->fi_list[fd];

        ASSERT((uint_t)fd < fip->fi_nfiles);
        ASSERT((ufp->uf_busy == 0 && incr == 1) ||
            (ufp->uf_busy == 1 && incr == -1));
        ASSERT(MUTEX_HELD(&ufp->uf_lock));
        ASSERT(MUTEX_HELD(&fip->fi_lock));

        for (pfd = fd; pfd >= 0; pfd = (pfd & (pfd + 1)) - 1)
                fip->fi_list[pfd].uf_alloc += incr;

        ufp->uf_busy += incr;
}
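
/*
 * For example, fd_reserve(fip, 4, 1) on the 7-entry tree walks the
 * left-ancestor chain given by (L1a): pfd = 4, then (4 & 5) - 1 = 3,
 * then (3 & 4) - 1 = -1, which ends the loop.  It thus increments
 * uf_alloc[4] and uf_alloc[3] -- in 1-based terms, node 101 and its
 * left ancestor 100, exactly the nodes whose counts cover fd 4.
 */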
 295 
 296 static int
 297 flist_minsize(uf_info_t *fip)
 298 {
 299         int fd;
 300 
 301         /*
 302          * We'd like to ASSERT(MUTEX_HELD(&fip->fi_lock)), but we're called
 303          * by flist_fork(), which relies on other mechanisms for mutual
 304          * exclusion.
 305          */
 306         ASSERT((fip->fi_nfiles & (fip->fi_nfiles + 1)) == 0);
 307 
 308         for (fd = fip->fi_nfiles; fd != 0; fd >>= 1)
 309                 if (fip->fi_list[fd >> 1].uf_alloc != 0)
 310                         break;
 311 
 312         return (fd);
 313 }
 314 
 315 static int
 316 flist_nalloc(uf_info_t *fip)
 317 {
 318         int fd;
 319         int nalloc = 0;
 320 
 321         ASSERT(MUTEX_HELD(&fip->fi_lock));
 322         ASSERT((fip->fi_nfiles & (fip->fi_nfiles + 1)) == 0);
 323 
 324         for (fd = fip->fi_nfiles; fd != 0; fd >>= 1)
 325                 nalloc += fip->fi_list[fd >> 1].uf_alloc;
 326 
 327         return (nalloc);
 328 }
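
/*
 * Both functions above walk the leftmost spine of the tree: with
 * fi_nfiles = 7 they examine uf_alloc[3], uf_alloc[1] and uf_alloc[0]
 * (1-based nodes 100, 010 and 001), whose counted regions partition
 * the whole fd range, so flist_nalloc() counts every allocated fd
 * exactly once.  In the running example with fds 0-2 allocated, it
 * returns 0 + 2 + 1 = 3, and flist_minsize() also returns 3, since
 * uf_alloc[3] is zero but uf_alloc[1] is not: the tree can shrink to
 * 3 entries.
 */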

/*
 * Increase size of the fi_list array to accommodate at least maxfd.
 * We keep the size of the form 2^n - 1 for benefit of fd_find().
 */
static void
flist_grow(int maxfd)
{
        uf_info_t *fip = P_FINFO(curproc);
        int newcnt, oldcnt;
        uf_entry_t *src, *dst, *newlist, *oldlist, *newend, *oldend;
        uf_rlist_t *urp;

        for (newcnt = 1; newcnt <= maxfd; newcnt = (newcnt << 1) | 1)
                continue;

        newlist = kmem_zalloc(newcnt * sizeof (uf_entry_t), KM_SLEEP);

        mutex_enter(&fip->fi_lock);
        oldcnt = fip->fi_nfiles;
        if (newcnt <= oldcnt) {
                mutex_exit(&fip->fi_lock);
                kmem_free(newlist, newcnt * sizeof (uf_entry_t));
                return;
        }
        ASSERT((newcnt & (newcnt + 1)) == 0);
        oldlist = fip->fi_list;
        oldend = oldlist + oldcnt;
        newend = newlist + oldcnt;      /* no need to lock beyond old end */

        /*
         * fi_list and fi_nfiles cannot change while any uf_lock is held,
         * so we must grab all the old locks *and* the new locks up to oldcnt.
         * (Locks beyond the end of oldcnt aren't visible until we store
         * the new fi_nfiles, which is the last thing we do before dropping
         * all the locks, so there's no need to acquire these locks).
         * Holding the new locks is necessary because when fi_list changes
         * to point to the new list, fi_nfiles won't have been stored yet.
         * If we *didn't* hold the new locks, someone doing a UF_ENTER()
         * could see the new fi_list, grab the new uf_lock, and then see
         * fi_nfiles change while the lock is held -- in violation of
         * UF_ENTER() semantics.
         */
        for (src = oldlist; src < oldend; src++)
                mutex_enter(&src->uf_lock);

        for (dst = newlist; dst < newend; dst++)
                mutex_enter(&dst->uf_lock);

        for (src = oldlist, dst = newlist; src < oldend; src++, dst++) {
                dst->uf_file = src->uf_file;
                dst->uf_fpollinfo = src->uf_fpollinfo;
                dst->uf_refcnt = src->uf_refcnt;
                dst->uf_alloc = src->uf_alloc;
                dst->uf_flag = src->uf_flag;
                dst->uf_busy = src->uf_busy;
                dst->uf_portfd = src->uf_portfd;
        }

        /*
         * As soon as we store the new flist, future locking operations
         * will use it.  Therefore, we must ensure that all the state
         * we've just established reaches global visibility before the
         * new flist does.
         */
        membar_producer();
        fip->fi_list = newlist;

        /*
         * Routines like getf() make an optimistic check on the validity
         * of the supplied file descriptor: if it's less than the current
         * value of fi_nfiles -- examined without any locks -- then it's
         * safe to attempt a UF_ENTER() on that fd (which is a valid
         * assumption because fi_nfiles only increases).  Therefore, it
         * is critical that the new value of fi_nfiles not reach global
         * visibility until after the new fi_list: if it happened the
         * other way around, getf() could see the new fi_nfiles and attempt
         * a UF_ENTER() on the old fi_list, which would write beyond its
         * end if the fd exceeded the old fi_nfiles.
         */
        membar_producer();
        fip->fi_nfiles = newcnt;

        /*
         * The new state is consistent now, so we can drop all the locks.
         */
        for (dst = newlist; dst < newend; dst++)
                mutex_exit(&dst->uf_lock);

        for (src = oldlist; src < oldend; src++) {
                /*
                 * If any threads are blocked on the old cvs, wake them.
                 * This will force them to wake up, discover that fi_list
                 * has changed, and go back to sleep on the new cvs.
                 */
                cv_broadcast(&src->uf_wanted_cv);
                cv_broadcast(&src->uf_closing_cv);
                mutex_exit(&src->uf_lock);
        }

        mutex_exit(&fip->fi_lock);

        /*
         * Retire the old flist.  We can't actually kmem_free() it now
         * because someone may still have a pointer to it.  Instead,
         * we link it onto a list of retired flists.  The new flist
         * is at least double the size of the previous flist, so the
         * total size of all retired flists will be less than the size
         * of the current one (to prove, consider the sum of a geometric
         * series in powers of 2).  exit() frees the retired flists.
         */
        urp = kmem_zalloc(sizeof (uf_rlist_t), KM_SLEEP);
        urp->ur_list = oldlist;
        urp->ur_nfiles = oldcnt;

        mutex_enter(&fip->fi_lock);
        urp->ur_next = fip->fi_rlist;
        fip->fi_rlist = urp;
        mutex_exit(&fip->fi_lock);
}
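
/*
 * A note on sizing: the loop at the top of flist_grow() yields the
 * sequence 1, 3, 7, 15, ... (2^n - 1), stopping at the first value
 * strictly greater than maxfd; flist_grow(100), for instance, produces
 * a 127-entry list.  This is also why the retired flists cannot grow
 * without bound: the earlier sizes 1 + 3 + ... + (2^(n-1) - 1) sum to
 * less than 2^n - 1, the size of the current list.
 */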

/*
 * Utility functions for keeping track of the active file descriptors.
 */
void
clear_stale_fd()                /* called from post_syscall() */
{
        afd_t *afd = &curthread->t_activefd;
        int i;

        /* uninitialized is ok here, a_nfd is then zero */
        for (i = 0; i < afd->a_nfd; i++) {
                /* assert that this should not be necessary */
                ASSERT(afd->a_fd[i] == -1);
                afd->a_fd[i] = -1;
        }
        afd->a_stale = 0;
}

void
free_afd(afd_t *afd)            /* called below and from thread_free() */
{
        int i;

        /* free the buffer if it was kmem_alloc()ed */
        if (afd->a_nfd > sizeof (afd->a_buf) / sizeof (afd->a_buf[0])) {
                COUNT(afd_free);
                kmem_free(afd->a_fd, afd->a_nfd * sizeof (afd->a_fd[0]));
        }

        /* (re)initialize the structure */
        afd->a_fd = &afd->a_buf[0];
        afd->a_nfd = sizeof (afd->a_buf) / sizeof (afd->a_buf[0]);
        afd->a_stale = 0;
        for (i = 0; i < afd->a_nfd; i++)
                afd->a_fd[i] = -1;
}

static void
set_active_fd(int fd)
{
        afd_t *afd = &curthread->t_activefd;
        int i;
        int *old_fd;
        int old_nfd;
        int *new_fd;
        int new_nfd;

        if (afd->a_nfd == 0) {  /* first time initialization */
                ASSERT(fd == -1);
                mutex_enter(&afd->a_fdlock);
                free_afd(afd);
                mutex_exit(&afd->a_fdlock);
        }

        /* insert fd into vacant slot, if any */
        for (i = 0; i < afd->a_nfd; i++) {
                if (afd->a_fd[i] == -1) {
                        afd->a_fd[i] = fd;
                        return;
                }
        }

        /*
         * Reallocate the a_fd[] array to add one more slot.
         */
        ASSERT(fd == -1);
        old_nfd = afd->a_nfd;
        old_fd = afd->a_fd;
        new_nfd = old_nfd + 1;
        new_fd = kmem_alloc(new_nfd * sizeof (afd->a_fd[0]), KM_SLEEP);
        MAXFD(new_nfd);
        COUNT(afd_alloc);

        mutex_enter(&afd->a_fdlock);
        afd->a_fd = new_fd;
        afd->a_nfd = new_nfd;
        for (i = 0; i < old_nfd; i++)
                afd->a_fd[i] = old_fd[i];
        afd->a_fd[i] = fd;
        mutex_exit(&afd->a_fdlock);

        if (old_nfd > sizeof (afd->a_buf) / sizeof (afd->a_buf[0])) {
                COUNT(afd_free);
                kmem_free(old_fd, old_nfd * sizeof (afd->a_fd[0]));
        }
}

void
clear_active_fd(int fd)         /* called below and from aio.c */
{
        afd_t *afd = &curthread->t_activefd;
        int i;

        for (i = 0; i < afd->a_nfd; i++) {
                if (afd->a_fd[i] == fd) {
                        afd->a_fd[i] = -1;
                        break;
                }
        }
        ASSERT(i < afd->a_nfd);         /* not found is not ok */
}

/*
 * Does this thread have this fd active?
 */
static int
is_active_fd(kthread_t *t, int fd)
{
        afd_t *afd = &t->t_activefd;
        int i;

        ASSERT(t != curthread);
        mutex_enter(&afd->a_fdlock);
        /* uninitialized is ok here, a_nfd is then zero */
        for (i = 0; i < afd->a_nfd; i++) {
                if (afd->a_fd[i] == fd) {
                        mutex_exit(&afd->a_fdlock);
                        return (1);
                }
        }
        mutex_exit(&afd->a_fdlock);
        return (0);
}

/*
 * Convert a user-supplied file descriptor into a pointer to a file
 * structure.  The only task here is to check that the descriptor is
 * in range (the soft resource limit was enforced at open time and
 * shouldn't be checked here).
 */
file_t *
getf(int fd)
{
        uf_info_t *fip = P_FINFO(curproc);
        uf_entry_t *ufp;
        file_t *fp;

        if ((uint_t)fd >= fip->fi_nfiles)
                return (NULL);

        /*
         * Reserve a slot in the active fd array now so we can call
         * set_active_fd(fd) for real below, while still inside UF_ENTER().
         */
        set_active_fd(-1);

        UF_ENTER(ufp, fip, fd);

        if ((fp = ufp->uf_file) == NULL) {
                UF_EXIT(ufp);

                if (fd == fip->fi_badfd && fip->fi_action > 0)
                        tsignal(curthread, fip->fi_action);

                return (NULL);
        }
        ufp->uf_refcnt++;

        set_active_fd(fd);      /* record the active file descriptor */

        UF_EXIT(ufp);

        return (fp);
}
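
/*
 * The canonical consumer pattern, sketched here for illustration (the
 * vnode operation is a stand-in; real callers substitute their own):
 *
 *      file_t *fp;
 *
 *      if ((fp = getf(fd)) == NULL)
 *              return (set_errno(EBADF));
 *      error = VOP_SOMETHING(fp->f_vnode, ...);
 *      releasef(fd);
 *
 * The uf_refcnt taken by getf() is what closeandsetf() below waits on,
 * and the active fd recorded via set_active_fd() is how a closing
 * thread finds and interrupts lwps still using the descriptor.
 */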

/*
 * Close whatever file currently occupies the file descriptor slot
 * and install the new file, usually NULL, in the file descriptor slot.
 * The close must complete before we release the file descriptor slot.
 * If newfp != NULL, we return an error only if we can't allocate the
 * slot, so the caller knows that it needs to free the filep;
 * in all other cases we return the error number from closef().
 */
int
closeandsetf(int fd, file_t *newfp)
{
        proc_t *p = curproc;
        uf_info_t *fip = P_FINFO(p);
        uf_entry_t *ufp;
        file_t *fp;
        fpollinfo_t *fpip;
        portfd_t *pfd;
        int error;

        if ((uint_t)fd >= fip->fi_nfiles) {
                if (newfp == NULL)
                        return (EBADF);
                flist_grow(fd);
        }

        if (newfp != NULL) {
                /*
                 * If ufp is reserved but has no file pointer, it's in the
                 * transition between ufalloc() and setf().  We must wait
                 * for this transition to complete before assigning the
                 * new non-NULL file pointer.
                 */
                mutex_enter(&fip->fi_lock);
                if (fd == fip->fi_badfd) {
                        mutex_exit(&fip->fi_lock);
                        if (fip->fi_action > 0)
                                tsignal(curthread, fip->fi_action);
                        return (EBADF);
                }
                UF_ENTER(ufp, fip, fd);
                while (ufp->uf_busy && ufp->uf_file == NULL) {
                        mutex_exit(&fip->fi_lock);
                        cv_wait_stop(&ufp->uf_wanted_cv, &ufp->uf_lock, 250);
                        UF_EXIT(ufp);
                        mutex_enter(&fip->fi_lock);
                        UF_ENTER(ufp, fip, fd);
                }
                if ((fp = ufp->uf_file) == NULL) {
                        ASSERT(ufp->uf_fpollinfo == NULL);
                        ASSERT(ufp->uf_flag == 0);
                        fd_reserve(fip, fd, 1);
                        ufp->uf_file = newfp;
                        UF_EXIT(ufp);
                        mutex_exit(&fip->fi_lock);
                        return (0);
                }
                mutex_exit(&fip->fi_lock);
        } else {
                UF_ENTER(ufp, fip, fd);
                if ((fp = ufp->uf_file) == NULL) {
                        UF_EXIT(ufp);
                        return (EBADF);
                }
        }

        ASSERT(ufp->uf_busy);
        ufp->uf_file = NULL;
        ufp->uf_flag = 0;

        /*
         * If the file descriptor reference count is non-zero, then
         * some other lwp in the process is performing system call
         * activity on the file.  To avoid blocking here for a long
         * time (the other lwp might be in a long term sleep in its
         * system call), we scan all other lwps in the process to
         * find the ones with this fd as one of their active fds,
         * set their a_stale flag, and set them running if they
         * are in an interruptible sleep so they will emerge from
         * their system calls immediately.  post_syscall() will
         * test the a_stale flag and set errno to EBADF.
         */
        ASSERT(ufp->uf_refcnt == 0 || p->p_lwpcnt > 1);
        if (ufp->uf_refcnt > 0) {
                kthread_t *t;

                /*
                 * We call sprlock_proc(p) to ensure that the thread
                 * list will not change while we are scanning it.
                 * To do this, we must drop ufp->uf_lock and then
                 * reacquire it (so we are not holding both p->p_lock
                 * and ufp->uf_lock at the same time).  ufp->uf_lock
                 * must be held for is_active_fd() to be correct
                 * (set_active_fd() is called while holding ufp->uf_lock).
                 *
                 * This is a convoluted dance, but it is better than
                 * the old brute-force method of stopping every thread
                 * in the process by calling holdlwps(SHOLDFORK1).
                 */

                UF_EXIT(ufp);
                COUNT(afd_wait);

                mutex_enter(&p->p_lock);
                sprlock_proc(p);
                mutex_exit(&p->p_lock);

                UF_ENTER(ufp, fip, fd);
                ASSERT(ufp->uf_file == NULL);

                if (ufp->uf_refcnt > 0) {
                        for (t = curthread->t_forw;
                            t != curthread;
                            t = t->t_forw) {
                                if (is_active_fd(t, fd)) {
                                        thread_lock(t);
                                        t->t_activefd.a_stale = 1;
                                        t->t_post_sys = 1;
                                        if (ISWAKEABLE(t))
                                                setrun_locked(t);
                                        thread_unlock(t);
                                }
                        }
                }

                UF_EXIT(ufp);

                mutex_enter(&p->p_lock);
                sprunlock(p);

                UF_ENTER(ufp, fip, fd);
                ASSERT(ufp->uf_file == NULL);
        }

        /*
         * Wait for other lwps to stop using this file descriptor.
         */
        while (ufp->uf_refcnt > 0) {
                cv_wait_stop(&ufp->uf_closing_cv, &ufp->uf_lock, 250);
                /*
                 * cv_wait_stop() drops ufp->uf_lock, so the file list
                 * can change.  Drop the lock on our (possibly) stale
                 * ufp and let UF_ENTER() find and lock the current ufp.
                 */
                UF_EXIT(ufp);
                UF_ENTER(ufp, fip, fd);
        }

#ifdef DEBUG
        /*
         * catch a watchfd on device's pollhead list but not on fpollinfo list
         */
        if (ufp->uf_fpollinfo != NULL)
                checkwfdlist(fp->f_vnode, ufp->uf_fpollinfo);
#endif  /* DEBUG */

        /*
         * We may need to cleanup some cached poll states in t_pollstate
         * before the fd can be reused. It is important that we don't
         * access a stale thread structure. We will do the cleanup in two
         * phases to avoid deadlock and holding uf_lock for too long.
         * In phase 1, hold the uf_lock and call pollblockexit() to set
         * state in t_pollstate struct so that a thread does not exit on
         * us. In phase 2, we drop the uf_lock and call pollcacheclean().
         */
        pfd = ufp->uf_portfd;
        ufp->uf_portfd = NULL;
        fpip = ufp->uf_fpollinfo;
        ufp->uf_fpollinfo = NULL;
        if (fpip != NULL)
                pollblockexit(fpip);
        UF_EXIT(ufp);
        if (fpip != NULL)
                pollcacheclean(fpip, fd);
        if (pfd)
                port_close_fd(pfd);

        /*
         * Keep the file descriptor entry reserved across the closef().
         */
        error = closef(fp);

        setf(fd, newfp);

        /* Only return closef() error when closing is all we do */
        return (newfp == NULL ? error : 0);
}
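
/*
 * For reference, closing a descriptor without installing a replacement
 * amounts to closeandsetf(fd, NULL), which is how the close(2) path
 * uses it, while dup2(2)-style callers pass the file pointer they want
 * installed in the newly-vacated slot.
 */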

/*
 * Decrement uf_refcnt; wake up anyone waiting to close the file.
 */
void
releasef(int fd)
{
        uf_info_t *fip = P_FINFO(curproc);
        uf_entry_t *ufp;

        UF_ENTER(ufp, fip, fd);
        ASSERT(ufp->uf_refcnt > 0);
        clear_active_fd(fd);    /* clear the active file descriptor */
        if (--ufp->uf_refcnt == 0)
                cv_broadcast(&ufp->uf_closing_cv);
        UF_EXIT(ufp);
}

/*
 * Identical to releasef() but can be called from another process.
 */
void
areleasef(int fd, uf_info_t *fip)
{
        uf_entry_t *ufp;

        UF_ENTER(ufp, fip, fd);
        ASSERT(ufp->uf_refcnt > 0);
        if (--ufp->uf_refcnt == 0)
                cv_broadcast(&ufp->uf_closing_cv);
        UF_EXIT(ufp);
}

/*
 * Duplicate all file descriptors across a fork.
 */
void
flist_fork(uf_info_t *pfip, uf_info_t *cfip)
{
        int fd, nfiles;
        uf_entry_t *pufp, *cufp;

        mutex_init(&cfip->fi_lock, NULL, MUTEX_DEFAULT, NULL);
        cfip->fi_rlist = NULL;

        /*
         * We don't need to hold fi_lock because all other lwps in the
         * parent have been held.
         */
        cfip->fi_nfiles = nfiles = flist_minsize(pfip);

        cfip->fi_list = kmem_zalloc(nfiles * sizeof (uf_entry_t), KM_SLEEP);

        for (fd = 0, pufp = pfip->fi_list, cufp = cfip->fi_list; fd < nfiles;
            fd++, pufp++, cufp++) {
                cufp->uf_file = pufp->uf_file;
                cufp->uf_alloc = pufp->uf_alloc;
                cufp->uf_flag = pufp->uf_flag;
                cufp->uf_busy = pufp->uf_busy;
                if (pufp->uf_file == NULL) {
                        ASSERT(pufp->uf_flag == 0);
                        if (pufp->uf_busy) {
                                /*
                                 * Grab locks to appease ASSERTs in fd_reserve
                                 */
                                mutex_enter(&cfip->fi_lock);
                                mutex_enter(&cufp->uf_lock);
                                fd_reserve(cfip, fd, -1);
                                mutex_exit(&cufp->uf_lock);
                                mutex_exit(&cfip->fi_lock);
                        }
                }
        }
}

/*
 * Close all open file descriptors for the current process.
 * This is only called from exit(), which is single-threaded,
 * so we don't need any locking.
 */
void
closeall(uf_info_t *fip)
{
        int fd;
        file_t *fp;
        uf_entry_t *ufp;

        ufp = fip->fi_list;
        for (fd = 0; fd < fip->fi_nfiles; fd++, ufp++) {
                if ((fp = ufp->uf_file) != NULL) {
                        ufp->uf_file = NULL;
                        if (ufp->uf_portfd != NULL) {
                                portfd_t *pfd;
                                /* remove event port association */
                                pfd = ufp->uf_portfd;
                                ufp->uf_portfd = NULL;
                                port_close_fd(pfd);
                        }
                        ASSERT(ufp->uf_fpollinfo == NULL);
                        (void) closef(fp);
                }
        }

        kmem_free(fip->fi_list, fip->fi_nfiles * sizeof (uf_entry_t));
        fip->fi_list = NULL;
        fip->fi_nfiles = 0;
        while (fip->fi_rlist != NULL) {
                uf_rlist_t *urp = fip->fi_rlist;
                fip->fi_rlist = urp->ur_next;
                kmem_free(urp->ur_list, urp->ur_nfiles * sizeof (uf_entry_t));
                kmem_free(urp, sizeof (uf_rlist_t));
        }
}

/*
 * Internal form of close.  Decrement reference count on file
 * structure.  Decrement reference count on the vnode following
 * removal of the referencing file structure.
 */
int
closef(file_t *fp)
{
        vnode_t *vp;
        int error;
        int count;
        int flag;
        offset_t offset;

        /*
         * audit close of file (may be exit)
         */
        if (AU_AUDITING())
                audit_closef(fp);
        ASSERT(MUTEX_NOT_HELD(&P_FINFO(curproc)->fi_lock));

        mutex_enter(&fp->f_tlock);

        ASSERT(fp->f_count > 0);

        count = fp->f_count--;
        flag = fp->f_flag;
        offset = fp->f_offset;

        vp = fp->f_vnode;

        error = VOP_CLOSE(vp, flag, count, offset, fp->f_cred, NULL);

        if (count > 1) {
                mutex_exit(&fp->f_tlock);
                return (error);
        }
        ASSERT(fp->f_count == 0);
        mutex_exit(&fp->f_tlock);

        VN_RELE(vp);
        /*
         * deallocate resources to audit_data
         */
        if (audit_active)
                audit_unfalloc(fp);
        crfree(fp->f_cred);
        kmem_cache_free(file_cache, fp);
        return (error);
}
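
/*
 * Note that VOP_CLOSE() is issued on every closef(), with the
 * pre-decrement f_count passed in, so a file system can distinguish
 * the last close (count == 1) from earlier ones; the vnode reference
 * and the file_t itself are released only on the last close.  f_count
 * exceeds 1 when, for example, the process has forked (see fcnt_add()
 * below) or the descriptor was dup(2)'d.
 */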

/*
 * This is a combination of ufalloc() and setf().
 */
int
ufalloc_file(int start, file_t *fp)
{
        proc_t *p = curproc;
        uf_info_t *fip = P_FINFO(p);
        int filelimit;
        uf_entry_t *ufp;
        int nfiles;
        int fd;

        /*
         * The assertion guarantees that the assignment to filelimit
         * below remains correct after the cast to int.
         */
        ASSERT(p->p_fno_ctl <= INT_MAX);
        filelimit = (int)p->p_fno_ctl;

        for (;;) {
                mutex_enter(&fip->fi_lock);
                fd = fd_find(fip, start);
                if (fd >= 0 && fd == fip->fi_badfd) {
                        start = fd + 1;
                        mutex_exit(&fip->fi_lock);
                        continue;
                }
                if ((uint_t)fd < filelimit)
                        break;
                if (fd >= filelimit) {
                        mutex_exit(&fip->fi_lock);
                        mutex_enter(&p->p_lock);
                        (void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE],
                            p->p_rctls, p, RCA_SAFE);
                        mutex_exit(&p->p_lock);
                        return (-1);
                }
                /* fd_find() returned -1 */
                nfiles = fip->fi_nfiles;
                mutex_exit(&fip->fi_lock);
                flist_grow(MAX(start, nfiles));
        }

        UF_ENTER(ufp, fip, fd);
        fd_reserve(fip, fd, 1);
        ASSERT(ufp->uf_file == NULL);
        ufp->uf_file = fp;
        UF_EXIT(ufp);
        mutex_exit(&fip->fi_lock);
        return (fd);
}

/*
 * Allocate a user file descriptor greater than or equal to "start".
 */
int
ufalloc(int start)
{
        return (ufalloc_file(start, NULL));
}

/*
 * Check that a future allocation of count fds on proc p has a good
 * chance of succeeding.  If not, do rctl processing as if we'd failed
 * the allocation.
 *
 * Our caller must guarantee that p cannot disappear underneath us.
 */
int
ufcanalloc(proc_t *p, uint_t count)
{
        uf_info_t *fip = P_FINFO(p);
        int filelimit;
        int current;

        if (count == 0)
                return (1);

        ASSERT(p->p_fno_ctl <= INT_MAX);
        filelimit = (int)p->p_fno_ctl;

        mutex_enter(&fip->fi_lock);
        current = flist_nalloc(fip);            /* # of in-use descriptors */
        mutex_exit(&fip->fi_lock);

        /*
         * If count is a positive integer, the worst that can happen is
         * an overflow to a negative value, which is caught by the >= 0 check.
         */
        current += count;
        if (count <= INT_MAX && current >= 0 && current <= filelimit)
                return (1);

        mutex_enter(&p->p_lock);
        (void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE],
            p->p_rctls, p, RCA_SAFE);
        mutex_exit(&p->p_lock);
        return (0);
}

/*
 * Allocate a user file descriptor and a file structure.
 * Initialize the descriptor to point at the file structure.
 * If fdp is NULL, the user file descriptor will not be allocated.
 */
int
falloc(vnode_t *vp, int flag, file_t **fpp, int *fdp)
{
        file_t *fp;
        int fd;

        if (fdp) {
                if ((fd = ufalloc(0)) == -1)
                        return (EMFILE);
        }
        fp = kmem_cache_alloc(file_cache, KM_SLEEP);
        /*
         * Note: falloc returns the fp locked
         */
        mutex_enter(&fp->f_tlock);
        fp->f_count = 1;
        fp->f_flag = (ushort_t)flag;
        fp->f_flag2 = (flag & (FSEARCH|FEXEC)) >> 16;
        fp->f_vnode = vp;
        fp->f_offset = 0;
        fp->f_audit_data = 0;
        crhold(fp->f_cred = CRED());
        /*
         * allocate resources to audit_data
         */
        if (audit_active)
                audit_falloc(fp);
        *fpp = fp;
        if (fdp)
                *fdp = fd;
        return (0);
}
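
/*
 * A minimal sketch of the protocol implied by the comment above, with
 * vp standing in for a vnode obtained elsewhere (fassign() below is a
 * complete in-file example): the caller receives fp locked, finishes
 * initializing it, drops f_tlock, and only then publishes the
 * descriptor with setf():
 *
 *      if ((error = falloc(NULL, FREAD | FWRITE, &fp, &fd)) != 0)
 *              return (error);
 *      fp->f_vnode = vp;
 *      mutex_exit(&fp->f_tlock);
 *      setf(fd, fp);
 */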

/*ARGSUSED*/
static int
file_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
        file_t *fp = buf;

        mutex_init(&fp->f_tlock, NULL, MUTEX_DEFAULT, NULL);
        return (0);
}

/*ARGSUSED*/
static void
file_cache_destructor(void *buf, void *cdrarg)
{
        file_t *fp = buf;

        mutex_destroy(&fp->f_tlock);
}

void
finit()
{
        file_cache = kmem_cache_create("file_cache", sizeof (file_t), 0,
            file_cache_constructor, file_cache_destructor, NULL, NULL, NULL, 0);
}

void
unfalloc(file_t *fp)
{
        ASSERT(MUTEX_HELD(&fp->f_tlock));
        if (--fp->f_count <= 0) {
                /*
                 * deallocate resources to audit_data
                 */
                if (audit_active)
                        audit_unfalloc(fp);
                crfree(fp->f_cred);
                mutex_exit(&fp->f_tlock);
                kmem_cache_free(file_cache, fp);
        } else
                mutex_exit(&fp->f_tlock);
}

/*
 * Given a file descriptor, set the user's
 * file pointer to the given parameter.
 */
void
setf(int fd, file_t *fp)
{
        uf_info_t *fip = P_FINFO(curproc);
        uf_entry_t *ufp;

        if (AU_AUDITING())
                audit_setf(fp, fd);

        if (fp == NULL) {
                mutex_enter(&fip->fi_lock);
                UF_ENTER(ufp, fip, fd);
                fd_reserve(fip, fd, -1);
                mutex_exit(&fip->fi_lock);
        } else {
                UF_ENTER(ufp, fip, fd);
                ASSERT(ufp->uf_busy);
        }
        ASSERT(ufp->uf_fpollinfo == NULL);
        ASSERT(ufp->uf_flag == 0);
        ufp->uf_file = fp;
        cv_broadcast(&ufp->uf_wanted_cv);
        UF_EXIT(ufp);
}

/*
 * Given a file descriptor, return the file table flags, plus,
 * if this is a socket in asynchronous mode, the FASYNC flag.
 * getf() may or may not have been called before calling f_getfl().
 */
int
f_getfl(int fd, int *flagp)
{
        uf_info_t *fip = P_FINFO(curproc);
        uf_entry_t *ufp;
        file_t *fp;
        int error;

        if ((uint_t)fd >= fip->fi_nfiles)
                error = EBADF;
        else {
                UF_ENTER(ufp, fip, fd);
                if ((fp = ufp->uf_file) == NULL)
                        error = EBADF;
                else {
                        vnode_t *vp = fp->f_vnode;
                        int flag = fp->f_flag | (fp->f_flag2 << 16);

                        /*
                         * BSD fcntl() FASYNC compatibility.
                         */
                        if (vp->v_type == VSOCK)
                                flag |= sock_getfasync(vp);
                        *flagp = flag;
                        error = 0;
                }
                UF_EXIT(ufp);
        }

        return (error);
}

/*
 * Given a file descriptor, return the user's file flags.
 * Force the FD_CLOEXEC flag for writable self-open /proc files.
 * getf() may or may not have been called before calling f_getfd_error().
 */
int
f_getfd_error(int fd, int *flagp)
{
        uf_info_t *fip = P_FINFO(curproc);
        uf_entry_t *ufp;
        file_t *fp;
        int flag;
        int error;

        if ((uint_t)fd >= fip->fi_nfiles)
                error = EBADF;
        else {
                UF_ENTER(ufp, fip, fd);
                if ((fp = ufp->uf_file) == NULL)
                        error = EBADF;
                else {
                        flag = ufp->uf_flag;
                        if ((fp->f_flag & FWRITE) && pr_isself(fp->f_vnode))
                                flag |= FD_CLOEXEC;
                        *flagp = flag;
                        error = 0;
                }
                UF_EXIT(ufp);
        }

        return (error);
}

/*
 * getf() must have been called before calling f_getfd().
 */
char
f_getfd(int fd)
{
        int flag = 0;
        (void) f_getfd_error(fd, &flag);
        return ((char)flag);
}

/*
 * Given a file descriptor and file flags, set the user's file flags.
 * At present, the only valid flag is FD_CLOEXEC.
 * getf() may or may not have been called before calling f_setfd_error().
 */
int
f_setfd_error(int fd, int flags)
{
        uf_info_t *fip = P_FINFO(curproc);
        uf_entry_t *ufp;
        int error;

        if ((uint_t)fd >= fip->fi_nfiles)
                error = EBADF;
        else {
                UF_ENTER(ufp, fip, fd);
                if (ufp->uf_file == NULL)
                        error = EBADF;
                else {
                        ufp->uf_flag = flags & FD_CLOEXEC;
                        error = 0;
                }
                UF_EXIT(ufp);
        }
        return (error);
}

void
f_setfd(int fd, char flags)
{
        (void) f_setfd_error(fd, flags);
}

#define BADFD_MIN       3
#define BADFD_MAX       255

/*
 * Attempt to allocate a file descriptor which is bad and which
 * is "poison" to the application.  It cannot be closed (except
 * on exec), allocated for a different use, etc.
 */
int
f_badfd(int start, int *fdp, int action)
{
        int fdr;
        int badfd;
        uf_info_t *fip = P_FINFO(curproc);

#ifdef _LP64
        /* No restrictions on 64 bit _file */
        if (get_udatamodel() != DATAMODEL_ILP32)
                return (EINVAL);
#endif

        if (start > BADFD_MAX || start < BADFD_MIN)
                return (EINVAL);

        if (action >= NSIG || action < 0)
                return (EINVAL);

        mutex_enter(&fip->fi_lock);
        badfd = fip->fi_badfd;
        mutex_exit(&fip->fi_lock);

        if (badfd != -1)
                return (EAGAIN);

        fdr = ufalloc(start);

        if (fdr > BADFD_MAX) {
                setf(fdr, NULL);
                return (EMFILE);
        }
        if (fdr < 0)
                return (EMFILE);

        mutex_enter(&fip->fi_lock);
        if (fip->fi_badfd != -1) {
                /* Lost race */
                mutex_exit(&fip->fi_lock);
                setf(fdr, NULL);
                return (EAGAIN);
        }
        fip->fi_action = action;
        fip->fi_badfd = fdr;
        mutex_exit(&fip->fi_lock);
        setf(fdr, NULL);

        *fdp = fdr;

        return (0);
}

/*
 * Allocate a file descriptor and assign it to the vnode "*vpp",
 * performing the usual open protocol upon it and returning the
 * file descriptor allocated.  It is the responsibility of the
 * caller to dispose of "*vpp" if any error occurs.
 */
int
fassign(vnode_t **vpp, int mode, int *fdp)
{
        file_t *fp;
        int error;
        int fd;

        if (error = falloc((vnode_t *)NULL, mode, &fp, &fd))
                return (error);
        if (error = VOP_OPEN(vpp, mode, fp->f_cred, NULL)) {
                setf(fd, NULL);
                unfalloc(fp);
                return (error);
        }
        fp->f_vnode = *vpp;
        mutex_exit(&fp->f_tlock);
        /*
         * Fill in the slot falloc reserved.
         */
        setf(fd, fp);
        *fdp = fd;
        return (0);
}

/*
 * When a process forks it must increment the f_count of all file pointers
 * since there is a new process pointing at them.  fcnt_add(fip, 1) does this.
 * Since we are called when there is only 1 active lwp we don't need to
 * hold fi_lock or any uf_lock.  If the fork fails, fork_fail() calls
 * fcnt_add(fip, -1) to restore the counts.
 */
void
fcnt_add(uf_info_t *fip, int incr)
{
        int i;
        uf_entry_t *ufp;
        file_t *fp;

        ufp = fip->fi_list;
        for (i = 0; i < fip->fi_nfiles; i++, ufp++) {
                if ((fp = ufp->uf_file) != NULL) {
                        mutex_enter(&fp->f_tlock);
                        ASSERT((incr == 1 && fp->f_count >= 1) ||
                            (incr == -1 && fp->f_count >= 2));
                        fp->f_count += incr;
                        mutex_exit(&fp->f_tlock);
                }
        }
}

/*
 * This is called from exec to close all fd's that have the FD_CLOEXEC flag
 * set and also to close all self-open for write /proc file descriptors.
 */
void
close_exec(uf_info_t *fip)
{
        int fd;
        file_t *fp;
        fpollinfo_t *fpip;
        uf_entry_t *ufp;
        portfd_t *pfd;

        ufp = fip->fi_list;
        for (fd = 0; fd < fip->fi_nfiles; fd++, ufp++) {
                if ((fp = ufp->uf_file) != NULL &&
                    ((ufp->uf_flag & FD_CLOEXEC) ||
                    ((fp->f_flag & FWRITE) && pr_isself(fp->f_vnode)))) {
                        fpip = ufp->uf_fpollinfo;
                        mutex_enter(&fip->fi_lock);
                        mutex_enter(&ufp->uf_lock);
                        fd_reserve(fip, fd, -1);
                        mutex_exit(&fip->fi_lock);
                        ufp->uf_file = NULL;
                        ufp->uf_fpollinfo = NULL;
                        ufp->uf_flag = 0;
                        /*
                         * We may need to cleanup some cached poll states
                         * in t_pollstate before the fd can be reused. It
                         * is important that we don't access a stale thread
                         * structure. We will do the cleanup in two
                         * phases to avoid deadlock and holding uf_lock for
                         * too long. In phase 1, hold the uf_lock and call
                         * pollblockexit() to set state in t_pollstate struct
                         * so that a thread does not exit on us. In phase 2,
                         * we drop the uf_lock and call pollcacheclean().
                         */
                        pfd = ufp->uf_portfd;
                        ufp->uf_portfd = NULL;
                        if (fpip != NULL)
                                pollblockexit(fpip);
                        mutex_exit(&ufp->uf_lock);
                        if (fpip != NULL)
                                pollcacheclean(fpip, fd);
                        if (pfd)
                                port_close_fd(pfd);
                        (void) closef(fp);
                }
        }

        /* Reset bad fd */
        fip->fi_badfd = -1;
        fip->fi_action = -1;
}
1461 
1462 /*
1463  * Utility function called by most of the *at() system call interfaces.
1464  *
1465  * Generate a starting vnode pointer for an (fd, path) pair where 'fd'
1466  * is an open file descriptor for a directory to be used as the starting
1467  * point for the lookup of the relative pathname 'path' (or, if path is
1468  * NULL, generate a vnode pointer for the direct target of the operation).
1469  *
1470  * If we successfully return a non-NULL startvp, it has been the target
1471  * of VN_HOLD() and the caller must call VN_RELE() on it.
1472  */
1473 int
1474 fgetstartvp(int fd, char *path, vnode_t **startvpp)
1475 {
1476         vnode_t         *startvp;
1477         file_t          *startfp;
1478         char            startchar;
1479 
1480         if (fd == AT_FDCWD && path == NULL)
1481                 return (EFAULT);
1482 
1483         if (fd == AT_FDCWD) {
1484                 /*
1485                  * Start from the current working directory.
1486                  */
1487                 startvp = NULL;
1488         } else {
1489                 if (path == NULL)
1490                         startchar = '\0';
1491                 else if (copyin(path, &startchar, sizeof (char)))
1492                         return (EFAULT);
1493 
1494                 if (startchar == '/') {
1495                         /*
1496                          * 'path' is an absolute pathname.
1497                          */
1498                         startvp = NULL;
1499                 } else {
1500                         /*
1501                          * 'path' is a relative pathname or we will
1502                          * be applying the operation to 'fd' itself.
1503                          */
1504                         if ((startfp = getf(fd)) == NULL)
1505                                 return (EBADF);
1506                         startvp = startfp->f_vnode;
1507                         VN_HOLD(startvp);
1508                         releasef(fd);
1509                 }
1510         }
1511         *startvpp = startvp;
1512         return (0);
1513 }
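
/*
 * Illustrative sketch (not part of the original source): a typical
 * *at() interface resolves its start vnode with fgetstartvp() and must
 * VN_RELE() the result, per the contract described above.  The function
 * below is hypothetical; only fgetstartvp(), lookupnameat() and
 * VN_RELE() are real interfaces.
 */
static int
example_lookupat(int fd, char *path, vnode_t **vpp)
{
        vnode_t *startvp;
        int error;

        if ((error = fgetstartvp(fd, path, &startvp)) != 0)
                return (error);
        /* Resolve 'path' relative to startvp (or to CWD if it is NULL). */
        error = lookupnameat(path, UIO_USERSPACE, FOLLOW, NULLVPP,
            vpp, startvp);
        if (startvp != NULL)
                VN_RELE(startvp);       /* drop the hold from fgetstartvp() */
        return (error);
}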
1514 
1515 /*
1516  * Called from fchownat() and fchmodat() to set ownership and mode.
1517  * The contents of *vap must be set before calling here.
1518  */
1519 int
1520 fsetattrat(int fd, char *path, int flags, struct vattr *vap)
1521 {
1522         vnode_t         *startvp;
1523         vnode_t         *vp;
1524         int             error;
1525 
1526         /*
1527          * Since we are never called to set the size of a file, we don't
1528          * need to check for non-blocking locks (via nbl_need_check(vp)).
1529          */
1530         ASSERT(!(vap->va_mask & AT_SIZE));
1531 
1532         if ((error = fgetstartvp(fd, path, &startvp)) != 0)
1533                 return (error);
1534         if (AU_AUDITING() && startvp != NULL)
1535                 audit_setfsat_path(1);
1536 
1537         /*
1538          * Do the lookup for fchownat()/fchmodat() when path is not NULL.
1539          */
1540         if (path != NULL) {
1541                 if ((error = lookupnameat(path, UIO_USERSPACE,
1542                     (flags == AT_SYMLINK_NOFOLLOW) ?
1543                     NO_FOLLOW : FOLLOW,
1544                     NULLVPP, &vp, startvp)) != 0) {
1545                         if (startvp != NULL)
1546                                 VN_RELE(startvp);
1547                         return (error);
1548                 }
1549         } else {
1550                 vp = startvp;
1551                 ASSERT(vp);
1552                 VN_HOLD(vp);
1553         }
1554 
1555         if (vn_is_readonly(vp)) {
1556                 error = EROFS;
1557         } else {
1558                 error = VOP_SETATTR(vp, vap, 0, CRED(), NULL);
1559         }
1560 
1561         if (startvp != NULL)
1562                 VN_RELE(startvp);
1563         VN_RELE(vp);
1564 
1565         return (error);
1566 }
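
/*
 * Illustrative sketch (not part of the original source): a caller such
 * as fchmodat() must fill in *vap before calling fsetattrat(), per the
 * comment above.  The simplified, hypothetical caller below sets only
 * the mode; MODEMASK, AT_MODE and struct vattr are real, the rest is
 * invented for illustration.
 */
static int
example_chmodat(int fd, char *path, int mode, int flags)
{
        struct vattr vattr;

        vattr.va_mode = mode & MODEMASK;
        vattr.va_mask = AT_MODE;        /* only the mode is being set */
        return (fsetattrat(fd, path, flags, &vattr));
}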
1567 
1568 /*
1569  * Return true if the given vnode is referenced by any
1570  * entry in the current process's file descriptor table.
1571  */
1572 int
1573 fisopen(vnode_t *vp)
1574 {
1575         int fd;
1576         file_t *fp;
1577         vnode_t *ovp;
1578         uf_info_t *fip = P_FINFO(curproc);
1579         uf_entry_t *ufp;
1580 
1581         mutex_enter(&fip->fi_lock);
1582         for (fd = 0; fd < fip->fi_nfiles; fd++) {
1583                 UF_ENTER(ufp, fip, fd);
1584                 if ((fp = ufp->uf_file) != NULL &&
1585                     (ovp = fp->f_vnode) != NULL && VN_CMP(vp, ovp)) {
1586                         UF_EXIT(ufp);
1587                         mutex_exit(&fip->fi_lock);
1588                         return (1);
1589                 }
1590                 UF_EXIT(ufp);
1591         }
1592         mutex_exit(&fip->fi_lock);
1593         return (0);
1594 }
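
/*
 * Illustrative sketch (not part of the original source): fisopen()
 * answers "does curproc still hold this vnode open?".  A hypothetical
 * consumer might use it to refuse an operation on a vnode the process
 * is still using; EBUSY is an invented policy here, not taken from any
 * real caller.
 */
static int
example_refuse_if_open(vnode_t *vp)
{
        if (fisopen(vp))
                return (EBUSY);
        return (0);
}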
1595 
1596 /*
1597  * Return zero if at least one file currently open (by curproc) shouldn't
1598  * be allowed to change zones; otherwise return nonzero.
1599  */
1600 int
1601 files_can_change_zones(void)
1602 {
1603         int fd;
1604         file_t *fp;
1605         uf_info_t *fip = P_FINFO(curproc);
1606         uf_entry_t *ufp;
1607 
1608         mutex_enter(&fip->fi_lock);
1609         for (fd = 0; fd < fip->fi_nfiles; fd++) {
1610                 UF_ENTER(ufp, fip, fd);
1611                 if ((fp = ufp->uf_file) != NULL &&
1612                     !vn_can_change_zones(fp->f_vnode)) {
1613                         UF_EXIT(ufp);
1614                         mutex_exit(&fip->fi_lock);
1615                         return (0);
1616                 }
1617                 UF_EXIT(ufp);
1618         }
1619         mutex_exit(&fip->fi_lock);
1620         return (1);
1621 }
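
/*
 * Illustrative sketch (not part of the original source): fisopen() and
 * files_can_change_zones() share one scanning pattern: hold fi_lock,
 * then UF_ENTER/UF_EXIT each slot while testing its file_t.  The
 * hypothetical generalization below applies an arbitrary predicate to
 * every open file and reports whether any matched.
 */
static int
example_fd_any(int (*pred)(file_t *))
{
        int fd;
        file_t *fp;
        uf_info_t *fip = P_FINFO(curproc);
        uf_entry_t *ufp;

        mutex_enter(&fip->fi_lock);
        for (fd = 0; fd < fip->fi_nfiles; fd++) {
                UF_ENTER(ufp, fip, fd);
                if ((fp = ufp->uf_file) != NULL && pred(fp)) {
                        UF_EXIT(ufp);
                        mutex_exit(&fip->fi_lock);
                        return (1);     /* predicate matched some open fd */
                }
                UF_EXIT(ufp);
        }
        mutex_exit(&fip->fi_lock);
        return (0);
}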
1622 
1623 #ifdef DEBUG
1624 
1625 /*
1626  * The following functions are only used in ASSERT()s elsewhere.
1627  * They do not modify the state of the system.
1628  */
1629 
1630 /*
1631  * Return true (1) if the current thread is in the fpollinfo
1632  * list for this file descriptor, else false (0).
1633  */
1634 static int
1635 curthread_in_plist(uf_entry_t *ufp)
1636 {
1637         fpollinfo_t *fpip;
1638 
1639         ASSERT(MUTEX_HELD(&ufp->uf_lock));
1640         for (fpip = ufp->uf_fpollinfo; fpip; fpip = fpip->fp_next)
1641                 if (fpip->fp_thread == curthread)
1642                         return (1);
1643         return (0);
1644 }
1645 
1646 /*
1647  * Sanity check to make sure that after lwp_exit(),
1648  * curthread does not appear on any fd's fpollinfo list.
1649  */
1650 void
1651 checkfpollinfo(void)
1652 {
1653         int fd;
1654         uf_info_t *fip = P_FINFO(curproc);
1655         uf_entry_t *ufp;
1656 
1657         mutex_enter(&fip->fi_lock);
1658         for (fd = 0; fd < fip->fi_nfiles; fd++) {
1659                 UF_ENTER(ufp, fip, fd);
1660                 ASSERT(!curthread_in_plist(ufp));
1661                 UF_EXIT(ufp);
1662         }
1663         mutex_exit(&fip->fi_lock);
1664 }
1665 
1666 /*
1667  * Return true (1) if the current thread is in the fpollinfo
1668  * list for this file descriptor, else false (0).
1669  * This is the same as curthread_in_plist(), but may be
1670  * called without uf_lock held; it acquires the lock itself.
1671  */
1672 int
1673 infpollinfo(int fd)
1674 {
1675         uf_info_t *fip = P_FINFO(curproc);
1676         uf_entry_t *ufp;
1677         int rc;
1678 
1679         UF_ENTER(ufp, fip, fd);
1680         rc = curthread_in_plist(ufp);
1681         UF_EXIT(ufp);
1682         return (rc);
1683 }
1684 
1685 #endif  /* DEBUG */
1686 
1687 /*
1688  * Add curthread to the fd's fpollinfo list, recording that this fd is
1689  * currently in the thread's poll cache. Each lwp polling this file
1690  * descriptor should call this routine once.
1691  */
1692 void
1693 addfpollinfo(int fd)
1694 {
1695         struct uf_entry *ufp;
1696         fpollinfo_t *fpip;
1697         uf_info_t *fip = P_FINFO(curproc);
1698 
1699         fpip = kmem_zalloc(sizeof (fpollinfo_t), KM_SLEEP);
1700         fpip->fp_thread = curthread;
1701         UF_ENTER(ufp, fip, fd);
1702         /*
1703          * Assert we are not already on the list, that is, that
1704          * this lwp did not call addfpollinfo twice for the same fd.
1705          */
1706         ASSERT(!curthread_in_plist(ufp));
1707         /*
1708          * addfpollinfo is always done inside the getf/releasef pair.
1709          */
1710         ASSERT(ufp->uf_refcnt >= 1);
1711         fpip->fp_next = ufp->uf_fpollinfo;
1712         ufp->uf_fpollinfo = fpip;
1713         UF_EXIT(ufp);
1714 }
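
/*
 * Illustrative sketch (not part of the original source): addfpollinfo()
 * must run inside a getf()/releasef() pair, per the uf_refcnt assertion
 * above, and is undone by delfpollinfo() when the thread's poll cache
 * entry for the fd is torn down.  A hypothetical caller:
 */
static int
example_cache_fd_for_poll(int fd)
{
        if (getf(fd) == NULL)
                return (EBADF);
        addfpollinfo(fd);       /* record curthread in the fd's list */
        /* ... set up this thread's poll cache entry for fd ... */
        releasef(fd);
        return (0);
}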
1715 
1716 /*
1717  * Delete curthread from fpollinfo list if it is there.
1718  */
1719 void
1720 delfpollinfo(int fd)
1721 {
1722         struct uf_entry *ufp;
1723         struct fpollinfo *fpip;
1724         struct fpollinfo **fpipp;
1725         uf_info_t *fip = P_FINFO(curproc);
1726 
1727         UF_ENTER(ufp, fip, fd);
1728         for (fpipp = &ufp->uf_fpollinfo;
1729             (fpip = *fpipp) != NULL;
1730             fpipp = &fpip->fp_next) {
1731                 if (fpip->fp_thread == curthread) {
1732                         *fpipp = fpip->fp_next;
1733                         kmem_free(fpip, sizeof (fpollinfo_t));
1734                         break;
1735                 }
1736         }
1737         /*
1738          * Assert that we are not still on the list, that is, that
1739          * this lwp did not call addfpollinfo twice for the same fd.
1740          */
1741         ASSERT(!curthread_in_plist(ufp));
1742         UF_EXIT(ufp);
1743 }
1744 
1745 /*
1746  * fd is associated with a port. pfd is a pointer to the fd entry in the
1747  * cache of the port.
1748  */
1750 void
1751 addfd_port(int fd, portfd_t *pfd)
1752 {
1753         struct uf_entry *ufp;
1754         uf_info_t *fip = P_FINFO(curproc);
1755 
1756         UF_ENTER(ufp, fip, fd);
1757         /*
1758          * addfd_port is always done inside the getf/releasef pair.
1759          */
1760         ASSERT(ufp->uf_refcnt >= 1);
1761         if (ufp->uf_portfd == NULL) {
1762                 /* first entry */
1763                 ufp->uf_portfd = pfd;
1764                 pfd->pfd_next = NULL;
1765         } else {
1766                 pfd->pfd_next = ufp->uf_portfd;
1767                 ufp->uf_portfd = pfd;
1768                 pfd->pfd_next->pfd_prev = pfd;
1769         }
1770         UF_EXIT(ufp);
1771 }
1772 
1773 void
1774 delfd_port(int fd, portfd_t *pfd)
1775 {
1776         struct uf_entry *ufp;
1777         uf_info_t *fip = P_FINFO(curproc);
1778 
1779         UF_ENTER(ufp, fip, fd);
1780         /*
1781          * delfd_port is always done inside the getf/releasef pair.
1782          */
1783         ASSERT(ufp->uf_refcnt >= 1);
1784         if (ufp->uf_portfd == pfd) {
1785                 /* remove first entry */
1786                 ufp->uf_portfd = pfd->pfd_next;
1787         } else {
1788                 pfd->pfd_prev->pfd_next = pfd->pfd_next;
1789                 if (pfd->pfd_next != NULL)
1790                         pfd->pfd_next->pfd_prev = pfd->pfd_prev;
1791         }
1792         UF_EXIT(ufp);
1793 }
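
/*
 * Note on the list discipline above: uf_portfd is a doubly-linked
 * list in which the head element's pfd_prev is never maintained.
 * addfd_port() sets pfd_prev only on the displaced old head, and
 * delfd_port() dereferences pfd_prev only for non-head entries, so
 * pfd_prev is valid exactly for the elements not at the head of
 * the list.
 */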
1794 
1795 static void
1796 port_close_fd(portfd_t *pfd)
1797 {
1798         portfd_t        *pfdn;
1799 
1800         /*
1801          * At this point, no other thread should be accessing
1802          * the portfd_t list for this fd: the uf_file and
1803          * uf_portfd pointers in the fd's uf_entry_t have
1804          * already been set to NULL.
1805          */
1806         for (; pfd != NULL; pfd = pfdn) {
1807                 pfdn = pfd->pfd_next;
1808                 port_close_pfd(pfd);
1809         }
1810 }