Print this page
    
Code review comments from jeffpc
7029 want per-process exploit mitigation features (secflags)
7030 want basic address space layout randomization (aslr)
7031 noexec_user_stack should be a secflag
7032 want a means to forbid mappings around NULL.
    
      
        | Split | Close | 
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/os/fork.c
          +++ new/usr/src/uts/common/os/fork.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   * Copyright 2013, Joyent, Inc. All rights reserved.
  25   25   */
  26   26  
  27   27  /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  28   28  /*        All Rights Reserved   */
  29   29  
  30   30  #include <sys/types.h>
  31   31  #include <sys/param.h>
  32   32  #include <sys/sysmacros.h>
  33   33  #include <sys/signal.h>
  34   34  #include <sys/cred.h>
  35   35  #include <sys/policy.h>
  36   36  #include <sys/user.h>
  37   37  #include <sys/systm.h>
  38   38  #include <sys/cpuvar.h>
  39   39  #include <sys/vfs.h>
  40   40  #include <sys/vnode.h>
  41   41  #include <sys/file.h>
  42   42  #include <sys/errno.h>
  43   43  #include <sys/time.h>
  44   44  #include <sys/proc.h>
  45   45  #include <sys/cmn_err.h>
  46   46  #include <sys/acct.h>
  47   47  #include <sys/tuneable.h>
  48   48  #include <sys/class.h>
  49   49  #include <sys/kmem.h>
  50   50  #include <sys/session.h>
  51   51  #include <sys/ucontext.h>
  52   52  #include <sys/stack.h>
  53   53  #include <sys/procfs.h>
  54   54  #include <sys/prsystm.h>
  55   55  #include <sys/vmsystm.h>
  56   56  #include <sys/vtrace.h>
  57   57  #include <sys/debug.h>
  58   58  #include <sys/shm_impl.h>
  59   59  #include <sys/door_data.h>
  60   60  #include <vm/as.h>
  61   61  #include <vm/rm.h>
  62   62  #include <c2/audit.h>
  63   63  #include <sys/var.h>
  64   64  #include <sys/schedctl.h>
  65   65  #include <sys/utrap.h>
  66   66  #include <sys/task.h>
  67   67  #include <sys/resource.h>
  68   68  #include <sys/cyclic.h>
  69   69  #include <sys/lgrp.h>
  70   70  #include <sys/rctl.h>
  71   71  #include <sys/contract_impl.h>
  72   72  #include <sys/contract/process_impl.h>
  73   73  #include <sys/list.h>
  74   74  #include <sys/dtrace.h>
  75   75  #include <sys/pool.h>
  76   76  #include <sys/zone.h>
  77   77  #include <sys/sdt.h>
  78   78  #include <sys/class.h>
  79   79  #include <sys/corectl.h>
  80   80  #include <sys/brand.h>
  81   81  #include <sys/fork.h>
  82   82  
  83   83  static int64_t cfork(int, int, int);    /* (isvfork, isfork1, flags) */
  84   84  static int getproc(proc_t **, pid_t, uint_t);
           /*
            * Flag values for the last (uint_t) argument of getproc(); cfork()
            * passes GETPROC_USER.  NOTE(review): GETPROC_KERNEL is presumably
            * for kernel-created processes -- confirm against getproc().
            */
  85   85  #define GETPROC_USER    0x0
  86   86  #define GETPROC_KERNEL  0x1
  87   87  
           /* Cleanup helpers called from the failure paths of cfork(). */
  88   88  static void fork_fail(proc_t *);
  89   89  static void forklwp_fail(proc_t *);
  90   90  
           /*
            * NOTE(review): fork_fail_pending and process_cache are not referenced
            * by vfork(), forksys(), cfork() or fork_fail(); their users are
            * elsewhere -- confirm before changing either.
            */
  91   91  int fork_fail_pending;
  92   92  
  93   93  extern struct kmem_cache *process_cache;
  94   94  
  95   95  /*
  96   96   * The vfork() system call trap is no longer invoked by libc.
  97   97   * It is retained only for the benefit of applications running
  98   98   * within a solaris10 branded zone.  It should be eliminated
  99   99   * when we no longer support solaris10 branded zones.
            *
            * Behaves like forksys() subcode 2 with flags == 0: it sets
            * t_post_sys on the calling thread and calls cfork() with
            * isvfork = 1, isfork1 = 1, flags = 0.  Returns whatever cfork()
            * returns.
 100  100   */
 101  101  int64_t
 102  102  vfork(void)
 103  103  {
 104  104          curthread->t_post_sys = 1;      /* so vfwait() will be called */
 105  105          return (cfork(1, 1, 0));
 106  106  }
 107  107  
 108  108  /*
 109  109   * forksys system call - forkx, forkallx, vforkx.  This is the
 110  110   * interface invoked by libc for fork1(), forkall(), and vfork()
            *
            * subcode selects the variant:
            *   0 - forkx:    cfork(isvfork = 0, isfork1 = 1, flags)
            *   1 - forkallx: cfork(isvfork = 0, isfork1 = 0, flags)
            *   2 - vforkx:   cfork(isvfork = 1, isfork1 = 1, flags), after
            *                 setting t_post_sys on the calling thread
            * Any other subcode fails with EINVAL via set_errno().
            *
            * flags is handed to cfork() unchanged; cfork() is what rejects
            * unsupported flag bits.
 111  111   */
 112  112  int64_t
 113  113  forksys(int subcode, int flags)
 114  114  {
 115  115          switch (subcode) {
 116  116          case 0:
 117  117                  return (cfork(0, 1, flags));    /* forkx(flags) */
 118  118          case 1:
 119  119                  return (cfork(0, 0, flags));    /* forkallx(flags) */
 120  120          case 2:
 121  121                  curthread->t_post_sys = 1;      /* so vfwait() will be called */
 122  122                  return (cfork(1, 1, flags));    /* vforkx(flags) */
 123  123          default:
 124  124                  return ((int64_t)set_errno(EINVAL));
 125  125          }
 126  126  }
 127  127  
 128  128  /* ARGSUSED */
           /*
            * Common implementation behind vfork() and forksys().
            *
            *   isvfork  nonzero for vfork behaviour: the child is given the
            *            parent's address space (cp->p_as = p->p_as, SVFORK set)
            *            instead of an as_dup() copy, and the parent is marked
            *            SVFWAIT / T_VFPARENT before returning.
            *   isfork1  nonzero to clone only the calling lwp; zero (forkall)
            *            clones every lwp found in the parent's lwp directory
            *            and replicates zombie lwp entries.
            *   flags    FORK_NOSIGCHLD and/or FORK_WAITPID; any other bit
            *            fails with EINVAL.
            *
            * On success the parent's return value is r.r_vals with
            * r_val1 = child pid and r_val2 = 0; the child's clone lwp is given
            * (parent pid, 1) through lwp_setrval() or the brand's
            * b_lwp_setrval().
            *
            * On failure returns set_errno(error), where error is one of:
            * EINVAL (bad flags), ENOTSUP (caller is the /proc agent lwp), the
            * error from secpolicy_basic_fork(), EINTR (holdlwps() failed),
            * ENOMEM (as_dup() reported ENOMEM), or EAGAIN (getproc() failure,
            * any other as_dup() error, or anything reaching forklwperr).
            */
 129  129  static int64_t
 130  130  cfork(int isvfork, int isfork1, int flags)
 131  131  {
 132  132          proc_t *p = ttoproc(curthread);
 133  133          struct as *as;
 134  134          proc_t *cp, **orphpp;
 135  135          klwp_t *clone;
 136  136          kthread_t *t;
 137  137          task_t *tk;
 138  138          rval_t  r;
 139  139          int error;
 140  140          int i;
 141  141          rctl_set_t *dup_set;
 142  142          rctl_alloc_gp_t *dup_gp;
 143  143          rctl_entity_p_t e;
 144  144          lwpdir_t *ldp;
 145  145          lwpent_t *lep;
 146  146          lwpent_t *clep;
 147  147  
 148  148          /*
 149  149           * Allow only these two flags.
 150  150           */
 151  151          if ((flags & ~(FORK_NOSIGCHLD | FORK_WAITPID)) != 0) {
 152  152                  error = EINVAL;
 153  153                  atomic_inc_32(&curproc->p_zone->zone_ffmisc);
 154  154                  goto forkerr;
 155  155          }
 156  156  
 157  157          /*
 158  158           * fork is not supported for the /proc agent lwp.
 159  159           */
 160  160          if (curthread == p->p_agenttp) {
 161  161                  error = ENOTSUP;
 162  162                  atomic_inc_32(&curproc->p_zone->zone_ffmisc);
 163  163                  goto forkerr;
 164  164          }
 165  165  
 166  166          if ((error = secpolicy_basic_fork(CRED())) != 0) {
 167  167                  atomic_inc_32(&p->p_zone->zone_ffmisc);
 168  168                  goto forkerr;
 169  169          }
 170  170  
 171  171          /*
 172  172           * If the calling lwp is doing a fork1() then the
 173  173           * other lwps in this process are not duplicated and
 174  174           * don't need to be held where their kernel stacks can be
 175  175           * cloned.  If doing forkall(), the process is held with
 176  176           * SHOLDFORK, so that the lwps are at a point where their
 177  177           * stacks can be copied which is on entry or exit from
 178  178           * the kernel.
 179  179           */
 180  180          if (!holdlwps(isfork1 ? SHOLDFORK1 : SHOLDFORK)) {
 181  181                  aston(curthread);
 182  182                  error = EINTR;
 183  183                  atomic_inc_32(&p->p_zone->zone_ffmisc);
 184  184                  goto forkerr;
 185  185          }
 186  186  
 187  187  #if defined(__sparc)
 188  188          /*
 189  189           * Ensure that the user stack is fully constructed
 190  190           * before creating the child process structure.
 191  191           */
 192  192          (void) flush_user_windows_to_stack(NULL);
 193  193  #endif
 194  194  
 195  195          mutex_enter(&p->p_lock);
 196  196          /*
 197  197           * If this is vfork(), cancel any suspend request we might
 198  198           * have gotten from some other thread via lwp_suspend().
 199  199           * Otherwise we could end up with a deadlock on return
 200  200           * from the vfork() in both the parent and the child.
 201  201           */
 202  202          if (isvfork)
 203  203                  curthread->t_proc_flag &= ~TP_HOLDLWP;
 204  204          /*
 205  205           * Prevent our resource set associations from being changed during fork.
 206  206           */
 207  207          pool_barrier_enter();
 208  208          mutex_exit(&p->p_lock);
 209  209  
 210  210          /*
 211  211           * Create a child proc struct. Place a VN_HOLD on appropriate vnodes.
 212  212           */
 213  213          if (getproc(&cp, 0, GETPROC_USER) < 0) {
 214  214                  mutex_enter(&p->p_lock);
 215  215                  pool_barrier_exit();
 216  216                  continuelwps(p);
 217  217                  mutex_exit(&p->p_lock);
 218  218                  error = EAGAIN;
 219  219                  goto forkerr;
 220  220          }
 221  221  
 222  222          TRACE_2(TR_FAC_PROC, TR_PROC_FORK, "proc_fork:cp %p p %p", cp, p);
 223  223  
 224  224          /*
 225  225           * Assign an address space to child
 226  226           */
 227  227          if (isvfork) {
 228  228                  /*
 229  229                   * Clear any watched areas and remember the
 230  230                   * watched pages for restoring in vfwait().
 231  231                   */
 232  232                  as = p->p_as;
 233  233                  if (avl_numnodes(&as->a_wpage) != 0) {
 234  234                          AS_LOCK_ENTER(as, RW_WRITER);
 235  235                          as_clearwatch(as);
 236  236                          p->p_wpage = as->a_wpage;
 237  237                          avl_create(&as->a_wpage, wp_compare,
 238  238                              sizeof (struct watched_page),
 239  239                              offsetof(struct watched_page, wp_link));
 240  240                          AS_LOCK_EXIT(as);
 241  241                  }
 242  242                  cp->p_as = as;
 243  243                  cp->p_flag |= SVFORK;
 244  244  
 245  245                  /*
 246  246                   * Use the parent's shm segment list information for
 247  247                   * the child as it uses its address space till it execs.
 248  248                   */
 249  249                  cp->p_segacct = p->p_segacct;
 250  250          } else {
 251  251                  /*
 252  252                   * We need to hold P_PR_LOCK until the address space has
 253  253                   * been duplicated and we've had a chance to remove from the
 254  254                   * child any DTrace probes that were in the parent. Holding
 255  255                   * P_PR_LOCK prevents any new probes from being added and any
 256  256                   * extant probes from being removed.
 257  257                   */
 258  258                  mutex_enter(&p->p_lock);
 259  259                  sprlock_proc(p);
 260  260                  p->p_flag |= SFORKING;
 261  261                  mutex_exit(&p->p_lock);
 262  262  
 263  263                  error = as_dup(p->p_as, cp);
 264  264                  if (error != 0) {
 265  265                          mutex_enter(&p->p_lock);
 266  266                          sprunlock(p);
 267  267                          fork_fail(cp);
                                   /*
                                    * Unlink cp from the parent's orphan list and
                                    * from the child/sibling chain, then detach it
                                    * from its task and give back its pid.
                                    */
 268  268                          mutex_enter(&pidlock);
 269  269                          orphpp = &p->p_orphan;
 270  270                          while (*orphpp != cp)
 271  271                                  orphpp = &(*orphpp)->p_nextorph;
 272  272                          *orphpp = cp->p_nextorph;
 273  273                          if (p->p_child == cp)
 274  274                                  p->p_child = cp->p_sibling;
 275  275                          if (cp->p_sibling)
 276  276                                  cp->p_sibling->p_psibling = cp->p_psibling;
 277  277                          if (cp->p_psibling)
 278  278                                  cp->p_psibling->p_sibling = cp->p_sibling;
 279  279                          mutex_enter(&cp->p_lock);
 280  280                          tk = cp->p_task;
 281  281                          task_detach(cp);
 282  282                          ASSERT(cp->p_pool->pool_ref > 0);
 283  283                          atomic_dec_32(&cp->p_pool->pool_ref);
 284  284                          mutex_exit(&cp->p_lock);
 285  285                          pid_exit(cp, tk);
 286  286                          mutex_exit(&pidlock);
 287  287                          task_rele(tk);
 288  288  
 289  289                          mutex_enter(&p->p_lock);
 290  290                          p->p_flag &= ~SFORKING;
 291  291                          pool_barrier_exit();
 292  292                          continuelwps(p);
 293  293                          mutex_exit(&p->p_lock);
 294  294                          /*
 295  295                           * Preserve ENOMEM error condition but
 296  296                           * map all others to EAGAIN.
 297  297                           */
 298  298                          error = (error == ENOMEM) ? ENOMEM : EAGAIN;
 299  299                          atomic_inc_32(&p->p_zone->zone_ffnomem);
 300  300                          goto forkerr;
 301  301                  }
 302  302  
 303  303                  /*
 304  304                   * Remove all DTrace tracepoints from the child process. We
 305  305                   * need to do this _before_ duplicating USDT providers since
 306  306                   * any associated probes may be immediately enabled.
 307  307                   */
 308  308                  if (p->p_dtrace_count > 0)
 309  309                          dtrace_fasttrap_fork(p, cp);
 310  310  
 311  311                  mutex_enter(&p->p_lock);
 312  312                  sprunlock(p);
 313  313  
 314  314                  /* Duplicate parent's shared memory */
 315  315                  if (p->p_segacct)
 316  316                          shmfork(p, cp);
 317  317  
 318  318                  /*
 319  319                   * Duplicate any helper actions and providers. The SFORKING
 320  320                   * we set above informs the code to enable USDT probes that
 321  321                   * sprlock() may fail because the child is being forked.
 322  322                   */
 323  323                  if (p->p_dtrace_helpers != NULL) {
 324  324                          ASSERT(dtrace_helpers_fork != NULL);
 325  325                          (*dtrace_helpers_fork)(p, cp);
 326  326                  }
 327  327  
 328  328                  mutex_enter(&p->p_lock);
 329  329                  p->p_flag &= ~SFORKING;
 330  330                  mutex_exit(&p->p_lock);
 331  331          }
 332  332  
 333  333          /*
 334  334           * Duplicate parent's resource controls.
 335  335           */
 336  336          dup_set = rctl_set_create();
 337  337          for (;;) {
 338  338                  dup_gp = rctl_set_dup_prealloc(p->p_rctls);
 339  339                  mutex_enter(&p->p_rctls->rcs_lock);
 340  340                  if (rctl_set_dup_ready(p->p_rctls, dup_gp))
 341  341                          break;
 342  342                  mutex_exit(&p->p_rctls->rcs_lock);
 343  343                  rctl_prealloc_destroy(dup_gp);
 344  344          }
 345  345          e.rcep_p.proc = cp;
 346  346          e.rcep_t = RCENTITY_PROCESS;
 347  347          cp->p_rctls = rctl_set_dup(p->p_rctls, p, cp, &e, dup_set, dup_gp,
 348  348              RCD_DUP | RCD_CALLBACK);
 349  349          mutex_exit(&p->p_rctls->rcs_lock);
 350  350  
 351  351          rctl_prealloc_destroy(dup_gp);
 352  352  
 353  353          /*
 354  354           * Allocate the child's lwp directory and lwpid hash table.
 355  355           */
 356  356          if (isfork1)
 357  357                  cp->p_lwpdir_sz = 2;
 358  358          else
 359  359                  cp->p_lwpdir_sz = p->p_lwpdir_sz;
 360  360          cp->p_lwpdir = cp->p_lwpfree = ldp =
 361  361              kmem_zalloc(cp->p_lwpdir_sz * sizeof (lwpdir_t), KM_SLEEP);
                   /*
                    * Chain the directory slots together through ld_next, starting at
                    * p_lwpfree; the final slot's ld_next is left as allocated.
                    */
 362  362          for (i = 1; i < cp->p_lwpdir_sz; i++, ldp++)
 363  363                  ldp->ld_next = ldp + 1;
 364  364          cp->p_tidhash_sz = (cp->p_lwpdir_sz + 2) / 2;
 365  365          cp->p_tidhash =
 366  366              kmem_zalloc(cp->p_tidhash_sz * sizeof (tidhash_t), KM_SLEEP);
 367  367  
 368  368          /*
 369  369           * Duplicate parent's lwps.
 370  370           * Mutual exclusion is not needed because the process is
 371  371           * in the hold state and only the current lwp is running.
 372  372           */
 373  373          klgrpset_clear(cp->p_lgrpset);
 374  374          if (isfork1) {
 375  375                  clone = forklwp(ttolwp(curthread), cp, curthread->t_tid);
 376  376                  if (clone == NULL)
 377  377                          goto forklwperr;
 378  378                  /*
 379  379                   * Inherit only the lwp_wait()able flag,
 380  380                   * Daemon threads should not call fork1(), but oh well...
 381  381                   */
 382  382                  lwptot(clone)->t_proc_flag |=
 383  383                      (curthread->t_proc_flag & TP_TWAIT);
 384  384          } else {
 385  385                  /* this is forkall(), no one can be in lwp_wait() */
 386  386                  ASSERT(p->p_lwpwait == 0 && p->p_lwpdwait == 0);
 387  387                  /* for each entry in the parent's lwp directory... */
 388  388                  for (i = 0, ldp = p->p_lwpdir; i < p->p_lwpdir_sz; i++, ldp++) {
 389  389                          klwp_t *clwp;
 390  390                          kthread_t *ct;
 391  391  
 392  392                          if ((lep = ldp->ld_entry) == NULL)
 393  393                                  continue;
 394  394  
 395  395                          if ((t = lep->le_thread) != NULL) {
 396  396                                  clwp = forklwp(ttolwp(t), cp, t->t_tid);
 397  397                                  if (clwp == NULL)
 398  398                                          goto forklwperr;
 399  399                                  ct = lwptot(clwp);
 400  400                                  /*
 401  401                                   * Inherit lwp_wait()able and daemon flags.
 402  402                                   */
 403  403                                  ct->t_proc_flag |=
 404  404                                      (t->t_proc_flag & (TP_TWAIT|TP_DAEMON));
 405  405                                  /*
 406  406                                   * Keep track of the clone of curthread to
 407  407                                   * post return values through lwp_setrval().
 408  408                                   * Mark other threads for special treatment
 409  409                                   * by lwp_rtt() / post_syscall().
 410  410                                   */
 411  411                                  if (t == curthread)
 412  412                                          clone = clwp;
 413  413                                  else
 414  414                                          ct->t_flag |= T_FORKALL;
 415  415                          } else {
 416  416                                  /*
 417  417                                   * Replicate zombie lwps in the child.
 418  418                                   */
 419  419                                  clep = kmem_zalloc(sizeof (*clep), KM_SLEEP);
 420  420                                  clep->le_lwpid = lep->le_lwpid;
 421  421                                  clep->le_start = lep->le_start;
 422  422                                  lwp_hash_in(cp, clep,
 423  423                                      cp->p_tidhash, cp->p_tidhash_sz, 0);
 424  424                          }
 425  425                  }
 426  426          }
 427  427  
 428  428          /*
 429  429           * Put new process in the parent's process contract, or put it
 430  430           * in a new one if there is an active process template.  Send a
 431  431           * fork event (if requested) to whatever contract the child is
 432  432           * a member of.  Fails if the parent has been SIGKILLed.
 433  433           */
 434  434          if (contract_process_fork(NULL, cp, p, B_TRUE) == NULL) {
 435  435                  atomic_inc_32(&p->p_zone->zone_ffmisc);
 436  436                  goto forklwperr;
 437  437          }
 438  438  
 439  439          /*
 440  440           * No fork failures occur beyond this point.
 441  441           */
 442  442  
 443  443          cp->p_lwpid = p->p_lwpid;
 444  444          if (!isfork1) {
 445  445                  cp->p_lwpdaemon = p->p_lwpdaemon;
 446  446                  cp->p_zombcnt = p->p_zombcnt;
 447  447                  /*
 448  448                   * If the parent's lwp ids have wrapped around, so have the
 449  449                   * child's.
 450  450                   */
 451  451                  cp->p_flag |= p->p_flag & SLWPWRAP;
 452  452          }
 453  453  
 454  454          mutex_enter(&p->p_lock);
 455  455          corectl_path_hold(cp->p_corefile = p->p_corefile);
 456  456          corectl_content_hold(cp->p_content = p->p_content);
 457  457          mutex_exit(&p->p_lock);
 458  458  
 459  459          /*
 460  460           * Duplicate process context ops, if any.
 461  461           */
 462  462          if (p->p_pctx)
 463  463                  forkpctx(p, cp);
 464  464  
 465  465  #ifdef __sparc
 466  466          utrap_dup(p, cp);
 467  467  #endif
 468  468          /*
 469  469           * If the child process has been marked to stop on exit
 470  470           * from this fork, arrange for all other lwps to stop in
 471  471           * sympathy with the active lwp.
 472  472           */
 473  473          if (PTOU(cp)->u_systrap &&
 474  474              prismember(&PTOU(cp)->u_exitmask, curthread->t_sysnum)) {
 475  475                  mutex_enter(&cp->p_lock);
 476  476                  t = cp->p_tlist;
 477  477                  do {
 478  478                          t->t_proc_flag |= TP_PRSTOP;
 479  479                          aston(t);       /* so TP_PRSTOP will be seen */
 480  480                  } while ((t = t->t_forw) != cp->p_tlist);
 481  481                  mutex_exit(&cp->p_lock);
 482  482          }
 483  483          /*
 484  484           * If the parent process has been marked to stop on exit
 485  485           * from this fork, and its asynchronous-stop flag has not
 486  486           * been set, arrange for all other lwps to stop before
 487  487           * they return back to user level.
 488  488           */
 489  489          if (!(p->p_proc_flag & P_PR_ASYNC) && PTOU(p)->u_systrap &&
 490  490              prismember(&PTOU(p)->u_exitmask, curthread->t_sysnum)) {
 491  491                  mutex_enter(&p->p_lock);
 492  492                  t = p->p_tlist;
 493  493                  do {
 494  494                          t->t_proc_flag |= TP_PRSTOP;
 495  495                          aston(t);       /* so TP_PRSTOP will be seen */
 496  496                  } while ((t = t->t_forw) != p->p_tlist);
 497  497                  mutex_exit(&p->p_lock);
 498  498          }
 499  499  
                   /* set return values for the child's clone lwp: (parent pid, 1) */
 500  500          if (PROC_IS_BRANDED(p))
 501  501                  BROP(p)->b_lwp_setrval(clone, p->p_pid, 1);
 502  502          else
 503  503                  lwp_setrval(clone, p->p_pid, 1);
 504  504  
 505  505          /* set return values for parent */
 506  506          r.r_val1 = (int)cp->p_pid;
 507  507          r.r_val2 = 0;
 508  508  
 509  509          /*
 510  510           * pool_barrier_exit() can now be called because the child process has:
 511  511           * - all identifying features cloned or set (p_pid, p_task, p_pool)
 512  512           * - all resource sets associated (p_tlist->*->t_cpupart, p_as->a_mset)
 513  513           * - any other fields set which are used in resource set binding.
 514  514           */
 515  515          mutex_enter(&p->p_lock);
 516  516          pool_barrier_exit();
 517  517          mutex_exit(&p->p_lock);
 518  518  
 519  519          mutex_enter(&pidlock);
 520  520          mutex_enter(&cp->p_lock);
 521  521  
 522  522          /*
 523  523           * Set flags telling the child what (not) to do on exit.
 524  524           */
 525  525          if (flags & FORK_NOSIGCHLD)
 526  526                  cp->p_pidflag |= CLDNOSIGCHLD;
 527  527          if (flags & FORK_WAITPID)
 528  528                  cp->p_pidflag |= CLDWAITPID;
 529  529  
 530  530          /*
 531  531           * Now that there are lwps and threads attached, add the new
 532  532           * process to the process group.
 533  533           */
 534  534          pgjoin(cp, p->p_pgidp);
 535  535          cp->p_stat = SRUN;
 536  536          /*
 537  537           * We are now done with all the lwps in the child process.
 538  538           */
 539  539          t = cp->p_tlist;
 540  540          do {
 541  541                  /*
 542  542                   * Set the lwp_suspend()ed lwps running.
 543  543                   * They will suspend properly at syscall exit.
 544  544                   */
 545  545                  if (t->t_proc_flag & TP_HOLDLWP)
 546  546                          lwp_create_done(t);
 547  547                  else {
 548  548                          /* set TS_CREATE to allow continuelwps() to work */
 549  549                          thread_lock(t);
 550  550                          ASSERT(t->t_state == TS_STOPPED &&
 551  551                              !(t->t_schedflag & (TS_CREATE|TS_CSTART)));
 552  552                          t->t_schedflag |= TS_CREATE;
 553  553                          thread_unlock(t);
 554  554                  }
 555  555          } while ((t = t->t_forw) != cp->p_tlist);
 556  556          mutex_exit(&cp->p_lock);
 557  557  
                   /* pidlock is still held here; each branch below releases it. */
 558  558          if (isvfork) {
 559  559                  CPU_STATS_ADDQ(CPU, sys, sysvfork, 1);
 560  560                  mutex_enter(&p->p_lock);
 561  561                  p->p_flag |= SVFWAIT;
 562  562                  curthread->t_flag |= T_VFPARENT;
 563  563                  DTRACE_PROC1(create, proc_t *, cp);
 564  564                  cv_broadcast(&pr_pid_cv[p->p_slot]);    /* inform /proc */
 565  565                  mutex_exit(&p->p_lock);
 566  566                  /*
 567  567                   * Grab child's p_lock before dropping pidlock to ensure
 568  568                   * the process will not disappear before we set it running.
 569  569                   */
 570  570                  mutex_enter(&cp->p_lock);
 571  571                  mutex_exit(&pidlock);
 572  572                  sigdefault(cp);
 573  573                  continuelwps(cp);
 574  574                  mutex_exit(&cp->p_lock);
 575  575          } else {
 576  576                  CPU_STATS_ADDQ(CPU, sys, sysfork, 1);
 577  577                  DTRACE_PROC1(create, proc_t *, cp);
 578  578                  /*
 579  579                   * It is CL_FORKRET's job to drop pidlock.
 580  580                   * If we do it here, the process could be set running
 581  581                   * and disappear before CL_FORKRET() is called.
 582  582                   */
 583  583                  CL_FORKRET(curthread, cp->p_tlist);
 584  584                  schedctl_set_cidpri(curthread);
 585  585                  ASSERT(MUTEX_NOT_HELD(&pidlock));
 586  586          }
 587  587  
 588  588          return (r.r_vals);
 589  589  
           /*
            * Failure after the child's address space, rctls and (possibly some)
            * lwps were set up: tear all of that down, unlink the child from the
            * parent, release the parent's held lwps and fail with EAGAIN.
            */
 590  590  forklwperr:
 591  591          if (isvfork) {
 592  592                  if (avl_numnodes(&p->p_wpage) != 0) {
 593  593                          /* restore watchpoints to parent */
 594  594                          as = p->p_as;
 595  595                          AS_LOCK_ENTER(as, RW_WRITER);
 596  596                          as->a_wpage = p->p_wpage;
 597  597                          avl_create(&p->p_wpage, wp_compare,
 598  598                              sizeof (struct watched_page),
 599  599                              offsetof(struct watched_page, wp_link));
 600  600                          as_setwatch(as);
 601  601                          AS_LOCK_EXIT(as);
 602  602                  }
 603  603          } else {
                           /* the child owns a duplicated address space: free it */
 604  604                  if (cp->p_segacct)
 605  605                          shmexit(cp);
 606  606                  as = cp->p_as;
 607  607                  cp->p_as = &kas;
 608  608                  as_free(as);
 609  609          }
 610  610  
                   /* free the child's lwp directory, including any entries in it */
 611  611          if (cp->p_lwpdir) {
 612  612                  for (i = 0, ldp = cp->p_lwpdir; i < cp->p_lwpdir_sz; i++, ldp++)
 613  613                          if ((lep = ldp->ld_entry) != NULL)
 614  614                                  kmem_free(lep, sizeof (*lep));
 615  615                  kmem_free(cp->p_lwpdir,
 616  616                      cp->p_lwpdir_sz * sizeof (*cp->p_lwpdir));
 617  617          }
 618  618          cp->p_lwpdir = NULL;
 619  619          cp->p_lwpfree = NULL;
 620  620          cp->p_lwpdir_sz = 0;
 621  621  
 622  622          if (cp->p_tidhash)
 623  623                  kmem_free(cp->p_tidhash,
 624  624                      cp->p_tidhash_sz * sizeof (*cp->p_tidhash));
 625  625          cp->p_tidhash = NULL;
 626  626          cp->p_tidhash_sz = 0;
 627  627  
 628  628          forklwp_fail(cp);
 629  629          fork_fail(cp);
 630  630          rctl_set_free(cp->p_rctls);
 631  631          mutex_enter(&pidlock);
 632  632  
 633  633          /*
 634  634           * Detach failed child from task.
 635  635           */
 636  636          mutex_enter(&cp->p_lock);
 637  637          tk = cp->p_task;
 638  638          task_detach(cp);
 639  639          ASSERT(cp->p_pool->pool_ref > 0);
 640  640          atomic_dec_32(&cp->p_pool->pool_ref);
 641  641          mutex_exit(&cp->p_lock);
 642  642  
                   /* unlink cp from the parent's orphan list and sibling chain */
 643  643          orphpp = &p->p_orphan;
 644  644          while (*orphpp != cp)
 645  645                  orphpp = &(*orphpp)->p_nextorph;
 646  646          *orphpp = cp->p_nextorph;
 647  647          if (p->p_child == cp)
 648  648                  p->p_child = cp->p_sibling;
 649  649          if (cp->p_sibling)
 650  650                  cp->p_sibling->p_psibling = cp->p_psibling;
 651  651          if (cp->p_psibling)
 652  652                  cp->p_psibling->p_sibling = cp->p_sibling;
 653  653          pid_exit(cp, tk);
 654  654          mutex_exit(&pidlock);
 655  655  
 656  656          task_rele(tk);
 657  657  
 658  658          mutex_enter(&p->p_lock);
 659  659          pool_barrier_exit();
 660  660          continuelwps(p);
 661  661          mutex_exit(&p->p_lock);
 662  662          error = EAGAIN;
           /* Common exit for every failure path; error holds the errno to report. */
 663  663  forkerr:
 664  664          return ((int64_t)set_errno(error));
 665  665  }
 666  666  
 667  667  /*
 668  668   * Free allocated resources from getproc() if a fork failed.
            *
            * cp is the failed child.  This drops one count from the child's
            * file info (fcnt_add(fip, -1)), discards its queued signals,
            * decrements the per-uid/per-zone process count under pidlock,
            * frees the child's credential and file-descriptor list, releases
            * the directory/exec vnode and cwd string holds, and clears the
            * brand if the child is branded.
            *
            * It takes and drops pidlock itself, so the caller must not hold
            * pidlock on entry (cfork() calls it before entering pidlock).
 669  669   */
 670  670  static void
 671  671  fork_fail(proc_t *cp)
 672  672  {
 673  673          uf_info_t *fip = P_FINFO(cp);
 674  674  
 675  675          fcnt_add(fip, -1);
 676  676          sigdelq(cp, NULL, 0);
 677  677  
 678  678          mutex_enter(&pidlock);
 679  679          upcount_dec(crgetruid(cp->p_cred), crgetzoneid(cp->p_cred));
 680  680          mutex_exit(&pidlock);
 681  681  
 682  682          /*
 683  683           * single threaded, so no locking needed here
 684  684           */
 685  685          crfree(cp->p_cred);
 686  686  
 687  687          kmem_free(fip->fi_list, fip->fi_nfiles * sizeof (uf_entry_t));
 688  688  
                   /*
                    * NOTE(review): u_cdir, u_rdir and u_cwd are reached through
                    * curproc (the forking parent) rather than cp; presumably the
                    * child's user area refers to the same objects and these drop
                    * the holds getproc() took for the child -- confirm against
                    * getproc().
                    */
 689  689          VN_RELE(PTOU(curproc)->u_cdir);
 690  690          if (PTOU(curproc)->u_rdir)
 691  691                  VN_RELE(PTOU(curproc)->u_rdir);
 692  692          if (cp->p_exec)
 693  693                  VN_RELE(cp->p_exec);
 694  694          if (cp->p_execdir)
 695  695                  VN_RELE(cp->p_execdir);
 696  696          if (PTOU(curproc)->u_cwd)
 697  697                  refstr_rele(PTOU(curproc)->u_cwd);
 698  698          if (PROC_IS_BRANDED(cp)) {
 699  699                  brand_clearbrand(cp, B_TRUE);
 700  700          }
 701  701  }
 702  702  
 703  703  /*
 704  704   * Clean up the lwps already created for this child process.
 705  705   * The fork failed while duplicating all the lwps of the parent
 706  706   * and those lwps already created must be freed.
 707  707   * This process is invisible to the rest of the system,
 708  708   * so we don't need to hold p->p_lock to protect the list.
            *
            * p is the child process.  Every thread on p->p_tlist is unlinked,
            * un-accounted and freed; on return p->p_tlist is NULL.
 709  709   */
 710  710  static void
 711  711  forklwp_fail(proc_t *p)
 712  712  {
 713  713          kthread_t *t;
 714  714          task_t *tk;
 715  715          int branded = 0;
 716  716  
                   /* Sample the brand state once, before any lwp is torn down. */
 717  717          if (PROC_IS_BRANDED(p))
 718  718                  branded = 1;
 719  719  
                   /* Always take the current list head until the list is empty. */
 720  720          while ((t = p->p_tlist) != NULL) {
 721  721                  /*
 722  722                   * First remove the lwp from the process's p_tlist.
 723  723                   */
 724  724                  if (t != t->t_forw)
 725  725                          p->p_tlist = t->t_forw;
 726  726                  else
 727  727                          p->p_tlist = NULL;
 728  728                  p->p_lwpcnt--;
                           /*
                            * Splice t out of the t_forw/t_back ring.  When t was the
                            * last entry these stores presumably only touch t itself.
                            */
 729  729                  t->t_forw->t_back = t->t_back;
 730  730                  t->t_back->t_forw = t->t_forw;
 731  731  
                           /*
                            * Back out the lwp counts for the task, project and zone,
                            * all under the zone's zone_nlwps_lock.
                            */
 732  732                  tk = p->p_task;
 733  733                  mutex_enter(&p->p_zone->zone_nlwps_lock);
 734  734                  tk->tk_nlwps--;
 735  735                  tk->tk_proj->kpj_nlwps--;
 736  736                  p->p_zone->zone_nlwps--;
 737  737                  mutex_exit(&p->p_zone->zone_nlwps_lock);
 738  738  
 739  739                  ASSERT(t->t_schedctl == NULL);
 740  740  
                           /* Let the brand release its per-lwp state, if any. */
 741  741                  if (branded)
 742  742                          BROP(p)->b_freelwp(ttolwp(t));
 743  743  
                           /* Free the thread's door data and clear the pointer. */
 744  744                  if (t->t_door != NULL) {
 745  745                          kmem_free(t->t_door, sizeof (door_data_t));
 746  746                          t->t_door = NULL;
 747  747                  }
 748  748                  lwp_ctmpl_clear(ttolwp(t));
 749  749  
 750  750                  /*
 751  751                   * Remove the thread from the all threads list.
 752  752                   * We need to hold pidlock for this.
 753  753                   */
 754  754                  mutex_enter(&pidlock);
 755  755                  t->t_next->t_prev = t->t_prev;
 756  756                  t->t_prev->t_next = t->t_next;
 757  757                  CL_EXIT(t);     /* tell the scheduler that we're exiting */
 758  758                  cv_broadcast(&t->t_joincv);     /* tell anyone in thread_join */
 759  759                  mutex_exit(&pidlock);
 760  760  
 761  761                  /*
 762  762                   * Let the lgroup load averages know that this thread isn't
 763  763                   * going to show up (i.e. un-do what was done on behalf of
 764  764                   * this thread by the earlier lgrp_move_thread()).
 765  765                   */
 766  766                  kpreempt_disable();
 767  767                  lgrp_move_thread(t, NULL, 1);
 768  768                  kpreempt_enable();
 769  769  
 770  770                  /*
 771  771                   * The thread was created TS_STOPPED.
 772  772                   * We change it to TS_FREE to avoid an
 773  773                   * ASSERT() panic in thread_free().
 774  774                   */
 775  775                  t->t_state = TS_FREE;
 776  776                  thread_rele(t);
 777  777                  thread_free(t);
 778  778          }
 779  779  }
 780  780  
 781  781  extern struct as kas;
 782  782  
 783  783  /*
 784  784   * fork a kernel process.
            *
            * pc, arg: entry point and argument passed to lwp_kernel_create() or
            *          lwp_create() for the first lwp.
            * cid:     scheduling class.  CLASS_KERNEL(cid) selects the kernel
            *          process path; any other class builds the process in a new
            *          task with a process contract.  sysdccid is not allowed.
            * pri:     priority passed to the lwp creation routine.
            * ct:      if non-NULL, receives the new process contract (non-kernel
            *          path only; must be NULL when cid is syscid).
            * pid:     passed through to getproc().
            *
            * Returns 0 on success, or EAGAIN if getproc() or lwp_create() fails.
 785  785   */
 786  786  int
 787  787  newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct,
 788  788      pid_t pid)
 789  789  {
 790  790          proc_t *p;
 791  791          struct user *up;
 792  792          kthread_t *t;
 793  793          cont_process_t *ctp = NULL;
 794  794          rctl_entity_p_t e;
 795  795  
 796  796          ASSERT(cid != sysdccid);
 797  797          ASSERT(cid != syscid || ct == NULL);
 798  798          if (CLASS_KERNEL(cid)) {
 799  799                  rctl_alloc_gp_t *init_gp;
 800  800                  rctl_set_t *init_set;
 801  801  
 802  802                  ASSERT(pid != 1);
 803  803  
 804  804                  if (getproc(&p, pid, GETPROC_KERNEL) < 0)
 805  805                          return (EAGAIN);
 806  806  
 807  807                  /*
 808  808                   * Release the hold on the p_exec and p_execdir, these
 809  809                   * were acquired in getproc()
 810  810                   */
 811  811                  if (p->p_execdir != NULL)
 812  812                          VN_RELE(p->p_execdir);
 813  813                  if (p->p_exec != NULL)
 814  814                          VN_RELE(p->p_exec);
 815  815                  p->p_flag |= SNOWAIT;
 816  816                  p->p_exec = NULL;
 817  817                  p->p_execdir = NULL;
 818  818  
                           /* Allocate the rctl set and preallocation group up front. */
 819  819                  init_set = rctl_set_create();
 820  820                  init_gp = rctl_set_init_prealloc(RCENTITY_PROCESS);
 821  821  
 822  822                  /*
 823  823                   * kernel processes do not inherit /proc tracing flags.
 824  824                   */
 825  825                  sigemptyset(&p->p_sigmask);
 826  826                  premptyset(&p->p_fltmask);
 827  827                  up = PTOU(p);
 828  828                  up->u_systrap = 0;
 829  829                  premptyset(&(up->u_entrymask));
 830  830                  premptyset(&(up->u_exitmask));
                           /* Install the process rctl set while holding p_lock. */
 831  831                  mutex_enter(&p->p_lock);
 832  832                  e.rcep_p.proc = p;
 833  833                  e.rcep_t = RCENTITY_PROCESS;
 834  834                  p->p_rctls = rctl_set_init(RCENTITY_PROCESS, p, &e, init_set,
 835  835                      init_gp);
 836  836                  mutex_exit(&p->p_lock);
 837  837  
 838  838                  rctl_prealloc_destroy(init_gp);
 839  839  
                           /*
                            * NOTE(review): t is not checked for NULL before the
                            * ASSERT3U() below dereferences it; presumably
                            * lwp_kernel_create() cannot fail -- confirm.
                            */
 840  840                  t = lwp_kernel_create(p, pc, arg, TS_STOPPED, pri);
 841  841          } else {
 842  842                  rctl_alloc_gp_t *init_gp, *default_gp;
 843  843                  rctl_set_t *init_set;
 844  844                  task_t *tk, *tk_old;
 845  845                  klwp_t *lwp;
 846  846  
 847  847                  if (getproc(&p, pid, GETPROC_USER) < 0)
 848  848                          return (EAGAIN);
 849  849                  /*
 850  850                   * init creates a new task, distinct from the task
 851  851                   * containing kernel "processes".
 852  852                   */
 853  853                  tk = task_create(0, p->p_zone);
 854  854                  mutex_enter(&tk->tk_zone->zone_nlwps_lock);
 855  855                  tk->tk_proj->kpj_ntasks++;
 856  856                  tk->tk_nprocs++;
 857  857                  mutex_exit(&tk->tk_zone->zone_nlwps_lock);
 858  858  
 859  859                  default_gp = rctl_rlimit_set_prealloc(RLIM_NLIMITS);
 860  860                  init_gp = rctl_set_init_prealloc(RCENTITY_PROCESS);
 861  861                  init_set = rctl_set_create();
 862  862  
                           /*
                            * Move p from the task getproc() attached it to onto the
                            * new task.  pidlock is taken before p_lock and dropped
                            * once the switch is done; p_lock stays held through the
                            * rctl setup below.
                            */
 863  863                  mutex_enter(&pidlock);
 864  864                  mutex_enter(&p->p_lock);
 865  865                  tk_old = p->p_task;     /* switch to new task */
 866  866  
 867  867                  task_detach(p);
 868  868                  task_begin(tk, p);
 869  869                  mutex_exit(&pidlock);
 870  870  
                           /* The old task loses the process count getproc() added. */
 871  871                  mutex_enter(&tk_old->tk_zone->zone_nlwps_lock);
 872  872                  tk_old->tk_nprocs--;
 873  873                  mutex_exit(&tk_old->tk_zone->zone_nlwps_lock);
 874  874  
 875  875                  e.rcep_p.proc = p;
 876  876                  e.rcep_t = RCENTITY_PROCESS;
 877  877                  p->p_rctls = rctl_set_init(RCENTITY_PROCESS, p, &e, init_set,
 878  878                      init_gp);
 879  879                  rctlproc_default_init(p, default_gp);
 880  880                  mutex_exit(&p->p_lock);
 881  881  
 882  882                  task_rele(tk_old);
 883  883                  rctl_prealloc_destroy(default_gp);
 884  884                  rctl_prealloc_destroy(init_gp);
 885  885  
                           /*
                            * Create the first lwp stopped.  On failure, undo what
                            * getproc() and the task switch above set up and report
                            * EAGAIN.
                            *
                            * NOTE(review): the 'tk' declared inside the failure
                            * branch shadows the 'tk' declared at the top of this
                            * else block.  The pool_ref drop uses
                            * atomic_add_32(..., -1) where the matching cleanup in
                            * cfork() uses atomic_dec_32().
                            */
 886  886                  if ((lwp = lwp_create(pc, arg, 0, p, TS_STOPPED, pri,
 887  887                      &curthread->t_hold, cid, 1)) == NULL) {
 888  888                          task_t *tk;
 889  889                          fork_fail(p);
 890  890                          mutex_enter(&pidlock);
 891  891                          mutex_enter(&p->p_lock);
 892  892                          tk = p->p_task;
 893  893                          task_detach(p);
 894  894                          ASSERT(p->p_pool->pool_ref > 0);
 895  895                          atomic_add_32(&p->p_pool->pool_ref, -1);
 896  896                          mutex_exit(&p->p_lock);
 897  897                          pid_exit(p, tk);
 898  898                          mutex_exit(&pidlock);
 899  899                          task_rele(tk);
 900  900  
 901  901                          return (EAGAIN);
 902  902                  }
 903  903                  t = lwptot(lwp);
 904  904  
                           /* Create the process contract; hand it back if asked. */
 905  905                  ctp = contract_process_fork(sys_process_tmpl, p, curproc,
 906  906                      B_FALSE);
 907  907                  ASSERT(ctp != NULL);
 908  908                  if (ct != NULL)
 909  909                          *ct = &ctp->conp_contract;
 910  910          }
 911  911  
                   /*
                    * Common tail: the single lwp must have tid 1.  Join the parent's
                    * process group, mark the process SRUN, clear TP_HOLDLWP on the
                    * thread and finish lwp creation.
                    */
 912  912          ASSERT3U(t->t_tid, ==, 1);
 913  913          p->p_lwpid = 1;
 914  914          mutex_enter(&pidlock);
 915  915          pgjoin(p, p->p_parent->p_pgidp);
 916  916          p->p_stat = SRUN;
 917  917          mutex_enter(&p->p_lock);
 918  918          t->t_proc_flag &= ~TP_HOLDLWP;
 919  919          lwp_create_done(t);
 920  920          mutex_exit(&p->p_lock);
 921  921          mutex_exit(&pidlock);
 922  922          return (0);
 923  923  }
 924  924  
 925  925  /*
 926  926   * create a child proc struct.
            *
            * cpp:   on success, *cpp is set to the new child proc_t.
            * pid:   passed to pid_allocate() for the child.
            * flags: with GETPROC_KERNEL the parent is p0 and the child is bound to
            *        pool_default, marked SSYS and attached to task0p; otherwise the
            *        parent is curproc and its pool and task are inherited.
            *
            * Returns 0 on success.  Returns -1 if the zone is shutting down, a
            * nprocs resource control denies the fork, no pid could be allocated,
            * the exec vnode could not be opened, or the per-user process limit
            * is hit.  All failures except the zone-shutdown one sleep in delay()
            * before returning.
 927  927   */
 928  928  static int
 929  929  getproc(proc_t **cpp, pid_t pid, uint_t flags)
 930  930  {
 931  931          proc_t          *pp, *cp;
 932  932          pid_t           newpid;
 933  933          struct user     *uarea;
 934  934          extern uint_t   nproc;
 935  935          struct cred     *cr;
 936  936          uid_t           ruid;
 937  937          zoneid_t        zoneid;
 938  938          task_t          *task;
 939  939          kproject_t      *proj;
 940  940          zone_t          *zone;
 941  941          int             rctlfail = 0;
 942  942  
 943  943          if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN)
 944  944                  return (-1);    /* no point in starting new processes */
 945  945  
                   /* Pick the parent: p0 for kernel processes, else the caller. */
 946  946          pp = (flags & GETPROC_KERNEL) ? &p0 : curproc;
 947  947          task = pp->p_task;
 948  948          proj = task->tk_proj;
 949  949          zone = pp->p_zone;
 950  950  
                   /*
                    * Check the task, project and zone nprocs resource controls
                    * (skipped for proj0p) and, if none denies, charge the new
                    * process to all three.  Done under the parent's p_lock and the
                    * zone's zone_nlwps_lock.
                    */
 951  951          mutex_enter(&pp->p_lock);
 952  952          mutex_enter(&zone->zone_nlwps_lock);
 953  953          if (proj != proj0p) {
 954  954                  if (task->tk_nprocs >= task->tk_nprocs_ctl)
 955  955                          if (rctl_test(rc_task_nprocs, task->tk_rctls,
 956  956                              pp, 1, 0) & RCT_DENY)
 957  957                                  rctlfail = 1;
 958  958  
 959  959                  if (proj->kpj_nprocs >= proj->kpj_nprocs_ctl)
 960  960                          if (rctl_test(rc_project_nprocs, proj->kpj_rctls,
 961  961                              pp, 1, 0) & RCT_DENY)
 962  962                                  rctlfail = 1;
 963  963  
 964  964                  if (zone->zone_nprocs >= zone->zone_nprocs_ctl)
 965  965                          if (rctl_test(rc_zone_nprocs, zone->zone_rctls,
 966  966                              pp, 1, 0) & RCT_DENY)
 967  967                                  rctlfail = 1;
 968  968  
                           /*
                            * Nothing has been allocated or counted yet, so skip the
                            * "bad" cleanup and go straight to the delay.
                            */
 969  969                  if (rctlfail) {
 970  970                          mutex_exit(&zone->zone_nlwps_lock);
 971  971                          mutex_exit(&pp->p_lock);
 972  972                          atomic_inc_32(&zone->zone_ffcap);
 973  973                          goto punish;
 974  974                  }
 975  975          }
 976  976          task->tk_nprocs++;
 977  977          proj->kpj_nprocs++;
 978  978          zone->zone_nprocs++;
 979  979          mutex_exit(&zone->zone_nlwps_lock);
 980  980          mutex_exit(&pp->p_lock);
 981  981  
                   /* KM_SLEEP allocation; the structure is zeroed before use. */
 982  982          cp = kmem_cache_alloc(process_cache, KM_SLEEP);
 983  983          bzero(cp, sizeof (proc_t));
 984  984  
 985  985          /*
 986  986           * Make proc entry for child process
 987  987           */
 988  988          mutex_init(&cp->p_splock, NULL, MUTEX_DEFAULT, NULL);
 989  989          mutex_init(&cp->p_crlock, NULL, MUTEX_DEFAULT, NULL);
 990  990          mutex_init(&cp->p_pflock, NULL, MUTEX_DEFAULT, NULL);
 991  991  #if defined(__x86)
 992  992          mutex_init(&cp->p_ldtlock, NULL, MUTEX_DEFAULT, NULL);
 993  993  #endif
 994  994          mutex_init(&cp->p_maplock, NULL, MUTEX_DEFAULT, NULL);
 995  995          cp->p_stat = SIDL;
 996  996          cp->p_mstart = gethrtime();
 997  997          cp->p_as = &kas;
 998  998          /*
 999  999           * p_zone must be set before we call pid_allocate since the process
1000 1000           * will be visible after that and code such as prfind_zone will
1001 1001           * look at the p_zone field.
1002 1002           */
1003 1003          cp->p_zone = pp->p_zone;
1004 1004          cp->p_t1_lgrpid = LGRP_NONE;
1005 1005          cp->p_tr_lgrpid = LGRP_NONE;
1006 1006  
                   /* Warn only when the process table itself is full. */
1007 1007          if ((newpid = pid_allocate(cp, pid, PID_ALLOC_PROC)) == -1) {
1008 1008                  if (nproc == v.v_proc) {
1009 1009                          CPU_STATS_ADDQ(CPU, sys, procovf, 1);
1010 1010                          cmn_err(CE_WARN, "out of processes");
1011 1011                  }
1012 1012                  goto bad;
1013 1013          }
1014 1014  
                   /* Snapshot the parent's exec vnode pointers under its p_lock. */
1015 1015          mutex_enter(&pp->p_lock);
1016 1016          cp->p_exec = pp->p_exec;
1017 1017          cp->p_execdir = pp->p_execdir;
1018 1018          mutex_exit(&pp->p_lock);
1019 1019  
1020 1020          if (cp->p_exec) {
1021 1021                  VN_HOLD(cp->p_exec);
1022 1022                  /*
1023 1023                   * Each VOP_OPEN() must be paired with a corresponding
1024 1024                   * VOP_CLOSE(). In this case, the executable will be
1025 1025                   * closed for the child in either proc_exit() or gexec().
1026 1026                   */
1027 1027                  if (VOP_OPEN(&cp->p_exec, FREAD, CRED(), NULL) != 0) {
1028 1028                          VN_RELE(cp->p_exec);
1029 1029                          cp->p_exec = NULLVP;
1030 1030                          cp->p_execdir = NULLVP;
1031 1031                          goto bad;
1032 1032                  }
1033 1033          }
1034 1034          if (cp->p_execdir)
1035 1035                  VN_HOLD(cp->p_execdir);
1036 1036  
1037 1037          /*
1038 1038           * If not privileged make sure that this user hasn't exceeded
1039 1039           * v.v_maxup processes, and that users collectively haven't
1040 1040           * exceeded v.v_maxupttl processes.
1041 1041           */
1042 1042          mutex_enter(&pidlock);
1043 1043          ASSERT(nproc < v.v_proc);       /* otherwise how'd we get our pid? */
1044 1044          cr = CRED();
1045 1045          ruid = crgetruid(cr);
1046 1046          zoneid = crgetzoneid(cr);
1047 1047          if (nproc >= v.v_maxup &&       /* short-circuit; usually false */
1048 1048              (nproc >= v.v_maxupttl ||
1049 1049              upcount_get(ruid, zoneid) >= v.v_maxup) &&
1050 1050              secpolicy_newproc(cr) != 0) {
1051 1051                  mutex_exit(&pidlock);
1052 1052                  zcmn_err(zoneid, CE_NOTE,
1053 1053                      "out of per-user processes for uid %d", ruid);
1054 1054                  goto bad;
1055 1055          }
1056 1056  
1057 1057          /*
1058 1058           * Everything is cool, put the new proc on the active process list.
1059 1059           * It is already on the pid list and in /proc.
1060 1060           * Increment the per uid process count (upcount).
1061 1061           */
1062 1062          nproc++;
1063 1063          upcount_inc(ruid, zoneid);
1064 1064  
                   /* Push cp onto the head of the practive list. */
1065 1065          cp->p_next = practive;
1066 1066          practive->p_prev = cp;
1067 1067          practive = cp;
1068 1068  
                   /*
                    * Copy inherited per-process state from the parent.  Only the
                    * SJCTL, SNOWAIT and SNOCD bits of p_flag are carried over.
                    */
1069 1069          cp->p_ignore = pp->p_ignore;
1070 1070          cp->p_siginfo = pp->p_siginfo;
1071 1071          cp->p_flag = pp->p_flag & (SJCTL|SNOWAIT|SNOCD);
1072 1072          cp->p_sessp = pp->p_sessp;
1073 1073          sess_hold(pp);
1074 1074          cp->p_brand = pp->p_brand;
1075 1075          if (PROC_IS_BRANDED(pp))
1076 1076                  BROP(pp)->b_copy_procdata(cp, pp);
1077 1077          cp->p_bssbase = pp->p_bssbase;
1078 1078          cp->p_brkbase = pp->p_brkbase;
1079 1079          cp->p_brksize = pp->p_brksize;
  
    | ↓ open down ↓ | 1079 lines elided | ↑ open up ↑ | 
1080 1080          cp->p_brkpageszc = pp->p_brkpageszc;
1081 1081          cp->p_stksize = pp->p_stksize;
1082 1082          cp->p_stkpageszc = pp->p_stkpageszc;
1083 1083          cp->p_stkprot = pp->p_stkprot;
1084 1084          cp->p_datprot = pp->p_datprot;
1085 1085          cp->p_usrstack = pp->p_usrstack;
1086 1086          cp->p_model = pp->p_model;
1087 1087          cp->p_ppid = pp->p_pid;
1088 1088          cp->p_ancpid = pp->p_pid;
1089 1089          cp->p_portcnt = pp->p_portcnt;
     1090 +        /*
     1091 +         * Security flags are preserved on fork, the inherited copy comes into
     1092 +         * effect on exec
     1093 +         */
     1094 +        cp->p_secflags = pp->p_secflags;
1090 1095  
1091 1096          /*
1092 1097           * Initialize watchpoint structures
1093 1098           */
1094 1099          avl_create(&cp->p_warea, wa_compare, sizeof (struct watched_area),
1095 1100              offsetof(struct watched_area, wa_link));
1096 1101  
1097 1102          /*
1098 1103           * Initialize immediate resource control values.
1099 1104           */
1100 1105          cp->p_stk_ctl = pp->p_stk_ctl;
1101 1106          cp->p_fsz_ctl = pp->p_fsz_ctl;
1102 1107          cp->p_vmem_ctl = pp->p_vmem_ctl;
1103 1108          cp->p_fno_ctl = pp->p_fno_ctl;
1104 1109  
1105 1110          /*
1106 1111           * Link up to parent-child-sibling chain.  No need to lock
1107 1112           * in general since only a call to freeproc() (done by the
1108 1113           * same parent as newproc()) diddles with the child chain.
1109 1114           */
1110 1115          cp->p_sibling = pp->p_child;
1111 1116          if (pp->p_child)
1112 1117                  pp->p_child->p_psibling = cp;
1113 1118  
1114 1119          cp->p_parent = pp;
1115 1120          pp->p_child = cp;
1116 1121  
1117 1122          cp->p_child_ns = NULL;
1118 1123          cp->p_sibling_ns = NULL;
1119 1124  
                   /* Also put cp at the head of the parent's orphan list. */
1120 1125          cp->p_nextorph = pp->p_orphan;
1121 1126          cp->p_nextofkin = pp;
1122 1127          pp->p_orphan = cp;
1123 1128  
1124 1129          /*
1125 1130           * Inherit profiling state; do not inherit REALPROF profiling state.
1126 1131           */
1127 1132          cp->p_prof = pp->p_prof;
1128 1133          cp->p_rprof_cyclic = CYCLIC_NONE;
1129 1134  
1130 1135          /*
1131 1136           * Inherit pool pointer from the parent.  Kernel processes are
1132 1137           * always bound to the default pool.
1133 1138           */
1134 1139          mutex_enter(&pp->p_lock);
1135 1140          if (flags & GETPROC_KERNEL) {
1136 1141                  cp->p_pool = pool_default;
1137 1142                  cp->p_flag |= SSYS;
1138 1143          } else {
1139 1144                  cp->p_pool = pp->p_pool;
1140 1145          }
1141 1146          atomic_inc_32(&cp->p_pool->pool_ref);
1142 1147          mutex_exit(&pp->p_lock);
1143 1148  
1144 1149          /*
1145 1150           * Add the child process to the current task.  Kernel processes
1146 1151           * are always attached to task0.
1147 1152           */
1148 1153          mutex_enter(&cp->p_lock);
1149 1154          if (flags & GETPROC_KERNEL)
1150 1155                  task_attach(task0p, cp);
1151 1156          else
1152 1157                  task_attach(pp->p_task, cp);
1153 1158          mutex_exit(&cp->p_lock);
1154 1159          mutex_exit(&pidlock);
1155 1160  
1156 1161          avl_create(&cp->p_ct_held, contract_compar, sizeof (contract_t),
1157 1162              offsetof(contract_t, ct_ctlist));
1158 1163  
1159 1164          /*
1160 1165           * Duplicate any audit information kept in the process table
1161 1166           */
1162 1167          if (audit_active)       /* copy audit data to cp */
1163 1168                  audit_newproc(cp);
1164 1169  
                   /* The child shares the credential sampled above, with a hold. */
1165 1170          crhold(cp->p_cred = cr);
1166 1171  
1167 1172          /*
1168 1173           * Bump up the counts on the file structures pointed at by the
1169 1174           * parent's file table since the child will point at them too.
1170 1175           */
1171 1176          fcnt_add(P_FINFO(pp), 1);
1172 1177  
1173 1178          if (PTOU(pp)->u_cdir) {
1174 1179                  VN_HOLD(PTOU(pp)->u_cdir);
1175 1180          } else {
1176 1181                  ASSERT(pp == &p0);
1177 1182                  /*
1178 1183                   * We must be at or before vfs_mountroot(); it will take care of
1179 1184                   * assigning our current directory.
1180 1185                   */
1181 1186          }
1182 1187          if (PTOU(pp)->u_rdir)
1183 1188                  VN_HOLD(PTOU(pp)->u_rdir);
1184 1189          if (PTOU(pp)->u_cwd)
1185 1190                  refstr_hold(PTOU(pp)->u_cwd);
1186 1191  
1187 1192          /*
1188 1193           * copy the parent's uarea.
1189 1194           */
1190 1195          uarea = PTOU(cp);
1191 1196          bcopy(PTOU(pp), uarea, sizeof (*uarea));
1192 1197          flist_fork(P_FINFO(pp), P_FINFO(cp));
1193 1198  
                   /* Fields that must not be inherited are reset after the copy. */
1194 1199          gethrestime(&uarea->u_start);
1195 1200          uarea->u_ticks = ddi_get_lbolt();
1196 1201          uarea->u_mem = rm_asrss(pp->p_as);
1197 1202          uarea->u_acflag = AFORK;
1198 1203  
1199 1204          /*
1200 1205           * If inherit-on-fork, copy /proc tracing flags to child.
1201 1206           */
1202 1207          if ((pp->p_proc_flag & P_PR_FORK) != 0) {
1203 1208                  cp->p_proc_flag |= pp->p_proc_flag & (P_PR_TRACE|P_PR_FORK);
1204 1209                  cp->p_sigmask = pp->p_sigmask;
1205 1210                  cp->p_fltmask = pp->p_fltmask;
1206 1211          } else {
1207 1212                  sigemptyset(&cp->p_sigmask);
1208 1213                  premptyset(&cp->p_fltmask);
1209 1214                  uarea->u_systrap = 0;
1210 1215                  premptyset(&uarea->u_entrymask);
1211 1216                  premptyset(&uarea->u_exitmask);
1212 1217          }
1213 1218          /*
1214 1219           * If microstate accounting is being inherited, mark child
1215 1220           */
1216 1221          if ((pp->p_flag & SMSFORK) != 0)
1217 1222                  cp->p_flag |= pp->p_flag & (SMSFORK|SMSACCT);
1218 1223  
1219 1224          /*
1220 1225           * Inherit fixalignment flag from the parent
1221 1226           */
1222 1227          cp->p_fixalignment = pp->p_fixalignment;
1223 1228  
1224 1229          *cpp = cp;
1225 1230          return (0);
1226 1231  
                   /*
                    * Failure after cp was allocated; reached with pidlock dropped.
                    * Releases the pid entry if one was allocated, frees cp and backs
                    * out the task/project/zone nprocs counts charged at the top.
                    *
                    * NOTE(review): p_splock and p_maplock are initialized above but
                    * are not destroyed here, unlike the other mutexes -- confirm
                    * whether that is intentional.
                    * NOTE(review): the per-user-limit check jumps here after p_exec
                    * was held and opened and p_execdir was held; nothing below
                    * releases them, so those references appear to leak on that
                    * path -- confirm.
                    */
1227 1232  bad:
1228 1233          ASSERT(MUTEX_NOT_HELD(&pidlock));
1229 1234  
1230 1235          mutex_destroy(&cp->p_crlock);
1231 1236          mutex_destroy(&cp->p_pflock);
1232 1237  #if defined(__x86)
1233 1238          mutex_destroy(&cp->p_ldtlock);
1234 1239  #endif
1235 1240          if (newpid != -1) {
1236 1241                  proc_entry_free(cp->p_pidp);
1237 1242                  (void) pid_rele(cp->p_pidp);
1238 1243          }
1239 1244          kmem_cache_free(process_cache, cp);
1240 1245  
1241 1246          mutex_enter(&zone->zone_nlwps_lock);
1242 1247          task->tk_nprocs--;
1243 1248          proj->kpj_nprocs--;
1244 1249          zone->zone_nprocs--;
1245 1250          mutex_exit(&zone->zone_nlwps_lock);
1246 1251          atomic_inc_32(&zone->zone_ffnoproc);
1247 1252  
1248 1253  punish:
1249 1254          /*
1250 1255           * We most likely got into this situation because some process is
1251 1256           * forking out of control.  As punishment, put it to sleep for a
1252 1257           * bit so it can't eat the machine alive.  Sleep interval is chosen
1253 1258           * to allow no more than one fork failure per cpu per clock tick
1254 1259           * on average (yes, I just made this up).  This has two desirable
1255 1260           * properties: (1) it sets a constant limit on the fork failure
1256 1261           * rate, and (2) the busier the system is, the harsher the penalty
1257 1262           * for abusing it becomes.
1258 1263           */
1259 1264          INCR_COUNT(&fork_fail_pending, &pidlock);
1260 1265          delay(fork_fail_pending / ncpus + 1);
1261 1266          DECR_COUNT(&fork_fail_pending, &pidlock);
1262 1267  
1263 1268          return (-1); /* out of memory or proc slots */
1264 1269  }
1265 1270  
1266 1271  /*
1267 1272   * Release virtual memory.
1268 1273   * In the case of vfork(), the child was given exclusive access to its
1269 1274   * parent's address space.  The parent is waiting in vfwait() for the
1270 1275   * child to release its exclusive claim via relvm().
            *
            * Operates on curproc and takes no arguments.  For a vfork child
            * (SVFORK set) the address space is handed back to the parent and the
            * parent is signalled on its p_cv.  Otherwise, if the process is not
            * already on &kas, its address space is detached and freed.  In both
            * cases p_as ends up pointing at &kas.
1271 1276   */
1272 1277  void
1273 1278  relvm()
1274 1279  {
1275 1280          proc_t *p = curproc;
1276 1281  
1277 1282          ASSERT((unsigned)p->p_lwpcnt <= 1);
1278 1283  
1279 1284          prrelvm();      /* inform /proc */
1280 1285  
1281 1286          if (p->p_flag & SVFORK) {
1282 1287                  proc_t *pp = p->p_parent;
1283 1288                  /*
1284 1289                   * The child process is either exec'ing or exit'ing.
1285 1290                   * The child is now separated from the parent's address
1286 1291                   * space.  The parent process is made dispatchable.
1287 1292                   *
1288 1293                   * This is a delicate locking maneuver, involving
1289 1294                   * both the parent's p_lock and the child's p_lock.
1290 1295                   * As soon as the SVFORK flag is turned off, the
1291 1296                   * parent is free to run, but it must not run until
1292 1297                   * we wake it up using its p_cv because it might
1293 1298                   * exit and we would be referencing invalid memory.
1294 1299                   * Therefore, we hold the parent with its p_lock
1295 1300                   * while protecting our p_flags with our own p_lock.
1296 1301                   */
1297 1302  try_again:
1298 1303                  mutex_enter(&p->p_lock);        /* grab child's lock first */
1299 1304                  prbarrier(p);           /* make sure /proc is blocked out */
1300 1305                  mutex_enter(&pp->p_lock);
1301 1306  
1302 1307                  /*
1303 1308                   * Check if parent is locked by /proc.
1304 1309                   */
1305 1310                  if (pp->p_proc_flag & P_PR_LOCK) {
1306 1311                          /*
1307 1312                           * Delay until /proc is done with the parent.
1308 1313                           * We must drop our (the child's) p->p_lock, wait
1309 1314                           * via prbarrier() on the parent, then start over.
1310 1315                           */
1311 1316                          mutex_exit(&p->p_lock);
1312 1317                          prbarrier(pp);
1313 1318                          mutex_exit(&pp->p_lock);
1314 1319                          goto try_again;
1315 1320                  }
                           /*
                            * Both p_locks are held from here until the end of this
                            * branch.  Clear SVFORK and switch the child to &kas with
                            * preemption disabled around the hat notification.
                            */
1316 1321                  p->p_flag &= ~SVFORK;
1317 1322                  kpreempt_disable();
1318 1323                  p->p_as = &kas;
1319 1324  
1320 1325                  /*
1321 1326                   * notify hat of change in thread's address space
1322 1327                   */
1323 1328                  hat_thread_exit(curthread);
1324 1329                  kpreempt_enable();
1325 1330  
1326 1331                  /*
1327 1332                   * child sizes are copied back to parent because
1328 1333                   * child may have grown.
1329 1334                   */
1330 1335                  pp->p_brkbase = p->p_brkbase;
1331 1336                  pp->p_brksize = p->p_brksize;
1332 1337                  pp->p_stksize = p->p_stksize;
1333 1338  
1334 1339                  /*
1335 1340                   * Copy back the shm accounting information
1336 1341                   * to the parent process.
1337 1342                   */
1338 1343                  pp->p_segacct = p->p_segacct;
1339 1344                  p->p_segacct = NULL;
1340 1345  
1341 1346                  /*
1342 1347                   * The parent is no longer waiting for the vfork()d child.
1343 1348                   * Restore the parent's watched pages, if any.  This is
1344 1349                   * safe because we know the parent is not locked by /proc
1345 1350                   */
1346 1351                  pp->p_flag &= ~SVFWAIT;
1347 1352                  if (avl_numnodes(&pp->p_wpage) != 0) {
                                   /*
                                    * Hand the saved tree to the address space, then
                                    * re-create pp->p_wpage as a fresh tree.
                                    */
1348 1353                          pp->p_as->a_wpage = pp->p_wpage;
1349 1354                          avl_create(&pp->p_wpage, wp_compare,
1350 1355                              sizeof (struct watched_page),
1351 1356                              offsetof(struct watched_page, wp_link));
1352 1357                  }
                           /* Wake the parent, then drop the locks parent-first. */
1353 1358                  cv_signal(&pp->p_cv);
1354 1359                  mutex_exit(&pp->p_lock);
1355 1360                  mutex_exit(&p->p_lock);
1356 1361          } else {
                           /* Nothing to release if the process is already on &kas. */
1357 1362                  if (p->p_as != &kas) {
1358 1363                          struct as *as;
1359 1364  
1360 1365                          if (p->p_segacct)
1361 1366                                  shmexit(p);
1362 1367  
1363 1368                          /*
1364 1369                           * We grab p_lock for the benefit of /proc
1365 1370                           */
1366 1371                          kpreempt_disable();
1367 1372                          mutex_enter(&p->p_lock);
1368 1373                          prbarrier(p);   /* make sure /proc is blocked out */
1369 1374                          as = p->p_as;
1370 1375                          p->p_as = &kas;
1371 1376                          mutex_exit(&p->p_lock);
1372 1377  
1373 1378                          /*
1374 1379                           * notify hat of change in thread's address space
1375 1380                           */
1376 1381                          hat_thread_exit(curthread);
1377 1382                          kpreempt_enable();
1378 1383  
                                   /*
                                    * The old address space is freed only after p_as
                                    * has been switched and p_lock dropped.
                                    */
1379 1384                          as_free(as);
1380 1385                          p->p_tr_lgrpid = LGRP_NONE;
1381 1386                  }
1382 1387          }
1383 1388  }
1384 1389  
1385 1390  /*
1386 1391   * Wait for child to exec or exit.
1387 1392   * Called by parent of vfork'ed process.
1388 1393   * See important comments in relvm(), above.
            *
            * pid is the process ID of the vfork()ed child.  It is looked up afresh
            * with prfind() on every pass of the loop; no pointer to the child is
            * kept across the wait.
            *
            * The loop ends when prfind() finds no such process, when the process
            * found does not have the caller's process as p_parent, or when the
            * child's SVFORK flag is clear.  Afterwards the parent's watchpoints
            * are re-established with as_setwatch() if pr_watch_active(pp) reports
            * any, and continuelwps(pp) is called with pp->p_lock held.
1389 1394   */
1390 1395  void
1391 1396  vfwait(pid_t pid)
1392 1397  {
1393 1398          int signalled = 0;      /* set once cv_wait_sig() returns 0 */
1394 1399          proc_t *pp = ttoproc(curthread);
1395 1400          proc_t *cp;
1396 1401  
1397 1402          /*
1398 1403           * Wait for child to exec or exit.
1399 1404           */
1400 1405          for (;;) {
1401 1406                  mutex_enter(&pidlock);
1402 1407                  cp = prfind(pid);
1403 1408                  if (cp == NULL || cp->p_parent != pp) {
1404 1409                          /*
1405 1410                           * Child has exit()ed.
                                    * Either no process has this pid any more, or
                                    * the one found does not have us as p_parent.
1406 1411                           */
1407 1412                          mutex_exit(&pidlock);
1408 1413                          break;
1409 1414                  }
1410 1415                  /*
1411 1416                   * Grab the child's p_lock before releasing pidlock.
1412 1417                   * Otherwise, the child could exit and we would be
1413 1418                   * referencing invalid memory.
1414 1419                   */
1415 1420                  mutex_enter(&cp->p_lock);
1416 1421                  mutex_exit(&pidlock);
1417 1422                  if (!(cp->p_flag & SVFORK)) {
1418 1423                          /*
1419 1424                           * Child has exec()ed or is exit()ing.
1420 1425                           */
1421 1426                          mutex_exit(&cp->p_lock);
1422 1427                          break;
1423 1428                  }
                           /*
                            * Take our own p_lock before dropping the child's, so
                            * there is no unlocked gap between the SVFORK check
                            * above and the cv_wait below.
                            * NOTE(review): presumably this pairs with relvm()
                            * holding pp->p_lock around cv_signal(&pp->p_cv), so
                            * that the wakeup cannot be missed -- confirm.
                            */
1424 1429                  mutex_enter(&pp->p_lock);
1425 1430                  mutex_exit(&cp->p_lock);
1426 1431                  /*
1427 1432                   * We might be waked up spuriously from the cv_wait().
1428 1433                   * We have to do the whole operation over again to be
1429 1434                   * sure the child's SVFORK flag really is turned off.
1430 1435                   * We cannot make reference to the child because it can
1431 1436                   * exit before we return and we would be referencing
1432 1437                   * invalid memory.
1433 1438                   *
1434 1439                   * Because this is potentially a very long-term wait,
1435 1440                   * we call cv_wait_sig() (for its jobcontrol and /proc
1436 1441                   * side-effects) unless there is a current signal, in
1437 1442                   * which case we use cv_wait() because we cannot return
1438 1443                   * from this function until the child has released the
1439 1444                   * address space.  Calling cv_wait_sig() with a current
1440 1445                   * signal would lead to an indefinite loop here because
1441 1446                   * cv_wait_sig() returns immediately in this case.
                            *
                            * Once cv_wait_sig() has returned 0, 'signalled' stays
                            * set and every later pass uses plain cv_wait().
1442 1447                   */
1443 1448                  if (signalled)
1444 1449                          cv_wait(&pp->p_cv, &pp->p_lock);
1445 1450                  else
1446 1451                          signalled = !cv_wait_sig(&pp->p_cv, &pp->p_lock);
1447 1452                  mutex_exit(&pp->p_lock);
1448 1453          }
1449 1454  
1450 1455          /* restore watchpoints to parent */
1451 1456          if (pr_watch_active(pp)) {
1452 1457                  struct as *as = pp->p_as;
1453 1458                  AS_LOCK_ENTER(as, RW_WRITER);
1454 1459                  as_setwatch(as);
1455 1460                  AS_LOCK_EXIT(as);
1456 1461          }
1457 1462  
1458 1463          mutex_enter(&pp->p_lock);
1459 1464          prbarrier(pp);  /* barrier against /proc locking */
1460 1465          continuelwps(pp);
1461 1466          mutex_exit(&pp->p_lock);
1462 1467  }
  
    | ↓ open down ↓ | 363 lines elided | ↑ open up ↑ | 
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX