illumos-bug3121 Wdiff usr/src/cmd/svc/startd/method.c

Print this page

3121 missing SMF method directories should say something useful
Reviewed by: Gary Mills <gary_mills@fastmail.fm>
Reviewed by: T Nguyen <truongqnguien@gmail.com>
Reviewed by: Richard Elling <richard.elling@gmail.com>

Split	Close
Expand all
Collapse all

          --- old/usr/src/cmd/svc/startd/method.c
          +++ new/usr/src/cmd/svc/startd/method.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   * Copyright 2011 Joyent Inc.
  25   25   */
  26   26  
  27   27  /*
  28   28   * method.c - method execution functions
  29   29   *
  30   30   * This file contains the routines needed to run a method:  a fork(2)-exec(2)
  31   31   * invocation monitored using either the contract filesystem or waitpid(2).
  32   32   * (Plain fork1(2) support is provided in fork.c.)
  33   33   *
  34   34   * Contract Transfer
  35   35   *   When we restart a service, we want to transfer any contracts that the old
  36   36   *   service's contract inherited.  This means that (a) we must not abandon the
  37   37   *   old contract when the service dies and (b) we must write the id of the old
  38   38   *   contract into the terms of the new contract.  There should be limits to
  39   39   *   (a), though, since we don't want to keep the contract around forever.  To
  40   40   *   this end we'll say that services in the offline state may have a contract
  41   41   *   to be transferred and services in the disabled or maintenance states cannot.
  42   42   *   This means that when a service transitions from online (or degraded) to
  43   43   *   offline, the contract should be preserved, and when the service transitions
  44   44   *   from offline to online (i.e., the start method), we'll transfer inherited
  45   45   *   contracts.
  46   46   */
  47   47  
  48   48  #include <sys/contract/process.h>
  49   49  #include <sys/ctfs.h>
  50   50  #include <sys/stat.h>
  51   51  #include <sys/time.h>
  52   52  #include <sys/types.h>
  53   53  #include <sys/uio.h>
  54   54  #include <sys/wait.h>
  55   55  #include <alloca.h>
  56   56  #include <assert.h>
  57   57  #include <errno.h>
  58   58  #include <fcntl.h>
  59   59  #include <libcontract.h>
  60   60  #include <libcontract_priv.h>
  61   61  #include <libgen.h>
  62   62  #include <librestart.h>
  63   63  #include <libscf.h>
  64   64  #include <limits.h>
  65   65  #include <port.h>
  66   66  #include <sac.h>
  67   67  #include <signal.h>
  68   68  #include <stdlib.h>
  69   69  #include <string.h>
  70   70  #include <strings.h>
  71   71  #include <unistd.h>
  72   72  #include <atomic.h>
  73   73  #include <poll.h>
  74   74  #include <libscf_priv.h>
  75   75  
  76   76  #include "startd.h"
  77   77  
  78   78  #define SBIN_SH         "/sbin/sh"
  79   79  
  80   80  /*
  81   81   * Used to tell if contracts are in the process of being
  82   82   * stored into the svc.startd internal hash table.
  83   83   */
  84   84  volatile uint16_t       storing_contract = 0;
  85   85  
  86   86  /*
  87   87   * Mapping from restart_on method-type to contract events.  Must correspond to
  88   88   * enum method_restart_t.
  89   89   */
  90   90  static uint_t method_events[] = {
  91   91          /* METHOD_RESTART_ALL */
  92   92          CT_PR_EV_HWERR | CT_PR_EV_SIGNAL | CT_PR_EV_CORE | CT_PR_EV_EMPTY,
  93   93          /* METHOD_RESTART_EXTERNAL_FAULT */
  94   94          CT_PR_EV_HWERR | CT_PR_EV_SIGNAL,
  95   95          /* METHOD_RESTART_ANY_FAULT */
  96   96          CT_PR_EV_HWERR | CT_PR_EV_SIGNAL | CT_PR_EV_CORE
  97   97  };
  98   98  
  99   99  /*
 100  100   * method_record_start(restarter_inst_t *)
 101  101   *   Record a service start for rate limiting.  Place the current time
 102  102   *   in the circular array of instance starts.
 103  103   */
 104  104  static void
 105  105  method_record_start(restarter_inst_t *inst)
 106  106  {
 107  107          int index = inst->ri_start_index++ % RINST_START_TIMES;
 108  108  
 109  109          inst->ri_start_time[index] = gethrtime();
 110  110  }
 111  111  
 112  112  /*
 113  113   * method_rate_critical(restarter_inst_t *)
 114  114   *    Return true if the average start interval is less than the permitted
 115  115   *    interval.  The implicit interval defaults to RINST_FAILURE_RATE_NS and
 116  116   *    RINST_START_TIMES but may be overridden with the svc properties
 117  117   *    startd/critical_failure_count and startd/critical_failure_period
 118  118   *    which represent the number of failures to consider and the amount of
 119  119   *    time in seconds in which that number may occur, respectively. Note that
 120  120   *    this time is measured as of the transition to 'enabled' rather than wall
 121  121   *    clock time.
 122  122   *    Implicit success if insufficient measurements for an average exist.
 123  123   */
 124  124  static int
 125  125  method_rate_critical(restarter_inst_t *inst)
 126  126  {
 127  127          hrtime_t critical_failure_period = RINST_FAILURE_RATE_NS;
 128  128          uint_t critical_failure_count = RINST_START_TIMES;
 129  129          uint_t n = inst->ri_start_index;
 130  130          hrtime_t avg_ns = 0;
 131  131          uint64_t scf_fr, scf_st;
 132  132          scf_propvec_t *prop = NULL;
 133  133          scf_propvec_t restart_critical[] = {
 134  134                  { "critical_failure_period", NULL, SCF_TYPE_INTEGER, NULL, 0 },
 135  135                  { "critical_failure_count", NULL, SCF_TYPE_INTEGER, NULL, 0 },
 136  136                  { NULL }
 137  137          };
 138  138  
 139  139          restart_critical[0].pv_ptr = &scf_fr;
 140  140          restart_critical[1].pv_ptr = &scf_st;
 141  141  
 142  142          if (scf_read_propvec(inst->ri_i.i_fmri, "startd",
 143  143              B_TRUE, restart_critical, &prop) != SCF_FAILED) {
 144  144                  /*
 145  145                   * critical_failure_period is expressed
 146  146                   * in seconds but tracked in ns
 147  147                   */
 148  148                  critical_failure_period = (hrtime_t)scf_fr * NANOSEC;
 149  149                  critical_failure_count = (uint_t)scf_st;
 150  150          }
 151  151          if (inst->ri_start_index < critical_failure_count)
 152  152                  return (0);
 153  153  
 154  154          avg_ns =
 155  155              (inst->ri_start_time[(n - 1) % critical_failure_count] -
 156  156              inst->ri_start_time[n % critical_failure_count]) /
 157  157              (critical_failure_count - 1);
 158  158  
 159  159          return (avg_ns < critical_failure_period);
 160  160  }
 161  161  
 162  162  /*
 163  163   * int method_is_transient()
 164  164   *   Determine if the method for the given instance is transient,
 165  165   *   from a contract perspective. Return 1 if it is, and 0 if it isn't.
 166  166   */
 167  167  static int
 168  168  method_is_transient(restarter_inst_t *inst, int type)
 169  169  {
 170  170          if (instance_is_transient_style(inst) || type != METHOD_START)
 171  171                  return (1);
 172  172          else
 173  173                  return (0);
 174  174  }
 175  175  
 176  176  /*
 177  177   * void method_store_contract()
 178  178   *   Store the newly created contract id into local structures and
 179  179   *   the repository.  If the repository connection is broken it is rebound.
 180  180   */
 181  181  static void
 182  182  method_store_contract(restarter_inst_t *inst, int type, ctid_t *cid)
 183  183  {
 184  184          int r;
 185  185          boolean_t primary;
 186  186  
 187  187          if (errno = contract_latest(cid))
 188  188                  uu_die("%s: Couldn't get new contract's id", inst->ri_i.i_fmri);
 189  189  
 190  190          primary = !method_is_transient(inst, type);
 191  191  
 192  192          if (!primary) {
 193  193                  if (inst->ri_i.i_transient_ctid != 0) {
 194  194                          log_framework(LOG_INFO,
 195  195                              "%s: transient ctid expected to be 0 but "
 196  196                              "was set to %ld\n", inst->ri_i.i_fmri,
 197  197                              inst->ri_i.i_transient_ctid);
 198  198                  }
 199  199  
 200  200                  inst->ri_i.i_transient_ctid = *cid;
 201  201          } else {
 202  202                  if (inst->ri_i.i_primary_ctid != 0) {
 203  203                          /*
 204  204                           * There was an old contract that we transferred.
 205  205                           * Remove it.
 206  206                           */
 207  207                          method_remove_contract(inst, B_TRUE, B_FALSE);
 208  208                  }
 209  209  
 210  210                  if (inst->ri_i.i_primary_ctid != 0) {
 211  211                          log_framework(LOG_INFO,
 212  212                              "%s: primary ctid expected to be 0 but "
 213  213                              "was set to %ld\n", inst->ri_i.i_fmri,
 214  214                              inst->ri_i.i_primary_ctid);
 215  215                  }
 216  216  
 217  217                  inst->ri_i.i_primary_ctid = *cid;
 218  218                  inst->ri_i.i_primary_ctid_stopped = 0;
 219  219  
 220  220                  log_framework(LOG_DEBUG, "Storing primary contract %ld for "
 221  221                      "%s.\n", *cid, inst->ri_i.i_fmri);
 222  222  
 223  223                  contract_hash_store(*cid, inst->ri_id);
 224  224          }
 225  225  
 226  226  again:
 227  227          if (inst->ri_mi_deleted)
 228  228                  return;
 229  229  
 230  230          r = restarter_store_contract(inst->ri_m_inst, *cid, primary ?
 231  231              RESTARTER_CONTRACT_PRIMARY : RESTARTER_CONTRACT_TRANSIENT);
 232  232          switch (r) {
 233  233          case 0:
 234  234                  break;
 235  235  
 236  236          case ECANCELED:
 237  237                  inst->ri_mi_deleted = B_TRUE;
 238  238                  break;
 239  239  
 240  240          case ECONNABORTED:
 241  241                  libscf_handle_rebind(scf_instance_handle(inst->ri_m_inst));
 242  242                  /* FALLTHROUGH */
 243  243  
 244  244          case EBADF:
 245  245                  libscf_reget_instance(inst);
 246  246                  goto again;
 247  247  
 248  248          case ENOMEM:
 249  249          case EPERM:
 250  250          case EACCES:
 251  251          case EROFS:
 252  252                  uu_die("%s: Couldn't store contract id %ld",
 253  253                      inst->ri_i.i_fmri, *cid);
 254  254                  /* NOTREACHED */
 255  255  
 256  256          case EINVAL:
 257  257          default:
 258  258                  bad_error("restarter_store_contract", r);
 259  259          }
 260  260  }
 261  261  
 262  262  /*
 263  263   * void method_remove_contract()
 264  264   *   Remove any non-permanent contracts from internal structures and
 265  265   *   the repository, then abandon them.
 266  266   *   Returns
 267  267   *     0 - success
 268  268   *     ECANCELED - inst was deleted from the repository
 269  269   *
 270  270   *   If the repository connection was broken, it is rebound.
 271  271   */
 272  272  void
 273  273  method_remove_contract(restarter_inst_t *inst, boolean_t primary,
 274  274      boolean_t abandon)
 275  275  {
 276  276          ctid_t * const ctidp = primary ? &inst->ri_i.i_primary_ctid :
 277  277              &inst->ri_i.i_transient_ctid;
 278  278  
 279  279          int r;
 280  280  
 281  281          assert(*ctidp != 0);
 282  282  
 283  283          log_framework(LOG_DEBUG, "Removing %s contract %lu for %s.\n",
 284  284              primary ? "primary" : "transient", *ctidp, inst->ri_i.i_fmri);
 285  285  
 286  286          if (abandon)
 287  287                  contract_abandon(*ctidp);
 288  288  
 289  289  again:
 290  290          if (inst->ri_mi_deleted) {
 291  291                  r = ECANCELED;
 292  292                  goto out;
 293  293          }
 294  294  
 295  295          r = restarter_remove_contract(inst->ri_m_inst, *ctidp, primary ?
 296  296              RESTARTER_CONTRACT_PRIMARY : RESTARTER_CONTRACT_TRANSIENT);
 297  297          switch (r) {
 298  298          case 0:
 299  299                  break;
 300  300  
 301  301          case ECANCELED:
 302  302                  inst->ri_mi_deleted = B_TRUE;
 303  303                  break;
 304  304  
 305  305          case ECONNABORTED:
 306  306                  libscf_handle_rebind(scf_instance_handle(inst->ri_m_inst));
 307  307                  /* FALLTHROUGH */
 308  308  
 309  309          case EBADF:
 310  310                  libscf_reget_instance(inst);
 311  311                  goto again;
 312  312  
 313  313          case ENOMEM:
 314  314          case EPERM:
 315  315          case EACCES:
 316  316          case EROFS:
 317  317                  log_error(LOG_INFO, "%s: Couldn't remove contract id %ld: "
 318  318                      "%s.\n", inst->ri_i.i_fmri, *ctidp, strerror(r));
 319  319                  break;
 320  320  
 321  321          case EINVAL:
 322  322          default:
 323  323                  bad_error("restarter_remove_contract", r);
 324  324          }
 325  325  
 326  326  out:
 327  327          if (primary)
 328  328                  contract_hash_remove(*ctidp);
 329  329  
 330  330          *ctidp = 0;
 331  331  }
 332  332  
 333  333  static const char *method_names[] = { "start", "stop", "refresh" };
 334  334  
 335  335  /*
 336  336   * int method_ready_contract(restarter_inst_t *, int, method_restart_t, int)
 337  337   *
 338  338   *   Activate a contract template for the type method of inst.  type,
 339  339   *   restart_on, and cte_mask dictate the critical events term of the contract.
 340  340   *   Returns
 341  341   *     0 - success
 342  342   *     ECANCELED - inst has been deleted from the repository
 343  343   */
 344  344  static int
 345  345  method_ready_contract(restarter_inst_t *inst, int type,
 346  346      method_restart_t restart_on, uint_t cte_mask)
 347  347  {
 348  348          int tmpl, err, istrans, iswait, ret;
 349  349          uint_t cevents, fevents;
 350  350  
 351  351          /*
 352  352           * Correctly supporting wait-style services is tricky without
 353  353           * rearchitecting startd to cope with multiple event sources
 354  354           * simultaneously trying to stop an instance.  Until a better
 355  355           * solution is implemented, we avoid this problem for
 356  356           * wait-style services by making contract events fatal and
 357  357           * letting the wait code alone handle stopping the service.
 358  358           */
 359  359          iswait = instance_is_wait_style(inst);
 360  360          istrans = method_is_transient(inst, type);
 361  361  
 362  362          tmpl = open64(CTFS_ROOT "/process/template", O_RDWR);
 363  363          if (tmpl == -1)
 364  364                  uu_die("Could not create contract template");
 365  365  
 366  366          /*
 367  367           * We assume non-login processes are unlikely to create
 368  368           * multiple process groups, and set CT_PR_PGRPONLY for all
 369  369           * wait-style services' contracts.
 370  370           */
 371  371          err = ct_pr_tmpl_set_param(tmpl, CT_PR_INHERIT | CT_PR_REGENT |
 372  372              (iswait ? CT_PR_PGRPONLY : 0));
 373  373          assert(err == 0);
 374  374  
 375  375          if (istrans) {
 376  376                  cevents = 0;
 377  377                  fevents = 0;
 378  378          } else {
 379  379                  assert(restart_on >= 0);
 380  380                  assert(restart_on <= METHOD_RESTART_ANY_FAULT);
 381  381                  cevents = method_events[restart_on] & ~cte_mask;
 382  382                  fevents = iswait ?
 383  383                      (method_events[restart_on] & ~cte_mask & CT_PR_ALLFATAL) :
 384  384                      0;
 385  385          }
 386  386  
 387  387          err = ct_tmpl_set_critical(tmpl, cevents);
 388  388          assert(err == 0);
 389  389  
 390  390          err = ct_tmpl_set_informative(tmpl, 0);
 391  391          assert(err == 0);
 392  392          err = ct_pr_tmpl_set_fatal(tmpl, fevents);
 393  393          assert(err == 0);
 394  394  
 395  395          err = ct_tmpl_set_cookie(tmpl, istrans ?  METHOD_OTHER_COOKIE :
 396  396              METHOD_START_COOKIE);
 397  397          assert(err == 0);
 398  398  
 399  399          if (type == METHOD_START && inst->ri_i.i_primary_ctid != 0) {
 400  400                  ret = ct_pr_tmpl_set_transfer(tmpl, inst->ri_i.i_primary_ctid);
 401  401                  switch (ret) {
 402  402                  case 0:
 403  403                          break;
 404  404  
 405  405                  case ENOTEMPTY:
 406  406                          /* No contracts for you! */
 407  407                          method_remove_contract(inst, B_TRUE, B_TRUE);
 408  408                          if (inst->ri_mi_deleted) {
 409  409                                  ret = ECANCELED;
 410  410                                  goto out;
 411  411                          }
 412  412                          break;
 413  413  
 414  414                  case EINVAL:
 415  415                  case ESRCH:
 416  416                  case EACCES:
 417  417                  default:
 418  418                          bad_error("ct_pr_tmpl_set_transfer", ret);
 419  419                  }
 420  420          }
 421  421  
 422  422          err = ct_pr_tmpl_set_svc_fmri(tmpl, inst->ri_i.i_fmri);
 423  423          assert(err == 0);
 424  424          err = ct_pr_tmpl_set_svc_aux(tmpl, method_names[type]);
 425  425          assert(err == 0);
 426  426  
 427  427          err = ct_tmpl_activate(tmpl);
 428  428          assert(err == 0);
 429  429  
 430  430          ret = 0;
 431  431  
 432  432  out:
 433  433          err = close(tmpl);
 434  434          assert(err == 0);
 435  435  
 436  436          return (ret);
 437  437  }
 438  438  
 439  439  static void
 440  440  exec_method(const restarter_inst_t *inst, int type, const char *method,
 441  441      struct method_context *mcp, uint8_t need_session)
 442  442  {
 443  443          char *cmd;
 444  444          const char *errf;
 445  445          char **nenv;
 446  446          int rsmc_errno = 0;
 447  447  
 448  448          cmd = uu_msprintf("exec %s", method);
 449  449  
 450  450          if (inst->ri_utmpx_prefix[0] != '\0' && inst->ri_utmpx_prefix != NULL)
 451  451                  (void) utmpx_mark_init(getpid(), inst->ri_utmpx_prefix);
 452  452  
 453  453          setlog(inst->ri_logstem);
 454  454          log_instance(inst, B_FALSE, "Executing %s method (\"%s\").",
 455  455              method_names[type], method);
 456  456  
 457  457          if (need_session)
 458  458                  (void) setpgrp();
 459  459  
 460  460          /* Set credentials. */
 461  461          rsmc_errno = restarter_set_method_context(mcp, &errf);
 462  462          if (rsmc_errno != 0) {
 463  463                  log_instance(inst, B_FALSE,
 464  464                      "svc.startd could not set context for method: ");
 465  465  
 466  466                  if (rsmc_errno == -1) {
 467  467                          if (strcmp(errf, "core_set_process_path") == 0) {
 468  468                                  log_instance(inst, B_FALSE,
 469  469                                      "Could not set corefile path.");
 470  470                          } else if (strcmp(errf, "setproject") == 0) {
 471  471                                  log_instance(inst, B_FALSE, "%s: a resource "
 472  472                                      "control assignment failed", errf);
 473  473                          } else if (strcmp(errf, "pool_set_binding") == 0) {
 474  474                                  log_instance(inst, B_FALSE, "%s: a system "
 475  475                                      "error occurred", errf);
 476  476                          } else {
 477  477  #ifndef NDEBUG
 478  478                                  uu_warn("%s:%d: Bad function name \"%s\" for "
 479  479                                      "error %d from "
 480  480                                      "restarter_set_method_context().\n",
 481  481                                      __FILE__, __LINE__, errf, rsmc_errno);
 482  482  #endif
 483  483                                  abort();
 484  484                          }
 485  485  
 486  486                          exit(1);
 487  487                  }
 488  488  
 489  489                  if (errf != NULL && strcmp(errf, "pool_set_binding") == 0) {
 490  490                          switch (rsmc_errno) {
 491  491                          case ENOENT:
 492  492                                  log_instance(inst, B_FALSE, "%s: the pool "
 493  493                                      "could not be found", errf);
 494  494                                  break;
 495  495  
 496  496                          case EBADF:
 497  497                                  log_instance(inst, B_FALSE, "%s: the "
 498  498                                      "configuration is invalid", errf);
 499  499                                  break;
 500  500  
 501  501                          case EINVAL:
 502  502                                  log_instance(inst, B_FALSE, "%s: pool name "
 503  503                                      "\"%s\" is invalid", errf,
 504  504                                      mcp->resource_pool);
 505  505                                  break;
 506  506  
 507  507                          default:
 508  508  #ifndef NDEBUG

↓ open down ↓

508 lines elided

↑ open up ↑

 509  509                                  uu_warn("%s:%d: Bad error %d for function %s "
 510  510                                      "in restarter_set_method_context().\n",
 511  511                                      __FILE__, __LINE__, rsmc_errno, errf);
 512  512  #endif
 513  513                                  abort();
 514  514                          }
 515  515  
 516  516                          exit(SMF_EXIT_ERR_CONFIG);
 517  517                  }
 518  518  
      519 +                if (errf != NULL && strcmp(errf, "chdir") == 0) {
      520 +                        switch (rsmc_errno) {
      521 +                        case EACCES:
      522 +                        case EFAULT:
      523 +                        case EIO:
      524 +                        case ELOOP:
      525 +                        case ENAMETOOLONG:
      526 +                        case ENOENT:
      527 +                        case ENOLINK:
      528 +                        case ENOTDIR:
      529 +                                log_instance(inst, B_FALSE, "%s: %s (\"%s\")",
      530 +                                    errf,
      531 +                                    strerror(rsmc_errno), mcp->working_dir);
      532 +                                break;
      533 +
      534 +                        default:
      535 +#ifndef NDEBUG
      536 +                                uu_warn("%s:%d: Bad error %d for function %s "
      537 +                                    "in restarter_set_method_context().\n",
      538 +                                    __FILE__, __LINE__, rsmc_errno, errf);
      539 +#endif
      540 +                                abort();
      541 +                        }
      542 +
      543 +                        exit(SMF_EXIT_ERR_CONFIG);
      544 +                }
      545 +
 519  546                  if (errf != NULL) {
 520  547                          errno = rsmc_errno;
 521  548                          perror(errf);
 522  549  
 523  550                          switch (rsmc_errno) {
 524  551                          case EINVAL:
 525  552                          case EPERM:
 526  553                          case ENOENT:
 527  554                          case ENAMETOOLONG:
 528  555                          case ERANGE:

 529  556                          case ESRCH:
 530  557                                  exit(SMF_EXIT_ERR_CONFIG);
 531  558                                  /* NOTREACHED */
 532  559  
 533  560                          default:
 534  561                                  exit(1);
 535  562                          }
 536  563                  }
 537  564  
 538  565                  switch (rsmc_errno) {
 539  566                  case ENOMEM:
 540  567                          log_instance(inst, B_FALSE, "Out of memory.");
 541  568                          exit(1);
 542  569                          /* NOTREACHED */
 543  570  
 544  571                  case ENOENT:
 545  572                          log_instance(inst, B_FALSE, "Missing passwd entry for "
 546  573                              "user.");
 547  574                          exit(SMF_EXIT_ERR_CONFIG);
 548  575                          /* NOTREACHED */
 549  576  
 550  577                  default:
 551  578  #ifndef NDEBUG
 552  579                          uu_warn("%s:%d: Bad miscellaneous error %d from "
 553  580                              "restarter_set_method_context().\n", __FILE__,
 554  581                              __LINE__, rsmc_errno);
 555  582  #endif
 556  583                          abort();
 557  584                  }
 558  585          }
 559  586  
 560  587          nenv = set_smf_env(mcp->env, mcp->env_sz, NULL, inst,
 561  588              method_names[type]);
 562  589  
 563  590          log_preexec();
 564  591  
 565  592          (void) execle(SBIN_SH, SBIN_SH, "-c", cmd, NULL, nenv);
 566  593  
 567  594          exit(10);
 568  595  }
 569  596  
 570  597  static void
 571  598  write_status(restarter_inst_t *inst, const char *mname, int stat)
 572  599  {
 573  600          int r;
 574  601  
 575  602  again:
 576  603          if (inst->ri_mi_deleted)
 577  604                  return;
 578  605  
 579  606          r = libscf_write_method_status(inst->ri_m_inst, mname, stat);
 580  607          switch (r) {
 581  608          case 0:
 582  609                  break;
 583  610  
 584  611          case ECONNABORTED:
 585  612                  libscf_reget_instance(inst);
 586  613                  goto again;
 587  614  
 588  615          case ECANCELED:
 589  616                  inst->ri_mi_deleted = 1;
 590  617                  break;
 591  618  
 592  619          case EPERM:
 593  620          case EACCES:
 594  621          case EROFS:
 595  622                  log_framework(LOG_INFO, "Could not write exit status "
 596  623                      "for %s method of %s: %s.\n", mname,
 597  624                      inst->ri_i.i_fmri, strerror(r));
 598  625                  break;
 599  626  
 600  627          case ENAMETOOLONG:
 601  628          default:
 602  629                  bad_error("libscf_write_method_status", r);
 603  630          }
 604  631  }
 605  632  
 606  633  /*
 607  634   * int method_run()
 608  635   *   Execute the type method of instp.  If it requires a fork(), wait for it
 609  636   *   to return and return its exit code in *exit_code.  Otherwise set
 610  637   *   *exit_code to 0 if the method succeeds & -1 if it fails.  If the
 611  638   *   repository connection is broken, it is rebound, but inst may not be
 612  639   *   reset.
 613  640   *   Returns
 614  641   *     0 - success
 615  642   *     EINVAL - A correct method or method context couldn't be retrieved.
 616  643   *     EIO - Contract kill failed.
 617  644   *     EFAULT - Method couldn't be executed successfully.
 618  645   *     ELOOP - Retry threshold exceeded.
 619  646   *     ECANCELED - inst was deleted from the repository before method was run
 620  647   *     ERANGE - Timeout retry threshold exceeded.
 621  648   *     EAGAIN - Failed due to external cause, retry.
 622  649   */
 623  650  int
 624  651  method_run(restarter_inst_t **instp, int type, int *exit_code)
 625  652  {
 626  653          char *method;
 627  654          int ret_status;
 628  655          pid_t pid;
 629  656          method_restart_t restart_on;
 630  657          uint_t cte_mask;
 631  658          uint8_t need_session;
 632  659          scf_handle_t *h;
 633  660          scf_snapshot_t *snap;
 634  661          const char *mname;
 635  662          mc_error_t *m_error;
 636  663          struct method_context *mcp;
 637  664          int result = 0, timeout_fired = 0;
 638  665          int sig, r;
 639  666          boolean_t transient;
 640  667          uint64_t timeout;
 641  668          uint8_t timeout_retry;
 642  669          ctid_t ctid;
 643  670          int ctfd = -1;
 644  671          restarter_inst_t *inst = *instp;
 645  672          int id = inst->ri_id;
 646  673          int forkerr;
 647  674  
 648  675          assert(MUTEX_HELD(&inst->ri_lock));
 649  676          assert(instance_in_transition(inst));
 650  677  
 651  678          if (inst->ri_mi_deleted)
 652  679                  return (ECANCELED);
 653  680  
 654  681          *exit_code = 0;
 655  682  
 656  683          assert(0 <= type && type <= 2);
 657  684          mname = method_names[type];
 658  685  
 659  686          if (type == METHOD_START)
 660  687                  inst->ri_pre_online_hook();
 661  688  
 662  689          h = scf_instance_handle(inst->ri_m_inst);
 663  690  
 664  691          snap = scf_snapshot_create(h);
 665  692          if (snap == NULL ||
 666  693              scf_instance_get_snapshot(inst->ri_m_inst, "running", snap) != 0) {
 667  694                  log_framework(LOG_DEBUG,
 668  695                      "Could not get running snapshot for %s.  "
 669  696                      "Using editing version to run method %s.\n",
 670  697                      inst->ri_i.i_fmri, mname);
 671  698                  scf_snapshot_destroy(snap);
 672  699                  snap = NULL;
 673  700          }
 674  701  
 675  702          /*
 676  703           * After this point, we may be logging to the instance log.
 677  704           * Make sure we've noted where that log is as a property of
 678  705           * the instance.
 679  706           */
 680  707          r = libscf_note_method_log(inst->ri_m_inst, st->st_log_prefix,
 681  708              inst->ri_logstem);
 682  709          if (r != 0) {
 683  710                  log_framework(LOG_WARNING,
 684  711                      "%s: couldn't note log location: %s\n",
 685  712                      inst->ri_i.i_fmri, strerror(r));
 686  713          }
 687  714  
 688  715          if ((method = libscf_get_method(h, type, inst, snap, &restart_on,
 689  716              &cte_mask, &need_session, &timeout, &timeout_retry)) == NULL) {
 690  717                  if (errno == LIBSCF_PGROUP_ABSENT)  {
 691  718                          log_framework(LOG_DEBUG,
 692  719                              "%s: instance has no method property group '%s'.\n",
 693  720                              inst->ri_i.i_fmri, mname);
 694  721                          if (type == METHOD_REFRESH)
 695  722                                  log_instance(inst, B_TRUE, "No '%s' method "
 696  723                                      "defined.  Treating as :true.", mname);
 697  724                          else
 698  725                                  log_instance(inst, B_TRUE, "Method property "
 699  726                                      "group '%s' is not present.", mname);
 700  727                          scf_snapshot_destroy(snap);
 701  728                          return (0);
 702  729                  } else if (errno == LIBSCF_PROPERTY_ABSENT)  {
 703  730                          log_framework(LOG_DEBUG,
 704  731                              "%s: instance has no '%s/exec' method property.\n",
 705  732                              inst->ri_i.i_fmri, mname);
 706  733                          log_instance(inst, B_TRUE, "Method property '%s/exec "
 707  734                              "is not present.", mname);
 708  735                          scf_snapshot_destroy(snap);
 709  736                          return (0);
 710  737                  } else {
 711  738                          log_error(LOG_WARNING,
 712  739                              "%s: instance libscf_get_method failed\n",
 713  740                              inst->ri_i.i_fmri);
 714  741                          scf_snapshot_destroy(snap);
 715  742                          return (EINVAL);
 716  743                  }
 717  744          }
 718  745  
 719  746          /* open service contract if stopping a non-transient service */
 720  747          if (type == METHOD_STOP && (!instance_is_transient_style(inst))) {
 721  748                  if (inst->ri_i.i_primary_ctid == 0) {
 722  749                          /* service is not running, nothing to stop */
 723  750                          log_framework(LOG_DEBUG, "%s: instance has no primary "
 724  751                              "contract, no service to stop.\n",
 725  752                              inst->ri_i.i_fmri);
 726  753                          scf_snapshot_destroy(snap);
 727  754                          return (0);
 728  755                  }
 729  756                  if ((ctfd = contract_open(inst->ri_i.i_primary_ctid, "process",
 730  757                      "events", O_RDONLY)) < 0) {
 731  758                          result = EFAULT;
 732  759                          log_instance(inst, B_TRUE, "Could not open service "
 733  760                              "contract %ld.  Stop method not run.",
 734  761                              inst->ri_i.i_primary_ctid);
 735  762                          goto out;
 736  763                  }
 737  764          }
 738  765  
 739  766          if (restarter_is_null_method(method)) {
 740  767                  log_framework(LOG_DEBUG, "%s: null method succeeds\n",
 741  768                      inst->ri_i.i_fmri);
 742  769  
 743  770                  log_instance(inst, B_TRUE, "Executing %s method (null).",
 744  771                      mname);
 745  772  
 746  773                  if (type == METHOD_START)
 747  774                          write_status(inst, mname, 0);
 748  775                  goto out;
 749  776          }
 750  777  
 751  778          sig = restarter_is_kill_method(method);
 752  779          if (sig >= 0) {
 753  780  
 754  781                  if (inst->ri_i.i_primary_ctid == 0) {
 755  782                          log_error(LOG_ERR, "%s: :kill with no contract\n",
 756  783                              inst->ri_i.i_fmri);
 757  784                          log_instance(inst, B_TRUE, "Invalid use of \":kill\" "
 758  785                              "as stop method for transient service.");
 759  786                          result = EINVAL;
 760  787                          goto out;
 761  788                  }
 762  789  
 763  790                  log_framework(LOG_DEBUG,
 764  791                      "%s: :killing contract with signal %d\n",
 765  792                      inst->ri_i.i_fmri, sig);
 766  793  
 767  794                  log_instance(inst, B_TRUE, "Executing %s method (:kill).",
 768  795                      mname);
 769  796  
 770  797                  if (contract_kill(inst->ri_i.i_primary_ctid, sig,
 771  798                      inst->ri_i.i_fmri) != 0) {
 772  799                          result = EIO;
 773  800                          goto out;
 774  801                  } else
 775  802                          goto assured_kill;
 776  803          }
 777  804  
 778  805          log_framework(LOG_DEBUG, "%s: forking to run method %s\n",
 779  806              inst->ri_i.i_fmri, method);
 780  807  
 781  808          m_error = restarter_get_method_context(RESTARTER_METHOD_CONTEXT_VERSION,
 782  809              inst->ri_m_inst, snap, mname, method, &mcp);
 783  810  
 784  811          if (m_error != NULL) {
 785  812                  log_instance(inst, B_TRUE, "%s", m_error->msg);
 786  813                  restarter_mc_error_destroy(m_error);
 787  814                  result = EINVAL;
 788  815                  goto out;
 789  816          }
 790  817  
 791  818          r = method_ready_contract(inst, type, restart_on, cte_mask);
 792  819          if (r != 0) {
 793  820                  assert(r == ECANCELED);
 794  821                  assert(inst->ri_mi_deleted);
 795  822                  restarter_free_method_context(mcp);
 796  823                  result = ECANCELED;
 797  824                  goto out;
 798  825          }
 799  826  
 800  827          /*
 801  828           * Validate safety of method contexts, to save children work.
 802  829           */
 803  830          if (!restarter_rm_libs_loadable())
 804  831                  log_framework(LOG_DEBUG, "%s: method contexts limited "
 805  832                      "to root-accessible libraries\n", inst->ri_i.i_fmri);
 806  833  
 807  834          /*
 808  835           * If the service is restarting too quickly, send it to
 809  836           * maintenance.
 810  837           */
 811  838          if (type == METHOD_START) {
 812  839                  method_record_start(inst);
 813  840                  if (method_rate_critical(inst)) {
 814  841                          log_instance(inst, B_TRUE, "Restarting too quickly, "
 815  842                              "changing state to maintenance.");
 816  843                          result = ELOOP;
 817  844                          restarter_free_method_context(mcp);
 818  845                          goto out;
 819  846                  }
 820  847          }
 821  848  
 822  849          atomic_add_16(&storing_contract, 1);
 823  850          pid = startd_fork1(&forkerr);
 824  851          if (pid == 0)
 825  852                  exec_method(inst, type, method, mcp, need_session);
 826  853  
 827  854          if (pid == -1) {
 828  855                  atomic_add_16(&storing_contract, -1);
 829  856                  if (forkerr == EAGAIN)
 830  857                          result = EAGAIN;
 831  858                  else
 832  859                          result = EFAULT;
 833  860  
 834  861                  log_error(LOG_WARNING,
 835  862                      "%s: Couldn't fork to execute method %s: %s\n",
 836  863                      inst->ri_i.i_fmri, method, strerror(forkerr));
 837  864  
 838  865                  restarter_free_method_context(mcp);
 839  866                  goto out;
 840  867          }
 841  868  
 842  869  
 843  870          /*
 844  871           * Get the contract id, decide whether it is primary or transient, and
 845  872           * stash it in inst & the repository.
 846  873           */
 847  874          method_store_contract(inst, type, &ctid);
 848  875          atomic_add_16(&storing_contract, -1);
 849  876  
 850  877          restarter_free_method_context(mcp);
 851  878  
 852  879          /*
 853  880           * Similarly for the start method PID.
 854  881           */
 855  882          if (type == METHOD_START && !inst->ri_mi_deleted)
 856  883                  (void) libscf_write_start_pid(inst->ri_m_inst, pid);
 857  884  
 858  885          if (instance_is_wait_style(inst) && type == METHOD_START) {
 859  886                  /* Wait style instances don't get timeouts on start methods. */
 860  887                  if (wait_register(pid, inst->ri_i.i_fmri, 1, 0)) {
 861  888                          log_error(LOG_WARNING,
 862  889                              "%s: couldn't register %ld for wait\n",
 863  890                              inst->ri_i.i_fmri, pid);
 864  891                          result = EFAULT;
 865  892                          goto contract_out;
 866  893                  }
 867  894                  write_status(inst, mname, 0);
 868  895  
 869  896          } else {
 870  897                  int r, err;
 871  898                  time_t start_time;
 872  899                  time_t end_time;
 873  900  
 874  901                  /*
 875  902                   * Because on upgrade/live-upgrade we may have no chance
 876  903                   * to override faulty timeout values on the way to
 877  904                   * manifest import, all services on the path to manifest
 878  905                   * import are treated the same as INFINITE timeout services.
 879  906                   */
 880  907  
 881  908                  start_time = time(NULL);
 882  909                  if (timeout != METHOD_TIMEOUT_INFINITE && !is_timeout_ovr(inst))
 883  910                          timeout_insert(inst, ctid, timeout);
 884  911                  else
 885  912                          timeout = METHOD_TIMEOUT_INFINITE;
 886  913  
 887  914                  /* Unlock the instance while waiting for the method. */
 888  915                  MUTEX_UNLOCK(&inst->ri_lock);
 889  916  
 890  917                  do {
 891  918                          r = waitpid(pid, &ret_status, NULL);
 892  919                  } while (r == -1 && errno == EINTR);
 893  920                  if (r == -1)
 894  921                          err = errno;
 895  922  
 896  923                  /* Re-grab the lock. */
 897  924                  inst = inst_lookup_by_id(id);
 898  925  
 899  926                  /*
 900  927                   * inst can't be removed, as the removal thread waits
 901  928                   * for completion of this one.
 902  929                   */
 903  930                  assert(inst != NULL);
 904  931                  *instp = inst;
 905  932  
 906  933                  if (inst->ri_timeout != NULL && inst->ri_timeout->te_fired)
 907  934                          timeout_fired = 1;
 908  935  
 909  936                  timeout_remove(inst, ctid);
 910  937  
 911  938                  log_framework(LOG_DEBUG,
 912  939                      "%s method for %s exited with status %d.\n", mname,
 913  940                      inst->ri_i.i_fmri, WEXITSTATUS(ret_status));
 914  941  
 915  942                  if (r == -1) {
 916  943                          log_error(LOG_WARNING,
 917  944                              "Couldn't waitpid() for %s method of %s (%s).\n",
 918  945                              mname, inst->ri_i.i_fmri, strerror(err));
 919  946                          result = EFAULT;
 920  947                          goto contract_out;
 921  948                  }
 922  949  
 923  950                  if (type == METHOD_START)
 924  951                          write_status(inst, mname, ret_status);
 925  952  
 926  953                  /* return ERANGE if this service doesn't retry on timeout */
 927  954                  if (timeout_fired == 1 && timeout_retry == 0) {
 928  955                          result = ERANGE;
 929  956                          goto contract_out;
 930  957                  }
 931  958  
 932  959                  if (!WIFEXITED(ret_status)) {
 933  960                          /*
 934  961                           * If method didn't exit itself (it was killed by an
 935  962                           * external entity, etc.), consider the entire
 936  963                           * method_run as failed.
 937  964                           */
 938  965                          if (WIFSIGNALED(ret_status)) {
 939  966                                  char buf[SIG2STR_MAX];
 940  967                                  (void) sig2str(WTERMSIG(ret_status), buf);
 941  968  
 942  969                                  log_error(LOG_WARNING, "%s: Method \"%s\" "
 943  970                                      "failed due to signal %s.\n",
 944  971                                      inst->ri_i.i_fmri, method, buf);
 945  972                                  log_instance(inst, B_TRUE, "Method \"%s\" "
 946  973                                      "failed due to signal %s.", mname, buf);
 947  974                          } else {
 948  975                                  log_error(LOG_WARNING, "%s: Method \"%s\" "
 949  976                                      "failed with exit status %d.\n",
 950  977                                      inst->ri_i.i_fmri, method,
 951  978                                      WEXITSTATUS(ret_status));
 952  979                                  log_instance(inst, B_TRUE, "Method \"%s\" "
 953  980                                      "failed with exit status %d.", mname,
 954  981                                      WEXITSTATUS(ret_status));
 955  982                          }
 956  983                          result = EAGAIN;
 957  984                          goto contract_out;
 958  985                  }
 959  986  
 960  987                  *exit_code = WEXITSTATUS(ret_status);
 961  988                  if (*exit_code != 0) {
 962  989                          log_error(LOG_WARNING,
 963  990                              "%s: Method \"%s\" failed with exit status %d.\n",
 964  991                              inst->ri_i.i_fmri, method, WEXITSTATUS(ret_status));
 965  992                  }
 966  993  
 967  994                  log_instance(inst, B_TRUE, "Method \"%s\" exited with status "
 968  995                      "%d.", mname, *exit_code);
 969  996  
 970  997                  if (*exit_code != 0)
 971  998                          goto contract_out;
 972  999  
 973 1000                  end_time = time(NULL);
 974 1001  
 975 1002                  /* Give service contract remaining seconds to empty */
 976 1003                  if (timeout != METHOD_TIMEOUT_INFINITE)
 977 1004                          timeout -= (end_time - start_time);
 978 1005          }
 979 1006  
 980 1007  assured_kill:
 981 1008          /*
 982 1009           * For stop methods, assure that the service contract has emptied
 983 1010           * before returning.
 984 1011           */
 985 1012          if (type == METHOD_STOP && (!instance_is_transient_style(inst)) &&
 986 1013              !(contract_is_empty(inst->ri_i.i_primary_ctid))) {
 987 1014                  int times = 0;
 988 1015  
 989 1016                  if (timeout != METHOD_TIMEOUT_INFINITE)
 990 1017                          timeout_insert(inst, inst->ri_i.i_primary_ctid,
 991 1018                              timeout);
 992 1019  
 993 1020                  for (;;) {
 994 1021                          /*
 995 1022                           * Check frequently at first, then back off.  This
 996 1023                           * keeps startd from idling while shutting down.
 997 1024                           */
 998 1025                          if (times < 20) {
 999 1026                                  (void) poll(NULL, 0, 5);
1000 1027                                  times++;
1001 1028                          } else {
1002 1029                                  (void) poll(NULL, 0, 100);
1003 1030                          }
1004 1031                          if (contract_is_empty(inst->ri_i.i_primary_ctid))
1005 1032                                  break;
1006 1033                  }
1007 1034  
1008 1035                  if (timeout != METHOD_TIMEOUT_INFINITE)
1009 1036                          if (inst->ri_timeout->te_fired)
1010 1037                                  result = EFAULT;
1011 1038  
1012 1039                  timeout_remove(inst, inst->ri_i.i_primary_ctid);
1013 1040          }
1014 1041  
1015 1042  contract_out:
1016 1043          /* Abandon contracts for transient methods & methods that fail. */
1017 1044          transient = method_is_transient(inst, type);
1018 1045          if ((transient || *exit_code != 0 || result != 0) &&
1019 1046              (restarter_is_kill_method(method) < 0))
1020 1047                  method_remove_contract(inst, !transient, B_TRUE);
1021 1048  
1022 1049  out:
1023 1050          if (ctfd >= 0)
1024 1051                  (void) close(ctfd);
1025 1052          scf_snapshot_destroy(snap);
1026 1053          free(method);
1027 1054          return (result);
1028 1055  }
1029 1056  
1030 1057  /*
1031 1058   * The method thread executes a service method to effect a state transition.
1032 1059   * The next_state of info->sf_id should be non-_NONE on entrance, and it will
1033 1060   * be _NONE on exit (state will either be what next_state was (on success), or
1034 1061   * it will be _MAINT (on error)).
1035 1062   *
1036 1063   * There are six classes of methods to consider: start & other (stop, refresh)
1037 1064   * for each of "normal" services, wait services, and transient services.  For
1038 1065   * each, the method must be fetched from the repository & executed.  fork()ed
1039 1066   * methods must be waited on, except for the start method of wait services
1040 1067   * (which must be registered with the wait subsystem via wait_register()).  If
1041 1068   * the method succeeded (returned 0), then for start methods its contract
1042 1069   * should be recorded as the primary contract for the service.  For other
1043 1070   * methods, it should be abandoned.  If the method fails, then depending on
1044 1071   * the failure, either the method should be reexecuted or the service should
1045 1072   * be put into maintenance.  Either way the contract should be abandoned.
1046 1073   */
1047 1074  void *
1048 1075  method_thread(void *arg)
1049 1076  {
1050 1077          fork_info_t *info = arg;
1051 1078          restarter_inst_t *inst;
1052 1079          scf_handle_t    *local_handle;
1053 1080          scf_instance_t  *s_inst = NULL;
1054 1081          int r, exit_code;
1055 1082          boolean_t retryable;
1056 1083          restarter_str_t reason;
1057 1084  
1058 1085          assert(0 <= info->sf_method_type && info->sf_method_type <= 2);
1059 1086  
1060 1087          /* Get (and lock) the restarter_inst_t. */
1061 1088          inst = inst_lookup_by_id(info->sf_id);
1062 1089  
1063 1090          assert(inst->ri_method_thread != 0);
1064 1091          assert(instance_in_transition(inst) == 1);
1065 1092  
1066 1093          /*
1067 1094           * We cannot leave this function with inst in transition, because
1068 1095           * protocol.c withholds messages for inst otherwise.
1069 1096           */
1070 1097  
1071 1098          log_framework(LOG_DEBUG, "method_thread() running %s method for %s.\n",
1072 1099              method_names[info->sf_method_type], inst->ri_i.i_fmri);
1073 1100  
1074 1101          local_handle = libscf_handle_create_bound_loop();
1075 1102  
1076 1103  rebind_retry:
1077 1104          /* get scf_instance_t */
1078 1105          switch (r = libscf_fmri_get_instance(local_handle, inst->ri_i.i_fmri,
1079 1106              &s_inst)) {
1080 1107          case 0:
1081 1108                  break;
1082 1109  
1083 1110          case ECONNABORTED:
1084 1111                  libscf_handle_rebind(local_handle);
1085 1112                  goto rebind_retry;
1086 1113  
1087 1114          case ENOENT:
1088 1115                  /*
1089 1116                   * It's not there, but we need to call this so protocol.c
1090 1117                   * doesn't think it's in transition anymore.
1091 1118                   */
1092 1119                  (void) restarter_instance_update_states(local_handle, inst,
1093 1120                      inst->ri_i.i_state, RESTARTER_STATE_NONE, RERR_NONE,
1094 1121                      restarter_str_none);
1095 1122                  goto out;
1096 1123  
1097 1124          case EINVAL:
1098 1125          case ENOTSUP:
1099 1126          default:
1100 1127                  bad_error("libscf_fmri_get_instance", r);
1101 1128          }
1102 1129  
1103 1130          inst->ri_m_inst = s_inst;
1104 1131          inst->ri_mi_deleted = B_FALSE;
1105 1132  
1106 1133  retry:
1107 1134          if (info->sf_method_type == METHOD_START)
1108 1135                  log_transition(inst, START_REQUESTED);
1109 1136  
1110 1137          r = method_run(&inst, info->sf_method_type, &exit_code);
1111 1138  
1112 1139          if (r == 0 && exit_code == 0) {
1113 1140                  /* Success! */
1114 1141                  assert(inst->ri_i.i_next_state != RESTARTER_STATE_NONE);
1115 1142  
1116 1143                  /*
1117 1144                   * When a stop method succeeds, remove the primary contract of
1118 1145                   * the service, unless we're going to offline, in which case
1119 1146                   * retain the contract so we can transfer inherited contracts to
1120 1147                   * the replacement service.
1121 1148                   */
1122 1149  
1123 1150                  if (info->sf_method_type == METHOD_STOP &&
1124 1151                      inst->ri_i.i_primary_ctid != 0) {
1125 1152                          if (inst->ri_i.i_next_state == RESTARTER_STATE_OFFLINE)
1126 1153                                  inst->ri_i.i_primary_ctid_stopped = 1;
1127 1154                          else
1128 1155                                  method_remove_contract(inst, B_TRUE, B_TRUE);
1129 1156                  }
1130 1157                  /*
1131 1158                   * We don't care whether the handle was rebound because this is
1132 1159                   * the last thing we do with it.
1133 1160                   */
1134 1161                  (void) restarter_instance_update_states(local_handle, inst,
1135 1162                      inst->ri_i.i_next_state, RESTARTER_STATE_NONE,
1136 1163                      info->sf_event_type, info->sf_reason);
1137 1164  
1138 1165                  (void) update_fault_count(inst, FAULT_COUNT_RESET);
1139 1166  
1140 1167                  goto out;
1141 1168          }
1142 1169  
1143 1170          /* Failure.  Retry or go to maintenance. */
1144 1171  
1145 1172          if (r != 0 && r != EAGAIN) {
1146 1173                  retryable = B_FALSE;
1147 1174          } else {
1148 1175                  switch (exit_code) {
1149 1176                  case SMF_EXIT_ERR_CONFIG:
1150 1177                  case SMF_EXIT_ERR_NOSMF:
1151 1178                  case SMF_EXIT_ERR_PERM:
1152 1179                  case SMF_EXIT_ERR_FATAL:
1153 1180                          retryable = B_FALSE;
1154 1181                          break;
1155 1182  
1156 1183                  default:
1157 1184                          retryable = B_TRUE;
1158 1185                  }
1159 1186          }
1160 1187  
1161 1188          if (retryable && update_fault_count(inst, FAULT_COUNT_INCR) != 1)
1162 1189                  goto retry;
1163 1190  
1164 1191          /* maintenance */
1165 1192          if (r == ELOOP)
1166 1193                  log_transition(inst, START_FAILED_REPEATEDLY);
1167 1194          else if (r == ERANGE)
1168 1195                  log_transition(inst, START_FAILED_TIMEOUT_FATAL);
1169 1196          else if (exit_code == SMF_EXIT_ERR_CONFIG)
1170 1197                  log_transition(inst, START_FAILED_CONFIGURATION);
1171 1198          else if (exit_code == SMF_EXIT_ERR_FATAL)
1172 1199                  log_transition(inst, START_FAILED_FATAL);
1173 1200          else
1174 1201                  log_transition(inst, START_FAILED_OTHER);
1175 1202  
1176 1203          if (r == ELOOP) {
1177 1204                  reason = restarter_str_restarting_too_quickly;
1178 1205          } else if (retryable) {
1179 1206                  reason = restarter_str_fault_threshold_reached;
1180 1207          } else {
1181 1208                  reason = restarter_str_method_failed;
1182 1209          }
1183 1210  
1184 1211          (void) restarter_instance_update_states(local_handle, inst,
1185 1212              RESTARTER_STATE_MAINT, RESTARTER_STATE_NONE, RERR_FAULT,
1186 1213              reason);
1187 1214  
1188 1215          if (!method_is_transient(inst, info->sf_method_type) &&
1189 1216              inst->ri_i.i_primary_ctid != 0)
1190 1217                  method_remove_contract(inst, B_TRUE, B_TRUE);
1191 1218  
1192 1219  out:
1193 1220          inst->ri_method_thread = 0;
1194 1221  
1195 1222          /*
1196 1223           * Unlock the mutex after broadcasting to avoid a race condition
1197 1224           * with restarter_delete_inst() when the 'inst' structure is freed.
1198 1225           */
1199 1226          (void) pthread_cond_broadcast(&inst->ri_method_cv);
1200 1227          MUTEX_UNLOCK(&inst->ri_lock);
1201 1228  
1202 1229          scf_instance_destroy(s_inst);
1203 1230          scf_handle_destroy(local_handle);
1204 1231          startd_free(info, sizeof (fork_info_t));
1205 1232          return (NULL);
1206 1233  }

↓ open down ↓

678 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX