illumos-gate Wdiff usr/src/cmd/svc/startd/method.c

Print this page

7711 SMF: Finish implementing support for degraded state

Split	Close
Expand all
Collapse all

          --- old/usr/src/cmd/svc/startd/method.c
          +++ new/usr/src/cmd/svc/startd/method.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   * Copyright 2011 Joyent Inc.
  25   25   */
  26   26  
  27   27  /*
  28   28   * method.c - method execution functions
  29   29   *
  30   30   * This file contains the routines needed to run a method:  a fork(2)-exec(2)
  31   31   * invocation monitored using either the contract filesystem or waitpid(2).
  32   32   * (Plain fork1(2) support is provided in fork.c.)
  33   33   *
  34   34   * Contract Transfer
  35   35   *   When we restart a service, we want to transfer any contracts that the old
  36   36   *   service's contract inherited.  This means that (a) we must not abandon the
  37   37   *   old contract when the service dies and (b) we must write the id of the old
  38   38   *   contract into the terms of the new contract.  There should be limits to
  39   39   *   (a), though, since we don't want to keep the contract around forever.  To
  40   40   *   this end we'll say that services in the offline state may have a contract
  41   41   *   to be transferred and services in the disabled or maintenance states cannot.
  42   42   *   This means that when a service transitions from online (or degraded) to
  43   43   *   offline, the contract should be preserved, and when the service transitions
  44   44   *   from offline to online (i.e., the start method), we'll transfer inherited
  45   45   *   contracts.
  46   46   */
  47   47  
  48   48  #include <sys/contract/process.h>
  49   49  #include <sys/ctfs.h>
  50   50  #include <sys/stat.h>
  51   51  #include <sys/time.h>
  52   52  #include <sys/types.h>
  53   53  #include <sys/uio.h>
  54   54  #include <sys/wait.h>
  55   55  #include <alloca.h>
  56   56  #include <assert.h>
  57   57  #include <errno.h>
  58   58  #include <fcntl.h>
  59   59  #include <libcontract.h>
  60   60  #include <libcontract_priv.h>
  61   61  #include <libgen.h>
  62   62  #include <librestart.h>
  63   63  #include <libscf.h>
  64   64  #include <limits.h>
  65   65  #include <port.h>
  66   66  #include <sac.h>
  67   67  #include <signal.h>
  68   68  #include <stdlib.h>
  69   69  #include <string.h>
  70   70  #include <strings.h>
  71   71  #include <unistd.h>
  72   72  #include <atomic.h>
  73   73  #include <poll.h>
  74   74  #include <libscf_priv.h>
  75   75  
  76   76  #include "startd.h"
  77   77  
/* Shell that exec_method() runs with "-c" to execute a method command line. */
#define SBIN_SH         "/sbin/sh"
  79   79  
/*
 * Used to tell if contracts are in the process of being
 * stored into the svc.startd internal hash table.
 *
 * Starts out as 0.  NOTE(review): the code that updates and tests this
 * counter is not in this part of the file; presumably nonzero means a store
 * is in flight -- confirm against the users of this variable.
 */
volatile uint16_t       storing_contract = 0;
  85   85  
/*
 * Mapping from restart_on method-type to contract events.  Must correspond to
 * enum method_restart_t.
 *
 * method_ready_contract() indexes this table with restart_on and removes the
 * bits in cte_mask from the entry to obtain the critical event set of a
 * non-transient method's contract.
 */
static uint_t method_events[] = {
        /* METHOD_RESTART_ALL */
        CT_PR_EV_HWERR | CT_PR_EV_SIGNAL | CT_PR_EV_CORE | CT_PR_EV_EMPTY,
        /* METHOD_RESTART_EXTERNAL_FAULT */
        CT_PR_EV_HWERR | CT_PR_EV_SIGNAL,
        /* METHOD_RESTART_ANY_FAULT */
        CT_PR_EV_HWERR | CT_PR_EV_SIGNAL | CT_PR_EV_CORE
};
  98   98  
  99   99  /*
 100  100   * method_record_start(restarter_inst_t *)
 101  101   *   Record a service start for rate limiting.  Place the current time
 102  102   *   in the circular array of instance starts.
 103  103   */
 104  104  static void
 105  105  method_record_start(restarter_inst_t *inst)
 106  106  {
 107  107          int index = inst->ri_start_index++ % RINST_START_TIMES;
 108  108  
 109  109          inst->ri_start_time[index] = gethrtime();
 110  110  }
 111  111  
 112  112  /*
 113  113   * method_rate_critical(restarter_inst_t *)
 114  114   *    Return true if the average start interval is less than the permitted
 115  115   *    interval.  The implicit interval defaults to RINST_FAILURE_RATE_NS and
 116  116   *    RINST_START_TIMES but may be overridden with the svc properties
 117  117   *    startd/critical_failure_count and startd/critical_failure_period
 118  118   *    which represent the number of failures to consider and the amount of
 119  119   *    time in seconds in which that number may occur, respectively. Note that
 120  120   *    this time is measured as of the transition to 'enabled' rather than wall
 121  121   *    clock time.
 122  122   *    Implicit success if insufficient measurements for an average exist.
 123  123   */
 124  124  int
 125  125  method_rate_critical(restarter_inst_t *inst)
 126  126  {
 127  127          hrtime_t critical_failure_period;
 128  128          uint_t critical_failure_count = RINST_START_TIMES;
 129  129          uint_t n = inst->ri_start_index;
 130  130          hrtime_t avg_ns = 0;
 131  131          uint64_t scf_fr, scf_st;
 132  132          scf_propvec_t *prop = NULL;
 133  133          scf_propvec_t restart_critical[] = {
 134  134                  { "critical_failure_period", NULL, SCF_TYPE_INTEGER, NULL, 0 },
 135  135                  { "critical_failure_count", NULL, SCF_TYPE_INTEGER, NULL, 0 },
 136  136                  { NULL }
 137  137          };
 138  138  
 139  139          if (instance_is_wait_style(inst))
 140  140                  critical_failure_period = RINST_WT_SVC_FAILURE_RATE_NS;
 141  141          else
 142  142                  critical_failure_period = RINST_FAILURE_RATE_NS;
 143  143  
 144  144          restart_critical[0].pv_ptr = &scf_fr;
 145  145          restart_critical[1].pv_ptr = &scf_st;
 146  146  
 147  147          if (scf_read_propvec(inst->ri_i.i_fmri, "startd",
 148  148              B_TRUE, restart_critical, &prop) != SCF_FAILED) {
 149  149                  /*
 150  150                   * critical_failure_period is expressed
 151  151                   * in seconds but tracked in ns
 152  152                   */
 153  153                  critical_failure_period = (hrtime_t)scf_fr * NANOSEC;
 154  154                  critical_failure_count = (uint_t)scf_st;
 155  155          }
 156  156          if (inst->ri_start_index < critical_failure_count)
 157  157                  return (0);
 158  158  
 159  159          avg_ns =
 160  160              (inst->ri_start_time[(n - 1) % critical_failure_count] -
 161  161              inst->ri_start_time[n % critical_failure_count]) /
 162  162              (critical_failure_count - 1);
 163  163  
 164  164          return (avg_ns < critical_failure_period);
 165  165  }
 166  166  
 167  167  /*
 168  168   * int method_is_transient()
 169  169   *   Determine if the method for the given instance is transient,
 170  170   *   from a contract perspective. Return 1 if it is, and 0 if it isn't.
 171  171   */
 172  172  static int
 173  173  method_is_transient(restarter_inst_t *inst, int type)
 174  174  {
 175  175          if (instance_is_transient_style(inst) || type != METHOD_START)
 176  176                  return (1);
 177  177          else
 178  178                  return (0);
 179  179  }
 180  180  
 181  181  /*
 182  182   * void method_store_contract()
 183  183   *   Store the newly created contract id into local structures and
 184  184   *   the repository.  If the repository connection is broken it is rebound.
 185  185   */
 186  186  static void
 187  187  method_store_contract(restarter_inst_t *inst, int type, ctid_t *cid)
 188  188  {
 189  189          int r;
 190  190          boolean_t primary;
 191  191  
 192  192          if (errno = contract_latest(cid))
 193  193                  uu_die("%s: Couldn't get new contract's id", inst->ri_i.i_fmri);
 194  194  
 195  195          primary = !method_is_transient(inst, type);
 196  196  
 197  197          if (!primary) {
 198  198                  if (inst->ri_i.i_transient_ctid != 0) {
 199  199                          log_framework(LOG_INFO,
 200  200                              "%s: transient ctid expected to be 0 but "
 201  201                              "was set to %ld\n", inst->ri_i.i_fmri,
 202  202                              inst->ri_i.i_transient_ctid);
 203  203                  }
 204  204  
 205  205                  inst->ri_i.i_transient_ctid = *cid;
 206  206          } else {
 207  207                  if (inst->ri_i.i_primary_ctid != 0) {
 208  208                          /*
 209  209                           * There was an old contract that we transferred.
 210  210                           * Remove it.
 211  211                           */
 212  212                          method_remove_contract(inst, B_TRUE, B_FALSE);
 213  213                  }
 214  214  
 215  215                  if (inst->ri_i.i_primary_ctid != 0) {
 216  216                          log_framework(LOG_INFO,
 217  217                              "%s: primary ctid expected to be 0 but "
 218  218                              "was set to %ld\n", inst->ri_i.i_fmri,
 219  219                              inst->ri_i.i_primary_ctid);
 220  220                  }
 221  221  
 222  222                  inst->ri_i.i_primary_ctid = *cid;
 223  223                  inst->ri_i.i_primary_ctid_stopped = 0;
 224  224  
 225  225                  log_framework(LOG_DEBUG, "Storing primary contract %ld for "
 226  226                      "%s.\n", *cid, inst->ri_i.i_fmri);
 227  227  
 228  228                  contract_hash_store(*cid, inst->ri_id);
 229  229          }
 230  230  
 231  231  again:
 232  232          if (inst->ri_mi_deleted)
 233  233                  return;
 234  234  
 235  235          r = restarter_store_contract(inst->ri_m_inst, *cid, primary ?
 236  236              RESTARTER_CONTRACT_PRIMARY : RESTARTER_CONTRACT_TRANSIENT);
 237  237          switch (r) {
 238  238          case 0:
 239  239                  break;
 240  240  
 241  241          case ECANCELED:
 242  242                  inst->ri_mi_deleted = B_TRUE;
 243  243                  break;
 244  244  
 245  245          case ECONNABORTED:
 246  246                  libscf_handle_rebind(scf_instance_handle(inst->ri_m_inst));
 247  247                  /* FALLTHROUGH */
 248  248  
 249  249          case EBADF:
 250  250                  libscf_reget_instance(inst);
 251  251                  goto again;
 252  252  
 253  253          case ENOMEM:
 254  254          case EPERM:
 255  255          case EACCES:
 256  256          case EROFS:
 257  257                  uu_die("%s: Couldn't store contract id %ld",
 258  258                      inst->ri_i.i_fmri, *cid);
 259  259                  /* NOTREACHED */
 260  260  
 261  261          case EINVAL:
 262  262          default:
 263  263                  bad_error("restarter_store_contract", r);
 264  264          }
 265  265  }
 266  266  
 267  267  /*
 268  268   * void method_remove_contract()
 269  269   *   Remove any non-permanent contracts from internal structures and
 270  270   *   the repository, then abandon them.
 271  271   *   Returns
 272  272   *     0 - success
 273  273   *     ECANCELED - inst was deleted from the repository
 274  274   *
 275  275   *   If the repository connection was broken, it is rebound.
 276  276   */
 277  277  void
 278  278  method_remove_contract(restarter_inst_t *inst, boolean_t primary,
 279  279      boolean_t abandon)
 280  280  {
 281  281          ctid_t * const ctidp = primary ? &inst->ri_i.i_primary_ctid :
 282  282              &inst->ri_i.i_transient_ctid;
 283  283  
 284  284          int r;
 285  285  
 286  286          assert(*ctidp != 0);
 287  287  
 288  288          log_framework(LOG_DEBUG, "Removing %s contract %lu for %s.\n",
 289  289              primary ? "primary" : "transient", *ctidp, inst->ri_i.i_fmri);
 290  290  
 291  291          if (abandon)
 292  292                  contract_abandon(*ctidp);
 293  293  
 294  294  again:
 295  295          if (inst->ri_mi_deleted) {
 296  296                  r = ECANCELED;
 297  297                  goto out;
 298  298          }
 299  299  
 300  300          r = restarter_remove_contract(inst->ri_m_inst, *ctidp, primary ?
 301  301              RESTARTER_CONTRACT_PRIMARY : RESTARTER_CONTRACT_TRANSIENT);
 302  302          switch (r) {
 303  303          case 0:
 304  304                  break;
 305  305  
 306  306          case ECANCELED:
 307  307                  inst->ri_mi_deleted = B_TRUE;
 308  308                  break;
 309  309  
 310  310          case ECONNABORTED:
 311  311                  libscf_handle_rebind(scf_instance_handle(inst->ri_m_inst));
 312  312                  /* FALLTHROUGH */
 313  313  
 314  314          case EBADF:
 315  315                  libscf_reget_instance(inst);
 316  316                  goto again;
 317  317  
 318  318          case ENOMEM:
 319  319          case EPERM:
 320  320          case EACCES:
 321  321          case EROFS:
 322  322                  log_error(LOG_INFO, "%s: Couldn't remove contract id %ld: "
 323  323                      "%s.\n", inst->ri_i.i_fmri, *ctidp, strerror(r));
 324  324                  break;
 325  325  
 326  326          case EINVAL:
 327  327          default:
 328  328                  bad_error("restarter_remove_contract", r);
 329  329          }
 330  330  
 331  331  out:
 332  332          if (primary)
 333  333                  contract_hash_remove(*ctidp);
 334  334  
 335  335          *ctidp = 0;
 336  336  }
 337  337  
/*
 * Method names, indexed by method type.  method_run() asserts that the type
 * is in the range 0..2 before indexing; the names are used in log messages
 * and as the contract's service auxiliary string.
 */
static const char *method_names[] = { "start", "stop", "refresh" };
 339  339  
 340  340  /*
 341  341   * int method_ready_contract(restarter_inst_t *, int, method_restart_t, int)
 342  342   *
 343  343   *   Activate a contract template for the type method of inst.  type,
 344  344   *   restart_on, and cte_mask dictate the critical events term of the contract.
 345  345   *   Returns
 346  346   *     0 - success
 347  347   *     ECANCELED - inst has been deleted from the repository
 348  348   */
 349  349  static int
 350  350  method_ready_contract(restarter_inst_t *inst, int type,
 351  351      method_restart_t restart_on, uint_t cte_mask)
 352  352  {
 353  353          int tmpl, err, istrans, iswait, ret;
 354  354          uint_t cevents, fevents;
 355  355  
 356  356          /*
 357  357           * Correctly supporting wait-style services is tricky without
 358  358           * rearchitecting startd to cope with multiple event sources
 359  359           * simultaneously trying to stop an instance.  Until a better
 360  360           * solution is implemented, we avoid this problem for
 361  361           * wait-style services by making contract events fatal and
 362  362           * letting the wait code alone handle stopping the service.
 363  363           */
 364  364          iswait = instance_is_wait_style(inst);
 365  365          istrans = method_is_transient(inst, type);
 366  366  
 367  367          tmpl = open64(CTFS_ROOT "/process/template", O_RDWR);
 368  368          if (tmpl == -1)
 369  369                  uu_die("Could not create contract template");
 370  370  
 371  371          /*
 372  372           * We assume non-login processes are unlikely to create
 373  373           * multiple process groups, and set CT_PR_PGRPONLY for all
 374  374           * wait-style services' contracts.
 375  375           */
 376  376          err = ct_pr_tmpl_set_param(tmpl, CT_PR_INHERIT | CT_PR_REGENT |
 377  377              (iswait ? CT_PR_PGRPONLY : 0));
 378  378          assert(err == 0);
 379  379  
 380  380          if (istrans) {
 381  381                  cevents = 0;
 382  382                  fevents = 0;
 383  383          } else {
 384  384                  assert(restart_on >= 0);
 385  385                  assert(restart_on <= METHOD_RESTART_ANY_FAULT);
 386  386                  cevents = method_events[restart_on] & ~cte_mask;
 387  387                  fevents = iswait ?
 388  388                      (method_events[restart_on] & ~cte_mask & CT_PR_ALLFATAL) :
 389  389                      0;
 390  390          }
 391  391  
 392  392          err = ct_tmpl_set_critical(tmpl, cevents);
 393  393          assert(err == 0);
 394  394  
 395  395          err = ct_tmpl_set_informative(tmpl, 0);
 396  396          assert(err == 0);
 397  397          err = ct_pr_tmpl_set_fatal(tmpl, fevents);
 398  398          assert(err == 0);
 399  399  
 400  400          err = ct_tmpl_set_cookie(tmpl, istrans ?  METHOD_OTHER_COOKIE :
 401  401              METHOD_START_COOKIE);
 402  402          assert(err == 0);
 403  403  
 404  404          if (type == METHOD_START && inst->ri_i.i_primary_ctid != 0) {
 405  405                  ret = ct_pr_tmpl_set_transfer(tmpl, inst->ri_i.i_primary_ctid);
 406  406                  switch (ret) {
 407  407                  case 0:
 408  408                          break;
 409  409  
 410  410                  case ENOTEMPTY:
 411  411                          /* No contracts for you! */
 412  412                          method_remove_contract(inst, B_TRUE, B_TRUE);
 413  413                          if (inst->ri_mi_deleted) {
 414  414                                  ret = ECANCELED;
 415  415                                  goto out;
 416  416                          }
 417  417                          break;
 418  418  
 419  419                  case EINVAL:
 420  420                  case ESRCH:
 421  421                  case EACCES:
 422  422                  default:
 423  423                          bad_error("ct_pr_tmpl_set_transfer", ret);
 424  424                  }
 425  425          }
 426  426  
 427  427          err = ct_pr_tmpl_set_svc_fmri(tmpl, inst->ri_i.i_fmri);
 428  428          assert(err == 0);
 429  429          err = ct_pr_tmpl_set_svc_aux(tmpl, method_names[type]);
 430  430          assert(err == 0);
 431  431  
 432  432          err = ct_tmpl_activate(tmpl);
 433  433          assert(err == 0);
 434  434  
 435  435          ret = 0;
 436  436  
 437  437  out:
 438  438          err = close(tmpl);
 439  439          assert(err == 0);
 440  440  
 441  441          return (ret);
 442  442  }
 443  443  
 444  444  static void
 445  445  exec_method(const restarter_inst_t *inst, int type, const char *method,
 446  446      struct method_context *mcp, uint8_t need_session)
 447  447  {
 448  448          char *cmd;
 449  449          const char *errf;
 450  450          char **nenv;
 451  451          int rsmc_errno = 0;
 452  452  
 453  453          cmd = uu_msprintf("exec %s", method);
 454  454  
 455  455          if (inst->ri_utmpx_prefix[0] != '\0' && inst->ri_utmpx_prefix != NULL)
 456  456                  (void) utmpx_mark_init(getpid(), inst->ri_utmpx_prefix);
 457  457  
 458  458          setlog(inst->ri_logstem);
 459  459          log_instance(inst, B_FALSE, "Executing %s method (\"%s\").",
 460  460              method_names[type], method);
 461  461  
 462  462          if (need_session)
 463  463                  (void) setpgrp();
 464  464  
 465  465          /* Set credentials. */
 466  466          rsmc_errno = restarter_set_method_context(mcp, &errf);
 467  467          if (rsmc_errno != 0) {
 468  468                  log_instance(inst, B_FALSE,
 469  469                      "svc.startd could not set context for method: ");
 470  470  
 471  471                  if (rsmc_errno == -1) {
 472  472                          if (strcmp(errf, "core_set_process_path") == 0) {
 473  473                                  log_instance(inst, B_FALSE,
 474  474                                      "Could not set corefile path.");
 475  475                          } else if (strcmp(errf, "setproject") == 0) {
 476  476                                  log_instance(inst, B_FALSE, "%s: a resource "
 477  477                                      "control assignment failed", errf);
 478  478                          } else if (strcmp(errf, "pool_set_binding") == 0) {
 479  479                                  log_instance(inst, B_FALSE, "%s: a system "
 480  480                                      "error occurred", errf);
 481  481                          } else {
 482  482  #ifndef NDEBUG
 483  483                                  uu_warn("%s:%d: Bad function name \"%s\" for "
 484  484                                      "error %d from "
 485  485                                      "restarter_set_method_context().\n",
 486  486                                      __FILE__, __LINE__, errf, rsmc_errno);
 487  487  #endif
 488  488                                  abort();
 489  489                          }
 490  490  
 491  491                          exit(1);
 492  492                  }
 493  493  
 494  494                  if (errf != NULL && strcmp(errf, "pool_set_binding") == 0) {
 495  495                          switch (rsmc_errno) {
 496  496                          case ENOENT:
 497  497                                  log_instance(inst, B_FALSE, "%s: the pool "
 498  498                                      "could not be found", errf);
 499  499                                  break;
 500  500  
 501  501                          case EBADF:
 502  502                                  log_instance(inst, B_FALSE, "%s: the "
 503  503                                      "configuration is invalid", errf);
 504  504                                  break;
 505  505  
 506  506                          case EINVAL:
 507  507                                  log_instance(inst, B_FALSE, "%s: pool name "
 508  508                                      "\"%s\" is invalid", errf,
 509  509                                      mcp->resource_pool);
 510  510                                  break;
 511  511  
 512  512                          default:
 513  513  #ifndef NDEBUG
 514  514                                  uu_warn("%s:%d: Bad error %d for function %s "
 515  515                                      "in restarter_set_method_context().\n",
 516  516                                      __FILE__, __LINE__, rsmc_errno, errf);
 517  517  #endif
 518  518                                  abort();
 519  519                          }
 520  520  
 521  521                          exit(SMF_EXIT_ERR_CONFIG);
 522  522                  }
 523  523  
 524  524                  if (errf != NULL && strcmp(errf, "chdir") == 0) {
 525  525                          switch (rsmc_errno) {
 526  526                          case EACCES:
 527  527                          case EFAULT:
 528  528                          case EIO:
 529  529                          case ELOOP:
 530  530                          case ENAMETOOLONG:
 531  531                          case ENOENT:
 532  532                          case ENOLINK:
 533  533                          case ENOTDIR:
 534  534                                  log_instance(inst, B_FALSE, "%s: %s (\"%s\")",
 535  535                                      errf,
 536  536                                      strerror(rsmc_errno), mcp->working_dir);
 537  537                                  break;
 538  538  
 539  539                          default:
 540  540  #ifndef NDEBUG
 541  541                                  uu_warn("%s:%d: Bad error %d for function %s "
 542  542                                      "in restarter_set_method_context().\n",
 543  543                                      __FILE__, __LINE__, rsmc_errno, errf);
 544  544  #endif
 545  545                                  abort();
 546  546                          }
 547  547  
 548  548                          exit(SMF_EXIT_ERR_CONFIG);
 549  549                  }
 550  550  
 551  551                  if (errf != NULL) {
 552  552                          errno = rsmc_errno;
 553  553                          perror(errf);
 554  554  
 555  555                          switch (rsmc_errno) {
 556  556                          case EINVAL:
 557  557                          case EPERM:
 558  558                          case ENOENT:
 559  559                          case ENAMETOOLONG:
 560  560                          case ERANGE:
 561  561                          case ESRCH:
 562  562                                  exit(SMF_EXIT_ERR_CONFIG);
 563  563                                  /* NOTREACHED */
 564  564  
 565  565                          default:
 566  566                                  exit(1);
 567  567                          }
 568  568                  }
 569  569  
 570  570                  switch (rsmc_errno) {
 571  571                  case ENOMEM:
 572  572                          log_instance(inst, B_FALSE, "Out of memory.");
 573  573                          exit(1);
 574  574                          /* NOTREACHED */
 575  575  
 576  576                  case ENOENT:
 577  577                          log_instance(inst, B_FALSE, "Missing passwd entry for "
 578  578                              "user.");
 579  579                          exit(SMF_EXIT_ERR_CONFIG);
 580  580                          /* NOTREACHED */
 581  581  
 582  582                  default:
 583  583  #ifndef NDEBUG
 584  584                          uu_warn("%s:%d: Bad miscellaneous error %d from "
 585  585                              "restarter_set_method_context().\n", __FILE__,
 586  586                              __LINE__, rsmc_errno);
 587  587  #endif
 588  588                          abort();
 589  589                  }
 590  590          }
 591  591  
 592  592          nenv = set_smf_env(mcp->env, mcp->env_sz, NULL, inst,
 593  593              method_names[type]);
 594  594  
 595  595          log_preexec();
 596  596  
 597  597          (void) execle(SBIN_SH, SBIN_SH, "-c", cmd, NULL, nenv);
 598  598  
 599  599          exit(10);
 600  600  }
 601  601  
 602  602  static void
 603  603  write_status(restarter_inst_t *inst, const char *mname, int stat)
 604  604  {
 605  605          int r;
 606  606  
 607  607  again:
 608  608          if (inst->ri_mi_deleted)
 609  609                  return;
 610  610  
 611  611          r = libscf_write_method_status(inst->ri_m_inst, mname, stat);
 612  612          switch (r) {
 613  613          case 0:
 614  614                  break;
 615  615  
 616  616          case ECONNABORTED:
 617  617                  libscf_reget_instance(inst);
 618  618                  goto again;
 619  619  
 620  620          case ECANCELED:
 621  621                  inst->ri_mi_deleted = 1;
 622  622                  break;
 623  623  
 624  624          case EPERM:
 625  625          case EACCES:
 626  626          case EROFS:
 627  627                  log_framework(LOG_INFO, "Could not write exit status "
 628  628                      "for %s method of %s: %s.\n", mname,
 629  629                      inst->ri_i.i_fmri, strerror(r));
 630  630                  break;
 631  631  
 632  632          case ENAMETOOLONG:
 633  633          default:
 634  634                  bad_error("libscf_write_method_status", r);
 635  635          }
 636  636  }
 637  637  
 638  638  /*
 639  639   * int method_run()
 640  640   *   Execute the type method of instp.  If it requires a fork(), wait for it
 641  641   *   to return and return its exit code in *exit_code.  Otherwise set
 642  642   *   *exit_code to 0 if the method succeeds & -1 if it fails.  If the
 643  643   *   repository connection is broken, it is rebound, but inst may not be
 644  644   *   reset.
 645  645   *   Returns
 646  646   *     0 - success
 647  647   *     EINVAL - A correct method or method context couldn't be retrieved.
 648  648   *     EIO - Contract kill failed.
 649  649   *     EFAULT - Method couldn't be executed successfully.
 650  650   *     ELOOP - Retry threshold exceeded.
 651  651   *     ECANCELED - inst was deleted from the repository before method was run
 652  652   *     ERANGE - Timeout retry threshold exceeded.
 653  653   *     EAGAIN - Failed due to external cause, retry.
 654  654   */
 655  655  int
 656  656  method_run(restarter_inst_t **instp, int type, int *exit_code)
 657  657  {
 658  658          char *method;
 659  659          int ret_status;
 660  660          pid_t pid;
 661  661          method_restart_t restart_on;
 662  662          uint_t cte_mask;
 663  663          uint8_t need_session;
 664  664          scf_handle_t *h;
 665  665          scf_snapshot_t *snap;
 666  666          const char *mname;
 667  667          mc_error_t *m_error;
 668  668          struct method_context *mcp;
 669  669          int result = 0, timeout_fired = 0;
 670  670          int sig, r;
 671  671          boolean_t transient;
 672  672          uint64_t timeout;
 673  673          uint8_t timeout_retry;
 674  674          ctid_t ctid;
 675  675          int ctfd = -1;
 676  676          restarter_inst_t *inst = *instp;
 677  677          int id = inst->ri_id;
 678  678          int forkerr;
 679  679  
 680  680          assert(MUTEX_HELD(&inst->ri_lock));
 681  681          assert(instance_in_transition(inst));
 682  682  
 683  683          if (inst->ri_mi_deleted)
 684  684                  return (ECANCELED);
 685  685  
 686  686          *exit_code = 0;
 687  687  
 688  688          assert(0 <= type && type <= 2);
 689  689          mname = method_names[type];
 690  690  
 691  691          if (type == METHOD_START)
 692  692                  inst->ri_pre_online_hook();
 693  693  
 694  694          h = scf_instance_handle(inst->ri_m_inst);
 695  695  
 696  696          snap = scf_snapshot_create(h);
 697  697          if (snap == NULL ||
 698  698              scf_instance_get_snapshot(inst->ri_m_inst, "running", snap) != 0) {
 699  699                  log_framework(LOG_DEBUG,
 700  700                      "Could not get running snapshot for %s.  "
 701  701                      "Using editing version to run method %s.\n",
 702  702                      inst->ri_i.i_fmri, mname);
 703  703                  scf_snapshot_destroy(snap);
 704  704                  snap = NULL;
 705  705          }
 706  706  
 707  707          /*
 708  708           * After this point, we may be logging to the instance log.
 709  709           * Make sure we've noted where that log is as a property of
 710  710           * the instance.
 711  711           */
 712  712          r = libscf_note_method_log(inst->ri_m_inst, st->st_log_prefix,
 713  713              inst->ri_logstem);
 714  714          if (r != 0) {
 715  715                  log_framework(LOG_WARNING,
 716  716                      "%s: couldn't note log location: %s\n",
 717  717                      inst->ri_i.i_fmri, strerror(r));
 718  718          }
 719  719  
 720  720          if ((method = libscf_get_method(h, type, inst, snap, &restart_on,
 721  721              &cte_mask, &need_session, &timeout, &timeout_retry)) == NULL) {
 722  722                  if (errno == LIBSCF_PGROUP_ABSENT)  {
 723  723                          log_framework(LOG_DEBUG,
 724  724                              "%s: instance has no method property group '%s'.\n",
 725  725                              inst->ri_i.i_fmri, mname);
 726  726                          if (type == METHOD_REFRESH)
 727  727                                  log_instance(inst, B_TRUE, "No '%s' method "
 728  728                                      "defined.  Treating as :true.", mname);
 729  729                          else
 730  730                                  log_instance(inst, B_TRUE, "Method property "
 731  731                                      "group '%s' is not present.", mname);
 732  732                          scf_snapshot_destroy(snap);
 733  733                          return (0);
 734  734                  } else if (errno == LIBSCF_PROPERTY_ABSENT)  {
 735  735                          log_framework(LOG_DEBUG,
 736  736                              "%s: instance has no '%s/exec' method property.\n",
 737  737                              inst->ri_i.i_fmri, mname);
 738  738                          log_instance(inst, B_TRUE, "Method property '%s/exec "
 739  739                              "is not present.", mname);
 740  740                          scf_snapshot_destroy(snap);
 741  741                          return (0);
 742  742                  } else {
 743  743                          log_error(LOG_WARNING,
 744  744                              "%s: instance libscf_get_method failed\n",
 745  745                              inst->ri_i.i_fmri);
 746  746                          scf_snapshot_destroy(snap);
 747  747                          return (EINVAL);
 748  748                  }
 749  749          }
 750  750  
 751  751          /* open service contract if stopping a non-transient service */
 752  752          if (type == METHOD_STOP && (!instance_is_transient_style(inst))) {
 753  753                  if (inst->ri_i.i_primary_ctid == 0) {
 754  754                          /* service is not running, nothing to stop */
 755  755                          log_framework(LOG_DEBUG, "%s: instance has no primary "
 756  756                              "contract, no service to stop.\n",
 757  757                              inst->ri_i.i_fmri);
 758  758                          scf_snapshot_destroy(snap);
 759  759                          return (0);
 760  760                  }
 761  761                  if ((ctfd = contract_open(inst->ri_i.i_primary_ctid, "process",
 762  762                      "events", O_RDONLY)) < 0) {
 763  763                          result = EFAULT;
 764  764                          log_instance(inst, B_TRUE, "Could not open service "
 765  765                              "contract %ld.  Stop method not run.",
 766  766                              inst->ri_i.i_primary_ctid);
 767  767                          goto out;
 768  768                  }
 769  769          }
 770  770  
 771  771          if (restarter_is_null_method(method)) {
 772  772                  log_framework(LOG_DEBUG, "%s: null method succeeds\n",
 773  773                      inst->ri_i.i_fmri);
 774  774  
 775  775                  log_instance(inst, B_TRUE, "Executing %s method (null).",
 776  776                      mname);
 777  777  
 778  778                  if (type == METHOD_START)
 779  779                          write_status(inst, mname, 0);
 780  780                  goto out;
 781  781          }
 782  782  
 783  783          sig = restarter_is_kill_method(method);
 784  784          if (sig >= 0) {
 785  785  
 786  786                  if (inst->ri_i.i_primary_ctid == 0) {
 787  787                          log_error(LOG_ERR, "%s: :kill with no contract\n",
 788  788                              inst->ri_i.i_fmri);
 789  789                          log_instance(inst, B_TRUE, "Invalid use of \":kill\" "
 790  790                              "as stop method for transient service.");
 791  791                          result = EINVAL;
 792  792                          goto out;
 793  793                  }
 794  794  
 795  795                  log_framework(LOG_DEBUG,
 796  796                      "%s: :killing contract with signal %d\n",
 797  797                      inst->ri_i.i_fmri, sig);
 798  798  
 799  799                  log_instance(inst, B_TRUE, "Executing %s method (:kill).",
 800  800                      mname);
 801  801  
 802  802                  if (contract_kill(inst->ri_i.i_primary_ctid, sig,
 803  803                      inst->ri_i.i_fmri) != 0) {
 804  804                          result = EIO;
 805  805                          goto out;
 806  806                  } else
 807  807                          goto assured_kill;
 808  808          }
 809  809  
 810  810          log_framework(LOG_DEBUG, "%s: forking to run method %s\n",
 811  811              inst->ri_i.i_fmri, method);
 812  812  
 813  813          m_error = restarter_get_method_context(RESTARTER_METHOD_CONTEXT_VERSION,
 814  814              inst->ri_m_inst, snap, mname, method, &mcp);
 815  815  
 816  816          if (m_error != NULL) {
 817  817                  log_instance(inst, B_TRUE, "%s", m_error->msg);
 818  818                  restarter_mc_error_destroy(m_error);
 819  819                  result = EINVAL;
 820  820                  goto out;
 821  821          }
 822  822  
 823  823          r = method_ready_contract(inst, type, restart_on, cte_mask);
 824  824          if (r != 0) {
 825  825                  assert(r == ECANCELED);
 826  826                  assert(inst->ri_mi_deleted);
 827  827                  restarter_free_method_context(mcp);
 828  828                  result = ECANCELED;
 829  829                  goto out;
 830  830          }
 831  831  
 832  832          /*
 833  833           * Validate safety of method contexts, to save children work.
 834  834           */
 835  835          if (!restarter_rm_libs_loadable())
 836  836                  log_framework(LOG_DEBUG, "%s: method contexts limited "
 837  837                      "to root-accessible libraries\n", inst->ri_i.i_fmri);
 838  838  
 839  839          /*
 840  840           * For wait-style svc, sanity check that method exists to prevent an
 841  841           * infinite loop.
 842  842           */
 843  843          if (instance_is_wait_style(inst) && type == METHOD_START) {
 844  844                  char *pend;
 845  845                  struct stat64 sbuf;
 846  846  
 847  847                  /*
 848  848                   * We need to handle start method strings that have arguments,
 849  849                   * such as '/lib/svc/method/console-login %i'.
 850  850                   */
 851  851                  if ((pend = strchr(method, ' ')) != NULL)
 852  852                          *pend = '\0';
 853  853  
 854  854                  if (*method == '/' && stat64(method, &sbuf) == -1 &&
 855  855                      errno == ENOENT) {
 856  856                          log_instance(inst, B_TRUE, "Missing start method (%s), "
 857  857                              "changing state to maintenance.", method);
 858  858                          restarter_free_method_context(mcp);
 859  859                          result = ENOENT;
 860  860                          goto out;
 861  861                  }
 862  862                  if (pend != NULL)
 863  863                          *pend = ' ';
 864  864          }
 865  865  
 866  866          /*
 867  867           * If the service is restarting too quickly, send it to
 868  868           * maintenance.
 869  869           */
 870  870          if (type == METHOD_START) {
 871  871                  method_record_start(inst);
 872  872                  if (method_rate_critical(inst) &&
 873  873                      !instance_is_wait_style(inst)) {
 874  874                          log_instance(inst, B_TRUE, "Restarting too quickly, "
 875  875                              "changing state to maintenance.");
 876  876                          result = ELOOP;
 877  877                          restarter_free_method_context(mcp);
 878  878                          goto out;
 879  879                  }
 880  880          }
 881  881  
 882  882          atomic_add_16(&storing_contract, 1);
 883  883          pid = startd_fork1(&forkerr);
 884  884          if (pid == 0)
 885  885                  exec_method(inst, type, method, mcp, need_session);
 886  886  
 887  887          if (pid == -1) {
 888  888                  atomic_add_16(&storing_contract, -1);
 889  889                  if (forkerr == EAGAIN)
 890  890                          result = EAGAIN;
 891  891                  else
 892  892                          result = EFAULT;
 893  893  
 894  894                  log_error(LOG_WARNING,
 895  895                      "%s: Couldn't fork to execute method %s: %s\n",
 896  896                      inst->ri_i.i_fmri, method, strerror(forkerr));
 897  897  
 898  898                  restarter_free_method_context(mcp);
 899  899                  goto out;
 900  900          }
 901  901  
 902  902  
 903  903          /*
 904  904           * Get the contract id, decide whether it is primary or transient, and
 905  905           * stash it in inst & the repository.
 906  906           */
 907  907          method_store_contract(inst, type, &ctid);
 908  908          atomic_add_16(&storing_contract, -1);
 909  909  
 910  910          restarter_free_method_context(mcp);
 911  911  
 912  912          /*
 913  913           * Similarly for the start method PID.
 914  914           */
 915  915          if (type == METHOD_START && !inst->ri_mi_deleted)
 916  916                  (void) libscf_write_start_pid(inst->ri_m_inst, pid);
 917  917  
 918  918          if (instance_is_wait_style(inst) && type == METHOD_START) {
 919  919                  /* Wait style instances don't get timeouts on start methods. */
 920  920                  if (wait_register(pid, inst->ri_i.i_fmri, 1, 0)) {
 921  921                          log_error(LOG_WARNING,
 922  922                              "%s: couldn't register %ld for wait\n",
 923  923                              inst->ri_i.i_fmri, pid);
 924  924                          result = EFAULT;
 925  925                          goto contract_out;
 926  926                  }
 927  927                  write_status(inst, mname, 0);
 928  928  
 929  929          } else {
 930  930                  int r, err;
 931  931                  time_t start_time;
 932  932                  time_t end_time;
 933  933  
 934  934                  /*
 935  935                   * Because on upgrade/live-upgrade we may have no chance
 936  936                   * to override faulty timeout values on the way to
 937  937                   * manifest import, all services on the path to manifest
 938  938                   * import are treated the same as INFINITE timeout services.
 939  939                   */
 940  940  
 941  941                  start_time = time(NULL);
 942  942                  if (timeout != METHOD_TIMEOUT_INFINITE && !is_timeout_ovr(inst))
 943  943                          timeout_insert(inst, ctid, timeout);
 944  944                  else
 945  945                          timeout = METHOD_TIMEOUT_INFINITE;
 946  946  
 947  947                  /* Unlock the instance while waiting for the method. */
 948  948                  MUTEX_UNLOCK(&inst->ri_lock);
 949  949  
 950  950                  do {
 951  951                          r = waitpid(pid, &ret_status, NULL);
 952  952                  } while (r == -1 && errno == EINTR);
 953  953                  if (r == -1)
 954  954                          err = errno;
 955  955  
 956  956                  /* Re-grab the lock. */
 957  957                  inst = inst_lookup_by_id(id);
 958  958  
 959  959                  /*
 960  960                   * inst can't be removed, as the removal thread waits
 961  961                   * for completion of this one.
 962  962                   */
 963  963                  assert(inst != NULL);
 964  964                  *instp = inst;
 965  965  
 966  966                  if (inst->ri_timeout != NULL && inst->ri_timeout->te_fired)
 967  967                          timeout_fired = 1;
 968  968  
 969  969                  timeout_remove(inst, ctid);
 970  970  
 971  971                  log_framework(LOG_DEBUG,
 972  972                      "%s method for %s exited with status %d.\n", mname,
 973  973                      inst->ri_i.i_fmri, WEXITSTATUS(ret_status));
 974  974  
 975  975                  if (r == -1) {
 976  976                          log_error(LOG_WARNING,
 977  977                              "Couldn't waitpid() for %s method of %s (%s).\n",
 978  978                              mname, inst->ri_i.i_fmri, strerror(err));
 979  979                          result = EFAULT;
 980  980                          goto contract_out;
 981  981                  }
 982  982  
 983  983                  if (type == METHOD_START)
 984  984                          write_status(inst, mname, ret_status);
 985  985  
 986  986                  /* return ERANGE if this service doesn't retry on timeout */
 987  987                  if (timeout_fired == 1 && timeout_retry == 0) {
 988  988                          result = ERANGE;
 989  989                          goto contract_out;
 990  990                  }
 991  991  
 992  992                  if (!WIFEXITED(ret_status)) {
 993  993                          /*
 994  994                           * If method didn't exit itself (it was killed by an
 995  995                           * external entity, etc.), consider the entire
 996  996                           * method_run as failed.
 997  997                           */
 998  998                          if (WIFSIGNALED(ret_status)) {
 999  999                                  char buf[SIG2STR_MAX];
1000 1000                                  (void) sig2str(WTERMSIG(ret_status), buf);
1001 1001  
1002 1002                                  log_error(LOG_WARNING, "%s: Method \"%s\" "
1003 1003                                      "failed due to signal %s.\n",
1004 1004                                      inst->ri_i.i_fmri, method, buf);
1005 1005                                  log_instance(inst, B_TRUE, "Method \"%s\" "
1006 1006                                      "failed due to signal %s.", mname, buf);
1007 1007                          } else {
1008 1008                                  log_error(LOG_WARNING, "%s: Method \"%s\" "
1009 1009                                      "failed with exit status %d.\n",
1010 1010                                      inst->ri_i.i_fmri, method,
1011 1011                                      WEXITSTATUS(ret_status));
1012 1012                                  log_instance(inst, B_TRUE, "Method \"%s\" "
1013 1013                                      "failed with exit status %d.", mname,
1014 1014                                      WEXITSTATUS(ret_status));
1015 1015                          }
1016 1016                          result = EAGAIN;
1017 1017                          goto contract_out;
1018 1018                  }
1019 1019  
1020 1020                  *exit_code = WEXITSTATUS(ret_status);
1021 1021                  if (*exit_code != 0) {
1022 1022                          log_error(LOG_WARNING,
1023 1023                              "%s: Method \"%s\" failed with exit status %d.\n",
1024 1024                              inst->ri_i.i_fmri, method, WEXITSTATUS(ret_status));
1025 1025                  }
1026 1026  
1027 1027                  log_instance(inst, B_TRUE, "Method \"%s\" exited with status "
1028 1028                      "%d.", mname, *exit_code);
1029 1029  
1030 1030                  if (*exit_code != 0)
1031 1031                          goto contract_out;
1032 1032  
1033 1033                  end_time = time(NULL);
1034 1034  
1035 1035                  /* Give service contract remaining seconds to empty */
1036 1036                  if (timeout != METHOD_TIMEOUT_INFINITE)
1037 1037                          timeout -= (end_time - start_time);
1038 1038          }
1039 1039  
1040 1040  assured_kill:
1041 1041          /*
1042 1042           * For stop methods, assure that the service contract has emptied
1043 1043           * before returning.
1044 1044           */
1045 1045          if (type == METHOD_STOP && (!instance_is_transient_style(inst)) &&
1046 1046              !(contract_is_empty(inst->ri_i.i_primary_ctid))) {
1047 1047                  int times = 0;
1048 1048  
1049 1049                  if (timeout != METHOD_TIMEOUT_INFINITE)
1050 1050                          timeout_insert(inst, inst->ri_i.i_primary_ctid,
1051 1051                              timeout);
1052 1052  
1053 1053                  for (;;) {
1054 1054                          /*
1055 1055                           * Check frequently at first, then back off.  This
1056 1056                           * keeps startd from idling while shutting down.
1057 1057                           */
1058 1058                          if (times < 20) {
1059 1059                                  (void) poll(NULL, 0, 5);
1060 1060                                  times++;
1061 1061                          } else {
1062 1062                                  (void) poll(NULL, 0, 100);
1063 1063                          }
1064 1064                          if (contract_is_empty(inst->ri_i.i_primary_ctid))
1065 1065                                  break;
1066 1066                  }
1067 1067  
1068 1068                  if (timeout != METHOD_TIMEOUT_INFINITE)
1069 1069                          if (inst->ri_timeout->te_fired)
1070 1070                                  result = EFAULT;
1071 1071  
1072 1072                  timeout_remove(inst, inst->ri_i.i_primary_ctid);
1073 1073          }
1074 1074  
1075 1075  contract_out:
1076 1076          /* Abandon contracts for transient methods & methods that fail. */
1077 1077          transient = method_is_transient(inst, type);
1078 1078          if ((transient || *exit_code != 0 || result != 0) &&
1079 1079              (restarter_is_kill_method(method) < 0))
1080 1080                  method_remove_contract(inst, !transient, B_TRUE);
1081 1081  
1082 1082  out:
1083 1083          if (ctfd >= 0)
1084 1084                  (void) close(ctfd);
1085 1085          scf_snapshot_destroy(snap);
1086 1086          free(method);
1087 1087          return (result);
1088 1088  }
1089 1089  
1090 1090  /*
1091 1091   * The method thread executes a service method to effect a state transition.
1092 1092   * The next_state of info->sf_id should be non-_NONE on entrance, and it will
1093 1093   * be _NONE on exit (state will be what next_state was on success, _DEGRADED if
1094 1094   * a start method exits SMF_EXIT_MON_DEGRADE, or _MAINT on error).
1095 1095   *
1096 1096   * There are six classes of methods to consider: start & other (stop, refresh)
1097 1097   * for each of "normal" services, wait services, and transient services.  For
1098 1098   * each, the method must be fetched from the repository & executed.  fork()ed
1099 1099   * methods must be waited on, except for the start method of wait services
1100 1100   * (which must be registered with the wait subsystem via wait_register()).  If
1101 1101   * the method succeeded (returned 0), then for start methods its contract
1102 1102   * should be recorded as the primary contract for the service.  For other
1103 1103   * methods, it should be abandoned.  If the method fails, then depending on
1104 1104   * the failure, either the method should be reexecuted or the service should
1105 1105   * be put into maintenance.  Either way the contract should be abandoned.
            *
            * arg is a fork_info_t naming the instance (sf_id) and the method to run
            * (sf_method_type); it is released with startd_free() before returning.
            * The instance lock obtained via inst_lookup_by_id() is dropped on every
            * exit path.  Always returns NULL.
1106 1106   */
1107 1107  void *
1108 1108  method_thread(void *arg)
1109 1109  {
1110 1110          fork_info_t *info = arg;
1111 1111          restarter_inst_t *inst;
1112 1112          scf_handle_t    *local_handle;
1113 1113          scf_instance_t  *s_inst = NULL;
1114 1114          int r, exit_code;
1115 1115          boolean_t retryable;
1116 1116          restarter_str_t reason;
1117 1117  
                   /* sf_method_type is used below as an index into method_names[]. */
1118 1118          assert(0 <= info->sf_method_type && info->sf_method_type <= 2);
1119 1119  
1120 1120          /* Get (and lock) the restarter_inst_t. */
1121 1121          inst = inst_lookup_by_id(info->sf_id);
1122 1122  
                   /*
                    * NOTE(review): inst is dereferenced here without a NULL check,
                    * whereas method_run() asserts inst != NULL after the same lookup.
                    * Confirm the lookup cannot fail at this point.
                    */
1123 1123          assert(inst->ri_method_thread != 0);
1124 1124          assert(instance_in_transition(inst) == 1);
1125 1125  
1126 1126          /*
1127 1127           * We cannot leave this function with inst in transition, because
1128 1128           * protocol.c withholds messages for inst otherwise.
1129 1129           */
1130 1130  
1131 1131          log_framework(LOG_DEBUG, "method_thread() running %s method for %s.\n",
1132 1132              method_names[info->sf_method_type], inst->ri_i.i_fmri);
1133 1133  
                   /* Private handle for this thread; destroyed at the end of out:. */
1134 1134          local_handle = libscf_handle_create_bound_loop();
1135 1135  
1136 1136  rebind_retry:
1137 1137          /* get scf_instance_t; on ECONNABORTED rebind the handle and retry */
1138 1138          switch (r = libscf_fmri_get_instance(local_handle, inst->ri_i.i_fmri,
1139 1139              &s_inst)) {
1140 1140          case 0:
1141 1141                  break;
1142 1142  
1143 1143          case ECONNABORTED:
1144 1144                  libscf_handle_rebind(local_handle);
1145 1145                  goto rebind_retry;
1146 1146  
1147 1147          case ENOENT:
1148 1148                  /*
1149 1149                   * It's not there, but we need to call this so protocol.c
1150 1150                   * doesn't think it's in transition anymore.
1151 1151                   */
1152 1152                  (void) restarter_instance_update_states(local_handle, inst,
1153 1153                      inst->ri_i.i_state, RESTARTER_STATE_NONE, RERR_NONE,
1154 1154                      restarter_str_none);
1155 1155                  goto out;
1156 1156  
1157 1157          case EINVAL:
1158 1158          case ENOTSUP:
1159 1159          default:
                           /*
                            * Any other result is treated as a programming error.
                            * Presumably bad_error() does not return; otherwise control
                            * would fall out of the switch with s_inst unset -- TODO
                            * confirm.
                            */
1160 1160                  bad_error("libscf_fmri_get_instance", r);
1161 1161          }

↓ open down ↓

1161 lines elided

↑ open up ↑

1162 1162  
                   /*
                    * Publish the instance for method_run(), which reads ri_m_inst and
                    * returns ECANCELED immediately if ri_mi_deleted is set.
                    */
1163 1163          inst->ri_m_inst = s_inst;
1164 1164          inst->ri_mi_deleted = B_FALSE;
1165 1165  
                   /*
                    * Re-entered from the failure path below for as long as the failure
                    * is retryable and update_fault_count(FAULT_COUNT_INCR) does not
                    * return 1.
                    */
1166 1166  retry:
1167 1167          if (info->sf_method_type == METHOD_START)
1168 1168                  log_transition(inst, START_REQUESTED);
1169 1169  
                   /*
                    * method_run() takes &inst and may store a re-looked-up pointer
                    * through it.  r is its errno-style result; exit_code receives the
                    * exit status of the method.
                    *
                    * NOTE(review): method_run() returns ECANCELED before storing to
                    * *exit_code when ri_mi_deleted is set, so on that path exit_code
                    * keeps whatever value it had (nothing on a first pass) and is still
                    * compared in the maintenance logging below.  Confirm that this is
                    * harmless or initialize exit_code.
                    */
1170 1170          r = method_run(&inst, info->sf_method_type, &exit_code);
1171 1171  
1172      -        if (r == 0 && exit_code == 0) {
     1172 +        if (r == 0 && (exit_code == 0 || exit_code == SMF_EXIT_MON_DEGRADE)) {
1173 1173                  /* Success! */
1174 1174                  assert(inst->ri_i.i_next_state != RESTARTER_STATE_NONE);
1175 1175  
1176 1176                  /*
1177 1177                   * When a stop method succeeds, remove the primary contract of
1178 1178                   * the service, unless we're going to offline, in which case
1179 1179                   * retain the contract so we can transfer inherited contracts to
1180 1180                   * the replacement service.
1181 1181                   */
1182 1182  
1183 1183                  if (info->sf_method_type == METHOD_STOP &&
1184 1184                      inst->ri_i.i_primary_ctid != 0) {
1185 1185                          if (inst->ri_i.i_next_state == RESTARTER_STATE_OFFLINE)
1186 1186                                  inst->ri_i.i_primary_ctid_stopped = 1;
1187 1187                          else
1188 1188                                  method_remove_contract(inst, B_TRUE, B_TRUE);
1189 1189                  }
     1190 +
1190 1191                  /*
     1192 +                 * When a start method returns with SMF_EXIT_MON_DEGRADE we
     1193 +                 * transition the service into degraded.
     1194 +                 */
                           /*
                            * NOTE(review): the success test above accepts
                            * SMF_EXIT_MON_DEGRADE for every method type, but only
                            * METHOD_START is redirected to degraded here.  Confirm that
                            * a stop or refresh method exiting with that status should
                            * count as plain success, and that
                            * restarter_str_method_failed is the intended reason.
                            */
     1195 +                if (info->sf_method_type == METHOD_START &&
     1196 +                    exit_code == SMF_EXIT_MON_DEGRADE) {
     1197 +                        inst->ri_i.i_next_state = RESTARTER_STATE_DEGRADED;
     1198 +                        info->sf_reason = restarter_str_method_failed;
     1199 +                }
     1200 +
     1201 +                /*
1191 1202                   * We don't care whether the handle was rebound because this is
1192 1203                   * the last thing we do with it.
1193 1204                   */
1194 1205                  (void) restarter_instance_update_states(local_handle, inst,
1195 1206                      inst->ri_i.i_next_state, RESTARTER_STATE_NONE,
1196 1207                      info->sf_event_type, info->sf_reason);
1197 1208  
1198 1209                  (void) update_fault_count(inst, FAULT_COUNT_RESET);
1199 1210  
1200 1211                  goto out;

1201 1212          }
1202 1213  
1203 1214          /* Failure.  Retry or go to maintenance. */
1204 1215  
                   /*
                    * A method_run() error other than EAGAIN is never retried.
                    * Otherwise the exit status decides: the four SMF_EXIT_ERR_* codes
                    * listed below are final, anything else may be retried.
                    */
1205 1216          if (r != 0 && r != EAGAIN) {
1206 1217                  retryable = B_FALSE;
1207 1218          } else {
1208 1219                  switch (exit_code) {
1209 1220                  case SMF_EXIT_ERR_CONFIG:
1210 1221                  case SMF_EXIT_ERR_NOSMF:
1211 1222                  case SMF_EXIT_ERR_PERM:
1212 1223                  case SMF_EXIT_ERR_FATAL:
1213 1224                          retryable = B_FALSE;
1214 1225                          break;
1215 1226  
1216 1227                  default:
1217 1228                          retryable = B_TRUE;
1218 1229                  }
1219 1230          }
1220 1231  
1221 1232          if (retryable && update_fault_count(inst, FAULT_COUNT_INCR) != 1)
1222 1233                  goto retry;
1223 1234  
1224 1235          /* maintenance: not retryable, or no further retry was allowed */
                   /*
                    * NOTE(review): the START_FAILED_* transitions are logged without
                    * checking sf_method_type, so they are also emitted for stop and
                    * refresh failures -- confirm that is intended.
                    */
1225 1236          if (r == ELOOP)
1226 1237                  log_transition(inst, START_FAILED_REPEATEDLY);
1227 1238          else if (r == ERANGE)
1228 1239                  log_transition(inst, START_FAILED_TIMEOUT_FATAL);
1229 1240          else if (exit_code == SMF_EXIT_ERR_CONFIG)
1230 1241                  log_transition(inst, START_FAILED_CONFIGURATION);
1231 1242          else if (exit_code == SMF_EXIT_ERR_FATAL)
1232 1243                  log_transition(inst, START_FAILED_FATAL);
1233 1244          else
1234 1245                  log_transition(inst, START_FAILED_OTHER);
1235 1246  
                   /*
                    * Pick the reason recorded with the maintenance state.  retryable
                    * still being true here means the retry above was refused.
                    */
1236 1247          if (r == ELOOP) {
1237 1248                  reason = restarter_str_restarting_too_quickly;
1238 1249          } else if (retryable) {
1239 1250                  reason = restarter_str_fault_threshold_reached;
1240 1251          } else {
1241 1252                  reason = restarter_str_method_failed;
1242 1253          }
1243 1254  
1244 1255          (void) restarter_instance_update_states(local_handle, inst,
1245 1256              RESTARTER_STATE_MAINT, RESTARTER_STATE_NONE, RERR_FAULT,
1246 1257              reason);
1247 1258  
                   /* For non-transient methods, drop any primary contract still held. */
1248 1259          if (!method_is_transient(inst, info->sf_method_type) &&
1249 1260              inst->ri_i.i_primary_ctid != 0)
1250 1261                  method_remove_contract(inst, B_TRUE, B_TRUE);
1251 1262  
1252 1263  out:
                   /* Clear the marker that was asserted non-zero on entry. */
1253 1264          inst->ri_method_thread = 0;
1254 1265  
1255 1266          /*
1256 1267           * Unlock the mutex after broadcasting to avoid a race condition
1257 1268           * with restarter_delete_inst() when the 'inst' structure is freed.
1258 1269           */
1259 1270          (void) pthread_cond_broadcast(&inst->ri_method_cv);
1260 1271          MUTEX_UNLOCK(&inst->ri_lock);
1261 1272  
                   /* inst must not be touched past this point; release local resources. */
1262 1273          scf_instance_destroy(s_inst);
1263 1274          scf_handle_destroy(local_handle);
1264 1275          startd_free(info, sizeof (fork_info_t));
1265 1276          return (NULL);
1266 1277  }

↓ open down ↓

66 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX