illumos-gate Wdiff usr/src/cmd/svc/startd/method.c

Print this page

7928 Add support for SMF_EXIT_TEMP_TRANSIENT

Split	Close
Expand all
Collapse all

          --- old/usr/src/cmd/svc/startd/method.c
          +++ new/usr/src/cmd/svc/startd/method.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.

↓ open down ↓

14 lines elided

↑ open up ↑

  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   * Copyright 2011 Joyent Inc.
       25 + * Copyright 2017 RackTop Systems.
  25   26   */
  26   27  
  27   28  /*
  28   29   * method.c - method execution functions
  29   30   *
  30   31   * This file contains the routines needed to run a method:  a fork(2)-exec(2)
  31   32   * invocation monitored using either the contract filesystem or waitpid(2).
  32   33   * (Plain fork1(2) support is provided in fork.c.)
  33   34   *
  34   35   * Contract Transfer

  35   36   *   When we restart a service, we want to transfer any contracts that the old
  36   37   *   service's contract inherited.  This means that (a) we must not abandon the
  37   38   *   old contract when the service dies and (b) we must write the id of the old
  38   39   *   contract into the terms of the new contract.  There should be limits to
  39   40   *   (a), though, since we don't want to keep the contract around forever.  To
  40   41   *   this end we'll say that services in the offline state may have a contract
  41   42   *   to be transferred and services in the disabled or maintenance states cannot.
  42   43   *   This means that when a service transitions from online (or degraded) to
  43   44   *   offline, the contract should be preserved, and when the service transitions
  44   45   *   from offline to online (i.e., the start method), we'll transfer inherited
  45   46   *   contracts.
  46   47   */
  47   48  
  48   49  #include <sys/contract/process.h>
  49   50  #include <sys/ctfs.h>
  50   51  #include <sys/stat.h>
  51   52  #include <sys/time.h>
  52   53  #include <sys/types.h>
  53   54  #include <sys/uio.h>
  54   55  #include <sys/wait.h>
  55   56  #include <alloca.h>
  56   57  #include <assert.h>
  57   58  #include <errno.h>
  58   59  #include <fcntl.h>
  59   60  #include <libcontract.h>
  60   61  #include <libcontract_priv.h>
  61   62  #include <libgen.h>
  62   63  #include <librestart.h>
  63   64  #include <libscf.h>
  64   65  #include <limits.h>
  65   66  #include <port.h>
  66   67  #include <sac.h>
  67   68  #include <signal.h>
  68   69  #include <stdlib.h>
  69   70  #include <string.h>
  70   71  #include <strings.h>
  71   72  #include <unistd.h>
  72   73  #include <atomic.h>
  73   74  #include <poll.h>
  74   75  #include <libscf_priv.h>
  75   76  
  76   77  #include "startd.h"
  77   78  
  78   79  #define SBIN_SH         "/sbin/sh"
  79   80  
  80   81  /*
  81   82   * Used to tell if contracts are in the process of being
  82   83   * stored into the svc.startd internal hash table.
            *
            * NOTE(review): 'volatile' by itself does not make read-modify-write
            * updates atomic.  This file includes <atomic.h>, so updates presumably
            * go through atomic_*() routines -- confirm at the use sites.
  83   84   */
  84   85  volatile uint16_t       storing_contract = 0;
  85   86  
  86   87  /*
  87   88   * Mapping from restart_on method-type to contract events.  Must correspond to
  88   89   * enum method_restart_t.
            *
            * method_ready_contract() indexes this table directly with its restart_on
            * argument (after asserting restart_on <= METHOD_RESTART_ANY_FAULT) and
            * clears the bits in cte_mask to obtain the contract's critical event set,
            * so entries must stay in enum order.
  89   90   */
  90   91  static uint_t method_events[] = {
  91   92          /* METHOD_RESTART_ALL */
  92   93          CT_PR_EV_HWERR | CT_PR_EV_SIGNAL | CT_PR_EV_CORE | CT_PR_EV_EMPTY,
  93   94          /* METHOD_RESTART_EXTERNAL_FAULT */
  94   95          CT_PR_EV_HWERR | CT_PR_EV_SIGNAL,
  95   96          /* METHOD_RESTART_ANY_FAULT */
  96   97          CT_PR_EV_HWERR | CT_PR_EV_SIGNAL | CT_PR_EV_CORE
  97   98  };
  98   99  
  99  100  /*
 100  101   * method_record_start(restarter_inst_t *)
 101  102   *   Record a service start for rate limiting.  Place the current time
 102  103   *   in the circular array of instance starts.
 103  104   */
 104  105  static void
 105  106  method_record_start(restarter_inst_t *inst)
 106  107  {
 107  108          int index = inst->ri_start_index++ % RINST_START_TIMES;
 108  109  
 109  110          inst->ri_start_time[index] = gethrtime();
 110  111  }
 111  112  
 112  113  /*
 113  114   * method_rate_critical(restarter_inst_t *)
 114  115   *    Return true if the average start interval is less than the permitted
 115  116   *    interval.  The implicit interval defaults to RINST_FAILURE_RATE_NS and
 116  117   *    RINST_START_TIMES but may be overridden with the svc properties
 117  118   *    startd/critical_failure_count and startd/critical_failure_period
 118  119   *    which represent the number of failures to consider and the amount of
 119  120   *    time in seconds in which that number may occur, respectively. Note that
 120  121   *    this time is measured as of the transition to 'enabled' rather than wall
 121  122   *    clock time.
 122  123   *    Implicit success if insufficient measurements for an average exist.
 123  124   */
 124  125  int
 125  126  method_rate_critical(restarter_inst_t *inst)
 126  127  {
 127  128          hrtime_t critical_failure_period;
 128  129          uint_t critical_failure_count = RINST_START_TIMES;
 129  130          uint_t n = inst->ri_start_index;
 130  131          hrtime_t avg_ns = 0;
 131  132          uint64_t scf_fr, scf_st;
 132  133          scf_propvec_t *prop = NULL;
 133  134          scf_propvec_t restart_critical[] = {
 134  135                  { "critical_failure_period", NULL, SCF_TYPE_INTEGER, NULL, 0 },
 135  136                  { "critical_failure_count", NULL, SCF_TYPE_INTEGER, NULL, 0 },
 136  137                  { NULL }
 137  138          };
 138  139  
 139  140          if (instance_is_wait_style(inst))
 140  141                  critical_failure_period = RINST_WT_SVC_FAILURE_RATE_NS;
 141  142          else
 142  143                  critical_failure_period = RINST_FAILURE_RATE_NS;
 143  144  
 144  145          restart_critical[0].pv_ptr = &scf_fr;
 145  146          restart_critical[1].pv_ptr = &scf_st;
 146  147  
 147  148          if (scf_read_propvec(inst->ri_i.i_fmri, "startd",
 148  149              B_TRUE, restart_critical, &prop) != SCF_FAILED) {
 149  150                  /*
 150  151                   * critical_failure_period is expressed
 151  152                   * in seconds but tracked in ns
 152  153                   */
 153  154                  critical_failure_period = (hrtime_t)scf_fr * NANOSEC;
 154  155                  critical_failure_count = (uint_t)scf_st;
 155  156          }
 156  157          if (inst->ri_start_index < critical_failure_count)
 157  158                  return (0);
 158  159  
 159  160          avg_ns =
 160  161              (inst->ri_start_time[(n - 1) % critical_failure_count] -
 161  162              inst->ri_start_time[n % critical_failure_count]) /
 162  163              (critical_failure_count - 1);
 163  164  
 164  165          return (avg_ns < critical_failure_period);
 165  166  }
 166  167  
 167  168  /*
 168  169   * int method_is_transient()
 169  170   *   Determine if the method for the given instance is transient,
 170  171   *   from a contract perspective. Return 1 if it is, and 0 if it isn't.
 171  172   */

↓ open down ↓

137 lines elided

↑ open up ↑

 172  173  static int
 173  174  method_is_transient(restarter_inst_t *inst, int type)
 174  175  {
 175  176          if (instance_is_transient_style(inst) || type != METHOD_START)
 176  177                  return (1);
 177  178          else
 178  179                  return (0);
 179  180  }
 180  181  
 181  182  /*
      183 + * int method_failed()
      184 + *   Return 1 if the exit_code indicates failure (not all non-zero
      185 + *   exit codes do) otherwise return 0.
      186 + */
      187 +static int
      188 +method_failed(int exit_code)
      189 +{
      190 +        if (exit_code != 0 && exit_code != SMF_EXIT_TEMP_TRANSIENT)
      191 +                return (1);
      192 +        else
      193 +                return (0);
      194 +}
      195 +
      196 +/*
 182  197   * void method_store_contract()
 183  198   *   Store the newly created contract id into local structures and
 184  199   *   the repository.  If the repository connection is broken it is rebound.
 185  200   */
 186  201  static void
 187  202  method_store_contract(restarter_inst_t *inst, int type, ctid_t *cid)
 188  203  {
 189  204          int r;
 190  205          boolean_t primary;
 191  206

 192  207          if (errno = contract_latest(cid))
 193  208                  uu_die("%s: Couldn't get new contract's id", inst->ri_i.i_fmri);
 194  209  
 195  210          primary = !method_is_transient(inst, type);
 196  211  
 197  212          if (!primary) {
 198  213                  if (inst->ri_i.i_transient_ctid != 0) {
 199  214                          log_framework(LOG_INFO,
 200  215                              "%s: transient ctid expected to be 0 but "
 201  216                              "was set to %ld\n", inst->ri_i.i_fmri,
 202  217                              inst->ri_i.i_transient_ctid);
 203  218                  }
 204  219  
 205  220                  inst->ri_i.i_transient_ctid = *cid;
 206  221          } else {
 207  222                  if (inst->ri_i.i_primary_ctid != 0) {
 208  223                          /*
 209  224                           * There was an old contract that we transferred.
 210  225                           * Remove it.
 211  226                           */
 212  227                          method_remove_contract(inst, B_TRUE, B_FALSE);
 213  228                  }
 214  229  
 215  230                  if (inst->ri_i.i_primary_ctid != 0) {
 216  231                          log_framework(LOG_INFO,
 217  232                              "%s: primary ctid expected to be 0 but "
 218  233                              "was set to %ld\n", inst->ri_i.i_fmri,
 219  234                              inst->ri_i.i_primary_ctid);
 220  235                  }
 221  236  
 222  237                  inst->ri_i.i_primary_ctid = *cid;
 223  238                  inst->ri_i.i_primary_ctid_stopped = 0;
 224  239  
 225  240                  log_framework(LOG_DEBUG, "Storing primary contract %ld for "
 226  241                      "%s.\n", *cid, inst->ri_i.i_fmri);
 227  242  
 228  243                  contract_hash_store(*cid, inst->ri_id);
 229  244          }
 230  245  
 231  246  again:
 232  247          if (inst->ri_mi_deleted)
 233  248                  return;
 234  249  
 235  250          r = restarter_store_contract(inst->ri_m_inst, *cid, primary ?
 236  251              RESTARTER_CONTRACT_PRIMARY : RESTARTER_CONTRACT_TRANSIENT);
 237  252          switch (r) {
 238  253          case 0:
 239  254                  break;
 240  255  
 241  256          case ECANCELED:
 242  257                  inst->ri_mi_deleted = B_TRUE;
 243  258                  break;
 244  259  
 245  260          case ECONNABORTED:
 246  261                  libscf_handle_rebind(scf_instance_handle(inst->ri_m_inst));
 247  262                  /* FALLTHROUGH */
 248  263  
 249  264          case EBADF:
 250  265                  libscf_reget_instance(inst);
 251  266                  goto again;
 252  267  
 253  268          case ENOMEM:
 254  269          case EPERM:
 255  270          case EACCES:
 256  271          case EROFS:
 257  272                  uu_die("%s: Couldn't store contract id %ld",
 258  273                      inst->ri_i.i_fmri, *cid);
 259  274                  /* NOTREACHED */
 260  275  
 261  276          case EINVAL:
 262  277          default:
 263  278                  bad_error("restarter_store_contract", r);
 264  279          }
 265  280  }
 266  281  
 267  282  /*
 268  283   * void method_remove_contract()
 269  284   *   Remove any non-permanent contracts from internal structures and
 270  285   *   the repository, then abandon them.
 271  286   *   Returns
 272  287   *     0 - success
 273  288   *     ECANCELED - inst was deleted from the repository
 274  289   *
 275  290   *   If the repository connection was broken, it is rebound.
 276  291   */
 277  292  void
 278  293  method_remove_contract(restarter_inst_t *inst, boolean_t primary,
 279  294      boolean_t abandon)
 280  295  {
 281  296          ctid_t * const ctidp = primary ? &inst->ri_i.i_primary_ctid :
 282  297              &inst->ri_i.i_transient_ctid;
 283  298  
 284  299          int r;
 285  300  
 286  301          assert(*ctidp != 0);
 287  302  
 288  303          log_framework(LOG_DEBUG, "Removing %s contract %lu for %s.\n",
 289  304              primary ? "primary" : "transient", *ctidp, inst->ri_i.i_fmri);
 290  305  
 291  306          if (abandon)
 292  307                  contract_abandon(*ctidp);
 293  308  
 294  309  again:
 295  310          if (inst->ri_mi_deleted) {
 296  311                  r = ECANCELED;
 297  312                  goto out;
 298  313          }
 299  314  
 300  315          r = restarter_remove_contract(inst->ri_m_inst, *ctidp, primary ?
 301  316              RESTARTER_CONTRACT_PRIMARY : RESTARTER_CONTRACT_TRANSIENT);
 302  317          switch (r) {
 303  318          case 0:
 304  319                  break;
 305  320  
 306  321          case ECANCELED:
 307  322                  inst->ri_mi_deleted = B_TRUE;
 308  323                  break;
 309  324  
 310  325          case ECONNABORTED:
 311  326                  libscf_handle_rebind(scf_instance_handle(inst->ri_m_inst));
 312  327                  /* FALLTHROUGH */
 313  328  
 314  329          case EBADF:
 315  330                  libscf_reget_instance(inst);
 316  331                  goto again;
 317  332  
 318  333          case ENOMEM:
 319  334          case EPERM:
 320  335          case EACCES:
 321  336          case EROFS:
 322  337                  log_error(LOG_INFO, "%s: Couldn't remove contract id %ld: "
 323  338                      "%s.\n", inst->ri_i.i_fmri, *ctidp, strerror(r));
 324  339                  break;
 325  340  
 326  341          case EINVAL:
 327  342          default:
 328  343                  bad_error("restarter_remove_contract", r);
 329  344          }
 330  345  
 331  346  out:
 332  347          if (primary)
 333  348                  contract_hash_remove(*ctidp);
 334  349  
 335  350          *ctidp = 0;
 336  351  }
 337  352  
           /*
            * Printable method names, indexed by method type.  method_run() asserts
            * that type lies in [0, 2] before indexing this table.
            */
 338  353  static const char *method_names[] = { "start", "stop", "refresh" };
 339  354  
 340  355  /*
 341  356   * int method_ready_contract(restarter_inst_t *, int, method_restart_t, uint_t)
 342  357   *
 343  358   *   Activate a contract template for the type method of inst.  type,
 344  359   *   restart_on, and cte_mask dictate the critical events term of the contract.
            *   The template file descriptor is opened and closed within this call;
            *   failure to open it is fatal (uu_die()).  Failures of the individual
            *   template setters are only checked with assert().
 345  360   *   Returns
 346  361   *     0 - success
 347  362   *     ECANCELED - inst has been deleted from the repository
 348  363   */
 349  364  static int
 350  365  method_ready_contract(restarter_inst_t *inst, int type,
 351  366      method_restart_t restart_on, uint_t cte_mask)
 352  367  {
 353  368          int tmpl, err, istrans, iswait, ret;
 354  369          uint_t cevents, fevents;
 355  370  
 356  371          /*
 357  372           * Correctly supporting wait-style services is tricky without
 358  373           * rearchitecting startd to cope with multiple event sources
 359  374           * simultaneously trying to stop an instance.  Until a better
 360  375           * solution is implemented, we avoid this problem for
 361  376           * wait-style services by making contract events fatal and
 362  377           * letting the wait code alone handle stopping the service.
 363  378           */
 364  379          iswait = instance_is_wait_style(inst);
 365  380          istrans = method_is_transient(inst, type);
 366  381  
 367  382          tmpl = open64(CTFS_ROOT "/process/template", O_RDWR);
 368  383          if (tmpl == -1)
 369  384                  uu_die("Could not create contract template");
 370  385  
 371  386          /*
 372  387           * We assume non-login processes are unlikely to create
 373  388           * multiple process groups, and set CT_PR_PGRPONLY for all
 374  389           * wait-style services' contracts.
 375  390           */
 376  391          err = ct_pr_tmpl_set_param(tmpl, CT_PR_INHERIT | CT_PR_REGENT |
 377  392              (iswait ? CT_PR_PGRPONLY : 0));
 378  393          assert(err == 0);
 379  394  
                   /*
                    * Transient methods get neither critical nor fatal events.
                    * Otherwise the critical set is the restart_on entry of
                    * method_events[] minus cte_mask, and for wait-style instances
                    * the members of that set that are in CT_PR_ALLFATAL are also
                    * made fatal.
                    */
 380  395          if (istrans) {
 381  396                  cevents = 0;
 382  397                  fevents = 0;
 383  398          } else {
 384  399                  assert(restart_on >= 0);
 385  400                  assert(restart_on <= METHOD_RESTART_ANY_FAULT);
 386  401                  cevents = method_events[restart_on] & ~cte_mask;
 387  402                  fevents = iswait ?
 388  403                      (method_events[restart_on] & ~cte_mask & CT_PR_ALLFATAL) :
 389  404                      0;
 390  405          }
 391  406  
 392  407          err = ct_tmpl_set_critical(tmpl, cevents);
 393  408          assert(err == 0);
 394  409  
 395  410          err = ct_tmpl_set_informative(tmpl, 0);
 396  411          assert(err == 0);
 397  412          err = ct_pr_tmpl_set_fatal(tmpl, fevents);
 398  413          assert(err == 0);
 399  414  
                   /* The cookie distinguishes start-method contracts from all others. */
 400  415          err = ct_tmpl_set_cookie(tmpl, istrans ?  METHOD_OTHER_COOKIE :
 401  416              METHOD_START_COOKIE);
 402  417          assert(err == 0);
 403  418  
                   /*
                    * A start method run while the instance still holds a primary
                    * contract names that contract in the template's transfer term.
                    */
 404  419          if (type == METHOD_START && inst->ri_i.i_primary_ctid != 0) {
 405  420                  ret = ct_pr_tmpl_set_transfer(tmpl, inst->ri_i.i_primary_ctid);
 406  421                  switch (ret) {
 407  422                  case 0:
 408  423                          break;
 409  424  
 410  425                  case ENOTEMPTY:
 411  426                          /* No contracts for you! */
                                   /*
                                    * The old primary contract is abandoned and removed
                                    * instead of transferred.  If that removal found
                                    * the instance deleted, bail out without
                                    * activating the template.
                                    */
 412  427                          method_remove_contract(inst, B_TRUE, B_TRUE);
 413  428                          if (inst->ri_mi_deleted) {
 414  429                                  ret = ECANCELED;
 415  430                                  goto out;
 416  431                          }
 417  432                          break;
 418  433  
 419  434                  case EINVAL:
 420  435                  case ESRCH:
 421  436                  case EACCES:
 422  437                  default:
 423  438                          bad_error("ct_pr_tmpl_set_transfer", ret);
 424  439                  }
 425  440          }
 426  441  
 427  442          err = ct_pr_tmpl_set_svc_fmri(tmpl, inst->ri_i.i_fmri);
 428  443          assert(err == 0);
 429  444          err = ct_pr_tmpl_set_svc_aux(tmpl, method_names[type]);
 430  445          assert(err == 0);
 431  446  
 432  447          err = ct_tmpl_activate(tmpl);
 433  448          assert(err == 0);
 434  449  
 435  450          ret = 0;
 436  451  
                   /* Both the success and the ECANCELED path close the template fd. */
 437  452  out:
 438  453          err = close(tmpl);
 439  454          assert(err == 0);
 440  455  
 441  456          return (ret);
 442  457  }
 443  458  
 444  459  static void
 445  460  exec_method(const restarter_inst_t *inst, int type, const char *method,
 446  461      struct method_context *mcp, uint8_t need_session)
 447  462  {
 448  463          char *cmd;
 449  464          const char *errf;
 450  465          char **nenv;
 451  466          int rsmc_errno = 0;
 452  467  
 453  468          cmd = uu_msprintf("exec %s", method);
 454  469  
 455  470          if (inst->ri_utmpx_prefix[0] != '\0' && inst->ri_utmpx_prefix != NULL)
 456  471                  (void) utmpx_mark_init(getpid(), inst->ri_utmpx_prefix);
 457  472  
 458  473          setlog(inst->ri_logstem);
 459  474          log_instance(inst, B_FALSE, "Executing %s method (\"%s\").",
 460  475              method_names[type], method);
 461  476  
 462  477          if (need_session)
 463  478                  (void) setpgrp();
 464  479  
 465  480          /* Set credentials. */
 466  481          rsmc_errno = restarter_set_method_context(mcp, &errf);
 467  482          if (rsmc_errno != 0) {
 468  483                  log_instance(inst, B_FALSE,
 469  484                      "svc.startd could not set context for method: ");
 470  485  
 471  486                  if (rsmc_errno == -1) {
 472  487                          if (strcmp(errf, "core_set_process_path") == 0) {
 473  488                                  log_instance(inst, B_FALSE,
 474  489                                      "Could not set corefile path.");
 475  490                          } else if (strcmp(errf, "setproject") == 0) {
 476  491                                  log_instance(inst, B_FALSE, "%s: a resource "
 477  492                                      "control assignment failed", errf);
 478  493                          } else if (strcmp(errf, "pool_set_binding") == 0) {
 479  494                                  log_instance(inst, B_FALSE, "%s: a system "
 480  495                                      "error occurred", errf);
 481  496                          } else {
 482  497  #ifndef NDEBUG
 483  498                                  uu_warn("%s:%d: Bad function name \"%s\" for "
 484  499                                      "error %d from "
 485  500                                      "restarter_set_method_context().\n",
 486  501                                      __FILE__, __LINE__, errf, rsmc_errno);
 487  502  #endif
 488  503                                  abort();
 489  504                          }
 490  505  
 491  506                          exit(1);
 492  507                  }
 493  508  
 494  509                  if (errf != NULL && strcmp(errf, "pool_set_binding") == 0) {
 495  510                          switch (rsmc_errno) {
 496  511                          case ENOENT:
 497  512                                  log_instance(inst, B_FALSE, "%s: the pool "
 498  513                                      "could not be found", errf);
 499  514                                  break;
 500  515  
 501  516                          case EBADF:
 502  517                                  log_instance(inst, B_FALSE, "%s: the "
 503  518                                      "configuration is invalid", errf);
 504  519                                  break;
 505  520  
 506  521                          case EINVAL:
 507  522                                  log_instance(inst, B_FALSE, "%s: pool name "
 508  523                                      "\"%s\" is invalid", errf,
 509  524                                      mcp->resource_pool);
 510  525                                  break;
 511  526  
 512  527                          default:
 513  528  #ifndef NDEBUG
 514  529                                  uu_warn("%s:%d: Bad error %d for function %s "
 515  530                                      "in restarter_set_method_context().\n",
 516  531                                      __FILE__, __LINE__, rsmc_errno, errf);
 517  532  #endif
 518  533                                  abort();
 519  534                          }
 520  535  
 521  536                          exit(SMF_EXIT_ERR_CONFIG);
 522  537                  }
 523  538  
 524  539                  if (errf != NULL && strcmp(errf, "chdir") == 0) {
 525  540                          switch (rsmc_errno) {
 526  541                          case EACCES:
 527  542                          case EFAULT:
 528  543                          case EIO:
 529  544                          case ELOOP:
 530  545                          case ENAMETOOLONG:
 531  546                          case ENOENT:
 532  547                          case ENOLINK:
 533  548                          case ENOTDIR:
 534  549                                  log_instance(inst, B_FALSE, "%s: %s (\"%s\")",
 535  550                                      errf,
 536  551                                      strerror(rsmc_errno), mcp->working_dir);
 537  552                                  break;
 538  553  
 539  554                          default:
 540  555  #ifndef NDEBUG
 541  556                                  uu_warn("%s:%d: Bad error %d for function %s "
 542  557                                      "in restarter_set_method_context().\n",
 543  558                                      __FILE__, __LINE__, rsmc_errno, errf);
 544  559  #endif
 545  560                                  abort();
 546  561                          }
 547  562  
 548  563                          exit(SMF_EXIT_ERR_CONFIG);
 549  564                  }
 550  565  
 551  566                  if (errf != NULL) {
 552  567                          errno = rsmc_errno;
 553  568                          perror(errf);
 554  569  
 555  570                          switch (rsmc_errno) {
 556  571                          case EINVAL:
 557  572                          case EPERM:
 558  573                          case ENOENT:
 559  574                          case ENAMETOOLONG:
 560  575                          case ERANGE:
 561  576                          case ESRCH:
 562  577                                  exit(SMF_EXIT_ERR_CONFIG);
 563  578                                  /* NOTREACHED */
 564  579  
 565  580                          default:
 566  581                                  exit(1);
 567  582                          }
 568  583                  }
 569  584  
 570  585                  switch (rsmc_errno) {
 571  586                  case ENOMEM:
 572  587                          log_instance(inst, B_FALSE, "Out of memory.");
 573  588                          exit(1);
 574  589                          /* NOTREACHED */
 575  590  
 576  591                  case ENOENT:
 577  592                          log_instance(inst, B_FALSE, "Missing passwd entry for "
 578  593                              "user.");
 579  594                          exit(SMF_EXIT_ERR_CONFIG);
 580  595                          /* NOTREACHED */
 581  596  
 582  597                  default:
 583  598  #ifndef NDEBUG
 584  599                          uu_warn("%s:%d: Bad miscellaneous error %d from "
 585  600                              "restarter_set_method_context().\n", __FILE__,
 586  601                              __LINE__, rsmc_errno);
 587  602  #endif
 588  603                          abort();
 589  604                  }
 590  605          }
 591  606  
 592  607          nenv = set_smf_env(mcp->env, mcp->env_sz, NULL, inst,
 593  608              method_names[type]);
 594  609  
 595  610          log_preexec();
 596  611  
 597  612          (void) execle(SBIN_SH, SBIN_SH, "-c", cmd, NULL, nenv);
 598  613  
 599  614          exit(10);
 600  615  }
 601  616  
 602  617  static void
 603  618  write_status(restarter_inst_t *inst, const char *mname, int stat)
 604  619  {
 605  620          int r;
 606  621  
 607  622  again:
 608  623          if (inst->ri_mi_deleted)
 609  624                  return;
 610  625  
 611  626          r = libscf_write_method_status(inst->ri_m_inst, mname, stat);
 612  627          switch (r) {
 613  628          case 0:
 614  629                  break;
 615  630  
 616  631          case ECONNABORTED:
 617  632                  libscf_reget_instance(inst);
 618  633                  goto again;
 619  634  
 620  635          case ECANCELED:
 621  636                  inst->ri_mi_deleted = 1;
 622  637                  break;
 623  638  
 624  639          case EPERM:
 625  640          case EACCES:
 626  641          case EROFS:
 627  642                  log_framework(LOG_INFO, "Could not write exit status "
 628  643                      "for %s method of %s: %s.\n", mname,
 629  644                      inst->ri_i.i_fmri, strerror(r));
 630  645                  break;
 631  646  
 632  647          case ENAMETOOLONG:
 633  648          default:
 634  649                  bad_error("libscf_write_method_status", r);
 635  650          }
 636  651  }
 637  652  
 638  653  /*
 639  654   * int method_run()
 640  655   *   Execute the type method of instp.  If it requires a fork(), wait for it
 641  656   *   to return and return its exit code in *exit_code.  Otherwise set
 642  657   *   *exit_code to 0 if the method succeeds & -1 if it fails.  If the
 643  658   *   repository connection is broken, it is rebound, but inst may not be
 644  659   *   reset.
 645  660   *   Returns
 646  661   *     0 - success
 647  662   *     EINVAL - A correct method or method context couldn't be retrieved.
 648  663   *     EIO - Contract kill failed.
 649  664   *     EFAULT - Method couldn't be executed successfully.
 650  665   *     ELOOP - Retry threshold exceeded.
 651  666   *     ECANCELED - inst was deleted from the repository before method was run
 652  667   *     ERANGE - Timeout retry threshold exceeded.
 653  668   *     EAGAIN - Failed due to external cause, retry.
 654  669   */
 655  670  int
 656  671  method_run(restarter_inst_t **instp, int type, int *exit_code)
 657  672  {
 658  673          char *method;
 659  674          int ret_status;
 660  675          pid_t pid;
 661  676          method_restart_t restart_on;
 662  677          uint_t cte_mask;
 663  678          uint8_t need_session;
 664  679          scf_handle_t *h;
 665  680          scf_snapshot_t *snap;
 666  681          const char *mname;
 667  682          mc_error_t *m_error;
 668  683          struct method_context *mcp;
 669  684          int result = 0, timeout_fired = 0;
 670  685          int sig, r;
 671  686          boolean_t transient;
 672  687          uint64_t timeout;
 673  688          uint8_t timeout_retry;
 674  689          ctid_t ctid;
 675  690          int ctfd = -1;
 676  691          restarter_inst_t *inst = *instp;
 677  692          int id = inst->ri_id;
 678  693          int forkerr;
 679  694  
 680  695          assert(MUTEX_HELD(&inst->ri_lock));
 681  696          assert(instance_in_transition(inst));
 682  697  
 683  698          if (inst->ri_mi_deleted)
 684  699                  return (ECANCELED);
 685  700  
 686  701          *exit_code = 0;
 687  702  
 688  703          assert(0 <= type && type <= 2);
 689  704          mname = method_names[type];
 690  705  
 691  706          if (type == METHOD_START)
 692  707                  inst->ri_pre_online_hook();
 693  708  
 694  709          h = scf_instance_handle(inst->ri_m_inst);
 695  710  
 696  711          snap = scf_snapshot_create(h);
 697  712          if (snap == NULL ||
 698  713              scf_instance_get_snapshot(inst->ri_m_inst, "running", snap) != 0) {
 699  714                  log_framework(LOG_DEBUG,
 700  715                      "Could not get running snapshot for %s.  "
 701  716                      "Using editing version to run method %s.\n",
 702  717                      inst->ri_i.i_fmri, mname);
 703  718                  scf_snapshot_destroy(snap);
 704  719                  snap = NULL;
 705  720          }
 706  721  
 707  722          /*
 708  723           * After this point, we may be logging to the instance log.
 709  724           * Make sure we've noted where that log is as a property of
 710  725           * the instance.
 711  726           */
 712  727          r = libscf_note_method_log(inst->ri_m_inst, st->st_log_prefix,
 713  728              inst->ri_logstem);
 714  729          if (r != 0) {
 715  730                  log_framework(LOG_WARNING,
 716  731                      "%s: couldn't note log location: %s\n",
 717  732                      inst->ri_i.i_fmri, strerror(r));
 718  733          }
 719  734  
 720  735          if ((method = libscf_get_method(h, type, inst, snap, &restart_on,
 721  736              &cte_mask, &need_session, &timeout, &timeout_retry)) == NULL) {
 722  737                  if (errno == LIBSCF_PGROUP_ABSENT)  {
 723  738                          log_framework(LOG_DEBUG,
 724  739                              "%s: instance has no method property group '%s'.\n",
 725  740                              inst->ri_i.i_fmri, mname);
 726  741                          if (type == METHOD_REFRESH)
 727  742                                  log_instance(inst, B_TRUE, "No '%s' method "
 728  743                                      "defined.  Treating as :true.", mname);
 729  744                          else
 730  745                                  log_instance(inst, B_TRUE, "Method property "
 731  746                                      "group '%s' is not present.", mname);
 732  747                          scf_snapshot_destroy(snap);
 733  748                          return (0);
 734  749                  } else if (errno == LIBSCF_PROPERTY_ABSENT)  {
 735  750                          log_framework(LOG_DEBUG,
 736  751                              "%s: instance has no '%s/exec' method property.\n",
 737  752                              inst->ri_i.i_fmri, mname);
 738  753                          log_instance(inst, B_TRUE, "Method property '%s/exec "
 739  754                              "is not present.", mname);
 740  755                          scf_snapshot_destroy(snap);
 741  756                          return (0);
 742  757                  } else {
 743  758                          log_error(LOG_WARNING,
 744  759                              "%s: instance libscf_get_method failed\n",
 745  760                              inst->ri_i.i_fmri);
 746  761                          scf_snapshot_destroy(snap);
 747  762                          return (EINVAL);
 748  763                  }
 749  764          }
 750  765  
 751  766          /* open service contract if stopping a non-transient service */
 752  767          if (type == METHOD_STOP && (!instance_is_transient_style(inst))) {
 753  768                  if (inst->ri_i.i_primary_ctid == 0) {
 754  769                          /* service is not running, nothing to stop */
 755  770                          log_framework(LOG_DEBUG, "%s: instance has no primary "
 756  771                              "contract, no service to stop.\n",
 757  772                              inst->ri_i.i_fmri);
 758  773                          scf_snapshot_destroy(snap);
 759  774                          return (0);
 760  775                  }
 761  776                  if ((ctfd = contract_open(inst->ri_i.i_primary_ctid, "process",
 762  777                      "events", O_RDONLY)) < 0) {
 763  778                          result = EFAULT;
 764  779                          log_instance(inst, B_TRUE, "Could not open service "
 765  780                              "contract %ld.  Stop method not run.",
 766  781                              inst->ri_i.i_primary_ctid);
 767  782                          goto out;
 768  783                  }
 769  784          }
 770  785  
 771  786          if (restarter_is_null_method(method)) {
 772  787                  log_framework(LOG_DEBUG, "%s: null method succeeds\n",
 773  788                      inst->ri_i.i_fmri);
 774  789  
 775  790                  log_instance(inst, B_TRUE, "Executing %s method (null).",
 776  791                      mname);
 777  792  
 778  793                  if (type == METHOD_START)
 779  794                          write_status(inst, mname, 0);
 780  795                  goto out;
 781  796          }
 782  797  
 783  798          sig = restarter_is_kill_method(method);
 784  799          if (sig >= 0) {
 785  800  
 786  801                  if (inst->ri_i.i_primary_ctid == 0) {
 787  802                          log_error(LOG_ERR, "%s: :kill with no contract\n",
 788  803                              inst->ri_i.i_fmri);
 789  804                          log_instance(inst, B_TRUE, "Invalid use of \":kill\" "
 790  805                              "as stop method for transient service.");
 791  806                          result = EINVAL;
 792  807                          goto out;
 793  808                  }
 794  809  
 795  810                  log_framework(LOG_DEBUG,
 796  811                      "%s: :killing contract with signal %d\n",
 797  812                      inst->ri_i.i_fmri, sig);
 798  813  
 799  814                  log_instance(inst, B_TRUE, "Executing %s method (:kill).",
 800  815                      mname);
 801  816  
 802  817                  if (contract_kill(inst->ri_i.i_primary_ctid, sig,
 803  818                      inst->ri_i.i_fmri) != 0) {
 804  819                          result = EIO;
 805  820                          goto out;
 806  821                  } else
 807  822                          goto assured_kill;
 808  823          }
 809  824  
 810  825          log_framework(LOG_DEBUG, "%s: forking to run method %s\n",
 811  826              inst->ri_i.i_fmri, method);
 812  827  
 813  828          m_error = restarter_get_method_context(RESTARTER_METHOD_CONTEXT_VERSION,
 814  829              inst->ri_m_inst, snap, mname, method, &mcp);
 815  830  
 816  831          if (m_error != NULL) {
 817  832                  log_instance(inst, B_TRUE, "%s", m_error->msg);
 818  833                  restarter_mc_error_destroy(m_error);
 819  834                  result = EINVAL;
 820  835                  goto out;
 821  836          }
 822  837  
 823  838          r = method_ready_contract(inst, type, restart_on, cte_mask);
 824  839          if (r != 0) {
 825  840                  assert(r == ECANCELED);
 826  841                  assert(inst->ri_mi_deleted);
 827  842                  restarter_free_method_context(mcp);
 828  843                  result = ECANCELED;
 829  844                  goto out;
 830  845          }
 831  846  
 832  847          /*
 833  848           * Validate safety of method contexts, to save children work.
 834  849           */
 835  850          if (!restarter_rm_libs_loadable())
 836  851                  log_framework(LOG_DEBUG, "%s: method contexts limited "
 837  852                      "to root-accessible libraries\n", inst->ri_i.i_fmri);
 838  853  
 839  854          /*
 840  855           * For wait-style svc, sanity check that method exists to prevent an
 841  856           * infinite loop.
 842  857           */
 843  858          if (instance_is_wait_style(inst) && type == METHOD_START) {
 844  859                  char *pend;
 845  860                  struct stat64 sbuf;
 846  861  
 847  862                  /*
 848  863                   * We need to handle start method strings that have arguments,
 849  864                   * such as '/lib/svc/method/console-login %i'.
 850  865                   */
 851  866                  if ((pend = strchr(method, ' ')) != NULL)
 852  867                          *pend = '\0';
 853  868  
 854  869                  if (*method == '/' && stat64(method, &sbuf) == -1 &&
 855  870                      errno == ENOENT) {
 856  871                          log_instance(inst, B_TRUE, "Missing start method (%s), "
 857  872                              "changing state to maintenance.", method);
 858  873                          restarter_free_method_context(mcp);
 859  874                          result = ENOENT;
 860  875                          goto out;
 861  876                  }
 862  877                  if (pend != NULL)
 863  878                          *pend = ' ';
 864  879          }
 865  880  
 866  881          /*
 867  882           * If the service is restarting too quickly, send it to
 868  883           * maintenance.
 869  884           */
 870  885          if (type == METHOD_START) {
 871  886                  method_record_start(inst);
 872  887                  if (method_rate_critical(inst) &&
 873  888                      !instance_is_wait_style(inst)) {
 874  889                          log_instance(inst, B_TRUE, "Restarting too quickly, "
 875  890                              "changing state to maintenance.");
 876  891                          result = ELOOP;
 877  892                          restarter_free_method_context(mcp);
 878  893                          goto out;
 879  894                  }
 880  895          }
 881  896  
 882  897          atomic_add_16(&storing_contract, 1);
 883  898          pid = startd_fork1(&forkerr);
 884  899          if (pid == 0)
 885  900                  exec_method(inst, type, method, mcp, need_session);
 886  901  
 887  902          if (pid == -1) {
 888  903                  atomic_add_16(&storing_contract, -1);
 889  904                  if (forkerr == EAGAIN)
 890  905                          result = EAGAIN;
 891  906                  else
 892  907                          result = EFAULT;
 893  908  
 894  909                  log_error(LOG_WARNING,
 895  910                      "%s: Couldn't fork to execute method %s: %s\n",
 896  911                      inst->ri_i.i_fmri, method, strerror(forkerr));
 897  912  
 898  913                  restarter_free_method_context(mcp);
 899  914                  goto out;
 900  915          }
 901  916  
 902  917  
 903  918          /*
 904  919           * Get the contract id, decide whether it is primary or transient, and
 905  920           * stash it in inst & the repository.
 906  921           */
 907  922          method_store_contract(inst, type, &ctid);
 908  923          atomic_add_16(&storing_contract, -1);
 909  924  
 910  925          restarter_free_method_context(mcp);
 911  926  
 912  927          /*
 913  928           * Similarly for the start method PID.
 914  929           */
 915  930          if (type == METHOD_START && !inst->ri_mi_deleted)
 916  931                  (void) libscf_write_start_pid(inst->ri_m_inst, pid);
 917  932  
 918  933          if (instance_is_wait_style(inst) && type == METHOD_START) {
 919  934                  /* Wait style instances don't get timeouts on start methods. */
 920  935                  if (wait_register(pid, inst->ri_i.i_fmri, 1, 0)) {
 921  936                          log_error(LOG_WARNING,
 922  937                              "%s: couldn't register %ld for wait\n",
 923  938                              inst->ri_i.i_fmri, pid);
 924  939                          result = EFAULT;
 925  940                          goto contract_out;
 926  941                  }
 927  942                  write_status(inst, mname, 0);
 928  943  
 929  944          } else {
 930  945                  int r, err;
 931  946                  time_t start_time;
 932  947                  time_t end_time;
 933  948  
 934  949                  /*
 935  950                   * Because on upgrade/live-upgrade we may have no chance
 936  951                   * to override faulty timeout values on the way to
 937  952                   * manifest import, all services on the path to manifest
 938  953                   * import are treated the same as INFINITE timeout services.
 939  954                   */
 940  955  
 941  956                  start_time = time(NULL);
 942  957                  if (timeout != METHOD_TIMEOUT_INFINITE && !is_timeout_ovr(inst))
 943  958                          timeout_insert(inst, ctid, timeout);
 944  959                  else
 945  960                          timeout = METHOD_TIMEOUT_INFINITE;
 946  961  
 947  962                  /* Unlock the instance while waiting for the method. */
 948  963                  MUTEX_UNLOCK(&inst->ri_lock);
 949  964  
 950  965                  do {
 951  966                          r = waitpid(pid, &ret_status, NULL);
 952  967                  } while (r == -1 && errno == EINTR);
 953  968                  if (r == -1)
 954  969                          err = errno;
 955  970  
 956  971                  /* Re-grab the lock. */
 957  972                  inst = inst_lookup_by_id(id);
 958  973  
 959  974                  /*
 960  975                   * inst can't be removed, as the removal thread waits
 961  976                   * for completion of this one.
 962  977                   */
 963  978                  assert(inst != NULL);
 964  979                  *instp = inst;
 965  980  
 966  981                  if (inst->ri_timeout != NULL && inst->ri_timeout->te_fired)
 967  982                          timeout_fired = 1;
 968  983  
 969  984                  timeout_remove(inst, ctid);
 970  985  
 971  986                  log_framework(LOG_DEBUG,
 972  987                      "%s method for %s exited with status %d.\n", mname,
 973  988                      inst->ri_i.i_fmri, WEXITSTATUS(ret_status));
 974  989  
 975  990                  if (r == -1) {
 976  991                          log_error(LOG_WARNING,
 977  992                              "Couldn't waitpid() for %s method of %s (%s).\n",
 978  993                              mname, inst->ri_i.i_fmri, strerror(err));
 979  994                          result = EFAULT;
 980  995                          goto contract_out;
 981  996                  }
 982  997  
 983  998                  if (type == METHOD_START)
 984  999                          write_status(inst, mname, ret_status);
 985 1000  
 986 1001                  /* return ERANGE if this service doesn't retry on timeout */
 987 1002                  if (timeout_fired == 1 && timeout_retry == 0) {
 988 1003                          result = ERANGE;
 989 1004                          goto contract_out;
 990 1005                  }
 991 1006  
 992 1007                  if (!WIFEXITED(ret_status)) {
 993 1008                          /*
 994 1009                           * If method didn't exit itself (it was killed by an
 995 1010                           * external entity, etc.), consider the entire
 996 1011                           * method_run as failed.
 997 1012                           */
 998 1013                          if (WIFSIGNALED(ret_status)) {
 999 1014                                  char buf[SIG2STR_MAX];
1000 1015                                  (void) sig2str(WTERMSIG(ret_status), buf);
1001 1016  
1002 1017                                  log_error(LOG_WARNING, "%s: Method \"%s\" "
1003 1018                                      "failed due to signal %s.\n",
1004 1019                                      inst->ri_i.i_fmri, method, buf);
1005 1020                                  log_instance(inst, B_TRUE, "Method \"%s\" "
1006 1021                                      "failed due to signal %s.", mname, buf);
1007 1022                          } else {
1008 1023                                  log_error(LOG_WARNING, "%s: Method \"%s\" "
1009 1024                                      "failed with exit status %d.\n",
1010 1025                                      inst->ri_i.i_fmri, method,

↓ open down ↓

819 lines elided

↑ open up ↑

1011 1026                                      WEXITSTATUS(ret_status));
1012 1027                                  log_instance(inst, B_TRUE, "Method \"%s\" "
1013 1028                                      "failed with exit status %d.", mname,
1014 1029                                      WEXITSTATUS(ret_status));
1015 1030                          }
1016 1031                          result = EAGAIN;
1017 1032                          goto contract_out;
1018 1033                  }
1019 1034  
1020 1035                  *exit_code = WEXITSTATUS(ret_status);
1021      -                if (*exit_code != 0) {
     1036 +                if (method_failed(*exit_code) != 0) {
1022 1037                          log_error(LOG_WARNING,
1023 1038                              "%s: Method \"%s\" failed with exit status %d.\n",
1024 1039                              inst->ri_i.i_fmri, method, WEXITSTATUS(ret_status));
1025 1040                  }
1026 1041  
     1042 +                if (type == METHOD_STOP &&
     1043 +                    *exit_code == SMF_EXIT_TEMP_TRANSIENT) {
     1044 +                        log_instance(inst, B_TRUE, "Invalid use of "
     1045 +                            "\"$SMF_EXIT_TEMP_TRANSIENT\" in stop method.");
     1046 +                        result = EINVAL;
     1047 +                        goto contract_out;
     1048 +                }
     1049 +
1027 1050                  log_instance(inst, B_TRUE, "Method \"%s\" exited with status "
1028 1051                      "%d.", mname, *exit_code);
1029 1052  
1030      -                if (*exit_code != 0)
     1053 +                if (method_failed(*exit_code) != 0)
1031 1054                          goto contract_out;
1032 1055  
1033 1056                  end_time = time(NULL);
1034 1057  
1035 1058                  /* Give service contract remaining seconds to empty */
1036 1059                  if (timeout != METHOD_TIMEOUT_INFINITE)
1037 1060                          timeout -= (end_time - start_time);
1038 1061          }
1039 1062  
1040 1063  assured_kill:

1041 1064          /*
1042 1065           * For stop methods, assure that the service contract has emptied
1043 1066           * before returning.
1044 1067           */
1045 1068          if (type == METHOD_STOP && (!instance_is_transient_style(inst)) &&
1046 1069              !(contract_is_empty(inst->ri_i.i_primary_ctid))) {
1047 1070                  int times = 0;
1048 1071  
1049 1072                  if (timeout != METHOD_TIMEOUT_INFINITE)
1050 1073                          timeout_insert(inst, inst->ri_i.i_primary_ctid,
1051 1074                              timeout);
1052 1075  
1053 1076                  for (;;) {
1054 1077                          /*
1055 1078                           * Check frequently at first, then back off.  This
1056 1079                           * keeps startd from idling while shutting down.
1057 1080                           */
1058 1081                          if (times < 20) {
1059 1082                                  (void) poll(NULL, 0, 5);
1060 1083                                  times++;
1061 1084                          } else {
1062 1085                                  (void) poll(NULL, 0, 100);
1063 1086                          }
1064 1087                          if (contract_is_empty(inst->ri_i.i_primary_ctid))
1065 1088                                  break;
1066 1089                  }
1067 1090  
1068 1091                  if (timeout != METHOD_TIMEOUT_INFINITE)
1069 1092                          if (inst->ri_timeout->te_fired)
1070 1093                                  result = EFAULT;
1071 1094  
1072 1095                  timeout_remove(inst, inst->ri_i.i_primary_ctid);
1073 1096          }
1074 1097  
1075 1098  contract_out:
1076 1099          /* Abandon contracts for transient methods & methods that fail. */
1077 1100          transient = method_is_transient(inst, type);
1078 1101          if ((transient || *exit_code != 0 || result != 0) &&
1079 1102              (restarter_is_kill_method(method) < 0))
1080 1103                  method_remove_contract(inst, !transient, B_TRUE);
1081 1104  
1082 1105  out:
1083 1106          if (ctfd >= 0)
1084 1107                  (void) close(ctfd);
1085 1108          scf_snapshot_destroy(snap);
1086 1109          free(method);
1087 1110          return (result);
1088 1111  }
1089 1112  
1090 1113  /*
1091 1114   * The method thread executes a service method to effect a state transition.
1092 1115   * The next_state of info->sf_id should be non-_NONE on entrance, and it will
1093 1116   * be _NONE on exit (state will either be what next_state was (on success), or
1094 1117   * it will be _MAINT (on error)).
1095 1118   *
1096 1119   * There are six classes of methods to consider: start & other (stop, refresh)
1097 1120   * for each of "normal" services, wait services, and transient services.  For
1098 1121   * each, the method must be fetched from the repository & executed.  fork()ed
1099 1122   * methods must be waited on, except for the start method of wait services
1100 1123   * (which must be registered with the wait subsystem via wait_register()).  If
1101 1124   * the method succeeded (returned 0), then for start methods its contract
1102 1125   * should be recorded as the primary contract for the service.  For other
1103 1126   * methods, it should be abandoned.  If the method fails, then depending on
1104 1127   * the failure, either the method should be reexecuted or the service should
1105 1128   * be put into maintenance.  Either way the contract should be abandoned.
1106 1129   */
1107 1130  void *
1108 1131  method_thread(void *arg)
1109 1132  {
1110 1133          fork_info_t *info = arg;
1111 1134          restarter_inst_t *inst;
1112 1135          scf_handle_t    *local_handle;
1113 1136          scf_instance_t  *s_inst = NULL;
1114 1137          int r, exit_code;
1115 1138          boolean_t retryable;
1116 1139          restarter_str_t reason;
1117 1140  
1118 1141          assert(0 <= info->sf_method_type && info->sf_method_type <= 2);
1119 1142  
1120 1143          /* Get (and lock) the restarter_inst_t. */
1121 1144          inst = inst_lookup_by_id(info->sf_id);
1122 1145  
1123 1146          assert(inst->ri_method_thread != 0);
1124 1147          assert(instance_in_transition(inst) == 1);
1125 1148  
1126 1149          /*
1127 1150           * We cannot leave this function with inst in transition, because
1128 1151           * protocol.c withholds messages for inst otherwise.
1129 1152           */
1130 1153  
1131 1154          log_framework(LOG_DEBUG, "method_thread() running %s method for %s.\n",
1132 1155              method_names[info->sf_method_type], inst->ri_i.i_fmri);
1133 1156  
1134 1157          local_handle = libscf_handle_create_bound_loop();
1135 1158  
1136 1159  rebind_retry:
1137 1160          /* get scf_instance_t */
1138 1161          switch (r = libscf_fmri_get_instance(local_handle, inst->ri_i.i_fmri,
1139 1162              &s_inst)) {
1140 1163          case 0:
1141 1164                  break;
1142 1165  
1143 1166          case ECONNABORTED:
1144 1167                  libscf_handle_rebind(local_handle);
1145 1168                  goto rebind_retry;
1146 1169  
1147 1170          case ENOENT:
1148 1171                  /*
1149 1172                   * It's not there, but we need to call this so protocol.c
1150 1173                   * doesn't think it's in transition anymore.
1151 1174                   */
1152 1175                  (void) restarter_instance_update_states(local_handle, inst,
1153 1176                      inst->ri_i.i_state, RESTARTER_STATE_NONE, RERR_NONE,
1154 1177                      restarter_str_none);
1155 1178                  goto out;
1156 1179  
1157 1180          case EINVAL:
1158 1181          case ENOTSUP:
1159 1182          default:
1160 1183                  bad_error("libscf_fmri_get_instance", r);
1161 1184          }

↓ open down ↓

121 lines elided

↑ open up ↑

1162 1185  
1163 1186          inst->ri_m_inst = s_inst;
1164 1187          inst->ri_mi_deleted = B_FALSE;
1165 1188  
1166 1189  retry:
1167 1190          if (info->sf_method_type == METHOD_START)
1168 1191                  log_transition(inst, START_REQUESTED);
1169 1192  
1170 1193          r = method_run(&inst, info->sf_method_type, &exit_code);
1171 1194  
1172      -        if (r == 0 && exit_code == 0) {
     1195 +        if (r == 0 && method_failed(exit_code) == 0) {
1173 1196                  /* Success! */
1174 1197                  assert(inst->ri_i.i_next_state != RESTARTER_STATE_NONE);
1175 1198  
1176 1199                  /*
1177 1200                   * When a stop method succeeds, remove the primary contract of
1178 1201                   * the service, unless we're going to offline, in which case
1179 1202                   * retain the contract so we can transfer inherited contracts to
1180 1203                   * the replacement service.
1181 1204                   */
1182 1205

1183 1206                  if (info->sf_method_type == METHOD_STOP &&
1184 1207                      inst->ri_i.i_primary_ctid != 0) {
1185 1208                          if (inst->ri_i.i_next_state == RESTARTER_STATE_OFFLINE)
1186 1209                                  inst->ri_i.i_primary_ctid_stopped = 1;
1187 1210                          else
1188 1211                                  method_remove_contract(inst, B_TRUE, B_TRUE);
1189 1212                  }
1190 1213                  /*
1191 1214                   * We don't care whether the handle was rebound because this is
1192 1215                   * the last thing we do with it.
1193 1216                   */
1194 1217                  (void) restarter_instance_update_states(local_handle, inst,
1195 1218                      inst->ri_i.i_next_state, RESTARTER_STATE_NONE,
1196 1219                      info->sf_event_type, info->sf_reason);
1197 1220  
1198 1221                  (void) update_fault_count(inst, FAULT_COUNT_RESET);
1199 1222  
1200 1223                  goto out;
1201 1224          }
1202 1225  
1203 1226          /* Failure.  Retry or go to maintenance. */
1204 1227  
1205 1228          if (r != 0 && r != EAGAIN) {
1206 1229                  retryable = B_FALSE;
1207 1230          } else {
1208 1231                  switch (exit_code) {
1209 1232                  case SMF_EXIT_ERR_CONFIG:
1210 1233                  case SMF_EXIT_ERR_NOSMF:
1211 1234                  case SMF_EXIT_ERR_PERM:
1212 1235                  case SMF_EXIT_ERR_FATAL:
1213 1236                          retryable = B_FALSE;
1214 1237                          break;
1215 1238  
1216 1239                  default:
1217 1240                          retryable = B_TRUE;
1218 1241                  }
1219 1242          }
1220 1243  
1221 1244          if (retryable && update_fault_count(inst, FAULT_COUNT_INCR) != 1)
1222 1245                  goto retry;
1223 1246  
1224 1247          /* maintenance */
1225 1248          if (r == ELOOP)
1226 1249                  log_transition(inst, START_FAILED_REPEATEDLY);
1227 1250          else if (r == ERANGE)
1228 1251                  log_transition(inst, START_FAILED_TIMEOUT_FATAL);
1229 1252          else if (exit_code == SMF_EXIT_ERR_CONFIG)
1230 1253                  log_transition(inst, START_FAILED_CONFIGURATION);
1231 1254          else if (exit_code == SMF_EXIT_ERR_FATAL)
1232 1255                  log_transition(inst, START_FAILED_FATAL);
1233 1256          else
1234 1257                  log_transition(inst, START_FAILED_OTHER);
1235 1258  
1236 1259          if (r == ELOOP) {
1237 1260                  reason = restarter_str_restarting_too_quickly;
1238 1261          } else if (retryable) {
1239 1262                  reason = restarter_str_fault_threshold_reached;
1240 1263          } else {
1241 1264                  reason = restarter_str_method_failed;
1242 1265          }
1243 1266  
1244 1267          (void) restarter_instance_update_states(local_handle, inst,
1245 1268              RESTARTER_STATE_MAINT, RESTARTER_STATE_NONE, RERR_FAULT,
1246 1269              reason);
1247 1270  
1248 1271          if (!method_is_transient(inst, info->sf_method_type) &&
1249 1272              inst->ri_i.i_primary_ctid != 0)
1250 1273                  method_remove_contract(inst, B_TRUE, B_TRUE);
1251 1274  
1252 1275  out:
1253 1276          inst->ri_method_thread = 0;
1254 1277  
1255 1278          /*
1256 1279           * Unlock the mutex after broadcasting to avoid a race condition
1257 1280           * with restarter_delete_inst() when the 'inst' structure is freed.
1258 1281           */
1259 1282          (void) pthread_cond_broadcast(&inst->ri_method_cv);
1260 1283          MUTEX_UNLOCK(&inst->ri_lock);
1261 1284  
1262 1285          scf_instance_destroy(s_inst);
1263 1286          scf_handle_destroy(local_handle);
1264 1287          startd_free(info, sizeof (fork_info_t));
1265 1288          return (NULL);
1266 1289  }

↓ open down ↓

84 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX