7267 SMF is fast and loose with optional dependencies (fixes)
Reviewed by: Dan McDonald <danmcd@omniti.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Albert Lee <trisk@omniti.com>

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  *
  25  * Copyright 2016 RackTop Systems.
  26  */
  27 
  28 
  29 /*
  30  * transition.c - Graph State Machine
  31  *
  32  * The graph state machine is implemented here, with a typical approach
  33  * of a function per state.  Separating the implementation allows more
  34  * clarity into the actions taken on notification of state change, as well
  35  * as a place for future expansion including hooks for configurable actions.
  36  * All functions are called with dgraph_lock held.
  37  *
  38  * The start action for this state machine is not explicit.  The states
  39  * (ONLINE and DEGRADED) which need to know when they're entering the state
  40  * due to a daemon restart implement this understanding by checking for
  41  * transition from uninitialized.  In the future, this would likely be better
  42  * as an explicit start action instead of relying on an overloaded transition.
  43  *
  44  * All gt_enter functions use the same set of return codes.
  45  *    0              success
  46  *    ECONNABORTED   repository connection aborted
  47  */
  48 
  49 #include "startd.h"
  50 
  51 static int
  52 gt_running(restarter_instance_state_t state)
  53 {
  54         if (state == RESTARTER_STATE_ONLINE ||
  55             state == RESTARTER_STATE_DEGRADED)
  56                 return (1);
  57 
  58         return (0);
  59 }
  60 
  61 static int
  62 gt_enter_uninit(scf_handle_t *h, graph_vertex_t *v,
  63     restarter_instance_state_t old_state, restarter_error_t rerr)
  64 {
  65         int err;
  66         scf_instance_t *inst;
  67 
  68         /* Initialize instance by refreshing it. */
  69 
  70         err = libscf_fmri_get_instance(h, v->gv_name, &inst);
  71         switch (err) {
  72         case 0:
  73                 break;
  74 
  75         case ECONNABORTED:
  76                 return (ECONNABORTED);
  77 
  78         case ENOENT:
  79                 return (0);
  80 
  81         case EINVAL:
  82         case ENOTSUP:
  83         default:
  84                 bad_error("libscf_fmri_get_instance", err);
  85         }
  86 
  87         err = refresh_vertex(v, inst);
  88         if (err == 0)
  89                 graph_enable_by_vertex(v, v->gv_flags & GV_ENABLED, 0);
  90 
  91         scf_instance_destroy(inst);
  92 
  93         /* If the service was running, propagate a stop event. */
  94         if (gt_running(old_state)) {
  95                 log_framework(LOG_DEBUG, "Propagating stop of %s.\n",
  96                     v->gv_name);
  97 
  98                 graph_transition_propagate(v, PROPAGATE_STOP, rerr);
  99         }
 100 
 101         graph_transition_sulogin(RESTARTER_STATE_UNINIT, old_state);
 102         return (0);
 103 }
 104 
 105 /* ARGSUSED */
 106 static int
 107 gt_enter_maint(scf_handle_t *h, graph_vertex_t *v,
 108     restarter_instance_state_t old_state, restarter_error_t rerr)
 109 {
 110         int to_offline = v->gv_flags & GV_TOOFFLINE;
 111 
 112         /*
 113          * If the service was running, propagate a stop event.  If the
 114          * service was not running the maintenance transition may satisfy
 115          * optional dependencies and should be propagated to determine
 116          * whether new dependents are satisfiable.
 117          * Instances that transition to maintenance and have the GV_TOOFFLINE
 118          * flag are special because they can expose new subtree leaves so
 119          * propagate the offline to the instance dependencies.
 120          */
 121 
 122         /* instance transitioning to maintenance is considered disabled */
 123         v->gv_flags &= ~GV_TODISABLE;
 124         v->gv_flags &= ~GV_TOOFFLINE;
 125 
 126         if (gt_running(old_state)) {
 127                 /*
 128                  * Handle state change during instance disabling.
 129                  * Propagate offline to the new exposed leaves.
 130                  */
 131                 if (to_offline) {
 132                         log_framework(LOG_DEBUG, "%s removed from subtree\n",
 133                             v->gv_name);
 134 
 135                         graph_offline_subtree_leaves(v, (void *)h);
 136                 }
 137 
 138                 log_framework(LOG_DEBUG, "Propagating maintenance (stop) of "
 139                     "%s.\n", v->gv_name);
 140 
 141                 graph_transition_propagate(v, PROPAGATE_STOP, rerr);
 142 
 143                 /*
 144                  * The maintenance transition may satisfy optional_all/restart
 145                  * dependencies and should be propagated to determine
 146                  * whether new dependents are satisfiable.
 147                  */
 148                 graph_transition_propagate(v, PROPAGATE_SAT, rerr);
 149         } else {
 150                 log_framework(LOG_DEBUG, "Propagating maintenance of %s.\n",
 151                     v->gv_name);
 152 
 153                 graph_transition_propagate(v, PROPAGATE_SAT, rerr);
 154         }
 155 
 156         graph_transition_sulogin(RESTARTER_STATE_MAINT, old_state);
 157         return (0);
 158 }
 159 
 160 /* ARGSUSED */
 161 static int
 162 gt_enter_offline(scf_handle_t *h, graph_vertex_t *v,
 163     restarter_instance_state_t old_state, restarter_error_t rerr)
 164 {
 165         int to_offline = v->gv_flags & GV_TOOFFLINE;
 166         int to_disable = v->gv_flags & GV_TODISABLE;
 167 
 168         v->gv_flags &= ~GV_TOOFFLINE;
 169 
 170         /*
 171          * If the instance should be enabled, see if we can start it.
 172          * Otherwise send a disable command.
 173          * If a instance has the GV_TOOFFLINE flag set then it must
 174          * remains offline until the disable process completes.
 175          */
 176         if (v->gv_flags & GV_ENABLED) {
 177                 if (to_offline == 0 && to_disable == 0)
 178                         graph_start_if_satisfied(v);
 179         } else {
 180                 if (gt_running(old_state) && v->gv_post_disable_f)
 181                         v->gv_post_disable_f();
 182 
 183                 vertex_send_event(v, RESTARTER_EVENT_TYPE_DISABLE);
 184         }
 185 
 186         /*
 187          * If the service was running, propagate a stop event.  If the
 188          * service was not running the offline transition may satisfy
 189          * optional dependencies and should be propagated to determine
 190          * whether new dependents are satisfiable.
 191          * Instances that transition to offline and have the GV_TOOFFLINE flag
 192          * are special because they can expose new subtree leaves so propagate
 193          * the offline to the instance dependencies.
 194          */
 195         if (gt_running(old_state)) {
 196                 /*
 197                  * Handle state change during instance disabling.
 198                  * Propagate offline to the new exposed leaves.
 199                  */
 200                 if (to_offline) {
 201                         log_framework(LOG_DEBUG, "%s removed from subtree\n",
 202                             v->gv_name);
 203 
 204                         graph_offline_subtree_leaves(v, (void *)h);
 205                 }
 206 
 207                 log_framework(LOG_DEBUG, "Propagating stop of %s.\n",
 208                     v->gv_name);
 209 
 210                 graph_transition_propagate(v, PROPAGATE_STOP, rerr);
 211 
 212                 /*
 213                  * The offline transition may satisfy require_any/restart
 214                  * dependencies and should be propagated to determine
 215                  * whether new dependents are satisfiable.
 216                  */
 217                 graph_transition_propagate(v, PROPAGATE_SAT, rerr);
 218         } else {
 219                 log_framework(LOG_DEBUG, "Propagating offline of %s.\n",
 220                     v->gv_name);
 221 
 222                 graph_transition_propagate(v, PROPAGATE_SAT, rerr);
 223         }
 224 
 225         graph_transition_sulogin(RESTARTER_STATE_OFFLINE, old_state);
 226         return (0);
 227 }
 228 
 229 /* ARGSUSED */
 230 static int
 231 gt_enter_disabled(scf_handle_t *h, graph_vertex_t *v,
 232     restarter_instance_state_t old_state, restarter_error_t rerr)
 233 {
 234         int to_offline = v->gv_flags & GV_TOOFFLINE;
 235 
 236         v->gv_flags &= ~GV_TODISABLE;
 237         v->gv_flags &= ~GV_TOOFFLINE;
 238 
 239         /*
 240          * If the instance should be disabled, no problem.  Otherwise,
 241          * send an enable command, which should result in the instance
 242          * moving to OFFLINE unless the instance is part of a subtree
 243          * (non root) and in this case the result is unpredictable.
 244          */
 245         if (v->gv_flags & GV_ENABLED) {
 246                 vertex_send_event(v, RESTARTER_EVENT_TYPE_ENABLE);
 247         } else if (gt_running(old_state) && v->gv_post_disable_f) {
 248                 v->gv_post_disable_f();
 249         }
 250 
 251         /*
 252          * If the service was running, propagate this as a stop.  If the
 253          * service was not running the disabled transition may satisfy
 254          * optional dependencies and should be propagated to determine
 255          * whether new dependents are satisfiable.
 256          */
 257         if (gt_running(old_state)) {
 258                 /*
 259                  * We need to propagate the offline to new exposed leaves in
 260                  * case we've just disabled an instance that was part of a
 261                  * subtree.
 262                  */
 263                 if (to_offline) {
 264                         log_framework(LOG_DEBUG, "%s removed from subtree\n",
 265                             v->gv_name);
 266 
 267                         /*
 268                          * Handle state change during instance disabling.
 269                          * Propagate offline to the new exposed leaves.
 270                          */
 271                         graph_offline_subtree_leaves(v, (void *)h);
 272                 }
 273 
 274 
 275                 log_framework(LOG_DEBUG, "Propagating stop of %s.\n",
 276                     v->gv_name);
 277 
 278                 graph_transition_propagate(v, PROPAGATE_STOP, rerr);
 279 
 280                 /*
 281                  * The disable transition may satisfy optional_all/restart
 282                  * dependencies and should be propagated to determine
 283                  * whether new dependents are satisfiable.
 284                  */
 285                 graph_transition_propagate(v, PROPAGATE_SAT, rerr);
 286         } else {
 287                 log_framework(LOG_DEBUG, "Propagating disable of %s.\n",
 288                     v->gv_name);
 289 
 290                 graph_transition_propagate(v, PROPAGATE_SAT, rerr);
 291         }
 292 
 293         graph_transition_sulogin(RESTARTER_STATE_DISABLED, old_state);
 294         return (0);
 295 }
 296 
 297 static int
 298 gt_internal_online_or_degraded(scf_handle_t *h, graph_vertex_t *v,
 299     restarter_instance_state_t old_state, restarter_error_t rerr)
 300 {
 301         int r;
 302 
 303         /*
 304          * If the instance has just come up, update the start
 305          * snapshot.
 306          */
 307         if (gt_running(old_state) == 0) {
 308                 /*
 309                  * Don't fire if we're just recovering state
 310                  * after a restart.
 311                  */
 312                 if (old_state != RESTARTER_STATE_UNINIT &&
 313                     v->gv_post_online_f)
 314                         v->gv_post_online_f();
 315 
 316                 r = libscf_snapshots_poststart(h, v->gv_name, B_TRUE);
 317                 switch (r) {
 318                 case 0:
 319                 case ENOENT:
 320                         /*
 321                          * If ENOENT, the instance must have been
 322                          * deleted.  Pretend we were successful since
 323                          * we should get a delete event later.
 324                          */
 325                         break;
 326 
 327                 case ECONNABORTED:
 328                         return (ECONNABORTED);
 329 
 330                 case EACCES:
 331                 case ENOTSUP:
 332                 default:
 333                         bad_error("libscf_snapshots_poststart", r);
 334                 }
 335         }
 336 
 337         if (!(v->gv_flags & GV_ENABLED)) {
 338                 vertex_send_event(v, RESTARTER_EVENT_TYPE_DISABLE);
 339         } else if (v->gv_flags & GV_TOOFFLINE) {
 340                 /*
 341                  * If the vertex has the GV_TOOFFLINE flag set then that's
 342                  * because the instance was transitioning from offline to
 343                  * online and the reverse disable algorithm doesn't offline
 344                  * those instances because it was already appearing offline.
 345                  * So do it now.
 346                  */
 347                 offline_vertex(v);
 348         }
 349 
 350         if (gt_running(old_state) == 0) {
 351                 log_framework(LOG_DEBUG, "Propagating start of %s.\n",
 352                     v->gv_name);
 353 
 354                 graph_transition_propagate(v, PROPAGATE_START, rerr);
 355         } else if (rerr == RERR_REFRESH) {
 356                 /* For refresh we'll get a message sans state change */
 357 
 358                 log_framework(LOG_DEBUG, "Propagating refresh of %s.\n",
 359                     v->gv_name);
 360 
 361                 graph_transition_propagate(v, PROPAGATE_STOP, rerr);
 362         }
 363 
 364         return (0);
 365 }
 366 
 367 static int
 368 gt_enter_online(scf_handle_t *h, graph_vertex_t *v,
 369     restarter_instance_state_t old_state, restarter_error_t rerr)
 370 {
 371         int r;
 372 
 373         r = gt_internal_online_or_degraded(h, v, old_state, rerr);
 374         if (r != 0)
 375                 return (r);
 376 
 377         graph_transition_sulogin(RESTARTER_STATE_ONLINE, old_state);
 378         return (0);
 379 }
 380 
 381 static int
 382 gt_enter_degraded(scf_handle_t *h, graph_vertex_t *v,
 383     restarter_instance_state_t old_state, restarter_error_t rerr)
 384 {
 385         int r;
 386 
 387         r = gt_internal_online_or_degraded(h, v, old_state, rerr);
 388         if (r != 0)
 389                 return (r);
 390 
 391         graph_transition_sulogin(RESTARTER_STATE_DEGRADED, old_state);
 392         return (0);
 393 }
 394 
 395 /*
 396  * gt_transition() implements the state transition for the graph
 397  * state machine.  It can return:
 398  *    0              success
 399  *    ECONNABORTED   repository connection aborted
 400  *
 401  * v->gv_state should be set to the state we're transitioning to before
 402  * calling this function.
 403  */
 404 int
 405 gt_transition(scf_handle_t *h, graph_vertex_t *v, restarter_error_t rerr,
 406     restarter_instance_state_t old_state)
 407 {
 408         int err;
 409         int lost_repository = 0;
 410 
 411         /*
 412          * If there's a common set of work to be done on exit from the
 413          * old_state, include it as a separate set of functions here.  For
 414          * now there's no such work, so there are no gt_exit functions.
 415          */
 416 
 417         err = vertex_subgraph_dependencies_shutdown(h, v, old_state);
 418         switch (err) {
 419         case 0:
 420                 break;
 421 
 422         case ECONNABORTED:
 423                 lost_repository = 1;
 424                 break;
 425 
 426         default:
 427                 bad_error("vertex_subgraph_dependencies_shutdown", err);
 428         }
 429 
 430         /*
 431          * Now call the appropriate gt_enter function for the new state.
 432          */
 433         switch (v->gv_state) {
 434         case RESTARTER_STATE_UNINIT:
 435                 err = gt_enter_uninit(h, v, old_state, rerr);
 436                 break;
 437 
 438         case RESTARTER_STATE_DISABLED:
 439                 err = gt_enter_disabled(h, v, old_state, rerr);
 440                 break;
 441 
 442         case RESTARTER_STATE_OFFLINE:
 443                 err = gt_enter_offline(h, v, old_state, rerr);
 444                 break;
 445 
 446         case RESTARTER_STATE_ONLINE:
 447                 err = gt_enter_online(h, v, old_state, rerr);
 448                 break;
 449 
 450         case RESTARTER_STATE_DEGRADED:
 451                 err = gt_enter_degraded(h, v, old_state, rerr);
 452                 break;
 453 
 454         case RESTARTER_STATE_MAINT:
 455                 err = gt_enter_maint(h, v, old_state, rerr);
 456                 break;
 457 
 458         default:
 459                 /* Shouldn't be in an invalid state. */
 460 #ifndef NDEBUG
 461                 uu_warn("%s:%d: Invalid state %d.\n", __FILE__, __LINE__,
 462                     v->gv_state);
 463 #endif
 464                 abort();
 465         }
 466 
 467         switch (err) {
 468         case 0:
 469                 break;
 470 
 471         case ECONNABORTED:
 472                 lost_repository = 1;
 473                 break;
 474 
 475         default:
 476 #ifndef NDEBUG
 477                 uu_warn("%s:%d: "
 478                     "gt_enter_%s() failed with unexpected error %d.\n",
 479                     __FILE__, __LINE__, instance_state_str[v->gv_state], err);
 480 #endif
 481                 abort();
 482         }
 483 
 484         return (lost_repository ? ECONNABORTED : 0);
 485 }
--- EOF ---