7267 SMF is fast and loose with optional dependencies (fixes)
Reviewed by: Dan McDonald <danmcd@omniti.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Albert Lee <trisk@omniti.com>

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  *
  25  * Copyright 2016 RackTop Systems.
  26  */
  27 
  28 
  29 /*
  30  * transition.c - Graph State Machine
  31  *
  32  * The graph state machine is implemented here, with a typical approach
  33  * of a function per state.  Separating the implementation allows more
  34  * clarity into the actions taken on notification of state change, as well
  35  * as a place for future expansion including hooks for configurable actions.
  36  * All functions are called with dgraph_lock held.
  37  *
  38  * The start action for this state machine is not explicit.  The states
  39  * (ONLINE and DEGRADED) which need to know when they're entering the state
  40  * due to a daemon restart implement this understanding by checking for
  41  * transition from uninitialized.  In the future, this would likely be better
  42  * as an explicit start action instead of relying on an overloaded transition.
  43  *
  44  * All gt_enter functions use the same set of return codes.
  45  *    0              success
  46  *    ECONNABORTED   repository connection aborted
  47  */
  48 
  49 #include "startd.h"
  50 
  51 static int
  52 gt_running(restarter_instance_state_t state)
  53 {
  54         if (state == RESTARTER_STATE_ONLINE ||
  55             state == RESTARTER_STATE_DEGRADED)
  56                 return (1);
  57 
  58         return (0);
  59 }
  60 
  61 static int
  62 gt_enter_uninit(scf_handle_t *h, graph_vertex_t *v,
  63     restarter_instance_state_t old_state, restarter_error_t rerr)
  64 {
  65         int err;
  66         scf_instance_t *inst;
  67 
  68         /* Initialize instance by refreshing it. */
  69 
  70         err = libscf_fmri_get_instance(h, v->gv_name, &inst);
  71         switch (err) {
  72         case 0:
  73                 break;
  74 
  75         case ECONNABORTED:
  76                 return (ECONNABORTED);
  77 
  78         case ENOENT:
  79                 return (0);
  80 
  81         case EINVAL:
  82         case ENOTSUP:
  83         default:
  84                 bad_error("libscf_fmri_get_instance", err);
  85         }
  86 
  87         err = refresh_vertex(v, inst);
  88         if (err == 0)
  89                 graph_enable_by_vertex(v, v->gv_flags & GV_ENABLED, 0);
  90 
  91         scf_instance_destroy(inst);
  92 
  93         /* If the service was running, propagate a stop event. */
  94         if (gt_running(old_state)) {
  95                 log_framework(LOG_DEBUG, "Propagating stop of %s.\n",
  96                     v->gv_name);
  97 
  98                 graph_transition_propagate(v, PROPAGATE_STOP, rerr);
  99         }
 100 
 101         graph_transition_sulogin(RESTARTER_STATE_UNINIT, old_state);
 102         return (0);
 103 }
 104 
 105 /* ARGSUSED */
 106 static int
 107 gt_enter_maint(scf_handle_t *h, graph_vertex_t *v,
 108     restarter_instance_state_t old_state, restarter_error_t rerr)
 109 {
 110         int to_offline = v->gv_flags & GV_TOOFFLINE;
 111 
 112         /*
 113          * If the service was running, propagate a stop event.  If the
 114          * service was not running the maintenance transition may satisfy
 115          * optional dependencies and should be propagated to determine
 116          * whether new dependents are satisfiable.
 117          * Instances that transition to maintenance and have the GV_TOOFFLINE
 118          * flag are special because they can expose new subtree leaves so
 119          * propagate the offline to the instance dependencies.
 120          */
 121 
 122         /* instance transitioning to maintenance is considered disabled */
 123         v->gv_flags &= ~GV_TODISABLE;
 124         v->gv_flags &= ~GV_TOOFFLINE;
 125 
 126         if (gt_running(old_state)) {
 127                 /*
 128                  * Handle state change during instance disabling.
 129                  * Propagate offline to the new exposed leaves.
 130                  */
 131                 if (to_offline) {
 132                         log_framework(LOG_DEBUG, "%s removed from subtree\n",
 133                             v->gv_name);
 134 
 135                         graph_offline_subtree_leaves(v, (void *)h);
 136                 }
 137 
 138                 log_framework(LOG_DEBUG, "Propagating maintenance (stop) of "
 139                     "%s.\n", v->gv_name);
 140 
 141                 graph_transition_propagate(v, PROPAGATE_STOP, rerr);
 142 
 143                 /*
 144                  * The maintenance transition may satisfy optional_all/restart
 145                  * dependencies and should be propagated to determine
 146                  * whether new dependents are satisfiable.
 147                  */
 148                 graph_transition_propagate(v, PROPAGATE_SAT, rerr);
 149         } else {
 150                 log_framework(LOG_DEBUG, "Propagating maintenance of %s.\n",
 151                     v->gv_name);
 152 
 153                 graph_transition_propagate(v, PROPAGATE_SAT, rerr);
 154         }
 155 
 156         graph_transition_sulogin(RESTARTER_STATE_MAINT, old_state);
 157         return (0);
 158 }
 159 
 160 /* ARGSUSED */
 161 static int
 162 gt_enter_offline(scf_handle_t *h, graph_vertex_t *v,
 163     restarter_instance_state_t old_state, restarter_error_t rerr)
 164 {
 165         int to_offline = v->gv_flags & GV_TOOFFLINE;

 166 
 167         v->gv_flags &= ~GV_TOOFFLINE;
 168 
 169         /*
 170          * If the instance should be enabled, see if we can start it.
 171          * Otherwise send a disable command.
 172          * If a instance has the GV_TOOFFLINE flag set then it must
 173          * remains offline until the disable process completes.
 174          */
 175         if (v->gv_flags & GV_ENABLED) {
 176                 if (to_offline == 0)
 177                         graph_start_if_satisfied(v);
 178         } else {
 179                 if (gt_running(old_state) && v->gv_post_disable_f)
 180                         v->gv_post_disable_f();
 181 
 182                 vertex_send_event(v, RESTARTER_EVENT_TYPE_DISABLE);
 183         }
 184 
 185         /*
 186          * If the service was running, propagate a stop event.  If the
 187          * service was not running the offline transition may satisfy
 188          * optional dependencies and should be propagated to determine
 189          * whether new dependents are satisfiable.
 190          * Instances that transition to offline and have the GV_TOOFFLINE flag
 191          * are special because they can expose new subtree leaves so propagate
 192          * the offline to the instance dependencies.
 193          */
 194         if (gt_running(old_state)) {
 195                 /*
 196                  * Handle state change during instance disabling.
 197                  * Propagate offline to the new exposed leaves.
 198                  */
 199                 if (to_offline) {
 200                         log_framework(LOG_DEBUG, "%s removed from subtree\n",
 201                             v->gv_name);
 202 
 203                         graph_offline_subtree_leaves(v, (void *)h);
 204                 }
 205 
 206                 log_framework(LOG_DEBUG, "Propagating stop of %s.\n",
 207                     v->gv_name);
 208 
 209                 graph_transition_propagate(v, PROPAGATE_STOP, rerr);
 210 
 211                 /*
 212                  * The offline transition may satisfy require_any/restart
 213                  * dependencies and should be propagated to determine
 214                  * whether new dependents are satisfiable.
 215                  */
 216                 graph_transition_propagate(v, PROPAGATE_SAT, rerr);
 217         } else {
 218                 log_framework(LOG_DEBUG, "Propagating offline of %s.\n",
 219                     v->gv_name);
 220 
 221                 graph_transition_propagate(v, PROPAGATE_SAT, rerr);
 222         }
 223 
 224         graph_transition_sulogin(RESTARTER_STATE_OFFLINE, old_state);
 225         return (0);
 226 }
 227 
 228 /* ARGSUSED */
 229 static int
 230 gt_enter_disabled(scf_handle_t *h, graph_vertex_t *v,
 231     restarter_instance_state_t old_state, restarter_error_t rerr)
 232 {
 233         int to_offline = v->gv_flags & GV_TOOFFLINE;
 234 
 235         v->gv_flags &= ~GV_TODISABLE;
 236         v->gv_flags &= ~GV_TOOFFLINE;
 237 
 238         /*
 239          * If the instance should be disabled, no problem.  Otherwise,
 240          * send an enable command, which should result in the instance
 241          * moving to OFFLINE unless the instance is part of a subtree
 242          * (non root) and in this case the result is unpredictable.
 243          */
 244         if (v->gv_flags & GV_ENABLED) {
 245                 vertex_send_event(v, RESTARTER_EVENT_TYPE_ENABLE);
 246         } else if (gt_running(old_state) && v->gv_post_disable_f) {
 247                 v->gv_post_disable_f();
 248         }
 249 
 250         /*
 251          * If the service was running, propagate this as a stop.  If the
 252          * service was not running the disabled transition may satisfy
 253          * optional dependencies and should be propagated to determine
 254          * whether new dependents are satisfiable.
 255          */
 256         if (gt_running(old_state)) {
 257                 /*
 258                  * We need to propagate the offline to new exposed leaves in
 259                  * case we've just disabled an instance that was part of a
 260                  * subtree.
 261                  */
 262                 if (to_offline) {
 263                         log_framework(LOG_DEBUG, "%s removed from subtree\n",
 264                             v->gv_name);
 265 
 266                         /*
 267                          * Handle state change during instance disabling.
 268                          * Propagate offline to the new exposed leaves.
 269                          */
 270                         graph_offline_subtree_leaves(v, (void *)h);
 271                 }
 272 
 273 
 274                 log_framework(LOG_DEBUG, "Propagating stop of %s.\n",
 275                     v->gv_name);
 276 
 277                 graph_transition_propagate(v, PROPAGATE_STOP, rerr);
 278 
 279                 /*
 280                  * The disable transition may satisfy optional_all/restart
 281                  * dependencies and should be propagated to determine
 282                  * whether new dependents are satisfiable.
 283                  */
 284                 graph_transition_propagate(v, PROPAGATE_SAT, rerr);
 285         } else {
 286                 log_framework(LOG_DEBUG, "Propagating disable of %s.\n",
 287                     v->gv_name);
 288 
 289                 graph_transition_propagate(v, PROPAGATE_SAT, rerr);
 290         }
 291 
 292         graph_transition_sulogin(RESTARTER_STATE_DISABLED, old_state);
 293         return (0);
 294 }
 295 
 296 static int
 297 gt_internal_online_or_degraded(scf_handle_t *h, graph_vertex_t *v,
 298     restarter_instance_state_t old_state, restarter_error_t rerr)
 299 {
 300         int r;
 301 
 302         /*
 303          * If the instance has just come up, update the start
 304          * snapshot.
 305          */
 306         if (gt_running(old_state) == 0) {
 307                 /*
 308                  * Don't fire if we're just recovering state
 309                  * after a restart.
 310                  */
 311                 if (old_state != RESTARTER_STATE_UNINIT &&
 312                     v->gv_post_online_f)
 313                         v->gv_post_online_f();
 314 
 315                 r = libscf_snapshots_poststart(h, v->gv_name, B_TRUE);
 316                 switch (r) {
 317                 case 0:
 318                 case ENOENT:
 319                         /*
 320                          * If ENOENT, the instance must have been
 321                          * deleted.  Pretend we were successful since
 322                          * we should get a delete event later.
 323                          */
 324                         break;
 325 
 326                 case ECONNABORTED:
 327                         return (ECONNABORTED);
 328 
 329                 case EACCES:
 330                 case ENOTSUP:
 331                 default:
 332                         bad_error("libscf_snapshots_poststart", r);
 333                 }
 334         }
 335 
 336         if (!(v->gv_flags & GV_ENABLED)) {
 337                 vertex_send_event(v, RESTARTER_EVENT_TYPE_DISABLE);
 338         } else if (v->gv_flags & GV_TOOFFLINE) {
 339                 /*
 340                  * If the vertex has the GV_TOOFFLINE flag set then that's
 341                  * because the instance was transitioning from offline to
 342                  * online and the reverse disable algorithm doesn't offline
 343                  * those instances because it was already appearing offline.
 344                  * So do it now.
 345                  */
 346                 offline_vertex(v);
 347         }
 348 
 349         if (gt_running(old_state) == 0) {
 350                 log_framework(LOG_DEBUG, "Propagating start of %s.\n",
 351                     v->gv_name);
 352 
 353                 graph_transition_propagate(v, PROPAGATE_START, rerr);
 354         } else if (rerr == RERR_REFRESH) {
 355                 /* For refresh we'll get a message sans state change */
 356 
 357                 log_framework(LOG_DEBUG, "Propagating refresh of %s.\n",
 358                     v->gv_name);
 359 
 360                 graph_transition_propagate(v, PROPAGATE_STOP, rerr);
 361         }
 362 
 363         return (0);
 364 }
 365 
 366 static int
 367 gt_enter_online(scf_handle_t *h, graph_vertex_t *v,
 368     restarter_instance_state_t old_state, restarter_error_t rerr)
 369 {
 370         int r;
 371 
 372         r = gt_internal_online_or_degraded(h, v, old_state, rerr);
 373         if (r != 0)
 374                 return (r);
 375 
 376         graph_transition_sulogin(RESTARTER_STATE_ONLINE, old_state);
 377         return (0);
 378 }
 379 
 380 static int
 381 gt_enter_degraded(scf_handle_t *h, graph_vertex_t *v,
 382     restarter_instance_state_t old_state, restarter_error_t rerr)
 383 {
 384         int r;
 385 
 386         r = gt_internal_online_or_degraded(h, v, old_state, rerr);
 387         if (r != 0)
 388                 return (r);
 389 
 390         graph_transition_sulogin(RESTARTER_STATE_DEGRADED, old_state);
 391         return (0);
 392 }
 393 
 394 /*
 395  * gt_transition() implements the state transition for the graph
 396  * state machine.  It can return:
 397  *    0              success
 398  *    ECONNABORTED   repository connection aborted
 399  *
 400  * v->gv_state should be set to the state we're transitioning to before
 401  * calling this function.
 402  */
 403 int
 404 gt_transition(scf_handle_t *h, graph_vertex_t *v, restarter_error_t rerr,
 405     restarter_instance_state_t old_state)
 406 {
 407         int err;
 408         int lost_repository = 0;
 409 
 410         /*
 411          * If there's a common set of work to be done on exit from the
 412          * old_state, include it as a separate set of functions here.  For
 413          * now there's no such work, so there are no gt_exit functions.
 414          */
 415 
 416         err = vertex_subgraph_dependencies_shutdown(h, v, old_state);
 417         switch (err) {
 418         case 0:
 419                 break;
 420 
 421         case ECONNABORTED:
 422                 lost_repository = 1;
 423                 break;
 424 
 425         default:
 426                 bad_error("vertex_subgraph_dependencies_shutdown", err);
 427         }
 428 
 429         /*
 430          * Now call the appropriate gt_enter function for the new state.
 431          */
 432         switch (v->gv_state) {
 433         case RESTARTER_STATE_UNINIT:
 434                 err = gt_enter_uninit(h, v, old_state, rerr);
 435                 break;
 436 
 437         case RESTARTER_STATE_DISABLED:
 438                 err = gt_enter_disabled(h, v, old_state, rerr);
 439                 break;
 440 
 441         case RESTARTER_STATE_OFFLINE:
 442                 err = gt_enter_offline(h, v, old_state, rerr);
 443                 break;
 444 
 445         case RESTARTER_STATE_ONLINE:
 446                 err = gt_enter_online(h, v, old_state, rerr);
 447                 break;
 448 
 449         case RESTARTER_STATE_DEGRADED:
 450                 err = gt_enter_degraded(h, v, old_state, rerr);
 451                 break;
 452 
 453         case RESTARTER_STATE_MAINT:
 454                 err = gt_enter_maint(h, v, old_state, rerr);
 455                 break;
 456 
 457         default:
 458                 /* Shouldn't be in an invalid state. */
 459 #ifndef NDEBUG
 460                 uu_warn("%s:%d: Invalid state %d.\n", __FILE__, __LINE__,
 461                     v->gv_state);
 462 #endif
 463                 abort();
 464         }
 465 
 466         switch (err) {
 467         case 0:
 468                 break;
 469 
 470         case ECONNABORTED:
 471                 lost_repository = 1;
 472                 break;
 473 
 474         default:
 475 #ifndef NDEBUG
 476                 uu_warn("%s:%d: "
 477                     "gt_enter_%s() failed with unexpected error %d.\n",
 478                     __FILE__, __LINE__, instance_state_str[v->gv_state], err);
 479 #endif
 480                 abort();
 481         }
 482 
 483         return (lost_repository ? ECONNABORTED : 0);
 484 }
--- EOF ---