Print this page
7711 SMF: Finish implementing support for degraded state
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/cmd/svc/startd/restarter.c
+++ new/usr/src/cmd/svc/startd/restarter.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
↓ open down ↓ |
14 lines elided |
↑ open up ↑ |
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
25 + * Copyright 2017 RackTop Systems.
25 26 */
26 27
27 28 /*
28 29 * restarter.c - service manipulation
29 30 *
30 31 * This component manages services whose restarter is svc.startd, the standard
31 32 * restarter. It translates restarter protocol events from the graph engine
32 33 * into actions on processes, as a delegated restarter would do.
33 34 *
34 35 * The master restarter manages a number of always-running threads:
35 36 * - restarter event thread: events from the graph engine
36 37 * - timeout thread: thread to fire queued timeouts
37 38 * - contract thread: thread to handle contract events
38 39 * - wait thread: thread to handle wait-based services
39 40 *
40 41 * The other threads are created as-needed:
41 42 * - per-instance method threads
42 43 * - per-instance event processing threads
43 44 *
44 45 * The interaction of all threads must result in the following conditions
45 46 * being satisfied (on a per-instance basis):
46 47 * - restarter events must be processed in order
47 48 * - method execution must be serialized
48 49 * - instance delete must be held until outstanding methods are complete
49 50 * - contract events shouldn't be processed while a method is running
50 51 * - timeouts should fire even when a method is running
51 52 *
52 53 * Service instances are represented by restarter_inst_t's and are kept in the
53 54 * instance_list list.
54 55 *
55 56 * Service States
56 57 * The current state of a service instance is kept in
57 58 * restarter_inst_t->ri_i.i_state. If transition to a new state could take
58 59 * some time, then before we effect the transition we set
59 60 * restarter_inst_t->ri_i.i_next_state to the target state, and afterwards we
60 61 * rotate i_next_state to i_state and set i_next_state to
61 62 * RESTARTER_STATE_NONE. So usually i_next_state is _NONE when ri_lock is not
62 63 * held. The exception is when we launch methods, which are done with
63 64 * a separate thread. To keep any other threads from grabbing ri_lock before
64 65 * method_thread() does, we set ri_method_thread to the thread id of the
65 66 * method thread, and when it is nonzero any thread with a different thread id
66 67 * waits on ri_method_cv.
67 68 *
68 69 * Method execution is serialized by blocking on ri_method_cv in
69 70 * inst_lookup_by_id() and waiting for a 0 value of ri_method_thread. This
70 71 * also prevents the instance structure from being deleted until all
71 72 * outstanding operations such as method_thread() have finished.
72 73 *
73 74 * Lock ordering:
74 75 *
75 76 * dgraph_lock [can be held when taking:]
76 77 * utmpx_lock
77 78 * dictionary->dict_lock
78 79 * st->st_load_lock
79 80 * wait_info_lock
80 81 * ru->restarter_update_lock
81 82 * restarter_queue->rpeq_lock
82 83 * instance_list.ril_lock
83 84 * inst->ri_lock
84 85 * st->st_configd_live_lock
85 86 *
86 87 * instance_list.ril_lock
87 88 * graph_queue->gpeq_lock
88 89 * gu->gu_lock
89 90 * st->st_configd_live_lock
90 91 * dictionary->dict_lock
91 92 * inst->ri_lock
92 93 * graph_queue->gpeq_lock
93 94 * gu->gu_lock
94 95 * tu->tu_lock
95 96 * tq->tq_lock
96 97 * inst->ri_queue_lock
97 98 * wait_info_lock
98 99 * bp->cb_lock
99 100 * utmpx_lock
100 101 *
101 102 * single_user_thread_lock
102 103 * wait_info_lock
103 104 * utmpx_lock
104 105 *
105 106 * gu_freeze_lock
106 107 *
107 108 * logbuf_mutex nests inside pretty much everything.
108 109 */
109 110
110 111 #include <sys/contract/process.h>
111 112 #include <sys/ctfs.h>
112 113 #include <sys/stat.h>
113 114 #include <sys/time.h>
114 115 #include <sys/types.h>
115 116 #include <sys/uio.h>
116 117 #include <sys/wait.h>
117 118 #include <assert.h>
118 119 #include <errno.h>
119 120 #include <fcntl.h>
120 121 #include <libcontract.h>
121 122 #include <libcontract_priv.h>
122 123 #include <libintl.h>
123 124 #include <librestart.h>
124 125 #include <librestart_priv.h>
125 126 #include <libuutil.h>
126 127 #include <limits.h>
127 128 #include <poll.h>
128 129 #include <port.h>
129 130 #include <pthread.h>
130 131 #include <stdarg.h>
131 132 #include <stdio.h>
132 133 #include <strings.h>
133 134 #include <unistd.h>
134 135
135 136 #include "startd.h"
136 137 #include "protocol.h"
137 138
138 139 static uu_list_pool_t *restarter_instance_pool;
139 140 static restarter_instance_list_t instance_list;
140 141
141 142 static uu_list_pool_t *restarter_queue_pool;
142 143
143 144 #define WT_SVC_ERR_THROTTLE 1 /* 1 sec delay for erroring wait svc */
144 145
145 146 /*
146 147 * Function used to reset the restart times for an instance, when
147 148 * an administrative task comes along and essentially makes the times
148 149 * in this array ineffective.
149 150 */
150 151 static void
151 152 reset_start_times(restarter_inst_t *inst)
152 153 {
153 154 inst->ri_start_index = 0;
154 155 bzero(inst->ri_start_time, sizeof (inst->ri_start_time));
155 156 }
156 157
157 158 /*ARGSUSED*/
158 159 static int
159 160 restarter_instance_compare(const void *lc_arg, const void *rc_arg,
160 161 void *private)
161 162 {
162 163 int lc_id = ((const restarter_inst_t *)lc_arg)->ri_id;
163 164 int rc_id = *(int *)rc_arg;
164 165
165 166 if (lc_id > rc_id)
166 167 return (1);
167 168 if (lc_id < rc_id)
168 169 return (-1);
169 170 return (0);
170 171 }
171 172
172 173 static restarter_inst_t *
173 174 inst_lookup_by_name(const char *name)
174 175 {
175 176 int id;
176 177
177 178 id = dict_lookup_byname(name);
178 179 if (id == -1)
179 180 return (NULL);
180 181
181 182 return (inst_lookup_by_id(id));
182 183 }
183 184
184 185 restarter_inst_t *
185 186 inst_lookup_by_id(int id)
186 187 {
187 188 restarter_inst_t *inst;
188 189
189 190 MUTEX_LOCK(&instance_list.ril_lock);
190 191 inst = uu_list_find(instance_list.ril_instance_list, &id, NULL, NULL);
191 192 if (inst != NULL)
192 193 MUTEX_LOCK(&inst->ri_lock);
193 194 MUTEX_UNLOCK(&instance_list.ril_lock);
194 195
195 196 if (inst != NULL) {
196 197 while (inst->ri_method_thread != 0 &&
197 198 !pthread_equal(inst->ri_method_thread, pthread_self())) {
198 199 ++inst->ri_method_waiters;
199 200 (void) pthread_cond_wait(&inst->ri_method_cv,
200 201 &inst->ri_lock);
201 202 assert(inst->ri_method_waiters > 0);
202 203 --inst->ri_method_waiters;
203 204 }
204 205 }
205 206
206 207 return (inst);
207 208 }
208 209
209 210 static restarter_inst_t *
210 211 inst_lookup_queue(const char *name)
211 212 {
212 213 int id;
213 214 restarter_inst_t *inst;
214 215
215 216 id = dict_lookup_byname(name);
216 217 if (id == -1)
217 218 return (NULL);
218 219
219 220 MUTEX_LOCK(&instance_list.ril_lock);
220 221 inst = uu_list_find(instance_list.ril_instance_list, &id, NULL, NULL);
221 222 if (inst != NULL)
222 223 MUTEX_LOCK(&inst->ri_queue_lock);
223 224 MUTEX_UNLOCK(&instance_list.ril_lock);
224 225
225 226 return (inst);
226 227 }
227 228
228 229 const char *
229 230 service_style(int flags)
230 231 {
231 232 switch (flags & RINST_STYLE_MASK) {
232 233 case RINST_CONTRACT: return ("contract");
233 234 case RINST_TRANSIENT: return ("transient");
234 235 case RINST_WAIT: return ("wait");
235 236
236 237 default:
237 238 #ifndef NDEBUG
238 239 uu_warn("%s:%d: Bad flags 0x%x.\n", __FILE__, __LINE__, flags);
239 240 #endif
240 241 abort();
241 242 /* NOTREACHED */
242 243 }
243 244 }
244 245
/*
 * Verify that the primary or transient contract recorded for an instance
 * still exists; if it has vanished, remove the stale contract id from the
 * repository and zero the in-core copy.
 *
 * Fails with ECONNABORTED or ECANCELED.
 */
static int
check_contract(restarter_inst_t *inst, boolean_t primary,
    scf_instance_t *scf_inst)
{
	ctid_t *ctidp;
	int fd, r;

	ctidp = primary ? &inst->ri_i.i_primary_ctid :
	    &inst->ri_i.i_transient_ctid;

	assert(*ctidp >= 1);

	/* If the contract can still be opened, it is alive: nothing to do. */
	fd = contract_open(*ctidp, NULL, "status", O_RDONLY);
	if (fd >= 0) {
		r = close(fd);
		assert(r == 0);
		return (0);
	}

	/* The contract is gone; drop the stale id from the repository. */
	r = restarter_remove_contract(scf_inst, *ctidp, primary ?
	    RESTARTER_CONTRACT_PRIMARY : RESTARTER_CONTRACT_TRANSIENT);
	switch (r) {
	case 0:
	case ECONNABORTED:
	case ECANCELED:
		*ctidp = 0;
		return (r);

	case ENOMEM:
		uu_die("Out of memory\n");
		/* NOTREACHED */

	case EPERM:
		uu_die("Insufficient privilege.\n");
		/* NOTREACHED */

	case EACCES:
		uu_die("Repository backend access denied.\n");
		/* NOTREACHED */

	case EROFS:
		/* Read-only repository: tolerate, the id stays stale. */
		log_error(LOG_INFO, "Could not remove unusable contract id %ld "
		    "for %s from repository.\n", *ctidp, inst->ri_i.i_fmri);
		return (0);

	case EINVAL:
	case EBADF:
	default:
		assert(0);
		abort();
		/* NOTREACHED */
	}
}
301 302
302 303 static int stop_instance(scf_handle_t *, restarter_inst_t *, stop_cause_t);
303 304
/*
 * int restarter_insert_inst(scf_handle_t *, char *)
 *   If the inst is already in the restarter list, return its id.  If the inst
 *   is not in the restarter list, initialize a restarter_inst_t, initialize
 *   its states, insert it into the list, and return 0.
 *
 *   On connection loss the repository reads restart from rep_retry; if the
 *   instance disappears from the repository mid-setup, all partially
 *   constructed state is torn down at the "deleted" label.
 *
 *   Fails with
 *     ENOENT - name is not in the repository
 */
static int
restarter_insert_inst(scf_handle_t *h, const char *name)
{
	int id, r;
	restarter_inst_t *inst;
	uu_list_index_t idx;
	scf_service_t *scf_svc;
	scf_instance_t *scf_inst;
	scf_snapshot_t *snap = NULL;
	scf_propertygroup_t *pg;
	char *svc_name, *inst_name;
	char logfilebuf[PATH_MAX];
	char *c;
	boolean_t do_commit_states;
	restarter_instance_state_t state, next_state;
	protocol_states_t *ps;
	pid_t start_pid;
	restarter_str_t reason = restarter_str_insert_in_graph;

	MUTEX_LOCK(&instance_list.ril_lock);

	/*
	 * We don't use inst_lookup_by_name() here because we want the lookup
	 * & insert to be atomic.
	 */
	id = dict_lookup_byname(name);
	if (id != -1) {
		inst = uu_list_find(instance_list.ril_instance_list, &id, NULL,
		    &idx);
		if (inst != NULL) {
			/* Already present; nothing to do. */
			MUTEX_UNLOCK(&instance_list.ril_lock);
			return (0);
		}
	}

	/* Allocate an instance */
	inst = startd_zalloc(sizeof (restarter_inst_t));
	inst->ri_utmpx_prefix = startd_alloc(max_scf_value_size);
	inst->ri_utmpx_prefix[0] = '\0';

	inst->ri_i.i_fmri = startd_alloc(strlen(name) + 1);
	(void) strcpy((char *)inst->ri_i.i_fmri, name);

	inst->ri_queue = startd_list_create(restarter_queue_pool, inst, 0);

	/*
	 * id shouldn't be -1 since we use the same dictionary as graph.c, but
	 * just in case.
	 */
	inst->ri_id = (id != -1 ? id : dict_insert(name));

	special_online_hooks_get(name, &inst->ri_pre_online_hook,
	    &inst->ri_post_online_hook, &inst->ri_post_offline_hook);

	scf_svc = safe_scf_service_create(h);
	scf_inst = safe_scf_instance_create(h);
	pg = safe_scf_pg_create(h);
	svc_name = startd_alloc(max_scf_name_size);
	inst_name = startd_alloc(max_scf_name_size);

rep_retry:
	/* Discard any state left over from a previous attempt. */
	if (snap != NULL)
		scf_snapshot_destroy(snap);
	if (inst->ri_logstem != NULL)
		startd_free(inst->ri_logstem, PATH_MAX);
	if (inst->ri_common_name != NULL)
		free(inst->ri_common_name);
	if (inst->ri_C_common_name != NULL)
		free(inst->ri_C_common_name);
	snap = NULL;
	inst->ri_logstem = NULL;
	inst->ri_common_name = NULL;
	inst->ri_C_common_name = NULL;

	if (scf_handle_decode_fmri(h, name, NULL, scf_svc, scf_inst, NULL,
	    NULL, SCF_DECODE_FMRI_EXACT) != 0) {
		switch (scf_error()) {
		case SCF_ERROR_CONNECTION_BROKEN:
			libscf_handle_rebind(h);
			goto rep_retry;

		case SCF_ERROR_NOT_FOUND:
			goto deleted;
		}

		uu_die("Can't decode FMRI %s: %s\n", name,
		    scf_strerror(scf_error()));
	}

	/*
	 * If there's no running snapshot, then we execute using the editing
	 * snapshot.  Pending snapshots will be taken later.
	 */
	snap = libscf_get_running_snapshot(scf_inst);

	if ((scf_service_get_name(scf_svc, svc_name, max_scf_name_size) < 0) ||
	    (scf_instance_get_name(scf_inst, inst_name, max_scf_name_size) <
	    0)) {
		switch (scf_error()) {
		case SCF_ERROR_NOT_SET:
			break;

		case SCF_ERROR_CONNECTION_BROKEN:
			libscf_handle_rebind(h);
			goto rep_retry;

		default:
			assert(0);
			abort();
		}

		goto deleted;
	}

	/* Build the log file stem from "svc:inst", with '/' mapped to '-'. */
	(void) snprintf(logfilebuf, PATH_MAX, "%s:%s", svc_name, inst_name);
	for (c = logfilebuf; *c != '\0'; c++)
		if (*c == '/')
			*c = '-';

	inst->ri_logstem = startd_alloc(PATH_MAX);
	(void) snprintf(inst->ri_logstem, PATH_MAX, "%s%s", logfilebuf,
	    LOG_SUFFIX);

	/*
	 * If the restarter group is missing, use uninit/none.  Otherwise,
	 * we're probably being restarted & don't want to mess up the states
	 * that are there.
	 */
	state = RESTARTER_STATE_UNINIT;
	next_state = RESTARTER_STATE_NONE;

	r = scf_instance_get_pg(scf_inst, SCF_PG_RESTARTER, pg);
	if (r != 0) {
		switch (scf_error()) {
		case SCF_ERROR_CONNECTION_BROKEN:
			libscf_handle_rebind(h);
			goto rep_retry;

		case SCF_ERROR_NOT_SET:
			goto deleted;

		case SCF_ERROR_NOT_FOUND:
			/*
			 * This shouldn't happen since the graph engine should
			 * have initialized the state to uninitialized/none if
			 * there was no restarter pg.  In case somebody
			 * deleted it, though....
			 */
			do_commit_states = B_TRUE;
			break;

		default:
			assert(0);
			abort();
		}
	} else {
		r = libscf_read_states(pg, &state, &next_state);
		if (r != 0) {
			do_commit_states = B_TRUE;
		} else {
			if (next_state != RESTARTER_STATE_NONE) {
				/*
				 * Force next_state to _NONE since we
				 * don't look for method processes.
				 */
				next_state = RESTARTER_STATE_NONE;
				do_commit_states = B_TRUE;
			} else {
				/*
				 * The reason for transition will depend on
				 * state.
				 */
				if (st->st_initial == 0)
					reason = restarter_str_startd_restart;
				else if (state == RESTARTER_STATE_MAINT)
					reason = restarter_str_bad_repo_state;
				/*
				 * Inform the restarter of our state without
				 * changing the STIME in the repository.
				 */
				ps = startd_alloc(sizeof (*ps));
				inst->ri_i.i_state = ps->ps_state = state;
				inst->ri_i.i_next_state = ps->ps_state_next =
				    next_state;
				ps->ps_reason = reason;

				graph_protocol_send_event(inst->ri_i.i_fmri,
				    GRAPH_UPDATE_STATE_CHANGE, ps);

				do_commit_states = B_FALSE;
			}
		}
	}

	switch (libscf_get_startd_properties(scf_inst, snap, &inst->ri_flags,
	    &inst->ri_utmpx_prefix)) {
	case 0:
		break;

	case ECONNABORTED:
		libscf_handle_rebind(h);
		goto rep_retry;

	case ECANCELED:
		goto deleted;

	case ENOENT:
		/*
		 * This is odd, because the graph engine should have required
		 * the general property group.  So we'll just use default
		 * flags in anticipation of the graph engine sending us
		 * REMOVE_INSTANCE when it finds out that the general property
		 * group has been deleted.
		 */
		inst->ri_flags = RINST_CONTRACT;
		break;

	default:
		assert(0);
		abort();
	}

	r = libscf_get_template_values(scf_inst, snap,
	    &inst->ri_common_name, &inst->ri_C_common_name);

	/*
	 * Copy our names to smaller buffers to reduce our memory footprint.
	 */
	if (inst->ri_common_name != NULL) {
		char *tmp = safe_strdup(inst->ri_common_name);
		startd_free(inst->ri_common_name, max_scf_value_size);
		inst->ri_common_name = tmp;
	}

	if (inst->ri_C_common_name != NULL) {
		char *tmp = safe_strdup(inst->ri_C_common_name);
		startd_free(inst->ri_C_common_name, max_scf_value_size);
		inst->ri_C_common_name = tmp;
	}

	switch (r) {
	case 0:
		break;

	case ECONNABORTED:
		libscf_handle_rebind(h);
		goto rep_retry;

	case ECANCELED:
		goto deleted;

	case ECHILD:
	case ENOENT:
		break;

	default:
		assert(0);
		abort();
	}

	/* Recover contract ids and the wait-style start pid, if any. */
	switch (libscf_read_method_ids(h, scf_inst, inst->ri_i.i_fmri,
	    &inst->ri_i.i_primary_ctid, &inst->ri_i.i_transient_ctid,
	    &start_pid)) {
	case 0:
		break;

	case ECONNABORTED:
		libscf_handle_rebind(h);
		goto rep_retry;

	case ECANCELED:
		goto deleted;

	default:
		assert(0);
		abort();
	}

	if (inst->ri_i.i_primary_ctid >= 1) {
		contract_hash_store(inst->ri_i.i_primary_ctid, inst->ri_id);

		switch (check_contract(inst, B_TRUE, scf_inst)) {
		case 0:
			break;

		case ECONNABORTED:
			libscf_handle_rebind(h);
			goto rep_retry;

		case ECANCELED:
			goto deleted;

		default:
			assert(0);
			abort();
		}
	}

	if (inst->ri_i.i_transient_ctid >= 1) {
		switch (check_contract(inst, B_FALSE, scf_inst)) {
		case 0:
			break;

		case ECONNABORTED:
			libscf_handle_rebind(h);
			goto rep_retry;

		case ECANCELED:
			goto deleted;

		default:
			assert(0);
			abort();
		}
	}

	/* No more failures we live through, so add it to the list. */
	(void) pthread_mutex_init(&inst->ri_lock, &mutex_attrs);
	(void) pthread_mutex_init(&inst->ri_queue_lock, &mutex_attrs);
	MUTEX_LOCK(&inst->ri_lock);
	MUTEX_LOCK(&inst->ri_queue_lock);

	(void) pthread_cond_init(&inst->ri_method_cv, NULL);

	uu_list_node_init(inst, &inst->ri_link, restarter_instance_pool);
	uu_list_insert(instance_list.ril_instance_list, inst, idx);
	MUTEX_UNLOCK(&instance_list.ril_lock);

	if (start_pid != -1 &&
	    (inst->ri_flags & RINST_STYLE_MASK) == RINST_WAIT) {
		int ret;
		ret = wait_register(start_pid, inst->ri_i.i_fmri, 0, 1);
		if (ret == -1) {
			/*
			 * Implication:  if we can't reregister the
			 * instance, we will start another one.  Two
			 * instances may or may not result in a resource
			 * conflict.
			 */
			log_error(LOG_WARNING,
			    "%s: couldn't reregister %ld for wait\n",
			    inst->ri_i.i_fmri, start_pid);
		} else if (ret == 1) {
			/*
			 * Leading PID has exited.
			 */
			(void) stop_instance(h, inst, RSTOP_EXIT);
		}
	}


	scf_pg_destroy(pg);

	if (do_commit_states)
		(void) restarter_instance_update_states(h, inst, state,
		    next_state, RERR_NONE, reason);

	log_framework(LOG_DEBUG, "%s is a %s-style service\n", name,
	    service_style(inst->ri_flags));

	MUTEX_UNLOCK(&inst->ri_queue_lock);
	MUTEX_UNLOCK(&inst->ri_lock);

	startd_free(svc_name, max_scf_name_size);
	startd_free(inst_name, max_scf_name_size);
	scf_snapshot_destroy(snap);
	scf_instance_destroy(scf_inst);
	scf_service_destroy(scf_svc);

	log_framework(LOG_DEBUG, "%s: inserted instance into restarter list\n",
	    name);

	return (0);

deleted:
	/* The instance vanished from the repository: undo everything. */
	MUTEX_UNLOCK(&instance_list.ril_lock);
	startd_free(inst_name, max_scf_name_size);
	startd_free(svc_name, max_scf_name_size);
	if (snap != NULL)
		scf_snapshot_destroy(snap);
	scf_pg_destroy(pg);
	scf_instance_destroy(scf_inst);
	scf_service_destroy(scf_svc);
	startd_free((void *)inst->ri_i.i_fmri, strlen(inst->ri_i.i_fmri) + 1);
	uu_list_destroy(inst->ri_queue);
	if (inst->ri_logstem != NULL)
		startd_free(inst->ri_logstem, PATH_MAX);
	if (inst->ri_common_name != NULL)
		free(inst->ri_common_name);
	if (inst->ri_C_common_name != NULL)
		free(inst->ri_C_common_name);
	startd_free(inst->ri_utmpx_prefix, max_scf_value_size);
	startd_free(inst, sizeof (restarter_inst_t));
	return (ENOENT);
}
708 709
/*
 * Remove an instance from the restarter instance list and free all of its
 * resources.  Entered with ri_lock held; the lock must be dropped and the
 * list lock taken to remove the node, then the instance is re-locked for
 * teardown.  Waits for any outstanding method thread before freeing.
 */
static void
restarter_delete_inst(restarter_inst_t *ri)
{
	int id;
	restarter_inst_t *rip;
	void *cookie = NULL;
	restarter_instance_qentry_t *e;

	assert(MUTEX_HELD(&ri->ri_lock));

	/*
	 * Must drop the instance lock so we can pick up the instance_list
	 * lock & remove the instance.
	 */
	id = ri->ri_id;
	MUTEX_UNLOCK(&ri->ri_lock);

	MUTEX_LOCK(&instance_list.ril_lock);

	rip = uu_list_find(instance_list.ril_instance_list, &id, NULL, NULL);
	if (rip == NULL) {
		/* Someone else already removed it; nothing to do. */
		MUTEX_UNLOCK(&instance_list.ril_lock);
		return;
	}

	assert(ri == rip);

	uu_list_remove(instance_list.ril_instance_list, ri);

	log_framework(LOG_DEBUG, "%s: deleted instance from restarter list\n",
	    ri->ri_i.i_fmri);

	MUTEX_UNLOCK(&instance_list.ril_lock);

	/*
	 * We can lock the instance without holding the instance_list lock
	 * since we removed the instance from the list.
	 */
	MUTEX_LOCK(&ri->ri_lock);
	MUTEX_LOCK(&ri->ri_queue_lock);

	if (ri->ri_i.i_primary_ctid >= 1)
		contract_hash_remove(ri->ri_i.i_primary_ctid);

	/* Wait out any running method thread and its waiters. */
	while (ri->ri_method_thread != 0 || ri->ri_method_waiters > 0)
		(void) pthread_cond_wait(&ri->ri_method_cv, &ri->ri_lock);

	/* Drain and free the pending event queue. */
	while ((e = uu_list_teardown(ri->ri_queue, &cookie)) != NULL)
		startd_free(e, sizeof (*e));
	uu_list_destroy(ri->ri_queue);

	startd_free((void *)ri->ri_i.i_fmri, strlen(ri->ri_i.i_fmri) + 1);
	startd_free(ri->ri_logstem, PATH_MAX);
	if (ri->ri_common_name != NULL)
		free(ri->ri_common_name);
	if (ri->ri_C_common_name != NULL)
		free(ri->ri_C_common_name);
	startd_free(ri->ri_utmpx_prefix, max_scf_value_size);
	(void) pthread_mutex_destroy(&ri->ri_lock);
	(void) pthread_mutex_destroy(&ri->ri_queue_lock);
	startd_free(ri, sizeof (restarter_inst_t));
}
771 772
772 773 /*
773 774 * instance_is_wait_style()
774 775 *
775 776 * Returns 1 if the given instance is a "wait-style" service instance.
776 777 */
777 778 int
778 779 instance_is_wait_style(restarter_inst_t *inst)
779 780 {
780 781 assert(MUTEX_HELD(&inst->ri_lock));
781 782 return ((inst->ri_flags & RINST_STYLE_MASK) == RINST_WAIT);
782 783 }
783 784
784 785 /*
785 786 * instance_is_transient_style()
786 787 *
787 788 * Returns 1 if the given instance is a transient service instance.
788 789 */
789 790 int
790 791 instance_is_transient_style(restarter_inst_t *inst)
791 792 {
792 793 assert(MUTEX_HELD(&inst->ri_lock));
793 794 return ((inst->ri_flags & RINST_STYLE_MASK) == RINST_TRANSIENT);
794 795 }
795 796
796 797 /*
797 798 * instance_in_transition()
798 799 * Returns 1 if instance is in transition, 0 if not
799 800 */
800 801 int
801 802 instance_in_transition(restarter_inst_t *inst)
802 803 {
803 804 assert(MUTEX_HELD(&inst->ri_lock));
804 805 if (inst->ri_i.i_next_state == RESTARTER_STATE_NONE)
805 806 return (0);
806 807 return (1);
807 808 }
808 809
809 810 /*
810 811 * returns 1 if instance is already started, 0 if not
811 812 */
812 813 static int
813 814 instance_started(restarter_inst_t *inst)
814 815 {
815 816 int ret;
816 817
817 818 assert(MUTEX_HELD(&inst->ri_lock));
818 819
819 820 if (inst->ri_i.i_state == RESTARTER_STATE_ONLINE ||
820 821 inst->ri_i.i_state == RESTARTER_STATE_DEGRADED)
821 822 ret = 1;
822 823 else
823 824 ret = 0;
824 825
825 826 return (ret);
826 827 }
827 828
/*
 * Commit a state transition for an instance to the repository and notify
 * the graph engine.  Also fires the instance's post-offline/post-online
 * hooks when the transition crosses the started (online/degraded)
 * boundary.  Caller must hold ri_lock.
 *
 * Returns
 *   0 - success
 *   ECONNRESET - success, but h was rebound
 */
int
restarter_instance_update_states(scf_handle_t *h, restarter_inst_t *ri,
    restarter_instance_state_t new_state,
    restarter_instance_state_t new_state_next, restarter_error_t err,
    restarter_str_t reason)
{
	protocol_states_t *states;
	int e;
	uint_t retry_count = 0, msecs = ALLOC_DELAY;
	boolean_t rebound = B_FALSE;
	int prev_state_online;
	int state_online;

	assert(MUTEX_HELD(&ri->ri_lock));

	/* Remember whether we were started before the transition. */
	prev_state_online = instance_started(ri);

retry:
	e = _restarter_commit_states(h, &ri->ri_i, new_state, new_state_next,
	    restarter_get_str_short(reason));
	switch (e) {
	case 0:
		break;

	case ENOMEM:
		/* Retry with exponential backoff before dying. */
		++retry_count;
		if (retry_count < ALLOC_RETRY) {
			(void) poll(NULL, 0, msecs);
			msecs *= ALLOC_DELAY_MULT;
			goto retry;
		}

		/* Like startd_alloc(). */
		uu_die("Insufficient memory.\n");
		/* NOTREACHED */

	case ECONNABORTED:
		libscf_handle_rebind(h);
		rebound = B_TRUE;
		goto retry;

	case EPERM:
	case EACCES:
	case EROFS:
		log_error(LOG_NOTICE, "Could not commit state change for %s "
		    "to repository: %s.\n", ri->ri_i.i_fmri, strerror(e));
		/* FALLTHROUGH */

	case ENOENT:
		/* Track the new state in-core even if the commit failed. */
		ri->ri_i.i_state = new_state;
		ri->ri_i.i_next_state = new_state_next;
		break;

	case EINVAL:
	default:
		bad_error("_restarter_commit_states", e);
	}

	/* Tell the graph engine about the state change. */
	states = startd_alloc(sizeof (protocol_states_t));
	states->ps_state = new_state;
	states->ps_state_next = new_state_next;
	states->ps_err = err;
	states->ps_reason = reason;
	graph_protocol_send_event(ri->ri_i.i_fmri, GRAPH_UPDATE_STATE_CHANGE,
	    (void *)states);

	state_online = instance_started(ri);

	/* Fire hooks when crossing the started boundary. */
	if (prev_state_online && !state_online)
		ri->ri_post_offline_hook();
	else if (!prev_state_online && state_online)
		ri->ri_post_online_hook();

	return (rebound ? ECONNRESET : 0);
}
908 909
909 910 void
910 911 restarter_mark_pending_snapshot(const char *fmri, uint_t flag)
911 912 {
912 913 restarter_inst_t *inst;
913 914
914 915 assert(flag == RINST_RETAKE_RUNNING || flag == RINST_RETAKE_START);
915 916
916 917 inst = inst_lookup_by_name(fmri);
917 918 if (inst == NULL)
918 919 return;
919 920
920 921 inst->ri_flags |= flag;
921 922
922 923 MUTEX_UNLOCK(&inst->ri_lock);
923 924 }
924 925
/*
 * Walk the instance list and retake any snapshots that were deferred via
 * restarter_mark_pending_snapshot().  The RINST_RETAKE_* flags are only
 * cleared on success, so a failed attempt will be retried on a later
 * pass.
 */
static void
restarter_take_pending_snapshots(scf_handle_t *h)
{
	restarter_inst_t *inst;
	int r;

	MUTEX_LOCK(&instance_list.ril_lock);

	for (inst = uu_list_first(instance_list.ril_instance_list);
	    inst != NULL;
	    inst = uu_list_next(instance_list.ril_instance_list, inst)) {
		const char *fmri;
		scf_instance_t *sinst = NULL;

		MUTEX_LOCK(&inst->ri_lock);

		/*
		 * This is where we'd check inst->ri_method_thread and if it
		 * were nonzero we'd wait in anticipation of another thread
		 * executing a method for inst.  Doing so with the instance_list
		 * locked, though, leads to deadlock.  Since taking a snapshot
		 * during that window won't hurt anything, we'll just continue.
		 */

		fmri = inst->ri_i.i_fmri;

		if (inst->ri_flags & RINST_RETAKE_RUNNING) {
			scf_snapshot_t *rsnap;

			(void) libscf_fmri_get_instance(h, fmri, &sinst);

			rsnap = libscf_get_or_make_running_snapshot(sinst,
			    fmri, B_FALSE);

			scf_instance_destroy(sinst);

			/* Only clear the flag once the snapshot exists. */
			if (rsnap != NULL)
				inst->ri_flags &= ~RINST_RETAKE_RUNNING;

			scf_snapshot_destroy(rsnap);
		}

		if (inst->ri_flags & RINST_RETAKE_START) {
			switch (r = libscf_snapshots_poststart(h, fmri,
			    B_FALSE)) {
			case 0:
			case ENOENT:
				inst->ri_flags &= ~RINST_RETAKE_START;
				break;

			case ECONNABORTED:
				/* Leave the flag set; retry later. */
				break;

			case EACCES:
			default:
				bad_error("libscf_snapshots_poststart", r);
			}
		}

		MUTEX_UNLOCK(&inst->ri_lock);
	}

	MUTEX_UNLOCK(&instance_list.ril_lock);
}
989 990
990 991 /* ARGSUSED */
991 992 void *
992 993 restarter_post_fsminimal_thread(void *unused)
993 994 {
994 995 scf_handle_t *h;
995 996 int r;
996 997
997 998 h = libscf_handle_create_bound_loop();
998 999
999 1000 for (;;) {
1000 1001 r = libscf_create_self(h);
1001 1002 if (r == 0)
1002 1003 break;
1003 1004
1004 1005 assert(r == ECONNABORTED);
1005 1006 libscf_handle_rebind(h);
1006 1007 }
1007 1008
1008 1009 restarter_take_pending_snapshots(h);
1009 1010
1010 1011 (void) scf_handle_unbind(h);
1011 1012 scf_handle_destroy(h);
1012 1013
1013 1014 return (NULL);
1014 1015 }
1015 1016
/*
 * int stop_instance()
 *
 * Stop the instance identified by the instance given as the second argument,
 * for the cause stated.  The instance's lock must be held and no method
 * thread may be running for it.
 *
 * Returns
 *   0 - success (including the no-op cases: already stopped, or in the
 *       disabled/maintenance state)
 *   -1 - inst is in transition; the caller should requeue the event
 */
static int
stop_instance(scf_handle_t *local_handle, restarter_inst_t *inst,
    stop_cause_t cause)
{
	fork_info_t *info;
	const char *cp;		/* human-readable cause, for log messages */
	int err;
	restarter_error_t re;	/* error class published with the transition */
	restarter_str_t reason;	/* canonical reason code for the transition */
	restarter_instance_state_t new_state;

	assert(MUTEX_HELD(&inst->ri_lock));
	assert(inst->ri_method_thread == 0);

	/* Map the stop cause onto an error class, reason code, and message. */
	switch (cause) {
	case RSTOP_EXIT:
		re = RERR_RESTART;
		reason = restarter_str_ct_ev_exit;
		cp = "all processes in service exited";
		break;
	case RSTOP_ERR_CFG:
		re = RERR_FAULT;
		reason = restarter_str_method_failed;
		cp = "service exited with a configuration error";
		break;
	case RSTOP_ERR_EXIT:
		re = RERR_RESTART;
		reason = restarter_str_ct_ev_exit;
		cp = "service exited with an error";
		break;
	case RSTOP_CORE:
		re = RERR_FAULT;
		reason = restarter_str_ct_ev_core;
		cp = "process dumped core";
		break;
	case RSTOP_SIGNAL:
		re = RERR_FAULT;
		reason = restarter_str_ct_ev_signal;
		cp = "process received fatal signal from outside the service";
		break;
	case RSTOP_HWERR:
		re = RERR_FAULT;
		reason = restarter_str_ct_ev_hwerr;
		cp = "process killed due to uncorrectable hardware error";
		break;
	case RSTOP_DEPENDENCY:
		re = RERR_RESTART;
		reason = restarter_str_dependency_activity;
		cp = "dependency activity requires stop";
		break;
	case RSTOP_DISABLE:
		re = RERR_RESTART;
		reason = restarter_str_disable_request;
		cp = "service disabled";
		break;
	case RSTOP_RESTART:
		re = RERR_RESTART;
		reason = restarter_str_restart_request;
		cp = "service restarting";
		break;
	default:
#ifndef NDEBUG
		(void) fprintf(stderr, "Unknown cause %d at %s:%d.\n",
		    cause, __FILE__, __LINE__);
#endif
		abort();
	}

	/* Services in the disabled and maintenance state are ignored */
	if (inst->ri_i.i_state == RESTARTER_STATE_MAINT ||
	    inst->ri_i.i_state == RESTARTER_STATE_DISABLED) {
		log_framework(LOG_DEBUG,
		    "%s: stop_instance -> is maint/disabled\n",
		    inst->ri_i.i_fmri);
		return (0);
	}

	/* Already stopped instances are left alone */
	if (instance_started(inst) == 0) {
		log_framework(LOG_DEBUG, "Restarter: %s is already stopped.\n",
		    inst->ri_i.i_fmri);
		return (0);
	}

	if (instance_in_transition(inst)) {
		/* requeue event by returning -1 */
		log_framework(LOG_DEBUG,
		    "Restarter: Not stopping %s, in transition.\n",
		    inst->ri_i.i_fmri);
		return (-1);
	}

	log_instance(inst, B_TRUE, "Stopping because %s.", cp);

	/* Faults are more noteworthy than routine stops; log them louder. */
	log_framework(re == RERR_FAULT ? LOG_INFO : LOG_DEBUG,
	    "%s: Instance stopping because %s.\n", inst->ri_i.i_fmri, cp);

	if (instance_is_wait_style(inst) &&
	    (cause == RSTOP_EXIT ||
	    cause == RSTOP_ERR_CFG ||
	    cause == RSTOP_ERR_EXIT)) {
		/*
		 * No need to stop instance, as child has exited; remove
		 * contract and move the instance to the offline state.
		 */
		switch (err = restarter_instance_update_states(local_handle,
		    inst, inst->ri_i.i_state, RESTARTER_STATE_OFFLINE, re,
		    reason)) {
		case 0:
		case ECONNRESET:
			break;

		default:
			bad_error("restarter_instance_update_states", err);
		}

		if (cause == RSTOP_ERR_EXIT) {
			/*
			 * The RSTOP_ERR_EXIT cause is set via the
			 * wait_thread -> wait_remove code path when we have
			 * a "wait" style svc that exited with an error. If
			 * the svc is failing too quickly, we throttle it so
			 * that we don't restart it more than once/second.
			 * Since we know we're running in the wait thread its
			 * ok to throttle it right here.
			 */
			(void) update_fault_count(inst, FAULT_COUNT_INCR);
			if (method_rate_critical(inst)) {
				log_instance(inst, B_TRUE, "Failing too "
				    "quickly, throttling.");
				(void) sleep(WT_SVC_ERR_THROTTLE);
			}
		} else {
			/* Clean exit: clear the failure bookkeeping. */
			(void) update_fault_count(inst, FAULT_COUNT_RESET);
			reset_start_times(inst);
		}

		/* Tear down the primary contract, if one is still present. */
		if (inst->ri_i.i_primary_ctid != 0) {
			inst->ri_m_inst =
			    safe_scf_instance_create(local_handle);
			inst->ri_mi_deleted = B_FALSE;

			libscf_reget_instance(inst);
			method_remove_contract(inst, B_TRUE, B_TRUE);

			scf_instance_destroy(inst->ri_m_inst);
			inst->ri_m_inst = NULL;
		}

		switch (err = restarter_instance_update_states(local_handle,
		    inst, inst->ri_i.i_next_state, RESTARTER_STATE_NONE, re,
		    reason)) {
		case 0:
		case ECONNRESET:
			break;

		default:
			bad_error("restarter_instance_update_states", err);
		}

		/*
		 * For RSTOP_ERR_CFG we fall through below so the instance
		 * is sent to maintenance; all other causes are done here.
		 */
		if (cause != RSTOP_ERR_CFG)
			return (0);
	} else if (instance_is_wait_style(inst) && re == RERR_RESTART) {
		/*
		 * Stopping a wait service through means other than the pid
		 * exiting should keep wait_thread() from restarting the
		 * service, by removing it from the wait list.
		 * We cannot remove it right now otherwise the process will
		 * end up <defunct> so mark it to be ignored.
		 */
		wait_ignore_by_fmri(inst->ri_i.i_fmri);
	}

	/*
	 * There are some configuration errors which we cannot detect until we
	 * try to run the method. For example, see exec_method() where the
	 * restarter_set_method_context() call can return SMF_EXIT_ERR_CONFIG
	 * in several cases. If this happens for a "wait-style" svc,
	 * wait_remove() sets the cause as RSTOP_ERR_CFG so that we can detect
	 * the configuration error and go into maintenance, even though it is
	 * a "wait-style" svc.
	 */
	if (cause == RSTOP_ERR_CFG)
		new_state = RESTARTER_STATE_MAINT;
	else
		new_state = inst->ri_i.i_enabled ?
		    RESTARTER_STATE_OFFLINE : RESTARTER_STATE_DISABLED;

	switch (err = restarter_instance_update_states(local_handle, inst,
	    inst->ri_i.i_state, new_state, RERR_NONE, reason)) {
	case 0:
	case ECONNRESET:
		break;

	default:
		bad_error("restarter_instance_update_states", err);
	}

	/* Run the stop method asynchronously in a method thread. */
	info = startd_zalloc(sizeof (fork_info_t));

	info->sf_id = inst->ri_id;
	info->sf_method_type = METHOD_STOP;
	info->sf_event_type = re;
	info->sf_reason = reason;
	inst->ri_method_thread = startd_thread_create(method_thread, info);

	return (0);
}
1234 1235
1235 1236 /*
1236 1237 * Returns
1237 1238 * ENOENT - fmri is not in instance_list
1238 1239 * 0 - success
1239 1240 * ECONNRESET - success, though handle was rebound
1240 1241 * -1 - instance is in transition
1241 1242 */
1242 1243 int
1243 1244 stop_instance_fmri(scf_handle_t *h, const char *fmri, uint_t flags)
1244 1245 {
1245 1246 restarter_inst_t *rip;
1246 1247 int r;
1247 1248
1248 1249 rip = inst_lookup_by_name(fmri);
1249 1250 if (rip == NULL)
1250 1251 return (ENOENT);
1251 1252
1252 1253 r = stop_instance(h, rip, flags);
1253 1254
1254 1255 MUTEX_UNLOCK(&rip->ri_lock);
1255 1256
1256 1257 return (r);
1257 1258 }
1258 1259
1259 1260 static void
1260 1261 unmaintain_instance(scf_handle_t *h, restarter_inst_t *rip,
1261 1262 unmaint_cause_t cause)
1262 1263 {
1263 1264 ctid_t ctid;
1264 1265 scf_instance_t *inst;
1265 1266 int r;
1266 1267 uint_t tries = 0, msecs = ALLOC_DELAY;
1267 1268 const char *cp;
1268 1269 restarter_str_t reason;
1269 1270
1270 1271 assert(MUTEX_HELD(&rip->ri_lock));
1271 1272
1272 1273 if (rip->ri_i.i_state != RESTARTER_STATE_MAINT) {
1273 1274 log_error(LOG_DEBUG, "Restarter: "
1274 1275 "Ignoring maintenance off command because %s is not in the "
1275 1276 "maintenance state.\n", rip->ri_i.i_fmri);
1276 1277 return;
1277 1278 }
1278 1279
1279 1280 switch (cause) {
1280 1281 case RUNMAINT_CLEAR:
1281 1282 cp = "clear requested";
1282 1283 reason = restarter_str_clear_request;
1283 1284 break;
1284 1285 case RUNMAINT_DISABLE:
1285 1286 cp = "disable requested";
1286 1287 reason = restarter_str_disable_request;
1287 1288 break;
1288 1289 default:
1289 1290 #ifndef NDEBUG
1290 1291 (void) fprintf(stderr, "Uncaught case for %d at %s:%d.\n",
1291 1292 cause, __FILE__, __LINE__);
1292 1293 #endif
1293 1294 abort();
1294 1295 }
1295 1296
1296 1297 log_instance(rip, B_TRUE, "Leaving maintenance because %s.",
1297 1298 cp);
1298 1299 log_framework(LOG_DEBUG, "%s: Instance leaving maintenance because "
1299 1300 "%s.\n", rip->ri_i.i_fmri, cp);
1300 1301
1301 1302 (void) restarter_instance_update_states(h, rip, RESTARTER_STATE_UNINIT,
1302 1303 RESTARTER_STATE_NONE, RERR_RESTART, reason);
1303 1304
1304 1305 /*
1305 1306 * If we did ADMIN_MAINT_ON_IMMEDIATE, then there might still be
1306 1307 * a primary contract.
1307 1308 */
1308 1309 if (rip->ri_i.i_primary_ctid == 0)
1309 1310 return;
1310 1311
1311 1312 ctid = rip->ri_i.i_primary_ctid;
1312 1313 contract_abandon(ctid);
1313 1314 rip->ri_i.i_primary_ctid = 0;
1314 1315
1315 1316 rep_retry:
1316 1317 switch (r = libscf_fmri_get_instance(h, rip->ri_i.i_fmri, &inst)) {
1317 1318 case 0:
1318 1319 break;
1319 1320
1320 1321 case ECONNABORTED:
1321 1322 libscf_handle_rebind(h);
1322 1323 goto rep_retry;
1323 1324
1324 1325 case ENOENT:
1325 1326 /* Must have been deleted. */
1326 1327 return;
1327 1328
1328 1329 case EINVAL:
1329 1330 case ENOTSUP:
1330 1331 default:
1331 1332 bad_error("libscf_handle_rebind", r);
1332 1333 }
1333 1334
1334 1335 again:
1335 1336 r = restarter_remove_contract(inst, ctid, RESTARTER_CONTRACT_PRIMARY);
1336 1337 switch (r) {
1337 1338 case 0:
1338 1339 break;
1339 1340
1340 1341 case ENOMEM:
1341 1342 ++tries;
1342 1343 if (tries < ALLOC_RETRY) {
1343 1344 (void) poll(NULL, 0, msecs);
1344 1345 msecs *= ALLOC_DELAY_MULT;
1345 1346 goto again;
1346 1347 }
1347 1348
1348 1349 uu_die("Insufficient memory.\n");
1349 1350 /* NOTREACHED */
1350 1351
1351 1352 case ECONNABORTED:
1352 1353 scf_instance_destroy(inst);
1353 1354 libscf_handle_rebind(h);
1354 1355 goto rep_retry;
1355 1356
1356 1357 case ECANCELED:
1357 1358 break;
1358 1359
1359 1360 case EPERM:
1360 1361 case EACCES:
1361 1362 case EROFS:
1362 1363 log_error(LOG_INFO,
1363 1364 "Could not remove contract id %lu for %s (%s).\n", ctid,
1364 1365 rip->ri_i.i_fmri, strerror(r));
1365 1366 break;
1366 1367
1367 1368 case EINVAL:
1368 1369 case EBADF:
1369 1370 default:
1370 1371 bad_error("restarter_remove_contract", r);
1371 1372 }
1372 1373
1373 1374 scf_instance_destroy(inst);
1374 1375 }
1375 1376
/*
 * enable_inst()
 *   Set inst->ri_i.i_enabled.  Expects 'e' to be _ENABLE, _DISABLE, or
 *   _ADMIN_DISABLE.  If the event is _ENABLE and inst is uninitialized or
 *   disabled, move it to offline.  If the event is _DISABLE or
 *   _ADMIN_DISABLE, make sure inst will move to disabled.
 *
 *   Caller must hold inst->ri_lock, and inst must not be in transition.
 *
 * Returns
 *   0 - success
 *   ECONNRESET - h was rebound
 */
static int
enable_inst(scf_handle_t *h, restarter_inst_t *inst,
    restarter_instance_qentry_t *riq)
{
	restarter_instance_state_t state;
	restarter_event_type_t e = riq->riq_type;
	restarter_str_t reason = restarter_str_per_configuration;
	int r;

	assert(MUTEX_HELD(&inst->ri_lock));
	assert(e == RESTARTER_EVENT_TYPE_ADMIN_DISABLE ||
	    e == RESTARTER_EVENT_TYPE_DISABLE ||
	    e == RESTARTER_EVENT_TYPE_ENABLE);
	assert(instance_in_transition(inst) == 0);

	state = inst->ri_i.i_state;

	if (e == RESTARTER_EVENT_TYPE_ENABLE) {
		inst->ri_i.i_enabled = 1;

		if (state == RESTARTER_STATE_UNINIT ||
		    state == RESTARTER_STATE_DISABLED) {
			/*
			 * B_FALSE: Don't log an error if the log_instance()
			 * fails because it will fail on the miniroot before
			 * install-discovery runs.
			 */
			log_instance(inst, B_FALSE, "Enabled.");
			log_framework(LOG_DEBUG, "%s: Instance enabled.\n",
			    inst->ri_i.i_fmri);

			/*
			 * If we are coming from DISABLED, it was obviously an
			 * enable request. If we are coming from UNINIT, it may
			 * have been a service in MAINT that was cleared.
			 */
			if (riq->riq_reason == restarter_str_clear_request)
				reason = restarter_str_clear_request;
			else if (state == RESTARTER_STATE_DISABLED)
				reason = restarter_str_enable_request;
			(void) restarter_instance_update_states(h, inst,
			    RESTARTER_STATE_OFFLINE, RESTARTER_STATE_NONE,
			    RERR_NONE, reason);
		} else {
			/* Already running (or offline); nothing to do. */
			log_framework(LOG_DEBUG, "Restarter: "
			    "Not changing state of %s for enable command.\n",
			    inst->ri_i.i_fmri);
		}
	} else {
		/* _DISABLE or _ADMIN_DISABLE */
		inst->ri_i.i_enabled = 0;

		switch (state) {
		case RESTARTER_STATE_ONLINE:
		case RESTARTER_STATE_DEGRADED:
			/* Running: stop it; ECONNRESET counts as success. */
			r = stop_instance(h, inst, RSTOP_DISABLE);
			return (r == ECONNRESET ? 0 : r);

		case RESTARTER_STATE_OFFLINE:
		case RESTARTER_STATE_UNINIT:
			/* Not running, but a contract may linger; remove it. */
			if (inst->ri_i.i_primary_ctid != 0) {
				inst->ri_m_inst = safe_scf_instance_create(h);
				inst->ri_mi_deleted = B_FALSE;

				libscf_reget_instance(inst);
				method_remove_contract(inst, B_TRUE, B_TRUE);

				scf_instance_destroy(inst->ri_m_inst);
			}
			/* B_FALSE: See log_instance(..., "Enabled."); above */
			log_instance(inst, B_FALSE, "Disabled.");
			log_framework(LOG_DEBUG, "%s: Instance disabled.\n",
			    inst->ri_i.i_fmri);

			/*
			 * If we are coming from OFFLINE, it was obviously a
			 * disable request. But if we are coming from
			 * UNINIT, it may have been a disable request for a
			 * service in MAINT.
			 */
			if (riq->riq_reason == restarter_str_disable_request ||
			    state == RESTARTER_STATE_OFFLINE)
				reason = restarter_str_disable_request;
			(void) restarter_instance_update_states(h, inst,
			    RESTARTER_STATE_DISABLED, RESTARTER_STATE_NONE,
			    RERR_RESTART, reason);
			return (0);

		case RESTARTER_STATE_DISABLED:
			/* Already disabled; nothing to do. */
			break;

		case RESTARTER_STATE_MAINT:
			/*
			 * We only want to pull the instance out of maintenance
			 * if the disable is on administrative request. The
			 * graph engine sends _DISABLE events whenever a
			 * service isn't in the disabled state, and we don't
			 * want to pull the service out of maintenance if,
			 * for example, it is there due to a dependency cycle.
			 */
			if (e == RESTARTER_EVENT_TYPE_ADMIN_DISABLE)
				unmaintain_instance(h, inst, RUNMAINT_DISABLE);
			break;

		default:
#ifndef NDEBUG
			(void) fprintf(stderr, "Restarter instance %s has "
			    "unknown state %d.\n", inst->ri_i.i_fmri, state);
#endif
			abort();
		}
	}

	return (0);
}
1501 1502
1502 1503 static void
1503 1504 start_instance(scf_handle_t *local_handle, restarter_inst_t *inst,
1504 1505 int32_t reason)
1505 1506 {
1506 1507 fork_info_t *info;
1507 1508 restarter_str_t new_reason;
1508 1509
1509 1510 assert(MUTEX_HELD(&inst->ri_lock));
1510 1511 assert(instance_in_transition(inst) == 0);
1511 1512 assert(inst->ri_method_thread == 0);
1512 1513
1513 1514 log_framework(LOG_DEBUG, "%s: trying to start instance\n",
1514 1515 inst->ri_i.i_fmri);
1515 1516
1516 1517 /*
1517 1518 * We want to keep the original reason for restarts and clear actions
1518 1519 */
1519 1520 switch (reason) {
1520 1521 case restarter_str_restart_request:
1521 1522 case restarter_str_clear_request:
1522 1523 new_reason = reason;
1523 1524 break;
1524 1525 default:
1525 1526 new_reason = restarter_str_dependencies_satisfied;
1526 1527 }
1527 1528
1528 1529 /* Services in the disabled and maintenance state are ignored */
1529 1530 if (inst->ri_i.i_state == RESTARTER_STATE_MAINT ||
1530 1531 inst->ri_i.i_state == RESTARTER_STATE_DISABLED ||
1531 1532 inst->ri_i.i_enabled == 0) {
1532 1533 log_framework(LOG_DEBUG,
1533 1534 "%s: start_instance -> is maint/disabled\n",
1534 1535 inst->ri_i.i_fmri);
1535 1536 return;
1536 1537 }
1537 1538
1538 1539 /* Already started instances are left alone */
1539 1540 if (instance_started(inst) == 1) {
1540 1541 log_framework(LOG_DEBUG,
1541 1542 "%s: start_instance -> is already started\n",
1542 1543 inst->ri_i.i_fmri);
1543 1544 return;
1544 1545 }
1545 1546
1546 1547 log_framework(LOG_DEBUG, "%s: starting instance.\n", inst->ri_i.i_fmri);
1547 1548
1548 1549 (void) restarter_instance_update_states(local_handle, inst,
1549 1550 inst->ri_i.i_state, RESTARTER_STATE_ONLINE, RERR_NONE, new_reason);
1550 1551
1551 1552 info = startd_zalloc(sizeof (fork_info_t));
1552 1553
1553 1554 info->sf_id = inst->ri_id;
1554 1555 info->sf_method_type = METHOD_START;
1555 1556 info->sf_event_type = RERR_NONE;
1556 1557 info->sf_reason = new_reason;
1557 1558 inst->ri_method_thread = startd_thread_create(method_thread, info);
1558 1559 }
1559 1560
1560 1561 static int
1561 1562 event_from_tty(scf_handle_t *h, restarter_inst_t *rip)
1562 1563 {
1563 1564 scf_instance_t *inst;
1564 1565 int ret = 0;
1565 1566
1566 1567 if (libscf_fmri_get_instance(h, rip->ri_i.i_fmri, &inst))
1567 1568 return (-1);
1568 1569
1569 1570 ret = restarter_inst_ractions_from_tty(inst);
1570 1571
1571 1572 scf_instance_destroy(inst);
1572 1573 return (ret);
1573 1574 }
1574 1575
1575 1576 static boolean_t
1576 1577 restart_dump(scf_handle_t *h, restarter_inst_t *rip)
1577 1578 {
1578 1579 scf_instance_t *inst;
1579 1580 boolean_t ret = B_FALSE;
1580 1581
1581 1582 if (libscf_fmri_get_instance(h, rip->ri_i.i_fmri, &inst))
1582 1583 return (-1);
1583 1584
1584 1585 if (restarter_inst_dump(inst) == 1)
1585 1586 ret = B_TRUE;
1586 1587
1587 1588 scf_instance_destroy(inst);
1588 1589 return (ret);
1589 1590 }
1590 1591
/*
 * Move rip toward the maintenance state for the given reason.  If
 * 'immediate' is set, or the instance isn't running, go straight to
 * maintenance (removing any leftover contract); otherwise run the stop
 * method first and let it complete the transition.  Caller holds
 * rip->ri_lock, and no method thread may be running.
 */
static void
maintain_instance(scf_handle_t *h, restarter_inst_t *rip, int immediate,
    restarter_str_t reason)
{
	fork_info_t *info;
	scf_instance_t *scf_inst = NULL;

	assert(MUTEX_HELD(&rip->ri_lock));
	assert(reason != restarter_str_none);
	assert(rip->ri_method_thread == 0);

	log_instance(rip, B_TRUE, "Stopping for maintenance due to %s.",
	    restarter_get_str_short(reason));
	log_framework(LOG_DEBUG, "%s: stopping for maintenance due to %s.\n",
	    rip->ri_i.i_fmri, restarter_get_str_short(reason));

	/* Services in the maintenance state are ignored */
	if (rip->ri_i.i_state == RESTARTER_STATE_MAINT) {
		log_framework(LOG_DEBUG,
		    "%s: maintain_instance -> is already in maintenance\n",
		    rip->ri_i.i_fmri);
		return;
	}

	/*
	 * If reason state is restarter_str_service_request and
	 * restarter_actions/auxiliary_fmri property is set with a valid fmri,
	 * copy the fmri to restarter/auxiliary_fmri so svcs -x can use.
	 */
	if (reason == restarter_str_service_request &&
	    libscf_fmri_get_instance(h, rip->ri_i.i_fmri, &scf_inst) == 0) {
		if (restarter_inst_validate_ractions_aux_fmri(scf_inst) == 0) {
			if (restarter_inst_set_aux_fmri(scf_inst))
				log_framework(LOG_DEBUG, "%s: "
				    "restarter_inst_set_aux_fmri failed: ",
				    rip->ri_i.i_fmri);
		} else {
			log_framework(LOG_DEBUG, "%s: "
			    "restarter_inst_validate_ractions_aux_fmri "
			    "failed: ", rip->ri_i.i_fmri);

			/* Invalid aux fmri: clear any stale copy. */
			if (restarter_inst_reset_aux_fmri(scf_inst))
				log_framework(LOG_DEBUG, "%s: "
				    "restarter_inst_reset_aux_fmri failed: ",
				    rip->ri_i.i_fmri);
		}
		scf_instance_destroy(scf_inst);
	}

	/*
	 * Immediate request, or nothing is running: skip the stop method
	 * and transition directly to maintenance.
	 */
	if (immediate || !instance_started(rip)) {
		/* Remove any lingering primary contract first. */
		if (rip->ri_i.i_primary_ctid != 0) {
			rip->ri_m_inst = safe_scf_instance_create(h);
			rip->ri_mi_deleted = B_FALSE;

			libscf_reget_instance(rip);
			method_remove_contract(rip, B_TRUE, B_TRUE);

			scf_instance_destroy(rip->ri_m_inst);
		}

		(void) restarter_instance_update_states(h, rip,
		    RESTARTER_STATE_MAINT, RESTARTER_STATE_NONE, RERR_RESTART,
		    reason);
		return;
	}

	/* Graceful path: mark next-state maintenance and run the stop method. */
	(void) restarter_instance_update_states(h, rip, rip->ri_i.i_state,
	    RESTARTER_STATE_MAINT, RERR_NONE, reason);

	log_transition(rip, MAINT_REQUESTED);

	info = startd_zalloc(sizeof (*info));
	info->sf_id = rip->ri_id;
	info->sf_method_type = METHOD_STOP;
	info->sf_event_type = RERR_RESTART;
	info->sf_reason = reason;
	rip->ri_method_thread = startd_thread_create(method_thread, info);
}
1669 1670
/*
 * Reread rip's configuration from its running snapshot and, if the
 * instance is running, dispatch the refresh method.  Refresh never
 * changes the instance's state.  Caller holds rip->ri_lock.
 */
static void
refresh_instance(scf_handle_t *h, restarter_inst_t *rip)
{
	scf_instance_t *inst;
	scf_snapshot_t *snap;
	fork_info_t *info;
	int r;

	assert(MUTEX_HELD(&rip->ri_lock));

	log_instance(rip, B_TRUE, "Rereading configuration.");
	log_framework(LOG_DEBUG, "%s: rereading configuration.\n",
	    rip->ri_i.i_fmri);

rep_retry:
	r = libscf_fmri_get_instance(h, rip->ri_i.i_fmri, &inst);
	switch (r) {
	case 0:
		break;

	case ECONNABORTED:
		/* Repository connection broke: rebind and retry. */
		libscf_handle_rebind(h);
		goto rep_retry;

	case ENOENT:
		/* Must have been deleted. */
		return;

	case EINVAL:
	case ENOTSUP:
	default:
		bad_error("libscf_fmri_get_instance", r);
	}

	snap = libscf_get_running_snapshot(inst);

	/* Reload the startd/* properties (duration, utmpx prefix, etc.). */
	r = libscf_get_startd_properties(inst, snap, &rip->ri_flags,
	    &rip->ri_utmpx_prefix);
	switch (r) {
	case 0:
		log_framework(LOG_DEBUG, "%s is a %s-style service\n",
		    rip->ri_i.i_fmri, service_style(rip->ri_flags));
		break;

	case ECONNABORTED:
		scf_instance_destroy(inst);
		scf_snapshot_destroy(snap);
		libscf_handle_rebind(h);
		goto rep_retry;

	case ECANCELED:
	case ENOENT:
		/* Succeed in anticipation of REMOVE_INSTANCE. */
		break;

	default:
		bad_error("libscf_get_startd_properties", r);
	}

	if (instance_started(rip)) {
		/* Refresh does not change the state. */
		(void) restarter_instance_update_states(h, rip,
		    rip->ri_i.i_state, rip->ri_i.i_state, RERR_NONE,
		    restarter_str_refresh);

		/* Run the refresh method asynchronously. */
		info = startd_zalloc(sizeof (*info));
		info->sf_id = rip->ri_id;
		info->sf_method_type = METHOD_REFRESH;
		info->sf_event_type = RERR_REFRESH;
		info->sf_reason = NULL;

		assert(rip->ri_method_thread == 0);
		rip->ri_method_thread =
		    startd_thread_create(method_thread, info);
	}

	scf_snapshot_destroy(snap);
	scf_instance_destroy(inst);
}
1749 1750
1751 +static void
1752 +degrade_instance(scf_handle_t *h, restarter_inst_t *rip, restarter_str_t reason)
1753 +{
1754 + scf_instance_t *scf_inst = NULL;
1755 +
1756 + assert(MUTEX_HELD(&rip->ri_lock));
1757 +
1758 + log_instance(rip, B_TRUE, "Marking degraded due to %s.",
1759 + restarter_get_str_short(reason));
1760 + log_framework(LOG_DEBUG, "%s: marking degraded due to %s.\n",
1761 + rip->ri_i.i_fmri, restarter_get_str_short(reason));
1762 +
1763 + /* Services that aren't online are ignored */
1764 + if (rip->ri_i.i_state != RESTARTER_STATE_ONLINE) {
1765 + log_framework(LOG_DEBUG,
1766 + "%s: degrade_instance -> is not online\n",
1767 + rip->ri_i.i_fmri);
1768 + return;
1769 + }
1770 +
1771 + /*
1772 + * If reason state is restarter_str_service_request and
1773 + * restarter_actions/auxiliary_fmri property is set with a valid fmri,
1774 + * copy the fmri to restarter/auxiliary_fmri so svcs -x can use.
1775 + */
1776 + if (reason == restarter_str_service_request &&
1777 + libscf_fmri_get_instance(h, rip->ri_i.i_fmri, &scf_inst) == 0) {
1778 + if (restarter_inst_validate_ractions_aux_fmri(scf_inst) == 0) {
1779 + if (restarter_inst_set_aux_fmri(scf_inst))
1780 + log_framework(LOG_DEBUG, "%s: "
1781 + "restarter_inst_set_aux_fmri failed: ",
1782 + rip->ri_i.i_fmri);
1783 + } else {
1784 + log_framework(LOG_DEBUG, "%s: "
1785 + "restarter_inst_validate_ractions_aux_fmri "
1786 + "failed: ", rip->ri_i.i_fmri);
1787 +
1788 + if (restarter_inst_reset_aux_fmri(scf_inst))
1789 + log_framework(LOG_DEBUG, "%s: "
1790 + "restarter_inst_reset_aux_fmri failed: ",
1791 + rip->ri_i.i_fmri);
1792 + }
1793 + scf_instance_destroy(scf_inst);
1794 + }
1795 +
1796 + (void) restarter_instance_update_states(h, rip,
1797 + RESTARTER_STATE_DEGRADED, RESTARTER_STATE_NONE, RERR_NONE, reason);
1798 +
1799 + log_transition(rip, DEGRADE_REQUESTED);
1800 +}
1801 +
/*
 * Printable names for restarter protocol events, used in diagnostics.
 * NOTE: entries are indexed directly by event type (see
 * event_names[event->riq_type] in restarter_process_events()), so the
 * order here must track the restarter event type enumeration exactly.
 */
const char *event_names[] = { "INVALID", "ADD_INSTANCE", "REMOVE_INSTANCE",
	"ENABLE", "DISABLE", "ADMIN_RESTORE", "ADMIN_DEGRADED",
	"ADMIN_DEGRADE_IMMEDIATE", "ADMIN_REFRESH", "ADMIN_RESTART",
	"ADMIN_MAINT_OFF", "ADMIN_MAINT_ON", "ADMIN_MAINT_ON_IMMEDIATE",
	"STOP", "START", "DEPENDENCY_CYCLE", "INVALID_DEPENDENCY",
	"ADMIN_DISABLE", "STOP_RESET"
};
1756 1809
/*
 * void *restarter_process_events()
 *
 * Called in a separate thread to process the events on an instance's
 * queue.  Empties the queue completely, and tries to keep the thread
 * around for a little while after the queue is empty to save on
 * startup costs.
 *
 * arg is the instance FMRI (heap-allocated by the caller; freed here).
 */
static void *
restarter_process_events(void *arg)
{
	scf_handle_t *h;
	restarter_instance_qentry_t *event;
	restarter_inst_t *rip;
	char *fmri = (char *)arg;
	struct timespec to;

	assert(fmri != NULL);

	h = libscf_handle_create_bound_loop();

	/* grab the queue lock */
	rip = inst_lookup_queue(fmri);
	if (rip == NULL)
		goto out;

again:

	while ((event = uu_list_first(rip->ri_queue)) != NULL) {
		restarter_inst_t *inst;

		/* drop the queue lock */
		MUTEX_UNLOCK(&rip->ri_queue_lock);

		/*
		 * Grab the inst lock -- this waits until any outstanding
		 * method finishes running.
		 */
		inst = inst_lookup_by_name(fmri);
		if (inst == NULL) {
			/* Getting deleted in the middle isn't an error. */
			goto cont;
		}

		assert(instance_in_transition(inst) == 0);

		/* process the event */
		switch (event->riq_type) {
		case RESTARTER_EVENT_TYPE_ENABLE:
		case RESTARTER_EVENT_TYPE_DISABLE:
			(void) enable_inst(h, inst, event);
			break;

		case RESTARTER_EVENT_TYPE_ADMIN_DISABLE:
			/* Administrative disable also resets the throttle. */
			if (enable_inst(h, inst, event) == 0)
				reset_start_times(inst);
			break;

		case RESTARTER_EVENT_TYPE_REMOVE_INSTANCE:
			/* Deletes inst and drops its lock. */
			restarter_delete_inst(inst);
			inst = NULL;
			goto cont;

		case RESTARTER_EVENT_TYPE_STOP_RESET:
			reset_start_times(inst);
			/* FALLTHROUGH */
		case RESTARTER_EVENT_TYPE_STOP:
			(void) stop_instance(h, inst, RSTOP_DEPENDENCY);
			break;

		case RESTARTER_EVENT_TYPE_START:
			start_instance(h, inst, event->riq_reason);
			break;

		case RESTARTER_EVENT_TYPE_DEPENDENCY_CYCLE:
			maintain_instance(h, inst, 0,
			    restarter_str_dependency_cycle);
			break;

		case RESTARTER_EVENT_TYPE_INVALID_DEPENDENCY:
			maintain_instance(h, inst, 0,
			    restarter_str_invalid_dependency);
			break;

		case RESTARTER_EVENT_TYPE_ADMIN_MAINT_ON:
			/* Distinguish svcadm-from-tty from other requesters. */
			if (event_from_tty(h, inst) == 0)
				maintain_instance(h, inst, 0,
				    restarter_str_service_request);
			else
				maintain_instance(h, inst, 0,
				    restarter_str_administrative_request);
			break;

		case RESTARTER_EVENT_TYPE_ADMIN_MAINT_ON_IMMEDIATE:
			if (event_from_tty(h, inst) == 0)
				maintain_instance(h, inst, 1,
				    restarter_str_service_request);
			else
				maintain_instance(h, inst, 1,
				    restarter_str_administrative_request);
			break;

		case RESTARTER_EVENT_TYPE_ADMIN_MAINT_OFF:
			unmaintain_instance(h, inst, RUNMAINT_CLEAR);
			reset_start_times(inst);
			break;

		case RESTARTER_EVENT_TYPE_ADMIN_REFRESH:
			refresh_instance(h, inst);
			break;

		case RESTARTER_EVENT_TYPE_ADMIN_DEGRADED:
		case RESTARTER_EVENT_TYPE_ADMIN_DEGRADE_IMMEDIATE:
			if (event_from_tty(h, inst) == 0)
				degrade_instance(h, inst,
				    restarter_str_service_request);
			else
				degrade_instance(h, inst,
				    restarter_str_administrative_request);
			break;

		case RESTARTER_EVENT_TYPE_ADMIN_RESTORE:
		case RESTARTER_EVENT_TYPE_ADMIN_RESTART:
			if (!instance_started(inst)) {
				log_framework(LOG_DEBUG, "Restarter: "
				    "Not restarting %s; not running.\n",
				    inst->ri_i.i_fmri);
			} else {
				/*
				 * Stop the instance. If it can be restarted,
				 * the graph engine will send a new event.
				 */
				if (restart_dump(h, inst)) {
					(void) contract_kill(
					    inst->ri_i.i_primary_ctid, SIGABRT,
					    inst->ri_i.i_fmri);
				} else if (stop_instance(h, inst,
				    RSTOP_RESTART) == 0) {
					reset_start_times(inst);
				}
			}
			break;

		case RESTARTER_EVENT_TYPE_ADD_INSTANCE:
		default:
#ifndef NDEBUG
			uu_warn("%s:%d: Bad restarter event %d. "
			    "Aborting.\n", __FILE__, __LINE__, event->riq_type);
#endif
			abort();
		}

		assert(inst != NULL);
		MUTEX_UNLOCK(&inst->ri_lock);

cont:
		/* grab the queue lock */
		rip = inst_lookup_queue(fmri);
		if (rip == NULL)
			goto out;

		/* delete the event */
		uu_list_remove(rip->ri_queue, event);
		startd_free(event, sizeof (restarter_instance_qentry_t));
	}

	assert(rip != NULL);

	/*
	 * Try to preserve the thread for a little while for future use.
	 */
	to.tv_sec = 3;
	to.tv_nsec = 0;
	(void) pthread_cond_reltimedwait_np(&rip->ri_queue_cv,
	    &rip->ri_queue_lock, &to);

	if (uu_list_first(rip->ri_queue) != NULL)
		goto again;

	rip->ri_queue_thread = 0;
	MUTEX_UNLOCK(&rip->ri_queue_lock);

out:
	(void) scf_handle_unbind(h);
	scf_handle_destroy(h);
	free(fmri);
	return (NULL);
}
↓ open down ↓ |
56 lines elided |
↑ open up ↑ |
1940 1998
1941 1999 static int
1942 2000 is_admin_event(restarter_event_type_t t) {
1943 2001
1944 2002 switch (t) {
1945 2003 case RESTARTER_EVENT_TYPE_ADMIN_MAINT_ON:
1946 2004 case RESTARTER_EVENT_TYPE_ADMIN_MAINT_ON_IMMEDIATE:
1947 2005 case RESTARTER_EVENT_TYPE_ADMIN_MAINT_OFF:
1948 2006 case RESTARTER_EVENT_TYPE_ADMIN_REFRESH:
1949 2007 case RESTARTER_EVENT_TYPE_ADMIN_DEGRADED:
2008 + case RESTARTER_EVENT_TYPE_ADMIN_DEGRADE_IMMEDIATE:
2009 + case RESTARTER_EVENT_TYPE_ADMIN_RESTORE:
1950 2010 case RESTARTER_EVENT_TYPE_ADMIN_RESTART:
1951 2011 return (1);
1952 2012 default:
1953 2013 return (0);
1954 2014 }
1955 2015 }
1956 2016
1957 2017 static void
1958 2018 restarter_queue_event(restarter_inst_t *ri, restarter_protocol_event_t *e)
1959 2019 {
1960 2020 restarter_instance_qentry_t *qe;
1961 2021 int r;
1962 2022
1963 2023 assert(MUTEX_HELD(&ri->ri_queue_lock));
1964 2024 assert(!MUTEX_HELD(&ri->ri_lock));
1965 2025
1966 2026 qe = startd_zalloc(sizeof (restarter_instance_qentry_t));
1967 2027 qe->riq_type = e->rpe_type;
1968 2028 qe->riq_reason = e->rpe_reason;
1969 2029
1970 2030 uu_list_node_init(qe, &qe->riq_link, restarter_queue_pool);
1971 2031 r = uu_list_insert_before(ri->ri_queue, NULL, qe);
1972 2032 assert(r == 0);
1973 2033 }
1974 2034
/*
 * void *restarter_event_thread()
 *
 * Handle incoming graph events by placing them on a per-instance
 * queue.  We can't lock the main part of the instance structure, so
 * just modify the separately locked event queue portion.
 */
/*ARGSUSED*/
static void *
restarter_event_thread(void *unused)
{
	scf_handle_t *h;

	/*
	 * This is a new thread, and thus, gets its own handle
	 * to the repository.
	 */
	h = libscf_handle_create_bound_loop();

	MUTEX_LOCK(&ru->restarter_update_lock);

	/*CONSTCOND*/
	while (1) {
		restarter_protocol_event_t *e;

		/* Sleep until the graph engine signals new events. */
		while (ru->restarter_update_wakeup == 0)
			(void) pthread_cond_wait(&ru->restarter_update_cv,
			    &ru->restarter_update_lock);

		ru->restarter_update_wakeup = 0;

		/* Drain the global event queue. */
		while ((e = restarter_event_dequeue()) != NULL) {
			restarter_inst_t *rip;
			char *fmri;

			MUTEX_UNLOCK(&ru->restarter_update_lock);

			/*
			 * ADD_INSTANCE is special: there's likely no
			 * instance structure yet, so we need to handle the
			 * addition synchronously.
			 */
			switch (e->rpe_type) {
			case RESTARTER_EVENT_TYPE_ADD_INSTANCE:
				if (restarter_insert_inst(h, e->rpe_inst) != 0)
					log_error(LOG_INFO, "Restarter: "
					    "Could not add %s.\n", e->rpe_inst);

				/* Wake anyone waiting on graph load. */
				MUTEX_LOCK(&st->st_load_lock);
				if (--st->st_load_instances == 0)
					(void) pthread_cond_broadcast(
					    &st->st_load_cv);
				MUTEX_UNLOCK(&st->st_load_lock);

				goto nolookup;
			}

			/*
			 * Lookup the instance, locking only the event queue.
			 * Can't grab ri_lock here because it might be held
			 * by a long-running method.
			 */
			rip = inst_lookup_queue(e->rpe_inst);
			if (rip == NULL) {
				log_error(LOG_INFO, "Restarter: "
				    "Ignoring %s command for unknown service "
				    "%s.\n", event_names[e->rpe_type],
				    e->rpe_inst);
				goto nolookup;
			}

			/* Keep ADMIN events from filling up the queue. */
			if (is_admin_event(e->rpe_type) &&
			    uu_list_numnodes(rip->ri_queue) >
			    RINST_QUEUE_THRESHOLD) {
				MUTEX_UNLOCK(&rip->ri_queue_lock);
				log_instance(rip, B_TRUE, "Instance event "
				    "queue overflow. Dropping administrative "
				    "request.");
				log_framework(LOG_DEBUG, "%s: Instance event "
				    "queue overflow. Dropping administrative "
				    "request.\n", rip->ri_i.i_fmri);
				goto nolookup;
			}

			/* Now add the event to the instance queue. */
			restarter_queue_event(rip, e);

			if (rip->ri_queue_thread == 0) {
				/*
				 * Start a thread if one isn't already
				 * running.
				 */
				fmri = safe_strdup(e->rpe_inst);
				rip->ri_queue_thread = startd_thread_create(
				    restarter_process_events, (void *)fmri);
			} else {
				/*
				 * Signal the existing thread that there's
				 * a new event.
				 */
				(void) pthread_cond_broadcast(
				    &rip->ri_queue_cv);
			}

			MUTEX_UNLOCK(&rip->ri_queue_lock);
nolookup:
			restarter_event_release(e);

			MUTEX_LOCK(&ru->restarter_update_lock);
		}
	}

	/*
	 * Unreachable for now -- there's currently no graceful cleanup
	 * called on exit().
	 */
	(void) scf_handle_unbind(h);
	scf_handle_destroy(h);
	return (NULL);
}
2096 2156
/*
 * Map a contract id to its owning restarter instance.  On success the
 * instance is returned with ri_lock held (taken by inst_lookup_by_id());
 * the caller is responsible for dropping it.  Returns NULL if the contract
 * is unknown or is no longer the instance's primary contract.
 */
static restarter_inst_t *
contract_to_inst(ctid_t ctid)
{
	restarter_inst_t *inst;
	int id;

	id = lookup_inst_by_contract(ctid);
	if (id == -1)
		return (NULL);

	inst = inst_lookup_by_id(id);
	if (inst != NULL) {
		/*
		 * Since ri_lock isn't held by the contract id lookup, this
		 * instance may have been restarted and now be in a new
		 * contract, making the old contract no longer valid for this
		 * instance.
		 */
		if (ctid != inst->ri_i.i_primary_ctid) {
			MUTEX_UNLOCK(&inst->ri_lock);
			inst = NULL;
		}
	}
	return (inst);
}
2122 2182
/*
 * void contract_action()
 *   Take action on contract events.  Stops the instance owning contract
 *   'id' with the stop cause matching the critical event 'type'
 *   (empty/core/signal/hwerr), unless the instance is wait-style or startd
 *   itself already stopped the contract.  Caller must hold inst->ri_lock.
 */
static void
contract_action(scf_handle_t *h, restarter_inst_t *inst, ctid_t id,
    uint32_t type)
{
	const char *fmri = inst->ri_i.i_fmri;

	assert(MUTEX_HELD(&inst->ri_lock));

	/*
	 * If startd has stopped this contract, there is no need to
	 * stop it again.
	 */
	if (inst->ri_i.i_primary_ctid > 0 &&
	    inst->ri_i.i_primary_ctid_stopped)
		return;

	if ((type & (CT_PR_EV_EMPTY | CT_PR_EV_CORE | CT_PR_EV_SIGNAL
	    | CT_PR_EV_HWERR)) == 0) {
		/*
		 * There shouldn't be other events, since that's not how we set
		 * the terms.  Thus, just log an error and drive on.
		 */
		log_framework(LOG_NOTICE,
		    "%s: contract %ld received unexpected critical event "
		    "(%d)\n", fmri, id, type);
		return;
	}

	assert(instance_in_transition(inst) == 0);

	if (instance_is_wait_style(inst)) {
		/*
		 * We ignore all events; if they impact the
		 * process we're monitoring, then the
		 * wait_thread will stop the instance.
		 */
		log_framework(LOG_DEBUG,
		    "%s: ignoring contract event on wait-style service\n",
		    fmri);
	} else {
		/*
		 * A CT_PR_EV_EMPTY event is an RSTOP_EXIT request.
		 */
		switch (type) {
		case CT_PR_EV_EMPTY:
			(void) stop_instance(h, inst, RSTOP_EXIT);
			break;
		case CT_PR_EV_CORE:
			(void) stop_instance(h, inst, RSTOP_CORE);
			break;
		case CT_PR_EV_SIGNAL:
			(void) stop_instance(h, inst, RSTOP_SIGNAL);
			break;
		case CT_PR_EV_HWERR:
			(void) stop_instance(h, inst, RSTOP_HWERR);
			break;
		}
	}
}
2186 2246
/*
 * void *restarter_contracts_event_thread(void *)
 *   Listens to the process contract bundle for critical events, taking action
 *   on events from contracts we know we are responsible for.
 */
/*ARGSUSED*/
static void *
restarter_contracts_event_thread(void *unused)
{
	int fd, err;
	scf_handle_t *local_handle;

	/*
	 * Await graph load completion.  That is, stop here, until we've
	 * scanned the repository for contract - instance associations.
	 */
	MUTEX_LOCK(&st->st_load_lock);
	while (!(st->st_load_complete && st->st_load_instances == 0))
		(void) pthread_cond_wait(&st->st_load_cv, &st->st_load_lock);
	MUTEX_UNLOCK(&st->st_load_lock);

	/*
	 * This is a new thread, and thus, gets its own handle
	 * to the repository.
	 */
	if ((local_handle = libscf_handle_create_bound(SCF_VERSION)) == NULL)
		uu_die("Unable to bind a new repository handle: %s\n",
		    scf_strerror(scf_error()));

	fd = open64(CTFS_ROOT "/process/pbundle", O_RDONLY);
	if (fd == -1)
		uu_die("process bundle open failed");

	/*
	 * Make sure we get all events (including those generated by configd
	 * before this thread was started).
	 */
	err = ct_event_reset(fd);
	assert(err == 0);

	for (;;) {
		int efd, sfd;
		ct_evthdl_t ev;
		uint32_t type;
		ctevid_t evid;
		ct_stathdl_t status;
		ctid_t ctid;
		restarter_inst_t *inst;
		uint64_t cookie;

		/* Blocks until the next critical event arrives. */
		if (err = ct_event_read_critical(fd, &ev)) {
			log_error(LOG_WARNING,
			    "Error reading next contract event: %s",
			    strerror(err));
			continue;
		}

		evid = ct_event_get_evid(ev);
		ctid = ct_event_get_ctid(ev);
		type = ct_event_get_type(ev);

		/* Fetch cookie. */
		if ((sfd = contract_open(ctid, "process", "status", O_RDONLY))
		    < 0) {
			ct_event_free(ev);
			continue;
		}

		if (err = ct_status_read(sfd, CTD_COMMON, &status)) {
			log_framework(LOG_WARNING, "Could not get status for "
			    "contract %ld: %s\n", ctid, strerror(err));

			startd_close(sfd);
			ct_event_free(ev);
			continue;
		}

		cookie = ct_status_get_cookie(status);

		log_framework(LOG_DEBUG, "Received event %d for ctid %ld "
		    "cookie %lld\n", type, ctid, cookie);

		ct_status_free(status);

		startd_close(sfd);

		/*
		 * svc.configd(1M) restart handling performed by the
		 * fork_configd_thread.  We don't acknowledge, as that thread
		 * will do so.
		 */
		if (cookie == CONFIGD_COOKIE) {
			ct_event_free(ev);
			continue;
		}

		inst = NULL;
		if (storing_contract != 0 &&
		    (inst = contract_to_inst(ctid)) == NULL) {
			/*
			 * This can happen for two reasons:
			 * - method_run() has not yet stored the
			 *   contract into the internal hash table.
			 * - we receive an EMPTY event for an abandoned
			 *   contract.
			 * If there is any contract in the process of
			 * being stored into the hash table then re-read
			 * the event later.
			 */
			log_framework(LOG_DEBUG,
			    "Reset event %d for unknown "
			    "contract id %ld\n", type, ctid);

			/* don't go too fast */
			(void) poll(NULL, 0, 100);

			(void) ct_event_reset(fd);
			ct_event_free(ev);
			continue;
		}

		/*
		 * Do not call contract_to_inst() again if first
		 * call succeeded.
		 */
		if (inst == NULL)
			inst = contract_to_inst(ctid);
		if (inst == NULL) {
			/*
			 * This can happen if we receive an EMPTY
			 * event for an abandoned contract.
			 */
			log_framework(LOG_DEBUG,
			    "Received event %d for unknown contract id "
			    "%ld\n", type, ctid);
		} else {
			log_framework(LOG_DEBUG,
			    "Received event %d for contract id "
			    "%ld (%s)\n", type, ctid,
			    inst->ri_i.i_fmri);

			contract_action(local_handle, inst, ctid, type);

			/* contract_to_inst() returned with ri_lock held. */
			MUTEX_UNLOCK(&inst->ri_lock);
		}

		/* Acknowledge the critical event on the contract's ctl. */
		efd = contract_open(ct_event_get_ctid(ev), "process", "ctl",
		    O_WRONLY);
		if (efd != -1) {
			(void) ct_ctl_ack(efd, evid);
			startd_close(efd);
		}

		ct_event_free(ev);

	}

	/*NOTREACHED*/
	return (NULL);
}
2347 2407
2348 2408 /*
2349 2409 * Timeout queue, processed by restarter_timeouts_event_thread().
2350 2410 */
timeout_queue_t *timeouts;
static uu_list_pool_t *timeout_pool;

/*
 * Wakeup mechanism used by timeout_insert() to notify the timeout thread
 * that a new entry has been placed on a (possibly previously empty) queue.
 */
typedef struct timeout_update {
	pthread_mutex_t tu_lock;	/* protects tu_wakeup and tu_cv */
	pthread_cond_t tu_cv;		/* broadcast when tu_wakeup is set */
	int tu_wakeup;			/* nonzero: new timeouts were queued */
} timeout_update_t;

timeout_update_t *tu;
2361 2421
/*
 * Services whose method timeouts svc.startd overrides, treating them as
 * infinite; see is_timeout_ovr() below.  NULL-terminated list of FMRIs.
 */
static const char *timeout_ovr_svcs[] = {
	"svc:/system/manifest-import:default",
	"svc:/network/initial:default",
	"svc:/network/service:default",
	"svc:/system/rmtmpfiles:default",
	"svc:/network/loopback:default",
	"svc:/network/physical:default",
	"svc:/system/device/local:default",
	"svc:/system/filesystem/usr:default",
	"svc:/system/filesystem/minimal:default",
	"svc:/system/filesystem/local:default",
	NULL
};
2375 2435
2376 2436 int
2377 2437 is_timeout_ovr(restarter_inst_t *inst)
2378 2438 {
2379 2439 int i;
2380 2440
2381 2441 for (i = 0; timeout_ovr_svcs[i] != NULL; ++i) {
2382 2442 if (strcmp(inst->ri_i.i_fmri, timeout_ovr_svcs[i]) == 0) {
2383 2443 log_instance(inst, B_TRUE, "Timeout override by "
2384 2444 "svc.startd. Using infinite timeout.");
2385 2445 return (1);
2386 2446 }
2387 2447 }
2388 2448
2389 2449 return (0);
2390 2450 }
2391 2451
2392 2452 /*ARGSUSED*/
2393 2453 static int
2394 2454 timeout_compare(const void *lc_arg, const void *rc_arg, void *private)
2395 2455 {
2396 2456 hrtime_t t1 = ((const timeout_entry_t *)lc_arg)->te_timeout;
2397 2457 hrtime_t t2 = ((const timeout_entry_t *)rc_arg)->te_timeout;
2398 2458
2399 2459 if (t1 > t2)
2400 2460 return (1);
2401 2461 else if (t1 < t2)
2402 2462 return (-1);
2403 2463 return (0);
2404 2464 }
2405 2465
/*
 * Initialize the global method-timeout queue ('timeouts') and the timeout
 * thread's wakeup structure ('tu').  Must run before any methods execute.
 */
void
timeout_init()
{
	timeouts = startd_zalloc(sizeof (timeout_queue_t));

	(void) pthread_mutex_init(&timeouts->tq_lock, &mutex_attrs);

	/* Entries are kept sorted by expiry; see timeout_compare(). */
	timeout_pool = startd_list_pool_create("timeouts",
	    sizeof (timeout_entry_t), offsetof(timeout_entry_t, te_link),
	    timeout_compare, UU_LIST_POOL_DEBUG);
	assert(timeout_pool != NULL);

	timeouts->tq_list = startd_list_create(timeout_pool,
	    timeouts, UU_LIST_SORTED);
	assert(timeouts->tq_list != NULL);

	tu = startd_zalloc(sizeof (timeout_update_t));
	(void) pthread_cond_init(&tu->tu_cv, NULL);
	(void) pthread_mutex_init(&tu->tu_lock, &mutex_attrs);
}
2426 2486
/*
 * Schedule a method timeout of 'timeout_sec' seconds for the method running
 * in contract 'cid' on behalf of 'inst', then wake the timeout thread.
 * A timeout too large to express in hrtime is treated as infinite (nothing
 * is queued).  Caller must hold inst->ri_lock.
 */
void
timeout_insert(restarter_inst_t *inst, ctid_t cid, uint64_t timeout_sec)
{
	hrtime_t now, timeout;
	timeout_entry_t *entry;
	uu_list_index_t idx;

	assert(MUTEX_HELD(&inst->ri_lock));

	now = gethrtime();

	/*
	 * If we overflow LLONG_MAX, we're never timing out anyways, so
	 * just return.
	 */
	if (timeout_sec >= (LLONG_MAX - now) / 1000000000LL) {
		log_instance(inst, B_TRUE, "timeout_seconds too large, "
		    "treating as infinite.");
		return;
	}

	/* hrtime is in nanoseconds. Convert timeout_sec. */
	timeout = now + (timeout_sec * 1000000000LL);

	entry = startd_alloc(sizeof (timeout_entry_t));
	entry->te_timeout = timeout;
	entry->te_ctid = cid;
	entry->te_fmri = safe_strdup(inst->ri_i.i_fmri);
	entry->te_logstem = safe_strdup(inst->ri_logstem);
	entry->te_fired = 0;
	/* Insert the calculated timeout time onto the queue. */
	MUTEX_LOCK(&timeouts->tq_lock);
	(void) uu_list_find(timeouts->tq_list, entry, NULL, &idx);
	uu_list_node_init(entry, &entry->te_link, timeout_pool);
	uu_list_insert(timeouts->tq_list, entry, idx);
	MUTEX_UNLOCK(&timeouts->tq_lock);

	/* At most one pending timeout per instance. */
	assert(inst->ri_timeout == NULL);
	inst->ri_timeout = entry;

	/* Wake the timeout thread in case its queue was empty. */
	MUTEX_LOCK(&tu->tu_lock);
	tu->tu_wakeup = 1;
	(void) pthread_cond_broadcast(&tu->tu_cv);
	MUTEX_UNLOCK(&tu->tu_lock);
}
2472 2532
2473 2533
2474 2534 void
2475 2535 timeout_remove(restarter_inst_t *inst, ctid_t cid)
2476 2536 {
2477 2537 assert(MUTEX_HELD(&inst->ri_lock));
2478 2538
2479 2539 if (inst->ri_timeout == NULL)
2480 2540 return;
2481 2541
2482 2542 assert(inst->ri_timeout->te_ctid == cid);
2483 2543
2484 2544 MUTEX_LOCK(&timeouts->tq_lock);
2485 2545 uu_list_remove(timeouts->tq_list, inst->ri_timeout);
2486 2546 MUTEX_UNLOCK(&timeouts->tq_lock);
2487 2547
2488 2548 free(inst->ri_timeout->te_fmri);
2489 2549 free(inst->ri_timeout->te_logstem);
2490 2550 startd_free(inst->ri_timeout, sizeof (timeout_entry_t));
2491 2551 inst->ri_timeout = NULL;
2492 2552 }
2493 2553
/*
 * Fire all expired timeouts: for each entry at the head of the sorted queue
 * whose expiry is <= now, SIGKILL its contract.  Fired entries stay on the
 * queue (marked te_fired) until timeout_remove() runs, so a contract that
 * survives is signalled again on the next one-second pass.
 * Returns 0 if any entries remain queued, -1 if the queue is empty.
 */
static int
timeout_now()
{
	timeout_entry_t *e;
	hrtime_t now;
	int ret;

	now = gethrtime();

	/*
	 * Walk through the (sorted) timeouts list.  While the timeout
	 * at the head of the list is <= the current time, kill the
	 * method.
	 */
	MUTEX_LOCK(&timeouts->tq_lock);

	for (e = uu_list_first(timeouts->tq_list);
	    e != NULL && e->te_timeout <= now;
	    e = uu_list_next(timeouts->tq_list, e)) {
		log_framework(LOG_WARNING, "%s: Method or service exit timed "
		    "out. Killing contract %ld.\n", e->te_fmri, e->te_ctid);
		log_instance_fmri(e->te_fmri, e->te_logstem, B_TRUE,
		    "Method or service exit timed out. Killing contract %ld.",
		    e->te_ctid);
		e->te_fired = 1;
		(void) contract_kill(e->te_ctid, SIGKILL, e->te_fmri);
	}

	if (uu_list_numnodes(timeouts->tq_list) > 0)
		ret = 0;
	else
		ret = -1;

	MUTEX_UNLOCK(&timeouts->tq_lock);

	return (ret);
}
2531 2591
/*
 * void *restarter_timeouts_event_thread(void *)
 *   Responsible for monitoring the method timeouts.  This thread must
 *   be started before any methods are called.
 */
/*ARGSUSED*/
static void *
restarter_timeouts_event_thread(void *unused)
{
	/*
	 * Timeouts are entered on a priority queue, which is processed by
	 * this thread.  As timeouts are specified in seconds, we'll do
	 * the necessary processing every second, as long as the queue
	 * is not empty.
	 */

	/*CONSTCOND*/
	while (1) {
		/*
		 * As long as the timeout list isn't empty, process it
		 * every second.
		 */
		if (timeout_now() == 0) {
			(void) sleep(1);
			continue;
		}

		/* The list is empty, wait until we have more timeouts. */
		MUTEX_LOCK(&tu->tu_lock);

		/* tu_wakeup is set by timeout_insert(). */
		while (tu->tu_wakeup == 0)
			(void) pthread_cond_wait(&tu->tu_cv, &tu->tu_lock);

		tu->tu_wakeup = 0;
		MUTEX_UNLOCK(&tu->tu_lock);
	}

	/* Unreachable; loop above never exits. */
	return (NULL);
}
2571 2631
/*
 * Start the restarter's four long-running threads: method-timeout
 * processing, graph event intake, contract event handling, and wait-style
 * service monitoring.  timeout_init()/restarter_init() must run first.
 */
void
restarter_start()
{
	(void) startd_thread_create(restarter_timeouts_event_thread, NULL);
	(void) startd_thread_create(restarter_event_thread, NULL);
	(void) startd_thread_create(restarter_contracts_event_thread, NULL);
	(void) startd_thread_create(wait_thread, NULL);
}
2580 2640
2581 2641
/*
 * Initialize the restarter's global state: the sorted instance list, the
 * per-instance event queue pool, the contract list pool, and the
 * contract-to-instance hash table.
 */
void
restarter_init()
{
	restarter_instance_pool = startd_list_pool_create("restarter_instances",
	    sizeof (restarter_inst_t), offsetof(restarter_inst_t,
	    ri_link), restarter_instance_compare, UU_LIST_POOL_DEBUG);
	(void) memset(&instance_list, 0, sizeof (instance_list));

	(void) pthread_mutex_init(&instance_list.ril_lock, &mutex_attrs);
	instance_list.ril_instance_list = startd_list_create(
	    restarter_instance_pool, &instance_list, UU_LIST_SORTED);

	restarter_queue_pool = startd_list_pool_create(
	    "restarter_instance_queue", sizeof (restarter_instance_qentry_t),
	    offsetof(restarter_instance_qentry_t, riq_link), NULL,
	    UU_LIST_POOL_DEBUG);

	contract_list_pool = startd_list_pool_create(
	    "contract_list", sizeof (contract_entry_t),
	    offsetof(contract_entry_t, ce_link), NULL,
	    UU_LIST_POOL_DEBUG);
	contract_hash_init();

	log_framework(LOG_DEBUG, "Initialized restarter\n");
}
↓ open down ↓ |
647 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX