8158 Want named threads API
9857 proc manpages should have LIBRARY section
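For context, 8158 is the issue that added a named-threads API (pthread_setname_np() and friends) to illumos; this webrev covers the svc.startd side, where each long-running restarter thread now names itself on entry so the name is visible in debugging tools. A minimal sketch of the pattern the diff adopts is shown below; the worker() and main() functions are illustrative only and are not part of the change.

#include <pthread.h>

static void *
worker(void *arg)
{
	/*
	 * Name the calling thread, as the new startd code does on entry to
	 * each thread function; the return value is ignored since naming is
	 * purely advisory.
	 */
	(void) pthread_setname_np(pthread_self(), "worker");

	/* ... thread body ... */
	return (arg);
}

int
main(void)
{
	pthread_t tid;

	(void) pthread_create(&tid, NULL, worker, NULL);
	(void) pthread_join(tid, NULL);
	return (0);
}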
--- old/usr/src/cmd/svc/startd/restarter.c
+++ new/usr/src/cmd/svc/startd/restarter.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
24 - * Copyright (c) 2013, Joyent, Inc. All rights reserved.
24 + * Copyright 2018 Joyent, Inc.
25 25 */
26 26
27 27 /*
28 28 * restarter.c - service manipulation
29 29 *
30 30 * This component manages services whose restarter is svc.startd, the standard
31 31 * restarter. It translates restarter protocol events from the graph engine
32 32 * into actions on processes, as a delegated restarter would do.
33 33 *
34 34 * The master restarter manages a number of always-running threads:
35 35 * - restarter event thread: events from the graph engine
36 36 * - timeout thread: thread to fire queued timeouts
37 37 * - contract thread: thread to handle contract events
38 38 * - wait thread: thread to handle wait-based services
39 39 *
40 40 * The other threads are created as-needed:
41 41 * - per-instance method threads
42 42 * - per-instance event processing threads
43 43 *
44 44 * The interaction of all threads must result in the following conditions
45 45 * being satisfied (on a per-instance basis):
46 46 * - restarter events must be processed in order
47 47 * - method execution must be serialized
48 48 * - instance delete must be held until outstanding methods are complete
49 49 * - contract events shouldn't be processed while a method is running
50 50 * - timeouts should fire even when a method is running
51 51 *
52 52 * Service instances are represented by restarter_inst_t's and are kept in the
53 53 * instance_list list.
54 54 *
55 55 * Service States
56 56 * The current state of a service instance is kept in
57 57 * restarter_inst_t->ri_i.i_state. If transition to a new state could take
58 58 * some time, then before we effect the transition we set
59 59 * restarter_inst_t->ri_i.i_next_state to the target state, and afterwards we
60 60 * rotate i_next_state to i_state and set i_next_state to
61 61 * RESTARTER_STATE_NONE. So usually i_next_state is _NONE when ri_lock is not
62 62 * held. The exception is when we launch methods, which are done with
63 63 * a separate thread. To keep any other threads from grabbing ri_lock before
64 64 * method_thread() does, we set ri_method_thread to the thread id of the
65 65 * method thread, and when it is nonzero any thread with a different thread id
66 66 * waits on ri_method_cv.
67 67 *
68 68 * Method execution is serialized by blocking on ri_method_cv in
69 69 * inst_lookup_by_id() and waiting for a 0 value of ri_method_thread. This
70 70 * also prevents the instance structure from being deleted until all
71 71 * outstanding operations such as method_thread() have finished.
72 72 *
73 73 * Lock ordering:
74 74 *
75 75 * dgraph_lock [can be held when taking:]
76 76 * utmpx_lock
77 77 * dictionary->dict_lock
78 78 * st->st_load_lock
79 79 * wait_info_lock
80 80 * ru->restarter_update_lock
81 81 * restarter_queue->rpeq_lock
82 82 * instance_list.ril_lock
83 83 * inst->ri_lock
84 84 * st->st_configd_live_lock
85 85 *
86 86 * instance_list.ril_lock
87 87 * graph_queue->gpeq_lock
88 88 * gu->gu_lock
89 89 * st->st_configd_live_lock
90 90 * dictionary->dict_lock
91 91 * inst->ri_lock
92 92 * graph_queue->gpeq_lock
93 93 * gu->gu_lock
94 94 * tu->tu_lock
95 95 * tq->tq_lock
96 96 * inst->ri_queue_lock
97 97 * wait_info_lock
98 98 * bp->cb_lock
99 99 * utmpx_lock
100 100 *
101 101 * single_user_thread_lock
102 102 * wait_info_lock
103 103 * utmpx_lock
104 104 *
105 105 * gu_freeze_lock
106 106 *
107 107 * logbuf_mutex nests inside pretty much everything.
108 108 */
109 109
110 110 #include <sys/contract/process.h>
111 111 #include <sys/ctfs.h>
112 112 #include <sys/stat.h>
113 113 #include <sys/time.h>
114 114 #include <sys/types.h>
115 115 #include <sys/uio.h>
116 116 #include <sys/wait.h>
117 117 #include <assert.h>
118 118 #include <errno.h>
119 119 #include <fcntl.h>
120 120 #include <libcontract.h>
121 121 #include <libcontract_priv.h>
122 122 #include <libintl.h>
123 123 #include <librestart.h>
124 124 #include <librestart_priv.h>
125 125 #include <libuutil.h>
126 126 #include <limits.h>
127 127 #include <poll.h>
128 128 #include <port.h>
129 129 #include <pthread.h>
130 130 #include <stdarg.h>
131 131 #include <stdio.h>
132 132 #include <strings.h>
133 133 #include <unistd.h>
134 134
135 135 #include "startd.h"
136 136 #include "protocol.h"
137 137
138 138 static uu_list_pool_t *restarter_instance_pool;
139 139 static restarter_instance_list_t instance_list;
140 140
141 141 static uu_list_pool_t *restarter_queue_pool;
142 142
143 143 #define WT_SVC_ERR_THROTTLE 1 /* 1 sec delay for erroring wait svc */
144 144
145 145 /*
146 146 * Function used to reset the restart times for an instance, when
147 147 * an administrative task comes along and essentially makes the times
148 148 * in this array ineffective.
149 149 */
150 150 static void
151 151 reset_start_times(restarter_inst_t *inst)
152 152 {
153 153 inst->ri_start_index = 0;
154 154 bzero(inst->ri_start_time, sizeof (inst->ri_start_time));
155 155 }
156 156
157 157 /*ARGSUSED*/
158 158 static int
159 159 restarter_instance_compare(const void *lc_arg, const void *rc_arg,
160 160 void *private)
161 161 {
162 162 int lc_id = ((const restarter_inst_t *)lc_arg)->ri_id;
163 163 int rc_id = *(int *)rc_arg;
164 164
165 165 if (lc_id > rc_id)
166 166 return (1);
167 167 if (lc_id < rc_id)
168 168 return (-1);
169 169 return (0);
170 170 }
171 171
172 172 static restarter_inst_t *
173 173 inst_lookup_by_name(const char *name)
174 174 {
175 175 int id;
176 176
177 177 id = dict_lookup_byname(name);
178 178 if (id == -1)
179 179 return (NULL);
180 180
181 181 return (inst_lookup_by_id(id));
182 182 }
183 183
184 184 restarter_inst_t *
185 185 inst_lookup_by_id(int id)
186 186 {
187 187 restarter_inst_t *inst;
188 188
189 189 MUTEX_LOCK(&instance_list.ril_lock);
190 190 inst = uu_list_find(instance_list.ril_instance_list, &id, NULL, NULL);
191 191 if (inst != NULL)
192 192 MUTEX_LOCK(&inst->ri_lock);
193 193 MUTEX_UNLOCK(&instance_list.ril_lock);
194 194
195 195 if (inst != NULL) {
196 196 while (inst->ri_method_thread != 0 &&
197 197 !pthread_equal(inst->ri_method_thread, pthread_self())) {
198 198 ++inst->ri_method_waiters;
199 199 (void) pthread_cond_wait(&inst->ri_method_cv,
200 200 &inst->ri_lock);
201 201 assert(inst->ri_method_waiters > 0);
202 202 --inst->ri_method_waiters;
203 203 }
204 204 }
205 205
206 206 return (inst);
207 207 }
208 208
209 209 static restarter_inst_t *
210 210 inst_lookup_queue(const char *name)
211 211 {
212 212 int id;
213 213 restarter_inst_t *inst;
214 214
215 215 id = dict_lookup_byname(name);
216 216 if (id == -1)
217 217 return (NULL);
218 218
219 219 MUTEX_LOCK(&instance_list.ril_lock);
220 220 inst = uu_list_find(instance_list.ril_instance_list, &id, NULL, NULL);
221 221 if (inst != NULL)
222 222 MUTEX_LOCK(&inst->ri_queue_lock);
223 223 MUTEX_UNLOCK(&instance_list.ril_lock);
224 224
225 225 return (inst);
226 226 }
227 227
228 228 const char *
229 229 service_style(int flags)
230 230 {
231 231 switch (flags & RINST_STYLE_MASK) {
232 232 case RINST_CONTRACT: return ("contract");
233 233 case RINST_TRANSIENT: return ("transient");
234 234 case RINST_WAIT: return ("wait");
235 235
236 236 default:
237 237 #ifndef NDEBUG
238 238 uu_warn("%s:%d: Bad flags 0x%x.\n", __FILE__, __LINE__, flags);
239 239 #endif
240 240 abort();
241 241 /* NOTREACHED */
242 242 }
243 243 }
244 244
245 245 /*
246 246 * Fails with ECONNABORTED or ECANCELED.
247 247 */
248 248 static int
249 249 check_contract(restarter_inst_t *inst, boolean_t primary,
250 250 scf_instance_t *scf_inst)
251 251 {
252 252 ctid_t *ctidp;
253 253 int fd, r;
254 254
255 255 ctidp = primary ? &inst->ri_i.i_primary_ctid :
256 256 &inst->ri_i.i_transient_ctid;
257 257
258 258 assert(*ctidp >= 1);
259 259
260 260 fd = contract_open(*ctidp, NULL, "status", O_RDONLY);
261 261 if (fd >= 0) {
262 262 r = close(fd);
263 263 assert(r == 0);
264 264 return (0);
265 265 }
266 266
267 267 r = restarter_remove_contract(scf_inst, *ctidp, primary ?
268 268 RESTARTER_CONTRACT_PRIMARY : RESTARTER_CONTRACT_TRANSIENT);
269 269 switch (r) {
270 270 case 0:
271 271 case ECONNABORTED:
272 272 case ECANCELED:
273 273 *ctidp = 0;
274 274 return (r);
275 275
276 276 case ENOMEM:
277 277 uu_die("Out of memory\n");
278 278 /* NOTREACHED */
279 279
280 280 case EPERM:
281 281 uu_die("Insufficient privilege.\n");
282 282 /* NOTREACHED */
283 283
284 284 case EACCES:
285 285 uu_die("Repository backend access denied.\n");
286 286 /* NOTREACHED */
287 287
288 288 case EROFS:
289 289 log_error(LOG_INFO, "Could not remove unusable contract id %ld "
290 290 "for %s from repository.\n", *ctidp, inst->ri_i.i_fmri);
291 291 return (0);
292 292
293 293 case EINVAL:
294 294 case EBADF:
295 295 default:
296 296 assert(0);
297 297 abort();
298 298 /* NOTREACHED */
299 299 }
300 300 }
301 301
302 302 static int stop_instance(scf_handle_t *, restarter_inst_t *, stop_cause_t);
303 303
304 304 /*
305 305 * int restarter_insert_inst(scf_handle_t *, char *)
306 306 * If the inst is already in the restarter list, return its id. If the inst
307 307 * is not in the restarter list, initialize a restarter_inst_t, initialize its
308 308 * states, insert it into the list, and return 0.
309 309 *
310 310 * Fails with
311 311 * ENOENT - name is not in the repository
312 312 */
313 313 static int
314 314 restarter_insert_inst(scf_handle_t *h, const char *name)
315 315 {
316 316 int id, r;
317 317 restarter_inst_t *inst;
318 318 uu_list_index_t idx;
319 319 scf_service_t *scf_svc;
320 320 scf_instance_t *scf_inst;
321 321 scf_snapshot_t *snap = NULL;
322 322 scf_propertygroup_t *pg;
323 323 char *svc_name, *inst_name;
324 324 char logfilebuf[PATH_MAX];
325 325 char *c;
326 326 boolean_t do_commit_states;
327 327 restarter_instance_state_t state, next_state;
328 328 protocol_states_t *ps;
329 329 pid_t start_pid;
330 330 restarter_str_t reason = restarter_str_insert_in_graph;
331 331
332 332 MUTEX_LOCK(&instance_list.ril_lock);
333 333
334 334 /*
335 335 * We don't use inst_lookup_by_name() here because we want the lookup
336 336 * & insert to be atomic.
337 337 */
338 338 id = dict_lookup_byname(name);
339 339 if (id != -1) {
340 340 inst = uu_list_find(instance_list.ril_instance_list, &id, NULL,
341 341 &idx);
342 342 if (inst != NULL) {
343 343 MUTEX_UNLOCK(&instance_list.ril_lock);
344 344 return (0);
345 345 }
346 346 }
347 347
348 348 /* Allocate an instance */
349 349 inst = startd_zalloc(sizeof (restarter_inst_t));
350 350 inst->ri_utmpx_prefix = startd_alloc(max_scf_value_size);
351 351 inst->ri_utmpx_prefix[0] = '\0';
352 352
353 353 inst->ri_i.i_fmri = startd_alloc(strlen(name) + 1);
354 354 (void) strcpy((char *)inst->ri_i.i_fmri, name);
355 355
356 356 inst->ri_queue = startd_list_create(restarter_queue_pool, inst, 0);
357 357
358 358 /*
359 359 * id shouldn't be -1 since we use the same dictionary as graph.c, but
360 360 * just in case.
361 361 */
362 362 inst->ri_id = (id != -1 ? id : dict_insert(name));
363 363
364 364 special_online_hooks_get(name, &inst->ri_pre_online_hook,
365 365 &inst->ri_post_online_hook, &inst->ri_post_offline_hook);
366 366
367 367 scf_svc = safe_scf_service_create(h);
368 368 scf_inst = safe_scf_instance_create(h);
369 369 pg = safe_scf_pg_create(h);
370 370 svc_name = startd_alloc(max_scf_name_size);
371 371 inst_name = startd_alloc(max_scf_name_size);
372 372
373 373 rep_retry:
374 374 if (snap != NULL)
375 375 scf_snapshot_destroy(snap);
376 376 if (inst->ri_logstem != NULL)
377 377 startd_free(inst->ri_logstem, PATH_MAX);
378 378 if (inst->ri_common_name != NULL)
379 379 free(inst->ri_common_name);
380 380 if (inst->ri_C_common_name != NULL)
381 381 free(inst->ri_C_common_name);
382 382 snap = NULL;
383 383 inst->ri_logstem = NULL;
384 384 inst->ri_common_name = NULL;
385 385 inst->ri_C_common_name = NULL;
386 386
387 387 if (scf_handle_decode_fmri(h, name, NULL, scf_svc, scf_inst, NULL,
388 388 NULL, SCF_DECODE_FMRI_EXACT) != 0) {
389 389 switch (scf_error()) {
390 390 case SCF_ERROR_CONNECTION_BROKEN:
391 391 libscf_handle_rebind(h);
392 392 goto rep_retry;
393 393
394 394 case SCF_ERROR_NOT_FOUND:
395 395 goto deleted;
396 396 }
397 397
398 398 uu_die("Can't decode FMRI %s: %s\n", name,
399 399 scf_strerror(scf_error()));
400 400 }
401 401
402 402 /*
403 403 * If there's no running snapshot, then we execute using the editing
404 404 * snapshot. Pending snapshots will be taken later.
405 405 */
406 406 snap = libscf_get_running_snapshot(scf_inst);
407 407
408 408 if ((scf_service_get_name(scf_svc, svc_name, max_scf_name_size) < 0) ||
409 409 (scf_instance_get_name(scf_inst, inst_name, max_scf_name_size) <
410 410 0)) {
411 411 switch (scf_error()) {
412 412 case SCF_ERROR_NOT_SET:
413 413 break;
414 414
415 415 case SCF_ERROR_CONNECTION_BROKEN:
416 416 libscf_handle_rebind(h);
417 417 goto rep_retry;
418 418
419 419 default:
420 420 assert(0);
421 421 abort();
422 422 }
423 423
424 424 goto deleted;
425 425 }
426 426
427 427 (void) snprintf(logfilebuf, PATH_MAX, "%s:%s", svc_name, inst_name);
428 428 for (c = logfilebuf; *c != '\0'; c++)
429 429 if (*c == '/')
430 430 *c = '-';
431 431
432 432 inst->ri_logstem = startd_alloc(PATH_MAX);
433 433 (void) snprintf(inst->ri_logstem, PATH_MAX, "%s%s", logfilebuf,
434 434 LOG_SUFFIX);
435 435
436 436 /*
437 437 * If the restarter group is missing, use uninit/none. Otherwise,
438 438 * we're probably being restarted & don't want to mess up the states
439 439 * that are there.
440 440 */
441 441 state = RESTARTER_STATE_UNINIT;
442 442 next_state = RESTARTER_STATE_NONE;
443 443
444 444 r = scf_instance_get_pg(scf_inst, SCF_PG_RESTARTER, pg);
445 445 if (r != 0) {
446 446 switch (scf_error()) {
447 447 case SCF_ERROR_CONNECTION_BROKEN:
448 448 libscf_handle_rebind(h);
449 449 goto rep_retry;
450 450
451 451 case SCF_ERROR_NOT_SET:
452 452 goto deleted;
453 453
454 454 case SCF_ERROR_NOT_FOUND:
455 455 /*
456 456 * This shouldn't happen since the graph engine should
457 457 * have initialized the state to uninitialized/none if
458 458 * there was no restarter pg. In case somebody
459 459 * deleted it, though....
460 460 */
461 461 do_commit_states = B_TRUE;
462 462 break;
463 463
464 464 default:
465 465 assert(0);
466 466 abort();
467 467 }
468 468 } else {
469 469 r = libscf_read_states(pg, &state, &next_state);
470 470 if (r != 0) {
471 471 do_commit_states = B_TRUE;
472 472 } else {
473 473 if (next_state != RESTARTER_STATE_NONE) {
474 474 /*
475 475 * Force next_state to _NONE since we
476 476 * don't look for method processes.
477 477 */
478 478 next_state = RESTARTER_STATE_NONE;
479 479 do_commit_states = B_TRUE;
480 480 } else {
481 481 /*
482 482 * The reason for transition will depend on
483 483 * state.
484 484 */
485 485 if (st->st_initial == 0)
486 486 reason = restarter_str_startd_restart;
487 487 else if (state == RESTARTER_STATE_MAINT)
488 488 reason = restarter_str_bad_repo_state;
489 489 /*
490 490 * Inform the restarter of our state without
491 491 * changing the STIME in the repository.
492 492 */
493 493 ps = startd_alloc(sizeof (*ps));
494 494 inst->ri_i.i_state = ps->ps_state = state;
495 495 inst->ri_i.i_next_state = ps->ps_state_next =
496 496 next_state;
497 497 ps->ps_reason = reason;
498 498
499 499 graph_protocol_send_event(inst->ri_i.i_fmri,
500 500 GRAPH_UPDATE_STATE_CHANGE, ps);
501 501
502 502 do_commit_states = B_FALSE;
503 503 }
504 504 }
505 505 }
506 506
507 507 switch (libscf_get_startd_properties(scf_inst, snap, &inst->ri_flags,
508 508 &inst->ri_utmpx_prefix)) {
509 509 case 0:
510 510 break;
511 511
512 512 case ECONNABORTED:
513 513 libscf_handle_rebind(h);
514 514 goto rep_retry;
515 515
516 516 case ECANCELED:
517 517 goto deleted;
518 518
519 519 case ENOENT:
520 520 /*
521 521 * This is odd, because the graph engine should have required
522 522 * the general property group. So we'll just use default
523 523 * flags in anticipation of the graph engine sending us
524 524 * REMOVE_INSTANCE when it finds out that the general property
525 525 * group has been deleted.
526 526 */
527 527 inst->ri_flags = RINST_CONTRACT;
528 528 break;
529 529
530 530 default:
531 531 assert(0);
532 532 abort();
533 533 }
534 534
535 535 r = libscf_get_template_values(scf_inst, snap,
536 536 &inst->ri_common_name, &inst->ri_C_common_name);
537 537
538 538 /*
539 539 * Copy our names to smaller buffers to reduce our memory footprint.
540 540 */
541 541 if (inst->ri_common_name != NULL) {
542 542 char *tmp = safe_strdup(inst->ri_common_name);
543 543 startd_free(inst->ri_common_name, max_scf_value_size);
544 544 inst->ri_common_name = tmp;
545 545 }
546 546
547 547 if (inst->ri_C_common_name != NULL) {
548 548 char *tmp = safe_strdup(inst->ri_C_common_name);
549 549 startd_free(inst->ri_C_common_name, max_scf_value_size);
550 550 inst->ri_C_common_name = tmp;
551 551 }
552 552
553 553 switch (r) {
554 554 case 0:
555 555 break;
556 556
557 557 case ECONNABORTED:
558 558 libscf_handle_rebind(h);
559 559 goto rep_retry;
560 560
561 561 case ECANCELED:
562 562 goto deleted;
563 563
564 564 case ECHILD:
565 565 case ENOENT:
566 566 break;
567 567
568 568 default:
569 569 assert(0);
570 570 abort();
571 571 }
572 572
573 573 switch (libscf_read_method_ids(h, scf_inst, inst->ri_i.i_fmri,
574 574 &inst->ri_i.i_primary_ctid, &inst->ri_i.i_transient_ctid,
575 575 &start_pid)) {
576 576 case 0:
577 577 break;
578 578
579 579 case ECONNABORTED:
580 580 libscf_handle_rebind(h);
581 581 goto rep_retry;
582 582
583 583 case ECANCELED:
584 584 goto deleted;
585 585
586 586 default:
587 587 assert(0);
588 588 abort();
589 589 }
590 590
591 591 if (inst->ri_i.i_primary_ctid >= 1) {
592 592 contract_hash_store(inst->ri_i.i_primary_ctid, inst->ri_id);
593 593
594 594 switch (check_contract(inst, B_TRUE, scf_inst)) {
595 595 case 0:
596 596 break;
597 597
598 598 case ECONNABORTED:
599 599 libscf_handle_rebind(h);
600 600 goto rep_retry;
601 601
602 602 case ECANCELED:
603 603 goto deleted;
604 604
605 605 default:
606 606 assert(0);
607 607 abort();
608 608 }
609 609 }
610 610
611 611 if (inst->ri_i.i_transient_ctid >= 1) {
612 612 switch (check_contract(inst, B_FALSE, scf_inst)) {
613 613 case 0:
614 614 break;
615 615
616 616 case ECONNABORTED:
617 617 libscf_handle_rebind(h);
618 618 goto rep_retry;
619 619
620 620 case ECANCELED:
621 621 goto deleted;
622 622
623 623 default:
624 624 assert(0);
625 625 abort();
626 626 }
627 627 }
628 628
629 629 /* No more failures we live through, so add it to the list. */
630 630 (void) pthread_mutex_init(&inst->ri_lock, &mutex_attrs);
631 631 (void) pthread_mutex_init(&inst->ri_queue_lock, &mutex_attrs);
632 632 MUTEX_LOCK(&inst->ri_lock);
633 633 MUTEX_LOCK(&inst->ri_queue_lock);
634 634
635 635 (void) pthread_cond_init(&inst->ri_method_cv, NULL);
636 636
637 637 uu_list_node_init(inst, &inst->ri_link, restarter_instance_pool);
638 638 uu_list_insert(instance_list.ril_instance_list, inst, idx);
639 639 MUTEX_UNLOCK(&instance_list.ril_lock);
640 640
641 641 if (start_pid != -1 &&
642 642 (inst->ri_flags & RINST_STYLE_MASK) == RINST_WAIT) {
643 643 int ret;
644 644 ret = wait_register(start_pid, inst->ri_i.i_fmri, 0, 1);
645 645 if (ret == -1) {
646 646 /*
647 647 * Implication: if we can't reregister the
648 648 * instance, we will start another one. Two
649 649 * instances may or may not result in a resource
650 650 * conflict.
651 651 */
652 652 log_error(LOG_WARNING,
653 653 "%s: couldn't reregister %ld for wait\n",
654 654 inst->ri_i.i_fmri, start_pid);
655 655 } else if (ret == 1) {
656 656 /*
657 657 * Leading PID has exited.
658 658 */
659 659 (void) stop_instance(h, inst, RSTOP_EXIT);
660 660 }
661 661 }
662 662
663 663
664 664 scf_pg_destroy(pg);
665 665
666 666 if (do_commit_states)
667 667 (void) restarter_instance_update_states(h, inst, state,
668 668 next_state, RERR_NONE, reason);
669 669
670 670 log_framework(LOG_DEBUG, "%s is a %s-style service\n", name,
671 671 service_style(inst->ri_flags));
672 672
673 673 MUTEX_UNLOCK(&inst->ri_queue_lock);
674 674 MUTEX_UNLOCK(&inst->ri_lock);
675 675
676 676 startd_free(svc_name, max_scf_name_size);
677 677 startd_free(inst_name, max_scf_name_size);
678 678 scf_snapshot_destroy(snap);
679 679 scf_instance_destroy(scf_inst);
680 680 scf_service_destroy(scf_svc);
681 681
682 682 log_framework(LOG_DEBUG, "%s: inserted instance into restarter list\n",
683 683 name);
684 684
685 685 return (0);
686 686
687 687 deleted:
688 688 MUTEX_UNLOCK(&instance_list.ril_lock);
689 689 startd_free(inst_name, max_scf_name_size);
690 690 startd_free(svc_name, max_scf_name_size);
691 691 if (snap != NULL)
692 692 scf_snapshot_destroy(snap);
693 693 scf_pg_destroy(pg);
694 694 scf_instance_destroy(scf_inst);
695 695 scf_service_destroy(scf_svc);
696 696 startd_free((void *)inst->ri_i.i_fmri, strlen(inst->ri_i.i_fmri) + 1);
697 697 uu_list_destroy(inst->ri_queue);
698 698 if (inst->ri_logstem != NULL)
699 699 startd_free(inst->ri_logstem, PATH_MAX);
700 700 if (inst->ri_common_name != NULL)
701 701 free(inst->ri_common_name);
702 702 if (inst->ri_C_common_name != NULL)
703 703 free(inst->ri_C_common_name);
704 704 startd_free(inst->ri_utmpx_prefix, max_scf_value_size);
705 705 startd_free(inst, sizeof (restarter_inst_t));
706 706 return (ENOENT);
707 707 }
708 708
709 709 static void
710 710 restarter_delete_inst(restarter_inst_t *ri)
711 711 {
712 712 int id;
713 713 restarter_inst_t *rip;
714 714 void *cookie = NULL;
715 715 restarter_instance_qentry_t *e;
716 716
717 717 assert(MUTEX_HELD(&ri->ri_lock));
718 718
719 719 /*
720 720 * Must drop the instance lock so we can pick up the instance_list
721 721 * lock & remove the instance.
722 722 */
723 723 id = ri->ri_id;
724 724 MUTEX_UNLOCK(&ri->ri_lock);
725 725
726 726 MUTEX_LOCK(&instance_list.ril_lock);
727 727
728 728 rip = uu_list_find(instance_list.ril_instance_list, &id, NULL, NULL);
729 729 if (rip == NULL) {
730 730 MUTEX_UNLOCK(&instance_list.ril_lock);
731 731 return;
732 732 }
733 733
734 734 assert(ri == rip);
735 735
736 736 uu_list_remove(instance_list.ril_instance_list, ri);
737 737
738 738 log_framework(LOG_DEBUG, "%s: deleted instance from restarter list\n",
739 739 ri->ri_i.i_fmri);
740 740
741 741 MUTEX_UNLOCK(&instance_list.ril_lock);
742 742
743 743 /*
744 744 * We can lock the instance without holding the instance_list lock
745 745 * since we removed the instance from the list.
746 746 */
747 747 MUTEX_LOCK(&ri->ri_lock);
748 748 MUTEX_LOCK(&ri->ri_queue_lock);
749 749
750 750 if (ri->ri_i.i_primary_ctid >= 1)
751 751 contract_hash_remove(ri->ri_i.i_primary_ctid);
752 752
753 753 while (ri->ri_method_thread != 0 || ri->ri_method_waiters > 0)
754 754 (void) pthread_cond_wait(&ri->ri_method_cv, &ri->ri_lock);
755 755
756 756 while ((e = uu_list_teardown(ri->ri_queue, &cookie)) != NULL)
757 757 startd_free(e, sizeof (*e));
758 758 uu_list_destroy(ri->ri_queue);
759 759
760 760 startd_free((void *)ri->ri_i.i_fmri, strlen(ri->ri_i.i_fmri) + 1);
761 761 startd_free(ri->ri_logstem, PATH_MAX);
762 762 if (ri->ri_common_name != NULL)
763 763 free(ri->ri_common_name);
764 764 if (ri->ri_C_common_name != NULL)
765 765 free(ri->ri_C_common_name);
766 766 startd_free(ri->ri_utmpx_prefix, max_scf_value_size);
767 767 (void) pthread_mutex_destroy(&ri->ri_lock);
768 768 (void) pthread_mutex_destroy(&ri->ri_queue_lock);
769 769 startd_free(ri, sizeof (restarter_inst_t));
770 770 }
771 771
772 772 /*
773 773 * instance_is_wait_style()
774 774 *
775 775 * Returns 1 if the given instance is a "wait-style" service instance.
776 776 */
777 777 int
778 778 instance_is_wait_style(restarter_inst_t *inst)
779 779 {
780 780 assert(MUTEX_HELD(&inst->ri_lock));
781 781 return ((inst->ri_flags & RINST_STYLE_MASK) == RINST_WAIT);
782 782 }
783 783
784 784 /*
785 785 * instance_is_transient_style()
786 786 *
787 787 * Returns 1 if the given instance is a transient service instance.
788 788 */
789 789 int
790 790 instance_is_transient_style(restarter_inst_t *inst)
791 791 {
792 792 assert(MUTEX_HELD(&inst->ri_lock));
793 793 return ((inst->ri_flags & RINST_STYLE_MASK) == RINST_TRANSIENT);
794 794 }
795 795
796 796 /*
797 797 * instance_in_transition()
798 798 * Returns 1 if instance is in transition, 0 if not
799 799 */
800 800 int
801 801 instance_in_transition(restarter_inst_t *inst)
802 802 {
803 803 assert(MUTEX_HELD(&inst->ri_lock));
804 804 if (inst->ri_i.i_next_state == RESTARTER_STATE_NONE)
805 805 return (0);
806 806 return (1);
807 807 }
808 808
809 809 /*
810 810 * returns 1 if instance is already started, 0 if not
811 811 */
812 812 static int
813 813 instance_started(restarter_inst_t *inst)
814 814 {
815 815 int ret;
816 816
817 817 assert(MUTEX_HELD(&inst->ri_lock));
818 818
819 819 if (inst->ri_i.i_state == RESTARTER_STATE_ONLINE ||
820 820 inst->ri_i.i_state == RESTARTER_STATE_DEGRADED)
821 821 ret = 1;
822 822 else
823 823 ret = 0;
824 824
825 825 return (ret);
826 826 }
827 827
828 828 /*
829 829 * Returns
830 830 * 0 - success
831 831 * ECONNRESET - success, but h was rebound
832 832 */
833 833 int
834 834 restarter_instance_update_states(scf_handle_t *h, restarter_inst_t *ri,
835 835 restarter_instance_state_t new_state,
836 836 restarter_instance_state_t new_state_next, restarter_error_t err,
837 837 restarter_str_t reason)
838 838 {
839 839 protocol_states_t *states;
840 840 int e;
841 841 uint_t retry_count = 0, msecs = ALLOC_DELAY;
842 842 boolean_t rebound = B_FALSE;
843 843 int prev_state_online;
844 844 int state_online;
845 845
846 846 assert(MUTEX_HELD(&ri->ri_lock));
847 847
848 848 prev_state_online = instance_started(ri);
849 849
850 850 retry:
851 851 e = _restarter_commit_states(h, &ri->ri_i, new_state, new_state_next,
852 852 restarter_get_str_short(reason));
853 853 switch (e) {
854 854 case 0:
855 855 break;
856 856
857 857 case ENOMEM:
858 858 ++retry_count;
859 859 if (retry_count < ALLOC_RETRY) {
860 860 (void) poll(NULL, 0, msecs);
861 861 msecs *= ALLOC_DELAY_MULT;
862 862 goto retry;
863 863 }
864 864
865 865 /* Like startd_alloc(). */
866 866 uu_die("Insufficient memory.\n");
867 867 /* NOTREACHED */
868 868
869 869 case ECONNABORTED:
870 870 libscf_handle_rebind(h);
871 871 rebound = B_TRUE;
872 872 goto retry;
873 873
874 874 case EPERM:
875 875 case EACCES:
876 876 case EROFS:
877 877 log_error(LOG_NOTICE, "Could not commit state change for %s "
878 878 "to repository: %s.\n", ri->ri_i.i_fmri, strerror(e));
879 879 /* FALLTHROUGH */
880 880
881 881 case ENOENT:
882 882 ri->ri_i.i_state = new_state;
883 883 ri->ri_i.i_next_state = new_state_next;
884 884 break;
885 885
886 886 case EINVAL:
887 887 default:
888 888 bad_error("_restarter_commit_states", e);
889 889 }
890 890
891 891 states = startd_alloc(sizeof (protocol_states_t));
892 892 states->ps_state = new_state;
893 893 states->ps_state_next = new_state_next;
894 894 states->ps_err = err;
895 895 states->ps_reason = reason;
896 896 graph_protocol_send_event(ri->ri_i.i_fmri, GRAPH_UPDATE_STATE_CHANGE,
897 897 (void *)states);
898 898
899 899 state_online = instance_started(ri);
900 900
901 901 if (prev_state_online && !state_online)
902 902 ri->ri_post_offline_hook();
903 903 else if (!prev_state_online && state_online)
904 904 ri->ri_post_online_hook();
905 905
906 906 return (rebound ? ECONNRESET : 0);
907 907 }
908 908
909 909 void
910 910 restarter_mark_pending_snapshot(const char *fmri, uint_t flag)
911 911 {
912 912 restarter_inst_t *inst;
913 913
914 914 assert(flag == RINST_RETAKE_RUNNING || flag == RINST_RETAKE_START);
915 915
916 916 inst = inst_lookup_by_name(fmri);
917 917 if (inst == NULL)
918 918 return;
919 919
920 920 inst->ri_flags |= flag;
921 921
922 922 MUTEX_UNLOCK(&inst->ri_lock);
923 923 }
924 924
925 925 static void
926 926 restarter_take_pending_snapshots(scf_handle_t *h)
927 927 {
928 928 restarter_inst_t *inst;
929 929 int r;
930 930
931 931 MUTEX_LOCK(&instance_list.ril_lock);
932 932
933 933 for (inst = uu_list_first(instance_list.ril_instance_list);
934 934 inst != NULL;
935 935 inst = uu_list_next(instance_list.ril_instance_list, inst)) {
936 936 const char *fmri;
937 937 scf_instance_t *sinst = NULL;
938 938
939 939 MUTEX_LOCK(&inst->ri_lock);
940 940
941 941 /*
942 942 * This is where we'd check inst->ri_method_thread and if it
943 943 * were nonzero we'd wait in anticipation of another thread
944 944 * executing a method for inst. Doing so with the instance_list
945 945 * locked, though, leads to deadlock. Since taking a snapshot
946 946 * during that window won't hurt anything, we'll just continue.
947 947 */
948 948
949 949 fmri = inst->ri_i.i_fmri;
950 950
951 951 if (inst->ri_flags & RINST_RETAKE_RUNNING) {
952 952 scf_snapshot_t *rsnap;
953 953
954 954 (void) libscf_fmri_get_instance(h, fmri, &sinst);
955 955
956 956 rsnap = libscf_get_or_make_running_snapshot(sinst,
957 957 fmri, B_FALSE);
958 958
959 959 scf_instance_destroy(sinst);
960 960
961 961 if (rsnap != NULL)
962 962 inst->ri_flags &= ~RINST_RETAKE_RUNNING;
963 963
964 964 scf_snapshot_destroy(rsnap);
965 965 }
966 966
967 967 if (inst->ri_flags & RINST_RETAKE_START) {
968 968 switch (r = libscf_snapshots_poststart(h, fmri,
969 969 B_FALSE)) {
970 970 case 0:
971 971 case ENOENT:
972 972 inst->ri_flags &= ~RINST_RETAKE_START;
973 973 break;
974 974
975 975 case ECONNABORTED:
976 976 break;
977 977
978 978 case EACCES:
979 979 default:
980 980 bad_error("libscf_snapshots_poststart", r);
981 981 }
982 982 }
983 983
984 984 MUTEX_UNLOCK(&inst->ri_lock);
985 985 }
986 986
987 987 MUTEX_UNLOCK(&instance_list.ril_lock);
988 988 }
989 989
990 990 /* ARGSUSED */
991 991 void *
992 992 restarter_post_fsminimal_thread(void *unused)
993 993 {
994 994 scf_handle_t *h;
995 995 int r;
996 996
997 + (void) pthread_setname_np(pthread_self(), "restarter_post_fsmin");
998 +
997 999 h = libscf_handle_create_bound_loop();
998 1000
999 1001 for (;;) {
1000 1002 r = libscf_create_self(h);
1001 1003 if (r == 0)
1002 1004 break;
1003 1005
1004 1006 assert(r == ECONNABORTED);
1005 1007 libscf_handle_rebind(h);
1006 1008 }
1007 1009
1008 1010 restarter_take_pending_snapshots(h);
1009 1011
1010 1012 (void) scf_handle_unbind(h);
1011 1013 scf_handle_destroy(h);
1012 1014
1013 1015 return (NULL);
1014 1016 }
1015 1017
1016 1018 /*
1017 1019 * int stop_instance()
1018 1020 *
1019 1021 * Stop the instance identified by the instance given as the second argument,
1020 1022 * for the cause stated.
1021 1023 *
1022 1024 * Returns
1023 1025 * 0 - success
1024 1026 * -1 - inst is in transition
1025 1027 */
1026 1028 static int
1027 1029 stop_instance(scf_handle_t *local_handle, restarter_inst_t *inst,
1028 1030 stop_cause_t cause)
1029 1031 {
1030 1032 fork_info_t *info;
1031 1033 const char *cp;
1032 1034 int err;
1033 1035 restarter_error_t re;
1034 1036 restarter_str_t reason;
1035 1037 restarter_instance_state_t new_state;
1036 1038
1037 1039 assert(MUTEX_HELD(&inst->ri_lock));
1038 1040 assert(inst->ri_method_thread == 0);
1039 1041
1040 1042 switch (cause) {
1041 1043 case RSTOP_EXIT:
1042 1044 re = RERR_RESTART;
1043 1045 reason = restarter_str_ct_ev_exit;
1044 1046 cp = "all processes in service exited";
1045 1047 break;
1046 1048 case RSTOP_ERR_CFG:
1047 1049 re = RERR_FAULT;
1048 1050 reason = restarter_str_method_failed;
1049 1051 cp = "service exited with a configuration error";
1050 1052 break;
1051 1053 case RSTOP_ERR_EXIT:
1052 1054 re = RERR_RESTART;
1053 1055 reason = restarter_str_ct_ev_exit;
1054 1056 cp = "service exited with an error";
1055 1057 break;
1056 1058 case RSTOP_CORE:
1057 1059 re = RERR_FAULT;
1058 1060 reason = restarter_str_ct_ev_core;
1059 1061 cp = "process dumped core";
1060 1062 break;
1061 1063 case RSTOP_SIGNAL:
1062 1064 re = RERR_FAULT;
1063 1065 reason = restarter_str_ct_ev_signal;
1064 1066 cp = "process received fatal signal from outside the service";
1065 1067 break;
1066 1068 case RSTOP_HWERR:
1067 1069 re = RERR_FAULT;
1068 1070 reason = restarter_str_ct_ev_hwerr;
1069 1071 cp = "process killed due to uncorrectable hardware error";
1070 1072 break;
1071 1073 case RSTOP_DEPENDENCY:
1072 1074 re = RERR_RESTART;
1073 1075 reason = restarter_str_dependency_activity;
1074 1076 cp = "dependency activity requires stop";
1075 1077 break;
1076 1078 case RSTOP_DISABLE:
1077 1079 re = RERR_RESTART;
1078 1080 reason = restarter_str_disable_request;
1079 1081 cp = "service disabled";
1080 1082 break;
1081 1083 case RSTOP_RESTART:
1082 1084 re = RERR_RESTART;
1083 1085 reason = restarter_str_restart_request;
1084 1086 cp = "service restarting";
1085 1087 break;
1086 1088 default:
1087 1089 #ifndef NDEBUG
1088 1090 (void) fprintf(stderr, "Unknown cause %d at %s:%d.\n",
1089 1091 cause, __FILE__, __LINE__);
1090 1092 #endif
1091 1093 abort();
1092 1094 }
1093 1095
1094 1096 /* Services in the disabled and maintenance state are ignored */
1095 1097 if (inst->ri_i.i_state == RESTARTER_STATE_MAINT ||
1096 1098 inst->ri_i.i_state == RESTARTER_STATE_DISABLED) {
1097 1099 log_framework(LOG_DEBUG,
1098 1100 "%s: stop_instance -> is maint/disabled\n",
1099 1101 inst->ri_i.i_fmri);
1100 1102 return (0);
1101 1103 }
1102 1104
1103 1105 /* Already stopped instances are left alone */
1104 1106 if (instance_started(inst) == 0) {
1105 1107 log_framework(LOG_DEBUG, "Restarter: %s is already stopped.\n",
1106 1108 inst->ri_i.i_fmri);
1107 1109 return (0);
1108 1110 }
1109 1111
1110 1112 if (instance_in_transition(inst)) {
1111 1113 /* requeue event by returning -1 */
1112 1114 log_framework(LOG_DEBUG,
1113 1115 "Restarter: Not stopping %s, in transition.\n",
1114 1116 inst->ri_i.i_fmri);
1115 1117 return (-1);
1116 1118 }
1117 1119
1118 1120 log_instance(inst, B_TRUE, "Stopping because %s.", cp);
1119 1121
1120 1122 log_framework(re == RERR_FAULT ? LOG_INFO : LOG_DEBUG,
1121 1123 "%s: Instance stopping because %s.\n", inst->ri_i.i_fmri, cp);
1122 1124
1123 1125 if (instance_is_wait_style(inst) &&
1124 1126 (cause == RSTOP_EXIT ||
1125 1127 cause == RSTOP_ERR_CFG ||
1126 1128 cause == RSTOP_ERR_EXIT)) {
1127 1129 /*
1128 1130 * No need to stop instance, as child has exited; remove
1129 1131 * contract and move the instance to the offline state.
1130 1132 */
1131 1133 switch (err = restarter_instance_update_states(local_handle,
1132 1134 inst, inst->ri_i.i_state, RESTARTER_STATE_OFFLINE, re,
1133 1135 reason)) {
1134 1136 case 0:
1135 1137 case ECONNRESET:
1136 1138 break;
1137 1139
1138 1140 default:
1139 1141 bad_error("restarter_instance_update_states", err);
1140 1142 }
1141 1143
1142 1144 if (cause == RSTOP_ERR_EXIT) {
1143 1145 /*
1144 1146 * The RSTOP_ERR_EXIT cause is set via the
1145 1147 * wait_thread -> wait_remove code path when we have
1146 1148 * a "wait" style svc that exited with an error. If
1147 1149 * the svc is failing too quickly, we throttle it so
1148 1150 * that we don't restart it more than once/second.
 1149 1151	 * Since we know we're running in the wait thread it's
1150 1152 * ok to throttle it right here.
1151 1153 */
1152 1154 (void) update_fault_count(inst, FAULT_COUNT_INCR);
1153 1155 if (method_rate_critical(inst)) {
1154 1156 log_instance(inst, B_TRUE, "Failing too "
1155 1157 "quickly, throttling.");
1156 1158 (void) sleep(WT_SVC_ERR_THROTTLE);
1157 1159 }
1158 1160 } else {
1159 1161 (void) update_fault_count(inst, FAULT_COUNT_RESET);
1160 1162 reset_start_times(inst);
1161 1163 }
1162 1164
1163 1165 if (inst->ri_i.i_primary_ctid != 0) {
1164 1166 inst->ri_m_inst =
1165 1167 safe_scf_instance_create(local_handle);
1166 1168 inst->ri_mi_deleted = B_FALSE;
1167 1169
1168 1170 libscf_reget_instance(inst);
1169 1171 method_remove_contract(inst, B_TRUE, B_TRUE);
1170 1172
1171 1173 scf_instance_destroy(inst->ri_m_inst);
1172 1174 inst->ri_m_inst = NULL;
1173 1175 }
1174 1176
1175 1177 switch (err = restarter_instance_update_states(local_handle,
1176 1178 inst, inst->ri_i.i_next_state, RESTARTER_STATE_NONE, re,
1177 1179 reason)) {
1178 1180 case 0:
1179 1181 case ECONNRESET:
1180 1182 break;
1181 1183
1182 1184 default:
1183 1185 bad_error("restarter_instance_update_states", err);
1184 1186 }
1185 1187
1186 1188 if (cause != RSTOP_ERR_CFG)
1187 1189 return (0);
1188 1190 } else if (instance_is_wait_style(inst) && re == RERR_RESTART) {
1189 1191 /*
1190 1192 * Stopping a wait service through means other than the pid
1191 1193 * exiting should keep wait_thread() from restarting the
1192 1194 * service, by removing it from the wait list.
1193 1195 * We cannot remove it right now otherwise the process will
1194 1196 * end up <defunct> so mark it to be ignored.
1195 1197 */
1196 1198 wait_ignore_by_fmri(inst->ri_i.i_fmri);
1197 1199 }
1198 1200
1199 1201 /*
1200 1202 * There are some configuration errors which we cannot detect until we
1201 1203 * try to run the method. For example, see exec_method() where the
1202 1204 * restarter_set_method_context() call can return SMF_EXIT_ERR_CONFIG
1203 1205 * in several cases. If this happens for a "wait-style" svc,
1204 1206 * wait_remove() sets the cause as RSTOP_ERR_CFG so that we can detect
1205 1207 * the configuration error and go into maintenance, even though it is
1206 1208 * a "wait-style" svc.
1207 1209 */
1208 1210 if (cause == RSTOP_ERR_CFG)
1209 1211 new_state = RESTARTER_STATE_MAINT;
1210 1212 else
1211 1213 new_state = inst->ri_i.i_enabled ?
1212 1214 RESTARTER_STATE_OFFLINE : RESTARTER_STATE_DISABLED;
1213 1215
1214 1216 switch (err = restarter_instance_update_states(local_handle, inst,
1215 1217 inst->ri_i.i_state, new_state, RERR_NONE, reason)) {
1216 1218 case 0:
1217 1219 case ECONNRESET:
1218 1220 break;
1219 1221
1220 1222 default:
1221 1223 bad_error("restarter_instance_update_states", err);
1222 1224 }
1223 1225
1224 1226 info = startd_zalloc(sizeof (fork_info_t));
1225 1227
1226 1228 info->sf_id = inst->ri_id;
1227 1229 info->sf_method_type = METHOD_STOP;
1228 1230 info->sf_event_type = re;
1229 1231 info->sf_reason = reason;
1230 1232 inst->ri_method_thread = startd_thread_create(method_thread, info);
1231 1233
1232 1234 return (0);
1233 1235 }
1234 1236
1235 1237 /*
1236 1238 * Returns
1237 1239 * ENOENT - fmri is not in instance_list
1238 1240 * 0 - success
1239 1241 * ECONNRESET - success, though handle was rebound
1240 1242 * -1 - instance is in transition
1241 1243 */
1242 1244 int
1243 1245 stop_instance_fmri(scf_handle_t *h, const char *fmri, uint_t flags)
1244 1246 {
1245 1247 restarter_inst_t *rip;
1246 1248 int r;
1247 1249
1248 1250 rip = inst_lookup_by_name(fmri);
1249 1251 if (rip == NULL)
1250 1252 return (ENOENT);
1251 1253
1252 1254 r = stop_instance(h, rip, flags);
1253 1255
1254 1256 MUTEX_UNLOCK(&rip->ri_lock);
1255 1257
1256 1258 return (r);
1257 1259 }
1258 1260
1259 1261 static void
1260 1262 unmaintain_instance(scf_handle_t *h, restarter_inst_t *rip,
1261 1263 unmaint_cause_t cause)
1262 1264 {
1263 1265 ctid_t ctid;
1264 1266 scf_instance_t *inst;
1265 1267 int r;
1266 1268 uint_t tries = 0, msecs = ALLOC_DELAY;
1267 1269 const char *cp;
1268 1270 restarter_str_t reason;
1269 1271
1270 1272 assert(MUTEX_HELD(&rip->ri_lock));
1271 1273
1272 1274 if (rip->ri_i.i_state != RESTARTER_STATE_MAINT) {
1273 1275 log_error(LOG_DEBUG, "Restarter: "
1274 1276 "Ignoring maintenance off command because %s is not in the "
1275 1277 "maintenance state.\n", rip->ri_i.i_fmri);
1276 1278 return;
1277 1279 }
1278 1280
1279 1281 switch (cause) {
1280 1282 case RUNMAINT_CLEAR:
1281 1283 cp = "clear requested";
1282 1284 reason = restarter_str_clear_request;
1283 1285 break;
1284 1286 case RUNMAINT_DISABLE:
1285 1287 cp = "disable requested";
1286 1288 reason = restarter_str_disable_request;
1287 1289 break;
1288 1290 default:
1289 1291 #ifndef NDEBUG
1290 1292 (void) fprintf(stderr, "Uncaught case for %d at %s:%d.\n",
1291 1293 cause, __FILE__, __LINE__);
1292 1294 #endif
1293 1295 abort();
1294 1296 }
1295 1297
1296 1298 log_instance(rip, B_TRUE, "Leaving maintenance because %s.",
1297 1299 cp);
1298 1300 log_framework(LOG_DEBUG, "%s: Instance leaving maintenance because "
1299 1301 "%s.\n", rip->ri_i.i_fmri, cp);
1300 1302
1301 1303 (void) restarter_instance_update_states(h, rip, RESTARTER_STATE_UNINIT,
1302 1304 RESTARTER_STATE_NONE, RERR_RESTART, reason);
1303 1305
1304 1306 /*
1305 1307 * If we did ADMIN_MAINT_ON_IMMEDIATE, then there might still be
1306 1308 * a primary contract.
1307 1309 */
1308 1310 if (rip->ri_i.i_primary_ctid == 0)
1309 1311 return;
1310 1312
1311 1313 ctid = rip->ri_i.i_primary_ctid;
1312 1314 contract_abandon(ctid);
1313 1315 rip->ri_i.i_primary_ctid = 0;
1314 1316
1315 1317 rep_retry:
1316 1318 switch (r = libscf_fmri_get_instance(h, rip->ri_i.i_fmri, &inst)) {
1317 1319 case 0:
1318 1320 break;
1319 1321
1320 1322 case ECONNABORTED:
1321 1323 libscf_handle_rebind(h);
1322 1324 goto rep_retry;
1323 1325
1324 1326 case ENOENT:
1325 1327 /* Must have been deleted. */
1326 1328 return;
1327 1329
1328 1330 case EINVAL:
1329 1331 case ENOTSUP:
1330 1332 default:
1331 1333 bad_error("libscf_handle_rebind", r);
1332 1334 }
1333 1335
1334 1336 again:
1335 1337 r = restarter_remove_contract(inst, ctid, RESTARTER_CONTRACT_PRIMARY);
1336 1338 switch (r) {
1337 1339 case 0:
1338 1340 break;
1339 1341
1340 1342 case ENOMEM:
1341 1343 ++tries;
1342 1344 if (tries < ALLOC_RETRY) {
1343 1345 (void) poll(NULL, 0, msecs);
1344 1346 msecs *= ALLOC_DELAY_MULT;
1345 1347 goto again;
1346 1348 }
1347 1349
1348 1350 uu_die("Insufficient memory.\n");
1349 1351 /* NOTREACHED */
1350 1352
1351 1353 case ECONNABORTED:
1352 1354 scf_instance_destroy(inst);
1353 1355 libscf_handle_rebind(h);
1354 1356 goto rep_retry;
1355 1357
1356 1358 case ECANCELED:
1357 1359 break;
1358 1360
1359 1361 case EPERM:
1360 1362 case EACCES:
1361 1363 case EROFS:
1362 1364 log_error(LOG_INFO,
1363 1365 "Could not remove contract id %lu for %s (%s).\n", ctid,
1364 1366 rip->ri_i.i_fmri, strerror(r));
1365 1367 break;
1366 1368
1367 1369 case EINVAL:
1368 1370 case EBADF:
1369 1371 default:
1370 1372 bad_error("restarter_remove_contract", r);
1371 1373 }
1372 1374
1373 1375 scf_instance_destroy(inst);
1374 1376 }
1375 1377
1376 1378 /*
1377 1379 * enable_inst()
1378 1380 * Set inst->ri_i.i_enabled. Expects 'e' to be _ENABLE, _DISABLE, or
1379 1381 * _ADMIN_DISABLE. If the event is _ENABLE and inst is uninitialized or
1380 1382 * disabled, move it to offline. If the event is _DISABLE or
1381 1383 * _ADMIN_DISABLE, make sure inst will move to disabled.
1382 1384 *
1383 1385 * Returns
1384 1386 * 0 - success
1385 1387 * ECONNRESET - h was rebound
1386 1388 */
1387 1389 static int
1388 1390 enable_inst(scf_handle_t *h, restarter_inst_t *inst,
1389 1391 restarter_instance_qentry_t *riq)
1390 1392 {
1391 1393 restarter_instance_state_t state;
1392 1394 restarter_event_type_t e = riq->riq_type;
1393 1395 restarter_str_t reason = restarter_str_per_configuration;
1394 1396 int r;
1395 1397
1396 1398 assert(MUTEX_HELD(&inst->ri_lock));
1397 1399 assert(e == RESTARTER_EVENT_TYPE_ADMIN_DISABLE ||
1398 1400 e == RESTARTER_EVENT_TYPE_DISABLE ||
1399 1401 e == RESTARTER_EVENT_TYPE_ENABLE);
1400 1402 assert(instance_in_transition(inst) == 0);
1401 1403
1402 1404 state = inst->ri_i.i_state;
1403 1405
1404 1406 if (e == RESTARTER_EVENT_TYPE_ENABLE) {
1405 1407 inst->ri_i.i_enabled = 1;
1406 1408
1407 1409 if (state == RESTARTER_STATE_UNINIT ||
1408 1410 state == RESTARTER_STATE_DISABLED) {
1409 1411 /*
1410 1412 * B_FALSE: Don't log an error if the log_instance()
1411 1413 * fails because it will fail on the miniroot before
1412 1414 * install-discovery runs.
1413 1415 */
1414 1416 log_instance(inst, B_FALSE, "Enabled.");
1415 1417 log_framework(LOG_DEBUG, "%s: Instance enabled.\n",
1416 1418 inst->ri_i.i_fmri);
1417 1419
1418 1420 /*
1419 1421 * If we are coming from DISABLED, it was obviously an
1420 1422 * enable request. If we are coming from UNINIT, it may
 1421 1423	 * have been a service in MAINT that was cleared.
1422 1424 */
1423 1425 if (riq->riq_reason == restarter_str_clear_request)
1424 1426 reason = restarter_str_clear_request;
1425 1427 else if (state == RESTARTER_STATE_DISABLED)
1426 1428 reason = restarter_str_enable_request;
1427 1429 (void) restarter_instance_update_states(h, inst,
1428 1430 RESTARTER_STATE_OFFLINE, RESTARTER_STATE_NONE,
1429 1431 RERR_NONE, reason);
1430 1432 } else {
1431 1433 log_framework(LOG_DEBUG, "Restarter: "
1432 1434 "Not changing state of %s for enable command.\n",
1433 1435 inst->ri_i.i_fmri);
1434 1436 }
1435 1437 } else {
1436 1438 inst->ri_i.i_enabled = 0;
1437 1439
1438 1440 switch (state) {
1439 1441 case RESTARTER_STATE_ONLINE:
1440 1442 case RESTARTER_STATE_DEGRADED:
1441 1443 r = stop_instance(h, inst, RSTOP_DISABLE);
1442 1444 return (r == ECONNRESET ? 0 : r);
1443 1445
1444 1446 case RESTARTER_STATE_OFFLINE:
1445 1447 case RESTARTER_STATE_UNINIT:
1446 1448 if (inst->ri_i.i_primary_ctid != 0) {
1447 1449 inst->ri_m_inst = safe_scf_instance_create(h);
1448 1450 inst->ri_mi_deleted = B_FALSE;
1449 1451
1450 1452 libscf_reget_instance(inst);
1451 1453 method_remove_contract(inst, B_TRUE, B_TRUE);
1452 1454
1453 1455 scf_instance_destroy(inst->ri_m_inst);
1454 1456 }
1455 1457 /* B_FALSE: See log_instance(..., "Enabled."); above */
1456 1458 log_instance(inst, B_FALSE, "Disabled.");
1457 1459 log_framework(LOG_DEBUG, "%s: Instance disabled.\n",
1458 1460 inst->ri_i.i_fmri);
1459 1461
1460 1462 /*
1461 1463 * If we are coming from OFFLINE, it was obviously a
1462 1464 * disable request. But if we are coming from
1463 1465 * UNINIT, it may have been a disable request for a
1464 1466 * service in MAINT.
1465 1467 */
1466 1468 if (riq->riq_reason == restarter_str_disable_request ||
1467 1469 state == RESTARTER_STATE_OFFLINE)
1468 1470 reason = restarter_str_disable_request;
1469 1471 (void) restarter_instance_update_states(h, inst,
1470 1472 RESTARTER_STATE_DISABLED, RESTARTER_STATE_NONE,
1471 1473 RERR_RESTART, reason);
1472 1474 return (0);
1473 1475
1474 1476 case RESTARTER_STATE_DISABLED:
1475 1477 break;
1476 1478
1477 1479 case RESTARTER_STATE_MAINT:
1478 1480 /*
1479 1481 * We only want to pull the instance out of maintenance
 1480 1482	 * if the disable is on administrative request. The
1481 1483 * graph engine sends _DISABLE events whenever a
1482 1484 * service isn't in the disabled state, and we don't
1483 1485 * want to pull the service out of maintenance if,
1484 1486 * for example, it is there due to a dependency cycle.
1485 1487 */
1486 1488 if (e == RESTARTER_EVENT_TYPE_ADMIN_DISABLE)
1487 1489 unmaintain_instance(h, inst, RUNMAINT_DISABLE);
1488 1490 break;
1489 1491
1490 1492 default:
1491 1493 #ifndef NDEBUG
1492 1494 (void) fprintf(stderr, "Restarter instance %s has "
1493 1495 "unknown state %d.\n", inst->ri_i.i_fmri, state);
1494 1496 #endif
1495 1497 abort();
1496 1498 }
1497 1499 }
1498 1500
1499 1501 return (0);
1500 1502 }
1501 1503
1502 1504 static void
1503 1505 start_instance(scf_handle_t *local_handle, restarter_inst_t *inst,
1504 1506 int32_t reason)
1505 1507 {
1506 1508 fork_info_t *info;
1507 1509 restarter_str_t new_reason;
1508 1510
1509 1511 assert(MUTEX_HELD(&inst->ri_lock));
1510 1512 assert(instance_in_transition(inst) == 0);
1511 1513 assert(inst->ri_method_thread == 0);
1512 1514
1513 1515 log_framework(LOG_DEBUG, "%s: trying to start instance\n",
1514 1516 inst->ri_i.i_fmri);
1515 1517
1516 1518 /*
1517 1519 * We want to keep the original reason for restarts and clear actions
1518 1520 */
1519 1521 switch (reason) {
1520 1522 case restarter_str_restart_request:
1521 1523 case restarter_str_clear_request:
1522 1524 new_reason = reason;
1523 1525 break;
1524 1526 default:
1525 1527 new_reason = restarter_str_dependencies_satisfied;
1526 1528 }
1527 1529
1528 1530 /* Services in the disabled and maintenance state are ignored */
1529 1531 if (inst->ri_i.i_state == RESTARTER_STATE_MAINT ||
1530 1532 inst->ri_i.i_state == RESTARTER_STATE_DISABLED ||
1531 1533 inst->ri_i.i_enabled == 0) {
1532 1534 log_framework(LOG_DEBUG,
1533 1535 "%s: start_instance -> is maint/disabled\n",
1534 1536 inst->ri_i.i_fmri);
1535 1537 return;
1536 1538 }
1537 1539
1538 1540 /* Already started instances are left alone */
1539 1541 if (instance_started(inst) == 1) {
1540 1542 log_framework(LOG_DEBUG,
1541 1543 "%s: start_instance -> is already started\n",
1542 1544 inst->ri_i.i_fmri);
1543 1545 return;
1544 1546 }
1545 1547
1546 1548 log_framework(LOG_DEBUG, "%s: starting instance.\n", inst->ri_i.i_fmri);
1547 1549
1548 1550 (void) restarter_instance_update_states(local_handle, inst,
1549 1551 inst->ri_i.i_state, RESTARTER_STATE_ONLINE, RERR_NONE, new_reason);
1550 1552
1551 1553 info = startd_zalloc(sizeof (fork_info_t));
1552 1554
1553 1555 info->sf_id = inst->ri_id;
1554 1556 info->sf_method_type = METHOD_START;
1555 1557 info->sf_event_type = RERR_NONE;
1556 1558 info->sf_reason = new_reason;
1557 1559 inst->ri_method_thread = startd_thread_create(method_thread, info);
1558 1560 }
1559 1561
1560 1562 static int
1561 1563 event_from_tty(scf_handle_t *h, restarter_inst_t *rip)
1562 1564 {
1563 1565 scf_instance_t *inst;
1564 1566 int ret = 0;
1565 1567
1566 1568 if (libscf_fmri_get_instance(h, rip->ri_i.i_fmri, &inst))
1567 1569 return (-1);
1568 1570
1569 1571 ret = restarter_inst_ractions_from_tty(inst);
1570 1572
1571 1573 scf_instance_destroy(inst);
1572 1574 return (ret);
1573 1575 }
1574 1576
1575 1577 static boolean_t
1576 1578 restart_dump(scf_handle_t *h, restarter_inst_t *rip)
1577 1579 {
1578 1580 scf_instance_t *inst;
1579 1581 boolean_t ret = B_FALSE;
1580 1582
1581 1583 if (libscf_fmri_get_instance(h, rip->ri_i.i_fmri, &inst))
1582 1584 return (-1);
1583 1585
1584 1586 if (restarter_inst_dump(inst) == 1)
1585 1587 ret = B_TRUE;
1586 1588
1587 1589 scf_instance_destroy(inst);
1588 1590 return (ret);
1589 1591 }
1590 1592
1591 1593 static void
1592 1594 maintain_instance(scf_handle_t *h, restarter_inst_t *rip, int immediate,
1593 1595 restarter_str_t reason)
1594 1596 {
1595 1597 fork_info_t *info;
1596 1598 scf_instance_t *scf_inst = NULL;
1597 1599
1598 1600 assert(MUTEX_HELD(&rip->ri_lock));
1599 1601 assert(reason != restarter_str_none);
1600 1602 assert(rip->ri_method_thread == 0);
1601 1603
1602 1604 log_instance(rip, B_TRUE, "Stopping for maintenance due to %s.",
1603 1605 restarter_get_str_short(reason));
1604 1606 log_framework(LOG_DEBUG, "%s: stopping for maintenance due to %s.\n",
1605 1607 rip->ri_i.i_fmri, restarter_get_str_short(reason));
1606 1608
1607 1609 /* Services in the maintenance state are ignored */
1608 1610 if (rip->ri_i.i_state == RESTARTER_STATE_MAINT) {
1609 1611 log_framework(LOG_DEBUG,
1610 1612 "%s: maintain_instance -> is already in maintenance\n",
1611 1613 rip->ri_i.i_fmri);
1612 1614 return;
1613 1615 }
1614 1616
1615 1617 /*
1616 1618 * If reason state is restarter_str_service_request and
1617 1619 * restarter_actions/auxiliary_fmri property is set with a valid fmri,
 1618 1620	 * copy the fmri to restarter/auxiliary_fmri so svcs -x can use it.
1619 1621 */
1620 1622 if (reason == restarter_str_service_request &&
1621 1623 libscf_fmri_get_instance(h, rip->ri_i.i_fmri, &scf_inst) == 0) {
1622 1624 if (restarter_inst_validate_ractions_aux_fmri(scf_inst) == 0) {
1623 1625 if (restarter_inst_set_aux_fmri(scf_inst))
1624 1626 log_framework(LOG_DEBUG, "%s: "
1625 1627 "restarter_inst_set_aux_fmri failed: ",
1626 1628 rip->ri_i.i_fmri);
1627 1629 } else {
1628 1630 log_framework(LOG_DEBUG, "%s: "
1629 1631 "restarter_inst_validate_ractions_aux_fmri "
1630 1632 "failed: ", rip->ri_i.i_fmri);
1631 1633
1632 1634 if (restarter_inst_reset_aux_fmri(scf_inst))
1633 1635 log_framework(LOG_DEBUG, "%s: "
1634 1636 "restarter_inst_reset_aux_fmri failed: ",
1635 1637 rip->ri_i.i_fmri);
1636 1638 }
1637 1639 scf_instance_destroy(scf_inst);
1638 1640 }
1639 1641
1640 1642 if (immediate || !instance_started(rip)) {
1641 1643 if (rip->ri_i.i_primary_ctid != 0) {
1642 1644 rip->ri_m_inst = safe_scf_instance_create(h);
1643 1645 rip->ri_mi_deleted = B_FALSE;
1644 1646
1645 1647 libscf_reget_instance(rip);
1646 1648 method_remove_contract(rip, B_TRUE, B_TRUE);
1647 1649
1648 1650 scf_instance_destroy(rip->ri_m_inst);
1649 1651 }
1650 1652
1651 1653 (void) restarter_instance_update_states(h, rip,
1652 1654 RESTARTER_STATE_MAINT, RESTARTER_STATE_NONE, RERR_RESTART,
1653 1655 reason);
1654 1656 return;
1655 1657 }
1656 1658
1657 1659 (void) restarter_instance_update_states(h, rip, rip->ri_i.i_state,
1658 1660 RESTARTER_STATE_MAINT, RERR_NONE, reason);
1659 1661
1660 1662 log_transition(rip, MAINT_REQUESTED);
1661 1663
1662 1664 info = startd_zalloc(sizeof (*info));
1663 1665 info->sf_id = rip->ri_id;
1664 1666 info->sf_method_type = METHOD_STOP;
1665 1667 info->sf_event_type = RERR_RESTART;
1666 1668 info->sf_reason = reason;
1667 1669 rip->ri_method_thread = startd_thread_create(method_thread, info);
1668 1670 }
1669 1671
1670 1672 static void
1671 1673 refresh_instance(scf_handle_t *h, restarter_inst_t *rip)
1672 1674 {
1673 1675 scf_instance_t *inst;
1674 1676 scf_snapshot_t *snap;
1675 1677 fork_info_t *info;
1676 1678 int r;
1677 1679
1678 1680 assert(MUTEX_HELD(&rip->ri_lock));
1679 1681
1680 1682 log_instance(rip, B_TRUE, "Rereading configuration.");
1681 1683 log_framework(LOG_DEBUG, "%s: rereading configuration.\n",
1682 1684 rip->ri_i.i_fmri);
1683 1685
1684 1686 rep_retry:
1685 1687 r = libscf_fmri_get_instance(h, rip->ri_i.i_fmri, &inst);
1686 1688 switch (r) {
1687 1689 case 0:
1688 1690 break;
1689 1691
1690 1692 case ECONNABORTED:
1691 1693 libscf_handle_rebind(h);
1692 1694 goto rep_retry;
1693 1695
1694 1696 case ENOENT:
1695 1697 /* Must have been deleted. */
1696 1698 return;
1697 1699
1698 1700 case EINVAL:
1699 1701 case ENOTSUP:
1700 1702 default:
1701 1703 bad_error("libscf_fmri_get_instance", r);
1702 1704 }
1703 1705
1704 1706 snap = libscf_get_running_snapshot(inst);
1705 1707
1706 1708 r = libscf_get_startd_properties(inst, snap, &rip->ri_flags,
1707 1709 &rip->ri_utmpx_prefix);
1708 1710 switch (r) {
1709 1711 case 0:
1710 1712 log_framework(LOG_DEBUG, "%s is a %s-style service\n",
1711 1713 rip->ri_i.i_fmri, service_style(rip->ri_flags));
1712 1714 break;
1713 1715
1714 1716 case ECONNABORTED:
1715 1717 scf_instance_destroy(inst);
1716 1718 scf_snapshot_destroy(snap);
1717 1719 libscf_handle_rebind(h);
1718 1720 goto rep_retry;
1719 1721
1720 1722 case ECANCELED:
1721 1723 case ENOENT:
1722 1724 /* Succeed in anticipation of REMOVE_INSTANCE. */
1723 1725 break;
1724 1726
1725 1727 default:
1726 1728 bad_error("libscf_get_startd_properties", r);
1727 1729 }
1728 1730
1729 1731 if (instance_started(rip)) {
1730 1732 /* Refresh does not change the state. */
1731 1733 (void) restarter_instance_update_states(h, rip,
1732 1734 rip->ri_i.i_state, rip->ri_i.i_state, RERR_NONE,
1733 1735 restarter_str_refresh);
1734 1736
1735 1737 info = startd_zalloc(sizeof (*info));
1736 1738 info->sf_id = rip->ri_id;
1737 1739 info->sf_method_type = METHOD_REFRESH;
1738 1740 info->sf_event_type = RERR_REFRESH;
1739 1741 info->sf_reason = NULL;
1740 1742
1741 1743 assert(rip->ri_method_thread == 0);
1742 1744 rip->ri_method_thread =
1743 1745 startd_thread_create(method_thread, info);
1744 1746 }
1745 1747
1746 1748 scf_snapshot_destroy(snap);
1747 1749 scf_instance_destroy(inst);
1748 1750 }
1749 1751
1750 1752 const char *event_names[] = { "INVALID", "ADD_INSTANCE", "REMOVE_INSTANCE",
1751 1753 "ENABLE", "DISABLE", "ADMIN_DEGRADED", "ADMIN_REFRESH",
1752 1754 "ADMIN_RESTART", "ADMIN_MAINT_OFF", "ADMIN_MAINT_ON",
1753 1755 "ADMIN_MAINT_ON_IMMEDIATE", "STOP", "START", "DEPENDENCY_CYCLE",
1754 1756 "INVALID_DEPENDENCY", "ADMIN_DISABLE", "STOP_RESET"
1755 1757 };
1756 1758
1757 1759 /*
1758 1760 * void *restarter_process_events()
1759 1761 *
1760 1762 * Called in a separate thread to process the events on an instance's
1761 1763 * queue. Empties the queue completely, and tries to keep the thread
1762 1764 * around for a little while after the queue is empty to save on
1763 1765 * startup costs.
1764 1766 */
1765 1767 static void *
1766 1768 restarter_process_events(void *arg)
1767 1769 {
1768 1770 scf_handle_t *h;
1769 1771 restarter_instance_qentry_t *event;
1770 1772 restarter_inst_t *rip;
1771 1773 char *fmri = (char *)arg;
1772 1774 struct timespec to;
1773 1775
1776 + (void) pthread_setname_np(pthread_self(), "restarter_process_events");
1777 +
1774 1778 assert(fmri != NULL);
1775 1779
1776 1780 h = libscf_handle_create_bound_loop();
1777 1781
1778 1782 /* grab the queue lock */
1779 1783 rip = inst_lookup_queue(fmri);
1780 1784 if (rip == NULL)
1781 1785 goto out;
1782 1786
1783 1787 again:
1784 1788
1785 1789 while ((event = uu_list_first(rip->ri_queue)) != NULL) {
1786 1790 restarter_inst_t *inst;
1787 1791
1788 1792 /* drop the queue lock */
1789 1793 MUTEX_UNLOCK(&rip->ri_queue_lock);
1790 1794
1791 1795 /*
1792 1796 * Grab the inst lock -- this waits until any outstanding
1793 1797 * method finishes running.
1794 1798 */
1795 1799 inst = inst_lookup_by_name(fmri);
1796 1800 if (inst == NULL) {
1797 1801 /* Getting deleted in the middle isn't an error. */
1798 1802 goto cont;
1799 1803 }
1800 1804
1801 1805 assert(instance_in_transition(inst) == 0);
1802 1806
1803 1807 /* process the event */
1804 1808 switch (event->riq_type) {
1805 1809 case RESTARTER_EVENT_TYPE_ENABLE:
1806 1810 case RESTARTER_EVENT_TYPE_DISABLE:
1807 1811 (void) enable_inst(h, inst, event);
1808 1812 break;
1809 1813
1810 1814 case RESTARTER_EVENT_TYPE_ADMIN_DISABLE:
1811 1815 if (enable_inst(h, inst, event) == 0)
1812 1816 reset_start_times(inst);
1813 1817 break;
1814 1818
1815 1819 case RESTARTER_EVENT_TYPE_REMOVE_INSTANCE:
1816 1820 restarter_delete_inst(inst);
1817 1821 inst = NULL;
1818 1822 goto cont;
1819 1823
1820 1824 case RESTARTER_EVENT_TYPE_STOP_RESET:
1821 1825 reset_start_times(inst);
1822 1826 /* FALLTHROUGH */
1823 1827 case RESTARTER_EVENT_TYPE_STOP:
1824 1828 (void) stop_instance(h, inst, RSTOP_DEPENDENCY);
1825 1829 break;
1826 1830
1827 1831 case RESTARTER_EVENT_TYPE_START:
1828 1832 start_instance(h, inst, event->riq_reason);
1829 1833 break;
1830 1834
1831 1835 case RESTARTER_EVENT_TYPE_DEPENDENCY_CYCLE:
1832 1836 maintain_instance(h, inst, 0,
1833 1837 restarter_str_dependency_cycle);
1834 1838 break;
1835 1839
1836 1840 case RESTARTER_EVENT_TYPE_INVALID_DEPENDENCY:
1837 1841 maintain_instance(h, inst, 0,
1838 1842 restarter_str_invalid_dependency);
1839 1843 break;
1840 1844
1841 1845 case RESTARTER_EVENT_TYPE_ADMIN_MAINT_ON:
1842 1846 if (event_from_tty(h, inst) == 0)
1843 1847 maintain_instance(h, inst, 0,
1844 1848 restarter_str_service_request);
1845 1849 else
1846 1850 maintain_instance(h, inst, 0,
1847 1851 restarter_str_administrative_request);
1848 1852 break;
1849 1853
1850 1854 case RESTARTER_EVENT_TYPE_ADMIN_MAINT_ON_IMMEDIATE:
1851 1855 if (event_from_tty(h, inst) == 0)
1852 1856 maintain_instance(h, inst, 1,
1853 1857 restarter_str_service_request);
1854 1858 else
1855 1859 maintain_instance(h, inst, 1,
1856 1860 restarter_str_administrative_request);
1857 1861 break;
1858 1862
1859 1863 case RESTARTER_EVENT_TYPE_ADMIN_MAINT_OFF:
1860 1864 unmaintain_instance(h, inst, RUNMAINT_CLEAR);
1861 1865 reset_start_times(inst);
1862 1866 break;
1863 1867
1864 1868 case RESTARTER_EVENT_TYPE_ADMIN_REFRESH:
1865 1869 refresh_instance(h, inst);
1866 1870 break;
1867 1871
1868 1872 case RESTARTER_EVENT_TYPE_ADMIN_DEGRADED:
1869 1873 log_framework(LOG_WARNING, "Restarter: "
1870 1874 "%s command (for %s) unimplemented.\n",
1871 1875 event_names[event->riq_type], inst->ri_i.i_fmri);
1872 1876 break;
1873 1877
1874 1878 case RESTARTER_EVENT_TYPE_ADMIN_RESTART:
1875 1879 if (!instance_started(inst)) {
1876 1880 log_framework(LOG_DEBUG, "Restarter: "
1877 1881 "Not restarting %s; not running.\n",
1878 1882 inst->ri_i.i_fmri);
1879 1883 } else {
1880 1884 /*
1881 1885 * Stop the instance. If it can be restarted,
1882 1886 * the graph engine will send a new event.
1883 1887 */
1884 1888 if (restart_dump(h, inst)) {
1885 1889 (void) contract_kill(
1886 1890 inst->ri_i.i_primary_ctid, SIGABRT,
1887 1891 inst->ri_i.i_fmri);
1888 1892 } else if (stop_instance(h, inst,
1889 1893 RSTOP_RESTART) == 0) {
1890 1894 reset_start_times(inst);
1891 1895 }
1892 1896 }
1893 1897 break;
1894 1898
1895 1899 case RESTARTER_EVENT_TYPE_ADD_INSTANCE:
1896 1900 default:
1897 1901 #ifndef NDEBUG
1898 1902 uu_warn("%s:%d: Bad restarter event %d. "
1899 1903 "Aborting.\n", __FILE__, __LINE__, event->riq_type);
1900 1904 #endif
1901 1905 abort();
1902 1906 }
1903 1907
1904 1908 assert(inst != NULL);
1905 1909 MUTEX_UNLOCK(&inst->ri_lock);
1906 1910
1907 1911 cont:
1908 1912 /* grab the queue lock */
1909 1913 rip = inst_lookup_queue(fmri);
1910 1914 if (rip == NULL)
1911 1915 goto out;
1912 1916
1913 1917 /* delete the event */
1914 1918 uu_list_remove(rip->ri_queue, event);
1915 1919 startd_free(event, sizeof (restarter_instance_qentry_t));
1916 1920 }
1917 1921
1918 1922 assert(rip != NULL);
1919 1923
1920 1924 /*
1921 1925 * Try to preserve the thread for a little while for future use.
1922 1926 */
1923 1927 to.tv_sec = 3;
1924 1928 to.tv_nsec = 0;
1925 1929 (void) pthread_cond_reltimedwait_np(&rip->ri_queue_cv,
1926 1930 &rip->ri_queue_lock, &to);
1927 1931
1928 1932 if (uu_list_first(rip->ri_queue) != NULL)
1929 1933 goto again;
1930 1934
1931 1935 rip->ri_queue_thread = 0;
1932 1936 MUTEX_UNLOCK(&rip->ri_queue_lock);
1933 1937
1934 1938 out:
1935 1939 (void) scf_handle_unbind(h);
1936 1940 scf_handle_destroy(h);
1937 1941 free(fmri);
1938 1942 return (NULL);
1939 1943 }
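The new pthread_setname_np() calls above (and in the other long-running thread entry points below) give each startd thread a stable name for debuggers and observability tools. A minimal, self-contained sketch of that naming pattern follows; the pthread_getname_np() read-back, the buffer size, and the thread name are illustrative assumptions rather than anything taken from this file, and on some platforms these interfaces require a feature-test macro such as _GNU_SOURCE. Compile with -lpthread where needed.

#include <pthread.h>
#include <stdio.h>

static void *
worker(void *arg)
{
	char name[32];

	/* Name the calling thread; over-long names are rejected. */
	(void) pthread_setname_np(pthread_self(), "example_worker");

	/* Read the name back, e.g. for a debug log line. */
	if (pthread_getname_np(pthread_self(), name, sizeof (name)) == 0)
		(void) printf("running as %s\n", name);

	return (arg);
}

int
main(void)
{
	pthread_t tid;

	(void) pthread_create(&tid, NULL, worker, NULL);
	(void) pthread_join(tid, NULL);
	return (0);
}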
1940 1944
1941 1945 static int
1942 -is_admin_event(restarter_event_type_t t) {
1943 -
1946 +is_admin_event(restarter_event_type_t t)
1947 +{
1944 1948 switch (t) {
1945 1949 case RESTARTER_EVENT_TYPE_ADMIN_MAINT_ON:
1946 1950 case RESTARTER_EVENT_TYPE_ADMIN_MAINT_ON_IMMEDIATE:
1947 1951 case RESTARTER_EVENT_TYPE_ADMIN_MAINT_OFF:
1948 1952 case RESTARTER_EVENT_TYPE_ADMIN_REFRESH:
1949 1953 case RESTARTER_EVENT_TYPE_ADMIN_DEGRADED:
1950 1954 case RESTARTER_EVENT_TYPE_ADMIN_RESTART:
1951 1955 return (1);
1952 1956 default:
1953 1957 return (0);
1954 1958 }
1955 1959 }
1956 1960
1957 1961 static void
1958 1962 restarter_queue_event(restarter_inst_t *ri, restarter_protocol_event_t *e)
1959 1963 {
1960 1964 restarter_instance_qentry_t *qe;
1961 1965 int r;
1962 1966
1963 1967 assert(MUTEX_HELD(&ri->ri_queue_lock));
1964 1968 assert(!MUTEX_HELD(&ri->ri_lock));
1965 1969
1966 1970 qe = startd_zalloc(sizeof (restarter_instance_qentry_t));
1967 1971 qe->riq_type = e->rpe_type;
1968 1972 qe->riq_reason = e->rpe_reason;
1969 1973
1970 1974 uu_list_node_init(qe, &qe->riq_link, restarter_queue_pool);
1971 1975 r = uu_list_insert_before(ri->ri_queue, NULL, qe);
1972 1976 assert(r == 0);
1973 1977 }
1974 1978
1975 1979 /*
1976 1980 * void *restarter_event_thread()
1977 1981 *
1978 1982 * Handle incoming graph events by placing them on a per-instance
1979 1983 * queue. We can't lock the main part of the instance structure, so
1980 1984 * just modify the separately locked event queue portion.
1981 1985 */
1982 1986 /*ARGSUSED*/
1983 1987 static void *
1984 1988 restarter_event_thread(void *unused)
1985 1989 {
1986 1990 scf_handle_t *h;
1987 1991
1992 + (void) pthread_setname_np(pthread_self(), "restarter_event");
1993 +
1988 1994 /*
1989 1995 * This is a new thread, and thus, gets its own handle
1990 1996 * to the repository.
1991 1997 */
1992 1998 h = libscf_handle_create_bound_loop();
1993 1999
1994 2000 MUTEX_LOCK(&ru->restarter_update_lock);
1995 2001
1996 2002 /*CONSTCOND*/
1997 2003 while (1) {
1998 2004 restarter_protocol_event_t *e;
1999 2005
2000 2006 while (ru->restarter_update_wakeup == 0)
2001 2007 (void) pthread_cond_wait(&ru->restarter_update_cv,
2002 2008 &ru->restarter_update_lock);
2003 2009
2004 2010 ru->restarter_update_wakeup = 0;
2005 2011
2006 2012 while ((e = restarter_event_dequeue()) != NULL) {
2007 2013 restarter_inst_t *rip;
2008 2014 char *fmri;
2009 2015
2010 2016 MUTEX_UNLOCK(&ru->restarter_update_lock);
2011 2017
2012 2018 /*
2013 2019 * ADD_INSTANCE is special: there's likely no
2014 2020 * instance structure yet, so we need to handle the
2015 2021 * addition synchronously.
2016 2022 */
2017 2023 switch (e->rpe_type) {
2018 2024 case RESTARTER_EVENT_TYPE_ADD_INSTANCE:
2019 2025 if (restarter_insert_inst(h, e->rpe_inst) != 0)
2020 2026 log_error(LOG_INFO, "Restarter: "
2021 2027 "Could not add %s.\n", e->rpe_inst);
2022 2028
2023 2029 MUTEX_LOCK(&st->st_load_lock);
2024 2030 if (--st->st_load_instances == 0)
2025 2031 (void) pthread_cond_broadcast(
2026 2032 &st->st_load_cv);
2027 2033 MUTEX_UNLOCK(&st->st_load_lock);
2028 2034
2029 2035 goto nolookup;
2030 2036 }
2031 2037
2032 2038 /*
2033 2039 * Lookup the instance, locking only the event queue.
2034 2040 * Can't grab ri_lock here because it might be held
2035 2041 * by a long-running method.
2036 2042 */
2037 2043 rip = inst_lookup_queue(e->rpe_inst);
2038 2044 if (rip == NULL) {
2039 2045 log_error(LOG_INFO, "Restarter: "
2040 2046 "Ignoring %s command for unknown service "
2041 2047 "%s.\n", event_names[e->rpe_type],
2042 2048 e->rpe_inst);
2043 2049 goto nolookup;
2044 2050 }
2045 2051
2046 2052 /* Keep ADMIN events from filling up the queue. */
2047 2053 if (is_admin_event(e->rpe_type) &&
2048 2054 uu_list_numnodes(rip->ri_queue) >
2049 2055 RINST_QUEUE_THRESHOLD) {
2050 2056 MUTEX_UNLOCK(&rip->ri_queue_lock);
2051 2057 log_instance(rip, B_TRUE, "Instance event "
2052 2058 "queue overflow. Dropping administrative "
2053 2059 "request.");
2054 2060 log_framework(LOG_DEBUG, "%s: Instance event "
2055 2061 "queue overflow. Dropping administrative "
2056 2062 "request.\n", rip->ri_i.i_fmri);
2057 2063 goto nolookup;
2058 2064 }
2059 2065
2060 2066 /* Now add the event to the instance queue. */
2061 2067 restarter_queue_event(rip, e);
2062 2068
2063 2069 if (rip->ri_queue_thread == 0) {
2064 2070 /*
2065 2071 * Start a thread if one isn't already
2066 2072 * running.
2067 2073 */
2068 2074 fmri = safe_strdup(e->rpe_inst);
2069 2075 rip->ri_queue_thread = startd_thread_create(
2070 2076 restarter_process_events, (void *)fmri);
2071 2077 } else {
2072 2078 /*
2073 2079 * Signal the existing thread that there's
2074 2080 * a new event.
2075 2081 */
2076 2082 (void) pthread_cond_broadcast(
2077 2083 &rip->ri_queue_cv);
2078 2084 }
2079 2085
2080 2086 MUTEX_UNLOCK(&rip->ri_queue_lock);
2081 2087 nolookup:
2082 2088 restarter_event_release(e);
2083 2089
2084 2090 MUTEX_LOCK(&ru->restarter_update_lock);
2085 2091 }
2086 2092 }
2087 2093
2088 2094 /*
2089 2095 * Unreachable for now -- there's currently no graceful cleanup
2090 2096 * called on exit().
2091 2097 */
2092 2098 (void) scf_handle_unbind(h);
2093 2099 scf_handle_destroy(h);
2094 2100 return (NULL);
2095 2101 }
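restarter_event_thread() and restarter_process_events() together form an "enqueue, then spawn or wake a worker" pattern: the event thread queues under ri_queue_lock, starts a per-instance worker if none is running, and otherwise broadcasts on ri_queue_cv so the lingering worker notices the new entry; the worker drains its queue and then waits briefly in pthread_cond_reltimedwait_np() before exiting. A condensed stand-alone sketch of that pattern, using illustrative names rather than startd interfaces (pthread_cond_reltimedwait_np() is the illumos-specific call used above):

#include <pthread.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

typedef struct queue {
	pthread_mutex_t	q_lock;
	pthread_cond_t	q_cv;
	int		q_have_worker;	/* nonzero while a worker is running */
	int		q_pending;	/* stand-in for the event list */
} queue_t;

static void *
worker_main(void *arg)
{
	queue_t *q = arg;
	struct timespec to;

	(void) pthread_mutex_lock(&q->q_lock);
again:
	while (q->q_pending > 0) {
		q->q_pending--;
		(void) pthread_mutex_unlock(&q->q_lock);
		(void) printf("processed one event\n");
		(void) pthread_mutex_lock(&q->q_lock);
	}

	/* Linger briefly in case another event arrives soon. */
	to.tv_sec = 3;
	to.tv_nsec = 0;
	(void) pthread_cond_reltimedwait_np(&q->q_cv, &q->q_lock, &to);
	if (q->q_pending > 0)
		goto again;

	q->q_have_worker = 0;
	(void) pthread_mutex_unlock(&q->q_lock);
	return (NULL);
}

static void
dispatch(queue_t *q)
{
	pthread_t tid;

	(void) pthread_mutex_lock(&q->q_lock);
	q->q_pending++;
	if (!q->q_have_worker) {
		/* No worker running: start one to drain the queue. */
		q->q_have_worker = 1;
		(void) pthread_create(&tid, NULL, worker_main, q);
		(void) pthread_detach(tid);
	} else {
		/* A worker is already running: wake it for the new event. */
		(void) pthread_cond_broadcast(&q->q_cv);
	}
	(void) pthread_mutex_unlock(&q->q_lock);
}

int
main(void)
{
	queue_t q = { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER,
	    0, 0 };

	dispatch(&q);
	dispatch(&q);
	(void) sleep(4);	/* give the detached worker time to drain */
	return (0);
}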
2096 2102
2097 2103 static restarter_inst_t *
2098 2104 contract_to_inst(ctid_t ctid)
2099 2105 {
2100 2106 restarter_inst_t *inst;
2101 2107 int id;
2102 2108
2103 2109 id = lookup_inst_by_contract(ctid);
2104 2110 if (id == -1)
2105 2111 return (NULL);
2106 2112
2107 2113 inst = inst_lookup_by_id(id);
2108 2114 if (inst != NULL) {
2109 2115 /*
2110 2116 * Since ri_lock isn't held by the contract id lookup, this
2111 2117 * instance may have been restarted and now be in a new
2112 2118 * contract, making the old contract no longer valid for this
2113 2119 * instance.
2114 2120 */
2115 2121 if (ctid != inst->ri_i.i_primary_ctid) {
2116 2122 MUTEX_UNLOCK(&inst->ri_lock);
2117 2123 inst = NULL;
2118 2124 }
2119 2125 }
2120 2126 return (inst);
2121 2127 }
2122 2128
2123 2129 /*
2124 2130 * void contract_action()
2125 2131 * Take action on contract events.
2126 2132 */
2127 2133 static void
2128 2134 contract_action(scf_handle_t *h, restarter_inst_t *inst, ctid_t id,
2129 2135 uint32_t type)
2130 2136 {
2131 2137 const char *fmri = inst->ri_i.i_fmri;
2132 2138
2133 2139 assert(MUTEX_HELD(&inst->ri_lock));
2134 2140
2135 2141 /*
2136 2142 * If startd has stopped this contract, there is no need to
2137 2143 * stop it again.
2138 2144 */
2139 2145 if (inst->ri_i.i_primary_ctid > 0 &&
2140 2146 inst->ri_i.i_primary_ctid_stopped)
2141 2147 return;
2142 2148
2143 2149 if ((type & (CT_PR_EV_EMPTY | CT_PR_EV_CORE | CT_PR_EV_SIGNAL
2144 2150 | CT_PR_EV_HWERR)) == 0) {
2145 2151 /*
2146 2152 * There shouldn't be other events, since that's not how we set
2147 2153 * the terms. Thus, just log an error and drive on.
2148 2154 */
2149 2155 log_framework(LOG_NOTICE,
2150 2156 "%s: contract %ld received unexpected critical event "
2151 2157 "(%d)\n", fmri, id, type);
2152 2158 return;
2153 2159 }
2154 2160
2155 2161 assert(instance_in_transition(inst) == 0);
2156 2162
2157 2163 if (instance_is_wait_style(inst)) {
2158 2164 /*
2159 2165 * We ignore all events; if they impact the
2160 2166 * process we're monitoring, then the
2161 2167 * wait_thread will stop the instance.
2162 2168 */
2163 2169 log_framework(LOG_DEBUG,
2164 2170 "%s: ignoring contract event on wait-style service\n",
2165 2171 fmri);
2166 2172 } else {
2167 2173 /*
2168 2174 * A CT_PR_EV_EMPTY event is an RSTOP_EXIT request.
2169 2175 */
2170 2176 switch (type) {
2171 2177 case CT_PR_EV_EMPTY:
2172 2178 (void) stop_instance(h, inst, RSTOP_EXIT);
2173 2179 break;
2174 2180 case CT_PR_EV_CORE:
2175 2181 (void) stop_instance(h, inst, RSTOP_CORE);
2176 2182 break;
2177 2183 case CT_PR_EV_SIGNAL:
2178 2184 (void) stop_instance(h, inst, RSTOP_SIGNAL);
2179 2185 break;
2180 2186 case CT_PR_EV_HWERR:
2181 2187 (void) stop_instance(h, inst, RSTOP_HWERR);
2182 2188 break;
2183 2189 }
2184 2190 }
2185 2191 }
2186 2192
2187 2193 /*
2188 2194 * void *restarter_contract_event_thread(void *)
2189 2195 * Listens to the process contract bundle for critical events, taking action
2190 2196 * on events from contracts we know we are responsible for.
2191 2197 */
2192 2198 /*ARGSUSED*/
2193 2199 static void *
2194 2200 restarter_contracts_event_thread(void *unused)
2195 2201 {
2196 2202 int fd, err;
2197 2203 scf_handle_t *local_handle;
2198 2204
2205 + (void) pthread_setname_np(pthread_self(), "restarter_contracts_event");
2206 +
2199 2207 /*
2200 2208 * Await graph load completion. That is, stop here until we've scanned
2201 2209 * the repository for contract-instance associations.
2202 2210 */
2203 2211 MUTEX_LOCK(&st->st_load_lock);
2204 2212 while (!(st->st_load_complete && st->st_load_instances == 0))
2205 2213 (void) pthread_cond_wait(&st->st_load_cv, &st->st_load_lock);
2206 2214 MUTEX_UNLOCK(&st->st_load_lock);
2207 2215
2208 2216 /*
2209 2217 * This is a new thread, and thus, gets its own handle
2210 2218 * to the repository.
2211 2219 */
2212 2220 if ((local_handle = libscf_handle_create_bound(SCF_VERSION)) == NULL)
2213 2221 uu_die("Unable to bind a new repository handle: %s\n",
2214 2222 scf_strerror(scf_error()));
2215 2223
2216 2224 fd = open64(CTFS_ROOT "/process/pbundle", O_RDONLY);
2217 2225 if (fd == -1)
2218 2226 uu_die("process bundle open failed");
2219 2227
2220 2228 /*
2221 2229 * Make sure we get all events (including those generated by configd
2222 2230 * before this thread was started).
2223 2231 */
2224 2232 err = ct_event_reset(fd);
2225 2233 assert(err == 0);
2226 2234
2227 2235 for (;;) {
2228 2236 int efd, sfd;
2229 2237 ct_evthdl_t ev;
2230 2238 uint32_t type;
2231 2239 ctevid_t evid;
2232 2240 ct_stathdl_t status;
2233 2241 ctid_t ctid;
2234 2242 restarter_inst_t *inst;
2235 2243 uint64_t cookie;
2236 2244
2237 2245 if (err = ct_event_read_critical(fd, &ev)) {
2238 2246 log_error(LOG_WARNING,
2239 2247 "Error reading next contract event: %s",
2240 2248 strerror(err));
2241 2249 continue;
2242 2250 }
2243 2251
2244 2252 evid = ct_event_get_evid(ev);
2245 2253 ctid = ct_event_get_ctid(ev);
2246 2254 type = ct_event_get_type(ev);
2247 2255
2248 2256 /* Fetch cookie. */
2249 2257 if ((sfd = contract_open(ctid, "process", "status", O_RDONLY))
2250 2258 < 0) {
2251 2259 ct_event_free(ev);
2252 2260 continue;
2253 2261 }
2254 2262
2255 2263 if (err = ct_status_read(sfd, CTD_COMMON, &status)) {
2256 2264 log_framework(LOG_WARNING, "Could not get status for "
2257 2265 "contract %ld: %s\n", ctid, strerror(err));
2258 2266
2259 2267 startd_close(sfd);
2260 2268 ct_event_free(ev);
2261 2269 continue;
2262 2270 }
2263 2271
2264 2272 cookie = ct_status_get_cookie(status);
2265 2273
2266 2274 log_framework(LOG_DEBUG, "Received event %d for ctid %ld "
2267 2275 "cookie %lld\n", type, ctid, cookie);
2268 2276
2269 2277 ct_status_free(status);
2270 2278
2271 2279 startd_close(sfd);
2272 2280
2273 2281 /*
2274 2282 * svc.configd(1M) restart handling is performed by the
2275 2283 * fork_configd_thread. We don't acknowledge, as that thread
2276 2284 * will do so.
2277 2285 */
2278 2286 if (cookie == CONFIGD_COOKIE) {
2279 2287 ct_event_free(ev);
2280 2288 continue;
2281 2289 }
2282 2290
2283 2291 inst = NULL;
2284 2292 if (storing_contract != 0 &&
2285 2293 (inst = contract_to_inst(ctid)) == NULL) {
2286 2294 /*
2287 2295 * This can happen for two reasons:
2288 2296 * - method_run() has not yet stored the
2289 2297 * contract into the internal hash table.
2290 2298 * - we receive an EMPTY event for an abandoned
2291 2299 * contract.
2292 2300 * If there is any contract in the process of
2293 2301 * being stored into the hash table then re-read
2294 2302 * the event later.
2295 2303 */
2296 2304 log_framework(LOG_DEBUG,
2297 2305 "Reset event %d for unknown "
2298 2306 "contract id %ld\n", type, ctid);
2299 2307
2300 2308 /* don't go too fast */
2301 2309 (void) poll(NULL, 0, 100);
2302 2310
2303 2311 (void) ct_event_reset(fd);
2304 2312 ct_event_free(ev);
2305 2313 continue;
2306 2314 }
2307 2315
2308 2316 /*
2309 2317 * Do not call contract_to_inst() again if first
2310 2318 * call succeeded.
2311 2319 */
2312 2320 if (inst == NULL)
2313 2321 inst = contract_to_inst(ctid);
2314 2322 if (inst == NULL) {
2315 2323 /*
2316 2324 * This can happen if we receive an EMPTY
2317 2325 * event for an abandoned contract.
2318 2326 */
2319 2327 log_framework(LOG_DEBUG,
2320 2328 "Received event %d for unknown contract id "
2321 2329 "%ld\n", type, ctid);
2322 2330 } else {
2323 2331 log_framework(LOG_DEBUG,
2324 2332 "Received event %d for contract id "
2325 2333 "%ld (%s)\n", type, ctid,
2326 2334 inst->ri_i.i_fmri);
2327 2335
2328 2336 contract_action(local_handle, inst, ctid, type);
2329 2337
2330 2338 MUTEX_UNLOCK(&inst->ri_lock);
2331 2339 }
2332 2340
2333 2341 efd = contract_open(ct_event_get_ctid(ev), "process", "ctl",
2334 2342 O_WRONLY);
2335 2343 if (efd != -1) {
2336 2344 (void) ct_ctl_ack(efd, evid);
2337 2345 startd_close(efd);
2338 2346 }
2339 2347
2340 2348 ct_event_free(ev);
2341 2349
2342 2350 }
2343 2351
2344 2352 /*NOTREACHED*/
2345 2353 return (NULL);
2346 2354 }
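For readers unfamiliar with libcontract(3LIB), the loop above reduces to: open the process contract bundle, block in ct_event_read_critical(), inspect the event, and acknowledge it on that contract's ctl node. A pared-down stand-alone sketch follows; the ctl-path construction stands in for startd's contract_open() helper, error handling and privilege concerns are minimal, and it links against -lcontract.

#include <sys/types.h>
#include <sys/ctfs.h>
#include <libcontract.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>

int
main(void)
{
	int fd, err;

	/* Watch the process contract bundle for critical events. */
	fd = open(CTFS_ROOT "/process/pbundle", O_RDONLY);
	if (fd == -1) {
		(void) fprintf(stderr, "pbundle: %s\n", strerror(errno));
		return (1);
	}

	for (;;) {
		ct_evthdl_t ev;
		char path[64];
		int efd;

		if ((err = ct_event_read_critical(fd, &ev)) != 0) {
			(void) fprintf(stderr, "read: %s\n", strerror(err));
			continue;
		}

		(void) printf("event %u on contract %ld\n",
		    ct_event_get_type(ev), (long)ct_event_get_ctid(ev));

		/* Acknowledge on the contract's ctl node, as startd does. */
		(void) snprintf(path, sizeof (path), CTFS_ROOT
		    "/process/%ld/ctl", (long)ct_event_get_ctid(ev));
		if ((efd = open(path, O_WRONLY)) != -1) {
			(void) ct_ctl_ack(efd, ct_event_get_evid(ev));
			(void) close(efd);
		}

		ct_event_free(ev);
	}
	/* NOTREACHED */
}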
2347 2355
2348 2356 /*
2349 2357 * Timeout queue, processed by restarter_timeouts_event_thread().
2350 2358 */
2351 2359 timeout_queue_t *timeouts;
2352 2360 static uu_list_pool_t *timeout_pool;
2353 2361
2354 2362 typedef struct timeout_update {
2355 2363 pthread_mutex_t tu_lock;
2356 2364 pthread_cond_t tu_cv;
2357 2365 int tu_wakeup;
2358 2366 } timeout_update_t;
2359 2367
2360 2368 timeout_update_t *tu;
2361 2369
2362 2370 static const char *timeout_ovr_svcs[] = {
2363 2371 "svc:/system/manifest-import:default",
2364 2372 "svc:/network/initial:default",
2365 2373 "svc:/network/service:default",
2366 2374 "svc:/system/rmtmpfiles:default",
2367 2375 "svc:/network/loopback:default",
2368 2376 "svc:/network/physical:default",
2369 2377 "svc:/system/device/local:default",
2370 2378 "svc:/system/filesystem/usr:default",
2371 2379 "svc:/system/filesystem/minimal:default",
2372 2380 "svc:/system/filesystem/local:default",
2373 2381 NULL
2374 2382 };
2375 2383
2376 2384 int
2377 2385 is_timeout_ovr(restarter_inst_t *inst)
2378 2386 {
2379 2387 int i;
2380 2388
2381 2389 for (i = 0; timeout_ovr_svcs[i] != NULL; ++i) {
2382 2390 if (strcmp(inst->ri_i.i_fmri, timeout_ovr_svcs[i]) == 0) {
2383 2391 log_instance(inst, B_TRUE, "Timeout override by "
2384 2392 "svc.startd. Using infinite timeout.");
2385 2393 return (1);
2386 2394 }
2387 2395 }
2388 2396
2389 2397 return (0);
2390 2398 }
2391 2399
2392 2400 /*ARGSUSED*/
2393 2401 static int
2394 2402 timeout_compare(const void *lc_arg, const void *rc_arg, void *private)
2395 2403 {
2396 2404 hrtime_t t1 = ((const timeout_entry_t *)lc_arg)->te_timeout;
2397 2405 hrtime_t t2 = ((const timeout_entry_t *)rc_arg)->te_timeout;
2398 2406
2399 2407 if (t1 > t2)
2400 2408 return (1);
2401 2409 else if (t1 < t2)
2402 2410 return (-1);
2403 2411 return (0);
2404 2412 }
2405 2413
2406 2414 void
2407 2415 timeout_init()
2408 2416 {
2409 2417 timeouts = startd_zalloc(sizeof (timeout_queue_t));
2410 2418
2411 2419 (void) pthread_mutex_init(&timeouts->tq_lock, &mutex_attrs);
2412 2420
2413 2421 timeout_pool = startd_list_pool_create("timeouts",
2414 2422 sizeof (timeout_entry_t), offsetof(timeout_entry_t, te_link),
2415 2423 timeout_compare, UU_LIST_POOL_DEBUG);
2416 2424 assert(timeout_pool != NULL);
2417 2425
2418 2426 timeouts->tq_list = startd_list_create(timeout_pool,
2419 2427 timeouts, UU_LIST_SORTED);
2420 2428 assert(timeouts->tq_list != NULL);
2421 2429
2422 2430 tu = startd_zalloc(sizeof (timeout_update_t));
2423 2431 (void) pthread_cond_init(&tu->tu_cv, NULL);
2424 2432 (void) pthread_mutex_init(&tu->tu_lock, &mutex_attrs);
2425 2433 }
2426 2434
2427 2435 void
2428 2436 timeout_insert(restarter_inst_t *inst, ctid_t cid, uint64_t timeout_sec)
2429 2437 {
2430 2438 hrtime_t now, timeout;
2431 2439 timeout_entry_t *entry;
2432 2440 uu_list_index_t idx;
2433 2441
2434 2442 assert(MUTEX_HELD(&inst->ri_lock));
2435 2443
2436 2444 now = gethrtime();
2437 2445
2438 2446 /*
2439 2447 * If we overflow LLONG_MAX, we're never timing out anyway, so
2440 2448 * just return.
2441 2449 */
2442 2450 if (timeout_sec >= (LLONG_MAX - now) / 1000000000LL) {
2443 2451 log_instance(inst, B_TRUE, "timeout_seconds too large, "
2444 2452 "treating as infinite.");
2445 2453 return;
2446 2454 }
2447 2455
2448 2456 /* hrtime is in nanoseconds. Convert timeout_sec. */
2449 2457 timeout = now + (timeout_sec * 1000000000LL);
2450 2458
2451 2459 entry = startd_alloc(sizeof (timeout_entry_t));
2452 2460 entry->te_timeout = timeout;
2453 2461 entry->te_ctid = cid;
2454 2462 entry->te_fmri = safe_strdup(inst->ri_i.i_fmri);
2455 2463 entry->te_logstem = safe_strdup(inst->ri_logstem);
2456 2464 entry->te_fired = 0;
2457 2465 /* Insert the calculated timeout time onto the queue. */
2458 2466 MUTEX_LOCK(&timeouts->tq_lock);
2459 2467 (void) uu_list_find(timeouts->tq_list, entry, NULL, &idx);
2460 2468 uu_list_node_init(entry, &entry->te_link, timeout_pool);
2461 2469 uu_list_insert(timeouts->tq_list, entry, idx);
2462 2470 MUTEX_UNLOCK(&timeouts->tq_lock);
2463 2471
2464 2472 assert(inst->ri_timeout == NULL);
2465 2473 inst->ri_timeout = entry;
2466 2474
2467 2475 MUTEX_LOCK(&tu->tu_lock);
2468 2476 tu->tu_wakeup = 1;
2469 2477 (void) pthread_cond_broadcast(&tu->tu_cv);
2470 2478 MUTEX_UNLOCK(&tu->tu_lock);
2471 2479 }
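The overflow guard in timeout_insert() deserves a note: hrtime_t is signed nanoseconds, so now + timeout_sec * 1000000000 can exceed LLONG_MAX for very large timeout_seconds values, and such timeouts are simply treated as infinite. A small stand-alone illustration of that conversion; the function name and the 0-means-no-deadline convention are assumptions for illustration, not startd code.

#include <sys/time.h>	/* gethrtime() and hrtime_t (illumos) */
#include <limits.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Convert a timeout in seconds to an absolute hrtime deadline, returning
 * 0 ("no deadline") when the arithmetic would overflow.  This mirrors the
 * guard in timeout_insert().
 */
static hrtime_t
deadline_from_seconds(uint64_t timeout_sec)
{
	hrtime_t now = gethrtime();

	/* hrtime_t is signed nanoseconds; don't exceed LLONG_MAX. */
	if (timeout_sec >= (LLONG_MAX - now) / 1000000000LL)
		return (0);

	return (now + (hrtime_t)timeout_sec * 1000000000LL);
}

int
main(void)
{
	(void) printf("30s deadline: %lld\n",
	    (long long)deadline_from_seconds(30));
	(void) printf("huge timeout: %lld\n",
	    (long long)deadline_from_seconds(UINT64_MAX));
	return (0);
}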
2472 2480
2473 2481
2474 2482 void
2475 2483 timeout_remove(restarter_inst_t *inst, ctid_t cid)
2476 2484 {
2477 2485 assert(MUTEX_HELD(&inst->ri_lock));
2478 2486
2479 2487 if (inst->ri_timeout == NULL)
2480 2488 return;
2481 2489
2482 2490 assert(inst->ri_timeout->te_ctid == cid);
2483 2491
2484 2492 MUTEX_LOCK(&timeouts->tq_lock);
2485 2493 uu_list_remove(timeouts->tq_list, inst->ri_timeout);
2486 2494 MUTEX_UNLOCK(&timeouts->tq_lock);
2487 2495
2488 2496 free(inst->ri_timeout->te_fmri);
2489 2497 free(inst->ri_timeout->te_logstem);
2490 2498 startd_free(inst->ri_timeout, sizeof (timeout_entry_t));
2491 2499 inst->ri_timeout = NULL;
2492 2500 }
2493 2501
2494 2502 static int
2495 2503 timeout_now()
2496 2504 {
2497 2505 timeout_entry_t *e;
2498 2506 hrtime_t now;
2499 2507 int ret;
2500 2508
2501 2509 now = gethrtime();
2502 2510
2503 2511 /*
2504 2512 * Walk through the (sorted) timeouts list. While the timeout
2505 2513 * at the head of the list is <= the current time, kill the
2506 2514 * method.
2507 2515 */
2508 2516 MUTEX_LOCK(&timeouts->tq_lock);
2509 2517
2510 2518 for (e = uu_list_first(timeouts->tq_list);
2511 2519 e != NULL && e->te_timeout <= now;
2512 2520 e = uu_list_next(timeouts->tq_list, e)) {
2513 2521 log_framework(LOG_WARNING, "%s: Method or service exit timed "
2514 2522 "out. Killing contract %ld.\n", e->te_fmri, e->te_ctid);
2515 2523 log_instance_fmri(e->te_fmri, e->te_logstem, B_TRUE,
2516 2524 "Method or service exit timed out. Killing contract %ld.",
2517 2525 e->te_ctid);
2518 2526 e->te_fired = 1;
2519 2527 (void) contract_kill(e->te_ctid, SIGKILL, e->te_fmri);
2520 2528 }
2521 2529
2522 2530 if (uu_list_numnodes(timeouts->tq_list) > 0)
2523 2531 ret = 0;
2524 2532 else
2525 2533 ret = -1;
2526 2534
2527 2535 MUTEX_UNLOCK(&timeouts->tq_lock);
2528 2536
2529 2537 return (ret);
2530 2538 }
2531 2539
2532 2540 /*
2533 2541 * void *restarter_timeouts_event_thread(void *)
2534 2542 * Responsible for monitoring the method timeouts. This thread must
2535 2543 * be started before any methods are called.
2536 2544 */
2537 2545 /*ARGSUSED*/
2538 2546 static void *
2539 2547 restarter_timeouts_event_thread(void *unused)
2540 2548 {
2541 2549 /*
2542 2550 * Timeouts are entered on a priority queue, which is processed by
2543 2551 * this thread. As timeouts are specified in seconds, we'll do
2544 2552 * the necessary processing every second, as long as the queue
2545 2553 * is not empty.
2546 2554 */
2547 2555
2556 + (void) pthread_setname_np(pthread_self(), "restarter_timeouts_event");
2557 +
2548 2558 /*CONSTCOND*/
2549 2559 while (1) {
2550 2560 /*
2551 2561 * As long as the timeout list isn't empty, process it
2552 2562 * every second.
2553 2563 */
2554 2564 if (timeout_now() == 0) {
2555 2565 (void) sleep(1);
2556 2566 continue;
2557 2567 }
2558 2568
2559 2569 /* The list is empty, wait until we have more timeouts. */
2560 2570 MUTEX_LOCK(&tu->tu_lock);
2561 2571
2562 2572 while (tu->tu_wakeup == 0)
2563 2573 (void) pthread_cond_wait(&tu->tu_cv, &tu->tu_lock);
2564 2574
2565 2575 tu->tu_wakeup = 0;
2566 2576 MUTEX_UNLOCK(&tu->tu_lock);
2567 2577 }
2568 2578
2569 2579 return (NULL);
2570 2580 }
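The tu_wakeup flag and tu_cv shared by timeout_insert() and this thread are the standard predicate-protected condition-variable handshake: the producer sets the flag under the lock before broadcasting, and the waiter re-checks the flag in a loop so spurious wakeups and early broadcasts are harmless. Reduced to a stand-alone sketch with illustrative names:

#include <pthread.h>
#include <stdio.h>

typedef struct wakeup {
	pthread_mutex_t	w_lock;
	pthread_cond_t	w_cv;
	int		w_flag;
} wakeup_t;

static wakeup_t w = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0
};

/* Waiter: block until the flag is set; the loop absorbs spurious wakeups. */
static void *
waiter(void *arg)
{
	(void) pthread_mutex_lock(&w.w_lock);
	while (w.w_flag == 0)
		(void) pthread_cond_wait(&w.w_cv, &w.w_lock);
	w.w_flag = 0;
	(void) pthread_mutex_unlock(&w.w_lock);
	(void) printf("woken\n");
	return (arg);
}

int
main(void)
{
	pthread_t tid;

	(void) pthread_create(&tid, NULL, waiter, NULL);

	/* Producer: set the flag under the lock, then broadcast. */
	(void) pthread_mutex_lock(&w.w_lock);
	w.w_flag = 1;
	(void) pthread_cond_broadcast(&w.w_cv);
	(void) pthread_mutex_unlock(&w.w_lock);

	(void) pthread_join(tid, NULL);
	return (0);
}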
2571 2581
2572 2582 void
2573 2583 restarter_start()
2574 2584 {
2575 2585 (void) startd_thread_create(restarter_timeouts_event_thread, NULL);
2576 2586 (void) startd_thread_create(restarter_event_thread, NULL);
2577 2587 (void) startd_thread_create(restarter_contracts_event_thread, NULL);
2578 2588 (void) startd_thread_create(wait_thread, NULL);
2579 2589 }
2580 2590
2581 2591
2582 2592 void
2583 2593 restarter_init()
2584 2594 {
2585 2595 restarter_instance_pool = startd_list_pool_create("restarter_instances",
2586 2596 sizeof (restarter_inst_t), offsetof(restarter_inst_t,
2587 2597 ri_link), restarter_instance_compare, UU_LIST_POOL_DEBUG);
2588 2598 (void) memset(&instance_list, 0, sizeof (instance_list));
2589 2599
2590 2600 (void) pthread_mutex_init(&instance_list.ril_lock, &mutex_attrs);
2591 2601 instance_list.ril_instance_list = startd_list_create(
2592 2602 restarter_instance_pool, &instance_list, UU_LIST_SORTED);
2593 2603
2594 2604 restarter_queue_pool = startd_list_pool_create(
2595 2605 "restarter_instance_queue", sizeof (restarter_instance_qentry_t),
2596 2606 offsetof(restarter_instance_qentry_t, riq_link), NULL,
2597 2607 UU_LIST_POOL_DEBUG);
2598 2608
2599 2609 contract_list_pool = startd_list_pool_create(
2600 2610 "contract_list", sizeof (contract_entry_t),
2601 2611 offsetof(contract_entry_t, ce_link), NULL,
2602 2612 UU_LIST_POOL_DEBUG);
2603 2613 contract_hash_init();
2604 2614
2605 2615 log_framework(LOG_DEBUG, "Initialized restarter\n");
2606 2616 }