Print this page
2831 svc.startd and svc.configd waste memory.
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/cmd/svc/startd/restarter.c
+++ new/usr/src/cmd/svc/startd/restarter.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 */
25 25
26 26 /*
27 27 * restarter.c - service manipulation
28 28 *
29 29 * This component manages services whose restarter is svc.startd, the standard
30 30 * restarter. It translates restarter protocol events from the graph engine
31 31 * into actions on processes, as a delegated restarter would do.
32 32 *
33 33 * The master restarter manages a number of always-running threads:
34 34 * - restarter event thread: events from the graph engine
35 35 * - timeout thread: thread to fire queued timeouts
36 36 * - contract thread: thread to handle contract events
37 37 * - wait thread: thread to handle wait-based services
38 38 *
39 39 * The other threads are created as-needed:
40 40 * - per-instance method threads
41 41 * - per-instance event processing threads
42 42 *
43 43 * The interaction of all threads must result in the following conditions
44 44 * being satisfied (on a per-instance basis):
45 45 * - restarter events must be processed in order
46 46 * - method execution must be serialized
47 47 * - instance delete must be held until outstanding methods are complete
48 48 * - contract events shouldn't be processed while a method is running
49 49 * - timeouts should fire even when a method is running
50 50 *
51 51 * Service instances are represented by restarter_inst_t's and are kept in the
52 52 * instance_list list.
53 53 *
54 54 * Service States
55 55 * The current state of a service instance is kept in
56 56 * restarter_inst_t->ri_i.i_state. If transition to a new state could take
57 57 * some time, then before we effect the transition we set
58 58 * restarter_inst_t->ri_i.i_next_state to the target state, and afterwards we
59 59 * rotate i_next_state to i_state and set i_next_state to
60 60 * RESTARTER_STATE_NONE. So usually i_next_state is _NONE when ri_lock is not
61 61 * held. The exception is when we launch methods, which are done with
62 62 * a separate thread. To keep any other threads from grabbing ri_lock before
63 63 * method_thread() does, we set ri_method_thread to the thread id of the
64 64 * method thread, and when it is nonzero any thread with a different thread id
65 65 * waits on ri_method_cv.
66 66 *
67 67 * Method execution is serialized by blocking on ri_method_cv in
68 68 * inst_lookup_by_id() and waiting for a 0 value of ri_method_thread. This
69 69 * also prevents the instance structure from being deleted until all
70 70 * outstanding operations such as method_thread() have finished.
71 71 *
72 72 * Lock ordering:
73 73 *
74 74 * dgraph_lock [can be held when taking:]
75 75 * utmpx_lock
76 76 * dictionary->dict_lock
77 77 * st->st_load_lock
78 78 * wait_info_lock
79 79 * ru->restarter_update_lock
80 80 * restarter_queue->rpeq_lock
81 81 * instance_list.ril_lock
82 82 * inst->ri_lock
83 83 * st->st_configd_live_lock
84 84 *
85 85 * instance_list.ril_lock
86 86 * graph_queue->gpeq_lock
87 87 * gu->gu_lock
88 88 * st->st_configd_live_lock
89 89 * dictionary->dict_lock
90 90 * inst->ri_lock
91 91 * graph_queue->gpeq_lock
92 92 * gu->gu_lock
93 93 * tu->tu_lock
94 94 * tq->tq_lock
95 95 * inst->ri_queue_lock
96 96 * wait_info_lock
97 97 * bp->cb_lock
98 98 * utmpx_lock
99 99 *
100 100 * single_user_thread_lock
101 101 * wait_info_lock
102 102 * utmpx_lock
103 103 *
104 104 * gu_freeze_lock
105 105 *
106 106 * logbuf_mutex nests inside pretty much everything.
107 107 */
108 108
109 109 #include <sys/contract/process.h>
110 110 #include <sys/ctfs.h>
111 111 #include <sys/stat.h>
112 112 #include <sys/time.h>
113 113 #include <sys/types.h>
114 114 #include <sys/uio.h>
115 115 #include <sys/wait.h>
116 116 #include <assert.h>
117 117 #include <errno.h>
118 118 #include <fcntl.h>
119 119 #include <libcontract.h>
120 120 #include <libcontract_priv.h>
121 121 #include <libintl.h>
122 122 #include <librestart.h>
123 123 #include <librestart_priv.h>
124 124 #include <libuutil.h>
125 125 #include <limits.h>
126 126 #include <poll.h>
127 127 #include <port.h>
128 128 #include <pthread.h>
129 129 #include <stdarg.h>
130 130 #include <stdio.h>
131 131 #include <strings.h>
132 132 #include <unistd.h>
133 133
134 134 #include "startd.h"
135 135 #include "protocol.h"
136 136
/* List pool that each instance's ri_link node is initialized against. */
137 137 static uu_list_pool_t *restarter_instance_pool;
/* All known instances (ril_instance_list); list operations take ril_lock. */
138 138 static restarter_instance_list_t instance_list;
139 139
/* List pool used to create each instance's ri_queue. */
140 140 static uu_list_pool_t *restarter_queue_pool;
141 141
142 142 /*
143 143 * Function used to reset the restart times for an instance, when
144 144 * an administrative task comes along and essentially makes the times
145 145 * in this array ineffective.
146 146 */
147 147 static void
148 148 reset_start_times(restarter_inst_t *inst)
149 149 {
/* Rewind the index and zero everything stored in ri_start_time. */
150 150 inst->ri_start_index = 0;
151 151 bzero(inst->ri_start_time, sizeof (inst->ri_start_time));
152 152 }
153 153
/*
 * List comparison callback: orders an instance (lc_arg) against a bare
 * integer id (rc_arg points at an int, not at an instance).
 * Returns 1, -1 or 0 as the instance's ri_id is greater than, less than or
 * equal to that id. The private argument is unused.
 */
154 154 /*ARGSUSED*/
155 155 static int
156 156 restarter_instance_compare(const void *lc_arg, const void *rc_arg,
157 157 void *private)
158 158 {
159 159 int lc_id = ((const restarter_inst_t *)lc_arg)->ri_id;
160 160 int rc_id = *(int *)rc_arg;
161 161
162 162 if (lc_id > rc_id)
163 163 return (1);
164 164 if (lc_id < rc_id)
165 165 return (-1);
166 166 return (0);
167 167 }
168 168
/*
 * Look up an instance by name.
 * Returns NULL when the name has no dictionary id; otherwise defers to
 * inst_lookup_by_id(), so a non-NULL result comes back with ri_lock held.
 */
169 169 static restarter_inst_t *
170 170 inst_lookup_by_name(const char *name)
171 171 {
172 172 int id;
173 173
174 174 id = dict_lookup_byname(name);
175 175 if (id == -1)
176 176 return (NULL);
177 177
178 178 return (inst_lookup_by_id(id));
179 179 }
180 180
/*
 * Find the instance with the given id in instance_list.
 * Returns NULL if there is none. Otherwise returns the instance with its
 * ri_lock held, after waiting for any method thread other than the caller
 * to finish (ri_method_thread back to 0).
 */
181 181 restarter_inst_t *
182 182 inst_lookup_by_id(int id)
183 183 {
184 184 restarter_inst_t *inst;
185 185
/* ri_lock is taken before ril_lock is dropped. */
186 186 MUTEX_LOCK(&instance_list.ril_lock);
187 187 inst = uu_list_find(instance_list.ril_instance_list, &id, NULL, NULL);
188 188 if (inst != NULL)
189 189 MUTEX_LOCK(&inst->ri_lock);
190 190 MUTEX_UNLOCK(&instance_list.ril_lock);
191 191
192 192 if (inst != NULL) {
/*
 * ri_method_waiters counts threads parked here; restarter_delete_inst()
 * waits for it to reach zero before freeing the instance.
 */
193 193 while (inst->ri_method_thread != 0 &&
194 194 !pthread_equal(inst->ri_method_thread, pthread_self())) {
195 195 ++inst->ri_method_waiters;
196 196 (void) pthread_cond_wait(&inst->ri_method_cv,
197 197 &inst->ri_lock);
198 198 assert(inst->ri_method_waiters > 0);
199 199 --inst->ri_method_waiters;
200 200 }
201 201 }
202 202
203 203 return (inst);
204 204 }
205 205
/*
 * Look up an instance by name for queue access.
 * Unlike inst_lookup_by_id(), this takes ri_queue_lock (not ri_lock) and
 * does not wait on ri_method_cv. Returns NULL if the name has no dictionary
 * id or no instance with that id is on the list; otherwise returns the
 * instance with ri_queue_lock held.
 */
206 206 static restarter_inst_t *
207 207 inst_lookup_queue(const char *name)
208 208 {
209 209 int id;
210 210 restarter_inst_t *inst;
211 211
212 212 id = dict_lookup_byname(name);
213 213 if (id == -1)
214 214 return (NULL);
215 215
216 216 MUTEX_LOCK(&instance_list.ril_lock);
217 217 inst = uu_list_find(instance_list.ril_instance_list, &id, NULL, NULL);
218 218 if (inst != NULL)
219 219 MUTEX_LOCK(&inst->ri_queue_lock);
220 220 MUTEX_UNLOCK(&instance_list.ril_lock);
221 221
222 222 return (inst);
223 223 }
224 224
/*
 * Map the style bits of an instance's flags (flags & RINST_STYLE_MASK) to a
 * printable name: "contract", "transient" or "wait".
 * Any other value aborts the process, after a warning unless NDEBUG is
 * defined.
 */
225 225 const char *
226 226 service_style(int flags)
227 227 {
228 228 switch (flags & RINST_STYLE_MASK) {
229 229 case RINST_CONTRACT: return ("contract");
230 230 case RINST_TRANSIENT: return ("transient");
231 231 case RINST_WAIT: return ("wait");
232 232
233 233 default:
234 234 #ifndef NDEBUG
235 235 uu_warn("%s:%d: Bad flags 0x%x.\n", __FILE__, __LINE__, flags);
236 236 #endif
237 237 abort();
238 238 /* NOTREACHED */
239 239 }
240 240 }
241 241
242 242 /*
243 243 * Fails with ECONNABORTED or ECANCELED.
244 244 */
245 245 static int
246 246 check_contract(restarter_inst_t *inst, boolean_t primary,
247 247 scf_instance_t *scf_inst)
248 248 {
249 249 ctid_t *ctidp;
250 250 int fd, r;
251 251
/* Work on the primary or the transient contract id, as requested. */
252 252 ctidp = primary ? &inst->ri_i.i_primary_ctid :
253 253 &inst->ri_i.i_transient_ctid;
254 254
255 255 assert(*ctidp >= 1);
256 256
/*
 * The contract counts as usable if its status file can be opened; the
 * descriptor is closed again without being read.
 */
257 257 fd = contract_open(*ctidp, NULL, "status", O_RDONLY);
258 258 if (fd >= 0) {
259 259 r = close(fd);
260 260 assert(r == 0);
261 261 return (0);
262 262 }
263 263
/* Could not open it: ask for the id to be removed from the repository. */
264 264 r = restarter_remove_contract(scf_inst, *ctidp, primary ?
265 265 RESTARTER_CONTRACT_PRIMARY : RESTARTER_CONTRACT_TRANSIENT);
266 266 switch (r) {
/* Success and the two documented failures all forget the id in memory. */
267 267 case 0:
268 268 case ECONNABORTED:
269 269 case ECANCELED:
270 270 *ctidp = 0;
271 271 return (r);
272 272
273 273 case ENOMEM:
274 274 uu_die("Out of memory\n");
275 275 /* NOTREACHED */
276 276
277 277 case EPERM:
278 278 uu_die("Insufficient privilege.\n");
279 279 /* NOTREACHED */
280 280
281 281 case EACCES:
282 282 uu_die("Repository backend access denied.\n");
283 283 /* NOTREACHED */
284 284
/*
 * EROFS is logged and reported as success. NOTE(review): unlike the
 * cases above, *ctidp is left set here - confirm that is intended.
 */
285 285 case EROFS:
286 286 log_error(LOG_INFO, "Could not remove unusable contract id %ld "
287 287 "for %s from repository.\n", *ctidp, inst->ri_i.i_fmri);
288 288 return (0);
289 289
290 290 case EINVAL:
291 291 case EBADF:
292 292 default:
293 293 assert(0);
294 294 abort();
295 295 /* NOTREACHED */
296 296 }
297 297 }
298 298
/* Forward declaration: restarter_insert_inst() calls this before its definition. */
299 299 static int stop_instance(scf_handle_t *, restarter_inst_t *, stop_cause_t);
300 300
301 301 /*
302 302 * int restarter_insert_inst(scf_handle_t *, const char *)
303 303 * If the inst is already in the restarter list, return 0. If the inst
304 304 * is not in the restarter list, initialize a restarter_inst_t, initialize its
305 305 * states, insert it into the list, and return 0.
306 306 *
307 307 * Fails with
308 308 * ENOENT - name is not in the repository
309 309 */
310 310 static int
311 311 restarter_insert_inst(scf_handle_t *h, const char *name)
312 312 {
313 313 int id, r;
314 314 restarter_inst_t *inst;
315 315 uu_list_index_t idx;
316 316 scf_service_t *scf_svc;
317 317 scf_instance_t *scf_inst;
318 318 scf_snapshot_t *snap = NULL;
319 319 scf_propertygroup_t *pg;
320 320 char *svc_name, *inst_name;
321 321 char logfilebuf[PATH_MAX];
322 322 char *c;
323 323 boolean_t do_commit_states;
324 324 restarter_instance_state_t state, next_state;
325 325 protocol_states_t *ps;
326 326 pid_t start_pid;
327 327 restarter_str_t reason = restarter_str_insert_in_graph;
328 328
/*
 * ril_lock stays held until the new instance is on the list; the
 * already-present early return and the deleted path release it themselves.
 */
329 329 MUTEX_LOCK(&instance_list.ril_lock);
330 330
331 331 /*
332 332 * We don't use inst_lookup_by_name() here because we want the lookup
333 333 * & insert to be atomic.
334 334 */
335 335 id = dict_lookup_byname(name);
336 336 if (id != -1) {
337 337 inst = uu_list_find(instance_list.ril_instance_list, &id, NULL,
338 338 &idx);
339 339 if (inst != NULL) {
340 340 MUTEX_UNLOCK(&instance_list.ril_lock);
341 341 return (0);
342 342 }
343 343 }
344 344
345 345 /* Allocate an instance */
346 346 inst = startd_zalloc(sizeof (restarter_inst_t));
347 347 inst->ri_utmpx_prefix = startd_alloc(max_scf_value_size);
348 348 inst->ri_utmpx_prefix[0] = '\0';
349 349
350 350 inst->ri_i.i_fmri = startd_alloc(strlen(name) + 1);
351 351 (void) strcpy((char *)inst->ri_i.i_fmri, name);
352 352
353 353 inst->ri_queue = startd_list_create(restarter_queue_pool, inst, 0);
354 354
355 355 /*
356 356 * id shouldn't be -1 since we use the same dictionary as graph.c, but
357 357 * just in case.
358 358 */
359 359 inst->ri_id = (id != -1 ? id : dict_insert(name));
360 360
361 361 special_online_hooks_get(name, &inst->ri_pre_online_hook,
362 362 &inst->ri_post_online_hook, &inst->ri_post_offline_hook);
363 363
364 364 scf_svc = safe_scf_service_create(h);
365 365 scf_inst = safe_scf_instance_create(h);
↓ open down ↓ |
365 lines elided |
↑ open up ↑ |
366 366 pg = safe_scf_pg_create(h);
367 367 svc_name = startd_alloc(max_scf_name_size);
368 368 inst_name = startd_alloc(max_scf_name_size);
369 369
/*
 * Every repository read below jumps back here after rebinding a broken
 * connection. The per-attempt allocations (snapshot, log stem, common
 * names) are released first so each attempt starts from a clean state.
 */
370 370 rep_retry:
371 371 if (snap != NULL)
372 372 scf_snapshot_destroy(snap);
373 373 if (inst->ri_logstem != NULL)
374 374 startd_free(inst->ri_logstem, PATH_MAX);
/*
 * In the new version the names are held in exact-size copies (made after
 * libscf_get_template_values() further down), hence strlen() + 1 rather
 * than max_scf_value_size as the size handed to startd_free().
 */
375 375 if (inst->ri_common_name != NULL)
376 - startd_free(inst->ri_common_name, max_scf_value_size);
376 + startd_free(inst->ri_common_name,
377 + strlen(inst->ri_common_name) + 1);
377 378 if (inst->ri_C_common_name != NULL)
378 - startd_free(inst->ri_C_common_name, max_scf_value_size);
379 + startd_free(inst->ri_C_common_name,
380 + strlen(inst->ri_C_common_name) + 1);
379 381 snap = NULL;
380 382 inst->ri_logstem = NULL;
381 383 inst->ri_common_name = NULL;
382 384 inst->ri_C_common_name = NULL;
383 385
384 386 if (scf_handle_decode_fmri(h, name, NULL, scf_svc, scf_inst, NULL,
385 387 NULL, SCF_DECODE_FMRI_EXACT) != 0) {
386 388 switch (scf_error()) {
387 389 case SCF_ERROR_CONNECTION_BROKEN:
388 390 libscf_handle_rebind(h);
389 391 goto rep_retry;
390 392
391 393 case SCF_ERROR_NOT_FOUND:
392 394 goto deleted;
393 395 }
394 396
395 397 uu_die("Can't decode FMRI %s: %s\n", name,
396 398 scf_strerror(scf_error()));
397 399 }
398 400
399 401 /*
400 402 * If there's no running snapshot, then we execute using the editing
401 403 * snapshot. Pending snapshots will be taken later.
402 404 */
403 405 snap = libscf_get_running_snapshot(scf_inst);
404 406
405 407 if ((scf_service_get_name(scf_svc, svc_name, max_scf_name_size) < 0) ||
406 408 (scf_instance_get_name(scf_inst, inst_name, max_scf_name_size) <
407 409 0)) {
408 410 switch (scf_error()) {
409 411 case SCF_ERROR_NOT_SET:
410 412 break;
411 413
412 414 case SCF_ERROR_CONNECTION_BROKEN:
413 415 libscf_handle_rebind(h);
414 416 goto rep_retry;
415 417
416 418 default:
417 419 assert(0);
418 420 abort();
419 421 }
420 422
421 423 goto deleted;
422 424 }
423 425
/* Log stem is "<service>:<instance>" with every '/' turned into '-', plus LOG_SUFFIX. */
424 426 (void) snprintf(logfilebuf, PATH_MAX, "%s:%s", svc_name, inst_name);
425 427 for (c = logfilebuf; *c != '\0'; c++)
426 428 if (*c == '/')
427 429 *c = '-';
428 430
429 431 inst->ri_logstem = startd_alloc(PATH_MAX);
430 432 (void) snprintf(inst->ri_logstem, PATH_MAX, "%s%s", logfilebuf,
431 433 LOG_SUFFIX);
432 434
433 435 /*
434 436 * If the restarter group is missing, use uninit/none. Otherwise,
435 437 * we're probably being restarted & don't want to mess up the states
436 438 * that are there.
437 439 */
438 440 state = RESTARTER_STATE_UNINIT;
439 441 next_state = RESTARTER_STATE_NONE;
440 442
/*
 * do_commit_states is assigned on every path through the if/else below
 * that does not leave via goto, so it is always set before it is read.
 */
441 443 r = scf_instance_get_pg(scf_inst, SCF_PG_RESTARTER, pg);
442 444 if (r != 0) {
443 445 switch (scf_error()) {
444 446 case SCF_ERROR_CONNECTION_BROKEN:
445 447 libscf_handle_rebind(h);
446 448 goto rep_retry;
447 449
448 450 case SCF_ERROR_NOT_SET:
449 451 goto deleted;
450 452
451 453 case SCF_ERROR_NOT_FOUND:
452 454 /*
453 455 * This shouldn't happen since the graph engine should
454 456 * have initialized the state to uninitialized/none if
455 457 * there was no restarter pg. In case somebody
456 458 * deleted it, though....
457 459 */
458 460 do_commit_states = B_TRUE;
459 461 break;
460 462
461 463 default:
462 464 assert(0);
463 465 abort();
464 466 }
465 467 } else {
466 468 r = libscf_read_states(pg, &state, &next_state);
467 469 if (r != 0) {
468 470 do_commit_states = B_TRUE;
469 471 } else {
470 472 if (next_state != RESTARTER_STATE_NONE) {
471 473 /*
472 474 * Force next_state to _NONE since we
473 475 * don't look for method processes.
474 476 */
475 477 next_state = RESTARTER_STATE_NONE;
476 478 do_commit_states = B_TRUE;
477 479 } else {
478 480 /*
479 481 * The reason for transition will depend on
480 482 * state.
481 483 */
482 484 if (st->st_initial == 0)
483 485 reason = restarter_str_startd_restart;
484 486 else if (state == RESTARTER_STATE_MAINT)
485 487 reason = restarter_str_bad_repo_state;
486 488 /*
487 489 * Inform the restarter of our state without
488 490 * changing the STIME in the repository.
489 491 */
490 492 ps = startd_alloc(sizeof (*ps));
491 493 inst->ri_i.i_state = ps->ps_state = state;
492 494 inst->ri_i.i_next_state = ps->ps_state_next =
493 495 next_state;
494 496 ps->ps_reason = reason;
495 497
496 498 graph_protocol_send_event(inst->ri_i.i_fmri,
497 499 GRAPH_UPDATE_STATE_CHANGE, ps);
498 500
499 501 do_commit_states = B_FALSE;
500 502 }
501 503 }
502 504 }
503 505
504 506 switch (libscf_get_startd_properties(scf_inst, snap, &inst->ri_flags,
505 507 &inst->ri_utmpx_prefix)) {
506 508 case 0:
507 509 break;
508 510
509 511 case ECONNABORTED:
510 512 libscf_handle_rebind(h);
511 513 goto rep_retry;
512 514
513 515 case ECANCELED:
514 516 goto deleted;
515 517
516 518 case ENOENT:
517 519 /*
518 520 * This is odd, because the graph engine should have required
519 521 * the general property group. So we'll just use default
520 522 * flags in anticipation of the graph engine sending us
521 523 * REMOVE_INSTANCE when it finds out that the general property
↓ open down ↓ |
133 lines elided |
↑ open up ↑ |
522 524 * group has been deleted.
523 525 */
524 526 inst->ri_flags = RINST_CONTRACT;
525 527 break;
526 528
527 529 default:
528 530 assert(0);
529 531 abort();
530 532 }
531 533
532 - switch (libscf_get_template_values(scf_inst, snap,
533 - &inst->ri_common_name, &inst->ri_C_common_name)) {
534 + r = libscf_get_template_values(scf_inst, snap,
535 + &inst->ri_common_name, &inst->ri_C_common_name);
536 +
537 + /*
538 + * Copy our names to smaller buffers to reduce our memory footprint.
539 + */
540 + if (inst->ri_common_name != NULL) {
541 + char *tmp = safe_strdup(inst->ri_common_name);
542 + startd_free(inst->ri_common_name, max_scf_value_size);
543 + inst->ri_common_name = tmp;
544 + }
545 +
546 + if (inst->ri_C_common_name != NULL) {
547 + char *tmp = safe_strdup(inst->ri_C_common_name);
548 + startd_free(inst->ri_C_common_name, max_scf_value_size);
549 + inst->ri_C_common_name = tmp;
550 + }
551 +
/*
 * The shrink above runs before r is examined, so it applies on the error
 * returns too. NOTE(review): every later startd_free() of these names
 * passes strlen() + 1, which assumes safe_strdup() allocates exactly that
 * many bytes from the allocator startd_free() releases to - confirm.
 */
552 + switch (r) {
534 553 case 0:
535 554 break;
536 555
537 556 case ECONNABORTED:
538 557 libscf_handle_rebind(h);
539 558 goto rep_retry;
540 559
541 560 case ECANCELED:
542 561 goto deleted;
543 562
544 563 case ECHILD:
545 564 case ENOENT:
546 565 break;
547 566
548 567 default:
549 568 assert(0);
550 569 abort();
551 570 }
552 571
553 572 switch (libscf_read_method_ids(h, scf_inst, inst->ri_i.i_fmri,
554 573 &inst->ri_i.i_primary_ctid, &inst->ri_i.i_transient_ctid,
555 574 &start_pid)) {
556 575 case 0:
557 576 break;
558 577
559 578 case ECONNABORTED:
560 579 libscf_handle_rebind(h);
561 580 goto rep_retry;
562 581
563 582 case ECANCELED:
564 583 goto deleted;
565 584
566 585 default:
567 586 assert(0);
568 587 abort();
569 588 }
570 589
/*
 * NOTE(review): the contract hash entry stored here is not removed if a
 * later step takes the deleted path, and it is stored again on each
 * rep_retry pass - confirm neither needs cleanup.
 */
571 590 if (inst->ri_i.i_primary_ctid >= 1) {
572 591 contract_hash_store(inst->ri_i.i_primary_ctid, inst->ri_id);
573 592
574 593 switch (check_contract(inst, B_TRUE, scf_inst)) {
575 594 case 0:
576 595 break;
577 596
578 597 case ECONNABORTED:
579 598 libscf_handle_rebind(h);
580 599 goto rep_retry;
581 600
582 601 case ECANCELED:
583 602 goto deleted;
584 603
585 604 default:
586 605 assert(0);
587 606 abort();
588 607 }
589 608 }
590 609
591 610 if (inst->ri_i.i_transient_ctid >= 1) {
592 611 switch (check_contract(inst, B_FALSE, scf_inst)) {
593 612 case 0:
594 613 break;
595 614
596 615 case ECONNABORTED:
597 616 libscf_handle_rebind(h);
598 617 goto rep_retry;
599 618
600 619 case ECANCELED:
601 620 goto deleted;
602 621
603 622 default:
604 623 assert(0);
605 624 abort();
606 625 }
607 626 }
608 627
609 628 /* No more failures we live through, so add it to the list. */
610 629 (void) pthread_mutex_init(&inst->ri_lock, &mutex_attrs);
611 630 (void) pthread_mutex_init(&inst->ri_queue_lock, &mutex_attrs);
612 631 MUTEX_LOCK(&inst->ri_lock);
613 632 MUTEX_LOCK(&inst->ri_queue_lock);
614 633
615 634 (void) pthread_cond_init(&inst->ri_method_cv, NULL);
616 635
/* Both instance locks are already held when the instance becomes findable. */
617 636 uu_list_node_init(inst, &inst->ri_link, restarter_instance_pool);
618 637 uu_list_insert(instance_list.ril_instance_list, inst, idx);
619 638 MUTEX_UNLOCK(&instance_list.ril_lock);
620 639
/*
 * start_pid was filled in by libscf_read_method_ids() above; -1 means
 * there is nothing to re-register.
 */
621 640 if (start_pid != -1 &&
622 641 (inst->ri_flags & RINST_STYLE_MASK) == RINST_WAIT) {
623 642 int ret;
624 643 ret = wait_register(start_pid, inst->ri_i.i_fmri, 0, 1);
625 644 if (ret == -1) {
626 645 /*
627 646 * Implication: if we can't reregister the
628 647 * instance, we will start another one. Two
629 648 * instances may or may not result in a resource
630 649 * conflict.
631 650 */
632 651 log_error(LOG_WARNING,
633 652 "%s: couldn't reregister %ld for wait\n",
634 653 inst->ri_i.i_fmri, start_pid);
635 654 } else if (ret == 1) {
636 655 /*
637 656 * Leading PID has exited.
638 657 */
639 658 (void) stop_instance(h, inst, RSTOP_EXIT);
640 659 }
641 660 }
642 661
643 662
644 663 scf_pg_destroy(pg);
645 664
646 665 if (do_commit_states)
647 666 (void) restarter_instance_update_states(h, inst, state,
648 667 next_state, RERR_NONE, reason);
649 668
650 669 log_framework(LOG_DEBUG, "%s is a %s-style service\n", name,
651 670 service_style(inst->ri_flags));
652 671
653 672 MUTEX_UNLOCK(&inst->ri_queue_lock);
654 673 MUTEX_UNLOCK(&inst->ri_lock);
655 674
656 675 startd_free(svc_name, max_scf_name_size);
657 676 startd_free(inst_name, max_scf_name_size);
658 677 scf_snapshot_destroy(snap);
659 678 scf_instance_destroy(scf_inst);
660 679 scf_service_destroy(scf_svc);
661 680
662 681 log_framework(LOG_DEBUG, "%s: inserted instance into restarter list\n",
663 682 name);
664 683
665 684 return (0);
666 685
/*
 * Failure exit: the instance was never put on the list, so release
 * ril_lock, undo every allocation made above and return ENOENT.
 */
667 686 deleted:
668 687 MUTEX_UNLOCK(&instance_list.ril_lock);
669 688 startd_free(inst_name, max_scf_name_size);
670 689 startd_free(svc_name, max_scf_name_size);
↓ open down ↓ |
127 lines elided |
↑ open up ↑ |
671 690 if (snap != NULL)
672 691 scf_snapshot_destroy(snap);
673 692 scf_pg_destroy(pg);
674 693 scf_instance_destroy(scf_inst);
675 694 scf_service_destroy(scf_svc);
676 695 startd_free((void *)inst->ri_i.i_fmri, strlen(inst->ri_i.i_fmri) + 1);
677 696 uu_list_destroy(inst->ri_queue);
678 697 if (inst->ri_logstem != NULL)
679 698 startd_free(inst->ri_logstem, PATH_MAX);
680 699 if (inst->ri_common_name != NULL)
681 - startd_free(inst->ri_common_name, max_scf_value_size);
700 + startd_free(inst->ri_common_name,
701 + strlen(inst->ri_common_name) + 1);
682 702 if (inst->ri_C_common_name != NULL)
683 - startd_free(inst->ri_C_common_name, max_scf_value_size);
703 + startd_free(inst->ri_C_common_name,
704 + strlen(inst->ri_C_common_name) + 1);
684 705 startd_free(inst->ri_utmpx_prefix, max_scf_value_size);
685 706 startd_free(inst, sizeof (restarter_inst_t));
686 707 return (ENOENT);
687 708 }
688 709
/*
 * Remove ri from instance_list and free it and everything it owns.
 * Must be entered with ri->ri_lock held (asserted); the lock is dropped and
 * re-taken internally. If the id is no longer on the list the function
 * returns with ri_lock released and nothing freed. Otherwise ri is freed
 * and must not be used after return.
 */
689 710 static void
690 711 restarter_delete_inst(restarter_inst_t *ri)
691 712 {
692 713 int id;
693 714 restarter_inst_t *rip;
694 715 void *cookie = NULL;
695 716 restarter_instance_qentry_t *e;
696 717
697 718 assert(MUTEX_HELD(&ri->ri_lock));
698 719
699 720 /*
700 721 * Must drop the instance lock so we can pick up the instance_list
701 722 * lock & remove the instance.
702 723 */
703 724 id = ri->ri_id;
704 725 MUTEX_UNLOCK(&ri->ri_lock);
705 726
706 727 MUTEX_LOCK(&instance_list.ril_lock);
707 728
/* Look the id up again: the instance lock was dropped just above. */
708 729 rip = uu_list_find(instance_list.ril_instance_list, &id, NULL, NULL);
709 730 if (rip == NULL) {
710 731 MUTEX_UNLOCK(&instance_list.ril_lock);
711 732 return;
712 733 }
713 734
714 735 assert(ri == rip);
715 736
716 737 uu_list_remove(instance_list.ril_instance_list, ri);
717 738
718 739 log_framework(LOG_DEBUG, "%s: deleted instance from restarter list\n",
719 740 ri->ri_i.i_fmri);
720 741
721 742 MUTEX_UNLOCK(&instance_list.ril_lock);
722 743
723 744 /*
724 745 * We can lock the instance without holding the instance_list lock
725 746 * since we removed the instance from the list.
726 747 */
727 748 MUTEX_LOCK(&ri->ri_lock);
728 749 MUTEX_LOCK(&ri->ri_queue_lock);
729 750
730 751 if (ri->ri_i.i_primary_ctid >= 1)
731 752 contract_hash_remove(ri->ri_i.i_primary_ctid);
732 753
↓ open down ↓ |
39 lines elided |
↑ open up ↑ |
/*
 * Hold off until no method thread is recorded and no thread is still
 * parked on ri_method_cv in inst_lookup_by_id().
 */
733 754 while (ri->ri_method_thread != 0 || ri->ri_method_waiters > 0)
734 755 (void) pthread_cond_wait(&ri->ri_method_cv, &ri->ri_lock);
735 756
/* Free any events still queued, then the queue itself. */
736 757 while ((e = uu_list_teardown(ri->ri_queue, &cookie)) != NULL)
737 758 startd_free(e, sizeof (*e));
738 759 uu_list_destroy(ri->ri_queue);
739 760
740 761 startd_free((void *)ri->ri_i.i_fmri, strlen(ri->ri_i.i_fmri) + 1);
741 762 startd_free(ri->ri_logstem, PATH_MAX);
742 763 if (ri->ri_common_name != NULL)
743 - startd_free(ri->ri_common_name, max_scf_value_size);
764 + startd_free(ri->ri_common_name,
765 + strlen(ri->ri_common_name) + 1);
744 766 if (ri->ri_C_common_name != NULL)
745 - startd_free(ri->ri_C_common_name, max_scf_value_size);
767 + startd_free(ri->ri_C_common_name,
768 + strlen(ri->ri_C_common_name) + 1);
746 769 startd_free(ri->ri_utmpx_prefix, max_scf_value_size);
/*
 * NOTE(review): ri_lock and ri_queue_lock are both still locked at this
 * point, and POSIX leaves destroying a locked mutex undefined. The
 * condition variable ri_method_cv is also never destroyed. Confirm
 * whether unlocks and a pthread_cond_destroy() are wanted here.
 */
747 770 (void) pthread_mutex_destroy(&ri->ri_lock);
748 771 (void) pthread_mutex_destroy(&ri->ri_queue_lock);
749 772 startd_free(ri, sizeof (restarter_inst_t));
750 773 }
751 774
752 775 /*
753 776 * instance_is_wait_style()
754 777 *
755 778 * Returns 1 if the given instance is a "wait-style" service instance.
756 779 */
757 780 int
758 781 instance_is_wait_style(restarter_inst_t *inst)
759 782 {
/* Callers must already hold ri_lock; ri_flags is read without taking it here. */
760 783 assert(MUTEX_HELD(&inst->ri_lock));
761 784 return ((inst->ri_flags & RINST_STYLE_MASK) == RINST_WAIT);
762 785 }
763 786
764 787 /*
765 788 * instance_is_transient_style()
766 789 *
767 790 * Returns 1 if the given instance is a transient service instance.
768 791 */
769 792 int
770 793 instance_is_transient_style(restarter_inst_t *inst)
771 794 {
/* Callers must already hold ri_lock; ri_flags is read without taking it here. */
772 795 assert(MUTEX_HELD(&inst->ri_lock));
773 796 return ((inst->ri_flags & RINST_STYLE_MASK) == RINST_TRANSIENT);
774 797 }
775 798
776 799 /*
777 800 * instance_in_transition()
778 801 * Returns 1 if instance is in transition, 0 if not
779 802 */
780 803 int
781 804 instance_in_transition(restarter_inst_t *inst)
782 805 {
783 806 assert(MUTEX_HELD(&inst->ri_lock));
/* In transition means i_next_state is anything other than RESTARTER_STATE_NONE. */
784 807 if (inst->ri_i.i_next_state == RESTARTER_STATE_NONE)
785 808 return (0);
786 809 return (1);
787 810 }
788 811
789 812 /*
790 813 * returns 1 if instance is already started, 0 if not
791 814 */
792 815 static int
793 816 instance_started(restarter_inst_t *inst)
794 817 {
795 818 int ret;
796 819
797 820 assert(MUTEX_HELD(&inst->ri_lock));
798 821
/* Both ONLINE and DEGRADED count as started. */
799 822 if (inst->ri_i.i_state == RESTARTER_STATE_ONLINE ||
800 823 inst->ri_i.i_state == RESTARTER_STATE_DEGRADED)
801 824 ret = 1;
802 825 else
803 826 ret = 0;
804 827
805 828 return (ret);
806 829 }
807 830
808 831 /*
809 832 * Returns
810 833 * 0 - success
811 834 * ECONNRESET - success, but h was rebound
812 835 */
813 836 int
814 837 restarter_instance_update_states(scf_handle_t *h, restarter_inst_t *ri,
815 838 restarter_instance_state_t new_state,
816 839 restarter_instance_state_t new_state_next, restarter_error_t err,
817 840 restarter_str_t reason)
818 841 {
819 842 protocol_states_t *states;
820 843 int e;
821 844 uint_t retry_count = 0, msecs = ALLOC_DELAY;
822 845 boolean_t rebound = B_FALSE;
823 846 int prev_state_online;
824 847 int state_online;
825 848
826 849 assert(MUTEX_HELD(&ri->ri_lock));
827 850
/* Sample "started" before the commit so the hooks below can see the edge. */
828 851 prev_state_online = instance_started(ri);
829 852
830 853 retry:
831 854 e = _restarter_commit_states(h, &ri->ri_i, new_state, new_state_next,
832 855 restarter_get_str_short(reason));
833 856 switch (e) {
834 857 case 0:
835 858 break;
836 859
/*
 * Out of memory: sleep and retry, multiplying the delay each time, until
 * ALLOC_RETRY attempts have been made; then give up and die.
 */
837 860 case ENOMEM:
838 861 ++retry_count;
839 862 if (retry_count < ALLOC_RETRY) {
840 863 (void) poll(NULL, 0, msecs);
841 864 msecs *= ALLOC_DELAY_MULT;
842 865 goto retry;
843 866 }
844 867
845 868 /* Like startd_alloc(). */
846 869 uu_die("Insufficient memory.\n");
847 870 /* NOTREACHED */
848 871
/* Connection lost: rebind and retry; the caller is told via ECONNRESET. */
849 872 case ECONNABORTED:
850 873 libscf_handle_rebind(h);
851 874 rebound = B_TRUE;
852 875 goto retry;
853 876
854 877 case EPERM:
855 878 case EACCES:
856 879 case EROFS:
857 880 log_error(LOG_NOTICE, "Could not commit state change for %s "
858 881 "to repository: %s.\n", ri->ri_i.i_fmri, strerror(e));
859 882 /* FALLTHROUGH */
860 883
/*
 * The commit did not go through, so the in-memory states are set here.
 * Presumably _restarter_commit_states() updates ri_i itself on
 * success - TODO confirm.
 */
861 884 case ENOENT:
862 885 ri->ri_i.i_state = new_state;
863 886 ri->ri_i.i_next_state = new_state_next;
864 887 break;
865 888
866 889 case EINVAL:
867 890 default:
868 891 bad_error("_restarter_commit_states", e);
869 892 }
870 893
/* Report the new states to the graph engine. */
871 894 states = startd_alloc(sizeof (protocol_states_t));
872 895 states->ps_state = new_state;
873 896 states->ps_state_next = new_state_next;
874 897 states->ps_err = err;
875 898 states->ps_reason = reason;
876 899 graph_protocol_send_event(ri->ri_i.i_fmri, GRAPH_UPDATE_STATE_CHANGE,
877 900 (void *)states);
878 901
879 902 state_online = instance_started(ri);
880 903
/* Run a hook only when the instance crossed the started/not-started boundary. */
881 904 if (prev_state_online && !state_online)
882 905 ri->ri_post_offline_hook();
883 906 else if (!prev_state_online && state_online)
884 907 ri->ri_post_online_hook();
885 908
886 909 return (rebound ? ECONNRESET : 0);
887 910 }
888 911
/*
 * Set flag (RINST_RETAKE_RUNNING or RINST_RETAKE_START, asserted) in the
 * ri_flags of the instance named by fmri. Does nothing if no such instance
 * is found.
 */
889 912 void
890 913 restarter_mark_pending_snapshot(const char *fmri, uint_t flag)
891 914 {
892 915 restarter_inst_t *inst;
893 916
894 917 assert(flag == RINST_RETAKE_RUNNING || flag == RINST_RETAKE_START);
895 918
/* A successful lookup returns with ri_lock held; it is released below. */
896 919 inst = inst_lookup_by_name(fmri);
897 920 if (inst == NULL)
898 921 return;
899 922
900 923 inst->ri_flags |= flag;
901 924
902 925 MUTEX_UNLOCK(&inst->ri_lock);
903 926 }
904 927
/*
 * Walk every instance on instance_list (with ril_lock held throughout) and
 * retake the snapshots flagged by RINST_RETAKE_RUNNING / RINST_RETAKE_START.
 * A flag is cleared only when its snapshot step succeeds (or, for the
 * start snapshot, also on ENOENT).
 */
905 928 static void
906 929 restarter_take_pending_snapshots(scf_handle_t *h)
907 930 {
908 931 restarter_inst_t *inst;
909 932 int r;
910 933
911 934 MUTEX_LOCK(&instance_list.ril_lock);
912 935
913 936 for (inst = uu_list_first(instance_list.ril_instance_list);
914 937 inst != NULL;
915 938 inst = uu_list_next(instance_list.ril_instance_list, inst)) {
916 939 const char *fmri;
917 940 scf_instance_t *sinst = NULL;
918 941
919 942 MUTEX_LOCK(&inst->ri_lock);
920 943
921 944 /*
922 945 * This is where we'd check inst->ri_method_thread and if it
923 946 * were nonzero we'd wait in anticipation of another thread
924 947 * executing a method for inst. Doing so with the instance_list
925 948 * locked, though, leads to deadlock. Since taking a snapshot
926 949 * during that window won't hurt anything, we'll just continue.
927 950 */
928 951
929 952 fmri = inst->ri_i.i_fmri;
930 953
931 954 if (inst->ri_flags & RINST_RETAKE_RUNNING) {
932 955 scf_snapshot_t *rsnap;
933 956
/*
 * NOTE(review): the return value is ignored, so sinst may still be NULL
 * when passed on below - confirm the callees tolerate that.
 */
934 957 (void) libscf_fmri_get_instance(h, fmri, &sinst);
935 958
936 959 rsnap = libscf_get_or_make_running_snapshot(sinst,
937 960 fmri, B_FALSE);
938 961
939 962 scf_instance_destroy(sinst);
940 963
941 964 if (rsnap != NULL)
942 965 inst->ri_flags &= ~RINST_RETAKE_RUNNING;
943 966
944 967 scf_snapshot_destroy(rsnap);
945 968 }
946 969
947 970 if (inst->ri_flags & RINST_RETAKE_START) {
948 971 switch (r = libscf_snapshots_poststart(h, fmri,
949 972 B_FALSE)) {
950 973 case 0:
951 974 case ENOENT:
952 975 inst->ri_flags &= ~RINST_RETAKE_START;
953 976 break;
954 977
/* Connection broken: leave the flag set; no rebind or retry happens here. */
955 978 case ECONNABORTED:
956 979 break;
957 980
958 981 case EACCES:
959 982 default:
960 983 bad_error("libscf_snapshots_poststart", r);
961 984 }
962 985 }
963 986
964 987 MUTEX_UNLOCK(&inst->ri_lock);
965 988 }
966 989
967 990 MUTEX_UNLOCK(&instance_list.ril_lock);
968 991 }
969 992
/*
 * Thread entry point: obtains its own bound repository handle, takes all
 * pending snapshots, then unbinds and destroys the handle.
 * The argument is unused and the return value is always NULL.
 */
970 993 /* ARGSUSED */
971 994 void *
972 995 restarter_post_fsminimal_thread(void *unused)
973 996 {
974 997 scf_handle_t *h;
975 998 int r;
976 999
977 1000 h = libscf_handle_create_bound_loop();
978 1001
/* Keep trying libscf_create_self(); the only failure expected is a broken connection. */
979 1002 for (;;) {
980 1003 r = libscf_create_self(h);
981 1004 if (r == 0)
982 1005 break;
983 1006
984 1007 assert(r == ECONNABORTED);
985 1008 libscf_handle_rebind(h);
986 1009 }
987 1010
988 1011 restarter_take_pending_snapshots(h);
989 1012
990 1013 (void) scf_handle_unbind(h);
991 1014 scf_handle_destroy(h);
992 1015
993 1016 return (NULL);
994 1017 }
995 1018
996 1019 /*
997 1020 * int stop_instance()
998 1021 *
999 1022 * Stop the instance identified by the instance given as the second argument,
1000 1023 * for the cause stated.
1001 1024 *
1002 1025 * Returns
1003 1026 * 0 - success
1004 1027 * -1 - inst is in transition
1005 1028 */
1006 1029 static int
1007 1030 stop_instance(scf_handle_t *local_handle, restarter_inst_t *inst,
1008 1031 stop_cause_t cause)
1009 1032 {
1010 1033 fork_info_t *info;
1011 1034 const char *cp;
1012 1035 int err;
1013 1036 restarter_error_t re;
1014 1037 restarter_str_t reason;
1015 1038
1016 1039 assert(MUTEX_HELD(&inst->ri_lock));
1017 1040 assert(inst->ri_method_thread == 0);
1018 1041
1019 1042 switch (cause) {
1020 1043 case RSTOP_EXIT:
1021 1044 re = RERR_RESTART;
1022 1045 reason = restarter_str_ct_ev_exit;
1023 1046 cp = "all processes in service exited";
1024 1047 break;
1025 1048 case RSTOP_CORE:
1026 1049 re = RERR_FAULT;
1027 1050 reason = restarter_str_ct_ev_core;
1028 1051 cp = "process dumped core";
1029 1052 break;
1030 1053 case RSTOP_SIGNAL:
1031 1054 re = RERR_FAULT;
1032 1055 reason = restarter_str_ct_ev_signal;
1033 1056 cp = "process received fatal signal from outside the service";
1034 1057 break;
1035 1058 case RSTOP_HWERR:
1036 1059 re = RERR_FAULT;
1037 1060 reason = restarter_str_ct_ev_hwerr;
1038 1061 cp = "process killed due to uncorrectable hardware error";
1039 1062 break;
1040 1063 case RSTOP_DEPENDENCY:
1041 1064 re = RERR_RESTART;
1042 1065 reason = restarter_str_dependency_activity;
1043 1066 cp = "dependency activity requires stop";
1044 1067 break;
1045 1068 case RSTOP_DISABLE:
1046 1069 re = RERR_RESTART;
1047 1070 reason = restarter_str_disable_request;
1048 1071 cp = "service disabled";
1049 1072 break;
1050 1073 case RSTOP_RESTART:
1051 1074 re = RERR_RESTART;
1052 1075 reason = restarter_str_restart_request;
1053 1076 cp = "service restarting";
1054 1077 break;
1055 1078 default:
1056 1079 #ifndef NDEBUG
1057 1080 (void) fprintf(stderr, "Unknown cause %d at %s:%d.\n",
1058 1081 cause, __FILE__, __LINE__);
1059 1082 #endif
1060 1083 abort();
1061 1084 }
1062 1085
1063 1086 /* Services in the disabled and maintenance state are ignored */
1064 1087 if (inst->ri_i.i_state == RESTARTER_STATE_MAINT ||
1065 1088 inst->ri_i.i_state == RESTARTER_STATE_DISABLED) {
1066 1089 log_framework(LOG_DEBUG,
1067 1090 "%s: stop_instance -> is maint/disabled\n",
1068 1091 inst->ri_i.i_fmri);
1069 1092 return (0);
1070 1093 }
1071 1094
1072 1095 /* Already stopped instances are left alone */
1073 1096 if (instance_started(inst) == 0) {
1074 1097 log_framework(LOG_DEBUG, "Restarter: %s is already stopped.\n",
1075 1098 inst->ri_i.i_fmri);
1076 1099 return (0);
1077 1100 }
1078 1101
1079 1102 if (instance_in_transition(inst)) {
1080 1103 /* requeue event by returning -1 */
1081 1104 log_framework(LOG_DEBUG,
1082 1105 "Restarter: Not stopping %s, in transition.\n",
1083 1106 inst->ri_i.i_fmri);
1084 1107 return (-1);
1085 1108 }
1086 1109
1087 1110 log_instance(inst, B_TRUE, "Stopping because %s.", cp);
1088 1111
1089 1112 log_framework(re == RERR_FAULT ? LOG_INFO : LOG_DEBUG,
1090 1113 "%s: Instance stopping because %s.\n", inst->ri_i.i_fmri, cp);
1091 1114
1092 1115 if (instance_is_wait_style(inst) && cause == RSTOP_EXIT) {
1093 1116 /*
1094 1117 * No need to stop instance, as child has exited; remove
1095 1118 * contract and move the instance to the offline state.
1096 1119 */
1097 1120 switch (err = restarter_instance_update_states(local_handle,
1098 1121 inst, inst->ri_i.i_state, RESTARTER_STATE_OFFLINE, re,
1099 1122 reason)) {
1100 1123 case 0:
1101 1124 case ECONNRESET:
1102 1125 break;
1103 1126
1104 1127 default:
1105 1128 bad_error("restarter_instance_update_states", err);
1106 1129 }
1107 1130
1108 1131 (void) update_fault_count(inst, FAULT_COUNT_RESET);
1109 1132 reset_start_times(inst);
1110 1133
1111 1134 if (inst->ri_i.i_primary_ctid != 0) {
1112 1135 inst->ri_m_inst =
1113 1136 safe_scf_instance_create(local_handle);
1114 1137 inst->ri_mi_deleted = B_FALSE;
1115 1138
1116 1139 libscf_reget_instance(inst);
1117 1140 method_remove_contract(inst, B_TRUE, B_TRUE);
1118 1141
1119 1142 scf_instance_destroy(inst->ri_m_inst);
1120 1143 inst->ri_m_inst = NULL;
1121 1144 }
1122 1145
1123 1146 switch (err = restarter_instance_update_states(local_handle,
1124 1147 inst, inst->ri_i.i_next_state, RESTARTER_STATE_NONE, re,
1125 1148 reason)) {
1126 1149 case 0:
1127 1150 case ECONNRESET:
1128 1151 break;
1129 1152
1130 1153 default:
1131 1154 bad_error("restarter_instance_update_states", err);
1132 1155 }
1133 1156
1134 1157 return (0);
1135 1158 } else if (instance_is_wait_style(inst) && re == RERR_RESTART) {
1136 1159 /*
1137 1160 * Stopping a wait service through means other than the pid
1138 1161 * exiting should keep wait_thread() from restarting the
1139 1162 * service, by removing it from the wait list.
1140 1163 * We cannot remove it right now otherwise the process will
1141 1164 * end up <defunct> so mark it to be ignored.
1142 1165 */
1143 1166 wait_ignore_by_fmri(inst->ri_i.i_fmri);
1144 1167 }
1145 1168
1146 1169 switch (err = restarter_instance_update_states(local_handle, inst,
1147 1170 inst->ri_i.i_state, inst->ri_i.i_enabled ? RESTARTER_STATE_OFFLINE :
1148 1171 RESTARTER_STATE_DISABLED, RERR_NONE, reason)) {
1149 1172 case 0:
1150 1173 case ECONNRESET:
1151 1174 break;
1152 1175
1153 1176 default:
1154 1177 bad_error("restarter_instance_update_states", err);
1155 1178 }
1156 1179
1157 1180 info = startd_zalloc(sizeof (fork_info_t));
1158 1181
1159 1182 info->sf_id = inst->ri_id;
1160 1183 info->sf_method_type = METHOD_STOP;
1161 1184 info->sf_event_type = re;
1162 1185 info->sf_reason = reason;
1163 1186 inst->ri_method_thread = startd_thread_create(method_thread, info);
1164 1187
1165 1188 return (0);
1166 1189 }
1167 1190
1168 1191 /*
1169 1192 * Returns
1170 1193 * ENOENT - fmri is not in instance_list
1171 1194 * 0 - success
1172 1195 * ECONNRESET - success, though handle was rebound
1173 1196 * -1 - instance is in transition
1174 1197 */
1175 1198 int
1176 1199 stop_instance_fmri(scf_handle_t *h, const char *fmri, uint_t flags)
1177 1200 {
1178 1201 restarter_inst_t *rip;
1179 1202 int r;
1180 1203
1181 1204 rip = inst_lookup_by_name(fmri);
1182 1205 if (rip == NULL)
1183 1206 return (ENOENT);
1184 1207
1185 1208 r = stop_instance(h, rip, flags);
1186 1209
1187 1210 MUTEX_UNLOCK(&rip->ri_lock);
1188 1211
1189 1212 return (r);
1190 1213 }
1191 1214
1192 1215 static void
1193 1216 unmaintain_instance(scf_handle_t *h, restarter_inst_t *rip,
1194 1217 unmaint_cause_t cause)
1195 1218 {
1196 1219 ctid_t ctid;
1197 1220 scf_instance_t *inst;
1198 1221 int r;
1199 1222 uint_t tries = 0, msecs = ALLOC_DELAY;
1200 1223 const char *cp;
1201 1224 restarter_str_t reason;
1202 1225
1203 1226 assert(MUTEX_HELD(&rip->ri_lock));
1204 1227
1205 1228 if (rip->ri_i.i_state != RESTARTER_STATE_MAINT) {
1206 1229 log_error(LOG_DEBUG, "Restarter: "
1207 1230 "Ignoring maintenance off command because %s is not in the "
1208 1231 "maintenance state.\n", rip->ri_i.i_fmri);
1209 1232 return;
1210 1233 }
1211 1234
1212 1235 switch (cause) {
1213 1236 case RUNMAINT_CLEAR:
1214 1237 cp = "clear requested";
1215 1238 reason = restarter_str_clear_request;
1216 1239 break;
1217 1240 case RUNMAINT_DISABLE:
1218 1241 cp = "disable requested";
1219 1242 reason = restarter_str_disable_request;
1220 1243 break;
1221 1244 default:
1222 1245 #ifndef NDEBUG
1223 1246 (void) fprintf(stderr, "Uncaught case for %d at %s:%d.\n",
1224 1247 cause, __FILE__, __LINE__);
1225 1248 #endif
1226 1249 abort();
1227 1250 }
1228 1251
1229 1252 log_instance(rip, B_TRUE, "Leaving maintenance because %s.",
1230 1253 cp);
1231 1254 log_framework(LOG_DEBUG, "%s: Instance leaving maintenance because "
1232 1255 "%s.\n", rip->ri_i.i_fmri, cp);
1233 1256
1234 1257 (void) restarter_instance_update_states(h, rip, RESTARTER_STATE_UNINIT,
1235 1258 RESTARTER_STATE_NONE, RERR_RESTART, reason);
1236 1259
1237 1260 /*
1238 1261 * If we did ADMIN_MAINT_ON_IMMEDIATE, then there might still be
1239 1262 * a primary contract.
1240 1263 */
1241 1264 if (rip->ri_i.i_primary_ctid == 0)
1242 1265 return;
1243 1266
1244 1267 ctid = rip->ri_i.i_primary_ctid;
1245 1268 contract_abandon(ctid);
1246 1269 rip->ri_i.i_primary_ctid = 0;
1247 1270
1248 1271 rep_retry:
1249 1272 switch (r = libscf_fmri_get_instance(h, rip->ri_i.i_fmri, &inst)) {
1250 1273 case 0:
1251 1274 break;
1252 1275
1253 1276 case ECONNABORTED:
1254 1277 libscf_handle_rebind(h);
1255 1278 goto rep_retry;
1256 1279
1257 1280 case ENOENT:
1258 1281 /* Must have been deleted. */
1259 1282 return;
1260 1283
1261 1284 case EINVAL:
1262 1285 case ENOTSUP:
1263 1286 default:
1264 1287 bad_error("libscf_handle_rebind", r);
1265 1288 }
1266 1289
1267 1290 again:
1268 1291 r = restarter_remove_contract(inst, ctid, RESTARTER_CONTRACT_PRIMARY);
1269 1292 switch (r) {
1270 1293 case 0:
1271 1294 break;
1272 1295
1273 1296 case ENOMEM:
1274 1297 ++tries;
1275 1298 if (tries < ALLOC_RETRY) {
1276 1299 (void) poll(NULL, 0, msecs);
1277 1300 msecs *= ALLOC_DELAY_MULT;
1278 1301 goto again;
1279 1302 }
1280 1303
1281 1304 uu_die("Insufficient memory.\n");
1282 1305 /* NOTREACHED */
1283 1306
1284 1307 case ECONNABORTED:
1285 1308 scf_instance_destroy(inst);
1286 1309 libscf_handle_rebind(h);
1287 1310 goto rep_retry;
1288 1311
1289 1312 case ECANCELED:
1290 1313 break;
1291 1314
1292 1315 case EPERM:
1293 1316 case EACCES:
1294 1317 case EROFS:
1295 1318 log_error(LOG_INFO,
1296 1319 "Could not remove contract id %lu for %s (%s).\n", ctid,
1297 1320 rip->ri_i.i_fmri, strerror(r));
1298 1321 break;
1299 1322
1300 1323 case EINVAL:
1301 1324 case EBADF:
1302 1325 default:
1303 1326 bad_error("restarter_remove_contract", r);
1304 1327 }
1305 1328
1306 1329 scf_instance_destroy(inst);
1307 1330 }
1308 1331
1309 1332 /*
1310 1333 * enable_inst()
1311 1334 * Set inst->ri_i.i_enabled. Expects 'e' to be _ENABLE, _DISABLE, or
1312 1335 * _ADMIN_DISABLE. If the event is _ENABLE and inst is uninitialized or
1313 1336 * disabled, move it to offline. If the event is _DISABLE or
1314 1337 * _ADMIN_DISABLE, make sure inst will move to disabled.
1315 1338 *
1316 1339 * Returns
1317 1340 * 0 - success
1318 1341 * ECONNRESET - h was rebound
1319 1342 */
1320 1343 static int
1321 1344 enable_inst(scf_handle_t *h, restarter_inst_t *inst,
1322 1345 restarter_instance_qentry_t *riq)
1323 1346 {
1324 1347 restarter_instance_state_t state;
1325 1348 restarter_event_type_t e = riq->riq_type;
1326 1349 restarter_str_t reason = restarter_str_per_configuration;
1327 1350 int r;
1328 1351
1329 1352 assert(MUTEX_HELD(&inst->ri_lock));
1330 1353 assert(e == RESTARTER_EVENT_TYPE_ADMIN_DISABLE ||
1331 1354 e == RESTARTER_EVENT_TYPE_DISABLE ||
1332 1355 e == RESTARTER_EVENT_TYPE_ENABLE);
1333 1356 assert(instance_in_transition(inst) == 0);
1334 1357
1335 1358 state = inst->ri_i.i_state;
1336 1359
1337 1360 if (e == RESTARTER_EVENT_TYPE_ENABLE) {
1338 1361 inst->ri_i.i_enabled = 1;
1339 1362
1340 1363 if (state == RESTARTER_STATE_UNINIT ||
1341 1364 state == RESTARTER_STATE_DISABLED) {
1342 1365 /*
1343 1366 * B_FALSE: Don't log an error if the log_instance()
1344 1367 * fails because it will fail on the miniroot before
1345 1368 * install-discovery runs.
1346 1369 */
1347 1370 log_instance(inst, B_FALSE, "Enabled.");
1348 1371 log_framework(LOG_DEBUG, "%s: Instance enabled.\n",
1349 1372 inst->ri_i.i_fmri);
1350 1373
1351 1374 /*
1352 1375 * If we are coming from DISABLED, it was obviously an
1353 1376 * enable request. If we are coming from UNINIT, it may
1354 1377 * have been a sevice in MAINT that was cleared.
1355 1378 */
1356 1379 if (riq->riq_reason == restarter_str_clear_request)
1357 1380 reason = restarter_str_clear_request;
1358 1381 else if (state == RESTARTER_STATE_DISABLED)
1359 1382 reason = restarter_str_enable_request;
1360 1383 (void) restarter_instance_update_states(h, inst,
1361 1384 RESTARTER_STATE_OFFLINE, RESTARTER_STATE_NONE,
1362 1385 RERR_NONE, reason);
1363 1386 } else {
1364 1387 log_framework(LOG_DEBUG, "Restarter: "
1365 1388 "Not changing state of %s for enable command.\n",
1366 1389 inst->ri_i.i_fmri);
1367 1390 }
1368 1391 } else {
1369 1392 inst->ri_i.i_enabled = 0;
1370 1393
1371 1394 switch (state) {
1372 1395 case RESTARTER_STATE_ONLINE:
1373 1396 case RESTARTER_STATE_DEGRADED:
1374 1397 r = stop_instance(h, inst, RSTOP_DISABLE);
1375 1398 return (r == ECONNRESET ? 0 : r);
1376 1399
1377 1400 case RESTARTER_STATE_OFFLINE:
1378 1401 case RESTARTER_STATE_UNINIT:
1379 1402 if (inst->ri_i.i_primary_ctid != 0) {
1380 1403 inst->ri_m_inst = safe_scf_instance_create(h);
1381 1404 inst->ri_mi_deleted = B_FALSE;
1382 1405
1383 1406 libscf_reget_instance(inst);
1384 1407 method_remove_contract(inst, B_TRUE, B_TRUE);
1385 1408
1386 1409 scf_instance_destroy(inst->ri_m_inst);
1387 1410 }
1388 1411 /* B_FALSE: See log_instance(..., "Enabled."); above */
1389 1412 log_instance(inst, B_FALSE, "Disabled.");
1390 1413 log_framework(LOG_DEBUG, "%s: Instance disabled.\n",
1391 1414 inst->ri_i.i_fmri);
1392 1415
1393 1416 /*
1394 1417 * If we are coming from OFFLINE, it was obviously a
1395 1418 * disable request. But if we are coming from
1396 1419 * UNINIT, it may have been a disable request for a
1397 1420 * service in MAINT.
1398 1421 */
1399 1422 if (riq->riq_reason == restarter_str_disable_request ||
1400 1423 state == RESTARTER_STATE_OFFLINE)
1401 1424 reason = restarter_str_disable_request;
1402 1425 (void) restarter_instance_update_states(h, inst,
1403 1426 RESTARTER_STATE_DISABLED, RESTARTER_STATE_NONE,
1404 1427 RERR_RESTART, reason);
1405 1428 return (0);
1406 1429
1407 1430 case RESTARTER_STATE_DISABLED:
1408 1431 break;
1409 1432
1410 1433 case RESTARTER_STATE_MAINT:
1411 1434 /*
1412 1435 * We only want to pull the instance out of maintenance
1413 1436 * if the disable is on adminstrative request. The
1414 1437 * graph engine sends _DISABLE events whenever a
1415 1438 * service isn't in the disabled state, and we don't
1416 1439 * want to pull the service out of maintenance if,
1417 1440 * for example, it is there due to a dependency cycle.
1418 1441 */
1419 1442 if (e == RESTARTER_EVENT_TYPE_ADMIN_DISABLE)
1420 1443 unmaintain_instance(h, inst, RUNMAINT_DISABLE);
1421 1444 break;
1422 1445
1423 1446 default:
1424 1447 #ifndef NDEBUG
1425 1448 (void) fprintf(stderr, "Restarter instance %s has "
1426 1449 "unknown state %d.\n", inst->ri_i.i_fmri, state);
1427 1450 #endif
1428 1451 abort();
1429 1452 }
1430 1453 }
1431 1454
1432 1455 return (0);
1433 1456 }
1434 1457
1435 1458 static void
1436 1459 start_instance(scf_handle_t *local_handle, restarter_inst_t *inst,
1437 1460 int32_t reason)
1438 1461 {
1439 1462 fork_info_t *info;
1440 1463 restarter_str_t new_reason;
1441 1464
1442 1465 assert(MUTEX_HELD(&inst->ri_lock));
1443 1466 assert(instance_in_transition(inst) == 0);
1444 1467 assert(inst->ri_method_thread == 0);
1445 1468
1446 1469 log_framework(LOG_DEBUG, "%s: trying to start instance\n",
1447 1470 inst->ri_i.i_fmri);
1448 1471
1449 1472 /*
1450 1473 * We want to keep the original reason for restarts and clear actions
1451 1474 */
1452 1475 switch (reason) {
1453 1476 case restarter_str_restart_request:
1454 1477 case restarter_str_clear_request:
1455 1478 new_reason = reason;
1456 1479 break;
1457 1480 default:
1458 1481 new_reason = restarter_str_dependencies_satisfied;
1459 1482 }
1460 1483
1461 1484 /* Services in the disabled and maintenance state are ignored */
1462 1485 if (inst->ri_i.i_state == RESTARTER_STATE_MAINT ||
1463 1486 inst->ri_i.i_state == RESTARTER_STATE_DISABLED ||
1464 1487 inst->ri_i.i_enabled == 0) {
1465 1488 log_framework(LOG_DEBUG,
1466 1489 "%s: start_instance -> is maint/disabled\n",
1467 1490 inst->ri_i.i_fmri);
1468 1491 return;
1469 1492 }
1470 1493
1471 1494 /* Already started instances are left alone */
1472 1495 if (instance_started(inst) == 1) {
1473 1496 log_framework(LOG_DEBUG,
1474 1497 "%s: start_instance -> is already started\n",
1475 1498 inst->ri_i.i_fmri);
1476 1499 return;
1477 1500 }
1478 1501
1479 1502 log_framework(LOG_DEBUG, "%s: starting instance.\n", inst->ri_i.i_fmri);
1480 1503
1481 1504 (void) restarter_instance_update_states(local_handle, inst,
1482 1505 inst->ri_i.i_state, RESTARTER_STATE_ONLINE, RERR_NONE, new_reason);
1483 1506
1484 1507 info = startd_zalloc(sizeof (fork_info_t));
1485 1508
1486 1509 info->sf_id = inst->ri_id;
1487 1510 info->sf_method_type = METHOD_START;
1488 1511 info->sf_event_type = RERR_NONE;
1489 1512 info->sf_reason = new_reason;
1490 1513 inst->ri_method_thread = startd_thread_create(method_thread, info);
1491 1514 }
1492 1515
1493 1516 static int
1494 1517 event_from_tty(scf_handle_t *h, restarter_inst_t *rip)
1495 1518 {
1496 1519 scf_instance_t *inst;
1497 1520 int ret = 0;
1498 1521
1499 1522 if (libscf_fmri_get_instance(h, rip->ri_i.i_fmri, &inst))
1500 1523 return (-1);
1501 1524
1502 1525 ret = restarter_inst_ractions_from_tty(inst);
1503 1526
1504 1527 scf_instance_destroy(inst);
1505 1528 return (ret);
1506 1529 }
1507 1530
1508 1531 static void
1509 1532 maintain_instance(scf_handle_t *h, restarter_inst_t *rip, int immediate,
1510 1533 restarter_str_t reason)
1511 1534 {
1512 1535 fork_info_t *info;
1513 1536 scf_instance_t *scf_inst = NULL;
1514 1537
1515 1538 assert(MUTEX_HELD(&rip->ri_lock));
1516 1539 assert(reason != restarter_str_none);
1517 1540 assert(rip->ri_method_thread == 0);
1518 1541
1519 1542 log_instance(rip, B_TRUE, "Stopping for maintenance due to %s.",
1520 1543 restarter_get_str_short(reason));
1521 1544 log_framework(LOG_DEBUG, "%s: stopping for maintenance due to %s.\n",
1522 1545 rip->ri_i.i_fmri, restarter_get_str_short(reason));
1523 1546
1524 1547 /* Services in the maintenance state are ignored */
1525 1548 if (rip->ri_i.i_state == RESTARTER_STATE_MAINT) {
1526 1549 log_framework(LOG_DEBUG,
1527 1550 "%s: maintain_instance -> is already in maintenance\n",
1528 1551 rip->ri_i.i_fmri);
1529 1552 return;
1530 1553 }
1531 1554
1532 1555 /*
1533 1556 * If reason state is restarter_str_service_request and
1534 1557 * restarter_actions/auxiliary_fmri property is set with a valid fmri,
1535 1558 * copy the fmri to restarter/auxiliary_fmri so svcs -x can use.
1536 1559 */
1537 1560 if (reason == restarter_str_service_request &&
1538 1561 libscf_fmri_get_instance(h, rip->ri_i.i_fmri, &scf_inst) == 0) {
1539 1562 if (restarter_inst_validate_ractions_aux_fmri(scf_inst) == 0) {
1540 1563 if (restarter_inst_set_aux_fmri(scf_inst))
1541 1564 log_framework(LOG_DEBUG, "%s: "
1542 1565 "restarter_inst_set_aux_fmri failed: ",
1543 1566 rip->ri_i.i_fmri);
1544 1567 } else {
1545 1568 log_framework(LOG_DEBUG, "%s: "
1546 1569 "restarter_inst_validate_ractions_aux_fmri "
1547 1570 "failed: ", rip->ri_i.i_fmri);
1548 1571
1549 1572 if (restarter_inst_reset_aux_fmri(scf_inst))
1550 1573 log_framework(LOG_DEBUG, "%s: "
1551 1574 "restarter_inst_reset_aux_fmri failed: ",
1552 1575 rip->ri_i.i_fmri);
1553 1576 }
1554 1577 scf_instance_destroy(scf_inst);
1555 1578 }
1556 1579
1557 1580 if (immediate || !instance_started(rip)) {
1558 1581 if (rip->ri_i.i_primary_ctid != 0) {
1559 1582 rip->ri_m_inst = safe_scf_instance_create(h);
1560 1583 rip->ri_mi_deleted = B_FALSE;
1561 1584
1562 1585 libscf_reget_instance(rip);
1563 1586 method_remove_contract(rip, B_TRUE, B_TRUE);
1564 1587
1565 1588 scf_instance_destroy(rip->ri_m_inst);
1566 1589 }
1567 1590
1568 1591 (void) restarter_instance_update_states(h, rip,
1569 1592 RESTARTER_STATE_MAINT, RESTARTER_STATE_NONE, RERR_RESTART,
1570 1593 reason);
1571 1594 return;
1572 1595 }
1573 1596
1574 1597 (void) restarter_instance_update_states(h, rip, rip->ri_i.i_state,
1575 1598 RESTARTER_STATE_MAINT, RERR_NONE, reason);
1576 1599
1577 1600 log_transition(rip, MAINT_REQUESTED);
1578 1601
1579 1602 info = startd_zalloc(sizeof (*info));
1580 1603 info->sf_id = rip->ri_id;
1581 1604 info->sf_method_type = METHOD_STOP;
1582 1605 info->sf_event_type = RERR_RESTART;
1583 1606 info->sf_reason = reason;
1584 1607 rip->ri_method_thread = startd_thread_create(method_thread, info);
1585 1608 }
1586 1609
1587 1610 static void
1588 1611 refresh_instance(scf_handle_t *h, restarter_inst_t *rip)
1589 1612 {
1590 1613 scf_instance_t *inst;
1591 1614 scf_snapshot_t *snap;
1592 1615 fork_info_t *info;
1593 1616 int r;
1594 1617
1595 1618 assert(MUTEX_HELD(&rip->ri_lock));
1596 1619
1597 1620 log_instance(rip, B_TRUE, "Rereading configuration.");
1598 1621 log_framework(LOG_DEBUG, "%s: rereading configuration.\n",
1599 1622 rip->ri_i.i_fmri);
1600 1623
1601 1624 rep_retry:
1602 1625 r = libscf_fmri_get_instance(h, rip->ri_i.i_fmri, &inst);
1603 1626 switch (r) {
1604 1627 case 0:
1605 1628 break;
1606 1629
1607 1630 case ECONNABORTED:
1608 1631 libscf_handle_rebind(h);
1609 1632 goto rep_retry;
1610 1633
1611 1634 case ENOENT:
1612 1635 /* Must have been deleted. */
1613 1636 return;
1614 1637
1615 1638 case EINVAL:
1616 1639 case ENOTSUP:
1617 1640 default:
1618 1641 bad_error("libscf_fmri_get_instance", r);
1619 1642 }
1620 1643
1621 1644 snap = libscf_get_running_snapshot(inst);
1622 1645
1623 1646 r = libscf_get_startd_properties(inst, snap, &rip->ri_flags,
1624 1647 &rip->ri_utmpx_prefix);
1625 1648 switch (r) {
1626 1649 case 0:
1627 1650 log_framework(LOG_DEBUG, "%s is a %s-style service\n",
1628 1651 rip->ri_i.i_fmri, service_style(rip->ri_flags));
1629 1652 break;
1630 1653
1631 1654 case ECONNABORTED:
1632 1655 scf_instance_destroy(inst);
1633 1656 scf_snapshot_destroy(snap);
1634 1657 libscf_handle_rebind(h);
1635 1658 goto rep_retry;
1636 1659
1637 1660 case ECANCELED:
1638 1661 case ENOENT:
1639 1662 /* Succeed in anticipation of REMOVE_INSTANCE. */
1640 1663 break;
1641 1664
1642 1665 default:
1643 1666 bad_error("libscf_get_startd_properties", r);
1644 1667 }
1645 1668
1646 1669 if (instance_started(rip)) {
1647 1670 /* Refresh does not change the state. */
1648 1671 (void) restarter_instance_update_states(h, rip,
1649 1672 rip->ri_i.i_state, rip->ri_i.i_state, RERR_NONE,
1650 1673 restarter_str_refresh);
1651 1674
1652 1675 info = startd_zalloc(sizeof (*info));
1653 1676 info->sf_id = rip->ri_id;
1654 1677 info->sf_method_type = METHOD_REFRESH;
1655 1678 info->sf_event_type = RERR_REFRESH;
1656 1679 info->sf_reason = NULL;
1657 1680
1658 1681 assert(rip->ri_method_thread == 0);
1659 1682 rip->ri_method_thread =
1660 1683 startd_thread_create(method_thread, info);
1661 1684 }
1662 1685
1663 1686 scf_snapshot_destroy(snap);
1664 1687 scf_instance_destroy(inst);
1665 1688 }
1666 1689
/*
 * Printable names for restarter events, indexed by event type when
 * logging.  The order of the entries is significant.
 */
const char *event_names[] = {
	"INVALID",
	"ADD_INSTANCE",
	"REMOVE_INSTANCE",
	"ENABLE",
	"DISABLE",
	"ADMIN_DEGRADED",
	"ADMIN_REFRESH",
	"ADMIN_RESTART",
	"ADMIN_MAINT_OFF",
	"ADMIN_MAINT_ON",
	"ADMIN_MAINT_ON_IMMEDIATE",
	"STOP",
	"START",
	"DEPENDENCY_CYCLE",
	"INVALID_DEPENDENCY",
	"ADMIN_DISABLE",
	"STOP_RESET"
};
1673 1696
1674 1697 /*
1675 1698 * void *restarter_process_events()
1676 1699 *
1677 1700 * Called in a separate thread to process the events on an instance's
1678 1701 * queue. Empties the queue completely, and tries to keep the thread
1679 1702 * around for a little while after the queue is empty to save on
1680 1703 * startup costs.
1681 1704 */
1682 1705 static void *
1683 1706 restarter_process_events(void *arg)
1684 1707 {
1685 1708 scf_handle_t *h;
1686 1709 restarter_instance_qentry_t *event;
1687 1710 restarter_inst_t *rip;
1688 1711 char *fmri = (char *)arg;
1689 1712 struct timespec to;
1690 1713
1691 1714 assert(fmri != NULL);
1692 1715
1693 1716 h = libscf_handle_create_bound_loop();
1694 1717
1695 1718 /* grab the queue lock */
1696 1719 rip = inst_lookup_queue(fmri);
1697 1720 if (rip == NULL)
1698 1721 goto out;
1699 1722
1700 1723 again:
1701 1724
1702 1725 while ((event = uu_list_first(rip->ri_queue)) != NULL) {
1703 1726 restarter_inst_t *inst;
1704 1727
1705 1728 /* drop the queue lock */
1706 1729 MUTEX_UNLOCK(&rip->ri_queue_lock);
1707 1730
1708 1731 /*
1709 1732 * Grab the inst lock -- this waits until any outstanding
1710 1733 * method finishes running.
1711 1734 */
1712 1735 inst = inst_lookup_by_name(fmri);
1713 1736 if (inst == NULL) {
1714 1737 /* Getting deleted in the middle isn't an error. */
1715 1738 goto cont;
1716 1739 }
1717 1740
1718 1741 assert(instance_in_transition(inst) == 0);
1719 1742
1720 1743 /* process the event */
1721 1744 switch (event->riq_type) {
1722 1745 case RESTARTER_EVENT_TYPE_ENABLE:
1723 1746 case RESTARTER_EVENT_TYPE_DISABLE:
1724 1747 (void) enable_inst(h, inst, event);
1725 1748 break;
1726 1749
1727 1750 case RESTARTER_EVENT_TYPE_ADMIN_DISABLE:
1728 1751 if (enable_inst(h, inst, event) == 0)
1729 1752 reset_start_times(inst);
1730 1753 break;
1731 1754
1732 1755 case RESTARTER_EVENT_TYPE_REMOVE_INSTANCE:
1733 1756 restarter_delete_inst(inst);
1734 1757 inst = NULL;
1735 1758 goto cont;
1736 1759
1737 1760 case RESTARTER_EVENT_TYPE_STOP_RESET:
1738 1761 reset_start_times(inst);
1739 1762 /* FALLTHROUGH */
1740 1763 case RESTARTER_EVENT_TYPE_STOP:
1741 1764 (void) stop_instance(h, inst, RSTOP_DEPENDENCY);
1742 1765 break;
1743 1766
1744 1767 case RESTARTER_EVENT_TYPE_START:
1745 1768 start_instance(h, inst, event->riq_reason);
1746 1769 break;
1747 1770
1748 1771 case RESTARTER_EVENT_TYPE_DEPENDENCY_CYCLE:
1749 1772 maintain_instance(h, inst, 0,
1750 1773 restarter_str_dependency_cycle);
1751 1774 break;
1752 1775
1753 1776 case RESTARTER_EVENT_TYPE_INVALID_DEPENDENCY:
1754 1777 maintain_instance(h, inst, 0,
1755 1778 restarter_str_invalid_dependency);
1756 1779 break;
1757 1780
1758 1781 case RESTARTER_EVENT_TYPE_ADMIN_MAINT_ON:
1759 1782 if (event_from_tty(h, inst) == 0)
1760 1783 maintain_instance(h, inst, 0,
1761 1784 restarter_str_service_request);
1762 1785 else
1763 1786 maintain_instance(h, inst, 0,
1764 1787 restarter_str_administrative_request);
1765 1788 break;
1766 1789
1767 1790 case RESTARTER_EVENT_TYPE_ADMIN_MAINT_ON_IMMEDIATE:
1768 1791 if (event_from_tty(h, inst) == 0)
1769 1792 maintain_instance(h, inst, 1,
1770 1793 restarter_str_service_request);
1771 1794 else
1772 1795 maintain_instance(h, inst, 1,
1773 1796 restarter_str_administrative_request);
1774 1797 break;
1775 1798
1776 1799 case RESTARTER_EVENT_TYPE_ADMIN_MAINT_OFF:
1777 1800 unmaintain_instance(h, inst, RUNMAINT_CLEAR);
1778 1801 reset_start_times(inst);
1779 1802 break;
1780 1803
1781 1804 case RESTARTER_EVENT_TYPE_ADMIN_REFRESH:
1782 1805 refresh_instance(h, inst);
1783 1806 break;
1784 1807
1785 1808 case RESTARTER_EVENT_TYPE_ADMIN_DEGRADED:
1786 1809 log_framework(LOG_WARNING, "Restarter: "
1787 1810 "%s command (for %s) unimplemented.\n",
1788 1811 event_names[event->riq_type], inst->ri_i.i_fmri);
1789 1812 break;
1790 1813
1791 1814 case RESTARTER_EVENT_TYPE_ADMIN_RESTART:
1792 1815 if (!instance_started(inst)) {
1793 1816 log_framework(LOG_DEBUG, "Restarter: "
1794 1817 "Not restarting %s; not running.\n",
1795 1818 inst->ri_i.i_fmri);
1796 1819 } else {
1797 1820 /*
1798 1821 * Stop the instance. If it can be restarted,
1799 1822 * the graph engine will send a new event.
1800 1823 */
1801 1824 if (stop_instance(h, inst, RSTOP_RESTART) == 0)
1802 1825 reset_start_times(inst);
1803 1826 }
1804 1827 break;
1805 1828
1806 1829 case RESTARTER_EVENT_TYPE_ADD_INSTANCE:
1807 1830 default:
1808 1831 #ifndef NDEBUG
1809 1832 uu_warn("%s:%d: Bad restarter event %d. "
1810 1833 "Aborting.\n", __FILE__, __LINE__, event->riq_type);
1811 1834 #endif
1812 1835 abort();
1813 1836 }
1814 1837
1815 1838 assert(inst != NULL);
1816 1839 MUTEX_UNLOCK(&inst->ri_lock);
1817 1840
1818 1841 cont:
1819 1842 /* grab the queue lock */
1820 1843 rip = inst_lookup_queue(fmri);
1821 1844 if (rip == NULL)
1822 1845 goto out;
1823 1846
1824 1847 /* delete the event */
1825 1848 uu_list_remove(rip->ri_queue, event);
1826 1849 startd_free(event, sizeof (restarter_instance_qentry_t));
1827 1850 }
1828 1851
1829 1852 assert(rip != NULL);
1830 1853
1831 1854 /*
1832 1855 * Try to preserve the thread for a little while for future use.
1833 1856 */
↓ open down ↓ |
1078 lines elided |
↑ open up ↑ |
1834 1857 to.tv_sec = 3;
1835 1858 to.tv_nsec = 0;
1836 1859 (void) pthread_cond_reltimedwait_np(&rip->ri_queue_cv,
1837 1860 &rip->ri_queue_lock, &to);
1838 1861
1839 1862 if (uu_list_first(rip->ri_queue) != NULL)
1840 1863 goto again;
1841 1864
1842 1865 rip->ri_queue_thread = 0;
1843 1866 MUTEX_UNLOCK(&rip->ri_queue_lock);
1867 +
1844 1868 out:
1845 1869 (void) scf_handle_unbind(h);
1846 1870 scf_handle_destroy(h);
1847 1871 free(fmri);
1848 1872 return (NULL);
1849 1873 }
1850 1874
1851 1875 static int
1852 1876 is_admin_event(restarter_event_type_t t) {
1853 1877
1854 1878 switch (t) {
1855 1879 case RESTARTER_EVENT_TYPE_ADMIN_MAINT_ON:
1856 1880 case RESTARTER_EVENT_TYPE_ADMIN_MAINT_ON_IMMEDIATE:
1857 1881 case RESTARTER_EVENT_TYPE_ADMIN_MAINT_OFF:
1858 1882 case RESTARTER_EVENT_TYPE_ADMIN_REFRESH:
1859 1883 case RESTARTER_EVENT_TYPE_ADMIN_DEGRADED:
1860 1884 case RESTARTER_EVENT_TYPE_ADMIN_RESTART:
1861 1885 return (1);
1862 1886 default:
1863 1887 return (0);
1864 1888 }
1865 1889 }
1866 1890
1867 1891 static void
1868 1892 restarter_queue_event(restarter_inst_t *ri, restarter_protocol_event_t *e)
1869 1893 {
1870 1894 restarter_instance_qentry_t *qe;
1871 1895 int r;
1872 1896
1873 1897 assert(MUTEX_HELD(&ri->ri_queue_lock));
1874 1898 assert(!MUTEX_HELD(&ri->ri_lock));
1875 1899
1876 1900 qe = startd_zalloc(sizeof (restarter_instance_qentry_t));
1877 1901 qe->riq_type = e->rpe_type;
1878 1902 qe->riq_reason = e->rpe_reason;
1879 1903
1880 1904 uu_list_node_init(qe, &qe->riq_link, restarter_queue_pool);
1881 1905 r = uu_list_insert_before(ri->ri_queue, NULL, qe);
1882 1906 assert(r == 0);
1883 1907 }
1884 1908
1885 1909 /*
1886 1910 * void *restarter_event_thread()
1887 1911 *
1888 1912 * Handle incoming graph events by placing them on a per-instance
1889 1913 * queue. We can't lock the main part of the instance structure, so
1890 1914 * just modify the seprarately locked event queue portion.
1891 1915 */
1892 1916 /*ARGSUSED*/
1893 1917 static void *
1894 1918 restarter_event_thread(void *unused)
1895 1919 {
1896 1920 scf_handle_t *h;
1897 1921
1898 1922 /*
1899 1923 * This is a new thread, and thus, gets its own handle
1900 1924 * to the repository.
1901 1925 */
1902 1926 h = libscf_handle_create_bound_loop();
1903 1927
1904 1928 MUTEX_LOCK(&ru->restarter_update_lock);
1905 1929
1906 1930 /*CONSTCOND*/
1907 1931 while (1) {
1908 1932 restarter_protocol_event_t *e;
1909 1933
1910 1934 while (ru->restarter_update_wakeup == 0)
1911 1935 (void) pthread_cond_wait(&ru->restarter_update_cv,
1912 1936 &ru->restarter_update_lock);
1913 1937
1914 1938 ru->restarter_update_wakeup = 0;
1915 1939
1916 1940 while ((e = restarter_event_dequeue()) != NULL) {
1917 1941 restarter_inst_t *rip;
1918 1942 char *fmri;
1919 1943
1920 1944 MUTEX_UNLOCK(&ru->restarter_update_lock);
1921 1945
1922 1946 /*
1923 1947 * ADD_INSTANCE is special: there's likely no
1924 1948 * instance structure yet, so we need to handle the
1925 1949 * addition synchronously.
1926 1950 */
1927 1951 switch (e->rpe_type) {
1928 1952 case RESTARTER_EVENT_TYPE_ADD_INSTANCE:
1929 1953 if (restarter_insert_inst(h, e->rpe_inst) != 0)
1930 1954 log_error(LOG_INFO, "Restarter: "
1931 1955 "Could not add %s.\n", e->rpe_inst);
1932 1956
1933 1957 MUTEX_LOCK(&st->st_load_lock);
1934 1958 if (--st->st_load_instances == 0)
1935 1959 (void) pthread_cond_broadcast(
1936 1960 &st->st_load_cv);
1937 1961 MUTEX_UNLOCK(&st->st_load_lock);
1938 1962
1939 1963 goto nolookup;
1940 1964 }
1941 1965
1942 1966 /*
1943 1967 * Lookup the instance, locking only the event queue.
1944 1968 * Can't grab ri_lock here because it might be held
1945 1969 * by a long-running method.
1946 1970 */
1947 1971 rip = inst_lookup_queue(e->rpe_inst);
1948 1972 if (rip == NULL) {
1949 1973 log_error(LOG_INFO, "Restarter: "
1950 1974 "Ignoring %s command for unknown service "
1951 1975 "%s.\n", event_names[e->rpe_type],
1952 1976 e->rpe_inst);
1953 1977 goto nolookup;
1954 1978 }
1955 1979
1956 1980 /* Keep ADMIN events from filling up the queue. */
1957 1981 if (is_admin_event(e->rpe_type) &&
1958 1982 uu_list_numnodes(rip->ri_queue) >
1959 1983 RINST_QUEUE_THRESHOLD) {
1960 1984 MUTEX_UNLOCK(&rip->ri_queue_lock);
1961 1985 log_instance(rip, B_TRUE, "Instance event "
1962 1986 "queue overflow. Dropping administrative "
1963 1987 "request.");
1964 1988 log_framework(LOG_DEBUG, "%s: Instance event "
1965 1989 "queue overflow. Dropping administrative "
1966 1990 "request.\n", rip->ri_i.i_fmri);
1967 1991 goto nolookup;
1968 1992 }
1969 1993
1970 1994 /* Now add the event to the instance queue. */
1971 1995 restarter_queue_event(rip, e);
1972 1996
1973 1997 if (rip->ri_queue_thread == 0) {
1974 1998 /*
1975 1999 * Start a thread if one isn't already
1976 2000 * running.
1977 2001 */
1978 2002 fmri = safe_strdup(e->rpe_inst);
1979 2003 rip->ri_queue_thread = startd_thread_create(
1980 2004 restarter_process_events, (void *)fmri);
1981 2005 } else {
1982 2006 /*
1983 2007 * Signal the existing thread that there's
1984 2008 * a new event.
1985 2009 */
1986 2010 (void) pthread_cond_broadcast(
1987 2011 &rip->ri_queue_cv);
1988 2012 }
1989 2013
1990 2014 MUTEX_UNLOCK(&rip->ri_queue_lock);
1991 2015 nolookup:
1992 2016 restarter_event_release(e);
1993 2017
1994 2018 MUTEX_LOCK(&ru->restarter_update_lock);
1995 2019 }
1996 2020 }
1997 2021
1998 2022 /*
1999 2023 * Unreachable for now -- there's currently no graceful cleanup
2000 2024 * called on exit().
2001 2025 */
2002 2026 (void) scf_handle_unbind(h);
2003 2027 scf_handle_destroy(h);
2004 2028 return (NULL);
2005 2029 }
2006 2030
2007 2031 static restarter_inst_t *
2008 2032 contract_to_inst(ctid_t ctid)
2009 2033 {
2010 2034 restarter_inst_t *inst;
2011 2035 int id;
2012 2036
2013 2037 id = lookup_inst_by_contract(ctid);
2014 2038 if (id == -1)
2015 2039 return (NULL);
2016 2040
2017 2041 inst = inst_lookup_by_id(id);
2018 2042 if (inst != NULL) {
2019 2043 /*
2020 2044 * Since ri_lock isn't held by the contract id lookup, this
2021 2045 * instance may have been restarted and now be in a new
2022 2046 * contract, making the old contract no longer valid for this
2023 2047 * instance.
2024 2048 */
2025 2049 if (ctid != inst->ri_i.i_primary_ctid) {
2026 2050 MUTEX_UNLOCK(&inst->ri_lock);
2027 2051 inst = NULL;
2028 2052 }
2029 2053 }
2030 2054 return (inst);
2031 2055 }
2032 2056
2033 2057 /*
2034 2058 * void contract_action()
2035 2059 * Take action on contract events.
2036 2060 */
2037 2061 static void
2038 2062 contract_action(scf_handle_t *h, restarter_inst_t *inst, ctid_t id,
2039 2063 uint32_t type)
2040 2064 {
2041 2065 const char *fmri = inst->ri_i.i_fmri;
2042 2066
2043 2067 assert(MUTEX_HELD(&inst->ri_lock));
2044 2068
2045 2069 /*
2046 2070 * If startd has stopped this contract, there is no need to
2047 2071 * stop it again.
2048 2072 */
2049 2073 if (inst->ri_i.i_primary_ctid > 0 &&
2050 2074 inst->ri_i.i_primary_ctid_stopped)
2051 2075 return;
2052 2076
2053 2077 if ((type & (CT_PR_EV_EMPTY | CT_PR_EV_CORE | CT_PR_EV_SIGNAL
2054 2078 | CT_PR_EV_HWERR)) == 0) {
2055 2079 /*
2056 2080 * There shouldn't be other events, since that's not how we set
2057 2081 * the terms. Thus, just log an error and drive on.
2058 2082 */
2059 2083 log_framework(LOG_NOTICE,
2060 2084 "%s: contract %ld received unexpected critical event "
2061 2085 "(%d)\n", fmri, id, type);
2062 2086 return;
2063 2087 }
2064 2088
2065 2089 assert(instance_in_transition(inst) == 0);
2066 2090
2067 2091 if (instance_is_wait_style(inst)) {
2068 2092 /*
2069 2093 * We ignore all events; if they impact the
2070 2094 * process we're monitoring, then the
2071 2095 * wait_thread will stop the instance.
2072 2096 */
2073 2097 log_framework(LOG_DEBUG,
2074 2098 "%s: ignoring contract event on wait-style service\n",
2075 2099 fmri);
2076 2100 } else {
2077 2101 /*
2078 2102 * A CT_PR_EV_EMPTY event is an RSTOP_EXIT request.
2079 2103 */
2080 2104 switch (type) {
2081 2105 case CT_PR_EV_EMPTY:
2082 2106 (void) stop_instance(h, inst, RSTOP_EXIT);
2083 2107 break;
2084 2108 case CT_PR_EV_CORE:
2085 2109 (void) stop_instance(h, inst, RSTOP_CORE);
2086 2110 break;
2087 2111 case CT_PR_EV_SIGNAL:
2088 2112 (void) stop_instance(h, inst, RSTOP_SIGNAL);
2089 2113 break;
2090 2114 case CT_PR_EV_HWERR:
2091 2115 (void) stop_instance(h, inst, RSTOP_HWERR);
2092 2116 break;
2093 2117 }
2094 2118 }
2095 2119 }
2096 2120
2097 2121 /*
2098 2122 * void *restarter_contract_event_thread(void *)
2099 2123 * Listens to the process contract bundle for critical events, taking action
2100 2124 * on events from contracts we know we are responsible for.
2101 2125 */
2102 2126 /*ARGSUSED*/
2103 2127 static void *
2104 2128 restarter_contracts_event_thread(void *unused)
2105 2129 {
2106 2130 int fd, err;
2107 2131 scf_handle_t *local_handle;
2108 2132
2109 2133 /*
2110 2134 * Await graph load completion. That is, stop here, until we've scanned
2111 2135 * the repository for contract - instance associations.
2112 2136 */
2113 2137 MUTEX_LOCK(&st->st_load_lock);
2114 2138 while (!(st->st_load_complete && st->st_load_instances == 0))
2115 2139 (void) pthread_cond_wait(&st->st_load_cv, &st->st_load_lock);
2116 2140 MUTEX_UNLOCK(&st->st_load_lock);
2117 2141
2118 2142 /*
2119 2143 * This is a new thread, and thus, gets its own handle
2120 2144 * to the repository.
2121 2145 */
2122 2146 if ((local_handle = libscf_handle_create_bound(SCF_VERSION)) == NULL)
2123 2147 uu_die("Unable to bind a new repository handle: %s\n",
2124 2148 scf_strerror(scf_error()));
2125 2149
2126 2150 fd = open64(CTFS_ROOT "/process/pbundle", O_RDONLY);
2127 2151 if (fd == -1)
2128 2152 uu_die("process bundle open failed");
2129 2153
2130 2154 /*
2131 2155 * Make sure we get all events (including those generated by configd
2132 2156 * before this thread was started).
2133 2157 */
2134 2158 err = ct_event_reset(fd);
2135 2159 assert(err == 0);
2136 2160
2137 2161 for (;;) {
2138 2162 int efd, sfd;
2139 2163 ct_evthdl_t ev;
2140 2164 uint32_t type;
2141 2165 ctevid_t evid;
2142 2166 ct_stathdl_t status;
2143 2167 ctid_t ctid;
2144 2168 restarter_inst_t *inst;
2145 2169 uint64_t cookie;
2146 2170
2147 2171 if (err = ct_event_read_critical(fd, &ev)) {
2148 2172 log_error(LOG_WARNING,
2149 2173 "Error reading next contract event: %s",
2150 2174 strerror(err));
2151 2175 continue;
2152 2176 }
2153 2177
2154 2178 evid = ct_event_get_evid(ev);
2155 2179 ctid = ct_event_get_ctid(ev);
2156 2180 type = ct_event_get_type(ev);
2157 2181
2158 2182 /* Fetch cookie. */
2159 2183 if ((sfd = contract_open(ctid, "process", "status", O_RDONLY))
2160 2184 < 0) {
2161 2185 ct_event_free(ev);
2162 2186 continue;
2163 2187 }
2164 2188
2165 2189 if (err = ct_status_read(sfd, CTD_COMMON, &status)) {
2166 2190 log_framework(LOG_WARNING, "Could not get status for "
2167 2191 "contract %ld: %s\n", ctid, strerror(err));
2168 2192
2169 2193 startd_close(sfd);
2170 2194 ct_event_free(ev);
2171 2195 continue;
2172 2196 }
2173 2197
2174 2198 cookie = ct_status_get_cookie(status);
2175 2199
2176 2200 log_framework(LOG_DEBUG, "Received event %d for ctid %ld "
2177 2201 "cookie %lld\n", type, ctid, cookie);
2178 2202
2179 2203 ct_status_free(status);
2180 2204
2181 2205 startd_close(sfd);
2182 2206
2183 2207 /*
2184 2208 * svc.configd(1M) restart handling performed by the
2185 2209 * fork_configd_thread. We don't acknowledge, as that thread
2186 2210 * will do so.
2187 2211 */
2188 2212 if (cookie == CONFIGD_COOKIE) {
2189 2213 ct_event_free(ev);
2190 2214 continue;
2191 2215 }
2192 2216
2193 2217 inst = NULL;
2194 2218 if (storing_contract != 0 &&
2195 2219 (inst = contract_to_inst(ctid)) == NULL) {
2196 2220 /*
2197 2221 * This can happen for two reasons:
2198 2222 * - method_run() has not yet stored the
2199 2223 * the contract into the internal hash table.
2200 2224 * - we receive an EMPTY event for an abandoned
2201 2225 * contract.
2202 2226 * If there is any contract in the process of
2203 2227 * being stored into the hash table then re-read
2204 2228 * the event later.
2205 2229 */
2206 2230 log_framework(LOG_DEBUG,
2207 2231 "Reset event %d for unknown "
2208 2232 "contract id %ld\n", type, ctid);
2209 2233
2210 2234 /* don't go too fast */
2211 2235 (void) poll(NULL, 0, 100);
2212 2236
2213 2237 (void) ct_event_reset(fd);
2214 2238 ct_event_free(ev);
2215 2239 continue;
2216 2240 }
2217 2241
2218 2242 /*
2219 2243 * Do not call contract_to_inst() again if first
2220 2244 * call succeeded.
2221 2245 */
2222 2246 if (inst == NULL)
2223 2247 inst = contract_to_inst(ctid);
2224 2248 if (inst == NULL) {
2225 2249 /*
2226 2250 * This can happen if we receive an EMPTY
2227 2251 * event for an abandoned contract.
2228 2252 */
2229 2253 log_framework(LOG_DEBUG,
2230 2254 "Received event %d for unknown contract id "
2231 2255 "%ld\n", type, ctid);
2232 2256 } else {
2233 2257 log_framework(LOG_DEBUG,
2234 2258 "Received event %d for contract id "
2235 2259 "%ld (%s)\n", type, ctid,
2236 2260 inst->ri_i.i_fmri);
2237 2261
2238 2262 contract_action(local_handle, inst, ctid, type);
2239 2263
2240 2264 MUTEX_UNLOCK(&inst->ri_lock);
2241 2265 }
2242 2266
2243 2267 efd = contract_open(ct_event_get_ctid(ev), "process", "ctl",
2244 2268 O_WRONLY);
2245 2269 if (efd != -1) {
2246 2270 (void) ct_ctl_ack(efd, evid);
2247 2271 startd_close(efd);
2248 2272 }
2249 2273
2250 2274 ct_event_free(ev);
2251 2275
2252 2276 }
2253 2277
2254 2278 /*NOTREACHED*/
2255 2279 return (NULL);
2256 2280 }
2257 2281
2258 2282 /*
2259 2283 * Timeout queue, processed by restarter_timeouts_event_thread().
2260 2284 */
2261 2285 timeout_queue_t *timeouts;
2262 2286 static uu_list_pool_t *timeout_pool;
2263 2287
2264 2288 typedef struct timeout_update {
2265 2289 pthread_mutex_t tu_lock;
2266 2290 pthread_cond_t tu_cv;
2267 2291 int tu_wakeup;
2268 2292 } timeout_update_t;
2269 2293
2270 2294 timeout_update_t *tu;
2271 2295
/*
 * FMRIs for which is_timeout_ovr() reports a timeout override.  The table
 * is NULL-terminated.
 */
static const char *timeout_ovr_svcs[] = {
	"svc:/system/manifest-import:default",
	"svc:/network/initial:default",
	"svc:/network/service:default",
	"svc:/system/rmtmpfiles:default",
	"svc:/network/loopback:default",
	"svc:/network/physical:default",
	"svc:/system/device/local:default",
	"svc:/system/metainit:default",
	"svc:/system/filesystem/usr:default",
	"svc:/system/filesystem/minimal:default",
	"svc:/system/filesystem/local:default",
	NULL
};
2286 2310
2287 2311 int
2288 2312 is_timeout_ovr(restarter_inst_t *inst)
2289 2313 {
2290 2314 int i;
2291 2315
2292 2316 for (i = 0; timeout_ovr_svcs[i] != NULL; ++i) {
2293 2317 if (strcmp(inst->ri_i.i_fmri, timeout_ovr_svcs[i]) == 0) {
2294 2318 log_instance(inst, B_TRUE, "Timeout override by "
2295 2319 "svc.startd. Using infinite timeout.");
2296 2320 return (1);
2297 2321 }
2298 2322 }
2299 2323
2300 2324 return (0);
2301 2325 }
2302 2326
2303 2327 /*ARGSUSED*/
2304 2328 static int
2305 2329 timeout_compare(const void *lc_arg, const void *rc_arg, void *private)
2306 2330 {
2307 2331 hrtime_t t1 = ((const timeout_entry_t *)lc_arg)->te_timeout;
2308 2332 hrtime_t t2 = ((const timeout_entry_t *)rc_arg)->te_timeout;
2309 2333
2310 2334 if (t1 > t2)
2311 2335 return (1);
2312 2336 else if (t1 < t2)
2313 2337 return (-1);
2314 2338 return (0);
2315 2339 }
2316 2340
2317 2341 void
2318 2342 timeout_init()
2319 2343 {
2320 2344 timeouts = startd_zalloc(sizeof (timeout_queue_t));
2321 2345
2322 2346 (void) pthread_mutex_init(&timeouts->tq_lock, &mutex_attrs);
2323 2347
2324 2348 timeout_pool = startd_list_pool_create("timeouts",
2325 2349 sizeof (timeout_entry_t), offsetof(timeout_entry_t, te_link),
2326 2350 timeout_compare, UU_LIST_POOL_DEBUG);
2327 2351 assert(timeout_pool != NULL);
2328 2352
2329 2353 timeouts->tq_list = startd_list_create(timeout_pool,
2330 2354 timeouts, UU_LIST_SORTED);
2331 2355 assert(timeouts->tq_list != NULL);
2332 2356
2333 2357 tu = startd_zalloc(sizeof (timeout_update_t));
2334 2358 (void) pthread_cond_init(&tu->tu_cv, NULL);
2335 2359 (void) pthread_mutex_init(&tu->tu_lock, &mutex_attrs);
2336 2360 }
2337 2361
2338 2362 void
2339 2363 timeout_insert(restarter_inst_t *inst, ctid_t cid, uint64_t timeout_sec)
2340 2364 {
2341 2365 hrtime_t now, timeout;
2342 2366 timeout_entry_t *entry;
2343 2367 uu_list_index_t idx;
2344 2368
2345 2369 assert(MUTEX_HELD(&inst->ri_lock));
2346 2370
2347 2371 now = gethrtime();
2348 2372
2349 2373 /*
2350 2374 * If we overflow LLONG_MAX, we're never timing out anyways, so
2351 2375 * just return.
2352 2376 */
2353 2377 if (timeout_sec >= (LLONG_MAX - now) / 1000000000LL) {
2354 2378 log_instance(inst, B_TRUE, "timeout_seconds too large, "
2355 2379 "treating as infinite.");
2356 2380 return;
2357 2381 }
2358 2382
2359 2383 /* hrtime is in nanoseconds. Convert timeout_sec. */
2360 2384 timeout = now + (timeout_sec * 1000000000LL);
2361 2385
2362 2386 entry = startd_alloc(sizeof (timeout_entry_t));
2363 2387 entry->te_timeout = timeout;
2364 2388 entry->te_ctid = cid;
2365 2389 entry->te_fmri = safe_strdup(inst->ri_i.i_fmri);
2366 2390 entry->te_logstem = safe_strdup(inst->ri_logstem);
2367 2391 entry->te_fired = 0;
2368 2392 /* Insert the calculated timeout time onto the queue. */
2369 2393 MUTEX_LOCK(&timeouts->tq_lock);
2370 2394 (void) uu_list_find(timeouts->tq_list, entry, NULL, &idx);
2371 2395 uu_list_node_init(entry, &entry->te_link, timeout_pool);
2372 2396 uu_list_insert(timeouts->tq_list, entry, idx);
2373 2397 MUTEX_UNLOCK(&timeouts->tq_lock);
2374 2398
2375 2399 assert(inst->ri_timeout == NULL);
2376 2400 inst->ri_timeout = entry;
2377 2401
2378 2402 MUTEX_LOCK(&tu->tu_lock);
2379 2403 tu->tu_wakeup = 1;
2380 2404 (void) pthread_cond_broadcast(&tu->tu_cv);
2381 2405 MUTEX_UNLOCK(&tu->tu_lock);
2382 2406 }
2383 2407
2384 2408
2385 2409 void
2386 2410 timeout_remove(restarter_inst_t *inst, ctid_t cid)
2387 2411 {
2388 2412 assert(MUTEX_HELD(&inst->ri_lock));
2389 2413
2390 2414 if (inst->ri_timeout == NULL)
2391 2415 return;
2392 2416
2393 2417 assert(inst->ri_timeout->te_ctid == cid);
2394 2418
2395 2419 MUTEX_LOCK(&timeouts->tq_lock);
2396 2420 uu_list_remove(timeouts->tq_list, inst->ri_timeout);
2397 2421 MUTEX_UNLOCK(&timeouts->tq_lock);
2398 2422
2399 2423 free(inst->ri_timeout->te_fmri);
2400 2424 free(inst->ri_timeout->te_logstem);
2401 2425 startd_free(inst->ri_timeout, sizeof (timeout_entry_t));
2402 2426 inst->ri_timeout = NULL;
2403 2427 }
2404 2428
2405 2429 static int
2406 2430 timeout_now()
2407 2431 {
2408 2432 timeout_entry_t *e;
2409 2433 hrtime_t now;
2410 2434 int ret;
2411 2435
2412 2436 now = gethrtime();
2413 2437
2414 2438 /*
2415 2439 * Walk through the (sorted) timeouts list. While the timeout
2416 2440 * at the head of the list is <= the current time, kill the
2417 2441 * method.
2418 2442 */
2419 2443 MUTEX_LOCK(&timeouts->tq_lock);
2420 2444
2421 2445 for (e = uu_list_first(timeouts->tq_list);
2422 2446 e != NULL && e->te_timeout <= now;
2423 2447 e = uu_list_next(timeouts->tq_list, e)) {
2424 2448 log_framework(LOG_WARNING, "%s: Method or service exit timed "
2425 2449 "out. Killing contract %ld.\n", e->te_fmri, e->te_ctid);
2426 2450 log_instance_fmri(e->te_fmri, e->te_logstem, B_TRUE,
2427 2451 "Method or service exit timed out. Killing contract %ld.",
2428 2452 e->te_ctid);
2429 2453 e->te_fired = 1;
2430 2454 (void) contract_kill(e->te_ctid, SIGKILL, e->te_fmri);
2431 2455 }
2432 2456
2433 2457 if (uu_list_numnodes(timeouts->tq_list) > 0)
2434 2458 ret = 0;
2435 2459 else
2436 2460 ret = -1;
2437 2461
2438 2462 MUTEX_UNLOCK(&timeouts->tq_lock);
2439 2463
2440 2464 return (ret);
2441 2465 }
2442 2466
2443 2467 /*
2444 2468 * void *restarter_timeouts_event_thread(void *)
2445 2469 * Responsible for monitoring the method timeouts. This thread must
2446 2470 * be started before any methods are called.
2447 2471 */
2448 2472 /*ARGSUSED*/
2449 2473 static void *
2450 2474 restarter_timeouts_event_thread(void *unused)
2451 2475 {
2452 2476 /*
2453 2477 * Timeouts are entered on a priority queue, which is processed by
2454 2478 * this thread. As timeouts are specified in seconds, we'll do
2455 2479 * the necessary processing every second, as long as the queue
2456 2480 * is not empty.
2457 2481 */
2458 2482
2459 2483 /*CONSTCOND*/
2460 2484 while (1) {
2461 2485 /*
2462 2486 * As long as the timeout list isn't empty, process it
2463 2487 * every second.
2464 2488 */
2465 2489 if (timeout_now() == 0) {
2466 2490 (void) sleep(1);
2467 2491 continue;
2468 2492 }
2469 2493
2470 2494 /* The list is empty, wait until we have more timeouts. */
2471 2495 MUTEX_LOCK(&tu->tu_lock);
2472 2496
2473 2497 while (tu->tu_wakeup == 0)
2474 2498 (void) pthread_cond_wait(&tu->tu_cv, &tu->tu_lock);
2475 2499
2476 2500 tu->tu_wakeup = 0;
2477 2501 MUTEX_UNLOCK(&tu->tu_lock);
2478 2502 }
2479 2503
2480 2504 return (NULL);
2481 2505 }
2482 2506
2483 2507 void
2484 2508 restarter_start()
2485 2509 {
2486 2510 (void) startd_thread_create(restarter_timeouts_event_thread, NULL);
2487 2511 (void) startd_thread_create(restarter_event_thread, NULL);
2488 2512 (void) startd_thread_create(restarter_contracts_event_thread, NULL);
2489 2513 (void) startd_thread_create(wait_thread, NULL);
2490 2514 }
2491 2515
2492 2516
2493 2517 void
2494 2518 restarter_init()
2495 2519 {
2496 2520 restarter_instance_pool = startd_list_pool_create("restarter_instances",
2497 2521 sizeof (restarter_inst_t), offsetof(restarter_inst_t,
2498 2522 ri_link), restarter_instance_compare, UU_LIST_POOL_DEBUG);
2499 2523 (void) memset(&instance_list, 0, sizeof (instance_list));
2500 2524
2501 2525 (void) pthread_mutex_init(&instance_list.ril_lock, &mutex_attrs);
2502 2526 instance_list.ril_instance_list = startd_list_create(
2503 2527 restarter_instance_pool, &instance_list, UU_LIST_SORTED);
2504 2528
2505 2529 restarter_queue_pool = startd_list_pool_create(
2506 2530 "restarter_instance_queue", sizeof (restarter_instance_qentry_t),
2507 2531 offsetof(restarter_instance_qentry_t, riq_link), NULL,
2508 2532 UU_LIST_POOL_DEBUG);
2509 2533
2510 2534 contract_list_pool = startd_list_pool_create(
2511 2535 "contract_list", sizeof (contract_entry_t),
2512 2536 offsetof(contract_entry_t, ce_link), NULL,
2513 2537 UU_LIST_POOL_DEBUG);
2514 2538 contract_hash_init();
2515 2539
2516 2540 log_framework(LOG_DEBUG, "Initialized restarter\n");
2517 2541 }
↓ open down ↓ |
664 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX