1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  *
  25  * Copyright 2018 Joyent, Inc.
  26  */
  27 
  28 /*
  29  * wait.c - asynchronous monitoring of "wait registered" start methods
  30  *
  31  * Use event ports to poll on the set of fds representing the /proc/[pid]/psinfo
  32  * files.  If one of these fds returns an event, then we inform the restarter
  33  * that it has stopped.
  34  *
  35  * The wait_info_list holds the series of processes currently being monitored
  36  * for exit.  The wi_fd member, which contains the file descriptor of the psinfo
  37  * file being polled upon ("event ported upon"), will be set to -1 if the file
  38  * descriptor is inactive (already closed or not yet opened).
  39  */
  40 
  41 #ifdef _FILE_OFFSET_BITS
  42 #undef _FILE_OFFSET_BITS
  43 #endif /* _FILE_OFFSET_BITS */
  44 
  45 #include <sys/resource.h>
  46 #include <sys/stat.h>
  47 #include <sys/types.h>
  48 #include <sys/uio.h>
  49 #include <sys/wait.h>
  50 
  51 #include <assert.h>
  52 #include <errno.h>
  53 #include <fcntl.h>
  54 #include <libuutil.h>
  55 #include <poll.h>
  56 #include <port.h>
  57 #include <pthread.h>
  58 #include <procfs.h>
  59 #include <string.h>
  60 #include <stropts.h>
  61 #include <unistd.h>
  62 
  63 #include "startd.h"
  64 
/*
 * Value that wait_init() installs as both the soft and the hard
 * RLIMIT_NOFILE limit for this process.
 */
#define WAIT_FILES      262144          /* reasonably high maximum */

/*
 * Event port created by wait_init().  wait_register() and wait_thread()
 * associate psinfo file descriptors with it, and wait_thread() blocks on it
 * in port_get().
 */
static int port_fd;

/*
 * Repository handle passed to stop_instance_fmri() by wait_remove().  It
 * starts out NULL and is bound the first time wait_remove() needs it.
 */
static scf_handle_t *wait_hndl;

/*
 * RLIMIT_NOFILE values captured by wait_init() before it changes the limit;
 * wait_postfork() reinstates them in the child.
 */
static struct rlimit init_fd_rlimit;

/*
 * List (and its backing pool) of wait_info_t structures, one per process
 * currently being monitored for exit.
 */
static uu_list_pool_t *wait_info_pool;
static uu_list_t *wait_info_list;

/*
 * Taken around insertions into, removals from, and searches of
 * wait_info_list.  wait_prefork() acquires it and wait_postfork() releases
 * it.
 */
static pthread_mutex_t wait_info_lock;
  75 
  76 /*
  77  * void wait_remove(wait_info_t *, int)
  78  *   Remove the given wait_info structure from our list, performing various
  79  *   cleanup operations along the way.  If the direct flag is false (meaning
  80  *   that we are being called with from restarter instance list context) and
  81  *   the instance should not be ignored, then notify the restarter that the
  82  *   associated instance has exited. If the wi_ignore flag is true then it
  83  *   means that the stop was initiated from within svc.startd, rather than
  84  *   from outside it.
  85  *
  86  *   Since we may no longer be the startd that started this process, we only are
  87  *   concerned with a waitpid(3C) failure if the wi_parent field is non-zero.
  88  */
  89 static void
  90 wait_remove(wait_info_t *wi, int direct)
  91 {
  92         int status;
  93         stop_cause_t cause = RSTOP_EXIT;
  94 
  95         if (waitpid(wi->wi_pid, &status, 0) == -1) {
  96                 if (wi->wi_parent)
  97                         log_framework(LOG_INFO,
  98                             "instance %s waitpid failure: %s\n", wi->wi_fmri,
  99                             strerror(errno));
 100         } else {
 101                 if (WEXITSTATUS(status) != 0) {
 102                         log_framework(LOG_NOTICE,
 103                             "instance %s exited with status %d\n", wi->wi_fmri,
 104                             WEXITSTATUS(status));
 105                         if (WEXITSTATUS(status) == SMF_EXIT_ERR_CONFIG)
 106                                 cause = RSTOP_ERR_CFG;
 107                         else
 108                                 cause = RSTOP_ERR_EXIT;
 109                 }
 110         }
 111 
 112         MUTEX_LOCK(&wait_info_lock);
 113         if (wi->wi_fd != -1) {
 114                 startd_close(wi->wi_fd);
 115                 wi->wi_fd = -1;
 116         }
 117         uu_list_remove(wait_info_list, wi);
 118         MUTEX_UNLOCK(&wait_info_lock);
 119 
 120         /*
 121          * Make an attempt to clear out any utmpx record associated with this
 122          * PID.
 123          */
 124         utmpx_mark_dead(wi->wi_pid, status, B_FALSE);
 125 
 126         if (!direct && !wi->wi_ignore) {
 127                 /*
 128                  * Bind wait_hndl lazily.
 129                  */
 130                 if (wait_hndl == NULL) {
 131                         for (wait_hndl =
 132                             libscf_handle_create_bound(SCF_VERSION);
 133                             wait_hndl == NULL;
 134                             wait_hndl =
 135                             libscf_handle_create_bound(SCF_VERSION)) {
 136                                 log_error(LOG_INFO, "[wait_remove] Unable to "
 137                                     "bind a new repository handle: %s\n",
 138                                     scf_strerror(scf_error()));
 139                                 (void) sleep(2);
 140                         }
 141                 }
 142 
 143                 log_framework(LOG_DEBUG,
 144                     "wait_remove requesting stop of %s\n", wi->wi_fmri);
 145                 (void) stop_instance_fmri(wait_hndl, wi->wi_fmri, cause);
 146         }
 147 
 148         uu_list_node_fini(wi, &wi->wi_link, wait_info_pool);
 149         startd_free(wi, sizeof (wait_info_t));
 150 }
 151 
 152 /*
 153  * void wait_ignore_by_fmri(const char *)
 154  *   wait_ignore_by_fmri is called when svc.startd is going to stop the
 155  *   instance. Since we need to wait on the process and close the utmpx record,
 156  *   we're going to set the wi_ignore flag, so that when the process exits we
 157  *   clean up, but don't tell the restarter to stop it.
 158  */
 159 void
 160 wait_ignore_by_fmri(const char *fmri)
 161 {
 162         wait_info_t *wi;
 163 
 164         MUTEX_LOCK(&wait_info_lock);
 165 
 166         for (wi = uu_list_first(wait_info_list); wi != NULL;
 167             wi = uu_list_next(wait_info_list, wi)) {
 168                 if (strcmp(wi->wi_fmri, fmri) == 0)
 169                         break;
 170         }
 171 
 172         if (wi != NULL) {
 173                 wi->wi_ignore = 1;
 174         }
 175 
 176         MUTEX_UNLOCK(&wait_info_lock);
 177 }
 178 
 179 /*
 180  * int wait_register(pid_t, char *, int, int)
 181  *   wait_register is called after we have called fork(2), and know which pid we
 182  *   wish to monitor.  However, since the child may have already exited by the
 183  *   time we are called, we must handle the error cases from open(2)
 184  *   appropriately.  The am_parent flag is recorded to handle waitpid(2)
 185  *   behaviour on removal; similarly, the direct flag is passed through to a
 186  *   potential call to wait_remove() to govern its behaviour in different
 187  *   contexts.
 188  *
 189  *   Returns 0 if registration successful, 1 if child pid did not exist, and -1
 190  *   if a different error occurred.
 191  */
 192 int
 193 wait_register(pid_t pid, const char *inst_fmri, int am_parent, int direct)
 194 {
 195         char *fname = uu_msprintf("/proc/%ld/psinfo", pid);
 196         int fd;
 197         wait_info_t *wi;
 198 
 199         assert(pid != 0);
 200 
 201         if (fname == NULL)
 202                 return (-1);
 203 
 204         wi = startd_alloc(sizeof (wait_info_t));
 205 
 206         uu_list_node_init(wi, &wi->wi_link, wait_info_pool);
 207 
 208         wi->wi_fd = -1;
 209         wi->wi_pid = pid;
 210         wi->wi_fmri = inst_fmri;
 211         wi->wi_parent = am_parent;
 212         wi->wi_ignore = 0;
 213 
 214         MUTEX_LOCK(&wait_info_lock);
 215         (void) uu_list_insert_before(wait_info_list, NULL, wi);
 216         MUTEX_UNLOCK(&wait_info_lock);
 217 
 218         if ((fd = open(fname, O_RDONLY)) == -1) {
 219                 if (errno == ENOENT) {
 220                         /*
 221                          * Child has already exited.
 222                          */
 223                         wait_remove(wi, direct);
 224                         uu_free(fname);
 225                         return (1);
 226                 } else {
 227                         log_error(LOG_WARNING,
 228                             "open %s failed; not monitoring %s: %s\n", fname,
 229                             inst_fmri, strerror(errno));
 230                         uu_free(fname);
 231                         return (-1);
 232                 }
 233         }
 234 
 235         uu_free(fname);
 236 
 237         wi->wi_fd = fd;
 238 
 239         if (port_associate(port_fd, PORT_SOURCE_FD, fd, 0, wi)) {
 240                 log_error(LOG_WARNING,
 241                     "initial port_association of %d / %s failed: %s\n", fd,
 242                     inst_fmri, strerror(errno));
 243                 return (-1);
 244         }
 245 
 246         log_framework(LOG_DEBUG, "monitoring PID %ld on fd %d (%s)\n", pid, fd,
 247             inst_fmri);
 248 
 249         return (0);
 250 }
 251 
 252 /*ARGSUSED*/
 253 void *
 254 wait_thread(void *args)
 255 {
 256         (void) pthread_setname_np(pthread_self(), "wait");
 257 
 258         for (;;) {
 259                 port_event_t pe;
 260                 int fd;
 261                 wait_info_t *wi;
 262 
 263                 if (port_get(port_fd, &pe, NULL) != 0) {
 264                         if (errno == EINTR)
 265                                 continue;
 266                         else {
 267                                 log_error(LOG_WARNING,
 268                                     "port_get() failed with %s\n",
 269                                     strerror(errno));
 270                                 bad_error("port_get", errno);
 271                         }
 272                 }
 273 
 274                 fd = pe.portev_object;
 275                 wi = pe.portev_user;
 276                 assert(wi != NULL);
 277                 assert(fd == wi->wi_fd);
 278 
 279                 if ((pe.portev_events & POLLHUP) == POLLHUP) {
 280                         psinfo_t psi;
 281 
 282                         if (lseek(fd, 0, SEEK_SET) != 0 ||
 283                             read(fd, &psi, sizeof (psinfo_t)) !=
 284                             sizeof (psinfo_t)) {
 285                                 log_framework(LOG_WARNING,
 286                                     "couldn't get psinfo data for %s (%s); "
 287                                     "assuming failed\n", wi->wi_fmri,
 288                                     strerror(errno));
 289                                 goto err_remove;
 290                         }
 291 
 292                         if (psi.pr_nlwp != 0 ||
 293                             psi.pr_nzomb != 0 ||
 294                             psi.pr_lwp.pr_lwpid != 0) {
 295                                 /*
 296                                  * We have determined, in accordance with the
 297                                  * definition in proc(4), this process is not a
 298                                  * zombie.  Reassociate.
 299                                  */
 300                                 if (port_associate(port_fd, PORT_SOURCE_FD, fd,
 301                                     0, wi))
 302                                         log_error(LOG_WARNING,
 303                                             "port_association of %d / %s "
 304                                             "failed\n", fd, wi->wi_fmri);
 305                                 continue;
 306                         }
 307                 } else if (
 308                     (pe.portev_events & POLLERR) == 0) {
 309                         if (port_associate(port_fd, PORT_SOURCE_FD, fd, 0, wi))
 310                                 log_error(LOG_WARNING,
 311                                     "port_association of %d / %s "
 312                                     "failed\n", fd, wi->wi_fmri);
 313                         continue;
 314                 }
 315 
 316 err_remove:
 317                 wait_remove(wi, 0);
 318         }
 319 
 320         /*LINTED E_FUNC_HAS_NO_RETURN_STMT*/
 321 }
 322 
 323 void
 324 wait_prefork()
 325 {
 326         MUTEX_LOCK(&wait_info_lock);
 327 }
 328 
 329 void
 330 wait_postfork(pid_t pid)
 331 {
 332         wait_info_t *wi;
 333 
 334         MUTEX_UNLOCK(&wait_info_lock);
 335 
 336         if (pid != 0)
 337                 return;
 338 
 339         /*
 340          * Close all of the child's wait-related fds.  The wait_thread() is
 341          * gone, so no need to worry about returning events.  We always exec(2)
 342          * after a fork request, so we needn't free the list elements
 343          * themselves.
 344          */
 345 
 346         for (wi = uu_list_first(wait_info_list);
 347             wi != NULL;
 348             wi = uu_list_next(wait_info_list, wi)) {
 349                 if (wi->wi_fd != -1)
 350                         startd_close(wi->wi_fd);
 351         }
 352 
 353         startd_close(port_fd);
 354 
 355         (void) setrlimit(RLIMIT_NOFILE, &init_fd_rlimit);
 356 }
 357 
 358 void
 359 wait_init()
 360 {
 361         struct rlimit fd_new;
 362 
 363         (void) getrlimit(RLIMIT_NOFILE, &init_fd_rlimit);
 364         (void) getrlimit(RLIMIT_NOFILE, &fd_new);
 365 
 366         fd_new.rlim_max = fd_new.rlim_cur = WAIT_FILES;
 367 
 368         (void) setrlimit(RLIMIT_NOFILE, &fd_new);
 369 
 370         if ((port_fd = port_create()) == -1)
 371                 uu_die("wait_init couldn't port_create");
 372 
 373         wait_info_pool = uu_list_pool_create("wait_info", sizeof (wait_info_t),
 374             offsetof(wait_info_t, wi_link), NULL, UU_LIST_POOL_DEBUG);
 375         if (wait_info_pool == NULL)
 376                 uu_die("wait_init couldn't create wait_info_pool");
 377 
 378         wait_info_list = uu_list_create(wait_info_pool, wait_info_list, 0);
 379         if (wait_info_list == NULL)
 380                 uu_die("wait_init couldn't create wait_info_list");
 381 
 382         (void) pthread_mutex_init(&wait_info_lock, &mutex_attrs);
 383 }