From c1e582586d398b4452f568240985247294f645ef Mon Sep 17 00:00:00 2001 From: William Hubbs Date: Tue, 9 Oct 2018 17:49:02 -0500 Subject: supervise-daemon: add health checks Health checks are a way to monitor a service and make sure it stays healthy. If a service is not healthy, it will be automatically restarted after running the unhealthy() function to clean up. --- NEWS.md | 4 ++ man/supervise-daemon.8 | 9 +++ sh/supervise-daemon.sh | 14 +++++ src/rc/Makefile | 2 +- src/rc/supervise-daemon.c | 136 +++++++++++++++++++++++++++++++++++----------- supervise-daemon-guide.md | 36 ++++++++++++ 6 files changed, 169 insertions(+), 32 deletions(-) diff --git a/NEWS.md b/NEWS.md index d4d96577..f1400197 100644 --- a/NEWS.md +++ b/NEWS.md @@ -22,6 +22,10 @@ This version adds timed shutdown and cancelation of shutdown to openrc-shutdown. Shutdowns can now be delayed for a certain amount of time or scheduled for an exact time. +supervise-daemon supports health checks, which are a periodic way to make sure a +service is healthy. For more information on setting this up, please see +supervise-daemon-guide.md. + ## OpenRC 0.37 start-stop-daemon now supports logging stdout and stderr of daemons to diff --git a/man/supervise-daemon.8 b/man/supervise-daemon.8 index af06ee31..8bcd8b5c 100644 --- a/man/supervise-daemon.8 +++ b/man/supervise-daemon.8 @@ -16,6 +16,10 @@ .Nd starts a daemon and restarts it if it crashes .Sh SYNOPSIS .Nm +.Fl a , -healthcheck-timer +.Ar seconds +.Fl A , -healthcheck-delay +.Ar seconds .Fl D , -respawn-delay .Ar seconds .Fl d , -chdir @@ -90,6 +94,11 @@ Print the action(s) that are taken just before doing them. .Pp The options are as follows: .Bl -tag -width indent +.It Fl a , -healthcheck-timer Ar seconds +Run the healthcheck() command, possibly followed by the unhealthy() +command every time this number of seconds passes. +.It Fl A , -healthcheck-delay Ar seconds +Wait this long before the first health check. 
.It Fl D , -respawn-delay Ar seconds wait this number of seconds before restarting a daemon after it crashes. The default is 0. diff --git a/sh/supervise-daemon.sh b/sh/supervise-daemon.sh index 80e0260c..73a70140 100644 --- a/sh/supervise-daemon.sh +++ b/sh/supervise-daemon.sh @@ -10,6 +10,8 @@ # This file may not be copied, modified, propagated, or distributed # except according to the terms contained in the LICENSE file. +extra_commands="healthcheck unhealthy ${extra_commands}" + supervise_start() { if [ -z "$command" ]; then @@ -32,6 +34,8 @@ supervise_start() ${respawn_delay:+--respawn-delay} $respawn_delay \ ${respawn_max:+--respawn-max} $respawn_max \ ${respawn_period:+--respawn-period} $respawn_period \ + ${healthcheck_delay:+--healthcheck-delay} $healthcheck_delay \ + ${healthcheck_timer:+--healthcheck-timer} $healthcheck_timer \ ${command_user+--user} $command_user \ ${umask+--umask} $umask \ ${supervise_daemon_args:-${start_stop_daemon_args}} \ @@ -98,3 +102,13 @@ supervise_status() return 3 fi } + +healthcheck() +{ + return 0 +} + +unhealthy() +{ + return 0 +} diff --git a/src/rc/Makefile b/src/rc/Makefile index 9ba240fa..ea4a8c81 100644 --- a/src/rc/Makefile +++ b/src/rc/Makefile @@ -161,7 +161,7 @@ rc-update: rc-update.o _usage.o rc-misc.o start-stop-daemon: start-stop-daemon.o _usage.o rc-misc.o rc-pipes.o rc-schedules.o ${CC} ${LOCAL_CFLAGS} ${LOCAL_LDFLAGS} ${CFLAGS} ${LDFLAGS} -o $@ $^ ${LDADD} -supervise-daemon: supervise-daemon.o _usage.o rc-misc.o rc-schedules.o +supervise-daemon: supervise-daemon.o _usage.o rc-misc.o rc-plugin.o rc-schedules.o ${CC} ${LOCAL_CFLAGS} ${LOCAL_LDFLAGS} ${CFLAGS} ${LDFLAGS} -o $@ $^ ${LDADD} service_get_value service_set_value get_options save_options: do_value.o rc-misc.o diff --git a/src/rc/supervise-daemon.c b/src/rc/supervise-daemon.c index 27089152..883c738d 100644 --- a/src/rc/supervise-daemon.c +++ b/src/rc/supervise-daemon.c @@ -61,15 +61,18 @@ static struct pam_conv conv = { NULL, NULL}; #include "queue.h" 
#include "rc.h" #include "rc-misc.h" +#include "rc-plugin.h" #include "rc-schedules.h" #include "_usage.h" #include "helpers.h" const char *applet = NULL; const char *extraopts = NULL; -const char *getoptstring = "D:d:e:g:I:Kk:m:N:p:R:r:Su:1:2:3" \ +const char *getoptstring = "A:a:D:d:e:g:H:I:Kk:m:N:p:R:r:Su:1:2:3" \ getoptstring_COMMON; const struct option longopts[] = { + { "healthcheck-timer", 1, NULL, 'a'}, + { "healthcheck-delay", 1, NULL, 'A'}, { "respawn-delay", 1, NULL, 'D'}, { "chdir", 1, NULL, 'd'}, { "env", 1, NULL, 'e'}, @@ -91,6 +94,8 @@ const struct option longopts[] = { longopts_COMMON }; const char * const longopts_help[] = { + "set an initial health check delay", + "set a health check timer", "Set a respawn delay", "Change the PWD", "Set an environment string", @@ -113,6 +118,9 @@ const char * const longopts_help[] = { }; const char *usagestring = NULL; +static int healthcheckdelay = 0; +static int healthchecktimer = 0; +static volatile sig_atomic_t do_healthcheck = 0; static int nicelevel = 0; static int ionicec = -1; static int ioniced = 0; @@ -183,6 +191,12 @@ static void handle_signal(int sig) re_exec_supervisor(); } +static void healthcheck(int sig) +{ + if (sig == SIGALRM) + do_healthcheck = 1; +} + static char * expand_home(const char *home, const char *path) { char *opath, *ppath, *p, *nh; @@ -423,11 +437,14 @@ static void child_process(char *exec, char **argv) static void supervisor(char *exec, char **argv) { FILE *fp; + pid_t wait_pid; int i; int nkilled; struct timespec ts; time_t respawn_now= 0; time_t first_spawn= 0; + pid_t health_pid; + int health_status; #ifndef RC_DEBUG signal_setup_restart(SIGHUP, handle_signal); @@ -488,46 +505,88 @@ static void supervisor(char *exec, char **argv) * Supervisor main loop */ i = 0; + if (healthcheckdelay) { + signal_setup(SIGALRM, healthcheck); + alarm(healthcheckdelay); + } else if (healthchecktimer) { + signal_setup(SIGALRM, healthcheck); + alarm(healthchecktimer); + } while (!exiting) { - 
wait(&i); - if (exiting) { - signal_setup(SIGCHLD, SIG_IGN); - syslog(LOG_INFO, "stopping %s, pid %d", exec, child_pid); - nkilled = run_stop_schedule(applet, exec, NULL, child_pid, 0, - false, false, true); - if (nkilled > 0) - syslog(LOG_INFO, "killed %d processes", nkilled); - } else { - ts.tv_sec = respawn_delay; - ts.tv_nsec = 0; - nanosleep(&ts, NULL); - if (respawn_max > 0 && respawn_period > 0) { - respawn_now = time(NULL); - if (first_spawn == 0) - first_spawn = respawn_now; - if (respawn_now - first_spawn > respawn_period) { - respawn_count = 0; - first_spawn = 0; - } else - respawn_count++; - if (respawn_count > respawn_max) { - syslog(LOG_WARNING, - "respawned \"%s\" too many times, exiting", exec); - exiting = true; + wait_pid = wait(&i); + if (wait_pid == -1) { + if (do_healthcheck) { + do_healthcheck = 0; + alarm(0); + syslog(LOG_DEBUG, "running health check for %s", svcname); + health_pid = exec_service(svcname, "healthcheck"); + health_status = rc_waitpid(health_pid); + if (WIFEXITED(health_status) && !WEXITSTATUS(health_status)) { + alarm(healthchecktimer); continue; + } else { + syslog(LOG_WARNING, "health check for %s failed", svcname); + health_pid = exec_service(svcname, "unhealthy"); + rc_waitpid(health_pid); + syslog(LOG_INFO, "stopping %s, pid %d", exec, child_pid); + nkilled = run_stop_schedule(applet, NULL, NULL, child_pid, 0, + false, false, true); + if (nkilled > 0) + syslog(LOG_INFO, "killed %d processes", nkilled); + else if (errno != 0) + syslog(LOG_INFO, "Unable to kill %d: %s", + child_pid, strerror(errno)); } + } else if (exiting ) { + alarm(0); + syslog(LOG_INFO, "stopping %s, pid %d", exec, child_pid); + nkilled = run_stop_schedule(applet, exec, NULL, child_pid, 0, + false, false, true); + if (nkilled > 0) + syslog(LOG_INFO, "killed %d processes", nkilled); + continue; } + } else if (wait_pid == child_pid) { if (WIFEXITED(i)) syslog(LOG_WARNING, "%s, pid %d, exited with return code %d", exec, child_pid, WEXITSTATUS(i)); else if 
(WIFSIGNALED(i)) syslog(LOG_WARNING, "%s, pid %d, terminated by signal %d", exec, child_pid, WTERMSIG(i)); - child_pid = fork(); - if (child_pid == -1) - eerrorx("%s: fork: %s", applet, strerror(errno)); - if (child_pid == 0) - child_process(exec, argv); + } else + continue; + + ts.tv_sec = respawn_delay; + ts.tv_nsec = 0; + nanosleep(&ts, NULL); + if (respawn_max > 0 && respawn_period > 0) { + respawn_now = time(NULL); + if (first_spawn == 0) + first_spawn = respawn_now; + if (respawn_now - first_spawn > respawn_period) { + respawn_count = 0; + first_spawn = 0; + } else + respawn_count++; + if (respawn_count > respawn_max) { + syslog(LOG_WARNING, + "respawned \"%s\" too many times, exiting", exec); + exiting = true; + continue; + } + } + alarm(0); + child_pid = fork(); + if (child_pid == -1) + eerrorx("%s: fork: %s", applet, strerror(errno)); + if (child_pid == 0) + child_process(exec, argv); + if (healthcheckdelay) { + signal_setup(SIGALRM, healthcheck); + alarm(healthcheckdelay); + } else if (healthchecktimer) { + signal_setup(SIGALRM, healthcheck); + alarm(healthchecktimer); } } @@ -612,6 +671,16 @@ int main(int argc, char **argv) while ((opt = getopt_long(argc, argv, getoptstring, longopts, (int *) 0)) != -1) switch (opt) { + case 'a': /* --healthcheck-timer