diff options
author | William Hubbs <w.d.hubbs@gmail.com> | 2018-10-09 17:49:02 -0500 |
---|---|---|
committer | William Hubbs <w.d.hubbs@gmail.com> | 2018-10-23 13:38:14 -0500 |
commit | c1e582586d398b4452f568240985247294f645ef (patch) | |
tree | 5d198ed01618ed89ab04f3ab331597102d03ce99 | |
parent | 7a75bfb00c52687a236c92bec78b5e7ab4844701 (diff) |
supervise-daemon: add health checks
Health checks are a way to monitor a service and make sure it stays
healthy.
If a service is not healthy, it will be automatically restarted after
running the unhealthy() function to clean up.
-rw-r--r-- | NEWS.md | 4 | ||||
-rw-r--r-- | man/supervise-daemon.8 | 9 | ||||
-rw-r--r-- | sh/supervise-daemon.sh | 14 | ||||
-rw-r--r-- | src/rc/Makefile | 2 | ||||
-rw-r--r-- | src/rc/supervise-daemon.c | 136 | ||||
-rw-r--r-- | supervise-daemon-guide.md | 36 |
6 files changed, 169 insertions, 32 deletions
@@ -22,6 +22,10 @@ This version adds timed shutdown and cancelation of shutdown to openrc-shutdown. Shutdowns can now be delayed for a certain amount of time or scheduled for an exact time. +supervise-daemon supports health checks, which are a periodic way to make sure a +service is healthy. For more information on setting this up, please see +supervise-daemon-guide.md. + ## OpenRC 0.37 start-stop-daemon now supports logging stdout and stderr of daemons to diff --git a/man/supervise-daemon.8 b/man/supervise-daemon.8 index af06ee31..8bcd8b5c 100644 --- a/man/supervise-daemon.8 +++ b/man/supervise-daemon.8 @@ -16,6 +16,10 @@ .Nd starts a daemon and restarts it if it crashes .Sh SYNOPSIS .Nm +.Fl a , -healthcheck-timer +.Ar seconds +.Fl A , -healthcheck-delay +.Ar seconds .Fl D , -respawn-delay .Ar seconds .Fl d , -chdir @@ -90,6 +94,11 @@ Print the action(s) that are taken just before doing them. .Pp The options are as follows: .Bl -tag -width indent +.It Fl a , -healthcheck-timer Ar seconds +Run the healthcheck() command, possibly followed by the unhealthy() +command every time this number of seconds passes. +.It Fl A , -healthcheck-delay Ar seconds +Wait this long before the first health check. .It Fl D , -respawn-delay Ar seconds wait this number of seconds before restarting a daemon after it crashes. The default is 0. diff --git a/sh/supervise-daemon.sh b/sh/supervise-daemon.sh index 80e0260c..73a70140 100644 --- a/sh/supervise-daemon.sh +++ b/sh/supervise-daemon.sh @@ -10,6 +10,8 @@ # This file may not be copied, modified, propagated, or distributed # except according to the terms contained in the LICENSE file. 
+extra_commands="healthcheck unhealthy ${extra_commands}" + supervise_start() { if [ -z "$command" ]; then @@ -32,6 +34,8 @@ supervise_start() ${respawn_delay:+--respawn-delay} $respawn_delay \ ${respawn_max:+--respawn-max} $respawn_max \ ${respawn_period:+--respawn-period} $respawn_period \ + ${healthcheck_delay:+--healthcheck-delay} $healthcheck_delay \ + ${healthcheck_timer:+--healthcheck-timer} $healthcheck_timer \ ${command_user+--user} $command_user \ ${umask+--umask} $umask \ ${supervise_daemon_args:-${start_stop_daemon_args}} \ @@ -98,3 +102,13 @@ supervise_status() return 3 fi } + +healthcheck() +{ + return 0 +} + +unhealthy() +{ + return 0 +} diff --git a/src/rc/Makefile b/src/rc/Makefile index 9ba240fa..ea4a8c81 100644 --- a/src/rc/Makefile +++ b/src/rc/Makefile @@ -161,7 +161,7 @@ rc-update: rc-update.o _usage.o rc-misc.o start-stop-daemon: start-stop-daemon.o _usage.o rc-misc.o rc-pipes.o rc-schedules.o ${CC} ${LOCAL_CFLAGS} ${LOCAL_LDFLAGS} ${CFLAGS} ${LDFLAGS} -o $@ $^ ${LDADD} -supervise-daemon: supervise-daemon.o _usage.o rc-misc.o rc-schedules.o +supervise-daemon: supervise-daemon.o _usage.o rc-misc.o rc-plugin.o rc-schedules.o ${CC} ${LOCAL_CFLAGS} ${LOCAL_LDFLAGS} ${CFLAGS} ${LDFLAGS} -o $@ $^ ${LDADD} service_get_value service_set_value get_options save_options: do_value.o rc-misc.o diff --git a/src/rc/supervise-daemon.c b/src/rc/supervise-daemon.c index 27089152..883c738d 100644 --- a/src/rc/supervise-daemon.c +++ b/src/rc/supervise-daemon.c @@ -61,15 +61,18 @@ static struct pam_conv conv = { NULL, NULL}; #include "queue.h" #include "rc.h" #include "rc-misc.h" +#include "rc-plugin.h" #include "rc-schedules.h" #include "_usage.h" #include "helpers.h" const char *applet = NULL; const char *extraopts = NULL; -const char *getoptstring = "D:d:e:g:I:Kk:m:N:p:R:r:Su:1:2:3" \ +const char *getoptstring = "A:a:D:d:e:g:H:I:Kk:m:N:p:R:r:Su:1:2:3" \ getoptstring_COMMON; const struct option longopts[] = { + { "healthcheck-timer", 1, NULL, 'a'}, + { 
"healthcheck-delay", 1, NULL, 'A'}, { "respawn-delay", 1, NULL, 'D'}, { "chdir", 1, NULL, 'd'}, { "env", 1, NULL, 'e'}, @@ -91,6 +94,8 @@ const struct option longopts[] = { longopts_COMMON }; const char * const longopts_help[] = { + "set an initial health check delay", + "set a health check timer", "Set a respawn delay", "Change the PWD", "Set an environment string", @@ -113,6 +118,9 @@ const char * const longopts_help[] = { }; const char *usagestring = NULL; +static int healthcheckdelay = 0; +static int healthchecktimer = 0; +static volatile sig_atomic_t do_healthcheck = 0; static int nicelevel = 0; static int ionicec = -1; static int ioniced = 0; @@ -183,6 +191,12 @@ static void handle_signal(int sig) re_exec_supervisor(); } +static void healthcheck(int sig) +{ + if (sig == SIGALRM) + do_healthcheck = 1; +} + static char * expand_home(const char *home, const char *path) { char *opath, *ppath, *p, *nh; @@ -423,11 +437,14 @@ static void child_process(char *exec, char **argv) static void supervisor(char *exec, char **argv) { FILE *fp; + pid_t wait_pid; int i; int nkilled; struct timespec ts; time_t respawn_now= 0; time_t first_spawn= 0; + pid_t health_pid; + int health_status; #ifndef RC_DEBUG signal_setup_restart(SIGHUP, handle_signal); @@ -488,46 +505,88 @@ static void supervisor(char *exec, char **argv) * Supervisor main loop */ i = 0; + if (healthcheckdelay) { + signal_setup(SIGALRM, healthcheck); + alarm(healthcheckdelay); + } else if (healthchecktimer) { + signal_setup(SIGALRM, healthcheck); + alarm(healthchecktimer); + } while (!exiting) { - wait(&i); - if (exiting) { - signal_setup(SIGCHLD, SIG_IGN); - syslog(LOG_INFO, "stopping %s, pid %d", exec, child_pid); - nkilled = run_stop_schedule(applet, exec, NULL, child_pid, 0, - false, false, true); - if (nkilled > 0) - syslog(LOG_INFO, "killed %d processes", nkilled); - } else { - ts.tv_sec = respawn_delay; - ts.tv_nsec = 0; - nanosleep(&ts, NULL); - if (respawn_max > 0 && respawn_period > 0) { - respawn_now = 
time(NULL); - if (first_spawn == 0) - first_spawn = respawn_now; - if (respawn_now - first_spawn > respawn_period) { - respawn_count = 0; - first_spawn = 0; - } else - respawn_count++; - if (respawn_count > respawn_max) { - syslog(LOG_WARNING, - "respawned \"%s\" too many times, exiting", exec); - exiting = true; + wait_pid = wait(&i); + if (wait_pid == -1) { + if (do_healthcheck) { + do_healthcheck = 0; + alarm(0); + syslog(LOG_DEBUG, "running health check for %s", svcname); + health_pid = exec_service(svcname, "healthcheck"); + health_status = rc_waitpid(health_pid); + if (WIFEXITED(health_status) && !WEXITSTATUS(health_status)) { + alarm(healthchecktimer); continue; + } else { + syslog(LOG_WARNING, "health check for %s failed", svcname); + health_pid = exec_service(svcname, "unhealthy"); + rc_waitpid(health_pid); + syslog(LOG_INFO, "stopping %s, pid %d", exec, child_pid); + nkilled = run_stop_schedule(applet, NULL, NULL, child_pid, 0, + false, false, true); + if (nkilled > 0) + syslog(LOG_INFO, "killed %d processes", nkilled); + else if (errno != 0) + syslog(LOG_INFO, "Unable to kill %d: %s", + child_pid, strerror(errno)); } + } else if (exiting ) { + alarm(0); + syslog(LOG_INFO, "stopping %s, pid %d", exec, child_pid); + nkilled = run_stop_schedule(applet, exec, NULL, child_pid, 0, + false, false, true); + if (nkilled > 0) + syslog(LOG_INFO, "killed %d processes", nkilled); + continue; } + } else if (wait_pid == child_pid) { if (WIFEXITED(i)) syslog(LOG_WARNING, "%s, pid %d, exited with return code %d", exec, child_pid, WEXITSTATUS(i)); else if (WIFSIGNALED(i)) syslog(LOG_WARNING, "%s, pid %d, terminated by signal %d", exec, child_pid, WTERMSIG(i)); - child_pid = fork(); - if (child_pid == -1) - eerrorx("%s: fork: %s", applet, strerror(errno)); - if (child_pid == 0) - child_process(exec, argv); + } else + continue; + + ts.tv_sec = respawn_delay; + ts.tv_nsec = 0; + nanosleep(&ts, NULL); + if (respawn_max > 0 && respawn_period > 0) { + respawn_now = time(NULL); 
+ if (first_spawn == 0) + first_spawn = respawn_now; + if (respawn_now - first_spawn > respawn_period) { + respawn_count = 0; + first_spawn = 0; + } else + respawn_count++; + if (respawn_count > respawn_max) { + syslog(LOG_WARNING, + "respawned \"%s\" too many times, exiting", exec); + exiting = true; + continue; + } + } + alarm(0); + child_pid = fork(); + if (child_pid == -1) + eerrorx("%s: fork: %s", applet, strerror(errno)); + if (child_pid == 0) + child_process(exec, argv); + if (healthcheckdelay) { + signal_setup(SIGALRM, healthcheck); + alarm(healthcheckdelay); + } else if (healthchecktimer) { + signal_setup(SIGALRM, healthcheck); + alarm(healthchecktimer); } } @@ -612,6 +671,16 @@ int main(int argc, char **argv) while ((opt = getopt_long(argc, argv, getoptstring, longopts, (int *) 0)) != -1) switch (opt) { + case 'a': /* --healthcheck-timer <time> */ + if (sscanf(optarg, "%d", &healthchecktimer) != 1 || healthchecktimer < 1) + eerrorx("%s: invalid health check timer %s", applet, optarg); + break; + + case 'A': /* --healthcheck-delay <time> */ + if (sscanf(optarg, "%d", &healthcheckdelay) != 1 || healthcheckdelay < 1) + eerrorx("%s: invalid health check delay %s", applet, optarg); + break; + case 'D': /* --respawn-delay time */ n = sscanf(optarg, "%d", &respawn_delay); if (n != 1 || respawn_delay < 1) @@ -668,6 +737,11 @@ int main(int argc, char **argv) gid = gr->gr_gid; break; + case 'H': /* --healthcheck-timer <minutes> */ + if (sscanf(optarg, "%d", &healthchecktimer) != 1 || healthchecktimer < 1) + eerrorx("%s: invalid health check timer %s", applet, optarg); + break; + case 'k': if (parse_mode(&numask, optarg)) eerrorx("%s: invalid mode `%s'", diff --git a/supervise-daemon-guide.md b/supervise-daemon-guide.md index 0b15a858..07ab55cf 100644 --- a/supervise-daemon-guide.md +++ b/supervise-daemon-guide.md @@ -22,6 +22,28 @@ The following is a brief guide on using this capability. instructs it not to fork to the command_args_foreground variable shown below. 
+# Health Checks + +Health checks are a way to make sure a service monitored by +supervise-daemon stays healthy. To configure a health check for a +service, you need to write a healthcheck() function, and optionally an +unhealthy() function in the service script. Also, you will need to set +the healthcheck_timer and optionally healthcheck_delay variables. + +## healthcheck() function + +The healthcheck() function is run repeatedly based on the settings of +the healthcheck_* variables. This function should return zero if the +service is currently healthy or non-zero otherwise. + +## unhealthy() function + +If the healthcheck() function returns non-zero, the unhealthy() function +is run, then the service is restarted. Since the service will be +restarted by the supervisor, the unhealthy function should not try to +restart it; the purpose of the function is to allow any cleanup tasks +other than restarting the service to be run. + # Variable Settings The most important setting is the supervisor variable. At the top of @@ -53,6 +75,20 @@ forks and goes to the background by default. This should be set to the command line option that instructs the daemon to stay in the foreground. ``` sh +healthcheck_delay=seconds +``` + +This is the delay, in seconds, before the first health check is run. +If it is not set, we use the value of healthcheck_timer. + +``` sh +healthcheck_timer=seconds +``` + +This is the number of seconds between health checks. If it is not set, +no health checks will be run. + +``` sh respawn_delay ``` |