aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorWilliam Hubbs <w.d.hubbs@gmail.com>2018-10-09 17:49:02 -0500
committerWilliam Hubbs <w.d.hubbs@gmail.com>2018-10-23 13:38:14 -0500
commitc1e582586d398b4452f568240985247294f645ef (patch)
tree5d198ed01618ed89ab04f3ab331597102d03ce99
parent7a75bfb00c52687a236c92bec78b5e7ab4844701 (diff)
supervise-daemon: add health checks
Health checks are a way to monitor a service and make sure it stays healthy. If a service is not healthy, it will be automatically restarted after running the unhealthy() function to clean up.
-rw-r--r--NEWS.md4
-rw-r--r--man/supervise-daemon.89
-rw-r--r--sh/supervise-daemon.sh14
-rw-r--r--src/rc/Makefile2
-rw-r--r--src/rc/supervise-daemon.c136
-rw-r--r--supervise-daemon-guide.md36
6 files changed, 169 insertions, 32 deletions
diff --git a/NEWS.md b/NEWS.md
index d4d96577..f1400197 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -22,6 +22,10 @@ This version adds timed shutdown and cancelation of shutdown to
openrc-shutdown. Shutdowns can now be delayed for a certain amount of
time or scheduled for an exact time.
+supervise-daemon supports health checks, which are a periodic way to make sure a
+service is healthy. For more information on setting this up, please see
+supervise-daemon-guide.md.
+
## OpenRC 0.37
start-stop-daemon now supports logging stdout and stderr of daemons to
diff --git a/man/supervise-daemon.8 b/man/supervise-daemon.8
index af06ee31..8bcd8b5c 100644
--- a/man/supervise-daemon.8
+++ b/man/supervise-daemon.8
@@ -16,6 +16,10 @@
.Nd starts a daemon and restarts it if it crashes
.Sh SYNOPSIS
.Nm
+.Fl a , -healthcheck-timer
+.Ar seconds
+.Fl A , -healthcheck-delay
+.Ar seconds
.Fl D , -respawn-delay
.Ar seconds
.Fl d , -chdir
@@ -90,6 +94,11 @@ Print the action(s) that are taken just before doing them.
.Pp
The options are as follows:
.Bl -tag -width indent
+.It Fl a , -healthcheck-timer Ar seconds
+Run the healthcheck() command, possibly followed by the unhealthy()
+command, every time this number of seconds passes.
+.It Fl A , -healthcheck-delay Ar seconds
+Wait this long before the first health check.
.It Fl D , -respawn-delay Ar seconds
wait this number of seconds before restarting a daemon after it crashes.
The default is 0.
diff --git a/sh/supervise-daemon.sh b/sh/supervise-daemon.sh
index 80e0260c..73a70140 100644
--- a/sh/supervise-daemon.sh
+++ b/sh/supervise-daemon.sh
@@ -10,6 +10,8 @@
# This file may not be copied, modified, propagated, or distributed
# except according to the terms contained in the LICENSE file.
+extra_commands="healthcheck unhealthy ${extra_commands}"
+
supervise_start()
{
if [ -z "$command" ]; then
@@ -32,6 +34,8 @@ supervise_start()
${respawn_delay:+--respawn-delay} $respawn_delay \
${respawn_max:+--respawn-max} $respawn_max \
${respawn_period:+--respawn-period} $respawn_period \
+ ${healthcheck_delay:+--healthcheck-delay} $healthcheck_delay \
+ ${healthcheck_timer:+--healthcheck-timer} $healthcheck_timer \
${command_user+--user} $command_user \
${umask+--umask} $umask \
${supervise_daemon_args:-${start_stop_daemon_args}} \
@@ -98,3 +102,13 @@ supervise_status()
return 3
fi
}
+
+healthcheck() # default stub: always returns 0, i.e. reports the service as healthy
+{
+ return 0
+}
+
+unhealthy() # default stub: performs no cleanup and returns 0
+{
+ return 0
+}
diff --git a/src/rc/Makefile b/src/rc/Makefile
index 9ba240fa..ea4a8c81 100644
--- a/src/rc/Makefile
+++ b/src/rc/Makefile
@@ -161,7 +161,7 @@ rc-update: rc-update.o _usage.o rc-misc.o
start-stop-daemon: start-stop-daemon.o _usage.o rc-misc.o rc-pipes.o rc-schedules.o
${CC} ${LOCAL_CFLAGS} ${LOCAL_LDFLAGS} ${CFLAGS} ${LDFLAGS} -o $@ $^ ${LDADD}
-supervise-daemon: supervise-daemon.o _usage.o rc-misc.o rc-schedules.o
+supervise-daemon: supervise-daemon.o _usage.o rc-misc.o rc-plugin.o rc-schedules.o
${CC} ${LOCAL_CFLAGS} ${LOCAL_LDFLAGS} ${CFLAGS} ${LDFLAGS} -o $@ $^ ${LDADD}
service_get_value service_set_value get_options save_options: do_value.o rc-misc.o
diff --git a/src/rc/supervise-daemon.c b/src/rc/supervise-daemon.c
index 27089152..883c738d 100644
--- a/src/rc/supervise-daemon.c
+++ b/src/rc/supervise-daemon.c
@@ -61,15 +61,18 @@ static struct pam_conv conv = { NULL, NULL};
#include "queue.h"
#include "rc.h"
#include "rc-misc.h"
+#include "rc-plugin.h"
#include "rc-schedules.h"
#include "_usage.h"
#include "helpers.h"
const char *applet = NULL;
const char *extraopts = NULL;
-const char *getoptstring = "D:d:e:g:I:Kk:m:N:p:R:r:Su:1:2:3" \
+const char *getoptstring = "A:a:D:d:e:g:H:I:Kk:m:N:p:R:r:Su:1:2:3" \
getoptstring_COMMON;
const struct option longopts[] = {
+ { "healthcheck-timer", 1, NULL, 'a'},
+ { "healthcheck-delay", 1, NULL, 'A'},
{ "respawn-delay", 1, NULL, 'D'},
{ "chdir", 1, NULL, 'd'},
{ "env", 1, NULL, 'e'},
@@ -91,6 +94,8 @@ const struct option longopts[] = {
longopts_COMMON
};
const char * const longopts_help[] = {
+ "set a health check timer", /* -a, --healthcheck-timer: same position as its longopts entry */
+ "set an initial health check delay", /* -A, --healthcheck-delay: same position as its longopts entry */
"Set a respawn delay",
"Change the PWD",
"Set an environment string",
@@ -113,6 +118,9 @@ const char * const longopts_help[] = {
};
const char *usagestring = NULL;
+static int healthcheckdelay = 0;
+static int healthchecktimer = 0;
+static volatile sig_atomic_t do_healthcheck = 0;
static int nicelevel = 0;
static int ionicec = -1;
static int ioniced = 0;
@@ -183,6 +191,12 @@ static void handle_signal(int sig)
re_exec_supervisor();
}
+static void healthcheck(int sig) /* SIGALRM handler: only records that a health check is due */
+{
+ if (sig == SIGALRM)
+ do_healthcheck = 1; /* volatile sig_atomic_t flag; the supervisor loop tests and clears it */
+}
+
static char * expand_home(const char *home, const char *path)
{
char *opath, *ppath, *p, *nh;
@@ -423,11 +437,14 @@ static void child_process(char *exec, char **argv)
static void supervisor(char *exec, char **argv)
{
FILE *fp;
+ pid_t wait_pid;
int i;
int nkilled;
struct timespec ts;
time_t respawn_now= 0;
time_t first_spawn= 0;
+ pid_t health_pid;
+ int health_status;
#ifndef RC_DEBUG
signal_setup_restart(SIGHUP, handle_signal);
@@ -488,46 +505,88 @@ static void supervisor(char *exec, char **argv)
* Supervisor main loop
*/
i = 0;
+ if (healthcheckdelay) {
+ signal_setup(SIGALRM, healthcheck);
+ alarm(healthcheckdelay);
+ } else if (healthchecktimer) {
+ signal_setup(SIGALRM, healthcheck);
+ alarm(healthchecktimer);
+ }
while (!exiting) {
- wait(&i);
- if (exiting) {
- signal_setup(SIGCHLD, SIG_IGN);
- syslog(LOG_INFO, "stopping %s, pid %d", exec, child_pid);
- nkilled = run_stop_schedule(applet, exec, NULL, child_pid, 0,
- false, false, true);
- if (nkilled > 0)
- syslog(LOG_INFO, "killed %d processes", nkilled);
- } else {
- ts.tv_sec = respawn_delay;
- ts.tv_nsec = 0;
- nanosleep(&ts, NULL);
- if (respawn_max > 0 && respawn_period > 0) {
- respawn_now = time(NULL);
- if (first_spawn == 0)
- first_spawn = respawn_now;
- if (respawn_now - first_spawn > respawn_period) {
- respawn_count = 0;
- first_spawn = 0;
- } else
- respawn_count++;
- if (respawn_count > respawn_max) {
- syslog(LOG_WARNING,
- "respawned \"%s\" too many times, exiting", exec);
- exiting = true;
+ wait_pid = wait(&i);
+ if (wait_pid == -1) {
+ if (do_healthcheck) {
+ do_healthcheck = 0;
+ alarm(0);
+ syslog(LOG_DEBUG, "running health check for %s", svcname);
+ health_pid = exec_service(svcname, "healthcheck");
+ health_status = rc_waitpid(health_pid);
+ if (WIFEXITED(health_status) && !WEXITSTATUS(health_status)) {
+ alarm(healthchecktimer);
continue;
+ } else {
+ syslog(LOG_WARNING, "health check for %s failed", svcname);
+ health_pid = exec_service(svcname, "unhealthy");
+ rc_waitpid(health_pid);
+ syslog(LOG_INFO, "stopping %s, pid %d", exec, child_pid);
+ nkilled = run_stop_schedule(applet, NULL, NULL, child_pid, 0,
+ false, false, true);
+ if (nkilled > 0)
+ syslog(LOG_INFO, "killed %d processes", nkilled);
+ else if (errno != 0)
+ syslog(LOG_INFO, "Unable to kill %d: %s",
+ child_pid, strerror(errno));
}
+ } else if (exiting ) {
+ alarm(0);
+ syslog(LOG_INFO, "stopping %s, pid %d", exec, child_pid);
+ nkilled = run_stop_schedule(applet, exec, NULL, child_pid, 0,
+ false, false, true);
+ if (nkilled > 0)
+ syslog(LOG_INFO, "killed %d processes", nkilled);
+ continue;
}
+ } else if (wait_pid == child_pid) {
if (WIFEXITED(i))
syslog(LOG_WARNING, "%s, pid %d, exited with return code %d",
exec, child_pid, WEXITSTATUS(i));
else if (WIFSIGNALED(i))
syslog(LOG_WARNING, "%s, pid %d, terminated by signal %d",
exec, child_pid, WTERMSIG(i));
- child_pid = fork();
- if (child_pid == -1)
- eerrorx("%s: fork: %s", applet, strerror(errno));
- if (child_pid == 0)
- child_process(exec, argv);
+ } else
+ continue;
+
+ ts.tv_sec = respawn_delay;
+ ts.tv_nsec = 0;
+ nanosleep(&ts, NULL);
+ if (respawn_max > 0 && respawn_period > 0) {
+ respawn_now = time(NULL);
+ if (first_spawn == 0)
+ first_spawn = respawn_now;
+ if (respawn_now - first_spawn > respawn_period) {
+ respawn_count = 0;
+ first_spawn = 0;
+ } else
+ respawn_count++;
+ if (respawn_count > respawn_max) {
+ syslog(LOG_WARNING,
+ "respawned \"%s\" too many times, exiting", exec);
+ exiting = true;
+ continue;
+ }
+ }
+ alarm(0);
+ child_pid = fork();
+ if (child_pid == -1)
+ eerrorx("%s: fork: %s", applet, strerror(errno));
+ if (child_pid == 0)
+ child_process(exec, argv);
+ if (healthcheckdelay) {
+ signal_setup(SIGALRM, healthcheck);
+ alarm(healthcheckdelay);
+ } else if (healthchecktimer) {
+ signal_setup(SIGALRM, healthcheck);
+ alarm(healthchecktimer);
}
}
@@ -612,6 +671,16 @@ int main(int argc, char **argv)
while ((opt = getopt_long(argc, argv, getoptstring, longopts,
(int *) 0)) != -1)
switch (opt) {
+ case 'a': /* --healthcheck-timer <time> */
+ if (sscanf(optarg, "%d", &healthchecktimer) != 1 || healthchecktimer < 1)
+ eerrorx("%s: invalid health check timer %s", applet, optarg);
+ break;
+
+ case 'A': /* --healthcheck-delay <time> */
+ if (sscanf(optarg, "%d", &healthcheckdelay) != 1 || healthcheckdelay < 1)
+ eerrorx("%s: invalid health check delay %s", applet, optarg);
+ break;
+
case 'D': /* --respawn-delay time */
n = sscanf(optarg, "%d", &respawn_delay);
if (n != 1 || respawn_delay < 1)
@@ -668,6 +737,11 @@ int main(int argc, char **argv)
gid = gr->gr_gid;
break;
+ case 'H': /* --healthcheck-timer <minutes> */
+ if (sscanf(optarg, "%d", &healthchecktimer) != 1 || healthchecktimer < 1)
+ eerrorx("%s: invalid health check timer %s", applet, optarg);
+ break;
+
case 'k':
if (parse_mode(&numask, optarg))
eerrorx("%s: invalid mode `%s'",
diff --git a/supervise-daemon-guide.md b/supervise-daemon-guide.md
index 0b15a858..07ab55cf 100644
--- a/supervise-daemon-guide.md
+++ b/supervise-daemon-guide.md
@@ -22,6 +22,28 @@ The following is a brief guide on using this capability.
instructs it not to fork to the command_args_foreground variable shown
below.
+# Health Checks
+
+Health checks are a way to make sure a service monitored by
+supervise-daemon stays healthy. To configure a health check for a
+service, you need to write a healthcheck() function, and optionally an
+unhealthy() function in the service script. Also, you will need to set
+the healthcheck_timer and optionally healthcheck_delay variables.
+
+## healthcheck() function
+
+The healthcheck() function is run repeatedly based on the settings of
+the healthcheck_* variables. This function should return zero if the
+service is currently healthy or non-zero otherwise.
+
+## unhealthy() function
+
+If the healthcheck() function returns non-zero, the unhealthy() function
+is run, then the service is restarted. Since the service will be
+restarted by the supervisor, the unhealthy function should not try to
+restart it; the purpose of the function is to allow any cleanup tasks
+other than restarting the service to be run.
+
# Variable Settings
The most important setting is the supervisor variable. At the top of
@@ -53,6 +75,20 @@ forks and goes to the background by default. This should be set to the
command line option that instructs the daemon to stay in the foreground.
``` sh
+healthcheck_delay=seconds
+```
+
+This is the delay, in seconds, before the first health check is run.
+If it is not set, we use the value of healthcheck_timer.
+
+``` sh
+healthcheck_timer=seconds
+```
+
+This is the number of seconds between health checks. If it is not set,
+no health checks will be run.
+
+``` sh
respawn_delay
```