supervise-daemon: add health checks

Health checks are a way to monitor a service and make sure it stays
healthy.

If a service is not healthy, it will be automatically restarted after
running the unhealthy() function to clean up.
This commit is contained in:
William Hubbs 2018-10-09 17:49:02 -05:00
parent 7a75bfb00c
commit c1e582586d
6 changed files with 169 additions and 32 deletions

View File

@ -22,6 +22,10 @@ This version adds timed shutdown and cancelation of shutdown to
openrc-shutdown. Shutdowns can now be delayed for a certain amount of
time or scheduled for an exact time.
supervise-daemon supports health checks, which are a periodic way to make sure a
service is healthy. For more information on setting this up, please see
supervise-daemon-guide.md.
## OpenRC 0.37
start-stop-daemon now supports logging stdout and stderr of daemons to

View File

@ -16,6 +16,10 @@
.Nd starts a daemon and restarts it if it crashes
.Sh SYNOPSIS
.Nm
.Fl a , -healthcheck-timer
.Ar seconds
.Fl A , -healthcheck-delay
.Ar seconds
.Fl D , -respawn-delay
.Ar seconds
.Fl d , -chdir
@ -90,6 +94,11 @@ Print the action(s) that are taken just before doing them.
.Pp
The options are as follows:
.Bl -tag -width indent
.It Fl a , -healthcheck-timer Ar seconds
Run the healthcheck() command, possibly followed by the unhealthy()
command, every time this number of seconds passes.
.It Fl A , -healthcheck-delay Ar seconds
Wait this long before the first health check.
.It Fl D , -respawn-delay Ar seconds
wait this number of seconds before restarting a daemon after it crashes.
The default is 0.

View File

@ -10,6 +10,8 @@
# This file may not be copied, modified, propagated, or distributed
# except according to the terms contained in the LICENSE file.
extra_commands="healthcheck unhealthy ${extra_commands}"
supervise_start()
{
if [ -z "$command" ]; then
@ -32,6 +34,8 @@ supervise_start()
${respawn_delay:+--respawn-delay} $respawn_delay \
${respawn_max:+--respawn-max} $respawn_max \
${respawn_period:+--respawn-period} $respawn_period \
${healthcheck_delay:+--healthcheck-delay} $healthcheck_delay \
${healthcheck_timer:+--healthcheck-timer} $healthcheck_timer \
${command_user+--user} $command_user \
${umask+--umask} $umask \
${supervise_daemon_args:-${start_stop_daemon_args}} \
@ -98,3 +102,13 @@ supervise_status()
return 3
fi
}
# Default health check: always reports the service as healthy.
# A service script supplies its own healthcheck() that returns zero when
# the service is healthy and non-zero otherwise.
healthcheck()
{
	# no-op builtin; leaves the function with exit status 0
	:
}
# Default cleanup hook run after a failed health check: does nothing.
# A service script may supply its own unhealthy() to perform cleanup
# before the supervisor restarts the service.
unhealthy()
{
	# no-op builtin; leaves the function with exit status 0
	:
}

View File

@ -161,7 +161,7 @@ rc-update: rc-update.o _usage.o rc-misc.o
start-stop-daemon: start-stop-daemon.o _usage.o rc-misc.o rc-pipes.o rc-schedules.o
${CC} ${LOCAL_CFLAGS} ${LOCAL_LDFLAGS} ${CFLAGS} ${LDFLAGS} -o $@ $^ ${LDADD}
supervise-daemon: supervise-daemon.o _usage.o rc-misc.o rc-schedules.o
supervise-daemon: supervise-daemon.o _usage.o rc-misc.o rc-plugin.o rc-schedules.o
${CC} ${LOCAL_CFLAGS} ${LOCAL_LDFLAGS} ${CFLAGS} ${LDFLAGS} -o $@ $^ ${LDADD}
service_get_value service_set_value get_options save_options: do_value.o rc-misc.o

View File

@ -61,15 +61,18 @@ static struct pam_conv conv = { NULL, NULL};
#include "queue.h"
#include "rc.h"
#include "rc-misc.h"
#include "rc-plugin.h"
#include "rc-schedules.h"
#include "_usage.h"
#include "helpers.h"
const char *applet = NULL;
const char *extraopts = NULL;
const char *getoptstring = "D:d:e:g:I:Kk:m:N:p:R:r:Su:1:2:3" \
const char *getoptstring = "A:a:D:d:e:g:H:I:Kk:m:N:p:R:r:Su:1:2:3" \
getoptstring_COMMON;
const struct option longopts[] = {
{ "healthcheck-timer", 1, NULL, 'a'},
{ "healthcheck-delay", 1, NULL, 'A'},
{ "respawn-delay", 1, NULL, 'D'},
{ "chdir", 1, NULL, 'd'},
{ "env", 1, NULL, 'e'},
@ -91,6 +94,8 @@ const struct option longopts[] = {
longopts_COMMON
};
/*
 * Help text for the long options. Keep the entries in the same order as
 * longopts[]: --healthcheck-timer ('a') first, then --healthcheck-delay
 * ('A'), so each option is listed next to its own description.
 * NOTE(review): assumes the usage printer pairs the two arrays by index --
 * confirm against _usage.c.
 */
const char * const longopts_help[] = {
	"set a health check timer",
	"set an initial health check delay",
"Set a respawn delay",
"Change the PWD",
"Set an environment string",
@ -113,6 +118,9 @@ const char * const longopts_help[] = {
};
const char *usagestring = NULL;
/* Seconds to wait before the first health check (-A); 0 means not set. */
static int healthcheckdelay = 0;
/* Seconds between health checks (-a); 0 means not set. */
static int healthchecktimer = 0;
/* Set to 1 by the SIGALRM handler; the supervisor loop clears it when it runs a check. */
static volatile sig_atomic_t do_healthcheck = 0;
static int nicelevel = 0;
static int ionicec = -1;
static int ioniced = 0;
@ -183,6 +191,12 @@ static void handle_signal(int sig)
re_exec_supervisor();
}
/*
 * Signal handler for the health check alarm.
 * Only records that a check is due by setting do_healthcheck; the check
 * itself is run from the supervisor loop. Any signal other than SIGALRM
 * is ignored.
 */
static void healthcheck(int sig)
{
	if (sig != SIGALRM)
		return;

	do_healthcheck = 1;
}
static char * expand_home(const char *home, const char *path)
{
char *opath, *ppath, *p, *nh;
@ -423,11 +437,14 @@ static void child_process(char *exec, char **argv)
static void supervisor(char *exec, char **argv)
{
FILE *fp;
pid_t wait_pid;
int i;
int nkilled;
struct timespec ts;
time_t respawn_now= 0;
time_t first_spawn= 0;
pid_t health_pid;
int health_status;
#ifndef RC_DEBUG
signal_setup_restart(SIGHUP, handle_signal);
@ -488,46 +505,88 @@ static void supervisor(char *exec, char **argv)
* Supervisor main loop
*/
i = 0;
if (healthcheckdelay) {
signal_setup(SIGALRM, healthcheck);
alarm(healthcheckdelay);
} else if (healthchecktimer) {
signal_setup(SIGALRM, healthcheck);
alarm(healthchecktimer);
}
while (!exiting) {
wait(&i);
if (exiting) {
signal_setup(SIGCHLD, SIG_IGN);
syslog(LOG_INFO, "stopping %s, pid %d", exec, child_pid);
nkilled = run_stop_schedule(applet, exec, NULL, child_pid, 0,
false, false, true);
if (nkilled > 0)
syslog(LOG_INFO, "killed %d processes", nkilled);
} else {
ts.tv_sec = respawn_delay;
ts.tv_nsec = 0;
nanosleep(&ts, NULL);
if (respawn_max > 0 && respawn_period > 0) {
respawn_now = time(NULL);
if (first_spawn == 0)
first_spawn = respawn_now;
if (respawn_now - first_spawn > respawn_period) {
respawn_count = 0;
first_spawn = 0;
} else
respawn_count++;
if (respawn_count > respawn_max) {
syslog(LOG_WARNING,
"respawned \"%s\" too many times, exiting", exec);
exiting = true;
wait_pid = wait(&i);
if (wait_pid == -1) {
if (do_healthcheck) {
do_healthcheck = 0;
alarm(0);
syslog(LOG_DEBUG, "running health check for %s", svcname);
health_pid = exec_service(svcname, "healthcheck");
health_status = rc_waitpid(health_pid);
if (WIFEXITED(health_status) && !WEXITSTATUS(health_status)) {
alarm(healthchecktimer);
continue;
} else {
syslog(LOG_WARNING, "health check for %s failed", svcname);
health_pid = exec_service(svcname, "unhealthy");
rc_waitpid(health_pid);
syslog(LOG_INFO, "stopping %s, pid %d", exec, child_pid);
nkilled = run_stop_schedule(applet, NULL, NULL, child_pid, 0,
false, false, true);
if (nkilled > 0)
syslog(LOG_INFO, "killed %d processes", nkilled);
else if (errno != 0)
syslog(LOG_INFO, "Unable to kill %d: %s",
child_pid, strerror(errno));
}
} else if (exiting ) {
alarm(0);
syslog(LOG_INFO, "stopping %s, pid %d", exec, child_pid);
nkilled = run_stop_schedule(applet, exec, NULL, child_pid, 0,
false, false, true);
if (nkilled > 0)
syslog(LOG_INFO, "killed %d processes", nkilled);
continue;
}
} else if (wait_pid == child_pid) {
if (WIFEXITED(i))
syslog(LOG_WARNING, "%s, pid %d, exited with return code %d",
exec, child_pid, WEXITSTATUS(i));
else if (WIFSIGNALED(i))
syslog(LOG_WARNING, "%s, pid %d, terminated by signal %d",
exec, child_pid, WTERMSIG(i));
child_pid = fork();
if (child_pid == -1)
eerrorx("%s: fork: %s", applet, strerror(errno));
if (child_pid == 0)
child_process(exec, argv);
} else
continue;
ts.tv_sec = respawn_delay;
ts.tv_nsec = 0;
nanosleep(&ts, NULL);
if (respawn_max > 0 && respawn_period > 0) {
respawn_now = time(NULL);
if (first_spawn == 0)
first_spawn = respawn_now;
if (respawn_now - first_spawn > respawn_period) {
respawn_count = 0;
first_spawn = 0;
} else
respawn_count++;
if (respawn_count > respawn_max) {
syslog(LOG_WARNING,
"respawned \"%s\" too many times, exiting", exec);
exiting = true;
continue;
}
}
alarm(0);
child_pid = fork();
if (child_pid == -1)
eerrorx("%s: fork: %s", applet, strerror(errno));
if (child_pid == 0)
child_process(exec, argv);
if (healthcheckdelay) {
signal_setup(SIGALRM, healthcheck);
alarm(healthcheckdelay);
} else if (healthchecktimer) {
signal_setup(SIGALRM, healthcheck);
alarm(healthchecktimer);
}
}
@ -612,6 +671,16 @@ int main(int argc, char **argv)
while ((opt = getopt_long(argc, argv, getoptstring, longopts,
(int *) 0)) != -1)
switch (opt) {
case 'a': /* --healthcheck-timer <time> */
if (sscanf(optarg, "%d", &healthchecktimer) != 1 || healthchecktimer < 1)
eerrorx("%s: invalid health check timer %s", applet, optarg);
break;
case 'A': /* --healthcheck-delay <time> */
if (sscanf(optarg, "%d", &healthcheckdelay) != 1 || healthcheckdelay < 1)
eerrorx("%s: invalid health check delay %s", applet, optarg);
break;
case 'D': /* --respawn-delay time */
n = sscanf(optarg, "%d", &respawn_delay);
if (n != 1 || respawn_delay < 1)
@ -668,6 +737,11 @@ int main(int argc, char **argv)
gid = gr->gr_gid;
break;
case 'H': /* sets healthchecktimer exactly like -a; the value is handed to
	   * alarm(), so it is in seconds, not minutes.
	   * NOTE(review): no long option maps to 'H' in the visible part of
	   * longopts[] -- confirm whether this short option is still wanted. */
if (sscanf(optarg, "%d", &healthchecktimer) != 1 || healthchecktimer < 1)
eerrorx("%s: invalid health check timer %s", applet, optarg);
break;
case 'k':
if (parse_mode(&numask, optarg))
eerrorx("%s: invalid mode `%s'",

View File

@ -22,6 +22,28 @@ The following is a brief guide on using this capability.
instructs it not to fork to the command_args_foreground variable shown
below.
# Health Checks
Health checks are a way to make sure a service monitored by
supervise-daemon stays healthy. To configure a health check for a
service, you need to write a healthcheck() function, and optionally an
unhealthy() function in the service script. Also, you will need to set
the healthcheck_timer and optionally healthcheck_delay variables.
## healthcheck() function
The healthcheck() function is run repeatedly based on the settings of
the healthcheck_* variables. This function should return zero if the
service is currently healthy or non-zero otherwise.
## unhealthy() function
If the healthcheck() function returns non-zero, the unhealthy() function
is run, then the service is restarted. Since the service will be
restarted by the supervisor, the unhealthy function should not try to
restart it; the purpose of the function is to allow any cleanup tasks
other than restarting the service to be run.
# Variable Settings
The most important setting is the supervisor variable. At the top of
@ -52,6 +74,20 @@ This should be used if the daemon you want to monitor
forks and goes to the background by default. This should be set to the
command line option that instructs the daemon to stay in the foreground.
``` sh
healthcheck_delay=seconds
```
This is the delay, in seconds, before the first health check is run.
If it is not set, we use the value of healthcheck_timer.
``` sh
healthcheck_timer=seconds
```
This is the number of seconds between health checks. If it is not set,
no health checks will be run.
``` sh
respawn_delay
```