supervise-daemon: add a --respawn-limit option

Allow limiting the number of times supervise-daemon will attempt to respawn a
daemon once it has died to prevent infinite respawning. Also, set a
reasonable default limit (10 times in a 5 second period).

This is for issue #126.
This commit is contained in:
William Hubbs 2017-05-09 18:20:52 -05:00
parent 96c8ba2fb5
commit 3673040722
4 changed files with 63 additions and 1 deletions

View File

@ -167,6 +167,12 @@ Display name used for the above defined command.
Process name to match when signaling the daemon. Process name to match when signaling the daemon.
.It Ar stopsig .It Ar stopsig
Signal to send when stopping the daemon. Signal to send when stopping the daemon.
.It Ar respawn_limit
Respawn limit
.Xr supervise-daemon 8
will use for this daemon. See
.Xr supervise-daemon 8
for more information about this setting.
.It Ar retry .It Ar retry
Retry schedule to use when stopping the daemon. It can either be a Retry schedule to use when stopping the daemon. It can either be a
timeout in seconds or multiple signal/timeout pairs (like SIGTERM/5). timeout in seconds or multiple signal/timeout pairs (like SIGTERM/5).

View File

@ -34,6 +34,8 @@
.Ar user .Ar user
.Fl r , -chroot .Fl r , -chroot
.Ar chrootpath .Ar chrootpath
.Fl R , -respawn-limit
.Ar limit
.Fl 1 , -stdout .Fl 1 , -stdout
.Ar logfile .Ar logfile
.Fl 2 , -stderr .Fl 2 , -stderr
@ -99,6 +101,24 @@ Modifies the scheduling priority of the daemon.
.It Fl r , -chroot Ar path .It Fl r , -chroot Ar path
chroot to this directory before starting the daemon. All other paths, such chroot to this directory before starting the daemon. All other paths, such
as the path to the daemon, chdir and pidfile, should be relative to the chroot. as the path to the daemon, chdir and pidfile, should be relative to the chroot.
.It Fl R , -respawn-limit Ar limit
Control how aggressively
.Nm
will try to respawn a daemon when it fails to start. The limit argument
can be a pair of integers separated by a colon or the string unlimited.
.Pp
If a pair of integers is given, the first is a maximum number of respawn
attempts and the second is a time period. It should be interpreted as:
If the daemon dies and has to be respawned more than <first number>
times in any time period of <second number> seconds, exit and give up.
.Pp
For example, the default is 10:5.
This means if the supervisor respawns a daemon more than ten times
in any 5 second period, it gives up and exits.
.Pp
If unlimited is given as the limit, it means that the supervisor will
not exit or give up, no matter how many times the daemon it is
supervising needs to be respawned.
.It Fl u , -user Ar user .It Fl u , -user Ar user
Start the daemon as the specified user. Start the daemon as the specified user.
.It Fl 1 , -stdout Ar logfile .It Fl 1 , -stdout Ar logfile

View File

@ -25,6 +25,7 @@ supervise_start()
eval supervise-daemon --start \ eval supervise-daemon --start \
${chroot:+--chroot} $chroot \ ${chroot:+--chroot} $chroot \
${pidfile:+--pidfile} $pidfile \ ${pidfile:+--pidfile} $pidfile \
${respawn_limit:+--respawn-limit} $respawn_limit \
${command_user+--user} $command_user \ ${command_user+--user} $command_user \
$supervise_daemon_args \ $supervise_daemon_args \
$command \ $command \

View File

@ -66,7 +66,7 @@ static struct pam_conv conv = { NULL, NULL};
const char *applet = NULL; const char *applet = NULL;
const char *extraopts = NULL; const char *extraopts = NULL;
const char *getoptstring = "d:e:g:I:Kk:N:p:r:Su:1:2:" \ const char *getoptstring = "d:e:g:I:Kk:N:p:r:R:Su:1:2:" \
getoptstring_COMMON; getoptstring_COMMON;
const struct option longopts[] = { const struct option longopts[] = {
{ "chdir", 1, NULL, 'd'}, { "chdir", 1, NULL, 'd'},
@ -79,6 +79,7 @@ const struct option longopts[] = {
{ "pidfile", 1, NULL, 'p'}, { "pidfile", 1, NULL, 'p'},
{ "user", 1, NULL, 'u'}, { "user", 1, NULL, 'u'},
{ "chroot", 1, NULL, 'r'}, { "chroot", 1, NULL, 'r'},
{ "respawn-limit", 1, NULL, 'R'},
{ "start", 0, NULL, 'S'}, { "start", 0, NULL, 'S'},
{ "stdout", 1, NULL, '1'}, { "stdout", 1, NULL, '1'},
{ "stderr", 1, NULL, '2'}, { "stderr", 1, NULL, '2'},
@ -95,6 +96,7 @@ const char * const longopts_help[] = {
"Match pid found in this file", "Match pid found in this file",
"Change the process user", "Change the process user",
"Chroot to this directory", "Chroot to this directory",
"set a respawn limit",
"Start daemon", "Start daemon",
"Redirect stdout to file", "Redirect stdout to file",
"Redirect stderr to file", "Redirect stderr to file",
@ -424,7 +426,13 @@ int main(int argc, char **argv)
char *p; char *p;
char *token; char *token;
int i; int i;
int n;
char exec_file[PATH_MAX]; char exec_file[PATH_MAX];
int respawn_count = 0;
int respawn_max = 10;
int respawn_period = 5;
time_t respawn_now= 0;
time_t first_spawn= 0;
struct passwd *pw; struct passwd *pw;
struct group *gr; struct group *gr;
FILE *fp; FILE *fp;
@ -519,6 +527,17 @@ int main(int argc, char **argv)
ch_root = optarg; ch_root = optarg;
break; break;
case 'R': /* --respawn-limit unlimited|count:period */
if (strcasecmp(optarg, "unlimited") == 0) {
respawn_max = 0;
respawn_period = 0;
} else {
n = sscanf(optarg, "%d:%d", &respawn_max, &respawn_period);
if (n != 2 || respawn_max < 1 || respawn_period < 1)
eerrorx("Invalid respawn-limit setting '%s'", optarg);
}
break;
case 'u': /* --user <username>|<uid> */ case 'u': /* --user <username>|<uid> */
{ {
p = optarg; p = optarg;
@ -713,6 +732,22 @@ int main(int argc, char **argv)
syslog(LOG_INFO, "stopping %s, pid %d", exec, child_pid); syslog(LOG_INFO, "stopping %s, pid %d", exec, child_pid);
kill(child_pid, SIGTERM); kill(child_pid, SIGTERM);
} else { } else {
if (respawn_max > 0 && respawn_period > 0) {
respawn_now = time(NULL);
if (first_spawn == 0)
first_spawn = respawn_now;
if (respawn_now - first_spawn > respawn_period) {
respawn_count = 0;
first_spawn = 0;
} else
respawn_count++;
if (respawn_count >= respawn_max) {
syslog(LOG_INFO, "respawned \"%s\" too many times, "
"exiting", exec);
exiting = true;
continue;
}
}
if (WIFEXITED(i)) if (WIFEXITED(i))
syslog(LOG_INFO, "%s, pid %d, exited with return code %d", syslog(LOG_INFO, "%s, pid %d, exited with return code %d",
exec, child_pid, WEXITSTATUS(i)); exec, child_pid, WEXITSTATUS(i));