add retry and retry interval check config options

author: Paul Buetow <paul@buetow.org> 2023-05-17 01:03:38 +0300
committer: Paul Buetow <paul@buetow.org> 2023-05-17 01:03:38 +0300
commit: e40606ecc44770f551ed4823d89d63602bc8b27f (patch)
tree: 2d6f50fd71a5f78fc6122fd2ade6c7051143db83
parent: 7d8272f3c94bc69ab93e059304d65942e7065055 (diff)
3 files changed, 49 insertions, 31 deletions
diff --git a/README.md b/README.md
index dcc2998..9c5b8d5 100644
--- a/README.md
+++ b/README.md
@@ -105,11 +105,15 @@ To configure Gogios, create a JSON configuration file (e.g., `/etc/gogios.json`)
   "Checks": {
     "Check ICMP4 www.foo.zone": {
       "Plugin": "/usr/local/libexec/nagios/check_ping",
-      "Args": [ "-H", "www.foo.zone", "-4", "-w", "50,10%", "-c", "100,15%" ]
+      "Args": [ "-H", "www.foo.zone", "-4", "-w", "50,10%", "-c", "100,15%" ],
+      "Retries": 3,
+      "RetryInterval": 10
     },
     "Check ICMP6 www.foo.zone": {
       "Plugin": "/usr/local/libexec/nagios/check_ping",
-      "Args": [ "-H", "www.foo.zone", "-6", "-w", "50,10%", "-c", "100,15%" ]
+      "Args": [ "-H", "www.foo.zone", "-6", "-w", "50,10%", "-c", "100,15%" ],
+      "Retries": 3,
+      "RetryInterval": 10
     },
     "www.foo.zone HTTP IPv4": {
       "Plugin": "/usr/local/libexec/nagios/check_http",
@@ -175,7 +179,7 @@ To create a high-availability Gogios setup, you can install Gogios on two server
 
 # But why?
 
-With experience in monitoring solutions like Nagios, Icinga, Prometheus and OpsGenie, I know that these tools often came with many features that I didn't necessarily need for personal use. Contact groups, host groups, re-check intervals, check clustering, and the requirement of operating a DBMS and a WebUI added complexity and bloat to my monitoring setup.
+With experience in monitoring solutions like Nagios, Icinga, Prometheus and OpsGenie, I know that these tools often came with many features that I didn't necessarily need for personal use. Contact groups, host groups, check clustering, and the requirement of operating a DBMS and a WebUI added complexity and bloat to my monitoring setup.
 
 My primary goal was to have a single email address for notifications and a simple mechanism to periodically execute standard Nagios check scripts and notify me of any state changes. I wanted the most minimalistic monitoring solution possible but wasn't satisfied with the available options.
 
diff --git a/internal/check.go b/internal/check.go
index a9efd29..47dc6f9 100644
--- a/internal/check.go
+++ b/internal/check.go
@@ -8,9 +8,11 @@ import (
 )
 
 type check struct {
-	Plugin    string
-	Args      []string
-	DependsOn []string `json:"DependsOn,omitempty"`
+	Plugin        string
+	Args          []string
+	DependsOn     []string `json:"DependsOn,omitempty"`
+	Retries       int      `json:"Retries,omitempty"`
+	RetryInterval int      `json:"RetryInterval,omitempty"`
 }
 
 type namedCheck struct {
diff --git a/internal/runchecks.go b/internal/runchecks.go
index 48a80e8..fdd747a 100644
--- a/internal/runchecks.go
+++ b/internal/runchecks.go
@@ -7,8 +7,8 @@ import (
 	"time"
 )
 
-func runChecks(globalCtx context.Context, state state, config config) state {
-	limiterCh := make(chan struct{}, config.CheckConcurrency)
+func runChecks(ctx context.Context, state state, config config) state {
+	limitCh := make(chan struct{}, config.CheckConcurrency)
 	inputCh := make(chan namedCheck)
 	outputCh := make(chan checkResult)
 	deps := newDependency(config)
@@ -35,30 +35,8 @@ func runChecks(globalCtx context.Context, state state, config config) state {
 
 	for check := range inputCh {
 		go func(check namedCheck) {
-			defer inputWg.Done()
-
-			if err := deps.wait(globalCtx, check.DependsOn); err != nil {
-				deps.notOk(check.name)
-				outputCh <- check.skip(err.Error())
-				return
-			}
-
-			limiterCh <- struct{}{}
-			defer func() { <-limiterCh }()
-
-			ctx, cancel := context.WithTimeout(globalCtx,
-				time.Duration(config.CheckTimeoutS)*time.Second)
-			defer cancel()
-
-			checkResult := check.run(ctx)
-
-			if checkResult.status == critical {
-				deps.notOk(check.name)
-			} else {
-				deps.ok(check.name)
-			}
-
-			outputCh <- checkResult
+			outputCh <- runCheck(ctx, limitCh, deps, check, config, check.Retries)
+			inputWg.Done()
 		}(check)
 	}
 
@@ -71,3 +49,37 @@ func runChecks(globalCtx context.Context, state state, config config) state {
 
 	return state
 }
+
+func runCheck(ctx context.Context, limitCh chan struct{},
+	deps dependency, check namedCheck, config config, retries int) checkResult {
+
+	if err := deps.wait(ctx, check.DependsOn); err != nil {
+		deps.notOk(check.name)
+		return check.skip(err.Error())
+	}
+
+	limitCh <- struct{}{}
+
+	checkCtx, cancel := context.WithTimeout(ctx,
+		time.Duration(config.CheckTimeoutS)*time.Second)
+	defer cancel()
+
+	checkResult := check.run(checkCtx)
+
+	if checkResult.status != ok && retries > 0 {
+		<-limitCh
+		retryDuration := time.Duration(check.RetryInterval) * time.Second
+		time.Sleep(retryDuration)
+		log.Printf("Retrying %s after %v", check.name, retryDuration)
+		return runCheck(ctx, limitCh, deps, check, config, retries-1)
+	}
+
+	if checkResult.status == critical {
+		deps.notOk(check.name)
+	} else {
+		deps.ok(check.name)
+	}
+
+	<-limitCh
+	return checkResult
+}
author	Paul Buetow <paul@buetow.org>	2023-05-17 01:03:38 +0300
committer	Paul Buetow <paul@buetow.org>	2023-05-17 01:03:38 +0300
commit	e40606ecc44770f551ed4823d89d63602bc8b27f (patch)
tree	2d6f50fd71a5f78fc6122fd2ade6c7051143db83
parent	7d8272f3c94bc69ab93e059304d65942e7065055 (diff)