summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaul Buetow <paul@buetow.org>2023-05-17 01:03:38 +0300
committerPaul Buetow <paul@buetow.org>2023-05-17 01:03:38 +0300
commite40606ecc44770f551ed4823d89d63602bc8b27f (patch)
tree2d6f50fd71a5f78fc6122fd2ade6c7051143db83
parent7d8272f3c94bc69ab93e059304d65942e7065055 (diff)
add retry and retry interval check config options
-rw-r--r--README.md8
-rw-r--r--internal/check.go8
-rw-r--r--internal/runchecks.go64
3 files changed, 49 insertions, 31 deletions
diff --git a/README.md b/README.md
index dcc2998..9c5b8d5 100644
--- a/README.md
+++ b/README.md
@@ -105,11 +105,15 @@ To configure Gogios, create a JSON configuration file (e.g., `/etc/gogios.json`)
"Checks": {
"Check ICMP4 www.foo.zone": {
"Plugin": "/usr/local/libexec/nagios/check_ping",
- "Args": [ "-H", "www.foo.zone", "-4", "-w", "50,10%", "-c", "100,15%" ]
+ "Args": [ "-H", "www.foo.zone", "-4", "-w", "50,10%", "-c", "100,15%" ],
+ "Retries": 3,
+ "RetryInterval": 10
},
"Check ICMP6 www.foo.zone": {
"Plugin": "/usr/local/libexec/nagios/check_ping",
- "Args": [ "-H", "www.foo.zone", "-6", "-w", "50,10%", "-c", "100,15%" ]
+ "Args": [ "-H", "www.foo.zone", "-6", "-w", "50,10%", "-c", "100,15%" ],
+ "Retries": 3,
+ "RetryInterval": 10
},
"www.foo.zone HTTP IPv4": {
"Plugin": "/usr/local/libexec/nagios/check_http",
@@ -175,7 +179,7 @@ To create a high-availability Gogios setup, you can install Gogios on two server
# But why?
-With experience in monitoring solutions like Nagios, Icinga, Prometheus and OpsGenie, I know that these tools often came with many features that I didn't necessarily need for personal use. Contact groups, host groups, re-check intervals, check clustering, and the requirement of operating a DBMS and a WebUI added complexity and bloat to my monitoring setup.
+With experience in monitoring solutions like Nagios, Icinga, Prometheus and OpsGenie, I know that these tools often came with many features that I didn't necessarily need for personal use. Contact groups, host groups, check clustering, and the requirement of operating a DBMS and a WebUI added complexity and bloat to my monitoring setup.
My primary goal was to have a single email address for notifications and a simple mechanism to periodically execute standard Nagios check scripts and notify me of any state changes. I wanted the most minimalistic monitoring solution possible but wasn't satisfied with the available options.
diff --git a/internal/check.go b/internal/check.go
index a9efd29..47dc6f9 100644
--- a/internal/check.go
+++ b/internal/check.go
@@ -8,9 +8,11 @@ import (
)
type check struct {
- Plugin string
- Args []string
- DependsOn []string `json:"DependsOn,omitempty"`
+ Plugin string
+ Args []string
+ DependsOn []string `json:"DependsOn,omitempty"` // passed to deps.wait before the check is run
+ Retries int `json:"Retries,omitempty"` // how many more times a check whose status is not ok is re-run
+ RetryInterval int `json:"RetryInterval,omitempty"` // pause before each re-run, in seconds
}
type namedCheck struct {
diff --git a/internal/runchecks.go b/internal/runchecks.go
index 48a80e8..fdd747a 100644
--- a/internal/runchecks.go
+++ b/internal/runchecks.go
@@ -7,8 +7,8 @@ import (
"time"
)
-func runChecks(globalCtx context.Context, state state, config config) state {
- limiterCh := make(chan struct{}, config.CheckConcurrency)
+func runChecks(ctx context.Context, state state, config config) state {
+ limitCh := make(chan struct{}, config.CheckConcurrency)
inputCh := make(chan namedCheck)
outputCh := make(chan checkResult)
deps := newDependency(config)
@@ -35,30 +35,8 @@ func runChecks(globalCtx context.Context, state state, config config) state {
for check := range inputCh {
go func(check namedCheck) {
- defer inputWg.Done()
-
- if err := deps.wait(globalCtx, check.DependsOn); err != nil {
- deps.notOk(check.name)
- outputCh <- check.skip(err.Error())
- return
- }
-
- limiterCh <- struct{}{}
- defer func() { <-limiterCh }()
-
- ctx, cancel := context.WithTimeout(globalCtx,
- time.Duration(config.CheckTimeoutS)*time.Second)
- defer cancel()
-
- checkResult := check.run(ctx)
-
- if checkResult.status == critical {
- deps.notOk(check.name)
- } else {
- deps.ok(check.name)
- }
-
- outputCh <- checkResult
+ outputCh <- runCheck(ctx, limitCh, deps, check, config, check.Retries)
+ inputWg.Done()
}(check)
}
@@ -71,3 +49,37 @@ func runChecks(globalCtx context.Context, state state, config config) state {
return state
}
+
+// runCheck runs a single check and retries it while it does not return ok.
+//
+// It first waits for the dependencies listed in check.DependsOn. If waiting
+// fails, the check is marked as not ok in deps and a skipped result carrying
+// the error text is returned. Otherwise the check is run while holding one
+// slot of limitCh, with a timeout of config.CheckTimeoutS seconds.
+//
+// If the result is not ok and retries is greater than zero, runCheck waits
+// check.RetryInterval seconds and starts over with retries-1. When ctx is
+// done during that wait, retrying stops and the last result is reported.
+//
+// The reported result is recorded in deps: critical marks the check as not
+// ok, every other status marks it as ok.
+func runCheck(ctx context.Context, limitCh chan struct{},
+	deps dependency, check namedCheck, config config, retries int) checkResult {
+
+	if err := deps.wait(ctx, check.DependsOn); err != nil {
+		deps.notOk(check.name)
+		return check.skip(err.Error())
+	}
+
+	// Run one attempt inside its own function so that the limiter slot and
+	// the timeout context are released as soon as the attempt is over (even
+	// if check.run panics) instead of piling up across recursive retries.
+	result := func() checkResult {
+		limitCh <- struct{}{}
+		defer func() { <-limitCh }()
+
+		checkCtx, cancel := context.WithTimeout(ctx,
+			time.Duration(config.CheckTimeoutS)*time.Second)
+		defer cancel()
+
+		return check.run(checkCtx)
+	}()
+
+	if result.status != ok && retries > 0 {
+		retryDuration := time.Duration(check.RetryInterval) * time.Second
+		// Log before waiting so the message announces the upcoming retry.
+		log.Printf("Retrying %s after %v", check.name, retryDuration)
+
+		timer := time.NewTimer(retryDuration)
+		select {
+		case <-timer.C:
+			return runCheck(ctx, limitCh, deps, check, config, retries-1)
+		case <-ctx.Done():
+			// Shutting down: do not block for the rest of the interval;
+			// fall through and report the result of the last attempt.
+			timer.Stop()
+		}
+	}
+
+	if result.status == critical {
+		deps.notOk(check.name)
+	} else {
+		deps.ok(check.name)
+	}
+
+	return result
+}