From e40606ecc44770f551ed4823d89d63602bc8b27f Mon Sep 17 00:00:00 2001 From: Paul Buetow Date: Wed, 17 May 2023 01:03:38 +0300 Subject: add retry and retry interval check config options --- README.md | 8 +++++-- internal/check.go | 8 ++++--- internal/runchecks.go | 64 ++++++++++++++++++++++++++++++--------------------- 3 files changed, 49 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index dcc2998..9c5b8d5 100644 --- a/README.md +++ b/README.md @@ -105,11 +105,15 @@ To configure Gogios, create a JSON configuration file (e.g., `/etc/gogios.json`) "Checks": { "Check ICMP4 www.foo.zone": { "Plugin": "/usr/local/libexec/nagios/check_ping", - "Args": [ "-H", "www.foo.zone", "-4", "-w", "50,10%", "-c", "100,15%" ] + "Args": [ "-H", "www.foo.zone", "-4", "-w", "50,10%", "-c", "100,15%" ], + "Retries": 3, + "RetryInterval": 10 }, "Check ICMP6 www.foo.zone": { "Plugin": "/usr/local/libexec/nagios/check_ping", "Args": [ "-H", "www.foo.zone", "-6", "-w", "50,10%", "-c", "100,15%" ] + "Retries": 3, + "RetryInterval": 10 }, "www.foo.zone HTTP IPv4": { "Plugin": "/usr/local/libexec/nagios/check_http", @@ -175,7 +179,7 @@ To create a high-availability Gogios setup, you can install Gogios on two server # But why? -With experience in monitoring solutions like Nagios, Icinga, Prometheus and OpsGenie, I know that these tools often came with many features that I didn't necessarily need for personal use. Contact groups, host groups, re-check intervals, check clustering, and the requirement of operating a DBMS and a WebUI added complexity and bloat to my monitoring setup. +With experience in monitoring solutions like Nagios, Icinga, Prometheus and OpsGenie, I know that these tools often came with many features that I didn't necessarily need for personal use. Contact groups, host groups, check clustering, and the requirement of operating a DBMS and a WebUI added complexity and bloat to my monitoring setup. My primary goal was to have a single email address for notifications and a simple mechanism to periodically execute standard Nagios check scripts and notify me of any state changes. I wanted the most minimalistic monitoring solution possible but wasn't satisfied with the available options. diff --git a/internal/check.go b/internal/check.go index a9efd29..47dc6f9 100644 --- a/internal/check.go +++ b/internal/check.go @@ -8,9 +8,11 @@ import ( ) type check struct { - Plugin string - Args []string - DependsOn []string `json:"DependsOn,omitempty"` + Plugin string + Args []string + DependsOn []string `json:"DependsOn,omitempty"` + Retries int `json:"Retries,omitempty"` + RetryInterval int `json:"RetryInterval,omitempty"` } type namedCheck struct { diff --git a/internal/runchecks.go b/internal/runchecks.go index 48a80e8..fdd747a 100644 --- a/internal/runchecks.go +++ b/internal/runchecks.go @@ -7,8 +7,8 @@ import ( "time" ) -func runChecks(globalCtx context.Context, state state, config config) state { - limiterCh := make(chan struct{}, config.CheckConcurrency) +func runChecks(ctx context.Context, state state, config config) state { + limitCh := make(chan struct{}, config.CheckConcurrency) inputCh := make(chan namedCheck) outputCh := make(chan checkResult) deps := newDependency(config) @@ -35,30 +35,8 @@ func runChecks(globalCtx context.Context, state state, config config) state { for check := range inputCh { go func(check namedCheck) { - defer inputWg.Done() - - if err := deps.wait(globalCtx, check.DependsOn); err != nil { - deps.notOk(check.name) - outputCh <- check.skip(err.Error()) - return - } - - limiterCh <- struct{}{} - defer func() { <-limiterCh }() - - ctx, cancel := context.WithTimeout(globalCtx, - time.Duration(config.CheckTimeoutS)*time.Second) - defer cancel() - - checkResult := check.run(ctx) - - if checkResult.status == critical { - deps.notOk(check.name) - } else { - deps.ok(check.name) - } - - outputCh <- checkResult + outputCh <- runCheck(ctx, limitCh, deps, check, config, check.Retries) + inputWg.Done() }(check) } @@ -71,3 +49,37 @@ func runChecks(globalCtx context.Context, state state, config config) state { return state } + +func runCheck(ctx context.Context, limitCh chan struct{}, + deps dependency, check namedCheck, config config, retries int) checkResult { + + if err := deps.wait(ctx, check.DependsOn); err != nil { + deps.notOk(check.name) + return check.skip(err.Error()) + } + + limitCh <- struct{}{} + + checkCtx, cancel := context.WithTimeout(ctx, + time.Duration(config.CheckTimeoutS)*time.Second) + defer cancel() + + checkResult := check.run(checkCtx) + + if checkResult.status != ok && retries > 0 { + <-limitCh + retryDuration := time.Duration(check.RetryInterval) * time.Second + time.Sleep(retryDuration) + log.Printf("Retrying %s after %v", check.name, retryDuration) + return runCheck(ctx, limitCh, deps, check, config, retries-1) + } + + if checkResult.status == critical { + deps.notOk(check.name) + } else { + deps.ok(check.name) + } + + <-limitCh + return checkResult +} -- cgit v1.2.3