diff options
-rw-r--r-- | docs/metrics.md | 11 | ||||
-rw-r--r-- | internal/manager/manager.go | 1 | ||||
-rw-r--r-- | internal/metrics/metrics.go | 13 | ||||
-rwxr-xr-x | scripts/silentct-check | 10 |
4 files changed, 34 insertions, 1 deletions
diff --git a/docs/metrics.md b/docs/metrics.md index b2b3445..6fa12c1 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -54,3 +54,14 @@ silentct_error_counter 0 ``` Do not use for alerting, this metric is too noisy and currently used for debug. + +## `"silentct_need_restart"` + +``` +# HELP silentct_need_restart A non-zero value if the monitor needs restarting. +# TYPE silentct_need_restart gauge +silentct_need_restart 0 +``` + +Restarts are normally not needed; but here's a metric until the `silentct-mon` +implementation can assure that all corner-cases are handled without restarts. diff --git a/internal/manager/manager.go b/internal/manager/manager.go index e302b4f..90f6507 100644 --- a/internal/manager/manager.go +++ b/internal/manager/manager.go @@ -182,6 +182,7 @@ func (mgr *Manager) addLogs(ctx context.Context, logs []metadata.Log) { mgr.Logger.Infof("adding log %s with existing state on disk\n", log.URL) } else if err != nil { mgr.Logger.Noticef("restart required: failed to bootstrap new log %s: %v\n", log.URL, err) + mgr.Metrics.NeedRestart() } else { mgr.Logger.Infof("bootstrapping log %s at next index 0\n", log.URL) } diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go index 17e9cce..c5ff0d6 100644 --- a/internal/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -12,6 +12,7 @@ type Metrics struct { logTimestamp *prometheus.GaugeVec certificateAlert *prometheus.GaugeVec errorCounter prometheus.Counter + needRestart prometheus.Gauge } func NewMetrics(registry *prometheus.Registry) *Metrics { @@ -50,9 +51,15 @@ func NewMetrics(registry *prometheus.Registry) *Metrics { Help: "The number of errors propagated to the main loop.", }, ), + needRestart: prometheus.NewGauge( + prometheus.GaugeOpts{ + Name: "silentct_need_restart", + Help: "A non-zero value if the monitor needs restarting.", + }, + ), } - registry.MustRegister(m.logSize, m.logIndex, m.logTimestamp, m.certificateAlert, m.errorCounter) + registry.MustRegister(m.logSize, m.logIndex, m.logTimestamp, m.certificateAlert, m.errorCounter, m.needRestart) return m } @@ -80,3 +87,7 @@ func (m *Metrics) CertificateAlert(alerts []index.CertificateInfo) { func (m *Metrics) CountError() { m.errorCounter.Inc() } + +func (m *Metrics) NeedRestart() { + m.needRestart.Set(float64(1)) +} diff --git a/scripts/silentct-check b/scripts/silentct-check index 35d38c7..a6a79a3 100755 --- a/scripts/silentct-check +++ b/scripts/silentct-check @@ -77,6 +77,12 @@ while IFS= read -r line; do done <"$metrics_file" #----------------------------------------------------------------------------------------- +# Parse restart metric +#----------------------------------------------------------------------------------------- +line=$(grep "^silentct_need_restart" "$metrics_file") +need_restart=$(echo $line | awk '{print $NF}') + +#----------------------------------------------------------------------------------------- # Emit warnings #----------------------------------------------------------------------------------------- now=$(date +%s) @@ -96,3 +102,7 @@ for stored_at in "${!certificate_alert[@]}"; do observed_at=$(awk "BEGIN {printf \"%.0f\", ${certificate_alert[$stored_at]}}") warn "(mis)-issued certificate? Observed at $(date -d @$observed_at) -- see $stored_at" done + +if [[ $need_restart != 0 ]]; then + warn "silentct-mon needs to be restarted" +fi |