aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRasmus Dahlberg <rgdd@glasklarteknik.se>2025-01-04 14:22:20 +0100
committerRasmus Dahlberg <rgdd@glasklarteknik.se>2025-01-04 14:22:20 +0100
commit8a31fb55ef76714f617f079b67a573906f9cea77 (patch)
tree8821e2b41c4e32502debb461558d10007fddf0cf
parent2d4c4b7ac260958b73527c5df366ba4944f4cd13 (diff)
prometheus: Add silentct_need_restart
-rw-r--r--docs/metrics.md11
-rw-r--r--internal/manager/manager.go1
-rw-r--r--internal/metrics/metrics.go13
-rwxr-xr-xscripts/silentct-check10
4 files changed, 34 insertions, 1 deletions
diff --git a/docs/metrics.md b/docs/metrics.md
index b2b3445..6fa12c1 100644
--- a/docs/metrics.md
+++ b/docs/metrics.md
@@ -54,3 +54,14 @@ silentct_error_counter 0
```
Do not use for alerting, this metric is too noisy and currently used for debug.
+
+## `"silentct_need_restart"`
+
+```
+# HELP silentct_need_restart A non-zero value if the monitor needs restarting.
+# TYPE silentct_need_restart gauge
+silentct_need_restart 0
+```
+
+Restarts are normally not needed, but here is a metric until the `silentct-mon`
+implementation can ensure that all corner cases are handled without restarts.
diff --git a/internal/manager/manager.go b/internal/manager/manager.go
index e302b4f..90f6507 100644
--- a/internal/manager/manager.go
+++ b/internal/manager/manager.go
@@ -182,6 +182,7 @@ func (mgr *Manager) addLogs(ctx context.Context, logs []metadata.Log) {
mgr.Logger.Infof("adding log %s with existing state on disk\n", log.URL)
} else if err != nil {
mgr.Logger.Noticef("restart required: failed to bootstrap new log %s: %v\n", log.URL, err)
+ mgr.Metrics.NeedRestart()
} else {
mgr.Logger.Infof("bootstrapping log %s at next index 0\n", log.URL)
}
diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go
index 17e9cce..c5ff0d6 100644
--- a/internal/metrics/metrics.go
+++ b/internal/metrics/metrics.go
@@ -12,6 +12,7 @@ type Metrics struct {
logTimestamp *prometheus.GaugeVec
certificateAlert *prometheus.GaugeVec
errorCounter prometheus.Counter
+ needRestart prometheus.Gauge
}
func NewMetrics(registry *prometheus.Registry) *Metrics {
@@ -50,9 +51,15 @@ func NewMetrics(registry *prometheus.Registry) *Metrics {
Help: "The number of errors propagated to the main loop.",
},
),
+ needRestart: prometheus.NewGauge(
+ prometheus.GaugeOpts{
+ Name: "silentct_need_restart",
+ Help: "A non-zero value if the monitor needs restarting.",
+ },
+ ),
}
- registry.MustRegister(m.logSize, m.logIndex, m.logTimestamp, m.certificateAlert, m.errorCounter)
+ registry.MustRegister(m.logSize, m.logIndex, m.logTimestamp, m.certificateAlert, m.errorCounter, m.needRestart)
return m
}
@@ -80,3 +87,7 @@ func (m *Metrics) CertificateAlert(alerts []index.CertificateInfo) {
func (m *Metrics) CountError() {
m.errorCounter.Inc()
}
+
+func (m *Metrics) NeedRestart() {
+ m.needRestart.Set(float64(1))
+}
diff --git a/scripts/silentct-check b/scripts/silentct-check
index 35d38c7..a6a79a3 100755
--- a/scripts/silentct-check
+++ b/scripts/silentct-check
@@ -77,6 +77,12 @@ while IFS= read -r line; do
done <"$metrics_file"
#-----------------------------------------------------------------------------------------
+# Parse restart metric
+#-----------------------------------------------------------------------------------------
+line=$(grep "^silentct_need_restart" "$metrics_file")
+need_restart=$(echo $line | awk '{print $NF}')
+
+#-----------------------------------------------------------------------------------------
# Emit warnings
#-----------------------------------------------------------------------------------------
now=$(date +%s)
@@ -96,3 +102,7 @@ for stored_at in "${!certificate_alert[@]}"; do
observed_at=$(awk "BEGIN {printf \"%.0f\", ${certificate_alert[$stored_at]}}")
warn "(mis)-issued certificate? Observed at $(date -d @$observed_at) -- see $stored_at"
done
+
+if [[ $need_restart != 0 ]]; then
+ warn "silentct-mon needs to be restarted"
+fi