From d8e0b9c937dc974fef7484db3f85fabfe9272d7d Mon Sep 17 00:00:00 2001
From: Rasmus Dahlberg <rgdd@glasklarteknik.se>
Date: Sat, 4 Jan 2025 14:22:20 +0100
Subject: prometheus: Add basic metrics for alerting

- Detect if we're falling behind while downloading
- Detect if any found certificates are alerting
---
 cmd/silentct-mon/main.go      |  22 ++++++++++
 cmd/silentct-mon/silentct-mon | Bin 0 -> 14044167 bytes
 go.mod                        |   8 ++++
 go.sum                        |  18 ++++++++
 integration/smoke-test        |   3 ++
 internal/manager/manager.go   |  12 +++++-
 internal/metrics/metrics.go   |  71 ++++++++++++++++++++++++++++++
 pkg/storage/index/index.go    |   7 +++
 scripts/silentct-check        |  98 ++++++++++++++++++++++++++++++++++++++++++
 9 files changed, 238 insertions(+), 1 deletion(-)
 create mode 100755 cmd/silentct-mon/silentct-mon
 create mode 100644 internal/metrics/metrics.go
 create mode 100755 scripts/silentct-check

diff --git a/cmd/silentct-mon/main.go b/cmd/silentct-mon/main.go
index e2ecdb7..ce0a548 100644
--- a/cmd/silentct-mon/main.go
+++ b/cmd/silentct-mon/main.go
@@ -6,6 +6,7 @@ import (
 	"flag"
 	"fmt"
 	"log"
+	"net/http"
 	"os"
 	"os/signal"
 	"strings"
@@ -13,11 +14,14 @@ import (
 	"syscall"
 	"time"
 
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/promhttp"
 	"rgdd.se/silentct/internal/feedback"
 	"rgdd.se/silentct/internal/flagopt"
 	"rgdd.se/silentct/internal/ioutil"
 	"rgdd.se/silentct/internal/logger"
 	"rgdd.se/silentct/internal/manager"
+	"rgdd.se/silentct/internal/metrics"
 	"rgdd.se/silentct/internal/monitor"
 	"rgdd.se/silentct/pkg/policy"
 )
@@ -43,6 +47,7 @@ Options:
   -e, --please-exit    Toggle to only run until up-to-date (Default: false)
   -f, --force          Override santity checks that may not be fatal (Default: false)
   -o, --output-file    File that all output will be written to (Default: stdout)
+  -m, --metrics-at     Host address to serve /metrics on (Default: disabled)
   -p, --pull-interval  How often nodes are pulled for certificates (Default: 15m)
   -v, --verbosity      Leveled logging output (default: NOTICE)
   -w, --num-workers    Number of parallel workers to fetch each log with (Default: 1)
@@ -57,6 +62,7 @@ type config struct {
 	directory    string
 	pleaseExit   bool
 	force        bool
+	metricsAt    string
 	outputFile   string
 	pullInterval time.Duration
 	numWorkers   uint
@@ -75,6 +81,7 @@ func configure(cmd string, args []string) (cfg config, err error) {
 	flagopt.StringOpt(fs, &cfg.directory, "directory", "d", "")
 	flagopt.BoolOpt(fs, &cfg.pleaseExit, "please-exit", "e", false)
 	flagopt.BoolOpt(fs, &cfg.force, "force", "f", false)
+	flagopt.StringOpt(fs, &cfg.metricsAt, "metrics-at", "m", "")
 	flagopt.StringOpt(fs, &cfg.outputFile, "output-file", "o", "")
 	flagopt.DurationOpt(fs, &cfg.pullInterval, "pull-interval", "p", 15*time.Minute)
 	flagopt.StringOpt(fs, &cfg.verbosity, "verbosity", "v", logger.LevelNotice.String())
@@ -144,10 +151,12 @@ func main() {
 	errorCh := make(chan error)
 	defer close(errorCh)
 
+	registry := prometheus.NewRegistry()
 	mgr, err := manager.New(manager.Config{
 		Policy:     cfg.policy,
 		Bootstrap:  cfg.bootstrap,
 		Directory:  cfg.directory,
+		Metrics:    metrics.NewMetrics(registry),
 		Logger:     cfg.log,
 		AlertDelay: cfg.pullInterval * 3 / 2,
 	}, feventCh, meventCh, mconfigCh, errorCh)
@@ -203,6 +212,36 @@ func main() {
 		fb.RunForever(ctx)
 	}()
 
+	if cfg.metricsAt != "" {
+		// Serve on a dedicated mux and server instead of the process-wide
+		// defaults so that the listener can be stopped when ctx is cancelled.
+		mux := http.NewServeMux()
+		mux.Handle("/metrics", promhttp.HandlerFor(registry, promhttp.HandlerOpts{}))
+		srv := &http.Server{
+			Addr:              cfg.metricsAt,
+			Handler:           mux,
+			ReadHeaderTimeout: 10 * time.Second,
+		}
+
+		wg.Add(2)
+		go func() {
+			defer wg.Done()
+			defer cancel()
+
+			if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed {
+				cfg.log.Fatalf("metrics: %v\n", err)
+			}
+		}()
+		go func() {
+			defer wg.Done()
+
+			// Unblock ListenAndServe on shutdown; otherwise the deferred
+			// wg.Wait() below would never return.
+			<-ctx.Done()
+			_ = srv.Close() // best effort, the process is about to exit
+		}()
+	}
+
 	os.Exit(func() int {
 		defer wg.Wait()
 		defer cancel()
diff --git a/cmd/silentct-mon/silentct-mon b/cmd/silentct-mon/silentct-mon
new file mode 100755
index 0000000..1965f48
Binary files /dev/null and b/cmd/silentct-mon/silentct-mon differ
diff --git a/go.mod b/go.mod
index 4f49688..b119535 100644
--- a/go.mod
+++ b/go.mod
@@ -4,14 +4,22 @@ go 1.22.7
 
 require (
 	github.com/google/certificate-transparency-go v1.3.0
+	github.com/prometheus/client_golang v1.20.5
 	github.com/transparency-dev/merkle v0.0.2
 	gitlab.torproject.org/rgdd/ct v0.0.0
 	golang.org/x/crypto v0.31.0
 )
 
 require (
+	github.com/beorn7/perks v1.0.1 // indirect
+	github.com/cespare/xxhash/v2 v2.3.0 // indirect
 	github.com/go-logr/logr v1.4.2 // indirect
 	github.com/google/trillian v1.7.0 // indirect
+	github.com/klauspost/compress v1.17.9 // indirect
+	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
+	github.com/prometheus/client_model v0.6.1 // indirect
+	github.com/prometheus/common v0.55.0 // indirect
+	github.com/prometheus/procfs v0.15.1 // indirect
 	golang.org/x/net v0.31.0 // indirect
 	golang.org/x/sys v0.28.0 // indirect
 	google.golang.org/genproto/googleapis/rpc v0.0.0-20241113202542-65e8d215514f // indirect
diff --git a/go.sum b/go.sum
index 8babb4d..abcd96e 100644
--- a/go.sum
+++ b/go.sum
@@ -1,3 +1,7 @@
+github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
+github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
+github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
+github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
 github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY=
 github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
 github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek=
@@ -8,6 +12,20 @@ github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
 github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
 github.com/google/trillian v1.7.0 h1:Oib7mKRvZ0Z3GjvNcn2C4clRmFouEOkBcbzw7q8JlFI=
 github.com/google/trillian v1.7.0/go.mod h1:JMp1zzzHe7j2m9m8P/eTWOaoon3R/SwgqUnFMhm4vfw=
+github.com/klauspost/compress v1.17.9 h1:6KIumPrER1LHsvBVuDa0r5xaG0Es51mhhB9BQB2qeMA=
+github.com/klauspost/compress v1.17.9/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw=
+github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
+github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
+github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
+github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
+github.com/prometheus/client_golang v1.20.5 h1:cxppBPuYhUnsO6yo/aoRol4L7q7UFfdm+bR9r+8l63Y=
+github.com/prometheus/client_golang v1.20.5/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE=
+github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E=
+github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY=
+github.com/prometheus/common v0.55.0 h1:KEi6DK7lXW/m7Ig5i47x0vRzuBsHuvJdi5ee6Y3G1dc=
+github.com/prometheus/common v0.55.0/go.mod h1:2SECS4xJG1kd8XF9IcM1gMX6510RAEL65zxzNImwdc8=
+github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc=
+github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk=
 github.com/transparency-dev/merkle v0.0.2 h1:Q9nBoQcZcgPamMkGn7ghV8XiTZ/kRxn1yCG81+twTK4=
 github.com/transparency-dev/merkle v0.0.2/go.mod h1:pqSy+OXefQ1EDUVmAJ8MUhHB9TXGuzVAT58PqBoHz1A=
 gitlab.torproject.org/rgdd/ct v0.0.0 h1:YeVjFD14bFMMY+oIT6oGuG+8MzcOkFVcKgACqK1IbD0=
diff --git a/integration/smoke-test b/integration/smoke-test
index 4d92478..a128592 100755
--- a/integration/smoke-test
+++ b/integration/smoke-test
@@ -95,3 +95,6 @@ pass "run the monitor and be warned of an unreported certificate"
 #-------------------------------------------------------------------------------
 echo "---" >&2
 echo "All smoke tests passed" >&2
+echo "For interactive tests:" >&2
+echo "go run ../cmd/silentct-mon -c "$dir/config.json" -d "$dir/state" -C "dev:silentct" -p 15s -m localhost:8080 -v DEBUG" >&2
+echo "ALERT_BACKLOG=0 ALERT_FRESHNESS=0 ../scripts/silentct-check" >&2
diff --git a/internal/manager/manager.go b/internal/manager/manager.go
index f8b05fd..0f206b8 100644
--- a/internal/manager/manager.go
+++ b/internal/manager/manager.go
@@ -10,6 +10,7 @@ import (
 	"gitlab.torproject.org/rgdd/ct/pkg/metadata"
 	"rgdd.se/silentct/internal/feedback"
 	"rgdd.se/silentct/internal/logger"
+	"rgdd.se/silentct/internal/metrics"
 	"rgdd.se/silentct/internal/monitor"
 	"rgdd.se/silentct/pkg/policy"
 	"rgdd.se/silentct/pkg/storage"
@@ -19,6 +20,7 @@ type Config struct {
 	Policy    policy.Policy
 	Bootstrap bool   // Whether a new storage should be initialized from scratch
 	Directory string // Path to a directory where everything will be stored
+	Metrics   *metrics.Metrics
 
 	// Optional
 	Logger                  *logger.Logger // Where to output messages and with what verbosity
@@ -140,6 +142,7 @@ func (mgr *Manager) startupConfig() error {
 			return err
 		}
 		mgr.mconfigCh <- monitor.MonitoredLog{Config: log, State: state}
+		mgr.Metrics.LogState(state)
 	}
 	return nil
 }
@@ -164,6 +167,7 @@ func (mgr *Manager) removeLogs(logs []metadata.Log) {
 		state, _ := mgr.GetMonitorState(log)
 		mgr.Logger.Infof("removing log %s with %d entries in its backlog\n", log.URL, state.TreeSize-state.NextIndex)
 		mgr.mconfigCh <- monitor.MonitoredLog{Config: log}
+		mgr.Metrics.RemoveLogState(state)
 	}
 }
 
@@ -179,6 +183,7 @@ func (mgr *Manager) addLogs(ctx context.Context, logs []metadata.Log) {
 			mgr.Logger.Infof("bootstrapping log %s at next index 0\n", log.URL)
 		}
 		mgr.mconfigCh <- monitor.MonitoredLog{Config: log, State: state}
+		mgr.Metrics.LogState(state)
 	}
 }
 
@@ -197,7 +202,11 @@ func (mgr *Manager) monitorJob(msg monitor.Event) error {
 	if err := mgr.AddEntries(msg.State.LogID, msg.Matches); err != nil {
 		return err
 	}
-	return mgr.SetMonitorState(msg.State.LogID, msg.State)
+	if err := mgr.SetMonitorState(msg.State.LogID, msg.State); err != nil {
+		return err
+	}
+	mgr.Metrics.LogState(msg.State)
+	return nil
 }
 
 func (mgr *Manager) alertJob() error {
@@ -208,6 +217,7 @@ func (mgr *Manager) alertJob() error {
 	for _, alert := range alerts {
 		mgr.Logger.Noticef("certificate mis-issuance? No allowlisting for %s\n", alert.StoredAt)
 	}
+	mgr.Metrics.CertificateAlert(mgr.Storage.Index.Alerting())
 	return nil
 }
 
diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go
new file mode 100644
index 0000000..113d28c
--- /dev/null
+++ b/internal/metrics/metrics.go
@@ -0,0 +1,92 @@
+// Package metrics exposes the state of silentct-mon as prometheus metrics.
+package metrics
+
+import (
+	"github.com/prometheus/client_golang/prometheus"
+	"rgdd.se/silentct/internal/monitor"
+	"rgdd.se/silentct/pkg/storage/index"
+)
+
+// Metrics holds the gauges that silentct-mon exports for alerting.  A nil
+// *Metrics is valid: all of its methods are then no-ops.
+type Metrics struct {
+	logSize          *prometheus.GaugeVec // labeled by log "id"
+	logIndex         *prometheus.GaugeVec // labeled by log "id"
+	logTimestamp     *prometheus.GaugeVec // labeled by log "id"
+	certificateAlert *prometheus.GaugeVec // labeled by "stored_at"
+}
+
+// NewMetrics creates the silentct gauges and registers them with registry.
+// It panics if the registration fails (see prometheus MustRegister).
+func NewMetrics(registry *prometheus.Registry) *Metrics {
+	m := &Metrics{
+		logSize: prometheus.NewGaugeVec(
+			prometheus.GaugeOpts{
+				Name: "silentct_log_size",
+				Help: "The number of entries in the log.",
+			},
+			[]string{"id"},
+		),
+		logIndex: prometheus.NewGaugeVec(
+			prometheus.GaugeOpts{
+				Name: "silentct_log_index",
+				Help: "The next log entry to be downloaded.",
+			},
+			[]string{"id"},
+		),
+		logTimestamp: prometheus.NewGaugeVec(
+			prometheus.GaugeOpts{
+				Name: "silentct_log_timestamp",
+				Help: "The log's UNIX timestamp in ms.",
+			},
+			[]string{"id"},
+		),
+		certificateAlert: prometheus.NewGaugeVec(
+			prometheus.GaugeOpts{
+				Name: "silentct_certificate_alert",
+				Help: "The time the certificate without allowlisting was found.",
+			},
+			[]string{"stored_at"},
+		),
+	}
+
+	registry.MustRegister(m.logSize, m.logIndex, m.logTimestamp, m.certificateAlert)
+	return m
+}
+
+// LogState sets the index, size, and timestamp gauges of the log identified
+// by state.LogID to the values found in state.
+func (m *Metrics) LogState(state monitor.State) {
+	if m == nil {
+		return
+	}
+	id := state.LogID.Base64String()
+	m.logIndex.WithLabelValues(id).Set(float64(state.NextIndex))
+	m.logSize.WithLabelValues(id).Set(float64(state.TreeSize))
+	m.logTimestamp.WithLabelValues(id).Set(float64(state.Timestamp))
+}
+
+// RemoveLogState deletes the index, size, and timestamp gauges of the log
+// identified by state.LogID.
+func (m *Metrics) RemoveLogState(state monitor.State) {
+	if m == nil {
+		return
+	}
+	labels := prometheus.Labels{"id": state.LogID.Base64String()}
+	m.logIndex.Delete(labels)
+	m.logSize.Delete(labels)
+	m.logTimestamp.Delete(labels)
+}
+
+// CertificateAlert replaces all certificate-alert gauges with one gauge per
+// entry in alerts.  Each gauge is labeled with the alert's StoredAt and has
+// ObservedAt in UNIX seconds as its value.
+func (m *Metrics) CertificateAlert(alerts []index.CertificateInfo) {
+	if m == nil {
+		return
+	}
+	m.certificateAlert.Reset()
+	for _, alert := range alerts {
+		m.certificateAlert.WithLabelValues(alert.StoredAt).Set(float64(alert.ObservedAt.Unix()))
+	}
+}
diff --git a/pkg/storage/index/index.go b/pkg/storage/index/index.go
index 0056565..bf94711 100644
--- a/pkg/storage/index/index.go
+++ b/pkg/storage/index/index.go
@@ -101,6 +101,18 @@ func (ix *Index) TriggerAlerts() ([]CertificateInfo, error) {
 	return alerts, ioutil.CommitJSON(ix.cfg.IndexFile, ix.mem)
 }
 
+// Alerting returns the first CertificateInfo of every entry in the index's
+// Alerting collection, i.e., one per certificate ID.  Empty entries are skipped.
+func (ix *Index) Alerting() (ret []CertificateInfo) {
+	for _, ci := range ix.mem.Alerting {
+		if len(ci) == 0 {
+			continue // Validate does not check the index yet, so avoid a panic
+		}
+		ret = append(ret, ci[0]) // one is enough for the same crt ID
+	}
+	return ret
+}
+
 func (index *Index) Validate() error {
 	return nil // FIXME: check that the index is populated with valid values
 }
diff --git a/scripts/silentct-check b/scripts/silentct-check
new file mode 100755
index 0000000..35d38c7
--- /dev/null
+++ b/scripts/silentct-check
@@ -0,0 +1,98 @@
+#!/bin/bash
+
+#
+# A script that emits warnings based on the silentct-mon prometheus metrics.
+# Mainly meant as an example for those that configure alerts using prometheus.
+#
+
+set -eu
+
+function warn() {
+	echo "WARNING: $*" >&2
+}
+
+function die() {
+	echo "FATAL: $*" >&2
+	exit 1
+}
+
+#-----------------------------------------------------------------------------------------
+# Options
+#-----------------------------------------------------------------------------------------
+METRICS_AT=${METRICS_AT:-http://localhost:8080/metrics}
+ALERT_BACKLOG=${ALERT_BACKLOG:-65536}
+ALERT_FRESHNESS=${ALERT_FRESHNESS:-86400}
+
+#-----------------------------------------------------------------------------------------
+# Download the current prometheus metrics; -f makes curl fail on HTTP error statuses
+#-----------------------------------------------------------------------------------------
+metrics_file=$(mktemp)
+trap 'rm -f "$metrics_file"' EXIT # single quotes: expand (and quote) the path at exit
+curl -sfo "$metrics_file" "$METRICS_AT" || die "failed retrieving metrics from $METRICS_AT"
+
+#-----------------------------------------------------------------------------------------
+# Parse per-log metrics
+#-----------------------------------------------------------------------------------------
+declare -A log_index
+declare -A log_size
+declare -A log_timestamp
+while IFS= read -r line; do
+	if [[ $line =~ ^# ]]; then
+		continue # skip comments
+	fi
+
+	if [[ $line =~ ^silentct_log_index ]]; then
+		id=$(echo "$line" | grep -oP '(?<=id=")[^"]+')
+		value=$(echo "$line" | awk '{print $NF}')
+		log_index["$id"]=$value
+	fi
+
+	if [[ $line =~ ^silentct_log_size ]]; then
+		id=$(echo "$line" | grep -oP '(?<=id=")[^"]+')
+		value=$(echo "$line" | awk '{print $NF}')
+		log_size["$id"]=$value
+	fi
+
+	if [[ $line =~ ^silentct_log_timestamp ]]; then
+		id=$(echo "$line" | grep -oP '(?<=id=")[^"]+')
+		value=$(echo "$line" | awk '{print $NF}')
+		log_timestamp["$id"]=$value
+	fi
+done <"$metrics_file"
+
+#-----------------------------------------------------------------------------------------
+# Parse certificate-alert metrics
+#-----------------------------------------------------------------------------------------
+declare -A certificate_alert
+while IFS= read -r line; do
+	if [[ $line =~ ^# ]]; then
+		continue # skip comments
+	fi
+
+	if [[ $line =~ ^silentct_certificate_alert ]]; then
+		stored_at=$(echo "$line" | grep -oP '(?<=stored_at=")[^"]+')
+		observed_at=$(echo "$line" | awk '{print $NF}')
+		certificate_alert["$stored_at"]=$observed_at
+	fi
+done <"$metrics_file"
+
+#-----------------------------------------------------------------------------------------
+# Emit warnings
+#-----------------------------------------------------------------------------------------
+now=$(date +%s)
+for id in "${!log_size[@]}"; do
+	backlog=$(awk "BEGIN {print ${log_size[$id]} - ${log_index[$id]}}")
+	if awk "BEGIN {exit !($backlog - $ALERT_BACKLOG >= 0)}"; then
+		warn "log $id -- backlog is at $backlog"
+	fi
+
+	unix_timestamp=$(awk "BEGIN {printf \"%.0f\", ${log_timestamp[$id]} / 1000}")
+	if (( now - unix_timestamp >= ALERT_FRESHNESS )); then
+		warn "log $id -- latest timestamp at $(date -d @$unix_timestamp)"
+	fi
+done
+
+for stored_at in "${!certificate_alert[@]}"; do
+	observed_at=$(awk "BEGIN {printf \"%.0f\", ${certificate_alert[$stored_at]}}")
+	warn "(mis)-issued certificate? Observed at $(date -d @$observed_at) -- see $stored_at"
+done
-- 
cgit v1.2.3