From d8e0b9c937dc974fef7484db3f85fabfe9272d7d Mon Sep 17 00:00:00 2001 From: Rasmus Dahlberg Date: Sat, 4 Jan 2025 14:22:20 +0100 Subject: prometheus: Add basic metrics for alerting - Detect if we're falling behind while downloading - Detect if there are any found certificates alerting --- cmd/silentct-mon/main.go | 22 ++++++++++ cmd/silentct-mon/silentct-mon | Bin 0 -> 14044167 bytes go.mod | 8 ++++ go.sum | 18 ++++++++ integration/smoke-test | 3 ++ internal/manager/manager.go | 12 +++++- internal/metrics/metrics.go | 71 ++++++++++++++++++++++++++++++ pkg/storage/index/index.go | 7 +++ scripts/silentct-check | 98 ++++++++++++++++++++++++++++++++++++++++++ 9 files changed, 238 insertions(+), 1 deletion(-) create mode 100755 cmd/silentct-mon/silentct-mon create mode 100644 internal/metrics/metrics.go create mode 100755 scripts/silentct-check diff --git a/cmd/silentct-mon/main.go b/cmd/silentct-mon/main.go index e2ecdb7..ce0a548 100644 --- a/cmd/silentct-mon/main.go +++ b/cmd/silentct-mon/main.go @@ -6,6 +6,7 @@ import ( "flag" "fmt" "log" + "net/http" "os" "os/signal" "strings" @@ -13,11 +14,14 @@ import ( "syscall" "time" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" "rgdd.se/silentct/internal/feedback" "rgdd.se/silentct/internal/flagopt" "rgdd.se/silentct/internal/ioutil" "rgdd.se/silentct/internal/logger" "rgdd.se/silentct/internal/manager" + "rgdd.se/silentct/internal/metrics" "rgdd.se/silentct/internal/monitor" "rgdd.se/silentct/pkg/policy" ) @@ -43,6 +47,7 @@ Options: -e, --please-exit Toggle to only run until up-to-date (Default: false) -f, --force Override santity checks that may not be fatal (Default: false) -o, --output-file File that all output will be written to (Default: stdout) + -m, --metrics-at Host address to serve /metrics on (Default: disabled) -p, --pull-interval How often nodes are pulled for certificates (Default: 15m) -v, --verbosity Leveled logging output (default: NOTICE) -w, 
--num-workers Number of parallel workers to fetch each log with (Default: 1) @@ -57,6 +62,7 @@ type config struct { directory string pleaseExit bool force bool + metricsAt string outputFile string pullInterval time.Duration numWorkers uint @@ -75,6 +81,7 @@ func configure(cmd string, args []string) (cfg config, err error) { flagopt.StringOpt(fs, &cfg.directory, "directory", "d", "") flagopt.BoolOpt(fs, &cfg.pleaseExit, "please-exit", "e", false) flagopt.BoolOpt(fs, &cfg.force, "force", "f", false) + flagopt.StringOpt(fs, &cfg.metricsAt, "metrics-at", "m", "") flagopt.StringOpt(fs, &cfg.outputFile, "output-file", "o", "") flagopt.DurationOpt(fs, &cfg.pullInterval, "pull-interval", "p", 15*time.Minute) flagopt.StringOpt(fs, &cfg.verbosity, "verbosity", "v", logger.LevelNotice.String()) @@ -144,10 +151,12 @@ func main() { errorCh := make(chan error) defer close(errorCh) + registry := prometheus.NewRegistry() mgr, err := manager.New(manager.Config{ Policy: cfg.policy, Bootstrap: cfg.bootstrap, Directory: cfg.directory, + Metrics: metrics.NewMetrics(registry), Logger: cfg.log, AlertDelay: cfg.pullInterval * 3 / 2, }, feventCh, meventCh, mconfigCh, errorCh) @@ -203,6 +212,19 @@ func main() { fb.RunForever(ctx) }() + if cfg.metricsAt != "" { + wg.Add(1) + go func() { + defer wg.Done() + defer cancel() + + http.Handle("/metrics", promhttp.HandlerFor(registry, promhttp.HandlerOpts{})) + if err := http.ListenAndServe(cfg.metricsAt, nil); err != nil { + cfg.log.Fatalf("metrics: %v\n", err) + } + }() + } + os.Exit(func() int { defer wg.Wait() defer cancel() diff --git a/cmd/silentct-mon/silentct-mon b/cmd/silentct-mon/silentct-mon new file mode 100755 index 0000000..1965f48 Binary files /dev/null and b/cmd/silentct-mon/silentct-mon differ diff --git a/go.mod b/go.mod index 4f49688..b119535 100644 --- a/go.mod +++ b/go.mod @@ -4,14 +4,22 @@ go 1.22.7 require ( github.com/google/certificate-transparency-go v1.3.0 + github.com/prometheus/client_golang v1.20.5 
github.com/transparency-dev/merkle v0.0.2 gitlab.torproject.org/rgdd/ct v0.0.0 golang.org/x/crypto v0.31.0 ) require ( + github.com/beorn7/perks v1.0.1 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/go-logr/logr v1.4.2 // indirect github.com/google/trillian v1.7.0 // indirect + github.com/klauspost/compress v1.17.9 // indirect + github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/prometheus/client_model v0.6.1 // indirect + github.com/prometheus/common v0.55.0 // indirect + github.com/prometheus/procfs v0.15.1 // indirect golang.org/x/net v0.31.0 // indirect golang.org/x/sys v0.28.0 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20241113202542-65e8d215514f // indirect diff --git a/go.sum b/go.sum index 8babb4d..abcd96e 100644 --- a/go.sum +++ b/go.sum @@ -1,3 +1,7 @@ +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= @@ -8,6 +12,20 @@ github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/trillian v1.7.0 h1:Oib7mKRvZ0Z3GjvNcn2C4clRmFouEOkBcbzw7q8JlFI= github.com/google/trillian v1.7.0/go.mod h1:JMp1zzzHe7j2m9m8P/eTWOaoon3R/SwgqUnFMhm4vfw= +github.com/klauspost/compress v1.17.9 h1:6KIumPrER1LHsvBVuDa0r5xaG0Es51mhhB9BQB2qeMA= +github.com/klauspost/compress v1.17.9/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw= 
+github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= +github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/prometheus/client_golang v1.20.5 h1:cxppBPuYhUnsO6yo/aoRol4L7q7UFfdm+bR9r+8l63Y= +github.com/prometheus/client_golang v1.20.5/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE= +github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= +github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= +github.com/prometheus/common v0.55.0 h1:KEi6DK7lXW/m7Ig5i47x0vRzuBsHuvJdi5ee6Y3G1dc= +github.com/prometheus/common v0.55.0/go.mod h1:2SECS4xJG1kd8XF9IcM1gMX6510RAEL65zxzNImwdc8= +github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= +github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= github.com/transparency-dev/merkle v0.0.2 h1:Q9nBoQcZcgPamMkGn7ghV8XiTZ/kRxn1yCG81+twTK4= github.com/transparency-dev/merkle v0.0.2/go.mod h1:pqSy+OXefQ1EDUVmAJ8MUhHB9TXGuzVAT58PqBoHz1A= gitlab.torproject.org/rgdd/ct v0.0.0 h1:YeVjFD14bFMMY+oIT6oGuG+8MzcOkFVcKgACqK1IbD0= diff --git a/integration/smoke-test b/integration/smoke-test index 4d92478..a128592 100755 --- a/integration/smoke-test +++ b/integration/smoke-test @@ -95,3 +95,6 @@ pass "run the monitor and be warned of an unreported certificate" #------------------------------------------------------------------------------- echo "---" >&2 echo "All smoke tests passed" >&2 +echo "For interactive tests:" >&2 +echo "go run ../cmd/silentct-mon -c "$dir/config.json" -d "$dir/state" -C "dev:silentct" -p 15s -m localhost:8080 -v DEBUG" >&2 +echo "ALERT_BACKLOG=0 ALERT_FRESHNESS=0 
../scripts/silentct-check" >&2 diff --git a/internal/manager/manager.go b/internal/manager/manager.go index f8b05fd..0f206b8 100644 --- a/internal/manager/manager.go +++ b/internal/manager/manager.go @@ -10,6 +10,7 @@ import ( "gitlab.torproject.org/rgdd/ct/pkg/metadata" "rgdd.se/silentct/internal/feedback" "rgdd.se/silentct/internal/logger" + "rgdd.se/silentct/internal/metrics" "rgdd.se/silentct/internal/monitor" "rgdd.se/silentct/pkg/policy" "rgdd.se/silentct/pkg/storage" @@ -19,6 +20,7 @@ type Config struct { Policy policy.Policy Bootstrap bool // Whether a new storage should be initialized from scratch Directory string // Path to a directory where everything will be stored + Metrics *metrics.Metrics // Optional Logger *logger.Logger // Where to output messages and with what verbosity @@ -140,6 +142,7 @@ func (mgr *Manager) startupConfig() error { return err } mgr.mconfigCh <- monitor.MonitoredLog{Config: log, State: state} + mgr.Metrics.LogState(state) } return nil } @@ -164,6 +167,7 @@ func (mgr *Manager) removeLogs(logs []metadata.Log) { state, _ := mgr.GetMonitorState(log) mgr.Logger.Infof("removing log %s with %d entries in its backlog\n", log.URL, state.TreeSize-state.NextIndex) mgr.mconfigCh <- monitor.MonitoredLog{Config: log} + mgr.Metrics.RemoveLogState(state) } } @@ -179,6 +183,7 @@ func (mgr *Manager) addLogs(ctx context.Context, logs []metadata.Log) { mgr.Logger.Infof("bootstrapping log %s at next index 0\n", log.URL) } mgr.mconfigCh <- monitor.MonitoredLog{Config: log, State: state} + mgr.Metrics.LogState(state) } } @@ -197,7 +202,11 @@ func (mgr *Manager) monitorJob(msg monitor.Event) error { if err := mgr.AddEntries(msg.State.LogID, msg.Matches); err != nil { return err } - return mgr.SetMonitorState(msg.State.LogID, msg.State) + if err := mgr.SetMonitorState(msg.State.LogID, msg.State); err != nil { + return err + } + mgr.Metrics.LogState(msg.State) + return nil } func (mgr *Manager) alertJob() error { @@ -208,6 +217,7 @@ func (mgr *Manager) 
alertJob() error { for _, alert := range alerts { mgr.Logger.Noticef("certificate mis-issuance? No allowlisting for %s\n", alert.StoredAt) } + mgr.Metrics.CertificateAlert(mgr.Storage.Index.Alerting()) return nil } diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go new file mode 100644 index 0000000..113d28c --- /dev/null +++ b/internal/metrics/metrics.go @@ -0,0 +1,71 @@ +package metrics + +import ( + "github.com/prometheus/client_golang/prometheus" + "rgdd.se/silentct/internal/monitor" + "rgdd.se/silentct/pkg/storage/index" +) + +type Metrics struct { + logSize *prometheus.GaugeVec + logIndex *prometheus.GaugeVec + logTimestamp *prometheus.GaugeVec + certificateAlert *prometheus.GaugeVec +} + +func NewMetrics(registry *prometheus.Registry) *Metrics { + m := &Metrics{ + logSize: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "silentct_log_size", + Help: "The number of entries in the log.", + }, + []string{"id"}, + ), + logIndex: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "silentct_log_index", + Help: "The next log entry to be downloaded.", + }, + []string{"id"}, + ), + logTimestamp: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "silentct_log_timestamp", + Help: "The log's UNIX timestamp in ms.", + }, + []string{"id"}, + ), + certificateAlert: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "silentct_certificate_alert", + Help: "The time the certificate without allowlisting was found.", + }, + []string{"stored_at"}, + ), + } + + registry.MustRegister(m.logSize, m.logIndex, m.logTimestamp, m.certificateAlert) + return m +} + +func (m *Metrics) LogState(state monitor.State) { + id := state.LogID.Base64String() + m.logIndex.WithLabelValues(id).Set(float64(state.NextIndex)) + m.logSize.WithLabelValues(id).Set(float64(state.TreeSize)) + m.logTimestamp.WithLabelValues(id).Set(float64(state.Timestamp)) +} + +func (m *Metrics) RemoveLogState(state monitor.State) { + id := state.LogID.Base64String() + 
m.logIndex.Delete(prometheus.Labels{"id": id}) + m.logSize.Delete(prometheus.Labels{"id": id}) + m.logTimestamp.Delete(prometheus.Labels{"id": id}) +} + +func (m *Metrics) CertificateAlert(alerts []index.CertificateInfo) { + m.certificateAlert.Reset() + for _, alert := range alerts { + m.certificateAlert.WithLabelValues(alert.StoredAt).Set(float64(alert.ObservedAt.Unix())) + } +} diff --git a/pkg/storage/index/index.go b/pkg/storage/index/index.go index 0056565..bf94711 100644 --- a/pkg/storage/index/index.go +++ b/pkg/storage/index/index.go @@ -101,6 +101,13 @@ func (ix *Index) TriggerAlerts() ([]CertificateInfo, error) { return alerts, ioutil.CommitJSON(ix.cfg.IndexFile, ix.mem) } +func (ix *Index) Alerting() (ret []CertificateInfo) { + for _, ci := range ix.mem.Alerting { + ret = append(ret, ci[0]) // one is enough for the same crt ID + } + return +} + func (index *Index) Validate() error { return nil // FIXME: check that the index is populated with valid values } diff --git a/scripts/silentct-check b/scripts/silentct-check new file mode 100755 index 0000000..35d38c7 --- /dev/null +++ b/scripts/silentct-check @@ -0,0 +1,98 @@ +#!/bin/bash + +# +# A script that emits warnings based on the silentct-mon prometheus metrics. +# Mainly meant as an example for those that configure alerts using prometheus. 
+# + +set -eu + +function warn() { + echo "WARNING: $*" >&2 +} + +function die() { + echo "FATAL: $*" >&2 + exit 1 +} + +#----------------------------------------------------------------------------------------- +# Options +#----------------------------------------------------------------------------------------- +METRICS_AT=${METRICS_AT:-http://localhost:8080/metrics} +ALERT_BACKLOG=${ALERT_BACKLOG:-65536} +ALERT_FRESHNESS=${ALERT_FRESHNESS:-86400} + +#----------------------------------------------------------------------------------------- +# Download the current prometheus metrics +#----------------------------------------------------------------------------------------- +metrics_file=$(mktemp) +trap "rm -f $metrics_file" EXIT +curl -so "$metrics_file" "$METRICS_AT" || die "failed retrieving metrics from $METRICS_AT" + +#----------------------------------------------------------------------------------------- +# Parse per-log metrics +#----------------------------------------------------------------------------------------- +declare -A log_index +declare -A log_size +declare -A log_timestamp +while IFS= read -r line; do + if [[ $line =~ ^# ]]; then + continue # skip comments + fi + + if [[ $line =~ ^silentct_log_index ]]; then + id=$(echo "$line" | grep -oP '(?<=id=")[^"]+') + value=$(echo "$line" | awk '{print $NF}') + log_index["$id"]=$value + fi + + if [[ $line =~ ^silentct_log_size ]]; then + id=$(echo "$line" | grep -oP '(?<=id=")[^"]+') + value=$(echo "$line" | awk '{print $NF}') + log_size["$id"]=$value + fi + + if [[ $line =~ ^silentct_log_timestamp ]]; then + id=$(echo "$line" | grep -oP '(?<=id=")[^"]+') + value=$(echo "$line" | awk '{print $NF}') + log_timestamp["$id"]=$value + fi +done <"$metrics_file" + +#----------------------------------------------------------------------------------------- +# Parse certificate-alert metrics +#----------------------------------------------------------------------------------------- +declare -A certificate_alert 
+while IFS= read -r line; do + if [[ $line =~ ^# ]]; then + continue # skip comments + fi + + if [[ $line =~ ^silentct_certificate_alert ]]; then + stored_at=$(echo "$line" | grep -oP '(?<=stored_at=")[^"]+') + observed_at=$(echo "$line" | awk '{print $NF}') + certificate_alert["$stored_at"]=$observed_at + fi +done <"$metrics_file" + +#----------------------------------------------------------------------------------------- +# Emit warnings +#----------------------------------------------------------------------------------------- +now=$(date +%s) +for id in "${!log_size[@]}"; do + backlog=$(awk "BEGIN {print ${log_size[$id]} - ${log_index[$id]}}") + if awk "BEGIN {exit !($backlog - $ALERT_BACKLOG >= 0)}"; then + warn "log $id -- backlog is at $backlog" + fi + + unix_timestamp=$(awk "BEGIN {printf \"%.0f\", ${log_timestamp[$id]} / 1000}") + if (( now - unix_timestamp >= ALERT_FRESHNESS )); then + warn "log $id -- latest timestamp at $(date -d @$unix_timestamp)" + fi +done + +for stored_at in "${!certificate_alert[@]}"; do + observed_at=$(awk "BEGIN {printf \"%.0f\", ${certificate_alert[$stored_at]}}") + warn "(mis)-issued certificate? Observed at $(date -d @$observed_at) -- see $stored_at" +done -- cgit v1.2.3